Example #1
0
def scrape_genre_data():
    """Run the genre scraping pipeline over every queued URL document.

    Resumes from the queue's persisted position and lazily feeds each
    queued document's URL into the GenreScraper pipeline.
    """
    queue = DBQueue(queue_cls=URLQueue, queue_name="genre_data")
    scraper = GenreScraper(queue=queue)

    start = queue.get_location()
    total = URLQueue.objects.count()

    # Generator: each queued document's URL is wrapped on demand, so the
    # pipeline pulls items one at a time instead of materializing them all.
    pages = (
        WebPageInfo(url=URLQueue.objects.get(number=num).document.url)
        for num in range(start, total)
    )

    scraper.scrape_pipeline(pages, URLToGenre, start=start)
Example #2
0
def full_page_bow():
    """
    Creates bow of the entire url pages present in URLToGenre that are original pages.

    Iterates the persistent Queue_full_page queue starting from the saved
    location, builds a bag-of-words for each page, and saves it as a
    URLBow_fulltxt record. Pages that are empty or raise during processing
    are appended to ``bad_full_url.txt`` and skipped; the queue position is
    advanced either way so the run can be resumed.

    :return: None
    """
    queue = DBQueue(Queue_full_page, "full_page_bow_queue")
    bow_model = BagOfWords()

    for number in range(queue.get_location(), Queue_full_page.objects.count()):
        queue_obj = Queue_full_page.objects.get(number=number)
        url_obj = URLToGenre.objects.get(url=queue_obj.url)

        # Progress marker every 1000 entries.
        if number % 1000 == 0:
            print(number)

        try:
            # Reject empty pages BEFORE paying for the word count —
            # the original computed the bow first and only then checked.
            if url_obj.page.strip() == "":
                raise Exception("Bad Page")

            bow = bow_model.get_word_count(url_obj.page)
        except Exception as ex:
            # Best-effort: record the failure and move on so one bad page
            # does not halt the whole run.
            with open("bad_full_url.txt", mode="a") as out:
                out.write("{}:::{}\n".format(number, str(ex)))
            # Advance after the log file is closed (was nested inside the
            # `with` block, keeping the file handle open across the call).
            queue.increment_location()
            continue

        URLBow_fulltxt(
            bow=bow,
            bow_index=queue_obj.number,
            short_genres=[normalize_genre_string(genre.genre, 2) for genre in url_obj.genre],
        ).save()
        queue.increment_location()