def scrape_genre_data():
    """Resume scraping genre data for every URL remaining in URLQueue.

    Picks up from the persisted position of the "genre_data" DBQueue and
    feeds the remaining queued URLs through the GenreScraper pipeline,
    storing results via URLToGenre.
    """
    work_queue = DBQueue(queue_cls=URLQueue, queue_name="genre_data")
    scraper = GenreScraper(queue=work_queue)
    resume_at = work_queue.get_location()
    # Lazily yield one WebPageInfo per remaining queue entry.
    page_infos = (
        WebPageInfo(url=URLQueue.objects.get(number=i).document.url)
        for i in range(resume_at, URLQueue.objects.count())
    )
    scraper.scrape_pipeline(page_infos, URLToGenre, start=resume_at)
def full_page_bow():
    """
    Creates bow of the entire url pages present in URLToGenre that are original pages.

    Resumes from the persisted position of the "full_page_bow_queue" DBQueue.
    For each queued entry, looks up the page text in URLToGenre, builds a
    bag-of-words, and saves it as a URLBow_fulltxt record with the entry's
    short (level-2 normalized) genres. Pages that are blank or that fail any
    step are logged to bad_full_url.txt and skipped; the queue position is
    advanced either way so a restart never reprocesses an entry.

    :return: None
    """
    queue = DBQueue(Queue_full_page, "full_page_bow_queue")
    bow_model = BagOfWords()
    for number in range(queue.get_location(), Queue_full_page.objects.count()):
        queue_obj = Queue_full_page.objects.get(number=number)
        url_obj = URLToGenre.objects.get(url=queue_obj.url)
        # Lightweight progress indicator.
        if number % 1000 == 0:
            print(number)
        try:
            # Fix: reject blank pages BEFORE computing the word count —
            # the original built the bow first and only then raised.
            if url_obj.page.strip() == "":
                raise Exception("Bad Page")
            bow = bow_model.get_word_count(url_obj.page)
        except Exception as ex:
            # Deliberate best-effort: record the failure and move on.
            with open("bad_full_url.txt", mode="a") as out:
                out.write("{}:::{}\n".format(number, str(ex)))
            queue.increment_location()
            continue
        URLBow_fulltxt(
            bow=bow,
            bow_index=queue_obj.number,
            short_genres=[
                normalize_genre_string(genre.genre, 2) for genre in url_obj.genre
            ],
        ).save()
        queue.increment_location()