def start_bow():
    """
    Bag of words all webpages in the URLToGenre database.

    Pipeline:
        1. Get the genre and page from each URLToGenre object in the mongodb URLQueue
        2. BOW the webpage in the URLToGenre object
        3. Shorten the genre
        4. Insert the bow words, along with the short genres, into the Summary mongodb table

    Repeat until exhaustion of URLToGenre objects.

    :return: Nothing!
    """
    queue=DBQueue(None,"summarization") #don't trust anything
    summarizer=Summarizer()
    bow=BagOfWords()

    for url_obj in URLToGenre.objects.order_by("ref_index").no_cache():
        try:
            print('New url {}'.format(url_obj.ref_index))

            #skip conditions: it is not an original url or it does not have a page
            if not hasattr(url_obj,"original") or not url_obj["original"]:
                continue

            if not hasattr(url_obj,'page'):
                raise Exception('url {} has no page'.format(url_obj.ref_index))

            #get genre strings
            #register the genre with the short genres for faster retrieval
            genre_string_list=[]
            for g in url_obj.genre:
                normalized_string=base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)
            genre_string_list=list(set(genre_string_list))

            summarize_logger.info("Getting bow rep")

            #get BOW representation of the (decoded) page text
            page_text=url_obj.page if isinstance(url_obj.page,str) else base_util.utf_8_safe_decode(url_obj.page)
            bow_dict=bow.get_word_count(summarizer.summarize(page_text))

            summarize_logger.info("Update count:"+str(bow_dict))

            if len(bow_dict)==0:
                raise Exception("No count available")

            #store the url bow in the Summary table, keyed by ref_index
            if len(Summary.objects(ref_index=url_obj.ref_index))==0:
                Summary(url=url_obj.url,ref_index=url_obj.ref_index,attr_map=bow_dict,short_genres=genre_string_list).save()
            else:
                print('Exists bow url number {}'.format(url_obj.ref_index))

        except Exception as ex:
            summarize_logger.error(url_obj['url']+":::"+str(ex),
                                   "C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow.txt")
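
# Illustrative sketch only: the pipeline above assumes Summarizer.summarize()
# returns plain text and BagOfWords.get_word_count() maps it to a
# {token: count} dict. The helper below is a hypothetical, standard-library
# stand-in for that contract (its name is not part of this project), handy
# for exercising the Summary-writing logic without the real summarizer.
def _demo_word_count(text):
    """Lowercase, whitespace-tokenize, and count token occurrences."""
    from collections import Counter
    return dict(Counter(text.lower().split()))

# e.g. _demo_word_count("News news sports") -> {'news': 2, 'sports': 1}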
def collect_bad_url():
    """
    Make bows of the websites in the bad url list.

    :return:
    """
    queue=DBQueue_old("genre_bow") #don't trust anything
    summarizer=Summarizer()
    bow=BagOfWords()
    short_genre_to_genre=coll.ShortGenre()
    url_to_bow=coll.URLBow()

    start_pos=queue.get()

    for c,line in enumerate(open("bad_url_summarize_bow.txt")):
        if c<start_pos:
            continue

        url=line.split(" ")[1].split(":::")[0]
        try:
            print('New url {} num: {}'.format(url,c))

            url_obj=coll.URLToGenre().select(url=url).find_one()

            if not hasattr(url_obj,"original") or not url_obj["original"]:
                print("Not original")
                continue

            #request the page anyway: most of the bad urls are due to bad page data
            data=Request().get_data(base_util.unreplace_dot_url(url_obj["url"]))

            if data is None:
                raise Exception('url {} has no page'.format(url))

            #save the page if there was none, or the new page is bigger than the old one
            if not hasattr(url_obj,"page") or len(data)>len(url_obj["page"]):
                print("updating data")
                data=base_util.utf_8_safe_decode(data)

                if not hasattr(url_obj,"page"):
                    url_obj.save(page=data)
                else:
                    url_obj.update(page=data)
                url_obj.reload()

                if len(data) > len(url_obj.page):
                    raise Exception("Inconsistency b/w data and page data")

            #url_obj=repair.genre_to_genre_data(url_obj.document)

            #get genre strings
            #register the genre with the short genres for faster retrieval
            genre_string_list=[]
            for g in url_obj.genre:
                normalized_string=base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)
                short_genre_to_genre.select(short_genre=normalized_string).update(upsert=True,add_to_set__genres=g)

            Logger.info("Getting bow rep")

            #get BOW representation of the (decoded) page text
            page_text=url_obj.page if isinstance(url_obj.page,str) else base_util.utf_8_safe_decode(url_obj.page)
            bow_dict=bow.get_word_count(summarizer.summarize(page_text))

            if len(bow_dict)<20:
                raise Exception("Words less than 20")

            Logger.info("Update count:"+str(bow_dict))

            #store the url bow in the urlbow table
            if not url_to_bow.select(url=url_obj["url"]).find_one():
                url_to_bow.create(url=url_obj["url"],bow=bow_dict,short_genres=genre_string_list)
            else:
                print('Exists bow url number {}'.format(url))

            queue.increment()
        except Exception as ex:
            #url_obj may not be loaded yet, so log the url parsed from the input line
            Logger.error(url+":::"+str(ex),
                         "C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow1.txt")
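
# The parse at the top of collect_bad_url() implies an assumed log-line shape
# of "<tag> <url>:::<error message>" (inferred from
# line.split(" ")[1].split(":::")[0]; the exact format written by
# Logger.error is project-specific). A hypothetical helper making that
# assumption explicit:
def _demo_parse_bad_url_line(line):
    """Take the second space-separated token, then drop the ':::'-suffixed error text."""
    return line.split(" ")[1].split(":::")[0]

# e.g. _demo_parse_bad_url_line("ERROR www.example.com:::has no page\n")
# -> 'www.example.com'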
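
# For reference, the ShortGenre upsert in collect_bad_url() maps each
# normalized short genre to the set of full genre entries that collapse into
# it. A rough raw-pymongo equivalent is sketched below; this is an assumption,
# since the project wraps the operation in coll.ShortGenre, and the
# collection/field names here only mirror that guess.
def _demo_register_short_genre(db, normalized_string, g):
    """Upsert the short-genre doc and add the full genre entry to its set."""
    db.ShortGenre.update_one(
        {"short_genre": normalized_string},
        {"$addToSet": {"genres": g}},
        upsert=True,
    )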