def global_ref_id():
    """Make reference ids consistent across collections.

    Reference ids previously disagreed between url allgrams, URLToGenre
    pages, and the summaries in url bow.  This pass treats URLBow's
    ``ref_index`` as the single source of truth: every URLBow document
    gets a valid index (assigning a fresh one where it is missing) and
    the matching URLToGenre document is updated to agree.  Any
    URLToGenre document with no URLBow counterpart then receives a new
    index past the largest one seen.

    :return: None
    """
    seen_urls = set()
    max_ref = 0

    # Pass 1: walk URLBow, repair missing indices, and mirror the index
    # onto the corresponding URLToGenre document.
    for position, summary in enumerate(URLBow.objects.no_cache()):
        if position % 1000 == 0:
            print("Done with {}".format(position))
        current_url = summary.url
        seen_urls.add(current_url)

        index = summary.ref_index
        if index is None:
            # No index yet: mint one past the current maximum and persist it.
            index = max_ref + 1
            summary.update(ref_index=index)
        if index > max_ref:
            max_ref = index

        URLToGenre.objects(url=current_url).update(ref_index=index)

    print("Done with normal ones, just finishing off the rest of URLTOGenre")

    # Pass 2: URLToGenre documents that never appeared in URLBow get
    # brand-new indices above everything assigned so far.
    for position, leftover in enumerate(URLToGenre.objects.no_cache()):
        if leftover.url in seen_urls:
            continue
        if position % 1000 == 0:
            print("Done with {}".format(position))
        max_ref += 1
        leftover.update(ref_index=max_ref)
def grab_urls_and_genres():
    """Export every original URL and its genres to ``url_list.txt``.

    Each line has the form ``<url>:::<genre1>:::<genre2>...`` where the
    url is first run through ``unreplace_dot_url``.  The file is written
    with latin-1 encoding, overwriting any previous contents.

    :return: None
    """
    output_path = "url_list.txt"
    row_format = "{}:::{}\n"

    with open(output_path, encoding="latin-1", mode="w") as out:
        for record in URLToGenre.objects(original=True):
            genre_names = [entry.genre for entry in record.genre]
            line = row_format.format(
                unreplace_dot_url(record.url),
                ":::".join(genre_names),
            )
            out.write(line)
def extract_meta_data(reference_db_cls, db_cls):
    """Harvest meta description/keyword text for selected webpages.

    For each object in ``reference_db_cls`` (must expose ``url``,
    ``ref_index`` and ``short_genres``), the page HTML is fetched from
    URLToGenre and the content of ``<meta name="description">``,
    ``<meta name="Description">`` and ``<meta name="keywords">`` tags is
    collected.  Pages with no such data are counted and reported.

    NOTE(review): the bag-of-words transform and the ``db_cls(...).save()``
    persistence step are currently commented out, so this function only
    tallies how many pages lack meta data — confirm before relying on it
    to populate ``db_cls``.

    :return: None
    """
    comp_logger.info("Extracting from the database {}, putting into {}".format(reference_db_cls, db_cls))

    bow_transformer = BagOfWords()
    missing_count = 0

    for counter, reference in enumerate(reference_db_cls.objects.no_cache()):
        if counter % 10000 == 0:
            comp_logger.info("Done with {} MetaDatas".format(counter))

        page_url = reference.url
        ref_index = reference.ref_index
        short_genres = genre_normalizer(reference.short_genres, dim=1)

        html = URLToGenre.objects(url=page_url).only("page")[0].page
        soup = BeautifulSoup(html, "html.parser")

        pieces = []
        try:
            # Same lookup order as before: description, Description, keywords.
            for attr_name in ("description", "Description", "keywords"):
                for tag in soup.find_all("meta", {"name": attr_name}):
                    pieces.append(tag["content"])

            joined = " ".join(pieces)
            #meta_bow=bow_transformer.get_word_count(joined) if joined and joined.strip() else {}
            if not joined:
                missing_count += 1
        except (KeyError, AttributeError, ValueError):
            # Malformed tag (e.g. meta without a content attribute) counts
            # as "no meta data found" rather than aborting the sweep.
            missing_count += 1
            meta_bow = {}  # NOTE(review): dead until the save below is re-enabled

        #store into db
        #db_cls(ref_index=ref_index,attr_map=meta_bow,short_genres=short_genres).save()

    comp_logger.info("The MetaData does not exists in {} instances".format(missing_count))
def extract_title(reference_db_cls, db_cls):
    """Harvest the ``<title>`` text for selected webpages.

    For each object in ``reference_db_cls`` (must expose ``url``,
    ``ref_index`` and ``short_genres``), the page HTML is fetched from
    URLToGenre and its title string is read.  Pages without a usable
    title are counted and reported.

    NOTE(review): the bag-of-words transform and the ``db_cls(...).save()``
    persistence step are currently commented out, so this function only
    tallies missing titles — confirm before relying on it to populate
    ``db_cls``.  Progress is logged every 10 items (vs 10000 elsewhere);
    looks like a debugging leftover — confirm intended cadence.

    :param db_cls:
    :return: None
    """
    comp_logger.info("Extracting from the database {}, putting into {}".format(reference_db_cls, db_cls))

    bow_transformer = BagOfWords()
    missing_count = 0

    for counter, reference in enumerate(reference_db_cls.objects.no_cache()):
        if counter % 10 == 0:
            comp_logger.info("Done with {} titles".format(counter))

        page_url = reference.url
        ref_index = reference.ref_index
        short_genres = genre_normalizer(reference.short_genres, dim=1)

        html = URLToGenre.objects(url=page_url).only("page")[0].page
        soup = BeautifulSoup(html, "html.parser")

        try:
            page_title = soup.title.string
            #bag of word
            #title_bow=bow_transformer.get_word_count(page_title) if page_title and page_title.strip() else {}
        except (AttributeError, ValueError):
            # No <title> element (soup.title is None) counts as missing.
            missing_count += 1
            title_bow = {}  # NOTE(review): dead until the save below is re-enabled

        #store into db
        #db_cls(ref_index=ref_index,attr_map=title_bow,short_genres=short_genres).save()

    comp_logger.info("The title does not exists in {} instances".format(missing_count))