# Example #1
def global_ref_id():

    """
    Assign one consistent reference id to every page.

    Had an issue with reference ids not matching b/w url allgrams, urltogenre pages, and summary in url bow.

    This resolves the issue and uses urlbow's index for all ids:
    URLBow's ref_index is treated as authoritative, every matching
    URLToGenre document is updated to agree with it, and any URLToGenre
    entry with no URLBow counterpart gets a fresh id above the largest
    one seen.

    :return: None
    """
    seen_urls = set()
    largest_ref_index = 0

    # Pass 1: walk URLBow, fill in missing ref_indexes, and mirror each
    # id onto the matching URLToGenre document.
    for count, url_summary_obj in enumerate(URLBow.objects.no_cache()):
        if count % 1000 == 0:
            print("Done with {}".format(count))

        url = url_summary_obj.url
        seen_urls.add(url)
        ref_index = url_summary_obj.ref_index

        if ref_index is None:
            # No id assigned yet: hand out the next free one and persist it.
            ref_index = largest_ref_index + 1
            url_summary_obj.update(ref_index=ref_index)

        if ref_index > largest_ref_index:
            largest_ref_index = ref_index

        URLToGenre.objects(url=url).update(ref_index=ref_index)

    print("Done with normal ones, just finishing off the rest of URLTOGenre")

    # Pass 2: URLToGenre documents that had no URLBow record get new ids.
    for count, url_to_genre_obj in enumerate(URLToGenre.objects.no_cache()):
        # Log progress BEFORE the skip: the original logged after the
        # `continue`, so milestones landing on already-seen urls never printed.
        if count % 1000 == 0:
            print("Done with {}".format(count))

        if url_to_genre_obj.url in seen_urls:
            continue

        largest_ref_index += 1
        url_to_genre_obj.update(ref_index=largest_ref_index)
def grab_urls_and_genres():
    """
    Dump every original URL and its genres to "url_list.txt".

    Each output line has the form ``url:::genre1:::genre2`` (and so on),
    where the url is first passed through ``unreplace_dot_url``.
    """
    file_name = "url_list.txt"
    line_template = "{}:::{}\n"

    with open(file_name, encoding="latin-1", mode="w") as out_file:
        for url_obj in URLToGenre.objects(original=True):
            genre_names = [genre_entry.genre for genre_entry in url_obj.genre]
            line = line_template.format(
                unreplace_dot_url(url_obj.url), ":::".join(genre_names)
            )
            out_file.write(line)
def extract_meta_data(reference_db_cls,db_cls):
    """
    For selected webpages in URLToGenre:

    Extract meta data descriptions(name=description) and keywords and form bag of words representation with it.

    Store it into a database

    :param reference_db_cls: source collection; its objects must expose
        ``url``, ``ref_index`` and ``short_genres`` attributes.
    :param db_cls: destination collection for the bag of words
        (the actual save is currently commented out).
    :return: None
    """
    comp_logger.info("Extracting from the database {}, putting into {}".format(reference_db_cls,db_cls))

    bow_transformer = BagOfWords()
    not_found_data = 0

    # Case-insensitive matcher for the meta tags we care about.  The
    # original code ran three separate find_all calls matching only the
    # literal spellings "description", "Description" and "keywords"; this
    # also catches "DESCRIPTION", "Keywords", etc.
    def _is_wanted_meta_name(name):
        return name is not None and name.lower() in ("description", "keywords")

    for c, ref_object in enumerate(reference_db_cls.objects.no_cache()):
        if c % 10000 == 0:
            comp_logger.info("Done with {} MetaDatas".format(c))

        url = ref_object.url
        ref_index = ref_object.ref_index
        short_genres = genre_normalizer(ref_object.short_genres, dim=1)

        # Fetch only the stored page HTML for this url.
        page = URLToGenre.objects(url=url).only("page")[0].page

        page_soup = BeautifulSoup(page, "html.parser")

        contents = []
        try:
            # bs4 accepts a callable as an attribute-value filter.
            for meta_tag in page_soup.find_all("meta", {"name": _is_wanted_meta_name}):
                contents.append(meta_tag["content"])

            # join of an empty list is already "", no conditional needed
            contents = " ".join(contents)
            #meta_bow=bow_transformer.get_word_count(contents) if contents and contents.strip() else {}

            if not contents:
                not_found_data += 1
        except (KeyError, AttributeError, ValueError):
            # Malformed markup or a meta tag without a "content" attribute.
            not_found_data += 1
            meta_bow = {}  # placeholder paired with the commented-out save below

        #store into db
        #db_cls(ref_index=ref_index,attr_map=meta_bow,short_genres=short_genres).save()

    comp_logger.info("The MetaData does not exists in {} instances".format(not_found_data))
def extract_title(reference_db_cls,db_cls):
    """
    Extract title from some webpage in URLToGenre and save it to the db_cls database

    :param reference_db_cls: source collection; its objects must expose
        ``url``, ``ref_index`` and ``short_genres`` attributes.
    :param db_cls: destination collection (the actual save is currently
        commented out).
    :return: None
    """
    comp_logger.info("Extracting from the database {}, putting into {}".format(reference_db_cls,db_cls))

    bow_transformer = BagOfWords()
    title_not_exists = 0

    for count, ref_object in enumerate(reference_db_cls.objects.no_cache()):
        if count % 10 == 0:
            comp_logger.info("Done with {} titles".format(count))

        url = ref_object.url
        ref_index = ref_object.ref_index
        short_genres = genre_normalizer(ref_object.short_genres, dim=1)

        # Pull only the stored page HTML for this url.
        html_page = URLToGenre.objects(url=url).only("page")[0].page
        soup = BeautifulSoup(html_page, "html.parser")

        try:
            # soup.title is None for pages without a <title>, which raises
            # AttributeError and lands in the handler below.
            title = soup.title.string

            #bag of word
            #title_bow=bow_transformer.get_word_count(title) if title and title.strip() else {}

        except (AttributeError, ValueError):
            title_not_exists += 1
            title_bow = {}

        #store into db
        #db_cls(ref_index=ref_index,attr_map=title_bow,short_genres=short_genres).save()

    comp_logger.info("The title does not exists in {} instances".format(title_not_exists))