Example #1
def start_bow():
    """
    Bag-of-words all webpages in the URLToGenre database

    Pipeline:
    1. Get the genre and page from each URLToGenre object in the mongodb URLQueue
    2. BOW the webpage in the URLToGenre object
    3. Shorten the genre
    4. Insert the words of the bow into that genre's entry in the CategoricalBow mongodb table

    Repeat until the URLToGenre objects are exhausted

    :return: Nothing!
    """

    queue=DBQueue(None,"summarization")

    #don't trust anything
    summarizer=Summarizer()
    bow=BagOfWords()

    for url_obj in URLToGenre.objects.order_by("ref_index").no_cache():

        try:
            print('New url {}'.format(url_obj.ref_index))

            if not hasattr(url_obj,"original") or not url_obj["original"]:
                continue

            #skip conditionsL it does not have page or it is not an original url
            if not hasattr(url_obj,'page'):
                raise Exception('url {} No has page'.format(url_obj.ref_index))

            #get the normalized (short) genre strings for this url
            genre_string_list=[]
            for g in url_obj.genre:
                normalized_string=base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)

            genre_string_list=list(set(genre_string_list))

            summarize_logger.info("Getting bow rep")
            #get BOW representation
            page_text=url_obj.page if isinstance(url_obj.page,str) else base_util.utf_8_safe_decode(url_obj.page)
            bow_dict=bow.get_word_count(summarizer.summarize(page_text))

            summarize_logger.info("Update count:"+str(bow_dict))

            if len(bow_dict)==0:
                raise Exception("No count available")

            #store the url bow in the Summary table if it is not already there
            if Summary.objects(ref_index=url_obj.ref_index).count()==0:
                Summary(url=url_obj.url,ref_index=url_obj.ref_index,attr_map=bow_dict,short_genres=genre_string_list).save()
            else:
                print('Bow already exists for url number {}'.format(url_obj.ref_index))

        except Exception as ex:
            summarize_logger.error(url_obj['url']+":::"+str(ex),"C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow.txt")
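
The loop above hinges on BagOfWords.get_word_count turning the summarized page text into a word-to-count dictionary that is then stored as attr_map. Below is a minimal sketch of that kind of transformation, assuming plain lowercase alphabetic tokenization; get_word_count_sketch is a hypothetical stand-in, not the project's BagOfWords implementation.

import re
from collections import Counter

def get_word_count_sketch(text):
    """Hypothetical stand-in for BagOfWords.get_word_count: text -> {word: count}."""
    #crude tokenization: lowercase the text and keep alphabetic tokens only (assumption)
    tokens=re.findall(r"[a-z]+",text.lower())
    return dict(Counter(tokens))

#usage: feed it summarized page text, then store the resulting dict
bow_dict=get_word_count_sketch("The cat sat on the mat. The cat slept.")
#-> {'the': 3, 'cat': 2, 'sat': 1, 'on': 1, 'mat': 1, 'slept': 1}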
Example #2
def evaluate_attributes(bow_dicts,y):
    """
    Pass in multiple bow dictionaries of features X and their corresponding labels y.

    :param bow_dicts: iterable of bag-of-words dictionaries, one per sample
    :param y: labels corresponding to each bow dictionary
    :return:
    """
    #convert the label list into a numpy array
    y=np.array(y)
    bow_model=BagOfWords()

    X=[]

    #get the array of all the bows
    for bow_dict in bow_dicts:
        X.append(bow_model.reverse_transformation(bow_dict))

    exit()
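
evaluate_attributes stops after assembling X; the interesting step is bow_model.reverse_transformation, which turns each bow dictionary back into a fixed-length feature vector. Below is a self-contained sketch of the same vectorization using scikit-learn's DictVectorizer instead of the project's BagOfWords, as an assumption about what reverse_transformation does rather than its actual code.

import numpy as np
from sklearn.feature_extraction import DictVectorizer

def bow_dicts_to_matrix(bow_dicts):
    """Turn a list of {word: count} dicts into a dense feature matrix X."""
    vectorizer=DictVectorizer(sparse=False)
    #one column per word seen across all dictionaries, values are the counts
    X=vectorizer.fit_transform(bow_dicts)
    return X,vectorizer

X,vec=bow_dicts_to_matrix([{"cat":2,"mat":1},{"dog":3}])
y=np.array([0,1])
#X has shape (2, 3) with columns for 'cat', 'dog' and 'mat'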
Example #3
def collect_bad_url():
    """
    Make bows of websites in the bad url list

    :return:
    """

    queue=DBQueue_old("genre_bow")

    #don't trust anything
    summarizer=Summarizer()
    bow=BagOfWords()
    short_genre_to_genre=coll.ShortGenre()
    url_to_bow=coll.URLBow()
    start_pos=queue.get()

    for c,line in enumerate(open("bad_url_summarize_bow.txt")):
        if c<start_pos:
            continue

        url=line.split(" ")[1].split(":::")[0]

        try:
            print('New url {} num: {}'.format(url,c))

            url_obj=coll.URLToGenre().select(url=url).find_one()

            if not hasattr(url_obj,"original") or not url_obj["original"]:
                print("Not original")
                continue

            #request the page anyway; most of the bad urls are due to bad pages
            data=Request().get_data(base_util.unreplace_dot_url(url_obj["url"]))

            if data is None:
                raise Exception('url {} has no page'.format(url))
            else:
                #keep the fetched page if none is stored, or if the new page is bigger than the stored one
                if not hasattr(url_obj,"page") or len(data)>len(url_obj["page"]):
                    print("updating data")
                    data=base_util.utf_8_safe_decode(data)

                    if not hasattr(url_obj,"page"):
                        #no stored page yet, so save the fetched one
                        url_obj.save(page=data)
                    else:
                        #replace the smaller stored page
                        url_obj.update(page=data)
                    url_obj.reload()

            if len(data) > len(url_obj.page):
                raise Exception("Inconsistency between fetched data and stored page data")



            #url_obj=repair.genre_to_genre_data(url_obj.document)

            #get genre strings
            #register the genre with the short genres for faster retrieval
            genre_string_list=[]
            for g in url_obj.genre:
                normalized_string=base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)
                short_genre_to_genre.select(short_genre=normalized_string).update(upsert=True,add_to_set__genres=g)

            Logger.info("Getting bow rep")
            #get BOW representation
            page_text=url_obj.page if isinstance(url_obj.page,str) else base_util.utf_8_safe_decode(url_obj.page)
            bow_dict=bow.get_word_count(summarizer.summarize(page_text))

            if len(bow_dict)<20:
                raise Exception("Fewer than 20 words")

            Logger.info("Update count:"+str(bow_dict))


            #store the url bow in urlbow table
            if not url_to_bow.select(url=url_obj["url"]).find_one():
                url_to_bow.create(url=url_obj["url"],bow=bow_dict,short_genres=genre_string_list)

            else:
                print('Bow already exists for url {}'.format(url))

            queue.increment()
        except Exception as ex:
            Logger.error(url_obj['url']+":::"+str(ex),"C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow1.txt")
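
The short_genre_to_genre line in collect_bad_url buckets every full genre under its normalized short genre, creating the bucket on first use. Below is a sketch of that upsert-plus-add-to-set pattern written directly against pymongo; the database and collection names are assumptions, not the project's actual schema.

from pymongo import MongoClient

client=MongoClient()                                   #assumes a local mongod
short_genre_coll=client["research"]["ShortGenre"]      #hypothetical db/collection names

def register_genre(short_genre,genre_doc):
    """Upsert the short-genre bucket and add the full genre document to its set."""
    short_genre_coll.update_one(
        {"short_genre":short_genre},                   #match on the normalized genre
        {"$addToSet":{"genres":genre_doc}},            #add the genre without duplicating it
        upsert=True,                                   #create the bucket if it does not exist
    )

register_genre("Arts",{"genre":"Top/Arts/Music/Bands_and_Artists"})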