Code example #1
0
File: genre_count.py — Project: wangk1/research
def tabulate_genre_dist(y,normalize_to_level=1):
    """
    Tabulate the distribution of genres in a dataset and print it to the
    console as an ASCII table with count and percent columns.

    Accepts either a flat numpy array of genre strings (one label per
    sample) or an array of per-sample genre lists.

    :param y: numpy array of genre labels, or of lists of genre labels
    :param normalize_to_level: genre-hierarchy level passed to genre_normalizer
    :return: None (prints the table)
    """
    genre_to_count=coll.Counter()

    # Collapse genre labels to the requested hierarchy level first.
    y=genre_normalizer(y,normalize_to_level)

    # BUG FIX: np.str was deprecated in NumPy 1.20 and removed in 1.24;
    # np.str_ is the supported string scalar type for issubdtype checks.
    if np.issubdtype(y.dtype,np.str_):
        # Flat array of genre strings: one label per sample.
        genre_to_count.update(y)
        total=y.shape[0]
    else:
        # Array of per-sample genre lists: count every label occurrence.
        genre_to_count.update(itertools.chain.from_iterable(y))
        total=sum(len(labels) for labels in y)

    headers=["genre","count","percent"]

    data=[headers]
    # most_common() yields rows sorted by descending count for readability.
    # NOTE(review): the "percent" value is a fraction in [0, 1], not *100 —
    # kept as-is for output compatibility with previous runs.
    data.extend([genre,count,round(count/total,2)]
                for genre,count in genre_to_count.most_common())

    print(tabulate(data,headers="firstrow"))
Code example #2
0
File: webpage_components.py — Project: wangk1/research
def extract_meta_data(reference_db_cls,db_cls):
    """
    For selected webpages in URLToGenre:

    Extract meta data descriptions (name=description) and keywords and form
    a bag-of-words representation with it.

    Intended to store the result into a database (the save is currently
    disabled — see the commented-out lines below).

    :param reference_db_cls: db class whose objects carry url/ref_index/short_genres
    :param db_cls: destination db class (currently unused while the save is disabled)
    :return: None
    """
    comp_logger.info("Extracting from the database {}, putting into {}".format(reference_db_cls,db_cls))

    not_found_data=0
    for c,ref_object in enumerate(reference_db_cls.objects.no_cache()):
        # Periodic progress logging (was an `x and y` short-circuit hack).
        if c%10000==0:
            comp_logger.info("Done with {} MetaDatas".format(c))

        url=ref_object.url
        ref_index=ref_object.ref_index
        short_genres=genre_normalizer(ref_object.short_genres,dim=1)

        # Fetch the raw HTML for this url from URLToGenre.
        page=URLToGenre.objects(url=url).only("page")[0].page

        page_soup=BeautifulSoup(page,"html.parser")

        contents=[]
        try:
            # Collect content attributes of <meta name="description">,
            # <meta name="Description"> and <meta name="keywords"> tags.
            for meta_name in ("description","Description","keywords"):
                for meta_tag in page_soup.find_all("meta",{"name":meta_name}):
                    contents.append(meta_tag["content"])

            contents=" ".join(contents)
            #meta_bow=bow_transformer.get_word_count(contents) if contents and contents.strip() else {}

            if not contents:
                not_found_data+=1
        except (KeyError,AttributeError,ValueError):
            # Tag without a "content" attribute, malformed markup, etc.
            not_found_data+=1

        #store into db
        #db_cls(ref_index=ref_index,attr_map=meta_bow,short_genres=short_genres).save()

    comp_logger.info("The MetaData does not exists in {} instances".format(not_found_data))
Code example #3
0
File: mixed_effect.py — Project: wangk1/research
def _generate_mixed_effect_matrix(X_path,y_path,feat_selector):
    """
    Converts X to a COO matrix of pairwise feature co-occurrences
    ("mixed effect" matrix).

    For each sample row and each selected feature i with a nonzero value,
    the output block [i*num_vocab:(i+1)*num_vocab] holds
    min(x_i, x_j) for every selected feature j.

    :param X_path: pickle path of the feature matrix
    :param y_path: pickle path of the label vector
    :param feat_selector: fitted-on-demand sklearn-style feature selector
    :return: scipy.sparse COO matrix of shape (n_samples, num_vocab**2)
    """

    mixed_effect_logger.debug("Flattening")

    # Load, subsample and flatten the dataset, then reduce the column count.
    X,y,_=flatten_set(*random_pick_samples(unpickle_obj(X_path),genre_normalizer(unpickle_obj(y_path))))
    feat_selector.fit(X,y)

    mixed_effect_logger.debug("Final size of X: {} y:{}".format(X.shape,y.shape))

    # Indices of the columns the selector kept.
    vocab_selector=feat_selector.get_support(True)
    num_vocab=vocab_selector.shape[0]

    vstack_list=[0]*X.shape[0]
    for ind,X_row in enumerate(X):
        if ind%10==0:
            mixed_effect_logger.info("Done with {}".format(ind))

        row=np.zeros((1,num_vocab**2))
        select_col=X_row[0,vocab_selector].toarray() #convert to dense rep.

        # Compare each nonzero feature value against every feature value;
        # record the elementwise minimum as the co-occurrence.
        for col_ind in range(select_col.shape[1]):
            if not select_col[0,col_ind]:
                continue

            cmp=np.full((1,select_col.shape[1]),fill_value=select_col[0,col_ind])
            # BUG FIX: the original re-assigned select_col here
            # (select_col = np.minimum(select_col, cmp)), so every later
            # column computed its minimum against values already clamped by
            # earlier columns. Write the minimum into `row` without
            # mutating select_col.
            row[0,col_ind*num_vocab:(col_ind+1)*num_vocab]=np.minimum(select_col,cmp)

        vstack_list[ind]=lil_matrix(row)
        # Release the dense scratch buffers before the next big allocation.
        del row,select_col

    return vstack(vstack_list).tocoo()
Code example #4
0
File: webpage_components.py — Project: wangk1/research
def extract_title(reference_db_cls,db_cls):
    """
    Pull the <title> text of each referenced webpage out of URLToGenre and
    save a bag-of-words representation of it to the db_cls database
    (the save itself is currently commented out).

    The reference db's objects must have url and ref_index attributes.
    :param reference_db_cls: source db class providing url/ref_index/short_genres
    :param db_cls: destination db class
    :return: None
    """
    comp_logger.info("Extracting from the database {}, putting into {}".format(reference_db_cls,db_cls))

    bow_transformer=BagOfWords()
    title_not_exists=0

    for counter,ref_obj in enumerate(reference_db_cls.objects.no_cache()):
        # Log progress every 10 records.
        if counter%10==0:
            comp_logger.info("Done with {} titles".format(counter))

        url=ref_obj.url
        ref_index=ref_obj.ref_index
        short_genres=genre_normalizer(ref_obj.short_genres,dim=1)

        # Look up the raw page HTML for this url.
        html=URLToGenre.objects(url=url).only("page")[0].page
        soup=BeautifulSoup(html,"html.parser")

        try:
            title=soup.title.string

            #bag of word
            #title_bow=bow_transformer.get_word_count(title) if title and title.strip() else {}

        except (AttributeError,ValueError):
            # No <title> tag (soup.title is None) or unreadable content.
            title_not_exists+=1
            title_bow={}

        #store into db
        #db_cls(ref_index=ref_index,attr_map=title_bow,short_genres=short_genres).save()

    comp_logger.info("The title does not exists in {} instances".format(title_not_exists))