def tabulate_genre_dist(y, normalize_to_level=1):
    """
    Tabulate the distribution of each genre in a dataset of genre labels.

    Prints an ascii table (genre, count, fraction) to the console.

    :param y: numpy array of genre strings, or an array/sequence of lists of
        genre strings (one list per sample).
    :param normalize_to_level: hierarchy level passed to genre_normalizer.
    :return: None
    """
    genre_to_count = coll.Counter()
    y = genre_normalizer(y, normalize_to_level)
    # np.str was a deprecated alias removed in NumPy 1.24; np.str_ is the
    # supported string scalar type for this subtype test.
    if np.issubdtype(y.dtype, np.str_):
        # Flat array of genre strings: one genre per sample.
        genre_to_count.update(y)
        total = y.shape[0]
    else:
        # Array of lists: flatten and count every genre occurrence.
        genre_to_count.update(i for i in itertools.chain(*y))
        total = sum(len(i) for i in y)
    headers = ["genre", "count", "percent"]
    data = [headers]
    # NOTE: the "percent" column is a fraction in [0, 1] rounded to 2 places.
    data.extend([k, v, round(v / total, 2)] for k, v in genre_to_count.items())
    print(tabulate(data, headers="firstrow"))
def extract_meta_data(reference_db_cls, db_cls):
    """
    For selected webpages in URLToGenre: pull the meta description and
    keyword tags out of each stored page to build a bag-of-words
    representation for it.

    The actual persistence into db_cls is currently disabled (commented out);
    this pass only counts how many pages lack usable meta data.

    :param reference_db_cls: source collection; its objects expose url,
        ref_index and short_genres.
    :param db_cls: destination collection class (currently unused).
    :return: None
    """
    comp_logger.info("Extracting from the database {}, putting into {}".format(reference_db_cls, db_cls))
    bow_transformer = BagOfWords()
    not_found_data = 0
    for doc_num, ref_doc in enumerate(reference_db_cls.objects.no_cache()):
        if doc_num % 10000 == 0:
            comp_logger.info("Done with {} MetaDatas".format(doc_num))
        url = ref_doc.url
        ref_index = ref_doc.ref_index
        short_genres = genre_normalizer(ref_doc.short_genres, dim=1)
        raw_page = URLToGenre.objects(url=url).only("page")[0].page
        soup = BeautifulSoup(raw_page, "html.parser")
        pieces = []
        try:
            # Collect descriptions (both capitalizations) then keywords,
            # preserving that order in the concatenated text. A meta tag
            # without a "content" attribute raises KeyError, handled below.
            for meta_name in ("description", "Description", "keywords"):
                for meta_tag in soup.find_all("meta", {"name": meta_name}):
                    pieces.append(meta_tag["content"])
            pieces = " ".join(pieces)
            #meta_bow=bow_transformer.get_word_count(pieces) if pieces and pieces.strip() else {}
            if not pieces:
                not_found_data += 1
        except (KeyError, AttributeError, ValueError):
            not_found_data += 1
            meta_bow = {}
        #store into db
        #db_cls(ref_index=ref_index,attr_map=meta_bow,short_genres=short_genres).save()
    comp_logger.info("The MetaData does not exists in {} instances".format(not_found_data))
def _generate_mixed_effect_matrix(X_path,y_path,feat_selector):
    """
    Converts X to a COO Matrix of Mixed effect matrix

    Each output row is the flattened num_vocab x num_vocab block for one
    sample, where the block row for feature i holds element-wise minimums of
    feature i's value against the (selected) feature vector.

    :param X_path: path to a pickled sparse feature matrix
    :param y_path: path to the pickled genre labels matching X
    :param feat_selector: sklearn-style selector exposing fit() and get_support()
    :return: scipy COO sparse matrix with shape (n_samples, num_vocab**2)
    """
    mixed_effect_logger.debug("Flattening")
    #Reduce the column count
    X,y,_=flatten_set(*random_pick_samples(unpickle_obj(X_path),genre_normalizer(unpickle_obj(y_path))))
    feat_selector.fit(X,y)
    mixed_effect_logger.debug("Final size of X: {} y:{}".format(X.shape,y.shape))
    #Get the column selector, indices
    vocab_selector=feat_selector.get_support(True)  # indices of the kept columns
    num_vocab=vocab_selector.shape[0]
    # Pre-sized list filled by index; each slot becomes one lil_matrix row.
    vstack_list=[0]*X.shape[0]
    for ind,X_row in enumerate(X):
        # Short-circuit idiom: log progress every 10 rows.
        ind % 10==0 and mixed_effect_logger.info("Done with {}".format(ind))
        row=np.zeros((1,num_vocab**2))
        select_col=X_row[0,vocab_selector].toarray() #convert to dense rep.
        #Compare each index to each row. Record the minimum as cooccurence
        for col_ind in range(0,select_col.shape[1]):
            # Zero-valued features contribute nothing; their block stays zero.
            if not select_col[0,col_ind]:
                continue
            cmp=np.full((1,select_col.shape[1]),fill_value=select_col[0,col_ind])
            # NOTE(review): this overwrites select_col in place, so later
            # col_ind iterations compare against the running minimum rather
            # than the original feature values — confirm this is intended.
            select_col=np.minimum(select_col,cmp)
            row[0,col_ind*num_vocab:(col_ind+1)*num_vocab]=select_col
        vstack_list[ind]=lil_matrix(row)
        # Free the dense temporaries before building the next row.
        del row,select_col
    return vstack(vstack_list).tocoo()
def extract_title(reference_db_cls, db_cls):
    """
    Extract the <title> of each webpage referenced in URLToGenre to build a
    bag-of-words representation for it.

    The save into db_cls is currently disabled (commented out); this pass
    only counts pages whose title is missing or unparsable.

    reference db's object must have url and ref_index attributes

    :param reference_db_cls: source collection of references.
    :param db_cls: destination collection class (currently unused).
    :return: None
    """
    comp_logger.info("Extracting from the database {}, putting into {}".format(reference_db_cls, db_cls))
    bow_transformer = BagOfWords()
    title_not_exists = 0
    for doc_num, ref_doc in enumerate(reference_db_cls.objects.no_cache()):
        if doc_num % 10 == 0:
            comp_logger.info("Done with {} titles".format(doc_num))
        url = ref_doc.url
        ref_index = ref_doc.ref_index
        short_genres = genre_normalizer(ref_doc.short_genres, dim=1)
        raw_page = URLToGenre.objects(url=url).only("page")[0].page
        soup = BeautifulSoup(raw_page, "html.parser")
        try:
            # soup.title is None when the page has no <title>, so .string
            # raises AttributeError, which is counted below.
            title = soup.title.string
            #bag of word
            #title_bow=bow_transformer.get_word_count(title) if title and title.strip() else {}
        except (AttributeError, ValueError):
            title_not_exists += 1
            title_bow = {}
        #store into db
        #db_cls(ref_index=ref_index,attr_map=title_bow,short_genres=short_genres).save()
    comp_logger.info("The title does not exists in {} instances".format(title_not_exists))