def load_X_y_refIndex(path_X, path_y, path_refId):
    """
    Load X, y, and refIndex from their respective pickle files.

    :param path_X: path to the pickled X matrix
    :param path_y: path to the pickled y labels
    :param path_refId: path to the pickled reference index
    :return: X, y, ref_id
    """
    X = unpickle_obj(path_X)
    y = unpickle_obj(path_y)
    ref_id = unpickle_obj(path_refId)

    return X, y, ref_id
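# Illustrative usage only; the pickle paths below are hypothetical placeholders,
# not paths defined elsewhere in this project:
#
#   X, y, ref_id = load_X_y_refIndex("pickle_dir/summary/X_summary_pickle",
#                                    "pickle_dir/summary/y_summary_pickle",
#                                    "pickle_dir/summary/refIndex_summary_pickle")
#   assert X.shape[0] == len(y) == len(ref_id)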
def _generate_mixed_effect_matrix(X_path, y_path, feat_selector):
    """
    Convert X into a sparse COO mixed effect (co-occurrence) matrix.

    :param X_path: path to the pickled X matrix
    :param y_path: path to the pickled y labels
    :param feat_selector: feature selector exposing fit() and get_support()
    :return: COO matrix with one co-occurrence row per sample
    """
    mixed_effect_logger.debug("Flattening")

    # Reduce the column count
    X, y, _ = flatten_set(*random_pick_samples(unpickle_obj(X_path),
                                               genre_normalizer(unpickle_obj(y_path))))

    feat_selector.fit(X, y)
    mixed_effect_logger.debug("Final size of X: {} y: {}".format(X.shape, y.shape))

    # Get the selected column indices
    vocab_selector = feat_selector.get_support(True)
    num_vocab = vocab_selector.shape[0]

    vstack_list = [0] * X.shape[0]
    for ind, X_row in enumerate(X):
        if ind % 10 == 0:
            mixed_effect_logger.info("Done with {}".format(ind))

        row = np.zeros((1, num_vocab ** 2))
        select_col = X_row[0, vocab_selector].toarray()  # convert to dense representation

        # Compare each index to each column. Record the minimum as the co-occurrence
        for col_ind in range(0, select_col.shape[1]):
            if not select_col[0, col_ind]:
                continue

            cmp = np.full((1, select_col.shape[1]), fill_value=select_col[0, col_ind])
            select_col = np.minimum(select_col, cmp)
            row[0, col_ind * num_vocab:(col_ind + 1) * num_vocab] = select_col

        vstack_list[ind] = lil_matrix(row)
        del row, select_col

    return vstack(vstack_list).tocoo()
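# A minimal, self-contained sketch of the per-row loop above, using a plain dense
# vector in place of the sparse selected row. The helper name below is hypothetical
# and not part of the pipeline; it only illustrates how each 1 x num_vocab**2
# co-occurrence row is filled with element-wise minima.
def _cooccurrence_row_sketch(counts):
    """Mirror the inner loop of _generate_mixed_effect_matrix for one dense row."""
    counts = np.asarray(counts, dtype=float).reshape(1, -1)
    n = counts.shape[1]
    row = np.zeros((1, n * n))

    for col_ind in range(n):
        if not counts[0, col_ind]:
            continue
        # clip all counts to the current column's count, exactly as the loop above does
        counts = np.minimum(counts, counts[0, col_ind])
        row[0, col_ind * n:(col_ind + 1) * n] = counts

    return row

# e.g. _cooccurrence_row_sketch([2, 0, 1])[0] -> [2., 0., 1., 0., 0., 0., 1., 0., 1.]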
def num_genre_per_webpage(matrix_path):
    """
    Create a box plot of how many other genres each webpage has, per genre.
    Also, record the co-occurrence of genres with each other.

    :param matrix_path: path to the pickled label matrix
    :return:
    """
    label_matrix = unpickle_obj(matrix_path)

    genre_to_num_webpages = coll.defaultdict(lambda: [])
    for webpage_genre in label_matrix:
        normalized_genre = set([normalize_genre_string(g, 1) for g in webpage_genre])

        for g in normalized_genre:
            if g in bad_genre_set:
                continue

            #if normalized_genre-{g}:
            genre_to_num_webpages[g].append(normalized_genre - {g})

    # Box plot it
    genre_to_num_item_iter = genre_to_num_webpages.items()

    plt.clf()
    plt.figure(1)
    plt.xticks([i for i in range(0, len(genre_to_num_item_iter))],
               [op.itemgetter(0)(i) for i in genre_to_num_item_iter])
    plt.yticks(range(0, 6))
    plt.tick_params(axis="both", which="major", labelsize=5)

    for c, (g, counts) in enumerate(genre_to_num_item_iter):
        add_bar_plot(c, [len(gs) for gs in counts])

    plt.savefig("C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\genre_analysis\\genre_dist.pdf")

    #print
    print(genre_to_num_webpages)
        for (w, c) in sorted_list:
            file.write("{}, {}\n".format(w, c))


if __name__ == "__main__":
    dmoz_alexa_similarity()
    exit(0)

    path = "C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_chi_top1cls_10000"
    outpath = "C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_2000_chi2\\miss_plt"
    y_path = "C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\pickle_dir\\y_summary_pickle"

    y = unpickle_obj(y_path)
    tabulate_genre_dist(y)

    #num_genre_per_webpage("C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\pickle_dir\\y_summary_pickle")
    #dmoz_alexa_similarity()
    #
    #prob_dict=load_prob_dict()
    #for i in range(1,5):
        #consensus_count,consensus_total=consensus_class_per_genre(path,filter_func=lambda x:len(x)==i)
        #plot_consensus_percentile(consensus_count,consensus_total)

    #multi_class_misprediction_freq(path)
    #plot_miss_per_genre(path,outpath,classifiers="LogisticRegression")
__author__ = 'Kevin'

import itertools

from analytics.classification_results.res_iterator import RightResultsIter, WrongResultsIter
from misc_scripts.assign_ref_index import global_ref_id
from misc_scripts.remove_summary_duplicates import remove_summary_duplicates_in_urlbow
from data.util import unpickle_obj
from util.base_util import normalize_genre_string

if __name__ == "__main__":
    y = unpickle_obj("C:/Users/wangk1/Desktop/Research/research/pickle_dir/summary/y_summary_pickle")

    num = 0
    for y_i in y:
        if len(set(normalize_genre_string(n) for n in y_i)) == 2:
            num += 1

    print(num)

    #remove_summary_duplicates_in_urlbow()
    #assign_ref_index_to_each_url()

    """
    path="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_chi_top4cls_10000"
    #path="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\classification_res\\summary_100_chi_truncated_lsa"

    classifier="LogisticRegression"
    num_top=1

    correct=0
    wrong=0
def unsupervised(settings, train_set, clusterer, clustering_alg_cls):
    clustering_logger.info("Unsupervised Algorithm training size: {}".format(train_set.X.shape))

    for num_cluster in sorted(settings.num_clusters, reverse=True):
        X, y, ref_ids = train_set.to_matrices()

        additional_notes = ""
        if train_set.X.shape[0] <= settings.spectre_clustering_limit:
            clustering_alg = AgglomerativeClustering(n_clusters=num_cluster)
            additional_notes = "_agglomerative"
            X = X.toarray()
        else:
            clustering_alg = clustering_alg_cls(n_clusters=num_cluster)

        clustering_logger.info("Using {}".format(str(clustering_alg) + additional_notes))

        res_labels = clustering_alg.fit_predict(X)
        occurence_dict = clusterer.get_clusters_genre_distribution(y, res_labels)

        # The directory to store the results of clustering
        res_dir = os.path.join(UNSUPERVISED_DIR, settings.clustering_alg, *settings.parent_clusters)
        os.makedirs(res_dir, exist_ok=True)

        # ELIMINATE clusters with fewer than 12 pages
        for cluster_name, cluster_genre_count in list(occurence_dict.items()):
            total_count_in_cluster = sum((count for genre, count in cluster_genre_count.items()))

            if total_count_in_cluster < 12:
                del occurence_dict[cluster_name]
            else:
                path = os.path.join(res_dir, "{}_{}_pages".format(num_cluster, cluster_name))

                # OUTPUT the pages in the current cluster
                clusterer.output_pages_in_cluster(path, train_set.ref_index[res_labels == cluster_name])

        res_file = "{}/{}.pdf".format(res_dir, str(num_cluster))
        clusterer.generate_cluster_distribution_graphs(res_file, occurence_dict, res_labels)

        # Output closeness metrics
        if additional_notes == "":
            inter_cluster, inter_cluster_count, intra_cluster, intra_cluster_count = \
                Clustering().cluster_closeness(clustering_alg.cluster_centers_, X, res_labels)
            clusterer.output_cluster_closeness("{}/{}.txt".format(res_dir, num_cluster), inter_cluster,
                                               inter_cluster_count, intra_cluster, intra_cluster_count)

        # Do a DFS on clusters bigger than the prescribed size
        if settings.break_up_clusters:
            breakup_candidate = []
            for i in range(0, num_cluster):
                if np.sum(res_labels == i) >= settings.max_cluster_size:
                    breakup_candidate.append(i)

            X_path = os.path.join(res_dir, "X")
            y_path = os.path.join(res_dir, "y")
            ref_indexes_path = os.path.join(res_dir, "ref_indexes")

            clustering_logger.info("Pickling X,y,ref_index to conserve memory")
            pickle_obj(train_set.X, X_path)
            pickle_obj(train_set.y, y_path)
            pickle_obj(train_set.ref_index, ref_indexes_path)

            for cluster_name in breakup_candidate:
                clustering_logger.info("Breaking up cluster {} of size greater than {}"
                                       .format(cluster_name, settings.max_cluster_size))

                settings.parent_clusters.append("{}_{}".format(num_cluster, cluster_name))

                selector = (res_labels == cluster_name)
                train_set.X = train_set.X[selector]
                train_set.y = train_set.y[selector]
                train_set.ref_index = train_set.ref_index[selector]

                unsupervised(settings, train_set, clusterer, clustering_alg_cls)

                settings.parent_clusters.pop()

                train_set.X = unpickle_obj(X_path)
                train_set.y = unpickle_obj(y_path)
                train_set.ref_index = unpickle_obj(ref_indexes_path)

            # Remove the cache files
            os.remove(ref_indexes_path)
            os.remove(X_path)
            os.remove(y_path)
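# A hedged sketch of the break-up selection performed above; the helper name and
# signature are hypothetical, not part of the clustering module. It shows that
# choosing candidates reduces to counting members per cluster label and keeping
# the labels whose size reaches the threshold.
def _oversized_clusters_sketch(res_labels, max_cluster_size):
    """Return cluster labels whose member count is >= max_cluster_size."""
    labels = np.asarray(res_labels)
    uniques, counts = np.unique(labels, return_counts=True)
    return [label for label, count in zip(uniques, counts) if count >= max_cluster_size]

# e.g. _oversized_clusters_sketch([0, 0, 0, 1, 2, 2], 3) -> [0]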
def load_X_y(path_X, path_y):
    return unpickle_obj(path_X), unpickle_obj(path_y)
#CLASSIFICATION, adjust weights
classifier_util = ClassifierUtil()

"""
LOAD DATA, preprocess
"""
#WARNING: REF INDEX for each individual X set must match row to row
Xs = []
ys = []
ref_indexes_unmatched = []
ref_indexes = []
for setting in settings:
    supervised_logger.info("Loading data for {}".format(setting))

    X = unpickle_obj("pickle_dir\\{}\\X_{}_pickle".format(setting.feature_selection, setting.feature_selection))
    ref_index = unpickle_obj("pickle_dir\\{}\\refIndex_{}_pickle".format(*itertools.repeat(setting.feature_selection, 2)))
    y = unpickle_obj("pickle_dir\\{}\\y_{}_pickle".format(*itertools.repeat(setting.feature_selection, 2)))
    y = np.array([list(set((normalize_genre_string(g, 1) for g in g_list))) for g_list in y])

    # Filter out unwanted genres
    X_filtered, y_filtered, ref_index_filtered = filter_genres(X, y, ref_index, ignore_genre)

    ref_indexes_unmatched.append(ref_index_filtered)
    Xs.append(X_filtered)
    ys.append(y_filtered)

# Match ref ids
supervised_logger.info("Making ref indexes match for the data sets")
Xs, ys, ref_indexes = match_sets_based_on_ref_id(Xs, ys, ref_indexes_unmatched)

# Make sure ref indexes match
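# A hedged sketch of what matching by ref index amounts to (the project's
# match_sets_based_on_ref_id may differ): keep only the ref ids present in every
# set and reorder each X, y, ref_index to that common id order. All names below
# are hypothetical and assume 1-D arrays of ids plus row-indexable X and y.
def _match_by_ref_id_sketch(Xs, ys, ref_indexes):
    common = set(ref_indexes[0])
    for ref_index in ref_indexes[1:]:
        common &= set(ref_index)
    common = sorted(common)

    matched_Xs, matched_ys, matched_refs = [], [], []
    for X, y, ref_index in zip(Xs, ys, ref_indexes):
        position = {ref_id: row for row, ref_id in enumerate(ref_index)}
        order = [position[ref_id] for ref_id in common]
        matched_Xs.append(X[order])
        matched_ys.append(y[order])
        matched_refs.append(np.array(common))

    return matched_Xs, matched_ys, matched_refs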