from util.base_util import normalize_genre_string
import os

import numpy as np

# Directory holding the pickled data sets produced by the webscraper.
pickle_dir = "C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\pickle_dir"

if __name__ == "__main__":
    # Field-name mapping from the URLBow mongo documents to the attribute
    # names used by the training pipeline.
    # (was defined twice in the original; the duplicate has been removed)
    mapping = {"short_genres": "short_genre", "index": "ref_index", "bow": "attr_map"}
    # s=SourceMapper(URLBow.objects(),mapping)

    X_pickle_path = os.path.join(pickle_dir, "X_summary_pickle")
    y_pickle_path = os.path.join(pickle_dir, "y_summary_pickle")
    ref_index_pickle_path = os.path.join(pickle_dir, "refIndex_summary_pickle")

    label = "summary_unsupervised_chi_top1cls_10000"

    # generate_random_sample(unpickle_obj(X_pickle_path),unpickle_obj(y_pickle_path),unpickle_obj(ref_index_pickle_path),1000)

    # Load the training set, then feature-select down to the 10000 best
    # attributes by chi-squared score.
    train_set = Training(label, pickle_dir=pickle_dir)
    train_set.load_training()

    # Normalize each sample's genres to level-1 genre strings, dropping
    # duplicates within a sample.
    train_set.y = np.array([list(set(normalize_genre_string(genre, 1) for genre in g_list))
                            for g_list in train_set.y])
    train_set.X = chi_squared_feature_select(train_set.X, train_set.y, k_best=10000)

    # NOTE(review): mixes the public X/ref_index accessors with the private
    # _X/_ref_index attributes — presumably equivalent on Training; confirm.
    params = GraphCutParams(X=train_set.X, y=train_set.y, ref_id=train_set._ref_index,
                            k_closest_neighbors=4, vocab_size=train_set._X.shape[1],
                            num_clusters=3)
    alpha_beta_swap(params)
def load_training_testing(Xs,ys,ref_indexes,settings,train_set_index,test_set_index):
    """
    Load training and testing set based on indexes provided by crossvalidation

    :return: List of train_set and test_set objs
    """
    train_sets = []
    test_sets = []
    for i, setting in enumerate(settings):
        # Slice this setting's data into the training portion...
        training = Training(setting, pickle_dir=setting.pickle_dir)
        training.X = Xs[i][train_set_index]
        training.y = ys[i][train_set_index]
        training.ref_index = ref_indexes[i][train_set_index]
        train_sets.append(training)

        # ...and the complementary testing portion.
        testing = Testing(setting, pickle_dir=setting.pickle_dir)
        testing.X = Xs[i][test_set_index]
        testing.y = ys[i][test_set_index]
        testing.ref_index = ref_indexes[i][test_set_index]
        test_sets.append(testing)

    # Flatten every training set in place.
    for training in train_sets:
        flatten_train_set(training)

    # Sanity check: labels and reference indexes must agree across all
    # settings, for both the training and the testing splits.
    classification_logger.info("Checking the sets match")
    ys = [training.y for training in train_sets]
    ref_indexes = [training.ref_index for training in train_sets]
    test_ys = np.array([testing.y for testing in test_sets])
    test_ref_indexes = [testing.ref_index for testing in test_sets]

    for pos, group in enumerate((ys, ref_indexes, test_ys, test_ref_indexes)):
        first = group[0]
        if not all((member == first).all() for member in group[1:]):
            raise AttributeError("NOT MATCH FOR {} ELEMENT".format(pos))

    return train_sets, test_sets

# def select_training_testing_sets(settings,Xs,y,ref_index,num,do_pickle=True):
#     """
#     Randomly choose from a super set of data and split it into a training set of size num. The remainder will become
#     the Test set. Uses _pick_random_samples
#
#     :param setting:
#     :param X:
#     :param y:
#     :param ref_index:
#     :param num:
#     :return: tuple_training,tuple_testing
#     """
#
#     selector=np.not_equal(ref_index,None)
#     ref_index=ref_index[selector]
#     Xs=[X[selector] for X in Xs]
#     y=y[selector]
#
#     train_Xs,train_y,train_ref_index,test_Xs,test_y,test_ref_index=_pick_random_samples(Xs,y,ref_index,num)
#
#     train_objs=[]
#     test_objs=[]
#     if do_pickle:
#         for c,setting in enumerate(settings):
#             train_X=train_Xs[c]
#             test_X=test_Xs[c]
#
#             _pickle_training_testing(setting,train_X,train_y,train_ref_index,test_X,test_y,test_ref_index)
#
#             training_obj=Training(label=setting,pickle_dir=setting.pickle_dir)
#             training_obj.set_data(train_X,train_y,train_ref_index)
#
#             testing_obj=Testing(label=setting,pickle_dir=setting.pickle_dir)
#             testing_obj.set_data(test_X,test_y,test_ref_index)
#
#             train_objs.append(training_obj)
#             test_objs.append(testing_obj)
#
#     return train_objs,test_objs
# --- Clustering configuration ---
# NOTE(review): this chunk reads `settings`, `PICKLE_DIR`, `clustering_logger`
# etc. from earlier in the file — presumably defined above this chunk; verify.
clustering_alg = KMeans
settings.num_clusters = [16]  # was list({16}), an identical but roundabout literal
settings.num_top_words = 20  # LDA only
settings.max_cluster_size = 10000  # the cluster will be further broken up if it is greater than this size
settings.break_up_clusters = True
settings.spectre_clustering_limit = 15000  # if the cluster is less than 15K in size, use spectre clustering instead

# --- LOAD DATA ---
# generate_random_sample(unpickle_obj(X_pickle_path),unpickle_obj(y_pickle_path),unpickle_obj(ref_index_pickle_path),50000)
train_set = Training(settings, pickle_dir=PICKLE_DIR)
train_set.load_training()

# --- FEATURE SELECTION: keep the best k attributes by chi-squared score ---
best_k_attr = 10000
# `k` passed by keyword: the positional form is deprecated in modern sklearn.
feature_selector = Pipeline([("chi2", SelectKBest(chi2, k=best_k_attr))])
clustering_logger.info("Choosing best {} features".format(best_k_attr))

# --- NORMALIZING THE Y to level-1 genre strings ---
clustering_logger.debug("Normalizing to LV1")
# (dropped a redundant `(row for row in train_set.y)` generator wrapper;
# iterating train_set.y directly is equivalent)
train_set.y = np.array([[normalize_genre_string(g, 1) for g in row]
                        for row in train_set.y])

clusterer = Clustering()
clusterer.feature_selection(train_set, feature_selector, fit=True)

# Topic modelling via LDA over the selected features.
lda_alg = LDA(n_topics=settings.num_clusters[0], n_iter=500, random_state=1)
lda(lda_alg, train_set, settings.num_top_words)

# unsupervised(train_set=train_set, settings=settings,clusterer=clusterer, clustering_alg_cls=clustering_alg)