def load_training_testing(Xs, ys, ref_indexes, settings, train_set_index, test_set_index):
    """
    Load training and testing set based on indexes provided by crossvalidation

    One Training/Testing pair is built per entry of ``settings``; the pair at position
    ``c`` takes its data from ``Xs[c]``, ``ys[c]`` and ``ref_indexes[c]``, indexed with
    ``train_set_index`` and ``test_set_index`` respectively. Every training set is then
    passed to ``flatten_train_set`` and the labels and reference indexes are verified
    to be identical across all settings.

    :param Xs: Sequence with one feature collection per setting
    :param ys: Sequence with one label collection per setting
    :param ref_indexes: Sequence with one reference-index collection per setting
    :param settings: Sequence of setting objs, each exposing ``pickle_dir``
    :param train_set_index: Index used to select the training samples
    :param test_set_index: Index used to select the testing samples
    :return: List of train_set and test_set objs (two empty lists when ``settings`` is empty)
    :raises AttributeError: If y or ref_index differ between settings, for either the
        training or the testing sets
    """
    train_sets = []
    test_sets = []
    for c, setting in enumerate(settings):
        train_set = Training(setting, pickle_dir=setting.pickle_dir)
        train_set.X = Xs[c][train_set_index]
        train_set.y = ys[c][train_set_index]
        train_set.ref_index = ref_indexes[c][train_set_index]

        test_set = Testing(setting, pickle_dir=setting.pickle_dir)
        test_set.X = Xs[c][test_set_index]
        test_set.y = ys[c][test_set_index]
        test_set.ref_index = ref_indexes[c][test_set_index]

        train_sets.append(train_set)
        test_sets.append(test_set)

    #flatten training
    for train_set in train_sets:
        flatten_train_set(train_set)

    #make sure the sets match
    classification_logger.info("Checking the sets match")
    # The position of each group is reported in the error message, so the order matters:
    # 0 = training y, 1 = training ref_index, 2 = testing y, 3 = testing ref_index.
    groups_to_check = (
        [train_set.y for train_set in train_sets],
        [train_set.ref_index for train_set in train_sets],
        [test_set.y for test_set in test_sets],
        [test_set.ref_index for test_set in test_sets],
    )
    for c, group in enumerate(groups_to_check):
        if not group:
            # No settings were supplied, so there is nothing to compare.
            continue

        first = group[0]
        # np.array_equal returns False on a shape mismatch, so a length difference is
        # reported through the error below instead of failing inside the comparison.
        if not all(np.array_equal(other, first) for other in group[1:]):
            raise AttributeError("NOT MATCH FOR {} ELEMENT".format(c))

    return train_sets, test_sets


# def select_training_testing_sets(settings,Xs,y,ref_index,num,do_pickle=True):
#     """
#     Randomly choose from a super set of data and split it into a training set of size num. The remainder will become
#         the Test set. Uses _pick_random_samples
#
#     :param setting:
#     :param X:
#     :param y:
#     :param ref_index:
#     :param num:
#     :return: tuple_training,tuple_testing
#     """
#
#     selector=np.not_equal(ref_index,None)
#     ref_index=ref_index[selector]
#     Xs=[X[selector] for X in Xs]
#     y=y[selector]
#
#     train_Xs,train_y,train_ref_index,test_Xs,test_y,test_ref_index=_pick_random_samples(Xs,y,ref_index,num)
#
#     train_objs=[]
#     test_objs=[]
#     if do_pickle:
#         for c,setting in enumerate(settings):
#             train_X=train_Xs[c]
#             test_X=test_Xs[c]
#
#             _pickle_training_testing(setting,train_X,train_y,train_ref_index,test_X,test_y,test_ref_index)
#
#             training_obj=Training(label=setting,pickle_dir=setting.pickle_dir)
#             training_obj.set_data(train_X,train_y,train_ref_index)
#
#             testing_obj=Testing(label=setting,pickle_dir=setting.pickle_dir)
#             testing_obj.set_data(test_X,test_y,test_ref_index)
#
#             train_objs.append(training_obj)
#             test_objs.append(testing_obj)
#
#     return train_objs,test_objs
from util.base_util import normalize_genre_string

# NOTE(review): absolute path on a single developer machine; the script below only
# works where this directory exists.
pickle_dir = "C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\pickle_dir"

if __name__ == "__main__":
    # Script entry point: load a pickled training set, rewrite its labels, run
    # chi-squared feature selection and hand the result to alpha_beta_swap.
    mapping = {"short_genres": "short_genre", "index": "ref_index", "bow": "attr_map"}

    #s=SourceMapper(URLBow.objects(),mapping)
    # These paths are only referenced by the commented-out sampling call below.
    X_pickle_path = os.path.join(pickle_dir, "X_summary_pickle")
    y_pickle_path = os.path.join(pickle_dir, "y_summary_pickle")
    ref_index_pickle_path = os.path.join(pickle_dir, "refIndex_summary_pickle")

    label = "summary_unsupervised_chi_top1cls_10000"

    #generate_random_sample(unpickle_obj(X_pickle_path),unpickle_obj(y_pickle_path),unpickle_obj(ref_index_pickle_path),1000)

    #load training, feature selection
    train_set = Training(label, pickle_dir=pickle_dir)
    train_set.load_training()

    # Replace every sample's genre list with the de-duplicated results of
    # normalize_genre_string(genre, 1).
    # NOTE(review): the per-sample lists may differ in length after de-duplication;
    # confirm np.array produces the intended array shape/dtype in that case.
    train_set.y = np.array([
        list({normalize_genre_string(genre, 1) for genre in g_list})
        for g_list in train_set.y
    ])
    train_set.X = chi_squared_feature_select(train_set.X, train_set.y, k_best=10000)

    # NOTE(review): ref_id and vocab_size read the private _ref_index / _X attributes
    # while X and y above go through the public names — presumably these are the
    # backing fields of the same data; confirm against the Training class.
    params = GraphCutParams(
        X=train_set.X,
        y=train_set.y,
        ref_id=train_set._ref_index,
        k_closest_neighbors=4,
        vocab_size=train_set._X.shape[1],
        num_clusters=3,
    )

    alpha_beta_swap(params)