# balance the sample sizes across classes
data, classes, others = cl.balance_sample_size(data, classes, others=None,
                                               min_size_given=None, rng=rng)
print "data.shape"
print data.shape
print "classes.shape"
print classes.shape
print "numpy.unique(classes)"
print numpy.unique(classes)

# partition the data
train_set_x_org, train_set_y_org, valid_set_x_org, valid_set_y_org, \
    test_set_x_org, test_set_y_org = cl.partition_train_valid_test(
        data, classes, ratio=(2, 1, 1), rng=rng)
print "valid set shape"
print valid_set_x_org.shape
print valid_set_y_org.shape

# normalization: scale each column to [0,1] using the training-set min/max
train_set_x_org, data_min, data_max = cl.normalize_col_scale01(train_set_x_org, tol=1e-10)
valid_set_x_org, _, _ = cl.normalize_col_scale01(valid_set_x_org, tol=1e-10,
                                                 data_min=data_min, data_max=data_max)
test_set_x_org, _, _ = cl.normalize_col_scale01(test_set_x_org, tol=1e-10,
                                                data_min=data_min, data_max=data_max)
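# Illustrative sketch: normalize_col_scale01 presumably performs a column-wise min-max scaling
# to [0,1]; the training-set data_min/data_max are reused for the validation and test sets so
# that all three splits go through the same affine transform. A plain-numpy equivalent of that
# assumed behaviour (the helper name and tol handling are assumptions, not the library's code):
def scale01_sketch(x, data_min=None, data_max=None, tol=1e-10):
    """Column-wise min-max scaling; reuses the given data_min/data_max when provided."""
    if data_min is None:
        data_min = x.min(axis=0)
    if data_max is None:
        data_max = x.max(axis=0)
    return (x - data_min) / (data_max - data_min + tol), data_min, data_max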
print numpy.unique(classes)
classes_unique, classes = cl.change_class_labels(classes)
print numpy.unique(classes)

# set random state
# numpy.random.seed(1000)
rng = numpy.random.RandomState(2000)
data, classes, others = cl.balance_sample_size(data, classes, others=None,
                                               min_size_given=None, rng=rng)
print data.shape
print numpy.unique(classes)

# partition the data
train_set_x_org, train_set_y_org, valid_set_x_org, valid_set_y_org, \
    test_set_x_org, test_set_y_org = cl.partition_train_valid_test(
        data, classes, ratio=(2, 1, 1), rng=rng)

# normalization
train_set_x_org, data_min, data_max = cl.normalize_col_scale01(train_set_x_org, tol=1e-10)
valid_set_x_org, _, _ = cl.normalize_col_scale01(valid_set_x_org, tol=1e-10,
                                                 data_min=data_min, data_max=data_max)
test_set_x_org, _, _ = cl.normalize_col_scale01(test_set_x_org, tol=1e-10,
                                                data_min=data_min, data_max=data_max)

# train
# setting the parameters
pretrain_lr = 0.01
finetune_lr = 0.1
alpha = 0.1
lambda2 = 1.0
alpha1 = 0.001
alpha2 = 0.0
n_hidden = [256, 64, 16]
print data.shape
print numpy.unique(classes)

# 10-fold cross-validation
kfolds = 10
ind_folds = cl.kfold_cross_validation(classes, k=kfolds, shuffle=True, rng=rng)

for i in range(kfolds):
    test_set_x_org = data[ind_folds == i, :]
    test_set_y_org = classes[ind_folds == i]
    train_set_x_org, train_set_y_org, valid_set_x_org, valid_set_y_org, _, _ = \
        cl.partition_train_valid_test(data[ind_folds != i, :], classes[ind_folds != i],
                                      ratio=(3, 1, 0), rng=rng)

    # normalization
    train_set_x_org, data_min, data_max = cl.normalize_col_scale01(train_set_x_org, tol=1e-10)
    valid_set_x_org, _, _ = cl.normalize_col_scale01(valid_set_x_org, tol=1e-10,
                                                     data_min=data_min, data_max=data_max)
    test_set_x_org, _, _ = cl.normalize_col_scale01(test_set_x_org, tol=1e-10,
                                                    data_min=data_min, data_max=data_max)
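# A hypothetical helper: the per-fold training and testing calls are not shown in this fragment.
# Assuming each fold yields a (y_true, y_pred) pair of label arrays (as the convolutional_mlp
# example later in this section returns), pooling the folds gives an overall cross-validation
# accuracy. The helper name and argument layout are illustrative, not part of the library.
def pooled_cv_accuracy(fold_results):
    """fold_results: list of (y_true, y_pred) numpy label arrays, one pair per fold."""
    y_true = numpy.concatenate([t for t, _ in fold_results])
    y_pred = numpy.concatenate([p for _, p in fold_results])
    return numpy.mean(y_true == y_pred)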
print data.shape
print numpy.unique(classes)

# normalization
data[:, 0:wid] = numpy.absolute(data[:, 0:wid])  # change the RNA-seq minus track to positive values
num_feat = len(features)
cl.normalize_matrical_samples(data, num_feat, method="scale01")

# 10-fold CV
kfolds = 10
ind_folds = cl.kfold_cross_validation(classes, k=kfolds, shuffle=True, rng=rng)

for i in range(kfolds):
    test_set_x_org = data[ind_folds == i, :]
    test_set_y_org = classes[ind_folds == i]
    train_set_x_org, train_set_y_org, valid_set_x_org, valid_set_y_org, _, _ = \
        cl.partition_train_valid_test(data[ind_folds != i, :], classes[ind_folds != i],
                                      ratio=(3, 1, 0), rng=rng)

    # parameter setting
    pretrain_lr = 0.04         # 0.1, 0.08, 0.06, 0.04, 0.02, 0.01
    finetune_lr = 0.04         # 0.1, 0.08, 0.06, 0.04, 0.02, 0.01
    alpha = 0.1
    pretraining_epochs = 100   # 20, 50, 100
    training_epochs = 100      # 50, 100, 200, 500
    nkerns = [4, 4, 8]         # [4,4,4], [4,6,8]
    batch_size = 100           # 100, 200
    receptive_fields = ((1, 4), (1, 4), (1, 4))  # ((1,4),(1,4),(1,4)), ((1,4),(1,4),(1,2)), ((1,8),(1,4),(1,2))
    poolsizes = ((1, 4), (1, 4), (1, 2))         # ((1,4),(1,4),(1,4)), ((1,4),(1,4),(1,2)), ((1,6),(1,4),(1,2))
    full_hidden_sub = [16]     # [8], [16], [32], [64]
    full_hidden_all = [32]     # [8], [16], [32], [64]
    max_num_epoch_change_learning_rate = 50  # 20, 50, 80; not crucial
    max_num_epoch_change_rate = 0.8
    learning_rate_decay_rate = 0.8
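    # Illustrative check: assuming valid (no-padding) convolutions and non-overlapping pooling
    # (the usual Theano LeNet convention), the width of the feature maps after each conv+pool
    # layer can be traced for a given input width. "input_width" is a hypothetical example
    # value, not a quantity taken from the data loaded above.
    input_width = 1000
    width = input_width
    for (_, filter_w), (_, pool_w) in zip(receptive_fields, poolsizes):
        width = (width - filter_w + 1) // pool_w  # conv output width, then pooled
        print width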
filename="/home/yifengli/research/dnashape/result/Data_1000bp.txt"; data=numpy.loadtxt(filename,delimiter='\t',dtype='float16') filename="/home/yifengli/research/dnashape/result/Classes_1000bp.txt"; classes=numpy.loadtxt(filename,delimiter='\t',dtype=str) filename="/home/yifengli/research/dnashape/result/Features.txt"; features=numpy.loadtxt(filename,delimiter='\t',dtype=str) # change class labels given=["Enhancer","EnhancerFalse"] data,classes=cl.take_some_classes(data,classes,given) given={"Enhancer":0,"EnhancerFalse":1} classes=cl.change_class_labels_to_given(classes,given) train_set_x_org,train_set_y_org,valid_set_x_org,valid_set_y_org,test_set_x_org,test_set_y_org \ =cl.partition_train_valid_test(data,classes,ratio=(1,1,1)) del data gc_collect() rng=numpy.random.RandomState(1000) numpy.warnings.filterwarnings('ignore') # train classifier,training_time=convolutional_mlp.train_model( train_set_x_org=train_set_x_org, train_set_y_org=train_set_y_org, valid_set_x_org=valid_set_x_org, valid_set_y_org=valid_set_y_org, n_row_each_sample=4, learning_rate=0.1, alpha=0.01, n_epochs=1000, rng=rng, nkerns=[4,4,8],batch_size=500, receptive_fields=((2,8),(2,8),(2,2)),poolsizes=((1,8),(1,8),(1,2)),full_hidden=8) # test test_set_y_pred,test_set_y_pred_prob,test_time=convolutional_mlp.test_model(classifier,test_set_x_org)
def train(self, train_set_x_org=None, train_set_y_org=None, features=None,
          num_samplings=100, randomize_method="random_rescaling",
          random_rescaling_alpha=0.5, random_sampling_portion=0.66,
          learning_rate=0.1, alpha=0.01, lambda1=0.001, lambda2=1.0,
          alpha1=0.001, alpha2=0.01, n_hidden=[256, 128, 16], n_epochs=1000,
          batch_size=100, activation_func="relu",
          rng=numpy.random.RandomState(100),
          dfs_select_method="top_num", dfs_threshold=0.001, dfs_top_num=10,
          max_num_epoch_change_learning_rate=80, max_num_epoch_change_rate=0.8,
          learning_rate_decay_rate=0.8):
    """
    Train the randomized DFS.

    num_samplings: int, number of reruns of DFS.
    randomize_method: string, the randomizing method, can be one of
    {"random_rescaling", "random_sampling", "random_sampling_and_random_rescaling"}.
    """
    if randomize_method == "random_rescaling":
        # with pure rescaling, the train/validation split stays fixed across runs
        train_set_x, train_set_y, valid_set_x, valid_set_y, _, _ = cl.partition_train_valid_test(
            train_set_x_org, train_set_y_org, ratio=(3, 1, 0), rng=rng)

    self.num_samplings = num_samplings
    self.randomize_method = randomize_method
    self.random_rescaling_alpha = random_rescaling_alpha
    self.random_sampling_portion = random_sampling_portion
    self.learning_rate = learning_rate
    self.alpha = alpha
    self.lambda1 = lambda1
    self.lambda2 = lambda2
    self.alpha1 = alpha1
    self.alpha2 = alpha2
    self.n_hidden = n_hidden
    self.n_epochs = n_epochs
    self.batch_size = batch_size
    self.activation_func = activation_func
    self.max_num_epoch_change_learning_rate = max_num_epoch_change_learning_rate
    self.max_num_epoch_change_rate = max_num_epoch_change_rate
    self.learning_rate_decay_rate = learning_rate_decay_rate
    self.features = features
    self.classifiers = []
    self.feature_counts = numpy.zeros(shape=(self.n_in,), dtype=int)
    self.feature_weights = numpy.zeros(shape=(self.n_in, self.num_samplings), dtype=float)
    self.training_time = 0
    self.classes_unique = numpy.unique(train_set_y_org)
    self.rescale_Ws = []

    for ns in range(self.num_samplings):
        print "The {0}-th run of randomized DFS...".format(ns)
        rng_ns = numpy.random.RandomState(ns)
        # generate a subsample of data points
        if randomize_method == "random_sampling" or randomize_method == "random_sampling_and_random_rescaling":
            # sample data
            train_set_x, train_set_y, ind_train, _ = cl.sampling(
                train_set_x_org, train_set_y_org, others=None,
                portion=random_sampling_portion, max_size_given=None, rng=rng_ns)
            valid_set_x = numpy.delete(train_set_x_org, ind_train, axis=0)
            valid_set_y = numpy.delete(train_set_y_org, ind_train)
            # reorder the sampled data
            train_set_x, train_set_y, _ = cl.sort_classes(train_set_x, train_set_y)
        if randomize_method == "random_rescaling" or randomize_method == "random_sampling_and_random_rescaling":
            rescale_ws = rng_ns.uniform(low=random_rescaling_alpha, high=1.0, size=(1, self.num_features))
            # NOTE: the features are multiplied by the random weights here;
            # whether division would be more appropriate is left for future work
            train_set_x = train_set_x * rescale_ws
            valid_set_x = valid_set_x * rescale_ws
            self.rescale_Ws.append(rescale_ws)

        # run DFS on the randomized training and validation sets
        classifier, training_time = deep_feat_select_mlp.train_model(
            train_set_x_org=train_set_x, train_set_y_org=train_set_y,
            valid_set_x_org=valid_set_x, valid_set_y_org=valid_set_y,
            learning_rate=learning_rate, alpha=alpha,
            lambda1=lambda1, lambda2=lambda2, alpha1=alpha1, alpha2=alpha2,
            n_hidden=n_hidden, n_epochs=n_epochs, batch_size=batch_size,
            activation_func=activation_func, rng=rng_ns,
            max_num_epoch_change_learning_rate=max_num_epoch_change_learning_rate,
            max_num_epoch_change_rate=max_num_epoch_change_rate,
            learning_rate_decay_rate=learning_rate_decay_rate)
        features_selected, logind_selected, weights_selected, weights = deep_feat_select_mlp.select_features(
            classifier, features, select_method=dfs_select_method,
            threshold=dfs_threshold, top_num=dfs_top_num)
        print weights
        self.classifiers.append(classifier)
        self.feature_counts = self.feature_counts + numpy.array(logind_selected, dtype=int)
        print self.feature_counts
        self.feature_weights[:, ns] = weights
        self.training_time = self.training_time + training_time

    # final clean up: the importance of a feature is the fraction of runs in which it was selected
    self.feature_importance = self.feature_counts / float(self.num_samplings)
    print self.feature_importance
    return self.feature_importance, self.feature_weights, self.training_time
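# Illustrative post-processing, assuming the arrays returned by train() above: rank features by
# how often they were selected across the randomized runs, breaking ties by mean absolute weight.
# The helper name is a hypothetical addition; "features" is the array of feature names passed to train().
def rank_features_sketch(feature_importance, feature_weights, features):
    mean_abs_w = numpy.mean(numpy.absolute(feature_weights), axis=1)
    # lexsort uses the last key as the primary sort key; reverse for descending order
    order = numpy.lexsort((mean_abs_w, feature_importance))[::-1]
    return [(features[j], feature_importance[j], mean_abs_w[j]) for j in order]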
filename = "/home/yifengli/research/dnashape/result/Data_1000bp.txt" data = numpy.loadtxt(filename, delimiter='\t', dtype='float16') filename = "/home/yifengli/research/dnashape/result/Classes_1000bp.txt" classes = numpy.loadtxt(filename, delimiter='\t', dtype=str) filename = "/home/yifengli/research/dnashape/result/Features.txt" features = numpy.loadtxt(filename, delimiter='\t', dtype=str) # change class labels given = ["Enhancer", "EnhancerFalse"] data, classes = cl.take_some_classes(data, classes, given) given = {"Enhancer": 0, "EnhancerFalse": 1} classes = cl.change_class_labels_to_given(classes, given) train_set_x_org,train_set_y_org,valid_set_x_org,valid_set_y_org,test_set_x_org,test_set_y_org \ =cl.partition_train_valid_test(data,classes,ratio=(1,1,1)) del data gc_collect() rng = numpy.random.RandomState(1000) numpy.warnings.filterwarnings('ignore') # train classifier, training_time = convolutional_mlp.train_model( train_set_x_org=train_set_x_org, train_set_y_org=train_set_y_org, valid_set_x_org=valid_set_x_org, valid_set_y_org=valid_set_y_org, n_row_each_sample=4, learning_rate=0.1, alpha=0.01, n_epochs=1000,