by concatnating the rows of the original sample. [2]. A txt file including the class labels. Each row is a string (white space not allowed) as the class label of the corresponding row in [1]. [3]. A txt file including the name of features. Each row is a string (white space not allowed) as the feature name of the corresponding column in [1]. """ data_dir="/home/yifengli/prog/my/deep_learning_v1_0/data/" # train set filename=data_dir + "GM12878_200bp_Data_3Cl_l2normalized_TrainSet.txt"; train_set_x_org=numpy.loadtxt(filename,delimiter='\t',dtype='float32') filename=data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_TrainSet.txt"; train_set_y_org=numpy.loadtxt(filename,delimiter='\t',dtype=object) prev,train_set_y_org=cl.change_class_labels(train_set_y_org) # valid set filename=data_dir + "GM12878_200bp_Data_3Cl_l2normalized_ValidSet.txt"; valid_set_x_org=numpy.loadtxt(filename,delimiter='\t',dtype='float32') filename=data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_ValidSet.txt"; valid_set_y_org=numpy.loadtxt(filename,delimiter='\t',dtype=object) prev,valid_set_y_org=cl.change_class_labels(valid_set_y_org) # test set filename=data_dir + "GM12878_200bp_Data_3Cl_l2normalized_TestSet.txt"; test_set_x_org=numpy.loadtxt(filename,delimiter='\t',dtype='float32') filename=data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_TestSet.txt"; test_set_y_org=numpy.loadtxt(filename,delimiter='\t',dtype=object) prev,test_set_y_org=cl.change_class_labels(test_set_y_org) filename=data_dir + "GM12878_Features_Unique.txt"; features=numpy.loadtxt(filename,delimiter='\t',dtype=object)
# Alternative class groupings (kept for reference, all disabled here).
#group=[["A-E"],["I-E"]]
#group=[["A-P"],["I-P"]]
#group=[["A-E"],["A-P"]]
#group=[["A-E"],["A-X"]]
#group=[["A-P"],["A-X"]]
#group=[["A-E"],["A-P"],["A-X"]]
#group=[["A-E","I-E"],["A-P","I-P"]]
#group=[["A-E","A-P"],["I-E","I-P"]]
#group=[["A-E","I-E"],["A-P","I-P"],["A-X","I-X"]]
#group=[["A-E","A-P","A-X"],["I-E","I-P","I-X"]]
#group=[["I-E"],["I-P"]]

# Merge the class labels according to `group`, then re-encode them.
# NOTE(review): every `group` assignment visible in this section is commented
# out; the active definition is presumably earlier in the file -- confirm.
# print(x) with a single operand is the same statement as `print x` in
# Python 2 and also parses under Python 3.
classes = cl.merge_class_labels(classes, group)
print(numpy.unique(classes))
classes_unique, classes = cl.change_class_labels(classes)
print(numpy.unique(classes))

# set random state
#numpy.random.seed(1000)
rng = numpy.random.RandomState(2000)
data, classes, others = cl.balance_sample_size(
    data,
    classes,
    others=None,
    min_size_given=None,
    rng=rng,
)
print(data.shape)
print(numpy.unique(classes))

# partition the data
# The same `rng` instance is reused, so the split depends on the random state
# left behind by the balancing call above.
(
    train_set_x_org,
    train_set_y_org,
    valid_set_x_org,
    valid_set_y_org,
    test_set_x_org,
    test_set_y_org,
) = cl.partition_train_valid_test(data, classes, ratio=(2, 1, 1), rng=rng)

# normalization
# Alternative class groupings (kept for reference, all disabled here).
#group=[["A-E"],["A-P"],["I-E","I-P","A-X","I-X","UK"]]
#group=[["A-E"],["A-P"],["A-X"],["I-E","I-P","I-X","UK"]]
#group=[["A-E"],["I-E"]]
#group=[["A-P"],["I-P"]]
#group=[["A-E"],["A-P"]]
#group=[["A-E"],["A-X"]]
#group=[["A-P"],["A-X"]]
#group=[["A-E"],["A-P"],["A-X"]]
#group=[["A-E","I-E"],["A-P","I-P"]]
#group=[["A-E","A-P"],["I-E","I-P"]]
#group=[["A-E","I-E"],["A-P","I-P"],["A-X","I-X"]]
#group=[["A-E","A-P","A-X"],["I-E","I-P","I-X"]]
#group=[["I-E"],["I-P"]]
#classes=cl.merge_class_labels(classes,group)

# Label merging is disabled in this variant; the labels are only re-encoded.
classes_unique, classes = cl.change_class_labels(classes)

# set random state
#numpy.random.seed(1000)
rng = numpy.random.RandomState(100)
data, classes, others = cl.balance_sample_size(
    data,
    classes,
    others=None,
    min_size_given=None,
    rng=rng,
)

# Report the array shapes after balancing.
# print(x) with a single operand is the same statement as `print x` in
# Python 2 and also parses under Python 3.
print("data.shape")
print(data.shape)
print("classes.shape")
print(classes.shape)
# Working and data directories for this experiment.
dir_work = "/home/yifeng/research/mf/mvmf_v1_1/"
dir_data = "/home/yifeng/research/mf/mvmf_v1_1/data/"
rng = numpy.random.RandomState(100)

# Underscore-joined tag that is embedded in the data file names.
cancers = "brca_coad_gbm_hnsc_kirc_lgg_lihc_luad_lusc_ov_prad_normal"

# Common path stem shared by the three input files.
tcga_stem = dir_data + "tcga_mrnaseq_" + cancers
data = numpy.loadtxt(tcga_stem + "_data_normalized.txt", dtype=float, delimiter="\t")
features = numpy.loadtxt(tcga_stem + "_features.txt", dtype=str, delimiter="\t")
classes = numpy.loadtxt(tcga_stem + "_classes.txt", dtype=str, delimiter="\t")
# print(x) with a single operand is the same statement as `print x` in
# Python 2 and also parses under Python 3.
print(data.shape)
print(classes.shape)

# Keep a reference to the string labels before `classes` is rebound to the
# converted labels returned by cl.change_class_labels.
classes_str = classes
unique_class_names, classes = cl.change_class_labels(classes)
print(classes)
print(unique_class_names)

# `data` is transposed before being handed to cl.sort_classes.
# NOTE(review): presumably the file stores features in rows and samples in
# columns, and the helpers expect one sample per row -- confirm.
data, classes, classes_str = cl.sort_classes(
    numpy.transpose(data), classes, classes_str
)
print(classes)
print(data.shape)

# split data
# ratio=(2, 0, 1) is passed through to the helper; the valid_* outputs are
# not used in the statements visible here.
(
    train_set_x,
    train_set_y,
    train_set_ystr,
    valid_set_x,
    valid_set_y,
    valid_set_ystr,
    test_set_x,
    test_set_y,
    test_set_ystr,
) = cl.partition_train_valid_test2(
    data, classes, classes_str, ratio=(2, 0, 1), rng=rng
)

# put samples of the same class together
train_set_x, train_set_y, train_set_ystr = cl.sort_classes(
    train_set_x, train_set_y, train_set_ystr
)

# Transpose the train and test matrices back for the steps that follow.
train_set_x = numpy.transpose(train_set_x)
test_set_x = numpy.transpose(test_set_x)

################################### feature selection ########################################
#z=[-1,-1,-1,0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5]
a_active_H = 10
b_active_H = 1

# Parameter tag shared by the input file prefix and the output prefix below.
# a_active_W, b_active_W and dir_data are defined outside this section.
param_suffix = (
    "_a_active_W=" + str(a_active_W)
    + "_b_active_W=" + str(b_active_W)
    + "_a_active_H=" + str(a_active_H)
    + "_b_active_H=" + str(b_active_H)
)

# Load the simulated data set identified by the parameter tag.
prefix = "simulated_data" + param_suffix
data = numpy.loadtxt(dir_data + prefix + "_X.txt", dtype=float, delimiter="\t")
features = numpy.loadtxt(dir_data + prefix + "_Features.txt", dtype=str, delimiter="\t")
feature_patterns = numpy.loadtxt(
    dir_data + prefix + "_Feature_Patterns.txt", dtype=str, delimiter="\t"
)
feature_patterns_matrix = numpy.loadtxt(
    dir_data + prefix + "_Feature_Patterns_Matrix.txt", dtype=bool, delimiter="\t"
)
classes = numpy.loadtxt(dir_data + prefix + "_Classes.txt", dtype=str, delimiter="\t")
# print(x) with a single operand is the same statement as `print x` in
# Python 2 and also parses under Python 3.
print(data.shape)
print(classes.shape)
#classes_str=classes
unique_class_names, classes = cl.change_class_labels(classes)

# From here on `prefix` names the stability-selection run instead of the
# input data set.
prefix = "simulated_data_stability_selection" + param_suffix

# Hyper-parameter settings and grids.
# NOTE(review): the meaning of each value is defined by the code that
# consumes them, which is not visible here.
z = 3
a_0s = [1e3, 1e2, 5e1, 1e1, 5, 1, 0.8, 0.6, 0.4, 0.2, 0.1, 5e-2, 1e-2, 5e-3, 1e-3]
#a_0s=[0.6]
b_0s = [10, 1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]  # not used, if tied
a_larges = [1e3, 1e2, 5e1, 1e1, 5, 1, 0.8, 0.6, 0.4, 0.2, 0.1, 5e-2, 1e-2, 5e-3, 1e-3]
#a_larges=[0.4]
b_larges = [1e-2]  # not used, if tied
a_small = 1e2
b_small = 1e-32
ab_tied = True
num_samplings = [20, 40, 60, 80, 100, 150, 200, 400]
#num_samplings=[5,10]
prob_empiricals = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99]