Example 1
by concatenating the rows of the original sample.

[2]. A txt file including the class labels.
Each row is a string (white space not allowed) giving the class label of the corresponding row in [1].

[3]. A txt file including the names of the features.
Each row is a string (white space not allowed) giving the feature name of the corresponding column in [1].
"""

data_dir="/home/yifengli/prog/my/deep_learning_v1_0/data/"
# train set
filename=data_dir + "GM12878_200bp_Data_3Cl_l2normalized_TrainSet.txt";
train_set_x_org=numpy.loadtxt(filename,delimiter='\t',dtype='float32')
filename=data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_TrainSet.txt";
train_set_y_org=numpy.loadtxt(filename,delimiter='\t',dtype=object)
prev,train_set_y_org=cl.change_class_labels(train_set_y_org)
# valid set
filename=data_dir + "GM12878_200bp_Data_3Cl_l2normalized_ValidSet.txt";
valid_set_x_org=numpy.loadtxt(filename,delimiter='\t',dtype='float32')
filename=data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_ValidSet.txt";
valid_set_y_org=numpy.loadtxt(filename,delimiter='\t',dtype=object)
prev,valid_set_y_org=cl.change_class_labels(valid_set_y_org)
# test set
filename=data_dir + "GM12878_200bp_Data_3Cl_l2normalized_TestSet.txt";
test_set_x_org=numpy.loadtxt(filename,delimiter='\t',dtype='float32')
filename=data_dir + "GM12878_200bp_Classes_3Cl_l2normalized_TestSet.txt";
test_set_y_org=numpy.loadtxt(filename,delimiter='\t',dtype=object)
prev,test_set_y_org=cl.change_class_labels(test_set_y_org)

filename=data_dir + "GM12878_Features_Unique.txt";
features=numpy.loadtxt(filename,delimiter='\t',dtype=object)  
        #group=[["A-E"],["I-E"]]
        #group=[["A-P"],["I-P"]]
        #group=[["A-E"],["A-P"]]
        #group=[["A-E"],["A-X"]]
        #group=[["A-P"],["A-X"]]
        #group=[["A-E"],["A-P"],["A-X"]]
        #group=[["A-E","I-E"],["A-P","I-P"]]
        #group=[["A-E","A-P"],["I-E","I-P"]]
        #group=[["A-E","I-E"],["A-P","I-P"],["A-X","I-X"]]
        #group=[["A-E","A-P","A-X"],["I-E","I-P","I-X"]]
        #group=[["I-E"],["I-P"]]
        # NOTE: exactly one of the commented "group=..." definitions above must be
        # uncommented before this call; otherwise "group" is undefined here.
        classes=cl.merge_class_labels(classes,group)

        print numpy.unique(classes)

        classes_unique,classes=cl.change_class_labels(classes)
        
        print numpy.unique(classes)
        
        # set random state
        #numpy.random.seed(1000)
        rng=numpy.random.RandomState(2000)
        data,classes,others=cl.balance_sample_size(data,classes,others=None,min_size_given=None,rng=rng)

        print data.shape
        print numpy.unique(classes)

        # partition the data
        train_set_x_org,train_set_y_org,valid_set_x_org,valid_set_y_org,test_set_x_org,test_set_y_org=cl.partition_train_valid_test(data,classes,ratio=(2,1,1),rng=rng)

        # normalization
        #group=[["A-E"],["A-P"],["I-E","I-P","A-X","I-X","UK"]]
        #group=[["A-E"],["A-P"],["A-X"],["I-E","I-P","I-X","UK"]]
        #group=[["A-E"],["I-E"]]
        #group=[["A-P"],["I-P"]]
        #group=[["A-E"],["A-P"]]
        #group=[["A-E"],["A-X"]]
        #group=[["A-P"],["A-X"]]
        #group=[["A-E"],["A-P"],["A-X"]]
        #group=[["A-E","I-E"],["A-P","I-P"]]
        #group=[["A-E","A-P"],["I-E","I-P"]]
        #group=[["A-E","I-E"],["A-P","I-P"],["A-X","I-X"]]
        #group=[["A-E","A-P","A-X"],["I-E","I-P","I-X"]]
        #group=[["I-E"],["I-P"]]
        #classes=cl.merge_class_labels(classes,group)

        classes_unique, classes = cl.change_class_labels(classes)

        # set random state
        #numpy.random.seed(1000)
        rng = numpy.random.RandomState(100)
        data, classes, others = cl.balance_sample_size(data,
                                                       classes,
                                                       others=None,
                                                       min_size_given=None,
                                                       rng=rng)

        print "data.shape"
        print data.shape

        print "classes.shape"
        print classes.shape
Example 4
dir_work="/home/yifeng/research/mf/mvmf_v1_1/"
dir_data="/home/yifeng/research/mf/mvmf_v1_1/data/"

rng=numpy.random.RandomState(100)

cancers="brca_coad_gbm_hnsc_kirc_lgg_lihc_luad_lusc_ov_prad_normal"

data=numpy.loadtxt(dir_data+"tcga_mrnaseq_"+cancers+"_data_normalized.txt",dtype=float,delimiter="\t")
features=numpy.loadtxt(dir_data+"tcga_mrnaseq_"+cancers+"_features.txt",dtype=str,delimiter="\t")
classes=numpy.loadtxt(dir_data+"tcga_mrnaseq_"+cancers+"_classes.txt",dtype=str,delimiter="\t")

print data.shape
print classes.shape

classes_str=classes
unique_class_names,classes=cl.change_class_labels(classes)
print classes
print unique_class_names
data,classes,classes_str=cl.sort_classes(numpy.transpose(data),classes,classes_str)
print classes
print data.shape

# split data
train_set_x,train_set_y,train_set_ystr,valid_set_x,valid_set_y,valid_set_ystr,test_set_x,test_set_y,test_set_ystr=cl.partition_train_valid_test2(data, classes, classes_str, ratio=(2,0,1), rng=rng)
# put samples of the same class together
train_set_x,train_set_y,train_set_ystr=cl.sort_classes(train_set_x,train_set_y,train_set_ystr)
train_set_x=numpy.transpose(train_set_x)
test_set_x=numpy.transpose(test_set_x)

################################### feature selection ########################################
#z=[-1,-1,-1,0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5]
Example 5
import numpy
# NOTE: "cl", dir_data, a_active_W, and b_active_W are defined/imported earlier in the
# full script; this excerpt starts partway through.

a_active_H=10
b_active_H=1

prefix="simulated_data" + "_a_active_W="+str(a_active_W) + "_b_active_W="+str(b_active_W) + "_a_active_H="+str(a_active_H) + "_b_active_H="+str(b_active_H)

data=numpy.loadtxt(dir_data+prefix+"_X.txt",dtype=float,delimiter="\t")
features=numpy.loadtxt(dir_data+prefix+"_Features.txt",dtype=str,delimiter="\t")
feature_patterns=numpy.loadtxt(dir_data+prefix+"_Feature_Patterns.txt",dtype=str,delimiter="\t")
feature_patterns_matrix=numpy.loadtxt(dir_data+prefix+"_Feature_Patterns_Matrix.txt",dtype=bool,delimiter="\t")
classes=numpy.loadtxt(dir_data+prefix+"_Classes.txt",dtype=str,delimiter="\t")

print data.shape
print classes.shape

#classes_str=classes
unique_class_names,classes=cl.change_class_labels(classes)

prefix="simulated_data_stability_selection" + "_a_active_W="+str(a_active_W) + "_b_active_W="+str(b_active_W) + "_a_active_H="+str(a_active_H) + "_b_active_H="+str(b_active_H)
z=3
a_0s=[1e3,1e2,5e1,1e1,5,1,0.8,0.6,0.4,0.2,0.1,5e-2,1e-2,5e-3,1e-3]
#a_0s=[0.6]
b_0s=[10,1,1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7] # not used when ab_tied=True
a_larges=[1e3,1e2,5e1,1e1,5,1,0.8,0.6,0.4,0.2,0.1,5e-2,1e-2,5e-3,1e-3]
#a_larges=[0.4]
b_larges=[1e-2] # not used when ab_tied=True
a_small=1e2
b_small=1e-32
ab_tied=True
num_samplings=[20,40,60,80,100,150,200,400]
#num_samplings=[5,10]
prob_empiricals=[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.99]
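
# Hedged sketch (not part of the original example): one way to iterate the hyperparameter
# grid defined above. "fit_once" is a hypothetical placeholder for whatever single-run
# stability-selection call the full script performs for one (a_0, a_large, num_sampling) setting.
import itertools

def sweep_grid(fit_once):
    """Call fit_once for every (a_0, a_large, num_sampling) combination and collect the results."""
    results = {}
    for a_0, a_large, num_sampling in itertools.product(a_0s, a_larges, num_samplings):
        # when ab_tied is True, the b_0s and b_larges grids are ignored (see comments above)
        b_0 = None if ab_tied else b_0s[0]
        b_large = None if ab_tied else b_larges[0]
        results[(a_0, a_large, num_sampling)] = fit_once(
            a_0=a_0, b_0=b_0, a_large=a_large, b_large=b_large,
            a_small=a_small, b_small=b_small, num_sampling=num_sampling)
    return results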