        # balance the number of samples per class
        data, classes, others = cl.balance_sample_size(data,
                                                       classes,
                                                       others=None,
                                                       min_size_given=None,
                                                       rng=rng)

        print "data.shape"
        print data.shape

        print "classes.shape"
        print classes.shape

        print "numpy.unique(classes)"
        print numpy.unique(classes)

        # partition the data into training : validation : test = 2 : 1 : 1
        train_set_x_org, train_set_y_org, valid_set_x_org, valid_set_y_org, test_set_x_org, test_set_y_org = cl.partition_train_valid_test(
            data, classes, ratio=(2, 1, 1), rng=rng)

        print "valid set shape"
        print valid_set_x_org.shape
        print valid_set_y_org.shape

        # normalization
        train_set_x_org, data_min, data_max = cl.normalize_col_scale01(
            train_set_x_org, tol=1e-10)
        valid_set_x_org, _, _ = cl.normalize_col_scale01(valid_set_x_org,
                                                         tol=1e-10,
                                                         data_min=data_min,
                                                         data_max=data_max)
        test_set_x_org, _, _ = cl.normalize_col_scale01(test_set_x_org,
                                                        tol=1e-10,
                                                        data_min=data_min,
                                                        data_max=data_max)

        print(numpy.unique(classes))

        # convert class labels to consecutive integers starting from 0
        classes_unique, classes = cl.change_class_labels(classes)

        print(numpy.unique(classes))
        
        # set random state
        # numpy.random.seed(1000)
        rng = numpy.random.RandomState(2000)
        # balance the number of samples per class
        data, classes, others = cl.balance_sample_size(data, classes, others=None, min_size_given=None, rng=rng)

        print(data.shape)
        print(numpy.unique(classes))

        # partition the data
        train_set_x_org,train_set_y_org,valid_set_x_org,valid_set_y_org,test_set_x_org,test_set_y_org=cl.partition_train_valid_test(data,classes,ratio=(2,1,1),rng=rng)

        # normalization
        train_set_x_org,data_min,data_max=cl.normalize_col_scale01(train_set_x_org,tol=1e-10)
        valid_set_x_org,_,_=cl.normalize_col_scale01(valid_set_x_org,tol=1e-10,data_min=data_min,data_max=data_max)
        test_set_x_org,_,_=cl.normalize_col_scale01(test_set_x_org,tol=1e-10,data_min=data_min,data_max=data_max)

        # train
        # setting the parameter
        pretrain_lr=0.01
        finetune_lr=0.1
        alpha=0.1
        lambda2=1.0
        alpha1=0.001
        alpha2=0.0
        n_hidden=[256,64,16]
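
The scale01 normalization used above maps each column to [0, 1] with the training set's minimum and maximum, then reuses those statistics for the validation and test sets so all three splits are scaled consistently. A minimal numpy sketch of the idea (an illustration only, not cl's actual implementation):

import numpy

def scale01(X, tol=1e-10, data_min=None, data_max=None):
    # Sketch of the assumed behavior of cl.normalize_col_scale01.
    # Reuse the training statistics when given; compute them otherwise.
    if data_min is None:
        data_min = X.min(axis=0)
    if data_max is None:
        data_max = X.max(axis=0)
    # tol guards against division by zero for constant columns.
    return (X - data_min) / (data_max - data_min + tol), data_min, data_max

X_train = numpy.array([[1.0, 10.0], [3.0, 30.0]])
X_train_s, mn, mx = scale01(X_train)
X_test_s, _, _ = scale01(numpy.array([[2.0, 20.0]]), data_min=mn, data_max=mx)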
Example #3
        print(data.shape)
        print(numpy.unique(classes))

        # 10-fold CV: ind_folds assigns each sample a fold id in {0, ..., kfolds-1}
        kfolds = 10
        ind_folds = cl.kfold_cross_validation(classes,
                                              k=kfolds,
                                              shuffle=True,
                                              rng=rng)

        for i in range(kfolds):
            test_set_x_org = data[ind_folds == i, :]
            test_set_y_org = classes[ind_folds == i]
            train_set_x_org, train_set_y_org, valid_set_x_org, valid_set_y_org, _, _ = cl.partition_train_valid_test(
                data[ind_folds != i, :],
                classes[ind_folds != i],
                ratio=(3, 1, 0),
                rng=rng)

            # normalization
            train_set_x_org, data_min, data_max = cl.normalize_col_scale01(
                train_set_x_org, tol=1e-10)
            valid_set_x_org, _, _ = cl.normalize_col_scale01(valid_set_x_org,
                                                             tol=1e-10,
                                                             data_min=data_min,
                                                             data_max=data_max)
            test_set_x_org, _, _ = cl.normalize_col_scale01(test_set_x_org,
                                                            tol=1e-10,
                                                            data_min=data_min,
                                                            data_max=data_max)
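
The loop above is cut off after normalization; each fold would normally continue with training and testing on the held-out part. A self-contained toy illustration of the fold-index bookkeeping the code relies on (the real fold ids come from cl.kfold_cross_validation; the random assignment here is only a stand-in):

import numpy

rng_toy = numpy.random.RandomState(0)
classes_toy = numpy.repeat([0, 1], 10)      # 20 toy samples, 2 classes
k = 5
ind_folds_toy = rng_toy.randint(0, k, size=classes_toy.shape[0])

for i in range(k):
    test_mask = ind_folds_toy == i          # fold i is held out for testing
    print("fold {0}: {1} test, {2} train+valid samples".format(
        i, int(test_mask.sum()), int((~test_mask).sum())))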
Example #4
        print(data.shape)
        print(numpy.unique(classes))

        # normalization
        data[:,0:wid]=numpy.absolute(data[:,0:wid]) # change the RNA-seq minus track to positive values
        num_feat=len(features)
        cl.normalize_matrical_samples(data,num_feat,method="scale01")

        # 10-fold CV
        kfolds=10
        ind_folds=cl.kfold_cross_validation(classes,k=kfolds,shuffle=True,rng=rng)

        for i in range(kfolds):
            test_set_x_org=data[ind_folds==i,:]
            test_set_y_org=classes[ind_folds==i]
            train_set_x_org,train_set_y_org,valid_set_x_org,valid_set_y_org,_,_=cl.partition_train_valid_test(data[ind_folds!=i,:],classes[ind_folds!=i],ratio=(3,1,0),rng=rng)
            # parameter setting
            pretrain_lr=0.04 # 0.1, 0.08, 0.06, 0.04, 0.02, 0.01
            finetune_lr=0.04 # 0.1, 0.08, 0.06, 0.04, 0.02, 0.01
            alpha=0.1
            pretraining_epochs=100 # 20, 50, 100
            training_epochs=100 # 50, 100, 200, 500
            nkerns=[4,4,8] # [4,4,4], [4,6,8]
            batch_size=100 # 100, 200
            receptive_fields=((1,4),(1,4),(1,4)) # ((1,4),(1,4),(1,4)), ((1,4),(1,4),(1,2)), ((1,8),(1,4),(1,2))
            poolsizes=((1,4),(1,4),(1,2)) # ((1,4),(1,4),(1,4)), ((1,4),(1,4),(1,2)), ((1,6),(1,4),(1,2))
            full_hidden_sub=[16] # [8], [16], [32], [64]
            full_hidden_all=[32] # [8], [16], [32], [64]
            max_num_epoch_change_learning_rate=50 # 20, 50, 80; not crucial
            max_num_epoch_change_rate=0.8
            learning_rate_decay_rate=0.8
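
The commented alternatives next to each setting suggest these values were hand-tuned over a small grid. A minimal sketch of enumerating such a grid (candidate values copied from the comments above; the actual training call inside the loop is omitted):

import itertools

candidate_lrs = [0.1, 0.08, 0.06, 0.04, 0.02, 0.01]
candidate_nkerns = [[4, 4, 4], [4, 4, 8], [4, 6, 8]]
candidate_batch_sizes = [100, 200]

for lr, nk, bs in itertools.product(candidate_lrs, candidate_nkerns,
                                    candidate_batch_sizes):
    print("learning_rate={0}, nkerns={1}, batch_size={2}".format(lr, nk, bs))
    # ...train and evaluate the CNN with this setting...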
Example #5
filename="/home/yifengli/research/dnashape/result/Data_1000bp.txt";
data=numpy.loadtxt(filename,delimiter='\t',dtype='float16')
filename="/home/yifengli/research/dnashape/result/Classes_1000bp.txt";
classes=numpy.loadtxt(filename,delimiter='\t',dtype=str)
filename="/home/yifengli/research/dnashape/result/Features.txt";
features=numpy.loadtxt(filename,delimiter='\t',dtype=str)

# change class labels
given=["Enhancer","EnhancerFalse"]
data,classes=cl.take_some_classes(data,classes,given)

given={"Enhancer":0,"EnhancerFalse":1}
classes=cl.change_class_labels_to_given(classes,given)

train_set_x_org, train_set_y_org, valid_set_x_org, valid_set_y_org, test_set_x_org, test_set_y_org \
    = cl.partition_train_valid_test(data, classes, ratio=(1, 1, 1))
del data
gc.collect()

rng=numpy.random.RandomState(1000)    
numpy.warnings.filterwarnings('ignore')        
# train
classifier,training_time=convolutional_mlp.train_model( train_set_x_org=train_set_x_org, train_set_y_org=train_set_y_org,
                        valid_set_x_org=valid_set_x_org, valid_set_y_org=valid_set_y_org, 
                        n_row_each_sample=4,
                        learning_rate=0.1, alpha=0.01, n_epochs=1000, rng=rng, 
                        nkerns=[4,4,8],batch_size=500,
                        receptive_fields=((2,8),(2,8),(2,2)),poolsizes=((1,8),(1,8),(1,2)),full_hidden=8)

# test
test_set_y_pred,test_set_y_pred_prob,test_time=convolutional_mlp.test_model(classifier,test_set_x_org)
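
The example ends at prediction; a minimal sketch of scoring those predictions with plain numpy (assuming test_set_y_org still holds the integer labels assigned above, 0 for Enhancer and 1 for EnhancerFalse):

import numpy

# Overall accuracy: fraction of correctly predicted test samples.
accuracy = numpy.mean(test_set_y_pred == test_set_y_org)
print("accuracy: {0:.4f}".format(accuracy))

# A quick 2x2 confusion matrix (rows: true labels, columns: predictions).
confusion = numpy.zeros((2, 2), dtype=int)
for t, p in zip(test_set_y_org, test_set_y_pred):
    confusion[int(t), int(p)] += 1
print(confusion)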
Example #6
    def train(self,train_set_x_org=None, train_set_y_org=None,features=None,
              num_samplings=100,
              randomize_method="random_rescaling",
              random_rescaling_alpha=0.5,
              random_sampling_portion=0.66,
              learning_rate=0.1, alpha=0.01, 
              lambda1=0.001, lambda2=1.0, alpha1=0.001, alpha2=0.01, 
              n_hidden=[256,128,16], n_epochs=1000, batch_size=100, 
              activation_func="relu", rng=numpy.random.RandomState(100),
              dfs_select_method="top_num", dfs_threshold=0.001, dfs_top_num=10,
              max_num_epoch_change_learning_rate=80,max_num_epoch_change_rate=0.8,learning_rate_decay_rate=0.8):
        """
        Train the randomized DFS.
        num_samplings: int, number of reruns of DFS.
        randomize_method: string, the randomizing method, can be one of {"random_rescaling","random_sampling","random_sampling_and_random_rescaling"}.
        """
        if randomize_method=="random_rescaling":
            train_set_x,train_set_y,valid_set_x,valid_set_y,_,_=cl.partition_train_valid_test(train_set_x_org, train_set_y_org,ratio=(3,1,0), rng=rng)

        self.num_samplings=num_samplings
        self.randomize_method=randomize_method
        self.random_rescaling_alpha=random_rescaling_alpha
        self.random_sampling_portion=random_sampling_portion
        self.learning_rate=learning_rate
        self.alpha=alpha
        self.lambda1=lambda1
        self.lambda2=lambda2
        self.alpha1=alpha1
        self.alpha2=alpha2
        self.n_hidden=n_hidden
        self.n_epochs=n_epochs
        self.batch_size=batch_size
        self.activation_func="relu"
        self.max_num_epoch_change_learning_rate=max_num_epoch_change_learning_rate
        self.max_num_epoch_change_rate=max_num_epoch_change_rate
        self.learning_rate_decay_rate=learning_rate_decay_rate
        self.features=features
        self.classifiers=[]
        self.feature_counts=numpy.zeros(shape=(self.n_in,),dtype=int)
        self.feature_weights=numpy.zeros(shape=(self.n_in,self.num_samplings),dtype=float)
        self.training_time=0
        self.classes_unique=numpy.unique(train_set_y_org)
        self.rescale_Ws=[]
        for ns in range(self.num_samplings):
            print "The {0}-th run of randomized DFS...".format(ns)
            rng_ns=numpy.random.RandomState(ns)

            # generate a subsample of data points
            if randomize_method=="random_sampling" or randomize_method=="random_sampling_and_random_rescaling":
                # sample data
                train_set_x,train_set_y,ind_train,_=cl.sampling(train_set_x_org,train_set_y_org,others=None,portion=random_sampling_portion,max_size_given=None,rng=rng_ns)
                valid_set_x=numpy.delete(train_set_x_org,ind_train,axis=0)
                valid_set_y=numpy.delete(train_set_y_org,ind_train)
                # reorder the sampled data
                train_set_x,train_set_y,_=cl.sort_classes(train_set_x,train_set_y)
            if randomize_method=="random_rescaling" or randomize_method=="random_sampling_and_random_rescaling":
                rescale_ws=rng_ns.uniform(low=random_rescaling_alpha,high=1.0,size=(1,self.num_features))
                train_set_x=train_set_x*rescale_ws ### may multiplify, not division, leave it for future!!!!!!!!!!
                valid_set_x=valid_set_x*rescale_ws
                self.rescale_Ws.extend([rescale_ws])
                
            # run DFS
            classifier,training_time=deep_feat_select_mlp.train_model(train_set_x_org=train_set_x, train_set_y_org=train_set_y, 
                                                                      valid_set_x_org=valid_set_x, valid_set_y_org=valid_set_y, 
                                                                      learning_rate=learning_rate, alpha=alpha, lambda1=lambda1, lambda2=lambda2,
                                                                      alpha1=alpha1, alpha2=alpha2, n_hidden=n_hidden,
                                                                      n_epochs=n_epochs, batch_size=batch_size, activation_func=activation_func, rng=rng_ns,
                                                                      max_num_epoch_change_learning_rate=max_num_epoch_change_learning_rate,
                                                                      max_num_epoch_change_rate=max_num_epoch_change_rate,
                                                                      learning_rate_decay_rate=learning_rate_decay_rate)
            features_selected,logind_selected,weights_selected,weights=deep_feat_select_mlp.select_features(classifier,features,select_method=dfs_select_method,threshold=dfs_threshold,top_num=dfs_top_num)
            print(weights)
            self.classifiers.append(classifier)
            self.feature_counts=self.feature_counts+numpy.array(logind_selected,dtype=int)
            print(self.feature_counts)
            self.feature_weights[:,ns]=weights
            self.training_time=self.training_time+training_time

        # final clean-up
        self.feature_importance=self.feature_counts/float(self.num_samplings) # float() avoids integer division under Python 2
        print(self.feature_importance)
        return self.feature_importance,self.feature_weights,self.training_time
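
To make the "random_rescaling" option concrete: each rerun draws one weight per feature from [random_rescaling_alpha, 1.0) and multiplies the corresponding column by it, so every DFS run sees the features at a slightly different scale. A self-contained numpy illustration of just that step:

import numpy

rng_ns = numpy.random.RandomState(0)
alpha = 0.5                              # random_rescaling_alpha
X = rng_ns.rand(6, 4)                    # toy data: 6 samples, 4 features
rescale_ws = rng_ns.uniform(low=alpha, high=1.0, size=(1, X.shape[1]))
X_rescaled = X * rescale_ws              # broadcasting scales each column
print(rescale_ws)
print(X_rescaled)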
Example #7
filename = "/home/yifengli/research/dnashape/result/Data_1000bp.txt"
data = numpy.loadtxt(filename, delimiter='\t', dtype='float16')
filename = "/home/yifengli/research/dnashape/result/Classes_1000bp.txt"
classes = numpy.loadtxt(filename, delimiter='\t', dtype=str)
filename = "/home/yifengli/research/dnashape/result/Features.txt"
features = numpy.loadtxt(filename, delimiter='\t', dtype=str)

# change class labels
given = ["Enhancer", "EnhancerFalse"]
data, classes = cl.take_some_classes(data, classes, given)

given = {"Enhancer": 0, "EnhancerFalse": 1}
classes = cl.change_class_labels_to_given(classes, given)

train_set_x_org, train_set_y_org, valid_set_x_org, valid_set_y_org, test_set_x_org, test_set_y_org \
    = cl.partition_train_valid_test(data, classes, ratio=(1, 1, 1))
del data
gc.collect()

rng = numpy.random.RandomState(1000)
numpy.warnings.filterwarnings('ignore')
# train
classifier, training_time = convolutional_mlp.train_model(
    train_set_x_org=train_set_x_org,
    train_set_y_org=train_set_y_org,
    valid_set_x_org=valid_set_x_org,
    valid_set_y_org=valid_set_y_org,
    n_row_each_sample=4,
    learning_rate=0.1,
    alpha=0.01,
    n_epochs=1000,