Ejemplo n.º 1
0
def main():
    """Cross-validated evaluation of Capsnet zinc-binding-site predictors.

    Parses CLI options, then for each of 5 folds loads fold-specific model /
    weight files from hard-coded scratch paths, predicts on the fold's fasta
    file and records AUC, PR, accuracy, sensitivity, specificity, F1 and MCC,
    finally writing the per-metric means to a single results text file.

    NOTE(review): the -model-prefix CLI option is effectively ignored — the
    fold loop below overwrites `modelprefix` with a hard-coded path.
    """
    parser = argparse.ArgumentParser()
    # parser.add_argument('-input',  dest='inputfile', type=str, help='Protein sequences to be predicted in fasta format.', required=True)
    parser.add_argument('-output', dest='outputfile', type=str, help='prefix of the prediction results.', required=True)
    parser.add_argument('-window', dest='window', type=int, help='specify the window size', required=True)
    parser.add_argument('-model-prefix', dest='modelprefix', type=str,
                        help='prefix of custom model used for prediciton. If you do not have one, please run train_models.py to train a model.',
                        required=False, default=None)
    parser.add_argument('-residue-types', dest='residues', type=str,
                        help='Residue types that to be predicted. For multiple residues, seperate each with \',\'',
                        required=False, default="C,H,E,D")
    parser.add_argument('-codingMode',  dest='codingMode', type=int, help='Set the input sequence encoding mode.', required=False, default=0)

    args = parser.parse_args()

    # inputfile=args.inputfile;
    outputfile = args.outputfile;
    residues = args.residues.split(",")
    modelprefix = args.modelprefix;
    window = args.window;
    codemode = args.codingMode
    print(outputfile, residues, modelprefix, window)
    # outputfile = r'/home/ucexbw/ZinCaps/ActiveSitePrediction/data/output/'
    # fp = open(outputfile+"eval_by_AUC_precision_scores_polynomial_decay_increase_decrease_1_0.5_1",'w')
    # fp = open(outputfile+"eval_by_AUC_precision_scores_polynomial_decay_1_0.5_1",'w')
    # fp = open(outputfile+"eval_by_AUC_precision_scores_10fold",'w')
    # Single results file shared by all folds; kept open for the whole run
    # (note: not closed on the exit() path below).
    fp = open(outputfile + "eval_by_AUC_precision_scores_10fold_constantweight1_0.5_25.txt", 'w')

    # Build the Capsnet architecture once with dummy data (predict=True);
    # real weights are loaded per fold / bootstrap model below.
    model_arch = Capsnet_main(np.zeros([3, 2 * window + 1, 6]), [], nb_epoch=1, compiletimes=0, lr=0.001,
                              batch_size=500, lam_recon=0, routings=3, modeltype='nogradientstop', nb_classes=2,
                              predict=True)
    # model_arch=Capsnet_main(np.zeros([3,2*16+1,21]),[],nb_epoch=1,compiletimes=0,lr=0.001,batch_size=500,lam_recon=0,routings=3,modeltype='nogradientstop',nb_classes=2,predict=True)

    # Per-fold metric accumulators (one slot per fold).
    roc_average_weight = np.zeros(5)
    roc_average_predict = np.zeros(5)
    roc_average_last_predict = np.zeros(5)

    accuracy_average_last_predict = np.zeros(5)
    sensitivity_average_last_predict = np.zeros(5)
    specificity_average_last_predict = np.zeros(5)
    f1_score_average_last_predict = np.zeros(5)
    mcc_average_last_predict = np.zeros(5)

    pr_average_weight = np.zeros(5)
    pr_average_predict = np.zeros(5)
    pr_average_last_predict = np.zeros(5)

    # One iteration per cross-validation fold.
    for time in range(5):
        fp.write("############################" + str(time) + "\n")
        inputfile = '/scratch/ucexbw/ZinCaps25/ActiveSitePrediction/lib/K-Fold/annotated_sequence.fasta_training_annotated_' + str(
            time) + '.fasta'
        # if os.path.exists(outputfile+"eval_by_AUC_precision_scores"):
        #   os.rm(outputfile+"eval_by_AUC_precision_scores")

        # Fold-specific weight / model file prefixes (hard-coded paths,
        # overriding the CLI-supplied modelprefix).
        checkpointweights = '/scratch/ucexbw/ZinCaps25/ActiveSitePrediction/data/weights/Zinc_' + str(time) + '_weights'
        modelprefix = '/scratch/ucexbw/ZinCaps25/ActiveSitePrediction/data/models/Zinc_' + str(time) + '_model'
        eval_type = 'average_last_predict'  # all evaluate by all method
        # average_weight
        # average_predict
        # average_last_predict

        # Unreachable in practice: modelprefix was just assigned above.
        if modelprefix is None:
            # print ("Please specify the prefix for an existing custom model by "
            #        "-model-prefix!\n\
            # It indicates two files [-model-prefix]_HDF5model and [-model-prefix]_parameters.\n \
            # If you don't have such files, please run train_models.py to get the "
            #        "custom model first!\n")
            exit()
        else:  # custom prediction
            model = modelprefix + str("_HDF5model")
            parameter = modelprefix + str("_parameters")
            try:
                f = open(parameter, 'r')
            except IOError:
                print('cannot open ' + parameter + " ! check if the model exists. "
                                                   "please run train_general.py or train_kinase.py to get the custom model first!\n")
            else:
                f = open(parameter, 'r')
                parameters = f.read()
                f.close()

            # Tab-separated parameter fields: 0=nclass, 1=window, 2=residues,
            # 4=codemode, 5=modeltype, 6=nb_classes (field 3 unused here).
            nclass = int(parameters.split("\t")[0])
            window = int(parameters.split("\t")[1])
            residues = parameters.split("\t")[2]
            residues = residues.split(",")
            codemode = int(parameters.split("\t")[4])
            modeltype = str(parameters.split("\t")[5])
            nb_classes = int(parameters.split("\t")[6])

        testfrag, ids, poses, focuses = extractFragforPredict(inputfile, window, '-', focus=residues)

        testX, testY = convertRawToXY(testfrag.as_matrix(), codingMode=codemode)
        if len(testX.shape) > 3:
            # In-place reshape: drop axis 1 so testX is (samples, window, features).
            testX.shape = (testX.shape[0], testX.shape[2], testX.shape[3])

        # Accumulators for the three ensemble-averaging strategies.
        predict_average_weight = np.zeros((testX.shape[0], 2))
        predict_average_predict = np.zeros((testX.shape[0], 2))
        predict_average_last_predict = np.zeros((testX.shape[0], 2))

        for bt in range(nclass):  # 0 648 bt=2 len(tf.trainable_variables())=1530
            # load all involving mode weights
            # sess = tf.Session()
            inputweights = checkpointweights + "_nclass" + str(bt) + "_iteration"
            model_members = load_model_weights(inputweights, model_arch)
            if eval_type == "all" or eval_type == "average_weight":
                predict_temp = predict_by_avg_members(model_members, model_arch, testX)
                predict_average_weight += predict_temp
                auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_temp, testY)
                # NOTE(review): this branch (and the ones below) stores into
                # the *_last_predict arrays, so later branches overwrite the
                # earlier scores for this fold — confirm whether the
                # *_average_weight arrays were intended here.
                roc_average_last_predict[time] = auc_score
                pr_average_last_predict[time] = pr_score
                accuracy_average_last_predict[time]  = accuracy
                sensitivity_average_last_predict[time]  = sensitivity
                specificity_average_last_predict[time]  = specificity
                f1_score_average_last_predict[time]  = f1_score
                mcc_average_last_predict[time]  = mcc
                # fp.write(
                #     "average_weight_results_bt" + str(bt) + "\t" + str(auc_score) + "\t" + str(pr_score) + "\t" + str(
                #         accuracy) + "\t" + str(sensitivity) + "\t" + str(specificity) + "\t" + str(
                #         f1_score) + "\t" + str(mcc) + "\n")

            if eval_type == "all" or eval_type == "average_predict":
                predict_temp = predict_by_snapshot(model_members, model_arch, testX)
                predict_average_predict += predict_temp
                auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_temp, testY)
                roc_average_last_predict[time] = auc_score
                pr_average_last_predict[time] = pr_score
                accuracy_average_last_predict[time]  = accuracy
                sensitivity_average_last_predict[time]  = sensitivity
                specificity_average_last_predict[time]  = specificity
                f1_score_average_last_predict[time]  = f1_score
                mcc_average_last_predict[time]  = mcc
                print("average_predict results:")
                # fp.write(
                #     "average_predict_results_bt" + str(bt) + "\t" + str(auc_score) + "\t" + str(pr_score) + "\t" + str(
                #         accuracy) + "\t" + str(sensitivity) + "\t" + str(specificity) + "\t" + str(
                #         f1_score) + "\t" + str(mcc) + "\n")

            del model_members
            # sess.close()

        # NOTE(review): 'average_weight1' is never set as eval_type above, so
        # this branch only runs when eval_type == "all".
        if eval_type == "all" or eval_type == "average_weight1":
            predict_average_weight = predict_average_weight / float(nclass)
            auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_average_weight,
                                                                                             testY)
            print("average_weight1")
            roc_average_last_predict[time] = auc_score
            pr_average_last_predict[time] = pr_score
            accuracy_average_last_predict[time]  = accuracy
            sensitivity_average_last_predict[time]  = sensitivity
            specificity_average_last_predict[time]  = specificity
            f1_score_average_last_predict[time]  = f1_score
            mcc_average_last_predict[time]  = mcc
            # fp.write(
            #     "average_weight_results\t" + str(auc_score) + "\t" + str(pr_score) + "\t" + str(accuracy) + "\t" + str(
            #         sensitivity) + "\t" + str(specificity) + "\t" + str(f1_score) + "\t" + str(mcc) + "\n")
            # roc_average_weight[time] = auc_score
            # pr_average_weight[time] = pr_score
            # write_output(outputfile + "average_weight_results_fold"+str(time)+".txt",predict_average_weight,ids,poses,focuses)

        if eval_type == "all" or eval_type == "average_predict":
            predict_average_predict = predict_average_predict / float(nclass)
            auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_average_predict,
                                                                                              testY)
            roc_average_last_predict[time] = auc_score
            pr_average_last_predict[time] = pr_score
            accuracy_average_last_predict[time]  = accuracy
            sensitivity_average_last_predict[time]  = sensitivity
            specificity_average_last_predict[time]  = specificity
            f1_score_average_last_predict[time]  = f1_score
            mcc_average_last_predict[time]  = mcc
            # fp.write("average_predict_results:\t" + str(auc_score) + "\t" + str(pr_score) + "\t" + str(
            #     accuracy) + "\t" + str(sensitivity) + "\t" + str(specificity) + "\t" + str(f1_score) + "\t" + str(
            #     mcc) + "\n")
            # roc_average_predict[time] = auc_score
            # pr_average_predict[time] = pr_score
            # write_output(outputfile + "average_predict_results_fold"+str(time)+".txt",predict_average_predict,ids,poses,focuses)

        # Default path (eval_type == 'average_last_predict'): average the raw
        # predictions of the nclass bootstrap models saved with the model file.
        if eval_type == "all" or eval_type == "average_last_predict":
            nclass_ini = 1
            for bt in range(nclass):
                model_arch[0].load_weights(model + "_class" + str(bt))
                predict_temp = model_arch[1].predict(testX)[0]
                predict_average_last_predict += predict_temp
                auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_temp, testY)
                # fp.write("average_last_predict_results_bt" + str(bt) + "\t" + str(auc_score) + "\t" + str(
                #     pr_score) + "\t" + str(accuracy) + "\t" + str(sensitivity) + "\t" + str(specificity) + "\t" + str(
                #     f1_score) + "\t" + str(mcc) + "\n")

            predict_average_last_predict = predict_average_last_predict / (nclass * nclass_ini)
            auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(
                predict_average_last_predict, testY)
            # fp.write("average_last_predict_results\t" + str(auc_score) + "\t" + str(pr_score) + "\t" + str(
            #     accuracy) + "\t" + str(sensitivity) + "\t" + str(specificity) + "\t" + str(f1_score) + "\t" + str(
            #     mcc) + "\n")
            roc_average_last_predict[time] = auc_score
            pr_average_last_predict[time] = pr_score
            accuracy_average_last_predict[time]  = accuracy
            sensitivity_average_last_predict[time]  = sensitivity
            specificity_average_last_predict[time]  = specificity
            f1_score_average_last_predict[time]  = f1_score
            mcc_average_last_predict[time]  = mcc
            # write_output(outputfile + "average_last_predict_results_fold"+str(time)+".txt",predict_average_last_predict,ids,poses,focuses)
            print("Successfully predicted from custom models !\n")

    fp.write("!!!!!!!!!!!!!!!!!!!!!!!!!\n")
    # fp.write("average_weight_results\t" + ",".join([str(x) for x in roc_average_weight]) + "\t" + ",".join(
    #     [str(x) for x in pr_average_weight]) + "\t" + str(np.mean(roc_average_weight)) + "," + str(
    #     np.std(roc_average_weight)) + "\t" + str(np.mean(pr_average_weight)) + "," + str(
    #     np.std(pr_average_weight)) + "\n")
    # fp.write("average_predict_results\t" + ",".join([str(x) for x in roc_average_predict]) + "\t" + ",".join(
    #     [str(x) for x in pr_average_predict]) + "\t" + str(np.mean(roc_average_predict)) + "," + str(
    #     np.std(roc_average_predict)) + "\t" + str(np.mean(pr_average_predict)) + "," + str(
    #     np.std(pr_average_predict)) + "\n")
    # fp.write("average_last_predict_results\t" + ",".join([str(x) for x in roc_average_last_predict]) + "\t" + ",".join(
    #     [str(x) for x in pr_average_last_predict]) + "\t" + str(np.mean(roc_average_last_predict)) + "," + str(
    #     np.std(roc_average_last_predict)) + "\t" + str(np.mean(pr_average_last_predict)) + "," + str(
    #     np.std(pr_average_last_predict)) + "\n")
    #

    print("roc: \n")
    print(roc_average_last_predict)
    # Final summary line: means of all metrics across the 5 folds.
    fp.write("average_last_predict_results: \t" + "\t" + str(np.mean(roc_average_last_predict)) + ","  + "\t" + str(np.mean(pr_average_last_predict)) + "," +str(np.mean(accuracy_average_last_predict))+","  + "\t" +
             str(np.mean(sensitivity_average_last_predict)) +","  + "\t" +str(np.mean(specificity_average_last_predict)) +","  + "\t"  +str(np.mean(f1_score_average_last_predict)) +","  + "\t"  +str(np.mean(mcc_average_last_predict))
         + "\n")
    fp.close()
Ejemplo n.º 2
0
def main(srate=1,
         nb_epoch1=1,
         nb_epoch2=30,
         earlystop=20,
         maxneg=None,
         codingMode=0,
         transferlayer=1,
         inputweights=None,
         outputweights=None,
         forkinas=False):
    """Train the MultiCNN ubiquitination-site model and evaluate each round.

    Loads training/testing data in three encodings (one-of-key, physical,
    PSSM), then for each class weight cw and each of `nclass` bootstrap
    rounds: reshuffles, draws a balanced sample, (re)trains MultiCNN from the
    pre-trained sub-models, predicts on the fixed test set, appends metrics
    to result/evaluation.txt and writes per-round probabilities to
    result/result-<t>-<cw>-.txt.

    NOTE(review): several parameters (srate, nb_epoch1, maxneg, codingMode)
    are accepted but never used in this body.
    """
    ########## Load Training Data ##########
    oneofkey_pos, oneofkey_neg, pssm_pos, pssm_neg, physical_pos, physical_neg = get_data(
        r'data/Ubisite_train3.txt', r'data/pssmpickle2/', label=True)

    ########## Load Testing Data ##########
    test_oneofkey_pos, test_oneofkey_neg, test_pssm_pos, test_pssm_neg, test_physical_pos, test_physical_neg = get_data(
        r'data/Ubisite_test3.txt', r'data/pssmpickle2/', label=False)

    ########## Oneofkey Testing ##########
    test_oneofkey_pos = pd.DataFrame(test_oneofkey_pos)
    test_oneofkey_neg = pd.DataFrame(test_oneofkey_neg)
    test_oneofkey_all = pd.concat([test_oneofkey_pos, test_oneofkey_neg])
    # NOTE: as_matrix() was removed in pandas 1.0; .values is the drop-in
    # replacement — TODO migrate when pandas is upgraded.
    test_oneofkeyX, test_oneofkeyY = convertRawToXY(
        test_oneofkey_all.as_matrix(), codingMode=0)

    ########## Physical Testing ##########
    test_physical_pos = pd.DataFrame(test_physical_pos)
    test_physical_neg = pd.DataFrame(test_physical_neg)
    test_physical_all = pd.concat([test_physical_pos, test_physical_neg])
    test_physicalX, test_physicalY = convertRawToXY(
        test_physical_all.as_matrix(), codingMode=6)

    ########## Pssm Testing ##########
    test_pssm_all = test_pssm_pos + test_pssm_neg
    test_pssmX = convertRawToXY(test_pssm_all, codingMode=7)
    test_pssmY = test_oneofkeyY

    ########## OneofkeyX_t For Shape ##########
    # NOTE(review): the *_t names alias the same array objects, so assigning
    # .shape below reshapes the original test_* arrays in place as well.
    test_oneofkeyX_t = test_oneofkeyX
    test_oneofkeyX_t.shape = (test_oneofkeyX.shape[0], test_oneofkeyX.shape[2],
                              test_oneofkeyX.shape[3])

    ########## PhysicalX_t For Shape ##########
    test_physicalX_t = test_physicalX
    test_physicalX_t.shape = (test_physicalX.shape[0], test_physicalX.shape[2],
                              test_physicalX.shape[3])

    ########### PssmX_t For Shape ##########
    testPssmX_t = test_pssmX
    testPssmX_t.shape = (test_pssmX.shape[0], test_pssmX.shape[2],
                         test_pssmX.shape[3])

    ########## Del Testall ##########
    del test_oneofkey_all, test_physical_all, test_pssm_all

    ########## Set Training Times ##########
    nclass = 20
    # cw sweeps the negative-class weight: {0: 0.1, 1: 1} and {0: 0.2, 1: 1}.
    for cw in range(1, 3, 1):

        c_weight = {0: cw * 0.1, 1: 1}
        ########## Set Training Strate ##########
        for t in range(0, nclass):

            ########### Shulffle All Training Data ##########
            pssm_pos, pssm_neg, oneofkey_pos, oneofkey_neg, physical_pos, physical_neg = shufflewrr(
                pssm_pos, pssm_neg, oneofkey_pos, oneofkey_neg, physical_pos,
                physical_neg)

            ########## A For Positive Data Number Set ##########
            # Balanced sample size: 80% of the positives, taken from both
            # the positive and (freshly shuffled) negative pools.
            a = int(len(oneofkey_pos) * 0.8)

            ########## Oneofkey Training ##########
            train_oneofkey_pos = oneofkey_pos[0:a]
            train_oneofkey_neg = oneofkey_neg[0:a]

            ########## Physical Training ##########
            train_physical_pos = physical_pos[0:a]
            train_physical_neg = physical_neg[0:a]

            ########## Pssm Training ##########
            train_pssm_pos = pssm_pos[0:a]
            train_pssm_neg = pssm_neg[0:a]

            print('total train', len(train_oneofkey_pos),
                  len(train_oneofkey_neg), 'blblblblbl',
                  len(train_physical_pos), len(train_physical_neg),
                  len(train_pssm_pos), len(train_pssm_neg))

            ########## Pos Concat Neg ##########
            train_oneofkey_all = pd.concat(
                [train_oneofkey_pos, train_oneofkey_neg])
            train_physical_all = pd.concat(
                [train_physical_pos, train_physical_neg])
            train_pssm_all = train_pssm_pos + train_pssm_neg
            ########## Shuffle Again ##########
            train_pssm_all, train_oneofkey_all, train_physical_all = shufflePosNeg(
                train_pssm_all, train_oneofkey_all, train_physical_all)

            ########## Dprocess For Codes ##########
            train_oneofkey_all = pd.DataFrame(train_oneofkey_all)
            train_oneofkeyX, train_oneofkeyY = convertRawToXY(
                train_oneofkey_all.as_matrix(), codingMode=0)
            train_physical_all = pd.DataFrame(train_physical_all)
            train_physicalX, train_physicalY = convertRawToXY(
                train_physical_all.as_matrix(), codingMode=6)
            train_pssmX = convertRawToXY(train_pssm_all, codingMode=7)
            train_pssmY = train_oneofkeyY

            ########## Del Trainall ##########
            del train_oneofkey_all, train_physical_all, train_pssm_all

            ########## MultiCNN ##########
            # First round compiles the model; later rounds pass it back in via
            # compilemodels so training continues on the same weights.
            if (t == 0):
                models = MultiCNN(
                    train_oneofkeyX,
                    train_oneofkeyY,
                    train_physicalX,
                    train_pssmX,
                    pre_train_seq_path='bestmodel/best - oneofk - model.h5',
                    pre_train_physical_path=
                    'bestmodel/best - physical - model.h5',
                    pre_train_pssm_path='bestmodel/best - pssm - model.h5',
                    nb_epoch=nb_epoch2,
                    earlystop=earlystop,
                    transferlayer=transferlayer,
                    weights=inputweights,
                    class_weights=c_weight,
                    forkinas=forkinas,
                    compiletimes=t)
                #predict_classes = kutils.probas_to_classes(models.predict([test_oneofkeyX_t,test_physicalX_t,testPssmX_t] ,batch_size=2048))
                #predict_classes = K.round(models.predict(test_physicalX))
                #print('sklearn mcc',sklearn.metrics.matthews_corrcoef(test_physicalY[:,1], predict_classes))
                #print('our calculation',calculate_performance(len(test_physicalY), test_physicalY[:,1], predict_classes))
                #print('No.'+ str(t)+':', models.metrics_names,models.evaluate([test_oneofkeyX_t,test_physicalX_t,testPssmX_t], test_oneofkeyY, batch_size=2048))

            else:
                models = MultiCNN(
                    train_oneofkeyX,
                    train_oneofkeyY,
                    train_physicalX,
                    train_pssmX,
                    pre_train_seq_path='bestmodel/best - oneofk - model.h5',
                    pre_train_physical_path=
                    'bestmodel/best - physical - model.h5',
                    pre_train_pssm_path='bestmodel/best - pssm - model.h5',
                    nb_epoch=nb_epoch2,
                    earlystop=earlystop,
                    transferlayer=transferlayer,
                    weights=inputweights,
                    class_weights=c_weight,
                    forkinas=forkinas,
                    compiletimes=t,
                    compilemodels=models)
                #models.save('physicalfinal',overwrite=True)
                #models.save_weights('physicalweightfinal',overwrite=True)
                #predict_classes = kutils.probas_to_classes(models.predict([test_oneofkeyX_t,test_physicalX_t,testPssmX_t] ,batch_size=2048))
                #predict_classes = K.round(models.predict(test_physicalX))
                #print('sklearn mcc',sklearn.metrics.matthews_corrcoef(test_physicalY[:,1], predict_classes))
                #print('our calculation', calculate_performance(len(test_physicalY), test_physicalY[:,1], predict_classes))
                #print('No.'+ str(t)+':', models.metrics_names,models.evaluate([test_oneofkeyX_t,test_physicalX_t,testPssmX_t], test_oneofkeyY, batch_size=2048))

            #predict testing set
            pred_proba = models.predict(
                [test_oneofkeyX, test_physicalX, test_pssmX], batch_size=2048)
            # NOTE(review): kutils.probas_to_classes is a Keras-1-era utility
            # removed in Keras 2 — confirm the pinned Keras version.
            predict_classes = kutils.probas_to_classes(pred_proba)
            #SAVE the prediction metrics
            with open('result/evaluation.txt', mode='a') as resFile:
                resFile.write(
                    str(cw) + ' ' + str(t) + ' ' + calculate_performance(
                        len(test_physicalY), test_physicalY[:, 1],
                        predict_classes, pred_proba[:, 1]) + '\r\n')
            # Redundant: the 'with' block above already closed the file.
            resFile.close()
            true_label = test_oneofkeyY
            # Two columns: true label, predicted probability of class 1.
            result = np.column_stack((true_label[:, 1], pred_proba[:, 1]))
            result = pd.DataFrame(result)
            result.to_csv(path_or_buf='result/result' + '-' + str(t) + '-' +
                          str(cw) + '-' + '.txt',
                          index=False,
                          header=None,
                          sep='\t',
                          quoting=csv.QUOTE_NONNUMERIC)

    ########## Del Test Data ##########
    del test_pssm_pos, test_pssm_neg, test_oneofkey_pos, test_oneofkey_neg, test_physical_pos, test_physical_neg
Ejemplo n.º 3
0
def bootStrapping_allneg_continue_keras2(trainfile,
                                         valfile=None,
                                         srate=0.8,
                                         nb_epoch1=3,
                                         nb_epoch2=30,
                                         earlystop=None,
                                         maxneg=None,
                                         model=0,
                                         codingMode=0,
                                         lam_recon=0,
                                         inputweights=None,
                                         outputweights=None,
                                         nb_classes=2):
    """Train Capsnet models by bootstrapping over the (larger) negative class.

    The positives are re-sampled once per outer round and paired in turn with
    each of ``nclass`` equally-sized, disjoint chunks of the shuffled
    negatives, so every inner fit sees a balanced training set while training
    continues on the same compiled model.

    Args:
        trainfile: numpy matrix of fragments; column 0 holds the class label
            (non-zero = positive, 0 = negative).
        valfile: optional validation matrix in the same layout; when None,
            ~10% of the training data is held out for validation instead.
        srate: fraction of positives sampled per bootstrap round.
        nb_epoch1: number of outer bootstrap rounds.
        nb_epoch2: epochs per Capsnet_main call.
        earlystop: early-stopping patience forwarded to Capsnet_main.
        maxneg: optional cap on the number of negative chunks used.
        model: modeltype forwarded to Capsnet_main.
        codingMode: encoding mode forwarded to convertRawToXY.
        lam_recon: reconstruction-loss weight forwarded to Capsnet_main.
        inputweights: initial weights, if any, forwarded to Capsnet_main.
        outputweights: when given, model weights are saved here after every fit.
        nb_classes: number of output classes.

    Returns:
        (models, eval_model, manipulate_model, weight_c_model, fitHistory)
        from the last Capsnet_main call.
    """
    trainX = trainfile
    # Split by label: column 0 != 0 marks a positive fragment.
    train_pos = trainX[np.where(trainX[:, 0] != 0)]
    train_neg = trainX[np.where(trainX[:, 0] == 0)]
    train_pos = pd.DataFrame(train_pos)
    train_neg = pd.DataFrame(train_neg)
    train_pos_s = train_pos.sample(train_pos.shape[0])  # shuffled positives
    train_neg_s = train_neg.sample(train_neg.shape[0])  # shuffled negatives
    slength = int(train_pos.shape[0] * srate)  # positives per round
    nclass = int(train_neg.shape[0] / slength)  # number of negative chunks
    if (valfile is not None):  # use all data in valfile as val
        valX = valfile
        val_pos = valX[np.where(valX[:, 0] != 0)]
        val_neg = valX[np.where(valX[:, 0] == 0)]
        val_pos = pd.DataFrame(val_pos)
        val_neg = pd.DataFrame(val_neg)
        val_all = pd.concat([val_pos, val_neg])
        # NOTE: DataFrame.as_matrix() was removed in pandas 1.0; .values is
        # the drop-in replacement — TODO migrate when pandas is upgraded.
        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)
    else:  # hold out ~10% of the training data as validation
        a = int(train_pos.shape[0] * 0.9)
        b = train_neg.shape[0] - int(train_pos.shape[0] * 0.1)
        print("train pos=" + str(train_pos.shape[0]) + "\n")
        print("train neg=" + str(train_neg.shape[0]) + "\n")
        print(" a=" + str(a) + " b=" + str(b) + "\n")
        train_pos_s = train_pos[0:a]
        train_neg_s = train_neg[0:b]
        print("train pos s=" + str(train_pos_s.shape[0]) + "\n")
        print("train neg s=" + str(train_neg_s.shape[0]) + "\n")

        val_pos = train_pos[(a + 1):]
        print("val_pos=" + str(val_pos.shape[0]) + "\n")
        val_neg = train_neg[b + 1:]
        print("val_neg=" + str(val_neg.shape[0]) + "\n")

        val_all = pd.concat([val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)
        # Recompute the chunk geometry on the reduced training split.
        slength = int(train_pos_s.shape[0] * srate)
        nclass = int(train_neg_s.shape[0] / slength)

    if (maxneg is not None):
        # Cannot do more than maxneg bootstrap chunks.
        nclass = min(maxneg, nclass)

    for epoch in range(nb_epoch1):
        # Reshuffle the negatives and redraw the shared positive sample.
        train_neg_s = train_neg_s.sample(train_neg_s.shape[0])
        train_pos_ss = train_pos_s.sample(slength)
        for t in range(nclass):
            # Pair the shared positives with the t-th negative chunk.
            train_neg_ss = train_neg_s[(slength * t):(slength * t + slength)]
            train_all = pd.concat([train_pos_ss, train_neg_ss])
            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(),
                                              codingMode=codingMode)
            if t == 0:
                # First chunk: compile the model from scratch.
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1,
                    trainY=trainY1,
                    valX=valX1,
                    valY=valY1,
                    nb_classes=nb_classes,
                    nb_epoch=nb_epoch2,
                    earlystop=earlystop,
                    weights=inputweights,
                    compiletimes=t,
                    lr=0.001,
                    batch_size=500,
                    lam_recon=lam_recon,
                    routings=3,
                    class_weight=None,
                    modeltype=model)
            else:
                # Later chunks: keep training the already-compiled models.
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1,
                    trainY=trainY1,
                    valX=valX1,
                    valY=valY1,
                    nb_classes=nb_classes,
                    nb_epoch=nb_epoch2,
                    earlystop=earlystop,
                    weights=inputweights,
                    compiletimes=t,
                    compilemodels=(models, eval_model, manipulate_model,
                                   weight_c_model),
                    lr=0.001,
                    batch_size=500,
                    lam_recon=lam_recon,
                    routings=3,
                    class_weight=None,
                    modeltype=model)

            print("modelweights assigned for " + str(epoch) + " and " +
                  str(t) + "\n")
            if (outputweights is not None):
                models.save_weights(outputweights, overwrite=True)

    return models, eval_model, manipulate_model, weight_c_model, fitHistory
Ejemplo n.º 4
0
def main():
    """Predict PTM sites with a previously trained custom Capsnet model.

    Reads a fasta input file, loads the parameter file written by the
    training scripts ([-model-prefix]_parameters), extracts candidate
    fragments, averages the predictions of the nclass bootstrap models
    ([-model-prefix]_HDF5model_class<i>) and writes a tab-separated result
    file '<output>.txt' with columns (id, position, residue, probability).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-input',
        dest='inputfile',
        type=str,
        help='Protein sequences to be predicted in fasta format.',
        required=True)
    parser.add_argument('-output',
                        dest='outputfile',
                        type=str,
                        help='prefix of the prediction results.',
                        required=True)
    parser.add_argument(
        '-model-prefix',
        dest='modelprefix',
        type=str,
        help=
        'prefix of custom model used for prediciton. If donnot have one, please run train_general.py to train a custom general PTM model or run train_kinase.py to train a custom kinase-specific PTM model.',
        required=False,
        default=None)
    parser.add_argument(
        '-residue-types',
        dest='residues',
        type=str,
        help=
        'Residue types that to be predicted, only used when -predict-type is \'general\'. For multiple residues, seperate each with \',\'',
        required=False,
        default="S,T,Y")

    args = parser.parse_args()

    inputfile = args.inputfile
    outputfile = args.outputfile
    residues = args.residues.split(",")
    modelprefix = args.modelprefix

    if modelprefix is None:
        print("Please specify the prefix for an existing custom model by -model-prefix!\n"
              "       It indicates two files [-model-prefix]_HDF5model and [-model-prefix]_parameters.\n"
              "        If you don't have such files, please run train_models.py to get the custom model first!\n")
        exit()

    # Custom prediction: the prefix points at the weight and parameter files
    # written by the training scripts.
    model = modelprefix + "_HDF5model"
    parameter = modelprefix + "_parameters"
    try:
        # Read the parameter file once; 'with' guarantees the handle is closed
        # (the original opened it twice and leaked the first handle).
        with open(parameter, 'r') as f:
            parameters = f.read()
    except IOError:
        print('cannot open ' + parameter + " ! check if the model exists. please run train_general.py or train_kinase.py to get the custom model first!\n")
        # Nothing below can run without the parameter file (the original fell
        # through here and crashed later with a NameError on 'parameters').
        exit()

    from DProcess import convertRawToXY
    from EXtractfragment_sort import extractFragforPredict
    from capsulenet import Capsnet_main

    # Tab-separated parameter fields: 0=nclass, 1=window, 2=residues,
    # 4=codemode, 5=modeltype, 6=nb_classes (field 3 unused here).
    fields = parameters.split("\t")
    nclass = int(fields[0])
    window = int(fields[1])
    residues = fields[2].split(",")
    codemode = int(fields[4])
    modeltype = str(fields[5])
    nb_classes = int(fields[6])

    testfrag, ids, poses, focuses = extractFragforPredict(inputfile,
                                                          window,
                                                          '-',
                                                          focus=residues)

    # NOTE: as_matrix() was removed in pandas 1.0; .values is the drop-in
    # replacement — TODO migrate when pandas is upgraded.
    testX, testY = convertRawToXY(testfrag.as_matrix(),
                                  codingMode=codemode)
    predictproba = np.zeros((testX.shape[0], 2))
    # Build the network once just to obtain the architecture/config.
    models = Capsnet_main(testX,
                          testY,
                          nb_epoch=1,
                          compiletimes=0,
                          lr=0.001,
                          batch_size=500,
                          lam_recon=0,
                          routings=3,
                          modeltype=modeltype,
                          nb_classes=nb_classes,
                          predict=True)  # only to get config

    nclass_ini = 1
    # Average the probabilities of the nclass bootstrap models.
    for bt in range(nclass):
        models[0].load_weights(model + "_class" + str(bt))
        predictproba += models[1].predict(testX)[0]

    predictproba = predictproba / (nclass * nclass_ini)
    poses = poses + 1  # convert 0-based positions to 1-based for output
    results = np.column_stack((ids, poses, focuses, predictproba[:, 1]))
    result = pd.DataFrame(results)
    result.to_csv(outputfile + ".txt",
                  index=False,
                  header=None,
                  sep='\t',
                  quoting=csv.QUOTE_NONNUMERIC)
    print("Successfully predicted from custom models !\n")
Ejemplo n.º 5
0
    del windows_pos, windows_neg
    return windows_all


# Load windowed positive/negative fragments (plus PSSM pickles) for the
# pretrain train/val/test splits.
all_train_windows_pos, all_train_windows_neg = get_data(
    r'data/pretrain/1train.txt', r'data/pssmpickle2/', label=True)
val_windows_pos, val_windows_neg = get_data(r'data/pretrain/1val.txt',
                                            r'data/pssmpickle2/',
                                            label=True)
test_windows_pos, test_windows_neg = get_data(r'data/pretrain/1test.txt',
                                              r'data/pssmpickle2/',
                                              label=True)

test_windows_all = get_matrix(test_windows_pos, test_windows_neg)

# Encode the combined test windows once per coding mode (0 looks like the
# one-of-key encoding; 9-13 presumably select different physical-property
# encodings — TODO confirm against convertRawToXY).
test_oneofkeyX, testY = convertRawToXY(test_windows_all, codingMode=0)
# In-place reshape dropping axis 1 (which must be singleton for the sizes
# to match): (n, 1, w, d) -> (n, w, d).
test_oneofkeyX.shape = (test_oneofkeyX.shape[0], test_oneofkeyX.shape[2],
                        test_oneofkeyX.shape[3])
test_physicalXo, _ = convertRawToXY(test_windows_all, codingMode=9)
test_physicalXo.shape = (test_physicalXo.shape[0], test_physicalXo.shape[2],
                         test_physicalXo.shape[3])
test_physicalXp, _ = convertRawToXY(test_windows_all, codingMode=10)
test_physicalXp.shape = (test_physicalXp.shape[0], test_physicalXp.shape[2],
                         test_physicalXp.shape[3])
test_physicalXh, _ = convertRawToXY(test_windows_all, codingMode=11)
test_physicalXh.shape = (test_physicalXh.shape[0], test_physicalXh.shape[2],
                         test_physicalXh.shape[3])
test_physicalXc, _ = convertRawToXY(test_windows_all, codingMode=12)
test_physicalXc.shape = (test_physicalXc.shape[0], test_physicalXc.shape[2],
                         test_physicalXc.shape[3])
# NOTE(review): unlike the encodings above, test_physicalXb is not reshaped
# here — confirm whether a reshape was lost (this snippet looks truncated).
test_physicalXb, _ = convertRawToXY(test_windows_all, codingMode=13)
def bootStrapping_allneg_continue_keras2(trainfile, valfile=None, srate=0.8,
                                         nb_epoch1=3, nb_epoch2=30, earlystop=None,
                                         maxneg=None, model=0, codingMode=0, lam_recon=0,
                                         inputweights=None, outputweights=None, nb_classes=2,
                                         hw_res=None, hc_res=None, hc_res2=None):
    """Train a capsule network by bootstrapping over the larger negative set.

    trainfile: ndarray of fragments; column 0 holds the class label
               (0/1 = positive classes, 2/3 = the matching negative classes).
    valfile:   optional ndarray used as-is for validation; when None, 10% of
               the training positives/negatives are held out instead.
    srate:     fraction of positives sampled per bootstrap round.
    maxneg:    cap on the number of bootstrap (negative re-sampling) rounds.
    hw_res, hc_res: accepted for interface compatibility but unused here.
    hc_res2:   pair of (negative) labels that receive class weights.
    Returns (models, eval_model, manipulate_model, weight_c_model, fitHistory)
    from the last Capsnet_main call.
    """
    train_pos = {}      # per-class positives: 0 = S/T, 1 = Y
    train_neg = {}      # per-class negatives
    train_pos_s = {}
    train_neg_s = {}
    train_pos_ss = {}
    train_neg_ss = {}
    slength = {}
    nclass = {}
    trainX = trainfile
    # Labels may arrive as floats; force them to int so equality tests work.
    for i in range(len(trainX)):
        trainX[i, 0] = int(trainX[i, 0])

    for i in range(2):
        # Label i is the i-th positive class; label i+2 its negative class.
        train_pos[i] = trainX[np.where(trainX[:, 0] == i)]
        train_neg[i] = trainX[np.where(trainX[:, 0] == i + 2)]
        train_pos[i] = pd.DataFrame(train_pos[i])
        train_neg[i] = pd.DataFrame(train_neg[i])
        train_pos_s[i] = train_pos[i].sample(train_pos[i].shape[0])  # shuffle positives
        train_neg_s[i] = train_neg[i].sample(train_neg[i].shape[0])  # shuffle negatives
        slength[i] = int(train_pos[i].shape[0] * srate)
        nclass[i] = int(train_neg[i].shape[0] / slength[i])

    if valfile is not None:  # use all supplied data as validation
        valX = valfile
        for i in range(len(valX)):
            valX[i, 0] = int(valX[i, 0])

        val_all = pd.DataFrame()
        for i in range(2):
            val_pos = pd.DataFrame(valX[np.where(valX[:, 0] == i)])
            val_neg = pd.DataFrame(valX[np.where(valX[:, 0] == i + 2)])
            val_all = pd.concat([val_all, val_pos, val_neg])

        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)
    else:  # hold out 10% of the training data for validation
        val_all = pd.DataFrame()
        nclass = {}
        for i in range(2):
            a = int(train_pos[i].shape[0] * 0.9)
            b = train_neg[i].shape[0] - int(train_pos[i].shape[0] * 0.1)
            print("train pos=" + str(train_pos[i].shape[0]) + '\n')
            print("train neg=" + str(train_neg[i].shape[0]) + '\n')
            print(" a=" + str(a) + " b=" + str(b) + '\n')
            train_pos_s[i] = train_pos[i][0:a]
            train_neg_s[i] = train_neg[i][0:b]
            print("train pos s=" + str(train_pos_s[i].shape[0]) + '\n')
            print("train neg s=" + str(train_neg_s[i].shape[0]) + '\n')

            val_pos = train_pos[i][(a + 1):]
            print("val_pos=" + str(val_pos.shape[0]) + '\n')
            val_neg = train_neg[i][b + 1:]
            print("val_neg=" + str(val_neg.shape[0]) + '\n')
            val_all = pd.concat([val_all, val_pos, val_neg])

            slength[i] = int(train_pos_s[i].shape[0] * srate)  # 10% moved to val, so update
            nclass[i] = int(train_neg_s[i].shape[0] / slength[i])

        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)

    # Number of bootstrap rounds. (Fixed: the original only defined nclass_n
    # inside `if maxneg is not None`, raising NameError when maxneg was None.)
    nclass_n = max(nclass[0], nclass[1])
    if maxneg is not None:
        nclass_n = min(nclass_n, maxneg)

    for I in range(nb_epoch1):
        for i in range(2):
            train_neg_s[i] = train_neg_s[i].sample(train_neg_s[i].shape[0])  # reshuffle negatives
            train_pos_ss[i] = train_pos_s[i].sample(slength[i])  # fresh positive subsample

        for t in range(nclass_n):
            train_all = pd.DataFrame()
            for i in range(2):
                # t-th slice of the (shuffled) negatives, wrapped via modulo.
                start = (slength[i] * t) % nclass[i]
                train_neg_ss[i] = train_neg_s[i][start:start + slength[i]]
                train_all = pd.concat([train_all, train_pos_ss[i], train_neg_ss[i]])

            classweights = None
            if hc_res2 is not None:  # the two negative classes get weights, e.g. [0,2] for T
                classweights = {k: 1.0 for k in range(nb_classes)}
                labels = train_all.as_matrix()[:, 0]
                classweights[hc_res2[0]] = float(sum(labels <= 1)) / sum(labels == hc_res2[0])
                classweights[hc_res2[1]] = float(sum(labels <= 1)) / sum(labels == hc_res2[1])

            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(), codingMode=codingMode)
            if t == 0:
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2, earlystop=earlystop,
                    weights=inputweights, compiletimes=t, lr=0.001, batch_size=1000,
                    lam_recon=lam_recon, routings=3, class_weight=classweights,
                    modeltype=model)
            else:
                # Reuse the models compiled in round 0. (Fixed: the original
                # passed undefined plural names eval_models/manipulate_models/
                # weight_c_models here, raising NameError on the 2nd round.)
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2, earlystop=earlystop,
                    weights=inputweights, compiletimes=t,
                    compilemodels=(models, eval_model, manipulate_model, weight_c_model),
                    lr=0.001, batch_size=1000, lam_recon=lam_recon, routings=3,
                    class_weight=classweights, modeltype=model)

            print("modelweights assigned for " + str(I) + " and " + str(t) + "\n")
            if outputweights is not None:
                models.save_weights(outputweights + '_iteration' + str(t), overwrite=True)

    return models, eval_model, manipulate_model, weight_c_model, fitHistory
Ejemplo n.º 7
0
def bootStrapping_allneg_continue_keras2(trainfile,
                                         valfile=None,
                                         srate=0.8,
                                         nb_epoch1=3,
                                         nb_epoch2=30,
                                         earlystop=None,
                                         maxneg=None,
                                         codingMode=0,
                                         inputweights=None,
                                         outputweights=None,
                                         nb_classes=2,
                                         transferlayer=0,
                                         forkinase=False,
                                         predict=False,
                                         balance_validation=True,
                                         monitor_file_name=None,
                                         save_best_only=True,
                                         load_average_weight=False):
    """Bootstrap-train MultiCNN against an over-represented negative class.

    trainfile: ndarray of fragments; column 0 holds the label
               (0 = negative, anything else = positive).
    valfile:   optional ndarray used for validation; when None, 10% of the
               training data is held out instead.
    srate:     fraction of positives sampled per bootstrap round.
    maxneg:    cap on the number of bootstrap (negative re-sampling) rounds.
    balance_validation: when True, down-sample validation negatives to match
               the number of validation positives.
    Returns the model from the last MultiCNN call.
    """
    trainX = trainfile
    # Labels may arrive as floats; force them to int so comparisons work.
    for i in range(len(trainX)):
        trainX[i, 0] = int(trainX[i, 0])

    train_pos = trainX[np.where(trainX[:, 0] != 0)]  # positives: label != 0
    train_neg = trainX[np.where(trainX[:, 0] == 0)]  # negatives: label == 0
    train_pos = pd.DataFrame(train_pos)
    train_neg = pd.DataFrame(train_neg)
    train_pos_s = train_pos.sample(train_pos.shape[0])  # shuffle positives
    train_neg_s = train_neg.sample(train_neg.shape[0])  # shuffle negatives
    slength = int(train_pos.shape[0] * srate)   # positives per round
    nclass = int(train_neg.shape[0] / slength)  # rounds needed to cover all negatives
    if valfile is not None:  # use all data in valfile as validation
        valX = valfile
        for i in range(len(valX)):
            valX[i, 0] = int(valX[i, 0])

        val_pos = pd.DataFrame(valX[np.where(valX[:, 0] != 0)])
        val_neg = pd.DataFrame(valX[np.where(valX[:, 0] == 0)])
        if balance_validation:  # extract a balanced val_neg
            val_neg = val_neg.sample(val_pos.shape[0])

        val_all = pd.concat([val_pos, val_neg])
        val_all_s = val_all.sample(val_all.shape[0])
        # .values replaces the long-deprecated DataFrame.as_matrix().
        valX1, valY1 = convertRawToXY(val_all_s.values,
                                      codingMode=codingMode)
    else:  # hold out 10% of the training data for validation
        a = int(train_pos.shape[0] * 0.9)
        b = train_neg.shape[0] - int(train_pos.shape[0] * 0.1)
        print("train pos=" + str(train_pos.shape[0]) + '\n')
        print("train neg=" + str(train_neg.shape[0]) + '\n')
        print(" a=" + str(a) + " b=" + str(b) + '\n')
        train_pos_s = train_pos[0:a]
        train_neg_s = train_neg[0:b]
        print("train pos s=" + str(train_pos_s.shape[0]) + '\n')
        print("train neg s=" + str(train_neg_s.shape[0]) + '\n')

        val_pos = train_pos[(a + 1):]
        print("val_pos=" + str(val_pos.shape[0]) + '\n')
        val_neg = train_neg[b + 1:]
        print("val_neg=" + str(val_neg.shape[0]) + '\n')

        val_all = pd.concat([val_pos, val_neg])
        val_all_s = val_all.sample(val_all.shape[0])
        valX1, valY1 = convertRawToXY(val_all_s.values,
                                      codingMode=codingMode)
        slength = int(train_pos_s.shape[0] * srate)  # 10% moved to val, so update
        nclass = int(train_neg_s.shape[0] / slength)

    if maxneg is not None:
        nclass = min(maxneg, nclass)  # cannot do more than maxneg rounds

    for I in range(nb_epoch1):
        train_neg_s = train_neg_s.sample(train_neg_s.shape[0])  # reshuffle negatives
        train_pos_ss = train_pos_s.sample(slength)  # fresh positive subsample
        for t in range(nclass):
            # Round t: t-th negative slice plus the sampled positives.
            train_neg_ss = train_neg_s[(slength * t):(slength * t + slength)]
            train_all = pd.concat([train_pos_ss, train_neg_ss])
            train_all_shuffle = train_all.sample(train_all.shape[0])
            trainX1, trainY1 = convertRawToXY(train_all_shuffle.values,
                                              codingMode=codingMode)
            if t == 0:
                models = MultiCNN(trainX=trainX1,
                                  trainY=trainY1,
                                  valX=valX1,
                                  valY=valY1,
                                  nb_classes=nb_classes,
                                  nb_epoch=nb_epoch2,
                                  earlystop=earlystop,
                                  weights=inputweights,
                                  compiletimes=t,
                                  batch_size=1000,
                                  class_weight=None,
                                  transferlayer=transferlayer,
                                  forkinase=forkinase,
                                  predict=predict,
                                  outputweights=outputweights,
                                  monitor_file=monitor_file_name,
                                  save_best_only=save_best_only,
                                  load_average_weight=load_average_weight)
            else:
                # Reuse the model compiled in round 0. (Fixed: predict=predict
                # was dropped here, unlike the t==0 call, silently falling
                # back to MultiCNN's default.)
                models = MultiCNN(trainX=trainX1,
                                  trainY=trainY1,
                                  valX=valX1,
                                  valY=valY1,
                                  nb_classes=nb_classes,
                                  nb_epoch=nb_epoch2,
                                  earlystop=earlystop,
                                  weights=inputweights,
                                  compiletimes=t,
                                  compilemodels=models,
                                  batch_size=1000,
                                  class_weight=None,
                                  transferlayer=transferlayer,
                                  forkinase=forkinase,
                                  predict=predict,
                                  outputweights=outputweights,
                                  monitor_file=monitor_file_name,
                                  save_best_only=save_best_only,
                                  load_average_weight=load_average_weight)

            # Weight saving happens inside MultiCNN's callbacks.
            print("modelweights assigned for " + str(I) + " and " + str(t) +
                  "\n")

    return models
def bootStrapping_allneg_continue_keras2(
    trainfile,
    valfile=None,
    srate=0.8,
    nb_epoch1=3,
    nb_epoch2=30,
    earlystop=None,
    maxneg=None,
    model=0,
    codingMode=0,
    frozenlayer=1,
    inputweights=None,
    outputweights=None,
    forkinas=False,
    nb_classes=2,
    hw_res=None,
    hc_res=None,
    hc_res2=None
):
    """Bootstrap-train MultiCNN over two positive/negative class pairs.

    trainfile/valfile: paths to tab-separated fragment tables; column 0 is
    the label (sp 0, tp 1, yp 2, sn 3, tn 4, yn 5 -- this function pairs
    label i with its negative label i+2 for i in {0, 1}).
    srate:   fraction of positives sampled per bootstrap round.
    maxneg:  cap on the number of bootstrap rounds.
    hw_res:  label whose samples get up-weighted sample weights.
    hc_res / hc_res2: labels whose classes get up-weighted class weights.
    Returns the model from the last MultiCNN call.
    """
    train_pos = {}  # per-class positives: 0 S, 1 T
    train_neg = {}
    train_pos_s = {}
    train_neg_s = {}
    train_pos_ss = {}
    train_neg_ss = {}
    slength = {}
    nclass = {}
    trainX = pd.read_table(trainfile, sep='\t', header=None).values
    for i in range(2):
        train_pos[i] = trainX[np.where(trainX[:, 0] == i)]      # positives: label i
        train_neg[i] = trainX[np.where(trainX[:, 0] == i + 2)]  # negatives: label i+2
        train_pos[i] = pd.DataFrame(train_pos[i])
        train_neg[i] = pd.DataFrame(train_neg[i])
        train_pos_s[i] = train_pos[i].sample(train_pos[i].shape[0])  # shuffle positives
        train_neg_s[i] = train_neg[i].sample(train_neg[i].shape[0])  # shuffle negatives
        slength[i] = int(train_pos[i].shape[0] * srate)
        nclass[i] = int(train_neg[i].shape[0] / slength[i])

    if valfile is not None:  # use all supplied data as validation
        valX = pd.read_table(valfile, sep='\t', header=None).values
        val_all = pd.DataFrame()
        for i in range(2):
            val_pos = pd.DataFrame(valX[np.where(valX[:, 0] == i)])
            val_neg = pd.DataFrame(valX[np.where(valX[:, 0] == i + 2)])
            val_all = pd.concat([val_all, val_pos, val_neg])

        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)
    else:  # hold out 10% of the training data for validation
        val_all = pd.DataFrame()
        nclass = {}
        for i in range(2):
            a = int(train_pos[i].shape[0] * 0.9)
            b = train_neg[i].shape[0] - int(train_pos[i].shape[0] * 0.1)
            print("train pos=" + str(train_pos[i].shape[0]) + '\n')
            print("train neg=" + str(train_neg[i].shape[0]) + '\n')
            print(" a=" + str(a) + " b=" + str(b) + '\n')
            train_pos_s[i] = train_pos[i][0:a]
            train_neg_s[i] = train_neg[i][0:b]
            print("train pos s=" + str(train_pos_s[i].shape[0]) + '\n')
            print("train neg s=" + str(train_neg_s[i].shape[0]) + '\n')

            val_pos = train_pos[i][(a + 1):]
            print("val_pos=" + str(val_pos.shape[0]) + '\n')
            val_neg = train_neg[i][b + 1:]
            print("val_neg=" + str(val_neg.shape[0]) + '\n')
            val_all = pd.concat([val_all, val_pos, val_neg])

            slength[i] = int(train_pos_s[i].shape[0] * srate)  # 10% moved to val, so update
            nclass[i] = int(train_neg_s[i].shape[0] / slength[i])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)

    # Number of bootstrap rounds. (Fixed: the original only defined nclass_n
    # inside `if maxneg is not None`, raising NameError when maxneg was None.)
    nclass_n = max(nclass.values())
    if maxneg is not None:
        nclass_n = min(nclass_n, maxneg)  # cannot do more than maxneg rounds

    for I in range(nb_epoch1):
        for i in range(2):
            train_neg_s[i] = train_neg_s[i].sample(train_neg_s[i].shape[0])  # reshuffle negatives
            train_pos_ss[i] = train_pos_s[i].sample(slength[i])  # fresh positive subsample

        for t in range(nclass_n):
            train_all = pd.DataFrame()
            for i in range(2):
                # t-th slice of the (shuffled) negatives, wrapped via modulo.
                start = (slength[i] * t) % nclass[i]
                train_neg_ss[i] = train_neg_s[i][start:start + slength[i]]
                train_all = pd.concat(
                    [train_all, train_pos_ss[i], train_neg_ss[i]])

            labels = train_all.as_matrix()[:, 0]

            sampleweights = None
            if hw_res is not None:
                # Scale hw_res samples so they weigh as much as all positive
                # (label != 0) samples combined.
                sampleweights = np.ones(len(train_all))
                sampleweights[np.where(labels == hw_res)] *= (
                    sum(sampleweights[np.where(labels != 0)]) /
                    sum(sampleweights[np.where(labels == hw_res)]))

            classweights = None
            if hc_res is not None:
                classweights = {0: 1, 1: 1, 2: 1, 3: 1}  # 0 negative, 1 S, 2 T, 3 Y
                # float() guards against Python-2 integer division (the
                # original silently floored this ratio here, unlike the
                # hc_res2 branch below).
                classweights[hc_res] = float(sum(labels != 0)) / sum(
                    labels == hc_res)

            if hc_res2 is not None:  # the negative classes get weights too
                classweights = {k: 1.0 for k in range(nb_classes)}
                classweights[hc_res2[0]] = float(sum(labels < 2)) / sum(
                    labels == hc_res2[0])
                classweights[hc_res2[1]] = float(sum(labels < 2)) / sum(
                    labels == hc_res2[1])

            print(train_all.as_matrix())
            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(),
                                              codingMode=codingMode)
            print("#" * 30)
            print(trainX1.shape)
            print("#" * 30)
            if t == 0:
                models = MultiCNN(trainX1,
                                  trainY1,
                                  valX1,
                                  valY1,
                                  nb_epoch=nb_epoch2,
                                  earlystop=earlystop,
                                  model=model,
                                  frozenlayer=frozenlayer,
                                  weights=inputweights,
                                  sample_weight=sampleweights,
                                  nb_classes=nb_classes,
                                  class_weight=classweights,
                                  forkinas=forkinas,
                                  compiletimes=t)
            else:
                # Reuse the model compiled in round 0.
                models = MultiCNN(trainX1,
                                  trainY1,
                                  valX1,
                                  valY1,
                                  nb_epoch=nb_epoch2,
                                  earlystop=earlystop,
                                  model=model,
                                  frozenlayer=frozenlayer,
                                  weights=inputweights,
                                  sample_weight=sampleweights,
                                  nb_classes=nb_classes,
                                  class_weight=classweights,
                                  forkinas=forkinas,
                                  compiletimes=t,
                                  compilemodels=models)

            print("modelweights assigned for " + str(I) + " and " + str(t) +
                  "\n")
            if outputweights is not None:
                models.save_weights(outputweights + '_iteration' + str(t),
                                    overwrite=True)

    return models
Ejemplo n.º 9
0
def main():
    """Predict modification sites with a pre-trained CNN and write results.

    Reads input sequences, encodes them with four feature schemes, rebuilds
    the model graph, loads the trained weights, and writes a tab-separated
    prediction table to <out_result>_custom2.txt.
    """
    args = getArgs()
    inputfile = args.inputfile
    species = args.speciesarg
    out_result = args.outresult

    # ---- fixed training/architecture hyper-parameters ----
    earlystop = 20
    batch_size = 512

    # Input dimensions (rows x cols) per encoding scheme.
    input_row_One_Hot = 41
    input_col_One_Hot = 5
    input_row_ANF_NCP = 41
    input_col_ANF_NCP = 9
    input_row_CKSNAP_NCP = 150
    input_col_CKSNAP_NCP = 17
    input_row_PSTNPss_NCP = 39
    input_col_PSTNPss_NCP = 25

    # Adam optimizer settings.
    beta_1 = 0.9
    beta_2 = 0.999
    epsilon = 1e-08

    # Attention layer sizes and regularization.
    attentionhidden_x = 10
    attentionhidden_xr = 8
    attention_reg_x = 0.151948
    attention_reg_xr = 2

    learning_rate = 0.002
    weight_decay = 0.00005

    # Convolution filter counts and dropout rates.
    conv1_filter = 32
    dropoutMerge1 = 0.5
    conv2_filter = 136
    dropoutMerge2 = 0.1
    conv3_filter = 48
    dropoutMerge3 = 0.1
    dropoutMerge4 = 0.5

    dense1 = 64
    dropout_dense1 = 0.5
    dense3 = 8

    weightsModel = args.weightsModelarg
    best_model = weightsModel

    mycsvTrain = species + '_train_cnn.txt'
    train_All = pd.read_csv(mycsvTrain)

    window = 20

    # Parse the prediction input. (Fixed: the original used two independent
    # ifs with no fallback, so an unknown inputType left predict_data
    # undefined and crashed with NameError below.)
    if args.inputType == 'file':
        predict_data, ids, poses, focuses = analyseFASTAPredict(
            inputfile, window, '-', focus='C')
    elif args.inputType == 'fixed':
        predict_data, ids, poses, focuses = analyseFixedPredict(inputfile)
    else:
        raise ValueError("unknown inputType: " + str(args.inputType))

    testLabel = predict_data.iloc[:, 0]

    # Encode the prediction windows once per feature scheme.
    testX_One_Hot, testY_One_Hot = convertRawToXY(
        train_All.as_matrix(), predict_data.as_matrix(), codingMode='ENAC')
    testX_ANF_NCP, testY_ANF_NCP = convertRawToXY(
        train_All.as_matrix(), predict_data.as_matrix(),
        codingMode='ANF_NCP_EIIP_Onehot')
    testX_CKSNAP_NCP, testY_CKSNAP_NCP = convertRawToXY(
        train_All.as_matrix(), predict_data.as_matrix(),
        codingMode='CKSNAP_NCP_EIIP_Onehot')
    testX_PSTNPss_NCP, testY_PSTNPss_NCP = convertRawToXY(
        train_All.as_matrix(), predict_data.as_matrix(),
        codingMode='PSTNPss_NCP_EIIP_Onehot')

    # Rebuild the model graph in prediction mode (train=False).
    predict_model = train_cnn(learning_rate=learning_rate,weight_decay=weight_decay,dropoutMerge1=dropoutMerge1,
                            dropoutMerge2=dropoutMerge2,dropoutMerge3=dropoutMerge3,dropoutMerge4=dropoutMerge4,
                            dropout_dense1=dropout_dense1,dense1=dense1,dense3=dense3,conv1_filter=conv1_filter,
                            conv2_filter=conv2_filter,conv3_filter=conv3_filter,earlystop = earlystop,batch_size = batch_size,
                            input_row_One_Hot = input_row_One_Hot,input_col_One_Hot = input_col_One_Hot,
                            input_row_ANF_NCP = input_row_ANF_NCP,input_col_ANF_NCP = input_col_ANF_NCP,
                            input_row_CKSNAP_NCP = input_row_CKSNAP_NCP,input_col_CKSNAP_NCP = input_col_CKSNAP_NCP,
                            input_row_PSTNPss_NCP = input_row_PSTNPss_NCP,input_col_PSTNPss_NCP = input_col_PSTNPss_NCP,
                            beta_1 = beta_1,beta_2 = beta_2,epsilon = epsilon,
                            attentionhidden_x = attentionhidden_x,attentionhidden_xr = attentionhidden_xr,
                            attention_reg_x = attention_reg_x,attention_reg_xr = attention_reg_xr,
                            best_model = best_model,weightsModel = weightsModel,predict = True, train = False,transfer=False)

    predict_model.load_weights(weightsModel)

    predictproba = predict_model.predict(
        [testX_One_Hot, testX_ANF_NCP, testX_CKSNAP_NCP, testX_PSTNPss_NCP])

    poses = poses + 1  # convert 0-based positions to 1-based for output

    # Map the species code (last path component) to its display name.
    # (Fixed: the original if/elif chain had no fallback, so an unknown code
    # crashed with NameError when speciess2 was used below.)
    species_names = {
        'A': 'A.thaliana',
        'C': 'C.elegan',
        'D': 'D.melanogaster',
        'E': 'E.coli',
        'Gsub': 'G.subterraneus',
        'Gpick': 'G.pickeringii',
    }
    speciesArr = species.split('/')
    speciess = speciesArr[len(speciesArr) - 1]
    if speciess not in species_names:
        raise ValueError("unknown species code: " + speciess)
    speciess2 = species_names[speciess]

    typess = np.full((testLabel.shape[0], 1), speciess2)
    results = np.column_stack((ids, poses, focuses, predictproba[:, 1], typess))
    result = pd.DataFrame(results)
    result.to_csv(out_result + "_custom2.txt",
                  index=False,
                  header=None,
                  sep='\t',
                  quoting=csv.QUOTE_NONNUMERIC)
def bootStrapping_allneg_continue_val(
    trainfile,
    valfile=None,
    srate=0.8,
    nb_epoch1=3,
    nb_epoch2=30,
    earlystop=None,
    maxneg=None,
    codingMode=0,
    transferlayer=1,
    inputweights=None,
    outputweights=None,
    forkinas=False
):  #inputfile:fragments (n*34);srate:selection rate for positive data;nclass:number of class models
    """Bootstrap-train MultiCNN on a binary-labelled fragment set.

    trainfile: ndarray of fragments; column 0 holds the label
               (1 = positive, anything else = negative).
    valfile:   optional DataFrame used for validation (converted via
               as_matrix); when None, 10% of the training data is held out.
    srate:     fraction of positives sampled per bootstrap round.
    maxneg:    cap on the number of bootstrap (negative re-sampling) rounds.
    Returns the model from the last MultiCNN call.
    NOTE(review): Python-2 print statements -- this function is py2-only.
    """
    trainX = trainfile
    # Split by label: 1 is positive, everything else negative.
    train_pos = trainX[np.where(trainX[:, 0] == 1)]
    train_neg = trainX[np.where(trainX[:, 0] != 1)]
    train_pos = pd.DataFrame(train_pos)
    train_neg = pd.DataFrame(train_neg)
    # Fail fast when either class is empty (slength/nclass below would be 0).
    if (train_pos.shape[0] == 0):
        print 'ERROR: size of positive sites is 0. Please check positive sites in training data!\n'
        exit()

    if (train_neg.shape[0] == 0):
        print 'ERROR: size of negative sites is 0. Please check negative sites in training data!\n'
        exit()

    train_pos_s = train_pos.sample(train_pos.shape[0])
    #shuffle train pos
    train_neg_s = train_neg.sample(train_neg.shape[0])
    #shuffle train neg
    # slength: positives per round; nclass: rounds to cover all negatives.
    slength = int(train_pos.shape[0] * srate)
    nclass = int(train_neg.shape[0] / slength)
    if (valfile is not None):
        # Use all supplied data as validation.
        valX = valfile.as_matrix()
        val_pos = valX[np.where(valX[:, 0] == 1)]
        val_neg = valX[np.where(valX[:, 0] != 1)]
        val_pos = pd.DataFrame(val_pos)
        val_neg = pd.DataFrame(val_neg)
        val_all = pd.concat([val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)
    else:
        # Hold out 10% of the training positives/negatives for validation.
        a = int(train_pos.shape[0] * 0.9)
        b = train_neg.shape[0] - int(train_pos.shape[0] * 0.1)
        train_pos_s = train_pos[0:a]
        train_neg_s = train_neg[0:b]

        val_pos = train_pos[(a + 1):]
        val_neg = train_neg[b + 1:]

        val_all = pd.concat([val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)
        # 10% moved to validation, so recompute slength/nclass.
        slength = int(train_pos_s.shape[0] * srate)
        nclass = int(train_neg_s.shape[0] / slength)

    if (maxneg is not None):
        nclass = min(maxneg, nclass)
        #cannot do more than maxneg times

    for I in range(nb_epoch1):
        train_neg_s = train_neg_s.sample(train_neg_s.shape[0])
        #shuffle neg sample
        train_pos_ss = train_pos_s.sample(slength)
        for t in range(nclass):
            # Round t trains on the t-th negative slice plus the sampled positives.
            train_neg_ss = train_neg_s[(slength * t):(slength * t + slength)]
            train_all = pd.concat([train_pos_ss, train_neg_ss])
            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(),
                                              codingMode=codingMode)
            if t == 0:
                models = MultiCNN(trainX1,
                                  trainY1,
                                  valX1,
                                  valY1,
                                  nb_epoch=nb_epoch2,
                                  earlystop=earlystop,
                                  transferlayer=transferlayer,
                                  weights=inputweights,
                                  forkinas=forkinas,
                                  compiletimes=t)
            else:
                # Subsequent rounds reuse the model compiled in round 0.
                models = MultiCNN(trainX1,
                                  trainY1,
                                  valX1,
                                  valY1,
                                  nb_epoch=nb_epoch2,
                                  earlystop=earlystop,
                                  transferlayer=transferlayer,
                                  weights=inputweights,
                                  forkinas=forkinas,
                                  compiletimes=t,
                                  compilemodels=models)

            print "modelweights assigned for " + str(t) + " bootstrap.\n"
            if (outputweights is not None):
                models.save_weights(outputweights, overwrite=True)

    return models