def whole_revc_kmer_psednc(pos_file, neg_file, k):
    """Generate revc_kmer and psednc feature into a file combined positive and negative file."""
    # Reverse-complement k-mer part (normalized, all orders up to k).
    extractor = RevcKmer(k=k, normalize=True, upto=True)
    with open(pos_file) as handle:
        kmer_pos = np.array(extractor.make_revckmer_vec(handle))
    with open(neg_file) as handle:
        kmer_neg = np.array(extractor.make_revckmer_vec(handle))

    # PseDNC part: only the last `lamada` pseudo components of each vector
    # are appended to the k-mer features.
    lamada = 6
    w = 0.8
    pse = PseDNC(lamada, w)
    with open(pos_file) as handle:
        pse_pos = np.array(pse.make_psednc_vec(handle))
    with open(neg_file) as handle:
        pse_neg = np.array(pse.make_psednc_vec(handle))

    pos_vecs = np.column_stack((kmer_pos, pse_pos[:, -lamada:]))
    neg_vecs = np.column_stack((kmer_neg, pse_neg[:, -lamada:]))

    # Positives first, then negatives; labels follow the same order.
    vecs = pos_vecs.tolist() + neg_vecs.tolist()
    labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)
    write_libsvm(vecs, labels, "data/whole_revc_kmer_psednc.txt")
def whole_revc_kmer_psednc_choose_args(pos_file, neg_file, k):
    """Generate revc_kmer and psednc feature files over a grid of PseDNC arguments.

    The RevcKmer part is computed once and reused; for every (lamada, w)
    combination the PseDNC tail is recomputed and one libsvm file is
    written, named after the combination.
    """
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    with open(pos_file) as fp:
        revc_kmer_pos_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
    with open(neg_file) as fp:
        revc_kmer_neg_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
    for lamada in range(1, 2):
        # BUG FIX: the original accumulated `w += 0.1` in floating point,
        # which produced values like 0.30000000000000004 (garbled file
        # names) and an unintended extra iteration at w ~= 1.0.  Derive w
        # from an integer counter instead: w = 0.1, 0.2, ..., 0.9.
        for tenths in range(1, 10):
            w = tenths / 10.0
            psednc = PseDNC(lamada, w)
            with open(pos_file) as fp:
                psednc_pos_vecs = np.array(psednc.make_psednc_vec(fp))
            with open(neg_file) as fp:
                psednc_neg_vecs = np.array(psednc.make_psednc_vec(fp))
            # Keep only the lamada pseudo components of the PseDNC vectors.
            pos_vecs = np.column_stack(
                (revc_kmer_pos_vecs, psednc_pos_vecs[:, -lamada:]))
            neg_vecs = np.column_stack(
                (revc_kmer_neg_vecs, psednc_neg_vecs[:, -lamada:]))
            vecs = pos_vecs.tolist() + neg_vecs.tolist()
            labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)
            # Write file.
            lamada_w = str(lamada) + '_' + str(w)
            write_file = "data/whole_revc_kmer_psednc_" + lamada_w + ".txt"
            print(write_file)
            write_libsvm(vecs, labels, write_file)
def cv5_psednc(fold_path, filename):
    """Contrast experiment using PseDNC alone (5-fold CV files).

    Reproduces the feature of the article "Prediction of DNase I
    Hypersensitive Sites by Using Pseudo Nucleotide Compositions" and
    writes one libsvm test and train file per fold.
    """
    lamada = 6
    w = 0.2
    pse = PseDNC(lamada, w)
    tag = "_".join([str(lamada), str(w)])
    for fold in range(5):
        # Generate PseDNC vecs for the four fold files.
        vecs = {}
        for part in ("test_neg", "test_pos", "train_neg", "train_pos"):
            with open(fold_path + part + "_" + str(fold)) as handle:
                vecs[part] = pse.make_psednc_vec(handle)
        # Write test file: positives first, labelled +1 / -1.
        test_file = fold_path + filename + "_" + tag + "_test_" + str(fold) + ".txt"
        test_vecs = vecs["test_pos"] + vecs["test_neg"]
        test_labels = [1] * len(vecs["test_pos"]) + [-1] * len(vecs["test_neg"])
        write_libsvm(test_vecs, test_labels, test_file)
        # Write train file.
        train_file = fold_path + filename + "_" + tag + "_train_" + str(fold) + ".txt"
        train_vecs = vecs["train_pos"] + vecs["train_neg"]
        train_labels = [1] * len(vecs["train_pos"]) + [-1] * len(vecs["train_neg"])
        write_libsvm(train_vecs, train_labels, train_file)
def borderline_smote_revc_psednc(fold_path):
    """Oversample the positive class with borderline-SMOTE on combined RevcKmer + PseDNC features and write one libsvm file."""
    kmer = RevcKmer(k=2, normalize=True, upto=True)
    with open("data/hs.fasta") as handle:
        kmer_pos = np.array(kmer.make_revckmer_vec(handle))
    with open("data/non-hs.fasta") as handle:
        kmer_neg = np.array(kmer.make_revckmer_vec(handle))
    lamada = 6
    w = 0.8
    pse = PseDNC(lamada, w)
    with open("data/hs.fasta") as handle:
        pse_pos = np.array(pse.make_psednc_vec(handle))
    with open("data/non-hs.fasta") as handle:
        pse_neg = np.array(pse.make_psednc_vec(handle))
    # Append the lamada pseudo components to the k-mer features.
    pos_vecs = np.column_stack((kmer_pos, pse_pos[:, -lamada:]))
    neg_vecs = np.column_stack((kmer_neg, pse_neg[:, -lamada:]))
    all_vecs = np.row_stack((pos_vecs, neg_vecs))
    all_labels = [1] * len(pos_vecs) + [-1] * len(neg_vecs)
    # Only the synthetic positive samples are kept from the SMOTE result.
    _1, synthetic, _2 = (smote.borderline_smote(all_vecs, all_labels, 1, N=300, k=5))
    pos_list = pos_vecs.tolist() + synthetic.tolist()
    vecs = pos_list + neg_vecs.tolist()
    labels = [1] * len(pos_list) + [-1] * len(neg_vecs)
    lamada_n = "_".join([str(lamada), str(w)])
    write_file = "/".join([fold_path, lamada_n])
    print(write_file)
    write_libsvm(vecs, labels, write_file)
def psednc_tool(lamada, w, pos_file, neg_file, write_file):
    """Compute PseDNC vectors for a positive and a negative file and write them as one labelled libsvm file."""
    pse = PseDNC(lamada=lamada, w=w)
    with open(pos_file) as handle:
        positives = pse.make_psednc_vec(handle)
    with open(neg_file) as handle:
        negatives = pse.make_psednc_vec(handle)
    # Positives first (+1), then negatives (-1).
    write_libsvm(positives + negatives,
                 [1] * len(positives) + [-1] * len(negatives),
                 write_file)
def _revc_psednc_vecs(path, revc_kmer, psednc, lamada):
    """Read one fold file twice and combine RevcKmer features with the lamada pseudo components of the PseDNC vectors."""
    with open(path) as fp:
        revc_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
    with open(path) as fp:
        psednc_vecs = np.array(psednc.make_psednc_vec(fp))
    return np.column_stack((revc_vecs, psednc_vecs[:, -lamada:]))


def cv5_smote_revc_psednc(fold_path, filename, k):
    """Write 5-fold CV libsvm files of RevcKmer+PseDNC features, with SMOTE oversampling of the training positives.

    For each fold the training positives are augmented with two rounds of
    SMOTE synthetic samples (N=100 and N=50, k=5).  Test files are written
    unmodified.  Interface and output files are unchanged from the
    original; the four copy-pasted load blocks are factored into
    ``_revc_psednc_vecs``.
    """
    # Generate pos and neg vecs and SMOTE synthetic vecs.
    lamada = 6
    w = 0.8
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    psednc = PseDNC(lamada, w)
    for i in range(5):
        # Generate RevcKmer_PseDNC vecs for the four fold files.
        test_neg = _revc_psednc_vecs(fold_path + "test_neg_" + str(i),
                                     revc_kmer, psednc, lamada)
        test_pos = _revc_psednc_vecs(fold_path + "test_pos_" + str(i),
                                     revc_kmer, psednc, lamada)
        train_neg = _revc_psednc_vecs(fold_path + "train_neg_" + str(i),
                                      revc_kmer, psednc, lamada)
        train_pos = _revc_psednc_vecs(fold_path + "train_pos_" + str(i),
                                      revc_kmer, psednc, lamada)
        # Generate synthetic vecs from the training positives.
        synthetic1 = (smote.smote(train_pos, N=100, k=5)).tolist()
        synthetic2 = (smote.smote(train_pos, N=50, k=5)).tolist()
        synthetic = np.row_stack((synthetic1, synthetic2))
        n_lamada = "_".join([str(lamada), str(w)])
        # Write test file.
        write_file = fold_path + filename + '_' + n_lamada + "_test_" + str(i) + ".txt"
        test_vecs = test_pos.tolist() + test_neg.tolist()
        test_vecs_labels = [1] * len(test_pos) + [-1] * len(test_neg)
        write_libsvm(test_vecs, test_vecs_labels, write_file)
        # Write train file (original positives + synthetic ones).
        write_file = fold_path + filename + '_' + n_lamada + "_train_" + str(i) + ".txt"
        train_pos_vecs = train_pos.tolist() + synthetic.tolist()
        train_vecs = train_pos_vecs + train_neg.tolist()
        train_vecs_labels = [1] * len(train_pos_vecs) + [-1] * len(train_neg)
        write_libsvm(train_vecs, train_vecs_labels, write_file)
def PseudoDinucleotideComposition(gene2seq):
    """Compute the PseDNC feature vector for every gene sequence.

    Parameters
    ----------
    gene2seq : dict
        Maps gene identifier -> nucleotide sequence string.

    Returns
    -------
    dict
        Maps gene identifier -> PseDNC result (as returned by repDNA's
        ``make_psednc_vec`` for a one-sequence list).  Also dumped to
        'PseDNC-features.json' as a side effect.
    """
    X = dict()
    succeed_cnt = 0
    # Let's use the library's default lamada/w values.
    psednc = PseDNC()
    for gene, seq in gene2seq.items():
        try:
            # make_psednc_vec expects a list of sequences.
            X[gene] = psednc.make_psednc_vec([seq])
            succeed_cnt += 1
            # Progress heartbeat every 100 successful genes.
            if succeed_cnt % 100 == 0:
                print(succeed_cnt)
        except Exception:
            # Best-effort: deliberately skip sequences the extractor
            # rejects (e.g. presumably non-ACGT characters) rather than
            # aborting the whole run.
            continue
    print('PseDNC, succeed for %d gene' % (succeed_cnt))
    with open('PseDNC-features.json', 'w') as output_f:
        json.dump(X, output_f)
    return X
def cv5_psednc_tool(lamada, w, test_neg_file, test_pos_file, train_neg_file, train_pos_file, test_write_file, train_write_file):
    """Compute PseDNC vectors for one CV fold and write labelled libsvm train/test files."""
    pse = PseDNC(lamada=lamada, w=w)

    def vectors(path):
        # Each fold file is read once and converted to PseDNC vectors.
        with open(path) as handle:
            return pse.make_psednc_vec(handle)

    test_neg_vecs = vectors(test_neg_file)
    test_pos_vecs = vectors(test_pos_file)
    train_pos_vecs = vectors(train_pos_file)
    train_neg_vecs = vectors(train_neg_file)
    # Positives first (+1), then negatives (-1), for both splits.
    write_libsvm(train_pos_vecs + train_neg_vecs,
                 [1] * len(train_pos_vecs) + [-1] * len(train_neg_vecs),
                 train_write_file)
    write_libsvm(test_pos_vecs + test_neg_vecs,
                 [1] * len(test_pos_vecs) + [-1] * len(test_neg_vecs),
                 test_write_file)
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20 (replaced by sklearn.model_selection); scipy's `interp`
# is likewise a deprecated alias of numpy.interp.  This script targets old
# library versions -- confirm before upgrading.
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt

if __name__ == '__main__':
    begin_time = time.time()
    print('Example1 Start.(This process may use several minutes, please do not close the program.)')
    # ##############################################################################
    # Data IO and generation.
    # Generate the PseDNC feature vector for hotspot (positive) and coldspot
    # (negative) sequences.
    psednc = PseDNC(lamada=3, w=0.05)
    pos_vec = psednc.make_psednc_vec(open('hotspots.fasta'))
    neg_vec = psednc.make_psednc_vec(open('coldspots.fasta'))
    print(len(pos_vec))
    print(len(neg_vec))
    # Merge positive and negative feature vectors and generate their
    # corresponding labels (1 = hotspot, 0 = coldspot).
    vec = np.array(pos_vec + neg_vec)
    vec_label = np.array([1] * len(pos_vec) + [0] * len(neg_vec))
    # ##############################################################################
    # Classification and accuracy analysis.
    # Evaluate the predictor by 5-fold cross-validation and plot the mean ROC
    # curve.  NOTE(review): the CV/plot code presumably follows this chunk --
    # clf is created here but not used within the visible lines.
    clf = svm.SVC(C=32, gamma=0.5)
def main():
    """Train an SVM on combined sequence features and evaluate it on a held-out test set.

    Features per sequence: PseDNC (lamada=3) and PseKNC (k=2) vectors read
    from pos/neg train and test files, plus ssc/kmer/ksnpf/binary_code
    features computed per line of ./data_new.txt (train) and
    ./data_test.txt (test).  Prints accuracy, AUC, sensitivity,
    specificity, F1 and MCC.

    NOTE(review): this function uses the Python 2 print statement and
    passes map() results to sklearn metrics -- it targets Python 2.
    """
    # PseKNC vectors: train/test positives and negatives.
    pseknc = PseKNC(k=2, lamada=1, w=0.05)
    pos_vec3train = pseknc.make_pseknc_vec(open('postrain.txt'))
    neg_vec3train = pseknc.make_pseknc_vec(open('negtrain.txt'))
    pos_vec3test = pseknc.make_pseknc_vec(open('postest.txt'))
    neg_vec3test = pseknc.make_pseknc_vec(open('negtest.txt'))
    fea_vec3train = []
    fea_vec3test = []
    # Positives first, then negatives -- the i<596 / i<149 label rules
    # below presumably rely on the same ordering in data_new/data_test.
    fea_vec3train.extend(pos_vec3train + neg_vec3train)
    fea_vec3test.extend(pos_vec3test + neg_vec3test)
    # PseDNC vectors, same ordering.
    psednc = PseDNC(lamada=3, w=0.05)
    pos_vec1train = psednc.make_psednc_vec(open('postrain.txt'))
    neg_vec1train = psednc.make_psednc_vec(open('negtrain.txt'))
    pos_vec1test = psednc.make_psednc_vec(open('postest.txt'))
    neg_vec1test = psednc.make_psednc_vec(open('negtest.txt'))
    fea_vec1train = []
    fea_vec1test = []
    fea_vec1train.extend(pos_vec1train + neg_vec1train)
    fea_vec1test.extend(pos_vec1test + neg_vec1test)
    feature_matrix = []
    label_vector = []
    train_samples = open('./data_new.txt', 'r')
    i = 0
    for line in train_samples:
        feature_vector = []
        # NOTE(review): feature_importance is re-read from disk on every
        # loop iteration; it could be loaded once before the loop.
        with open('./feature_importance.txt', 'r') as f:
            feature_importance = f.read().splitlines()
        # First 596 lines are treated as positives -- verify this matches
        # the layout of data_new.txt.
        if i < 596:
            label_vector.append(1)
        else:
            label_vector.append(0)
        sequence = line
        feature_vector.extend(fea_vec1train[i] + fea_vec3train[i] + ssc(sequence))
        sequence = line.replace('\n', '')
        feature_vector.extend(
            kmer(sequence) + ksnpf(sequence) + binary_code(sequence))
        # Select the 390 most important feature positions ...
        feature = []
        for m in range(0, 390):
            t = feature_importance[m]
            feature.append(feature_vector[int(t)])
        # NOTE(review): ... but the FULL vector is appended, not `feature`;
        # the selection above is dead code.  Confirm which was intended.
        feature_matrix.append(feature_vector)
        i = i + 1
    train_samples.close()
    feature_array = np.array(feature_matrix, dtype=np.float32)
    # Scale train features to [-1, 1]; the same fitted scaler is reused
    # for the test set below.
    min_max_scaler = preprocessing.MinMaxScaler(copy=True, feature_range=(-1, 1))
    feature_scaled = min_max_scaler.fit_transform(feature_array)
    X = feature_scaled
    y = label_vector
    clf = SVC(C=1.11, gamma=0.003, probability=True)
    clf.fit(X, y)
    feature_matrix = []
    test_label_vector = []
    test_samples = open('./data_test.txt', 'r')
    i = 0
    for line in test_samples:
        feature_vector = []
        with open('./feature_importance.txt', 'r') as f:
            feature_importance = f.read().splitlines()
        # First 149 test lines are treated as positives.
        if i < 149:
            test_label_vector.append(1)
        else:
            test_label_vector.append(0)
        sequence = line
        feature_vector.extend(fea_vec1test[i] + fea_vec3test[i] + ssc(sequence))
        sequence = line.replace('\n', '')
        feature_vector.extend(
            kmer(sequence) + ksnpf(sequence) + binary_code(sequence))
        feature = []
        for m in range(0, 390):
            t = feature_importance[m]
            feature.append(feature_vector[int(t)])
        # NOTE(review): same dead feature selection as in the train loop.
        feature_matrix.append(feature_vector)
        i = i + 1
    test_samples.close()
    test_feature_array = np.array(feature_matrix, dtype=np.float32)
    X_test = min_max_scaler.transform(test_feature_array)
    y_test = test_label_vector
    print clf.score(X_test, y_test)
    predict_y_test = clf.predict(X_test)
    # Confusion-matrix counts.
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(0, len(y_test)):
        if int(y_test[i]) == 1 and int(predict_y_test[i]) == 1:
            TP = TP + 1
        elif int(y_test[i]) == 1 and int(predict_y_test[i]) == 0:
            FN = FN + 1
        elif int(y_test[i]) == 0 and int(predict_y_test[i]) == 0:
            TN = TN + 1
        elif int(y_test[i]) == 0 and int(predict_y_test[i]) == 1:
            FP = FP + 1
    Sn = float(TP) / (TP + FN)
    Sp = float(TN) / (TN + FP)
    ACC = float((TP + TN)) / (TP + TN + FP + FN)
    # ROC/AUC computed from the positive-class probabilities.
    prob_predict_y_test = clf.predict_proba(X_test)
    predictions_test = prob_predict_y_test[:, 1]
    #######generate combined negative scores
    #combined_prob=predictions_test
    y_validation = np.array(y_test, dtype=int)
    fpr, tpr, thresholds = metrics.roc_curve(y_validation, predictions_test, pos_label=1)
    roc_auc = auc(fpr, tpr)
    #print('AdaBoostClassifier AUC:%s'%roc_auc)
    F1 = metrics.f1_score(y_validation, map(int, predict_y_test))
    MCC = metrics.matthews_corrcoef(y_validation, map(int, predict_y_test))
    print('SVM Accuracy:%s' % ACC)
    print('SVM AUC:%s' % roc_auc)
    print('SVM Sensitive:%s' % Sn)
    print('SVM Specificity:%s' % Sp)
    print('SVM F1:%s' % F1)
    print('SVM MCC:%s' % MCC)
return normData ########################################################################################### if __name__ == '__main__': featurename = 'Psednc' # getting psednc feature print( '...............................................................................' ) print('Coding for ' + featurename + ' feature, beginning') tic = time.clock() psednc = PseDNC(lamada=1, w=0.05) pos_vec = psednc.make_psednc_vec(open('strong enhancers4.fasta')) neg_vec = psednc.make_psednc_vec(open('weak enhancers4.fasta')) Z = array(pos_vec + neg_vec) X = noramlization(Z) y = array([1] * len(pos_vec) + [0] * len(neg_vec)) print('The number of positive and negative samples: %d,%d' % (len(pos_vec), len(neg_vec))) print('Dimension of ' + featurename + ' feature vectors: %d' % len(X[0])) toc = time.clock() print("Coding time: %.3f minutes" % ((toc - tic) / 60.0)) print( '...............................................................................' )
def _splice_window(seq, pre_start, pre_end, post_start, post_end):
    """Reduce a sequence to the upstream window + the two bases at 300:302 + the downstream window."""
    return (seq[pre_start:pre_end + 1] + seq[300:301 + 1] +
            seq[post_start:post_end + 1])


def _read_seqs(a, b, start, samples_per_file):
    """Read sequences from ../data/{a}_{b}.fa and slice the requested sample range."""
    file_name = "../data/{}_{}.fa".format(a, b)
    print("Processing", file_name)
    return util.get_data(open(file_name))[start:start + samples_per_file]


def _save_dataset(x_dataset, tag, pretty_name, save_file_name, start, samples_per_file):
    """Save one feature matrix as ../data/x_<tag>_..._samples.npy and report its location and shape."""
    x_filename = "../data/x_" + tag + "_" + save_file_name + (
        "_" + str(start) + "_start" if start != 0 else "") + "_" + str(
            samples_per_file) + "_samples.npy"
    # save dataset in numpy readable files
    np.save(file=x_filename, arr=x_dataset)
    print("Finished {} data.".format(pretty_name))
    print("Shape:", x_dataset.shape)
    print("Data saved in {}.".format(x_filename))


def _parallel_dataset(make_vec, b, cpu_count, start, samples_per_file,
                      pre_start, pre_end, post_start, post_end, **vec_kwargs):
    """Apply make_vec to the splice window of every sequence (negatives first, then positives) in parallel."""
    x_dataset = []
    for a in ["negative", "positive"]:
        seqs = _read_seqs(a, b, start, samples_per_file)
        x_dataset.extend(
            Parallel(n_jobs=cpu_count)(
                delayed(make_vec)(
                    [_splice_window(seq, pre_start, pre_end, post_start,
                                    post_end)], **vec_kwargs)
                for seq in seqs))
    # BUG FIX: np.float was only a deprecated alias of the builtin float and
    # was removed in NumPy 1.24; using float directly is identical in dtype.
    return np.array(x_dataset, dtype=float)


def prepare_data_with_repDNA(include_acceptor=False, include_donor=False,
                             save_file_name="dataset", samples_per_file=20000,
                             start=0, pre_start=0, pre_end=299,
                             post_start=302, post_end=601, include_kmer=False,
                             include_DAC=False, include_DCC=False,
                             include_TAC=False, include_TCC=False,
                             include_PseDNC=False, include_PseKNC=False,
                             include_PC_PseDNC=False, include_PC_PseTNC=False,
                             include_SC_PseDNC=False, include_SC_PseTNC=False):
    """Compute the selected repDNA feature sets for acceptor/donor data and save each one as a .npy file.

    For every selected mode ("acceptor"/"donor") and enabled feature flag,
    sequences are read from ../data/{negative,positive}_{mode}.fa, reduced
    to the configured splice window (except Kmer, which uses the full
    sequence), transformed in parallel, and stored under
    ../data/x_<feature>_<save_file_name>_..._samples.npy.  Negative samples
    always precede positive ones in the saved matrix.

    Interface, file names, and printed output are unchanged from the
    original; the eleven copy-pasted feature sections are factored into
    helpers.
    """
    print("Reading data ...")
    # Leave roughly a third of the cores free for the rest of the system.
    cpu_count = int(mp.cpu_count() * 2 / 3)

    # Prepare selected modes
    mode_list = []
    if include_acceptor:
        mode_list.append("acceptor")
    if include_donor:
        mode_list.append("donor")

    # (enabled, extractor factory, file tag, display name, extra kwargs)
    # for every parallel feature.  Factories delay object construction
    # until the feature is actually requested, as the original code did.
    feature_specs = [
        (include_DAC, lambda: DAC(2).make_dac_vec, "dac", "DAC",
         {"all_property": True}),
        (include_DCC, lambda: DCC(1).make_dcc_vec, "dcc", "DCC",
         {"all_property": True}),
        (include_TAC, lambda: TAC(3).make_tac_vec, "tac", "TAC",
         {"all_property": True}),
        (include_TCC, lambda: TCC(2).make_tcc_vec, "tcc", "TCC",
         {"all_property": True}),
        (include_PseDNC, lambda: PseDNC(2).make_psednc_vec, "pseDNC",
         "PseDNC", {}),
        (include_PseKNC, lambda: PseKNC(k=2, lamada=1, w=0.05).make_pseknc_vec,
         "pseKNC", "pseKNC", {}),
        (include_PC_PseDNC, lambda: PCPseDNC(lamada=2, w=0.05).make_pcpsednc_vec,
         "PC_PseDNC", "PC-PseDNC", {"all_property": True}),
        (include_PC_PseTNC, lambda: PCPseTNC(lamada=2, w=0.05).make_pcpsetnc_vec,
         "PC_PseTNC", "PC-PseTNC", {"all_property": True}),
        (include_SC_PseDNC, lambda: SCPseDNC(lamada=2, w=0.05).make_scpsednc_vec,
         "SC_PseDNC", "SC-PseDNC", {"all_property": True}),
        (include_SC_PseTNC, lambda: SCPseTNC(lamada=2, w=0.05).make_scpsetnc_vec,
         "SC_PseTNC", "SC-PseTNC", {"all_property": True}),
    ]

    # Read data and perform transformation for each selected mode.
    for b in mode_list:
        if include_kmer:
            # Kmer counts are computed serially on the FULL sequences
            # (no splice-window reduction), matching the original code.
            kmer = Kmer(k=2, upto=True, normalize=True)
            x_dataset = []
            for a in ["negative", "positive"]:
                seqs = _read_seqs(a, b, start, samples_per_file)
                x_dataset.extend(kmer.make_kmer_vec(seqs))
            _save_dataset(np.array(x_dataset, dtype=float), "kmer", "Kmer",
                          save_file_name, start, samples_per_file)

        for enabled, factory, tag, pretty_name, vec_kwargs in feature_specs:
            if not enabled:
                continue
            x_dataset = _parallel_dataset(factory(), b, cpu_count, start,
                                          samples_per_file, pre_start,
                                          pre_end, post_start, post_end,
                                          **vec_kwargs)
            _save_dataset(x_dataset, tag, pretty_name, save_file_name,
                          start, samples_per_file)
return normData ########################################################################################### if __name__ == '__main__': featurename = 'Psednc' # getting psednc feature print( '...............................................................................' ) print('Coding for ' + featurename + ' feature, beginning') tic = time.clock() psednc = PseDNC(lamada=1, w=0.05) pos_vec = psednc.make_psednc_vec(open('enhancers4.fasta')) neg_vec = psednc.make_psednc_vec(open('non-enhancers4.fasta')) Z = array(pos_vec + neg_vec) X = noramlization(Z) y = array([1] * len(pos_vec) + [0] * len(neg_vec)) print('The number of positive and negative samples: %d,%d' % (len(pos_vec), len(neg_vec))) print('Dimension of ' + featurename + ' feature vectors: %d' % len(X[0])) toc = time.clock() print("Coding time: %.3f minutes" % ((toc - tic) / 60.0)) print( '...............................................................................' )
###########################################################################################
if __name__ == '__main__':
    # Build the PseDNC feature matrix from positive/negative sample FASTA files.
    featurename = 'Psednc'
    banner = '...............................................................................'
    print(banner)
    print('Coding for ' + featurename + ' feature, beginning')
    # NOTE(review): time.clock() was removed in Python 3.8; this script
    # presumably targets an older interpreter -- confirm before upgrading.
    tic = time.clock()
    extractor = PseDNC(lamada=1, w=0.05)
    pos_vec = extractor.make_psednc_vec(open('posi_samples.fasta'))
    neg_vec = extractor.make_psednc_vec(open('nega_samples.fasta'))
    # Positives are stacked first; labels follow the same order (1 / 0).
    X = array(pos_vec + neg_vec)
    y = array([1] * len(pos_vec) + [0] * len(neg_vec))
    print('The number of positive and negative samples: %d,%d' %
          (len(pos_vec), len(neg_vec)))
    print('Dimension of ' + featurename + ' feature vectors: %d' % len(X[0]))
    toc = time.clock()
    print("Coding time: %.3f minutes" % ((toc - tic) / 60.0))
    print(banner)
def main():
    """Train an SVM on PseDNC + ssc + binary_code features and report metrics on a 40% held-out split.

    Reads PseDNC (lamada=8, w=0.8) vectors from postrain/negtrain files and
    per-line ssc/binary_code features from ./data_new.txt, scales to
    [-1, 1], splits 60/40, trains an RBF SVM and prints accuracy, AUC,
    sensitivity, specificity, F1 and MCC.

    NOTE(review): passes map() results to sklearn metrics -- this relies on
    Python 2 semantics (map returning a list).
    """
    # PseDNC vectors: positives first, then negatives.
    psednc = PseDNC(lamada=8, w=0.8)
    pos_vec = psednc.make_psednc_vec(open('postrain.txt'))
    neg_vec = psednc.make_psednc_vec(open('negtrain.txt'))
    fea_vec = []
    fea_vec.extend(pos_vec + neg_vec)
    feature_matrix = []
    label_vector = []
    train_samples = open('./data_new.txt', 'r')
    i = 0
    for line in train_samples:
        feature_vector = []
        # First 596 lines are treated as positives -- presumably matches
        # the layout of data_new.txt; verify against the data file.
        if i < 596:
            label_vector.append(1)
        else:
            label_vector.append(0)
        sequence = line
        feature_vector.extend(ssc(sequence) + fea_vec[i])
        sequence = line.replace('\n', '')
        feature_vector.extend(binary_code(sequence))
        feature_matrix.append(feature_vector)
        i = i + 1
    train_samples.close()
    feature_array = np.array(feature_matrix, dtype=np.float32)
    # Scale all features to [-1, 1] before the train/test split.
    min_max_scaler = preprocessing.MinMaxScaler(copy=True, feature_range=(-1, 1))
    feature_scaled = min_max_scaler.fit_transform(feature_array)
    X = feature_scaled
    y = label_vector
    # 60/40 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    clf = SVC(C=0.98, gamma=0.001, probability=True)
    clf.fit(X_train, y_train)
    predict_y_test = clf.predict(X_test)
    # Confusion-matrix counts.
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(0, len(y_test)):
        if int(y_test[i]) == 1 and int(predict_y_test[i]) == 1:
            TP = TP + 1
        elif int(y_test[i]) == 1 and int(predict_y_test[i]) == 0:
            FN = FN + 1
        elif int(y_test[i]) == 0 and int(predict_y_test[i]) == 0:
            TN = TN + 1
        elif int(y_test[i]) == 0 and int(predict_y_test[i]) == 1:
            FP = FP + 1
    Sn = float(TP) / (TP + FN)
    Sp = float(TN) / (TN + FP)
    ACC = float((TP + TN)) / (TP + TN + FP + FN)
    # ROC/AUC computed from positive-class probabilities.
    prob_predict_y_test = clf.predict_proba(X_test)
    predictions_test = prob_predict_y_test[:, 1]
    #######generate combined negative scores
    #combined_prob=predictions_test
    y_validation = np.array(y_test, dtype=int)
    fpr, tpr, thresholds = metrics.roc_curve(y_validation, predictions_test, pos_label=1)
    roc_auc = auc(fpr, tpr)
    #print('AdaBoostClassifier AUC:%s'%roc_auc)
    F1 = metrics.f1_score(y_validation, map(int, predict_y_test))
    MCC = metrics.matthews_corrcoef(y_validation, map(int, predict_y_test))
    print('SVM Accuracy:%s' % ACC)
    print('SVM AUC:%s' % roc_auc)
    print('SVM Sensitive:%s' % Sn)
    print('SVM Specificity:%s' % Sp)
    print('SVM F1:%s' % F1)
    print('SVM MCC:%s' % MCC)