def kameris(dataset, train, k, dimention_reduction):
    """Train a linear-SVM classifier on CGR k-mer frequency vectors.

    Parameters
    ----------
    dataset : str
        Dataset identifier; currently unused in the active code path
        (kept for interface compatibility — the removed commented-out
        scaler-persistence code derived a filename from it).
    train : iterable
        Samples shaped like [id, sequence, label]: sample[1] is the raw
        sequence, sample[2] the class label.
    k : int
        k-mer length; the CGR feature space has 4**k dimensions.
    dimention_reduction : int
        When 1, attempt dimensionality reduction via truncated SVD.

    Returns
    -------
    tuple
        (clf, number_features): the fitted SVC and the number of
        features it was trained on.
    """
    # compute k-mer frequencies via chaos game representation
    X_train = []
    y_train = []
    for sample in train:
        y_train.append(sample[2])
        seq = sample[1]
        X_train.append(kam.cgr(seq, k))
    # np.asarray instead of the deprecated np.matrix (same 2-D shape,
    # same behavior for count_nonzero/len/SVD/SVC below)
    X_train = np.asarray(X_train)

    # dimension reduction: target ~10% of the average count of
    # non-zero features per sample
    number_features = int((np.count_nonzero(X_train > 0) * 0.1) / len(X_train))
    if (dimention_reduction == 1 and pow(4, k) > number_features
            and number_features > 4):
        # SVD
        svd = TruncatedSVD(n_components=number_features)
        rows, cols = X_train.shape
        svd.fit(X_train)
        X_train = svd.transform(X_train)
        print("SVD aplied ... X_train.shape: ", [rows, cols], X_train.shape)
    else:
        # no reduction applied: report the full 4**k feature space
        number_features = pow(4, k)

    # train
    clf = SVC(kernel="linear", C=1)
    clf.fit(X_train, y_train)
    return clf, number_features
times_cnn_acc = 0 # para castror solo train = [] for i in range(sequences.shape[0]): train.append([0, sequences[i], 0]) k_mers_castor = fe.generate_K_mers(train, k=5) # castor time t_0 = time.time() X_train, y_train = fe.generateXYMatrice(train, k_mers_castor, k=5) times_castor_acc += time.time() - t_0 for i in range(sequences.shape[0]): # kameris time t_0 = time.time() k_mers_frecuencies = kam.cgr(sequences[i], k=5) times_kameris_acc += time.time() - t_0 # mldsp time t_0 = time.time() ns_new, fourier_transform, magnitud_spectra = descriptor(sequences[i], min_seq_len) times_mldsp_acc += time.time() - t_0 # cnn t_0 = time.time() chaos = chaos_game_representation(probabilities(str(sequences[i]), count_kmers(str(sequences[i]), 5), 5), 5) times_cnn_acc += time.time() - t_0 time_kameris = times_kameris_acc/sequences.shape[0] time_castor = times_castor_acc/sequences.shape[0] time_mldsp = times_mldsp_acc/sequences.shape[0]
def kameris(train, test, k, dimention_reduction):
    """Train a linear SVM on CGR k-mer frequencies and evaluate on test.

    Parameters
    ----------
    train, test : iterable
        Samples shaped like [id, sequence, label]: sample[1] is the raw
        sequence, sample[2] the class label.
    k : int
        k-mer length; the CGR feature space has 4**k dimensions.
    dimention_reduction : int
        When 1, attempt dimensionality reduction via truncated SVD.

    Returns
    -------
    tuple
        (accuracy, precision, recall, fscore, number_features) with
        precision/recall/fscore weighted-averaged over classes.
    """
    def _cgr_matrix(samples):
        # Build (features, labels) from [id, sequence, label] records.
        feats, labels = [], []
        for sample in samples:
            labels.append(sample[2])
            feats.append(kam.cgr(sample[1], k))
        # np.asarray instead of the deprecated np.matrix
        return np.asarray(feats), labels

    # compute k-mer frequencies for the training split
    X_train, y_train = _cgr_matrix(train)

    # scaling: standardize per feature; the same fitted scaler is
    # reused on the test split below (no refit — avoids leakage)
    scaler1 = StandardScaler()
    scaler1.fit(X_train)
    X_train = scaler1.transform(X_train)

    # dimension reduction: target ~10% of the average count of
    # non-zero features per sample
    number_features = int((np.count_nonzero(X_train > 0) * 0.1) / len(X_train))
    if (dimention_reduction == 1 and pow(4, k) > number_features
            and number_features > 4):
        # SVD
        svd = TruncatedSVD(n_components=number_features)
        rows, cols = X_train.shape
        svd.fit(X_train)
        X_train = svd.transform(X_train)
        print("SVD aplied ... X_train.shape: ", [rows, cols], X_train.shape)

    # train
    clf = SVC(kernel="linear", C=1)
    clf.fit(X_train, y_train)

    # compute k-mer frequencies for the test split
    X_test, y_test = _cgr_matrix(test)

    # scaling with the scaler fitted on the training data
    X_test = scaler1.transform(X_test)

    # apply the same (already fitted) SVD projection to the test split
    if (dimention_reduction == 1 and pow(4, k) > number_features
            and number_features > 4):
        X_test = svd.transform(X_test)

    # predict and score
    y_pred = clf.predict(X_test)
    metrics = precision_recall_fscore_support(y_test, y_pred,
                                              average='weighted')
    acc = accuracy_score(y_test, y_pred)
    return acc, metrics[0], metrics[1], metrics[2], number_features
# Load a pre-trained kameris model matching the dataset family and
# classify every sequence from the given FASTA file.
if dataset[0:3] == "HIV":
    k = 5
    model = current_dir + "/models/kameris_" + dataset + "_dr=0_nf=1024_k=5.sav"
elif dataset[0:3] == "POL":
    k = 2
    model = current_dir + "/models/kameris_" + dataset + "_dr=0_nf=16_k=2.sav"
else:
    # previously fell through with k/model undefined -> NameError at
    # joblib.load; fail fast with a clear message instead
    raise ValueError("unsupported dataset (expected HIV* or POL*): "
                     + str(dataset))

clf = joblib.load(model)

# read the sequences; uppercase to match the training preprocessing
data = []
for record in SeqIO.parse(fasta, "fasta"):
    data.append([record.id, record.seq.upper()])

# CGR k-mer frequency vector per sequence, then predict
X_test = [kam.cgr(seq[1], k) for seq in data]
result = clf.predict(X_test)
print(result)
def kameris(X_train, y_train, X_test, y_test, k, dimention_reduction,
            database_name):
    """Train a linear SVM on CGR k-mer frequencies, persist it, evaluate.

    Parameters
    ----------
    X_train, X_test : iterable
        Raw sequences; each is converted to a CGR k-mer frequency vector.
    y_train, y_test : iterable
        Class labels aligned with the sequence lists.
    k : int
        k-mer length; the CGR feature space has 4**k dimensions.
    dimention_reduction : int
        When 1, attempt dimensionality reduction via truncated SVD.
    database_name : str
        Used to build the filename the fitted model is pickled to.

    Returns
    -------
    tuple
        (accuracy, precision, recall, fscore), the last three
        weighted-averaged over classes.
    """
    # compute k-mer frequencies via chaos game representation;
    # np.asarray instead of the deprecated np.matrix
    X_train = np.asarray([kam.cgr(seq, k) for seq in X_train])
    X_test = np.asarray([kam.cgr(seq, k) for seq in X_test])

    # scaling: fit on train only and reuse on test (no leakage)
    scaler1 = StandardScaler()
    scaler1.fit(X_train)
    X_train = scaler1.transform(X_train)

    # dimension reduction: target ~10% of the average count of
    # non-zero features per sample
    number_features = int((np.count_nonzero(X_train > 0) * 0.1) / len(X_train))
    if (dimention_reduction == 1 and pow(4, k) > number_features
            and number_features > 4):
        # SVD
        svd = TruncatedSVD(n_components=number_features)
        rows, cols = X_train.shape
        svd.fit(X_train)
        X_train = svd.transform(X_train)
        print("SVD aplied ... X_train.shape: ", [rows, cols], X_train.shape)

    # train
    clf = svm.SVC(kernel="linear", C=1)
    clf.fit(X_train, y_train)
    # with-block closes the file handle (the original leaked it by
    # passing a bare open() into pickle.dump)
    with open(current_dir + "/models/" + database_name + '-kameris.joblib',
              'wb') as model_file:
        pickle.dump(clf, model_file)

    # scaling with the scaler fitted on the training data
    X_test = scaler1.transform(X_test)

    # apply the same (already fitted) SVD projection to the test split
    if (dimention_reduction == 1 and pow(4, k) > number_features
            and number_features > 4):
        X_test = svd.transform(X_test)

    # predict and score
    y_pred = clf.predict(X_test)
    metrics = precision_recall_fscore_support(y_test, y_pred,
                                              average='weighted')
    acc = accuracy_score(y_test, y_pred)
    return acc, metrics[0], metrics[1], metrics[2]