Example #1
0
def kameris(dataset, train, k, dimention_reduction):
    """Train a linear SVM on CGR k-mer frequency features.

    Parameters:
        dataset: dataset identifier (unused in this variant; kept for
            interface compatibility with callers).
        train: iterable of samples shaped [id, sequence, label]; the raw
            sequence is read from sample[1] and the label from sample[2].
        k: k-mer length passed to kam.cgr.
        dimention_reduction: 1 enables TruncatedSVD reduction when the
            estimated component count is usable; any other value (or an
            unusable estimate) skips the reduction.

    Returns:
        (clf, number_features): the fitted SVC and the number of features
        it was actually trained on.
    """
    #############################################
    # compute k-mer frequencies
    X_train = []
    y_train = []
    for sample in train:
        y_train.append(sample[2])
        seq = sample[1]
        k_mers_frecuencies = kam.cgr(seq, k)
        X_train.append(k_mers_frecuencies)

    # np.matrix is deprecated; a plain 2-D ndarray behaves the same for
    # count_nonzero / len / sklearn estimators below.
    X_train = np.asarray(X_train)

    #############################################
    # dimension reduction: keep ~10% of the average number of non-zero
    # features per sample, but only when that estimate is meaningful
    # (strictly between 4 and 4^k).
    number_features = int((np.count_nonzero(X_train > 0) * 0.1) / len(X_train))
    if dimention_reduction == 1 and pow(
            4, k) > number_features and number_features > 4:
        #SVD
        svd = TruncatedSVD(n_components=number_features)
        rows, cols = X_train.shape
        svd.fit(X_train)
        X_train = svd.transform(X_train)
        print("SVD aplied ... X_train.shape: ", [rows, cols], X_train.shape)
    else:
        # No reduction applied: the model sees the full 4^k feature space.
        number_features = pow(4, k)

    #############################################
    # train
    clf = SVC(kernel="linear", C=1)
    clf.fit(X_train, y_train)

    return clf, number_features
Example #2
0
    # Per-sequence feature-extraction timing benchmark for four methods:
    # castor, kameris (CGR), MLDSP (FFT descriptor) and the CNN's chaos-game
    # representation. NOTE(review): times_kameris_acc, times_castor_acc and
    # times_mldsp_acc are assumed to be initialized to 0 above this excerpt,
    # and time_cnn is presumably computed below it — confirm against the
    # enclosing function.
    times_cnn_acc = 0
    # for castor only: wrap every raw sequence as a [id, sequence, label]
    # triple, since fe.generate_K_mers expects that sample shape
    train = []
    for i in range(sequences.shape[0]):
        train.append([0, sequences[i], 0])   
    k_mers_castor   = fe.generate_K_mers(train, k=5) 

    # castor time (measured once over the whole matrix, not per sequence)
    t_0 = time.time()
    X_train, y_train    = fe.generateXYMatrice(train, k_mers_castor, k=5) 
    times_castor_acc += time.time() - t_0

    for i in range(sequences.shape[0]):
        # kameris time: CGR k-mer frequency vector for one sequence
        t_0 = time.time()
        k_mers_frecuencies = kam.cgr(sequences[i], k=5) 
        times_kameris_acc += time.time() - t_0

        # mldsp time: numeric mapping + Fourier magnitude spectrum
        t_0 = time.time()
        ns_new, fourier_transform, magnitud_spectra = descriptor(sequences[i], min_seq_len)
        times_mldsp_acc += time.time() - t_0

        # cnn: chaos-game representation image from 5-mer probabilities
        t_0 = time.time()
        chaos = chaos_game_representation(probabilities(str(sequences[i]), count_kmers(str(sequences[i]), 5), 5), 5)
        times_cnn_acc += time.time() - t_0

    # average wall-clock time per sequence for each method
    time_kameris = times_kameris_acc/sequences.shape[0]
    time_castor = times_castor_acc/sequences.shape[0]
    time_mldsp = times_mldsp_acc/sequences.shape[0]
Example #3
0
def kameris(train, test, k, dimention_reduction):
    """Train a linear SVM on CGR k-mer features and evaluate it on `test`.

    Parameters:
        train, test: iterables of samples shaped [id, sequence, label];
            the sequence is read from sample[1], the label from sample[2].
        k: k-mer length passed to kam.cgr.
        dimention_reduction: 1 enables TruncatedSVD reduction when the
            estimated component count is usable; otherwise no reduction.

    Returns:
        (accuracy, precision, recall, fscore, number_features) with
        weighted-average metrics computed on the test split.
    """
    scaler1 = StandardScaler()

    #############################################
    # compute k-mer frequencies for the training split
    X_train = []
    y_train = []
    for sample in train:
        y_train.append(sample[2])
        seq = sample[1]
        k_mers_frecuencies = kam.cgr(seq, k)
        X_train.append(k_mers_frecuencies)

    # np.matrix is deprecated; a plain 2-D ndarray behaves the same for
    # count_nonzero / len / sklearn estimators below.
    X_train = np.asarray(X_train)

    #############################################
    # scaling — fit on the training split only; the same fitted scaler is
    # reused for the test split below
    scaler1.fit(X_train)
    X_train = scaler1.transform(X_train)

    #############################################
    # dimension reduction: keep ~10% of the average number of non-zero
    # features per sample, but only when that estimate is meaningful
    # (strictly between 4 and 4^k)
    number_features = int((np.count_nonzero(X_train > 0) * 0.1) / len(X_train))
    if dimention_reduction == 1 and pow(
            4, k) > number_features and number_features > 4:
        #SVD
        svd = TruncatedSVD(n_components=number_features)
        rows, cols = X_train.shape
        svd.fit(X_train)
        X_train = svd.transform(X_train)
        print("SVD aplied ... X_train.shape: ", [rows, cols], X_train.shape)
    else:
        # No reduction applied: report the true feature count (4^k) instead
        # of the unused estimate, matching the training-only variant of
        # this function. The test-path condition below stays false either
        # way, so `svd` is never referenced when it was not fitted.
        number_features = pow(4, k)

    #############################################
    # train
    clf = SVC(kernel="linear", C=1)
    clf.fit(X_train, y_train)

    #############################################
    # compute k-mer frequencies for the test split
    X_test = []
    y_test = []
    for sample in test:
        y_test.append(sample[2])
        seq = sample[1]
        k_mers_frecuencies = kam.cgr(seq, k)
        X_test.append(k_mers_frecuencies)

    X_test = np.asarray(X_test)

    #############################################
    # scaling with the scaler fitted on the training data
    X_test = scaler1.transform(X_test)

    #############################################
    # dimension reduction with the SVD fitted on the training data
    if dimention_reduction == 1 and pow(
            4, k) > number_features and number_features > 4:
        X_test = svd.transform(X_test)

    #############################################
    # predict
    y_pred = clf.predict(X_test)

    metrics = precision_recall_fscore_support(y_test,
                                              y_pred,
                                              average='weighted')
    acc = accuracy_score(y_test, y_pred)

    return acc, metrics[0], metrics[1], metrics[2], number_features
Example #4
0
# Select the k-mer size and the pre-trained model file from the dataset
# family prefix. Fail fast on an unknown prefix: previously `k` and `model`
# were simply left undefined and the script crashed later with a confusing
# NameError at joblib.load.
if dataset[0:3] == "HIV":
    k = 5
    model = current_dir + "/models/kameris_" + dataset + "_dr=0_nf=1024_k=5.sav"
elif dataset[0:3] == "POL":
    k = 2
    model = current_dir + "/models/kameris_" + dataset + "_dr=0_nf=16_k=2.sav"
else:
    raise ValueError("unsupported dataset prefix: " + dataset)


clf = joblib.load(model)

# Load every record from the FASTA file as [id, upper-cased sequence].
data = []
sequences = SeqIO.parse(fasta, "fasta")
for record in sequences:
    data.append([record.id, record.seq.upper()])

# CGR k-mer frequency features for each sequence.
X_test = []
for seq in data:
    k_mers_frecuencies = kam.cgr(seq[1], k)
    X_test.append(k_mers_frecuencies)

result = clf.predict(X_test)
print(result)

    
        
    

def kameris(X_train, y_train, X_test, y_test, k, dimention_reduction,
            database_name):
    """Train, persist and evaluate a linear SVM on CGR k-mer features.

    Parameters:
        X_train, X_test: iterables of raw sequences (feature extraction
            via kam.cgr happens inside this function).
        y_train, y_test: class labels aligned with the sequences.
        k: k-mer length passed to kam.cgr.
        dimention_reduction: 1 enables TruncatedSVD reduction when the
            estimated component count is usable; otherwise no reduction.
        database_name: used to name the pickled model file written under
            current_dir/models/.

    Returns:
        (accuracy, precision, recall, fscore) with weighted-average
        metrics computed on the test split.
    """
    scaler1 = StandardScaler()

    #############################################
    # compute k-mer frequencies for both splits
    X_train_new = []
    for seq in X_train:
        X_train_new.append(kam.cgr(seq, k))
    # np.matrix is deprecated; a plain 2-D ndarray behaves the same for
    # count_nonzero / len / sklearn estimators below.
    X_train = np.asarray(X_train_new)

    X_test_new = []
    for seq in X_test:
        X_test_new.append(kam.cgr(seq, k))
    X_test = np.asarray(X_test_new)

    #############################################
    # scaling — fit on the training split only; reused for the test split
    scaler1.fit(X_train)
    X_train = scaler1.transform(X_train)

    #############################################
    # dimension reduction: keep ~10% of the average number of non-zero
    # features per sample, but only when that estimate is meaningful
    # (strictly between 4 and 4^k)
    number_features = int((np.count_nonzero(X_train > 0) * 0.1) / len(X_train))
    if dimention_reduction == 1 and pow(
            4, k) > number_features and number_features > 4:
        #SVD
        svd = TruncatedSVD(n_components=number_features)
        rows, cols = X_train.shape
        svd.fit(X_train)
        X_train = svd.transform(X_train)
        print("SVD aplied ... X_train.shape: ", [rows, cols], X_train.shape)

    #############################################
    # train
    clf = svm.SVC(kernel="linear", C=1)
    clf.fit(X_train, y_train)

    # Persist the fitted model. Use a context manager so the file handle is
    # always closed (the original passed an anonymous open() to pickle.dump
    # and leaked the handle).
    with open(current_dir + "/models/" + database_name + '-kameris.joblib',
              'wb') as model_file:
        pickle.dump(clf, model_file)

    #############################################
    # scaling with the scaler fitted on the training data
    X_test = scaler1.transform(X_test)

    #############################################
    # dimension reduction with the SVD fitted on the training data; the
    # condition is identical to the training-path one, so `svd` is only
    # referenced when it was actually fitted
    if dimention_reduction == 1 and pow(
            4, k) > number_features and number_features > 4:
        X_test = svd.transform(X_test)

    #############################################
    # predict
    y_pred = clf.predict(X_test)

    metrics = precision_recall_fscore_support(y_test,
                                              y_pred,
                                              average='weighted')
    acc = accuracy_score(y_test, y_pred)

    return acc, metrics[0], metrics[1], metrics[2]