Code Example #1
def train_model(path_config, args=None):
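    """Train an LSTM VAE on the training split, save the trained weights, and
    save the encoder's output on the test split."""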
    x = get_data1(path_config[args.model_name]['train'], args)
    x_test = get_data2(path_config[args.model_name]['test'], args)
    hyperparams = utils.get_model_hyperparams('lstm_vae')
    dirpath_results = path_config[args.model_name]['results']
    logger = utils.get_logger()
    logger.info((x.shape, x_test.shape))
    input_dim = x.shape[-1]  # 13
    timesteps = x.shape[1]  # 3
    # batch_size = 1

    vae, enc, gen = create_lstm_vae(input_dim, timesteps,
                                    hyperparams['batch_size'],
                                    hyperparams['intermediate_dim'],
                                    hyperparams['latent_dim'],
                                    hyperparams['epsilon_std'])

    ep = hyperparams["num_iterations"]
    if args.is_demo:
        ep = 1

    vae.fit(x, x, epochs=ep)
    vae.save_weights(dirpath_results + args.model_name + "_weights.h5")
    vae.summary(print_fn=logger.info)
    #     preds = vae.predict(x, batch_size=batch_size)
    predicted = enc.predict(x_test, batch_size=hyperparams['batch_size'])
    np.save(dirpath_results + args.model_name + "_predicted", predicted)
    logger.info(predicted.shape)
Code Example #2
def save_loss_curve(dirpath_results, model_name):
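    """Plot train and validation cross entropy (recorded every 50 iterations)
    from <model_name>_metrics.csv and save the curve as <model_name>_loss.png."""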
    logger = utils.get_logger()
    df_metrics = pd.read_csv('{}/{}_metrics.csv'.format(
        dirpath_results, model_name),
                             index_col=0)
    fpath_plot = '{}/{}_loss.png'.format(dirpath_results, model_name)

    train_ces = df_metrics['train_ce'].values
    valid_ces = df_metrics['valid_ce'].values

    if 'kmer' in model_name:
        train_ces = np.clip(train_ces, a_min=min(train_ces), a_max=0.07)
        valid_ces = np.clip(valid_ces, a_min=min(valid_ces), a_max=0.07)

    iterations = [i * 50 for i in range(df_metrics.shape[0])]

    plt.plot(iterations, train_ces, color=COLORS[0], label='Train CE')
    plt.plot(iterations, valid_ces, color=COLORS[1], label='Validation CE')

    plt.xlabel('Iteration')
    plt.ylabel('Cross Entropy')
    plt.title('Cross entropy curves {}'.format(model_name), fontsize=16)
    plt.legend()
    plt.savefig(fpath_plot)
    plt.clf()

    logger.info('Saved loss curves in {}'.format(fpath_plot))
Code Example #3
def generate_kmers(fpath_hierarchy, fpath_kmer):
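    """Generate k-mer feature CSVs (k = 1..6) from the train and validation
    splits for the phylum, class, and order levels."""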
    logger = utils.get_logger()
    for level in ['phylum', 'class', 'order']:
        logger.info('Generating K-Mers for {}'.format(level))

        fpath_level = os.path.join(fpath_kmer, level)

        if not os.path.exists(fpath_level):
            os.makedirs(fpath_level)

        for k in range(1, 7):
            logger.info('K={}'.format(k))
            kmerize_data(
                '{hie}/{level}/train.csv'.format(
                    hie=fpath_hierarchy, level=level),
                '{fpath}/train_{k}mer.csv'.format(
                    fpath=fpath_level, k=k
                ), k)

            kmerize_data(
                '{hie}/{level}/val.csv'.format(hie=fpath_hierarchy,
                                               level=level),
                '{fpath}/val_{k}mer.csv'.format(
                    fpath=fpath_level, k=k
                ), k)
Code Example #4
def embed_and_plot(df_data, embeddings, embed_config):
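    """Fit the dimensionality-reduction model from embed_config to the
    embeddings and scatter-plot points from the three most frequent phyla."""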
    logger = utils.get_logger()
    embed_data = embed_config['model'].fit_transform(embeddings)

    phy_labels = sorted(list(df_data['phylum'].value_counts().index[:3]))
    phy_indices = [i for i, _ in enumerate(phy_labels)]

    fpath_plot = embed_config['fpath_plot']

    for phy_idx in phy_indices:
        data_indices = np.where(
            df_data['phylum'].values == phy_labels[phy_idx])
        data_pts = embed_data[data_indices]
        plt.scatter(data_pts[:, 0],
                    data_pts[:, 1],
                    color=COLORS[phy_idx],
                    label=phy_labels[phy_idx])

    plt.xlabel(embed_config['xlabel'])
    plt.ylabel(embed_config['ylabel'])
    plt.title(embed_config['title'], fontsize=16)
    plt.legend()
    plt.savefig(fpath_plot)
    plt.clf()

    logger.info(embed_config['info_msg'].format(path=fpath_plot))
Code Example #5
File: hierarchy.py Project: srivathsapv/cgda_project
def split_data(df_taxa, level, n_samples_per_class, fpath_hierarchy):
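    """Create stratified train/test/val splits (60/20/20) for one taxonomy
    level, capping each class at n_samples_per_class, and write them as CSVs."""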
    logger = utils.get_logger()
    fpath_level = os.path.join(fpath_hierarchy, level)

    if not os.path.exists(fpath_level):
        os.makedirs(fpath_level)

    logger.info('Creating hierarchy data for {} level'.format(level))
    df_group = df_taxa.sample(frac=1).groupby(by=level)

    X = []
    y = []
    for name, group in df_group:
        gX, gy = get_xy_from_df(group, level)
        gX = gX[:int(n_samples_per_class)]
        gy = gy[:int(n_samples_per_class)]
        X.append(gX)
        y.append(gy)

    X = np.vstack(X)
    y = np.concatenate(y)

    (train_data, testval_data, train_labels, testval_labels) = \
        train_test_split(X, y, test_size=0.4, stratify=y, shuffle=True)

    (test_data, val_data, test_labels, val_labels) = \
        train_test_split(testval_data, testval_labels,
                         test_size=0.5, stratify=testval_labels, shuffle=True)

    create_split_csv(train_data, train_labels,
                     '{}/train.csv'.format(fpath_level))
    create_split_csv(test_data, test_labels,
                     '{}/test.csv'.format(fpath_level, level))
    create_split_csv(val_data, val_labels,
                     '{}/val.csv'.format(fpath_level, level))
Code Example #6
File: interface.py Project: srivathsapv/cgda_project
def train_model(path_config, args=None):
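    """Train LSTM classifiers for the phylum, class, and order levels using
    the 'rnn' hyperparameters."""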

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    logger = utils.get_logger()
    hyperparams = utils.get_model_hyperparams("rnn")
    learning_rate = hyperparams['learning_rate']
    path_config = path_config['rnn']
    dirpath_results = path_config['dirpath_rnn']

    logger.info('Training RNN Phylum Classifier')
    parameters = Parameters('phylum')
    data_config = path_config['phylum']
    train_data = load_data(data_config['train'])
    valid_data = load_data(data_config['val'])
    test_data = load_data(data_config['test'])
    model = LSTMClassifier(parameters.embedding_dim, parameters.hidden_dim,
                           parameters.vocab_size, parameters.label_size,
                           device).to(device)
    loss_fun = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    train(train_data, valid_data, test_data, model, loss_fun, optimizer,
          dirpath_results, parameters, device, logger, args.is_demo,
          'rnn_phylum')

    logger.info('Training RNN Class Classifier')
    parameters = Parameters('class')
    data_config = path_config['class']
    train_data = load_data(data_config['train'])
    valid_data = load_data(data_config['val'])
    test_data = load_data(data_config['test'])
    model = LSTMClassifier(parameters.embedding_dim, parameters.hidden_dim,
                           parameters.vocab_size, parameters.label_size,
                           device).to(device)
    loss_fun = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    train(train_data, valid_data, test_data, model, loss_fun, optimizer,
          dirpath_results, parameters, device, logger, args.is_demo,
          'rnn_class')

    logger.info('Training RNN Order Classifier')
    parameters = Parameters('order')
    data_config = path_config['order']
    train_data = load_data(data_config['train'])
    valid_data = load_data(data_config['val'])
    test_data = load_data(data_config['test'])
    model = LSTMClassifier(parameters.embedding_dim, parameters.hidden_dim,
                           parameters.vocab_size, parameters.label_size,
                           device).to(device)
    loss_fun = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    train(train_data, valid_data, test_data, model, loss_fun, optimizer,
          dirpath_results, parameters, device, logger, args.is_demo,
          'rnn_order')
Code Example #7
File: vae.py Project: srivathsapv/cgda_project
def generate_kmers(fpath_embeds, dirpath_vae, k):
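    """K-merize the embedding sequence file and save the numeric k-mer
    features as features_<k>mer.npy."""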
    logger = utils.get_logger()
    logger.info('Generating {}-mers for VAE'.format(k))

    fpath_embed_kmer = '{}/embeds_{}mer.csv'.format(dirpath_vae, k)
    kmerize_data(fpath_embeds, fpath_embed_kmer, k)

    df_kmer = pd.read_csv(fpath_embed_kmer).drop(
        ['id', 'phylum', 'class', 'order'], axis=1)
    data_kmer = df_kmer.values.astype(np.float16)
    np.save('{}/features_{}mer.npy'.format(dirpath_vae, k), data_kmer)
Code Example #8
File: hierarchy.py Project: srivathsapv/cgda_project
def create_hierarchy(fpath_taxa, fpath_hierarchy):
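    """Create per-level hierarchy splits from taxa.csv: phylum and class
    directly, and order after regrouping labels via group_labels with
    ORDER_OTHER_LABELS."""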
    logger = utils.get_logger()
    logger.info('Creating hierarchy files from taxa.csv')
    if not os.path.exists(fpath_hierarchy):
        os.makedirs(fpath_hierarchy)

    df_taxa = pd.read_csv(fpath_taxa)

    split_data(df_taxa, 'phylum', MAX_SAMPLES_PER_CLASS, fpath_hierarchy)
    split_data(df_taxa, 'class', MAX_SAMPLES_PER_CLASS, fpath_hierarchy)

    df_group = group_labels(df_taxa, ORDER_OTHER_LABELS)
    split_data(df_group, 'order', MAX_SAMPLES_PER_CLASS, fpath_hierarchy)
Code Example #9
File: vae.py Project: srivathsapv/cgda_project
def generate_vae_data(fpath_embeds, dirpath_vae):
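    """Write the raw sequences to ordinal_sequences.txt and generate 4-mer and
    5-mer feature files for VAE training."""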
    logger = utils.get_logger()
    logger.info('Generating data files for VAE training')
    if not os.path.exists(dirpath_vae):
        os.makedirs(dirpath_vae)

    df_embeds = pd.read_csv(fpath_embeds)
    seqs = df_embeds['sequence'].values

    with open('{}/ordinal_sequences.txt'.format(dirpath_vae), 'w') as seqfile:
        seqfile.write('\n'.join(seqs))

    generate_kmers(fpath_embeds, dirpath_vae, 4)
    generate_kmers(fpath_embeds, dirpath_vae, 5)
Code Example #10
File: kmer.py Project: srivathsapv/cgda_project
def train_basic(models, dirpath_kmer, dirpath_output, kmin, kmax, is_demo):
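    """Train each basic model per level on k-mer features for k in
    [kmin, kmax] and, unless in demo mode, on the concatenated k-mer features,
    saving validation predictions for each run."""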
    logger = utils.get_logger()

    for model in models:
        model_str = type(model).__name__.lower()

        for level in ['phylum', 'class', 'order']:
            combined_train_data = []
            combined_val_data = []

            for k in range(kmin, kmax + 1):
                df_train = pd.read_csv('{}/{}/train_{}mer.csv'.format(
                    dirpath_kmer, level, k))
                df_val = pd.read_csv('{}/{}/val_{}mer.csv'.format(
                    dirpath_kmer, level, k))

                train_f1, pred_val = train_kmer_for_level(
                    model, level, k, df_train, df_val)

                combined_train_data.append(df_train.values[:, :-2].astype(
                    np.float16))
                combined_val_data.append(df_val.values[:, :-2].astype(
                    np.float16))

                logger.info(
                    'Train F1 Score for {} model for {} level and k={} is {:.3f}'
                    .format(model_str, level, k, train_f1))
                np.save(
                    '{}/{}_preds_{}_{}mer.npy'.format(dirpath_output,
                                                      model_str, level, k),
                    pred_val)
            if not is_demo:
                combined_train_data = np.hstack(combined_train_data)
                combined_val_data = np.hstack(combined_val_data)

                train_labels = df_train['label'].values.astype(np.int8)
                val_labels = df_val['label'].values.astype(np.int8)

                combined_f1, combined_pred = train_kmer_combined(
                    model, level, combined_train_data, train_labels,
                    combined_val_data, val_labels)

                logger.info(('Train F1 Score for {} model for {} level and ' +
                             'combined K from {}-{} is {:.3f}').format(
                                 model_str, level, kmin, kmax, combined_f1))
                np.save(
                    '{}/{}_preds_{}_combined.npy'.format(
                        dirpath_output, model_str, level), combined_pred)
Code Example #11
def get_embeddings(fpath_data, is_kmer, feature_type, use_gpu, dirpath_results,
                   model_name):
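    """Load the best VAE checkpoint and return the latent means (mu) for all
    inputs, running inference in batches of 5,000."""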

    logger = utils.get_logger()
    logger.info('Running inference and obtaining embeddings for {}'.format(
        feature_type))

    if is_kmer:
        kval = int(feature_type.split('_')[1])
        fpath_features = '{}/features_{}mer.npy'.format(fpath_data, kval)
        features = np.load(fpath_features)
    else:
        fpath_features = '{}/ordinal_sequences.txt'.format(fpath_data)
        with open(fpath_features, 'r') as seq_file:
            features = seq_file.read().split('\n')

    batch_loader = BatchLoader(data_path=fpath_features, is_kmer=(is_kmer))
    parameters = Parameters(batch_loader.vocab_size, feature_type=feature_type)

    vae = VAE(parameters)

    if use_gpu:
        vae = vae.cuda()

    vae.load_state_dict(
        t.load('{}/{}_best.pth'.format(dirpath_results, model_name)))

    batch_size = 5000
    batch_indices = np.arange(start=batch_size, stop=len(features),
                              step=batch_size)
    feature_batches = np.split(features, batch_indices)

    embeddings = []

    for feature_batch in feature_batches:
        if not is_kmer:
            idxs = [batch_loader._get_idxs(seq) for seq in feature_batch]
            encoder_input, _, _ = batch_loader._wrap_tensor(idxs,
                                                            use_cuda=use_gpu)
        else:
            encoder_input, _, _ = batch_loader._wrap_tensor(feature_batch,
                                                            use_cuda=use_gpu)
        mu, _ = vae.inference(encoder_input)
        embeddings.extend(mu.data.cpu().numpy())

    embeddings = np.array(embeddings)
    return embeddings
Code Example #12
File: kmer.py Project: srivathsapv/cgda_project
def plot_kmer_metrics(path_config, args):
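    """Plot macro F1 on the validation split versus k-mer length (k = 1..6
    plus the combined features) for the SVM and Random Forest models at each
    taxonomy level."""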
    logger = utils.get_logger()

    path_config = path_config['basic_kmer']

    dirpath_kmer = path_config['dirpath_kmer']
    dirpath_results = path_config['results']

    model_display = {'svc': 'SVM', 'randomforestclassifier': 'Random Forest'}

    xticks = [str(i) for i in range(1, 7)]
    xticks.append('combined')

    for model in ['svc', 'randomforestclassifier']:
        model_scores = {}
        for i, level in enumerate(['phylum', 'class', 'order']):
            model_scores[level] = []
            df_data = pd.read_csv('{}/{}/val_1mer.csv'.format(
                dirpath_kmer, level))
            gt_y = df_data['label'].values.astype(np.int8)

            for k in range(1, 7):
                pred_y = np.load('{}/{}_preds_{}_{}mer.npy'.format(
                    dirpath_results, model, level, k))
                model_scores[level].append(
                    f1_score(gt_y, pred_y, average='macro'))
            pred_y = np.load('{}/{}_preds_{}_combined.npy'.format(
                dirpath_results, model, level))
            model_scores[level].append(f1_score(gt_y, pred_y, average='macro'))

            plt.plot(range(1, 8),
                     model_scores[level],
                     label=level,
                     color=COLORS[i])

        fpath_plot = '{}/{}_kmer.png'.format(dirpath_results, model)
        plt.xticks(range(1, 8), xticks)
        plt.xlabel('K-Mer length')
        plt.ylabel('F1 Score')
        plt.title('F1 Metrics for {}'.format(model_display[model]))
        plt.legend()
        plt.savefig(fpath_plot)
        logger.info('Saving K-Mer comparison plot for {} in {}'.format(
            model_display[model], fpath_plot))
        plt.clf()
Code Example #13
File: interface.py Project: srivathsapv/cgda_project
def test_model(path_config, args=None):
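    """Run evaluation of the LSTM classifiers on the test splits for the
    phylum, class, and order levels."""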

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    logger = utils.get_logger()
    path_config = path_config['rnn']
    dirpath_results = path_config['dirpath_rnn']

    logger.info('Testing RNN Phylum Classifier')
    parameters = Parameters('phylum')
    data_config = path_config['phylum']
    test_data = load_data(data_config['test'])
    model = LSTMClassifier(parameters.embedding_dim, parameters.hidden_dim,
                           parameters.vocab_size, parameters.label_size,
                           device).to(device)
    evaluate(test_data, model, dirpath_results, parameters, device, logger,
             'rnn_phylum')

    logger.info('Testing RNN Class Classifier')
    parameters = Parameters('class')
    data_config = path_config['class']
    test_data = load_data(data_config['test'])
    model = LSTMClassifier(parameters.embedding_dim, parameters.hidden_dim,
                           parameters.vocab_size, parameters.label_size,
                           device).to(device)
    evaluate(test_data, model, dirpath_results, parameters, device, logger,
             'rnn_class')

    logger.info('Testing RNN Order Classifier')
    parameters = Parameters('order')
    data_config = path_config['order']
    test_data = load_data(data_config['test'])
    model = LSTMClassifier(parameters.embedding_dim, parameters.hidden_dim,
                           parameters.vocab_size, parameters.label_size,
                           device).to(device)
    evaluate(test_data, model, dirpath_results, parameters, device, logger,
             'rnn_order')
Code Example #14
File: interface.py Project: srivathsapv/cgda_project
def train_model(path_config, args=None):
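    """Dispatch training for the basic ML baselines (k-mer, vector, or one-hot
    features) based on args.model_name."""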
    if args.model_name == 'basic_kmer':
        logger = utils.get_logger()
        path_config = path_config['basic_kmer']
        kmer_config = utils.get_model_hyperparams('basic_kmer')

        models = [
            SVC(**kmer_config['svm']),
            RandomForestClassifier(**kmer_config['random_forest'])
        ]

        if args.is_demo:
            logger.info(
                'WARNING! Running in Demo Mode. Because of the fact that SVM is taking a long time, '
                +
                'training will be run only for k=1 to 4. Training for combined data will not be run.'
            )
            kmer_config['kmax'] = 4

        kmer_train_basic(models,
                         dirpath_kmer=path_config['dirpath_kmer'],
                         dirpath_output=path_config['results'],
                         kmin=kmer_config['kmin'],
                         kmax=kmer_config['kmax'],
                         is_demo=args.is_demo)
    elif args.model_name == 'basic_vector':
        path_config = path_config['basic_vector']
        vector_train_basic(dirpath_vector=path_config['dirpath_vector'],
                           dirpath_output=path_config['results'])

    elif args.model_name == 'basic_onehot':
        path_config = path_config['basic_onehot']
        onehot_train_basic(dirpath_vector=path_config['dirpath_onehot'],
                           dirpath_output=path_config['results'])
    else:
        raise ValueError('Basic ML model {} not supported'.format(
            args.model_name))
Code Example #15
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.utils.data as torch_utils

from ml.utils import get_logger

plt.style.use('seaborn')

warnings.filterwarnings("ignore")
torch.set_num_threads(1)

LOGGER = get_logger()


def read_data_from_csv(path):
    df = pd.read_csv(path)
    X = df["sequence"].values
    y = df["label"].values
    y_names = df["class_name"].values
    return X, y, y_names


def load_train_val_test_data(base_path,
                             level,
                             analyze=True,
                             return_label_names=False):
    base_path_on_level = os.path.join(base_path, level)
Code Example #16
File: onehot.py Project: srivathsapv/cgda_project
def train_basic(dirpath_vector, dirpath_output):
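    """Ordinal-encode the phylum- and class-level sequences, cache them in
    HDF5, train SVM and Random Forest baselines on the train split, save their
    test-set predictions and scores, and log training metrics."""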
    logger = utils.get_logger()
    x_train = np.genfromtxt(
        dirpath_vector + '/phylum/train.csv', delimiter='\n', dtype=None, encoding=None)
    x_test = np.genfromtxt(dirpath_vector + '/phylum/test.csv',
                           delimiter='\n', dtype=None, encoding=None)
    x_val = np.genfromtxt(dirpath_vector + '/phylum/val.csv',
                          delimiter='\n', dtype=None, encoding=None)
    arr = []
    arr1 = []
    arr2 = []

    for item in x_train[1:]:
        arr.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_test[1:]:
        arr1.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_val[1:]:
        arr2.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    maxi = 0
    for item in arr:
        if len(item) > maxi:
            maxi = len(item)

    final1 = np.zeros((x_train.shape[0] - 1, maxi, 5))

    count = 0
    for item in arr:
        final1[count][:len(item)] = item
        count += 1

    maxi1 = 0
    for item in arr1:
        if len(item) > maxi1:
            maxi1 = len(item)

    final2 = np.zeros((x_test.shape[0] - 1, maxi1, 5))

    count = 0
    for item in arr1:
        final2[count][:len(item)] = item
        count += 1

    maxi2 = 0
    for item in arr2:
        if len(item) > maxi2:
            maxi2 = len(item)

    final3 = np.zeros((x_val.shape[0] - 1, maxi2, 5))

    count = 0
    for item in arr2:
        final3[count][:len(item)] = item
        count += 1

    hf = h5py.File(dirpath_vector + '/phylum/ordinal.h5', 'w')

    hf.create_dataset('dataset_1', data=final1)
    hf.create_dataset('dataset_2', data=final2)
    hf.create_dataset('dataset_3', data=final3)

    hf.close()

    x_train = np.genfromtxt(
        dirpath_vector + '/class/train.csv', delimiter='\n', dtype=None, encoding=None)
    x_test = np.genfromtxt(dirpath_vector + '/class/test.csv',
                           delimiter='\n', dtype=None, encoding=None)
    x_val = np.genfromtxt(dirpath_vector + '/class/val.csv',
                          delimiter='\n', dtype=None, encoding=None)
    arr = []
    arr1 = []
    arr2 = []

    for item in x_train[1:]:
        arr.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_test[1:]:
        arr1.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_val[1:]:
        arr2.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    maxi = 0
    for item in arr:
        if len(item) > maxi:
            maxi = len(item)

    final1 = np.zeros((x_train.shape[0] - 1, maxi, 5))

    count = 0
    for item in arr:
        final1[count][:len(item)] = item
        count += 1

    maxi1 = 0
    for item in arr1:
        if len(item) > maxi1:
            maxi1 = len(item)

    final2 = np.zeros((x_test.shape[0] - 1, maxi1, 5))

    count = 0
    for item in arr1:
        final2[count][:len(item)] = item
        count += 1

    maxi2 = 0
    for item in arr2:
        if len(item) > maxi2:
            maxi2 = len(item)

    final3 = np.zeros((x_val.shape[0] - 1, maxi2, 5))

    count = 0
    for item in arr2:
        final3[count][:len(item)] = item
        count += 1

    hf = h5py.File(dirpath_vector + '/class/ordinal.h5', 'w')

    hf.create_dataset('dataset_1', data=final1)
    hf.create_dataset('dataset_2', data=final2)
    hf.create_dataset('dataset_3', data=final3)

    hf.close()

    hf = h5py.File(dirpath_vector + '/phylum/ordinal.h5', 'r')
    n1 = hf.get('dataset_1')
    n2 = hf.get('dataset_2')
    n3 = hf.get('dataset_3')
    X = np.array(n1)
    Y = np.array(n2)
    V = np.array(n3)
    hf.close()
    lab = np.genfromtxt(dirpath_vector + '/phylum/train.csv',
                        delimiter='\n', dtype=None, encoding=None)
    lab1 = np.genfromtxt(dirpath_vector + '/phylum/test.csv',
                         delimiter='\n', dtype=None, encoding=None)
    lab2 = np.genfromtxt(dirpath_vector + '/phylum/val.csv',
                         delimiter='\n', dtype=None, encoding=None)

    labels = []
    i = 0
    for item in lab[1:]:
        if item.split(",")[0][0] == "A":
            labels.append(0)
        elif item.split(",")[0][0] == "F":
            labels.append(1)
        else:
            labels.append(2)
        i += 1

    labels1 = []
    i = 0
    for item in lab1[1:]:
        if item.split(",")[0][0] == "A":
            labels1.append(0)
        elif item.split(",")[0][0] == "F":
            labels1.append(1)
        else:
            labels1.append(2)
        i += 1

    labels2 = []
    i = 0
    for item in lab2[1:]:
        if item.split(",")[0][0] == "A":
            labels2.append(0)
        elif item.split(",")[0][0] == "F":
            labels2.append(1)
        else:
            labels2.append(2)
        i += 1

    label = np.array(labels)
    label1 = np.array(labels1)
    label2 = np.array(labels2)

    clf2 = SVC(kernel='rbf')
    clf = RandomForestClassifier()

    newX = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
    newY = Y.reshape(Y.shape[0], Y.shape[1] * Y.shape[2])
    clf2.fit(newX, label)
    clf.fit(newX, label)

    preds2 = clf2.predict(newX)
    preds = clf.predict(newX)

    preds2_test = clf2.predict(newY)
    preds_test = clf.predict(newY)
    np.save(dirpath_output + '/SVM_phylum_predictions', preds2_test)
    np.save(dirpath_output + '/RF_phylum_predictions', preds_test)

    scores = clf2.decision_function(newY)
    scores2 = clf.predict(newY)

    score = np.amax(scores, axis=1)

    scores_train = clf2.decision_function(newX)
    scores2_train = clf.predict(newX)

    score_train = np.amax(scores_train, axis=1)

    np.save(dirpath_output + '/SVM_phylum_scores', score)
    np.save(dirpath_output + '/RF_phylum_scores', scores2)

    fpr, tpr, thresholds = roc_curve(label, score_train, pos_label=2)
    fpr2, tpr2, thresholds2 = roc_curve(label, scores2_train, pos_label=2)

    match2 = 0
    for i in range(preds2.shape[0]):
        if preds2[i] == label[i]:
            match2 += 1
    accuracy2 = float(match2) / preds2.shape[0]
    p, r, f1, s = precision_recall_fscore_support(
        label, preds2, average='weighted')

    match = 0
    for i in range(preds.shape[0]):
        if preds[i] == label[i]:
            match += 1
    accuracy = float(match) / preds.shape[0]
    p2, r2, f12, s = precision_recall_fscore_support(
        label, preds, average='weighted')

    C = confusion_matrix(label, preds2)

    logger.info('Train Accuracy, precision, recall and F1 Score for SVM model for phylum level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'.format(
        accuracy2, p, r, f1))
    logger.info('Train Accuracy, precision, recall and F1 Score for Random Forest model for phylum level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'.format(
        accuracy, p2, r2, f12))

    hf = h5py.File(dirpath_vector + '/class/ordinal.h5', 'r')
    n1 = hf.get('dataset_1')
    n2 = hf.get('dataset_2')
    n3 = hf.get('dataset_3')
    X = np.array(n1)
    Y = np.array(n2)
    V = np.array(n3)
    hf.close()

    lab = np.genfromtxt(dirpath_vector + '/class/train.csv',
                        delimiter='\n', dtype=None, encoding=None)
    lab1 = np.genfromtxt(dirpath_vector + '/class/test.csv',
                         delimiter='\n', dtype=None, encoding=None)
    lab2 = np.genfromtxt(dirpath_vector + '/class/val.csv',
                         delimiter='\n', dtype=None, encoding=None)

    labels = []
    i = 0
    for item in lab[1:]:
        labels.append(int(item.split(",")[2]))
        i += 1

    labels1 = []
    i = 0
    for item in lab1[1:]:
        labels1.append(int(item.split(",")[2]))
        i += 1

    labels2 = []
    i = 0
    for item in lab2[1:]:
        labels2.append(int(item.split(",")[2]))
        i += 1

    label = np.array(labels)
    label1 = np.array(labels1)
    label2 = np.array(labels2)

    clf2 = RandomForestClassifier()
    clf = SVC(kernel='rbf')

    newX = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
    newY = Y.reshape(Y.shape[0], Y.shape[1] * Y.shape[2])
    clf2.fit(newX, label)
    clf.fit(newX, label)
    preds2 = clf2.predict(newX)
    preds = clf.predict(newX)
    scores = clf2.predict(newY)
    scores1 = clf.decision_function(newY)

    preds2_test = clf2.predict(newY)
    preds_test = clf.predict(newY)

    np.save(dirpath_output + '/SVM_class_predictions', preds2_test)
    np.save(dirpath_output + '/RF_class_predictions', preds_test)

    score = np.amax(scores1, axis=1)

    scores_train = clf.decision_function(newX)
    scores2_train = clf2.predict(newX)

    score_train = np.amax(scores_train, axis=1)

    np.save(dirpath_output + '/SVM_class_scores', score)
    np.save(dirpath_output + '/RF_class_scores', scores)

    fpr, tpr, thresholds = roc_curve(label, scores_train, pos_label=2)
    fpr2, tpr2, thresholds2 = roc_curve(label, score_train, pos_label=2)

    match2 = 0
    for i in range(preds2.shape[0]):
        if preds2[i] == label[i]:
            match2 += 1
    accuracy2 = float(match2) / preds2.shape[0]
    p, r, f1, s = precision_recall_fscore_support(
        label, preds2, average='weighted')
    C = confusion_matrix(label, preds2)

    match = 0
    for i in range(preds.shape[0]):
        if preds[i] == label[i]:
            match += 1
    accuracy = float(match) / preds.shape[0]
    p2, r2, f12, s = precision_recall_fscore_support(
        label, preds, average='weighted')

    logger.info('Train Accuracy, precision, recall and F1 Score for SVM model for class level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'.format(
        accuracy, p2, r2, f12))
    logger.info('Train Accuracy, precision, recall and F1 Score for Random Forest model for class level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'.format(
        accuracy2, p, r, f1))
Code Example #17
def train_vae(path_config, feature_type, hyperparams, model_name):
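    """Train the VAE with a reconstruction + auxiliary + KL objective,
    recording metrics every 50 iterations and checkpointing the model with the
    lowest validation cross entropy."""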
    t.cuda.empty_cache()
    logger = utils.get_logger()

    is_kmer = ('kmer' in feature_type)
    fpath_data = (path_config['features']
                  if is_kmer else path_config['sequences'])
    dirpath_results = path_config['results']

    use_gpu = t.cuda.is_available()

    batch_loader = BatchLoader(data_path=fpath_data, is_kmer=(is_kmer))
    parameters = Parameters(batch_loader.vocab_size, feature_type=feature_type)

    vae = VAE(parameters)

    if use_gpu:
        vae = vae.cuda()

    optimizer = Adam(vae.parameters(), hyperparams['learning_rate'])

    metrics = []
    min_ce = 1000
    min_vae = None

    num = hyperparams['num_iterations']

    for iteration in tqdm(range(num), total=num):
        # Train step
        input, decoder_input, target = batch_loader.next_batch(
            hyperparams['batch_size'], 'train', use_gpu)
        target = target.view(-1)

        logits, aux_logits, kld = vae(hyperparams['dropout'], input,
                                      decoder_input)

        logits = logits.view(-1, batch_loader.vocab_size)
        cross_entropy = F.cross_entropy(logits, target, size_average=False)

        aux_logits = aux_logits.view(-1, batch_loader.vocab_size)
        aux_cross_entropy = F.cross_entropy(aux_logits,
                                            target,
                                            size_average=False)

        loss = cross_entropy + hyperparams['aux'] * aux_cross_entropy + kld

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Validation
        input, decoder_input, target = batch_loader.next_batch(
            hyperparams['batch_size'], 'valid', use_gpu)
        target = target.view(-1)

        logits, aux_logits, valid_kld = vae(hyperparams['dropout'], input,
                                            decoder_input)

        logits = logits.view(-1, batch_loader.vocab_size)
        valid_cross_entropy = F.cross_entropy(logits,
                                              target,
                                              size_average=False)

        aux_logits = aux_logits.view(-1, batch_loader.vocab_size)
        valid_aux_cross_entropy = F.cross_entropy(aux_logits,
                                                  target,
                                                  size_average=False)

        loss = valid_cross_entropy + \
            hyperparams['aux'] * valid_aux_cross_entropy + valid_kld

        if iteration % 50 == 0:
            metrics_dict = get_metrics_dict(cross_entropy, aux_cross_entropy,
                                            kld, valid_cross_entropy,
                                            valid_aux_cross_entropy, valid_kld,
                                            hyperparams['batch_size'])
            metrics.append(metrics_dict)

            valid_ce = metrics_dict['valid_ce']

            if valid_ce <= min_ce:
                min_vae_dict = vae.state_dict()
                min_ce = valid_ce
                logger.info(
                    'Saving best model in iteration {}'.format(iteration))
                t.save(vae.state_dict(),
                       '{}/{}_best.pth'.format(dirpath_results, model_name))

    logger.info('Saving final metrics')
    df_metrics = pd.DataFrame(metrics)
    df_metrics.to_csv('{}/{}_metrics.csv'.format(dirpath_results, model_name))
Code Example #18
def test_basic(dirpath_vector, dirpath_output, verbose=True):
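    """Rebuild the ordinal-encoded phylum and class data, fit SVM and Random
    Forest baselines on the train split, and report test-set metrics and ROC
    curves."""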
    logger = utils.get_logger()
    x_train = np.genfromtxt(dirpath_vector + '/phylum/train.csv',
                            delimiter='\n',
                            dtype=None,
                            encoding=None)
    x_test = np.genfromtxt(dirpath_vector + '/phylum/test.csv',
                           delimiter='\n',
                           dtype=None,
                           encoding=None)
    x_val = np.genfromtxt(dirpath_vector + '/phylum/val.csv',
                          delimiter='\n',
                          dtype=None,
                          encoding=None)
    arr = []
    arr1 = []
    arr2 = []

    for item in x_train[1:]:
        arr.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_test[1:]:
        arr1.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_val[1:]:
        arr2.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    maxi = 0
    for item in arr:
        if len(item) > maxi:
            maxi = len(item)

    final1 = np.zeros((x_train.shape[0] - 1, maxi))

    count = 0
    for item in arr:
        final1[count][:len(item)] = item
        count += 1

    maxi1 = 0
    for item in arr1:
        if len(item) > maxi1:
            maxi1 = len(item)

    final2 = np.zeros((x_test.shape[0] - 1, maxi1))

    count = 0
    for item in arr1:
        final2[count][:len(item)] = item
        count += 1

    maxi2 = 0
    for item in arr2:
        if len(item) > maxi2:
            maxi2 = len(item)

    final3 = np.zeros((x_val.shape[0] - 1, maxi2))

    count = 0
    for item in arr2:
        final3[count][:len(item)] = item
        count += 1

    hf = h5py.File(dirpath_vector + '/phylum/ordinal.h5', 'w')

    hf.create_dataset('dataset_1', data=final1)
    hf.create_dataset('dataset_2', data=final2)
    hf.create_dataset('dataset_3', data=final3)

    hf.close()

    x_train = np.genfromtxt(dirpath_vector + '/class/train.csv',
                            delimiter='\n',
                            dtype=None,
                            encoding=None)
    x_test = np.genfromtxt(dirpath_vector + '/class/test.csv',
                           delimiter='\n',
                           dtype=None,
                           encoding=None)
    x_val = np.genfromtxt(dirpath_vector + '/class/val.csv',
                          delimiter='\n',
                          dtype=None,
                          encoding=None)
    arr = []
    arr1 = []
    arr2 = []

    for item in x_train[1:]:
        arr.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_test[1:]:
        arr1.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_val[1:]:
        arr2.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    maxi = 0
    for item in arr:
        if len(item) > maxi:
            maxi = len(item)

    final1 = np.zeros((x_train.shape[0] - 1, maxi))

    count = 0
    for item in arr:
        final1[count][:len(item)] = item
        count += 1

    maxi1 = 0
    for item in arr1:
        if len(item) > maxi1:
            maxi1 = len(item)

    final2 = np.zeros((x_test.shape[0] - 1, maxi1))

    count = 0
    for item in arr1:
        final2[count][:len(item)] = item
        count += 1

    maxi2 = 0
    for item in arr2:
        if len(item) > maxi2:
            maxi2 = len(item)

    final3 = np.zeros((x_val.shape[0] - 1, maxi2))

    count = 0
    for item in arr2:
        final3[count][:len(item)] = item
        count += 1

    hf = h5py.File(dirpath_vector + '/class/ordinal.h5', 'w')

    hf.create_dataset('dataset_1', data=final1)
    hf.create_dataset('dataset_2', data=final2)
    hf.create_dataset('dataset_3', data=final3)

    hf.close()

    hf = h5py.File(dirpath_vector + '/phylum/ordinal.h5', 'r')
    n1 = hf.get('dataset_1')
    n2 = hf.get('dataset_2')
    n3 = hf.get('dataset_3')
    X = np.array(n1)
    Y = np.array(n2)
    V = np.array(n3)
    hf.close()
    lab = np.genfromtxt(dirpath_vector + '/phylum/train.csv',
                        delimiter='\n',
                        dtype=None,
                        encoding=None)
    lab1 = np.genfromtxt(dirpath_vector + '/phylum/test.csv',
                         delimiter='\n',
                         dtype=None,
                         encoding=None)
    lab2 = np.genfromtxt(dirpath_vector + '/phylum/val.csv',
                         delimiter='\n',
                         dtype=None,
                         encoding=None)

    labels = []
    i = 0
    for item in lab[1:]:
        if item.split(",")[0][0] == "A":
            labels.append(0)
        elif item.split(",")[0][0] == "F":
            labels.append(1)
        else:
            labels.append(2)
        i += 1

    labels1 = []
    i = 0
    for item in lab1[1:]:
        if item.split(",")[0][0] == "A":
            labels1.append(0)
        elif item.split(",")[0][0] == "F":
            labels1.append(1)
        else:
            labels1.append(2)
        i += 1

    labels2 = []
    i = 0
    for item in lab2[1:]:
        if item.split(",")[0][0] == "A":
            labels2.append(0)
        elif item.split(",")[0][0] == "F":
            labels2.append(1)
        else:
            labels2.append(2)
        i += 1

    label = np.array(labels)
    label1 = np.array(labels1)
    label2 = np.array(labels2)

    clf2 = SVC(kernel='rbf')
    clf = RandomForestClassifier()

    clf2.fit(X, label)
    clf.fit(X, label)

    preds2 = clf2.predict(Y)
    preds = clf.predict(Y)

    scores = clf2.decision_function(Y)
    scores2 = clf.predict(Y)

    score = np.amax(scores, axis=1)

    fpr, tpr, thresholds = roc_curve(label1, score, pos_label=2)
    fpr2, tpr2, thresholds2 = roc_curve(label1, scores2, pos_label=2)

    roc_auc = auc(fpr, tpr)
    roc_auc2 = auc(fpr2, tpr2)
    plt.plot(fpr, tpr, lw=1, label='(AUC = %0.2f)' % (roc_auc))
    plt.plot(fpr2, tpr2, lw=1, label='(AUC = %0.2f)' % (roc_auc2))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title("ROC curve SVM vs RF - Phylum Level")
    plt.legend(("SVM", "RandomForest"))
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    if not os.path.exists(dirpath_output):
        os.makedirs(dirpath_output)
    plt.savefig(dirpath_output + "/" + "ROC_Phylum")

    match2 = 0
    for i in range(preds2.shape[0]):
        if preds2[i] == label1[i]:
            match2 += 1
    accuracy2 = float(match2) / preds2.shape[0]
    p, r, f1, s = precision_recall_fscore_support(label1,
                                                  preds2,
                                                  average='weighted')

    match = 0
    for i in range(preds.shape[0]):
        if preds[i] == label1[i]:
            match += 1
    accuracy = float(match) / preds.shape[0]
    p2, r2, f12, s = precision_recall_fscore_support(label1,
                                                     preds,
                                                     average='weighted')

    C = confusion_matrix(label1, preds2)

    logger.info(
        'Test Accuracy, precision, recall and F1 Score for SVM model for phylum level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
        .format(accuracy2, p, r, f1))
    logger.info(
        'Test Accuracy, precision, recall and F1 Score for Random Forest model for phylum level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
        .format(accuracy, p2, r2, f12))

    hf = h5py.File(dirpath_vector + '/class/ordinal.h5', 'r')
    n1 = hf.get('dataset_1')
    n2 = hf.get('dataset_2')
    n3 = hf.get('dataset_3')
    X = np.array(n1)
    Y = np.array(n2)
    V = np.array(n3)
    hf.close()

    lab = np.genfromtxt(dirpath_vector + '/class/train.csv',
                        delimiter='\n',
                        dtype=None,
                        encoding=None)
    lab1 = np.genfromtxt(dirpath_vector + '/class/test.csv',
                         delimiter='\n',
                         dtype=None,
                         encoding=None)
    lab2 = np.genfromtxt(dirpath_vector + '/class/val.csv',
                         delimiter='\n',
                         dtype=None,
                         encoding=None)

    labels = []
    i = 0
    for item in lab[1:]:
        labels.append(int(item.split(",")[2]))
        i += 1

    labels1 = []
    i = 0
    for item in lab1[1:]:
        labels1.append(int(item.split(",")[2]))
        i += 1

    labels2 = []
    i = 0
    for item in lab2[1:]:
        labels2.append(int(item.split(",")[2]))
        i += 1

    label = np.array(labels)
    label1 = np.array(labels1)
    label2 = np.array(labels2)

    clf2 = RandomForestClassifier()
    clf = SVC(kernel='rbf')

    clf2.fit(X, label)
    clf.fit(X, label)
    preds2 = clf2.predict(Y)
    preds = clf.predict(Y)
    scores = clf2.predict(Y)
    scores1 = clf.decision_function(Y)

    score = np.amax(scores1, axis=1)

    fpr, tpr, thresholds = roc_curve(label1, scores, pos_label=2)
    fpr2, tpr2, thresholds2 = roc_curve(label1, score, pos_label=2)

    roc_auc = auc(fpr, tpr)
    roc_auc2 = auc(fpr2, tpr2)
    plt.figure()
    plt.plot(fpr2, tpr2, lw=1, label='(AUC = %0.2f)' % (roc_auc2))
    plt.plot(fpr, tpr, lw=1, label='(AUC = %0.2f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title("ROC curve SVM vs RF - Class Level")
    plt.legend(("SVM", "RandomForest"))
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    if not os.path.exists(dirpath_output):
        os.makedirs(dirpath_output)
    plt.savefig(dirpath_output + "/" + "ROC_Class")

    match2 = 0
    for i in range(preds2.shape[0]):
        if preds2[i] == label1[i]:
            match2 += 1
    accuracy2 = float(match2) / preds2.shape[0]
    p, r, f1, s = precision_recall_fscore_support(label1,
                                                  preds2,
                                                  average='weighted')
    C = confusion_matrix(label1, preds2)

    match = 0
    for i in range(preds.shape[0]):
        if preds[i] == label1[i]:
            match += 1
    accuracy = float(match) / preds.shape[0]
    p2, r2, f12, s = precision_recall_fscore_support(label1,
                                                     preds,
                                                     average='weighted')

    logger.info(
        'Test Accuracy, precision, recall and F1 Score for SVM model for class level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
        .format(accuracy, p2, r2, f12))
    logger.info(
        'Test Accuracy, precision, recall and F1 Score for Random Forest model for class level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
        .format(accuracy2, p, r, f1))
Code Example #19
def test_model(path_config, args=None):
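    """Run hierarchical K-Means clustering (10 order clusters -> 5 class
    clusters -> 3 phylum clusters) on the saved VAE predictions and save 2D
    and 3D PCA visualizations, including ground-truth-colored scatter plots."""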
    logger = utils.get_logger()
    predicted = np.load(path_config[args.model_name]['results'] + "/" +
                        args.model_name + "_predicted.npy")
    lab = np.genfromtxt(path_config[args.model_name]['test'],
                        delimiter='\n',
                        dtype=None,
                        encoding=None)
    labels = []
    scheme = [
        'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
        'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan'
    ]
    if args.model_name == "lstm_vae_ordinal":
        i = 0
        for item in lab[1:]:
            if item.split(",")[0][0] == "P":
                labels.append(0)
            elif item.split(",")[0][0] == "F":
                labels.append(1)
            else:
                labels.append(2)
            i += 1
    else:
        i = 0
        for item in lab[1:]:
            labels.append(float(item.split(",")[-1]))
            i += 1

    label = np.array(labels)

    clf = PCA(n_components=3)
    pred_new = clf.fit_transform(predicted)

    est = KMeans(n_clusters=10, init=predicted[:10])

    plotFigs3D(est, pred_new, 'hclustering_order_10-3D', args, predicted,
               path_config, scheme)

    est2 = KMeans(n_clusters=5)

    clf2 = PCA(n_components=3)

    pred_new2 = clf2.fit_transform(est.cluster_centers_)

    plotFigs3D(est2, pred_new2, 'hclustering_class_5-3D', args,
               est.cluster_centers_, path_config, scheme)

    est3 = KMeans(n_clusters=3)

    clf3 = PCA(n_components=3)
    pred_new3 = clf3.fit_transform(est2.cluster_centers_)

    plotFigs3D(est3, pred_new3, 'hclustering_phylum_3-3D', args,
               est2.cluster_centers_, path_config, scheme)

    fig = plt.figure(figsize=(4, 3))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

    labelslis = []
    for i in range(label.shape[0]):
        labelslis.append(scheme[int(label[i])])

    ax.scatter(pred_new[:, 0],
               pred_new[:, 1],
               pred_new[:, 2],
               c=labelslis,
               edgecolor='k')

    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    ax.set_title('3D_groundClustering')
    plt.savefig(path_config[args.model_name]['results'] +
                '/3D_groundClustering')

    logger.info('3D clustering completed, plot stored in ' +
                path_config[args.model_name]['results'])

    # from mpl_toolkits.mplot3d import Axes3D
    clf = PCA(n_components=2)
    pred_new = clf.fit_transform(predicted)

    est = KMeans(n_clusters=10, init=predicted[:10])

    plotFigs2D(est, pred_new, 'hclustering_order_10-2D', args, predicted,
               path_config, scheme)

    clf2 = PCA(n_components=2)
    pred_new2 = clf2.fit_transform(est.cluster_centers_)

    est2 = KMeans(n_clusters=5)

    plotFigs2D(est2, pred_new2, 'hclustering_class_5-2D', args,
               est.cluster_centers_, path_config, scheme)

    clf3 = PCA(n_components=2)
    pred_new3 = clf3.fit_transform(est2.cluster_centers_)

    est3 = KMeans(n_clusters=3)

    plotFigs2D(est3, pred_new3, 'hclustering_phylum_3-2D', args,
               est2.cluster_centers_, path_config, scheme)

    clf_ground = PCA(n_components=2)
    pred_new_ground = clf_ground.fit_transform(predicted)

    plt.figure(figsize=(4, 3))
    labelslis = []
    for i in range(label.shape[0]):
        labelslis.append(scheme[int(label[i])])

    plt.scatter(pred_new_ground[:, 0],
                pred_new_ground[:, 1],
                c=labelslis,
                edgecolor='k')

    plt.xlabel('PC1')
    plt.ylabel('PC2')
    # ax.set_zlabel('Petal length')
    plt.title('hclustering_ground2D')
    plt.savefig(path_config[args.model_name]['results'] +
                '/hclustering_ground2D')
    logger.info('2D clustering completed, plot stored in ' +
                path_config[args.model_name]['results'])