Example #1
0
def get_oof(clfs, raw_texts, raw_labels, test_data, word2index, attr_dict):
    """Compute out-of-fold predictions for one model from its fold checkpoints.

    For every fold the matching checkpoint is loaded, the fold's held-out
    training samples are scored, and the complete test set is scored too.
    The test-set outputs are then averaged over all folds.

    Reads the module-level ``args`` (``args.use_elmo`` and whatever ``Data``
    and ``train.predict_with_logit`` consume).

    Args:
        clfs: Checkpoint paths loadable with ``torch.load``; ``clfs[k]`` is
            paired with the k-th split produced by ``kfold_split``.
        raw_texts: Training texts, indexable by sample position.
        raw_labels: Training labels aligned with ``raw_texts``.
        test_data: Prepared test set; must expose ``sentences``.
        word2index: Vocabulary mapping forwarded to ``Data``.
        attr_dict: Attribute dictionary forwarded to ``Data``.

    Returns:
        Tuple ``(oof_train, oof_train_y, oof_test)`` of float arrays shaped
        ``(n_train, 10)``, ``(n_train, 10)`` and ``(n_test, 10)``.

    Side effects:
        Prints progress and writes ``oof_train.npy``, ``oof_train_y.npy`` and
        ``oof_test.npy`` into an ``npy`` directory next to ``clfs[0]``.
    """
    NFOLDS = len(clfs)
    n_train = len(raw_texts)
    n_test = len(test_data.sentences)
    # NOTE(review): hard-coded class count; presumably len(attr_dict) -- confirm.
    class_num = 10
    oof_train = np.zeros((n_train, class_num))
    oof_train_y = np.zeros((n_train, class_num))
    oof_test_skf = np.zeros((NFOLDS, n_test, class_num))

    folds = zip(kfold_split(n_train, NFOLDS), clfs)
    for kf, ((train_index, test_index), checkpoint) in enumerate(folds,
                                                                 start=1):
        print(checkpoint)
        clf = torch.load(checkpoint)
        print("FOLD:", kf)
        print("TRAIN:", str(len(train_index)), "TEST:", str(len(test_index)))

        # The held-out part of this fold acts as the dev set.
        dev_texts = [raw_texts[i] for i in test_index]
        dev_labels = [raw_labels[i] for i in test_index]
        dev_data = Data((dev_texts, dev_labels), word2index, attr_dict, args)
        if args.use_elmo != 0:
            dev_elmo = load_elmo(dev_texts)
            dev_data.add_feature(dev_elmo)

        with torch.no_grad():
            dev_predict, oof_dev = train.predict_with_logit(
                clf, dev_data, args)
        pred_acc_p = score(dev_predict, dev_data.labels)
        print("[p:%.4f, r:%.4f, f:%.4f] acc:%.4f" %
              (pred_acc_p[0], pred_acc_p[1], pred_acc_p[2], pred_acc_p[3]))

        oof_train[test_index] = oof_dev
        oof_train_y[test_index] = [
            l[0].detach().numpy() for l in dev_data.labels
        ]

        # Pure inference as well: keep it under no_grad like the dev pass so
        # no autograd graph is built for the whole test set.
        with torch.no_grad():
            _, oof_test_skf[kf - 1, :, :] = train.predict_with_logit(
                clf, test_data, args)

    # Average the per-fold test outputs into a single test-set matrix.
    oof_test = oof_test_skf.mean(axis=0)

    checkpoint_dir = os.path.dirname(clfs[0])
    npy_dir = os.path.join(checkpoint_dir, 'npy')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(npy_dir, exist_ok=True)
    print(checkpoint_dir)
    np.save(os.path.join(npy_dir, "oof_train"), oof_train)
    np.save(os.path.join(npy_dir, "oof_train_y"), oof_train_y)
    np.save(os.path.join(npy_dir, "oof_test"), oof_test)
    return oof_train, oof_train_y, oof_test
Example #2
0
def test():
    """Predict attributes for the test file and write them to ``test_predict.txt``.

    Loads a fixed ``AttributeClassifier`` checkpoint, the test texts, the
    word-embedding vocabulary and the attribute list, runs ``train.predict``
    (with the module-level ``args``) and writes one line per test sample: the
    names of all attributes whose predicted value is non-zero, joined by ``|``.
    """
    model = AttributeClassifier()
    check_point = "checkpoint_AttA3_0.8810.pt"
    model.load_model(check_point)

    test_file = "data/attribute_test.txt"
    test_texts = load_test_data(test_file)
    f_w2v = "../embedding/embedding_all_merge_300.txt"
    # Only the vocabulary is needed here; the embedding matrix is discarded.
    _, word2index = load_w2v(f_w2v)

    f_dict = "../dataset/attribute.json"
    attr_list, _ = parse_json(f_dict)

    test_data = Data((test_texts, None), word2index)

    test_predict = train.predict(model.classifier, test_data, args)
    print(test_predict)

    # The context manager guarantees the predictions are flushed and the
    # handle is closed, even if writing fails part-way through.
    with codecs.open("test_predict.txt", 'w', encoding='utf-8') as fw:
        for p in test_predict:
            attributes = [attr_list[i] for i, l in enumerate(p) if l != 0]
            fw.write('|'.join(attributes) + '\n')
    def predict(self, rnn, test_raw_data, word2index, args):
        """Run ``rnn`` on raw test data, print precision/recall/F and return the predictions.

        Args:
            rnn: Model handed to the scope-level ``predict`` function.
            test_raw_data: Indexable with three aligned entries: ``[0]`` the
                tokenised texts, ``[1]`` integer sequences turned into
                ``LongTensor`` inputs, ``[2]`` integer sequences turned into
                ``LongTensor`` labels.
            word2index: Vocabulary mapping passed to ``self.to_tensor``.
            args: Run configuration; ``args.use_elmo`` enables ELMo features
                and ``args.ds`` selects the ``data/<ds>/elmo_layers.hdf5`` file.

        Returns:
            Whatever the scope-level ``predict`` returns for the test data.
        """
        test_texts = test_raw_data[0]
        test_t = test_raw_data[1]
        test_ow = test_raw_data[2]

        test_input_s = [self.to_tensor(s, word2index) for s in test_texts]

        test_elmo = []
        if args.use_elmo:
            import h5py
            # The context manager closes the HDF5 file even when a sentence
            # key is missing and the lookup raises.
            with h5py.File('data/%s/elmo_layers.hdf5' % args.ds,
                           'r') as elmo_dict:
                for current_sentence in test_texts:
                    # Embeddings are keyed by the space-joined sentence.
                    sentence_key = ' '.join(current_sentence)
                    embeddings = torch.from_numpy(
                        np.asarray(elmo_dict[sentence_key]))
                    test_elmo.append(embeddings)

        test_input_t = [torch.LongTensor(t) for t in test_t]
        test_y_tensor = [torch.LongTensor(y) for y in test_ow]
        test_data = Data(args,
                         test_input_s,
                         test_input_t,
                         test_y_tensor,
                         features=test_elmo)
        with torch.no_grad():
            # NOTE(review): the bare name ``predict`` is not looked up on
            # ``self``; it resolves through the enclosing/module scope.
            # Confirm that this is the intended function.
            test_predict = predict(rnn, test_data, args)
        pred_acc_t = score(test_predict, test_data.labels)
        print("p:%.4f, r:%.4f, f:%.4f" %
              (pred_acc_t[0], pred_acc_t[1], pred_acc_t[2]))
        return test_predict
Example #4
0
def dev():
    """Score a fixed checkpoint on one held-out fold of the training data.

    Loads the ``AttributeClassifier`` checkpoint, rebuilds the K-fold split of
    the attribute training set (``args.folds`` folds), takes the held-out part
    of the split at index 2, predicts on it and prints the result of ``score``.
    """
    model = AttributeClassifier()
    check_point = "checkpoints5/checkpoint_AttA3_0.8666.pt"
    model.load_model(check_point)

    f_train = "data/attribute_data.txt"
    f_w2v = "../embedding/embedding_all_merge_300.txt"
    f_dict = "../dataset/attribute.json"
    print(f_w2v)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    # Only the vocabulary and the attribute dictionary are used below.
    _, word2index = load_w2v(f_w2v)
    _, attr_dict = parse_json(f_dict)

    # NOTE(review): the fold index is fixed; presumably it matches the fold
    # the checkpoint above was validated on -- confirm.
    held_out_fold = 2
    _, test_index = kfold_split(len(raw_texts), args.folds)[held_out_fold]
    test_texts = [raw_texts[i] for i in test_index]
    test_labels = [raw_labels[i] for i in test_index]
    test_data = Data((test_texts, test_labels), word2index, attr_dict, args)

    test_predict = train.predict(model.classifier, test_data, args)
    pred_acc_t = score(test_predict, test_data.labels)
    print(pred_acc_t)
Example #5
0
    def train_from_data(self,
                        train_raw_data,
                        test_raw_data,
                        W,
                        word2index,
                        attr_dict,
                        args,
                        Fold=0):
        """Build the network named by ``args.model``, train it and optionally save it.

        Args:
            train_raw_data: Raw training data; element ``[0]`` holds the
                tokenised sentences. Passed as a whole to ``Data``.
            test_raw_data: Raw evaluation data with the same layout.
            W: Embedding matrix; ``W.shape[1]`` is the embedding width and
                ``len(W)`` the vocabulary size.
            word2index: Vocabulary mapping (token -> row of ``W``).
            attr_dict: Attribute dictionary; its length is the output size.
            args: Run configuration (``model``, ``use_elmo``, ``check_dir``,
                ``save`` are read here).
            Fold: Fold number embedded in the checkpoint file name.

        Side effects:
            Sets ``self.classifier`` and, when ``args.save != 0``, writes the
            best state returned by ``train.train`` to
            ``<check_dir>/checkpoint_<model>_<max_acc>_<Fold>.pt``.
        """
        word_embed_dim = W.shape[1]
        vocab_size = len(W)
        output_size = len(attr_dict)

        # Every supported architecture shares the same constructor signature,
        # so the class is looked up by name instead of one branch per model.
        # An unrecognised ``args.model`` leaves ``self.classifier`` untouched,
        # exactly as the former if/elif chain without ``else`` did.
        known_models = ('LSTM', 'Fasttext', 'Average_LSTM2', 'AttA3',
                        'Binary_LSTM', 'CNN', 'Attn_LSTM')
        if args.model in known_models:
            network_cls = getattr(networks, args.model)
            self.classifier = network_cls(word_embed_dim, output_size,
                                          vocab_size, args)

        if args.model == 'AttA3':
            # Initialise the aspect embeddings from the word vectors of the
            # attribute names. '舒适性' is looked up as '舒适' (presumably
            # because only the shorter form is in the vocabulary -- confirm).
            aspect_e_l = [
                torch.FloatTensor(W[word2index['舒适' if a == '舒适性' else a]])
                for a in attr_dict
            ]
            # NOTE(review): cat along dim 0 flattens 1-D word vectors into one
            # long vector; confirm this is the shape AE.weight expects.
            aspect_embeds = torch.cat(aspect_e_l, 0)
            self.classifier.AE.weight = torch.nn.Parameter(aspect_embeds)

        train_elmo, test_elmo = [], []

        if args.use_elmo != 0:
            import h5py

            def elmo_key(tokens):
                # Sentences are stored under their tab-joined tokens with '.'
                # and '/' replaced by placeholder strings.
                sentence = '\t'.join(tokens)
                sentence = sentence.replace('.', '$period$')
                return sentence.replace('/', '$backslash$')

            # The context manager closes the file even if a key is missing.
            with h5py.File('../embedding/embeddings_elmo_ly-1.hdf5',
                           'r') as elmo_dict:
                train_elmo = [
                    torch.from_numpy(np.asarray(elmo_dict[elmo_key(s)]))
                    for s in train_raw_data[0]
                ]
                test_elmo = [
                    torch.from_numpy(np.asarray(elmo_dict[elmo_key(s)]))
                    for s in test_raw_data[0]
                ]
            print("finish elmo")

        train_data = Data(train_raw_data, word2index, attr_dict, args)
        test_data = Data(test_raw_data, word2index, attr_dict, args)
        if args.use_elmo != 0:
            train_data.add_feature(train_elmo)
            test_data.add_feature(test_elmo)

        # test_data is passed twice -- presumably as both the dev and the
        # test set; confirm against train.train's signature.
        best_dict, max_acc = train.train(self.classifier,
                                         train_data,
                                         test_data,
                                         test_data,
                                         attr_dict,
                                         W,
                                         args=args)
        best_model = "%s/checkpoint_%s_%.6f_%d.pt" % (
            args.check_dir, args.model, max_acc, Fold)
        if args.save != 0:
            torch.save(best_dict, best_model)
Example #6
0
def stacking():
    """Stack several K-fold models with one logistic-regression meta-classifier per class.

    ``args.test_dir`` is a ``#``-separated list of directories, one per base
    model, each containing that model's fold checkpoints. For every model the
    out-of-fold training predictions and fold-averaged test predictions are
    either loaded from ``<dir>/npy`` (when ``args.saved != 0`` and the cache
    exists) or computed with ``get_oof``. Per class, a ``LogisticRegression``
    is fitted on the stacked out-of-fold predictions and applied to the
    stacked test predictions.

    Side effects:
        Writes ``../data/test_predict_aspect_ensemble.txt`` (one line per test
        sample: the ``|``-joined names of all attributes whose probability
        exceeds ``args.threshold``, or the single most probable attribute if
        none does) and copies it to a time-stamped file in ``../data/backup``.
    """
    saved = args.saved != 0
    f_train = "../data/train.txt"
    test_file = "../data/test.txt"
    test_texts = load_test_data(test_file)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    # NOTE: unpickling executes arbitrary code; only load a trusted file.
    with open("../data/vocabulary.pkl", 'rb') as vocab_file:
        word2index = pickle.load(vocab_file)

    f_dict = "../dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict)

    paths = args.test_dir.split('#')
    # For every model directory: the regular files directly inside it.
    models_files = [[
        os.path.join(path, f) for f in os.listdir(path)
        if os.path.isfile(os.path.join(path, f))
    ] for path in paths]

    test_data = Data((test_texts, None), word2index)
    if args.use_elmo != 0:
        test_elmo = load_elmo(test_texts)
        test_data.add_feature(test_elmo)

    x_train = []
    # None marks "no labels seen yet". Comparing an ndarray against [] (the
    # former sentinel) is an invalid element-wise comparison that raises on
    # current NumPy as soon as a second model is processed.
    y_train = None
    x_test = []
    for model_dir, checkpoints_per_model in zip(paths, models_files):
        print(model_dir, checkpoints_per_model)
        if saved and os.path.isfile(
                os.path.join(model_dir, 'npy', "oof_train.npy")):
            oof_train, oof_train_y, oof_test = load_oof(model_dir)
        else:
            NFOLDS = len(checkpoints_per_model)
            print(NFOLDS)
            assert NFOLDS == args.folds
            clfs = [None for i in range(NFOLDS)]
            for cp in checkpoints_per_model:
                # The 1-based fold number is the last '_' / '.'-separated
                # field before the file extension.
                fold = int(cp.replace('_', '.').split('.')[-2])
                print(fold)
                clfs[fold - 1] = cp
            oof_train, oof_train_y, oof_test = get_oof(clfs, raw_texts,
                                                       raw_labels, test_data,
                                                       word2index, attr_dict)
        x_train.append(oof_train)
        if y_train is None:
            y_train = oof_train_y
        else:
            # Every base model must have been evaluated on the same labels.
            assert (y_train == oof_train_y).all()
        x_test.append(oof_test)
    # Resulting layout: (samples, classes, models).
    x_train = np.stack(x_train, axis=2)
    x_test = np.stack(x_test, axis=2)

    print(x_train.shape)
    num_train = x_train.shape[0]
    num_test = x_test.shape[0]
    test_predict = []
    for c in range(x_train.shape[1]):
        x_train_c = x_train[:, c, :].reshape(num_train, -1)
        x_test_c = x_test[:, c, :].reshape(num_test, -1)
        meta_clf_c = LogisticRegression()
        y_train_c = y_train[:, c]
        meta_clf_c.fit(x_train_c, y_train_c)
        # Column 1 is the second entry of classes_ -- presumably the positive
        # label; this requires both classes to occur in y_train_c.
        test_predict_c = meta_clf_c.predict_proba(x_test_c)[:, 1]
        test_predict.append(test_predict_c)

    test_predict = np.stack(test_predict, axis=1)
    print(test_predict.shape)

    out_path = "../data/test_predict_aspect_ensemble.txt"
    with codecs.open(out_path, 'w', encoding='utf-8') as fw:
        for prob in test_predict:
            voted = [0 for a in range(len(attr_list))]
            for i, p in enumerate(prob):
                if p > args.threshold:
                    voted[i] = 1
            # Never emit an empty line: fall back to the most probable class.
            if sum(voted) == 0:
                voted[prob.argmax()] = 1
            attributes = [attr_list[i] for i, l in enumerate(voted) if l != 0]
            fw.write('|'.join(attributes) + '\n')

    # Join the pieces: formatting the split() list directly with %s produced
    # file names such as "..._['Mon', 'Jan', ...].txt".
    time_stamp = '_'.join(time.asctime().replace(':', '_').split())
    backup_dir = "../data/backup"
    os.makedirs(backup_dir, exist_ok=True)
    shutil.copy2(
        out_path,
        "../data/backup/test_predict_aspect_ensemble_%s.txt" % time_stamp)
#TODO We should look at metadata and discard wrong samples

#IDEA:
'''

- extract the bacteria that are present in most of the samples and try to do analysis on them
- a second option would be to select bacteria whose amount varies a lot between the samples
- look for a correlation that is high in absolute value (negative or positive) between the presence of a drug and the presence of the resistant bacteria
- cluster the samples based on the presence of bacteria (and their amount); we could use a number of clusters equal to #num_drugs*2 or *3 (should still be readable) => use colors for countries
- linear regression (input: drugs and their doses => output: amount of each bacterium)

'''

# Load the data files
# (Data() is called without arguments; which files it reads is decided
# inside the Data class, not here.)
data = Data()

# Cumulative figures as returned by Data.get_cummulatives(); presumably one
# entry per country, judging by the variable name -- confirm the structure.
cummulative_per_country = data.get_cummulatives()
'''
Plots of cummulative number of resistent bacterias per sample
Grouped by country
2 variants - absolute & normalized
'''
# Draw the cumulative plots described in the note above.
plots.cummulative(cummulative_per_country)
'''
Binned cummulative number of resistent bacterias per sample
Grouped by country
2 variants - absolute & normalized
'''
# The binned variants are currently disabled.
# plots.binned_cummulative(cummulative_per_country, bins=1)
# plots.binned_cummulative(cummulative_per_country, bins=2)
from utils.Data import Data
from factories.NetworkFactory import NetworkFactory
from utils.Tags import NetworkTags
from utils.Functions import ActivaionFunction
from utils.NetworkHelper import NetworkHelper

# Script: pre-train an auto-encoder on the Iris training sheet, merge it into
# an MLP, widen the first hidden layer and train the merged network.

# Training data. The second argument (125) is presumably the number of rows
# to read, and shufle_data=True presumably shuffles them -- confirm in
# utils.Data.
data = Data("resources/IrisDataTrain.xls", 125, shufle_data=True)
data.label_iris_dat()
normalized_data = data.normalize_data()

# Auto-encoder: input-sized layer -> 3 units -> input-sized layer, trained to
# reproduce its own (normalized) input.
layers = [data.def_input_neurons(), 3, data.def_input_neurons()]
encoder = NetworkFactory.create(NetworkTags.AutoEncoder, layers)
encoder.train(normalized_data,
              normalized_data,
              repetitions=500,
              activation_function=ActivaionFunction.HYPERBOLIC_TANGENT)
# Presumably drops the decoder part so only the encoder remains -- confirm.
encoder.remove_unneeded_layers()

# MLP with hidden layers of 15 and 7 units; `layers` is re-bound here.
layers = [data.def_input_neurons(), 15, 7, data.def_output_neurons()]
mlp = NetworkFactory.create(NetworkTags.MLPWithEachLayerConnection, layers)
# Combine the pre-trained encoder with the MLP, then add 6 neurons to the
# first hidden layer of the merged network.
merged = NetworkHelper.merge_autoencoder_to_mlp(encoder, mlp)
finnal = NetworkHelper.add_neurons_to_first_hidden_layer(merged, 6)
finnal.update_id()

# Train the merged network on the normalized inputs and the Iris labels.
finnal.train(data.normalize_data(),
             data.label_iris_dat(),
             repetitions=500,
             activation_function=ActivaionFunction.HYPERBOLIC_TANGENT)

# Data set used for prediction, loaded with shufle_data=False.
predict_data = Data("resources/IrisData.xls", 150, shufle_data=False)
returned_value = finnal.predict(
    predict_data.normalize_data(),
from utils.Data import Data
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cross_validation import train_test_split
from collections import Counter
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import MaxAbsScaler
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm



# Script: normalise the gene-count table in place, first per individual by
# its 'norm_Bacteria_pairs' metadata value, then by maximum absolute value.
data = Data()

# Index the metadata by sample code and convert the index values to strings.
# NOTE(review): `unicode` is a Python 2 builtin; under Python 3 this line
# raises NameError.
data.metadata.set_index('sample_code', inplace=True)
data.metadata.index = data.metadata.index.map(unicode)
# Cast everything except the first row and first column to float.
data.gene_counts_df.iloc[1:,1:] = data.gene_counts_df.iloc[1:,1:].astype(float)

# data.gene_counts_df.loc[:,data.metadata.index].astype(float, copy=False)

# Not filled anywhere in the visible part of the script.
small_cluster_individuals = []
# Normalize with respect to number of pairs
# Each individual's column (rows from 1 on) is divided element by element by
# that individual's 'norm_Bacteria_pairs' entry in the metadata.
# NOTE(review): `.ix` was deprecated in pandas 0.20 and removed in 1.0.
for individual in data.metadata.index:
    data.gene_counts_df.ix[1:, individual] = data.gene_counts_df.ix[1:, individual].apply(lambda x: np.divide(x, data.metadata.ix[individual, 'norm_Bacteria_pairs']))

# copy=False asks the scaler to work in place where possible.
scaler = MaxAbsScaler(copy=False)
# Normalized by max value
# All columns but the first are scaled, each by its own maximum absolute value.
data.gene_counts_df[data.gene_counts_df.columns[1:]] = scaler.fit_transform(data.gene_counts_df[data.gene_counts_df.columns[1:]])
# Remove false poultry from BG, bad label DE, low reads DE
Example #10
0
from utils.Data import Data
from factories.NetworkFactory import NetworkFactory
from utils.Tags import NetworkTags
from utils.WinnerHolder import WinnerHolder

# Script: train a self-organising map (NetworkTags.SOM) on the Iris training
# sheet.

# Training data. The second argument (125) is presumably the number of rows
# to read, and shufle_data=True presumably shuffles them -- confirm in
# utils.Data.
data = Data("resources/IrisDataTrain.xls", 125, shufle_data=True)
data.label_iris_dat()
# Computed but not used below: training is done on the raw data.
normalized_data = data.normalize_data()
# Input-sized first layer followed by four layers of size 4.
layers = [data.def_input_neurons(), 4, 4, 4, 4]
som = NetworkFactory.create(NetworkTags.SOM, layers)
# som.print()
# Third argument (2000) is presumably the number of training repetitions --
# confirm against the SOM train signature.
som.train(data.get_raw_data(), data.label_iris_dat(), 2000)
# Disabled debugging output:
# som.print()
# print("=============================================")
# print(WinnerHolder.get_winner())
# print("distance")
# print(som.layers_list[1].neuron_vector[0])
Example #11
0
from utils.Data import Data
from factories.NetworkFactory import NetworkFactory
from utils.Tags import NetworkTags

from utils.Functions import ActivaionFunction
import time

# Script: train an MLP on the Iris training sheet, predict on the full Iris
# sheet and print every label next to the network output.

# Start time; not read again in the visible part of the script.
time_start = time.time()

# Training data. The second argument (125) is presumably the number of rows
# to read, and shufle_data=True presumably shuffles them -- confirm in
# utils.Data.
data = Data("resources/IrisDataTrain.xls", 125, shufle_data=True)
data.label_iris_dat()
normalized_data = data.normalize_data()
# Show the first normalized sample as a quick sanity check.
print(normalized_data[0])

# Hidden layers of 15 and 10 units between the input and output layers.
layers = [data.def_input_neurons(), 15, 10, data.def_output_neurons()]
mlp = NetworkFactory.create(NetworkTags.MLPWithContiguousConnection, layers)

mlp.train(data.normalize_data(),
          data.label_iris_dat(),
          repetitions=500,
          activation_function=ActivaionFunction.SIGMOID)
# Prediction data, loaded with shufle_data=False.
predict_data = Data("resources/IrisData.xls", 150, shufle_data=False)
returned_value = mlp.predict(predict_data.normalize_data(),
                             predict_data.label_iris_dat(),
                             activation_function=ActivaionFunction.SIGMOID)
labels = predict_data.label_iris_dat()
# Never updated in the visible part of the script.
counter = 0

# Print the label and the network output for every predicted sample.
for i in range(len(returned_value)):
    print("label: {0}".format(labels[i]))
    print("output: {0}".format(returned_value[i]))
def _normalize_dataset(kms, prices):
    """Pair every normalized mileage with its normalized price.

    Each value is passed to ``normalize`` together with the whole series it
    belongs to; the two normalized series are then zipped position by
    position into ``Data`` records (stopping at the shorter series).

    Returns:
        A list of ``Data(normalized_km, normalized_price)`` objects.
    """
    scaled_kms = (normalize(km, kms) for km in kms)
    scaled_prices = (normalize(price, prices) for price in prices)
    records = []
    for x, y in zip(scaled_kms, scaled_prices):
        records.append(Data(x, y))
    return records