Example #1
def vqa_answer2(imgFeat, question, imageid, socketid):
    imgFeat = np.asarray([imgFeat])

    ques = process_string(question).encode('utf-8')
    print "Ques:", ques
    print "imgFeat Shape:", imgFeat.shape
    train_q_toked = qtok.texts_to_sequences([ques])
    # q_word_index = qtok.word_index
    train_q_toked = sequence.pad_sequences(train_q_toked, maxlen=20)
    print "train token:", train_q_toked.shape

    # max_question_features = len(q_word_index.keys()) + 1
    train_q_toked = np.asarray(train_q_toked)
    predictions = model.predict([train_q_toked, imgFeat])
    print predictions
    label_set = sorted(range(len(predictions[0])), key=lambda x: predictions[0][x], reverse=True)[:5]
    a_word_rev = dict((v, k) for k, v in a_word_idx.items())

    ans = []
    for i in range(5):
        idx = label_set[i] + 1
        ans.append([a_word_rev[idx], predictions[0][idx - 1]])
        print 'Top %d predicted answer: %s (score %.6f)' % (i + 1, a_word_rev[idx], predictions[0][idx - 1])

    web_result = {}
    web_result[imageid] = ans
    r.publish('result-rest', json.dumps({'web_result': web_result}))
    r.publish('chat', json.dumps({'web_result': json.dumps(web_result), 'socketid': str(socketid)}))

    return ans
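
All of these snippets lean on np_utils.categorical_probas_to_classes, which in Keras 1.x was just a thin wrapper around np.argmax. A minimal drop-in sketch (handy under Keras 2, where the helper was removed):

import numpy as np

def categorical_probas_to_classes(p):
    # Same behaviour as the Keras 1.x helper: pick the most probable
    # class index for each row of an (n_samples, n_classes) array.
    return np.argmax(p, axis=1)

probas = np.array([[0.1, 0.7, 0.2],
                   [0.8, 0.1, 0.1]])
print(categorical_probas_to_classes(probas))  # -> [1 0]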
Example #2
def cateAccuracy(model_fitted, X_test, Y_test):
    Y_test = categorical_probas_to_classes(Y_test)
    Y_predict = model_fitted.predict(X_test)
    if len(Y_predict.shape) != 1:
        # convert one-hot / probability output to class labels
        Y_predict = categorical_probas_to_classes(Y_predict)

    accu_count = {}
    accu_total = {}
    for cat in set(Y_test):
        total = list(Y_test).count(cat)
        accu_total[cat] = total
        accu_count[cat] = 0

    for iidx, cat in enumerate(Y_test):
        if cat == Y_predict[iidx]:
            accu_count[cat] = accu_count[cat] + 1
    labels = sorted(set(Y_test))
    total_correct = sum(accu_count[c] for c in labels)
    total_samples = sum(accu_total[c] for c in labels)
    print(total_correct / float(total_samples))  # overall accuracy
    return [accu_count[c] / float(accu_total[c]) for c in labels]
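
A toy check of cateAccuracy, with a stub in place of a fitted classifier (the stub and data below are illustrative, not from the original project):

import numpy as np
from keras.utils.np_utils import to_categorical, categorical_probas_to_classes

class StubModel(object):
    # hypothetical stand-in for a fitted model; predict() returns class ids
    def predict(self, X):
        return np.array([0, 1, 0, 2])

Y_test = to_categorical(np.array([0, 1, 1, 2]), 3)
print(cateAccuracy(StubModel(), None, Y_test))
# prints overall accuracy 0.75, returns per-class accuracies [1.0, 0.5, 1.0]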
Example #3
def fn_model(fn, stype='testing'):
    # parse config options to use (except for LR, which conflicts with the . param separator)
    p = dict([
        x.split(":")
        for x in os.path.dirname(fn.replace("tmp/w/", "")).replace(
            "lr:0.001.", "").split(".")
    ])

    strk = ['af', 'ptype']  # force string conversion
    boolk = ['w2v', 'randposts', 'noempty', 'etrain', 'balbatch',
             'cosine']  # force bool conversion
    for k in p:
        if k == 'prep':
            if p[k].lower() == 'none':
                p[k] = None
            else:
                p[k] = str(p[k])
        elif k in strk:
            p[k] = str(p[k])
        elif k in boolk:
            p[k] = (p[k] == 'True')
        else:
            p[k] = int(p[k])
    p["lr"] = 0.001

    tf.reset_default_graph()
    model, _ = build_model(p)
    model.load_weights(fn)

    _, genf = datagen(p['max_posts'],
                      p['max_length'],
                      stype=stype,
                      force_full=True,
                      mintf=p['mintf'],
                      mindf=p['mindf'],
                      noempty=p['noempty'],
                      prep=p['prep'],
                      batch_size=9999999999)
    for i, (X, y) in enumerate(genf()):
        assert i == 0, "test set should contain only one batch (and it should not be sampled)"
    val_X, val_y = X, categorical_probas_to_classes(y)

    y_pred = categorical_probas_to_classes(model.predict(val_X, batch_size=32))

    y_true = val_y
    posf1 = sklearn.metrics.f1_score(y_true,
                                     y_pred,
                                     pos_label=1,
                                     average='binary')
    posp = sklearn.metrics.precision_score(y_true,
                                           y_pred,
                                           pos_label=1,
                                           average='binary')
    posr = sklearn.metrics.recall_score(y_true,
                                        y_pred,
                                        pos_label=1,
                                        average='binary')

    return (posf1, posp, posr), y_pred, val_y, y_true
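
The option parsing at the top of fn_model packs hyperparameters into the directory name as dot-separated key:value pairs (lr is stripped first because its value contains a dot, as the comment notes). A quick illustration with a made-up path:

import os

fn = "tmp/w/max_posts:100.max_length:50.lr:0.001.w2v:True.af:relu/weights.h5"
p = dict(x.split(":") for x in
         os.path.dirname(fn.replace("tmp/w/", "")).replace("lr:0.001.", "").split("."))
print(p)  # {'max_posts': '100', 'max_length': '50', 'w2v': 'True', 'af': 'relu'}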
Example #4
def modelMetrics(model_fitted, X_test, Y_test):
    Y_predict = model_fitted.predict(X_test)
    if len(Y_predict.shape) != 1:
        # convert one-hot / probability output to class labels
        Y_predict = categorical_probas_to_classes(Y_predict)
    Y_true = categorical_probas_to_classes(Y_test)
    report = classification_report(Y_true, Y_predict)  # per-class precision/recall/F1
    accuracy = accuracy_score(Y_true, Y_predict)  # overall accuracy
    return report, accuracy
Example #5
def cnn1d_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                   maxlen, vocab_size, embd_dim,
                   nb_filter, filter_length, batch_size, nb_epoch, optm):
    """
    - CNN-1d on text input (represented in int)
    - MOT
    - dropout + L2 softmax

    :param <X, Y> train and test sets
    :param nb_classes # of classes
    :param maxlen maximum sentence length (in tokens)
    :param vocab_size
    :param embd_dim
    :param nb_filter
    :param filter_length
    :param batch_size
    :param nb_epoch
    :param optm optimizer options, e.g., adam, rmsprop, etc.
    :return:
    """
    pool_length = maxlen - filter_length + 1

    model = Sequential()
    model.add(Embedding(vocab_size, embd_dim, input_length=maxlen))
    model.add(Dropout(0.25))

    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode="valid",
                            activation="relu"))
    model.add(MaxPooling1D(pool_length=pool_length))

    model.add(Flatten())
    model.add(Dropout(0.5))

    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optm)

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test accuracy:', acc)
    # return(acc)
    kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return (kappa)
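
A hypothetical invocation of cnn1d_selfembd with random toy data, just to show the expected shapes (integer-encoded sequences plus one-hot labels); the sizes below are invented:

import numpy as np
from keras.utils import np_utils

X_train = np.random.randint(1, 5000, size=(100, 50))   # 100 docs, maxlen=50
Y_train = np_utils.to_categorical(np.random.randint(0, 4, 100), 4)
X_test = np.random.randint(1, 5000, size=(20, 50))
Y_test = np_utils.to_categorical(np.random.randint(0, 4, 20), 4)

kappa = cnn1d_selfembd(X_train, Y_train, X_test, Y_test, nb_classes=4,
                       maxlen=50, vocab_size=5000, embd_dim=64,
                       nb_filter=32, filter_length=5,
                       batch_size=16, nb_epoch=2, optm='adam')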
Example #6
def test():
    with open("save_weight.pickle", mode="rb") as f:
        weights = pickle.load(f)

    model = Sequential()
    model.add(Dense(output_dim=100, input_dim=28*28))
    model.add(Activation("relu"))
    model.set_weights(weights)

    layer1_value = model.predict(X_test[:5])
    y_pred = np_utils.categorical_probas_to_classes(layer1_value)
    Y = np_utils.categorical_probas_to_classes(y_test[:5])
    print np_utils.accuracy(y_pred, Y)
    print y_pred.shape
Example #7
def run_algorithm(train,test):
    EPOCHS=100
    BATCHES=124

    m=train.shape[1]-1

    model=NN_model(m,10,128,0.5)
    X_train=train.iloc[:,1:].values

    y_train=np_utils.to_categorical(train[target].values)

    model.fit(X_train,y_train, nb_epoch=EPOCHS, batch_size=BATCHES,shuffle=True,verbose=1,validation_split=0.2)
    X_test=test.values
    ans_m=X_test.shape[0]
    print("xtest=",X_test.shape)
    ans = model.predict_proba(X_test, verbose=0)
    ans = np_utils.categorical_probas_to_classes(ans)

    # ans = np.array(ans).reshape((ans_m,))

    print("ans.shape=",ans.shape)
    csvfile = 'results/keras-naive.csv'
    writer = csv.writer(open(csvfile,'w'), lineterminator='\n')
    writer.writerow(["ImageId", "Label"])
    for i,x in enumerate(ans):
        writer.writerow([i+1,x])
Example #8
def main():
    rev_by_star = get_data()
    X_train, Y_test, X_target, Y_target = split_data(rev_by_star)
    X_train = X_train.toarray()
    Y_test = Y_test.toarray()
    #Y_target = trans_target(Y_target)
    #X_target = trans_target(X_target)
    input_num = 1000
    output_num = 5
    X_target = to_categorical(X_target, 5)
    #Y_target = to_categorical(Y_target, 5)
    #data = np.random.random((2148051, input_num))
    #labels = np.random.randint(output_num, size=(2148051, 1))
    #print(X_target[0])
    #print(X_train.dtype, X_target.dtype)
    #labels = to_categorical(labels, 5)
    #print("data shape", data.dtype)
    #print("label shape", labels.dtype)
    print(type(X_train))
    model = build_model(input_num, output_num)
    model.fit(X_train,
              X_target,
              batch_size=128,
              nb_epoch=5,
              validation_split=0.25)
    #model.fit(data, labels, batch_size=32, nb_epoch=10)
    Y_pred = model.predict(Y_test)
    Y_pred = categorical_probas_to_classes(Y_pred)
    print("Accuracy is : %.2f" % ((Y_target == Y_pred).sum() * 1.0 /
                                  (1.0 * Y_test.shape[0])))
    plot_confusion_matrix(Y_pred, Y_target, "neural_network")
Example #9
def report_performance():
    y_pred = model.predict(X_dev.as_matrix(), batch_size=2000, verbose=2)
    pred_classes = np_utils.categorical_probas_to_classes(y_pred)
    conf_matrix = confusion_matrix(y_dev, pred_classes)
    print conf_matrix
    print "accuracy", accuracy_score(y_dev, pred_classes)
    print "f1", f1_score(y_dev, pred_classes, average='weighted')
Example #10
def combinationPredict(predict, samples_test):
    labels_samples, merger_min, merger_max, merger_sum, merger_pro = add.fusoesDiego(
        predict, samples_test)

    classSeg = np_utils.categorical_probas_to_classes(predict)
    classMin = np_utils.categorical_probas_to_classes(merger_min)
    classMax = np_utils.categorical_probas_to_classes(merger_max)
    classSom = np_utils.categorical_probas_to_classes(merger_sum)
    classPro = np_utils.categorical_probas_to_classes(merger_pro)

    print()
    print("Min: " + str(acc(labels_samples, classMin)))
    print("Max: " + str(acc(labels_samples, classMax)))
    print("Sum: " + str(acc(labels_samples, classSom)))
    print("Product: " + str(acc(labels_samples, classPro)))
    print()
Example #12
File: train.py Project: techbala/ml
def predict(model, x, y, ix, output_dir):
    """
    Store predictions in a CSV file and predicted probabilities in an NPZ file.
    """

    y_proba_pred = model.predict(x)
    np.savez_compressed(output_dir + '/predictions_proba.npz',
                        y_proba_pred=y_proba_pred)

    df = pd.DataFrame({
        'y_pred': np_utils.probas_to_classes(y_proba_pred),
        'y_true': np_utils.categorical_probas_to_classes(y)
    })

    df['accurate'] = df['y_true'] == df['y_pred']

    df['split'] = ''
    for key, indexes in ix.items():
        df.ix[indexes, 'split'] = key

    df = df[['split', 'y_true', 'y_pred', 'accurate']]

    df.to_csv(output_dir + '/predictions.csv', index=None)

    return y_proba_pred
Example #13
def sample_count(Y):
    sample_count_train = {}
    if len(Y.shape) != 1:
        # convert one-hot encoding to class labels
        Y = np_utils.categorical_probas_to_classes(Y)
    for i in set(Y):
        sample_count_train[i] = list(Y).count(i)
    return sample_count_train
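
For example, with one-hot labels the helper above returns a per-class sample count:

from keras.utils import np_utils

Y = np_utils.to_categorical([0, 0, 1, 2, 2, 2], 3)
print(sample_count(Y))  # {0: 2, 1: 1, 2: 3}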
Example #14
def keras_model():

    import pandas as pd
    import numpy as np

    from keras.preprocessing import sequence
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Activation, Flatten
    from keras.layers.convolutional import Convolution1D, MaxPooling1D
    from keras.callbacks import EarlyStopping
    from keras.utils import np_utils

    from data_util import load_csvs, load_other
    import ml_metrics as metrics

    nb_words = 6500
    maxlen = 175
    filter_length = 10
    other_col_dim = 4

    X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/tpov4/train_1.csv',
                                                             'data/tpov4/test_1.csv',
                                                              nb_words, maxlen, 'self', w2v=None)

    # read _other.csv
    other_train = load_other('data/tpov4/train_1_other.csv', maxlen, other_col_dim)
    other_test = load_other('data/tpov4/test_1_other.csv', maxlen, other_col_dim)

    print('other tensor:', other_train.shape)

    pool_length = maxlen - filter_length + 1

    model = Sequential()
    model.add(Convolution1D(nb_filter=50,
                            filter_length=filter_length,
                            border_mode="valid", activation="relu",
                            input_shape=(maxlen, other_col_dim)))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Flatten())
    model.add(Dropout(0.05))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer={{choice(['rmsprop', 'adam', 'adadelta', 'adagrad'])}})

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

    model.fit(other_train, Y_train, batch_size=32, nb_epoch=25,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(other_test, batch_size=32)
    org_classes = np_utils.categorical_probas_to_classes(Y_test)

    acc = np_utils.accuracy(classes, org_classes)  # accuracy only supports classes
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes, org_classes)
    print('Test Kappa:', kappa)
    return {'loss': -acc, 'status': STATUS_OK}
Example #15
    def _getECOCAccuracy(self, GT, pred, topN=5):
        """
            Calculates the topN accuracy obtained from a set of samples on a ECOC_Classifier.
        """

        top_pred = np.argsort(pred, axis=1)[:, ::-1][:, :np.min([topN, pred.shape[1]])]
        pred = np_utils.categorical_probas_to_classes(pred)
        GT = np_utils.categorical_probas_to_classes(GT)

        # Top1 accuracy
        correct = [1 if pred[i] == GT[i] else 0 for i in range(len(pred))]
        accuracies = float(np.sum(correct)) / float(len(correct))

        # TopN accuracy
        top_correct = [1 if GT[i] in top_pred[i, :] else 0 for i in range(top_pred.shape[0])]
        top_accuracies = float(np.sum(top_correct)) / float(len(top_correct))

        return [accuracies, top_accuracies]
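
A standalone toy run of the top-N logic above (values invented for illustration):

import numpy as np

pred = np.array([[0.1, 0.6, 0.3],
                 [0.5, 0.4, 0.1]])
GT = np.array([[0, 0, 1],     # true class 2
               [1, 0, 0]])    # true class 0
top2 = np.argsort(pred, axis=1)[:, ::-1][:, :2]
print(top2)  # [[1 2], [0 1]] -> top-1 accuracy 0.5, top-2 accuracy 1.0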
Example #16
    def applyClassMapping(self, Y):
        """
            Returns the corresponding integer identifiers for the current Stage's mapping given a set of categorical arrays Y.
        """
        # Get labels from Keras' categorical representation
        labels = np_utils.categorical_probas_to_classes(Y)

        # Map labels for this stage
        return [self.mapping[l] for l in labels]
Example #17
def cnn1d_w2vembd(X_train, Y_train, X_test, Y_test, nb_classes,
                  maxlen,
                  nb_filter, filter_length, batch_size, nb_epoch, optm):
    """
    - CNN-1d on 3d tensor input which uses word2vec embeddings
    - MOT

    :param <X, Y> train and test sets
    :param nb_classes # of classes
    :param maxlen maximum sentence length (in tokens)
    :param nb_filter
    :param filter_length
    :param batch_size
    :param nb_epoch
    :param optm
    :return:
    """
    pool_length = maxlen - filter_length + 1

    model = Sequential()

    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode="valid",
                            activation="relu", input_shape=(maxlen, 300)))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optm)

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))  # accuracy only supports classes
    print('Test accuracy:', acc)
    # return(acc)
    kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return (kappa)
Example #18
def lstm_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                  maxlen, vocab_size, embd_dim,
                  batch_size, nb_epoch, optm):
    """
    - LSTM  on text input (represented in int)
    - fully-connected model

    :param <X, Y> train and test sets
    :param nb_classes # of classes
    :param maxlen maximum sentence length (in tokens)
    :param vocab_size
    :param embd_dim
    :param batch_size
    :param nb_epoch
    :param optm optimizer options, e.g., adam, rmsprop, etc.
    :return:
    """

    model = Sequential()
    model.add(Embedding(vocab_size, embd_dim, input_length=maxlen))
    model.add(Dropout(0.25))

    # model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(50))

    # LSTM(50) already returns a 2-D tensor, so no Flatten is needed here
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optm)

    earlystop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)

    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))  # accuracy only supports classes
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return (kappa)
Example #19
    def predict(self, X_test):
        model = self.model
        nb_classes = self.nb_classes

        #Y_test = np_utils.to_categorical(y_test, nb_classes)

        X_test = self.X_reshape(X_test)
        Y_pred = model.predict(X_test, verbose=0)
        y_pred = np_utils.categorical_probas_to_classes(Y_pred)

        return y_pred
Example #20
def get_output(i, model, name, X, Y):
    # Build TSNE model
    tsne_model = TSNE(n_components=2, random_state=0)
    get_layer_output = K.function([model.layers[0].input], [model.layers[i].output])

    # We pick the first 500 points to do TSNE
    reduced_layer_output = tsne_model.fit_transform(get_layer_output([X])[0][:500])
    #plt.figure(figsize=(12,12))
    #plt.title(name, fontsize=20)
    plt = scatter(reduced_layer_output, np_utils.categorical_probas_to_classes(Y[:500]))
    plt.savefig('./img/' + name)
Example #22
def cnn_var_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                     maxlen, vocab_size, embd_size,
                     nb_filter, batch_size, nb_epoches, optm):
    ngram_filters = [2, 5, 8]

    input = Input(shape=(maxlen,), name='input', dtype='int32')
    embedded = Embedding(input_dim=vocab_size, output_dim=embd_size, input_length=maxlen)(input)

    convs = [None, None, None]
    # three CNNs
    for i, n_gram in enumerate(ngram_filters):
        pool_length = maxlen - n_gram + 1
        convs[i] = Convolution1D(nb_filter=nb_filter,
                                 filter_length=n_gram,
                                 border_mode="valid",
                                 activation="relu")(embedded)
        convs[i] = MaxPooling1D(pool_length=pool_length)(convs[i])
        convs[i] = Flatten()(convs[i])

    merged = merge([convs[0], convs[1], convs[2]], mode='concat', concat_axis=1)
    merged = Dropout(0.5)(merged)
    output = Dense(nb_classes, activation='softmax', name='output')(merged)

    model = Model(input, output)
    model.compile(optm, loss={'output': 'categorical_crossentropy'})
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    model.fit(X_train, Y_train,
              nb_epoch=nb_epoches, batch_size=batch_size,
              validation_split=0.1, callbacks=[earlystop])

    probs = earlystop.model.predict(X_test, batch_size=batch_size)
    classes = np_utils.categorical_probas_to_classes(probs)

    acc = np_utils.accuracy(classes,
                            np_utils.categorical_probas_to_classes(Y_test))
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes,
                                             np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return acc
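
The merge(..., mode='concat') call above is Keras 1.x functional API; under Keras 2 (an assumption about the reader's environment, not part of the original snippet) the equivalent concatenation looks like this:

from keras.layers import Input, Dense, concatenate
from keras.models import Model

a = Input(shape=(4,))
b = Input(shape=(4,))
merged = concatenate([a, b], axis=1)   # replaces merge([a, b], mode='concat')
out = Dense(2, activation='softmax')(merged)
model = Model([a, b], out)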
Example #23
    def __init__(self, outdir, p):
        if os.path.exists(outdir):
            if len(os.listdir(outdir)) > 0:
                raise RuntimeError("callback outdir already exists: %s" %
                                   outdir)
        else:
            os.makedirs(outdir)

        self.outdir = outdir
        self.p = p
        self.losses = []
        self.bestf1 = 0.0

        stype = 'validation'
        cache = "%s_%s_%s_mintf%s_df%s_%s" % (
            stype, self.p['max_posts'], self.p['max_length'], self.p['mintf'],
            self.p['mindf'], self.p['noempty'])
        if self.p['prep'] is not None:
            cache += "_prep-%s" % self.p['prep']

        if os.path.exists("data/redcache/%s_X.npy" % cache):
            self.val_X = np.load("data/redcache/%s_X.npy" % cache)
            self.val_y = categorical_probas_to_classes(
                np.load("data/redcache/%s_y.npy" % cache))
        else:
            _, genf = datagen(self.p['max_posts'],
                              self.p['max_length'],
                              stype=stype,
                              force_full=True,
                              mintf=self.p['mintf'],
                              mindf=self.p['mindf'],
                              noempty=self.p['noempty'],
                              prep=self.p['prep'],
                              batch_size=9999999999)
            for i, (X, y) in enumerate(genf()):
                assert i == 0
            self.val_X, self.val_y = X, categorical_probas_to_classes(y)
            np.save("data/redcache/%s_X.npy" % cache, X)
            np.save("data/redcache/%s_y.npy" % cache, y)
Example #24
def train_pair(args, train_csv, test_csv):
    print('Reading word vectors.')
    embeddings_index = read_glove_vectors(args.embedding_file_path)
    print('Found {} word vectors.'.format(len(embeddings_index)))

    print('Processing input data')
    x_train, y_train, x_test, y_test, word_index = read_input_csv(train_csv,
                                                                   test_csv,
                                                                   args.nb_words,
                                                                   args.max_sequence_len)
    print('train tensor {}.'.format(x_train.shape))

    print('Preparing embedding matrix.')
    # initiate embedding matrix with zero vectors.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    args.nb_words = nb_words
    # args.len_labels_index = len(labels_index)
    args.len_labels_index = 2  # fixed for sentiment detection.

    model = model_selector(args, embedding_matrix)

    checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
    checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
                                 verbose=1, save_best_only=True)

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    tsb = TensorBoard(log_dir='./log', histogram_freq=0, write_graph=True, write_images=False)

    callbacks_list = [checkpoint, earlystop, tsb]
    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
        json_file.write(model_json)

    model.fit(x_train, y_train, validation_split=0.1,
              nb_epoch=args.num_epochs, batch_size=args.batch_size, callbacks=callbacks_list)
    classes = earlystop.model.predict_classes(x_test, batch_size=args.batch_size)
    # acc only supports classes
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(y_test))
    print('Test accuracy: {}.'.format(acc))
Example #25
    def detect_in_image(self, img):
        if len(img.shape) == 2:
            window_size = (self.window_size[0], self.window_size[1], 1)
            img = img.reshape((img.shape[0], img.shape[1], 1))
        else:
            window_size = self.window_size

        windows = view_as_windows(img, window_size, self.stride)
        windows_shape = windows.shape

        windows = windows.reshape((windows_shape[0] * windows_shape[1],
                                   windows_shape[3], windows_shape[4],
                                   windows_shape[-1]))  # channel count, not a hard-coded 3

        predictions = self.model.predict(windows, batch_size=32, verbose=1)

        y = np_utils.categorical_probas_to_classes(predictions)
        y = y.reshape(windows_shape[0], windows_shape[1])

        detections = zip(*np.where(y == 1))

        # print(detections)
        # print(predictions)

        bxs = []

        for c, r in detections:
            # print(p)
            bxs.append([
                r * self.stride,
                c * self.stride,
                r * self.stride + window_size[0],
                c * self.stride + window_size[1],
            ])

        bxs = non_max_suppression_fast(np.asarray(bxs), 0.1)
        # bxs = zip(*bxs)
        # print('Boxes :', bxs)

        return bxs
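
The box arithmetic at the end of detect_in_image maps a window's grid position back to pixel coordinates. A tiny standalone check with made-up numbers:

# window at grid row r, col c, taken with stride s and window (h, w):
stride, h, w = 8, 64, 64
r, c = 3, 5
print([r * stride, c * stride, r * stride + h, c * stride + w])  # [24, 40, 88, 104]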
Example #26
def evalon(stype, model, batch=32, X=None, y_true=None):
    print('\n-------- %s -------' % stype)
    print(datetime.datetime.now(), "predicting")
    y_pred = categorical_probas_to_classes(model.predict(X, batch_size=batch))

    print(datetime.datetime.now())
    posf1 = sklearn.metrics.f1_score(y_true,
                                     y_pred,
                                     pos_label=1,
                                     average='binary')
    posp = sklearn.metrics.precision_score(y_true,
                                           y_pred,
                                           pos_label=1,
                                           average='binary')
    posr = sklearn.metrics.recall_score(y_true,
                                        y_pred,
                                        pos_label=1,
                                        average='binary')
    print(sklearn.metrics.classification_report(y_true, y_pred))
    print("pred", collections.Counter(y_pred))
    print("true", collections.Counter(y_true))

    return posf1, posp, posr
Example #28
    model.fit(X_train,
              Y_train,
              batch_size=batch_size,
              nb_epoch=nb_epoch,
              verbose=2,
              validation_data=(X_test, Y_test))

    score, acc = model.evaluate(X_test, Y_test, verbose=0)

    return {'loss': -acc, 'status': STATUS_OK, 'model': model}


if __name__ == '__main__':

    X_train, X_test, Y_train, Y_test = data()
    '''
    Generate ensemble model from optimization run:
    First, run hyperas optimization on the specified setup, i.e. 10 trials with random search,
    then return the best 5 models and create a majority voting model from it.
    '''
    ensemble_model = optim.best_ensemble(nb_ensemble_models=5,
                                         model=model,
                                         data=data,
                                         algo=rand.suggest,
                                         max_evals=10,
                                         trials=Trials(),
                                         voting='hard')
    preds = ensemble_model.predict(X_test)
    y_test = np_utils.categorical_probas_to_classes(Y_test)
    print(accuracy_score(preds, y_test))
Example #29
 def validate(self, features, labels, number_folds, encoded_labels):
     """
     Compute a model's performance metrics based on k-fold cross-validation technique.
     
     Parameters
     ----------
     features: array-like of shape = [number_samples, number_features]
         The validation input samples.
         
     labels: array-like of shape = [number_samples]
         The target values (class labels in classification).
         
     number_folds: int
         The amount of folds for the k-fold cross-validation.
         If 0 compute metrics withput folds.
         If > 0 compute metrics with n folds, n=number_folds.
     
     encoded_labels: array-like of shape = [number_samples, number_outputs]
         The target values (class labels in classification) in one-hot-encoding.
         
     Return
     ----------
     accuracy: float
         The accuracy of the model based on it's confusion matrix.
         
     precision: float
         The precision of the model based on it's confusion matrix.
         
     sensitivity: float
         The sensitivity of the model based on it's confusion matrix.
         
     specificity: float
         The specificity of the model based on it's confusion matrix.
         
     kappa: float
         The Cohen's Kappa of the model based on it's confusion matrix.
     """
     if number_folds == 0:
         predictions = self.model.predict_classes(features)
     else:
         predictions = numpy.empty(len(labels), dtype=float)
         folds = Utilities.getFolds(labels, number_folds)
         for i, (train, test) in enumerate(folds):
             self.model.fit(features[train], encoded_labels[train], nb_epoch=250, batch_size=10, verbose=1)
             fold_prediction = self.model.predict_classes(features[test])
             for j in range(len(test)):
                 predictions[test[j]]=fold_prediction[j]
     matrix = confusion_matrix(np_utils.categorical_probas_to_classes(encoded_labels), predictions)
     # cast to float early so the ratios below are safe under Python 2 integer division
     sum_columns = numpy.sum(matrix, 0).astype(float)
     sum_rows = numpy.sum(matrix, 1).astype(float)
     diagonal_sum = float(numpy.trace(matrix))
     total_sum = numpy.sum(sum_rows)
     accuracy = diagonal_sum / total_sum
     temp_precision = []
     temp_sensitivity = []
     temp_specificity = []
     for i in range(len(matrix)):
         temp_precision.append(matrix[i][i] / sum_columns[i])
         temp_sensitivity.append(matrix[i][i] / sum_rows[i])
         temp_reduced_sum = total_sum - sum_rows[i] - sum_columns[i] + matrix[i][i]
         temp_specificity.append(temp_reduced_sum / (temp_reduced_sum + sum_columns[i] - matrix[i][i]))
     precision = sum(temp_precision * sum_rows) / total_sum
     sensitivity = sum(temp_sensitivity * sum_rows) / total_sum
     specificity = sum(temp_specificity * sum_rows) / total_sum
     kappa_sum = sum(sum_rows * sum_columns)
     kappa_numerator = (total_sum * diagonal_sum) - kappa_sum
     kappa_denominator = (total_sum * total_sum) - kappa_sum
     kappa = kappa_numerator / kappa_denominator
     return accuracy, precision, sensitivity, specificity, kappa
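
The closed-form kappa at the end of validate can be cross-checked against sklearn's cohen_kappa_score (a sanity check added here, not part of the original; the labels are invented):

import numpy
from sklearn.metrics import cohen_kappa_score, confusion_matrix

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 1, 1, 1, 2, 0]
m = confusion_matrix(y_true, y_pred)
n = float(numpy.sum(m))
kappa_sum = numpy.sum(numpy.sum(m, 1) * numpy.sum(m, 0))
kappa = (n * numpy.trace(m) - kappa_sum) / (n * n - kappa_sum)
print(kappa, cohen_kappa_score(y_true, y_pred))  # both 0.5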
Example #30
model.add(Convolution2D(nb_filters, nb_conv, nb_conv,
                        border_mode='valid',
                        input_shape=(3, img_rows, img_cols)))
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adadelta')
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, verbose=1, validation_data=(X_test, y_test))

score = model.evaluate(X_test, y_test, show_accuracy=True, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

y_hat = model.predict(X_test)
y_test_classes = np_utils.categorical_probas_to_classes(y_test)
y_hat_classes = np_utils.categorical_probas_to_classes(y_hat)
print confusion_matrix(y_test_classes, y_hat_classes)

#showWrongOnes(X_test, y_test_classes, y_hat_classes, 0, 5)

Example #31
np.random.seed(0)
print ("Method = MLP classification - Default features")
encoder = preprocessing.LabelEncoder( )
train_labels_aux = np.array( [ latlon2healpix( lat , lon , resolution ) for (lat,lon) in train_labels1 ] )
train_labels_aux = np.array( encoder.fit_transform( train_labels_aux ) )
num_classes = len( set( train_labels_aux ) )
train_labels_aux = np_utils.to_categorical( train_labels_aux , num_classes )
model = Sequential( )
model.add(Dense(hidden_dim, input_dim=train_matrix1.shape[1], init='uniform', activation='sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(hidden_dim // 2, activation='sigmoid', init='uniform'))
model.add(Dropout(0.25))
model.add(Dense( num_classes , activation='softmax' , init='uniform' ))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit( train_matrix1 , train_labels_aux , nb_epoch=1500, batch_size=16, verbose=1)
results = encoder.inverse_transform( np_utils.categorical_probas_to_classes( model.predict( test_matrix1 ) ) )
results = np.array( [ healpix2latlon( code , resolution ) for code in results ] )
print ("Mean error = " + repr( np.mean( [ geodistance( results[i] , test_labels1[i] ) for i in range(results.shape[0]) ] ) ) )
print ("Median error = " + repr( np.median( [ geodistance( results[i] , test_labels1[i] ) for i in range(results.shape[0]) ] ) ) )
print ("Method = MLP classification - Default features + chromatic features")
np.random.seed(0)
encoder = preprocessing.LabelEncoder( )
train_labels_aux = np.array( [ latlon2healpix( lat , lon , resolution ) for (lat,lon) in train_labels1 ] )
train_labels_aux = np.array( encoder.fit_transform( train_labels_aux ) )
num_classes = len( set( train_labels_aux ) )                                                              
train_labels_aux = np_utils.to_categorical(train_labels_aux , num_classes )
model = Sequential( ) 
model.add(Dense(hidden_dim, input_dim=train_matrix2.shape[1], init='uniform', activation='sigmoid')) 
model.add(Dropout(0.25)) 
model.add(Dense(hidden_dim // 2, activation='sigmoid', init='uniform'))
model.add(Dropout(0.25))
Example #32
def run_dl_mgh_params_2cl(X, y, Lx, Ly, nb_epoch=5000,     
                      batch_size = 128,
                      nb_classes = 2):

    # input image dimensions
    img_rows, img_cols = Lx, Ly
    # number of convolutional filters to use
    nb_filters = 8
    # size of pooling area for max pooling
    pool_size = (10, 10)
    # convolution kernel size
    kernel_size = (20, 20)

    # the data, shuffled and split between train and test sets
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y, test_size=0.2, random_state=0)

    if K.image_dim_ordering() == 'th':
        X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
        X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
        X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_test /= 255
    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')

    # convert class vectors to binary class matrices
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    model = Sequential()

    model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1],
                            border_mode='valid',
                            input_shape=input_shape))
    model.add(BatchNormalization())
    # model.add(Activation('relu'))
    model.add(Activation('tanh'))
    model.add(MaxPooling2D(pool_size=pool_size))
    #model.add(Dropout(0.25))

    model.add(Convolution2D(5, 5, 5, border_mode='valid'))
    model.add(BatchNormalization())
    model.add(Activation('tanh'))
    model.add(MaxPooling2D(pool_size=(5,5)))

    model.add(Flatten())
    model.add(Dense(4))
    model.add(BatchNormalization())
    model.add(Activation('tanh'))
    #model.add(Activation('relu'))
    #model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])

    # earlyStopping=callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')

    history = model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              verbose=0, validation_data=(X_test, Y_test)) #, callbacks=[earlyStopping])
    score = model.evaluate(X_test, Y_test, verbose=0)

    Y_test_pred = model.predict(X_test, verbose=0)
    print('Confusion matrix')
    y_test_pred = np_utils.categorical_probas_to_classes(Y_test_pred)
    print(metrics.confusion_matrix(y_test, y_test_pred))

    print('Test score:', score[0])
    print('Test accuracy:', score[1])

    kkeras.plot_acc( history)
    plt.show()
    kkeras.plot_loss( history)
Example #33
    nb_epoch = 10
    batch_size = 128

    model.fit(X_train, Y_train,
              batch_size=batch_size, nb_epoch=nb_epoch,
              verbose=2,
              validation_data=(X_test, Y_test))

    score, acc = model.evaluate(X_test, Y_test, verbose=0)

    return {'loss': -acc, 'status': STATUS_OK, 'model': model}

if __name__ == '__main__':

    X_train, X_test, Y_train, Y_test = data()

    '''
    Generate ensemble model from optimization run:
    First, run hyperas optimization on the specified setup, i.e. 10 trials with random search,
    then return the best 5 models and create a majority voting model from it.
    '''
    ensemble_model = optim.best_ensemble(nb_ensemble_models=5,
                                         model=model, data=data,
                                         algo=rand.suggest, max_evals=10,
                                         trials=Trials(),
                                         voting='hard')
    preds = ensemble_model.predict(X_test)
    y_test = np_utils.categorical_probas_to_classes(Y_test)
    print(accuracy_score(preds, y_test))
Example #34
def cnn_other(Y_train, Y_test, nb_classes,
              Other_train, Other_test, k,
              maxlen,
              nb_filter, filter_size, batch_size, nb_epoches, optm):
    """
    cnn1d using varying filter lengths
    note need using Graph
    :param Y_Train:
    :param Y_test:
    :param nb_classes:
    :param maxlen:
    :param vocab_size:
    :param embd_size:
    :param batch_size:
    :param nb_epoches:
    :param optm:
    :return:
    """
    model = Graph()

    # CNN for other
    model.add_input(name='other_input', input_shape=(maxlen, k), dtype='float')

    model.add_node(Convolution1D(nb_filter=nb_filter,
                                 filter_length=filter_size,
                                 border_mode='valid',
                                 activation='relu',
                                 input_shape=(maxlen, k)),
                   name='poscnn', input='other_input')
    model.add_node(MaxPooling1D(pool_length=5),
                   name='pospool', input='poscnn')

    # 2nd CNN
    model.add_node(Convolution1D(nb_filter=nb_filter * 2,
                                 filter_length=filter_size,
                                 border_mode='valid',
                                 activation='relu'),
                   name='cnn2', input='pospool')
    model.add_node(MaxPooling1D(pool_length=10),
                   name='cnn2_pool', input='cnn2')

    model.add_node(Flatten(), name='posflat', input='cnn2_pool')
    model.add_node(Dropout(0.5), name='posdropout', input='posflat')

    model.add_node(Dense(nb_classes, activation='softmax'), name='softmax',
                   input='posdropout')
    model.add_output(name='output', input='softmax')
    model.compile(optm, loss={'output': 'categorical_crossentropy'})  # note Graph()'s diff syntax

    # early stopping
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    model.fit({'other_input': Other_train, 'output': Y_train},
              nb_epoch=nb_epoches, batch_size=batch_size,
              validation_split=0.1, callbacks=[earlystop])
    # Graph doesn't have several arg/func existing in Sequential()
    # - fit no show-accuracy
    # - no predict_classes
    classes = model.predict({'other_input': Other_test},
                            batch_size=batch_size)['output'].argmax(axis=1)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))  # accuracy only supports classes
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return acc
Example #35
    def evaluate_model(self, segment_size, model, data_loader):
        '''

        :param segment_size:
        :param model:
        :param data_loader:
        :return:
        '''

        # ________________ Frame level evaluation for Test/Validation splits ________________________
        print('Validation segments = ' +
              str(data_loader.validation_segments.shape) +
              ' one-hot encoded target' +
              str(data_loader.validation_one_hot_target.shape))
        score = model.evaluate(data_loader.validation_segments,
                               data_loader.validation_one_hot_target,
                               verbose=0)
        print('Validation score:', score[0])
        print('Validation accuracy:', score[1])

        print('Test segments = ' + str(data_loader.test_segments.shape) +
              ' one-hot encoded target' +
              str(data_loader.test_one_hot_target.shape))
        score = model.evaluate(data_loader.test_segments,
                               data_loader.test_one_hot_target,
                               verbose=0)
        print('Test score:', score[0])
        print('Test accuracy:', score[1])

        # ___________________ predict frame-level classes ___________________________________
        test_predicted_labels = np_utils.categorical_probas_to_classes(
            model.predict(data_loader.test_segments))
        test_target_labels = np_utils.categorical_probas_to_classes(
            data_loader.test_one_hot_target)

        cm_frames = confusion_matrix(test_target_labels, test_predicted_labels)
        print('Confusion matrix, frame level')
        print(cm_frames)
        print(
            'Frame level accuracy :' +
            str(np_utils.accuracy(test_predicted_labels, test_target_labels)))

        # -------------- Voting ------------------------
        clip_predicted_probability_mean_vote = []
        clip_predicted_majority_vote = []
        for i, clip in enumerate(data_loader.test_clips):
            segments, segments_target_labels = data_loader.segment_clip(
                data=clip,
                label=data_loader.test_clips_labels[i],
                segment_size=segment_size,
                step_size=Config.STEP_SIZE)
            segments_predicted_prop = model.predict(segments)
            test_predicted_labels = np_utils.categorical_probas_to_classes(
                segments_predicted_prop)
            labels_histogram = np.bincount(test_predicted_labels)
            clip_predicted_majority_vote.append(np.argmax(labels_histogram))
            clip_predicted_probability_mean_vote.append(
                np.argmax(np.mean(segments_predicted_prop, axis=0)))

        cm_majority = confusion_matrix(data_loader.test_clips_labels,
                                       clip_predicted_majority_vote)
        print('Fold Confusion matrix - Majority voting - Clip level :')
        print(Config.CLASS_NAMES)
        print(cm_majority)
        print('Clip-level majority-vote Accuracy ' + str(
            np_utils.accuracy(clip_predicted_majority_vote,
                              data_loader.test_clips_labels)))

        print('Fold Confusion matrix - Probability MEAN voting - Clip level :')
        cm_probability = confusion_matrix(
            data_loader.test_clips_labels,
            clip_predicted_probability_mean_vote)
        print(Config.CLASS_NAMES)
        print(cm_probability)
        print('Clip-level probability-vote Accuracy ' + str(
            np_utils.accuracy(np.asarray(clip_predicted_probability_mean_vote),
                              np.squeeze(data_loader.test_clips_labels))))

        scoref1 = f1score(data_loader.test_clips_labels,
                          clip_predicted_probability_mean_vote,
                          average='micro')
        print('F1 Score micro ' + str(scoref1))

        scoref1 = f1score(data_loader.test_clips_labels,
                          clip_predicted_probability_mean_vote,
                          average='weighted')
        print('F1 Score weighted ' + str(scoref1))

        return cm_majority, cm_probability, clip_predicted_majority_vote, clip_predicted_probability_mean_vote, data_loader.test_clips_labels
Example #36
def cnn_multi_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                       maxlen, vocab_size, embd_size,
                       pos_train, pos_test, pos_embd_dim,
                       dp_train, dp_test, dp_embd_dim,
                       nb_filter, batch_size, nb_epoches, optm):
    """
    cnn1d with multiple inputs (word, POS, DP);
    the word channel uses varying filter lengths.
    Note: this requires the (old) Keras Graph container.
    :param X_train:
    :param Y_train:
    :param X_test:
    :param Y_test:
    :param nb_classes:
    :param maxlen:
    :param vocab_size:
    :param embd_size:
    :param batch_size:
    :param nb_epoches:
    :param optm:
    :return:
    """
    ngram_filters = [2, 5, 8]
    nd_convs = ['conv_' + str(n) for n in ngram_filters]
    nd_pools = ['pool_' + str(n) for n in ngram_filters]
    nd_flats = ['flat_' + str(n) for n in ngram_filters]

    model = Graph()
    model.add_input(name='input', input_shape=(maxlen,), dtype=int)

    model.add_node(Embedding(vocab_size, embd_size, input_length=maxlen),
                   name='embedding', input='input')
    # three CNNs
    for i, n_gram in enumerate(ngram_filters):
        pool_length = maxlen - n_gram + 1
        model.add_node(Convolution1D(nb_filter=nb_filter,
                                     filter_length=n_gram,
                                     border_mode="valid",
                                     activation="relu"),
                       name=nd_convs[i], input='embedding')
        model.add_node(MaxPooling1D(pool_length=pool_length),
                       name=nd_pools[i], input=nd_convs[i])
        model.add_node(Flatten(), name=nd_flats[i], input=nd_pools[i])
    model.add_node(Dropout(0.5), name='dropout', inputs=nd_flats, merge_mode='concat')

    # POS CNN
    nb_pos = 15
    pos_f_len = 3
    pos_pool_len = maxlen - pos_f_len + 1
    model.add_input(name='posinput', input_shape=(maxlen,), dtype=int)
    model.add_node(Embedding(nb_pos, pos_embd_dim, input_length=maxlen),
                   name='posembd', input='posinput')
    model.add_node(Convolution1D(nb_filter=nb_filter,
                                 filter_length=pos_f_len,
                                 border_mode='valid',
                                 activation='relu'),
                   name='poscnn', input='posembd')
    model.add_node(MaxPooling1D(pool_length=pos_pool_len),
                   name='pospool', input='poscnn')
    model.add_node(Flatten(), name='posflat', input='pospool')
    model.add_node(Dropout(0.5), name='posdropout', input='posflat')

    # DP CNN
    nb_dp = vocab_size
    dp_f_len = 3
    dp_pool_len = maxlen - dp_f_len + 1
    model.add_input(name='dpinput', input_shape=(maxlen,), dtype=int)
    model.add_node(Embedding(nb_dp, dp_embd_dim, input_length=maxlen),
                   name='dpembd', input='dpinput')
    model.add_node(Convolution1D(nb_filter=nb_filter,
                                 filter_length=dp_f_len,
                                 border_mode='valid',
                                 activation='relu'),
                   name='dpcnn', input='dpembd')
    model.add_node(MaxPooling1D(pool_length=dp_pool_len),
                   name='dppool', input='dpcnn')
    model.add_node(Flatten(), name='dpflat', input='dppool')
    model.add_node(Dropout(0.5), name='dpdropout', input='dpflat')

    model.add_node(Dense(nb_classes, activation='softmax'), name='softmax',
                   inputs=['dropout', 'posdropout', 'dpdropout'],
                   merge_mode='concat')

    model.add_output(name='output', input='softmax')
    model.compile(optm, loss={'output': 'categorical_crossentropy'})  # note Graph()'s diff syntax

    # early stopping
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    model.fit({'input': X_train, 'posinput': pos_train, 'dpinput': dp_train,
               'output': Y_train},
              nb_epoch=nb_epoches, batch_size=batch_size,
              validation_split=0.1, callbacks=[earlystop])
    # Graph doesn't have several arg/func existing in Sequential()
    # - fit no show-accuracy
    # - no predict_classes
    classes = model.predict({'input': X_test, 'posinput': pos_test, 'dpinput': dp_test}
                            , batch_size=batch_size)['output'].argmax(axis=1)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))  # accuracy only supports classes
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return kappa
Example #37
predictions = Dense(5, activation="softmax", name="dl1preds")(fc1)

model = Model(input=[imgvecs], output=[predictions])

model.compile(optimizer="adadelta", loss="categorical_crossentropy",
              metrics=["accuracy"])

best_model = os.path.join(MODEL_DIR, "tl-dl1-model-best.h5")
checkpoint = ModelCheckpoint(filepath=best_model, verbose=1, 
                             save_best_only=True)
history = model.fit([Xtrain], [Ytrain], nb_epoch=NUM_EPOCHS, 
                    batch_size=BATCH_SIZE, validation_split=0.1, 
                    callbacks=[checkpoint])
fttlutils.plot_loss(history)

# evaluate final model
Ytest_ = model.predict(Xtest)
ytest = np_utils.categorical_probas_to_classes(Ytest)
ytest_ = np_utils.categorical_probas_to_classes(Ytest_)
fttlutils.print_stats(ytest, ytest_, "Final Model (DL#1)")
model.save(os.path.join(MODEL_DIR, "tl-dl1-model-final.h5"))

# load best model and evaluate
model = load_model(os.path.join(MODEL_DIR, "tl-dl1-model-best.h5"))
model.compile(optimizer="adadelta", loss="categorical_crossentropy",
              metrics=["accuracy"])
Ytest_ = model.predict(Xtest)
ytest = np_utils.categorical_probas_to_classes(Ytest)
ytest_ = np_utils.categorical_probas_to_classes(Ytest_)
fttlutils.print_stats(ytest, ytest_, "Best Model (DL#1)")
Example #38
from HSIDatasetLoad import *
from keras.utils import np_utils
import numpy as np

HSI = HSIData(rootPath)
X_data = HSI.X_data
Y_data = HSI.Y_data
data_source = HSI.data_source
idx_data = HSI.idx_data

# whether to reduce dimensionality with PCA first
if use_pca:
    data_source = HSI.PCA_data_Source(data_source,n_components=n_components)

X_data_nei = HSI.getNeighborData(data_source=data_source,idx_data=idx_data,block_size=block_size)
Y_data = np_utils.categorical_probas_to_classes(Y_data)
X_train_nei,X_test_nei,Y_train,Y_test,idx_train,idx_test = HSI.datasetSplit(X_data_nei,Y_data,idx_data,16,test_size = test_size)
X_train = data_source[idx_train]
X_test = data_source[idx_test]

#%% (2) Autoencoder
from keras.layers import Input, Dense, Flatten
from keras.models import Model
from keras.utils import np_utils
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import EarlyStopping

"""
categorical_crossentropy
‘valid’:image_shape - filter_shape + 1.即滤波器在图像内部滑动
‘full’ shape: image_shape + filter_shape - 1.允许滤波器超过图像边界
Example #39
def CNN (batch_size,nb_classes,nb_epoch,img_rows, img_cols,nb_filters,nb_pool,nb_conv,X_train, Y_train, X_val,Y_val,X_test,Y_test,opt,dp):
    model = Sequential()
    k = (dp & 12)>>2
    dp = (dp& 3)
    if k==0:
        w1 = 'glorot_uniform'
        w2 = 'glorot_uniform'
    elif k ==1:
        w1 = 'glorot_uniform'
        w2 = 'he_uniform'
    elif k==2:
        w1 = 'he_uniform'
        w2 = 'glorot_uniform'
    else:
        w1 = 'he_uniform'
        w2 = 'he_uniform'   
    model.add(Convolution2D(nb_filters, nb_conv, nb_conv,init=w1,
                        border_mode='valid',subsample=(dp,dp),
                        input_shape=(1, img_rows, img_cols)))
    convonet1 = Activation('relu')                   
    model.add(convonet1)
    ## if ((dp & 2) >> 1) == 1:
    ##     print('dp2')
    model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
    model.add(Convolution2D(nb_filters, nb_conv, nb_conv,init=w1,subsample=(dp,dp)))
    convonet2 = Activation('relu')
    model.add(convonet2)
    model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
    ##if dp&1 ==1:
        ##print ('dp1')
    model.add(Dropout(0.5))
    model.add(Flatten())
    d_size = 128 // (dp * dp)  # integer division so Dense gets an int
    print (d_size)
    model.add(Dense(d_size,init=w2))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes,init=w2))
    model.add(Activation('softmax'))
    #model.compile(loss='categorical_crossentropy', optimizer='adadelta')
    model.compile(loss='categorical_crossentropy', optimizer=opt,metrics=["accuracy"])
    history = LossHistory()
    early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              # show_accuracy=True, verbose=1, validation_split=0.2,
              verbose=0, callbacks=[history, early_stop],
              validation_data=[X_val, Y_val])
    predicted = model.predict_classes(X_test)
    con_mat = confusion_matrix(np_utils.categorical_probas_to_classes(Y_test),predicted)
    model.summary()
    #from models import model_from_json
    #json_string = model.to_json()
    #print json_string
    #config = model.get_config() 
    #print config
    #print (con_mat)
    ##false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predicted)
    #roc_auc = auc(false_positive_rate, true_positive_rate)
    #plt.title('Receiver Operating Characteristic')
    #plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
    #plt.legend(loc='lower right')
    #plt.plot([0,1],[0,1],'r--')
    #plt.xlim([-0.1,1.2])
    #plt.ylim([-0.1,1.2])
    #plt.ylabel('True Positive Rate')
    #plt.xlabel('False Positive Rate')
    #plt.show()           
    score = model.evaluate(X_test, Y_test, verbose=0)  # show_accuracy is gone once metrics=["accuracy"] is set in compile
    return score, model, history, con_mat
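The dp argument packs two settings into one integer; a worked reading (values illustrative):

dp = 0b1001           # 9
k = (dp & 12) >> 2    # 2 -> w1='he_uniform', w2='glorot_uniform'
stride = dp & 3       # 1 -> subsample=(1, 1)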
Example #40
0
full_conn = Dense(128, activation='tanh')(flatten)
dropout_1 = Dropout(0.5)(full_conn)
full_conn_2 = Dense(64, activation='tanh')(dropout_1)
dropout_2 = Dropout(0.5)(full_conn_2)
output = Dense(6, activation='softmax')(dropout_2)

model = Model(input=inputs, output=output)

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=50,
          nb_epoch=int(epochs))  #validation_data = (x_test, y_test))
model.save('word2vec_main_model_' + epochs + '.h5')

#from keras.models import load_model
#model = load_model('500_filter_adam_50_50_valid_0.1.h5')
#model = load_model(sys.argv[1])
predictions = model.predict(x_test)
predictions = np_utils.categorical_probas_to_classes(predictions)
originals = np_utils.categorical_probas_to_classes(y_test)

lend = len(predictions) * 1.0
print np.sum(predictions == originals) / lend

from sklearn.metrics import confusion_matrix

print confusion_matrix(
    originals,
    predictions)  #labels = ['abbr', 'desc', 'enty', 'hum', 'loc', 'num'])
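The manual accuracy computed above matches sklearn's helper (not part of the original snippet):

from sklearn.metrics import accuracy_score
print accuracy_score(originals, predictions)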
# reconstructed header; the name and (N, block_size, block_size, bands) shape are assumptions:
def reshapeNeighborData(X_data, block_size):
    sample, _, _, bands = X_data.shape
    new_X_data = np.zeros((sample, bands, block_size * block_size))
    for i in range(sample):
        for row in range(block_size):
            for col in range(block_size):
                new_X_data[i, :, row * block_size + col] = X_data[i, row, col, :]
    return new_X_data
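The same flattening can be done without loops; a one-line sketch under the same shape assumption:

new_X_data = X_data.transpose(0, 3, 1, 2).reshape(sample, bands, block_size * block_size)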

HSI = HSIData(rootPath)
X_data = HSI.X_data
Y_data = HSI.Y_data
data_source = HSI.data_source
idx_data = HSI.idx_data

X_data = HSI.getNeighborData(data_source=data_source, idx_data=idx_data, block_size=block_size)
X_data = data_standard(X_data)

Y_data = np_utils.categorical_probas_to_classes(Y_data)
X_train, X_test, Y_train, Y_test, idx_train, idx_test = HSI.datasetSplit(X_data, Y_data, idx_data, 16, test_size=test_size)
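data_standard is not defined in this snippet; a minimal stand-in, assuming per-feature z-scoring over the sample axis:

def data_standard(X):
    mean = X.mean(axis=0, keepdims=True)
    std = X.std(axis=0, keepdims=True) + 1e-8  # guard against constant features
    return (X - mean) / std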


#%%
from keras.layers import Input,merge,Dense,Dropout,Flatten,Convolution1D,MaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
import tensorflow as tf
from keras.layers import LSTM



def get_model(input_shape, classify_output_num, my_optimizer):
    input_layer = Input(input_shape)
    # (rest of get_model is truncated in the source)

model.add_node(Convolution1D(nb_filters, filter_len,  # call head reconstructed; the filter parameters are assumed
                             border_mode='valid',
                             activation='relu'),
               name='dpcnn', input='dpembd')
model.add_node(MaxPooling1D(pool_length=dp_pool_len),
               name='dppool', input='dpcnn')
model.add_node(Flatten(), name='dpflat', input='dppool')
model.add_node(Dropout(0.5), name='dpdropout', input='dpflat')

# using three CNNs to predict with L1
model.add_node(Dense(nb_classes, activation='softmax', W_regularizer=l2(0.05)), name='softmax',
               inputs=['dropout', 'posdropout', 'dpdropout'],
               merge_mode='concat')

model.add_output(name='output', input='softmax')
model.compile('rmsprop', loss={'output': 'categorical_crossentropy'})

# early stopping
earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
model.fit({'input': X_train, 'posinput': pos_train, 'dpinput': dp_train, 'output': Y_train},
          nb_epoch=nb_epoch, batch_size=batch_size,
          validation_split=0.1, callbacks=[earlystop])

# Graph lacks several of the args/functions that Sequential() offers:
# - fit has no show_accuracy
# - there is no predict_classes
classes = model.predict({'input': X_test, 'posinput': pos_test, 'dpinput': dp_test},
                        batch_size=batch_size)['output'].argmax(axis=1)
acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))  # accuracy only supports classes
print('Test accuracy:', acc)

Example #43
0
        class_weight_to_fit = incp.GetDataRatio(ppmethod, class_list,
                                                nb_training_samples, 'Train')
        #print('class_weight_to_fit: {}'.format(class_weight_to_fit))

        #Now we test the best weight. "Instrumented best weight run"
        full_model.load_weights(
            Best_Model_Weights_Path)  #LOAD (not set) the best weights.
        X_test, Y_test = incp.GetXYData(ppmethod, nb_testing_samples,
                                        class_list,
                                        'Test')  #get them big arrays
        y_predictions = full_model.predict(X_test, batch_size=BATCH_SIZE)
        Y_test_categorical = np_utils.to_categorical(Y_test, len(class_list))
        #print('Y_test_categorical: {}'.format(Y_test_categorical))
        #print('y_predictions before conversion: {}'.format(y_predictions))
        y_preds_as_classes = np_utils.categorical_probas_to_classes(
            y_predictions)  # converts class probabilities to integer class labels
        print('converted preds: {}'.format(y_preds_as_classes))
        print('ground truth labels: {}'.format(Y_test))
        print('ground truth labels: {}'.format(Y_test))

        test_comparo = np.zeros((len(y_preds_as_classes), 2), dtype=np.uint8)
        for i in range(len(y_preds_as_classes)):  # first column is truth, second is prediction
            test_comparo[i, 0] = Y_test[i]
            test_comparo[i, 1] = y_preds_as_classes[i]
        np.savetxt(Base_Path + 'Results/TestSetPredsComparo' + ppmethod + '.txt',
                   test_comparo,
                   fmt='%1i')
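The comparison table can also be built without the loop (a sketch):

test_comparo = np.column_stack([Y_test, y_preds_as_classes]).astype(np.uint8)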
Example #44
0
# 0 -> [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# 2 -> [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] # only the right element == 1
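The mapping shown in these comments is exactly what np_utils.to_categorical produces; a quick check:

from keras.utils import np_utils
print(np_utils.to_categorical([0, 2], 10))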

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils

# create sequential model
model = Sequential()

# add network levels
model.add(Dense(800, input_dim=784, init="normal", activation="relu"))
model.add(Dense(10, init="normal", activation="softmax"))

# compile model
model.compile(loss="categorical_crossentropy",
              optimizer="SGD",
              metrics=["accuracy"])
print(model.summary())

#_________training_______________________
# batch_size - size of each data portion
# nb_epoch - number of training passes (100 here)
model.fit(X_train, y_train, batch_size=200, nb_epoch=100, verbose=1)

#___________working_____________
# work on input data
predictions = model.predict(X_train)

# transform output data
# from categories to tags of classes (num from 0 to 9)
predictions = np_utils.categorical_probas_to_classes(predictions)

# now we can compare
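The snippet breaks off here; a plausible completion, assuming y_train holds the one-hot training labels used in fit above:

import numpy as np
true_classes = np_utils.categorical_probas_to_classes(y_train)
print(np.sum(predictions == true_classes) / float(len(predictions)))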
model.add_node(MaxPooling1D(pool_length=pool_len),  # call head reconstructed; pool_len is assumed
               name='pospool', input='poscnn')
model.add_node(Flatten(), name='posflat', input='pospool')
model.add_node(Dropout(0.5), name='posdropout', input='posflat')


# using three CNNs to predict with L1
model.add_node(Dense(nb_classes, activation='softmax'), name='softmax',
               inputs=['dropout', 'posdropout'],
               merge_mode='concat')

model.add_output(name='output', input='softmax')
model.compile('rmsprop', loss={'output': 'categorical_crossentropy'})
# model.compile('rmsprop', loss={'output': 'mean_squared_error'})

# early stopping
earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
model.fit({'input': X_train, 'posinput': pos_train, 'output': Y_train},
          nb_epoch=nb_epoch, batch_size=batch_size,
          validation_split=0.1, callbacks=[earlystop])

# Graph lacks several of the args/functions that Sequential() offers:
# - fit has no show_accuracy
# - there is no predict_classes
classes = model.predict({'input': X_test, 'posinput': pos_test},
                        batch_size=batch_size)['output'].argmax(axis=1)
acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))  # accuracy only supports classes
print('Test accuracy:', acc)
kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
print('Test Kappa:', kappa)

Example #47
0
def calculate_f1(predictions, actual):
    return f1_score(categorical_probas_to_classes(actual),
                    categorical_probas_to_classes(predictions),
                    average="micro")
            batch=dev_batch)

        pos = np.array([(np.concatenate([
            np_utils.to_categorical(p, pos_length),
            np.zeros((step_length - length[l], pos_length))
        ])) for l, p in enumerate(pos)])
        y = np.array(
            [np_utils.to_categorical(each, output_length) for each in label])
        # for loss
        dev_metrics = model.test_on_batch([embed_index, hash_index, pos], y)
        dev_loss += dev_metrics[0]

        # for accuracy
        prob = model.predict_on_batch([embed_index, hash_index, pos])
        for i, l in enumerate(length):
            predict_label = np_utils.categorical_probas_to_classes(prob[i])
            correct_predict += np.sum(predict_label[:l] == label[i][:l])
        all_predict += np.sum(length)
    epoch_accuracy = float(correct_predict) / all_predict
    all_dev_accuracy.append(epoch_accuracy)

    all_dev_loss.append(dev_loss)

    if epoch_accuracy >= best_accuracy:
        best_accuracy = epoch_accuracy
        best_epoch = epoch

    end = datetime.now()

    model.save('%s/model_epoch_%d.h5' % (folder_path, epoch), overwrite=True)
Example #49
0
def main():
    print("::: midas started :::")

    param_dict = dict()
    config_path = os.path.abspath(sys.argv[1])
    print("> using config file: "+str(config_path))

    if config_path:
        param_dict = cmd_line.parse(config_path)
    else:
        raise ValueError("No config file specified.")
    #print(param_dict)

    model_name = param_dict["model_name"]

    if param_dict["mode"] == "train":

        print("> start training")

        # make sure that we have a fresh model folder to work in
        if not os.path.isdir("../models"):
            os.mkdir("../models")
        if os.path.isdir("../models/"+model_name):
            shutil.rmtree("../models/"+model_name)
        os.mkdir("../models/"+model_name)

        train_tokens, train_postags, train_lemmas = \
            datasets.load_annotated_data_dir(data_dir = os.path.abspath(param_dict["train_dir"]),
                                         nb_instances = 500000000000)
        dev_tokens, dev_postags, dev_lemmas = \
            datasets.load_annotated_data_dir(data_dir = os.path.abspath(param_dict["dev_dir"]),
                                         nb_instances = 500000000000)

        if param_dict["tokenize"]:
            left_X, right_X, concat_y, char_vector_dict = \
                tokenize_stuff.vectorize(tokens = train_tokens,
                                         nb_left_tokens = param_dict["tok_nb_left_tokens"],
                                         left_char_len = param_dict["tok_left_char_len"],
                                         nb_right_tokens = param_dict["tok_nb_right_tokens"],
                                         right_char_len = param_dict["tok_right_char_len"])
            
            tokenizer = tokenize_stuff.build_tokenizer(nb_filters = 2500,
                                        filter_length = 3,
                                        char_vector_dict = char_vector_dict)

            tokenizer.fit([left_X, right_X], concat_y, validation_split = 0.20, show_accuracy=True,
                            batch_size = BATCH_SIZE, nb_epoch = param_dict["tok_nb_epochs"], class_weight={0:1, 1:100})

            # save relevant objects:
            tokenizer.save_weights("../models/"+model_name+"/tokenizer.model_weights",
                                    overwrite=True)
            pickle.dump(char_vector_dict,
                         open("../models/"+model_name+"/char_vector_dict.p", "wb" ))

        if param_dict["postag"]:
            pass

        if param_dict["lemmatize"]:
            train_lemmas = [lem for lem in train_lemmas if lem not in ("@", "$")]
            train_postags = [pos for pos in train_postags if pos not in ("@", "$")]
            train_labels = [lem+"_"+pos for lem, pos in zip(train_lemmas, train_postags)]

            dev_lemmas = [lem for lem in dev_lemmas if lem not in ("@", "$")]
            dev_postags = [pos for pos in dev_postags if pos not in ("@", "$")]
            dev_labels = [lem+"_"+pos for lem, pos in zip(dev_lemmas, dev_postags)]

            label_encoder = LabelEncoder()
            label_encoder.fit(train_labels+dev_labels)
            train_ints = label_encoder.transform(train_labels)
            dev_ints = label_encoder.transform(dev_labels)

            train_y = np_utils.to_categorical(train_ints, len(label_encoder.classes_))
            dev_y = np_utils.to_categorical(dev_ints, len(label_encoder.classes_))

            train_left_X, train_tokens_X, train_right_X, train_char_vector_dict = tagger_stuff.vectorize(tokens = train_tokens,
                                        std_token_len = param_dict["lemma_std_len_token"],
                                        nb_left_tokens = param_dict["lemma_nb_left_tokens"],
                                        left_char_len = param_dict["lemma_left_char_len"],
                                        nb_right_tokens = param_dict["lemma_nb_right_tokens"],
                                        right_char_len = param_dict["lemma_right_char_len"],
                                        )
            print(train_tokens_X.shape)
            
            dev_left_X, dev_tokens_X, dev_right_X, _ = tagger_stuff.vectorize(tokens = dev_tokens,
                                        std_token_len = param_dict["lemma_std_len_token"],
                                        nb_left_tokens = param_dict["lemma_nb_left_tokens"],
                                        left_char_len = param_dict["lemma_left_char_len"],
                                        nb_right_tokens = param_dict["lemma_nb_right_tokens"],
                                        right_char_len = param_dict["lemma_right_char_len"],
                                        char_vector_dict = train_char_vector_dict
                                        )
            print(dev_tokens_X.shape)

            lemmatizer = tagger_stuff.build_lemmatizer_new(nb_filters = 1024,
                                        filter_length = 3,
                                        std_token_len = param_dict["lemma_std_len_token"],
                                        left_char_len = param_dict["lemma_left_char_len"],
                                        right_char_len = param_dict["lemma_right_char_len"],
                                        char_vector_dict = train_char_vector_dict,
                                        nb_labels = len(label_encoder.classes_),
                                        dense_dims = 1024,
                                        )

            for e in range(param_dict["lemma_nb_epochs"]):
                print("-> epoch ", e+1, "...")
                lemmatizer.fit({#'left_input': train_left_X,
                                          'token_input': train_tokens_X,
                                          #'right_input': train_right_X,
                                          'label_output': train_y
                                         },
                                nb_epoch = 1,
                                batch_size = BATCH_SIZE)

                print("+++ TRAIN SCORE")
                predictions = lemmatizer.predict({#'left_input': train_left_X,
                                          'token_input': train_tokens_X,
                                          #'right_input': train_right_X,
                                         },
                                batch_size = BATCH_SIZE)
                predictions = np_utils.categorical_probas_to_classes(predictions['label_output'])
                accuracy = np_utils.accuracy(predictions, train_ints)
                print("\t - acc:\t{:.2%}".format(accuracy))

                print("+++ DEV SCORE")
                predictions = lemmatizer.predict({#'left_input': dev_left_X,
                                          'token_input': dev_tokens_X,
                                          #'right_input': dev_right_X,
                                         },
                                batch_size = BATCH_SIZE)
                predictions = np_utils.categorical_probas_to_classes(predictions['label_output'])
                accuracy = np_utils.accuracy(predictions, dev_ints)
                print("\t - acc:\t{:.2%}".format(accuracy))
            """
            ########################################################################################################
            # train data:
            train_lemmas = [lem for lem in train_lemmas if lem not in ("@", "$")]
            train_postags = [pos for pos in train_postags if pos not in ("@", "$")]
            print("orig nb lemmas:", len(set(train_lemmas)))

            train_lemma_counter = Counter(train_lemmas)
            train_lemma_vocab = [k for k, v in train_lemma_counter.items() if v > 1]
            print("reduced nb lemmas:", len(train_lemma_vocab))

            train_lemmas = [lem if lem in train_lemma_vocab else '<unk>' for lem in train_lemmas]

            dev_lemmas = [lem for lem in dev_lemmas if lem not in ("@", "$")]
            dev_postags = [pos for pos in dev_postags if pos not in ("@", "$")]

            dev_lemmas = [lem if lem in train_lemma_vocab else '<unk>' for lem in dev_lemmas]

            lemma_encoder = LabelEncoder()
            lemma_encoder.fit(train_lemmas+dev_lemmas+['<unk>'])
            train_lemmas_y = lemma_encoder.transform(train_lemmas)
            dev_lemmas_y = lemma_encoder.transform(dev_lemmas)

            pos_encoder = LabelEncoder()
            pos_encoder.fit(train_postags+dev_postags)
            train_pos_y = pos_encoder.transform(train_postags)
            dev_pos_y = pos_encoder.transform(dev_postags)

            train_lemma_labels_y = np_utils.to_categorical(train_lemmas_y, len(lemma_encoder.classes_))
            train_pos_labels_y = np_utils.to_categorical(train_pos_y, len(pos_encoder.classes_))

            dev_lemma_labels_y = np_utils.to_categorical(dev_lemmas_y, len(lemma_encoder.classes_))
            dev_pos_labels_y = np_utils.to_categorical(dev_pos_y, len(pos_encoder.classes_))

            train_left_X, train_tokens_X, train_right_X, train_char_vector_dict = tagger_stuff.vectorize(tokens = train_tokens,
                                        std_token_len = param_dict["lemma_std_len_token"],
                                        nb_left_tokens = param_dict["lemma_nb_left_tokens"],
                                        left_char_len = param_dict["lemma_left_char_len"],
                                        nb_right_tokens = param_dict["lemma_nb_right_tokens"],
                                        right_char_len = param_dict["lemma_right_char_len"],
                                        )
            print(train_tokens_X.shape)
            
            dev_left_X, dev_tokens_X, dev_right_X, _ = tagger_stuff.vectorize(tokens = dev_tokens,
                                        std_token_len = param_dict["lemma_std_len_token"],
                                        nb_left_tokens = param_dict["lemma_nb_left_tokens"],
                                        left_char_len = param_dict["lemma_left_char_len"],
                                        nb_right_tokens = param_dict["lemma_nb_right_tokens"],
                                        right_char_len = param_dict["lemma_right_char_len"],
                                        char_vector_dict = train_char_vector_dict
                                        )
            print(dev_tokens_X.shape)

            lemmatizer = tagger_stuff.build_lemmatizer(nb_filters = 1024,
                                        filter_length = 3,
                                        std_token_len = param_dict["lemma_std_len_token"],
                                        left_char_len = param_dict["lemma_left_char_len"],
                                        right_char_len = param_dict["lemma_right_char_len"],
                                        char_vector_dict = train_char_vector_dict,
                                        nb_lemmas = len(lemma_encoder.classes_),
                                        nb_postags = len(pos_encoder.classes_),
                                        dense_dims = 500
                                        )

            for e in range(param_dict["lemma_nb_epochs"]):
                print("-> epoch ", e+1, "...")
                lemmatizer.fit({'left_input': train_left_X,
                                          'token_input': train_tokens_X,
                                          'right_input': train_right_X,
                                          'lemma_output': train_lemma_labels_y,
                                          'pos_output': train_pos_labels_y
                                         },
                                nb_epoch = 1,
                                batch_size = BATCH_SIZE)

                print("+++ TRAIN SCORE")
                predictions = lemmatizer.predict({'left_input': train_left_X,
                                          'token_input': train_tokens_X,
                                          'right_input': train_right_X,
                                         },
                                batch_size = BATCH_SIZE)
                pos_predictions = np_utils.categorical_probas_to_classes(predictions['pos_output'])
                pos_accuracy = np_utils.accuracy(pos_predictions, train_pos_y)
                print("\t - postags acc:\t{:.2%}".format(pos_accuracy))
                lemma_predictions = np_utils.categorical_probas_to_classes(predictions['lemma_output'])
                lemma_accuracy = np_utils.accuracy(lemma_predictions, train_lemmas_y)
                print("\t - lemmas acc:\t{:.2%}".format(lemma_accuracy))

                print("+++ DEV SCORE")
                dev_predictions = lemmatizer.predict({'left_input': dev_left_X,
                                          'token_input': dev_tokens_X,
                                          'right_input': dev_right_X,
                                         },
                                batch_size = BATCH_SIZE)
                dev_pos_predictions = np_utils.categorical_probas_to_classes(dev_predictions['pos_output'])
                dev_pos_accuracy = np_utils.accuracy(dev_pos_predictions, dev_pos_y)
                print("\t - postags acc:\t{:.2%}".format(dev_pos_accuracy))
                dev_lemma_predictions = np_utils.categorical_probas_to_classes(dev_predictions['lemma_output'])
                dev_lemma_accuracy = np_utils.accuracy(dev_lemma_predictions, dev_lemmas_y)
                print("\t - lemmas acc:\t{:.2%}".format(dev_lemma_accuracy))
        """

    elif param_dict["mode"] == "test":
        
        print("> start testing")

        test_tokens, test_postags, test_lemmas = \
            datasets.load_annotated_data_dir(data_dir = os.path.abspath(param_dict["test_dir"]),
                                         nb_instances = 5000)
        char_vector_dict = pickle.load(open("../models/"+model_name+"/char_vector_dict.p", "rb"))
        left_X, right_X, concat_y, _ = \
                tokenize_stuff.vectorize(tokens = test_tokens,
                                         nb_left_tokens = param_dict["tok_nb_left_tokens"],
                                         left_char_len = param_dict["tok_left_char_len"],
                                         nb_right_tokens = param_dict["tok_nb_right_tokens"],
                                         right_char_len = param_dict["tok_right_char_len"],
                                         char_vector_dict = char_vector_dict)
        tokenizer = tokenize_stuff.build_tokenizer(nb_filters = 2500,  # must match the train-time value for load_weights to succeed
                                        filter_length = 3,
                                        char_vector_dict = char_vector_dict)
        tokenizer.load_weights("../models/"+model_name+"/tokenizer.model_weights")
        preds = tokenizer.predict_classes([left_X, right_X], batch_size = 1000)
        for item in zip(tokenize_stuff.unconcatenate_tokens(test_tokens)[0], preds):
            print(item)

    print("::: midas ended :::")