Example #1
def data_dense():
    """Data to feed into code prediction model"""

    base = os.environ['DATA_ROOT']
    train_dir = os.path.join(base, cfg.get('data', 'train'))
    test_dir = os.path.join(base, cfg.get('data', 'test'))

    # load pre-trained model
    rl = cfg.get('data', 'rep_layer')
    model = load_model(cfg.get('data', 'model_file'))
    interm_layer_model = Model(inputs=model.input,
                               outputs=model.get_layer(rl).output)
    maxlen = model.get_layer(name='EL').get_config()['input_length']

    # load target task training data
    dataset_provider = dataset.DatasetProvider(
        train_dir, cfg.get('data', 'alphabet_pickle'))
    x_train, y_train = dataset_provider.load_keras(maxlen=maxlen)
    x_train = pad_sequences(x_train, maxlen=maxlen)

    # make training vectors for target task
    print('x_train shape (original):', x_train.shape)
    x_train = interm_layer_model.predict(x_train)
    print('x_train shape (new):', x_train.shape)

    # now load the test set
    dataset_provider = dataset.DatasetProvider(
        test_dir, cfg.get('data', 'alphabet_pickle'))
    x_test, y_test = dataset_provider.load_keras(maxlen=maxlen)
    x_test = pad_sequences(x_test, maxlen=maxlen)

    # make test vectors for target task
    x_test = interm_layer_model.predict(x_test)

    return x_train, y_train, x_test, y_test
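
A hedged usage sketch, not part of the original listing: the dense train/test representations returned by data_dense() can be fed to any scikit-learn classifier; the choice of LogisticRegression below is an illustrative assumption.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# evaluate the intermediate-layer features with a simple linear classifier
x_train, y_train, x_test, y_test = data_dense()
clf = LogisticRegression(C=1.0, max_iter=1000)
clf.fit(x_train, y_train)
print('macro f1:', f1_score(y_test, clf.predict(x_test), average='macro'))
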
Example #2
def grid_search():
  """Grid search using sklearn API"""

  # load target task train and test data
  base = os.environ['DATA_ROOT']
  train_dir = os.path.join(base, cfg.get('data', 'train'))
  test_dir = os.path.join(base, cfg.get('data', 'test'))

  maxlen = get_maxlen()
  dataset_provider = dataset.DatasetProvider(
    train_dir,
    cfg.get('data', 'alphabet_pickle'))
  x_train, y_train = dataset_provider.load_keras(maxlen=maxlen)
  x_train = pad_sequences(x_train, maxlen=maxlen)

  classifier = KerasClassifier(make_model)

  param_grid = {
    'C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'epochs':[3, 5, 7]}

  validator = GridSearchCV(
    classifier,
    param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=1)

  validator.fit(x_train, y_train)

  print('best param:', validator.best_params_)
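
The grid above implies that make_model (defined elsewhere in the original module) accepts a C argument, which KerasClassifier forwards to the build function, while epochs is consumed by fit(). A minimal sketch of such a builder, assuming C is mapped to an L2 weight penalty and with vocabulary size, sequence length, and layer widths chosen purely for illustration:

from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense
from keras import regularizers

def make_model(C=1.0):
  """Hypothetical builder: smaller C means a stronger L2 penalty."""
  model = Sequential()
  model.add(Embedding(10000, 300, input_length=100))  # placeholder sizes
  model.add(GlobalAveragePooling1D())
  model.add(Dense(512, activation='relu',
                  kernel_regularizer=regularizers.l2(1.0 / C)))
  model.add(Dense(2, activation='softmax'))
  model.compile(loss='sparse_categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
  return model
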
Example #3
def main():
    """Do out-of-core training here"""

    configure_model_dir()
    base = os.environ['DATA_ROOT']

    dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')),
                                 cfg.get('args', 'max_files'),
                                 cfg.getint('args', 'max_cuis'),
                                 cfg.getint('args', 'samples_per_doc'),
                                 cfg.getint('bow', 'batch'),
                                 cfg.getboolean('args', 'make_alphabet'),
                                 cfg.getboolean('args', 'verbose'))

    max_cuis = int(cfg.get('args', 'max_cuis'))
    model = get_model(max_cuis, max_cuis - 1)
    optim = getattr(optimizers, cfg.get('bow', 'optimizer'))

    model.compile(loss='binary_crossentropy',
                  optimizer=optim(lr=10**cfg.getint('bow', 'log10lr')),
                  metrics=['accuracy'])

    callback = ModelCheckpoint('Model/model.h5',
                               verbose=1,
                               save_best_only=True)

    # load validation data
    val_x, val_y = dp.load(os.path.join(base, cfg.get('data', 'dev')))
    print('dev x, y shapes:', val_x.shape, val_y.shape)

    steps = math.ceil(dp.train_size / cfg.getint('bow', 'batch'))
    print('steps per epoch:', steps)

    model.fit_generator(dp.stream(),
                        validation_data=(val_x, val_y),
                        epochs=cfg.getint('bow', 'epochs'),
                        steps_per_epoch=steps,
                        verbose=0,
                        callbacks=[callback])

    # save final model
    model.save('Model/final.h5')

    # probability for each class; (test size, num of classes)
    distribution = model.predict(val_x)

    # turn into an indicator matrix
    distribution[distribution < 0.5] = 0
    distribution[distribution >= 0.5] = 1

    f1 = f1_score(val_y, distribution, average='macro')
    p = precision_score(val_y, distribution, average='macro')
    r = recall_score(val_y, distribution, average='macro')
    print("\nmacro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))

    f1 = f1_score(val_y, distribution, average='micro')
    p = precision_score(val_y, distribution, average='micro')
    r = recall_score(val_y, distribution, average='micro')
    print("micro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
Example #4
def main(args):
    #np.random.seed(1337)
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    #loading pre-trained embedding file:
    embeddings_index = {}
    f = open(os.path.join(working_dir, 'mimic.txt'))
    values = f.readline().split()
    EMBEDDING_WORDNUM = int(values[0])
    EMBEDDING_DIM = int(values[1])
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('loaded embeddings for %s of %s words.' %
          (len(embeddings_index), EMBEDDING_WORDNUM))

    # prepare embedding matrix
    nb_words = len(provider.word2int)
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in provider.word2int.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:  # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    #train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.1, random_state=18)

    optim = RandomSearch(
        lambda: get_random_config(), lambda x, y: run_one_eval(
            x, y, train_x, train_y, maxlen, len(provider.word2int), classes,
            embedding_matrix, EMBEDDING_DIM))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)

    sys.exit(0)
Example #5
def main():
    """Driver function"""

    base = os.environ['DATA_ROOT']

    dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')),
                                 cfg.get('data', 'model_dir'),
                                 cfg.getint('args', 'max_seq_len'),
                                 cfg.get('args', 'n_files'),
                                 cfg.get('args', 'n_x1_cuis'),
                                 cfg.get('args', 'n_x2_cuis'))
    x1, x2, y = dp.load()

    print('x1 shape:', x1.shape)
    print('x2 shape:', x2.shape)
    print('y shape:', y.shape)

    train_x1, val_x1, train_x2, val_x2, train_y, val_y = train_test_split(
        x1, x2, y, test_size=cfg.getfloat('args', 'test_size'))

    # TODO: figure out what to do about negated cuis
    init_vectors = None
    if cfg.has_option('data', 'embed'):
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file, verbose=True)
        init_vectors = [w2v.select_vectors(dp.tokenizer.word_index)]

    model = get_model_concat_no_sharing(
        len(dp.tokenizer.word_index) + 1, x1.shape[1], init_vectors)

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    # save the model after every epoch
    callback = ModelCheckpoint(cfg.get('data', 'model_dir') + 'model.h5',
                               verbose=1,
                               save_best_only=True)

    model.fit([train_x1, train_x2],
              train_y,
              validation_data=([val_x1, val_x2], val_y),
              epochs=cfg.getint('dan', 'epochs'),
              batch_size=cfg.getint('dan', 'batch'),
              validation_split=0.0,
              callbacks=[callback])

    # are we training the best model?
    if cfg.getfloat('args', 'test_size') == 0:
        model.save(cfg.get('data', 'model_dir') + 'model.h5')
        exit()

    probs = model.predict([val_x1, val_x2])
    predictions = (probs > 0.5).astype(int)
    accuracy = accuracy_score(val_y, predictions)
    print('accuracy: ', accuracy)
Example #6
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
    pickle.dump(provider.word2int,
                open(os.path.join(working_dir, 'word2int.p'), "wb"))
    pickle.dump(provider.label2int,
                open(os.path.join(working_dir, 'label2int.p'), "wb"))

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    model = Sequential()
    model.add(Embedding(len(provider.word2int), 300, input_length=maxlen))
    model.add(GlobalAveragePooling1D())

    model.add(Dropout(0.25))
    model.add(Dense(1000, kernel_regularizer=regularizers.l2(0.00001)))
    model.add(Activation('relu'))

    model.add(Dropout(0.25))
    model.add(Dense(classes, kernel_regularizer=regularizers.l2(0.00001)))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.0005, rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_x,
              train_y,
              epochs=10,
              batch_size=50,
              verbose=0,
              validation_split=0.0,
              class_weight=None)

    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    sys.exit(0)
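
Because the model is persisted as a JSON architecture plus an HDF5 weight file, it can be restored later with model_from_json; a short sketch (the directory path is a placeholder):

import os
from keras.models import model_from_json

working_dir = '/path/to/working_dir'  # same directory the script wrote to
with open(os.path.join(working_dir, 'model_0.json')) as f:
    model = model_from_json(f.read())
model.load_weights(os.path.join(working_dir, 'model_0.h5'))
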
Example #7
def data_sparse():
    """Bag-of-cuis data for sparse evaluation"""

    base = os.environ['DATA_ROOT']
    train_dir = os.path.join(base, cfg.get('data', 'train'))
    test_dir = os.path.join(base, cfg.get('data', 'test'))

    # load training data
    dataset_provider = dataset.DatasetProvider(train_dir)
    x_train, y_train = dataset_provider.load_sklearn()

    # load test data
    dataset_provider = dataset.DatasetProvider(test_dir)
    x_test, y_test = dataset_provider.load_sklearn()

    # turn xs into tfidf vectors
    vectorizer = TfidfVectorizer()
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)

    return x_train.toarray(), y_train, x_test.toarray(), y_test
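
A hedged usage sketch, not part of the original: the bag-of-CUIs tf-idf matrices returned above drop straight into a linear scikit-learn model; the LinearSVC and the grid over C are illustrative assumptions.

from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

x_train, y_train, x_test, y_test = data_sparse()
search = GridSearchCV(LinearSVC(), {'C': [0.01, 0.1, 1, 10]},
                      scoring='f1_macro', cv=5)
search.fit(x_train, y_train)
predictions = search.best_estimator_.predict(x_test)
print('macro f1:', f1_score(y_test, predictions, average='macro'))
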
Example #8
def main():
    """Driver function"""

    base = os.environ['DATA_ROOT']

    dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')),
                                 cfg.get('data', 'model_dir'),
                                 cfg.get('args', 'n_x_cuis'),
                                 cfg.get('args', 'n_y_cuis'),
                                 cfg.getfloat('args', 'min_examples_per_targ'))
    x, y = dp.load()

    print('x shape:', x.shape)
    print('y shape:', y.shape)

    fixed_args = {
        'vocabulary_size': len(dp.tokenizer.word_index) + 1,
        'max_seq_len': x.shape[1],
        'n_targets': y.shape[1],
        'init_vectors': None,
        'loss': 'binary_crossentropy',
        'epochs': cfg.getint('search', 'max_epochs')
    }

    param_space = {
        'emb_dim': (512, 1024, 2048, 4096),
        'hidden': (1000, 3000, 5000, 10000),
        'activation': ('linear', 'tanh', 'relu'),
        'dropout': uniform(0, 0.75),
        'optimizer': ('RMSprop', 'Adam'),
        'log10lr': (-5, -4, -3, -2),
        'batch': (4, 8, 16, 32, 64)
    }

    config2score = rndsearch.run(make_model,
                                 fixed_args,
                                 param_space,
                                 x,
                                 y,
                                 n=cfg.getint('search', 'n'),
                                 verbose=1)

    # display configs sorted by f1
    print('\nconfigurations sorted by score:')
    sorted_by_value = sorted(config2score, key=config2score.get)
    for config in sorted_by_value:
        print('%s: %.3f' % (config, config2score[config]))

    best_config = dict(sorted_by_value[-1])
    print('best config:', best_config)
    print('best score:', config2score[sorted_by_value[-1]])
Example #9
def fine_tune():
  """Fine tuning dense vectors"""

  # load target task train and test data
  base = os.environ['DATA_ROOT']
  train_dir = os.path.join(base, cfg.get('data', 'train'))
  test_dir = os.path.join(base, cfg.get('data', 'test'))

  maxlen = get_maxlen()

  dataset_provider = dataset.DatasetProvider(
    train_dir,
    cfg.get('data', 'alphabet_pickle'))
  x_train, y_train = dataset_provider.load_keras(maxlen=maxlen)
  x_train = pad_sequences(x_train, maxlen=maxlen)

  dataset_provider = dataset.DatasetProvider(
    test_dir,
    cfg.get('data', 'alphabet_pickle'))
  x_test, y_test = dataset_provider.load_keras(maxlen=maxlen)
  x_test = pad_sequences(x_test, maxlen=maxlen)

  # train and evaluate
  model = make_model()
  epochs = cfg.getint('data', 'epochs')
  model.fit(x_train, y_train, epochs=epochs, validation_split=0.0)

  predictions = model.predict_classes(x_test)
  probs = model.predict(x_test)

  p = precision_score(y_test, predictions, average='macro')
  r = recall_score(y_test, predictions, average='macro')
  f1 = f1_score(y_test, predictions, average='macro')
  print("precision: %.3f - recall: %.3f - f1: %.3f" % (p, r, f1))

  accuracy = accuracy_score(y_test, predictions)
  roc_auc = roc_auc_score(y_test, probs)
  print("auc: %.3f - accuracy: %.3f" % (roc_auc, accuracy))
Example #10
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
    pickle.dump(provider.word2int, open(os.path.join(working_dir, 'word2int.p'),"wb"))
    pickle.dump(provider.label2int, open(os.path.join(working_dir, 'label2int.p'),"wb"))

    w2v = word2vec.Model('/home/dima/Data/Word2VecModels/mimic.txt')
    init_vectors = [w2v.select_vectors(provider.word2int)]

    model = get_model(len(provider.word2int), maxlen, init_vectors, classes)
    optimizer = RMSprop(lr=LEARN_RATE, rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_x,
              train_y,
              epochs=NUM_EPOCHS,
              batch_size=BATCH_SIZE,
              verbose=0,
              validation_split=0.0)

    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    sys.exit(0)
Example #11
import os
import sys
import ConfigParser

from sklearn.model_selection import cross_val_score
import keras as k
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.models import Model
import dataset

if __name__ == "__main__":

    cfg = ConfigParser.ConfigParser()
    cfg.read(sys.argv[1])
    base = os.environ['DATA_ROOT']
    data_dir = os.path.join(base, cfg.get('data', 'path'))

    # load target task data
    dataset = dataset.DatasetProvider(data_dir,
                                      cfg.get('data', 'alphabet_pickle'))

    x, y = dataset.load()
    # pad to same maxlen as data in source model
    x = pad_sequences(x, maxlen=cfg.getint('data', 'maxlen'))
    print('x shape (original):', x.shape)

    # make vectors for target task
    model = load_model(cfg.get('data', 'model_file'))
    interm_layer_model = Model(inputs=model.input,
                               outputs=model.get_layer('HL').output)
    x = interm_layer_model.predict(x)
    print('x shape (new):', x.shape)

    # ready for svm train/test now
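
A hedged sketch of that SVM step, reusing the cross_val_score import at the top of this example; the choice of LinearSVC and f1_macro scoring is an assumption:

    from sklearn.svm import LinearSVC

    scores = cross_val_score(LinearSVC(C=1.0), x, y, scoring='f1_macro', cv=5)
    print('macro f1: %.3f' % scores.mean())
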
Example #12
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
    pickle.dump(provider.word2int,
                open(os.path.join(working_dir, 'word2int.p'), "wb"))
    pickle.dump(provider.label2int,
                open(os.path.join(working_dir, 'label2int.p'), "wb"))

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    branches = []  # models to be merged
    train_xs = []  # train x for each branch

    for filter_len in '2,3,4,5'.split(','):
        branch = Sequential()
        branch.add(
            Embedding(len(provider.word2int),
                      300,
                      input_length=maxlen,
                      weights=None))
        branch.add(
            Convolution1D(nb_filter=200,
                          filter_length=int(filter_len),
                          border_mode='valid',
                          activation='relu',
                          subsample_length=1))
        branch.add(MaxPooling1D(pool_length=2))
        branch.add(Flatten())

        branches.append(branch)
        train_xs.append(train_x)

    model = Sequential()
    model.add(Merge(branches, mode='concat'))

    model.add(Dropout(0.25))
    model.add(Dense(300))
    model.add(Activation('relu'))

    model.add(Dropout(0.25))
    model.add(Dense(classes))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_xs,
              train_y,
              nb_epoch=4,
              batch_size=50,
              verbose=0,
              validation_split=0.1)

    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    sys.exit(0)
Example #13
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    #read in data file
#    print("Reading data...")
    #Y, X = ctk_io.read_liblinear(working_dir) # ('data_testing/multitask_assertion/train_and_test')
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training and test data
    dataset1 = dataset.DatasetProvider([data_file])
    # now load training examples and labels
    train_x, train_y = dataset1.load(data_file)

    init_vectors = None #used for pre-trained embeddings
    
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    outcomes = set(train_y)
    classes = len(outcomes)

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
    pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
    #test_x = pad_sequences(test_x, maxlen=maxlen)
    #test_y = to_categorical(np.array(test_y), classes)

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    branches = [] # models to be merged
    train_xs = [] # train x for each branch
    #test_xs = []  # test x for each branch

    filtlens = "3,4,5"
    for filter_len in filtlens.split(','):
        branch = Sequential()
        branch.add(Embedding(len(dataset1.alphabet),
                         300,
                         input_length=maxlen,
                         weights=init_vectors))
        branch.add(Convolution1D(nb_filter=200,
                             filter_length=int(filter_len),
                             border_mode='valid',
                             activation='relu',
                             subsample_length=1))
        branch.add(MaxPooling1D(pool_length=2))
        branch.add(Flatten())

        branches.append(branch)
        train_xs.append(train_x)
        #test_xs.append(test_x)
    model = Sequential()
    model.add(Merge(branches, mode='concat'))

    model.add(Dense(250))#cfg.getint('cnn', 'hidden')))
    model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout')))
    model.add(Activation('relu'))

    model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout')))
    model.add(Dense(classes))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.0001,#cfg.getfloat('cnn', 'learnrt'),
                      rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
    model.fit(train_xs,
            train_y,
            nb_epoch=3,#cfg.getint('cnn', 'epochs'),
            batch_size=50,#cfg.getint('cnn', 'batches'),
            verbose=1,
            validation_split=0.1,
            class_weight=None)

    model.summary()

    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    sys.exit(0)
Example #14
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    #read in data file
    #    print("Reading data...")
    #Y, X = ctk_io.read_liblinear(working_dir) # ('data_testing/multitask_assertion/train_and_test')
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training and test data
    dataset1 = dataset.DatasetProvider([data_file])
    # now load training examples and labels
    train_x, train_y = dataset1.load(data_file)

    init_vectors = None  #used for pre-trained embeddings

    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    outcomes = set(train_y)
    classes = len(outcomes)

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
    pickle.dump(dataset1.alphabet,
                open(os.path.join(working_dir, 'alphabet.p'), "wb"))
    #test_x = pad_sequences(test_x, maxlen=maxlen)
    #test_y = to_categorical(np.array(test_y), classes)

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    #branches = [] # models to be merged
    #train_xs = [] # train x for each branch
    #test_xs = []  # test x for each branch

    model = resnet(maxlen, dataset1.alphabet, classes)

    optimizer = RMSprop(
        lr=0.0001,  #cfg.getfloat('cnn', 'learnrt'),
        rho=0.9,
        epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(
        train_x,
        train_y,
        nb_epoch=3,  #cfg.getint('cnn', 'epochs'),
        batch_size=50,  #cfg.getint('cnn', 'batches'),
        verbose=1,
        validation_split=0.1,
        class_weight=None)

    model.summary()

    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    sys.exit(0)
Example #15
def main():
    """Driver function"""

    base = os.environ['DATA_ROOT']

    dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')),
                                 cfg.get('data', 'model_dir'),
                                 cfg.get('args', 'n_x_cuis'),
                                 cfg.get('args', 'n_y_cuis'),
                                 cfg.getfloat('args', 'min_examples_per_targ'))
    x, y = dp.load()

    print('x shape:', x.shape)
    print('y shape:', y.shape)

    # are we training the best model?
    if cfg.getfloat('args', 'test_size') != 0:
        train_x, val_x, train_y, val_y = train_test_split(
            x, y, test_size=cfg.getfloat('args', 'test_size'))
        validation_data = (val_x, val_y)
    else:
        train_x, train_y = x, y
        validation_data = None

    # TODO: figure out what to do about negated cuis
    init_vectors = None
    if cfg.has_option('data', 'embed'):
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file, verbose=True)
        init_vectors = [w2v.select_vectors(dp.tokenizer.word_index)]

    model = get_model(
        len(dp.tokenizer.word_index) + 1, x.shape[1], y.shape[1], init_vectors)

    optim = getattr(optimizers, cfg.get('dan', 'optimizer'))
    model.compile(loss='binary_crossentropy',
                  optimizer=optim(lr=10**cfg.getint('dan', 'log10lr')),
                  metrics=['accuracy'])

    # save the model after every epoch
    callback = ModelCheckpoint(cfg.get('data', 'model_dir') + 'model.h5',
                               verbose=1,
                               save_best_only=True)

    model.fit(train_x,
              train_y,
              validation_data=validation_data,
              epochs=cfg.getint('dan', 'epochs'),
              batch_size=cfg.getint('dan', 'batch'),
              validation_split=0.0,
              callbacks=[callback])

    # are we training the best model?
    if cfg.getfloat('args', 'test_size') == 0:
        model.save(cfg.get('data', 'model_dir') + 'model.h5')
        exit()

    # probability for each class; (test size, num of classes)
    distribution = model.predict(val_x)

    # turn into an indicator matrix
    distribution[distribution < 0.5] = 0
    distribution[distribution >= 0.5] = 1

    f1 = f1_score(val_y, distribution, average='macro')
    p = precision_score(val_y, distribution, average='macro')
    r = recall_score(val_y, distribution, average='macro')
    print("\nmacro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
    f1 = f1_score(val_y, distribution, average='micro')
    p = precision_score(val_y, distribution, average='micro')
    r = recall_score(val_y, distribution, average='micro')
    print("micro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
Example #16
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    #loading pre-trained embedding file:
    embeddings_index = {}
    f = open(os.path.join(working_dir, 'mimic.txt'))
    values = f.readline().split()
    EMBEDDING_WORDNUM = int(values[0])
    EMBEDDING_DIM = int(values[1])
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('loaded embeddings for %s of %s words.' %
          (len(embeddings_index), EMBEDDING_WORDNUM))

    # prepare embedding matrix
    nb_words = len(provider.word2int)
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in provider.word2int.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:  # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
    pickle.dump(provider.word2int,
                open(os.path.join(working_dir, 'word2int.p'), "wb"))
    pickle.dump(provider.label2int,
                open(os.path.join(working_dir, 'label2int.p'), "wb"))

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    LSTM_DIM = 512
    DROPOUT = 0.5

    input = Input(shape=(maxlen, ), dtype='int32')
    embed = Embedding(nb_words,
                      EMBEDDING_DIM,
                      mask_zero=True,
                      input_length=maxlen,
                      weights=[embedding_matrix],
                      trainable=True)(input)
    lstm_fw = LSTM(LSTM_DIM, dropout=DROPOUT, recurrent_dropout=DROPOUT)(embed)
    lstm_bw = LSTM(LSTM_DIM,
                   dropout=DROPOUT,
                   recurrent_dropout=DROPOUT,
                   go_backwards=True)(embed)
    cat = concatenate([lstm_fw, lstm_bw])

    #drop = Dropout(DROPOUT)(cat)
    minV = -math.sqrt(6) / math.sqrt(LSTM_DIM * 2 + classes)
    maxV = math.sqrt(6) / math.sqrt(LSTM_DIM * 2 + classes)
    randUni = RandomUniform(minval=minV, maxval=maxV, seed=None)
    out = Dense(classes,
                activation='softmax',
                kernel_initializer=randUni,
                bias_initializer='zeros')(cat)
    model = Model(inputs=[input], outputs=[out])
    #optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08)
    optimizer = Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    stopper = EarlyStopping(monitor='val_loss',
                            patience=10,
                            verbose=0,
                            mode='auto')
    model.fit(train_x,
              train_y,
              epochs=20,
              batch_size=256,
              verbose=2,
              validation_split=0.1,
              callbacks=[stopper])

    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    sys.exit(0)
Example #17
import numpy as np
import sklearn as sk
from sklearn import cross_validation  # needed so sk.cross_validation resolves
import keras as k
from keras.preprocessing import sequence
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
import dataset
import word2vec_model

NFOLDS = 10
BATCH = 50
EPOCHS = 5
CLASSES = 2
EMBDIMS = 300
MAXLEN = 55
MAXFEATURES = 18000
FILTERS = 100
FILTLEN = 4

if __name__ == "__main__":

    dataset = dataset.DatasetProvider(MAXFEATURES)
    x, y = dataset.load_data()

    # TODO: what are we doing for index 0 (oov words)?
    path = '/Users/Dima/Loyola/Data/Word2Vec/Models/GoogleNews-vectors-negative300.txt'
    word2vec = word2vec_model.Model(path)
    init_vectors = word2vec.select_vectors(dataset.alphabet)

    # turn x and y into numpy array among other things
    x = sequence.pad_sequences(x, maxlen=MAXLEN)
    y = k.utils.np_utils.to_categorical(np.array(y), CLASSES)

    scores = []
    folds = sk.cross_validation.KFold(len(y), n_folds=NFOLDS, shuffle=True)

    # todo: look at train_indices and test_indices
Example #18
    return model


if __name__ == "__main__":

    # settings file specified as command-line argument
    cfg = ConfigParser.ConfigParser()
    cfg.read(sys.argv[1])
    print_config(cfg)
    base = os.environ['DATA_ROOT']
    train_file = os.path.join(base, cfg.get('data', 'train'))
    test_file = os.path.join(base, cfg.get('data', 'test'))

    # learn alphabet from training examples
    dataset = dataset.DatasetProvider(train_file)
    # now load training examples and labels
    train_x, train_y = dataset.load(train_file)
    maxlen = max([len(seq) for seq in train_x])
    # now load test examples and labels
    test_x, test_y = dataset.load(test_file, maxlen=maxlen)

    init_vectors = None
    # TODO: what are we doing for index 0 (oov words)?
    # use pre-trained word embeddings?
    if cfg.has_option('data', 'embed'):
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file)
        init_vectors = [w2v.select_vectors(dataset.word2int)]

    # turn x and y into numpy array among other things
Example #19
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    #loading pre-trained embedding file:
    embeddings_index = {}
    f = open(os.path.join(working_dir, 'mimic.txt'))
    values = f.readline().split()
    EMBEDDING_WORDNUM = int(values[0])
    EMBEDDING_DIM = int(values[1])
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('loaded embeddings for %s of %s words.' %
          (len(embeddings_index), EMBEDDING_WORDNUM))

    # prepare embedding matrix
    nb_words = len(provider.word2int)
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in provider.word2int.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:  # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
    pickle.dump(provider.word2int,
                open(os.path.join(working_dir, 'word2int.p'), "wb"))
    pickle.dump(provider.label2int,
                open(os.path.join(working_dir, 'label2int.p'), "wb"))

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    branches = []  # models to be merged
    train_xs = []  # train x for each branch
    inflows = []  # placeholder for each input

    for filter_len in '2,5'.split(','):
        branch = Input(shape=(maxlen, ))
        embed = Embedding(len(provider.word2int),
                          EMBEDDING_DIM,
                          weights=[embedding_matrix],
                          trainable=True)(branch)
        conv = Conv1D(filters=200,
                      kernel_size=int(filter_len),
                      padding='valid',
                      activation='relu',
                      strides=1)(embed)
        pool = MaxPooling1D(pool_size=2)(conv)
        flat = Flatten()(pool)
        branches.append(flat)
        train_xs.append(train_x)
        inflows.append(branch)

    concat = concatenate(branches)

    drop1 = Dropout(0.25)(concat)
    dense = Dense(200, activation='relu')(drop1)

    drop2 = Dropout(0.25)(dense)
    out = Dense(classes, activation='softmax')(drop2)

    model = Model(inputs=inflows, outputs=out)
    #optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08)
    optimizer = Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    stopper = EarlyStopping(monitor='val_loss',
                            patience=10,
                            verbose=0,
                            mode='auto')
    model.fit(train_xs,
              train_y,
              epochs=20,
              batch_size=128,
              verbose=1,
              validation_split=0.1,
              callbacks=[stopper])

    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    sys.exit(0)
Example #20
def run(train_file, test_file, batch, epochs, embdims, filters, filtlen,
        hidden, dropout, learnrt):
    """Train/test with given parameters. Return F1."""

    np.random.seed(1337)

    print('train:', train_file)
    print('test:', test_file)
    print('batch:', batch)
    print('epochs:', epochs)
    print('embdims:', embdims)
    print('filters:', filters)
    print('filtlen:', filtlen)
    print('hidden:', hidden)
    print('dropout:', dropout)
    print('learnrt:', learnrt)

    # learn alphabet from training examples
    datset = dataset.DatasetProvider(train_file)
    # now load training examples and labels
    train_x, train_y = datset.load(train_file)
    maxlen = max([len(seq) for seq in train_x])
    # now load test examples and labels
    test_x, test_y = datset.load(test_file, maxlen=maxlen)

    # turn x and y into numpy array among other things
    classes = len(set(train_y))
    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)
    test_x = pad_sequences(test_x, maxlen=maxlen)
    test_y = to_categorical(np.array(test_y), classes)

    branches = []  # models to be merged
    train_xs = []  # train x for each branch
    test_xs = []  # test x for each branch

    for filter_len in filtlen.split(','):

        branch = Sequential()
        branch.add(
            Embedding(len(datset.word2int),
                      embdims,
                      trainable=False,
                      input_length=maxlen))
        branch.add(
            Convolution1D(nb_filter=filters,
                          filter_length=int(filter_len),
                          border_mode='valid',
                          activation='relu',
                          subsample_length=1))
        branch.add(MaxPooling1D(pool_length=2))
        branch.add(Flatten())

        branches.append(branch)
        train_xs.append(train_x)
        test_xs.append(test_x)

    model = Sequential()
    model.add(Merge(branches, mode='concat'))

    model.add(Dropout(dropout))
    model.add(Dense(hidden))
    model.add(Activation('relu'))

    model.add(Dropout(dropout))
    model.add(Dense(classes))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=learnrt, rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_xs,
              train_y,
              nb_epoch=epochs,
              batch_size=batch,
              verbose=0,
              validation_split=0.0,
              class_weight=None)

    # probability for each class; (test size, num of classes)
    distribution = \
      model.predict(test_xs, batch_size=batch)
    # class predictions; (test size,)
    predictions = np.argmax(distribution, axis=1)
    # gold labels; (test size,)
    gold = np.argmax(test_y, axis=1)

    # f1 scores
    label_f1 = f1_score(gold, predictions, average=None)

    print()
    for label, idx in datset.label2int.items():
        print('f1(%s)=%f' % (label, label_f1[idx]))

    if 'contains' in datset.label2int:
        idxs = [datset.label2int['contains'], datset.label2int['contains-1']]
        contains_f1 = f1_score(gold, predictions, labels=idxs, average='micro')
        print('\nf1(contains average) =', contains_f1)
    else:
        idxs = list(datset.label2int.values())
        average_f1 = f1_score(gold, predictions, labels=idxs, average='micro')
        print('f1(all) =', average_f1)

    print('******************************************')
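
A hypothetical invocation of run, with placeholder file paths and hyperparameter values chosen only for illustration:

run('/path/to/train.txt', '/path/to/test.txt',
    batch=50, epochs=5, embdims=300, filters=200,
    filtlen='2,3,4', hidden=300, dropout=0.25, learnrt=0.0001)
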
Example #21
def main():
  """Driver function"""

  base = os.environ['DATA_ROOT']

  dp = dataset.DatasetProvider(
    os.path.join(base, cfg.get('data', 'cuis')),
    os.path.join(base, cfg.get('data', 'codes')),
    cfg.get('args', 'max_cuis'),
    cfg.get('args', 'max_codes'))
  x, y = dp.load()

  print('x shape:', x.shape)
  print('y shape:', y.shape)

  # are we training the best model?
  if cfg.getfloat('args', 'test_size') != 0:
    train_x, val_x, train_y, val_y = train_test_split(
      x, y, test_size=cfg.getfloat('args', 'test_size'))
    validation_data = (val_x, val_y)
  else:
    train_x, train_y = x, y
    validation_data = None

  # need to add one to account for the index 0 which is not used
  model = get_model(x.shape[1], y.shape[1])
  optim = getattr(optimizers, cfg.get('bow', 'optimizer'))
  model.compile(loss='binary_crossentropy',
                optimizer=optim(lr=10**cfg.getint('bow', 'log10lr')),
                metrics=['accuracy'])

  # save the model after every epoch
  callback = ModelCheckpoint(
    cfg.get('data', 'model_dir') + 'model.h5',
    verbose=1,
    save_best_only=True)

  model.fit(train_x,
            train_y,
            validation_data=validation_data,
            epochs=cfg.getint('bow', 'epochs'),
            batch_size=cfg.getint('bow', 'batch'),
            validation_split=0.0,
            callbacks=[callback])

  # are we training the best model?
  if cfg.getfloat('args', 'test_size') == 0:
    model.save(cfg.get('data', 'model_dir') + 'model.h5')
    exit()

  # probability for each class; (test size, num of classes)
  distribution = model.predict(val_x)

  # turn into an indicator matrix
  distribution[distribution < 0.5] = 0
  distribution[distribution >= 0.5] = 1

  f1 = f1_score(val_y, distribution, average='macro')
  p = precision_score(val_y, distribution, average='macro')
  r = recall_score(val_y, distribution, average='macro')
  print("\nmacro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
  f1 = f1_score(val_y, distribution, average='micro')
  p = precision_score(val_y, distribution, average='micro')
  r = recall_score(val_y, distribution, average='micro')
  print("micro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
Example #22
  model.add(Dense(classes))
  model.add(Activation('softmax'))

  return model

if __name__ == "__main__":

  cfg = ConfigParser.ConfigParser()
  cfg.read(sys.argv[1])
  print_config(cfg)

  base = os.environ['DATA_ROOT']
  data_dir = os.path.join(base, cfg.get('data', 'path'))
  dataset = dataset.DatasetProvider(
    data_dir,
    cfg.getint('args', 'min_token_freq'))
  x, y = dataset.load()

  classes = len(dataset.label2int)
  maxlen = max([len(seq) for seq in x])
  x = pad_sequences(x, maxlen=maxlen)
  y = to_categorical(y, classes)
  print('x shape:', x.shape)
  print('y shape:', y.shape)
  print('number of features:', len(dataset.token2int))

  f1_scores = []
  kf = KFold(n_splits=5, shuffle=True, random_state=100)
  for train_indices, test_indices in kf.split(x):
Example #23
import os
import sys
import ConfigParser
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers import GlobalAveragePooling1D
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
import dataset

if __name__ == "__main__":

  cfg = ConfigParser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  train_dir = os.path.join(base, cfg.get('data', 'train'))
  code_file = os.path.join(base, cfg.get('data', 'codes'))

  dataset = dataset.DatasetProvider(train_dir, code_file)
  x, y = dataset.load()
  train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.20)
  maxlen = max([len(seq) for seq in train_x])

  # turn x into numpy array among other things
  classes = len(dataset.code2int)
  train_x = pad_sequences(train_x, maxlen=maxlen)
  test_x = pad_sequences(test_x, maxlen=maxlen)
  train_y = np.array(train_y)
  test_y = np.array(test_y)
  print('train_x shape:', train_x.shape)
  print('train_y shape:', train_y.shape)
  print('test_x shape:', test_x.shape)
  print('test_y shape:', test_y.shape)
  print('unique features:', len(dataset.token2int))
Example #24

if __name__ == "__main__":

    # fyi this is a global variable now
    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])

    base = os.environ['DATA_ROOT']
    train_dir = os.path.join(base, cfg.get('data', 'train'))
    code_file = os.path.join(base, cfg.get('data', 'codes'))

    provider = dataset.DatasetProvider(train_dir,
                                       code_file,
                                       cfg.getint('args', 'min_token_freq'),
                                       cfg.getint('args',
                                                  'max_tokens_in_file'),
                                       cfg.getint('args',
                                                  'min_examples_per_code'),
                                       use_cuis=False)
    x, y = provider.load(tokens_as_set=False)

    maxlen = max([len(seq) for seq in x])
    x = pad_sequences(x, maxlen=maxlen)
    y = np.array(y)

    print('x shape:', x.shape)
    print('y shape:', y.shape)
    print('max seq len:', maxlen)
    print('vocab size:', x.max() + 1)
    print('number of features:', len(provider.token2int))
    print('number of labels:', len(provider.code2int))
Example #25
import sklearn as sk
from sklearn import cross_validation  # needed so sk.cross_validation resolves
import keras as k
import dataset

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU

NFOLDS = 10
BATCH = 50
EPOCHS = 5
EMBDIMS = 300

if __name__ == "__main__":

    dataset = dataset.DatasetProvider()
    x, y = dataset.load()
    print('x shape:', x.shape)
    print('y shape:', y.shape)

    scores = []
    folds = sk.cross_validation.KFold(len(y), n_folds=NFOLDS, shuffle=True)

    for fold_num, (train_indices, test_indices) in enumerate(folds):
        train_x = x[train_indices]
        train_y = y[train_indices]
        test_x = x[test_indices]
        test_y = y[test_indices]

        model = k.models.Sequential()
        model.add(LSTM(128, input_length=205845, input_dim=300))
Example #26
import numpy as np
import keras as k
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Merge, LSTM
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
import dataset
import ConfigParser

if __name__ == "__main__":

    cfg = ConfigParser.ConfigParser()
    cfg.read('settings.ini')

    # learn alphabet from training data
    dataset = dataset.DatasetProvider(
        [cfg.get('data', 'train'),
         cfg.get('data', 'test')])
    # now load training examples and labels
    train_x, train_y = dataset.load(cfg.get('data', 'train'))
    # now load test examples and labels
    test_x, test_y = dataset.load(cfg.get('data', 'test'))

    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x + test_x])
    classes = len(set(train_y))
    train_x = sequence.pad_sequences(train_x, maxlen=maxlen)
    train_y = k.utils.np_utils.to_categorical(np.array(train_y), classes)
    test_x = sequence.pad_sequences(test_x, maxlen=maxlen)
    test_y = k.utils.np_utils.to_categorical(np.array(test_y), classes)

    print('train_x shape:', train_x.shape)
Example #27
    # settings file specified as command-line argument
    cfg = ConfigParser.ConfigParser()
    cfg.read(sys.argv[1])
    print('train:', cfg.get('data', 'train'))
    print('test:', cfg.get('data', 'test'))
    print('batch:', cfg.get('cnn', 'batch'))
    print('epochs:', cfg.get('cnn', 'epochs'))
    print('embdims:', cfg.get('cnn', 'embdims'))
    print('filters:', cfg.get('cnn', 'filters'))
    print('filtlen:', cfg.get('cnn', 'filtlen'))
    print('hidden:', cfg.get('cnn', 'hidden'))
    print('dropout:', cfg.get('cnn', 'dropout'))
    print('learnrt:', cfg.get('cnn', 'learnrt'))

    # learn alphabets from training examples
    dataset = dataset.DatasetProvider(cfg.get('data', 'train'))
    # now load training examples and labels
    train_x1, train_x2, train_y = dataset.load(cfg.get('data', 'train'))
    maxlen = max([len(seq) for seq in train_x1])
    # now load test examples and labels
    test_x1, test_x2, test_y = dataset.load(cfg.get('data', 'test'),
                                            maxlen=maxlen)

    init_vectors = None
    # TODO: what are we doing for index 0 (oov words)?
    # use pre-trained word embeddings?
    if cfg.has_option('data', 'embed'):
        print('embeddings:', cfg.get('data', 'embed'))
        word2vec = word2vec_model.Model(cfg.get('data', 'embed'))
        init_vectors = [word2vec.select_vectors(dataset.word2int)]