Example #1
def main():
  """Driver function"""

  base = os.environ['DATA_ROOT']
  dataset = TransferDataset(
    os.path.join(base, cfg.get('data', 'train')),
    os.path.join(base, cfg.get('data', 'codes')),
    os.path.join(base, cfg.get('data', 'targets')),
    cfg.getint('args', 'min_token_freq'),
    cfg.getint('args', 'max_tokens_in_file'),
    cfg.getint('args', 'min_examples_per_code'),
    cfg.getboolean('args', 'collapse_codes'))
  x, y = dataset.load()
  train_x, val_x, train_y, val_y = train_test_split(
    x,
    y,
    test_size=cfg.getfloat('args', 'test_size'))
  maxlen = max([len(seq) for seq in train_x])

  init_vectors = None
  if cfg.has_option('data', 'embed'):
    embed_file = os.path.join(base, cfg.get('data', 'embed'))
    w2v = word2vec.Model(embed_file)
    init_vectors = [w2v.select_vectors(dataset.token2int)]

  # pad sequences to a fixed length and convert x and y to numpy arrays
  train_x = pad_sequences(train_x, maxlen=maxlen)
  val_x = pad_sequences(val_x, maxlen=maxlen)
  train_y = np.array(train_y)
  val_y = np.array(val_y)

  print('train_x shape:', train_x.shape)
  print('train_y shape:', train_y.shape)
  print('val_x shape:', val_x.shape)
  print('val_y shape:', val_y.shape)
  print('number of features:', len(dataset.token2int))
  print('positive examples:', sum(y))
  print('negative examples:', len(y) - sum(y))

  model = get_model(init_vectors, len(dataset.token2int), maxlen)
  op = getattr(keras.optimizers, cfg.get('dan', 'optimizer'))
  model.compile(loss='binary_crossentropy',
                optimizer=op(lr=10**cfg.getfloat('dan', 'log10lr')),
                metrics=['accuracy'])
  model.fit(train_x,
            train_y,
            validation_data=(val_x, val_y) if val_x.shape[0] > 0 else None,
            epochs=cfg.getint('dan', 'epochs'),
            batch_size=cfg.getint('dan', 'batch'))

  model.save(MODEL_FILE)

  # do we need to evaluate?
  if cfg.getfloat('args', 'test_size') == 0:
    exit()

  predictions = model.predict_classes(val_x)
  report_results(val_y, predictions, 'macro')
  report_results(val_y, predictions, 'micro')
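
The snippets above read every setting from an INI file that is passed on the command line and parsed with configparser (Example #2 shows the parsing). Below is a minimal, purely illustrative sketch of such a file, using only the section and option names that appear in the cfg calls above; every value and path is made up.

import configparser

sample_ini = """
[data]
train = train/
codes = codes.txt
targets = targets.txt
embed = embeddings/word2vec.txt

[args]
min_token_freq = 100
max_tokens_in_file = 10000
min_examples_per_code = 100
collapse_codes = no
test_size = 0.2

[dan]
optimizer = RMSprop
log10lr = -3
epochs = 25
batch = 32
"""

cfg = configparser.ConfigParser()
cfg.read_string(sample_ini)
print(cfg.getfloat('args', 'test_size'))         # -> 0.2
print(cfg.getboolean('args', 'collapse_codes'))  # -> False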
Example #2
def main():
  """Driver function"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']

  # load x and y and split
  dataset = TransferDataset(
    os.path.join(base, cfg.get('data', 'train')),
    os.path.join(base, cfg.get('data', 'codes')),
    os.path.join(base, cfg.get('data', 'targets')),
    cfg.getint('args', 'min_token_freq'),
    cfg.getint('args', 'max_tokens_in_file'),
    cfg.getint('args', 'min_examples_per_code'),
    cfg.getboolean('args', 'collapse_codes'))
  x, y = dataset.load()
  x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2)
  max_len = max([len(seq) for seq in x_train])

  # load pretrained embeddings
  init_vectors = None
  if cfg.has_option('data', 'embed'):
    embed_file = os.path.join(base, cfg.get('data', 'embed'))
    w2v = word2vec.Model(embed_file)
    init_vectors = [w2v.select_vectors(dataset.token2int)]

  # pad sequences to a fixed length and convert x and y to numpy arrays
  x_train = pad_sequences(x_train, maxlen=max_len)
  x_val = pad_sequences(x_val, maxlen=max_len)
  y_train = np.array(y_train)
  y_val = np.array(y_val)

  fixed_args = {
    'num_features': len(dataset.token2int),
    'emb_dims': cfg.getint('search', 'emb_dim'),
    'seq_len': max_len,
    'init_vectors': init_vectors
  }
  param_space = make_param_space()

  results = rndsearch.run(
    make_model,
    fixed_args,
    param_space,
    x_train,
    y_train,
    x_val,
    y_val,
    cfg.getint('search', 'n'))

  # display configs sorted by f1
  print('\nconfigurations sorted by score:')
  sorted_by_value = sorted(results, key=results.get)
  for config in sorted_by_value:
    print('%s: %.3f' % (config, results[config]))
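
make_param_space and rndsearch.run are not shown in these snippets, so the following is only a hypothetical sketch of the kind of search space the former might return; the hyperparameter names are borrowed from the config dict that run_one_eval reads in Example #7 (embed, optimizer, lr, batch), and the candidate values are illustrative.

def make_param_space():
  """Hypothetical search space; names and candidate values are illustrative"""

  return {
    'embed': [True, False],            # whether to load pretrained vectors
    'optimizer': ['rmsprop', 'adam'],
    'lr': [0.01, 0.001, 0.0001],
    'batch': [16, 32, 64],
  }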
Example #3
def main():
    """Driver function"""

    base = os.environ['DATA_ROOT']

    dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')),
                                 cfg.get('data', 'model_dir'),
                                 cfg.getint('args', 'max_seq_len'),
                                 cfg.get('args', 'n_files'),
                                 cfg.get('args', 'n_x1_cuis'),
                                 cfg.get('args', 'n_x2_cuis'))
    x1, x2, y = dp.load()

    print('x1 shape:', x1.shape)
    print('x2 shape:', x2.shape)
    print('y shape:', y.shape)

    train_x1, val_x1, train_x2, val_x2, train_y, val_y = train_test_split(
        x1, x2, y, test_size=cfg.getfloat('args', 'test_size'))

    # TODO: figure out what to do about negated cuis
    init_vectors = None
    if cfg.has_option('data', 'embed'):
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file, verbose=True)
        init_vectors = [w2v.select_vectors(dp.tokenizer.word_index)]

    model = get_model_concat_no_sharing(
        len(dp.tokenizer.word_index) + 1, x1.shape[1], init_vectors)

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    # save the model after every epoch
    callback = ModelCheckpoint(cfg.get('data', 'model_dir') + 'model.h5',
                               verbose=1,
                               save_best_only=True)

    model.fit([train_x1, train_x2],
              train_y,
              validation_data=([val_x1, val_x2], val_y),
              epochs=cfg.getint('dan', 'epochs'),
              batch_size=cfg.getint('dan', 'batch'),
              validation_split=0.0,
              callbacks=[callback])

    # are we training the best model?
    if cfg.getfloat('args', 'test_size') == 0:
        model.save(cfg.get('data', 'model_dir') + 'model.h5')
        exit()

    probs = model.predict([val_x1, val_x2])
    predictions = (probs > 0.5).astype(int)
    accuracy = accuracy_score(val_y, predictions)
    print('accuracy: ', accuracy)
Example #4
def get_embeddings(cfg, token2int):
    """Initial weights for embedding layer"""

    init_vectors = None
    base = os.environ['DATA_ROOT']

    if cfg.has_option('data', 'embed'):
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file)
        init_vectors = [w2v.select_vectors(token2int)]

    return init_vectors
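
A minimal usage sketch for get_embeddings, assuming DATA_ROOT is set in the environment, cfg is a configparser.ConfigParser whose [data] section may contain an embed option, and token2int maps tokens to integer ids; the config path and the toy vocabulary below are made up.

import configparser

cfg = configparser.ConfigParser()
cfg.read('dan.ini')                        # hypothetical config file
token2int = {'<oov>': 0, 'aspirin': 1}     # toy vocabulary

init_vectors = get_embeddings(cfg, token2int)
if init_vectors is not None:
    print('embedding matrix shape:', init_vectors[0].shape)
else:
    print('no pretrained embeddings configured; the layer starts from random weights')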
Example #5
def make_model(kernel_size, hidden_size, dropout):
  """Creating a model for sklearn"""

  print('\n')
  print('kernel_size:', kernel_size)
  print('hidden_size:', hidden_size)
  print('dropout:', dropout)
  print()

  init_vectors = None
  if cfg.has_option('data', 'embed'):
    embed_file = os.path.join(base, cfg.get('data', 'embed'))
    w2v = word2vec.Model(embed_file)
    init_vectors = [w2v.select_vectors(dataset.word2int)]

  model = Sequential()
  model.add(Embedding(len(dataset.word2int),
                      cfg.getint('cnn', 'embdims'),
                      input_length=maxlen,
                      trainable=True,
                      weights=init_vectors))
  model.add(Conv1D(filters=cfg.getint('cnn', 'filters'),
                   kernel_size=kernel_size,
                   activation='relu'))
  model.add(GlobalMaxPooling1D())

  model.add(Dropout(dropout))
  model.add(Dense(hidden_size))
  model.add(Activation('relu'))

  model.add(Dropout(dropout))
  model.add(Dense(classes))
  model.add(Activation('softmax'))

  optimizer = RMSprop(lr=cfg.getfloat('cnn', 'learnrt'),
                      rho=0.9, epsilon=1e-08)
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])

  return model
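
Because make_model takes its hyperparameters as plain arguments ("a model for sklearn"), it can be wrapped for a scikit-learn search. A hedged sketch of that wiring with keras.wrappers.scikit_learn.KerasClassifier follows; the candidate values and the train_x/train_y names are illustrative.

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

# wrap make_model so scikit-learn can rebuild it with sampled hyperparameters
clf = KerasClassifier(build_fn=make_model, epochs=3, batch_size=50, verbose=0)
param_dist = {
  'kernel_size': [3, 4, 5],
  'hidden_size': [100, 300, 500],
  'dropout': [0.1, 0.25, 0.5],
}
search = RandomizedSearchCV(clf, param_dist, n_iter=5, cv=3)
# search.fit(train_x, train_y) would then evaluate the sampled configurations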
Example #6
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
    pickle.dump(provider.word2int,
                open(os.path.join(working_dir, 'word2int.p'), "wb"))
    pickle.dump(provider.label2int,
                open(os.path.join(working_dir, 'label2int.p'), "wb"))

    w2v = word2vec.Model('/home/dima/Data/Word2VecModels/mimic.txt')
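    # pretrained vectors for the training vocabulary, used to initialize the embedding layer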
    init_vectors = [w2v.select_vectors(provider.word2int)]

    model = get_model(len(provider.word2int), maxlen, init_vectors, classes)
    optimizer = RMSprop(lr=LEARN_RATE, rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_x,
              train_y,
              epochs=NUM_EPOCHS,
              batch_size=BATCH_SIZE,
              verbose=0,
              validation_split=0.0)

    json_string = model.to_json()
    with open(os.path.join(working_dir, 'model_0.json'), 'w') as json_file:
        json_file.write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    sys.exit(0)
Example #7
    def run_one_eval(self, train_x, train_y, valid_x, valid_y, epochs, config):
        """A single eval"""

        print(config)

        init_vectors = None
        if config['embed']:
            embed_file = os.path.join(base, cfg.get('data', 'embed'))
            w2v = word2vec.Model(embed_file)
            init_vectors = [w2v.select_vectors(provider.token2int)]

        vocab_size = train_x.max() + 1
        input_length = train_x.shape[1]  # train_x is already padded to a fixed length
        output_units = train_y.shape[1]

        model = self.get_model(init_vectors, vocab_size, input_length,
                               output_units, config)
        model.compile(loss='binary_crossentropy',
                      optimizer=self.get_optimizer(config['optimizer'],
                                                   config['lr']),
                      metrics=['accuracy'])
        model.fit(train_x,
                  train_y,
                  epochs=epochs,
                  batch_size=config['batch'],
                  validation_split=0.0,
                  verbose=0)

        # probability for each class; (test size, num of classes)
        # batch_size needed because large batches cause OOM
        distribution = model.predict(valid_x, batch_size=8)

        # turn into an indicator matrix
        distribution[distribution < 0.5] = 0
        distribution[distribution >= 0.5] = 1

        f1 = f1_score(valid_y, distribution, average='macro')
        print('f1: %.3f after %d epochs\n' % (f1, epochs))
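        # return a loss (1 - f1) so the caller can minimize it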

        return 1 - f1
Example #8
    dataset = dataset.DatasetProvider(
        train_dir, code_file, cfg.getint('args', 'min_token_freq'),
        cfg.getint('args', 'max_tokens_in_file'),
        cfg.getint('args', 'min_examples_per_code'))
    x, y = dataset.load()
    train_x, val_x, train_y, val_y = train_test_split(
        x, y, test_size=cfg.getfloat('args', 'test_size'))
    maxlen = max([len(seq) for seq in train_x])

    init_vectors = None
    if cfg.has_option('data', 'embed'):
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file)
        init_vectors = [w2v.select_vectors(dataset.token2int)]

    # count labels, pad sequences to a fixed length, and convert to numpy arrays
    classes = len(dataset.code2int)
    train_x = pad_sequences(train_x, maxlen=maxlen)
    val_x = pad_sequences(val_x, maxlen=maxlen)
    train_y = np.array(train_y)
    val_y = np.array(val_y)

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)
    print('val_x shape:', val_x.shape)
    print('val_y shape:', val_y.shape)
    print('number of features:', len(dataset.token2int))
    print('number of labels:', len(dataset.code2int))
Example #9
def main():
    """Driver function"""

    base = os.environ['DATA_ROOT']

    dp = dataset_dan.DatasetProvider(
        os.path.join(base, cfg.get('data', 'train')),
        cfg.get('data', 'model_dir'), cfg.getint('args', 'n_examples'))
    x, y = dp.load()

    print('x shape:', x.shape)
    print('y shape:', y.shape)

    # hold out a validation set unless we are training the final model on all data
    if cfg.getfloat('args', 'test_size') != 0:
        train_x, val_x, train_y, val_y = train_test_split(
            x, y, test_size=cfg.getfloat('args', 'test_size'))
        validation_data = (val_x, val_y)
    else:
        train_x, train_y = x, y
        validation_data = None

    # TODO: figure out what to do about negated cuis
    init_vectors = None
    if cfg.has_option('data', 'embed'):
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file, verbose=True)
        init_vectors = [w2v.select_vectors(dp.tokenizer.word_index)]

    model = get_model(
        len(dp.tokenizer.word_index) + 1, x.shape[1], y.shape[1], init_vectors)

    optim = getattr(optimizers, cfg.get('dan', 'optimizer'))
    model.compile(loss='binary_crossentropy',
                  optimizer=optim(lr=10**cfg.getfloat('dan', 'log10lr')),
                  metrics=['accuracy'])

    # save the model after every epoch
    callback = ModelCheckpoint(cfg.get('data', 'model_dir') + 'model.h5',
                               verbose=1,
                               save_best_only=True)

    model.fit(train_x,
              train_y,
              validation_data=validation_data,
              epochs=cfg.getint('dan', 'epochs'),
              batch_size=cfg.getint('dan', 'batch'),
              validation_split=0.0,
              callbacks=[callback])

    # are we training the best model?
    if cfg.getfloat('args', 'test_size') == 0:
        model.save(cfg.get('data', 'model_dir') + 'model.h5')
        exit()

    # probability for each class; (test size, num of classes)
    distribution = model.predict(val_x)

    # turn into an indicator matrix
    distribution[distribution < 0.5] = 0
    distribution[distribution >= 0.5] = 1

    f1 = f1_score(val_y, distribution, average='macro')
    p = precision_score(val_y, distribution, average='macro')
    r = recall_score(val_y, distribution, average='macro')
    print("\nmacro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
    f1 = f1_score(val_y, distribution, average='micro')
    p = precision_score(val_y, distribution, average='micro')
    r = recall_score(val_y, distribution, average='micro')
    print("micro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
Example #10
    'embed_size': 256,
    'hid_size': 256,
    'neg_samples': batch_size * 2,
    'learn_rate': 0.01,
    'momentum': 0.9,
    'embed_noise': 0.1,
    'hid_noise': 0.3,
    'epoch': 10,
    'optimizer': 'Momentum',
}

split = round(X.shape[0] * 0.9)
train_X, train_Y = X[:split, :], Y[:split, :]
test_X, test_Y = X[split:, :], Y[split:, :]

model = word2vec.Model(graph_params)
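# here word2vec.Model builds a trainable embedding model (note the .train call below),
# not the pretrained-vector loader used in the earlier examples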
print('model built, vocab size %d, document length %d' %
      (np.max(X) + 1, len(word_array)))

embed_weights, nce_weights = model.train(
    train_X,
    train_Y,
    test_X,
    test_Y,
    graph_params['epoch'],
    graph_params['batch_size'],
)

import pickle

with open('word2vec-wiki-256.p', 'wb') as fopen:
Example #11
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
    pickle.dump(provider.word2int,
                open(os.path.join(working_dir, 'word2int.p'), "wb"))
    pickle.dump(provider.label2int,
                open(os.path.join(working_dir, 'label2int.p'), "wb"))

    w2v = word2vec.Model('/home/dima/Data/Word2VecModels/mimic.txt')
    init_vectors = [w2v.select_vectors(provider.word2int)]

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    model = Sequential()
    model.add(
        Embedding(len(provider.word2int),
                  300,
                  input_length=maxlen,
                  trainable=True,
                  weights=init_vectors))
    model.add(Conv1D(filters=200, kernel_size=5, activation='relu'))
    model.add(GlobalMaxPooling1D())

    model.add(Dropout(0.25))
    model.add(Dense(300))
    model.add(Activation('relu'))

    model.add(Dropout(0.25))
    model.add(Dense(classes))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_x,
              train_y,
              epochs=4,
              batch_size=50,
              verbose=0,
              validation_split=0.0)

    json_string = model.to_json()
    with open(os.path.join(working_dir, 'model_0.json'), 'w') as json_file:
        json_file.write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    sys.exit(0)