def main(): """Driver function""" base = os.environ['DATA_ROOT'] dataset = TransferDataset( os.path.join(base, cfg.get('data', 'train')), os.path.join(base, cfg.get('data', 'codes')), os.path.join(base, cfg.get('data', 'targets')), cfg.getint('args', 'min_token_freq'), cfg.getint('args', 'max_tokens_in_file'), cfg.getint('args', 'min_examples_per_code'), cfg.getboolean('args', 'collapse_codes')) x, y = dataset.load() train_x, val_x, train_y, val_y = train_test_split( x, y, test_size=cfg.getfloat('args', 'test_size')) maxlen = max([len(seq) for seq in train_x]) init_vectors = None if cfg.has_option('data', 'embed'): embed_file = os.path.join(base, cfg.get('data', 'embed')) w2v = word2vec.Model(embed_file) init_vectors = [w2v.select_vectors(dataset.token2int)] # turn x into numpy array among other things train_x = pad_sequences(train_x, maxlen=maxlen) val_x = pad_sequences(val_x, maxlen=maxlen) train_y = np.array(train_y) val_y = np.array(val_y) print('train_x shape:', train_x.shape) print('train_y shape:', train_y.shape) print('val_x shape:', val_x.shape) print('val_y shape:', val_y.shape) print('number of features:', len(dataset.token2int)) print('positive examples:', sum(y)) print('negative examples:', len(y) - sum(y)) model = get_model(init_vectors, len(dataset.token2int), maxlen) op = getattr(keras.optimizers, cfg.get('dan', 'optimizer')) model.compile(loss='binary_crossentropy', optimizer=op(lr=10**cfg.getfloat('dan', 'log10lr')), metrics=['accuracy']) model.fit(train_x, train_y, validation_data=(val_x, val_y) if val_x.shape[0]>0 else None, epochs=cfg.getint('dan', 'epochs'), batch_size=cfg.getint('dan', 'batch')) model.save(MODEL_FILE) # do we need to evaluate? if cfg.getfloat('args', 'test_size') == 0: exit() predictions = model.predict_classes(val_x) report_results(val_y, predictions, 'macro') report_results(val_y, predictions, 'micro')
def main(): """Driver function""" cfg = configparser.ConfigParser() cfg.read(sys.argv[1]) base = os.environ['DATA_ROOT'] # load x and y and split dataset = TransferDataset( os.path.join(base, cfg.get('data', 'train')), os.path.join(base, cfg.get('data', 'codes')), os.path.join(base, cfg.get('data', 'targets')), cfg.getint('args', 'min_token_freq'), cfg.getint('args', 'max_tokens_in_file'), cfg.getint('args', 'min_examples_per_code'), cfg.getboolean('args', 'collapse_codes')) x, y = dataset.load() x_train, x_val, y_train, y_val = train_test_split( x, y, test_size=0.2) max_len = max([len(seq) for seq in x_train]) # load pretrained embeddings init_vectors = None if cfg.has_option('data', 'embed'): embed_file = os.path.join(base, cfg.get('data', 'embed')) w2v = word2vec.Model(embed_file) init_vectors = [w2v.select_vectors(dataset.token2int)] # turn x into numpy array among other things x_train = pad_sequences(x_train, maxlen=max_len) x_val = pad_sequences(x_val, maxlen=max_len) y_train = np.array(y_train) y_val = np.array(y_val) fixed_args = { 'num_features': len(dataset.token2int), 'emb_dims': cfg.getint('search', 'emb_dim'), 'seq_len': max_len, 'init_vectors': init_vectors } param_space = make_param_space() results = rndsearch.run( make_model, fixed_args, param_space, x_train, y_train, x_val, y_val, cfg.getint('search', 'n')) # display configs sorted by f1 print('\nconfigurations sorted by score:') sorted_by_value = sorted(results, key=results.get) for config in sorted_by_value: print('%s: %.3f' % (config, results[config]))
def main(): """Driver function""" base = os.environ['DATA_ROOT'] dp = dataset.DatasetProvider(os.path.join(base, cfg.get('data', 'train')), cfg.get('data', 'model_dir'), cfg.getint('args', 'max_seq_len'), cfg.get('args', 'n_files'), cfg.get('args', 'n_x1_cuis'), cfg.get('args', 'n_x2_cuis')) x1, x2, y = dp.load() print('x1 shape:', x1.shape) print('x2 shape:', x2.shape) print('y shape:', y.shape) train_x1, val_x1, train_x2, val_x2, train_y, val_y = train_test_split( x1, x2, y, test_size=cfg.getfloat('args', 'test_size')) # TODO: figure out what to do about negated cuis init_vectors = None if cfg.has_option('data', 'embed'): embed_file = os.path.join(base, cfg.get('data', 'embed')) w2v = word2vec.Model(embed_file, verbose=True) init_vectors = [w2v.select_vectors(dp.tokenizer.word_index)] model = get_model_concat_no_sharing( len(dp.tokenizer.word_index) + 1, x1.shape[1], init_vectors) model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy']) # save the model after every epoch callback = ModelCheckpoint(cfg.get('data', 'model_dir') + 'model.h5', verbose=1, save_best_only=True) model.fit([train_x1, train_x2], train_y, validation_data=([val_x1, val_x2], val_y), epochs=cfg.getint('dan', 'epochs'), batch_size=cfg.getint('dan', 'batch'), validation_split=0.0, callbacks=[callback]) # are we training the best model? if cfg.getfloat('args', 'test_size') == 0: model.save(cfg.get('data', 'model_dir') + 'model.h5') exit() probs = model.predict([val_x1, val_x2]) predictions = (probs > 0.5).astype(int) accuracy = accuracy_score(val_y, predictions) print('accuracy: ', accuracy)
def get_embeddings(cfg, token2int):
    """Initial weights for embedding layer"""

    init_vectors = None
    base = os.environ['DATA_ROOT']

    if cfg.has_option('data', 'embed'):
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file)
        init_vectors = [w2v.select_vectors(token2int)]

    return init_vectors
def make_model(kernel_size, hidden_size, dropout):
    """Creating a model for sklearn"""

    print('\n')
    print('kernel_size:', kernel_size)
    print('hidden_size:', hidden_size)
    print('dropout:', dropout)
    print()

    init_vectors = None
    if cfg.has_option('data', 'embed'):
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file)
        init_vectors = [w2v.select_vectors(dataset.word2int)]

    model = Sequential()
    model.add(Embedding(len(dataset.word2int),
                        cfg.getint('cnn', 'embdims'),
                        input_length=maxlen,
                        trainable=True,
                        weights=init_vectors))
    model.add(Conv1D(filters=cfg.getint('cnn', 'filters'),
                     kernel_size=kernel_size,
                     activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(dropout))
    model.add(Dense(hidden_size))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    model.add(Dense(classes))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=cfg.getfloat('cnn', 'learnrt'),
                        rho=0.9,
                        epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    return model
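# The docstring above says make_model() is "for sklearn": it builds a fresh,
# compiled model from only the tunable arguments, which is the shape expected by
# the old keras.wrappers.scikit_learn wrapper. A sketch of one way it could be
# plugged into a scikit-learn search (illustrative only; it assumes the
# module-level cfg, base, dataset, maxlen and classes used above are already
# set, and the parameter ranges here are made up):
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

clf = KerasClassifier(build_fn=make_model, epochs=3, batch_size=50, verbose=0)
param_distributions = {
    'kernel_size': [3, 4, 5],
    'hidden_size': [100, 300, 500],
    'dropout': [0.25, 0.5],
}
search = RandomizedSearchCV(clf, param_distributions, n_iter=5, cv=3)
# search.fit(train_x, train_y)  # with train_x/train_y prepared as in the drivers above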
def main(args): if len(args) < 1: sys.stderr.write("Error - one required argument: <data directory>\n") sys.exit(-1) working_dir = args[0] data_file = os.path.join(working_dir, 'training-data.liblinear') # learn alphabet from training data provider = dataset.DatasetProvider(data_file) # now load training examples and labels train_x, train_y = provider.load(data_file) # turn x and y into numpy array among other things maxlen = max([len(seq) for seq in train_x]) classes = len(set(train_y)) train_x = pad_sequences(train_x, maxlen=maxlen) train_y = to_categorical(np.array(train_y), classes) pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb")) pickle.dump(provider.word2int, open(os.path.join(working_dir, 'word2int.p'),"wb")) pickle.dump(provider.label2int, open(os.path.join(working_dir, 'label2int.p'),"wb")) w2v = word2vec.Model('/home/dima/Data/Word2VecModels/mimic.txt') init_vectors = [w2v.select_vectors(provider.word2int)] model = get_model(len(provider.word2int), maxlen, init_vectors, classes) optimizer = RMSprop(lr=LEARN_RATE, rho=0.9, epsilon=1e-08) model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) model.fit(train_x, train_y, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, verbose=0, validation_split=0.0) json_string = model.to_json() open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string) model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True) sys.exit(0)
def run_one_eval(self, train_x, train_y, valid_x, valid_y, epochs, config):
    """A single eval"""

    print(config)

    init_vectors = None
    if config['embed']:
        embed_file = os.path.join(base, cfg.get('data', 'embed'))
        w2v = word2vec.Model(embed_file)
        init_vectors = [w2v.select_vectors(provider.token2int)]

    vocab_size = train_x.max() + 1
    input_length = train_x.shape[1]  # train_x is already padded to a fixed length
    output_units = train_y.shape[1]

    model = self.get_model(init_vectors, vocab_size, input_length, output_units, config)
    model.compile(loss='binary_crossentropy',
                  optimizer=self.get_optimizer(config['optimizer'], config['lr']),
                  metrics=['accuracy'])
    model.fit(train_x,
              train_y,
              epochs=epochs,
              batch_size=config['batch'],
              validation_split=0.0,
              verbose=0)

    # probability for each class; (test size, num of classes)
    # batch_size needed because large batches cause OOM
    distribution = model.predict(valid_x, batch_size=8)

    # turn into an indicator matrix
    distribution[distribution < 0.5] = 0
    distribution[distribution >= 0.5] = 1

    f1 = f1_score(valid_y, distribution, average='macro')
    print('f1: %.3f after %d epochs\n' % (f1, epochs))

    return 1 - f1
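# self.get_optimizer() is not shown in this excerpt. Other drivers in this
# collection resolve optimizers with getattr(keras.optimizers, name), so a
# plausible sketch (an assumption, not the original implementation) is:
import keras

def get_optimizer(name, lr):
    """Instantiate a Keras optimizer by class name with the given learning rate."""
    return getattr(keras.optimizers, name)(lr=lr)

# e.g. get_optimizer('RMSprop', 0.001) or get_optimizer('Adam', 10 ** -3)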
dataset = dataset.DatasetProvider(
    train_dir,
    code_file,
    cfg.getint('args', 'min_token_freq'),
    cfg.getint('args', 'max_tokens_in_file'),
    cfg.getint('args', 'min_examples_per_code'))
x, y = dataset.load()
train_x, val_x, train_y, val_y = train_test_split(
    x, y, test_size=cfg.getfloat('args', 'test_size'))
maxlen = max([len(seq) for seq in train_x])

init_vectors = None
if cfg.has_option('data', 'embed'):
    embed_file = os.path.join(base, cfg.get('data', 'embed'))
    w2v = word2vec.Model(embed_file)
    init_vectors = [w2v.select_vectors(dataset.token2int)]

# turn x into numpy array among other things
classes = len(dataset.code2int)
train_x = pad_sequences(train_x, maxlen=maxlen)
val_x = pad_sequences(val_x, maxlen=maxlen)
train_y = np.array(train_y)
val_y = np.array(val_y)

print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)
print('val_x shape:', val_x.shape)
print('val_y shape:', val_y.shape)
print('number of features:', len(dataset.token2int))
print('number of labels:', len(dataset.code2int))
def main(): """Driver function""" base = os.environ['DATA_ROOT'] dp = dataset_dan.DatasetProvider( os.path.join(base, cfg.get('data', 'train')), cfg.get('data', 'model_dir'), cfg.getint('args', 'n_examples')) x, y = dp.load() print('x shape:', x.shape) print('y shape:', y.shape) # are we training the best model? if cfg.getfloat('args', 'test_size') != 0: train_x, val_x, train_y, val_y = train_test_split( x, y, test_size=cfg.getfloat('args', 'test_size')) validation_data = (val_x, val_y) else: train_x, train_y = x, y validation_data = None # TODO: figure out what to do about negated cuis init_vectors = None if cfg.has_option('data', 'embed'): embed_file = os.path.join(base, cfg.get('data', 'embed')) w2v = word2vec.Model(embed_file, verbose=True) init_vectors = [w2v.select_vectors(dp.tokenizer.word_index)] model = get_model( len(dp.tokenizer.word_index) + 1, x.shape[1], y.shape[1], init_vectors) optim = getattr(optimizers, cfg.get('dan', 'optimizer')) model.compile(loss='binary_crossentropy', optimizer=optim(lr=10**cfg.getint('dan', 'log10lr')), metrics=['accuracy']) # save the model after every epoch callback = ModelCheckpoint(cfg.get('data', 'model_dir') + 'model.h5', verbose=1, save_best_only=True) model.fit(train_x, train_y, validation_data=validation_data, epochs=cfg.getint('dan', 'epochs'), batch_size=cfg.getint('dan', 'batch'), validation_split=0.0, callbacks=[callback]) # are we training the best model? if cfg.getfloat('args', 'test_size') == 0: model.save(cfg.get('data', 'model_dir') + 'model.h5') exit() # probability for each class; (test size, num of classes) distribution = model.predict(val_x) # turn into an indicator matrix distribution[distribution < 0.5] = 0 distribution[distribution >= 0.5] = 1 f1 = f1_score(val_y, distribution, average='macro') p = precision_score(val_y, distribution, average='macro') r = recall_score(val_y, distribution, average='macro') print("\nmacro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1)) f1 = f1_score(val_y, distribution, average='micro') p = precision_score(val_y, distribution, average='micro') r = recall_score(val_y, distribution, average='micro') print("micro: p: %.3f - r: %.3f - f1: %.3f" % (p, r, f1))
# (fragment) the opening of graph_params and its batch_size entry are assumed
# from the graph_params['batch_size'] usage below; batch_size itself is defined
# earlier in the original script
graph_params = {
    'batch_size': batch_size,
    'embed_size': 256,
    'hid_size': 256,
    'neg_samples': batch_size * 2,
    'learn_rate': 0.01,
    'momentum': 0.9,
    'embed_noise': 0.1,
    'hid_noise': 0.3,
    'epoch': 10,
    'optimizer': 'Momentum',
}

split = round(X.shape[0] * 0.9)
train_X, train_Y = X[:split, :], Y[:split, :]
test_X, test_Y = X[split:, :], Y[split:, :]

model = word2vec.Model(graph_params)
print('model built, vocab size %d, document length %d' %
      (np.max(X) + 1, len(word_array)))

embed_weights, nce_weights = model.train(
    train_X, train_Y,
    test_X, test_Y,
    graph_params['epoch'],
    graph_params['batch_size'],
)

import pickle
with open('word2vec-wiki-256.p', 'wb') as fopen:
    pickle.dump(embed_weights, fopen)  # assumed: the fragment ends before this line
def main(args): if len(args) < 1: sys.stderr.write("Error - one required argument: <data directory>\n") sys.exit(-1) working_dir = args[0] data_file = os.path.join(working_dir, 'training-data.liblinear') # learn alphabet from training data provider = dataset.DatasetProvider(data_file) # now load training examples and labels train_x, train_y = provider.load(data_file) # turn x and y into numpy array among other things maxlen = max([len(seq) for seq in train_x]) classes = len(set(train_y)) train_x = pad_sequences(train_x, maxlen=maxlen) train_y = to_categorical(np.array(train_y), classes) pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb")) pickle.dump(provider.word2int, open(os.path.join(working_dir, 'word2int.p'), "wb")) pickle.dump(provider.label2int, open(os.path.join(working_dir, 'label2int.p'), "wb")) w2v = word2vec.Model('/home/dima/Data/Word2VecModels/mimic.txt') init_vectors = [w2v.select_vectors(provider.word2int)] print 'train_x shape:', train_x.shape print 'train_y shape:', train_y.shape model = Sequential() model.add( Embedding(len(provider.word2int), 300, input_length=maxlen, trainable=True, weights=init_vectors)) model.add(Conv1D(filters=200, kernel_size=5, activation='relu')) model.add(GlobalMaxPooling1D()) model.add(Dropout(0.25)) model.add(Dense(300)) model.add(Activation('relu')) model.add(Dropout(0.25)) model.add(Dense(classes)) model.add(Activation('softmax')) optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08) model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) model.fit(train_x, train_y, epochs=4, batch_size=50, verbose=0, validation_split=0.0) json_string = model.to_json() open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string) model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True) sys.exit(0)