def eval_test(modelPath):
    global data
    # data = MultiLabelDataReader(Defaults.input_path).load(index)
    #model = ltlib.util.load_keras(modelPath)
    model.load_weights(modelPath + "model.h5")
    optimizer = get_optimizer(Defaults)
    print("STARTING TEST")
    force_oov = set(l.strip() for l in open(Defaults.oov)) if Defaults.oov else None
    w2v = NormEmbeddingFeature.from_file(Defaults.embedding_path,
                                         max_rank=Defaults.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')

    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)

    # Summarize word vector featurizer statistics (OOV etc.)
    # logging.info(features[0].summary())

    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(Defaults.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    # inputs, embeddings = inputs_and_embeddings(features, Defaults)

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])

    predictions = model.predict(data.test.documents.inputs,
                                batch_size=Defaults.batch_size)
    # print(str(predictions))
    data.test.documents.set_predictions(predictions)
    print("TEST RESULTS for: " + str(len(predictions)))
    best_sigmoid = utility.readDictFromStringFile(
        Defaults.output_path + "out.txt")["best_sigmoid_t"]
    res = data.test.eval(sigmoid_t=best_sigmoid)
    res["sigmoid_t"] = best_sigmoid
    print(str(res))
    np.save(Defaults.pred_path + "pred", data.test.get_predictions())
    utility.writeDictAsStringFile(res, Defaults.results_path + "res.txt")
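# --- Illustrative sketch (not part of the original code): what the sigmoid
# threshold loaded as "best_sigmoid_t" does to the prediction scores.  The
# arrays below are made up; the real thresholding happens inside
# data.test.eval(sigmoid_t=...).
import numpy as np

scores = np.array([[0.10, 0.80, 0.55, 0.05],
                   [0.70, 0.20, 0.60, 0.90],
                   [0.30, 0.40, 0.45, 0.10]])   # 3 documents x 4 labels

best_sigmoid = 0.5                              # would be read from out.txt
predicted_labels = (scores > best_sigmoid).astype(int)
# Every label whose score exceeds the threshold is predicted positive:
# [[0 1 1 0]
#  [1 0 1 1]
#  [0 0 0 0]]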
conv_outputs.append(cout)   # last statement of the preceding per-filter-size loop

seq = concat(conv_outputs)
for size in config.hidden_sizes:
    seq = Dense(size, activation=config.hidden_activation)(seq)
    seq = Dropout(config.output_drop_prob)(seq)

# Create private outputs
outs = []
for ind, dataset in enumerate(datasets):
    #outs.append(Dense(data[ind].tokens.target_dim, activation='softmax')(seq))
    outs.append(Dense(max_y, activation='softmax')(seq))

model = Model(input=inputs, output=outs)
optimizer = get_optimizer(config)
model.compile(loss=['categorical_crossentropy'] * len(datasets),
              optimizer=optimizer,
              metrics=['accuracy'])
#model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

x_batch = []
y_batch = []
concatenated = True
for ind, ds in enumerate(data):
    x_batch.append(ds.train.tokens.inputs['words'])
    #out_labels = [np.zeros(data[ind_].train.tokens.targets.shape) for ind_, dataset in enumerate(datasets)]
    #out_labels[ind] = data[ind].train.tokens.targets
    out_labels = [np.zeros((data[ind_].train.tokens.targets.shape[0], max_y))
                  for ind_, dataset in enumerate(datasets)]
    y_ = data[ind].train.tokens.targets
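# --- Illustrative sketch (assumed helper, not in the repo): one way to pad
# per-dataset one-hot targets to the shared output width max_y used above,
# with all-zero targets for the other task's output head.
import numpy as np

def pad_to_width(targets, width):
    """Right-pad a one-hot target matrix with zero columns up to `width`."""
    padded = np.zeros((targets.shape[0], width))
    padded[:, :targets.shape[1]] = targets
    return padded

targets_b = np.array([[0, 1], [1, 0]])   # toy dataset with 2 classes
max_y_example = 3                        # widest label set across datasets
out_labels_example = [np.zeros((len(targets_b), max_y_example)),   # other task: zeros
                      pad_to_width(targets_b, max_y_example)]      # this task: padded targets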
data.tokens.add_inputs(windowed_inputs(config.window_size, features))

# Log word vector feature stat summary
info('{}: {}'.format(config.wordvecs, w2v.summary()))

inputs, embeddings = inputs_and_embeddings(features, config)
seq = concat(embeddings)
seq = Flatten()(seq)
for size in config.hidden_sizes:
    seq = Dense(size, activation=config.hidden_activation)(seq)
    seq = Dropout(config.output_drop_prob)(seq)
out = Dense(data.tokens.target_dim, activation='softmax')(seq)
model = Model(input=inputs, output=out)

optimizer = get_optimizer(config)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

callbacks = [
    EpochTimer(),
    token_evaluator(data.train, config=config),
    token_evaluator(data.test, mapper=vmapper, config=config),
]
model.fit(data.train.tokens.inputs,
          data.train.tokens.targets,
          callbacks=callbacks,
          batch_size=config.batch_size,
          nb_epoch=config.epochs,
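# --- Illustrative sketch (assumed behaviour, not the repo's windowed_inputs
# implementation): a symmetric token window gives each token the word ids of
# itself and its neighbours, with a padding id at the sentence edges.
import numpy as np

def window_rows(word_ids, window_size, pad_id=0):
    """Return one row of `window_size` word ids per token, padding at the edges."""
    half = window_size // 2
    padded = [pad_id] * half + list(word_ids) + [pad_id] * half
    return np.array([padded[i:i + window_size] for i in range(len(word_ids))])

# window_rows([11, 12, 13, 14], window_size=3) ->
# [[ 0 11 12]
#  [11 12 13]
#  [12 13 14]
#  [13 14  0]]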
def main(argv):
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    data = load_dir(config.datadir, config)

    force_oov = set(l.strip() for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')

    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)

    # Summarize word vector featurizer statistics (OOV etc.)
    logging.info(features[0].summary())

    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    cshape = (config.doc_size, sum(f.output_dim for f in features))   # document size and total feature dimension
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings)   # old way of doing the above

    # Convolution(s)
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
        seq2 = Convolution2D(filter_num, filter_size, cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)

    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='softmax')(seq)
    model = Model(input=inputs, output=out)
    if config.verbosity != 0:
        logging.info(model.summary())

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])

    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        WeightStore(weights),
        document_evaluator(data.train, label='train', results=results),
        document_evaluator(data.devel, label='devel', results=results),
    ]
    if config.test:
        callbacks.append(document_evaluator(data.test, label='test',
                                            results=results))
    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
    # logging.info(history.history)

    for k, values in results.items():
        s = lambda v: str(v) if not isinstance(v, float) else '{:.4f}'.format(v)
        logging.info('\t'.join(s(i) for i in [k] + values))

    evalsets = [data.devel] + ([data.test] if config.test else [])
    for s in evalsets:
        logging.info('last epoch, {}: {}'.format(
            s.name, evaluation_summary(model, s, 0, config)))

    epoch = get_best_epoch(results, 'devel', config)
    model.set_weights(weights[epoch])
    if config.threshold:
        threshold = results['devel/maxf-threshold'][epoch]
    else:
        threshold = 0.0
    for s in evalsets:
        logging.info('best devel epoch th {} ({}), {}: {}'.format(
            threshold, config.target_metric, s.name,
            evaluation_summary(model, s, threshold, config)))
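# --- Shape check (illustrative numbers, not from config): the pool size above
# follows from 'valid' convolution arithmetic.  A filter of height filter_size
# slid over doc_size rows leaves doc_size - filter_size + 1 positions, and the
# filter spans the full embedding width, so max-pooling over exactly that many
# rows keeps one max-over-time value per filter.
doc_size_example = 500        # assumed value for config.doc_size
emb_dim_example = 200         # assumed total feature width (cshape[1])
for filter_size_example, filter_num_example in [(3, 100), (4, 100), (5, 100)]:
    conv_rows = doc_size_example - filter_size_example + 1
    # Conv2D output ('th' ordering): (filter_num, conv_rows, 1);
    # MaxPooling2D((conv_rows, 1)) -> (filter_num, 1, 1); Flatten -> filter_num values
    print(filter_size_example, conv_rows, filter_num_example)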
def main(argv):
    global data
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    ##load_dir(config.datadir, config)
    print("finished reading data")

    force_oov = set(l.strip() for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')

    # Add word vector features to tokens
    print("finished reading embeddings")
    features = [w2v]
    data.tokens.add_features(features)

    # Summarize word vector featurizer statistics (OOV etc.)

    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    cshape = (config.doc_size, sum(f.output_dim for f in features))   # document size and total feature dimension
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings)   # old way of doing the above

    # Convolution(s)
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
        seq2 = Convolution2D(filter_num, filter_size, cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)

    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='sigmoid')(seq)
    model = Model(input=inputs, output=out)

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer
                  #metrics=['accuracy', f1, prec, rec]
                  )

    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        #WeightStore(weights),
        #document_evaluator(data.train, label='train', results=results),
        evaluator(data.devel, label='devel', results=results)
    ]
    #if config.test:
    #    callbacks.append(document_evaluator(data.test, label='test',
    #                                        results=results))
    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
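# --- Illustrative sketch (assumed, not the repo's evaluator): the devel-set
# evaluator callback is presumably what produces the "best_sigmoid_t" value
# read back in eval_test; the general idea of such a threshold sweep, shown on
# made-up data:
import numpy as np

def micro_f1(y_true, y_pred):
    """Micro-averaged F1 over a binary label matrix."""
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    if tp == 0:
        return 0.0
    prec, rec = tp / (tp + fp), tp / (tp + fn)
    return 2 * prec * rec / (prec + rec)

devel_scores = np.array([[0.9, 0.2], [0.4, 0.7], [0.6, 0.1]])   # toy sigmoid outputs
devel_gold = np.array([[1, 0], [0, 1], [1, 0]])                 # toy gold labels

# Try candidate thresholds and keep the one with the best devel-set F1.
best_sigmoid_t, best_f1 = max(
    ((t, micro_f1(devel_gold, (devel_scores > t).astype(int)))
     for t in np.arange(0.1, 0.9, 0.1)),
    key=lambda pair: pair[1])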
seq = concat(conv_outputs[ind])   # continues from a preceding per-dataset loop over `ind`
for size in config.hidden_sizes:
    fully_connected.append(Dense(size,
                                 activation=config.hidden_activation,
                                 name='dense-1-%d' % ind)(seq))
    dropout.append(Dropout(config.output_drop_prob,
                           name='dropout-%d' % ind)(fully_connected[ind]))

pos_dense_out = Dense(data[0].tokens.target_dim,
                      activation='softmax',
                      name='pos-dense-out')(dropout[0])
ner_merged = merge([dropout[0], dropout[1]], mode='concat')
ner_dense_out = Dense(data[1].tokens.target_dim,
                      activation='softmax',
                      name='ner-dense-out')(ner_merged)

pos_model = Model(input=pos_inputs, output=pos_dense_out)
ner_model = Model(input=pos_inputs + ner_inputs, output=ner_dense_out)
pos_model.compile(optimizer=get_optimizer(config),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
ner_model.compile(optimizer=get_optimizer(config),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
models = [pos_model, ner_model]

time_str = datetime.datetime.now().isoformat()
print("Started training at: %s" % time_str)

for ind, ds in enumerate(data):
    for ep in range(1, config.epochs + 1):
        percnt_keep = config.percent_keep
        amt_keep = len(ds.train.tokens.inputs['words-%s' % ind]) * percnt_keep
        print("Total: %s. Keeping: %s"
              % (len(ds.train.tokens.inputs['words-%s' % ind]), amt_keep))
        start = random.randrange(int(len(ds.train.tokens.inputs['words-%s' % ind]) - amt_keep) + 1)
        end = int(start + amt_keep)
        x = ds.train.tokens.inputs['words-%s' % ind][start:end]
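# --- Standalone sketch of the sampling logic above (made-up sizes, assumed
# helper name): each epoch keeps one random contiguous window covering
# percent_keep of the training tokens.
import random

def random_contiguous_slice(n_examples, percent_keep):
    """Pick a random contiguous [start, end) range covering percent_keep of the data."""
    amt_keep = n_examples * percent_keep
    start = random.randrange(int(n_examples - amt_keep) + 1)
    end = int(start + amt_keep)
    return start, end

# random_contiguous_slice(1000, 0.3) -> a 300-example window somewhere in [0, 1000)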