from ltlib import filelog
from ltlib import conlldata
from ltlib import viterbi
from ltlib.features import NormEmbeddingFeature, SennaCapsFeature
from ltlib.features import windowed_inputs
from ltlib.callbacks import token_evaluator, EpochTimer
from ltlib.layers import concat, inputs_and_embeddings
from ltlib.settings import cli_settings, log_settings
from ltlib.optimizers import get_optimizer
from ltlib.util import unique
from ltlib.output import save_token_predictions_multi_output

from config import Defaults

config = cli_settings(['datadir', 'datasets', 'wordvecs'], Defaults)
assert len(config.filter_nums) == len(config.filter_sizes)

datasets = config.datasets.split(',')

data = []
max_fs = []
max_vfs = []
for ind, dataset in enumerate(datasets):
    data_path = config.datadir + '/' + dataset
    data.append(conlldata.load_dir(data_path, config))
    max_fs.append((0.0, 0.0))
    max_vfs.append((0.0, 0.0))

max_y = 0
import logging

# Keras 1.x imports needed by the model definition below. The ltlib imports and
# project-local helpers used here (load_dir, FixedWidthInput, document_evaluator,
# WeightStore, f1/prec/rec, W_regularizer, get_best_epoch, evaluation_summary)
# are assumed to come from the module header, which is not part of this excerpt.
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Reshape
from keras.layers import Convolution2D, MaxPooling2D


def main(argv):
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    data = load_dir(config.datadir, config)
    force_oov = set(l.strip() for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')

    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)

    # Summarize word vector featurizer statistics (OOV etc.)
    logging.info(features[0].summary())

    # Create fixed-width inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create Keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution:
    # (document length, total feature dimension), plus a leading channel axis.
    seq = concat(embeddings)
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings)    # old way of doing the above

    # Convolution(s): one branch per filter size, each max-pooled over the document.
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
        seq2 = Convolution2D(filter_num, filter_size, cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)
    seq = concat(convLayers)

    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='softmax')(seq)
    model = Model(input=inputs, output=out)
    if config.verbosity != 0:
        logging.info(model.summary())

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])

    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        WeightStore(weights),
        document_evaluator(data.train, label='train', results=results),
        document_evaluator(data.devel, label='devel', results=results),
    ]
    if config.test:
        callbacks.append(document_evaluator(data.test, label='test',
                                            results=results))

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(data.devel.documents.inputs,
                                      data.devel.documents.targets),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
    # logging.info(hist.history)

    # Log per-epoch results gathered by the evaluator callbacks.
    for k, values in results.items():
        fmt = lambda v: str(v) if not isinstance(v, float) else '{:.4f}'.format(v)
        logging.info('\t'.join(fmt(i) for i in [k] + values))

    evalsets = [data.devel] + ([data.test] if config.test else [])
    for s in evalsets:
        logging.info('last epoch, {}: {}'.format(
            s.name, evaluation_summary(model, s, 0, config)))

    # Restore the weights of the best devel epoch and re-evaluate, optionally
    # using the prediction threshold tuned on devel at that epoch.
    epoch = get_best_epoch(results, 'devel', config)
    model.set_weights(weights[epoch])
    if config.threshold:
        threshold = results['devel/maxf-threshold'][epoch]
    else:
        threshold = 0.0
    for s in evalsets:
        logging.info('best devel epoch th {} ({}), {}: {}'.format(
            threshold, config.target_metric, s.name,
            evaluation_summary(model, s, threshold, config)))
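# The best-epoch selection above depends on helpers that are not part of this
# excerpt. Purely as an illustration (an assumption, not the project's actual
# implementation), get_best_epoch presumably returns the index of the epoch with
# the highest devel score on the configured target metric, along these lines:
def get_best_epoch_sketch(results, label, config):
    # results maps keys such as 'devel/<metric>' to a list with one value per epoch.
    scores = results['{}/{}'.format(label, config.target_metric)]
    return max(range(len(scores)), key=lambda i: scores[i])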
from ltlib import filelog
from ltlib import conlldata
from ltlib import viterbi
from ltlib.features import NormEmbeddingFeature, SennaCapsFeature
from ltlib.features import windowed_inputs
from ltlib.callbacks import token_evaluator, EpochTimer
from ltlib.layers import concat, inputs_and_embeddings
from ltlib.settings import cli_settings, log_settings
from ltlib.optimizers import get_optimizer
from ltlib.output import save_token_predictions

from baseline_config import Defaults

config = cli_settings(['datadir', 'wordvecs'], Defaults)
data = conlldata.load_dir(config.datadir, config)

vmapper = viterbi.get_prediction_mapper(data.train.sentences, config)

w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                     max_rank=config.max_vocab_size,
                                     vocabulary=data.vocabulary,
                                     name='words')
features = [w2v]
if config.word_features:
    features.append(SennaCapsFeature(name='caps'))

data.tokens.add_features(features)
data.tokens.add_inputs(windowed_inputs(config.window_size, features))
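# The baseline model that consumes these windowed token inputs is defined
# elsewhere; as a rough, assumed illustration only (not the project's actual
# architecture), a window-based token classifier in Keras 1.x typically embeds
# each token in the window, flattens, and classifies the centre token:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

def build_window_classifier_sketch(vocab_size, embedding_dim, window_size, num_labels):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=window_size))  # one vector per window token
    model.add(Flatten())                                 # concatenate the window embeddings
    model.add(Dense(100, activation='relu'))
    model.add(Dense(num_labels, activation='softmax'))   # tag of the centre token
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model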
# NOTE: this fragment relies on the same Keras / ltlib imports and helper
# functions as the previous main() above; its `evaluator` callback is assumed
# to be defined (or aliased) elsewhere in the module.
def main(argv):
    global data
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    ##load_dir(config.datadir, config)
    print("finished reading data")
    force_oov = set(l.strip() for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    print("finished reading embeddings")

    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)

    # Create fixed-width inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create Keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution:
    # (document length, total feature dimension), plus a leading channel axis.
    seq = concat(embeddings)
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings)    # old way of doing the above

    # Convolution(s): one branch per filter size, each max-pooled over the document.
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
        seq2 = Convolution2D(filter_num, filter_size, cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)
    seq = concat(convLayers)

    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    # Sigmoid output gives independent per-label scores; note that pairing it
    # with categorical_crossentropy below is unusual (binary_crossentropy is the
    # more common choice for multi-label sigmoid outputs).
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='sigmoid')(seq)
    model = Model(input=inputs, output=out)

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer
                  #metrics=['accuracy', f1, prec, rec]
                  )

    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        #WeightStore(weights),
        #document_evaluator(data.train, label='train', results=results),
        evaluator(data.devel, label='devel', results=results)
    ]
    #if config.test:
    #    callbacks.append(document_evaluator(data.test, label='test',
    #                                        results=results))

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(data.devel.documents.inputs,
                                      data.devel.documents.targets),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
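# With the sigmoid output above, per-label scores are independent, so document
# labels are obtained by thresholding rather than argmax. A minimal sketch
# (assumption; the project's evaluation helpers may work differently):
import numpy as np

def predict_labels_sketch(model, inputs, threshold=0.5):
    scores = model.predict(inputs)             # one score per document and label
    return (scores >= threshold).astype(int)   # keep every label above the threshold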