name='words')
features = [w2v]
if config.word_features:
    features.append(SennaCapsFeature('caps'))

for ind, dataset in enumerate(datasets):
    data[ind].tokens.add_features(features)
    data[ind].tokens.add_inputs(windowed_inputs(config.window_size, features))

# Log word vector feature stat summary
info('{}: {}'.format(config.wordvecs, w2v.summary()))

inputs, embeddings = inputs_and_embeddings(features, config)

# Combine and reshape for convolution
seq = concat(embeddings)
cshape = (config.window_size, sum(f.output_dim for f in features))
seq = Reshape((1,) + cshape)(seq)
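# The reshape turns each token window into a single-channel
# (1, window_size, total feature dim) input for the 2D convolutions below.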

# Convolutions
conv_outputs = []
for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
    conv = Convolution2D(filter_num, filter_size, cshape[1],
                         activation='relu')(seq)
    cout = Flatten()(conv)
    conv_outputs.append(cout)
seq = concat(conv_outputs)

for size in config.hidden_sizes:
    seq = Dense(size, activation=config.hidden_activation)(seq)
seq = Dropout(config.output_drop_prob)(seq)
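
# A classification head of the same shape as in the complete examples below
# would close out this snippet, e.g. (sketch, not part of the original):
#
#     out = Dense(data.tokens.target_dim, activation='softmax')(seq)
#     model = Model(input=inputs, output=out)
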
w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                     max_rank=config.max_vocab_size,
                                     vocabulary=data.vocabulary,
                                     name='words')
features = [w2v]
if config.word_features:
    features.append(SennaCapsFeature(name='caps'))

data.tokens.add_features(features)
data.tokens.add_inputs(windowed_inputs(config.window_size, features))

# Log word vector feature stat summary
info('{}: {}'.format(config.wordvecs, w2v.summary()))

inputs, embeddings = inputs_and_embeddings(features, config)

seq = concat(embeddings)
seq = Flatten()(seq)
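# No convolutions in this variant: the flattened window embeddings feed a
# plain feed-forward classifier.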
for size in config.hidden_sizes:
    seq = Dense(size, activation=config.hidden_activation)(seq)
seq = Dropout(config.output_drop_prob)(seq)
out = Dense(data.tokens.target_dim, activation='softmax')(seq)
model = Model(input=inputs, output=out)

optimizer = get_optimizer(config)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

callbacks = [
    EpochTimer(),
    token_evaluator(data.train, config=config),
Example #3
def main(argv):
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    data = load_dir(config.datadir, config)

    force_oov = set(l.strip()
                    for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    features = [w2v]
    data.tokens.add_features(features)
    # Summarize word vector featurizer statistics (OOV etc.)
    logging.info(features[0].summary())
    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])
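    # Each document becomes a fixed-width input of config.doc_size token
    # indices per feature, padded with the feature's '<PADDING>' index.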

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    # Document length x total feature dimensionality.
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings) #old way of doing the above
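    # Each document is now a single-channel (1, doc_size, total feature dim)
    # "image", so Convolution2D with dim_ordering='th' slides filters over
    # token positions.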

    # Convolution(s)
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes,
                                       config.filter_nums):
        seq2 = Convolution2D(filter_num,
                             filter_size,
                             cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)
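    # Max-over-time pooling above collapses each filter's
    # (doc_size - filter_size + 1) activations to a single value, so every
    # branch contributes filter_num features regardless of document length.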

    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='softmax')(seq)
    model = Model(input=inputs, output=out)

    if config.verbosity != 0:
        logging.info(model.summary())

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy', f1, prec, rec])

    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        WeightStore(weights),
        document_evaluator(data.train, label='train', results=results),
        document_evaluator(data.devel, label='devel', results=results),
    ]
    if config.test:
        callbacks.append(
            document_evaluator(data.test, label='test', results=results))
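    # The evaluator callbacks record per-epoch metrics in `results` and
    # WeightStore keeps a copy of the weights after each epoch, so the best
    # devel epoch can be restored after training (see get_best_epoch below).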

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
    # logging.info(history.history)

    for k, values in results.items():
        s = lambda v: str(v) if not isinstance(v, float) else '{:.4f}'.format(v)
        logging.info('\t'.join(s(i) for i in [k] + values))

    evalsets = [data.devel] + ([data.test] if config.test else [])
    for s in evalsets:
        logging.info('last epoch, {}: {}'.format(
            s.name, evaluation_summary(model, s, 0, config)))
    epoch = get_best_epoch(results, 'devel', config)
    model.set_weights(weights[epoch])
    if config.threshold:
        threshold = results['devel/maxf-threshold'][epoch]
    else:
        threshold = 0.0
    for s in evalsets:
        logging.info('best devel epoch th {} ({}), {}: {}'.format(
            threshold, config.target_metric, s.name,
            evaluation_summary(model, s, threshold, config)))
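
# These examples use the Keras 1.x API (Convolution2D with border_mode and
# dim_ordering, W_regularizer, Model(input=..., output=...), nb_epoch). A
# minimal sketch of the same convolution + max-over-time block under the
# Keras 2 API, assuming the same channels-first tensors as above:
#
#     from keras.layers import Conv2D, MaxPooling2D
#     conv = Conv2D(filter_num, (filter_size, cshape[1]), padding='valid',
#                   activation='relu', data_format='channels_first')(seq)
#     pooled = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
#                           data_format='channels_first')(conv)
#
# W_regularizer becomes kernel_regularizer on Dense, and model.fit takes
# epochs instead of nb_epoch.
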
def main(argv):
    global data
    config = cli_settings(['datadir', 'wordvecs'], Defaults)
    ##load_dir(config.datadir, config)

    print("finished reading data")
    force_oov = set(l.strip()
                    for l in open(config.oov)) if config.oov else None
    w2v = NormEmbeddingFeature.from_file(config.wordvecs,
                                         max_rank=config.max_vocab_size,
                                         vocabulary=data.vocabulary,
                                         force_oov=force_oov,
                                         name='text')
    # Add word vector features to tokens
    print("finished reading embeddings")
    features = [w2v]
    data.tokens.add_features(features)
    # Summarize word vector featurizer statistics (OOV etc.)

    # Create inputs at document level
    data.documents.add_inputs([
        FixedWidthInput(config.doc_size, f['<PADDING>'], f.name)
        for f in features
    ])

    # Create keras input and embedding for each feature
    inputs, embeddings = inputs_and_embeddings(features, config)

    # Combine and reshape for convolution
    seq = concat(embeddings)
    # Document length x total feature dimensionality.
    cshape = (config.doc_size, sum(f.output_dim for f in features))
    seq = Reshape((1,) + cshape)(seq)
    #seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings) #old way of doing the above

    # Convolution(s)
    convLayers = []
    for filter_size, filter_num in zip(config.filter_sizes,
                                       config.filter_nums):
        seq2 = Convolution2D(filter_num,
                             filter_size,
                             cshape[1],
                             border_mode='valid',
                             activation='relu',
                             dim_ordering='th')(seq)
        seq2 = MaxPooling2D(pool_size=(config.doc_size - filter_size + 1, 1),
                            dim_ordering='th')(seq2)
        seq2 = Flatten()(seq2)
        convLayers.append(seq2)

    seq = concat(convLayers)
    if config.drop_prob:
        seq = Dropout(config.drop_prob)(seq)
    for s in config.hidden_sizes:
        seq = Dense(s, activation='relu')(seq)
    out = Dense(data.documents.target_dim,
                W_regularizer=W_regularizer(config),
                activation='sigmoid')(seq)
    model = Model(input=inputs, output=out)

    optimizer = get_optimizer(config)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer
                  #metrics=['accuracy', f1, prec, rec]
                  )

    weights, results = [], {}
    callbacks = [
        EpochTimer(),
        #WeightStore(weights),
        #document_evaluator(data.train, label='train', results=results),
        evaluator(data.devel, label='devel', results=results)
    ]
    #if config.test:
    #callbacks.append(document_evaluator(data.test, label='test',
    #                                       results=results))

    hist = model.fit(data.train.documents.inputs,
                     data.train.documents.targets,
                     validation_data=(
                         data.devel.documents.inputs,
                         data.devel.documents.targets,
                     ),
                     batch_size=config.batch_size,
                     nb_epoch=config.epochs,
                     verbose=config.verbosity,
                     callbacks=callbacks)
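
    # A minimal sketch (not from the original snippet) of thresholded
    # multi-label prediction with the sigmoid outputs above; 0.5 is an
    # assumed cutoff:
    #
    #     probs = model.predict(data.devel.documents.inputs)
    #     predicted = (probs >= 0.5).astype('int32')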
Example #6
                                        vocabulary=all_vocab,
                                        name='words-%s' % ind)
    features = [w2v]
    if config.word_features:
        features.append(SennaCapsFeature('caps'))


    data[ind].tokens.add_features(features)
    data[ind].tokens.add_inputs(windowed_inputs(config.window_size, features))

    # Log word vector feature stat summary
    info('{}: {}'.format(config.wordvecs, w2v.summary()))

    if ind == 0:
        pos_inputs, pos_embeddings = inputs_and_embeddings(features, config)
        pos_x = concat(pos_embeddings)
    if ind == 1:
        ner_inputs, ner_embeddings = inputs_and_embeddings(features, config)
        ner_x = concat(ner_embeddings)
           
cshapes = []
reshapes = []
# Combine and reshape for convolution
pos_cshape = (config.window_size, sum(f.output_dim for f in features))
ner_cshape = (config.window_size, sum(f.output_dim for f in features))
cshapes.append(pos_cshape)
cshapes.append(ner_cshape)
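# Separate channels-first reshapes below feed the POS and NER branches; both
# share the same (window_size, total feature dim) window shape.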

pos_reshape = Reshape((1,) + pos_cshape, name='pos-reshape')(pos_x)
ner_reshape = Reshape((1,) + ner_cshape, name='ner-reshape')(ner_x)
reshapes.append(pos_reshape)