logger.info('Sampling training data...')
# FIXME: pick_positive_name ignores those whose gold standard length is not one (multiple or nil)

# How gitig_positive_indices.pickle was generated (kept for provenance):
# positives = [sp_training.pick_positive_name(config,corpus_train,concept,i) for i in range(len(corpus_train.names))]
# positives = [*zip(positives,corpus_train.names)]

# positives_dev_truncated = [sp_training.pick_positive_name(config,corpus_dev_truncated,concept,i) for i in range(len(corpus_dev_truncated.names))]
# positives_dev_truncated = [*zip(positives_dev_truncated,corpus_dev_truncated.names)]

# with open('gitig_positive_indices.pickle','wb') as f:
#     pickle.dump([positives,positives_dev_truncated],f)

from sample import prepare_positives, examples

# Load the pre-computed positive-example indices.  Use a context manager so
# the file handle is closed deterministically (the original bare
# pickle.load(open(...)) leaked it).
with open(os.path.join(directory, 'gitig_positive_indices.pickle'), 'rb') as f:
    positives_training, positives_dev, positives_dev_truncated = pickle.load(f)

# Tokenize the positives and map tokens into the model vocabulary.
positives_training = prepare_positives(positives_training, nltk.word_tokenize,
                                       vocabulary)
positives_dev = prepare_positives(positives_dev, nltk.word_tokenize,
                                  vocabulary)
# The truncated dev split is unused on this path; release it.
del positives_dev_truncated

# sampling
def examples(config,
             concept,
             positives,
             vocab,
             neg_count=config.getint('sample', 'neg_count')):
    """
    Builds positive and negative examples.

    NOTE(review): the body is truncated in this chunk — the ``while True:``
    generator loop continues beyond what is visible here, so the full
    contract cannot be documented from this view.
    NOTE(review): ``neg_count`` is evaluated at *definition* time against the
    module-level ``config`` object, not the ``config`` parameter passed per
    call — confirm this is intended.
    NOTE(review): this def shadows ``examples`` imported from ``sample``
    earlier in the file — verify which one callers expect.
    """
    while True:
# Beispiel #2
# 0
logger.info('Using truncated development corpus for evaluation.')
#corpus_dev = sample.NewDataSet('dev corpus')

# Load the cached truncated-dev evaluation data, closing the handle
# deterministically (the original bare pickle.load(open(...)) leaked it).
with open(
        os.path.join(directory, 'gitig_real_val_data_truncated_d50_p5.pickle'),
        'rb') as f:
    [real_val_data, concept_order, corpus_dev] = pickle.load(f)
real_val_data.y = np.array(real_val_data.y)

concept = concept_obj(config, dictionary, order=concept_order)

from sample import prepare_positives, examples

# Same leak fix for the positive-indices pickle.
with open(os.path.join(directory, 'gitig_positive_indices.pickle'), 'rb') as f:
    positives_training, positives_dev, positives_dev_truncated = pickle.load(f)
# positives_dev = prepare_positives(positives_dev,nltk.word_tokenize,vocabulary)
positives_dev_truncated = prepare_positives(positives_dev_truncated,
                                            nltk.word_tokenize, vocabulary)
# Only the truncated dev positives are needed on this branch; free the rest.
del positives_dev, positives_training

# corpus
# corpus_train = sample.NewDataSet('training corpus')
# corpus_train.objects = load.load(os.path.normpath(config['corpus']['training_file']),'NCBI')

# Build the development corpus from the NCBI-format development file.
corpus_dev = sample.NewDataSet('dev corpus')
corpus_dev.objects = load.load(config['corpus']['development_file'], 'NCBI')

# Initialize per-corpus accumulators.
# NOTE(review): the loop body appears to continue beyond this chunk — the
# lists are presumably filled from corpus.objects further down; confirm.
for corpus in [corpus_dev]:
    corpus.ids = []  # list of all ids (gold standard for each mention)
    corpus.names = []  # list of all names
    corpus.all = [
    ]  # list of tuples (mention_text,gold,context,(start,end,docid))