Example #1
def evaluate(emb_wrapper,
             analogy_file,
             dataset,
             setting,
             analogy_type,
             analogy_method,
             log=log,
             predictions_file=None,
             report_top_k=5):
    t_main = log.startTimer()

    results = analogyTask(analogy_file,
                          dataset,
                          setting,
                          analogy_type,
                          emb_wrapper,
                          log=log,
                          predictions_file=predictions_file,
                          predictions_file_mode='w',
                          report_top_k=report_top_k)

    log.stopTimer(t_main, message='Program complete in {0:.2f}s.')

    return results
Example #2
        if len(args) != 0 or (not options.src_embf) or (
                not options.trg_embf) or (not options.outf) or (
                    not options.pivotf):
            parser.print_help()
            exit()
        return options

    options = _cli()
    log.start(logfile=options.logfile, stdout_also=True)

    # set the random seed here if necessary
    if options.random_seed <= 0:
        options.random_seed = int(time.time())

    t_sub = log.startTimer('Reading source embeddings from %s...' %
                           options.src_embf,
                           newline=False)
    src_embs = pyemblib.read(options.src_embf,
                             mode=options.src_embf_mode,
                             lower_keys=True)
    log.stopTimer(t_sub,
                  message='Read %d embeddings in {0:.2f}s' % len(src_embs))

    t_sub = log.startTimer('Reading target embeddings from %s...' %
                           options.trg_embf,
                           newline=False)
    trg_embs = pyemblib.read(options.trg_embf,
                             mode=options.trg_embf_mode,
                             lower_keys=True)
    log.stopTimer(t_sub,
                  message='Read %d embeddings in {0:.2f}s' % len(trg_embs))
Example #3
        options.ent_embf = None
        options.word_embf = word_embf
        str_emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)

    if options.mode == Mode.UMNSRS:
        datasets = (
            UMNSRS_Similarity(options.repr_method),
            UMNSRS_Relatedness(options.repr_method)
        )
    elif options.mode == Mode.WikiSRS:
        datasets = (
            WikiSRS_Similarity(options.repr_method),
            WikiSRS_Relatedness(options.repr_method)
        )

    t_sub = log.startTimer('Running similarity/relatedness evaluation...', newline=False)
    if options.use_combo:
        results = [
            twoModelEvaluate(dataset, ent_emb_wrapper, str_emb_wrapper, CosineSimilarity,
                    log_predictions=True, use_cross=options.use_cross, cross_only=options.cross_only,
                    skips_f=options.skips_f)
                for dataset in datasets
        ]
    else:
        results = [
            evaluateOn(dataset, emb_wrapper, CosineSimilarity, log_predictions=True, skips_f=options.skips_f)
                for dataset in datasets
        ]
    log.stopTimer(t_sub, message='Done in {0:.2f}s')

    log.writeln('\nResults:')
Example #4
def getEmbeddings(options, log=log, separator=' '):
    word_embs, term_embs, ent_embs = None, None, None
    word_ids, term_ids, ent_ids = None, None, None

    # load in embeddings
    if options.ent_embf:
        if options.ent_filterf:
            filter_set = readFilterSet(options.ent_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading entity embeddings from %s...' %
                               options.ent_embf,
                               newline=False)
        ent_embs = pyemblib.read(options.ent_embf,
                                 separator=separator,
                                 replace_errors=True,
                                 filter_to=filter_set,
                                 lower_keys=True)
        log.stopTimer(t_sub,
                      message='Read %d embeddings ({0:.2f}s)' % len(ent_embs))
        ent_ids = ent_embs.keys()
    if options.term_embf:
        if options.term_filterf:
            filter_set = readFilterSet(options.term_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading term embeddings from %s...' %
                               options.term_embf,
                               newline=False)
        term_embs = pyemblib.read(options.term_embf,
                                  separator=separator,
                                  replace_errors=True,
                                  filter_to=filter_set,
                                  lower_keys=True)
        log.stopTimer(t_sub,
                      message='Read %d embeddings ({0:.2f}s)' % len(term_embs))
        term_ids = term_embs.keys()
    if options.word_embf:
        if options.word_filterf:
            filter_set = readFilterSet(options.word_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading word embeddings from %s...' %
                               options.word_embf,
                               newline=False)
        word_embs = pyemblib.read(options.word_embf,
                                  separator=separator,
                                  replace_errors=True,
                                  filter_to=filter_set,
                                  lower_keys=(not options.keep_word_case))
        log.stopTimer(t_sub,
                      message='Read %d embeddings ({0:.2f}s)' % len(word_embs))
        word_ids = word_embs.keys()

    # load in term/string maps
    if options.termmapf:
        t_sub = log.startTimer(
            'Reading term-entity mappings from %s (separated by "%s")...' %
            (options.termmapf, options.term_map_sep),
            newline=False)
        term_entity_map = readTermEntityMap(options.termmapf,
                                            entity_ids=ent_ids,
                                            term_ids=term_ids,
                                            map_sep=options.term_map_sep)
        log.stopTimer(t_sub,
                      message='Read mappings for %d terms ({0:.2f}s)' %
                      len(term_entity_map))
    if options.strmapf:
        t_sub = log.startTimer('Reading term-string mappings from %s...' %
                               options.strmapf,
                               newline=False)
        term_string_map = readTermStringMap(options.strmapf, term_ids=term_ids)
        log.stopTimer(t_sub,
                      message='Read mappings for %d terms ({0:.2f}s)' %
                      len(term_string_map))

    # perform actual approximations
    if options.repr_method == ENTITY:
        emb_wrapper = EmbeddingWrapper(options.repr_method,
                                       ent_embs,
                                       indexed=True)

    elif options.repr_method == TERM:
        # rekey term embeddings
        new_term_embs = {}
        for (term_id, term_emb) in term_embs.items():
            term_str = term_string_map.get(term_id, None)
            if term_str:
                new_term_embs[term_str] = term_emb
        emb_wrapper = EmbeddingWrapper(options.repr_method,
                                       new_term_embs,
                                       backoff_embeds=word_embs,
                                       indexed=True)

    elif options.repr_method == WORD:
        if options.term_embf:
            raise Exception("Honestly, I don't think this setting is used.")
        else:
            emb_wrapper = EmbeddingWrapper(options.repr_method,
                                           word_embs,
                                           indexed=True)

    else:
        raise Exception("Huh? %s" % options.repr_method)

    return emb_wrapper
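
A minimal usage sketch for getEmbeddings, assuming an options object with just the attributes read above (an argparse.Namespace works for illustration); the paths are hypothetical and ENTITY is the representation constant from the surrounding module.

# Hypothetical invocation of getEmbeddings; all values shown are placeholders.
from argparse import Namespace

options = Namespace(
    ent_embf='entity_embeddings.txt', ent_filterf=None,   # illustrative path
    term_embf=None, term_filterf=None,
    word_embf=None, word_filterf=None,
    keep_word_case=False,
    termmapf=None, strmapf=None,
    repr_method=ENTITY,        # ENTITY/TERM/WORD come from the surrounding module
)
emb_wrapper = getEmbeddings(options)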
Example #5
        return args, options

    (configf, ), options = _cli()
    config = configparser.ConfigParser()
    config.read(configf)

    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('SemCor', [
            ('XML', config['SemCor']['XML']),
            ('Labels', config['SemCor']['Labels']),
        ]),
        ('Output file', config['SemCor']['Lemmas']),
    ])

    t_sub = log.startTimer('Pre-processing SemCor text from %s...' %
                           config['SemCor']['XML'])
    (sentences_words, sentences_instances) = wsd_parser.processSentences(
        config['SemCor']['XML'], get_lemmas=True)
    log.stopTimer(t_sub,
                  message='Read {0:,} sentences in {1}s.\n'.format(
                      len(sentences_words), '{0:.2f}'))

    log.writeln('Collecting set of SemCor lemmas...')
    lemmas = set()
    for sentence_instances in sentences_instances:
        for (instance_ID, ix, lemma) in sentence_instances:
            lemmas.add(lemma)
    log.writeln('Found {0:,} distinct lemmas.\n'.format(len(lemmas)))

    log.writeln('Writing list of lemmas to %s...' % config['SemCor']['Lemmas'])
    with codecs.open(config['SemCor']['Lemmas'], 'w', 'utf-8') as stream:
Example #6
        ('WordNet first sense baseline settings', [
            ('Output predictions file',
             options.wordnet_baseline_eval_predictions),
        ]),
        ('ELMo baseline settings', [
            ('Output predictions file',
             options.elmo_baseline_eval_predictions),
            ('SemCor embeddings', options.semcor_embf),
            ('Training lemmas file', options.training_lemmasf),
            ('Pre-calculated WN first sense backoff predictions',
             options.wordnet_baseline_input_predictions),
        ]),
    ],
                             title="ELMo WSD baselines replication")

    t_sub = log.startTimer('Reading mentions from %s...' % mentionf,
                           newline=False)
    mentions = mention_file.read(mentionf)
    log.stopTimer(t_sub, message='Read %d mentions ({0:.2f}s)' % len(mentions))

    log.writeln('Reading mention dataset data from %s...' %
                options.mention_mapf)
    mention_map = dataset_map_utils.readDatasetMap(options.mention_mapf,
                                                   get_IDs=True,
                                                   get_lemmas=True)
    log.writeln('Mapped dataset info for {0:,} mentions.\n'.format(
        len(mention_map)))

    if options.wordnet_baseline_eval_predictions:
        wordnetFirstSenseBaseline(mentions, mention_map,
                                  options.wordnet_baseline_eval_predictions)
    if options.elmo_baseline_eval_predictions:
Example #7
def analogyTask(analogy_file,
                dataset,
                setting,
                analogy_type,
                embeddings,
                log=log,
                report_top_k=5,
                predictions_file=None,
                predictions_file_mode='w',
                to_lower=False):
    analogies = parsers.parse(analogy_file,
                              dataset,
                              setting,
                              analogy_type,
                              to_lower=to_lower)

    # if we're saving the predictions, start that file first
    if predictions_file:
        pred_stream = codecs.open(predictions_file, predictions_file_mode,
                                  'utf-8')

    # build the analogy completion model
    (vocab, emb_arr) = embeddings.toarray()
    vocab_indexer = {vocab[i]: i for i in range(len(vocab))}
    sess = tf.Session()
    grph = AnalogyModel(sess, emb_arr)

    completed, results = 0, {}
    for (relation, rel_analogies) in analogies.items():
        t_file = log.startTimer('  Starting relation: %s (%d/%d)' %
                                (relation, completed + 1, len(analogies)))

        rel_results = completeAnalogySet(rel_analogies,
                                         setting,
                                         emb_arr,
                                         vocab,
                                         vocab_indexer,
                                         grph,
                                         report_top_k,
                                         log=log)
        results[relation] = rel_results

        (correct, MAP, MRR, total, skipped, predictions) = rel_results
        log.stopTimer(
            t_file,
            message=
            '  Completed file: %s (%d/%d) [{0:.2f}s]\n    >> Skipped %d/%d' %
            (relation, completed + 1, len(analogies), skipped, total))

        if predictions_file:
            pred_stream.write(('{0}\n  %s\n{0}\n'.format('-' * 79)) % relation)
            for prediction in predictions:
                ((a, b, c, d), is_correct, num_candidates, top_k) = prediction
                pred_stream.write(
                    '\n%s:%s::%s:%s\nCorrect: %s\nPredictions: %d\n%s\n' %
                    (a, b, c, d, str(is_correct), num_candidates, '\n'.join(
                        [('    %s' % guess) for guess in top_k])))

        completed += 1

    # tie off the predictions file
    if predictions_file: pred_stream.close()

    return results
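
AnalogyModel itself is not shown in this snippet; the following is a minimal NumPy sketch of the standard 3CosAdd completion (argmax over cos(b - a + c, v)) that such a model typically computes, reusing the vocab / emb_arr / vocab_indexer structures built above. The helper name and the unit-norm assumption on emb_arr are assumptions, not part of the original code.

import numpy as np

def complete_analogy_sketch(a, b, c, vocab, emb_arr, vocab_indexer, top_k=5):
    # emb_arr: (|vocab|, dim) array, assumed row-normalized to unit length
    query = (emb_arr[vocab_indexer[b]]
             - emb_arr[vocab_indexer[a]]
             + emb_arr[vocab_indexer[c]])
    query /= np.linalg.norm(query)
    scores = emb_arr @ query                 # cosine similarity with every vocabulary entry
    for w in (a, b, c):                      # never predict the query terms themselves
        scores[vocab_indexer[w]] = -np.inf
    top = np.argsort(-scores)[:top_k]
    return [vocab[i] for i in top]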
Example #8
                not options.vocabf):
            parser.print_help()
            exit()
        return options

    options = _cli()

    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Input embeddings', options.inputf),
        ('Vocabulary file', options.vocabf),
        ('Output embeddings', options.outputf),
        ('Output embeddings format', options.output_format),
    ])

    t_sub = log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
    e = pyemblib.read(options.inputf,
                      format=pyemblib.Format.Word2Vec,
                      mode=pyemblib.Mode.Text)
    log.stopTimer(
        t_sub,
        message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

    log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
    vocab = readVocab(options.vocabf)
    log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

    e = {vocab[int(k)]: v for (k, v) in e.items()}
    log.writeln('Writing remapped embeddings to %s...' % options.outputf)
    (fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
    pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
    log.writeln('Done!')
Example #9
        (options, args) = parser.parse_args()
        if len(args) != 1:
            parser.print_help()
            exit()
        (outf, ) = args
        return outf, options

    outf, options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Dataset configuration file', options.dataset_configf),
        ('Mention ID->dataset map file', options.wsd_mention_map_file),
        ('Mentions for test data only', options.wsd_test_only),
    ],
                             title='Mention extraction for entity linking')

    config = configparser.ConfigParser()
    config.read(options.dataset_configf)

    t_sub = log.startTimer('Generating WSD Evaluation Framework features.')
    datasets = wsd.allAsList(config, test_only=options.wsd_test_only)
    mentions = wsd.getAllMentions(
        datasets, log=log, mention_map_file=options.wsd_mention_map_file)
    log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions))

    t_sub = log.startTimer('Writing samples to %s...' % outf, newline=False)
    mention_file.write(mentions, outf)
    log.stopTimer(t_sub, message='Done ({0:.2f}s).')

    log.stop()
Example #10
        ]),
        ('Training settings', [
            ('Patience', options.patience),
            ('Early stopping criterion', options.early_stopping),
            ('Max training epochs', options.max_epochs),
            ('Checkpoint file', options.checkpoint_path),
            ('Cross validation splits file', options.cross_validation_file),
            ('Number of folds', options.n_folds),
            ('Fraction of training used for dev', options.dev_size),
            ('Writing predictions to', options.predictions_file),
            ('Writing dev results to', options.dev_results_file),
            ('Random seed', options.random_seed),
        ]),
    ], 'WordNet classification experiment')

    t_sub = log.startTimer('Reading word embeddings from %s...' %
                           options.embedding_f)
    embeddings = pyemblib.read(options.embedding_f)
    log.stopTimer(t_sub,
                  message='Read {0:,} embeddings ({1}s).\n'.format(
                      len(embeddings), '{0:.2f}'))

    log.writeln('Reading dataset from %s...' % dataset_f)
    ds = dataset.load(dataset_f)
    log.writeln('Read {0:,} samples.\n'.format(len(ds)))

    preprocessed = preprocess(ds, embeddings, options)

    if options.predictions_file:
        preds_stream = codecs.open(options.predictions_file, 'w', 'utf-8')
    else:
        preds_stream = None

    runCrossValidationExperiment(preprocessed, options, preds_stream)
Example #11
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('SemCor', [
            ('XML', config['SemCor']['XML']),
            ('Labels', config['SemCor']['Labels']),
            ('Vocab', config['SemCor']['Vocab']),
        ]),
        ('ELMo', [
            ('Weights', config['ELMo']['Weights']),
            ('Options', config['ELMo']['Options']),
        ]),
        ('Output file', config['SemCor']['Embeddings']),
    ])

    t_sub = log.startTimer('Reading SemCor labels from %s...' % config['SemCor']['Labels'])
    semcor_labels, unique_sense_IDs = wsd_parser.readLabels(config['SemCor']['Labels'])
    log.stopTimer(t_sub, message='Read {0:,} labels ({1:,} unique senses) in {2}s.\n'.format(
        len(semcor_labels), len(unique_sense_IDs), '{0:.2f}'
    ))

    t_sub = log.startTimer('Pre-processing SemCor text from %s...' % config['SemCor']['XML'])
    (sentences_words, sentences_instances) = wsd_parser.processSentences(config['SemCor']['XML'])
    log.stopTimer(t_sub, message='Read {0:,} sentences in {1}s.\n'.format(
        len(sentences_words), '{0:.2f}'
    ))

    log.writeln('Pre-processing SemCor vocabulary...')
    max_char_len = prepVocabulary(sentences_words, config['SemCor']['Vocab'])
    log.writeln('Wrote vocabulary to {0}.\nMax character length: {1:,}\n'.format(
        config['SemCor']['Vocab'], max_char_len
Example #12
def analogyTask(analogy_file,
                dataset,
                setting,
                analogy_type,
                emb_wrapper,
                log=log,
                report_top_k=5,
                predictions_file=None,
                predictions_file_mode='w'):
    if dataset == datasets.BMASS:
        if analogy_type == 'string':
            analogies = BMASS_parser.read(analogy_file,
                                          setting,
                                          strings_only=True)
        else:
            analogies = BMASS_parser.read(analogy_file,
                                          setting,
                                          cuis_only=True)
    elif dataset == datasets.Google:
        analogies = Google_parser.read(analogy_file, to_lower=True)

    # if we're saving the predictions, start that file first
    if predictions_file:
        pred_stream = codecs.open(predictions_file, predictions_file_mode,
                                  'utf-8')

    # build the analogy completion model
    sess = tf.Session()
    grph = AnalogyModel(sess, emb_wrapper.asArray())

    completed, results = 0, {}
    for (relation, rel_analogies) in analogies.items():
        t_file = log.startTimer('  Starting relation: %s (%d/%d)' %
                                (relation, completed + 1, len(analogies)))

        rel_results = completeAnalogySet(rel_analogies,
                                         setting,
                                         emb_wrapper,
                                         grph,
                                         report_top_k,
                                         log=log)
        results[relation] = rel_results

        (correct, MAP, MRR, total, skipped, predictions) = rel_results
        log.stopTimer(
            t_file,
            message=
            '  Completed file: %s (%d/%d) [{0:.2f}s]\n    >> Skipped %d/%d' %
            (relation, completed + 1, len(analogies), skipped, total))

        if predictions_file:
            pred_stream.write(('{0}\n  %s\n{0}\n'.format('-' * 79)) % relation)
            for prediction in predictions:
                ((a, b, c, d), is_correct, num_candidates, top_k) = prediction
                pred_stream.write(
                    '\n%s:%s::%s:%s\nCorrect: %s\nPredictions: %d\n%s\n' %
                    (a, b, c, d, str(is_correct), num_candidates, '\n'.join(
                        [('    %s' % guess) for guess in top_k])))

        completed += 1

    # tie off the predictions file
    if predictions_file: pred_stream.close()

    return results
Example #13
        parser = optparse.OptionParser(usage='Usage: %prog VOCABF OUTF')
        parser.add_option('--write-lemma', dest='write_lemma',
                action='store_true', default=False,
                help='write the lemma for the synset instead of the synset ID')
        parser.add_option('-l', '--logfile', dest='logfile',
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.print_help()
            exit()
        return args, options
    (vocabf, outf), options = _cli()
    log.start(logfile=options.logfile)

    configlogger.writeConfig(log, [
        ('Vocabulary file to filter to', vocabf),
        ('Output file for relations', outf),
        ('Writing lemmas', options.write_lemma),
    ], 'Filtered WordNet relation generation')

    log.writeln('Reading filter vocab from %s...' % vocabf)
    vocab = loadVocabulary(vocabf)
    log.writeln('Read {0:,} words to filter to.\n'.format(len(vocab)))

    t_sub = log.startTimer('Extracting WordNet pairs....\n')
    enumerateWordNetPairs(vocab, outf, write_lemma=options.write_lemma)
    log.stopTimer(t_sub, message='\nExtraction complete in {0:.2f}s.')

    log.stop()
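
enumerateWordNetPairs is defined elsewhere in the project; as a rough sketch, and assuming it walks NLTK's WordNet graph and keeps only hypernym pairs whose labels fall in the filter vocabulary, it might look like the following (the function name and tab-separated output format are assumptions):

from nltk.corpus import wordnet as wn

def enumerate_wordnet_pairs_sketch(vocab, outf, write_lemma=False):
    # Emit (synset, hypernym) pairs, restricted to labels present in vocab.
    with open(outf, 'w', encoding='utf-8') as stream:
        for synset in wn.all_synsets():
            for hyper in synset.hypernyms():
                src = synset.lemmas()[0].name() if write_lemma else synset.name()
                snk = hyper.lemmas()[0].name() if write_lemma else hyper.name()
                if src in vocab and snk in vocab:
                    stream.write('%s\t%s\n' % (src, snk))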
Example #14
        parser.add_option('--to-lower',
                          dest='to_lower',
                          action='store_true',
                          default=False,
                          help='lowercase all analogies')
        (options, args) = parser.parse_args()
        if not options.dataset:
            parser.print_help()
            exit()

        return options

    options = _cli()
    log.start(logfile=options.logfile)

    t_main = log.startTimer()

    config = configparser.ConfigParser()
    config.read(options.config)

    analogy_file = datasets.getpath(options.dataset, config, options.setting)

    if not options.embeddings:
        options.embeddings = config['Default']['Embeddings']
        options.embeddings_mode = config['Default']['EmbeddingsMode']

    configlogger.writeConfig(
        log,
        settings=[
            ('Config file', options.config),
            ('Dataset', options.dataset),
Example #15
    if filter_stopwords:
        stops = set(stopwords.words('english'))
        old_word_filter = word_filter
        word_filter = lambda w: old_word_filter(w) and (not w in stops)

    if e_vocabf:
        log.writeln('Getting concept filtering vocabulary from %s...' %
                    e_vocabf)
        entity_vocab = set(
            [c.lower() for c in util.readList(e_vocabf, encoding='utf-8')])
        entity_filter = lambda c: c.lower() in entity_vocab
    else:
        entity_filter = lambda c: True

    if dataset == datasets.NLM_WSD:
        t_sub = log.startTimer('Generating NLM WSD features.')
        dataset = nlm_wsd.NLM_WSD()
        mentions = nlm_wsd.getAllMentions(dataset, window_size, word_filter,
                                          entity_filter)
        log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions))
    elif dataset == datasets.AIDA:
        t_sub = log.startTimer('Generating AIDA features.')
        dataset = aida.AIDA()
        mentions = aida.getAllMentions(dataset, window_size, word_filter,
                                       entity_filter)
        log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions))

    t_sub = log.startTimer('Writing samples to %s...' % outf, newline=False)
    mention_file.write(mentions, outf)
    log.stopTimer(t_sub, message='Done ({0:.2f}s).')
Example #16
                help='number of threads to use for parallel calculation (default: %default)',
                type='int', default=1)
        parser.add_option('--batch-size', dest='batch_size',
                help='number of samples to process in each batch (default: %default)',
                type='int', default=25)
        parser.add_option('--keys', dest='keysf',
                help='file listing keys to restrict NN analysis to')
        parser.add_option('-l', '--logfile', dest='logfile',
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.print_help()
            exit()
        embf, outf = args
        return embf, options.mode, options.keysf, outf, options.top_k, options.batch_size, options.threads, options.logfile
    embf, embf_mode, keysf, outf, top_k, batch_size, threads, logfile = _cli()

    if keysf:
        keys = readKeys(keysf)
        print("Read %d keys to restrict to" % len(keys))
    else:
        keys = None

    t = log.startTimer('Reading embeddings...', newline=False)
    embeds = pyemblib.read(embf, mode=embf_mode, filter_to=keys, lower_keys=True)
    log.stopTimer(t, message='Done! Read %d embeddings ({0:.2f}s)' % len(embeds))

    nearest_neighbors = calculateNearestNeighbors(embeds, outf, top_k=top_k, batch_size=batch_size, threads=threads)
    log.writeln('Wrote nearest neighbors to %s.' % outf)
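
calculateNearestNeighbors is not included in the snippet; the sketch below shows a batched cosine-similarity k-nearest-neighbour pass over a pyemblib embedding dict, which is roughly what such a helper is expected to do (the function name and the in-memory return value are assumptions; the original presumably streams results to outf).

import numpy as np

def nearest_neighbors_sketch(embeds, top_k=5, batch_size=25):
    keys = list(embeds.keys())
    mat = np.array([embeds[k] for k in keys], dtype=np.float32)
    mat /= np.linalg.norm(mat, axis=1, keepdims=True)    # normalize once, reuse for every batch
    neighbors = {}
    for start in range(0, len(keys), batch_size):
        batch = mat[start:start + batch_size]
        sims = batch @ mat.T                             # cosine similarities for this batch
        for row, idx in enumerate(range(start, start + len(batch))):
            sims[row, idx] = -np.inf                     # exclude self-matches
            top = np.argsort(-sims[row])[:top_k]
            neighbors[keys[idx]] = [keys[j] for j in top]
    return neighbors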
Example #17
    embf, options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(
        log, [
            ('Input embedding file', embf),
            ('Input embedding file mode', options.embedding_mode),
            ('Output neighbor file', options.outputf),
            ('Ordered vocabulary file', options.vocabf),
            ('Number of nearest neighbors', options.k),
            ('Batch size', options.batch_size),
            ('Number of threads', options.threads),
            ('Partial nearest neighbors file for resuming',
             options.partial_neighbors_file),
        ], 'k Nearest Neighbor calculation with cosine similarity')

    t_sub = log.startTimer('Reading embeddings from %s...' % embf)
    emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
    log.stopTimer(t_sub,
                  message='Read {0:,} embeddings in {1}s.\n'.format(
                      len(emb), '{0:.2f}'))

    if not os.path.isfile(options.vocabf):
        log.writeln('Writing node ID <-> vocab map to %s...\n' %
                    options.vocabf)
        writeNodeMap(emb, options.vocabf)
    else:
        log.writeln('Reading node ID <-> vocab map from %s...\n' %
                    options.vocabf)
    node_map = readNodeMap(options.vocabf)

    # get the vocabulary in node ID order, and map index in emb_arr
Example #18
                                          and not options.word_embf):
            parser.print_help()
            exit()
        (mentionf, ) = args
        return mentionf, options

    mentionf, options = _cli()
    log.start(logfile=options.logfile, stdout_also=True)

    if options.tab_separated:
        sep = '\t'
    else:
        sep = ' '

    t_sub = log.startTimer('Reading entity embeddings from %s...' %
                           options.entity_embf,
                           newline=False)
    entity_embeds = pyemblib.read(options.entity_embf, separator=sep)
    log.stopTimer(t_sub,
                  message='Read %d embeddings ({0:.2f}s)' % len(entity_embeds))

    t_sub = log.startTimer('Reading context embeddings from %s...' %
                           options.ctx_embf,
                           newline=False)
    ctx_embeds = pyemblib.read(options.ctx_embf, separator=sep)
    log.stopTimer(t_sub,
                  message='Read %d embeddings ({0:.2f}s)' % len(ctx_embeds))

    if options.entity_defnf:
        t_sub = log.startTimer('Reading word embeddings from %s...' %
                               options.word_embf,
Example #19
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if (not options.inputf) or (not options.outputf) or (not options.datasetf):
            parser.print_help()
            exit()
        return options
    options = _cli()
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Input embeddings file', options.inputf),
        ('Output embeddings file', options.outputf),
        ('Dataset file', options.datasetf),
    ], 'Embedding filtering for WordNet classification experiments')

    t_sub = log.startTimer('Reading input embeddings from %s...' % options.inputf)
    embeddings = pyemblib.read(options.inputf)
    log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
        len(embeddings), '{0:.2f}'
    ))

    log.writeln('Reading vocabulary from dataset in %s...' % options.datasetf)
    ds = dataset.load(options.datasetf)
    vocab = set()
    for (_, src, snk, _) in ds:
        vocab.add(src)
        vocab.add(snk)
    log.writeln('Found {0:,} unique words in {1:,} samples.\n'.format(
        len(vocab), len(ds)
    ))
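
The snippet is cut off at this point; a hedged sketch of the presumable final step, filtering the embeddings to the dataset vocabulary and writing them back out (the Word2Vec text output format is an assumption):

    # Sketch only: keep embeddings whose keys appear in the dataset vocabulary.
    filtered = {k: v for (k, v) in embeddings.items() if k in vocab}
    log.writeln('Keeping {0:,} of {1:,} embeddings.\n'.format(
        len(filtered), len(embeddings)))
    pyemblib.write(filtered, options.outputf,
                   format=pyemblib.Format.Word2Vec,
                   mode=pyemblib.Mode.Text,
                   verbose=True)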