Example #1
def evaluate(emb_wrapper,
             analogy_file,
             dataset,
             setting,
             analogy_type,
             analogy_method,
             log=log,
             predictions_file=None,
             report_top_k=5):
    t_main = log.startTimer()

    results = analogyTask(analogy_file,
                          dataset,
                          setting,
                          analogy_type,
                          emb_wrapper,
                          log=log,
                          predictions_file=predictions_file,
                          predictions_file_mode='w',
                          report_top_k=report_top_k)

    log.stopTimer(t_main, message='Program complete in {0:.2f}s.')

    return results
Example #2
        (mentionf, ) = args
        return mentionf, options

    mentionf, options = _cli()
    log.start(logfile=options.logfile, stdout_also=True)

    if options.tab_separated:
        sep = '\t'
    else:
        sep = ' '

    t_sub = log.startTimer('Reading entity embeddings from %s...' %
                           options.entity_embf,
                           newline=False)
    entity_embeds = pyemblib.read(options.entity_embf, separator=sep)
    log.stopTimer(t_sub,
                  message='Read %d embeddings ({0:.2f}s)' % len(entity_embeds))

    t_sub = log.startTimer('Reading context embeddings from %s...' %
                           options.ctx_embf,
                           newline=False)
    ctx_embeds = pyemblib.read(options.ctx_embf, separator=sep)
    log.stopTimer(t_sub,
                  message='Read %d embeddings ({0:.2f}s)' % len(ctx_embeds))

    if options.entity_defnf:
        t_sub = log.startTimer('Reading word embeddings from %s...' %
                               options.word_embf,
                               newline=False)
        #word_embeds = pyemblib.read(options.word_embf)
        # word embeddings are aliased to the context embeddings here,
        # despite the log message above
        word_embeds = ctx_embeds
        log.stopTimer(t_sub,
                      message='Read %d embeddings ({0:.2f}s)' % len(word_embeds))
Example #3
        return options

    options = _cli()

    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('Input embeddings', options.inputf),
        ('Vocabulary file', options.vocabf),
        ('Output embeddings', options.outputf),
        ('Output embeddings format', options.output_format),
    ])

    log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
    e = pyemblib.read(options.inputf,
                      format=pyemblib.Format.Word2Vec,
                      mode=pyemblib.Mode.Text)
    log.stopTimer(
        message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

    log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
    vocab = readVocab(options.vocabf)
    log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

    e = {vocab[int(k)]: v for (k, v) in e.items()}
    log.writeln('Writing remapped embeddings to %s...' % options.outputf)
    (fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
    pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
    log.writeln('Done!')

    log.stop()
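Stripped of the logging and key remapping, the pyemblib round trip this example relies on is just: read the file into a key-to-vector map, transform the dict, and write it back out. A minimal sketch of that round trip, assuming placeholder paths 'in.txt' and 'out.txt' (the read/write calls follow the ones used above):

import pyemblib

# read a word2vec text-format embedding file into a {key: vector} map
embeddings = pyemblib.read('in.txt',
                           format=pyemblib.Format.Word2Vec,
                           mode=pyemblib.Mode.Text)

# any dict-style transformation of the keys or vectors works; here,
# lowercase every key
embeddings = {k.lower(): v for (k, v) in embeddings.items()}

# write the transformed embeddings back out in the same format
pyemblib.write(embeddings,
               'out.txt',
               format=pyemblib.Format.Word2Vec,
               mode=pyemblib.Mode.Text,
               verbose=True)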
Example #4
                help='number of threads to use for parallel calculation (default: %default)',
                type='int', default=1)
        parser.add_option('--batch-size', dest='batch_size',
                help='number of samples to process in each batch (default: %default)',
                type='int', default=25)
        parser.add_option('--keys', dest='keysf',
                help='file listing keys to restrict NN analysis to')
        parser.add_option('-l', '--logfile', dest='logfile',
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.print_help()
            exit()
        embf, outf = args
        return embf, options.mode, options.keysf, outf, options.top_k, options.batch_size, options.threads, options.logfile
    embf, embf_mode, keysf, outf, top_k, batch_size, threads, logfile = _cli()

    if keysf:
        keys = readKeys(keysf)
        print("Read %d keys to restrict to" % len(keys))
    else:
        keys = None

    t = log.startTimer('Reading embeddings...', newline=False)
    embeds = pyemblib.read(embf, mode=embf_mode, filter_to=keys, lower_keys=True)
    log.stopTimer(t, message='Done! Read %d embeddings ({0:.2f}s)' % len(embeds))

    nearest_neighbors = calculateNearestNeighbors(embeds, outf, top_k=top_k, batch_size=batch_size, threads=threads)
    log.writeln('Wrote nearest neighbors to %s.' % outf)
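The filter_to/lower_keys combination above is how several of these scripts avoid loading an entire embedding file: filter_to keeps only the listed keys at read time, and lower_keys lowercases keys as they are loaded. A minimal sketch of that pattern, assuming a hypothetical one-key-per-line 'keys.txt' and a placeholder embedding path:

import pyemblib

# hypothetical key file: one embedding key per line
with open('keys.txt') as f:
    keys = set(line.strip().lower() for line in f if line.strip())

# lower_keys lowercases keys on load, so the filter set is lowercased
# above to match
embeds = pyemblib.read('embeddings.bin', filter_to=keys, lower_keys=True)
print('Read %d of %d requested embeddings' % (len(embeds), len(keys)))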
Example #5
        return options

    options = _cli()
    log.start(logfile=options.logfile, stdout_also=True)

    # set the random seed here if necessary
    if options.random_seed <= 0:
        options.random_seed = int(time.time())

    t_sub = log.startTimer('Reading source embeddings from %s...' %
                           options.src_embf,
                           newline=False)
    src_embs = pyemblib.read(options.src_embf,
                             mode=options.src_embf_mode,
                             lower_keys=True)
    log.stopTimer(t_sub,
                  message='Read %d embeddings in {0:.2f}s' % len(src_embs))

    t_sub = log.startTimer('Reading target embeddings from %s...' %
                           options.trg_embf,
                           newline=False)
    trg_embs = pyemblib.read(options.trg_embf,
                             mode=options.trg_embf_mode,
                             lower_keys=True)
    log.stopTimer(t_sub,
                  message='Read %d embeddings in {0:.2f}s' % len(trg_embs))

    pivots = readPivotsFile(options.pivotf, tolower=True)
    log.writeln('Loaded %d pivot terms.' % len(pivots))

    # double check that pivots are present in both embedding files
    validated_pivots = set()
Example #6
def getEmbeddings(options, log=log, separator=' '):
    word_embs, term_embs, ent_embs = None, None, None
    word_ids, term_ids, ent_ids = None, None, None

    # load in embeddings
    if options.ent_embf:
        if options.ent_filterf:
            filter_set = readFilterSet(options.ent_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading entity embeddings from %s...' %
                               options.ent_embf,
                               newline=False)
        ent_embs = pyemblib.read(options.ent_embf,
                                 separator=separator,
                                 replace_errors=True,
                                 filter_to=filter_set,
                                 lower_keys=True)
        log.stopTimer(t_sub,
                      message='Read %d embeddings ({0:.2f}s)' % len(ent_embs))
        ent_ids = ent_embs.keys()
    if options.term_embf:
        if options.term_filterf:
            filter_set = readFilterSet(options.term_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading term embeddings from %s...' %
                               options.term_embf,
                               newline=False)
        term_embs = pyemblib.read(options.term_embf,
                                  separator=separator,
                                  replace_errors=True,
                                  filter_to=filter_set,
                                  lower_keys=True)
        log.stopTimer(t_sub,
                      message='Read %d embeddings ({0:.2f}s)' % len(term_embs))
        term_ids = term_embs.keys()
    if options.word_embf:
        if options.word_filterf:
            filter_set = readFilterSet(options.word_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading word embeddings from %s...' %
                               options.word_embf,
                               newline=False)
        word_embs = pyemblib.read(options.word_embf,
                                  separator=separator,
                                  replace_errors=True,
                                  filter_to=filter_set,
                                  lower_keys=(not options.keep_word_case))
        log.stopTimer(t_sub,
                      message='Read %d embeddings ({0:.2f}s)' % len(word_embs))
        word_ids = word_embs.keys()

    # load in term/string maps
    if options.termmapf:
        t_sub = log.startTimer(
            'Reading term-entity mappings from %s (separated by "%s")...' %
            (options.termmapf, options.term_map_sep),
            newline=False)
        term_entity_map = readTermEntityMap(options.termmapf,
                                            entity_ids=ent_ids,
                                            term_ids=term_ids,
                                            map_sep=options.term_map_sep)
        log.stopTimer(t_sub,
                      message='Read mappings for %d terms ({0:.2f}s)' %
                      len(term_entity_map))
    if options.strmapf:
        t_sub = log.startTimer('Reading term-string mappings from %s...' %
                               options.strmapf,
                               newline=False)
        term_string_map = readTermStringMap(options.strmapf, term_ids=term_ids)
        log.stopTimer(t_sub,
                      message='Read mappings for %d terms ({0:.2f}s)' %
                      len(term_string_map))

    # perform actual approximations
    if options.repr_method == ENTITY:
        emb_wrapper = EmbeddingWrapper(options.repr_method,
                                       ent_embs,
                                       indexed=True)

    elif options.repr_method == TERM:
        # rekey term embeddings
        new_term_embs = {}
        for (term_id, term_emb) in term_embs.items():
            term_str = term_string_map.get(term_id, None)
            if term_str:
                new_term_embs[term_str] = term_emb
        emb_wrapper = EmbeddingWrapper(options.repr_method,
                                       new_term_embs,
                                       backoff_embeds=word_embs,
                                       indexed=True)

    elif options.repr_method == WORD:
        if options.term_embf:
            raise Exception("Honestly, I don't think this setting is used.")
        else:
            emb_wrapper = EmbeddingWrapper(options.repr_method,
                                           word_embs,
                                           indexed=True)

    else:
        raise Exception("Huh? %s" % options.repr_method)

    return emb_wrapper
Example #7
    if options.mode == Mode.UMNSRS:
        datasets = (
            UMNSRS_Similarity(options.repr_method),
            UMNSRS_Relatedness(options.repr_method)
        )
    elif options.mode == Mode.WikiSRS:
        datasets = (
            WikiSRS_Similarity(options.repr_method),
            WikiSRS_Relatedness(options.repr_method)
        )

    t_sub = log.startTimer('Running similarity/relatedness evaluation...', newline=False)
    if options.use_combo:
        results = [
            twoModelEvaluate(dataset, ent_emb_wrapper, str_emb_wrapper, CosineSimilarity,
                    log_predictions=True, use_cross=options.use_cross, cross_only=options.cross_only,
                    skips_f=options.skips_f)
                for dataset in datasets
        ]
    else:
        results = [
            evaluateOn(dataset, emb_wrapper, CosineSimilarity, log_predictions=True, skips_f=options.skips_f)
                for dataset in datasets
        ]
    log.stopTimer(t_sub, message='Done in {0:.2f}s')

    log.writeln('\nResults:')
    for i in range(len(datasets)):
        (rho, compared, ttl) = results[i]
        log.writeln('  %s --> %.4f (%d/%d)' % (datasets[i].name, rho, compared, ttl))
Example #8
        ]),
        ('ELMo baseline settings', [
            ('Output predictions file',
             options.elmo_baseline_eval_predictions),
            ('SemCor embeddings', options.semcor_embf),
            ('Training lemmas file', options.training_lemmasf),
            ('Pre-calculated WN first sense backoff predictions',
             options.wordnet_baseline_input_predictions),
        ]),
    ],
                             title="ELMo WSD baselines replication")

    t_sub = log.startTimer('Reading mentions from %s...' % mentionf,
                           newline=False)
    mentions = mention_file.read(mentionf)
    log.stopTimer(t_sub, message='Read %d mentions ({0:.2f}s)' % len(mentions))

    log.writeln('Reading mention dataset data from %s...' %
                options.mention_mapf)
    mention_map = dataset_map_utils.readDatasetMap(options.mention_mapf,
                                                   get_IDs=True,
                                                   get_lemmas=True)
    log.writeln('Mapped dataset info for {0:,} mentions.\n'.format(
        len(mention_map)))

    if options.wordnet_baseline_eval_predictions:
        wordnetFirstSenseBaseline(mentions, mention_map,
                                  options.wordnet_baseline_eval_predictions)
    if options.elmo_baseline_eval_predictions:
        log.writeln('Reading set of training lemmas from %s...' %
                    options.training_lemmasf)
Example #9
    log.start(logfile=options.logfile)
    configlogger.writeConfig(log, [
        ('SemCor', [
            ('XML', config['SemCor']['XML']),
            ('Labels', config['SemCor']['Labels']),
        ]),
        ('Output file', config['SemCor']['Lemmas']),
    ])

    t_sub = log.startTimer('Pre-processing SemCor text from %s...' %
                           config['SemCor']['XML'])
    (sentences_words, sentences_instances) = wsd_parser.processSentences(
        config['SemCor']['XML'], get_lemmas=True)
    log.stopTimer(t_sub,
                  message='Read {0:,} sentences in {1}s.\n'.format(
                      len(sentences_words), '{0:.2f}'))

    log.writeln('Collecting set of SemCor lemmas...')
    lemmas = set()
    for sentence_instances in sentences_instances:
        for (instance_ID, ix, lemma) in sentence_instances:
            lemmas.add(lemma)
    log.writeln('Found {0:,} distinct lemmas.\n'.format(len(lemmas)))

    log.writeln('Writing list of lemmas to %s...' % config['SemCor']['Lemmas'])
    with codecs.open(config['SemCor']['Lemmas'], 'w', 'utf-8') as stream:
        for lemma in lemmas:
            stream.write('%s\n' % lemma)
    log.writeln('Done.\n')
Example #10
            ('Max training epochs', options.max_epochs),
            ('Checkpoint file', options.checkpoint_path),
            ('Cross validation splits file', options.cross_validation_file),
            ('Number of folds', options.n_folds),
            ('Fraction of training used for dev', options.dev_size),
            ('Writing predictions to', options.predictions_file),
            ('Writing dev results to', options.dev_results_file),
            ('Random seed', options.random_seed),
        ]),
    ], 'WordNet classification experiment')

    t_sub = log.startTimer('Reading word embeddings from %s...' %
                           options.embedding_f)
    embeddings = pyemblib.read(options.embedding_f)
    log.stopTimer(t_sub,
                  message='Read {0:,} embeddings ({1}s).\n'.format(
                      len(embeddings), '{0:.2f}'))

    log.writeln('Reading dataset from %s...' % dataset_f)
    ds = dataset.load(dataset_f)
    log.writeln('Read {0:,} samples.\n'.format(len(ds)))

    preprocessed = preprocess(ds, embeddings, options)

    if options.predictions_file:
        preds_stream = codecs.open(options.predictions_file, 'w', 'utf-8')
    else:
        # avoid a NameError when no predictions file was requested
        preds_stream = None

    runCrossValidationExperiment(preprocessed, options, preds_stream)

    if options.predictions_file:
        preds_stream.close()
Example #11
    if filter_stopwords:
        stops = set(stopwords.words('english'))
        old_word_filter = word_filter
        word_filter = lambda w: old_word_filter(w) and (w not in stops)

    if e_vocabf:
        log.writeln('Getting concept filtering vocabulary from %s...' %
                    e_vocabf)
        entity_vocab = set(
            [c.lower() for c in util.readList(e_vocabf, encoding='utf-8')])
        entity_filter = lambda c: c.lower() in entity_vocab
    else:
        entity_filter = lambda c: True

    if dataset == datasets.NLM_WSD:
        t_sub = log.startTimer('Generating NLM WSD features.')
        dataset = nlm_wsd.NLM_WSD()
        mentions = nlm_wsd.getAllMentions(dataset, window_size, word_filter,
                                          entity_filter)
        log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions))
    elif dataset == datasets.AIDA:
        t_sub = log.startTimer('Generating AIDA features.')
        dataset = aida.AIDA()
        mentions = aida.getAllMentions(dataset, window_size, word_filter,
                                       entity_filter)
        log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions))

    t_sub = log.startTimer('Writing samples to %s...' % outf, newline=False)
    mention_file.write(mentions, outf)
    log.stopTimer(t_sub, message='Done ({0:.2f}s).')
Example #12
        ('SemCor', [
            ('XML', config['SemCor']['XML']),
            ('Labels', config['SemCor']['Labels']),
            ('Vocab', config['SemCor']['Vocab']),
        ]),
        ('ELMo', [
            ('Weights', config['ELMo']['Weights']),
            ('Options', config['ELMo']['Options']),
        ]),
        ('Output file', config['SemCor']['Embeddings']),
    ])

    t_sub = log.startTimer('Reading SemCor labels from %s...' % config['SemCor']['Labels'])
    semcor_labels, unique_sense_IDs = wsd_parser.readLabels(config['SemCor']['Labels'])
    log.stopTimer(t_sub, message='Read {0:,} labels ({1:,} unique senses) in {2}s.\n'.format(
        len(semcor_labels), len(unique_sense_IDs), '{0:.2f}'
    ))

    t_sub = log.startTimer('Pre-processing SemCor text from %s...' % config['SemCor']['XML'])
    (sentences_words, sentences_instances) = wsd_parser.processSentences(config['SemCor']['XML'])
    log.stopTimer(t_sub, message='Read {0:,} sentences in {1}s.\n'.format(
        len(sentences_words), '{0:.2f}'
    ))

    log.writeln('Pre-processing SemCor vocabulary...')
    max_char_len = prepVocabulary(sentences_words, config['SemCor']['Vocab'])
    log.writeln('Wrote vocabulary to {0}.\nMax character length: {1:,}\n'.format(
        config['SemCor']['Vocab'], max_char_len
    ))

    log.writeln('OVERRIDING max_char_len to 50!\n')
Example #13
def analogyTask(analogy_file,
                dataset,
                setting,
                analogy_type,
                emb_wrapper,
                log=log,
                report_top_k=5,
                predictions_file=None,
                predictions_file_mode='w'):
    if dataset == datasets.BMASS:
        if analogy_type == 'string':
            analogies = BMASS_parser.read(analogy_file,
                                          setting,
                                          strings_only=True)
        else:
            analogies = BMASS_parser.read(analogy_file,
                                          setting,
                                          cuis_only=True)
    elif dataset == datasets.Google:
        analogies = Google_parser.read(analogy_file, to_lower=True)

    # if we're saving the predictions, start that file first
    if predictions_file:
        pred_stream = codecs.open(predictions_file, predictions_file_mode,
                                  'utf-8')

    # build the analogy completion model
    sess = tf.Session()
    grph = AnalogyModel(sess, emb_wrapper.asArray())

    completed, results = 0, {}
    for (relation, rel_analogies) in analogies.items():
        t_file = log.startTimer('  Starting relation: %s (%d/%d)' %
                                (relation, completed + 1, len(analogies)))

        rel_results = completeAnalogySet(rel_analogies,
                                         setting,
                                         emb_wrapper,
                                         grph,
                                         report_top_k,
                                         log=log)
        results[relation] = rel_results

        (correct, MAP, MRR, total, skipped, predictions) = rel_results
        log.stopTimer(
            t_file,
            message=
            '  Completed file: %s (%d/%d) [{0:.2f}s]\n    >> Skipped %d/%d' %
            (relation, completed + 1, len(analogies), skipped, total))

        if predictions_file:
            pred_stream.write(('{0}\n  %s\n{0}\n'.format('-' * 79)) % relation)
            for prediction in predictions:
                ((a, b, c, d), is_correct, num_candidates, top_k) = prediction
                pred_stream.write(
                    '\n%s:%s::%s:%s\nCorrect: %s\nPredictions: %d\n%s\n' %
                    (a, b, c, d, str(is_correct), num_candidates, '\n'.join(
                        [('    %s' % guess) for guess in top_k])))

        completed += 1

    # tie off the predictions file
    if predictions_file: pred_stream.close()

    return results
Example #14
        parser = optparse.OptionParser(usage='Usage: %prog VOCABF OUTF')
        parser.add_option('--write-lemma', dest='write_lemma',
                action='store_true', default=False,
                help='write the lemma for the synset instead of the synset ID')
        parser.add_option('-l', '--logfile', dest='logfile',
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if len(args) != 2:
            parser.print_help()
            exit()
        return args, options
    (vocabf, outf), options = _cli()
    log.start(logfile=options.logfile)

    configlogger.writeConfig(log, [
        ('Vocabulary file to filter to', vocabf),
        ('Output file for relations', outf),
        ('Writing lemmas', options.write_lemma),
    ], 'Filtered WordNet relation generation')

    log.writeln('Reading filter vocab from %s...' % vocabf)
    vocab = loadVocabulary(vocabf)
    log.writeln('Read {0:,} words to filter to.\n'.format(len(vocab)))

    t_sub = log.startTimer('Extracting WordNet pairs....\n')
    enumerateWordNetPairs(vocab, outf, write_lemma=options.write_lemma)
    log.stopTimer(t_sub, message='\nExtraction complete in {0:.2f}s.')

    log.stop()
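Most of these scripts wrap argument handling in the same optparse-based _cli() helper, which is only partially visible in several of the snippets above. A self-contained sketch of that idiom, with a hypothetical pair of positional arguments:

import optparse

def _cli():
    # hypothetical usage string and options, following the pattern above
    parser = optparse.OptionParser(usage='Usage: %prog EMBF OUTF')
    parser.add_option('-l', '--logfile', dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        exit()
    embf, outf = args
    return embf, outf, options

if __name__ == '__main__':
    embf, outf, options = _cli()
    print('Embeddings: %s  Output: %s  Logfile: %s'
          % (embf, outf, options.logfile))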
Example #15
        log.writeln(
            '[WARNING] Invalid --analogy-type setting for %s dataset; Overriding to "%s"'
            % (options.dataset, data_mode.String))
        options.anlg_type = data_mode.String

    t_sub = log.startTimer('Reading %s embeddings from %s...' %
                           (options.embeddings_mode, options.embeddings))
    separator = '\t' if options.tab_sep else ' '
    (fmt, mode) = pyemblib.CLI_Formats.parse(options.embeddings_mode)
    embeddings = pyemblib.read(options.embeddings,
                               format=fmt,
                               mode=mode,
                               separator=separator,
                               lower_keys=options.to_lower)
    log.stopTimer(
        t_sub,
        'Read {0:,} embeddings in {1}s.\n'.format(len(embeddings), '{0:.2f}'))

    t_sub = log.startTimer('Running analogy task on %s dataset...' %
                           options.dataset)
    results = analogyTask(analogy_file,
                          options.dataset,
                          options.setting,
                          options.anlg_type,
                          embeddings,
                          log=log,
                          predictions_file=options.predictions_file,
                          predictions_file_mode='w',
                          report_top_k=options.report_top_k,
                          to_lower=options.to_lower)
    log.stopTimer(t_sub, 'Done in {0:.2f}s.')
Example #16
def analogyTask(analogy_file,
                dataset,
                setting,
                analogy_type,
                embeddings,
                log=log,
                report_top_k=5,
                predictions_file=None,
                predictions_file_mode='w',
                to_lower=False):
    analogies = parsers.parse(analogy_file,
                              dataset,
                              setting,
                              analogy_type,
                              to_lower=to_lower)

    # if we're saving the predictions, start that file first
    if predictions_file:
        pred_stream = codecs.open(predictions_file, predictions_file_mode,
                                  'utf-8')

    # build the analogy completion model
    (vocab, emb_arr) = embeddings.toarray()
    vocab_indexer = {vocab[i]: i for i in range(len(vocab))}
    sess = tf.Session()
    grph = AnalogyModel(sess, emb_arr)

    completed, results = 0, {}
    for (relation, rel_analogies) in analogies.items():
        t_file = log.startTimer('  Starting relation: %s (%d/%d)' %
                                (relation, completed + 1, len(analogies)))

        rel_results = completeAnalogySet(rel_analogies,
                                         setting,
                                         emb_arr,
                                         vocab,
                                         vocab_indexer,
                                         grph,
                                         report_top_k,
                                         log=log)
        results[relation] = rel_results

        (correct, MAP, MRR, total, skipped, predictions) = rel_results
        log.stopTimer(
            t_file,
            message=
            '  Completed file: %s (%d/%d) [{0:.2f}s]\n    >> Skipped %d/%d' %
            (relation, completed + 1, len(analogies), skipped, total))

        if predictions_file:
            pred_stream.write(('{0}\n  %s\n{0}\n'.format('-' * 79)) % relation)
            for prediction in predictions:
                ((a, b, c, d), is_correct, num_candidates, top_k) = prediction
                pred_stream.write(
                    '\n%s:%s::%s:%s\nCorrect: %s\nPredictions: %d\n%s\n' %
                    (a, b, c, d, str(is_correct), num_candidates, '\n'.join(
                        [('    %s' % guess) for guess in top_k])))

        completed += 1

    # tie off the predictions file
    if predictions_file: pred_stream.close()

    return results
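The (vocab, emb_arr) pair returned by embeddings.toarray() above can also be rebuilt by hand from any pyemblib key-to-vector map, which makes the vocab_indexer construction easier to see in isolation. A minimal sketch, assuming numpy and a placeholder embedding path 'vectors.bin':

import numpy as np
import pyemblib

embeddings = pyemblib.read('vectors.bin')

# fix a vocabulary order and stack the vectors into one matrix,
# mirroring the (vocab, emb_arr) pair consumed by AnalogyModel above
vocab = list(embeddings.keys())
emb_arr = np.array([embeddings[k] for k in vocab])

# map each key to its row index, as in vocab_indexer above
vocab_indexer = {vocab[i]: i for i in range(len(vocab))}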