def evaluate(emb_wrapper, analogy_file, dataset, setting, analogy_type,
        analogy_method, log=log, predictions_file=None, report_top_k=5):
    t_main = log.startTimer()
    results = analogyTask(analogy_file, dataset, setting, analogy_type,
        emb_wrapper, log=log, predictions_file=predictions_file,
        predictions_file_mode='w', report_top_k=report_top_k)
    log.stopTimer(t_main, message='Program complete in {0:.2f}s.')
    return results
    if len(args) != 0 or (not options.src_embf) or (not options.trg_embf) \
            or (not options.outf) or (not options.pivotf):
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(logfile=options.logfile, stdout_also=True)

# set the random seed here if necessary
if options.random_seed <= 0:
    options.random_seed = int(time.time())

t_sub = log.startTimer('Reading source embeddings from %s...'
    % options.src_embf, newline=False)
src_embs = pyemblib.read(options.src_embf, mode=options.src_embf_mode,
    lower_keys=True)
log.stopTimer(t_sub, message='Read %d embeddings in {0:.2f}s' % len(src_embs))

t_sub = log.startTimer('Reading target embeddings from %s...'
    % options.trg_embf, newline=False)
trg_embs = pyemblib.read(options.trg_embf, mode=options.trg_embf_mode,
    lower_keys=True)
log.stopTimer(t_sub, message='Read %d embeddings in {0:.2f}s' % len(trg_embs))
options.ent_embf = None
options.word_embf = word_embf
str_emb_wrapper = em.getEmbeddings(options, log=log, separator=separator)

if options.mode == Mode.UMNSRS:
    datasets = (
        UMNSRS_Similarity(options.repr_method),
        UMNSRS_Relatedness(options.repr_method)
    )
elif options.mode == Mode.WikiSRS:
    datasets = (
        WikiSRS_Similarity(options.repr_method),
        WikiSRS_Relatedness(options.repr_method)
    )

t_sub = log.startTimer('Running similarity/relatedness evaluation...',
    newline=False)
if options.use_combo:
    results = [
        twoModelEvaluate(dataset, ent_emb_wrapper, str_emb_wrapper,
            CosineSimilarity, log_predictions=True,
            use_cross=options.use_cross, cross_only=options.cross_only,
            skips_f=options.skips_f)
        for dataset in datasets
    ]
else:
    results = [
        evaluateOn(dataset, emb_wrapper, CosineSimilarity,
            log_predictions=True, skips_f=options.skips_f)
        for dataset in datasets
    ]
log.stopTimer(t_sub, message='Done in {0:.2f}s')

log.writeln('\nResults:')
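# Illustrative sketch (not part of the original script) of the kind of scoring
# evaluateOn() with CosineSimilarity presumably performs on UMNSRS/WikiSRS-style
# datasets: cosine similarity for each concept pair, correlated against the
# human ratings with Spearman's rho. All names below are assumed placeholders.
import numpy as np
from scipy.stats import spearmanr

def cosine(u, v):
    # cosine similarity between two dense vectors
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def score_similarity_dataset(pairs, ratings, embeddings):
    # pairs: list of (key_a, key_b); ratings: parallel list of human scores;
    # embeddings: dict mapping keys to vectors. Pairs with OOV keys are skipped.
    predicted, gold = [], []
    for (a, b), rating in zip(pairs, ratings):
        if a in embeddings and b in embeddings:
            predicted.append(cosine(embeddings[a], embeddings[b]))
            gold.append(rating)
    rho, _ = spearmanr(predicted, gold)
    return rho, len(gold)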
def getEmbeddings(options, log=log, separator=' '):
    word_embs, term_embs, ent_embs = None, None, None
    word_ids, term_ids, ent_ids = None, None, None

    # load in embeddings
    if options.ent_embf:
        if options.ent_filterf:
            filter_set = readFilterSet(options.ent_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading entity embeddings from %s...'
            % options.ent_embf, newline=False)
        ent_embs = pyemblib.read(options.ent_embf, separator=separator,
            replace_errors=True, filter_to=filter_set, lower_keys=True)
        log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
            % len(ent_embs))
        ent_ids = ent_embs.keys()
    if options.term_embf:
        if options.term_filterf:
            filter_set = readFilterSet(options.term_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading term embeddings from %s...'
            % options.term_embf, newline=False)
        term_embs = pyemblib.read(options.term_embf, separator=separator,
            replace_errors=True, filter_to=filter_set, lower_keys=True)
        log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
            % len(term_embs))
        term_ids = term_embs.keys()
    if options.word_embf:
        if options.word_filterf:
            filter_set = readFilterSet(options.word_filterf)
        else:
            filter_set = None
        t_sub = log.startTimer('Reading word embeddings from %s...'
            % options.word_embf, newline=False)
        word_embs = pyemblib.read(options.word_embf, separator=separator,
            replace_errors=True, filter_to=filter_set,
            lower_keys=(not options.keep_word_case))
        log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
            % len(word_embs))
        word_ids = word_embs.keys()

    # load in term/string maps
    if options.termmapf:
        t_sub = log.startTimer(
            'Reading term-entity mappings from %s (separated by "%s")...'
            % (options.termmapf, options.term_map_sep), newline=False)
        term_entity_map = readTermEntityMap(options.termmapf,
            entity_ids=ent_ids, term_ids=term_ids,
            map_sep=options.term_map_sep)
        log.stopTimer(t_sub, message='Read mappings for %d terms ({0:.2f}s)'
            % len(term_entity_map))
    if options.strmapf:
        t_sub = log.startTimer('Reading term-string mappings from %s...'
            % options.strmapf, newline=False)
        term_string_map = readTermStringMap(options.strmapf, term_ids=term_ids)
        log.stopTimer(t_sub, message='Read mappings for %d terms ({0:.2f}s)'
            % len(term_string_map))

    # perform actual approximations
    if options.repr_method == ENTITY:
        emb_wrapper = EmbeddingWrapper(options.repr_method, ent_embs,
            indexed=True)
    elif options.repr_method == TERM:
        # rekey term embeddings
        new_term_embs = {}
        for (term_id, term_emb) in term_embs.items():
            term_str = term_string_map.get(term_id, None)
            if term_str:
                new_term_embs[term_str] = term_emb
        emb_wrapper = EmbeddingWrapper(options.repr_method, new_term_embs,
            backoff_embeds=word_embs, indexed=True)
    elif options.repr_method == WORD:
        if options.term_embf:
            raise Exception("Honestly, I don't think this setting is used.")
        else:
            emb_wrapper = EmbeddingWrapper(options.repr_method, word_embs,
                indexed=True)
    else:
        raise Exception("Huh? %s" % options.repr_method)

    return emb_wrapper
    return args, options

(configf, ), options = _cli()
config = configparser.ConfigParser()
config.read(configf)

log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('SemCor', [
        ('XML', config['SemCor']['XML']),
        ('Labels', config['SemCor']['Labels']),
    ]),
    ('Output file', config['SemCor']['Lemmas']),
])

t_sub = log.startTimer('Pre-processing SemCor text from %s...'
    % config['SemCor']['XML'])
(sentences_words, sentences_instances) = wsd_parser.processSentences(
    config['SemCor']['XML'], get_lemmas=True)
log.stopTimer(t_sub, message='Read {0:,} sentences in {1}s.\n'.format(
    len(sentences_words), '{0:.2f}'))

log.writeln('Collecting set of SemCor lemmas...')
lemmas = set()
for sentence_instances in sentences_instances:
    for (instance_ID, ix, lemma) in sentence_instances:
        lemmas.add(lemma)
log.writeln('Found {0:,} distinct lemmas.\n'.format(len(lemmas)))

log.writeln('Writing list of lemmas to %s...' % config['SemCor']['Lemmas'])
with codecs.open(config['SemCor']['Lemmas'], 'w', 'utf-8') as stream:
    ('WordNet first sense baseline settings', [
        ('Output predictions file', options.wordnet_baseline_eval_predictions),
    ]),
    ('ELMo baseline settings', [
        ('Output predictions file', options.elmo_baseline_eval_predictions),
        ('SemCor embeddings', options.semcor_embf),
        ('Training lemmas file', options.training_lemmasf),
        ('Pre-calculated WN first sense backoff predictions',
            options.wordnet_baseline_input_predictions),
    ]),
], title="ELMo WSD baselines replication")

t_sub = log.startTimer('Reading mentions from %s...' % mentionf, newline=False)
mentions = mention_file.read(mentionf)
log.stopTimer(t_sub, message='Read %d mentions ({0:.2f}s)' % len(mentions))

log.writeln('Reading mention dataset data from %s...' % options.mention_mapf)
mention_map = dataset_map_utils.readDatasetMap(options.mention_mapf,
    get_IDs=True, get_lemmas=True)
log.writeln('Mapped dataset info for {0:,} mentions.\n'.format(
    len(mention_map)))

if options.wordnet_baseline_eval_predictions:
    wordnetFirstSenseBaseline(mentions, mention_map,
        options.wordnet_baseline_eval_predictions)

if options.elmo_baseline_eval_predictions:
def analogyTask(analogy_file, dataset, setting, analogy_type, embeddings,
        log=log, report_top_k=5, predictions_file=None,
        predictions_file_mode='w', to_lower=False):
    analogies = parsers.parse(analogy_file, dataset, setting, analogy_type,
        to_lower=to_lower)

    # if we're saving the predictions, start that file first
    if predictions_file:
        pred_stream = codecs.open(predictions_file, predictions_file_mode,
            'utf-8')

    # build the analogy completion model
    (vocab, emb_arr) = embeddings.toarray()
    vocab_indexer = {vocab[i]: i for i in range(len(vocab))}
    sess = tf.Session()
    grph = AnalogyModel(sess, emb_arr)

    completed, results = 0, {}
    for (relation, rel_analogies) in analogies.items():
        t_file = log.startTimer(' Starting relation: %s (%d/%d)'
            % (relation, completed + 1, len(analogies)))
        rel_results = completeAnalogySet(rel_analogies, setting, emb_arr,
            vocab, vocab_indexer, grph, report_top_k, log=log)
        results[relation] = rel_results
        (correct, MAP, MRR, total, skipped, predictions) = rel_results
        log.stopTimer(t_file, message=
            ' Completed file: %s (%d/%d) [{0:.2f}s]\n >> Skipped %d/%d'
            % (relation, completed + 1, len(analogies), skipped, total))

        if predictions_file:
            pred_stream.write(('{0}\n %s\n{0}\n'.format('-' * 79)) % relation)
            for prediction in predictions:
                ((a, b, c, d), is_correct, num_candidates, top_k) = prediction
                pred_stream.write(
                    '\n%s:%s::%s:%s\nCorrect: %s\nPredictions: %d\n%s\n' % (
                        a, b, c, d, str(is_correct), num_candidates,
                        '\n'.join([(' %s' % guess) for guess in top_k])))

        completed += 1

    # tie off the predictions file
    if predictions_file:
        pred_stream.close()

    return results
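# Illustrative numpy sketch (not part of the original code) of the vector-offset
# analogy completion that AnalogyModel presumably implements: for a : b :: c : ?,
# predict the vocabulary items closest to b - a + c under cosine similarity,
# excluding a, b, and c themselves. The actual model runs in TensorFlow; the
# function and argument names here are assumptions for illustration only.
import numpy as np

def complete_analogy(a, b, c, emb_arr, vocab, vocab_indexer, top_k=5):
    # emb_arr: (|V|, d) embedding matrix whose rows align with the vocab list
    ia, ib, ic = vocab_indexer[a], vocab_indexer[b], vocab_indexer[c]
    query = emb_arr[ib] - emb_arr[ia] + emb_arr[ic]
    # cosine similarity of the query vector against every vocabulary row
    norms = np.linalg.norm(emb_arr, axis=1) * np.linalg.norm(query)
    sims = emb_arr.dot(query) / np.maximum(norms, 1e-8)
    sims[[ia, ib, ic]] = -np.inf  # never predict the given analogy terms
    top_ixs = np.argsort(-sims)[:top_k]
    return [vocab[i] for i in top_ixs]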
            not options.vocabf):
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Input embeddings', options.inputf),
    ('Vocabulary file', options.vocabf),
    ('Output embeddings', options.outputf),
    ('Output embeddings format', options.output_format),
])

t_sub = log.startTimer('Reading node2vec embeddings from %s...'
    % options.inputf)
e = pyemblib.read(options.inputf, format=pyemblib.Format.Word2Vec,
    mode=pyemblib.Mode.Text)
log.stopTimer(t_sub,
    message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
vocab = readVocab(options.vocabf)
log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

# remap integer node IDs to their vocabulary strings
e = {vocab[int(k)]: v for (k, v) in e.items()}

log.writeln('Writing remapped embeddings to %s...' % options.outputf)
(fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
log.writeln('Done!')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.print_help()
        exit()
    (outf, ) = args
    return outf, options

outf, options = _cli()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Dataset configuration file', options.dataset_configf),
    ('Mention ID->dataset map file', options.wsd_mention_map_file),
    ('Mentions for test data only', options.wsd_test_only),
], title='Mention extraction for entity linking')

config = configparser.ConfigParser()
config.read(options.dataset_configf)

t_sub = log.startTimer('Generating WSD Evaluation Framework features.')
datasets = wsd.allAsList(config, test_only=options.wsd_test_only)
mentions = wsd.getAllMentions(datasets, log=log,
    mention_map_file=options.wsd_mention_map_file)
log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions))

t_sub = log.startTimer('Writing samples to %s...' % outf, newline=False)
mention_file.write(mentions, outf)
log.stopTimer(t_sub, message='Done ({0:.2f}s).')

log.stop()
    ]),
    ('Training settings', [
        ('Patience', options.patience),
        ('Early stopping criterion', options.early_stopping),
        ('Max training epochs', options.max_epochs),
        ('Checkpoint file', options.checkpoint_path),
        ('Cross validation splits file', options.cross_validation_file),
        ('Number of folds', options.n_folds),
        ('Fraction of training used for dev', options.dev_size),
        ('Writing predictions to', options.predictions_file),
        ('Writing dev results to', options.dev_results_file),
        ('Random seed', options.random_seed),
    ]),
], 'WordNet classification experiment')

t_sub = log.startTimer('Reading word embeddings from %s...'
    % options.embedding_f)
embeddings = pyemblib.read(options.embedding_f)
log.stopTimer(t_sub, message='Read {0:,} embeddings ({1}s).\n'.format(
    len(embeddings), '{0:.2f}'))

log.writeln('Reading dataset from %s...' % dataset_f)
ds = dataset.load(dataset_f)
log.writeln('Read {0:,} samples.\n'.format(len(ds)))

preprocessed = preprocess(ds, embeddings, options)

if options.predictions_file:
    preds_stream = codecs.open(options.predictions_file, 'w', 'utf-8')

runCrossValidationExperiment(preprocessed, options, preds_stream)
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('SemCor', [
        ('XML', config['SemCor']['XML']),
        ('Labels', config['SemCor']['Labels']),
        ('Vocab', config['SemCor']['Vocab']),
    ]),
    ('ELMo', [
        ('Weights', config['ELMo']['Weights']),
        ('Options', config['ELMo']['Options']),
    ]),
    ('Output file', config['SemCor']['Embeddings']),
])

t_sub = log.startTimer('Reading SemCor labels from %s...'
    % config['SemCor']['Labels'])
semcor_labels, unique_sense_IDs = wsd_parser.readLabels(
    config['SemCor']['Labels'])
log.stopTimer(t_sub,
    message='Read {0:,} labels ({1:,} unique senses) in {2}s.\n'.format(
        len(semcor_labels), len(unique_sense_IDs), '{0:.2f}'))

t_sub = log.startTimer('Pre-processing SemCor text from %s...'
    % config['SemCor']['XML'])
(sentences_words, sentences_instances) = wsd_parser.processSentences(
    config['SemCor']['XML'])
log.stopTimer(t_sub, message='Read {0:,} sentences in {1}s.\n'.format(
    len(sentences_words), '{0:.2f}'))

log.writeln('Pre-processing SemCor vocabulary...')
max_char_len = prepVocabulary(sentences_words, config['SemCor']['Vocab'])
log.writeln('Wrote vocabulary to {0}.\nMax character length: {1:,}\n'.format(
    config['SemCor']['Vocab'], max_char_len
def analogyTask(analogy_file, dataset, setting, analogy_type, emb_wrapper,
        log=log, report_top_k=5, predictions_file=None,
        predictions_file_mode='w'):
    if dataset == datasets.BMASS:
        if analogy_type == 'string':
            analogies = BMASS_parser.read(analogy_file, setting,
                strings_only=True)
        else:
            analogies = BMASS_parser.read(analogy_file, setting,
                cuis_only=True)
    elif dataset == datasets.Google:
        analogies = Google_parser.read(analogy_file, to_lower=True)

    # if we're saving the predictions, start that file first
    if predictions_file:
        pred_stream = codecs.open(predictions_file, predictions_file_mode,
            'utf-8')

    # build the analogy completion model
    sess = tf.Session()
    grph = AnalogyModel(sess, emb_wrapper.asArray())

    completed, results = 0, {}
    for (relation, rel_analogies) in analogies.items():
        t_file = log.startTimer(' Starting relation: %s (%d/%d)'
            % (relation, completed + 1, len(analogies)))
        rel_results = completeAnalogySet(rel_analogies, setting, emb_wrapper,
            grph, report_top_k, log=log)
        results[relation] = rel_results
        (correct, MAP, MRR, total, skipped, predictions) = rel_results
        log.stopTimer(t_file, message=
            ' Completed file: %s (%d/%d) [{0:.2f}s]\n >> Skipped %d/%d'
            % (relation, completed + 1, len(analogies), skipped, total))

        if predictions_file:
            pred_stream.write(('{0}\n %s\n{0}\n'.format('-' * 79)) % relation)
            for prediction in predictions:
                ((a, b, c, d), is_correct, num_candidates, top_k) = prediction
                pred_stream.write(
                    '\n%s:%s::%s:%s\nCorrect: %s\nPredictions: %d\n%s\n' % (
                        a, b, c, d, str(is_correct), num_candidates,
                        '\n'.join([(' %s' % guess) for guess in top_k])))

        completed += 1

    # tie off the predictions file
    if predictions_file:
        pred_stream.close()

    return results
    parser = optparse.OptionParser(usage='Usage: %prog VOCABF OUTF')
    parser.add_option('--write-lemma', dest='write_lemma',
        action='store_true', default=False,
        help='write the lemma for the synset instead of the synset ID')
    parser.add_option('-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        exit()
    return args, options

(vocabf, outf), options = _cli()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Vocabulary file to filter to', vocabf),
    ('Output file for relations', outf),
    ('Writing lemmas', options.write_lemma),
], 'Filtered WordNet relation generation')

log.writeln('Reading filter vocab from %s...' % vocabf)
vocab = loadVocabulary(vocabf)
log.writeln('Read {0:,} words to filter to.\n'.format(len(vocab)))

t_sub = log.startTimer('Extracting WordNet pairs....\n')
enumerateWordNetPairs(vocab, outf, write_lemma=options.write_lemma)
log.stopTimer(t_sub, message='\nExtraction complete in {0:.2f}s.')

log.stop()
    parser.add_option('--to-lower', dest='to_lower', action='store_true',
        default=False, help='lowercase all analogies')
    (options, args) = parser.parse_args()

    if not options.dataset:
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(logfile=options.logfile)

t_main = log.startTimer()

config = configparser.ConfigParser()
config.read(options.config)
analogy_file = datasets.getpath(options.dataset, config, options.setting)

if not options.embeddings:
    options.embeddings = config['Default']['Embeddings']
    options.embeddings_mode = config['Default']['EmbeddingsMode']

configlogger.writeConfig(log, settings=[
    ('Config file', options.config),
    ('Dataset', options.dataset),
    if filter_stopwords:
        stops = set(stopwords.words('english'))
        old_word_filter = word_filter
        word_filter = lambda w: old_word_filter(w) and (w not in stops)

    if e_vocabf:
        log.writeln('Getting concept filtering vocabulary from %s...'
            % e_vocabf)
        entity_vocab = set(
            [c.lower() for c in util.readList(e_vocabf, encoding='utf-8')])
        entity_filter = lambda c: c.lower() in entity_vocab
    else:
        entity_filter = lambda c: True

    if dataset == datasets.NLM_WSD:
        t_sub = log.startTimer('Generating NLM WSD features.')
        dataset = nlm_wsd.NLM_WSD()
        mentions = nlm_wsd.getAllMentions(dataset, window_size,
            word_filter, entity_filter)
        log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions))
    elif dataset == datasets.AIDA:
        t_sub = log.startTimer('Generating AIDA features.')
        dataset = aida.AIDA()
        mentions = aida.getAllMentions(dataset, window_size,
            word_filter, entity_filter)
        log.stopTimer(t_sub, 'Extracted %d samples.' % len(mentions))

    t_sub = log.startTimer('Writing samples to %s...' % outf, newline=False)
    mention_file.write(mentions, outf)
    log.stopTimer(t_sub, message='Done ({0:.2f}s).')
        help='number of threads to use for parallel calculation (default: %default)',
        type='int', default=1)
    parser.add_option('--batch-size', dest='batch_size',
        help='number of samples to process in each batch (default: %default)',
        type='int', default=25)
    parser.add_option('--keys', dest='keysf',
        help='file listing keys to restrict NN analysis to')
    parser.add_option('-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        exit()
    embf, outf = args
    return (embf, options.mode, options.keysf, outf, options.top_k,
        options.batch_size, options.threads, options.logfile)

embf, embf_mode, keysf, outf, top_k, batch_size, threads, logfile = _cli()

if keysf:
    keys = readKeys(keysf)
    print("Read %d keys to restrict to" % len(keys))
else:
    keys = None

t = log.startTimer('Reading embeddings...', newline=False)
embeds = pyemblib.read(embf, mode=embf_mode, filter_to=keys, lower_keys=True)
log.stopTimer(t, message='Done! Read %d embeddings ({0:.2f}s)' % len(embeds))

nearest_neighbors = calculateNearestNeighbors(embeds, outf, top_k=top_k,
    batch_size=batch_size, threads=threads)
log.writeln('Wrote nearest neighbors to %s.' % outf)
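# Illustrative numpy sketch (not part of the original code) of the batched
# cosine-similarity nearest-neighbor search that calculateNearestNeighbors()
# presumably performs. It is single-threaded and keeps results in memory rather
# than writing to a file; all names here are assumptions, not the repo's API.
import numpy as np

def nearest_neighbors(embeds, top_k=10, batch_size=25):
    # embeds: dict mapping keys to vectors
    keys = list(embeds.keys())
    mat = np.array([embeds[k] for k in keys], dtype=np.float32)
    # L2-normalize rows so that a dot product equals cosine similarity
    mat /= np.maximum(np.linalg.norm(mat, axis=1, keepdims=True), 1e-8)
    neighbors = {}
    for start in range(0, len(keys), batch_size):
        batch = mat[start:start + batch_size]
        sims = batch.dot(mat.T)              # (batch, |V|) cosine similarities
        for i, row in enumerate(sims):
            row[start + i] = -np.inf         # exclude the self-match
            top_ixs = np.argsort(-row)[:top_k]
            neighbors[keys[start + i]] = [keys[j] for j in top_ixs]
    return neighbors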
embf, options = _cli()

log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Input embedding file', embf),
    ('Input embedding file mode', options.embedding_mode),
    ('Output neighbor file', options.outputf),
    ('Ordered vocabulary file', options.vocabf),
    ('Number of nearest neighbors', options.k),
    ('Batch size', options.batch_size),
    ('Number of threads', options.threads),
    ('Partial nearest neighbors file for resuming',
        options.partial_neighbors_file),
], 'k Nearest Neighbor calculation with cosine similarity')

t_sub = log.startTimer('Reading embeddings from %s...' % embf)
emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(emb), '{0:.2f}'))

# write the node ID <-> vocab map if it doesn't exist yet, then read it back
if not os.path.isfile(options.vocabf):
    log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
    writeNodeMap(emb, options.vocabf)
else:
    log.writeln('Reading node ID <-> vocab map from %s...\n' % options.vocabf)
node_map = readNodeMap(options.vocabf)

# get the vocabulary in node ID order, and map index in emb_arr
            and not options.word_embf):
        parser.print_help()
        exit()
    (mentionf, ) = args
    return mentionf, options

mentionf, options = _cli()
log.start(logfile=options.logfile, stdout_also=True)

if options.tab_separated:
    sep = '\t'
else:
    sep = ' '

t_sub = log.startTimer('Reading entity embeddings from %s...'
    % options.entity_embf, newline=False)
entity_embeds = pyemblib.read(options.entity_embf, separator=sep)
log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
    % len(entity_embeds))

t_sub = log.startTimer('Reading context embeddings from %s...'
    % options.ctx_embf, newline=False)
ctx_embeds = pyemblib.read(options.ctx_embf, separator=sep)
log.stopTimer(t_sub, message='Read %d embeddings ({0:.2f}s)'
    % len(ctx_embeds))

if options.entity_defnf:
    t_sub = log.startTimer('Reading word embeddings from %s...'
        % options.word_embf,
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()

    if (not options.inputf) or (not options.outputf) or (not options.datasetf):
        parser.print_help()
        exit()
    return options

options = _cli()
log.start(logfile=options.logfile)
configlogger.writeConfig(log, [
    ('Input embeddings file', options.inputf),
    ('Output embeddings file', options.outputf),
    ('Dataset file', options.datasetf),
], 'Embedding filtering for WordNet classification experiments')

t_sub = log.startTimer('Reading input embeddings from %s...' % options.inputf)
embeddings = pyemblib.read(options.inputf)
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(embeddings), '{0:.2f}'))

log.writeln('Reading vocabulary from dataset in %s...' % options.datasetf)
ds = dataset.load(options.datasetf)
vocab = set()
for (_, src, snk, _) in ds:
    vocab.add(src)
    vocab.add(snk)
log.writeln('Found {0:,} unique words in {1:,} samples.\n'.format(
    len(vocab), len(ds)))