def _mentionsForIds(ids, mentions_by_id):
    """Return the mentions stored under `ids`, in order, ignoring unknown IDs."""
    return [mentions_by_id[_id] for _id in ids if _id in mentions_by_id]


def _prepSamples(mentions, preprocessed, fold_ix, options):
    """Featurize `mentions` for fold `fold_ix` via prepSample.

    Mentions for which prepSample yields no feature vector or no label are
    dropped.  Returns three parallel lists -- (kept mentions, feature vectors,
    labels) -- so that position j in each list refers to the same mention.
    """
    kept, features, labels = [], [], []
    if not mentions:
        return kept, features, labels

    fold_unigram_features = preprocessed.per_fold_unigram_features[fold_ix]
    for m in mentions:
        (feature_vector, label) = prepSample(m, preprocessed,
                                             fold_unigram_features, options)
        if feature_vector is None or label is None:
            continue
        kept.append(m)
        features.append(feature_vector)
        labels.append(label)
    return kept, features, labels


def _buildClassifier(options, fold_ix):
    """Instantiate the scikit-learn model selected by options.classifier.

    Returns a (display name, unfitted estimator) pair.  Raises ValueError for
    an unsupported classifier instead of letting the caller fall through
    without predictions.
    """
    if options.classifier == Classifier.SVM:
        return 'SVM', sklearn.svm.SVC(
            kernel='linear', random_state=options.random_seed + fold_ix)
    if options.classifier == Classifier.KNN:
        # KNeighborsClassifier takes no random_state, so no per-fold seed here.
        return 'k-NN', sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
    if options.classifier == Classifier.MLP:
        # Use the public import path; the private
        # sklearn.neural_network.multilayer_perceptron module was deprecated
        # in scikit-learn 0.22 and later removed.
        return 'MLP', sklearn.neural_network.MLPClassifier(
            max_iter=1000, random_state=options.random_seed + fold_ix)
    raise ValueError('Unsupported classifier: %r' % (options.classifier,))


def runCrossfoldExperiment(preprocessed, preds_stream, options):
    """Train and evaluate the configured classifier on every fold.

    For each (train_ids, dev_ids, test_ids) split in preprocessed.splits the
    dev set is either used as the evaluation set (options.eval_on_dev) or
    folded into the training data, in which case the test set is evaluated.

    Parameters:
        preprocessed -- object providing .splits, .mentions_by_id and
                        .per_fold_unigram_features
        preds_stream -- writable stream for per-mention predictions, or a
                        falsy value to disable prediction output
        options      -- run configuration (eval_on_dev, unigram_features,
                        normalize_features, classifier, random_seed)

    Returns a SimpleNamespace with .correct and .total summed over the
    evaluated folds and .accuracy set to the mean of the per-fold accuracies
    (NaN if no fold could be evaluated).  Returns None as soon as a fold's
    training data contains a single label class.

    Raises ValueError if options.classifier is not a supported classifier.
    """
    cross_fold_metrics = []
    num_folds = len(preprocessed.splits)

    for i, (train_ids, dev_ids, test_ids) in enumerate(preprocessed.splits):
        log.writeln(
            ('\n\n{0}\n Starting fold %d/%d\n{0}\n'.format('#' * 80)) %
            (i + 1, num_folds))

        mentions_by_id = preprocessed.mentions_by_id
        train = _mentionsForIds(train_ids, mentions_by_id)
        dev = _mentionsForIds(dev_ids, mentions_by_id)
        if options.eval_on_dev:
            test = dev
        else:
            train.extend(dev)
            test = _mentionsForIds(test_ids, mentions_by_id)

        # NOTE(review): this vectorizer is not consumed anywhere in this
        # function (samples are built from per_fold_unigram_features); the
        # computation is kept only in case getTextVocabulary has side effects
        # -- confirm and drop if it does not.
        if options.unigram_features:
            unigram_vocab = getTextVocabulary(train, preprocessed, options)
            unigram_vectorizer = CountVectorizer(vocabulary=unigram_vocab,
                                                 binary=True)
        else:
            unigram_vectorizer = None

        _, training_features, training_labels = _prepSamples(
            train, preprocessed, i, options)
        # Keep the surviving test mentions so that predictions can be paired
        # with the right mention even when some samples were dropped.
        test_mentions, test_features, test_labels = _prepSamples(
            test, preprocessed, i, options)

        log.writeln('Number of training samples: {0:,}'.format(
            len(training_labels)))
        log.writeln('Number of test samples: {0:,}\n'.format(len(test_labels)))

        if len(test_labels) == 0:
            log.writeln(
                '[WARNING] Test ids list is empty due to rounding in cross-validation splits, skipping...'
            )
            continue
        if len(set(training_labels)) == 1:
            # NOTE(review): this aborts the whole experiment (discarding any
            # folds already evaluated) rather than skipping just this fold;
            # preserved because callers may rely on the None result.
            log.writeln(
                '[WARNING] Training samples for this subset have only one label class. Skipping...'
            )
            return None

        if options.unigram_features:
            training_features = scipy.sparse.vstack(training_features)
            test_features = scipy.sparse.vstack(test_features)

        if options.normalize_features:
            # with_mean=False keeps the scaler usable on sparse matrices.
            scaler = StandardScaler(with_mean=False)
            training_features = scaler.fit_transform(training_features)
            test_features = scaler.transform(test_features)

        classifier_name, classifier = _buildClassifier(options, i)

        t = log.startTimer('Training %s classifier...' % classifier_name)
        classifier.fit(training_features, training_labels)
        log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

        t = log.startTimer('Running trained %s on test set...' %
                           classifier_name)
        predictions = classifier.predict(test_features)
        log.stopTimer(t, message='Complete in {0:.2f}s.\n')

        metrics = SimpleNamespace()
        metrics.correct = 0
        metrics.total = 0
        for mention, prediction, gold in zip(test_mentions, predictions,
                                             test_labels):
            if prediction == gold:
                metrics.correct += 1
            metrics.total += 1

            if preds_stream:
                preds_stream.write(
                    'Mention %d -- Pred: %d -> %s Gold: %d -> %s\n' %
                    (mention.ID, prediction, mention.candidates[prediction],
                     gold, mention.candidates[gold]))

        metrics.accuracy = float(metrics.correct) / metrics.total
        log.writeln('Fold accuracy: {0:.2f} ({1:,}/{2:,})'.format(
            metrics.accuracy, metrics.correct, metrics.total))

        cross_fold_metrics.append(metrics)

    overall_metrics = SimpleNamespace()
    overall_metrics.correct = 0
    overall_metrics.total = 0

    log.writeln('\n\n-- Cross-validation report --\n')
    for fold_ix, m in enumerate(cross_fold_metrics):
        overall_metrics.correct += m.correct
        overall_metrics.total += m.total
        log.writeln(' Fold %d -- Accuracy: %f (%d/%d)' %
                    (fold_ix + 1, m.accuracy, m.correct, m.total))

    if cross_fold_metrics:
        overall_metrics.accuracy = np.mean(
            [m.accuracy for m in cross_fold_metrics])
    else:
        # Every fold was skipped: report NaN explicitly instead of relying on
        # np.mean([]) and its RuntimeWarning.
        log.writeln('[WARNING] No folds were evaluated.')
        overall_metrics.accuracy = float('nan')
    log.writeln('\nOverall cross-validation accuracy: %f' %
                overall_metrics.accuracy)

    return overall_metrics
return options options = _cli() log.start(options.logfile) log.writeConfig([ ('Input embeddings', options.inputf), ('Vocabulary file', options.vocabf), ('Output embeddings', options.outputf), ('Output embeddings format', options.output_format), ]) log.startTimer('Reading node2vec embeddings from %s...' % options.inputf) e = pyemblib.read(options.inputf, format=pyemblib.Format.Word2Vec, mode=pyemblib.Mode.Text) log.stopTimer( message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}')) log.writeln('Reading vocabulary mapping from %s...' % options.vocabf) vocab = readVocab(options.vocabf) log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab))) e = {vocab[int(k)]: v for (k, v) in e.items()} log.writeln('Writing remapped embeddings to %s...' % options.outputf) (fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format) pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True) log.writeln('Done!') log.stop()
('Pre-embedded mentions', options.pre_embedded), ]), ], title= "Entity linking (disambiguation) experiment using scikit-learn baseline algorithms" ) ## Data loading/setup entity_embeds = [] for i in range(len(options.entity_embfs)): f = options.entity_embfs[i] t_sub = log.startTimer( 'Reading set %d of entity embeddings from %s...' % (i + 1, f)) entity_embeds.append(pyemblib.read(f, lower_keys=True)) log.stopTimer(t_sub, message='Read %s embeddings ({0:.2f}s)\n' % ('{0:,}'.format(len(entity_embeds[-1])))) if options.word_vocabf: t_sub = log.startTimer('Reading word/context vocabulary from %s...' % options.word_vocabf) word_vocab = readVocab(options.word_vocabf) log.stopTimer(t_sub, message='Read %s words ({0:.2f}s)\n' % ('{0:,}'.format(len(word_vocab)))) else: word_vocab = None if options.use_ctx_embeddings: t_sub = log.startTimer('Reading context embeddings from %s...' % options.ctx_embf)
] if config['ExtractionMode'] == 'csv': settings.extend([ ('Plaintext directory', config['PlaintextDirectory']), ('CSV file ID pattern', config['CSVIdentifierPattern']), ('Plaintext file render pattern', config['PlaintextIdentifierPattern']) ]) settings.extend([ ('Output mentions file', options.outputf), ('Mention map file (automatic)', options.mention_map_file), ]) log.writeConfig(settings, title='Mention extraction for action classification') t_sub = log.startTimer('Generating %s features.' % options.dataset) mentions, mention_map = getAllMentions(config, options, tokenizer=options.tokenizer, bert_vocab_file=options.bert_vocab_file, log=log) log.stopTimer(t_sub, 'Extracted {0:,} samples.'.format(len(mentions))) log.writeln('Writing mention map information to %s...' % options.mention_map_file) with open(options.mention_map_file, 'w') as stream: for (mention_ID, mention_info) in mention_map.items(): stream.write('%d\t%s\n' % (mention_ID, mention_info)) log.writeln('Wrote info for {0:,} mentions.\n'.format(len(mention_map))) t_sub = log.startTimer('Writing samples to %s...' % options.outputf, newline=False) mention_file.write(mentions, options.outputf) log.stopTimer(t_sub, message='Done ({0:.2f}s).') log.stop()
return mentionf, options ## Getting configuration settings mentionf, options = _cli() log.start(logfile=options.logfile) log.writeConfig([ ('Mention file', mentionf), ('Entity definitions file', options.definitions_file), ('Restricting to main definitions only', options.main_only), ], title="Adapted Lesk similarity baseline") t_sub = log.startTimer('Reading mentions from %s...' % mentionf) mentions = mention_file.read(mentionf) log.stopTimer(t_sub, message='Read %s mentions ({0:.2f}s)\n' % ('{0:,}'.format(len(mentions)))) log.writeln('Reading definitions from %s...' % options.definitions_file) definitions = readCodeDefinitions(options.definitions_file, options.main_only) log.writeln('Read definitions for {0:,} codes.\n'.format(len(definitions))) if options.preds_file: preds_stream = open(options.preds_file, 'w') else: preds_stream = None results = experimentWrapper(mentions, definitions, options, preds_stream) if options.preds_file:
sys.setrecursionlimit(1800) options = args = _cli() log.start(options.logfile) log.writeConfig([ ('Terminology file', options.input_f), ('Storing pickled maps to', options.output_dir), ('Map concepts separated by', options.sep), ('Removing stopword terms', options.remove_stopwords), ('Tokenization settings', tokenization.CLI.logOptions(options)), ], 'JET -- STR -> CUI file preprocessing') t_sub = log.startTimer('Initializing tokenizer...') tokenizer = tokenization.CLI.initializeTokenizer(options) log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n') t_sub = log.startTimer('Reading terminology file...') ngrams, entities_by_term = readTerminology( options.input_f, tokenizer, remove_stopwords=options.remove_stopwords, use_collapsed_string=options.use_collapsed_string) log.stopTimer(t_sub, message='Completed in {0:.2f}s.\n') if options.verbose: log.writeln('\nRead map:') NGramMapPrinter.prn(ngrams) log.writeln('\nTerm ID-Entity mapping:') for term_ID in entities_by_term.keys():
# Open the log at the configured path and record the settings for this
# tagging run.
log.start(options.logfile)
log.writeConfig([
    ('Plaintext corpus file', options.input_f),
    ('Pickled ngram->term map', options.terminology_pkl_f),
    ('Output annotations file', options.output_f),
    ('Tagging settings', [
        ('Number of tagging threads', options.threads),
        # A non-positive maxlines is reported as an uncapped queue.
        ('Line queue size cap', 'unlimited' if options.maxlines <= 0 else options.maxlines),
    ]),
    ('Tokenization settings', tokenization.CLI.logOptions(options)),
], 'JET -- Automated corpus tagging')

# Load the terminology from the pickle file named on the command line
# (presumably the ngram->term map logged above -- confirm against the
# script that writes it).
t_sub = log.startTimer('Loading pickled strings map...')
compiled_terminology = pickleio.read(options.terminology_pkl_f)
log.stopTimer(t_sub, message='Done in {0:.2f}s.\n')

# Build the tokenizer from the same options object that was logged above.
t_sub = log.startTimer('Initializing tokenizer...')
tokenizer = tokenization.CLI.initializeTokenizer(options)
log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n')

# Tag the input corpus with the loaded terminology; thread count and queue
# cap are passed straight through from the command-line options.
t_sub = log.startTimer('Tagging corpus...')
tagCorpus(
    options.input_f,
    compiled_terminology,
    options.output_f,
    tokenizer,
    options.threads,
    max_lines_in_queue=options.maxlines,
)
log.stopTimer(t_sub, message='Done in {0:.2f}s')