def crosslinkSubentityType(mobilities, subentities, child_op, _type, log=log):
    # sort by starting, then ending positions
    # (tuple keys replace the old weighted-sum hack, which could mis-order
    #  annotations with large character offsets)
    sorted_mobilities = sorted(
        mobilities,
        key=lambda mob: (mob.start, mob.end)
    )
    sorted_subentities = sorted(
        subentities,
        key=lambda act: (act.start, act.end)
    )

    # for each subentity, find its containing Mobility annotation
    for subent in sorted_subentities:
        i = 0
        while i < len(sorted_mobilities) and sorted_mobilities[i].start < subent.start:
            i += 1
        if i >= len(sorted_mobilities) or sorted_mobilities[i].start > subent.start:
            i -= 1
        mob = sorted_mobilities[i]

        if (mob.start > subent.start) or (mob.end < subent.end):
            log.writeln('[WARNING] Failed to map {0} to Mobility, skipping'.format(_type))
        elif mob.text[subent.start-mob.start:subent.end-mob.start] != subent.text:
            log.writeln(
                '[WARNING] Text mismatch in entity crosslinking: Mobility has text "{0}",'
                ' {1} has text "{2}"; skipping'.format(
                    mob.text[subent.start-mob.start:subent.end-mob.start],
                    _type,
                    subent.text
                )
            )
        else:
            subent.mobility = mob
            child_op(mob, subent)
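# Illustrative only: a minimal sketch of calling crosslinkSubentityType with
# stand-in annotation objects (the real ones come from the XML extractor).
# The objects just need start/end/text attributes; child_op receives each
# successfully linked (mobility, subentity) pair. The 'action' attribute set
# here is hypothetical.
def _demo_crosslink():
    from types import SimpleNamespace
    mob = SimpleNamespace(start=10, end=33, text='walks with a cane daily')
    act = SimpleNamespace(start=10, end=15, text='walks')
    crosslinkSubentityType(
        [mob], [act],
        child_op=lambda m, a: setattr(m, 'action', a),
        _type='Action'
    )
    assert act.mobility is mob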
def runLeskExperiment(preprocessed, definitions, preds_stream, options):
    log.writeln('\n\n{0}\n  Starting experiment\n{0}\n'.format('#' * 80))

    test_labels, predictions = [], []
    for m in preprocessed.mentions:
        test_labels.append(m.CUI.lower())
        predictions.append(getMostSimilar(m, definitions, default='d450'))

    metrics = SimpleNamespace()
    metrics.correct = 0
    metrics.total = 0

    for j in range(len(predictions)):
        m = preprocessed.mentions[j]
        if m.candidates[predictions[j]] == test_labels[j]:
            metrics.correct += 1
        metrics.total += 1
        if preds_stream:
            preds_stream.write(
                'Mention %d -- Pred: %d -> %s  Gold: %d -> %s\n' % (
                    m.ID,
                    predictions[j],
                    m.candidates[predictions[j]],
                    m.candidates.index(test_labels[j]),
                    test_labels[j]
                )
            )

    metrics.accuracy = float(metrics.correct) / metrics.total
    log.writeln('Accuracy: {0:.2f} ({1:,}/{2:,})'.format(
        metrics.accuracy, metrics.correct, metrics.total))
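# getMostSimilar is defined elsewhere in this codebase; as a reference point
# only, a minimal token-overlap (Lesk-style) scorer might look like the
# sketch below. This is an assumption about its behavior, not the actual
# implementation: it returns the index of the candidate whose definition
# shares the most tokens with the mention context, falling back to `default`.
def _leskOverlapSketch(mention_tokens, candidates, definitions, default):
    best_ix, best_overlap = None, 0
    for ix, code in enumerate(candidates):
        # assumes definitions maps code -> definition string
        overlap = len(set(mention_tokens) & set(definitions.get(code, '').split()))
        if overlap > best_overlap:
            best_ix, best_overlap = ix, overlap
    if best_ix is None:
        best_ix = candidates.index(default) if default in candidates else 0
    return best_ix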
def matchAnnotationAndTextFiles(data_directories, text_directory, csv_id_pattern,
        txt_sub_pattern, log=log):
    csv_files = {}
    csv_id_getter = re.compile(csv_id_pattern)
    for csvdir in data_directories:
        for f in os.listdir(csvdir):
            match = csv_id_getter.match(f)
            if match:
                _id = match.group(1)
                fpath = os.path.join(csvdir, f)
                csv_files[_id] = fpath

    paired_files = {}
    for (_id, csv_path) in csv_files.items():
        txt_path = os.path.join(
            text_directory,
            txt_sub_pattern.format(_id)
        )
        if os.path.isfile(txt_path):
            paired_files[_id] = (csv_path, txt_path)
        else:
            log.writeln('[WARNING] Could not find plaintext file for ID {0}'.format(_id))

    return paired_files
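# A hedged usage sketch with hypothetical patterns and directories: this
# would pair data/ann/1234.csv with text/1234.txt, using the regex's first
# capture group as the shared document ID.
def _demo_matchFiles():
    return matchAnnotationAndTextFiles(
        data_directories=['data/ann'],        # hypothetical directory
        text_directory='text',                # hypothetical directory
        csv_id_pattern=r'([0-9]+)\.csv$',     # group 1 captures the doc ID
        txt_sub_pattern='{0}.txt',
    )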
def calculateMetricsPerCode(predictions, mentions_by_ID, eval_set):
    preds_keys = set(predictions.keys())
    if len(preds_keys - eval_set) > 0:
        log.writeln(
            '[WARNING] Predictions file includes outputs for {0} samples'
            ' not included in reference evaluation set\n'.format(
                len(preds_keys - eval_set)))
        input('[Enter] to continue')

    def initMetric():
        obj = SimpleNamespace()
        obj.tp = 0
        obj.fp = 0
        obj.fn = 0
        return obj

    metrics = {}
    for mention_ID in eval_set:
        results = predictions.get(mention_ID, None)
        # unpredicted mentions count as false negatives for their gold code
        if results is None:
            mention = mentions_by_ID[mention_ID]
            gold_ix = mention.candidates.index(mention.CUI)
            if gold_ix not in metrics:
                metrics[gold_ix] = initMetric()
            metrics[gold_ix].fn += 1
        else:
            (scores, pred_ix, gold_ix, correct) = results
            if pred_ix not in metrics:
                metrics[pred_ix] = initMetric()
            if gold_ix not in metrics:
                metrics[gold_ix] = initMetric()
            if correct:
                metrics[gold_ix].tp += 1
            else:
                metrics[pred_ix].fp += 1
                metrics[gold_ix].fn += 1

    for (ix, code_metrics) in metrics.items():
        if code_metrics.tp + code_metrics.fp > 0:
            code_metrics.precision = float(code_metrics.tp) / (code_metrics.tp + code_metrics.fp)
        else:
            code_metrics.precision = 0
        if code_metrics.tp + code_metrics.fn > 0:
            code_metrics.recall = float(code_metrics.tp) / (code_metrics.tp + code_metrics.fn)
        else:
            code_metrics.recall = 0
        if code_metrics.precision + code_metrics.recall > 0:
            code_metrics.f1 = (
                (2 * code_metrics.precision * code_metrics.recall)
                / (code_metrics.precision + code_metrics.recall)
            )
        else:
            code_metrics.f1 = 0

    return metrics
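# Per-code metrics like these are often summarized by macro-averaging. A
# small helper sketch (not part of the original module) over the dict
# returned by calculateMetricsPerCode:
def macroAverageF1(per_code_metrics):
    '''Mean of per-code F1 scores, weighting every code equally.'''
    if not per_code_metrics:
        return 0.0
    return sum(m.f1 for m in per_code_metrics.values()) / len(per_code_metrics)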
def KNearestNeighbors(emb_arr, node_IDs, top_k, neighbor_file, threads=2,
        batch_size=5, completed_neighbors=None):
    '''Compute the top_k nearest neighbors (by cosine similarity) for each
    row of emb_arr in parallel worker processes, writing results to
    neighbor_file.

    If completed_neighbors is given, indices already present in it are
    skipped (supports resuming a partial run).
    '''
    # set up threads
    log.writeln('1 | Thread initialization')
    all_indices = list(range(len(emb_arr)))
    if completed_neighbors:
        filtered_indices = []
        for ix in all_indices:
            if ix not in completed_neighbors:
                filtered_indices.append(ix)
        all_indices = filtered_indices
        log.writeln('  >> Filtered out {0:,} completed indices'.format(
            len(emb_arr) - len(filtered_indices)))
        log.writeln('  >> Filtered set size: {0:,}'.format(len(all_indices)))

    index_subsets = _prepareForParallel(all_indices, threads-1, data_only=True)
    nn_q = mp.Queue()
    nn_writer = mp.Process(target=_nn_writer, args=(neighbor_file, node_IDs, nn_q))
    computers = [
        mp.Process(target=_threadedNeighbors,
                   args=(index_subsets[i], emb_arr, batch_size, top_k, nn_q))
        for i in range(threads - 1)
    ]

    nn_writer.start()
    log.writeln('2 | Neighbor computation')
    util.parallelExecute(computers)

    nn_q.put(_SIGNALS.HALT)
    nn_writer.join()
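# Hedged usage sketch: computing 10 nearest neighbors for a random embedding
# matrix with 4 worker processes. Assumes emb_arr rows align positionally
# with node_IDs, as in the function above; the output path is hypothetical.
def _demo_knn():
    import numpy as np
    emb_arr = np.random.rand(1000, 50)
    node_IDs = list(range(1000))
    KNearestNeighbors(emb_arr, node_IDs, top_k=10,
                      neighbor_file='demo.neighbors',  # hypothetical path
                      threads=4, batch_size=25)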
def baseParse(record, txtf):
    start_pos = int(record[1])
    end_pos = int(record[2])
    expected_text = record[3]

    with open(txtf, 'r') as stream:
        doc_text = stream.read()
    actual_text = doc_text[start_pos:end_pos]

    if expected_text != actual_text:
        log.writeln('[WARNING] Mis-alignment on {0} mention -- Expected "{1}"  Found "{2}"'.format(
            record[0], expected_text, actual_text
        ))

    return (start_pos, end_pos, expected_text)
def buildGraph(neighbor_files, k):
    log.writeln('Building neighborhood graph...')
    graph = {}

    # construct frequency-weighted edges
    log.track(message='  >> Loaded {0}/%d neighborhood files' % len(neighbor_files),
              writeInterval=1)
    for neighbor_file in neighbor_files:
        neighborhoods = readNeighbors(neighbor_file, k)
        for (source, neighbors) in neighborhoods.items():
            if graph.get(source, None) is None:
                graph[source] = {}
            for nbr in neighbors:
                graph[source][nbr] = graph[source].get(nbr, 0) + 1
        log.tick()
    log.flushTracker()

    # normalize counts by the number of input files
    log.writeln('  >> Normalizing edge weights...')
    max_count = float(len(neighbor_files))
    for (source, neighborhood) in graph.items():
        for (nbr, freq) in neighborhood.items():
            graph[source][nbr] = freq / max_count

    log.writeln('Graph complete!')
    return graph
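# Hedged sketch of the edge-weight semantics above: if a pair (source, nbr)
# appears in 3 of 4 neighborhood files, its normalized weight is 3/4 = 0.75.
# The file format consumed by readNeighbors is defined elsewhere; this just
# demonstrates inspecting the normalized graph.
def _demo_inspectGraph(neighbor_files, k=10):
    graph = buildGraph(neighbor_files, k)
    for source, neighborhood in list(graph.items())[:5]:
        top = sorted(neighborhood.items(), key=lambda kv: -kv[1])[:3]
        print(source, '->', top)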
def experimentWrapper(mentions, entity_embeds, ctx_embeds, options, preds_stream):
    preprocessed = preprocessData(mentions, entity_embeds, ctx_embeds, options)

    log.writeln('Filtering mentions for these embeddings...')
    preprocessed.mentions, skipped = filterMentions(preprocessed, options)
    # re-calculate mentions_by_id to drop filtered samples
    preprocessed.mentions_by_id = {m.ID: m for m in preprocessed.mentions}
    log.writeln('  Removed {0:,} mentions with no valid features'.format(skipped))
    log.writeln('Filtered dataset size: {0:,} mentions\n'.format(
        len(preprocessed.mentions)))

    results = runCrossfoldExperiment(preprocessed, preds_stream, options)
    return results
        exit()

    return args, options

(mentionf, predsf), options = _cli()
log.start(logfile=options.logfile)
log.writeConfig([
    ('Mention file', mentionf),
    ('Key remapping file', options.keymapf),
    ('Predictions file', predsf),
    ('No scores in predictions', options.no_scores),
    ('Cross-validation splits file', options.splitsf),
    ('Evaluating on development data', options.dev),
], 'BTRIS Mobility code-level predictions analysis')

log.writeln('Reading mentions from %s...' % mentionf)
mentions = mention_file.read(mentionf)
log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

log.writeln('Reading splits from %s...' % options.splitsf)
splits = cross_validation.readSplits(options.splitsf)
log.writeln('Read {0:,} splits.\n'.format(len(splits)))

log.writeln('Compiling evaluation set...')
eval_set = compileEvaluationSet(splits, options.dev)
log.writeln('Evaluating on {0:,} samples.\n'.format(len(eval_set)))

log.writeln('Parsing predictions from %s...' % predsf)
predictions = predictions_parser.parsePredictions(
    predsf, no_scores=options.no_scores)
log.writeln('Read {0:,} predictions.\n'.format(len(predictions)))
def _cli():
    import optparse
    parser = optparse.OptionParser(
        usage='Usage: %prog MENTIONS [options] --entities=ENTITY_FILE --ctxs=CTX_FILE',
        description='Runs the LogLinearLinker model using the embeddings in ENTITY_FILE'
                    ' and CTX_FILE on the mentions in MENTIONS.')
    parser.add_option('--entities', dest='entity_embfs',
        help='comma-separated list of entity embedding files (required)')
    parser.add_option('--word-vocab', dest='word_vocabf',
        help='file listing words to load embeddings for (one per line);'
             ' if unused, loads all embeddings')
    parser.add_option('--ctxs', dest='ctx_embf',
        help='context embedding file (required)')
    parser.add_option('--ctxs-format', dest='ctx_emb_fmt',
        type='choice', choices=[pyemblib.Mode.Binary, pyemblib.Mode.Text],
        default=pyemblib.Mode.Text,
        help='file format of embedding file (word2vec format)')
    parser.add_option('--input-predictions', dest='input_predsf',
        help='file with previously generated scores to include as features')
    parser.add_option('--predictions', dest='preds_file',
        help='file to write prediction details to')
    parser.add_option('--n-fold', dest='n_folds',
        type='int', default=10,
        help='number of folds for cross validation (default: %default)')
    parser.add_option('--dev-size', dest='dev_size',
        type='float', default=0.1,
        help='portion of cross-validation training data to hold back for'
             ' development (default %default; must be >0 and <1)')
    parser.add_option('--cross-validation-splits', dest='cross_validation_file',
        help='path to save cross-validation splits to (generates multiple files; optional)')
    parser.add_option('--normalize-features', dest='normalize_features',
        action='store_true', default=False,
        help='use sklearn feature normalization (default off)')
    parser.add_option('--classifier', dest='classifier',
        type='choice', choices=Classifier.tolist(), default=Classifier.default(),
        help='classification algorithm to use')
    parser.add_option('--random-seed', dest='random_seed',
        type='int', default=-1,
        help='random seed for reproducibility (defaults to epoch time)')
    parser.add_option('-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)

    hyperparameters = optparse.OptionGroup(parser, 'Hyperparameter options')
    hyperparameters.add_option('--eval-on-dev', dest='eval_on_dev',
        action='store_true', default=False,
        help='evaluate on development data (for hyperparam tuning)')
    hyperparameters.add_option('--no-ctx-embeddings', dest='use_ctx_embeddings',
        action='store_false', default=True,
        help='don\'t use context embeddings in features')
    hyperparameters.add_option('--no-entities', dest='use_entity_embeddings',
        action='store_false', default=True,
        help='don\'t use entity embeddings at all in features')
    hyperparameters.add_option('--full-entity-embeddings', dest='full_entity_embeddings',
        action='store_true', default=False,
        help='use full entity embeddings instead of cosine similarity to context')
    hyperparameters.add_option('--unigram-features', dest='unigram_features',
        action='store_true', default=False,
        help='use unigram features (indicators unless --tfidf is specified)')
    hyperparameters.add_option('--tfidf', dest='unigrams_as_tfidf',
        action='store_true', default=False,
        help='use TF-IDF values for unigram features (w/r/t input samples as'
             ' documents; ignored if not using --unigram-features)')
    hyperparameters.add_option('--action-oracle', dest='action_oracle',
        action='store_true', default=False,
        help='use Action oracle')
    hyperparameters.add_option('--pre-embedded', dest='pre_embedded',
        action='store_true', default=False,
        help='mention file is pre-embedded (overrides --unigram-features)')
    # register the option group so its flags are actually parsed
    parser.add_option_group(hyperparameters)

    (options, args) = parser.parse_args()

    if options.random_seed < 0:
        options.random_seed = int(time.time())

    if options.logfile and not options.preds_file:
        options.preds_file = '%s.predictions' % os.path.splitext(options.logfile)[0]
    now_stamp = datetime.strftime(datetime.now(), '%Y-%m-%d_%H-%M-%S')
    if options.logfile:
        options.logfile = '%s.%s' % (options.logfile, now_stamp)
    if options.preds_file:
        options.preds_file = '%s.%s' % (options.preds_file, now_stamp)

    if options.pre_embedded and options.unigram_features:
        log.writeln('[WARNING] Cannot use --unigram-features together with --pre-embedded')
        log.writeln('[WARNING] Disabling --unigram-features')
        options.unigram_features = False

    # guard against --entities being absent before splitting
    if options.use_entity_embeddings and options.entity_embfs:
        options.entity_embfs = options.entity_embfs.split(',')
    else:
        options.entity_embfs = []

    def _bail(msg):
        import sys
        print(sys.argv)
        parser.print_help()
        print('\n' + msg)
        exit()

    if len(args) != 1:
        _bail('Must supply only MENTIONS')
    elif options.use_entity_embeddings and len(options.entity_embfs) == 0:
        _bail('Must supply --entities')
    elif options.use_ctx_embeddings and not options.ctx_embf:
        _bail('Must supply --ctxs')
    elif options.dev_size <= 0 or options.dev_size >= 1:
        _bail('--dev-size must be in (0,1)')

    (mentionf,) = args
    return mentionf, options
def runCrossfoldExperiment(preprocessed, preds_stream, options):
    cross_fold_metrics = []

    for i in range(len(preprocessed.splits)):
        log.writeln(('\n\n{0}\n  Starting fold %d/%d\n{0}\n'.format('#' * 80))
                    % (i + 1, len(preprocessed.splits)))
        (train_ids, dev_ids, test_ids) = preprocessed.splits[i]

        train, test = [], []
        for _id in train_ids:
            if _id in preprocessed.mentions_by_id:
                train.append(preprocessed.mentions_by_id[_id])
        for _id in dev_ids:
            if _id in preprocessed.mentions_by_id:
                if options.eval_on_dev:
                    test.append(preprocessed.mentions_by_id[_id])
                else:
                    train.append(preprocessed.mentions_by_id[_id])
        if not options.eval_on_dev:
            for _id in test_ids:
                if _id in preprocessed.mentions_by_id:
                    test.append(preprocessed.mentions_by_id[_id])

        if options.unigram_features:
            unigram_vocab = getTextVocabulary(train, preprocessed, options)
            unigram_vectorizer = CountVectorizer(vocabulary=unigram_vocab, binary=True)
        else:
            unigram_vectorizer = None

        training_features, training_labels = [], []
        for m in train:
            (feature_vector, label) = prepSample(
                m, preprocessed, preprocessed.per_fold_unigram_features[i], options)
            if feature_vector is None or label is None:
                continue
            training_features.append(feature_vector)
            training_labels.append(label)

        # track kept test mentions so indices stay aligned with predictions
        # even when prepSample skips a sample
        test_features, test_labels, test_kept = [], [], []
        for m in test:
            (feature_vector, label) = prepSample(
                m, preprocessed, preprocessed.per_fold_unigram_features[i], options)
            if feature_vector is None or label is None:
                continue
            test_features.append(feature_vector)
            test_labels.append(label)
            test_kept.append(m)

        log.writeln('Number of training samples: {0:,}'.format(len(training_labels)))
        log.writeln('Number of test samples: {0:,}\n'.format(len(test_labels)))

        if len(test_labels) == 0:
            log.writeln('[WARNING] Test ids list is empty due to rounding in'
                        ' cross-validation splits, skipping...')
            continue
        if len(set(training_labels)) == 1:
            log.writeln('[WARNING] Training samples for this subset have only'
                        ' one label class. Skipping...')
            return None

        if options.unigram_features:
            training_features = scipy.sparse.vstack(training_features)
            test_features = scipy.sparse.vstack(test_features)

        scaler = StandardScaler(with_mean=False)
        if options.normalize_features:
            training_features = scaler.fit_transform(training_features)
            test_features = scaler.transform(test_features)

        if options.classifier == Classifier.SVM:
            t = log.startTimer('Training SVM classifier...')
            classifier = sklearn.svm.SVC(kernel='linear',
                                         random_state=options.random_seed + i)
            classifier.fit(training_features, training_labels)
            log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

            t = log.startTimer('Running trained SVM on test set...')
            predictions = classifier.predict(test_features)
            log.stopTimer(t, message='Complete in {0:.2f}s.\n')
        elif options.classifier == Classifier.KNN:
            t = log.startTimer('Training k-NN classifier...')
            classifier = sklearn.neighbors.KNeighborsClassifier(
                n_neighbors=5,
                #random_state=options.random_seed+i
            )
            classifier.fit(training_features, training_labels)
            log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

            t = log.startTimer('Running trained k-NN on test set...')
            predictions = classifier.predict(test_features)
            log.stopTimer(t, message='Complete in {0:.2f}s.\n')
        elif options.classifier == Classifier.MLP:
            t = log.startTimer('Training MLP classifier...')
            classifier = sklearn.neural_network.MLPClassifier(
                max_iter=1000, random_state=options.random_seed + i)
            classifier.fit(training_features, training_labels)
            log.stopTimer(t, message='Training complete in {0:.2f}s.\n')

            t = log.startTimer('Running trained MLP on test set...')
            predictions = classifier.predict(test_features)
            log.stopTimer(t, message='Complete in {0:.2f}s.\n')

        metrics = SimpleNamespace()
        metrics.correct = 0
        metrics.total = 0
        for j in range(len(predictions)):
            if predictions[j] == test_labels[j]:
                metrics.correct += 1
            metrics.total += 1
            if preds_stream:
                preds_stream.write(
                    'Mention %d -- Pred: %d -> %s  Gold: %d -> %s\n' % (
                        test_kept[j].ID,
                        predictions[j],
                        test_kept[j].candidates[predictions[j]],
                        test_labels[j],
                        test_kept[j].candidates[test_labels[j]]
                    )
                )
        metrics.accuracy = float(metrics.correct) / metrics.total
        log.writeln('Fold accuracy: {0:.2f} ({1:,}/{2:,})'.format(
            metrics.accuracy, metrics.correct, metrics.total))
        cross_fold_metrics.append(metrics)

    overall_metrics = SimpleNamespace()
    overall_metrics.correct = 0
    overall_metrics.total = 0

    log.writeln('\n\n-- Cross-validation report --\n')
    for i in range(len(cross_fold_metrics)):
        m = cross_fold_metrics[i]
        overall_metrics.correct += m.correct
        overall_metrics.total += m.total
        log.writeln('  Fold %d -- Accuracy: %f (%d/%d)' % (
            i + 1, m.accuracy, m.correct, m.total))

    # overall accuracy is the unweighted (macro) mean of per-fold accuracies
    overall_metrics.accuracy = np.mean([m.accuracy for m in cross_fold_metrics])
    log.writeln('\nOverall cross-validation accuracy: %f' % overall_metrics.accuracy)
    return overall_metrics
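# The report above macro-averages per-fold accuracies. A hedged alternative
# view, pooling all folds' predictions (micro accuracy), can be computed
# from the same overall_metrics fields if ever needed:
def _microAccuracy(overall_metrics):
    return float(overall_metrics.correct) / overall_metrics.total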
]
if config['ExtractionMode'] == 'csv':
    settings.extend([
        ('Plaintext directory', config['PlaintextDirectory']),
        ('CSV file ID pattern', config['CSVIdentifierPattern']),
        ('Plaintext file render pattern', config['PlaintextIdentifierPattern']),
    ])
settings.extend([
    ('Output mentions file', options.outputf),
    ('Mention map file (automatic)', options.mention_map_file),
])
log.writeConfig(settings, title='Mention extraction for action classification')

t_sub = log.startTimer('Generating %s features.' % options.dataset)
mentions, mention_map = getAllMentions(config, options,
    tokenizer=options.tokenizer,
    bert_vocab_file=options.bert_vocab_file,
    log=log)
log.stopTimer(t_sub, 'Extracted {0:,} samples.'.format(len(mentions)))

log.writeln('Writing mention map information to %s...' % options.mention_map_file)
with open(options.mention_map_file, 'w') as stream:
    for (mention_ID, mention_info) in mention_map.items():
        stream.write('%d\t%s\n' % (mention_ID, mention_info))
log.writeln('Wrote info for {0:,} mentions.\n'.format(len(mention_map)))

t_sub = log.startTimer('Writing samples to %s...' % options.outputf, newline=False)
mention_file.write(mentions, options.outputf)
log.stopTimer(t_sub, message='Done ({0:.2f}s).')

log.stop()
        parser.error('Must provide --keys')
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('HDF5 embeddings', options.input_f),
    ('HDF5 layer', ('Average' if options.layer == AVERAGE_LAYERS else options.layer)),
    ('Per-row keys', options.key_f),
    ('Mentions file', options.mentions_f),
    ('Using Action oracle', options.action_oracle),
    ('Output embedded mentions file', options.output_f),
], 'Embedded mentions file generation with pre-generated HDF5 features')

log.writeln('Reading keys from %s...' % options.key_f)
keys = readKeys(options.key_f)
log.writeln('Read {0:,} keys.\n'.format(len(keys)))

log.writeln('Reading textual mentions from %s...' % options.mentions_f)
mentions = mention_file.read(options.mentions_f)
mentions_by_id = {m.ID: m for m in mentions}
log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

log.writeln('Generating embedded mentions from HDF5 file %s...' % options.input_f)
new_mentions = collapseMentionEmbeddings(
    options.input_f, keys, options.layer, mentions_by_id, options.action_oracle)
log.writeln('Generated {0:,} embedded mentions.\n'.format(len(new_mentions)))
mentionf, options = _cli()
log.start(logfile=options.logfile)
log.writeConfig([
    ('Mention file', mentionf),
    ('Entity definitions file', options.definitions_file),
    ('Restricting to main definitions only', options.main_only),
], title='Adapted Lesk similarity baseline')

t_sub = log.startTimer('Reading mentions from %s...' % mentionf)
mentions = mention_file.read(mentionf)
log.stopTimer(t_sub, message='Read %s mentions ({0:.2f}s)\n' % (
    '{0:,}'.format(len(mentions))))

log.writeln('Reading definitions from %s...' % options.definitions_file)
definitions = readCodeDefinitions(options.definitions_file, options.main_only)
log.writeln('Read definitions for {0:,} codes.\n'.format(len(definitions)))

if options.preds_file:
    preds_stream = open(options.preds_file, 'w')
else:
    preds_stream = None

results = experimentWrapper(mentions, definitions, options, preds_stream)

if options.preds_file:
    preds_stream.close()

log.stop()
        options.bert_dir, '%s.compiled_output.predictions' % options.model)
    options.logfile = '%s.log' % options.output_f
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Mentions file', options.mentions_f),
    ('BERT baseline root directory', options.bert_dir),
    ('Model configuration', options.model),
    ('Output file', options.output_f),
], 'BERT baseline results compilation')

log.writeln('Reading mentions from %s...' % options.mentions_f)
mentions = mention_file.read(options.mentions_f)
mentions_by_ID = {m.ID: m for m in mentions}
log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

fold_dirs = glob.glob(os.path.join(options.bert_dir, 'fold-*'))
log.writeln('Found {0} folds in {1}.\n'.format(len(fold_dirs), options.bert_dir))

with open(options.output_f, 'w') as stream:
    fold_dirs = sorted(fold_dirs)
    for i in range(len(fold_dirs)):
        log.writeln('Checking fold {0}/{1}'.format(i + 1, len(fold_dirs)))
        log.indent()
        test_f = os.path.join(fold_dirs[i], 'test.tsv')
def crossValidationSplits(dataset, n_folds, dev_size, persistent_path=None,
        random_seed=1, log=log):
    if persistent_path and os.path.isfile('%s.fold-0.train' % persistent_path):
        log.writeln('Reading pre-existing cross validation splits from %s.'
                    % persistent_path)
        splits = readSplits(persistent_path, n_folds, id_cast=int)
    else:
        log.writeln('Generating cross-validation splits...')
        np.random.seed(random_seed)

        ids_by_class, classes = stratifyByClass(dataset)
        total_size = 0
        for (lbl, ids) in ids_by_class.items():
            total_size += len(ids)
        log.writeln('  Dataset size: {0:,}'.format(total_size))
        log.writeln('  Number of classes: {0:,}'.format(len(classes)))

        # shuffle within each class
        for _class in classes:
            np.random.shuffle(ids_by_class[_class])

        # figure out how many points of each class per fold
        fold_size_by_class, dev_size_by_class = getFoldAndDevSizeByClass(
            ids_by_class, n_folds, dev_size)

        labeled_splits, id_splits = [], []
        for i in range(n_folds):
            train_by_class = {}
            for _class in classes:
                train_by_class[_class] = []

            for j in range(n_folds):
                fold_by_class = {}
                for _class in classes:
                    fold_size = fold_size_by_class[_class]
                    if j < (n_folds - 1):
                        fold_by_class[_class] = \
                            ids_by_class[_class][j * fold_size:(j + 1) * fold_size]
                    else:
                        fold_by_class[_class] = ids_by_class[_class][j * fold_size:]

                # pull test
                if j == i:
                    test_by_class = fold_by_class.copy()
                # pull dev (portion); remainder of this fold goes to training
                elif j == ((i + 1) % n_folds):
                    dev_by_class = {}
                    for (_class, subset) in fold_by_class.items():
                        dev_by_class[_class] = subset[:dev_size_by_class[_class]]
                        train_by_class[_class].extend(subset[dev_size_by_class[_class]:])
                # everything else goes to training
                else:
                    for (_class, subset) in fold_by_class.items():
                        train_by_class[_class].extend(subset)

            # collapse train, dev, test to flat ID lists
            lbl_train, id_train = collapseFromByClass(train_by_class)
            lbl_dev, id_dev = collapseFromByClass(dev_by_class)
            lbl_test, id_test = collapseFromByClass(test_by_class)
            labeled_splits.append((lbl_train, lbl_dev, lbl_test))
            id_splits.append((id_train, id_dev, id_test))

            log.writeln('  Fold {0} -- Train: {1:,}  Dev: {2:,}  Test: {3:,}'.format(
                i + 1, len(id_train), len(id_dev), len(id_test)))

        if persistent_path:
            log.writeln('Writing cross validation splits to %s.' % persistent_path)
            writeSplits(labeled_splits, persistent_path)

        splits = id_splits

    log.writeln()
    return splits
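# The rotation logic above, illustrated on plain integers: for fold i, chunk
# i is the test set, the head of chunk (i+1) % n_folds is dev, and everything
# else trains. A self-contained toy version (no stratification), for
# reference only:
def _demo_foldRotation(ids, n_folds, dev_count):
    fold_size = len(ids) // n_folds
    chunks = [ids[j*fold_size : (j+1)*fold_size if j < n_folds-1 else None]
              for j in range(n_folds)]
    splits = []
    for i in range(n_folds):
        test = chunks[i]
        dev_chunk = chunks[(i + 1) % n_folds]
        dev = dev_chunk[:dev_count]
        train = dev_chunk[dev_count:] + [
            x for j in range(n_folds) if j not in (i, (i + 1) % n_folds)
            for x in chunks[j]
        ]
        splits.append((train, dev, test))
    return splits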
def extractAllEntities(data_directories, log=log, with_full_text=False,
        errors='strict', by_document=False, polarity_type=int):
    '''
    Extract all Mobility, Action, Assistance, and Quantification entities
    from XML-formatted annotation files.

    @parameters
      data_directories :: list of directories containing .xml annotation files
      log              :: logging object to write to (defaults to dng_logger.log)
      with_full_text   :: include full document text in the "full_text" field
                          of each object
      errors           :: decoding error handling mode passed through to the
                          XML extractor
      by_document      :: if True, return a list of Document objects instead
                          of per-type entity lists
      polarity_type    :: type to cast entity polarity values to

    @returns
      mobilities      :: list of Mobility objects
      actions         :: list of Action objects
      assistances     :: list of Assistance objects
      quantifications :: list of Quantification objects
      (or a list of Document objects, if by_document=True)
    '''
    mobilities = []
    actions = []
    assistances = []
    quantifications = []
    documents = []

    extractor = XMLEntityExtractor()

    for dir_path in data_directories:
        files = os.listdir(dir_path)
        log.writeln('Extracting data from %s...' % dir_path)
        log.track(
            message='  >> Extracted entities from {0:,}/{1:,} files ({2:,} entities)',
            writeInterval=1)

        for f in files:
            fpath = os.path.join(dir_path, f)
            doc = extractor.extractMentions(fpath,
                with_full_text=with_full_text,
                errors=errors,
                polarity_type=polarity_type,
                as_document=True)
            doc.file_path = fpath
            doc.ID = f

            for m in doc.mobilities:
                m.file_ID = f
                mobilities.append(m)
            for m in doc.actions:
                m.file_ID = f
                actions.append(m)
            for m in doc.assistances:
                m.file_ID = f
                assistances.append(m)
            for m in doc.quantifications:
                m.file_ID = f
                quantifications.append(m)
            documents.append(doc)

            log.tick(len(files),
                len(mobilities) + len(actions) + len(assistances) + len(quantifications))
        log.flushTracker(len(files),
            len(mobilities) + len(actions) + len(assistances) + len(quantifications))

    if by_document:
        return documents
    else:
        return (mobilities, actions, assistances, quantifications)
    ('Input embedding file', embf),
    ('Input embedding file mode', options.embedding_mode),
    ('Output neighbor file', options.outputf),
    ('Ordered vocabulary file', options.vocabf),
    ('Number of nearest neighbors', options.k),
    ('Batch size', options.batch_size),
    ('Number of threads', options.threads),
    ('Partial nearest neighbors file for resuming', options.partial_neighbors_file),
], 'k Nearest Neighbor calculation with cosine similarity')

t_sub = log.startTimer('Reading embeddings from %s...' % embf)
emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(emb), '{0:.2f}'))

# write the node map if it doesn't exist yet, then read it back in either
# case (the original read it only in the else branch, leaving node_map
# undefined on a fresh run)
if not os.path.isfile(options.vocabf):
    log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
    writeNodeMap(emb, options.vocabf)
log.writeln('Reading node ID <-> vocab map from %s...\n' % options.vocabf)
node_map = readNodeMap(options.vocabf)

# get the vocabulary in node ID order, and map index in emb_arr to node IDs
node_IDs = list(node_map.keys())
node_IDs.sort()
ordered_vocab = [
    node_map[node_ID]
        for node_ID in node_IDs
]
emb_arr = np.array([
    emb[v] for v in ordered_vocab
])
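# For reference, the cosine-similarity kNN that the worker processes compute
# batch-wise presumably reduces to this dense numpy sketch (fine for small
# vocabularies; the threaded/batched pipeline above exists for scale):
def _denseCosineKNN(emb_arr, top_k):
    import numpy as np
    normed = emb_arr / np.linalg.norm(emb_arr, axis=1, keepdims=True)
    sims = normed @ normed.T
    np.fill_diagonal(sims, -np.inf)  # exclude self-neighbors
    return np.argsort(-sims, axis=1)[:, :top_k]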
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Mentions file', options.mentions_f),
    ('Mention map file', options.mention_map_f),
    ('Number of folds', options.num_folds),
    ('Dev set size', options.dev_size),
    ('Document ID filter list', options.filter_doc_ID_f),
    ('Random seed', options.random_seed),
    ('Output file', options.output_f),
], 'Cross-validation splits generation')

log.writeln('Loading mentions from %s...' % options.mentions_f)
mentions = mention_file.read(options.mentions_f)
log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

if options.filter_doc_ID_f:
    log.writeln('Reading mention map from %s...' % options.mention_map_f)
    mention_map = mention_map_lib.load(options.mention_map_f)
    log.writeln('Read mapping info for {0:,} mentions.\n'.format(len(mention_map)))

    log.writeln('Reading doc ID filter list from %s...' % options.filter_doc_ID_f)
    filter_doc_IDs = readFilterDocIDSet(options.filter_doc_ID_f)

    filtered_mentions = []
    for m in mentions:
        if mention_map[m.ID] in filter_doc_IDs:
            filtered_mentions.append(m)
], 'JET -- STR -> CUI file preprocessing')

t_sub = log.startTimer('Initializing tokenizer...')
tokenizer = tokenization.CLI.initializeTokenizer(options)
log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n')

t_sub = log.startTimer('Reading terminology file...')
ngrams, entities_by_term = readTerminology(
    options.input_f, tokenizer,
    remove_stopwords=options.remove_stopwords,
    use_collapsed_string=options.use_collapsed_string)
log.stopTimer(t_sub, message='Completed in {0:.2f}s.\n')

if options.verbose:
    log.writeln('\nRead map:')
    NGramMapPrinter.prn(ngrams)
    log.writeln('\nTerm ID-Entity mapping:')
    for term_ID in entities_by_term.keys():
        log.writeln('  %s -> %s' % (term_ID, entities_by_term[term_ID]))

picklebase = os.path.join(
    options.output_dir,
    os.path.splitext(os.path.basename(options.input_f))[0])

term_to_string_map_f = '%s.term_to_string_map.txt' % picklebase
t_sub = log.startTimer('Writing term ID-string map to %s...' % term_to_string_map_f)
writeTermStringMap(ngrams, term_to_string_map_f)
log.stopTimer(t_sub)
        default=10)
    parser.add_option('-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    (options, args) = parser.parse_args()

    if len(args) == 0:
        parser.print_help()
        exit()
    neighbor_files = args
    return neighbor_files, options

neighbor_files, options = _cli()
log.start(options.logfile)
log.writeConfig([
    *[
        ('Neighborhood sample file %d' % (i + 1), neighbor_files[i])
            for i in range(len(neighbor_files))
    ],
    ('Output file', options.outputf),
    ('Number of neighbors to include in edge construction', options.k),
], 'Nearest neighborhood graph generation')

graph = buildGraph(neighbor_files, options.k)

log.writeln('Writing graph to %s...' % options.outputf)
writeGraph(graph, options.outputf)
log.writeln('Done!')

log.stop()
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Input embeddings', options.inputf),
    ('Vocabulary file', options.vocabf),
    ('Output embeddings', options.outputf),
    ('Output embeddings format', options.output_format),
])

t_sub = log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
e = pyemblib.read(options.inputf,
    format=pyemblib.Format.Word2Vec,
    mode=pyemblib.Mode.Text)
log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(
    len(e), '{0:.2f}'))

log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
vocab = readVocab(options.vocabf)
log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

# remap integer node IDs back to their original vocabulary strings
e = {vocab[int(k)]: v for (k, v) in e.items()}

log.writeln('Writing remapped embeddings to %s...' % options.outputf)
(fmt, mode) = pyemblib.CLI_Formats.parse(options.output_format)
pyemblib.write(e, options.outputf, format=fmt, mode=mode, verbose=True)
log.writeln('Done!')

log.stop()
    if not options.bert_f:
        parser.error('Must provide --bert-output')
    elif not options.overlaps_f:
        parser.error('Must provide --overlaps')
    elif not options.output_f:
        parser.error('Must provide --output')
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('BERT output', options.bert_f),
    ('Overlaps file', options.overlaps_f),
    ('Output file', options.output_f),
], 'BERT embedding recombination')

log.writeln('Reading overlaps from %s...' % options.overlaps_f)
overlaps = readOverlaps(options.overlaps_f)
log.writeln('Read overlaps for {0:,} lines.\n'.format(len(overlaps)))

log.writeln('Streaming BERT output conversion...')
streamingBERTConvert(
    options.bert_f,
    overlaps,
    options.output_f,
    options.tokenized_f
)
log.writeln('Done.')

log.stop()
    (options, args) = parser.parse_args()
    if not options.corpus_f:
        parser.print_help()
        parser.error('Must provide --corpus')
    if not options.annotations_f:
        parser.print_help()
        parser.error('Must provide --annotations')
    if not options.term_strings_f:
        parser.print_help()
        parser.error('Must provide --term-strings')
    return options

options = _cli()
log.start(options.logfile)
log.writeConfig([
    ('Corpus file', options.corpus_f),
    ('Annotations file', options.annotations_f),
    ('Term strings file', options.term_strings_f),
], 'JET annotation validation')

log.writeln('Reading term->strings mapping from %s...' % options.term_strings_f)
term_map = readTermMap(options.term_strings_f)
log.writeln('Mapped strings for {0:,} terms.\n'.format(len(term_map)))

log.writeln('Validating corpus annotations...')
validate(options.corpus_f, options.annotations_f, term_map)
log.writeln('Done!\n')

log.stop()
    ('Input file', options.input_file),
    ('Output settings', [
        ('Base path', options.output_file),
        ('Tokenized file', output_tokens),
        ('Subsequences file', output_subsequences),
        ('Overlaps file', output_overlaps),
        ('Log file', output_log),
    ]),
    ('Max subsequence length', options.max_sequence_length),
    ('Overlap fraction', options.overlap),
    ('BERT vocab file', options.vocab_file),
])

# reserve two positions for BERT's [CLS] and [SEP] markers
options.max_sequence_length -= 2

log.writeln('Tokenizing input file %s...' % options.input_file)
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=options.vocab_file,
    do_lower_case=True
)
num_lines = 0
with open(options.input_file, 'r') as input_stream, \
     open(output_tokens, 'w') as output_stream:
    for line in input_stream:
        tokens = tokenizer.tokenize(line.strip())
        output_stream.write('%s\n' % (' '.join(tokens)))
        num_lines += 1
log.writeln('Wrote {0:,} tokenized lines.\n'.format(num_lines))

log.writeln('Reading tokenized lines from %s...' % output_tokens)
tokenized_lines = []
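# The subsequence-splitting step later in this script consumes these
# tokenized lines; a minimal sketch of overlapping windowing under the same
# max_sequence_length / overlap-fraction settings (assumed semantics, the
# real implementation may differ):
def _overlapWindows(tokens, max_len, overlap_frac):
    step = max(1, int(max_len * (1 - overlap_frac)))
    return [tokens[i:i + max_len] for i in range(0, max(1, len(tokens)), step)]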