Code example #1
            dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)
        (options, args) = parser.parse_args()
        if (not options.inputf) or (not options.outputf) or (
                not options.vocabf):
            parser.print_help()
            exit()
        return options

    options = _cli()

    log.start(options.logfile)
    log.writeConfig([
        ('Input embeddings', options.inputf),
        ('Vocabulary file', options.vocabf),
        ('Output embeddings', options.outputf),
        ('Output embeddings format', options.output_format),
    ])

    log.startTimer('Reading node2vec embeddings from %s...' % options.inputf)
    e = pyemblib.read(options.inputf,
                      format=pyemblib.Format.Word2Vec,
                      mode=pyemblib.Mode.Text)
    log.stopTimer(
        message='Read {0:,} embeddings in {1}s.\n'.format(len(e), '{0:.2f}'))

    log.writeln('Reading vocabulary mapping from %s...' % options.vocabf)
    vocab = readVocab(options.vocabf)
    log.writeln('Read {0:,} vocab mappings.\n'.format(len(vocab)))

    e = {vocab[int(k)]: v for (k, v) in e.items()}
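
The remapping step above requires `readVocab` to return a dict keyed by integer node IDs; that helper is not shown in this fragment. A minimal sketch of one possible implementation, assuming a tab-separated `node_id<TAB>token` vocabulary file, is:

def readVocab(vocabf):
    # Hypothetical reader: each line is "<node_id>\t<token>"; returns
    # {int node ID: token} so node2vec embeddings keyed by node ID can
    # be remapped to vocabulary terms (as in the dict comprehension above).
    vocab = {}
    with open(vocabf, encoding='utf-8') as stream:
        for line in stream:
            node_id, token = line.rstrip('\n').split('\t')
            vocab[int(node_id)] = token
    return vocab
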
Code example #2
        (options, args) = parser.parse_args()
        if not options.corpus_f:
            parser.print_help()
            parser.error('Must provide --corpus')
        if not options.annotations_f:
            parser.print_help()
            parser.error('Must provide --annotations')
        if not options.term_strings_f:
            parser.print_help()
            parser.error('Must provide --term-strings')
        return options

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Corpus file', options.corpus_f),
        ('Annotations file', options.annotations_f),
        ('Term strings file', options.term_strings_f),
    ], 'JET annotation validation')

    log.writeln('Reading term->strings mapping from %s...' %
                options.term_strings_f)
    term_map = readTermMap(options.term_strings_f)
    log.writeln('Mapped strings for {0:,} terms.\n'.format(len(term_map)))

    log.writeln('Validating corpus annotations...')
    validate(options.corpus_f, options.annotations_f, term_map)
    log.writeln('Done!\n')

    log.stop()
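
`readTermMap` is defined elsewhere in the script; a minimal sketch of one plausible implementation, assuming each line pairs a term identifier with one of its surface strings (`term_ID<TAB>string`), is:

def readTermMap(term_strings_f):
    # Hypothetical reader: accumulate every surface string listed for a
    # term so annotations can be validated against all known spellings.
    term_map = {}
    with open(term_strings_f, encoding='utf-8') as stream:
        for line in stream:
            term_id, string = line.rstrip('\n').split('\t', 1)
            term_map.setdefault(term_id, []).append(string)
    return term_map
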
Code example #3
    log.writeConfig(
        [
            ('Mention file', mentionf),
            ('Entity embedding settings', entity_settings),
            ('Word/ctx embeddings', options.ctx_embf),
            ('Word vocabulary (unused if empty)', options.word_vocabf),
            ('Writing predictions to', options.preds_file),
            ('Using feature normalization', options.normalize_features),
            ('Classification algorithm', options.classifier),
            ('Training settings', [
                ('Cross validation splits file', options.cross_validation_file),
                ('Number of folds', options.n_folds),
                ('Fraction of training used for dev', options.dev_size),
                ('Random seed', options.random_seed),
            ]),
            ('Hyperparameter settings', [
                ('Evaluating on development data', options.eval_on_dev),
                ('Using entity embeddings at all', options.use_entity_embeddings),
                ('Using full entity embeddings instead of cos sim',
                 options.full_entity_embeddings),
                ('Using context embeddings', options.use_ctx_embeddings),
                ('Including unigram features', options.unigram_features),
                ('Using TF-IDF values for unigram features',
                 options.unigrams_as_tfidf if options.unigram_features else 'N/A'),
                ('Using Action oracle', options.action_oracle),
                ('Input predictions file', options.input_predsf),
                ('Pre-embedded mentions', options.pre_embedded),
            ]),
        ],
        title="Entity linking (disambiguation) experiment using scikit-learn baseline algorithms"
    )
Code example #4
        (options, args) = parser.parse_args()
        if not options.splitsf:
            parser.print_help()
            parser.error('Must provide --splits')
        if len(args) != 2:
            parser.print_help()
            exit()
        return args, options

    (mentionf, predsf), options = _cli()
    log.start(logfile=options.logfile)

    log.writeConfig([
        ('Mention file', mentionf),
        ('Key remapping file', options.keymapf),
        ('Predictions file', predsf),
        ('No scores in predictions', options.no_scores),
        ('Cross-validation splits file', options.splitsf),
        ('Evaluating on development data', options.dev),
    ], 'BTRIS Mobility code-level predictions analysis')

    log.writeln('Reading mentions from %s...' % mentionf)
    mentions = mention_file.read(mentionf)
    log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

    log.writeln('Reading splits from %s...' % options.splitsf)
    splits = cross_validation.readSplits(options.splitsf)
    log.writeln('Read {0:,} splits.\n'.format(len(splits)))

    log.writeln('Compiling evaluation set...')
    eval_set = compileEvaluationSet(splits, options.dev)
    log.writeln('Evaluating on {0:,} samples.\n'.format(len(eval_set)))
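
`compileEvaluationSet` is not included in this fragment; a minimal sketch, assuming each split is a `(train, dev, test)` tuple of mention IDs, is:

def compileEvaluationSet(splits, use_dev):
    # Hypothetical helper: pool the dev (or test) portion of every
    # cross-validation fold into a single set of evaluation sample IDs.
    eval_set = set()
    for (train_ids, dev_ids, test_ids) in splits:
        eval_set.update(dev_ids if use_dev else test_ids)
    return eval_set
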
Code example #5
        if len(args) != 1:
            parser.print_help()
            parser.error('Must supply only MENTIONS')
        if not options.definitions_file:
            parser.print_help()
            parser.error('Must supply --definitions')

        (mentionf, ) = args
        return mentionf, options

    ## Getting configuration settings
    mentionf, options = _cli()
    log.start(logfile=options.logfile)
    log.writeConfig([
        ('Mention file', mentionf),
        ('Entity definitions file', options.definitions_file),
        ('Restricting to main definitions only', options.main_only),
    ],
                    title="Adapted Lesk similarity baseline")

    t_sub = log.startTimer('Reading mentions from %s...' % mentionf)
    mentions = mention_file.read(mentionf)
    log.stopTimer(t_sub,
                  message='Read %s mentions ({0:.2f}s)\n' %
                  ('{0:,}'.format(len(mentions))))

    log.writeln('Reading definitions from %s...' % options.definitions_file)
    definitions = readCodeDefinitions(options.definitions_file,
                                      options.main_only)
    log.writeln('Read definitions for {0:,} codes.\n'.format(len(definitions)))

    if options.preds_file:
Code example #6
            'N/A' if options.tokenizer != Tokenizer.BERT else options.bert_vocab_file
        )),
        ('Extraction mode', config['ExtractionMode']),
        ('Annotation directories', config['DataDirectories']),
    ]
    if config['ExtractionMode'] == 'csv':
        settings.extend([
            ('Plaintext directory', config['PlaintextDirectory']),
            ('CSV file ID pattern', config['CSVIdentifierPattern']),
            ('Plaintext file render pattern', config['PlaintextIdentifierPattern'])
        ])
    settings.extend([
        ('Output mentions file', options.outputf),
        ('Mention map file (automatic)', options.mention_map_file),
    ])
    log.writeConfig(settings, title='Mention extraction for action classification')

    t_sub = log.startTimer('Generating %s features.' % options.dataset)
    mentions, mention_map = getAllMentions(config, options,
        tokenizer=options.tokenizer, bert_vocab_file=options.bert_vocab_file,
        log=log)
    log.stopTimer(t_sub, 'Extracted {0:,} samples.'.format(len(mentions)))

    log.writeln('Writing mention map information to %s...' % options.mention_map_file)
    with open(options.mention_map_file, 'w') as stream:
        for (mention_ID, mention_info) in mention_map.items():
            stream.write('%d\t%s\n' % (mention_ID, mention_info))
    log.writeln('Wrote info for {0:,} mentions.\n'.format(len(mention_map)))

    t_sub = log.startTimer('Writing samples to %s...' % options.outputf, newline=False)
    mention_file.write(mentions, options.outputf)
Code example #7
        (options, args) = parser.parse_args()

        if not options.bert_f:
            parser.error('Must provide --bert-output')
        elif not options.overlaps_f:
            parser.error('Must provide --overlaps')
        elif not options.output_f:
            parser.error('Must provide --output')

        return options

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('BERT output', options.bert_f),
        ('Overlaps file', options.overlaps_f),
        ('Output file', options.output_f),
    ], 'BERT embedding recombination')

    log.writeln('Reading overlaps from %s...' % options.overlaps_f)
    overlaps = readOverlaps(options.overlaps_f)
    log.writeln('Read overlaps for {0:,} lines.\n'.format(len(overlaps)))

    log.writeln('Streaming BERT output conversion...')
    streamingBERTConvert(
        options.bert_f,
        overlaps,
        options.output_f,
        options.tokenized_f
    )
    log.writeln('Done.')
Code example #8
            parser.error('Must supply --bert-dir')
        elif not options.model:
            parser.error('Must supply --model')

        options.output_f = os.path.join(
            options.bert_dir, '%s.compiled_output.predictions' % options.model)
        options.logfile = '%s.log' % options.output_f

        return options

    options = _cli()
    log.start(options.logfile)

    log.writeConfig([
        ('Mentions file', options.mentions_f),
        ('BERT baseline root directory', options.bert_dir),
        ('Model configuration', options.model),
        ('Output file', options.output_f),
    ], 'BERT baseline results compilation')

    log.writeln('Reading mentions from %s...' % options.mentions_f)
    mentions = mention_file.read(options.mentions_f)
    mentions_by_ID = {m.ID: m for m in mentions}
    log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

    fold_dirs = glob.glob(os.path.join(options.bert_dir, 'fold-*'))
    log.writeln('Found {0} folds in {1}.\n'.format(len(fold_dirs),
                                                   options.bert_dir))

    with open(options.output_f, 'w') as stream:
        fold_dirs = sorted(fold_dirs)
        for i in range(len(fold_dirs)):
Code example #9
        (options, args) = parser.parse_args()
        if not options.input_f:
            parser.print_help()
            parser.error('Must provide --input')
        if not options.output_dir:
            options.output_dir = os.path.dirname(options.input_f)
        return options

    sys.setrecursionlimit(1800)

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Terminology file', options.input_f),
        ('Storing pickled maps to', options.output_dir),
        ('Map concepts separated by', options.sep),
        ('Removing stopword terms', options.remove_stopwords),
        ('Tokenization settings', tokenization.CLI.logOptions(options)),
    ], 'JET -- STR -> CUI file preprocessing')

    t_sub = log.startTimer('Initializing tokenizer...')
    tokenizer = tokenization.CLI.initializeTokenizer(options)
    log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n')

    t_sub = log.startTimer('Reading terminology file...')
    ngrams, entities_by_term = readTerminology(
        options.input_f,
        tokenizer,
        remove_stopwords=options.remove_stopwords,
        use_collapsed_string=options.use_collapsed_string)
    log.stopTimer(t_sub, message='Completed in {0:.2f}s.\n')
Code example #10
        if not options.mentions_f:
            parser.error('Must supply --mentions')
        elif not options.output_f:
            parser.error('Must supply --output')
        elif options.filter_doc_ID_f and not options.mention_map_f:
            parser.error('Must supply --mention-map if using --filter-doc-IDs')

        return options

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Mentions file', options.mentions_f),
        ('Mention map file', options.mention_map_f),
        ('Number of folds', options.num_folds),
        ('Dev set size', options.dev_size),
        ('Document ID filter list', options.filter_doc_ID_f),
        ('Random seed', options.random_seed),
        ('Output file', options.output_f),
    ], 'Cross-validation splits generation')

    log.writeln('Loading mentions from %s...' % options.mentions_f)
    mentions = mention_file.read(options.mentions_f)
    log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

    if options.filter_doc_ID_f:
        log.writeln('Reading mention map from %s...' % options.mention_map_f)
        mention_map = mention_map_lib.load(options.mention_map_f)
        log.writeln('Read mapping info for {0:,} mentions.\n'.format(
            len(mention_map)))
Code example #11
        if not options.output_f:
            parser.print_help()
            parser.error('Must provide --output')
        if options.threads < 3:
            parser.print_help()
            parser.error('--threads must be at least 3')
        return options

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Plaintext corpus file', options.input_f),
        ('Pickled ngram->term map', options.terminology_pkl_f),
        ('Output annotations file', options.output_f),
        ('Tagging settings', [
            ('Number of tagging threads', options.threads),
            ('Line queue size cap',
             'unlimited' if options.maxlines <= 0 else options.maxlines),
        ]),
        ('Tokenization settings', tokenization.CLI.logOptions(options)),
    ], 'JET -- Automated corpus tagging')

    t_sub = log.startTimer('Loading pickled strings map...')
    compiled_terminology = pickleio.read(options.terminology_pkl_f)
    log.stopTimer(t_sub, message='Done in {0:.2f}s.\n')

    t_sub = log.startTimer('Initializing tokenizer...')
    tokenizer = tokenization.CLI.initializeTokenizer(options)
    log.stopTimer(t_sub, message='Tokenizer ready in {0:.2f}s.\n')

    t_sub = log.startTimer('Tagging corpus...')
Code example #12
            default=10)
        parser.add_option(
            '-l',
            '--logfile',
            dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)
        (options, args) = parser.parse_args()
        if len(args) == 0:
            parser.print_help()
            exit()
        neighbor_files = args
        return neighbor_files, options

    neighbor_files, options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        *[('Neighborhood sample file %d' % (i + 1), neighbor_files[i])
          for i in range(len(neighbor_files))],
        ('Output file', options.outputf),
        ('Number of neighbors to include in edge construction', options.k),
    ], 'Nearest neighborhood graph generation')

    graph = buildGraph(neighbor_files, options.k)

    log.write('Writing graph to %s...' % options.outputf)
    writeGraph(graph, options.outputf)
    log.writeln('Done!')

    log.stop()
Code example #13
                help='name of file to write log contents to (empty for stdout)',
                default=None)
        (options, args) = parser.parse_args()
        if len(args) != 1:
            parser.print_help()
            exit()
        (embf,) = args
        return embf, options

    embf, options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('Input embedding file', embf),
        ('Input embedding file mode', options.embedding_mode),
        ('Output neighbor file', options.outputf),
        ('Ordered vocabulary file', options.vocabf),
        ('Number of nearest neighbors', options.k),
        ('Batch size', options.batch_size),
        ('Number of threads', options.threads),
        ('Partial nearest neighbors file for resuming', options.partial_neighbors_file),
    ], 'k Nearest Neighbor calculation with cosine similarity')

    t_sub = log.startTimer('Reading embeddings from %s...' % embf)
    emb = pyemblib.read(embf, mode=options.embedding_mode, errors='replace')
    log.stopTimer(t_sub, message='Read {0:,} embeddings in {1}s.\n'.format(len(emb), '{0:.2f}'))

    if not os.path.isfile(options.vocabf):
        log.writeln('Writing node ID <-> vocab map to %s...\n' % options.vocabf)
        writeNodeMap(emb, options.vocabf)
    else:
        log.writeln('Reading node ID <-> vocab map from %s...\n' % options.vocabf)
    node_map = readNodeMap(options.vocabf)
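
The node map helpers are not shown here; a minimal sketch of a matching writer/reader pair, assuming a tab-separated `node_id<TAB>word` layout, is:

def writeNodeMap(emb, vocabf):
    # Hypothetical writer: assign a stable integer node ID to every
    # embedding key and store the mapping one pair per line.
    with open(vocabf, 'w', encoding='utf-8') as stream:
        for node_id, word in enumerate(sorted(emb.keys())):
            stream.write('%d\t%s\n' % (node_id, word))

def readNodeMap(vocabf):
    # Hypothetical reader: inverse of writeNodeMap, returning
    # {node ID: word} for labeling nearest-neighbor output.
    node_map = {}
    with open(vocabf, encoding='utf-8') as stream:
        for line in stream:
            node_id, word = line.rstrip('\n').split('\t')
            node_map[int(node_id)] = word
    return node_map
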
Code example #14
        (options, args) = parser.parse_args()
        if not options.input_f:
            parser.error('Must provide --input')
        elif not options.output_f:
            parser.error('Must provide --output')
        elif not options.key_f:
            parser.error('Must provide --keys')
        return options

    options = _cli()
    log.start(options.logfile)
    log.writeConfig([
        ('HDF5 embeddings', options.input_f),
        ('HDF5 layer',
         ('Average' if options.layer == AVERAGE_LAYERS else options.layer)),
        ('Per-row keys', options.key_f),
        ('Mentions file', options.mentions_f),
        ('Using Action oracle', options.action_oracle),
        ('Output embedded mentions file', options.output_f),
    ], 'Embedded mentions file generation with pre-generated HDF5 features')

    log.writeln('Reading keys from %s...' % options.key_f)
    keys = readKeys(options.key_f)
    log.writeln('Read {0:,} keys.\n'.format(len(keys)))

    log.writeln('Reading textual mentions from %s...' % options.mentions_f)
    mentions = mention_file.read(options.mentions_f)
    mentions_by_id = {m.ID: m for m in mentions}
    log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

    log.writeln('Generating embedded mentions from HDF5 file %s...' %
Code example #15
        return options

    options = _cli()

    output_tokens = '%s.tokens' % options.output_file
    output_subsequences = '%s.subsequences' % options.output_file
    output_overlaps = '%s.overlaps' % options.output_file
    output_log = '%s.log' % options.output_file

    log.start(output_log)
    log.writeConfig([
        ('Input file', options.input_file),
        ('Output settings', [
            ('Base path', options.output_file),
            ('Tokenized file', output_tokens),
            ('Subsequences file', output_subsequences),
            ('Overlaps file', output_overlaps),
            ('Log file', output_log),
        ]),
        ('Max subsequence length', options.max_sequence_length),
        ('Overlap fraction', options.overlap),
        ('BERT vocab file', options.vocab_file)
    ])

    # subtract 2 to leave room for BERT's [CLS] and [SEP] special tokens
    options.max_sequence_length -= 2

    log.writeln('Tokenizing input file %s...' % options.input_file)
    tokenizer = bert.tokenization.FullTokenizer(
        vocab_file=options.vocab_file,
        do_lower_case=True
    )
    num_lines = 0
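
The rest of this script (not shown) splits each tokenized line into overlapping subsequences no longer than the adjusted maximum, recording how many tokens each window shares with the previous one. A minimal sketch of that windowing step, assuming `options.overlap` is the fraction of each window repeated in the next, is:

def splitIntoSubsequences(tokens, max_len, overlap_fraction):
    # Hypothetical windowing helper: yields (window, n_overlap) pairs, where
    # n_overlap counts the leading tokens repeated from the previous window
    # (0 for the first window); windows advance by max_len * (1 - overlap).
    step = max(1, int(max_len * (1 - overlap_fraction)))
    n_overlap = max_len - step
    start = 0
    while start < len(tokens):
        window = tokens[start:start + max_len]
        yield window, (0 if start == 0 else min(n_overlap, len(window)))
        if start + max_len >= len(tokens):
            break
        start += step
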