        dest='logfile', help='file to write log messages to')
    # Tail of _cli(): the parser.add_option() call this continues, and the
    # rest of the OptionParser setup, are cut off before this chunk.
    (options, args) = parser.parse_args()
    # All three input files are mandatory; show usage before erroring out.
    if not options.corpus_f:
        parser.print_help()
        parser.error('Must provide --corpus')
    if not options.annotations_f:
        parser.print_help()
        parser.error('Must provide --annotations')
    if not options.term_strings_f:
        parser.print_help()
        parser.error('Must provide --term-strings')
    return options

# ---- Script driver: JET annotation validation ----
# NOTE(review): original indentation was lost in this paste; this driver code
# may have been nested under `if __name__ == '__main__':` -- confirm against
# the full file.
options = _cli()
log.start(options.logfile)
# Echo the run configuration into the log for reproducibility.
log.writeConfig([
    ('Corpus file', options.corpus_f),
    ('Annotations file', options.annotations_f),
    ('Term strings file', options.term_strings_f),
], 'JET annotation validation')

log.writeln('Reading term->strings mapping from %s...' % options.term_strings_f)
term_map = readTermMap(options.term_strings_f)
log.writeln('Mapped strings for {0:,} terms.\n'.format(len(term_map)))

log.writeln('Validating corpus annotations...')
validate(options.corpus_f, options.annotations_f, term_map)
log.writeln('Done!\n')
    # Tail of _cli(): validate the parsed options, bailing out via _bail()
    # (defined outside this chunk) on any invalid combination.
    if len(args) != 1:
        _bail('Must supply only MENTIONS')
    elif (options.use_entity_embeddings and len(options.entity_embfs) == 0):
        _bail('Must supply --entities')
    elif (options.use_ctx_embeddings and not options.ctx_embf):
        _bail('Must supply --ctxs')
    elif (options.dev_size <= 0 or options.dev_size >= 1):
        _bail('--dev-size must be between (0,1)')
    # Exactly one positional argument (checked above): the mentions file.
    (mentionf, ) = args
    return mentionf, options

## Getting configuration settings
# NOTE(review): original indentation was lost in this paste; this driver code
# may have been nested under `if __name__ == '__main__':` -- confirm.
mentionf, options = _cli()
log.start(logfile=options.logfile, stdout_also=True)
# One (label, path) pair per entity-embedding file, for the config dump below.
entity_settings = [('Entities %d' % i, options.entity_embfs[i]) for i in range(len(options.entity_embfs))]
log.writeConfig(
    [
        ('Mention file', mentionf),
        ('Entity embedding settings', entity_settings),
        ('Word/ctx embeddings', options.ctx_embf),
        ('Word vocabulary (unused if empty)', options.word_vocabf),
        ('Writing predictions to', options.preds_file),
        ('Using feature normalization', options.normalize_features),
        ('Classification algorithm', options.classifier),
        ('Training settings', [
            ('Cross validation splits file', options.cross_validation_file),
            # chunk is cut off here mid-list; the remaining settings, the
            # closing brackets, and the writeConfig title continue past this view
            ('Number of folds', options.n_folds),
        '-l', '--logfile', dest='logfile',
        help='name of file to write log contents to (empty for stdout)',
        default=None)
    # Tail of _cli(): the parser.add_option() call this continues, and the
    # rest of the OptionParser setup, are cut off before this chunk.
    (options, args) = parser.parse_args()
    if not options.splitsf:
        parser.print_help()
        parser.error('Must provide --splits')
    # Exactly two positional args expected (mentions file, predictions file);
    # otherwise show usage and exit without an error message.
    if len(args) != 2:
        parser.print_help()
        exit()
    return args, options

# ---- Script driver: BTRIS Mobility predictions analysis ----
# NOTE(review): original indentation was lost in this paste; this driver code
# may have been nested under `if __name__ == '__main__':` -- confirm.
(mentionf, predsf), options = _cli()
log.start(logfile=options.logfile)
# Echo the run configuration into the log for reproducibility.
log.writeConfig([
    ('Mention file', mentionf),
    ('Key remapping file', options.keymapf),
    ('Predictions file', predsf),
    ('No scores in predictions', options.no_scores),
    ('Cross-validation splits file', options.splitsf),
    ('Evaluating on development data', options.dev),
], 'BTRIS Mobility code-level predictions analysis')

log.writeln('Reading mentions from %s...' % mentionf)
mentions = mention_file.read(mentionf)
log.writeln('Read {0:,} mentions.\n'.format(len(mentions)))

log.writeln('Reading splits from %s...' % options.splitsf)
    # Tail of _cli(): continuation of a validation if/elif chain whose first
    # branch is cut off before this chunk.
    elif not options.output_file:
        parser.error('Must provide --output')
    elif not options.vocab_file:
        parser.error('Must provide --vocab-file')
    elif options.overlap < 0 or options.overlap >= 1:
        parser.error('--overlap must be between [0,1)')
    return options

# ---- Script driver: tokenization/subsequence splitting ----
# NOTE(review): original indentation was lost in this paste; this driver code
# may have been nested under `if __name__ == '__main__':` -- confirm.
options = _cli()
# All outputs share the --output path as a common base, distinguished by suffix.
output_tokens = '%s.tokens' % options.output_file
output_subsequences = '%s.subsequences' % options.output_file
output_overlaps = '%s.overlaps' % options.output_file
output_log = '%s.log' % options.output_file
log.start(output_log)
# Echo the run configuration into the log for reproducibility.
log.writeConfig([
    ('Input file', options.input_file),
    ('Output settings', [
        ('Base path', options.output_file),
        ('Tokenized file', output_tokens),
        ('Subsequences file', output_subsequences),
        ('Overlaps file', output_overlaps),
        ('Log file', output_log),
    ]),
    ('Max subsequence length', options.max_sequence_length),
    ('Overlap fraction', options.overlap),
    ('BERT vocab file', options.vocab_file)
])
# Shrink the usable sequence length by 2 -- presumably to reserve room for
# BERT's [CLS]/[SEP] special tokens; TODO confirm against the consumer code.
options.max_sequence_length -= 2