Esempio n. 1
0
                          dest='logfile',
                          help='file to write log messages to')
        (options, args) = parser.parse_args()
        if not options.corpus_f:
            parser.print_help()
            parser.error('Must provide --corpus')
        if not options.annotations_f:
            parser.print_help()
            parser.error('Must provide --annotations')
        if not options.term_strings_f:
            parser.print_help()
            parser.error('Must provide --term-strings')
        return options

    # Parse command-line arguments and start logging to the requested file.
    options = _cli()
    log.start(options.logfile)

    # Echo the run configuration into the log before doing any work.
    config_entries = [
        ('Corpus file', options.corpus_f),
        ('Annotations file', options.annotations_f),
        ('Term strings file', options.term_strings_f),
    ]
    log.writeConfig(config_entries, 'JET annotation validation')

    # Load the term->strings mapping and report how many terms it holds.
    term_strings_path = options.term_strings_f
    log.writeln('Reading term->strings mapping from %s...' % term_strings_path)
    term_map = readTermMap(term_strings_path)
    n_terms = len(term_map)
    log.writeln('Mapped strings for {0:,} terms.\n'.format(n_terms))

    # Run validation over the corpus/annotations pair using that mapping.
    log.writeln('Validating corpus annotations...')
    validate(options.corpus_f, options.annotations_f, term_map)
    log.writeln('Done!\n')
        if len(args) != 1:
            _bail('Must supply only MENTIONS')
        elif (options.use_entity_embeddings
              and len(options.entity_embfs) == 0):
            _bail('Must supply --entities')
        elif (options.use_ctx_embeddings and not options.ctx_embf):
            _bail('Must supply --ctxs')
        elif (options.dev_size <= 0 or options.dev_size >= 1):
            _bail('--dev-size must be between (0,1)')

        (mentionf, ) = args
        return mentionf, options

    ## Getting configuration settings
    mentionf, options = _cli()
    log.start(logfile=options.logfile, stdout_also=True)
    # Pair each entity embedding file with a label carrying its position.
    entity_settings = [
        ('Entities %d' % idx, embf)
        for idx, embf in enumerate(options.entity_embfs)
    ]
    log.writeConfig(
        [
            ('Mention file', mentionf),
            ('Entity embedding settings', entity_settings),
            ('Word/ctx embeddings', options.ctx_embf),
            ('Word vocabulary (unused if empty)', options.word_vocabf),
            ('Writing predictions to', options.preds_file),
            ('Using feature normalization', options.normalize_features),
            ('Classification algorithm', options.classifier),
            ('Training settings', [
                ('Cross validation splits file',
                 options.cross_validation_file),
                ('Number of folds', options.n_folds),
            '-l',
            '--logfile',
            dest='logfile',
            help='name of file to write log contents to (empty for stdout)',
            default=None)
        (options, args) = parser.parse_args()
        if not options.splitsf:
            parser.print_help()
            parser.error('Must provide --splits')
        if len(args) != 2:
            parser.print_help()
            exit()
        return args, options

    # _cli() hands back the positional arguments together with the parsed
    # options; the positionals are the mention file and the predictions file.
    positional, options = _cli()
    mentionf, predsf = positional
    log.start(logfile=options.logfile)

    # Echo the run configuration into the log.
    config_entries = [
        ('Mention file', mentionf),
        ('Key remapping file', options.keymapf),
        ('Predictions file', predsf),
        ('No scores in predictions', options.no_scores),
        ('Cross-validation splits file', options.splitsf),
        ('Evaluating on development data', options.dev),
    ]
    log.writeConfig(
        config_entries,
        'BTRIS Mobility code-level predictions analysis'
    )

    # Load the mentions and report how many were read.
    log.writeln('Reading mentions from %s...' % mentionf)
    mentions = mention_file.read(mentionf)
    n_mentions = len(mentions)
    log.writeln('Read {0:,} mentions.\n'.format(n_mentions))

    splits_path = options.splitsf
    log.writeln('Reading splits from %s...' % splits_path)
        elif not options.output_file:
            parser.error('Must provide --output')
        elif not options.vocab_file:
            parser.error('Must provide --vocab-file')
        elif options.overlap < 0 or options.overlap >= 1:
            parser.error('--overlap must be between [0,1)')

        return options
    # Parse and validate the command-line options.
    options = _cli()

    # Every output artifact shares the user-supplied base path and differs
    # only in its suffix.
    base_path = options.output_file
    output_tokens = '%s.tokens' % base_path
    output_subsequences = '%s.subsequences' % base_path
    output_overlaps = '%s.overlaps' % base_path
    output_log = '%s.log' % base_path

    log.start(output_log)

    # Echo the run configuration into the log, grouping the output paths.
    output_settings = [
        ('Base path', base_path),
        ('Tokenized file', output_tokens),
        ('Subsequences file', output_subsequences),
        ('Overlaps file', output_overlaps),
        ('Log file', output_log),
    ]
    log.writeConfig([
        ('Input file', options.input_file),
        ('Output settings', output_settings),
        ('Max subsequence length', options.max_sequence_length),
        ('Overlap fraction', options.overlap),
        ('BERT vocab file', options.vocab_file)
    ])

    # Shrink the usable length by two. NOTE(review): presumably this reserves
    # room for two special tokens (e.g. BERT's [CLS]/[SEP]) -- confirm.
    options.max_sequence_length = options.max_sequence_length - 2