def parse_args(argv=None):
    """Parse and post-process the command line arguments.

    Registers the main and cloud arguments on a fresh parser, parses
    ``argv`` strictly (unknown arguments are rejected by argparse), then
    hands the result to ``process_main_args`` and ``process_cloud_args``.

    Args:
        argv: list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    arg_parser = argparse.ArgumentParser()
    add_main_args(arg_parser)
    add_cloud_args(arg_parser)

    namespace = arg_parser.parse_args(argv)

    process_main_args(arg_parser, namespace)
    # The output path and a fixed name are passed on to the cloud processing.
    process_cloud_args(
        namespace,
        namespace.output_path,
        name='sciencbeam-gym-preprocessing'
    )

    get_logger().info('parsed_args: %s', namespace)
    return namespace
Ejemplo n.º 2
0
def parse_args(argv=None):
    """Parse and post-process the command line arguments.

    Args:
        argv: list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace`` after it has been passed through
        ``process_main_args`` and ``process_cloud_args``.
    """
    arg_parser = argparse.ArgumentParser()
    add_main_args(arg_parser)
    add_cloud_args(arg_parser)

    parsed = arg_parser.parse_args(argv)

    # Raise the root logger verbosity before any further processing so that
    # debug output of the following steps is visible.
    if parsed.debug:
        root_logger = logging.getLogger()
        root_logger.setLevel('DEBUG')

    process_main_args(parsed, arg_parser)
    process_cloud_args(
        parsed,
        parsed.output_path,
        name='sciencebeam-convert'
    )

    get_logger().info('args: %s', parsed)
    return parsed
Ejemplo n.º 3
0
def add_args(parser: argparse.ArgumentParser):
    """Register the command line arguments on ``parser``.

    Adds a ``source`` argument group (string options describing where the
    input files come from), the required ``--output-path``, an optional
    ``--limit``, two boolean switches, and finally the cloud arguments via
    ``add_cloud_args``.

    Args:
        parser: the parser to add the arguments to (modified in place).
    """
    # (flag, extra keyword arguments); all source options are strings.
    source_option_specs = (
        ('--source-base-path', {
            'help': 'source base data path for files to fix'
        }),
        ('--source-path', {
            'help': 'source path to a specific file to fix'
        }),
        ('--source-filename-pattern', {
            'default': '**.xml*',
            'help': 'file pattern within source base path to find files to process'
        }),
        ('--source-file-list', {
            'help': 'path to source file list'
        }),
        ('--source-file-list-column', {
            'default': 'xml_url',
            'help': 'the column to use when reading the source file list (if csv or tsv)'
        }),
    )
    source_options = parser.add_argument_group('source')
    for flag, extra_kwargs in source_option_specs:
        source_options.add_argument(flag, type=str, **extra_kwargs)

    parser.add_argument(
        '--output-path',
        type=str,
        required=True,
        help='output base path'
    )
    parser.add_argument(
        '--limit',
        type=int,
        required=False,
        help='limit the number of files to process'
    )

    # Boolean switches that are off unless passed explicitly.
    switch_specs = (
        ('--multi-processing',
         'enable multi processing rather than multi threading'),
        ('--no-log-file',
         'disable logging of file being processed'),
    )
    for flag, description in switch_specs:
        parser.add_argument(
            flag, action='store_true', default=False, help=description
        )

    add_cloud_args(parser)
Ejemplo n.º 4
0
 def test_should_accept_job_name_with_hyphen(self):
     """``--job-name=job1`` should be parsed into ``job_name``."""
     cloud_parser = add_cloud_args(argparse.ArgumentParser())
     parsed = cloud_parser.parse_args(['--job-name=job1'])
     assert parsed.job_name == 'job1'
Ejemplo n.º 5
0
 def test_should_accept_max_workers(self):
     """``--max-workers=123`` should be parsed into ``max_num_workers``."""
     cloud_parser = add_cloud_args(argparse.ArgumentParser())
     parsed = cloud_parser.parse_args(['--max-workers=123'])
     assert parsed.max_num_workers == 123
Ejemplo n.º 6
0
 def test_should_accept_num_workers_with_hyphen(self):
     """``--num-workers=123`` should be parsed into ``num_workers``."""
     cloud_parser = add_cloud_args(argparse.ArgumentParser())
     parsed = cloud_parser.parse_args(['--num-workers=123'])
     assert parsed.num_workers == 123
def add_annotation_pipeline_arguments(
        parser: argparse.ArgumentParser,
        default_matcher_lookahead_lines: int = 500):
    """Register the annotation pipeline command line arguments on ``parser``.

    Adds the ``source``, ``matcher`` and ``line number annotation`` argument
    groups plus a number of top-level options, and finally the cloud
    arguments via ``add_cloud_args``.

    Args:
        parser: the parser to add the arguments to (modified in place).
        default_matcher_lookahead_lines: default value used for
            ``--matcher-lookahead-lines``.

    Returns:
        The same ``parser`` instance, to allow call chaining.
    """
    source_group = parser.add_argument_group('source')
    source_group.add_argument(
        '--source-base-path',
        type=str,
        help='source base data path for files to auto-annotate')
    source_group.add_argument(
        '--source-path',
        type=str,
        help='source path to a specific file to auto-annotate')

    parser.add_argument('--output-path',
                        type=str,
                        required=True,
                        help='target training data path')

    parser.add_argument(
        '--failed-output-path',
        type=str,
        required=False,
        help=
        ('Target data path where documents should be saved to, if they fail quality checks.'
         ' Leave blank if those documents should not be saved.'))

    parser.add_argument('--limit',
                        type=int,
                        required=False,
                        help='limit the number of files to process')

    parser.add_argument('--xml-path',
                        type=str,
                        required=True,
                        help='path to xml file(s)')
    parser.add_argument(
        '--xml-filename-regex',
        type=str,
        required=True,
        help=
        'regular expression to transform source filename to target xml filename'
    )
    parser.add_argument(
        '--xml-mapping-path',
        type=str,
        default=get_default_config_path(DEFAULT_ANNOT_CONFIG_FILENAME),
        help='path to xml mapping file')

    parser.add_argument(
        '--no-preserve-tags',
        action='store_true',
        required=False,
        help=
        'do not preserve existing tags (tags other than the one being annotated)'
    )

    parser.add_argument(
        '--always-preserve-fields',
        type=comma_separated_str_to_list,
        help=
        'always preserve the listed fields (they will be excluded from the matcher)'
    )

    parser.add_argument(
        '--resume',
        action='store_true',
        default=False,
        help='resume conversion (skip files that already have an output file)')

    matcher_group = parser.add_argument_group('matcher')
    matcher_group.add_argument(
        '--matcher',
        type=str,
        choices=MATCHER_NAMES,
        default=DEFAULT_MATCHER_NAME,
        help=''.join([
            'the kind of matcher to use ("simple" uses a simpler algorith,',
            ' requiring correct reading order)'
        ]))
    matcher_group.add_argument(
        '--matcher-score-threshold',
        type=float,
        default=0.8,
        help='score threshold for a match to be accepted (1.0 for exact match)'
    )
    matcher_group.add_argument(
        '--matcher-lookahead-lines',
        type=int,
        default=default_matcher_lookahead_lines,
        help='simple matcher only: number of lines to try to find matches for')
    matcher_group.add_argument(
        '--debug-match',
        type=str,
        required=False,
        help='if set, path to csv or tsv file with debug matches')

    parser.add_argument(
        '--multi-processing',
        action='store_true',
        default=False,
        help='enable multi processing rather than multi threading')

    parser.add_argument('--skip-errors',
                        action='store_true',
                        default=False,
                        help='skip errors')

    line_no_group = parser.add_argument_group('line number annotation')
    # The enable / disable switches share one destination; whichever is
    # passed last on the command line wins, and the default is disabled.
    line_no_group.add_argument('--use-line-number-annotator',
                               dest='use_line_number_annotator',
                               action='store_true',
                               default=False,
                               help='Enable line number annotator')
    line_no_group.add_argument('--no-line-number-annotator',
                               dest='use_line_number_annotator',
                               action='store_false',
                               default=False,
                               help='Disable line number annotator')
    line_no_group.add_argument(
        '--min-line-numbers-per-page',
        type=int,
        default=DEFAULT_MIN_LINE_NUMBER_COUNT,
        help='minimum number of line number candidates on page to be considered'
    )
    line_no_group.add_argument(
        '--max-line-number-gap',
        type=int,
        default=DEFAULT_MAX_LINE_NUMBER_GAP,
        help=' '.join([
            'the maximum interval gap between line numbers',
            '(some documents only show line numbers on lines with text)'
        ]))
    line_no_group.add_argument(
        '--min-line-number-ratio',
        # A ratio is numeric: parse it as float so a value passed on the
        # command line is not handed on as a string.
        type=float,
        default=DEFAULT_LINE_NUMBER_RATIO_THRESHOLD,
        help=' '.join([
            'minimum ratio of line number candidates vs non-line number tokens',
            ' (first token of line)'
        ]))

    # NOTE(review): this option is registered on the line number group and
    # therefore listed under that heading in --help - confirm that is intended.
    line_no_group.add_argument(
        '--xml-mapping-overrides',
        type=parse_dict,
        help=' '.join([
            'override xml mapping values, in the format: key1=value1|key2=value2'
        ]))

    add_cloud_args(parser)
    return parser