Example #1
0
def run(opt):
    """Derive the output file list from the source file list and save it.

    Loads the source file list (resolved against ``opt.source_base_path``
    via ``join_if_relative_path``), maps it to a target file list using
    ``opt.output_base_path`` and ``opt.output_file_suffix``, optionally
    checks the target files, optionally converts them to paths relative
    to ``opt.output_base_path``, and finally writes the list to
    ``opt.output_file_list``.

    Args:
        opt: options object providing the ``source_*``, ``output_*``,
            ``limit``, ``check``, ``check_limit`` and
            ``use_relative_paths`` attributes read below.
    """
    source_file_list_path = join_if_relative_path(opt.source_base_path,
                                                  opt.source_file_list)
    source_file_list = load_file_list(source_file_list_path,
                                      column=opt.source_file_column,
                                      limit=opt.limit)
    source_base_path = get_or_validate_base_path(source_file_list,
                                                 opt.source_base_path)

    target_file_list = get_output_file_list(source_file_list,
                                            source_base_path,
                                            opt.output_base_path,
                                            opt.output_file_suffix)

    if opt.check:
        # A falsy check limit (None or 0) means: check every target file.
        if opt.check_limit:
            files_to_check = target_file_list[:opt.check_limit]
        else:
            files_to_check = target_file_list
        LOGGER.info('checking %d (out of %d) files...', len(files_to_check),
                    len(target_file_list))
        check_files_and_report_result(files_to_check)

    if opt.use_relative_paths:
        # The check above ran on the un-relativized list; only the saved
        # list is made relative to the output base path.
        target_file_list = to_relative_file_list(opt.output_base_path,
                                                 target_file_list)

    LOGGER.info('saving file list (with %d files) to: %s',
                len(target_file_list), opt.output_file_list)
    save_file_list(opt.output_file_list,
                   target_file_list,
                   column=opt.output_file_column)
Example #2
0
 def test_should_call_load_plain_file_list(self, load_plain_file_list_mock):
     """Check that load_file_list delegates to the plain file list loader.

     The loader mock must be called with the path and ``limit`` only, and
     its return value must be passed through unchanged.
     """
     load_kwargs = dict(column='url',
                        header=True,
                        limit=1,
                        to_absolute=False)
     actual = load_file_list('file-list.lst', **load_kwargs)
     load_plain_file_list_mock.assert_called_with('file-list.lst', limit=1)
     assert actual == load_plain_file_list_mock.return_value
def run(opt):
    """Train a model from the configured file list and save it.

    Loads the training file list from ``opt.source_file_list`` and, when
    ``opt.cv_source_file_list`` is set, a second file list that is passed
    to ``train_model`` as ``cv_file_list`` (otherwise ``None`` is passed).
    The trained model is saved to ``opt.output_path``.

    Args:
        opt: options object providing ``source_file_list``,
            ``source_file_column``, ``cv_source_file_list``,
            ``cv_source_file_column``, ``cv_source_tag_scope``, ``limit``,
            ``pages`` and ``output_path``.
    """
    file_list = load_file_list(opt.source_file_list,
                               opt.source_file_column,
                               limit=opt.limit)
    cv_file_list = None
    if opt.cv_source_file_list:
        cv_file_list = load_file_list(opt.cv_source_file_list,
                                      opt.cv_source_file_column,
                                      limit=opt.limit)
    # '%s' rather than '%d' for the limit: an unset limit (None) would make
    # '%d' fail inside the logging call and emit a logging error instead of
    # this message. For integer limits the rendered text is unchanged.
    get_logger().info('training using %d files (limit %s), page range: %s',
                      len(file_list), opt.limit, opt.pages)
    save_model(
        opt.output_path,
        train_model(file_list,
                    cv_file_list,
                    cv_source_tag_scope=opt.cv_source_tag_scope,
                    page_range=opt.pages))
Example #4
0
    def test_should_make_file_list_absolute(self, load_plain_file_list_mock,
                                            to_absolute_file_list_mock):
        """Check that ``to_absolute=True`` routes through to_absolute_file_list.

        The absolute-path helper mock must receive the directory of the file
        list path together with the loaded plain file list, and its return
        value must be what load_file_list returns.
        """
        actual = load_file_list('/base/path/file-list.lst',
                                column='url',
                                to_absolute=True)

        loaded_file_list = load_plain_file_list_mock.return_value
        to_absolute_file_list_mock.assert_called_with('/base/path',
                                                      loaded_file_list)
        assert actual == to_absolute_file_list_mock.return_value
Example #5
0
def _load_values(file_list_path, file_column, xpath, limit, namespaces):
    """Extract one value per file listed in the given file list.

    Args:
        file_list_path: path of the file list, passed to ``load_file_list``.
        file_column: column argument passed positionally to
            ``load_file_list``.
        xpath: forwarded to ``_extract_value_from_file`` for every file.
        limit: forwarded to ``load_file_list`` as ``limit``.
        namespaces: forwarded to ``_extract_value_from_file`` for every file.

    Returns:
        A list with the result of ``_extract_value_from_file`` for each
        file path, in file list order.
    """
    file_paths = load_file_list(file_list_path, file_column, limit=limit)
    values = []
    for file_path in file_paths:
        values.append(_extract_value_from_file(file_path, xpath, namespaces))
    return values
def get_file_list_for_args(args: argparse.Namespace):
    """Return the source files selected by the parsed command line arguments.

    When ``args.source_file_list`` is set, the file list at that path is
    loaded. Otherwise the files matching ``args.source_path`` are collected
    into a list. Both paths are resolved against ``args.base_data_path`` via
    ``join_if_relative_path`` and both lookups are given ``args.limit``.
    """
    if not args.source_file_list:
        source_path = join_if_relative_path(args.base_data_path,
                                            args.source_path)
        matching_filenames = find_matching_filenames_with_limit(
            source_path, limit=args.limit)
        return list(matching_filenames)

    file_list_path = join_if_relative_path(args.base_data_path,
                                           args.source_file_list)
    return load_file_list(file_list_path,
                          column=args.source_file_column,
                          limit=args.limit)
Example #7
0
 def get_source_file_list(self):
     """Return the list of source files, using the first configured source.

     Order of precedence:
       1. ``self.source_file_list_path``: load that file list.
       2. ``self.source_path``: a single-element list with that path.
       3. otherwise: all files matching ``self.source_filename_pattern``
          under ``self.source_base_path``.

     ``self.limit`` is applied to cases 1 and 3.
     """
     if self.source_file_list_path:
         return load_file_list(self.source_file_list_path,
                               column=self.source_file_list_column,
                               limit=self.limit)
     if self.source_path:
         return [self.source_path]
     filename_pattern = os.path.join(self.source_base_path,
                                     self.source_filename_pattern)
     matching_filenames = find_matching_filenames_with_limit(
         filename_pattern, limit=self.limit)
     return list(matching_filenames)
Example #8
0
def ReadFileList(file_list_path, column, limit=None):
    """Return a ``beam.Create`` built from the loaded file list.

    Args:
        file_list_path: path of the file list, passed to ``load_file_list``.
        column: forwarded to ``load_file_list`` as ``column``.
        limit: forwarded to ``load_file_list`` as ``limit`` (default None).
    """
    return beam.Create(
        load_file_list(file_list_path, column=column, limit=limit))
Example #9
0
def load_file_list_from_config(file_list_config, limit):
    """Build a ``FileList`` from a file list configuration mapping.

    Args:
        file_list_config: mapping with the keys ``'base_path'``,
            ``'file_list'`` and ``'file_column'``.
        limit: forwarded to ``load_file_list`` as ``limit``.

    Returns:
        A ``FileList`` with the configured base path and the loaded file list.
    """
    # Read the base path first to keep the original key lookup order.
    base_path = file_list_config['base_path']
    loaded_file_list = load_file_list(file_list_config['file_list'],
                                      column=file_list_config['file_column'],
                                      limit=limit)
    return FileList(base_path=base_path, file_list=loaded_file_list)
def run(opt):
    """Load the file list described by ``opt`` and check its files.

    Reads ``opt.file_list`` (using ``opt.file_column`` and ``opt.limit``)
    and hands the loaded list to ``check_files_and_report_result``.
    """
    check_files_and_report_result(
        load_file_list(opt.file_list,
                       column=opt.file_column,
                       limit=opt.limit))