def run(opt):
    """Derive the output file list from the source file list and save it.

    Reads the source file list (resolved against ``opt.source_base_path``),
    maps it to target files under ``opt.output_base_path`` with
    ``opt.output_file_suffix``, optionally checks the target files, optionally
    converts them to paths relative to the output base path, and finally
    writes the resulting list to ``opt.output_file_list``.

    Args:
        opt: parsed options; the attributes read are those referenced below.
    """
    source_list_path = join_if_relative_path(
        opt.source_base_path, opt.source_file_list
    )
    source_file_list = load_file_list(
        source_list_path,
        column=opt.source_file_column,
        limit=opt.limit,
    )
    source_base_path = get_or_validate_base_path(
        source_file_list, opt.source_base_path
    )
    target_file_list = get_output_file_list(
        source_file_list,
        source_base_path,
        opt.output_base_path,
        opt.output_file_suffix,
    )

    if opt.check:
        # A falsy check limit (None or 0) means "check every target file".
        if opt.check_limit:
            check_file_list = target_file_list[:opt.check_limit]
        else:
            check_file_list = target_file_list
        LOGGER.info(
            'checking %d (out of %d) files...',
            len(check_file_list), len(target_file_list)
        )
        check_files_and_report_result(check_file_list)

    if opt.use_relative_paths:
        target_file_list = to_relative_file_list(
            opt.output_base_path, target_file_list
        )

    LOGGER.info(
        'saving file list (with %d files) to: %s',
        len(target_file_list), opt.output_file_list
    )
    save_file_list(
        opt.output_file_list,
        target_file_list,
        column=opt.output_file_column,
    )
def test_should_call_load_plain_file_list(self, load_plain_file_list_mock):
    """A ``.lst`` path should be passed to the plain file list loader.

    Verifies that only the path and the limit are forwarded to the mocked
    loader, and that its return value is returned unchanged when
    ``to_absolute`` is False.
    """
    actual = load_file_list(
        'file-list.lst',
        column='url',
        header=True,
        limit=1,
        to_absolute=False,
    )

    load_plain_file_list_mock.assert_called_with('file-list.lst', limit=1)
    assert actual == load_plain_file_list_mock.return_value
def run(opt):
    """Train a model from the configured file list and save it.

    Loads the training file list and, when ``opt.cv_source_file_list`` is
    set, a second file list that is passed to ``train_model`` alongside it
    (otherwise ``None`` is passed). The trained model is written to
    ``opt.output_path`` via ``save_model``.

    Args:
        opt: parsed options; the attributes read are those referenced below.
    """
    file_list = load_file_list(
        opt.source_file_list,
        opt.source_file_column,
        limit=opt.limit,
    )
    if opt.cv_source_file_list:
        cv_file_list = load_file_list(
            opt.cv_source_file_list,
            opt.cv_source_file_column,
            limit=opt.limit,
        )
    else:
        cv_file_list = None

    # The limit is formatted with '%s' rather than '%d': an unset limit
    # (None) cannot be rendered by '%d' and would turn this log call into a
    # logging error. Integers render identically with either specifier.
    get_logger().info(
        'training using %d files (limit %s), page range: %s',
        len(file_list), opt.limit, opt.pages
    )

    model = train_model(
        file_list,
        cv_file_list,
        cv_source_tag_scope=opt.cv_source_tag_scope,
        page_range=opt.pages,
    )
    save_model(opt.output_path, model)
def test_should_make_file_list_absolute(
        self, load_plain_file_list_mock, to_absolute_file_list_mock):
    """With ``to_absolute=True`` the loaded list should be made absolute.

    Verifies that the mocked absolute-path converter receives the directory
    of the file list together with the plain loader's result, and that the
    converter's return value is what ``load_file_list`` returns.
    """
    actual = load_file_list(
        '/base/path/file-list.lst',
        column='url',
        to_absolute=True,
    )

    to_absolute_file_list_mock.assert_called_with(
        '/base/path',
        load_plain_file_list_mock.return_value,
    )
    assert actual == to_absolute_file_list_mock.return_value
def _load_values(file_list_path, file_column, xpath, limit, namespaces):
    """Extract one value per file listed in the given file list.

    Args:
        file_list_path: path of the file list to load.
        file_column: column passed on to ``load_file_list``.
        xpath: expression passed on to ``_extract_value_from_file``.
        limit: limit passed on to ``load_file_list``.
        namespaces: namespaces passed on to ``_extract_value_from_file``.

    Returns:
        A list with the extracted value for each file, in file list order.
    """
    values = []
    for path in load_file_list(file_list_path, file_column, limit=limit):
        values.append(_extract_value_from_file(path, xpath, namespaces))
    return values
def get_file_list_for_args(args: argparse.Namespace):
    """Return the files selected by the command line arguments.

    When ``args.source_file_list`` is set, the file list is loaded from that
    path; otherwise the filenames matching ``args.source_path`` are
    collected. Both paths are joined with ``args.base_data_path`` via
    ``join_if_relative_path`` and both branches honour ``args.limit``.
    """
    if not args.source_file_list:
        source_path = join_if_relative_path(
            args.base_data_path, args.source_path
        )
        matching_filenames = find_matching_filenames_with_limit(
            source_path, limit=args.limit
        )
        return list(matching_filenames)

    file_list_path = join_if_relative_path(
        args.base_data_path, args.source_file_list
    )
    return load_file_list(
        file_list_path,
        column=args.source_file_column,
        limit=args.limit,
    )
def get_source_file_list(self):
    """Return the source files to process.

    Precedence, highest first:

    1. ``self.source_file_list_path``: load the list from that file.
    2. ``self.source_path``: a single-element list with that path.
    3. Otherwise: the filenames matching ``self.source_filename_pattern``
       under ``self.source_base_path``.

    ``self.limit`` is applied in the first and third case.
    """
    if not self.source_file_list_path:
        if self.source_path:
            return [self.source_path]
        filename_pattern = os.path.join(
            self.source_base_path, self.source_filename_pattern
        )
        matching_filenames = find_matching_filenames_with_limit(
            filename_pattern, limit=self.limit
        )
        return list(matching_filenames)

    return load_file_list(
        self.source_file_list_path,
        column=self.source_file_list_column,
        limit=self.limit,
    )
def ReadFileList(file_list_path, column, limit=None):
    """Return a ``beam.Create`` transform over the entries of a file list.

    Args:
        file_list_path: path of the file list to load.
        column: column passed on to ``load_file_list``.
        limit: optional limit passed on to ``load_file_list``.
    """
    return beam.Create(
        load_file_list(file_list_path, column=column, limit=limit)
    )
def load_file_list_from_config(file_list_config, limit):
    """Build a ``FileList`` from a file list configuration mapping.

    Args:
        file_list_config: mapping providing the ``'base_path'``,
            ``'file_list'`` and ``'file_column'`` entries.
        limit: limit passed on to ``load_file_list``.

    Returns:
        A ``FileList`` holding the configured base path and the loaded list.
    """
    base_path = file_list_config['base_path']
    loaded_file_list = load_file_list(
        file_list_config['file_list'],
        column=file_list_config['file_column'],
        limit=limit,
    )
    return FileList(base_path=base_path, file_list=loaded_file_list)
def run(opt):
    """Load the configured file list and check its files.

    Args:
        opt: parsed options providing ``file_list``, ``file_column`` and
            ``limit``.
    """
    check_files_and_report_result(
        load_file_list(
            opt.file_list,
            column=opt.file_column,
            limit=opt.limit,
        )
    )