def default_output_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default output-file loaders keyed by file type.

    Only JSON output is supported; every field is copied through under
    its own name (source key == target key).
    """
    json_fields = [
        FileLoaderField("predict", "predict", str),
        FileLoaderField("predictions", "predictions", list),
        FileLoaderField("true_rank", "true_rank", int),
    ]
    return {FileType.json: JSONFileLoader(json_fields)}
def default_output_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default output-file loaders keyed by file type.

    CoNLL output reads predicted tags from column 1; JSON output reads
    "tokens" and "predicted_tags" (stored as "pred_tags").
    """
    conll_loader = CoNLLFileLoader([FileLoaderField(1, "pred_tags", str)])
    json_loader = JSONFileLoader([
        FileLoaderField("tokens", "tokens", list),
        FileLoaderField("predicted_tags", "pred_tags", list),
    ])
    return {FileType.conll: conll_loader, FileType.json: json_loader}
def default_dataset_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default dataset loaders keyed by file type.

    JSON and datalab sources share the same field specification:
    'text' (str) and 'edits' (dict).
    """
    def make_fields():
        # Fresh FileLoaderField instances per loader, matching the
        # original behavior of building two separate lists.
        return [
            FileLoaderField('text', 'text', str),
            FileLoaderField('edits', 'edits', dict),
        ]

    return {
        FileType.json: JSONFileLoader(make_fields()),
        FileType.datalab: DatalabFileLoader(make_fields()),
    }
def default_dataset_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default dataset loaders keyed by file type.

    CoNLL reads tokens/tags from columns 0 and 1; datalab reads the
    "tokens" and "tags" fields. Both store to "tokens"/"true_tags".
    """
    tokens_target, tags_target = "tokens", "true_tags"
    return {
        FileType.conll: CoNLLFileLoader([
            FileLoaderField(0, tokens_target, str),
            FileLoaderField(1, tags_target, str),
        ]),
        FileType.datalab: DatalabFileLoader([
            FileLoaderField("tokens", tokens_target, list),
            FileLoaderField("tags", tags_target, list),
        ]),
    }
def default_dataset_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default dataset loaders keyed by file type.

    Plain text needs no field spec; JSON and datalab both read a
    "text" field of type str.
    """
    def text_field():
        # One fresh field object per loader (no shared instances).
        return FileLoaderField("text", "text", str)

    return {
        FileType.text: TextFileLoader(),
        FileType.json: JSONFileLoader([text_field()]),
        FileType.datalab: DatalabFileLoader([text_field()]),
    }
def default_dataset_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default KG link-prediction dataset loaders.

    Downloads (or reads from cache) an entity->wikidata dictionary and
    wraps it in a ``KGMapPreprocessor`` used to "decipher" the raw head
    and tail entity ids into readable forms. Each of head/tail is
    therefore loaded twice: once verbatim and once through the parser.

    Note: this method performs network/disk I/O via
    ``cache_api.cache_online_file`` on first use.
    """
    file_path = cache_api.cache_online_file(
        'http://phontron.com/download/explainaboard/pre_computed/kg/entity2wikidata.json',  # noqa
        'pre_computed/kg/entity2wikidata.json',
    )
    # json.load streams directly from the file object instead of the
    # previous json.loads(file.read()) round-trip through a string.
    with open(file_path, 'r') as file:
        entity_dic = json.load(file)
    map_preprocessor = KGMapPreprocessor(
        resources={"dictionary": entity_dic})
    return {
        FileType.json: JSONFileLoader([
            FileLoaderField("gold_head", "true_head", str),
            FileLoaderField(
                "gold_head", "true_head_decipher", str,
                parser=map_preprocessor),
            FileLoaderField("gold_predicate", "true_link", str),
            FileLoaderField("gold_tail", "true_tail", str),
            FileLoaderField(
                "gold_tail", "true_tail_decipher", str,
                parser=map_preprocessor),
        ]),
        FileType.datalab: DatalabFileLoader([
            FileLoaderField("head", "true_head", str),
            FileLoaderField(
                "head", "true_head_decipher", str,
                parser=map_preprocessor),
            FileLoaderField("link", "true_link", str),
            FileLoaderField("tail", "true_tail", str),
            FileLoaderField(
                "tail", "true_tail_decipher", str,
                parser=map_preprocessor),
        ]),
    }
def default_output_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default output loaders; JSON only.

    The "predicted_answers" dict is copied through under its own name.
    """
    answers = FileLoaderField("predicted_answers", "predicted_answers", dict)
    return {FileType.json: JSONFileLoader([answers])}
def test_tsv_validation(self):
    """A TSV loader built with a string column index must raise ValueError."""
    with self.assertRaises(ValueError):
        TSVFileLoader(
            [FileLoaderField("0", "field0", str)], use_idx_as_id=True
        )
def default_output_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default output loaders for hypothesis text.

    Plain text treats each line as the hypothesis; JSON reads a
    "hypothesis" field.
    """
    hyp = "hypothesis"
    return {
        FileType.text: TextFileLoader(hyp, str),
        FileType.json: JSONFileLoader([FileLoaderField(hyp, hyp, str)]),
    }
def test_conll_loader(self):
    """Tab- and space-delimited CoNLL files must load identically."""
    ner_dir = os.path.join(test_artifacts_path, "ner")
    tab_file = os.path.join(ner_dir, "dataset.tsv")
    space_file = os.path.join(ner_dir, "dataset-space.tsv")
    gold_loader = CoNLLFileLoader([
        FileLoaderField(0, 'tokens', str),
        FileLoaderField(1, 'true_tags', str),
    ])
    pred_loader = CoNLLFileLoader([
        FileLoaderField(1, 'pred_tags', str),
    ])
    # Same assertion for both loaders: the delimiter must not matter.
    for loader in (gold_loader, pred_loader):
        from_tabs = loader.load(tab_file, Source.local_filesystem)
        from_spaces = loader.load(space_file, Source.local_filesystem)
        self.assertEqual(from_tabs, from_spaces)
def _statistics_func(self, samples: Iterator, sys_info: SysOutputInfo):
    """Accumulate source and target vocabulary statistics over samples.

    Each sample's translation text is looked up by language code via
    ``FileLoader.find_field`` and tokenized with the corresponding
    tokenizer from ``sys_info``.

    Raises:
        ValueError: if either the source or target language is unset,
            since the translation fields are keyed by language code.
    """
    if sys_info.source_language is None or sys_info.target_language is None:
        # Fixed a stray space before the comma in the original message
        # ("source=X , target=Y").
        raise ValueError(
            'source or target languages must be specified to load '
            f'translation data, but source={sys_info.source_language}'
            f', target={sys_info.target_language}'
        )
    src = FileLoaderField(('translation', sys_info.source_language), '', str)
    trg = FileLoaderField(('translation', sys_info.target_language), '', str)
    return {
        'source_vocab': accumulate_vocab_from_samples(
            samples,
            lambda x: FileLoader.find_field(x, src),
            unwrap(sys_info.source_tokenizer),
        ),
        'target_vocab': accumulate_vocab_from_samples(
            samples,
            lambda x: FileLoader.find_field(x, trg),
            unwrap(sys_info.target_tokenizer),
        ),
    }
def default_dataset_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default dataset loaders keyed by file type.

    JSON and datalab share an identical field spec (source names equal
    target names): context, options, question_mark, answers.
    """
    def make_fields():
        # Fresh field instances per loader, as in the original.
        return [
            FileLoaderField("context", "context", str),
            FileLoaderField("options", "options", list),
            FileLoaderField("question_mark", "question_mark", str),
            FileLoaderField("answers", "answers", dict),
        ]

    return {
        FileType.json: JSONFileLoader(make_fields()),
        FileType.datalab: DatalabFileLoader(make_fields()),
    }
def default_dataset_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default dataset loaders for a text-pair task.

    TSV reads columns 0-2; JSON reads text1/text2/true_label; datalab
    reads text1/text2/label. All store to text1/text2/true_label.
    """
    first, second, label = "text1", "text2", "true_label"
    return {
        FileType.tsv: TSVFileLoader([
            FileLoaderField(0, first, str),
            FileLoaderField(1, second, str),
            FileLoaderField(2, label, str),
        ]),
        FileType.json: JSONFileLoader([
            FileLoaderField("text1", first, str),
            FileLoaderField("text2", second, str),
            FileLoaderField("true_label", label, str),
        ]),
        FileType.datalab: DatalabFileLoader([
            FileLoaderField("text1", first, str),
            FileLoaderField("text2", second, str),
            # datalab stores the gold label under "label".
            FileLoaderField("label", label, str),
        ]),
    }
def default_dataset_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default dataset loaders for aspect-based data.

    TSV reads columns 0-2; JSON and datalab read keys named exactly
    like the targets: aspect, text, true_label.
    """
    def keyed_fields():
        # Source names equal target names for the keyed formats.
        return [
            FileLoaderField("aspect", "aspect", str),
            FileLoaderField("text", "text", str),
            FileLoaderField("true_label", "true_label", str),
        ]

    return {
        FileType.tsv: TSVFileLoader([
            FileLoaderField(0, "aspect", str),
            FileLoaderField(1, "text", str),
            FileLoaderField(2, "true_label", str),
        ]),
        FileType.json: JSONFileLoader(keyed_fields()),
        FileType.datalab: DatalabFileLoader(keyed_fields()),
    }
def test_load_in_memory_tsv(self):
    """In-memory TSV dataset plus text output yields merged samples."""
    output_path = os.path.join(
        test_artifacts_path, "text_classification", "output.txt")
    loader = Loader(
        dataset_data=load_file_as_str(self.dataset),
        output_data=load_file_as_str(output_path),
        dataset_source=Source.in_memory,
        output_source=Source.in_memory,
        dataset_file_type=FileType.tsv,
        output_file_type=FileType.text,
        dataset_file_loader=TSVFileLoader(
            [FileLoaderField(0, "field0", str)], use_idx_as_id=True),
        output_file_loader=TextFileLoader("output", str),
    )
    samples = list(loader.load())
    self.assertEqual(len(samples), 10)
    # Each sample should carry the id, the dataset field, and the output.
    self.assertEqual(set(samples[0].keys()), {"id", "field0", "output"})
def default_dataset_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default dataset loaders keyed by file type.

    Source names come from cls.JSON_FIELDS (keyed formats) or column
    position (TSV); targets come from cls.OUTPUT_FIELDS.
    """
    out_a, out_b = cls.OUTPUT_FIELDS[0], cls.OUTPUT_FIELDS[1]
    key_a, key_b = cls.JSON_FIELDS[0], cls.JSON_FIELDS[1]
    return {
        FileType.tsv: TSVFileLoader([
            FileLoaderField(0, out_a, str),
            FileLoaderField(1, out_b, str),
        ]),
        FileType.json: JSONFileLoader([
            FileLoaderField(key_a, out_a, str),
            FileLoaderField(key_b, out_b, str),
        ]),
        FileType.datalab: DatalabFileLoader([
            FileLoaderField(key_a, out_a, str),
            FileLoaderField(key_b, out_b, str),
        ]),
    }
def default_dataset_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default QA dataset loaders keyed by file type.

    context and question are loaded with strip_before_parsing=False;
    answers is loaded without an explicit dtype.
    # NOTE(review): presumably whitespace is preserved because answer
    # offsets depend on the raw text — confirm with FileLoaderField docs.
    """
    def make_fields():
        # Shared spec keeps the JSON and datalab loaders in sync;
        # source names equal target names in both formats.
        return [
            FileLoaderField(
                "context", "context", str, strip_before_parsing=False),
            FileLoaderField(
                "question", "question", str, strip_before_parsing=False),
            FileLoaderField("answers", "answers"),
        ]

    return {
        FileType.json: JSONFileLoader(make_fields()),
        FileType.datalab: DatalabFileLoader(make_fields()),
    }
def default_output_file_loaders(cls) -> dict[FileType, FileLoader]:
    """Return the default output loaders; JSON only.

    The "predicted_edits" dict is copied through under its own name.
    """
    edits = FileLoaderField("predicted_edits", "predicted_edits", dict)
    return {FileType.json: JSONFileLoader([edits])}
def test_text_file_loader_validate(self):
    """Adding extra fields to a TextFileLoader must raise ValueError."""
    text_loader = TextFileLoader(target_name="prediction", dtype=int)
    with self.assertRaises(ValueError):
        text_loader.add_fields([FileLoaderField("test", "test", str)])