def _collect(self, *args, **kwargs) -> Iterator[Any]:
    # pylint: disable = no-self-use, unused-argument
    """
    Yield the raw lines of every document found in the conllu files that
    `dataset_path_iterator` returns for a directory.

    Args:
        args: args[0] is the directory to the conllu files.
        kwargs: not used by this method.

    Returns: an iterator over lists of lines, one list per document. A
        document ends at the end of its file or right before a line that,
        once stripped, starts with "# newdoc".
    """
    conll_dir_path = args[0]

    for file_path in dataset_path_iterator(conll_dir_path, "conllu"):
        # Read the whole file and leave the `with` block before yielding:
        # yielding while the file is open would keep the handle alive for
        # as long as the generator stays suspended (or is abandoned).
        with open(file_path, "r", encoding="utf8") as file:
            lines = file.readlines()

        doc_lines = []
        for line in lines:
            # A "# newdoc" marker opens the next document, so hand out the
            # lines gathered for the previous one first. The `doc_lines`
            # guard avoids an empty document when the marker is line one.
            if doc_lines and line.strip().startswith("# newdoc"):
                yield doc_lines
                doc_lines = []
            doc_lines.append(line)

        # The end of the file closes the last document of that file.
        if doc_lines:
            yield doc_lines
def _collect(self, conll_directory: str) -> Iterator[Any]:  # type: ignore
    """
    Collect the *.gold_conll entries of a directory.

    Args:
        conll_directory: path to the directory that is searched.

    Returns:
        Whatever `dataset_path_iterator` produces for `conll_directory`
        with the "gold_conll" extension.
    """
    extension = "gold_conll"
    gold_conll_paths = dataset_path_iterator(conll_directory, extension)
    return gold_conll_paths
def _collect(self, conll_directory) -> Iterator[Any]:  # type: ignore
    """
    Log the directory being read, then collect its conll entries.

    Args:
        conll_directory: directory that is searched for conll files.

    Returns:
        Whatever `dataset_path_iterator` produces for `conll_directory`
        with the "conll" extension.
    """
    extension = "conll"
    # Keep the log call ahead of the lookup so the message is emitted
    # before the helper is invoked.
    logging.info("Reading .conll from %s", conll_directory)
    conll_paths = dataset_path_iterator(conll_directory, extension)
    return conll_paths
def _collect(self, text_directory) -> Iterator[Any]:  # type: ignore
    """
    Collect the txt entries of a directory.

    Args:
        text_directory: path to the folder that is searched for txt files.

    Returns:
        Whatever `dataset_path_iterator` produces for `text_directory`
        with the ".txt" extension.
    """
    extension = ".txt"
    txt_paths = dataset_path_iterator(text_directory, extension)
    return txt_paths
def _collect(self, json_directory) -> Iterator[Any]:  # type: ignore
    """
    Collect the entries of a directory that is expected to hold json files.

    Args:
        json_directory: path to the folder containing the json files.

    Returns:
        Whatever `dataset_path_iterator` produces for `json_directory`
        with an empty extension string.
    """
    # NOTE(review): the extension passed on is empty, so the result is
    # presumably not restricted to ".json" names -- confirm against
    # `dataset_path_iterator` before relying on that.
    any_extension = ""
    json_paths = dataset_path_iterator(json_directory, any_extension)
    return json_paths
def _collect(self, text_directory: str) -> Iterator[Any]:  # type: ignore
    """
    Collect the entries of a text directory.

    Args:
        text_directory: path to the directory that is searched.

    Returns:
        Whatever `dataset_path_iterator` produces for `text_directory`
        with an empty extension string.
    """
    any_extension = ''
    text_paths = dataset_path_iterator(text_directory, any_extension)
    return text_paths