Beispiel #1
0
    def _collect(self, *args, **kwargs) -> Iterator[Any]:
        # pylint: disable = unused-argument
        r"""Iterator over the documents stored in conllu files.

        Each path produced by ``dataset_path_iterator`` for the ``conllu``
        extension is read completely and split into documents. A document
        ends at the last line of the file or right before a line that starts
        with ``# newdoc`` (after stripping surrounding whitespace), so lines
        preceding the first such marker form a document of their own.

        Args:
            args: args[0] is the directory to the conllu files.
            kwargs: Unused.

        Returns: Iterator over documents, each given as the list of raw
            lines (as returned by ``readlines``) that belong to it.
        """
        conll_dir_path = args[0]

        for file_path in dataset_path_iterator(conll_dir_path, "conllu"):
            # The whole file is read up front, so the handle is released
            # before any document is handed to the consumer. Yielding inside
            # the ``with`` block would keep the file open for as long as the
            # generator stays suspended.
            with open(file_path, "r", encoding="utf8") as file:
                lines = file.readlines()

            doc_lines = []
            last_index = len(lines) - 1
            for i, line in enumerate(lines):
                doc_lines.append(line)
                # The current document ends at the last line of the file or
                # right before the next ``# newdoc`` marker.
                if i == last_index or \
                        lines[i + 1].strip().startswith("# newdoc"):
                    yield doc_lines
                    doc_lines = []
Beispiel #2
0
    def _collect(self, conll_directory: str) -> Iterator[Any]:  # type: ignore
        r"""Collect the ``gold_conll`` files of a directory.

        Args:
            conll_directory: directory that is handed to
                ``dataset_path_iterator`` to look up the files.

        Returns: the iterator created by ``dataset_path_iterator`` for the
            ``gold_conll`` extension.
        """
        extension = "gold_conll"
        return dataset_path_iterator(conll_directory, extension)
Beispiel #3
0
    def _collect(self, json_directory) -> Iterator[Any]:  # type: ignore
        r"""Collect the files of the folder given by ``json_directory``.

        Args:
            json_directory: directory expected to contain the json files.

        Returns: the iterator created by ``dataset_path_iterator``. Note
            that an empty extension string is passed, so no specific
            extension is requested by this method.
        """
        any_extension = ""
        return dataset_path_iterator(json_directory, any_extension)
Beispiel #4
0
    def _collect(self, text_directory) -> Iterator[Any]:  # type: ignore
        r"""Collect the text files of the folder given by ``text_directory``.

        The extension to look for is read from ``self.configs.file_ext``.

        Args:
            text_directory: directory containing the text files.

        Returns: the iterator created by ``dataset_path_iterator`` for the
            configured extension.
        """
        file_ext = self.configs.file_ext
        return dataset_path_iterator(text_directory, file_ext)
Beispiel #5
0
    def _collect(self, conll_directory) -> Iterator[Any]:  # type: ignore
        r"""Collect the ``conll`` files of a directory.

        The directory is logged at info level before the lookup.

        Args:
            conll_directory: directory containing the conll files.

        Returns: the iterator created by ``dataset_path_iterator`` for the
            ``conll`` extension.
        """
        logging.info("Reading .conll from %s", conll_directory)
        conll_paths = dataset_path_iterator(conll_directory, "conll")
        return conll_paths
    def _collect(self, text_directory) -> Iterator[Any]:
        r"""Collect the xml files of the folder given by ``text_directory``.

        Args:
            text_directory: directory containing the xml files.

        Returns: the iterator created by ``dataset_path_iterator`` for the
            ``.xml`` extension.
        """
        xml_extension = '.xml'
        return dataset_path_iterator(text_directory, xml_extension)
Beispiel #7
0
    def _collect(self, *args, **kwargs) -> Iterator[str]:
        # pylint: disable = unused-argument
        r"""Collect the ``txt`` files of the movie review directory.

        The directory is logged at info level before the lookup.

        Args:
            args: args[0] is the directory to the pos/neg movie files.
            kwargs: Unused.

        Returns: the iterator created by ``dataset_path_iterator`` for the
            ``txt`` extension.
        """
        source_dir: str = args[0]
        logging.info("Reading .txt from %s", source_dir)
        txt_paths = dataset_path_iterator(source_dir, "txt")
        return txt_paths
    def _collect(self, *args, **kwargs) -> Iterator[Any]:
        # pylint: disable = unused-argument
        r"""Collect the SemEval Task8 files of a directory.

        The extension to look for is read from
        ``self.configs.sem_eval_task8_file_extension``.

        Args:
            args: args[0] is the directory where the dataset is stored.
            kwargs: Unused.

        Returns: the iterator created by ``dataset_path_iterator`` for the
            configured extension.
        """
        dataset_dir: str = args[0]
        extension = self.configs.sem_eval_task8_file_extension
        return dataset_path_iterator(dataset_dir, extension)
Beispiel #9
0
    def _collect(self, *args, **kwargs) -> Iterator[str]:
        # pylint: disable = unused-argument
        r"""Collect the open IE files of a directory.

        The extension to look for is read from
        ``self.configs.oie_file_extension``; directory and extension are
        logged at info level before the lookup.

        Args:
            args: args[0] is the directory to the open ie files.
            kwargs: Unused.

        Returns: the iterator created by ``dataset_path_iterator`` for the
            configured extension.
        """
        source_dir: str = args[0]
        extension: str = self.configs.oie_file_extension
        logging.info(
            "Reading dataset from %s with extension %s",
            source_dir,
            extension,
        )
        oie_paths = dataset_path_iterator(source_dir, extension)
        return oie_paths
Beispiel #10
0
    def _collect(self, *args, **kwargs) -> Iterator[str]:
        # pylint: disable = unused-argument
        r"""Collect the qa files of a directory.

        The extension to look for is read from
        ``self.configs.qa_file_extension``; directory and extension are
        logged at info level before the lookup.

        Args:
            args: args[0] is the directory to the .qa files.
            kwargs: Unused.

        Returns: the iterator created by ``dataset_path_iterator`` for the
            configured extension.
        """
        source_dir: str = args[0]
        extension: str = self.configs.qa_file_extension
        logging.info(
            "Reading dataset from %s with extension %s",
            source_dir,
            extension,
        )
        qa_paths = dataset_path_iterator(source_dir, extension)
        return qa_paths
Beispiel #11
0
    def _collect(self, content) -> Iterator[str]:  # type: ignore
        r"""Could be called with a directory, a particular file location or a
        list of strings. If the string is an HTML string, it will be cleaned.

        Args:
            content: either a string, or list of string

        Returns: Iterator over the content based on type of input
        """
        if isinstance(content, str):
            # Check if directory
            if os.path.isdir(content):
                self.init_with_fileloc = True
                # TODO: maybe extend it to .txt also if need be?
                return dataset_path_iterator(content, ".html")
            # If file path to a single file, just return the filepath
            elif os.path.isfile(content):

                def data_yielder(data):
                    yield data

                self.init_with_fileloc = True
                return data_yielder(content)
            else:  # Treat it as a string
                content = [content]

        if isinstance(content, list):  # Must be a list of strings now
            self.init_with_html = True

            def data_iterator(data):
                for html_string in data:
                    yield html_string

            return data_iterator(content)

        else:
            raise TypeError(f"HTMLReader supports only strings and list of"
                            f" strings, Please make sure your inputs are"
                            f" correct!"
                            f"Found {type(content)} instead!")
Beispiel #12
0
 def _collect(self, text_directory: str) -> Iterator[Any]:  # type: ignore
     r"""Collect the files of the folder given by ``text_directory``.

     Args:
         text_directory: directory that is handed to
             ``dataset_path_iterator``.

     Returns: the iterator created by ``dataset_path_iterator``; an empty
         extension string is passed, so no specific extension is requested.
     """
     any_extension = ''
     return dataset_path_iterator(text_directory, any_extension)