Example #1
0
    def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
        """Yield one ``Document`` per data row of the CSV file at ``fp``.

        Every cell of a row is handed to a parsing rule: the entry of
        ``self.parser_rule`` for that column index when one exists,
        otherwise ``column_constructor``. Each triple the rule produces is
        rendered with ``build_node``; the rendered pieces of one cell are
        joined and wrapped in a "section" node named after the column header
        (``None`` when ``self.header`` is false). The sections of a row are
        joined and wrapped in a single "doc" node, which becomes the
        document text.

        :param fp: Path of the CSV file; it is decoded with ``self.encoding``.
        :param file_name: Value stored under ``"file_name"`` in the ``meta``
            of every yielded document.
        :return: Iterator over the documents. Each is named
            ``"<file base name>:<row index>"``, where the row index counts
            data rows (header excluded) from 0.
        """
        # Strip only the final extension. rsplit leaves a name without any
        # "." untouched, whereas slicing at rfind(".") == -1 would silently
        # drop the last character of such a name.
        name = os.path.basename(fp).rsplit(".", 1)[0]
        with codecs.open(fp, encoding=self.encoding) as f:
            reader = csv.reader(f)

            # Load CSV header. The default keeps an empty file from raising
            # StopIteration inside this generator (which Python converts to
            # a RuntimeError); the row loop below then just yields nothing.
            header_names = None
            if self.header:
                header_names = next(reader, None)

            # Load document per row
            for i, row in enumerate(reader):
                sections = []
                for j, content in enumerate(row):
                    # A per-column rule overrides the default constructor.
                    rule = (self.parser_rule[j] if self.parser_rule is not None
                            and j in self.parser_rule else column_constructor)
                    content_header = (header_names[j]
                                      if header_names is not None else None)
                    context = [
                        build_node(t, n, c) for t, n, c in rule(content)
                    ]
                    sections.append(
                        build_node("section", content_header,
                                   "".join(context)))

                text = build_node("doc", None, "".join(sections))
                doc_name = name + ":" + str(i)
                stable_id = self._get_stable_id(doc_name)

                yield Document(
                    name=doc_name,
                    stable_id=stable_id,
                    text=text,
                    meta={"file_name": file_name},
                )
Example #2
0
 def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
     """Yield a single ``Document`` holding the whole content of ``fp``.

     The file is decoded with ``self.encoding``; its content, stripped of
     leading and trailing whitespace, is wrapped in a "text" node nested in
     a "doc" node. The document is named after the file's base name with
     its last extension removed.

     :param fp: Path of the file to read.
     :param file_name: Value stored under ``"file_name"`` in the document's
         ``meta``.
     :return: Iterator producing exactly one document.
     """
     with codecs.open(fp, encoding=self.encoding) as handle:
         base_name = os.path.basename(fp)
         # Drop everything after the last "." (the extension), if any.
         doc_name = base_name.rsplit(".", 1)[0]
         doc_id = self._get_stable_id(doc_name)

         stripped = handle.read().strip()
         text_node = build_node("text", None, stripped)
         doc_node = build_node("doc", None, text_node)

         document = Document(
             name=doc_name,
             stable_id=doc_id,
             text=doc_node,
             meta={"file_name": file_name},
         )
         yield document
Example #3
0
 def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
     """Yield one ``Document`` per line of the tab-separated file ``fp``.

     Each line is expected to look like ``<doc name><TAB><doc text>``. The
     text is wrapped in a "text" node nested in a "doc" node. When
     ``self.header`` is true the first line is discarded.

     :param fp: Path of the TSV file; it is decoded with ``self.encoding``.
     :param file_name: Value stored under ``"file_name"`` in the ``meta``
         of every yielded document.
     :return: Iterator over the documents, one per non-blank line.
     :raises ValueError: If a non-blank line contains no tab separator.
     """
     with codecs.open(fp, encoding=self.encoding) as tsv:
         if self.header:
             # Discard the header line.
             tsv.readline()
         for line in tsv:
             # A blank line carries no document; skip it instead of failing
             # on the unpacking below.
             if not line.strip():
                 continue
             # Split on the first tab only, so tabs inside the document
             # text stay part of the text rather than breaking the split.
             doc_name, separator, doc_text = line.partition("\t")
             if not separator:
                 raise ValueError(
                     f"Malformed line in {file_name}: expected "
                     f"'<name>\\t<text>', got {line!r}"
                 )
             stable_id = self._get_stable_id(doc_name)
             # doc_text keeps its line ending, exactly as read from the file.
             text = build_node("doc", None, build_node("text", None, doc_text))
             yield Document(
                 name=doc_name,
                 stable_id=stable_id,
                 text=text,
                 meta={"file_name": file_name},
             )