def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
    """Yield one ``Document`` per data row of the CSV file at ``fp``.

    Every cell of a row is passed through a rule callable (the entry of
    ``self.parser_rule`` keyed by the column index when present, otherwise
    ``column_constructor``).  The ``(t, n, c)`` triples produced by the rule
    are turned into nodes with ``build_node``, joined, and wrapped in a
    ``"section"`` node named after the column header (``None`` when the file
    has no header).  The sections of a row are joined and wrapped in a
    ``"doc"`` node.

    :param fp: Path of the CSV file to read.
    :param file_name: Value stored under ``"file_name"`` in each document's
        ``meta`` dict.
    :return: Iterator of documents named ``<base name>:<row index>``, where
        the row index starts at 0 and does not count the header row.
    """
    # rsplit keeps the whole base name when it has no extension; slicing at
    # rfind(".") would cut off the last character because rfind returns -1.
    name = os.path.basename(fp).rsplit(".", 1)[0]
    with codecs.open(fp, encoding=self.encoding) as f:
        reader = csv.reader(f)

        # Load CSV header. The default keeps an empty file from raising
        # StopIteration inside this generator (a RuntimeError under PEP 479);
        # an empty file simply yields no documents.
        header_names = None
        if self.header:
            header_names = next(reader, None)

        # Load document per row
        for i, row in enumerate(reader):
            sections = []
            for j, content in enumerate(row):
                if self.parser_rule is not None and j in self.parser_rule:
                    rule = self.parser_rule[j]
                else:
                    rule = column_constructor
                content_header = (
                    header_names[j] if header_names is not None else None
                )
                context = [build_node(t, n, c) for t, n, c in rule(content)]
                sections.append(
                    build_node("section", content_header, "".join(context))
                )

            text = build_node("doc", None, "".join(sections))
            doc_name = name + ":" + str(i)
            stable_id = self._get_stable_id(doc_name)

            yield Document(
                name=doc_name,
                stable_id=stable_id,
                text=text,
                meta={"file_name": file_name},
            )
def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
    """Yield a single ``Document`` holding the whole content of ``fp``.

    The document is named after the file's base name with the text after
    its last ``.`` removed.  The file content is stripped of leading and
    trailing whitespace, wrapped in a ``"text"`` node and then in a
    ``"doc"`` node via ``build_node``.

    :param fp: Path of the text file to read.
    :param file_name: Value stored under ``"file_name"`` in the document's
        ``meta`` dict.
    :return: Iterator producing exactly one document.
    """
    with codecs.open(fp, encoding=self.encoding) as handle:
        base_name = os.path.basename(fp)
        doc_name = base_name.rsplit(".", 1)[0]
        doc_id = self._get_stable_id(doc_name)

        body = handle.read().strip()
        text_node = build_node("text", None, body)
        doc_markup = build_node("doc", None, text_node)
        metadata = {"file_name": file_name}

        yield Document(
            name=doc_name,
            stable_id=doc_id,
            text=doc_markup,
            meta=metadata,
        )
def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
    """Yield one ``Document`` per line of the two-column TSV file at ``fp``.

    Each line must consist of a document name and the document text,
    separated by a single tab.  The text (including the line's trailing
    line ending, which is left untouched) is wrapped in a ``"text"`` node
    and then in a ``"doc"`` node via ``build_node``.  When ``self.header``
    is truthy the first line of the file is skipped.

    :param fp: Path of the TSV file to read.
    :param file_name: Value stored under ``"file_name"`` in each document's
        ``meta`` dict.
    :return: Iterator of documents, one per non-blank line.
    :raises ValueError: If a non-blank line does not contain exactly two
        tab-separated fields.
    """
    with codecs.open(fp, encoding=self.encoding) as tsv:
        first_line_no = 1
        if self.header:
            tsv.readline()
            first_line_no = 2

        for line_no, line in enumerate(tsv, start=first_line_no):
            # A blank line (e.g. stray empty line at the end of the file)
            # carries no record; skip it instead of aborting the whole parse.
            if not line.rstrip("\r\n"):
                continue

            fields = line.split("\t")
            if len(fields) != 2:
                # Same exception type as the bare tuple-unpacking failure,
                # but pointing at the offending line.
                raise ValueError(
                    "{}:{}: expected 2 tab-separated fields, got {}".format(
                        fp, line_no, len(fields)
                    )
                )
            doc_name, doc_text = fields

            stable_id = self._get_stable_id(doc_name)
            text = build_node("doc", None, build_node("text", None, doc_text))
            yield Document(
                name=doc_name,
                stable_id=stable_id,
                text=text,
                meta={"file_name": file_name},
            )