def to_document(self, fpath: Path) -> List[ColDocument]:
    """Convert the file at ``fpath`` into a one-element list of ``ColDocument``.

    The document id, tags, title and text are all obtained from a single
    call to ``self._get_info(fpath)``, which is unpacked in that order.
    """
    doc_id, tag_list, heading, body = self._get_info(fpath)
    document = ColDocument(
        docid=models.KeywordField(doc_id),
        title=models.TextField(heading),
        text=models.TextField(body),
        tags=models.KeywordListField(tag_list),
    )
    return [document]
def to_document(self, fpath: Path) -> List[ColDocument]:
    """Parse the XML file at ``fpath`` into a one-element list of ``ColDocument``.

    The file is parsed with ``ET.parse`` and the root element is handed to
    the ``_get_docid``, ``_get_tags``, ``_get_title`` and ``_get_text``
    helpers, in that order.
    """
    resolved_path = str(fpath.resolve())
    tree = ET.parse(resolved_path)
    xml_root: ET.Element = tree.getroot()

    # Helpers are called in a fixed order: docid, tags, title, text.
    doc_id: str = self._get_docid(xml_root)
    tag_list: List[str] = self._get_tags(xml_root)
    heading: str = self._get_title(xml_root)
    body: str = self._get_text(xml_root)

    document = ColDocument(
        docid=models.KeywordField(doc_id),
        title=models.TextField(heading),
        text=models.TextField(body),
        tags=models.KeywordListField(tag_list),
    )
    return [document]
def _create_doc_from_values(cls,
                            docid: str,
                            title: str,
                            text: str,
                            tags: List[str]) -> ColDocument:
    """Assemble a ``ColDocument`` directly from plain values (for testing).

    Each value is wrapped in the matching ``models`` field type before it
    is passed to the ``ColDocument`` constructor.
    """
    # Dict literal keeps the field construction order: docid, title, text, tags.
    fields = {
        'docid': models.KeywordField(docid),
        'title': models.TextField(title),
        'text': models.TextField(text),
        'tags': models.KeywordListField(tags),
    }
    return ColDocument(**fields)
def to_document(self, fpath: Path) -> Generator[ColDocument, None, None]:
    """Yield one ``ColDocument`` per line of the file at ``fpath``.

    Every line is passed through ``self.escape`` and then parsed as a
    standalone XML element with ``ET.fromstring``. The file is read and
    escaped in full before the first document is yielded.
    """
    with open(fpath, 'r') as handle:
        raw_lines = handle.read().splitlines()
        # Escape everything up front, while the file is still open, so an
        # escaping failure surfaces before any document is produced.
        escaped_lines: List[str] = list(map(self.escape, raw_lines))

    for record in escaped_lines:
        element: ET.Element = ET.fromstring(record)
        doc_id: str = self._get_docid(element)
        tag_list: List[str] = self._get_tags(element)
        heading: str = self._get_title(element)
        body: str = self._get_text(element)
        document = ColDocument(
            docid=models.KeywordField(doc_id),
            title=models.TextField(heading),
            text=models.TextField(body),
            tags=models.KeywordListField(tag_list),
        )
        yield document
def to_paragraph(self, fpath: Path) -> List[ColParagraph]:
    """Split the XML file at ``fpath`` into one ``ColParagraph`` per paragraph.

    Args:
        fpath: Path of the XML file to parse.

    Returns:
        One ``ColParagraph`` for each entry returned by
        ``self._get_paragraph_list``, with ``paraid`` set to the entry's
        index in that list. An empty list is returned when paragraph
        extraction raises or yields nothing.

    Raises:
        Any exception raised by ``ET.parse``, ``self._get_docid`` or
        ``self._get_tags`` propagates to the caller.
    """
    root: ET.Element = ET.parse(str(fpath.resolve())).getroot()

    # text
    paras: List[str]
    try:
        paras = self._get_paragraph_list(root)
    except Exception as e:
        logger.warning(e, exc_info=True)
        logger.warning(
            'Could not find description field in the original XML.')
        # Bind a value so the emptiness check below returns [] instead of
        # failing with UnboundLocalError on an unassigned ``paras``.
        paras = []

    if not paras:
        logger.warning('No paragraphs found.')
        return []

    docid: str = self._get_docid(root)
    tags: List[str] = self._get_tags(root)

    return [
        ColParagraph(docid=models.KeywordField(docid),
                     paraid=models.IntField(paraid),
                     text=models.TextField(para),
                     tags=models.KeywordListField(tags))
        for paraid, para in enumerate(paras)
    ]