Beispiel #1
0
 def to_document(self, fpath: Path) -> List[ColDocument]:
     """Convert the file at ``fpath`` into a one-element document list.

     ``self._get_info(fpath)`` supplies a 4-item result that is unpacked
     as ``(docid, tags, title, text)``; each item is wrapped in the
     matching ``models`` field type before building the ``ColDocument``.

     Args:
         fpath: Path handed to ``self._get_info``.

     Returns:
         A list containing exactly one ``ColDocument``.
     """
     info = self._get_info(fpath)
     docid, tags, title, text = info

     document = ColDocument(
         docid=models.KeywordField(docid),
         title=models.TextField(title),
         text=models.TextField(text),
         tags=models.KeywordListField(tags),
     )
     return [document]
Beispiel #2
0
    def to_document(self, fpath: Path) -> List[ColDocument]:
        """Parse the XML file at ``fpath`` into a one-element document list.

        The file is parsed with ``ET.parse`` and the root element is passed
        to the ``_get_docid`` / ``_get_tags`` / ``_get_title`` /
        ``_get_text`` helpers (called in that order).

        Args:
            fpath: Location of the XML file; it is resolved to an absolute
                path before parsing.

        Returns:
            A list containing exactly one ``ColDocument``.
        """
        xml_path = str(fpath.resolve())
        tree = ET.parse(xml_path)
        root: ET.Element = tree.getroot()

        # Keep the extraction order: docid, tags, title, text.
        doc_id: str = self._get_docid(root)
        tag_list: List[str] = self._get_tags(root)
        doc_title: str = self._get_title(root)
        body: str = self._get_text(root)

        document = ColDocument(
            docid=models.KeywordField(doc_id),
            title=models.TextField(doc_title),
            text=models.TextField(body),
            tags=models.KeywordListField(tag_list),
        )
        return [document]
Beispiel #3
0
 def _create_doc_from_values(cls,
                             docid: str,
                             title: str,
                             text: str,
                             tags: List[str]) -> ColDocument:
     """Build a ``ColDocument`` directly from plain values (test helper).

     Args:
         docid: Value wrapped in ``models.KeywordField``.
         title: Value wrapped in ``models.TextField``.
         text: Value wrapped in ``models.TextField``.
         tags: Values wrapped in ``models.KeywordListField``.

     Returns:
         The ``ColDocument`` assembled from the wrapped fields.
     """
     fields = {
         'docid': models.KeywordField(docid),
         'title': models.TextField(title),
         'text': models.TextField(text),
         'tags': models.KeywordListField(tags),
     }
     return ColDocument(**fields)
Beispiel #4
0
    def to_document(self, fpath: Path) -> Generator[ColDocument, None, None]:
        """Yield one ``ColDocument`` per line of the file at ``fpath``.

        The whole file is read and closed up front. Every line is passed
        through ``self.escape`` and then parsed on its own with
        ``ET.fromstring``, so each line is expected to hold one complete
        XML element.

        Args:
            fpath: Path of the line-oriented input file.

        Yields:
            A ``ColDocument`` for each line, in file order.
        """
        with open(fpath, 'r') as fin:
            raw_lines = fin.read().splitlines()
            escaped: List[str] = [self.escape(raw) for raw in raw_lines]

        for record in escaped:
            root: ET.Element = ET.fromstring(record)

            # Keep the extraction order: docid, tags, title, text.
            doc_id: str = self._get_docid(root)
            tag_list: List[str] = self._get_tags(root)
            doc_title: str = self._get_title(root)
            body: str = self._get_text(root)

            document = ColDocument(
                docid=models.KeywordField(doc_id),
                title=models.TextField(doc_title),
                text=models.TextField(body),
                tags=models.KeywordListField(tag_list),
            )
            yield document
Beispiel #5
0
    def to_paragraph(self, fpath: Path) -> List[ColParagraph]:
        """Split the XML file at ``fpath`` into one ``ColParagraph`` per paragraph.

        The file is parsed with ``ET.parse``; paragraph strings come from
        ``self._get_paragraph_list`` and are numbered from 0 in the order
        returned. Every paragraph shares the document's id and tags.

        Args:
            fpath: Location of the XML file; it is resolved to an absolute
                path before parsing.

        Returns:
            The list of paragraphs, or an empty list when paragraph
            extraction fails or yields nothing (a warning is logged in
            both cases).
        """
        root: ET.Element = ET.parse(str(fpath.resolve())).getroot()
        # text
        try:
            paras: List[str] = self._get_paragraph_list(root)
        except Exception as e:
            logger.warning(e, exc_info=True)
            logger.warning(
                'Could not find description field in the original XML.')
            # Without this return, ``paras`` would be unbound below and the
            # method would die with UnboundLocalError instead of degrading
            # gracefully as the warnings suggest.
            return []
        if not paras:
            logger.warning('No paragraphs found.')
            return []

        docid: str = self._get_docid(root)
        tags: List[str] = self._get_tags(root)

        return [
            ColParagraph(docid=models.KeywordField(docid),
                         paraid=models.IntField(paraid),
                         text=models.TextField(para),
                         tags=models.KeywordListField(tags))
            for paraid, para in enumerate(paras)
        ]