コード例 #1
0
    def check_next_few(
            self,
            pq_obj: PyQuery) -> Tuple[Optional[Date], Optional[re.Match]]:
        """Look for a date in the elements that follow ``pq_obj``.

        Starting at ``pq_obj.next()``, up to ``self.check_next`` elements are
        visited through successive ``.next()`` calls. The text of each one is
        passed to ``is_date``; the first truthy result stops the search, is
        logged, and is converted with ``Date.from_re_match``.

        Args:
            pq_obj: Element whose following elements are inspected.

        Returns:
            A ``(date, match)`` pair. On success ``date`` is the parsed
            ``Date`` and ``match`` is the truthy value ``is_date`` returned.
            Otherwise ``date`` is ``None`` and ``match`` is the last (falsy)
            ``is_date`` result, or ``None`` when no element was inspected.
        """
        date = None
        # Bound before the loop so the final return is safe even when
        # ``self.check_next`` is 0 and the loop body never runs.
        m = None
        cursor = pq_obj.next()

        for _ in range(self.check_next):
            m = is_date(cursor.text())

            if m:
                logger.info("Date found",
                            extra={
                                "pdf_page": self.current_page + 1,
                                "pdf_name": self.filename
                            })
                date = Date.from_re_match(m)

                break

            cursor = cursor.next()

        return date, m
コード例 #2
0
    def extract_topic(self, pq_obj: PyQuery):
        """Collect the topic lines that follow ``pq_obj`` into one string.

        The method works in two passes over the elements reached through
        successive ``.next()`` calls:

        1. Up to ``self.check_next`` elements are searched for one whose text
           satisfies ``is_date``. Its text is remembered and the cursor is
           moved back to ``pq_obj.next()``.
        2. Up to ``self.max_topic_range`` elements are read. Elements whose
           text equals the remembered date text are skipped; every other text
           is appended to the topic until ``self.end_of_topic`` accepts one.

        Problems (no date, topic not terminated within the limit, empty
        topic) are logged rather than raised.

        Args:
            pq_obj: Element after which the topic is expected.

        Returns:
            The collected lines joined with single spaces, or ``""`` when the
            joined text is empty or whitespace only.
        """
        log_extra = {
            "pdf_page": self.current_page + 1,
            "pdf_name": self.filename
        }

        cursor = pq_obj.next()
        # Stays None when no date is found, so the skip test in the second
        # pass simply never matches instead of hitting an unbound name.
        date_text = None

        for _ in range(self.check_next):
            candidate = cursor.text()

            if is_date(candidate):
                date_text = candidate
                # Restart from the first following element; the date line
                # itself is skipped by text comparison below.
                cursor = pq_obj.next()

                break
            cursor = cursor.next()

        if date_text is None:
            # Best effort: keep going from wherever the search stopped.
            logger.error(
                "Date not found. Extract should not have been called.",
                extra=log_extra)

        topic = []
        topic_ended = False
        # Last line examined as topic text; None if no line was examined.
        last_line = None

        for _ in range(self.max_topic_range):
            line = cursor.text()

            if line == date_text:
                cursor = cursor.next()

                continue

            last_line = line

            if self.end_of_topic(line):
                topic_ended = True

                break
            topic.append(line)
            cursor = cursor.next()

        if not topic_ended:
            # Report the line limit that was exhausted, not the page number.
            err = "Topic did not end in {} lines".format(self.max_topic_range)
            logger.error(err, extra=log_extra)
            logger.debug(
                "Last line read for topic was : {}".format(last_line),
                extra=log_extra)

        topic_str = ' '.join(topic)

        if not topic_str.strip():
            logger.error("Registerd empty topic", extra=log_extra)

            return ""

        return topic_str