def check_next_few(self, pq_obj: PyQuery) -> Tuple[Optional[Date], Optional[re.Match]]:
    """Look for a date in the elements that follow ``pq_obj``.

    Starting at ``pq_obj.next()`` and advancing with ``.next()``, at most
    ``self.check_next`` elements are inspected. The scan stops at the
    first element whose text makes ``is_date`` return a truthy value.

    Args:
        pq_obj: Element whose followers are scanned.

    Returns:
        A ``(date, match)`` pair. When a date is found, ``date`` is
        ``Date.from_re_match(match)`` and ``match`` is the truthy result
        of ``is_date``. Otherwise ``date`` is ``None`` and ``match`` is
        the (falsy) result of the last ``is_date`` call, or ``None`` when
        no element was inspected at all.
    """
    date = None
    # Pre-bind so the return below is valid even when self.check_next is 0
    # and the loop body never runs.
    match = None
    cursor = pq_obj.next()
    for _ in range(self.check_next):
        match = is_date(cursor.text())
        if match:
            logger.info("Date found",
                        extra={
                            "pdf_page": self.current_page + 1,
                            "pdf_name": self.filename
                        })
            date = Date.from_re_match(match)
            break
        cursor = cursor.next()
    return date, match
def extract_topic(self, pq_obj: PyQuery) -> str:
    """Collect the topic text from the elements that follow ``pq_obj``.

    The method works in two passes over the ``.next()`` chain starting at
    ``pq_obj.next()``:

    1. Up to ``self.check_next`` elements are scanned for one whose text
       satisfies ``is_date``; that text is remembered as the date line.
    2. Starting again from ``pq_obj.next()``, up to
       ``self.max_topic_range`` elements are read. Elements whose text
       equals the date line are skipped, reading stops at the first text
       for which ``self.end_of_topic`` is truthy, and every other text is
       appended to the topic.

    Problems are logged rather than raised: a missing date, a topic that
    does not end within ``self.max_topic_range`` elements, and an empty
    topic each produce an error log entry.

    Args:
        pq_obj: Element whose followers hold the date and the topic.

    Returns:
        The collected lines joined with single spaces, or ``""`` when the
        joined text is empty or whitespace only.
    """
    log_extra = {
        "pdf_page": self.current_page + 1,
        "pdf_name": self.filename
    }

    # Pass 1: find the date line so it can be skipped while reading the topic.
    date_text = None
    cursor = pq_obj.next()
    for _ in range(self.check_next):
        text = cursor.text()
        if is_date(text):
            date_text = text
            break
        cursor = cursor.next()

    if date_text is None:
        # Best effort: report the problem and still try to read a topic.
        # With date_text left as None no line is skipped below.
        logger.error(
            "Date not found. Extract should not have been called.",
            extra=log_extra)

    # Pass 2 always restarts at the first element after pq_obj; the date
    # line (if any) is filtered out by comparing texts.
    cursor = pq_obj.next()
    topic = []
    topic_ended = False
    last_line = None  # last non-date text examined, for diagnostics only
    for _ in range(self.max_topic_range):
        line = cursor.text()
        if line == date_text:
            cursor = cursor.next()
            continue
        last_line = line
        if self.end_of_topic(line):
            topic_ended = True
            break
        topic.append(line)
        cursor = cursor.next()

    if not topic_ended:
        # Deliberately non-fatal: whatever was collected is still returned.
        err = "Topic did not end in {} lines".format(self.max_topic_range)
        logger.error(err, extra=log_extra)
        logger.debug("Last line read for topic was : {}".format(last_line),
                     extra=log_extra)

    topic_str = ' '.join(topic)
    if not topic_str.strip():
        logger.error("Registerd empty topic", extra=log_extra)
        return ""
    return topic_str