def detect_category_names_to_spans(self, text: str, field: str = None) \
        -> Dict[str, List[Tuple[int, int, str]]]:
    if self.sklearn_model is None:
        return {}

    sentence_spans = get_sentence_span_list(text)
    res = {}

    for span in sentence_spans:
        sentence = text[span[0]:span[1]]
        predicted = self.sklearn_model.predict([sentence]).toarray()[0]

        for target_index, value in enumerate(predicted):
            if not value:
                continue
            target_name = self.target_names[target_index]
            if target_name == SkLearnClassifierModel.EMPTY_CAT_NAME:
                continue
            if (not field and target_name) or (field and field == target_name):
                spans_of_category = res.get(target_name)
                if not spans_of_category:
                    spans_of_category = [(span[0], span[1], sentence)]
                    res[target_name] = spans_of_category
                else:
                    spans_of_category.append((span[0], span[1], sentence))

    return res

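# --- Hedged usage sketch (not part of the original source) ---------------------------
# Illustrates how the mapping returned by detect_category_names_to_spans() might be
# consumed. `detector` is a hypothetical instance of the classifier class; only the
# return shape (category name -> list of (start, end, sentence) tuples) is taken from
# the code above.
def print_detected_spans(detector, document_text: str, field: str = None) -> None:
    spans_by_category = detector.detect_category_names_to_spans(document_text, field=field)
    for category, spans in spans_by_category.items():
        for start, end, sentence in spans:
            # start/end are character offsets of the sentence within document_text
            print(f'{category}: [{start}:{end}] {sentence[:60]!r}')
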
def build_dataset_on_document(document_class_name: str,
                              document_id,
                              retrain_model: bool = False,
                              task_id=None):
    field_configs = DOCUMENT_FIELDS[document_class_name]
    if not field_configs:
        return

    document_class = BuildFieldDetectorDataset._get_doc_class(document_class_name)
    doc = document_class.objects.get(pk=document_id)

    classifier_model, created = ClassifierModel.objects.get_or_create(
        kind=ClassifierModel.KIND_SENTENCES_RELATED_TO_FIELDS,
        document_class=document_class_name,
        document_field=None)

    deleted, rows_count = ClassifierDataSetEntry.objects.filter(
        field_detection_model=classifier_model,
        document=doc).delete()
    if deleted > 0:
        log('Deleted {0} data set entries of document {1}'.format(deleted, doc.pk),
            task=task_id)

    def add(code, sentence):
        ClassifierDataSetEntry.objects.create(field_detection_model=classifier_model,
                                              document=doc,
                                              category=code,
                                              text=sentence)

    log('Extracting training data from document: {0}'.format(doc.pk), task=task_id)

    text = doc.full_text
    annotations = list(DocumentAnnotation.objects.filter(document__pk=doc.pk))
    sentence_spans = get_sentence_span_list(text)

    for span in sentence_spans:
        sentence = text[span[0]:span[1]]
        annotated_fields = set()
        added = False

        if annotations:
            for a in annotations:
                if a.document_field \
                        and a.start_offset <= span[1] and span[0] <= a.end_offset:
                    field_code = a.document_field.pk
                    add(field_code, sentence)
                    annotated_fields.add(field_code)
                    added = True

        for field_config in field_configs.values():
            if field_config.field_code not in annotated_fields \
                    and field_config.sentence_matches_field_detectors(sentence):
                add(field_config.field_code, sentence)
                added = True

        if not added:
            add('', sentence)

    log('Processed {0} sentences of document {1}'.format(len(sentence_spans), doc.pk),
        task=task_id)

    if retrain_model:
        TrainFieldDetectorModel.train_model_for_document_class.apply_async(
            args=(document_class_name,))

def _create_annotator_model(self,
                            documents_gen: Generator[Document, Any, None],
                            annotations_by_doc: Callable[
                                [Document], Generator[Tuple[int, int], Any, None]] = None):
    positive = []
    negative = []

    for doc in documents_gen:
        text = doc.full_text
        annotations = list(annotations_by_doc(doc))
        sentence_spans = get_sentence_span_list(text)

        for span in sentence_spans:
            sentence = text[span[0]:span[1]]
            if self._sentence_matches_field_detectors(sentence) \
                    or self._sentence_matches_annotations(span, annotations):
                positive.append(sentence)
            else:
                negative.append(sentence)

def get_copyright_annotations(cls, text: str, return_sources=False) \
        -> Generator[CopyrightAnnotation, None, None]:
    """
    Find copyright in text.
    :param text:
    :param return_sources:
    :return:
    """
    # Iterate through sentences
    if not cls.copyright_ptn_re.search(text):
        return

    for sent_start, _, sentence in get_sentence_span_list(text):
        tagged_phrases = cls.extract_phrases_with_coords(sentence)
        for phrase, phrase_start in tagged_phrases:
            for match in cls.copyright_ptn_re.finditer(phrase):
                cp_text, cp_sign, cp_date, cp_name = match.groups()

                # TODO: catch in the general regex
                if not cp_date:
                    cp_date_at_end = cls.year_ptn_re.search(cp_name)
                    if cp_date_at_end:
                        cp_date = cp_date_at_end.group()
                        cp_name = re.sub(r'{}$'.format(cp_date), '', cp_name)

                # Convert match offsets to document-level coordinates
                start, end = match.span()
                start += phrase_start + sent_start
                end += phrase_start + sent_start

                ant = CopyrightAnnotation(
                    coords=(start, end),
                    sign=cp_sign.strip(),
                    date=cp_date,
                    name=cp_name.strip(string.punctuation + string.whitespace))
                if return_sources:
                    ant.text = cp_text.strip()

                cls.split_copyright_date(ant)
                cls.derive_company_name(ant, phrase)
                yield ant

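# --- Hedged usage sketch (not part of the original source) ---------------------------
# Iterates over the CopyrightAnnotation objects yielded above. The import path is an
# assumption based on the usual LexNLP layout; the attributes read below (coords, sign,
# date, name) are the ones populated in the code above.
def list_copyrights(text: str) -> None:
    from lexnlp.extract.en.copyright import get_copyright_annotations  # assumed path
    for ant in get_copyright_annotations(text, return_sources=True):
        start, end = ant.coords
        print(f'[{start}:{end}] sign={ant.sign!r} date={ant.date!r} name={ant.name!r}')
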
def test_title_start_end(self):
    text = self.get_text(
        'lexnlp/nlp/en/tests/test_sections/skewed_document.txt')
    sentence_spans = get_sentence_span_list(text)
    sections = list(
        get_section_spans(text,
                          use_ml=False,
                          return_text=False,
                          skip_empty_headers=True))
    self.assertGreater(len(sections), 3)

    # test title coordinates before enhancing titles ...
    for sect in sections:
        title = text[sect.title_start:sect.title_end]
        self.assertEqual(sect.title, title)

    # ... and after enhancing
    find_section_titles(sections, sentence_spans, text)
    for sect in sections:
        title = text[sect.title_start:sect.title_end]
        self.assertEqual(sect.title, title)

def annotate(self, text: str, field: str = None) -> Dict[str, List[Tuple[int, int, str]]]:
    if self.sklearn_model is None:
        return {}

    sentence_spans = get_sentence_span_list(text)
    res = {}

    for span in sentence_spans:
        sentence = text[span[0]:span[1]]
        target_index = self.sklearn_model.predict([sentence])[0]
        target_name = self.target_names[target_index]
        if (not field and target_name) or (field and field == target_name):
            of_field = res.get(target_name)
            if not of_field:
                of_field = [(span[0], span[1], sentence)]
                res[target_name] = of_field
            else:
                of_field.append((span[0], span[1], sentence))

    return res

def detect_category_names_to_spans(self, text: str, field: str = None) \
        -> Dict[str, List[Tuple[int, int, str]]]:
    if self.sklearn_model is None:
        return {}

    sentence_spans = get_sentence_span_list(text)
    res = {}

    for span in sentence_spans:
        sentence = text[span[0]:span[1]]
        category_names = self.detect_category_names_for_sentence(sentence)
        for target_name in category_names:
            if (not field and target_name) or (field and field == target_name):
                spans_of_category = res.get(target_name)
                if not spans_of_category:
                    spans_of_category = [(span[0], span[1], sentence)]
                    res[target_name] = spans_of_category
                else:
                    spans_of_category.append((span[0], span[1], sentence))

    return res

def get_company_annotations(
    text: str,
    strict: bool = False,
    use_gnp: bool = False,
    count_unique: bool = False,
    name_upper: bool = False,
) -> Generator[CompanyAnnotation, None, None]:
    """
    Find company names in text, optionally using the stricter article/prefix expression.
    :param text:
    :param strict:
    :param use_gnp: use get_noun_phrases or NPExtractor
    :param name_upper: return company name in upper case.
    :param count_unique: return only unique companies - case insensitive.
    :return:
    """
    # skip if all text is in uppercase
    if text == text.upper():
        return

    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]
    unique_companies = {}  # type: Dict[Tuple[str, str], CompanyAnnotation]

    if COMPANY_TYPES_RE.search(text):
        # Iterate through sentences
        for s_start, s_end, sentence in get_sentence_span_list(text):
            # skip if whole phrase is in uppercase
            if sentence == sentence.upper():
                continue
            if use_gnp:
                phrases = list(
                    get_noun_phrases(sentence,
                                     strict=strict,
                                     valid_punctuation=valid_punctuation))
            else:
                phrases = list(np_extractor.get_np(sentence))
            phrase_spans = PhrasePositionFinder.find_phrase_in_source_text(
                sentence, phrases)

            for phrase, p_start, p_end in phrase_spans:
                if COMPANY_TYPES_RE.search(phrase):
                    # noinspection PyTypeChecker
                    for ant in nltk_re.get_companies(
                            phrase,
                            use_sentence_splitter=False):  # type: CompanyAnnotation
                        if ant.name == ant.company_type or ant.name == ant.description:
                            continue
                        ant.coords = (ant.coords[0] + s_start + p_start,
                                      ant.coords[1] + s_start + p_start)
                        if name_upper:
                            ant.name = ant.name.upper()
                        if count_unique:
                            unique_key = (ant.name.lower() if ant.name else None,
                                          ant.company_type_abbr)
                            existing_result = unique_companies.get(unique_key)
                            if existing_result:
                                existing_result.counter += 1
                            else:
                                unique_companies[unique_key] = ant
                        else:
                            yield ant

    if count_unique:
        for company in unique_companies.values():
            yield company

def get_companies(text: str,
                  use_article: bool = False,
                  use_sentence_splitter: bool = True) -> Generator[CompanyAnnotation, None, None]:
    """
    Find company names in text, optionally using the stricter article/prefix expression.
    """
    # Select regex
    re_c = RE_ARTICLE_COMPANY if use_article else RE_COMPANY

    # Iterate through sentences
    sent_list = get_sentence_span_list(text) if use_sentence_splitter else [(0, len(text), text)]

    for start, _, sentence in sent_list:
        if check_backtrack_catastrophy(sentence):
            continue

        for match in re_c.finditer(sentence):
            captures = match.capturesdict()
            company_type = captures["company_type_of"] or \
                           captures["company_type"] or \
                           captures["company_type_single"]
            company_type = "".join(company_type).strip(
                string.punctuation.replace(".", "") + string.whitespace)
            company_type = company_type or None

            company_name = "".join(captures["full_name"])
            if company_type:
                company_name = re.sub(r'%s$' % company_type, '', company_name)
            company_name = FALSE_POS_SUB_RE.sub('', company_name)
            company_name = company_name.strip(
                string.punctuation.replace('&', '').replace(')', '') + string.whitespace)
            company_name = re.sub(r'^\s*(?:and|&|of)\s+|\s+(?:and|&|of)\s*$', '',
                                  company_name, flags=re.IGNORECASE)
            if not company_name:
                continue

            # catch a Delaware company
            if company_name.lower().startswith('a ') or captures.get('article') == ['a']:
                continue

            company_description = captures.get("company_description", '')
            if not company_description:
                company_description_match = DEFAULT_COMPANY_DESC_RE.findall(company_name)
                if company_description_match:
                    company_description = company_description_match[0]
            company_description = "".join(company_description).strip(
                string.punctuation + string.whitespace)

            # catch ABC & Company LLC case
            if company_description.lower() == 'company' and \
                    ('& company' in company_name.lower()
                     or 'and company' in company_name.lower()):
                company_description = None
            company_description = company_description or None

            # catch "The Company"
            if company_description:
                _company_name = re.sub(r'[\s,]%s$' % company_description, '', company_name)
                if not _company_name or \
                        ARTICLE_RE.fullmatch(_company_name) or \
                        re.match(r'.+?\s(?:of|in)$', _company_name.lower()):
                    continue

            if company_name in COMPANY_DESCRIPTIONS:
                continue

            abbr_name = "".join(captures["abbr_name"]) or None

            ret = CompanyAnnotation(
                (match.start() + start, match.end() + start),
                name=company_name,
                company_type_full=company_type)
            ret.company_type_abbr = COMPANY_TYPES[company_type.lower()]['abbr'] if company_type else None
            ret.company_type_label = COMPANY_TYPES[company_type.lower()]['label'] if company_type else None
            ret.description = company_description
            ret.name_abbr = abbr_name
            ret.text = sentence
            # no args: = [company_name, company_type, company_description]
            # detail_type: + [company_type_abbr, company_type_label]
            # parse_name_abbr: + [abbr_name]
            # return_source: + [source]
            yield ret

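# --- Hedged usage sketch (not part of the original source) ---------------------------
# Shows one way to consume get_companies(). The import path is an assumption based on
# the module reference `nltk_re` above; the attributes read below are the ones set on
# CompanyAnnotation in the function above.
def list_companies(text: str) -> None:
    from lexnlp.extract.en.entities.nltk_re import get_companies  # assumed path
    for ant in get_companies(text):
        start, end = ant.coords
        print(f'[{start}:{end}] {ant.name} '
              f'(type={ant.company_type_abbr}, description={ant.description})')
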
def extract_text_and_structure(pdf_fn: str,
                               pdf_password: str = None,
                               timeout_sec: int = 3600,
                               language: str = "",
                               correct_pdf: bool = False,
                               render_coords_debug: bool = False) \
        -> Tuple[str, TextAndPDFCoordinates, str, Dict[int, float]]:
    # text, structure, corrected_pdf_fn, page_rotate_angles

    if render_coords_debug:
        correct_pdf = True

    java_modules_path = get_settings().java_modules_path

    # Convert language to language code
    lang_converter = LanguageConverter()
    language, locale_code = lang_converter.get_language_and_locale_code(language)

    temp_dir = mkdtemp(prefix='pdf_text_')
    out_fn = os.path.join(
        temp_dir,
        os.path.splitext(os.path.basename(pdf_fn))[0] + '.msgpack')
    out_pdf_fn = pdf_fn
    try:
        args = ['java', '-cp', f'{java_modules_path}/*',
                'com.lexpredict.textextraction.GetTextFromPDF',
                pdf_fn,
                out_fn,
                '-f', 'pages_msgpack']

        if pdf_password:
            args.append('-p')
            args.append(pdf_password)

        if correct_pdf:
            out_pdf_fn = os.path.join(
                temp_dir,
                os.path.splitext(os.path.basename(pdf_fn))[0] + '_corr.pdf')
            args.append('-corrected_output')
            args.append(out_pdf_fn)

        if render_coords_debug:
            args.append('-render_char_rects')

        completed_process: CompletedProcess = subprocess.run(args,
                                                             check=False,
                                                             timeout=timeout_sec,
                                                             universal_newlines=True,
                                                             stderr=PIPE,
                                                             stdout=PIPE)
        raise_from_process(
            log, completed_process,
            process_title=lambda: f'Extract text and structure from {pdf_fn}')
        raise_from_pdfbox_error_messages(completed_process)

        with open(out_fn, 'rb') as pages_f:
            # see object structure in com.lexpredict.textextraction.dto.PDFPlainText
            pdfbox_res: Dict[str, Any] = msgpack.unpack(pages_f, raw=False)

        # Remove Null characters because of incompatibility with PostgreSQL
        text = pdfbox_res['text'].replace("\x00", "")

        if len(text) == 0:
            pdf_coordinates = PDFCoordinates(char_bboxes=pdfbox_res['charBBoxes'])
            text_struct = PlainTextStructure(
                title='',
                language=language or 'en',  # FastText returns English for empty strings
                pages=[],
                sentences=[],
                paragraphs=[],
                sections=[])
            yield text, \
                  TextAndPDFCoordinates(text_structure=text_struct,
                                        pdf_coordinates=pdf_coordinates), \
                  out_pdf_fn, \
                  None
            return

        page_rotate_angles: List[float] = [pdfpage['deskewAngle']
                                           for pdfpage in pdfbox_res['pages']]

        pages = []
        num: int = 0
        for p in pdfbox_res['pages']:
            p_res = PlainTextPage(number=num,
                                  start=p['location'][0],
                                  end=p['location'][1],
                                  bbox=p['bbox'])
            pages.append(p_res)
            num += 1

        sentence_spans = get_sentence_span_list(text)

        lang = get_lang_detector()

        sentences = [PlainTextSentence(start=start,
                                       end=end,
                                       language=language or lang.predict_lang(segment))
                     for start, end, segment in sentence_spans]

        # There was a try-except in Contraxsuite catching some lexnlp exception.
        # Not putting it here because it should be solved on lexnlp side.
        paragraphs = [PlainTextParagraph(start=start,
                                         end=end,
                                         language=language or lang.predict_lang(segment))
                      for segment, start, end in get_paragraphs(text, return_spans=True)]

        sections = [PlainTextSection(title=sect.title,
                                     start=sect.start,
                                     end=sect.end,
                                     title_start=sect.title_start,
                                     title_end=sect.title_end,
                                     level=sect.level,
                                     abs_level=sect.abs_level)
                    for sect in get_document_sections_with_titles(
                        text, sentence_list=sentence_spans)]

        try:
            title = next(get_titles(text))
        except StopIteration:
            title = None

        text_struct = PlainTextStructure(
            title=title,
            language=language or lang.predict_lang(text),
            pages=pages,
            sentences=sentences,
            paragraphs=paragraphs,
            sections=sections)

        char_bboxes = pdfbox_res['charBBoxes']
        pdf_coordinates = PDFCoordinates(char_bboxes=char_bboxes)

        yield text, \
              TextAndPDFCoordinates(text_structure=text_struct,
                                    pdf_coordinates=pdf_coordinates), \
              out_pdf_fn, \
              page_rotate_angles
        return
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)

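# --- Hedged usage sketch (not part of the original source) ---------------------------
# extract_text_and_structure() is written as a generator that yields a single
# (text, text_and_coords, corrected_pdf_fn, page_rotate_angles) tuple, so a caller can
# unpack it with next(). The PDF path below is hypothetical, and running this requires
# the Java text-extraction modules configured via get_settings().
def dump_pdf_structure(pdf_fn: str = '/tmp/sample.pdf') -> None:  # hypothetical input file
    text, text_and_coords, corrected_pdf_fn, page_rotate_angles = \
        next(extract_text_and_structure(pdf_fn, language='en'))
    struct = text_and_coords.text_structure
    print(f'{len(text)} characters, {len(struct.pages)} pages, '
          f'{len(struct.sentences)} sentences, {len(struct.sections)} sections')
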
def get_company_annotations(
        self,
        text: str,
        strict: bool = False,
        use_gnp: bool = False,
        count_unique: bool = False,
        name_upper: bool = False,
        banlist_usage: Optional[BanListUsage] = None
) -> Generator[CompanyAnnotation, None, None]:
    """
    Find company names in text, optionally using the stricter article/prefix expression.
    :param text:
    :param strict:
    :param use_gnp: use get_noun_phrases or NPExtractor
    :param name_upper: return company name in upper case.
    :param count_unique: return only unique companies - case insensitive.
    :param banlist_usage: a banlist or hints on using the default BL
    :return:
    """
    # skip if all text is in uppercase
    if text == text.upper():
        return

    banlist = self.get_company_banlist(banlist_usage)
    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]
    unique_companies: Dict[Tuple[str, str], CompanyAnnotation] = {}

    if not self.company_types_re.search(text):
        return

    # iterate through sentences
    for s_start, _s_end, sentence in get_sentence_span_list(text):
        # skip if whole phrase is in uppercase
        if sentence == sentence.upper():
            continue
        if use_gnp:
            phrases = list(
                get_noun_phrases(sentence,
                                 strict=strict,
                                 valid_punctuation=valid_punctuation))
        else:
            phrases = list(self.np_extractor.get_np(sentence))
        phrase_spans = PhrasePositionFinder.find_phrase_in_source_text(
            sentence, phrases)

        for phrase, p_start, _p_end in phrase_spans:
            if self.company_types_re.search(phrase):
                ant: CompanyAnnotation
                for ant in self.get_companies_re(phrase, use_sentence_splitter=False):
                    if ant.name == ant.company_type or ant.name == ant.description:
                        continue
                    # check against banlist
                    if banlist:
                        if EntityBanListItem.check_list(ant.name, banlist):
                            continue
                    ant.coords = (ant.coords[0] + s_start + p_start,
                                  ant.coords[1] + s_start + p_start)
                    if name_upper:
                        ant.name = ant.name.upper()
                    if count_unique:
                        unique_key = (ant.name.lower() if ant.name else None,
                                      ant.company_type_abbr)
                        existing_result = unique_companies.get(unique_key)
                        if existing_result:
                            existing_result.counter += 1
                        else:
                            unique_companies[unique_key] = ant
                    else:
                        yield ant

    if count_unique:
        for company in unique_companies.values():
            yield company

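# --- Hedged usage sketch (not part of the original source) ---------------------------
# Demonstrates the count_unique mode of get_company_annotations(): each distinct
# (lower-cased name, company type abbreviation) pair is yielded once, with repeat
# occurrences accumulated on the annotation's `counter` attribute. `parser` stands for
# whatever object exposes this method and is hypothetical here.
def summarize_companies(parser, text: str) -> None:
    for ant in parser.get_company_annotations(text, count_unique=True):
        print(f'{ant.name} ({ant.company_type_abbr}): counter={ant.counter}')
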