def split_definitions_inside_term(term: str,
                                  src_with_coords: Tuple[int, int, str],
                                  term_start: int,
                                  term_end: int) -> List[Tuple[str, int, int]]:
    """
    The whole phrase can be considered definition ("MSRB", "we", "us" or "our"),
    but in fact the phrase can be a collection of definitions. Here we split the
    definition phrase into a list of definitions.

    The source string could be pre-processed, that's why we search for each
    sub-phrase's coordinates (PhrasePositionFinder).

    :param term: a definition or, probably, a set of definitions ("MSRB", "we", "us" or "our")
    :param src_with_coords: a sentence (probably) containing the term + its coords
    :param term_start: "term" start coordinate within the source sentence
    :param term_end: "term" end coordinate within the source sentence
    :return: [(definition, def_start, def_end), ...]
    """
    offset, _src_end, sentence_text = src_with_coords

    sub_defs = [m.group() for m in SPLIT_SUBDEFINITIONS_RE.finditer(term)]
    if len(sub_defs) < 2:
        # nothing to split - treat the whole term as one definition
        sub_defs = [term]

    located = PhrasePositionFinder.find_phrase_in_source_text(
        sentence_text, sub_defs, term_start - offset, term_end - offset)
    if len(located) < len(sub_defs):
        # couldn't locate every sub-definition: fall back to the whole term
        return [(term, term_start, term_end)]

    # translate sentence-relative coordinates back into source coordinates
    return [(txt, start + offset, end + offset) for txt, start, end in located]
def test_exact_entry(self):
    # a phrase that appears verbatim in the text must be found at str.find()'s position
    sample = 'The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.'
    phrases = ['regular expressions']
    first_match = PhrasePositionFinder.find_phrase_in_source_text(sample, phrases)[0]
    self.assertEqual(sample.find(phrases[0]), first_match[1])
def test_split_with_quotes(self):
    # SpanTokenizer and PhrasePositionFinder should agree on token positions,
    # including the opening quote that NLTK rewrites as ``
    sample = 'He took my heart in "East Atlanta"\n, nah-nah-nah'

    token_spans = list(SpanTokenizer.get_token_spans(sample))
    self.assertEqual(('"', '``', 20, 20), token_spans[5])
    self.assertEqual(('nah-nah-nah', 'JJ', 37, 47), token_spans[10])

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sample))
    found = PhrasePositionFinder.find_phrase_in_source_text(
        sample, [token for token, _tag in tagged_tokens])
    self.assertEqual(('``', 20, 21), found[5])
    self.assertEqual(('nah-nah-nah', 37, 48), found[10])
def normalize_text_with_map(
        text: str,
        spaces_on_start_end: bool = True,
        spaces_after_dots: bool = True,
        lowercase: bool = True,
        use_stemmer: bool = False,
        simple_tokenization: bool = False) -> Tuple[str, List[int]]:
    """
    Almost like normalize_text, but also returns a source-to-result char index map:
    map[i] = I, where i is the character coordinate within the source text,
    I is the same character's coordinate within the resulting text.

    :param text: source text to normalize
    :param spaces_on_start_end: surround each token (and the whole result) with spaces
    :param spaces_after_dots: insert a space around an inner '.' within a token
    :param lowercase: lowercase the tokens
    :param use_stemmer: tokenize with get_stem_list instead of get_token_list
    :param simple_tokenization: split on whitespace only (ignored when use_stemmer is set)
    :return: (normalized_text, source-to-result index map)
    """
    src_dest_map = []  # type: List[int]
    if use_stemmer:
        tokens = get_stem_list(text, lowercase=lowercase)
    elif simple_tokenization:
        tokens = reg_space.split(text)
        if lowercase:
            tokens = [t.lower() for t in tokens]
    else:
        tokens = get_token_list(text, lowercase=lowercase)

    # [(token, start, end), ...] - each token's coordinates in the source text.
    # FIX: "[t for t in tokens]" was a pointless element-by-element copy;
    # list() materializes the same sequence directly.
    entity_positions = PhrasePositionFinder.find_phrase_in_source_text(
        text, list(tokens))

    resulted = ''
    src_index, first_token = 0, True
    for tok, s, _e in entity_positions:
        if first_token or spaces_on_start_end:
            resulted += ' '
        first_token = False
        # source characters skipped between tokens all map onto the last
        # emitted result position
        while src_index < s:
            src_dest_map.append(len(resulted) - 1)
            src_index += 1
        for c_index in range(len(tok)):
            c = tok[c_index]
            # pad an inner dot with spaces on both sides (e.g. "U.S" -> "U . S")
            if spaces_after_dots and c == '.' and c_index > 0:
                resulted += ' '
            resulted += c
            src_dest_map.append(len(resulted) - 1)
            if spaces_after_dots and c == '.' and c_index < len(tok) - 1:
                resulted += ' '
            src_index += 1
    if spaces_on_start_end:
        resulted += ' '
    return resulted, src_dest_map
def get_trademark_annotations(text: str) -> \
        Generator[TrademarkAnnotation, None, None]:
    """
    Find trademarks in text.

    :param text: text to search for trademark markers
    :return: generator of TrademarkAnnotation with coordinates in the source text
    """
    # Iterate through sentences
    if TRADEMARK_PTN_RE.search(text):
        for scd in get_sentence_span(text):
            sentence = scd[2]
            phrases = list(np_extractor.get_np(sentence))
            # FIX: was "find_phrase_int_source_text" - a typo that would raise
            # AttributeError; the finder's method (used at every other call
            # site) is find_phrase_in_source_text
            tagged_phrases = PhrasePositionFinder.find_phrase_in_source_text(
                sentence, phrases)
            for phrase in tagged_phrases:
                for tm in TRADEMARK_PTN_RE.finditer(phrase[0]):
                    coords = tm.span()
                    # shift match coords by sentence start + phrase start
                    coords = (coords[0] + scd[0] + phrase[1],
                              coords[1] + scd[0] + phrase[1])
                    # clamp the end coordinate inside the source text
                    if coords[1] >= len(text):
                        coords = (coords[0], len(text) - 1)
                    ant = TrademarkAnnotation(coords=coords, trademark=tm.group())
                    yield ant
def test_tagging_non_uni_quotes(self):
    # ASCII quotes in the phrases must still match the curly unicode quotes in the text
    sample = '(each an “Obligation” and collectively, the “Obligations”)'
    found = PhrasePositionFinder.find_phrase_in_source_text(
        sample, ['"Obligation"', '"Obligations"'], 0, 58)
    start_positions = (found[0][1], found[1][1])
    self.assertEqual((9, 44), start_positions)
def test_similar_entries(self):
    # overlapping look-alike phrases must be matched in order, not both at position 0
    sample = 'aa aaa aaa aaaaa aa aaa aa'
    found = PhrasePositionFinder.find_phrase_in_source_text(sample, ['aaa', 'aa'])
    self.assertEqual((3, 7), (found[0][1], found[1][1]))
def test_corrupted_entry(self):
    # when the phrase does not occur verbatim, the finder falls back to position 0
    sample = 'The Treebank tokenizer uses regular(expressions) to tokenize text as in Penn Treebank.'
    phrases = ['regular expressions']
    first_match = PhrasePositionFinder.find_phrase_in_source_text(sample, phrases)[0]
    self.assertEqual(0, first_match[1])
def get_company_annotations( text: str, strict: bool = False, use_gnp: bool = False, count_unique: bool = False, name_upper: bool = False, ) -> Generator[CompanyAnnotation, None, None]: """ Find company names in text, optionally using the stricter article/prefix expression. :param parse_name_abbr: :param text: :param strict: :param use_gnp: use get_noun_phrases or NPExtractor :param name_upper: return company name in upper case. :param count_unique: return only unique companies - case insensitive. :return: """ # skip if all text is in uppercase if text == text.upper(): return valid_punctuation = VALID_PUNCTUATION + ["(", ")"] unique_companies = {} # type: Dict[Tuple[str, str], CompanyAnnotation] if COMPANY_TYPES_RE.search(text): # Iterate through sentences for s_start, s_end, sentence in get_sentence_span_list(text): # skip if whole phrase is in uppercase if sentence == sentence.upper(): continue if use_gnp: phrases = list( get_noun_phrases(sentence, strict=strict, valid_punctuation=valid_punctuation)) else: phrases = list(np_extractor.get_np(sentence)) phrase_spans = PhrasePositionFinder.find_phrase_in_source_text( sentence, phrases) for phrase, p_start, p_end in phrase_spans: if COMPANY_TYPES_RE.search(phrase): # noinspection PyTypeChecker for ant in nltk_re.get_companies( phrase, use_sentence_splitter=False ): # type: CompanyAnnotation if ant.name == ant.company_type or ant.name == ant.description: continue ant.coords = (ant.coords[0] + s_start + p_start, ant.coords[1] + s_start + p_start) if name_upper: ant.name = ant.name.upper() if count_unique: unique_key = (ant.name.lower() if ant.name else None, ant.company_type_abbr) existing_result = unique_companies.get(unique_key) if existing_result: existing_result.counter += 1 else: unique_companies[unique_key] = ant else: yield ant if count_unique: for company in unique_companies.values(): yield company
def get_np_with_coords(self, text: str) -> List[Tuple[str, int, int]]:
    """Extract noun phrases from text along with their coordinates.

    :param text: text to extract noun phrases from
    :return: [(phrase, start, end), ...]
    """
    return PhrasePositionFinder.find_phrase_in_source_text(
        text, list(self.get_np(text)))
def extract_phrases_with_coords(cls, sentence: str) -> List[Tuple[str, int, int]]:
    """
    Extract noun phrases from the sentence together with their coordinates.

    :param sentence: text to extract noun phrases from
    :return: [(phrase, start, end), ...]
    """
    phrases = list(np_extractor.get_np(sentence))
    # FIX: was "find_phrase_int_source_text" - a typo that would raise
    # AttributeError; the finder's method is find_phrase_in_source_text.
    # It returns (phrase, start, end) triples, so the return annotation
    # is List[Tuple[str, int, int]], not List[Tuple[str, int]].
    tagged_phrases = PhrasePositionFinder.find_phrase_in_source_text(
        sentence, phrases)
    return tagged_phrases
def get_company_annotations(
        self,
        text: str,
        strict: bool = False,
        use_gnp: bool = False,
        count_unique: bool = False,
        name_upper: bool = False,
        banlist_usage: Optional[BanListUsage] = None
) -> Generator[CompanyAnnotation, None, None]:
    """
    Find company names in text, optionally using the stricter article/prefix expression.

    :param text: text to search for company names
    :param strict: passed through to get_noun_phrases (stricter article/prefix expression)
    :param use_gnp: use get_noun_phrases or NPExtractor
    :param name_upper: return company name in upper case.
    :param count_unique: return only unique companies - case insensitive.
    :param banlist_usage: a banlist or hints on using the default BL
    :return: generator of CompanyAnnotation
    """
    # skip if all text is in uppercase
    if text == text.upper():
        return
    banlist = self.get_company_banlist(banlist_usage)
    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]
    # keyed by (lowercased name, company type abbreviation)
    unique_companies: Dict[Tuple[str, str], CompanyAnnotation] = {}
    if not self.company_types_re.search(text):
        return
    # iterate through sentences
    for s_start, _s_end, sentence in get_sentence_span_list(text):
        # skip if whole phrase is in uppercase
        if sentence == sentence.upper():
            continue
        if use_gnp:
            phrases = list(
                get_noun_phrases(sentence, strict=strict,
                                 valid_punctuation=valid_punctuation))
        else:
            phrases = list(self.np_extractor.get_np(sentence))
        phrase_spans = PhrasePositionFinder.find_phrase_in_source_text(
            sentence, phrases)
        for phrase, p_start, _p_end in phrase_spans:
            if self.company_types_re.search(phrase):
                ant: CompanyAnnotation
                for ant in self.get_companies_re(
                        phrase, use_sentence_splitter=False):
                    # drop degenerate matches where the "name" is just the
                    # company type or description itself
                    if ant.name == ant.company_type or ant.name == ant.description:
                        continue
                    # check against banlist
                    if banlist:
                        if EntityBanListItem.check_list(ant.name, banlist):
                            continue
                    # phrase-relative coords -> source-text coords
                    ant.coords = (ant.coords[0] + s_start + p_start,
                                  ant.coords[1] + s_start + p_start)
                    if name_upper:
                        ant.name = ant.name.upper()
                    if count_unique:
                        unique_key = (ant.name.lower() if ant.name else None,
                                      ant.company_type_abbr)
                        existing_result = unique_companies.get(unique_key)
                        if existing_result:
                            # repeated company: bump the counter instead of yielding
                            existing_result.counter += 1
                        else:
                            unique_companies[unique_key] = ant
                    else:
                        yield ant
    if count_unique:
        for company in unique_companies.values():
            yield company