def test_split_with_quotes(self):
    """Tokenizing a quoted phrase: SpanTokenizer should report closed
    (inclusive-end) spans while PhrasePositionFinder reports half-open ones."""
    text = 'He took my heart in "East Atlanta"\n, nah-nah-nah'

    # SpanTokenizer path: (token, POS tag, start, end-inclusive)
    token_spans = list(SpanTokenizer.get_token_spans(text))
    self.assertEqual(('"', '``', 20, 20), token_spans[5])
    self.assertEqual(('nah-nah-nah', 'JJ', 37, 47), token_spans[10])

    # Alternative path: tag with NLTK, then locate each token in the text.
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    token_texts = [pair[0] for pair in tagged]
    located = PhrasePositionFinder.find_phrase_in_source_text(text, token_texts)
    # (token, start, end-exclusive) — note the off-by-one vs. the spans above.
    self.assertEqual(('``', 20, 21), located[5])
    self.assertEqual(('nah-nah-nah', 37, 48), located[10])
def test_split_plain(self):
    """Plain (unquoted) sentence: spot-check the first and a late token span."""
    sample = 'He took my heart in East Atlanta, nah-nah-nah'
    token_spans = list(SpanTokenizer.get_token_spans(sample))
    self.assertGreater(len(token_spans), 3)

    # index -> expected (token, POS tag, start, end-inclusive)
    expected = {
        0: ('He', 'PRP', 0, 1),
        8: ('nah-nah-nah', 'JJ', 34, 44),
    }
    for index, span in expected.items():
        self.assertEqual(span, token_spans[index])
def test_split_dont(self):
    """Contraction handling: "don't" splits, giving 8 tokens total;
    check the coordinates of token 6 ("man")."""
    phrase = "You don't do it, man!"
    token_spans = list(SpanTokenizer.get_token_spans(phrase))
    self.assertEqual(8, len(token_spans))

    _, _, start, end = token_spans[6]
    self.assertEqual(17, start)
    self.assertEqual(19, end)
def get_definition_list_in_sentence(
        sentence_coords: Tuple[int, int, str],
        decode_unicode=True) -> List[DefinitionCaught]:
    """
    Find possible definitions in natural language in a single sentence.

    Candidate terms are collected from several regex "cases" into one set of
    (term, start, end) coordinates, then each candidate is cleaned, filtered
    (service words, introductory words, length limits) and split into one or
    more DefinitionCaught entries.

    :param sentence_coords: (sentence start, sentence end, sentence text)
    :param decode_unicode: when True, transliterate the sentence to ASCII
        (e.g. replaces “ with ") before matching
    :return: list of DefinitionCaught found in the sentence
    """
    definitions = []  # type: List[DefinitionCaught]
    sentence = sentence_coords[2]

    # unify quotes and braces
    # replace excess braces with ' ' so the str length will remain the same
    sentence = TextBeautifier.unify_quotes_braces(sentence, empty_replacement=' ')
    sent_start = sentence_coords[0]
    # collected candidates as (term, start, end); a set de-duplicates overlaps
    # produced by the different cases below
    result = set()  # type: Set[Tuple[str, int, int]]

    # it really transforms string, e.g. replaces “ with "
    # (unidecode keeps coordinates usable because replacements are 1-to-1 here)
    if decode_unicode:
        sentence = unidecode.unidecode(sentence)
        sentence_coords = sentence_coords[0], sentence_coords[1], sentence

    # case 1: trigger words ("means", "shall mean", ...) followed by a term
    for item in TRIGGER_WORDS_PTN_RE.finditer(sentence):
        result.update(
            regex_matches_to_word_coords(EXTRACT_PTN_RE, item.group(),
                                         item.start() + sent_start))

    # case 3: bare noun-phrase candidates, minus anti-pattern and pronoun hits
    mts = regex_matches_to_word_coords(NOUN_PTN_RE, sentence, sent_start)
    mts = [i for i in mts if not NOUN_ANTI_PTN_RE.fullmatch(i[0])]
    mts = [
        m for m in mts
        if m[0].lower().strip(' ,;.') not in EnLanguageTokens.pronouns
    ]
    if len(mts) > 0:
        result.update(mts)

    # cases 2, 4, 5, 6: quoted definitions — only searched when at least one
    # trigger is present; the "break" runs the inner loop exactly once
    for _ in TRIGGER_QUOTED_DEFINITION_RE.finditer(sentence):
        for quoted_definition_re in QUOTED_DEFINITION_RE:
            result.update(
                regex_matches_to_word_coords(quoted_definition_re, sentence,
                                             sent_start))
        break

    # make definitions out of entries
    for term, start, end in result:
        # strip paired symbols (quotes/braces), then introductory words;
        # trim_defined_term returns (term, start, end, was_quoted)
        term_cleared = TextBeautifier.strip_pair_symbols((term, start, end))
        term_cleared = trim_defined_term(term_cleared[0], term_cleared[1],
                                         term_cleared[2])
        was_quoted = term_cleared[3]
        if PICK_DEFINITION_FROM_QUOTES:
            term, start, end = term_cleared[0], term_cleared[1], term_cleared[2]
        if not term_cleared[0]:
            continue

        term, start, end = TextBeautifier.unify_quotes_braces_coords(
            term, start, end)

        # check the term is not empty
        if len(term.strip(PUNCTUATION_STRIP_STR)) == 0:
            continue

        # returns [('word', 'token', (word_start, word_end)), ...]
        term_pos = list(SpanTokenizer.get_token_spans(term))
        if does_term_are_service_words(term_pos):
            continue

        term_wo_intro = IntroductoryWordsDetector.remove_term_introduction(
            term, term_pos)
        if term_wo_intro != term:
            # NOTE(review): strip_pair_symbols is called above with a
            # (term, start, end) tuple but here with a bare string —
            # presumably it accepts both; confirm against its definition
            term = TextBeautifier.strip_pair_symbols(term_wo_intro)
        if not term:
            continue

        # check the term is not too long; quoted terms get a higher cap
        max_words_per_definition = MAX_TERM_TOKENS
        if was_quoted:
            max_words_per_definition = MAX_QUOTED_TERM_TOKENS

        # count non-separator words in the cleared term
        words_in_term = sum(
            1 for w in word_processor.split_text_on_words(term_cleared[0])
            if not w.is_separator)
        # several quoted phrases inside one candidate may hold several
        # definitions, so the token budget scales with the quote count
        quotes_in_text = get_quotes_count_in_string(term_cleared[0])
        possible_definitions = quotes_in_text // 2 if quotes_in_text > 1 else 1
        possible_tokens_count = max_words_per_definition * possible_definitions
        if words_in_term > possible_tokens_count:
            continue

        # a single candidate may contain several definitions — split them
        split_definitions_lst = split_definitions_inside_term(
            term, sentence_coords, start, end)

        for definition, s, e in split_definitions_lst:
            definition, s, e = TextBeautifier.strip_pair_symbols(
                (definition, s, e))
            definitions.append(DefinitionCaught(definition, sentence, (s, e,)))

    return definitions
def test_split_simplest_case(self):
    """Smoke test: a simple sentence produces more than three token spans."""
    sentence = 'John was named after his dog'
    span_count = sum(1 for _ in SpanTokenizer.get_token_spans(sentence))
    self.assertGreater(span_count, 3)
def test_split_with_quotes(self):
    """Smoke test: tokenizing a plain sentence yields more than three spans.

    NOTE(review): the name says "with_quotes" but the sample contains no
    quotes (and it shadows/duplicates another test of the same name) —
    consider renaming to match the actual fixture.
    """
    text = 'John was named after his dog'
    spans = list(SpanTokenizer.get_token_spans(text))
    self.assertGreater(len(spans), 3)
            # equal when the other object's members() match ours
            return self.members() == other.members()
        else:
            return False

    def __hash__(self):
        # hash must agree with __eq__ above: both are based on members()
        return hash(self.members())


class NGramType:
    """Position of an n-gram relative to an address (enumeration constants)."""
    OTHER = 0         # not part of an address
    ADDR_START = 1    # first n-gram of an address
    ADDR_MIDDLE = 2   # inner n-gram of an address
    ADDR_END = 3      # last n-gram of an address


# module-level shared tokenizer instance
TOKENIZER = SpanTokenizer()


def _safe_index(sentence, token, point, safe: bool = False):
    """Return the index of *token* in *sentence* at or after *point*.

    :param sentence: text to search in
    :param token: substring to locate
    :param point: position to start searching from
    :param safe: when True, return None instead of raising if not found
    :raises ValueError: when the token is absent and safe is False; the
        re-raised error includes the sentence and search position for context
    """
    try:
        return sentence.index(token, point)
    except ValueError:
        if safe:
            return None
        else:
            raise ValueError(f'Substring "{token}" not found in:\n'
                             f'"{sentence}"\n'
                             f'Search start pos: {point}')


def align_tokens(tokens, sentence):
def test_positive(self):
    """The introductory phrase "so called" should be stripped from the term."""
    term = 'so called "champerty\''
    pos_spans = list(SpanTokenizer.get_token_spans(term))
    stripped = IntroductoryWordsDetector.remove_term_introduction(
        term, pos_spans)
    self.assertEqual('"champerty\'', stripped)
def test_negative_combined(self):
    """A capitalized multi-word term with no introductory phrase is unchanged."""
    term = 'Combined EDITT Deficit Alpha Beta Gamma Cappa'
    pos_spans = list(SpanTokenizer.get_token_spans(term))
    stripped = IntroductoryWordsDetector.remove_term_introduction(
        term, pos_spans)
    self.assertEqual(term, stripped)
def test_negative(self):
    """A term without any introductory words passes through untouched."""
    term = 'Physically Completed'
    pos_spans = list(SpanTokenizer.get_token_spans(term))
    stripped = IntroductoryWordsDetector.remove_term_introduction(
        term, pos_spans)
    self.assertEqual(term, stripped)