Example 1
    def test_split_with_quotes(self):
        text = 'He took my heart in "East Atlanta"\n, nah-nah-nah'
        spans = list(SpanTokenizer.get_token_spans(text))
        self.assertEqual(('"', '``', 20, 20), spans[5])
        self.assertEqual(('nah-nah-nah', 'JJ', 37, 47), spans[10])

        words = nltk.word_tokenize(text)
        tokens = nltk.pos_tag(words)
        phrases = [t[0] for t in tokens]

        spans_alt = PhrasePositionFinder.find_phrase_in_source_text(
            text, phrases)
        self.assertEqual(('``', 20, 21), spans_alt[5])
        self.assertEqual(('nah-nah-nah', 37, 48), spans_alt[10])
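Judging by the assertions, the two APIs use different end-offset conventions: SpanTokenizer.get_token_spans yields (word, pos_tag, start, end) tuples with an inclusive end, while PhrasePositionFinder.find_phrase_in_source_text yields (word, start, end) with an exclusive end. A minimal sketch recovering the surface text from either span (offsets copied from the test above):

text = 'He took my heart in "East Atlanta"\n, nah-nah-nah'

# SpanTokenizer span: end offset is inclusive
word, tag, start, end = ('nah-nah-nah', 'JJ', 37, 47)
assert text[start:end + 1] == word

# PhrasePositionFinder span: end offset is exclusive
word, start, end = ('nah-nah-nah', 37, 48)
assert text[start:end] == word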
Example 2
    def test_split_plain(self):
        text = 'He took my heart in East Atlanta, nah-nah-nah'
        spans = list(SpanTokenizer.get_token_spans(text))
        self.assertGreater(len(spans), 3)
        self.assertEqual(('He', 'PRP', 0, 1), spans[0])
        self.assertEqual(('nah-nah-nah', 'JJ', 34, 44), spans[8])
Example 3
    def test_split_dont(self):
        text = "You don't do it, man!"
        spans = list(SpanTokenizer.get_token_spans(text))
        self.assertEqual(8, len(spans))
        self.assertEqual(17, spans[6][2])
        self.assertEqual(19, spans[6][3])
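The eight spans reflect NLTK-style contraction splitting: "don't" becomes the two tokens "do" and "n't". A small sketch of the presumed breakdown (only spans[6] is actually asserted above; the full token list is an assumption):

text = "You don't do it, man!"

# Presumed tokens: 'You', 'do', "n't", 'do', 'it', ',', 'man', '!'  -> 8 spans
# spans[6] covers 'man' at characters 17..19 (inclusive end offset):
assert text[17:19 + 1] == 'man'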
Example 4
def get_definition_list_in_sentence(
        sentence_coords: Tuple[int, int, str],
        decode_unicode=True) -> List[DefinitionCaught]:
    """
        Find possible definitions in natural language in a single sentence.
        :param sentence_coords: sentence, sentence start, end
        :param decode_unicode:
        :return:
        """
    definitions = []  # type: List[DefinitionCaught]
    sentence = sentence_coords[2]
    # unify quotes and braces
    # replace excess braces with ' ' so the str length will remain the same
    sentence = TextBeautifier.unify_quotes_braces(sentence,
                                                  empty_replacement=' ')
    sent_start = sentence_coords[0]
    result = set()  # type: Set[Tuple[str, int, int]]

    # unidecode really transforms the string, e.g. it replaces “ with "
    if decode_unicode:
        sentence = unidecode.unidecode(sentence)
        sentence_coords = sentence_coords[0], sentence_coords[1], sentence

    # case 1
    for item in TRIGGER_WORDS_PTN_RE.finditer(sentence):
        result.update(
            regex_matches_to_word_coords(EXTRACT_PTN_RE, item.group(),
                                         item.start() + sent_start))

    # case 3
    mts = regex_matches_to_word_coords(NOUN_PTN_RE, sentence, sent_start)
    mts = [i for i in mts if not NOUN_ANTI_PTN_RE.fullmatch(i[0])]
    mts = [
        m for m in mts
        if m[0].lower().strip(' ,;.') not in EnLanguageTokens.pronouns
    ]
    if len(mts) > 0:
        result.update(mts)

    # cases 2, 4, 5, 6
    for _ in TRIGGER_QUOTED_DEFINITION_RE.finditer(sentence):
        for quoted_definition_re in QUOTED_DEFINITION_RE:
            result.update(
                regex_matches_to_word_coords(quoted_definition_re, sentence,
                                             sent_start))
        break

    # make definitions out of entries
    for term, start, end in result:
        term_cleared = TextBeautifier.strip_pair_symbols((term, start, end))
        term_cleared = trim_defined_term(term_cleared[0], term_cleared[1],
                                         term_cleared[2])
        was_quoted = term_cleared[3]

        if PICK_DEFINITION_FROM_QUOTES:
            term, start, end = \
                term_cleared[0], term_cleared[1], term_cleared[2]

        if not term_cleared[0]:
            continue

        term, start, end = TextBeautifier.unify_quotes_braces_coords(
            term, start, end)

        # check the term is not empty
        if len(term.strip(PUNCTUATION_STRIP_STR)) == 0:
            continue

        # term_pos is a list of ('word', 'pos_tag', word_start, word_end) tuples
        term_pos = list(SpanTokenizer.get_token_spans(term))
        if does_term_are_service_words(term_pos):
            continue

        term_wo_intro = IntroductoryWordsDetector.remove_term_introduction(
            term, term_pos)
        if term_wo_intro != term:
            term = TextBeautifier.strip_pair_symbols(term_wo_intro)
        if not term:
            continue

        # check the term is not too long
        max_words_per_definition = MAX_TERM_TOKENS
        if was_quoted:
            max_words_per_definition = MAX_QUOTED_TERM_TOKENS

        words_in_term = sum(
            1 for w in word_processor.split_text_on_words(term_cleared[0])
            if not w.is_separator)
        quotes_in_text = get_quotes_count_in_string(term_cleared[0])
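        # e.g. 4 quote characters allow for 2 quoted definitions, so the
        # token budget computed below is doubled accordingly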
        possible_definitions = quotes_in_text // 2 if quotes_in_text > 1 else 1
        possible_tokens_count = max_words_per_definition * possible_definitions
        if words_in_term > possible_tokens_count:
            continue

        split_definitions_lst = split_definitions_inside_term(
            term, sentence_coords, start, end)

        for definition, s, e in split_definitions_lst:
            definition, s, e = TextBeautifier.strip_pair_symbols(
                (definition, s, e))
            definitions.append(
                DefinitionCaught(definition, sentence, (s, e)))

    return definitions
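
A hypothetical call sketch (the sample sentence is made up; nothing about DefinitionCaught's attributes is assumed here). As the code above shows, the function takes a (start, end, text) tuple:

sentence = '"Term" means any word or phrase defined in this Agreement.'
found = get_definition_list_in_sentence((0, len(sentence), sentence))
for definition in found:
    # rely on DefinitionCaught's own string representation
    print(definition)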
Example 5
    def test_split_simplest_case(self):
        text = 'John was named after his dog'
        spans = list(SpanTokenizer.get_token_spans(text))
        self.assertGreater(len(spans), 3)

    def test_split_with_quotes(self):
        # text = 'He took my heart in "East Atlanta"\n, nah-nah-nah'
        # text = 'John also likes so called blue house at the end of the street.'
        text = 'John was named after his dog'
        spans = list(SpanTokenizer.get_token_spans(text))
        self.assertGreater(len(spans), 3)
Example 7
            return self.members() == other.members()
        else:
            return False

    def __hash__(self):
        return hash(self.members())
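
The fragment opens mid-method; it follows the common members()-based value-equality pattern, where a single tuple drives both __eq__ and __hash__. A generic sketch of the full pattern (the class and its fields are illustrative, not taken from the source):

class ValueObject:
    def __init__(self, first, second):
        self.first = first
        self.second = second

    def members(self):
        # the tuple that defines identity for both equality and hashing
        return self.first, self.second

    def __eq__(self, other):
        if isinstance(other, type(self)):
            return self.members() == other.members()
        return False

    def __hash__(self):
        return hash(self.members())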


class NGramType:
    OTHER = 0
    ADDR_START = 1
    ADDR_MIDDLE = 2
    ADDR_END = 3


TOKENIZER = SpanTokenizer()


def _safe_index(sentence, token, point, safe: bool = False):
    try:
        return sentence.index(token, point)
    except ValueError:
        if safe:
            return None
        else:
            raise ValueError(f'Substring "{token}" not found in:\n'
                             f'"{sentence}"\n'
                             f'Search start pos: {point}')
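
A small sketch of the helper's two behaviors (plain str.index semantics, plus an optional non-raising mode):

# the search starts at the given position, just like str.index
assert _safe_index('abc abc', 'abc', 4) == 4

# a missing substring returns None when safe=True instead of raising ValueError
assert _safe_index('abc abc', 'xyz', 0, safe=True) is None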


def align_tokens(tokens, sentence):
    def test_positive(self):
        term = 'so called "champerty\''
        term_pos = list(SpanTokenizer.get_token_spans(term))
        term_clear = \
            IntroductoryWordsDetector.remove_term_introduction(term, term_pos)
        self.assertEqual('"champerty\'', term_clear)

    def test_negative_combined(self):
        term = 'Combined EDITT Deficit Alpha Beta Gamma Cappa'
        term_pos = list(SpanTokenizer.get_token_spans(term))
        term_clear = \
            IntroductoryWordsDetector.remove_term_introduction(term, term_pos)
        self.assertEqual(term, term_clear)

    def test_negative(self):
        term = 'Physically Completed'
        term_pos = list(SpanTokenizer.get_token_spans(term))
        term_clear = \
            IntroductoryWordsDetector.remove_term_introduction(term, term_pos)
        self.assertEqual(term, term_clear)