Code example #1
    def test_exact_entry(self):
        text = 'The Treebank tokenizer uses regular  expressions to tokenize text as in Penn Treebank.'
        phrases = ['regular  expressions']
        # the phrase, including its double space, must be located at its exact offset
        tagged = PhrasePositionFinder.find_phrase_in_source_text(
            text, phrases)[0]
        pos = tagged[1]
        self.assertEqual(text.find(phrases[0]), pos)
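
For orientation: find_phrase_in_source_text returns one (phrase, start, end) tuple per input phrase, which the later examples rely on. A minimal standalone sketch; the import path is an assumption and may differ between lexnlp versions:

# the import path below is an assumption; adjust it to your lexnlp version
from lexnlp.utils.lines_processing.phrase_finder import PhrasePositionFinder

text = 'The Treebank tokenizer uses regular  expressions to tokenize text.'
spans = PhrasePositionFinder.find_phrase_in_source_text(
    text, ['regular  expressions'])
phrase, start, end = spans[0]
# for an exact match, the half-open span reproduces the phrase
assert text[start:end] == phrase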
Code example #2
def split_definitions_inside_term(
        term: str,
        src_with_coords: Tuple[int, int, str],
        term_start: int,
        term_end: int) -> List[Tuple[str, int, int]]:
    """
    The whole phrase can be considered a definition ("MSRB", "we", "us" or "our"),
    but in fact the phrase can be a collection of definitions.
    Here we split the definition phrase into a list of definitions.

    The source string could be pre-processed, which is why we search for each
    sub-phrase's coordinates (PhrasePositionFinder).
    :param term: a definition or, possibly, a set of definitions ("MSRB", "we", "us" or "our")
    :param src_with_coords: the sentence (most likely) containing the term, plus its coordinates
    :param term_start: "term" start coordinate within the source sentence
    :param term_end: "term" end coordinate within the source sentence
    :return: [(definition, def_start, def_end), ...]
    """
    src_start = src_with_coords[0]
    src_text = src_with_coords[2]

    matches = [m.group() for m in SPLIT_SUBDEFINITIONS_RE.finditer(term)]
    if len(matches) < 2:
        matches = [term]

    match_coords = PhrasePositionFinder.find_phrase_in_source_text(
        src_text, matches, term_start - src_start, term_end - src_start)

    if len(match_coords) < len(matches):
        return [(term, term_start, term_end)]

    match_coords = [(m[0], m[1] + src_start, m[2] + src_start)
                    for m in match_coords]

    return match_coords
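
A hypothetical call to make the coordinate bookkeeping concrete, assuming the module's dependencies (SPLIT_SUBDEFINITIONS_RE, PhrasePositionFinder) are in scope; the sentence, offsets, and expected output are illustrative only:

sentence = '"MSRB", "we", "us" or "our" refers to the Board.'
doc_offset = 100  # assumed position of the sentence within the document
src = (doc_offset, doc_offset + len(sentence), sentence)
term = '"MSRB", "we", "us" or "our"'
defs = split_definitions_inside_term(
    term, src, doc_offset, doc_offset + len(term))
# roughly: [('"MSRB"', 100, 106), ('"we"', 108, 112), ...]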
Code example #3
    def test_split_with_quotes(self):
        text = 'He took my heart in "East Atlanta"\n, nah-nah-nah'
        spans = list(SpanTokenizer.get_token_spans(text))
        # SpanTokenizer spans are (token, POS, start, end) with an inclusive end
        self.assertEqual(('"', '``', 20, 20), spans[5])
        self.assertEqual(('nah-nah-nah', 'JJ', 37, 47), spans[10])

        words = nltk.word_tokenize(text)
        tokens = nltk.pos_tag(words)
        phrases = [t[0] for t in tokens]

        spans_alt = PhrasePositionFinder.find_phrase_in_source_text(
            text, phrases)
        # PhrasePositionFinder spans are (token, start, end) with an exclusive end
        self.assertEqual(('``', 20, 21), spans_alt[5])
        self.assertEqual(('nah-nah-nah', 37, 48), spans_alt[10])
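
The second half of the test works because nltk's Treebank-style word_tokenize rewrites ASCII double quotes as `` and '', so the finder has to map those rewritten tokens back to source offsets. A quick illustration, assuming nltk and its tokenizer models are installed:

import nltk

# word_tokenize turns an opening double quote into `` and a closing one into ''
print(nltk.word_tokenize('He said "hi"'))
# ['He', 'said', '``', 'hi', "''"]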
Code example #4
def normalize_text_with_map(
        text: str,
        spaces_on_start_end: bool = True,
        spaces_after_dots: bool = True,
        lowercase: bool = True,
        use_stemmer: bool = False,
        simple_tokenization: bool = False) -> Tuple[str, List[int]]:
    """
    Almost like normalize_text, but also returns a source-to-result character index map:
    map[i] = I, where i is a character's coordinate within the source text and
                I is the same character's coordinate within the resulting text
    """
    src_dest_map = []  # type: List[int]
    if use_stemmer:
        tokens = get_stem_list(text, lowercase=lowercase)
    elif simple_tokenization:
        tokens = reg_space.split(text)
        if lowercase:
            tokens = [t.lower() for t in tokens]
    else:
        tokens = get_token_list(text, lowercase=lowercase)
    # [ (token, start, end,), ... ]
    entity_positions = PhrasePositionFinder.find_phrase_in_source_text(
        text, tokens)

    resulted = ''
    src_index, first_token = 0, True
    for tok, s, _e in entity_positions:
        if first_token or spaces_on_start_end:
            resulted += ' '
        first_token = False
        while src_index < s:
            src_dest_map.append(len(resulted) - 1)
            src_index += 1

        for c_index in range(len(tok)):
            c = tok[c_index]
            if spaces_after_dots and c == '.' and c_index > 0:
                resulted += ' '
            resulted += c
            src_dest_map.append(len(resulted) - 1)
            if spaces_after_dots and c == '.' and c_index < len(tok) - 1:
                resulted += ' '

            src_index += 1

    if spaces_on_start_end:
        resulted += ' '
    return resulted, src_dest_map
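
A usage sketch for the index map, assuming normalize_text_with_map and its helpers are importable and that get_token_list splits this input on whitespace; the default options add surrounding spaces and lowercase the result:

src = 'hello world'
normalized, char_map = normalize_text_with_map(src)
# char_map[i] is the index within `normalized` of the character that came
# from src[i], so coordinates can be translated from source to result
i = src.index('w')
assert normalized[char_map[i]] == 'w'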
Code example #5
    def test_tagging_non_uni_quotes(self):
        text = '(each an “Obligation” and collectively, the “Obligations”)'
        # ASCII-quoted phrases are matched against their curly-quoted originals
        tagged = PhrasePositionFinder.find_phrase_in_source_text(
            text, ['"Obligation"', '"Obligations"'], 0, 58)
        self.assertEqual((9, 44), (tagged[0][1], tagged[1][1]))
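
The trailing 0, 58 arguments appear to bound the search window, mirroring the relative-coordinate call in code example #2; that reading of the parameters is an assumption. A sketch (import as in the note after code example #1):

text = '(each an “Obligation” and collectively, the “Obligations”)'
# assumed semantics: match only within text[start:end]
spans = PhrasePositionFinder.find_phrase_in_source_text(
    text, ['"Obligations"'], 22, len(text))
# the span should start at 44, the second curly-quoted term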
Code example #6
    def test_similar_entries(self):
        text = 'aa aaa aaa aaaaa aa aaa aa'
        # 'aa' is searched after the 'aaa' match, so it resolves to 7, not 0
        tagged = PhrasePositionFinder.find_phrase_in_source_text(text, ['aaa', 'aa'])
        self.assertEqual((3, 7), (tagged[0][1], tagged[1][1]))
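
The expected (3, 7) shows the finder consumes the text left to right: 'aaa' matches at 3, and the search for 'aa' resumes after that match instead of restarting at 0. The practical consequence, sketched under that reading, is that phrases should be supplied in their order of appearance:

text = 'aa aaa aaa aaaaa aa aaa aa'
spans = PhrasePositionFinder.find_phrase_in_source_text(text, ['aaa', 'aa'])
# plain text.find('aa') would return 0; the sequential search yields 7
assert (spans[0][1], spans[1][1]) == (3, 7)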
Code example #7
    def test_corrupted_entry(self):
        text = 'The Treebank tokenizer uses regular(expressions) to tokenize text as in Penn Treebank.'
        phrases = ['regular expressions']
        # the phrase never occurs verbatim, so the reported start falls back to 0
        tagged = PhrasePositionFinder.find_phrase_in_source_text(text, phrases)[0]
        self.assertEqual(0, tagged[1])
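
'regular expressions' never occurs verbatim here (the text has 'regular(expressions)'), and the test expects the reported start to fall back to 0 rather than an exception being raised. If that fallback matters, the returned span is worth validating; a defensive sketch under that assumption:

text = 'The Treebank tokenizer uses regular(expressions) to tokenize text.'
phrase, start, end = PhrasePositionFinder.find_phrase_in_source_text(
    text, ['regular expressions'])[0]
if text[start:end] != phrase:
    start = end = -1  # treat the span as "not found"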
Code example #8
def get_company_annotations(
    text: str,
    strict: bool = False,
    use_gnp: bool = False,
    count_unique: bool = False,
    name_upper: bool = False,
) -> Generator[CompanyAnnotation, None, None]:
    """
    Find company names in text, optionally using the stricter article/prefix expression.
    :param parse_name_abbr:
    :param text:
    :param strict:
    :param use_gnp: use get_noun_phrases or NPExtractor
    :param name_upper: return company name in upper case.
    :param count_unique: return only unique companies - case insensitive.
    :return:
    """
    # skip if all text is in uppercase
    if text == text.upper():
        return
    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]

    unique_companies = {}  # type: Dict[Tuple[str, str], CompanyAnnotation]

    if COMPANY_TYPES_RE.search(text):
        # Iterate through sentences
        for s_start, _s_end, sentence in get_sentence_span_list(text):
            # skip if whole phrase is in uppercase
            if sentence == sentence.upper():
                continue
            if use_gnp:
                phrases = list(
                    get_noun_phrases(sentence,
                                     strict=strict,
                                     valid_punctuation=valid_punctuation))
            else:
                phrases = list(np_extractor.get_np(sentence))
            phrase_spans = PhrasePositionFinder.find_phrase_in_source_text(
                sentence, phrases)

            for phrase, p_start, _p_end in phrase_spans:
                if COMPANY_TYPES_RE.search(phrase):
                    # noinspection PyTypeChecker
                    for ant in nltk_re.get_companies(
                            phrase, use_sentence_splitter=False
                    ):  # type: CompanyAnnotation

                        if ant.name == ant.company_type or ant.name == ant.description:
                            continue
                        ant.coords = (ant.coords[0] + s_start + p_start,
                                      ant.coords[1] + s_start + p_start)

                        if name_upper:
                            ant.name = ant.name.upper()

                        if count_unique:
                            unique_key = (ant.name.lower() if ant.name else
                                          None, ant.company_type_abbr)
                            existing_result = unique_companies.get(unique_key)

                            if existing_result:
                                existing_result.counter += 1
                            else:
                                unique_companies[unique_key] = ant
                        else:
                            yield ant

        if count_unique:
            for company in unique_companies.values():
                yield company
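
A hypothetical driver for the generator above; the annotation attributes (name, company_type_abbr, coords, counter) follow the code, while the sample text is an assumption and only matches if COMPANY_TYPES_RE recognizes it:

sample = 'Services are provided by Acme Holdings, Inc. and its affiliates.'
for ant in get_company_annotations(sample, count_unique=True):
    # counter tracks how many times a unique (name, type) pair was seen
    print(ant.name, ant.company_type_abbr, ant.coords, ant.counter)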
Code example #9
File: utils.py  Project: stkyle/lexpredict-lexnlp
    def get_np_with_coords(self, text: str) -> List[Tuple[str, int, int]]:
        # re-anchor each extracted noun phrase to its character span in the source
        phrases = list(self.get_np(text))
        tagged_phrases = PhrasePositionFinder.find_phrase_in_source_text(
            text, phrases)
        return tagged_phrases
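
Since get_np(text) yields plain phrase strings, this wrapper simply re-anchors them to character offsets. A hypothetical usage; the extractor's construction is an assumption:

extractor = NPExtractor()  # constructor arguments, if any, are assumed defaults
for phrase, start, end in extractor.get_np_with_coords('The quick brown fox jumped.'):
    print(phrase, start, end)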
Code example #10
    def get_company_annotations(
        self,
        text: str,
        strict: bool = False,
        use_gnp: bool = False,
        count_unique: bool = False,
        name_upper: bool = False,
        banlist_usage: Optional[BanListUsage] = None
    ) -> Generator[CompanyAnnotation, None, None]:
        """
        Find company names in text, optionally using the stricter article/prefix expression.
        :param text:
        :param strict:
        :param use_gnp: use get_noun_phrases or NPExtractor
        :param name_upper: return company name in upper case.
        :param count_unique: return only unique companies - case insensitive.
        :param banlist_usage: a banlist or hints on using the default BL
        :return:
        """
        # skip if all text is in uppercase
        if text == text.upper():
            return
        banlist = self.get_company_banlist(banlist_usage)
        valid_punctuation = VALID_PUNCTUATION + ["(", ")"]
        unique_companies: Dict[Tuple[str, str], CompanyAnnotation] = {}

        if not self.company_types_re.search(text):
            return
        # iterate through sentences
        for s_start, _s_end, sentence in get_sentence_span_list(text):
            # skip if whole phrase is in uppercase
            if sentence == sentence.upper():
                continue
            if use_gnp:
                phrases = list(
                    get_noun_phrases(sentence,
                                     strict=strict,
                                     valid_punctuation=valid_punctuation))
            else:
                phrases = list(self.np_extractor.get_np(sentence))
            phrase_spans = PhrasePositionFinder.find_phrase_in_source_text(
                sentence, phrases)

            for phrase, p_start, _p_end in phrase_spans:
                if self.company_types_re.search(phrase):
                    ant: CompanyAnnotation
                    for ant in self.get_companies_re(
                            phrase, use_sentence_splitter=False):
                        if ant.name == ant.company_type or ant.name == ant.description:
                            continue
                        # check against banlist
                        if banlist:
                            if EntityBanListItem.check_list(ant.name, banlist):
                                continue
                        ant.coords = (ant.coords[0] + s_start + p_start,
                                      ant.coords[1] + s_start + p_start)

                        if name_upper:
                            ant.name = ant.name.upper()

                        if count_unique:
                            unique_key = (ant.name.lower() if ant.name else
                                          None, ant.company_type_abbr)
                            existing_result = unique_companies.get(unique_key)

                            if existing_result:
                                existing_result.counter += 1
                            else:
                                unique_companies[unique_key] = ant
                        else:
                            yield ant

        if count_unique:
            for company in unique_companies.values():
                yield company
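
Compared with code example #8, this variant adds a banlist filter: EntityBanListItem.check_list(ant.name, banlist) drops banned names before they are yielded or counted. The pattern in isolation, as a sketch (EntityBanListItem is assumed importable; the banlist comes from get_company_banlist):

def filter_banlisted(annotations, banlist):
    # drop any annotation whose name appears on the banlist
    for ant in annotations:
        if banlist and EntityBanListItem.check_list(ant.name, banlist):
            continue
        yield ant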