Esempio n. 1
0
def test_get_candidate_fragments():
    """Test extraction of maximal longform candidate from text
    """
    # No excluded words: each text yields exactly its expected candidates
    texts = [text1, text2, text3, text4, text5]
    expected = [result1, result2, result3, result4, result5]
    for text, result in zip(texts, expected):
        candidates = [get_candidate(fragment)
                      for fragment in get_candidate_fragments(text, 'INDRA')]
        assert candidates == result

    # Pattern at the very start of the sentence gives an empty candidate
    fragments1 = get_candidate_fragments('(INDRA) is an ambiguous acronym',
                                         'INDRA')
    assert not get_candidate(fragments1[0])
    # Pattern absent from the text gives no fragments at all
    assert not get_candidate_fragments(
        'Integrated Network'
        'and dynamical reasoning assembler', 'INDRA')

    # Excluded words are dropped from the extracted candidate
    fragments2 = get_candidate_fragments(text1, 'INDRA')
    candidate2 = get_candidate(fragments2[0], exclude=stopwords)
    assert candidate2 == ['dynamical', 'reasoning', 'assembler']

    # A fragment consisting only of excluded words gives an empty candidate
    fragments3 = get_candidate_fragments('Is (INDRA) ambiguous?', 'INDRA')
    assert not get_candidate(fragments3[0], exclude=stopwords)
Esempio n. 2
0
    def process_texts(self, texts):
        """Update longform candidate scores from a corpus of texts

        Gathers co-occurrence statistics over a corpus of texts to compute
        likelihood scores for candidate longforms associated to the
        shortform. The method is online: it may be called repeatedly with
        new batches of text, so previously trained AdeftMiners can be
        updated when new content becomes available.

        Parameters
        ----------
        texts : list of str
            A list of texts
        """
        for doc in texts:
            # Longform candidates come from a window of text preceding
            # each defining pattern
            for frag in get_candidate_fragments(doc, self.shortform,
                                                self.window):
                if not frag:
                    continue
                candidate, _ = get_candidate(frag)
                self._add(candidate)
        # Any cached score computations are now stale
        self._alignment_scores_computed = False
        self._scores_propagated = False
Esempio n. 3
0
    def recognize(self, text):
        """Find longforms in text by searching for defining patterns (DPs)

        Parameters
        ----------
        text : str
            Sentence where we seek to disambiguate shortform

        Returns
        -------
        results : list
            List of recognition results, one per fragment whose candidate
            tokens match a longform in the trie; each result carries the
            matched longform and the corresponding surface text under the
            key 'longform_text'. Empty if no defining pattern is matched.
        """
        output = []
        for fragment in get_candidate_fragments(text,
                                                self.shortform,
                                                window=self.window):
            if not fragment:
                continue
            tokens, longform_map = get_candidate(fragment)
            # Look the candidate tokens up in the trie
            match = self._search(tokens)
            if not match:
                continue
            # Recover the surface text covering the recognized longform:
            # longform_map is keyed by the number of tokens it spans
            token_count = len(word_tokenize(match['longform']))
            longform_text = longform_map[token_count]
            match = self._post_process(match)
            match['longform_text'] = longform_text
            output.append(match)
        return output
Esempio n. 4
0
    def recognize(self, text):
        """Find longforms in text by searching for defining patterns (DPs)

        Parameters
        ----------
        text : str
            Sentence where we seek to disambiguate shortform

        Returns
        -------
        expansions : set of str
            Set of longforms corresponding to shortform in sentence if a
            defining pattern is matched; empty when no defining pattern
            is found
        """
        found = set()
        for fragment in get_candidate_fragments(text, self.shortform,
                                                window=self.window):
            if not fragment:
                continue
            # Tokenize the fragment, dropping excluded words, and look
            # the candidate up in the trie
            tokens = get_candidate(fragment, self.exclude)
            longform = self._search(tokens)
            if longform:
                found.add(self._post_process(longform))
        return found
Esempio n. 5
0
def test_get_candidate_fragments():
    """Test extraction of maximal longform candidate from text
    """
    cases = zip([text1, text2, text3, text4, text5],
                [result1, result2, result3, result4, result5])
    for text, expected in cases:
        candidates = [get_candidate(f)[0]
                      for f in get_candidate_fragments(text, 'INDRA')]
        assert candidates == expected

    # Pattern at the start of the sentence produces no fragments
    assert not get_candidate_fragments(' (INDRA) is an ambiguous acronym',
                                       'INDRA')
    # Pattern absent from the text produces no fragments
    assert not get_candidate_fragments('Integrated Network'
                                       'and dynamical reasoning assembler',
                                       'INDRA')
Esempio n. 6
0
    def strip_defining_patterns(self, text):
        """Return text with defining patterns stripped

        This is useful for training machine learning models where training
        labels are generated by finding defining patterns (DP)s. Models must
        be trained to disambiguate texts that do not contain a defining
        pattern.

        The output on the first sentence of the previous paragraph is
        "This is useful for training machine learning models where training
        labels are generated by finding DPs."

        Parameters
        ----------
        text : str
            Text to remove defining patterns from

        Returns
        -------
        stripped_text : str
            Text with defining patterns replaced with shortform
        """
        fragments = get_candidate_fragments(text, self.shortform)
        for fragment in fragments:
            # Each fragment is tokenized and its longform is identified.
            # word_tokenize yields (token, span) pairs here; the trie search
            # runs on the token text only, with punctuation removed
            tokens = word_tokenize(fragment)
            result = self._search([
                token for token, _ in tokens if token not in string.punctuation
            ])
            if result is None:
                # For now, ignore a fragment if its grounding has no longform
                # from the grounding map
                continue
            longform = result['longform']
            # Remove the longform from the fragment, keeping in mind that
            # punctuation is ignored when extracting longforms from text.
            # Scan backwards from the end of the fragment counting only
            # word-like tokens; j ends up marking the cut point before the
            # longform's first word
            num_words = len(longform.split())
            i = 0
            j = len(tokens) - 1
            while i < num_words:
                if re.match(r'\w+', tokens[j][0]):
                    i += 1
                j -= 1
                # Safety bound: never trim more than self.window words back
                if i > self.window:
                    break
            text = text.replace(fragment.strip(),
                                word_detokenize(tokens[:j + 1]))
        # Replace all instances of the parenthesized shortform with the bare
        # shortform. The shortform must be escaped: shortforms containing
        # regex metacharacters (e.g. '.', '+', parentheses) would otherwise
        # corrupt the pattern or match unintended text.
        stripped_text = re.sub(r'\(\s*%s\s*\)' % re.escape(self.shortform),
                               ' ' + self.shortform + ' ', text)
        # Normalize whitespace introduced by the substitutions
        stripped_text = ' '.join(stripped_text.split())
        return stripped_text
Esempio n. 7
0
    def process_texts(self, texts):
        """Update longform candidate scores from a corpus of texts

        Gathers co-occurrence statistics over a corpus of texts to compute
        scores for candidate longforms associated to the shortform. The
        method is online: additional texts can be processed after training
        has already taken place.

        Parameters
        ----------
        texts : list of str
            A list of texts
        """
        for doc in texts:
            # Longform candidates come from a window of text preceding
            # each defining pattern
            for frag in get_candidate_fragments(doc, self.shortform,
                                                self.window):
                if not frag:
                    continue
                self._add(get_candidate(frag, self.exclude))