def test_get_candidate_fragments():
    """Test extraction of maximal longform candidate from text
    """
    # Test with no excluded words
    cases = zip([text1, text2, text3, text4, text5],
                [result1, result2, result3, result4, result5])
    for sample_text, expected in cases:
        found = get_candidate_fragments(sample_text, 'INDRA')
        extracted = [get_candidate(frag) for frag in found]
        assert extracted == expected
    # Case where pattern is at start of the sentence: the candidate
    # extracted from the first fragment is empty
    start_frags = get_candidate_fragments('(INDRA) is an ambiguous acronym',
                                          'INDRA')
    assert not get_candidate(start_frags[0])
    # Case where pattern is not found: no fragments at all
    assert not get_candidate_fragments(
        'Integrated Networkand dynamical reasoning assembler', 'INDRA')
    # Test with excluded words: stopwords are dropped from the candidate
    excl_frags = get_candidate_fragments(text1, 'INDRA')
    stripped = get_candidate(excl_frags[0], exclude=stopwords)
    assert stripped == ['dynamical', 'reasoning', 'assembler']
    # A fragment made up entirely of stopwords yields no candidate
    question_frags = get_candidate_fragments('Is (INDRA) ambiguous?', 'INDRA')
    assert not get_candidate(question_frags[0], exclude=stopwords)
def process_texts(self, texts):
    """Update longform candidate scores from a corpus of texts

    Runs co-occurence statistics in a corpus of texts to compute
    likelihood scores for candidate longforms associated to the
    shortform. This is an online method, it can be run multiple times to
    process_texts multiple batches of text. This allows previously
    trained AdeftMiners to be updated when new content becomes
    available.

    Parameters
    ----------
    texts : list of str
        A list of texts
    """
    for text in texts:
        # longform candidates taken from a window of text before each
        # defining pattern
        fragments = get_candidate_fragments(text, self.shortform,
                                            self.window)
        for fragment in fragments:
            # get_candidate_fragments may yield empty fragments; skip them
            if fragment:
                # get_candidate returns (tokens, ...); only the token list
                # is accumulated into the candidate trie/counts via _add
                candidate, _ = get_candidate(fragment)
                self._add(candidate)
    # New counts invalidate any previously computed scores, so mark
    # derived state as stale until it is recomputed
    self._alignment_scores_computed = False
    self._scores_propagated = False
def recognize(self, text):
    """Find longforms in text by searching for defining patterns (DPs)

    Parameters
    ----------
    text : str
        Sentence where we seek to disambiguate shortform

    Returns
    -------
    results : list
        One entry per fragment whose candidate tokens matched a longform
        in the trie. Each entry is the post-processed search result (a
        dict-like object containing at least the key 'longform_text'
        with the surface text of the matched longform). Empty list if no
        defining patterns are found.
    """
    results = []
    fragments = get_candidate_fragments(text, self.shortform,
                                        window=self.window)
    for fragment in fragments:
        # Skip empty fragments produced by the fragment extractor
        if not fragment:
            continue
        # longform_map maps a token count to the corresponding span of
        # original text ending the fragment
        tokens, longform_map = get_candidate(fragment)
        # search for longform in trie
        result = self._search(tokens)
        # if a longform is recognized, add it to output list
        if result:
            longform = result['longform']
            # Recover the original surface text for the matched longform
            # from its token length
            num_tokens = len(word_tokenize(longform))
            longform_text = longform_map[num_tokens]
            result = self._post_process(result)
            result['longform_text'] = longform_text
            results.append((result))
    return results
def recognize(self, text):
    """Find longforms in text by searching for defining patterns (DPs)

    Parameters
    ----------
    text : str
        Sentence where we seek to disambiguate shortform

    Returns
    -------
    expansions : set of str
        Set of longforms corresponding to the shortform in the sentence
        when a defining pattern is matched. Empty set if no defining
        patterns are found.
    """
    found = set()
    for frag in get_candidate_fragments(text, self.shortform,
                                        window=self.window):
        # Fragment extraction can produce empty fragments; skip those
        if not frag:
            continue
        # Tokenize the fragment (minus excluded words) and look the
        # candidate up in the trie
        hit = self._search(get_candidate(frag, self.exclude))
        if hit:
            found.add(self._post_process(hit))
    return found
def test_get_candidate_fragments():
    """Test extraction of maximal longform candidate from text
    """
    samples = [text1, text2, text3, text4, text5]
    answers = [result1, result2, result3, result4, result5]
    for sample, answer in zip(samples, answers):
        frags = get_candidate_fragments(sample, 'INDRA')
        assert [get_candidate(f)[0] for f in frags] == answer
    # Case where pattern is at start of the sentence: no fragments found
    assert not get_candidate_fragments(' (INDRA) is an ambiguous acronym',
                                       'INDRA')
    # Case where pattern is not found: no fragments found
    assert not get_candidate_fragments(
        'Integrated Networkand dynamical reasoning assembler', 'INDRA')
def strip_defining_patterns(self, text):
    """Return text with defining patterns stripped

    This is useful for training machine learning models where training
    labels are generated by finding defining patterns (DP)s. Models must
    be trained to disambiguate texts that do not contain a defining
    pattern.

    The output on the first sentence of the previous paragraph is
    "This is useful for training machine learning models where training
    labels are generated by finding DPs."

    Parameters
    ----------
    text : str
        Text to remove defining patterns from

    Returns
    -------
    stripped_text : str
        Text with defining patterns replaced with shortform
    """
    fragments = get_candidate_fragments(text, self.shortform)
    for fragment in fragments:
        # Each fragment is tokenized and its longform is identified.
        # word_tokenize appears to return (token, metadata) pairs; only
        # the token text is used for the trie search.
        tokens = word_tokenize(fragment)
        result = self._search([token for token, _ in tokens
                               if token not in string.punctuation])
        if result is None:
            # For now, ignore a fragment if its grounding has no longform
            # from the grounding map
            continue
        longform = result['longform']
        # Remove the longform from the fragment, keeping in mind that
        # punctuation is ignored when extracting longforms from text.
        # Walk backwards from the end of the fragment counting word
        # tokens until num_words word tokens have been passed; j then
        # marks the last token to keep.
        num_words = len(longform.split())
        i = 0
        j = len(tokens) - 1
        while i < num_words:
            if re.match(r'\w+', tokens[j][0]):
                i += 1
            j -= 1
            # NOTE(review): this compares a word count against
            # self.window — presumably a safety bound against runaway
            # scans; confirm window is intended as a token limit here
            if i > self.window:
                break
        # Replace the fragment in the original text with the fragment
        # minus its trailing longform
        text = text.replace(fragment.strip(),
                            word_detokenize(tokens[:j + 1]))
    # replace all instances of parenthesized shortform with shortform
    stripped_text = re.sub(r'\(\s*%s\s*\)' % self.shortform,
                           ' ' + self.shortform + ' ',
                           text)
    # Normalize whitespace introduced by the substitutions
    stripped_text = ' '.join(stripped_text.split())
    return stripped_text
def process_texts(self, texts):
    """Update longform candidate scores from a corpus of texts

    Runs co-occurence statistics in a corpus of texts to compute scores
    for candidate longforms associated to the shortform. This is an
    online method, additional texts can be processed after training has
    taken place.

    Parameters
    ----------
    texts : list of str
        A list of texts
    """
    for document in texts:
        # Longform candidates come from a window of text before each
        # defining pattern
        pieces = get_candidate_fragments(document, self.shortform,
                                         self.window)
        for piece in pieces:
            # Skip empty fragments
            if not piece:
                continue
            self._add(get_candidate(piece, self.exclude))