Example #1
def test_pattern_to_word_matching6(self):
    anword = AnnotatedWord(index=5,
                           word='baboons',
                           lemma='bongo',
                           pos='NN',
                           ner='O',
                           dependencies='cc-conj-d')
    # 'cc-conj-d' matches deps="cc-con*" but also matches the exclusion
    # exdeps="*-d", so the word must not match the pattern.
    pattern = pattern_pfx + '<word deps="cc-con*" exdeps="*-d"/>'
    tree = etree.fromstring(pattern)
    pattern_word = PatternWord(tree)
    self.assertEqual(
        False, PatternMatcher.word_matches_pattern(anword, pattern_word))
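The wildcard attributes in these patterns behave like shell-style globbing. A quick sketch with Python's fnmatch (an assumption about how PatternMatcher compares strings, not confirmed by the source) shows why this match fails:

from fnmatch import fnmatch

print(fnmatch('cc-conj-d', 'cc-con*'))  # True: the deps pattern matches
print(fnmatch('cc-conj-d', '*-d'))      # True: but the exclusion matches too,
                                        #       so the word fails the pattern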
Example #2
def test_pattern_to_word_matching1(self):
    anword = AnnotatedWord(index=7,
                           word='bongoes',
                           lemma='bongo',
                           pos='IN',
                           ner='O',
                           dependencies='cc-conj-d')
    # Exact POS match plus a wildcard lemma match ('bongo' vs "bon*"),
    # so the word satisfies the pattern.
    pattern = pattern_pfx + '<word pos="IN" lemma="bon*" max="1"/>'
    tree = etree.fromstring(pattern)
    pattern_word = PatternWord(tree)
    self.assertEqual(
        True, PatternMatcher.word_matches_pattern(anword, pattern_word))
Example #3
  def annotate(self, sentence):
    ''' Use the NLTK library to add basic NLP info to a sentence.
        Return an AnnotatedSentence. '''
    tokens = word_tokenize(sentence)
    pos_tagged_tokens = pos_tag(tokens)
    anno_words = []
    for i, (token, pos) in enumerate(pos_tagged_tokens):
      # The WordNet lemmatiser only needs a coarse POS: verb, or noun by default.
      lemma_pos = 'v' if pos[0].lower() == 'v' else 'n'
      word_lemma = self.lemmatiser.lemmatize(token, pos=lemma_pos)
      anno_words.append(
          AnnotatedWord(index=i, word=token, pos=pos, lemma=word_lemma))

    return AnnotatedSentence(anno_words)
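For reference, a standalone sketch of the same NLTK pipeline; it assumes self.lemmatiser is an nltk WordNetLemmatizer and that the required NLTK data (tokeniser models, tagger, WordNet) has been downloaded:

from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatiser = WordNetLemmatizer()  # assumed type of self.lemmatiser
for i, (token, pos) in enumerate(pos_tag(word_tokenize('The baboons play bongos'))):
    lemma_pos = 'v' if pos[0].lower() == 'v' else 'n'
    print(i, token, pos, lemmatiser.lemmatize(token, pos=lemma_pos))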
Example #4
def test_pattern_to_word_matching3(self):
    anword = AnnotatedWord(index=5,
                           word='baboons',
                           lemma='bongo',
                           pos='NNS',
                           ner='O',
                           dependencies='cc-conj-d')
    # pos="*" matches any tag and 'cc-conj-d' matches deps="cc*".
    pattern = pattern_pfx + '<word pos="*" deps="cc*"/>'
    tree = etree.fromstring(pattern)
    pattern_word = PatternWord(tree)
    self.assertEqual(
        True,
        PatternMatcher.word_matches_pattern(anword,
                                            pattern_word,
                                            verbose=True))
Example #5
  def annotate(self, sentence):
    ''' Uses the CoreNLP server to create an AnnotatedSentence from a string. '''
    annotated_data = json.loads(self.nlp.annotate(sentence))
    annotated_sentence = annotated_data['sentences'][0]
    anno_words = []
    for token in annotated_sentence['tokens']:
      dependencies = self._get_dependency_string(token['index'],
                                                 annotated_sentence['basicDependencies'])
      # Subtract 1 from the index: CoreNLP token indices are 1-based, ours are 0-based.
      anword = AnnotatedWord(index=token['index']-1,
                             word=token['word'],
                             lemma=token['lemma'],
                             pos=token['pos'],
                             ner=token['ner'],
                             dependencies=dependencies)
      anno_words.append(anword)

    return AnnotatedSentence(anno_words)
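A minimal sketch of the JSON this method consumes, fetched from the CoreNLP HTTP API directly; the server address and annotator list here are assumptions, not taken from the source:

import json
import requests

# Assumes a CoreNLP server is running locally on port 9000.
props = {'annotators': 'tokenize,ssplit,pos,lemma,ner,depparse',
         'outputFormat': 'json'}
resp = requests.post('http://localhost:9000/',
                     params={'properties': json.dumps(props)},
                     data='The baboons play bongos.'.encode('utf-8'))
for token in json.loads(resp.text)['sentences'][0]['tokens']:
    print(token['index'], token['word'], token['lemma'],
          token['pos'], token['ner'])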
Example #6
def get_reduced_sentence(patterns, annotated_words):
    ''' Replaces preprocessor patterns with a single classname word,
        e.g. "the book" would become "NOUN". '''
    skip_num = 0
    words = []
    index = 0
    dependencies = []
    for word in annotated_words:
        if skip_num > 0:
            skip_num -= 1
            dependencies.append(word.dependencies)
            continue

        # A non-empty dependencies list means words[-1] is a preprocessor
        # chunk; merge the collected dependencies and attach them to it.
        if len(dependencies) > 0:
            # When merging, governor and dependent dependencies of the same
            # type cancel out: a noun compound containing both compound-g and
            # compound-d has _internal_ dependencies, which don't matter and
            # may confuse other patterns. We only keep 'unresolved'
            # dependencies (a dependent or governor without its counterpart).
            dep_list = list(set(dependencies))
            reduced_dependencies = []
            for dep in dep_list:
                depname = dep.split('-')[0]
                if depname + '-g' in dep_list and depname + '-d' in dep_list:
                    # both found, ignore
                    continue
                else:
                    # 'unresolved' dependency, save this
                    reduced_dependencies.append(dep)

            # append the found dependencies to the preprocessed chunk
            words[-1].dependencies = ','.join(reduced_dependencies)
        # reset dependencies
        dependencies = []

        found = False
        for ptype, pattern_words in patterns:
            if found:
                break
            for pword in pattern_words:
                if found:
                    break
                if pword.index == word.index:
                    words.append(
                        AnnotatedWord(word=ptype.classname,
                                      index=index,
                                      lemma=word.lemma,
                                      pos='NULL'))
                    skip_num = len(pattern_words) - 1
                    found = True
                    dependencies.append(word.dependencies)
                    break
        if not found:
            word.index = index
            words.append(word)

    # repair indices so they are consecutive again
    for index, word in enumerate(words):
        word.index = index
    return AnnotatedSentence(words)
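As a worked example of the cancellation rule, merging the dependency strings collected from one hypothetical multi-word chunk (the values are made up for illustration):

# Per-word dependency strings gathered for one preprocessed chunk.
dependencies = ['compound-g', 'compound-d', 'nsubj-d']
dep_list = list(set(dependencies))
reduced = []
for dep in dep_list:
    depname = dep.split('-')[0]
    if depname + '-g' in dep_list and depname + '-d' in dep_list:
        continue  # governor and dependent of the same type: internal, drop
    reduced.append(dep)
print(reduced)  # ['nsubj-d'] -- only the unresolved dependency survives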