Example #1
0
 def extract(self, taggedTerms):
     """See interfaces.ITermExtractor

     Scan ``taggedTerms`` -- a list of ``(term, tag, norm)`` triples -- with
     a two-state machine and count candidate terms.

     A run is opened in the SEARCH state by a token whose tag starts with
     ``'N'`` or by a capitalized ``'JJ'`` token, and it continues in the
     NOUN state for as long as tags start with ``'N'``.  Every token of a
     run is handed to ``_add``; a run of two or more tokens is additionally
     counted as one space-joined composite term.  A run that is still open
     when the input ends is counted as well.

     Note: ``taggedTerms`` is consumed -- the list is empty on return.

     Returns a list of ``(word, occurrences, strength)`` tuples for the
     terms accepted by ``self.filter``; ``strength`` is the number of
     whitespace-separated words in ``word``.
     """
     terms = {}

     def count_composite(run):
         # Every token of a run has already been handed to _add; only runs
         # of two or more tokens are also recorded as a composite term.
         if len(run) > 1:
             phrase = ' '.join(word for word, _norm in run)
             terms[phrase] = terms.get(phrase, 0) + 1

     # Phase 1: A little state machine is used to build simple and
     # composite terms.
     # NOTE(review): _add is defined elsewhere; it presumably appends the
     # token to ``multiterm`` and updates ``terms`` -- confirm.
     multiterm = []
     state = SEARCH
     for term, tag, norm in taggedTerms:
         is_noun = tag.startswith('N')
         # term[:1] instead of term[0]: an empty term is simply not
         # capitalized rather than an IndexError.
         if state == SEARCH and (
                 is_noun or (tag == 'JJ' and term[:1].isupper())):
             state = NOUN
             _add(term, norm, multiterm, terms)
         elif state == NOUN and is_noun:
             _add(term, norm, multiterm, terms)
         elif state == NOUN:
             # First non-noun token after a run: close the run.
             state = SEARCH
             count_composite(multiterm)
             multiterm = []
     # The input may end in the middle of a run; without this the last
     # composite term of the text would be lost.
     count_composite(multiterm)
     # Keep the historical side effect (the list used to be drained with
     # pop(0), which is quadratic) by clearing it in one step.
     del taggedTerms[:]
     # Phase 2: Only select the terms that fulfill the filter criteria.
     # Also create the term strength.
     selected = []
     for word, occur in terms.items():
         strength = len(word.split())
         if self.filter(word, occur, strength):
             selected.append((word, occur, strength))
     return selected
Example #2
0
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching ``wordnet`` constant.

    The decision is made on the leading letter of ``tag``: ``J`` gives
    ``wordnet.ADJ``, ``V`` gives ``wordnet.VERB``, ``N`` gives
    ``wordnet.NOUN`` and ``R`` gives ``wordnet.ADV``.  Any other tag,
    including an empty one, yields the empty string.
    """
    # (tag prefix, attribute name on ``wordnet``), checked in this order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wordnet, attr)
    return ''
Example #3
0
 def extract(self,
             taggedTerms,
             splits,
             KEEP_ORIGINAL_SPACING,
             RETURN_BIO=False):
     """See interfaces.ITermExtractor

     Warning: this is destructive to ``taggedTerms`` and ``splits`` (has
     side effects): both lists are empty when the method returns.

     ``taggedTerms`` is a list of ``(term, tag, norm)`` triples and
     ``splits`` a list of the same length; the i-th split is passed to
     ``_add`` together with the i-th token.  ``KEEP_ORIGINAL_SPACING`` is
     forwarded unchanged to ``_keepterm``.

     A run is opened in the SEARCH state by a token whose tag starts with
     ``'N'`` or by a capitalized ``'JJ'`` token, and continues in the NOUN
     state while tags start with ``'N'``.  Each finished run, including one
     still open at the end of the input, is handed to ``_keepterm``.

     Returns, when ``RETURN_BIO`` is true, a list with one label per input
     token: ``"B"`` for the token opening a run, ``"I"`` for the following
     tokens of that run and ``"O"`` for every other token.  Otherwise
     returns a list of ``(word, occurrences, strength)`` tuples for the
     terms accepted by ``self.filter``; ``strength`` is the number of
     whitespace-separated words in ``word``.

     Raises AssertionError if the two input lists differ in length.
     """
     # Raised explicitly instead of via ``assert`` so that the check is
     # not stripped when Python runs with -O.
     if len(taggedTerms) != len(splits):
         raise AssertionError(
             "taggedTerms and splits must have the same length")
     terms = {}
     bio_encoding = []
     # Phase 1: A little state machine is used to build simple and
     # composite terms.
     # NOTE(review): _add and _keepterm are defined elsewhere; _add
     # presumably appends the token to ``multiterm`` -- confirm.
     multiterm = []
     state = SEARCH
     for (term, tag, norm), split in zip(taggedTerms, splits):
         is_noun = tag.startswith('N')
         # term[:1] instead of term[0]: an empty term is simply not
         # capitalized rather than an IndexError.
         if state == SEARCH and (
                 is_noun or (tag == 'JJ' and term[:1].isupper())):
             state = NOUN
             _add(term, norm, split, multiterm, terms)
             # The opener of a run is always "B", whether it is a noun or
             # a capitalized adjective, so no "I" lacks a preceding "B".
             bio_encoding.append("B")
         elif state == NOUN and is_noun:
             _add(term, norm, split, multiterm, terms)
             bio_encoding.append("I")
         else:
             if state == NOUN:
                 # First non-noun token after a run: close the run.
                 state = SEARCH
                 if multiterm:
                     _keepterm(multiterm, terms, KEEP_ORIGINAL_SPACING)
                 multiterm = []
             bio_encoding.append("O")
     # Potentially keep the last term, if there is one. -jpt
     if multiterm:
         _keepterm(multiterm, terms, KEEP_ORIGINAL_SPACING)
     # Keep the documented side effect (the lists used to be drained with
     # pop(0), which is quadratic) by clearing them in one step.
     del taggedTerms[:]
     del splits[:]
     if RETURN_BIO:
         return bio_encoding
     # Phase 2: Only select the terms that fulfill the filter criteria.
     # Also create the term strength.
     selected = []
     for word, occur in terms.items():
         strength = len(word.split())
         if self.filter(word, occur, strength):
             selected.append((word, occur, strength))
     return selected
 def extract(self, taggedTerms, splits, KEEP_ORIGINAL_SPACING, RETURN_BIO=False):
     """See interfaces.ITermExtractor

     Warning: this is destructive to ``taggedTerms`` and ``splits`` (has
     side effects): both lists are empty when the method returns.

     Args:
         taggedTerms: list of ``(term, tag, norm)`` triples.
         splits: list of the same length as ``taggedTerms``; the i-th
             item is passed to ``_add`` together with the i-th token.
         KEEP_ORIGINAL_SPACING: forwarded unchanged to ``_keepterm``.
         RETURN_BIO: when true, return the per-token labels instead of
             the filtered terms.

     Returns:
         With ``RETURN_BIO``: one label per input token -- ``"B"`` for
         the token that opens a run, ``"I"`` for the remaining tokens of
         the run, ``"O"`` for all others.  Otherwise: a list of
         ``(word, occurrences, strength)`` tuples for the terms accepted
         by ``self.filter``, ``strength`` being the number of
         whitespace-separated words in ``word``.

     Raises:
         AssertionError: if ``taggedTerms`` and ``splits`` differ in
             length.
     """
     # An explicit raise keeps the check alive under ``python -O``, where
     # ``assert`` statements are removed.
     if len(taggedTerms) != len(splits):
         raise AssertionError("taggedTerms and splits must have the same length")
     terms = {}
     bio_encoding = []
     # Phase 1: A little state machine is used to build simple and
     # composite terms.  A run opens on a noun tag or a capitalized "JJ"
     # and lasts while the tags keep starting with "N".
     # NOTE(review): _add and _keepterm live outside this block; _add
     # presumably appends the token to ``multiterm`` -- confirm.
     multiterm = []
     state = SEARCH
     for tagged, split in zip(taggedTerms, splits):
         term, tag, norm = tagged
         noun_tag = tag.startswith("N")
         # Slicing keeps an empty term from raising IndexError.
         capitalized_adjective = tag == "JJ" and term[:1].isupper()
         if state == SEARCH and (noun_tag or capitalized_adjective):
             state = NOUN
             _add(term, norm, split, multiterm, terms)
             # Whatever opens the run gets "B"; an "I" must never appear
             # without a "B" before it.
             bio_encoding.append("B")
         elif state == NOUN and noun_tag:
             _add(term, norm, split, multiterm, terms)
             bio_encoding.append("I")
         elif state == NOUN:
             # The run ends on the first token that is not a noun.
             state = SEARCH
             if multiterm:
                 _keepterm(multiterm, terms, KEEP_ORIGINAL_SPACING)
             multiterm = []
             bio_encoding.append("O")
         else:
             bio_encoding.append("O")
     # Potentially keep the last term, if there is one. -jpt
     if multiterm:
         _keepterm(multiterm, terms, KEEP_ORIGINAL_SPACING)
     # Both inputs used to be drained token by token with pop(0), which
     # is quadratic; clearing them here preserves that side effect.
     del taggedTerms[:]
     del splits[:]
     if RETURN_BIO:
         return bio_encoding
     # Phase 2: Only select the terms that fulfill the filter criteria.
     # Also create the term strength.
     accepted = []
     for word, occur in terms.items():
         strength = len(word.split())
         if self.filter(word, occur, strength):
             accepted.append((word, occur, strength))
     return accepted