def contain_verb(tag, verb):
    '''Tell whether a tagged sentence uses `verb` as a verb.

    `tag` is an iterable of (word, pos) pairs.  A pair matches when its
    pos starts with 'VB' and util.lemmatize(word, 'v') equals `verb`.

    Returns True on the first match, False if there is none (including
    when `tag` is empty).
    '''
    # The pos test comes first so that only verb-tagged words are
    # handed to the lemmatizer.
    return any(
        pos.startswith('VB') and util.lemmatize(word, 'v') == verb
        for word, pos in tag
    )
def contain_query_words(sentence, query, query_pos='v'):
    '''Tell whether every word of `query` occurs in `sentence`.

    Each word of `sentence` is reduced with
    util.lemmatize(word, query_pos) and the whitespace-separated words
    of `query` are looked up among those results, so the query words
    are expected to already be in that reduced form.

    sentence  -- iterable of words (iterated once)
    query     -- string of whitespace-separated keywords; an empty,
                 None or all-whitespace query matches every sentence
    query_pos -- part-of-speech code passed through to util.lemmatize

    Returns True when all keywords are found, False otherwise.
    '''
    if not query:
        return True
    keywords = query.split()
    if not keywords:
        return True
    # Lemmatizing is the expensive step and does not depend on the
    # keyword, so do it once up front instead of once per keyword.
    # A set gives constant-time membership tests for the lookups below.
    lemmas = {util.lemmatize(word, query_pos) for word in sentence}
    return all(keyword in lemmas for keyword in keywords)
def canonical_tagged_sentence(tokens, tags):
    '''Build a normalized, space-joined string from tokens and their tags.

    `tokens` and `tags` are walked in parallel (zip, so the shorter one
    decides the length).  For each pair:

    * tokens whose tag is in the skip list below are dropped;
    * the token is replaced by
      util.lemmatize(token, util.tag2wnpos(tag));
    * a token tagged 'IN' becomes the literal 'IN';
    * "n't" becomes 'not';
    * "'re", and "'s" with a tag starting with 'VB', become 'be'.

    Returns the surviving tokens joined by single spaces, as a str.
    '''
    skipped_tags = ('DT', 'PRP$', 'TO', '.', '$', ':', ',', 'POS',
                    '``', "''", 'CD')

    canonical = []
    for raw, tag in zip(tokens, tags):
        if tag in skipped_tags:
            continue

        lemma = util.lemmatize(raw, util.tag2wnpos(tag))
        if tag == 'IN':
            lemma = 'IN'
        elif lemma == "n't":
            lemma = 'not'
        elif lemma == "'re" or (lemma == "'s" and tag.startswith('VB')):
            lemma = 'be'
        canonical.append(lemma)

    # str() is used because pytable doesn't support unicode, in case
    # the result is to be stored in a pytable.
    joined = ' '.join(canonical)
    return str(joined)