def match(mention, dependencies, stem_pos=False, corelex=False):
    """Extract event strings for the head token of *mention* from a dependency parse.

    The head token is the last whitespace-separated word of *mention*.  Two kinds
    of events are produced from dependencies whose governor POS (when the head is
    the dependent) or dependent POS (when the head is the governor) is adjectival
    (``JJ*``) or verbal (``VB*``):

    - head-as-dependent: ``"(<dependent> <governor_pos> _)"``
    - head-as-governor:  ``"(_ <dependent_pos> <governor>)"``

    Events whose predicate word was marked by a ``neg`` relation are prefixed
    with ``"NEGATION "``.  (Both branches now use the same prefix; the governor
    branch previously emitted the inconsistent ``"NEG "``.)

    :param mention: mention text; only its last token is matched against.
    :param dependencies: iterable of dicts with keys ``relation``, ``governor``,
        ``governor_pos``, ``dependent``, ``dependent_pos``.
    :param stem_pos: if True, truncate the POS tag to its first two characters
        (e.g. ``VBD`` -> ``VB``).
    :param corelex: if True, replace the event word with its CoreLex class via
        ``lookup`` when a non-empty mapping exists.
        NOTE(review): assumes ``lookup`` is imported at module level — confirm.
    :returns: list of event strings (possibly empty).
    """
    mention = mention.split()
    head = mention[-1]
    events = []

    # First pass: collect the words negated by 'neg' relations.  For an
    # adjectival/verbal governor the governor itself is the negated predicate;
    # otherwise the dependent is recorded.
    negations = []
    for dependency in dependencies:
        if dependency['relation'] == 'neg':
            if (dependency['governor_pos'].startswith('JJ')
                    or dependency['governor_pos'].startswith('VB')):
                negations.append(dependency['governor'])
            else:
                negations.append(dependency['dependent'])

    # Second pass: build events.  Locals are used instead of mutating the
    # caller's dependency dicts (the original clobbered them in place).
    for dependency in dependencies:
        if dependency['dependent'] == head:
            pos = dependency['governor_pos']
            if pos.startswith('JJ') or pos.startswith('VB'):
                if stem_pos:
                    pos = pos[:2]
                word = dependency['dependent']
                if corelex:
                    mapped = lookup(word)
                    if mapped != '':
                        word = mapped
                event = "({} {} _)".format(word, pos)
                if dependency['governor'] in negations:
                    event = "NEGATION {}".format(event)
                events.append(event)
            continue
        if dependency['governor'] == head:
            # Skip dependents that are themselves part of the mention.
            if dependency['dependent'] not in mention:
                pos = dependency['dependent_pos']
                if pos.startswith('JJ') or pos.startswith('VB'):
                    if stem_pos:
                        pos = pos[:2]
                    word = dependency['governor']
                    if corelex:
                        mapped = lookup(word)
                        if mapped != '':
                            word = mapped
                    event = "(_ {} {})".format(pos, word)
                    if dependency['dependent'] in negations:
                        event = "NEGATION {}".format(event)
                    events.append(event)
    return events
def get_ngrams(feature_vector, tokens, prefix=None, n=1, add_null_tokens=False,
               binary_output=False, stem=False, use_lowercase=True,
               corelex=True, generalize=True):
    """Add n-gram features over *tokens* to *feature_vector* (in place).

    Each feature key is ``"<prefix>:<ngram>"``; the value is either ``True``
    (``binary_output``) or the n-gram's relative frequency among all n-grams.

    :param feature_vector: dict updated in place with the new features.
    :param tokens: sequence of token strings.
    :param prefix: feature-name prefix; defaults to ``__get_measure(n)``, and is
        prepended with ``'stem'`` when stemming is enabled.
    :param n: n-gram order; ``1`` uses the tokens directly.
    :param add_null_tokens: pad with ``'-nil-'`` sentinels so edge tokens appear
        in a full complement of n-grams.
    :param binary_output: store ``True`` instead of relative frequencies.
    :param stem: apply the (lazily created, module-global) Porter stemmer.
    :param use_lowercase: lowercase tokens first.
    :param corelex: map tokens to their CoreLex class when one exists.
    :param generalize: collapse any token containing a digit to ``'_NUMERIC_'``.
    :returns: None; *feature_vector* is mutated.
    """
    global stemmer
    if prefix is None:
        prefix = __get_measure(n)
    if stem:
        prefix = 'stem' + prefix
    if use_lowercase:
        tokens = [token.lower() for token in tokens]
    if stem:
        # Lazily build the stemmer once and cache it at module level.
        if stemmer is None:
            import nltk.stem
            stemmer = nltk.stem.PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    if corelex:
        # Aliased import: a bare `import corelex` would shadow the boolean
        # parameter of the same name.
        import corelex as corelex_module
        # Single lookup per token (the original called lookup() twice per word).
        mapped_tokens = []
        for word in tokens:
            mapped = corelex_module.lookup(word)
            mapped_tokens.append(mapped if mapped != '' else word)
        tokens = mapped_tokens
    if generalize:
        tokens = ["_NUMERIC_" if re.search(r'\d', word) else word
                  for word in tokens]
    if n == 1:
        n_grams = tokens
    else:
        if add_null_tokens:
            # Pad both ends so boundary tokens occur in n-1 extra n-grams.
            padding = ['-nil-'] * (n - 1)
            unigrams = padding + tokens + padding
        else:
            unigrams = tokens
        n_grams = [' '.join(unigrams[i:i + n])
                   for i in range(len(unigrams) - n + 1)]
    word_counts = Counter(n_grams)
    total_words = len(n_grams)
    for word, count in word_counts.items():
        if binary_output:
            feature_vector[prefix + ':' + word] = True
        else:
            feature_vector[prefix + ':' + word] = count / float(total_words)