def extract_patterns(self, config):
    # extract the ReVerb pattern and detect the presence of the passive voice
    patterns_bet_tags = Reverb.extract_reverb_patterns_ptb(self.bet_words)
    if len(patterns_bet_tags) > 0:
        self.passive_voice = config.reverb.detect_passive_voice(
            patterns_bet_tags)
        # forced hack: "'s" is always tagged as VBZ, i.e., (u"'s", 'VBZ'),
        # which makes ReVerb identify a spurious pattern; when that happens,
        # ignore the extracted pattern and fall back to the words vector
        if patterns_bet_tags[0][0] == "'s":
            self.bet_vector = self.construct_words_vectors(
                self.bet_words, config)
        else:
            self.bet_vector = self.construct_pattern_vector(
                patterns_bet_tags, config)
    else:
        self.bet_vector = self.construct_words_vectors(
            self.bet_words, config)

    # build vectors for the words before the first entity and the words
    # after the second entity
    if len(self.bef_words) > 0:
        self.bef_vector = self.construct_words_vectors(
            self.bef_words, config)
    if len(self.aft_words) > 0:
        self.aft_vector = self.construct_words_vectors(
            self.aft_words, config)
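# Illustrative check of the "'s" hack above (a sketch, not part of the
# original class): the treebank tagger used throughout this module tags
# the possessive clitic as VBZ, which ReVerb can mistake for the start of
# a verb-led pattern. Assumes the same NLTK tagger pickle is available
# locally.
def _demo_possessive_clitic_tagging():
    from nltk import word_tokenize
    from nltk.data import load
    tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    tokens = word_tokenize("Google's headquarters in Mountain View")
    # expect something like [..., (u"'s", 'VBZ'), ...] with this tagger
    print tagger.tag(tokens)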
import re
import StringIO

from nltk import bigrams, ngrams, word_tokenize
from nltk.data import load
from nltk.stem.wordnet import WordNetLemmatizer

# Reverb, Sentence, find_locations, regex_clean_simple and the constants
# MAX_TOKENS, MIN_TOKENS, CONTEXT_WINDOW and N_GRAMS_SIZE are defined
# elsewhere in the repo.


class FeatureExtractor:

    def __init__(self):
        self.tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
        self.reverb = Reverb()
        self.lmtzr = WordNetLemmatizer()
        self.aux_verbs = ['be', 'have']

    @staticmethod
    def extract_prepositions(context, shingles):
        # prepositions and the infinitival "to"
        for token in context:
            if token[1].startswith("IN") or token[1].startswith("TO"):
                shingles.write(token[0].encode("utf8").strip() + '_PREP ')

    def extract_verbs(self, context, shingles, context_tag):
        # one feature per non-auxiliary verb, suffixed with its PTB tag
        # (VB base form, VBD past tense, VBN past participle, VBP/VBZ
        # present, VBG gerund or present participle) and the context it
        # came from (BEF/BET/AFT)
        for token in context:
            if token[1].startswith("V") and \
                    self.lmtzr.lemmatize(token[0], 'v') not in self.aux_verbs:
                if token[1] in ("VB", "VBD", "VBN", "VBP", "VBZ", "VBG"):
                    shingles.write(token[0].encode("utf8").strip() + '_' +
                                   token[1] + '_' + context_tag + ' ')

    def extract_features(self, after, before, between, e1_type, e2_type):
        shingles = StringIO.StringIO()

        # add the entities types
        shingles.write(e1_type.encode("utf8").strip() + '_ENTITY1 ')
        shingles.write(e2_type.encode("utf8").strip() + '_ENTITY2 ')

        # relational pattern matched by ReVerb: a verb, followed by nouns,
        # adjectives, or adverbs, and ending with a preposition
        reverb_pattern = self.reverb.extract_reverb_patterns_tagged_ptb(
            between)
        if len(reverb_pattern) > 0:
            passive_voice = self.reverb.detect_passive_voice(reverb_pattern)
            pattern = '_'.join([t[0] for t in reverb_pattern])
            if passive_voice is True:
                pattern += '_RVB_PASSIVE'
            else:
                pattern += '_RVB'
            shingles.write(pattern.encode("utf8").strip() + ' ')

            # normalized (lemmatized) version of the ReVerb pattern
            pattern_normalized = ''
            for t in reverb_pattern:
                if t[1].startswith("V"):
                    pattern_normalized += self.lmtzr.lemmatize(t[0], 'v') + '_'
                else:
                    pattern_normalized += self.lmtzr.lemmatize(t[0]) + '_'
            pattern_normalized += 'RVB_NORM'
            shingles.write(pattern_normalized.encode("utf8").strip() + ' ')

        # verbs from all contexts, except auxiliary verbs
        self.extract_verbs(before, shingles, 'BEF')
        self.extract_verbs(between, shingles, 'BET')
        self.extract_verbs(after, shingles, 'AFT')

        # prepositions from the BET context
        self.extract_prepositions(between, shingles)

        # nouns from the BET context
        for t in between:
            if t[1] == 'NN':
                shingles.write(t[0].encode("utf8").strip() + '_NN_BET ')

        # character n-grams from all contexts
        bef_grams = self.extract_ngrams_chars(
            ' '.join([token[0] for token in before]), "BEF")
        bet_grams = self.extract_ngrams_chars(
            ' '.join([token[0] for token in between]), "BET")
        aft_grams = self.extract_ngrams_chars(
            ' '.join([token[0] for token in after]), "AFT")
        for shingle in bef_grams, bet_grams, aft_grams:
            shingles.write(shingle.encode("utf8").strip() + ' ')

        return shingles

    def process_index(self, sentence, e1, e2):
        sentence_no_tags = re.sub(regex_clean_simple, "", sentence)
        text_tokens = word_tokenize(sentence_no_tags)
        text_tagged = self.tagger.tag(text_tokens)
        assert len(text_tagged) == len(text_tokens)

        # extract the entities types from the XML-like tags around e1 and
        # e2, keeping only the 3-character tag name, e.g. ORG, LOC, PER
        e1_type = re.search(r'<[A-Z]+>' + e1 + '</[A-Z]+>', sentence).group(0)
        e2_type = re.search(r'<[A-Z]+>' + e2 + '</[A-Z]+>', sentence).group(0)
        e1_type = e1_type[1:4]
        e2_type = e2_type[1:4]

        e1_info = find_locations(e1, text_tokens)
        e2_info = find_locations(e2, text_tokens)
        for e1_b in e1_info[1]:
            for e2_b in e2_info[1]:
                distance = abs(e2_b - e1_b)
                if distance > MAX_TOKENS or distance < MIN_TOKENS:
                    continue
                # features for the first entity pair within the allowed
                # distance
                before = text_tagged[:e1_b][-CONTEXT_WINDOW:]
                between = text_tagged[e1_b + len(e1_info[0]):e2_b]
                after = text_tagged[e2_b + len(e2_info[0]):][:CONTEXT_WINDOW]
                return self.extract_features(after, before, between,
                                             e1_type, e2_type)

    def process_classify(self, line):
        sentence_no_tags = re.sub(regex_clean_simple, "", line)
        text_tokens = word_tokenize(sentence_no_tags)
        text_tagged = self.tagger.tag(text_tokens)
        assert len(text_tagged) == len(text_tokens)
        sentence = Sentence(line.strip(), MAX_TOKENS, MIN_TOKENS,
                            CONTEXT_WINDOW, self.tagger)
        relationships = []
        for rel in sentence.relationships:
            shingles = self.extract_features(rel.after, rel.before,
                                             rel.between, rel.e1_type,
                                             rel.e2_type)
            relationships.append((rel, shingles))
        return relationships

    @staticmethod
    def extract_ngrams_chars(text, context):
        # character n-grams with spaces replaced by '_', each suffixed
        # with the context it came from
        tmp = StringIO.StringIO()
        chrs = ['_' if c == ' ' else c for c in text]
        for g in ngrams(chrs, N_GRAMS_SIZE):
            tmp.write(''.join(g) + '_' + context + ' ')
        return tmp.getvalue()

    @staticmethod
    def extract_bigrams(text):
        tokens = word_tokenize(text)
        return [gram[0] + ' ' + gram[1] for gram in bigrams(tokens)]
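# Hypothetical usage sketch (not from the original repo): extract the
# feature shingles for the relationships found in one tagged sentence.
# Assumes the module-level names FeatureExtractor depends on
# (regex_clean_simple, Sentence, find_locations and the
# MAX_TOKENS/MIN_TOKENS/CONTEXT_WINDOW/N_GRAMS_SIZE constants) are
# defined, and that the NLTK tagger pickle is available.
def _demo_feature_extraction():
    extractor = FeatureExtractor()
    line = "<ORG>Google</ORG> is based in <LOC>Mountain View</LOC>."
    for rel, shingles in extractor.process_classify(line):
        # each relationship yields a space-separated bag of features:
        # ORG_ENTITY1, LOC_ENTITY2, a ReVerb pattern such as
        # is_based_in_RVB (or _RVB_PASSIVE if passive voice is detected),
        # plus verbs, prepositions, nouns, and character n-grams suffixed
        # with the context they came from
        print rel.e1_type, rel.e2_type
        print shingles.getvalue()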
# Reads the key=value parameters file, the seed files, and loads or
# builds the tf-idf model. Requires fileinput, os, cPickle,
# nltk.corpus.stopwords, and the Reverb and VectorSpaceModel classes
# defined elsewhere in the repo.
def __init__(self, config_file, seeds_file, negative_seeds,
             sentences_file, similarity, confidance):
    self.seed_tuples = set()
    self.negative_seed_tuples = set()
    self.e1_type = None
    self.e2_type = None
    self.stopwords = stopwords.words('english')
    self.threshold_similarity = similarity
    self.instance_confidance = confidance
    self.reverb = Reverb()

    # parse the parameters file, skipping comments and blank lines
    for line in fileinput.input(config_file):
        if line.startswith("#") or len(line) == 1:
            continue
        if line.startswith("wUpdt"):
            self.wUpdt = float(line.split("=")[1])
        if line.startswith("wUnk"):
            self.wUnk = float(line.split("=")[1])
        if line.startswith("wNeg"):
            self.wNeg = float(line.split("=")[1])
        if line.startswith("number_iterations"):
            self.number_iterations = int(line.split("=")[1])
        if line.startswith("use_RlogF"):
            # bool() on a non-empty string is always True, so parse the
            # value explicitly
            self.use_RlogF = \
                line.split("=")[1].strip().lower() in ("true", "1", "yes")
        if line.startswith("min_pattern_support"):
            self.min_pattern_support = int(line.split("=")[1])
        if line.startswith("max_tokens_away"):
            self.max_tokens_away = int(line.split("=")[1])
        if line.startswith("min_tokens_away"):
            self.min_tokens_away = int(line.split("=")[1])
        if line.startswith("context_window_size"):
            self.context_window_size = int(line.split("=")[1])
        if line.startswith("use_reverb"):
            self.use_reverb = line.split("=")[1].strip()
        if line.startswith("alpha"):
            self.alpha = float(line.split("=")[1])
        if line.startswith("beta"):
            self.beta = float(line.split("=")[1])
        if line.startswith("gamma"):
            self.gamma = float(line.split("=")[1])
            # gamma is expected to be the last of the three weights in
            # the parameters file
            assert self.alpha + self.beta + self.gamma == 1

    self.read_seeds(seeds_file)
    self.read_negative_seeds(negative_seeds)
    fileinput.close()

    print "\nConfiguration parameters"
    print "========================"
    print "Relationship Representation"
    print "e1 type              :", self.e1_type
    print "e2 type              :", self.e2_type
    print "context window       :", self.context_window_size
    print "max tokens away      :", self.max_tokens_away
    print "min tokens away      :", self.min_tokens_away
    print "use ReVerb           :", self.use_reverb
    print "\nVectors"
    print "alpha                :", self.alpha
    print "beta                 :", self.beta
    print "gamma                :", self.gamma
    print "\nSeeds:"
    print "positive seeds       :", len(self.seed_tuples)
    print "negative seeds       :", len(self.negative_seed_tuples)
    print "negative seeds wNeg  :", self.wNeg
    print "unknown seeds wUnk   :", self.wUnk
    print "\nParameters and Thresholds"
    print "threshold_similarity :", self.threshold_similarity
    print "instance confidence  :", self.instance_confidance
    print "min_pattern_support  :", self.min_pattern_support
    print "iterations           :", self.number_iterations
    print "iteration wUpdt      :", self.wUpdt
    print "\n"

    # load a cached tf-idf model if one exists; otherwise build it from
    # the sentences file and cache it to disk
    if os.path.isfile("vsm.pkl"):
        print "\nLoading tf-idf model from disk..."
        f = open("vsm.pkl", "r")
        self.vsm = cPickle.load(f)
        f.close()
    else:
        print "\nGenerating tf-idf model from sentences..."
        self.vsm = VectorSpaceModel(sentences_file, self.stopwords)
        print "\nWriting generated model to disk..."
        f = open("vsm.pkl", "wb")
        cPickle.dump(self.vsm, f)
        f.close()
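# Example of the plain key=value parameters file parsed by the loop in
# __init__ above (illustrative values, not from the original repo; the
# assert requires alpha + beta + gamma to sum to 1, with gamma appearing
# after alpha and beta):
#
#   wUpdt=0.5
#   wUnk=0.1
#   wNeg=2
#   number_iterations=3
#   use_RlogF=true
#   min_pattern_support=2
#   max_tokens_away=6
#   min_tokens_away=1
#   context_window_size=2
#   use_reverb=yes
#   alpha=0.2
#   beta=0.6
#   gamma=0.2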