from nltk.parse import CoreNLPDependencyParser


def depParser(sent):
    """Parse a sentence with a running CoreNLP server and return its dependency triples."""
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    result = dep_parser.raw_parse(sent)
    # raw_parse returns an iterator over parses; take the first dependency graph
    newResult = next(result)
    dep = list(newResult.triples())
    return dep
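# Minimal usage sketch (added for illustration; not part of the original snippet).
# It assumes a CoreNLP server is already running on localhost:9000.
triples = depParser('The quick brown fox jumps over the lazy dog.')
for governor, relation, dependent in triples:
    # each triple has the form ((word, POS), relation, (word, POS))
    print(governor, relation, dependent)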
    for i in graph2:
        counts2[i[1]] = counts2.get(i[1], 0) + 1
    all_deps = set(list(counts1.keys()) + list(counts2.keys()))
    diffs = 0
    for dep in all_deps:
        diffs += abs(counts1.get(dep, 0) - counts2.get(dep, 0))
    return diffs


########################################################################################################
# Main code
########################################################################################################

# initialize the dependency parser
chi_parser = CoreNLPDependencyParser('http://localhost:9001')

# use nltk treebank tokenizer and detokenizer
tokenizer = TreebankWordTokenizer()
detokenizer = TreebankWordDetokenizer()

# BERT initialization
berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
bertmodel.eval()

# initialize the Google translate client
translate_client = translate.Client()

print('initialized')
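# Illustrative sketch (not from the original script): one way the masked LM set up
# above could be exercised, namely scoring vocabulary candidates for a masked token.
# The example sentence and the masked word are hypothetical.
import torch

example_tokens = berttokenizer.tokenize('the weather is nice today')
masked_index = example_tokens.index('nice')
example_tokens[masked_index] = '[MASK]'
ids = berttokenizer.convert_tokens_to_ids(['[CLS]'] + example_tokens + ['[SEP]'])
with torch.no_grad():
    predictions = bertmodel(torch.tensor([ids]))[0]
# distribution over the vocabulary at the masked position (offset by 1 for [CLS])
mask_probs = torch.softmax(predictions[0, masked_index + 1], dim=-1)
print(berttokenizer.convert_ids_to_tokens(mask_probs.topk(5).indices.tolist()))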
# Setting up geography category
conn = part1.create_connection(os.path.join("Database", "WorldGeography.sqlite"))
if conn is not None:
    geog_db = part1.geography_db(conn)

# Getting the set of similar words from wordnet
geog_set = part1.create_lists(geog)
mov_set = part1.create_lists(movies_list)
music_set = part1.create_lists(music_list)

# Getting the tags
qstn, ner, pos = part1.tagging(filename)

parser = CoreNLPParser(url='http://localhost:9000')
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

# Printing parse tree and output
for i in range(0, len(qstn)):
    qtype = "YesNo"
    print("Question: ", qstn[i].strip())
    parsed = part1.parse_tree(qstn[i], parser)
    dep_parsed = return_deptree(qstn[i], dep_parser)
    # print(list(dep_parsed.triples()), parsed, ner[i])
    # continue
    if 'SBAR' in parsed[0:15]:
        qtype = "WH"
    category = part1.categorize(qstn[i], ner[i], pos[i], geog_set, mov_set, music_set,
                                mus_name, songs, geog_db)
    print(category)
    if category == "Music":
lap_14_train_txt = os.path.join(config['lap_14'], 'train.tsv')
lap_14_test_txt = os.path.join(config['lap_14'], 'test.tsv')
res_14_train_txt = os.path.join(config['res_14'], 'train.tsv')
res_14_test_txt = os.path.join(config['res_14'], 'test.tsv')
res_15_train_txt = os.path.join(config['res_15'], 'train.tsv')
res_15_test_txt = os.path.join(config['res_15'], 'test.tsv')
res_16_train_txt = os.path.join(config['res_16'], 'train.tsv')
res_16_test_txt = os.path.join(config['res_16'], 'test.tsv')

POLARITY_DICT = {'NEU': 0, 'POS': 1, 'NEG': 2}
POLARITY_DICT_REV = {v: k for k, v in POLARITY_DICT.items()}

depparser = CoreNLPDependencyParser(url='http://172.28.6.42:9000')


def load_data(txt_path, pair_path):
    """
    :param txt_path: the original annotation file path
    :param pair_path: the processed pair file path
    :return:
    """
    pairs = read_pickle(pair_path)
    data_list = []
    with open(txt_path, encoding='utf-8') as f:
        texts = f.readlines()
    assert len(pairs) == len(texts)
    for idx, (t, p) in enumerate(zip(texts, pairs)):
import pickle
from collections import defaultdict

import gensim
import numpy as np
from nltk.parse import CoreNLPParser, CoreNLPDependencyParser
from sklearn.feature_extraction import DictVectorizer
from textblob import TextBlob

# NER_TAGSET, POS_TAGSET, FP_PRO_LIST, TP_PRO_LIST and StanfordAnnotations are
# expected to be defined elsewhere in this project.


class FeatureExtractor:

    def __init__(self, w2v_path, corpus_dict_path, port=9000):
        # corenlp client
        self.parser = CoreNLPParser(url='http://localhost:' + str(port))
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:' + str(port))
        # w2v: pretrained word2vec vectors
        # (e.g. data/saved_models/GoogleNews-vectors-negative300.bin)
        self.word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
            w2v_path, binary=True)
        print('w2v model loaded')
        # training corpus for one hot features
        corpus_dict = pickle.load(open(corpus_dict_path, 'rb'))
        self.dep_tuple_vectorizer = DictVectorizer(sparse=False)
        self.dep_tuple_vectorizer = self.dep_tuple_vectorizer.fit(corpus_dict['dep_tuple'])
        self.unigram_vectorizer = DictVectorizer(sparse=False)
        self.unigram_vectorizer = self.unigram_vectorizer.fit(corpus_dict['unigram'])
        self.bigram_vectorizer = DictVectorizer(sparse=False)
        self.bigram_vectorizer = self.bigram_vectorizer.fit(corpus_dict['bigram'])
        self.trigram_vectorizer = DictVectorizer(sparse=False)
        self.trigram_vectorizer = self.trigram_vectorizer.fit(corpus_dict['trigram'])
        self.lexical_vectorizer = DictVectorizer(sparse=False)
        self.lexical_vectorizer = self.lexical_vectorizer.fit(corpus_dict['lexical'])

    def _get_case_features(self, sent_annotations, sentence):
        num_all_caps = 0
        for word_annotations in sent_annotations:
            if word_annotations.token.isupper():
                num_all_caps += 1
        if sentence.islower():
            is_sent_lower = 1
        else:
            is_sent_lower = 0
        if sent_annotations[0].token.isupper():
            is_first_word_caps = 1
        else:
            is_first_word_caps = 0
        return [num_all_caps, is_sent_lower, is_first_word_caps]

    def _get_dependency_tuples(self, sent_annotations):
        # (gov, typ, dep) (gov, typ) (typ, dep) (gov, dep)
        dependency_tuple_dict = defaultdict(int)
        for word_annotations in sent_annotations:
            gov = sent_annotations[int(word_annotations.head) - 1].pos
            typ = word_annotations.depRel
            dep = word_annotations.pos
            gov_typ_dep = '_'.join([gov, typ, dep])
            dependency_tuple_dict[gov_typ_dep] = 1
            gov_typ = '_'.join([gov, typ])
            dependency_tuple_dict[gov_typ] = 1
            typ_dep = '_'.join([typ, dep])
            dependency_tuple_dict[typ_dep] = 1
            gov_dep = '_'.join([gov, dep])
            dependency_tuple_dict[gov_dep] = 1
        return dependency_tuple_dict

    def _get_entity_features(self, sent_annotations):
        ner_tags = [0] * len(NER_TAGSET)
        person_mentions_total_len = 0
        for word_annotations in sent_annotations:
            if word_annotations.ner == 'O':
                continue
            if word_annotations.ner not in NER_TAGSET:
                continue
            else:
                index = NER_TAGSET.index(word_annotations.ner)
                ner_tags[index] = 1
            if word_annotations.ner == 'PERSON':
                person_mentions_total_len += len(word_annotations.token)
        person_mentions_avg_len = person_mentions_total_len * 1.0 / len(sent_annotations)
        return ner_tags + [person_mentions_avg_len]

    def _get_lexical_features(self, words):
        num_contractions = 0
        total_word_len = 0
        for word in words:
            if '\'' in word:
                num_contractions += 1
            total_word_len += len(word)
        avg_num_contractions = num_contractions * 1.0 / len(words)
        avg_word_len = total_word_len * 1.0 / len(words)
        # TODO: avg word-log frequency acc to Google Ngram
        # TODO: avg formality score using Pavlick & Nenkova (2015)
        return [avg_num_contractions, avg_word_len]

    def _get_ngrams(self, sent_annotations):
        # tokens = [w.token for w in sent_annotations]
        tokens = [w.lemma for w in sent_annotations]
        sentence = ' '.join(tokens)  # .decode('utf-8', 'ignore')
        blob = TextBlob(sentence)
        unigrams = tokens
        bigrams = blob.ngrams(n=2)
        trigrams = blob.ngrams(n=3)
        unigram_dict = defaultdict(int)
        bigram_dict = defaultdict(int)
        trigram_dict = defaultdict(int)
        for unigram in unigrams:
            unigram_dict[unigram] = 1
        for bigram in bigrams:
            bigram_dict['_'.join(bigram)] = 1
        for trigram in trigrams:
            trigram_dict['_'.join(trigram)] = 1
        return unigram_dict, bigram_dict, trigram_dict

    def _get_parse_features(self, stanford_parse_tree, sent_annotations):
        sent_len = len(sent_annotations)
        avg_depth = stanford_parse_tree.height() * 1.0 / sent_len
        lexical_production_dict = defaultdict(int)
        for production in stanford_parse_tree.productions():
            if production.is_lexical():
                continue
            lexical_production_dict[production] += 1
        avg_depth_feature = [avg_depth]
        return avg_depth_feature, lexical_production_dict

    def _get_POS_features(self, sent_annotations):
        pos_tag_ct = [0] * len(POS_TAGSET)
        for word_annotations in sent_annotations:
            try:
                pos_tag_ct[POS_TAGSET.index(word_annotations.pos)] += 1
            except ValueError:
                # print(word_annotations.pos)
                continue
        for i in range(len(pos_tag_ct)):
            pos_tag_ct[i] = pos_tag_ct[i] * 1.0 / len(sent_annotations)
        return pos_tag_ct

    def _get_punctuation_features(self, sentence):
        num_question_marks = sentence.count('?')
        num_ellipses = sentence.count('...')
        num_exclamations = sentence.count('!')
        return [num_question_marks, num_ellipses, num_exclamations]

    def _get_readability_features(self, sentence, words):
        num_words = len(words)
        num_chars = len(sentence) - sentence.count(' ')
        return [num_words, num_chars]

    def _get_subjectivity_features(self, sent_annotations, sentence):
        subjectivity_features = []
        fp_pros = 0
        tp_pros = 0
        for word_annotations in sent_annotations:
            if word_annotations.lemma in FP_PRO_LIST:
                fp_pros += 1
            if word_annotations.lemma in TP_PRO_LIST:
                tp_pros += 1
        subjectivity_features.append(fp_pros * 1.0 / len(sent_annotations))
        subjectivity_features.append(tp_pros * 1.0 / len(sent_annotations))
        polarity, subjectivity = TextBlob(sentence).sentiment
        subjectivity_features.append(float(np.sign(polarity)))
        subjectivity_features.append(subjectivity)
        return subjectivity_features

    def _get_word2vec_features(self, sent_annotations):
        word_vectors = []
        for word_annotations in sent_annotations:
            try:
                word_vector = self.word2vec_model[word_annotations.lemma]
                word_vectors.append(word_vector)
            except KeyError:
                # print(word_annotations.token)
                continue
        if len(word_vectors) == 0:
            avg_word_vectors = np.zeros(300)
        else:
            avg_word_vectors = np.transpose(np.mean(word_vectors, axis=0))
        return avg_word_vectors

    def _remove_less_frequent(self, dict, reference_dict, freq_cutoff):
        new_dict = defaultdict(int)
        for item, count in dict.items():
            if reference_dict[item] > freq_cutoff:
                new_dict[item] = count
        return new_dict

    def extract_features_pt16(self, sentence, sent_annotations, parse_tree):
        words = sentence.split()
        feature_set = []
        # case features
        case_features = self._get_case_features(sent_annotations, sentence)
        feature_set += case_features
        # dependency features
        dependency_tuple_dict = self._get_dependency_tuples(sent_annotations)
        # entity features
        entity_features = self._get_entity_features(sent_annotations)
        feature_set += entity_features
        # lexical features
        lexical_features = self._get_lexical_features(words)
        feature_set += lexical_features
        # ngram features
        unigram_dict, bigram_dict, trigram_dict = self._get_ngrams(sent_annotations)
        # parse features
        avg_depth_feature, lexical_production_dict = self._get_parse_features(
            parse_tree, sent_annotations)
        feature_set += avg_depth_feature
        # POS features
        pos_features = self._get_POS_features(sent_annotations)
        feature_set += pos_features
        # punctuation features
        punctuation_features = self._get_punctuation_features(sentence)
        feature_set += punctuation_features
        # readability features
        readability_features = self._get_readability_features(sentence, words)
        feature_set += readability_features
        # subjectivity features
        # subjectivity_features = self._get_subjectivity_features(sent_annotations, sentence)
        # feature_set += subjectivity_features
        # word2vec features
        word2vec_features = self._get_word2vec_features(sent_annotations)
        feature_set = np.concatenate((feature_set, word2vec_features), axis=0)
        # get one hot features
        dependency_tuple_feature = self.dep_tuple_vectorizer.transform(dependency_tuple_dict)
        unigram_feature = self.unigram_vectorizer.transform(unigram_dict)
        bigram_feature = self.bigram_vectorizer.transform(bigram_dict)
        trigram_feature = self.trigram_vectorizer.transform(trigram_dict)
        lexical_production_feature = self.lexical_vectorizer.transform(lexical_production_dict)
        feature_vectors = np.array([feature_set])
        feature_vectors = np.concatenate(
            (feature_vectors, dependency_tuple_feature, unigram_feature,
             bigram_feature, trigram_feature, lexical_production_feature), axis=1)
        return feature_vectors

    def _transform_raw(self, sentence):
        sent_annotations = []
        for dependency in sentence['basicDependencies']:
            dep_idx = dependency['dependent']
            token = sentence['tokens'][dep_idx - 1]
            annotation = StanfordAnnotations(token['word'], token['lemma'], token['pos'],
                                             token['ner'], dependency['governor'],
                                             dependency['dep'])
            sent_annotations.append(annotation)
        return sent_annotations

    def extract_parse(self, s):
        """Easy, built-in parser from nltk."""
        tree_list = self.parser.raw_parse(s, outputFormat='penn')
        tree = next(tree_list)
        return tree

    def extract_annotations(self, s):
        """Needs some arm wrestling."""
        props = {'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'}
        raw_json = self.dep_parser.api_call(s, properties=props)
        sentence = raw_json['sentences'][0]
        return self._transform_raw(sentence)
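# Minimal usage sketch (illustrative, not from the original file). It assumes a CoreNLP
# server on localhost:9000, the GoogleNews word2vec binary, and a pickled corpus
# dictionary; the corpus_dict path below is hypothetical.
if __name__ == '__main__':
    extractor = FeatureExtractor('data/saved_models/GoogleNews-vectors-negative300.bin',
                                 'data/saved_models/corpus_dict.pkl')
    sent = 'The quick brown fox jumps over the lazy dog.'
    tree = extractor.extract_parse(sent)
    annotations = extractor.extract_annotations(sent)
    features = extractor.extract_features_pt16(sent, annotations, tree)
    print(features.shape)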
import pandas as pd
from nltk.parse import CoreNLPDependencyParser

parser = CoreNLPDependencyParser(url='http://localhost:9001')

fin = 'oie_corpus/science_eval.oie'
fout = 'oie_corpus/science_eval.oie.correct.head'

with open(fin) as fi, open(fout, 'a') as fo:
    for line in fi:
        data = line.strip().split('\t')
        sent = data[0]
        try:
            parse, = parser.raw_parse(sent)
        except:
            continue
        df = pd.DataFrame([x.split('\t') for x in parse.to_conll(3).split('\n')],
                          columns=['word', 'pos', 'depth'])
        line = line.rstrip()
        line += '\t' + '<SYN_HEAD>'
        args = data[1:]
        word_list = list(df['word'])
        depth_list = list(df['depth'])
        for arg in args:
            arg = arg.split(' ')
            for i, w in enumerate(word_list):
                if word_list[i:i + len(arg)] == arg:
                    print(arg)
                    candidate_words = word_list[i:i + len(arg)]