from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.parse.dependencygraph import DependencyGraph


class NLTK_NLP:
    def __init__(self, ip_port):
        self.dep_parser = CoreNLPDependencyParser(url=ip_port)
        self.ner_parser = CoreNLPParser(url=ip_port, tagtype='ner')
        self.parser = CoreNLPParser(url=ip_port)
        self.pos_tagger = CoreNLPParser(url=ip_port, tagtype='pos')

    def generate_dependency_tree(self, sentence):
        '''e.g. "what is the name of the asteroid ?"'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return dependency_tree

    def generate_dependency_graph(self, sentence):
        '''Each node is a 7-tuple in CoNLL form, e.g.
        12 {'address': 12, 'word': '.', 'lemma': '.', 'ctag': '.', 'tag': '.',
            'feats': '', 'head': 1, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'punct'}
        where the values are ``word, lemma, ctag, tag, feats, head, rel``.'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return DependencyGraph(dependency_tree.to_conll(10))

    def generate_constituency_tree(self, sentence):
        '''input: one question'''
        tree_list = list(self.parser.raw_parse(sentence=sentence))
        return tree_list[0]

    def get_pos(self, sentence):
        '''What is the airspeed of an unladen swallow ?
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'),
         ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]'''
        pos_list = list(self.pos_tagger.tag(sentence.split()))
        # tokens = nltk.word_tokenize(sentence)
        # wordpos = nltk.pos_tag(tokens)
        return pos_list

    def get_pos_by_tokens(self, tokens):
        '''What is the airspeed of an unladen swallow ?'''
        pos_list = list(self.pos_tagger.tag(tokens))
        return pos_list

    def get_ner(self, sentence):
        '''april the 26th, 1882 is the birth date of which athletes ?
        [('april', 'DATE'), ('the', 'DATE'), ('26th', 'DATE'), (',', 'DATE'), ('1882', 'DATE'),
         ('is', 'O'), ('the', 'O'), ('birth', 'O'), ('date', 'O'), ('of', 'O'),
         ('which', 'O'), ('athletes', 'O'), ('?', 'O')]'''
        # tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        sequence_ner_tuple_list = self.ner_parser.tag(sentence.split())
        # Keep only the NER labels, in token order.
        sequence_ner_list = [ner_tag for _, ner_tag in sequence_ner_tuple_list]
        return sequence_ner_list

    def get_toknizer(self, sentence):
        return list(self.parser.tokenize(sentence))

    def find_phrases(self, tree, phrase_tag='NP'):
        return [subtree.leaves() for subtree in tree.subtrees(lambda t: t.label() == phrase_tag)]
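# --- Usage sketch (added; not part of the original snippet). It assumes a CoreNLP
# --- server is already running and reachable at http://localhost:9000.
def _demo_nltk_nlp():
    nlp = NLTK_NLP('http://localhost:9000')
    question = 'What is the airspeed of an unladen swallow ?'
    print(nlp.get_pos(question))                 # [('What', 'WP'), ('is', 'VBZ'), ...]
    tree = nlp.generate_constituency_tree(question)
    print(nlp.find_phrases(tree, 'NP'))          # leaves of every NP subtree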
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser


class CNLP:
    CNLPServerURL = 'http://localhost:9000'

    def __init__(self):
        self.parser = CoreNLPParser(url=self.CNLPServerURL)
        self.dep_parser = CoreNLPDependencyParser(url=self.CNLPServerURL)
        self.ner_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='ner')
        self.pos_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='pos')

    def getParse(self, sentence):
        # Pre-tokenized input goes through parse(); raw strings through raw_parse().
        if isinstance(sentence, list):
            return self.parser.parse(sentence)
        return self.parser.raw_parse(sentence)

    def getDepParse(self, sentence):
        if isinstance(sentence, list):
            return self.dep_parser.parse(sentence)
        return self.dep_parser.raw_parse(sentence)

    def getNERTags(self, sentence):
        if not isinstance(sentence, list):
            sentence = sentence.split()
        return self.ner_tagger.tag(sentence)

    def getPOSTags(self, sentence):
        # tag() returns (token, POS) pairs; parse()/raw_parse() would return a
        # constituency tree instead, which is not what the method name promises.
        if not isinstance(sentence, list):
            sentence = sentence.split()
        return self.pos_tagger.tag(sentence)
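# --- Usage sketch (added; not part of the original snippet). Assumes a CoreNLP
# --- server is listening on the CNLPServerURL default, http://localhost:9000.
def _demo_cnlp():
    nlp = CNLP()
    sent = 'Rami Eid is studying at Stony Brook University in NY'
    print(nlp.getNERTags(sent))       # [('Rami', 'PERSON'), ('Eid', 'PERSON'), ...]
    print(nlp.getPOSTags(sent))       # [('Rami', 'NNP'), ('Eid', 'NNP'), ...]
    tree, = nlp.getParse(sent)        # first constituency tree
    tree.pretty_print()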
def getNERs(ws):
    from nltk.parse.corenlp import CoreNLPParser
    from textcrafts.corenlp_api import parserURL
    parser = CoreNLPParser(url=parserURL, tagtype='ner')
    ts = parser.tag(ws)
    for t in ts:
        if t[1] != 'O':
            yield t
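# --- Usage sketch (added; not part of the original snippet). getNERs() takes a
# --- token list and yields only the (token, label) pairs CoreNLP marks as entities.
# print(list(getNERs('Rami Eid is studying at Stony Brook University in NY'.split())))
#   -> [('Rami', 'PERSON'), ('Eid', 'PERSON'), ...]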
import os

from nltk.parse.corenlp import CoreNLPParser


def get_postagger_for_criterion(criterion):
    # ini_path = "/stanford/postagger"
    # os.environ['STANFORD_PARSER'] = ini_path
    # os.environ['STANFORD_MODELS'] = ini_path
    # os.environ['CLASSPATH'] = ini_path
    st = CoreNLPParser(url=os.environ['STANFORD_NLP_TOOLS'], tagtype='pos')
    postagger_list = st.tag(criterion)
    return postagger_list
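# --- Usage sketch (added; not part of the original snippet). STANFORD_NLP_TOOLS is
# --- assumed to hold the CoreNLP server URL, and the criterion is a token list.
# os.environ['STANFORD_NLP_TOOLS'] = 'http://localhost:9000'
# print(get_postagger_for_criterion(['age', '>', '18', 'years']))
#   -> list of (token, POS) tuples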
from typing import Union

from nltk.parse.corenlp import CoreNLPParser
# The original snippet does not show where BasicTokenizer comes from; it is assumed
# to be the BERT-style pre-tokenizer, e.g. from the transformers package:
from transformers.models.bert.tokenization_bert import BasicTokenizer


class Lex_parser:
    def __init__(self, tag_id_initialized=False, tag_id=None, uncased=True):
        self.uncased = uncased
        self.tag_id_initialized = tag_id_initialized
        if tag_id_initialized:
            self.tag_to_id = tag_id
        else:
            self.tag_to_id = {"CLSSEP": 0, "UNKNOWN": 1}
        self.parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        self.basic_tokenizer = BasicTokenizer()

    def tokenize(self, sentence):
        return list(self.parser.tokenize(sentence))

    def convert_sentence_to_tags(self, sentence: Union[str, list]):
        if type(sentence) == str:
            if self.uncased:
                sentence = sentence.lower()
        else:
            sentence = " ".join(sentence)
            if self.uncased:
                sentence = sentence.lower()
        sentence = self.basic_tokenizer.tokenize(sentence)
        # print("sentence here,", sentence)
        # Restore the pronoun "I", which the POS tagger expects capitalized.
        sentence = list(map(lambda x: x.upper() if x == 'i' else x, sentence))
        tags = self.parser.tag(sentence)
        # print("sentence here,", sentence)
        # print("tags here", tags)
        # exit(-2)
        if not self.tag_id_initialized:
            # Grow the tag vocabulary on the fly.
            for tag in tags:
                if tag[1] not in self.tag_to_id:
                    self.tag_to_id[tag[1]] = len(self.tag_to_id)
        return tags

    def convert_tags_to_ids(self, tags):
        res = list(map(lambda x: self.tag_to_id[x[1]], tags))
        # print("to ids ==")
        # print(len(tags), tags)
        # print(len(res), res)
        return res

    def convert_sentence_to_ids(self, sentence: Union[str, list]):
        if not self.parser:
            self.parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        tags = self.convert_sentence_to_tags(sentence)
        ids = self.convert_tags_to_ids(tags)
        print(type(sentence), len(sentence), len(tags), len(ids))
        return list(ids)
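# --- Usage sketch (added; not part of the original snippet). Assumes a CoreNLP
# --- server at http://localhost:9000, as hard-coded in Lex_parser.__init__.
def _demo_lex_parser():
    lex = Lex_parser()
    tags = lex.convert_sentence_to_tags("Where was Barack Obama born ?")
    ids = lex.convert_tags_to_ids(tags)   # tag string -> integer id mapping
    print(tags)                           # [('where', 'WRB'), ('was', 'VBD'), ...]
    print(ids)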
from nltk.parse.corenlp import CoreNLPParser


def get_entity_of_sentence(sentence):
    ner_tagger = CoreNLPParser(url='http://0.0.0.0:9000', tagtype='ner')
    entity_list = ner_tagger.tag(sentence.split())
    return entity_list
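# --- Usage sketch (added; not part of the original snippet). The tagger returns one
# --- (token, entity-label) pair per whitespace token, with 'O' for non-entities.
# print(get_entity_of_sentence('Rami Eid is studying at Stony Brook University in NY'))
#   -> [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ...]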
from random import randint

from nltk.parse.corenlp import CoreNLPParser


class TrueFalseQuestions:
    def __init__(self, filename, port):
        self.filename = filename
        self.port = port
        self.prepare_similars()

    def prepare_similars(self):
        '''Collect every named-entity span in the source text, grouped by NER label.'''
        with open(self.filename, 'r') as f:
            text = f.read()
        self.parser = CoreNLPParser('http://localhost:' + str(self.port), tagtype='ner')
        tokens = text.split(' ')
        ner_tagged = self.parser.tag(tokens)
        self.all_ner_tags = {}
        last = 'O'
        sent = ''
        for w, tag in ner_tagged:
            if tag == 'O':
                # An entity span just ended: store it under its label.
                if last != 'O' and len(sent) > 0:
                    if last in self.all_ner_tags.keys():
                        self.all_ner_tags[last].append(sent)
                    else:
                        self.all_ner_tags[last] = [sent]
                sent = ''
                continue
            if tag == last:
                # Same label as the previous token: extend the current span.
                if len(sent) > 0:
                    sent += ' ' + w
                else:
                    sent += w
            else:
                # Label changed: close the previous span and start a new one.
                if last != 'O' and len(sent) > 0:
                    if last in self.all_ner_tags.keys():
                        self.all_ner_tags[last].append(sent)
                    else:
                        self.all_ner_tags[last] = [sent]
                sent = w
            last = tag
        # Deduplicate the collected spans for each label.
        for key, li in self.all_ner_tags.items():
            self.all_ner_tags[key] = list(set(li))

    def get_false_sentence(self, sentence):
        '''Return false variants of the sentence, each with one entity replaced by
        another entity of the same NER label collected from the source text.'''
        ner_tag = self.parser.tag(sentence.split(' '))
        last = 'O'
        tagged_sentence = []
        sent = ''
        index = 0
        for w, tag in ner_tag:
            index += 1
            if tag == last:
                if sent:
                    sent += ' ' + w
                else:
                    sent += w
            else:
                if last != 'O':
                    tagged_sentence.append((sent, last))
                sent = w
            if index == len(ner_tag) and tag != 'O':
                tagged_sentence.append((sent, tag))
            last = tag
        candidate_wrong = []
        for gap, tag in tagged_sentence:
            if tag not in self.all_ner_tags.keys():
                continue
            length = len(self.all_ner_tags[tag])
            if length < 2:
                continue
            # Pick a random replacement of the same label that differs from the original.
            while True:
                rep = self.all_ner_tags[tag][randint(0, length - 1)]
                if rep != gap:
                    break
            candidate_wrong.append(sentence.replace(gap, rep))
        return candidate_wrong
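# --- Usage sketch (added; not part of the original snippet). The file name and the
# --- sentence are placeholders; a CoreNLP server must be running on the given port.
def _demo_true_false():
    tfq = TrueFalseQuestions('chapter.txt', 9000)
    for false_sent in tfq.get_false_sentence('Albert Einstein was born in Ulm .'):
        print(false_sent)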
import csv
import time

from nltk import word_tokenize
from nltk.parse.corenlp import CoreNLPParser
from nltk.stem.snowball import SnowballStemmer
from nltk.tag import StanfordNERTagger
from numpy import array
from sklearn.preprocessing import OneHotEncoder


def extract_title_keywords():
    t1 = time.time()
    titles = open('titles_raw.txt', "r")
    # global ne_key
    global ne_value
    global feature_cols
    global stem_dict
    # ne_key = []
    # ne_value = ['LOCATION', 'PERSON', 'ORGANIZATION', 'MISC']
    prettt = [[], []]
    st = StanfordNERTagger(
        '/Users/yixuancui/Downloads/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/yixuancui/Downloads/stanford-ner-2018-10-16/stanford-ner.jar',
        encoding='utf-8')
    titles_c_list = []
    for title in titles:
        titles_c_list.append(title)
    print("Length of titles_a_list: ", len(titles_c_list))
    label_list = []
    counter = 0
    titles_d_list = []
    titles_ner1_list = []
    titles_ner2_list = []
    words_in_between = []
    v_list = []
    n_list = []
    r_list = []
    j_list = []
    d_list = []
    i_list = []
    stem_list = []
    for c in titles_c_list:
        text = c
        if not text:
            continue
        print(counter)
        # label = input(c)
        # label_list.append(int(label))
        label_list = [1] * 50 + [2] * 50 + [3] * 50 + [4] * 50 + [5] * 50 + [6] * 50
        v_count = 0
        n_count = 0
        r_count = 0
        j_count = 0
        d_count = 0
        i_count = 0
        stem_check = 0
        # TODO: the NE indices are extracted here; next, all nouns and verbs before
        # index 1, between indices 1 and 2, and after index 2 could be extracted,
        # with POS added; how to use the stemmer is still undecided.
        tokenized_text = word_tokenize(text)
        classified_text = st.tag(tokenized_text)
        pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        pos = list(pos_tagger.tag(tokenized_text))
        ss = SnowballStemmer("english")
        index = []
        for i in range(len(classified_text)):
            word = classified_text[i][0]
            tag = classified_text[i][1]
            if tag != 'O':
                # print('NE: ', word, ', Tag: ', tag, ', Index: ', i)
                # if word.lower() not in ne_key:
                #     ne_key.append(word.lower())
                index.append(i)
        pre_oh_classified_text_0 = [classified_text[i][0].lower() for i in index]
        pre_oh_classified_text_1 = [classified_text[i][1] for i in index]
        if not pre_oh_classified_text_0:
            pre_oh_classified_text_0 = ['None']
        if not pre_oh_classified_text_1:
            pre_oh_classified_text_1 = ['None']
        prettt[0].append(pre_oh_classified_text_0)
        prettt[1].append(pre_oh_classified_text_1)
        counter += 1
        # for i in range(len(classified_text)):
        #     if ss.stem(word) in ['acquir', 'buy', 'purchas', 'acquisit']:
        #         stem_check = 1
        #         break
        for i in range(len(classified_text)):
            if pos[i][1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
                v_count += 1
                word_stem = ss.stem(pos[i][0].lower())
                stem_list.append(word_stem)
                if word_stem not in stem_dict:
                    stem_dict.append(word_stem)
            if pos[i][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
                n_count += 1
                # word_stem = ss.stem(pos[i][0].lower())
                # stem_list.append(word_stem)
                # if word_stem not in stem_dict:
                #     stem_dict.append(word_stem)
            if pos[i][1] in ['RB', 'RBR', 'RBS', 'RP']:
                r_count += 1
            if pos[i][1] in ['JJ', 'JJR', 'JJS']:
                j_count += 1
            if pos[i][1] in ['DT']:
                d_count += 1
            if pos[i][1] in ['IN']:
                i_count += 1
        v_list.append(v_count)
        n_list.append(n_count)
        r_list.append(r_count)
        j_list.append(j_count)
        d_list.append(d_count)
        i_list.append(i_count)
        # stem_list.append(stem_check)
    # enc1 = OneHotEncoder(handle_unknown='ignore')
    # enc1.fit(array(ne_key).reshape(-1, 1))
    enc2 = OneHotEncoder(handle_unknown='ignore')
    enc2.fit(array(ne_value).reshape(-1, 1))
    enc3 = OneHotEncoder(handle_unknown='ignore')
    enc3.fit(array(stem_dict).reshape(-1, 1))
    onehot_list = []
    for j in range(len(titles_c_list)):
        # onehot_ner1 = enc1.transform(array(prettt[0][j]).reshape(-1, 1)).toarray().tolist()
        # nekeys = [sum(i) for i in zip(*onehot_ner1)]
        onehot_ner2 = enc2.transform(array(prettt[1][j]).reshape(-1, 1)).toarray().tolist()
        nevalues = [sum(i) for i in zip(*onehot_ner2)]
        onehot_ner3 = enc3.transform(array(stem_list[j]).reshape(-1, 1)).toarray().tolist()
        stemlists = [sum(i) for i in zip(*onehot_ner3)]
        # join = nekeys + nevalues + stemlists
        join = nevalues + stemlists
        onehot_list.append(join)
    # header_list = ne_key + ne_value + ["v_list", "n_list", "r_list", "j_list", "d_list", "i_list", "stem_list", "label"]
    # feature_cols = ne_key + ne_value + ["v_list", "n_list", "r_list", "j_list", "d_list", "i_list", "stem_list"]
    header_list = ne_value + stem_dict + [
        "v_list", "n_list", "r_list", "j_list", "d_list", "i_list", "label"
    ]
    feature_cols = ne_value + stem_dict + [
        "v_list", "n_list", "r_list", "j_list", "d_list", "i_list"
    ]
    ll = [
        onehot_list[i] + [v_list[i]] + [n_list[i]] + [r_list[i]] + [j_list[i]]
        + [d_list[i]] + [i_list[i]] + [label_list[i]]
        for i in range(len(onehot_list))
    ]
    # ll = [onehot_list[i] + [v_list[i]] + [n_list[i]] + [r_list[i]] + [j_list[i]]
    #       + [d_list[i]] + [i_list[i]] + [stem_list[i]] + [label_list[i]] for i in range(len(onehot_list))]
    # ll = [onehot_list[i] + [v_list[i]] + [n_list[i]] + [r_list[i]] + [j_list[i]]
    #       + [d_list[i]] + [i_list[i]] + [stem_list[i]] for i in range(len(onehot_list))]
    with open('DT2.csv', mode='w') as DT:
        dt_writer = csv.writer(DT, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        dt_writer.writerow(header_list)
        dt_writer.writerows(ll)
    with open('DT2.csv') as DT:
        dt_writer = csv.reader(DT, delimiter=',')
        counter = 0
        for row in dt_writer:
            counter += 1
            # print(row)
        print(counter)
    del titles_c_list
    del titles_d_list
    # del onehot_ner1
    del onehot_ner2
    # with open('ne_key.data', 'wb') as filehandle:
    #     # store the data as binary data stream
    #     pickle.dump(ne_key, filehandle)
    t2 = time.time()
    print(t2 - t1)
    print(ne_key)
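# --- Setup sketch (added; not part of the original snippet). extract_title_keywords()
# --- reads and mutates module-level globals, so definitions along these lines must
# --- exist before it is called; the exact values are assumptions.
ne_key = []
ne_value = ['LOCATION', 'PERSON', 'ORGANIZATION', 'MISC']
feature_cols = []
stem_dict = []

# extract_title_keywords()   # needs titles_raw.txt, the Stanford NER model/jar paths
#                            # hard-coded above, and a CoreNLP server on :9000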
if person != "": #wh_query=person if 'actor' in from_clause: wh_query += " and p.id=a.actor_id" elif 'director' in from_clause: wh_query += " and p.id=d.director_id" if 'oscar' in from_clause and 'movie' in from_clause: wh_query += " and m.id=o.movie_id " if 'oscar' in from_clause and 'person' in from_clause: wh_query += " o.person_id=p.id and " + osc if 'movie' in from_clause and 'actor' in from_clause: wh_query += " a.movie_id=m.id" query = select_clause + " from " + from_clause[1:] + " where " + wh_query print(query) return query qstn = "Which album by Swift was released in 2012?" parser = CoreNLPDependencyParser() depparse = next(parser.raw_parse(qstn)) for l in list(depparse.triples()): print(l) ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner') ner_tag = ner_tagger.tag(qstn.split()) generate_query(depparse, ner_tag, 'S')