def dependency(text):
    parser = StanfordDependencyParser(
        u"./stanford-ner/stanford-parser/stanford-parser.jar",
        u"./stanford-ner/stanford-parser/stanford-parser-3.6.0-models.jar",
        u"./stanford-ner/classifiers/englishPCFG.ser.gz")
    sentences = text.split('.')
    if sentences[-1] == '':
        sentences = sentences[:-1]
    adj = []
    center = []
    for sent in sentences:
        for punct in [',', ';', ':', '!', '~', '?']:
            sent = sent.replace(punct, '')
        rec = list(parser.parse(sent.split()))
        j = 0
        for row in rec[0].triples():
            print(row)
            if j == 0:
                center.append(str(row[0][0]))
            j += 1
            if str(row[1]).find('amod') != -1:
                adj.append(str(row[2][0]))
    for a in adj:
        print('111')
        print(a)
    print('222')
    print(center)
    return center, adj
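# A minimal usage sketch for dependency() above; the jar/model paths inside the function
# are assumed to exist locally. center collects the head word of each sentence's first
# triple, adj collects dependents attached via an 'amod' relation.
center, adj = dependency("The quick brown fox jumps over the lazy dog. It was a sunny day.")
print(center)  # one head word per sentence
print(adj)     # words attached via an 'amod' relation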
def parse_sentence(sentence):
    parser = StanfordDependencyParser(path_to_jar=PATH_TO_JAR,
                                      path_to_models_jar=PATH_TO_MODELS)
    trees = list(parser.parse(sentence))
    if not trees:
        return None
    parsed_tree = trees[0]
    return list(parsed_tree.triples())
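# A minimal usage sketch for parse_sentence() above. It assumes StanfordDependencyParser
# was imported from nltk.parse.stanford and that PATH_TO_JAR / PATH_TO_MODELS are
# module-level constants pointing at local copies of the parser jars (the paths below
# are placeholders, not taken from the original code). The input is already tokenized.
PATH_TO_JAR = './stanford-parser/stanford-parser.jar'            # assumed location
PATH_TO_MODELS = './stanford-parser/stanford-parser-models.jar'  # assumed location

triples = parse_sentence(['The', 'quick', 'brown', 'fox', 'jumps'])
if triples is not None:
    for head, rel, dep in triples:
        print(head, rel, dep)  # e.g. (('jumps', 'VBZ'), 'nsubj', ('fox', 'NN'))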
def get_parser(domain):
    if domain == 'laptop':
        suffix = '_2014.pkl'
    else:
        suffix = '_2016_2.pkl'
    filename = './pkl/data_' + domain + suffix
    save_filename = './pkl/dep_' + domain + suffix

    with open(filename, 'rb') as fr:
        data = pickle.load(fr)
    sents = data['raw_sentence']
    labels = data['labels']

    new_data = {}
    dep = []
    eng_parser = StanfordDependencyParser(
        model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    for sent in sents:
        res = list(eng_parser.parse(sent))
        sent_dep = []
        for row in res[0].triples():
            print(row)
            sent_dep.append(row)
        dep.append(sent_dep)

    new_data['raw_sentence'] = sents
    new_data['dependency'] = dep
    new_data['labels'] = labels
    with open(save_filename, 'wb') as fw:
        pickle.dump(new_data, fw)
class DependencyParsing:
    """ Stanford dependency parsing """

    def __init__(self, path_to_jar, path_to_models_jar):
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

    def parse_sentences(self, sentences):
        """
        Dependency parsing of list of tokenized sentences using the stanford parser

        :param sentences: List of sentences. Each sentence is a list of tokens.
        :type sentences: list(list(str))
        :return: iterator of DependencyGraph objects
        :rtype: iterator
        """
        result = self.dependency_parser.parse_sents(sentences)
        # parse_sents yields an iterator of parses per input sentence;
        # take the best (first) parse of each so the return value matches the docstring
        return (next(parses) for parses in result)

    def parse_sentence(self, sentence):
        """
        Dependency parsing of a tokenized sentence using the stanford parser

        :param sentence: sentence as a list of tokens.
        :type sentence: list(str)
        :return: DependencyGraph object
        :rtype: nltk.DependencyGraph
        """
        result = self.dependency_parser.parse(sentence)
        return result.__next__()
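# A minimal usage sketch for the DependencyParsing wrapper above; the jar paths are
# placeholders for wherever stanford-parser.jar and its models jar live locally.
dp = DependencyParsing(path_to_jar='./stanford-parser.jar',
                       path_to_models_jar='./stanford-parser-models.jar')
graph = dp.parse_sentence(['The', 'dog', 'chased', 'the', 'cat'])
for head, rel, dep in graph.triples():
    print(head, rel, dep)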
def Dep_handler(self, sentence, parser):
    if parser == "spacy":
        try:
            import spacy, en_core_web_sm
        except ImportError:
            print("Can't import spacy")
        nlp = en_core_web_sm.load()
        doc = nlp(sentence)
        return_dict = {}
        for token in doc:
            return_dict[str(token.text)] = str(token.dep_)
        return return_dict
    elif parser == "nltk":
        try:
            import nltk
            from nltk.parse.stanford import StanfordDependencyParser
            os.environ["CLASSPATH"] = "./StanfordNLP/jars"
            os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"
        except ImportError:
            print("Can't import nltk")
        eng_parser = StanfordDependencyParser()
        res = list(eng_parser.parse(sentence.split()))
        return_dict = {}
        turn = True
        for row in res[0].triples():
            if row[0][0] not in return_dict and turn:
                return_dict[row[0][0]] = "ROOT"
                turn = False
            return_dict[row[2][0]] = row[1]
        return return_dict
def dependency_parser_nltk(word_lists, filename):
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordDependencyParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'
    node_file = shelve.open(filename)
    all_dependency_list = []
    for index, sentence in enumerate(word_lists):
        # Stored into all_dependency_list; the stored data type is a list of triples.
        res = list(chinese_parser.parse(sentence.strip().split()))
        print("we have finished ", index + 1, " sentence!!!")
        list_file = [triple for triple in res[0].triples()]
        all_dependency_list.append(list_file)
        # Stored into node_file as a dict/defaultdict; the shelve file serves as a backup.
        node_dict = {}
        node = res[0].nodes
        for inner_index in range(len(node.items()) * 2):
            if node[inner_index]['word'] is not None or node[inner_index]['ctag'] is not None:
                node_dict[node[inner_index]["address"]] = node[inner_index]
        node_file[str(index)] = node_dict
    node_file.close()
    return all_dependency_list
def ch_depenpaeser(sentence):
    chi_parser = StanfordDependencyParser(
        r"E:\tools\stanfordNLTK\jar\stanford-parser.jar",
        r"E:\tools\stanfordNLTK\jar\stanford-parser-3.9.1-models.jar",
        r"E:\tools\stanfordNLTK\jar\classifiers\chinesePCFG.ser.gz")
    res = list(chi_parser.parse(sentence.split()))
    for row in res[0].triples():
        print(row)
def extract_depend_parser(self, sentence):
    parser_feature = []
    eng_parser = StanfordDependencyParser(
        model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    res = list(eng_parser.parse(str(sentence).split()))
    for row in res[0].triples():
        parser_feature.append(str(row))
    return parser_feature
class feature_cal():
    def __init__(self, text_collector):
        # wn.ensure_loaded()
        self.text_collector = text_collector
        self.dep_parser = StanfordDependencyParser(
            '/data3/zyx/project/eye_nlp/data/model/stanford-parser.jar',
            '/data3/zyx/project/eye_nlp/data/model/stanford-parser-3.9.2-models.jar',
            model_path='/data3/zyx/project/eye_nlp/data/model/englishPCFG.ser.gz')
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        self.nlp = spacy.load("en_core_web_sm")

    def get_feature(self, words_list, wn):
        raw_words_list = [self.tokenizer.tokenize(word)[0] for word in words_list]
        fea_num_letter = [len(word) for word in raw_words_list]
        fea_start_capital = [word.istitle() for word in raw_words_list]
        fea_capital_only = [word.isupper() for word in raw_words_list]
        fea_have_num = [True if re.match(r'[+-]?\d+$', word) else False
                        for word in raw_words_list]
        fea_abbre = [word.isupper() and len(word) >= 2 for word in raw_words_list]
        fea_entity_critical = cal_entity_critical(self.nlp, words_list)  # use nlp method
        res = self.dep_parser.parse(words_list)
        deps = res.__next__()
        traverse(deps, 0)  # 0 is always the root node
        fea_domi_nodes = []
        for i in range(1, len(words_list) + 1):
            this_dominate = cal_dominate(deps, i)
            fea_domi_nodes.append(this_dominate)
        fea_max_d = cal_max_d(deps, len(words_list))
        fea_idf = cal_idf(self.text_collector, raw_words_list)
        if len(fea_max_d) != len(fea_have_num):
            print('length error')
        # fea_num_wordnet = [len(wn.synsets(word)) for word in raw_words_list]
        fea_complexity = [textstat.flesch_kincaid_grade(str(word)) for word in words_list]
        return [fea_num_letter, fea_start_capital, fea_capital_only, fea_have_num,
                fea_abbre, fea_entity_critical, fea_domi_nodes, fea_max_d,
                fea_idf, fea_complexity]
def DependencyParmer(self, sentence):
    cutSentenceList = self.get_sentence_list(sentence)
    chi_parser = StanfordDependencyParser(
        r"D:\python\stanfordParser\jars\stanford-parser.jar",
        r"D:\python\stanfordParser\jars\stanford-parser-3.5.0-models.jar",
        r"D:\python\stanfordParser\jars\chinesePCFG.ser.gz")
    res = list(chi_parser.parse(cutSentenceList))
    relationship = []
    for row in res[0].triples():
        relationship.append(row)
    return relationship
def dependencyParser(tokens):
    from nltk.parse.stanford import StanfordDependencyParser
    chi_parser = StanfordDependencyParser(
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-chinese-corenlp-2016-10-31-models\edu\stanford\nlp"
        r"\models\lexparser\chinesePCFG.ser.gz")
    tree = chi_parser.parse(tokens)
    res = list(tree)
    for row in res[0].triples():
        print(row)
def save_depend_tree(self, parser_sentence):
    depend_tree = []
    eng_parser = StanfordDependencyParser(
        model_path="E:/Stanford parser/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models/edu/stanford/nlp/models/lexparser/englishFactored.ser.gz")
    for i in range(len(parser_sentence)):
        print(i)
        sentence = []
        res = list(eng_parser.parse(str(parser_sentence[i].word).split()))
        for row in res[0].triples():
            sentence.append(row)
        depend_tree.append(sentence)
    pickle.dump(depend_tree, open("test_depend_tree.txt", 'wb'))
def parser_dependency(sentence):
    eng_parser = StanfordDependencyParser(
        path_to_jar=path_dit.get('path_to_jar'),
        path_to_models_jar=path_dit.get('path_to_models_jar'),
        model_path=path_dit.get('model_path'))
    res = list(eng_parser.parse(sentence.split()))
    print(type(res))
    for row in res[0].triples():
        # print(row[1])
        print(row[0][0], row[1], row[2][0])
    return res
class DependencyParser(object):
    def __init__(self, path_to_jar, path_to_models_jar):
        self.model = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

    def str_parse(self, sentence):
        '''sentence is a string'''
        parsed = self.model.raw_parse(sentence)
        return [p for p in parsed]

    def lst_parse(self, sentence):
        '''sentence is a list of words'''
        parsed = self.model.parse(sentence)
        return [p for p in parsed]
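# A minimal usage sketch for the DependencyParser wrapper above; the jar paths are
# placeholders. str_parse() takes a raw string, lst_parse() a pre-tokenized word list;
# both return a list of DependencyGraph objects.
dp = DependencyParser(path_to_jar='./stanford-parser.jar',
                      path_to_models_jar='./stanford-parser-models.jar')
graphs = dp.str_parse('The quick brown fox jumps over the lazy dog.')
same_graphs = dp.lst_parse(['The', 'quick', 'brown', 'fox', 'jumps'])
for row in graphs[0].triples():
    print(row)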
def sentToTriples(sent):
    # returns a list of triples
    sent = ''.join([i if i.isalpha() else ' ' for i in sent])
    eng_parser = StanfordDependencyParser(
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser-3.6.0-models.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/englishPCFG.ser.gz")
    parsed = eng_parser.parse(sent.split())
    result = list(parsed)
    # for row in result[0].triples():
    #     print(row[0])
    return list(result[0].triples())  # materialise the generator so a real list is returned
def sentToTriples(sent):
    # returns a list of triples
    cns_parser = StanfordDependencyParser(
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser-3.6.0-models.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/models/lexparser/chinesePCFG.ser.gz")
    parsed = cns_parser.parse(sent.split())
    result = list(parsed)
    # for row in result[0].triples():
    #     print(row[0])
    return list(result[0].triples())  # materialise the generator so a real list is returned
def parser():
    os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
    os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
    os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'

    eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
                                java_options="-mx2048m")
    for x in content:
        a = list(eng_parser.parse(x.split()))[0]
        print(a)
        # a.draw()

    eng_dep_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    for x in content:
        a = list(eng_dep_parser.parse(x.split()))[0]
        for row in a.triples():
            print(row)
def stanfordDP(sentence, displayTree=0, showTriples=0):
    '''Stanford dependency parsing. Set displayTree=1 to display the dependency tree.'''
    # print(repr(sentence), '\n')
    parser = StanfordDependencyParser()
    res = list(parser.parse(sentence.split()))
    # print(res[0].tree(), '\n')
    # print(*res[0].tree(), '\n')
    rels = [rel for rel in res[0].triples()]
    if showTriples != 0:
        for row in res[0].triples():
            print(row)
    if displayTree != 0:
        for row in res[0].tree():
            # print(row)
            if type(row) is not str:
                # row.draw()
                display(row)
    return rels
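# A minimal usage sketch for stanfordDP() above; since the parser is constructed with no
# explicit paths, it assumes CLASSPATH and STANFORD_MODELS already point at the Stanford
# parser jars.
rels = stanfordDP('The quick brown fox jumps over the lazy dog', showTriples=1)
print(rels)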
tags = semcor.tagged_sents(tag='sem')
n = 0
correct = 0
base = 0
total = 0
for sent in corp:
    sentence = ' '.join(sent)
    print(sentence)
    parsed = list(parser.parse(tokenizer.tokenize(sentence)))
    for term in tags[n]:
        if len(term) == 1 and isinstance(term[0], str) and isinstance(term, Tree) \
                and len(wordnet.synsets(term[0])) > 1:
            if isinstance(term.label(), str):
                continue
            syn = term.label().synset()
            word = term[0]
            sense_standard = syn
            print(word)
            for pair in parsed[0].triples():
                if pair[0][0] == word:
                    pos = pair[0][1]
                if pair[2][0] == word:
class Baselines:
    def __init__(self):
        # Load the stopword list
        with open(FilePool.stopword_txt, 'r') as f_stopword:
            doc = f_stopword.readlines()
        self.stopwords = [line.rstrip('\n') for line in doc]

        # Load the answers
        if args.answer_base == 'long':
            # use long answers
            ans_json = FilePool.long_answers_json
            ans_txt = FilePool.long_answers_txt
        elif args.answer_base == 'cleaned':
            # use cleaned (short) answers
            ans_json = FilePool.cleaned_answers_json
            ans_txt = FilePool.cleaned_answers_txt
        else:
            # use small answers
            ans_json = FilePool.small_answers_json
            ans_txt = FilePool.small_answers_txt
        with open(ans_json, 'r') as f_json:
            text = json.load(f_json)
        if args.trim_stop:
            self.cut_answers = [[ele for ele in answer if ele not in self.stopwords] for answer in text]
        else:
            self.cut_answers = text
        with open(ans_txt, 'r') as f_ans_txt:
            text = f_ans_txt.readlines()
        self.uncut_answers = [line.rstrip('\n') for line in text]

        # Load the QA base and the known-question base
        if args.method == Method.mix or args.method == Method.qq_match:
            with open(FilePool.qa_file, 'r') as f_qa:
                self.qa = json.load(f_qa)
            with open(FilePool.base_ques_list_file, 'r') as f_base_ques_list:
                self.base_ques_list = json.load(f_base_ques_list)

        # Instantiate the BM25 model up front for performance.
        # Even with question categorization, instantiate it here as a fallback
        # for questions whose category turns out to be empty.
        if args.method == Method.bm25 or args.method == Method.bm25_syn:
            self.bm25_model_uncat = BM25(self.cut_answers)
        if args.method == Method.mix or args.method == Method.bm25_new:
            self.bm25_model_uncat = NewBM25(self.cut_answers)

        # Instantiate the tf-idf model up front for performance
        if args.method == Method.mix or args.method == Method.qq_match:
            self.tfidf_dict = Dictionary(self.base_ques_list)  # fit dictionary
            n_features = len(self.tfidf_dict.token2id)
            bow = [self.tfidf_dict.doc2bow(line) for line in self.base_ques_list]  # convert corpus to BoW format
            # build the tf-idf model
            self.tfidf_model = TfidfModel(bow)  # fit model
            text_tfidf = self.tfidf_model[bow]  # apply model
            self.sim_index = SparseMatrixSimilarity(text_tfidf, n_features)
        elif args.method == Method.tfidf_sim:
            self.tfidf_dict = Dictionary(self.cut_answers)  # fit dictionary
            n_features = len(self.tfidf_dict.token2id)
            bow = [self.tfidf_dict.doc2bow(line) for line in self.cut_answers]  # convert corpus to BoW format
            # build the tf-idf model
            self.tfidf_model = TfidfModel(bow)  # fit model
            text_tfidf = self.tfidf_model[bow]  # apply model
            self.sim_index = SparseMatrixSimilarity(text_tfidf, n_features)

        # Instantiate the dependency parser
        self.parser = StanfordDependencyParser(path_to_jar=FilePool.stanford_parser,
                                               path_to_models_jar=FilePool.stanford_chinese_model)

    # Search with the BM25 algorithm
    def bm25(self, query, categorized_qa):
        # Only instantiate the model here when question categorization is on;
        # otherwise it was already instantiated in __init__()
        if args.categorize_question:
            if len(categorized_qa['cut_answers']) != 0:
                # only when non-empty, use this as the corpus passed to BM25
                bm25_model = BM25(categorized_qa['cut_answers'])
            else:
                # if empty, fall back to the original corpus
                bm25_model = self.bm25_model_uncat
        else:
            bm25_model = self.bm25_model_uncat
        bm25_weights = bm25_model.get_scores(query)
        sorted_scores = sorted(bm25_weights, reverse=True)  # sort scores from high to low
        # Choose the normalization scheme
        if not args.advanced_norm:
            sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # divide each score by sentence length
        else:
            # a more advanced normalization method
            content_word_cnt = len(query)
            parse_result = self.parser.parse(query).__next__()
            depend_relation_cnt = len(list(parse_result.triples()))
            sorted_scores = [s / (content_word_cnt * k1 + depend_relation_cnt * k2) for s in sorted_scores]
        max_pos = np.argsort(bm25_weights)[::-1]  # sort descending, return indices (not the actual values)
        # Pull the actual answers out of the answer base according to max_pos
        if args.categorize_question:
            # the answer source is categorized
            if len(categorized_qa['cut_answers']) != 0:
                # use this as the answer base only when it is non-empty
                answers = self.__max_pos2answers(max_pos, categorized_qa['uncut_answers'])
            else:
                # if empty, fall back to self.uncut_answers as the answer base
                answers = self.__max_pos2answers(max_pos, self.uncut_answers)
        else:
            # when the answer source is not categorized, categorized_qa is None
            answers = self.__max_pos2answers(max_pos, self.uncut_answers)
        return sorted_scores, max_pos, answers

    # BM25 with synonym module
    # Question categorization is not supported
    def bm25_syn(self, query):
        bm25_model = self.bm25_model_uncat
        query_weights = bm25_model.get_scores(query)  # plain BM25 scoring
        max_pos = np.argsort(query_weights)[::-1][0]  # index of the highest score (not the value itself)
        # Find the most important word in the query
        max_score = 0
        kw = ''  # the most important word
        kw_idx = -1
        for idx, word in enumerate(query):
            word_weight = bm25_model.get_score([word], index=max_pos)
            if word_weight > max_score:
                max_score = word_weight
                kw = word
                kw_idx = idx
        # Build a synonym list for that keyword
        nearby_list = synonyms.nearby(kw)
        syn_list = [kw]  # add the keyword itself to the list first
        for word, score in zip(nearby_list[0], nearby_list[1]):
            # condition: score above the threshold
            if score > args.syn_threshold and word not in syn_list:
                syn_list.append(word)
        # Find which synonym scores highest
        max_score = -1
        best_kw = ''  # the highest-scoring word
        for syn in syn_list:
            query[kw_idx] = syn  # substitute the keyword in the query
            weights = bm25_model.get_scores(query)  # plain BM25 scoring
            score = sorted(weights, reverse=True)[0]  # sort descending and take the first
            if score > max_score:
                max_score = score
                best_kw = syn
        # The best keyword is found; go back to the normal flow and return sorted_scores, max_pos, answers
        query[kw_idx] = best_kw
        bm25_weights = bm25_model.get_scores(query)
        sorted_scores = sorted(bm25_weights, reverse=True)  # sort scores from high to low
        sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # divide each score by sentence length
        max_pos = np.argsort(bm25_weights)[::-1]  # sort descending, return indices (not the actual values)
        answers = self.__max_pos2answers(max_pos, self.uncut_answers)
        return sorted_scores, max_pos, answers

    # Improved BM25
    def bm25_new(self, query, uncut_query, categorized_qa):
        # Only instantiate the model here when categorization is on, the category is non-empty,
        # and uni_idf is off; otherwise it was already instantiated in __init__()
        if args.categorize_question and len(categorized_qa['cut_answers']) != 0 and not args.uni_idf:
            bm25_model = NewBM25(categorized_qa['cut_answers'])
        else:
            bm25_model = self.bm25_model_uncat
        expanded_query = []
        for q in query:
            if q not in expanded_query:
                expanded_query.append(q)  # add q itself to expanded_query first
            nearby_list = synonyms.nearby(q)  # build a synonym list for q
            for word, score in zip(nearby_list[0], nearby_list[1]):
                # condition: score above threshold && word not already in expanded_query
                if score > args.syn_threshold and word not in expanded_query:
                    expanded_query.append(word)
        bm25_weights = bm25_model.get_new_scores(query, expanded_query)
        sorted_scores = sorted(bm25_weights, reverse=True)  # sort scores from high to low
        # Choose the normalization scheme
        if not args.advanced_norm:
            sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # divide each score by sentence length
        else:
            # a more advanced normalization method
            content_word_cnt = len(query)
            parse_result = self.parser.parse(query).__next__()
            depend_relation_cnt = len(list(parse_result.triples()))
            sorted_scores = [s / (content_word_cnt * k1 + depend_relation_cnt * k2) for s in sorted_scores]
        max_pos = np.argsort(bm25_weights)[::-1]  # sort descending, return indices (not the actual values)
        # Pull the actual answers out of the answer base according to max_pos.
        # Use categorized_qa as the answer base only when categorization is on, the category is
        # non-empty, and uni_idf is off; otherwise use self.uncut_answers
        if args.categorize_question and len(categorized_qa['cut_answers']) != 0 and not args.uni_idf:
            answers = self.__max_pos2answers(max_pos, categorized_qa['uncut_answers'])
        else:
            answers = self.__max_pos2answers(max_pos, self.uncut_answers)
        # With IDF computed over the full answer base, the model scores are the same as without
        # categorization; we just need to filter out answers that are not in categorized_qa
        if args.uni_idf:
            filtered_sorted_scores = []
            filtered_max_pos = []
            filtered_answers = []
            for s, m, a in zip(sorted_scores, max_pos, answers):
                if a in answers:
                    filtered_sorted_scores.append(s)
                    filtered_max_pos.append(m)
                    filtered_answers.append(a)
            sorted_scores = filtered_sorted_scores
            max_pos = filtered_max_pos
            answers = filtered_answers
        # If the categorized score does not reach the threshold, start over and re-run new BM25
        # on the full answer base:
        # if args.categorize_question and sorted_scores[0] < args.bm25_new_cat_threshold:
        #     bm25_model = self.bm25_model_uncat
        #     bm25_weights = bm25_model.get_new_scores(query, expanded_query)
        #     sorted_scores = sorted(bm25_weights, reverse=True)  # sort scores from high to low
        #     if not args.advanced_norm:
        #         sorted_scores = [s / (len(query) + 1) for s in sorted_scores]
        #     else:
        #         content_word_cnt = len(query)
        #         parse_result = self.parser.parse(query).__next__()
        #         depend_relation_cnt = len(list(parse_result.triples()))
        #         sorted_scores = [s / (content_word_cnt * k1 + depend_relation_cnt * k2) for s in sorted_scores]
        #     max_pos = np.argsort(bm25_weights)[::-1]
        #     answers = self.__max_pos2answers(max_pos, self.uncut_answers)
        return sorted_scores, max_pos, answers

    # Question-question matching
    def qq_match(self, query):
        # Run tf-idf and get an index list sorted from high to low
        sorted_scores, max_pos, _ = self.tfidf_sim(query, direct_call=False)
        answers, questions = self.__max_pos2answers_questions(max_pos)
        # Filter the results with the QQ-matching threshold
        sorted_scores, max_pos, answers, questions = self.__filter_by_threshold(
            sorted_scores, max_pos, answers, questions, args.qq_threshold)
        return sorted_scores, max_pos, answers, questions

    # QQ matching + question categorization + QA matching + answerability check
    def qq_cat_qa_filter(self, query, uncut_query, categorized_qa):
        sorted_scores, max_pos, answers, questions = self.qq_match(query)  # try QQ matching first
        if len(sorted_scores) > 0:
            # QQ matching worked well; return the results directly
            return sorted_scores, max_pos, answers, questions
        else:
            # Nothing survived the cutoff, so no QQ match reached the threshold.
            # Give up on it and switch to QA matching, which currently uses the BM25 algorithm.
            sorted_scores, max_pos, answers = self.bm25_new(query, uncut_query, categorized_qa)
            # Filter the results with the QA-matching threshold; note the threshold differs
            # between the categorized and uncategorized cases
            if args.categorize_question and len(categorized_qa['cut_answers']) != 0 and not args.uni_idf:
                if args.advanced_norm:
                    # new BM25 with advanced normalization
                    threshold = args.cat_adv_norm_threshold
                else:
                    # new BM25 with plain normalization
                    threshold = args.cat_threshold
            else:
                # new BM25 without question categorization
                threshold = args.qa_threshold
            sorted_scores, max_pos, answers, _ = self.__filter_by_threshold(
                sorted_scores, max_pos, answers, [], threshold)
            return sorted_scores, max_pos, answers, []  # return an empty list in place of questions

    # Search with tf-idf similarity
    # direct_call = True means tfidf_sim() is called directly for answer selection
    # direct_call = False means qq_match() calls tfidf_sim() for QQ matching
    def tfidf_sim(self, query, direct_call=True):
        query_bow = [self.tfidf_dict.doc2bow(query)]  # build a bag of words from the query
        query_tfidf = self.tfidf_model[query_bow]  # encode with the tf-idf model
        similarities = self.sim_index[query_tfidf][0]  # compute similarities
        sorted_scores = sorted(similarities, reverse=True)  # sort scores from high to low
        max_pos = np.argsort(similarities)[::-1]  # sort descending, return indices (not the actual values)
        if direct_call:
            answers = self.__max_pos2answers(max_pos, self.uncut_answers)  # pull the actual answers out by max_pos
        else:
            answers = []  # nobody cares about answers here
        return sorted_scores, max_pos, answers

    # Pull the actual answers out of the answer base according to max_pos
    def __max_pos2answers(self, max_pos, uncut_answers):
        max_pos = max_pos.tolist()  # ndarray -> list
        answers = []
        for r in max_pos:
            answers.append(uncut_answers[r])
        return answers

    def __max_pos2answers_questions(self, max_pos):
        max_pos = max_pos.tolist()  # ndarray -> list
        answers = []
        questions = []
        for r in max_pos:
            answers.append(self.qa[r]['sentence'][0])
            questions.append(self.qa[r]['question'])
        return answers, questions

    # Truncate the results at the threshold
    def __filter_by_threshold(self, sorted_scores, max_pos, answers, questions, threshold):
        cut_point = 10000  # cutoff position, initialized to a very large value
        for i, score in enumerate(sorted_scores):
            # If the i-th score is below the threshold, all later scores are too, so cut here
            if score < threshold:
                cut_point = i  # set the cutoff position to i
                break
        # Apply the truncation
        sorted_scores = sorted_scores[:cut_point]
        max_pos = max_pos[:cut_point]
        answers = answers[:cut_point]
        questions = questions[:cut_point]
        return sorted_scores, max_pos, answers, questions
seg_list = jieba.cut(sentence[i])
tokens = "/".join(seg_list)
tokenizer = tokens.split("/")
# print(tokenizer)
for j in range(len(char)):
    if char[j] in tokenizer:
        A = char[j]
        break
B = A
zh_dependency_parser = StanfordDependencyParser(
    "/Users/sumeiru/Desktop/StanfordNLP/StanfordNLP/jars/stanford-parser.jar",
    "/Users/sumeiru/Desktop/StanfordNLP/StanfordNLP/jars/stanford-parser-3.9.2-models.jar",
    "/Users/sumeiru/Desktop/StanfordNLP/StanfordNLP/models/chineseFactored.ser.gz")
ans = list(zh_dependency_parser.parse(tokens.split("/")))
for row in ans[0].triples():
    if row[1] == "nsubj" and row[2][0] in char:
        A = row[2][0]
        # print("from: " + A)
    elif row[1] == "compound:nn" and row[2][0] in char_except:
        B = row[2][0]
        # print("to: " + B)
for k in range(len(vocal)):
    if vocal[i] in dialogue[i]:
        detect_print = False
        break
if detect_print:
    print(A + ": " + dialogue[i])
from nltk.parse.stanford import StanfordDependencyParser

chi_parser = StanfordDependencyParser(
    r"/Users/baymax/Dev/library/stanford/segmenter/stanford-parser.jar",
    r"/Users/baymax/Dev/library/stanford/segmenter/stanford-parser-3.6.0-models.jar",
    r"/Users/baymax/Dev/library/stanford/segmenter/chinesePCFG.ser.gz")
res = list(chi_parser.parse(u'螺纹 按 其 截面 形状 ( 牙型 ) 分为 三角形螺纹 、 矩形螺纹 、 梯形螺纹'.split()))
for row in res[0].triples():
    print(row)
#------------------------------------------------------------------------------#
YFC_text_sent_token = nltk.sent_tokenize(YFC_text_final)
zz3 = []
for i in range(0, 5, 1):
    zz3.append(nltk.sent_tokenize(YFC_text_final)[i])
YFC_text__to_temp = ' '.join(zz3)
zz3_5 = nltk.word_tokenize(YFC_text__to_temp)
zz4 = [ii6 for ii6 in zz3_5 if ii6 not in stopword2]
YFC_text__to_parse = ' '.join(zz4)
zzfinal = nltk.sent_tokenize(YFC_text__to_parse.lower())

eng_parser = StanfordDependencyParser()
res = list(eng_parser.parse(zzfinal))
dep_tree_dot_repr = [parse for parse in res][0].to_dot()
dtree = [parse for parse in res][0]
source = Source(dep_tree_dot_repr, filename="dep_tree", format="png")
source.view()
#------------------------------------------------------------------------------#
tt = {}
ttt4 = []
ttt4 = res[0].nodes.keys()
for i in range(len(ttt4) + 1):
    tt[i] = res[0].get_by_address(i)
kkw = {}
from nltk.parse.stanford import StanfordDependencyParser, StanfordParser
from src.data.paths import get_dictionaty_data_path

path_to_jar = get_dictionaty_data_path('stanford-corenlp-3.9.1.jar')
path_to_models_jar = get_dictionaty_data_path('stanford-corenlp-3.9.1-models.jar')

dep_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                      path_to_models_jar=path_to_models_jar)
parser = StanfordParser(path_to_jar=path_to_jar,
                        path_to_models_jar=path_to_models_jar)

sentence = [
    'i', 'am', 'a', 'smart', 'business', 'man', 'and', 'can', 'lead', 'much',
    'better', 'than', 'the', 'old', 'lazy', 'presidents'
]

# print(list(parser.raw_parse("the quick brown fox jumps over the lazy dog")))
# print([parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")])
# result = list(dep_parser.parse(sentence))
# result[0].draw()

result = list(dep_parser.parse(sentence))
dep_tree = [parse.tree() for parse in result][0]
dep_tree.draw()
for w in doc:
    if w.text in neg_words:
        negated.add(w.head.i)
for j in xrange(0, len(tokens)):
    neg_array.append(j in negated)
nlp.tokenizer = old_tokenizer
return neg_array


if __name__ == "__main__":
    tokens = map(TokenFactory.create, "I do not have any idea and I am not alright .".split(" "))
    print tokens
    print compute_neg_dir_dep(tokens, ["not", "any"])

    dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    # parse
    tree = next(dep_parser.parse("I do not have any idea and I am not alright.".split(" "))).tree()
    # Can't deal with the duplicate because the dependency graph does not label each token
    negated = []
    parent = tree.label()
    queue = Queue.Queue()
    queue.put((parent, tree))
    while not queue.empty():
        elem = queue.get()
        print elem
        parent, current = elem
        if isinstance(current, Tree):
            label = current.label()
            for c in current:
                queue.put((label, c))
        else:
            label = current
    words_len = len(words)
else:
    i = i + 1
    words_len = len(words)
words_only = words[1:]
i = 1
while i < words_len:
    lmtzr.lemmatize(words[i])
    i = i + 1
pos_words = pos_tagger.tag(words_only)
parsed_out_pcfg = reg_parser.parse(pos_words)
pre_parsed_out = dependency_parser.parse(words_only)
dep = pre_parsed_out.__next__()
parsed_out = list(dep.triples())
Script_Word_Ct += len(pos_words)
i = 0
while i < words_len - 1:
    tags = pos_words[i][1]
    if i < len(pos_words) - 1 and tags == 'NP' and pos_words[i + 1][1] == 'PRP':
        NP_PRP += 1
    i = i + 1
for all in tags:
except:
    BioSentStopArray = BioSentStopArray
else:
    BioSentStopArray = cosine_similarity(titlevector, mainfindingvector)
BioSentStop = BioSentStopArray[0][0]

# pattern based feature scoring
Pattern = 0.00
pattern_sheet = readworkbook("Pattern.xlsx", 0)
for string_num in range(1, 7):
    if re.search(pattern_sheet.cell(string_num, 0).value, sentence.lower()):
        Pattern = Pattern + pattern_sheet.cell(string_num, 1).value
if sentence.split():
    res = list(dependencyparser.parse(sentence.lower().split()))
    for row in res[0].triples():
        rowtext = ''.join(str(row))
        if rowtext.find("(u'we', u'PRP')") != -1:
            for pattern_num in range(1, 16):
                if rowtext.find(pattern_sheet.cell(pattern_num, 2).value) != -1:
                    Pattern = Pattern + pattern_sheet.cell(pattern_num, 3).value
mainfindingwordspattern = nltk.word_tokenize(sentence.lower())
if len(mainfindingwordspattern) > 3:
    for first_three_num in range(1, 38):
        if mainfindingwordspattern[0] + ' ' + mainfindingwordspattern[1] + ' ' + mainfindingwordspattern[2] == pattern_sheet.cell(first_three_num, 4).value:
            Pattern = Pattern + pattern_sheet.cell(first_three_num,
print(se)
count = 0
res = ''
words = pseg.cut(se)
for word, flag in words:
    if flag == 'n':
        print('%s %s' % (word, flag))
        # res += word + ' / '
        res += word + ' '

# word2vec needs a pre-segmented text corpus
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(u'testVec.txt')  # load the corpus
model = word2vec.Word2Vec(sentences, size=200)  # train a skip-gram model; default window=5

# Compute the list of words most related to a given word
y2 = model.most_similar(u"螺纹", topn=40)  # the 40 most related words
count = 0
for item in y2:
    print(item[0], item[1])

chi_parser = StanfordDependencyParser(
    r"/Users/baymax/Dev/library/stanford/segmenter/stanford-parser.jar",
    r"/Users/baymax/Dev/library/stanford/segmenter/stanford-parser-3.6.0-models.jar",
    r"/Users/baymax/Dev/library/stanford/segmenter/chinesePCFG.ser.gz")
res = list(chi_parser.parse(se.split()))
for row in res[0].triples():
    print(row)
path_to_jar = '/Users/wang/dev/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar'
path_to_model_jar = '/Users/wang/dev/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar'
model_path = '/Users/wang/dev/stanford-chinese-corenlp-2018-02-27-models/edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz'

# s = u"你 有个 优惠券 快要 过期 了"
s = u"王其龙 是 一个 优秀 的 程序员,他 喜欢 江曦"

# Dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
parser = StanfordDependencyParser(path_to_jar, path_to_model_jar, model_path)
result = list(parser.parse(s.split()))
for row in result[0].triples():
    print(row)

# Constituency (phrase-structure) parsing
from nltk.parse.stanford import StanfordParser
parser = StanfordParser(path_to_jar, path_to_model_jar, model_path)
result = list(parser.parse(s.split()))
for r in result:
    print(r)
    r.draw()
    print(dep)
    for d in dep['deps']:
        for addr2 in dep['deps'][d]:
            traverse(deps, addr2)


# code on book
dep_parser = StanfordDependencyParser(
    '/data3/zyx/project/eye_nlp/data/model/stanford-parser.jar',
    '/data3/zyx/project/eye_nlp/data/model/stanford-parser-3.9.2-models.jar',
    model_path='/data3/zyx/project/eye_nlp/data/model/englishPCFG.ser.gz')
# print(list(english_parser.raw_parse_sents(('this is the english parser test'))))
# [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
s = 'the big dog chased the little cat.'
res = dep_parser.parse(s.split())  # can use a simple .split since my input is already tokenised
deps = res.__next__()
traverse(deps, 0)  # 0 is always the root node
features = []


def cal_dominate(dep, index):
    tmp_dominate = 0
    for each_dep in dep.nodes[index]['deps']:
        for each_index in dep.nodes[index]['deps'][each_dep]:
            tmp_dominate += cal_dominate(dep, each_index)
    return tmp_dominate + 1


for i in range(1, len(deps.nodes)):
# mylist = list(eng_parser.parse(sentence.split()))
# print(len(mylist))
# print("constituency parse result", mylist)

# Dependency parsing
# Documentation of the dependency relation labels: http://universaldependencies.org/u/dep/all.html#al-u-dep/det
eng_dependency_parser = StanfordDependencyParser(
    path_to_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser.jar",
    path_to_models_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
    model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
outputs = ' '.join(tokenizer.tokenize("Dole was defeated by Clinton"))
print(outputs)
result = list(eng_dependency_parser.parse(outputs.split()))
for each in result[0].triples():
    print(each)
    # if each[1] == 'dobj':
    #     # print(each)
    #     print(each[0][0])
    #     print(each[2][0])
# print("dependency parse result:")
# for row in result[0].triples():
#     print(row)
# print(result[0])

# Chinese word segmentation
# Needs more investigation; the code below raises an error
# chinese_segmenter = StanfordSegmenter(
#     path_to_jar=r"D:\stanford-segmenter-2016-10-31\stanford-segmenter-3.7.0.jar",
#     path_to_slf4j=r"D:\stanford-segmenter-2016-10-31\slf4j-api.jar",
    'interest_3': 2,
    'interest_4': 3,
    'interest_5': 4,
    'interest_6': 5
}
bayes = [[], [], [], [], [], []]
count = [0, 0, 0, 0, 0, 0]
n = 0
for instance in senseval.instances('interest.pos')[0:1599]:
    count[sense[instance.senses[0]]] += 1
    sentence = ' '.join(w for (w, p) in instance.context)
    parsed = list(parser.parse(tokenizer.tokenize(sentence)))
    for triple in parsed[0].triples():
        related = 0
        if triple[0][0] in interest:
            word = triple[2][0]
            related = 1
        if triple[2][0] in interest:
            word = triple[0][0]
            related = 1
        if related == 1:
            exist = 0
            for item in bayes[sense[instance.senses[0]]]:
                if item[0] == word:
                    item[1] += 1
                    exist = 1
            if exist == 0: