Example no. 1
def dependency(text):
	parser = StanfordDependencyParser(
		u"./stanford-ner/stanford-parser/stanford-parser.jar",
		u"./stanford-ner/stanford-parser/stanford-parser-3.6.0-models.jar",
		u"./stanford-ner/classifiers/englishPCFG.ser.gz")

	text2 = text.split('.')
	if text2[-1] == '':
		text2 = text2[:-1]


	adj = []
	center = []

	for i in text2:
		for j in [',',';',':','!','~','?']:
			i = i.replace(j,'')
		rec = list(parser.parse(i.split()))
		j=0
		for row in rec[0].triples():
			print (row)
			if j==0:
				center.append(str(row[0][0]))
			j+=1

			if str(row[1]).find('amod')!=-1:
				adj.append(str(row[2][0]))

	for i in adj:
		print ('111')
		print (i)

	print ('222')
	print (center)

	return center, adj
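A hedged usage sketch for dependency() above (not part of the original example): it assumes the hard-coded ./stanford-ner/... jars are present and that StanfordDependencyParser has been imported; the input sentence and the outputs shown are only illustrative.

# Usage sketch: dependency() returns the head word of each sentence and the
# adjectives attached through an 'amod' relation.
centers, adjectives = dependency('The quick brown fox jumps over the lazy dog.')
print(centers)     # e.g. ['jumps']
print(adjectives)  # e.g. ['quick', 'brown', 'lazy']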
Example no. 2
def parse_sentence(sentence):
    parser = StanfordDependencyParser(path_to_jar=PATH_TO_JAR, path_to_models_jar=PATH_TO_MODELS)
    trees = list(parser.parse(sentence))
    if not trees:
        return None
    parsed_tree = trees[0]
    return list(parsed_tree.triples())
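A minimal way to exercise parse_sentence() above, assuming PATH_TO_JAR and PATH_TO_MODELS are module-level constants pointing at a local Stanford Parser download; the paths below are placeholders, not the original author's configuration.

from nltk.parse.stanford import StanfordDependencyParser

PATH_TO_JAR = '/opt/stanford-parser/stanford-parser.jar'            # assumed path
PATH_TO_MODELS = '/opt/stanford-parser/stanford-parser-models.jar'  # assumed path

# parse_sentence() expects a tokenized sentence and returns a list of
# (governor, relation, dependent) triples, or None if no parse was produced.
triples = parse_sentence('The quick brown fox jumps over the lazy dog'.split())
if triples is not None:
    for governor, relation, dependent in triples:
        print(governor, relation, dependent)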
Example no. 3
def get_parser(domain):
    if domain == 'laptop':
        suffix = '_2014.pkl'
    else:
        suffix = '_2016_2.pkl'
    filename = './pkl/data_' + domain + suffix
    save_filename = './pkl/dep_' + domain + suffix
    fr = open(filename, 'rb')
    data = pickle.load(fr)
    sents = data['raw_sentence']
    labels = data['labels']
    fr.close()

    new_data = {}
    dep = []

    eng_parser = StanfordDependencyParser(
        model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    for sent in sents:
        res = list(eng_parser.parse(sent))
        sent_dep = []
        # dep.append(res)
        for row in res[0].triples():
            print(row)
            sent_dep.append(row)
        dep.append(sent_dep)
        # break
    new_data['raw_sentence'] = sents
    new_data['dependency'] = dep
    new_data['labels'] = labels
    fr = open(save_filename, 'wb')
    pickle.dump(new_data, fr)
    fr.close()
Example no. 4
class DependencyParsing:
    """
    Stanford dependency parsing
    """
    def __init__(self, path_to_jar, path_to_models_jar):
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

    def parse_sentences(self, sentences):
        """ Dependency parsing of list of tokenized sentences using the stanford parser

        :param sentences: List of sentences. Each sentence is a list of tokens.
        :type sentences: list(list(str))
        :return: iterator of DependencyGraph objects
        :rtype: iterator
        """
        result = self.dependency_parser.parse_sents(sentences)
        # parse_sents yields an iterator of parses per sentence; return the best parse for each
        return (next(parses) for parses in result)

    def parse_sentence(self, sentence):
        """ Dependency parsing of a tokenized sentence using the stanford parser

        :param sentence: sentence as a list of tokens.
        :type sentence: list(str)
        :return: DependencyGraph object
        :rtype: nltk.DependencyGraph
        """
        result = self.dependency_parser.parse(sentence)
        return result.__next__()
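A short usage sketch for the DependencyParsing wrapper above; the jar paths are placeholders for a local Stanford Parser installation rather than values taken from the original code.

# Usage sketch (paths are assumptions).
dp = DependencyParsing(
    path_to_jar='/opt/stanford-parser/stanford-parser.jar',
    path_to_models_jar='/opt/stanford-parser/stanford-parser-models.jar')
graph = dp.parse_sentence(['The', 'cat', 'sat', 'on', 'the', 'mat'])
for governor, relation, dependent in graph.triples():
    print(governor, relation, dependent)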
Example no. 5
    def Dep_handler(self, sentence, parser):
        if parser == "spacy":
            try:
                import spacy, en_core_web_sm
            except ImportError:
                print("Can't import spacy")
            nlp = en_core_web_sm.load()
            doc = nlp(sentence)
            return_dict = {}
            for token in doc:
                return_dict[str(token.text)] = str(token.dep_)
            return return_dict
        elif parser == "nltk":
            try:
                import nltk
                from nltk.parse.stanford import StanfordDependencyParser

                os.environ["CLASSPATH"] = "./StanfordNLP/jars"
                os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"
            except ImportError:
                print("Can't import nltk")
            eng_parser = StanfordDependencyParser()
            res = list(eng_parser.parse(sentence.split()))
            return_dict = {}
            turn = True
            for row in res[0].triples():
                if row[0][0] not in return_dict and turn:
                    return_dict[row[0][0]] = "ROOT"
                    turn = False
                return_dict[row[2][0]] = row[1]
            #     print(row)
            #             print(return_dict)
            return return_dict
Example no. 6
def dependency_parser_nltk(word_lists, filename):
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordDependencyParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'

    node_file = shelve.open(filename)
    all_dependency_list = []
    for index, sentence in enumerate(word_lists):
        # Store into all_dependency_list; the stored data type is a list
        res = list(chinese_parser.parse(sentence.strip().split()))
        print("we have finished sentence", index + 1, "!!!")

        list_file = [triple for triple in res[0].triples()]
        all_dependency_list.append(list_file)

        # Store into node_file; the stored type is dict/defaultdict, used as a backup file
        node_dict = {}
        node = res[0].nodes
        for address, node_info in node.items():
            if node_info['word'] is not None or node_info['ctag'] is not None:
                node_dict[address] = node_info
        node_file[str(index)] = node_dict

    node_file.close()
    return all_dependency_list
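A hedged sketch of how dependency_parser_nltk() above might be driven. It assumes the module-level constants it relies on (JAVA_PATH, STANFORD_PARSER_PATH, STANFORD_PARSER_MODELS, nltk_parse_model_path) point at a local Stanford Parser plus Chinese model installation; the paths, the shelve filename, and the pre-segmented input are illustrative only.

# Hypothetical configuration; adjust to your own installation.
JAVA_PATH = '/usr/bin/java'
STANFORD_PARSER_PATH = '/opt/stanford-parser/stanford-parser.jar'
STANFORD_PARSER_MODELS = '/opt/stanford-parser/stanford-parser-3.9.2-models.jar'
nltk_parse_model_path = 'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz'

word_lists = ['螺纹 按 其 截面 形状 分为 三角形螺纹']  # already word-segmented
triples_per_sentence = dependency_parser_nltk(word_lists, 'dep_backup.shelve')
print(triples_per_sentence[0])  # triples for the first sentence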
Example no. 7
def ch_depenpaeser(sentence):
    chi_parser = StanfordDependencyParser(
        r"E:\tools\stanfordNLTK\jar\stanford-parser.jar",
        r"E:\tools\stanfordNLTK\jar\stanford-parser-3.9.1-models.jar",
        r"E:\tools\stanfordNLTK\jar\classifiers\chinesePCFG.ser.gz")
    res = list(chi_parser.parse(sentence.split()))
    for row in res[0].triples():
        print(row)
Example no. 8
    def extract_depend_parser(self, sentence):
        parser_feature = []
        eng_parser = StanfordDependencyParser(
            model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
        res = list(eng_parser.parse(str(sentence).split()))
        for row in res[0].triples():
            parser_feature.append(str(row))
        return parser_feature
Example no. 9
class feature_cal():
    def __init__(self, text_collector):
        # wn.ensure_loaded()
        self.text_collector = text_collector
        self.dep_parser = StanfordDependencyParser(
            '/data3/zyx/project/eye_nlp/data/model/stanford-parser.jar',
            '/data3/zyx/project/eye_nlp/data/model/stanford-parser-3.9.2-models.jar',
            model_path=
            '/data3/zyx/project/eye_nlp/data/model/englishPCFG.ser.gz')
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        self.nlp = spacy.load("en_core_web_sm")

    def get_feature(self, words_list, wn):

        raw_words_list = [
            self.tokenizer.tokenize(word)[0] for word in words_list
        ]

        fea_num_letter = [len(word) for word in raw_words_list]
        fea_start_capital = [word.istitle() for word in raw_words_list]
        fea_capital_only = [word.isupper() for word in raw_words_list]
        fea_have_num = [
            bool(re.match(r'[+-]?\d+$', word)) for word in raw_words_list
        ]
        fea_abbre = [
            word.isupper() and len(word) >= 2 for word in raw_words_list
        ]
        fea_entity_critical = cal_entity_critical(self.nlp, words_list)

        # use nlp method: dependency parse with the Stanford parser
        res = self.dep_parser.parse(words_list)
        deps = res.__next__()
        traverse(deps, 0)  # 0 is always the root node
        fea_domi_nodes = []
        for i in range(1, len(words_list) + 1):
            this_dominate = cal_dominate(deps, i)
            fea_domi_nodes.append(this_dominate)

        fea_max_d = cal_max_d(deps, len(words_list))

        fea_idf = cal_idf(self.text_collector, raw_words_list)
        if len(fea_max_d) != len(fea_have_num):
            print('length error')
        # fea_num_wordnet = [len(wn.synsets(word)) for word in raw_words_list]
        fea_complexity = [
            textstat.flesch_kincaid_grade(str(word)) for word in words_list
        ]
        return [
            fea_num_letter, fea_start_capital, fea_capital_only, fea_have_num,
            fea_abbre, fea_entity_critical, fea_domi_nodes, fea_max_d, fea_idf,
            fea_complexity
        ]
Example no. 10
    def DependencyParmer(self, sentence):
        cutSentenceList = self.get_sentence_list(sentence)
        chi_parser = StanfordDependencyParser(
            r"D:\python\stanfordParser\jars\stanford-parser.jar",
            r"D:\python\stanfordParser\jars\stanford-parser-3.5.0-models.jar",
            r"D:\python\stanfordParser\jars\chinesePCFG.ser.gz")
        res = list(chi_parser.parse(cutSentenceList))
        relationship = []
        for row in res[0].triples():
            relationship.append(row)
        return relationship
Example no. 11
def dependencyParser(tokens):
    from nltk.parse.stanford import StanfordDependencyParser
    chi_parser = StanfordDependencyParser(
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-chinese-corenlp-2016-10-31-models\edu\stanford\nlp"
        r"\models\lexparser\chinesePCFG.ser.gz")

    tree = chi_parser.parse(tokens)
    res = list(tree)
    for row in res[0].triples():
        print(row)
Example no. 12
    def save_depend_tree(self, parser_sentence):
        depend_tree = []
        eng_parser = StanfordDependencyParser(
            model_path="E:/Stanford parser/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models/edu/stanford/nlp/models/lexparser/englishFactored.ser.gz")
        for i in range(len(parser_sentence)):
            print(i)
            sentence = []
            res = list(eng_parser.parse(str(parser_sentence[i].word).split()))
            for row in res[0].triples():
                sentence.append(row)
            depend_tree.append(sentence)
        pickle.dump(depend_tree, open("test_depend_tree.txt", 'wb'))
Example no. 13
def parser_dependency(sentence):
    eng_parser = StanfordDependencyParser(
        path_to_jar=path_dit.get('path_to_jar'),
        path_to_models_jar=path_dit.get('path_to_models_jar'),
        model_path=path_dit.get('model_path'))
    res = list(eng_parser.parse(sentence.split()))
    print type(res)
    for row in res[0].triples():
        # print row[1]
        print row[0][0], row[1], row[2][0]

    return res
Example no. 14
class DependencyParser(object):
    def __init__(self, path_to_jar, path_to_models_jar):
        self.model = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

    def str_parse(self, sentence):
        ''' sentence is a string '''
        parsed = self.model.raw_parse(sentence)
        return [p for p in parsed]

    def lst_parse(self, sentence):
        ''' sentence is a list of words '''
        parsed = self.model.parse(sentence)
        return [p for p in parsed]
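A brief usage sketch for the DependencyParser wrapper above, with placeholder jar paths. str_parse() takes a raw string (raw_parse tokenizes it internally), while lst_parse() expects an already tokenized list.

# Usage sketch (paths are assumptions).
dp = DependencyParser(
    path_to_jar='/opt/stanford-parser/stanford-parser.jar',
    path_to_models_jar='/opt/stanford-parser/stanford-parser-models.jar')
graphs = dp.str_parse('The quick brown fox jumps over the lazy dog.')
print(list(graphs[0].triples()))
graphs = dp.lst_parse(['The', 'dog', 'barks'])
print(list(graphs[0].triples()))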
Example no. 15
def sentToTriples(sent):
    #returns a list of triples
    sent = ''.join([i if i.isalpha() else ' ' for i in sent])
    eng_parser = StanfordDependencyParser(
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser-3.6.0-models.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/englishPCFG.ser.gz")

    parsed = eng_parser.parse(sent.split())
    result = list(parsed)
    #print parsed;

    #   for row in result[0].triples():
    #       print(row[0]);
    return list(result[0].triples())
Example no. 16
def sentToTriples(sent):
    #returns a list of triples
    cns_parser = StanfordDependencyParser(
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/stanford-parser-3.6.0-models.jar",
        r"/home/losphoenix/StanfordNLP/stanford-parser/models/lexparser/chinesePCFG.ser.gz"
    )

    parsed = cns_parser.parse(sent.split())
    result = list(parsed)
    #print parsed;

    #   for row in result[0].triples():
    #       print(row[0]);
    return list(result[0].triples())
Example no. 17
def parser():
	os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
	os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
	os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'

	eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',java_options="-mx2048m")
	for x in content:
		a = list(eng_parser.parse(x.split()))[0]
		print(a)
		# a.draw()

	eng_dep_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
	for x in content:
		a = list(eng_dep_parser.parse(x.split()))[0]
		for row in a.triples():
			print(row)
Example no. 18
def stanfordDP(sentence, displayTree=0, showTriples=0):
    '''Stanford dependency parsing. Set displayTree=1 to print the dependency graph.
    '''
    #print(repr(sentence),'\n')
    parser = StanfordDependencyParser()
    res = list(parser.parse(sentence.split()))

    #print(res[0].tree(),'\n')
    #print(*res[0].tree(),'\n')

    rels = [rel for rel in res[0].triples()]
    if (showTriples != 0):
        for row in res[0].triples():
            print(row)

    if (displayTree != 0):
        for row in res[0].tree():
            #print(row)
            if type(row) is not str:
                #row.draw()
                display(row)
    return rels
Example no. 19
tags = semcor.tagged_sents(tag = 'sem')

n = 0

correct = 0
base = 0
total = 0

for sent in corp:

    sentence =  ' '.join(sent)

    print sentence

    parsed = list(parser.parse(tokenizer.tokenize(sentence)))

    for term in tags[n]:
        if len(term)==1 and isinstance(term[0], basestring) and isinstance(term, Tree) and len(wordnet.synsets(term[0])) > 1:
            if isinstance(term.label(), unicode):
                continue
            syn = term.label().synset()
            word = term[0]
            sense_standard = syn

            print word
        
            for pair in parsed[0].triples():
                if pair[0][0] == word:
                    pos = pair[0][1]
                if pair[2][0] == word:
Example no. 20
class Baselines:
    def __init__(self):
        # Load the stopword list
        with open(FilePool.stopword_txt, 'r') as f_stopword:
            doc = f_stopword.readlines()
        self.stopwords = [line.rstrip('\n') for line in doc]

        # Load the answers
        if args.answer_base == 'long':
            # Use the long answers
            ans_json = FilePool.long_answers_json
            ans_txt = FilePool.long_answers_txt
        elif args.answer_base == 'cleaned':
            # Use the short (cleaned) answers
            ans_json = FilePool.cleaned_answers_json
            ans_txt = FilePool.cleaned_answers_txt
        else:
            # Use the small answers
            ans_json = FilePool.small_answers_json
            ans_txt = FilePool.small_answers_txt
        with open(ans_json, 'r') as f_json:
            text = json.load(f_json)
            if args.trim_stop:
                self.cut_answers = [[ele for ele in answer if ele not in self.stopwords] for answer in text]
            else:
                self.cut_answers = text
        with open(ans_txt, 'r') as f_ans_txt:
            text = f_ans_txt.readlines()
            self.uncut_answers = [line.rstrip('\n') for line in text]

        # Load the QA base and the base of known questions
        if args.method == Method.mix or args.method == Method.qq_match:
            with open(FilePool.qa_file, 'r') as f_qa:
                self.qa = json.load(f_qa)
            with open(FilePool.base_ques_list_file, 'r') as f_base_ques_list:
                self.base_ques_list = json.load(f_base_ques_list)

        # Instantiate the bm25 model in advance to improve performance
        # Even if questions are pre-categorized, instantiate it anyway as a fallback for questions whose category is empty
        if (args.method == Method.bm25 or args.method == Method.bm25_syn):
            self.bm25_model_uncat = BM25(self.cut_answers)
        if args.method == Method.mix or args.method == Method.bm25_new:
            self.bm25_model_uncat = NewBM25(self.cut_answers)

        # Instantiate the tfidf model in advance to improve performance
        if args.method == Method.mix or args.method == Method.qq_match:
            self.tfidf_dict = Dictionary(self.base_ques_list)  # fit dictionary
            n_features = len(self.tfidf_dict.token2id)
            bow = [self.tfidf_dict.doc2bow(line) for line in self.base_ques_list]  # convert corpus to BoW format
            # Build the tf-idf model
            self.tfidf_model = TfidfModel(bow)  # fit model
            text_tfidf = self.tfidf_model[bow]  # apply model
            self.sim_index = SparseMatrixSimilarity(text_tfidf, n_features)
        elif args.method == Method.tfidf_sim:
            self.tfidf_dict = Dictionary(self.cut_answers)  # fit dictionary
            n_features = len(self.tfidf_dict.token2id)
            bow = [self.tfidf_dict.doc2bow(line) for line in self.cut_answers]  # convert corpus to BoW format
            # Build the tf-idf model
            self.tfidf_model = TfidfModel(bow)  # fit model
            text_tfidf = self.tfidf_model[bow]  # apply model
            self.sim_index = SparseMatrixSimilarity(text_tfidf, n_features)

        # Instantiate the parser
        self.parser = StanfordDependencyParser(path_to_jar=FilePool.stanford_parser,
                                               path_to_models_jar=FilePool.stanford_chinese_model)

    # Search with the BM25 algorithm
    def bm25(self, query, categorized_qa):
        # Only instantiate the model here when question categorization is used; otherwise it was already instantiated in __init__()
        if args.categorize_question:
            if len(categorized_qa['cut_answers']) != 0:
                # Only use this as the corpus for BM25 when it is non-empty
                bm25_model = BM25(categorized_qa['cut_answers'])
                # print(categorized_qa['classes'])
            else:
                # If it is empty, fall back to the original corpus for BM25
                bm25_model = self.bm25_model_uncat
                # print('question categorization not used')
        else:
            bm25_model = self.bm25_model_uncat
            # print('question categorization not used')

        bm25_weights = bm25_model.get_scores(query)

        sorted_scores = sorted(bm25_weights, reverse=True)  # Sort scores from high to low
        # Choose the normalization method
        if not args.advanced_norm:
            sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # Divide scores by the query length
        else:
            # A more advanced normalization method
            content_word_cnt = len(query)
            parse_result = self.parser.parse(query).__next__()
            depend_relation_cnt = len(list(parse_result.triples()))
            sorted_scores = [s / (content_word_cnt * k1 + depend_relation_cnt * k2) for s in sorted_scores]
        max_pos = np.argsort(bm25_weights)[::-1]  # Sort from high to low and return indices (not the values)

        # Use max_pos to pull the actual answers out of the answer base
        if args.categorize_question:
            # The answer source is the categorized one
            if len(categorized_qa['cut_answers']) != 0:
                # Only use it as the answer base when it is non-empty
                answers = self.__max_pos2answers(max_pos, categorized_qa['uncut_answers'])
            else:
                # If it is empty, fall back to self.uncut_answers as the answer base
                answers = self.__max_pos2answers(max_pos, self.uncut_answers)
        else:
            # When the answer source is not categorized, categorized_qa is None
            answers = self.__max_pos2answers(max_pos, self.uncut_answers)

        return sorted_scores, max_pos, answers

    # bm25 with synonym module
    # Does not support question categorization
    def bm25_syn(self, query):
        bm25_model = self.bm25_model_uncat

        query_weights = bm25_model.get_scores(query)  # Plain BM25 scoring
        max_pos = np.argsort(query_weights)[::-1][0]  # Index of the highest score (not the value itself)

        # Find the most important word in the query
        max_score = 0
        kw = ''  # The most important word
        kw_idx = -1
        for idx, word in enumerate(query):
            word_weight = bm25_model.get_score([word], index=max_pos)
            if word_weight > max_score:
                max_score = word_weight
                kw = word
                kw_idx = idx

        # Build a synonym list for the keyword
        nearby_list = synonyms.nearby(kw)
        syn_list = [kw]  # Manually add the keyword itself first
        for word, score in zip(nearby_list[0], nearby_list[1]):
            # Condition: the score is above the threshold
            if score > args.syn_threshold and word not in syn_list:
                syn_list.append(word)

        # Find which synonym scores highest
        max_score = -1
        best_kw = ''  # The highest-scoring word
        for syn in syn_list:
            query[kw_idx] = syn  # Replace the keyword in the query
            weights = bm25_model.get_scores(query)  # Plain BM25 scoring
            score = sorted(weights, reverse=True)[0]  # Sort scores from high to low and take the first
            if score > max_score:
                max_score = score
                best_kw = syn

        # if best_kw != kw:
        #     print('1')
        # else:
        #     print('0')
        # print(kw + '\t' + best_kw)

        # Best keyword found; back to the normal flow: return sorted_scores, max_pos, answers
        query[kw_idx] = best_kw
        bm25_weights = bm25_model.get_scores(query)

        sorted_scores = sorted(bm25_weights, reverse=True)  # Sort scores from high to low
        sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # Divide scores by the query length
        max_pos = np.argsort(bm25_weights)[::-1]  # Sort from high to low and return indices (not the values)
        answers = self.__max_pos2answers(max_pos, self.uncut_answers)

        return sorted_scores, max_pos, answers

    # Improved BM25
    def bm25_new(self, query, uncut_query, categorized_qa):
        # Only instantiate the model here when questions are categorized, the category is non-empty, and uni_idf is off
        # Otherwise the model was already instantiated in __init__()
        if args.categorize_question and len(categorized_qa['cut_answers']) != 0 and not args.uni_idf:
            bm25_model = NewBM25(categorized_qa['cut_answers'])
            # print(categorized_qa['classes'])
        else:
            bm25_model = self.bm25_model_uncat
            # print('question categorization not used')

        expanded_query = []
        for q in query:
            if q not in expanded_query:
                expanded_query.append(q)  # Add q itself to expanded_query first
            nearby_list = synonyms.nearby(q)  # Build a synonym list for q
            for word, score in zip(nearby_list[0], nearby_list[1]):
                # Condition: the score is above the threshold and expanded_query does not already contain the word
                if score > args.syn_threshold and word not in expanded_query:
                    expanded_query.append(word)

        bm25_weights = bm25_model.get_new_scores(query, expanded_query)

        sorted_scores = sorted(bm25_weights, reverse=True)  # Sort scores from high to low
        # Choose the normalization method
        if not args.advanced_norm:
            sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # Divide scores by the query length
        else:
            # A more advanced normalization method
            content_word_cnt = len(query)
            parse_result = self.parser.parse(query).__next__()
            depend_relation_cnt = len(list(parse_result.triples()))
            sorted_scores = [s / (content_word_cnt * k1 + depend_relation_cnt * k2) for s in sorted_scores]
        max_pos = np.argsort(bm25_weights)[::-1]  # Sort from high to low and return indices (not the values)

        # Use max_pos to pull the actual answers out of the answer base
        # Only use categorized_qa as the answer base when questions are categorized, the category is non-empty, and uni_idf is off
        # Otherwise use self.uncut_answers as the answer base
        if args.categorize_question and len(categorized_qa['cut_answers']) != 0 and not args.uni_idf:
            answers = self.__max_pos2answers(max_pos, categorized_qa['uncut_answers'])
        else:
            answers = self.__max_pos2answers(max_pos, self.uncut_answers)

        # If IDF is computed over the full answer base, the scores are the same as without categorization; we just filter out answers that are not in categorized_qa
        if args.uni_idf:
            filtered_sorted_scores = []
            filtered_max_pos = []
            filtered_answers = []
            for s, m, a in zip(sorted_scores, max_pos, answers):
                if a in answers:
                    filtered_sorted_scores.append(s)
                    filtered_max_pos.append(m)
                    filtered_answers.append(a)
            sorted_scores = filtered_sorted_scores
            max_pos = filtered_max_pos
            answers = filtered_answers

        # if args.categorize_question:
        #     print(sorted_scores[0])

        # # If the score with question categorization does not reach the threshold, start over and rerun new bm25 on the full answer base
        # if args.categorize_question and sorted_scores[0] < args.bm25_new_cat_threshold:
        #     bm25_model = self.bm25_model_uncat
        #
        #     bm25_weights = bm25_model.get_new_scores(query, expanded_query)
        #
        #     sorted_scores = sorted(bm25_weights, reverse=True)  # Sort scores from high to low
        #     # Choose the normalization method
        #     if not args.advanced_norm:
        #         sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # Divide scores by the query length
        #     else:
        #         # A more advanced normalization method
        #         content_word_cnt = len(query)
        #         parse_result = self.parser.parse(query).__next__()
        #         depend_relation_cnt = len(list(parse_result.triples()))
        #         sorted_scores = [s / (content_word_cnt * k1 + depend_relation_cnt * k2) for s in sorted_scores]
        #     max_pos = np.argsort(bm25_weights)[::-1]  # Sort from high to low and return indices (not the values)
        #
        #     answers = self.__max_pos2answers(max_pos, self.uncut_answers)

        return sorted_scores, max_pos, answers

    # Question-question matching
    def qq_match(self, query):
        # Run tf-idf and get the index list sorted from high to low
        sorted_scores, max_pos, _ = self.tfidf_sim(query, direct_call=False)
        answers, questions = self.__max_pos2answers_questions(max_pos)

        # Filter the results with the QQ-matching threshold
        sorted_scores, max_pos, answers, questions = \
            self.__filter_by_threshold(sorted_scores, max_pos, answers, questions, args.qq_threshold)

        return sorted_scores, max_pos, answers, questions

    # QQ matching + question categorization + QA matching + answerability check
    def qq_cat_qa_filter(self, query, uncut_query, categorized_qa):
        sorted_scores, max_pos, answers, questions = self.qq_match(query)  # Try QQ matching first

        if len(sorted_scores) > 0:
            # QQ matching worked well; return the results directly
            # print(questions[0])
            return sorted_scores, max_pos, answers, questions
        else:
            # Nothing is left after the cutoff, so no QQ match reached the threshold
            # Give up on QQ matching and switch to QA matching
            # QA matching currently uses the BM25 algorithm
            sorted_scores, max_pos, answers = self.bm25_new(query, uncut_query, categorized_qa)

            # Filter the results with the QA-matching threshold; note the threshold differs with and without categorization
            if args.categorize_question and len(categorized_qa['cut_answers']) != 0 and not args.uni_idf:
                if args.advanced_norm:
                    # new bm25 with the advanced normalization
                    threshold = args.cat_adv_norm_threshold
                else:
                    # new bm25 with plain normalization
                    threshold = args.cat_threshold
            else:
                # new bm25 without question categorization
                threshold = args.qa_threshold
            sorted_scores, max_pos, answers, _ = \
                self.__filter_by_threshold(sorted_scores, max_pos, answers, [], threshold)

            return sorted_scores, max_pos, answers, []  # Return an empty list in place of questions

    # Search with tf-idf similarity
    # direct_call = True means tfidf_sim() is called directly for answer selection
    # direct_call = False means qq_match() calls tfidf_sim() for QQ matching
    def tfidf_sim(self, query, direct_call=True):
        query_bow = [self.tfidf_dict.doc2bow(query)]  # Build a bag of words from the query
        query_tfidf = self.tfidf_model[query_bow]  # Encode with the tf-idf model
        similarities = self.sim_index[query_tfidf][0]  # Compute similarities

        sorted_scores = sorted(similarities, reverse=True)  # Sort scores from high to low
        max_pos = np.argsort(similarities)[::-1]  # Sort from high to low and return indices (not the values)
        if direct_call:
            answers = self.__max_pos2answers(max_pos, self.uncut_answers)  # Use max_pos to pull the actual answers out of the answer base
        else:
            answers = []  # Nobody uses answers in this case
        return sorted_scores, max_pos, answers

    # Use max_pos to pull the actual answers out of the answer base
    def __max_pos2answers(self, max_pos, uncut_answers):
        max_pos = max_pos.tolist()  # ndarray -> list
        answers = []
        for r in max_pos:
            answers.append(uncut_answers[r])
        return answers

    def __max_pos2answers_questions(self, max_pos):
        max_pos = max_pos.tolist()  # ndarray -> list
        answers = []
        questions = []
        for r in max_pos:
            answers.append(self.qa[r]['sentence'][0])
            questions.append(self.qa[r]['question'])
        return answers, questions

    # Truncate the results by threshold
    def __filter_by_threshold(self, sorted_scores, max_pos, answers, questions, threshold):
        cut_point = 10000  # Cut point, initialized to a very large value
        for i, score in enumerate(sorted_scores):
            # If the i-th score is below the threshold, all later ones are too, so cut here
            if score < threshold:
                cut_point = i  # Set the cut point to i
                break
        # Perform the truncation
        sorted_scores = sorted_scores[:cut_point]
        max_pos = max_pos[:cut_point]
        answers = answers[:cut_point]
        questions = questions[:cut_point]
        return sorted_scores, max_pos, answers, questions
Example no. 21
                    seg_list = jieba.cut(sentence[i])
                    tokens = "/".join(seg_list)
                    tokenizer = tokens.split("/")
                    #print(tokenizer)
                    for j in range(len(char)):
                        if (char[j] in tokenizer):
                            A = char[j]
                            break
                    B = A

                    zh_dependency_parser = StanfordDependencyParser(
                        "/Users/sumeiru/Desktop/StanfordNLP/StanfordNLP/jars/stanford-parser.jar",
                        "/Users/sumeiru/Desktop/StanfordNLP/StanfordNLP/jars/stanford-parser-3.9.2-models.jar",
                        "/Users/sumeiru/Desktop/StanfordNLP/StanfordNLP/models/chineseFactored.ser.gz"
                    )
                    ans = list(zh_dependency_parser.parse(tokens.split("/")))
                    for row in ans[0].triples():
                        if (row[1] == "nsubj" and row[2][0] in char):
                            A = row[2][0]
                            #print("from: "+A)
                        elif (row[1] == "compound:nn"
                              and row[2][0] in char_except):
                            B = row[2][0]
                            #print("to: "+B)
                for k in range(len(vocal)):
                    if (vocal[k] in dialogue[i]):
                        detect_print = False
                        break

                if (detect_print):
                    print(A + ": " + dialogue[i])
Example no. 22
from nltk.parse.stanford import StanfordDependencyParser

chi_parser = StanfordDependencyParser(r"/Users/baymax/Dev/library/stanford/segmenter/stanford-parser.jar",
                                      r"/Users/baymax/Dev/library/stanford/segmenter/stanford-parser-3.6.0-models.jar",
                                      r"/Users/baymax/Dev/library/stanford/segmenter/chinesePCFG.ser.gz"
                                      )
res = list(chi_parser.parse(u'螺纹 按 其 截面 形状 ( 牙型 ) 分为 三角形螺纹 、 矩形螺纹 、 梯形螺纹'.split()))

for row in res[0].triples():
    print(row)
Example no. 23
#------------------------------------------------------------------------------#

YFC_text_sent_token = nltk.sent_tokenize(YFC_text_final)

zz3 = []
for i in range(0,5,1):
    zz3.append(nltk.sent_tokenize(YFC_text_final)[i])
YFC_text__to_temp = ' '.join(zz3)

zz3_5 = nltk.word_tokenize(YFC_text__to_temp)
zz4 = [ii6 for ii6 in zz3_5 if not ii6 in stopword2]
YFC_text__to_parse = ' '.join(zz4)
zzfinal = nltk.sent_tokenize(YFC_text__to_parse.lower())

eng_parser = StanfordDependencyParser()
res = list(eng_parser.parse(zzfinal))

dep_tree_dot_repr = res[0].to_dot()
dtree = res[0]
source = Source(dep_tree_dot_repr, filename="dep_tree", format="png")
source.view()

#------------------------------------------------------------------------------#

tt = {}
ttt4 = res[0].nodes.keys()
for i in range(len(ttt4)+1):
    tt[i]=res[0].get_by_address(i)
    
kkw={}
Example no. 24
from nltk.parse.stanford import StanfordDependencyParser, StanfordParser

from src.data.paths import get_dictionaty_data_path

path_to_jar = get_dictionaty_data_path('stanford-corenlp-3.9.1.jar')
path_to_models_jar = get_dictionaty_data_path(
    'stanford-corenlp-3.9.1-models.jar')

dep_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                      path_to_models_jar=path_to_models_jar)
parser = StanfordParser(path_to_jar=path_to_jar,
                        path_to_models_jar=path_to_models_jar)

sentence = [
    'i', 'am', 'a', 'smart', 'business', 'man', 'and', 'can', 'lead', 'much',
    'better', 'than', 'the', 'old', 'lazy', 'presidents'
]

# print(list(parser.raw_parse("the quick brown fox jumps over the lazy dog")))
# print([parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")])

# result = list(dep_parser.parse(sentence))
# result[0].draw()

result = list(dep_parser.parse(sentence))
dep_tree = [parse.tree() for parse in result][0]
dep_tree.draw()
Example no. 25
    for w in doc:
        if w.text in neg_words:
            negated.add(w.head.i)
    for j in xrange(0, len(tokens)):
        neg_array.append(j in negated)
    nlp.tokenizer = old_tokenizer
    return neg_array


if __name__ == "__main__":
    tokens = map(TokenFactory.create, "I do not have any idea and I am not alright .".split(" "))
    print tokens
    print compute_neg_dir_dep(tokens, ["not", "any"])
    dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    # parse
    tree = next(dep_parser.parse("I do not have any idea and I am not alright.".split(" "))).tree()
    # Can't deal with the duplicate because the dependency graph does not label each token
    negated = []
    parent = tree.label()
    queue = Queue.Queue()
    queue.put((parent, tree))
    while not queue.empty():
        elem = queue.get()
        print elem
        parent, current = elem
        if isinstance(current, Tree):
            label = current.label()
            for c in current:
                queue.put((label, c))
        else:
            label = current
Example no. 26
                                words_len = len(words)
                            else:
                                i = i + 1
                                words_len = len(words)

                        words_only = words[1:]

                        i = 1
                        while (i < words_len):
                            lmtzr.lemmatize(words[i])
                            i = i + 1

                        pos_words = pos_tagger.tag(words_only)
                        parsed_out_pcfg = reg_parser.parse(pos_words)

                        pre_parsed_out = dependency_parser.parse(words_only)
                        dep = pre_parsed_out.__next__()
                        parsed_out = list(dep.triples())

                        Script_Word_Ct += len(pos_words)

                        i = 0
                        while (i < words_len - 1):
                            tags = pos_words[i][1]

                            if (i < len(pos_words) - 1 and tags == 'NP'
                                    and pos_words[i + 1][1] == 'PRP'):
                                NP_PRP += 1

                            i = i + 1
                            for all in tags:
Example no. 27
    except:
        BioSentStopArray = BioSentStopArray
    else:
        BioSentStopArray = cosine_similarity(titlevector, mainfindingvector)
    BioSentStop = BioSentStopArray[0][0]

    # pattern based feature scoring
    Pattern = 0.00
    pattern_sheet = readworkbook("Pattern.xlsx", 0)
    for string_num in range(1, 7):
        if re.search(
                pattern_sheet.cell(string_num, 0).value, sentence.lower()):
            Pattern = Pattern + pattern_sheet.cell(string_num, 1).value

    if sentence.split():
        res = list(dependencyparser.parse(sentence.lower().split()))
        for row in res[0].triples():
            rowtext = ''.join(str(row))
            if rowtext.find("(u'we', u'PRP')") != -1:
                for pattern_num in range(1, 16):
                    if rowtext.find(pattern_sheet.cell(pattern_num,
                                                       2).value) != -1:
                        Pattern = Pattern + pattern_sheet.cell(pattern_num,
                                                               3).value
    mainfindingwordspattern = nltk.word_tokenize(sentence.lower())
    if len(mainfindingwordspattern) > 3:
        for first_three_num in range(1, 38):
            if mainfindingwordspattern[0] + ' ' + mainfindingwordspattern[
                    1] + ' ' + mainfindingwordspattern[2] == pattern_sheet.cell(
                        first_three_num, 4).value:
                Pattern = Pattern + pattern_sheet.cell(first_three_num,
Example no. 28
print(se)

count = 0
res = ''
words = pseg.cut(se)
for word, flag in words:
    if (flag == 'n'):
        print('%s %s' % (word, flag))
        # res += word + ' / '
        res += word + ' '

# Word vectors need a word-segmented text corpus
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(u'testVec.txt')  # Load the corpus
model = word2vec.Word2Vec(sentences, size=200)  # Train the skip-gram model; window defaults to 5

# Compute the list of words related to a given term
y2 = model.most_similar(u"螺纹", topn=40)  # the 40 most related words
count = 0
for item in y2:
        print(item[0], item[1])

chi_parser = StanfordDependencyParser(r"/Users/baymax/Dev/library/stanford/segmenter/stanford-parser.jar",
                                      r"/Users/baymax/Dev/library/stanford/segmenter/stanford-parser-3.6.0-models.jar",
                                      r"/Users/baymax/Dev/library/stanford/segmenter/chinesePCFG.ser.gz"
                                      )
res = list(chi_parser.parse(se.split()))

for row in res[0].triples():
    print(row)
Example no. 29
path_to_jar = '/Users/wang/dev/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar'
path_to_model_jar = '/Users/wang/dev/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar'
model_path = '/Users/wang/dev/stanford-chinese-corenlp-2018-02-27-models/edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz'

#s = u"你 有个 优惠券 快要 过期 了"
s = u"王其龙 是 一个 优秀 的 程序员,他 喜欢 江曦"

# Dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
parser = StanfordDependencyParser(path_to_jar, path_to_model_jar, model_path)
result = list(parser.parse(s.split()))
for row in result[0].triples():
    print(row)

# Constituency (phrase structure) parsing
from nltk.parse.stanford import StanfordParser
parser = StanfordParser(path_to_jar, path_to_model_jar, model_path)
result = list(parser.parse(s.split()))
for r in result:
    print (r)
    print (r.draw())
Example no. 30
    print(dep)
    for d in dep['deps']:
        for addr2 in dep['deps'][d]:
            traverse(deps, addr2)


# code from the book
dep_parser = StanfordDependencyParser(
    '/data3/zyx/project/eye_nlp/data/model/stanford-parser.jar',
    '/data3/zyx/project/eye_nlp/data/model/stanford-parser-3.9.2-models.jar',
    model_path='/data3/zyx/project/eye_nlp/data/model/englishPCFG.ser.gz')
# print(list(english_parser.raw_parse_sents(('this is the english parser test'))))
# [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]

s = 'the big dog chased the little cat.'
res = dep_parser.parse(
    s.split())  # can use a simple .split since my input is already tokenised
deps = res.__next__()
traverse(deps, 0)  # 0 is always the root node

features = []


def cal_dominate(dep, index):
    tmp_dominate = 0
    for each_dep in dep.nodes[index]['deps']:
        for each_index in dep.nodes[index]['deps'][each_dep]:
            tmp_dominate += cal_dominate(dep, each_index)
    return tmp_dominate + 1


for i in range(1, len(deps.nodes)):
Example no. 31
# mylist = list(eng_parser.parse(sentence.split()))
# print(len(mylist))
# print("句法分析结果", mylist)

# Dependency parsing
# Notes on the dependency relation labels: http://universaldependencies.org/u/dep/all.html#al-u-dep/det
eng_dependency_parser = StanfordDependencyParser(
    path_to_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser.jar",
    path_to_models_jar=
    r"D:\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
    model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

outputs = ' '.join(tokenizer.tokenize("Dole was defeated by Clinton"))
print(outputs)

result = list(eng_dependency_parser.parse(outputs.split()))
for each in result[0].triples():
    print(each)
#     if each[1]=='dobj':
#         # print(each)
#         print(each[0][0])
#         print(each[2][0])
# print("依存句法分析结果:")
# for row in result[0].triples():
#     print(row)
# print(result[0])
# Chinese word segmentation
# Needs further investigation; the code below raises an error
# chinese_segmenter = StanfordSegmenter(
#     path_to_jar=r"D:\stanford-segmenter-2016-10-31\stanford-segmenter-3.7.0.jar",
#     path_to_slf4j=r"D:\stanford-segmenter-2016-10-31\slf4j-api.jar",
Example no. 32
    'interest_3': 2,
    'interest_4': 3,
    'interest_5': 4,
    'interest_6': 5
}

bayes = [[], [], [], [], [], []]

count = [0, 0, 0, 0, 0, 0]

n = 0

for instance in senseval.instances('interest.pos')[0:1599]:
    count[sense[instance.senses[0]]] += 1
    sentence = ' '.join(w for (w, p) in instance.context)
    parsed = list(parser.parse(tokenizer.tokenize(sentence)))
    for triple in parsed[0].triples():
        related = 0
        if triple[0][0] in interest:
            word = triple[2][0]
            related = 1
        if triple[2][0] in interest:
            word = triple[0][0]
            related = 1
        if related == 1:
            exist = 0
            for item in bayes[sense[instance.senses[0]]]:
                if item[0] == word:
                    item[1] += 1
                    exist = 1
            if exist == 0: