from nltk.parse.stanford import StanfordDependencyParser


class DependencyParsing:
    """Stanford dependency parsing."""

    def __init__(self, path_to_jar, path_to_models_jar):
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

    def parse_sentences(self, sentences):
        """
        Dependency parsing of a list of tokenized sentences using the Stanford parser.

        :param sentences: List of sentences. Each sentence is a list of tokens.
        :type sentences: list(list(str))
        :return: iterator of DependencyGraph objects, one per sentence
        :rtype: iterator
        """
        result = self.dependency_parser.parse_sents(sentences)
        # parse_sents yields an iterator of parses per sentence; take the best
        # (first) parse of each so the return value matches the docstring.
        return (next(parses) for parses in result)

    def parse_sentence(self, sentence):
        """
        Dependency parsing of a tokenized sentence using the Stanford parser.

        :param sentence: sentence as a list of tokens.
        :type sentence: list(str)
        :return: DependencyGraph object
        :rtype: nltk.DependencyGraph
        """
        result = self.dependency_parser.parse(sentence)
        return next(result)
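# Minimal usage sketch of DependencyParsing (the jar paths are hypothetical;
# point them at a local Stanford parser download):
parser = DependencyParsing(
    path_to_jar="stanford-parser-full-2018-10-17/stanford-parser.jar",
    path_to_models_jar="stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar")
graph = parser.parse_sentence(["The", "cat", "sat", "on", "the", "mat"])
for gov, rel, dep in graph.triples():
    print(gov, rel, dep)  # ((word, tag), relation, (word, tag)) triples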
import os

from nltk.parse.stanford import StanfordDependencyParser


def dependencies():
    # File creation: each article is run through the Stanford parser for
    # dependency analysis.
    (filenameDep, inputDependencies) = createId()
    os.environ['CLASSPATH'] = "stanford-parser/stanford-parser-full-2018-10-17"
    os.environ['JAVAHOME'] = "D:/Program Files/java/bin"
    path_parser = "stanford-parser/stanford-parser-full-2018-10-17/stanford-parser.jar"
    path_model = "stanford-parser/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar"
    dependency_parser = StanfordDependencyParser(path_to_jar=path_parser,
                                                 path_to_models_jar=path_model)
    texts_dependencies = {}
    for i in range(len(inputDependencies)):
        parsedText = ""
        # inputDependencies[i] holds the tokenized sentences of article i
        dependencies = dependency_parser.parse_sents(inputDependencies[i])
        for dep in dependencies:
            for d in dep:
                parsedText += str(d)
        texts_dependencies[filenameDep[i]] = parsedText
    return texts_dependencies
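# Sketch of the input shape dependencies() expects back from createId()
# (hypothetical data, inferred from the parse_sents call above, which wants
# one list of tokens per sentence):
filenameDep = ["article_001.txt"]
inputDependencies = [
    [["The", "cat", "sat", "."],        # sentence 1 of article 1, tokenized
     ["It", "purred", "loudly", "."]],  # sentence 2 of article 1
]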
import pickle

import nltk
from nltk.parse.stanford import StanfordDependencyParser


def createDependenceInformation(inputfile, outputfile):
    corpus = loadXML(inputfile)
    texts = corpus.texts
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sent_num = []
    all_sents = []
    print('Sentence splitting started')
    for text in texts:
        # Split each text into sentences, then tokenize each sentence
        sents = tokenizer.tokenize(text)
        sents = [nltk.word_tokenize(sent) for sent in sents]
        all_sents.extend(sents)
        sent_num.append(len(sents))
    print('Parsing started')
    eng_parser = StanfordDependencyParser(
        r"..\..\stanford parser\stanford-parser.jar",
        r"..\..\stanford parser\stanford-parser-3.6.0-models.jar")
    res_list = list(eng_parser.parse_sents(all_sents))
    res_list = [list(i) for i in res_list]
    depends = []
    # Collect the dependency triples of the best (first) parse of each sentence
    for item in res_list:
        depend = []
        for row in item[0].triples():
            depend.append(row)
        depends.append(depend)
    print('Parsing finished, splitting per text')
    index = 0
    depend_list = []
    for num in sent_num:
        depend_list.append(depends[index:index + num])
        index += num
    print('Splitting finished, saving')
    with open(outputfile, 'wb') as f:
        pickle.dump(depend_list, f)
    print('Done.')
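# Sketch: reading the pickled output back.  depend_list[t][s] is the list of
# ((gov_word, gov_tag), relation, (dep_word, dep_tag)) triples for sentence s
# of text t (the file name below is hypothetical):
import pickle

with open('dependencies.pkl', 'rb') as f:
    depend_list = pickle.load(f)
for gov, rel, dep in depend_list[0][0]:
    print(gov, rel, dep)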
import os
from copy import deepcopy

# STDParser / STParser are assumed to be the NLTK Stanford wrappers, aliased
# to avoid clashing with the class defined below; ParseTree and PARENT_IDX
# come from the surrounding project.
from nltk.parse.stanford import StanfordDependencyParser as STDParser
from nltk.parse.stanford import StanfordParser as STParser


class StanfordParser:

    def __init__(self, query, config):
        jars_path = config.jars_path
        os.environ['STANFORD_PARSER'] = jars_path
        os.environ['STANFORD_MODELS'] = jars_path
        os.environ['CLASSPATH'] = jars_path
        self.parse(query)
        self.build_tree(query)
        self.fix_conj(query)

    def parse(self, query):
        self.depParser = STDParser(
            model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        self.parser = STParser(
            model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        result = self.depParser.parse_sents([query.sentence.output_words])

        # Map each word to its 1-based positions in the sentence, plus
        # per-word cursors used when the same word occurs more than once.
        self.map_words = {'ROOT': [0]}
        self.map_words_index = {'ROOT': 0}
        self.parent_index = {}
        for idx in range(len(query.sentence.output_words)):
            word = query.sentence.output_words[idx]
            if self.map_words.get(word, None) is None:
                self.map_words[word] = []
            self.map_words[word] += [idx + 1]
            self.map_words_index[word] = 0
        self.parent_index = deepcopy(self.map_words_index)

        item = next(result)
        dep = next(item)
        self.tree = dep.tree()
        dependency_list = list(dep.triples())
        # The governor of the first triple is taken to be the root word.
        dep_root_item = [(('ROOT', 'ROOT'), 'root', dependency_list[0][0])]
        # Group dependencies by their dependent word.
        dep_dict = {}
        for item in dep_root_item + dependency_list:
            if dep_dict.get(item[2][0], None) is None:
                dep_dict[item[2][0]] = []
            dep_dict[item[2][0]] += [item]

        tree_table = []
        for idx in range(len(query.sentence.output_words)):
            real_idx = idx + 1
            word = query.sentence.output_words[idx]
            idx_dep = self.map_words_index[word]
            dependency = dep_dict[word][idx_dep]
            relation = dependency[1]
            tag = dependency[2][1]
            parent_word = dependency[0][0]
            parent_idx = self.parent_index[word]
            # Strong assumption: always resolve the parent to its first
            # occurrence in the sentence.
            parent_word_idx = self.map_words[parent_word][0]
            tree_table_item = [
                real_idx, word.replace('_', ' '), tag,
                parent_word_idx, parent_word, relation
            ]
            tree_table += [tree_table_item]
            if relation.startswith('conj'):
                query.conj_table += [str(parent_word_idx) + ' ' + str(real_idx)]
            self.map_words_index[word] = idx_dep + 1
            self.parent_index[word] = parent_idx + 1
        query.tree_table = tree_table

    def build_tree(self, query):
        query.parse_tree = ParseTree()
        done_list = [False] * len(query.tree_table)
        # Attach the root-level nodes first.
        for i, tree_table_item in enumerate(query.tree_table):
            if tree_table_item[PARENT_IDX] == 0:
                done_list[i] = True
                query.parse_tree.build_node(tree_table_item)
        # Keep sweeping until every node has been attached to its parent.
        finished = False
        while not finished:
            for i in range(len(query.tree_table)):
                if not done_list[i]:
                    if query.parse_tree.build_node(query.tree_table[i]):
                        done_list[i] = True
                        break
            finished = True
            for done_list_item in done_list:
                if not done_list_item:
                    finished = False
                    break

    def fix_conj(self, query):
        if len(query.conj_table) == 0:
            return
        for conj_table_item in query.conj_table:
            numbers = conj_table_item.split(' ')
            gov_idx = int(numbers[0])
            dep_idx = int(numbers[1])
            gov_node = query.parse_tree.search_node_by_order(gov_idx)
            dep_node = query.parse_tree.search_node_by_order(dep_idx)
            # The coordinating word (if any) immediately precedes the dependent.
            logic = ','
            if query.parse_tree.search_node_by_order(dep_node.word_order - 1) is not None:
                logic = query.parse_tree.search_node_by_order(
                    dep_node.word_order - 1).label
            if logic.lower() == 'or':
                dep_node.left_rel = 'or'
                for j in range(len(gov_node.parent.children)):
                    if gov_node.parent.children[j].left_rel == ',':
                        gov_node.parent.children[j].left_rel = 'or'
            elif logic.lower() == 'and' or logic.lower() == 'but':
                dep_node.left_rel = 'and'
                for j in range(len(gov_node.parent.children)):
                    if gov_node.parent.children[j].left_rel == ',':
                        gov_node.parent.children[j].left_rel = 'and'
            elif logic.lower() == ',':
                dep_node.left_rel = ','
            # Re-attach the dependent as a sibling of the governor.
            dep_node.parent = gov_node.parent
            gov_node.parent.children += [dep_node]
            gov_node.children.remove(dep_node)
            dep_node.relationship = gov_node.relationship
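# Standalone sketch of the triples() format that parse() consumes above
# (assumes CLASSPATH / the model jars are set up as in __init__):
from nltk.parse.stanford import StanfordDependencyParser

dep_parser = StanfordDependencyParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
sent_parses = next(dep_parser.parse_sents([["dogs", "and", "cats", "run"]]))
graph = next(sent_parses)
for gov, rel, dep in graph.triples():
    # e.g. (('run', 'VBP'), 'nsubj', ('dogs', 'NNS')),
    #      (('dogs', 'NNS'), 'conj', ('cats', 'NNS'))
    print(gov, rel, dep)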
import nltk
import numpy as np
from gensim.models import Word2Vec
from nltk.parse.stanford import StanfordDependencyParser


class SummaryTools:

    _PRSR_NAME = "stanford-parser.jar"
    _MODELS_NAME = "stanford-parser-models.jar"

    def __init__(self, jar_dir="/Users/brandon/Code/External Packages/JARS/",
                 w2v_model_path="../Input/Models/news_w2v_50.p"):
        parser_path = jar_dir + SummaryTools._PRSR_NAME
        models_path = jar_dir + SummaryTools._MODELS_NAME
        self.SDP = StanfordDependencyParser(parser_path, models_path)
        self.model = Word2Vec.load(w2v_model_path)
        # Determine model vector size:
        self.v_size = len(self.model.seeded_vector(0))

    def parameterize_sentence(self, S):
        """Determine and return the word vectors for each token in S.

        Vector is the word2vec for S[i] concatenated with parameters
        representing part-of-speech (POS).
        * S should already be tokenized and processed.
        * Additional parameters:
          is_NN is_NNS is_NNP is_VB is_VBD is_VBG is_DT is_JJ
        For the sake of efficiency, does not get deps.
        """
        # 1) Tag S for POS:
        # TODO: Time me! It might be what is slowing us to a crawl...
        pos_tags = SummaryTools.get_pos_tags(S)
        word_vecs = []
        # 2) Look up word2vecs (use 0 vectors if word does not exist):
        for i in range(len(S)):
            word_vecs.append(self.get_word_vec(S[i]))
            # Determine additional params (POS one-hot):
            other_params = []
            check_pos = ["NN", "NNP", "NNS", "VB", "VBD", "VBG", "DT", "JJ"]
            for check in check_pos:
                other_params.append(1 if pos_tags[i] == check else 0)
            # Additional parameters go here: ################################
            # TODO: Check if word was in quotations?                        #
            #################################################################
            # other_params.append(self.get_simmilarity(S[i], prev_root))
            # other_params.append(self.get_simmilarity(S[i], prev_sub))
            # Concatenate additional parameters to the word vector:
            other_params = np.array([[o] for o in other_params])
            wv = np.array([[v] for v in word_vecs[-1]])
            word_vecs[-1] = np.concatenate((wv, other_params), axis=0)
        return word_vecs

    def parameterize_sentences(self, S):
        S_params = []
        S_deps = self.get_rel_deps_all(S)
        for i in range(len(S)):
            s = S[i]
            S_params.append(self.parameterize_sentence(s))
            # Concatenate params with the corresponding dependency one-hots:
            for w in range(len(S_params[-1])):
                check_deps = ["root", "advc", "nsubj", "dobj"]
                deps = []
                for check in check_deps:
                    deps.append(1 if check in S_deps[i][w] else 0)
                wv = S_params[-1][w]
                deps = np.array([[d] for d in deps])
                S_params[-1][w] = np.concatenate((wv, deps), axis=0)
        return S_params

    def get_word_vec(self, word):
        try:
            return self.model[word]
        except KeyError:  # word is not in model
            return np.zeros(shape=(self.v_size,))

    @staticmethod
    def get_pos_tags(S):
        # NOTE: run nltk.download("maxent_treebank_pos_tagger") (inside
        # python) if a resource is missing when calling nltk.pos_tag().
        return list(zip(*nltk.pos_tag(S)))[1]

    def get_rel_deps(self, S):
        # {index: {"root", "advc", ...}}
        deps = {}
        for i in range(len(S)):
            deps[i] = set()
        parse = self.SDP.parse_sents([S])
        parse = list(parse)[0]  # 1st sentence
        DG = list(parse)[0]     # 1st DependencyGraph
        for n in range(len(DG.nodes)):
            if DG.nodes[n]["word"] is None:
                continue  # TOP node (not a word)
            deps[n - 1].add(DG.nodes[n]["rel"])
        return deps

    def get_rel_deps_all(self, S):
        deps = []
        parses = list(self.SDP.parse_sents(S))
        for i in range(len(S)):
            s_deps = {}
            for w in range(len(S[i])):
                s_deps[w] = set()
            parse = list(parses[i])  # ith sentence
            DG = parse[0]            # 1st DependencyGraph
            for n in range(len(DG.nodes)):
                if DG.nodes[n]["word"] is None:
                    continue  # TOP node (not a word)
                # TODO: Add additional DepParse info?
                s_deps[n - 1].add(DG.nodes[n]["rel"])
            deps.append(s_deps)
        return deps

    def get_simmilarity(self, w1, w2):
        """Cosine similarity between two words; 0 if either is out of vocabulary."""
        try:
            return self.model.similarity(w1, w2)
        except KeyError:  # word not in vocabulary
            return 0
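# Minimal usage sketch of SummaryTools (paths are hypothetical; input is a
# list of tokenized sentences):
tools = SummaryTools(jar_dir="/path/to/jars/",
                     w2v_model_path="news_w2v_50.p")
sents = [["The", "cat", "sat", "."], ["It", "purred", "."]]
params = tools.parameterize_sentences(sents)
# params[s][w] is word w's word2vec column vector with 8 POS one-hot entries
# and 4 dependency-relation one-hot entries appended:
print(params[0][0].shape)  # (v_size + 12, 1)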