Example no. 1
from nltk.parse.stanford import StanfordDependencyParser


class DependencyParsing:
    """
    Stanford dependency parsing
    """
    def __init__(self, path_to_jar, path_to_models_jar):
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

    def parse_sentences(self, sentences):
        """ Dependency parsing of list of tokenized sentences using the stanford parser

        :param sentences: List of sentences. Each sentence is a list of tokens.
        :type sentences: list(list(str))
        :return: iterator of DependencyGraph objects
        :rtype: iterator
        """
        # parse_sents() yields, for each sentence, an iterator over its parses;
        # return the best (first) parse for every sentence.
        result = self.dependency_parser.parse_sents(sentences)
        return (next(parses) for parses in result)

    def parse_sentence(self, sentence):
        """ Dependency parsing of a tokenized sentence using the stanford parser

        :param sentence: sentence as a list of tokens.
        :type sentence: list(str)
        :return: DependencyGraph object
        :rtype: nltk.DependencyGraph
        """
        result = self.dependency_parser.parse(sentence)
        return next(result)
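
A minimal usage sketch for the class above, assuming the Stanford parser jars have been downloaded locally; the file names below are placeholders and must be adjusted:

# Hypothetical paths to a local Stanford parser distribution.
parser = DependencyParsing(
    path_to_jar="stanford-parser.jar",
    path_to_models_jar="stanford-parser-3.9.2-models.jar")

graph = parser.parse_sentence(["The", "cat", "sat", "on", "the", "mat", "."])
# triples() yields ((head, head_tag), relation, (dependent, dependent_tag)) tuples.
print(list(graph.triples()))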
Example no. 2
import os

from nltk.parse.stanford import StanfordDependencyParser


def dependencies():
    # Build the output entries: each article is passed through the Stanford
    # parser for dependency analysis. createId() is defined elsewhere in the
    # project and provides the file names and the input to parse.
    (filenameDep, inputDependencies) = createId()
    
    os.environ['CLASSPATH'] = "stanford-parser/stanford-parser-full-2018-10-17"
    os.environ['JAVAHOME'] = "D:/Program Files/java/bin"
    path_parser = "stanford-parser/stanford-parser-full-2018-10-17/stanford-parser.jar"
    path_model = "stanford-parser/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar"
    dependency_parser = StanfordDependencyParser(path_to_jar=path_parser, path_to_models_jar=path_model)

    texts_dependencies = {}
    for i in range(len(inputDependencies)):
        parsedText = ""
        # parse_sents() expects a list of tokenized sentences and yields, for
        # each sentence, an iterator over its DependencyGraph parses.
        parsed_sents = dependency_parser.parse_sents(inputDependencies[i])

        for dep in parsed_sents:
            for d in dep:
                parsedText += str(d)
        texts_dependencies[filenameDep[i]] = parsedText
    return texts_dependencies
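
A hedged sketch of how the returned dictionary might be consumed; it assumes the jar paths and the project's createId() above are valid in the local environment:

# Hypothetical usage of dependencies().
texts_dependencies = dependencies()
for filename, parsed in texts_dependencies.items():
    print(filename, parsed[:200])   # first part of the stringified parse for each file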
Example no. 3
import pickle

import nltk
from nltk.parse.stanford import StanfordDependencyParser


def createDependenceInformation(inputfile, outputfile):
    # loadXML() is defined elsewhere in the project and returns a corpus
    # object whose .texts attribute holds the raw texts.
    corpus = loadXML(inputfile)
    texts = corpus.texts
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sent_num = []
    all_sents = []
    print('Sentence splitting started')
    for text in texts:
        # split the text into sentences
        sents = tokenizer.tokenize(text)
        sents = [nltk.word_tokenize(sent) for sent in sents]
        all_sents.extend(sents)
        sent_num.append(len(sents))
    print('Parsing started')
    eng_parser = StanfordDependencyParser(
        r"..\..\stanford parser\stanford-parser.jar",
        r"..\..\stanford parser\stanford-parser-3.6.0-models.jar")
    res_list = list(eng_parser.parse_sents(all_sents))
    res_list = [list(i) for i in res_list]
    depends = []
    # iterate over each group of relations (item[0] is the best parse for the sentence)
    for item in res_list:
        depend = []
        for row in item[0].triples():
            depend.append(row)
        depends.append(depend)
    print('Parsing finished, starting to split')
    index = 0
    depend_list = []
    for num in sent_num:
        depend_list.append(depends[index:index + num])
        index += num

    print('Splitting finished, saving')
    with open(outputfile, 'wb') as f:
        pickle.dump(depend_list, f)
    print('Done.')
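
For illustration, a small sketch of reading the pickled output back; the file name is a placeholder for whatever was passed as outputfile:

# Hypothetical read-back of the pickled dependency information.
with open('dependencies.pkl', 'rb') as f:
    depend_list = pickle.load(f)

# depend_list[i] holds, for the i-th text, one list of
# ((head, head_tag), relation, (dependent, dependent_tag)) triples per sentence.
for sentence_triples in depend_list[0]:
    print(sentence_triples)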
Example no. 4
import os
from copy import deepcopy

# STDParser, STParser, ParseTree and PARENT_IDX come from the surrounding project;
# STDParser and STParser appear to be aliases for NLTK's StanfordDependencyParser
# and StanfordParser classes.


class StanfordParser:
    def __init__(self, query, config):
        jars_path = config.jars_path

        os.environ['STANFORD_PARSER'] = jars_path
        os.environ['STANFORD_MODELS'] = jars_path
        os.environ['CLASSPATH'] = jars_path

        self.parse(query)

        self.build_tree(query)
        self.fix_conj(query)

    def parse(self, query):

        self.depParser = STDParser(
            model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        self.parser = STParser(
            model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

        result = self.depParser.parse_sents([query.sentence.output_words])

        self.map_words = {'ROOT': [0]}
        self.map_words_index = {'ROOT': 0}
        self.parent_index = {}

        for idx in range(len(query.sentence.output_words)):
            if self.map_words.get(query.sentence.output_words[idx],
                                  None) is None:
                self.map_words[query.sentence.output_words[idx]] = []
            self.map_words[query.sentence.output_words[idx]] += [idx + 1]
            self.map_words_index[query.sentence.output_words[idx]] = 0

        self.parent_index = deepcopy(self.map_words_index)
        item = next(result)
        dep = next(item)

        self.tree = dep.tree()
        dependency_list = list(dep.triples())
        dep_root_item = [(('ROOT', 'ROOT'), 'root', dependency_list[0][0])]
        dep_dict = {}

        for item in dep_root_item + dependency_list:

            process_items = [item[0], item[2]]

            if dep_dict.get(item[2][0], None) is None:
                dep_dict[item[2][0]] = []
            dep_dict[item[2][0]] += [item]

        tree_table = []
        #treeTable[0] = (1, )
        #for item in self.
        for idx in range(len(query.sentence.output_words)):
            real_idx = idx + 1

            word = query.sentence.output_words[idx]
            idx_dep = self.map_words_index[word]

            dependency = dep_dict[word][idx_dep]
            relation = dependency[1]
            tag = dependency[2][1]

            parent_word = dependency[0][0]
            parent_idx = self.parent_index[word]
            # STRONG ASSUMPTION: always use the first occurrence of the parent word.
            parent_word_idx = self.map_words[parent_word][0]

            tree_table_item = [
                real_idx,
                word.replace('_', ' '), tag, parent_word_idx, parent_word,
                relation
            ]
            tree_table += [tree_table_item]

            if relation.startswith('conj'):
                query.conj_table += [
                    str(parent_word_idx) + ' ' + str(real_idx)
                ]

            self.map_words_index[word] = idx_dep + 1
            self.parent_index[word] = parent_idx + 1

        query.tree_table = tree_table

    def build_tree(self, query):

        query.parse_tree = ParseTree()
        done_list = [False] * len(query.tree_table)
        i = 0

        for tree_table_item in query.tree_table:
            if tree_table_item[PARENT_IDX] == 0:
                done_list[i] = True
                query.parse_tree.build_node(tree_table_item)
            i += 1

        finished = False
        while not finished:
            i = 0
            for i in range(len(query.tree_table)):
                if not done_list[i]:
                    if query.parse_tree.build_node(query.tree_table[i]):

                        done_list[i] = True
                        break

            finished = True
            for done_list_item in done_list:
                if not done_list_item:
                    finished = False
                    break

    def fix_conj(self, query):

        if len(query.conj_table) == 0:
            return
        i = 0

        for conj_table_item in query.conj_table:
            numbers = conj_table_item.split(' ')
            gov_idx = int(numbers[0])
            dep_idx = int(numbers[1])
            gov_node = query.parse_tree.search_node_by_order(gov_idx)
            dep_node = query.parse_tree.search_node_by_order(dep_idx)
            logic = ','

            if query.parse_tree.search_node_by_order(dep_node.word_order -
                                                     1) is not None:
                logic = query.parse_tree.search_node_by_order(
                    dep_node.word_order - 1).label

            if logic.lower() == 'or':
                dep_node.left_rel = 'or'
                for j in range(len(gov_node.parent.children)):
                    if gov_node.parent.children[j].left_rel == ',':
                        gov_node.parent.children[j].left_rel = 'or'

            elif logic.lower() == 'and' or logic.lower() == 'but':
                dep_node.left_rel = 'and'

                for j in range(len(gov_node.parent.children)):
                    # compare the child's left_rel (as in the 'or' branch above), not the node itself
                    if gov_node.parent.children[j].left_rel == ',':
                        gov_node.parent.children[j].left_rel = 'and'

            elif logic.lower() == ',':
                dep_node.left_rel = ','

            dep_node.parent = gov_node.parent
            gov_node.parent.children += [dep_node]
            gov_node.children.remove(dep_node)
            dep_node.relationship = gov_node.relationship
            i += 1
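
The class above reads and writes several attributes on the query and config objects it receives. The stand-ins below are hypothetical and only sketch that interface; the project's ParseTree, PARENT_IDX, STDParser/STParser and the Stanford jars still have to be available before the commented call will run:

# Hypothetical stand-ins sketching the attributes StanfordParser expects.
class Config:
    jars_path = "/path/to/stanford/jars"      # placeholder jar directory


class Sentence:
    output_words = ["list", "the", "students", "and", "their", "advisors"]


class Query:
    def __init__(self):
        self.sentence = Sentence()
        self.conj_table = []      # parse() appends "gov dep" index pairs for conj relations
        self.tree_table = None    # parse() fills this with per-word rows
        self.parse_tree = None    # build_tree() fills this with a ParseTree

# With the project's ParseTree, PARENT_IDX and the Stanford jars in place:
# query = Query()
# StanfordParser(query, Config())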
Example no. 5
import nltk
import numpy as np
from gensim.models import Word2Vec
from nltk.parse.stanford import StanfordDependencyParser


class SummaryTools:

    _PRSR_NAME = "stanford-parser.jar"
    _MODELS_NAME = "stanford-parser-models.jar"

    def __init__(self, jar_dir="/Users/brandon/Code/External Packages/JARS/",
                 w2v_model_path="../Input/Models/news_w2v_50.p"):
        parser_path = jar_dir + SummaryTools._PRSR_NAME
        models_path = jar_dir + SummaryTools._MODELS_NAME
        self.SDP = StanfordDependencyParser(parser_path,
                                            models_path)
        self.model = Word2Vec.load(w2v_model_path)
        # Determine the model's vector size:
        self.v_size = self.model.wv.vector_size

    def parameterize_sentence(self, S):
        """Determine and return the word vectors for each token in S.

        Vector is the word2vec for sent[i] concatenated with parameters
        representing part-of-speech (POS).

        * S should already be tokenized and processed. *

        Additional Parameters:
        is_NN     is_NNS
        is_NNP    is_VB
        is_VBD    is_VBG
        is_DT     is_JJ

        For the sake of efficiency, does not get deps.
        """

        # 1) Tag S for POS:
        # TODO: Time me! It might be what is slowing us to a crawl...
        pos_tags = SummaryTools.get_pos_tags(S)

        word_vecs = []
        # 2) Look up word2vec vectors (use zero vectors if a word is not in the model):
        for i in range(len(S)):
            word_vecs.append(self.get_word_vec(S[i]))
            other_params = []

            # Determine additional params:
            check_pos = ["NN", "NNP", "NNS", "VB", "VBD", "VBG", "DT", "JJ"]
            for check in check_pos:
                other_params.append(1 if pos_tags[i] == check else 0)
            check_deps = ["root", "advc", "nsubj", "dobj"]

            # Additional parameters go here: #################################
            # TODO: Check if word was in quotations?
            # ################################################################

#             other_params.append(self.get_simmilarity(S[i], prev_root))
#             other_params.append(self.get_simmilarity(S[i], prev_sub))

            # Concatenate additional parameters to the word vector
            # (both converted to column arrays first):
            other_params = np.array([[o] for o in other_params])
            wv = np.array([[v] for v in word_vecs[-1]])
            word_vecs[-1] = np.concatenate((wv, other_params),
                                           axis=0)
        return word_vecs

    def parameterize_sentences(self, S):
        S_params = []
        S_deps = self.get_rel_deps_all(S)
        for i in range(len(S)):
            s = S[i]
            S_params.append(self.parameterize_sentence(s))
            # concat params with corresponding deps
            for w in range(len(S_params[-1])):
                check_deps = ["root", "advc", "nsubj", "dobj"]
                deps = []
                for check in check_deps:
                    deps.append(1 if check in S_deps[i][w] else 0)
                wv = S_params[-1][w]
                deps = np.array([[d] for d in deps])
                S_params[-1][w] = np.concatenate((wv, deps), axis=0)
        return S_params

    def get_word_vec(self, word):
        try:
            return self.model.wv[word]
        except KeyError:
            # word is not in model
            return np.zeros(shape=(self.v_size,))

    @staticmethod
    def get_pos_tags(S):
        # NOTE: use "nltk.download("maxent_treebank_pos_tagger")"
        #  (inside python) if missing resource to call nltk.pos_tag()
        return list(zip(*nltk.pos_tag(S)))[1]

    def get_rel_deps(self, S):
        # {index: ["ROOT", "ADVC", ...]}
        deps = {}
        for i in range(len(S)):
            deps[i] = set()

        parse = self.SDP.parse_sents([S])
        parse = list(parse)[0]  # 1st sentence
        DG = list(parse)[0]  # 1st DepGraph?
        for n in range(len(DG.nodes)):
            if DG.nodes[n]["word"] is None:
                continue  # TOP node (not a word)
            deps[n-1].add(DG.nodes[n]["rel"])
        return deps

    def get_rel_deps_all(self, S):
        deps = []

        parses = list(self.SDP.parse_sents(S))
        for i in range(len(S)):
            s_deps = {}
            for w in range(len(S[i])):
                s_deps[w] = set()
            parse = list(parses[i])  # ith sentence
            DG = parse[0]  # 1st DepGraph?
            for n in range(len(DG.nodes)):
                if DG.nodes[n]["word"] is None:
                    continue  # TOP node (not a word)
                # TODO: Add additional DepParse info?
                s_deps[n-1].add(DG.nodes[n]["rel"])
            deps.append(s_deps)
        return deps

    def get_simmilarity(self, w1, w2, v_size=50):
        # v_size is kept for call compatibility; the class holds a single word2vec model.
        try:
            return self.model.wv.similarity(w1, w2)
        except KeyError:
            return 0
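
A hedged usage sketch for the class above; the jar directory and word2vec model path are placeholders for local resources:

# Hypothetical usage of SummaryTools with placeholder paths.
tools = SummaryTools(jar_dir="/path/to/stanford/jars/",
                     w2v_model_path="news_w2v_50.p")

sentence = ["The", "committee", "approved", "the", "budget", "."]
vectors = tools.parameterize_sentence(sentence)
# One column vector per token: word2vec values plus eight POS indicator rows.
print(len(vectors), vectors[0].shape)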