Code Example #1
def get_single(summary):
    if summary.startswith('.'):
        summary = summary[1:]
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    try:
        parse, = dep_parser.raw_parse(summary)
        nouns = set()
        for x in range(1, len(parse.nodes.items())):
            wdict = parse.nodes[x]
            if "NN" in wdict["tag"]:
                nouns.add(wdict["word"])
        return nouns
    except JSONDecodeError:
        print("Decode Error at " + summary)
        return None
    except StopIteration:
        print("Stopped at " + summary)
        return None
    except HTTPError:
        print("HTTPError " + summary)
        return None
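A minimal usage sketch for get_single (hypothetical sentence; assumes a CoreNLP server on localhost:9000 and that the exception classes used above come from json and requests, which this excerpt does not show):

from json import JSONDecodeError           # assumed source of JSONDecodeError
from requests.exceptions import HTTPError  # assumed source of HTTPError
from nltk.parse.corenlp import CoreNLPDependencyParser

nouns = get_single("The quick brown fox jumps over the lazy dog.")
print(nouns)  # expected to be something like {'fox', 'dog'}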
Code Example #2
File: ner.py Project: nitimkc/cyberbullying
            trees = ne_chunk(sent)
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in labels:
                        entities.append(' '.join(
                            [child[0].lower() for child in tree]))
    return entities


# to run this you have to connect to the CoreNLP server API
# go to the dir - stanford-corenlp-full-2018-02-27
# type the two lines below in the terminal as one line
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
# -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 15000 &

from nltk.parse import CoreNLPParser
parser = CoreNLPParser(url='http://localhost:9000')
list(parser.parse(doc))  # for an already-tokenized sentence (list of tokens)
list(parser.raw_parse(doc))  # for a raw, untokenized string

# on tokenized list of words
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
list(pos_tagger.tag(doc))

ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
list(ner_tagger.tag(doc))

from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
list(dep_parser.parse(doc))
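For a quick look at what the dependency parses contain, each parse can be flattened into (governor, relation, dependent) triples; a small sketch with a made-up sentence:

for parse in dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.'):
    for governor, rel, dependent in parse.triples():
        print(governor, rel, dependent)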
Code Example #3
file_names = []

for r, d, f in os.walk(file_loc):
    for file in f:
        if '.txt' in file:
            file_names.append(os.path.join(r, file))

# File read
for file in file_names:
    print(file)
    with open(file, 'r') as file_read:
        file_text = file_read.read()
    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()
    # stanford corenlp is expected to run at localhost:9000
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    corpus_dict = {}
    count = 0
    sent_text = nltk.sent_tokenize(file_text)  # Tokenizing text to sentences
    for sentence in sent_text:
        tokenized_text = [
            i for i in nltk.word_tokenize(sentence.lower())
            if i not in stop_words
        ]  # Tokenizing sentences into words
        # Lemmatizing the words to extract lemmas as features
        lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
        stemmed = [porter.stem(word)
                   for word in tokenized_text]  # Stemming the words
        # POS tagging the words to extract POS features
        tagged = nltk.pos_tag(tokenized_text)
Code Example #4
## 2017-12-03: using a different parser to parse sentences
'''
from nltk.parse.stanford import StanfordDependencyParser
path_to_jar = '/Users/collin/stanford/stanford-parser-full-2017-06-09/stanford-parser.jar'
path_to_models_jar = '/Users/collin/stanford/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'
dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
'''

from nltk.parse.corenlp import CoreNLPServer, CoreNLPDependencyParser
path_to_jar = '/Users/collin/stanford/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0.jar'
path_to_models_jar = '/Users/collin/stanford/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0-models.jar'
server = CoreNLPServer(path_to_jar=path_to_jar,
                       path_to_models_jar=path_to_models_jar)
server.start()
dependency_parser = CoreNLPDependencyParser()

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')


def stem(w):
    return stemmer.stem(w)


DR_one = ['nsubj', 'dobj', 'xsubj', 'csubj', 'nmod', 'iobj', 'xcomp']
DR_two = ['amod']
#DR_two = ['nsubj','dobj','xsubj','csubj','nsubjpass','nmod','iobj']
DR_three = ['conj']
DR = DR_one + DR_three
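Because this snippet starts its own CoreNLPServer, one way to make sure the Java process is shut down after parsing is a try/finally around the calls; a sketch (made-up sentence, jar paths as configured above):

try:
    parse, = dependency_parser.raw_parse('The quick brown fox jumps over the lazy dog.')
    print(parse.to_conll(4))
finally:
    server.stop()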

Code Example #5
    def __init__(self):
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
Code Example #6
import os
import copy
import string
#from word2number import w2n

import stanza
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser

parser = CoreNLPParser(url='http://localhost:9000')
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

nlp = stanza.Pipeline(
    "en",
    processors={"tokenize": "gum", "pos": "gum",
                "lemma": "gum", "depparse": "gum"},
    use_gpu=True,
    pos_batch_size=2000
)

# cd ./Desktop/Udep2Mono/NaturalLanguagePipeline/lib/stanford-corenlp-4.1.0
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

replacement = {
    "out of": "out-of",
    "none of the": "none-of-the",
    "all of the": "all-of-the",
    "some of the": "some-of-the",
    "most of the": "most-of-the",
    "many of the": "many-of-the",
    "several of the": "several-of-the",
Code Example #7
def context_to_tree(ith_data, step, to_graph=False):
    start_time = time.time()

    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

    if to_graph:
        context = ith_data['context']
        graph = [[] for _ in range(len(context))]
    else:
        context = ith_data['context']
        tree = [[] for _ in range(len(context))]
        triple = [[] for _ in range(len(context))]
        # figure = [[] for _ in range(len(context))]

    result = {}

    for i in range(
            len(context)
    ):  ## ith context of the input movie (divided into multiple sentences)
        if to_graph:
            graph[i] = [[] for _ in range(len(context[i]))]
        else:
            tree[i] = [[] for _ in range(len(context[i]))]
            triple[i] = [[] for _ in range(len(context[i]))]
            # figure[i] = [[] for _ in range(len(context[i]))]

        for j, jth in enumerate(context[i]):  ## jth sentence of ith context

            ## Tokenizing PLAN
            if to_graph:
                if jth != '':
                    graph[i][j] = []
                    parsed = dep_parser.raw_parse(jth)
                    for parse in parsed:
                        graph[i][j].append(parse.to_dot())
                    graph[i][j] = graph[i][j][0].split('\n')

                else:
                    graph[i][j] = jth

            else:
                if jth != '':
                    # doc = nlp(jth)
                    # tree[i][j] = doc.sentences[0] ## stanfordnlp
                    tree[i][j], triple[i][j] = [], []
                    parsed = dep_parser.raw_parse(jth)
                    for parse in parsed:
                        tree[i][j].append(parse.tree())
                        triple[i][j].append(parse.triples())

                    # figure[i][j] = tree[i][j][0].pretty_print()
                    tree[i][j] = list(tree[i][j][0])
                    triple[i][j] = list(triple[i][j][0])

                else:
                    tree[i][j] = jth
                    triple[i][j] = jth
                    # figure[i][j] = jth
            # print("{0}th Movie Processing => ".format(step+1) + 'i & j: {0}/{2}, {1}/{3}'.format(i+1, j+1, len(context), len(context[i])))

    if to_graph:
        ith_data['graph'] = graph
        print("Parsing Runtime: %0.2f Minutes" %
              ((time.time() - start_time) / 60))
        return ith_data

    else:
        ith_data['tree'] = tree
        ith_data['triple'] = triple
        # ith_data['figure'] = figure
        # print("Parsing Runtime: %0.2f Minutes"%((time.time() - start_time)/60))
        return ith_data
Code Example #8
# uncomment the two lines below the first time you run the code
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# from cgi import escape
from pprint import pprint
# from en import singular
# from pattern.text.en import singularize
from nltk.stem.snowball import SnowballStemmer
# stemmer is useful but not right now.
from nltk.treeprettyprinter import TreePrettyPrinter
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
coreNLPurl = 'http://corenlp.run/'
localServer = 'http://localhost:9000'
# use the local server with the desired port number in case you can't use corenlp.run
dependencyParser = CoreNLPDependencyParser(url=coreNLPurl)
stemmer = SnowballStemmer("english")
Parser = CoreNLPParser(url=coreNLPurl)


def main():
    reviewFile = "review-data.txt"
    reviewData = open(reviewFile)
    text = reviewData.readline()
    sentences = nltk.sent_tokenize(text)
    print(
        "========================================================================="
    )
    O = ["great"]  # opinion word dictionary
    print("Initial Opinion Lexicon ")
    print(O)
Code Example #9
    def dependency_parse_tree(self, s):
        parser = CoreNLPDependencyParser()

        parse = next(parser.raw_parse(s))

        return parse
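The value returned above is an NLTK DependencyGraph; a small sketch of how it could be inspected, written outside the class for brevity (hypothetical sentence, server assumed at localhost:9000):

parser = CoreNLPDependencyParser(url='http://localhost:9000')
parse = next(parser.raw_parse("The quick brown fox jumps over the lazy dog."))
print(parse.to_conll(4))      # CoNLL-style rows
print(list(parse.triples()))  # (governor, relation, dependent) triples
parse.tree().pretty_print()   # tree rendering of the dependencies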
Code Example #10
File: hanks.py Project: rogerferrod/tln
    lemmatizer = WordNetLemmatizer()
    new_dict = {}
    for k in graph.dict:
        word = graph.dict[k]
        if word in tags.keys() and tags[word] in verbs:
            new_dict[k] = lemmatizer.lemmatize(word, 'v')
        else:
            new_dict[k] = word

    return DependencyGraph(graph.graph, new_dict)


if __name__ == "__main__":
    fillers = []  # [(subj, obj, sentence)]
    sentences = []
    dependency_parser = CoreNLPDependencyParser(url="http://localhost:9000")

    print('extracting sentences...')
    list_word_sentences = text_extraction()
    for sent in list_word_sentences:
        sentence = ' '.join(sent)
        sentences.append(sentence.strip())

    sentences = [x.lower() for x in sentences]
    print(str(len(sentences)) + ' sentences')

    print('extracting fillers...')
    for sentence in sentences:
        # PoS tagging
        sentence = sentence.replace('.', '')
        tokens = nltk.word_tokenize(sentence)
Code Example #11
File: Extractor.py Project: melkimble/SIE598
import nltk
from nltk.corpus import wordnet as wn
from nltk.parse.corenlp import CoreNLPDependencyParser
from graphviz import Source
from pattern.vector import stemmer
from pycorenlp import StanfordCoreNLP
from sutime import SUTime
from textblob import TextBlob
from stanfordnlp.server import CoreNLPClient
from pynlp import StanfordCoreNLP

annotators = 'tokenize, ssplit, pos, ner, coref'
options = {'openie.resolve_coref': True}

nlp = StanfordCoreNLP(annotators=annotators, options=options)
sdp = CoreNLPDependencyParser()

#-----------------------------------------------------------------------------------------------------------------------
#LOAD THE SENTENCES
filepath = 'kolbuszowa.txt'
list_sentences = []
with open(filepath, encoding="utf8") as file:
    for line in file:
        list_sentences.append([line[:line.rfind(".") + 1]])

#PREPROCESSING START
#CREATE TEMPORARY LIST FOR ADJACENT SENTENCES FOR COREFERENCING (PREVIOUS 2 SENTENCES)
for i in range(len(list_sentences)):
    adj_sentences = []
    start_index = i - 1
    if (start_index < 0):
Code Example #12
from datetime import datetime
from collections import defaultdict
import re
import random
from .config import *
from util.file_utils import load, save
from util.dict_utils import counter2ordered_dict
from common.constants import STOPWORDS, PUNCTUATIONS, FIGURE_PATH, OUTPUT_PATH, SYNONYM_DICT, CONCEPT_PATTERN_DICT, SPECIAL_WORDS, PATTERN_WORDS  # SPECIAL_WORDS
import os
from nltk.parse.corenlp import CoreNLPDependencyParser
import networkx as nx
from networkx.drawing.nx_pydot import write_dot
from .GIANT_data_utils import char2cid, get_embedding, from_networkx
from torch_geometric.data import Data  # , DataLoader

DEP_PARSER = CoreNLPDependencyParser(url='http://localhost:9005')


def cover_count(title, entitydict):
    allvalue = 0
    for token, value in entitydict.items():
        if token in title:
            allvalue += value
    return allvalue


def select_sub_titles(title_candi, wordset):
    title_score = {}
    for title in title_candi:
        subline = re.split(r'[?!/,\(\)_:\-【】\[\]—!,\|。、?: 丨]+', title)
        goodtitle = ''
Code Example #13
    def __init__(self):
        self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
        self.sentence_tokenizer = PunktSentenceTokenizer()
Code Example #14
File: Triple_Extractor.py Project: melkimble/SIE598
import os

import nltk
import pynlp
from sutime import SUTime
from nltk.parse.corenlp import CoreNLPDependencyParser
from textblob import TextBlob
from stanfordnlp.server import CoreNLPClient
from pynlp import StanfordCoreNLP
from pycorenlp import *

from Util import get_tree, get_left_children, get_children, get_wordnet_pos, get_right_children

synonyms = {'soldiers':'troopers', 'soldier':'trooper'}

annotators = 'pos, ner, depparse, openie'
options = {'openie.resolve_coref': True}

nlp_ = pynlp.StanfordCoreNLP(annotators=annotators, options=options)
nlp=StanfordCoreNLP("http://localhost:9000/")

sdp = CoreNLPDependencyParser()
jar_files = os.path.join(os.path.dirname(__file__), 'jars')
sutime = SUTime(jars=jar_files, mark_time_ranges=True, include_range=True)
lemmatizer = nltk.WordNetLemmatizer()
#-----------------------------------------------------------------------------------------------------------------------
#LOAD THE SENTENCES
filepath = 'kolbuszowa.txt'
list_sentences = []
with open(filepath, encoding="utf8") as file:
    for line in file:
        list_sentences.append([line[:line.rfind(".") + 1]])

#PREPROCESSING START
for i in range(len(list_sentences)):
    sentence = list_sentences[i][0]
    #PREPROCESSING
Code Example #15
class DependenciesLCA():
    def __init__(self, sentence, port=9004):
        self.sentence = sentence.rstrip('.')
        self.sentence = re.sub(r'(.?)([\.,;:\?!()\[\]\{\}«»\'\"\-\—\/’&])',
                               '\\1 \\2 ', self.sentence)

        self.corenlpparser = CoreNLPDependencyParser(url='http://localhost:' +
                                                     str(port))
        parse = self.corenlpparser.raw_parse(self.sentence)
        self.tree = next(parse)

    def lca(self, index1, index2):
        path1 = []
        path2 = []
        path1.append(index1)
        path2.append(index2)

        node = index1
        while (node != self.tree.root):
            node = self.tree.nodes[node['head']]
            path1.append(node)

        node = index2
        while (node != self.tree.root):
            node = self.tree.nodes[node['head']]
            path2.append(node)

        for l1, l2 in zip(path1[::-1], path2[::-1]):
            if (l1 == l2):
                temp = l1
        return temp

    def path_lca(self, node, lca_node):
        path = []
        path.append(node)
        while (node != lca_node):
            node = self.tree.nodes[node['head']]
            path.append(node)
        return path

    def branch_paths(self, ent1, ent2):

        entity1 = re.split(r"[ .',\-0-9]", ent1)[-1]
        entity2 = re.split(r"[ .',\-0-9]", ent2)[-1]

        node1 = None
        node2 = None
        for node in self.tree.nodes:
            if (self.tree.nodes[node]["word"] == entity1) & (node1 == None):
                node1 = self.tree.nodes[node]
            elif (self.tree.nodes[node]["word"] == entity2) & (node2 == None):
                node2 = self.tree.nodes[node]

        try:
            if node1['address'] != None and node2['address'] != None:
                lca_node = self.lca(node1, node2)
                path1 = self.path_lca(node1, lca_node)
                path2 = self.path_lca(node2, lca_node)

                word_path1 = "/".join([p["word"] for p in path1])
                word_path2 = "/".join([p["word"] for p in path2])
                rel_path1 = "/".join([p["rel"] for p in path1])
                rel_path2 = "/".join([p["rel"] for p in path2])
                pos_path1 = "/".join([p["tag"] for p in path1])
                pos_path2 = "/".join([p["tag"] for p in path2])
            else:
                print(entity1, entity2, self.sentence)
        except AssertionError:
            print("Node none, Entity 1 :", node1, entity1, ent1,
                  " /  Entity2 :", node2, entity2, ent2, " /  Phrase :",
                  self.sentence)
        except:
            if (bool(re.search(r'\d', entity1)) == True) | (bool(
                    re.search(r'\d', entity1)) == False):
                return (None, None, None, None, None, None)
            print("Node none, Entity 1 :", node1, entity1, ent1,
                  " /  Entity2 :", node2, entity2, ent2, " /  Phrase :",
                  self.sentence, "  / Tree : ", self.tree)

        return (word_path1, word_path2, rel_path1, rel_path2, pos_path1,
                pos_path2)
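A minimal usage sketch for DependenciesLCA (hypothetical sentence and entities; assumes a CoreNLP server on port 9004 and the re / CoreNLPDependencyParser imports of the surrounding module):

lca = DependenciesLCA("The quick brown fox jumps over the lazy dog.", port=9004)
paths = lca.branch_paths("fox", "dog")
print(paths)  # (word_path1, word_path2, rel_path1, rel_path2, pos_path1, pos_path2)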
Code Example #16
import os
from nltk.tokenize import RegexpTokenizer
from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000',
                                     tagtype="pos")


def read_data():
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_sentences = []
    sentences = []
    for _, _, file in os.walk("../../data/parsing_corpus"):
        for filename in file:
            with open("../../data/parsing_corpus/" + filename, "r") as f:
                contents = f.read()
                contents = contents.split("\n")
                for i in range(len(contents)):
                    temp_tokenized_sentence = tokenizer.tokenize(contents[i])
                    if (len(temp_tokenized_sentence) <= 50):
                        tokenized_sentences.append(temp_tokenized_sentence)
                        sentences.append(contents[i])
    return tokenized_sentences, sentences


tokenized_sentences, sentences = read_data()
dependency_parsed = []
with open("./dependencies.txt", "w") as f:
    for i in range(len(tokenized_sentences)):
        if (tokenized_sentences[i]):
            f.write(sentences[i] + "\n")
            parses = dep_parser.parse(tokenized_sentences[i])
Code Example #17
class SqlGen:
    parsed = ""
    tokenized = ""
    dep_parser = ""
    text = ""
    data = ""
    attributes = ""
    conditions = []

    #constructor
    def __init__(self, sentence):
        self.prop = {
            "depparse.extradependencies": "NONE",
            "depparse.keepPunct": "false"
        }
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
        self.text = ner.NER().ner_pass(sentence)
        self.parsed, = self.dep_parser.raw_parse(self.text,
                                                 properties=self.prop)

    def getData(self, type='None'):

        if type == 'pandas1' or type == 'pandas2' or type == 'pandas3':
            x = self.parsed
            if x.contains_address(0):
                x.remove_by_address(0)
            x = x.nodes
            df = pd.DataFrame(
                [(v['address'], v['word'], v['lemma'], v['ctag'], v['tag'],
                  v['feats'], v['head'], v['deps'], v['rel'])
                 for v in x.values()],
                columns=[
                    'position', 'word', 'lemma', 'ctag', 'tag', 'feat', 'head',
                    'deps', 'rel'
                ]).set_index('position')
            self.data = df
            if type == 'pandas1':
                #all columns are included
                return df
            elif type == 'pandas2':
                # removed some columns from pandas1, only the columns specified in the list are included
                return df[['lemma', 'tag', 'head', 'rel']]
            else:
                # removed all colums except dependents
                return df[['deps']]

        else:
            return self.parsed.to_conll(4)

    def getAction(self, df):
        try:
            mainVerb = df.query("tag == 'VB'  & head == 0").to_dict()
            return mainVerb['lemma']
        except IndexError:
            return

    def getAttributes(self, df):
        #x = df.query(" (rel == 'dobj'  & head == %s) |(rel == 'conj:and'  & head ==  %s)" %(1,1)).to_dict()
        x = df.query(
            "  (rel == 'dobj' & head == %s) |(rel == 'acl:relcl') |(rel == 'conj:and' & head ==  %s) |(rel == 'appos' )"
            % (1, 1)).to_dict()
        self.attributes = (x['lemma'])
        self.rel = (x['rel'])
        return x

    def getValueNodes(self, index):
        pos = self.data.query("(rel == 'acl:relcl' )").to_dict()['word']
        if pos:
            pos = list(pos.keys())[0]
            x = self.data.query(
                " (rel == 'nmod:poss')|(rel == 'nmod:at')|(rel == 'dobj' & index > %s) | (rel == 'nmod:for')| (rel == 'nmod:as') | (rel == 'nmod:in')"
                % (pos)).to_dict()
            self.conditions = x['word']
            return x
        else:
            x = self.data.query(
                " (rel == 'nmod:poss')|(rel == 'nmod:at') | (rel == 'nmod:for')| (rel == 'nmod:as') | (rel == 'nmod:in')"
            ).to_dict()
            self.conditions = x['word']
            return x

    def findAssociation(self, attributes):
        att = []
        for keys in attributes:
            x = self.data.query(
                " (~tag.str.contains('DT')& ~rel.str.contains('ref')& ~rel.str.contains('cc')& ~rel.str.contains('case') & ~rel.str.contains('punct')) &(head == %s)"
                % (keys)).to_dict()
            #temp = [(attributes[keys])]
            if self.rel[keys] == "acl:relcl":
                temp = {attributes[keys]: 'acl:relcl'}
            else:
                temp = {attributes[keys]: 'main'}
            for keys in x['lemma']:
                try:
                    if (x['lemma'][keys] not in attributes.values() and
                            x['lemma'][keys] not in self.conditions.values()):
                        temp[x['lemma'][keys]] = x['rel'][keys]
                except AttributeError:
                    pass
            att.append(temp)
        return att
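A hypothetical driver for the class above (a sketch only: it assumes the project's ner module, pandas imported as pd, and a CoreNLP server at localhost:9000; the input sentence is made up):

gen = SqlGen("Show the name of the employee who works in sales")
df = gen.getData('pandas2')
print(df)
print(gen.getAction(df))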
Code Example #18
from datetime import datetime
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.dependencygraph import DependencyGraph

parser = CoreNLPDependencyParser(url='http://localhost:9000')

sentence = "The trophy would not fit in the brown suitcase because it was too big"
# sentence = "I spread the roth on the table in order to protect it"
# sentence = "On the table I've spread the roth in order to protect it"
# sentence = "The city councilmen refused the demonstrators a permit because they feared violence"
# sentence = "She said he told her their secrets"
sentence = "The monkey said the bird told the elephant he was dangerous."
sentence = "The women stopped taking the pills because they were carcinogenic."
sentence = "Marta has a cat, her cat is brown"
parse, = parser.raw_parse(sentence)
conll = parse.to_conll(4)
print(conll)
dg = DependencyGraph(conll)
dotted = dg.to_dot()
G = dg.nx_graph()
svg = dg._repr_svg_()
with open('hoy_' + str(datetime.now()) + '.svg', 'w') as f:
    f.write(svg)
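The dotted string and the networkx graph G above are computed but not used further in this snippet; for a quick console view, the same dependency graph can also be rendered directly (a small sketch):

print(dotted)             # Graphviz source for the dependency graph
dg.tree().pretty_print()  # ASCII tree in the terminal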
Code Example #19
import os
from nltk.tokenize import RegexpTokenizer
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000',
                                     tagtype="pos")
import collections


def read_data():
    count_preposition = []
    tokenizer = RegexpTokenizer(r'\w+')
    write_to_file = []
    for _, _, file in os.walk("../../data/parsing_corpus"):
        for filename in file:
            with open("../../data/parsing_corpus/" + filename, "r") as f:
                preposition_list = []
                contents = f.read()
                contents = contents.split("\n")
                for i in range(len(contents)):
                    temp_list = []
                    temp_tokenized_sentence = tokenizer.tokenize(contents[i])
                    if (len(temp_tokenized_sentence) <= 50):
                        if (temp_tokenized_sentence):
                            parses = dep_parser.parse(temp_tokenized_sentence)
                            for parse in parses:
                                for governor, dep, dependent in parse.triples(
                                ):
                                    if (governor[1] == "IN"):
                                        if (governor not in temp_list):
                                            temp_list.append(governor)
Code Example #20
from nltk.parse import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')

# Parse tokenized text.
print(
    list(parser.parse('What is the airspeed of an unladen swallow ?'.split())))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Parse raw string.
print(list(parser.raw_parse('What is the airspeed of an unladen swallow ?')))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Neural Dependency Parser
from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse(
    'What is the airspeed of an unladen swallow ?'.split())
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])
print(
    "\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]\n"
)

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))
print(
    "\nExpected: ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']\n"
)
Code Example #21
def hanks(verb):
    """
    Implementation of P. Hanks' theory.
    Given a transitive verb, we find N sentences in the Brown corpus that
    contain the given verb. We do WSD (using 2 versions of the Lesk algorithm,
    one handwritten by us and the other from the NLTK library) on the verb's
    arguments (subj and obj), and finally we compute the fillers' supersense
    incidence rate.
    """

    fillers = []  # [(subj, obj, sentence)]
    sentences = []

    # Set the URI to communicate with Stanford CoreNLP
    dependency_parser = CoreNLPDependencyParser(url="http://localhost:9000")

    print('[1] - Extracting sentences...')
    list_word_sentences = text_extraction(verb)
    for sent in list_word_sentences:
        sentence = ' '.join(sent)
        sentences.append(sentence.strip())

    sentences = [x.lower() for x in sentences]
    print("\t{} sentences in which the verb \'{}\' appears.".format(str(len(sentences)), verb))

    print('\n[2] - Extracting fillers...')
    for sentence in sentences:
        # PoS Tagging
        sentence = sentence.replace('.', '')
        tokens = nltk.word_tokenize(sentence)
        tags = dict(nltk.pos_tag(tokens))  # dictionary of all PoS Tag of the tokens

        # Syntactic parsing
        result = dependency_parser.raw_parse(sentence)
        dep = next(result)
        graph = OurDependencyGraph()  # first init needed because of .init_from_dot()
        graph.init_from_dot(dep.to_dot())

        # Lemmatization
        # (only the verbs are lemmatized; the other words are not changed)
        lemmatized_graph = lemmatize_graph(graph, tags)  # e.g. "said" to "say"

        verb_key_list = lemmatized_graph.get_verb_key(verb)  # list of keys at which the verb occurs in graph.dict
        # format -> [int1, int2, ...], e.g.: [34], [0, 10, 34, ...]

        if len(verb_key_list) <= 0:
            # DEBUG
            # print("\tError in **{}**".format(sentence), file=sys.stderr)
            continue

        # Adjacency List
        # we take the first occurrence of the verb, which is our root
        adjs = lemmatized_graph.get_adj_neighbor(verb_key_list[0])
        # if the adjacent elements of the verb are subj or obj, we update the adjs variable
        adjs = list(filter(lambda x: x[1] in subj_dept or x[1] in obj_dept, adjs))

        # Valency = 2
        if len(adjs) == 2:  # Note: not all the verbs in the sentences have valency = 2
            # assigning the correct subject and obj
            if adjs[0][1] in subj_dept:
                w1 = lemmatized_graph.dict[adjs[0][0]]
                w2 = lemmatized_graph.dict[adjs[1][0]]
            else:
                w1 = lemmatized_graph.dict[adjs[1][0]]
                w2 = lemmatized_graph.dict[adjs[0][0]]
            fillers.append((w1, w2, sentence))  # where w1 = subj and w2 = obj

    tot = len(fillers)
    print("\n[3] - Total of {} Fillers".format(str(tot)))
    for f in fillers:
        print("\t{}".format(f))

    our_lesk_semantic_types = {}  # {(s1, s2): count}
    nltk_lesk_semantic_types = {}  # {(s1, s2): count}
    for f in fillers:
        # WSD

        # Our Lesk
        s1 = our_lesk(f[0], f[2])
        s2 = our_lesk(f[1], f[2])

        # nltk.wsd's Lesk
        s3 = lesk(f[2], f[0])
        s4 = lesk(f[2], f[1])

        if s1 is not None and s2 is not None:
            # Getting supersenses
            t = (s1.lexname(), s2.lexname())

            # Getting frequency
            if t in our_lesk_semantic_types.keys():
                our_lesk_semantic_types[t] = our_lesk_semantic_types[t] + 1
            else:
                our_lesk_semantic_types[t] = 1

        if s3 is not None and s4 is not None:
            # Getting supersenses
            t = (s3.lexname(), s4.lexname())

            # Getting frequency
            if t in nltk_lesk_semantic_types.keys():
                nltk_lesk_semantic_types[t] = nltk_lesk_semantic_types[t] + 1
            else:
                nltk_lesk_semantic_types[t] = 1

    print('\n[4.1] - "Our Lesk":\n\tFinding Semantic Clusters (percentage, count of instances, semantic cluster):')
    for key, value in sorted(our_lesk_semantic_types.items(), key=lambda x: x[1]):
        to_print = str(round((value / tot) * 100, 2))
        print("\t[{}%] - {} - {}".format(to_print, value, key))

    print('\n[4.2] - "NLTK Lesk":\n\tFinding Semantic Clusters (percentage, count of instances, semantic cluster):')
    for key, value in sorted(nltk_lesk_semantic_types.items(), key=lambda x: x[1]):
        to_print = str(round((value / tot) * 100, 2))
        print("\t[{}%] - {} - {}".format(to_print, value, key))
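A usage sketch for hanks(), assuming the project helpers it relies on (text_extraction, our_lesk, lemmatize_graph, OurDependencyGraph, subj_dept, obj_dept, lesk) are importable and a CoreNLP server is running at localhost:9000:

if __name__ == "__main__":
    hanks('take')  # any transitive verb that occurs in the Brown corpus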
Code Example #22
class SVO():
    def __init__(self, sentence):
        config = ApplicationConfig.get_corenlp_config()
        self._parser = CoreNLPParser(url=f"http://{config['host']}:{config['port']}")
        self._dependency = CoreNLPDependencyParser(url=f"http://{config['host']}:{config['port']}")
        sentence = sentence.replace('  ', ' ')
        sentence = sentence.replace('.', '')
        self._load(sentence)
        self.original = sentence

    def get_dependency_tree(self):
        return self._dependency

    def get_parser_tree(self):
        return self.t
                                                   
    def _load(self, sentence):
        self.t = list(self._parser.raw_parse(sentence))[0]
        self.t = ParentedTree.convert(self.t)

    def show(self):
        self.t.pretty_print()
        
    def find_svo(self):
        self._queue = []

        # the sentence must be S or NP to find the SVO; also find conj
        for i in self.t.subtrees(lambda i: i.label() != 'ROOT'):
#             if i.label() in ['S','NP','SINV','SBAR','FRAG','X','PP']:
            remover = self._find_conj()

            # refresh
            for i in remover:
                self.original = self.original.replace(i, '')
            self._load(self.original) 
            self.pos = self.t.pos()
            self._root = SVONode(('main', self.t), None)
            self._queue.append(self._root)
            break
#             else:
#                 return 'Sentence can not find SVO.'  
                              
        # find SVO   
        while self._queue != []:
            self._data = self._queue.pop(0)
            tmp = list(self._data.data.flatten())
            if ',' in tmp:
                tmp.remove(',')
            if len(tmp) == 1:
                continue
            sentence = ' '.join(self._data.data.flatten())
            self.t = self._data.data

            # find subordinate clauses & coordinating conjunctions & participles
#             self.show()
            if self._data.relation != 'appos':
                self._find_SBAR()
#             self.show()
#             self._remove_comma()
#             self.show()
            self._data.svo = collections.defaultdict(list)

            # Find Subject
            tmp = self._find_subject()
            if isinstance(tmp, list):
                self._data.svo['subject'] = tmp
            else:
                self._data.svo['subject'] = self._add_conj(tmp)

            # Find Predicate
            tmp = self._find_predicate()
            self._data.svo['predicate'] = self._add_conj(tmp)
            
            # Find Object
            tmp = self._find_object(self._data.svo['predicate'])
            self._data.svo['object'] = self._add_conj(tmp)                
            
            self._all = collections.defaultdict(list)
            self._flatten(self._data.svo['predicate'])
            self._data.svo['object'] = self._filter(self._data.svo['object'])
            
            for s in self.t.subtrees():
                if s.label() != 'ROOT':
                    break
                else:
                    for i in self.t.subtrees(lambda i:i.label() != 'ROOT'):
                        if i.label() in ['FRAG']:
                            continue
                        if i.label() in ['S','SINV']:
                            for n in i.subtrees(lambda n: n.label() == 'S' and n != i):
                                flag = True
                                test = n
                                while test.parent():
                                    if test.parent() == i:
                                        flag = False
                                        break
                                    test = test.parent()
                                if flag:
                                    tmp = self._del(' '.join(n.flatten()))
                                    if tmp:
                                        self._refresh(n)
                                        kid = SVONode(('', self.t), self._data)
                                        self._data.child.append(kid)
                                        self._queue.append(kid)
                                break
                        break
                break
                                                   
        # Integrate
        self._result = collections.defaultdict(list)
        self._traversal(self._root)
        
        return self._result                                           
                                                   
    def _filter(self, x):
        for i in x:
            if i[1] != []:
                for j in i[1]:
                    if isinstance(j,dict):
                        for k in ['predicate', 'object']:
                            tmp = self._filter(j[k])
                            if tmp == []:
                                del j[k]
                    else:
                        if j in self._all['predicate']:
                            i[1].remove(j)
            if i[0] in self._all['predicate']:
                x.remove(i)
        return x
                                                   
    def _flatten(self, x):
        for i in x:
            self._all['predicate'].append(i[0])
            if i[1] != []:
                for j in i[1]:
                    if isinstance(j,dict):
                        for k in j.keys():
                            self._flatten(j[k])
                    else:
                        self._all['predicate'].append(j)
    
    def _traversal(self, node):
        if node.svo != None and (node.svo['subject']!=[] or node.svo['predicate']!=[] or node.svo['object']!=[]):
            self._result[node.relation].append({'subject':node.svo['subject'], 'predicate':node.svo['predicate'], 'object':node.svo['object']})
        for i in node.child:
            self._traversal(i)
    
    def _add_conj(self, tmp):
        result = []
        if isinstance(tmp, tuple):
            flag = tmp[0].split(' ')
            if len(flag) <= 5:
                for k in flag:
                    if k in self._dic.keys():
                        # add the conj entries back in
                        for j in self._dic[k]:
                            if j[0] == 'attr':
                                tree = list(self._parser.raw_parse(tmp[0]+' is '+j[1]))[0]
                                tree = ParentedTree.convert(tree)
                                kid = SVONode(('appos', tree), self._data)
                                self._data.child.append(kid)
                                self._queue.append(kid)
                                self._dic[k].remove(j)
#                                 a = tmp[0]
#                                 b = tmp[1]
#                                 result.append((a, b+[j[1]]))
                            else:
                                result.append((j[1], j[2]))

        if isinstance(tmp, tuple) and tmp[0] not in [x[0] for x in result]:
            result.append(tmp)
        result.reverse()
        return result
    
    def _remove_comma(self):
        for i in self.t.subtrees(lambda i:i[0] in [',', ';']):
            if i.left_sibling() and i.left_sibling().label() not in ['NP','S','VP','PP','JJ','SINV','ADJP'] and 'VB' not in i.left_sibling().label():
                if ' '.join(i.left_sibling().flatten()) != ' '.join(self.t.flatten()):
                    self._refresh(i.left_sibling())
                if ' '.join(i.flatten()) != ' '.join(self.t.flatten()):
                    self._refresh(i)
    
    # put the removed clause into a child node
    def _child(self, a, b):
        kid = SVONode((a, b), self._data)
        self._data.child.append(kid)
        self._queue.append(kid)                                               
        self._refresh(b, a)
    
    # can we refresh? (i.e. is the removed clause different from the whole original sentence)
    def _del(self, tmp_1):
        tmp = ' '.join(self.t.flatten())
        tmp = tmp.replace(tmp_1, '')   
        tmp = tmp.strip(',; ') 
        if tmp != '':
            return True
        else:
            return False                                       
                                                   
    def _find_SBAR(self):
        # check for coordinating conjunctions
        for i in self.t.subtrees(lambda i: i.label() == 'CC'):
            if i.right_sibling() and i.right_sibling().label() in ['S','VP']:
                tmp = self._del(i[0]+' '+' '.join(i.right_sibling().flatten()))
                if tmp and [x for x in self._queue if ' '.join(i.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []:
                    self._child(i[0], i.right_sibling())                               
                                                   
        # check for subordinate clauses
        for node in self.t.subtrees(lambda node: node.label() == 'SBAR'):
            if 'VB' in node.pos()[0][1]:
                continue
            tmp = self._del(' '.join(node.flatten()))   
            if tmp:
                conj = []
                # the conjunction
                for s in node.subtrees(lambda s: s.label() != 'SBAR'):
                    if s.label() not in ['S','ADVP','RB'] and 'VB' not in s.label():
                        if s.leaves()[0] not in conj:
                            conj.append(s.leaves()[0])
                    elif s.label() in ['ADVP','RB']:
                        continue
                    else:
                        break
                conj = ' '.join(conj)
                for s in node.subtrees(lambda s: s.label() == 'S'):
                    # SBAR would be duplicated
                    if [x for x in self._queue if ' '.join(s.flatten()) in ' '.join(x.data.flatten())] == []:
                        if node.left_sibling() and node.left_sibling().label() == 'IN' and node.parent().label() != 'S':
                            tmp = self._del(' '.join(node.parent().flatten()))                       
                            if tmp:
                                self._child(conj, s)
                        else:
                            self._child(conj, s)
                    break
                                                  
        # participles
        participle = [x[0] for x in self.t.pos() if x[1] in ['VBG','VBN']]
        for i in participle:
            if i in self.t.leaves():
                candidate = [x for x, y in enumerate(self.t.leaves()) if y == i]
                if candidate[-1] == 0:
                    pos = ''
                else:
                    before = self.t.leaves()[candidate[-1]-1]
                    pos = [x for x in self.t.pos() if x[0] == before][0][1]
                IN = ['when','while','before','after','till','since','because','as','so','although','though','if','unless','upon','once']
                                                   
                if pos == 'IN' and before.lower() in IN:
#                 candidate[-1]-2 >= 0 and 'VB' not in [x for x in self.t.pos() if x[0] == self.t.leaves()[candidate[-1]-2]][0][1]
                    for j in self.t.subtrees(lambda j: j[0] == before):
                        tmp = self._del(' '.join(j.parent().flatten()))                           
                        if tmp and j.parent().label() != 'NP' and j.right_sibling() and [x for x in self._queue if ' '.join(j.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []:
                            self._child(before, j.right_sibling())
                            
                if ('VB' not in pos) and (pos not in ['IN','RB','MD','POS', 'TO']):
                    for j in self.t.subtrees(lambda j: j[0] == i):
                        tmp = self._del(' '.join(j.parent().flatten()))                                                       
                        if tmp and j.parent().label() not in ['NP','ADJP'] and j.right_sibling() and [x for x in self._queue if ' '.join(j.parent().flatten()) in ' '.join(x.data.flatten())] == []:
                            self._child('', j.parent())                       
    
                                                   
    def _refresh(self, node, conj=''):
        sentence = ' '.join(self.t.flatten())
        if conj == '':
            tmp = ' '.join(node.flatten())
        else:
            tmp = conj + ' ' + ' '.join(node.flatten())
        if tmp in sentence:
            idx = sentence.index(tmp)
            if idx-2 >= 0 and sentence[idx-2] == ',':
                tmp = ', ' + tmp
            if idx+len(tmp)+1 < len(sentence) and sentence[idx+len(tmp)+1] == ',':
                tmp = tmp +' ,'
        sentence = sentence.replace(tmp, '')
        self._load(sentence)
    
    def _find_conj(self):
        self._dic = collections.defaultdict(list)
        dep, = self._dependency.raw_parse(self.original)
        remover = []      
        pool_conj = []
        pool_appos = []
        for governor, bridge, dependent in dep.triples():
            # coordinating conjunction
            if bridge == 'conj':
                # NN conj NN
                if 'NN' in governor[1] and 'NN' in dependent[1]:
                    address = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0]['conj']
                    for add in address:
                        if add not in pool_conj:
                            tmp = []
                            r = []
                            pool_conj.append(add)
                            for key, value in dep.get_by_address(add)['deps'].items():
                                if key not in ['conj', 'cc', 'nmod', 'nmod:poss']:
                                    for j in value:
                                        tmp.append(dep.get_by_address(j)['word'])
                                        r.append(dep.get_by_address(j)['word'])
                                if key in ['nmod']:
                                    r.append(dep.get_by_address(add)['word'])
                                    for j in value:
                                        for key1, value1 in dep.get_by_address(j)['deps'].items():
                                            if key1 not in ['conj', 'cc']:
                                                for k in value1:
                                                    r.append(dep.get_by_address(k)['word'])
                                        r.append(dep.get_by_address(j)['word'])
                                if key in ['nmod:poss']:
                                    for j in value:
                                        for key1, value1 in dep.get_by_address(j)['deps'].items():
                                            if key1 not in ['conj', 'cc', 'case']:
                                                for k in value1:
                                                   tmp.append(dep.get_by_address(k)['word'])
                                                   r.append(dep.get_by_address(k)['word'])
                                            if key1 in ['case']:
                                                tmp.append(dep.get_by_address(j)['word'])
                                                r.append(dep.get_by_address(j)['word'])
                                                for k in value1:
                                                   tmp.append(dep.get_by_address(k)['word'])
                                                   r.append(dep.get_by_address(k)['word'])
                                    if dep.get_by_address(j)['word'] not in tmp:
                                        tmp.append(dep.get_by_address(j)['word'])
                                        r.append(dep.get_by_address(j)['word'])    
                            if dep.get_by_address(add)['word'] not in tmp:
                                tmp.append(dep.get_by_address(add)['word'])
                            if dep.get_by_address(add)['word'] not in r:
                                r.append(dep.get_by_address(add)['word'])

                            for i in self.t.subtrees(lambda i: i.leaves() == r):
                                for n in i.subtrees(lambda n: n[0] == dependent[0]):
                                    self._dic[governor[0]].append(('entity', ' '.join(tmp), self._find_attrs(n, ' '.join(tmp))))
                                    remover.append(' '.join(r))
                                    break
                                break
                            if ' '.join(r) not in remover:
                                self._dic[governor[0]].append(('entity', ' '.join(tmp), []))
                                remover.append(' '.join(r))
                            
                    
                # VB conj VB O
                elif 'VB' in governor[1] and 'VB' in dependent[1] and governor[1] == dependent[1]:   
                    gov_key = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0].keys()
                    dep_key = [x['deps'] for x in dep.nodes.values() if x['word']==dependent[0]][0].keys()
                    if [j for j in gov_key if j in ['dobj','xcomp','ccomp', 'nmod', 'nsubjpass']]==[] or [j for j in dep_key if j in ['dobj','xcomp','ccomp', 'nmod', 'nsubjpass', 'nsubj']]==[]:  
                        for i in self.t.subtrees(lambda i: i[0] == dependent[0]):
                            self._dic[governor[0]].append(('entity', dependent[0],  self._find_attrs(i, dependent[0])))
                            remover.append(dependent[0])
                            break
                        
            # apposition (return the whole phrase)
            elif bridge == 'appos':
                tmp = []
                address = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0]['appos']
                for add in address:
                    if add not in pool_appos:
                        tmp = []
                        pool_appos.append(add)    
                        for key, value in dep.get_by_address(add)['deps'].items():
                            if key in ['compound', 'amod']:
                                for j in value:
                                    tmp.append(dep.get_by_address(j)['word'])
                            if key in ['nmod']:
                                tmp.append(dep.get_by_address(add)['word'])
                                for j in value:
                                    for key1, value1 in dep.get_by_address(j)['deps'].items():
                                        if key1 not in ['conj', 'cc']:
                                            for k in value1:
                                                tmp.append(dep.get_by_address(k)['word'])
                                    tmp.append(dep.get_by_address(j)['word'])
                        if dep.get_by_address(add)['word'] not in tmp:
                            tmp.append(dep.get_by_address(add)['word'])                        
                        self._dic[governor[0]].append(('attr', ' '.join(tmp), []))
                        remover.append(' '.join(tmp))
        
        for i in range(len(remover)):
            # all possible positions
            can = [m.start() for m in re.finditer(remover[i], self.original)]
            flag = False
            for j in can:
                if self.original[j-2] == ',':
                    remover[i] = ', ' + remover[i]
                    flag = True
                    break
                elif self.original[j-4:j-1] == 'and':
                    remover[i] = 'and ' + remover[i]
                    flag = True
                    break
            if not flag:
                remover[i] = ' ' + remover[i]
        return remover        
                                                   
    # Breadth First Search the tree and take the first noun in the NP subtree.
    def _find_subject(self):
        synonym = ['', 'which', 'that', 'who', 'whom', 'where', 'when', 'what', 'why', 'how', 'whether', 'in']
        for i in self.t.subtrees(lambda i: i.label() == 'SBAR'):
            dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
            sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']]
            if sub != []:
                for s in self.t.subtrees(lambda s:s[0] == sub[0][0]):
                    return self._find_NOUN(s)   
            for s in i.subtrees(lambda s: s.label() == 'NP'):
                for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() in 'PRP'):
                    return self._find_NOUN(n)
                for n in s.subtrees(lambda n: n.label() == 'DT'):
                    return (n[0], self._find_attrs(n, n[0]))
        for i in self.t.subtrees(lambda i: i.label() not in ['S', 'ROOT', 'PP', 'FRAG']):  
            # has a subject
            dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
            sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']]
            if sub != []:
                for s in self.t.subtrees(lambda s:s[0] == sub[0][0]):
                    return self._find_NOUN(s)   
                                                   
            if i.label() not in ['VP','PP'] and 'VB' not in i.label():                                
                for s in self.t.subtrees(lambda s: s.label() == 'NP'): 
                    for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'):                          
                        return self._find_NOUN(n)
                    for n in s.subtrees(lambda n: n.label() == 'DT'):
                        return (n[0], self._find_attrs(n, n[0]))
            
            # imperative sentence
            elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation == 'main':
                if [x for x in self.t.pos()][0][1] not in ['RB','MD'] and 'VB' not in [x for x in self.t.pos()][0][1]:
                    for s in self.t.subtrees(lambda s: s.label() == 'NP'): 
                        for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'):                          
                            return self._find_NOUN(n)
                        for n in s.subtrees(lambda n: n.label() == 'DT'):
                            return (n[0], self._find_attrs(n, n[0]))
                    return None
                else:
                    return None
                                                   
            # no subject & the relation is a pronoun
            elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation in synonym:
                dep, = self._dependency.raw_parse(self.original)
                candidate = [x for x in dep.triples() if x[1] in ['acl:relcl','acl'] and x[2][0] in self.t.flatten()]
                if candidate != []:
                    compound = self._find_compound(candidate[0][0][0], dep)
                    sub = []
                    if compound != '':
                        for com in compound:
                            sub.append(com)
                    sub.append(candidate[0][0][0])
                    return (' '.join(sub), [])
                else:
                    sent = [x[0] for x in self.pos]
                    if self._data.relation != '':
                        candidate = [x for x, y in enumerate(sent) if y == self._data.relation.split(' ')[0]]
                        after = self.t.pos()[0][0]
                    else:
                        candidate = [x for x, y in enumerate(sent) if y == self.t.pos()[0][0]]
                        if len(self.t.pos()) > 1:                               
                            after = self.t.pos()[1][0]
                        else:
                            after = ''                           
                    before = candidate[0] - 1 
                    for x in candidate:
                        if sent[x+1] == after:
                            before = x - 1
                    
                    if before == -1:
                        return None

                    # is the preceding word in the original sentence an NN
                    if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0] or [x[1] for x in self.pos if x[0] == sent[before]][0] in ['PRP']:
                        sub = [sent[before]]
                        before -= 1
                        while 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                            sub.append(sent[before])
                            before -= 1
                        return (' '.join(reversed(sub)), [])
                    elif [x[1] for x in self.pos if x[0] == sent[before]][0] in ['IN',','] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before-1]][0]:
                        before -= 1                               
                        sub = [sent[before]]
                        before -= 1
                        while before != -1 and 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                            sub.append(sent[before])
                            before -= 1
                        return (' '.join(reversed(sub)), [])

                    # find the nearest one in the parent
                    else:
                        target = self.t.pos()[0][0]
                        if self._data.parent.svo['subject'] == []:
                            sub = -1    
                        else:
                            sub = self._data.parent.svo['subject'][0][0].split(' ')[-1]
                        if self._data.parent.svo['object'] == []:
                            obj = -1
                        else:
                            obj = self._data.parent.svo['object'][0][0].split(' ')[-1]
                        if sub == -1 and obj != -1:
                            return self._data.parent.svo['object']
                        elif sub != -1 and obj == -1:
                            return self._data.parent.svo['subject']
                        elif sub != -1 and obj != -1:
                            if abs(self.original.find(target)-self.original.find(sub)) <= abs(self.original.find(target)-self.original.find(obj)):
                                return self._data.parent.svo['subject']
                            else:
                                return self._data.parent.svo['object']

            # no subject, and the relation is a conjunction
            elif i.label() == 'VP' or i.label().startswith('VB'):                                   
                if self._data.parent != None:
                    return self._data.parent.svo['subject']
            else:                                  
                return None
                                                   
    def _find_compound(self, word, dep):
        deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word]
        com = []
        deps = [x for x in deps if 'compound' in x]                                           
        for i in deps:
            for j in i['compound']:
                com.append(dep.get_by_address(j)['word'])  
        deps = [x for x in deps if 'dep' in x]                                           
        for i in deps:
            com.append(dep.get_by_address(i['dep'][0])['word'])                                            
        return com
                                                   
    
    def _compound(self, compound, before):
        obj = []
        if compound != '':
            for n in self.t.subtrees(lambda n:n[0] == before):
                for com in compound:
                    for s in n.parent().subtrees(lambda s:s[0] == com):
                        obj.append(com)
        return obj
                                                   
                                                   
    def _dobj(self, candidate, dep, before):
        if 'dobj' in candidate.keys():
            word = dep.nodes[candidate['dobj'][0]]['word']
            tag = dep.nodes[candidate['dobj'][0]]['tag']
        else:
            word = dep.nodes[candidate['xcomp'][0]]['word']
            tag = dep.nodes[candidate['xcomp'][0]]['tag'] 
        compound = self._find_compound(word, dep)
        obj = self._compound(compound, before)
        if tag != 'TO':
            for n in self.t.subtrees(lambda n:n[0] == before):
                for s in n.parent().subtrees(lambda s:s[0] == word):
                    obj.append(s[0])
                    return (' '.join(obj), self._find_attrs(s, ' '.join(obj)))                                           
        

    def _find_object(self, predicate, node = '', data = ''):
        if node == '':
            node = self.t
        if data == '':
            data = self._data
        synonym = ['which', 'that', 'who', 'whom']                                          
        if data != None and data.relation == 'appos':
            dep, = self._dependency.raw_parse(' '.join(node.flatten()))
        else:
            dep, = self._dependency.raw_parse(self.original)
        for i in predicate:
            pre = i[0].split(' ')
            for j in range(len(pre)-1, -1, -1):
                if len([x['deps'] for x in dep.nodes.values() if x['word']==pre[j]]) > 1:
                    dep, = self._dependency.raw_parse(' '.join(node.flatten()))

                candidate = [x['deps'] for x in dep.nodes.values() if x['word']==pre[j]][0]
                candidate_1 = [x for x in dep.triples() if x[2][0]==pre[j]]
                                                   
                if 'dobj' in candidate.keys() or 'xcomp' in candidate.keys():
                    return self._dobj(candidate, dep, pre[j])
                                                   
                elif 'ccomp' in candidate.keys():
                    word = dep.nodes[candidate['ccomp'][0]]['word']
                    tag = dep.nodes[candidate['ccomp'][0]]['tag']
                    dic = collections.defaultdict(list)
                    deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word][0]
                                                   
                    if 'nsubj' in deps.keys():
                        compound = self._find_compound(dep.get_by_address(deps['nsubj'][0])['word'], dep)
                        obj = self._compound(compound, pre[j])
                        obj.append(dep.get_by_address(deps['nsubj'][0])['word'])
                        if 'dobj' in deps.keys() or 'xcomp' in deps.keys():
                            for n in self.t.subtrees(lambda n:n[0] == word):
                                dic['predicate'].append((word, self._find_attrs(n, word))) 
                            dic['object'] = self._add_conj(self._dobj(deps, dep, word))
                            return (' '.join(obj), [dic])
                     
                    elif 'dobj' in deps.keys():
                        compound = self._find_compound(dep.get_by_address(deps['dobj'][0])['word'], dep)
                        obj = self._compound(compound, pre[j])
                        for n in self.t.subtrees(lambda n:n[0] == dep.get_by_address(deps['dobj'][0])['word']):
                            obj.append(n[0])
                            return (' '.join(obj), self._find_attrs(n, ' '.join(obj)))
#                     else:
#                         return None
                                                   
                elif 'cop' in [x[1] for x in candidate_1]:
                    tmp = [x for x in candidate_1 if x[1] == 'cop'][0]
                    compound = self._find_compound(tmp[0][0], dep)
                    obj = self._compound(compound, pre[j])
                    for j in self.t.subtrees(lambda j:j[0] == tmp[0][0]):
                        obj.append(j[0])
                        return (' '.join(obj), self._find_attrs(j, ' '.join(obj)))    
                elif 'case' in [x[1] for x in candidate_1]:
                    tmp = [x for x in candidate_1 if x[1] == 'case'][0]
                    compound = self._find_compound(tmp[0][0], dep)
                    obj = self._compound(compound, pre[j])
                    for j in self.t.subtrees(lambda j:j[0] == tmp[0][0]):
                        obj.append(j[0])
                        return (' '.join(obj), self._find_attrs(j, ' '.join(obj)))
                                                   
                elif 'auxpass' in candidate.keys():
                    sent = [x[0] for x in self.pos]
                    if data != None and data.relation in synonym:
                        relation = sent.index(data.relation.split(' ')[0])
                        if 'IN' in [x[1] for x in self.pos if x[0] == sent[relation]][0]:
                            return (sent[relation-1], [])
                    return None
                                
                # no direct object
                elif data != None and data.relation in synonym:
                    sent = [x[0] for x in self.pos]
                    before = sent.index(data.relation.split(' ')[0])-1
                    # is the preceding word in the original sentence a noun (NN)?
                    if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                        return (sent[before], [])
                    elif 'IN' in [x[1] for x in self.pos if x[0] == sent[before]][0] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before-1]][0]:
                        return (sent[before-1], [])
                    elif data.child != []:
                        kid = data.child[0]
                        if kid.relation != 'appos':
                            return (kid.relation+' '+' '.join(kid.data.flatten()), [])
                    else:
                        return None

                # the object is a clause
                elif data != None and data.child != []:
                    kid = data.child[0]
                    if kid.relation != 'appos':
                        return (kid.relation+' '+' '.join(kid.data.flatten()), [])
                elif [x for x in dep.nodes.values() if x['word']==pre[j]][0]['tag'] == 'RP':
                    continue
                else:
                    return None
                                                   
    def _find_predicate(self):
        tmp = self.t.flatten()
        for n in self.t.subtrees(lambda n: n.label().startswith('VB')):
            if n.parent().label() in ['ADJP']:
                continue
            i = tmp.index(n[0])
            sub = []
            while self.t.pos()[i-1][1] in ['MD','RB']:
                sub.append(self.t.pos()[i-1][0])
                i -= 1
            sub.reverse()
            i = tmp.index(n[0])
            while i+1 < len(tmp):
                if [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0] == 'RP':
                    sub.append(tmp[i])
                    i += 1
                elif [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0] in ['RB','MD']:
                    if i+2 >= len(tmp):
                        break
                    count = i+2
                    while count+1 < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] in ['RB','MD']:
                        count += 1
                    if count < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] == 'TO':
                        sub.append(tmp[i])
                        i += 1
                    else:
                        break
                else:
                    break
            flag = i
            sub.append(tmp[flag])
            # infinitive (to + verb)
            for j in self.t.subtrees(lambda j:j[0] == tmp[flag]):
                if j.right_sibling() and j.right_sibling().label() == 'PP' and j.right_sibling().leaves()[0] != 'to':
                    start = tmp.index(j.right_sibling().leaves()[-1])
                    has_PP = True
                else:
                    start = flag
                    has_PP = False

                if start+1 < len(tmp) and tmp[start+1] == 'to':
                    for i in range(start+1, len(tmp)):                                                   
                        if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i]][0] in ['TO','RB']:
                            sub.append(tmp[i])
                            if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB'):
                                flag = i
                        else:
                            break

                    if has_PP:
                        for i in self.t.subtrees(lambda i:i[0] == sub[-1]):
                            return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                    else:
                        for i in self.t.subtrees(lambda i:i[0] == tmp[flag]):
                            return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                else:
                    for i in self.t.subtrees(lambda i:i[0] == tmp[flag]):
                        return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                                                   
           
    def _find_NOUN(self, n):
        # possessive construction (e.g. "John 's car")
        if n.parent().right_sibling() and n.parent().right_sibling().label().startswith('NN'):
            sub = n.parent().leaves()
            p = n.parent()
            while p.right_sibling():
                if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP','CD','DT']:
                    p = p.right_sibling()
                    sub.append(p[0])   
                else:
                    break
            return (' '.join(sub), self._find_attrs(p, ' '.join(sub)))
        else:
            sub = []
            pp = n.parent()   
            flag = ''
            for l in pp:
                if l.label().startswith('NN') or l.label() in ['PRP','CD','DT']:
                    if l[0] not in sub:
                        sub.append(l[0])
                        flag = l 
            if flag == '':
                sub.append(n[0])
                flag = n
            return (' '.join(sub), self._find_attrs(flag, ' '.join(sub)))
                                                   
    def _find_to(self, node):
        dic = collections.defaultdict(list)
        flag = node.leaves().index('to')
        tmp = node.leaves()[flag:]
        predicate = []
        for i in tmp:
            if [x[1] for x in self.t.pos() if x[0] == i][0] in 'TO' or 'VB' in [x[1] for x in self.t.pos() if x[0] == i][0]:
                predicate.append(i)
            else:
                break    
        for n in node.subtrees(lambda n: n[0] == predicate[-1]):        
            dic['predicate'].append((' '.join(predicate), self._find_attrs(n, ' '.join(predicate))))
        if predicate[-1] == 'be':
            for n in node.subtrees(lambda n: n.label() in ['NP', 'PP']):
                if n.label() in ['NP', 'PP']:
                    for c in n.subtrees(lambda c: c.label().startswith('NN') or c.label() in ['PRP', 'CD']):
                        a = self._find_NOUN(c)
                        dic['object'] = self._add_conj(a)
                        return dic
        else:
            tmp = self._find_object(dic['predicate'], node, None)
            dic['object'] = self._add_conj(tmp)
            return dic 
                                                   
    def _toV(self, node):
        # the same word may occur more than once in the sentence
        flat = list(self.t.flatten())
        candidate = [x for x, y in enumerate(flat) if y == node[0]]
        flag = candidate[0]
        if node.left_sibling():
            before = node.left_sibling().leaves()[-1]
            for i in candidate:
                if flat[i-1] == before:
                    flag = i
                    break
        elif node.right_sibling():
            after = node.right_sibling().leaves()[0]
            for i in candidate:
                if flat[i+1] == after:
                    flag = i
                    break 
        elif node.parent().left_sibling():
            before = node.parent().left_sibling().leaves()[-1]
            for i in candidate:
                if flat[i-1] == before:
                    flag = i
                    break
        elif node.parent().right_sibling():
            after = node.parent().right_sibling().leaves()[0]
            for i in candidate:
                if flat[i+1] == after:
                    flag = i
                    break 
        
        if not node.label().startswith('VB') and flag+2 < len(flat) and flat[flag+1] == 'to' and [x[1] for x in self.t.pos() if x[0] == flat[flag+2]][0] in 'VB':
            for i in self.t.subtrees(lambda i: i[0] == 'to'):                                 
                if flat[flag] not in i.parent().flatten():
                    return i.parent()

        else:
            return None
               
    def _PP(self, s, name, attrs):
        if ' '.join(s.flatten()) not in name:
            if len(s[0]) != 1:
                for i in s.subtrees(lambda i: i.label() == 'PP'):
                    if i.parent() == s:
                        a = self._proposition(i)
                        if a != []:
                            attrs.append(a)
                        else:
                            attrs.append(' '.join(s.flatten()))
            else:
                a = self._proposition(s)
                if a != []:
                    attrs.append(a)
                else:
                    attrs.append(' '.join(s.flatten()))
        return attrs
                                                   
                                                   
    def _find_attrs(self, node, name): 
        attrs = []
        p = node.parent()
        toV = self._toV(node)
        name = name.split(' ')
        # Search siblings of adjective for adverbs
        if node.label().startswith('JJ'):
            for s in p:
                if s.label() == 'RB':
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s.label() == 'PP':
                    attrs = self._PP(s, name, attrs)
                elif s.label() == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))                 

        elif node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']:
            for s in p:
                if s != node and s.label() in ['DT','PRP$','POS','CD','IN'] or s.label().startswith('JJ') or s.label().startswith('NN'):
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s != node and s.label() in ['ADJP','NP','QP', 'VP']:                            
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))  
                elif s != p and s.label() in ['PP']:
                    attrs = self._PP(s, name, attrs)

        # Search siblings of verbs for adverb phrase
        elif node.label().startswith('VB'):   
            for s in p:
#                 if s.label() in ['ADVP','MD','RB']:
                if s.label() in ['ADVP', 'RB', 'MD']:
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))

                elif s.label() == 'PP':
                    attrs = self._PP(s, name, attrs)

            
        # Search uncles
        # if the node is noun or adjective search for prepositional phrase
        if node.label().startswith('JJ') or node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']:
            if p.label() == 'QP':
                p = p.parent()
            for s in p.parent():
                if s != p and s.label() in ['PP']:
                    attrs = self._PP(s, name, attrs)
                elif s != p and 'NN' in s.label() or s.label() == 'JJ':
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s != p and s.label() == 'VP' and s.parent().label() == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        if toV != None:
                            if ' '.join(s.flatten()[:3]) != ' '.join(toV.flatten()[:3]):
                                attrs.append(' '.join(s.flatten()))
                        else:
#                             self._refresh(s)
                            attrs.append(' '.join(s.flatten()))

        elif node.label().startswith('VB') or node.label() == 'RP':
            if p.parent():
                tmp = node
                for s in p.parent():
                    if s != p and s.label().startswith('ADVP'):
                        if ' '.join(s.flatten()) not in name:
                            attrs.append(' '.join(s.flatten()))
    #                 elif s != p and s.label() in ['MD','RB']:
    #                     attrs.append(s[0])
                    elif s != p and s.label() == 'PP' and s == tmp.right_sibling():       
                        attrs = self._PP(s, name, attrs)
                        tmp = s
        
        if toV != None:
            attrs.append(self._find_to(toV))
            self._refresh(toV) 
        
        return attrs  
                                                   
    def _proposition(self, node):
        dic = collections.defaultdict(list)
        tmp = node.leaves()
        if len(tmp) == 1:
            return []
        for k in node.subtrees(lambda k: k.label() in ['IN', 'TO']):  
            if tmp.index(k[0])+1 < len(tmp):
                VB = [x for x in node.pos() if x[0] == tmp[tmp.index(k[0])+1]]
                if VB != [] and 'VB' in VB[0][1]:                                   
                    dic['predicate'].append((k[0]+' '+VB[0][0], []))
                else:
                    dic['predicate'].append((k[0], []))  
            else:
                dic['predicate'].append((k[0], []))                                   
            if k.right_sibling():
                for c in k.right_sibling().subtrees(lambda c: c.label().startswith('NN') or c.label() in ['JJ', 'CD']):
                    # possessive construction
                    if c.parent().right_sibling() and c.parent().right_sibling().label().startswith('NN'):
                        sub = c.parent().leaves()
                        p = c.parent()
                        while p.right_sibling():
                            if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP','CD']:
                                p = p.right_sibling()
                                sub.append(p[0])
                                flag = p
                            else:
                                break
                    else:
                        sub = []
                        pp = c.parent()
                        for l in pp:
                            if l.label().startswith('NN') or l.label() in ['PRP','CD', 'JJ']:
                                if l[0] not in sub:
                                    sub.append(l[0])
                                    flag = l
                    dic['object'].append((' '.join(sub), self._find_attrs(flag, ' '.join(sub))))
                    dic['object'] = self._add_conj(dic['object'][0])   
                    return dic
                return []
            else:
                return []
        return []                                           
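A standalone sketch of the idea behind _find_compound above, which resolves 'compound' addresses in the dependency graph; this fragment is illustrative only (the sentence and the localhost:9000 server are assumptions, not part of the original snippet):

from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
graph, = dep_parser.raw_parse("The Stanford parser group released new dependency models .")

# For each node, list the modifier words attached to it through the 'compound'
# relation, mirroring how _find_compound resolves node['deps']['compound']
# addresses with get_by_address().
for node in graph.nodes.values():
    for addr in node['deps'].get('compound', []):
        print(graph.get_by_address(addr)['word'], '->', node['word'])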
コード例 #23
0
ファイル: bpe_tags_root.py プロジェクト: liuqingpu/pascal
"""
Created by:         Emanuele Bugliarello (@e-bug)
Date created:       9/4/2019
Date last modified: 9/4/2019
"""

import re
import sys
import html
import numpy as np
from collections import defaultdict
from nltk.parse.corenlp import CoreNLPDependencyParser

mapper = {'"': '``'}
parsers = {
    'en': CoreNLPDependencyParser(url='http://*****:*****'),  # URL masked in the source
}

# Garbled/truncated in the source; the lines below belong to a BPE-detokenizing
# helper whose name is not preserved (placeholder signature; only the
# separator='@@' default survives from the original).
def merge_bpe_tokens(sent, separator='@@'):
    tokens = tokenize(sent)
    word, words = [], []
    for tok in tokens:
        if tok.endswith(separator):
コード例 #24
0
    features_files = [
        "features/features_train_" + args.version + ".txt",
        "features/features_devel_" + args.version + ".txt",
        "features/features_test_" + args.version + ".txt"
    ]

    features_megan_files = [
        "features/features_megan_train_" + args.version + ".txt",
        "features/features_megan_devel_" + args.version + ".txt",
        "features/features_megan_test_" + args.version + ".txt"
    ]

    # connect to your CoreNLP server
    # (note: constructing the parser does not open a connection, so this
    #  try/except cannot catch a missing server - errors only surface on the
    #  first parse call; see the reachability sketch after this snippet)
    try:
        my_parser = CoreNLPDependencyParser(url="http://localhost:9000")
    except (ConnectionError, ConnectionRefusedError) as e:
        print("Loading parser\n")
        print("Error while trying to connect to CoreNLP server. Try running:\n")
        print("\tcd stanford-corenlp-full-2018-10-05")
        print(
            "\tjava -mx4g -cp \"*\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer"
        )
        exit()

    #create_features_file(args.inputdirtrain, features_files[0], features_megan_files[0])
    #create_features_file(args.inputdirdevel, features_files[1], features_megan_files[1])
    #if str(args.createtest) == "True":
    create_features_file(args.inputdirtest, features_files[2],
                         features_megan_files[2])
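Since constructing CoreNLPDependencyParser never contacts the server, the try/except above cannot detect a server that is down. A minimal reachability check could look like the sketch below; the helper name, the localhost:9000 URL and the use of the requests package are assumptions for illustration, not part of the original snippet.

import requests
from nltk.parse.corenlp import CoreNLPDependencyParser

def get_dependency_parser(url="http://localhost:9000"):
    # Ping the CoreNLP server first; the NLTK parser object is lazy and only
    # talks to the server when a parse is actually requested.
    try:
        requests.get(url, timeout=5)
    except requests.exceptions.ConnectionError:
        print("CoreNLP server not reachable at " + url + ". Start it with:")
        print('\tcd stanford-corenlp-full-2018-10-05')
        print('\tjava -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer'
              ' -port 9000 -timeout 15000')
        raise SystemExit(1)
    return CoreNLPDependencyParser(url=url)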
コード例 #25
0
ファイル: test.py プロジェクト: CinaShi/AnswerMachine
from nltk.parse import CoreNLPParser
from nltk import sent_tokenize
# parser = CoreNLPParser(url='http://localhost:9000')

# print(list(parser.parse('Jack is a boy . He is handsome .'.split())))

# print(list(parser.raw_parse('Jack is a boy . He is handsome .')))

from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

print('I am your dad , he is also your dad .'.split())
parses = dep_parser.parse('I am your dad , he is also your dad .'.split())

print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])

# parser = CoreNLPParser(url='http://localhost:9000')

# print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))

# pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')

# print(list(pos_tagger.tag('What is the airspeed of an unladen swallow ?'.split())))

# ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')

# print(list(ner_tagger.tag(('Rami Eid is studying at Stony Brook University in NY'.split()))))

# tagger = CoreNLPParser(url='http://localhost:9000')
コード例 #26
0
ファイル: combined.py プロジェクト: rawrdinasour/Thesis-Q3
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

import pickle
from fuzzywuzzy import fuzz
#nltk.download('punkt')
from nltk import tokenize
import re
from tqdm import tqdm
tqdm.pandas()
from string import digits
from string import punctuation

# In[5]:

# imports assumed for this excerpt (the CoreNLP parsers and ParentedTree are
# used below but their import lines are not in the shown portion of the file)
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.tree import ParentedTree

dep_parser = CoreNLPDependencyParser(url='http://0.0.0.0:9000')
pos_tagger = CoreNLPParser(url='http://0.0.0.0:9000', tagtype='pos')

# In[6]:


def convert_sentence(input_sent):
    # Parse sentence using Stanford CoreNLP Parser
    pos_type = pos_tagger.tag(input_sent.split())
    parse_tree, = ParentedTree.convert(
        list(pos_tagger.parse(input_sent.split()))[0])
    dep_type, = ParentedTree.convert(dep_parser.parse(input_sent.split()))
    return pos_type, parse_tree, dep_type
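
A quick usage sketch for convert_sentence above (the sentence is illustrative; the CoreNLP server configured at 0.0.0.0:9000 is assumed to be running):

pos_type, parse_tree, dep_type = convert_sentence(
    "The quick brown fox jumps over the lazy dog")
print(pos_type)      # list of (token, POS-tag) pairs from the CoreNLP tagger
print(parse_tree)    # constituency parse converted to a ParentedTree
print(dep_type)      # dependency parse of the sentence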


def multi_liaison(
コード例 #27
0
ファイル: deepRank.py プロジェクト: damons/DeepRank
class GraphMaker:
    def __init__(self, parserURL='http://localhost:9000'):
        self.dparser = CoreNLPDependencyParser(url=parserURL)
        self.clear()

    # clear saved state
    def clear(self):
        self.maxcc = None
        self.gs = None
        self.nxgraph = None
        self.ranked = None
        #self.words=mdict() # not used ...
        self.words2lemmas = set()
        self.noun_set = dict()
        self.svo_edges_in_graph = []

    # digest a file
    def load(self, fname):
        self.clear()
        f = open(fname, 'r')
        text = f.read()
        f.close()
        self.digest(text)

    def parse(self, text):
        ts = self.dparser.parse_text(text)
        return list(ts)

    # digest a string using the dependency parser
    def digest(self, text):
        self.clear()
        chop = 2**16
        gens = []
        # deals with files that are too large to be parsed at once
        while len(text) > chop:
            head = text[:chop]
            text = text[chop:]
            #print((head))
            if head:
                hs = list(self.parse(head))
                #print('PARSED')
                gens.append(hs)
        if gens:
            self.gs = [x for xs in gens for x in xs]
        else:
            self.gs = self.parse(text)
        #print('!!!',self.gs)

    # sentence as sequence of words generator
    def sentence(self):
        for g in self.gs:
            yield str.join(' ', list(gwords(g)))

    def wsentence(self):
        for g in self.gs:
            yield tuple(gwords(g))

    def nth_sent_words(self, n):
        ws = tuple(gwords(self.gs[n]))
        return ws

    # sentence as sequence of lemmas generator
    def lsentence(self):
        for g in self.gs:
            yield tuple(glemmas(g))

    # curates, reverses and adds some new edges;
    # yields an <edge, sentence in which it occurs> pair

    def edgesInSent(self):
        self.svo_edges_in_graph = []

        def noun_to_def(x, tx, k):
            if noun_defs:
                k_ = self.noun_set.get(x)
                if k == k_:
                    yield (x, tx, 'first_in', k, 'SENT')

        def edgeOf(k, g):
            d = w2l(g)
            merge_dict(self.words2lemmas, d)
            make_noun_set(g, self.noun_set, k)
            svo_edges_in_sent = []
            for ts in g.triples():
                #print('TS',ts)
                fr, rel, to = list(ts)
                lfrom, ftag = d[fr[0]]
                lto, ttag = d[to[0]]
                # vn is True if it is an s->v or o->v link
                so = isSubj(rel) or isObj(rel)
                vn = isVerb(ftag) and isNoun(ttag) and so
                if rel == 'punct' and ttag == '.':
                    # sentence points to predicate verb
                    yield (k, 'SENT', 'predicate', lfrom, ftag)
                elif vn:
                    # collects vs and vo links to merge them later into svo
                    svo_edges_in_sent.append((lfrom, ftag, rel, lto, ttag))
                    yield lfrom, ftag, rel, lto, ttag  # verb to noun
                    yield k, 'SENT', 'about', lto, ttag  # sent to noun
                    # all words recommend sentence
                    #yield lfrom,ftag,'recommends',k,'SENT' # verb to sent - in elif !
                    for e in noun_to_def(
                            lto,
                            ttag,
                            k,
                    ):
                        yield e  # noun to sent
                    if noun_self: yield lto, ttag, 'self', lto, ttag
                elif isNoun(ttag):  # e.g. nmod relation
                    #print('x-->n',k,lfrom,ftag,rel,lto,ttag)
                    yield lfrom, ftag, rel, lto, ttag
                    for e in noun_to_def(
                            lto,
                            ttag,
                            k,
                    ):
                        yield e  # noun to sent
                    if noun_self: yield lto, ttag, 'self', lto, ttag
                    #yield lfrom, ftag, 'recommends', k, 'SENT' # dependent of noun to sent
                else:  # yield link as is
                    yield lto, ttag, rel, lfrom, ftag
                    # all words recommend sentence
                    if all_recs: yield lto, ttag, 'recommends', k, 'SENT'

                # merge compound terms, make their parts recommend them
                if isNoun(ftag) and isNoun(ttag) and rel == 'compound':
                    comp = lto + ' ' + lfrom
                    yield lfrom, ftag, 'fused', comp, ftag
                    yield lto, ttag, 'fused', comp, ttag
                    for e in noun_to_def(comp, ttag, k):
                        yield e
                    if noun_self: yield comp, ttag, 'self', comp, ttag
            # collect svo relations
            self.svo_edges_in_graph.append(to_svo(k, svo_edges_in_sent))

        k = 0
        for g in self.gs:
            for e in edgeOf(k, g):
                # collects words at the two ends of e
                self.addWordsIn(e)
                yield e, k
            k += 1

    # yields the edge, possibly once for each sentence where it is found
    def multi_edges(self):
        for e, k in self.edgesInSent():
            yield e

    def edges(self):
        for e in set(self.multi_edges()):
            yield e

    # collects unique words at ends of an edge
    # (note: the trailing `yield` makes this a generator; edgesInSent() calls it
    #  without iterating, so the body never runs - consistent with self.words
    #  being disabled in clear() above)
    def addWordsIn(self, e):
        f, tf, r, t, tt = e
        if maybeWord(f) and tf != 'SENT':
            self.words.add(f, tf)
        if maybeWord(t) and tt != 'SENT':
            self.words.add(t, tt)
            yield e

    # returns final networkx text graph
    def graph(self):
        if (self.nxgraph): return self.nxgraph
        dg = nx.DiGraph()

        for e in self.edges():
            f, tf, r, t, tt = e
            dg.add_edge(f, t, rel=r)

        self.nxgraph = dg
        #print('DG:',dg,'END')

        #print('NOUN_SET',self.noun_set)
        return dg

    # ranks (unless ranked and stored as such) the text graph
    def pagerank(self):
        if self.ranked: return self.ranked
        g = self.graph()
        pr = self.runPagerank(g)
        self.ranked = pr
        if not all_recs: return pr
        ccs = list(nx.strongly_connected_components(g))
        lc = len(ccs)
        #print('LENCOM', lc)
        if lc < 4:
            self.maxcc = max(ccs, key=len)
        return pr

    # extracts best k nodes passing filtering test
    def bestNodes(self, k, filter):
        g = self.graph()
        comps = list(nx.strongly_connected_components(g))

        pr = self.pagerank()
        i = 0
        ns = []  # not a set - that loses order !!!
        for x, r in pr:
            if i >= k: break
            #print('RANKED',x,r)
            if filter(x):
                #print('FILTERED',x,r,'MC')
                if not self.maxcc or x in self.maxcc:
                    if not x in ns:
                        ns.append(x)
                        i += 1
        return ns

    # specialization returning all best k nodes
    def bestAny(self, k):
        return self.bestNodes(k, lambda x: True)

    # specialization returning best k sentence nodes
    def bestSentencesByRank(self, k):
        best = self.bestNodes(100 + k, isSent)
        if not best: return
        #print('BEST SENTS:',best)
        c = 0
        for i in best:
            g = self.gs[i]
            lems = [w for w in glemmas0(g)]
            #print('LEMS',lems)
            if isCleanSent(lems):
                sent = list(gwords(g))
                #sent=str.join(' ',list(gwords(g)))
                yield (i, sent)
                c += 1
            #else : print('SENT UNCLEAN',lems)
            if c >= k: break

    def bestSentences(self, k):
        for i_s in sorted(self.bestSentencesByRank(k)):
            yield i_s

    # specialization returning best k word nodes
    def bestWords(self, k):
        #print('NOUNS',self.noun_set)
        c = 0
        best = self.bestNodes(100 + k, maybeWord)
        #print('BEST WORDS:',best)
        for w in best:
            if c >= k: break
            if not isStopWord(w) and self.hasNoun(w):
                yield (w)
                #print('BWORD',w)
            c += 1

    # true if a phrase has a noun in it
    def hasNoun(self, w):
        ws = w.split(' ')
        for v in ws:
            if v in self.noun_set: return True
        return False

    # runs PageRank on text graph
    def runPagerank(self, g):
        d = nx.pagerank(g)
        #print("PR",d)

        # normalize sentence ranks by favoring those close to average rank
        sents = list(self.wsentence())
        lens = list(map(len, sents))
        #print('LENS:', lens)
        avg = sum(lens) / len(lens)

        #print('AVG SENT LENGTH:', avg)

        # reranks long sentences
        i = 0
        for ws in sents:
            #print('WS:',ws)
            if i in d:
                l = len(ws)
                r = d[i]
                newr = adjust_rank(r, l, avg)
                d[i] = newr
                #if l<6 : print(r,'--->',newr,l,'ws=',ws)
                i += 1

        sd = sorted(d, key=d.get, reverse=True)

        return [(k, d[k]) for k in sd]

    # extracts k highest ranked SVO triplets
    def bestSVOs(self, k):
        rank_list = self.pagerank()
        rank_dict = dict()
        for (w, rw) in rank_list:
            rank_dict[w] = rw
        #print('PRANK',rank_list)
        ranked = []  # should not be a set !
        for rs in self.svo_edges_in_graph:
            for r in rs:
                #print('SVO',r)
                (f, _), (rel, _), (t, _), sent_id = r
                srank = rank_dict[f]
                orank = rank_dict[t]
                if srank and orank:
                    sorank = (2 * srank + orank) / 3
                    ranked.append((sorank, (f, rel, t, sent_id)))
        ranked = sorted(ranked, reverse=True)
        i = 0
        exts = set()
        seen = set()
        for (_, e) in ranked:
            i += 1
            if i > k: break
            #print('SVO_EDGE',e)
            if e in seen: continue
            seen.add(e)
            yield e
            for xe in self.extend_with_wn_links(e, rank_dict):
                f, _, t, _ = xe
                if wn.morphy(f.lower()) != wn.morphy(t.lower()):
                    exts.add(xe)
        i = 0
        for xe in exts:
            i += 1
            if i > k: break
            #print('XE',xe)
            yield xe

    # adds wordnet-derived links to a dictionary d
    # we tag them with is_a or part_of
    def extend_with_wn_links(self, e, d):
        s, v, o, sent_id = e
        m = 1  # how many of each are taken
        for x in wn_holo(m, s, 'n'):
            if x in d: yield (s, 'part_of', x, sent_id)
        for x in wn_mero(m, s, 'n'):
            if x in d: yield (x, 'part_of', s, sent_id)
        for x in wn_hyper(m, s, 'n'):
            if x in d: yield (s, 'is_a', x, sent_id)
        for x in wn_hypo(m, s, 'n'):
            if x in d: yield (x, 'is_a', s, sent_id)
        for x in wn_holo(m, o, 'n'):
            if x in d: yield (o, 'part_of', x, sent_id)
        for x in wn_mero(m, o, 'n'):
            if x in d: yield (x, 'part_of', o, sent_id)
        for x in wn_hyper(m, o, 'n'):
            if x in d: yield (o, 'is_a', x, sent_id)
        for x in wn_hypo(m, o, 'n'):
            if x in d: yield (x, 'is_a', o, sent_id)

    # visualize filtered set of edges with graphviz
    def toDot(self, k, filter, svo=False, show=True, fname='textgraph.gv'):
        dot = Digraph()
        g = self.graph()
        best = self.bestNodes(k, filter)
        for f, t in g.edges():
            if f in best and t in best:
                dot.edge(str(f), str(t))
        if svo:
            svos = set()
            for (s, v, o, _) in self.bestSVOs(k):
                svos.add((s, v, o))
            for e in svos:
                s, v, o = e
                dot.edge(s, o, label=v)
        showGraph(dot, show=show, file_name=fname)

    # visualize filtered set of edges as graphviz dot graph
    def svoToDot(self, k):
        dot = Digraph()
        for e in self.bestSVOs(3 * k):
            s, v, o = e
            dot.edge(s, o, label=v)
        showGraph(dot)

    # specialize dot graph to words
    def wordsToDot(self, k):
        self.toDot(k, isWord)

    # specialize dot graph to sentences
    def sentsToDot(self, k):
        self.toDot(k, isSent)

    # visualize mixed sentence - word graph
    def allToDot(self, k):
        self.toDot(k, lambda x: True)
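For reference, a minimal usage sketch of the GraphMaker class above; the file name and the localhost:9000 server are assumptions for illustration, not part of the original snippet:

gm = GraphMaker(parserURL='http://localhost:9000')
gm.load('sample.txt')                 # read and dependency-parse the document

for i, sent in gm.bestSentences(3):   # top-ranked summary sentences, in order
    print(i, ' '.join(sent))

print(list(gm.bestWords(6)))          # top-ranked keywords / keyphrases
print(list(gm.bestSVOs(6)))           # best (subject, verb, object, sentence_id) tuples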
コード例 #28
0
# imports assumed by this snippet (not shown in the original excerpt)
import os
import glob
import json
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.wsd import lesk
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser


def extractFeatures():
    stop_words = stopwords.words('english') + list(string.punctuation)
    file_loc='wikiTest/'
    os.chdir('/Users/ranjithreddykommidi/NLP/Project/wikiTest')
    file_names = glob.glob('*.txt')
    
    # Read every Wikipedia article given in the input file list
    for file in file_names:
        readfile = open(file, 'r')
        text = readfile.read()
        corpus = {}
        sent_text = nltk.sent_tokenize(text)
        dep_parser = CoreNLPDependencyParser(url='http://localhost:9010')
        ner_tagger = CoreNLPParser(url='http://localhost:9010', tagtype='ner')
        count = 0
        for sentence in sent_text:
            tokenized_text = [i for i in nltk.word_tokenize(sentence.lower()) if i not in stop_words]  
            lemma = [WordNetLemmatizer().lemmatize(word) for word in tokenized_text]
            stemmed = [PorterStemmer().stem(word) for word in tokenized_text]
            tagged = nltk.pos_tag(tokenized_text)
            parse, = dep_parser.raw_parse(sentence)
            dependency_parse = list(parse.triples())
            tokenized_text_ner = nltk.word_tokenize(sentence) 
            try:
                ner_tag = ner_tagger.tag(tokenized_text_ner)
            except:
                ner_tag = ner_tagger.tag(tokenized_text)
            
            Synonym = []
            Hypernym = []
            Hyponym = []
            Meronym = []
            Holonym = []
            Heads = []
        
            for t in tokenized_text:
                Nyms = lesk(sentence, t)
                if Nyms is not None:
                    this_synonym = t
                    if Nyms.lemmas()[0].name() != t:
                        this_synonym = Nyms.lemmas()[0].name()
                    Synonym.append(this_synonym)
                    if Nyms.hypernyms() != []:
                        Hypernym.append(Nyms.hypernyms()[0].lemmas()[0].name())
                    if Nyms.hyponyms() != []:
                        Hyponym.append(Nyms.hyponyms()[0].lemmas()[0].name())
                    if Nyms.part_meronyms() != []:
                        Meronym.append(Nyms.part_meronyms()[0].lemmas()[0].name())
                    if Nyms.part_holonyms() != []:
                        Holonym.append(Nyms.part_holonyms()[0].lemmas()[0].name())
                else:
                    Synonym.append(t)
        
            striped_sentence = sentence.strip(" '\"")
            if striped_sentence != "":
                dependency_parser = dep_parser.raw_parse(striped_sentence)
                parsetree = list(dependency_parser)[0]
                head_word = ""
                head_word = [k["word"]
                         for k in parsetree.nodes.values() if k["head"] == 0][0]
                if head_word != "":
                    Heads.append([head_word])
                else:
                    for i, pp in enumerate(tagged):
                        if pp[1].startswith("VB"):  # pp is a (word, POS-tag) tuple
                            Heads.append([tokenized_text[i]])
                            break
                    if head_word == "":
                        for i, pp in enumerate(tagged):
                            if pp[1].startswith("NN"):
                                Heads.append([tokenized_text[i]])
                                break
            else:
                Heads.append([""])

            count = count + 1
            corpus[count] = {}
            corpus[count]["sentence"] = {}
            corpus[count]["sentence"] = sentence
            corpus[count]["tokenized_text"] = {}
            corpus[count]["tokenized_text"] = tokenized_text
            corpus[count]["lemma"] = {}
            corpus[count]["lemma"] = lemma
            corpus[count]["stem"] = {}
            corpus[count]["stem"] = stemmed
            corpus[count]["tag"] = {}   
            corpus[count]["tag"] = tagged
            corpus[count]["dependency_parse"] = {}
            corpus[count]["dependency_parse"] = dependency_parse
            corpus[count]["synonyms"] = {}
            corpus[count]["synonyms"] = Synonym
            corpus[count]["hypernyms"] = {}
            corpus[count]["hypernyms"] = Hypernym
            corpus[count]["hyponyms"] = {}
            corpus[count]["hyponyms"] = Hyponym
            corpus[count]["meronyms"] = {}
            corpus[count]["meronyms"] = Meronym
            corpus[count]["holonyms"] = {}
            corpus[count]["holonyms"] = Holonym
            corpus[count]["ner_tag"] = {}
            corpus[count]["ner_tag"] = str(dict(ner_tag))
            corpus[count]["head_word"] = {}
            corpus[count]["head_word"] = Heads[0]
            corpus[count]["file_name"] = {}
            corpus[count]["file_name"] = file[len(file_loc):]

        outputName = file[len(file_loc):]  # strip the directory prefix, matching the file_name convention above
        json_object = json.dumps(corpus, indent = 4) 
        with open(outputName, "w") as f:
            f.write(json_object)
コード例 #29
0
ファイル: deepRank.py プロジェクト: damons/DeepRank
 def __init__(self, parserURL='http://localhost:9000'):
     self.dparser = CoreNLPDependencyParser(url=parserURL)
     self.clear()
コード例 #30
0
ファイル: single-question.py プロジェクト: gsk12/LMATFY
# imports assumed by this snippet (not shown in the original excerpt)
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.wsd import lesk
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser


def question_pipeline(question):

    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()
    # stanford corenlp is expected to run at localhost:9000
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    corpus_dict = {}
    count = 0
    sent_text = question
    tokenized_text = nltk.word_tokenize(sent_text)
    question_types = ['who', 'when', 'where', 'Who', 'When', 'Where']
    type_of_question = [i for i in question_types if i in tokenized_text]
    lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
    stemmed = [porter.stem(word)
               for word in tokenized_text]  # Stemming the words
    # POS tagging the words to extract POS features
    tagged = nltk.pos_tag(tokenized_text)
    parse, = dep_parser.raw_parse(question)
    # Dependency parsing to extract parse-tree-based patterns as features
    dependency_parse = list(parse.triples())
    # LESK to extract best sense of a word
    best_sense = [lesk(question, word) for word in tokenized_text]
    # tokenized_text_ner = nltk.word_tokenize(sent_text) #Tokenizing sentences into words
    ner_tag = ner_tagger.tag(tokenized_text)
    head_list = []
    striped_sentence = sent_text.strip(" '\"")
    if striped_sentence != "":
        dependency_parser = dep_parser.raw_parse(striped_sentence)
        parsetree = list(dependency_parser)[0]
        head_word = ""
        head_word = [
            k["word"] for k in parsetree.nodes.values() if k["head"] == 0
        ][0]
        if head_word != "":
            head_list.append([head_word])
        else:
            for i, pp in enumerate(tagged):
                if pp[1].startswith("VB"):  # pp is a (word, POS-tag) tuple
                    head_list.append([tokenized_text[i]])
                    break
            if head_word == "":
                for i, pp in enumerate(tagged):
                    if pp[1].startswith("NN"):
                        head_list.append([tokenized_text[i]])
                        break
    else:
        head_list.append([""])
    synonym_list = []
    hypernym_list = []
    hyponym_list = []
    meronym_list = []
    holonym_list = []
    for t in tokenized_text:
        best_sense = lesk(sent_text, t)  # LESK to extract best sense of a word
        if best_sense is not None:
            this_synonym = t
            if best_sense.lemmas()[0].name() != t:
                this_synonym = best_sense.lemmas()[0].name()
            synonym_list.append(this_synonym)
            if best_sense.hypernyms() != []:
                hypernym_list.append(
                    best_sense.hypernyms()[0].lemmas()[0].name())
            if best_sense.hyponyms() != []:
                hyponym_list.append(
                    best_sense.hyponyms()[0].lemmas()[0].name())
            if best_sense.part_meronyms() != []:
                meronym_list.append(
                    best_sense.part_meronyms()[0].lemmas()[0].name())
            if best_sense.part_holonyms() != []:
                holonym_list.append(
                    best_sense.part_holonyms()[0].lemmas()[0].name())
        else:
            synonym_list.append(t)

    count = count + 1
    corpus_dict[count] = {}
    corpus_dict[count]["sentence"] = {}
    corpus_dict[count]["sentence"] = sent_text
    corpus_dict[count]["type_of_question"] = {}
    corpus_dict[count]["type_of_question"] = type_of_question
    corpus_dict[count]["tokenized_text"] = {}
    corpus_dict[count]["tokenized_text"] = tokenized_text
    corpus_dict[count]["lemma"] = {}
    corpus_dict[count]["lemma"] = lemma
    corpus_dict[count]["stemmed"] = {}
    corpus_dict[count]["stemmed"] = stemmed
    corpus_dict[count]["tagged"] = {}
    corpus_dict[count]["tagged"] = tagged
    corpus_dict[count]["dependency_parse"] = {}
    corpus_dict[count]["dependency_parse"] = dependency_parse
    corpus_dict[count]["synonyms"] = {}
    corpus_dict[count]["synonyms"] = synonym_list
    corpus_dict[count]["hypernyms"] = {}
    corpus_dict[count]["hypernyms"] = hypernym_list
    corpus_dict[count]["hyponyms"] = {}
    corpus_dict[count]["hyponyms"] = hyponym_list
    corpus_dict[count]["meronyms"] = {}
    corpus_dict[count]["meronyms"] = meronym_list
    corpus_dict[count]["holonyms"] = {}
    corpus_dict[count]["holonyms"] = holonym_list
    corpus_dict[count]["ner_tag"] = {}
    corpus_dict[count]["ner_tag"] = dict(ner_tag)
    corpus_dict[count]["head_word"] = {}
    corpus_dict[count]["head_word"] = head_list[0]
    return corpus_dict
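A minimal invocation sketch for question_pipeline above (the question string is illustrative; a CoreNLP server on localhost:9000 is assumed to be running):

features = question_pipeline("Who wrote the novel Moby-Dick ?")
q = features[1]                      # the function stores its single entry under key 1
print(q["type_of_question"])         # ['Who']
print(q["head_word"])
print(q["ner_tag"])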
コード例 #31
0
ファイル: main.py プロジェクト: soverysour/MIRPR-2019-2020
    return weightedTags


parser = nltk.parse.corenlp.CoreNLPParser()
rawInput = sys.argv[1]

sentence = next(parser.raw_parse(rawInput))[0]
rules = propToRule(sentence)

lemmatizer = WordNetLemmatizer()
tags = list(
    set(
        map(lambda node: lemmatizeSafely(node[1], lemmatizer),
            flattenTree(sentence))))

dependency_parser = CoreNLPDependencyParser()

result = dependency_parser.raw_parse(rawInput)

depGraph = next(result)

depGraphResult = traverseDepGraph(depGraph.nodes, lemmatizer)

print(
    json.dumps({
        'modality': flatten(rules, []),
        'depGraph': depGraphResult,
        'flatTags': tags
    }))