Example no. 1
from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.parse.dependencygraph import DependencyGraph


class NLTK_NLP():

    def __init__(self, ip_port):
        self.dep_parser = CoreNLPDependencyParser(url=ip_port)
        self.ner_parser = CoreNLPParser(url=ip_port, tagtype='ner')
        self.parser = CoreNLPParser(url=ip_port)
        self.pos_tagger = CoreNLPParser(url=ip_port, tagtype='pos')

    def generate_dependency_tree(self, sentence):
        '''what is the name of the asteroid ?'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return dependency_tree

    def generate_dependency_graph(self, sentence):
        '''12 {'address': 12, 'word': '.', 'lemma': '.', 'ctag': '.', 'tag': '.', 'feats': '', 'head': 1, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'punct'}
        7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, rel``.'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return DependencyGraph(dependency_tree.to_conll(10))

    def generate_constituency_tree(self, sentence):
        '''input: one question'''
        tree_list = list(self.parser.raw_parse(sentence=sentence))
        return tree_list[0]

    def get_pos(self, sentence):
        '''What is the airspeed of an unladen swallow ?
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        '''
        pos_list = list(self.pos_tagger.tag(sentence.split()))
        # tokens = nltk.word_tokenize(sentence)
        # wordpos = nltk.pos_tag(tokens)
        return pos_list

    def get_pos_by_tokens(self, tokens):
        '''What is the airspeed of an unladen swallow ?'''
        pos_list = list(self.pos_tagger.tag(tokens))
        return pos_list

    def get_ner(self, sentence):
        # tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        '''april the 26th, 1882 is the birth date of which athletes ?
        [('april', 'DATE'), ('the', 'DATE'), ('26th', 'DATE'), (',', 'DATE'), ('1882', 'DATE'),
        ('is', 'O'), ('the', 'O'), ('birth', 'O'), ('date', 'O'), ('of', 'O'), ('which', 'O'),
        ('athletes', 'O'), ('?', 'O')]'''
        sequence_ner_tuple_list = self.ner_parser.tag(sentence.split())
        sequence_ner_list = []
        for i, (word, ner_tag) in enumerate(sequence_ner_tuple_list):
            sequence_ner_list.append(ner_tag)
        return sequence_ner_list

    def get_tokenizer(self, sentence):
        return list(self.parser.tokenize(sentence))

    def find_phrases(self, tree, phrase_tag='NP'):
        return [subtree.leaves() for subtree in tree.subtrees(lambda t: t.label()==phrase_tag)]
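A minimal usage sketch for the wrapper above, assuming a Stanford CoreNLP server is already listening on localhost:9000 (for example started with the java command shown in Example no. 3); the question string is only illustrative:

nlp = NLTK_NLP('http://localhost:9000')
question = 'what is the name of the asteroid ?'
dep_graph = nlp.generate_dependency_graph(question)   # nltk DependencyGraph built from the CoNLL output
pos_tags = nlp.get_pos(question)                       # [(word, POS tag), ...]
ner_tags = nlp.get_ner(question)                       # one NER label per whitespace token
noun_phrases = nlp.find_phrases(nlp.generate_constituency_tree(question), phrase_tag='NP')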
Example no. 2
def parse(sentence):
    parser = CoreNLPDependencyParser(url='http://localhost:9000')
    parse = parser.raw_parse(sentence)
    parse_tree = list(parse)[0]

    triple = []
    parse_values = []
    for k in parse_tree.nodes.values():
        if k is not None:
            parse_values.append(k)
        else:
            print("NONE happened", sentence)
    parse_values.sort(key=lambda x: x["address"])
    parse_values = parse_values[1:]
    words = [x["word"] for x in parse_values]

    for k in parse_tree.nodes.values():
        try:
            if k["address"] == 0:
                continue
            elif k["head"] == 0:
                triple.append((("ROOT", k["head"] - 1), (words[k["address"] - 1], k["address"] - 1), k["rel"]))
            else:
                triple.append(
                    ((words[k["head"] - 1], k["head"] - 1), (words[k["address"] - 1], k["address"] - 1), k["rel"]))
        except IndexError:
            print(words)
    return triple, words
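A hedged sketch of how the helper above might be called and the shape of what it returns (indices are 0-based because the function subtracts 1 from CoreNLP's 1-based addresses); the sentence is only an illustration and a CoreNLP server is assumed on localhost:9000:

triples, words = parse("The cat sat on the mat .")
# words   -> the tokens of the sentence in order
# triples -> one entry per token:
#            ((head_word, head_index), (dependent_word, dependent_index), relation),
#            with the artificial ("ROOT", -1) pair as the head of the sentence root
for head, dependent, relation in triples:
    print(head, relation, dependent)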
Example no. 3
def parseReview():
	# Start server by running the following command under the parser directory:
	# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
	dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
	dependency_relation = ["nsubj", "dobj", "agent", "advmod", "amod", 
							"neg", "prep_of", "acomp", "xcomp"]

	train_data, test_data = readData("")
	review_dependency_feature = dict()
	prev_percent = 0
	all_data = [train_data, test_data]
	names = ["train_data", "test_data"]

	for j in range(len(all_data)):
		dat = all_data[j]
		review_dependency_feature = dict()  # reset per dataset so train and test features are written separately
		for i in range(len(dat.data)):
			review = tokenize.sent_tokenize(dat.data[i].decode("utf-8"))
			review_feature = list()
			for line in review:
				parse, = dep_parser.raw_parse(line)
				for governor, dep, dependent in parse.triples():
					if dep in dependency_relation:
						review_feature.append((governor[0],dependent[0],dep))
			review_dependency_feature[i] = review_feature
			percent = int(i/len(dat.data)*100)

			if percent == prev_percent + 1:
				prev_percent += 1
				print (percent,"% processed")


		with open(names[j]+'.json', 'w') as outfile:  
			json.dump(review_dependency_feature, outfile)
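A small sketch of how the dumped feature files could be read back; the file names follow the names list above, the integer review indices become strings after the JSON round trip, and each triple is (governor word, dependent word, relation) as built in the loop:

import json

with open('train_data.json') as infile:
    features = json.load(infile)
for review_idx, triples in features.items():
    for governor_word, dependent_word, relation in triples:
        print(review_idx, relation, governor_word, dependent_word)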
Example no. 4
def context_to_tree_goodreads(ith_data):
    start_time = time.time()

    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

    context = ith_data['review_sentences']
    graph = [[0, []] for _ in range(len(context))]
    # tree = [[] for _ in range(len(context))]
    # triple = [[] for _ in range(len(context))]

    for i, ith in enumerate(context):  # ith context of the input movie (divided into multiple sentences)
        if ith[0] == 1:
            graph[i][0] = 1

            ## Tokenizing PLAN
        if ith[1] != '':
            parsed = dep_parser.raw_parse(ith[1])
            for parse in parsed:
                graph[i][1].append(parse.to_dot())

            graph[i][1] = graph[i][1][0].split('\n')

        else:
            graph[i][1] = ith[1]
        # print("{0}th Movie Processing => ".format(step+1) + 'i & j: {0}/{2}, {1}/{3}'.format(i+1, j+1, len(context), len(context[i])))

    # ith_data['graph'] = graph
    result = extract_info_buru(graph)
    return result
Example no. 5
class CNLP:
    CNLPServerURL = 'http://localhost:9000'

    def __init__(self):
        self.parser = CoreNLPParser(url=self.CNLPServerURL)
        self.dep_parser = CoreNLPDependencyParser(url=self.CNLPServerURL)
        self.ner_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='ner')
        self.pos_tagger = CoreNLPParser(url=self.CNLPServerURL, tagtype='pos')

    def getParse(self, sentence):
        if (type(sentence) == list):
            return self.parser.parse(sentence)
        else:
            return self.parser.raw_parse(sentence)

    def getDepParse(self, sentence):
        if (type(sentence) == list):
            return self.dep_parser.parse(sentence)
        else:
            return self.dep_parser.raw_parse(sentence)

    def getNERTags(self, sentence):
        if (type(sentence) != list):
            sentence = sentence.split()
        return self.ner_tagger.tag(sentence)

    def getPOSTags(self, sentence):
        if (type(sentence) != list):
            sentence = sentence.split()
        # tag() returns (word, POS) pairs; parse()/raw_parse() would return parse trees instead
        return self.pos_tagger.tag(sentence)
Example no. 6
def parse_sentence_without_compare(sentence):
    """
    :param sentence: str
    :return: triples, words
    """
    parser = CoreNLPDependencyParser(url='http://localhost:9000')

    parse = parser.raw_parse(sentence, properties={
        'tokenize.options': 'ptb3Escaping=false, normalizeFractions=false'})

    parse_sents = list(parse)
    assert len(parse_sents) == 1, "More than 1 sentence extracted"
    parse_graph = list(parse_sents[0].nodes.values())

    parse_graph.sort(key=lambda x: x["address"])
    parse_graph = parse_graph[1:]
    words = [x["word"] for x in parse_graph]

    triples = []
    for k in parse_graph:
        if k["head"] is None:
            continue
        elif k["head"] == 0:
            triples.append((("ROOT", k["head"] - 1), (words[k["address"] - 1], k["address"] - 1), k["rel"]))
        else:
            triples.append(
                ((words[k["head"] - 1], k["head"] - 1), (words[k["address"] - 1], k["address"] - 1),
                 k["rel"]))

    return triples, words
Example no. 7
def get_parses(inputs):
	dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
	sentences = sent_tokenize(inputs)
	elem = {}
	parses = []
	for text in sentences:
		token_words = word_tokenize(text)
		parse, = dep_parser.raw_parse(text)
		parse1 = []
		for governor, dep, dependent in parse.triples():
			lst = []
			# print governor, dep, dependent
			try:
				lst.append(dep)
				lst.append("(")
				lst.append(governor[0])
				lst.append("-")
				lst.append(str(token_words.index(governor[0])+1))
				lst.append(", ")
				lst.append(dependent[0])
				lst.append("-")
				lst.append(str(token_words.index(dependent[0])+1))
				lst.append(")")
				parse1.append("".join(lst))
			except:
				continue
		parses.append(parse1)


	elem = {"text": inputs,
			"sentences": sentences,
			"parses": parses }

	return(elem)
Example no. 8
 def processQueryToExtractHeadWord(self, query):
     dependency_parser = CoreNLPDependencyParser('http://localhost:9000')
     headWord = None
     parsedSentence = list(dependency_parser.raw_parse(query))[0]
     rootValue = list(
         list(parsedSentence.nodes.values())[0]['deps']['ROOT'])[0]
     for n in parsedSentence.nodes.values():
         if n['address'] == rootValue:
             headWord = n['word']
             break
     return headWord
Example no. 9
def analyze(stext):
    parser = CoreNLPDependencyParser(url="http://localhost:9000")

    if '\r\n' in stext:
        stext = stext.replace('\r\n', '  ')
    iterator = parser.raw_parse(stext)
    parse = next(iterator)

    parse = add_offset_to_tree(parse, stext)

    return parse
Example no. 10
 def findHeadWord(self, indexSentenceMap):
     print("Head Word Extraction...")
     indexHeadMap = collections.OrderedDict()
     dependency_parser = CoreNLPDependencyParser('http://localhost:9000')
     for k, v in indexSentenceMap.items():
         parsedSentence = list(dependency_parser.raw_parse(v))[0]
         rootValue = list(
             list(parsedSentence.nodes.values())[0]['deps']['ROOT'])[0]
         for n in parsedSentence.nodes.values():
             if n['address'] == rootValue:
                 indexHeadMap[k] = n['word']
                 break
     return indexHeadMap
Example no. 11
def get_dependency_analysis_of_sentence(sentence):
    #dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    dep_parser = CoreNLPDependencyParser(url='http://0.0.0.0:9000')
    parse = DependencyGraph()
    try:
        parse, = dep_parser.raw_parse(sentence)
    except:
        print('Some error happened when running the dependency parser.....')
    #print('parse......',parse)
    # print('parser.to_conll......', parse.to_conll(4))
    # print('parse.tree......', parse.tree())
    # for governor, dep, dependent in parse.triples():
    #     print(governor, dep, dependent)
    return parse
Example no. 12
def method(dat):
    result = []
    k = re.sub(r'[^\w\s]', '', dat)  # remove punctuation
    k2 = re.sub(r"\d+", "", k)  # remove digits (note: k2 is unused below; the parse uses k)
    parser = CoreNLPDependencyParser(url='http://localhost:9000')

    # Triples from CoreNLPDependencyParser
    sent = k.lower()
    if sent == 'tunisia':
        return 'tunisia'
    else:
        parse = parser.raw_parse(sent)
        tree = next(parse)
        sentparse = [t for t in tree.triples()]
        return sentparse
Example no. 13
class TreeParser:
    def __init__(self):
        self.parser = None
        self.server = None
        self.dependency_parser = None

    def setup(self):
        url = settings.CORENLP_URL

        if url is None:
            server = CoreNLPServer(
               settings.CORENLP_PATH,
               settings.CORENLP_MODEL_PATH,
            )
            server.start()

            self.server = server
            url = server.url

        else:
            print("[TreeParser] Using existing CoreNLP Server...")

        self.parser = CoreNLPParser(url=url)

        # maybe separated with another class...
        self.dependency_parser = CoreNLPDependencyParser(url=url)

        return self.parser

    def parse(self, sentence):
        if not self.parser:
            raise AttributeError('parser is not set up')

        return self.parser.raw_parse(sentence)

    def free(self):
        if not self.server:
            return

        self.server.stop()

    def dependency_parse(self, sentence):
        if not self.dependency_parser:
            raise AttributeError('dependency parser is not set up')

        return self.dependency_parser.raw_parse(sentence)
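A possible lifecycle for the class above, assuming a settings module that either points CORENLP_URL at an already running server or supplies the jar paths that CoreNLPServer needs; the sentence is only an example:

tree_parser = TreeParser()
tree_parser.setup()                 # connects to an existing server or starts a local one
constituency_tree = next(tree_parser.parse("Colorless green ideas sleep furiously."))
dependency_graph = next(tree_parser.dependency_parse("Colorless green ideas sleep furiously."))
tree_parser.free()                  # stops the server only if this class started it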
Example no. 14
class CoreNLP:
    def __init__(self):
        self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
        self.sentence_tokenizer = PunktSentenceTokenizer()

    @staticmethod
    def corenlp_server():
        return getenv('CORENLP_SERVER')

    def dep_parse(self, text: str, conll_version=10) -> str:
        """Get a CoreNLP depparse,lemma"""
        def get_conll(t):
            deps, = self.parser.raw_parse(t)
            return deps.to_conll(conll_version)  # xrenner requires conll10

        sentences = self.sentence_tokenizer.sentences_from_text(text)
        return '\n'.join(map(get_conll, sentences))
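A brief sketch of driving this class, assuming the CORENLP_SERVER environment variable holds the URL of a running server; the text is only an example:

import os

os.environ['CORENLP_SERVER'] = 'http://localhost:9000'   # assumed server location

corenlp = CoreNLP()
conll = corenlp.dep_parse("Elephants are big. Monkeys are small.")
print(conll)   # one CoNLL-10 block per sentence, joined with newlines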
Example no. 15
 def processQueryToExtractImprovisedHeadWord(self, query):
     dependency_parser = CoreNLPDependencyParser('http://localhost:9000')
     headWord = None
     parsedSentence = list(dependency_parser.raw_parse(query))[0]
     rootValue = list(
         list(parsedSentence.nodes.values())[0]['deps']['ROOT'])[0]
     for n in parsedSentence.nodes.values():
         if n['address'] == rootValue:
             headWord = n['word']
             if len(headWord):
                 _, tag = pos_tag([headWord])[0]
                 wnTag = IndexCreation().getWordnetTag(tag)
                 if wnTag is not None:
                     synset = wn.synsets(headWord, pos=wnTag)
                     if len(synset) > 0:
                         headWord = synset[0].name().split('.')[0]
             break
     return headWord
Example no. 16
def get_single(summary):
    if summary.startswith('.'):
        summary = summary[1:]
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    try:
        parse, = dep_parser.raw_parse(summary)
        nouns = set()
        for x in range(1, len(parse.nodes.items())):
            wdict = parse.nodes[x]
            if "NN" in wdict["tag"]:
                nouns.add(wdict["word"])
        return nouns
    except JSONDecodeError:
        print("Decode Error at " + summary)
        return None
    except StopIteration:
        print("Stopped at " + summary)
        return None
    except HTTPError:
        print("HTTPError " + summary)
        return None
Example no. 17
def analyze(sentence_text):

    # Core NLP is used. It sometimes throws StopIteration exception, in which case the analysis continues with the next
    # sentence
    parser = CoreNLPDependencyParser(url="http://localhost:9000")
    dependency_graph, = parser.raw_parse(sentence_text)

    # For every word, the offset is added to the corresponding node
    address = 0
    offset = 0
    while dependency_graph.contains_address(address):
        node = dependency_graph.get_by_address(address)
        word = node["word"]
        if isinstance(word, str):
            offset = sentence_text.find(word, offset)
            node["start"] = offset
            node["end"] = offset + len(word) - 1
            offset += len(word)
        address += 1

    return dependency_graph
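A short sketch of reading the character offsets that analyze() attaches to each node; address 0 is the artificial root and has no word, the sentence is only illustrative, and a CoreNLP server is assumed on localhost:9000:

graph = analyze("Aspirin increases the effect of warfarin.")
for address in sorted(graph.nodes):
    node = graph.nodes[address]
    if isinstance(node.get("word"), str):
        print(node["word"], node["start"], node["end"], node["rel"])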
Example no. 18
 def findImprovisedHeadWord(self, indexSentenceMap):
     print("Improvised Head Word Extraction...")
     indexHeadMap = collections.OrderedDict()
     dependency_parser = CoreNLPDependencyParser('http://localhost:9000')
     for k, v in indexSentenceMap.items():
         parsedSentence = list(dependency_parser.raw_parse(v))[0]
         rootValue = list(
             list(parsedSentence.nodes.values())[0]['deps']['ROOT'])[0]
         for n in parsedSentence.nodes.values():
             if n['address'] == rootValue:
                 headWord = n['word']
                 if len(headWord) > 0:
                     _, tag = pos_tag([headWord])[0]
                     wnTag = IndexCreation().getWordnetTag(tag)
                     if wnTag is not None:
                         synset = wn.synsets(headWord, pos=wnTag)
                         if len(synset) > 0:
                             headWord = synset[0].name().split('.')[0]
                 indexHeadMap[k] = headWord
                 break
     return indexHeadMap
Example no. 19
def get_single(summary):
    if summary.startswith('.'):
        summary = summary[1:]
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    print("Finished %s" % summary)
    try:
        parse, = dep_parser.raw_parse(summary)
        nouns = set()
        for x in range(1, len(parse.nodes.items())):
            wdict = parse.nodes[x]
            if "NN" in wdict["tag"]:
                nouns.add(wdict["word"])
        return nouns
    except JSONDecodeError:
        print("Decode Error at " + summary)
        return None
    except StopIteration:
        print("Stopped at " + summary)
        return None
    except HTTPError:
        print("HTTPError " + summary)
        return None
Example no. 20
from datetime import datetime
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.dependencygraph import DependencyGraph

parser = CoreNLPDependencyParser(url='http://localhost:9000')

# filename = "text6"
# f = open("../Fragments_for_testing/"+filename, "r")
# sentences = f.readlines()
# for sentence in sentences:
sentence = "Elephants are big. Monkeys are small"
parse, = parser.raw_parse(sentence)
conll = parse.to_conll(4)
dp = DependencyGraph(conll)
dotted = dp.to_dot()
G = dp.nx_graph()
with open('test_' + str(datetime.now()) + '.svg', 'w') as f:
    svg = dp._repr_svg_()
    f.write(svg)
Example no. 21
    def dependency_parse_tree(self, s):
        parser = CoreNLPDependencyParser()

        parse = next(parser.raw_parse(s))

        return parse
Example no. 22
def question_pipeline(question):

    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()
    # stanford corenlp is expected to run at localhost:9000
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    corpus_dict = {}
    count = 0
    sent_text = question
    tokenized_text = nltk.word_tokenize(sent_text)
    question_types = ['who', 'when', 'where', 'Who', 'When', 'Where']
    type_of_question = [i for i in question_types if i in tokenized_text]
    lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
    stemmed = [porter.stem(word)
               for word in tokenized_text]  # Stemming the words
    # POS tagging the words to extract POS features
    tagged = nltk.pos_tag(tokenized_text)
    parse, = dep_parser.raw_parse(question)
    # Dependency parsing to parse tree based patters as features
    dependency_parse = list(parse.triples())
    # LESK to extract best sense of a word
    best_sense = [lesk(question, word) for word in tokenized_text]
    # tokenized_text_ner = nltk.word_tokenize(sent_text) #Tokenizing sentences into words
    ner_tag = ner_tagger.tag(tokenized_text)
    head_list = []
    striped_sentence = sent_text.strip(" '\"")
    if striped_sentence != "":
        dependency_parser = dep_parser.raw_parse(striped_sentence)
        parsetree = list(dependency_parser)[0]
        head_word = ""
        head_word = [
            k["word"] for k in parsetree.nodes.values() if k["head"] == 0
        ][0]
        if head_word != "":
            head_list.append([head_word])
        else:
            for i, pp in enumerate(tagged):
                if pp.startswith("VB"):
                    head_list.append([tokenized_text[i]])
                    break
            if head_word == "":
                for i, pp in enumerate(tagged):
                    if pp.startswith("NN"):
                        head_list.append([tokenized_text[i]])
                        break
    else:
        head_list.append([""])
    synonym_list = []
    hypernym_list = []
    hyponym_list = []
    meronym_list = []
    holonym_list = []
    for t in tokenized_text:
        best_sense = lesk(sent_text, t)  # LESK to extract best sense of a word
        if best_sense is not None:
            this_synonym = t
            if best_sense.lemmas()[0].name() != t:
                this_synonym = best_sense.lemmas()[0].name()
            synonym_list.append(this_synonym)
            if best_sense.hypernyms() != []:
                hypernym_list.append(
                    best_sense.hypernyms()[0].lemmas()[0].name())
            if best_sense.hyponyms() != []:
                hyponym_list.append(
                    best_sense.hyponyms()[0].lemmas()[0].name())
            if best_sense.part_meronyms() != []:
                meronym_list.append(
                    best_sense.part_meronyms()[0].lemmas()[0].name())
            if best_sense.part_holonyms() != []:
                holonym_list.append(
                    best_sense.part_holonyms()[0].lemmas()[0].name())
        else:
            synonym_list.append(t)

    count = count + 1
    corpus_dict[count] = {
        "sentence": sent_text,
        "type_of_question": type_of_question,
        "tokenized_text": tokenized_text,
        "lemma": lemma,
        "stemmed": stemmed,
        "tagged": tagged,
        "dependency_parse": dependency_parse,
        "synonyms": synonym_list,
        "hypernyms": hypernym_list,
        "hyponyms": hyponym_list,
        "meronyms": meronym_list,
        "holonyms": holonym_list,
        "ner_tag": dict(ner_tag),
        "head_word": head_list[0],
    }
    return corpus_dict
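A quick sketch of calling the pipeline above; a CoreNLP server is assumed on localhost:9000 and the question is only an example:

features = question_pipeline("Who painted the Mona Lisa ?")
entry = features[1]                      # the function writes a single entry under key 1
print(entry["type_of_question"])         # e.g. ['Who']
print(entry["head_word"])                # head word taken from the dependency parse
print(entry["dependency_parse"][:3])     # first few (governor, relation, dependent) triples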
Example no. 23
def extractFeatures():
    stop_words = stopwords.words('english') + list(string.punctuation)
    file_loc='wikiTest/'
    os.chdir('/Users/ranjithreddykommidi/NLP/Project/wikiTest')
    file_names = glob.glob('*.txt')
    
    #Read every wikipedia articles given in the input fileList
    for file in file_names:
        readfile = open(file, 'r')
        text = readfile.read()
        corpus = {}
        sent_text = nltk.sent_tokenize(text)
        dep_parser = CoreNLPDependencyParser(url='http://localhost:9010')
        ner_tagger = CoreNLPParser(url='http://localhost:9010', tagtype='ner')
        count = 0
        for sentence in sent_text:
            tokenized_text = [i for i in nltk.word_tokenize(sentence.lower()) if i not in stop_words]  
            lemma = [WordNetLemmatizer().lemmatize(word) for word in tokenized_text]
            stemmed = [PorterStemmer().stem(word) for word in tokenized_text]
            tagged = nltk.pos_tag(tokenized_text)
            parse, = dep_parser.raw_parse(sentence)
            dependency_parse = list(parse.triples())
            tokenized_text_ner = nltk.word_tokenize(sentence) 
            try:
                ner_tag = ner_tagger.tag(tokenized_text_ner)
            except:
                ner_tag = ner_tagger.tag(tokenized_text)
            
            Synonym = []
            Hypernym = []
            Hyponym = []
            Meronym = []
            Holonym = []
            Heads = []
        
            for t in tokenized_text:
                Nyms = lesk(sentence, t)
                if Nyms is not None:
                    this_synonym = t
                    if Nyms.lemmas()[0].name() != t:
                        this_synonym = Nyms.lemmas()[0].name()
                    Synonym.append(this_synonym)
                    if Nyms.hypernyms() != []:
                        Hypernym.append(Nyms.hypernyms()[0].lemmas()[0].name())
                    if Nyms.hyponyms() != []:
                        Hyponym.append(Nyms.hyponyms()[0].lemmas()[0].name())
                    if Nyms.part_meronyms() != []:
                        Meronym.append(Nyms.part_meronyms()[0].lemmas()[0].name())
                    if Nyms.part_holonyms() != []:
                        Holonym.append(Nyms.part_holonyms()[0].lemmas()[0].name())
                else:
                    Synonym.append(t)
        
            striped_sentence = sentence.strip(" '\"")
            if striped_sentence != "":
                dependency_parser = dep_parser.raw_parse(striped_sentence)
                parsetree = list(dependency_parser)[0]
                head_word = ""
                head_word = [k["word"]
                         for k in parsetree.nodes.values() if k["head"] == 0][0]
                if head_word != "":
                    Heads.append([head_word])
                else:
                    for i, pp in enumerate(tagged):
                        if pp.startswith("VB"):
                            Heads.append([tokenized_text[i]])
                            break
                    if head_word == "":
                        for i, pp in enumerate(tagged):
                            if pp.startswith("NN"):
                                Heads.append([tokenized_text[i]])
                                break
            else:
                Heads.append([""])

            count = count + 1
            corpus[count] = {
                "sentence": sentence,
                "tokenized_text": tokenized_text,
                "lemma": lemma,
                "stem": stemmed,
                "tag": tagged,
                "dependency_parse": dependency_parse,
                "synonyms": Synonym,
                "hypernyms": Hypernym,
                "hyponyms": Hyponym,
                "meronyms": Meronym,
                "holonyms": Holonym,
                "ner_tag": str(dict(ner_tag)),
                "head_word": Heads[0],
                "file_name": file[len(file_loc):],
            }

        outputName = file[len(file_loc)]        
        json_object = json.dumps(corpus, indent = 4) 
        with open(outputName, "w") as f:
            f.write(json_object)
Example no. 24
 ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
 corpus_dict = {}
 count = 0
 sent_text = nltk.sent_tokenize(file_text)  # Tokenizing text to sentences
 for sentence in sent_text:
     tokenized_text = [
         i for i in nltk.word_tokenize(sentence.lower())
         if i not in stop_words
     ]  # Tokenizing sentences into words
     # Lemmatizing the words to extract lemmas as features
     lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
     stemmed = [porter.stem(word)
                for word in tokenized_text]  # Stemming the words
     # POS tagging the words to extract POS features
     tagged = nltk.pos_tag(tokenized_text)
     parse, = dep_parser.raw_parse(sentence)
     # Dependency parsing to parse tree based patters as features
     dependency_parse = list(parse.triples())
     # best_sense = [lesk(sentence, word) for word in tokenized_text] #LESK to extract best sense of a word
     tokenized_text_ner = nltk.word_tokenize(
         sentence)  # Tokenizing sentences into words
     try:
         ner_tag = ner_tagger.tag(tokenized_text_ner)
     except:
         ner_tag = ner_tagger.tag(tokenized_text)
     head_list = []
     striped_sentence = sentence.strip(" '\"")
     if striped_sentence != "":
         dependency_parser = dep_parser.raw_parse(striped_sentence)
         parsetree = list(dependency_parser)[0]
         head_word = ""
Example no. 25
    for sent in list_word_sentences:
        sentence = ' '.join(sent)
        sentences.append(sentence.strip())

    sentences = [x.lower() for x in sentences]
    print(str(len(sentences)) + ' sentences')

    print('extracting fillers...')
    for sentence in sentences:
        # PoS tagging
        sentence = sentence.replace('.', '')
        tokens = nltk.word_tokenize(sentence)
        tags = dict(nltk.pos_tag(tokens))

        # syntactic parsing
        result = dependency_parser.raw_parse(sentence)
        dep = next(result)
        graph = DependencyGraph()
        graph.from_dot(dep.to_dot())

        # lemmatize
        lemmatized_graph = lemmatize(graph, tags)
        index = lemmatized_graph.get_index(verb)
        if len(index) <= 0:
            print('error in **' + sentence + '**')
            continue

        # adjacency list
        adjs = lemmatized_graph.get_directional_adj(index[0])
        adjs = list(
            filter(lambda x: x[1] in subj_dept or x[1] in obj_dept, adjs))
Example no. 26
class SVO():
    def __init__(self, sentence):
        config = ApplicationConfig.get_corenlp_config()
        self._parser = CoreNLPParser(url=f"http://{config['host']}:{config['port']}")
        self._dependency = CoreNLPDependencyParser(url=f"http://{config['host']}:{config['port']}")
        sentence = sentence.replace('  ', ' ')
        sentence = sentence.replace('.', '')
        self._load(sentence)
        self.original = sentence

    def get_dependency_tree(self):
        return self._dependency

    def get_parser_tree(self):
        return self.t
                                                   
    def _load(self, sentence):
        self.t = list(self._parser.raw_parse(sentence))[0]
        self.t = ParentedTree.convert(self.t)

    def show(self):
        self.t.pretty_print()
        
    def find_svo(self):
        self._queue = []

        # the sentence must be S or NP before the SVO can be found & find conj
        for i in self.t.subtrees(lambda i: i.label() != 'ROOT'):
#             if i.label() in ['S','NP','SINV','SBAR','FRAG','X','PP']:
            remover = self._find_conj()

            # refresh
            for i in remover:
                self.original = self.original.replace(i, '')
            self._load(self.original) 
            self.pos = self.t.pos()
            self._root = SVONode(('main', self.t), None)
            self._queue.append(self._root)
            break
#             else:
#                 return 'Sentence can not find SVO.'  
                              
        # find SVO   
        while self._queue != []:
            self._data = self._queue.pop(0)
            tmp = list(self._data.data.flatten())
            if ',' in tmp:
                tmp.remove(',')
            if len(tmp) == 1:
                continue
            sentence = ' '.join(self._data.data.flatten())
            self.t = self._data.data

            # find subordinate clauses & coordinating conjunctions & participles
#             self.show()
            if self._data.relation != 'appos':
                self._find_SBAR()
#             self.show()
#             self._remove_comma()
#             self.show()
            self._data.svo = collections.defaultdict(list)

            # Find Subject
            tmp = self._find_subject()
            if isinstance(tmp, list):
                self._data.svo['subject'] = tmp
            else:
                self._data.svo['subject'] = self._add_conj(tmp)

            # Find Predicate
            tmp = self._find_predicate()
            self._data.svo['predicate'] = self._add_conj(tmp)
            
            # Find Object
            tmp = self._find_object(self._data.svo['predicate'])
            self._data.svo['object'] = self._add_conj(tmp)                
            
            self._all = collections.defaultdict(list)
            self._flatten(self._data.svo['predicate'])
            self._data.svo['object'] = self._filter(self._data.svo['object'])
            
            for s in self.t.subtrees():
                if s.label() != 'ROOT':
                    break
                else:
                    for i in self.t.subtrees(lambda i:i.label() != 'ROOT'):
                        if i.label() in ['FRAG']:
                            continue
                        if i.label() in ['S','SINV']:
                            for n in i.subtrees(lambda n: n.label() == 'S' and n != i):
                                flag = True
                                test = n
                                while test.parent():
                                    if test.parent() == i:
                                        flag = False
                                        break
                                    test = test.parent()
                                if flag:
                                    tmp = self._del(' '.join(n.flatten()))
                                    if tmp:
                                        self._refresh(n)
                                        kid = SVONode(('', self.t), self._data)
                                        self._data.child.append(kid)
                                        self._queue.append(kid)
                                break
                        break
                break
                                                   
        # Integrate
        self._result = collections.defaultdict(list)
        self._traversal(self._root)
        
        return self._result                                           
                                                   
    def _filter(self, x):
        for i in x:
            if i[1] != []:
                for j in i[1]:
                    if isinstance(j,dict):
                        for k in ['predicate', 'object']:
                            tmp = self._filter(j[k])
                            if tmp == []:
                                del j[k]
                    else:
                        if j in self._all['predicate']:
                            i[1].remove(j)
            if i[0] in self._all['predicate']:
                x.remove(i)
        return x
                                                   
    def _flatten(self, x):
        for i in x:
            self._all['predicate'].append(i[0])
            if i[1] != []:
                for j in i[1]:
                    if isinstance(j,dict):
                        for k in j.keys():
                            self._flatten(j[k])
                    else:
                        self._all['predicate'].append(j)
    
    def _traversal(self, node):
        if node.svo != None and (node.svo['subject']!=[] or node.svo['predicate']!=[] or node.svo['object']!=[]):
            self._result[node.relation].append({'subject':node.svo['subject'], 'predicate':node.svo['predicate'], 'object':node.svo['object']})
        for i in node.child:
            self._traversal(i)
    
    def _add_conj(self, tmp):
        result = []
        if isinstance(tmp, tuple):
            flag = tmp[0].split(' ')
            if len(flag) <= 5:
                for k in flag:
                    if k in self._dic.keys():
                        # add the conj terms back in
                        for j in self._dic[k]:
                            if j[0] == 'attr':
                                tree = list(self._parser.raw_parse(tmp[0]+' is '+j[1]))[0]
                                tree = ParentedTree.convert(tree)
                                kid = SVONode(('appos', tree), self._data)
                                self._data.child.append(kid)
                                self._queue.append(kid)
                                self._dic[k].remove(j)
#                                 a = tmp[0]
#                                 b = tmp[1]
#                                 result.append((a, b+[j[1]]))
                            else:
                                result.append((j[1], j[2]))

        if isinstance(tmp, tuple) and tmp[0] not in [x[0] for x in result]:
            result.append(tmp)
        result.reverse()
        return result
    
    def _remove_comma(self):
        for i in self.t.subtrees(lambda i:i[0] in [',', ';']):
            if i.left_sibling() and i.left_sibling().label() not in ['NP','S','VP','PP','JJ','SINV','ADJP'] and 'VB' not in i.left_sibling().label():
                if ' '.join(i.left_sibling().flatten()) != ' '.join(self.t.flatten()):
                    self._refresh(i.left_sibling())
                if ' '.join(i.flatten()) != ' '.join(self.t.flatten()):
                    self._refresh(i)
    
    # put the removed clause into a child node
    def _child(self, a, b):
        kid = SVONode((a, b), self._data)
        self._data.child.append(kid)
        self._queue.append(kid)                                               
        self._refresh(b, a)
    
    # can we refresh? (checks whether the removed clause is the whole original sentence)
    def _del(self, tmp_1):
        tmp = ' '.join(self.t.flatten())
        tmp = tmp.replace(tmp_1, '')   
        tmp = tmp.strip(',; ') 
        if tmp != '':
            return True
        else:
            return False                                       
                                                   
    def _find_SBAR(self):
        # is there a coordinating conjunction?
        for i in self.t.subtrees(lambda i: i.label() == 'CC'):
            if i.right_sibling() and i.right_sibling().label() in ['S','VP']:
                tmp = self._del(i[0]+' '+' '.join(i.right_sibling().flatten()))
                if tmp and [x for x in self._queue if ' '.join(i.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []:
                    self._child(i[0], i.right_sibling())                               
                                                   
        # is there a subordinate clause?
        for node in self.t.subtrees(lambda node: node.label() == 'SBAR'):
            if 'VB' in node.pos()[0][1]:
                continue
            tmp = self._del(' '.join(node.flatten()))   
            if tmp:
                conj = []
                # the conjunction
                for s in node.subtrees(lambda s: s.label() != 'SBAR'):
                    if s.label() not in ['S','ADVP','RB'] and 'VB' not in s.label():
                        if s.leaves()[0] not in conj:
                            conj.append(s.leaves()[0])
                    elif s.label() in ['ADVP','RB']:
                        continue
                    else:
                        break
                conj = ' '.join(conj)
                for s in node.subtrees(lambda s: s.label() == 'S'):
                    # SBAR nodes can repeat
                    if [x for x in self._queue if ' '.join(s.flatten()) in ' '.join(x.data.flatten())] == []:
                        if node.left_sibling() and node.left_sibling().label() == 'IN' and node.parent().label() != 'S':
                            tmp = self._del(' '.join(node.parent().flatten()))                       
                            if tmp:
                                self._child(conj, s)
                        else:
                            self._child(conj, s)
                    break
                                                  
        # participles
        participle = [x[0] for x in self.t.pos() if x[1] in ['VBG','VBN']]
        for i in participle:
            if i in self.t.leaves():
                candidate = [x for x, y in enumerate(self.t.leaves()) if y == i]
                if candidate[-1] == 0:
                    pos = ''
                else:
                    before = self.t.leaves()[candidate[-1]-1]
                    pos = [x for x in self.t.pos() if x[0] == before][0][1]
                IN = ['when','while','before','after','till','since','because','as','so','although','though','if','unless','upon','once']
                                                   
                if pos == 'IN' and before.lower() in IN:
#                 candidate[-1]-2 >= 0 and 'VB' not in [x for x in self.t.pos() if x[0] == self.t.leaves()[candidate[-1]-2]][0][1]
                    for j in self.t.subtrees(lambda j: j[0] == before):
                        tmp = self._del(' '.join(j.parent().flatten()))                           
                        if tmp and j.parent().label() != 'NP' and j.right_sibling() and [x for x in self._queue if ' '.join(j.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []:
                            self._child(before, j.right_sibling())
                            
                if ('VB' not in pos) and (pos not in ['IN','RB','MD','POS', 'TO']):
                    for j in self.t.subtrees(lambda j: j[0] == i):
                        tmp = self._del(' '.join(j.parent().flatten()))                                                       
                        if tmp and j.parent().label() not in ['NP','ADJP'] and j.right_sibling() and [x for x in self._queue if ' '.join(j.parent().flatten()) in ' '.join(x.data.flatten())] == []:
                            self._child('', j.parent())                       
    
                                                   
    def _refresh(self, node, conj=''):
        sentence = ' '.join(self.t.flatten())
        if conj == '':
            tmp = ' '.join(node.flatten())
        else:
            tmp = conj + ' ' + ' '.join(node.flatten())
        if tmp in sentence:
            idx = sentence.index(tmp)
            if idx-2 >= 0 and sentence[idx-2] == ',':
                tmp = ', ' + tmp
            if idx+len(tmp)+1 < len(sentence) and sentence[idx+len(tmp)+1] == ',':
                tmp = tmp +' ,'
        sentence = sentence.replace(tmp, '')
        self._load(sentence)
    
    def _find_conj(self):
        self._dic = collections.defaultdict(list)
        dep, = self._dependency.raw_parse(self.original)
        remover = []      
        pool_conj = []
        pool_appos = []
        for governor, bridge, dependent in dep.triples():
            # coordinating conjunction
            if bridge == 'conj':
                # NN conj NN
                if 'NN' in governor[1] and 'NN' in dependent[1]:
                    address = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0]['conj']
                    for add in address:
                        if add not in pool_conj:
                            tmp = []
                            r = []
                            pool_conj.append(add)
                            for key, value in dep.get_by_address(add)['deps'].items():
                                if key not in ['conj', 'cc', 'nmod', 'nmod:poss']:
                                    for j in value:
                                        tmp.append(dep.get_by_address(j)['word'])
                                        r.append(dep.get_by_address(j)['word'])
                                if key in ['nmod']:
                                    r.append(dep.get_by_address(add)['word'])
                                    for j in value:
                                        for key1, value1 in dep.get_by_address(j)['deps'].items():
                                            if key1 not in ['conj', 'cc']:
                                                for k in value1:
                                                    r.append(dep.get_by_address(k)['word'])
                                        r.append(dep.get_by_address(j)['word'])
                                if key in ['nmod:poss']:
                                    for j in value:
                                        for key1, value1 in dep.get_by_address(j)['deps'].items():
                                            if key1 not in ['conj', 'cc', 'case']:
                                                for k in value1:
                                                   tmp.append(dep.get_by_address(k)['word'])
                                                   r.append(dep.get_by_address(k)['word'])
                                            if key1 in ['case']:
                                                tmp.append(dep.get_by_address(j)['word'])
                                                r.append(dep.get_by_address(j)['word'])
                                                for k in value1:
                                                   tmp.append(dep.get_by_address(k)['word'])
                                                   r.append(dep.get_by_address(k)['word'])
                                    if dep.get_by_address(j)['word'] not in tmp:
                                        tmp.append(dep.get_by_address(j)['word'])
                                        r.append(dep.get_by_address(j)['word'])    
                            if dep.get_by_address(add)['word'] not in tmp:
                                tmp.append(dep.get_by_address(add)['word'])
                            if dep.get_by_address(add)['word'] not in r:
                                r.append(dep.get_by_address(add)['word'])

                            for i in self.t.subtrees(lambda i: i.leaves() == r):
                                for n in i.subtrees(lambda n: n[0] == dependent[0]):
                                    self._dic[governor[0]].append(('entity', ' '.join(tmp), self._find_attrs(n, ' '.join(tmp))))
                                    remover.append(' '.join(r))
                                    break
                                break
                            if ' '.join(r) not in remover:
                                self._dic[governor[0]].append(('entity', ' '.join(tmp), []))
                                remover.append(' '.join(r))
                            
                    
                # VB conj VB O
                elif 'VB' in governor[1] and 'VB' in dependent[1] and governor[1] == dependent[1]:   
                    gov_key = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0].keys()
                    dep_key = [x['deps'] for x in dep.nodes.values() if x['word']==dependent[0]][0].keys()
                    if [j for j in gov_key if j in ['dobj','xcomp','ccomp', 'nmod', 'nsubjpass']]==[] or [j for j in dep_key if j in ['dobj','xcomp','ccomp', 'nmod', 'nsubjpass', 'nsubj']]==[]:  
                        for i in self.t.subtrees(lambda i: i[0] == dependent[0]):
                            self._dic[governor[0]].append(('entity', dependent[0],  self._find_attrs(i, dependent[0])))
                            remover.append(dependent[0])
                            break
                        
            # apposition (return the whole phrase)
            elif bridge == 'appos':
                tmp = []
                address = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0]['appos']
                for add in address:
                    if add not in pool_appos:
                        tmp = []
                        pool_appos.append(add)    
                        for key, value in dep.get_by_address(add)['deps'].items():
                            if key in ['compound', 'amod']:
                                for j in value:
                                    tmp.append(dep.get_by_address(j)['word'])
                            if key in ['nmod']:
                                tmp.append(dep.get_by_address(add)['word'])
                                for j in value:
                                    for key1, value1 in dep.get_by_address(j)['deps'].items():
                                        if key1 not in ['conj', 'cc']:
                                            for k in value1:
                                                tmp.append(dep.get_by_address(k)['word'])
                                    tmp.append(dep.get_by_address(j)['word'])
                        if dep.get_by_address(add)['word'] not in tmp:
                            tmp.append(dep.get_by_address(add)['word'])                        
                        self._dic[governor[0]].append(('attr', ' '.join(tmp), []))
                        remover.append(' '.join(tmp))
        
        for i in range(len(remover)):
            # all possible positions
            can = [m.start() for m in re.finditer(remover[i], self.original)]
            flag = False
            for j in can:
                if self.original[j-2] == ',':
                    remover[i] = ', ' + remover[i]
                    flag = True
                    break
                elif self.original[j-4:j-1] == 'and':
                    remover[i] = 'and ' + remover[i]
                    flag = True
                    break
            if not flag:
                remover[i] = ' ' + remover[i]
        return remover        
                                                   
    # Breadth First Search the tree and take the first noun in the NP subtree.
    def _find_subject(self):
        synonym = ['', 'which', 'that', 'who', 'whom', 'where', 'when', 'what', 'why', 'how', 'whether', 'in']
        for i in self.t.subtrees(lambda i: i.label() == 'SBAR'):
            dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
            sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']]
            if sub != []:
                for s in self.t.subtrees(lambda s:s[0] == sub[0][0]):
                    return self._find_NOUN(s)   
            for s in i.subtrees(lambda s: s.label() == 'NP'):
                for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() in 'PRP'):
                    return self._find_NOUN(n)
                for n in s.subtrees(lambda n: n.label() == 'DT'):
                    return (n[0], self._find_attrs(n, n[0]))
        for i in self.t.subtrees(lambda i: i.label() not in ['S', 'ROOT', 'PP', 'FRAG']):  
            # there is a subject
            dep, = self._dependency.raw_parse(' '.join(self.t.flatten()))
            sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']]
            if sub != []:
                for s in self.t.subtrees(lambda s:s[0] == sub[0][0]):
                    return self._find_NOUN(s)   
                                                   
            if i.label() not in ['VP','PP'] and 'VB' not in i.label():                                
                for s in self.t.subtrees(lambda s: s.label() == 'NP'): 
                    for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'):                          
                        return self._find_NOUN(n)
                    for n in s.subtrees(lambda n: n.label() == 'DT'):
                        return (n[0], self._find_attrs(n, n[0]))
            
            # imperative sentence
            elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation == 'main':
                if [x for x in self.t.pos()][0][1] not in ['RB','MD'] and 'VB' not in [x for x in self.t.pos()][0][1]:
                    for s in self.t.subtrees(lambda s: s.label() == 'NP'): 
                        for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'):                          
                            return self._find_NOUN(n)
                        for n in s.subtrees(lambda n: n.label() == 'DT'):
                            return (n[0], self._find_attrs(n, n[0]))
                    return None
                else:
                    return None
                                                   
            # no subject & the relation is a pronoun
            elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation in synonym:
                dep, = self._dependency.raw_parse(self.original)
                candidate = [x for x in dep.triples() if x[1] in ['acl:relcl','acl'] and x[2][0] in self.t.flatten()]
                if candidate != []:
                    compound = self._find_compound(candidate[0][0][0], dep)
                    sub = []
                    if compound != '':
                        for com in compound:
                            sub.append(com)
                    sub.append(candidate[0][0][0])
                    return (' '.join(sub), [])
                else:
                    sent = [x[0] for x in self.pos]
                    if self._data.relation != '':
                        candidate = [x for x, y in enumerate(sent) if y == self._data.relation.split(' ')[0]]
                        after = self.t.pos()[0][0]
                    else:
                        candidate = [x for x, y in enumerate(sent) if y == self.t.pos()[0][0]]
                        if len(self.t.pos()) > 1:                               
                            after = self.t.pos()[1][0]
                        else:
                            after = ''                           
                    before = candidate[0] - 1 
                    for x in candidate:
                        if sent[x+1] == after:
                            before = x - 1
                    
                    if before == -1:
                        return None

                    # is the previous word in the original sentence an NN?
                    if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0] or [x[1] for x in self.pos if x[0] == sent[before]][0] in ['PRP']:
                        sub = [sent[before]]
                        before -= 1
                        while 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                            sub.append(sent[before])
                            before -= 1
                        return (' '.join(reversed(sub)), [])
                    elif [x[1] for x in self.pos if x[0] == sent[before]][0] in ['IN',','] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before-1]][0]:
                        before -= 1                               
                        sub = [sent[before]]
                        before -= 1
                        while before != -1 and 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                            sub.append(sent[before])
                            before -= 1
                        return (' '.join(reversed(sub)), [])

                    # find the nearest candidate in the parent
                    else:
                        target = self.t.pos()[0][0]
                        if self._data.parent.svo['subject'] == []:
                            sub = -1    
                        else:
                            sub = self._data.parent.svo['subject'][0][0].split(' ')[-1]
                        if self._data.parent.svo['object'] == []:
                            obj = -1
                        else:
                            obj = self._data.parent.svo['object'][0][0].split(' ')[-1]
                        if sub == -1 and obj != -1:
                            return self._data.parent.svo['object']
                        elif sub != -1 and obj == -1:
                            return self._data.parent.svo['subject']
                        elif sub != -1 and obj != -1:
                            if abs(self.original.find(target)-self.original.find(sub)) <= abs(self.original.find(target)-self.original.find(obj)):
                                return self._data.parent.svo['subject']
                            else:
                                return self._data.parent.svo['object']

            # no subject & the relation is a conjunction
            elif i.label() == 'VP' or i.label().startswith('VB'):                                   
                if self._data.parent != None:
                    return self._data.parent.svo['subject']
            else:                                  
                return None
                                                   
    def _find_compound(self, word, dep):
        deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word]
        com = []
        deps = [x for x in deps if 'compound' in x]                                           
        for i in deps:
            for j in i['compound']:
                com.append(dep.get_by_address(j)['word'])  
        deps = [x for x in deps if 'dep' in x]                                           
        for i in deps:
            com.append(dep.get_by_address(i['dep'][0])['word'])                                            
        return com
                                                   
    
    def _compound(self, compound, before):
        obj = []
        if compound != '':
            for n in self.t.subtrees(lambda n:n[0] == before):
                for com in compound:
                    for s in n.parent().subtrees(lambda s:s[0] == com):
                        obj.append(com)
        return obj
                                                   
                                                   
    def _dobj(self, candidate, dep, before):
        if 'dobj' in candidate.keys():
            word = dep.nodes[candidate['dobj'][0]]['word']
            tag = dep.nodes[candidate['dobj'][0]]['tag']
        else:
            word = dep.nodes[candidate['xcomp'][0]]['word']
            tag = dep.nodes[candidate['xcomp'][0]]['tag'] 
        compound = self._find_compound(word, dep)
        obj = self._compound(compound, before)
        if tag != 'TO':
            for n in self.t.subtrees(lambda n:n[0] == before):
                for s in n.parent().subtrees(lambda s:s[0] == word):
                    obj.append(s[0])
                    return (' '.join(obj), self._find_attrs(s, ' '.join(obj)))                                           
        

    def _find_object(self, predicate, node = '', data = ''):
        if node == '':
            node = self.t
        if data == '':
            data = self._data
        synonym = ['which', 'that', 'who', 'whom']                                          
        if data != None and data.relation == 'appos':
            dep, = self._dependency.raw_parse(' '.join(node.flatten()))
        else:
            dep, = self._dependency.raw_parse(self.original)
        for i in predicate:
            pre = i[0].split(' ')
            for j in range(len(pre)-1, -1, -1):
                if len([x['deps'] for x in dep.nodes.values() if x['word']==pre[j]]) > 1:
                    dep, = self._dependency.raw_parse(' '.join(node.flatten()))

                candidate = [x['deps'] for x in dep.nodes.values() if x['word']==pre[j]][0]
                candidate_1 = [x for x in dep.triples() if x[2][0]==pre[j]]
                                                   
                if 'dobj' in candidate.keys() or 'xcomp' in candidate.keys():
                    return self._dobj(candidate, dep, pre[j])
                                                   
                elif 'ccomp' in candidate.keys():
                    word = dep.nodes[candidate['ccomp'][0]]['word']
                    tag = dep.nodes[candidate['ccomp'][0]]['tag']
                    dic = collections.defaultdict(list)
                    deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word][0]
                                                   
                    if 'nsubj' in deps.keys():
                        compound = self._find_compound(dep.get_by_address(deps['nsubj'][0])['word'], dep)
                        obj = self._compound(compound, pre[j])
                        obj.append(dep.get_by_address(deps['nsubj'][0])['word'])
                        if 'dobj' in deps.keys() or 'xcomp' in deps.keys():
                            for n in self.t.subtrees(lambda n:n[0] == word):
                                dic['predicate'].append((word, self._find_attrs(n, word))) 
                            dic['object'] = self._add_conj(self._dobj(deps, dep, word))
                            return (' '.join(obj), [dic])
                     
                    elif 'dobj' in deps.keys():
                        compound = self._find_compound(dep.get_by_address(deps['dobj'][0])['word'], dep)
                        obj = self._compound(compound, pre[j])
                        for n in self.t.subtrees(lambda n:n[0] == dep.get_by_address(deps['dobj'][0])['word']):
                            obj.append(n[0])
                            return (' '.join(obj), self._find_attrs(n, ' '.join(obj)))
#                     else:
#                         return None
                                                   
                elif 'cop' in [x[1] for x in candidate_1]:
                    tmp = [x for x in candidate_1 if x[1] == 'cop'][0]
                    compound = self._find_compound(tmp[0][0], dep)
                    obj = self._compound(compound, pre[j])
                    for j in self.t.subtrees(lambda j:j[0] == tmp[0][0]):
                        obj.append(j[0])
                        return (' '.join(obj), self._find_attrs(j, ' '.join(obj)))    
                elif 'case' in [x[1] for x in candidate_1]:
                    tmp = [x for x in candidate_1 if x[1] == 'case'][0]
                    compound = self._find_compound(tmp[0][0], dep)
                    obj = self._compound(compound, pre[j])
                    for j in self.t.subtrees(lambda j:j[0] == tmp[0][0]):
                        obj.append(j[0])
                        return (' '.join(obj), self._find_attrs(j, ' '.join(obj)))
                                                   
                elif 'auxpass' in candidate.keys():
                    sent = [x[0] for x in self.pos]
                    if data != None and data.relation in synonym:
                        relation = sent.index(data.relation.split(' ')[0])
                        if 'IN' in [x[1] for x in self.pos if x[0] == sent[relation]][0]:
                            return (sent[relation-1], [])
                    return None
                                
                # No object
                elif data != None and data.relation in synonym:
                    sent = [x[0] for x in self.pos]
                    before = sent.index(data.relation.split(' ')[0])-1
                    # Is the word preceding it in the original sentence a noun (NN)?
                    if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]:
                        return (sent[before], [])
                    elif 'IN' in [x[1] for x in self.pos if x[0] == sent[before]][0] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before-1]][0]:
                        return (sent[before-1], [])
                    elif data.child != []:
                        kid = data.child[0]
                        if kid.relation != 'appos':
                            return (kid.relation+' '+' '.join(kid.data.flatten()), [])
                    else:
                        return None

                # The object is a clause
                elif data != None and data.child != []:
                    kid = data.child[0]
                    if kid.relation != 'appos':
                        return (kid.relation+' '+' '.join(kid.data.flatten()), [])
                elif [x for x in dep.nodes.values() if x['word']==pre[j]][0]['tag'] == 'RP':
                    continue
                else:
                    return None
                                                   
    def _find_predicate(self):
        tmp = self.t.flatten()
        for n in self.t.subtrees(lambda n: n.label().startswith('VB')):
            if n.parent().label() in ['ADJP']:
                continue
            i = tmp.index(n[0])
            sub = []
            while self.t.pos()[i-1][1] in ['MD','RB']:
                sub.append(self.t.pos()[i-1][0])
                i -= 1
            sub.reverse()
            i = tmp.index(n[0])
            while i+1 < len(tmp):
                if [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0] == 'RP':
                    sub.append(tmp[i])
                    i += 1
                elif [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0] in ['RB','MD']:
                    if i+2 >= len(tmp):
                        break
                    count = i+2
                    while count+1 < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] in ['RB','MD']:
                        count += 1
                    if count < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] == 'TO':
                        sub.append(tmp[i])
                        i += 1
                    else:
                        break
                else:
                    break
            flag = i
            sub.append(tmp[flag])
            # Infinitive (to + verb)
            for j in self.t.subtrees(lambda j:j[0] == tmp[flag]):
                if j.right_sibling() and j.right_sibling().label() == 'PP' and j.right_sibling().leaves()[0] != 'to':
                    start = tmp.index(j.right_sibling().leaves()[-1])
                    has_PP = True
                else:
                    start = flag
                    has_PP = False

                if start+1 < len(tmp) and tmp[start+1] == 'to':
                    for i in range(start+1, len(tmp)):                                                   
                        if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i]][0] in ['TO','RB']:
                            sub.append(tmp[i])
                            if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB'):
                                flag = i
                        else:
                            break

                    if has_PP:
                        for i in self.t.subtrees(lambda i:i[0] == sub[-1]):
                            return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                    else:
                        for i in self.t.subtrees(lambda i:i[0] == tmp[flag]):
                            return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                else:
                    for i in self.t.subtrees(lambda i:i[0] == tmp[flag]):
                        return (' '.join(sub), self._find_attrs(i, ' '.join(sub)))
                                                   
           
    def _find_NOUN(self, n):
        # Possessive
        if n.parent().right_sibling() and n.parent().right_sibling().label().startswith('NN'):
            sub = n.parent().leaves()
            p = n.parent()
            while p.right_sibling():
                if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP','CD','DT']:
                    p = p.right_sibling()
                    sub.append(p[0])   
                else:
                    break
            return (' '.join(sub), self._find_attrs(p, ' '.join(sub)))
        else:
            sub = []
            pp = n.parent()   
            flag = ''
            for l in pp:
                if l.label().startswith('NN') or l.label() in ['PRP','CD','DT']:
                    if l[0] not in sub:
                        sub.append(l[0])
                        flag = l 
            if flag == '':
                sub.append(n[0])
                flag = n
            return (' '.join(sub), self._find_attrs(flag, ' '.join(sub)))
                                                   
    def _find_to(self, node):
        dic = collections.defaultdict(list)
        flag = node.leaves().index('to')
        tmp = node.leaves()[flag:]
        predicate = []
        for i in tmp:
            if [x[1] for x in self.t.pos() if x[0] == i][0] in 'TO' or 'VB' in [x[1] for x in self.t.pos() if x[0] == i][0]:
                predicate.append(i)
            else:
                break    
        for n in node.subtrees(lambda n: n[0] == predicate[-1]):        
            dic['predicate'].append((' '.join(predicate), self._find_attrs(n, ' '.join(predicate))))
        if predicate[-1] == 'be':
            for n in node.subtrees(lambda n: n.label() in ['NP', 'PP']):
                if n.label() in ['NP', 'PP']:
                    for c in n.subtrees(lambda c: c.label().startswith('NN') or c.label() in ['PRP', 'CD']):
                        a = self._find_NOUN(c)
                        dic['object'] = self._add_conj(a)
                        return dic
        else:
            tmp = self._find_object(dic['predicate'], node, None)
            dic['object'] = self._add_conj(tmp)
            return dic 
                                                   
    def _toV(self, node):
        # The same word may occur more than once
        flat = list(self.t.flatten())
        candidate = [x for x, y in enumerate(flat) if y == node[0]]
        flag = candidate[0]
        if node.left_sibling():
            before = node.left_sibling().leaves()[-1]
            for i in candidate:
                if flat[i-1] == before:
                    flag = i
                    break
        elif node.right_sibling():
            after = node.right_sibling().leaves()[0]
            for i in candidate:
                if flat[i+1] == after:
                    flag = i
                    break 
        elif node.parent().left_sibling():
            before = node.parent().left_sibling().leaves()[-1]
            for i in candidate:
                if flat[i-1] == before:
                    flag = i
                    break
        elif node.parent().right_sibling():
            after = node.parent().right_sibling().leaves()[0]
            for i in candidate:
                if flat[i+1] == after:
                    flag = i
                    break 
        
        if not node.label().startswith('VB') and flag+2 < len(flat) and flat[flag+1] == 'to' and [x[1] for x in self.t.pos() if x[0] == flat[flag+2]][0] in 'VB':
            for i in self.t.subtrees(lambda i: i[0] == 'to'):                                 
                if flat[flag] not in i.parent().flatten():
                    return i.parent()

        else:
            return None
               
    def _PP(self, s, name, attrs):
        if ' '.join(s.flatten()) not in name:
            if len(s[0]) != 1:
                for i in s.subtrees(lambda i: i.label() == 'PP'):
                    if i.parent() == s:
                        a = self._proposition(i)
                        if a != []:
                            attrs.append(a)
                        else:
                            attrs.append(' '.join(s.flatten()))
            else:
                a = self._proposition(s)
                if a != []:
                    attrs.append(a)
                else:
                    attrs.append(' '.join(s.flatten()))
        return attrs
                                                   
                                                   
    def _find_attrs(self, node, name): 
        attrs = []
        p = node.parent()
        toV = self._toV(node)
        name = name.split(' ')
        # Search siblings of adjective for adverbs
        if node.label().startswith('JJ'):
            for s in p:
                if s.label() == 'RB':
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s.label() == 'PP':
                    attrs = self._PP(s, name, attrs)
                elif s.label() == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))                 

        elif node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']:
            for s in p:
                if s != node and s.label() in ['DT','PRP$','POS','CD','IN'] or s.label().startswith('JJ') or s.label().startswith('NN'):
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s != node and s.label() in ['ADJP','NP','QP', 'VP']:                            
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))  
                elif s != p and s.label() in ['PP']:
                    attrs = self._PP(s, name, attrs)

        # Search siblings of verbs for adverb phrase
        elif node.label().startswith('VB'):   
            for s in p:
#                 if s.label() in ['ADVP','MD','RB']:
                if s.label() in ['ADVP', 'RB', 'MD']:
                    if ' '.join(s.flatten()) not in name:
                        attrs.append(' '.join(s.flatten()))

                elif s.label() == 'PP':
                    attrs = self._PP(s, name, attrs)

            
        # Search uncles
        # if the node is noun or adjective search for prepositional phrase
        if node.label().startswith('JJ') or node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']:
            if p.label() == 'QP':
                p = p.parent()
            for s in p.parent():
                if s != p and s.label() in ['PP']:
                    attrs = self._PP(s, name, attrs)
                elif s != p and 'NN' in s.label() or s.label() == 'JJ':
                    if s[0] not in name:
                        attrs.append(s[0])
                elif s != p and s.label() == 'VP' and s.parent().label() == 'NP':
                    if ' '.join(s.flatten()) not in name:
                        if toV != None:
                            if ' '.join(s.flatten()[:3]) != ' '.join(toV.flatten()[:3]):
                                attrs.append(' '.join(s.flatten()))
                        else:
#                             self._refresh(s)
                            attrs.append(' '.join(s.flatten()))

        elif node.label().startswith('VB') or node.label() == 'RP':
            if p.parent():
                tmp = node
                for s in p.parent():
                    if s != p and s.label().startswith('ADVP'):
                        if ' '.join(s.flatten()) not in name:
                            attrs.append(' '.join(s.flatten()))
    #                 elif s != p and s.label() in ['MD','RB']:
    #                     attrs.append(s[0])
                    elif s != p and s.label() == 'PP' and s == tmp.right_sibling():       
                        attrs = self._PP(s, name, attrs)
                        tmp = s
        
        if toV != None:
            attrs.append(self._find_to(toV))
            self._refresh(toV) 
        
        return attrs  
                                                   
    def _proposition(self, node):
        dic = collections.defaultdict(list)
        tmp = node.leaves()
        if len(tmp) == 1:
            return []
        for k in node.subtrees(lambda k: k.label() in ['IN', 'TO']):  
            if tmp.index(k[0])+1 < len(tmp):
                VB = [x for x in node.pos() if x[0] == tmp[tmp.index(k[0])+1]]
                if VB != [] and 'VB' in VB[0][1]:                                   
                    dic['predicate'].append((k[0]+' '+VB[0][0], []))
                else:
                    dic['predicate'].append((k[0], []))  
            else:
                dic['predicate'].append((k[0], []))                                   
            if k.right_sibling():
                for c in k.right_sibling().subtrees(lambda c: c.label().startswith('NN') or c.label() in ['JJ', 'CD']):
                    # Possessive
                    if c.parent().right_sibling() and c.parent().right_sibling().label().startswith('NN'):
                        sub = c.parent().leaves()
                        p = c.parent()
                        while p.right_sibling():
                            if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP','CD']:
                                p = p.right_sibling()
                                sub.append(p[0])
                                flag = p
                            else:
                                break
                    else:
                        sub = []
                        pp = c.parent()
                        for l in pp:
                            if l.label().startswith('NN') or l.label() in ['PRP','CD', 'JJ']:
                                if l[0] not in sub:
                                    sub.append(l[0])
                                    flag = l
                    dic['object'].append((' '.join(sub), self._find_attrs(flag, ' '.join(sub))))
                    dic['object'] = self._add_conj(dic['object'][0])   
                    return dic
                return []
            else:
                return []
        return []                                           
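
# --- Illustrative sketch (not part of the original example) ---
# The methods above repeatedly walk a constituency parse with NLTK's ParentedTree,
# combining .subtrees() with a predicate and .parent()/.left_sibling()/.right_sibling().
# This minimal, self-contained snippet shows that pattern in isolation; the sentence
# and its bracketed tree are made up for illustration.
from nltk.tree import ParentedTree

_example = ParentedTree.fromstring(
    "(S (NP (DT the) (NN cat)) (VP (VBD sat) (PP (IN on) (NP (DT the) (NN mat)))))")

# Find every PP and report the verb it attaches to (its left sibling's head word).
for _pp in _example.subtrees(lambda n: n.label() == 'PP'):
    _verb = _pp.left_sibling()                     # the (VBD sat) subtree
    print(_verb[0], '->', ' '.join(_pp.leaves()))  # sat -> on the mat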
Esempio n. 27
0
import time

from nltk.parse.corenlp import CoreNLPDependencyParser


def context_to_tree(ith_data, step, to_graph=False):
    start_time = time.time()

    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

    if to_graph:
        context = ith_data['context']
        graph = [[] for _ in range(len(context))]
    else:
        context = ith_data['context']
        tree = [[] for _ in range(len(context))]
        triple = [[] for _ in range(len(context))]
        # figure = [[] for _ in range(len(context))]

    result = {}

    for i in range(
            len(context)
    ):  ## ith context of the input movie (divided into multiple sentences)
        if to_graph:
            graph[i] = [[] for _ in range(len(context[i]))]
        else:
            tree[i] = [[] for _ in range(len(context[i]))]
            triple[i] = [[] for _ in range(len(context[i]))]
            # figure[i] = [[] for _ in range(len(context[i]))]

        for j, jth in enumerate(context[i]):  ## jth sentence of ith context

            ## Tokenizing PLAN
            if to_graph:
                if jth != '':
                    graph[i][j] = []
                    parsed = dep_parser.raw_parse(jth)
                    for parse in parsed:
                        graph[i][j].append(parse.to_dot())
                    graph[i][j] = graph[i][j][0].split('\n')

                else:
                    graph[i][j] = jth

            else:
                if jth != '':
                    # doc = nlp(jth)
                    # tree[i][j] = doc.sentences[0] ## stanfordnlp
                    tree[i][j], triple[i][j] = [], []
                    parsed = dep_parser.raw_parse(jth)
                    for parse in parsed:
                        tree[i][j].append(parse.tree())
                        triple[i][j].append(parse.triples())

                    # figure[i][j] = tree[i][j][0].pretty_print()
                    tree[i][j] = list(tree[i][j][0])
                    triple[i][j] = list(triple[i][j][0])

                else:
                    tree[i][j] = jth
                    triple[i][j] = jth
                    # figure[i][j] = jth
            # print("{0}th Movie Processing => ".format(step+1) + 'i & j: {0}/{2}, {1}/{3}'.format(i+1, j+1, len(context), len(context[i])))

    if to_graph:
        ith_data['graph'] = graph
        print("Parsing Runtime: %0.2f Minutes" %
              ((time.time() - start_time) / 60))
        return ith_data

    else:
        ith_data['tree'] = tree
        ith_data['triple'] = triple
        # ith_data['figure'] = figure
        # print("Parsing Runtime: %0.2f Minutes"%((time.time() - start_time)/60))
        return ith_data
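
# Hypothetical usage sketch (assumptions: a CoreNLP server is reachable at
# http://localhost:9000 and the input follows the {'context': [[sentence, ...], ...]}
# layout that context_to_tree expects).
_sample = {'context': [["The cat chased the mouse .", ""]]}
_parsed = context_to_tree(_sample, step=0)
print(_parsed['tree'][0][0])    # children of the parse tree for the first sentence
print(_parsed['triple'][0][0])  # ((governor, tag), rel, (dependent, tag)) triples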
Esempio n. 28
0
import re

from nltk.parse.corenlp import CoreNLPDependencyParser


class DependenciesLCA():
    def __init__(self, sentence, port=9004):
        self.sentence = sentence.rstrip('.')
        self.sentence = re.sub(r'(.?)([\.,;:\?!()\[\]\{\}«»\'\"\-\—\/’&])',
                               '\\1 \\2 ', self.sentence)

        self.corenlpparser = CoreNLPDependencyParser(url='http://localhost:' +
                                                     str(port))
        parse = self.corenlpparser.raw_parse(self.sentence)
        self.tree = next(parse)

    def lca(self, index1, index2):
        path1 = []
        path2 = []
        path1.append(index1)
        path2.append(index2)

        node = index1
        while (node != self.tree.root):
            node = self.tree.nodes[node['head']]
            path1.append(node)

        node = index2
        while (node != self.tree.root):
            node = self.tree.nodes[node['head']]
            path2.append(node)

        for l1, l2 in zip(path1[::-1], path2[::-1]):
            if (l1 == l2):
                temp = l1
        return temp

    def path_lca(self, node, lca_node):
        path = []
        path.append(node)
        while (node != lca_node):
            node = self.tree.nodes[node['head']]
            path.append(node)
        return path

    def branch_paths(self, ent1, ent2):

        entity1 = re.split(r"[ .',\-0-9]", ent1)[-1]
        entity2 = re.split(r"[ .',\-0-9]", ent2)[-1]

        node1 = None
        node2 = None
        for node in self.tree.nodes:
            if (self.tree.nodes[node]["word"] == entity1) & (node1 == None):
                node1 = self.tree.nodes[node]
            elif (self.tree.nodes[node]["word"] == entity2) & (node2 == None):
                node2 = self.tree.nodes[node]

        try:
            if node1['address'] != None and node2['address'] != None:
                lca_node = self.lca(node1, node2)
                path1 = self.path_lca(node1, lca_node)
                path2 = self.path_lca(node2, lca_node)

                word_path1 = "/".join([p["word"] for p in path1])
                word_path2 = "/".join([p["word"] for p in path2])
                rel_path1 = "/".join([p["rel"] for p in path1])
                rel_path2 = "/".join([p["rel"] for p in path2])
                pos_path1 = "/".join([p["tag"] for p in path1])
                pos_path2 = "/".join([p["tag"] for p in path2])
            else:
                print(entity1, entity2, self.sentence)
        except AssertionError:
            print("Node none, Entity 1 :", node1, entity1, ent1,
                  " /  Entity2 :", node2, entity2, ent2, " /  Phrase :",
                  self.sentence)
        except:
            if (bool(re.search(r'\d', entity1)) == True) | (bool(
                    re.search(r'\d', entity1)) == False):
                return (None, None, None, None, None, None)
            print("Node none, Entity 1 :", node1, entity1, ent1,
                  " /  Entity2 :", node2, entity2, ent2, " /  Phrase :",
                  self.sentence, "  / Tree : ", self.tree)

        return (word_path1, word_path2, rel_path1, rel_path2, pos_path1,
                pos_path2)
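
# Hypothetical usage sketch (assumption: a CoreNLP server listens on port 9004,
# the default used by DependenciesLCA above).
_lca = DependenciesLCA("Marie Curie discovered polonium in Paris", port=9004)
_w1, _w2, _r1, _r2, _p1, _p2 = _lca.branch_paths("Marie Curie", "Paris")
print(_w1, '|', _r1)  # word and relation path from "Curie" up to the LCA
print(_w2, '|', _r2)  # word and relation path from "Paris" up to the LCA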
Esempio n. 29
0
from datetime import datetime
from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')


sentence = "the cat ran in the house"
parse, = dep_parser.raw_parse(sentence)

# Write the SVG rendering of the dependency graph to a timestamped file,
# closing the file handle when done.
with open(str(datetime.now()) + '.svg', 'w') as f:
    svg = parse._repr_svg_()
    f.write(svg)

print(parse.to_conll(4))
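
# Optional addition (not in the original snippet): the same DependencyGraph can also
# be inspected in the console as an NLTK Tree, without writing any file.
print(parse.tree().pformat())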
Esempio n. 30
0
import pandas as pd
from nltk.parse.corenlp import CoreNLPDependencyParser

import ner  # local module providing NER().ner_pass()


class SqlGen:
    parsed = ""
    tokenized = ""
    dep_parser = ""
    text = ""
    data = ""
    attributes = ""
    conditions = []

    #constructor
    def __init__(self, sentence):
        self.prop = {
            "depparse.extradependencies": "NONE",
            "depparse.keepPunct": "false"
        }
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
        self.text = ner.NER().ner_pass(sentence)
        self.parsed, = self.dep_parser.raw_parse(self.text,
                                                 properties=self.prop)

    def getData(self, type='None'):

        if type == 'pandas1' or type == 'pandas2' or type == 'pandas3':
            x = self.parsed
            if x.contains_address(0):
                x.remove_by_address(0)
            x = x.nodes
            df = pd.DataFrame(
                [(v['address'], v['word'], v['lemma'], v['ctag'], v['tag'],
                  v['feats'], v['head'], v['deps'], v['rel'])
                 for v in x.values()],
                columns=[
                    'position', 'word', 'lemma', 'ctag', 'tag', 'feat', 'head',
                    'deps', 'rel'
                ]).set_index('position')
            self.data = df
            if type == 'pandas1':
                #all columns are included
                return df
            elif type == 'pandas2':
                # removed some columns from pandas1, only the columns specified in the list are included
                return df[['lemma', 'tag', 'head', 'rel']]
            else:
                # removed all columns except the dependents
                return df[['deps']]

        else:
            return self.parsed.to_conll(4)

    def getAction(self, df):
        try:
            mainVerb = df.query("tag == 'VB'  & head == 0").to_dict()
            return mainVerb['lemma']
        except IndexError:
            return

    def getAttributes(self, df):
        #x = df.query(" (rel == 'dobj'  & head == %s) |(rel == 'conj:and'  & head ==  %s)" %(1,1)).to_dict()
        x = df.query(
            "  (rel == 'dobj' & head == %s) |(rel == 'acl:relcl') |(rel == 'conj:and' & head ==  %s) |(rel == 'appos' )"
            % (1, 1)).to_dict()
        self.attributes = (x['lemma'])
        self.rel = (x['rel'])
        return x

    def getValueNodes(self, index):
        pos = self.data.query("(rel == 'acl:relcl' )").to_dict()['word']
        if pos:
            pos = list(pos.keys())[0]
            x = self.data.query(
                " (rel == 'nmod:poss')|(rel == 'nmod:at')|(rel == 'dobj' & index > %s) | (rel == 'nmod:for')| (rel == 'nmod:as') | (rel == 'nmod:in')"
                % (pos)).to_dict()
            self.conditions = x['word']
            return x
        else:
            x = self.data.query(
                " (rel == 'nmod:poss')|(rel == 'nmod:at') | (rel == 'nmod:for')| (rel == 'nmod:as') | (rel == 'nmod:in')"
            ).to_dict()
            self.conditions = x['word']
            return x

    def findAssociation(self, attributes):
        att = []
        for keys in attributes:
            x = self.data.query(
                " (~tag.str.contains('DT')& ~rel.str.contains('ref')& ~rel.str.contains('cc')& ~rel.str.contains('case') & ~rel.str.contains('punct')) &(head == %s)"
                % (keys)).to_dict()
            #temp = [(attributes[keys])]
            if self.rel[keys] == "acl:relcl":
                temp = {attributes[keys]: 'acl:relcl'}
            else:
                temp = {attributes[keys]: 'main'}
            for keys in x['lemma']:
                try:
                    if (x['lemma'][keys] not in attributes.values() and
                            x['lemma'][keys] not in self.conditions.values()):
                        temp[x['lemma'][keys]] = x['rel'][keys]
                except AttributeError:
                    pass
            att.append(temp)
        return att
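
# Hypothetical usage sketch (assumptions: the local `ner` module used by the
# constructor is importable and a CoreNLP server runs on localhost:9000).
_gen = SqlGen("show the name of every employee who works in sales")
_df = _gen.getData('pandas2')   # lemma / tag / head / rel columns
print(_df)
print(_gen.getAction(_df))      # lemma(s) of the main verb whose head is 0, if any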
Esempio n. 31
0
import nltk
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.wsd import lesk

# text_extraction, OurDependencyGraph, lemmatize_graph, our_lesk, subj_dept and
# obj_dept are project-specific helpers assumed to be defined elsewhere in this module.


def hanks(verb):
    """
    Implementation of P. Hanks theory.
    Given a transitive verb, we find N sentences in the Brown corpus that
    contains the given verb. We do WSD (using 2 version of Lesk algorithm,
    one handwritten by us and the other from NLTK library) on the verb
    arguments (subj and obj), and finally, we compute the Filler's supersense
    incidence rate.
    """

    fillers = []  # [(subj, obj, sentence)]
    sentences = []

    # Set the URI to communicate with Stanford CoreNLP
    dependency_parser = CoreNLPDependencyParser(url="http://localhost:9000")

    print('[1] - Extracting sentences...')
    list_word_sentences = text_extraction(verb)
    for sent in list_word_sentences:
        sentence = ' '.join(sent)
        sentences.append(sentence.strip())

    sentences = [x.lower() for x in sentences]
    print("\t{} sentences in which the verb \'{}\' appears.".format(str(len(sentences)), verb))

    print('\n[2] - Extracting fillers...')
    for sentence in sentences:
        # PoS Tagging
        sentence = sentence.replace('.', '')
        tokens = nltk.word_tokenize(sentence)
        tags = dict(nltk.pos_tag(tokens))  # dictionary of all PoS tags of the tokens

        # Syntactic parsing
        result = dependency_parser.raw_parse(sentence)
        dep = next(result)
        graph = OurDependencyGraph()  # first init needed because of .init_from_dot()
        graph.init_from_dot(dep.to_dot())

        # Lemmatization
        # (only the verbs are lemmatized; the other words are left unchanged)
        lemmatized_graph = lemmatize_graph(graph, tags)  # e.g. "said" -> "say"

        verb_key_list = lemmatized_graph.get_verb_key(verb)  # list of keys in which we can find the verb in graph.dict
        # format -> [int1, int2, ...], e.g. [34], [0, 10, 34, ...]

        if len(verb_key_list) <= 0:
            # DEBUG
            # print("\tError in **{}**".format(sentence), file=sys.stderr)
            continue

        # Adjacency List
        # we take the first occurrence of the verb, which is our root
        adjs = lemmatized_graph.get_adj_neighbor(verb_key_list[0])
        # keep only the neighbours of the verb that are subjects or objects
        adjs = list(filter(lambda x: x[1] in subj_dept or x[1] in obj_dept, adjs))

        # Valency = 2
        if len(adjs) == 2:  # Note: not all verbs in the sentences have valency = 2
            # assign the correct subject and object
            if adjs[0][1] in subj_dept:
                w1 = lemmatized_graph.dict[adjs[0][0]]
                w2 = lemmatized_graph.dict[adjs[1][0]]
            else:
                w1 = lemmatized_graph.dict[adjs[1][0]]
                w2 = lemmatized_graph.dict[adjs[0][0]]
            fillers.append((w1, w2, sentence))  # where w1 = subj and w2 = obj

    tot = len(fillers)
    print("\n[3] - Total of {} Fillers".format(str(tot)))
    for f in fillers:
        print("\t{}".format(f))

    our_lesk_semantic_types = {}  # {(s1, s2): count}
    nltk_lesk_semantic_types = {}  # {(s1, s2): count}
    for f in fillers:
        # WSD

        # Our Lesk
        s1 = our_lesk(f[0], f[2])
        s2 = our_lesk(f[1], f[2])

        # nltk.wsd's Lesk
        s3 = lesk(f[2], f[0])
        s4 = lesk(f[2], f[1])

        if s1 is not None and s2 is not None:
            # Getting supersenses
            t = (s1.lexname(), s2.lexname())

            # Getting frequency
            if t in our_lesk_semantic_types.keys():
                our_lesk_semantic_types[t] = our_lesk_semantic_types[t] + 1
            else:
                our_lesk_semantic_types[t] = 1

        if s3 is not None and s4 is not None:
            # Getting supersenses
            t = (s3.lexname(), s4.lexname())

            # Getting frequency
            if t in nltk_lesk_semantic_types.keys():
                nltk_lesk_semantic_types[t] = nltk_lesk_semantic_types[t] + 1
            else:
                nltk_lesk_semantic_types[t] = 1

    print('\n[4.1] - "Our Lesk":\n\tFinding Semantic Clusters (percentage, count of instances, semantic cluster):')
    for key, value in sorted(our_lesk_semantic_types.items(), key=lambda x: x[1]):
        to_print = str(round((value / tot) * 100, 2))
        print("\t[{}%] - {} - {}".format(to_print, value, key))

    print('\n[4.2] - "NLTK Lesk":\n\tFinding Semantic Clusters (percentage, count of instances, semantic cluster):')
    for key, value in sorted(nltk_lesk_semantic_types.items(), key=lambda x: x[1]):
        to_print = str(round((value / tot) * 100, 2))
        print("\t[{}%] - {} - {}".format(to_print, value, key))