def get_single(summary):
    if summary.startswith('.'):
        summary = summary[1:]
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    try:
        parse, = dep_parser.raw_parse(summary)
        nouns = set()
        for x in range(1, len(parse.nodes.items())):
            wdict = parse.nodes[x]
            if "NN" in wdict["tag"]:
                nouns.add(wdict["word"])
        return nouns
    except JSONDecodeError:
        print("Decode Error at " + summary)
        return None
    except StopIteration:
        print("Stopped at " + summary)
        return None
    except HTTPError:
        print("HTTPError " + summary)
        return None
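# A minimal usage sketch for get_single(), assuming a CoreNLP server is already
# listening on localhost:9000; the sample sentence is hypothetical, not from the source.
if __name__ == "__main__":
    sample = "The quick brown fox jumps over the lazy dog."
    nouns = get_single(sample)
    if nouns is not None:
        print(sorted(nouns))  # expected to contain the NN-tagged tokens, e.g. 'fox' and 'dog'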
    trees = ne_chunk(sent)
    for tree in trees:
        if hasattr(tree, 'label'):
            if tree.label() in labels:
                entities.append(' '.join(
                    [child[0].lower() for child in tree]))
    return entities

# To run this you first have to connect to the CoreNLP API:
# go to the directory stanford-corenlp-full-2018-02-27 and type the two lines
# below in the terminal as one line:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
# -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 15000 &

from nltk.parse import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')
list(parser.parse(doc))      # for a sentence-tokenized doc
list(parser.raw_parse(doc))  # for non-tokenized docs

# on a tokenized list of words
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
list(pos_tagger.tag(doc))

ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
list(ner_tagger.tag(doc))

from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
list(dep_parser.parse(doc))
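# A small follow-up sketch (assumptions: the server above is running and `doc`
# is a plain string; the sentence here is a made-up example, not from the source):
# iterate over the (governor, relation, dependent) triples of a dependency parse.
doc = "Stanford University is located in California ."
parse = next(dep_parser.raw_parse(doc))
for governor, relation, dependent in parse.triples():
    print(governor, relation, dependent)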
file_names = []
for r, d, f in os.walk(file_loc):
    for file in f:
        if '.txt' in file:
            file_names.append(os.path.join(r, file))

# File read
for file in file_names:
    print(file)
    file_read = open(file, 'r')
    file_text = file_read.read()
    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()

    # Stanford CoreNLP is expected to run at localhost:9000
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')

    corpus_dict = {}
    count = 0
    sent_text = nltk.sent_tokenize(file_text)  # Tokenizing text into sentences
    for sentence in sent_text:
        # Tokenizing sentences into words
        tokenized_text = [
            i for i in nltk.word_tokenize(sentence.lower())
            if i not in stop_words
        ]
        # Lemmatizing the words to extract lemmas as features
        lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
        # Stemming the words
        stemmed = [porter.stem(word) for word in tokenized_text]
        # POS tagging the words to extract POS features
        tagged = nltk.pos_tag(tokenized_text)
## 2017 12 3: using a different parser to parse sentences
'''
from nltk.parse.stanford import StanfordDependencyParser
path_to_jar = '/Users/collin/stanford/stanford-parser-full-2017-06-09/stanford-parser.jar'
path_to_models_jar = '/Users/collin/stanford/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'
dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                             path_to_models_jar=path_to_models_jar)
'''
from nltk.parse.corenlp import CoreNLPServer, CoreNLPDependencyParser

path_to_jar = '/Users/collin/stanford/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0.jar'
path_to_models_jar = '/Users/collin/stanford/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0-models.jar'

server = CoreNLPServer(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
server.start()

dependency_parser = CoreNLPDependencyParser()
stemmer = SnowballStemmer('english')


def stem(w):
    return stemmer.stem(w)


DR_one = ['nsubj', 'dobj', 'xsubj', 'csubj', 'nmod', 'iobj', 'xcomp']
DR_two = ['amod']
# DR_two = ['nsubj','dobj','xsubj','csubj','nsubjpass','nmod','iobj']
DR_three = ['conj']
DR = DR_one + DR_three
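# Illustrative sketch (not part of the original script): pull out stemmed
# (governor, relation, dependent) pairs whose relation is listed in DR, using the
# dependency_parser started above. The function name and sentence are assumptions.
def extract_dr_pairs(sentence):
    parse = next(dependency_parser.raw_parse(sentence))
    pairs = []
    for gov, rel, dep in parse.triples():
        if rel in DR:
            pairs.append((stem(gov[0]), rel, stem(dep[0])))
    return pairs

# e.g. extract_dr_pairs("The committee approved the new budget and the schedule")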
    def __init__(self):
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
import os
import copy
import string
# from word2number import w2n
import stanza
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser

parser = CoreNLPParser(url='http://localhost:9000')
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

nlp = stanza.Pipeline(
    "en",
    processors={"tokenize": "gum", "pos": "gum", "lemma": "gum", "depparse": "gum"},
    use_gpu=True,
    pos_batch_size=2000
)

# cd ./Desktop/Udep2Mono/NaturalLanguagePipeline/lib/stanford-corenlp-4.1.0
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

replacement = {
    "out of": "out-of",
    "none of the": "none-of-the",
    "all of the": "all-of-the",
    "some of the": "some-of-the",
    "most of the": "most-of-the",
    "many of the": "many-of-the",
    "several of the": "several-of-the",
def context_to_tree(ith_data, step, to_graph=False): start_time = time.time() dep_parser = CoreNLPDependencyParser(url='http://localhost:9000') if to_graph: context = ith_data['context'] graph = [[] for _ in range(len(context))] else: context = ith_data['context'] tree = [[] for _ in range(len(context))] triple = [[] for _ in range(len(context))] # figure = [[] for _ in range(len(context))] result = {} for i in range( len(context) ): ## ith context of input movie(divided in multple sentences) if to_graph: graph[i] = [[] for _ in range(len(context[i]))] else: tree[i] = [[] for _ in range(len(context[i]))] triple[i] = [[] for _ in range(len(context[i]))] # figure[i] = [[] for _ in range(len(context[i]))] for j, jth in enumerate(context[i]): ## jth sentence of ith context ## Tokenizing PLAN if to_graph: if jth != '': graph[i][j] = [] parsed = dep_parser.raw_parse(jth) for parse in parsed: graph[i][j].append(parse.to_dot()) graph[i][j] = graph[i][j][0].split('\n') else: graph[i][j] = jth else: if jth != '': # doc = nlp(jth) # tree[i][j] = doc.sentences[0] ## stanfordnlp tree[i][j], triple[i][j] = [], [] parsed = dep_parser.raw_parse(jth) for parse in parsed: tree[i][j].append(parse.tree()) triple[i][j].append(parse.triples()) # figure[i][j] = tree[i][j][0].pretty_print() tree[i][j] = list(tree[i][j][0]) triple[i][j] = list(triple[i][j][0]) else: tree[i][j] = jth triple[i][j] = jth # figure[i][j] = jth # print("{0}th Movie Processing => ".format(step+1) + 'i & j: {0}/{2}, {1}/{3}'.format(i+1, j+1, len(context), len(context[i]))) if to_graph: ith_data['graph'] = graph print("Parsing Runtime: %0.2f Minutes" % ((time.time() - start_time) / 60)) return ith_data else: ith_data['tree'] = tree ith_data['triple'] = triple # ith_data['figure'] = figure # print("Parsing Runtime: %0.2f Minutes"%((time.time() - start_time)/60)) return ith_data
# Uncomment the two lines below the first time you run the code
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# from cgi import escape
from pprint import pprint
# from en import singular
# from pattern.text.en import singularize
from nltk.stem.snowball import SnowballStemmer  # the stemmer is useful, but not right now
from nltk.treeprettyprinter import TreePrettyPrinter
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser

coreNLPurl = 'http://corenlp.run/'
# use the local server with the desired port number in case you can't use corenlp.run
localServer = 'http://localhost:9000'

dependencyParser = CoreNLPDependencyParser(url=coreNLPurl)
stemmer = SnowballStemmer("english")
Parser = CoreNLPParser(url=coreNLPurl)


def main():
    reviewFile = "review-data.txt"
    reviewData = open(reviewFile)
    text = reviewData.readline()
    sentences = nltk.sent_tokenize(text)
    print("=========================================================================")

    O = ["great"]  # opinion word dictionary
    print("Initial Opinion Lexicon ")
    print(O)
    def dependency_parse_tree(self, s):
        parser = CoreNLPDependencyParser()
        parse = next(parser.raw_parse(s))
        return parse
    lemmatizer = WordNetLemmatizer()
    new_dict = {}
    for k in graph.dict:
        word = graph.dict[k]
        if word in tags.keys() and tags[word] in verbs:
            new_dict[k] = lemmatizer.lemmatize(word, 'v')
        else:
            new_dict[k] = word
    return DependencyGraph(graph.graph, new_dict)


if __name__ == "__main__":
    fillers = []  # [(subj, obj, sentence)]
    sentences = []
    dependency_parser = CoreNLPDependencyParser(url="http://localhost:9000")

    print('extracting sentences...')
    list_word_sentences = text_extraction()
    for sent in list_word_sentences:
        sentence = ' '.join(sent)
        sentences.append(sentence.strip())
    sentences = [x.lower() for x in sentences]
    print(str(len(sentences)) + ' sentences')

    print('extracting fillers...')
    for sentence in sentences:
        # PoS tagging
        sentence = sentence.replace('.', '')
        tokens = nltk.word_tokenize(sentence)
import nltk
from nltk.corpus import wordnet as wn
from nltk.parse.corenlp import CoreNLPDependencyParser
from graphviz import Source
from pattern.vector import stemmer
from pycorenlp import StanfordCoreNLP
from sutime import SUTime
from textblob import TextBlob
from stanfordnlp.server import CoreNLPClient
from pynlp import StanfordCoreNLP

annotators = 'tokenize, ssplit, pos, ner, coref'
options = {'openie.resolve_coref': True}
nlp = StanfordCoreNLP(annotators=annotators, options=options)
sdp = CoreNLPDependencyParser()

# -----------------------------------------------------------------------------------------------------------------------
# LOAD THE SENTENCES
filepath = 'kolbuszowa.txt'
list_sentences = []
with open(filepath, encoding="utf8") as file:
    for line in file:
        list_sentences.append([line[:line.rfind(".") + 1]])

# PREPROCESSING START
# CREATE TEMPORARY LIST OF ADJACENT SENTENCES FOR COREFERENCING (PREVIOUS 2 SENTENCES)
for i in range(len(list_sentences)):
    adj_sentences = []
    start_index = i - 1
    if (start_index < 0):
from datetime import datetime
from collections import defaultdict
import re
import random
from .config import *
from util.file_utils import load, save
from util.dict_utils import counter2ordered_dict
from common.constants import STOPWORDS, PUNCTUATIONS, FIGURE_PATH, OUTPUT_PATH, \
    SYNONYM_DICT, CONCEPT_PATTERN_DICT, SPECIAL_WORDS, PATTERN_WORDS  # SPECIAL_WORDS
import os
from nltk.parse.corenlp import CoreNLPDependencyParser
import networkx as nx
from networkx.drawing.nx_pydot import write_dot
from .GIANT_data_utils import char2cid, get_embedding, from_networkx
from torch_geometric.data import Data  # , DataLoader

DEP_PARSER = CoreNLPDependencyParser(url='http://localhost:9005')


def cover_count(title, entitydict):
    allvalue = 0
    for token, value in entitydict.items():
        if token in title:
            allvalue += value
    return allvalue


def select_sub_titles(title_candi, wordset):
    title_score = {}
    for title in title_candi:
        subline = re.split(r'[?!/,\(\)_:\-【】\[\]—!,\|。、?: 丨]+', title)
        goodtitle = ''
    def __init__(self):
        self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
        self.sentence_tokenizer = PunktSentenceTokenizer()
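# Illustrative standalone sketch of the same split-then-parse pattern; the function
# name, default URL, and sample call are assumptions, not taken from the source class.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.parse.corenlp import CoreNLPDependencyParser


def parse_paragraph(text, url='http://localhost:9000'):
    """Split text into sentences with Punkt, then dependency-parse each sentence."""
    parser = CoreNLPDependencyParser(url=url)
    tokenizer = PunktSentenceTokenizer()
    return [next(parser.raw_parse(sent)) for sent in tokenizer.tokenize(text)]

# e.g. graphs = parse_paragraph("Jack is a boy. He is handsome.")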
import os
import nltk
import pynlp
from sutime import SUTime
from textblob import TextBlob
from stanfordnlp.server import CoreNLPClient
from pynlp import StanfordCoreNLP
from pycorenlp import *
from nltk.parse.corenlp import CoreNLPDependencyParser
from Util import get_tree, get_left_children, get_children, get_wordnet_pos, get_right_children

synonyms = {'soldiers': 'troopers', 'soldier': 'trooper'}
annotators = 'pos, ner, depparse, openie'
options = {'openie.resolve_coref': True}

nlp_ = pynlp.StanfordCoreNLP(annotators=annotators, options=options)
nlp = StanfordCoreNLP("http://localhost:9000/")
sdp = CoreNLPDependencyParser()

jar_files = os.path.join(os.path.dirname(__file__), 'jars')
sutime = SUTime(jars=jar_files, mark_time_ranges=True, include_range=True)
lemmatizer = nltk.WordNetLemmatizer()

# -----------------------------------------------------------------------------------------------------------------------
# LOAD THE SENTENCES
filepath = 'kolbuszowa.txt'
list_sentences = []
with open(filepath, encoding="utf8") as file:
    for line in file:
        list_sentences.append([line[:line.rfind(".") + 1]])

# PREPROCESSING START
for i in range(len(list_sentences)):
    sentence = list_sentences[i][0]
    # PREPROCESSING
class DependenciesLCA(): def __init__(self, sentence, port=9004): self.sentence = sentence.rstrip('.') self.sentence = re.sub(r'(.?)([\.,;:\?!()\[\]\{\}«»\'\"\-\—\/’&])', '\\1 \\2 ', self.sentence) self.corenlpparser = CoreNLPDependencyParser(url='http://localhost:' + str(port)) parse = self.corenlpparser.raw_parse(self.sentence) self.tree = next(parse) def lca(self, index1, index2): path1 = [] path2 = [] path1.append(index1) path2.append(index2) node = index1 while (node != self.tree.root): node = self.tree.nodes[node['head']] path1.append(node) node = index2 while (node != self.tree.root): node = self.tree.nodes[node['head']] path2.append(node) for l1, l2 in zip(path1[::-1], path2[::-1]): if (l1 == l2): temp = l1 return temp def path_lca(self, node, lca_node): path = [] path.append(node) while (node != lca_node): node = self.tree.nodes[node['head']] path.append(node) return path def branch_paths(self, ent1, ent2): entity1 = re.split(r"[ .',\-0-9]", ent1)[-1] entity2 = re.split(r"[ .',\-0-9]", ent2)[-1] node1 = None node2 = None for node in self.tree.nodes: if (self.tree.nodes[node]["word"] == entity1) & (node1 == None): node1 = self.tree.nodes[node] elif (self.tree.nodes[node]["word"] == entity2) & (node2 == None): node2 = self.tree.nodes[node] try: if node1['address'] != None and node2['address'] != None: lca_node = self.lca(node1, node2) path1 = self.path_lca(node1, lca_node) path2 = self.path_lca(node2, lca_node) word_path1 = "/".join([p["word"] for p in path1]) word_path2 = "/".join([p["word"] for p in path2]) rel_path1 = "/".join([p["rel"] for p in path1]) rel_path2 = "/".join([p["rel"] for p in path2]) pos_path1 = "/".join([p["tag"] for p in path1]) pos_path2 = "/".join([p["tag"] for p in path2]) else: print(entity1, entity2, self.sentence) except AssertionError: print("Node none, Entity 1 :", node1, entity1, ent1, " / Entity2 :", node2, entity2, ent2, " / Phrase :", self.sentence) except: if (bool(re.search(r'\d', entity1)) == True) | (bool( re.search(r'\d', entity1)) == False): return (None, None, None, None, None, None) print("Node none, Entity 1 :", node1, entity1, ent1, " / Entity2 :", node2, entity2, ent2, " / Phrase :", self.sentence, " / Tree : ", self.tree) return (word_path1, word_path2, rel_path1, rel_path2, pos_path1, pos_path2)
import os
from nltk.tokenize import RegexpTokenizer
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000', tagtype="pos")


def read_data():
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_sentences = []
    sentences = []
    for _, _, file in os.walk("../../data/parsing_corpus"):
        for filename in file:
            with open("../../data/parsing_corpus/" + filename, "r") as f:
                contents = f.read()
                contents = contents.split("\n")
                for i in range(len(contents)):
                    temp_tokenized_sentence = tokenizer.tokenize(contents[i])
                    if (len(temp_tokenized_sentence) <= 50):
                        tokenized_sentences.append(temp_tokenized_sentence)
                        sentences.append(contents[i])
    return tokenized_sentences, sentences


tokenized_sentences, sentences = read_data()
dependency_parsed = []
with open("./dependencies.txt", "w") as f:
    for i in range(len(tokenized_sentences)):
        if (tokenized_sentences[i]):
            f.write(sentences[i] + "\n")
            parses = dep_parser.parse(tokenized_sentences[i])
class SqlGen: parsed = "" tokenized = "" dep_parser = "" text = "" data = "" attributes = "" conditions = [] #constructor def __init__(self, sentence): self.prop = { "depparse.extradependencies": "NONE", "depparse.keepPunct": "false" } self.dep_parser = CoreNLPDependencyParser(url='http://localhost:9000') self.text = ner.NER().ner_pass(sentence) self.parsed, = self.dep_parser.raw_parse(self.text, properties=self.prop) def getData(self, type='None'): if type == 'pandas1' or type == 'pandas2' or type == 'pandas3': x = self.parsed if x.contains_address(0): x.remove_by_address(0) x = x.nodes df = pd.DataFrame( [(v['address'], v['word'], v['lemma'], v['ctag'], v['tag'], v['feats'], v['head'], v['deps'], v['rel']) for v in x.values()], columns=[ 'position', 'word', 'lemma', 'ctag', 'tag', 'feat', 'head', 'deps', 'rel' ]).set_index('position') self.data = df if type == 'pandas1': #all columns are included return df elif type == 'pandas2': # removed some columns from pandas1, only the columns specified in the list are included return df[['lemma', 'tag', 'head', 'rel']] else: # removed all colums except dependents return df[['deps']] else: return self.parsed.to_conll(4) def getAction(self, df): try: mainVerb = df.query("tag == 'VB' & head == 0").to_dict() return mainVerb['lemma'] except IndexError: return def getAttributes(self, df): #x = df.query(" (rel == 'dobj' & head == %s) |(rel == 'conj:and' & head == %s)" %(1,1)).to_dict() x = df.query( " (rel == 'dobj' & head == %s) |(rel == 'acl:relcl') |(rel == 'conj:and' & head == %s) |(rel == 'appos' )" % (1, 1)).to_dict() self.attributes = (x['lemma']) self.rel = (x['rel']) return x def getValueNodes(self, index): pos = self.data.query("(rel == 'acl:relcl' )").to_dict()['word'] if pos: pos = list(pos.keys())[0] x = self.data.query( " (rel == 'nmod:poss')|(rel == 'nmod:at')|(rel == 'dobj' & index > %s) | (rel == 'nmod:for')| (rel == 'nmod:as') | (rel == 'nmod:in')" % (pos)).to_dict() self.conditions = x['word'] return x else: x = self.data.query( " (rel == 'nmod:poss')|(rel == 'nmod:at') | (rel == 'nmod:for')| (rel == 'nmod:as') | (rel == 'nmod:in')" % (pos)).to_dict() self.conditions = x['word'] return x def findAssociation(self, attributes): att = [] for keys in attributes: x = self.data.query( " (~tag.str.contains('DT')& ~rel.str.contains('ref')& ~rel.str.contains('cc')& ~rel.str.contains('case') & ~rel.str.contains('punct')) &(head == %s)" % (keys)).to_dict() #temp = [(attributes[keys])] if self.rel[keys] == "acl:relcl": temp = {attributes[keys]: 'acl:relcl'} else: temp = {attributes[keys]: 'main'} for keys in x['lemma']: try: if (x['lemma'][keys] not in attributes.values() and x['lemma'][keys] not in self.conditions.values()): temp[x['lemma'][keys]] = x['rel'][keys] except AttributeError: pass att.append(temp) return att
from datetime import datetime

from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.dependencygraph import DependencyGraph

parser = CoreNLPDependencyParser(url='http://localhost:9000')

sentence = "The trophy would not fit in the brown suitcase because it was too big"
# sentence = "I spread the roth on the table in order to protect it"
# sentence = "On the table I've spread the roth in order to protect it"
# sentence = "The city councilmen refused the demonstrators a permit because they feared violence"
# sentence = "She said he told her their secrets"
sentence = "The monkey said the bird told the elephant he was dangerous."
sentence = "The women stopped taking the pills because they were carcinogenic."
sentence = "Marta has a cat, her cat is brown"

parse, = parser.raw_parse(sentence)
conll = parse.to_conll(4)
print(conll)

dg = DependencyGraph(conll)
dotted = dg.to_dot()
G = dg.nx_graph()

f = open('hoy_' + str(datetime.now()) + '.svg', 'w')
svg = dg._repr_svg_()
f.write(svg)
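# Optional follow-up sketch (assumption: the `graphviz` Python package and the
# Graphviz binaries are installed): render the dot output above to a PNG file.
from graphviz import Source

Source(dotted).render('dependency_graph', format='png', cleanup=True)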
import os
import collections
from nltk.tokenize import RegexpTokenizer
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000', tagtype="pos")


def read_data():
    count_preposition = []
    tokenizer = RegexpTokenizer(r'\w+')
    write_to_file = []
    for _, _, file in os.walk("../../data/parsing_corpus"):
        for filename in file:
            with open("../../data/parsing_corpus/" + filename, "r") as f:
                preposition_list = []
                contents = f.read()
                contents = contents.split("\n")
                for i in range(len(contents)):
                    temp_list = []
                    temp_tokenized_sentence = tokenizer.tokenize(contents[i])
                    if (len(temp_tokenized_sentence) <= 50):
                        if (temp_tokenized_sentence):
                            parses = dep_parser.parse(temp_tokenized_sentence)
                            for parse in parses:
                                for governor, dep, dependent in parse.triples():
                                    if (governor[1] == "IN"):
                                        if (governor not in temp_list):
                                            temp_list.append(governor)
# Parse tokenized text.
print(list(parser.parse('What is the airspeed of an unladen swallow ?'.split())))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Parse raw string.
print(list(parser.raw_parse('What is the airspeed of an unladen swallow ?')))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Neural Dependency Parser
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse('What is the airspeed of an unladen swallow ?'.split())
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])
print(
    "\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]\n"
)

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))
print(
    "\nExpected: ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']\n"
)
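# A further hedged example (same localhost:9000 server assumption as above):
# view the dependency parse of the same sentence as an NLTK tree.
parse = next(dep_parser.raw_parse('What is the airspeed of an unladen swallow ?'))
parse.tree().pretty_print()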
def hanks(verb): """ Implementation of P. Hanks theory. Given a transitive verb, we find N sentences in the Brown corpus that contains the given verb. We do WSD (using 2 version of Lesk algorithm, one handwritten by us and the other from NLTK library) on the verb arguments (subj and obj), and finally, we compute the Filler's supersense incidence rate. """ fillers = [] # [(subj, obj, sentence)] sentences = [] # Set the URI to communicate with Stanford CoreNLP dependency_parser = CoreNLPDependencyParser(url="http://localhost:9000") print('[1] - Extracting sentences...') list_word_sentences = text_extraction(verb) for sent in list_word_sentences: sentence = ' '.join(sent) sentences.append(sentence.strip()) sentences = [x.lower() for x in sentences] print("\t{} sentences in which the verb \'{}\' appears.".format(str(len(sentences)), verb)) print('\n[2] - Extracting fillers...') for sentence in sentences: # PoS Tagging sentence = sentence.replace('.', '') tokens = nltk.word_tokenize(sentence) tags = dict(nltk.pos_tag(tokens)) # dictionary of all PoS Tag of the tokens # Syntactic parsing result = dependency_parser.raw_parse(sentence) dep = next(result) graph = OurDependencyGraph() # first init needed because of .init_from_dot() graph.init_from_dot(dep.to_dot()) # Lemmatization # (it lemmatized only the verbs, the other words are not changed) lemmatized_graph = lemmatize_graph(graph, tags) # es. "said" to "say" verb_key_list = lemmatized_graph.get_verb_key(verb) # list of keys in which we can find the verb in graph.dict # format -> [int1, int 2, ...], eg.: [34], [0, 10, 34, ...] if len(verb_key_list) <= 0: # DEBUG # print("\tError in **{}**".format(sentence), file=sys.stderr) continue # Adjacency List # we take the first occurrence of the verb, which is our root adjs = lemmatized_graph.get_adj_neighbor(verb_key_list[0]) # if the adjacent element of the verb are subj or obj we update adjs variable adjs = list(filter(lambda x: x[1] in subj_dept or x[1] in obj_dept, adjs)) # Valency = 2 if len(adjs) == 2: # Note: not all the verb in sentences have valency = 2 # assigning the correct subject and obj if adjs[0][1] in subj_dept: w1 = lemmatized_graph.dict[adjs[0][0]] w2 = lemmatized_graph.dict[adjs[1][0]] else: w1 = lemmatized_graph.dict[adjs[1][0]] w2 = lemmatized_graph.dict[adjs[0][0]] fillers.append((w1, w2, sentence)) # where w1 = subj and w2 = obj tot = len(fillers) print("\n[3] - Total of {} Fillers".format(str(tot))) for f in fillers: print("\t{}".format(f)) our_lesk_semantic_types = {} # {(s1, s2): count} nltk_lesk_semantic_types = {} # {(s1, s2): count} for f in fillers: # WSD # Our Lesk s1 = our_lesk(f[0], f[2]) s2 = our_lesk(f[1], f[2]) # nltk.wsd's Lesk s3 = lesk(f[2], f[0]) s4 = lesk(f[2], f[1]) if s1 is not None and s2 is not None: # Getting supersences t = (s1.lexname(), s2.lexname()) # Getting frequency if t in our_lesk_semantic_types.keys(): our_lesk_semantic_types[t] = our_lesk_semantic_types[t] + 1 else: our_lesk_semantic_types[t] = 1 if s3 is not None and s4 is not None: # Getting supersences t = (s3.lexname(), s4.lexname()) # Getting frequency if t in nltk_lesk_semantic_types.keys(): nltk_lesk_semantic_types[t] = nltk_lesk_semantic_types[t] + 1 else: nltk_lesk_semantic_types[t] = 1 print('\n[4.1] - "Our Lesk":\n\tFinding Semantic Clusters (percentage, count of instances, semantic cluster):') for key, value in sorted(our_lesk_semantic_types.items(), key=lambda x: x[1]): to_print = str(round((value / tot) * 100, 2)) print("\t[{}%] - {} - {}".format(to_print, value, key)) 
    print('\n[4.2] - "NLTK Lesk":\n\tFinding Semantic Clusters (percentage, count of instances, semantic cluster):')
    for key, value in sorted(nltk_lesk_semantic_types.items(), key=lambda x: x[1]):
        to_print = str(round((value / tot) * 100, 2))
        print("\t[{}%] - {} - {}".format(to_print, value, key))
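# Hypothetical invocation of hanks(); the verb is an example choice, not from the source.
# Assumes the Brown corpus is available and a CoreNLP server is running on localhost:9000.
if __name__ == "__main__":
    hanks("say")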
class SVO(): def __init__(self, sentence): config = ApplicationConfig.get_corenlp_config() self._parser = CoreNLPParser(url=f"http://{config['host']}:{config['port']}") self._dependency = CoreNLPDependencyParser(url=f"http://{config['host']}:{config['port']}") sentence = sentence.replace(' ', ' ') sentence = sentence.replace('.', '') self._load(sentence) self.original = sentence def get_dependency_tree(): return self._dependency def get_parser_tree(): return self.t def _load(self, sentence): self.t = list(self._parser.raw_parse(sentence))[0] self.t = ParentedTree.convert(self.t) def show(self): self.t.pretty_print() def find_svo(self): self._queue = [] # sentence須為S或NP才能找SVO & find conj for i in self.t.subtrees(lambda i: i.label() != 'ROOT'): # if i.label() in ['S','NP','SINV','SBAR','FRAG','X','PP']: remover = self._find_conj() # refresh for i in remover: self.original = self.original.replace(i, '') self._load(self.original) self.pos = self.t.pos() self._root = SVONode(('main', self.t), None) self._queue.append(self._root) break # else: # return 'Sentence can not find SVO.' # find SVO while self._queue != []: self._data = self._queue.pop(0) tmp = list(self._data.data.flatten()) if ',' in tmp: tmp.remove(',') if len(tmp) == 1: continue sentence = ' '.join(self._data.data.flatten()) self.t = self._data.data # 找子句 & 對等連接詞 & 分詞 # self.show() if self._data.relation != 'appos': self._find_SBAR() # self.show() # self._remove_comma() # self.show() self._data.svo = collections.defaultdict(list) # Find Subject tmp = self._find_subject() if isinstance(tmp, list): self._data.svo['subject'] = tmp else: self._data.svo['subject'] = self._add_conj(tmp) # Find Predicate tmp = self._find_predicate() self._data.svo['predicate'] = self._add_conj(tmp) # Find Object tmp = self._find_object(self._data.svo['predicate']) self._data.svo['object'] = self._add_conj(tmp) self._all = collections.defaultdict(list) self._flatten(self._data.svo['predicate']) self._data.svo['object'] = self._filter(self._data.svo['object']) for s in self.t.subtrees(): if s.label() != 'ROOT': break else: for i in self.t.subtrees(lambda i:i.label() != 'ROOT'): if i.label() in ['FRAG']: continue if i.label() in ['S','SINV']: for n in i.subtrees(lambda n: n.label() == 'S' and n != i): flag = True test = n while test.parent(): if test.parent() == i: flag = False break test = test.parent() if flag: tmp = self._del(' '.join(n.flatten())) if tmp: self._refresh(n) kid = SVONode(('', self.t), self._data) self._data.child.append(kid) self._queue.append(kid) break break break # Integrate self._result = collections.defaultdict(list) self._traversal(self._root) return self._result def _filter(self, x): for i in x: if i[1] != []: for j in i[1]: if isinstance(j,dict): for k in ['predicate', 'object']: tmp = self._filter(j[k]) if tmp == []: del j[k] else: if j in self._all['predicate']: i[1].remove(j) if i[0] in self._all['predicate']: x.remove(i) return x def _flatten(self, x): for i in x: self._all['predicate'].append(i[0]) if i[1] != []: for j in i[1]: if isinstance(j,dict): for k in j.keys(): self._flatten(j[k]) else: self._all['predicate'].append(j) def _traversal(self, node): if node.svo != None and (node.svo['subject']!=[] or node.svo['predicate']!=[] or node.svo['object']!=[]): self._result[node.relation].append({'subject':node.svo['subject'], 'predicate':node.svo['predicate'], 'object':node.svo['object']}) for i in node.child: self._traversal(i) def _add_conj(self, tmp): result = [] if isinstance(tmp, tuple): flag = tmp[0].split(' ') if len(flag) 
<= 5: for k in flag: if k in self._dic.keys(): # 把conj補進來 for j in self._dic[k]: if j[0] == 'attr': tree = list(self._parser.raw_parse(tmp[0]+' is '+j[1]))[0] tree = ParentedTree.convert(tree) kid = SVONode(('appos', tree), self._data) self._data.child.append(kid) self._queue.append(kid) self._dic[k].remove(j) # a = tmp[0] # b = tmp[1] # result.append((a, b+[j[1]])) else: result.append((j[1], j[2])) if isinstance(tmp, tuple) and tmp[0] not in [x[0] for x in result]: result.append(tmp) result.reverse() return result def _remove_comma(self): for i in self.t.subtrees(lambda i:i[0] in [',', ';']): if i.left_sibling() and i.left_sibling().label() not in ['NP','S','VP','PP','JJ','SINV','ADJP'] and 'VB' not in i.left_sibling().label(): if ' '.join(i.left_sibling().flatten()) != ' '.join(self.t.flatten()): self._refresh(i.left_sibling()) if ' '.join(i.flatten()) != ' '.join(self.t.flatten()): self._refresh(i) # 拔掉的句子放進child def _child(self, a, b): kid = SVONode((a, b), self._data) self._data.child.append(kid) self._queue.append(kid) self._refresh(b, a) # 能否 refresh(拔掉的句子和原有句子是否一樣) def _del(self, tmp_1): tmp = ' '.join(self.t.flatten()) tmp = tmp.replace(tmp_1, '') tmp = tmp.strip(',; ') if tmp != '': return True else: return False def _find_SBAR(self): # 有無對等連接詞 for i in self.t.subtrees(lambda i: i.label() == 'CC'): if i.right_sibling() and i.right_sibling().label() in ['S','VP']: tmp = self._del(i[0]+' '+' '.join(i.right_sibling().flatten())) if tmp and [x for x in self._queue if ' '.join(i.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []: self._child(i[0], i.right_sibling()) # 有無子句 for node in self.t.subtrees(lambda node: node.label() == 'SBAR'): if 'VB' in node.pos()[0][1]: continue tmp = self._del(' '.join(node.flatten())) if tmp: conj = [] # 連接詞 for s in node.subtrees(lambda s: s.label() != 'SBAR'): if s.label() not in ['S','ADVP','RB'] and 'VB' not in s.label(): if s.leaves()[0] not in conj: conj.append(s.leaves()[0]) elif s.label() in ['ADVP','RB']: continue else: break conj = ' '.join(conj) for s in node.subtrees(lambda s: s.label() == 'S'): # SBAR 會重複 if [x for x in self._queue if ' '.join(s.flatten()) in ' '.join(x.data.flatten())] == []: if node.left_sibling() and node.left_sibling().label() == 'IN' and node.parent().label() != 'S': tmp = self._del(' '.join(node.parent().flatten())) if tmp: self._child(conj, s) else: self._child(conj, s) break # 分詞 participle = [x[0] for x in self.t.pos() if x[1] in ['VBG','VBN']] for i in participle: if i in self.t.leaves(): candidate = [x for x, y in enumerate(self.t.leaves()) if y == i] if candidate[-1] == 0: pos = '' else: before = self.t.leaves()[candidate[-1]-1] pos = [x for x in self.t.pos() if x[0] == before][0][1] IN = ['when','while','before','after','till','since','because','as','so','although','though','if','unless','upon','once'] if pos == 'IN' and before.lower() in IN: # candidate[-1]-2 >= 0 and 'VB' not in [x for x in self.t.pos() if x[0] == self.t.leaves()[candidate[-1]-2]][0][1] for j in self.t.subtrees(lambda j: j[0] == before): tmp = self._del(' '.join(j.parent().flatten())) if tmp and j.parent().label() != 'NP' and j.right_sibling() and [x for x in self._queue if ' '.join(j.right_sibling().flatten()) in ' '.join(x.data.flatten())] == []: self._child(before, j.right_sibling()) if ('VB' not in pos) and (pos not in ['IN','RB','MD','POS', 'TO']): for j in self.t.subtrees(lambda j: j[0] == i): tmp = self._del(' '.join(j.parent().flatten())) if tmp and j.parent().label() not in ['NP','ADJP'] and j.right_sibling() and [x for 
x in self._queue if ' '.join(j.parent().flatten()) in ' '.join(x.data.flatten())] == []: self._child('', j.parent()) def _refresh(self, node, conj=''): sentence = ' '.join(self.t.flatten()) if conj == '': tmp = ' '.join(node.flatten()) else: tmp = conj + ' ' + ' '.join(node.flatten()) if tmp in sentence: idx = sentence.index(tmp) if idx-2 >= 0 and sentence[idx-2] == ',': tmp = ', ' + tmp if idx+len(tmp)+1 < len(sentence) and sentence[idx+len(tmp)+1] == ',': tmp = tmp +' ,' sentence = sentence.replace(tmp, '') self._load(sentence) def _find_conj(self): self._dic = collections.defaultdict(list) dep, = self._dependency.raw_parse(self.original) remover = [] pool_conj = [] pool_appos = [] for governor, bridge, dependent in dep.triples(): # 對等連接詞 if bridge == 'conj': # NN conj NN if 'NN' in governor[1] and 'NN' in dependent[1]: address = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0]['conj'] for add in address: if add not in pool_conj: tmp = [] r = [] pool_conj.append(add) for key, value in dep.get_by_address(add)['deps'].items(): if key not in ['conj', 'cc', 'nmod', 'nmod:poss']: for j in value: tmp.append(dep.get_by_address(j)['word']) r.append(dep.get_by_address(j)['word']) if key in ['nmod']: r.append(dep.get_by_address(add)['word']) for j in value: for key1, value1 in dep.get_by_address(j)['deps'].items(): if key1 not in ['conj', 'cc']: for k in value1: r.append(dep.get_by_address(k)['word']) r.append(dep.get_by_address(j)['word']) if key in ['nmod:poss']: for j in value: for key1, value1 in dep.get_by_address(j)['deps'].items(): if key1 not in ['conj', 'cc', 'case']: for k in value1: tmp.append(dep.get_by_address(k)['word']) r.append(dep.get_by_address(k)['word']) if key1 in ['case']: tmp.append(dep.get_by_address(j)['word']) r.append(dep.get_by_address(j)['word']) for k in value1: tmp.append(dep.get_by_address(k)['word']) r.append(dep.get_by_address(k)['word']) if dep.get_by_address(j)['word'] not in tmp: tmp.append(dep.get_by_address(j)['word']) r.append(dep.get_by_address(j)['word']) if dep.get_by_address(add)['word'] not in tmp: tmp.append(dep.get_by_address(add)['word']) if dep.get_by_address(add)['word'] not in r: r.append(dep.get_by_address(add)['word']) for i in self.t.subtrees(lambda i: i.leaves() == r): for n in i.subtrees(lambda n: n[0] == dependent[0]): self._dic[governor[0]].append(('entity', ' '.join(tmp), self._find_attrs(n, ' '.join(tmp)))) remover.append(' '.join(r)) break break if ' '.join(r) not in remover: self._dic[governor[0]].append(('entity', ' '.join(tmp), [])) remover.append(' '.join(r)) # VB conj VB O elif 'VB' in governor[1] and 'VB' in dependent[1] and governor[1] == dependent[1]: gov_key = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0].keys() dep_key = [x['deps'] for x in dep.nodes.values() if x['word']==dependent[0]][0].keys() if [j for j in gov_key if j in ['dobj','xcomp','ccomp', 'nmod', 'nsubjpass']]==[] or [j for j in dep_key if j in ['dobj','xcomp','ccomp', 'nmod', 'nsubjpass', 'nsubj']]==[]: for i in self.t.subtrees(lambda i: i[0] == dependent[0]): self._dic[governor[0]].append(('entity', dependent[0], self._find_attrs(i, dependent[0]))) remover.append(dependent[0]) break # 同位語(回傳整串) elif bridge == 'appos': tmp = [] address = [x['deps'] for x in dep.nodes.values() if x['word']==governor[0]][0]['appos'] for add in address: if add not in pool_appos: tmp = [] pool_appos.append(add) for key, value in dep.get_by_address(add)['deps'].items(): if key in ['compound', 'amod']: for j in value: 
tmp.append(dep.get_by_address(j)['word']) if key in ['nmod']: tmp.append(dep.get_by_address(add)['word']) for j in value: for key1, value1 in dep.get_by_address(j)['deps'].items(): if key1 not in ['conj', 'cc']: for k in value1: tmp.append(dep.get_by_address(k)['word']) tmp.append(dep.get_by_address(j)['word']) if dep.get_by_address(add)['word'] not in tmp: tmp.append(dep.get_by_address(add)['word']) self._dic[governor[0]].append(('attr', ' '.join(tmp), [])) remover.append(' '.join(tmp)) for i in range(len(remover)): #所有可能的位置 can = [m.start() for m in re.finditer(remover[i], self.original)] flag = False for j in can: if self.original[j-2] == ',': remover[i] = ', ' + remover[i] flag = True break elif self.original[j-4:j-1] == 'and': remover[i] = 'and ' + remover[i] flag = True break if not flag: remover[i] = ' ' + remover[i] return remover # Breadth First Search the tree and take the first noun in the NP subtree. def _find_subject(self): synonym = ['', 'which', 'that', 'who', 'whom', 'where', 'when', 'what', 'why', 'how', 'whether', 'in'] for i in self.t.subtrees(lambda i: i.label() == 'SBAR'): dep, = self._dependency.raw_parse(' '.join(self.t.flatten())) sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']] if sub != []: for s in self.t.subtrees(lambda s:s[0] == sub[0][0]): return self._find_NOUN(s) for s in i.subtrees(lambda s: s.label() == 'NP'): for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() in 'PRP'): return self._find_NOUN(n) for n in s.subtrees(lambda n: n.label() == 'DT'): return (n[0], self._find_attrs(n, n[0])) for i in self.t.subtrees(lambda i: i.label() not in ['S', 'ROOT', 'PP', 'FRAG']): # 有Subject dep, = self._dependency.raw_parse(' '.join(self.t.flatten())) sub = [z for x, y, z in dep.triples() if y in ['nsubj', 'nsubjpass']] if sub != []: for s in self.t.subtrees(lambda s:s[0] == sub[0][0]): return self._find_NOUN(s) if i.label() not in ['VP','PP'] and 'VB' not in i.label(): for s in self.t.subtrees(lambda s: s.label() == 'NP'): for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'): return self._find_NOUN(n) for n in s.subtrees(lambda n: n.label() == 'DT'): return (n[0], self._find_attrs(n, n[0])) # 祈使句 elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation == 'main': if [x for x in self.t.pos()][0][1] not in ['RB','MD'] and 'VB' not in [x for x in self.t.pos()][0][1]: for s in self.t.subtrees(lambda s: s.label() == 'NP'): for n in s.subtrees(lambda n: n.label().startswith('NN') or n.label() == 'PRP'): return self._find_NOUN(n) for n in s.subtrees(lambda n: n.label() == 'DT'): return (n[0], self._find_attrs(n, n[0])) return None else: return None # 沒有subject & relation是代名詞 elif (i.label() == 'VP' or i.label().startswith('VB')) and self._data.relation in synonym: dep, = self._dependency.raw_parse(self.original) candidate = [x for x in dep.triples() if x[1] in ['acl:relcl','acl'] and x[2][0] in self.t.flatten()] if candidate != []: compound = self._find_compound(candidate[0][0][0], dep) sub = [] if compound != '': for com in compound: sub.append(com) sub.append(candidate[0][0][0]) return (' '.join(sub), []) else: sent = [x[0] for x in self.pos] if self._data.relation != '': candidate = [x for x, y in enumerate(sent) if y == self._data.relation.split(' ')[0]] after = self.t.pos()[0][0] else: candidate = [x for x, y in enumerate(sent) if y == self.t.pos()[0][0]] if len(self.t.pos()) > 1: after = self.t.pos()[1][0] else: after = '' before = candidate[0] - 1 for x in candidate: if sent[x+1] 
== after: before = x - 1 if before == -1: return None # 原句前一個詞是否為NN if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0] or [x[1] for x in self.pos if x[0] == sent[before]][0] in ['PRP']: sub = [sent[before]] before -= 1 while 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]: sub.append(sent[before]) before -= 1 return (' '.join(reversed(sub)), []) elif [x[1] for x in self.pos if x[0] == sent[before]][0] in ['IN',','] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before-1]][0]: before -= 1 sub = [sent[before]] before -= 1 while before != -1 and 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]: sub.append(sent[before]) before -= 1 return (' '.join(reversed(sub)), []) # 找parent中最近的 else: target = self.t.pos()[0][0] if self._data.parent.svo['subject'] == []: sub = -1 else: sub = self._data.parent.svo['subject'][0][0].split(' ')[-1] if self._data.parent.svo['object'] == []: obj = -1 else: obj = self._data.parent.svo['object'][0][0].split(' ')[-1] if sub == -1 and obj != -1: return self._data.parent.svo['object'] elif sub != -1 and obj == -1: return self._data.parent.svo['subject'] elif sub != -1 and obj != -1: if abs(self.original.find(target)-self.original.find(sub)) <= abs(self.original.find(target)-self.original.find(obj)): return self._data.parent.svo['subject'] else: return self._data.parent.svo['object'] # 沒有subject & relation是連接詞 elif i.label() == 'VP' or i.label().startswith('VB'): if self._data.parent != None: return self._data.parent.svo['subject'] else: return None def _find_compound(self, word, dep): deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word] com = [] deps = [x for x in deps if 'compound' in x] for i in deps: for j in i['compound']: com.append(dep.get_by_address(j)['word']) deps = [x for x in deps if 'dep' in x] for i in deps: com.append(dep.get_by_address(i['dep'][0])['word']) return com def _compound(self, compound, before): obj = [] if compound != '': for n in self.t.subtrees(lambda n:n[0] == before): for com in compound: for s in n.parent().subtrees(lambda s:s[0] == com): obj.append(com) return obj def _dobj(self, candidate, dep, before): if 'dobj' in candidate.keys(): word = dep.nodes[candidate['dobj'][0]]['word'] tag = dep.nodes[candidate['dobj'][0]]['tag'] else: word = dep.nodes[candidate['xcomp'][0]]['word'] tag = dep.nodes[candidate['xcomp'][0]]['tag'] compound = self._find_compound(word, dep) obj = self._compound(compound, before) if tag != 'TO': for n in self.t.subtrees(lambda n:n[0] == before): for s in n.parent().subtrees(lambda s:s[0] == word): obj.append(s[0]) return (' '.join(obj), self._find_attrs(s, ' '.join(obj))) def _find_object(self, predicate, node = '', data = ''): if node == '': node = self.t if data == '': data = self._data synonym = ['which', 'that', 'who', 'whom'] if data != None and data.relation == 'appos': dep, = self._dependency.raw_parse(' '.join(node.flatten())) else: dep, = self._dependency.raw_parse(self.original) for i in predicate: pre = i[0].split(' ') for j in range(len(pre)-1, -1, -1): if len([x['deps'] for x in dep.nodes.values() if x['word']==pre[j]]) > 1: dep, = self._dependency.raw_parse(' '.join(node.flatten())) candidate = [x['deps'] for x in dep.nodes.values() if x['word']==pre[j]][0] candidate_1 = [x for x in dep.triples() if x[2][0]==pre[j]] if 'dobj' in candidate.keys() or 'xcomp' in candidate.keys(): return self._dobj(candidate, dep, pre[j]) elif 'ccomp' in candidate.keys(): word = dep.nodes[candidate['ccomp'][0]]['word'] tag = dep.nodes[candidate['ccomp'][0]]['tag'] 
dic = collections.defaultdict(list) deps = [x['deps'] for x in dep.nodes.values() if x['word'] == word][0] if 'nsubj' in deps.keys(): compound = self._find_compound(dep.get_by_address(deps['nsubj'][0])['word'], dep) obj = self._compound(compound, pre[j]) obj.append(dep.get_by_address(deps['nsubj'][0])['word']) if 'dobj' in deps.keys() or 'xcomp' in deps.keys(): for n in self.t.subtrees(lambda n:n[0] == word): dic['predicate'].append((word, self._find_attrs(n, word))) dic['object'] = self._add_conj(self._dobj(deps, dep, word)) return (' '.join(obj), [dic]) elif 'dobj' in deps.keys(): compound = self._find_compound(dep.get_by_address(deps['dobj'][0])['word'], dep) obj = self._compound(compound, pre[j]) for n in self.t.subtrees(lambda n:n[0] == dep.get_by_address(deps['dobj'][0])['word']): obj.append(n[0]) return (' '.join(obj), self._find_attrs(n, ' '.join(obj))) # else: # return None elif 'cop' in [x[1] for x in candidate_1]: tmp = [x for x in candidate_1 if x[1] == 'cop'][0] compound = self._find_compound(tmp[0][0], dep) obj = self._compound(compound, pre[j]) for j in self.t.subtrees(lambda j:j[0] == tmp[0][0]): obj.append(j[0]) return (' '.join(obj), self._find_attrs(j, ' '.join(obj))) elif 'case' in [x[1] for x in candidate_1]: tmp = [x for x in candidate_1 if x[1] == 'case'][0] compound = self._find_compound(tmp[0][0], dep) obj = self._compound(compound, pre[j]) for j in self.t.subtrees(lambda j:j[0] == tmp[0][0]): obj.append(j[0]) return (' '.join(obj), self._find_attrs(j, ' '.join(obj))) elif 'auxpass' in candidate.keys(): sent = [x[0] for x in self.pos] if data != None and data.relation in synonym: relation = sent.index(data.relation.split(' ')[0]) if 'IN' in [x[1] for x in self.pos if x[0] == sent[relation]][0]: return (sent[relation-1], []) return None # 沒有受詞 elif data != None and data.relation in synonym: sent = [x[0] for x in self.pos] before = sent.index(data.relation.split(' ')[0])-1 # 原句前一個詞是否為NN if 'NN' in [x[1] for x in self.pos if x[0] == sent[before]][0]: return (sent[before], []) elif 'IN' in [x[1] for x in self.pos if x[0] == sent[before]][0] and 'NN' in [x[1] for x in self.pos if x[0] == sent[before-1]][0]: return (sent[before-1], []) elif data.child != []: kid = data.child[0] if kid.relation != 'appos': return (kid.relation+' '+' '.join(kid.data.flatten()), []) else: return None # 受詞為子句 elif data != None and data.child != []: kid = data.child[0] if kid.relation != 'appos': return (kid.relation+' '+' '.join(kid.data.flatten()), []) elif [x for x in dep.nodes.values() if x['word']==pre[j]][0]['tag'] == 'RP': continue else: return None def _find_predicate(self): tmp = self.t.flatten() for n in self.t.subtrees(lambda n: n.label().startswith('VB')): if n.parent().label() in ['ADJP']: continue i = tmp.index(n[0]) sub = [] while self.t.pos()[i-1][1] in ['MD','RB']: sub.append(self.t.pos()[i-1][0]) i -= 1 sub.reverse() i = tmp.index(n[0]) while i+1 < len(tmp): if [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0] == 'RP': sub.append(tmp[i]) i += 1 elif [x[1] for x in self.t.pos() if x[0] == tmp[i+1]][0] in ['RB','MD']: if i+2 >= len(tmp): break count = i+2 while count+1 < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] in ['RB','MD']: count += 1 if count < len(tmp) and [x[1] for x in self.t.pos() if x[0] == tmp[count]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[count]][0] == 'TO': sub.append(tmp[i]) i += 1 else: break else: break flag = i sub.append(tmp[flag]) # 
不定詞 for j in self.t.subtrees(lambda j:j[0] == tmp[flag]): if j.right_sibling() and j.right_sibling().label() == 'PP' and j.right_sibling().leaves()[0] != 'to': start = tmp.index(j.right_sibling().leaves()[-1]) has_PP = True else: start = flag has_PP = False if start+1 < len(tmp) and tmp[start+1] == 'to': for i in range(start+1, len(tmp)): if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB') or [x[1] for x in self.t.pos() if x[0] == tmp[i]][0] in ['TO','RB']: sub.append(tmp[i]) if [x[1] for x in self.t.pos() if x[0] == tmp[i]][0].startswith('VB'): flag = i else: break if has_PP: for i in self.t.subtrees(lambda i:i[0] == sub[-1]): return (' '.join(sub), self._find_attrs(i, ' '.join(sub))) else: for i in self.t.subtrees(lambda i:i[0] == tmp[flag]): return (' '.join(sub), self._find_attrs(i, ' '.join(sub))) else: for i in self.t.subtrees(lambda i:i[0] == tmp[flag]): return (' '.join(sub), self._find_attrs(i, ' '.join(sub))) def _find_NOUN(self, n): # 所有格 if n.parent().right_sibling() and n.parent().right_sibling().label().startswith('NN'): sub = n.parent().leaves() p = n.parent() while p.right_sibling(): if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP','CD','DT']: p = p.right_sibling() sub.append(p[0]) else: break return (' '.join(sub), self._find_attrs(p, ' '.join(sub))) else: sub = [] pp = n.parent() flag = '' for l in pp: if l.label().startswith('NN') or l.label() in ['PRP','CD','DT']: if l[0] not in sub: sub.append(l[0]) flag = l if flag == '': sub.append(n[0]) flag = n return (' '.join(sub), self._find_attrs(flag, ' '.join(sub))) def _find_to(self, node): dic = collections.defaultdict(list) flag = node.leaves().index('to') tmp = node.leaves()[flag:] predicate = [] for i in tmp: if [x[1] for x in self.t.pos() if x[0] == i][0] in 'TO' or 'VB' in [x[1] for x in self.t.pos() if x[0] == i][0]: predicate.append(i) else: break for n in node.subtrees(lambda n: n[0] == predicate[-1]): dic['predicate'].append((' '.join(predicate), self._find_attrs(n, ' '.join(predicate)))) if predicate[-1] == 'be': for n in node.subtrees(lambda n: n.label() in ['NP', 'PP']): if n.label() in ['NP', 'PP']: for c in n.subtrees(lambda c: c.label().startswith('NN') or c.label() in ['PRP', 'CD']): a = self._find_NOUN(c) dic['object'] = self._add_conj(a) return dic else: tmp = self._find_object(dic['predicate'], node, None) dic['object'] = self._add_conj(tmp) return dic def _toV(self, node): # 可能有多個一樣的字 flat = list(self.t.flatten()) candidate = [x for x, y in enumerate(flat) if y == node[0]] flag = candidate[0] if node.left_sibling(): before = node.left_sibling().leaves()[-1] for i in candidate: if flat[i-1] == before: flag = i break elif node.right_sibling(): after = node.right_sibling().leaves()[0] for i in candidate: if flat[i+1] == after: flag = i break elif node.parent().left_sibling(): before = node.parent().left_sibling().leaves()[-1] for i in candidate: if flat[i-1] == before: flag = i break elif node.parent().right_sibling(): after = node.parent().right_sibling().leaves()[0] for i in candidate: if flat[i+1] == after: flag = i break if not node.label().startswith('VB') and flag+2 < len(flat) and flat[flag+1] == 'to' and [x[1] for x in self.t.pos() if x[0] == flat[flag+2]][0] in 'VB': for i in self.t.subtrees(lambda i: i[0] == 'to'): if flat[flag] not in i.parent().flatten(): return i.parent() else: return None def _PP(self, s, name, attrs): if ' '.join(s.flatten()) not in name: if len(s[0]) != 1: for i in s.subtrees(lambda i: i.label() == 'PP'): if i.parent() == 
s: a = self._proposition(i) if a != []: attrs.append(a) else: attrs.append(' '.join(s.flatten())) else: a = self._proposition(s) if a != []: attrs.append(a) else: attrs.append(' '.join(s.flatten())) return attrs def _find_attrs(self, node, name): attrs = [] p = node.parent() toV = self._toV(node) name = name.split(' ') # Search siblings of adjective for adverbs if node.label().startswith('JJ'): for s in p: if s.label() == 'RB': if s[0] not in name: attrs.append(s[0]) elif s.label() == 'PP': attrs = self._PP(s, name, attrs) elif s.label() == 'NP': if ' '.join(s.flatten()) not in name: attrs.append(' '.join(s.flatten())) elif node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']: for s in p: if s != node and s.label() in ['DT','PRP$','POS','CD','IN'] or s.label().startswith('JJ') or s.label().startswith('NN'): if s[0] not in name: attrs.append(s[0]) elif s != node and s.label() in ['ADJP','NP','QP', 'VP']: if ' '.join(s.flatten()) not in name: attrs.append(' '.join(s.flatten())) elif s != p and s.label() in ['PP']: attrs = self._PP(s, name, attrs) # Search siblings of verbs for adverb phrase elif node.label().startswith('VB'): for s in p: # if s.label() in ['ADVP','MD','RB']: if s.label() in ['ADVP', 'RB', 'MD']: if ' '.join(s.flatten()) not in name: attrs.append(' '.join(s.flatten())) elif s.label() == 'PP': attrs = self._PP(s, name, attrs) # Search uncles # if the node is noun or adjective search for prepositional phrase if node.label().startswith('JJ') or node.label().startswith('NN') or node.label() in ['PRP', 'CD', 'DT']: if p.label() == 'QP': p = p.parent() for s in p.parent(): if s != p and s.label() in ['PP']: attrs = self._PP(s, name, attrs) elif s != p and 'NN' in s.label() or s.label() == 'JJ': if s[0] not in name: attrs.append(s[0]) elif s != p and s.label() == 'VP' and s.parent().label() == 'NP': if ' '.join(s.flatten()) not in name: if toV != None: if ' '.join(s.flatten()[:3]) != ' '.join(toV.flatten()[:3]): attrs.append(' '.join(s.flatten())) else: # self._refresh(s) attrs.append(' '.join(s.flatten())) elif node.label().startswith('VB') or node.label() == 'RP': if p.parent(): tmp = node for s in p.parent(): if s != p and s.label().startswith('ADVP'): if ' '.join(s.flatten()) not in name: attrs.append(' '.join(s.flatten())) # elif s != p and s.label() in ['MD','RB']: # attrs.append(s[0]) elif s != p and s.label() == 'PP' and s == tmp.right_sibling(): attrs = self._PP(s, name, attrs) tmp = s if toV != None: attrs.append(self._find_to(toV)) self._refresh(toV) return attrs def _proposition(self, node): dic = collections.defaultdict(list) tmp = node.leaves() if len(tmp) == 1: return [] for k in node.subtrees(lambda k: k.label() in ['IN', 'TO']): if tmp.index(k[0])+1 < len(tmp): VB = [x for x in node.pos() if x[0] == tmp[tmp.index(k[0])+1]] if VB != [] and 'VB' in VB[0][1]: dic['predicate'].append((k[0]+' '+VB[0][0], [])) else: dic['predicate'].append((k[0], [])) else: dic['predicate'].append((k[0], [])) if k.right_sibling(): for c in k.right_sibling().subtrees(lambda c: c.label().startswith('NN') or c.label() in ['JJ', 'CD']): # 所有格 if c.parent().right_sibling() and c.parent().right_sibling().label().startswith('NN'): sub = c.parent().leaves() p = c.parent() while p.right_sibling(): if p.right_sibling().label().startswith('NN') or p.right_sibling().label() in ['PRP','CD']: p = p.right_sibling() sub.append(p[0]) flag = p else: break else: sub = [] pp = c.parent() for l in pp: if l.label().startswith('NN') or l.label() in ['PRP','CD', 'JJ']: if l[0] not in sub: 
sub.append(l[0]) flag = l dic['object'].append((' '.join(sub), self._find_attrs(flag, ' '.join(sub)))) dic['object'] = self._add_conj(dic['object'][0]) return dic return [] else: return [] return []
""" Created by: Emanuele Bugliarello (@e-bug) Date created: 9/4/2019 Date last modified: 9/4/2019 """ import re import sys import html import numpy as np from collections import defaultdict from nltk.parse.corenlp import CoreNLPDependencyParser mapper = {'"': '``'} parsers = { 'en': CoreNLPDependencyParser(url='http://*****:*****@@'): tokens = tokenize(sent) word, words = [], [] for tok in tokens: if tok.endswith(separator):
features_files = [
    "features/features_train_" + args.version + ".txt",
    "features/features_devel_" + args.version + ".txt",
    "features/features_test_" + args.version + ".txt"
]
features_megan_files = [
    "features/features_megan_train_" + args.version + ".txt",
    "features/features_megan_devel_" + args.version + ".txt",
    "features/features_megan_test_" + args.version + ".txt"
]

# connect to your CoreNLP server
try:
    my_parser = CoreNLPDependencyParser(url="http://localhost:9000")
except (ConnectionError, ConnectionRefusedError) as e:
    print("Loading parser\n")
    print("Error while trying to connect to CoreNLP server. Try running:\n")
    print("\tcd stanford-corenlp-full-2018-10-05")
    print(
        "\tjava -mx4g -cp \"*\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer"
    )
    exit()

# create_features_file(args.inputdirtrain, features_files[0], features_megan_files[0])
# create_features_file(args.inputdirdevel, features_files[1], features_megan_files[1])
# if str(args.createtest) == "True":
create_features_file(args.inputdirtest, features_files[2], features_megan_files[2])
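# Hedged note and sketch: constructing CoreNLPDependencyParser appears not to contact
# the server, so the try/except above may not detect a missing server. A lightweight
# probe request (the function name and probe text are illustrative assumptions) can
# verify the server is really reachable before feature extraction starts.
def corenlp_server_is_up(parser):
    try:
        next(parser.raw_parse("ping"))
        return True
    except Exception:
        return False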
from nltk.parse import CoreNLPParser
from nltk import sent_tokenize

# parser = CoreNLPParser(url='http://localhost:9000')
# print(list(parser.parse('Jack is a boy . He is handsome .'.split())))
# print(list(parser.raw_parse('Jack is a boy . He is handsome .')))

from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
print('I am your dad , he is also your dad .'.split())
parses = dep_parser.parse('I am your dad , he is also your dad .'.split())
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])

# parser = CoreNLPParser(url='http://localhost:9000')
# print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))

# pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
# print(list(pos_tagger.tag('What is the airspeed of an unladen swallow ?'.split())))

# ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
# print(list(ner_tagger.tag(('Rami Eid is studying at Stony Brook University in NY'.split()))))

# tagger = CoreNLPParser(url='http://localhost:9000')
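# Hedged follow-up (same localhost:9000 assumption): the same sentence exported to
# 4-column CoNLL format via the DependencyGraph returned by raw_parse().
parse = next(dep_parser.raw_parse('I am your dad , he is also your dad .'))
print(parse.to_conll(4))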
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
import pickle
from fuzzywuzzy import fuzz
# nltk.download('punkt')
from nltk import tokenize
import re
from tqdm import tqdm
tqdm.pandas()
from string import digits
from string import punctuation

# In[5]:

dep_parser = CoreNLPDependencyParser(url='http://0.0.0.0:9000')
pos_tagger = CoreNLPParser(url='http://0.0.0.0:9000', tagtype='pos')

# In[6]:

def convert_sentence(input_sent):
    # Parse the sentence using the Stanford CoreNLP parser
    pos_type = pos_tagger.tag(input_sent.split())
    parse_tree, = ParentedTree.convert(
        list(pos_tagger.parse(input_sent.split()))[0])
    # The dependency parse is a DependencyGraph, so it is kept as-is rather than
    # converted with ParentedTree (the original conversion call would fail).
    dep_type = next(dep_parser.parse(input_sent.split()))
    return pos_type, parse_tree, dep_type

def multi_liaison(
class GraphMaker: def __init__(self, parserURL='http://localhost:9000'): self.dparser = CoreNLPDependencyParser(url=parserURL) self.clear() # clear saved state def clear(self): self.maxcc = None self.gs = None self.nxgraph = None self.ranked = None #self.words=mdict() # not used ... self.words2lemmas = set() self.noun_set = dict() self.svo_edges_in_graph = [] # digest a file def load(self, fname): self.clear() f = open(fname, 'r') text = f.read() f.close() self.digest(text) def parse(self, text): ts = self.dparser.parse_text(text) return list(ts) # digest a string using dependecy parser def digest(self, text): self.clear() chop = 2**16 gens = [] # deals with files that are too large to be parse at once while len(text) > chop: head = text[:chop] text = text[chop:] #print((head)) if head: hs = list(self.parse(head)) #print('PARSED') gens.append(hs) if gens: self.gs = [x for xs in gens for x in xs] else: self.gs = self.parse(text) #print('!!!',self.gs) # sentence as sequence of words generator def sentence(self): for g in self.gs: yield str.join(' ', list(gwords(g))) def wsentence(self): for g in self.gs: yield tuple(gwords(g)) def nth_sent_words(self, n): ws = tuple(gwords(self.gs[n])) return ws # sentence as sequence of lemmas generator def lsentence(self): for g in self.gs: yield tuple(glemmas(g)) # curates, reverses and adds some new edges # yields an <edge, sentence in which it occurs> pair def edgesInSent(self): self.svo_edges_in_graph = [] def noun_to_def(x, tx, k): if noun_defs: k_ = self.noun_set.get(x) if k == k_: yield (x, tx, 'first_in', k, 'SENT') def edgeOf(k, g): d = w2l(g) merge_dict(self.words2lemmas, d) make_noun_set(g, self.noun_set, k) svo_edges_in_sent = [] for ts in g.triples(): #print('TS',ts) fr, rel, to = list(ts) lfrom, ftag = d[fr[0]] lto, ttag = d[to[0]] # vn is True it is an s->v or o->v link so = isSubj(rel) or isObj(rel) vn = isVerb(ftag) and isNoun(ttag) and so if rel == 'punct' and ttag == '.': # sentence points to predicate verb yield (k, 'SENT', 'predicate', lfrom, ftag) elif vn: # collects vs and vo links to merge them later into svo svo_edges_in_sent.append((lfrom, ftag, rel, lto, ttag)) yield lfrom, ftag, rel, lto, ttag # verb to noun yield k, 'SENT', 'about', lto, ttag # sent to noun # all words recommend sentence #yield lfrom,ftag,'recommends',k,'SENT' # verb to sent - in elif ! for e in noun_to_def( lto, ttag, k, ): yield e # noun to sent if noun_self: yield lto, ttag, 'self', lto, ttag elif isNoun(ttag): # e.g. nmod relation #print('x-->n',k,lfrom,ftag,rel,lto,ttag) yield lfrom, ftag, rel, lto, ttag for e in noun_to_def( lto, ttag, k, ): yield e # noun to sent if noun_self: yield lto, ttag, 'self', lto, ttag #yield lfrom, ftag, 'recommends', k, 'SENT' # dependent of noun to sent else: # yield link as is yield lto, ttag, rel, lfrom, ftag # all words recommend sentence if all_recs: yield lto, ttag, 'recommends', k, 'SENT' # merge compound terms, make their parts recommend them if isNoun(ftag) and isNoun(ttag) and rel == 'compound': comp = lto + ' ' + lfrom yield lfrom, ftag, 'fused', comp, ftag yield lto, ttag, 'fused', comp, ttag for e in noun_to_def(comp, ttag, k): yield e if noun_self: yield comp, ttag, 'self', comp, ttag # collect svo relations self.svo_edges_in_graph.append(to_svo(k, svo_edges_in_sent)) k = 0 for g in self.gs: for e in edgeOf(k, g): # collects words at the two ends of e self.addWordsIn(e) yield e, k k += 1 # yields the edge. 
possibly for each sentence where is found def multi_edges(self): for e, k in self.edgesInSent(): yield e def edges(self): for e in set(self.multi_edges()): yield e # collects unique words at ends of an edge def addWordsIn(self, e): f, tf, r, t, tt = e if maybeWord(f) and tf != 'SENT': self.words.add(f, tf) if maybeWord(t) and tt != 'SENT': self.words.add(t, tt) yield e # returns final networkx text graph def graph(self): if (self.nxgraph): return self.nxgraph dg = nx.DiGraph() for e in self.edges(): f, tf, r, t, tt = e dg.add_edge(f, t, rel=r) self.nxgraph = dg #print('DG:',dg,'END') #print('NOUN_SET',self.noun_set) return dg # ranks (unless ranked and stored as such) the text graph def pagerank(self): if self.ranked: return self.ranked g = self.graph() pr = self.runPagerank(g) self.ranked = pr if not all_recs: return pr ccs = list(nx.strongly_connected_components(g)) lc = len(ccs) #print('LENCOM', lc) if lc < 4: self.maxcc = max(ccs, key=len) return pr # extracts best k nodes passing filtering test def bestNodes(self, k, filter): g = self.graph() comps = list(nx.strongly_connected_components(g)) pr = self.pagerank() i = 0 ns = [] # not a set - that looses order !!! for x, r in pr: if i >= k: break #print('RANKED',x,r) if filter(x): #print('FILTERED',x,r,'MC') if not self.maxcc or x in self.maxcc: if not x in ns: ns.append(x) i += 1 return ns # specialization returning all best k nodes def bestAny(self, k): return self.bestNodes(k, lambda x: True) # specialization returning best k sentence nodes def bestSentencesByRank(self, k): best = self.bestNodes(100 + k, isSent) if not best: return #print('BEST SENTS:',best) c = 0 for i in best: g = self.gs[i] lems = [w for w in glemmas0(g)] #print('LEMS',lems) if isCleanSent(lems): sent = list(gwords(g)) #sent=str.join(' ',list(gwords(g))) yield (i, sent) c += 1 #else : print('SENT UNCLEAN',lems) if c >= k: break def bestSentences(self, k): for i_s in sorted(self.bestSentencesByRank(k)): yield i_s # specialization returning best k word nodes def bestWords(self, k): #print('NOUNS',self.noun_set) c = 0 best = self.bestNodes(100 + k, maybeWord) #print('BEST WORDS:',best) for w in best: if c >= k: break if not isStopWord(w) and self.hasNoun(w): yield (w) #print('BWORD',w) c += 1 # true if a phrase has a noun in it def hasNoun(self, w): ws = w.split(' ') for v in ws: if v in self.noun_set: return True return False # runs PageRank on text graph def runPagerank(self, g): d = nx.pagerank(g) #print("PR",d) # normalize sentence ranks by favoring those close to everage rank sents = list(self.wsentence()) lens = list(map(len, sents)) #print('LENS:', lens) avg = sum(lens) / len(lens) #print('AVG SENT LENGTH:', avg) # reranks long sentences i = 0 for ws in sents: #print('WS:',ws) if i in d: l = len(ws) r = d[i] newr = adjust_rank(r, l, avg) d[i] = newr #if l<6 : print(r,'--->',newr,l,'ws=',ws) i += 1 sd = sorted(d, key=d.get, reverse=True) return [(k, d[k]) for k in sd] # extracts k highest ranked SVO triplets def bestSVOs(self, k): rank_list = self.pagerank() rank_dict = dict() for (w, rw) in rank_list: rank_dict[w] = rw #print('PRANK',rank_list) ranked = [] # should not be a set ! 
for rs in self.svo_edges_in_graph: for r in rs: #print('SVO',r) (f, _), (rel, _), (t, _), sent_id = r srank = rank_dict[f] orank = rank_dict[t] if srank and orank: sorank = (2 * srank + orank) / 3 ranked.append((sorank, (f, rel, t, sent_id))) ranked = sorted(ranked, reverse=True) i = 0 exts = set() seen = set() for (_, e) in ranked: i += 1 if i > k: break #print('SVO_EDGE',e) if e in seen: continue seen.add(e) yield e for xe in self.extend_with_wn_links(e, rank_dict): f, _, t, _ = xe if wn.morphy(f.lower()) != wn.morphy(t.lower()): exts.add(xe) i = 0 for xe in exts: i += 1 if i > k: break #print('XE',xe) yield xe # adds wordnet-derived links to a dictionary d # we tag them with is_a or part_of def extend_with_wn_links(self, e, d): s, v, o, sent_id = e m = 1 # how many of each are taken for x in wn_holo(m, s, 'n'): if x in d: yield (s, 'part_of', x, sent_id) for x in wn_mero(m, s, 'n'): if x in d: yield (x, 'part_of', s, sent_id) for x in wn_hyper(m, s, 'n'): if x in d: yield (s, 'is_a', x, sent_id) for x in wn_hypo(m, s, 'n'): if x in d: yield (x, 'is_a', s, sent_id) for x in wn_holo(m, o, 'n'): if x in d: yield (o, 'part_of', x, sent_id) for x in wn_mero(m, o, 'n'): if x in d: yield (x, 'part_of', o, sent_id) for x in wn_hyper(m, o, 'n'): if x in d: yield (o, 'is_a', x, sent_id) for x in wn_hypo(m, o, 'n'): if x in d: yield (x, 'is_a', o, sent_id) # visualize filtered set of edges with graphviz def toDot(self, k, filter, svo=False, show=True, fname='textgraph.gv'): dot = Digraph() g = self.graph() best = self.bestNodes(k, filter) for f, t in g.edges(): if f in best and t in best: dot.edge(str(f), str(t)) if svo: svos = set() for (s, v, o, _) in self.bestSVOs(k): svos.add((s, v, o)) for e in svos: s, v, o = e dot.edge(s, o, label=v) showGraph(dot, show=show, file_name=fname) # visualize filtered set of edges as graphviz dot graph def svoToDot(self, k): dot = Digraph() for e in self.bestSVOs(3 * k): s, v, o = e dot.edge(s, o, label=v) showGraph(dot) # specialize dot graph to words def wordsToDot(self, k): self.toDot(k, isWord) # specialize dot senteces graph words def sentsToDot(self, k): self.toDot(k, isSent) # visualize mixed sentence - word graph def allToDot(self, k): self.toDot(k, lambda x: True)
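# A hedged end-to-end sketch of how the GraphMaker class above might be used.
# It assumes a CoreNLP server on localhost:9000 and that the helper functions
# the class relies on (gwords, glemmas, isStopWord, to_svo, showGraph, ...) are
# defined elsewhere in this project:
gm = GraphMaker()
gm.digest("Text graphs rank sentences with PageRank. "
          "The highest ranked sentences become the summary.")
for idx, words in gm.bestSentences(2):      # top-ranked sentences, in document order
    print(idx, ' '.join(words))
for w in gm.bestWords(5):                   # top-ranked keyword / keyphrase nodes
    print(w)
for s, v, o, sent_id in gm.bestSVOs(3):     # highest-ranked SVO triplets
    print(s, v, o, '(sentence', sent_id, ')')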
# imports assumed by this function (they match the libraries used elsewhere in
# this document)
import os
import glob
import json
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.wsd import lesk
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser


def extractFeatures():
    stop_words = stopwords.words('english') + list(string.punctuation)
    file_loc = 'wikiTest/'
    os.chdir('/Users/ranjithreddykommidi/NLP/Project/wikiTest')
    file_names = glob.glob('*.txt')
    # Read every Wikipedia article given in the input file list
    for file in file_names:
        readfile = open(file, 'r')
        text = readfile.read()
        corpus = {}
        sent_text = nltk.sent_tokenize(text)
        dep_parser = CoreNLPDependencyParser(url='http://localhost:9010')
        ner_tagger = CoreNLPParser(url='http://localhost:9010', tagtype='ner')
        count = 0
        for sentence in sent_text:
            tokenized_text = [
                i for i in nltk.word_tokenize(sentence.lower())
                if i not in stop_words
            ]
            lemma = [WordNetLemmatizer().lemmatize(word) for word in tokenized_text]
            stemmed = [PorterStemmer().stem(word) for word in tokenized_text]
            tagged = nltk.pos_tag(tokenized_text)
            parse, = dep_parser.raw_parse(sentence)
            dependency_parse = list(parse.triples())
            tokenized_text_ner = nltk.word_tokenize(sentence)
            try:
                ner_tag = ner_tagger.tag(tokenized_text_ner)
            except:
                ner_tag = ner_tagger.tag(tokenized_text)
            Synonym = []
            Hypernym = []
            Hyponym = []
            Meronym = []
            Holonym = []
            Heads = []
            for t in tokenized_text:
                Nyms = lesk(sentence, t)
                if Nyms is not None:
                    this_synonym = t
                    if Nyms.lemmas()[0].name() != t:
                        this_synonym = Nyms.lemmas()[0].name()
                    Synonym.append(this_synonym)
                    if Nyms.hypernyms() != []:
                        Hypernym.append(Nyms.hypernyms()[0].lemmas()[0].name())
                    if Nyms.hyponyms() != []:
                        Hyponym.append(Nyms.hyponyms()[0].lemmas()[0].name())
                    if Nyms.part_meronyms() != []:
                        Meronym.append(Nyms.part_meronyms()[0].lemmas()[0].name())
                    if Nyms.part_holonyms() != []:
                        Holonym.append(Nyms.part_holonyms()[0].lemmas()[0].name())
                else:
                    Synonym.append(t)
            striped_sentence = sentence.strip(" '\"")
            if striped_sentence != "":
                dependency_parser = dep_parser.raw_parse(striped_sentence)
                parsetree = list(dependency_parser)[0]
                head_word = [
                    k["word"] for k in parsetree.nodes.values() if k["head"] == 0
                ][0]
                if head_word != "":
                    Heads.append([head_word])
                else:
                    for i, pp in enumerate(tagged):
                        if pp[1].startswith("VB"):  # pp is a (word, tag) pair
                            Heads.append([tokenized_text[i]])
                            break
                if head_word == "":
                    for i, pp in enumerate(tagged):
                        if pp[1].startswith("NN"):
                            Heads.append([tokenized_text[i]])
                            break
            else:
                Heads.append([""])
            count = count + 1
            corpus[count] = {}
            corpus[count]["sentence"] = sentence
            corpus[count]["tokenized_text"] = tokenized_text
            corpus[count]["lemma"] = lemma
            corpus[count]["stem"] = stemmed
            corpus[count]["tag"] = tagged
            corpus[count]["dependency_parse"] = dependency_parse
            corpus[count]["synonyms"] = Synonym
            corpus[count]["hypernyms"] = Hypernym
            corpus[count]["hyponyms"] = Hyponym
            corpus[count]["meronyms"] = Meronym
            corpus[count]["holonyms"] = Holonym
            corpus[count]["ner_tag"] = str(dict(ner_tag))
            corpus[count]["head_word"] = Heads[0]
            corpus[count]["file_name"] = os.path.basename(file)
        # write this article's features to <article>.json (output naming is an assumption)
        outputName = os.path.splitext(os.path.basename(file))[0] + ".json"
        json_object = json.dumps(corpus, indent=4)
        with open(outputName, "w") as f:
            f.write(json_object)
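# Each article ends up as a JSON file keyed by sentence index. A hedged sketch of
# loading one of those files back and inspecting a sentence's features (the file
# name below is an assumption matching the naming used above):
import json

with open('article.json') as fh:
    features = json.load(fh)
first = features['1']                  # keys are stringified sentence counters
print(first['sentence'])
print(first['lemma'], first['tag'])
print(first['head_word'], first['ner_tag'])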
def __init__(self, parserURL='http://localhost:9000'):
    self.dparser = CoreNLPDependencyParser(url=parserURL)
    self.clear()
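# Assuming this is the GraphMaker constructor shown earlier, the CoreNLP endpoint
# is a parameter, so a server on a non-default port (e.g. the :9010 instance used
# in extractFeatures above) works the same way; a small hedged example:
gm = GraphMaker(parserURL='http://localhost:9010')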
def question_pipeline(question):
    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()
    # stanford corenlp is expected to run at localhost:9000
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    corpus_dict = {}
    count = 0
    sent_text = question
    tokenized_text = nltk.word_tokenize(sent_text)
    question_types = ['who', 'when', 'where', 'Who', 'When', 'Where']
    type_of_question = [i for i in question_types if i in tokenized_text]
    lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
    stemmed = [porter.stem(word) for word in tokenized_text]  # Stemming the words
    # POS tagging the words to extract POS features
    tagged = nltk.pos_tag(tokenized_text)
    parse, = dep_parser.raw_parse(question)
    # Dependency parsing to extract parse-tree-based patterns as features
    dependency_parse = list(parse.triples())
    # LESK to extract best sense of a word
    best_sense = [lesk(question, word) for word in tokenized_text]
    # tokenized_text_ner = nltk.word_tokenize(sent_text)  # Tokenizing sentences into words
    ner_tag = ner_tagger.tag(tokenized_text)
    head_list = []
    striped_sentence = sent_text.strip(" '\"")
    if striped_sentence != "":
        dependency_parser = dep_parser.raw_parse(striped_sentence)
        parsetree = list(dependency_parser)[0]
        head_word = [
            k["word"] for k in parsetree.nodes.values() if k["head"] == 0
        ][0]
        if head_word != "":
            head_list.append([head_word])
        else:
            for i, pp in enumerate(tagged):
                if pp[1].startswith("VB"):  # pp is a (word, tag) pair
                    head_list.append([tokenized_text[i]])
                    break
        if head_word == "":
            for i, pp in enumerate(tagged):
                if pp[1].startswith("NN"):
                    head_list.append([tokenized_text[i]])
                    break
    else:
        head_list.append([""])
    synonym_list = []
    hypernym_list = []
    hyponym_list = []
    meronym_list = []
    holonym_list = []
    for t in tokenized_text:
        best_sense = lesk(sent_text, t)  # LESK to extract best sense of a word
        if best_sense is not None:
            this_synonym = t
            if best_sense.lemmas()[0].name() != t:
                this_synonym = best_sense.lemmas()[0].name()
            synonym_list.append(this_synonym)
            if best_sense.hypernyms() != []:
                hypernym_list.append(
                    best_sense.hypernyms()[0].lemmas()[0].name())
            if best_sense.hyponyms() != []:
                hyponym_list.append(
                    best_sense.hyponyms()[0].lemmas()[0].name())
            if best_sense.part_meronyms() != []:
                meronym_list.append(
                    best_sense.part_meronyms()[0].lemmas()[0].name())
            if best_sense.part_holonyms() != []:
                holonym_list.append(
                    best_sense.part_holonyms()[0].lemmas()[0].name())
        else:
            synonym_list.append(t)
    count = count + 1
    corpus_dict[count] = {}
    corpus_dict[count]["sentence"] = sent_text
    corpus_dict[count]["type_of_question"] = type_of_question
    corpus_dict[count]["tokenized_text"] = tokenized_text
    corpus_dict[count]["lemma"] = lemma
    corpus_dict[count]["stemmed"] = stemmed
    corpus_dict[count]["tagged"] = tagged
    corpus_dict[count]["dependency_parse"] = dependency_parse
    corpus_dict[count]["synonyms"] = synonym_list
    corpus_dict[count]["hypernyms"] = hypernym_list
    corpus_dict[count]["hyponyms"] = hyponym_list
    corpus_dict[count]["meronyms"] = meronym_list
    corpus_dict[count]["holonyms"] = holonym_list
    corpus_dict[count]["ner_tag"] = dict(ner_tag)
    corpus_dict[count]["head_word"] = head_list[0]
    return corpus_dict
    return weightedTags


parser = nltk.parse.corenlp.CoreNLPParser()
rawInput = sys.argv[1]
sentence = next(parser.raw_parse(rawInput))[0]
rules = propToRule(sentence)
lemmatizer = WordNetLemmatizer()
tags = list(
    set(
        map(lambda node: lemmatizeSafely(node[1], lemmatizer),
            flattenTree(sentence))))
dependency_parser = CoreNLPDependencyParser()
result = dependency_parser.raw_parse(rawInput)
depGraph = next(result)
depGraphResult = traverseDepGraph(depGraph.nodes, lemmatizer)
print(
    json.dumps({
        'modality': flatten(rules, []),
        'depGraph': depGraphResult,
        'flatTags': tags
    }))
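# For reference, depGraph.nodes (an nltk DependencyGraph) is a dict keyed by token
# index; node 0 is the artificial root. A hedged sketch of walking it directly,
# independent of this project's traverseDepGraph helper (assumes a CoreNLP server
# on the default localhost:9000):
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
graph = next(dep_parser.raw_parse('Jack built a small house .'))
for address, node in sorted(graph.nodes.items()):
    if node['word'] is None:   # skip the synthetic root node
        continue
    print(address, node['word'], node['tag'], node['rel'], '-> head', node['head'])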