def init_server(self):
    """Launch a local CoreNLP server from the bundled jars and attach a parser client.

    Sets ``self.server`` (started in the background) and ``self.parser``
    (client pointing at the default local endpoint).
    """
    corenlp_dir = os.path.join("stanford-corenlp-full-2018-10-05")
    core_jar = os.path.join(corenlp_dir, "stanford-corenlp-3.9.2.jar")
    models_jar = os.path.join(corenlp_dir, "stanford-corenlp-3.9.2-models.jar")
    self.server = CoreNLPServer(core_jar, models_jar)
    self.server.start()
    self.parser = CoreNLPParser()
def pos_tagger_lemma(document, listterms):
    """Scan sentences for definitions signalled by the copula 'ser'.

    For each sentence in ``document`` (an iterable of strings), POS-tags the
    tokens; when a VERB whose lemma is 'ser' is found, delegates extraction to
    ``definiendum_definition`` (defined elsewhere in this module).
    Returns the last sentence in which a candidate verb/noun was seen.

    NOTE(review): relies on module-level ``nlp`` (spaCy-like pipeline) and a
    CoreNLP server on port 9003 — confirm both are available before calling.
    """
    # Runtime string kept as-is (Spanish status message).
    print('Definición por pos tagger y lemma, busqueda de 3,2 y 1 gram')
    text = str()
    definiendums = list()  # NOTE(review): collected nowhere below — appears vestigial
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    for i in document:
        if (len(i) > 1):  # skip empty / single-char lines
            tag = pos_tagger.tag(i.split(' '))
            for t in tag:
                if (t[1] == 'VERB'):
                    # Lemmatize the verb token to detect the copula.
                    doc = nlp(t[0])
                    for tok in doc:
                        l = tok.lemma_
                        if (l == 'ser'):
                            text = i
                            indverb = i.index(t[0])
                            # Character spans around the verb; note 'front' is the
                            # tail of the sentence and 'back' the head — TODO confirm
                            # the naming matches the author's intent.
                            front = i[indverb:]
                            back = i[:indverb + len(t[0]) + 1]
                            # NOTE(review): tagfront/tagback are computed but unused.
                            tagfront = pos_tagger.tag(front.split(' '))
                            tagback = pos_tagger.tag(back.split(' '))
                            definiendum_definition(t[0], text, listterms)
                elif (t[1] == 'NOUN' and t[0] != '=RRB='):
                    text = i
                    if (len(t[0]) > 1):
                        # Noun-triggered extraction deliberately disabled.
                        #definiendum_definition(t[0], text, listterms)
                        pass
    return (text)
def _parseSentences(sentences, parseCacheFile):
    """Parse sentences with CoreNLP, memoizing results in a shelve cache.

    Sentences already present in the cache file are not re-parsed; the rest
    are parsed in parallel via a worker pool. Returns a list of parses in the
    same order as ``sentences``.

    NOTE(review): cache keys are bytes; ``shelve`` requires str keys on
    Python 3 — confirm the target interpreter before changing keying.
    """
    def cacheKey(text):
        return text.strip().encode('utf-8')

    cache = shelve.open(parseCacheFile)
    toParse = [s for s in sentences if cacheKey(s) not in cache]
    if toParse:
        p = Pool(10)
        parser = CoreNLPParser(
            url=os.getenv("CORENLP_HOST", "http://localhost:9000"))
        parseIterator = p.imap(lambda s: parser.parse_one(s.split()), toParse)
        progress = ProgressBar(len(toParse))
        for i, parse in enumerate(parseIterator):
            cache[cacheKey(toParse[i])] = parse
            progress.done(i)
        progress.complete()
        # Release pool workers instead of leaking them.
        p.close()
        p.join()
    # BUG FIX: materialize the results BEFORE closing the shelf. The original
    # returned a lazy map() object, which on Python 3 would read from the
    # already-closed cache when the caller finally iterated it.
    parses = [cache[cacheKey(s)] for s in sentences]
    cache.close()
    return parses
def check_triples_by_pos(triples):
    """Filter (source, relation, target) triples using POS-tag heuristics.

    Drops triples whose endpoints look like verbs or lack nouns; a noun-ish
    relation is normalized to "at" or rejected unless it contains "of".
    Returns the surviving triples as [source, relation, target] lists.
    """
    pos_tagger = CoreNLPParser(url='http://39.98.186.125:9000', tagtype='pos')

    def joined_tags(phrase):
        # Comma-joined POS tags for every whitespace token of the phrase.
        return ",".join(pair[1] for pair in pos_tagger.tag(phrase.split(" ")))

    kept = []
    for triple in triples:
        source, relation, target = triple[0], triple[1], triple[2]
        source_pos = joined_tags(source)
        relation_pos = joined_tags(relation)
        target_pos = joined_tags(target)
        # Endpoints must not contain verbs...
        if "VB" in source_pos or "VB" in target_pos:
            continue
        # ...and must each contain at least one noun.
        if "NN" not in source_pos or "NN" not in target_pos:
            continue
        if "NN" in relation_pos:
            if " at" in relation.lower():
                relation = "at"
            elif "of" not in relation.split(" ") and len(
                    relation.split(" ")) > 1:
                continue
        kept.append([source, relation, target])
    return kept
def PVD(document):
    """Look for definitional patterns ('se define/entiende/denomina ...') per sentence.

    NOTE(review): this function ends with an unterminated triple-quoted string
    (``'''if(t[1]=='AUX'):``) — the remainder appears to be a commented-out
    block truncated from this view; the function is syntactically incomplete
    as seen here.
    """
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    sentence = sent_tokenize(document)
    word = 'se'  # NOTE(review): unused below — vestigial
    lemma = 'definir'  # NOTE(review): unused below — vestigial
    for i in range(len(sentence)):
        #print('--------------------')
        pattern = list()
        postaglist = list()
        tokens = nltk.word_tokenize(sentence[i])
        tag = pos_tagger.tag(tokens)
        for t in tag:
            if ('se' in tokens):
                pos = tokens.index('se')
                # Re-tag only the token right after 'se'.
                front = tokens[pos + 1:pos + 2]
                tag = pos_tagger.tag(front)
                # Lemmatize the current token via the module-level nlp pipeline.
                doc = nlp(t[0])
                lemlist = [tok.lemma_ for tok in doc]
                #lem=''.join(lemlist)
                #lemmas_list.append(lem)
                #print(lemma, '-', lemlist)
                if ('definir' in lemlist or 'entender' in lemlist
                        or 'denominar' in lemlist):
                    #print(sentence[i])
                    front = tokens[pos + 2:pos + 5]
                if (t[1] == 'PUNCT'):
                    pos = tokens.index(t[0])
                    print(t[0], pos, tag[pos + 1])
                    '''if(t[1]=='AUX'):
def dependency_parse(raw_data): from nltk.parse.corenlp import CoreNLPServer # The server needs to know the location of the following files: # - stanford-corenlp-X.X.X.jar # - stanford-corenlp-X.X.X-models.jar STANFORD = os.path.join("..", "stanford-corenlp-full-2020-04-20") # Create the server server = CoreNLPServer( os.path.join(STANFORD, "stanford-corenlp-4.0.0.jar"), os.path.join(STANFORD, "stanford-corenlp-4.0.0-models.jar"), ) # Start the server in the background server.start() from nltk.parse import CoreNLPParser parser = CoreNLPParser() new_data = [] for example in raw_data: sentence, features_seq = example[0], example[-1] parse = next(parser.raw_parse(sentence)) # get a few "important" neighboring words server.stop()
def pos_tag_text(text):
    """POS-tag whitespace tokens of *text*, lowercase them, and map tags to WordNet.

    Returns a list of (lowercased word, wordnet-tag-or-None) pairs.
    """
    def penn_to_wn_tags(pos_tag):
        # Penn Treebank tag prefix -> WordNet POS constant (None when unmapped).
        prefix_to_wn = {'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV}
        return prefix_to_wn.get(pos_tag[:1])

    #print("ORIGINAL TEXT ---------------",text)
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tagged_text = pos_tagger.tag(text.split())
    #print("TAGGED TEXT ---------",tagged_text)
    return [(token.lower(), penn_to_wn_tags(tag)) for token, tag in tagged_text]
def __init__(self, w2v_path, corpus_dict_path, port=9000):
    """Set up CoreNLP clients, a word2vec model, and one-hot feature vectorizers.

    Args:
        w2v_path: path to the binary word2vec model to load.
        corpus_dict_path: pickle with per-feature training dicts
            ('dep_tuple', 'unigram', 'bigram', 'trigram', 'lexical').
        port: CoreNLP server port on localhost.
    """
    # corenlp clients
    base_url = 'http://localhost:' + str(port)
    self.parser = CoreNLPParser(url=base_url)
    self.dep_parser = CoreNLPDependencyParser(url=base_url)
    # BUG FIX: honor the w2v_path argument — the original ignored it and
    # always loaded a hard-coded GoogleNews path.
    self.word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        w2v_path, binary=True)
    print('w2v model loaded')
    # Training corpus for one-hot features; use a context manager so the
    # file handle is closed (the original leaked it).
    with open(corpus_dict_path, 'rb') as fh:
        corpus_dict = pickle.load(fh)

    def _fit_vectorizer(key):
        # One dense DictVectorizer fitted per feature family.
        return DictVectorizer(sparse=False).fit(corpus_dict[key])

    self.dep_tuple_vectorizer = _fit_vectorizer('dep_tuple')
    self.unigram_vectorizer = _fit_vectorizer('unigram')
    self.bigram_vectorizer = _fit_vectorizer('bigram')
    self.trigram_vectorizer = _fit_vectorizer('trigram')
    self.lexical_vectorizer = _fit_vectorizer('lexical')
def parseSentenceStructure(data):
    """Constituency-parse *data* with CoreNLP and walk the tree for NP roots.

    The NLTK tokenize/tag steps are kept for parity with the original
    pipeline even though only the Stanford parse feeds the tree walk.
    """
    # Tokenize and tag with plain NLTK.
    tokens = nltk.word_tokenize(data)
    tagged = nltk.pos_tag(tokens)
    # CoreNLP clients — see
    # https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk/51981566#51981566
    parser = CoreNLPParser(url='http://localhost:9000')
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    # Parse with Stanford and traverse the first tree for NP subtrees,
    # providing N-V-N relationships with all N combinations.
    tree = parser.raw_parse(data)
    tree_recurse_find(list(tree)[0])
def __init__(self, config):
    """Store config, precompute embeddings, and connect CoreNLP clients.

    Args:
        config: project configuration object (schema defined elsewhere).
    """
    # Ontology-based tagging helper.
    self.ontology_tagging = OntologyTagging()
    self.config = config
    # Precompute embeddings for the full vocabulary up front.
    self.word_dictionary = self.compute_all_embeddings()
    # Both parsers talk to the same local CoreNLP server.
    self.server_url = 'http://localhost:9000'
    self.parser = CoreNLPParser(url=self.server_url)
    self.core_nlp_dependency_parser = CoreNLPDependencyParser(
        url=self.server_url)
def pruebas():
    """Ad-hoc smoke test: POS-tag a Spanish sentence and lemmatize one verb."""
    tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    sample = 'tengo que ir Por el contrato de compra y venta uno de los contratantes se obliga a entregar una cosa determinada y el otro a pagar por ella un precio cierto, en dinero o signo que lo represente'
    tag = tagger.tag(sample.split(' '))
    # Lemmas of a single inflected verb, via the module-level nlp pipeline.
    lemlist = [tok.lemma_ for tok in nlp('considerará')]
    print(lemlist)
def scorer(title, speaker):
    """Score a (text, position) title candidate against a speaker position.

    More noun-like tokens (NN*/NP*) lower the score; distance between the
    title and speaker positions raises it. Lower is better.
    """
    tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    noun_count = sum(
        1 for _, tag in tagger.tag(title[0].split())
        if 'NN' in tag or 'NP' in tag)
    return -noun_count + abs(title[1] - speaker[1]) * 0.3
def __init__(self,
             host='services.loadbalancer.api.questo.ai',
             port=9000,
             separator='|'):
    """Create a CoreNLP client.

    Args:
        host: CoreNLP server hostname.
        port: CoreNLP server port.
        separator: token/tag separator used by callers.

    BUG FIX: the original ignored host/port and always used a hard-coded
    URL; the defaults reproduce that URL exactly, so existing callers see
    identical behavior while non-default arguments now actually work.
    """
    self.parser = CoreNLPParser(url=f'http://{host}:{port}')
    self.separator = separator
def clear_data(self):
    """Reset the parser client, cached parse state, and the S/P/O slots."""
    self.parser = CoreNLPParser(url='http://localhost:9000')
    # First noun/verb phrase found in the current sentence (empty until parsed).
    self.first_NP = ''
    self.first_VP = ''
    self.parse_tree = None
    # Fresh subject/predicate/object holders; the predicate is restricted
    # to verb ('VB') tags.
    self.subject = RDF_Triple.RDF_SOP('subject')
    self.predicate = RDF_Triple.RDF_SOP('predicate', 'VB')
    self.Object = RDF_Triple.RDF_SOP('object')
def parse(text):
    """Parse lowercased *text*; return its trees in CNF as ParentedTrees."""
    parser = CoreNLPParser("http://localhost:9000")
    trees = list(parser.raw_parse(text.lower()))
    for t in trees:
        # Normalize in place: binarize and fold unary chains (incl. root/POS).
        t.chomsky_normal_form()
        t.collapse_unary(collapseRoot=True, collapsePOS=True)
    return [ParentedTree.convert(t) for t in trees]
def parse(text):
    """Parse *text* via the configured CoreNLP server; return CNF ParentedTrees."""
    parser = CoreNLPParser(CORENLP_SERVER)
    trees = list(parser.raw_parse(text))
    for t in trees:
        # Normalize in place: binarize and fold unary chains (incl. root/POS).
        t.chomsky_normal_form()
        t.collapse_unary(collapseRoot=True, collapsePOS=True)
    return [ParentedTree.convert(t) for t in trees]
def verb_stats(data):
    """Count and print the total number of verb tokens across *data*'s values."""
    tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    total = 0
    for value in data.values():
        total += sum(1 for _, tag in tagger.tag(value.split())
                     if tag.startswith("V"))
    print(total)
def syntax_tree_parser(self):
    """
    get syntax tree (memoized on self.syntax_tree)
    :return: syntax tree
    """
    if self.syntax_tree is None:
        # First call: tokenize the text and keep the first parse tree.
        parser = CoreNLPParser(url='http://localhost:8999')
        trees = list(parser.parse(nltk.word_tokenize(self.text)))
        self.syntax_tree = trees[0]
    return self.syntax_tree
def get_probable_title(titles):
    """Pick the most title-like candidate by noun density.

    Each candidate is scored as a tuple (noun-tag count, text length, text,
    position); ties fall through to the later tuple fields under max().
    """
    tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    candidates = []
    for title in titles:
        tags = tagger.tag(title[0].split())
        noun_hits = sum(1 for _, tag in tags if 'NN' in tag or 'NP' in tag)
        candidates.append((noun_hits, len(title[0]), title[0], title[1]))
    return max(candidates)
def __init__(self):
    """
    Initialize the SVO Methods: POS tag groups, the CoreNLP client, and
    the Punkt sentence detector.
    """
    self.noun_types = ["NN", "NNP", "NNPS", "NNS", "PRP"]
    self.verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    self.adjective_types = ["JJ", "JJR", "JJS"]
    self.pred_verb_phrase_siblings = None
    # Removed the unused local jar/model placeholder paths — dead leftovers
    # from a StanfordPOSTagger setup; the HTTP CoreNLP client replaces them.
    self.parser = CoreNLPParser(url='http://localhost:9000')
    self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
def get_speaker_salutaion(text, persons, speakers):
    """Prefix salutation-like tokens onto matching speaker entries.

    Looks for an NNP token that is NOT a known person but directly precedes
    a known person name, and prepends it to the matching entry in
    *speakers*. Returns the (updated) speakers list.

    BUG FIXES vs. original:
      - compared the (word, tag) tuple against person/speaker strings,
        which could never match;
      - concatenated tuple + str (TypeError if reached);
      - rebound the loop variable instead of updating the list, so
        *speakers* was never actually modified.
    """
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tagged = list(pos_tagger.tag(text.split()))
    for i in range(len(tagged) - 1):
        word, tag = tagged[i]
        next_word = tagged[i + 1][0]  # compare the word, not the tuple
        if tag == 'NNP' and word not in persons and next_word in persons:
            for j, speak in enumerate(speakers):
                if next_word in speak:
                    # Write back into the list so the caller sees the change.
                    speakers[j] = word + ' ' + speak
                    break
    return speakers
def get_stanford_pos_tags(line):
    """
    Get part of speech tags using the Stanford POS tagger
    """
    tagger = CoreNLPParser(url="http://localhost:9000", tagtype="pos")
    tokens = cnf.TOKENIZER.tokenize(line)
    # Materialize the tagger's output as plain (word, tag) tuples.
    return [(word, tag) for word, tag in tagger.tag(tokens)]
def createGrammar(self, userMessages, ctx):
    """Induce a PCFG from the users' messages and generate a sentence from it.

    Parses every sentence of every message, pools the unique productions,
    induces a grammar rooted at S, and returns a generated sentence string.
    """
    parser = CoreNLPParser(url='http://localhost:9000')
    trees = []
    for message in userMessages:
        for sentence in nltk.sent_tokenize(message):
            trees.append(list(parser.raw_parse(sentence))[0])
    # Unique production rules across all parse trees.
    rules = set()
    for tree in trees:
        rules.update(tree.productions())
    grammar = nltk.induce_pcfg(nltk.Nonterminal('S'), rules)
    return ' '.join(self.generate_sentence(grammar))
def get_entities(tweets):
    """Extract lowercased named-entity strings from tokenized tweets.

    Args:
        tweets: iterable of tweets, each an iterable of sentences.
    Returns:
        List of entity strings whose NE-chunk label is in the module-level
        ``labels`` set.

    PERF FIX: the CoreNLPParser client (and its HTTP session) was being
    constructed inside the inner per-sentence loop; it is loop-invariant,
    so build it once.
    """
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    entities = []
    for atweet in tweets:
        for sent in atweet:
            tagged = list(pos_tagger.tag(normalize(sent)))
            # sent = pos_tag(normalize(sent))
            for tree in ne_chunk(tagged):
                if hasattr(tree, 'label') and tree.label() in labels:
                    entities.append(' '.join(
                        child[0].lower() for child in tree))
    return entities
def sfNERTagger(rawText):
    '''(sf = stanford) get the raw text from a file and convert that to a list
    with tuples of each word with a StanFord annotated NER-tag'''
    parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    # Abbreviate the NER categories we keep; every other tag is blanked.
    tag_map = {
        'COUNTRY': 'COU',
        'PERSON': 'PER',
        'CITY': 'CIT',
        'ORGANIZATION': 'ORG',
    }
    # Lists (not tuples) so callers can still mutate individual tags.
    return [[word, tag_map.get(tag, '')]
            for word, tag in parser.tag(rawText.split())]
class PosTaggingCoreNLP(PosTagging):
    """
    Concrete class of PosTagging using a CoreNLP server.

    Faster for batches of documents since the model is loaded once on the
    server instead of per call.
    """

    def __init__(self, host='localhost', port=9000, separator='|'):
        self.parser = CoreNLPParser(url=f'http://{host}:{port}')
        self.separator = separator

    def pos_tag_raw_text(self, text, as_tuple_list=True):
        # nltk.parse.corenlp offers no single call that does sentence split +
        # POS tagging (raw_tag_sents assumes pre-split input), so issue the
        # API call ourselves with the needed annotators and keep the
        # per-sentence structure.
        properties = {'annotators': 'tokenize,ssplit,pos'}
        response = self.parser.api_call(text, properties=properties)
        tagged_text = [
            [(token['word'], token['pos']) for token in sentence['tokens']]
            for sentence in response['sentences']
        ]
        if as_tuple_list:
            return tagged_text
        # Flatten to "word|POS" tokens, sentences joined by [ENDSENT].
        joined_sentences = [
            ' '.join(tuple2str(pair, self.separator) for pair in sentence)
            for sentence in tagged_text
        ]
        return '[ENDSENT]'.join(joined_sentences)
def get_stanford_nps(line):
    """
    Get noun phrases using the Stanford tagger
    """
    parser = CoreNLPParser(url="http://localhost:9000")
    noun_phrases = []
    for tree in parser.raw_parse(line):
        for subtree in tree.subtrees():
            if subtree.label() == "NP":
                noun_phrases.append(" ".join(subtree.leaves()))
    return noun_phrases
def ed_rip(word: str):
    """Lemmatize *word* and report whether the lemma POS-tags as a noun.

    Returns:
        (ripword, NV): the lemma (re-capitalized if the input was
        capitalized) and NV, True when the lemma tags as a noun, False
        when it tags as a verb or anything else.
    """
    NV = False
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    # Lemma of the (lowercased) word via the module-level nlp pipeline.
    nlpinfo = nlp(word.lower())
    ripword = nlpinfo.sentences[0].words[0].lemma
    # Recapitalization — idiom fix: `is not None` instead of `!= None`.
    if re.search('^[A-Z]', word) is not None:
        ripword = ripword.capitalize()
    # Tag the lemma to decide noun vs. verb.
    riptoken = nltk.word_tokenize(ripword)
    riptag = pos_tagger.tag(riptoken)[0][1]
    print(riptoken, riptag)
    # Idiom fix: startswith() already returns a bool — no `is True` needed.
    if riptag.startswith('V'):
        NV = False
    elif riptag.startswith('N'):
        NV = True
    return (ripword, NV)
def punctuation_funct(document, listterms):
    """Extract definitions signalled by punctuation around known terms.

    Two patterns per sentence ``i`` and term ``j`` (trailing char stripped):
      1. "term:" — everything after the colon is the definition;
      2. "term," — the span between the term and a following VERB preceded
         by a comma is the definition.
    Matches are printed and the term recorded in ``definiendums`` so each
    term is reported at most once.
    """
    # Runtime string kept as-is (Spanish status message).
    print(
        'Definición por punctuación [:, termino seguido de coma y acabando en coma verbo'
    )
    text = str()
    definiendum = str()
    definiendums = list()  # terms already reported (dedupe)
    for i in document:
        for j in listterms:
            term = j[:-1]  # strip the trailing character (e.g. newline)
            if (len(i) > 1):
                if (term + ':' in i):
                    # Pattern 1: "term: definition"
                    ind = i.index(':')
                    after = i[ind + 1:]
                    if (len(after) > 1 and term not in definiendums):
                        definiendum = term
                        definition = after
                        definiendums.append(definiendum)
                        print(definiendum, '---->', definition)
                elif (term + ',' in i):
                    # Pattern 2: "term, ... , VERB"
                    indterm = i.index(term)
                    # Is the character right after the term a comma?
                    if (',' in i[indterm + len(term):indterm + len(term) + 1]):
                        #print('-')
                        front = i[indterm:-1]
                        pos_tagger = CoreNLPParser('http://localhost:9003',
                                                   tagtype='pos')
                        tag = pos_tagger.tag(i.split(' '))
                        for t in tag:
                            if (t[1] == 'VERB'):
                                #print(front)
                                if (t[0] in i):
                                    #print(t[0])
                                    indverb = i.index(t[0])
                                    # Verb must be preceded by ", " — checked
                                    # two characters back from the verb.
                                    if (i[indverb - 2] == ','):
                                        definiendum = term
                                        definition = i[indterm + len(term) +
                                                       1:indverb]
                                        if (len(definiendum) > 1
                                                and len(definition) > 1
                                                and definiendum
                                                not in definiendums):
                                            definiendums.append(definiendum)
                                            print(definiendum, '---->',
                                                  definition)
def __init__(self):
    """Wire up the CoreNLP HTTP client, a WordNet lemmatizer, and offsets.

    The jar-based Stanford parser/NER/POS setups referenced in the original
    (now-removed) commented-out code were superseded by the HTTP CoreNLP
    server; everything goes through ``self.fullModule``.

    Raises:
        SystemExit: when the CoreNLP server is not reachable.
    """
    self.fullModule = CoreNLPParser(url='http://localhost:9000')
    self.customProperties = {
        'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse'
    }
    self.lemma = WordNetLemmatizer()
    self.CharacterOffsetEnd = 0
    self.CharacterOffsetBegin = 0
    # Probe the server by parsing a throwaway sentence, so a missing server
    # fails fast with a helpful message.
    try:
        # BUG FIX: the original called self.parser("A random sentence"),
        # but no `parser` attribute is assigned anywhere in this __init__
        # (only `fullModule`), so the probe raised AttributeError instead
        # of actually testing connectivity. NOTE(review): assumes no class
        # attribute named `parser` exists elsewhere — confirm.
        next(self.fullModule.raw_parse("A random sentence"))
    except requests.ConnectionError as err:
        print(
            "The server doesn't seem to be running, make sure you start it"
        )
        raise SystemExit