class StanfordSentimentAnalyzer(SentimentAnalyzer): def __init__(self): self.snlp = StanfordCoreNLP('http://localhost:9000') def lemmatize(self, text): lemmatized = self.snlp.annotate(text, properties={ 'annotators': 'tokenize, ssplit, pos, lemma', 'outputFormat': 'json' }) sentence = lemmatized['sentences'][0] tokens = [x['lemma'] for x in sentence['tokens']] return ' '.join(tokens) def get_analyzer_type(self): return 'snlp' def get_sentiment_from_text(self, text): nearest_ascii = unidecode.unidecode(text) nearest_ascii = ' '.join(nearest_ascii.split()) resp = self.snlp.annotate(nearest_ascii, properties={ 'timeout': '50000', 'annotators': 'tokenize, ssplit, pos, lemma, sentiment', 'outputFormat': 'json' }) sentiment = np.zeros(2) tokens = [] # logging.debug(resp) for sentence in resp['sentences']: i = int(sentence['sentimentValue']) sentiment += np.array([max(i - 2, 0), max(2 - i, 0)]) tokens += [x['lemma'] for x in sentence['tokens']] return sentiment, tokens
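# Usage sketch for the analyzer above, assuming a CoreNLP server is already running on
# localhost:9000 and that SentimentAnalyzer, numpy and unidecode are importable as in the
# class definition; the sample sentence is illustrative only.
if __name__ == '__main__':
    analyzer = StanfordSentimentAnalyzer()
    scores, lemmas = analyzer.get_sentiment_from_text("The plot was thin, but the acting was wonderful.")
    # scores is a length-2 numpy vector: index 0 accumulates positive weight
    # (sentimentValue > 2) and index 1 negative weight (sentimentValue < 2), per sentence.
    print(scores, lemmas)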
def test_stanford_corenlp_server():
    '''Tests the connection to the Stanford CoreNLP server.'''
    try:
        nlp = StanfordCoreNLP('http://localhost:9000')
        nlp.annotate("HOLA")
    except Exception:
        assert False, "Connection error: could not reach the StanfordCoreNLP server"
class Parser: def __init__(self, coreNLPServer ='http://localhost:9000'): self.nlp = StanfordCoreNLP('http://localhost:9000') def word_list(self, text): nlp_output = self.nlp.annotate(text, properties={ 'annotators': 'tokenize,ssplit', 'outputFormat': 'json' }) word_array = [] for sentence in nlp_output['sentences']: for w in sentence['tokens']: word_array.append(w['word'].lower()) return word_array def parse_tree(self, text, binary=False, preprocessed=False): nlp_output = self.nlp.annotate(text, properties={ 'annotators': 'tokenize,ssplit,pos,parse', 'outputFormat': 'json', 'parse.binaryTrees': 'true' }) if type(nlp_output) == str: nlp_output = json.loads(nlp_output, strict=False) if len(nlp_output['sentences']) > 1: #merge trees from sentences tree_string = "(Top " for s in nlp_output['sentences']: p_tree = Tree.fromstring(s['parse']) tree_string += str(p_tree[0]) tree_string += ")" merged_tree = Tree.fromstring(tree_string) else: #no merging required merged_tree = Tree.fromstring(nlp_output['sentences'][0]['parse']) #remove root merged_tree = merged_tree[0] if binary: nltk.treetransforms.chomsky_normal_form(merged_tree) if preprocessed: merged_tree = preprocess_parse_tree(merged_tree) return merged_tree def draw_parse_tree(self, parse_tree): nltk.draw.tree.draw_trees(parse_tree)
def anaphora(text): nlp = StanfordCoreNLP('http://192.168.54.210:9000/') output = nlp.annotate(text, properties={ 'annotators': 'tokenize,ssplit,pos,depparse,parse,coref', 'outputFormat': 'text'}) sents = nltk.sent_tokenize(text) a=[] for sent in sents: a.append(sent.split()) output = str(output.replace('\r','').replace('\t','')) #output = output.split('Coreference set:', 1)[1] output = output.split('Coreference set:') #output = str(output.replace('\r','').replace('\t','')) #output = output.split('\n'); for out in output[1:]: #print out out = str(out.replace('\r','').replace('\t','')) out = out.split('\n') for i in out[1:-1]: i = i.split(', that is:') toFrom = i[0].split('->') fromSent , fromStart, fromEnd = sentenceRange(toFrom[0]) toSent , toStart, toEnd = sentenceRange(toFrom[1]) fromText , toText = fromTo(i[1]) if len(toText.split()) > 1: toText = shorten(toText) toText = [toText] #a[fromSent - 1][fromStart - 1:fromEnd - 1] = a[toSent - 1][toStart - 1:toEnd - 1] a[fromSent - 1][fromStart - 1:fromEnd - 1] = toText return a
def ner(text):
    nlp = StanfordCoreNLP('http://localhost:8098/')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,pos,ssplit,ner,lemma',
        'outputFormat': 'json',
    })
    return output['sentences']
class CoreNLP: """Used to initialize the Stanford Core NLP in servlet mode and then connect to it using a socket""" mongo = MongoClient() mongo_db = mongo.get_database('dependencies') def __init__(self, timeout=15000, port=9000, buffer_size=4096): """Used to initialize the StanfordAPI object with the host, port and buffer""" # self.host = socket.gethostname() self.port = str(port) # self.timeout = str(timeout) # self.buffer = str(buffer_size) # self.process = Popen( # args=['java', '-mx4g', '-cp', 'commons/corenlp/*', 'edu.stanford.nlp.pipeline.StanfordCoreNLPServer', # '-port', self.port, '-timeout', self.timeout]) # time.sleep(5) self.nlp = StanfordCoreNLP('http://localhost:' + self.port) def parse(self, text): dobj = self.mongo_db.get_collection('dependency').find_one({'text': text}) if not dobj or dobj['deps'] == 'CoreNLP request timed out. Your document may be too long.': output = self.nlp.annotate(text, properties={ 'annotators': 'tokenize,ssplit,pos,depparse,parse,coref', 'coref.algorithm': 'neural', 'outputFormat': 'json', }) dep = {'text': text, 'deps': output} self.mongo_db.get_collection('dependency').insert_one(dep) return output else: return dobj['deps']
class CoreNLP(object): def __init__(self): self.corenlp = StanfordCoreNLP('http://localhost:9000') def tokenize_sentence(self, sentence): doc = self.corenlp.annotate( sentence, properties={ 'annotators': 'tokenize,lemma,ssplit,pos,depparse,parse', 'outputFormat': 'json' }) s = doc['sentences'][0] return self.structure_tokens(s) def structure_tokens(self, sentence): words = [] for token in sentence['tokens']: words.append({ "pos": token['pos'], "token": token['word'], "links": [], "lemma": token['lemma'] }) for dep in sentence['enhancedPlusPlusDependencies']: words[dep['governor'] - 1]['links'].append( [dep['dep'], (dep['dependent'] - 1)]) return words
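# Usage sketch for the wrapper above (assumes the localhost:9000 server is running;
# the sentence is illustrative). Each returned entry carries the token, its POS tag,
# its lemma, and the outgoing enhanced++ dependency links collected by structure_tokens().
if __name__ == '__main__':
    corenlp = CoreNLP()
    for w in corenlp.tokenize_sentence('The quick brown fox jumps over the lazy dog.'):
        print(w['token'], w['pos'], w['lemma'], w['links'])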
def making_parsed_tree(sentiment_code, file_name):
    splited_sentence_first = []
    parsed_sentence_first = []
    # NOTE: the server address was redacted in the original source.
    pcn = StanfordCoreNLP('http://*****:*****')

    # The body of this helper was partially lost in the original; the @-mention pattern
    # below is a reconstruction, the URL pattern and return are original.
    def about_symbol(text):
        text = re.sub(r"@\S+", '', text)
        text = re.sub(r'http\S+', '', text)
        return text

    for a in tqdm(range(len(df_amazon))):
        tweet_txt = about_symbol(text[a])
        if label[a] == sentiment_code:
            if len(tweet_txt) > 3:
                tweet_txt = " ".join(tweet_txt.split())
                tweet_txt = contractions.fix(tweet_txt)
                doc = nlp(tweet_txt)  # `nlp` is a sentence-splitting pipeline defined elsewhere in the module
                splited_sentence_second = []
                parsed_sentence_second = []
                for sentence in doc.sentences:
                    temp = []
                    for token in sentence.tokens:
                        temp.append(token.text)
                    sum_text = " ".join(temp)
                    sum_text = about_symbol(sum_text)
                    output = pcn.annotate(sum_text, properties={
                        'annotators': 'parse',
                        'outputFormat': 'json'
                    })
                    parsed_sent = output['sentences'][0]['parse']
                    parsed_sent = " ".join(parsed_sent.split())
                    parsed_sent = parsed_sent.replace('(', '<')
                    parsed_sent = parsed_sent.replace(')', '>')
                    parsed_sentence_second.append(parsed_sent)
                    splited_sentence_second.append(sum_text)
                    # print(parsed_sent)
                splited_sentence_first.append(splited_sentence_second)
                parsed_sentence_first.append(parsed_sentence_second)
    sent_json['splited_sentence'] = []
    sent_json['parsed_sentence'] = []
    sent_json['original_sentence'] = []
    sent_json['splited_sentence'].append(splited_sentence_first)
    sent_json['parsed_sentence'].append(parsed_sentence_first)
    sent_json['original_sentence'].append(tweet_txt)
    with open(file_name, 'w') as out_file:
        json.dump(sent_json, out_file, indent=4)
def extract_triples_openie(): # run corenlp server from shell # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "openie" -port 9000 -timeout 30000 # http://corenlp.run/ nlp = StanfordCoreNLP("http://localhost:9000") output = nlp.annotate( text, properties={ #'annotators': 'tokenize, ssplit, pos, depparse, parse, openie', 'annotators': 'openie', 'outputFormat': 'json' }) #print(output['sentences'][0].keys) for sentence in output['sentences']: for result in sentence['openie']: print("{" + result['subject'] + ", " + result['relation'] + ", " + result['object'] + "}") # RUN OpenIE extraction #extract_triples_openie()
def annotate(text, url=None, properties=None):
    if url is None:
        url = NLP_SERVER
    if properties is None:
        properties = NLP_PROPERTIES
    nlp = StanfordCoreNLP(url)
    return nlp.annotate(text, properties)
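# Minimal usage sketch for the thin wrapper above. The explicit url/properties call is shown
# so the module-level NLP_SERVER / NLP_PROPERTIES defaults are not required; the values
# here are illustrative and a CoreNLP server is assumed to be running.
if __name__ == '__main__':
    result = annotate('CoreNLP makes annotation easy.',
                      url='http://localhost:9000',
                      properties={'annotators': 'tokenize,ssplit,pos', 'outputFormat': 'json'})
    print(result['sentences'][0]['tokens'][0]['pos'])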
def annotate_story(text, name, corenlp_url, props=neural_props): out_path = os.path.join('data', 'writing-prompts', 'annotations', f'{name}.json') # Remove <newline> cleaned_text = [] for token in text.split(): if token != '<newline>': cleaned_text.append(token) cleaned_text = ' '.join(cleaned_text) if os.path.exists(out_path): return nlp = StanfordCoreNLP(corenlp_url) try: annotation = nlp.annotate(cleaned_text, properties=props) except requests.exceptions.ConnectionError as e: logger.error(f'Connection Error for {name}: {e}.') return if isinstance(annotation, str): logger.error(f'Error for {name}: {annotation}.') # Let's try a statistical approach if 'Error making document' in annotation and props[ 'coref.algorithm'] == 'neural': logger.info(f'Switching to statistical coref for {name}') annotate_story(text, name, corenlp_url, props=stats_props) else: logger.info(f'Please check {name}') return else: with open(out_path, 'w') as f: json.dump(annotation, f)
def tokenize_and_tag(idx, sentence): stanford_corenlp = StanfordCoreNLP(corenlp_url) tries = 0 while True: try: annotation = stanford_corenlp.annotate(sentence.encode('utf8'), properties={ 'annotators': 'tokenize,pos,ner', 'outputFormat': 'json' }) assert type(annotation) == dict break except Exception: time.sleep(1) tries += 1 if tries == 10: print "Failed for %s" % sentence return (idx, None, None, None) pass tokens, pos_tags, ner_tags = [], [], [] for sentence in annotation['sentences']: tokens.extend([token['word'] for token in sentence['tokens']]) pos_tags.extend([token['pos'] for token in sentence['tokens']]) ner_tags.extend([token['ner'] for token in sentence['tokens']]) return (idx, tokens, pos_tags, ner_tags)
def get_pt_features_coreNLP(doc, ignoreleaf=True): en = doc.encode('utf-8') de = en.decode('utf-8') doc = de chars_to_remove = ['{', '}', '(', ')'] rx = '[' + re.escape(''.join(chars_to_remove)) + ']' doc = re.sub(rx, '', doc) nlp = StanfordCoreNLP('http://localhost:9000') sentences = sent_tokenize(doc) ptree_features = list() for sentence in sentences: try: if sentence != "" and len( word_tokenize(sentence)) <= 80: # less than 50 words output = nlp.annotate(sentence, properties={ 'annotators': 'parse', 'outputFormat': 'json' }) parsed = (output['sentences'][0]['parse']) rules = traverse(parsed, ignoreleaf=ignoreleaf) ptree_features.append(rules) except: print('Problem in parsing sentece = %s' % sentence) return ptree_features
class SentmentEnvironment: def __init__(self): self.reset() self.stanford = StanfordCoreNLP('http://localhost:9000') def step(self, action: str): result = self.stanford.annotate(action, properties={ 'annotators': 'sentiment', 'outputFormat': 'json', 'timeout': '5000' }) # Result types from CoreNLP: # negative: 1; # neutral: 2; positive: 3 s_scores = [int(s['sentimentValue']) - 2 for s in result['sentences']] reward = math.tanh(sum(s_scores)) done = len(self.history) > CONVO_LEN self.history.append(action) state = random.sample(sentences, 1)[0] return state, reward, done def reset(self): self.history = []
def sentiment_stanford(input_text): if input_text != '': nlp = StanfordCoreNLP('http://localhost:9000') res = nlp.annotate(input_text, properties={ 'annotators': 'sentiment', 'outputFormat': 'json', 'timeout': 1000, }) rows = [] for s in res["sentences"]: rows.append([s["sentimentValue"], s["sentiment"]]) df = pd.DataFrame(rows, columns=['sentiment_value', 'sentiment']) df['sentiment_value'] = df['sentiment_value'].apply(float) grouped_obj = df.groupby('sentiment') scores = {'pos': 0.0, 'neu': 0.0, 'neg': 0.0, 'compound': 0.0} for gr_name, gr_df in grouped_obj: mean_score = float(gr_df.mean()) if gr_name == 'Positive': scores['pos'] = mean_score elif gr_name == 'Negative': scores['neg'] = mean_score elif gr_name == 'Neutral': scores['neu'] = mean_score scores['compound'] = scores['pos'] - scores['neg'] + (scores['neu'] / 2.) return scores else: return None
class StanfordServerParser(Parser, GenericStanfordParser): """Follow the readme to setup the Stanford CoreNLP server""" def __init__(self, host='localhost', port=9000, properties={}): url = 'http://{0}:{1}'.format(host, port) self.nlp = StanfordCoreNLP(url) if not properties: self.properties = { 'annotators': 'parse', 'outputFormat': 'json', } else: self.properties = properties def _make_tree(self, result): return Tree.fromstring(result) def parse(self, sent): output = self.nlp.annotate(sent, properties=self.properties) # Got random html, return empty tree if isinstance(output, str): return Tree('', []) parse_output = output['sentences'][0]['parse'] + '\n\n' tree = next(next(self._parse_trees_output(parse_output)))[0] return tree
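# A standalone sketch of what parse() above ultimately yields: fetch the constituency parse
# for one sentence and turn it into an nltk.Tree directly, without the GenericStanfordParser
# machinery. Assumes pycorenlp and nltk are installed and a CoreNLP server is on localhost:9000;
# parse_to_tree is a hypothetical helper, not part of the original code.
from nltk import Tree
from pycorenlp import StanfordCoreNLP

def parse_to_tree(sentence, url='http://localhost:9000'):
    nlp = StanfordCoreNLP(url)
    output = nlp.annotate(sentence, properties={'annotators': 'parse', 'outputFormat': 'json'})
    if isinstance(output, str):
        # the server returned an error page instead of JSON
        return Tree('', [])
    return Tree.fromstring(output['sentences'][0]['parse'])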
class isQuestionBasic():
    # Init Constructor
    # Initialize Stanford CoreNLP local instance on port 9000
    def __init__(self):
        self.nlp = StanfordCoreNLP('http://localhost:9000')

    # Input: Sentence to be predicted
    # Processing: 1. Uses Stanford NLP's 'annotate' method to create the parse tree
    #             2. Checks for an occurrence of 'SQ' or 'SBARQ' in the parse tree
    # Return: 1 - if the sentence is a question | 0 - if it is not
    def isQuestion(self, sentence):
        if '?' in sentence:
            return 1
        output = self.nlp.annotate(sentence, properties={
            'annotators': 'parse',
            'outputFormat': 'json',
            'timeout': 1000,
        })
        parse = output['sentences'][0]["parse"]
        # Note: the original `('SQ' or 'SBARQ') in parse` only ever tested for 'SQ';
        # both tags are checked explicitly here.
        if 'SQ' in parse or 'SBARQ' in parse:
            return 1
        else:
            return 0
def poemAnalysis(poemObject): #https://stackoverflow.com/questions/32879532/stanford-nlp-for-python #Connect to the Stanford NLP server. Note that in order to run this code, #The Stanford NLP server must be run. The local_corenlp_path above must be #changed, and the instructions must be followed from the link above. nlp = StanfordCoreNLP('http://localhost:9000') #Set the analysis to be of sentiment. pros = {'annotators': 'sentiment', 'outputFormat': 'json'} poem = poemObject.get('poems')[0] res = nlp.annotate(poem, properties=pros) totalSentiment = 0 count = 0 # calculate the average sentiment across the sentences. for s in res["sentences"]: totalSentiment = totalSentiment + float(s["sentimentValue"]) count = count + 1 averageSentiment = totalSentiment / count # 0: Very Negative # 1: Negative # 2: Neutral # 3: Positive # 4: Very Positive title = poemObject.get('title') fascicle = poemObject.get('fasc') publication_date = poemObject.get('pubdate') # returns all relevant information. return [title, fascicle, publication_date, averageSentiment, count]
def get_score(text):
    # Connect to the server
    nlp = StanfordCoreNLP('http://localhost:9000')
    text = text.lower()
    res = nlp.annotate(text, properties={
        'annotators': 'sentiment',
        'outputFormat': 'json',
        'timeout': 10000,
    })
    # Average sentiment over sentences, weighted by sentence length
    total = 0  # renamed from `sum` to avoid shadowing the builtin
    tot_words = 0
    for s in res["sentences"]:
        value = int(s["sentimentValue"]) - 2  # so that neutral is 0
        scaled_val = value * len(s["tokens"])
        tot_words += len(s["tokens"])
        total += scaled_val
    score = total / tot_words
    # If the review is "neutral", either randomly assign it as positive or negative
    # sentiment, or flag it with -1 so it can be ignored altogether
    if score == 0:
        # sent_score = randint(0, 1)
        sent_score = -1
    if score < 0:
        sent_score = 0
    elif score > 0:
        sent_score = 1
    return sent_score
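# Quick usage sketch for get_score() above (assumes the localhost:9000 server is up;
# the review text is illustrative). CoreNLP's sentimentValue runs 0 (very negative)
# to 4 (very positive), so subtracting 2 centres neutral at 0 before the per-token weighting.
if __name__ == '__main__':
    print(get_score('The food was excellent but the service was painfully slow.'))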
class StanfordServerParser(Parser, GenericStanfordParser): """Follow the readme to setup the Stanford CoreNLP server""" def __init__(self, host='localhost', port=9000, properties={}): url = 'http://{0}:{1}'.format(host, port) self.nlp = StanfordCoreNLP(url) if not properties: self.properties = { 'annotators': 'parse', 'outputFormat': 'json', } else: self.properties = properties def _make_tree(self, result): return Tree.fromstring(result) def parse(self, sent): output = self.nlp.annotate(sent, properties=self.properties) # Got random html, return empty tree if isinstance(output, unicode): return Tree('', []) parse_output = output['sentences'][0]['parse'] + '\n\n' tree = next(next(self._parse_trees_output(parse_output)))[0] return tree
def extension_headline_simple(): headline = request.args.get('q') nlp = StanfordCoreNLP('http://localhost:9000') output = nlp.annotate(headline, properties={ 'annotators': 'tokenize,openie,depparse', 'outputFormat': 'json' }) result = {"voice": [], "relationships": []} for dep in output["sentences"][0]["basicDependencies"]: if dep["dep"] == "nsubj": result["voice"].append("Active Voice: " + dep["dependentGloss"] + " -> " + dep["governorGloss"]) if dep["dep"] == "nsubj:pass": result["voice"].append("Passive Voice: " + dep["dependentGloss"] + " -> " + dep["governorGloss"]) for openie in output["sentences"][0]["openie"]: result["relationships"].append(f"Object: " + openie["object"] + ", Relation: " + openie["relation"] + ", Subject: " + openie["subject"]) return json.dumps(result)
def test_parsing(df): nlp = StanfordCoreNLP('http://localhost:9000') utt = 'There is a pub called Wildwood which serves English food. It has a low customer rating and price range - typically less than £20.' output = nlp.annotate(utt, properties={ 'annotators': 'tokenize,ssplit,pos,depparse,parse', 'outputFormat': 'json' }) # divide the parse tree into lines ptree = '\n'.join([sent['parse'] for sent in output['sentences']]) print(ptree) # if find_apposition(ptree.split('\n')): # if find_fronted_adjective_phrase(ptree.split('\n')): # if find_fronted_prepositional_phrase(ptree.split('\n')): # if find_fronted_verb_phrase(ptree.split('\n')): # if find_fronted_imperative_phrase(ptree.split('\n')): # if find_subordinate_clause_non_wh(ptree.split('\n')): # if find_subordinate_clause_wh(ptree.split('\n')): # if find_gerund_verb(ptree.split('\n')): if find_modal_verb(ptree.split('\n')): # if find_contrast(ptree.split('\n')): # if find_agreement(ptree.split('\n')): # if find_prepositions(ptree.split('\n')): # if find_existential_there(ptree.split('\n')): print(utt)
def main(): args = parse_args() parser = RstParser() parser.load('../data/model') with gzip.open('features/bc3200.pickle.gz') as fin: print('Load Brown clusters for creating features ...') brown_clusters = pickle.load(fin) core_nlp = StanfordCoreNLP('http://localhost:9000') annotate = lambda x: core_nlp.annotate( x, properties={ 'annotators': 'tokenize,ssplit,pos,lemma,parse,depparse', 'outputFormat': 'json', 'ssplit.isOneSentence': True }) edu_file_list = [ os.path.join(args.edu_file_dir, fname) for fname in os.listdir(args.edu_file_dir) if fname.endswith('.edu.txt') ] for edu_file in edu_file_list: print('Parsing {}...'.format(edu_file)) doc = create_doc_from_edu_file(edu_file, annotate_func=annotate) pred_rst = parser.sr_parse(doc, brown_clusters) tree_str = pred_rst.get_parse() pprint_tree_str = Tree.fromstring(tree_str).pformat(margin=150) with open( os.path.join(args.output_dir, os.path.basename(edu_file) + '.parse'), 'w') as fout: fout.write(pprint_tree_str)
def main(): nlp = StanfordCoreNLP('http://localhost:9000') negationWords = [] with open(sys.argv[1], 'r') as f: for line in f: text = line.rstrip() output = nlp.annotate(text, properties={'annotators': 'depparse', 'outputFormat': 'json'}) try: dep = output['sentences'][0]['basicDependencies'] for i in range(len(dep)): if dep[i]['dep'] == 'neg': word = dep[i]['dependentGloss'] if word == "n't": dep_temp = dep[i - 1] if dep_temp['dep'] == 'expl': word = dep_temp['governorGloss'] + word else: word = dep_temp['dependentGloss'] + word word = word.lower() if word not in negationWords: negationWords.append(word) except: pass if __name__ == "__main__": main()
def main(): nlp = StanfordCoreNLP('http://localhost:9000') df_dev = pd.read_csv("train_v1.csv") df_tokens = pd.DataFrame() index = 0 corenlp_json = {} total_num = set(df_dev["id"].values) for question_id in df_dev["id"].values: paragraph = df_dev.loc[df_dev["id"] == question_id, "context"].iloc[0] sentences = split_paragraph_into_sentences(paragraph) question_json_result = [] for sentence in sentences: output = nlp.annotate(sentence, properties={ 'annotators': 'tokenize, pos, lemma, ner', 'outputFormat': 'json' }) if len(output["sentences"]) > 0: question_json_result.append(output["sentences"][0]["tokens"]) corenlp_json[question_id] = question_json_result print(index) index += 1 if index % 10001 == 0: with open("corenlp_paragraph_to" + str(index) + ".json", 'w+') as fp: json.dump(corenlp_json, fp) corenlp_json = {}
def stanfordNLP(data): sentimentLevel = 0 nlp = StanfordCoreNLP('http://localhost:9000') res = nlp.annotate(data, properties={ 'annotators': 'sentiment', 'outputFormat': 'json', 'timeout': 100000 }) #print (res) for i in res["sentences"]: val = int(i["sentimentValue"]) if i["sentiment"] == "Verypositive": sentimentLevel = sentimentLevel + val + 5 elif i["sentiment"] == "Positive": sentimentLevel = sentimentLevel + val + 1 elif i["sentiment"] == "Neutral": sentimentLevel = 0 elif i["sentiment"] == "Negative": sentimentLevel = sentimentLevel - val - 1 elif i["sentiment"] == "Verynegative": sentimentLevel = sentimentLevel - val - 5 stanfordLevel = calSentimentLevel(sentimentLevel) return stanfordLevel
def Sentiment_StanfordNLP(text):
    from pycorenlp import StanfordCoreNLP
    import numpy as np
    nlpStanford = StanfordCoreNLP('http://localhost:9000')
    results = nlpStanford.annotate(text, properties={
        'annotators': 'sentiment, ner, pos',
        'outputFormat': 'json',
        'timeout': 50000,
    })
    # Map CoreNLP sentiment labels to numeric scores. (The original chained str.replace()
    # calls turned "Verynegative" into "Very-1" before the "Verynegative" rule could fire,
    # which then broke int(); a lookup table avoids that ordering problem.)
    label_to_score = {
        "Verynegative": -2,
        "Negative": -1,
        "Neutral": 0,
        "Positive": 1,
        "Verypositive": 2,
    }
    sentiment_scores = [label_to_score[s["sentiment"]] for s in results["sentences"]]
    return np.mean(sentiment_scores)
def parseDoc(docFileName, outFileName): df = pd.read_csv(docFileName, sep="\t") df.columns = ["docName","sentence"] docNames = df.docName.unique() nlp = StanfordCoreNLP('http://localhost:9000') parseResults = {} nb_sent = 0 for docName in docNames: sents = df[df['docName'] == docName]['sentence'].values start_time = time.time() #print "Number of sentences : ",len(sents) nb_sent = nb_sent + len(sents) output = {} docStr = ( ' '.join([str(x) for x in sents])) output = nlp.annotate(docStr, properties={'timeout': '100000000','annotators': 'tokenize,ssplit,pos,lemma,ner,depparse, parse,coref','outputFormat': 'json'}) #parseResults[docName] = output #print output elapsed_time = time.time() - start_time #print "Number of sentence: ",len(sents), "Number of coreference chains : ",len(output['sentences']), " Elapsed time: ",elapsed_time print nb_sent, "," , len(docNames) print nb_sent/(len(docNames) * 1.0) for docName, result in parseResults.iteritems(): print docName,"\t number of coreference chains :", len(result['corefs']) with open(outFileName, "w") as json_file: json.dump(parseResults, json_file)
def tweet_whole_sentiment(data):
    '''
    input: whole corpus
    output: dict for tweet whole-text sentiment; keys: tweet_id, values: CoreNLP
    sentimentValues on the 0-4 scale (1 -- Negative, 2 -- Neutral, 3 -- Positive)
    '''
    try:
        nlp_wrapper = StanfordCoreNLP('http://localhost:5000')
        feature_dict = {}
        for tweet in data:
            tokenized = tweet.tweet_words()
            new_words = [word for word in tokenized if word.isalnum()]
            if not new_words:
                feature_dict[tweet.tweet_id] = 2
            text = " ".join(new_words)
            annotate = nlp_wrapper.annotate(text, properties={
                'annotators': 'sentiment',
                'outputFormat': 'json',
                'timeout': 10000,
            })
            for sentence in annotate["sentences"]:
                # feature_dict[tweet.tweet_id] = sentence["sentimentValue"]
                feature_dict[tweet.tweet_id] = [sentence["sentimentValue"]]
        # print(feature_dict)
        return feature_dict
    except Exception as e:
        print("In whole sentiment exception")
        print(str(e))
def dep_parse(sentence): """ Parse a sentence using CoreNLP dependency parser and extract lemmatization and dependencies in the extradependencies mode. The function depends on CoreNLP server being set up. See http://stanfordnlp.github.io/CoreNLP/corenlp-server.html :param sentence: sentence to be parsed :return: a dictionary whose keys are word indeces and values are lemmas; a dictionary whose keys are (parent, child) tuples and values are edge labels """ nlp = StanfordCoreNLP('http://localhost:9000') annotation = nlp.annotate( (sentence), properties={ 'annotators': 'tokenize,ssplit,lemma,pos,depparse', 'outputFormat': 'json', 'depparse.extradependencies': 'MAXIMAL' }) lemmas = {} words = {} for t in annotation['sentences'][0]['tokens']: lemmas[t['index']] = t['lemma'] words[t['index']] = t['word'] lemmas[0] = 'ROOT' words[0] = 'ROOT' dependencies = rename_dependencies( annotation['sentences'][0]['collapsed-ccprocessed-dependencies']) dep_edge_dict = collections.defaultdict(str) for dep in dependencies: dep_edge_dict[(dep['governor'], dep['dependent'])] = dep['dep'] return lemmas, words, dep_edge_dict
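# Hypothetical demo of dep_parse() above; it needs the localhost:9000 server and the
# external rename_dependencies() helper that dep_parse relies on. The sentence is illustrative.
if __name__ == '__main__':
    lemmas, words, edges = dep_parse('The cat chased the mouse.')
    for (gov, dep), label in edges.items():
        print('{} -{}-> {}'.format(words[gov], label, words[dep]))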
def pdtb_preprocess(args): sections = os.listdir(PathConfig.pipe_data_dir) if not os.path.exists(PathConfig.json_data_dir): os.mkdir(PathConfig.json_data_dir) core_nlp = StanfordCoreNLP('http://localhost:9000') annotate_func = lambda x: core_nlp.annotate(x, properties={ 'annotators': 'tokenize,ssplit,pos,lemma,parse,depparse', 'outputFormat': 'json', # 'ssplit.isOneSentence': True }) instance_cnt = 0 for section in sections: raw_sec_dir = os.path.join(PathConfig.pipe_data_dir, section) if not os.path.isdir(raw_sec_dir): continue converted_sec_dir = os.path.join(PathConfig.json_data_dir, section) if not os.path.exists(converted_sec_dir): os.mkdir(converted_sec_dir) for file in os.listdir(raw_sec_dir): fpath = os.path.join(raw_sec_dir, file) pipe_instances = load_pipe_file(fpath, types=['Implicit']) basename_prefix = os.path.basename(fpath).split('.')[0] for idx, inst in enumerate(pipe_instances, 1): inst.arg1_parse_result = annotate_func(inst.arg1) inst.arg2_parse_result = annotate_func(inst.arg2) with open(os.path.join(converted_sec_dir, '{}.{}.pickle'.format(basename_prefix, idx)), 'wb') as fout: pickle.dump(inst, fout) instance_cnt += 1 if instance_cnt % 100 == 0: print(instance_cnt) print('Totally, {} instances are converted.'.format(instance_cnt))
def tokenize_and_tag(idx, sentence): stanford_corenlp = StanfordCoreNLP(corenlp_url) tries = 0 while True: try: annotation = stanford_corenlp.annotate( (sentence), properties={ 'annotators': 'tokenize,pos,ner,depparse', 'outputFormat': 'json' }) assert type(annotation) == dict break except Exception: time.sleep(1) tries += 1 if tries == 10: print("Failed for {}".format(sentence)) return (idx, None, None, None, None) pass tokens, pos_tags, ner_tags, depparse = [], [], [], [] for sentence in annotation['sentences']: tokens.append([token['word'] for token in sentence['tokens']]) pos_tags.append([token['pos'] for token in sentence['tokens']]) ner_tags.append([token['ner'] for token in sentence['tokens']]) depparse.append([(token['dependent'], token['governor']) for token in sentence['basicDependencies']]) return (idx, tokens, pos_tags, ner_tags, depparse)
def annotateText(self, text):
    nlp = StanfordCoreNLP('http://localhost:9000')
    return nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos',
        'outputFormat': 'json'
    })
def pos_tag(sentence):
    # Set up the Stanford CoreNLP server connection
    nlp = StanfordCoreNLP('http://localhost:9000')
    # Use the API to POS-tag the sentence and get JSON back as output
    output = nlp.annotate(sentence, properties={
        'annotators': 'pos',
        'outputFormat': 'json',
    })
    # dict_replacements maps each important word (one we will later find synonyms for)
    # to the POS tag recognised by WordNet.
    # POS tags (CoreNLP / Penn Treebank): NN - noun (singular), NNS - noun (plural), VB - verb
    # POS tags (WordNet): n - noun, v - verb
    dict_replacements = {}
    for sent in output['sentences']:
        for word in sent['tokens']:
            if word['pos'] == 'NNS' or word['pos'] == 'NN':
                dict_replacements[word['word']] = 'n'
    return dict_replacements
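# A follow-on sketch showing how the {word: wordnet_pos} dict returned above might be used
# to look up synonyms with NLTK's WordNet (assumes nltk and its wordnet corpus are installed;
# synonyms_for is a hypothetical helper, not part of the original code).
from nltk.corpus import wordnet as wn

def synonyms_for(dict_replacements):
    result = {}
    for word, pos in dict_replacements.items():
        lemmas = {l.name() for syn in wn.synsets(word, pos=pos) for l in syn.lemmas()}
        lemmas.discard(word)
        result[word] = sorted(lemmas)
    return result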
def corenlp_tokenize(text):
    nlp = StanfordCoreNLP('http://localhost:9000')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    print(output['sentences'][0]['parse'])
    return output
def stanford_parsing_result():
    text = """ I shot an elephant. The dog chased the cat. School go to boy. """
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    print(res['sentences'][0]['parse'])
    print(res['sentences'][2]['parse'])
def NERGetter(text):
    nlp = StanfordCoreNLP('http://192.168.54.210:9000/')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,ner',
        'outputFormat': 'text'})
    output = str(output.replace('\r', '').replace('\t', ''))
    output = output.split('[', 1)[1]
    output = str(output)
    output = output.split('\n')
    for i in output[0:-1]:
        i = i.replace(']', '')
        i = i.split('NamedEntityTag=')
        return i[1]
class StanfordAnnotator(PR): def __init__(self, annotators='tokenize,ssplit,pos,parse'):#depparse self.annotators=annotators self.nlp = StanfordCoreNLP('http://localhost:9000') def process(self, doc): output=self.nlp.annotate(doc.getText(), properties={ 'annotators': self.annotators, 'outputFormat': 'json', 'timeout': '600000' }) sents=[] tokens=[] # print "output", json.dumps(output) tStart=0 tEnd=0 for s in output['sentences']: sentText=[] sentTokens=[] for t in s['tokens']: # print t sentText.append(t['before']) sentText.append(t['originalText']) token=Annotation(t['originalText'],tEnd,tEnd,t['characterOffsetBegin'], t['characterOffsetEnd'], 'Token', doc) token.setFeature('pos', t['pos']) token.setFeature('index', t['index']) tokens.append(token) sentTokens.append(token) tEnd+=1 cStart=s['tokens'][0]['characterOffsetBegin'] cEnd=s['tokens'][-1]['characterOffsetEnd'] sentText="".join(sentText) print sentText sent=Annotation(sentText, tStart, tEnd, cStart, cEnd, 'Sentence', doc) tStart=tEnd sent.setFeature('constituency-parse', s['parse']) sent.setFeature('dep-parse', 'not implemented!') sent.setFeature('index', s['index']) # sent.setRelation('tokens',sentTokens) sents.append(sent) # pr- doc.setSents(sents) doc.setTokens(tokens)
class NLPFactory: def __init__(self): self.url = os.environ.get("CORENLP_URL", "http://localhost:9000") self.nlp = StanfordCoreNLP(self.url) def annotate(self, text): """ annotate by dependence parser Args: text (str): input data Returns: json """ # corenlp will treat sentences with full stop independently text = text.replace('.', ',').replace('!', ',') return self.nlp.annotate(text, properties={"annotators": "pos,lemma,depparse,sentiment", "outputFormat": "json"})
def resolve(self, text): sentences_all = sent_tokenize(text, 'English') for i in range(2, len(sentences_all)): text2 = sentences_all[i-2]+' '+sentences_all[i-1]+' '+sentences_all[i] print(text2) sentences = sent_tokenize(text2, 'English') print(sentences) nlp = StanfordCoreNLP('http://localhost:9000') output = nlp.annotate(text2, properties={ 'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,mention,dcoref', 'outputFormat': 'json' }) # target.write(output) # target.close() corefs = output['corefs'] cnt = 1 for key, chains in corefs.items(): substitute = '' print("\nchain number "+str(cnt)) cnt += 1 for chain in chains: # print(chain['isRepresentativeMention']+'\n') print(chain['type'] + ' ' + chain['text']) if (chain['isRepresentativeMention'] is True) and (chain['type'] != 'PRONOMINAL'): substitute = str(chain['text']) print(substitute+'\n') if (chain['type'] == 'PRONOMINAL') and (substitute != ''): sentence_num = chain['sentNum'] words = word_tokenize(sentences[sentence_num - 1], 'English') words[chain['startIndex'] - 1] = substitute new_sentence = ' '.join(words) sentences[sentence_num - 1] = new_sentence sentences_all[i-2] = sentences[0] sentences_all[i-1] = sentences[1] sentences_all[i] = sentences[2] return sentences_all
def standford_sentiment_answer(text_str): asw_sentiment = make_default_sentiment() nlp = StanfordCoreNLP('http://localhost:9000') res = nlp.annotate(text_str, properties={ 'annotators': 'sentiment', 'outputFormat': 'json', 'timeout': 20000, }) try: total_value = 0.0 for s in res["sentences"]: total_value += float(s["sentimentValue"]) asw_sentiment[s["sentiment"]] += 1 asw_sentiment['score'] = total_value return asw_sentiment except: return asw_sentiment
class NerToBratConverter(object): def __init__(self, corenlp_url='http://localhost:9000'): ''' Create Converter for converting NER annotations to Brat annotations classifier training data. To start the server checkout: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started ''' self.corenlp = StanfordCoreNLP(corenlp_url) def convertToBrat(self, text_file, ann_file): print("Processing %s" % text_file) with open(text_file) as f: text = f.read() props = { 'annotators': 'tokenize,ssplit,pos,ner', 'outputFormat': 'json'} output = self.corenlp.annotate(text, properties=props) # flatten sentences and tokens tokenlists = [s['tokens'] for s in output['sentences']] tokens = itertools.chain.from_iterable(tokenlists) count = 1 with open(ann_file, 'w', 1) as out: for token in tokens: if token['ner'] != 'O': rec = "T%d\t%s %d %d\t%s" % (count, token['ner'], token['characterOffsetBegin'], token['characterOffsetEnd'], token['originalText']) # print(rec) out.write(rec) out.write("\n") count += 1 print("Wrote %s" % ann_file) def convert_all(self, input_paths): with open(input_paths) as paths: for d in map(lambda x: x.split(','), map(lambda x: x.strip(), paths)): self.convertToBrat(d[0], d[1])
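# Usage sketch for the converter above. The file names are hypothetical; it assumes the
# localhost:9000 CoreNLP server is running with the ner annotator available, and that
# example.txt exists alongside the script.
if __name__ == '__main__':
    converter = NerToBratConverter()
    converter.convertToBrat('example.txt', 'example.ann')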
class StanfordNERApi(): ''' Make use of StanfordCoreNLP Server Extract keyword through name entity recogonition ''' def __init__(self): self.nlp = StanfordCoreNLP(NLP_SERVER) def ner_groupby_ner(self, text): response = self.nlp.annotate(text, properties={ 'annotators': 'ner,lemma', 'outputFormat': 'json' }) return self.__process_ner_groupby_ner(response) def __process_ner_groupby_ner(self, response): output_dict = dict() '''The response is generally organized as {sentences:[{tokens:[]},{}]}''' if type(response) == dict and 'sentences' in response: for sentence in response['sentences']: for item in sentence['tokens']: # we only care about ner in set TARGET_NER if item.get('ner') in TARGET_NER: if item['ner'] not in output_dict: output_dict[item['ner']] = set() output_dict[item['ner']].add(item['originalText']) # convert from set to list for further json dumps for key in output_dict: output_dict[key] = list(output_dict[key]) # convert dict to string by json dumps if len(output_dict) > 0: return json.dumps(output_dict) else: return None else: logger.warning('sentences part is not in the response from NLP server.') return None
class Preprocess(): def __init__(self, argv): self.input = "" self.output_folder = "" # output has to be a folder self.input_type = "" # Start Stanford CoreNLP Server self.nlp = StanfordCoreNLP('http://localhost:9000') # Read User Command Line opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="]) for opt, arg in opts: if opt == '-h': print("Type 'python3.5 text_preprocessing/preprocess.py -i <inputfile> -o <outputfile>' \ in run_source_code.sh file") sys.exit() elif opt in ("-i", "--ifile"): self.input = arg if os.path.exists(arg) == False: print("Input doesn't exist") sys.exit() if os.path.isdir(arg) == True: self.input_type = "dir" elif os.path.isfile(arg) == True: self.input_type = "file" elif opt in ("-o", "--ofile"): self.output_folder = arg print("Input: " + self.input +", " + self.input_type) print("Output: " + self.output_folder) def sentence_parsing(self, row_string): parsed_json = self.nlp.annotate(row_string, properties={ 'annotators': 'tokenize,ssplit,pos', 'outputFormat': 'json' }) return parsed_json def output_preprocessed_data(self, json_input, file_name): rows = [] for sent in json_input['sentences']: parsed_sent = " ".join([t['originalText'] + "/" + t['pos'] for t in sent['tokens']]) rows.append(parsed_sent) output_file_path = self.output_folder + file_name with open(output_file_path, 'a') as preprocessed_out: for r in rows: preprocessed_out.write(r + "\n") def pos_tagging(self): if self.input_type == "file": input_path_elems = self.input.split("/") file_name = "" if input_path_elems[-1] != "/": file_name = input_path_elems[-1] else: file_name = input_path_elems[-2] text_string = "" with open(self.input, 'rb') as file_input: for r in file_input: text_string = " ".join([text_string, r.strip().decode('utf-8', 'backslashreplace')]) print(self.input) parsed_json = self.sentence_parsing(text_string) self.output_preprocessed_data(parsed_json, file_name) elif self.input_type == "dir": for file_name in os.listdir(self.input): input_file_path = self.input + file_name text_string = "" with open(input_file_path, 'rb') as file_input: for r in file_input: text_string = " ".join([text_string, r.strip().decode('utf-8', 'backslashreplace')]) parsed_json = self.sentence_parsing(text_string) print(input_file_path) self.output_preprocessed_data(parsed_json, file_name)
class StanfordTFIDFApi(): ''' Make use of StanfordCoreNLP Server Extract keyword through tf-idf algorithm ''' def __init__(self): self.nlp = StanfordCoreNLP(NLP_SERVER) def __tf_by_pos(self, text, pos='N'): response = self.nlp.annotate(text, properties={ 'annotators': 'ner,lemma', 'outputFormat': 'json' }) logger.debug(json.dumps(response)) '''The response is generally organized as {sentences:[{tokens:[]},{}]}''' result = list() if type(response) == dict and 'sentences' in response: for sentence in response['sentences']: for item in sentence['tokens']: if item['pos'].startswith(pos): # only accept engish word, and not in STOPWORDS if acceptable_word(item['lemma'].lower()): result.append((item['lemma'].lower())) toks_count = Counter(result) return toks_count else: logger.warning('sentences part is not in the response from NLP server.') return Counter() def tf_idf_groupby_pos(self, text, df_cache): output = dict() output['NOUN'] = self.__tf_by_pos(text, 'N') output['VERB'] = self.__tf_by_pos(text, 'V') for pos in output: logger.debug('Computed tf for %s:' % pos + json.dumps(output['VERB'])) for word in output[pos]: '''Formula is: tf*log(N/df)''' if word in df_cache: output[pos][word] = output[pos][word]*math.log(df_cache['total_document'] /df_cache[word]) else: output[pos][word] = output[pos][word]*math.log(df_cache['total_document']) # return the top 10 words output[pos] = [word for word, count in output[pos].most_common(10)] logger.debug('Computed tf-idf for %s:' % pos + json.dumps(output[pos])) return json.dumps(output) def compute_df(self, document_list): '''Compute document frequency based on input document list''' df_cache = dict() df_output = dict() d_index = 0 for document in document_list: d_index += 1 # tokenize each document reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE) for item in reg_toks: # change each word to lower case and lemmatize item = normalise(item) if item not in df_cache: df_cache[item] = set([d_index]) else: df_cache[item].add(d_index) for item in df_cache: if acceptable_word(item): df_output[item] = len(df_cache[item]) df_output['total_document'] = len(document_list) return df_output
class StanfordRE(ReModel): def __init__(self, corpus, relationtype, modelname="stanfordre_classifier.ser"): super(StanfordRE, self).__init__() self.modelname = modelname self.pairs = {} self.corenlp_client = None self.relationtype = relationtype self.corpus = corpus def generate_data(self, corpus, modelname, pairtypes): if os.path.isfile(self.temp_dir + modelname + ".txt"): print "removed old data" os.remove(self.temp_dir + modelname + ".txt") trainlines = [] # get all entities of this document # doc_entities = [] pcount = 0 truepcount = 0 ns = 0 for sentence in corpus.get_sentences("goldstandard"): logging.info("{}".format(sentence.sid)) nt_to_entity = {} for e in sentence.entities.elist['goldstandard']: # TODO: merge tokens of entity nt = str(e.tokens[0].order) nt_to_entity[nt] = e # print nt_to_entity # ns = sentence.sid.split("s")[-1] for t in sentence.tokens: nt = str(t.order) # print nt, nt in nt_to_entity if nt in nt_to_entity: # print nt, nt_to_entity[nt], nt_to_entity[nt].type #l = [str(ns), nt_to_entity[nt].type, nt, "O", t.pos, t.text, "O", "O", "O"] # TODO: change other to entitiy name l = [str(ns), "Other", nt, "O", t.pos, t.text, "O", "O", "O"] else: # print nt, nt_to_entity l = [str(ns), "O", nt, "O", t.pos, t.text, "O", "O", "O"] trainlines.append(l) trainlines.append([""]) sentence_entities = [entity for entity in sentence.entities.elist["goldstandard"]] # logging.debug("sentence {} has {} entities ({})".format(sentence.sid, len(sentence_entities), len(sentence.entities.elist["goldstandard"]))) for pair in itertools.combinations(sentence_entities, 2): if pair[0].type == pairtypes[0] and pair[1].type == pairtypes[1] or pair[1].type == pairtypes[0] and pair[0].type == pairtypes[1]: # logging.debug(pair) if pair[0].type == pairtypes[0]: e1id = pair[0].eid e2id = pair[1].eid else: e1id = pair[1].eid e2id = pair[0].eid pair = (pair[1], pair[0]) pid = sentence.did + ".p" + str(pcount) # self.pairs[pid] = (e1id, e2id) self.pairs[pid] = pair if e2id in pair[0].targets: truepcount += 1 nt1 = str(pair[0].tokens[0].order) nt2 = str(pair[1].tokens[0].order) trainlines.append([nt1, nt2, "+".join(pairtypes)]) pcount += 1 trainlines.append([""]) ns += 1 logging.info("Writing {} lines...".format(len(trainlines))) with codecs.open(self.temp_dir + modelname + ".corp", 'w', "utf-8") as trainfile: for l in trainlines: # print l trainfile.write("\t".join(l) + "\n") logging.info("True/total relations:{}/{} ({})".format(truepcount, pcount, str(1.0*truepcount/pcount))) def write_props(self): with open(config.corenlp_dir + "roth.properties", 'r') as propfile: lines = propfile.readlines() print lines with open(config.corenlp_dir + "roth.properties", 'w') as propfile: for l in lines: if l.startswith("serializedRelationExtractorPath"): propfile.write("serializedRelationExtractorPath = {}\n".format(config.corenlp_dir + self.modelname)) elif l.startswith("trainPath"): propfile.write("trainPath = {}\n".format(self.temp_dir + self.modelname + ".corp")) else: propfile.write(l) def train(self): self.generate_data(self.corpus, self.modelname, pairtypes=self.relationtype) # java -cp classpath edu.stanford.nlp.ie.machinereading.MachineReading --arguments roth.properties if os.path.isfile(config.corenlp_dir + self.modelname): print "removed old model" os.remove(config.corenlp_dir + self.modelname) if not os.path.isfile(self.temp_dir + self.modelname + ".corp"): print "could not find training file " + config.corenlp_dir + self.modelname + ".corp" sys.exit() self.write_props() classpath = config.corenlp_dir + 
"*" srecall = ['java', '-mx3g', '-classpath', classpath, "edu.stanford.nlp.ie.machinereading.MachineReading", "--arguments", config.corenlp_dir + "roth.properties"] print " ".join(srecall) # sys.exit() srecall = Popen(srecall) #, stdout=PIPE, stderr=PIPE) res = srecall.communicate() if not os.path.isfile(config.corenlp_dir + self.modelname): print "error with StanfordRE! model file was not created" print res[1] sys.exit() else: statinfo = os.stat(config.corenlp_dir + self.modelname) if statinfo.st_size == 0: print "error with StanfordRE! model has 0 bytes" print res[0] print res[1] sys.exit() # logging.debug(res) def load_classifier(self, inputfile="slk_classifier.model.txt", outputfile="jsre_results.txt"): self.corenlp_client = StanfordCoreNLP('http://localhost:9000') # sup.relation.model= tokenkeys = set() sentencekeys = set() for d in self.corpus.documents: for s in self.corpus.documents[d].sentences: corenlpres = self.corenlp_client.annotate(s.text.encode("utf8"), properties={ 'ssplit.eolonly': True, 'openie.triple.all_nominals': True, 'openie.triple.strict': False, 'openie.max_entailments_per_clause': 500, 'annotators': 'tokenize,ssplit,pos,depparse,natlog,openie', #'annotators': 'tokenize, ssplit, pos, lemma, ner, parse, relation, openie', 'outputFormat': 'json', # 'sup.relation.model': self.modelname }) for o in corenlpres["sentences"][0]["openie"]: if "mir" in o["object"] or "mir" in o["subject"]: print "{}={}>{}".format(o["subject"], o["relation"], o["object"]) def test(self, outputfile="jsre_results.txt"): pass def get_predictions(self, corpus, examplesfile="slk_classifier.model.txt", resultfile="jsre_results.txt"): pass
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')
text = 'To summarize the sprawling, byzantine plot: warning - possible spoilers ahead - an elderly Louvre curator is murdered in the museum. Although shot in the chest, he manages to disrobe and surround himself with cryptographic clues - written in blood AND invisible ink (!) - to the reason for his death. His estranged granddaughter, who, coincidentally is a police inspector (!!) AND a cryptologist (!!!), enlists the aid of a visiting Harvard professor and symbologist (!!!!) in unraveling the multiple mysteries of: '
res = nlp.annotate(text, properties={
    'annotators': 'tokenize,ssplit,pos',
    'outputFormat': 'json',
})
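# A short follow-up showing how the JSON returned above is typically consumed:
# each entry of res['sentences'] carries a 'tokens' list whose items expose
# 'word' and 'pos' fields.
for sentence in res['sentences']:
    print(' '.join('{}/{}'.format(t['word'], t['pos']) for t in sentence['tokens']))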
#!/usr/bin/python
import cgi, cgitb
import json
cgitb.enable()  # for troubleshooting

# the cgi library gets vars from html
data = cgi.FieldStorage()

from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

text = data['text'].value
annotators = data['annotators'].value

output = nlp.annotate(text, properties={'annotators': annotators, 'outputFormat': 'json'})

# this is the actual output
print("Content-Type: text/html\n")
print(json.dumps(output))
class StanfordAnnotator(PR): def __init__(self, annotators='tokenize,ssplit,pos,parse,lemma,ner', cacheDir='./corenlp'):#depparse self.annotators=annotators self.nlp = StanfordCoreNLP('http://localhost:9000') if not os.path.exists(cacheDir): os.makedirs(cacheDir) self.cache= os.listdir(cacheDir) self.cacheDir=cacheDir def getOutput(self, doc): jsonFile=doc.getId()+'.json' output=None outfile=None if jsonFile in self.cache: outfile=open(self.cacheDir+"/"+jsonFile, 'r') output=json.load(outfile, encoding='UTF-8') else: outfile=open(self.cacheDir+"/"+jsonFile, 'w') output=self.nlp.annotate(doc.getString(), properties={ 'annotators': self.annotators, 'outputFormat': 'json', 'timeout': '600000' }, encoding='UTF-8') json.dump(output, outfile) outfile.close() return output def process(self, doc): output=self.getOutput(doc) sents=[] tokens=[] # print "output", json.dumps(output) tStart=0 tEnd=0 cStart=0 cEnd=0 text=doc.getText() for s in output['sentences']: sentText=[] sentTokens=[] for t in s['tokens']: # print t txt_bfr=t['before'] txt_tkn=t['originalText'] sentText.append(txt_bfr) sentText.append(txt_tkn) cStart=text.find(txt_tkn, cStart) cEnd=cStart+len(txt_tkn) token=Annotation(t['originalText'],tEnd,tEnd,cStart, cEnd, 'Token', doc) token.setFeature('pos', t['pos']) token.setFeature('lemma', t['lemma']) token.setFeature('ner', t['ner']) token.setFeature('index', t['index']) tokens.append(token) sentTokens.append(token) tEnd+=1 sentCStart=sentTokens[0].cStart sentCEnd=sentTokens[-1].cEnd sentText=u''.join(sentText) # print sentText sent=Annotation(sentText, tStart, tEnd, sentCStart, sentCEnd, 'Sentence', doc) tStart=tEnd sent.setFeature('constituency-parse', s['parse']) sent.setFeature('dep-parse', 'not implemented!') sent.setFeature('index', s['index']) sent.setRelation('tokens',sentTokens) sents.append(sent) # pr- doc.setSents(sents) doc.setTokens(tokens)
if __name__ == '__main__': nlp = StanfordCoreNLP('http://localhost:9000') for line in orig_file: if not line.startswith("PMID"): info = line.split('\t') pmid = info[0] ta = info[1] sentence = info[2] sentence = sentence.rstrip('\n') cleanSentence = removeBracket(sentence) extraClean = removeParenth(cleanSentence) output = nlp.annotate(extraClean,properties={ 'annotators':'tokenize,ssplit,pos,depparse,parse', 'outputFormat' : 'json'}) try: result = output['sentences'][0]['parse'] getPOS = extractPOS(result) fixBioName_POS = bioName(getPOS,sentence) newLine = pmid+'\t'+ta+'\t'+sentence+'\t'+fixBioName_POS+'\n' newFile.write(newLine) except: pass orig_file.close() newFile.close()
keep_all_dependencies = False
sent_count = 0
encoding = "utf-8"
with codecs.open(output_file, "a", "utf-8") as outfile:
    outfile.write("{\"corpus\":[\n")
    for line in codecs.open(input_file, "r", encoding):
        # if encoding.lower != "utf-8":
        #     line = line.encode("utf-8")
        sent_count += 1
        print("Processing sent #{:d}".format(sent_count))
        output = nlp.annotate(line.replace("\n", "").strip(), properties, encoding="utf-8")
        if isinstance(output, str):
            json_obj = utils.process_json(output, sent_count, keep_all_dependencies)
        else:
            json_obj = utils.process_json(json.dumps(output, ensure_ascii=False),
                                          sent_count, keep_all_dependencies)
        json_str = json.dumps(json_obj, ensure_ascii=False)
        # separate entries with a comma only between items, so the closing "]}" is not
        # preceded by a trailing comma (which would make the output invalid JSON)
        if sent_count > 1:
            outfile.write(",\n")
        outfile.write(json_str)
    outfile.write("\n]}")
print("Done!")
class StanfordMethods: def __init__(self): self.webparser = StanfordCoreNLP('http://localhost:9020') self.load_pickle_file() #To use this parser an instance has to be started in parallel: #Download Stanford CoreNLP from: https://stanfordnlp.github.io/CoreNLP/index.html #Extract anywhere and execute following command: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9020 def webparse(self, text): return self.webparser.annotate(text, properties={ 'timeout': '500000', 'annotators': 'tokenize,ssplit,truecase,pos,depparse,parse,sentiment', 'outputFormat': 'json' }) def load_pickle_file(self): try: self.known_ids = pickle.load(open(os.path.join(_pickled_data_folder, _stanford_pickle_database_file), 'rb')) print('loaded known_ids pickle') #print(self.known_ids) except: print('Stanford pickle does not exist') self.known_ids= {} def store_pickle_file(self): with open(os.path.join(_pickled_data_folder, _stanford_pickle_database_file), 'wb') as f: pickle.dump(self.known_ids, f, pickle.HIGHEST_PROTOCOL) def getStanfordInfo(self, type, body_id, head_id, text, max_number_of_sentences=99): ''' reads info from file else calculates' :param type: either 'body' or 'headline'. type body will be stored, type headline will always be parsed, because it is not unique :param body_id: element id of which the stanford information is needed :param head_id :param text: text of which the stanfordinformation shall be extracted :param max_number_of_sentences: number of sentences which shall be parsed at maximum :return: [ [nouns], [verbs], [negations count, [root_dist]], [sentiment_value] ] ''' if (type == 'body') and (body_id in self.known_ids): return self.known_ids[body_id] elif (type == 'headline') and (head_id+body_id in self.known_ids): return self.known_ids[str(head_id) + str(body_id)] else : try: result = self.extract_stanford_information(text, max_number_of_sentences) if type == 'body': self.known_ids[body_id] = result elif type == 'headline': self.known_ids[str(head_id) + str(body_id)] = result return result except Exception as e: self.store_pickle_file() print('problem with id: ' + str(body_id) + " type:" + type) print(text) print(e) raise e def extract_stanford_information(self, text, max_number_of_sentences=99): ''' Stanford-parse the sentence :parameter: text, max_number of sentences to be parsed :return: [ [nouns], [verbs], [negations count, [root_dist]], [sentiment_value] ] ''' #since the nlp parser might get some problems with long texts, #I decided to divide the text into sentences before parsing it with stanfordparser nouns = [] verbs = [] # sentiment_list = [] sentiment_value_list = [] negation_count = 0 root_dist = [] current_sentence = 0 number_of_words = 0 _refuting_words = [ 'fake', 'fraud', 'hoax', 'false', 'deny', 'denies', # 'refute', 'not', 'despite', 'nope', 'doubt', 'doubts', 'bogus', 'debunk', 'pranks', 'retract' ] for raw_sentence in tokenizer.tokenize(text): try: tagged_text = dict(self.webparse(raw_sentence)) # Normally only one sentence should be in a raw_Sentence # - but the nltk PunktSentence Tokenizer might have missed a split for sentence in tagged_text['sentences']: current_sentence += 1 # Extract nouns and verbs sentiment_value_list.append(int(sentence['sentimentValue'])) for token in sentence['tokens']: if 'NN' in token['pos']: nouns.append(token['originalText']) elif 'V' in token['pos']: verbs.append(token['originalText']) if token['originalText'] in _refuting_words: negation_count += 1 root_dist.append(calculate_distance(sentence, 
find_root_node(sentence), token['index'])) # Count negations ''' # This only works correct, when at least on sentence can be parsed per text for dependency in sentence['basicDependencies']['dep']: try: # dep, dependent, dependentGloss, governor, governorGloss = dependency.values() if dependency == 'neg': negation_count += 1 #calculate distance to negated words #find head token #find negated token print('Negated token: ' + sentence['tokens'][i-1]['originalText']) distance = calculate_distance(tagged_text, 'not', 'I') # sentiment_list.append(sentence['sentiment']) # Skip sentence if problem occurs while parsing except Exception as e: print('Error parsing sentence: ' + raw_sentence) print(e) #raise e continue ''' number_of_words += 1 #only parse number of given sentences per call if current_sentence == max_number_of_sentences: break except Exception as e: print('Error parsing sentence: ' + raw_sentence) print(e) #raise e continue #ToDo: Think about good way to combine the distance if negation_count >= 1: negation = [negation_count, root_dist] else: negation = [-1, -1] #calculate average words per sentence words_per_sentence = number_of_words/current_sentence return nouns, verbs, negation, sentiment_value_list, words_per_sentence def check_if_already_parsed(self, id): return id in self.known_ids
class BratToNerConverter(object): def __init__(self, corenlp_url='http://localhost:9000'): ''' Create Converter for converting brat annotations to Core NLP NER CRF classifier training data. To start the server checkout: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started ''' self.corenlp = StanfordCoreNLP(corenlp_url) def convert(self, text_file, ann_file): text, tree = self.parse(text_file, ann_file) props = { 'annotators': 'tokenize,ssplit', 'outputFormat': 'json'} if text[0].isspace(): text = '.' + text[1:] # Reason: some tools trim/strip off the white spaces # which will mismatch the character offsets output = self.corenlp.annotate(text, properties=props) records = [] for sentence in output['sentences']: continue_ann, continue_ann_en = None, None for tok in sentence['tokens']: begin, tok_end = tok['characterOffsetBegin'], tok['characterOffsetEnd'] label = 'O' if begin in tree: node = tree[begin] if len(node) > 1: print("WARN: multiple starts at ", begin, node) if tok_end in node: node = {tok_end: node[tok_end]} # picking one print("Chose:", node) ann_end, labels = node.items()[0] if not len(labels) == 1: print("WARN: Duplicate labels for token: %s, label:%s. Using the first one!" % (tok['word'], str(labels))) if accept_labels is not None and labels[0] in accept_labels: label = labels[0] if tok_end == ann_end: # annotation ends where token ends continue_ann = None elif tok_end < ann_end and label != 'O': print("Continue for the next %d chars" % (ann_end - tok_end)) continue_ann = label continue_ann_end = ann_end elif continue_ann is not None and tok_end <= continue_ann_end: print("Continuing the annotation %s, %d:%d %d]" % (continue_ann, begin, tok_end, continue_ann_end)) label = continue_ann # previous label is this label if continue_ann_end == tok_end: # continuation ends here print("End") continue_ann = None yield "%s\t%s" % (tok['word'], label) #yield "" # end of sentence yield "" # end of document def parse(self, txt_file, ann_file): with open(txt_file) as text_file, open(ann_file) as ann_file: texts = text_file.read().decode('utf8') text_file.close() #texts = text_file.read() anns = map(lambda x: x.strip().split('\t'), ann_file) anns = filter(lambda x: len(x) > 2, anns) # FIXME: ignoring the annotatiosn which are complex anns = filter(lambda x: ';' not in x[1], anns) # FIXME: some annotations' spread have been split into many, separated by ; ignoring them def __parse_ann(ann): spec = ann[1].split() name = spec[0] markers = list(map(lambda x: int(x), spec[1:])) #t = ' '.join([texts[begin:end] for begin,end in zip(markers[::2], markers[1::2])]) t = texts[markers[0]:markers[1]] if not t == ann[2]: print("Error: Annotation mis-match, file=%s, ann=%s" % (txt_file, str(ann))) return None return (name, markers, t) anns = map(__parse_ann, anns) # format anns = filter(lambda x: x, anns) # skip None # building a tree index for easy accessing tree = {} for entity_type, pos, name in anns: begin, end = pos[0], pos[1] if begin not in tree: tree[begin] = {} node = tree[begin] if end not in node: node[end] = [] node[end].append(entity_type) # Re-read file in without decoding it text_file = open(txt_file) texts = text_file.read() text_file.close() return texts, tree def convert_all(self, input_paths, output): with open(input_paths) as paths, open(output, 'w') as out: for p in map(lambda x: x.strip(), paths): d = p.split(',') print(d) for line in self.convert(d[0], d[1]): out.write(line) out.write("\n") out.write("\n") # end of document
count=0 flag=0 for line in data: # print line if(line.isspace()): continue elif count==5 or flag: break else: count+=1 line = line.lstrip().rstrip() # line = remove_non_ascii(line) # line = filter(lambda x: x in printable, line) output = nlp.annotate(line, properties={ 'annotators': 'tokenize,ssplit,pos,lemma,ner', 'outputFormat': 'json' }) # print 'output: ', output tagged = [] for sentence in output['sentences']: for token in sentence['tokens']: tagged.append((token['originalText'], token['ner'])) # print tagged name = [] for (el1,el2) in tagged: if el2 == u'PERSON': print "Name identified: "+ el1 name.append(el1)
class CoreNLPParser(JournalParser): CORENLP_PARSER = "edu.stanford.nlp.pipeline.CoreNLPServer" def __init__(self, **kwargs): super(CoreNLPParser, self).__init__(**kwargs) self.corenlp = StanfordCoreNLP(kwargs['corenlp_url'] ) self.props = { 'annotators': 'tokenize,ssplit,lemma,pos,ner', 'outputFormat': 'json', 'ner.useSUTime': False, # dont want SUTime model 'ner.applyNumericClassifiers': False, # Dont want numeric classifier } if kwargs.get('ner_model'): # set NER model from CLI if not os.path.exists(kwargs.get('ner_model')): print('Error: Could not find NER model %s.' % kwargs.get('ner_model')) sys.exit(1) self.props['ner.model'] = kwargs['ner_model'] print("CoreNLP Properties : ", self.props) def parse_names(self, text, meta): if type(text) != str: text = text.encode('utf8') #, errors='ignore') if text[0].isspace(): # dont strip white spaces text = '.' + text[1:] output = self.corenlp.annotate(text, properties=self.props) # flatten sentences and tokens tokenlists = [s['tokens'] for s in output['sentences']] tokens = itertools.chain.from_iterable(tokenlists) names = [] for token in tokens: if token['ner'] != 'O': name = { 'label': token['ner'], 'begin': token['characterOffsetBegin'], 'end': token['characterOffsetEnd'], 'text': token['originalText'], 'source': 'corenlp' } names.append(name) # Handle multi-word tokens: # Merge any adjacent Target tokens, if of the same type and # separated by a space, into one span. names.sort(key=lambda x: int(x['begin'])) new_names = [] skip_names = [] for n in names: if n in skip_names: continue next_name = [n2 for n2 in names if \ n['label'] == 'Target' and n2['label'] == 'Target' and int(n2['begin']) == int(n['end']) + 1] if len(next_name) > 0: print('%s: Merging %s and %s' % (meta['resourceName'], n['text'], next_name[0]['text'])) n['text'] += ' ' + next_name[0]['text'] n['end'] = next_name[0]['end'] skip_names.append(next_name[0]) # Either way, save this one new_names.append(n) if len(names) != len(new_names): print('%d -> %d NERs' % (len(names), len(new_names))) if names: meta['ner'] = new_names meta['X-Parsed-By'].append(CoreNLPParser.CORENLP_PARSER) meta['sentences'] = output['sentences'] return meta
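# Usage sketch for CoreNLPParser above. The keyword arguments mirror the constructor's kwargs
# and the metadata dict shape is inferred from parse_names(); both are illustrative, and
# JournalParser is assumed to accept these kwargs unchanged.
if __name__ == '__main__':
    parser = CoreNLPParser(corenlp_url='http://localhost:9000')
    meta = {'resourceName': 'example-doc', 'X-Parsed-By': []}
    meta = parser.parse_names('Olympus Mons is the largest volcano on Mars.', meta)
    print(meta.get('ner', []))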