def getSentiment(text):
    ## connect to CoreNLP server
    host = "http://localhost"
    port = "9000"
    nlp = StanfordCoreNLP(host + ":" + port)

    # annotate text
    output = nlp.annotate(text,
                          properties={
                              "outputFormat": "json",
                              "annotators": "sentiment"
                          })

    # grab sentiment
    total_sent = 0
    n = 0
    for sen in output['sentences']:
        total_sent += int(sen["sentimentValue"])
        n += 1

    # avoid divide by 0
    if n != 0:
        return total_sent / n
    else:
        raise Exception("Comment length 0")
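

# Usage sketch (not part of the original snippet): assumes a CoreNLP server with the
# sentiment annotator is running at http://localhost:9000 and that StanfordCoreNLP
# has been imported from pycorenlp as in the examples further below.
if __name__ == "__main__":
    # per-sentence sentimentValue runs from 0 (very negative) to 4 (very positive),
    # so getSentiment returns the mean over all sentences
    print(getSentiment("The staff were friendly. The room was dirty."))
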
def process_stanford(data_set, restart=False):
    # load count, i.e. how many documents have been parsed successfully
    counter = Counter(data_set.stanford_path, restart=restart)

    # prepare to use Stanford parser
    nlp = StanfordCoreNLP(STANFORD_SERVER)

    start = time.time()
    while counter.count < data_set.data['count']:
        doc_start = time.time()

        # read file
        text = fix(data_set.get_text(counter.count))

        # call stanford annotate api
        annotation = nlp.annotate(text,
                                  properties={
                                      'annotators': 'lemma,parse',
                                      'outputFormat': 'json'
                                  })

        if isinstance(annotation, str):
            print('Error returned by stanford parser:', annotation)
            sys.exit()

        # pickle the result
        data_set.save_stanford_annotation(counter.count, annotation)

        # save the new count
        counter.increment()

        # print time information
        print('%i, %i%% %.2f seconds (%.0f total)' %
              (counter.count - 1, 100 * counter.count / data_set.data['count'],
               time.time() - doc_start, time.time() - start))
Example no. 3
def pos(text):
    host = "http://localhost"
    port = "9000"
    nlp = StanfordCoreNLP(host + ":" + port)
    lst = []
    #print("POS", text)
    output = nlp.annotate(text,
                          properties={
                              "outputFormat": "json",
                              "annotators": "pos"
                          })
    #output = (output["sentences"][0]["tokens"])
    lst2 = []
    for i in output["sentences"]:
        lst2 = lst2 + i["tokens"]
    #print(output)
    interest = [
        "JJ", "JJR", "JJS", "NN", "NNP", "NNPS", "NNS", "VB", "VBD", "VBG",
        "VBZ", "VBN", "VBP", "CD"
    ]

    # keep only tokens with an interesting POS tag, plus explicit negations
    for i in lst2:
        if i["pos"] in interest or i["originalText"] in ("n't", "not"):
            lst.append([i["originalText"], i["index"], classify(i)])
    #print("POS OUTPUT", lst)
    return lst
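
# Illustrative note (assumption, not from the original code): pos() depends on the
# classify() helper shown further below and on a CoreNLP server at localhost:9000.
# Each returned entry is [originalText, 1-based token index, coarse category], e.g.
#
#     pos("The battery died quickly")
#     # -> [['battery', 2, 'Entity'], ['died', 3, 'Action/Service']]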
Example no. 4
def sentiment(text):
    host = "http://localhost"
    port = "9000"
    nlp = StanfordCoreNLP(host + ":" + port)
    lst = []
    output = nlp.annotate(text,
                          properties={
                              "outputFormat": "json",
                              "annotators": "sentiment"
                          })
    #output = (output["sentences"][0]["tokens"])
    #output = (output["sentences"][0]['entitymentions'])
    """for i in output:
        lst.append([i["text"], i["ner"]])
    return(lst)"""
    return output
Example no. 5
def ner(text):

    host = "http://localhost"
    port = "9000"
    nlp = StanfordCoreNLP(host + ":" + port)
    lst = []
    output = nlp.annotate(text,
                          properties={
                              "outputFormat": "json",
                              "annotators": "ner"
                          })
    #output = (output["sentences"][0]["tokens"])
    #output = (output["sentences"][0]['entitymentions'])
    lst2 = []
    for i in output["sentences"]:
        lst2 = lst2 + i["entitymentions"]
    for i in lst2:
        lst.append([i["text"], i["ner"]])
    #print("NER", lst)
    return lst
Example no. 6
class STF_TOKEN:
    def __init__(self):
        self.host = "http://124.193.223.50"
        self.port = "8047"
        self.nlp = StanfordCoreNLP(self.host + ":" + self.port)

    def token(self, line):
        # tokenize (word segmentation)
        output = self.nlp.annotate(
            line,
            properties={
                "outputFormat": "json",
                #"annotators": "depparse,ner,entitymentions,sentiment"
                "annotators": "tokenize"
            })
        #pprint(output)
        # rejoin and re-split on spaces so any multi-word tokens are broken up
        res = [d['originalText'] for d in output['tokens']]
        res = ' '.join(res).split(' ')
        return res

    def token_ssplit(self, line):
        # tokenize + split into sentences
        output = self.nlp.annotate(
            line,
            properties={
                "outputFormat": "json",
                #"annotators": "depparse,ner,entitymentions,sentiment"
                "annotators": "tokenize, ssplit"
            })
        res = [
            ' '.join([d['originalText'] for d in l['tokens']]).split(' ')
            for l in output['sentences']
        ]
        return res
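

# Usage sketch (assumption): the host/port above point to a private CoreNLP server;
# point them at your own instance (e.g. http://localhost:9000) before running.
if __name__ == "__main__":
    stf = STF_TOKEN()
    # token() returns a flat list of tokens for the whole line
    print(stf.token("His Tesla was operating in autopilot mode."))
    # token_ssplit() returns one token list per sentence
    print(stf.token_ssplit("It happened in May. His Tesla was in autopilot mode."))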
Example no. 7
'''
This is a preparation step for feature extraction that takes a long time,
so it is a good idea to separate this from the other steps.
'''
import pickle
import time

from contractions import fix
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from pycorenlp.corenlp import StanfordCoreNLP
from nltk.tree import Tree


if __name__ == '__main__':
    nlp = StanfordCoreNLP('http://localhost:9000')
    
    levels = ['KET', 'PET', 'FCE', 'CAE', 'CPE']
    num_articles = [64, 60, 71, 67, 69]
    
    start = time.time()
    for l in range(0, len(levels)):
        level_start = time.time()
        
        for i in range(1, num_articles[l] + 1):
            doc_start = time.time()
            # read file
            with open('D:/master project/data/CEPP/' + levels[l] + '/' + str(i) + '.txt', 'r', encoding='utf8') as myfile:
                text = myfile.read()
            
            annotation = nlp.annotate(text, properties={
Example no. 8
def init_CoreNLPServer():
    global CORENLP_IP
    global CORENLP_PORT
    nlpobj = StanfordCoreNLP('http://' + CORENLP_IP + ':' + CORENLP_PORT)
    return nlpobj
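
# Usage sketch (assumption, not from the original module): the two globals must be
# defined before the helper is called, e.g. CORENLP_IP = "localhost" and
# CORENLP_PORT = "9000"; the returned object is then used like the other examples:
#
#     nlp = init_CoreNLPServer()
#     output = nlp.annotate("Hello world.",
#                           properties={"annotators": "tokenize",
#                                       "outputFormat": "json"})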
Example no. 9
import requests
import pandas as pd
from multiprocessing import Process
import sys
import time
from flask_cors import CORS
import flask
from pycorenlp.corenlp import StanfordCoreNLP
from pprint import pprint

app = flask.Flask(__name__)
CORS(app)

host = "http://localhost"
port = "9000"
nlp = StanfordCoreNLP(host + ":" + port)


def classify(text):
    # map a CoreNLP token to a coarse category based on its POS tag
    if text["pos"] in ["JJ", "JJR", "JJS"]:
        return "Descriptor"
    elif text["pos"] in ["NN", "NNP", "NNPS", "NNS", "CD"] or text["originalText"].lower() == "am":
        return "Entity"
    elif text["pos"] in ["VB", "VBD", "VBG", "VBZ", "VBN", "VBP"] or text["originalText"] in ("n't", "not"):
        return "Action/Service"
Example no. 10
    def __init__(self):
        self.host = "http://124.193.223.50"
        self.port = "8047"
        self.nlp = StanfordCoreNLP(self.host + ":" + self.port)
Example no. 11
from pprint import pprint
from pycorenlp.corenlp import StanfordCoreNLP
host = "http://localhost"
port = "9000"
nlp = StanfordCoreNLP(host + ":" + port)
text = "Joshua Brown, 40, was killed in Florida in May when his Tesla failed to " \
       "differentiate between the side of a turning truck and the sky while " \
       "operating in autopilot mode."
output = nlp.annotate(text,
                      properties={
                          "outputFormat": "json",
                          "annotators": "depparse,ner,entitymentions,sentiment"
                      })
pprint(output)
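
# Sketch of walking the returned dict (keys as used elsewhere in these examples:
# "sentences", "sentiment", "sentimentValue", "entitymentions"); note that on a
# server error annotate() can return a plain string instead of a dict.
for sentence in output["sentences"]:
    print(sentence["sentiment"], sentence["sentimentValue"])
    for mention in sentence["entitymentions"]:
        print(" ", mention["text"], mention["ner"])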
Example no. 12
import ast
import json
from pprint import pprint

dep_json_data = open("dep.json").read()
dep_data = json.loads(dep_json_data)

# pos_json_data is assumed to be read earlier (not shown in this excerpt)
pos_json_data = ast.literal_eval(json.dumps(pos_json_data))
dep_json_data = ast.literal_eval(json.dumps(dep_json_data))

FVfile = open("fv.txt", "w+")

agreement_text = open("1.txt").read()
agreement_lines = agreement_text.split("\n")

from pycorenlp.corenlp import StanfordCoreNLP
host = "http://localhost"
port = "9000"
nlp = StanfordCoreNLP(host + ":" + port)

for iter in range(0, len(agreement_lines)):
    text = agreement_lines[iter]
    output = nlp.annotate(text,
                          properties={
                              "outputFormat": "json",
                              "annotators": "depparse,lemma"
                          })

    output = ast.literal_eval(json.dumps(output))
    FV = ""
    pprint(output)
    length_tokens = len(output["sentences"][0]["tokens"])
    print(length_tokens)
    print(output["sentences"][0]["tokens"][1]["lemma"])