def run_nlp(self, language):
    """Run CoreNLP sentiment analysis over the k test folds for *language*.

    Requires a running CoreNLP server (see
    https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK);
    may need root:
        java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
            -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,sentiment
            -status_port 9000 -port 9000 -timeout 15000

    NOTE: the German implementation cannot do sentiment analysis, so the
    predictions for German do not bear any relevance; keeping the code
    uniform just makes it easier to maybe add some sentiment analysis of
    the parsed German text in the future.  If the service times out,
    increasing the timeout helps — this usually happens when a sentence
    is too long to be handled within the given period.

    :param language: language key; validated by ``self.__check_language``.
    :returns: list of k lists of binary predictions (1 = positive review).
    """
    self.__check_language(language)
    util.time_log("starting NLP...")
    annotator_dict = {"annotators": "sentiment"}
    classifier = CoreNLPParser("http://localhost:9000")
    ret_list = []
    for k_iter in range(self.k):
        prediction = []
        for review in self.test_data_text(language, k_iter):
            response_dict = classifier.api_call(
                review, properties=annotator_dict, timeout=500)
            sentences = response_dict["sentences"]
            if not sentences:
                # Guard against an empty response: the original code would
                # divide by zero here.  Default to a negative prediction.
                prediction.append(0)
                continue
            sentiment = sum(float(s["sentimentValue"]) for s in sentences)
            avg_sentiment = sentiment / len(sentences)
            # a lot better results with >= 2
            prediction.append(1 if avg_sentiment >= 2 else 0)
        ret_list.append(prediction)
    return ret_list
def annotate(sentence, lower=True):
    """Tokenize *sentence* with a local CoreNLP server.

    :param sentence: raw text to tokenize.
    :param lower: if True, lowercase the 'words' list ('gloss' always
        keeps the original surface form).
    :returns: dict of three parallel lists — 'gloss' (original token
        text), 'words' (possibly lowercased token text) and 'after'
        (the whitespace following each token).
    """
    # Fixed: dropped the unused `global client`, the debug print()s, and
    # the duplicate result-dict construction of the original.
    nlp = CoreNLPParser('http://localhost:9000')
    res = nlp.api_call(sentence, properties={'annotators': 'tokenize,ssplit'})
    words, gloss, after = [], [], []
    for sent in res['sentences']:
        for tok in sent['tokens']:
            words.append(tok['word'])
            gloss.append(tok['originalText'])
            after.append(tok['after'])
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
def tag_file(inputfile, lemma=True):
    """POS-tag (and optionally lemmatize) every line of *inputfile*.

    :param inputfile: path to a text file, tagged line by line.
    :param lemma: if True use the CoreNLP lemma as the word form,
        otherwise the raw surface word.
    :returns: list of non-empty lines, each a list of (word, pos) tuples.
        The result is also printed, matching the original behavior.
    """
    stanford_parser = CoreNLPParser()
    content = []
    with open(inputfile) as fin:
        for line in fin:
            line = line.strip()
            json_result = stanford_parser.api_call(
                line, properties=additional_properties)
            linepos = []
            for sentence in json_result['sentences']:
                for dpos in sentence['tokens']:
                    # Choose lemma or surface form per the flag.
                    word = dpos['lemma'] if lemma else dpos['word']
                    linepos.append((word, dpos['pos']))
            if linepos:
                # `linepos` is rebuilt each iteration, so no copy is needed.
                content.append(linepos)
    # Fixed: original used the Python 2 print statement (`print content`),
    # a SyntaxError under Python 3.
    print(content)
    return content
def _api_call_with_retry(tokenizer, text):
    """Call the CoreNLP tokenizer, retrying until no HTTPError is raised."""
    while True:
        try:
            return tokenizer.api_call(
                text, properties=additional_properties)
        except requests.exceptions.HTTPError:
            # Transient server error (e.g. overload); retry the same text.
            pass


def main():
    """Tokenize and POS-tag a corpus, writing vocabulary-filtered docs.

    sys.argv[1]: vocabulary file, one word per line.
    sys.argv[2]: input corpus; documents end at `end_of_document_symbol`.
    sys.argv[3]: output file; one ``<doc>...</doc>`` block per document.

    Bug fixed: in the oversized-buffer branch the original called
    ``tokenizer.api_call`` a second time AFTER the retry loop succeeded,
    discarding the retried result and bypassing the retry protection.
    """
    tokenizer = CoreNLPParser(url='http://localhost:42636')

    # Vocabulary used to filter tokens in the output.
    vocab = set()
    for line in open(sys.argv[1]):
        vocab.add(line.rstrip())

    document_buffer = ""
    token_buffer = []
    with open(sys.argv[2]) as fin, open(sys.argv[3], "w") as fout:
        start = time.time()
        for e, line in enumerate(fin):
            stripped = line.strip()
            if stripped == "":
                continue
            elif stripped.lower() != end_of_document_symbol:
                document_buffer += stripped + " <br> "
                # Flush oversized buffers early so each CoreNLP request
                # stays under the server's payload limit.
                if len(document_buffer) > 90000:
                    json_result = _api_call_with_retry(
                        tokenizer, document_buffer)
                    for sentence in json_result['sentences']:
                        token_buffer += [(x["originalText"], x["pos"])
                                         for x in sentence['tokens']]
                    document_buffer = ""
            else:
                # End of document: tokenize whatever remains in the buffer,
                # then emit one filtered <doc> block.
                json_result = _api_call_with_retry(tokenizer, document_buffer)
                for sentence in json_result['sentences']:
                    token_buffer += [(x["originalText"], x["pos"])
                                     for x in sentence['tokens']]
                # Keep only in-vocabulary tokens; "." and "<br>" both become
                # sentence breaks.
                document = " ".join([
                    x.lower() + "__" + pos
                    if x != "." and x != "<br>" else "<br>"
                    for x, pos in token_buffer
                    if x.lower() in vocab or x in ["<br>", "."]
                ])
                sentences = [
                    s.strip() for s in document.split("<br>") if s.strip()
                ]
                fout.write("<doc>\n" + "\n".join(sentences) + "\n</doc>\n")
                document_buffer = ""
                token_buffer = []
            # 30749930 is the (hard-coded) expected total line count,
            # used only for the ETA estimate.
            eta = (30749930 / (e + 1) * (time.time() - start)
                   - (time.time() - start))
            if (e + 1) % 500 == 0:
                sys.stdout.write("\rsent: %i/%i\tETA: %f"
                                 % (e + 1, 30749930, eta))
                sys.stdout.flush()
class StanTokenizer(Composable):
    """Whoosh tokenizer backed by a Stanford CoreNLP server.

    Yields Token objects carrying surface text, lemma and POS tag for
    each CoreNLP token.
    """

    def __init__(self):
        # Annotator dependencies, see
        # https://stanfordnlp.github.io/CoreNLP/dependencies.html
        self.additional_properties = {
            'tokenize.options': 'ptb3Escaping=false, unicodeQuotes=true, splitHyphenated=true, normalizeParentheses=false, normalizeOtherBrackets=false',
            'annotators': 'tokenize, ssplit, pos, lemma'
        }
        self.stanford_parser = CoreNLPParser()
        # Raise the maximum allowable JVM RAM from the default 512MB
        # (here set to 4GB; the original comment said 2GB — the code
        # has always passed -xmx4G).
        internals.config_java(options='-xmx4G')

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)
        # NOTE(review): a single Token instance is mutated and re-yielded
        # for every token — presumably the standard whoosh pattern where
        # consumers copy what they keep; confirm against whoosh docs.
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            # Pass-through mode: emit the whole value as a single token.
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            pos = start_pos
            try:
                json_result = self.stanford_parser.api_call(
                    value, properties=self.additional_properties)
                for sentence in json_result['sentences']:
                    for token in sentence['tokens']:
                        if token:
                            t.text = token['word']
                            t.lemma = token['lemma']
                            # CoreNLP POS tag; overwritten below with the
                            # running token index when positions=True.
                            t.pos = token['pos']
                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token['originalText']
                            t.stopped = False
                            if positions:
                                t.pos = pos
                                pos += 1
                            if chars:
                                t.startchar = token['characterOffsetBegin']
                                t.endchar = token['characterOffsetEnd']
                            yield t
            except Exception as e:
                # Best-effort: any CoreNLP/transport failure is logged and
                # the tokenizer simply yields no (further) tokens.
                logging.critical(str(e))
                pass