def run_nlp(self, language):
    """Run CoreNLP sentiment analysis over the k test folds for *language*.

    Requires a running CoreNLP server (see
    https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK);
    may need root:
        java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
            -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,sentiment
            -status_port 9000 -port 9000 -timeout 15000

    NOTE: the German implementation cannot do sentiment analysis, so the
    predictions for German do not bear any relevance; keeping the code
    uniform just makes it easier to maybe add some sentiment analysis of
    the parsed German text in the future.  If the service times out,
    increasing the timeout helps — this usually happens when a sentence
    is too long to be handled within the given period.

    :param language: language key; validated by ``self.__check_language``.
    :returns: list of k lists of binary predictions (1 = positive review).
    """
    self.__check_language(language)
    util.time_log("starting NLP...")
    annotator_dict = {"annotators": "sentiment"}
    classifier = CoreNLPParser("http://localhost:9000")
    ret_list = []
    for k_iter in range(self.k):
        prediction = []
        for review in self.test_data_text(language, k_iter):
            response_dict = classifier.api_call(
                review, properties=annotator_dict, timeout=500)
            sentences = response_dict["sentences"]
            if not sentences:
                # Guard against an empty response: the original code would
                # divide by zero here.  Default to a negative prediction.
                prediction.append(0)
                continue
            sentiment = sum(float(s["sentimentValue"]) for s in sentences)
            avg_sentiment = sentiment / len(sentences)
            # a lot better results with >= 2
            prediction.append(1 if avg_sentiment >= 2 else 0)
        ret_list.append(prediction)
    return ret_list
def annotate(sentence, lower=True):
    """Tokenize *sentence* with a local CoreNLP server.

    :param sentence: raw text to tokenize.
    :param lower: if True, lowercase the 'words' list ('gloss' always
        keeps the original surface form).
    :returns: dict of three parallel lists — 'gloss' (original token
        text), 'words' (possibly lowercased token text) and 'after'
        (the whitespace following each token).
    """
    # Fixed: dropped the unused `global client`, the debug print()s, and
    # the duplicate result-dict construction of the original.
    nlp = CoreNLPParser('http://localhost:9000')
    res = nlp.api_call(sentence, properties={'annotators': 'tokenize,ssplit'})
    words, gloss, after = [], [], []
    for sent in res['sentences']:
        for tok in sent['tokens']:
            words.append(tok['word'])
            gloss.append(tok['originalText'])
            after.append(tok['after'])
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
def tag_file(inputfile, lemma=True):
    """POS-tag (and optionally lemmatize) every line of *inputfile*.

    :param inputfile: path to a text file, tagged line by line.
    :param lemma: if True use the CoreNLP lemma as the word form,
        otherwise the raw surface word.
    :returns: list of non-empty lines, each a list of (word, pos) tuples.
        The result is also printed, matching the original behavior.
    """
    stanford_parser = CoreNLPParser()
    content = []
    with open(inputfile) as fin:
        for line in fin:
            line = line.strip()
            json_result = stanford_parser.api_call(
                line, properties=additional_properties)
            linepos = []
            for sentence in json_result['sentences']:
                for dpos in sentence['tokens']:
                    # Choose lemma or surface form per the flag.
                    word = dpos['lemma'] if lemma else dpos['word']
                    linepos.append((word, dpos['pos']))
            if linepos:
                # `linepos` is rebuilt each iteration, so no copy is needed.
                content.append(linepos)
    # Fixed: original used the Python 2 print statement (`print content`),
    # a SyntaxError under Python 3.
    print(content)
    return content
def _api_call_with_retry(tokenizer, text):
    """Call the CoreNLP tokenizer, retrying until no HTTPError is raised."""
    while True:
        try:
            return tokenizer.api_call(
                text, properties=additional_properties)
        except requests.exceptions.HTTPError:
            # Transient server error (e.g. overload); retry the same text.
            pass


def main():
    """Tokenize and POS-tag a corpus, writing vocabulary-filtered docs.

    sys.argv[1]: vocabulary file, one word per line.
    sys.argv[2]: input corpus; documents end at `end_of_document_symbol`.
    sys.argv[3]: output file; one ``<doc>...</doc>`` block per document.

    Bug fixed: in the oversized-buffer branch the original called
    ``tokenizer.api_call`` a second time AFTER the retry loop succeeded,
    discarding the retried result and bypassing the retry protection.
    """
    tokenizer = CoreNLPParser(url='http://localhost:42636')

    # Vocabulary used to filter tokens in the output.
    vocab = set()
    for line in open(sys.argv[1]):
        vocab.add(line.rstrip())

    document_buffer = ""
    token_buffer = []
    with open(sys.argv[2]) as fin, open(sys.argv[3], "w") as fout:
        start = time.time()
        for e, line in enumerate(fin):
            stripped = line.strip()
            if stripped == "":
                continue
            elif stripped.lower() != end_of_document_symbol:
                document_buffer += stripped + " <br> "
                # Flush oversized buffers early so each CoreNLP request
                # stays under the server's payload limit.
                if len(document_buffer) > 90000:
                    json_result = _api_call_with_retry(
                        tokenizer, document_buffer)
                    for sentence in json_result['sentences']:
                        token_buffer += [(x["originalText"], x["pos"])
                                         for x in sentence['tokens']]
                    document_buffer = ""
            else:
                # End of document: tokenize whatever remains in the buffer,
                # then emit one filtered <doc> block.
                json_result = _api_call_with_retry(tokenizer, document_buffer)
                for sentence in json_result['sentences']:
                    token_buffer += [(x["originalText"], x["pos"])
                                     for x in sentence['tokens']]
                # Keep only in-vocabulary tokens; "." and "<br>" both become
                # sentence breaks.
                document = " ".join([
                    x.lower() + "__" + pos
                    if x != "." and x != "<br>" else "<br>"
                    for x, pos in token_buffer
                    if x.lower() in vocab or x in ["<br>", "."]
                ])
                sentences = [
                    s.strip() for s in document.split("<br>") if s.strip()
                ]
                fout.write("<doc>\n" + "\n".join(sentences) + "\n</doc>\n")
                document_buffer = ""
                token_buffer = []
            # 30749930 is the (hard-coded) expected total line count,
            # used only for the ETA estimate.
            eta = (30749930 / (e + 1) * (time.time() - start)
                   - (time.time() - start))
            if (e + 1) % 500 == 0:
                sys.stdout.write("\rsent: %i/%i\tETA: %f"
                                 % (e + 1, 30749930, eta))
                sys.stdout.flush()
class StanTokenizer(Composable):
    """Whoosh tokenizer backed by a Stanford CoreNLP server.

    Yields Token objects carrying surface text, lemma and POS tag for
    each CoreNLP token.
    """

    def __init__(self):
        # Annotator dependencies, see
        # https://stanfordnlp.github.io/CoreNLP/dependencies.html
        self.additional_properties = {
            'tokenize.options': 'ptb3Escaping=false, unicodeQuotes=true, splitHyphenated=true, normalizeParentheses=false, normalizeOtherBrackets=false',
            'annotators': 'tokenize, ssplit, pos, lemma'
        }
        self.stanford_parser = CoreNLPParser()
        # Raise the maximum allowable JVM RAM from the default 512MB
        # (here set to 4GB; the original comment said 2GB — the code
        # has always passed -xmx4G).
        internals.config_java(options='-xmx4G')

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)
        # NOTE(review): a single Token instance is mutated and re-yielded
        # for every token — presumably the standard whoosh pattern where
        # consumers copy what they keep; confirm against whoosh docs.
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            # Pass-through mode: emit the whole value as a single token.
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            pos = start_pos
            try:
                json_result = self.stanford_parser.api_call(
                    value, properties=self.additional_properties)
                for sentence in json_result['sentences']:
                    for token in sentence['tokens']:
                        if token:
                            t.text = token['word']
                            t.lemma = token['lemma']
                            # CoreNLP POS tag; overwritten below with the
                            # running token index when positions=True.
                            t.pos = token['pos']
                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token['originalText']
                            t.stopped = False
                            if positions:
                                t.pos = pos
                                pos += 1
                            if chars:
                                t.startchar = token['characterOffsetBegin']
                                t.endchar = token['characterOffsetEnd']
                            yield t
            except Exception as e:
                # Best-effort: any CoreNLP/transport failure is logged and
                # the tokenizer simply yields no (further) tokens.
                logging.critical(str(e))
                pass