import corenlp
import pandas as pd

# AnswertypePredictor and collect_keywords are assumed to be defined elsewhere
# in this project; they are used here as-is.
def main():
    corenlp_dir = "stanford-corenlp-full-2016-10-31/"
    parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir)
    df_dev = pd.read_csv("dev_v1.csv")
    # Predict an answer type for every question in the dev set.
    predictor = AnswertypePredictor()
    df_dev["AnswerType"] = [predictor.predict_answer_type(question, predictor.vectorizer, predictor.svm)
                            for question in df_dev["question"]]
    # Extract keywords, persist them, and merge the columns back into df_dev.
    df_keywords = collect_keywords(parser, df_dev["question"])
    df_keywords.to_csv("keywords.csv")
    df_keywords = pd.read_csv("keywords.csv")
    df_dev["entities"] = df_keywords["entities"]
    df_dev["NNPs"] = df_keywords["NNPs"]
    df_dev["NNs"] = df_keywords["NNs"]
    df_dev["VBs"] = df_keywords["VBs"]
    df_dev["WP"] = df_keywords["WP"]
    df_dev.to_csv("df_dev_updated.csv")
import codecs
import json
import corenlp

def core_nlp_parser(file, query):
    corenlp_dir = "/Users/piranon/Documents/StanfordParser/stanford-corenlp-full-2013-06-20"
    properties_file = "./user.properties"
    parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir, properties=properties_file)
    tagged = []
    with codecs.open(file, 'r', 'utf-8') as f:
        snt = f.readlines()
    snt = list(set(snt))  # deduplicate input lines
    for line in snt:
        if len(line) > 400:  # skip overly long lines
            continue
        line = line.strip()
        line_tagged = []
        result_nlp = json.loads(parser.parse(line))
        # Keep only dependency triples involving the query word, then remove
        # the query itself so d[0] is the relation name.
        dependencies = [d for d in result_nlp[u'sentences'][0][u'dependencies'][:]
                        if query in d]
        for d in dependencies:
            d.remove(query)
        lemmas = [x[1]['Lemma'] for x in result_nlp[u'sentences'][0][u'words']]
        for i, w in enumerate(lemmas):
            for d in dependencies:
                if w == query:
                    line_tagged.append((w, 'TARGET'))
                    break
                elif w in d:
                    line_tagged.append((w, d[0]))
                    break
            else:
                # When feeding every word of the sentence with its POS tag:
                # line_tagged.append((w, result_nlp[u'sentences'][0][u'words'][i][1][u'PartOfSpeech']))
                # When keeping only words related to the TARGET verb, skip the rest.
                pass
        tagged.append(line_tagged)
    print('Input text:%d' % len(tagged))
    return tagged
import json
import corenlp

if __name__ == '__main__':
    with open('./nlp.txt', 'r') as f:
        text = f.read()
    corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2013-06-20/"
    parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir)
    # Parsing the whole text at once gets cut off partway through,
    # so feed the parser one line at a time.
    for line in text.split('\n'):
        result = json.loads(parser.parse(line))
        for sentence_data in result['sentences']:
            for word_data in sentence_data['words']:
                word = word_data[0]
                tag = word_data[1]['NamedEntityTag']
                pos = word_data[1]['PartOfSpeech']
                # Print proper nouns tagged as person names.
                if tag == 'PERSON' and pos == 'NNP':
                    print(word)
import corenlp

corenlp_dir = "../../../stanford/stanford-corenlp-full-2015-01-29/"
properties_file = "../../../stanford/stanford-corenlp-full-2015-01-29/user.properties"
parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir, properties=properties_file)

def parse(text_name, out_name):
    with open(text_name, 'r', encoding='utf-8', errors='ignore') as text, \
         open(out_name, 'w') as out:
        for line in text:
            for sentence in parser.raw_parse(line)["sentences"]:
                words, lemma, pos = [], [], []
                for word_element in sentence['words']:
                    words.append(word_element[0])
                    lemma.append(str(word_element[1]["Lemma"]).lower())
                    pos.append(word_element[1]["PartOfSpeech"])
                depend = sentence["indexeddependencies"]
                # One block per sentence: surface forms, lemmas, POS tags,
                # then tab-separated dependency triples, then a blank line.
                out.write(' '.join(words) + '\n')
                out.write(' '.join(lemma) + '\n')
                out.write(' '.join(pos) + '\n')
                out.write('\t'.join(' '.join(d) for d in depend))
                out.write('\n\n')
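# A minimal usage sketch of the function above; the file names here are made
# up for illustration and are not from the original source.
if __name__ == '__main__':
    parse('input.txt', 'parsed.txt')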
# Excerpt from a tagger class; assumes "import codecs" and
# "from sys import stderr" at module level.
def __init__(self, *args):
    global WSDDIR
    self.tagger = None
    self.mode = args[0]
    if args[0] == "file":
        if len(args) != 2:
            raise Exception("Syntax: file:[filename]")
        self.tagger = codecs.open(args[1], 'r', 'utf-8')
    elif args[0] == "frog":
        if len(args) != 3:
            raise Exception("Syntax: frog:[host]:[port]")
        from pynlpl.clients.frogclient import FrogClient
        port = int(args[2])
        self.tagger = FrogClient(args[1], port)
    elif args[0] == "freeling":
        if len(args) != 3:
            raise Exception("Syntax: freeling:[host]:[port]")
        from pynlpl.clients.freeling import FreeLingClient
        host = args[1]
        port = int(args[2])
        self.tagger = FreeLingClient(host, port)
    elif args[0] == "corenlp":
        if len(args) != 1:
            raise Exception("Syntax: corenlp")
        import corenlp
        print("Initialising Stanford Core NLP", file=stderr)
        self.tagger = corenlp.StanfordCoreNLP()
    elif args[0] == 'treetagger':
        if not len(args) == 2:
            raise Exception("Syntax: treetagger:[treetagger-bin]")
        self.tagger = args[1]
    elif args[0] == "durmlex":
        if not len(args) == 2:
            raise Exception("Syntax: durmlex:[filename]")
        print("Reading durm lexicon: ", args[1], file=stderr)
        self.mode = "lookup"
        self.tagger = {}
        f = codecs.open(args[1], 'r', 'utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()
            lemma = fields[4].split('.')[0]
            self.tagger[wordform] = (lemma, 'n')
        f.close()
        print("Loaded ", len(self.tagger), " wordforms", file=stderr)
    elif args[0] == "oldlex":
        if not len(args) == 2:
            raise Exception("Syntax: oldlex:[filename]")
        print("Reading OLDLexique: ", args[1], file=stderr)
        self.mode = "lookup"
        self.tagger = {}
        f = codecs.open(args[1], 'r', 'utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()
            lemma = fields[1]
            if lemma == '=':
                lemma = fields[0]  # fixed: original used "==", a no-op comparison
            pos = fields[2][0].lower()
            self.tagger[wordform] = (lemma, pos)
        print("Loaded ", len(self.tagger), " wordforms", file=stderr)
        f.close()
    else:
        raise Exception("Invalid mode: " + args[0])
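# Usage sketch for the constructor above. The enclosing class is not shown in
# this excerpt, so "Tagger" is a hypothetical stand-in; the mode strings and
# argument counts mirror the syntax messages raised above.
#
#   Tagger("file", "tagged.txt")           # read tags from a file
#   Tagger("frog", "localhost", "9887")    # connect to a running Frog server
#   Tagger("corenlp")                      # launch Stanford CoreNLP locally
#   Tagger("durmlex", "durm.lex")          # lemma lookup table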
import re
import sys
import corenlp
from stemming.porter2 import stem

class Sentence:
    def __init__(self, sentence):
        # Split into sentences at ". X"-style boundaries, keeping the punctuation.
        self.lines = re.sub('([.;:?!]) ([A-Z])', '\\g<1>\n\\g<2>',
                            sentence.replace('\n', '')).split('\n')
        self.words = self._get_words()

    def _get_words(self):
        words = []
        for line in self.lines:
            for word in line.split(' '):
                words.append(word.rstrip('.,'))
            words.append('')
        return words

if __name__ == '__main__':
    with open('../nlp.txt', 'r') as f:
        strings = f.read()
    sentence = Sentence(strings)
    parser = corenlp.StanfordCoreNLP(
        corenlp_path='/usr/local/lib/stanford-corenlp/', memory="3g")
    for line in sentence.lines:
        json_data = parser.raw_parse(line)
        for word, info in json_data["sentences"][0]['words']:
            print(word)
import json
import corenlp

parser = corenlp.StanfordCoreNLP()
# parse() returns a JSON string, so decode it before use.
json_data = json.loads(parser.parse("I am Alice."))
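# A sketch of walking the decoded result, based on the structure the other
# snippets in this collection rely on: each entry in json_data['sentences']
# has a 'words' list of [token, attributes] pairs.
for sentence in json_data['sentences']:
    for word, info in sentence['words']:
        print(word, info['PartOfSpeech'], info['Lemma'])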
import corenlp

def load_stanford_parser():
    parser = corenlp.StanfordCoreNLP("stanford-corenlp-full-2016-10-31")
    return parser
def connect_parser(self):
    corenlp_dir = "stanford-corenlp-full-2016-10-31/"
    parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir)
    return parser
import json
import corenlp
import copy
import pandas as pd

# Training data
LEARNING_DOCUMENT = "../data/train.csv"

# Create the parser
CORENLP_DIR = "../stanford-corenlp-full-2014-08-27"
parser = corenlp.StanfordCoreNLP(corenlp_path=CORENLP_DIR)

class CreateDfModel:
    # Constructor
    def __init__(self):
        # {word in the training data: number of sentences containing it}
        # Example: {"apple": 18} means that 18 sentences in the training data
        # (STS_train_feature.txt) contain the word "apple".
        self.__sentence_count_df = pd.DataFrame([], index=list('0'), columns=list())
        # Set of word lemmas that appeared in a single sentence
        self.__add_sentence_count = set()

    # Count, for each word, the number of sentences in the training data
    # (STS_train_feature.txt) that contain it.
    def __count_sentence_in_learning_document(self, add_sentence_count):
        for word in add_sentence_count:
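            # The original snippet breaks off at this point. The body below is
            # a plausible completion inferred from the comments above
            # (increment the per-word sentence count), an assumption rather
            # than the original code.
            if word in self.__sentence_count_df.columns:
                self.__sentence_count_df.loc['0', word] += 1
            else:
                self.__sentence_count_df.loc['0', word] = 1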