# Note: AnswertypePredictor and collect_keywords are assumed to be defined
# elsewhere in the original project.
import corenlp
import pandas as pd


def main():
    corenlp_dir = "stanford-corenlp-full-2016-10-31/"
    parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir)
    df_dev = pd.read_csv("dev_v1.csv")

    predictor = AnswertypePredictor()

    df_dev["AnswerType"] = [predictor.predict_answer_type(
        question, predictor.vectorizer, predictor.svm) for question in df_dev["question"]]

    df_keywords = collect_keywords(parser, df_dev["question"])
    df_keywords.to_csv("keywords.csv")
    df_keywords = pd.read_csv("keywords.csv")

    df_dev["entities"] = df_keywords["entities"]
    df_dev["NNPs"] = df_keywords["NNPs"]
    df_dev["NNs"] = df_keywords["NNs"]
    df_dev["VBs"] = df_keywords["VBs"]
    df_dev["WP"] = df_keywords["WP"]

    df_dev.to_csv("df_dev_updated.csv")
Example #2
import codecs
import json

import corenlp


def core_nlp_parser(file, query):
    corenlp_dir = "/Users/piranon/Documents/StanfordParser/stanford-corenlp-full-2013-06-20"
    properties_file = "./user.properties"
    parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir,
                                     properties=properties_file)
    tagged = []
    with codecs.open(file, 'r', 'utf-8') as f:
        snt = f.readlines()
    snt = list(set(snt))
    for i, line in enumerate(snt):
        if len(line) > 400:
            continue
        line = line.strip()
        line_tagged = []
        result_nlp = json.loads(parser.parse(line))
        dependencies = [
            d for d in result_nlp[u'sentences'][0][u'dependencies'][:]
            if query in d
        ]
        for d in dependencies:
            d.remove(query)

        for i, w in enumerate(
            [x[1]['Lemma'] for x in result_nlp[u'sentences'][0][u'words']]):
            for d in dependencies:
                if w == query:
                    line_tagged.append((w, 'TARGET'))
                    break
                elif w in d:
                    line_tagged.append((w, d[0]))
                    break
            else:
                # When feeding every word of the sentence with its POS tag:
                # line_tagged.append((w, result_nlp[u'sentences'][0][u'words'][i][1][u'PartOfSpeech']))

                # When keeping only words that have a dependency relation with the TARGET verb:
                pass
        tagged.append(line_tagged)
    print('Input text:%d' % len(tagged))
    return tagged
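A minimal usage sketch for core_nlp_parser above; the input file name and the query word are hypothetical placeholders.

# Hypothetical call: tag the sentences in sentences.txt against the target verb "run".
tagged_sentences = core_nlp_parser('sentences.txt', 'run')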
Example #3
import json
import corenlp
if __name__ == '__main__':
    with open('./nlp.txt', 'r') as f:
        text = f.read()
    corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2013-06-20/"
    parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir)
    # Processing the whole text at once gets cut off partway, so parse line by line.
    for line in text.split('\n'):
        result = json.loads(parser.parse(line))
        for sentence_data in result['sentences']:
            for word_data in sentence_data['words']:
                word = word_data[0]
                tag = word_data[1]['NamedEntityTag']
                pos = word_data[1]['PartOfSpeech']
                if tag == 'PERSON' and pos == 'NNP':
                    print(f'{word}')
Example #4
import corenlp

corenlp_dir = "../../../stanford/stanford-corenlp-full-2015-01-29/"
properties_file = "../../../stanford/stanford-corenlp-full-2015-01-29/user.properties"
parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir,
                                 properties=properties_file)


def parse(text_name, out_name):
    with open(text_name, 'r', encoding='utf-8',
              errors='ignore') as text, open(out_name, 'w') as out:
        for line in text:
            for sentence in parser.raw_parse(line)["sentences"]:
                words, lemma, pos = [], [], []

                for word_element in sentence['words']:
                    words.append(word_element[0])
                    lemma.append(str(word_element[1]["Lemma"]).lower())
                    pos.append(word_element[1]["PartOfSpeech"])

                depend = sentence["indexeddependencies"]
                out.write(' '.join(words) + '\n')
                out.write(' '.join(lemma) + '\n')
                out.write(' '.join(pos) + '\n')

                for depend_num in range(len(depend)):
                    out.write(' '.join(depend[depend_num]))
                    if depend_num != len(depend) - 1:
                        out.write('\t')
                out.write('\n\n')
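A short usage sketch for the parse helper above; both file names are hypothetical placeholders.

# Hypothetical call: read raw_text.txt and write the tokenised output to parsed_output.txt.
parse('raw_text.txt', 'parsed_output.txt')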
Example #5
# Note: this constructor comes from a larger tagger class; it relies on
# module-level imports (codecs, sys.stderr) and the global WSDDIR.
def __init__(self, *args):
    global WSDDIR
    self.tagger = None
    self.mode = args[0]
    if args[0] == "file":
        if len(args) != 2:
            raise Exception("Syntax: file:[filename]")            
        self.tagger = codecs.open(args[1],'r','utf-8') 
    elif args[0] == "frog":
        if len(args) != 3:
            raise Exception("Syntax: frog:[host]:[port]")
        from pynlpl.clients.frogclient import FrogClient
        port = int(args[2])
        self.tagger = FrogClient(args[1],port)                
    elif args[0] == "freeling":
        if len(args) != 3:
            raise Exception("Syntax: freeling:[host]:[port]")
        from pynlpl.clients.freeling import FreeLingClient
        host = args[1]
        port = int(args[2])
        self.tagger = FreeLingClient(host,port)            
    elif args[0] == "corenlp":
        if len(args) != 1:
            raise Exception("Syntax: corenlp")
        import corenlp
        print("Initialising Stanford Core NLP",file=stderr)
        self.tagger = corenlp.StanfordCoreNLP()
    elif args[0] == 'treetagger':                        
        if not len(args) == 2:
            raise Exception("Syntax: treetagger:[treetagger-bin]")
        self.tagger = args[1]            
    elif args[0] == "durmlex":
        if not len(args) == 2:
            raise Exception("Syntax: durmlex:[filename]")
        print("Reading durm lexicon: ", args[1],file=stderr)
        self.mode = "lookup"
        self.tagger = {}
        f = codecs.open(args[1],'r','utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()
            lemma = fields[4].split('.')[0]
            self.tagger[wordform] = (lemma, 'n')
        f.close()
        print("Loaded ", len(self.tagger), " wordforms",file=stderr)
    elif args[0] == "oldlex":
        if not len(args) == 2:
            raise Exception("Syntax: oldlex:[filename]")
        print("Reading OLDLexique: ", args[1],file=stderr)
        self.mode = "lookup"
        self.tagger = {}
        f = codecs.open(args[1],'r','utf-8')
        for line in f:
            fields = line.split('\t')
            wordform = fields[0].lower()                
            lemma = fields[1]
            if lemma == '=':
                lemma = fields[0]
            pos = fields[2][0].lower()
            self.tagger[wordform] = (lemma, pos)
            print("Loaded ", len(self.tagger), " wordforms",file=stderr)
        f.close()        
    else:
        raise Exception("Invalid mode: " + args[0])
Example #6
import re

import corenlp
from stemming.porter2 import stem
import sys


class Sentence():
    def __init__(self, sentence):
        self.lines = re.sub('([.;:?!]) ([A-Z])', "\g<1>\n\g<2>",
                            sentence.replace('\n', '')).split('\n')
        self.words = self._get_words()

    def _get_words(self):
        words = []
        for line in self.lines:
            for word in line.split(' '):
                words.append(word.rstrip('.,'))
            words.append('')
        return words


if __name__ == '__main__':
    with open('../nlp.txt', 'r') as f:
        strings = f.read()
    sentence = Sentence(strings)

    parser = corenlp.StanfordCoreNLP(
        corenlp_path='/usr/local/lib/stanford-corenlp/', memory="3g")
    for line in sentence.lines:
        json_data = parser.raw_parse(line)
        for word, info in json_data["sentences"][0]['words']:
            print(word)
Example #7
import corenlp
parser = corenlp.StanfordCoreNLP()
json_data = parser.parse("I am Alice.")
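In the wrapper these examples use, parse() returns a JSON string (Examples #2 and #3 pass it through json.loads), so the one-liner above is usually followed by decoding. A minimal sketch, reusing only the field names that appear in the examples above:

import json

import corenlp

parser = corenlp.StanfordCoreNLP()
result = json.loads(parser.parse("I am Alice."))
# Each entry in 'words' is a [token, attributes] pair, as in Examples #3 and #4.
for word, attrs in result['sentences'][0]['words']:
    print(word, attrs['PartOfSpeech'], attrs['Lemma'])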
Example #8
def load_stanford_parser():
    parser = corenlp.StanfordCoreNLP("stanford-corenlp-full-2016-10-31")
    return parser
def connect_parser(self):
    corenlp_dir = "stanford-corenlp-full-2016-10-31/"
    parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir)
    return parser
Example #10
import json
import corenlp
import copy
import pandas as pd

# Training data
LEARNING_DOCUMENT = "../data/train.csv"

# Create the parser
CORENLP_DIR = "../stanford-corenlp-full-2014-08-27"
parser = corenlp.StanfordCoreNLP(corenlp_path=CORENLP_DIR)


class CreateDfModel:

    # Constructor
    def __init__(self):
        # {word in the training data: number of sentences containing that word}
        # Example:
        # {apple: 18} means the training data (STS_train_feature.txt) contains
        # 18 sentences in which the word "apple" appears.
        self.__sentence_count_df = pd.DataFrame([],
                                                index=list('0'),
                                                columns=list())
        # Set of lemmas that appeared in a single sentence
        self.__add_sentence_count = set()

    # Count the number of sentences in the training data (STS_train_feature.txt)
    # that contain a given word
    def __count_sentence_in_learning_document(self, add_sentence_count):
        for word in add_sentence_count: