Code example #1
    def POS_data(self):
        """POS sentences"""
        tag = 'pos'
        idx = 19
        file_name = 'data/normalize_{}_piece/nor_{}_{}.csv'.format(tag, tag, idx)
        with open(file_name, 'r') as file:
            sentences = file.read().strip().split('\n')

        stop_words = stopwords.words('english')
        eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
        eng_parser.java_options = '-mx3000m'

        print('=' * 100)
        print('current tag: {}, file idx: {}'.format(tag, idx))

        # POS: constituency-parse every sentence with the Stanford parser
        print('=' * 100)
        print('Starting POS...')
        pos_sent = []
        for sent in tqdm(sentences):
            # parse() yields candidate trees; keep the best (first) one
            pos_sent.append(list(eng_parser.parse(sent.split()))[0])

        # save file: one record per line (sentence, tab, bracketed parse tree)
        save_file = 'data/{}_sent/{}_sent_{}.csv'.format(tag, tag, idx)
        with open(save_file, mode='w') as file:
            for sent, pos in zip(sentences, pos_sent):
                file.write(sent + '\t' + str(pos) + '\n')
        print('Finish! Saved in {}'.format(save_file))
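This method assumes the Stanford Parser jars and models are already discoverable by NLTK, via CLASSPATH or the STANFORD_PARSER/STANFORD_MODELS environment variables, as code examples #3 and #4 do. A minimal setup sketch with hypothetical install paths; adjust them to wherever the parser was actually downloaded:

import os
from nltk.parse.stanford import StanfordParser

# Hypothetical install location; point these at the actual Stanford Parser download.
PARSER_DIR = '/home/user/stanford/parser'
os.environ['STANFORD_PARSER'] = PARSER_DIR + '/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = PARSER_DIR + '/stanford-parser-3.9.2-models.jar'

# With the jars visible, the English PCFG model is loaded by its resource name
# inside the models jar.
eng_parser = StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
    java_options='-mx3000m')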
Code example #2
from nltk.corpus import stopwords
from nltk.parse.stanford import StanfordParser
from nltk.stem import WordNetLemmatizer


def pos_test():
    stop_words = stopwords.words('english')
    eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    eng_parser.java_options = '-mx3000m'
    sentence = "so now i run out take a few shot then run back in the house and xfer them to the pc"

    # parse the lower-cased, whitespace-tokenized sentence and keep the best tree
    res = eng_parser.parse(sentence.lower().split())
    lst_res = list(res)[0]
    with open('data/tree_test.txt', 'w') as file:
        file.write(sentence + '\t')
        file.write(str(lst_res) + '\t')
    # print(lst_res)
    lst_res.pretty_print()

    # lst_res.remove(Tree('NN', ['camera']))
    cleaned_sent = []
    wnl = WordNetLemmatizer()
    for child in lst_res:
        tmp_sent = []
        # keep small noun phrases only: height <= 4 and label NP
        for s in child.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
            # clean stop words and lemmatize the remaining nouns
            tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves() if w not in stop_words]
            # keep phrases of at most 3 tokens and skip repeated lists
            if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
                tmp_sent.append(tmp)

        cleaned_sent.append(tmp_sent)

    # get opinion word
    # for w in cleaned_sent[0]:
    #     print(w)
    #     words = sentence.split()
    #     min_dist = len(words)
    #     min_asp = w
    #     for s in lst_res.subtrees(lambda t: t.label() == 'JJ'):
    #         if abs(words.index(s.leaves()[0]) - words.index(w[0])) < min_dist:
    #             min_dist = abs(words.index(s.leaves()[0]) - words.index(w[0]))
    #             min_asp = s.leaves()[0]
    #
    #     if min_asp == w:
    #         print('not found')
    #     else:
    #         print(min_asp)

    print(cleaned_sent)
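The commented-out block above sketches the next step: for each extracted aspect phrase, take the adjective (JJ) leaf closest to it by token position as its opinion word. A small self-contained sketch of that idea; nearest_opinion_word is a hypothetical helper name and the tree is hand-built here rather than coming from a live parser call:

from nltk.tree import Tree

def nearest_opinion_word(tree, aspect_head, tokens):
    """Return the JJ leaf closest to aspect_head by token distance, or None.
    Assumes each leaf occurs once in the token list."""
    if aspect_head not in tokens:
        return None
    aspect_idx = tokens.index(aspect_head)
    best_word, best_dist = None, len(tokens)
    for jj in tree.subtrees(lambda t: t.label() == 'JJ'):
        word = jj.leaves()[0]
        if word in tokens:
            dist = abs(tokens.index(word) - aspect_idx)
            if dist < best_dist:
                best_word, best_dist = word, dist
    return best_word

# toy parse fragment: "nice camera with poor battery"
toy = Tree.fromstring(
    "(NP (NP (JJ nice) (NN camera)) (PP (IN with) (NP (JJ poor) (NN battery))))")
tokens = toy.leaves()
print(nearest_opinion_word(toy, 'battery', tokens))  # -> 'poor'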
Code example #3
import os
import shelve

from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser


def parser_nltk(word_lists, filename):
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'
    all_parser_sentence = []
    file = shelve.open(filename)
    flag = 0

    for sentence in word_lists:
        if sentence.strip() != "":
            res = list(chinese_parser.parse((sentence.strip()).split()))
            new_str = return_str_tofile(sentence_parse=str(res[0]))
            file[str(flag)] = res
            all_parser_sentence.append(new_str)
            flag += 1
            print("###### NLTK Dependency Parser Have finished " + str(flag) +
                  " sentences ###")
    return all_parser_sentence
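A hypothetical call site for parser_nltk. The module-level constants (JAVA_PATH, STANFORD_PARSER_PATH, STANFORD_PARSER_MODELS, nltk_parse_model_path) and the return_str_tofile helper are assumed to be defined elsewhere in the project, so the values below are only placeholders:

import shelve

# Placeholder configuration; adjust to the local JDK and Stanford Parser install.
JAVA_PATH = '/usr/bin/java'
STANFORD_PARSER_PATH = '/opt/stanford-parser/stanford-parser.jar'
STANFORD_PARSER_MODELS = '/opt/stanford-parser/stanford-parser-3.9.2-models.jar'
nltk_parse_model_path = 'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz'

word_lists = ['我 喜欢 这 台 相机', '屏幕 很 清晰']  # pre-segmented sentences
parsed = parser_nltk(word_lists, 'parse_trees.db')

# The full parse trees are also persisted in the shelve file, keyed by index.
with shelve.open('parse_trees.db') as db:
    db['0'][0].pretty_print()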
Code example #4
                   '/home/{}/stanford/parser/stanford-parser.jar:' \
                   '/home/{}/stanford/parser/stanford-parser-3.9.2-models.jar'.format(
    user_name, user_name, user_name)

# tag = 'pos'
# idx = 19
# file_name = 'data/normalize_{}_piece/nor_{}_{}.csv'.format(tag, tag, idx)
file_name = 'data/nor_clas.csv'

with open(file_name, 'r') as file:
    sentences = file.read().strip().split('\n')

stop_words = stopwords.words('english')
eng_parser = StanfordParser(
    model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
eng_parser.java_options = '-mx3000m'

# print('=' * 100)
# print('current tag: {}, file idx: {}'.format(tag, idx))
# POS: constituency parsing
print('=' * 100)
print('Starting POS...')
pos_sent = []
# for sent in tqdm(sentences):
#     pos_sent.append(list(eng_parser.parse(
#         [w for w in sent.split()]))[0])
for sent in tqdm(sentences):
    # ignore the first two words of each line
    pos_sent.append(list(eng_parser.parse(sent.split()[2:]))[0])
# filter noun phrases & NLTK stemming
# print('=' * 100)
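The fragment is cut off right after the comment announcing the noun-phrase filtering step. A minimal sketch of how that step could continue, modeled on the NP filtering and lemmatization in code example #2; stop_words and pos_sent come from the script above:

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
cleaned_sent = []
for tree in pos_sent:
    tmp_sent = []
    # keep small noun phrases only (height <= 4, label NP)
    for np in tree.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
        # drop stop words and lemmatize the remaining nouns
        tmp = [wnl.lemmatize(w, pos='n') for w in np.leaves() if w not in stop_words]
        # keep phrases of at most 3 tokens, skip duplicates
        if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
            tmp_sent.append(tmp)
    cleaned_sent.append(tmp_sent)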
Code example #5
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser

split = ["trial", "train", "test_annotated"]
for s in split:
    f = open("SICK_" + s + ".txt", "r")
    lines = f.readlines()
    sentences = []
    labels = []
    # SICK columns: pair_ID, sentence_A, sentence_B, relatedness_score, entailment_judgment
    for i in range(1, len(lines)):  # skip the header line
        a = lines[i].split("\t")
        sentences.extend([a[1], a[2]])
        labels.extend([a[3], a[3]])  # same label for both sentences of the pair

    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))
    parser.java_options = '-mx5000m'  # To increase the amount of RAM it can use.
    #a=[parse.tree()._pformat_flat("","()",False) for parse in parser.raw_parse("The young boys are playing outdoors and the man is smiling nearby")]
    parses = [[parse for parse in dep_graphs]
              for dep_graphs in parser.raw_parse_sents(sentences)]
    file = open("SICK_cons_parse_" + s + ".txt", "w")
    for i in range(len(parses)):
        for j in range(len(parses[i])):
            # binarize and collapse unary chains before flattening the tree
            parses[i][j].chomsky_normal_form(horzMarkov=1)
            parses[i][j].collapse_unary(collapsePOS=True)
            d = parses[i][j]._pformat_flat("", "()", False)
            # write the flat parse with ROOT replaced by the sentence's SICK label
            sent1 = d.replace("ROOT", labels[i], 1)
            file.write(sent1 + "\n")
    file.close()
    f.close()
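Each output line is a flat bracketed constituency parse whose ROOT label has been replaced by the sentence's SICK label, so the file can be read back with nltk.Tree.fromstring. A small sketch, assuming the train split was written by the loop above:

from nltk.tree import Tree

with open("SICK_cons_parse_train.txt", "r") as f:
    for line in f:
        tree = Tree.fromstring(line.strip())
        label = tree.label()    # the SICK label that replaced ROOT
        tokens = tree.leaves()  # the original sentence tokens
        print(label, " ".join(tokens))
        break  # inspect only the first parse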