Code example #1
def splitAllWord(typeOfDataset="dev"):
    import pandas as pd
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter

    # default_config('zh') picks up the segmenter jar and Chinese models
    # from the environment.
    segmenter = StanfordSegmenter()
    segmenter.default_config('zh')

    maxCount = 2000000

    # Load the raw TSV split (columns: label, text_a).
    pathOfDev = "dataset/task1/%s.tsv" % typeOfDataset
    dfOfDev = pd.read_csv(pathOfDev, delimiter="\t")

    pathOfNewDev = "%s_split.tsv" % typeOfDataset

    count = 0
    with open(pathOfNewDev, "w", encoding='utf-8') as fw:
        for _, row in dfOfDev.iterrows():
            if count >= maxCount:
                break
            if count % 100 == 0:
                print("[%s]count = %s" % (typeOfDataset, count))

            # Write the label, a tab, then the space-separated segmentation.
            label = row['label']
            fw.write(str(label))
            fw.write("\t")
            sentence = row['text_a']

            segmentOfSentence = segmenter.segment(sentence)
            for word in segmentOfSentence.split():
                fw.write(word)
                fw.write(" ")
            fw.write("\n")

            count += 1
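
For reference, a minimal driver for splitAllWord might look like the sketch below. It is an illustration only; the split names are assumptions, not part of the original example.

# Hypothetical usage: segment each dataset split in turn.
for split in ("train", "dev", "test"):
    splitAllWord(typeOfDataset=split)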
Code example #2
import os

from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def segment(labels, reviews):
    """Segment each review and write '<label> <segmented review>' lines to reviews.txt."""
    segmented = []

    print('Creating BOW')
    # Point NLTK at the Stanford Segmenter distribution and its jar.
    os.environ[
        "STANFORD_SEGMENTER"] = '../datasets/data-hauyi/stanford-segmenter-2018-10-16'
    seg = StanfordSegmenter(
        '../datasets/data-hauyi/stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar'
    )
    seg.default_config('zh')

    count = 0
    with open('reviews.txt', 'a+') as file_out:
        for label, review in zip(labels, reviews):
            # segment() returns the review as a whitespace-separated string.
            s = seg.segment(review)
            file_out.write(str(label) + ' ' + s)
            segmented.append(s)
            count += 1
            print('Count: ', count)

    return segmented
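
A minimal, hypothetical call site for segment; the labels and review strings below are invented for illustration.

# Invented sample data for illustration.
labels = [1, 0]
reviews = [u'这家餐厅的菜很好吃', u'服务太慢了']
segmented_reviews = segment(labels, reviews)
print(segmented_reviews)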
Code example #3
# encoding: utf-8
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordNERTagger

segmenter = StanfordSegmenter(
    # jar the segmenter depends on
    path_to_jar=r"/home/jiangix/document/stanford-segmenter/stanford-segmenter.jar",
    path_to_slf4j=r"/home/jiangix/document/slf4j/slf4j-api.jar",
    # folder containing the segmentation data
    path_to_sihan_corpora_dict=r"/home/jiangix/document/stanford-segmenter/data",
    # model trained on the People's Daily corpus released by Peking University for the 2005 bakeoff
    path_to_model=r"/home/jiangix/document/stanford-segmenter/data/pku.gz",
    path_to_dict=r"/home/jiangix/document/stanford-segmenter/data/dict-chris6.ser.gz")

segmenter.default_config('zh')
result = segmenter.segment(u'我喜欢学习编程')

# Run Chinese NER over the segmented words.
chi_tagger = StanfordNERTagger(
    model_filename=r"/home/jiangix/document/stanford-chinese-corenlp-models/chinese.misc.distsim.crf.ser.gz",
    path_to_jar=r"/home/jiangix/document/stanford-ner/stanford-ner.jar")
for word, tag in chi_tagger.tag(result.split()):
    print(word, tag)
Code example #4
import time

from nltk.tokenize.stanford_segmenter import StanfordSegmenter

start = time.time()

jar = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/stanford-segmenter.jar'
api_jar = '/Users/sinansmac/Public/StanfordNLP/stanford-parser-full-2017-06-09/slf4j-api.jar'
# dict = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/data/dict-chris6.ser.gz'

seg = StanfordSegmenter(path_to_jar=jar, path_to_slf4j=api_jar)
seg.default_config('zh')

# sent = u'这是斯坦福中文分词器测试'
# print(seg.segment(sent))

# segment_file() returns the segmented contents of the file as one string;
# split on whitespace to obtain the token list.
fp = "chinese.txt"
tokenstr = seg.segment_file(fp)
token_ls = tokenstr.split()
print(len(token_ls), '\n', tokenstr, '\n', token_ls)

end = time.time()

print("process time:", round(end - start), "seconds")