def peopledaily(files, save_file):
    """Preprocess the People's Daily corpus.

    Loads the raw corpus lines, runs the PeopleDailyUtil merge passes over
    each line (time expressions, bracketed spans, person names, percentages,
    in that order), and writes the resulting token lists to *save_file*.

    :param files: corpus files to load (passed to IOUtil.load_files)
    :param save_file: output path for the merged tokens
    :return: None (result is written to disk)
    """
    text = IOUtil.load_files(files)

    begin = datetime.datetime.now()
    print('start to pretreat...')

    result_text = []

    for raw_line in text:
        # Two-space delimiter matches the People's Daily corpus layout.
        util = PeopleDailyUtil(delimiter='  ', line=raw_line)

        # Merge passes must run in this order; each mutates util.tokens.
        for merge_step in (util.merge_time,      # date/time expressions
                           util.merge_brackets,  # bracketed spans
                           util.merge_name,      # person names
                           util.merge_percent):  # percentages
            merge_step()

        result_text.append(util.tokens)

    IOUtil.save_to_file(result_text, save_file)
    end = datetime.datetime.now()
    print('finished in ' + str((end - begin).seconds) + ' s!')
    print('save as ' + save_file)
def bakeoff2005(files, save_file):
    """Preprocess the SIGHAN Bakeoff 2005 corpus.

    Loads the corpus and delegates CRF-style POS tagging output to
    Bakeoff2005Util, which writes directly to *save_file*.

    :param files: corpus files to load (passed to IOUtil.load_files)
    :param save_file: output path written by pos_tag_for_crf
    :return: None (result is written to disk)
    """
    text = IOUtil.load_files(files)

    begin = datetime.datetime.now()
    print('start to pretreat...')

    Bakeoff2005Util().pos_tag_for_crf(text, save_file)

    end = datetime.datetime.now()
    print('finished in ' + str((end - begin).seconds) + ' s!')
    print('save as ' + save_file)
Exemple #3
0
   Author:jason
   date:2018/3/19
-------------------------------------------------
   Change Activity:2018/3/19:
-------------------------------------------------
"""
import codecs
import random
import numpy as np
from util.io import IOUtil

if __name__ == '__main__':
    # Build CRF++ train/test files from the character-tag corpus.
    source_file = 'character_tags.utf-8'
    ftrain = 'data/train.utf-8'
    ftest = 'data/test.utf-8'

    text = IOUtil.load_files([source_file])

    # NOTE(review): the whole corpus is kept as the training set, while the
    # test set is a random 20% sample of the SAME lines -- so every test
    # line also appears in training. A disjoint 80/20 split appears to have
    # been abandoned deliberately in the original; confirm before changing.
    train = text
    test_index = random.sample(range(len(text)), int(len(text) * 0.2))
    test = np.array(text)[test_index]

    # Write both the CRF++ copies and the data/ copies.
    IOUtil.save_to_file(train, '6crf++/train.utf-8')
    IOUtil.save_to_file(test, '6crf++/test.utf-8')
    IOUtil.save_to_file(train, ftrain)
    IOUtil.save_to_file(test, ftest)
Exemple #4
0
   Author:jason
   date:2018/3/17
-------------------------------------------------
   Change Activity:2018/3/17:
-------------------------------------------------
"""
from json import JSONDecodeError

from stanfordcorenlp import StanfordCoreNLP
from util.io import IOUtil

delimiter = ' '

if __name__ == '__main__':
	input = 'postags.utf-8'
	text = IOUtil.load_files([input])
	# print(text)
	
	character_tags = []
	
	# nlp = StanfordCoreNLP('http://corenlp.run', port=80, lang='zh')
	nlp = StanfordCoreNLP('C:\stanford-corenlp-full-2018-02-27', port=80, lang='zh')
	try:
		for line in text:
			if len(line.strip()) != 0:
				word, tag = line.strip().split(delimiter)
				print(word)
				character_tag = nlp.ner(word)
				print('ok')
				character_tags.append(word + delimiter + tag + delimiter + character_tag[0][1] + '\n')
			else:
Exemple #5
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   Author:jason
   date:2018/3/17
-------------------------------------------------
   Change Activity:2018/3/17:
-------------------------------------------------
"""
from stanfordcorenlp import StanfordCoreNLP
from util.io import IOUtil

if __name__ == '__main__':
    # Tokenize raw sentences with the remote CoreNLP server and save the
    # space-joined result to words.utf-8.
    source_file = 'sentences.utf-8'
    text = IOUtil.load_files([source_file])

    words = []
    nlp = StanfordCoreNLP('http://corenlp.run', port=80, lang='zh')
    for sentence in text:
        tokens = nlp.word_tokenize(sentence)
        # NOTE(review): extend() with a string adds ONE LIST ELEMENT PER
        # CHARACTER; append() may have been intended. Preserved as-is
        # because the on-disk result depends on IOUtil.save_to_file's
        # element separator -- verify before changing.
        words.extend(' '.join(tokens))
        words.append('\n')
    nlp.close()
    IOUtil.save_to_file(words, 'words.utf-8')
Exemple #6
0
-------------------------------------------------
   Author:jason
   date:2018/3/17
-------------------------------------------------
   Change Activity:2018/3/17:
-------------------------------------------------
"""

from stanfordcorenlp import StanfordCoreNLP
from util.io import IOUtil

delimiter = ' '

if __name__ == '__main__':
    # POS-tag the tokenized corpus via the remote CoreNLP server and save
    # one "word tag" pair per line, with a blank line between sentences.
    source_file = 'words.utf-8'
    text = IOUtil.load_files([source_file])

    postags = []
    nlp = StanfordCoreNLP('http://corenlp.run', port=80, lang='zh')
    for sentence in text:
        for word, tag in nlp.pos_tag(sentence):
            postags.append(str(word) + delimiter + str(tag) + '\n')
        # Blank line terminates the sentence (CRF-style format).
        postags.append('\n')
    nlp.close()
    IOUtil.save_to_file(postags, 'postags.utf-8')
Exemple #7
0
   Author:jason
   date:2018/3/19
-------------------------------------------------
   Change Activity:2018/3/19:
-------------------------------------------------

"""

from stanfordcorenlp import StanfordCoreNLP
from util.io import IOUtil

if __name__ == '__main__':
    train_input = 'corpora/bakeoff2005/data/mypku_training.utf-8'
    test_input = 'corpora/bakeoff2005/data/mypku_test.utf-8'

    train_words = IOUtil.load_files([train_input])
    test_words = IOUtil.load_files([test_input])

    nlp = StanfordCoreNLP('C:\stanford-corenlp-full-2018-02-27', lang='zh')
    '''
	train_words_pos_taged = []
	for line in train_words:
		if line.strip() == '':
			continue
		line_tags = nlp.pos_tag(line)
		for pos_tag in line_tags:
			train_words_pos_taged.append(' '.join(pos_tag))
			train_words_pos_taged.append('\n')
	# print(train_words_pos_taged)
	
	test_words_pos_taged = []
Exemple #8
0
    l = []  # 句子列表,分句后的整句内容
    temp = []  # 临时列表,用于存储捕获到分句标志符之前的每个字符,一旦发现分句符号后,就会将其内容全部赋给l,然后就会被清空

    for line in lines:
        if FindToken(line, cutlist):  # 如果当前字符是分句符号
            temp.append(line)  # 将此字符放入临时列表中
            l.append(''.join(temp))  # 并把当前临时列表的内容加入到句子列表中
            temp = []
        else:  # 如果当前字符不是分句符号,则将该字符直接放入临时列表中
            temp.append(line)
    return l


if __name__ == '__main__':
    # Split the raw corpus into sentences on Chinese end-of-sentence
    # punctuation and save one sentence per element to sentences.utf-8.
    source_file = '0original.utf-8'
    text = IOUtil.load_files([source_file])

    # Sentence-terminating punctuation marks used as split points.
    cutlist = "。!?"

    sents = []
    for paragraph in text:
        # Cut() consumes a character list and returns complete sentences.
        for sentence in Cut(list(paragraph), list(cutlist)):
            if sentence.strip() != "":
                sents.append(sentence)
                sents.append('\n')
    IOUtil.save_to_file(sents, 'sentences.utf-8')
Exemple #9
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   Author:jason
   date:2018/3/17
-------------------------------------------------
   Change Activity:2018/3/17:
-------------------------------------------------
"""

from util.io import IOUtil
import sklearn_crfsuite as crf

delimiter = ' '

if __name__ == '__main__':
	input = 'postags.utf-8'
	text = IOUtil.load_files([input])
	print(text)