import datetime

from util.io import IOUtil
# PeopleDailyUtil and Bakeoff2005Util are project utilities not included in
# this dump; adjust these imports to wherever they live in the source tree.


def peopledaily(files, save_file):
    '''
    Preprocess the People's Daily corpus.
    :param files: input corpus files
    :param save_file: output path
    :return:
    '''
    text = IOUtil.load_files(files)
    begin = datetime.datetime.now()
    print('start to pretreat...')
    result_text = []
    for line in text:
        # tokens = line.split(delimiter)  # splitting depends on the corpus: number of spaces, tabs, etc.
        # print(tokens)
        peopledaily = PeopleDailyUtil(delimiter='  ', line=line)
        # merge time expressions (could still be improved)
        peopledaily.merge_time()
        # print('time merging done')
        # merge the contents inside brackets
        peopledaily.merge_brackets()
        # print('bracket merging done')
        # merge personal names
        peopledaily.merge_name()
        # print('name merging done')
        # merge percentages
        peopledaily.merge_percent()
        # print('percentage merging done')
        result_text.append(peopledaily.tokens)
    # turn '/' into a space or TAB, then save to file
    IOUtil.save_to_file(result_text, save_file)
    end = datetime.datetime.now()
    print('finished in ' + str((end - begin).seconds) + ' s!')
    print('save as ' + save_file)
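# PeopleDailyUtil itself is not shown. To illustrate what merge_name has to
# do, here is a standalone sketch; it rests on an assumption about the 1998
# People's Daily annotation convention, where a person name is split into two
# consecutive /nr tokens (e.g. '江/nr 泽民/nr'):
def merge_nr_tokens(tokens):
    """Join adjacent word/nr pairs into a single word/nr token."""
    merged = []
    for token in tokens:
        if token.endswith('/nr') and merged and merged[-1].endswith('/nr'):
            # glue the given name onto the preceding surname:
            # '江/nr' + '泽民/nr' -> '江泽民/nr'
            merged[-1] = merged[-1][:-len('/nr')] + token
        else:
            merged.append(token)
    return merged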
def bakeoff2005(files, save_file):
    '''
    Preprocess the Bakeoff 2005 corpus provided by SIGHAN.
    :param files: input corpus files
    :param save_file: output path
    :return:
    '''
    text = IOUtil.load_files(files)
    begin = datetime.datetime.now()
    print('start to pretreat...')
    bakeoff = Bakeoff2005Util()
    bakeoff.pos_tag_for_crf(text, save_file)
    end = datetime.datetime.now()
    print('finished in ' + str((end - begin).seconds) + ' s!')
    print('save as ' + save_file)
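# Every script in this dump depends on util.io.IOUtil, which is not included.
# A minimal sketch of the contract the callers appear to assume (this is an
# assumption: load_files returns the lines of all files with newlines kept,
# and save_to_file concatenates items verbatim, which is why callers append
# '\n' themselves):
import codecs

class IOUtil:
    @staticmethod
    def load_files(files):
        lines = []
        for path in files:
            with codecs.open(path, 'r', encoding='utf-8') as fp:
                lines.extend(fp.readlines())
        return lines

    @staticmethod
    def save_to_file(items, save_file):
        with codecs.open(save_file, 'w', encoding='utf-8') as fp:
            fp.write(''.join(items))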
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author:jason
date:2018/3/19
-------------------------------------------------
Change Activity:2018/3/19:
-------------------------------------------------
"""
import codecs
import random

import numpy as np

from util.io import IOUtil

if __name__ == '__main__':
    input = 'character_tags.utf-8'
    ftrain = 'data/train.utf-8'
    ftest = 'data/test.utf-8'
    text = IOUtil.load_files([input])
    # print(text)
    # NOTE: the whole corpus goes into the training set and the 20% test
    # sample is drawn from the same lines, so train and test overlap
    # (see the disjoint split sketched below)
    train = text
    # train_index = random.sample(range(len(text)), int(len(text) * 0.8))
    test_index = random.sample(range(len(text)), int(len(text) * 0.2))
    # train = np.array(text)[train_index]
    test = np.array(text)[test_index]
    IOUtil.save_to_file(train, '6crf++/train.utf-8')
    IOUtil.save_to_file(test, '6crf++/test.utf-8')
    IOUtil.save_to_file(train, ftrain)
    IOUtil.save_to_file(test, ftest)
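# A disjoint 80/20 split, which the commented-out train_index line above
# suggests was the original intent, can be done as follows (a sketch; the
# helper name and seed are arbitrary):
import random

def disjoint_split(lines, test_ratio=0.2, seed=42):
    """Shuffle line indices once, then carve off a test slice."""
    idx = list(range(len(lines)))
    random.Random(seed).shuffle(idx)
    cut = int(len(lines) * test_ratio)
    test = [lines[i] for i in idx[:cut]]
    train = [lines[i] for i in idx[cut:]]
    return train, test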
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author:jason
date:2018/3/17
-------------------------------------------------
Change Activity:2018/3/17:
-------------------------------------------------
"""
from json import JSONDecodeError

from stanfordcorenlp import StanfordCoreNLP

from util.io import IOUtil

delimiter = ' '

if __name__ == '__main__':
    input = 'postags.utf-8'
    text = IOUtil.load_files([input])
    # print(text)
    character_tags = []
    # nlp = StanfordCoreNLP('http://corenlp.run', port=80, lang='zh')
    nlp = StanfordCoreNLP(r'C:\stanford-corenlp-full-2018-02-27', port=80, lang='zh')
    try:
        for line in text:
            if len(line.strip()) != 0:
                word, tag = line.strip().split(delimiter)
                print(word)
                character_tag = nlp.ner(word)
                print('ok')
                character_tags.append(word + delimiter + tag + delimiter
                                      + character_tag[0][1] + '\n')
            else:
                # blank line: keep the sentence boundary
                character_tags.append('\n')
    # (assumed completion: the source is cut off after the else above; the
    # JSONDecodeError import and the sibling scripts point at this shape,
    # and character_tags.utf-8 is the input the train/test split script reads)
    except JSONDecodeError:
        # the CoreNLP server occasionally answers with non-JSON on odd input
        pass
    finally:
        nlp.close()
    IOUtil.save_to_file(character_tags, 'character_tags.utf-8')
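# For reference, stanfordcorenlp's ner() returns a list of (token, label)
# tuples, so character_tag[0][1] above is the entity label of the single word
# that was sent (e.g. PERSON, O). One server round-trip per word is slow;
# passing whole lines to ner() and aligning the results back would cut the
# round-trips considerably, at the cost of possibly different tokenization.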
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author:jason
date:2018/3/17
-------------------------------------------------
Change Activity:2018/3/17:
-------------------------------------------------
"""
from stanfordcorenlp import StanfordCoreNLP

from util.io import IOUtil

if __name__ == '__main__':
    input = 'sentences.utf-8'
    text = IOUtil.load_files([input])
    # print(text)
    words = []
    nlp = StanfordCoreNLP('http://corenlp.run', port=80, lang='zh')
    for line in text:
        l = nlp.word_tokenize(line)
        # append, not extend: extending with a str adds it one character
        # at a time (see the note after this script)
        words.append(' '.join(l))
        words.append('\n')
    nlp.close()
    IOUtil.save_to_file(words, 'words.utf-8')
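# Why the append fix above matters: list.extend iterates its argument, and
# iterating a str yields characters:
#   words = []; words.extend('迈向 充满')  # -> ['迈', '向', ' ', '充', '满']
#   words = []; words.append('迈向 充满')  # -> ['迈向 充满']
# If save_to_file concatenates items back together (as sketched earlier),
# the old version happened to produce the same bytes, but the list contents
# were wrong for any other use.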
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author:jason
date:2018/3/17
-------------------------------------------------
Change Activity:2018/3/17:
-------------------------------------------------
"""
from stanfordcorenlp import StanfordCoreNLP

from util.io import IOUtil

delimiter = ' '

if __name__ == '__main__':
    input = 'words.utf-8'
    text = IOUtil.load_files([input])
    # print(text)
    postags = []
    nlp = StanfordCoreNLP('http://corenlp.run', port=80, lang='zh')
    for line in text:
        l = nlp.pos_tag(line)
        for tup in l:
            word, tag = tup
            postags.append(str(word) + delimiter + str(tag) + '\n')
        postags.append('\n')
    nlp.close()
    IOUtil.save_to_file(postags, 'postags.utf-8')
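# For reference, pos_tag() returns (word, tag) tuples, so each line of
# postags.utf-8 comes out as 'word TAG', with a blank line between input
# lines, e.g. (tags illustrative):
#   今天 NT
#   天气 NN
# The blank separators are what the downstream scripts treat as sentence
# boundaries.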
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author:jason
date:2018/3/19
-------------------------------------------------
Change Activity:2018/3/19:
-------------------------------------------------
"""
from stanfordcorenlp import StanfordCoreNLP

from util.io import IOUtil

if __name__ == '__main__':
    train_input = 'corpora/bakeoff2005/data/mypku_training.utf-8'
    test_input = 'corpora/bakeoff2005/data/mypku_test.utf-8'
    train_words = IOUtil.load_files([train_input])
    test_words = IOUtil.load_files([test_input])
    nlp = StanfordCoreNLP(r'C:\stanford-corenlp-full-2018-02-27', lang='zh')
    # the training half is commented out in the original:
    '''
    train_words_pos_tagged = []
    for line in train_words:
        if line.strip() == '':
            continue
        line_tags = nlp.pos_tag(line)
        for pos_tag in line_tags:
            train_words_pos_tagged.append(' '.join(pos_tag))
        train_words_pos_tagged.append('\n')
    # print(train_words_pos_tagged)
    '''
    test_words_pos_tagged = []
    # (assumed completion: the source is cut off here; the loop below mirrors
    # the commented-out training loop. The output path is not shown in the
    # original, so no save step is filled in.)
    for line in test_words:
        if line.strip() == '':
            continue
        line_tags = nlp.pos_tag(line)
        for pos_tag in line_tags:
            test_words_pos_tagged.append(' '.join(pos_tag))
        test_words_pos_tagged.append('\n')
    nlp.close()
#!/usr/bin/python
# -*- coding: utf-8 -*-
from util.io import IOUtil


def FindToken(char, cutlist):
    # (assumed helper: its definition is missing from the truncated source;
    # Cut only needs a membership test against the delimiter list)
    return char in cutlist


def Cut(lines, cutlist):
    l = []  # sentence list: holds each complete sentence after splitting
    temp = []  # buffer: collects characters until a delimiter appears, at
               # which point its contents are flushed into l and it is cleared
    for line in lines:
        if FindToken(line, cutlist):  # current character is a sentence delimiter
            temp.append(line)  # keep the delimiter with its sentence
            l.append(''.join(temp))  # flush the buffer into the sentence list
            temp = []
        else:  # not a delimiter: keep buffering
            temp.append(line)
    return l


if __name__ == '__main__':
    input = '0original.utf-8'
    text = IOUtil.load_files([input])
    # print(text)
    sents = []
    # sentence-delimiter characters
    cutlist = "。!?"
    for lines in text:
        l = Cut(list(lines), list(cutlist))
        for line in l:
            if line.strip() != "":
                sents.append(line)
                sents.append('\n')
    IOUtil.save_to_file(sents, 'sentences.utf-8')
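# Expected behavior of Cut, plus one caveat: characters after the last
# delimiter never get flushed out of temp, so a trailing fragment without a
# closing 。/!/? is silently dropped.
#   Cut(list('下雨了。走吧'), list('。!?'))  # -> ['下雨了。']  ('走吧' is lost)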
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author:jason
date:2018/3/17
-------------------------------------------------
Change Activity:2018/3/17:
-------------------------------------------------
"""
from util.io import IOUtil

import sklearn_crfsuite as crf

delimiter = ' '

if __name__ == '__main__':
    input = 'postags.utf-8'
    text = IOUtil.load_files([input])
    print(text)
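# The stub above only loads the tagged data; a minimal sketch of where it is
# headed with sklearn_crfsuite. The helper names, feature set, and
# hyper-parameters below are assumptions for illustration, not the author's.
import sklearn_crfsuite as crf

def to_sentences(lines, delimiter=' '):
    """Group 'word TAG' lines into sentences at the blank-line boundaries."""
    sents, cur = [], []
    for line in lines:
        line = line.strip()
        if not line:
            if cur:
                sents.append(cur)
                cur = []
        else:
            word, tag = line.split(delimiter)
            cur.append((word, tag))
    if cur:
        sents.append(cur)
    return sents

def word_features(sent, i):
    """Very small context window: the word itself and its neighbours."""
    feats = {'bias': 1.0, 'word': sent[i][0]}
    if i > 0:
        feats['-1:word'] = sent[i - 1][0]
    if i < len(sent) - 1:
        feats['+1:word'] = sent[i + 1][0]
    return feats

# sents = to_sentences(text)
# X = [[word_features(s, i) for i in range(len(s))] for s in sents]
# y = [[tag for _, tag in s] for s in sents]
# model = crf.CRF(algorithm='lbfgs', max_iterations=100)
# model.fit(X, y)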