def multi_word_cut(self, sentences):
    """Tokenize every sentence in *sentences* using a process pool.

    For Chinese (``self.language == 'ch'``) jieba segments each line;
    otherwise lines are split on single spaces and lower-cased.  In both
    cases digits and ``self.stop_words`` entries are dropped.

    :param sentences: iterable of raw text lines.
    :return: list of token lists, one per input sentence.
    """
    print('Multiprocessing Word cut ')
    if self.language == 'ch':
        # Initialize jieba once in the parent so the dictionary is loaded
        # before the pool forks, and disable jieba's own parallel mode
        # because we manage our own worker pool here.
        jieba.initialize()
        jieba.disable_parallel()

        def func(line):
            tokens = [i.strip() for i in jieba.cut(line, cut_all=False)]
            # Filter digits and stop words.  Also drop empty tokens:
            # whitespace-only pieces become '' after strip() and would
            # otherwise pass the digit/stop-word checks.
            return [i for i in tokens
                    if i and not i.isdigit() and i not in self.stop_words]
    else:
        def func(line):
            # Keep tokens longer than one character that are neither
            # digits nor stop words; lower-case the survivors.
            return [i.lower() for i in line.split(" ")
                    if not i.isdigit()
                    and i not in self.stop_words
                    and len(i) > 1]

    pool = Pool(nodes=5)  # pathos pool: can pickle the closure above
    t0 = time.time()
    word_cut = pool.map(func, sentences)
    pool.close()
    pool.join()
    pool.clear()  # release the cached pathos pool state
    print('MultiProcess time {:.0f}'.format(time.time() - t0))
    return word_cut
def map_get_words(txts, kind="char", return_type="str"):
    """Apply ``get_words`` to a batch of texts.

    :param txts: either a list of texts, or a path to a file containing
        one text per line.
    :param kind: ``"word"`` loads jieba_fast (plus the fastText user
        dictionary) for word-level segmentation; any other value leaves
        ``jieba`` as ``None`` (char-level handling inside ``get_words``).
    :param return_type: forwarded to ``get_words`` unchanged.
    :return: list of ``get_words`` results, one per input text.
    """
    if isinstance(txts, str):
        # A string argument is a file path; read one text per line.
        # Explicit UTF-8 avoids locale-dependent decoding of the corpus.
        with open(txts, "r", encoding="utf-8") as f:
            txts = [row.strip() for row in f]
    jieba = None
    if kind == "word":
        # Only word-level segmentation needs the tokenizer; initialize it
        # once here rather than per text.
        import jieba_fast as jieba
        jieba.initialize()
        jieba.load_userdict("dict_fasttext.txt")
    return [get_words(txt, kind, return_type, jieba) for txt in txts]
dataset = dataset.shuffle(batch_size * 1000) # 打乱 dataset = dataset.batch(batch_size) # 成批 return dataset if __name__ == '__main__': # 使用测试 from bert4keras.tokenizer import Tokenizer import json, glob, re import jieba_fast as jieba from tqdm import tqdm jieba.initialize() dict_path = '/home/spaces_ac_cn/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt' tokenizer = Tokenizer(dict_path) def some_texts(): for _ in range(2): # 数据重复两遍 filenames = glob.glob('/home/spaces_ac_cn/corpus/*/*/*') np.random.shuffle(filenames) for filename in filenames: with open(filename) as f: for l in f: l = json.loads(l)['text'].strip() yield re.findall(u'.*?[\n。]+', l) def word_segment(text): return jieba.lcut(text)
    # (inside an `if args.pos:` branch whose header precedes this chunk)
    # POS-tagging mode: emit each word joined to its part-of-speech tag
    # by the user-chosen delimiter.
    import jieba_fast.posseg
    posdelim = args.pos

    def cutfunc(sentence, _, HMM=True):
        # `_` swallows the cut_all flag, which posseg.cut does not take.
        for w, f in jieba_fast.posseg.cut(sentence, HMM):
            yield w + posdelim + f
else:
    # Plain segmentation: jieba.cut(sentence, cut_all, HMM).
    cutfunc = jieba.cut

delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
# Read from the given file, or stdin when no filename was passed.
fp = open(args.filename, 'r') if args.filename else sys.stdin

# Load the main dictionary (custom path if given), then any user dict.
if args.dict:
    jieba.initialize(args.dict)
else:
    jieba.initialize()
if args.user_dict:
    jieba.load_userdict(args.user_dict)

ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')  # NOTE(review): `l` is unused — the join below recomputes rstrip
    result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
    if PY2:
        # Python 2 prints bytes; encode with the terminal's encoding.
        result = result.encode(default_encoding)
    print(result)
    ln = fp.readline()
fp.close()