def main(_):
  text2ids.init(FLAGS.vocab_)
  print('to_lower:', FLAGS.to_lower, 'feed_single_en:', FLAGS.feed_single_en, 'seg_method:', FLAGS.seg_method)
  # sanity checks: make sure text -> ids -> text round trips as expected
  print(text2ids.ids2text(text2ids_('傻逼脑残B')))
  print(text2ids_('傻逼脑残B'))
  print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

  if os.path.isfile(FLAGS.input):
    build_features(FLAGS.input)
  else:
    files = glob.glob(FLAGS.input + '/*')
    # pass cpu_count() explicitly; some machines may not use it as the Pool default
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(build_features, files)
    pool.close()
    pool.join()

  print('num_records:', counter.value)

  mode = get_mode(FLAGS.input)
  os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)

  print('mean words:', total_words.value / counter.value)
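# A minimal, self-contained sketch of the process-safe counters this script
# relies on (`counter`, `total_words`): each Pool worker increments a shared,
# lock-protected value created before the pool forks. The class and method
# names below are illustrative assumptions, not this repo's actual helpers.
import multiprocessing

class SharedCounter(object):
  def __init__(self):
    # 'i' = C int; created in the parent so forked workers share it
    self._val = multiprocessing.Value('i', 0)

  def add(self, n=1):
    # get_lock() serializes increments across worker processes
    with self._val.get_lock():
      self._val.value += n

  @property
  def value(self):
    return self._val.value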
def main(_):
  FLAGS.seg_method = 'basic_digit'
  FLAGS.feed_single = True
  FLAGS.feed_single_en = True
  print('seg_method:', FLAGS.seg_method, file=sys.stderr)
  print('feed_single:', FLAGS.feed_single, file=sys.stderr)
  print('feed_single_en:', FLAGS.feed_single_en, file=sys.stderr)

  assert FLAGS.vocab
  text2ids.init(FLAGS.vocab)

  counter = WordCounter(most_common=0, min_count=1)

  ifile = sys.argv[1]
  vocab2 = ifile.replace('.csv', '.pos.mix.vocab')
  if not gezi.env_has('BSEG'):
    ofile = ifile.replace('.csv', '.seg.mix.txt')
  else:
    ofile = ifile.replace('.csv', '.seg.bseg.mix.txt')

  # resume support: collect ids already written so a rerun appends only new rows
  ids_set = set()
  fm = 'w'
  if os.path.exists(ofile):
    fm = 'a'
    for line in open(ofile):
      ids_set.add(line.split('\t')[0])
    print('%s already done: %d ids' % (ofile, len(ids_set)))

  num_errs = 0
  with open(ofile, fm) as out:
    df = pd.read_csv(ifile, lineterminator='\n')
    contents = df['content'].values
    ids = df['id'].values
    for i in tqdm(range(len(df)), ascii=True):
      if str(ids[i]) in ids_set:
        continue
      try:
        seg(ids[i], contents[i], out, counter)
      except Exception:
        num_errs += 1
        continue

  counter.save(vocab2)
  print('num_errs:', num_errs, 'ratio:', num_errs / len(df))
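# The resume pattern used above, isolated for clarity: the output file is
# keyed by id in the first tab-separated column, so a rerun re-reads it,
# skips finished ids, and appends the rest. All names in this sketch are
# illustrative; the real loop lives in main() above.
import os

def process_resumable(rows, ofile, handle):
  done = set()
  mode = 'w'
  if os.path.exists(ofile):
    mode = 'a'  # append: earlier output is kept, not overwritten
    with open(ofile) as f:
      done = set(line.split('\t')[0] for line in f)
  with open(ofile, mode) as out:
    for rid, content in rows:
      if str(rid) in done:
        continue
      handle(rid, content, out)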
def main(_):
  text2ids.init(FLAGS.vocab_)
  print('to_lower:', FLAGS.to_lower, 'feed_single_en:', FLAGS.feed_single_en, 'seg_method:', FLAGS.seg_method)
  print(text2ids.ids2text(text2ids_('傻逼脑残B')))
  print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

  global df
  df = pd.read_csv(FLAGS.input, lineterminator='\n')

  mode = get_mode(FLAGS.input)
  # valid/test/dev/pm splits go into a single record file
  if mode in ['valid', 'test', 'dev', 'pm']:
    FLAGS.num_records_ = 1
  print('num record files to gen:', FLAGS.num_records_)

  pool = multiprocessing.Pool()
  pool.map(build_features, range(FLAGS.num_records_))
  pool.close()
  pool.join()

  print('num_records:', counter.value)

  os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)

  print('mean words:', total_words.value / counter.value)
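# How build_features(index) is assumed to shard work here: each pool worker
# receives a shard index and handles the rows of the global df where
# row % num_records == index, writing one record file per shard. This is a
# hedged sketch; the real build_features is defined elsewhere in this script
# and may partition differently.
def build_features_sketch(index, num_records, df, write_record):
  for i in range(len(df)):
    if i % num_records != index:
      continue  # row belongs to another shard
    write_record(index, df.iloc[i])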
import sys, os

import numpy as np
import pandas as pd

import melt
import gezi
from gezi import Segmentor
from wenzheng.utils import text2ids
from text2ids import text2ids as text2ids_

# FLAGS was used below without an import; assuming the absl/tf flags object
# used elsewhere in this repo
from absl import flags
FLAGS = flags.FLAGS

segmentor = Segmentor()

vocab = FLAGS.vocab_
text2ids.init(vocab)

START_WORD = '<S>'
END_WORD = '</S>'

FLAGS.seg_method = 'basic_digit'
print('seg_method:', FLAGS.seg_method, file=sys.stderr)

def seg(text, out):
  words = text2ids.ids2words(text2ids_(text))
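# Minimal usage sketch of the helpers above (hedged: the sample text is a
# placeholder, and this assumes text2ids.init has already loaded a real vocab
# via FLAGS.vocab_):
if __name__ == '__main__':
  sample = '喜欢玩孙尚香的加我好友'
  ids = text2ids_(sample)
  print('ids:', ids, file=sys.stderr)
  print('words:', text2ids.ids2words(ids), file=sys.stderr)  # segmented tokens
  print('text:', text2ids.ids2text(ids), file=sys.stderr)    # re-joined text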