def main(_):
  global vocab
  vocab = gezi.Vocabulary(FLAGS.vocab)

  ifile = sys.argv[1]
  if not gezi.env_has('BSEG'):
    ofile = ifile.replace('.csv', '.seg.jieba.mix.txt')
  else:
    ofile = ifile.replace('.csv', '.seg.bseg.mix.txt')

  counter = WordCounter(most_common=0, min_count=1)
  vocab2 = ifile.replace('.csv', '.pos.mix.vocab')

  # Resume support: if the output file already exists, open it in append
  # mode and remember which ids a previous run has already written.
  ids_set = set()
  fm = 'w'
  if os.path.exists(ofile):
    fm = 'a'
    for line in open(ofile):
      ids_set.add(line.split('\t')[0])
    print('%s already done %d' % (ofile, len(ids_set)))

  num_errs = 0
  with open(ofile, fm) as out:
    df = pd.read_csv(ifile, lineterminator='\n')
    contents = df['content'].values
    ids = df['id'].values
    for i in tqdm(range(len(df)), ascii=True):
      # Skip rows that were already segmented in a previous run.
      if str(ids[i]) in ids_set:
        continue
      try:
        seg(ids[i], contents[i], out, counter)
      except Exception:
        # Print the traceback only for the first failure; afterwards
        # just count errors and keep going.
        if num_errs == 0:
          print(traceback.format_exc())
        num_errs += 1
        continue

  counter.save(vocab2)
  print('num_errs:', num_errs, 'ratio:', num_errs / len(df))
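# A minimal, self-contained sketch of the resume pattern used in main above.
# The file name and helper below are hypothetical; the real script keys on
# the tab-separated id column of its own output file.
import os

def resume_ids(path):
  """Return the set of ids already written to `path` (empty if no file)."""
  if not os.path.exists(path):
    return set()
  with open(path) as f:
    return set(line.split('\t', 1)[0] for line in f)

# done = resume_ids('train.seg.jieba.mix.txt')  # hypothetical output file
# Open with mode 'a' if done else 'w', then skip rows whose str(id) is in done.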
def main(_):
  tokenizer.init(FLAGS.tokenizer_vocab)

  global counter
  counter = WordCounter(write_unknown=FLAGS.write_unknown,
                        most_common=FLAGS.most_common,
                        min_count=FLAGS.min_count)
  global char_counter
  char_counter = WordCounter(write_unknown=FLAGS.write_unknown,
                             most_common=FLAGS.most_common,
                             min_count=FLAGS.min_count)

  run(FLAGS.input)
  if FLAGS.test_input:
    run(FLAGS.test_input, count=FLAGS.test_count)

  vocab_name = FLAGS.vocab_name or 'vocab'
  os.system('mkdir -p %s' % FLAGS.out_dir)
  out_txt = os.path.join(FLAGS.out_dir, '%s.txt' % vocab_name)
  counter.save(out_txt)
  out_txt = os.path.join(FLAGS.out_dir, 'char_%s.txt' % vocab_name)
  char_counter.save(out_txt)
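# For readers without gezi installed: a rough stand-in sketching the
# WordCounter semantics assumed above (accumulate tokens, then save a
# frequency-sorted "word\tcount" vocab, truncated by most_common and
# thresholded by min_count). This mimics the assumed behavior only; it is
# not gezi's implementation.
from collections import Counter

class SimpleWordCounter:
  def __init__(self, most_common=0, min_count=1):
    self.most_common = most_common
    self.min_count = min_count
    self.counts = Counter()

  def add(self, word, count=1):
    self.counts[word] += count

  def save(self, path):
    # most_common(None) returns all items, still sorted by frequency.
    items = self.counts.most_common(self.most_common or None)
    with open(path, 'w') as f:
      for word, count in items:
        if count >= self.min_count:
          f.write('%s\t%d\n' % (word, count))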
import sys
import traceback

import pandas as pd
from tqdm import tqdm

import gezi
from gezi import WordCounter
from projects.ai2018.sentiment.prepare import filter

from absl import flags  # the original project may bind these via tf.app.flags
FLAGS = flags.FLAGS

START_WORD = '<S>'
END_WORD = '</S>'

counter = WordCounter(most_common=0, min_count=1)
counter2 = WordCounter(most_common=0, min_count=1)

print('seg_method:', FLAGS.seg_method, file=sys.stderr)

if gezi.env_has('SENTENCE_PIECE'):
  assert FLAGS.sp_path
  gezi.segment.init_sp(FLAGS.sp_path)

def seg(id, text, out, type):
  text = filter.filter(text)
  counter.add(START_WORD)
  counter.add(END_WORD)
  l = gezi.cut(text, type)
  if type != 'word':
import sys
import os

import numpy as np

import gezi
from gezi import WordCounter, Segmentor

from absl import flags  # the original project may bind these via tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_integer("most_common", 0, "if > 0 keep only the most_common words in the vocab")
flags.DEFINE_integer("min_count", 0, "if > 0 drop words seen fewer than min_count times")
flags.DEFINE_integer("max_lines", 0, "")
flags.DEFINE_boolean("add_unknown", True, "treat ignored words as unknown")
flags.DEFINE_boolean("save_count_info", True, "save count info to bin")
flags.DEFINE_string("out_dir", './mount/temp/ai2018/sentiment/', "directory the vocab files are written to")
flags.DEFINE_string("vocab_name", None, "")
flags.DEFINE_string('seg_method', 'basic_single_all', '')

# These asserts read FLAGS at import time, so they assume the flags have
# already been parsed by the entry point.
assert FLAGS.most_common > 0 or FLAGS.min_count > 0
assert FLAGS.seg_method

counter = WordCounter(most_common=FLAGS.most_common,
                      min_count=FLAGS.min_count)

segmentor = Segmentor()
print(segmentor, file=sys.stderr)

START_WORD = '<S>'
END_WORD = '</S>'
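# Sketch of a standalone absl entry point for the flags above (hypothetical
# script name; the asserts only hold once app.run has parsed argv), e.g.
#   python gen_vocab.py --min_count=5 --seg_method=basic_single_all
from absl import app

def _check_flags(_):
  assert FLAGS.most_common > 0 or FLAGS.min_count > 0
  assert FLAGS.seg_method

if __name__ == '__main__':
  app.run(_check_flags)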
def main(_):
  tokenizer.init(FLAGS.tokenizer_vocab)
  if FLAGS.full_tokenizer:
    gezi.segment.init_spacy_full()

  os.system('mkdir -p %s' % FLAGS.out_dir)
  print('name', FLAGS.name, 'out_dir', FLAGS.out_dir)

  global counter
  counter = WordCounter(write_unknown=FLAGS.write_unknown,
                        most_common=FLAGS.most_common,
                        min_count=FLAGS.min_count)
  global char_counter
  char_counter = WordCounter(write_unknown=FLAGS.write_unknown,
                             most_common=FLAGS.most_common,
                             min_count=FLAGS.min_count)
  global ngram_counter
  ngram_counter = WordCounter(write_unknown=True, min_count=FLAGS.min_count)
  global pos_counter, tag_counter, ner_counter
  pos_counter = WordCounter(write_unknown=True, min_count=1)
  tag_counter = WordCounter(write_unknown=True, min_count=1)
  ner_counter = WordCounter(write_unknown=True, min_count=1)

  run(FLAGS.input)
  if FLAGS.test_input and not FLAGS.name:
    run(FLAGS.test_input, count=FLAGS.test_count)

  if not FLAGS.name:
    vocab_name = FLAGS.vocab_name or 'vocab'
    out_txt = os.path.join(FLAGS.out_dir, '%s.txt' % vocab_name)
    counter.save(out_txt)
    out_txt = os.path.join(FLAGS.out_dir, 'char_%s.txt' % vocab_name)
    char_counter.save(out_txt)
    out_txt = os.path.join(FLAGS.out_dir, 'pos_vocab.txt')
    pos_counter.save(out_txt)
    out_txt = os.path.join(FLAGS.out_dir, 'tag_vocab.txt')
    tag_counter.save(out_txt)
    out_txt = os.path.join(FLAGS.out_dir, 'ner_vocab.txt')
    ner_counter.save(out_txt)

    out_txt = os.path.join(FLAGS.out_dir, 'ngram_vocab.txt')
    if not FLAGS.max_ngrams:
      ngram_counter.save(out_txt)
    else:
      # Save the full ngram vocab, then keep only the top max_ngrams lines
      # (e.g. head -n 200000 ngram_vocab.full.txt > ngram_vocab.txt).
      out_full_txt = os.path.join(FLAGS.out_dir, 'ngram_vocab.full.txt')
      ngram_counter.save(out_full_txt)
      os.system('head -n %d %s > %s' % (FLAGS.max_ngrams, out_full_txt, out_txt))
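# Portable alternative to the `head` shell-out above (a sketch; it relies on
# WordCounter.save writing one entry per line, most frequent first).
def truncate_vocab(full_path, out_path, max_lines):
  """Copy the first max_lines lines of full_path to out_path."""
  with open(full_path) as fin, open(out_path, 'w') as fout:
    for i, line in enumerate(fin):
      if i >= max_lines:
        break
      fout.write(line)

# truncate_vocab(out_full_txt, out_txt, FLAGS.max_ngrams)  # same effect as head -n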