def main(_):
    # FLAGS.seg_method = 'basic_digit'
    # FLAGS.feed_single = True
    # FLAGS.feed_single_en = True
    # print('seg_method:', FLAGS.seg_method, file=sys.stderr)
    # print('feed_single:', FLAGS.feed_single, file=sys.stderr)
    # print('feed_single_en:', FLAGS.feed_single_en, file=sys.stderr)

    #assert FLAGS.vocab

    global vocab
    vocab = gezi.Vocabulary(FLAGS.vocab)

    ifile = sys.argv[1]
    if not gezi.env_has('BSEG'):
        ofile = ifile.replace('.csv', '.seg.jieba.mix.txt')
    else:
        ofile = ifile.replace('.csv', '.seg.bseg.mix.txt')

    counter = WordCounter(most_common=0, min_count=1)
    vocab2 = ifile.replace('.csv', '.pos.mix.vocab')

    ids_set = set()
    fm = 'w'
    if os.path.exists(ofile):
        fm = 'a'
        for line in open(ofile):
            ids_set.add(line.split('\t')[0])

    print('%s already done %d' % (ofile, len(ids_set)))

    num_errs = 0
    with open(ofile, fm) as out:
        df = pd.read_csv(ifile, lineterminator='\n')
        contents = df['content'].values
        ids = df['id'].values
        for i in tqdm(range(len(df)), ascii=True):
            #if str(ids[i]) in ids_set:
            #  continue
            #if i != 2333:
            #  continue
            #print(gezi.cut(filter.filter(contents[i]), type_))
            try:
                seg(ids[i], contents[i], out, counter)
            except Exception:
                if num_errs == 0:
                    print(traceback.format_exc())
                num_errs += 1
                continue
            #exit(0)

    counter.save(vocab2)
    print('num_errs:', num_errs, 'ratio:', num_errs / len(df))
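
# The snippets in this file rely on gezi.WordCounter without showing its
# implementation. The class below is a minimal, hypothetical stand-in (an
# assumption, not the real gezi API) illustrating the behavior the code above
# depends on: add() accumulates token counts, save() writes a frequency-sorted
# vocab file honoring most_common / min_count.
from collections import Counter

class SimpleWordCounter:
    """Hypothetical minimal sketch of the WordCounter behavior assumed above."""

    def __init__(self, most_common=0, min_count=1, write_unknown=False):
        self.most_common = most_common
        self.min_count = min_count
        self.write_unknown = write_unknown
        self.counts = Counter()

    def add(self, token, count=1):
        # Accumulate occurrences of a token.
        self.counts[token] += count

    def save(self, path):
        # Write tokens sorted by frequency, keeping only the top most_common
        # entries (all if most_common == 0) with count >= min_count.
        items = self.counts.most_common(self.most_common or None)
        with open(path, 'w', encoding='utf8') as f:
            for token, count in items:
                if count >= self.min_count:
                    f.write('%s\t%d\n' % (token, count))
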
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)
    global counter
    counter = WordCounter(write_unknown=FLAGS.write_unknown,
                          most_common=FLAGS.most_common,
                          min_count=FLAGS.min_count)

    global char_counter
    char_counter = WordCounter(write_unknown=FLAGS.write_unknown,
                               most_common=FLAGS.most_common,
                               min_count=FLAGS.min_count)

    run(FLAGS.input)
    if FLAGS.test_input:
        run(FLAGS.test_input, count=FLAGS.test_count)

    vocab_name = FLAGS.vocab_name or 'vocab'
    os.system('mkdir -p %s' % FLAGS.out_dir)
    out_txt = os.path.join(FLAGS.out_dir, '%s.txt' % vocab_name)
    counter.save(out_txt)

    out_txt = os.path.join(FLAGS.out_dir, 'char_%s.txt' % vocab_name)
    char_counter.save(out_txt)
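
# run() above is not shown in these snippets. The function below is a
# hypothetical reconstruction for illustration only, based on the pattern of the
# first snippet (pd.read_csv over a 'content' column, segmenting with gezi.cut);
# the name run_sketch and every detail of the body are assumptions. It reuses
# the module-level counter / char_counter globals set up in main().
def run_sketch(ifile, count=1):
    df = pd.read_csv(ifile, lineterminator='\n')
    for content in tqdm(df['content'].values, ascii=True):
        words = gezi.cut(filter.filter(content), FLAGS.seg_method)
        for word in words:
            counter.add(word, count)
            for ch in word:
                char_counter.add(ch, count)
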
import gezi

#assert gezi.env_has('JIEBA_POS')
from gezi import WordCounter 

import pandas as pd

from projects.ai2018.sentiment.prepare import filter

from tqdm import tqdm
import traceback

START_WORD = '<S>'
END_WORD = '</S>'

counter = WordCounter(most_common=0, min_count=1)
counter2 = WordCounter(most_common=0, min_count=1)

print('seg_method:', FLAGS.seg_method, file=sys.stderr)

if gezi.env_has('SENTENCE_PIECE'):
  assert FLAGS.sp_path 
  gezi.segment.init_sp(FLAGS.sp_path)
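
# gezi.segment.init_sp() presumably wraps the SentencePiece Python API. Below is
# a minimal hedged sketch of such an initialization (the name init_sp_sketch is
# hypothetical; FLAGS.sp_path is assumed to point at a trained .model file):
import sentencepiece as spm

def init_sp_sketch(sp_path):
    sp = spm.SentencePieceProcessor()
    sp.Load(sp_path)  # load the trained sentencepiece model
    return sp
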

def seg(id, text, out, type):
  text = filter.filter(text)
  counter.add(START_WORD)
  counter.add(END_WORD)
  l = gezi.cut(text, type)

  if type != 'word':
flags.DEFINE_integer("most_common", 0, "if > 0 then get vocab with most_common words")
flags.DEFINE_integer("min_count", 0, "if > 0 then cut by min_count")
flags.DEFINE_integer("max_lines", 0, "")
flags.DEFINE_boolean("add_unknown", True, "treat ignored words as unknow")
flags.DEFINE_boolean("save_count_info", True, "save count info to bin")
flags.DEFINE_string("out_dir", './mount/temp/ai2018/sentiment/', "save count info to bin")
flags.DEFINE_string("vocab_name", None, "")
flags.DEFINE_string('seg_method', 'basic_single_all', '')

assert FLAGS.most_common > 0 or FLAGS.min_count > 0
assert FLAGS.seg_method

from gezi import WordCounter 

counter = WordCounter(
    most_common=FLAGS.most_common,
    min_count=FLAGS.min_count)

import sys,os
#reload(sys)
#sys.setdefaultencoding('utf8')
import numpy as np

from gezi import Segmentor
segmentor = Segmentor()
print(segmentor, file=sys.stderr)

import gezi

START_WORD = '<S>'
END_WORD = '</S>'
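
# A short hedged illustration of how the <S>/</S> boundary tokens above are used
# when building the vocab (this mirrors the seg() pattern earlier in this file;
# the helper name and signature are assumptions for illustration only).
def count_with_boundaries(words, word_counter):
    word_counter.add(START_WORD)
    for w in words:
        word_counter.add(w)
    word_counter.add(END_WORD)
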
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)
    if FLAGS.full_tokenizer:
        gezi.segment.init_spacy_full()

    os.system('mkdir -p %s' % FLAGS.out_dir)

    print('name', FLAGS.name, 'out_dir', FLAGS.out_dir)

    global counter
    counter = WordCounter(write_unknown=FLAGS.write_unknown,
                          most_common=FLAGS.most_common,
                          min_count=FLAGS.min_count)

    global char_counter
    char_counter = WordCounter(write_unknown=FLAGS.write_unknown,
                               most_common=FLAGS.most_common,
                               min_count=FLAGS.min_count)

    global ngram_counter
    ngram_counter = WordCounter(write_unknown=True, min_count=FLAGS.min_count)

    global pos_counter, tag_counter, ner_counter
    pos_counter = WordCounter(write_unknown=True, min_count=1)
    tag_counter = WordCounter(write_unknown=True, min_count=1)
    ner_counter = WordCounter(write_unknown=True, min_count=1)

    run(FLAGS.input)

    if FLAGS.test_input and not FLAGS.name:
        run(FLAGS.test_input, count=FLAGS.test_count)

    if not FLAGS.name:
        vocab_name = FLAGS.vocab_name or 'vocab'
        os.system('mkdir -p %s' % FLAGS.out_dir)
        out_txt = os.path.join(FLAGS.out_dir, '%s.txt' % vocab_name)
        counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'char_%s.txt' % vocab_name)
        char_counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'pos_vocab.txt')
        pos_counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'tag_vocab.txt')
        tag_counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'ner_vocab.txt')
        ner_counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'ngram_vocab.txt')
        if not FLAGS.max_ngrams:
            ngram_counter.save(out_txt)
        else:
            # if only the most frequent ngrams are needed later, e.g.: head -n 200000 ngram_vocab.full.txt > ngram_vocab.txt
            out_full_txt = os.path.join(FLAGS.out_dir, 'ngram_vocab.full.txt')
            ngram_counter.save(out_full_txt)
            os.system('head -n %d %s > %s' %
                      (FLAGS.max_ngrams, out_full_txt, out_txt))
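
# A portable, pure-Python alternative to the `head -n` shell call above (a
# sketch under the assumption that keeping the first max_ngrams lines of the
# frequency-sorted full vocab is the desired behavior):
def truncate_vocab(full_path, out_path, max_lines):
    with open(full_path, encoding='utf8') as fin, \
         open(out_path, 'w', encoding='utf8') as fout:
        for i, line in enumerate(fin):
            if i >= max_lines:
                break
            fout.write(line)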