# Example #1
def main(_):
    """Build features from FLAGS.input and write the record count.

    NOTE(review): relies on module-level names (FLAGS, text2ids, text2ids_,
    build_features, get_mode, counter, total_words, gezi) defined elsewhere
    in this file.
    """
    text2ids.init(FLAGS.vocab_)
    print('to_lower:', FLAGS.to_lower, 'feed_single_en:', FLAGS.feed_single_en,
          'seg_method', FLAGS.seg_method)
    # Tokenizer round-trip sanity checks on a couple of sample strings.
    print(text2ids.ids2text(text2ids_('傻逼脑残B')))
    print(text2ids_('傻逼脑残B'))
    print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

    if not os.path.isfile(FLAGS.input):
        # Directory input: fan per-file work out across all cores.
        input_files = glob.glob(FLAGS.input + '/*')
        worker_pool = multiprocessing.Pool(multiprocessing.cpu_count())
        worker_pool.map(build_features, input_files)
        worker_pool.close()
        worker_pool.join()
    else:
        build_features(FLAGS.input)

    # for safe some machine might not use cpu count as default ...
    print('num_records:', counter.value)
    split = get_mode(FLAGS.input)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), split))
    count_path = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(split)
    gezi.write_to_txt(counter.value, count_path)

    print('mean words:', total_words.value / counter.value)
# Example #2
def main(_):
  """Segment one csv of contents into a .seg.*.txt file and save a vocab.

  Reads the input csv path from sys.argv[1], segments each row's
  'content' with text2ids, appends results to the output file (resuming
  past already-processed ids), and saves the accumulated word counts.

  Fixes: `ifile` was read (to build `vocab2`) before it was assigned
  from sys.argv[1], raising NameError on every run; the assignment now
  precedes its first use.  `assert FLAGS.vocab` is likewise checked
  before the flag is passed to text2ids.init.
  """
  FLAGS.seg_method = 'basic_digit'
  FLAGS.feed_single = True
  FLAGS.feed_single_en = True
  print('seg_method:', FLAGS.seg_method, file=sys.stderr)
  print('feed_single:', FLAGS.feed_single, file=sys.stderr)
  print('feed_single_en:', FLAGS.feed_single_en, file=sys.stderr)

  # Validate the vocab flag before initializing the tokenizer with it.
  assert FLAGS.vocab
  text2ids.init(FLAGS.vocab)

  counter = WordCounter(most_common=0, min_count=1)

  # Input csv path must be known before deriving output file names.
  ifile = sys.argv[1]
  vocab2 = ifile.replace('.csv', '.pos.mix.vocab')

  if not gezi.env_has('BSEG'):
    ofile = ifile.replace('.csv', '.seg.mix.txt')
  else:
    ofile = ifile.replace('.csv', '.seg.bseg.mix.txt')

  # Resume support: ids already present in the output file are skipped
  # and new results are appended instead of overwriting.
  ids_set = set()
  fm = 'w'
  if os.path.exists(ofile):
    fm = 'a'
    for line in open(ofile):
      ids_set.add(line.split('\t')[0])

  print('%s already done %d' % (ofile, len(ids_set)))

  num_errs = 0
  with open(ofile, fm) as out:
    df = pd.read_csv(ifile, lineterminator='\n')
    contents = df['content'].values
    ids = df['id'].values
    for i in tqdm(range(len(df)), ascii=True):
      if str(ids[i]) in ids_set:
        continue
      try:
        seg(ids[i], contents[i], out, counter)
      except Exception:
        # Best-effort: count the failure but keep segmenting the rest.
        num_errs += 1
        continue

  counter.save(vocab2)
  print('num_errs:', num_errs, 'ratio:', num_errs / len(df))
def main(_):
    """Read FLAGS.input into a shared DataFrame and build record shards.

    NOTE(review): depends on module-level names (FLAGS, text2ids,
    text2ids_, build_features, get_mode, counter, total_words, gezi)
    defined elsewhere in this file; workers read the global `df`.
    """
    text2ids.init(FLAGS.vocab_)
    print('to_lower:', FLAGS.to_lower, 'feed_single_en:', FLAGS.feed_single_en,
          'seg_method', FLAGS.seg_method)
    # Tokenizer round-trip sanity checks.
    print(text2ids.ids2text(text2ids_('傻逼脑残B')))
    print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

    # Worker processes access the frame through this module-level global.
    global df
    df = pd.read_csv(FLAGS.input, lineterminator='\n')

    split = get_mode(FLAGS.input)

    workers = multiprocessing.Pool()

    # Evaluation-style splits always go into a single record file.
    if split in ['valid', 'test', 'dev', 'pm']:
        FLAGS.num_records_ = 1

    print('num records file to gen', FLAGS.num_records_)

    workers.map(build_features, range(FLAGS.num_records_))
    workers.close()
    workers.join()

    # for safe some machine might not use cpu count as default ...
    print('num_records:', counter.value)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), split))
    out_path = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(split)
    gezi.write_to_txt(counter.value, out_path)

    print('mean words:', total_words.value / counter.value)
import sys, os
import numpy as np
import melt

from gezi import Segmentor
# Module-level segmentor instance shared by the functions in this file.
segmentor = Segmentor()

import gezi

import pandas as pd

from wenzheng.utils import text2ids

# NOTE(review): FLAGS is referenced here but never imported in this chunk;
# presumably it is defined via melt/absl flags earlier in the file — confirm.
vocab = FLAGS.vocab_
# Initialize the tokenizer once at import time so helpers can use it.
text2ids.init(vocab)

from text2ids import text2ids as text2ids_

#import filter

# Sentence boundary markers used when emitting token/id sequences.
START_WORD = '<S>'
END_WORD = '</S>'

# Force digit-aware basic segmentation for this script.
FLAGS.seg_method = 'basic_digit'
print('seg_method:', FLAGS.seg_method, file=sys.stderr)


def seg(text, out):
    #text = filter.filter(text)
    words = text2ids.ids2words(text2ids_(text))