def main(_):
  text2ids.init(FLAGS.vocab_)
  print('to_lower:', FLAGS.to_lower, 'feed_single_en:', FLAGS.feed_single_en, 'seg_method:', FLAGS.seg_method)
  # sanity checks: make sure text -> ids -> text round trips as expected
  print(text2ids.ids2text(text2ids_('傻逼脑残B')))
  print(text2ids_('傻逼脑残B'))
  print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

  if os.path.isfile(FLAGS.input):
    build_features(FLAGS.input)
  else:
    files = glob.glob(FLAGS.input + '/*')
    # pass cpu_count() explicitly; some machines may not use it as the Pool default
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(build_features, files)
    pool.close()
    pool.join()

  print('num_records:', counter.value)

  mode = get_mode(FLAGS.input)
  os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)

  print('mean words:', total_words.value / counter.value)
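# A minimal, self-contained sketch of the process-safe counters this script
# relies on (`counter`, `total_words`): each Pool worker increments a shared,
# lock-protected value created before the pool forks. The class and method
# names below are illustrative assumptions, not this repo's actual helpers.
import multiprocessing

class SharedCounter(object):
  def __init__(self):
    # 'i' = C int; created in the parent so forked workers share it
    self._val = multiprocessing.Value('i', 0)

  def add(self, n=1):
    # get_lock() serializes increments across worker processes
    with self._val.get_lock():
      self._val.value += n

  @property
  def value(self):
    return self._val.value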
def main(_):
  FLAGS.seg_method = 'basic_digit'
  FLAGS.feed_single = True
  FLAGS.feed_single_en = True
  print('seg_method:', FLAGS.seg_method, file=sys.stderr)
  print('feed_single:', FLAGS.feed_single, file=sys.stderr)
  print('feed_single_en:', FLAGS.feed_single_en, file=sys.stderr)

  assert FLAGS.vocab
  text2ids.init(FLAGS.vocab)

  counter = WordCounter(most_common=0, min_count=1)

  ifile = sys.argv[1]
  vocab2 = ifile.replace('.csv', '.pos.mix.vocab')
  if not gezi.env_has('BSEG'):
    ofile = ifile.replace('.csv', '.seg.mix.txt')
  else:
    ofile = ifile.replace('.csv', '.seg.bseg.mix.txt')

  # resume support: collect ids already written so a rerun appends only new rows
  ids_set = set()
  fm = 'w'
  if os.path.exists(ofile):
    fm = 'a'
    for line in open(ofile):
      ids_set.add(line.split('\t')[0])
    print('%s already done: %d ids' % (ofile, len(ids_set)))

  num_errs = 0
  with open(ofile, fm) as out:
    df = pd.read_csv(ifile, lineterminator='\n')
    contents = df['content'].values
    ids = df['id'].values
    for i in tqdm(range(len(df)), ascii=True):
      if str(ids[i]) in ids_set:
        continue
      try:
        seg(ids[i], contents[i], out, counter)
      except Exception:
        num_errs += 1
        continue

  counter.save(vocab2)
  print('num_errs:', num_errs, 'ratio:', num_errs / len(df))
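# The resume pattern used above, isolated for clarity: the output file is
# keyed by id in the first tab-separated column, so a rerun re-reads it,
# skips finished ids, and appends the rest. All names in this sketch are
# illustrative; the real loop lives in main() above.
import os

def process_resumable(rows, ofile, handle):
  done = set()
  mode = 'w'
  if os.path.exists(ofile):
    mode = 'a'  # append: earlier output is kept, not overwritten
    with open(ofile) as f:
      done = set(line.split('\t')[0] for line in f)
  with open(ofile, mode) as out:
    for rid, content in rows:
      if str(rid) in done:
        continue
      handle(rid, content, out)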
def main(_):
  text2ids.init(FLAGS.vocab_)
  print('to_lower:', FLAGS.to_lower, 'feed_single_en:', FLAGS.feed_single_en, 'seg_method:', FLAGS.seg_method)
  print(text2ids.ids2text(text2ids_('傻逼脑残B')))
  print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

  global df
  df = pd.read_csv(FLAGS.input, lineterminator='\n')

  mode = get_mode(FLAGS.input)
  # valid/test/dev/pm splits go into a single record file
  if mode in ['valid', 'test', 'dev', 'pm']:
    FLAGS.num_records_ = 1
  print('num record files to gen:', FLAGS.num_records_)

  pool = multiprocessing.Pool()
  pool.map(build_features, range(FLAGS.num_records_))
  pool.close()
  pool.join()

  print('num_records:', counter.value)

  os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)

  print('mean words:', total_words.value / counter.value)
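# How build_features(index) is assumed to shard work here: each pool worker
# receives a shard index and handles the rows of the global df where
# row % num_records == index, writing one record file per shard. This is a
# hedged sketch; the real build_features is defined elsewhere in this script
# and may partition differently.
def build_features_sketch(index, num_records, df, write_record):
  for i in range(len(df)):
    if i % num_records != index:
      continue  # row belongs to another shard
    write_record(index, df.iloc[i])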
import sys, os

import numpy as np
import pandas as pd

import melt
import gezi
from gezi import Segmentor
from wenzheng.utils import text2ids
from text2ids import text2ids as text2ids_

# FLAGS was used below without an import; assuming the absl/tf flags object
# used elsewhere in this repo
from absl import flags
FLAGS = flags.FLAGS

segmentor = Segmentor()

vocab = FLAGS.vocab_
text2ids.init(vocab)

START_WORD = '<S>'
END_WORD = '</S>'

FLAGS.seg_method = 'basic_digit'
print('seg_method:', FLAGS.seg_method, file=sys.stderr)

def seg(text, out):
  words = text2ids.ids2words(text2ids_(text))
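# Minimal usage sketch of the helpers above (hedged: the sample text is a
# placeholder, and this assumes text2ids.init has already loaded a real vocab
# via FLAGS.vocab_):
if __name__ == '__main__':
  sample = '喜欢玩孙尚香的加我好友'
  ids = text2ids_(sample)
  print('ids:', ids, file=sys.stderr)
  print('words:', text2ids.ids2words(ids), file=sys.stderr)  # segmented tokens
  print('text:', text2ids.ids2text(ids), file=sys.stderr)    # re-joined text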