import nowarning
from libword_counter import WordCounter

# Vocabulary counter configured entirely from command-line flags.
# NOTE(review): FLAGS is not defined in this fragment; it is assumed to be
# set up earlier in the file.
counter = WordCounter(
    addUnknown=FLAGS.add_unknown,
    mostCommon=FLAGS.most_common,
    minCount=FLAGS.min_count,
    saveCountInfo=FLAGS.save_count_info)

import sys, os
import numpy as np

import melt
import conf
from conf import IMAGE_FEATURE_LEN

from gezi import Segmentor
segmentor = Segmentor()

START_WORD = '<S>'
END_WORD = '</S>'

print('seg_method:', FLAGS.seg_method, file=sys.stderr)
print('min_count:', FLAGS.min_count, 'most_common:', FLAGS.most_common)


def deal(text, num):
    """Lower-case and segment one text, logging a sample periodically.

    Args:
        text: the raw text to process.
        num: running item index; when it is a multiple of 10000 the
            lower-cased text, its '|'-joined words and the word count are
            printed to stderr.

    Returns:
        None. Texts starting with 'other/' or 'ÆäËû/' are skipped without
        being segmented.
    """
    # A bare `continue` is a SyntaxError outside a loop; inside a function the
    # way to skip this text is to return early.
    # NOTE(review): 'ÆäËû' looks like the GBK bytes of a Chinese word rendered
    # as Latin-1 -- confirm it matches the encoding of the input data.
    if text.startswith(('other/', 'ÆäËû/')):
        return
    text = text.lower()
    words = segmentor.Segment(text, FLAGS.seg_method)
    if num % 10000 == 0:
        print(text, '|'.join(words), len(words), file=sys.stderr)
    # NOTE(review): the visible fragment ends here; nothing further is done
    # with `words` in the code shown.
from libword_counter import WordCounter

# Vocabulary counter configured entirely from command-line flags.
# NOTE(review): FLAGS is not defined in this fragment; it is assumed to be
# set up earlier in the file.
counter = WordCounter(
    addUnknown=FLAGS.add_unknown,
    mostCommon=FLAGS.most_common,
    minCount=FLAGS.min_count,
    saveCountInfo=FLAGS.save_count_info,
)

import sys
import os
import numpy as np

import melt
import conf
from conf import IMAGE_FEATURE_LEN

from gezi import Segmentor

segmentor = Segmentor()

START_WORD = '<S>'
END_WORD = '</S>'

print('seg_method:', FLAGS.seg_method, file=sys.stderr)
print('min_count:', FLAGS.min_count, 'most_common:', FLAGS.most_common)

# Read tab-separated records from stdin; the second column holds one or more
# texts separated by '\x01', and each text is segmented in turn.
num = 0
for line in sys.stdin:
    # Progress report on stderr.
    # NOTE(review): no increment of `num` is visible in this fragment.
    if num % 10000 == 0:
        print(num, file=sys.stderr)
    columns = line.rstrip().split('\t')
    for text in columns[1].split('\x01'):
        words = segmentor.Segment(text, FLAGS.seg_method)
import nowarning
from libword_counter import WordCounter

# Vocabulary counter configured entirely from command-line flags.
# NOTE(review): FLAGS is not defined in this fragment; it is assumed to be
# set up earlier in the file.
counter = WordCounter(
    addUnknown=FLAGS.add_unknown,
    mostCommon=FLAGS.most_common,
    minCount=FLAGS.min_count,
    saveCountInfo=FLAGS.save_count_info,
)

import sys
import os
import numpy as np

import melt
import conf
from conf import IMAGE_FEATURE_LEN

from gezi import Segmentor

segmentor = Segmentor()

# Count every segmented word of every text column read from stdin.
num = 0
for line in sys.stdin:
    # Progress report every 10000 input lines.
    if num % 10000 == 0:
        print(num)
    columns = line.rstrip().split('\t')
    # Text columns start after the first IMAGE_FEATURE_LEN + 1 columns
    # (presumably an id followed by the image feature values -- TODO confirm).
    for column in columns[IMAGE_FEATURE_LEN + 1:]:
        # Only the part before the first '\x01' of each column is segmented.
        text = column.split('\x01')[0]
        for word in segmentor.segment(text):
            counter.add(word)
    num += 1

# Persist the vocabulary in both binary and text form under FLAGS.out_dir.
counter.save(
    FLAGS.out_dir + '/vocab.bin',
    FLAGS.out_dir + '/vocab.txt',
)