def main(_):
  mode = get_mode(FLAGS.input)

  global vocab, char_vocab
  vocab = gezi.Vocabulary(FLAGS.vocab_, fixed=FLAGS.fixed_vocab,
                          unk_word=FLAGS.unk_word)
  char_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt')
  if os.path.exists(char_vocab_file):
    char_vocab = Vocabulary(char_vocab_file)
    print('char vocab size:', char_vocab.size())

  mode_ = 'train'
  if 'valid' in FLAGS.input:
    mode_ = 'valid'
  elif 'test' in FLAGS.input:
    mode_ = 'test'
  else:
    assert 'train' in FLAGS.input
  if FLAGS.augument:
    mode_ = 'aug.' + mode_
  if FLAGS.mode_:
    mode_ = FLAGS.mode_

  global df
  df = []
  for line in open(FLAGS.input):
    df.append(line.strip().split('\t', 3))

  pool = multiprocessing.Pool()

  if not FLAGS.num_records_:
    # NOTE: both branches currently generate a single record file.
    if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'] or 'valid' in FLAGS.input:
      FLAGS.num_records_ = 1
    else:
      FLAGS.num_records_ = 1
  print('num records file to gen', FLAGS.num_records_)
  #FLAGS.num_records_ = 1

  pool.map(build_features, range(FLAGS.num_records_))
  pool.close()
  pool.join()
  # Sequential fallback:
  # for i in range(FLAGS.num_records_):
  #   build_features(i)

  # To be safe: some machines might not use the cpu count as the default pool size.
  print('num_records:', counter.value)

  os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)

  print('mean words:', total_words.value / counter.value)
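# Not part of the original file: a minimal sketch of how cross-process counters
# such as `counter` and `total_words` used above are typically created, assuming
# they are multiprocessing.Value objects shared with the pool workers that run
# build_features(). The helper name `record_example` is hypothetical.
import multiprocessing

counter = multiprocessing.Value('i', 0)      # number of examples written
total_words = multiprocessing.Value('i', 0)  # total tokens, for the mean-words stat

def record_example(num_words):
  # A worker would call something like this once per example it writes.
  with counter.get_lock():
    counter.value += 1
  with total_words.get_lock():
    total_words.value += num_words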
def main(_):
  num_conflicts = 0
  visited = {}
  visited_ngram = {}

  ngram_vocab_path = FLAGS.ngram_vocab or os.path.join(FLAGS.dir, 'ngram_vocab.txt')
  ngram_vocab = Vocabulary(ngram_vocab_path)
  print('ngram_vocab size', ngram_vocab.size())
  print('num ngram buckets', FLAGS.ngram_buckets)

  if FLAGS.emb.endswith('.npy'):
    ngram_emb = np.load(FLAGS.emb)
    assert len(ngram_emb) > 100000
  else:
    ngram_emb = []
    for line in open(FLAGS.emb):
      ngram_emb.append([float(x) for x in line.strip().split()])
  print('len ngram emb', len(ngram_emb))

  emb_mat = []
  vec_size = FLAGS.emb_dim

  # Row 0 is the zero vector used for padding.
  emb_mat.append(np.array([0.] * vec_size))

  # The ngram vocab file does not include the special tokens (unk, <s>, </s>),
  # so append random vectors for them right after the padding row.
  for i in range(3):
    emb_mat.append([np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])

  # Remaining ids (skipping pad and the 3 special tokens) are looked up by hash bucket.
  for i in range(4, ngram_vocab.size()):
    ngram = ngram_vocab.key(i)
    ngram_hash = gezi.hash(ngram)
    ngram_id = ngram_hash % FLAGS.ngram_buckets
    if ngram_id not in visited:
      visited[ngram_id] = 1
      visited_ngram[ngram_id] = [ngram]
    else:
      visited[ngram_id] += 1
      visited_ngram[ngram_id].append(ngram)
      num_conflicts += 1
      #print('Conflict', visited_ngram[ngram_id], 'Num conflicts', num_conflicts)
    emb_mat.append(ngram_emb[ngram_id])

  print('Num conflicts', num_conflicts)
  print('len(emb_mat)', len(emb_mat))

  ngram_output = FLAGS.ngram_output or 'ngram.npy'
  out_mat = os.path.join(FLAGS.dir, ngram_output)
  print('out mat', out_mat)
  np.save(out_mat, np.array(emb_mat))
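# Not in the original script: a minimal sketch of the hashing trick used above.
# Every ngram is mapped to one of FLAGS.ngram_buckets rows of the pretrained
# ngram embedding, so two different ngrams can collide (the conflicts counted
# above) and end up sharing the same vector in the output matrix.
def ngram_bucket(ngram, num_buckets):
  # Same scheme as the loop above: hash the ngram string, then take it modulo
  # the bucket count to get a row index into the pretrained ngram embedding.
  return gezi.hash(ngram) % num_buckets

# e.g. row i (i >= 4) of emb_mat holds
#   ngram_emb[ngram_bucket(ngram_vocab.key(i), FLAGS.ngram_buckets)]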
def main(_):
  tokenizer.init(FLAGS.tokenizer_vocab)

  global examples, vocab, char_vocab
  examples = pd.read_csv(FLAGS.input)
  #if 'train' in FLAGS.input:
  #  examples = shuffle(examples, random_state=1024)
  vocab = Vocabulary(FLAGS.vocab)
  char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))

  pool = multiprocessing.Pool()
  pool.map(build_features, range(FLAGS.num_records))
  pool.close()
  pool.join()
  # build_features(0)

  print('num_records:', counter.value)

  mode = 'train' if 'train' in FLAGS.input else 'test'
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)
def main(_):
  os.system('mkdir -p %s' % FLAGS.dir)

  tokenizer.init(FLAGS.tokenizer_vocab)

  global examples, vocab, unk_vocab, char_vocab, pos_vocab, tag_vocab, ner_vocab, ngram_vocab
  examples = pd.read_csv(FLAGS.input)
  #if 'train' in FLAGS.input:
  #  examples = shuffle(examples, random_state=1024)
  vocab = Vocabulary(FLAGS.vocab)
  # unk_vocab is a deliberately small vocab, so it will generate unk tokens for training.
  #unk_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'unk_vocab.txt'))
  char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))
  pos_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'pos_vocab.txt'))
  tag_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'tag_vocab.txt'))
  ner_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ner_vocab.txt'))
  ngram_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ngram_vocab.txt'))

  global enprob_dict
  enprob_dict = {}
  enprob_file = '~/data/kaggle/toxic/train.enprob.csv' if 'train' in FLAGS.input else '~/data/kaggle/toxic/test.enprob.csv'
  enprob_df = pd.read_csv(enprob_file)
  for id, enprob in zip(enprob_df['id'].values, enprob_df['enprob'].values):
    enprob_dict[id] = enprob
  enprob_dict['0'] = 1.

  pool = multiprocessing.Pool()
  pool.map(build_features, range(FLAGS.num_records))
  pool.close()
  pool.join()
  #build_features(0)

  print('num_records:', counter.value)

  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)
def main(_):
  assert FLAGS.use_char is not None

  global vocab, char_vocab, pos_vocab
  vocab = gezi.Vocabulary(FLAGS.vocab_)
  print('vocab size:', vocab.size())
  char_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt')
  if os.path.exists(char_vocab_file):
    char_vocab = Vocabulary(char_vocab_file)
    print('char vocab size:', char_vocab.size())
  pos_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'pos_vocab.txt')
  if os.path.exists(pos_vocab_file):
    pos_vocab = Vocabulary(pos_vocab_file)
    print('pos vocab size:', pos_vocab.size())

  if os.path.isfile(FLAGS.input):
    build_features(FLAGS.input)
  else:
    files = glob.glob(FLAGS.input + '/*')
    # Pass cpu_count() explicitly: to be safe, some machines might not use it as the default.
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(build_features, files)
    pool.close()
    pool.join()

  print('num_records:', counter.value)

  mode = get_mode(FLAGS.input)
  os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)

  print('mean words:', total_words.value / counter.value)
def main(_):
  mode = get_mode(FLAGS.input)

  assert FLAGS.use_fold
  #text2ids.init(FLAGS.vocab_)

  global vocab, char_vocab, pos_vocab, ner_vocab, seg_result, pos_result, ner_result
  #vocab = text2ids.vocab
  vocab = gezi.Vocabulary(FLAGS.vocab_, fixed=FLAGS.fixed_vocab,
                          unk_word=FLAGS.unk_word)
  print('vocab size:', vocab.size())
  char_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt')
  if os.path.exists(char_vocab_file):
    char_vocab = Vocabulary(char_vocab_file)
    print('char vocab size:', char_vocab.size())
  pos_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'pos_vocab.txt')
  if os.path.exists(pos_vocab_file):
    pos_vocab = Vocabulary(pos_vocab_file)
    print('pos vocab size:', pos_vocab.size())
  ner_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'ner_vocab.txt')
  if os.path.exists(ner_vocab_file):
    ner_vocab = Vocabulary(ner_vocab_file)
    print('ner vocab size:', ner_vocab.size())

  mode_ = 'train'
  if 'valid' in FLAGS.input:
    mode_ = 'valid'
  elif 'test' in FLAGS.input:
    mode_ = 'test'
  else:
    assert 'train' in FLAGS.input
  if FLAGS.augument:
    mode_ = 'aug.' + mode_
  if FLAGS.mode_:
    mode_ = FLAGS.mode_

  seg_file = FLAGS.vocab_.replace('vocab.txt', '%s.seg.txt' % mode_)
  seg_result = {}
  if os.path.exists(seg_file):
    print('seg or seg_pos exists:', seg_file)
    pos_result = {}
    for line in open(seg_file):
      id, segs = line.rstrip('\n').split('\t', 1)
      segs = segs.split('\x09')
      if FLAGS.ignore_start_end:
        segs = segs[1:-1]
      if '|' in segs[0] and not FLAGS.word_only:
        l = [x.rsplit('|', 1) for x in segs]
        segs, pos = list(zip(*l))
        pos_result[id] = pos
      seg_result[id] = segs

  seg_done = True if seg_result else False

  ner_file = FLAGS.vocab_.replace('vocab.txt', '%s.ner.txt' % mode_)
  ner_result = {}
  if os.path.exists(ner_file):
    print('seg_ner exists:', ner_file)
    for line in open(ner_file):
      id, segs = line.rstrip('\n').split('\t', 1)
      segs = segs.split('\x09')
      if FLAGS.ignore_start_end:
        segs = segs[1:-1]
      if '|' in segs[0]:
        l = [x.split('|') for x in segs]
        segs, ner = list(zip(*l))
        if not seg_done:
          seg_result[id] = segs
        ner_result[id] = ner

  print('len(seg_result)', len(seg_result))
  print('len(ner_result)', len(ner_result))

  # print('to_lower:', FLAGS.to_lower, 'feed_single:', FLAGS.feed_single, 'feed_single_en:', FLAGS.feed_single_en, 'seg_method', FLAGS.seg_method)
  # print(text2ids.ids2text(text2ids_('傻逼脑残B')))
  # print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

  global df
  df = pd.read_csv(FLAGS.input, lineterminator='\n')

  pool = multiprocessing.Pool()

  if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'] or 'valid' in FLAGS.input:
    FLAGS.num_records_ = 1
  print('num records file to gen', FLAGS.num_records_)
  #FLAGS.num_records_ = 1

  pool.map(build_features, range(FLAGS.num_records_))
  pool.close()
  pool.join()
  # Sequential fallback:
  # for i in range(FLAGS.num_records_):
  #   build_features(i)

  # To be safe: some machines might not use the cpu count as the default pool size.
  print('num_records:', counter.value)

  os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
  out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)

  print('mean words:', total_words.value / counter.value)
def main(_):
  input_vocab = os.path.join(FLAGS.dir, 'vocab.full.txt')
  ft_vocab = Vocabulary(os.path.join(os.path.dirname(FLAGS.emb), 'vocab.txt'), fixed=True)

  lines = open(input_vocab).readlines()
  ori_words_counts = [x.rstrip('\n').split('\t') for x in lines]
  # TODO FIXME why must empty entries be removed? Otherwise converting the counts
  # with int() later raises ValueError: invalid literal for int() with base 10: ' '
  ori_words_counts = filter(lambda x: x[0].strip(), ori_words_counts)
  ori_words, counts = zip(*ori_words_counts)
  counts = list(map(int, counts))
  ori_set = set(ori_words)
  normed_ori_set = set([x.lower() for x in ori_set])

  embedding_dict = {}
  ngrams = []

  vec_size = FLAGS.emb_dim
  with open(FLAGS.emb, 'r', encoding='utf-8') as fh:
    #for line in tqdm(fh, total=2196017):
    for i, line in enumerate(fh):
      array = line.split()
      # The fasttext txt format has a header line; skip anything shorter than a vector.
      if len(array) < vec_size:
        continue
      vector = list(map(float, array))
      if i >= ft_vocab.size():
        ngrams.append(vector)
        continue
      word = ft_vocab.key(i)
      if word.lower() in normed_ori_set:
        embedding_dict[word] = vector
      if i % 100000 == 0:
        print(i)

  print("{} / {} tokens have corresponding word embedding vector".format(
      len(embedding_dict), len(ori_words)))

  words = []
  emb_mat = []
  # Row 0 is the zero vector used for padding.
  emb_mat.append(np.array([0.] * vec_size))
  if '<UNK>' not in ori_set:
    # Changed from all zeros to random uniform for unk.
    #emb_mat.append([np.random.normal(scale=0.1) for _ in range(vec_size)])
    emb_mat.append([np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])
    words.append('<UNK>')
  if '<S>' not in ori_set:
    emb_mat.append([np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])
    words.append('<S>')
  if '</S>' not in ori_set:
    emb_mat.append([np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])
    words.append('</S>')

  with open('/home/gezi/tmp/rare_words.txt', 'w') as rare_out:
    for word, count in zip(ori_words, counts):
      if FLAGS.type == 'normal':
        if word in embedding_dict:
          emb_mat.append(np.array(embedding_dict[word]))
          words.append(word)
        else:
          if count >= FLAGS.min_count:
            print('%s %d' % (word, count), file=rare_out)
            #emb_mat.append([np.random.normal(scale=0.1) for _ in range(vec_size)])
            emb_mat.append([np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])
            words.append(word)
      elif FLAGS.type == 'scratch':
        if count >= FLAGS.min_count:
          if word in embedding_dict:
            emb_mat.append(np.array(embedding_dict[word]))
            words.append(word)
          else:
            #emb_mat.append([np.random.normal(scale=0.1) for _ in range(vec_size)])
            emb_mat.append([np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])
            words.append(word)
      elif FLAGS.type == 'only':
        if word in embedding_dict:
          emb_mat.append(np.array(embedding_dict[word]))
          words.append(word)

  # For words still missing, fall back to a cased variant that does have a vector.
  words_set = set(words)
  for word, count in zip(ori_words, counts):
    if word not in words_set:
      contains = False
      for w in (word.lower(), word.capitalize(), word.upper()):
        if w in words_set:
          contains = True
      if not contains:
        for w in (word.lower(), word.capitalize(), word.upper()):
          if w in embedding_dict:
            print('adding....', w, word)
            words_set.add(w)
            emb_mat.append(np.array(embedding_dict[w]))
            words.append(w)
            break

  out_vocab = os.path.join(FLAGS.dir, 'vocab.txt')
  print('out vocab size', len(words), 'ori ft vocab size', ft_vocab.size())
  with open(out_vocab, 'w') as out:
    for word in words:
      print(word, file=out)

  out_mat = os.path.join(FLAGS.dir, FLAGS.out_name)
  emb_mat += ngrams

  # Check:
  # ids = gezi.fasttext_ids('you', Vocabulary(out_vocab), FLAGS.ngram_buckets, 3, 3)
  # print('---------ids', ids)
  # vectors = []
  # for id in ids:
  #   vectors.append(emb_mat[id])
  # vectors = np.stack(vectors)
  # print(np.mean(vectors, 0))

  print('len(emb_mat)', len(emb_mat))
  np.save(out_mat, np.array(emb_mat))
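# Not part of the original script: a small standalone sanity check (a sketch).
# It reloads the saved matrix and the written vocab and confirms the row count is
# 1 padding row + one row per written vocab word + the appended ngram rows.
# `num_ngrams` is a hypothetical value you would take from the run above.
def check_emb_mat(out_mat_path, out_vocab_path, num_ngrams):
  mat = np.load(out_mat_path)
  num_words = len(open(out_vocab_path).readlines())
  print('emb mat shape', mat.shape)
  assert mat.shape[0] == 1 + num_words + num_ngrams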
def init(vocab_path_=None, append=None):
  global vocab, vocab_size, vocab_path
  if vocab is None:
    if not FLAGS.vocab_buckets:
      vocab_path = vocab_path_ or FLAGS.vocab or gezi.dirname(FLAGS.model_dir) + '/vocab.txt'
      FLAGS.vocab = vocab_path
      logging.info('vocab:{}'.format(vocab_path))
      logging.info('NUM_RESERVED_IDS:{}'.format(FLAGS.num_reserved_ids))
      if append is None:
        append = FLAGS.vocab_append
        if gezi.env_has('VOCAB_APPEND'):
          append = True
      vocab = Vocabulary(vocab_path, FLAGS.num_reserved_ids, append=append,
                         max_words=FLAGS.vocab_max_words,
                         min_count=FLAGS.vocab_min_count)
    else:
      vocab = Vocabulary(buckets=FLAGS.vocab_buckets)
    vocab_size = vocab.size() if not FLAGS.vocab_size else min(vocab.size(), FLAGS.vocab_size)
    logging.info('vocab_size:{}'.format(vocab_size))
    assert vocab_size > FLAGS.num_reserved_ids, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab
    logging.info('vocab_start:{} id:{}'.format(vocab.key(vocab.start_id()), vocab.start_id()))
    logging.info('vocab_end:{} id:{}'.format(vocab.key(vocab.end_id()), vocab.end_id()))
    logging.info('vocab_unk:{} id:{}'.format(vocab.key(vocab.unk_id()), vocab.unk_id()))
def init(vocab_path=None):
  global vocab, vocab_size
  if vocab is None:
    if vocab_path is None:
      vocab_path = FLAGS.vocab
    logging.info('vocab:{}'.format(vocab_path))
    logging.info('NUM_RESERVED_IDS:{}'.format(FLAGS.num_reserved_ids))
    vocab = Vocabulary(vocab_path, FLAGS.num_reserved_ids)
    vocab_size = vocab.size() if not FLAGS.vocab_size else min(vocab.size(), FLAGS.vocab_size)
    logging.info('vocab_size:{}'.format(vocab_size))
    assert vocab_size > FLAGS.num_reserved_ids, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab
    logging.info('vocab_start:{} id:{}'.format(vocab.key(vocab.start_id()), vocab.start_id()))
    logging.info('vocab_end:{} id:{}'.format(vocab.key(vocab.end_id()), vocab.end_id()))
    logging.info('vocab_unk:{} id:{}'.format(vocab.key(vocab.unk_id()), vocab.unk_id()))
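# A minimal usage sketch (assumed, not part of the original module): thanks to the
# `if vocab is None` guard, init() is safe to call more than once; call it after
# flags are parsed and then reuse the module-level vocab.
init()
print('unk:', vocab.key(vocab.unk_id()),
      'start:', vocab.key(vocab.start_id()),
      'end:', vocab.key(vocab.end_id()))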
# \Description
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys, os

from gezi import Vocabulary
import pandas as pd

dir = '/home/gezi/temp/toxic/v16/tfrecords/glove.lower/'
vocab = Vocabulary(dir + 'vocab.txt')

def run(input):
  total_tokens = 0
  total_unks = 0
  num_specials = 0
  num_toxic = 0

  output = input.replace('.csv', '.numunks.csv')
  output_speial = input.replace('.csv', '.special.csv')

  df = pd.read_csv(input)
  ids = df['id'].values
  comments = df['tokens'].values
  if 'toxic' not in df.columns:
    df['toxic'] = [0.] * len(comments)
  toxics = df['toxic'].values