def main(_):
  np.random.seed(FLAGS.seed_)
  files = gezi.list_files(FLAGS.in_dir)
  print('input', FLAGS.in_dir)

  FLAGS.out_dir += f'/{FLAGS.record_name}'
  if not os.path.exists(FLAGS.out_dir):
    print('make new dir: [%s]' % FLAGS.out_dir, file=sys.stderr)
    os.makedirs(FLAGS.out_dir)

  if FLAGS.train_by_day and FLAGS.shuffle_impressions:
    assert FLAGS.day is not None

  global df, uid_vocab, did_vocab, uid_vocab2, did_vocab2
  global cat_vocab, scat_vocab, entity_vocab, entity_type_vocab

  # training day 6 is read from the dev split
  behaviors_file = f'{FLAGS.in_dir}/{FLAGS.mark}/behaviors.tsv'
  if FLAGS.mark == 'train' and FLAGS.day == 6:
    behaviors_file = f'{FLAGS.in_dir}/dev/behaviors.tsv'
  print('behaviors_file', behaviors_file)

  df = pd.read_csv(behaviors_file, sep='\t', names=behaviors_names)
  if FLAGS.mark == 'train':
    print('behaviors_df shuffle')
    df = df.sample(frac=1, random_state=FLAGS.seed_)

  uid_vocab = gezi.Vocab(f'{FLAGS.in_dir}/uid.txt')
  did_vocab = gezi.Vocab(f'{FLAGS.in_dir}/did.txt')
  uid_vocab2 = gezi.Vocab(f'{FLAGS.in_dir}/train/uid.txt')
  did_vocab2 = gezi.Vocab(f'{FLAGS.in_dir}/train/did.txt')
  cat_vocab = gezi.Vocab(f'{FLAGS.in_dir}/cat.txt')
  scat_vocab = gezi.Vocab(f'{FLAGS.in_dir}/sub_cat.txt')
  entity_vocab = gezi.Vocab(f'{FLAGS.in_dir}/entity.txt')
  entity_type_vocab = gezi.Vocab(f'{FLAGS.in_dir}/entity_type.txt')

  for line in open(f'{FLAGS.in_dir}/start_times.txt'):
    did, timestamp, _ = line.strip().split('\t')
    start_timestamps[did] = int(timestamp)

  global news_info
  # ndf = pd.read_csv(f'{FLAGS.in_dir}/{FLAGS.mark}/news.tsv', sep='\t', names=news_names)
  news_info = {}
  # for _, row in tqdm(ndf.iterrows(), total=len(ndf), ascii=True, desc='news_info'):
  #   news_info[row['did']] = row

  news_file = f'{FLAGS.in_dir}/{FLAGS.mark}/news.tsv'
  if FLAGS.mark == 'train' and FLAGS.day == 6:
    news_file = f'{FLAGS.in_dir}/dev/news.tsv'
  total = len(open(news_file).readlines())
  for line in tqdm(open(news_file), total=total, ascii=True, desc='news_info'):
    l = line.strip('\n').split('\t')
    m = {}
    for i, name in enumerate(news_names):
      m[name] = l[i]
    news_info[l[0]] = m

  # build the output records in parallel, one worker per shard
  with Pool(FLAGS.num_records) as p:
    p.map(build_features, range(FLAGS.num_records))
def main(_):
  vocab_file = sys.argv[1]
  vocab = gezi.Vocab(vocab_file)
  emb_height = vocab.size()
  print(vocab.id('i'))
  emb_size = len(open('./vectors.txt').readline().strip().split()) - 1
  print(emb_size)

  emb = np.random.uniform(-0.05, 0.05, (emb_height, emb_size))
  print(emb)
  emb = list(emb)

  for line in tqdm(open('./vectors.txt'), total=emb_height):
    l = line.strip().split()
    word, vals = l[0], l[1:]
    vals = np.asarray(list(map(float, vals)))
    if FLAGS.norm:
      vals = normalize(np.reshape(vals, (1, -1)))
      # vals /= np.sqrt(emb_size)
      vals = np.reshape(vals, (-1,))
    emb[vocab.id(word)] = vals

  emb = np.asarray(emb)
  print(emb)
  print(emb.shape)
  # emb = normalize(emb)
  np.save('./emb.npy', emb)
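# Usage sketch (not part of the original scripts): the matrix saved above can
# be loaded back and used as a pretrained initializer for an embedding layer.
# The Keras layer construction below is illustrative only.
import numpy as np
import tensorflow as tf

pretrained = np.load('./emb.npy')
word_embedding = tf.keras.layers.Embedding(
    input_dim=pretrained.shape[0],
    output_dim=pretrained.shape[1],
    embeddings_initializer=tf.constant_initializer(pretrained),
    trainable=False)  # set trainable=True to fine-tune the pretrained vectors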
from transformers import AutoTokenizer
from tqdm import tqdm

import gezi

files = [
  './train/news.tsv',
  './dev/news.tsv',
  './test/news.tsv'
]

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)

emb_size = 30
vocab = gezi.Vocab('./did.txt')
emb_height = vocab.size()
print(emb_height)

dids = set()
emb = [[1] * emb_size] * emb_height

for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title = l[0], l[3]
    if did in dids:
      continue
    dids.add(did)
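    # The original loop is truncated here. A plausible continuation (an
    # assumption, not the author's code): encode the title into at most
    # emb_size BERT token ids, zero-pad to a fixed length, and store the row
    # at this doc id's slot in the lookup table.
    ids = tokenizer.encode(title, add_special_tokens=False)[:emb_size]
    emb[vocab.id(did)] = ids + [0] * (emb_size - len(ids))

# Save the did -> title-token lookup; the output file name is a guess.
import numpy as np
np.save('./title_lookup.npy', np.asarray(emb))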
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

import numpy as np
import gezi
from sklearn.preprocessing import normalize
from tqdm import tqdm

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
vocab_file = f'{model}/vocab.txt'
vocab = gezi.Vocab(vocab_file, fixed=True)
emb_height = vocab.size()
print(vocab.id('i'))
emb_size = len(open('./vectors.txt').readline().strip().split()) - 1
print(emb_size)

emb = np.random.uniform(-0.05, 0.05, (emb_height, emb_size))
print(emb)
emb = list(emb)

for line in tqdm(open('./vectors.txt'), total=emb_height):
  l = line.strip().split()
  word, vals = l[0], l[1:]
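  # The loop is truncated above. A continuation mirroring the complete version
  # of this loop in the sibling GloVe-vocab script (an assumption, not the
  # original code): parse the vector and store it at the word's vocab id.
  vals = np.asarray(list(map(float, vals)))
  # vals = np.reshape(normalize(np.reshape(vals, (1, -1))), (-1,))  # optional L2 norm
  emb[vocab.id(word)] = vals

emb = np.asarray(emb)
print(emb.shape)
np.save('./emb.npy', emb)  # output name assumed to match the sibling script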
def init():
  vocab_names = [
    'did', 'uid', 'cat', 'sub_cat', 'entity', 'entity_type', 'word'
  ]
  # vocabs =
  # {
  #   'uid': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': FLAGS.train_uid_emb,
  #     'pretrain': None,
  #   },
  #   'did': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': FLAGS.train_did_emb,
  #     'pretrain': FLAGS.did_pretrain,
  #   },
  #   'cat': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': True,
  #     'pretrain': None,
  #   },
  #   'sub_cat': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': True,
  #     'pretrain': None,
  #   },
  #   'entity': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': FLAGS.train_entity_emb,
  #     'pretrain': FLAGS.entity_pretrain,
  #   },
  #   'entity2': {
  #     'min_count': FLAGS.min_count,
  #     'slim': True,
  #     'trainable': True,
  #     'pretrain': None,
  #   },
  #   'entity_type': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': True,
  #     'pretrain': None,
  #   },
  #   'word': {
  #     'min_count': 0,
  #     'slim': False,
  #     'trainable': FLAGS.train_word_emb,
  #     'pretrain': FLAGS.word_pretrain,
  #   },
  # }

  vocab_sizes = {}
  for vocab_name in vocab_names:
    fixed = False if vocab_name != 'word' else True
    vocab = gezi.Vocab(f'{FLAGS.input_dir}/{vocab_name}.txt', fixed=fixed)
    min_count = FLAGS.min_count if vocab_name != 'word' else 0
    logging.debug('---min_count', min_count)
    # > 1e6 means the id appeared in the train or dev data
    vocab_size = [vocab.size(), vocab.size(min_count + 1000000)]
    if vocab_name == 'uid' and FLAGS.max_vid:
      vocab_size[1] = FLAGS.max_vid  # vocab_size[1] is not used
    vocab_sizes[vocab_name] = vocab_size
    # print(vocab_name, vocab_size)
  gezi.set('vocab_sizes', vocab_sizes)

  # mixed train start from FLAGS.mix_train
  valid_day = 6
  if FLAGS.mix_train:
    FLAGS.loop_train = False
    FLAGS.valid_input = f'{FLAGS.train_input}/{valid_day}'
    FLAGS.train_input = ','.join([
      f'{FLAGS.train_input}/{i}' for i in range(int(FLAGS.start), valid_day)
    ])
    FLAGS.mname += '.mix'

  # automatically loop over train/valid/test; note that day 6 is always used for validation
  day = int(FLAGS.start or 0)
  if day != 0:
    FLAGS.mname += f'.{day}'
  if FLAGS.mode != 'train':
    FLAGS.valid_hour = str(valid_day)

  if 'rand' in FLAGS.input:
    FLAGS.shuffle = True
  if 'pad' in FLAGS.input:
    FLAGS.record_padded = True

  if FLAGS.neg_mask_ratio > 0:
    FLAGS.use_weight = True

  if FLAGS.big_model:
    FLAGS.his_encoder = 'gru'
    FLAGS.title_encoder = 'gru'
    FLAGS.title_pooling = 'att'
    # FLAGS.use_contexts = True
    FLAGS.use_his_image = True
    FLAGS.use_image = True
    FLAGS.train_image_emb = True
def init():
  FLAGS.title_lookup = f'{FLAGS.doc_dir}/title_lookup.npy'
  FLAGS.doc_lookup = f'{FLAGS.doc_dir}/doc_lookup.npy'
  FLAGS.doc_fnames = f'{FLAGS.doc_dir}/doc_fnames.npy'
  FLAGS.doc_flens = f'{FLAGS.doc_dir}/doc_flens.npy'
  doc_feats = np.load(FLAGS.doc_fnames)
  doc_feat_lens = np.load(FLAGS.doc_flens)
  gezi.set('doc_feats', doc_feats)
  gezi.set('doc_feat_lens', doc_feat_lens)
  logging.info('doc_feats', list(zip(doc_feats, doc_feat_lens)))

  vocab_names = [
    'did', 'uid', 'cat', 'sub_cat', 'entity', 'entity_type', 'word'
  ]
  # vocabs =
  # {
  #   'uid': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': FLAGS.train_uid_emb,
  #     'pretrain': None,
  #   },
  #   'did': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': FLAGS.train_did_emb,
  #     'pretrain': FLAGS.did_pretrain,
  #   },
  #   'cat': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': True,
  #     'pretrain': None,
  #   },
  #   'sub_cat': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': True,
  #     'pretrain': None,
  #   },
  #   'entity': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': FLAGS.train_entity_emb,
  #     'pretrain': FLAGS.entity_pretrain,
  #   },
  #   'entity2': {
  #     'min_count': FLAGS.min_count,
  #     'slim': True,
  #     'trainable': True,
  #     'pretrain': None,
  #   },
  #   'entity_type': {
  #     'min_count': FLAGS.min_count,
  #     'slim': False,
  #     'trainable': True,
  #     'pretrain': None,
  #   },
  #   'word': {
  #     'min_count': 0,
  #     'slim': False,
  #     'trainable': FLAGS.train_word_emb,
  #     'pretrain': FLAGS.word_pretrain,
  #   },
  # }

  vocab_sizes = {}
  for vocab_name in vocab_names:
    fixed = False if vocab_name != 'word' else True
    vocab_file = f'{FLAGS.input_dir}/{vocab_name}.txt'
    vocab = gezi.Vocab(vocab_file, fixed=fixed)
    if FLAGS.dev_version == 1:
      min_count = FLAGS.min_count if vocab_name != 'word' else 0
      logging.debug('---min_count', min_count)
      train_vocab_file = f'{FLAGS.input_dir}/train/{vocab_name}.txt'
      if os.path.exists(train_vocab_file) and min_count:
        train_vocab = gezi.Vocab(train_vocab_file, fixed=fixed)
      else:
        train_vocab = vocab
      vocab_size = [vocab.size(), train_vocab.size(min_count)]
    else:
      # > 1e6 means the id appeared in the train or dev data
      min_count = FLAGS.min_count if vocab_name != 'word' else 0
      if min_count != 0:
        min_count += FLAGS.test_start
      vocab_size = [vocab.size(), vocab.size(min_count)]
      if vocab_name == 'uid' and FLAGS.min_uid_count:
        vocab_size[1] = vocab.size(FLAGS.test_start + FLAGS.min_uid_count)
    if vocab_name == 'uid' and FLAGS.max_vid:
      vocab_size[1] = FLAGS.max_vid  # vocab_size[1] is not used
    vocab_sizes[vocab_name] = vocab_size
  gezi.set('vocab_sizes', vocab_sizes)
  logging.info('vocab_sizes:', vocab_sizes)

  # mixed train start from FLAGS.mix_train
  valid_day = 6
  if FLAGS.mix_train:
    FLAGS.loop_train = False
    FLAGS.valid_input = f'{FLAGS.train_input}/{valid_day}'
    FLAGS.train_input = ','.join(
      [f'{FLAGS.train_input}/{i}' for i in range(int(FLAGS.start), valid_day)])
    FLAGS.mname += '.mix'

  # automatically loop over train/valid/test; note that day 6 is always used for validation
  day = int(FLAGS.start or 0)
  if day != 0:
    FLAGS.mname += f'.{day}'
  if FLAGS.mode != 'train':
    FLAGS.valid_hour = str(valid_day)

  if 'rand' in FLAGS.input:
    FLAGS.shuffle = True
  if 'pad' in FLAGS.input:
    FLAGS.record_padded = True

  if FLAGS.neg_mask_ratio > 0:
    FLAGS.use_weight = True

  # note: async validation (mode == async_valid) still writes summaries
  if FLAGS.mode == 'valid':
    FLAGS.write_summary = False

  if FLAGS.num_valid == 1000000:
    FLAGS.num_valid = 0

  if FLAGS.big_model:
    FLAGS.his_encoder = 'gru'
    FLAGS.title_encoder = 'gru'
    FLAGS.title_pooling = 'att'
    # FLAGS.use_contexts = True
    FLAGS.use_his_image = True
    FLAGS.use_image = True
    FLAGS.train_image_emb = True
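def build_id_embeddings_sketch():
  """Illustrative only, not code from this repo: read back the sizes stored
  above when constructing embedding layers. Assumes gezi.get is the
  counterpart of the gezi.set call in init() and that an embedding dim of 128
  is just a placeholder."""
  import tensorflow as tf
  vocab_sizes = gezi.get('vocab_sizes')
  # index 0 is the full vocab size; index 1 is the min-count-trimmed size
  did_emb = tf.keras.layers.Embedding(vocab_sizes['did'][0], 128)
  uid_emb = tf.keras.layers.Embedding(vocab_sizes['uid'][0], 128)
  return did_emb, uid_emb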
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

import numpy as np
import gezi
from sklearn.preprocessing import normalize
from tqdm import tqdm

vocab_file = './entity.txt'
vocab = gezi.Vocab(vocab_file)
emb_height = vocab.size()
emb_size = len(
  open('./train/entity_embedding.vec').readline().strip().split()) - 1
print(emb_size)

emb = np.random.uniform(-0.05, 0.05, (emb_height, emb_size))
print(emb)
emb = list(emb)

files = [
  './train/entity_embedding.vec',
  './dev/entity_embedding.vec',
  './test/entity_embedding.vec'
]
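# The reading loop is truncated above. A continuation mirroring the analogous
# word-vector loader in this repo (an assumption, not the original code):
# parse each entity id and vector, and place it at the entity's vocab slot.
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split()
    entity, vals = l[0], l[1:]
    emb[vocab.id(entity)] = np.asarray(list(map(float, vals)))

emb = np.asarray(emb)
print(emb.shape)
np.save('./entity_emb.npy', emb)  # output file name is a guess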
import sys
import os

import numpy as np
import gezi
from tqdm import tqdm
from transformers import AutoTokenizer

files = ['./train/news.tsv', './dev/news.tsv', './test/news.tsv']

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)
vocab = gezi.Vocab(f'{model}/vocab.txt', fixed=True)

dids = set()
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title, abstract = l[0], l[3], l[4]
    if did in dids:
      continue
    dids.add(did)
    if abstract:
      text = title + ' ' + abstract
    else:
      text = title
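    # The loop is truncated above. A plausible continuation (an assumption,
    # not the original code): tokenize the combined text with the BERT
    # tokenizer and map the word pieces through the fixed vocab.
    tokens = tokenizer.tokenize(text)
    ids = [vocab.id(token) for token in tokens]
    print(did, ' '.join(map(str, ids)))  # e.g. dump did -> token ids to stdout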