Example 1
def main(_):
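    """Load the behaviors table, vocabs and news metadata, then build record shards in parallel."""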
    np.random.seed(FLAGS.seed_)

    files = gezi.list_files(FLAGS.in_dir)
    print('input', FLAGS.in_dir)

    FLAGS.out_dir += f'/{FLAGS.record_name}'
    if not os.path.exists(FLAGS.out_dir):
        print('make new dir: [%s]' % FLAGS.out_dir, file=sys.stderr)
        os.makedirs(FLAGS.out_dir)

    if FLAGS.train_by_day and FLAGS.shuffle_impressions:
        assert FLAGS.day is not None

    global df, uid_vocab, did_vocab, uid_vocab2, did_vocab2
    global cat_vocab, scat_vocab, entity_vocab, entity_type_vocab
    behaviors_file = f'{FLAGS.in_dir}/{FLAGS.mark}/behaviors.tsv'
    if FLAGS.mark == 'train' and FLAGS.day == 6:
        behaviors_file = f'{FLAGS.in_dir}/dev/behaviors.tsv'
    print('behaviors_file', behaviors_file)
    df = pd.read_csv(behaviors_file, sep='\t', names=behaviors_names)
    if FLAGS.mark == 'train':
        print('behaviors_df shuffle')
        df = df.sample(frac=1, random_state=FLAGS.seed_)
    uid_vocab = gezi.Vocab(f'{FLAGS.in_dir}/uid.txt')
    did_vocab = gezi.Vocab(f'{FLAGS.in_dir}/did.txt')
    uid_vocab2 = gezi.Vocab(f'{FLAGS.in_dir}/train/uid.txt')
    did_vocab2 = gezi.Vocab(f'{FLAGS.in_dir}/train/did.txt')
    cat_vocab = gezi.Vocab(f'{FLAGS.in_dir}/cat.txt')
    scat_vocab = gezi.Vocab(f'{FLAGS.in_dir}/sub_cat.txt')
    entity_vocab = gezi.Vocab(f'{FLAGS.in_dir}/entity.txt')
    entity_type_vocab = gezi.Vocab(f'{FLAGS.in_dir}/entity_type.txt')

    for line in open(f'{FLAGS.in_dir}/start_times.txt'):
        did, timestamp, _ = line.strip().split('\t')
        start_timestamps[did] = int(timestamp)

    global news_info
    # ndf = pd.read_csv(f'{FLAGS.in_dir}/{FLAGS.mark}/news.tsv', sep='\t', names=news_names)
    news_info = {}
    # for _, row in tqdm(ndf.iterrows(), total=len(ndf), ascii=True, desc='news_info'):
    #   news_info[row['did']] = row
    news_file = f'{FLAGS.in_dir}/{FLAGS.mark}/news.tsv'
    if FLAGS.mark == 'train' and FLAGS.day == 6:
        news_file = f'{FLAGS.in_dir}/dev/news.tsv'
    total = len(open(news_file).readlines())
    for line in tqdm(open(news_file),
                     total=total,
                     ascii=True,
                     desc='news_info'):
        l = line.strip('\n').split('\t')
        m = {}
        for i, name in enumerate(news_names):
            m[name] = l[i]
        news_info[l[0]] = m

    with Pool(FLAGS.num_records) as p:
        p.map(build_features, range(FLAGS.num_records))
Example 2
def main(_):
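  """Build a vocab-aligned embedding matrix from ./vectors.txt and save it as ./emb.npy."""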
  vocab_file = sys.argv[1]
  vocab = gezi.Vocab(vocab_file)
  emb_height = vocab.size()

  print(vocab.id('i'))

  emb_size = len(open('./vectors.txt').readline().strip().split()) - 1
  print(emb_size)

  emb = np.random.uniform(-0.05, 0.05, (emb_height, emb_size))
  print(emb)

  emb = list(emb)

  for line in tqdm(open('./vectors.txt'), total=emb_height):
    l = line.strip().split()
    word, vals = l[0], l[1:]
    vals = np.asarray(list(map(float, vals)))
    if FLAGS.norm:
      vals = normalize(np.reshape(vals, (1,-1)))
    #vals /= np.sqrt(emb_size)
    vals = np.reshape(vals, (-1,))
    emb[vocab.id(word)] = vals  

  emb = np.asarray(emb)
  print(emb)
  print(emb.shape)
  #emb = normalize(emb)

  np.save('./emb.npy', emb)
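
  # Added sanity check (not in the original script): the saved matrix round-trips,
  # and each word's row is addressed by the same vocab id used when filling it.
  reloaded = np.load('./emb.npy')
  assert reloaded.shape == (emb_height, emb_size)
  assert np.allclose(reloaded[vocab.id('i')], emb[vocab.id('i')])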
Example 3
from tqdm import tqdm
from transformers import AutoTokenizer
import gezi

files = [
    './train/news.tsv',
    './dev/news.tsv',
    './test/news.tsv',
]

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)

emb_size = 30

vocab = gezi.Vocab('./did.txt')
emb_height = vocab.size()
print(emb_height)

dids = set()

# one padding row of ids per doc id; a list comprehension avoids aliasing the same inner list
emb = [[1] * emb_size for _ in range(emb_height)]

for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title = l[0], l[3]
    if did in dids:
      continue
    dids.add(did)
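
    # Hypothetical continuation (the snippet is cut off here): encode the title into
    # at most emb_size BERT token ids and store them in this did's row, padding with 1
    # to match the [1]-filled initialization above. This is an assumption based on the
    # tokenizer usage in Example 8, not code recovered from the repo.
    ids = tokenizer.encode(title, truncation=True, max_length=emb_size)
    emb[vocab.id(did)] = ids + [1] * (emb_size - len(ids))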
Example 4
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

import numpy as np
import gezi
from sklearn.preprocessing import normalize
from tqdm import tqdm

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
vocab_file = f'{model}/vocab.txt'
vocab = gezi.Vocab(vocab_file, fixed=True)
emb_height = vocab.size()

print(vocab.id('i'))

emb_size = len(open('./vectors.txt').readline().strip().split()) - 1
print(emb_size)

emb = np.random.uniform(-0.05, 0.05, (emb_height, emb_size))
print(emb)

emb = list(emb)

for line in tqdm(open('./vectors.txt'), total=emb_height):
    l = line.strip().split()
    word, vals = l[0], l[1:]
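
    # The snippet is truncated here; the rest of the loop body mirrors Example 2
    # (minus the optional FLAGS.norm normalization shown there): parse the floats
    # and overwrite this word's row in the matrix.
    vals = np.asarray(list(map(float, vals)))
    emb[vocab.id(word)] = vals

emb = np.asarray(emb)
np.save('./emb.npy', emb)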
Example 5
def init():
    vocab_names = [
        'did', 'uid', 'cat', 'sub_cat', 'entity', 'entity_type', 'word'
    ]

    # vocabs =
    #   {
    #     'uid': {
    #       'min_count': FLAGS.min_count,
    #       'slim': False,
    #       'trainable': FLAGS.train_uid_emb,
    #       'pretrain': None,
    #     },
    #     'did': {
    #       'min_count': FLAGS.min_count,
    #       'slim': False,
    #       'trainable': FLAGS.train_did_emb,
    #       'pretrain': FLAGS.did_pretrain,
    #     },
    #     'cat': {
    #       'min_count': FLAGS.min_count,
    #       'slim': False,
    #       'trainable': True,
    #       'pretrain': None,
    #     },
    #     'sub_cat': {
    #       'min_count': FLAGS.min_count,
    #       'slim': False,
    #       'trainable': True,
    #       'pretrain': None,
    #     },
    #     'entity': {
    #       'min_count': FLAGS.min_count,
    #       'slim': False,
    #       'trainable': FLAGS.train_entity_emb,
    #       'pretrain': FLAGS.entity_pretrain,
    #     },
    #     'entity2': {
    #       'min_count': FLAGS.min_count,
    #       'slim': True,
    #       'trainable': True,
    #       'pretrain': None,
    #     },
    #     'entity_type': {
    #       'min_count': FLAGS.min_count,
    #       'slim': False,
    #       'trainable': True,
    #       'pretrain': None,
    #     },
    #     'word': {
    #       'min_count': 0,
    #       'slim': False,
    #       'trainable': FLAGS.train_word_emb,
    #       'pretrain': FLAGS.word_pretrain,
    #     },
    #   }

    vocab_sizes = {}
    for vocab_name in vocab_names:
        fixed = (vocab_name == 'word')
        vocab = gezi.Vocab(f'{FLAGS.input_dir}/{vocab_name}.txt', fixed=fixed)
        min_count = FLAGS.min_count if vocab_name != 'word' else 0
        logging.debug('---min_count', min_count)
        # counts > 1e6 mean the id appeared in the train or dev data
        vocab_size = [vocab.size(), vocab.size(min_count + 1000000)]

        if vocab_name == 'uid' and FLAGS.max_vid:
            vocab_size[1] = FLAGS.max_vid  # vocab_size[1] is not used
        vocab_sizes[vocab_name] = vocab_size
        # print(vocab_name, vocab_size)
    gezi.set('vocab_sizes', vocab_sizes)

    # mixed training: when FLAGS.mix_train is set, train on days FLAGS.start through valid_day-1 and validate on valid_day
    valid_day = 6
    if FLAGS.mix_train:
        FLAGS.loop_train = False
        FLAGS.valid_input = f'{FLAGS.train_input}/{valid_day}'
        FLAGS.train_input = ','.join([
            f'{FLAGS.train_input}/{i}'
            for i in range(int(FLAGS.start), valid_day)
        ])
        FLAGS.mname += '.mix'

    # automatically loop over train/valid/test; note that day 6 is always used for validation
    day = int(FLAGS.start or 0)
    if day != 0:
        FLAGS.mname += f'.{day}'
    if FLAGS.mode != 'train':
        FLAGS.valid_hour = str(valid_day)

    if 'rand' in FLAGS.input:
        FLAGS.shuffle = True

    if 'pad' in FLAGS.input:
        FLAGS.record_padded = True

    if FLAGS.neg_mask_ratio > 0:
        FLAGS.use_weight = True

    if FLAGS.big_model:
        FLAGS.his_encoder = 'gru'
        FLAGS.title_encoder = 'gru'
        FLAGS.title_pooling = 'att'
        # FLAGS.use_contexts = True
        FLAGS.use_his_image = True
        FLAGS.use_image = True
        FLAGS.train_image_emb = True
Example 6
def init():
  FLAGS.title_lookup = f'{FLAGS.doc_dir}/title_lookup.npy'
  FLAGS.doc_lookup = f'{FLAGS.doc_dir}/doc_lookup.npy'
  FLAGS.doc_fnames = f'{FLAGS.doc_dir}/doc_fnames.npy'
  FLAGS.doc_flens = f'{FLAGS.doc_dir}/doc_flens.npy'

  doc_feats = np.load(FLAGS.doc_fnames)
  doc_feat_lens = np.load(FLAGS.doc_flens)
  gezi.set('doc_feats', doc_feats)
  gezi.set('doc_feat_lens', doc_feat_lens)

  logging.info('doc_feats', list(zip(doc_feats, doc_feat_lens)))

  vocab_names = [
                  'did', 'uid',
                  'cat', 'sub_cat',
                  'entity', 'entity_type',
                  'word'
                ]

  # vocabs = 
  #   {
  #     'uid': {
  #       'min_count': FLAGS.min_count,
  #       'slim': False,
  #       'trainable': FLAGS.train_uid_emb,
  #       'pretrain': None,
  #     },
  #     'did': {
  #       'min_count': FLAGS.min_count,
  #       'slim': False,
  #       'trainable': FLAGS.train_did_emb,
  #       'pretrain': FLAGS.did_pretrain,
  #     },
  #     'cat': {
  #       'min_count': FLAGS.min_count,
  #       'slim': False,
  #       'trainable': True,
  #       'pretrain': None,
  #     },
  #     'sub_cat': {
  #       'min_count': FLAGS.min_count,
  #       'slim': False,
  #       'trainable': True,
  #       'pretrain': None,
  #     },
  #     'entity': {
  #       'min_count': FLAGS.min_count,
  #       'slim': False,
  #       'trainable': FLAGS.train_entity_emb,
  #       'pretrain': FLAGS.entity_pretrain,
  #     },
  #     'entity2': {
  #       'min_count': FLAGS.min_count,
  #       'slim': True,
  #       'trainable': True,
  #       'pretrain': None,
  #     },
  #     'entity_type': {
  #       'min_count': FLAGS.min_count,
  #       'slim': False,
  #       'trainable': True,
  #       'pretrain': None,
  #     },
  #     'word': {
  #       'min_count': 0,
  #       'slim': False,
  #       'trainable': FLAGS.train_word_emb,
  #       'pretrain': FLAGS.word_pretrain,
  #     },   
  #   }

  vocab_sizes = {}
  for vocab_name in vocab_names:
    fixed = (vocab_name == 'word')
    vocab_file = f'{FLAGS.input_dir}/{vocab_name}.txt'
    vocab = gezi.Vocab(vocab_file, fixed=fixed)
    
    if FLAGS.dev_version == 1:
      min_count = FLAGS.min_count if vocab_name != 'word' else 0
      logging.debug('---min_count', min_count)
      train_vocab_file = f'{FLAGS.input_dir}/train/{vocab_name}.txt'
      if os.path.exists(train_vocab_file) and min_count:
        train_vocab = gezi.Vocab(train_vocab_file, fixed=fixed)
      else:
        train_vocab = vocab
      vocab_size = [vocab.size(), train_vocab.size(min_count)]
    else:
      # counts > 1e6 mean the id appeared in the train or dev data
      min_count = FLAGS.min_count if vocab_name != 'word' else 0
      if min_count != 0:
        min_count += FLAGS.test_start  
      vocab_size = [vocab.size(), vocab.size(min_count)] 

    if vocab_name == 'uid' and FLAGS.min_uid_count:
      vocab_size[1] = vocab.size(FLAGS.test_start + FLAGS.min_uid_count)

    if vocab_name == 'uid' and FLAGS.max_vid:
      vocab_size[1] = FLAGS.max_vid  # vocab_size[1] is not used
    vocab_sizes[vocab_name] = vocab_size

  gezi.set('vocab_sizes', vocab_sizes)
  logging.info('vocab_sizes:', vocab_sizes)

  # mixed training: when FLAGS.mix_train is set, train on days FLAGS.start through valid_day-1 and validate on valid_day
  valid_day = 6
  if FLAGS.mix_train:
    FLAGS.loop_train = False
    FLAGS.valid_input = f'{FLAGS.train_input}/{valid_day}'
    FLAGS.train_input = ','.join([f'{FLAGS.train_input}/{i}' for i in range(int(FLAGS.start), valid_day)])
    FLAGS.mname += '.mix'

  # automatically loop over train/valid/test; note that day 6 is always used for validation
  day = int(FLAGS.start or 0)
  if day != 0:
    FLAGS.mname += f'.{day}'
  if FLAGS.mode != 'train':
    FLAGS.valid_hour = str(valid_day)

  if 'rand' in FLAGS.input:
    FLAGS.shuffle = True

  if 'pad' in FLAGS.input:
    FLAGS.record_padded = True
    
  if FLAGS.neg_mask_ratio > 0:
    FLAGS.use_weight = True

  # note: async validation (mode == 'async_valid') still writes the summary
  if FLAGS.mode == 'valid':
    FLAGS.write_summary = False
    if FLAGS.num_valid == 1000000:
      FLAGS.num_valid = 0

  if FLAGS.big_model:
    FLAGS.his_encoder = 'gru'
    FLAGS.title_encoder = 'gru'
    FLAGS.title_pooling = 'att'
    # FLAGS.use_contexts = True
    FLAGS.use_his_image = True
    FLAGS.use_image = True
    FLAGS.train_image_emb = True
Example 7
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os

import numpy as np
import gezi
from sklearn.preprocessing import normalize
from tqdm import tqdm

vocab_file = './entity.txt'
vocab = gezi.Vocab(vocab_file)
emb_height = vocab.size()

emb_size = len(
    open('./train/entity_embedding.vec').readline().strip().split()) - 1
print(emb_size)

emb = np.random.uniform(-0.05, 0.05, (emb_height, emb_size))
print(emb)

emb = list(emb)

files = [
    './train/entity_embedding.vec', './dev/entity_embedding.vec',
    './test/entity_embedding.vec'
]
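
# The snippet ends here; a plausible continuation, following the same loop as
# Example 2, fills the matrix from each entity_embedding.vec file and saves it.
# The output filename below is illustrative, not taken from the repo.
for file_ in files:
    for line in tqdm(open(file_)):
        l = line.strip().split()
        entity, vals = l[0], l[1:]
        emb[vocab.id(entity)] = np.asarray(list(map(float, vals)))

emb = np.asarray(emb)
np.save('./entity_emb.npy', emb)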
Example 8
import sys
import os

import numpy as np
import gezi
from tqdm import tqdm
from transformers import AutoTokenizer

files = ['./train/news.tsv', './dev/news.tsv', './test/news.tsv']

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)

vocab = gezi.Vocab(f'{model}/vocab.txt', fixed=True)

dids = set()
for file_ in files:
    total = len(open(file_).readlines())
    for line in tqdm(open(file_), total=total):
        l = line.strip().split('\t')
        did, title, abstract = l[0], l[3], l[4]
        if did in dids:
            continue
        dids.add(did)

        if abstract:
            text = title + ' ' + abstract
        else:
            text = title
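
        # Hypothetical continuation (the snippet is cut off here): tokenize the text
        # with the BERT tokenizer and map the word pieces through the fixed vocab,
        # e.g. to build a per-document word-id lookup. This is an assumption, not
        # code recovered from the repo.
        word_ids = [vocab.id(w) for w in tokenizer.tokenize(text)]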