def data_generator(self):
    """Endlessly yield (user_ids, item_ids, clicks) tensor batches.

    For every action whose second field equals 1, two samples are emitted:
    the (user, item) pair labelled 1.0 and the same user paired with a
    uniformly random item id in [0, movie_count) labelled 0.0.  A batch is
    yielded as soon as it holds at least ``batch_size`` labels; samples
    still pending when a pass over ``self.train`` ends are dropped.
    """
    def to_batch(users, items, labels):
        # Move the three parallel lists onto the configured device.
        return (torch.tensor(users).to(self.device),
                torch.tensor(items).to(self.device),
                torch.tensor(labels).to(self.device))

    epoch_num = 0
    while True:
        users, items, labels = [], [], []
        epoch_num += 1
        pydev.info('Epoch %d' % epoch_num)
        for user_id, actions in self.train:
            positives = [act for act in actions if act[1] == 1]
            for item_id, _, _ in positives:
                # Positive sample.
                users.append(user_id)
                items.append(item_id)
                labels.append(1.)
                # Random negative sample for the same user.
                users.append(user_id)
                items.append(random.randint(0, self.movie_count - 1))
                labels.append(0.)

                if len(labels) < self.batch_size:
                    continue
                yield to_batch(users, items, labels)
                users, items, labels = [], [], []
def data_generator(self):
    """Endlessly yield (a_nids, b_nids, clicks) item-pair tensor batches.

    For each positive action (second field == 1) of a user, a second
    positive action of the same user is drawn at random.  If both refer to
    the same item the draw is skipped; otherwise the pair is emitted with
    label 1.0, followed by the first item paired with a uniformly random
    item id in [0, movie_count) with label 0.0.  A batch is yielded once it
    holds at least ``batch_size`` labels; samples pending at the end of a
    pass over ``self.train`` are dropped.
    """
    epoch_num = 0
    while True:
        # batch for embeddings.
        a_nids = []
        b_nids = []
        clicks = []

        epoch_num += 1
        pydev.info('Epoch %d' % epoch_num)
        for user, actions in self.train:
            # Materialise the positives: they are indexed and measured
            # below, which a lazy filter object does not support.
            positives = [act for act in actions if act[1] == 1]
            for a_nid, _, _ in positives:
                # randrange(n) draws from the same range as randint(0, n - 1).
                b_nid, _, _ = positives[random.randrange(len(positives))]
                if a_nid == b_nid:
                    continue

                a_nids.append(a_nid)
                b_nids.append(b_nid)
                clicks.append(1.)

                a_nids.append(a_nid)
                b_nids.append(random.randrange(self.movie_count))
                clicks.append(0.)

                if len(clicks) >= self.batch_size:
                    yield (torch.tensor(a_nids).to(self.device),
                           torch.tensor(b_nids).to(self.device),
                           torch.tensor(clicks).to(self.device))
                    a_nids = []
                    b_nids = []
                    clicks = []
def data_generator(self):
    """Endlessly yield (input_nids, input_offset, y, clicks) tensor batches.

    For every action of a user, the ids of *all* of that user's actions are
    appended to the flat ``input_nids`` list and the start position of that
    run is recorded in ``input_offset``.  ``y`` holds the action's item and
    ``clicks`` its click value as a float.  A batch is yielded once it holds
    at least ``batch_size`` labels; samples pending at the end of a pass
    over ``self.train`` are dropped.
    """
    epoch_num = 0
    while True:
        # batch for embeddings.
        input_nids = []
        input_offset = []
        y = []
        clicks = []

        epoch_num += 1
        pydev.info('Epoch %d' % epoch_num)
        for user, actions in self.train:
            # put all to input.  Built as a real list because it is
            # re-appended once per action; a lazy map object would be
            # empty after the first use.
            history = [action[0] for action in actions]

            for item, click, _ in actions:
                input_offset.append(len(input_nids))
                input_nids += history
                y.append(item)
                clicks.append(float(click))

                if len(clicks) >= self.batch_size:
                    yield (torch.tensor(input_nids).to(self.device),
                           torch.tensor(input_offset).to(self.device),
                           torch.tensor(y).to(self.device),
                           torch.tensor(clicks).to(self.device))
                    input_nids = []
                    input_offset = []
                    y = []
                    clicks = []
def data_generator(self):
    """Endlessly yield 5-tuples of tensors built from ``self.data``.

    Each yielded tuple is (user_ids, item_ids, user_genres_ids,
    user_genres_offset, clicks).  The genre ids of all rows are stored in
    one flat list, and ``user_genres_offset`` records where each row's run
    begins.  A batch is yielded once it holds at least ``batch_size``
    labels; rows pending at the end of a pass are dropped.
    """
    epoch_num = 0
    while True:
        users, items, labels = [], [], []
        genre_ids, genre_offsets = [], []

        epoch_num += 1
        pydev.info('Epoch %d' % epoch_num)
        for uid, iid, user_genres, click in self.data:
            users.append(uid)
            items.append(iid)
            genre_offsets.append(len(genre_ids))
            genre_ids += user_genres
            labels.append(float(click))

            if len(labels) < self.batch_size:
                continue

            batch = (users, items, genre_ids, genre_offsets, labels)
            yield tuple(
                torch.tensor(column).to(self.device) for column in batch)
            users, items, labels = [], [], []
            genre_ids, genre_offsets = [], []
def load_uid_iid_data(self):
    """Read the data splits from 'data' and store them on the instance.

    The first split returned by ``utils.readdata`` is kept as
    ``test_of_train``; the call is made with ``test_num=10000``.
    """
    pydev.info('Begin loading data..')
    # no need to load train.
    # only load 10000 train as test_of_train.
    splits = utils.readdata('data', test_num=10000)
    self.test_of_train, self.valid, self.test = splits
    pydev.info('Load over')
def test_SlotFileWriter(self):
    """Write 1000 instances of 10 slots each to 'test.ins' as a smoke test.

    Instance labels alternate between 0 and 1; slot ``j`` carries the
    values ``0 .. j-1``.
    """
    writer = SlotFileWriter('test.ins')
    for ins_no in range(1000):
        label = ins_no % 2
        writer.begin_instance(label)
        for slot_no in range(10):
            slot_id = 'slot%s' % slot_no
            values = list(range(slot_no))
            writer.write_slot(slot_id, values)
        writer.end_instance()
    writer.summary()
    pydev.info('passed')
def __init__(self):
    """Initialise the app base class, enable debug and load the data splits."""
    pydev.App.__init__(self)
    self.debug = True

    # -1 is passed straight through to utils.readdata as test_num.
    test_num = -1
    pydev.info('Begin loading data..')
    splits = utils.readdata('data', test_num=test_num)
    self.train, self.valid, self.test = splits
    pydev.info('Load over')
def eval(data_path, image_set='val', model_path=None):
    """Run the segmentation model over a VOC image set and report statistics.

    Args:
        data_path: root directory passed to ``tv.datasets.VOCSegmentation``.
        image_set: name of the VOC split to evaluate.
        model_path: optional path of a state dict to load into the model.

    The running statistics are shown in the progress bar, logged once at the
    end, and followed by one printed line per label.
    """
    img_tr, tgt_tr = train.eval_transform()
    data = tv.datasets.VOCSegmentation(data_path,
                                       image_set=image_set,
                                       transform=img_tr,
                                       target_transform=tgt_tr)
    pydev.info('Data loaded')

    #model = models.V0_tv_fcn_res101()
    model = models.V1_tv_dlabv3_res101()
    pydev.info('Model loaded')
    if model_path:
        pydev.info('load model params from [%s]' % model_path)
        model.load_state_dict(torch.load(model_path))
        pydev.info('model load ok')

    # Per-label pair of counters, updated in place by label_precision.
    stat_dict = {}
    for i in range(0, label_count + 1):
        stat_dict[i] = [0, 0]

    acc_prec = 0
    acc_count = 0
    mean_iou = 0.  # defined up front so an empty data set still reports

    with torch.no_grad():
        model.eval()
        bar = tqdm.tqdm(range(len(data)))
        for idx in bar:
            # Fetch the sample once; each data[idx] re-runs the dataset
            # lookup and its transforms.
            image, target = data[idx]
            y = model(image.unsqueeze(0).cuda())
            out = y.squeeze()
            v = out.max(dim=2).indices.cpu().to(torch.uint8)

            label_precision(v, target, stat_dict)
            mean_iou = sum(
                map(lambda x: x[0] / (x[1] + 1e-4),
                    stat_dict.values())) / len(stat_dict)

            prec = precision(v, target)
            acc_prec += prec
            acc_count += 1
            bar.set_description('Mean_IOU=%.2f%% MIoU_on_ins=%.2f%%' %
                                (mean_iou * 100., acc_prec * 100. / acc_count))

    # max(.., 1) avoids a division by zero when no sample was evaluated.
    pydev.info('Mean_IOU=%.2f%% MIoU_on_ins=%.2f%%' %
               (mean_iou * 100., acc_prec * 100. / max(acc_count, 1)))

    for c in range(0, label_count + 1):
        numer, denom = stat_dict[c]
        # A label whose denominator is still zero is reported as 0%
        # instead of dividing by zero.
        ratio = numer * 100. / denom if denom else 0.
        print('label-%d: %.2f%% (%d/%d)' % (c, ratio, numer, denom))
def load_movies(path, ignore_tags=False):
    """Load movie records, and optionally their genome tags, from ``path``.

    Args:
        path: directory containing ``movies.csv`` and, unless
            ``ignore_tags`` is set, ``genome-tags.csv`` and
            ``genome-scores.csv``.
        ignore_tags: when true, return right after the basic movie info.

    Returns:
        dict mapping integer movie id to its ``MovieInfo``.  When tags are
        loaded, each movie's ``tags`` list holds (tag id, tag text, score)
        tuples sorted by descending score.
    """
    # load movie basic info.
    movies = {}
    with open(path + '/movies.csv') as fd:
        for line in fd:
            # line may contains more then 2 ','
            row = line.strip().split(',')
            movie_id = row[0]
            if movie_id == 'movieId':
                # ignore first line.
                continue

            movie = MovieInfo()
            movie.id = int(movie_id)
            movie.title = ','.join(row[1:-1])
            movie.genres = row[-1].split('|')
            movie.process()
            movies[movie.id] = movie

    pydev.info('load movie basic info over.')
    if ignore_tags:
        return movies

    # load tag meta-info.
    tag_info = {}
    with open(path + '/genome-tags.csv') as fd:
        for tagid, tag in pydev.foreach_row(fd, seperator=','):
            if tagid == 'tagId':
                continue
            tag_info[tagid] = tag.strip()
    pydev.info('load tags info over.')

    # load genome tags info.
    tag_match_count = 0
    with open(path + '/genome-scores.csv') as fd:
        for movieid, tagid, score in pydev.foreach_row(fd, seperator=','):
            try:
                key = int(movieid)
                if key not in movies:
                    continue
                movies[key].tags.append(
                    (int(tagid), tag_info.get(tagid, ''), float(score)))
                tag_match_count += 1
            except Exception as e:
                # Best effort: a malformed row (e.g. the header) is
                # reported and skipped.
                pydev.err(e)

    # sort tags.
    pydev.info('sort tags..')
    for movie in movies:
        movies[movie].tags = sorted(movies[movie].tags, key=lambda x: -x[2])
    pydev.info('tag matchs : %d' % tag_match_count)
    return movies
def load(self, fd):
    """Rebuild the per-slot index coders from an open file.

    The first line lists the slot names separated by tabs.  Every following
    row is ``slot, key, idx``; ``idx`` must equal the number of keys already
    registered for that slot.

    Raises:
        Exception: if a row names an unknown slot or its index is not the
            next free position of that slot.
    """
    header = fd.readline().strip().split('\t')
    self.__slot_index = dict((name, IndexCoder()) for name in header)
    pydev.info('%d slot info loaded' % len(self.__slot_index))

    for slot, key, idx in pydev.foreach_row(fd):
        coder = self.__slot_index.get(slot)
        if coder is None:
            raise Exception('Cannot get slot : %s' % slot)

        next_pos = len(coder.tags)
        if int(idx) != next_pos:
            raise Exception('Index not match : %s:%s:%s' % (slot, idx, key))

        coder.index[key] = next_pos
        coder.tags.append(key)
def __init__(self, slot_info, embedding_size):
    """Build one mean-mode EmbeddingBag per slot plus a 5-layer MLP head.

    Args:
        slot_info: iterable of (slot name, feature count) pairs.
        embedding_size: width of every slot embedding.
    """
    # Ranking model:
    #   input emb_size * N
    nn.Module.__init__(self)

    self.emb_bags = nn.ModuleList()
    for slot, slot_feanum in slot_info:
        pydev.info('init embeding bag of %s (%d)' % (slot, slot_feanum))
        bag = nn.EmbeddingBag(slot_feanum, embedding_size, mode='mean')
        self.emb_bags.append(bag)

    # Every slot contributes embedding_size inputs to the first layer.
    total_input_length = embedding_size * len(self.emb_bags)
    pydev.info('input_length : %d' % total_input_length)

    self.fc1 = nn.Linear(total_input_length, 256)
    self.fc2 = nn.Linear(256, 256)
    self.fc3 = nn.Linear(256, 128)
    self.fc4 = nn.Linear(128, 128)
    self.out = nn.Linear(128, 1)
def measure(predictor, test, debug=False):
    """Score every test triple with ``predictor`` and log the ROC AUC.

    Args:
        predictor: callable ``(uid, iid, debug_fd) -> score``.
        test: iterable of ``(uid, iid, score)`` triples.
        debug: when true, write one tab-separated line per prediction to
            ``log/debug.log``; the open file is also handed to
            ``predictor`` (otherwise it receives ``None``).
    """
    y = []
    y_ = []

    debug_fd = open('log/debug.log', 'w') if debug else None
    try:
        for uid, iid, score in tqdm.tqdm(test):
            pred_score = predictor(uid, iid, debug_fd)
            if debug_fd is not None:
                debug_fd.write('%s\t%s\t%d\t%.3f\n' %
                               (uid, iid, score, pred_score))
            y.append(score)
            y_.append(pred_score)
    finally:
        # Close (and thereby flush) the debug log even if a prediction fails.
        if debug_fd is not None:
            debug_fd.close()

    pydev.info('Predict over')
    auc = metrics.roc_auc_score(y, y_)
    pydev.log('Test AUC: %.3f' % auc)
def __init__(self, train, device, epoch_count, batch_size, movie_dir):
    """Extract per-row features from ``train`` and record data-set sizes.

    Args:
        train: iterable of ``(uid, iid, click)`` rows.
        device: stored as ``self.device``.
        epoch_count: number of epochs, used for ``train_iter_count``.
        batch_size: batch size, used for ``train_iter_count``.
        movie_dir: directory passed to ``utils.load_movies``.

    Each row is stored in ``self.data`` as
    ``(uid, iid, user_genres, click)``, where ``user_genres`` holds one
    index per genre of the movie, allocated in the 'uid_genres' slot under
    the key ``'<uid>_<genre>'``.
    """
    max_movie_id = 0
    max_user_id = 0

    pydev.info('load movies')
    self.movies = utils.load_movies(movie_dir, ignore_tags=True)

    self.epoch_count = epoch_count
    self.batch_size = batch_size
    self.data_count = 0
    self.device = device
    self.data = []

    write_progress = tqdm.tqdm(train)
    self.slot_coder = easy_train.SlotIndexCoder()

    # feature extracting.
    for uid, iid, click in write_progress:
        max_movie_id = max(max_movie_id, iid)
        max_user_id = max(max_user_id, uid)
        self.data_count += 1

        movie_id = int(iid)
        # Movies missing from the catalogue fall back to a default MovieInfo.
        movie = self.movies.get(movie_id, utils.MovieInfo())
        user_genres = []
        for genres in movie.genres:
            key = '%s_%s' % (uid, genres)
            idx = self.slot_coder.alloc('uid_genres', key)
            user_genres.append(idx)

        self.data.append((uid, iid, user_genres, click))

    # Floor division keeps the iteration count an integer on every
    # Python version.
    self.train_iter_count = (
        self.epoch_count * self.data_count // self.batch_size)
    self.user_count = max_user_id + 1
    self.movie_count = max_movie_id + 1

    pydev.log('user_count=%d' % self.user_count)
    pydev.log('movie_count=%d' % self.movie_count)
    pydev.log('data_count=%d' % self.data_count)
def data_generator(self):
    """Endlessly yield (user_ids, item_ids, clicks) tensor batches.

    ``self.current_epoch`` is reset to 0 when the generator starts and is
    incremented at the start of every pass over ``self.train``.  A batch is
    yielded once it holds at least ``batch_size`` labels; rows pending at
    the end of a pass are dropped.
    """
    self.current_epoch = 0
    while True:
        users, items, labels = [], [], []

        self.current_epoch += 1
        pydev.info('Epoch %d' % self.current_epoch)
        for uid, iid, click in self.train:
            users.append(uid)
            items.append(iid)
            labels.append(float(click))

            if len(labels) < self.batch_size:
                continue

            yield tuple(
                torch.tensor(column).to(self.device)
                for column in (users, items, labels))
            users, items, labels = [], [], []
def __init__(self, user_count, item_count, user_genres_count, embedding_size):
    """Create the user, item and user-genres embeddings and a linear output.

    Args:
        user_count: number of rows of the user embedding.
        item_count: number of rows of the item embedding.
        user_genres_count: number of rows of the user-genres embedding bag.
        embedding_size: width shared by all three embeddings.
    """
    # Ranking model:
    #   input emb_size * 2 (embbag of input, emb of item to predict)
    #   fc x 4
    nn.Module.__init__(self)

    settings = (
        ('user_count=%d', user_count),
        ('item_count=%d', item_count),
        ('user_genres_count=%d', user_genres_count),
        ('embedding=%d', embedding_size),
    )
    for template, value in settings:
        pydev.info(template % value)

    self.uid_emb = nn.Embedding(user_count, embedding_size)
    self.iid_emb = nn.Embedding(item_count, embedding_size)
    self.user_genres_emb = nn.EmbeddingBag(
        user_genres_count, embedding_size, mode='mean')

    # The linear layer takes the three embeddings side by side.
    self.lr = nn.Linear(embedding_size * 3, 1)
def __init__(self, filename, seperator=',', contain_key=True, metric='angular'):
    '''
        File format(contain_key=True):
            <key>[tab]<num>,<num>,...
        File format(contain_key=False):
            <num>,<num>,...

        Args:
            filename: path of the embedding file.
            seperator: separator between the numbers of one vector.
            contain_key: whether each line starts with '<key>[tab]'; when
                false the 0-based line number is used as the key.
            metric: metric name passed to annoy.AnnoyIndex.

        The vector length of the first line fixes ``emb_size``; later lines
        with a different length are skipped.
    '''
    self.index = None  # lazy create.
    self.emb_size = 0

    line_count = 0
    valid_count = 0
    with open(filename) as fd:
        for line in fd:
            if contain_key:
                key, value = line.strip().split('\t')
            else:
                key = line_count
                value = line.strip()

            line_count += 1
            # Honour the caller's separator instead of a hard-coded ','.
            value = value.split(seperator)
            d = len(value)
            if self.index is None:
                # first create.
                self.emb_size = d
                self.index = annoy.AnnoyIndex(f=self.emb_size, metric=metric)
                pydev.info('set emb_size=%d metric=%s' %
                           (self.emb_size, metric))
            elif d != self.emb_size:
                continue

            vec = [float(x) for x in value]
            self.index.add_item(int(key), vec)
            valid_count += 1

    pydev.info('emb load over, begin to build index..')
    # An empty file never creates the index, so there is nothing to build.
    if self.index is not None:
        self.index.build(32)
    pydev.info('EmbeddingDict load over: valid_count=%d, line_count=%d' %
               (valid_count, line_count))
def nid_ctr(self):
    """Measure three per-item click-ratio predictors built from ``self.train``.

    Per item, the number of rows and the number of clicked rows are counted.
    The predictors differ only in smoothing:
        none   : clicks / (rows + 0.1)
        static : (clicks + 1) / (rows + 10)
        ratio  : (clicks + total_clicks * 1e-5) / (rows + total_rows * 1e-5)
    Each is passed to ``utils.measure`` with ``self.test``.
    """
    stat = {}
    global_disp = 0
    global_click = 0
    for uid, iid, click in self.train:
        counters = stat.setdefault(iid, [0, 0])
        counters[0] += 1
        global_disp += 1
        if click:
            counters[1] += 1
            global_click += 1

    global_click_ratio = global_click * 0.00001
    global_disp_ratio = global_disp * 0.00001

    def predict(uid, iid, debug_fd, smooth):
        disp, clk = stat.get(iid, [0, 0])
        if debug_fd:
            debug_fd.write('stat\t%s\t%d\t%d\n' % (iid, disp, clk))

        if smooth == 0:
            return clk * 1. / (disp + 0.1)
        if smooth == 1:
            return (clk + 1.) / (disp + 10.)
        if smooth == 2:
            return (clk + global_click_ratio) / (disp + global_disp_ratio)

    runs = (
        ('nid_ctr with none smooth', 0),
        ('nid_ctr with static smooth', 1),
        ('nid_ctr with ratio smooth', 2),
    )
    for message, mode in runs:
        pydev.info(message)
        # mode is bound as a default so each predictor keeps its own value.
        predictor = lambda u, i, d, mode=mode: predict(u, i, d, smooth=mode)
        utils.measure(predictor, self.test, debug=self.debug)
def infer(data_path, image_set='val', model_path=None):
    """Interactively segment VOC samples whose index is read from stdin.

    Args:
        data_path: root directory passed to ``tv.datasets.VOCSegmentation``.
        image_set: name of the VOC split to read.
        model_path: optional path of a state dict to load into the model.

    For every index typed on stdin, the precision is logged and the original
    image, the original target and the predicted mask are shown.  Blank
    lines are ignored; the loop ends at end of input.
    """
    ori_imgs = tv.datasets.VOCSegmentation(data_path, image_set=image_set)
    img_tr, tgt_tr = train.V0_transform()
    data = tv.datasets.VOCSegmentation(data_path,
                                       image_set=image_set,
                                       transform=img_tr,
                                       target_transform=tgt_tr)
    pydev.info('Data loaded')

    model = models.V0_tv_fcn_res101()
    pydev.info('Model loaded')
    if model_path:
        pydev.info('load model params from [%s]' % model_path)
        model.load_state_dict(torch.load(model_path))
        pydev.info('model load ok')

    # Inference only: switch layers such as dropout / batch-norm to
    # evaluation behaviour.
    model.eval()

    while True:
        line = sys.stdin.readline()
        if not line:
            # End of input: readline() returns '' only at EOF.
            break
        line = line.strip()
        if not line:
            continue

        idx = int(line)
        pydev.info('Index=%d' % idx)

        # Fetch each sample once instead of re-indexing the datasets.
        image, target = data[idx]
        with torch.no_grad():
            y = model(image.unsqueeze(0).cuda())
        out = y.squeeze()
        v = out.max(dim=2).indices.cpu().to(torch.uint8)
        prec = precision(v, target)
        pydev.info('Precision=%.2f%%' % (prec * 100.))

        im = PIL.Image.fromarray(v.numpy())
        # Reuse the palette of the first target image for the prediction.
        im.putpalette(ori_imgs[0][1].getpalette())

        ori_image, ori_target = ori_imgs[idx]
        ori_image.show()
        ori_target.show()
        im.show()
def summary(self):
    """Log the instance, slot and feature counters in one line."""
    counters = (self.__ins_cnt, self.__slot_cnt, self.__fea_cnt)
    pydev.info('Summary: ins=%d, slot=%d, fea=%d' % counters)
user_ids = [] item_ids = [] clicks = [] if __name__=='__main__': autoarg = pydev.AutoArg() data_dir = autoarg.option('data', 'data/') model_save_path = autoarg.option('output', 'temp/dnn.pkl') TestNum = int(autoarg.option('testnum', -1)) EmbeddingSize = int(autoarg.option('embed', 16)) EpochCount = int(autoarg.option('epoch', 3)) BatchSize = int(autoarg.option('batch', 1024)) device_name = autoarg.option('device', 'cuda') pydev.info('EmbeddingSize=%d' % EmbeddingSize) pydev.info('Epoch=%d' % EpochCount) pydev.info('BatchSize=%d' % BatchSize) device = torch.device(device_name) train, valid, test = utils.readdata(data_dir, test_num=TestNum) data = DataGenerator(train, device, epoch_count=EpochCount, batch_size=BatchSize) model = DNNRank(data.user_count, data.movie_count, EmbeddingSize).to(device) #optimizer = optim.SGD(model.parameters(), lr=0.005) optimizer = optim.Adam(model.parameters(), lr=0.01) loss_fn = nn.BCELoss() generator = data.data_generator()
user_ids = [] item_ids = [] clicks = [] if __name__ == '__main__': if len(sys.argv) != 2: print >> sys.stderr, 'Usage:\ndnn.py <datadir>' sys.exit(-1) TestNum = 1000 EmbeddingSize = 256 EpochCount = 120 BatchSize = 256 pydev.info('EmbeddingSize=%d' % EmbeddingSize) pydev.info('Epoch=%d' % EpochCount) pydev.info('BatchSize=%d' % BatchSize) device = torch.device('cuda') data_dir = sys.argv[1] train, valid, test = utils.readdata(data_dir, test_num=TestNum) data = DataGenerator(train, device, epoch_count=EpochCount, batch_size=BatchSize) model = UID_NID_DSSM(data.user_count, data.movie_count, EmbeddingSize).to(device)
def epoch_train(train,
                model,
                optimizer,
                loss_fn,
                epoch,
                batch_size=32,
                device=None,
                validation=None,
                validation_epoch=10,
                scheduler=None,
                validation_scheduler=None,
                post_process=None):
    """Train ``model`` for ``epoch`` passes over ``train``.

    Args:
        train: iterable of ``(x, y)`` batches.
        model: module to train.
        optimizer: optimizer stepped once per batch.
        loss_fn: callable ``(prediction, target) -> loss``.
        epoch: number of passes to run.
        batch_size: not used by this function.
        device: when set, every batch is moved to it.
        validation: data handed to ``epoch_test``.
        validation_epoch: validate every this many epochs when no
            ``validation_scheduler`` is given.
        scheduler: optional scheduler stepped once per epoch.
        validation_scheduler: optional scheduler stepped with the validation
            precision after every epoch.
        post_process: optional callable invoked with the epoch index after
            every epoch.

    Any ``Exception`` raised during training is logged and swallowed.
    """
    try:
        best = 0
        for e in range(epoch):
            # Re-enter training mode every epoch in case validation left
            # the model in evaluation mode.
            model.train()
            print('Epoch %d:' % e)
            bar = tqdm.tqdm(train)

            loss_sum = 0.
            correct_all = 0
            count = 0
            epoch_count = 0
            for x, y in bar:
                optimizer.zero_grad()
                if device:
                    x = x.to(device)
                    y = y.to(device)

                y_ = model(x)
                # Flatten to (samples, classes) / (samples,) before scoring.
                y_ = y_.reshape(-1, y_.shape[-1])
                y = y.reshape(-1)

                loss = loss_fn(y_, y)
                correct = y.eq(y_.max(1)[1]).sum()

                loss.backward()
                optimizer.step()

                # Accumulate plain Python numbers rather than tensors.
                loss_sum += loss.item()
                correct_all += correct.item()
                count += y.shape[0]
                epoch_count += 1

                bar.set_description(
                    "Loss:%.5f Acc:%.5f" %
                    (loss_sum / epoch_count, correct_all * 1. / count))

            if scheduler:
                scheduler.step()

            if validation_scheduler:
                prec = epoch_test(validation,
                                  model,
                                  device,
                                  precision_threshold=90,
                                  current_best=best)
                if prec > best:
                    best = prec
                validation_scheduler.step(prec)
            elif validation and (e + 1) % validation_epoch == 0:
                prec = epoch_test(validation,
                                  model,
                                  device,
                                  precision_threshold=90,
                                  current_best=best)
                if prec > best:
                    best = prec

            if post_process:
                pydev.info('PostProcessing..')
                post_process(e)

    except Exception as ex:
        pydev.err(ex)
        pydev.err('Training Exception(may be interrupted by control.)')
id_list, offset = dct.get(slot, [[], []]) emb_pair = torch.tensor(id_list).to(device), torch.tensor( offset).to(device) x.append(emb_pair) clicks_ = model.forward(x) loss = loss_fn(clicks_, clicks) loss.backward() del x, clicks return loss.item() last_epoch = reader.epoch() def while_condition(iter_num): epoch = reader.epoch() if epoch > last_epoch or (iter_num > 0 and iter_num % 200 == 0): tester.test_ins_data(model, slot_info) return epoch < EpochCount pydev.info('Begin training..') easy.pytorch.common_train(fwbp, optimizer, -1, while_condition=while_condition, loss_curve_output=file('log/train_loss.log', 'w')) pydev.info('Saving model..') torch.save(model.state_dict(), model_save_path)
tag_match_count = 0 for movieid, tagid, score in pydev.foreach_row(file(path + '/genome-scores.csv'), seperator=','): try: key = int(movieid) if key not in movies: continue movies[key].tags.append( (int(tagid), tag_info.get(tagid, ''), float(score))) tag_match_count += 1 except Exception, e: pydev.err(e) # sort tags. pydev.info('sort tags..') for movie in movies: movies[movie].tags = sorted(movies[movie].tags, key=lambda x: -x[2]) pydev.info('tag matchs : %d' % tag_match_count) return movies def readfile(fd, test_num=-1): data = [] for line in fd.readlines(): uid, iid, score = line.split(',') uid = int(uid) iid = int(iid) score = int(score)