Exemple #1
0
    def data_generator(self):
        """Yield endless (user_ids, item_ids, clicks) tensor batches.

        For every action whose second field equals 1, two samples are added:
        the observed (user, item) pair with label 1. and the same user paired
        with an item id drawn uniformly from [0, self.movie_count - 1] with
        label 0. A batch is yielded, as three tensors moved to self.device,
        as soon as at least self.batch_size labels are buffered. Samples still
        buffered when a pass over self.train ends are discarded, because the
        buffers are re-created at the start of every epoch.
        """
        epoch_num = 0
        while True:
            # Fresh buffers for each epoch.
            batch_users, batch_items, batch_labels = [], [], []

            epoch_num += 1
            pydev.info('Epoch %d' % epoch_num)
            for user_id, actions in self.train:
                clicked = [act for act in actions if act[1] == 1]

                for item_id, _, _ in clicked:
                    # Positive sample first, then one random negative.
                    sampled_item = random.randint(0, self.movie_count - 1)
                    batch_users.extend((user_id, user_id))
                    batch_items.extend((item_id, sampled_item))
                    batch_labels.extend((1., 0.))

                    if len(batch_labels) < self.batch_size:
                        continue

                    yield (torch.tensor(batch_users).to(self.device),
                           torch.tensor(batch_items).to(self.device),
                           torch.tensor(batch_labels).to(self.device))

                    batch_users, batch_items, batch_labels = [], [], []
Exemple #2
0
    def data_generator(self):
        """Yield endless (a_nids, b_nids, clicks) tensor batches of item pairs.

        Within one user's actions whose second field equals 1, each item is
        paired with another item drawn at random from the same filtered list
        (label 1.); the draw is skipped when it picks the same id. Every
        positive pair is followed by a pair with an id drawn uniformly from
        [0, self.movie_count - 1] (label 0.). A batch is yielded once at least
        self.batch_size labels are buffered; leftovers at the end of an epoch
        are discarded when the buffers are re-created.
        """
        epoch_num = 0
        while True:
            # Fresh buffers for each epoch.
            left_ids, right_ids, labels = [], [], []

            epoch_num += 1
            pydev.info('Epoch %d' % epoch_num)
            for user, actions in self.train:
                clicked = [act for act in actions if act[1] == 1]
                last_pos = len(clicked) - 1

                for a_nid, _, _ in clicked:
                    b_nid, _, _ = clicked[random.randint(0, last_pos)]
                    if a_nid != b_nid:
                        # Positive pair, then a randomly sampled negative.
                        left_ids.append(a_nid)
                        right_ids.append(b_nid)
                        labels.append(1.)

                        left_ids.append(a_nid)
                        right_ids.append(
                            random.randint(0, self.movie_count - 1))
                        labels.append(0.)

                        if len(labels) >= self.batch_size:
                            yield (torch.tensor(left_ids).to(self.device),
                                   torch.tensor(right_ids).to(self.device),
                                   torch.tensor(labels).to(self.device))

                            left_ids, right_ids, labels = [], [], []
Exemple #3
0
    def data_generator(self):
        """Yield endless (input_nids, input_offset, y, clicks) tensor batches.

        For every action of a user, the ids of ALL of that user's actions are
        appended to the flat ``input_nids`` list and the start position of
        that run is recorded in ``input_offset``; ``y`` receives the action's
        item and ``clicks`` its click value as a float. A batch is yielded
        once at least self.batch_size samples are buffered; leftovers at the
        end of an epoch are discarded when the buffers are re-created.

        NOTE(review): the (flat ids, offsets) layout looks like it is meant
        for an nn.EmbeddingBag input -- confirm against the consuming model.
        """
        epoch_num = 0
        while True:
            # Fresh buffers for each epoch.
            input_nids = []
            input_offset = []
            y = []
            clicks = []

            epoch_num += 1
            pydev.info('Epoch %d' % epoch_num)
            for user, actions in self.train:
                # A real list (not a lazy map object) so it can be appended
                # once per action without being exhausted.
                history_ids = [action[0] for action in actions]
                for item, click, _ in actions:
                    input_offset.append(len(input_nids))
                    input_nids += history_ids
                    y.append(item)
                    clicks.append(float(click))

                    if len(clicks) >= self.batch_size:
                        yield (torch.tensor(input_nids).to(self.device),
                               torch.tensor(input_offset).to(self.device),
                               torch.tensor(y).to(self.device),
                               torch.tensor(clicks).to(self.device))

                        input_nids = []
                        input_offset = []
                        y = []
                        clicks = []
Exemple #4
0
    def data_generator(self):
        """Yield endless five-tensor batches built from self.data.

        Each record (uid, iid, user_genres, click) contributes its user id,
        item id and float click label; its genre ids are appended to a flat
        list whose start position is stored in the offsets list. The yielded
        tuple is (user_ids, item_ids, user_genres_ids, user_genres_offset,
        clicks), all moved to self.device, emitted once at least
        self.batch_size labels are buffered. Leftovers at the end of an epoch
        are discarded when the buffers are re-created.
        """
        epoch_num = 0
        while True:
            # Fresh buffers for each epoch.
            uid_buf, iid_buf, label_buf = [], [], []
            genre_ids, genre_offsets = [], []

            epoch_num += 1
            pydev.info('Epoch %d' % epoch_num)
            for uid, iid, user_genres, click in self.data:
                uid_buf.append(uid)
                iid_buf.append(iid)
                # Offset of this record's genres inside the flat id list.
                genre_offsets.append(len(genre_ids))
                genre_ids.extend(user_genres)
                label_buf.append(float(click))

                if len(label_buf) < self.batch_size:
                    continue

                yield (torch.tensor(uid_buf).to(self.device),
                       torch.tensor(iid_buf).to(self.device),
                       torch.tensor(genre_ids).to(self.device),
                       torch.tensor(genre_offsets).to(self.device),
                       torch.tensor(label_buf).to(self.device))

                uid_buf, iid_buf, label_buf = [], [], []
                genre_ids, genre_offsets = [], []
Exemple #5
0
 def load_uid_iid_data(self):
     """Read the 'data' directory and keep the three returned splits.

     The first split is stored as self.test_of_train rather than as training
     data (original note: the full train set is not needed, only 10000 train
     rows are loaded to be evaluated like a test set).
     """
     pydev.info('Begin loading data..')
     splits = utils.readdata('data', test_num=10000)
     self.test_of_train, self.valid, self.test = splits
     pydev.info('Load over')
Exemple #6
0
 def test_SlotFileWriter(self):
     """Write 1000 instances of 10 slots each to 'test.ins' and summarize.

     Instance labels alternate 0/1; slot ``slotJ`` carries the values
     0..J-1.
     """
     writer = SlotFileWriter('test.ins')
     for ins_no in range(1000):
         writer.begin_instance(ins_no % 2)
         for slot_no in range(10):
             writer.write_slot('slot%s' % slot_no, list(range(slot_no)))
         writer.end_instance()
     writer.summary()
     pydev.info('passed')
Exemple #7
0
    def __init__(self):
        """Initialise the pydev app, enable debug mode and load all splits.

        The data is read from the 'data' directory with test_num=-1
        (presumably "no limit" -- confirm against utils.readdata).
        """
        pydev.App.__init__(self)

        self.debug = True

        pydev.info('Begin loading data..')
        splits = utils.readdata('data', test_num=-1)
        self.train, self.valid, self.test = splits
        pydev.info('Load over')
Exemple #8
0
def eval(data_path, image_set='val', model_path=None):
    """Evaluate the segmentation model on a VOCSegmentation split.

    Every sample is run through the model with gradients disabled. The
    progress bar shows the running mean of the per-label ratios accumulated
    by ``label_precision`` together with the average of ``precision`` over
    the samples seen so far; the same summary is logged at the end, followed
    by one printed line per label.

    Args:
        data_path: dataset root passed to ``tv.datasets.VOCSegmentation``.
        image_set: name of the split to evaluate.
        model_path: optional path of a state dict to load; skipped if falsy.

    NOTE(review): inputs are moved to CUDA but the model is not moved here;
    presumably the model factory already returns a CUDA model -- confirm.
    """
    img_tr, tgt_tr = train.eval_transform()
    data = tv.datasets.VOCSegmentation(data_path,
                                       image_set=image_set,
                                       transform=img_tr,
                                       target_transform=tgt_tr)
    pydev.info('Data loaded')

    model = models.V1_tv_dlabv3_res101()
    pydev.info('Model loaded')

    if model_path:
        pydev.info('load model params from [%s]' % model_path)
        model.load_state_dict(torch.load(model_path))
        pydev.info('model load ok')

    with torch.no_grad():
        model.eval()
        bar = tqdm.tqdm(range(len(data)))
        acc_prec = 0
        acc_count = 0
        # Bound up front so the final summary also works on an empty dataset.
        mean_iou = 0.
        stat_dict = {}
        for i in range(0, label_count + 1):
            stat_dict[i] = [0, 0]
        for idx in bar:
            # Fetch the sample once: each data[idx] access re-loads and
            # re-transforms the image/target pair.
            image, target = data[idx]
            y = model(image.unsqueeze(0).cuda())

            out = y.squeeze()
            v = out.max(dim=2).indices.cpu().to(torch.uint8)
            label_precision(v, target, stat_dict)

            # The 1e-4 keeps labels that have not been seen yet from dividing
            # by zero.
            mean_iou = sum(s[0] / (s[1] + 1e-4)
                           for s in stat_dict.values()) / len(stat_dict)

            prec = precision(v, target)
            acc_prec += prec
            acc_count += 1

            bar.set_description('Mean_IOU=%.2f%% MIoU_on_ins=%.2f%%' %
                                (mean_iou * 100., acc_prec * 100. / acc_count))
        pydev.info('Mean_IOU=%.2f%% MIoU_on_ins=%.2f%%' %
                   (mean_iou * 100., acc_prec * 100. / max(acc_count, 1)))

        for c in range(0, label_count + 1):
            numer, denom = stat_dict[c]
            # A label that never occurred has a zero denominator; report 0%
            # instead of raising ZeroDivisionError.
            ratio = numer * 100. / denom if denom else 0.
            print('label-%d: %.2f%% (%d/%d)' % (c, ratio, numer, denom))
Exemple #9
0
def load_movies(path, ignore_tags=False):
    """Load movie records from <path>/movies.csv, optionally with genome tags.

    Each row is parsed into a MovieInfo (id, title, '|'-separated genres) and
    stored in a dict keyed by the integer movie id. When ignore_tags is true
    the dict is returned right after this step. Otherwise the tag names from
    <path>/genome-tags.csv and the (movie, tag, score) rows from
    <path>/genome-scores.csv are attached to the known movies.

    NOTE(review): only part of this function is visible here; what happens
    after the genome-scores loop is not documented in this view.
    """
    # Load basic movie info.
    movies = {}
    for line in file(path + '/movies.csv').readlines():
        # A title may itself contain ',', so a row can have more than two
        # separators: the first field is the id, the last one the genres and
        # everything in between is re-joined into the title.
        row = line.strip().split(',')
        movie_id = row[0]
        title = ','.join(row[1:-1])
        genres = row[-1]

        if movie_id == 'movieId':
            # Skip the CSV header row.
            continue

        movie = MovieInfo()
        movie.id = int(movie_id)
        movie.title = title
        movie.genres = genres.split('|')
        movie.process()
        movies[movie.id] = movie
    pydev.info('load movie basic info over.')

    if ignore_tags:
        return movies

    # Load tag meta-info: tag id (kept as a string) -> tag name.
    tag_info = {}
    for tagid, tag in pydev.foreach_row(file(path + '/genome-tags.csv'),
                                        seperator=','):
        if tagid == 'tagId':
            # Skip the CSV header row.
            continue
        tag_info[tagid] = tag.strip()
    pydev.info('load tags info over.')

    # Load genome scores and attach (tag id, tag name, score) to each movie.
    tag_match_count = 0
    for movieid, tagid, score in pydev.foreach_row(file(path +
                                                        '/genome-scores.csv'),
                                                   seperator=','):
        try:
            key = int(movieid)
            if key not in movies:
                continue
            movies[key].tags.append(
                (int(tagid), tag_info.get(tagid, ''), float(score)))
            tag_match_count += 1
        except Exception, e:
            # Rows that fail to parse (the header row included) are reported
            # and skipped.
            pydev.err(e)
Exemple #10
0
    def load(self, fd):
        """Rebuild the per-slot index coders from an open index file.

        The first line lists the slot names separated by tabs; every
        following row is (slot, key, idx). Rows must arrive in index order
        for each slot.

        Raises:
            Exception: if a row names an unknown slot, or if its idx is not
                the next free position of that slot's coder.
        """
        header_slots = fd.readline().strip().split('\t')
        self.__slot_index = dict(
            (slot_name, IndexCoder()) for slot_name in header_slots)
        pydev.info('%d slot info loaded' % len(self.__slot_index))

        for slot, key, idx in pydev.foreach_row(fd):
            if slot not in self.__slot_index:
                raise Exception('Cannot get slot : %s' % slot)
            coder = self.__slot_index[slot]

            # The stored index must equal the position the key is about to
            # receive.
            next_pos = len(coder.tags)
            if int(idx) != next_pos:
                raise Exception('Index not match : %s:%s:%s' %
                                (slot, idx, key))

            coder.index[key] = next_pos
            coder.tags.append(key)
Exemple #11
0
    def __init__(self, slot_info, embedding_size):
        """Build one mean-mode EmbeddingBag per slot plus the dense stack.

        Args:
            slot_info: iterable of (slot name, number of features) pairs.
            embedding_size: width of every slot embedding.

        The first linear layer takes embedding_size * (number of slots)
        inputs; the stack is 256 -> 256 -> 128 -> 128 -> 1.
        """
        nn.Module.__init__(self)

        self.emb_bags = nn.ModuleList()
        for slot, slot_feanum in slot_info:
            pydev.info('init embeding bag of %s (%d)' % (slot, slot_feanum))
            bag = nn.EmbeddingBag(slot_feanum, embedding_size, mode='mean')
            self.emb_bags.append(bag)

        # Each slot contributes one embedding_size-wide vector.
        total_input_length = embedding_size * len(self.emb_bags)

        pydev.info('input_length : %d' % total_input_length)
        self.fc1 = nn.Linear(total_input_length, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 128)
        self.out = nn.Linear(128, 1)
Exemple #12
0
def measure(predictor, test, debug=False):
    """Score every (uid, iid, score) row of ``test`` and log the ROC AUC.

    Args:
        predictor: callable ``predictor(uid, iid, debug_fd)`` returning the
            predicted score; debug_fd is None unless debug is enabled.
        test: iterable of (uid, iid, score) rows.
        debug: when true, every row and its prediction is also written to
            'log/debug.log' as tab-separated text.
    """
    progress = tqdm.tqdm(test)
    y = []
    y_ = []

    debug_fd = open('log/debug.log', 'w') if debug else None
    try:
        for uid, iid, score in progress:
            pred_score = predictor(uid, iid, debug_fd)
            if debug_fd is not None:
                debug_fd.write('%s\t%s\t%d\t%.3f\n' %
                               (uid, iid, score, pred_score))

            y.append(score)
            y_.append(pred_score)
    finally:
        # Close the debug log even if the predictor raises; it was
        # previously left open.
        if debug_fd is not None:
            debug_fd.close()

    pydev.info('Predict over')

    auc = metrics.roc_auc_score(y, y_)
    pydev.log('Test AUC: %.3f' % auc)
Exemple #13
0
    def __init__(self, train, device, epoch_count, batch_size, movie_dir):
        """Extract user-genre features for every training row.

        Args:
            train: iterable of (uid, iid, click) rows.
            device: stored as self.device.
            epoch_count: number of epochs, used for train_iter_count.
            batch_size: batch size, used for train_iter_count.
            movie_dir: directory passed to utils.load_movies (tags ignored).

        Every row becomes (uid, iid, user_genres, click) in self.data, where
        user_genres holds the 'uid_genres' slot indices allocated for the
        keys '<uid>_<genre>' of the movie's genres. user_count and
        movie_count are the largest ids seen plus one.
        """
        pydev.info('load movies')
        self.movies = utils.load_movies(movie_dir, ignore_tags=True)

        self.epoch_count = epoch_count
        self.batch_size = batch_size
        self.data_count = 0
        self.device = device
        self.data = []

        largest_uid = 0
        largest_iid = 0

        write_progress = tqdm.tqdm(train)
        self.slot_coder = easy_train.SlotIndexCoder()
        # Feature extraction.
        for uid, iid, click in write_progress:
            if iid > largest_iid:
                largest_iid = iid
            if uid > largest_uid:
                largest_uid = uid
            self.data_count += 1

            # Movies missing from the catalogue fall back to an empty
            # MovieInfo.
            movie = self.movies.get(int(iid), utils.MovieInfo())
            user_genres = [
                self.slot_coder.alloc('uid_genres', '%s_%s' % (uid, genres))
                for genres in movie.genres
            ]

            self.data.append((uid, iid, user_genres, click))

        total_samples = self.epoch_count * self.data_count
        self.train_iter_count = total_samples / self.batch_size

        self.user_count = largest_uid + 1
        self.movie_count = largest_iid + 1

        pydev.log('user_count=%d' % self.user_count)
        pydev.log('movie_count=%d' % self.movie_count)
        pydev.log('data_count=%d' % self.data_count)
Exemple #14
0
    def data_generator(self):
        """Yield endless (user_ids, item_ids, clicks) tensor batches.

        self.train is iterated as (uid, iid, click) rows; the running epoch
        number is kept in self.current_epoch. A batch is yielded on
        self.device once at least self.batch_size rows are buffered; rows
        left over at the end of an epoch are discarded when the buffers are
        re-created.
        """
        self.current_epoch = 0
        while True:
            # Fresh buffers for each epoch.
            uid_buf, iid_buf, label_buf = [], [], []

            self.current_epoch += 1
            pydev.info('Epoch %d' % self.current_epoch)
            for uid, iid, click in self.train:
                uid_buf.append(uid)
                iid_buf.append(iid)
                label_buf.append(float(click))

                if len(label_buf) < self.batch_size:
                    continue

                yield (torch.tensor(uid_buf).to(self.device),
                       torch.tensor(iid_buf).to(self.device),
                       torch.tensor(label_buf).to(self.device))

                uid_buf, iid_buf, label_buf = [], [], []
Exemple #15
0
    def __init__(self, user_count, item_count, user_genres_count,
                 embedding_size):
        """Create the user, item and user-genres embeddings and the output layer.

        Args:
            user_count: number of rows of the user embedding table.
            item_count: number of rows of the item embedding table.
            user_genres_count: number of rows of the mean-mode EmbeddingBag.
            embedding_size: width shared by all three embeddings.

        The final linear layer maps embedding_size * 3 inputs to one output.
        """
        nn.Module.__init__(self)

        sizes = (
            ('user_count=%d', user_count),
            ('item_count=%d', item_count),
            ('user_genres_count=%d', user_genres_count),
            ('embedding=%d', embedding_size),
        )
        for fmt, value in sizes:
            pydev.info(fmt % value)

        self.uid_emb = nn.Embedding(user_count, embedding_size)
        self.iid_emb = nn.Embedding(item_count, embedding_size)
        self.user_genres_emb = nn.EmbeddingBag(
            user_genres_count, embedding_size, mode='mean')

        self.lr = nn.Linear(embedding_size * 3, 1)
Exemple #16
0
    def __init__(self,
                 filename,
                 seperator=',',
                 contain_key=True,
                 metric='angular'):
        '''
            Load embedding vectors from a text file into an Annoy index.

            File format(contain_key=True):
               <key>[tab]<num><seperator><num><seperator>...

            File format(contain_key=False):
               <num><seperator><num><seperator>...
               (the 0-based line number is used as the key)

            The vector size is taken from the first line; later lines with a
            different number of values are skipped. Keys are converted with
            int() before being added to the index, which is built with 32
            trees once the file has been read.
        '''
        self.index = None  # lazy create.
        self.emb_size = 0

        line_count = 0
        valid_count = 0
        # The context manager closes the file; it was previously left open.
        with open(filename) as fd:
            for line in fd:
                if contain_key:
                    key, value = line.strip().split('\t')
                else:
                    key = line_count
                    value = line.strip()

                line_count += 1
                # Honour the caller-supplied separator instead of a
                # hard-coded ','.
                value = value.split(seperator)

                d = len(value)
                if self.index is None:
                    # First line: it fixes the embedding size.
                    self.emb_size = d
                    self.index = annoy.AnnoyIndex(f=self.emb_size,
                                                  metric=metric)
                    pydev.info('set emb_size=%d metric=%s' %
                               (self.emb_size, metric))
                elif d != self.emb_size:
                    continue

                vec = [float(x) for x in value]
                self.index.add_item(int(key), vec)
                valid_count += 1

        pydev.info('emb load over, begin to build index..')
        self.index.build(32)
        pydev.info('EmbeddingDict load over: valid_count=%d, line_count=%d' %
                   (valid_count, line_count))
Exemple #17
0
    def nid_ctr(self):
        """Evaluate per-item click-through-rate predictors on self.test.

        Display and click counts are collected per item id from self.train,
        then three variants are measured with utils.measure:
        no smoothing (clicks / (disp + 0.1)), static smoothing
        ((clicks + 1) / (disp + 10)) and smoothing with the global click and
        display totals scaled by 0.00001.
        """
        stat = {}
        global_disp = 0
        global_click = 0
        for uid, iid, click in self.train:
            counters = stat.setdefault(iid, [0, 0])
            counters[0] += 1
            global_disp += 1
            if click:
                counters[1] += 1
                global_click += 1

        global_click_ratio = global_click * 0.00001
        global_disp_ratio = global_disp * 0.00001

        def predict(uid, iid, debug_fd, smooth):
            # Items never seen in training count as zero displays and clicks.
            disp, clk = stat.get(iid, [0, 0])
            if debug_fd:
                print >> debug_fd, 'stat\t%s\t%d\t%d' % (iid, disp, clk)
            if smooth == 0:
                return clk * 1. / (disp + 0.1)
            if smooth == 1:
                return (clk + 1.) / (disp + 10.)
            if smooth == 2:
                return (clk + global_click_ratio) / (disp + global_disp_ratio)

        def make_predictor(mode):
            # Bind the smoothing mode so the result has the three-argument
            # signature expected by utils.measure.
            return lambda u, i, d: predict(u, i, d, smooth=mode)

        variants = (
            ('nid_ctr with none smooth', 0),
            ('nid_ctr with static smooth', 1),
            ('nid_ctr with ratio smooth', 2),
        )
        for title, mode in variants:
            pydev.info(title)
            utils.measure(make_predictor(mode), self.test, debug=self.debug)
Exemple #18
0
def infer(data_path, image_set='val', model_path=None):
    """Interactively segment dataset samples whose index is read from stdin.

    For every index typed on stdin the sample is run through the model, the
    precision against its target is logged, and the original image, the
    original target and the predicted label map (using the palette of the
    first target) are shown. The loop ends at end-of-file; lines that are not
    integers are reported and skipped.

    Args:
        data_path: dataset root passed to ``tv.datasets.VOCSegmentation``.
        image_set: name of the split to browse.
        model_path: optional path of a state dict to load; skipped if falsy.

    NOTE(review): inputs are moved to CUDA but the model is not moved here;
    presumably the model factory already returns a CUDA model -- confirm.
    """
    ori_imgs = tv.datasets.VOCSegmentation(data_path, image_set=image_set)
    img_tr, tgt_tr = train.V0_transform()
    data = tv.datasets.VOCSegmentation(data_path,
                                       image_set=image_set,
                                       transform=img_tr,
                                       target_transform=tgt_tr)
    pydev.info('Data loaded')

    model = models.V0_tv_fcn_res101()
    pydev.info('Model loaded')
    if model_path:
        pydev.info('load model params from [%s]' % model_path)
        model.load_state_dict(torch.load(model_path))
        pydev.info('model load ok')

    # Inference only: use eval-mode behaviour for dropout/batch-norm layers.
    model.eval()

    while True:
        line = sys.stdin.readline()
        if not line:
            # End of input: stop instead of crashing on int('').
            break
        try:
            idx = int(line)
        except ValueError:
            pydev.err('Invalid index: %r' % line.strip())
            continue
        pydev.info('Index=%d' % idx)

        # Fetch the sample once; every data[idx] access re-loads it.
        image, target = data[idx]
        with torch.no_grad():
            y = model(image.unsqueeze(0).cuda())

        out = y.squeeze()
        v = out.max(dim=2).indices.cpu().to(torch.uint8)
        prec = precision(v, target)
        pydev.info('Precision=%.2f%%' % (prec * 100.))

        im = PIL.Image.fromarray(v.numpy())
        im.putpalette(ori_imgs[0][1].getpalette())

        ori_imgs[idx][0].show()
        ori_imgs[idx][1].show()
        im.show()
Exemple #19
0
 def summary(self):
     """Log the instance, slot and feature counters kept by this object."""
     counters = (self.__ins_cnt, self.__slot_cnt, self.__fea_cnt)
     pydev.info('Summary: ins=%d, slot=%d, fea=%d' % counters)
Exemple #20
0
                    user_ids = []
                    item_ids = []
                    clicks = []

# Script entry point: read options, load the data and set up the DNNRank
# model, its optimizer, the loss and the batch generator.
if __name__=='__main__':
    # Command-line options, each with its default value.
    autoarg = pydev.AutoArg()
    data_dir = autoarg.option('data', 'data/')
    model_save_path = autoarg.option('output', 'temp/dnn.pkl')

    TestNum = int(autoarg.option('testnum', -1))
    EmbeddingSize = int(autoarg.option('embed', 16))
    EpochCount = int(autoarg.option('epoch', 3))
    BatchSize = int(autoarg.option('batch', 1024))
    device_name = autoarg.option('device', 'cuda')

    pydev.info('EmbeddingSize=%d' % EmbeddingSize)
    pydev.info('Epoch=%d' % EpochCount)
    pydev.info('BatchSize=%d' % BatchSize)

    device = torch.device(device_name)

    # Only the train split is used in the lines visible here.
    train, valid, test = utils.readdata(data_dir, test_num=TestNum)
    data = DataGenerator(train, device, epoch_count=EpochCount, batch_size=BatchSize)

    # The embedding table sizes come from the ids seen by the generator.
    model = DNNRank(data.user_count, data.movie_count, EmbeddingSize).to(device)
    #optimizer = optim.SGD(model.parameters(), lr=0.005)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    loss_fn = nn.BCELoss()
    
    generator = data.data_generator()
Exemple #21
0
                        user_ids = []
                        item_ids = []
                        clicks = []


# Script entry point: expects exactly one argument, the data directory, then
# loads the data and builds the UID_NID_DSSM model on CUDA.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print >> sys.stderr, 'Usage:\ndnn.py <datadir>'
        sys.exit(-1)

    # Fixed hyper-parameters for this script.
    TestNum = 1000
    EmbeddingSize = 256
    EpochCount = 120
    BatchSize = 256

    pydev.info('EmbeddingSize=%d' % EmbeddingSize)
    pydev.info('Epoch=%d' % EpochCount)
    pydev.info('BatchSize=%d' % BatchSize)

    device = torch.device('cuda')

    data_dir = sys.argv[1]

    # Only the train split is used in the lines visible here.
    train, valid, test = utils.readdata(data_dir, test_num=TestNum)
    data = DataGenerator(train,
                         device,
                         epoch_count=EpochCount,
                         batch_size=BatchSize)

    # The embedding table sizes come from the ids seen by the generator.
    model = UID_NID_DSSM(data.user_count, data.movie_count,
                         EmbeddingSize).to(device)
Exemple #22
0
def epoch_train(train,
                model,
                optimizer,
                loss_fn,
                epoch,
                batch_size=32,
                device=None,
                validation=None,
                validation_epoch=10,
                scheduler=None,
                validation_scheduler=None,
                post_process=None):
    """Train a classifier for ``epoch`` epochs with optional validation.

    Args:
        train: iterable of (x, y) batches, iterated once per epoch.
        model: module to train; its output is flattened to (-1, last_dim).
        optimizer: optimizer stepped after every batch.
        loss_fn: called as loss_fn(flattened output, flattened targets).
        epoch: number of epochs to run.
        batch_size: accepted for interface compatibility; not used here.
        device: when truthy, every batch is moved to it.
        validation: data passed to epoch_test.
        validation_epoch: validate every this-many epochs (only when no
            validation_scheduler is given).
        scheduler: optional scheduler stepped once per epoch.
        validation_scheduler: optional scheduler stepped with the validation
            precision after every epoch; enables validation on every epoch.
        post_process: optional callable invoked with the epoch index after
            each epoch.

    Any Exception raised during training is logged through pydev.err and
    swallowed; the function then returns None.
    """
    try:
        best = 0
        for e in range(epoch):
            # Re-enter train mode every epoch, in case the validation pass
            # of the previous epoch left the model in eval mode.
            model.train()
            print('Epoch %d:' % e)
            bar = tqdm.tqdm(train)

            loss_sum = 0.
            correct_all = 0
            count = 0
            epoch_count = 0

            for x, y in bar:
                optimizer.zero_grad()
                if device:
                    x = x.to(device)
                    y = y.to(device)

                y_ = model(x)

                y_ = y_.reshape(-1, y_.shape[-1])
                y = y.reshape(-1)

                loss = loss_fn(y_, y)
                correct = y.eq(y_.max(1)[1]).sum()

                loss.backward()
                optimizer.step()

                # Accumulate plain Python numbers: summing the tensors
                # themselves would keep autograd history alive across
                # batches.
                loss_sum += loss.item()
                correct_all += correct.item()
                count += y.shape[0]
                epoch_count += 1
                bar.set_description(
                    "Loss:%.5f Acc:%.5f" %
                    (loss_sum / epoch_count, correct_all * 1. / count))

            if scheduler:
                scheduler.step()

            if validation_scheduler:
                prec = epoch_test(validation,
                                  model,
                                  device,
                                  precision_threshold=90,
                                  current_best=best)
                if prec > best:
                    best = prec
                validation_scheduler.step(prec)

            elif validation and (e + 1) % validation_epoch == 0:
                prec = epoch_test(validation,
                                  model,
                                  device,
                                  precision_threshold=90,
                                  current_best=best)
                if prec > best:
                    best = prec

            if post_process:
                pydev.info('PostProcessing..')
                post_process(e)

    except Exception as ex:
        pydev.err(ex)
        pydev.err('Training Exception(may be interrupted by control.)')
Exemple #23
0
            id_list, offset = dct.get(slot, [[], []])
            emb_pair = torch.tensor(id_list).to(device), torch.tensor(
                offset).to(device)
            x.append(emb_pair)

        clicks_ = model.forward(x)
        loss = loss_fn(clicks_, clicks)
        loss.backward()

        del x, clicks
        return loss.item()

    last_epoch = reader.epoch()
    # Python 2 closures cannot rebind outer names, so the most recently seen
    # epoch is kept in a mutable cell.
    epoch_state = {'last': last_epoch}

    def while_condition(iter_num):
        """Run the periodic test and tell the trainer whether to continue.

        The test runs when the reader has moved on to a new epoch since the
        previous call, or on every 200th iteration (except iteration 0).
        Returns True while the reader's epoch is below EpochCount.
        """
        epoch = reader.epoch()
        entered_new_epoch = epoch > epoch_state['last']
        if entered_new_epoch:
            # Remember the epoch; otherwise the comparison stays true for
            # every later iteration and the test would run each time.
            epoch_state['last'] = epoch
        if entered_new_epoch or (iter_num > 0 and iter_num % 200 == 0):
            tester.test_ins_data(model, slot_info)
        return epoch < EpochCount

    pydev.info('Begin training..')
    easy.pytorch.common_train(fwbp,
                              optimizer,
                              -1,
                              while_condition=while_condition,
                              loss_curve_output=file('log/train_loss.log',
                                                     'w'))

    pydev.info('Saving model..')
    torch.save(model.state_dict(), model_save_path)
Exemple #24
0
    tag_match_count = 0
    for movieid, tagid, score in pydev.foreach_row(file(path +
                                                        '/genome-scores.csv'),
                                                   seperator=','):
        try:
            key = int(movieid)
            if key not in movies:
                continue
            movies[key].tags.append(
                (int(tagid), tag_info.get(tagid, ''), float(score)))
            tag_match_count += 1
        except Exception, e:
            pydev.err(e)

    # sort tags.
    pydev.info('sort tags..')
    for movie in movies:
        movies[movie].tags = sorted(movies[movie].tags, key=lambda x: -x[2])

    pydev.info('tag matchs : %d' % tag_match_count)

    return movies


def readfile(fd, test_num=-1):
    data = []
    for line in fd.readlines():
        uid, iid, score = line.split(',')
        uid = int(uid)
        iid = int(iid)
        score = int(score)