Example #1
0
def save_to_mongo(_results, _saved, model_update_time, data_source):
    """Persist finished prediction results to MongoDB.

    Iterates over the async result entries and, for every entry whose
    computation has finished (``return_value`` is not None) and that has
    not been saved yet, writes one Prediction document per predicted hour.

    Args:
        _results: dict mapping key -> (region, async_result, time_list);
            ``async_result.return_value`` is None while still running.
        _saved: dict mapping key -> bool; mutated in place to mark keys saved.
        model_update_time: timestamp recorded on each Prediction document.
        data_source: "twitter" or "instagram"; selects the target
            database/collection. Any other value saves nothing for that row.

    Returns:
        True if every result had finished, False if any is still pending.
    """
    done = True
    for key, result_pair in _results.items():
        if result_pair[1].return_value is None:
            done = False
            # logging.warn is deprecated; logging.warning is the supported API.
            logging.warning("Haven't finish yet for key %s, continue..." % key)
            continue
        if not _saved[key]:
            _saved[key] = True
            region = result_pair[0]
            predictions = result_pair[1].return_value
            times = result_pair[2]
            for prediction_row, hour in zip(predictions, times):
                p = Prediction()
                p.setRegion(region)
                p.setModelUpdateTime(model_update_time)
                # Column 1 is the predicted mean; column 2 looks like a
                # variance (stored as its square root) — TODO confirm.
                p.setPredictedValues(
                    float(prediction_row[1]),
                    math.sqrt(float(prediction_row[2])))
                p.setTime(str(hour))
                p_json = p.toDict()
                # Both branches save identically; only DB/collection differ.
                if data_source == "twitter":
                    source_config = TwitterConfig
                elif data_source == "instagram":
                    source_config = InstagramConfig
                else:
                    continue  # unknown source: skip, matching original no-op
                save_interface = PredictionInterface()
                save_interface.setDB(source_config.prediction_db)
                save_interface.setCollection(source_config.prediction_collection)
                save_interface.saveDocument(p_json)
    return done
def save_to_mongo(_results, _saved, model_update_time):
    """Write every finished, not-yet-saved prediction result to MongoDB.

    Each unfinished entry (``return_value`` is None) leaves the overall
    status False; each finished entry not yet flagged in ``_saved`` is
    written out as one document per predicted hour.

    Returns:
        True when all entries in ``_results`` have finished, else False.
    """
    all_finished = True
    for result_key, entry in _results.items():
        outcome = entry[1].return_value
        if outcome is None:
            # Still running: remember we are not done and move on.
            all_finished = False
            continue
        if _saved[result_key] == False:
            _saved[result_key] = True
            region = entry[0]
            for row, stamp in zip(outcome, entry[2]):
                prediction = Prediction()
                prediction.setRegion(region)
                prediction.setModelUpdateTime(model_update_time)
                prediction.setPredictedValues(float(row[1]),
                                              math.sqrt(float(row[2])))
                prediction.setTime(str(stamp))
                PredictionInterface().saveDocument(prediction.toJSON())
    return all_finished
Example #3
0
def session_length_performance(prediction: Prediction, dataset: RecSysDataset):
    """Aggregate prediction scores by session length.

    Re-indexes the per-session score frame by ``session_id`` and, for each
    session in ``dataset``, buckets its score under that session's length.

    Returns:
        Dict mapping session length -> (mean score, number of sessions).
    """
    _, score = prediction.get_score()
    score.reset_index(level=1, inplace=True)
    score.set_index(['session_id'], inplace=True)

    buckets = {}
    for idx in range(len(dataset)):
        row_indices = np.array(
            dataset.rec_sys_data.groups[dataset.session_ids[idx]])
        session = dataset.rec_sys_data.session_df.loc[row_indices]
        sid = session['session_id'].iloc[0]
        buckets.setdefault(len(session), []).append(score.loc[sid]['score'])

    return {
        length: (np.array(values).mean(), len(values))
        for length, values in buckets.items()
    }
Example #4
0
def train(config, state=None):
    """Train the recommender network with early stopping and LR reduction.

    Splits the data into train / train-val / val, trains with a cosine
    embedding loss, scores the non-train phases with a Prediction, reduces
    the learning rate on score plateau, and stops once ``patience``
    validation epochs pass without improvement. The best checkpoint is kept
    on disk and finally renamed to include its score.

    Args:
        config: raw configuration dict; normalized through prepare_config.
        state: optional checkpoint dict (same layout as saved below) used
            to resume training.

    Returns:
        Tuple of (best validation score, final path of the best model).
    """
    random.seed(42)

    config = prepare_config(config)

    batch_size = config.get('batch_size')
    patience = config.get('patience')
    num_epochs = config.get('num_epochs')
    learning_rate = config.get('learning_rate')
    phases = config.get('phases')
    reduce_factor = config.get('reduce_factor')
    reduce_patience = config.get('reduce_patience')
    weight_decay = config.get('weight_decay')
    use_cosine_similarity = config.get('use_cosine_similarity')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    data = get_rec_sys_data(size=config.get('dataset_size'))

    # 70% random sample trains; "train_val" re-scores exactly the training
    # sessions (with impressions) and "val" holds out everything else.
    train_dataset = RecSysDataset(
        rec_sys_data=data,
        split_strategy=RandomSampleStrategy(split=0.7),
        include_impressions=False,
        train_mode=True)
    train_val_dataset = RecSysDataset(rec_sys_data=data,
                                      split_strategy=AllSamplesIncludeStrategy(
                                          include=train_dataset.session_ids),
                                      include_impressions=True)
    val_dataset = RecSysDataset(rec_sys_data=data,
                                split_strategy=AllSamplesExceptStrategy(
                                    exclude=train_dataset.session_ids),
                                include_impressions=True)
    print("Num Train Sessions", len(train_dataset))
    print("Num Validation Sessions", len(val_dataset))

    recommend_network = RecommenderNetwork(
        config=config,
        item_size=train_dataset.item_size,
        target_item_size=train_dataset.target_item_size,
    )
    loss_function = nn.CosineEmbeddingLoss()
    rc_optimizer = optim.Adam(recommend_network.parameters(),
                              lr=learning_rate,
                              weight_decay=weight_decay)
    rc_lr_scheduler = ReduceLROnPlateau(rc_optimizer,
                                        mode='max',
                                        factor=reduce_factor,
                                        patience=reduce_patience)

    start_epoch = 0
    best_score_so_far = None

    if state:
        rc_optimizer.load_state_dict(state['rc_optimizer_state_dict'])
        # BUG FIX: the scheduler was previously (incorrectly) fed the
        # *optimizer* state dict. Restore the scheduler's own state when the
        # checkpoint provides it; older checkpoints did not store it.
        if 'rc_lr_scheduler_state_dict' in state:
            rc_lr_scheduler.load_state_dict(state['rc_lr_scheduler_state_dict'])
        recommend_network.load_state_dict(state['network_state_dict'])
        start_epoch = state['epoch']
        best_score_so_far = state['best_score_so_far']

    datasets = {
        "train": train_dataset,
        # BUG FIX: "train_val" previously pointed at train_dataset, so the
        # train_val Prediction was built against the wrong dataset (its
        # DataLoader below correctly uses train_val_dataset).
        "train_val": train_val_dataset,
        "val": val_dataset,
    }

    data_loaders = {
        "train":
        DataLoader(train_dataset,
                   batch_size=batch_size,
                   shuffle=False,
                   num_workers=6,
                   collate_fn=train_dataset.collator),
        "train_val":
        DataLoader(train_val_dataset,
                   batch_size=batch_size,
                   shuffle=False,
                   num_workers=6,
                   collate_fn=train_val_dataset.collator),
        "val":
        DataLoader(val_dataset,
                   batch_size=batch_size,
                   shuffle=False,
                   num_workers=6,
                   collate_fn=val_dataset.collator),
    }

    # Number of batches per phase (progress-bar upper bounds).
    sizes = {
        "train": int(len(train_dataset) / batch_size) + 1,
        "train_val": int(len(train_val_dataset) / batch_size) + 1,
        "val": int(len(val_dataset) / batch_size) + 1,
    }

    # Per-batch training losses for one epoch; zeroed at each phase start.
    losses = np.zeros(int(len(train_dataset) / batch_size + 1.0))

    recommend_network = recommend_network.to(device)

    cur_patience = 0
    print("Uses CUDA: {0}".format(use_cuda))
    for epoch in range(start_epoch, num_epochs):
        print('-' * 15, "Epoch: ", epoch + 1, '\t', '-' * 15)

        for phase in phases:
            cur_dataset = datasets[phase]
            cur_prediction = Prediction(
                dataset=cur_dataset,
                device=device,
                use_cosine_similarity=use_cosine_similarity,
            )
            # Only non-train phases compute a validation score.
            do_validation = phase != 'train'
            if phase == 'train':
                recommend_network.train()
            else:
                recommend_network.eval()

            losses.fill(0)
            with progressbar.ProgressBar(max_value=sizes[phase],
                                         redirect_stdout=True) as bar:
                for idx, data in enumerate(data_loaders[phase]):
                    # Validation batches carry impressions/ids; training
                    # batches only the session tensors.
                    if phase != 'train':
                        sessions, session_lengths, session_targets, item_impressions, impression_ids, target_index, prices, ids = data
                    else:
                        sessions, session_lengths, session_targets = data
                        impression_ids = None
                        item_impressions = None
                        ids = None
                        target_index = None
                        prices = None

                    sessions = sessions.to(device)
                    session_targets = session_targets.to(device)

                    if phase == 'train':
                        rc_optimizer.zero_grad()
                        with torch.set_grad_enabled(True):
                            item_scores = recommend_network(
                                sessions, session_lengths).float()
                            # CosineEmbeddingLoss with target +1 pulls each
                            # score vector towards its session target.
                            loss = loss_function(
                                item_scores, session_targets,
                                torch.ones(session_targets.size(0)).to(device))
                            loss.backward()
                            rc_optimizer.step()
                            losses[idx] = loss.item()
                    else:
                        with torch.set_grad_enabled(False):
                            item_scores = recommend_network(
                                sessions, session_lengths).float()

                            cur_prediction.add_predictions(
                                ids=ids,
                                impression_ids=impression_ids,
                                item_impressions=item_impressions,
                                item_scores=item_scores,
                            )
                    bar.update(idx)
            if do_validation:
                score, _ = cur_prediction.get_score()

                print(phase, " Score: ", score)
                rc_lr_scheduler.step(score)
                if phase == 'val':
                    if best_score_so_far is None or score > best_score_so_far:
                        best_score_so_far = score
                        torch.save(
                            {
                                'epoch': epoch,
                                'best_score_so_far': best_score_so_far,
                                'rc_optimizer_state_dict':
                                rc_optimizer.state_dict(),
                                # Stored so resuming restores the LR schedule.
                                'rc_lr_scheduler_state_dict':
                                rc_lr_scheduler.state_dict(),
                                'network_state_dict':
                                recommend_network.state_dict(),
                                'config': config,
                            }, MODEL_PATH)
                        cur_patience = 0
                        print("New best \\o/")
                    else:
                        cur_patience += 1
                        if cur_patience > patience:
                            print("Not patient anymore => Quit")
                            break
            if not do_validation:
                print(phase, " Loss: ", losses.mean())
        if cur_patience > patience:
            break

    print("Final best model: ", best_score_so_far)
    # Rename the best checkpoint to include its rounded score.
    target_path = os.path.join(
        MODEL_BASE_PATH, '{}_{}.pth'.format(MODEL_NAME,
                                            round(best_score_so_far, 2)))
    shutil.move(MODEL_PATH, target_path)

    return best_score_so_far, target_path
Example #5
0
def save_to_mongo(_results, _saved, model_update_time):
    """Persist all completed prediction results and report completion.

    Walks the result map; any entry still pending (``return_value`` is None)
    makes the return value False. Finished entries not yet marked in
    ``_saved`` are saved as one MongoDB document per predicted hour.
    """
    finished = True
    for result_key in _results:
        pair = _results[result_key]
        outcome = pair[1].return_value
        if outcome is None:
            finished = False
        elif _saved[result_key] == False:
            _saved[result_key] = True
            area = pair[0]
            for values, moment in zip(outcome, pair[2]):
                doc = Prediction()
                doc.setRegion(area)
                doc.setModelUpdateTime(model_update_time)
                doc.setPredictedValues(float(values[1]),
                                       math.sqrt(float(values[2])))
                doc.setTime(str(moment))
                PredictionInterface().saveDocument(doc.toJSON())
    return finished
Example #6
0
def do_prediction(recommender_state, ranking_state, dataset):
    """Score ``dataset`` with the trained recommender and ranking networks.

    Restores both networks from their checkpoint dicts, runs every batch
    through the recommender and the impression-rank network, and collects
    the results into a Prediction object.

    Args:
        recommender_state: checkpoint dict with 'config' and
            'network_state_dict' for the recommender network.
        ranking_state: checkpoint dict with 'config' and
            'network_state_dict' for the impression-rank network.
        dataset: dataset providing item sizes, a collator and batches.

    Returns:
        The populated Prediction instance.
    """
    random.seed(42)
    from recommender_configs import prepare_config
    from utility.prediction import Prediction
    from network.recommender_network import RecommenderNetwork
    from network.impression_network import ImpressionRankNetwork
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    # Rebuild the recommender network and load its trained weights.
    recommender_network = RecommenderNetwork(
        config=prepare_config(recommender_state.get('config')),
        item_size=dataset.item_size,
        target_item_size=dataset.target_item_size,
    )
    recommender_network.load_state_dict(
        recommender_state.get('network_state_dict'))
    recommender_network.eval()

    # Rebuild the impression-ranking network and load its trained weights.
    rank_network = ImpressionRankNetwork(
        config=prepare_config(ranking_state.get('config')),
        item_size=dataset.item_size,
        device=device,
    )
    rank_network.load_state_dict(ranking_state.get('network_state_dict'))
    rank_network.eval()

    data_loader = DataLoader(dataset,
                             batch_size=SUBMISSION_BATCH_SIZE,
                             shuffle=True,
                             num_workers=6,
                             collate_fn=dataset.collator)

    cur_prediction = Prediction(
        dataset=dataset,
        device=device,
    )

    recommender_network = recommender_network.to(device)
    rank_network = rank_network.to(device)
    print("Begin predicting...")
    batch_total = int(len(dataset) / SUBMISSION_BATCH_SIZE + 1)
    with progressbar.ProgressBar(max_value=batch_total,
                                 redirect_stdout=True) as bar:
        for batch_idx, batch in enumerate(data_loader):
            (sessions, session_lengths, _, item_impressions, impression_ids,
             target_index, prices, ids) = batch
            sessions = sessions.to(device)

            item_scores = recommender_network(sessions,
                                              session_lengths).float()
            selected_impression = rank_network(item_impressions, item_scores,
                                               prices)

            cur_prediction.add_predictions(
                ids=ids,
                impression_ids=impression_ids,
                item_impressions=item_impressions,
                item_scores=item_scores,
                selected_impression=selected_impression)
            bar.update(batch_idx)

    return cur_prediction