Example No. 1
def test_all(data_path, file_prefix, density=1, slic=(0,)):
    """Load every requested slice, build the session structures and
    aggregate the reachability statistics across all slices."""
    #data_path = 'data/tmall/slices/'
    #file_prefix = 'dataset15'
    #data_path = 'data/clef/slices/'
    #file_prefix = 'ds'
    #data_path = 'data/rsc15/slices/'
    #file_prefix = 'rsc15-clicks'
    #data_path = 'data/nowplaying/slices/'
    #file_prefix = 'nowplaying'
    #data_path = 'data/aotm/slices/'
    #file_prefix = 'playlists-aotm'
    #data_path = 'data/30music/slices/'
    #file_prefix = '30music-200ks'
    #data_path = 'data/retailrocket/slices/'
    #file_prefix = 'events'

    all_stats = defaultdict(int)
    for i in slic:
        train, test = loader.load_data(data_path,
                                       file_prefix,
                                       rows_train=None,
                                       rows_test=None,
                                       density=density,
                                       slice_num=i)

        s, i2s = load_sessions(train)
        print(data_path, file_prefix, i)

        stats = test_reachability(s, i2s, test)
        for k, v in stats.items():
            all_stats[k] += v
    for k, v in all_stats.items():
        print(k, v)
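
A minimal invocation sketch; the path and prefix below mirror the commented-out rsc15 defaults and are only illustrative, and the call assumes loader, load_sessions and test_reachability are importable in the same module as test_all:

if __name__ == '__main__':
    # aggregate reachability statistics over the first two rsc15 slices
    test_all('data/rsc15/slices/', 'rsc15-clicks', density=1, slic=[0, 1])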
Example No. 2
def dump_sequence(data_path, file_prefix, out_fn, density=1, slic=0):
    """
        Convert training/testing slices into a sequence format
        suitable for entropy rate estimation
    """
    #data_path = "data/tmall/slices/"
    #file_prefix = "dataset15"
    #data_path = "data/clef/slices/"
    #file_prefix = "ds"
    #data_path = "data/nowplaying/slices/"
    #file_prefix = "nowplaying"
    #data_path = "data/aotm/slices/"
    #file_prefix = "playlists-aotm"

    #data_path = "data/rsc15/slices/"
    #file_prefix = "rsc15-clicks"
    #data_path = "data/30music/slices/"
    #file_prefix = "30music-200ks"
    #data_path = "data/retailrocket/slices/"
    #file_prefix = "events"

    train, test = loader.load_data(data_path,
                                   file_prefix,
                                   rows_train=None,
                                   rows_test=None,
                                   density=density,
                                   slice_num=slic)

    # concatenate train and test (DataFrame.append was removed in pandas 2.0;
    # assumes pandas is available as pd at module level)
    all_data = pd.concat([train, test])

    # group by session; items within each session are written in timestamp order
    groupby = all_data.groupby("SessionId")
    with open(out_fn, "w") as f:
        for session_id, session in groupby:
            for item_id in session.sort_values("Time")["ItemId"]:
                f.write("{}\n".format(item_id))
            # sessions are separated by a -1 sentinel line
            f.write("-1\n")
Example No. 3
     
    adpt = ad.Adapter(algo='fism')
    algs['fism'] = adpt
     
    adpt = ad.Adapter(algo='fossil')
    algs['fossil'] = adpt
     
    adpt = ad.Adapter(algo='fpmc')
    algs['fpmc'] = adpt
    
    '''

    #load data
    train, test = loader.load_data(data_path,
                                   file_prefix,
                                   rows_train=limit_train,
                                   rows_test=limit_test,
                                   density=density_value)
    buys = loader.load_buys(data_path, buys_prefix)
    item_ids = train.ItemId.unique()

    #init metrics
    for m in metric:
        m.init(train)
    # result dict
    res = {}
    res_buys = {}

    #train algorithms
    for k, a in algs.items():
        ts = time.time()
Example No. 4
        else:
            return suff + ".w2v"


if __name__ == '__main__':
    # for testing in main
    import sys
    sys.path.append('../../')
    from evaluation import loader
    data_path = '../../data/retailrocket/slices/'
    file_prefix = 'events'
    data_trained = '../../data/retailrocket/prepared2d/'

    train, test = loader.load_data(data_path,
                                   file_prefix,
                                   slice_num=0,
                                   rows_train=None,
                                   rows_test=None,
                                   density=1)
    items_to_predict = test['ItemId'].unique()

    factors = 100
    window = 5
    sg = 1
    epochs = 10
    model = Item2Vec(factors=factors,
                     window=window,
                     sg=sg,
                     workers=4,
                     hs=1,
                     epochs=epochs)
    model.fit(train)
Example No. 5
def create_w2v_features(train, size=10, pos=False):

    start = time.time()

    # item ids are treated as string tokens for word2vec
    train['ItemId'] = train['ItemId'].astype('str')
    print(train['ItemId'].min())

    # one item-id sequence per session
    sequences = train.groupby('SessionId')['ItemId'].apply(list)

    print('prepared features in ', (time.time() - start))

    # Learn decomposition ----------------------------------------------------------------
    print('ITEM2VEC FEATURES')
    start = time.time()

    # gensim < 4 API: in gensim >= 4, size/iter became vector_size/epochs and
    # wv.syn0/wv.vocab became wv.vectors/wv.key_to_index
    model = gensim.models.Word2Vec(sequences, size=size, window=5, min_count=1, workers=4, iter=50)

    weights = model.wv.syn0
    np.save(open(FOLDER + 'w2v.' + str(size) + '.wght', 'wb'), weights)

    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    with open(FOLDER + 'w2v.' + str(size) + '.voc', 'w') as f:
        f.write(json.dumps(vocab))
  
if __name__ == '__main__':
    
    train, test = ld.load_data(FOLDER, FILE)
#     create_latent_factors( combi, size=32, pos=False )
    create_w2v_features(train, size=64)
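
A small companion sketch for reading the saved artifacts back; the file names mirror the ones written by create_w2v_features, FOLDER, np and json are assumed to be the module-level names already used above, and load_w2v_features is only an illustrative name:

def load_w2v_features(size=64):
    # weight matrix saved with np.save and the item-id -> row-index vocabulary
    weights = np.load(FOLDER + 'w2v.' + str(size) + '.wght')
    with open(FOLDER + 'w2v.' + str(size) + '.voc') as f:
        vocab = json.load(f)
    return weights, vocab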
    
Example No. 6
def main():
    train, test = dl.load_data(FOLDER, PREFIX)
    split_data(train, FOLDER + PREFIX, DAYS_TEST)