Example 1
def create_graph_domain():
    """
        Creates graph linking (domain searched, domain bought)
    """
    
    """
        Fetch data
    """
    
    from input.read_input import read_item_data
    df = read_item_data()
    df['item_id'] = df.index
    dct_title = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat= df['category_id'].to_dict()
    
    dct_price = df['price'].to_dict()
    
    """ Ratio stuff """    
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')
    
    ratio_df = get_ratio(which='item_id',full=True)
    ratio_df['popularity'] = 100.0*ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()
    
    
    
    """
        JSON
    
    """
    # keep only the first 80% of the 413,163 training lines (the train split)
    check = lambda x: x <= np.round(413163*0.8).astype(np.int32)
    
    DATA_PATH = path.join(DATA_DIR,'train_dataset.jl')
    line_i = 0
    
    

    """
        Create graph vertices
    """
    g = ig.Graph() 
    from input.read_input import get_mappings
    counter, f_map_func, r_map_func = get_mappings()
    
    num_items = df.shape[0]
    for k in dct_title.keys():
        g.add_vertex(value=k,
                     deg=dct_ratio_item_b[k],
                     domain_id=dct_domain[k],
                     price=dct_price[k],
                     cat='item_id')

    """ ['item_id','domain_id','category_id','product_id'] """
    
    for k in pd.unique(df['domain_id']):
        g.add_vertex(value=k,cat='domain_id')


    for k in pd.unique(df['category_id']):
        g.add_vertex(value=k,cat='category_id')


    for k in pd.unique(df['product_id']):
        g.add_vertex(value=k,cat='product_id')

    
    
    """
        Create edges
    """
    E1 = []
    E2 = []
    
    with jsonlines.open(DATA_PATH) as reader:
        for line_i, obj in enumerate(reader):
            if check(line_i):
                print(line_i)
                L = []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        #print("Viewed {}".format(dct[h['event_info']]))
                        L.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        #print("Searched {}".format(h['event_info']))
                        pass
                L_domain = [dct_domain[k] for k in L]
                L_domain = pd.unique(L_domain)
                L_cat = [dct_cat[k] for k in L]
                L_cat = pd.unique(L_cat)
                
                for i in range(len(L)):
                    E1.append(dct_domain[L[i]])
                    E2.append(dct_domain[obj['item_bought']])

    
    
    E1 = f_map_func['domain_id'](E1)
    E2 = f_map_func['domain_id'](E2)
    
    
    # collapse repeated (searched domain, bought domain) pairs into weighted edges
    E = pd.Series(list(zip(E1,E2))).value_counts()
    g.add_edges(E.index)
    g.es["weight"] = E.values
    
                     
    g.write_pickle(fname=path.join(DATA_DIR,'graph_domain_to_domain.pkl'))
    return g
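
A minimal usage sketch (assuming the same module-level `path`, `DATA_DIR` and `igraph` imports used throughout these examples): reload the pickled graph and list its heaviest (searched domain -> bought domain) edges.

import igraph as ig

g = ig.Graph.Read_Pickle(path.join(DATA_DIR, 'graph_domain_to_domain.pkl'))
for e in sorted(g.es, key=lambda e: e["weight"], reverse=True)[:5]:
    print(g.vs[e.source]["value"], "->", g.vs[e.target]["value"], e["weight"])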
Example 2
def train_neural_domain_prediction():
    import tensorflow as tf
    """
        Create graph
    """
    from input.read_input import read_item_data
    df = read_item_data()
    dct_condition = df['condition'].to_dict()
    
    from input.create_ratio import load_language_df
    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()
    
    
    
    NUM_ITEMS = df.shape[0]
    NUM_FEATURES = 1

   


    
    
    
    from input.read_input import get_mappings
    counter, f_map_func, r_map_func = get_mappings()

    NUM_DOMS = pd.unique(df['domain_id']).shape[0]
    NUM_CATS = pd.unique(df['category_id']).shape[0]
    
    """ Load graph """
    graph_fname = path.join(DATA_DIR,'graph_domain_to_domain.pkl')
    if not path.isfile(graph_fname):
        input("Did not find graph at {}. Will have to create it from scratch... (Any key to continue)".format(graph_fname))
        G = create_graph_domain()
    else:
        G = ig.Graph.Read_Pickle(path.join(DATA_DIR,'graph_domain_to_domain.pkl'))
    #weights = np.log(1+np.array(G.es["weight"]))
    weights = np.array(G.es["weight"])
    
    # domain vertices were added after the NUM_ITEMS item vertices, so shift the ids down
    indices = np.array([np.array(e.tuple) for e in G.es]) - NUM_ITEMS
    indices = np.transpose(indices)
    
    """ Create sparse matrix W """
    from scipy.sparse import coo_matrix
    import scipy.sparse
    row = indices[0,:]
    col = indices[1,:]
    
    W = coo_matrix((weights, (row, col)),shape=(NUM_DOMS,NUM_DOMS))
    """ Normalize rows """
    #W = deg_matrix(W,pwr=-1) @ W
    W = W.transpose()
    W = scipy.sparse.csr_matrix(W)
    assert scipy.sparse.issparse(W)
    
            
    
    @tf.function
    def smooth_labels(labels, factor=0.001):
        # smooth the labels
        labels = tf.cast(labels,tf.float32)
        labels *= (1 - factor)
        labels += (factor / tf.cast(tf.shape(labels)[1],tf.float32))
        # return the smoothed labels
        return labels
    @tf.function
    def compute_loss(labels,logits):
        logits = tf.reshape(logits,(-1,NUM_DOMS))
        labels = tf.reshape(labels,(-1,NUM_DOMS))

        #logits = tf.nn.softmax(logits)

        logits = smooth_labels(logits)
        labels = smooth_labels(labels)

        # cross-entropy between the smoothed one-hot targets and the smoothed softmax outputs
        losses = -tf.reduce_sum(labels*tf.math.log(logits),axis=1)
        
        return tf.reduce_mean(losses)
    
    @tf.function
    def evaluate(labels,logits):
        logits = tf.reshape(logits,(-1,NUM_DOMS))
        labels = tf.reshape(labels,(-1,NUM_DOMS))

        #logits = tf.nn.softmax(logits)
        #print(logits)
        
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        
        acc = tf.metrics.categorical_accuracy(labels,logits)
        
        return tf.reduce_mean(acc)
    
    
    
    """
        Read data, yadda yadda
    
    """
    from input.create_ratio import get_ratio
    ratio_df = get_ratio(which='item_id',full=True,alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()
    

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()
    
  
    
    """ Ratio stuff """    
    from input.create_ratio import get_ratio
    category_df = get_ratio(which='category_id',full=True)
    domain_df = get_ratio(which='domain_id', full = True)
    
  
    
    feat_1, feat_2, feat_3 = domain_df['searched'].values, domain_df['bought'].values, domain_df['rat'].values
    
    feat_4, feat_5 = domain_df['out_bought'].values,domain_df['rat2'].values
    
    feat_1_1, feat_2_1, feat_3_1 = category_df['searched'].values, category_df['bought'].values, category_df['rat'].values
    
    
    def standardize(x):
        # min-max scaling to [0, 1] (the epsilon guards against a zero range)
        return (x - np.min(x)) / (np.max(x) + 1e-06 - np.min(x))
    
    feat_1, feat_2, feat_3 = [standardize(x) for x in [feat_1,feat_2,feat_3]]
    
    feat_1_1, feat_2_1, feat_3_1 = [standardize(x) for x in [feat_1_1,feat_2_1,feat_3_1]]
    
    #dom_ratios = np.array([dct_ratio_dom[k] for k in pd.unique(df['domain_id'].values)])
    #dom_ratios = (dom_ratios - np.mean(dom_ratios)) / np.std(dom_ratios)

    
    
    from nn.domain_string_identifier import load_model, predict_model
    domain_prediction_model = load_model()
    def my_generator(mode='train'):
            # 80/20 split of the 413,163 training lines
            if mode == 'train':
                check = lambda x: x <= np.round(413163*0.8).astype(np.int32)
            elif mode == 'val':
                check = lambda x: x > np.round(413163*0.8).astype(np.int32)
            else:
                check = lambda x: True
            DATA_PATH = path.join(DATA_DIR,'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
            print("Reading....")
            with jsonlines.open(DATA_PATH) as reader:
                for line_i, obj in enumerate(reader):
                    if check(line_i):
                        L = []
                        S = []
                        C =[]
                        IDS = []
                        for h in obj['user_history']:
                            if h['event_type'] == 'view':
                                L.append(dct_domain[h['event_info']])
                                C.append(dct_cat[h['event_info']])
                                IDS.append(h['event_info'])
                                
                            elif h['event_type'] == 'search':
                                S.append(h['event_info'])
    
                        
                        
                        
                        L =  f_map_func['domain_id'](L)
                        C =  f_map_func['category_id'](C)
                        
                        df = pd.DataFrame(
                            {"domain_id":L,
                             "feat_1_1":[feat_1_1[C[i]-NUM_ITEMS-NUM_DOMS] for i in range(len(L))],
                             "feat_2_1":[feat_2_1[C[i]-NUM_ITEMS-NUM_DOMS] for i in range(len(L))],
                             "feat_3_1":[feat_3_1[C[i]-NUM_ITEMS-NUM_DOMS] for i in range(len(L))],
                             
                             },
                            index=IDS)
                        
                        
                        df['recency'] = range(len(L))
                        df['freq'] = np.ones((len(L),))
                        df['price'] = [ dct_price[k] for k in IDS]
                        df['item_b'] =[ dct_ratio_item_b[k] for k in IDS]
                        df['item_s'] =[ dct_ratio_item_s[k] for k in IDS]
                        
                        df['condition'] =[dct_condition[k] for k in IDS]
                        df['lan_pt'] = [dct_lan_pt[k] for k in IDS]
                        df['lan_en'] = [dct_lan_en[k] for k in IDS]
                        df['lan_es'] = [dct_lan_es[k] for k in IDS]
                        
                        
                        """ Adjust graph """
                        Y = np.zeros((NUM_DOMS,1)).astype(np.float32)
                        X = np.zeros((NUM_DOMS,55+55)).astype(np.float32)
                        
                        
                        X[:,0] = feat_1
                        X[:,1] = feat_2
                        X[:,2] = feat_3
                        X[:,3] = feat_4

                        i=4
                        for g, df2 in df.groupby(["domain_id"]):
                            i=4
                            v = df2.to_numpy()[:,1:]                            
                            X[g-NUM_ITEMS,i:i+(v.shape[1])] = np.sum(v,axis=0)
                            i += v.shape[1]
                            X[g-NUM_ITEMS,i:i+(v.shape[1])] = np.mean(v,axis=0)
                            i += v.shape[1]
                            X[g-NUM_ITEMS,i:i+(v.shape[1])] = np.nanstd(v,axis=0)
                            i += v.shape[1]
                            X[g-NUM_ITEMS,i:i+(v.shape[1])] = np.max(v,axis=0)
                            i += v.shape[1]

                        

                        if len(S) > 0:
                            s_pred = predict_model(domain_prediction_model,S,return_numeric=True)
                            X[:,i] = np.mean(s_pred,axis=0)
                            X[:,i+1] = np.max(s_pred,axis=0)
                            try:
                                X[:,i+2] = np.nanstd(s_pred,axis=0)
                            except Exception:
                                pass  # leave the std column at zero if nanstd fails
                            i += 3
                        
                        # propagate the first 55 feature columns one hop through the domain graph
                        X[:,55:] = np.reshape(np.asarray(W @ X[:,:55]),(-1,X.shape[1]-55))
                        if not mode == 'test':
                            Y[     f_map_func['domain_id']( [ dct_domain[obj['item_bought']] ] )[0] - NUM_ITEMS,0    ] = 1.0
                        
                        
                        #X[:,:8] = 0

                        # min-max normalize each of the first 55+3 feature columns
                        for i in range(55+3):
                            X[:,i] = (X[:,i] - np.min(X[:,i])) / (1e-06 + np.max(X[:,i]) - np.min(X[:,i]))
                        
                        #X = X -0.5
                        yield X,Y
                    
    """
        Optimize
    """

    BS = 64
    step = 0
    
    def batch_generator(mode, loop =True,batch_size=BS):
        BATCH_X = []
        BATCH_Y = []
        i = 0
        while True:
            for x,y in my_generator(mode):
                
                BATCH_X.append(x[None,:,:])
                BATCH_Y.append(y[None,:,:])
                i+= 1
                if i % batch_size == 0:      
                    yield np.concatenate(BATCH_X,axis=0), np.concatenate(BATCH_Y,axis=0)
                    BATCH_X = []
                    BATCH_Y = []
                    i = 0 
            if loop == False:
                yield np.concatenate(BATCH_X,axis=0), np.concatenate(BATCH_Y,axis=0)
                break
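    # each yielded batch is X: (batch_size, NUM_DOMS, 110) and Y: (batch_size, NUM_DOMS, 1)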
    """
        Define model
    """
    import  tensorflow.keras as keras
    import tensorflow.keras.layers as layers
    inp_x = keras.Input((NUM_DOMS,55+55))
    x = layers.Dense(64,activation='relu')(inp_x)
    x = layers.Dense(64,activation='relu')(x)
    x = layers.Dense(64,activation='relu')(x)
    x = layers.Dense(1)(x)
    x = layers.Flatten()(x)
    x = layers.Softmax(axis=-1)(x)
    
    model = keras.Model(inputs=[inp_x],outputs=[x])
    print(model.summary())
    # note: this schedule is defined but the optimizer below uses a fixed learning rate
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.5*1e-2,
        decay_steps=1000,
        decay_rate=0.9)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1*1e-2)
    
    model_fname = path.join(DATA_DIR,'model',"NEURAL_DOMAIN_PRED.h5")
    model.compile(optimizer=optimizer,loss=compute_loss,metrics=[evaluate])
    from functools import partial
    from input.read_input import TRAIN_LINES
    
    #model.load_weights(path.join(DATA_DIR,"MY_MODEL_2.h5"))
    if not path.isfile(model_fname):
        input("Warning!!! Did not find model weights at {}. Training takes many, many, many hours! (Press ENTER)".format(model_fname))
        
        # fit_generator is deprecated in newer TF; model.fit also accepts generators
        model.fit_generator(batch_generator('train',True),
                  steps_per_epoch=TRAIN_LINES//BS,
                  epochs=5
                  )
        model.save_weights(model_fname)

    else:
        model.load_weights(model_fname)
        print("Testing fit... should be about 0.41 to 0.45")
        model.fit_generator(batch_generator('train',True),
          steps_per_epoch=25,
          epochs=1
          )

    
    
    def predict(mode):
        PREDS = []
        CONFS = []
        NUM_SELECT = 10
        batch_size = 320
        for batch_id, X in enumerate(batch_generator(mode,batch_size=batch_size,loop=False)):
            x = X[0]
            print("Predicting {} - Batch {}".format(mode,batch_id))
            pred = model.predict_on_batch(x)
            if batch_id == 0:
                print(pred)
            PREDS.append(tf.argsort(pred,axis=-1)[:,-NUM_SELECT:])
            CONFS.append(tf.sort(pred,axis=-1)[:,-NUM_SELECT:])
            
        PREDS = np.concatenate(PREDS,axis=0)
        CONFS = np.concatenate(CONFS,axis=0)
        PREDS = np.concatenate([PREDS,CONFS],axis=1)
        cols = ['pred_{}'.format(k) for k in range(NUM_SELECT)] + \
         ['conf_{}'.format(k) for k in range(NUM_SELECT)] 
        fname = path.join(DATA_DIR,'dom_pred_{}.csv'.format(mode))
        pd.DataFrame(PREDS,index=range(PREDS.shape[0]),columns=cols).to_csv(fname)
    
    predict('val')
    predict('test')
    predict('train')
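
A toy check (assuming TensorFlow 2.x) of the smoothed cross-entropy used in compute_loss above: both the one-hot targets and the softmax outputs are smoothed before the cross-entropy is averaged.

import tensorflow as tf

def smooth(t, factor=0.001):
    t = tf.cast(t, tf.float32) * (1 - factor)
    return t + factor / tf.cast(tf.shape(t)[1], tf.float32)

labels = tf.constant([[0., 1., 0.]])    # one-hot target
probs = tf.constant([[0.1, 0.8, 0.1]])  # softmax output
loss = -tf.reduce_sum(smooth(labels) * tf.math.log(smooth(probs)), axis=1)
print(float(tf.reduce_mean(loss)))      # ~0.225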
Example 3
def final_prediction(mode='train', use_graph=True, debug=False):
    """
            Combines all classifiers in a hierarchical manner to create the final predictions.
            
                First, we create many rankings for items seen during the object history, such as ones based on frequency and recency.
            Perhaps the most important rankings are the ones related to the predictions of the RNN and LGB. I have hardcoded some
            coefficients that attained good validation accuracy. The top 10 items are selected.
            
                Then, I use the Neural Domain Classifier's predictions to eliminate items among those 10, specifically ones whose domain
                is very unlikely to be the one. Once again, there is a hardcoded cutoff that may need some tuning if you train the classifier from
                scratch, as it can have a significant effect on the NDCG.

        """
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()

    from input.create_ratio import load_language_df
    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()

    dct_condition = df['condition'].to_dict()

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()
    dct_pid = df['product_id'].to_dict()
    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')

    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_p = ratio_df['popularity'].to_dict()
    """ Most common embeddings. """
    ratio_df = ratio_df.sort_values(['popularity'], ascending=False)
    most_common_emb = get_emb(
        [first_two_words(dct[k]) for k in ratio_df.index[0:100]], [-1] * 100)

    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()

    df['item_popularity'] = [dct_ratio_item_p[k] for k in df.index]

    dct_ratio_cat = get_ratio(which='category_id')
    dct_ratio_item = get_ratio(which='item_id')

    dct_domain_df = {}
    dct_cat_df = {}
    for dom, df2 in df.groupby('domain_id'):
        df2 = df2.sort_values(['item_popularity'],
                              ascending=False)  #.iloc[0:10,:]
        dct_domain_df[dom] = df2

    for cat, df2 in df.groupby('category_id'):
        df2 = df2.sort_values(['item_popularity'],
                              ascending=False)  #.iloc[0:10,:]
        dct_cat_df[cat] = df2

        #print(df2)
    """ 
            RNN stuff.
        """
    from input.rnn_item_ranker import SEQ_LEN, CANDIDATES
    from input.rnn_item_ranker import read_predictions
    rnn_pred = read_predictions(mode)
    assert rnn_pred.shape[1] == 2 * CANDIDATES
    if mode == 'train' or mode == 'val':
        assert rnn_pred.shape[0] == TRAIN_LINES
    """
            LGB stuff
        """

    import lightgbm as lgb

    import joblib  # sklearn.externals.joblib was removed in scikit-learn >= 0.23
    lgbc = joblib.load(path.join(DATA_DIR, 'model', 'lgb.pkl'))
    """
            Graph-related initialization
        """
    graph_fname = path.join(DATA_DIR, 'graph_domain_id.pkl')
    if not path.isfile(graph_fname):
        print("Creating item-to-item graph")
        create_item_graph(mode='train')
    G1 = ig.Graph.Read_Pickle(graph_fname)
    _, f_map_func, r_map_func = get_mappings()

    if mode == 'test':
        DF_DOM_PRED = pd.read_csv(path.join(DATA_DIR, 'domain_pred_test.csv'),
                                  index_col=0)

    else:
        DF_DOM_PRED = pd.concat([
            pd.read_csv(path.join(DATA_DIR, 'domain_pred_train.csv'),
                        index_col=0),
            pd.read_csv(path.join(DATA_DIR, 'domain_pred_val.csv'),
                        index_col=0)
        ],
                                ignore_index=True)
    DF_CONF_PRED = DF_DOM_PRED.loc[:, [
        'conf_{}'.format(i) for i in range(10)[::-1]
    ]]

    DF_DOM_PRED = DF_DOM_PRED.loc[:, [
        'pred_{}'.format(i) for i in range(10)[::-1]
    ]]
    vals = pd.unique(df['domain_id'].values)
    for c in DF_DOM_PRED.columns:
        DF_DOM_PRED[c] = DF_DOM_PRED[c].values.astype(np.int32)
        DF_DOM_PRED[c] = [vals[k] for k in DF_DOM_PRED[c]]
    """
            EMB stuff
        """
    from gcn.domain_string_identifier import predict_model, load_model
    domain_identifier = load_model()

    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        check = lambda x: True

    DATA_PATH = path.join(
        DATA_DIR, 'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
    i = 0

    def rank_to_order(L, rank):
        # stable descending sort of L by rank (mergesort keeps ties in original order)
        assert rank.shape[0] == L.shape[0]
        ids = (-rank).argsort(kind='mergesort')
        return L[ids], rank[ids]

    pred = {}
    res = []
    actual = []
    domain_ids = []
    lgb_acc = 0
    rnn_acc = 0
    counter = 0
    del df
    del df2
    #_scores = np.zeros((10,)).astype(np.float32)
    with jsonlines.open(DATA_PATH) as reader:
        for line_id, obj in enumerate(reader):

            def score(k):
                if k == obj['item_bought']:
                    return 12
                elif dct_domain[k] == dct_domain[obj['item_bought']]:
                    return 1
                else:
                    return 0

            if check(line_id):

                print("Current line {}".format(line_id))
                L = [
                    h['event_info'] for h in obj['user_history']
                    if h['event_type'] == 'view'
                ]
                S = [
                    h['event_info'] for h in obj['user_history']
                    if h['event_type'] == 'search'
                ]

                L_k = pd.unique(L[::-1])[::-1]  # unique items, keeping each item's last occurrence, in order
                """
                        Calculate ranks
                    """

                if len(L_k) > 0:

                    rank_ratio_dom = pd.Series([
                        dct_ratio_dom[dct_domain[k]] for k in L_k
                    ]).rank(method="average").to_numpy()
                    rank_ratio_cat = pd.Series([
                        dct_ratio_cat[dct_cat[k]] for k in L_k
                    ]).rank(method="average").to_numpy()
                    rank_ratio_item = pd.Series([
                        dct_ratio_item_p[k] for k in L_k
                    ]).rank(method="average").to_numpy()

                    rank_freq = pd.Series(L, index=range(
                        len(L))).value_counts(sort=False)
                    rank_freq = rank_freq.rank(method="average").to_dict()
                    rank_freq = np.array([rank_freq[k] for k in L_k])
                    rank_latest = np.arange(len(L_k))
                    rank_price = pd.Series(
                        [-dct_price[k]
                         for k in L_k]).rank(method="average").to_numpy()

                    vals = DF_DOM_PRED.iloc[line_id, :].values
                    RANK_DOM = [
                        np.where(vals == dct_domain[k])[0] for k in L_k
                    ]
                    RANK_DOM = [
                        vals.shape[0] - k[0] if len(k) > 0 else 0
                        for k in RANK_DOM
                    ]
                    RANK_DOM = pd.Series(RANK_DOM).rank(
                        method="average").to_numpy()

                    from input.rnn_item_ranker import SEQ_LEN, CANDIDATES
                    dct_rnn = dict([
                        (int(x), y)
                        for x, y in zip(rnn_pred.iloc[line_id, 0:CANDIDATES],
                                        rnn_pred.iloc[line_id, -CANDIDATES:])
                    ])

                    if len(L_k) <= CANDIDATES:
                        try:
                            rank_ratio_rnn = pd.Series([
                                dct_rnn[k] for k in L_k
                            ]).rank(method="average").to_numpy()
                        except KeyError:
                            print(L_k)
                            print(rnn_pred.iloc[(line_id - 5):(line_id + 10), :])
                            raise ValueError(
                                "Did not find keys in RNN prediction")
                    else:
                        rank_ratio_rnn = pd.Series(
                            [1.0
                             for k in L_k]).rank(method="average").to_numpy()

                    x = []
                    x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
                    x.append(rank_ratio_dom)
                    x.append(rank_ratio_cat)
                    x.append(rank_price)

                    x.append([dct_ratio_item_b[k] for k in L_k])

                    x.append([dct_ratio_cat[dct_cat[k]] for k in L_k])
                    x.append([dct_ratio_item_b[k] for k in L_k])
                    x.append([dct_ratio_item_s[k] for k in L_k])
                    x.append([dct_ratio_item_r[k] for k in L_k])
                    x.append(list(rank_latest / len(L_k)))
                    x.append([-dct_price[k] for k in L_k])

                    x.append([-dct_condition[k] for k in L_k])
                    x.append([-dct_lan_en[k] for k in L_k])
                    x.append([-dct_lan_es[k] for k in L_k])
                    x.append([-dct_lan_pt[k] for k in L_k])

                    x = np.transpose(np.reshape(np.array(x), (-1, len(L_k))))

                    rank_lgb = pd.Series(
                        lgbc.predict(x)).rank(method="average").to_numpy()

                    if mode != 'test' and obj['item_bought'] in L_k and len(L_k) <= CANDIDATES:
                        if L_k[np.argmax(rank_lgb)] == obj['item_bought']:
                            lgb_acc += 1
                        if L_k[np.argmax(
                                rank_ratio_rnn)] == obj['item_bought']:
                            rnn_acc += 1
                        counter += 1

                    COEFFS = [1.5, 1.5, 4.5, 0.4, 0.4, 0.6, 0.8, 0.0]
                    COEFFS = np.array(COEFFS) / np.sum(COEFFS)
                    final_rank =       COEFFS[0]*rank_freq + \
                                       COEFFS[1]*(rank_lgb) + \
                                       COEFFS[2]*(rank_ratio_rnn) + \
                                       COEFFS[3]*(rank_ratio_dom) +\
                                       COEFFS[4]*(rank_ratio_cat)+\
                                       COEFFS[5]*(rank_ratio_item)+\
                                       COEFFS[6]*(rank_latest)+\
                                       COEFFS[7]*(rank_price)
                    """
                            Yield rank
                        """
                    L, L_ranks = rank_to_order(L_k, final_rank)
                    #L = L[rank_freq.argsort(kind='mergesort')]

                    #L = np.array([d for d in L if (dct_ratio_dom[dct_domain[d]] > 0.01 and dct_ratio_cat[dct_cat[d]] > 0.01)])
                    #DF_DOM_PRED.iloc[line_id,:] = DF_DOM_PRED.iloc[line_id,:]/np.max(DF_DOM_PRED.iloc[line_id,:])
                    #print(DF_CONF_PRED.iloc[line_id,:])
                    #print(DF_CONF_PRED.iloc[line_id,:] > 0.001)
                    #print(np.where(DF_CONF_PRED.iloc[line_id,:] > 0.001)[0])

                    b = np.where(DF_CONF_PRED.iloc[line_id, :] > 0)[0]
                    vals = DF_DOM_PRED.iloc[line_id, :].values[b]
                    L = np.array([k for k in L if dct_domain[k] in vals])

                    L = np.array([k for k in L if dct_rnn.get(k, 1) > 1e-02])
                    L = L[:10]

                    P = np.zeros((10, ), dtype=np.int32)
                    P[0:L.shape[0]] = L
                else:
                    P = np.zeros((10, ), dtype=np.int32)
                    L = np.array(L)

                TEMP_MAX = 101
                if len(obj['user_history']) > 0:
                    temp = []
                    doms = [dct_domain[k] for k in L]
                    if len(L) > 0:
                        score_en = np.nanmean([dct_lan_en[k] for k in L])
                        score_es = np.nanmean([dct_lan_es[k] for k in L])
                        score_pt = np.nanmean([dct_lan_pt[k] for k in L])
                    else:
                        score_en, score_es, score_pt = 0, 0, 0

                    b = np.where(DF_CONF_PRED.iloc[line_id, :] > 1e-05)[0]
                    doms = DF_DOM_PRED.iloc[line_id, :].values[b]

                    cats = [
                        x[1]
                        for x in sorted([(-dct_ratio_cat[k], str(k))
                                         for k in [dct_cat[k] for k in L]])
                    ]
                    cat_rating = dict([(k, -dct_ratio_cat[k]) for k in cats])

                    if use_graph:
                        roots = pd.unique([k for k in L])
                        roots = f_map_func['item_id'](roots)
                    for dom in doms:

                        if use_graph and len(roots) > 0:
                            c_score = {}
                            candidates = []
                            for k in roots:
                                source_vert = G1.vs[k]
                                es = G1.incident(source_vert, mode='OUT')
                                es = G1.es[es]
                                vs = [e.target for e in es]

                                N = len(vs)
                                vs = G1.vs[vs].select(domain_id=dom)
                                vs = [v['value'] for v in vs]
                                candidates.extend(vs)
                            if len(candidates) > 0:
                                candidates = pd.Series([k for k in candidates
                                                        ]).value_counts()
                                candidates = candidates[candidates.values > 1]
                                _temp = [
                                    k for k in list(candidates.index)
                                    if not k in temp
                                ]
                                temp.extend(_temp)

                        if dom in dct_domain_df.keys():
                            if len(temp) > 40:
                                break
                            x = dct_domain_df[dom].index[0:TEMP_MAX]
                            """
                                    Here we try to restrict to items in the same language. This had minimal effect on the NDCG.
                                """
                            if score_pt - score_es > 0.4:
                                x = [
                                    k for k in x
                                    if score_pt - dct_lan_pt[k] < 0.2
                                ]
                            elif score_es - score_pt > 0.4:
                                x = [
                                    k for k in x
                                    if score_es - dct_lan_es[k] < 0.2
                                ]

                            x = sorted(x,
                                       key=lambda k: cat_rating[dct_cat[k]]
                                       if dct_cat[k] in cats else 0)
                            temp.extend(x)

                    ##############################################################
                    """ Add more items if there aren't enough"""
                    temp = temp[0:TEMP_MAX]
                    temp = [k for k in temp if k not in L]

                    x = 0
                    while len(pd.unique(temp)) < 10:
                        if isinstance(DF_DOM_PRED.iloc[line_id, x], str):
                            temp.extend(
                                dct_domain_df[DF_DOM_PRED.iloc[line_id,
                                                               x]].index[0:10])
                        x += 1

                    temp = [k for k in temp if k not in L]

                    temp = pd.unique(temp)

                    ########################################################
                    """ Finally, add the ranked items to our prediction. """
                    P[L.shape[0]:] = temp[:(10 - L.shape[0])]

                else:
                    """ Special case for an empty search and item history """
                    temp = []
                    x = 0
                    while len(pd.unique(temp)) < 10:
                        if isinstance(DF_DOM_PRED.iloc[line_id, x], str):
                            temp.extend(
                                dct_domain_df[DF_DOM_PRED.iloc[line_id,
                                                               x]].index[0:10])
                        x += 1

                    temp = [k for k in temp if k not in L]

                    temp = pd.unique(temp)
                    # fill the prediction from the fallback items, mirroring the branch above
                    P[L.shape[0]:] = temp[:(10 - L.shape[0])]
                """
                        Set prediction
                    """
                pred[line_id] = P

                actual.append(obj.get('item_bought', 0))
                if len(actual) > 10000 and debug:
                    #print(lgb_acc/counter,rnn_acc/counter)
                    break

                #print("Item bought: {}".format(dct[obj['item_bought']]))
            #L.append(obj)
    """
            Now we calculate NDCG and save our prediction DataFrame.
        """
    if mode == 'test':
        pred = np.reshape(np.asarray(list(pred.values())), (-1, 10))
        OUT_PATH = path.join(SUBMISSIONS_DIR, 'submission.csv')
        out_df = pd.DataFrame(data=pred,
                              index=range(pred.shape[0]),
                              columns=range(pred.shape[1]))
        out_df.to_csv(OUT_PATH, index=False, header=False)
    else:
        pred = np.reshape(np.asarray(list(pred.values())), (-1, 10))
        print(pred)
        actual = np.asarray(actual)
        res = ndcg(pred, actual)
        print("Number of objects: {}".format(pred.shape[0]))
        print(COEFFS)
        print("NDCG: {}".format(res))
        return -res
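
A toy illustration (hypothetical values, not the author's data) of the rank blending above: each signal becomes an average rank, the hardcoded coefficients weight their sum, and a stable sort orders the items, as in rank_to_order.

import numpy as np
import pandas as pd

items = np.array([101, 202, 303])
rank_freq = pd.Series([3, 1, 2]).rank(method="average").to_numpy()      # view counts
rank_latest = np.arange(len(items)).astype(float)                       # recency
rank_lgb = pd.Series([0.2, 0.9, 0.5]).rank(method="average").to_numpy() # LGB scores

coeffs = np.array([1.5, 1.5, 4.5])
coeffs = coeffs / np.sum(coeffs)
blended = coeffs[0]*rank_freq + coeffs[1]*rank_latest + coeffs[2]*rank_lgb
print(items[(-blended).argsort(kind='mergesort')])                      # [202 303 101]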
Example 4
def create_item_graph(mode='train'):
    """
        Creates graph, whose vertices correspond to items. 
        For each purchase, an edge is added from each searched item to the one that was bought. 
        Edges may be repeated.
    """
    """
        Fetch data
    """
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()
    df['item_id'] = df.index
    dct_title = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_price = df['price'].to_dict()
    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')

    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()
    """
        JSON
    
    """
    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        check = lambda x: True

    DATA_PATH = path.join(
        DATA_DIR, 'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
    line_i = 0
    """
        Create graph vertices
    """
    g = ig.Graph()

    counter, f_map_func, r_map_func = get_mappings()

    for k in dct_title.keys():
        g.add_vertex(value=k,
                     deg=dct_ratio_item_b[k],
                     domain_id=dct_domain[k],
                     price=dct_price[k],
                     cat='item_id')
    """ ['item_id','domain_id','category_id','product_id'] """

    for k in pd.unique(df['domain_id']):
        g.add_vertex(value=k, cat='domain_id')

    for k in pd.unique(df['category_id']):
        g.add_vertex(value=k, cat='category_id')

    for k in pd.unique(df['product_id']):
        g.add_vertex(value=k, cat='product_id')
    """
        Create edges
    """
    E1 = []
    E2 = []

    with jsonlines.open(DATA_PATH) as reader:
        for line_i, obj in enumerate(reader):
            if check(line_i):
                print(line_i)
                L = []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        #print("Viewed {}".format(dct[h['event_info']]))
                        L.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        #print("Searched {}".format(h['event_info']))
                        pass
                L = pd.unique(L)
                #L_domain = [dct_domain[k] for k in L]
                for i in range(len(L)):
                    E1.append(L[i])
                    E2.append(obj['item_bought'])

    E1 = f_map_func['item_id'](E1)
    E2 = f_map_func['item_id'](E2)

    E = list(zip(E1, E2))
    g.add_edges(E)

    #g  = g.as_undirected()

    g.write_pickle(fname=path.join(DATA_DIR, 'graph_domain_id.pkl'))
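
A minimal sketch of how final_prediction consumes this graph (the item id and domain string below are hypothetical; f_map_func comes from get_mappings()): from a viewed item's vertex, follow outgoing edges and keep the targets belonging to one candidate domain.

import igraph as ig

G1 = ig.Graph.Read_Pickle(path.join(DATA_DIR, 'graph_domain_id.pkl'))
_, f_map_func, _ = get_mappings()
root = f_map_func['item_id']([12345])[0]            # 12345 is a hypothetical item id
es = G1.es[G1.incident(G1.vs[root], mode='OUT')]
vs = G1.vs[[e.target for e in es]].select(domain_id='MLB-CELLPHONES')  # hypothetical domain
print([v['value'] for v in vs])                     # items bought after viewing 12345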
Example 5
def get_lgb_data(avoid_overfit=True):
    """
        Gets all the features necessary to train the LGB ranker, arranging them into matrices.
        Args:
            avoid_overfit (bool): If ``True``, avoid overfitting by decreasing the item/domain/category bought/searched count
            for the elements from the history of a given purchase. Default is ``True``.
        
        Returns: List with size 3:
            X (NDArray[float].shape[N,D]): Features
            Y (NDArray[float].shape[N,1]): Labels
            M (NDArray[float].shape[N]): Indicator variable (1 if train, 0 if validation)
            
        
    """
    from input.create_ratio import load_language_df
    mode = 'train'
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()

    dct_condition = df['condition'].to_dict()

    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()
    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')

    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_p = ratio_df['popularity'].to_dict()

    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()

    df['item_bought'] = [dct_ratio_item_b[k] for k in df.index]

    dct_ratio_cat = get_ratio(which='category_id', full=True)
    dct_ratio_cat_s, dct_ratio_cat_b, dct_ratio_cat = (dct_ratio_cat['searched'].to_dict(),
                                                       dct_ratio_cat['bought'].to_dict(),
                                                       dct_ratio_cat['rat'].to_dict())

    dct_ratio_dom = get_ratio(which='domain_id', full=True)
    dct_ratio_dom_s, dct_ratio_dom_b, dct_ratio_dom = (dct_ratio_dom['searched'].to_dict(),
                                                       dct_ratio_dom['bought'].to_dict(),
                                                       dct_ratio_dom['rat'].to_dict())

    dct_ratio_item = get_ratio(which='item_id')

    dct_domain_df = {}
    dct_cat_df = {}
    for dom, df2 in df.groupby('domain_id'):
        df2 = df2.sort_values(['item_bought'], ascending=False)  #.iloc[0:10,:]
        dct_domain_df[dom] = df2

    for cat, df2 in df.groupby('category_id'):
        df2 = df2.sort_values(['item_bought'], ascending=False)  #.iloc[0:10,:]
        dct_cat_df[cat] = df2

        #print(df2)
    """ 
        RNN stuff.
    """
    from input.rnn_item_ranker import read_predictions
    rnn_pred = read_predictions(mode)
    #assert rnn_pred.shape[0] == TRAIN_LINES

    DATA_PATH = path.join(
        DATA_DIR, 'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
    i = 0

    def rank_to_order(L, rank):
        assert rank.shape[0] == L.shape[0]
        ids = (-rank).argsort(kind='mergesort')
        return L[ids], rank[ids]

    pred = {}

    actual = []
    domain_ids = []

    X = []
    Y = []
    M = []
    with jsonlines.open(DATA_PATH) as reader:
        for line_id, obj in enumerate(reader):
            if True:
                print(line_id)
                L = [
                    h['event_info'] for h in obj['user_history']
                    if h['event_type'] == 'view'
                ]
                S = [
                    h['event_info'] for h in obj['user_history']
                    if h['event_type'] == 'search'
                ]

                L_k = pd.unique(L[::-1])[::-1]
                """
                    OVERFITTING AVOIDANCE
                
                """
                if avoid_overfit:

                    if line_id <= 330530:  # = round(413163 * 0.8), the train/validation split point
                        target_item = obj['item_bought']
                        target_dom = dct_domain[obj['item_bought']]
                        target_cat = dct_cat[obj['item_bought']]
                        for this_item in L_k:
                            """ Bought """
                            if this_item == target_item:
                                assert dct_ratio_item_b[this_item] > 0
                                dct_ratio_item_b[this_item] -= 1
                            """ Search """
                            dct_ratio_item_s[this_item] -= 1
                            assert dct_ratio_item_s[this_item] >= 0
                            """ Ratio """
                            dct_ratio_item_r[this_item] = dct_ratio_item_b[
                                this_item] / (dct_ratio_item_s[this_item] + 1)
                        for this_dom in pd.unique([dct_domain[k]
                                                   for k in L_k]):
                            if not isinstance(this_dom, str):
                                continue
                            """ Bought """
                            if this_dom == target_dom:
                                assert dct_ratio_dom_b[this_dom] > 0
                                dct_ratio_dom_b[this_dom] -= 1
                            """ Search """
                            dct_ratio_dom_s[this_dom] -= 1
                            assert dct_ratio_dom_s[this_dom] >= 0
                            """ Ratio """
                            dct_ratio_dom[this_dom] = dct_ratio_dom_b[
                                this_dom] / (dct_ratio_dom_s[this_dom] + 1)
                        for this_cat in pd.unique([dct_cat[k] for k in L_k]):
                            """ Bought """
                            if this_cat == target_cat:
                                assert dct_ratio_cat_b[this_cat] > 0
                                dct_ratio_cat_b[this_cat] -= 1
                            """ Search """
                            dct_ratio_cat_s[this_cat] -= 1
                            assert dct_ratio_cat_s[this_cat] >= 0
                            """ Ratio """
                            dct_ratio_cat[this_cat] = dct_ratio_cat_b[
                                this_cat] / (dct_ratio_cat_s[this_cat] + 1)
                """
                    Calculate ranks
                """

                # RNN predictions for this line: candidate ids in the first columns, scores in the last
                dct_rnn = dict([
                    (int(x), y)
                    for x, y in zip(rnn_pred.iloc[line_id, 0:10],
                                    rnn_pred.iloc[line_id, -10:])
                ])
                if len(L_k) <= 10:
                    rank_ratio_rnn = pd.Series([
                        dct_rnn.get(k, 0) for k in L_k
                    ]).rank(method="average").to_numpy()
                else:
                    rank_ratio_rnn = pd.Series(
                        [1.0 for k in L_k]).rank(method="average").to_numpy()

                rank_ratio_dom = pd.Series([
                    dct_ratio_dom[dct_domain[k]] for k in L_k
                ]).rank(method="average").to_numpy()
                rank_ratio_cat = pd.Series([
                    dct_ratio_cat[dct_cat[k]] for k in L_k
                ]).rank(method="average").to_numpy()
                rank_ratio_item = pd.Series(
                    [dct_ratio_item_p[k]
                     for k in L_k]).rank(method="average").to_numpy()

                rank_freq = pd.Series(L, index=range(
                    len(L))).value_counts(sort=False)
                rank_freq = rank_freq.rank(method="average").to_dict()
                rank_freq = np.array([rank_freq[k] for k in L_k])
                rank_latest = np.arange(len(L_k))
                rank_price = pd.Series([-dct_price[k] for k in L_k
                                        ]).rank(method="average").to_numpy()

                x = []
                x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
                x.append(rank_ratio_dom)
                x.append(rank_ratio_cat)
                x.append(rank_price)

                x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])

                x.append([dct_ratio_cat[dct_cat[k]] for k in L_k])
                x.append([dct_ratio_item_b[k] for k in L_k])
                x.append([dct_ratio_item_s[k] for k in L_k])
                x.append([dct_ratio_item_r[k] for k in L_k])
                x.append(list(rank_latest / len(L_k)))
                x.append([-dct_price[k] for k in L_k])

                x.append([-dct_condition[k] for k in L_k])
                x.append([-dct_lan_en[k] for k in L_k])
                x.append([-dct_lan_es[k] for k in L_k])
                x.append([-dct_lan_pt[k] for k in L_k])
                """
                    Overfitting avoidance - pt 2
                """
                if line_id <= 330530:  # same train/validation split point as above
                    target_item = obj['item_bought']
                    target_dom = dct_domain[obj['item_bought']]
                    target_cat = dct_cat[obj['item_bought']]
                    for this_item in L_k:
                        """ Bought """
                        if this_item == target_item:
                            #assert dct_ratio_item_b[this_item] >= 0
                            dct_ratio_item_b[this_item] += 1
                        """ Search """
                        #assert dct_ratio_item_s[this_item] >= 0
                        dct_ratio_item_s[this_item] += 1
                        """ Ratio """
                        dct_ratio_item_r[this_item] = dct_ratio_item_b[
                            this_item] / (dct_ratio_item_s[this_item] + 1)
                    for this_dom in pd.unique([dct_domain[k] for k in L_k]):
                        if not isinstance(this_dom, str):
                            continue
                        """ Bought """
                        if this_dom == target_dom:
                            #assert dct_ratio_dom_b[this_dom] >= 0
                            dct_ratio_dom_b[this_dom] += 1
                        """ Search """
                        #assert dct_ratio_dom_s[this_dom] >= 0
                        dct_ratio_dom_s[this_dom] += 1
                        """ Ratio """
                        dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (
                            dct_ratio_dom_s[this_dom] + 1)
                    for this_cat in pd.unique([dct_cat[k] for k in L_k]):
                        """ Bought """
                        if this_cat == target_cat:
                            #assert dct_ratio_cat_b[this_cat] >= 0
                            dct_ratio_cat_b[this_cat] += 1
                        """ Search """
                        #assert dct_ratio_cat_s[this_cat] >= 0
                        dct_ratio_cat_s[this_cat] += 1
                        """ Ratio """
                        dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (
                            dct_ratio_cat_s[this_cat] + 1)

                if len(L_k) == 0:
                    continue
                x = np.transpose(np.reshape(np.array(x), (-1, len(L_k))))

                def score(k):
                    if k == obj['item_bought']:
                        return 2
                    elif dct_domain[k] == dct_domain[obj['item_bought']]:
                        return 1
                    else:
                        return 0

                y = np.array([score(k) for k in L_k])[:, None]
                #print(y.shape)
                if np.sum(y) >= 0:  # always true, since scores are non-negative
                    X.append(x)
                    Y.append(y)
                    M.append(np.array([line_id] * len(L_k)))

    X = np.concatenate(X, axis=0)
    Y = np.concatenate(Y, axis=0)
    M = np.concatenate(M)
    return X, Y, M
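
A minimal sketch of feeding these matrices to the LGB ranker that final_prediction loads as lgb.pkl (the regressor objective and hyperparameters are assumptions, not the author's exact setup; 330530 mirrors the split used above).

import joblib
import lightgbm as lgb

X, Y, M = get_lgb_data(avoid_overfit=True)
train = M <= 330530                              # same 80/20 line split as above
lgbc = lgb.LGBMRegressor(n_estimators=200)
lgbc.fit(X[train], Y[train].ravel())
print(lgbc.score(X[~train], Y[~train].ravel()))  # quick validation check
joblib.dump(lgbc, path.join(DATA_DIR, 'model', 'lgb.pkl'))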
Example 6
def meli_iterator(mode='train', batch_size=BATCH_SIZE, full=False):
    from input.read_input import get_sentence_model, get_emb

    from input.create_ratio import load_language_df
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()

    dct_condition = df['condition'].to_dict()

    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()
    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')

    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_p = ratio_df['popularity'].to_dict()

    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()

    df['item_bought'] = [dct_ratio_item_b[k] for k in df.index]

    dct_ratio_cat = get_ratio(which='category_id', full=True)
    dct_ratio_cat_s, dct_ratio_cat_b, dct_ratio_cat = (dct_ratio_cat['searched'].to_dict(),
                                                       dct_ratio_cat['bought'].to_dict(),
                                                       dct_ratio_cat['rat'].to_dict())

    dct_ratio_dom = get_ratio(which='domain_id', full=True)
    dct_ratio_dom_s, dct_ratio_dom_b, dct_ratio_dom = (dct_ratio_dom['searched'].to_dict(),
                                                       dct_ratio_dom['bought'].to_dict(),
                                                       dct_ratio_dom['rat'].to_dict())

    dct_ratio_item = get_ratio(which='item_id')

    dct_domain_df = {}
    dct_cat_df = {}
    for dom, df2 in df.groupby('domain_id'):
        df2 = df2.sort_values(['item_bought'], ascending=False)  #.iloc[0:10,:]
        dct_domain_df[dom] = df2

    for cat, df2 in df.groupby('category_id'):
        df2 = df2.sort_values(['item_bought'], ascending=False)  #.iloc[0:10,:]
        dct_cat_df[cat] = df2
    del df
    del df2

    def _begin_overfit_avoid(L_k):
        # `obj` is the current purchase from the reader loop below (closure)
        if mode != 'train':
            return
        target_item = obj['item_bought']
        target_dom = dct_domain[obj['item_bought']]
        target_cat = dct_cat[obj['item_bought']]
        for this_item in L_k:
            """ Bought """
            if this_item == target_item:
                assert dct_ratio_item_b[this_item] > 0
                dct_ratio_item_b[this_item] -= 1
            """ Search """
            dct_ratio_item_s[this_item] -= 1
            assert dct_ratio_item_s[this_item] >= 0
            """ Ratio """
            dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (
                dct_ratio_item_s[this_item] + 1)
        for this_dom in pd.unique([dct_domain[k] for k in L_k]):
            if not isinstance(this_dom, str):
                continue
            """ Bought """
            if this_dom == target_dom:
                assert dct_ratio_dom_b[this_dom] > 0
                dct_ratio_dom_b[this_dom] -= 1
            """ Search """
            dct_ratio_dom_s[this_dom] -= 1
            assert dct_ratio_dom_s[this_dom] >= 0
            """ Ratio """
            dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (
                dct_ratio_dom_s[this_dom] + 1)
        for this_cat in pd.unique([dct_cat[k] for k in L_k]):
            """ Bought """
            if this_cat == target_cat:
                assert dct_ratio_cat_b[this_cat] > 0
                dct_ratio_cat_b[this_cat] -= 1
            """ Search """
            dct_ratio_cat_s[this_cat] -= 1
            assert dct_ratio_cat_s[this_cat] >= 0
            """ Ratio """
            dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (
                dct_ratio_cat_s[this_cat] + 1)

    def _end_overfit_avoid(L_k):
        if mode != 'train':
            return
        target_item = obj['item_bought']
        target_dom = dct_domain[obj['item_bought']]
        target_cat = dct_cat[obj['item_bought']]
        for this_item in L_k:
            """ Bought """
            if this_item == target_item:
                #assert dct_ratio_item_b[this_item] >= 0
                dct_ratio_item_b[this_item] += 1
            """ Search """
            #assert dct_ratio_item_s[this_item] >= 0
            dct_ratio_item_s[this_item] += 1
            """ Ratio """
            dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (
                dct_ratio_item_s[this_item] + 1)
        for this_dom in pd.unique([dct_domain[k] for k in L_k]):
            if not isinstance(this_dom, str):
                continue
            """ Bought """
            if this_dom == target_dom:
                #assert dct_ratio_dom_b[this_dom] >= 0
                dct_ratio_dom_b[this_dom] += 1
            """ Search """
            #assert dct_ratio_dom_s[this_dom] >= 0
            dct_ratio_dom_s[this_dom] += 1
            """ Ratio """
            dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (
                dct_ratio_dom_s[this_dom] + 1)
        for this_cat in pd.unique([dct_cat[k] for k in L_k]):
            """ Bought """
            if this_cat == target_cat:
                #assert dct_ratio_cat_b[this_cat] >= 0
                dct_ratio_cat_b[this_cat] += 1
            """ Search """
            #assert dct_ratio_cat_s[this_cat] >= 0
            dct_ratio_cat_s[this_cat] += 1
            """ Ratio """
            dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (
                dct_ratio_cat_s[this_cat] + 1)

    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        check = lambda x: True

    DATA_PATH = path.join(
        DATA_DIR, 'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')

    def rank_to_order(L, rank):
        assert rank.shape[0] == L.shape[0]
        return L[(-rank).argsort(kind='mergesort')]

    pred = {}

    actual = []

    X = []
    Y = []
    MASK = []
    LKS = []
    ACTUAL = []
    while True:
        with jsonlines.open(DATA_PATH) as reader:
            print("Start!!!")
            for line_id, obj in enumerate(reader):
                if check(line_id):
                    #print(i)
                    L = []
                    timestamps = []
                    dct_emb = {}
                    if mode == 'test':
                        obj['item_bought'] = -999
                    for h in obj['user_history']:
                        if h['event_type'] == 'view':
                            L.append(h['event_info'])
                            timestamps.append(
                                pd.Timestamp(h['event_timestamp']))
                        elif h['event_type'] == 'search':
                            pass

                    def divide_time(d):
                        # decompose a timedelta into (whole days, hours/24, minutes/60)
                        d = pd.Timedelta(d).total_seconds()
                        MINUTE_M = 60
                        HOUR_M = MINUTE_M * 60
                        DAY_M = HOUR_M * 24

                        div = [1, 24, 60]
                        res = [0, 0, 0]
                        for i, M in enumerate([DAY_M, HOUR_M, MINUTE_M]):
                            res[i] = np.floor(d / M)
                            d -= M * res[i]
                            res[i] /= div[i]
                            #res[i] -= 0.5

                        return tuple(res)
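                    # e.g. a delta of 1 day 2 hours -> (1.0, 2/24, 0.0):
                    # whole days, then remaining hours/24, then remaining minutes/60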

                    if not full and len(L) < 2:
                        continue
                    """ Create attributes """
                    if len(L) == 0:
                        attrs = np.zeros(
                            (1, (CANDIDATES + 1) + ATTR_SIZE + EMB_SIZE))
                        targets = np.zeros((1, (CANDIDATES + 1)))
                        targets[0, -1] = 0
                        L_k = []
                    else:
                        delta = [
                            timestamps[-1] - timestamps[i]
                            for i in range(0, len(timestamps))
                        ]
                        """
                            We'll use the latest delta
                        """
                        L = L[::-1]
                        # np.unique returns values in sorted order; unique_id
                        # (first-occurrence positions in the reversed list) is
                        # applied to deltas and L alike, keeping them aligned
                        u, unique_id = np.unique(np.array(L),
                                                 return_index=True)

                        #delta_day, delta_hour, delta_minute = zip(*[divide_time(d) for d in delta])
                        deltas = np.array([divide_time(d) for d in delta])
                        deltas = deltas[unique_id][:SEQ_LEN]

                        L_k = np.array(L)[unique_id][:CANDIDATES]
                        _begin_overfit_avoid(L_k)
                        """
                            rank_freq initial calculation needs whole L
                        """
                        rank_freq = pd.Series(L, index=range(
                            len(L))).value_counts(sort=False, normalize=True)
                        rank_freq = rank_freq.rank(method="average").to_dict()
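                        # e.g. L = [a, a, b] -> normalized counts {a: 2/3,
                        # b: 1/3} -> ranks {a: 2.0, b: 1.0}; more-viewed
                        # items get the higher rank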

                        L = np.array(L)[unique_id][:SEQ_LEN]
                        """
                            Calculate ranks
                        """
                        condition = np.array([
                            1.0 if dct_condition[k] == 'new' else 0.0
                            for k in L
                        ])[:, None]

                        #ratio_dom = np.array([dct_ratio_dom[dct_domain[k]] for k in L])[:,None]
                        #ratio_cat = np.array([dct_ratio_cat[dct_cat[k]] for k in L])[:,None]
                        #ratio_item = np.array([dct_ratio_item[k] for k in L])[:,None]
                        price = np.log(
                            np.array([
                                1 + np.abs(fix_na(dct_price[k])) for k in L
                            ])[:, None])
                        rank_freq = np.array([rank_freq[k] for k in L])[:,
                                                                        None]
                        #rank_latest = (1.0 - np.arange(len(L))/len(L))[:,None]

                        rank_ratio_dom = pd.Series([
                            dct_ratio_dom[dct_domain[k]] for k in L_k
                        ]).rank(method="average").to_numpy()
                        rank_ratio_cat = pd.Series([
                            dct_ratio_cat[dct_cat[k]] for k in L_k
                        ]).rank(method="average").to_numpy()
                        rank_ratio_item = pd.Series([
                            dct_ratio_item_r[k] for k in L_k
                        ]).rank(method="average").to_numpy()
                        rank_latest = (1.0 - np.arange(len(L)) / len(L))

                        x = []
                        x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
                        x.append(rank_ratio_dom)
                        x.append(rank_ratio_cat)
                        x.append(rank_ratio_item)

                        # note: repeats the domain-ratio feature appended above
                        x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])

                        x.append([dct_ratio_cat[dct_cat[k]] for k in L_k])
                        x.append([dct_ratio_item_b[k] for k in L_k])
                        x.append([dct_ratio_item_s[k] for k in L_k])
                        x.append([dct_ratio_item_r[k] for k in L_k])
                        x.append(list(rank_latest / len(L_k)))
                        x.append([-fix_na(dct_price[k]) for k in L_k])  # fix_na guards missing prices, as above

                        # dct_condition holds strings ('new'/'used'); encode
                        # numerically before negating, as for `condition` above
                        x.append([-1.0 if dct_condition[k] == 'new' else 0.0
                                  for k in L_k])
                        x.append([-dct_lan_en[k] for k in L_k])
                        x.append([-dct_lan_es[k] for k in L_k])
                        x.append([-dct_lan_pt[k] for k in L_k])
                        """
                            
                        """
                        #true_val = (np.array([str(dct_domain[k]) for k in L]) == dct_domain[obj['item_bought']])
                        #true_val = np.logical_and(true_val,[k in L_k for k in L])
                        #true_val = true_val[:,None]
                        #true_val = np.ones_like(true_val)
                        #true_val = np.random.rand(*(true_val.shape))

                        assert all([k in L for k in L_k])

                        ids = [
                            np.where(
                                L_k == l)[0][0] if l in L_k else CANDIDATES
                            for l in L
                        ]
                        ids_onehot = np.zeros((len(L), (CANDIDATES + 1)))
                        ids_onehot[np.arange(len(L)), ids] = 1
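                        # each viewed position points at its candidate slot;
                        # anything outside L_k falls into the extra slot at
                        # index CANDIDATES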
                        #ids_onehot = ids_onehot[:,0:10]
                        """
                            Create numeric attributes plus embeddings
                        """

                        attr_list = [
                            ids_onehot, deltas, condition, price, rank_freq
                        ] + [np.array(_x)[:, None] for _x in x]
                        if USE_EMB:
                            emb = predict_model(
                                get_sentence_model(),
                                query_list=[dct[k] for k in L_k],
                                return_emb=True)
                            emb = np.reshape(emb[:, 0:(EMB_SIZE // 512), :],
                                             (emb.shape[0], EMB_SIZE))
                            attr_list.append(emb)

                        attrs = np.concatenate(attr_list, axis=1)
                        """ Create targets """
                        if mode == 'test':
                            targets = np.zeros((1, (CANDIDATES + 1)))
                        else:
                            _b1 = (np.array(list(L_k == obj['item_bought'])))
                            _b2 = (np.array(
                                list([str(dct_domain[k]) for k in L_k
                                      ])) == dct_domain[obj['item_bought']])

                            targets = _b1.astype(np.float32) * 1.0  #+ _b2.astype(np.float32)*0.0
                            if np.sum(targets) == 0:
                                """ Bought item is not among the candidates:
                                    all mass goes to the extra fallback slot """
                                targets = np.zeros((1, (CANDIDATES + 1)))
                                targets[0, -1] = 1
                                if not full:
                                    _end_overfit_avoid(L_k)
                                    continue
                            else:
                                targets = np.array(targets) / np.sum(targets)
                                targets = np.concatenate([
                                    targets[None, :],
                                    np.zeros((1, CANDIDATES + 1 - len(L_k)))
                                ],
                                                         axis=1)
                    """ Add attributes, targets. """
                    if attrs.shape[0] < SEQ_LEN:
                        attrs = np.concatenate([
                            np.zeros((
                                SEQ_LEN - attrs.shape[0],
                                attrs.shape[1],
                            )), attrs
                        ],
                                               axis=0)
                    attrs = attrs[-SEQ_LEN:, :]
                    attrs = attrs.astype(np.float32)
                    _end_overfit_avoid(L_k)

                    X.append(attrs[None, :])
                    Y.append(targets)
                    mask = np.concatenate([
                        np.ones((len(L_k))),
                        np.zeros((CANDIDATES + 1) - len(L_k))
                    ]).astype(np.float32)[None, :]
                    MASK.append(mask)

                    LKS.append(
                        np.concatenate([
                            L_k, -1 * np.ones(((CANDIDATES + 1) - len(L_k), ))
                        ])[None, :])
                    ACTUAL.append(np.array([obj['item_bought']])[None, :])

                if len(X) == batch_size:

                    X = np.concatenate(X, axis=0)
                    Y = np.concatenate(Y, axis=0)

                    MASK = np.concatenate(MASK, axis=0)
                    LKS = np.concatenate(np.array(LKS).astype(np.int32),
                                         axis=0)
                    ACTUAL = np.concatenate(np.array(ACTUAL).astype(np.int32),
                                            axis=0)

                    yield (X, MASK, LKS, ACTUAL), Y
                    X = []
                    Y = []
                    MASK = []
                    LKS = []
                    ACTUAL = []

                    #print(attrs.shape)
        if full:
            # after the first full pass, keep cycling over every line
            check = (lambda i: True)
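
The fixed-length windowing above (left-pad with zeros, then keep the last SEQ_LEN rows) is easiest to see in isolation; a minimal, self-contained sketch with illustrative sizes only:

import numpy as np

SEQ_LEN = 5  # illustrative; the real constant is defined earlier in the source
attrs = np.random.rand(3, 4).astype(np.float32)  # 3 timesteps, 4 features
if attrs.shape[0] < SEQ_LEN:
    attrs = np.concatenate(
        [np.zeros((SEQ_LEN - attrs.shape[0], attrs.shape[1])), attrs], axis=0)
attrs = attrs[-SEQ_LEN:, :]
print(attrs.shape)  # (5, 4): zero rows first, the real timesteps last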
Ejemplo n.º 7
0
def fit_RNN():
    import tensorflow as tf
    from tensorflow import keras
    import tf_geometric as tfg
    """
        Create graph
    """
    df = read_item_data()

    NUM_ITEMS = df.shape[0]
    NUM_FEATURES = 1

    counter, f_map_func, r_map_func = get_mappings()

    NUM_DOMS = pd.unique(df['domain_id']).shape[0]
    """ Load graph """
    G = ig.Graph.Read_Pickle(path.join(DATA_DIR, 'graph_item_to_item.pkl'))
    #weights = np.log(1+np.array(G.es["weight"]))
    weights = np.array(G.es["weight"])

    indices = np.array([np.array(e.tuple) for e in G.es])
    indices = np.transpose(indices)
    """ Create sparse matrix W """
    from scipy.sparse import coo_matrix
    import scipy.sparse
    row = indices[0, :]
    col = indices[1, :]

    W = coo_matrix((weights, (row, col)), shape=(NUM_ITEMS, NUM_ITEMS))
    """ Normalize rows """
    #W = deg_matrix(W,pwr=-1) @ W
    W = W.transpose()
    W = scipy.sparse.csr_matrix(W)
    assert scipy.sparse.issparse(W)
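    # After the transpose, W[i, j] is the weight of the original edge j -> i,
    # so W @ v accumulates v over each item's in-neighbors in the view graph.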

    @tf.function
    def smooth_labels(labels, factor=0.001):
        # smooth the labels
        labels = tf.cast(labels, tf.float32)
        labels *= (1 - factor)
        labels += (factor / tf.cast(tf.shape(labels)[1], tf.float32))
        # returned the smoothed labels
        return labels
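    # E.g. with factor=0.1 over two classes, [1., 0.] becomes [0.95, 0.05];
    # applied to both labels and predictions so log() never sees an exact zero.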

    @tf.function
    def compute_loss(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_ITEMS))
        labels = tf.reshape(labels, (-1, NUM_ITEMS))

        #logits = tf.nn.softmax(logits)
        #print(logits)

        logits = smooth_labels(logits)
        labels = smooth_labels(labels)

        # cross-entropy: -sum(p_true * log(p_pred)); the model's Softmax
        # layer has already turned `logits` into probabilities
        losses = -tf.reduce_sum(labels * tf.math.log(logits), axis=1)

        return tf.reduce_mean(losses)

    @tf.function
    def evaluate(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_ITEMS))
        labels = tf.reshape(labels, (-1, NUM_ITEMS))

        #logits = tf.nn.softmax(logits)
        #print(logits)

        logits = smooth_labels(logits)
        labels = smooth_labels(labels)

        acc = tf.metrics.categorical_accuracy(labels, logits)

        return tf.reduce_mean(acc)

    """
        Read data, yadda yadda
    
    """
    from input.create_ratio import get_ratio
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()
    """ Ratio stuff """
    category_df = get_ratio(which='category_id', full=True)
    domain_df = get_ratio(which='domain_id', full=True)

    feat_1 = domain_df['searched'].to_dict()
    feat_2 = domain_df['bought'].to_dict()
    feat_3 = domain_df['rat'].to_dict()
    feat_1, feat_2, feat_3 = [[X[dct_domain[k]] for k in df.index]
                              for X in [feat_1, feat_2, feat_3]]

    feat_1_1 = category_df['searched'].to_dict()
    feat_2_1 = category_df['bought'].to_dict()
    feat_3_1 = category_df['rat'].to_dict()
    feat_1_1, feat_2_1, feat_3_1 = [[X[dct_cat[k]] for k in df.index]
                                    for X in [feat_1_1, feat_2_1, feat_3_1]]

    def standardize(x):
        return (x - np.min(x)) / (np.max(x) + 1e-06 - np.min(x))
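    # Despite the name this is min-max scaling: standardize([2, 4, 6])
    # gives roughly [0.0, 0.5, 1.0] (the 1e-06 guards against a zero range).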

    feat_1, feat_2, feat_3 = [standardize(x) for x in [feat_1, feat_2, feat_3]]

    feat_1_1, feat_2_1, feat_3_1 = [
        standardize(x) for x in [feat_1_1, feat_2_1, feat_3_1]
    ]

    del df
    del domain_df
    del category_df
    del G
    #dom_ratios = np.array([dct_ratio_dom[k] for k in pd.unique(df['domain_id'].values)])
    #dom_ratios = (dom_ratios - np.mean(dom_ratios)) / np.std(dom_ratios)

    from nn.domain_string_identifier import load_model
    domain_prediction_model = load_model()

    def my_generator(mode='train'):
        if mode == 'train':
            check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
        elif mode == 'val':
            check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
        else:
            check = lambda x: True
        DATA_PATH = path.join(
            DATA_DIR,
            'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
        print("Reading....")

        X = np.zeros((NUM_ITEMS, 10)).astype(np.float32)
        with jsonlines.open(DATA_PATH) as reader:
            for line_i, obj in enumerate(reader):

                if check(line_i):
                    L = []
                    S = []
                    C = []
                    IDS = []
                    for h in obj['user_history']:
                        if h['event_type'] == 'view':
                            L.append(dct_domain[h['event_info']])
                            C.append(dct_cat[h['event_info']])
                            IDS.append(h['event_info'])

                        elif h['event_type'] == 'search':
                            S.append(h['event_info'])

                    """ Skip sessions whose bought item was already viewed
                        (IDS holds the viewed item ids; L holds domains) """
                    if obj['item_bought'] in IDS:
                        continue

                    L = f_map_func['domain_id'](L)
                    C = f_map_func['category_id'](C)
                    IDS_map = f_map_func['item_id'](IDS)
                    """ Adjust graph """
                    Y = np.zeros((NUM_ITEMS, 1)).astype(np.float32)
                    """
                        X[:,0] = feat_1
                        X[:,1] = feat_2
                        X[:,2] = feat_3
                        X[:,6] = feat_1_1
                        X[:,7] = feat_2_1
                        X[:,8] = feat_3_1
                        
                        #if len(S) > 0:
                        #    X[:,8] =  np.mean(predict_model(domain_prediction_model,S,return_numeric=True),axis=0)
                        """
                    target_id = f_map_func['item_id']([obj['item_bought']])[0]
                    if not mode == 'test':
                        Y[target_id, 0] = 1.0
                    """
                        for i,k in enumerate(IDS_map):
                            X[k,3] +=  1
                            X[k,4] +=  dct_ratio_item_b[IDS[i]]/len(C)
                            X[k,5] =  dct_price[IDS[i]]
                        
                        #W[target_id,:] = (np.clip(np.array(W[target_id,:].todense())-1,a_min=0.0,a_max=None))
                        X[:,9] = np.reshape(np.asarray(W @ X[:,3]),(-1,))
                        X[:,9] = X[:,8] * X[:,2]
                        #X[:,:8] = 0

                        for i in range(10):
                            X[:,i] = (X[:,i] - np.min(X[:,i])) / (1e-06+ np.max(X[:,i]) - np.min(X[:,i])) 
                        """
                    #X = X -0.5
                    yield X, Y

    """
        Optimize
    """

    BS = 2

    def batch_generator(mode, loop=True, batch_size=BS):
        BATCH_X = []
        BATCH_Y = []
        i = 0
        while True:
            for x, y in my_generator(mode):

                BATCH_X.append(x[None, :, :])
                BATCH_Y.append(y[None, :, :])
                i += 1
                if i % batch_size == 0:
                    yield np.concatenate(BATCH_X,
                                         axis=0), np.concatenate(BATCH_Y,
                                                                 axis=0)
                    BATCH_X = []
                    BATCH_Y = []
                    i = 0
            if not loop:
                if BATCH_X:  # flush the final partial batch, if any
                    yield np.concatenate(BATCH_X,
                                         axis=0), np.concatenate(BATCH_Y,
                                                                 axis=0)
                break
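    # Usage sketch: next(batch_generator('train')) yields a pair of arrays
    # shaped (BS, NUM_ITEMS, 10) and (BS, NUM_ITEMS, 1).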

    """
        Define train_model
    """
    import tensorflow.keras as keras
    import tensorflow.keras.layers as layers
    inp_x = keras.Input((NUM_ITEMS, 10))
    x = layers.Dense(32, activation='relu')(inp_x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dense(1)(x)
    x = layers.Flatten()(x)
    x = layers.Softmax(axis=-1)(x)

    train_model = keras.Model(inputs=[inp_x], outputs=[x])
    print(train_model.summary())
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.2 * 1e-2)

    train_model.compile(optimizer=optimizer,
                        loss=compute_loss,
                        metrics=[evaluate])
    from input.read_input import TRAIN_LINES
    train_model.fit(batch_generator('train', True),
                    steps_per_epoch=TRAIN_LINES // BS,
                    epochs=1)

    ITEM_PATH = path.join(DATA_DIR, 'train_model', 'item_classifier.h5')
    train_model.save_weights(ITEM_PATH)

    def predict(mode):
        PREDS = []
        CONFS = []
        NUM_SELECT = 10
        batch_size = 1
        for batch_id, X in enumerate(
                batch_generator(mode, batch_size=batch_size, loop=False)):
            x = X[0]
            print("Predicting {} - Batch {}".format(mode, batch_id))
            pred = train_model.predict_on_batch(x)
            if batch_id == 0:
                print(pred)
            # tf.argsort/tf.sort are ascending, so the last column holds the
            # most confident prediction
            PREDS.append(tf.argsort(pred, axis=-1)[:, -NUM_SELECT:])
            CONFS.append(tf.sort(pred, axis=-1)[:, -NUM_SELECT:])

        PREDS = np.concatenate(PREDS, axis=0)
        CONFS = np.concatenate(CONFS, axis=0)
        #PREDS = np.concatenate([PREDS,CONFS],axis=1)
        cols = ['pred_{}'.format(k) for k in range(NUM_SELECT)]
        fname = path.join(DATA_DIR, 'item_pred_{}.csv'.format(mode))
        pd.DataFrame(PREDS, index=range(PREDS.shape[0]),
                     columns=cols).to_csv(fname)

    predict('train')
    predict('val')
    predict('test')
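
A minimal sketch of how the saved predictions could be read back, assuming the same DATA_DIR and the reverse mapping returned by get_mappings() above; since tf.argsort sorts ascending, pred_9 is taken here as the most confident column (column naming per the code above, inverse-mapping call assumed symmetric to f_map_func):

import pandas as pd
from os import path

from input.read_input import get_mappings  # as in the listing above; DATA_DIR as defined in the project

counter, f_map_func, r_map_func = get_mappings()
preds = pd.read_csv(path.join(DATA_DIR, 'item_pred_val.csv'), index_col=0)
top1 = preds['pred_9'].to_numpy()         # last column = most confident
top1_items = r_map_func['item_id'](top1)  # assumed inverse of f_map_func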