def title_generator(mode='train'):
    df = read_item_data()
    NUM_DOMS = pd.unique(df['domain_id']).size
    dom_to_id = dict([(x, i)
                      for i, x in enumerate(pd.unique(df['domain_id']))])

    BATCH_X = []
    BATCH_Y = []

    while True:
        line_id = 0
        for tit, dom in (zip(df['title'], df['domain_id'])):
            target = np.zeros((NUM_DOMS, ), dtype=np.float32)
            target[dom_to_id[dom]] = 1

            tit = preprocess_title(tit)

            embeddings = sentence_model.encode(tit)

            BATCH_X.append(embeddings[None, :, :])
            BATCH_Y.append(target[None, :])

            # increment before the modulo check so the first yield carries a
            # full batch of BS examples rather than a single one
            line_id += 1
            if line_id % BS == 0:
                X = np.concatenate(BATCH_X, axis=0)
                Y = np.concatenate(BATCH_Y, axis=0)
                BATCH_X = []
                BATCH_Y = []
                yield X, Y
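

# `preprocess_title` and `sentence_model` come from elsewhere in the repo and are
# not shown in this snippet. A hypothetical stand-in consistent with how the
# generator uses them (a fixed-length token list fed to a sentence encoder):
import re

def _preprocess_title_sketch(title, num_words=10):  # num_words: assumed NUM_WORDS
    tokens = re.findall(r"\w+", str(title).lower())[:num_words]
    tokens += [''] * (num_words - len(tokens))  # pad to a fixed length
    return tokens
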
def create_language():
    df = read_item_data()
    import fasttext
    import urllib.request
    model_fname = path.join(DATA_DIR, "lid.176.bin")
    if not path.isfile(model_fname):
        print("Did not find fasttext model at {}".format(model_fname))
        print("Trying to download from the web...")
        try:
            urllib.request.urlretrieve(
                "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
                model_fname)

        except Exception as exc:
            raise Exception("Could not get fasttext model") from exc
    if not path.isfile(model_fname):
        raise Exception("Could not get fasttext model")
    else:
        print("Fasttext model found at {}".format(model_fname))

    lid_model = fasttext.load_model(model_fname)

    def get_language(i, x):
        if i % 10000 == 0:
            print(i)  # progress log
        # k=999 with threshold=-1.0 returns scores for all supported languages
        languages, scores = lid_model.predict(str(x), k=999, threshold=-1.0)
        languages = np.array(languages)

        return scores[np.where(
            languages == '__label__es')[0][0]], scores[np.where(
                languages == '__label__pt')[0][0]], scores[np.where(
                    languages == '__label__en')[0][0]]

    X = np.array(
        [get_language(i, x) for i, x in enumerate(df['title'].values)])
    for i, c in enumerate(['score_es', 'score_pt', 'score_en']):
        df[c] = X[:, i]
    df.loc[:, ['score_es', 'score_pt', 'score_en']].to_csv(
        path.join(DATA_DIR, 'language_identification.csv'))
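

# `load_language_df` (used by the training code below) lives in
# input/create_ratio.py and is not shown here. A hypothetical reconstruction
# that simply reads back the CSV written above:
def _load_language_df_sketch():
    return pd.read_csv(path.join(DATA_DIR, 'language_identification.csv'),
                       index_col=0)
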
def train_neural_domain_prediction():
    import tensorflow as tf
    """
        Create graph
    """
    from input.read_input import read_item_data
    df = read_item_data()
    dct_condition = df['condition'].to_dict()

    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()

    NUM_ITEMS = df.shape[0]
    NUM_FEATURES = 1

    from input.read_input import get_mappings
    counter, f_map_func, r_map_func = get_mappings()

    NUM_DOMS = pd.unique(df['domain_id']).shape[0]
    NUM_CATS = pd.unique(df['category_id']).shape[0]
    
    """ Load graph """
    graph_fname = path.join(DATA_DIR,'graph_domain_to_domain.pkl')
    if not path.isfile(graph_fname):
        input("Did not find graph at {}. Will have to create it from scratch... (Any key to continue)".format(graph_fname))
        G = create_graph_domain()
    else:
        G = ig.Graph.Read_Pickle(path.join(DATA_DIR,'graph_domain_to_domain.pkl'))
    #weights = np.log(1+np.array(G.es["weight"]))
    weights = np.array(G.es["weight"])
    
    # domain vertices come after the NUM_ITEMS item vertices, so shift their
    # ids down to a 0-based range
    indices = np.array([np.array(e.tuple) for e in G.es]) - NUM_ITEMS
    indices = np.transpose(indices)
    
    """ Create sparse matrix W """
    from scipy.sparse import coo_matrix
    import scipy.sparse
    row = indices[0,:]
    col = indices[1,:]
    
    W = coo_matrix((weights, (row, col)),shape=(NUM_DOMS,NUM_DOMS))
    """ Normalize rows """
    #W = deg_matrix(W,pwr=-1) @ W
    W = W.transpose()
    W = scipy.sparse.csr_matrix(W)
    assert scipy.sparse.issparse(W)
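
    # The degree-based row normalization above is left commented out. A sketch
    # of what `deg_matrix(W, pwr=-1) @ W` would amount to (`deg_matrix` is
    # assumed and not shown in this snippet):
    def _row_normalize_sketch(M):
        import scipy.sparse
        M = scipy.sparse.csr_matrix(M)
        deg = np.asarray(M.sum(axis=1)).ravel()
        deg[deg == 0] = 1.0  # avoid division by zero for isolated vertices
        return scipy.sparse.csr_matrix(scipy.sparse.diags(1.0 / deg) @ M)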
    
            
    
    @tf.function
    def smooth_labels(labels, factor=0.001):
        # smooth the labels
        labels = tf.cast(labels,tf.float32)
        labels *= (1 - factor)
        labels += (factor / tf.cast(tf.shape(labels)[1],tf.float32))
        # returned the smoothed labels
        return labels
    @tf.function
    def compute_loss(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_DOMS))
        labels = tf.reshape(labels, (-1, NUM_DOMS))

        #logits = tf.nn.softmax(logits)
        #print(logits)

        logits = smooth_labels(logits)
        labels = smooth_labels(labels)

        # standard cross-entropy: -sum(targets * log(predictions))
        losses = -tf.reduce_sum(labels * tf.math.log(logits), axis=1)

        return tf.reduce_mean(losses)
    
    @tf.function
    def evaluate(labels,logits):
        logits = tf.reshape(logits,(-1,NUM_DOMS))
        labels = tf.reshape(labels,(-1,NUM_DOMS))

        #logits = tf.nn.softmax(logits)
        #print(logits)
        
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        
        acc = tf.metrics.categorical_accuracy(labels,logits)
        
        return tf.reduce_mean(acc)
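
    # For reference: smoothing a one-hot row with factor=0.001 over 4 classes
    # gives 1*0.999 + 0.001/4 = 0.99925 on the hot entry and 0.00025 elsewhere,
    # so the log() in compute_loss is always finite.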
    
    
    
    """
        Read data, yadda yadda
    
    """
    from input.create_ratio import get_ratio
    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()
    

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()
    
  
    
    """ Ratio stuff """    
    from input.create_ratio import get_ratio
    category_df = get_ratio(which='category_id',full=True)
    domain_df = get_ratio(which='domain_id', full = True)
    
  
    
    feat_1, feat_2, feat_3 = domain_df['searched'].values, domain_df['bought'].values, domain_df['rat'].values
    
    feat_4, feat_5 = domain_df['out_bought'].values,domain_df['rat2'].values
    
    feat_1_1, feat_2_1, feat_3_1 = category_df['searched'].values, category_df['bought'].values, category_df['rat'].values
    
    
    def standardize(x):
        # min-max scaling into [0, 1] (not z-score standardization, despite the name)
        return (x - np.min(x)) / (np.max(x) + 1e-06 - np.min(x))

    feat_1, feat_2, feat_3 = [standardize(x) for x in [feat_1, feat_2, feat_3]]
    feat_1_1, feat_2_1, feat_3_1 = [standardize(x) for x in [feat_1_1, feat_2_1, feat_3_1]]
    
    #dom_ratios = np.array([dct_ratio_dom[k] for k in pd.unique(df['domain_id'].values)])
    #dom_ratios = (dom_ratios - np.mean(dom_ratios)) / np.std(dom_ratios)

    
    
    from  nn.domain_string_identifier import load_model
    domain_prediction_model = load_model()
    def my_generator(mode='train'):
            if mode == 'train':
                check = lambda x: x <= np.round(413163*0.8).astype(np.int32)
            elif mode == 'val':
                check = lambda x: x > np.round(413163*0.8).astype(np.int32)
            else:
                check = lambda x: True
            DATA_PATH = path.join(DATA_DIR,'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
            print("Reading....")
            with jsonlines.open(DATA_PATH) as reader:
                for line_i, obj in enumerate(reader):
                    if check(line_i):
                        L = []
                        S = []
                        C = []
                        IDS = []
                        for h in obj['user_history']:
                            if h['event_type'] == 'view':
                                L.append(dct_domain[h['event_info']])
                                C.append(dct_cat[h['event_info']])
                                IDS.append(h['event_info'])
                                
                            elif h['event_type'] == 'search':
                                S.append(h['event_info'])
    
                        
                        
                        
                        L = f_map_func['domain_id'](L)
                        C = f_map_func['category_id'](C)
                        
                        df = pd.DataFrame(
                            {"domain_id":L,
                             "feat_1_1":[feat_1_1[C[i]-NUM_ITEMS-NUM_DOMS] for i in range(len(L))],
                             "feat_2_1":[feat_2_1[C[i]-NUM_ITEMS-NUM_DOMS] for i in range(len(L))],
                             "feat_3_1":[feat_3_1[C[i]-NUM_ITEMS-NUM_DOMS] for i in range(len(L))],
                             
                             },
                            index=IDS)
                        
                        
                        df['recency'] = range(len(L))
                        df['freq'] = np.ones((len(L),))
                        df['price'] = [dct_price[k] for k in IDS]
                        df['item_b'] = [dct_ratio_item_b[k] for k in IDS]
                        df['item_s'] = [dct_ratio_item_s[k] for k in IDS]

                        # encode the 'new'/'used' condition string numerically
                        # so the aggregations below stay numeric
                        df['condition'] = [1.0 if dct_condition[k] == 'new' else 0.0
                                           for k in IDS]
                        df['lan_pt'] = [dct_lan_pt[k] for k in IDS]
                        df['lan_en'] = [dct_lan_en[k] for k in IDS]
                        df['lan_es'] = [dct_lan_es[k] for k in IDS]
                        
                        
                        """ Adjust graph """
                        Y = np.zeros((NUM_DOMS,1)).astype(np.float32)
                        X = np.zeros((NUM_DOMS,55+55)).astype(np.float32)
                        
                        
                        X[:,0] = feat_1
                        X[:,1] = feat_2
                        X[:,2] = feat_3
                        X[:,3] = feat_4

                        i = 4
                        for g, df2 in df.groupby("domain_id"):
                            i = 4  # feature columns restart at 4 for every domain row
                            v = df2.to_numpy()[:, 1:]
                            X[g - NUM_ITEMS, i:i + v.shape[1]] = np.sum(v, axis=0)
                            i += v.shape[1]
                            X[g - NUM_ITEMS, i:i + v.shape[1]] = np.mean(v, axis=0)
                            i += v.shape[1]
                            X[g - NUM_ITEMS, i:i + v.shape[1]] = np.nanstd(v, axis=0)
                            i += v.shape[1]
                            X[g - NUM_ITEMS, i:i + v.shape[1]] = np.max(v, axis=0)
                            i += v.shape[1]

                        

                        # search-based domain predictions; these columns stay
                        # zero when the session has no searches
                        if len(S) > 0:
                            s_pred = predict_model(domain_prediction_model, S, return_numeric=True)
                            X[:, i] = np.mean(s_pred, axis=0)
                            X[:, i + 1] = np.max(s_pred, axis=0)
                            try:
                                X[:, i + 2] = np.nanstd(s_pred, axis=0)
                            except Exception:
                                pass
                            i += 3
                        
                        # propagate the raw feature block over the domain graph
                        # into the second block of 55 columns
                        X[:, 55:] = np.reshape(np.asarray(W @ X[:, :55]), (-1, X.shape[1] - 55))
                        if not mode == 'test':
                            Y[     f_map_func['domain_id']( [ dct_domain[obj['item_bought']] ] )[0] - NUM_ITEMS,0    ] = 1.0
                        
                        
                        #X[:,:8] = 0

                        for i in range(55+3):
                            X[:,i] = (X[:,i] - np.min(X[:,i])) / (1e-06+ np.max(X[:,i]) - np.min(X[:,i])) 
                        
                        #X = X -0.5
                        yield X,Y
                    
    """
        Optimize
    """

    BS = 64
    step = 0
    
    def batch_generator(mode, loop=True, batch_size=BS):
        BATCH_X = []
        BATCH_Y = []
        i = 0
        while True:
            for x, y in my_generator(mode):
                BATCH_X.append(x[None, :, :])
                BATCH_Y.append(y[None, :, :])
                i += 1
                if i % batch_size == 0:
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                    BATCH_X = []
                    BATCH_Y = []
                    i = 0
            if not loop:
                # flush the final partial batch, if any, then stop
                if BATCH_X:
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                break
    """
        Define model
    """
    import  tensorflow.keras as keras
    import tensorflow.keras.layers as layers
    inp_x = keras.Input((NUM_DOMS,55+55))
    x = layers.Dense(64,activation='relu')(inp_x)
    x = layers.Dense(64,activation='relu')(x)
    x = layers.Dense(64,activation='relu')(x)
    x = layers.Dense(1)(x)
    x = layers.Flatten()(x)
    x = layers.Softmax(axis=-1)(x)
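    # The Dense stack is applied to each of the NUM_DOMS rows independently;
    # Dense(1) + Flatten + Softmax then turns the per-domain scores into a
    # distribution over domains.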
    
    model = keras.Model(inputs=[inp_x],outputs=[x])
    print(model.summary())
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.5 * 1e-2,
        decay_steps=1000,
        decay_rate=0.9)  # defined but not passed to the optimizer below
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1 * 1e-2)
    
    model_fname = path.join(DATA_DIR,'model',"NEURAL_DOMAIN_PRED.h5")
    model.compile(optimizer=optimizer,loss=compute_loss,metrics=[evaluate])
    from functools import partial
    from input.read_input import TRAIN_LINES
    
    #model.load_weights(path.join(DATA_DIR,"MY_MODEL_2.h5"))
    if not path.isfile(model_fname):
        input("Warning!!! Did not find model weights at {}. Training takes many, many, many hours! (Press ENTER)".format(model_fname))
        
        model.fit(batch_generator('train', True),
                  steps_per_epoch=TRAIN_LINES // BS,
                  epochs=5)
        model.save_weights(model_fname)

    else:
        model.load_weights(model_fname)
        print("Testing fit... should be about 0.41 to 0.45")
        model.fit(batch_generator('train', True),
                  steps_per_epoch=25,
                  epochs=1)

    
    
    def predict(mode):
        PREDS = []
        CONFS = []
        NUM_SELECT = 10
        batch_size = 320
        for batch_id, X in enumerate(batch_generator(mode,batch_size=batch_size,loop=False)):
            x = X[0]
            print("Predicting {} - Batch {}".format(mode,batch_id))
            pred = model.predict_on_batch(x)
            if batch_id == 0:
                print(pred)
            PREDS.append(tf.argsort(pred,axis=-1)[:,-NUM_SELECT:])
            CONFS.append(tf.sort(pred,axis=-1)[:,-NUM_SELECT:])
            
        PREDS = np.concatenate(PREDS,axis=0)
        CONFS = np.concatenate(CONFS,axis=0)
        PREDS = np.concatenate([PREDS,CONFS],axis=1)
        cols = ['pred_{}'.format(k) for k in range(NUM_SELECT)] + \
         ['conf_{}'.format(k) for k in range(NUM_SELECT)] 
        fname = path.join(DATA_DIR, 'dom_pred_{}.csv'.format(mode))
        pd.DataFrame(PREDS,index=range(PREDS.shape[0]),columns=cols).to_csv(fname)
    
    predict('val')
    predict('test')
    predict('train')
def create_graph_domain():
    """
        Creates graph linking (domain searched, domain bought)
    """
    
    """
        Fetch data
    """
    
    from input.read_input import read_item_data
    df = read_item_data()
    df['item_id'] = df.index
    dct_title = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat= df['category_id'].to_dict()
    
    dct_price = df['price'].to_dict()
    
    """ Ratio stuff """    
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')
    
    ratio_df = get_ratio(which='item_id',full=True)
    ratio_df['popularity'] = 100.0*ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()
    
    
    
    """
        JSON
    
    """
    check = lambda x: x <= np.round(413163*0.8).astype(np.int32)
    
    DATA_PATH = path.join(DATA_DIR,'train_dataset.jl')
    line_i = 0
    
    

    """
        Create graph vertices
    """
    g = ig.Graph() 
    from input.read_input import get_mappings
    counter, f_map_func, r_map_func = get_mappings()
    
    num_items = df.shape[0]
    for k in dct_title.keys():
        g.add_vertex(value=k,deg=dct_ratio_item_b[k],domain_id=dct_domain[k],price=dct_price[k],cat='item_id')

    """ ['item_id','domain_id','category_id','product_id'] """
    
    for k in pd.unique(df['domain_id']):
        g.add_vertex(value=k,cat='domain_id')


    for k in pd.unique(df['category_id']):
        g.add_vertex(value=k,cat='category_id')


    for k in pd.unique(df['product_id']):
        g.add_vertex(value=k,cat='product_id')

    
    
    """
        Create edges
    """
    E1 = []
    E2 = []
    
    with jsonlines.open(DATA_PATH) as reader:
        for line_i, obj in enumerate(reader):
            if check(line_i):
                print(line_i)
                L = []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        #print("Viewed {}".format(dct[h['event_info']]))
                        L.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        #print("Searched {}".format(h['event_info']))
                        pass
                L_domain = [dct_domain[k] for k in L]
                L_domain = pd.unique(L_domain)
                L_cat = [dct_cat[k] for k in L]
                L_cat = pd.unique(L_cat)
                
                for i in range(len(L)):
                    E1.append(dct_domain[L[i]])
                    E2.append(dct_domain[obj['item_bought']])

    
    
    E1 = f_map_func['domain_id'](E1)
    E2 = f_map_func['domain_id'](E2)
    
    
    E = pd.Series(list(zip(E1, E2))).value_counts()
    g.add_edges(E.index)
    g.es["weight"] = E.values

    g.write_pickle(fname=path.join(DATA_DIR, 'graph_domain_to_domain.pkl'))
    return g

def train_domain_identifier():
    """
        Fine-tunes the title-based domain classifier on title_generator.
    """
    import tensorflow as tf
    from tensorflow import TensorShape as ts

    train_model = get_model()
    train_ds = tf.data.Dataset.from_generator(
        title_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=(ts([None, NUM_WORDS, 512]), ts([None, NUM_DOMS])))
    train_model.load_weights(DOMAIN_IDENTIFIER_PATH)
    train_model.fit(x=train_ds, steps_per_epoch=TRAIN_LINES // BS, epochs=1)
    train_model.save_weights(DOMAIN_IDENTIFIER_PATH)


doms = pd.unique(read_item_data()['domain_id'])


def load_model():
    train_model = get_model()
    train_model.load_weights(DOMAIN_IDENTIFIER_PATH)
    return train_model


def predict_model(train_model,
                  query_list,
                  return_numeric=False,
                  return_emb=False):
    """
        Returns prediction of train_model on batch of input
    """
def create_ratio(mode='train', CUTOFF=50, which='domain_id', alternate=False):
    assert mode in ['train', 'val']
    assert which in [
        'domain_id', 'category_id', 'item_id', 'price', 'condition'
    ]
    df = read_item_data()

    df['price'] = pd.qcut(df['price'].values, 100)

    dct_attr = df[which].to_dict()
    dct_dom = df['domain_id'].to_dict()

    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        raise Exception("mode must be train or val")

    DATA_PATH = path.join(DATA_DIR, 'train_dataset.jl')
    i = 0
    """ Create dictionary holding domain counts (searched, bought) """
    attr_s = dict([(k, 0) for k in pd.unique(df[which])])
    attr_b = dict([(k, 0) for k in pd.unique(df[which])])
    attr_o = dict([(k, 0) for k in pd.unique(df[which])])
    with jsonlines.open(DATA_PATH) as reader:
        for obj in reader:
            if check(i):
                #print(i)
                L = []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        #print("Viewed {}".format(dct[h['event_info']]))
                        L.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        #print("Searched {}".format(h['event_info']))
                        pass

                # de-duplicate views, keeping each item's most recent
                # occurrence, in chronological order
                L_k = pd.unique(L[::-1])[::-1]

                attr_unique = list(pd.unique([dct_attr[k] for k in L_k]))
                for dom in attr_unique:
                    if dom in attr_s:
                        attr_s[dom] += 1
                if alternate:
                    # alternate mode assumes `which` yields item ids, since the
                    # values are looked up in the item -> domain mapping
                    for attr in attr_unique:
                        if dct_dom[attr] == dct_dom[obj['item_bought']]:
                            attr_b[attr] += 1
                        else:
                            attr_o[attr] += 1
                else:
                    if dct_attr[obj['item_bought']] in attr_unique:
                        attr_b[dct_attr[obj['item_bought']]] += 1
                    else:
                        attr_o[dct_attr[obj['item_bought']]] += 1

            i += 1
            #L.append(obj)

    attr_b = pd.DataFrame.from_dict(attr_b, orient='index')
    attr_s = pd.DataFrame.from_dict(attr_s, orient='index')
    attr_o = pd.DataFrame.from_dict(attr_o, orient='index')

    attr_b.columns = ['bought']
    attr_s.columns = ['searched']
    attr_o.columns = ['out_bought']
    attr_b['bought'] = attr_b['bought'].values.astype(np.float32)
    attr_s['searched'] = attr_s['searched'].values.astype(np.float32)

    rat = attr_b['bought'].values / (1.0 + attr_s['searched'].values)
    rat[attr_s['searched'].values < CUTOFF] = np.mean(
        rat[attr_s['searched'].values >= CUTOFF])

    rat2 = attr_o['out_bought'].values / (1.0 + attr_b['bought'].values)
    rat2[attr_s['searched'].values < CUTOFF] = np.mean(
        rat2[attr_s['searched'].values >= CUTOFF])

    rat = pd.DataFrame({"rat": np.array(rat)}, index=attr_b.index)
    rat2 = pd.DataFrame({"rat2": np.array(rat2)}, index=attr_b.index)

    res = pd.concat([attr_s, attr_b, attr_o, rat, rat2], axis=1)
    if alternate:
        res.to_csv(path.join(DATA_DIR, '{}_ratio_alternate.csv'.format(which)))
    else:
        res.to_csv(path.join(DATA_DIR, '{}_ratio.csv'.format(which)))
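

# `get_ratio` is defined in input/create_ratio.py and not shown in this snippet.
# A hypothetical reconstruction, assumed to read back the CSV that create_ratio()
# writes (`full=True` returns the whole frame, the default just the 'rat' dict):
def _get_ratio_sketch(which='domain_id', full=False, alternate=False):
    suffix = '_ratio_alternate.csv' if alternate else '_ratio.csv'
    res = pd.read_csv(path.join(DATA_DIR, which + suffix), index_col=0)
    if full:
        return res  # columns: searched, bought, out_bought, rat, rat2
    return res['rat'].to_dict()
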
def meli_iterator(mode='train', batch_size=BATCH_SIZE, full=False):
    from input.read_input import get_sentence_model, get_emb

    from input.create_ratio import load_language_df
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()

    dct_condition = df['condition'].to_dict()

    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()
    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')

    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_p = ratio_df['popularity'].to_dict()

    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()

    df['item_bought'] = [dct_ratio_item_b[k] for k in df.index]

    dct_ratio_cat = get_ratio(which='category_id', full=True)
    dct_ratio_cat_s = dct_ratio_cat['searched'].to_dict()
    dct_ratio_cat_b = dct_ratio_cat['bought'].to_dict()
    dct_ratio_cat = dct_ratio_cat['rat'].to_dict()

    dct_ratio_dom = get_ratio(which='domain_id', full=True)
    dct_ratio_dom_s = dct_ratio_dom['searched'].to_dict()
    dct_ratio_dom_b = dct_ratio_dom['bought'].to_dict()
    dct_ratio_dom = dct_ratio_dom['rat'].to_dict()

    dct_ratio_item = get_ratio(which='item_id')

    dct_domain_df = {}
    dct_cat_df = {}
    for dom, df2 in df.groupby('domain_id'):
        df2 = df2.sort_values(['item_bought'], ascending=False)  #.iloc[0:10,:]
        dct_domain_df[dom] = df2

    for cat, df2 in df.groupby('category_id'):
        df2 = df2.sort_values(['item_bought'], ascending=False)  #.iloc[0:10,:]
        dct_cat_df[cat] = df2
    del df
    del df2

    def _begin_overfit_avoid(L_k):
        # Temporarily removes the current session's own contribution from the
        # global counts (reads `obj` from the enclosing read loop), so a
        # training sample never sees statistics that include its own target.
        if not mode == 'train':
            return
        target_item = obj['item_bought']
        target_dom = dct_domain[obj['item_bought']]
        target_cat = dct_cat[obj['item_bought']]
        for this_item in L_k:
            """ Bought """
            if this_item == target_item:
                assert dct_ratio_item_b[this_item] > 0
                dct_ratio_item_b[this_item] -= 1
            """ Search """
            dct_ratio_item_s[this_item] -= 1
            assert dct_ratio_item_s[this_item] >= 0
            """ Ratio """
            dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (
                dct_ratio_item_s[this_item] + 1)
        for this_dom in pd.unique([dct_domain[k] for k in L_k]):
            if not isinstance(this_dom, str):
                continue
            """ Bought """
            if this_dom == target_dom:
                assert dct_ratio_dom_b[this_dom] > 0
                dct_ratio_dom_b[this_dom] -= 1
            """ Search """
            dct_ratio_dom_s[this_dom] -= 1
            assert dct_ratio_dom_s[this_dom] >= 0
            """ Ratio """
            dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (
                dct_ratio_dom_s[this_dom] + 1)
        for this_cat in pd.unique([dct_cat[k] for k in L_k]):
            """ Bought """
            if this_cat == target_cat:
                assert dct_ratio_cat_b[this_cat] > 0
                dct_ratio_cat_b[this_cat] -= 1
            """ Search """
            dct_ratio_cat_s[this_cat] -= 1
            assert dct_ratio_cat_s[this_cat] >= 0
            """ Ratio """
            dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (
                dct_ratio_cat_s[this_cat] + 1)

    def _end_overfit_avoid(L_k):
        # Restores the counts removed by _begin_overfit_avoid.
        if not mode == 'train':
            return
        target_item = obj['item_bought']
        target_dom = dct_domain[obj['item_bought']]
        target_cat = dct_cat[obj['item_bought']]
        for this_item in L_k:
            """ Bought """
            if this_item == target_item:
                #assert dct_ratio_item_b[this_item] >= 0
                dct_ratio_item_b[this_item] += 1
            """ Search """
            #assert dct_ratio_item_s[this_item] >= 0
            dct_ratio_item_s[this_item] += 1
            """ Ratio """
            dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (
                dct_ratio_item_s[this_item] + 1)
        for this_dom in pd.unique([dct_domain[k] for k in L_k]):
            if not isinstance(this_dom, str):
                continue
            """ Bought """
            if this_dom == target_dom:
                #assert dct_ratio_dom_b[this_dom] >= 0
                dct_ratio_dom_b[this_dom] += 1
            """ Search """
            #assert dct_ratio_dom_s[this_dom] >= 0
            dct_ratio_dom_s[this_dom] += 1
            """ Ratio """
            dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (
                dct_ratio_dom_s[this_dom] + 1)
        for this_cat in pd.unique([dct_cat[k] for k in L_k]):
            """ Bought """
            if this_cat == target_cat:
                #assert dct_ratio_cat_b[this_cat] >= 0
                dct_ratio_cat_b[this_cat] += 1
            """ Search """
            #assert dct_ratio_cat_s[this_cat] >= 0
            dct_ratio_cat_s[this_cat] += 1
            """ Ratio """
            dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (
                dct_ratio_cat_s[this_cat] + 1)

    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        check = lambda x: True

    DATA_PATH = path.join(
        DATA_DIR, 'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')

    def rank_to_order(L, rank):
        assert rank.shape[0] == L.shape[0]
        return L[(-rank).argsort(kind='mergesort')]
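
    # e.g. rank_to_order(np.array(['a', 'b', 'c']), np.array([1, 3, 2]))
    # returns array(['b', 'c', 'a']): highest-ranked first, with mergesort
    # keeping ties stable.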

    pred = {}

    actual = []

    X = []
    Y = []
    MASK = []
    LKS = []
    ACTUAL = []
    while True:
        with jsonlines.open(DATA_PATH) as reader:
            print("Start!!!")
            for line_id, obj in enumerate(reader):
                if check(line_id):
                    #print(i)
                    L = []
                    timestamps = []
                    dct_emb = {}
                    if mode == 'test':
                        obj['item_bought'] = -999
                    for h in obj['user_history']:
                        if h['event_type'] == 'view':
                            L.append(h['event_info'])
                            timestamps.append(
                                pd.Timestamp(h['event_timestamp']))
                        elif h['event_type'] == 'search':
                            pass

                    def divide_time(d):
                        # split a timedelta into (days, hours, minutes), with
                        # the hour and minute parts scaled into [0, 1)
                        d = pd.Timedelta(d).total_seconds()
                        MINUTE_M = 60
                        HOUR_M = MINUTE_M * 60
                        DAY_M = HOUR_M * 24

                        div = [1, 24, 60]
                        res = [0, 0, 0]
                        for i, M in enumerate([DAY_M, HOUR_M, MINUTE_M]):
                            res[i] = np.floor(d / M)
                            d -= M * res[i]
                            res[i] /= div[i]
                            #res[i] -= 0.5

                        return tuple(res)
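
                    # e.g. divide_time(pd.Timedelta('1 day 2:30:00')) returns
                    # (1.0, 2/24, 30/60): one day, two hours, thirty minutes.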

                    if not full and len(L) < 2:
                        continue
                    """ Create attributes """
                    if len(L) == 0:
                        attrs = np.zeros(
                            (1, (CANDIDATES + 1) + ATTR_SIZE + EMB_SIZE))
                        targets = np.zeros((1, (CANDIDATES + 1)))
                        targets[0, -1] = 0
                        L_k = []
                    else:
                        delta = [
                            timestamps[-1] - timestamps[i]
                            for i in range(0, len(timestamps))
                        ]
                        """
                            We'll use the latest delta
                        """
                        L = L[::-1]
                        u, unique_id = np.unique(np.array(L),
                                                 return_index=True)

                        #delta_day, delta_hour, delta_minute = zip(*[divide_time(d) for d in delta])
                        deltas = np.array([divide_time(d) for d in delta])
                        deltas = deltas[unique_id][:SEQ_LEN]

                        L_k = np.array(L)[unique_id][:CANDIDATES]
                        _begin_overfit_avoid(L_k)
                        """
                            rank_freq initial calculation needs whole L
                        """
                        rank_freq = pd.Series(L, index=range(
                            len(L))).value_counts(sort=False, normalize=True)
                        rank_freq = rank_freq.rank(method="average").to_dict()

                        L = np.array(L)[unique_id][:SEQ_LEN]
                        """
                            Calculate ranks
                        """
                        condition = np.array([
                            1.0 if dct_condition[k] == 'new' else 0.0
                            for k in L
                        ])[:, None]

                        #ratio_dom = np.array([dct_ratio_dom[dct_domain[k]] for k in L])[:,None]
                        #ratio_cat = np.array([dct_ratio_cat[dct_cat[k]] for k in L])[:,None]
                        #ratio_item = np.array([dct_ratio_item[k] for k in L])[:,None]
                        price = np.log(
                            np.array([
                                1 + np.abs(fix_na(dct_price[k])) for k in L
                            ])[:, None])
                        rank_freq = np.array([rank_freq[k] for k in L])[:, None]
                        #rank_latest = (1.0 - np.arange(len(L))/len(L))[:,None]

                        rank_ratio_dom = pd.Series([
                            dct_ratio_dom[dct_domain[k]] for k in L_k
                        ]).rank(method="average").to_numpy()
                        rank_ratio_cat = pd.Series([
                            dct_ratio_cat[dct_cat[k]] for k in L_k
                        ]).rank(method="average").to_numpy()
                        rank_ratio_item = pd.Series([
                            dct_ratio_item_r[k] for k in L_k
                        ]).rank(method="average").to_numpy()
                        rank_latest = (1.0 - np.arange(len(L)) / len(L))

                        x = []
                        x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
                        x.append(rank_ratio_dom)
                        x.append(rank_ratio_cat)
                        x.append(rank_ratio_item)

                        x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])

                        x.append([dct_ratio_cat[dct_cat[k]] for k in L_k])
                        x.append([dct_ratio_item_b[k] for k in L_k])
                        x.append([dct_ratio_item_s[k] for k in L_k])
                        x.append([dct_ratio_item_r[k] for k in L_k])
                        x.append(list(rank_latest / len(L_k)))
                        x.append([-dct_price[k] for k in L_k])

                        # condition is a string ('new'/'used'), so encode it
                        # numerically before negating
                        x.append([
                            -1.0 if dct_condition[k] == 'new' else 0.0
                            for k in L_k
                        ])
                        x.append([-dct_lan_en[k] for k in L_k])
                        x.append([-dct_lan_es[k] for k in L_k])
                        x.append([-dct_lan_pt[k] for k in L_k])
                        """
                            
                        """
                        #true_val = (np.array([str(dct_domain[k]) for k in L]) == dct_domain[obj['item_bought']])
                        #true_val = np.logical_and(true_val,[k in L_k for k in L])
                        #true_val = true_val[:,None]
                        #true_val = np.ones_like(true_val)
                        #true_val = np.random.rand(*(true_val.shape))

                        assert all([k in L for k in L_k])

                        ids = [
                            np.where(
                                L_k == l)[0][0] if l in L_k else CANDIDATES
                            for l in L
                        ]
                        ids_onehot = np.zeros((len(L), (CANDIDATES + 1)))
                        ids_onehot[np.arange(len(L)), ids] = 1
                        #ids_onehot = ids_onehot[:,0:10]
                        """
                            Create numeric attributes plus embeddings
                        """

                        attr_list = [
                            ids_onehot, deltas, condition, price, rank_freq
                        ] + [np.array(_x)[:, None] for _x in x]
                        if USE_EMB:
                            emb = predict_model(
                                get_sentence_model(),
                                query_list=[dct[k] for k in L_k],
                                return_emb=True)
                            emb = np.reshape(emb[:, 0:(EMB_SIZE // 512), :],
                                             (emb.shape[0], EMB_SIZE))
                            attr_list.append(emb)

                        attrs = np.concatenate(attr_list, axis=1)
                        """ Create targets """
                        if mode == 'test':
                            targets = np.zeros((1, (CANDIDATES + 1)))
                        else:
                            _b1 = np.array(list(L_k == obj['item_bought']))
                            _b2 = np.array([str(dct_domain[k]) for k in L_k
                                            ]) == dct_domain[obj['item_bought']]

                            targets = _b1.astype(np.float32) * 1.0  #+ _b2.astype(np.float32)*0.0
                            if np.sum(targets) == 0:
                                targets = np.zeros((1, (CANDIDATES + 1)))
                                targets[0, -1] = 1
                                if not full:
                                    _end_overfit_avoid(L_k)
                                    continue
                            else:
                                targets = np.array(targets) / np.sum(targets)
                                targets = np.concatenate([
                                    targets[None, :],
                                    np.zeros((1, CANDIDATES + 1 - len(L_k)))
                                ],
                                                         axis=1)
                    """ Add attributes, targets. """
                    if attrs.shape[0] < SEQ_LEN:
                        attrs = np.concatenate([
                            np.zeros((
                                SEQ_LEN - attrs.shape[0],
                                attrs.shape[1],
                            )), attrs
                        ],
                                               axis=0)
                    attrs = attrs[-SEQ_LEN:, :]
                    attrs = attrs.astype(np.float32)
                    _end_overfit_avoid(L_k)

                    X.append(attrs[None, :])
                    Y.append(targets)
                    mask = np.concatenate([
                        np.ones((len(L_k))),
                        np.zeros((CANDIDATES + 1) - len(L_k))
                    ]).astype(np.float32)[None, :]
                    MASK.append(mask)

                    LKS.append(
                        np.concatenate([
                            L_k, -1 * np.ones(((CANDIDATES + 1) - len(L_k), ))
                        ])[None, :])
                    ACTUAL.append(np.array([obj['item_bought']])[None, :])

                if len(X) == batch_size:

                    X = np.concatenate(X, axis=0)
                    Y = np.concatenate(Y, axis=0)

                    MASK = np.concatenate(MASK, axis=0)
                    LKS = np.concatenate(np.array(LKS).astype(np.int32),
                                         axis=0)
                    ACTUAL = np.concatenate(np.array(ACTUAL).astype(np.int32),
                                            axis=0)

                    yield (X, MASK, LKS, ACTUAL), Y
                    X = []
                    Y = []
                    MASK = []
                    LKS = []
                    ACTUAL = []

                    #print(attrs.shape)
        if full:
            check = (lambda i: True)
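

# Hypothetical smoke test for meli_iterator (pulls one real batch; assumes the
# dataset files and the globals BATCH_SIZE, SEQ_LEN, CANDIDATES are configured):
if __name__ == '__main__':
    gen = meli_iterator(mode='val', batch_size=4)
    (X, MASK, LKS, ACTUAL), Y = next(gen)
    print(X.shape)     # (4, SEQ_LEN, num_attributes)
    print(Y.shape)     # (4, CANDIDATES + 1)
    print(MASK.shape)  # (4, CANDIDATES + 1)
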
def fit_RNN():
    import tensorflow as tf
    from tensorflow import keras
    import tf_geometric as tfg
    """
        Create graph
    """
    df = read_item_data()

    NUM_ITEMS = df.shape[0]
    NUM_FEATURES = 1

    counter, f_map_func, r_map_func = get_mappings()

    NUM_DOMS = pd.unique(df['domain_id']).shape[0]
    """ Load graph """
    G = ig.Graph.Read_Pickle(path.join(DATA_DIR, 'graph_item_to_item.pkl'))
    #weights = np.log(1+np.array(G.es["weight"]))
    weights = np.array(G.es["weight"])

    indices = np.array([np.array(e.tuple) for e in G.es])
    indices = np.transpose(indices)
    """ Create sparse matrix W """
    from scipy.sparse import coo_matrix
    import scipy.sparse
    row = indices[0, :]
    col = indices[1, :]

    W = coo_matrix((weights, (row, col)), shape=(NUM_ITEMS, NUM_ITEMS))
    """ Normalize rows """
    #W = deg_matrix(W,pwr=-1) @ W
    W = W.transpose()
    W = scipy.sparse.csr_matrix(W)
    assert scipy.sparse.issparse(W)

    @tf.function
    def smooth_labels(labels, factor=0.001):
        # smooth the labels
        labels = tf.cast(labels, tf.float32)
        labels *= (1 - factor)
        labels += (factor / tf.cast(tf.shape(labels)[1], tf.float32))
        # returned the smoothed labels
        return labels

    @tf.function
    def compute_loss(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_ITEMS))
        labels = tf.reshape(labels, (-1, NUM_ITEMS))

        #logits = tf.nn.softmax(logits)
        #print(logits)

        logits = smooth_labels(logits)
        labels = smooth_labels(labels)

        # standard cross-entropy: -sum(targets * log(predictions))
        losses = -tf.reduce_sum(labels * tf.math.log(logits), axis=1)

        return tf.reduce_mean(losses)

    @tf.function
    def evaluate(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_ITEMS))
        labels = tf.reshape(labels, (-1, NUM_ITEMS))

        #logits = tf.nn.softmax(logits)
        #print(logits)

        logits = smooth_labels(logits)
        labels = smooth_labels(labels)

        acc = tf.metrics.categorical_accuracy(labels, logits)

        return tf.reduce_mean(acc)

    """
        Read data, yadda yadda
    
    """
    from input.create_ratio import get_ratio
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()
    """ Ratio stuff """
    from input.create_ratio import get_ratio
    category_df = get_ratio(which='category_id', full=True)
    domain_df = get_ratio(which='domain_id', full=True)

    feat_1, feat_2, feat_3 = domain_df['searched'].to_dict(
    ), domain_df['bought'].to_dict(), domain_df['rat'].to_dict()

    feat_1, feat_2, feat_3 = [[X[dct_domain[k]] for k in df.index]
                              for X in [feat_1, feat_2, feat_3]]

    feat_1_1, feat_2_1, feat_3_1 = category_df['searched'].to_dict(
    ), category_df['bought'].to_dict(), category_df['rat'].to_dict()
    feat_1_1, feat_2_1, feat_3_1 = [[X[dct_cat[k]] for k in df.index]
                                    for X in [feat_1_1, feat_2_1, feat_3_1]]

    def standardize(x):
        return (x - np.min(x)) / (np.max(x) + 1e-06 - np.min(x))

    feat_1, feat_2, feat_3 = [standardize(x) for x in [feat_1, feat_2, feat_3]]

    feat_1_1, feat_2_1, feat_3_1 = [
        standardize(x) for x in [feat_1_1, feat_2_1, feat_3_1]
    ]

    del df
    del domain_df
    del category_df
    del G
    #dom_ratios = np.array([dct_ratio_dom[k] for k in pd.unique(df['domain_id'].values)])
    #dom_ratios = (dom_ratios - np.mean(dom_ratios)) / np.std(dom_ratios)

    from nn.domain_string_identifier import load_model
    domain_prediction_model = load_model()

    def my_generator(mode='train'):
        if mode == 'train':
            check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
        elif mode == 'val':
            check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
        else:
            check = lambda x: True
        DATA_PATH = path.join(
            DATA_DIR,
            'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
        print("Reading....")

        X = np.zeros((NUM_ITEMS, 10)).astype(np.float32)
        with jsonlines.open(DATA_PATH) as reader:
            for line_i, obj in enumerate(reader):

                if check(line_i):
                    L = []
                    S = []
                    C = []
                    IDS = []
                    for h in obj['user_history']:
                        if h['event_type'] == 'view':
                            L.append(dct_domain[h['event_info']])
                            C.append(dct_cat[h['event_info']])
                            IDS.append(h['event_info'])

                        elif h['event_type'] == 'search':
                            S.append(h['event_info'])

                    # skip sessions where the bought item was already viewed;
                    # L holds mapped domain ids here, so compare raw item ids
                    if obj['item_bought'] in IDS:
                        continue

                    L = f_map_func['domain_id'](L)
                    C = f_map_func['category_id'](C)
                    IDS_map = f_map_func['item_id'](IDS)
                    """ Adjust graph """
                    Y = np.zeros((NUM_ITEMS, 1)).astype(np.float32)
                    """
                        X[:,0] = feat_1
                        X[:,1] = feat_2
                        X[:,2] = feat_3
                        X[:,6] = feat_1_1
                        X[:,7] = feat_2_1
                        X[:,8] = feat_3_1
                        
                        #if len(S) > 0:
                        #    X[:,8] =  np.mean(predict_model(domain_prediction_model,S,return_numeric=True),axis=0)
                        """
                    target_id = f_map_func['item_id']([obj['item_bought']])[0]
                    if not mode == 'test':
                        Y[target_id, 0] = 1.0
                    """
                        for i,k in enumerate(IDS_map):
                            X[k,3] +=  1
                            X[k,4] +=  dct_ratio_item_b[IDS[i]]/len(C)
                            X[k,5] =  dct_price[IDS[i]]
                        
                        #W[target_id,:] = (np.clip(np.array(W[target_id,:].todense())-1,a_min=0.0,a_max=None))
                        X[:,9] = np.reshape(np.asarray(W @ X[:,3]),(-1,))
                        X[:,9] = X[:,8] * X[:,2]
                        #X[:,:8] = 0

                        for i in range(10):
                            X[:,i] = (X[:,i] - np.min(X[:,i])) / (1e-06+ np.max(X[:,i]) - np.min(X[:,i])) 
                        """
                    #X = X -0.5
                    yield X, Y

    """
        Optimize
    """

    BS = 2
    step = 0

    def batch_generator(mode, loop=True, batch_size=BS):
        BATCH_X = []
        BATCH_Y = []
        i = 0
        while True:
            for x, y in my_generator(mode):
                BATCH_X.append(x[None, :, :])
                BATCH_Y.append(y[None, :, :])
                i += 1
                if i % batch_size == 0:
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                    BATCH_X = []
                    BATCH_Y = []
                    i = 0
            if not loop:
                # flush the final partial batch, if any, then stop
                if BATCH_X:
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                break

    """
        Define train_model
    """
    import tensorflow.keras as keras
    import tensorflow.keras.layers as layers
    inp_x = keras.Input((NUM_ITEMS, 10))
    x = layers.Dense(32, activation='relu')(inp_x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dense(1)(x)
    x = layers.Flatten()(x)
    x = layers.Softmax(axis=-1)(x)

    train_model = keras.Model(inputs=[inp_x], outputs=[x])
    print(train_model.summary())
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.5 * 1e-2, decay_steps=1000,
        decay_rate=0.9)  # defined but not passed to the optimizer below
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.2 * 1e-2)

    train_model.compile(optimizer=optimizer,
                        loss=compute_loss,
                        metrics=[evaluate])
    from functools import partial
    from input.read_input import TRAIN_LINES
    train_model.fit(batch_generator('train', True),
                    steps_per_epoch=TRAIN_LINES // BS,
                    epochs=1)

    ITEM_PATH = path.join(DATA_DIR, 'train_model', 'item_classifier.h5')
    train_model.save_weights(ITEM_PATH)

    def predict(mode):
        PREDS = []
        CONFS = []
        NUM_SELECT = 10
        batch_size = 1
        for batch_id, X in enumerate(
                batch_generator(mode, batch_size=batch_size, loop=False)):
            x = X[0]
            print("Predicting {} - Batch {}".format(mode, batch_id))
            pred = train_model.predict_on_batch(x)
            if batch_id == 0:
                print(pred)
            PREDS.append(tf.argsort(pred, axis=-1)[:, -NUM_SELECT:])
            CONFS.append(tf.sort(pred, axis=-1)[:, -NUM_SELECT:])

        PREDS = np.concatenate(PREDS, axis=0)
        CONFS = np.concatenate(CONFS, axis=0)
        #PREDS = np.concatenate([PREDS,CONFS],axis=1)
        cols = ['pred_{}'.format(k) for k in range(NUM_SELECT)]
        fname = path.join(DATA_DIR, 'item_pred_{}.csv'.format(mode))
        pd.DataFrame(PREDS, index=range(PREDS.shape[0]),
                     columns=cols).to_csv(fname)

    predict('train')
    predict('val')
    predict('test')