def title_generator(mode='train'):
    """Yields (title-embedding, one-hot domain) batches over all items."""
    df = read_item_data()
    doms = pd.unique(df['domain_id'])
    NUM_DOMS = doms.size
    dom_to_id = dict([(x, i) for i, x in enumerate(doms)])
    BATCH_X = []
    BATCH_Y = []
    while True:
        for tit, dom in zip(df['title'], df['domain_id']):
            target = np.zeros((NUM_DOMS, ), dtype=np.float32)
            target[dom_to_id[dom]] = 1
            tit = preprocess_title(tit)
            embeddings = sentence_model.encode(tit)
            BATCH_X.append(embeddings[None, :, :])
            BATCH_Y.append(target[None, :])
            # Emit a batch once BS examples have accumulated.
            if len(BATCH_X) == BS:
                X = np.concatenate(BATCH_X, axis=0)
                Y = np.concatenate(BATCH_Y, axis=0)
                BATCH_X = []
                BATCH_Y = []
                yield X, Y
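# Usage sketch for the generator above (not called anywhere; assumes the
# module-level BS, sentence_model and preprocess_title that title_generator
# itself relies on):
def _demo_title_generator():
    gen = title_generator()
    X, Y = next(gen)
    # X: (BS, NUM_WORDS, 512) token embeddings, Y: (BS, NUM_DOMS) one-hot domains
    print(X.shape, Y.shape)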
def create_language():
    """Scores every title for Spanish/Portuguese/English with fasttext's LID model."""
    df = read_item_data()
    import fasttext
    import urllib.request
    model_fname = path.join(DATA_DIR, "lid.176.bin")
    if not path.isfile(model_fname):
        print("Did not find fasttext model at {}".format(model_fname))
        print("Trying to download from the web...")
        try:
            urllib.request.urlretrieve(
                "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
                model_fname)
        except Exception:
            raise Exception("Could not get fasttext model")
        if not path.isfile(model_fname):
            raise Exception("Could not get fasttext model")
    else:
        print("Fasttext model found at {}".format(model_fname))
    lid_model = fasttext.FastText.load_model(model_fname)

    def get_language(i, x):
        print(i)
        # Predict over all languages so every requested label is present.
        languages, scores = lid_model.predict(str(x), k=999, threshold=-1.0)
        languages = np.array(languages)
        return (scores[np.where(languages == '__label__es')[0][0]],
                scores[np.where(languages == '__label__pt')[0][0]],
                scores[np.where(languages == '__label__en')[0][0]])

    X = np.array(
        [get_language(i, x) for i, x in enumerate(df['title'].values)])
    for i, c in enumerate(['score_es', 'score_pt', 'score_en']):
        df[c] = X[:, i]
    df.loc[:, ['score_es', 'score_pt', 'score_en']].to_csv(
        path.join(DATA_DIR, 'language_identification.csv'))
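# Sketch of reading the scores written by create_language() back in;
# load_language_df in input.create_ratio presumably does the equivalent:
def _demo_load_language_scores():
    df2 = pd.read_csv(path.join(DATA_DIR, 'language_identification.csv'),
                      index_col=0)
    return (df2['score_pt'].to_dict(), df2['score_en'].to_dict(),
            df2['score_es'].to_dict())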
def train_neural_domain_prediction():
    import tensorflow as tf
    """ Create graph """
    from input.read_input import read_item_data
    df = read_item_data()
    dct_condition = df['condition'].to_dict()
    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()

    NUM_ITEMS = read_item_data().shape[0]
    NUM_FEATURES = 1
    from input.read_input import get_mappings, NUM_DOMS
    counter, f_map_func, r_map_func = get_mappings()
    NUM_DOMS = pd.unique(df['domain_id']).shape[0]
    NUM_CATS = pd.unique(df['category_id']).shape[0]

    """ Load graph """
    graph_fname = path.join(DATA_DIR, 'graph_domain_to_domain.pkl')
    if not path.isfile(graph_fname):
        input("Did not find graph at {}. Will have to create it from scratch... (Any key to continue)".format(graph_fname))
        G = create_graph_domain()
    else:
        G = ig.Graph.Read_Pickle(graph_fname)
    #weights = np.log(1+np.array(G.es["weight"]))
    weights = np.array(G.es["weight"])
    indices = np.array([np.array(e.tuple) for e in G.es]) - NUM_ITEMS
    indices = np.transpose(indices)

    """ Create sparse matrix W """
    from scipy.sparse import coo_matrix
    import scipy.sparse
    row = indices[0, :]
    col = indices[1, :]
    W = coo_matrix((weights, (row, col)), shape=(NUM_DOMS, NUM_DOMS))

    """ Normalize rows """
    #W = deg_matrix(W,pwr=-1) @ W
    W = W.transpose()
    W = scipy.sparse.csr_matrix(W)
    assert scipy.sparse.issparse(W)

    @tf.function
    def smooth_labels(labels, factor=0.001):
        # smooth the labels
        labels = tf.cast(labels, tf.float32)
        labels *= (1 - factor)
        labels += (factor / tf.cast(tf.shape(labels)[1], tf.float32))
        # return the smoothed labels
        return labels

    @tf.function
    def compute_loss(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_DOMS))
        labels = tf.reshape(labels, (-1, NUM_DOMS))
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        # Note: this is the reverse cross-entropy H(logits, labels); the
        # smoothing above keeps log(labels) finite.
        losses = -tf.reduce_sum(logits * tf.math.log(labels), axis=1)
        return tf.reduce_mean(losses)

    @tf.function
    def evaluate(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_DOMS))
        labels = tf.reshape(labels, (-1, NUM_DOMS))
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        acc = tf.metrics.categorical_accuracy(labels, logits)
        return tf.reduce_mean(acc)

    """ Read data, yadda yadda """
    from input.create_ratio import get_ratio
    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['searched'].to_dict()  # note: 'searched' here; elsewhere this is built from 'rat'

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    category_df = get_ratio(which='category_id', full=True)
    domain_df = get_ratio(which='domain_id', full=True)
    feat_1, feat_2, feat_3 = domain_df['searched'].values, domain_df['bought'].values, domain_df['rat'].values
    feat_4, feat_5 = domain_df['out_bought'].values, domain_df['rat2'].values
    feat_1_1, feat_2_1, feat_3_1 = category_df['searched'].values, category_df['bought'].values, category_df['rat'].values

    def standardize(x):
        return (x - np.min(x)) / (np.max(x) + 1e-06 - np.min(x))

    feat_1, feat_2, feat_3 = [standardize(x) for x in [feat_1, feat_2, feat_3]]
    feat_1_1, feat_2_1, feat_3_1 = [standardize(x) for x in [feat_1_1, feat_2_1, feat_3_1]]
    #dom_ratios = np.array([dct_ratio_dom[k] for k in pd.unique(df['domain_id'].values)])
    #dom_ratios = (dom_ratios - np.mean(dom_ratios)) / np.std(dom_ratios)
    from nn.domain_string_identifier import load_model
    domain_prediction_model = load_model()

    def my_generator(mode='train'):
        if mode == 'train':
            check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
        elif mode == 'val':
            check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
        else:
            check = lambda x: True
        DATA_PATH = path.join(DATA_DIR, 'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
        print("Reading....")
        with jsonlines.open(DATA_PATH) as reader:
            for line_i, obj in enumerate(reader):
                if not check(line_i):
                    continue
                L = []
                S = []
                C = []
                IDS = []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        L.append(dct_domain[h['event_info']])
                        C.append(dct_cat[h['event_info']])
                        IDS.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        S.append(h['event_info'])
                L = f_map_func['domain_id'](L)
                C = f_map_func['category_id'](C)
                # Per-view feature frame (renamed from df to avoid shadowing
                # the item table above).
                hist_df = pd.DataFrame(
                    {"domain_id": L,
                     "feat_1_1": [feat_1_1[C[i] - NUM_ITEMS - NUM_DOMS] for i in range(len(L))],
                     "feat_2_1": [feat_2_1[C[i] - NUM_ITEMS - NUM_DOMS] for i in range(len(L))],
                     "feat_3_1": [feat_3_1[C[i] - NUM_ITEMS - NUM_DOMS] for i in range(len(L))],
                     },
                    index=IDS)
                hist_df['recency'] = range(len(L))
                hist_df['freq'] = np.ones((len(L), ))
                hist_df['price'] = [dct_price[k] for k in IDS]
                hist_df['item_b'] = [dct_ratio_item_b[k] for k in IDS]
                hist_df['item_s'] = [dct_ratio_item_s[k] for k in IDS]
                hist_df['condition'] = [dct_condition[k] for k in IDS]
                hist_df['lan_pt'] = [dct_lan_pt[k] for k in IDS]
                hist_df['lan_en'] = [dct_lan_en[k] for k in IDS]
                hist_df['lan_es'] = [dct_lan_es[k] for k in IDS]

                """ Adjust graph """
                Y = np.zeros((NUM_DOMS, 1)).astype(np.float32)
                X = np.zeros((NUM_DOMS, 55 + 55)).astype(np.float32)
                X[:, 0] = feat_1
                X[:, 1] = feat_2
                X[:, 2] = feat_3
                X[:, 3] = feat_4
                i = 4
                for g, df2 in hist_df.groupby(["domain_id"]):
                    # Per-domain aggregates of the per-view features:
                    # sum, mean, std and max of each column.
                    i = 4
                    v = df2.to_numpy()[:, 1:]
                    X[g - NUM_ITEMS, i:i + (v.shape[1])] = np.sum(v, axis=0)
                    i += v.shape[1]
                    X[g - NUM_ITEMS, i:i + (v.shape[1])] = np.mean(v, axis=0)
                    i += v.shape[1]
                    X[g - NUM_ITEMS, i:i + (v.shape[1])] = np.nanstd(v, axis=0)
                    i += v.shape[1]
                    X[g - NUM_ITEMS, i:i + (v.shape[1])] = np.max(v, axis=0)
                    i += v.shape[1]
                if len(S) > 0:
                    s_pred = predict_model(domain_prediction_model, S, return_numeric=True)
                else:
                    # No searches: zero scores for every domain.
                    s_pred = np.zeros((1, NUM_DOMS))
                if len(S) > 0:
                    X[:, i] = np.mean(s_pred, axis=0)
                    X[:, i + 1] = np.max(s_pred, axis=0)
                    try:
                        X[:, i + 2] = np.nanstd(s_pred, axis=0)
                    except Exception:
                        pass
                i += 3
                # Propagate the raw feature block (first 55 columns) one hop
                # through the domain graph into the second half of X.
                X[:, 55:] = np.reshape(np.asarray(W @ X[:, :55]), (-1, X.shape[1] - 55))
                if not mode == 'test':
                    Y[f_map_func['domain_id']([dct_domain[obj['item_bought']]])[0] - NUM_ITEMS, 0] = 1.0
                #X[:,:8] = 0
                for i in range(55 + 3):
                    X[:, i] = (X[:, i] - np.min(X[:, i])) / (1e-06 + np.max(X[:, i]) - np.min(X[:, i]))
                #X = X - 0.5
                yield X, Y

    """ Optimize """
    BS = 64
    step = 0

    def batch_generator(mode, loop=True, batch_size=BS):
        BATCH_X = []
        BATCH_Y = []
        i = 0
        while True:
            for x, y in my_generator(mode):
                BATCH_X.append(x[None, :, :])
                BATCH_Y.append(y[None, :, :])
                i += 1
                if i % batch_size == 0:
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                    BATCH_X = []
                    BATCH_Y = []
                    i = 0
            if loop == False:
                if BATCH_X:  # flush the final partial batch
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                break

    """ Define model """
    import tensorflow.keras as keras
    import tensorflow.keras.layers as layers
    inp_x = keras.Input((NUM_DOMS, 55 + 55))
    x = layers.Dense(64, activation='relu')(inp_x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(1)(x)
    x = layers.Flatten()(x)
    x = layers.Softmax(axis=-1)(x)
    model = keras.Model(inputs=[inp_x], outputs=[x])
    model.summary()
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.5 * 1e-2, decay_steps=1000,
        decay_rate=0.9)  # defined but not passed to the optimizer below
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1 * 1e-2)
    model_fname = path.join(DATA_DIR, 'model', "NEURAL_DOMAIN_PRED.h5")
    model.compile(optimizer=optimizer, loss=compute_loss, metrics=[evaluate])
    from functools import partial
    from input.read_input import TRAIN_LINES
    #model.load_weights(path.join(DATA_DIR,"MY_MODEL_2.h5"))
    if not path.isfile(model_fname):
        input("Warning!!! Did not find model weights at {}. Training takes many, many, many hours! (Press ENTER)".format(model_fname))
        model.fit_generator(batch_generator('train', True),
                            steps_per_epoch=TRAIN_LINES // BS,
                            epochs=5)
        model.save_weights(model_fname)
    else:
        model.load_weights(model_fname)
        print("Testing fit... should be about 0.41 to 0.45")
        model.fit_generator(batch_generator('train', True),
                            steps_per_epoch=25,
                            epochs=1)

    def predict(mode):
        PREDS = []
        CONFS = []
        NUM_SELECT = 10
        batch_size = 320
        for batch_id, X in enumerate(batch_generator(mode, batch_size=batch_size, loop=False)):
            x = X[0]
            print("Predicting {} - Batch {}".format(mode, batch_id))
            pred = model.predict_on_batch(x)
            if batch_id == 0:
                print(pred)
            PREDS.append(tf.argsort(pred, axis=-1)[:, -NUM_SELECT:])
            CONFS.append(tf.sort(pred, axis=-1)[:, -NUM_SELECT:])
        PREDS = np.concatenate(PREDS, axis=0)
        CONFS = np.concatenate(CONFS, axis=0)
        PREDS = np.concatenate([PREDS, CONFS], axis=1)
        cols = ['pred_{}'.format(k) for k in range(NUM_SELECT)] + \
               ['conf_{}'.format(k) for k in range(NUM_SELECT)]
        fname = os.path.join(DATA_DIR, 'dom_pred_{}.csv'.format(mode))
        pd.DataFrame(PREDS, index=range(PREDS.shape[0]), columns=cols).to_csv(fname)

    predict('val')
    predict('test')
    predict('train')
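# Standalone sketch of the top-K extraction used in predict() above:
# argsort/sort ascending along the last axis, then keep the last K columns.
def _demo_topk_extraction():
    import tensorflow as tf
    pred = tf.constant([[0.1, 0.5, 0.2, 0.9]])
    idx = tf.argsort(pred, axis=-1)[:, -2:]   # [[1, 3]]: indices of the 2 largest
    conf = tf.sort(pred, axis=-1)[:, -2:]     # [[0.5, 0.9]]: their scores
    return idx, conf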
def create_graph_domain():
    """
    Creates graph linking (domain searched, domain bought)
    """
    """ Fetch data """
    from input.read_input import read_item_data
    df = read_item_data()
    df['item_id'] = df.index
    dct_title = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()

    """ JSON """
    check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    DATA_PATH = path.join(DATA_DIR, 'train_dataset.jl')
    line_i = 0

    """ Create graph vertices """
    g = ig.Graph()
    from input.read_input import get_mappings
    counter, f_map_func, r_map_func = get_mappings()
    num_items = df.shape[0]
    for k in dct_title.keys():
        g.add_vertex(value=k, deg=dct_ratio_item_b[k], domain_id=dct_domain[k],
                     price=dct_price[k], cat='item_id')
    """ ['item_id','domain_id','category_id','product_id'] """
    for k in pd.unique(df['domain_id']):
        g.add_vertex(value=k, cat='domain_id')
    for k in pd.unique(df['category_id']):
        g.add_vertex(value=k, cat='category_id')
    for k in pd.unique(df['product_id']):
        g.add_vertex(value=k, cat='product_id')

    """ Create edges """
    E1 = []
    E2 = []
    with jsonlines.open(DATA_PATH) as reader:
        for line_i, obj in enumerate(reader):
            if check(line_i):
                print(line_i)
                L = []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        L.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        pass
                L_domain = pd.unique([dct_domain[k] for k in L])
                L_cat = pd.unique([dct_cat[k] for k in L])
                # One edge per viewed item: (domain viewed) -> (domain bought).
                for i in range(len(L)):
                    E1.append(dct_domain[L[i]])
                    E2.append(dct_domain[obj['item_bought']])
    E1 = f_map_func['domain_id'](E1)
    E2 = f_map_func['domain_id'](E2)
    E = pd.Series(list(zip(E1, E2))).value_counts()
    g.add_edges(E.index)
    g.es["weight"] = E.values
    g.write_pickle(fname=path.join(DATA_DIR, 'graph_domain_to_domain.pkl'))
    # Return the graph so callers (e.g. train_neural_domain_prediction) can
    # use it directly instead of re-reading the pickle.
    return g
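# Tiny demonstration of the edge-weighting trick used above: counting
# duplicate (source, target) pairs with value_counts yields one weighted
# edge per distinct pair.
def _demo_edge_aggregation():
    E1 = [0, 0, 2]
    E2 = [1, 1, 1]
    E = pd.Series(list(zip(E1, E2))).value_counts()
    # E.index -> [(0, 1), (2, 1)]; E.values -> [2, 1]
    return list(E.index), list(E.values)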
train_model = get_model()
from tensorflow import TensorShape as ts
import tensorflow.keras as keras
train_ds = tf.data.Dataset.from_generator(
    title_generator,
    output_types=(tf.float32, tf.float32),
    output_shapes=(ts([None, NUM_WORDS, 512]), ts([None, NUM_DOMS])))
train_model.load_weights(DOMAIN_IDENTIFIER_PATH)
train_model.fit(x=train_ds, steps_per_epoch=TRAIN_LINES // BS, epochs=1)
# Calling `save('my_model')` creates a SavedModel folder `my_model`.
train_model.save_weights(DOMAIN_IDENTIFIER_PATH)
doms = pd.unique(read_item_data()['domain_id'])


def load_model():
    train_model = get_model()
    train_model.load_weights(DOMAIN_IDENTIFIER_PATH)
    return train_model


def predict_model(train_model, query_list, return_numeric=False, return_emb=False):
    """ Returns prediction of train_model on batch of input """
    # The original body is missing here; this is a minimal sketch inferred
    # from the call sites in this repo. It assumes the module-level
    # sentence_model and preprocess_title used by title_generator above.
    embeddings = np.stack(
        [sentence_model.encode(preprocess_title(q)) for q in query_list], axis=0)
    if return_emb:
        return embeddings  # (len(query_list), NUM_WORDS, 512)
    pred = train_model.predict(embeddings)
    if return_numeric:
        return pred  # (len(query_list), NUM_DOMS) domain scores
    return doms[np.argmax(pred, axis=-1)]
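# Usage sketch: score a couple of (hypothetical) search queries against every
# domain with the trained title classifier, mirroring how the graph models
# call predict_model with return_numeric=True.
def _demo_domain_prediction():
    m = load_model()
    scores = predict_model(m, ["celular samsung", "zapatillas running"],
                           return_numeric=True)
    # scores: (2, NUM_DOMS); scores.argmax(-1) indexes into `doms` above
    return scores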
def create_ratio(mode='train', CUTOFF=50, which='domain_id', alternate=False):
    assert mode in ['train', 'val']
    assert which in [
        'domain_id', 'category_id', 'item_id', 'price', 'condition'
    ]
    df = read_item_data()
    df['price'] = pd.qcut(df['price'].values, 100)
    dct_attr = df[which].to_dict()
    dct_dom = df['domain_id'].to_dict()
    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        raise Exception("mode must be train or val")
    DATA_PATH = path.join(DATA_DIR, 'train_dataset.jl')
    i = 0

    """ Create dictionaries holding attribute counts (searched, bought, bought elsewhere) """
    attr_s = dict([(k, 0) for k in pd.unique(df[which])])
    attr_b = dict([(k, 0) for k in pd.unique(df[which])])
    attr_o = dict([(k, 0) for k in pd.unique(df[which])])
    with jsonlines.open(DATA_PATH) as reader:
        for obj in reader:
            if check(i):
                L = []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        L.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        pass
                # Keep the last occurrence of each viewed item, preserving order.
                L_k = pd.unique(L[::-1])[::-1]
                attr_unique = list(pd.unique([dct_attr[k] for k in L_k]))
                for dom in attr_unique:
                    if dom in attr_s:
                        attr_s[dom] += 1
                if alternate:
                    for attr in attr_unique:
                        if dct_dom[attr] == dct_dom[obj['item_bought']]:
                            attr_b[attr] += 1
                        else:
                            attr_o[attr] += 1
                else:
                    if dct_attr[obj['item_bought']] in attr_unique:
                        attr_b[dct_attr[obj['item_bought']]] += 1
                    else:
                        attr_o[dct_attr[obj['item_bought']]] += 1
            i += 1

    attr_b, attr_s = pd.DataFrame.from_dict(attr_b, orient='index'),\
                     pd.DataFrame.from_dict(attr_s, orient='index')
    attr_o = pd.DataFrame.from_dict(attr_o, orient='index')
    attr_b.columns, attr_s.columns, attr_o.columns = ['bought'], ['searched'], ['out_bought']
    attr_b['bought'] = attr_b['bought'].values.astype(np.float32)
    attr_s['searched'] = attr_s['searched'].values.astype(np.float32)

    # rat = bought / (1 + searched); low-traffic rows fall back to the mean.
    rat = attr_b['bought'].values / (1.0 + attr_s['searched'].values)
    rat[attr_s['searched'].values < CUTOFF] = np.mean(
        rat[attr_s['searched'].values >= CUTOFF])
    rat2 = attr_o['out_bought'].values / (1.0 + attr_b['bought'].values)
    rat2[attr_s['searched'].values < CUTOFF] = np.mean(
        rat2[attr_s['searched'].values >= CUTOFF])
    rat = pd.DataFrame({"rat": np.array(rat)}, index=attr_b.index)
    rat2 = pd.DataFrame({"rat2": np.array(rat2)}, index=attr_b.index)
    res = pd.concat([attr_s, attr_b, attr_o, rat, rat2], axis=1)
    if alternate:
        res.to_csv(path.join(DATA_DIR, '{}_ratio_alternate.csv'.format(which)))
    else:
        res.to_csv(path.join(DATA_DIR, '{}_ratio.csv'.format(which)))
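# Sketch of consuming create_ratio()'s output; get_ratio elsewhere in this
# repo presumably reads these CSVs back in the same way.
def _demo_read_ratio_csv():
    res = pd.read_csv(path.join(DATA_DIR, 'domain_id_ratio.csv'), index_col=0)
    # 'rat' is bought/(1+searched), with low-traffic rows (searched < CUTOFF)
    # replaced by the mean ratio of the remaining rows.
    return res['rat'].to_dict()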
def meli_iterator(mode='train', batch_size=BATCH_SIZE, full=False):
    from input.read_input import get_sentence_model, get_emb
    from input.create_ratio import load_language_df
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()
    dct_condition = df['condition'].to_dict()
    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_p = ratio_df['popularity'].to_dict()

    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()
    df['item_bought'] = [dct_ratio_item_b[k] for k in df.index]

    dct_ratio_cat = get_ratio(which='category_id', full=True)
    dct_ratio_cat_s, dct_ratio_cat_b, dct_ratio_cat = dct_ratio_cat['searched'].to_dict(),\
                                                      dct_ratio_cat['bought'].to_dict(),\
                                                      dct_ratio_cat['rat'].to_dict()
    dct_ratio_dom = get_ratio(which='domain_id', full=True)
    dct_ratio_dom_s, dct_ratio_dom_b, dct_ratio_dom = dct_ratio_dom['searched'].to_dict(),\
                                                      dct_ratio_dom['bought'].to_dict(),\
                                                      dct_ratio_dom['rat'].to_dict()
    dct_ratio_item = get_ratio(which='item_id')

    dct_domain_df = {}
    dct_cat_df = {}
    for dom, df2 in df.groupby('domain_id'):
        df2 = df2.sort_values(['item_bought'], ascending=False)  #.iloc[0:10,:]
        dct_domain_df[dom] = df2
    for cat, df2 in df.groupby('category_id'):
        df2 = df2.sort_values(['item_bought'], ascending=False)  #.iloc[0:10,:]
        dct_cat_df[cat] = df2
    del df
    del df2

    def _begin_overfit_avoid(L_k):
        # During training, temporarily remove the current session's
        # contribution from the global counts so the features cannot leak
        # the label.
        if not mode == 'train':
            return
        target_item = obj['item_bought']
        target_dom = dct_domain[obj['item_bought']]
        target_cat = dct_cat[obj['item_bought']]
        for this_item in L_k:
            """ Bought """
            if this_item == target_item:
                assert dct_ratio_item_b[this_item] > 0
                dct_ratio_item_b[this_item] -= 1
            """ Search """
            dct_ratio_item_s[this_item] -= 1
            assert dct_ratio_item_s[this_item] >= 0
            """ Ratio """
            dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (
                dct_ratio_item_s[this_item] + 1)
        for this_dom in pd.unique([dct_domain[k] for k in L_k]):
            if not isinstance(this_dom, str):
                continue
            """ Bought """
            if this_dom == target_dom:
                assert dct_ratio_dom_b[this_dom] > 0
                dct_ratio_dom_b[this_dom] -= 1
            """ Search """
            dct_ratio_dom_s[this_dom] -= 1
            assert dct_ratio_dom_s[this_dom] >= 0
            """ Ratio """
            dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (
                dct_ratio_dom_s[this_dom] + 1)
        for this_cat in pd.unique([dct_cat[k] for k in L_k]):
            """ Bought """
            if this_cat == target_cat:
                assert dct_ratio_cat_b[this_cat] > 0
                dct_ratio_cat_b[this_cat] -= 1
            """ Search """
            dct_ratio_cat_s[this_cat] -= 1
            assert dct_ratio_cat_s[this_cat] >= 0
            """ Ratio """
            dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (
                dct_ratio_cat_s[this_cat] + 1)

    def _end_overfit_avoid(L_k):
        # Restore the counts removed by _begin_overfit_avoid.
        if not mode == 'train':
            return
        target_item = obj['item_bought']
        target_dom = dct_domain[obj['item_bought']]
        target_cat = dct_cat[obj['item_bought']]
        for this_item in L_k:
            """ Bought """
            if this_item == target_item:
                #assert dct_ratio_item_b[this_item] >= 0
                dct_ratio_item_b[this_item] += 1
            """ Search """
            #assert dct_ratio_item_s[this_item] >= 0
            dct_ratio_item_s[this_item] += 1
            """ Ratio """
            dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (
                dct_ratio_item_s[this_item] + 1)
        for this_dom in pd.unique([dct_domain[k] for k in L_k]):
            if not isinstance(this_dom, str):
                continue
            """ Bought """
            if this_dom == target_dom:
                #assert dct_ratio_dom_b[this_dom] >= 0
                dct_ratio_dom_b[this_dom] += 1
            """ Search """
            #assert dct_ratio_dom_s[this_dom] >= 0
            dct_ratio_dom_s[this_dom] += 1
            """ Ratio """
            dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (
                dct_ratio_dom_s[this_dom] + 1)
        for this_cat in pd.unique([dct_cat[k] for k in L_k]):
            """ Bought """
            if this_cat == target_cat:
                #assert dct_ratio_cat_b[this_cat] >= 0
                dct_ratio_cat_b[this_cat] += 1
            """ Search """
            #assert dct_ratio_cat_s[this_cat] >= 0
            dct_ratio_cat_s[this_cat] += 1
            """ Ratio """
            dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (
                dct_ratio_cat_s[this_cat] + 1)

    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        check = lambda x: True
    DATA_PATH = path.join(
        DATA_DIR, 'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')

    def rank_to_order(L, rank):
        assert rank.shape[0] == L.shape[0]
        return L[(-rank).argsort(kind='mergesort')]

    pred = {}
    actual = []
    X = []
    Y = []
    MASK = []
    LKS = []
    ACTUAL = []
    while True:
        with jsonlines.open(DATA_PATH) as reader:
            print("Start!!!")
            for line_id, obj in enumerate(reader):
                if not check(line_id):
                    continue
                L = []
                timestamps = []
                dct_emb = {}
                if mode == 'test':
                    obj['item_bought'] = -999
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        L.append(h['event_info'])
                        timestamps.append(pd.Timestamp(h['event_timestamp']))
                    elif h['event_type'] == 'search':
                        pass

                def divide_time(d):
                    # Split a timedelta into (days, hours, minutes), scaling
                    # each component into roughly [0, 1].
                    d = pd.Timedelta(d).total_seconds()
                    MINUTE_M = 60
                    HOUR_M = MINUTE_M * 60
                    DAY_M = HOUR_M * 24
                    div = [1, 24, 60]
                    res = [0, 0, 0]
                    for i, M in enumerate([DAY_M, HOUR_M, MINUTE_M]):
                        res[i] = np.floor(d / M)
                        d -= M * res[i]
                        res[i] /= div[i]
                        #res[i] -= 0.5
                    return tuple(res)

                if not full and len(L) < 2:
                    continue

                """ Create attributes """
                if len(L) == 0:
                    attrs = np.zeros(
                        (1, (CANDIDATES + 1) + ATTR_SIZE + EMB_SIZE))
                    targets = np.zeros((1, (CANDIDATES + 1)))
                    targets[0, -1] = 0
                    L_k = []
                else:
                    delta = [
                        timestamps[-1] - timestamps[i]
                        for i in range(0, len(timestamps))
                    ]
                    """ We'll use the latest delta """
                    L = L[::-1]
                    u, unique_id = np.unique(np.array(L), return_index=True)
                    deltas = np.array([divide_time(d) for d in delta])
                    deltas = deltas[unique_id][:SEQ_LEN]
                    L_k = np.array(L)[unique_id][:CANDIDATES]
                    _begin_overfit_avoid(L_k)
                    """ rank_freq initial calculation needs whole L """
                    rank_freq = pd.Series(L, index=range(len(L))).value_counts(
                        sort=False, normalize=True)
                    rank_freq = rank_freq.rank(method="average").to_dict()
                    L = np.array(L)[unique_id][:SEQ_LEN]

                    """ Calculate ranks """
                    condition = np.array([
                        1.0 if dct_condition[k] == 'new' else 0.0 for k in L
                    ])[:, None]
                    price = np.log(
                        np.array([
                            1 + np.abs(fix_na(dct_price[k])) for k in L
                        ])[:, None])
                    rank_freq = np.array([rank_freq[k] for k in L])[:, None]
                    rank_ratio_dom = pd.Series([
                        dct_ratio_dom[dct_domain[k]] for k in L_k
                    ]).rank(method="average").to_numpy()
                    rank_ratio_cat = pd.Series([
                        dct_ratio_cat[dct_cat[k]] for k in L_k
                    ]).rank(method="average").to_numpy()
                    rank_ratio_item = pd.Series([
                        dct_ratio_item_r[k] for k in L_k
                    ]).rank(method="average").to_numpy()
                    rank_latest = (1.0 - np.arange(len(L)) / len(L))

                    x = []
                    x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
                    x.append(rank_ratio_dom)
                    x.append(rank_ratio_cat)
                    x.append(rank_ratio_item)
                    x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
                    x.append([dct_ratio_cat[dct_cat[k]] for k in L_k])
                    x.append([dct_ratio_item_b[k] for k in L_k])
                    x.append([dct_ratio_item_s[k] for k in L_k])
                    x.append([dct_ratio_item_r[k] for k in L_k])
                    x.append(list(rank_latest / len(L_k)))
                    x.append([-dct_price[k] for k in L_k])
                    # condition is a string ('new'/'used'); encode it
                    # numerically, as in `condition` above, before negating.
                    x.append([-1.0 if dct_condition[k] == 'new' else 0.0
                              for k in L_k])
                    x.append([-dct_lan_en[k] for k in L_k])
                    x.append([-dct_lan_es[k] for k in L_k])
                    x.append([-dct_lan_pt[k] for k in L_k])

                    assert all([k in L for k in L_k])
                    ids = [
                        np.where(L_k == l)[0][0] if l in L_k else CANDIDATES
                        for l in L
                    ]
                    ids_onehot = np.zeros((len(L), (CANDIDATES + 1)))
                    ids_onehot[np.arange(len(L)), ids] = 1

                    """ Create numeric attributes plus embeddings """
                    attr_list = [
                        ids_onehot, deltas, condition, price, rank_freq
                    ] + [np.array(_x)[:, None] for _x in x]
                    if USE_EMB:
                        emb = predict_model(get_sentence_model(),
                                            query_list=[dct[k] for k in L_k],
                                            return_emb=True)
                        emb = np.reshape(emb[:, 0:(EMB_SIZE // 512), :],
                                         (emb.shape[0], EMB_SIZE))
                        attr_list.append(emb)
                    attrs = np.concatenate(attr_list, axis=1)

                    """ Create targets """
                    if mode == 'test':
                        targets = np.zeros((1, (CANDIDATES + 1)))
                    else:
                        _b1 = np.array(list(L_k == obj['item_bought']))
                        _b2 = np.array(list([str(dct_domain[k]) for k in L_k
                                             ])) == dct_domain[obj['item_bought']]
                        targets = _b1.astype(np.float32) * 1.0  #+ _b2.astype(np.float32)*0.0
                        if np.sum(targets) == 0:
                            # Bought item is not among the candidates: send the
                            # mass to the fallback bucket.
                            targets = np.zeros((1, (CANDIDATES + 1)))
                            targets[0, -1] = 1
                            if not full:
                                _end_overfit_avoid(L_k)
                                continue
                        else:
                            targets = np.array(targets) / np.sum(targets)
                            targets = np.concatenate([
                                targets[None, :],
                                np.zeros((1, CANDIDATES + 1 - len(L_k)))
                            ], axis=1)

                """ Add attributes, targets. """
                if attrs.shape[0] < SEQ_LEN:
                    attrs = np.concatenate([
                        np.zeros((SEQ_LEN - attrs.shape[0], attrs.shape[1])),
                        attrs
                    ], axis=0)
                attrs = attrs[-SEQ_LEN:, :]
                attrs = attrs.astype(np.float32)
                _end_overfit_avoid(L_k)
                X.append(attrs[None, :])
                Y.append(targets)
                mask = np.concatenate([
                    np.ones((len(L_k))),
                    np.zeros((CANDIDATES + 1) - len(L_k))
                ]).astype(np.float32)[None, :]
                MASK.append(mask)
                LKS.append(
                    np.concatenate([
                        L_k, -1 * np.ones(((CANDIDATES + 1) - len(L_k), ))
                    ])[None, :])
                ACTUAL.append(np.array([obj['item_bought']])[None, :])
                if len(X) == batch_size:
                    X = np.concatenate(X, axis=0)
                    Y = np.concatenate(Y, axis=0)
                    MASK = np.concatenate(MASK, axis=0)
                    LKS = np.concatenate(np.array(LKS).astype(np.int32), axis=0)
                    ACTUAL = np.concatenate(np.array(ACTUAL).astype(np.int32), axis=0)
                    yield (X, MASK, LKS, ACTUAL), Y
                    X = []
                    Y = []
                    MASK = []
                    LKS = []
                    ACTUAL = []
        if full:
            # After one full pass, keep iterating over every line.
            check = (lambda i: True)
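# Usage sketch: pull one batch from meli_iterator. X is the padded
# per-candidate attribute tensor, MASK flags real candidates, LKS holds the
# candidate item ids (padded with -1) and ACTUAL the bought item (-999 in
# test mode).
def _demo_meli_iterator():
    (X, MASK, LKS, ACTUAL), Y = next(meli_iterator(mode='val'))
    print(X.shape, MASK.shape, LKS.shape, ACTUAL.shape, Y.shape)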
def fit_RNN():
    import tensorflow as tf
    from tensorflow import keras
    import tf_geometric as tfg

    """ Create graph """
    df = read_item_data()
    NUM_ITEMS = read_item_data().shape[0]
    NUM_FEATURES = 1
    counter, f_map_func, r_map_func = get_mappings()
    NUM_DOMS = pd.unique(df['domain_id']).shape[0]

    """ Load graph """
    G = ig.Graph.Read_Pickle(path.join(DATA_DIR, 'graph_item_to_item.pkl'))
    #weights = np.log(1+np.array(G.es["weight"]))
    weights = np.array(G.es["weight"])
    indices = np.array([np.array(e.tuple) for e in G.es])
    indices = np.transpose(indices)

    """ Create sparse matrix W """
    from scipy.sparse import coo_matrix
    import scipy.sparse
    row = indices[0, :]
    col = indices[1, :]
    W = coo_matrix((weights, (row, col)), shape=(NUM_ITEMS, NUM_ITEMS))

    """ Normalize rows """
    #W = deg_matrix(W,pwr=-1) @ W
    W = W.transpose()
    W = scipy.sparse.csr_matrix(W)
    assert scipy.sparse.issparse(W)

    @tf.function
    def smooth_labels(labels, factor=0.001):
        # smooth the labels
        labels = tf.cast(labels, tf.float32)
        labels *= (1 - factor)
        labels += (factor / tf.cast(tf.shape(labels)[1], tf.float32))
        # return the smoothed labels
        return labels

    @tf.function
    def compute_loss(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_ITEMS))
        labels = tf.reshape(labels, (-1, NUM_ITEMS))
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        # Reverse cross-entropy, as in train_neural_domain_prediction().
        losses = -tf.reduce_sum(logits * tf.math.log(labels), axis=1)
        return tf.reduce_mean(losses)

    @tf.function
    def evaluate(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_ITEMS))
        labels = tf.reshape(labels, (-1, NUM_ITEMS))
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        acc = tf.metrics.categorical_accuracy(labels, logits)
        return tf.reduce_mean(acc)

    """ Read data, yadda yadda """
    from input.create_ratio import get_ratio
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    category_df = get_ratio(which='category_id', full=True)
    domain_df = get_ratio(which='domain_id', full=True)
    feat_1, feat_2, feat_3 = domain_df['searched'].to_dict(), \
        domain_df['bought'].to_dict(), domain_df['rat'].to_dict()
    feat_1, feat_2, feat_3 = [[X[dct_domain[k]] for k in df.index]
                              for X in [feat_1, feat_2, feat_3]]
    feat_1_1, feat_2_1, feat_3_1 = category_df['searched'].to_dict(), \
        category_df['bought'].to_dict(), category_df['rat'].to_dict()
    feat_1_1, feat_2_1, feat_3_1 = [[X[dct_cat[k]] for k in df.index]
                                    for X in [feat_1_1, feat_2_1, feat_3_1]]

    def standardize(x):
        return (x - np.min(x)) / (np.max(x) + 1e-06 - np.min(x))

    feat_1, feat_2, feat_3 = [standardize(x) for x in [feat_1, feat_2, feat_3]]
    feat_1_1, feat_2_1, feat_3_1 = [
        standardize(x) for x in [feat_1_1, feat_2_1, feat_3_1]
    ]
    del df
    del domain_df
    del category_df
    del G
    #dom_ratios = np.array([dct_ratio_dom[k] for k in pd.unique(df['domain_id'].values)])
    #dom_ratios = (dom_ratios - np.mean(dom_ratios)) / np.std(dom_ratios)

    from nn.domain_string_identifier import load_model
    domain_prediction_model = load_model()

    def my_generator(mode='train'):
        if mode == 'train':
            check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
        elif mode == 'val':
            check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
        else:
            check = lambda x: True
        DATA_PATH = path.join(
            DATA_DIR, 'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
        print("Reading....")
        X = np.zeros((NUM_ITEMS, 10)).astype(np.float32)
        with jsonlines.open(DATA_PATH) as reader:
            for line_i, obj in enumerate(reader):
                if not check(line_i):
                    continue
                L = []
                S = []
                C = []
                IDS = []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        L.append(dct_domain[h['event_info']])
                        C.append(dct_cat[h['event_info']])
                        IDS.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        S.append(h['event_info'])
                if obj['item_bought'] in L:
                    continue
                L = f_map_func['domain_id'](L)
                C = f_map_func['category_id'](C)
                IDS_map = f_map_func['item_id'](IDS)

                """ Adjust graph """
                Y = np.zeros((NUM_ITEMS, 1)).astype(np.float32)
                # NOTE: the triple-quoted blocks below are leftover
                # experiments that deactivate this feature code.
                """
                X[:,0] = feat_1
                X[:,1] = feat_2
                X[:,2] = feat_3
                X[:,6] = feat_1_1
                X[:,7] = feat_2_1
                X[:,8] = feat_3_1
                #if len(S) > 0:
                #    X[:,8] = np.mean(predict_model(domain_prediction_model,S,return_numeric=True),axis=0)
                """
                target_id = f_map_func['item_id']([obj['item_bought']])[0]
                if not mode == 'test':
                    Y[target_id, 0] = 1.0
                """
                for i,k in enumerate(IDS_map):
                    X[k,3] += 1
                    X[k,4] += dct_ratio_item_b[IDS[i]]/len(C)
                    X[k,5] = dct_price[IDS[i]]
                #W[target_id,:] = (np.clip(np.array(W[target_id,:].todense())-1,a_min=0.0,a_max=None))
                X[:,9] = np.reshape(np.asarray(W @ X[:,3]),(-1,))
                X[:,9] = X[:,8] * X[:,2]
                #X[:,:8] = 0
                for i in range(10):
                    X[:,i] = (X[:,i] - np.min(X[:,i])) / (1e-06+ np.max(X[:,i]) - np.min(X[:,i]))
                """
                if not mode == 'test':
                    Y[target_id, 0] = 0.0
                #X = X - 0.5
                yield X, Y

    """ Optimize """
    BS = 2
    step = 0

    def batch_generator(mode, loop=True, batch_size=BS):
        BATCH_X = []
        BATCH_Y = []
        i = 0
        while True:
            for x, y in my_generator(mode):
                BATCH_X.append(x[None, :, :])
                BATCH_Y.append(y[None, :, :])
                i += 1
                if i % batch_size == 0:
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                    BATCH_X = []
                    BATCH_Y = []
                    i = 0
            if loop == False:
                if BATCH_X:  # flush the final partial batch
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                break

    """ Define train_model """
    import tensorflow.keras as keras
    import tensorflow.keras.layers as layers
    inp_x = keras.Input((NUM_ITEMS, 10))
    x = layers.Dense(32, activation='relu')(inp_x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dense(1)(x)
    x = layers.Flatten()(x)
    x = layers.Softmax(axis=-1)(x)
    train_model = keras.Model(inputs=[inp_x], outputs=[x])
    train_model.summary()

    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.5 * 1e-2, decay_steps=1000,
        decay_rate=0.9)  # defined but not passed to the optimizer below
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.2 * 1e-2)
    train_model.compile(optimizer=optimizer, loss=compute_loss, metrics=[evaluate])
    from functools import partial
    from input.read_input import TRAIN_LINES
    train_model.fit_generator(batch_generator('train', True),
                              steps_per_epoch=TRAIN_LINES // BS,
                              epochs=1)
    ITEM_PATH = path.join(DATA_DIR, 'train_model', 'item_classifier.h5')
    train_model.save_weights(ITEM_PATH)

    def predict(mode):
        PREDS = []
        CONFS = []
        NUM_SELECT = 10
        batch_size = 1
        for batch_id, X in enumerate(
                batch_generator(mode, batch_size=batch_size, loop=False)):
            x = X[0]
            print("Predicting {} - Batch {}".format(mode, batch_id))
            pred = train_model.predict_on_batch(x)
            if batch_id == 0:
                print(pred)
            PREDS.append(tf.argsort(pred, axis=-1)[:, -NUM_SELECT:])
            CONFS.append(tf.sort(pred, axis=-1)[:, -NUM_SELECT:])
        PREDS = np.concatenate(PREDS, axis=0)
        CONFS = np.concatenate(CONFS, axis=0)
        #PREDS = np.concatenate([PREDS,CONFS],axis=1)
        cols = ['pred_{}'.format(k) for k in range(NUM_SELECT)]
        fname = os.path.join(DATA_DIR, 'item_pred_{}.csv'.format(mode))
        pd.DataFrame(PREDS, index=range(PREDS.shape[0]), columns=cols).to_csv(fname)

    predict('train')
    predict('val')
    predict('test')
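# Standalone sketch of the sparse feature-propagation step used in both
# fit_RNN() and train_neural_domain_prediction(): build a weighted adjacency
# with coo_matrix, transpose it, and push node features one hop along edges.
# (Toy values; the real code uses the igraph edge list and weights.)
def _demo_graph_propagation():
    from scipy.sparse import coo_matrix, csr_matrix
    row, col, w = np.array([0, 1]), np.array([1, 2]), np.array([2.0, 1.0])
    W = csr_matrix(coo_matrix((w, (row, col)), shape=(3, 3)).transpose())
    feats = np.arange(3, dtype=np.float32)[:, None]  # one feature per node
    return np.asarray(W @ feats)  # incoming-edge weighted sums per node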