hits_10 = hits_at_n_score(ranks, n=10)
print("Hits@10: %.2f" % (hits_10))
hits_3 = hits_at_n_score(ranks, n=3)
print("Hits@3: %.2f" % (hits_3))
hits_1 = hits_at_n_score(ranks, n=1)
print("Hits@1: %.2f" % (hits_1))

# Load the triples and drop placeholder rows with no item or no price.
data = pd.read_csv('triplet.csv')
data.drop(data[data['name'] == 'no pc_item'].index, inplace=True)
data.drop(data[data['prop'] == 'no price'].index, inplace=True)
print(data.head())

import itertools

# Look up an embedding for every unique PC item.
pcItem = data['name'].unique()
pcItem_embeddings = dict(zip(pcItem, model.get_embeddings(pcItem)))

ke = []
val = []
for k, v in pcItem_embeddings.items():
    ke.append(k)
    val.append(v)
embed_df = pd.DataFrame({'name': ke, 'embed': val})

# Attach the embeddings to the price table and expand them into one column
# per embedding dimension (embed_0, embed_1, ...).
price_df = pd.read_csv('item_price.csv')
price_df.drop(price_df[price_df['item_name'] == 'no pc_item'].index, inplace=True)
price_df.drop(price_df[price_df['price'] == 'no price'].index, inplace=True)
price_df['embed'] = price_df['item_name'].apply(lambda x: pcItem_embeddings[x])
price_df1 = pd.DataFrame(price_df.embed.values.tolist()).add_prefix('embed_')
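# Illustrative check of what hits_at_n_score reports (toy ranks, not the data
# above): it is simply the fraction of ranks that land in the top n.
import numpy as np
from ampligraph.evaluation import hits_at_n_score

toy_ranks = np.array([1, 3, 12, 2, 40])
print(hits_at_n_score(toy_ranks, n=10))  # 3 of the 5 ranks are <= 10 -> 0.60
print(np.mean(toy_ranks <= 10))          # the same fraction, computed by hand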
train_y, test_y, positives_filter = to_categorical(
    train_y, dtype=np.int32), to_categorical(
    test_y, dtype=np.int32), positives_filter.to_numpy(dtype=np.int32)
print("Shape of train_y: %s; Shape of test_y: %s; Shape of positives_filter: %s"
      % (train_y.shape, test_y.shape, positives_filter.shape))

# Feature Scaling: Normalize dataset via Generation of Embeddings
print("\nFeature Scaling: Embeddings Generation")
embed_dim = 100
embeds_model = ComplEx(k=embed_dim, verbose=True)
tf.compat.v1.logging.set_verbosity(
    tf.compat.v1.logging.ERROR)  # only log TensorFlow messages with severity ERROR
embeds_model.fit(positives_filter)

# Embed the source (column 0) and destination (column 2) entities of each triple.
embeds_source = embeds_model.get_embeddings(positives_filter[:, 0],
                                            embedding_type='entity')
embeds_dest = embeds_model.get_embeddings(positives_filter[:, 2],
                                          embedding_type='entity')
embeds = np.concatenate((embeds_source, embeds_dest), axis=1)

train_sz = train_X_temp.shape[0]
train_X, test_X = embeds[:train_sz, :], embeds[train_sz:, :]
# ComplEx entity embeddings hold 2 * embed_dim values (real and imaginary parts),
# so the concatenated source + destination features carry 4 * embed_dim values per
# sample; reshape them to (samples, n_timesteps=4, feat_per_timestep=embed_dim).
train_X = train_X.reshape(train_X.shape[0], 4, embed_dim)
test_X = test_X.reshape(test_X.shape[0], 4, embed_dim)
print("Shape of train_X: %s; Shape of train_y: %s" % (train_X.shape, train_y.shape))
print("Shape of test_X: %s; Shape of test_y: %s" % (test_X.shape, test_y.shape))
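# A minimal sketch of the reshape above, assuming AmpliGraph's ComplEx entity
# embeddings carry 2 * k values (real plus imaginary parts), so that source +
# destination features split cleanly into 4 timesteps of embed_dim values each.
import numpy as np

fake = np.zeros((2, 4 * 100))         # 2 fake samples with 4 * embed_dim features
print(fake.reshape(2, 4, 100).shape)  # (2, 4, 100) = (samples, timesteps, features)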
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
from incf.countryutils import transformations

print("Extracting Embeddings..")
id_to_name_map = {
    **dict(zip(df.home_team_id, df.home_team)),
    **dict(zip(df.away_team_id, df.away_team))
}
teams = pd.concat(
    (df.home_team_id[df["train"]], df.away_team_id[df["train"]])).unique()
team_embeddings = dict(zip(teams, model.get_embeddings(teams)))

# Project the team embeddings to 2D for plotting.
embeddings_2d = PCA(n_components=2).fit_transform(
    np.array([i for i in team_embeddings.values()]))
print(embeddings_2d)
first_embeddings = list(team_embeddings.values())[0]
print(first_embeddings)
print(first_embeddings.shape)
print(embeddings_2d.shape)

from ampligraph.discovery import find_clusters
from sklearn.cluster import KMeans

print("Clustering..")
clustering_algorithm = KMeans(n_clusters=6,
optimizer="adam", optimizer_params={"lr": 0.01})
model.fit(X['train'])

y_pred = model.predict(X['test'][:5, ])
from scipy.special import expit
print(expit(y_pred))  # squash raw scores into (0, 1) with the logistic sigmoid

ranks = evaluate_performance(X['test'][:10], model=model)
print(ranks)
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
print("MRR: %f, Hits@10: %f" % (mrr, hits_10))

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Project the entity embeddings to 2D with t-SNE and label each point.
embs = model.get_embeddings(embs_labels, embedding_type='entity')
embs_2d = TSNE(n_components=2).fit_transform(embs)
fig, ax = plt.subplots()
ax.scatter(embs_2d[:, 0], embs_2d[:, 1])
for i, lab in enumerate(embs_labels):
    ax.annotate(lab, (embs_2d[i, 0], embs_2d[i, 1]))
plt.show()
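# What mrr_score computes, shown on toy ranks (illustrative values, not model
# output): the mean reciprocal rank, np.mean(1 / ranks).
import numpy as np
from ampligraph.evaluation import mrr_score

toy_ranks = np.array([1, 2, 4])
print(mrr_score(toy_ranks))      # (1 + 1/2 + 1/4) / 3 = 0.5833...
print(np.mean(1.0 / toy_ranks))  # the same mean reciprocal rank, by hand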
model2 = restore_model(model_name_path=ke_model_path + '2')

with open(ke_wnkeys_path, 'rb') as handle:
    tok2id, id2tok = pickle.load(handle)


def find_in_tok2id(w):
    for s in tok2id.keys():
        if w in s:
            print(w, s, "it is alphabetically there")


tok2id = OrderedDict(tok2id)

print("Extracting Embeddings..")
alle = table['n1'].tolist() + table['n2'].tolist()
embedding_map = dict(
    [(str(a), (model.get_embeddings(str(tok2id[str(a)])), tok2id[str(a)]))
     for a in alle if str(a) in tok2id])
embedding_map2 = dict(
    [(str(a), (model2.get_embeddings(str(tok2id[str(a)])), tok2id[str(a)]))
     for a in alle if str(a) in tok2id])

embeddings_array = np.array([i[0] for i in embedding_map.values()])
print("PCA")
embeddings_3d_pca = PCA(n_components=3).fit_transform(embeddings_array)
print("TSNE")
embeddings_3d_tsne = TSNE(n_components=3).fit_transform(embeddings_array)
print("k2")
embeddings_k2 = np.array([i[0] for i in embedding_map2.values()])
# Check if second dimension is 3
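# One way to perform the check named in the comment above (an assumed sanity
# check, not part of the original listing):
assert embeddings_3d_pca.shape[1] == 3
assert embeddings_3d_tsne.shape[1] == 3
print(embeddings_k2.shape)  # second dim is model2's embedding size, not necessarily 3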