def nearest_neighbors_vectors(fname, out_fname, top_n=3):
    """Report the nearest neighbors of each word embedding stored in a pickle.

    Args:
        fname: Path to a pickle file containing a dict that maps each word
            to its embedding vector.
        out_fname: Passed to ``print_nearest_neighbors`` as its ``fname``
            argument.
        top_n: Number of neighbors to request from ``get_nearest_neighbors``
            (as ``k``) and to report per word. Defaults to 3, the value that
            was previously hard-coded.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only point this at files from a trusted source.
    # Pickle data is binary, so the file must be opened in 'rb' mode; text
    # mode corrupts the stream on Windows and fails outright on Python 3.
    with open(fname, 'rb') as f:
        word_to_vector = pickle.load(f)

    # Sort the words so that row i of `vectors` always belongs to
    # ing_names[i], independent of dict ordering.
    ing_names = np.array(sorted(word_to_vector))
    vectors = np.array([word_to_vector[name] for name in ing_names])

    # The second return value is not needed here.
    ranks, _ = get_nearest_neighbors(vectors, k=top_n)
    print_nearest_neighbors(
        ing_names, ranks, top_n=top_n, fname=out_fname, argsort=False)
def generate_nearest_neighbors(all_ings, ings, reps, print_neighbors=True, top_n=3):
    """Compute nearest neighbors for ``reps`` and optionally print them by name.

    Args:
        all_ings: Iterable of names to print a line for; names with no entry
            among the computed neighbors are printed as ``N/A``.
        ings: Names aligned with the rows of ``reps``. It is indexed with an
            integer array, so it needs to support NumPy-style fancy indexing.
        reps: Representations handed to ``get_nearest_neighbors``.
        print_neighbors: When false, nothing is printed.
        top_n: Maximum number of neighbor names shown per entry.

    Returns:
        The ``(ranks, neigh)`` pair returned by ``get_nearest_neighbors``.
    """
    ranks, neigh = get_nearest_neighbors(reps)
    if not print_neighbors:
        return ranks, neigh

    ing_to_nn = {}
    for idx, rank_row in enumerate(ranks):
        order = np.argsort(rank_row)
        # Take one extra candidate so that dropping the entry's own name
        # still leaves up to top_n neighbors.
        candidates = ings[order[:top_n + 1]]
        others = [name for name in candidates if name != ings[idx]]
        ing_to_nn[ings[idx]] = others[:top_n]

    for name in all_ings:
        if name in ing_to_nn:
            print('{} --> {}'.format(name, ing_to_nn[name]))
        else:
            print('{} --> N/A'.format(name))
    return ranks, neigh
def calc_new_ranks(all_ings, ings, reps):
    """Calculate ranks in the original ing ordering.

    Builds one representation per entry of ``all_ings``: the row of ``reps``
    at the first position where ``ings`` equals that entry, or a zero vector
    of width ``reps.shape[1]`` when there is no match. The stacked result is
    passed to ``get_nearest_neighbors`` and position ``i`` of the i-th
    returned row is set to 0 before returning.

    NOTE(review): the value returned by ``get_nearest_neighbors`` is treated
    here as a single indexable collection of rows, while other callers in
    this file unpack it as ``(ranks, neigh)`` -- confirm which signature
    applies to this function.
    """
    aligned_reps = []
    for name in all_ings:
        matches = np.where(ings == name)[0]
        if len(matches) > 0:
            aligned_reps.append(reps[matches[0]])
        else:
            # Unknown ingredient: stand in with an all-zero representation.
            aligned_reps.append(np.zeros(reps.shape[1]))

    new_ranks = get_nearest_neighbors(np.array(aligned_reps))
    for pos, row in enumerate(new_ranks):
        row[pos] = 0  # Set itself to be rank 0
    return new_ranks
def get_most_similar_restricted(limit=120):
    """Print nearest neighbors and summary scores for retrieved embeddings.

    Ingredients are taken from the index of
    ``df_i['ingredient'].value_counts()``. Embeddings are fetched with
    ``retrieve_embeddings`` using a ``model`` name that is not defined in
    this function (NOTE(review): presumably a module-level global -- confirm).

    After printing the neighbors of the first ``limit`` ingredients, four
    numbers are printed, each computed only over the scores selected by
    ``found_ings`` values below ``highest_ranks.shape[0]``:
      1. count of ``highest_ranks <= 3`` divided by the count of finite
         ``highest_ranks``,
      2. mean of the finite ``highest_ranks``,
      3. mean of the finite ``avg_rankings``,
      4. mean of the finite ``random_avg_rankings``.

    Args:
        limit: Number of leading ingredients to print neighbors for; also
            passed to ``calc_score`` as its second argument.
    """
    _, df_i = import_data()
    ings = df_i['ingredient'].value_counts().index.values

    # Alternative source that was used previously:
    #found_ings, embeddings = np.load('word2vec_embeddings.npy')
    found_ings, embeddings = retrieve_embeddings(model, ings)

    ranks = get_nearest_neighbors(embeddings)
    print_nearest_neighbors(ings[:limit], found_ings, ranks)

    highest_ranks, avg_rankings, random_avg_rankings = calc_score(
        ranks, limit, print_scores=False, score_path='../model/scores.csv')

    # Keep only the found indices that fall inside the score arrays.
    in_range = found_ings[found_ings < highest_ranks.shape[0]]
    highest_ranks = highest_ranks[in_range]
    avg_rankings = avg_rankings[in_range]
    random_avg_rankings = random_avg_rankings[in_range]

    def finite_mean(values):
        # Mean over the finite entries only.
        return values[np.isfinite(values)].mean()

    top3_count = (highest_ranks <= 3).sum(dtype=float)
    print(top3_count / np.isfinite(highest_ranks).sum())
    print(finite_mean(highest_ranks))
    print(finite_mean(avg_rankings))
    print(finite_mean(random_avg_rankings))
def main():
    """Train the category classifier and report validation/test accuracy.

    Steps, all driven by the constants at the top of the function:
      1. Load the data and build inputs/outputs for the ``output_cat``
         category with ``gen_input_outputs_cat``.
      2. Optionally replace the inputs by embedding-based inputs.
      3. Shuffle with a fixed seed, hold out the last 20% as a test set and
         split the rest into train/validation with ``train_test_split``.
      4. Train the neural network with ``run_nn`` and print accuracies; when
         ``output_cat`` is not ``'aisle'``, accuracy mapped to the aisle
         level via ``get_upper_cat`` is printed as well.
      5. When the inputs were not embeddings, score the hidden-layer weights
         as ingredient embeddings with ``calc_score``.
    """
    num_ingredients = 1000
    ings_per_prod = None
    use_embeddings = False
    output_cat = 'shelf'

    df, df_i = import_data()
    counts = df_i['ingredient'].value_counts()
    inputs_, outputs, idx_to_cat = gen_input_outputs_cat(
        df, counts, num_ingredients, output_cat, ings_per_prod)

    if use_embeddings:
        # Other embedding sources that were tried:
        #embeddings = np.load('embeddings/embeddings_{}.npy'.format(num_ingredients))
        #embeddings = np.load('../word2vec/word2vec_embeddings.npy')[1][:num_ingredients]
        embeddings = 2*np.random.random((num_ingredients, 300))-1  # Try random embeddings
        embeddings = embeddings.astype('float32')
        inputs = input_from_embeddings(inputs_, embeddings, normalize=False)
        #inputs = (2*np.random.random(inputs_.shape)-1).astype('float32')  # Completely random inputs.
    else:
        inputs = inputs_
    num_outputs = outputs.max()+1
    print("# of data points: {}".format(len(inputs)))

    # Scramble inputs/outputs
    np.random.seed(3)
    random_idx = np.random.permutation(len(inputs))
    inputs = inputs[random_idx]
    outputs = outputs[random_idx]

    # The split point must be an integer: the previous int(len(inputs))*0.8
    # produced a float, which is not a valid slice bound.
    test_split_idx = int(len(inputs) * 0.8)
    inputs, X_test = inputs[:test_split_idx], inputs[test_split_idx:]
    outputs, y_test = outputs[:test_split_idx], outputs[test_split_idx:]
    X_train, X_valid, y_train, y_valid = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42)

    print("Running models...")

    # Max entropy model (disabled)
    # Normalize
    #inputs_n = inputs / np.sum(inputs, axis=1)[:,None]
    #regr = max_entropy(X_train, y_train, X_valid, y_valid)
    #predict_cat(counts, regr, idx_to_cat, num_ingredients, ings)

    # Neural network model
    classifier, predict_model = run_nn(
        X_train, y_train, X_valid, y_valid, X_train.shape[1], num_outputs,
        m=100, n_epochs=10, batch_size=10, learning_rate=0.1, L2_reg=0.0001)
    pred_valid = predict_model(X_valid)
    pred_test = predict_model(X_test)

    # The lower->upper category mapping only exists below the aisle level.
    # Bind the name unconditionally so the reporting code cannot hit a
    # NameError when output_cat is 'aisle'.
    lower_to_upper_cat = None
    if output_cat != 'aisle':
        lower_to_upper_cat = get_upper_cat(df, output_cat, 'aisle')

    print("Validation set:\n{}".format(calc_accuracy(pred_valid, y_valid)))
    if lower_to_upper_cat is not None:
        print(calc_accuracy(pred_valid, y_valid, lower_to_upper_cat))
    print("Test set:\n{}".format(calc_accuracy(pred_test, y_test)))
    if lower_to_upper_cat is not None:
        print(calc_accuracy(pred_test, y_test, lower_to_upper_cat))
    #print_predictions(X_valid, y_valid,
    #    np.argmax(pred_valid, axis=1), idx_to_cat, counts, limit=100)

    if not use_embeddings:
        # Treat the learned hidden-layer weights as ingredient embeddings.
        embeddings_out = classifier.hiddenLayer.W.get_value()
        ranks, neigh = get_nearest_neighbors(embeddings_out)
        #print_nearest_neighbors(counts.index.values[:num_ingredients], ranks)
        highest_rank, score, avg_rank_of_ing_cat, random_score = calc_score(
            ranks, num_ingredients)
def main():
    """Train the valid/invalid product classifier and report accuracy.

    Steps, all driven by the constants at the top of the function:
      1. Build valid examples with ``gen_input_outputs_valid`` and two sets
         of invalid examples (weighted and unweighted) with
         ``gen_input_outputs_invalid``; ``frac_weighted`` sets the share
         requested from each invalid generator.
      2. Optionally convert the inputs to embedding-based inputs.
      3. Split each of the three sets 2/3 train, 1/3 test, stack them and
         shuffle the stacked train and test sets with a fixed seed.
      4. Train the neural network with ``run_nn`` (and, when
         ``use_max_entropy`` is enabled, the max entropy model) and print
         accuracy separately on the valid, invalid and weighted-invalid
         test sets.
      5. When the inputs were not embeddings, score the hidden-layer weights
         as ingredient embeddings with ``calc_score``.
    """
    num_ingredients = 1000
    use_embeddings = False
    # Max entropy training was commented out while its report still read
    # `regr`, which raised NameError before any neural network result was
    # printed. Training and reporting now share this toggle.
    use_max_entropy = False
    ings_per_prod = 5
    frac_weighted = 0.95
    invalid_multiplier = 1

    df, df_i = import_data()
    counts = df_i['ingredient'].value_counts()
    inputs_v_, outputs_v = gen_input_outputs_valid(
        df, df_i, num_ingredients, ings_per_prod)
    inputs_i_w_, outputs_i_w = gen_input_outputs_invalid(
        inputs_v_, invalid_multiplier*frac_weighted,
        num_ingredients, ings_per_prod, weighted=True)
    inputs_i_, outputs_i = gen_input_outputs_invalid(
        inputs_v_, invalid_multiplier*(1-frac_weighted),
        num_ingredients, ings_per_prod, weighted=False)

    if use_embeddings:
        embeddings = np.load('embeddings/embeddings_{}.npy'.format(num_ingredients))
        # Other embedding sources that were tried:
        #embeddings = np.load('../word2vec/word2vec_embeddings.npy')[1][:num_ingredients]
        #embeddings = 2*np.random.random((num_ingredients, 20))-1  # Try random embeddings
        embeddings = embeddings.astype('float32')
        normalize = False
        inputs_v = input_from_embeddings(inputs_v_, embeddings, normalize=normalize)
        inputs_i_w = input_from_embeddings(inputs_i_w_, embeddings, normalize=normalize)
        inputs_i = input_from_embeddings(inputs_i_, embeddings, normalize=normalize)
    else:
        inputs_v = inputs_v_
        inputs_i_w = inputs_i_w_
        inputs_i = inputs_i_

    # Split every subset separately so that each can be evaluated on its own.
    X_train_v, X_test_v, y_train_v, y_test_v = train_test_split(
        inputs_v, outputs_v, test_size=1/3., random_state=42)
    X_train_i_w, X_test_i_w, y_train_i_w, y_test_i_w = train_test_split(
        inputs_i_w, outputs_i_w, test_size=1/3., random_state=42)
    X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
        inputs_i, outputs_i, test_size=1/3., random_state=42)

    X_train = np.vstack((X_train_v, X_train_i_w, X_train_i))
    y_train = np.hstack((y_train_v, y_train_i_w, y_train_i))
    X_test = np.vstack((X_test_v, X_test_i_w, X_test_i))
    y_test = np.hstack((y_test_v, y_test_i_w, y_test_i))

    # Scramble inputs/outputs
    np.random.seed(3)
    random_idx_tr = np.random.permutation(len(X_train))
    random_idx_te = np.random.permutation(len(X_test))
    X_train = X_train[random_idx_tr]
    y_train = y_train[random_idx_tr]
    X_test = X_test[random_idx_te]
    y_test = y_test[random_idx_te]

    print("Running models...")

    # Max entropy model (optional)
    regr = None
    if use_max_entropy:
        regr = max_entropy(X_train, y_train, X_test, y_test)

    # Neural network model
    classifier, predict_model = run_nn(
        X_train, y_train, X_test, y_test, X_train.shape[1], num_outputs=2,
        m=20, n_epochs=10, batch_size=10, learning_rate=0.05, L2_reg=0.0001)
    #pred = predict_model(X_test)
    #pred_cats = np.argmax(pred, axis=1)
    #print calc_accuracy(pred, y_test)
    #print_predictions(X_test, y_test, pred_cats, counts, limit=100)

    if regr is not None:
        print("Max Entropy (Valid, Invalid, Invalid weighted):")
        print(calc_accuracy(regr.predict_proba(X_test_v), y_test_v))
        print(calc_accuracy(regr.predict_proba(X_test_i), y_test_i))
        print(calc_accuracy(regr.predict_proba(X_test_i_w), y_test_i_w))

    print("Neural Network (Valid, Invalid, Invalid weighted):")
    print(calc_accuracy(predict_model(X_test_v), y_test_v))
    print(calc_accuracy(predict_model(X_test_i), y_test_i))
    print(calc_accuracy(predict_model(X_test_i_w), y_test_i_w))

    if not use_embeddings:
        # Treat the learned hidden-layer weights as ingredient embeddings.
        embeddings_out = classifier.hiddenLayer.W.get_value()
        ranks, neigh = get_nearest_neighbors(embeddings_out)
        #print_nearest_neighbors(counts.index.values[:num_ingredients], ranks)
        highest_rank, score, avg_rank_of_ing_cat, random_score = calc_score(
            ranks, num_ingredients)