Code Example #1
File: wikipedia.py Project: youyanggu/adulteration
def nearest_neighbors_vectors(fname, out_fname):
    """Print out the nearest neighbors of each word embedding given a dictionary 
    of word to embedding."""
    top_n=3
    with open(fname, 'rb') as f:  # pickle files must be opened in binary mode
        word_to_vector = pickle.load(f)
    ing_names = np.array(sorted(word_to_vector.keys()))
    vectors = np.array([word_to_vector[i] for i in ing_names])
    ranks, neigh = get_nearest_neighbors(vectors, k=top_n)
    print_nearest_neighbors(ing_names, ranks, top_n=top_n, fname=out_fname, argsort=False)
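
A recurring helper, get_nearest_neighbors, is never shown in these listings. A minimal sketch of what it plausibly computes, assuming ranks[i][j] is the rank of point j by distance from point i (so np.argsort(ranks[i]) recovers neighbor order, as in Code Example #2) and neigh holds the k closest indices:

import numpy as np

def get_nearest_neighbors(vectors, k=3):
    # Hypothetical reconstruction: pairwise Euclidean distances between all
    # row vectors, computed via broadcasting.
    diffs = vectors[:, None, :] - vectors[None, :, :]
    dists = np.sqrt((diffs ** 2).sum(axis=-1))
    # order[i] lists point indices from nearest to farthest (i itself first);
    # ranks[i][j] is the rank of point j by distance from point i.
    order = dists.argsort(axis=1)
    ranks = order.argsort(axis=1)
    # neigh[i]: indices of the k nearest neighbors, excluding point i itself.
    neigh = order[:, 1:k + 1]
    return ranks, neigh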
Code Example #2
File: ncim.py Project: youyanggu/adulteration
def generate_nearest_neighbors(all_ings, ings, reps, print_neighbors=True, top_n=3):
    ranks, neigh = get_nearest_neighbors(reps)
    if print_neighbors:
        ing_to_nn = {}
        for i in range(ranks.shape[0]):
            nearest_neighbors = np.argsort(ranks[i])
            neighbor_names = [ing for ing in ings[nearest_neighbors[:top_n+1]] if ing != ings[i]]
            ing_to_nn[ings[i]] = neighbor_names[:top_n]
        for i in all_ings:
            if i not in ing_to_nn:
                print '{} --> N/A'.format(i)
            else:
                print '{} --> {}'.format(i, ing_to_nn[i])
    return ranks, neigh
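
A hypothetical call with toy data (all names and values below are illustrative, not from the project):

import numpy as np

ings = np.array(['salt', 'sugar', 'flour'])      # ingredients with reps
all_ings = ['salt', 'sugar', 'flour', 'water']   # 'water' has no rep
reps = np.array([[0.0, 1.0], [0.1, 0.9], [1.0, 0.0]])
ranks, neigh = generate_nearest_neighbors(all_ings, ings, reps, top_n=1)
# Expected to print, e.g.:
#   salt --> ['sugar']
#   ...
#   water --> N/A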
Code Example #3
File: ncim.py Project: youyanggu/adulteration
def calc_new_ranks(all_ings, ings, reps):
    """Calculate ranks in the original ing ordering."""
    new_reps = []
    for i in all_ings:
        idx = np.where(ings==i)[0]
        if len(idx) == 0:
            new_reps.append(np.zeros(reps.shape[1]))
        else:
            idx = idx[0]
            new_reps.append(reps[idx])
    # Unpack (ranks, neigh); only the ranks matrix is needed here.
    new_ranks, _ = get_nearest_neighbors(np.array(new_reps))
    for i,v in enumerate(new_ranks):
        v[i] = 0 # Set itself to be rank 0
    return new_ranks
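
Note that ingredients absent from ings are given all-zero representations before ranking, so their neighbor ranks reflect distance to the origin rather than real similarity. A hypothetical call, assuming reps rows align with ings:

# Hypothetical usage: realign nearest-neighbor ranks to the all_ings order.
new_ranks = calc_new_ranks(all_ings, ings, reps)
# new_ranks[i][j] is the rank of all_ings[j] among the neighbors of
# all_ings[i], with the diagonal forced to rank 0.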
Code Example #4
File: word2vec.py Project: youyanggu/adulteration
def get_most_similar_restricted(limit=120):
    df, df_i = import_data()
    counts = df_i['ingredient'].value_counts()
    ings = counts.index.values

    # `model` is assumed to be a word2vec model loaded at module level.
    found_ings, embeddings = retrieve_embeddings(model, ings)
    #found_ings, embeddings = np.load('word2vec_embeddings.npy')
    ranks, _ = get_nearest_neighbors(embeddings)  # assuming the (ranks, neigh) return used elsewhere
    print_nearest_neighbors(ings[:limit], found_ings, ranks)
    highest_ranks, avg_rankings, random_avg_rankings = calc_score(ranks, limit, 
        print_scores=False, score_path='../model/scores.csv')

    indices = found_ings[found_ings<highest_ranks.shape[0]]
    highest_ranks = highest_ranks[indices]
    avg_rankings = avg_rankings[indices]
    random_avg_rankings = random_avg_rankings[indices]
    print (highest_ranks<=3).sum(dtype=float) / np.isfinite(highest_ranks).sum()
    print highest_ranks[np.isfinite(highest_ranks)].mean()
    print avg_rankings[np.isfinite(avg_rankings)].mean()
    print random_avg_rankings[np.isfinite(random_avg_rankings)].mean()
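
retrieve_embeddings is another unshown helper. A minimal sketch, assuming model is a gensim word2vec model and found_ings holds the integer positions of ingredients found in its vocabulary (consistent with the integer indexing of found_ings above):

import numpy as np

def retrieve_embeddings(model, ings):
    # Hypothetical sketch: look each ingredient up in the word2vec
    # vocabulary, recording the index of every hit and stacking its vector.
    found, vecs = [], []
    for i, ing in enumerate(ings):
        if ing in model:
            found.append(i)
            vecs.append(model[ing])
    return np.array(found), np.array(vecs)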
Code Example #5
File: nn_category.py Project: youyanggu/adulteration
def main():
    num_ingredients = 1000
    ings_per_prod = None
    use_embeddings = False
    output_cat = 'shelf'
    df, df_i = import_data()
    counts = df_i['ingredient'].value_counts()
    inputs_, outputs, idx_to_cat = gen_input_outputs_cat(
                        df, counts, num_ingredients, output_cat, ings_per_prod)
    if use_embeddings:
        #embeddings = np.load('embeddings/embeddings_{}.npy'.format(num_ingredients))
        #embeddings = np.load('../word2vec/word2vec_embeddings.npy')[1][:num_ingredients]
        embeddings = 2*np.random.random((num_ingredients, 300))-1 # Try random embeddings
        embeddings = embeddings.astype('float32')
        inputs = input_from_embeddings(inputs_, embeddings, 
            normalize=False)
        #inputs = (2*np.random.random(inputs_.shape)-1).astype('float32') # Completely random inputs.
    else:
        inputs = inputs_
    num_outputs = outputs.max()+1

    print "# of data points:", len(inputs)
    # Scramble inputs/outputs
    np.random.seed(3)
    random_idx = np.random.permutation(len(inputs))
    inputs = inputs[random_idx]
    outputs = outputs[random_idx]

    test_split_idx = int(len(inputs) * 0.8)  # int() must wrap the product to yield an integer index
    inputs, X_test = inputs[:test_split_idx], inputs[test_split_idx:]
    outputs, y_test = outputs[:test_split_idx], outputs[test_split_idx:]

    X_train, X_valid, y_train, y_valid = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42)

    print "Running models..."
    # Max entropy model
    # Normalize
    #inputs_n = inputs / np.sum(inputs, axis=1)[:,None]
    #regr = max_entropy(X_train, y_train, X_valid, y_valid)
    #predict_cat(counts, regr, idx_to_cat, num_ingredients, ings)

    # Neural network model
    classifier, predict_model = run_nn(X_train, y_train, X_valid, y_valid, 
                              X_train.shape[1], num_outputs,
                              m=100, n_epochs=10, batch_size=10,
                              learning_rate=0.1, L2_reg=0.0001)

    pred_valid = predict_model(X_valid)
    pred_test = predict_model(X_test)
    if output_cat != 'aisle':
        lower_to_upper_cat = get_upper_cat(df, output_cat, 'aisle')
        print "Validation set:\n", calc_accuracy(pred_valid, y_valid)
        print calc_accuracy(pred_valid, y_valid, lower_to_upper_cat)
        print "Test set:\n", calc_accuracy(pred_test, y_test)
        print calc_accuracy(pred_test, y_test, lower_to_upper_cat)
    #print_predictions(X_valid, y_valid, 
    #    np.argmax(pred_valid, axis=1), idx_to_cat, counts, limit=100)

    if not use_embeddings:
        embeddings_out = classifier.hiddenLayer.W.get_value()
        ranks, neigh = get_nearest_neighbors(embeddings_out)
        #print_nearest_neighbors(counts.index.values[:num_ingredients], ranks)
        highest_rank, score, avg_rank_of_ing_cat, random_score = calc_score(
            ranks, num_ingredients)
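
input_from_embeddings, used when use_embeddings is set, is also not shown. A minimal sketch, assuming inputs_ rows are bag-of-ingredients indicator (or count) vectors over num_ingredients columns:

import numpy as np

def input_from_embeddings(inputs, embeddings, normalize=False):
    # Hypothetical sketch: project each indicator row into embedding space
    # by summing the embeddings of its ingredients.
    out = inputs.astype('float32').dot(embeddings)
    if normalize:
        # Optionally scale each row to unit L2 norm.
        norms = np.linalg.norm(out, axis=1, keepdims=True)
        out = out / np.maximum(norms, 1e-8)
    return out.astype('float32')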
Code Example #6
File: valid.py Project: youyanggu/adulteration
def main():
    num_ingredients = 1000
    use_embeddings = False
    ings_per_prod = 5
    frac_weighted = 0.95
    invalid_multiplier = 1
    df, df_i = import_data()
    counts = df_i['ingredient'].value_counts()
    inputs_v_, outputs_v = gen_input_outputs_valid(
                        df, df_i, num_ingredients, ings_per_prod)
    inputs_i_w_, outputs_i_w = gen_input_outputs_invalid(inputs_v_,
                        invalid_multiplier*frac_weighted, 
                        num_ingredients, ings_per_prod, weighted=True)
    inputs_i_, outputs_i = gen_input_outputs_invalid(inputs_v_,
                        invalid_multiplier*(1-frac_weighted), 
                        num_ingredients, ings_per_prod, weighted=False)

    if use_embeddings:
        embeddings = np.load('embeddings/embeddings_{}.npy'.format(num_ingredients))
        #embeddings = np.load('../word2vec/word2vec_embeddings.npy')[1][:num_ingredients]
        #embeddings = 2*np.random.random((num_ingredients, 20))-1 # Try random embeddings
        embeddings = embeddings.astype('float32')
        normalize = False
        inputs_v = input_from_embeddings(inputs_v_, embeddings, 
            normalize=normalize)
        inputs_i_w = input_from_embeddings(inputs_i_w_, embeddings, 
            normalize=normalize)
        inputs_i = input_from_embeddings(inputs_i_, embeddings, 
            normalize=normalize)
    else:
        inputs_v = inputs_v_
        inputs_i_w = inputs_i_w_
        inputs_i = inputs_i_

    X_train_v, X_test_v, y_train_v, y_test_v = train_test_split(
        inputs_v, outputs_v, test_size=1/3., random_state=42)
    X_train_i_w, X_test_i_w, y_train_i_w, y_test_i_w = train_test_split(
        inputs_i_w, outputs_i_w, test_size=1/3., random_state=42)
    X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
        inputs_i, outputs_i, test_size=1/3., random_state=42)

    X_train = np.vstack((X_train_v, X_train_i_w, X_train_i))
    y_train = np.hstack((y_train_v, y_train_i_w, y_train_i))
    X_test = np.vstack((X_test_v, X_test_i_w, X_test_i))
    y_test = np.hstack((y_test_v, y_test_i_w, y_test_i))

    # Scramble inputs/outputs
    np.random.seed(3)
    random_idx_tr = np.random.permutation(len(X_train))
    random_idx_te = np.random.permutation(len(X_test))
    X_train = X_train[random_idx_tr]
    y_train = y_train[random_idx_tr]
    X_test = X_test[random_idx_te]
    y_test = y_test[random_idx_te]

    print "Running models..."
    # Max entropy model (regr is used by the accuracy printout below)
    regr = max_entropy(X_train, y_train, X_test, y_test)
    #predict_cat(counts, regr, idx_to_cat, num_ingredients, ings)

    # Neural network model
    classifier, predict_model = run_nn(X_train, y_train, X_test, y_test, 
                                X_train.shape[1], num_outputs=2,
                                m=20, n_epochs=10, batch_size=10,
                                learning_rate=0.05, L2_reg=0.0001)

    #pred = predict_model(X_test)
    #pred_cats = np.argmax(pred, axis=1)
    #print calc_accuracy(pred, y_test)
    #print_predictions(X_test, y_test, pred_cats, counts, limit=100)

    print "Max Entropy (Valid, Invalid, Invalid weighted):"
    print calc_accuracy(regr.predict_proba(X_test_v), y_test_v)
    print calc_accuracy(regr.predict_proba(X_test_i), y_test_i)
    print calc_accuracy(regr.predict_proba(X_test_i_w), y_test_i_w)
    print "Neural Network (Valid, Invalid, Invalid weighted):"
    print calc_accuracy(predict_model(X_test_v), y_test_v)
    print calc_accuracy(predict_model(X_test_i), y_test_i)
    print calc_accuracy(predict_model(X_test_i_w), y_test_i_w)

    if not use_embeddings:
        embeddings_out = classifier.hiddenLayer.W.get_value()
        ranks, neigh = get_nearest_neighbors(embeddings_out)
        #print_nearest_neighbors(counts.index.values[:num_ingredients], ranks)
        highest_rank, score, avg_rank_of_ing_cat, random_score = calc_score(
                ranks, num_ingredients)
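
calc_accuracy appears throughout but is never shown. A minimal sketch, assuming predictions are rows of per-class probabilities and the optional third argument maps fine categories to coarser ones (as with lower_to_upper_cat in Code Example #5):

import numpy as np

def calc_accuracy(pred, y, cat_map=None):
    # Hypothetical sketch: fraction of rows whose argmax class matches the
    # label, optionally after mapping both sides to a coarser category.
    pred_cats = np.argmax(pred, axis=1)
    y = np.asarray(y)
    if cat_map is not None:
        pred_cats = np.array([cat_map[c] for c in pred_cats])
        y = np.array([cat_map[c] for c in y])
    return (pred_cats == y).mean()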