# Exemple #1 (pastebin header residue -- commented out so the file parses)
# 0
def load_data():
    """Build a labeled recipe corpus and vectorize the network pairs.

    Returns:
        x: sparse document-term matrix over real + fake recipes.
        y: labels aligned with ``x`` rows (1 = real recipe, 0 = random fake).
        pairs_vector: ``load_pairs()`` data projected into the same
            vocabulary via the fitted vectorizer.
    """
    real_recipes = db.all_recipes_pairs()
    # One fake (randomly generated) recipe per real one keeps classes balanced.
    fake_recipes = list(db.random_recipes(len(real_recipes)))
    # Recipes are assumed to be comma-separated token strings -- TODO confirm;
    # the tokenizer just splits on ','.
    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','))
    x = vectorizer.fit_transform(real_recipes + fake_recipes)
    # Labels as a list (was a tuple) for consistency with the module-level Y.
    y = [1] * len(real_recipes) + [0] * len(fake_recipes)

    # Project the network pairs into the vocabulary learned above.
    network_pairs = [load_pairs()]
    pairs_vector = vectorizer.transform(network_pairs)

    return x, y, pairs_vector
import pdb
import sys

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split

import db

# Plotting configuration -- presumably consumed further down the file;
# verify against the code past the "DEFINE PARAMETERS" section.
plot = False
plot_size = 4

# LOAD DATA-----------------------------------------------------------------

# Build a labeled corpus: real recipe pair-strings (label 1) plus an equal
# number of randomly generated fakes (label 0), so the classes are balanced.
real_recipes = db.all_recipes_pairs()
fake_recipes = list(db.random_recipes(len(real_recipes)))
# Recipes are assumed to be comma-separated token strings -- TODO confirm;
# the custom tokenizer just splits on ','.
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','))
X = tfidf_vectorizer.fit_transform(real_recipes + fake_recipes)
Y = [1] * len(real_recipes) + [0] * len(fake_recipes)

# NOTE(review): these are TF-IDF column sums, not raw term frequencies, and
# dict.items()[:30] grabs an *arbitrary* 30 vocabulary entries (dict order is
# unspecified). Python 2 only: items() is not sliceable on Python 3.
freqs = [(word, X.getcol(idx).sum()) for word, idx in tfidf_vectorizer.vocabulary_.items()[:30]]
print freqs
#sort from largest to smallest
#print sorted (freqs, key = lambda x: -x[1])

# Generate a random training/testing split (40% of rows held out for testing)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=.4)

# DEFINE PARAMETERS----------------------------------------------------------