def load_data():
    """Build a labeled bag-of-words matrix from real and fake recipe pairs.

    Returns:
        x: sparse count matrix over real + fake recipe pair strings.
        y: labels aligned with the rows of x (1 = real, 0 = fake).
        pairs_vector: the pairs from load_pairs(), transformed with the
            vocabulary fitted on the training data.
    """
    # Load the recipe data: real pairs from the DB, plus an equal number
    # of randomly generated fakes to serve as negative examples.
    real_recipes = db.all_recipes_pairs()
    fake_recipes = list(db.random_recipes(len(real_recipes)))

    # Recipe pairs are comma-separated ingredient strings, so split on
    # commas rather than using the default word tokenizer.
    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','))
    x = vectorizer.fit_transform(real_recipes + fake_recipes)
    # Consistency fix: build the labels as a list, matching how the
    # module-level code constructs Y elsewhere in this file (the original
    # used a one-off tuple here).
    y = [1] * len(real_recipes) + [0] * len(fake_recipes)

    # Load the recipe data to classify and project it onto the fitted
    # vocabulary.  NOTE(review): load_pairs() is wrapped in a list because
    # transform() expects an iterable of documents — presumably it returns
    # a single document string; confirm against its definition.
    network_pairs = [load_pairs()]
    pairs_vector = vectorizer.transform(network_pairs)
    return x, y, pairs_vector
from __future__ import print_function

from sklearn.feature_extraction.text import TfidfVectorizer
try:
    # scikit-learn >= 0.18: cross_validation was renamed to
    # model_selection (and removed entirely in 0.20).
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
import pdb
import sys

import db

plot = False
plot_size = 4

# LOAD DATA-----------------------------------------------------------------
# Load the recipe data: real pairs from the DB, plus an equal number of
# randomly generated fakes as negative examples.
real_recipes = db.all_recipes_pairs()
fake_recipes = list(db.random_recipes(len(real_recipes)))

# Recipe pairs are comma-separated ingredient strings; tokenize on commas.
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','))
X = tfidf_vectorizer.fit_transform(real_recipes + fake_recipes)
Y = [1] * len(real_recipes) + [0] * len(fake_recipes)

# Peek at the summed tf-idf mass of 30 vocabulary terms.
# Fix: dict.items() returns a non-sliceable view on Python 3, so it must
# be materialized before slicing.  (Dict order is arbitrary before 3.7,
# so this was an arbitrary sample of terms either way.)
freqs = [(word, X.getcol(idx).sum())
         for word, idx in list(tfidf_vectorizer.vocabulary_.items())[:30]]
print(freqs)

# sort from largest to smallest
#print(sorted(freqs, key=lambda x: -x[1]))

# Generate a random training/testing split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=.4)

# DEFINE PARAMETERS----------------------------------------------------------