def __train_kmeans(matrix, sparse_matrix):
    """
    Trains the k-means model, or loads an existing one from disk.
    @param matrix: The dense data matrix, or None if the sparse matrix
                   should be used instead
    @param sparse_matrix: The sparse data matrix (scipy sparse format),
                          used when the dense one does not fit in memory
    @return: The trained model, which is also saved to disk
    """
    if os.path.exists(config.KMEANS_MODEL_PATH):
        print(" |-> Found an existing model for kmeans at " +
              str(config.KMEANS_MODEL_PATH) + ", using that.")
        kmeans = myio.load_pickle(config.KMEANS_MODEL_PATH)
    else:
        print(" |-> Generating the kmeans model...")
        print(" |-> Started at " + myio.print_time())
        kmeans = KMeans(n_clusters=6, verbose=1)
        if matrix is not None:
            kmeans.fit(matrix)
        else:
            kmeans.fit(sparse_matrix)
        print(" |-> Ended at " + myio.print_time())
        print(" |-> Saving the model...")
        myio.save_pickle(kmeans, config.KMEANS_MODEL_PATH)
    return kmeans
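# The n_clusters=6 above is hardcoded. If that value ever needs sanity
# checking, a minimal sketch like the one below (assuming scikit-learn is
# available and the matrix fits in memory) scores a few candidate k values
# with the silhouette coefficient. Illustrative only, not part of the
# pipeline.
def _example_pick_n_clusters(matrix, candidates=(4, 6, 8, 10)):
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    scores = {}
    for k in candidates:
        labels = KMeans(n_clusters=k, n_init=10).fit_predict(matrix)
        # Silhouette is in [-1, 1]; higher means tighter, better-separated
        # clusters
        scores[k] = silhouette_score(matrix, labels)
    return max(scores, key=scores.get)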
def __retrieve_sparse_matrix(rec_table, ingredients, testing=False):
    """
    Retrieves a sparse matrix representation of the recipes and
    ingredients. That is, retrieves a sparse matrix of the form:
            recipe 0    recipe 1    ...
    ing0    1           0           ...
    ing1    0           0           ...
    Either generates it from the given args or else finds it on the disk.
    @param rec_table: All of the recipes as a RecipeTable object
    @param ingredients: All of the ingredients
    @param testing: Whether to find a matrix on the disk or generate a
                    tmp one. True to generate a tmp one.
    @return: The matrix
    """
    if not testing and os.path.isfile(config.MATRIX_SPARSE):
        sparse_matrix = myio.load_pickle(config.MATRIX_SPARSE)
        print(" |-> Found sparse matrix.")
    else:
        print(" |-> Generating sparse matrix...")
        print(" |-> Generating the rows, this may take a while...")
        rows = [sparse.coo_matrix(
                    rec_table.ingredient_to_feature_vector(ingredient))
                for ingredient in tqdm(ingredients)]
        print(" |-> Generating the matrix from the rows...")
        sparse_matrix = sparse.vstack(rows)
        if not testing:
            print(" |-> Pickling the sparse matrix...")
            myio.save_pickle(sparse_matrix, config.MATRIX_SPARSE)
    return sparse_matrix
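# For reference, a toy version of the stacking done above: each feature
# vector becomes a 1xN sparse row, and vstack assembles the rows into the
# full ingredient-by-recipe matrix without ever materializing a dense
# array. Standalone illustration using scipy only.
def _example_sparse_stack():
    from scipy import sparse
    fv_ing0 = [1, 0, 0, 1]   # ing0 appears in recipes 0 and 3
    fv_ing1 = [0, 0, 1, 0]   # ing1 appears in recipe 2
    rows = [sparse.coo_matrix(fv) for fv in (fv_ing0, fv_ing1)]
    matrix = sparse.vstack(rows)    # shape (2, 4)
    return matrix.tocsr()           # CSR is efficient for row slicing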
def __load_model(path):
    """
    Loads the given path and returns it as a model.
    @param path: The path to the given model.
    @return: The loaded model.
    """
    return myio.load_pickle(path)
def regenerate_clusters(kmeans, rec_table):
    """
    Goes through each ingredient in each recipe in the table and caches
    each one into the right cluster according to the passed-in kmeans.
    This will take an OUTRAGEOUS amount of time, but should only have to
    be done once; it allows O(1) lookup for retrieving whole clusters,
    which in turn allows very fast searching of those clusters. The
    clusters are then saved to disk.
    @param kmeans: The trained kmeans cluster model
    @param rec_table: The table
    @return: List of all of the clusters
    """
    cluster_files = os.listdir(config.CLUSTERS)
    if cluster_files:
        print(" |-> Found cluster file(s) in " + str(config.CLUSTERS) +
              ", using those.")
        return [myio.load_pickle(os.path.join(config.CLUSTERS, fname))
                for fname in cluster_files]

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        labels = set(kmeans.labels_)
        clusters = [Cluster(i) for i in range(len(labels))]
        print(" |-> Total number of clusters to regenerate: " +
              str(len(clusters)))
        print(" |-> Started at: " + myio.print_time())
        ingredients = []
        for recipe in tqdm(rec_table):
            for ingredient in recipe:
                ingredients.append(ingredient)
        print(" |-> Number of ingredients: " + str(len(ingredients)))
        print(" |-> Removing duplicates from ingredients...")
        ingredients = set(ingredients)
        print(" |-> Number of unique ingredients: " + str(len(ingredients)))
        print(" |-> Pairing each ingredient with its predicted index...")
        tups = []
        for ingredient in tqdm(ingredients):
            fv = rec_table.ingredient_to_feature_vector(ingredient)
            # predict expects a 2D array of samples, hence the wrapping list
            predicted_index = kmeans.predict([fv])[0]
            tups.append((ingredient, predicted_index))
        print(" |-> Sorting ingredients...")
        # sorted() returns a new list rather than sorting in place; the
        # previous version discarded its result
        tups = sorted(tups, key=lambda t: t[1])
        print(" |-> Looping over sorted ingredients and clusters...")
        for ingredient, this_index in tqdm(tups):
            clusters[this_index].add(ingredient)
        # Save every cluster, including the last one, which the previous
        # boundary-detection loop never wrote out
        print(" |-> Saving the clusters...")
        for c in clusters:
            myio.save_pickle(c, os.path.join(config.CLUSTERS,
                                             "cluster" + str(c.get_index())))
        return clusters
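# The sort-then-sweep in regenerate_clusters is really a group-by on the
# predicted label. A self-contained sketch of the same pattern with
# itertools.groupby, using toy data rather than the real Cluster class:
def _example_group_by_label():
    from itertools import groupby
    tups = [("salt", 2), ("flour", 0), ("basil", 2), ("rye", 0)]
    tups.sort(key=lambda t: t[1])  # groupby requires sorted input
    grouped = {label: [ing for ing, _ in group]
               for label, group in groupby(tups, key=lambda t: t[1])}
    return grouped  # {0: ['flour', 'rye'], 2: ['salt', 'basil']}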
def load_clusters():
    """
    Loads the clusters from the disk and returns them as a list.
    @return: List of clusters
    """
    cluster_paths = [os.path.join(config.CLUSTERS, f)
                     for f in os.listdir(config.CLUSTERS)
                     if os.path.isfile(os.path.join(config.CLUSTERS, f))]
    return [myio.load_pickle(cluster_path) for cluster_path in cluster_paths]
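# A minimal usage sketch: load the pickled clusters and look up which one
# a given ingredient landed in. This assumes Cluster exposes get_index()
# (used elsewhere in this module) and supports membership testing via
# "in"; if it does not, substitute whatever accessor Cluster provides.
def _example_find_cluster_for(ingredient):
    for c in load_clusters():
        if ingredient in c:  # assumed Cluster.__contains__
            return c.get_index()
    return None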
def __get_recipe_from_rnn(encoded_feature_vectors, ingredients, ing_table):
    """
    Feeds the given bit vectors into the neural network and has it
    generate a recipe.
    @param encoded_feature_vectors: A list of encoded ingredients to use
                                    in the recipe.
    @param ingredients: The ingredients
    @param ing_table: The ingredient table passed through to the sampler
    @return: The generated recipe, or None if no trained model is found
    """
    training_data = myio.load_pickle(config.TRAINING_PATH)  # currently unused
    with open(os.path.join(config.CHECKPOINT_DIR, "words_vocab.pkl"),
              'rb') as f:
        words, vocab = cPickle.load(f)
    # NOTE: batch_size, seq_length, rnn_size, num_layers, and grad_clip are
    # not defined in this function; they are presumably module-level
    # hyperparameters matching the ones the checkpoint was trained with.
    data_loader = TextLoader(config.RNN_DATA_DIR, batch_size, seq_length)
    vocab_size = data_loader.vocab_size
    model = MyRNN(rnn_size, num_layers, batch_size, seq_length, vocab_size,
                  grad_clip, infer=True)
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        ckpt = tf.train.get_checkpoint_state(config.CHECKPOINT_DIR)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            vec_model = gensim.models.Word2Vec.load(
                os.path.join(config.CHECKPOINT_DIR, "word2vec.model"))
            return model.sample(sess, words, vocab, vec_model, ingredients,
                                ing_table)
        else:
            print("Could not locate trained model in " +
                  str(config.CHECKPOINT_DIR))
            return None
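# The session setup above uses the pre-1.0 TensorFlow API
# (initialize_all_variables, all_variables). If this ever runs against
# TensorFlow >= 0.12, those calls were renamed; the helper below is an
# assumed migration, not the version this project was written for.
def _example_modern_tf_init(sess):
    sess.run(tf.global_variables_initializer())
    return tf.train.Saver(tf.global_variables())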
def ask_similar(args):
    """
    Uses any already trained models to figure out the similarity between
    the given list of ingredients, prints the similarity matrix, and then
    gives back a number of ingredients that are also similar to the given
    list.
    @param args: ArgParse object that must have args.similar[0] (number of
                 ingredients to get back), args.similar[1] ("y" to also
                 generate a recipe from the resulting ingredients), and
                 args.similar[2:] (the list of ingredients to compute a
                 matrix for and to match new ingredients with - may be
                 empty, in which case the program simply gives back a list
                 of args.similar[0] similar ingredients)
    @return: void
    """
    # Get the number of ingredients the user wants, whether they want a
    # recipe generated, and the ingredients they want to include
    num_ingredients = int(args.similar[0])
    generate_a_recipe = args.similar[1].strip().lower() == "y"
    ingredients = args.similar[2:]
    print("Ingredients: " + str(ingredients))

    # Load the models and datastructures needed
    rec_table = recipe_table.load_from_disk(config.RECIPE_TABLE_PATH)
    clusters = cluster.load_clusters()
    rec_table.load_in_clusters(clusters)

    # Make sure all the ingredients the user asked to include actually exist
    for ingredient in ingredients:
        if not __check_for_ingredient(ingredient):
            print("Ingredient: " + str(ingredient) +
                  " not found in models. Please replace.")
            return

    # If the user asked for zero ingredients, pick a random count centered
    # around the average number of ingredients in a recipe, but never
    # fewer than three
    if num_ingredients == 0:
        num_ingredients = rec_table.get_random_number()
    num_ingredients = max(num_ingredients, 3)

    # Do what the user asked (get all new ingredients, or ones similar to
    # the given ones)
    if len(ingredients) == 0:
        ingredients = similar._get_random_similar_ingredients(
            num_ingredients, rec_table)
    else:
        sims = similar._get_similar_ingredients_to(
            ingredients, num_ingredients, rec_table)
        ingredients.extend(sims)

    if ingredients is None:
        print("Could not get that many random similar ingredients. "
              "Maybe just try again.")
        return
    elif len(ingredients) == 1:
        print("Here is your random ingredient: " + str(ingredients[0]))
    else:
        print("Here are " + str(num_ingredients) + " similar ingredients:")
        print(str(ingredients))
        similarity_matrix, similarity_score, similarity_measure = \
            similar._compute_similarity_stats(ingredients)
        print(str(pandas.DataFrame(similarity_matrix,
                                   columns=ingredients,
                                   index=ingredients)))
        print("Similarity score for these ingredients: " +
              str(similarity_score))
        print("Z-score for similarity: " + str(similarity_measure))

    # If the user wants a recipe, generate one from these ingredients
    if generate_a_recipe:
        ing_table = myio.load_pickle(config.INGREDIENT_TABLE_PATH)
        recipe_generator._generate_recipe(ingredients, rec_table, ing_table)
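# For context, a minimal sketch of how the "similar" argument might be
# wired up with argparse so that args.similar[0] is the count, [1] the
# y/n recipe flag, and [2:] the seed ingredients. The flag name and help
# text here are illustrative, not necessarily the project's real CLI.
def _example_build_parser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--similar", nargs="+", metavar="ARG",
                        help="N y/n [ingredient ...]")
    return parser

# e.g.: args = _example_build_parser().parse_args(
#           ["--similar", "5", "y", "butter", "flour"])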
from tqdm import tqdm
import lzma
import myio.myio as myio
import sys

# Quick inspection script: print the ingredients and text of the recipe
# at the given index. Usage: pass the recipe index as the first argument.
index = int(sys.argv[1])
table = myio.load_pickle("tmp/recipe_table")
recipes = table.get_recipes()
recipe = recipes[index]
ingredients = recipe.get_ingredients_list()
text = recipe.get_text()
print("Ingredients: " + str(ingredients))
print("Recipe text: " + str(text))

#bitvec = []
#for rec in table:
#    if rec.get_text() != "" and len(rec.get_ingredients_list()) != 0:
#        bitvec.append(1)
#    else:
#        bitvec.append(0)
#coverage = sum(bitvec) / float(len(bitvec))
#coverage *= 100.0
#print("Coverage (% of recipes with ingredients and texts): " + str(coverage) + "%")
#
#compresseds = []
#all_ingredients = table.get_all_ingredients()
#for ing in tqdm(all_ingredients):
#    fv = table.ingredient_to_feature_vector(ing)
#    fv_str = ""
#    for i in fv:
#        fv_str += str(i)