Esempio n. 1
0
def __train_kmeans(matrix, sparse_matrix):
    """
    Returns a k-means model, reusing the pickled one if it already exists.

    When a file exists at config.KMEANS_MODEL_PATH it is loaded and returned
    without any training. Otherwise a 6-cluster KMeans model is fitted,
    pickled to that path and returned.
    @param matrix: The dense data matrix, or None to train on the sparse one
    @param sparse_matrix: The sparse data matrix, used only when matrix is
                          None (e.g. when the dense one does not fit in memory)
    @return: The loaded or freshly trained (and saved) model
    """
    model_path = config.KMEANS_MODEL_PATH

    if not os.path.exists(model_path):
        print("    |-> Generating the kmeans model...")
        print("    |-> Started at " + myio.print_time())
        kmeans = KMeans(n_clusters=6, verbose=1)
        # Prefer the dense matrix; fall back to the sparse one when absent.
        training_data = sparse_matrix if matrix is None else matrix
        kmeans.fit(training_data)
        print("    |-> Ended at " + myio.print_time())

        print("    |-> Saving the model...")
        myio.save_pickle(kmeans, model_path)
        return kmeans

    print("    |-> Found an existing model for kmeans at " +
          str(model_path) + ", using that.")
    return myio.load_pickle(model_path)
Esempio n. 2
0
def __retrieve_sparse_matrix(rec_table, ingredients, testing=False):
    """
    Returns the sparse ingredient matrix, from disk or freshly built.

    The matrix has one row per ingredient, each row being that ingredient's
    feature vector from the recipe table, e.g.:
            recipe 0    recipe 1    ...
        ing0   1           0        ...
        ing1   0           0        ...
    Unless testing is set, a pickled matrix at config.MATRIX_SPARSE is
    reused when present, and a newly built matrix is pickled there.
    @param rec_table: All of the recipes as a RecipeTable object
    @param ingredients: All of the ingredients (one matrix row each)
    @param testing: True to always build a temporary matrix and never
                    read from or write to the disk.
    @return: the matrix
    """
    use_disk = not testing

    if use_disk and os.path.isfile(config.MATRIX_SPARSE):
        cached = myio.load_pickle(config.MATRIX_SPARSE)
        print("        |-> Found sparse matrix.")
        return cached

    print("        |-> Generating sparse matrix...")
    print("            |-> Generating the rows, this may take a while...")
    rows = []
    for ingredient in tqdm(ingredients):
        feature_vector = rec_table.ingredient_to_feature_vector(ingredient)
        rows.append(sparse.coo_matrix(feature_vector))

    print("            |-> Generating the matrix from the rows...")
    stacked = sparse.vstack(rows)

    if use_disk:
        print("            |-> Pickling the sparse matrix...")
        myio.save_pickle(stacked, config.MATRIX_SPARSE)
    return stacked
Esempio n. 3
0
def __load_model(path):
    """
    Unpickles the model stored at the given path.
    @param path: The path to the pickled model.
    @return: The loaded model.
    """
    return myio.load_pickle(path)
Esempio n. 4
0
def regenerate_clusters(kmeans, rec_table):
    """
    Builds the ingredient clusters, saves them to disk and returns them.

    If config.CLUSTERS already contains any entries, those are unpickled
    and returned instead of regenerating anything.

    Otherwise every unique ingredient found in the recipes of the table is
    placed into the cluster that the passed in kmeans predicts for its
    feature vector. This will take an OUTRAGEOUS amount of time, but
    should only have to be done once and will allow O(1) lookup
    for retrieving whole clusters, which will then allow super
    fast searching of those clusters. Each cluster is then pickled to
    config.CLUSTERS as "cluster<index>".
    @param kmeans: The trained kmeans cluster model
    @param rec_table: The table
    @return: list of all of the clusters
    """
    cluster_files = os.listdir(config.CLUSTERS)
    if cluster_files:
        print("    |-> Found cluster file(s) in " + str(config.CLUSTERS) + ", using those.")
        return [myio.load_pickle(os.path.join(config.CLUSTERS, fname))
                for fname in cluster_files]

    with warnings.catch_warnings():
        # DeprecationWarnings are silenced for the whole regeneration
        # (presumably they come from kmeans.predict -- TODO confirm).
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # One cluster per distinct training label; clusters[i] has index i,
        # so a predicted label can be used directly as a list index.
        clusters = [Cluster(i) for i in range(len(set(kmeans.labels_)))]

        print("    |-> Total number of clusters to regenerate: " + str(len(clusters)))
        print("    |-> Started at: " + myio.print_time())
        ingredients = [ingredient
                       for recipe in tqdm(rec_table)
                       for ingredient in recipe]
        print("    |-> Number of ingredients: " + str(len(ingredients)))
        print("    |-> Removing duplicates from ingredients...")
        ingredients = set(ingredients)
        print("    |-> Number of unique ingredients: " + str(len(ingredients)))

        print("    |-> Adding each ingredient to its predicted cluster...")
        for ingredient in tqdm(ingredients):
            fv = rec_table.ingredient_to_feature_vector(ingredient)
            # predict is expected to return a sequence of labels; only the
            # first one is relevant for a single feature vector.
            predicted_index = (kmeans.predict(fv))[0]
            clusters[predicted_index].add(ingredient)

        # Save every cluster (including the last one and any empty ones) only
        # after all ingredients have been assigned, so none is written early.
        print("    |-> Saving the clusters...")
        for cluster in clusters:
            cluster_path = os.path.join(config.CLUSTERS,
                                        "cluster" + str(cluster.get_index()))
            myio.save_pickle(cluster, cluster_path)
            print("    |-> Done with cluster index: " + str(cluster.get_index()))

    return clusters
Esempio n. 5
0
def load_clusters():
    """
    Unpickles every regular file found directly inside config.CLUSTERS
    and returns the results as a list of clusters. Sub-directories and
    other non-file entries are skipped.
    @return: List of clusters
    """
    clusters = []
    for entry in os.listdir(config.CLUSTERS):
        candidate = os.path.join(config.CLUSTERS, entry)
        if not os.path.isfile(candidate):
            continue
        clusters.append(myio.load_pickle(candidate))
    return clusters
Esempio n. 6
0
def __get_recipe_from_rnn(encoded_feature_vectors, ingredients, ing_table):
    """
    Restores the trained neural network from config.CHECKPOINT_DIR
    and has it generate a recipe for the given ingredients.
    @param encoded_feature_vectors: A list of encoded ingredients
                                    to use in the recipe. Currently
                                    not used by this function.
    @param ingredients: The ingredients, passed on to the model's sampler
    @param ing_table: The ingredient table, passed on to the model's sampler
    @return: The generated recipe, or None if no trained checkpoint
             could be found in config.CHECKPOINT_DIR
    """
    # NOTE: the training data pickle (config.TRAINING_PATH) used to be loaded
    # here but was never used, so the load was dropped to avoid needless I/O.
    vocab_path = os.path.join(config.CHECKPOINT_DIR, "words_vocab.pkl")
    with open(vocab_path, 'rb') as f:
        # NOTE(review): unpickling is only safe for files this project wrote.
        words, vocab = cPickle.load(f)

    # The loader is only needed for the vocabulary size of the network.
    data_loader = TextLoader(config.RNN_DATA_DIR, batch_size, seq_length)
    vocab_size = data_loader.vocab_size
    model = MyRNN(rnn_size,
                  num_layers,
                  batch_size,
                  seq_length,
                  vocab_size,
                  grad_clip,
                  infer=True)
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        ckpt = tf.train.get_checkpoint_state(config.CHECKPOINT_DIR)
        if not (ckpt and ckpt.model_checkpoint_path):
            print("Could not locate trained model in " +
                  str(config.CHECKPOINT_DIR))
            return None

        saver.restore(sess, ckpt.model_checkpoint_path)
        vec_model = gensim.models.Word2Vec.load(
            os.path.join(config.CHECKPOINT_DIR, "word2vec.model"))
        return model.sample(sess, words, vocab, vec_model, ingredients,
                            ing_table)
Esempio n. 7
0
def ask_similar(args):
    """
    Uses any already trained models to find ingredients that are
    similar to the given list of ingredients, prints the resulting
    ingredients along with their similarity matrix and scores, and
    optionally generates a recipe from them.
    @param args: ArgParse object that must have:
                 args.similar[0]  - the number of ingredients to get back;
                                    0 asks for a random number. Anything
                                    below 3 is raised to 3.
                 args.similar[1]  - "y" (any case, surrounding whitespace
                                    ignored) to also generate a recipe
                 args.similar[2:] - the list of ingredients to match new
                                    ingredients with - may be empty, in
                                    which case random similar ingredients
                                    are chosen instead
    @return: void
    """
    # Get the number of ingredients the user wants and the ingredients they want to include
    num_ingredients = int(args.similar[0])
    generate_a_recipe = args.similar[1].strip().lower() == "y"
    ingredients = args.similar[2:]
    print("Ingredients: " + str(ingredients))

    # Load models and datastructures needed
    rec_table = recipe_table.load_from_disk(config.RECIPE_TABLE_PATH)
    clusters = cluster.load_clusters()
    rec_table.load_in_clusters(clusters)

    # Make sure all the ingredients the user asked to include actually exist
    for ingredient in ingredients:
        if not __check_for_ingredient(ingredient):
            print("Ingredient: " + str(ingredient) +
                  " not found in models. Please replace.")
            return

    # If the user asked for zero ingredients, they want a random amount of ingredients
    # Get a random number, centered around the average number of ingredients in a recipe
    # Don't use anything less than three
    if num_ingredients == 0:
        num_ingredients = rec_table.get_random_number()
    num_ingredients = max(num_ingredients, 3)

    # Do what the user asked (get all new ingredients or get similar ones to given ones)
    print(str(ingredients))
    if len(ingredients) == 0:
        ingredients = similar._get_random_similar_ingredients(
            num_ingredients, rec_table)
    else:
        sims = similar._get_similar_ingredients_to(ingredients,
                                                   num_ingredients, rec_table)
        if sims is None:
            # Treat a failed lookup like the random case below instead of
            # crashing inside list.extend(None).
            ingredients = None
        else:
            ingredients.extend(sims)

    if ingredients is None:
        print(
            "Could not get that many random similar ingredients. Maybe just try again."
        )
        return

    if len(ingredients) == 1:
        print("Here is your random ingredient: " + str(ingredients[0]))
    else:
        print("Here are: " + str(num_ingredients) + " similar ingredients: ")
        print(str(ingredients))

        similarity_matrix, similarity_score, similarity_measure = \
            similar._compute_similarity_stats(ingredients)

        print(
            str(
                pandas.DataFrame(similarity_matrix,
                                 columns=ingredients,
                                 index=ingredients)))
        print("Similarity score for these ingredients: " +
              str(similarity_score))
        print("Z-score for similarity: " + str(similarity_measure))

    # If the user wants to generate a recipe, use the ingredients in that recipe
    if generate_a_recipe:
        ing_table = myio.load_pickle(config.INGREDIENT_TABLE_PATH)
        recipe_generator._generate_recipe(ingredients, rec_table, ing_table)
Esempio n. 8
0
from tqdm import tqdm
import lzma
import myio.myio as myio
import sys

# Debug helper: print the ingredients and text of a single recipe.
# The recipe is selected by position via the first command-line argument.
index = int(sys.argv[1])

# The recipe table is expected to have been pickled to this relative path.
table = myio.load_pickle("tmp/recipe_table")
recipes = table.get_recipes()
recipe = recipes[index]

ingredients = recipe.get_ingredients_list()
text = recipe.get_text()

# print() applies str() to each argument; sep="" keeps the output unchanged.
print("Ingredients: ", ingredients, sep="")
print("Recipe text: ", text, sep="")

#bitvec = []
#for rec in table:
#    if rec.get_text() != "" and len(rec.get_ingredients_list()) != 0:
#        bitvec.append(1)
#    else:
#        bitvec.append(0)
#coverage = sum(bitvec) / float(len(bitvec))
#coverage *= 100.0
#print("Coverage (% of recipes with ingredients and texts): " + str(coverage) + "%")
#
#compresseds = []
#all_ingredients = table.get_all_ingredients()
#for ing in tqdm(all_ingredients):
#    fv = table.ingredient_to_feature_vector(ing)
#    fv_str = ""
#    for i in fv:
#        fv_str += str(i)