Example #1
def __do_trim(args):
    """
    Respond to the request to trim data.
    If the data directory is valid, trim the xml files down to their recipes.
    @param args: ArgParse object
    @return: void
    """
    print("Trimming the xml files to recipes...")
    prep_global._sanitize_input(args, "TRIMMER", "DATA")
    if args.trim:
        config.DATA_DIRECTORY = args.trim
    else:
        config.DATA_DIRECTORY = args.prep[1]
    debug.debug_print("Trimmer activated...")
    trim_all_files_to_recipes()

    # Now you have all of the recipes, but in different files
    # word2vec wants one big recipe file in plain English, so
    # make that and save it.
    print("Creating the big recipe file " + config.RECIPE_FILE_PATH)
    if os.path.exists(config.RECIPE_FILE_PATH):
        print("    |-> Found existing big recipe file at " + \
                str(config.RECIPE_FILE_PATH) + ", using that one.")
    else:
        print("    |-> Could not find existing version. Generating new one...")
        mover._append_all_recipe_files()

        print("    |-> Cleaning the big recipe file, this will take a while...")
        trimmer._clean_file(config.RECIPE_FILE_PATH)

        print("    |-> Stripping big recipe file...")
        myio.strip_file(config.RECIPE_FILE_PATH)
Example #2
def _copy_master_to_data_location():
    """
    Copies the master directory files over the data
    directory. Deletes any files in the data directory
    before copying.
    @return: void
    """
    debug.debug_print(
        "_copy_master_to_data_location called, making assertions...")
    debug.assert_value_is_set(config.MASTER_DATA_DIRECTORY,
                              "config.MASTER_DATA_DIRECTORY")
    debug.assert_value_is_set(config.DATA_DIRECTORY, "config.DATA_DIRECTORY")

    print("Deleting old data files...")
    prep_global._apply_func_to_each_data_file(os.remove)

    print("Copying data files from " + str(config.MASTER_DATA_DIRECTORY) +
          " to " + str(config.DATA_DIRECTORY))
    # Collect files to copy
    list_of_file_names = [
        os.path.join(config.MASTER_DATA_DIRECTORY, file_name)
        for file_name in os.listdir(config.MASTER_DATA_DIRECTORY)
    ]

    for f in list_of_file_names:
        f_name = os.path.split(f)[-1]
        if f_name != "zuni.xml":
            # Don't include zuni - it is super weird
            debug.debug_print("Copying " + f_name + " to new directory...")
            shutil.copyfile(f, os.path.join(config.DATA_DIRECTORY, f_name))
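
# For reference, the copy-everything-except-one-file step above could also be
# expressed with shutil.copytree and an ignore pattern. A minimal sketch,
# assuming Python 3.8+ for dirs_exist_ok; the function above copies file by file.
import shutil

shutil.copytree(config.MASTER_DATA_DIRECTORY,
                config.DATA_DIRECTORY,
                ignore=shutil.ignore_patterns("zuni.xml"),
                dirs_exist_ok=True)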
Example #3
def execute_based_on_args(args):
    """
    Executes the commands of the user based on what args they
    passed in.
    @param args: The arguments the user passed in
    @return: a printable result
    """
    # Check if debug messages should be printed as we go
    if args.verbose:
        chef_global.config.VERBOSE = True

    if args.unit_test:
        # Just do the unit tests and quit
        debug_print("Running unit tests...")
        preprocessor.run_unit_tests()
        statistics.run_unit_tests()
        return "Done with unit tests."

    # if any of the args are a preprocessor command, use the preprocessor:
    if args.trim or args.reset or args.prep or args.tabulate:
        preprocessor.execute_commands(args)
        return "Preprocessor ran successfully."

    if args.train:
        statistics.train_models(args)
        return "Models have been trained"

    if args.similar:
        statistics.ask_similar(args)
        return ""
Example #4
def trim_all_files_to_recipes():
    """
    Trims all of the data files so that they have only recipes in them (with ingredients) and
    XML tags, but no intros, chapter headings, etc.
    @return: void
    """
    debug.debug_print("Calling trim_all_files_to_recipes...")
    debug.assert_value_is_set(config.DATA_DIRECTORY, "config.DATA_DIRECTORY")
    trimmer._trim_all_files_to_recipes()
Example #5
    def __construct_from_file(self, path):
        """
        Constructs the table from the given ingredient file.
        @param path: The path to the ingredient file
        @return: void
        """

        debug.debug_print("Constructing IngredientsTable from " + path + "...")
        f = open(path, 'r')
        for line in f:
            ingredient = line.rstrip().replace(" ", "_")
            debug.debug_print("Adding '" + ingredient + "' to table.")
            self.put(ingredient)
        f.close()
Example #6
def __generate_model_structures(rec_table, table, testing=False):
    """
    Generates all the necessary data structures for training the models.
    @param rec_table: A RecipeTable object
    @param table: An IngredientsTable object containing all the ingredients
                  found in the list of recipes.
    @param testing: Whether we are just testing the data
    @return: The data structures
    """
    recipes = rec_table.get_recipes()

    print("    |-> Generating the variables heading...")
    variables = ["Recipe " + str(i) for i in range(len(recipes))]
    debug.debug_print("Variables: " + os.linesep + str(variables))

    labels = list(table.get_ingredients_list())
    debug.debug_print("Labels: " + os.linesep + str(labels))

    print("    |-> Retrieving scipy version of sparse matrix...")
    sparse_matrix = __retrieve_sparse_matrix(rec_table, labels,
                                             testing).tocoo()
    debug.debug_print("Sparse matrix: " + os.linesep + str(sparse_matrix))

    print("    |-> Retrieving dense representation of matrix...")
    matrix = __retrieve_matrix(sparse_matrix, testing)
    debug.debug_print("Dense matrix: " + os.linesep +
                      str(pd.DataFrame(matrix)))

    return variables, labels, sparse_matrix, matrix
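
# The recipe-by-ingredient matrix retrieved above is essentially a sparse
# incidence matrix. A minimal sketch of how such a matrix can be assembled in
# scipy's COO format on toy data; illustrative only, not the project's
# __retrieve_sparse_matrix.
import numpy as np
from scipy.sparse import coo_matrix

toy_recipes = [["flour", "sugar", "egg"], ["tuna", "egg"]]
toy_labels = ["egg", "flour", "sugar", "tuna"]
col_of = {label: j for j, label in enumerate(toy_labels)}

rows, cols, vals = [], [], []
for i, recipe in enumerate(toy_recipes):
    for ingredient in recipe:
        rows.append(i)                   # one row per recipe
        cols.append(col_of[ingredient])  # one column per ingredient
        vals.append(1)                   # 1 = ingredient appears in recipe

toy_sparse = coo_matrix((vals, (rows, cols)), shape=(len(toy_recipes), len(toy_labels)))
print(toy_sparse.toarray())              # dense counterpart, as in __retrieve_matrix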
Example #7
def __parse_ingredients(cookbook_file_path):
    """
    Takes a cookbook data file path and parses it for its ingredients, storing
    them as a list in ing_tmp by appending them to the end of it.
    @param cookbook_file_path: A path to a cookbook data file (encoded in XML),
                               may or may not be trimmed.
    @return: void
    """
    debug.debug_print("Attempting to parse " + str(cookbook_file_path) +
                      " for ingredients.")
    __parse_between_tags(cookbook_file_path,
                         "<ingredient>",
                         "</ingredient>",
                         __ing_tmp,
                         append=True,
                         keep=config.NEW_RECIPE_LINE)
Example #8
def __trim_non_recipe(cookbook_file_path):
    """
    Takes a cookbook data file path and removes all the non-recipe, non-ingredient
    info from it.
    @param cookbook_file_path: A path to a cookbook data file (encoded in XML)
    @return: void
    """
    debug.debug_print("Attempting to trim " + str(cookbook_file_path))
    tmp_path = "TEMPORARY_FILE_____"
    __parse_between_tags(cookbook_file_path,
                         "<recipe",
                         "</recipe>",
                         tmp_path,
                         append=False,
                         append_tag=config.NEW_RECIPE_LINE)
    myio.overwrite_file_contents(tmp_path, cookbook_file_path)
    os.remove(tmp_path)
Example #9
def __do_reset(args):
    """
    Respond to the request to reset data.
    Reset the data directory from the given master copy location.
    @param args: ArgParse object
    @return: void
    """
    print("Reseting files...")
    prep_global._sanitize_input(args, "MOVER", "DATA")
    if args.reset:
        config.MASTER_DATA_DIRECTORY = args.reset[0]
        config.DATA_DIRECTORY = args.reset[1]
    else:
        config.MASTER_DATA_DIRECTORY = args.prep[0]
        config.DATA_DIRECTORY = args.prep[1]
    debug.debug_print("Mover activated...")
    copy_master_to_data_location()
Example #10
def _compute_sim_stats_test():
    """
    Test for sim stats method.
    """
    recipes = []
    table = ingredients_table.load_from_disk(config.INGREDIENT_TABLE_PATH)
    recipes.append(Recipe(table, ingredients=["apple", "pineapple"]))
    recipes.append(Recipe(table, ingredients=[]))
    recipes.append(Recipe(table, ingredients=[""]))
    recipes.append(Recipe(table, ingredients=["soup", "tuna", "sandwhich"]))
    recipes.append(
        Recipe(table, ingredients=["fish", "salmon", "egg", "flour", "sugar"]))
    recipes.append(Recipe(table, ingredients=["ground_beef", "duck", "tuna"]))
    debug.debug_print("Recipes: " + str(recipes))
    rec_table = recipe_table.RecipeTable(recipes)

    _compute_sim_stats(rec_table)
Example #11
def gather_random_recipes(recipe_file_path, num_recipes, seed=None):
    """
    Gathers num_recipes recipes from the file at recipe_file_path. The
    particular recipes gathered will be uniformly random with no replacement.
    An optional seed can be given to ensure the same recipes will be given
    each call, assuming the same file path.
    @param recipe_file_path: The path to the recipe file. The recipe file must
                             be already preprocessed to have its recipes separated
                             by the recipe separator from the config file.
    @param num_recipes: The number of recipes to give back
    @param seed: The random seed to use. Should be an integer or None.
    @return: A list of recipes, which are just strings of whatever happens
             to be between the file's recipe separator tags.
    """
    total_num_recipes = myio.count_occurrences(recipe_file_path, config.NEW_RECIPE_LINE.lower())

    if total_num_recipes < num_recipes:
        err_msg = "num_recipes cannot exceed the actual total number of recipes " +\
                  "in the recipe file given to gather_random_recipes. You passed in " +\
                  str(num_recipes) + ", but there are only " + str(total_num_recipes) +\
                  " recipes in the recipe file."
        raise ValueError(err_msg)
    elif total_num_recipes == num_recipes:
        debug.debug_print("num_recipes equals the total number of recipes in the " +\
                          "recipe file. This is a little irregular, as this means that " +\
                          "the 'random' recipes will actually just be ALL of the recipes.")
        # All recipes are wanted, so the indices are simply 0 through num_recipes - 1
        recipe_indeces = [i for i in range(0, num_recipes)]
    else:
        # Generate the recipe indices that we should gather:
        print("    |-> Generating the indices...")
        random.seed(seed)
        recipe_indeces = []
        while len(recipe_indeces) != num_recipes:
            next_index = None
            while next_index in recipe_indeces or next_index is None:
                next_index = random.randint(0, total_num_recipes - 1)
            recipe_indeces.append(next_index)

    recipes = []
    print("    |-> Retrieving those recipes...")
    for index in tqdm(recipe_indeces):
        recipe = trimmer._get_recipe_at_index(index, recipe_file_path)
        recipes.append(recipe)

    return recipes
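
# The rejection loop above amounts to uniform sampling without replacement.
# For reference, a minimal sketch of the same index selection using
# random.sample; equivalent behavior under those assumptions, but not the
# project's code.
import random

def pick_recipe_indices(total_num_recipes, num_recipes, seed=None):
    # Uniformly random, no replacement; a fixed seed makes the draw repeatable.
    rng = random.Random(seed)
    return rng.sample(range(total_num_recipes), num_recipes)

print(pick_recipe_indices(total_num_recipes=10, num_recipes=3, seed=42))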
Example #12
def _apply_func_to_each_data_file(func, print_info=False):
    """
    Applies the given function to each data file in the cookbook data
    directory.
    @param func: A function which takes a file path as an input (and
                 presumably, does something with the file)
    @param print_info: If you would like this method to print a message whenever
                       it moves to another file.
    @return: void
    """
    debug_print("Applying a function to each file in the data directory...")

    # Collect each file in the directory
    assert_value_is_set(config.DATA_DIRECTORY, "config.DATA_DIRECTORY")
    print("Value of config.DATA_DIRECTORY: " + str(config.DATA_DIRECTORY))
    list_of_file_names = [os.path.join(config.DATA_DIRECTORY, file_name) for
                            file_name in os.listdir(config.DATA_DIRECTORY)]

    for f in tqdm(list_of_file_names):
        if print_info:
            print("    |-> working on " + f + "...")
        func(f)
Example #13
def __gather_all_recipe_texts():
    """
    Gathers all of the recipes' texts into a list of the form:
    ["text from recipe 1", "text from recipe 2", etc.]
    @return: A list of recipe texts.
    """
    print("        |-> Gathering all of the recipe steps...")
    recipe_producer = myio.get_lines_between_tags(
        config.RECIPE_FILE_SINGLE_PATH, config.NEW_RECIPE_LINE.lower())
    lines_between_tags = next(recipe_producer)
    list_of_recipe_texts = []
    while lines_between_tags is not None:
        recipe = [line.rstrip() for line in lines_between_tags]
        debug.debug_print("        |-> Recipe from this iteration: " +\
                            str(recipe))
        list_of_recipe_texts.append(recipe)
        try:
            lines_between_tags = next(recipe_producer)
        except StopIteration:
            lines_between_tags = None
    return list_of_recipe_texts
Example #14
def __gather_all_ingredients(unique_within_path):
    """
    Gathers all the ingredients into a list of the form:
    [[ingredients from recipe 1], [ingredients from recipe 2], etc.]
    @param unique_within_path: The path to the unique within text file.
    @return: list of ingredients lists
    """
    print("        |-> Gathering all of the ingredients...")
    ingredients_producer= \
        myio.get_lines_between_tags(unique_within_path, config.NEW_RECIPE_LINE.lower())
    lines_between_tags = next(ingredients_producer)
    list_of_ingredients_lists = []
    while lines_between_tags is not None:
        ingredients = [
            line.rstrip().replace(" ", "_") for line in lines_between_tags
        ]
        debug.debug_print("        |-> Ingredients from this iteration: " +\
                          str(ingredients))
        list_of_ingredients_lists.append(ingredients)
        try:
            lines_between_tags = next(ingredients_producer)
        except StopIteration:
            lines_between_tags = None
    return list_of_ingredients_lists
Example #15
def __remove_plurals(file_path):
    """
    Removes obvious plurals (entries that end in s but which have counterparts
    without the final s).
    @param file_path: The path to the ingredient file.
    @return: void
    """
    # This method assumes that the file is small enough to read into memory
    f = open(file_path, 'r')
    print("    |-> reading ingredients into memory...")
    ingredients = [line.rstrip() for line in f]
    f.close()

    print("    |-> searching for plurals...")
    keep = []
    already_removed = BoolTable()
    for ingredient in tqdm(ingredients):
        item_no_s = None
        already_exists_no_s = False
        if ingredient.endswith("s"):
            # Check if 1) we have already found and removed this before
            # or       2) we have an ingredient that is exactly this but without the final s
            item_no_s = ingredient[:-1]
            debug.debug_print("        |-> checking for both '" + ingredient +
                              "' and '" + item_no_s + "'...")
            if already_removed.has(ingredient):
                debug.debug_print("        |-> already removed '" +
                                  ingredient + "', do it again.")
                already_exists_no_s = True
            elif item_no_s in ingredients:
                debug.debug_print("        |-> found '" + item_no_s +
                                  "', so remove '" + ingredient + "'.")
                already_exists_no_s = True
                already_removed.put(ingredient)
        if already_exists_no_s:
            debug.debug_print("        |-> --Removing " + ingredient + "--")
            keep.append("")
        else:
            keep.append(ingredient)

    print("    |-> Writing remaining ingredients to file...")
    f = open(file_path, 'w')
    for ingredient in keep:
        f.write(ingredient + os.linesep)
    f.close()
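
# Stripped of the file I/O and BoolTable bookkeeping, the plural rule above
# keeps an entry unless dropping its trailing 's' yields another entry in the
# list. A small illustrative sketch on toy data.
def drop_obvious_plurals(ingredients):
    present = set(ingredients)
    kept = []
    for ing in ingredients:
        if ing.endswith("s") and ing[:-1] in present:
            kept.append("")   # a blank stands in for the removed plural
        else:
            kept.append(ing)
    return kept

print(drop_obvious_plurals(["carrot", "carrots", "peas"]))  # ['carrot', '', 'peas']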
Example #16
def _compute_sim_stats(rec_table):
    """
    Computes the standard deviation and mean
    of the similarity scores for each recipe in
    the given rec_table.
    @param rec_table: The RecipeTable object.
    @return: The mean, standard deviation
    """
    if config.SIM_MEAN is not None:
        print("    |-> Mean has already been calculated, here it is: " +
              str(config.SIM_MEAN))
        print("    |-> Standard deviation: " + str(config.SIM_STAND_DEV))
        return config.SIM_MEAN, config.SIM_STAND_DEV
    else:
        # The original snippet referenced an undefined w2v here; load the
        # word2vec model once up front (presumably via the same __load_model
        # helper used by _compute_similarity_score in this module).
        w2v = __load_model(config.WORD2VEC_MODEL_PATH)
        total = 0.0
        N = 0
        scores = []
        print("    |-> Computing the mean...")
        for rec in tqdm(rec_table):
            try:
                debug.debug_print("Recipe: " + str(rec))
            except KeyError:
                pass
            if len(rec) == 0:
                debug.debug_print("Skipping this recipe, it is empty.")
            else:
                score = _compute_similarity_score(rec, w2v=w2v)
                debug.debug_print("Similarity: " + str(score))
                scores.append(score)
                if score is not None:
                    total += score
                    N += 1
        mean = total / N

        deviances = []
        print("    |-> Computing the standard deviation...")
        for score in tqdm(scores):
            if score is not None:
                deviance = score - mean
                deviances.append(deviance)
        dev_squared = [deviance * deviance for deviance in deviances]
        sum_of_squares = sum(dev_squared)
        variance = sum_of_squares / N
        std_dev = math.sqrt(variance)

        print("    |-> You should record these values in the config.py: ")
        print("    |-> Mean: " + str(mean))
        print("    |-> Standard deviation: " + str(std_dev))
        return (mean, std_dev)
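
# The loop above computes a population mean and standard deviation over the
# collected similarity scores. A compact sketch of the same statistics using
# the standard library, assuming the scores are already gathered into a list.
import statistics

example_scores = [0.31, 0.45, 0.27, 0.52]
example_mean = statistics.fmean(example_scores)                   # total / N
example_std = statistics.pstdev(example_scores, mu=example_mean)  # divides by N, as above
print(example_mean, example_std)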
Example #17
def __generate_linkage(recipes, table, matrix, testing=False):
    """
    Generate the hierarchical linkage matrix by the clustering algorithm.
    @param recipes: A list of recipe objects
    @param table: An IngredientsTable object containing all the ingredients
                  found in the list of recipes.
    @param matrix: A dense representation of the data matrix.
    @param testing: Whether to use any files found on disk/overwrite
                    those files. If not, temporary ones will be created.
    @return: The row-cluster linkage matrix and the ingredient labels
    """
    #    print("Normalizing row vectors...")
    #    # Normalize the row vector (ingredients), so that they
    #    # all have the same length, which means that ones that
    #    # are in all kinds of recipes will have a much smaller
    #    # score in any particular dimension, whereas those
    #    # that only show up in a couple of recipes will have very
    #    # strong scores in those
    #    normalized_matrix = __normalize_rows(matrix)
    #    debug.debug_print("Normalized matrix: " + os.linesep + str(pd.DataFrame(normalized_matrix)))
    normalized_matrix = matrix

    print("Running PCA on the matrix to reduce dimensionality...")
    matrix_after_pca = __run_pca(normalized_matrix)
    debug.debug_print("Matrix after PCA: " + os.linesep +
                      str(pd.DataFrame(matrix_after_pca)))
    #    matrix_after_pca = normalized_matrix

    #    print("Now scaling the row vectors so that they aren't tiny numbers...")
    #    # multiply each vector by like a thousand or something to make for reasonably
    #    # sized numbers
    #    scale_factor = 1000
    #    scaled_matrix = matrix_after_pca * scale_factor
    #    debug.debug_print("Scaled matrix: " + os.linesep + str(pd.DataFrame(scaled_matrix)))
    scaled_matrix = matrix_after_pca

    print("Generating the variables heading...")
    sm_rows = [row for row in scaled_matrix]
    variables = ["PCA comp " + str(i) for i in range(len(sm_rows[0].getA1()))]
    debug.debug_print("Variables: " + os.linesep + str(variables))

    print("Retrieving dataframe...")
    df = __retrieve_dataframe(scaled_matrix, variables, labels, testing)
    debug.debug_print("Data frame: " + os.linesep + str(df))

    print("Generating row_clusters (takes about 3 or 4 hours)...")
    print("Started at " + myio.print_time())
    row_clusters = linkage(pdist(df, metric="jaccard"), method="ward")

    return row_clusters, labels
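
# For reference, scipy's hierarchical clustering API used above takes a
# condensed distance matrix from pdist plus a linkage method. A self-contained
# sketch on toy data; note that Ward linkage formally assumes Euclidean
# distances, whereas the function above pairs it with the Jaccard metric.
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster

toy_points = np.array([[1.0, 0.0], [1.1, 0.1], [5.0, 4.9], [5.2, 5.1]])
toy_clusters = linkage(pdist(toy_points), method="ward")
print(fcluster(toy_clusters, t=2, criterion="maxclust"))  # e.g. [1 1 2 2]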
Example #18
def __remove_duplicates_between_bounds(file_path, bound, exceptions):
    """
    Searches the file at file_path for bounds, treating the start of the file
    as one, and removes duplicate lines within those bounds. So:

    cream                               cream
    money                               money
    cream
    cream
    peanuts                             peanuts
    BOUND                               BOUND
    cream      would turn into -->      cream
    money                               money
    spinach                             spinach
    waffles                             waffles
    cream
    BOUND                               BOUND

    @param file_path: The path to the file that will be searched
    @param bound: The bound
    @param exceptions: All the exceptions to keep regardless of repeats
    @return: void
    """
    all_lines_to_keep = []
    lines_to_keep = []
    f = open(file_path, 'r')
    for line in tqdm(f):
        if line.rstrip() == bound.rstrip():
            debug.debug_print("Found a bound, adding lines...")
            lines_to_keep.append(line)
            all_lines_to_keep.extend(lines_to_keep)
            lines_to_keep = []
        elif line in lines_to_keep and line not in exceptions:
            debug.debug_print("Found a repeat, skipping it. Repeat was: " +
                              line.rstrip())
            lines_to_keep.append(os.linesep)
        else:
            debug.debug_print("Keeping line: " + line.rstrip())
            lines_to_keep.append(line)

    all_lines_to_keep.extend(lines_to_keep)
    f.close()
    f = open(file_path, 'w')
    for line in all_lines_to_keep:
        f.write(line)
    f.close()
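
# The same segment-wise de-duplication, stripped of file handling: within each
# bounded segment only the first occurrence of a line is kept, and repeats
# become blank placeholders. An illustrative sketch, not the project's code.
def dedup_within_segments(lines, bound, exceptions=()):
    out, segment = [], []
    for line in lines:
        if line == bound:
            segment.append(line)
            out.extend(segment)
            segment = []
        elif line in segment and line not in exceptions:
            segment.append("")   # repeat becomes a blank placeholder
        else:
            segment.append(line)
    out.extend(segment)          # flush anything after the final bound
    return out

print(dedup_within_segments(
    ["cream", "money", "cream", "peanuts", "BOUND", "cream", "money"], "BOUND"))
# ['cream', 'money', '', 'peanuts', 'BOUND', 'cream', 'money']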
Example #19
def _compute_similarity_score(ingredients, w2v=None):
    """
    Computes an average similarity for all the ingredients
    in the given list and returns it.
    @param ingredients: The list of ingredients.
    @return: The similarity score.
    """
    if w2v is None:
        w2v = __load_model(config.WORD2VEC_MODEL_PATH)
    try:
        debug.debug_print("Computing sim score for ingredients: " +
                          str(ingredients))
    except KeyError:
        pass

    combos_already_seen = []
    num_scores = 0
    score = 0.0
    for ingredient in ingredients:
        for other in ingredients:
            combo = (ingredient, other)
            already_seen = combo in combos_already_seen
            ingredient_and_other_are_same = ingredient == other
            if already_seen or ingredient_and_other_are_same:
                pass
            else:
                try:
                    similarity = w2v.similarity(ingredient, other)
                    score += similarity
                    combos_already_seen.append(combo)
                    # Also append the reverse of the combo
                    combos_already_seen.append((other, ingredient))
                    num_scores += 1
                except KeyError as e:
                    #print(str(e))
                    pass

    if num_scores == 0:
        debug.debug_print("Can't compute z score.")
        return None
    else:
        debug.debug_print("Can compute z score: " + str(score / num_scores))
        return score / num_scores
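
# The double loop plus the combos_already_seen bookkeeping averages the
# similarity over each unordered pair exactly once. itertools.combinations
# expresses the same idea more directly; this sketch assumes a gensim-style
# model exposing similarity(a, b), as the call above suggests.
from itertools import combinations

def average_pairwise_similarity(ingredients, w2v):
    pair_scores = []
    for a, b in combinations(ingredients, 2):
        try:
            pair_scores.append(w2v.similarity(a, b))
        except KeyError:
            pass  # one of the words is not in the model's vocabulary
    return sum(pair_scores) / len(pair_scores) if pair_scores else None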
Example #20
def __parse_between_tags(file_path,
                         start_tag,
                         stop_tag,
                         tmp_file_path,
                         append=False,
                         append_tag="",
                         keep=""):
    """
    Parses out a chunk of text from the given file between the start tag and the
    stop tag and puts that text into the given tmp_file.
    @param file_path: The path to the file to parse
    @param start_tag: The start tag
    @param stop_tag: The stop tag
    @param tmp_file_path: The path to the tmp file to write to
    @param append: Whether or not to append to the tmp file (if not, overwrite)
    @param append_tag: An optional string to add to the tmp_file after each parsed item.
    @param keep: An optional string to keep instances of from original file
    @return: void
    """
    append_or_overwrite = 'a' if append else 'w'
    cookbook_file = open(file_path, 'r')
    tmp_file = open(tmp_file_path, append_or_overwrite)

    buf = ""
    tmp_buf = ""
    recording = False
    # New algorithm
    # Take in the input file char by char, read into a buffer until that buffer
    # is longer than 10,000 chars or the start tag is in the buffer. Either way, purge it.
    # If the start tag was in the buffer though, we need to start saving the chars as we
    # read them, until we reach the end tag, at which point we need to strip the buffer
    # of the end tag and write the resulting buffer to the tmp file. Then continue.
    for line_from_original in cookbook_file:
        line_from_original = line_from_original.encode("utf-8")
        line_from_original = line_from_original.decode("utf-8")
        for char in line_from_original:
            if recording:
                tmp_buf += char
                if stop_tag in tmp_buf:
                    tmp_buf = tmp_buf[:-len(stop_tag)]
                    debug.debug_print("Found stop tag. Writing to file: " +
                                      tmp_buf)
                    to_write = tmp_buf + os.linesep + append_tag + os.linesep
                    to_write = to_write.strip() + os.linesep
                    tmp_file.write(to_write)
                    tmp_buf = ""
                    recording = False
            else:
                buf += char
                if buf[-len(start_tag):] == start_tag:
                    debug.debug_print("Found start tag...")
                    recording = True
                    buf = ""
                elif buf[-len(keep):] == keep:
                    debug.debug_print("Found a keeper")
                    tmp_file.write(keep + os.linesep)
                elif len(buf) > 10000:
                    debug.debug_print("Purging buffer")
                    buf = ""
    cookbook_file.close()
    tmp_file.close()
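
# A stripped-down sketch of the buffering idea described in the comment block
# above: scan character by character, start capturing after the start tag, and
# emit the captured text when the stop tag appears. Illustrative only; the real
# function also handles the keep tag and streams to a temporary file.
def extract_between_tags(text, start_tag, stop_tag):
    chunks, buf, tmp_buf, recording = [], "", "", False
    for char in text:
        if recording:
            tmp_buf += char
            if tmp_buf.endswith(stop_tag):
                chunks.append(tmp_buf[:-len(stop_tag)])  # drop the stop tag itself
                tmp_buf, recording = "", False
        else:
            buf += char
            if buf.endswith(start_tag):
                recording, buf = True, ""   # start capturing after the start tag
            elif len(buf) > 10000:
                buf = ""                    # purge so the buffer stays bounded
    return chunks

print(extract_between_tags("x<recipe>soup</recipe><recipe>stew</recipe>",
                           "<recipe>", "</recipe>"))  # ['soup', 'stew']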
Example #21
    def __iter__(self):
        """
        Yields each recipe stored in this table.
        """
        print("        |-> Iterating over ingredients lists...")
        for recipe in self.recipes:
            debug.debug_print("YIELDING: " + str(recipe))
            yield recipe
Example #22
def _get_recipe_at_index(index, recipe_file_path):
    """
    Gets all the lines between the new recipe tag at index - 1
    and the new recipe tag at index in the given recipe file.
    @param index: The index of the new recipe tag.
    @param recipe_file_path: The recipe file to look through.
    @return: The lines.
    """
    debug.debug_print("Recipe file: " + str(recipe_file_path))
    recipe_file = open(recipe_file_path, 'r')
    line_location = 0

    # First, read the file until we find the new recipe tag at index - 1
    if index == 0:
        # if index is 0, just use the start of the file as index - 1
        pass
    else:
        count_of_new_file_line = 0
        # Read the file until we find the right new recipe line
        debug.debug_print("Looking for new recipe index " + str(index - 1) +
                          "...")
        while count_of_new_file_line != index - 1:
            for line in recipe_file:
                line_location += 1
                if line.strip() == config.NEW_RECIPE_LINE.lower():
                    count_of_new_file_line += 1
                    if count_of_new_file_line == index - 1:
                        debug.debug_print("Found right index at file line " +
                                          str(line_location))
                        break
    recipe_file.close()

    debug.debug_print("Now retrieving lines...")
    number_of_lines = myio.get_number_of_lines(recipe_file_path)
    recipe_file = open(recipe_file_path, 'r')
    debug.debug_print("Spooling to line " + str(line_location) +
                      " out of a total " + str(number_of_lines))
    recipe = []
    found = False
    for line_number, line in enumerate(recipe_file):
        if found:
            debug.debug_print("Gathering ingredient: " + str(line.strip()) +
                              " at line " + str(line_number))
            if line.strip() == config.NEW_RECIPE_LINE.lower():
                recipe_file.close()
                debug.debug_print("Found the recipe. Returning: ")
                debug.debug_print(str(recipe))
                return recipe
            else:
                recipe.append(line.strip())
        elif line_number == line_location:
            found = True
            debug.debug_print("Found line " + str(line_location))

    # If we have gotten here, something went wrong
    raise ValueError("Logic error. There's a bug in this method.")
Example #23
def _get_random_similar_ingredients(num_ingredients,
                                    rec_table,
                                    w2v=None,
                                    seed=None):
    """
    Returns num_ingredients random ingredients that are
    'similar' to one another.
    @param num_ingredients: The number of ingredients to get.
    @param rec_table: A RecipeTable object
    @param seed: A seed for choosing the same ones everytime.
    @return: The ingredients that are similar.
    """
    if num_ingredients < 1:
        raise ValueError("Number of ingredients must be more than 0. Given: " +
                         str(num_ingredients))

    # TODO: in desperate need of refactor
    if w2v is None:
        w2v = __load_model(config.WORD2VEC_MODEL_PATH)
    kmeans = __load_model(config.KMEANS_MODEL_PATH)

    all_ingredients_in_w2v = False
    while not all_ingredients_in_w2v:
        stored = []
        cluster = None
        while cluster is None:
            seed_ingredient = rec_table.get_random_ingredient(seed)
            if num_ingredients == 1:
                return [seed_ingredient]
            stored.append(seed_ingredient)
            if len(stored) == num_ingredients:
                break
            else:
                debug.debug_print("Got random ingredient: " +
                                  str(seed_ingredient))
                feature_vector = rec_table.ingredient_to_feature_vector(
                    seed_ingredient)
                seed_cluster_index = (kmeans.predict(
                    np.array(feature_vector).reshape(1, -1)))[0]
                debug.debug_print("Cluster index for this feature vector: " +
                                  str(seed_cluster_index))
                cluster = rec_table.get_cluster(seed_cluster_index)

        debug.debug_print("Stored ingredients: " + str(stored))
        stored = list(set(stored))
        ingredients = []
        if stored:
            ingredients.extend(stored)
        debug.debug_print("Ingredients after adding stored: " +
                          str(ingredients))
        converged = False
        for j in range(1000):
            debug.debug_print(
                "Attempting to find some similar ingredients, iteration: " +
                str(j))
            if cluster is not None:
                while len(ingredients) != num_ingredients:
                    index = random.randint(0, len(cluster.ingredients) - 1)
                    if cluster.ingredients[index] in ingredients:
                        pass
                    else:
                        ingredients.append(cluster.ingredients[index])
            else:
                debug.debug_print("No cluster to pull from.")
            #print("On iteration " + str(j) + " found these ingredients: " + str(ingredients))
            similarity = _compute_similarity_measure(ingredients, w2v)
            #print("Similarity for these ingredients: " + str(similarity))
            if similarity is None or similarity < 0.2:
                debug.debug_print("Did not converge on iteration: " + str(j))
                converged = False
                if len(stored) == num_ingredients:
                    # we walked through with just the random seed ingredients, and they didn't
                    # work together. Empty them out.
                    ingredients = []
                else:
                    ingredients = stored
                debug.debug_print("Trying again...")
            else:
                debug.debug_print("Converged!")
                converged = True
                break
        if not converged:
            break

        debug.debug_print(
            "Going to attempt to calculate similarity matrix now...")
        try:
            _compute_similarity_matrix(ingredients, w2v)
            all_ingredients_in_w2v = True
            debug.debug_print("And they are all in w2v, so we can move on.")
        except KeyError:
            all_ingredients_in_w2v = False
            debug.debug_print("But they were not all in w2v, so we try again.")

    if not converged:
        print("Could not converge on " + str(num_ingredients) +
              " similar items.")
        return None
    else:
        return ingredients
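
# At its core, the seeding step above asks a fitted k-means model which cluster
# a seed ingredient's feature vector falls into and then draws candidates from
# that cluster. A toy sketch of that cluster lookup with scikit-learn; the
# vectors and ingredient names here are made up, not the project's feature space.
import numpy as np
from sklearn.cluster import KMeans

toy_vectors = {"salt": [0.1, 0.2], "pepper": [0.2, 0.1],
               "sugar": [0.9, 0.8], "honey": [0.8, 0.9]}
toy_kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(
    np.array(list(toy_vectors.values())))

seed_vector = np.array(toy_vectors["salt"]).reshape(1, -1)
seed_cluster_index = toy_kmeans.predict(seed_vector)[0]

# Candidates are the other ingredients assigned to the same cluster.
candidates = [name for name, vec in toy_vectors.items()
              if toy_kmeans.predict(np.array(vec).reshape(1, -1))[0] == seed_cluster_index]
print(seed_cluster_index, candidates)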