def extractRecipesFromJSON(allRecipes):
    jsonFilePath = os.path.join(c.PATH_TO_RESOURCES, "aliasdata", "miniAliases.json")
    myDict = util.loadJSONDict(jsonFilePath)

    # Fill allRecipes with just the values for each JSON member,
    # as the values actually contain the keys as a member.
    for _, val in myDict.items():
        allRecipes.append(val)
def createAliasDatapoints(jsonDataFilePath, featureList):
    # Sparse matrix, shape = (n_samples, n_features).
    allPoints = []
    countToAliasName = {}
    allAlias = util.loadJSONDict(jsonDataFilePath)

    count = 0
    for aliasName, aliasData in allAlias.items():
        point = createAliasPoint(aliasData, featureList)
        allPoints.append(point)
        countToAliasName[count] = aliasName
        count += 1

    vec = DictVectorizer()
    dataMatrix = vec.fit_transform(allPoints)
    return dataMatrix, countToAliasName, allPoints
def createRecipeDatapoints(jsonDataFilePath, featureList):
    # Sparse matrix, shape = (n_samples, n_features).
    allPoints = []
    countToRecipeName = {}
    allRecipes = util.loadJSONDict(jsonDataFilePath)

    count = 0
    for recipeName, recipe in allRecipes.items():
        point = createRecipePoint(recipe, featureList)
        allPoints.append(point)
        countToRecipeName[count] = recipeName
        count += 1

    vec = DictVectorizer()
    dataMatrix = vec.fit_transform(allPoints)
    return dataMatrix, countToRecipeName, allPoints
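# A minimal usage sketch (not part of the original file): the sparse matrix returned by
# createRecipeDatapoints can be passed straight to a scikit-learn clusterer. KMeans, the
# helper name, and the n_clusters default here are illustrative assumptions, not the
# project's actual choices.
def clusterRecipeDatapointsExample(jsonDataFilePath, featureList, numClusters=8):
    from sklearn.cluster import KMeans

    dataMatrix, countToRecipeName, _ = createRecipeDatapoints(jsonDataFilePath, featureList)
    kmeans = KMeans(n_clusters=numClusters)
    labels = kmeans.fit_predict(dataMatrix)

    # Map each row index back to its recipe name alongside its cluster label.
    return dict((countToRecipeName[i], int(label)) for i, label in enumerate(labels))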
def initializeTraits(verbose):
    traitsDataPath = os.path.join(constants.PATH_TO_ROOT, "res", "csp_defaultTraits.json")
    traits = util.loadJSONDict(traitsDataPath)
    traits["verbose"] = verbose
    traits["ND"] = nutrientdatabase.NutrientDatabase()
    return traits
def main(argv):
    global ingredientMassDict
    global validAliasDict
    global unitCountDict

    validIngredientsFilePath = os.path.join(c.PATH_TO_RESOURCES, "validIngredients.json")
    validAliasDict = util.loadJSONDict(validIngredientsFilePath)
    conversionDict = util.createWaterConversionDict()
    allRecipes = []

    # Each alias has 3 main fields:
    #   "count"
    #   "aliasBuddies"
    #   "lines"
    aliasData = {}
    ingredientLineDict = {}
    #ingredientMassDict = {}
    #unitCountDict = {}

    # Read in and parse recipe data structures (dictionaries) from a json file.
    extractRecipesFromJSON(allRecipes)

    # Convert all string data to lowercase.
    lowerAllStrings(allRecipes)

    # Needed below for getConversionFactor(); instantiated as in initializeTraits().
    ndb = nutrientdatabase.NutrientDatabase()

    # Collect the ingredient lines associated with each valid ingredient alias.
    unmatched = 0.0
    tried = 0.0
    for recipe in allRecipes:
        for ingredientLineIndex in range(0, len(recipe['ingredientLines'])):
            # Some recipes have more ingredient lines than parsed ingredients.
            if ingredientLineIndex == len(recipe['ingredients']):
                break
            ingredientLine = recipe['ingredientLines'][ingredientLineIndex].encode('ascii', errors='ignore')
            ingredient = recipe['ingredients'][ingredientLineIndex].encode('ascii', errors='ignore')
            if ingredient not in validAliasDict:
                continue
            if ingredient not in ingredientLineDict:
                ingredientLineDict[ingredient] = []
            ingredientLineDict[ingredient].append(ingredientLine)

    # Parse each collected ingredient line for an amount and a unit.
    for ingredient in ingredientLineDict:
        for ingredientLine in ingredientLineDict[ingredient]:
            words = ingredientLine.split()
            potentialStart = removeHyphen(words[0])

            # If the first token is a number, try to extract an amount and unit.
            if isPossibleAmount(words[0]):
                # Handle fractional amounts such as "1/2".
                if '/' in potentialStart:
                    tokens = potentialStart.split('/')
                    first = float(tokens[0])
                    second = float(tokens[1])
                    potentialStart = first / second
                amount = float(potentialStart)

                potentialUnit, foundUnit = extractUnit(words, ingredient, conversionDict)
                if potentialUnit is not None:
                    # Add both the mass and the unit count.
                    if foundUnit:
                        massInGrams = amount * ndb.getConversionFactor(ingredient, potentialUnit)
                    else:
                        massInGrams = amount * conversionDict[potentialUnit]
                    if ingredient not in ingredientMassDict:
                        ingredientMassDict[ingredient] = []
                    ingredientMassDict[ingredient].append(massInGrams)

                    if ingredient not in unitCountDict:
                        unitCountDict[ingredient] = Counter()
                    unitCountDict[ingredient][potentialUnit] += 1
                else:
                    unmatched += 1
            elif not hasAnAmount(words):
                # No amount at all: count the ingredient as unitless.
                if ingredient not in unitCountDict:
                    unitCountDict[ingredient] = Counter()
                unitCountDict[ingredient]['unitless'] += 1
            tried += 1

    print "Missed amounts for " + str(unmatched) + " / " + str(tried) + " ingredients."
    print str((tried - unmatched) / tried * 100) + "% Success rate!"

    # Get the counts of ingredient short names.
    # Create a dictionary storing relationships between the various aliases.
    # Create a dictionary with aliases as keys and lists of lines they've been
    # associated with as values.
    fillAliasData(allRecipes, aliasData)

    #Temporarily removed to test.
    dumpAliasDataToJSONFiles(aliasData)

    # Now create smaller subset files for testing. The subsets are cumulative:
    # "small" holds 250 aliases, "medium" those 250 plus 250 more, and "large"
    # those 500 plus another 500.
    smallAliasData = {}
    for _ in range(250):
        item = aliasData.popitem()
        smallAliasData[item[0]] = item[1]
    smallFilePath = os.path.join(c.PATH_TO_RESOURCES, "aliasData_small.json")
    util.dumpJSONDict(smallFilePath, smallAliasData)

    for _ in range(250):
        item = aliasData.popitem()
        smallAliasData[item[0]] = item[1]
    mediumFilePath = os.path.join(c.PATH_TO_RESOURCES, "aliasData_medium.json")
    util.dumpJSONDict(mediumFilePath, smallAliasData)

    for _ in range(500):
        item = aliasData.popitem()
        smallAliasData[item[0]] = item[1]
    largeFilePath = os.path.join(c.PATH_TO_RESOURCES, "aliasData_large.json")
    util.dumpJSONDict(largeFilePath, smallAliasData)
def createNutrientDataJSON():
    nutrientIDsFilePath = os.path.join(c.PATH_TO_RESOURCES, "allNutrientIDs.json")
    nutrientIDs = util.loadJSONDict(nutrientIDsFilePath)
    buildNutritionalDatabase(nutrientIDs)
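# A minimal entry-point sketch, assuming this module is meant to be run as a script;
# the original file may already define its own entry point elsewhere, and the argv
# slice passed to main() is an assumption.
if __name__ == "__main__":
    import sys
    main(sys.argv[1:])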