/
search.py
116 lines (106 loc) · 4.4 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import sys
from json import loads
import operator
import re
# Pickle-based caching of the recipe list was tried and is currently disabled.
#import cPickle as pickle
import redis
import sys
from os.path import abspath
# `up` is the parent of the *current working directory* (not of this file):
# the absolute path of "." with its last '/'-separated component dropped.
# It is appended to sys.path before the `config` import below.
# NOTE(review): presumably `config` lives in that parent directory -- confirm.
up = '/'.join(abspath(".").split('/')[:-1])
sys.path.append(up)
# NOTE(review): `redis_hostname` (used below) is not defined in this file;
# presumably it is supplied by this star import -- confirm.
from config import *
import pattern
from pattern.en import singularize, wordnet
# From here on the name `redis` refers to this client instance, no longer to
# the `redis` module imported above.
redis = redis.StrictRedis(host=redis_hostname)
def get_recipes(search, limit=10):
    """Return the best-scoring recipes for a search string.

    Every value stored under a key matching ``*recipe*`` is decoded from JSON
    and scored with ``compute_match``; recipes whose score is not positive
    are dropped.

    Args:
        search: free-text query string.
        limit: maximum number of results to return. Defaults to 10, the cap
            that used to be hard-coded.

    Returns:
        A list of ``(recipe, score)`` tuples ordered from highest to lowest
        score, containing at most ``limit`` entries.
    """
    # Pickling the recipe list did not prove any faster than loading the
    # values straight from redis, so they are fetched on every call.
    results = []
    for key in redis.keys('*recipe*'):
        raw = redis.get(key)
        if raw is None:
            # The key disappeared between the KEYS call and this GET;
            # skip it instead of crashing in loads(None).
            continue
        recipe = loads(raw)
        score = compute_match(search, recipe)
        if score > 0:
            results.append((recipe, score))
    # Sort ascending and then reverse (rather than reverse=True) so the
    # relative order of equally scored recipes stays exactly as before.
    ranked = sorted(results, key=operator.itemgetter(1))
    ranked.reverse()
    return ranked[:max(limit, 0)]
def try_similarity(w1, w2):
    """Return ``wordnet.similarity(w1, w2)``, or 0 when that call fails.

    This is a best-effort wrapper: any ordinary error raised by the
    similarity lookup is treated as "no similarity".

    Args:
        w1: first item to compare.
        w2: second item to compare.

    Returns:
        Whatever ``wordnet.similarity`` returns, or 0 if it raised.
    """
    try:
        return wordnet.similarity(w1, w2)
    except Exception:
        # Deliberately broad, but not a bare ``except``: KeyboardInterrupt
        # and SystemExit must still propagate so that a long scoring run
        # can be interrupted.
        return 0
def compute_match(search, recipe):
    """Score how well ``recipe`` matches the free-text ``search`` string.

    Scoring works this way:
      * At least one search term must appear in the recipe name, otherwise
        the score is 0. A term counts once per occurrence in the lower-cased
        name, provided it is not immediately followed by another word
        character ("chili" matches "beef chili" and "chili, mild" but not
        "chilies").
      * If the query has more than one term and the whole phrase appears in
        the name, the name score is doubled.
      * The body score is the sum of ``try_similarity(term, word)`` over
        every search term and every word of the description plus
        instructions, divided by the number of body words, weighted by 100.

    An earlier draft also filtered on dietary keywords (vegan, gluten-free,
    slow cooker, ...); that logic was disabled and is not part of the score.

    Args:
        search: query string; terms are separated by whitespace.
        recipe: dict with a ``'name'`` string and, optionally,
            ``'description'`` (string) and ``'recipeInstructions'``
            (iterable of strings).

    Returns:
        A number; 0 means the recipe does not match.
    """
    # split() without an argument drops the empty terms that repeated or
    # trailing spaces would otherwise produce.
    terms = search.lower().split()
    if not terms:
        return 0

    rname = recipe['name'].lower()

    # Terms are escaped so user input such as "c++" or "(" is matched
    # literally instead of being interpreted as (possibly invalid) regex.
    # The negative lookahead replaces a required trailing space, which made
    # a term at the very end of the name impossible to match.
    name_score = sum(
        len(re.findall(re.escape(term) + r'(?!\w)', rname)) for term in terms)

    # Double the name score if the whole multi-word phrase is in the name.
    if len(terms) > 1 and ' '.join(terms) in rname:
        name_score = name_score * 2

    if name_score <= 0:
        # Only return a recipe when the search hits its name.
        return 0

    description = recipe.get('description') or ''
    try:
        steps = ' '.join(recipe.get('recipeInstructions') or [])
    except TypeError:
        # Instructions are not an iterable of strings; fall back to the
        # description alone.
        steps = ''
    # Compare against body *words*; iterating the string itself would walk
    # it character by character.
    body_words = (description + ' ' + steps).lower().split()

    body_score = 0
    if body_words:
        total = sum(try_similarity(term, word)
                    for word in body_words
                    for term in terms)
        # Normalise by body length so long recipes are not favoured.
        body_score = total / float(len(body_words))
    return name_score + 100 * body_score
def outliers(recipes):
    """Placeholder for re-ranking search results; returns ``recipes`` as is.

    Args:
        recipes: the ranked results to post-process.

    Returns:
        The same ``recipes`` object, unmodified.
    """
    # TODO: adjust the order of recipes based on the similarity of their
    # ingredient lists with the recipes ranked above them. The previous
    # body only made an unused copy and ran a loop with no effect.
    return recipes
if __name__ == '__main__':
    # Command-line entry point: search for the words given as arguments and
    # print one "<recipe name> <score>" line per result.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: search.py <search terms>')
    # Join all arguments so an unquoted multi-word query works as well as a
    # single quoted one.
    query = ' '.join(sys.argv[1:])
    for recipe, score in get_recipes(query):
        # %-formatting prints the same "name score" text under the Python 2
        # print statement and the print() function.
        print('%s %s' % (recipe['name'], score))