forked from simon987/irarchives
-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
executable file
·403 lines (351 loc) · 12.6 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
import json
import re
from os import path
from flask import Blueprint, Response, request
from common import DBFILE
from DB import DB
from Httpy import Httpy
from ImageHash import avhash, thumb_path, image_from_buffer
from util import clean_url, sort_by_ranking, is_user_valid
search_page = Blueprint('search', __name__, template_folder='templates')
AlphaNum = re.compile(r'[\W_]+')
db = DB(DBFILE)
web = Httpy()
@search_page.route("/search")
def search():
""" Searches for a single URL, prints results """
if "q" in request.args:
query = request.args["q"]
lquery = query.lower()
else:
return Response(json.dumps(""))
# Cache
if lquery.startswith('cache:'):
return search_cache(query[len('cache:'):])
# User
elif lquery.startswith('user:'):
return search_user(query[len('user:'):])
elif 'reddit.com/u/' in lquery:
return search_user(query[lquery.find('/u/') + 3:])
elif 'reddit.com/user/' in lquery:
return search_user(query[lquery.find('/user/') + 6:])
# Text
elif lquery.startswith('text:'):
return search_text(query[len('text:'):])
# Post
elif 'reddit.com/r/' in query and '/comments/' in query:
# Reddit post
if not query.endswith('.json'):
query += '.json'
r = web.get(query)
if '"url": "' in r:
query = web.between(r, '"url": "', '"')[0]
# URL
if ' ' in query:
query = query.replace(' ', '%20')
try:
query, posts, comments, related, downloaded = get_results_tuple_for_image(query)
except Exception as e:
return Response(json.dumps({'error': str(e)}), mimetype="application/json")
return Response(json.dumps({
'posts': posts,
'comments': comments,
'url': query,
'related': related
}), mimetype="application/json")
def search_user(user):
""" Returns posts/comments by a reddit user """
if user.strip() == '' or not is_user_valid(user):
raise Exception('invalid username')
posts = []
comments = []
related = []
# This search will pull up all posts and comments by the user
# NOTE It will also grab all comments containing links in the user's posts (!)
query_text = 'postid IN '
query_text += '(SELECT DISTINCT id FROM Posts '
query_text += 'WHERE author LIKE "%s" ' % user
query_text += 'ORDER BY ups DESC LIMIT 50) '
query_text += 'OR '
query_text += 'commentid IN '
query_text += '(SELECT DISTINCT id FROM Comments '
query_text += 'WHERE author LIKE "%s" ' % user
query_text += 'ORDER BY ups DESC LIMIT 50) '
query_text += 'GROUP BY postid, commentid' # LIMIT 50'
# To avoid comments not created by the author, use this query:
# query_text = 'commentid = 0 AND postid IN (SELECT DISTINCT id FROM Posts WHERE author LIKE "%s" ORDER BY ups DESC LIMIT 50) OR commentid IN (SELECT DISTINCT id FROM Comments WHERE author LIKE "%s" ORDER BY ups DESC LIMIT 50) GROUP BY postid, commentid LIMIT 50' % (user, user)
images = db.select('urlid, albumid, postid, commentid', 'Images', query_text)
for (urlid, albumid, postid, commentid) in images:
# Get image's URL, dimensions & size
if commentid != 0:
# Comment
try:
comment_dict = build_comment(commentid, urlid, albumid)
comments.append(comment_dict)
except:
pass
else:
# Post
try:
post_dict = build_post(postid, urlid, albumid)
posts.append(post_dict)
related += build_related_comments(postid, urlid, albumid)
except:
pass
posts = sort_by_ranking(posts)
comments = sort_by_ranking(comments)
for com in comments:
for rel in related:
if rel['hexid'] == com['hexid']:
related.remove(rel)
break
return Response(json.dumps({
'url': 'user:%s' % user,
'posts': posts,
'comments': comments,
'related': related
}), mimetype="application/json")
def search_cache(url):
"""
Prints list of images inside of an album
The images are stored in the database, so 404'd albums
can be retrieved via this method (sometimes)
"""
try:
url = clean_url(url)
except Exception as e:
return Response(json.dumps({"error": str(e)}), mimetype="application/json")
images = []
query_text = 'id IN (SELECT urlid FROM Images WHERE albumid IN (SELECT DISTINCT id FROM albums WHERE url LIKE "%s"))' \
% (url,)
image_tuples = db.select('id, url', 'ImageURLs', query_text)
for (urlid, imageurl) in image_tuples:
image = {
'thumb': path.join(thumb_path(urlid), '%d.jpg' % urlid),
'url': imageurl
}
images.append(image)
return Response(json.dumps({
'url': 'cache:%s' % url,
'images': images
}), mimetype="application/json")
def search_text(text):
""" Prints posts/comments containing text in title/body. """
text = AlphaNum.sub('', text)
posts = []
comments = []
related = []
query_text = 'commentid = 0 AND postid IN (SELECT DISTINCT id FROM Posts WHERE title LIKE "%%%s%%" or text LIKE "%%%s%%" ORDER BY ups DESC LIMIT 50) OR commentid IN (SELECT DISTINCT id FROM Comments WHERE body LIKE "%%%s%%" ORDER BY ups DESC LIMIT 50) GROUP BY postid, commentid LIMIT 50' \
% (text, text, text)
images = db.select('urlid, albumid, postid, commentid', 'Images', query_text)
for (urlid, albumid, postid, commentid) in images:
# Get image's URL, dimensions & size
if commentid != 0:
# Comment
try:
comment_dict = build_comment(commentid, urlid, albumid)
comments.append(comment_dict)
except:
pass
else:
# Post
try:
post_dict = build_post(postid, urlid, albumid)
posts.append(post_dict)
related += build_related_comments(postid, urlid, albumid)
except:
pass
posts = sort_by_ranking(posts)
comments = sort_by_ranking(comments)
return Response(json.dumps({
'url': 'text:%s' % text,
'posts': posts,
'comments': comments,
'related': related
}), mimetype="application/json")
###################
# Helper methods
def get_results_tuple_for_image(url):
""" Returns tuple of posts, comments, related for an image """
try:
(hashid, downloaded) = get_hashid(url)
if hashid == -1: # No hash matches
return url, [], [], [], downloaded
image_hashes = db.select('hash', 'Hashes', 'id = %d' % hashid)
if not image_hashes:
raise Exception('could not get hash for %s' % url)
image_hash = image_hashes[0][0]
except Exception as e:
raise e
return get_results_tuple_for_hash(url, image_hash, downloaded)
def get_results_tuple_for_hash(url, image_hash, downloaded):
posts = []
comments = []
related = [] # Comments contaiing links found in posts
# Get matching hashes in 'Images' table.
# This shows all of the posts, comments, and albums containing the hash
query_text = 'hashid IN'
query_text += ' (SELECT id FROM Hashes WHERE hash = "%s")' % image_hash
query_text += ' GROUP BY postid, commentid'
query_text += ' LIMIT 50'
images = db.select('urlid, albumid, postid, commentid', 'Images', query_text)
for (urlid, albumid, postid, commentid) in images:
# Get image's URL, dimensions & size
if commentid != 0:
# Comment
try:
comment_dict = build_comment(commentid, urlid, albumid)
if comment_dict['author'] == 'rarchives':
continue
comments.append(comment_dict)
except:
pass
else:
# Post
try:
post_dict = build_post(postid, urlid, albumid)
posts.append(post_dict)
for rel in build_related_comments(postid, urlid, albumid):
if rel['author'] == 'rarchives':
continue
related.append(rel)
except:
pass
for com in comments:
for rel in related:
if rel['hexid'] == com['hexid']:
related.remove(rel)
break
posts = sort_by_ranking(posts)
comments = sort_by_ranking(comments)
return url, posts, comments, related, downloaded
def get_hashid(url, timeout=10):
"""
Retrieves hash ID ('Hashes' table) for image.
Returns -1 if the image's hash was not found in the table.
Does not modify DB! (read only)
"""
cleaned_url = clean_url(url)
existing = db.select('hashid', 'ImageURLs', 'url LIKE "%s"' % cleaned_url)
if existing:
return existing[0][0], False
# Download image
image_buffer = web.download(url, timeout=timeout)
if not image_buffer:
raise Exception('unable to download image at %s' % url)
# Get image hash
try:
image = image_from_buffer(image_buffer)
image_hash = str(avhash(image))
except:
# Failed to get hash, delete image & raise exception
raise Exception("Could not identify image")
hashids = db.select('id', 'Hashes', 'hash = "%s"' % image_hash)
if not hashids:
return -1, True
return hashids[0][0], True
###################
# "Builder" methods
def build_post(postid, urlid, albumid):
""" Builds dict containing attributes about a post """
item = {
'thumb': path.join(thumb_path(urlid), '%d.jpg' % urlid),
}
# Get info about post
(postid,
item['hexid'],
item['title'],
item['url'],
item['text'],
item['author'],
item['permalink'],
item['subreddit'],
item['comments'],
item['ups'],
item['downs'],
item['score'],
item['created'],
item['is_self'],
item['over_18']) = db.select('*', 'Posts', 'id = %d' % postid)[0]
# Get info about image
(item['imageurl'],
item['width'],
item['height'],
item['size']) \
= db.select('url, width, height, bytes', 'ImageURLs', 'id = %d' % urlid)[0]
# Set URL to be the album (if it's an album)
if albumid != 0:
item['url'] = db.select("url", "Albums", "id = %d" % albumid)[0][0]
return item
def build_comment(commentid, urlid, albumid):
""" Builds dict containing attributes about a comment """
item = {
'thumb': path.join(thumb_path(urlid), '%d.jpg' % urlid),
}
# Thumbnail
if not path.exists(item['thumb']):
item['thumb'] = ''
# Get info about comment
(comid,
postid,
item['hexid'],
item['author'],
item['body'],
item['ups'],
item['downs'],
item['created']) \
= db.select('*', 'Comments', 'id = %d' % commentid)[0]
# Get info about post comment is replying to
(item['subreddit'],
item['permalink'],
item['postid']) \
= db.select('subreddit, permalink, hexid', 'Posts', 'id = %d' % postid)[0]
# Get info about image
(item['imageurl'],
item['width'],
item['height'],
item['size']) \
= db.select('url, width, height, bytes', 'ImageURLs', 'id = %d' % urlid)[0]
if albumid != 0:
item['url'] = db.select("url", "Albums", "id = %d" % albumid)[0][0]
return item
def build_related_comments(postid, urlid, albumid):
""" Builds dict containing attributes about a comment related to a post"""
items = [] # List to return
# return items
# Get info about post comment is replying to
(postsubreddit,
postpermalink,
posthex) \
= db.select('subreddit, permalink, hexid', 'Posts', 'id = %d' % postid)[0]
# Get & iterate over comments
for (comid,
postid,
comhexid,
comauthor,
combody,
comups,
comdowns,
comcreated) \
in db.select('*', 'Comments', 'postid = %d' % postid):
item = {
# Post-specific attributes
'subreddit': postsubreddit,
'permalink': postpermalink,
'postid': posthex,
# Comment-specific attributes
'hexid': comhexid,
'author': comauthor,
'body': combody,
'ups': comups,
'downs': comdowns,
'created': comcreated,
'thumb': '',
# Image-specific attributes (irrelevant)
'imageurl': '',
'width': 0,
'height': 0,
'size': 0
}
items.append(item)
return items