# helpers.py
# forked from hartleybrody/public-amazon-crawler
import os
import pdb
import random
import urllib
from datetime import datetime
from urlparse import urlparse

import eventlet
import redis
from BeautifulSoup import BeautifulSoup
from requests.exceptions import RequestException

import settings
# import_patched imports a module in a "greened" manner: the module's
# blocking network calls are replaced with Eventlet's cooperative green
# versions, so many requests can run concurrently in a single OS thread.
requests = eventlet.import_patched('requests.__init__')
time = eventlet.import_patched('time')
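
# A hedged sketch (comments only, not executed): because `requests` above is
# greened, an eventlet.GreenPool can fan many fetches out cooperatively on a
# single OS thread. `urls` here is a hypothetical list of page URLs:
#
#   pool = eventlet.GreenPool(20)
#   for result in pool.imap(make_request, urls):
#       pass  # each green thread yields to the others while waiting on I/O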

# cumulative counter of the number of requests made
num_requests = 0

# connect to the Redis server that backs the URL queue
redis_client = redis.StrictRedis(host=settings.redis_host, port=settings.redis_port, db=settings.redis_db)

def download_image(url, name):
    '''
    Download the image at `url` and save it in settings.image_dir,
    named after `name` with the URL's original file extension.
    '''
    _, img_ext = os.path.splitext(url)
    save_name = os.path.join(settings.image_dir, str(name) + img_ext)
    urllib.urlretrieve(url, save_name)
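
# Illustrative usage (the URL and product ID are made up):
#   download_image("https://images-na.ssl-images-amazon.com/images/I/41abc.jpg", "B01ABCDE")
# would save the file as <settings.image_dir>/B01ABCDE.jpg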

def make_request(url, return_soup=True):
    '''
    Make a request for the given URL.

    Returns:
        (page, html) - (<BeautifulSoup obj>, <str>) when return_soup is True
        response     - the raw requests Response when return_soup is False
        None         - for redirect URLs and non-200 responses
    '''
    # properly reformat the URL before requesting it
    url = format_url(url)

    # skip redirect URLs
    if "picassoRedirect" in url:
        return None

    # global counter of the number of requests made so far
    global num_requests
    if num_requests >= settings.max_requests:
        raise Exception("Reached the max number of requests: {}".format(settings.max_requests))

    # dictionary of http/https proxy URLs, or None for a direct connection
    proxies = get_proxy()

    try:
        # the Response object contains everything we need:
        #   text        - body text, decoded with a guessed encoding
        #   encoding    - the encoding that was guessed
        #   content     - body as raw bytes
        #   json()      - decoded JSON (when the body is JSON)
        #   raw         - raw socket response (requires stream=True in get())
        #   status_code - HTTP status code; 200 is a good response
        r = requests.get(url, headers=settings.headers, proxies=proxies)
    except RequestException:
        log("WARNING: Request for {} failed, trying again.".format(url))
        # retry the request recursively, preserving the return_soup flag
        return make_request(url, return_soup)

    # record the successful request in the global counter
    num_requests += 1

    if r.status_code != 200:
        os.system('say "Got non-200 Response"')  # audible alert (macOS-only)
        log("WARNING: Got a {} status code for URL: {}".format(r.status_code, url))
        return None

    if return_soup:
        return BeautifulSoup(r.text), r.text
    else:
        return r

def format_url(url):
    '''
    Prepare a URL for requesting: make sure Amazon URLs aren't relative,
    and strip unapproved query args (to reduce the chance of them
    tracking the crawler!)
    '''
    # parse the URL into its structural components
    u = urlparse(url)
    scheme = u.scheme or "https"
    host = u.netloc or "www.amazon.com"
    path = u.path

    # rebuild the query string from the approved arguments only
    if not u.query:
        query = ""
    else:
        query = "?"
        for piece in u.query.split("&"):
            try:
                k, v = piece.split("=", 1)
                if k in settings.allowed_params:
                    query += "{k}={v}&".format(**locals())
            except ValueError:
                pass  # skip malformed pieces that have no "=" separator
        query = query[:-1]  # drop the trailing "&" (or a bare "?")

    # substitute the local variables of the same names into the template
    return "{scheme}://{host}{path}{query}".format(**locals())

def log(msg):
'''
Global logging function
'''
if settings.log_stdout:
try:
print "{}: {}".format(datetime.now(), msg)
except UnicodeEncodeError:
pass # squash logging errors in case of non-ascii text

def get_proxy():
    '''
    Choose a random proxy server to use for this request.
    settings.proxies must be populated, or no proxy will be used.

    Returns:
        dict with "http" and "https" keys, or None when no proxies are configured
    '''
    if not settings.proxies:
        return None

    proxy_ip = random.choice(settings.proxies)

    # build the SOCKS5 proxy URL with the configured credentials
    # (SOCKS support in requests requires the PySocks extra)
    proxy_url = "socks5://{user}:{passwd}@{ip}:{port}/".format(
        user=settings.proxy_user,
        passwd=settings.proxy_pass,
        ip=proxy_ip,
        port=settings.proxy_port,
    )

    return {
        "http": proxy_url,
        "https": proxy_url
    }
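
# Illustrative return value, assuming settings.proxies = ["203.0.113.7"],
# proxy_user "user", proxy_pass "pass" and proxy_port 1080:
#   {"http": "socks5://user:pass@203.0.113.7:1080/",
#    "https": "socks5://user:pass@203.0.113.7:1080/"}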

def enqueue_url(u):
    '''
    Add the url to the Redis listing_url_queue.
    The queue is a Redis set, so duplicates are stored only once.
    '''
    url = format_url(u)
    return redis_client.sadd("listing_url_queue", url)


def dequeue_url():
    '''
    Remove and return (pop) one *random* url from the listing_url_queue.
    Returns None when the queue is empty.
    '''
    return redis_client.spop("listing_url_queue")
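
# Illustrative round trip (the search path is made up):
#   enqueue_url("/s?k=headphones")   # stores the formatted absolute URL
#   dequeue_url()                    # later pops one queued URL at random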

if __name__ == '__main__':
    # test that the proxy server masks the crawler's IP
    r = make_request('https://api.ipify.org?format=json', return_soup=False)
    if r is not None:
        print r.text
    pdb.set_trace()