-
Notifications
You must be signed in to change notification settings - Fork 3
/
spider.py
executable file
·149 lines (130 loc) · 4.5 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/home/tea/venv2.7/bin/python2.7
# coding=utf-8
__author__ = 'vic'
import json, time, sys, re
import mechanize
from crud import MongoCRUD
# from LatLngUtil import PointOnEarth
from conf import types, app_keys
from redis_queue import RedisQueues
# NOTE: everything between the triple quotes below is a bare string literal,
# not live code. The statement is evaluated and discarded, so neither
# `init_browser` nor `Browser` is defined when this module is imported or run.
# The text describes a mechanize-based fetcher: a shared browser configured
# with robots.txt handling disabled and a desktop User-Agent, plus a small
# wrapper class whose get_html() opens a URL and JSON-decodes the body.
'''
def init_browser():
# Browser
br = mechanize.Browser()
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follows refresh 0 but not hangs on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-Agent',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15. 0.1 FirePHP/0.7.1 AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.162 Safari/535.19')]
return br
# Want debugging messages?
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
# User-Agent (this is cheating, ok?)
init_browser = init_browser()
class Browser():
def __init__(self, url):
self.url = url
def get_html(self):
br = init_browser.open(self.url)
html = br.read()
josn_html = json.loads(html)
return josn_html
'''
class GooglePlacesParser():
    """Build Google Places search URLs, queue them, and store fetched results.

    The class ties together three collaborators:

    * ``self.crud`` (``MongoCRUD``) -- supplies the locations to search around
      and stores the places returned by the API.
    * ``self.redis_queue`` -- the queue object passed to the constructor; only
      its ``put(url)`` method is used here.
    * ``conf.app_keys`` -- a mapping of API key -> number of requests already
      built with that key. A key is retired once its counter exceeds 999
      (presumably the per-key quota -- TODO confirm).
    """

    def __init__(self, redis_que):
        """Set up storage, the API-key pool and the URL queue.

        Args:
            redis_que: queue object exposing ``put(url)``.

        Exits the process (via ``app_keys_pop``) when ``conf.app_keys`` is
        empty.
        """
        self.crud = MongoCRUD()
        # NOTE: this is the conf.app_keys mapping itself, not a copy, so the
        # counters incremented below are visible to every user of conf.
        self.keys_count = app_keys
        # list(...) instead of .keys(): .pop() must work even where keys()
        # returns a view rather than a list.
        self.keys = list(app_keys)
        self.key_begin = self.app_keys_pop()
        self.redis_queue = redis_que

    def app_keys_pop(self):
        """Remove and return the next unused API key.

        Prints a notice and terminates the process with ``sys.exit()`` when
        no unused key is left.
        """
        if self.keys:
            return self.keys.pop()
        print("*------*-*all app keys have been used*-*-----*")
        sys.exit()

    def app_keys_count(self, key):
        """Charge one request to ``key`` and return the key that was charged.

        When ``key`` has already been used more than 999 times, the next key
        from the pool is taken (and charged) instead. The charged key becomes
        ``self.key_begin``.

        NOTE(review): the original indentation was lost; this follows the
        reading in which only the key switch is conditional and the counter
        is always incremented.
        """
        count = self.keys_count[key]
        if count > 999:
            key = self.app_keys_pop()
            count = self.keys_count[key]
        count += 1
        self.keys_count[key] = count
        self.key_begin = key
        print(self.key_begin)
        print(count)
        return key

    def change_radius(self):
        """Return the search radius put into the URL (fixed at 500)."""
        return 500

    def change_language(self):
        """Return the result language put into the URL (fixed at 'zh-TW')."""
        return 'zh-TW'

    def get_url(self, location, type):
        """Build the first-page search URL for one location and type group.

        Args:
            location: mapping with ``'lat'`` and ``'lng'`` entries.
            type: iterable of strings, joined with ``'|'`` into the ``types``
                parameter. NOTE(review): a plain string would be split into
                characters -- confirm conf.types holds sequences.

        Returns:
            The URL string, ending in an empty ``&pagetoken=`` placeholder
            that ``parse_html`` later fills in for follow-up pages.

        Side effect: charges one request to the current API key.
        """
        print('*********')
        parts = [
            'https://maps.googleapis.com/maps/api/place/search/json?sensor=false',
            '&language=%s' % self.change_language(),
            '&location=%s,%s' % (location['lat'], location['lng']),
            '&radius=%s' % self.change_radius(),  # 500 m
            '&types=%s' % '|'.join(type),
            '&key=%s' % self.app_keys_count(self.key_begin),
            '&pagetoken=',
        ]
        return ''.join(parts)

    def save_url_ToQueue(self):
        """Queue one search URL per (location, type group) pair.

        Locations come from ``self.crud.read_all_locations()``; each
        location's status is updated once its URLs have been queued.

        NOTE(review): the guard below exits when the *unused* key pool is
        empty, which is already the case right after construction if
        conf.app_keys holds a single key -- confirm that is intended.
        NOTE(review): the original indentation was lost; the status update is
        placed after the inner loop (once per location).
        """
        all_locations = self.crud.read_all_locations()
        if not self.keys:
            print("*------*-*all app keys have been used*-*-----*")
            sys.exit()
        for location in all_locations:
            for place_types in types:
                url = self.get_url(location, place_types)
                print(url)
                self.redis_queue.put(url)
            self.crud.update_location_status(location['_id'])

    def _fetch_json(self, url):
        """Download ``url`` with mechanize and return the decoded JSON body."""
        browser = mechanize.Browser()
        # The API host must be fetched regardless of robots.txt rules.
        browser.set_handle_robots(False)
        response = browser.open(url)
        try:
            return json.loads(response.read())
        finally:
            response.close()

    def _handle_response(self, url, josn_response):
        """Store results and re-queue ``url`` as dictated by the API status.

        * ``'OK'``: results are saved; if a ``next_page_token`` is present the
          URL is re-queued with that token.
        * ``'OVER_QUERY_LIMIT'``: the next API key is taken from the pool and
          the URL is re-queued with it.
        * anything else: ignored.
        """
        status = josn_response['status']
        if status == 'OK':
            # insert to mongo
            self.crud.save_map_data_insert(josn_response['results'])
            if 'next_page_token' in josn_response:
                pagetoken = '&pagetoken=%s' % josn_response['next_page_token']
                next_url = re.sub(r'&pagetoken=.*', pagetoken, url)
                self.redis_queue.put(next_url)
        elif status == 'OVER_QUERY_LIMIT':
            # Keep key_begin in sync so URLs built later stop using the
            # exhausted key; self.key is kept for existing readers.
            self.key = self.key_begin = self.app_keys_pop()
            retry_url = re.sub(r'&key=.*&pagetoken',
                               '&key=%s&pagetoken' % self.key, url)
            self.redis_queue.put(retry_url)

    def parse_html(self, url, delay=2):
        """Fetch one queued URL and process the API response.

        Args:
            url: a URL produced by ``get_url`` (or re-queued by this class).
            delay: seconds to sleep before the request; defaults to the
                original fixed pause of 2.

        Returns:
            None. Follow-up work is pushed onto ``self.redis_queue``.
        """
        time.sleep(delay)
        self._handle_response(url, self._fetch_json(url))
# from datetime import datetime
if __name__ == "__main__":
    # Script entry point: push every search URL onto the 'testA' queue and
    # print the value of the queue's length() before and after the run.
    url_queue = RedisQueues('testA')
    print(url_queue.length())
    GooglePlacesParser(url_queue).save_url_ToQueue()
    print(url_queue.length())