/
zenithwatch.py
213 lines (195 loc) · 7.9 KB
/
zenithwatch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# coding=utf-8
import json
import string
import re
import common as cm
import geosense as gs
__author__ = 'Zephyre'
# Database handle shared by the functions below; fetch() assigns it before the crawl starts.
db = None
# Root page of the crawl; fetch() starts here when no start data is supplied.
url = 'http://www.zenith-watches.com/zh_zh/shoplocator.html'
# Identifier of this brand: written into error dumps, passed to cm.init_store_entry,
# and used by fetch() to select this brand's rows in the 'stores' table.
brand_id = 10395
# Brand names passed to cm.init_store_entry (Latin-script and Chinese forms).
brandname_e = u'Zenith'
brandname_c = u'真力时'
def fetch_store_list(url):
    """
    Download the store-locator page and extract the stores embedded in it.

    The page is searched for a ``var items`` marker; every bracketed ``[...]``
    group between that marker and the next ``var `` is treated as one store
    record. Quoted strings in a record supply name / type / url, and the first
    two decimal numbers outside the quotes supply latitude / longitude.

    :param url: address of the store-locator page.
    :rtype : list of stores, shaped like
        [{'name':**, 'lat':**, 'lng':**, 'type':**, 'url':**}].
        An entry may lack some keys when its record could only be partly
        parsed. An empty list is returned when the page cannot be downloaded
        or contains no ``var items`` section.
    """
    try:
        html = cm.get_data(url)
    except Exception:
        print('Error occured: %s' % url)
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'data': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    # The data section sits between "var items" and the next "var " declaration.
    marker = 'var items'
    start = html.find(marker)
    if start == -1:
        # Always hand back a list so every return path has the same type.
        return []
    start += len(marker)
    end = html.find('var ', start)
    if end == -1:
        # No following declaration: take the rest of the page instead of
        # letting the -1 silently chop off the final character.
        end = len(html)
    html = html[start:end]

    store_list = []
    for m in re.findall(r'\[(.+?)\]', html, re.S):
        store_entry = {}

        # Quoted fields. '.' does not cross newlines here, so the greedy match
        # yields at most one field per line of the record.
        m_list = re.findall(r"'(.*)'", m)
        try:
            store_entry['name'] = cm.html2plain(m_list[0].strip())
            store_entry['type'] = m_list[2].strip()
            store_entry['url'] = m_list[4].strip()
        except IndexError:
            print('Index error: %s' % m)

        # Strip everything between quotes, then look for the coordinates.
        m_list = re.findall(r'(-?\d+\.\d+)', re.subn(r"'(.*)'", '', m)[0])
        try:
            # Parse both values before storing either, so a record never ends
            # up with a latitude but no longitude.
            lat = float(m_list[0])
            lng = float(m_list[1])
            store_entry['lat'] = lat
            store_entry['lng'] = lng
        except (IndexError, ValueError):
            print('Index error in getting coordinates: %s' % m)

        # NOTE(review): partly parsed entries are still kept; callers that index
        # 'url' / 'lat' / 'lng' directly should be checked against this.
        if store_entry:
            store_list.append(store_entry)
    return store_list
def fetch_store_details(url, data):
    """
    Download one store page and build a record for every store listed on it
    (a single url may describe several stores).

    Each ``<li class="boutique-info-cadre-N">`` element is handled as one
    store. The record is seeded from ``data`` (name, coordinates, type) and
    then filled with the title, telephone / fax, opening hours and address
    found in the element. Every record is inserted into the 'stores' table
    through the module-level ``db`` handle, which must already be connected.

    :param url: address of the store page.
    :param data: listing entry for this page; the keys 'name', 'lat', 'lng'
        and 'type' are read from it.
    :rtype : [{}] -- list of the store entries that were inserted; empty when
        the page cannot be downloaded.
    """
    try:
        html = cm.get_data(url)
    except Exception:
        print('Error occured: %s / %s' % (str(data), url))
        dump_data = {'level': 2, 'time': cm.format_time(), 'data': data, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    # There may be several stores on the page: split it into one chunk per <li>.
    sub_html = []
    for m in re.finditer(r'<li\s+class\s*=\s*"boutique-info-cadre-\d+"\s*>', html):
        start = m.end()
        end = html.find('</li>', start)
        if end == -1:
            # Unterminated element: keep the remainder rather than dropping
            # the last character through a -1 slice bound.
            end = len(html)
        sub_html.append(html[start:end])

    stores = []
    # For each store:
    for s in sub_html:
        entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        cm.update_entry(entry, {cm.url: url, cm.name_e: data['name'], cm.lat: data['lat'], cm.lng: data['lng'],
                                cm.store_type: data['type']})

        # Title paragraph overrides the store type taken from the listing, but
        # only when it is non-empty (the previous ">= 0" test was always true).
        for m in re.findall(r'<p class="boutique-info-cadre-titre">(.*?)</p>', s):
            if len(m.strip()) > 0:
                entry[cm.store_type] = m.strip()
                break

        # NOTE(review): this pattern is greedy with re.S, so the match runs to
        # the last </p> of the element -- confirm whether that is intended.
        for m in re.findall(r'<p class="boutique-info-cadre-tel">(.*)</p>', s, re.S):
            if len(m.strip()) == 0:
                break

            for m1 in re.findall(r'<span itemprop="telephone">(.*?)</span>', m):
                if len(m1.strip()) > 0:
                    entry[cm.tel] = m1.strip()
                    break
            for m1 in re.findall(r'<span itemprop="faxNumber">(.*?)</span>', m):
                if len(m1.strip()) > 0:
                    entry[cm.fax] = m1.strip()
                    break
            # No tagged numbers at all: fall back to extracting from raw text.
            if entry[cm.tel] == '' and entry[cm.fax] == '':
                entry[cm.tel] = cm.extract_tel(m.strip())

        for m in re.findall(r'<p class="boutique-info-cadre-horaires">(.*?)</p>', s, re.S):
            if len(m.strip()) > 0:
                entry[cm.hours] = m.strip()
                break

        for m in re.findall(r'<p class="boutique-info-cadre-adresse".*?>(.*?)</p>', s, re.S):
            if len(m.strip()) == 0:
                break

            street_addr = ''
            zip_code = ''
            city = ''
            country = ''
            for m1 in re.findall(r'<span itemprop="streetAddress">(.*?)</span>', m, re.S):
                if len(m1.strip()) > 0:
                    street_addr = cm.reformat_addr(m1)
                    break
            for m1 in re.findall(r'<span itemprop="postalCode">(.*?)</span>', m):
                if len(m1.strip()) > 0:
                    zip_code = m1
                    break
            for m1 in re.findall(r'<span itemprop="addressLocality">(.*?)</span>', m):
                if len(m1.strip()) > 0:
                    city = cm.extract_city(m1)[0]
                    break
            for m1 in re.findall(r'<span itemprop="addressCountry">(.*?)</span>', m):
                if len(m1.strip()) > 0:
                    country = m1
                    break
            entry[cm.zip_code] = zip_code

            if street_addr == '':
                # None of the itemprop tags gave a street address: work from
                # the plain text of the paragraph instead.
                tmp = cm.reformat_addr(m)
                terms = tmp.split(',')
                ret = gs.look_up(terms[-1], 1)
                if ret is not None:
                    # The last term is a country: split it off the address.
                    street_addr = ', '.join(terms[:-1])
                    entry[cm.addr_e] = cm.reformat_addr(street_addr)
                    entry[cm.country_c] = ret['name_c']
                    entry[cm.country_e] = ret['name_e']
                    entry[cm.continent_c] = ret['continent']['name_c']
                    entry[cm.continent_e] = ret['continent']['name_e']
                else:
                    if cm.is_chinese(tmp):
                        entry[cm.addr_c] = tmp
                    else:
                        entry[cm.addr_e] = tmp
            else:
                street_addr = ', '.join([street_addr, zip_code, city])
                entry[cm.addr_e] = cm.reformat_addr(street_addr)
                ret = gs.look_up(country, 1)
                if ret is None:
                    # Unknown country: keep the raw text from the page.
                    entry[cm.country_c] = country
                else:
                    entry[cm.country_c] = ret['name_c']
                    entry[cm.country_e] = ret['name_e']
                    entry[cm.continent_c] = ret['continent']['name_c']
                    entry[cm.continent_e] = ret['continent']['name_e']
                # NOTE(review): nesting of this assignment was reconstructed
                # from a copy of the file without indentation -- confirm.
                entry[cm.city_e] = city

        gs.field_sense(entry)
        print('%s Found store: %s, %s. (%s, %s, %s)' % (brandname_e,
                                                        entry[cm.name_e], entry[cm.addr_e], entry[cm.continent_e],
                                                        entry[cm.country_e], entry[cm.city_e]))
        db.insert_record(entry, 'stores')
        stores.append(entry)
    return stores
def fetch(level=1, data=None, user='root', passwd=''):
    """
    Crawl all stores of this brand and store them in the database.

    Connects the module-level ``db`` handle, deletes the brand's existing rows
    from the 'stores' table, then walks the crawl tree from the root node:
    level 1 yields the store listing, level 2 yields the details of each
    listed page. The connection is closed even when the crawl raises.

    :param level: accepted for interface compatibility but not used; the walk
        always starts at level 1.
    :param data: root node data, a dict with a 'url' key. Defaults to the
        module-level ``url``.
    :param user: database user name.
    :param passwd: database password.
    :return: whatever ``cm.walk_tree`` returns for the root node.
    """
    def func(data, level):
        """
        :param data: node data; its 'url' key is the page to fetch.
        :param level: 1: fetch the list of stores; 2: fetch the details of a single store page
        """
        if level == 1:
            store_list = fetch_store_list(data['url'])
            return [{'func': lambda data: func(data, 2), 'data': s} for s in store_list]
        elif level == 2:
            stores = fetch_store_details(data['url'], data)
            return [{'func': None, 'data': s} for s in stores]

    global db
    db = cm.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    try:
        # The brand's rows are rebuilt from scratch on every run.
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

        # Walk from the root node, where level == 1.
        if data is None:
            data = {'url': url}
        results = cm.walk_tree({'func': lambda data: func(data, 1), 'data': data})
    finally:
        # Release the connection even if the delete or the crawl fails.
        db.disconnect_db()
    return results