# scraper.py
# coding=utf-8
import logging
import os
from hashlib import sha1
from time import sleep

import dataset
import lxml.html
import scraperwiki
from thready import threaded

logging.basicConfig()

# A directory for caching pages we've already downloaded.
CACHE_DIR = os.path.join(os.path.dirname(__file__), 'cache')

def url_to_filename(url):
    """ Make a URL into a file name, using SHA1 hashes. """
    # Use a SHA1 hash to convert the URL into a unique, filesystem-safe name.
    hash_file = sha1(url).hexdigest() + '.html'
    return os.path.join(CACHE_DIR, hash_file)
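
# For illustration: each URL maps to a stable cache path of the form
#   CACHE_DIR/<40-hex-character sha1 of the URL>.html
# so repeated runs can find pages fetched on earlier passes.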

def store_local(url, content):
    """ Save a local copy of the file. """
    # If the cache directory does not exist, make one.
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    # Save to disk.
    local_path = url_to_filename(url)
    with open(local_path, 'wb') as f:
        f.write(content)

def load_local(url):
    """ Read a local copy of a URL; return None if it is not cached. """
    local_path = url_to_filename(url)
    if not os.path.exists(local_path):
        return None
    with open(local_path, 'rb') as f:
        return f.read()
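
# Cache round-trip: store_local() writes the raw bytes under CACHE_DIR and
# load_local() hands them back (or None on a miss), so scrape() below only
# hits the live site for CR numbers it has not fetched before.
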
db = dataset.connect('sqlite:///scraperwiki.sqlite')
maxcr = 2120960
mincr = 1686538
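
# The mincr..maxcr window above spans roughly 434,000 company registration
# (CR) numbers; each is zero-padded to seven digits to build the query URL.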

def scrape(crno):
    """ Fetch and parse the registry page for a single CR number. """
    crnostr = "%07d" % crno
    baseurl = "https://www.mobile-cr.gov.hk/mob/cps_criteria.do?queryCRNO="
    url = baseurl + crnostr
    print "trying local", crnostr
    html = load_local(url)
    if html is None:
        print "trying site", crnostr
        html = scraperwiki.scrape(url).decode('utf-8')
        print "storing local", crnostr
        store_local(url, html.encode('utf-8'))
    else:
        html = html.decode('utf-8')
    # The site answers "no record matches the search information input"
    # (in Chinese) for unused CR numbers; skip those.
    if '沒有紀錄與輸入的查詢資料相符' in html.encode('utf-8'):
        print 'NO MATCHING RECORD FOUND FOR THE SEARCH INFORMATION INPUT!'
        return None
    root = lxml.html.fromstring(html)  # , encoding="utf-8")
    tds = root.cssselect("tr td tr td")
    namestds = root.cssselect("td.data")
    # An empty selection means the page came back without the data table;
    # wait and re-fetch until it appears.
    while tds == []:
        print "trying", crnostr, "again"
        sleep(46)
        html = scraperwiki.scrape(baseurl + crnostr).decode('utf-8')
        root = lxml.html.fromstring(html)  # , encoding="utf-8")
        tds = root.cssselect("tr td tr td")
        namestds = root.cssselect("td.data")
    #for idx, val in enumerate(tds):
    #    print idx, ":", val.text_content().encode('utf-8')
    # Each td.data cell appears to begin with a 10-character date, followed
    # by the company name in use from that date.
    names = {}
    for nameidx, nameval in enumerate(namestds):
        names["Name" + str(nameidx)] = nameval.text_content()[10:]
        names["Name" + str(nameidx) + "date"] = nameval.text_content()[:10]
    print "got", tds[1].text_content()
    data = {
        'cr': tds[1].text_content(),
        'English Company Name': tds[2].text_content().rsplit('\r')[1].lstrip('\n\t'),
        'Chinese Company Name': tds[2].text_content().rpartition('\r')[2].lstrip('\r\n\t'),
        'Company Type': tds[4].text_content()[:-1],
        'Date of incorporation': tds[6].text_content(),
        # 'Company status': tds[8].text_content()[:-1],
        'Active status': tds[8].text_content()[:-1],
        'Remarks': tds[9].text_content().replace(u"備註:", ""),  # strip the "Remarks:" label
        'Winding up mode': tds[11].text_content()[:-1],
        'Date of Dissolution': tds[13].text_content(),
        'Register of Charges': tds[15].text_content()[:-1],
        'Important Note': tds[16].text_content().replace(u"重要事項:", "").lstrip('\r\n\t')  # strip the "Important note:" label
    }
    data.update(names)
    # Upsert keyed on 'cr' so re-runs update existing rows instead of
    # inserting duplicates.
    db['swdata'].upsert(data, ['cr'])
    print "wrote", tds[1].text_content()

# Fan scrape() out over the whole CR range on a pool of 20 worker threads.
threaded(range(mincr, maxcr), scrape, num_threads=20)