crawl.py
#!/usr/bin/python
import sys
import urllib
import rethinkdb as r
from rethinkdb.errors import RqlRuntimeError
import HTMLParser
# Local project modules: CandidatePage parses a crawled profile page; the
# enum helpers track per-shard crawl progress (checkpointing and completion).
from profile import CandidatePage
from enum import enum, set_last_pid, get_finished, set_finished

def crawl(website, shard, conn):
    name = 'foo-bar'
    task = 'crawl'
    count_for_checkpoint_pid = 20
    if get_finished(conn, task, shard):
        print 'Task %s at Shard %d finished.' % (task, shard)
        return
    for loop_count, pid in enumerate(enum(conn, task, shard)):
        # Remember where we are so the crawl can resume;
        # this pid will be retried on resume.
        if (loop_count + 1) % count_for_checkpoint_pid == 0:
            set_last_pid(conn, task, shard, pid)
        url = 'http://www.%s.com/pub/%s/%s' % (website, name, pid)
        print url
        # Skip pids that are already in the profile table.
        profile = r.table('profile').get(pid).run(conn)
        if profile:
            print '%s existed.' % (url,)
            continue
        try:
            cp = CandidatePage.from_url(url)
        except HTMLParser.HTMLParseError:
            cp = None
        if not cp:
            continue
        try:
            # Insert the copy that carries the pid, so the get(pid) lookup
            # above can find it on a later run.
            cp_dict = cp.to_dict()
            cp_dict['pid'] = pid
            r.table('profile').insert(cp_dict).run(conn)
        except RqlRuntimeError, e:
            print e
        # Reuse the crawled name for the next URL when it is short enough.
        newname = cp.first_nm.replace('/', '-') + '-' + cp.last_nm.replace('/', '-')
        newname = newname.replace(' ', '-').encode('utf-8')
        newname = urllib.pathname2url(newname)
        if len(newname) < 15:
            name = newname
        print url, cp.to_json()
    set_finished(conn, task, shard)
    print 'Task %s at Shard %d finished.' % (task, shard)


if __name__ == '__main__':
    # Usage: ./crawl.py <website> <server_ip_or_dns_name> <shard>
    website = sys.argv[1]
    server_ip_dns = sys.argv[2]
    conn = r.connect(server_ip_dns, port=28015, db='people')
    conn.repl()
    shard = int(sys.argv[3])
    crawl(website, shard, conn)
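

# --- Sketch of the helpers imported from the local ``enum`` module ---
# The real module (enum.py) is not shown on this page; the version below is
# only a minimal, assumed implementation so the checkpoint/resume flow above
# is easier to follow.  It assumes pids are sequential integers and that
# progress is kept in a hypothetical ``task_state`` table keyed by
# '<task>:<shard>'; the table name, the pid scheme, and SHARD_SIZE are all
# guesses, not part of the original project.

SHARD_SIZE = 1000000  # assumed number of pids per shard


def _state_id(task, shard):
    return '%s:%d' % (task, shard)


def _get_state(conn, task, shard):
    return r.table('task_state').get(_state_id(task, shard)).run(conn)


def _save_state(conn, task, shard, fields):
    # Create the state document on first use, update it afterwards.
    doc = {'id': _state_id(task, shard)}
    doc.update(fields)
    if _get_state(conn, task, shard) is None:
        r.table('task_state').insert(doc).run(conn)
    else:
        r.table('task_state').get(doc['id']).update(fields).run(conn)


def get_finished(conn, task, shard):
    state = _get_state(conn, task, shard)
    return bool(state and state.get('finished'))


def set_finished(conn, task, shard):
    _save_state(conn, task, shard, {'finished': True})


def set_last_pid(conn, task, shard, pid):
    _save_state(conn, task, shard, {'last_pid': pid})


def enum(conn, task, shard):
    # Yield the pids belonging to this shard, resuming from the last
    # checkpointed pid (inclusive, so that pid is retried after a restart).
    start = shard * SHARD_SIZE
    state = _get_state(conn, task, shard)
    if state and state.get('last_pid') is not None:
        start = state['last_pid']
    for pid in xrange(start, (shard + 1) * SHARD_SIZE):
        yield pid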