-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
executable file
·76 lines (67 loc) · 2.64 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/python
import sys
import urllib
import rethinkdb as r
from rethinkdb.errors import RqlRuntimeError
from enum import enum, set_last_pid, get_finished, set_finished
def make_index(conn, shard, idx='company'):
    """Build the ``idx`` -> profile-id reverse index for one shard.

    For every pid yielded by ``enum(conn, 'index', shard)`` this reads the
    profile's ``positions``, takes each position's ``org`` value and appends
    the pid to the ``pids`` list of the matching document in the ``idx``
    table, creating that document when it does not exist yet.  Once all orgs
    of a profile are handled, ``idx`` is appended to the profile's ``indices``
    list in the ``profile_index`` table so the profile is skipped on later
    runs.  Profiles without positions or without any org are skipped and are
    not marked.

    Args:
        conn: open RethinkDB connection, passed to every ``run`` call.
        shard: shard number, forwarded to the checkpoint helpers.
        idx: name of the index table.  Its first letter plus ``'id'`` is the
            key field of newly inserted documents (``'company'`` -> ``'cid'``).

    Returns:
        None.
    """
    task = 'index'
    count_for_checkpoint_pid = 20
    # Loop-invariant: key field used when creating a new index document.
    primary_key = idx[0] + 'id'

    if get_finished(conn, task, shard):
        print('Task %s at Shard %d finished.' % (task, shard))
        return

    for loop_count, pid in enumerate(enum(conn, task, shard)):
        print('indexing %s...' % (pid,))
        # Remember where we are so the task can resume.  The checkpoint is
        # written before the pid is processed, so this pid is retried on
        # resume.
        if (loop_count + 1) % count_for_checkpoint_pid == 0:
            set_last_pid(conn, task, shard, pid)

        index_doc = r.table('profile_index').get(pid).run(conn)
        if not index_doc:
            # First time this pid is seen: create its bookkeeping document.
            # Its contents are known, so no second fetch is needed.
            r.table('profile_index').insert({'pid': pid, 'indices': []}).run(conn)
            indices = []
        else:
            # Tolerate bookkeeping documents that lack the 'indices' field.
            indices = index_doc.get('indices') or []
        if idx in indices:
            # This pid has been indexed before.
            print('%s indexed.' % (pid,))
            continue

        profile = r.table('profile').get(pid).run(conn)
        if not profile:
            print('%s not existed.' % (pid,))
            continue
        print(profile)

        positions = profile.get('positions')
        if not positions:
            continue
        # A real list (not a lazy ``map``) so it can be tested with ``any``
        # and then iterated in full on both Python 2 and Python 3.
        orgs = [position.get('org') for position in positions]
        if not any(orgs):
            continue
        print(orgs)

        for entry in orgs:
            if not entry:
                continue
            index_entry = r.table(idx).get(entry).run(conn)
            if not index_entry:
                pids = []
                r.table(idx).insert({primary_key: entry, 'pids': pids}).run(conn)
            else:
                pids = index_entry.get('pids', [])
            print(pids)
            if pid in pids:
                # This pid's index entry has been processed before.
                continue
            print((entry, pid))
            # default([]) keeps the update working when the stored document
            # has no 'pids' field, matching the tolerant read above.
            r.table(idx).get(entry).update(
                {'pids': r.row['pids'].default([]).append(pid)}
            ).run(conn)

        # Tell the indexer that this profile is done for ``idx``.
        r.table('profile_index').get(pid).update(
            {'indices': r.row['indices'].default([]).append(idx)}
        ).run(conn)

    set_finished(conn, task, shard)
    print('Task %s at Shard %d finished.' % (task, shard))
if __name__ == '__main__':
    # Usage: index.py <server_ip_or_dns> <shard>
    if len(sys.argv) < 3:
        sys.exit('usage: %s <server_ip_or_dns> <shard>' % (sys.argv[0],))
    server_ip_dns = sys.argv[1]
    # Parse the shard before connecting so a bad argument fails fast.
    shard = int(sys.argv[2])
    conn = r.connect(server_ip_dns, port=28015, db='people')
    try:
        # NOTE(review): repl() presumably exists for helpers that call
        # run() without an explicit connection -- confirm before removing.
        conn.repl()
        make_index(conn, shard)
    finally:
        # Always release the connection, even if indexing raises.
        conn.close()