def _add_pid_to_entry(conn, idx, primary_key, entry, pid):
    """Record ``pid`` in the ``pids`` list of document ``entry`` in table ``idx``.

    The document is created (keyed by ``primary_key``) with an empty ``pids``
    list when it does not exist yet. Nothing is written when ``pid`` is
    already listed, so re-processing a pid does not append it twice.
    """
    doc = r.table(idx).get(entry).run(conn)
    if not doc:
        pids = []
        r.table(idx).insert({primary_key: entry, 'pids': pids}).run(conn)
    else:
        # tolerate a document that exists but has no (or a null) 'pids' field
        pids = doc.get('pids') or []
    print(pids)
    if pid in pids:
        # this pid was already recorded under this entry by an earlier pass
        return
    print((entry, pid))
    # default([]) keeps the server-side append working for a document without
    # a 'pids' field, matching the tolerant read above.
    r.table(idx).get(entry).update(
        {'pids': r.row['pids'].default([]).append(pid)}
    ).run(conn)


def make_index(conn, shard, idx='company'):
    """Build the ``idx`` reverse index (org -> profile ids) for one shard.

    For every pid produced by ``enum(conn, 'index', shard)`` the profile's
    ``positions`` are read, and the pid is added to the ``pids`` list of each
    non-empty ``org`` value in table ``idx``. Completion per pid is tracked in
    table ``profile_index`` (field ``indices``), so pids already indexed for
    ``idx`` are skipped.

    Args:
        conn: open RethinkDB connection passed to every ``run``.
        shard: shard number; formatted with ``%d`` in progress messages.
        idx: name of the index table; its primary key is taken to be the
            first letter of ``idx`` followed by ``'id'`` (``'cid'`` for
            ``'company'``).

    Returns:
        None. Progress is reported on stdout.

    NOTE(review): ``enum``, ``get_finished``, ``set_last_pid`` and
    ``set_finished`` are defined elsewhere; they presumably implement the
    per-shard resume bookkeeping -- confirm against their definitions.
    """
    task = 'index'
    count_for_checkpoint_pid = 20

    if get_finished(conn, task, shard):
        print('Task %s at Shard %d finished.' % (task, shard))
        return

    for loop_count, pid in enumerate(enum(conn, task, shard)):
        print('indexing %s...' % (pid,))
        # Checkpoint every count_for_checkpoint_pid pids, *before* the pid is
        # processed, so the checkpointed pid is the one retried on resume.
        if (loop_count + 1) % count_for_checkpoint_pid == 0:
            set_last_pid(conn, task, shard, pid)

        profile_idx = r.table('profile_index').get(pid).run(conn)
        if not profile_idx:
            r.table('profile_index').insert(
                {'pid': pid, 'indices': []}
            ).run(conn)
            profile_idx = r.table('profile_index').get(pid).run(conn)

        # tolerate a tracking document without an 'indices' field
        indices = profile_idx.get('indices') or []
        if idx in indices:
            # this pid has been indexed for idx before
            print('%s indexed.' % (pid,))
            continue

        profile = r.table('profile').get(pid).run(conn)
        if not profile:
            print('%s not existed.' % (pid,))
            continue
        print(profile)

        positions = profile.get('positions')
        if not positions:
            continue

        # A real list (not a lazy map object) so it can be tested with any()
        # and then iterated again.
        orgs = [position.get('org') for position in positions]
        if not any(orgs):
            continue

        primary_key = idx[0] + 'id'
        print(orgs)
        for entry in orgs:
            if entry:
                _add_pid_to_entry(conn, idx, primary_key, entry, pid)

        # tell the indexer that this pid is done for idx
        r.table('profile_index').get(pid).update(
            {'indices': r.row['indices'].default([]).append(idx)}
        ).run(conn)

    set_finished(conn, task, shard)
    print('Task %s at Shard %d finished.' % (task, shard))
try: cp = CandidatePage.from_url(url) except HTMLParser.HTMLParseError, e: cp = None pass if not cp: continue try: cp_dict = cp.to_dict() cp_dict['pid'] = pid r.table('profile').insert( cp.to_dict() ).run(conn) except RqlRuntimeError, e: print e newname = cp.first_nm.replace('/','-') + '-' + cp.last_nm.replace('/','-') newname = newname.replace(' ', '-').encode('utf-8') newname = urllib.pathname2url(newname) if len(newname) < 15: name = newname print url, cp.to_json() set_finished(conn, task, shard) print 'Task %s at Shard %d finished.' % (task, shard) if __name__ == '__main__': website = sys.argv[1] server_ip_dns = sys.argv[2] conn = r.connect(server_ip_dns, port=28015, db='people') conn.repl() shard = int(sys.argv[3]) crawl(website, shard, conn)