# stdlib imports used by the jobs below; Partage, WeiboCrawler, db, db_insert,
# log and the table/log-handle names are defined elsewhere in the project.
import datetime
import time
from random import randint
from time import strftime


def job_news_court_patent(p_name, skip=2):
    pt = Partage()
    coll = db[db_name][table_name_insert]
    # for i, dc in enumerate(xlsx_infos()):
    for i, dc in enumerate(comp_infos()):
        print dc
        if i <= skip:
            continue
        if 'id' not in dc:
            log(log_normal, '%s company has no id' % dc['name'].encode('u8'),
                'id:%d' % i)
            raise KeyError("`dc` dictionary has no key `id`")
        name = dc['name'].encode('u8')
        # result maps to the (job, news, court, tm) fields stored below
        result = pt.main((name, ))
        db_insert(
            coll, {
                'id': dc['id'],
                'date': datetime.datetime.now(),
                'job': result[0],
                'news': result[1],
                'court': result[2],
                'tm': result[3],
                'name': name,
                'web': dc['web']
            })
        print p_name, '=>', 'id:', i, name, '\n', result, '\n\n'
        log(log_normal, strftime('%Y-%m-%d %H:%M:%S %A'), 'id: %d' % i, name,
            str(result))
        time.sleep(randint(3, 8))
    else:
        # for/else: runs only when the loop finishes without interruption
        log(log_normal, 'all ok!')
    db.close()
def data_gather_copy():
    # Copy every document from the insert table into the gather-copy table,
    # skipping documents that are already present (whole-document comparison).
    from_coll = db[db_name][table_name_insert]
    copy_coll = db[db_name][table_gather_copy]
    for doc in from_coll.find().sort([('_id', -1)]):
        doc.pop('_id', None)  # drop the ObjectId so the copy gets its own
        if not copy_coll.find_one(doc):
            copy_coll.insert(doc)
    db.close()
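# A possible alternative to data_gather_copy(), sketched as an illustration and
# not part of the original module: instead of a full find_one() per document,
# let a unique index reject duplicates. It assumes pymongo 2.x
# (ensure_index/insert) and that each snapshot is uniquely identified by its
# ('id', 'date') pair -- an assumption; the original compares whole documents.
def data_gather_copy_indexed():
    from pymongo.errors import DuplicateKeyError
    from_coll = db[db_name][table_name_insert]
    copy_coll = db[db_name][table_gather_copy]
    copy_coll.ensure_index([('id', 1), ('date', 1)], unique=True)
    for doc in from_coll.find().sort([('_id', -1)]):
        doc.pop('_id', None)
        try:
            copy_coll.insert(doc)
        except DuplicateKeyError:
            pass  # this snapshot was copied on an earlier run
    db.close()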
def comp_infos():
    # Yield {'id', 'name', 'web'} for every type-'0' company in the source
    # table, newest first; companies without a name are skipped and a missing
    # web field defaults to ''.
    need = {'base.name': 1, 'base.web': 1}
    coll_get = db[db_name][table_name_source]
    all_infos = []
    for d in coll_get.find({'base.typ': '0'}, need).sort([('_id', -1)]):
        if not d['base'].get('name'):
            continue
        if not d['base'].get('web'):
            d['base']['web'] = ''
        all_infos.append({
            'id': d['_id'],
            'name': d['base']['name'],
            'web': d['base']['web']
        })
    db.close()
    return iter(all_infos)
def alexa_rank(p_name, skip=2):
    pt = Partage()
    coll = db[db_name][table_name_insert]
    # for k, d in enumerate(xlsx_infos()):
    for k, d in enumerate(comp_infos()):
        if k <= skip:
            continue
        if 'id' not in d:
            log(log_alexa, '%s company has no id' % d['name'].encode('u8'),
                'id:%d' % k)
            raise KeyError("`d` dictionary has no key `id`")
        # default to zeroes for companies without a website
        rets = {'rank': '0', 'ip': '0', 'pv': '0'}
        name, web = d['name'].encode('u8'), d['web'].encode('u8')
        if web:
            rets = pt.aleax(web)
        t = randint(50, 70)
        print p_name, '=>', k, name, web, '\n', rets, \
            '\nnow sleeping %d seconds!\n' % t
        log(log_alexa, strftime('%Y-%m-%d %H:%M:%S %A'), name, 'id: %d' % k,
            web, str(rets))
        time.sleep(t)
        db_insert(
            coll, {
                'id': d['id'],
                'date': datetime.datetime.now(),
                'rank': rets.get('rank'),
                'ip': rets.get('ip'),
                'pv': rets.get('pv')
            })
    else:
        log(log_alexa, 'all ok!')
    db.close()
def weibo(p_name, skip=-1):
    wb = WeiboCrawler(account, password, cookie_file)
    if wb.login():
        coll = db[db_name][table_name_insert]
        # for k, d in enumerate(xlsx_infos()):
        for k, d in enumerate(comp_infos()):
            if k <= skip:
                continue
            if 'id' not in d:
                log(log_weibo, '%s company has no id' % d['name'].encode('u8'))
                raise KeyError("`d` dictionary has no key `id`")
            if k % 200 == 0:
                wb.login()  # refresh the session every 200 companies
            # ret maps to the (fans, weibo, talk) fields stored below
            ret = wb.main(d['name'].encode('u8'))
            db_insert(
                coll, {
                    'date': datetime.datetime.now(),
                    'weibo': ret[0][1],
                    'fans': ret[0][0],
                    'talk': ret[1],
                    'id': d['id']
                })
            t = randint(40, 60)
            print p_name, '=>', k + 1, d['name'].encode('u8'), '\n', ret, \
                '\nnow sleeping %d seconds!\n' % t
            time.sleep(t)
            log(log_weibo, strftime('%Y-%m-%d %H:%M:%S %A'), 'id: %d' % k,
                d['name'].encode('u8'), str(ret), 'sleeping %d seconds!' % t)
        else:
            log(log_weibo, 'all ok')
    else:
        log(log_weibo, 'login error!')
    data_gather_copy()
    db.close()
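# A hypothetical driver, not present in the original module, showing how the
# gather jobs above might be run in sequence. The p_name argument only labels
# the progress output, the skip values mirror the defaults above, and running
# the jobs back-to-back assumes the shared client reconnects lazily after each
# db.close(), as pymongo's client does.
if __name__ == '__main__':
    job_news_court_patent('job_news_court_patent', skip=2)
    alexa_rank('alexa_rank', skip=2)
    weibo('weibo', skip=-1)  # weibo() ends by calling data_gather_copy()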