def main(fetcher, **kwargs):
    """Entry point for the weibo.com crawler.

    Expects either `fetch_data` + `uids` (crawl weibos/follows/fans/infos per user)
    or `msg_urls` (crawl reposts and comments per message URL).
    """
    fetch_data = kwargs.get('fetch_data', None)
    uids = kwargs.get('uids', None)
    msg_urls = kwargs.get('msg_urls', None)
    store_path = kwargs.get('store_path', STORE_PATH)
    window = kwargs.get('window', None)
    weibos_storage = kwargs.get('weibos_storage', None)
    uids_storage = kwargs.get('uids_storage', None)

    fetcher.window = window
    assert (fetch_data is not None and uids is not None) or (msg_urls is not None)

    n_ids = 0
    start_time = time.time()
    last_time = start_time
    n_connections = 0

    if uids is not None:
        fetch_data = fetch_data.lower()
        n_ids = len(uids)
        write_message('=======Need to crawl: uids-%d=======' % n_ids, window)

        i = 0
        dt_id = 0
        for uid in uids:
            fetcher.n_connections = 0

            # After an hour of continuous crawling, pause for an hour.
            now_time = time.time()
            dt = int(now_time - last_time)
            if dt >= 3600:
                msg = '-------\n'
                msg += 'Having crawled for %d seconds, take a rest: 1 hour' % dt
                msg += '\n-------'
                logger.info(msg)
                write_message(msg, window)
                time.sleep(3600)
                last_time = time.time()

            # Otherwise delay between users, scaled by the duration of the last crawl.
            if dt < 3600 and dt_id > 0:
                delay = adjust_delay(dt_id)
                msg = '-------\n'
                msg += 'Take a rest: %d seconds, and start a new crawler..' % delay
                msg += '\n-------'
                write_message(msg, window)
                time.sleep(delay)

            t_id_s = time.time()
            if fetch_data == 'weibos':
                crawler = ComWeiboCrawler(fetcher, store_path, uid=uid, window=window,
                                          weibos_storage=weibos_storage)
                crawler.crawl_weibos()
            elif fetch_data == 'follows':
                crawler = ComWeiboCrawler(fetcher, store_path, uid=uid, window=window,
                                          uids_storage=uids_storage)
                crawler.crawl_follows()
            elif fetch_data == 'fans':
                crawler = ComWeiboCrawler(fetcher, store_path, uid=uid, window=window,
                                          uids_storage=uids_storage)
                crawler.crawl_fans()
            elif fetch_data == 'infos':
                crawler = ComWeiboCrawler(fetcher, store_path, uid=uid, window=window,
                                          uids_storage=uids_storage)
                crawler.crawl_infos()
            t_id_e = time.time()

            dt_id = int(t_id_e - t_id_s)
            i += 1
            update_progress_bar(window, i * 100 / n_ids)
            n_connections += fetcher.n_connections

    elif msg_urls is not None:
        n_ids = len(msg_urls)
        write_message('=======Need to crawl: messages-%d=======' % n_ids, window)

        i = 0
        dt_id = 0
        for msg_url in msg_urls:
            fetcher.n_connections = 0

            now_time = time.time()
            dt = int(now_time - last_time)
            if dt >= 3600:
                msg = '-------\n'
                msg += 'Having crawled for %d seconds, take a rest: 1 hour' % dt
                msg += '\n-------'
                logger.info(msg)
                write_message(msg, window)
                time.sleep(3600)
                last_time = time.time()

            # Normalize bare message ids into full weibo.com URLs.
            if not msg_url.startswith('http://weibo.com/'):
                msg_url = 'http://weibo.com/' + msg_url.replace('/', '')

            if dt < 3600 and dt_id > 0:
                delay = adjust_delay(dt_id)
                msg = '-------\n'
                msg += 'Take a rest: %d seconds, and start a new crawler..' % delay
                msg += '\n-------'
                write_message(msg, window)
                time.sleep(delay)

            t_id_s = time.time()
            # reposts
            crawler = ComWeiboCrawler(fetcher, store_path, msg_url=msg_url, window=window)
            crawler.crawl_msg_reposts()
            n_connections += fetcher.n_connections
            fetcher.n_connections = 0

            sec = 3
            msg = 'Take a rest: %d seconds, and start to crawl the comments..' % sec
            write_message(msg, window)
            time.sleep(sec)

            # comments
            crawler = ComWeiboCrawler(fetcher, store_path, msg_url=msg_url, window=window)
            crawler.crawl_msg_comments()
            t_id_e = time.time()

            dt_id = int(t_id_e - t_id_s)
            i += 1
            update_progress_bar(window, i * 100 / n_ids)
            n_connections += fetcher.n_connections

    cost_time = int(time.time() - start_time)
    d, h, m, s = format_delta_time(cost_time)
    msg = 'The task has successfully finished.\n'
    msg += ('Crawled [user|message] ids: %d, cost time: %d(d)-%d(h)-%d(m)-%d(s), connections: %d'
            % (n_ids, d, h, m, s, n_connections))
    write_message('=======', window)
    logger.info(msg)
    write_message(msg, window)
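
# --- Usage sketch (illustrative only; not part of the original module) ---
# A minimal way this weibo.com entry point could be driven. Only the two fetcher
# attributes that main() actually touches (`window` and `n_connections`) are
# stubbed here; the real project supplies its own authenticated fetcher, and the
# uid values below are placeholders.
#
#     class _StubFetcher(object):
#         window = None
#         n_connections = 0
#
#     main(_StubFetcher(),
#          fetch_data='weibos',
#          uids=['1234567890', '2345678901'],
#          store_path=STORE_PATH,
#          window=None)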
def main(fetcher, **kwargs):
    """Entry point supporting both weibo.com and weibo.cn (selected by `weibo_com`).

    Records succeeded / failed / non-existent ids to text files under `store_path`
    and returns the crawl accuracy as a fraction in [0, 1].
    """
    fetch_data = kwargs.get('fetch_data', None)
    uids = kwargs.get('uids', None)
    msg_urls = kwargs.get('msg_urls', None)
    store_path = kwargs.get('store_path', STORE_PATH)
    window = kwargs.get('window', None)
    weibo_com = kwargs.get('weibo_com', True)

    fetcher.window = window
    assert (fetch_data is not None and uids is not None) or (msg_urls is not None)

    n_ids = 0
    n_connections = 0
    n_errors = 0

    # Per-task result logs: succeeded ids, errored ids, and non-existent ids.
    succeed_fp = codecs.open(os.path.join(store_path, 'succeed-id.txt'), 'w+', 'utf-8')
    error_fp = codecs.open(os.path.join(store_path, 'error-id.txt'), 'w+', 'utf-8')
    not_exist_fp = codecs.open(os.path.join(store_path, 'not-exist-id.txt'), 'w+', 'utf-8')

    start_time = time.time()

    if weibo_com:
        if uids is not None:
            fetch_data = fetch_data.lower()
            n_ids = len(uids)
            write_message('=======Need to crawl: uids-%d=======' % n_ids, window)

            i = 0
            dt_id = 0
            for uid in uids:
                fetcher.n_connections = 0

                # Delay between users, scaled by the duration of the last crawl.
                if dt_id > 0:
                    delay = adjust_delay(dt_id)
                    msg = '-------\n'
                    msg += 'Take a rest: %d seconds, and start a new crawler..' % delay
                    msg += '\n-------'
                    write_message(msg, window)
                    time.sleep(delay)

                t_id_s = time.time()
                if fetch_data == 'weibos':
                    crawler = ComWeiboCrawler(fetcher, store_path, uid=uid, window=window)
                    res = crawler.crawl_weibos()
                    if res is None:
                        n_errors += 1
                        error_fp.write(str(uid) + '\n')
                    elif res is False:
                        not_exist_fp.write(str(uid) + '\n')
                    elif res is True:
                        succeed_fp.write(str(uid) + '\n')
                elif fetch_data == 'follows':
                    crawler = ComWeiboCrawler(fetcher, store_path, uid=uid, window=window)
                    res = crawler.crawl_follows()
                    if res is None:
                        n_errors += 1
                        error_fp.write(str(uid) + ';')
                    elif res is False:
                        not_exist_fp.write(str(uid) + ';')
                    elif res is True:
                        succeed_fp.write(str(uid) + ';')
                elif fetch_data == 'fans':
                    crawler = ComWeiboCrawler(fetcher, store_path, uid=uid, window=window)
                    res = crawler.crawl_fans()
                    if res is None:
                        n_errors += 1
                        error_fp.write(str(uid) + ';')
                    elif res is False:
                        not_exist_fp.write(str(uid) + ';')
                    elif res is True:
                        succeed_fp.write(str(uid) + ';')
                elif fetch_data == 'infos':
                    crawler = ComWeiboCrawler(fetcher, store_path, uid=uid, window=window)
                    res = crawler.crawl_infos()
                    if res is None:
                        n_errors += 1
                        error_fp.write(str(uid) + ';')
                    elif res is False:
                        not_exist_fp.write(str(uid) + ';')
                    elif res is True:
                        succeed_fp.write(str(uid) + ';')

                t_id_e = time.time()
                dt_id = int(t_id_e - t_id_s)
                i += 1
                update_progress_bar(window, i * 100 / n_ids)
                n_connections += fetcher.n_connections
            # --end for uid in uids

        elif msg_urls is not None:
            n_ids = len(msg_urls)
            write_message('=======Need to crawl: messages-%d=======' % n_ids, window)

            i = 0
            dt_id = 0
            for msg_url in msg_urls:
                fetcher.n_connections = 0

                # Normalize bare message ids into full weibo.com URLs.
                if not msg_url.startswith('http://weibo.com/'):
                    msg_url = 'http://weibo.com/' + msg_url.replace('/', '')

                if dt_id > 0:
                    delay = adjust_delay(dt_id)
                    msg = '-------\n'
                    msg += 'Take a rest: %d seconds, and start a new crawler..' % delay
                    msg += '\n-------'
                    write_message(msg, window)
                    time.sleep(delay)

                t_id_s = time.time()
                # reposts
                if fetch_data == 'repost':
                    crawler = ComWeiboCrawler(fetcher, store_path, msg_url=msg_url, window=window)
                    res = crawler.crawl_msg_reposts()
                    if res is None:
                        n_errors += 1
                        error_fp.write(str(msg_url) + ';')
                    elif res is False:
                        not_exist_fp.write(str(msg_url) + ';')
                    elif res is True:
                        succeed_fp.write(str(msg_url) + ';')
                # comments
                elif fetch_data == 'comment':
                    crawler = ComWeiboCrawler(fetcher, store_path, msg_url=msg_url, window=window)
                    res = crawler.crawl_msg_comments()
                    if res is None:
                        n_errors += 1
                        error_fp.write(str(msg_url) + ';')
                    elif res is False:
                        not_exist_fp.write(str(msg_url) + ';')
                    elif res is True:
                        succeed_fp.write(str(msg_url) + ';')

                t_id_e = time.time()
                dt_id = int(t_id_e - t_id_s)
                i += 1
                update_progress_bar(window, i * 100 / n_ids)
                n_connections += fetcher.n_connections
            # --end for msg_url in msg_urls

    else:  # weibo.cn
        if uids is not None:
            fetch_data = fetch_data.lower()
            n_ids = len(uids)
            write_message('=======Need to crawl: uids-%d=======' % n_ids, window)

            i = 0
            dt_id = 0
            for uid in uids:
                fetcher.n_connections = 0

                if dt_id > 0:
                    delay = adjust_delay(dt_id)
                    msg = '-------\n'
                    msg += 'Take a rest: %d seconds, and start a new crawler..' % delay
                    msg += '\n-------'
                    write_message(msg, window)
                    time.sleep(delay)

                t_id_s = time.time()
                if fetch_data == 'follows':
                    crawler = CnWeiboCrawler(fetcher, store_path, uid=uid, window=window)
                    res = crawler.crawl_follows()
                    if res is None:
                        n_errors += 1
                        error_fp.write(str(uid) + ';')
                    elif res is False:
                        not_exist_fp.write(str(uid) + ';')
                    elif res is True:
                        succeed_fp.write(str(uid) + ';')
                elif fetch_data == 'fans':
                    crawler = CnWeiboCrawler(fetcher, store_path, uid=uid, window=window)
                    res = crawler.crawl_fans()
                    if res is None:
                        n_errors += 1
                        error_fp.write(str(uid) + ';')
                    elif res is False:
                        not_exist_fp.write(str(uid) + ';')
                    elif res is True:
                        succeed_fp.write(str(uid) + ';')

                t_id_e = time.time()
                dt_id = int(t_id_e - t_id_s)
                i += 1
                update_progress_bar(window, i * 100 / n_ids)
                n_connections += fetcher.n_connections
            # --end for uid in uids

    succeed_fp.close()
    error_fp.close()
    not_exist_fp.close()

    cost_time = int(time.time() - start_time)
    d, h, m, s = format_delta_time(cost_time)
    msg = 'The task has successfully finished.\n'
    msg += ('Crawled [user|message] ids: %d, cost time: %d(d)-%d(h)-%d(m)-%d(s), connections: %d'
            % (n_ids, d, h, m, s, n_connections))
    # Use float division so accuracy is a real fraction rather than the 0/1
    # produced by integer division.
    accuracy = (1 - float(n_errors) / n_ids) if n_ids > 0 else 0
    msg += '\nAccuracy: %d%%' % (accuracy * 100)
    write_message('=======', window)
    logger.info(msg)
    write_message(msg, window)

    return accuracy
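
# --- Helper sketch (assumption; the real helper is defined elsewhere in the repo) ---
# Both entry points unpack format_delta_time(seconds) into (days, hours, minutes,
# seconds) for the summary message. One plausible implementation of that contract,
# shown only to document the expected behavior:
#
#     def format_delta_time(seconds):
#         d, rem = divmod(seconds, 86400)
#         h, rem = divmod(rem, 3600)
#         m, s = divmod(rem, 60)
#         return d, h, m, s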