def farm_user_network(apikeys, config=None, output_folder='./farm/', network_type="followers"):
    """Breadth-first crawl of a user network, with four data-farming workers.

    Starting from ``config['seeds']``, expand the follower (or friend) graph
    up to ``config['depth']`` layers.  Four worker processes are spawned to
    farm timelines, favorites, retweets and mentions, each fed through a
    bounded multiprocessing queue; a ``-1`` sentinel shuts each worker down.

    :param apikeys: Twitter API credentials, passed to UserFarm and workers.
    :param config: optional dict with ``'seeds'`` (screen names) and
        ``'depth'`` (layers to expand, default 3).
    :param output_folder: root folder for farmed data.
    :param network_type: ``'followers'`` (default) or ``'friends'``.
    """
    config = {} if config is None else config  # was a shared mutable default

    # Network edges are written per user id under <output_folder>/<network_type>/.
    network_output_folder = os.path.abspath('%s/%s/' % (output_folder, network_type))
    shutil.rmtree(network_output_folder, ignore_errors=True)  # start clean
    user_network_farmer = UserFarm(apikeys=apikeys, verbose=False,
                                   output_folder=network_output_folder)

    seeds = config.get('seeds', [])
    depth = int(config.get('depth', 3))  # by default only fetch 3 layers

    # One worker process + bounded queue per data stream.
    user_timeline_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    p = multiprocessing.Process(target=farm_user_timelines,
                                args=(apikeys, user_timeline_queue, output_folder))
    p.start()

    user_favorites_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    q = multiprocessing.Process(target=farm_user_favorites,
                                args=(apikeys, user_favorites_queue, output_folder))
    q.start()

    user_retweets_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    r = multiprocessing.Process(target=farm_user_retweets,
                                args=(apikeys, user_retweets_queue, output_folder))
    r.start()

    user_mentions_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    s = multiprocessing.Process(target=farm_user_mentions,
                                args=(apikeys, user_mentions_queue, output_folder))
    s.start()

    # Resolve the seed screen names to user ids; this is layer 0 of the BFS.
    user_network_queue = user_network_farmer.get_user_ids(seeds)
    try:
        while depth > 0 and len(user_network_queue) > 0:
            temp_user_network_queue = set()
            for user_id in user_network_queue:
                time.sleep(5)  # pace requests to stay under API rate limits
                if network_type == 'friends':
                    f_ids = user_network_farmer.find_all_friends(user_id)
                else:
                    f_ids = user_network_farmer.find_all_followers(user_id)
                logger.info('user_id: %d has %d friends' % (user_id, len(f_ids)))
                for f_id in f_ids:
                    # NOTE(review): feeding f_id into the four worker queues is
                    # deliberately disabled — enabling it would also farm the
                    # timeline/favorites/retweets/mentions of every DISCOVERED
                    # user, not just of the seeds. Confirm before enabling.
                    # (Original had the four .put(f_id, block=True) calls
                    # commented out, with notes to that effect.)
                    temp_user_network_queue.add(f_id)
            user_network_farmer.close()  # force flush once per layer
            logger.info('finish depth: %d' % (depth))
            depth -= 1
            user_network_queue = temp_user_network_queue
    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        # Always flush output and shut the workers down via the -1 sentinel.
        user_network_farmer.close()
        user_timeline_queue.put_nowait(-1)
        user_favorites_queue.put_nowait(-1)
        user_retweets_queue.put_nowait(-1)
        user_mentions_queue.put_nowait(-1)
        p.join()
        q.join()
        r.join()
        s.join()
        logger.info('all done')
def farm_user_network(apikeys, config=None, output_folder='./farm/', network_type="followers"):
    """Breadth-first crawl of a user network, farming each user's timeline.

    Starting from ``config['seeds']``, expand the follower (or friend) graph
    up to ``config['depth']`` layers.  Every discovered user id is pushed to a
    single worker process that farms timelines; a ``-1`` sentinel shuts the
    worker down.

    :param apikeys: Twitter API credentials, passed to UserFarm and the worker.
    :param config: optional dict with ``'seeds'`` (screen names) and
        ``'depth'`` (layers to expand, default 3).
    :param output_folder: root folder for farmed data.
    :param network_type: ``'followers'`` (default) or ``'friends'``.
    """
    config = {} if config is None else config  # was a shared mutable default

    # Network edges are written per user id under <output_folder>/<network_type>/.
    network_output_folder = os.path.abspath('%s/%s/' % (output_folder, network_type))
    shutil.rmtree(network_output_folder, ignore_errors=True)  # start clean
    user_network_farmer = UserFarm(apikeys=apikeys, verbose=False,
                                   output_folder=network_output_folder)

    seeds = config.get('seeds', [])
    depth = int(config.get('depth', 3))  # by default only fetch 3 layers

    # Single timeline worker fed through a bounded queue.
    user_timeline_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    p = multiprocessing.Process(target=farm_user_timelines,
                                args=(apikeys, user_timeline_queue, output_folder))
    p.start()

    # Resolve the seed screen names to user ids; this is layer 0 of the BFS.
    user_network_queue = user_network_farmer.get_user_ids(seeds)
    try:
        while depth > 0 and len(user_network_queue) > 0:
            temp_user_network_queue = set()
            for user_id in user_network_queue:
                time.sleep(5)  # pace requests to stay under API rate limits
                if network_type == 'friends':
                    f_ids = user_network_farmer.find_all_friends(user_id)
                else:
                    f_ids = user_network_farmer.find_all_followers(user_id)
                logger.info('user_id: %d has %d friends' % (user_id, len(f_ids)))
                for f_id in f_ids:
                    # block=True: apply back-pressure when the worker lags
                    user_timeline_queue.put(f_id, block=True)
                    temp_user_network_queue.add(f_id)
            user_network_farmer.close()  # force flush once per layer
            logger.info('finish depth: %d' % (depth))
            depth -= 1
            user_network_queue = temp_user_network_queue
    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        # Always flush output and shut the worker down via the -1 sentinel.
        user_network_farmer.close()
        user_timeline_queue.put_nowait(-1)
        p.join()
        logger.info('all done')
def farm_user_network(apikeys=None, seeds=None, depth=3, output_folder='./user_network', network_type='followers'):
    """Breadth-first crawl of the follower/friend network around *seeds*.

    Progress is checkpointed to ``progress.pickle`` in the working directory:
    ``progress[d]`` holds the set of user ids still to expand at depth ``d``.
    On start the checkpoint is loaded (best effort) and the crawl resumes
    from the deepest recorded layer; on exit (including Ctrl+C) it is saved.

    :param apikeys: Twitter API credentials for UserFarm.
    :param seeds: iterable of seed screen names (default: none).
    :param depth: number of layers to expand (overridden when resuming).
    :param output_folder: root folder; data goes to <output_folder>/<network_type>.
    :param network_type: ``'followers'`` (default) or ``'friends'``.
    """
    seeds = [] if seeds is None else seeds  # was a shared mutable default
    output_folder = os.path.abspath('%s/%s' % (output_folder, network_type))
    user_farm = UserFarm(apikeys=apikeys, verbose=False, output_folder=output_folder)

    progress = {}
    # Best-effort resume: a missing or corrupt checkpoint just starts fresh.
    try:
        with open('progress.pickle', 'rb') as pf:
            progress = pickle.load(pf)
    except (OSError, pickle.PickleError, EOFError):
        pass
    try:
        depth = max(progress.keys())
        logger.info('resume from depth: %d' % (depth))
    except ValueError:
        pass  # empty checkpoint: keep the caller-supplied depth

    try:
        # Resolve seed screen names to ids; merge with (not clobber) any
        # resumed queue at this depth.
        user_ids = user_farm.get_user_ids(seeds)
        progress.setdefault(depth, set()).update(user_ids)
        logger.info("number of seeds: %d" % len(user_ids))
        # BUG FIX: the original looped on len(user_ids), but that very set is
        # emptied by the pops below, so the crawl always stopped after one
        # layer regardless of depth; test the current layer's queue instead.
        while depth > 0 and len(progress.get(depth, ())) > 0:
            time.sleep(5)
            progress[depth - 1] = set()
            failed = set()  # ids whose retries were exhausted this layer
            while len(progress[depth]) > 0:
                user_id = progress[depth].pop()
                logger.info("fetching %s of %d" % (network_type, user_id))
                if os.path.exists(os.path.abspath('%s/%s' % (output_folder, user_id))):
                    logger.info("%d already fetched... pass" % user_id)
                    continue
                retry = False
                retry_cnt = MAX_RETRY_CNT
                while True:
                    try:
                        if network_type == 'friends':
                            f_ids = user_farm.find_all_friends(user_id)
                        else:
                            f_ids = user_farm.find_all_followers(user_id)
                        retry = False
                        retry_cnt = MAX_RETRY_CNT
                        if depth - 1 > 0:
                            progress[depth - 1].update(f_ids)
                    except Exception:  # narrowed from bare except so Ctrl+C propagates
                        retry = True
                        retry_cnt -= 1
                        time.sleep(60)  # back off before retrying the API call
                        logger.info("retries remaining if failed %d" % (retry_cnt))
                    if not retry or retry_cnt == 0:
                        break
                # retry failed
                if retry and retry_cnt == 0:
                    # BUG FIX: re-adding straight into progress[depth] made the
                    # inner loop pop the same failing id forever; defer the
                    # re-add so it is only retried on a later resume.
                    failed.add(user_id)
            progress[depth] |= failed  # checkpointed for the next resume
            logger.info('finish depth: %d' % (depth))
            depth -= 1
    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        # Always flush farmed data and checkpoint progress for resume.
        user_farm.close()
        with open('progress.pickle', 'wb') as pf:
            pickle.dump(progress, pf)
def farm_user_network(apikeys=None, seeds=None, depth=3, output_folder='./user_network', network_type='followers'):
    """Crawl the follower or friend graph outward from *seeds*, layer by layer.

    A checkpoint dict (depth -> set of pending user ids) is pickled to
    ``progress.pickle`` so an interrupted crawl can resume from the deepest
    layer it had reached.  The checkpoint is rewritten on every exit path.

    :param apikeys: Twitter API credentials for UserFarm.
    :param seeds: iterable of seed screen names (default: none).
    :param depth: layers to expand; replaced by the checkpoint when resuming.
    :param output_folder: root folder; data goes to <output_folder>/<network_type>.
    :param network_type: ``'followers'`` (default) or ``'friends'``.
    """
    seeds = [] if seeds is None else seeds  # mutable-default fix
    output_folder = os.path.abspath('%s/%s' % (output_folder, network_type))
    user_farm = UserFarm(apikeys=apikeys, verbose=False, output_folder=output_folder)

    # Load any previous checkpoint; absence or corruption means a fresh start.
    progress = {}
    try:
        with open('progress.pickle', 'rb') as pf:
            progress = pickle.load(pf)
    except (OSError, pickle.PickleError, EOFError):
        pass
    try:
        depth = max(progress.keys())  # resume from the deepest recorded layer
        logger.info('resume from depth: %d' % (depth))
    except ValueError:
        pass  # no checkpointed layers: use the depth argument

    try:
        # Seed layer: resolve screen names to ids and merge them into any
        # resumed pending set (the original overwrote it).
        user_ids = user_farm.get_user_ids(seeds)
        progress.setdefault(depth, set()).update(user_ids)
        logger.info("number of seeds: %d" % len(user_ids))
        # BUG FIX: checking len(user_ids) here stopped the crawl after a
        # single layer, because the pops below drain that same set; the
        # pending set of the CURRENT depth is the right condition.
        while depth > 0 and len(progress.get(depth, ())) > 0:
            time.sleep(5)
            progress[depth - 1] = set()
            exhausted = set()  # ids that used up every retry this layer
            while len(progress[depth]) > 0:
                user_id = progress[depth].pop()
                logger.info("fetching %s of %d" % (network_type, user_id))
                if os.path.exists(os.path.abspath('%s/%s' % (output_folder, user_id))):
                    logger.info("%d already fetched... pass" % user_id)
                    continue
                retry = False
                retry_cnt = MAX_RETRY_CNT
                while True:
                    try:
                        if network_type == 'friends':
                            f_ids = user_farm.find_all_friends(user_id)
                        else:
                            f_ids = user_farm.find_all_followers(user_id)
                        retry = False
                        retry_cnt = MAX_RETRY_CNT
                        if depth - 1 > 0:
                            progress[depth - 1].update(f_ids)
                    except Exception:  # was bare except, which also ate Ctrl+C
                        retry = True
                        retry_cnt -= 1
                        time.sleep(60)  # cool down before the next attempt
                        logger.info("retries remaining if failed %d" % (retry_cnt))
                    if not retry or retry_cnt == 0:
                        break
                if retry and retry_cnt == 0:
                    # BUG FIX: putting the id straight back into the live
                    # pending set spun forever on a permanently failing user;
                    # park it and re-queue only after the layer finishes.
                    exhausted.add(user_id)
            progress[depth] |= exhausted  # persisted so a resume retries them
            logger.info('finish depth: %d' % (depth))
            depth -= 1
    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        # Flush farmed output and write the checkpoint on every exit path.
        user_farm.close()
        with open('progress.pickle', 'wb') as pf:
            pickle.dump(progress, pf)