import multiprocessing
import os
import shutil
import time

# Assumes these names from the surrounding project: UserFarm,
# farm_user_timelines, farm_user_favorites, farm_user_retweets,
# farm_user_mentions, MAX_QUEUE_SIZE and logger.


def farm_user_network(apikeys,
                      config=None,
                      output_folder='./farm/',
                      network_type="followers"):

    network_output_folder = os.path.abspath(
        '%s/%s/' % (output_folder, network_type))  # by user id

    shutil.rmtree(network_output_folder, True)

    user_network_farmer = UserFarm(apikeys=apikeys,
                                   verbose=False,
                                   output_folder=network_output_folder)

    config = config or {}  # avoid a mutable default argument
    seeds = config.get('seeds', [])
    depth = int(config.get('depth', 3))  # by default only fetch 3 layers

    #progress = config.get('progress', {})

    #current_depth = progress.get('current_depth', 0) # start from the first layer
    #queue = progress.get('queue', {})
    #queue = queue if type(queue) is dict else raise Exception("the queue must be a dict, see twitter_crawler_config.json as an example")

    user_timeline_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    p = multiprocessing.Process(target=farm_user_timelines,
                                args=(apikeys, user_timeline_queue,
                                      output_folder))
    p.start()

    user_favorites_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    q = multiprocessing.Process(target=farm_user_favorites,
                                args=(apikeys, user_favorites_queue,
                                      output_folder))
    q.start()

    user_retweets_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    r = multiprocessing.Process(target=farm_user_retweets,
                                args=(apikeys, user_retweets_queue,
                                      output_folder))
    r.start()

    user_mentions_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    s = multiprocessing.Process(target=farm_user_mentions,
                                args=(apikeys, user_mentions_queue,
                                      output_folder))
    s.start()

    # get user_ids for the seeds
    user_network_queue = user_network_farmer.get_user_ids(seeds)

    try:
        #get user id first
        while depth > 0 and len(user_network_queue) > 0:
            temp_user_network_queue = set()

            for user_id in user_network_queue:
                time.sleep(5)
                if network_type == 'friends':
                    f_ids = user_network_farmer.find_all_friends(user_id)
                else:
                    f_ids = user_network_farmer.find_all_followers(user_id)

                logger.info('user_id: %d has %d %s' %
                            (user_id, len(f_ids), network_type))

                for f_id in f_ids:
                    # REVIEW: enabling these puts would also farm the
                    # timelines, favorites, retweets and mentions of every
                    # discovered user, which is not our use case here.
                    # user_timeline_queue.put(f_id, block=True)
                    # user_favorites_queue.put(f_id, block=True)
                    # user_retweets_queue.put(f_id, block=True)
                    # user_mentions_queue.put(f_id, block=True)
                    temp_user_network_queue.add(f_id)
                user_network_farmer.close()  # force a flush once per user

            logger.info('finish depth: %d' % (depth))

            depth -= 1
            user_network_queue = temp_user_network_queue

    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        user_network_farmer.close()

    user_timeline_queue.put_nowait(-1)
    user_favorites_queue.put_nowait(-1)
    user_retweets_queue.put_nowait(-1)
    user_mentions_queue.put_nowait(-1)

    p.join()
    q.join()
    r.join()
    s.join()

    logger.info('all done')
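
For reference, a minimal driver sketch for the function above. The apikeys
layout and the config keys are assumptions inferred from how the function
reads them; adjust to the project's actual credential format.

# Hypothetical usage sketch, not part of the original module.
if __name__ == '__main__':
    apikeys = {
        'app_key': '...',               # assumed OAuth credential layout
        'app_secret': '...',
        'oauth_token': '...',
        'oauth_token_secret': '...',
    }
    config = {
        'seeds': ['some_screen_name'],  # resolved to ids by get_user_ids()
        'depth': 2,                     # crawl two layers of the network
    }
    farm_user_network(apikeys, config=config,
                      output_folder='./farm/', network_type='followers')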
Example 2
def farm_user_network(apikeys, config=None, output_folder='./farm/', network_type="followers"):

	network_output_folder = os.path.abspath('%s/%s/'%(output_folder, network_type)) # by user id

	shutil.rmtree(network_output_folder, True)

	user_network_farmer = UserFarm(apikeys=apikeys, verbose=False, output_folder=network_output_folder)
	
	config = config or {}  # avoid a mutable default argument
	seeds = config.get('seeds', [])
	depth = int(config.get('depth', 3)) # by default only fetch 3 layers

	#progress = config.get('progress', {})

	#current_depth = progress.get('current_depth', 0) # start from the first layer
	#queue = progress.get('queue', {})
	#queue = queue if type(queue) is dict else raise Exception("the queue must be a dict, see twitter_crawler_config.json as an example")

	user_timeline_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)

	p = multiprocessing.Process(target=farm_user_timelines, args=(apikeys, user_timeline_queue, output_folder))
	p.start()
	# get user_ids for the seeds
	user_network_queue = user_network_farmer.get_user_ids(seeds)

	try:
		#get user id first 
		while depth > 0 and len(user_network_queue) > 0:
			temp_user_network_queue = set()

			for user_id in user_network_queue:
				time.sleep(5)
				if network_type == 'friends':
					f_ids = user_network_farmer.find_all_friends(user_id)
				else:
					f_ids = user_network_farmer.find_all_followers(user_id)

				logger.info('user_id: %d has %d %s'%(user_id, len(f_ids), network_type))

				for f_id in f_ids:
					user_timeline_queue.put(f_id, block=True)
					temp_user_network_queue.add(f_id)
				user_network_farmer.close() # force a flush once per user

			logger.info('finish depth: %d'%(depth))

			depth -= 1
			user_network_queue = temp_user_network_queue

	except KeyboardInterrupt:
		print()
		logger.error('You pressed Ctrl+C!')
		raise
	finally:
		user_network_farmer.close()

	user_timeline_queue.put_nowait(-1)

	p.join()

	logger.info('all done')
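
Both versions above hand ids to farm_user_timelines through a bounded
multiprocessing.Queue and then push -1 as a shutdown sentinel before joining.
The consumer side is not shown in this listing; the body below is a
hypothetical sketch of the contract it has to honor.

# Hypothetical consumer shape for the sentinel-terminated queue; only the
# signature is confirmed by the Process(target=..., args=...) calls above.
def farm_user_timelines(apikeys, queue, output_folder):
    while True:
        user_id = queue.get(block=True)
        if user_id == -1:  # poison pill pushed by the parent process
            break
        # ... fetch and persist this user's timeline here ...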
Example 3
import os
import pickle
import time

# Assumes project-level UserFarm, MAX_RETRY_CNT and logger.


def farm_user_network(apikeys=None,
                      seeds=None,
                      depth=3,
                      output_folder='./user_network',
                      network_type='followers'):

    seeds = seeds or []  # avoid a mutable default argument
    output_folder = os.path.abspath('%s/%s' % (output_folder, network_type))
    user_farm = UserFarm(apikeys=apikeys,
                         verbose=False,
                         output_folder=output_folder)

    progress = {}
    try:
        with open('progress.pickle', 'rb') as pf:
            progress = pickle.load(pf)
    except Exception:
        pass  # no saved progress; start fresh

    try:
        depth = max(progress.keys())

        logger.info('resume from depth: %d' % (depth))
    except ValueError:
        pass  # no saved layers; keep the depth argument

    try:

        # get user ids for the seeds; on resume, keep the saved frontier
        user_ids = user_farm.get_user_ids(seeds)

        progress.setdefault(depth, user_ids)

        logger.info("number of seeds: %d" % len(user_ids))

        # progress[depth] holds the current layer's frontier and is drained
        # below, so the loop must consult it rather than the user_ids alias.
        while depth > 0 and len(progress.get(depth, ())) > 0:
            time.sleep(5)
            progress[depth - 1] = set()

            while len(progress[depth]) > 0:

                user_id = progress[depth].pop()

                logger.info("fetching %s of %d" % (network_type, user_id))

                if os.path.exists(
                        os.path.abspath('%s/%s' % (output_folder, user_id))):
                    logger.info("%d already fetched... pass" % user_id)
                    continue

                retry = False
                retry_cnt = MAX_RETRY_CNT
                while True:
                    try:
                        if network_type == 'friends':
                            f_ids = user_farm.find_all_friends(user_id)
                        else:
                            f_ids = user_farm.find_all_followers(user_id)

                        retry = False
                        retry_cnt = MAX_RETRY_CNT
                        if depth - 1 > 0:
                            progress[depth - 1].update(f_ids)
                    except Exception:  # don't turn Ctrl+C into a retry
                        retry = True
                        retry_cnt -= 1
                        time.sleep(60)
                        logger.info("request failed; %d retries remaining" %
                                    (retry_cnt))

                    if not retry or retry_cnt == 0:
                        break

                # all retries failed
                if retry and retry_cnt == 0:
                    # put the unprocessed user back on the frontier
                    progress[depth].add(user_id)

            logger.info('finish depth: %d' % (depth))

            depth -= 1

    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        user_farm.close()
        with open('progress.pickle', 'wb') as pf:
            pickle.dump(progress, pf)
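
Unlike the queue-based variants above, this version takes seeds and depth
directly and checkpoints its per-layer frontier to progress.pickle in the
finally block, so a crawl interrupted with Ctrl+C resumes from the deepest
saved layer on the next run. A minimal call sketch, with the apikeys dict
assumed as in the first example:

# Hypothetical usage sketch; rerunning after an interrupt resumes from
# progress.pickle in the current working directory.
if __name__ == '__main__':
    farm_user_network(apikeys=apikeys,  # same credential dict as above
                      seeds=['some_screen_name'],
                      depth=2,
                      output_folder='./user_network',
                      network_type='friends')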