# Imports needed by the snippets below; UserFarm, logger, the
# farm_user_favorites / farm_user_mentions workers, and the MAX_RETRY /
# RETRY_SLEEP / MAX_RETRY_CNT / MAX_QUEUE_SIZE constants are assumed to come
# from the surrounding crawler module.
import os
import time
import shutil
import pickle
import multiprocessing


def farm_user_retweets(apikeys, user_retweets_queue, output_folder='./farm/'):

    retweets_output_folder = os.path.abspath('%s/retweets/' %
                                             (output_folder))  # by user id

    user_retweets_farmer = UserFarm(apikeys=apikeys,
                                    verbose=False,
                                    output_folder=retweets_output_folder)

    current_user_id = 0

    retry = False

    problem_queue = []

    while current_user_id != -1:
        time.sleep(10)
        if retry and retry_cnt > 0:
            time.sleep(RETRY_SLEEP)
            user_retweets_farmer.write_to_handler.delete(current_user_id)
            retry = False
            retry_cnt -= 1
        else:
            current_user_id = user_retweets_queue.get(
                True)  # will block and wait for the next user_id
            #logger.info("retweets queue size: %d"%(user_retweets_queue.qsize())) no qsize() function on mac os x
            if current_user_id == -1:
                if len(problem_queue) > 0:  # a few user ids had problems; put them back for another pass
                    # at this point, the queue should be empty, so the puts won't block
                    for uid in problem_queue:
                        user_retweets_queue.put(uid, block=True)
                    problem_queue = []

                    # re-queue the sentinel so the loop can still terminate
                    user_retweets_queue.put(-1, block=True)

                    # get one to continue the process
                    current_user_id = user_retweets_queue.get(True)
                else:
                    break

            logger.info('retrieving retweets for: %d' % (current_user_id))
            retry = False
            retry_cnt = MAX_RETRY

        try:
            if not os.path.exists(
                    os.path.abspath(
                        '%s/%s' % (retweets_output_folder, current_user_id))):
                user_retweets_farmer.user_retweets(current_user_id)

        except Exception:
            logger.warn("exception; retries left: %d" % (retry_cnt))
            retry = True
            # note the problem but don't die; once retries are exhausted, park
            # the id so it is re-queued for another pass when the queue drains
            if retry_cnt <= 0:
                problem_queue.append(current_user_id)
        finally:
            user_retweets_farmer.close()

    # notify -1
    user_retweets_farmer.close()

    return True
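# A usage sketch (an assumption, not part of the original source): feed the
# consumer from a separate process through a bounded queue, then deliver the
# -1 sentinel so it drains and exits. apikeys and the user ids are placeholders.
if __name__ == '__main__':
    apikeys = {}  # hypothetical Twitter API credentials
    queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    worker = multiprocessing.Process(target=farm_user_retweets,
                                     args=(apikeys, queue, './farm/'))
    worker.start()
    for uid in (12345, 67890):  # hypothetical user ids
        queue.put(uid, block=True)
    queue.put(-1, block=True)  # tell the worker to finish
    worker.join()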
Example #2
def farm_user_timelines(apikeys, user_timeline_queue, output_folder='./farm/'):

	timelines_output_folder = os.path.abspath('%s/timelines/'%(output_folder)) # by user id

	user_timelines_farmer = UserFarm(apikeys=apikeys, verbose=False, output_folder=timelines_output_folder)

	current_user_id = 0

	retry = False

	problem_queue = []

	while current_user_id != -1:
		time.sleep(10)
		if retry and retry_cnt > 0:
			time.sleep(RETRY_SLEEP)
			user_timelines_farmer.write_to_handler.delete(current_user_id)
			retry = False
			retry_cnt -= 1
		else:
			current_user_id = user_timeline_queue.get(True) # will block and wait for the next user_id
			#logger.info("timeline queue size: %d"%(user_timeline_queue.qsize())) no qsize() function on mac os x
			if current_user_id == -1:
				if len(problem_queue) > 0: # a few user ids had problems; put them back for another pass
					# at this point, the queue should be empty, so the puts won't block
					for uid in problem_queue:
						user_timeline_queue.put(uid, block=True)
					problem_queue = []

					# re-queue the sentinel so the loop can still terminate
					user_timeline_queue.put(-1, block=True)

					# get one to continue the process
					current_user_id = user_timeline_queue.get(True)
				else:
					break

			logger.info('retrieving timeline for: %d'%(current_user_id))
			retry = False
			retry_cnt = MAX_RETRY

		try:
			if not os.path.exists(os.path.abspath('%s/%s'%(timelines_output_folder, current_user_id))):
				user_timelines_farmer.user_timeline(current_user_id)

		except Exception:
			logger.warn("exception; retries left: %d"%(retry_cnt))
			retry = True
			# note the problem but don't die; once retries are exhausted, park
			# the id so it is re-queued for another pass when the queue drains
			if retry_cnt <= 0:
				problem_queue.append(current_user_id)
		finally:
			user_timelines_farmer.close()
			

	# notify -1
	user_timelines_farmer.close()

	return True
def farm_user_timelines(apikeys, seeds, output_folder):

	user_farm = UserFarm(apikeys=apikeys, verbose=False, output_folder=os.path.abspath(output_folder))

	try:
		#get user id first
		user_ids = user_farm.get_user_ids(seeds)

		for user_id in user_ids:
			# Currently this skips a user if the result file is already there.
			# That is not reliable, since an error could be raised when only half
			# of a user's tweets are finished, which means losing the other half
			# for that user. The current use case doesn't really care: with
			# millions of users to worry about, losing one isn't a big deal, but
			# progress tracking certainly needs a better approach (see the
			# sketch after this function).
			if not os.path.exists(os.path.abspath('%s/%s'%(output_folder, user_id))):
				user_farm.user_timeline(user_id)
	except KeyboardInterrupt:
		logger.warn('You pressed Ctrl+C!')
		raise
	except:		
		raise
	finally:
		user_farm.close()
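# A minimal sketch of the "better way to track progress" mentioned in the
# comment above (an assumption, not the author's code): trust only a marker
# file written after user_timeline() returns without error, so half-written
# output is never mistaken for a finished user.
def timeline_done_marker(output_folder, user_id):
	# hypothetical helper: path of the per-user completion marker
	return os.path.abspath('%s/%s.done' % (output_folder, user_id))

def fetch_with_marker(user_farm, output_folder, user_id):
	if os.path.exists(timeline_done_marker(output_folder, user_id)):
		return  # marker present: this user was fully fetched earlier
	user_farm.user_timeline(user_id)
	# only reached on success; a crash above leaves no marker behind
	open(timeline_done_marker(output_folder, user_id), 'w').close()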
def farm_user_network(apikeys,
                      config={},
                      output_folder='./farm/',
                      network_type="followers"):

    network_output_folder = os.path.abspath(
        '%s/%s/' % (output_folder, network_type))  # by user id

    shutil.rmtree(network_output_folder, True)

    user_network_farmer = UserFarm(apikeys=apikeys,
                                   verbose=False,
                                   output_folder=network_output_folder)

    seeds = config['seeds'] if 'seeds' in config else []
    depth = int(config.get('depth', 3))  # by default only fetch 3 layers

    #progress = config.get('progress', {})

    #current_depth = progress.get('current_depth', 0) # start from the first layer
    #queue = progress.get('queue', {})
    #queue = queue if type(queue) is dict else raise Exception("the queue must be a dict, see twitter_crawler_config.json as an example")

    user_timeline_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    p = multiprocessing.Process(target=farm_user_timelines,
                                args=(apikeys, user_timeline_queue,
                                      output_folder))
    p.start()

    user_favorites_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    q = multiprocessing.Process(target=farm_user_favorites,
                                args=(apikeys, user_favorites_queue,
                                      output_folder))
    q.start()

    user_retweets_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    r = multiprocessing.Process(target=farm_user_retweets,
                                args=(apikeys, user_retweets_queue,
                                      output_folder))
    r.start()

    user_mentions_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    s = multiprocessing.Process(target=farm_user_mentions,
                                args=(apikeys, user_mentions_queue,
                                      output_folder))
    s.start()

    # get user_ids for the seeds
    user_network_queue = user_network_farmer.get_user_ids(seeds)

    try:
        #get user id first
        while depth > 0 and len(user_network_queue) > 0:
            temp_user_network_queue = set()

            for user_id in user_network_queue:
                time.sleep(5)
                if network_type == 'friends':
                    f_ids = user_network_farmer.find_all_friends(user_id)
                else:
                    f_ids = user_network_farmer.find_all_followers(user_id)

                logger.info('user_id: %d has %d %s' %
                            (user_id, len(f_ids), network_type))

                for f_id in f_ids:
                    # TO EVALUATE - It seems that if I enable this, it fetches the friends and the friends' timelines... Not our case.
                    #user_timeline_queue.put(f_id, block=True)
                    # TO EVALUATE - It seems that if I enable this, it fetches the friends and the friends' favorites... Not our case.
                    #user_favorites_queue.put(f_id, block=True)
                    # TO EVALUATE - It seems that if I enable this, it fetches the friends and the friends' retweets... Not our case.
                    #user_retweets_queue.put(f_id, block=True)
                    # TO EVALUATE - It seems that if I enable this, it fetches the friends and the friends' mentions... Not our case.
                    #user_mentions_queue.put(f_id, block=True)
                    temp_user_network_queue.add(f_id)

                user_network_farmer.close()  # force a flush once per user_id

            logger.info('finish depth: %d' % (depth))

            depth -= 1
            user_network_queue = temp_user_network_queue

    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    except:
        raise
    finally:
        user_network_farmer.close()

    # deliver the -1 sentinel to each worker; block so it is never dropped on a full queue
    user_timeline_queue.put(-1, block=True)
    user_favorites_queue.put(-1, block=True)
    user_retweets_queue.put(-1, block=True)
    user_mentions_queue.put(-1, block=True)

    p.join()
    q.join()
    r.join()
    s.join()

    logger.info('all done')
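# A sketch of the config dict this function expects; the keys ('seeds',
# 'depth') are taken from the code above, the values are placeholders. The
# commented-out lines above suggest it is normally loaded from a file such as
# twitter_crawler_config.json.
config = {
    'seeds': ['some_screen_name'],  # hypothetical seed accounts for get_user_ids()
    'depth': 2,                     # crawl two layers of the network
}
#farm_user_network(apikeys, config=config, output_folder='./farm/', network_type='followers')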
Example #5
def farm_user_network(apikeys=None,
                      seeds=[],
                      depth=3,
                      output_folder='./user_network',
                      network_type='followers'):

    output_folder = os.path.abspath('%s/%s' % (output_folder, network_type))
    user_farm = UserFarm(apikeys=apikeys,
                         verbose=False,
                         output_folder=output_folder)

    progress = {}
    try:
        with open('progress.pickle', 'rb') as pf:
            progress = pickle.load(pf)
    except Exception:
        pass  # no saved progress; start fresh

    try:
        depth = max(progress.keys())

        logger.info('resume from depth: %d' % (depth))
    except Exception:
        pass  # empty progress dict; keep the depth argument

    try:

        #get user id first
        user_ids = user_farm.get_user_ids(seeds)

        # don't clobber a layer restored from progress.pickle with the seeds
        if depth not in progress or len(progress[depth]) == 0:
            progress[depth] = user_ids

        logger.info("number of seeds: %d" % len(user_ids))

        while depth > 0 and len(user_ids) > 0:
            time.sleep(5)
            progress.setdefault(depth - 1, set())  # keep any ids already collected for the next layer

            failed_ids = set()  # ids that exhaust their retries at this depth

            while len(progress[depth]) > 0:

                user_id = progress[depth].pop()

                logger.info("fetching %s of %d" % (network_type, user_id))

                if os.path.exists(
                        os.path.abspath('%s/%s' % (output_folder, user_id))):
                    logger.info("%d already fetched... pass" % user_id)
                    continue

                retry = False
                retry_cnt = MAX_RETRY_CNT
                while True:
                    try:
                        if network_type == 'friends':
                            f_ids = user_farm.find_all_friends(user_id)
                        else:
                            f_ids = user_farm.find_all_followers(user_id)

                        retry = False
                        retry_cnt = MAX_RETRY_CNT
                        if depth - 1 > 0:
                            progress[depth - 1].update(f_ids)
                    except Exception:
                        retry = True
                        retry_cnt -= 1
                        time.sleep(60)
                        logger.info("%d retries remaining" % (retry_cnt))

                    if not retry or retry_cnt == 0:
                        break

                # retries exhausted: park the id instead of re-adding it to
                # progress[depth], which would re-pop it in an endless loop
                if retry and retry_cnt == 0:
                    failed_ids.add(user_id)

            progress[depth] = failed_ids  # persist unfetched ids for a resumed run

            logger.info('finish depth: %d' % (depth))

            depth -= 1

    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    except:
        raise
    finally:
        user_farm.close()
        with open('progress.pickle', 'wb') as pf:
            pickle.dump(progress, pf)
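# A small helper sketch (an assumption, not in the original) for inspecting a
# saved progress.pickle. The layout is inferred from the code above: a dict
# mapping each remaining depth to the set of user ids still pending there,
# with max(progress.keys()) giving the layer a resumed run starts from.
def show_progress(path='progress.pickle'):
    with open(path, 'rb') as pf:
        progress = pickle.load(pf)
    for depth in sorted(progress.keys(), reverse=True):
        print('depth %d: %d pending user ids' % (depth, len(progress[depth])))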
Example #6
def farm_user_network(apikeys, config = {}, output_folder='./farm/', network_type="followers"):

	network_output_folder = os.path.abspath('%s/%s/'%(output_folder, network_type)) # by user id

	shutil.rmtree(network_output_folder, True)

	user_network_farmer = UserFarm(apikeys=apikeys, verbose=False, output_folder=network_output_folder)
	
	seeds = config['seeds'] if 'seeds' in config else []
	depth = int(config.get('depth', 3)) # by default only fetch 3 layers

	#progress = config.get('progress', {})

	#current_depth = progress.get('current_depth', 0) # start from the first layer
	#queue = progress.get('queue', {})
	#queue = queue if type(queue) is dict else raise Exception("the queue must be a dict, see twitter_crawler_config.json as an example")

	user_timeline_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)

	p = multiprocessing.Process(target=farm_user_timelines, args=(apikeys, user_timeline_queue, output_folder))
	p.start()
	# get user_ids for the seeds
	user_network_queue = user_network_farmer.get_user_ids(seeds)

	try:
		#get user id first 
		while depth > 0 and len(user_network_queue) > 0:
			temp_user_network_queue = set()

			for user_id in user_network_queue:
				time.sleep(5)
				if network_type == 'friends':
					f_ids = user_network_farmer.find_all_friends(user_id)
				else:
					f_ids = user_network_farmer.find_all_followers(user_id)

				logger.info('user_id: %d has %d %s'%(user_id, len(f_ids), network_type))

				for f_id in f_ids:
					user_timeline_queue.put(f_id, block=True)
					temp_user_network_queue.add(f_id)

				user_network_farmer.close() # force a flush once per user_id

			logger.info('finish depth: %d'%(depth))

			depth -= 1
			user_network_queue = temp_user_network_queue

	except KeyboardInterrupt:
		print()
		logger.error('You pressed Ctrl+C!')
		raise
	except:		
		raise
	finally:
		user_network_farmer.close()

	user_timeline_queue.put(-1, block=True) # deliver the sentinel; block so it is never dropped

	p.join()

	logger.info('all done')