Example #1
	def test_client(self):
		nid = node_id()
		logger.info("sending to %s"%(nid))
		node_queue = NodeQueue(nid, redis_config=self.config['redis_config'])
		#redis_cmd_queue.clear()

		cmd = {
			"cmd": "CRAWL_FRIENDS",
			"user_id": 1948122342,
			"data_type": "ids",
			"depth": 2,
			"bucket":"friend_ids"
		}

		# cmd = {
		# 	"cmd": "CRAWL_USER_TIMELINE",
		# 	"user_id": 1948122342,#53039176,
		# 	"bucket": "timelines"
		# }

		node_queue.put(cmd)

		#cmd = {"cmd":"TERMINATE"}
		
		#node_queue.put(cmd)

		return True
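
The commented-out blocks above hint at the other commands the node queue accepts. Below is a minimal client sketch that sends the timeline-crawl and terminate commands the same way; the import paths (guessed here as tweetf0rm modules) and the config loading are assumptions based on this test, not verbatim project code.

import json

# Assumed import locations for the helpers used in the test above;
# adjust them to wherever NodeQueue and node_id live in this codebase.
from tweetf0rm.redis_helper import NodeQueue
from tweetf0rm.utils import node_id

with open('config.json') as f:               # same kind of config the tests load
    config = json.load(f)

nid = node_id()                              # address the local node
node_queue = NodeQueue(nid, redis_config=config['redis_config'])

# Crawl a user's timeline (mirrors the commented-out cmd in the test).
node_queue.put({
    "cmd": "CRAWL_USER_TIMELINE",
    "user_id": 1948122342,
    "bucket": "timelines"
})

# Ask this node's crawlers to shut down.
node_queue.put({"cmd": "TERMINATE"})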
Example #3
	}, 'SEARCH':{
		'-q/--query': dictionary['-q/--query']
	}}
	

	for k, v in cmds.iteritems():
		print('')
		print('\t%s:'%k)
		for kk, vv in v.iteritems():
			print('\t\t%s: %s'%(kk, vv))

	print('')


if __name__=="__main__":
	nid = node_id()
	import json, os
	
	parser = argparse.ArgumentParser(add_help=False)
	parser.add_argument('-c', '--config', help="config.json that contains a) twitter api keys; b) redis connection string;", required=True)
	parser.add_argument('-cmd', '--command', help="the cmd you want to run, e.g., \"CRAWL_FRIENDS\"", required=True)
	parser.add_argument('-uid', '--user_id', help="the user_id", default=0)
	parser.add_argument('-tid', '--tweet_id', help="the tweet_id", default=0)
	parser.add_argument('-dt', '--data_type', help="the data_type (e.g., 'ids' or 'users')", default='ids')
	parser.add_argument('-d', '--depth', help="the depth", default=1)
	parser.add_argument('-j', '--json', help="the location of the json file that has a list of user_ids or screen_names", required=False)
	parser.add_argument('-o', '--output', help="the location of the output json file for storing user_ids", default='user_ids.json')
	parser.add_argument('-nid', '--node_id', help="the node_id you want to interact with", default=nid)
	parser.add_argument('-q', '--query', help="the search query", default=None)
	
	try:
Example #4
def start_server(config, proxies):

    check_config(config)
    config = copy.copy(config)

    folders_to_create = []
    buckets = [
        "tweets", "followers", "follower_ids", "friends", "friend_ids",
        "timelines"
    ]

    ouput_folder = os.path.abspath(config['output'])
    archive_output = os.path.abspath(
        config['archive_output']) if config['archive_output'] else ouput_folder
    archive_output = os.path.join(archive_output, 'archived')

    folders_to_create.append(ouput_folder)
    folders_to_create.append(archive_output)

    for bucket in buckets:
        folders_to_create.append(os.path.join(ouput_folder, bucket))
        folders_to_create.append(os.path.join(archive_output, bucket))

    for folder_to_create in folders_to_create:
        if not os.path.exists(folder_to_create):
            os.makedirs(folder_to_create)

    logger.info("output to %s" % ouput_folder)
    logger.info("archived to %s" % archive_output)

    this_node_id = node_id()
    node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
    node_queue.clear()

    scheduler = Scheduler(this_node_id, config=config, proxies=proxies)

    logger.info('starting node_id: %s' % this_node_id)

    node_coordinator = NodeCoordinator(config['redis_config'])
    # node_coordinator.clear()

    # the main event loop; strictly speaking we don't need one, since we could just join
    # on the crawlers until a terminate command is issued to each crawler, but the loop
    # lets us report each crawler's status and perform the periodic tarball tasks...

    last_archive_ts = time.time() + 3600  # the first archive event starts one hour later
    pre_time = time.time()
    last_load_balancing_task_ts = time.time()
    while True:

        if time.time() - pre_time > 120:
            logger.info(pprint.pformat(scheduler.crawler_status()))
            pre_time = time.time()
            if scheduler.is_alive():
                cmd = {'cmd': 'CRAWLER_FLUSH'}
                scheduler.enqueue(cmd)

        if time.time() - last_archive_ts > 3600:

            logger.info("start archive procedure...")
            with concurrent.futures.ProcessPoolExecutor(
                    max_workers=len(buckets)) as executor:

                future_proxies = {
                    executor.submit(tarball_results, ouput_folder, bucket,
                                    archive_output,
                                    int(time.time()) - 3600): bucket
                    for bucket in buckets
                }

                for future in future_proxies:
                    future.add_done_callback(lambda f: logger.info(
                        "archive created? %s: [%s]" % f.result()))

            last_archive_ts = time.time()

        # block the main process, waiting for a command
        if not scheduler.is_alive():
            logger.info(
                "no crawler is alive... waiting to recreate all crawlers...")
            time.sleep(120)  # sleep for two minutes and retry
            continue

        if time.time() - last_load_balancing_task_ts > 1800:  # try to balance the local queues every 30 mins
            last_load_balancing_task_ts = time.time()
            cmd = {'cmd': 'BALANCING_LOAD'}
            scheduler.enqueue(cmd)

        cmd = node_queue.get(block=True, timeout=360)

        if cmd:
            scheduler.enqueue(cmd)
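
The main loop above multiplexes three periodic duties around a single blocking queue read: a status report and CRAWLER_FLUSH every 120 seconds, an archive pass every 3600 seconds, and a BALANCING_LOAD request every 1800 seconds, with node_queue.get(block=True, timeout=360) bounding how long any iteration can stall. The sketch below isolates that timing pattern with hypothetical stand-in functions; it is illustrative only and not part of the project.

import time

# Hypothetical stand-ins for the real work done in start_server().
def report_status():
    print("flush / status report")        # stands in for crawler_status() + CRAWLER_FLUSH

def archive_buckets():
    print("archive pass")                 # stands in for the tarball_results() fan-out

def balance_load():
    print("rebalance local queues")       # stands in for enqueuing BALANCING_LOAD

def poll_commands(timeout):
    # stands in for node_queue.get(block=True, timeout=360)
    time.sleep(min(timeout, 1))
    return None

last_report = last_balance = time.time()
last_archive = time.time() + 3600         # first archive pass one hour out

while True:
    now = time.time()
    if now - last_report > 120:            # status / flush every 2 minutes
        report_status()
        last_report = now
    if now - last_archive > 3600:          # archive every hour
        archive_buckets()
        last_archive = now
    if now - last_balance > 1800:          # load-balance every 30 minutes
        balance_load()
        last_balance = now

    # The blocking get with a timeout keeps the loop responsive: even when no
    # commands arrive, control returns here at most every `timeout` seconds so
    # the periodic checks above still fire.
    cmd = poll_commands(timeout=360)
    if cmd:
        print("forward to scheduler:", cmd)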
Example #5
def start_server(config, proxies):
	import copy
	
	check_config(config)
	config = copy.copy(config)

	folders_to_create = []
	buckets = ["tweets", "followers", "follower_ids", "friends", "friend_ids", "timelines"]

	ouput_folder = os.path.abspath(config['output'])
	archive_output = os.path.abspath(config['output']) if config['output'] else ouput_folder
	#archive_output = os.path.abspath(config['archive_output']) if config['archive_output'] else ouput_folder
	archive_output = os.path.join(archive_output, 'archived')

	folders_to_create.append(ouput_folder)
	folders_to_create.append(archive_output)

	for bucket in buckets:
		folders_to_create.append(os.path.join(ouput_folder, bucket))
		folders_to_create.append(os.path.join(archive_output, bucket))

	for folder_to_create in folders_to_create:
		if (not os.path.exists(folder_to_create)):
			os.makedirs(folder_to_create)

	logger.info("output to %s"%(ouput_folder))
	logger.info("archived to %s"%(archive_output))

	this_node_id = node_id()
	node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
	node_queue.clear()

	scheduler = Scheduler(this_node_id, config=config, proxies=proxies)

	logger.info('starting node_id: %s'%this_node_id)

	node_coordinator = NodeCoordinator(config['redis_config'])
	#node_coordinator.clear()
	
	#the main event loop; strictly speaking we don't need one, since we could just join on the crawlers until a terminate command is issued to each crawler,
	#but the loop lets us report each crawler's status and perform the periodic tarball tasks...
	
	last_archive_ts = time.time() + 3600 # the first archive event starts one hour later
	pre_time = time.time()
	last_load_balancing_task_ts = time.time()
	while True:
		
		if (time.time() - pre_time > 120):
			logger.info(pprint.pformat(scheduler.crawler_status()))
			pre_time = time.time()
			if (scheduler.is_alive()):
				cmd = {'cmd': 'CRAWLER_FLUSH'}
				scheduler.enqueue(cmd)

		if (time.time() - last_archive_ts > 3600):

			logger.info("start archive procedure...")
			with concurrent.futures.ProcessPoolExecutor(max_workers=len(buckets)) as executor:

				future_proxies = {executor.submit(tarball_results, ouput_folder, bucket, archive_output, int(time.time()) - 3600): bucket for bucket in buckets}
		
				for future in future_proxies:
					future.add_done_callback(lambda f: logger.info("archive created? %s: [%s]"%f.result()))

			last_archive_ts = time.time()

		# block the main process, waiting for a command
		if (not scheduler.is_alive()):
			logger.info("no crawler is alive... waiting to recreate all crawlers...")
			time.sleep(120) # sleep for two minutes and retry
			continue

		if (time.time() - last_load_balancing_task_ts > 1800): # try to balance the local queues every 30 mins
			last_load_balancing_task_ts = time.time()
			cmd = {'cmd': 'BALANCING_LOAD'}
			scheduler.enqueue(cmd)

		cmd = node_queue.get(block=True, timeout=360)

		if cmd:
			scheduler.enqueue(cmd)
Example #6
        'SEARCH': {
            '-q/--query': dictionary['-q/--query']
        }
    }

    for k, v in cmds.iteritems():
        print('')
        print('\t%s:' % k)
        for kk, vv in v.iteritems():
            print('\t\t%s: %s' % (kk, vv))

    print('')


if __name__ == "__main__":
    nid = node_id()
    import json, os

    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        '-c',
        '--config',
        help="config.json that contains a) twitter api keys; b) redis connection string;",
        required=True)
    parser.add_argument(
        '-cmd',
        '--command',
        help="the cmd you want to run, e.g., \"CRAWL_FRIENDS\"",
        required=True)
    parser.add_argument('-uid', '--user_id', help="the user_id", default=0)