Example #1
    def test_client(self):
        nid = node_id()
        logger.info("sending to %s" % (nid))
        node_queue = NodeQueue(nid, redis_config=self.config['redis_config'])
        #redis_cmd_queue.clear()

        cmd = {
            "cmd": "CRAWL_FRIENDS",
            "user_id": 1948122342,
            "data_type": "ids",
            "depth": 2,
            "bucket": "friend_ids"
        }

        # cmd = {
        # 	"cmd": "CRAWL_USER_TIMELINE",
        # 	"user_id": 1948122342,#53039176,
        # 	"bucket": "timelines"
        # }

        node_queue.put(cmd)

        #cmd = {"cmd":"TERMINATE"}

        #node_queue.put(cmd)

        return True
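
For context, a command put on a NodeQueue is consumed elsewhere with a blocking get; below is a minimal sketch of the receiving side, assuming the same NodeQueue interface used in these examples (get(block=..., timeout=...) returning a command dict, or nothing on timeout), a config dict like self.config in the test, and a scheduler object as in start_server further down.

# Sketch only: drain commands from this node's queue and hand them to a scheduler,
# mirroring the consumer loop in start_server below.
node_queue = NodeQueue(node_id(), redis_config=config['redis_config'])
while True:
    cmd = node_queue.get(block=True, timeout=360)
    if not cmd:
        continue                  # timed out, poll again
    if cmd['cmd'] == 'TERMINATE':
        break                     # matches the commented-out TERMINATE command above
    scheduler.enqueue(cmd)        # 'scheduler' is assumed here, as in start_server below
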
    def __init__(self,
                 node_id,
                 crawler_id,
                 apikeys,
                 handlers,
                 redis_config,
                 proxies=None):
        if (handlers == None):
            raise MissingArgs("you need a handler to write the data to...")

        super(UserRelationshipCrawler, self).__init__(node_id, crawler_id,
                                                      redis_config, handlers)

        self.apikeys = copy.copy(apikeys)
        self.tasks = {
            "TERMINATE": "TERMINATE",
            "CRAWL_FRIENDS": {
                "users": "find_all_friends",
                "ids": "find_all_friend_ids",
                "network_type": "friends"
            },
            "CRAWL_FOLLOWERS": {
                "users": "find_all_followers",
                "ids": "find_all_follower_ids",
                "network_type": "followers"
            },
            "CRAWL_USER_TIMELINE": "fetch_user_timeline",
            "CRAWL_TWEET": "fetch_tweet_by_id"
        }
        self.node_queue = NodeQueue(self.node_id, redis_config=redis_config)
        self.client_args = {"timeout": 300}
        self.proxies = iter(proxies) if proxies else None
        self.user_api = None

        self.init_user_api()
def flush_cmd(bulk, data_type, template, redis_config):

	try:
		node_coordinator = NodeCoordinator(redis_config=redis_config)

		qsizes = node_coordinator.node_qsizes()

		logger.debug(qsizes)
		
		node_queues = {}

		for element in bulk:
			if data_type == "ids" and type(element) == int:
				user_id = element
			elif data_type =="users" and type(element) == dict and "id" in element:
				user_id = element['id']
			
			t = copy.copy(template)
			t["user_id"] = int(user_id)
			t["depth"] = int(t["depth"]) -1

			node_id = get_keys_by_min_value(qsizes)[0]

			if (node_id in node_queues):
				node_queue = node_queues[node_id]
			else:
				node_queue = NodeQueue(node_id, redis_config=redis_config)
				node_queues[node_id] = node_queue


			t['cmd_hash'] = hash_cmd(t)
			node_queue.put(t)
			qsizes[node_id] += 1

			logger.debug("send [%s] to node: %s"%(json.dumps(t),node_id))

		# intended to close all redis connections, but not sure yet...
		node_queues.clear()

		del node_coordinator

			
	except Exception as exc:
		logger.error('error during flush: %s'%exc)

	return True
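
A hedged usage sketch of flush_cmd, assuming bulk is a list of user ids, a config dict like the one used elsewhere in these examples, and a template that mirrors the CRAWL_FRIENDS command from the test above; note that flush_cmd itself overwrites "user_id" and decrements "depth" per element.

# Illustration only: fan follow-up CRAWL_FRIENDS commands out to the least-loaded node queues.
template = {
	"cmd": "CRAWL_FRIENDS",
	"user_id": 1948122342,   # placeholder; flush_cmd overwrites this per element
	"data_type": "ids",
	"depth": 2,              # children are enqueued with depth - 1
	"bucket": "friend_ids"
}
friend_ids = [53039176, 1948122342]  # ids taken from the snippets above, for illustration
flush_cmd(friend_ids, "ids", template, config['redis_config'])
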
Example #7
	def test_redis_connections(self):
		nodes = {}

		cnt = 0
		while True:
			nodes[cnt] = NodeQueue("node_id", redis_config=self.config['redis_config'])
			cnt += 1
			if (cnt % 5 == 0):
				nodes.clear()
			time.sleep(1)
Example #8
def cmd(config, args):
	
	if (args.command not in avaliable_cmds):
		raise Exception("not a valid command...")

	nid = args.node_id
	
	logger.info("node_id: %s"%(nid))
	node_queue = NodeQueue(nid, redis_config=config['redis_config'])
	node_coordinator = NodeCoordinator(config['redis_config'])
	# this can be done locally without sending the command to the servers...
	if (args.command == 'GET_UIDS_FROM_SCREEN_NAMES'):
		apikeys = config["apikeys"].values()[0]
		if (not os.path.exists(args.json)):
			raise Exception("doesn't exist... ")
		with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
			screen_names = json.load(f)
			user_api = User(apikeys=apikeys)
			user_ids = user_api.get_user_ids_by_screen_names(screen_names)
			json.dump(list(user_ids), o_f)
	elif (args.command == 'GET_USERS_FROM_IDS'):
		apikeys = config["apikeys"].values()[0]
		if (not os.path.exists(args.json)):
			raise Exception("doesn't exist... ")
		with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
			user_ids = json.load(f)
			user_api = User(apikeys=apikeys)
			users = user_api.get_users(user_ids)
			json.dump(list(users), o_f)
	elif (args.command.startswith('BATCH_')):
		command = args.command.replace('BATCH_', '')
		args_dict = copy.copy(args.__dict__)
		if (not os.path.exists(args.json)):
			raise Exception("doesn't exist... ")
		with open(os.path.abspath(args.json), 'rb') as f:
			user_ids = json.load(f)
			for user_id in user_ids:
				args_dict['user_id'] = user_id
				cmd = new_cmd(command, args_dict)
				node_queue.put(cmd)
	elif (args.command == 'LIST_NODES'):
		pp.pprint(node_coordinator.list_nodes())
	elif (args.command == 'NODE_QSIZES'):
		raise NotImplementedError("Not implemented yet...")
		#pp.pprint(node_coordinator.list_nodes())
	elif (args.command == 'SHUTDOWN_NODE'):
		#node_coordinator.remove_node(nid)
		#pp.pprint(node_coordinator.list_nodes())
		raise NotImplementedError("Not implemented yet...")
	elif (args.command == 'CLEAR_NODE_QUEUES'):
		node_queue.clear_all_queues()
	else:
		args_dict = copy.copy(args.__dict__)
		cmd = new_cmd(args.command, args_dict)
		node_queue.put(cmd)
		logger.info('sent [%s]'%(cmd))
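
For illustration, cmd() reads an argparse-style namespace; here is a minimal sketch of invoking it for a single non-batch command, assuming CRAWL_USER_TIMELINE is listed in avaliable_cmds, the attribute names used above (command, node_id, json, output), and whatever extra fields new_cmd() expects for that command.

import argparse

# Hypothetical invocation; attribute names mirror what cmd() reads above.
args = argparse.Namespace(
	command='CRAWL_USER_TIMELINE',
	node_id='node-1',        # placeholder node id
	json=None, output=None,  # only consulted by the GET_*/BATCH_* branches
	user_id=1948122342,      # copied into the command via args.__dict__
	bucket='timelines')
cmd(config, args)            # falls through to new_cmd() and node_queue.put()
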
class TwitterCrawler(CrawlerProcess):
    def __init__(self,
                 node_id,
                 crawler_id,
                 apikeys,
                 handlers,
                 redis_config,
                 proxies=None):
        if (handlers == None):
            raise MissingArgs("you need a handler to write the data to...")

        super(TwitterCrawler, self).__init__(node_id, crawler_id, redis_config,
                                             handlers)

        self.apikeys = copy.copy(apikeys)
        self.tasks = {
            "TERMINATE": "TERMINATE",
            "CRAWL_FRIENDS": {
                "users": "find_all_friends",
                "ids": "find_all_friend_ids",
                "network_type": "friends"
            },
            "CRAWL_FOLLOWERS": {
                "users": "find_all_followers",
                "ids": "find_all_follower_ids",
                "network_type": "followers"
            },
            "CRAWL_USER_TIMELINE": "fetch_user_timeline",
            "CRAWL_TWEET": "fetch_tweet_by_id",
            "SEARCH": "search_by_query"
        }
        self.node_queue = NodeQueue(self.node_id, redis_config=redis_config)
        self.client_args = {"timeout": 300}
        self.proxies = iter(proxies) if proxies else None
        self.twitter_api = None

        self.init_twitter_api()

    def init_twitter_api(self):  # this will throw StopIteration if all proxies have been tried...
        if (self.proxies):
            try:
                self.client_args['proxies'] = next(
                    self.proxies)['proxy_dict']  # this will throw StopIteration once all proxies are tried
            # logger.info("client_args: %s"%json.dumps(self.client_args))
            except StopIteration as exc:
                raise
            except Exception as exc:
                self.init_twitter_api()

        if (self.twitter_api):
            del self.twitter_api

        # crawler_id=self.crawler_id,
        self.twitter_api = TwitterAPI(apikeys=self.apikeys,
                                      client_args=self.client_args)

    def get_handlers(self):
        return self.handlers

    def avaliable_cmds(self):
        return self.tasks.keys()

    def run(self):
        while True:
            # cmd is in json format
            # cmd = {
            #	network_type: "followers", # or friends
            #	user_id: id,
            #	data_type: 'ids' # users
            # }
            cmd = self.get_cmd()

            command = cmd['cmd']

            logger.debug("new cmd: %s" % (cmd))

            redis_cmd_handler = None

            # maybe changing this to a dispatch map would be less expressive but easier to read... there aren't too many cases here yet, though...
            if (command == 'TERMINATE'):
                # make sure we flush all existing data in the handlers..
                for handler in self.handlers:
                    handler.flush_all()
                break
            elif (command == 'CRAWLER_FLUSH'):
                for handler in self.handlers:
                    handler.flush_all()
            else:

                # figure out args first...
                args = {}
                if (command == 'CRAWL_TWEET'):
                    args = {
                        "tweet_id": cmd['tweet_id'],
                        "write_to_handlers": self.handlers,
                        "cmd_handlers": []
                    }
                elif (command == 'SEARCH'):
                    args = {
                        "write_to_handlers": self.handlers,
                        "cmd_handlers": []
                    }
                else:
                    args = {
                        "user_id": cmd['user_id'],
                        "write_to_handlers": self.handlers,
                        "cmd_handlers": []
                    }

                bucket = cmd["bucket"] if "bucket" in cmd else None

                if (bucket):
                    args["bucket"] = bucket

                func = None
                if (command in ['CRAWL_USER_TIMELINE', 'CRAWL_TWEET']):
                    func = getattr(self.twitter_api, self.tasks[command])
                elif (command in ['SEARCH']):

                    if "lang" in cmd:
                        args['lang'] = cmd['lang']

                    if "geocode" in cmd:
                        args['geocode'] = cmd['geocode']

                    if "key" in cmd:
                        args['key'] = cmd['key']

                    # logger.info("new cmd: %s"%(cmd))
                    # q is required, otherwise let it fail...
                    if "query" in cmd:
                        args['query'] = cmd['query']
                        func = getattr(self.twitter_api, self.tasks[command])

                elif (command in ['CRAWL_FRIENDS', 'CRAWL_FOLLOWERS']):
                    data_type = cmd['data_type']

                    try:
                        depth = cmd["depth"] if "depth" in cmd else None
                        depth = int(depth)
                        # for handler in self.handlers:
                        # 	if isinstance(handler, InMemoryHandler):
                        # 		inmemory_handler = handler
                        if (depth > 1):
                            template = copy.copy(cmd)
                            # template = {
                            #	network_type: "followers", # or friends
                            #	user_id: id,
                            #	data_type: 'ids' # object
                            #	depth: depth
                            # }
                            # will throw out exception if redis_config doesn't exist...
                            args["cmd_handlers"].append(
                                CrawlUserRelationshipCommandHandler(
                                    template=template,
                                    redis_config=self.redis_config))

                            logger.info("depth: %d, # of cmd_handlers: %d" %
                                        (depth, len(args['cmd_handlers'])))

                    except Exception as exc:
                        logger.warn(exc)

                    func = getattr(self.twitter_api,
                                   self.tasks[command][data_type])

                if func:
                    try:
                        # logger.info(args)
                        func(**args)
                        del args['cmd_handlers']
                        for handler in self.handlers:
                            handler.flush_all()
                    except Exception as exc:
                        logger.error("%s" % exc)
                        try:
                            self.init_twitter_api()
                        except StopIteration as init_twitter_api_exc:
                            # import exceptions
                            # if (isinstance(init_user_api_exc, exceptions.StopIteration)): # no more proxy to try... so kill myself...
                            for handler in self.handlers:
                                handler.flush_all()

                            logger.warn(
                                'not enough proxy servers, kill me... %s' %
                                (self.crawler_id))
                            # flush first
                            self.node_queue.put({
                                'cmd': 'CRAWLER_FAILED',
                                'crawler_id': self.crawler_id
                            })
                            del self.node_queue
                            return False
                        # raise
                        else:
                            # put current task back to queue...
                            logger.info(
                                'pushing current task back to the queue: %s' %
                                (json.dumps(cmd)))
                            self.enqueue(cmd)

                            # logger.error(full_stack())

                else:
                    logger.warn("whatever are you trying to do?")

        logger.info("looks like i'm done...")

        return True
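
The SEARCH branch above pulls query (required), plus optional lang, geocode, key, and bucket fields from the command dict; below is a minimal sketch of enqueuing such a command, assuming the same NodeQueue setup as in the test_client example, with all values being illustrative placeholders.

# Hypothetical SEARCH command for TwitterCrawler.run(); without "query" no
# function is resolved and the command is effectively ignored.
search_cmd = {
    "cmd": "SEARCH",
    "query": "some search terms",   # placeholder query
    "lang": "en",                   # optional
    "geocode": "40.7,-74.0,10mi",   # optional, illustrative value
    "key": "my_search",             # optional label, illustrative
    "bucket": "tweets"
}
node_queue = NodeQueue(node_id(), redis_config=config['redis_config'])
node_queue.put(search_cmd)
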
Example #10
def start_server(config, proxies):

    check_config(config)
    config = copy.copy(config)

    folders_to_create = []
    buckets = [
        "tweets", "followers", "follower_ids", "friends", "friend_ids",
        "timelines"
    ]

    ouput_folder = os.path.abspath(config['output'])
    archive_output = os.path.abspath(
        config['archive_output']) if config['archive_output'] else ouput_folder
    archive_output = os.path.join(archive_output, 'archived')

    folders_to_create.append(ouput_folder)
    folders_to_create.append(archive_output)

    for bucket in buckets:
        folders_to_create.append(os.path.join(ouput_folder, bucket))
        folders_to_create.append(os.path.join(archive_output, bucket))

    for folder_to_create in folders_to_create:
        if not os.path.exists(folder_to_create):
            os.makedirs(folder_to_create)

    logger.info("output to %s" % ouput_folder)
    logger.info("archived to %s" % archive_output)

    this_node_id = node_id()
    node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
    node_queue.clear()

    scheduler = Scheduler(this_node_id, config=config, proxies=proxies)

    logger.info('starting node_id: %s' % this_node_id)

    node_coordinator = NodeCoordinator(config['redis_config'])
    # node_coordinator.clear()

    # the main event loop; strictly we don't need one, since we could just join on the crawlers and not stop until a terminate command is issued to each crawler,
    # but we need one to report the status of each crawler and perform the tarball tasks...

    last_archive_ts = time.time() + 3600  # the first archive event starts 2 hrs later...
    pre_time = time.time()
    last_load_balancing_task_ts = time.time()
    while True:

        if time.time() - pre_time > 120:
            logger.info(pprint.pformat(scheduler.crawler_status()))
            pre_time = time.time()
            if scheduler.is_alive():
                cmd = {'cmd': 'CRAWLER_FLUSH'}
                scheduler.enqueue(cmd)

        if time.time() - last_archive_ts > 3600:

            logger.info("start archive procedure...")
            with concurrent.futures.ProcessPoolExecutor(
                    max_workers=len(buckets)) as executor:

                future_proxies = {
                    executor.submit(tarball_results, ouput_folder, bucket,
                                    archive_output,
                                    int(time.time()) - 3600): bucket
                    for bucket in buckets
                }

                for future in future_proxies:
                    future.add_done_callback(lambda f: logger.info(
                        "archive created? %s: [%s]" % f.result()))

            last_archive_ts = time.time()

        # block, the main process...for a command
        if not scheduler.is_alive():
            logger.info(
                "no crawler is alive... waiting to recreate all crawlers...")
            time.sleep(120)  # sleep for two minutes and retry
            continue

        if time.time() - last_load_balancing_task_ts > 1800:  # try to balance the local queues every 30 mins
            last_load_balancing_task_ts = time.time()
            cmd = {'cmd': 'BALANCING_LOAD'}
            scheduler.enqueue(cmd)

        cmd = node_queue.get(block=True, timeout=360)

        if cmd:
            scheduler.enqueue(cmd)
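
Pulling together the keys referenced across these examples, here is a rough sketch of the structures start_server and cmd() appear to expect; the nested redis and API-key contents are never shown in the snippets, so everything below is an assumption with placeholders left empty.

# Inferred shape only; every top-level key is read somewhere in the examples above.
config = {
    "output": "/path/to/output",            # one sub-folder per bucket is created here
    "archive_output": "/path/to/archive",   # hourly tarballs land under .../archived
    "redis_config": {},                     # passed straight to NodeQueue/NodeCoordinator; fields not shown here
    "apikeys": {},                          # cmd() uses config["apikeys"].values()[0]; per-account contents not shown
}
proxies = [{"proxy_dict": {}}]              # init_twitter_api() reads 'proxy_dict' from each entry
start_server(config, proxies)
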
def start_server(config, proxies):
	import copy
	
	check_config(config)
	config = copy.copy(config)

	folders_to_create = []
	buckets = ["tweets", "followers", "follower_ids", "friends", "friend_ids", "timelines"]

	ouput_folder = os.path.abspath(config['output'])
	archive_output = os.path.abspath(config['output']) if config['output'] else ouput_folder
	#archive_output = os.path.abspath(config['archive_output']) if config['archive_output'] else ouput_folder
	archive_output = os.path.join(archive_output, 'archived')

	folders_to_create.append(ouput_folder)
	folders_to_create.append(archive_output)

	for bucket in buckets:
		folders_to_create.append(os.path.join(ouput_folder, bucket))
		folders_to_create.append(os.path.join(archive_output, bucket))

	for folder_to_create in folders_to_create:
		if (not os.path.exists(folder_to_create)):
			os.makedirs(folder_to_create)

	logger.info("output to %s"%(ouput_folder))
	logger.info("archived to %s"%(archive_output))

	this_node_id = node_id()
	node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
	node_queue.clear()

	scheduler = Scheduler(this_node_id, config=config, proxies=proxies)

	logger.info('starting node_id: %s'%this_node_id)

	node_coordinator = NodeCoordinator(config['redis_config'])
	#node_coordinator.clear()
	
	#the main event loop; strictly we don't need one, since we could just join on the crawlers and not stop until a terminate command is issued to each crawler,
	#but we need one to report the status of each crawler and perform the tarball tasks...
	
	last_archive_ts = time.time() + 3600 # the first archive event starts 2 hrs later... 
	pre_time = time.time()
	last_load_balancing_task_ts = time.time()
	while True:
		
		if (time.time() - pre_time > 120):
			logger.info(pprint.pformat(scheduler.crawler_status()))
			pre_time = time.time()
			if (scheduler.is_alive()):
				cmd = {'cmd': 'CRAWLER_FLUSH'}
				scheduler.enqueue(cmd)

		if (time.time() - last_archive_ts > 3600):

			logger.info("start archive procedure...")
			with concurrent.futures.ProcessPoolExecutor(max_workers=len(buckets)) as executor:

				future_proxies = {executor.submit(tarball_results, ouput_folder, bucket, archive_output, int(time.time()) - 3600): bucket for bucket in buckets}
		
				for future in future_proxies:
					future.add_done_callback(lambda f: logger.info("archive created? %s: [%s]"%f.result()))

			last_archive_ts = time.time()

		# block, the main process...for a command
		if(not scheduler.is_alive()):
			logger.info("no crawler is alive... waiting to recreate all crawlers...")
			time.sleep(120) # sleep for two minutes and retry
			continue

		if (time.time() - last_load_balancing_task_ts > 1800): # try to balance the local queues every 30 mins
			last_load_balancing_task_ts = time.time()
			cmd = {'cmd': 'BALANCING_LOAD'}
			scheduler.enqueue(cmd)

		cmd = node_queue.get(block=True, timeout=360)

		if cmd:
			scheduler.enqueue(cmd)
class UserRelationshipCrawler(CrawlerProcess):

	def __init__(self, node_id, crawler_id, apikeys, handlers, redis_config, proxies=None):
		if (handlers == None):
			raise MissingArgs("you need a handler to write the data to...")

		super(UserRelationshipCrawler, self).__init__(node_id, crawler_id, redis_config, handlers)

		self.apikeys = copy.copy(apikeys)
		self.tasks = {
			"TERMINATE": "TERMINATE", 
			"CRAWL_FRIENDS" : {
				"users": "find_all_friends",
				"ids": "find_all_friend_ids",
				"network_type": "friends"
			},
			"CRAWL_FOLLOWERS" :{
				"users": "find_all_followers",
				"ids": "find_all_follower_ids",
				"network_type": "followers"
			}, 
			"CRAWL_USER_TIMELINE": "fetch_user_timeline",
			"CRAWL_TWEET": "fetch_tweet_by_id"
		}
		self.node_queue = NodeQueue(self.node_id, redis_config=redis_config)
		self.client_args = {"timeout": 300}
		self.proxies = iter(proxies) if proxies else None
		self.user_api = None

		self.init_user_api()

		#self.init_user_api()

	def init_user_api(self): # this will throw StopIteration if all proxies have been tried...
		if (self.proxies):
			try:
				self.client_args['proxies'] = next(self.proxies)['proxy_dict'] # this will throw StopIteration once all proxies are tried
				#logger.info("client_args: %s"%json.dumps(self.client_args))
			except StopIteration as exc:
				raise
			except Exception as exc:
				self.init_user_api()

		if (self.user_api):
			del self.user_api

		#crawler_id=self.crawler_id, 
		self.user_api = User(apikeys=self.apikeys, client_args=self.client_args)


	def get_handlers(self):
		return self.handlers

	def avaliable_cmds(self):
		return self.tasks.keys()

	def run(self):
		while True:
			# cmd is in json format
			# cmd = {
			#	network_type: "followers", # or friends
			#	user_id: id,
			#	data_type: 'ids' # users
			#}
			cmd = self.get_cmd()

			command = cmd['cmd']

			logger.debug("new cmd: %s"%(cmd))

			redis_cmd_handler = None

			#maybe changing this to a dispatch map would be less expressive but easier to read... there aren't too many cases here yet, though...
			if (command == 'TERMINATE'):
				# make sure we flush all existing data in the handlers..
				for handler in self.handlers:
				 	handler.flush_all()
				break
			elif (command == 'CRAWLER_FLUSH'):
				for handler in self.handlers:
				 	handler.flush_all()
			else:

				args = {}
				if (command == 'CRAWL_TWEET'):
					args = {
						"tweet_id": cmd['tweet_id'],
						"write_to_handlers": self.handlers,
						"cmd_handlers" : []
					}
				else:
					args = {
						"user_id": cmd['user_id'],
						"write_to_handlers": self.handlers,
						"cmd_handlers" : []
					}

				bucket = cmd["bucket"] if "bucket" in cmd else None

				if (bucket):
					args["bucket"] = bucket
				
				func = None
				if  (command in ['CRAWL_USER_TIMELINE', 'CRAWL_TWEET']):
					func = getattr(self.user_api, self.tasks[command])
				elif (command in ['CRAWL_FRIENDS', 'CRAWL_FOLLOWERS']):
					data_type = cmd['data_type']
					
					try:
						depth = cmd["depth"] if "depth" in cmd else None
						depth = int(depth)
						# for handler in self.handlers:
						# 	if isinstance(handler, InMemoryHandler):
						# 		inmemory_handler = handler
						if (depth > 1):
							template = copy.copy(cmd)
							# template = {
							#	network_type: "followers", # or friends
							#	user_id: id,
							#	data_type: 'ids' # object
							#	depth: depth
							#}
							# will throw out exception if redis_config doesn't exist...
							args["cmd_handlers"].append(CrawlUserRelationshipCommandHandler(template=template, redis_config=self.redis_config))

							logger.info("depth: %d, # of cmd_handlers: %d"%(depth, len(args['cmd_handlers'])))

					except Exception as exc:
						logger.warn(exc)
					
					func = getattr(self.user_api, self.tasks[command][data_type])
				
				if func:
					try:
						func(**args)
						del args['cmd_handlers']						
					except Exception as exc:
						logger.error("%s"%exc)
						try:
							self.init_user_api()
						except StopIteration as init_user_api_exc:
							# import exceptions
							# if (isinstance(init_user_api_exc, exceptions.StopIteration)): # no more proxy to try... so kill myself...
							for handler in self.handlers:
								handler.flush_all()

							logger.warn('not enough proxy servers, kill me... %s'%(self.crawler_id))
							# flush first
							self.node_queue.put({
								'cmd':'CRAWLER_FAILED',
								'crawler_id': self.crawler_id
								})
							del self.node_queue
							return False
							#raise
						else:
							#put current task back to queue...
							logger.info('pushing current task back to the queue: %s'%(json.dumps(cmd)))
							self.enqueue(cmd)

						#logger.error(full_stack())
						
				else:
					logger.warn("whatever are you trying to do?")

		logger.info("looks like i'm done...")
			
		return True
Example #13
def cmd(config, args):

    if (args.command not in avaliable_cmds):
        raise Exception("not a valid command...")

    nid = args.node_id

    logger.info("node_id: %s" % (nid))
    node_queue = NodeQueue(nid, redis_config=config['redis_config'])
    node_coordinator = NodeCoordinator(config['redis_config'])
    # this can be done locally without sending the command to the servers...
    if (args.command == 'GET_UIDS_FROM_SCREEN_NAMES'):
        apikeys = config["apikeys"].values()[0]
        if (not os.path.exists(args.json)):
            raise Exception("doesn't exist... ")
        with open(os.path.abspath(args.json),
                  'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            screen_names = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            user_ids = twitter_api.get_user_ids_by_screen_names(screen_names)
            json.dump(list(user_ids), o_f)
    elif (args.command == 'GET_USERS_FROM_IDS'):
        apikeys = config["apikeys"].values()[0]
        if (not os.path.exists(args.json)):
            raise Exception("doesn't exist... ")
        with open(os.path.abspath(args.json),
                  'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            user_ids = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            users = twitter_api.get_users(user_ids)
            json.dump(list(users), o_f)
    elif (args.command.startswith('BATCH_')):
        new_command = args.command.replace('BATCH_', '')
        args_dict = copy.copy(args.__dict__)
        if (not os.path.exists(args.json)):
            raise Exception("doesn't exist... ")
        with open(os.path.abspath(args.json), 'rb') as f:
            if (args.command == 'BATCH_CRAWL_TWEET'):
                tweet_ids = json.load(f)
                for tweet_id in tweet_ids:
                    print "Loading Tweet ID: ", tweet_id
                    args_dict['tweet_id'] = tweet_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
            else:
                user_ids = json.load(f)
                for user_id in user_ids:
                    args_dict['user_id'] = user_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
    elif (args.command == 'LIST_NODES'):
        pp.pprint(node_coordinator.list_nodes())
    elif (args.command == 'NODE_QSIZES'):
        raise NotImplementedError("Not implemented yet...")
        #pp.pprint(node_coordinator.list_nodes())
    elif (args.command == 'SHUTDOWN_NODE'):
        #node_coordinator.remove_node(nid)
        #pp.pprint(node_coordinator.list_nodes())
        raise NotImplementedError("Not implemented yet...")
    elif (args.command == 'CLEAR_NODE_QUEUES'):
        node_queue.clear_all_queues()
    else:
        args_dict = copy.copy(args.__dict__)
        cmd = new_cmd(args.command, args_dict)
        node_queue.put(cmd)
        logger.info('sent [%s]' % (cmd))
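
The BATCH_CRAWL_TWEET branch above expects args.json to point at a plain JSON list of tweet ids (the other BATCH_ commands expect a list of user ids); below is a small sketch of preparing such an input file, using placeholder ids.

import json

# Hypothetical input file for BATCH_CRAWL_TWEET: one CRAWL_TWEET command is
# emitted per id in this list.
with open('tweet_ids.json', 'w') as f:
    json.dump([1234567890, 1234567891], f)   # placeholder tweet ids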