Example #1
	def test_client(self):
		nid = node_id()
		logger.info("sending to %s"%(nid))
		node_queue = NodeQueue(nid, redis_config=self.config['redis_config'])
		#redis_cmd_queue.clear()

		cmd = {
			"cmd": "CRAWL_FRIENDS",
			"user_id": 1948122342,
			"data_type": "ids",
			"depth": 2,
			"bucket":"friend_ids"
		}

		# cmd = {
		# 	"cmd": "CRAWL_USER_TIMELINE",
		# 	"user_id": 1948122342,#53039176,
		# 	"bucket": "timelines"
		# }

		node_queue.put(cmd)

		#cmd = {"cmd":"TERMINATE"}
		
		#node_queue.put(cmd)

		return True
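
The commented-out lines above hint at how a node is shut down: a TERMINATE command is enqueued exactly like a crawl command. A minimal sketch, assuming a plain redis connection config (NodeQueue and node_id are the project's own helpers; the redis_config shape is an assumption):

# Sketch only: stop a node by enqueueing the TERMINATE command that the
# crawlers' run() loops watch for (see the crawler classes further down).
redis_config = {"host": "localhost", "port": 6379, "db": 0}  # assumed shape
node_queue = NodeQueue(node_id(), redis_config=redis_config)
node_queue.put({"cmd": "TERMINATE"})  # each crawler flushes its handlers, then exits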
Example #3
def cmd(config, args):
	
	if (args.command not in avaliable_cmds):
		raise Exception("not a valid command...")

	nid = args.node_id
	
	logger.info("node_id: %s"%(nid))
	node_queue = NodeQueue(nid, redis_config=config['redis_config'])
	node_coordinator = NodeCoordinator(config['redis_config'])
	# this can be done locally without sending the command to the servers...
	if (args.command == 'GET_UIDS_FROM_SCREEN_NAMES'):
		apikeys = config["apikeys"].values()[0]
		if (not os.path.exists(args.json)):
			raise Exception("%s doesn't exist..." % args.json)
		with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
			screen_names = json.load(f)
			user_api = User(apikeys=apikeys)
			user_ids = user_api.get_user_ids_by_screen_names(screen_names)
			json.dump(list(user_ids), o_f)
	elif (args.command == 'GET_USERS_FROM_IDS'):
		apikeys = config["apikeys"].values()[0]
		if (not os.path.exists(args.json)):
			raise Exception("%s doesn't exist..." % args.json)
		with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
			user_ids = json.load(f)
			user_api = User(apikeys=apikeys)
			users = user_api.get_users(user_ids)
			json.dump(list(users), o_f)
	elif (args.command.startswith('BATCH_')):
		command = args.command.replace('BATCH_', '')
		args_dict = copy.copy(args.__dict__)
		if (not os.path.exists(args.json)):
			raise Exception("%s doesn't exist..." % args.json)
		with open(os.path.abspath(args.json), 'rb') as f:
			user_ids = json.load(f)
			for user_id in user_ids:
				args_dict['user_id'] = user_id
				cmd = new_cmd(command, args_dict)
				node_queue.put(cmd)
	elif (args.command == 'LIST_NODES'):
		pp.pprint(node_coordinator.list_nodes())
	elif (args.command == 'NODE_QSIZES'):
		raise NotImplementedError("Not implemented yet...")
		#pp.pprint(node_coordinator.list_nodes())
	elif (args.command == 'SHUTDOWN_NODE'):
		#node_coordinator.remove_node(nid)
		#pp.pprint(node_coordinator.list_nodes())
		raise NotImplementedError("Not implemented yet...")
	elif (args.command == 'CLEAR_NODE_QUEUES'):
		node_queue.clear_all_queues()
	else:
		args_dict = copy.copy(args.__dict__)
		cmd = new_cmd(args.command, args_dict)
		node_queue.put(cmd)
		logger.info('sent [%s]' % cmd)
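
For orientation, cmd() expects an argparse-style namespace. A hedged sketch of a caller (the flag names are illustrative; the project's real CLI is not shown here):

# Sketch only: wiring cmd() to argparse. config is assumed to carry the
# redis_config and apikeys sections the code above reads.
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument('--command', required=True)
parser.add_argument('--node_id', required=True)
parser.add_argument('--json', help='input file of ids or screen names')
parser.add_argument('--output', help='output json file')
args = parser.parse_args()

with open('config.json') as cf:  # hypothetical config file
	config = json.load(cf)

cmd(config, args)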
def flush_cmd(bulk, data_type, template, redis_config):

	try:
		node_coordinator = NodeCoordinator(redis_config=redis_config)

		qsizes = node_coordinator.node_qsizes()

		logger.debug(qsizes)
		
		node_queues = {}

		for element in bulk:
			if data_type == "ids" and type(element) == int:
				user_id = element
			elif data_type == "users" and type(element) == dict and "id" in element:
				user_id = element['id']
			else:
				continue  # skip malformed elements; otherwise user_id would be unbound or stale from a previous iteration
			
			t = copy.copy(template)
			t["user_id"] = int(user_id)
			t["depth"] = int(t["depth"]) - 1

			node_id = get_keys_by_min_value(qsizes)[0]

			if (node_id in node_queues):
				node_queue = node_queues[node_id]
			else:
				node_queue = NodeQueue(node_id, redis_config=redis_config)
				node_queues[node_id] = node_queue


			t['cmd_hash'] = hash_cmd(t)
			node_queue.put(t)
			qsizes[node_id] += 1

			logger.debug("send [%s] to node: %s"%(json.dumps(t),node_id))

		# intend to close all redis connections, but not sure yet...
		node_queues.clear()

		del node_coordinator

			
	except Exception as exc:
		logger.error('error during flush: %s' % exc)

	return True
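
A hedged usage sketch: flushing a batch of ids discovered at depth 2 produces depth-1 child commands, routed to whichever nodes currently have the shortest queues (the redis_config shape is an assumption; the ids are the ones used in the examples above):

# Sketch only: fan discovered ids back out as new crawl commands.
# flush_cmd() copies template per user_id and decrements "depth",
# so this depth=2 template yields depth=1 children.
redis_config = {"host": "localhost", "port": 6379, "db": 0}  # assumed shape
template = {"cmd": "CRAWL_FRIENDS", "data_type": "ids", "depth": 2, "bucket": "friend_ids"}
flush_cmd([53039176, 1948122342], "ids", template, redis_config)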
class TwitterCrawler(CrawlerProcess):
    def __init__(self,
                 node_id,
                 crawler_id,
                 apikeys,
                 handlers,
                 redis_config,
                 proxies=None):
        if (handlers is None):
            raise MissingArgs("you need a handler to write the data to...")

        super(TwitterCrawler, self).__init__(node_id, crawler_id, redis_config,
                                             handlers)

        self.apikeys = copy.copy(apikeys)
        self.tasks = {
            "TERMINATE": "TERMINATE",
            "CRAWL_FRIENDS": {
                "users": "find_all_friends",
                "ids": "find_all_friend_ids",
                "network_type": "friends"
            },
            "CRAWL_FOLLOWERS": {
                "users": "find_all_followers",
                "ids": "find_all_follower_ids",
                "network_type": "followers"
            },
            "CRAWL_USER_TIMELINE": "fetch_user_timeline",
            "CRAWL_TWEET": "fetch_tweet_by_id",
            "SEARCH": "search_by_query"
        }
        self.node_queue = NodeQueue(self.node_id, redis_config=redis_config)
        self.client_args = {"timeout": 300}
        self.proxies = iter(proxies) if proxies else None
        self.twitter_api = None

        self.init_twitter_api()

    def init_twitter_api(
        self
    ):  # this will throw StopIteration if all proxies have been tried...
        if (self.proxies):
            try:
                self.client_args['proxies'] = next(
                    self.proxies)['proxy_dict']  # raises StopIteration when exhausted
                # logger.info("client_args: %s" % json.dumps(self.client_args))
            except StopIteration as exc:
                raise
            except Exception as exc:
                self.init_twitter_api()

        if (self.twitter_api):
            del self.twitter_api

        # crawler_id=self.crawler_id,
        self.twitter_api = TwitterAPI(apikeys=self.apikeys,
                                      client_args=self.client_args)

    def get_handlers(self):
        return self.handlers

    def avaliable_cmds(self):
        return self.tasks.keys()

    def run(self):
        while True:
            # cmd is in json format
            # cmd = {
            #	network_type: "followers", # or friends
            #	user_id: id,
            #	data_type: 'ids' # users
            # }
            cmd = self.get_cmd()

            command = cmd['cmd']

            logger.debug("new cmd: %s" % (cmd))

            redis_cmd_handler = None

            # maybe changing this to a dispatch map would be less expressive but easier to read... well, not too many cases here yet...
            if (command == 'TERMINATE'):
                # make sure we need to flush all existing data in the handlers..
                for handler in self.handlers:
                    handler.flush_all()
                break
            elif (command == 'CRAWLER_FLUSH'):
                for handler in self.handlers:
                    handler.flush_all()
            else:

                # figure out args first...
                args = {}
                if (command == 'CRAWL_TWEET'):
                    args = {
                        "tweet_id": cmd['tweet_id'],
                        "write_to_handlers": self.handlers,
                        "cmd_handlers": []
                    }
                elif (command == 'SEARCH'):
                    args = {
                        "write_to_handlers": self.handlers,
                        "cmd_handlers": []
                    }
                else:
                    args = {
                        "user_id": cmd['user_id'],
                        "write_to_handlers": self.handlers,
                        "cmd_handlers": []
                    }

                bucket = cmd["bucket"] if "bucket" in cmd else None

                if (bucket):
                    args["bucket"] = bucket

                func = None
                if (command in ['CRAWL_USER_TIMELINE', 'CRAWL_TWEET']):
                    func = getattr(self.twitter_api, self.tasks[command])
                elif (command in ['SEARCH']):

                    if "lang" in cmd:
                        args['lang'] = cmd['lang']

                    if "geocode" in cmd:
                        args['geocode'] = cmd['geocode']

                    if "key" in cmd:
                        args['key'] = cmd['key']

                    # logger.info("new cmd: %s"%(cmd))
                    # q is required, otherwise let it fail...
                    if "query" in cmd:
                        args['query'] = cmd['query']
                        func = getattr(self.twitter_api, self.tasks[command])

                elif (command in ['CRAWL_FRIENDS', 'CRAWL_FOLLOWERS']):
                    data_type = cmd['data_type']

                    try:
                        depth = cmd["depth"] if "depth" in cmd else None
                        depth = int(depth)
                        # for handler in self.handlers:
                        # 	if isinstance(handler, InMemoryHandler):
                        # 		inmemory_handler = handler
                        if (depth > 1):
                            template = copy.copy(cmd)
                            # template = {
                            #	network_type: "followers", # or friends
                            #	user_id: id,
                            #	data_type: 'ids' # object
                            #	depth: depth
                            # }
                            # will throw out exception if redis_config doesn't exist...
                            args["cmd_handlers"].append(
                                CrawlUserRelationshipCommandHandler(
                                    template=template,
                                    redis_config=self.redis_config))

                            logger.info("depth: %d, # of cmd_handlers: %d" %
                                        (depth, len(args['cmd_handlers'])))

                    except Exception as exc:
                        logger.warn(exc)

                    func = getattr(self.twitter_api,
                                   self.tasks[command][data_type])

                if func:
                    try:
                        # logger.info(args)
                        func(**args)
                        del args['cmd_handlers']
                        for handler in self.handlers:
                            handler.flush_all()
                    except Exception as exc:
                        logger.error("%s" % exc)
                        try:
                            self.init_twitter_api()
                        except StopIteration as init_twitter_api_exc:
                            # import exceptions
                            # if (isinstance(init_user_api_exc, exceptions.StopIteration)): # no more proxy to try... so kill myself...
                            for handler in self.handlers:
                                handler.flush_all()

                            logger.warn(
                                'not enough proxy servers, kill me... %s' %
                                (self.crawler_id))
                            # flush first
                            self.node_queue.put({
                                'cmd': 'CRAWLER_FAILED',
                                'crawler_id': self.crawler_id
                            })
                            del self.node_queue
                            return False
                        # raise
                        else:
                            # put current task back to queue...
                            logger.info(
                                'pushing current task back to the queue: %s' %
                                (json.dumps(cmd)))
                            self.enqueue(cmd)

                            # logger.error(full_stack())

                else:
                    logger.warn("whatever are you trying to do?")

        logger.info("looks like i'm done...")

        return True
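
A hedged sketch of starting one crawler (CrawlerProcess itself is not shown here; run() suggests it behaves like a multiprocessing.Process, and the handler and apikeys shapes below are assumptions):

# Sketch only: spin up one TwitterCrawler and hand it work. The handler
# object is a placeholder for one of the project's own handler types.
apikeys = {"app_key": "...", "app_secret": "...",
           "oauth_token": "...", "oauth_token_secret": "..."}  # assumed shape
redis_config = {"host": "localhost", "port": 6379, "db": 0}    # assumed shape
handlers = [my_handler]  # hypothetical object exposing flush_all()

crawler = TwitterCrawler(node_id(), "crawler-1", apikeys, handlers, redis_config)
crawler.start()  # assumes CrawlerProcess subclasses multiprocessing.Process

# How commands reach get_cmd() is not shown above; presumably via the node queue:
NodeQueue(node_id(), redis_config=redis_config).put(
    {"cmd": "CRAWL_USER_TIMELINE", "user_id": 53039176, "bucket": "timelines"})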
class UserRelationshipCrawler(CrawlerProcess):

	def __init__(self, node_id, crawler_id, apikeys, handlers, redis_config, proxies=None):
		if (handlers is None):
			raise MissingArgs("you need a handler to write the data to...")

		super(UserRelationshipCrawler, self).__init__(node_id, crawler_id, redis_config, handlers)

		self.apikeys = copy.copy(apikeys)
		self.tasks = {
			"TERMINATE": "TERMINATE", 
			"CRAWL_FRIENDS" : {
				"users": "find_all_friends",
				"ids": "find_all_friend_ids",
				"network_type": "friends"
			},
			"CRAWL_FOLLOWERS" :{
				"users": "find_all_followers",
				"ids": "find_all_follower_ids",
				"network_type": "followers"
			}, 
			"CRAWL_USER_TIMELINE": "fetch_user_timeline",
			"CRAWL_TWEET": "fetch_tweet_by_id"
		}
		self.node_queue = NodeQueue(self.node_id, redis_config=redis_config)
		self.client_args = {"timeout": 300}
		self.proxies = iter(proxies) if proxies else None
		self.user_api = None

		self.init_user_api()

		#self.init_user_api()

	def init_user_api(self): # this will throw StopIteration if all proxies have been tried...
		if (self.proxies):
			try:
				self.client_args['proxies'] = next(self.proxies)['proxy_dict'] # raises StopIteration when exhausted
				#logger.info("client_args: %s" % json.dumps(self.client_args))
			except StopIteration as exc:
				raise
			except Exception as exc:
				self.init_user_api()

		if (self.user_api):
			del self.user_api

		#crawler_id=self.crawler_id, 
		self.user_api = User(apikeys=self.apikeys, client_args=self.client_args)


	def get_handlers(self):
		return self.handlers

	def avaliable_cmds(self):
		return self.tasks.keys()

	def run(self):
		while True:
			# cmd is in json format
			# cmd = {
			#	network_type: "followers", # or friends
			#	user_id: id,
			#	data_type: 'ids' # users
			#}
			cmd = self.get_cmd()

			command = cmd['cmd']

			logger.debug("new cmd: %s"%(cmd))

			redis_cmd_handler = None

			# maybe changing this to a dispatch map would be less expressive but easier to read... well, not too many cases here yet...
			if (command == 'TERMINATE'):
				# make sure we need to flush all existing data in the handlers..
				for handler in self.handlers:
					handler.flush_all()
				break
			elif (command == 'CRAWLER_FLUSH'):
				for handler in self.handlers:
					handler.flush_all()
			else:

				args = {}
				if (command == 'CRAWL_TWEET'):
					args = {
						"tweet_id": cmd['tweet_id'],
						"write_to_handlers": self.handlers,
						"cmd_handlers" : []
					}
				else:
					args = {
						"user_id": cmd['user_id'],
						"write_to_handlers": self.handlers,
						"cmd_handlers" : []
					}

				bucket = cmd["bucket"] if "bucket" in cmd else None

				if (bucket):
					args["bucket"] = bucket
				
				func = None
				if  (command in ['CRAWL_USER_TIMELINE', 'CRAWL_TWEET']):
					func = getattr(self.user_api, self.tasks[command])
				elif (command in ['CRAWL_FRIENDS', 'CRAWL_FOLLOWERS']):
					data_type = cmd['data_type']
					
					try:
						depth = cmd["depth"] if "depth" in cmd else None
						depth = int(depth)
						# for handler in self.handlers:
						# 	if isinstance(handler, InMemoryHandler):
						# 		inmemory_handler = handler
						if (depth > 1):
							template = copy.copy(cmd)
							# template = {
							#	network_type: "followers", # or friends
							#	user_id: id,
							#	data_type: 'ids' # object
							#	depth: depth
							#}
							# will throw out exception if redis_config doesn't exist...
							args["cmd_handlers"].append(CrawlUserRelationshipCommandHandler(template=template, redis_config=self.redis_config))

							logger.info("depth: %d, # of cmd_handlers: %d"%(depth, len(args['cmd_handlers'])))

					except Exception as exc:
						logger.warn(exc)
					
					func = getattr(self.user_api, self.tasks[command][data_type])
				
				if func:
					try:
						func(**args)
						del args['cmd_handlers']						
					except Exception as exc:
						logger.error("%s"%exc)
						try:
							self.init_user_api()
						except StopIteration as init_user_api_exc:
							# import exceptions
							# if (isinstance(init_user_api_exc, exceptions.StopIteration)): # no more proxy to try... so kill myself...
							for handler in self.handlers:
								handler.flush_all()

							logger.warn('not enough proxy servers, kill me... %s' % (self.crawler_id))
							# flush first
							self.node_queue.put({
								'cmd': 'CRAWLER_FAILED',
								'crawler_id': self.crawler_id
							})
							del self.node_queue
							return False
							#raise
						else:
							#put current task back to queue...
							logger.info('pushing current task back to the queue: %s' % json.dumps(cmd))
							self.enqueue(cmd)

						#logger.error(full_stack())
						
				else:
					logger.warn("whatever are you trying to do?")

		logger.info("looks like i'm done...")
			
		return True
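
A hedged usage sketch tying this class back to flush_cmd() above: a depth-2 CRAWL_FRIENDS command makes the crawler attach a CrawlUserRelationshipCommandHandler built from the command itself, and flush_cmd()'s template handling then re-enqueues each discovered id as a depth-1 child command (redis_config shape assumed):

# Sketch only: a depth-2 relationship crawl; the cmd dict doubles as the
# template that CrawlUserRelationshipCommandHandler passes to flush_cmd().
redis_config = {"host": "localhost", "port": 6379, "db": 0}  # assumed shape
NodeQueue(node_id(), redis_config=redis_config).put({
	"cmd": "CRAWL_FRIENDS",
	"user_id": 1948122342,
	"data_type": "ids",
	"depth": 2,
	"bucket": "friend_ids"
})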
Example #8
def cmd(config, args):

    if (args.command not in avaliable_cmds):
        raise Exception("not a valid command...")

    nid = args.node_id

    logger.info("node_id: %s" % (nid))
    node_queue = NodeQueue(nid, redis_config=config['redis_config'])
    node_coordinator = NodeCoordinator(config['redis_config'])
    # this can be done locally without sending the command to the servers...
    if (args.command == 'GET_UIDS_FROM_SCREEN_NAMES'):
        apikeys = config["apikeys"].values()[0]
        if (not os.path.exists(args.json)):
            raise Exception("%s doesn't exist..." % args.json)
        with open(os.path.abspath(args.json),
                  'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            screen_names = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            user_ids = twitter_api.get_user_ids_by_screen_names(screen_names)
            json.dump(list(user_ids), o_f)
    elif (args.command == 'GET_USERS_FROM_IDS'):
        apikeys = config["apikeys"].values()[0]
        if (not os.path.exists(args.json)):
            raise Exception("%s doesn't exist..." % args.json)
        with open(os.path.abspath(args.json),
                  'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            user_ids = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            users = twitter_api.get_users(user_ids)
            json.dump(list(users), o_f)
    elif (args.command.startswith('BATCH_')):
        new_command = args.command.replace('BATCH_', '')
        args_dict = copy.copy(args.__dict__)
        if (not os.path.exists(args.json)):
            raise Exception("%s doesn't exist..." % args.json)
        with open(os.path.abspath(args.json), 'rb') as f:
            if (args.command == 'BATCH_CRAWL_TWEET'):
                tweet_ids = json.load(f)
                for tweet_id in tweet_ids:
                    print "Loading Tweet ID: ", tweet_id
                    args_dict['tweet_id'] = tweet_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
            else:
                user_ids = json.load(f)
                for user_id in user_ids:
                    args_dict['user_id'] = user_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
    elif (args.command == 'LIST_NODES'):
        pp.pprint(node_coordinator.list_nodes())
    elif (args.command == 'NODE_QSIZES'):
        raise NotImplementedError("Not implemented yet...")
        #pp.pprint(node_coordinator.list_nodes())
    elif (args.command == 'SHUTDOWN_NODE'):
        #node_coordinator.remove_node(nid)
        #pp.pprint(node_coordinator.list_nodes())
        raise NotImplementedError("Not implemented yet...")
    elif (args.command == 'CLEAR_NODE_QUEUES'):
        node_queue.clear_all_queues()
    else:
        args_dict = copy.copy(args.__dict__)
        cmd = new_cmd(args.command, args_dict)
        node_queue.put(cmd)
        logger.info('sent [%s]' % (cmd))
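
Finally, a hedged sketch of driving the BATCH_ path (file name, flag values, and tweet ids are illustrative):

# Sketch only: BATCH_CRAWL_TWEET strips the BATCH_ prefix and enqueues
# one CRAWL_TWEET command per tweet id found in the input file.
import json

with open('tweet_ids.json', 'w') as f:  # hypothetical input file
    json.dump([1125635029, 1125635030], f)

args.command = 'BATCH_CRAWL_TWEET'  # reusing the argparse namespace sketched earlier
args.json = 'tweet_ids.json'
cmd(config, args)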