Example 1
    def test_client(self):
        nid = node_id()
        logger.info("sending to %s" % (nid))
        node_queue = NodeQueue(nid, redis_config=self.config['redis_config'])
        #redis_cmd_queue.clear()

        cmd = {
            "cmd": "CRAWL_FRIENDS",
            "user_id": 1948122342,
            "data_type": "ids",
            "depth": 2,
            "bucket": "friend_ids"
        }

        # cmd = {
        # 	"cmd": "CRAWL_USER_TIMELINE",
        # 	"user_id": 1948122342,#53039176,
        # 	"bucket": "timelines"
        # }

        node_queue.put(cmd)

        #cmd = {"cmd":"TERMINATE"}

        #node_queue.put(cmd)

        return True
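
The command put on the queue above is what a crawler node later pops and dispatches. A minimal consumer sketch, assuming only the NodeQueue.get(block=..., timeout=...) call already used in start_server further below; handle_crawl_friends is a hypothetical stand-in for the real handler:

def handle_crawl_friends(user_id, data_type, depth, bucket):
    # hypothetical stand-in; the real crawler resolves this to
    # find_all_friends / find_all_friend_ids via its tasks table
    print("crawl friends of %s (%s, depth=%s) -> %s" % (user_id, data_type, depth, bucket))

def consume_one(node_queue):
    # pop one command (same get() signature as in start_server) and dispatch it
    cmd = node_queue.get(block=True, timeout=60)
    if cmd and cmd.get("cmd") == "CRAWL_FRIENDS":
        handle_crawl_friends(cmd["user_id"], cmd["data_type"], cmd["depth"], cmd["bucket"])
    return cmd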
Example 2
    def __init__(self,
                 node_id,
                 crawler_id,
                 apikeys,
                 handlers,
                 redis_config,
                 proxies=None):
        if handlers is None:
            raise MissingArgs("you need a handler to write the data to...")

        super(UserRelationshipCrawler, self).__init__(node_id, crawler_id,
                                                      redis_config, handlers)

        self.apikeys = copy.copy(apikeys)
        self.tasks = {
            "TERMINATE": "TERMINATE",
            "CRAWL_FRIENDS": {
                "users": "find_all_friends",
                "ids": "find_all_friend_ids",
                "network_type": "friends"
            },
            "CRAWL_FOLLOWERS": {
                "users": "find_all_followers",
                "ids": "find_all_follower_ids",
                "network_type": "followers"
            },
            "CRAWL_USER_TIMELINE": "fetch_user_timeline",
            "CRAWL_TWEET": "fetch_tweet_by_id"
        }
        self.node_queue = NodeQueue(self.node_id, redis_config=redis_config)
        self.client_args = {"timeout": 300}
        self.proxies = iter(proxies) if proxies else None
        self.user_api = None

        self.init_user_api()
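
The self.tasks table above maps a command name either directly to a handler method name or, for the network crawls, to a sub-table keyed by data_type. A hedged sketch of how such a table can be resolved to a bound method; this dispatch helper is illustrative and not part of the original class:

def resolve_handler(crawler, tasks, cmd):
    # illustrative dispatch over a table shaped like self.tasks above;
    # "TERMINATE" maps to itself and is presumably handled before dispatch
    entry = tasks[cmd["cmd"]]
    if isinstance(entry, dict):
        # CRAWL_FRIENDS / CRAWL_FOLLOWERS pick a method by data_type ("users" or "ids")
        method_name = entry[cmd["data_type"]]
    else:
        method_name = entry
    return getattr(crawler, method_name)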
Example 3
    def test_redis_connections(self):
        # stress test: keep creating NodeQueue instances (one redis connection each)
        # and drop the references every 5 iterations
        nodes = {}

        cnt = 0
        while True:
            nodes[cnt] = NodeQueue("node_id", redis_config=self.config['redis_config'])
            cnt += 1
            if cnt % 5 == 0:
                nodes.clear()
            time.sleep(1)
Example 4
def flush_cmd(bulk, data_type, template, redis_config):

    try:
        node_coordinator = NodeCoordinator(redis_config=redis_config)

        qsizes = node_coordinator.node_qsizes()

        logger.debug(qsizes)

        node_queues = {}

        for element in bulk:
            if data_type == "ids" and isinstance(element, int):
                user_id = element
            elif data_type == "users" and isinstance(element, dict) and "id" in element:
                user_id = element['id']
            else:
                # skip elements that don't carry a usable user id
                continue

            t = copy.copy(template)
            t["user_id"] = int(user_id)
            t["depth"] = int(t["depth"]) - 1

            node_id = get_keys_by_min_value(qsizes)[0]

            if (node_id in node_queues):
                node_queue = node_queues[node_id]
            else:
                node_queue = NodeQueue(node_id, redis_config=redis_config)
                node_queues[node_id] = node_queue

            t['cmd_hash'] = hash_cmd(t)
            node_queue.put(t)
            qsizes[node_id] += 1

            logger.debug("send [%s] to node: %s" % (json.dumps(t), node_id))

        # drop the NodeQueue references; intended to close all redis connections, but not verified yet...
        node_queues.clear()

        del node_coordinator

    except Exception as exc:
        logger.error('error during flush: %s' % exc)

    return True
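
flush_cmd relies on get_keys_by_min_value(qsizes) to pick the least-loaded node for each fanned-out command. A minimal sketch of what such a helper could look like; only its call shape (a dict of node_id -> queue size, first element of the result used) is taken from the code above, the implementation itself is an assumption:

def get_keys_by_min_value(qsizes):
    # hypothetical implementation: return the node ids whose queue size
    # equals the current minimum; flush_cmd takes the first of them
    if not qsizes:
        return []
    min_size = min(qsizes.values())
    return [node_id for node_id, size in qsizes.items() if size == min_size]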
Example 5
def start_server(config, proxies):

    check_config(config)
    config = copy.copy(config)

    folders_to_create = []
    buckets = [
        "tweets", "followers", "follower_ids", "friends", "friend_ids",
        "timelines"
    ]

    output_folder = os.path.abspath(config['output'])
    archive_output = os.path.abspath(
        config['archive_output']) if config['archive_output'] else output_folder
    archive_output = os.path.join(archive_output, 'archived')

    folders_to_create.append(output_folder)
    folders_to_create.append(archive_output)

    for bucket in buckets:
        folders_to_create.append(os.path.join(output_folder, bucket))
        folders_to_create.append(os.path.join(archive_output, bucket))

    for folder_to_create in folders_to_create:
        if not os.path.exists(folder_to_create):
            os.makedirs(folder_to_create)

    logger.info("output to %s" % ouput_folder)
    logger.info("archived to %s" % archive_output)

    this_node_id = node_id()
    node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
    node_queue.clear()

    scheduler = Scheduler(this_node_id, config=config, proxies=proxies)

    logger.info('starting node_id: %s' % this_node_id)

    node_coordinator = NodeCoordinator(config['redis_config'])
    # node_coordinator.clear()

    # the main event loop; strictly speaking we don't need one, since we could just
    # join on the crawlers and not stop until a terminate command is issued to each
    # crawler, but we need it to report the status of each crawler and perform the
    # tarball tasks...

    last_archive_ts = time.time() + 3600  # the first archive event starts 2 hrs later...
    pre_time = time.time()
    last_load_balancing_task_ts = time.time()
    while True:

        if time.time() - pre_time > 120:
            logger.info(pprint.pformat(scheduler.crawler_status()))
            pre_time = time.time()
            if scheduler.is_alive():
                cmd = {'cmd': 'CRAWLER_FLUSH'}
                scheduler.enqueue(cmd)

        if time.time() - last_archive_ts > 3600:

            logger.info("start archive procedure...")
            with concurrent.futures.ProcessPoolExecutor(
                    max_workers=len(buckets)) as executor:

                future_proxies = {
                    executor.submit(tarball_results, output_folder, bucket,
                                    archive_output,
                                    int(time.time()) - 3600): bucket
                    for bucket in buckets
                }

                for future in future_proxies:
                    future.add_done_callback(lambda f: logger.info(
                        "archive created? %s: [%s]" % f.result()))

            last_archive_ts = time.time()

        if not scheduler.is_alive():
            logger.info(
                "no crawler is alive... waiting to recreate all crawlers...")
            time.sleep(120)  # sleep for two minutes and retry
            continue

        # try to balance the local queues every 30 mins
        if time.time() - last_load_balancing_task_ts > 1800:
            last_load_balancing_task_ts = time.time()
            cmd = {'cmd': 'BALANCING_LOAD'}
            scheduler.enqueue(cmd)

        # block the main process until the next command arrives
        cmd = node_queue.get(block=True, timeout=360)

        if cmd:
            scheduler.enqueue(cmd)
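
For reference, the config keys that start_server reads directly are 'output', 'archive_output', and 'redis_config'. A hedged example of that shape; the redis connection sub-keys are typical settings and an assumption here, and check_config / the Scheduler may require further keys not shown:

example_config = {
    "output": "./data/output",
    "archive_output": "./data/archive",  # falls back to the output folder when falsy
    "redis_config": {                    # sub-keys are assumed, not taken from the code above
        "host": "localhost",
        "port": 6379,
        "db": 0,
        "password": None,
    },
}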
Example 6
def cmd(config, args):

    if args.command not in avaliable_cmds:
        raise Exception("%s is not a valid command..." % args.command)

    nid = args.node_id

    logger.info("node_id: %s" % (nid))
    node_queue = NodeQueue(nid, redis_config=config['redis_config'])
    node_coordinator = NodeCoordinator(config['redis_config'])
    # this can be done locally without sending the command to the servers...
    if (args.command == 'GET_UIDS_FROM_SCREEN_NAMES'):
        apikeys = config["apikeys"].values()[0]
        if not os.path.exists(args.json):
            raise Exception("%s doesn't exist..." % args.json)
        with open(os.path.abspath(args.json),
                  'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            screen_names = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            user_ids = twitter_api.get_user_ids_by_screen_names(screen_names)
            json.dump(list(user_ids), o_f)
    elif (args.command == 'GET_USERS_FROM_IDS'):
        apikeys = config["apikeys"].values()[0]
        if not os.path.exists(args.json):
            raise Exception("%s doesn't exist..." % args.json)
        with open(os.path.abspath(args.json),
                  'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            user_ids = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            users = twitter_api.get_users(user_ids)
            json.dump(list(users), o_f)
    elif (args.command.startswith('BATCH_')):
        new_command = args.command.replace('BATCH_', '')
        args_dict = copy.copy(args.__dict__)
        if not os.path.exists(args.json):
            raise Exception("%s doesn't exist..." % args.json)
        with open(os.path.abspath(args.json), 'rb') as f:
            if (args.command == 'BATCH_CRAWL_TWEET'):
                tweet_ids = json.load(f)
                for tweet_id in tweet_ids:
                    print "Loading Tweet ID: ", tweet_id
                    args_dict['tweet_id'] = tweet_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
            else:
                user_ids = json.load(f)
                for user_id in user_ids:
                    args_dict['user_id'] = user_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
    elif (args.command == 'LIST_NODES'):
        pp.pprint(node_coordinator.list_nodes())
    elif (args.command == 'NODE_QSIZES'):
        raise NotImplementedError("NotImplemented yet...")
        #pp.pprint(node_coordinator.list_nodes())
    elif (args.command == 'SHUTDOWN_NODE'):
        #node_coordinator.remove_node(nid)
        #pp.pprint(node_coordinator.list_nodes())
        raise NotImplementedError("NotImplemented yet...")
    elif (args.command == 'CLEAR_NODE_QUEUES'):
        node_queue.clear_all_queues()
    else:
        args_dict = copy.copy(args.__dict__)
        cmd = new_cmd(args.command, args_dict)
        node_queue.put(cmd)
        logger.info('sent [%s]' % (cmd))
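
In the default branch the parsed arguments are wrapped into a queue command by new_cmd and pushed onto this node's queue. A plausible sketch of such a helper, using only field names that appear in the commands above; the actual new_cmd in the project may differ:

def new_cmd(command, args_dict):
    # hypothetical helper: keep the command name plus whichever known
    # fields the caller supplied (field names taken from the commands above)
    cmd = {"cmd": command}
    for key in ("user_id", "tweet_id", "data_type", "depth", "bucket"):
        if args_dict.get(key) is not None:
            cmd[key] = args_dict[key]
    return cmd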