Ejemplo n.º 1
0
    def run(self):
        """Run the crawl loop for ``self.job``.

        Registers the job's access token and secret with the shared
        ``Connection``, calls ``initialize_job`` when ``job_exists`` reports
        the job is missing, then repeatedly asks ``job.crawling_strategy``
        for the next user and hands it to ``process_user``.

        The loop ends when the job collection holds at least
        ``job.user_limit`` documents or when ``self.exit`` is set.
        """
        job = self.job

        Connection.Instance().set_access_token_secret(job.access_token,
                                                      job.access_secret)

        if not Connection.Instance().job_exists(job):
            initialize_job(job)

        print("{} started !".format(job.name))

        collection_job = Connection.Instance().jobs_db[job.name]

        # NOTE(review): presumably a PyMongo collection; `count()` is
        # deprecated there in favour of `count_documents({})` -- confirm the
        # installed driver version before switching.
        current_number_of_users = collection_job.count()

        while (current_number_of_users < job.user_limit
               and not self.exit.is_set()):
            # The heartbeat intentionally omits the access token and secret:
            # credentials must not end up in stdout or captured logs.
            print("Heartbeat from job {}".format(job.name))

            next_user = job.crawling_strategy(collection_job)

            print("Fetching followers of {}...".format(
                next_user["screen_name"]))

            process_user(next_user, job, collection_job)

            # Re-read the size so the limit check sees what was just stored.
            current_number_of_users = collection_job.count()

        # Distinguish "limit reached" from "asked to stop": the loop can end
        # for either reason, and only the former means the job is finished.
        if current_number_of_users >= job.user_limit:
            print(
                "!!!JOB FINISHED!!!\nuser limit : {}, number of collected users : {}"
                .format(job.user_limit, current_number_of_users))
        else:
            print(
                "Job {} stopped before reaching its user limit\nuser limit : {}, number of collected users : {}"
                .format(job.name, job.user_limit, current_number_of_users))
Ejemplo n.º 2
0
def execute_job(job):
    """Crawl users for *job* until its collection reaches ``job.user_limit``.

    Registers the job's credentials with the shared ``Connection``, calls
    ``initialize_job`` if ``job_exists`` reports the job is missing, and then
    loops: pick the next user with ``job.crawling_strategy``, process it with
    ``process_user``, and re-count the collection. A summary line is printed
    once the loop condition becomes false.
    """
    Connection.Instance().set_access_token_secret(job.access_token,
                                                  job.access_secret)

    already_known = Connection.Instance().job_exists(job)
    if not already_known:
        initialize_job(job)

    started_message = "{} started !".format(job.name)
    print(started_message)

    users = Connection.Instance().jobs_db[job.name]

    collected = users.count()
    while collected < job.user_limit:
        candidate = job.crawling_strategy(users)

        screen_name = candidate["screen_name"]
        print("Fetching followers of {}...".format(screen_name))

        process_user(candidate, job, users)

        # Refresh the count so the loop condition reflects the latest state.
        collected = users.count()

    # The loop has no `break`, so this summary is always reached on normal exit.
    summary = "!!!JOB FINISHED!!!\nuser limit : {}, number of collected users : {}"
    print(summary.format(job.user_limit, collected))
Ejemplo n.º 3
0
def process_user(user, job, collection_job):
    """Fetch one page of followers for *user* and record it in the job collection.

    Args:
        user: Mapping read for the keys ``"screen_name"``, ``"last_cursor"``
            and ``"id"``.
        job: Object whose ``name`` is passed along to ``save_new_users``.
        collection_job: Collection holding the job's user documents; it is
            updated in place for the document whose ``id`` matches the user.

    When ``get_followers_page_and_next_cursor`` returns a falsy result the
    user is marked ``authorized: False`` and nothing else happens. Otherwise
    the page is enqueued for ``save_new_users`` and the user's document gets
    the new follower ids, the new cursor, and a ``finished`` flag that is
    true when the next cursor is ``0``.
    """
    result = get_followers_page_and_next_cursor(user["screen_name"],
                                                user["last_cursor"])

    if not result:
        print("...Account unauthorized, skipping")
        collection_job.update_one({"id": user["id"]},
                                  {"$set": {
                                      "authorized": False
                                  }})
        return

    page, next_cursor = result

    # Hand the page to a background worker; per the original comment,
    # `save_new_users` finds ids not yet in the database and fetches their
    # profiles.
    try:
        q = Queue("default", connection=Connection.Instance().redis_server)
        q.enqueue(save_new_users, args=(
            page,
            job.name,
        ))
    except ModuleNotFoundError as e:
        # Best effort: a failed enqueue is reported, and the follower ids are
        # still recorded below.
        print(e)

    # `update_one` matches the call used for the unauthorized case above and
    # replaces the legacy `update`, which likewise touched a single document.
    collection_job.update_one({"id": user["id"]}, {
        "$addToSet": {
            "follower_ids": {
                "$each": page
            }
        },
        "$set": {
            "finished": next_cursor == 0,
            "last_cursor": next_cursor
        }
    })
Ejemplo n.º 4
0
def initialize_job(job):
    """Seed the collection for *job* with its classified seed profiles.

    Fetches profiles for ``job.seed_list`` through
    ``get_user_profiles_single_request``, attaches to each profile a
    ``"features"`` dict mapping every classifier's ``__name__`` to its result
    for that profile, then creates a unique index on ``"id"`` in the job's
    collection and inserts all profiles.
    """
    print("Initializing job : {}".format(job.name))

    seed_profiles = get_user_profiles_single_request(job.seed_list)

    # Run every classifier over every profile; the results are attached only
    # after all classifiers have seen the profile without a "features" key.
    for seed_profile in seed_profiles:
        features = {}
        for classifier in job.classifiers:
            features[classifier.__name__] = classifier(seed_profile)
        seed_profile["features"] = features

    job_collection = Connection.Instance().jobs_db[job.name]

    # The unique index is created before the insert so it applies to the seeds.
    job_collection.create_index("id", unique=True)
    job_collection.insert_many(seed_profiles)