import json
import os

# Imports for the pre-2.0 google-cloud-scheduler client used below.
from google.cloud import scheduler_v1
from google.cloud.scheduler_v1 import CloudSchedulerClient
from google.cloud.scheduler_v1.types import AppEngineHttpTarget, Job, PubsubTarget


def clear_jobs(
    client: scheduler_v1.CloudSchedulerClient, project_id: str, location_id: str
) -> None:
    """Delete every scheduled job in the given project and location."""
    parent = client.location_path(project_id, location_id)
    for job in client.list_jobs(parent):
        client.delete_job(job.name)


def schedule_job(
    client: scheduler_v1.CloudSchedulerClient,
    project_id: str,
    location_id: str,
    timezone: str,
    schedule: str,
    path: str,
) -> None:
    """ Schedules the given job for the specified project and location """
    # Create a Job to schedule
    target = AppEngineHttpTarget(relative_uri=path, http_method="GET")
    job = Job(app_engine_http_target=target,
              schedule=schedule,
              time_zone=timezone)

    # Schedule the Job we just created
    parent = client.location_path(project_id, location_id)
    client.create_job(parent, job)


def create_source_cron_job(source_entity_id, datatype):
    # CREATE_CRONJOB is expected to hold a JSON boolean ("true"/"false").
    create_cron_job = json.loads(os.getenv("CREATE_CRONJOB", "false"))
    if create_cron_job is True:
        # Create the cron job.
        project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
        location_id = os.environ["LOCATION_ID"]
        dispatcher_topic = os.environ["TOPIC_DISPATCHER"]

        # SOURCE_ENTITY_ID and DATATYPE are message-key constants defined
        # elsewhere in this module.
        cron_job_message = {
            SOURCE_ENTITY_ID: source_entity_id,
            DATATYPE: datatype,
        }
        encoded_cron_job_message = json.dumps(cron_job_message).encode("utf-8")

        topic_name = f"projects/{project_id}/topics/{dispatcher_topic}"
        pubsub_target = PubsubTarget(
            topic_name=topic_name,
            data=encoded_cron_job_message,
        )

        location = f"projects/{project_id}/locations/{location_id}"
        # Pass the fields as keyword arguments; Job(dict(...)) fails with
        # the classic protobuf message constructor.
        source_scheduler_job = Job(
            name=f"{location}/jobs/{source_entity_id}",
            pubsub_target=pubsub_target,
            # At 01:00 on every 10th day of the month.
            schedule="0 1 */10 * *",
        )
        scheduler_client = CloudSchedulerClient()
        return scheduler_client.create_job(
            parent=location, job=source_scheduler_job)

    print(
        f"Received a message for source entity id: {source_entity_id} "
        "on staging or test env. Not creating a cron job."
    )
    return None
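
For context, a minimal sketch of how the two helpers at the top of this example might be driven together, e.g. to reset a project's schedule on deploy; the project id, region, cron expression, and /tasks/sync path are placeholders, not values from the original module:

if __name__ == "__main__":
    client = scheduler_v1.CloudSchedulerClient()
    # Placeholder identifiers; substitute your own project and region.
    clear_jobs(client, "my-project", "us-central1")
    schedule_job(
        client,
        project_id="my-project",
        location_id="us-central1",
        timezone="America/New_York",
        schedule="*/30 * * * *",  # every 30 minutes
        path="/tasks/sync",       # hypothetical App Engine handler
    )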
Example #4
"""
Utility wrappers around Google's Cloud Scheduler API. See
https://googleapis.dev/python/cloudscheduler/latest/_modules/google/cloud/scheduler_v1/gapic/cloud_scheduler_client.html
for API reference.
"""
from typing import Union, Any, Optional, List

from pydantic import BaseModel

from google.cloud.scheduler_v1 import CloudSchedulerClient
from cloud_tasks.conf import REGION, PROJECT_ID

client = CloudSchedulerClient()
parent = client.location_path(PROJECT_ID, REGION)


# Subclass Exception rather than BaseException so callers can catch these
# errors with ordinary handlers.
class JobRetrieveError(Exception):
    pass


class JobCreationError(Exception):
    pass


class JobUpdateError(Exception):
    pass


class JobDeleteError(Exception):
    pass
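
The excerpt ends with the error types and never shows the wrappers themselves. A minimal sketch of what one wrapper might look like, assuming each API call is paired with its error class (the get_job name and re-raise style are assumptions, not the module's actual code):

def get_job(job_id: str) -> Any:
    """Sketch only: fetch a job by its short id, wrapping API failures."""
    try:
        # `client` and `parent` are the module-level objects created above;
        # the f-string builds the fully qualified job name.
        return client.get_job(f"{parent}/jobs/{job_id}")
    except Exception as exc:
        raise JobRetrieveError(f"could not retrieve job {job_id!r}") from exc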
Example #5
# Third-party imports for the code below; `config` and the helper functions
# (get_all_courses, requests_connectionerror_bypass, requests_bandwith_bypass,
# check_free_tier_force_exit, get_curr_runtime, write_blobs_before_exit,
# schedule_next_try) are assumed to be defined elsewhere in this project.
import logging
import pickle
import re
from datetime import datetime, timedelta
from time import time

from firebase_admin import _apps, credentials, firestore, initialize_app
from google.api_core.exceptions import (GoogleAPICallError, NotFound,
                                        PermissionDenied)
from google.cloud.scheduler_v1 import CloudSchedulerClient
from google.cloud.storage import Client


def start_process():
    start_time = time()
    storage_client = Client()
    scheduler_client = CloudSchedulerClient()
    scheduler_path = scheduler_client.location_path(config.PROJECT_ID,
                                                    config.REGION_ID)
    cred = credentials.ApplicationDefault()

    # `except A or B` only evaluates to A; a tuple is needed to catch both.
    try:
        scheduler_client.delete_job(
            f"{scheduler_path}/jobs/{config.CRON_NAME}")
    except (GoogleAPICallError, PermissionDenied):
        logging.warning("course-collect manually triggered")

    try:
        scheduler_client.delete_job(f"{scheduler_path}/jobs/forcequit")
    except (GoogleAPICallError, PermissionDenied):
        logging.warning("forcequit job does not exist")

    if not _apps:
        initialize_app(cred, {"projectId": config.PROJECT_ID})
        logging.info("initializing firebase")

    firebase_db = firestore.client()

    if storage_client.bucket(config.BUCKET_NAME).exists():
        logging.info("reading from existing bucket")
        coursepickle_bucket = storage_client.bucket(config.BUCKET_NAME)
    else:
        logging.info("creating new bucket")
        coursepickle_bucket = storage_client.create_bucket(config.BUCKET_NAME)

    # Get unfinished course codes
    coursecode_blob = coursepickle_bucket.blob(config.COURSE_CODE_BLOB_NAME)
    try:
        coursecode_raw = coursecode_blob.download_as_string()
        unique_course_codes = pickle.loads(coursecode_raw)
    except NotFound:
        # Fetch course metadata per code for instructor, schedule, time, location, GPA, grade distributions
        all_courses = get_all_courses(firebase_db)
        unique_course_codes = {
            course["code"] for course in all_courses.values()
        }

    # Get existing course metadata
    coursepickle_blob = coursepickle_bucket.blob(
        config.COURSE_METADATA_BLOB_NAME)
    try:
        course_metadata_raw = coursepickle_blob.download_as_string()
        course_metadata = pickle.loads(course_metadata_raw)
    except NotFound:
        course_metadata = {}

    course_metadata = course_metadata if course_metadata else {}

    # Conform to free tier limits (looks like {"runtime": 123, "datetime": datetime(...)})
    last_modified_blob = coursepickle_bucket.blob(
        config.LAST_MODIFIED_BLOB_NAME)
    try:
        last_modified_raw = last_modified_blob.download_as_string()
        last_modified = pickle.loads(last_modified_raw)
    except NotFound:
        last_modified = {}

    last_modified = last_modified if last_modified else {
        "runtime": 0,
        "datetime": None
    }

    check_free_tier_force_exit(
        scheduler_client, scheduler_path,
        get_curr_runtime(last_modified["runtime"], start_time))
    if (last_modified["datetime"]
            and last_modified["datetime"].day < datetime.now().day):
        last_modified["runtime"] = 0

    if bool(int(config.UPDATE_EXTRA_FIELDS)):
        course_code_done = []
        for code in unique_course_codes:
            try:
                logging.info(f"Checking class {code}")
                print(code)
                split_code = code.split()
                pg = requests_connectionerror_bypass(
                    config.SCHEDULE_TARGET_URL_FMT,
                    [config.LATEST_TERM, *split_code], scheduler_client,
                    scheduler_path, last_modified, start_time)

                html_content = requests_bandwith_bypass(
                    pg, config.SCHEDULE_TARGET_URL_FMT, split_code,
                    scheduler_client, scheduler_path, last_modified,
                    start_time)

                class_ddtitle = html_content.find_all("th",
                                                      {"scope": "colgroup"},
                                                      class_="ddtitle")

                class_titles = [
                    th.a.text for th in class_ddtitle
                    if "table" in str(th.find_next("tr"))
                ]

                class_dddefaults = [
                    str(c).replace("\n", "")
                    for c in html_content.find_all("td", class_="dddefault")
                    if "cc.gatech.edu" in c.text or "students" in c.text
                    or "lecture" in c.text or "Semester" in c.text
                ]

                class_terms = [
                    re.search(
                        r"(?<=Associated Term: </span>)([a-zA-Z0-9'\s]*)(?=<br)",
                        c).group(0).strip() for c in class_dddefaults
                ]

                class_registration_dates = [
                    re.search(
                        r"(?<=Registration Dates: </span>)([a-zA-Z0-9,\s]*)(?=<br)",
                        c).group(0).strip() for c in class_dddefaults
                ]

                class_attributes = [
                    re.search(r"(?<=Attributes: </span>)([^<]*)(?=<br)",
                              c).group(0).strip()
                    if "Attributes" in c else None for c in class_dddefaults
                ]

                class_grade_bases = [
                    re.search(r"(?<=Grade Basis: </span>)([A-Z0-9\s]*)(?=<br)",
                              c).group(0).strip() for c in class_dddefaults
                ]

                class_table = html_content.find_all(
                    "table", class_="datadisplaytable")[1:-1]

                class_schedule_headers = [[
                    "_".join(header.text.lower().split())
                    for header in table.find_all("th")
                ] for table in class_table]

                class_schedule_data = [[
                    header.text.replace("(P)", "").strip()
                    for header in table.find_all("td")
                ] for table in class_table]

                for c in class_schedule_data:
                    c[-1] = " ".join(c[-1].split())

                instructor_emails = [
                    re.search(
                        r"([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)",
                        str(c)).group(1) if "mailto" in str(c) else None
                    for c in class_table
                ]

                pg = requests_connectionerror_bypass(
                    config.CRITIQUE_TARGET_URL_FMT, split_code,
                    scheduler_client, scheduler_path, last_modified,
                    start_time)

                html_content = requests_bandwith_bypass(
                    pg, config.CRITIQUE_TARGET_URL_FMT, split_code,
                    scheduler_client, scheduler_path, last_modified,
                    start_time)

                critique_table = html_content.find("table",
                                                   {"id": "dataTable"})

                critique_headers = [
                    "_".join(th.text.lower().split())
                    for th in critique_table.find_all("th")
                ][1:]

                critique_data_raw = [
                    td.text for td in critique_table.find_all("td")
                ]

                critique_data = [
                    critique_data_raw[x:x + len(critique_headers) + 1]
                    for x in range(0, len(critique_data_raw),
                                   len(critique_headers) + 1)
                ]

                critique_instructors = []
                for i in range(len(critique_data)):
                    critique_instructors.append(" ".join(
                        critique_data[i][0].split(", ")[::-1]))
                    del critique_data[i][0]
                    critique_data[i] = [critique_data[i][0]] + [
                        float(x) for x in critique_data[i][1:]
                    ]

                critique_averages = {}

                for i in range(len(critique_instructors)):
                    critique_averages[critique_instructors[i]] = dict(
                        zip(critique_headers, critique_data[i]))

                for i in range(len(class_titles)):
                    try:
                        schedule = dict(
                            zip(class_schedule_headers[i],
                                class_schedule_data[i]))
                    except Exception as exc:
                        # Re-raise as RuntimeError so the outer handler can
                        # checkpoint progress before exiting.
                        raise RuntimeError(
                            f"failed to build schedule for index {i}") from exc

                    course_metadata[class_titles[i]] = {
                        "terms": class_terms[i],
                        "registration_dates": class_registration_dates[i],
                        "attributes": class_attributes[i],
                        "grade_basis": class_grade_bases[i],
                        "schedule": schedule,
                        "instructor_email": instructor_emails[i],
                        "averages": critique_averages.get(
                            schedule["instructors"]),
                    }

                course_code_done.append(code)
            except RuntimeError as e:
                write_blobs_before_exit(coursepickle_blob, coursecode_blob,
                                        last_modified_blob, course_metadata,
                                        unique_course_codes, course_code_done,
                                        last_modified, start_time)
                schedule_next_try(scheduler_client, scheduler_path)
                raise e
    """
    Fetch per course seat, credit, and requirement information
    """
    for i in range(config.START_IDX, config.END_IDX):
        try:
            logging.info(f"Checking class with id {i}")

            pg = requests_connectionerror_bypass(
                config.REGISTRATION_TARGET_URL_FMT, [config.LATEST_TERM, i],
                scheduler_client, scheduler_path, last_modified, start_time)

            html_content = requests_bandwith_bypass(
                pg, config.REGISTRATION_TARGET_URL_FMT, [i], scheduler_client,
                scheduler_path, last_modified, start_time)

            if "-" not in html_content.text:
                logging.info(f"skipping {i}")
                continue

            class_general = html_content.find_all("th", {"scope": "row"},
                                                  class_="ddlabel")[0].text

            # For classes with dashes in the class name, replace them one by one with spaces
            # TODO retain dashes by using an alternative delimiter like " - "
            while class_general.count("-") != 3:
                class_general = class_general.replace("-", " ", 1)

            class_general_delimited = [
                s.strip() for s in class_general.split("-")
            ]

            class_name = class_general_delimited[0]

            class_id = int(class_general_delimited[1])

            class_code = class_general_delimited[2]

            class_dddefault = " ".join(
                html_content.find_all("td",
                                      class_="dddefault")[0].text.replace(
                                          "\n", " ").split())

            class_credits = float(
                re.search(r"\d+\.\d+(?=\s+Credits)", class_dddefault).group(0))

            class_seats = [
                int(
                    re.search(r"Seats (-*\d+) (-*\d+) (-*\d+)",
                              class_dddefault).group(x)) for x in range(1, 4)
            ]

            class_waitlist_seats = [
                int(
                    re.search(r"Waitlist Seats (-*\d+) (-*\d+) (-*\d+)",
                              class_dddefault).group(x)) for x in range(1, 4)
            ]

            # Regex search depends on which of prerequisites and restrictions
            # appear in the description text.
            class_prerequisites = None
            class_restrictions = None
            if "Prerequisites" in class_dddefault:
                class_prerequisites = re.search(
                    r"Prerequisites: (.*)", class_dddefault).group(1)
                if "Restrictions" in class_dddefault:
                    class_restrictions = re.search(
                        r"Restrictions: (.*) Prerequisites",
                        class_dddefault).group(1)
            elif "Restrictions" in class_dddefault:
                class_restrictions = re.search(
                    r"Restrictions: (.*)", class_dddefault).group(1)

            course_dict = {
                "id": class_id,
                "code": class_code,
                "name": class_name,
                "credits": class_credits,
                "seats": {
                    "capacity": class_seats[0],
                    "actual": class_seats[1],
                    "remaining": class_seats[2]
                },
                "waitlist": {
                    "capacity": class_waitlist_seats[0],
                    "actual": class_waitlist_seats[1],
                    "remaining": class_waitlist_seats[2]
                },
                "restrictions": class_restrictions,
                "prerequisites": class_prerequisites,
                "last_updated": datetime.now()
            }
            if class_general in course_metadata:
                course_dict.update(course_metadata[class_general])

            # Send all collected class metadata
            firebase_db.collection(config.PRIMARY_TABLE_NAME).document(
                str(class_id)).set(course_dict)

            all_table_name = f"{config.SECONDARY_TABLE_NAME}{i // 500}"
            all_courses_doc = firebase_db.collection(
                all_table_name).document("all_courses").get()
            if all_courses_doc.exists:
                all_courses = all_courses_doc.to_dict()
                all_courses[str(class_id)] = course_dict
                firebase_db.collection(all_table_name).document(
                    "all_courses").set(all_courses)
            else:
                firebase_db.collection(all_table_name).document(
                    "all_courses").set({str(class_id): course_dict})
        except RuntimeError as e:
            write_blobs_before_exit(coursepickle_blob, coursecode_blob,
                                    last_modified_blob, course_metadata, [],
                                    [], last_modified, start_time)
            schedule_next_try(scheduler_client, scheduler_path)
            raise e

    # Delete all blobs
    coursepickle_blob.delete()
    coursecode_blob.delete()
    last_modified_blob.delete()
    schedule_next_try(scheduler_client,
                      scheduler_path,
                      adjust_cron=timedelta(days=1))
    return "200 OK"