# Module-level imports used below (pandas, pika and the GCS client were
# referenced but never imported in the original snippet).
import pandas as pd
import pika
import google.cloud.exceptions
from google.cloud.storage import Client as GCSClient


def main():
    args = parse_args()

    # Imports of thor modules are deferred until after argument parsing to avoid
    # numba JIT time if the arguments are invalid or the user asked for --help.
    import thor.utils.logging

    thor.utils.logging.setupLogger("thor")

    from thor.taskqueue.client import Client as TaskQueueClient
    from thor.taskqueue.queue import TaskQueueConnection
    from thor.orbits import Orbits
    from thor.config import Config

    if not isinstance(args.config, str):
        # Fall back to the defaults carried on the Config class itself.
        config = Config
    else:
        config = Config.fromYaml(args.config)

    # Read observations
    preprocessed_observations = pd.read_csv(
        args.preprocessed_observations,
        index_col=False,
        dtype={"obs_id": str},
    )

    # Read test orbits
    test_orbits = Orbits.from_csv(args.test_orbits)

    # Connect to Rabbit
    queue = TaskQueueConnection(
        pika.ConnectionParameters(
            host=args.rabbit_host,
            port=args.rabbit_port,
            credentials=pika.PlainCredentials(
                username=args.rabbit_username,
                password=args.rabbit_password,
            ),
        ),
        args.queue,
    )
    queue.connect()

    # Connect to GCS bucket
    gcs = GCSClient()
    if args.create_bucket:
        try:
            gcs.create_bucket(args.bucket)
        except google.cloud.exceptions.Conflict:
            # Bucket already exists.
            pass
    bucket = gcs.bucket(args.bucket)

    taskqueue_client = TaskQueueClient(bucket, queue)
    manifest = taskqueue_client.launch_job(config, preprocessed_observations,
                                           test_orbits)
    taskqueue_client.monitor_job_status(manifest.job_id)
    taskqueue_client.download_results(manifest, args.out_dir)

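
# A minimal sketch of the parse_args() helper that main() above assumes. The
# flag names mirror exactly the attributes main() reads; defaults and help
# strings are illustrative, not taken from the real CLI.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Launch a THOR task-queue job")
    parser.add_argument("preprocessed_observations", help="CSV of preprocessed observations")
    parser.add_argument("test_orbits", help="CSV of test orbits")
    parser.add_argument("out_dir", help="directory for downloaded results")
    parser.add_argument("--config", default=None, help="YAML config file")
    parser.add_argument("--rabbit-host", default="localhost")
    parser.add_argument("--rabbit-port", type=int, default=5672)
    parser.add_argument("--rabbit-username", default="guest")
    parser.add_argument("--rabbit-password", default="guest")
    parser.add_argument("--queue", default="thor-tasks")
    parser.add_argument("--bucket", required=True, help="GCS bucket for job data")
    parser.add_argument("--create-bucket", action="store_true")
    return parser.parse_args()
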
def __init__(
        self,
        bucketname: str,  # should arguably be named bucket_name and/or blob_name
        filename: str,
        store_user_data: bool = True,
        store_chat_data: bool = True,
        store_bot_data: bool = True,
        single_file: bool = True,  # if False, stores chatID_user_data.json, chatID_chat_data.json and chatID_bot_data.json separately
        on_flush: bool = False,
        storage_client: Optional[storage.Client] = None):
    super().__init__(
        store_user_data=store_user_data,
        store_chat_data=store_chat_data,
        store_bot_data=store_bot_data,
    )
    # A default of storage.Client() in the signature would be evaluated once
    # at definition time; create the client lazily instead.
    storage_client = storage_client or storage.Client()
    self.bucketname = bucketname
    self.filename = filename
    try:
        self.bucket = storage_client.get_bucket(bucketname)
    except NotFound:  # was a bare `except:`; assumes `from google.cloud.exceptions import NotFound`
        self.bucket = storage_client.create_bucket(bucketname)
        blob = self.bucket.blob(filename)
        blob.upload_from_string(json.dumps({}))
    self.storage_client = storage_client
    self.single_file = single_file
    self.on_flush = on_flush
    self.user_data: Optional[DefaultDict[int, Dict]] = None
    self.chat_data: Optional[DefaultDict[int, Dict]] = None
    self.bot_data: Optional[Dict] = None
    self.conversations: Optional[Dict[str, Dict[Tuple, object]]] = None

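
# A usage sketch, assuming the constructor above belongs to a
# python-telegram-bot BasePersistence subclass named GCSPersistence (the class
# name and bot token are placeholders); the v13-style Updater takes it via the
# `persistence` keyword.
from telegram.ext import Updater

persistence = GCSPersistence(bucketname="my-bot-state", filename="state.json")
updater = Updater("BOT-TOKEN", persistence=persistence)
updater.start_polling()
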
def test_extract_table(client, to_delete):
    DATASET_ID = 'export_data_dataset_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    table_ref = dataset.table('person_ages')
    table = client.create_table(bigquery.Table(table_ref, schema=SCHEMA))
    to_delete.insert(0, table)
    client.create_rows(table, ROWS)

    bucket_name = 'extract_person_ages_job_{}'.format(_millis())

    # [START extract_table]
    from google.cloud.storage import Client as StorageClient

    storage_client = StorageClient()
    bucket = storage_client.create_bucket(bucket_name)  # API request
    destination_blob_name = 'person_ages_out.csv'
    destination = bucket.blob(destination_blob_name)

    destination_uri = 'gs://{}/{}'.format(bucket_name, destination_blob_name)
    extract_job = client.extract_table(table_ref, destination_uri)  # API request
    extract_job.result(timeout=100)  # Waits for job to complete.

    got = destination.download_as_string().decode('utf-8')  # API request
    assert 'Bharney Rhubble' in got
    # [END extract_table]

    to_delete.append(bucket)
    to_delete.insert(0, destination)

def bucket_object(storage_client: storage.Client) -> storage.Bucket:
    """GCS Bucket from .env config"""
    if not storage_client.lookup_bucket(TEST_BUCKET):
        bucket = storage_client.create_bucket(TEST_BUCKET)
    else:
        bucket = storage_client.get_bucket(TEST_BUCKET)
    yield bucket

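
# A sketch of a test consuming the fixture above; assumes bucket_object is
# registered with @pytest.fixture and TEST_BUCKET comes from the same module.
def test_bucket_is_reachable(bucket_object):
    assert bucket_object.exists()
    assert bucket_object.name == TEST_BUCKET
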
def _write_csv_to_storage(bucket_name, blob_name, header_row, data_rows):
    import csv

    from google.cloud._testing import _NamedTemporaryFile
    from google.cloud.storage import Client as StorageClient

    storage_client = StorageClient()

    # In the **very** rare case the bucket name is reserved, this
    # fails with a ConnectionError.
    bucket = storage_client.create_bucket(bucket_name)

    blob = bucket.blob(blob_name)

    with _NamedTemporaryFile() as temp:
        with open(temp.name, 'w') as csv_write:
            writer = csv.writer(csv_write)
            writer.writerow(header_row)
            writer.writerows(data_rows)

        with open(temp.name, 'rb') as csv_read:
            blob.upload_from_file(csv_read, content_type='text/csv')

    return bucket, blob

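
# Illustrative caller for the helper above; the bucket and blob names are
# placeholders, and the cleanup mirrors what the surrounding tests do.
bucket, blob = _write_csv_to_storage(
    'example-bucket-123',
    'people.csv',
    ('Full Name', 'Age'),
    [('Phred Phlyntstone', 32), ('Wylma Phlyntstone', 29)],
)
blob.delete()
bucket.delete()
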
def test_extract_table(client, to_delete):
    DATASET_ID = 'export_data_dataset_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    table_ref = dataset.table('person_ages')
    to_insert = [
        {'full_name': name, 'age': age}
        for name, age in ROWS
    ]
    rows = [json.dumps(row) for row in to_insert]
    body = six.StringIO('{}\n'.format('\n'.join(rows)))
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.source_format = 'NEWLINE_DELIMITED_JSON'
    job_config.schema = SCHEMA
    to_delete.insert(0, table_ref)
    # Load a table using a local JSON file from memory.
    client.load_table_from_file(
        body, table_ref, job_config=job_config).result()

    bucket_name = 'extract_person_ages_job_{}'.format(_millis())

    # [START extract_table]
    from google.cloud.storage import Client as StorageClient

    storage_client = StorageClient()
    bucket = storage_client.create_bucket(bucket_name)  # API request
    destination_blob_name = 'person_ages_out.csv'
    destination = bucket.blob(destination_blob_name)

    destination_uri = 'gs://{}/{}'.format(bucket_name, destination_blob_name)
    extract_job = client.extract_table(
        table_ref, destination_uri)  # API request
    extract_job.result(timeout=100)  # Waits for job to complete.

    got = destination.download_as_string().decode('utf-8')  # API request
    assert 'Bharney Rhubble' in got
    # [END extract_table]

    to_delete.append(bucket)
    to_delete.insert(0, destination)

def gcs_bucket(request, gcs: storage.Client) -> storage.Bucket:
    """GCS bucket for test artifacts"""
    bucket = gcs.create_bucket(f"test_gcs_ocn_bq_ingest_{str(uuid.uuid4())}")
    bucket.versioning_enabled = True
    bucket.patch()

    # Override the default field delimiter at the bucket level.
    load_config_json = {
        "fieldDelimiter": "|",
    }
    load_json_blob: storage.Blob = bucket.blob("_config/load.json")
    load_json_blob.upload_from_string(json.dumps(load_config_json))

    def teardown():
        # Since the bucket has object versioning enabled, all versions of
        # objects must be deleted before the bucket itself can be deleted.
        for blob in gcs.list_blobs(bucket, versions=True):
            blob.delete()
        bucket.delete(force=True)

    request.addfinalizer(teardown)
    return bucket

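
# A sketch of a test reading the bucket-level override back; assumes the
# fixture above is registered with @pytest.fixture.
def test_field_delimiter_override(gcs_bucket):
    raw = gcs_bucket.blob("_config/load.json").download_as_string()
    assert b'"fieldDelimiter": "|"' in raw
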
def test_load_table_from_storage_then_dump_table(self):
    import csv
    import tempfile
    from google.cloud.storage import Client as StorageClient

    local_id = unique_resource_id()
    BUCKET_NAME = 'bq_load_test' + local_id
    BLOB_NAME = 'person_ages.csv'
    GS_URL = 'gs://%s/%s' % (BUCKET_NAME, BLOB_NAME)
    ROWS = [
        ('Phred Phlyntstone', 32),
        ('Bharney Rhubble', 33),
        ('Wylma Phlyntstone', 29),
        ('Bhettye Rhubble', 27),
    ]
    TABLE_NAME = 'test_table'

    s_client = StorageClient()

    # In the **very** rare case the bucket name is reserved, this
    # fails with a ConnectionError.
    bucket = s_client.create_bucket(BUCKET_NAME)
    self.to_delete.append(bucket)

    blob = bucket.blob(BLOB_NAME)

    with tempfile.TemporaryFile(mode='w+') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(('Full Name', 'Age'))
        writer.writerows(ROWS)
        blob.upload_from_file(
            csv_file, rewind=True, content_type='text/csv')

    self.to_delete.insert(0, blob)

    dataset = Config.CLIENT.dataset(
        _make_dataset_name('load_gcs_then_dump'))

    retry_403(dataset.create)()
    self.to_delete.append(dataset)

    full_name = bigquery.SchemaField('full_name', 'STRING',
                                     mode='REQUIRED')
    age = bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED')
    table = dataset.table(TABLE_NAME, schema=[full_name, age])
    table.create()
    self.to_delete.insert(0, table)

    job = Config.CLIENT.load_table_from_storage(
        'bq_load_storage_test_' + local_id, table, GS_URL)
    job.create_disposition = 'CREATE_NEVER'
    job.skip_leading_rows = 1
    job.source_format = 'CSV'
    job.write_disposition = 'WRITE_EMPTY'

    job.begin()

    def _job_done(instance):
        return instance.state in ('DONE', 'done')

    # Allow for 90 seconds of "warm up" before rows visible.  See:
    # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability
    # 8 tries -> 1 + 2 + 4 + 8 + 16 + 32 + 64 = 127 seconds
    retry = RetryInstanceState(_job_done, max_tries=8)
    retry(job.reload)()

    rows, _, _ = table.fetch_data()
    by_age = operator.itemgetter(1)
    self.assertEqual(sorted(rows, key=by_age), sorted(ROWS, key=by_age))

class GoogleClient(CloudClient):
    """
    Implementation of a Google Client using the Google API
    """

    def __init__(self, auth_dict, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.cred_dict_string = base64.b64decode(
            auth_dict.get("CREDENTIALS_JSON_BASE64")).decode("utf-8")
        cred_dict = json.loads(self.cred_dict_string)
        credentials = service_account.Credentials.from_service_account_info(
            cred_dict)

        with open(constants.GOOGLE_CREDS_JSON_PATH, "w") as cred_dump:
            cred_dump.write(self.cred_dict_string)

        self.secret = self.create_gcp_secret()

        try:
            self.client = GCPStorageClient(project=cred_dict["project_id"],
                                           credentials=credentials)
        except DefaultCredentialsError:
            raise

    def internal_create_uls(self, name, region=None):
        """
        Creates the Underlying Storage using the Google API

        Args:
            name (str): The Underlying Storage name to be created
            region (str): The region to create the Underlying Storage in

        """
        if region is None:
            self.client.create_bucket(name)
        else:
            self.client.create_bucket(name, location=region)

    def internal_delete_uls(self, name):
        """
        Deletes the Underlying Storage using the Google API

        Args:
            name (str): The Underlying Storage name to be deleted

        """
        # TODO: Replace with a TimeoutSampler
        for _ in range(10):
            try:
                bucket = GCPBucket(client=self.client, name=name)
                bucket.delete_blobs(bucket.list_blobs())
                bucket.delete()
                break
            except GoogleExceptions.NotFound:
                logger.warning(
                    "Failed to delete some of the bucket blobs. Retrying...")
                sleep(10)

    def get_all_uls_names(self):
        """
        Returns a set containing all the bucket names that the client has access to
        """
        return {bucket.id for bucket in self.client.list_buckets()}

    def verify_uls_exists(self, uls_name):
        """
        Verifies whether an Underlying Storage with the given uls_name exists

        Args:
            uls_name (str): The Underlying Storage name to be verified

        Returns:
            bool: True if the Underlying Storage exists, False otherwise

        """
        try:
            self.client.get_bucket(uls_name)
            return True
        except GoogleExceptions.NotFound:
            return False

    def create_gcp_secret(self):
        """
        Create a Kubernetes secret to allow NooBaa to create Google-based backingstores
        """
        bs_secret_data = templating.load_yaml(
            constants.MCG_BACKINGSTORE_SECRET_YAML)
        bs_secret_data["metadata"]["name"] = create_unique_resource_name(
            "cldmgr-gcp", "secret")
        bs_secret_data["metadata"]["namespace"] = config.ENV_DATA[
            "cluster_namespace"]
        bs_secret_data["data"][
            "GoogleServiceAccountPrivateKeyJson"] = base64.urlsafe_b64encode(
                self.cred_dict_string.encode("UTF-8")).decode("ascii")

        return create_resource(**bs_secret_data)

class BucketClientGCS(BucketClient):
    client: Optional[GCSNativeClient]

    def __init__(self, client: Optional[GCSNativeClient] = None):
        try:
            self.client = GCSNativeClient() if GCSNativeClient else None
        except (BaseException, DefaultCredentialsError):
            self.client = None

    def make_uri(self, path: PurePathy) -> str:
        return str(path)

    def create_bucket(self, path: PurePathy) -> Bucket:
        assert self.client is not None, _MISSING_DEPS
        return self.client.create_bucket(path.root)

    def delete_bucket(self, path: PurePathy) -> None:
        assert self.client is not None, _MISSING_DEPS
        bucket = self.client.get_bucket(path.root)
        bucket.delete()

    def exists(self, path: PurePathy) -> bool:
        # Because we want all the parents of a valid blob (e.g. "directory" in
        # "directory/foo.file") to return True, we enumerate the blobs with a prefix
        # and compare the object names to see if they match a substring of the path
        key_name = str(path.key)
        try:
            for obj in self.list_blobs(path):
                if obj.name == key_name:
                    return True
                if obj.name.startswith(key_name + path._flavour.sep):
                    return True
        except gcs_errors.ClientError:
            return False
        return False

    def lookup_bucket(self, path: PurePathy) -> Optional[BucketGCS]:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
        except gcs_errors.ClientError as err:
            print(err)
        return None

    def get_bucket(self, path: PurePathy) -> BucketGCS:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
            raise FileNotFoundError(f"Bucket {path.root} does not exist!")
        except gcs_errors.ClientError as e:
            raise ClientError(message=e.message, code=e.code)

    def list_buckets(
        self, **kwargs: Dict[str, Any]
    ) -> Generator[GCSNativeBucket, None, None]:
        assert self.client is not None, _MISSING_DEPS
        return self.client.list_buckets(**kwargs)  # type:ignore

    def scandir(  # type:ignore[override]
        self,
        path: Optional[PurePathy] = None,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> Generator[BucketEntryGCS, None, None]:  # type:ignore[override]
        assert self.client is not None, _MISSING_DEPS
        continuation_token = None
        if path is None or not path.root:
            gcs_bucket: GCSNativeBucket
            for gcs_bucket in self.list_buckets():
                yield BucketEntryGCS(gcs_bucket.name, is_dir=True, raw=None)
            return
        sep = path._flavour.sep
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        while True:
            if continuation_token:
                response = self.client.list_blobs(
                    bucket.name,
                    prefix=prefix,
                    delimiter=sep,
                    page_token=continuation_token,
                )
            else:
                response = self.client.list_blobs(
                    bucket.name, prefix=prefix, delimiter=sep)
            for page in response.pages:
                for folder in list(page.prefixes):
                    full_name = folder[:-1] if folder.endswith(sep) else folder
                    name = full_name.split(sep)[-1]
                    if name:
                        yield BucketEntryGCS(name, is_dir=True, raw=None)
                for item in page:
                    name = item.name.split(sep)[-1]
                    if name:
                        yield BucketEntryGCS(
                            name=name,
                            is_dir=False,
                            size=item.size,
                            last_modified=item.updated.timestamp(),
                            raw=item,
                        )
            if response.next_page_token is None:
                break
            continuation_token = response.next_page_token

    def list_blobs(
        self,
        path: PurePathy,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
        include_dirs: bool = False,
    ) -> Generator[BlobGCS, None, None]:
        assert self.client is not None, _MISSING_DEPS
        continuation_token = None
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        while True:
            if continuation_token:
                response = self.client.list_blobs(
                    path.root,
                    prefix=prefix,
                    delimiter=delimiter,
                    page_token=continuation_token,
                )
            else:
                response = self.client.list_blobs(
                    path.root, prefix=prefix, delimiter=delimiter)
            for page in response.pages:
                for item in page:
                    yield BlobGCS(
                        bucket=bucket,
                        owner=item.owner,
                        name=item.name,
                        raw=item,
                        size=item.size,
                        updated=item.updated.timestamp(),
                    )
            if response.next_page_token is None:
                break
            continuation_token = response.next_page_token

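
# For reference: the native client's list_blobs() returns an iterator that
# fetches pages transparently, so simple consumers can skip the manual
# page-token loop above. A minimal sketch (bucket and prefix are placeholders):
from google.cloud import storage

client = storage.Client()
for blob in client.list_blobs("my-bucket", prefix="logs/"):
    print(blob.name, blob.size)
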
def test_load_table_from_storage_w_autodetect_schema(self):
    from google.cloud._testing import _NamedTemporaryFile
    from google.cloud.storage import Client as StorageClient
    from google.cloud.bigquery import SchemaField

    local_id = unique_resource_id()
    bucket_name = 'bq_load_test' + local_id
    blob_name = 'person_ages.csv'
    gs_url = 'gs://{}/{}'.format(bucket_name, blob_name)
    rows = [
        ('Phred Phlyntstone', 32),
        ('Bharney Rhubble', 33),
        ('Wylma Phlyntstone', 29),
        ('Bhettye Rhubble', 27),
    ] * 100  # BigQuery internally uses the first 100 rows to detect schema
    table_name = 'test_table'

    storage_client = StorageClient()

    # In the **very** rare case the bucket name is reserved, this
    # fails with a ConnectionError.
    bucket = storage_client.create_bucket(bucket_name)
    self.to_delete.append(bucket)

    blob = bucket.blob(blob_name)

    with _NamedTemporaryFile() as temp:
        with open(temp.name, 'w') as csv_write:
            writer = csv.writer(csv_write)
            writer.writerow(('Full Name', 'Age'))
            writer.writerows(rows)

        with open(temp.name, 'rb') as csv_read:
            blob.upload_from_file(csv_read, content_type='text/csv')

    self.to_delete.insert(0, blob)

    dataset = Config.CLIENT.dataset(
        _make_dataset_name('load_gcs_then_dump'))

    retry_403(dataset.create)()
    self.to_delete.append(dataset)

    table = dataset.table(table_name)
    self.to_delete.insert(0, table)

    job = Config.CLIENT.load_table_from_storage(
        'bq_load_storage_test_' + local_id, table, gs_url)
    job.autodetect = True

    job.begin()

    # Allow for 90 seconds of "warm up" before rows visible.  See
    # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability
    # 8 tries -> 1 + 2 + 4 + 8 + 16 + 32 + 64 = 127 seconds
    retry = RetryInstanceState(_job_done, max_tries=8)
    retry(job.reload)()

    table.reload()
    field_name = SchemaField(u'Full_Name', u'string', u'NULLABLE', None, ())
    field_age = SchemaField(u'Age', u'integer', u'NULLABLE', None, ())
    self.assertEqual(table.schema, [field_name, field_age])

    actual_rows = self._fetch_single_page(table)
    by_age = operator.itemgetter(1)
    self.assertEqual(
        sorted(actual_rows, key=by_age), sorted(rows, key=by_age))

class BucketClientGCS(BucketClient):
    client: Optional[GCSNativeClient]

    @property
    def client_params(self) -> Any:
        return dict(client=self.client)

    def __init__(self, **kwargs: Any) -> None:
        self.recreate(**kwargs)

    def recreate(self, **kwargs: Any) -> None:
        creds = kwargs["credentials"] if "credentials" in kwargs else None
        if creds is not None:
            kwargs["project"] = creds.project_id
        try:
            self.client = GCSNativeClient(**kwargs)
        except TypeError:
            # TypeError is raised if the imports for GCSNativeClient fail and are
            # assigned to Any, which is not callable.
            self.client = None

    def make_uri(self, path: PurePathy) -> str:
        return str(path)

    def create_bucket(self, path: PurePathy) -> Bucket:
        assert self.client is not None, _MISSING_DEPS
        return self.client.create_bucket(path.root)

    def delete_bucket(self, path: PurePathy) -> None:
        assert self.client is not None, _MISSING_DEPS
        bucket = self.client.get_bucket(path.root)
        bucket.delete()

    def exists(self, path: PurePathy) -> bool:
        # Because we want all the parents of a valid blob (e.g. "directory" in
        # "directory/foo.file") to return True, we enumerate the blobs with a prefix
        # and compare the object names to see if they match a substring of the path
        key_name = str(path.key)
        try:
            for obj in self.list_blobs(path):
                if obj.name == key_name:
                    return True
                if obj.name.startswith(key_name + path._flavour.sep):
                    return True
        except gcs_errors.ClientError:
            return False
        return False

    def lookup_bucket(self, path: PurePathy) -> Optional[BucketGCS]:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
        except gcs_errors.ClientError as err:
            print(err)
        return None

    def get_bucket(self, path: PurePathy) -> BucketGCS:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
            raise FileNotFoundError(f"Bucket {path.root} does not exist!")
        except gcs_errors.ClientError as e:
            raise ClientError(message=e.message, code=e.code)

    def list_buckets(
        self, **kwargs: Dict[str, Any]
    ) -> Generator[GCSNativeBucket, None, None]:
        assert self.client is not None, _MISSING_DEPS
        return self.client.list_buckets(**kwargs)  # type:ignore

    def scandir(  # type:ignore[override]
        self,
        path: Optional[PurePathy] = None,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> PathyScanDir:
        return _GCSScanDir(client=self, path=path, prefix=prefix, delimiter=delimiter)

    def list_blobs(
        self,
        path: PurePathy,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
        include_dirs: bool = False,
    ) -> Generator[BlobGCS, None, None]:
        assert self.client is not None, _MISSING_DEPS
        continuation_token = None
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        while True:
            if continuation_token:
                response = self.client.list_blobs(
                    path.root,
                    prefix=prefix,
                    delimiter=delimiter,
                    page_token=continuation_token,
                )
            else:
                response = self.client.list_blobs(
                    path.root, prefix=prefix, delimiter=delimiter
                )
            for page in response.pages:
                for item in page:
                    yield BlobGCS(
                        bucket=bucket,
                        owner=item.owner,
                        name=item.name,
                        raw=item,
                        size=item.size,
                        updated=item.updated.timestamp(),
                    )
            if response.next_page_token is None:
                break
            continuation_token = response.next_page_token

def start_process():
    start_time = time()
    storage_client = Client()
    scheduler_client = CloudSchedulerClient()
    scheduler_path = scheduler_client.location_path(config.PROJECT_ID,
                                                    config.REGION_ID)
    cred = credentials.ApplicationDefault()

    try:
        scheduler_client.delete_job(
            f"{scheduler_path}/jobs/{config.CRON_NAME}")
    except (GoogleAPICallError, PermissionDenied):
        # was `except GoogleAPICallError or PermissionDenied:`, which only
        # caught GoogleAPICallError; both exceptions are intended here
        logging.warning("course-collect manually triggered")
    try:
        scheduler_client.delete_job(f"{scheduler_path}/jobs/forcequit")
    except (GoogleAPICallError, PermissionDenied):
        logging.warning("forcequit job does not exist")

    if not _apps:
        initialize_app(cred, {"projectId": config.PROJECT_ID})
        logging.info("initializing firebase")
    firebase_db = firestore.client()

    if storage_client.bucket(config.BUCKET_NAME).exists():
        logging.info("reading from existing bucket")
        coursepickle_bucket = storage_client.bucket(config.BUCKET_NAME)
    else:
        logging.info("creating new bucket")
        coursepickle_bucket = storage_client.create_bucket(config.BUCKET_NAME)

    # Get unfinished course codes
    coursecode_blob = coursepickle_bucket.blob(config.COURSE_CODE_BLOB_NAME)
    try:
        coursecode_raw = coursecode_blob.download_as_string()
        unique_course_codes = pickle.loads(coursecode_raw)
    except NotFound:
        # Fetch course metadata per code for instructor, schedule, time,
        # location, GPA, grade distributions
        all_courses = get_all_courses(firebase_db)
        unique_course_codes = set(
            [course["code"] for course in all_courses.values()])

    # Get existing course metadata
    coursepickle_blob = coursepickle_bucket.blob(
        config.COURSE_METADATA_BLOB_NAME)
    try:
        course_metadata_raw = coursepickle_blob.download_as_string()
        course_metadata = pickle.loads(course_metadata_raw)
    except NotFound:
        course_metadata = {}
    course_metadata = course_metadata if course_metadata else {}

    # Conform to free tier limits (the blob looks like
    # {"runtime": 123, "datetime": datetime(...)})
    last_modified_blob = coursepickle_bucket.blob(
        config.LAST_MODIFIED_BLOB_NAME)
    try:
        last_modified_raw = last_modified_blob.download_as_string()
        last_modified = pickle.loads(last_modified_raw)
    except NotFound:
        last_modified = {}
    last_modified = last_modified if last_modified else {
        "runtime": 0,
        "datetime": None
    }
    check_free_tier_force_exit(
        scheduler_client, scheduler_path,
        get_curr_runtime(last_modified["runtime"], start_time))
    if last_modified[
            "datetime"] and last_modified["datetime"].day < datetime.now().day:
        last_modified["runtime"] = 0

    if bool(int(config.UPDATE_EXTRA_FIELDS)):
        course_code_done = []
        for code in unique_course_codes:
            try:
                logging.info(f"Checking class {code}")
                print(code)
                split_code = code.split()
                pg = requests_connectionerror_bypass(
                    config.SCHEDULE_TARGET_URL_FMT,
                    [config.LATEST_TERM, *split_code], scheduler_client,
                    scheduler_path, last_modified, start_time)
                html_content = requests_bandwith_bypass(
                    pg, config.SCHEDULE_TARGET_URL_FMT, split_code,
                    scheduler_client, scheduler_path, last_modified,
                    start_time)

                class_ddtitle = html_content.find_all(
                    "th", {"scope": "colgroup"}, class_="ddtitle")
                class_titles = [
                    th.a.text for th in class_ddtitle
                    if "table" in str(th.find_next("tr"))
                ]
                class_dddefaults = [
                    str(c).replace("\n", "")
                    for c in html_content.find_all("td", class_="dddefault")
                    if "cc.gatech.edu" in c.text or "students" in c.text
                    or "lecture" in c.text or "Semester" in c.text
                ]
                class_terms = [
                    re.search(
                        r"(?<=Associated Term: </span>)([a-zA-Z0-9'\s]*)(?=<br)",
                        c).group(0).strip() for c in class_dddefaults
                ]
                class_registration_dates = [
                    re.search(
                        r"(?<=Registration Dates: </span>)([a-zA-Z0-9,\s]*)(?=<br)",
                        c).group(0).strip() for c in class_dddefaults
                ]
                class_attributes = [
                    re.search(r"(?<=Attributes: </span>)([^<]*)(?=<br)",
                              c).group(0).strip()
                    if "Attributes" in c else None for c in class_dddefaults
                ]
                class_grade_bases = [
                    re.search(r"(?<=Grade Basis: </span>)([A-Z0-9\s]*)(?=<br)",
                              c).group(0).strip() for c in class_dddefaults
                ]
                class_table = html_content.find_all(
                    "table", class_="datadisplaytable")[1:-1]
                class_schedule_headers = [[
                    "_".join(header.text.lower().split())
                    for header in table.find_all("th")
                ] for table in class_table]
                class_schedule_data = [[
                    header.text.replace("(P)", "").strip()
                    for header in table.find_all("td")
                ] for table in class_table]
                for c in class_schedule_data:
                    c[-1] = " ".join(c[-1].split())
                instructor_emails = [
                    re.search(
                        r"([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)",
                        str(c)).group(1) if "mailto" in str(c) else None
                    for c in class_table
                ]

                pg = requests_connectionerror_bypass(
                    config.CRITIQUE_TARGET_URL_FMT, split_code,
                    scheduler_client, scheduler_path, last_modified,
                    start_time)
                html_content = requests_bandwith_bypass(
                    pg, config.CRITIQUE_TARGET_URL_FMT, split_code,
                    scheduler_client, scheduler_path, last_modified,
                    start_time)

                critique_table = html_content.find("table", {"id": "dataTable"})
                critique_headers = [
                    "_".join(th.text.lower().split())
                    for th in critique_table.find_all("th")
                ][1:]
                critique_data_raw = [
                    td.text for td in critique_table.find_all("td")
                ]
                critique_data = [
                    critique_data_raw[x:x + len(critique_headers) + 1]
                    for x in range(0, len(critique_data_raw),
                                   len(critique_headers) + 1)
                ]
                critique_instructors = []
                for i in range(len(critique_data)):
                    critique_instructors.append(" ".join(
                        critique_data[i][0].split(", ")[::-1]))
                    del critique_data[i][0]
                    critique_data[i] = [critique_data[i][0]] + [
                        float(x) for x in critique_data[i][1:]
                    ]
                critique_averages = {}
                for i in range(len(critique_instructors)):
                    critique_averages[critique_instructors[i]] = dict(
                        zip(critique_headers, critique_data[i]))

                for i in range(len(class_titles)):
                    try:
                        schedule = dict(
                            zip(class_schedule_headers[i],
                                class_schedule_data[i]))
                    except Exception:
                        # was a bare `except:`; re-raised as RuntimeError so the
                        # outer handler checkpoints progress before exiting
                        print(i)
                        raise RuntimeError
                    course_metadata[class_titles[i]] = {
                        "terms": class_terms[i],
                        "registration_dates": class_registration_dates[i],
                        "attributes": class_attributes[i],
                        "grade_basis": class_grade_bases[i],
                        "schedule": schedule,
                        "instructor_email": instructor_emails[i],
                        "averages": critique_averages[schedule["instructors"]]
                        if schedule["instructors"] in critique_averages else None
                    }
                course_code_done.append(code)
            except RuntimeError as e:
                write_blobs_before_exit(coursepickle_blob, coursecode_blob,
                                        last_modified_blob, course_metadata,
                                        unique_course_codes, course_code_done,
                                        last_modified, start_time)
                schedule_next_try(scheduler_client, scheduler_path)
                raise e

    # Fetch per course seat, credit, and requirement information
    for i in range(config.START_IDX, config.END_IDX):
        try:
            logging.info(f"Checking class with id {i}")
            pg = requests_connectionerror_bypass(
                config.REGISTRATION_TARGET_URL_FMT, [config.LATEST_TERM, i],
                scheduler_client, scheduler_path, last_modified, start_time)
            html_content = requests_bandwith_bypass(
                pg, config.REGISTRATION_TARGET_URL_FMT, [i], scheduler_client,
                scheduler_path, last_modified, start_time)
            if "-" not in html_content.text:
                logging.info(f"skipping {i}")
                continue

            class_general = html_content.find_all("th", {"scope": "row"},
                                                  class_="ddlabel")[0].text
            # For classes with dashes in the class name, replace them one by one
            # with spaces (see the illustration after this function)
            # TODO retain dashes by using an alternative delimiter like " - "
            while len(re.findall("-", class_general)) != 3:
                class_general = re.sub("-", " ", class_general, 1)
            class_general_delimited = [
                s.strip() for s in class_general.split("-")
            ]
            class_name = class_general_delimited[0]
            class_id = int(class_general_delimited[1])
            class_code = class_general_delimited[2]

            class_dddefault = " ".join(
                html_content.find_all(
                    "td", class_="dddefault")[0].text.replace("\n",
                                                              " ").split())
            class_credits = float(
                re.search(r"\d+\.\d+(?=\s+Credits)", class_dddefault).group(0))
            class_seats = [
                int(
                    re.search(r"Seats (-*\d+) (-*\d+) (-*\d+)",
                              class_dddefault).group(x)) for x in range(1, 4)
            ]
            class_waitlist_seats = [
                int(
                    re.search(r"Waitlist Seats (-*\d+) (-*\d+) (-*\d+)",
                              class_dddefault).group(x)) for x in range(1, 4)
            ]

            # Regex search method depends on prerequisites and restrictions
            # combination
            if "Prerequisites" in class_dddefault:
                if "Restrictions" in class_dddefault:
                    class_prerequisites = re.search(
                        "Prerequisites: (.*)", class_dddefault).group(1)
                    class_restrictions = re.search(
                        "Restrictions: (.*) Prerequisites",
                        class_dddefault).group(1)
                else:
                    class_prerequisites = re.search(
                        "Prerequisites: (.*)", class_dddefault).group(1)
                    class_restrictions = None
            else:
                if "Restrictions" in class_dddefault:
                    class_prerequisites = None
                    class_restrictions = re.search(
                        "Restrictions: (.*)", class_dddefault).group(1)
                else:
                    class_prerequisites = None
                    class_restrictions = None

            course_dict = {
                "id": class_id,
                "code": class_code,
                "name": class_name,
                "credits": class_credits,
                "seats": {
                    "capacity": class_seats[0],
                    "actual": class_seats[1],
                    "remaining": class_seats[2]
                },
                "waitlist": {
                    "capacity": class_waitlist_seats[0],
                    "actual": class_waitlist_seats[1],
                    "remaining": class_waitlist_seats[2]
                },
                "restrictions": class_restrictions,
                "prerequisites": class_prerequisites,
                "last_updated": datetime.now()
            }
            if class_general in course_metadata:
                course_dict.update(course_metadata[class_general])

            # Send all collected class metadata
            firebase_db.collection(u'{}'.format(
                config.PRIMARY_TABLE_NAME)).document(
                    u'{}'.format(class_id)).set(course_dict)

            all_table_name = f"{config.SECONDARY_TABLE_NAME}{i // 500}"
            all_courses_doc = firebase_db.collection(
                u'{}'.format(all_table_name)).document(
                    u'{}'.format("all_courses")).get()
            if all_courses_doc.exists:
                all_courses = all_courses_doc.to_dict()
                all_courses[str(class_id)] = course_dict
                firebase_db.collection(u'{}'.format(all_table_name)).document(
                    u'{}'.format("all_courses")).set(all_courses)
            else:
                firebase_db.collection(u'{}'.format(all_table_name)).document(
                    u'{}'.format("all_courses")).set(
                        {str(class_id): course_dict})
        except RuntimeError as e:
            write_blobs_before_exit(coursepickle_blob, coursecode_blob,
                                    last_modified_blob, course_metadata, [],
                                    [], last_modified, start_time)
            schedule_next_try(scheduler_client, scheduler_path)
            raise e

    # Delete all blobs
    coursepickle_blob.delete()
    coursecode_blob.delete()
    last_modified_blob.delete()
    schedule_next_try(scheduler_client, scheduler_path,
                      adjust_cron=timedelta(days=1))

    return "200 OK"

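
# Quick illustration of the dash normalization above (the course title is
# hypothetical): extra dashes are replaced one at a time, left to right, until
# only the three delimiter dashes remain, so dashes in names become spaces.
import re

title = "Intro to Object-Oriented Programming - 12345 - CS 1331 - A"
while len(re.findall("-", title)) != 3:
    title = re.sub("-", " ", title, 1)
print([s.strip() for s in title.split("-")])
# ['Intro to Object Oriented Programming', '12345', 'CS 1331', 'A']
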
class BucketClientGCS(BucketClient):
    client: GCSNativeClient

    @property
    def client_params(self) -> Any:
        return dict(client=self.client)

    def __init__(self, **kwargs: Any) -> None:
        self.recreate(**kwargs)

    def recreate(self, **kwargs: Any) -> None:
        creds = kwargs["credentials"] if "credentials" in kwargs else None
        if creds is not None:
            kwargs["project"] = creds.project_id
        self.client = GCSNativeClient(**kwargs)

    def make_uri(self, path: PurePathy) -> str:
        return str(path)

    def create_bucket(  # type:ignore[override]
        self, path: PurePathy
    ) -> GCSNativeBucket:
        return self.client.create_bucket(path.root)  # type:ignore

    def delete_bucket(self, path: PurePathy) -> None:
        bucket = self.client.get_bucket(path.root)  # type:ignore
        bucket.delete()  # type:ignore

    def exists(self, path: PurePathy) -> bool:
        # Because we want all the parents of a valid blob (e.g. "directory" in
        # "directory/foo.file") to return True, we enumerate the blobs with a prefix
        # and compare the object names to see if they match a substring of the path
        key_name = str(path.key)
        for obj in self.list_blobs(path):
            if obj.name.startswith(key_name + path._flavour.sep):  # type:ignore
                return True
        return False

    def lookup_bucket(self, path: PurePathy) -> Optional[BucketGCS]:
        try:
            return self.get_bucket(path)
        except FileNotFoundError:
            return None

    def get_bucket(self, path: PurePathy) -> BucketGCS:
        native_bucket: Any = self.client.bucket(path.root)  # type:ignore
        try:
            if native_bucket.exists():
                return BucketGCS(str(path.root), bucket=native_bucket)
        except BadRequest:
            pass
        raise FileNotFoundError(f"Bucket {path.root} does not exist!")

    def list_buckets(  # type:ignore[override]
        self, **kwargs: Dict[str, Any]
    ) -> Generator[GCSNativeBucket, None, None]:
        return self.client.list_buckets(**kwargs)  # type:ignore

    def scandir(  # type:ignore[override]
        self,
        path: Optional[PurePathy] = None,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> PathyScanDir:
        return ScanDirGCS(client=self, path=path, prefix=prefix,
                          delimiter=delimiter)

    def list_blobs(
        self,
        path: PurePathy,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> Generator[BlobGCS, None, None]:
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        response: Any = self.client.list_blobs(  # type:ignore
            path.root, prefix=prefix, delimiter=delimiter)
        for page in response.pages:  # type:ignore
            for item in page:
                yield BlobGCS(
                    bucket=bucket,
                    owner=item.owner,
                    name=item.name,
                    raw=item,
                    size=item.size,
                    updated=item.updated.timestamp(),
                )

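
# A hedged usage sketch for the client above; PurePathy comes from the
# surrounding pathy module, and the bucket/key names are placeholders, so
# treat this as an assumption about the gs:// path syntax rather than a
# verified example.
client = BucketClientGCS()
path = PurePathy("gs://my-bucket/reports/2021")
for blob in client.list_blobs(path, prefix=str(path.key)):
    print(blob.name, blob.size)
print(client.exists(path))  # True if any blob sits under "reports/2021/"
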