Example no. 1
0
def main():
    args = parse_args()

    # Imports of thor modules are deferred until after argument parsing so that we
    # do not pay the numba JIT compilation cost when the arguments are invalid or
    # the user only asked for --help.
    import thor.utils.logging

    thor.utils.logging.setupLogger("thor")

    from thor.taskqueue.client import Client as TaskQueueClient
    from thor.taskqueue.queue import TaskQueueConnection
    from thor.orbits import Orbits
    from thor.config import Config

    if not isinstance(args.config, str):
        # No config file was provided: fall back to the default Config values.
        config = Config
    else:
        config = Config.fromYaml(args.config)

    # Read observations
    preprocessed_observations = pd.read_csv(args.preprocessed_observations,
                                            index_col=False,
                                            dtype={"obs_id": str})

    # Read test orbits
    test_orbits = Orbits.from_csv(args.test_orbits)

    # Connect to Rabbit
    queue = TaskQueueConnection(
        pika.ConnectionParameters(
            host=args.rabbit_host,
            port=args.rabbit_port,
            credentials=pika.PlainCredentials(
                username=args.rabbit_username,
                password=args.rabbit_password,
            ),
        ),
        args.queue,
    )
    queue.connect()

    # Connect to GCS bucket
    gcs = GCSClient()
    if args.create_bucket:
        try:
            gcs.create_bucket(args.bucket)
        except google.cloud.exceptions.Conflict:
            # Bucket already exists.
            pass
    bucket = gcs.bucket(args.bucket)
    taskqueue_client = TaskQueueClient(bucket, queue)

    manifest = taskqueue_client.launch_job(config, preprocessed_observations,
                                           test_orbits)
    taskqueue_client.monitor_job_status(manifest.job_id)
    taskqueue_client.download_results(manifest, args.out_dir)
Example no. 2
0
    def __init__(
        self,
        bucketname: str,  # Should be the bucket and/or blob name
        filename: str,
        store_user_data: bool = True,
        store_chat_data: bool = True,
        store_bot_data: bool = True,
        # If False, stores data in chatID_user_data.json, chatID_chat_data.json
        # and chatID_bot_data.json instead of a single file.
        single_file: bool = True,
        on_flush: bool = False,
        storage_client: storage.Client = storage.Client()):
        super().__init__(
            store_user_data=store_user_data,
            store_chat_data=store_chat_data,
            store_bot_data=store_bot_data,
        )
        self.bucketname = bucketname
        self.filename = filename
        try:
            self.bucket = storage_client.get_bucket(bucketname)
        except Exception:
            # The bucket does not exist yet (or cannot be read): create it and
            # seed it with an empty JSON document.
            self.bucket = storage_client.create_bucket(bucketname)
            blob = self.bucket.blob(filename)
            blob.upload_from_string(json.dumps({}))
        self.storage_client = storage_client

        self.single_file = single_file
        self.on_flush = on_flush
        self.user_data: Optional[DefaultDict[int, Dict]] = None
        self.chat_data: Optional[DefaultDict[int, Dict]] = None
        self.bot_data: Optional[Dict] = None
        self.conversations: Optional[Dict[str, Dict[Tuple, object]]] = None
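A minimal usage sketch for the persistence class above, assuming it subclasses python-telegram-bot's BasePersistence (v13-style API) and is named GCSPersistence; the class name, bucket name, and token below are placeholders that do not appear in the excerpt.

# Hypothetical wiring; GCSPersistence is an assumed name for the class above.
from telegram.ext import Updater

persistence = GCSPersistence(bucketname="my-bot-state", filename="bot_data.json")
updater = Updater(token="<BOT-TOKEN>", persistence=persistence, use_context=True)
updater.start_polling()
updater.idle()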
Example no. 3
0
def test_extract_table(client, to_delete):
    DATASET_ID = 'export_data_dataset_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    table_ref = dataset.table('person_ages')
    table = client.create_table(bigquery.Table(table_ref, schema=SCHEMA))
    to_delete.insert(0, table)
    client.create_rows(table, ROWS)

    bucket_name = 'extract_person_ages_job_{}'.format(_millis())
    # [START extract_table]
    from google.cloud.storage import Client as StorageClient

    storage_client = StorageClient()
    bucket = storage_client.create_bucket(bucket_name)  # API request
    destination_blob_name = 'person_ages_out.csv'
    destination = bucket.blob(destination_blob_name)

    destination_uri = 'gs://{}/{}'.format(bucket_name, destination_blob_name)
    extract_job = client.extract_table(table_ref,
                                       destination_uri)  # API request
    extract_job.result(timeout=100)  # Waits for job to complete.

    got = destination.download_as_string().decode('utf-8')  # API request
    assert 'Bharney Rhubble' in got
    # [END extract_table]
    to_delete.append(bucket)
    to_delete.insert(0, destination)
Example no. 4
0
def bucket_object(storage_client: storage.Client) -> storage.Bucket:
    """ GCS Bucket from .env config """
    if not storage_client.lookup_bucket(TEST_BUCKET):
        bucket = storage_client.create_bucket(TEST_BUCKET)
    else:
        bucket = storage_client.get_bucket(TEST_BUCKET)
    yield bucket
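A brief usage sketch, assuming the generator above is registered as a pytest fixture (e.g. decorated with @pytest.fixture) and that TEST_BUCKET comes from the .env config; the test and blob names below are illustrative only.

# Hypothetical test consuming the fixture above.
def test_roundtrip_blob(bucket_object):
    blob = bucket_object.blob("smoke-test.txt")
    blob.upload_from_string("hello")
    assert blob.download_as_string() == b"hello"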
Example no. 5
0
def _write_csv_to_storage(bucket_name, blob_name, header_row, data_rows):
    import csv

    from google.cloud._testing import _NamedTemporaryFile
    from google.cloud.storage import Client as StorageClient

    storage_client = StorageClient()

    # In the **very** rare case the bucket name is reserved, this
    # fails with a ConnectionError.
    bucket = storage_client.create_bucket(bucket_name)

    blob = bucket.blob(blob_name)

    with _NamedTemporaryFile() as temp:
        with open(temp.name, 'w') as csv_write:
            writer = csv.writer(csv_write)
            writer.writerow(header_row)
            writer.writerows(data_rows)

        with open(temp.name, 'rb') as csv_read:
            blob.upload_from_file(csv_read, content_type='text/csv')

    return bucket, blob
Example no. 6
0
def test_extract_table(client, to_delete):
    DATASET_ID = 'export_data_dataset_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    table_ref = dataset.table('person_ages')
    to_insert = [
        {'full_name': name, 'age': age}
        for name, age in ROWS
    ]
    rows = [json.dumps(row) for row in to_insert]
    body = six.StringIO('{}\n'.format('\n'.join(rows)))
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.source_format = 'NEWLINE_DELIMITED_JSON'
    job_config.schema = SCHEMA
    to_delete.insert(0, table_ref)
    # Load the table from an in-memory, newline-delimited JSON stream.
    client.load_table_from_file(
        body, table_ref, job_config=job_config).result()

    bucket_name = 'extract_person_ages_job_{}'.format(_millis())
    # [START extract_table]
    from google.cloud.storage import Client as StorageClient

    storage_client = StorageClient()
    bucket = storage_client.create_bucket(bucket_name)  # API request
    destination_blob_name = 'person_ages_out.csv'
    destination = bucket.blob(destination_blob_name)

    destination_uri = 'gs://{}/{}'.format(bucket_name, destination_blob_name)
    extract_job = client.extract_table(
        table_ref, destination_uri)  # API request
    extract_job.result(timeout=100)  # Waits for job to complete.

    got = destination.download_as_string().decode('utf-8')  # API request
    assert 'Bharney Rhubble' in got
    # [END extract_table]
    to_delete.append(bucket)
    to_delete.insert(0, destination)
Example no. 7
0
def gcs_bucket(request, gcs: storage.Client) -> storage.Bucket:
    """GCS bucket for test artifacts"""
    bucket = gcs.create_bucket(f"test_gcs_ocn_bq_ingest_{str(uuid.uuid4())}")
    bucket.versioning_enabled = True
    bucket.patch()
    # override the default field delimiter at the bucket level
    load_config_json = {
        "fieldDelimiter": "|",
    }
    load_json_blob: storage.Blob = bucket.blob("_config/load.json")
    load_json_blob.upload_from_string(json.dumps(load_config_json))

    def teardown():
        # Since bucket has object versioning enabled, you must
        # delete all versions of objects before you can delete the bucket.
        for blob in gcs.list_blobs(bucket, versions=True):
            blob.delete()
        bucket.delete(force=True)

    request.addfinalizer(teardown)
    return bucket
Example no. 8
0
    def test_load_table_from_storage_then_dump_table(self):
        import csv
        import tempfile
        from google.cloud.storage import Client as StorageClient
        local_id = unique_resource_id()
        BUCKET_NAME = 'bq_load_test' + local_id
        BLOB_NAME = 'person_ages.csv'
        GS_URL = 'gs://%s/%s' % (BUCKET_NAME, BLOB_NAME)
        ROWS = [
            ('Phred Phlyntstone', 32),
            ('Bharney Rhubble', 33),
            ('Wylma Phlyntstone', 29),
            ('Bhettye Rhubble', 27),
        ]
        TABLE_NAME = 'test_table'

        s_client = StorageClient()

        # In the **very** rare case the bucket name is reserved, this
        # fails with a ConnectionError.
        bucket = s_client.create_bucket(BUCKET_NAME)
        self.to_delete.append(bucket)

        blob = bucket.blob(BLOB_NAME)

        with tempfile.TemporaryFile(mode='w+') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(('Full Name', 'Age'))
            writer.writerows(ROWS)
            blob.upload_from_file(
                csv_file, rewind=True, content_type='text/csv')

        self.to_delete.insert(0, blob)

        dataset = Config.CLIENT.dataset(
            _make_dataset_name('load_gcs_then_dump'))

        retry_403(dataset.create)()
        self.to_delete.append(dataset)

        full_name = bigquery.SchemaField('full_name', 'STRING',
                                         mode='REQUIRED')
        age = bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED')
        table = dataset.table(TABLE_NAME, schema=[full_name, age])
        table.create()
        self.to_delete.insert(0, table)

        job = Config.CLIENT.load_table_from_storage(
            'bq_load_storage_test_' + local_id, table, GS_URL)
        job.create_disposition = 'CREATE_NEVER'
        job.skip_leading_rows = 1
        job.source_format = 'CSV'
        job.write_disposition = 'WRITE_EMPTY'

        job.begin()

        def _job_done(instance):
            return instance.state in ('DONE', 'done')

        # Allow for 90 seconds of "warm up" before rows visible.  See:
        # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability
        # 8 tries -> 1 + 2 + 4 + 8 + 16 + 32 + 64 = 127 seconds
        retry = RetryInstanceState(_job_done, max_tries=8)
        retry(job.reload)()

        rows, _, _ = table.fetch_data()
        by_age = operator.itemgetter(1)
        self.assertEqual(sorted(rows, key=by_age),
                         sorted(ROWS, key=by_age))
Example no. 9
0
    def test_load_table_from_storage_then_dump_table(self):
        import csv
        import tempfile
        from google.cloud.storage import Client as StorageClient
        local_id = unique_resource_id()
        BUCKET_NAME = 'bq_load_test' + local_id
        BLOB_NAME = 'person_ages.csv'
        GS_URL = 'gs://%s/%s' % (BUCKET_NAME, BLOB_NAME)
        ROWS = [
            ('Phred Phlyntstone', 32),
            ('Bharney Rhubble', 33),
            ('Wylma Phlyntstone', 29),
            ('Bhettye Rhubble', 27),
        ]
        TABLE_NAME = 'test_table'

        s_client = StorageClient()

        # In the **very** rare case the bucket name is reserved, this
        # fails with a ConnectionError.
        bucket = s_client.create_bucket(BUCKET_NAME)
        self.to_delete.append(bucket)

        blob = bucket.blob(BLOB_NAME)

        with tempfile.TemporaryFile(mode='w+') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(('Full Name', 'Age'))
            writer.writerows(ROWS)
            blob.upload_from_file(csv_file,
                                  rewind=True,
                                  content_type='text/csv')

        self.to_delete.insert(0, blob)

        dataset = Config.CLIENT.dataset(
            _make_dataset_name('load_gcs_then_dump'))

        retry_403(dataset.create)()
        self.to_delete.append(dataset)

        full_name = bigquery.SchemaField('full_name',
                                         'STRING',
                                         mode='REQUIRED')
        age = bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED')
        table = dataset.table(TABLE_NAME, schema=[full_name, age])
        table.create()
        self.to_delete.insert(0, table)

        job = Config.CLIENT.load_table_from_storage(
            'bq_load_storage_test_' + local_id, table, GS_URL)
        job.create_disposition = 'CREATE_NEVER'
        job.skip_leading_rows = 1
        job.source_format = 'CSV'
        job.write_disposition = 'WRITE_EMPTY'

        job.begin()

        def _job_done(instance):
            return instance.state in ('DONE', 'done')

        # Allow for 90 seconds of "warm up" before rows visible.  See:
        # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability
        # 8 tries -> 1 + 2 + 4 + 8 + 16 + 32 + 64 = 127 seconds
        retry = RetryInstanceState(_job_done, max_tries=8)
        retry(job.reload)()

        rows, _, _ = table.fetch_data()
        by_age = operator.itemgetter(1)
        self.assertEqual(sorted(rows, key=by_age), sorted(ROWS, key=by_age))
Example no. 10
0
class GoogleClient(CloudClient):
    """
    Implementation of a Google Client using the Google API

    """
    def __init__(self, auth_dict, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cred_dict_string = base64.b64decode(
            auth_dict.get("CREDENTIALS_JSON_BASE64")).decode("utf-8")
        cred_dict = json.loads(self.cred_dict_string)
        credentials = service_account.Credentials.from_service_account_info(
            cred_dict)

        with open(constants.GOOGLE_CREDS_JSON_PATH, "w") as cred_dump:
            cred_dump.write(self.cred_dict_string)

        self.secret = self.create_gcp_secret()

        try:
            self.client = GCPStorageClient(project=cred_dict["project_id"],
                                           credentials=credentials)
        except DefaultCredentialsError:
            # Credential problems are not recoverable here; let them propagate.
            raise

    def internal_create_uls(self, name, region=None):
        """
        Creates the Underlying Storage using the Google API

        Args:
           name (str): The Underlying Storage name to be created
           region (str): The region to create the Underlying Storage

        """
        if region is None:
            self.client.create_bucket(name)
        else:
            self.client.create_bucket(name, location=region)

    def internal_delete_uls(self, name):
        """
        Deletes the Underlying Storage using the Google API

        Args:
           name (str): The Underlying Storage name to be deleted

        """
        # Todo: Replace with a TimeoutSampler
        for _ in range(10):
            try:
                bucket = GCPBucket(client=self.client, name=name)
                bucket.delete_blobs(bucket.list_blobs())
                bucket.delete()
                break
            except GoogleExceptions.NotFound:
                logger.warning(
                    "Failed to delete some of the bucket blobs. Retrying...")
                sleep(10)

    def get_all_uls_names(self):
        """
        Returns a set containing all the bucket names that the client has access to

        """
        return {bucket.id for bucket in self.client.list_buckets()}

    def verify_uls_exists(self, uls_name):
        """
        Verifies whether an Underlying Storage with the given uls_name exists

        Args:
           uls_name (str): The Underlying Storage name to be verified

        Returns:
             bool: True if Underlying Storage exists, False otherwise

        """
        try:
            self.client.get_bucket(uls_name)
            return True
        except GoogleExceptions.NotFound:
            return False

    def create_gcp_secret(self):
        """
        Create a Kubernetes secret to allow NooBaa to create Google-based backingstores

        """
        bs_secret_data = templating.load_yaml(
            constants.MCG_BACKINGSTORE_SECRET_YAML)
        bs_secret_data["metadata"]["name"] = create_unique_resource_name(
            "cldmgr-gcp", "secret")
        bs_secret_data["metadata"]["namespace"] = config.ENV_DATA[
            "cluster_namespace"]
        bs_secret_data["data"][
            "GoogleServiceAccountPrivateKeyJson"] = base64.urlsafe_b64encode(
                self.cred_dict_string.encode("UTF-8")).decode("ascii")

        return create_resource(**bs_secret_data)
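A hedged construction sketch: the excerpt only shows that auth_dict must carry a base64-encoded service-account JSON under CREDENTIALS_JSON_BASE64. The credentials file path, bucket name, region, any extra arguments CloudClient may require, and the surrounding ocs-ci environment (constants, templating, cluster config) are assumptions.

# Hypothetical setup; "sa.json" and the bucket/region names are placeholders.
import base64

with open("sa.json", "rb") as f:
    auth_dict = {"CREDENTIALS_JSON_BASE64": base64.b64encode(f.read()).decode("utf-8")}

gcp_client = GoogleClient(auth_dict=auth_dict)
gcp_client.internal_create_uls("my-test-bucket", region="us-east1")
assert gcp_client.verify_uls_exists("my-test-bucket")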
Example no. 11
0
class BucketClientGCS(BucketClient):
    client: Optional[GCSNativeClient]

    def __init__(self, client: Optional[GCSNativeClient] = None):
        try:
            # Prefer an explicitly provided client; otherwise build a default one.
            self.client = client or (GCSNativeClient() if GCSNativeClient else None)
        except (BaseException, DefaultCredentialsError):
            self.client = None

    def make_uri(self, path: PurePathy) -> str:
        return str(path)

    def create_bucket(self, path: PurePathy) -> Bucket:
        assert self.client is not None, _MISSING_DEPS
        return self.client.create_bucket(path.root)

    def delete_bucket(self, path: PurePathy) -> None:
        assert self.client is not None, _MISSING_DEPS
        bucket = self.client.get_bucket(path.root)
        bucket.delete()

    def exists(self, path: PurePathy) -> bool:
        # Because we want all the parents of a valid blob (e.g. "directory" in
        # "directory/foo.file") to return True, we enumerate the blobs with a prefix
        # and compare the object names to see if they match a substring of the path
        key_name = str(path.key)
        try:
            for obj in self.list_blobs(path):
                if obj.name == key_name:
                    return True
                if obj.name.startswith(key_name + path._flavour.sep):
                    return True
        except gcs_errors.ClientError:
            return False
        return False

    def lookup_bucket(self, path: PurePathy) -> Optional[BucketGCS]:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
        except gcs_errors.ClientError as err:
            print(err)

        return None

    def get_bucket(self, path: PurePathy) -> BucketGCS:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
            raise FileNotFoundError(f"Bucket {path.root} does not exist!")
        except gcs_errors.ClientError as e:
            raise ClientError(message=e.message, code=e.code)

    def list_buckets(
            self,
            **kwargs: Dict[str,
                           Any]) -> Generator[GCSNativeBucket, None, None]:
        assert self.client is not None, _MISSING_DEPS
        return self.client.list_buckets(**kwargs)  # type:ignore

    def scandir(  # type:ignore[override]
        self,
        path: Optional[PurePathy] = None,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> Generator[BucketEntryGCS, None, None]:  # type:ignore[override]
        assert self.client is not None, _MISSING_DEPS
        continuation_token = None
        if path is None or not path.root:
            gcs_bucket: GCSNativeBucket
            for gcs_bucket in self.list_buckets():
                yield BucketEntryGCS(gcs_bucket.name, is_dir=True, raw=None)
            return
        sep = path._flavour.sep
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        while True:
            if continuation_token:
                response = self.client.list_blobs(
                    bucket.name,
                    prefix=prefix,
                    delimiter=sep,
                    page_token=continuation_token,
                )
            else:
                response = self.client.list_blobs(bucket.name,
                                                  prefix=prefix,
                                                  delimiter=sep)
            for page in response.pages:
                for folder in list(page.prefixes):
                    full_name = folder[:-1] if folder.endswith(sep) else folder
                    name = full_name.split(sep)[-1]
                    if name:
                        yield BucketEntryGCS(name, is_dir=True, raw=None)
                for item in page:
                    name = item.name.split(sep)[-1]
                    if name:
                        yield BucketEntryGCS(
                            name=name,
                            is_dir=False,
                            size=item.size,
                            last_modified=item.updated.timestamp(),
                            raw=item,
                        )
            if response.next_page_token is None:
                break
            continuation_token = response.next_page_token

    def list_blobs(
        self,
        path: PurePathy,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
        include_dirs: bool = False,
    ) -> Generator[BlobGCS, None, None]:
        assert self.client is not None, _MISSING_DEPS
        continuation_token = None
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        while True:
            if continuation_token:
                response = self.client.list_blobs(
                    path.root,
                    prefix=prefix,
                    delimiter=delimiter,
                    page_token=continuation_token,
                )
            else:
                response = self.client.list_blobs(path.root,
                                                  prefix=prefix,
                                                  delimiter=delimiter)
            for page in response.pages:
                for item in page:
                    yield BlobGCS(
                        bucket=bucket,
                        owner=item.owner,
                        name=item.name,
                        raw=item,
                        size=item.size,
                        updated=item.updated.timestamp(),
                    )
            if response.next_page_token is None:
                break
            continuation_token = response.next_page_token
Example no. 12
0
    def test_load_table_from_storage_w_autodetect_schema(self):
        from google.cloud._testing import _NamedTemporaryFile
        from google.cloud.storage import Client as StorageClient
        from google.cloud.bigquery import SchemaField

        local_id = unique_resource_id()
        bucket_name = 'bq_load_test' + local_id
        blob_name = 'person_ages.csv'
        gs_url = 'gs://{}/{}'.format(bucket_name, blob_name)
        rows = [
            ('Phred Phlyntstone', 32),
            ('Bharney Rhubble', 33),
            ('Wylma Phlyntstone', 29),
            ('Bhettye Rhubble', 27),
        ] * 100  # BigQuery internally uses the first 100 rows to detect schema
        table_name = 'test_table'

        storage_client = StorageClient()

        # In the **very** rare case the bucket name is reserved, this
        # fails with a ConnectionError.
        bucket = storage_client.create_bucket(bucket_name)
        self.to_delete.append(bucket)

        blob = bucket.blob(blob_name)

        with _NamedTemporaryFile() as temp:
            with open(temp.name, 'w') as csv_write:
                writer = csv.writer(csv_write)
                writer.writerow(('Full Name', 'Age'))
                writer.writerows(rows)

            with open(temp.name, 'rb') as csv_read:
                blob.upload_from_file(csv_read, content_type='text/csv')

        self.to_delete.insert(0, blob)

        dataset = Config.CLIENT.dataset(
            _make_dataset_name('load_gcs_then_dump'))

        retry_403(dataset.create)()
        self.to_delete.append(dataset)

        table = dataset.table(table_name)
        self.to_delete.insert(0, table)

        job = Config.CLIENT.load_table_from_storage(
            'bq_load_storage_test_' + local_id, table, gs_url)
        job.autodetect = True

        job.begin()

        # Allow for 90 seconds of "warm up" before rows visible.  See
        # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability
        # 8 tries -> 1 + 2 + 4 + 8 + 16 + 32 + 64 = 127 seconds
        retry = RetryInstanceState(_job_done, max_tries=8)
        retry(job.reload)()

        table.reload()
        field_name = SchemaField(u'Full_Name', u'string', u'NULLABLE', None,
                                 ())
        field_age = SchemaField(u'Age', u'integer', u'NULLABLE', None, ())
        self.assertEqual(table.schema, [field_name, field_age])

        actual_rows = self._fetch_single_page(table)
        by_age = operator.itemgetter(1)
        self.assertEqual(sorted(actual_rows, key=by_age),
                         sorted(rows, key=by_age))
Example no. 13
0
class BucketClientGCS(BucketClient):
    client: Optional[GCSNativeClient]

    @property
    def client_params(self) -> Any:
        return dict(client=self.client)

    def __init__(self, **kwargs: Any) -> None:
        self.recreate(**kwargs)

    def recreate(self, **kwargs: Any) -> None:
        creds = kwargs["credentials"] if "credentials" in kwargs else None
        if creds is not None:
            kwargs["project"] = creds.project_id
        try:
            self.client = GCSNativeClient(**kwargs)
        except TypeError:
            # TypeError is raised if the imports for GCSNativeClient fail and are
            #  assigned to Any, which is not callable.
            self.client = None

    def make_uri(self, path: PurePathy) -> str:
        return str(path)

    def create_bucket(self, path: PurePathy) -> Bucket:
        assert self.client is not None, _MISSING_DEPS
        return self.client.create_bucket(path.root)

    def delete_bucket(self, path: PurePathy) -> None:
        assert self.client is not None, _MISSING_DEPS
        bucket = self.client.get_bucket(path.root)
        bucket.delete()

    def exists(self, path: PurePathy) -> bool:
        # Because we want all the parents of a valid blob (e.g. "directory" in
        # "directory/foo.file") to return True, we enumerate the blobs with a prefix
        # and compare the object names to see if they match a substring of the path
        key_name = str(path.key)
        try:
            for obj in self.list_blobs(path):
                if obj.name == key_name:
                    return True
                if obj.name.startswith(key_name + path._flavour.sep):
                    return True
        except gcs_errors.ClientError:
            return False
        return False

    def lookup_bucket(self, path: PurePathy) -> Optional[BucketGCS]:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
        except gcs_errors.ClientError as err:
            print(err)

        return None

    def get_bucket(self, path: PurePathy) -> BucketGCS:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
            raise FileNotFoundError(f"Bucket {path.root} does not exist!")
        except gcs_errors.ClientError as e:
            raise ClientError(message=e.message, code=e.code)

    def list_buckets(
        self, **kwargs: Dict[str, Any]
    ) -> Generator[GCSNativeBucket, None, None]:
        assert self.client is not None, _MISSING_DEPS
        return self.client.list_buckets(**kwargs)  # type:ignore

    def scandir(  # type:ignore[override]
        self,
        path: Optional[PurePathy] = None,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> PathyScanDir:
        return _GCSScanDir(client=self, path=path, prefix=prefix, delimiter=delimiter)

    def list_blobs(
        self,
        path: PurePathy,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
        include_dirs: bool = False,
    ) -> Generator[BlobGCS, None, None]:
        assert self.client is not None, _MISSING_DEPS
        continuation_token = None
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        while True:
            if continuation_token:
                response = self.client.list_blobs(
                    path.root,
                    prefix=prefix,
                    delimiter=delimiter,
                    page_token=continuation_token,
                )
            else:
                response = self.client.list_blobs(
                    path.root, prefix=prefix, delimiter=delimiter
                )
            for page in response.pages:
                for item in page:
                    yield BlobGCS(
                        bucket=bucket,
                        owner=item.owner,
                        name=item.name,
                        raw=item,
                        size=item.size,
                        updated=item.updated.timestamp(),
                    )
            if response.next_page_token is None:
                break
            continuation_token = response.next_page_token
Example no. 14
0
def start_process():
    start_time = time()
    storage_client = Client()
    scheduler_client = CloudSchedulerClient()
    scheduler_path = scheduler_client.location_path(config.PROJECT_ID,
                                                    config.REGION_ID)
    cred = credentials.ApplicationDefault()

    try:
        scheduler_client.delete_job(
            f"{scheduler_path}/jobs/{config.CRON_NAME}")
    except (GoogleAPICallError, PermissionDenied):
        logging.warning("course-collect manually triggered")

    try:
        scheduler_client.delete_job(f"{scheduler_path}/jobs/forcequit")
    except (GoogleAPICallError, PermissionDenied):
        logging.warning("forcequit job does not exist")

    if not _apps:
        initialize_app(cred, {"projectId": config.PROJECT_ID})
        logging.info("initializing firebase")

    firebase_db = firestore.client()

    if storage_client.bucket(config.BUCKET_NAME).exists():
        logging.info("reading from existing bucket")
        coursepickle_bucket = storage_client.bucket(config.BUCKET_NAME)
    else:
        logging.info("creating new bucket")
        coursepickle_bucket = storage_client.create_bucket(config.BUCKET_NAME)

    # Get unfinished course codes
    coursecode_blob = coursepickle_bucket.blob(config.COURSE_CODE_BLOB_NAME)
    try:
        coursecode_raw = coursecode_blob.download_as_string()
        unique_course_codes = pickle.loads(coursecode_raw)
    except NotFound:
        # Fetch course metadata per code for instructor, schedule, time, location, GPA, grade distributions
        all_courses = get_all_courses(firebase_db)
        unique_course_codes = set(
            [course["code"] for course in all_courses.values()])

    # Get existing course metadata
    coursepickle_blob = coursepickle_bucket.blob(
        config.COURSE_METADATA_BLOB_NAME)
    try:
        course_metadata_raw = coursepickle_blob.download_as_string()
        course_metadata = pickle.loads(course_metadata_raw)
    except NotFound:
        course_metadata = {}

    course_metadata = course_metadata if course_metadata else {}

    # Conform to free-tier limits; last_modified looks like
    # {"runtime": 123, "datetime": datetime(...)}.
    last_modified_blob = coursepickle_bucket.blob(
        config.LAST_MODIFIED_BLOB_NAME)
    try:
        last_modified_raw = last_modified_blob.download_as_string()
        last_modified = pickle.loads(last_modified_raw)
    except NotFound:
        last_modified = {}

    last_modified = last_modified if last_modified else {
        "runtime": 0,
        "datetime": None
    }

    check_free_tier_force_exit(
        scheduler_client, scheduler_path,
        get_curr_runtime(last_modified["runtime"], start_time))
    if last_modified[
            "datetime"] and last_modified["datetime"].day < datetime.now().day:
        last_modified["runtime"] = 0

    if bool(int(config.UPDATE_EXTRA_FIELDS)):
        course_code_done = []
        for code in unique_course_codes:
            try:
                logging.info(f"Checking class {code}")
                print(code)
                split_code = code.split()
                pg = requests_connectionerror_bypass(
                    config.SCHEDULE_TARGET_URL_FMT,
                    [config.LATEST_TERM, *split_code], scheduler_client,
                    scheduler_path, last_modified, start_time)

                html_content = requests_bandwith_bypass(
                    pg, config.SCHEDULE_TARGET_URL_FMT, split_code,
                    scheduler_client, scheduler_path, last_modified,
                    start_time)

                class_ddtitle = html_content.find_all("th",
                                                      {"scope": "colgroup"},
                                                      class_="ddtitle")

                class_titles = [
                    th.a.text for th in class_ddtitle
                    if "table" in str(th.find_next("tr"))
                ]

                class_dddefaults = [
                    str(c).replace("\n", "")
                    for c in html_content.find_all("td", class_="dddefault")
                    if "cc.gatech.edu" in c.text or "students" in c.text
                    or "lecture" in c.text or "Semester" in c.text
                ]

                class_terms = [
                    re.search(
                        "(?<=Associated Term: </span>)([a-zA-Z0-9'\s]*)(?=<br)",
                        c).group(0).strip() for c in class_dddefaults
                ]

                class_registration_dates = [
                    re.search(
                        "(?<=Registration Dates: </span>)([a-zA-Z0-9,\s]*)(?=<br)",
                        c).group(0).strip() for c in class_dddefaults
                ]

                class_attributes = [
                    re.search("(?<=Attributes: </span>)([^<]*)(?=<br)",
                              c).group(0).strip()
                    if "Attributes" in c else None for c in class_dddefaults
                ]

                class_grade_bases = [
                    re.search("(?<=Grade Basis: </span>)([A-Z0-9\s]*)(?=<br)",
                              c).group(0).strip() for c in class_dddefaults
                ]

                class_table = html_content.find_all(
                    "table", class_="datadisplaytable")[1:-1]

                class_schedule_headers = [[
                    "_".join(header.text.lower().split())
                    for header in table.find_all("th")
                ] for table in class_table]

                class_schedule_data = [[
                    header.text.replace("(P)", "").strip()
                    for header in table.find_all("td")
                ] for table in class_table]

                for c in class_schedule_data:
                    c[-1] = " ".join(c[-1].split())

                instructor_emails = [
                    re.search(
                        "([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)",
                        str(c)).group(1) if "mailto" in str(c) else None
                    for c in class_table
                ]

                pg = requests_connectionerror_bypass(
                    config.CRITIQUE_TARGET_URL_FMT, split_code,
                    scheduler_client, scheduler_path, last_modified,
                    start_time)

                html_content = requests_bandwith_bypass(
                    pg, config.CRITIQUE_TARGET_URL_FMT, split_code,
                    scheduler_client, scheduler_path, last_modified,
                    start_time)

                critique_table = html_content.find("table",
                                                   {"id": "dataTable"})

                critique_headers = [
                    "_".join(th.text.lower().split())
                    for th in critique_table.find_all("th")
                ][1:]

                critique_data_raw = [
                    td.text for td in critique_table.find_all("td")
                ]

                critique_data = [
                    critique_data_raw[x:x + len(critique_headers) + 1]
                    for x in range(0, len(critique_data_raw),
                                   len(critique_headers) + 1)
                ]

                critique_instructors = []
                for i in range(len(critique_data)):
                    critique_instructors.append(" ".join(
                        critique_data[i][0].split(", ")[::-1]))
                    del critique_data[i][0]
                    critique_data[i] = [critique_data[i][0]] + [
                        float(x) for x in critique_data[i][1:]
                    ]

                critique_averages = {}

                for i in range(len(critique_instructors)):
                    critique_averages[critique_instructors[i]] = dict(
                        zip(critique_headers, critique_data[i]))

                for i in range(len(class_titles)):
                    try:
                        schedule = dict(
                            zip(class_schedule_headers[i],
                                class_schedule_data[i]))
                    except Exception:
                        # Header/data lists are misaligned for this class.
                        print(i)
                        raise RuntimeError

                    course_metadata[class_titles[i]] = {
                        "terms":
                        class_terms[i],
                        "registration_dates":
                        class_registration_dates[i],
                        "attributes":
                        class_attributes[i],
                        "grade_basis":
                        class_grade_bases[i],
                        "schedule":
                        schedule,
                        "instructor_email":
                        instructor_emails[i],
                        "averages":
                        critique_averages[schedule["instructors"]] if
                        schedule["instructors"] in critique_averages else None
                    }

                course_code_done.append(code)
            except RuntimeError as e:
                write_blobs_before_exit(coursepickle_blob, coursecode_blob,
                                        last_modified_blob, course_metadata,
                                        unique_course_codes, course_code_done,
                                        last_modified, start_time)
                schedule_next_try(scheduler_client, scheduler_path)
                raise e
    """
    Fetch per course seat, credit, and requirement information
    """
    for i in range(config.START_IDX, config.END_IDX):
        try:
            logging.info(f"Checking class with id {i}")

            pg = requests_connectionerror_bypass(
                config.REGISTRATION_TARGET_URL_FMT, [config.LATEST_TERM, i],
                scheduler_client, scheduler_path, last_modified, start_time)

            html_content = requests_bandwith_bypass(
                pg, config.REGISTRATION_TARGET_URL_FMT, [i], scheduler_client,
                scheduler_path, last_modified, start_time)

            if "-" not in html_content.text:
                logging.info(f"skipping {i}")
                continue

            class_general = html_content.find_all("th", {"scope": "row"},
                                                  class_="ddlabel")[0].text

            # For classes with dashes in the class name, replace them one by one with spaces
            # TODO retain dashes by using an alternative delimiter like " - "
            while len(re.findall("-", class_general)) != 3:
                class_general = re.sub("-", " ", class_general, 1)

            class_general_delimited = [
                s.strip() for s in class_general.split("-")
            ]

            class_name = class_general_delimited[0]

            class_id = int(class_general_delimited[1])

            class_code = class_general_delimited[2]

            class_dddefault = " ".join(
                html_content.find_all("td",
                                      class_="dddefault")[0].text.replace(
                                          "\n", " ").split())

            class_credits = float(
                re.search("\d+\.\d+(?=\s+Credits)", class_dddefault).group(0))

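            # "Seats" is followed by three integers: capacity, actual, remaining.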
            class_seats = [
                int(
                    re.search("Seats (-*\d+) (-*\d+) (-*\d+)",
                              class_dddefault).group(x)) for x in range(1, 4)
            ]

            class_waitlist_seats = [
                int(
                    re.search("Waitlist Seats (-*\d+) (-*\d+) (-*\d+)",
                              class_dddefault).group(x)) for x in range(1, 4)
            ]

            # Regex search method depends on prerequisites and restrictions combination
            if "Prerequisites" in class_dddefault:
                if "Restrictions" in class_dddefault:
                    class_prerequisites = re.search("Prerequisites: (.*)",
                                                    class_dddefault).group(1)
                    class_restrictions = re.search(
                        "Restrictions: (.*) Prerequisites",
                        class_dddefault).group(1)
                else:
                    class_prerequisites = re.search("Prerequisites: (.*)",
                                                    class_dddefault).group(1)
                    class_restrictions = None
            else:
                if "Restrictions" in class_dddefault:
                    class_prerequisites = None
                    class_restrictions = re.search("Restrictions: (.*)",
                                                   class_dddefault).group(1)
                else:
                    class_prerequisites = None
                    class_restrictions = None

            course_dict = {
                "id": class_id,
                "code": class_code,
                "name": class_name,
                "credits": class_credits,
                "seats": {
                    "capacity": class_seats[0],
                    "actual": class_seats[1],
                    "remaining": class_seats[2]
                },
                "waitlist": {
                    "capacity": class_waitlist_seats[0],
                    "actual": class_waitlist_seats[1],
                    "remaining": class_waitlist_seats[2]
                },
                "restrictions": class_restrictions,
                "prerequisites": class_prerequisites,
                "last_updated": datetime.now()
            }
            if class_general in course_metadata:
                course_dict.update(course_metadata[class_general])

            # Send all collected class metadata
            firebase_db.collection(u'{}'.format(
                config.PRIMARY_TABLE_NAME)).document(
                    u'{}'.format(class_id)).set(course_dict)

            all_table_name = f"{config.SECONDARY_TABLE_NAME}{i // 500}"
            all_courses_doc = firebase_db.collection(
                u'{}'.format(all_table_name)).document(
                    u'{}'.format("all_courses")).get()
            if all_courses_doc.exists:
                all_courses = all_courses_doc.to_dict()
                all_courses[str(class_id)] = course_dict
                firebase_db.collection(u'{}'.format(all_table_name)).document(
                    u'{}'.format("all_courses")).set(all_courses)
            else:
                firebase_db.collection(u'{}'.format(all_table_name)).document(
                    u'{}'.format("all_courses")).set(
                        {str(class_id): course_dict})
        except RuntimeError as e:
            write_blobs_before_exit(coursepickle_blob, coursecode_blob,
                                    last_modified_blob, course_metadata, [],
                                    [], last_modified, start_time)
            schedule_next_try(scheduler_client, scheduler_path)
            raise e

    # Delete all blobs
    coursepickle_blob.delete()
    coursecode_blob.delete()
    last_modified_blob.delete()
    schedule_next_try(scheduler_client,
                      scheduler_path,
                      adjust_cron=timedelta(days=1))
    return "200 OK"
Example no. 15
0
class BucketClientGCS(BucketClient):
    client: GCSNativeClient

    @property
    def client_params(self) -> Any:
        return dict(client=self.client)

    def __init__(self, **kwargs: Any) -> None:
        self.recreate(**kwargs)

    def recreate(self, **kwargs: Any) -> None:
        creds = kwargs["credentials"] if "credentials" in kwargs else None
        if creds is not None:
            kwargs["project"] = creds.project_id
        self.client = GCSNativeClient(**kwargs)

    def make_uri(self, path: PurePathy) -> str:
        return str(path)

    def create_bucket(  # type:ignore[override]
            self, path: PurePathy) -> GCSNativeBucket:
        return self.client.create_bucket(path.root)  # type:ignore

    def delete_bucket(self, path: PurePathy) -> None:
        bucket = self.client.get_bucket(path.root)  # type:ignore
        bucket.delete()  # type:ignore

    def exists(self, path: PurePathy) -> bool:
        # Because we want all the parents of a valid blob (e.g. "directory" in
        # "directory/foo.file") to return True, we enumerate the blobs with a prefix
        # and compare the object names to see if they match a substring of the path
        key_name = str(path.key)
        for obj in self.list_blobs(path):
            if obj.name.startswith(key_name +
                                   path._flavour.sep):  # type:ignore
                return True
        return False

    def lookup_bucket(self, path: PurePathy) -> Optional[BucketGCS]:
        try:
            return self.get_bucket(path)
        except FileNotFoundError:
            return None

    def get_bucket(self, path: PurePathy) -> BucketGCS:
        native_bucket: Any = self.client.bucket(path.root)  # type:ignore
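        # bucket() only builds a local handle; exists() performs the API request,
        # and a BadRequest (e.g. an invalid bucket name) is treated like "not found".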
        try:
            if native_bucket.exists():
                return BucketGCS(str(path.root), bucket=native_bucket)
        except BadRequest:
            pass
        raise FileNotFoundError(f"Bucket {path.root} does not exist!")

    def list_buckets(  # type:ignore[override]
            self,
            **kwargs: Dict[str,
                           Any]) -> Generator[GCSNativeBucket, None, None]:
        return self.client.list_buckets(**kwargs)  # type:ignore

    def scandir(  # type:ignore[override]
        self,
        path: Optional[PurePathy] = None,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> PathyScanDir:
        return ScanDirGCS(client=self,
                          path=path,
                          prefix=prefix,
                          delimiter=delimiter)

    def list_blobs(
        self,
        path: PurePathy,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> Generator[BlobGCS, None, None]:
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        response: Any = self.client.list_blobs(  # type:ignore
            path.root, prefix=prefix, delimiter=delimiter)
        for page in response.pages:  # type:ignore
            for item in page:
                yield BlobGCS(
                    bucket=bucket,
                    owner=item.owner,
                    name=item.name,
                    raw=item,
                    size=item.size,
                    updated=item.updated.timestamp(),
                )