Example 1
    def download_table_as_df(self, full_table_id, staging_location):
        """
        Download a BigQuery table as a Pandas DataFrame
        Args:
            full_table_id (str): fully qualified BigQuery table id
            staging_location (str): url to the staging location (currently
                only a folder in GCS is supported)

        Returns: pandas.DataFrame: dataframe of the training dataset

        """
        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = DestinationFormat.CSV
        job = self.bq.extract_table(Table.from_string(full_table_id),
                                    staging_file_path,
                                    job_config=job_config)

        # await completion
        job.result()
        return gcs_to_df(staging_file_path)
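A short usage sketch follows; the enclosing class is not shown above, so the TableDownloader name, its no-argument constructor and the import path are assumptions, not part of the snippet.

    # Hypothetical usage; class name, constructor and import path are assumed.
    from feast.sdk.utils.bq_util import TableDownloader  # assumed location

    downloader = TableDownloader()
    df = downloader.download_table_as_df(
        "my-project.my_dataset.my_table",           # fully qualified table id
        staging_location="gs://my-bucket/staging")  # must be a GCS folder
    print(df.head())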
Example 2
    def download_table_as_file(self, full_table_id, dest, staging_location,
                               file_type):
        """
        Download a BigQuery table as a file
        Args:
            full_table_id (str): fully qualified BigQuery table id
            dest (str): destination filename
            staging_location (str): url to the staging location (currently
                only a folder in GCS is supported)
            file_type (feast.sdk.resources.feature_set.FileType): exported
                file format (default: FileType.CSV)
        Returns: (str) path to the downloaded file

        """
        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = file_type
        src_table = Table.from_string(full_table_id)
        job = self.bq.extract_table(src_table,
                                    staging_file_path,
                                    job_config=job_config)

        # await completion
        job.result()

        bucket_name, blob_name = split_gs_path(staging_file_path)
        bucket = self.gcs.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.download_to_filename(dest)
        return dest
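A usage sketch for the staged export; the downloader instance is assumed as in Example 1, and FileType comes from the import path named in the docstring.

    # Hypothetical usage; the downloader instance is an assumption.
    from feast.sdk.resources.feature_set import FileType

    local_path = downloader.download_table_as_file(
        "my-project.my_dataset.my_table",
        dest="/tmp/my_table.csv",
        staging_location="gs://my-bucket/staging",  # must be a GCS folder
        file_type=FileType.CSV)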
Example 3
    def from_csv(cls,
                 path,
                 entity,
                 granularity,
                 owner,
                 staging_location=None,
                 id_column=None,
                 feature_columns=None,
                 timestamp_column=None,
                 timestamp_value=None,
                 serving_store=None,
                 warehouse_store=None):
        """Creates an importer from a given csv dataset. 
        This file can be either local or remote (in gcs). If it's a local file 
        then staging_location must be determined.
        
        Args:
            path (str): path to csv file
            entity (str): entity id
            granularity (Granularity): granularity of data
            owner (str): owner
            staging_location (str, optional): Defaults to None. Staging location 
                for ingesting a local csv file.
            id_column (str, optional): Defaults to None. Id column in the csv. 
                If not set, will default to the `entity` argument.
            feature_columns ([str], optional): Defaults to None. Feature columns
                to ingest. If not set, the importer will by default ingest all 
                available columns.
            timestamp_column (str, optional): Defaults to None. Timestamp
                column in the csv. If not set, timestamp_value is used for
                all rows instead.
            timestamp_value (datetime, optional): Defaults to current datetime. 
                Timestamp value to assign to all features in the dataset.
            serving_store (feast.sdk.resources.feature.DataStore): Defaults to None.
                Serving store to write the features in this instance to.
            warehouse_store (feast.sdk.resources.feature.DataStore): Defaults to None.
                Warehouse store to write the features in this instance to.
        
        Returns:
            Importer: the importer for the dataset provided.
        """
        import_spec_options = {"format": "csv"}
        import_spec_options["path"], require_staging = \
            _get_remote_location(path, staging_location)
        if is_gs_path(path):
            df = gcs_to_df(path)
        else:
            df = pd.read_csv(path)
        schema, features = \
            _detect_schema_and_feature(entity, granularity, owner, id_column,
                                       feature_columns, timestamp_column,
                                       timestamp_value, serving_store,
                                       warehouse_store, df)
        iport_spec = _create_import("file", import_spec_options, entity,
                                    schema)

        props = (_properties("csv", len(df.index), require_staging,
                             import_spec_options["path"]))
        specs = _specs(iport_spec, Entity(name=entity), features)

        return cls(specs, df, props)
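A usage sketch for a local CSV; the Importer class and the Granularity import path are assumptions based on the surrounding SDK, and the file paths are made up.

    # Hypothetical usage; import paths and file paths are assumptions.
    from feast.sdk.importer import Importer                # assumed
    from feast.sdk.resources.feature import Granularity    # assumed

    importer = Importer.from_csv(
        path="data/driver_features.csv",
        entity="driver",
        granularity=Granularity.DAY,
        owner="data-team@example.com",
        staging_location="gs://my-bucket/staging",  # required for local files
        id_column="driver_id",
        timestamp_column="event_timestamp")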
Example 4
    def download_table_as_file(
        self, full_table_id, dest, file_type, staging_location=None
    ):
        """
        Download a BigQuery table as a file
        Args:
            full_table_id (str): fully qualified BigQuery table id
            dest (str): destination filename
            file_type (feast.sdk.resources.feature_set.FileType): exported
                file format (default: FileType.CSV)
            staging_location (str, optional): url to the staging location
                (currently only a folder in GCS is supported)
        Returns: (str) path to the downloaded file

        """
        if not staging_location:
            df = self.download_table_as_df(full_table_id)
            if file_type == FileType.CSV:
                df.to_csv(dest, index=False)
            elif file_type == FileType.JSON:
                df.to_json(dest, index=False)
            else:
                raise ValueError(
                    "Only FileType: CSV and JSON are supported for download_table_as_file without staging location"
                )
            return dest

        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        shard_folder = self.__extract_table_to_shard_folder(
            full_table_id, staging_location, file_type)
        return gcs_folder_to_file(shard_folder, dest)
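A sketch of both call styles, reusing the assumed downloader instance and FileType import from the earlier sketches:

    # Without staging_location the table goes through the BigQuery client
    # and only CSV and JSON are supported.
    downloader.download_table_as_file(
        "my-project.my_dataset.my_table",
        dest="/tmp/my_table.json",
        file_type=FileType.JSON)

    # With a GCS staging folder the extract-job path is used instead.
    downloader.download_table_as_file(
        "my-project.my_dataset.my_table",
        dest="/tmp/my_table.csv",
        file_type=FileType.CSV,
        staging_location="gs://my-bucket/staging")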
Example 5
def _get_remote_location(path, staging_location):
    """Get the remote location of the file
    
    Args:
        path (str): raw path of the file
        staging_location (str): path to stage the file

    """
    if is_gs_path(path):
        return path, False

    if staging_location is None:
        raise ValueError(
            "Specify staging_location for importing local file/dataframe")
    if not is_gs_path(staging_location):
        raise ValueError("Staging location must be in GCS")

    filename = ntpath.basename(path)
    return staging_location + "/" + filename, True
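The two branches, illustrated with made-up paths:

    # Illustration only; paths are made up.
    _get_remote_location("gs://bucket/data.csv", None)
    # -> ("gs://bucket/data.csv", False)   already remote, no staging needed

    _get_remote_location("/tmp/data.csv", "gs://bucket/staging")
    # -> ("gs://bucket/staging/data.csv", True)   local file, staging required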
Example 6
    def _validate_csv_importer(self,
                               importer,
                               csv_path,
                               entity_name,
                               feature_granularity,
                               owner,
                               staging_location=None,
                               id_column=None,
                               feature_columns=None,
                               timestamp_column=None,
                               timestamp_value=None):
        df = pd.read_csv(csv_path)
        assert importer.require_staging != is_gs_path(csv_path)
        if importer.require_staging:
            assert importer.remote_path == "{}/{}".format(
                staging_location, ntpath.basename(csv_path))

        # check features created
        for feature in importer.features.values():
            assert feature.name in df.columns
            assert feature.id == "{}.{}.{}".format(
                entity_name,
                Granularity_pb2.Enum.Name(feature_granularity.value).lower(),
                feature.name)

        import_spec = importer.spec
        assert import_spec.type == "file"
        path = importer.remote_path if importer.require_staging else csv_path
        assert import_spec.options == {"format": "csv", "path": path}
        assert import_spec.entities == [entity_name]

        schema = import_spec.schema
        assert schema.entityIdColumn == (
            id_column if id_column is not None else entity_name)
        if timestamp_column is not None:
            assert schema.timestampColumn == timestamp_column
        elif timestamp_value is not None:
            assert schema.timestampValue == timestamp_value

        if feature_columns is None:
            feature_columns = list(df.columns.values)
            feature_columns.remove(id_column)
            feature_columns.remove(timestamp_column)

        # check schema's field
        for col, field in zip(df.columns.values, schema.fields):
            assert col == field.name
            if col in feature_columns:
                assert field.featureId == "{}.{}.{}".format(
                    entity_name,
                    Granularity_pb2.Enum.Name(
                        feature_granularity.value).lower(), col)
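A sketch of a test that would exercise this helper; the Importer class, the Granularity enum and the fixture path are assumptions carried over from the earlier sketches.

    # Hypothetical test; fixture path, Importer and Granularity are assumptions.
    def test_from_csv_local_file(self):
        csv_path = "tests/data/driver_features.csv"
        staging = "gs://temp-bucket/staging"
        importer = Importer.from_csv(
            path=csv_path,
            entity="driver",
            granularity=Granularity.DAY,
            owner="tester@example.com",
            staging_location=staging,
            id_column="driver_id",
            timestamp_column="ts")
        self._validate_csv_importer(
            importer, csv_path, "driver", Granularity.DAY,
            "tester@example.com", staging_location=staging,
            id_column="driver_id", timestamp_column="ts")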
Example 7
    def download_table_as_file(self,
                               full_table_id,
                               dest,
                               file_type,
                               staging_location=None):
        """
        Download a BigQuery table as a file
        Args:
            full_table_id (str): fully qualified BigQuery table id
            dest (str): destination filename
            file_type (feast.sdk.resources.feature_set.FileType): exported
                file format (default: FileType.CSV)
            staging_location (str, optional): url to the staging location
                (currently only a folder in GCS is supported)
        Returns: (str) path to the downloaded file

        """
        if not staging_location:
            df = self.download_table_as_df(full_table_id)
            if file_type == FileType.CSV:
                df.to_csv(dest, index=False)
            elif file_type == FileType.JSON:
                df.to_json(dest, index=False)
            else:
                raise ValueError(
                    "Only FileType: CSV and JSON are supported for download_table_as_file without staging location"
                )
            return dest

        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = file_type
        src_table = Table.from_string(full_table_id)
        job = self.bqclient.extract_table(src_table,
                                          staging_file_path,
                                          job_config=job_config)

        # await completion
        job.result()

        bucket_name, blob_name = split_gs_path(staging_file_path)
        bucket = self.storageclient.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.download_to_filename(dest)
        return dest
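This snippet (and Example 2) relies on a split_gs_path helper that is not shown; a minimal sketch of what it plausibly does, which may differ from the real implementation:

def split_gs_path(gs_path):
    """Split "gs://bucket/path/to/blob" into ("bucket", "path/to/blob")."""
    without_scheme = gs_path[len("gs://"):]
    bucket, _, blob = without_scheme.partition("/")
    return bucket, blob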
Example 8
    def download_table_as_df(self, full_table_id, staging_location=None):
        """
        Download a BigQuery table as a Pandas DataFrame
        Args:
            full_table_id (str): fully qualified BigQuery table id
            staging_location (str, optional): url to the staging location
                (currently only a folder in GCS is supported)

        Returns: pandas.DataFrame: dataframe of the training dataset

        """
        if not staging_location:
            table = bigquery.TableReference.from_string(full_table_id)
            rows = self.bqclient.list_rows(table)
            return rows.to_dataframe(bqstorage_client=self.bqstorageclient)

        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        shard_folder = self.__extract_table_to_shard_folder(
            full_table_id, staging_location, DestinationFormat.CSV)
        return gcs_folder_to_df(shard_folder)
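A short sketch of both call styles, reusing the assumed downloader instance:

    # Without staging_location, rows are streamed straight into a DataFrame
    # through the BigQuery Storage API client attached to the downloader.
    df = downloader.download_table_as_df("my-project.my_dataset.my_table")

    # With a GCS staging folder, the table is exported to a shard folder first.
    df = downloader.download_table_as_df(
        "my-project.my_dataset.my_table",
        staging_location="gs://my-bucket/staging")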
Example 9
def test_is_gs_path():
    assert is_gs_path("gs://valid/gs/file.csv")
    assert not is_gs_path("local/path/file.csv")
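For completeness, a minimal is_gs_path consistent with this test; the real helper may differ.

def is_gs_path(path):
    # True for gs:// URLs, False for anything else.
    return path.lower().startswith("gs://")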