Example #1
    def download_table_as_df(self, full_table_id, staging_location):
        """
        Download a BigQuery table as a Pandas DataFrame.

        Args:
            full_table_id (str): fully qualified BigQuery table id
            staging_location (str): URL of the staging location (currently
                only a folder in GCS is supported)

        Returns:
            pandas.DataFrame: dataframe of the training dataset

        """
        if not is_gs_path(staging_location):
            raise ValueError("staging_location must be a directory in GCS")

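        # write the extract to a uniquely named temporary file in the staging folder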
        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = DestinationFormat.CSV
        job = self.bq.extract_table(Table.from_string(full_table_id),
                                    staging_file_path,
                                    job_config=job_config)

        # await completion
        job.result()
        return gcs_to_df(staging_file_path)
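
A minimal call-site sketch for the method above. It assumes an instance (named
`downloader` here) of the class that defines download_table_as_df, whose
`self.bq` is a google.cloud.bigquery.Client; the table id and bucket path are
placeholders, not values taken from the snippet.

# Hedged usage sketch: `downloader` stands for an instance of the (unnamed)
# class above; the table id and the staging folder below are placeholders.
df = downloader.download_table_as_df(
    "my-project.my_dataset.training_table",      # fully qualified BigQuery table id
    staging_location="gs://my-bucket/staging/")  # must point to a folder in GCS
print(df.shape)
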
Example #2
    def from_csv(cls,
                 path,
                 entity,
                 granularity,
                 owner,
                 staging_location=None,
                 id_column=None,
                 feature_columns=None,
                 timestamp_column=None,
                 timestamp_value=None,
                 serving_store=None,
                 warehouse_store=None):
        """Creates an importer from a given csv dataset. 
        This file can be either local or remote (in gcs). If it's a local file 
        then staging_location must be determined.
        
        Args:
            path (str): path to csv file
            entity (str): entity id
            granularity (Granularity): granularity of data
            owner (str): owner
            staging_location (str, optional): Defaults to None. Staging location 
                for ingesting a local csv file.
            id_column (str, optional): Defaults to None. Id column in the csv. 
                If not set, will default to the `entity` argument.
            feature_columns ([str], optional): Defaults to None. Feature columns
                to ingest. If not set, the importer will by default ingest all 
                available columns.
            timestamp_column (str, optional): Defaults to None. Timestamp
                column in the csv. If not set, the timestamp_value argument
                is used for all rows.
            timestamp_value (datetime, optional): Defaults to current datetime. 
                Timestamp value to assign to all features in the dataset.
            serving_store (feast.sdk.resources.feature.DataStore): Defaults to None.
                Serving store to write the features in this instance to.
            warehouse_store (feast.sdk.resources.feature.DataStore): Defaults to None.
                Warehouse store to write the features in this instance to.
        
        Returns:
            Importer: the importer for the dataset provided.
        """
        import_spec_options = {"format": "csv"}
        import_spec_options["path"], require_staging = \
            _get_remote_location(path, staging_location)
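        # load the dataset into a dataframe, either from GCS or the local filesystem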
        if is_gs_path(path):
            df = gcs_to_df(path)
        else:
            df = pd.read_csv(path)
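        # infer the schema and feature specs from the dataframe and the given columns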
        schema, features = \
            _detect_schema_and_feature(entity, granularity, owner, id_column,
                                       feature_columns, timestamp_column,
                                       timestamp_value, serving_store,
                                       warehouse_store, df)
        import_spec = _create_import("file", import_spec_options, entity,
                                     schema)

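        # assemble dataset properties and the final import specs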
        props = _properties("csv", len(df.index), require_staging,
                            import_spec_options["path"])
        specs = _specs(import_spec, Entity(name=entity), features)

        return cls(specs, df, props)
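
A hedged usage sketch for from_csv. `Importer` is assumed to be the class this
classmethod belongs to, `Granularity.DAY` an assumed enum value, and every
argument value below is a placeholder rather than something taken from the snippet.

# Hedged usage sketch; Importer and Granularity are assumed to be importable
# from the same SDK as the snippet above, and all argument values are placeholders.
from datetime import datetime

importer = Importer.from_csv(
    path="drivers.csv",                      # local file, so staging_location is required
    entity="driver",
    granularity=Granularity.DAY,             # assumed enum member
    owner="data-team@example.com",
    staging_location="gs://my-bucket/feast-staging",
    id_column="driver_id",
    feature_columns=["completed_trips", "acceptance_rate"],
    timestamp_value=datetime(2021, 1, 1))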