def test_file_path_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.read_avro(tf.name)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
Example #2
    def write(self, df: pd.DataFrame, schema_key: str, file_key: str) -> None:
        schema = self.__parse_schema(path=schema_key)
        pandavro.to_avro(file_key,
                         df,
                         schema=schema,
                         append=False,
                         codec='snappy')
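A minimal standalone sketch of the same pattern, writing with an explicit Avro schema and snappy compression; the schema dict and column names below are made-up examples, not the output of __parse_schema:

import pandas as pd
import pandavro

# Hypothetical record schema; field names and types are illustrative only.
schema = {
    "type": "record",
    "name": "Example",
    "fields": [
        {"name": "entity_id", "type": "long"},
        {"name": "feature_value", "type": ["null", "string"]},
    ],
}

df = pd.DataFrame({"entity_id": [1, 2], "feature_value": ["a", None]})

# codec='snappy' is forwarded to fastavro and requires the python-snappy package.
pandavro.to_avro("example.avro", df, schema=schema, append=False, codec="snappy")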
def test_delegation(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.from_avro(tf.name)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
Example #4
def test_buffer_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    with open(tf.name, 'rb') as f:
        expect = pdx.from_avro(BytesIO(f.read()))
    assert_frame_equal(expect, dataframe)
def test_get_batch_features_with_file(client):
    file_fs1 = client.get_feature_set(name="file_feature_set", version=1)

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": [i for i in range(N_ROWS)],
            "feature_value1": [f"{i}" for i in range(N_ROWS)],
        }
    )
    client.ingest(file_fs1, features_1_df, timeout=480)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    to_avro(df=features_1_df[["event_timestamp", "entity_id"]], file_path_or_buffer="file_feature_set.avro")

    time.sleep(15)
    feature_retrieval_job = client.get_batch_features(
        entity_rows="file://file_feature_set.avro", feature_refs=[f"{PROJECT_NAME}/feature_value1:1"]
    )

    output = feature_retrieval_job.to_dataframe()
    print(output.head())

    assert output["entity_id"].to_list() == [int(i) for i in output["feature_value1"].to_list()]
Example #6
    def run(self) -> bool:
        """
        Runs the command

        Returns:
            False on failure
        """

        # load schema
        schema = self.get_schema()

        # load sql file
        sql_query = self.get_sql_query()

        # query data frame from db
        logger.log('Read data from SQL', format=logger.Format.ITALICS)
        df = read_dataframe(self.db_alias, sql_query)

        # write avro file
        avro_file_path = f"{pathlib.Path(config.data_dir()) / self.file_name}"
        logger.log(f'Write to AVRO file {avro_file_path}',
                   format=logger.Format.ITALICS)
        pdx.to_avro(avro_file_path, df, schema=schema)

        return True
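A hedged sketch of the same flow without the project's helpers (read_dataframe, db_alias, config and logger are project-specific); sqlite3 and pandas.read_sql stand in here as assumptions:

import sqlite3

import pandas as pd
import pandavro as pdx


def export_query_to_avro(db_path: str, sql_query: str, avro_file_path: str) -> None:
    # Read the query result into a DataFrame.
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql(sql_query, conn)
    finally:
        conn.close()

    # Write the result as Avro; the schema is inferred from the DataFrame dtypes.
    pdx.to_avro(avro_file_path, df)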
Example #7
def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
    """Save pandas dataframe as avro.
    If providing your own schema, the usage of schema argument is preferred

    :param schema: Avro Schema determines dtypes saved
    """
    import pandavro as pdx

    kw = ParamDict(kwargs)

    # pandavro defaults
    schema = None
    append = False
    times_as_micros = True

    if "schema" in kw:
        schema = kw["schema"]
        del kw["schema"]

    if "append" in kw:
        # default is overwrite (False) rather than append (True)
        append = kw["append"]
        del kw["append"]

    if "times_as_micros" in kw:
        times_as_micros = kw["times_as_micros"]
        del kw["times_as_micros"]

    pdf = df.as_pandas()
    pdx.to_avro(p.uri,
                pdf,
                schema=schema,
                append=append,
                times_as_micros=times_as_micros,
                **kw)
Example #8
def write_with_compression(df, compression):
    start = timer()
    pdx.to_avro(OUTPUT_FILE_PATH, df, codec=compression)
    end = timer()
    print('Time to write avro with {} compression: {} seconds'.format(
        compression, end - start))
    print('Resulting size: {}'.format(
        util.get_readable_file_size(OUTPUT_FILE_PATH)))
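A usage sketch; the DataFrame below is made up, OUTPUT_FILE_PATH and util come from the module above, and 'snappy' assumes the python-snappy package is installed:

import numpy as np
import pandas as pd

df = pd.DataFrame({"value": np.random.randn(100_000)})

# 'null' means no compression; 'deflate' and 'snappy' trade CPU time for file size.
for codec in ("null", "deflate", "snappy"):
    write_with_compression(df, codec)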
def test_buffer_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    with open(tf.name, 'rb') as f:
        expect = pdx.read_avro(BytesIO(f.read()))
        expect['DateTime64'] = expect['DateTime64'].astype(
            np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
def test_append(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe[0:int(dataframe.shape[0] / 2)])
    pdx.to_avro(tf.name, dataframe[int(dataframe.shape[0] / 2):], append=True)
    expect = pdx.from_avro(tf.name)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
Example #11
def test_batch_get_historical_features_with_file(client):
    file_fs1 = client.get_feature_set(name="file_feature_set")

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": [i for i in range(N_ROWS)],
            "feature_value1": [f"{i}" for i in range(N_ROWS)],
        }
    )

    # The feature set may already be READY (the direct runner marks it ready right
    # after the job is submitted), but the Kafka consumer may not be configured yet,
    # so give the ingestion job some time to warm up.
    wait_retry_backoff(
        retry_fn=(
            lambda: (
                None,
                client.get_feature_set(name="file_feature_set").status
                == FeatureSetStatus.STATUS_READY,
            )
        ),
        timeout_secs=480,
        timeout_msg="Wait for FeatureSet to be READY",
    )
    time.sleep(20)

    client.ingest(file_fs1, features_1_df, timeout=480)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    to_avro(
        df=features_1_df[["event_timestamp", "entity_id"]],
        file_path_or_buffer="file_feature_set.avro",
    )

    time.sleep(10)

    def check():
        feature_retrieval_job = client.get_historical_features(
            entity_rows="file://file_feature_set.avro",
            feature_refs=["feature_value1"],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180)
        print(output.head())

        assert output["entity_id"].to_list() == [
            int(i) for i in output["feature_value1"].to_list()
        ]
        clean_up_remote_files(feature_retrieval_job.get_avro_files())

    wait_for(check, timedelta(minutes=10))
    def save(self, save_path: str) -> None:
        save_file = self.setup_save_file(save_path=save_path, extension="avro")

        structured_data = self._get_structured_data()

        dataframe = pandas.DataFrame.from_dict(structured_data)
        pandavro.to_avro(save_file, dataframe)

        self._log_save(save_file)
Example #13
def serialize_panda_df_to_str(df: pd.DataFrame, schema: Dict) -> str:
    with io.BytesIO() as bytes_io:
        # Without this we get: ValueError: NaTType does not support timestamp.
        # It's really a pandavro issue; see https://github.com/fastavro/fastavro/issues/313
        # TODO(talebz): create a pandavro issue for this!
        df = df.replace({np.nan: None})
        pandavro.to_avro(bytes_io, df, schema=schema)
        bytes_io.seek(0)
        return base64.b64encode(bytes_io.read()).decode("utf-8")
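The reverse direction is not shown above; a minimal sketch of a matching deserializer (the function name is hypothetical):

import base64
import io

import pandas as pd
import pandavro


def deserialize_panda_df_from_str(payload: str) -> pd.DataFrame:
    # Decode the base64 payload and read the Avro bytes back into a DataFrame.
    with io.BytesIO(base64.b64decode(payload)) as bytes_io:
        return pandavro.from_avro(bytes_io)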
Example #14
def _save_avro(df: LocalDataFrame,
               p: FileParser,
               columns: Any = None,
               **kwargs: Any):
    """Save pandas dataframe as avro.
    If providing your own schema, the usage of schema argument is preferred

    """

    kw = ParamDict(kwargs)
    # pandavro defaults
    schema = None
    append = False
    times_as_micros = True

    if "schema" in kw:
        schema = kw["schema"]
        if schema is None:
            if columns is not None:
                schema = _convert_pyarrow_to_avro_schema(df, columns)
        else:
            if columns:
                # both schema and columns provided
                raise Exception("set columns to None when schema is provided")

        del kw["infer_schema"]

    if "infer_schema" in kw:
        infer_schema = kw["infer_schema"]
        if infer_schema and (schema is not None):
            # infer_schema set to True but schema was provided
            raise Exception(
                "set infer_schema to False when schema is provided")
        del kw["infer_schema"]

    if "append" in kw:
        # default is overwrite (False) rather than append (True)
        append = kw["append"]
        del kw["append"]

    if "times_as_micros" in kw:
        times_as_micros = kw["times_as_micros"]
        del kw["times_as_micros"]

    pdf = df.as_pandas()
    pdx.to_avro(p.uri,
                pdf,
                schema=schema,
                append=append,
                times_as_micros=times_as_micros,
                **kw)
Example #15
def test_batch_get_historical_features_with_gs_path(client, gcs_path):
    gcs_fs1 = client.get_feature_set(name="gcs_feature_set")

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": [i for i in range(N_ROWS)],
            "feature_value2": [f"{i}" for i in range(N_ROWS)],
        }
    )
    client.ingest(gcs_fs1, features_1_df, timeout=360)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    # Output file to local
    file_name = "gcs_feature_set.avro"
    to_avro(
        df=features_1_df[["event_timestamp", "entity_id"]],
        file_path_or_buffer=file_name,
    )

    uri = urlparse(gcs_path)
    bucket = uri.hostname
    ts = int(time.time())
    remote_path = str(uri.path).strip("/") + f"/{ts}/{file_name}"

    # Upload file to gcs
    storage_client = storage.Client(project=None)
    bucket = storage_client.get_bucket(bucket)
    blob = bucket.blob(remote_path)
    blob.upload_from_filename(file_name)

    time.sleep(10)

    def check():
        feature_retrieval_job = client.get_historical_features(
            entity_rows=f"{gcs_path}/{ts}/*",
            feature_refs=["feature_value2"],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180)
        print(output.head())
        assert output["entity_id"].to_list() == [
            int(i) for i in output["feature_value2"].to_list()
        ]

        clean_up_remote_files(feature_retrieval_job.get_avro_files())
        blob.delete()

    wait_for(check, timedelta(minutes=5))
Example #16
def main():
    df = pd.DataFrame({"Boolean": [True, False, True, False],
                       "Float64": np.random.randn(4),
                       "Int64": np.random.randint(0, 10, 4),
                       "String": ['foo', 'bar', 'foo', 'bar'],
                       "DateTime64": [pd.Timestamp('20190101'), pd.Timestamp('20190102'),
                                      pd.Timestamp('20190103'), pd.Timestamp('20190104')]})

    pdx.to_avro(OUTPUT_PATH, df)
    saved = pdx.read_avro(OUTPUT_PATH)
    print(saved)
Example #17
def save_transformed_data(data):
    try:
        print('saving as Parquet')
        data.to_parquet(cs.TRANSFORMED_DATA_PATH_PARQUET)
        print('saving as AVRO')
        data.to_csv('filtered.csv')
        new_data = pd.read_csv('filtered.csv', keep_default_na=False)
        pdx.to_avro(cs.TRANSFORMED_DATA_PATH_AVRO, new_data)
        print('saving as JSON gzip')
        data.to_json(cs.TRANSFORMED_DATA_PATH_JSON, compression='gzip')
    except Exception as error:
        print(error)
def _write_avro(df, tmpfile, times_as_micros=False, *args, **kwargs):
    """
    Saves a DataFrame to Avro format

    Args:
        df (pd.DataFrame): The DataFrame to be written to Avro
        tmpfile (tempfile.NamedTemporaryFile):
            Connection to the file to be written to
        times_as_micros (bool):
            Whether to save timestamps as microseconds (the pandavro
            default) or as milliseconds (False, the default here)
    """
    pdx.to_avro(tmpfile, df, *args, times_as_micros=times_as_micros, **kwargs)
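A usage sketch with made-up sample data, writing through _write_avro into a temporary file:

import tempfile

import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# times_as_micros defaults to False above, so timestamps would be written as milliseconds.
with tempfile.NamedTemporaryFile(suffix=".avro") as tmp:
    _write_avro(df, tmp)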
Example #19
    def put(self, data: pd.DataFrame, short_description: str) -> DatasetVersion:
        ns: str = self.__namespace or '_'
        ds: str = self.__name

        file: BytesIO = BytesIO()
        pandavro.to_avro(file, data)
        file.seek(0)

        resp = client.put('/datasets/' + ns + '/' + ds + '/versions', files = {
            'message': short_description,
            'file': file
        })

        return self.version(resp.json())
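A hedged sketch of the same upload pattern using requests directly; the endpoint URL and form field names are hypothetical, not the client's real API:

from io import BytesIO

import pandas as pd
import pandavro
import requests


def upload_dataframe_as_avro(url: str, df: pd.DataFrame, message: str) -> requests.Response:
    # Serialize the DataFrame to an in-memory Avro buffer and send it as a multipart upload.
    buffer = BytesIO()
    pandavro.to_avro(buffer, df)
    buffer.seek(0)
    return requests.post(url, files={"file": buffer}, data={"message": message})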
def test_get_batch_features_with_gs_path(client, gcs_path):
    gcs_fs1 = FeatureSet(
        "gcs_feature_set",
        features=[Feature("feature_value", ValueType.STRING)],
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )

    client.apply(gcs_fs1)
    gcs_fs1 = client.get_feature_set(name="gcs_feature_set", version=1)

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": [i for i in range(N_ROWS)],
        "feature_value": [f"{i}" for i in range(N_ROWS)],
    })
    client.ingest(gcs_fs1, features_1_df)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(
        columns={"datetime": "event_timestamp"})

    # Output file to local
    file_name = "gcs_feature_set.avro"
    to_avro(df=features_1_df, file_path_or_buffer=file_name)

    uri = urlparse(gcs_path)
    bucket = uri.hostname
    ts = int(time.time())
    remote_path = str(uri.path).strip("/") + f"{ts}/{file_name}"

    # Upload file to gcs
    storage_client = storage.Client(project=None)
    bucket = storage_client.get_bucket(bucket)
    blob = bucket.blob(remote_path)
    blob.upload_from_filename(file_name)

    feature_retrieval_job = client.get_batch_features(
        entity_rows=f"{gcs_path}{ts}/*",
        feature_ids=["gcs_feature_set:1:feature_value"])

    output = feature_retrieval_job.to_dataframe()
    print(output.head())

    assert output["entity_id"].to_list() == [
        int(i) for i in output["gcs_feature_set_v1_feature_value"].to_list()
    ]
Example #21
def converter_csv_to_avro(INPUT_PATH, OUTPUT_PATH, converter_to_datetime):
    df = pd.read_csv(INPUT_PATH)

    # Transform string columns to datetime
    for columns_to_converter in converter_to_datetime:
        df[columns_to_converter] = pd.to_datetime(df[columns_to_converter])

    print(df.info())

    pdx.to_avro(OUTPUT_PATH, df)  # Convert
    saved = pdx.read_avro(OUTPUT_PATH)  # Read back only to verify

    print(saved)

    return
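A usage sketch; the file paths and column names are made-up examples:

converter_csv_to_avro(
    INPUT_PATH="events.csv",
    OUTPUT_PATH="events.avro",
    converter_to_datetime=["created_at", "updated_at"],
)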
Example #22
def export_dataframe_to_local(
        df: pd.DataFrame,
        dir_path: Optional[str] = None) -> Tuple[str, str, str]:
    """
    Exports a pandas DataFrame to the local filesystem.

    Args:
        df (pd.DataFrame):
            Pandas DataFrame to save.

        dir_path (Optional[str]):
            Absolute directory path '/data/project/subfolder/'.

    Returns:
        Tuple[str, str, str]:
            Tuple of directory path, file name and destination path. The
            destination path can be obtained by concatenating the directory
            path and file name.
    """

    # Create local staging location if not provided
    if dir_path is None:
        dir_path = tempfile.mkdtemp()

    file_name = _get_file_name()
    dest_path = f"{dir_path}/{file_name}"

    # Temporarily rename datetime column to event_timestamp. Ideally we would
    # force the schema with our avro writer instead.
    df.columns = [
        "event_timestamp" if col == "datetime" else col for col in df.columns
    ]

    try:
        # Export dataset to file in local path
        to_avro(df=df, file_path_or_buffer=dest_path)
    except Exception:
        raise
    finally:
        # Revert event_timestamp column to datetime
        df.columns = [
            "datetime" if col == "event_timestamp" else col
            for col in df.columns
        ]

    return dir_path, file_name, dest_path
Example #23
def export_dataframe_to_local(df: pd.DataFrame,
                              dir_path: Optional[str] = None):
    """
    Exports a pandas dataframe to the local filesystem
    :param df: Pandas dataframe to save
    :param dir_path: (optional) Absolute directory path '/data/project/subfolder/'
    :return:
    """
    # Create local staging location if not provided
    if dir_path is None:
        dir_path = tempfile.mkdtemp()

    file_name = f'{datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")}_{str(uuid.uuid4())[:8]}.avro'
    dest_path = f"{dir_path}/{file_name}"

    # Export dataset to file in local path
    to_avro(df=df, file_path_or_buffer=dest_path)
    return dir_path, file_name, dest_path
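A usage sketch with a made-up frame, letting the function create its own temporary staging directory:

import pandas as pd

df = pd.DataFrame({"entity_id": [1, 2], "value": ["a", "b"]})
dir_path, file_name, dest_path = export_dataframe_to_local(df)
print(dest_path)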
def test_dataframe_kwargs(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    # include columns
    columns = ['Boolean', 'Int64']
    expect = pdx.read_avro(tf.name, columns=columns)
    df = dataframe[columns]
    assert_frame_equal(expect, df)
    # exclude columns
    columns = ['String', 'Boolean']
    expect = pdx.read_avro(tf.name, exclude=columns)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    df = dataframe.drop(columns, axis=1)
    assert_frame_equal(expect, df)
    # specify index
    index = 'String'
    expect = pdx.read_avro(tf.name, index=index)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    df = dataframe.set_index(index)
    assert_frame_equal(expect, df)
Example #25
def test_batch_get_batch_features_with_file(client):
    file_fs1 = client.get_feature_set(name="file_feature_set")

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": [i for i in range(N_ROWS)],
        "feature_value1": [f"{i}" for i in range(N_ROWS)],
    })
    client.ingest(file_fs1, features_1_df, timeout=480)

    # Add a one-second buffer to the event timestamps to avoid rounding errors
    features_1_df['datetime'] = features_1_df['datetime'] + pd.Timedelta(seconds=1)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(
        columns={"datetime": "event_timestamp"})

    to_avro(
        df=features_1_df[["event_timestamp", "entity_id"]],
        file_path_or_buffer="file_feature_set.avro",
    )

    time.sleep(15)
    feature_retrieval_job = client.get_batch_features(
        entity_rows="file://file_feature_set.avro",
        feature_refs=["feature_value1"],
        project=PROJECT_NAME,
    )

    output = feature_retrieval_job.to_dataframe()
    clean_up_remote_files(feature_retrieval_job.get_avro_files())
    print(output.head())

    assert output["entity_id"].to_list() == [
        int(i) for i in output["feature_value1"].to_list()
    ]
def test_get_batch_features_with_file(client):
    file_fs1 = FeatureSet(
        "file_feature_set",
        features=[Feature("feature_value", ValueType.STRING)],
        entities=[Entity("entity_id", ValueType.INT64)],
        max_age=Duration(seconds=100),
    )

    client.apply(file_fs1)
    file_fs1 = client.get_feature_set(name="file_feature_set", version=1)

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": [i for i in range(N_ROWS)],
        "feature_value": [f"{i}" for i in range(N_ROWS)],
    })
    client.ingest(file_fs1, features_1_df)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(
        columns={"datetime": "event_timestamp"})

    to_avro(df=features_1_df, file_path_or_buffer="file_feature_set.avro")

    feature_retrieval_job = client.get_batch_features(
        entity_rows="file://file_feature_set.avro",
        feature_ids=["file_feature_set:1:feature_value"])

    output = feature_retrieval_job.to_dataframe()
    print(output.head())

    assert output["entity_id"].to_list() == [
        int(i) for i in output["file_feature_set_v1_feature_value"].to_list()
    ]
Example #27
def export_dataframe_to_local(df: pd.DataFrame,
                              dir_path: Optional[str] = None):
    """
    Exports a pandas dataframe to the local filesystem

    Args:
        df: Pandas dataframe to save
        dir_path: (optional) Absolute directory path '/data/project/subfolder/'
    """

    # Create local staging location if not provided
    if dir_path is None:
        dir_path = tempfile.mkdtemp()

    file_name = f'{datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")}_{str(uuid.uuid4())[:8]}.avro'
    dest_path = f"{dir_path}/{file_name}"

    # Temporarily rename datetime column to event_timestamp. Ideally we would
    # force the schema with our avro writer instead.
    df.columns = [
        "event_timestamp" if col == "datetime" else col for col in df.columns
    ]

    try:
        # Export dataset to file in local path
        to_avro(df=df, file_path_or_buffer=dest_path)
    except Exception:
        raise
    finally:
        # Revert event_timestamp column to datetime
        df.columns = [
            "datetime" if col == "event_timestamp" else col
            for col in df.columns
        ]

    return dir_path, file_name, dest_path
Example #28
    def avro_data_path(self):
        final_results = tempfile.mktemp()
        pandavro.to_avro(file_path_or_buffer=final_results, df=TEST_DATA_FRAME)
        return final_results
Example #29
def test_delegation(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.from_avro(tf.name)
    assert_frame_equal(expect, dataframe)
Example #30
def test_file_path_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.read_avro(tf.name)
    assert_frame_equal(expect, dataframe)