Example #1
def test_bad_table_mode_raises(df, tmp_hyper):
    msg = "'table_mode' must be either 'w' or 'a'"
    with pytest.raises(ValueError, match=msg):
        pantab.frame_to_hyper(df, tmp_hyper, table="test", table_mode="x")

    with pytest.raises(ValueError, match=msg):
        pantab.frames_to_hyper({"a": df}, tmp_hyper, table_mode="x")
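Note: the test examples in this listing reference pytest fixtures (df, tmp_hyper, table_name, table_mode) defined elsewhere in pantab's test suite. A minimal conftest.py sketch of what those fixtures might look like follows; this is an assumption to help read the tests, not pantab's actual conftest (the suite's sample frame carries one column per supported dtype, as Example #6 suggests).

import pandas as pd
import pytest


@pytest.fixture
def df():
    # Hypothetical stand-in; the real fixture covers many more dtypes
    return pd.DataFrame({"int16": [1, 6, 0]}).astype({"int16": "int16"})


@pytest.fixture
def tmp_hyper(tmp_path):
    # Path to a throwaway .hyper file under pytest's tmp_path
    return tmp_path / "test.hyper"


@pytest.fixture(params=["a_table"])
def table_name(request):
    return request.param


@pytest.fixture(params=["w", "a"])
def table_mode(request):
    # "w" overwrites the target table, "a" appends to it
    return request.param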
Example #2
def test_append_mode_raises_column_dtype_mismatch(df, tmp_hyper, table_name):
    pantab.frame_to_hyper(df, tmp_hyper, table=table_name)

    df["int16"] = df["int16"].astype(np.int64)
    msg = "^Mismatched column definitions:"
    with pytest.raises(TypeError, match=msg):
        pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode="a")
Example #3
def test_bad_value_gives_clear_message(tmp_hyper):
    df = pd.DataFrame([[{"a": "b"}]], columns=["a"])

    msg = r"Invalid value \"{'a': 'b'}\" found \(row 0 column 0\)"

    with pytest.raises(TypeError, match=msg):
        pantab.frame_to_hyper(df, tmp_hyper, table="test")
Example #4
def test_append_mode_raises_column_mismatch(df, tmp_hyper, table_name):
    pantab.frame_to_hyper(df, tmp_hyper, table=table_name)

    df = df.drop("object", axis=1)
    msg = "^Mismatched column definitions:"
    with pytest.raises(TypeError, match=msg):
        pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode="a")
Example #5
def test_read_doesnt_modify_existing_file(df, tmp_hyper):
    pantab.frame_to_hyper(df, tmp_hyper, table="test")
    last_modified = tmp_hyper.stat().st_mtime

    # Try out our read methods
    pantab.frame_from_hyper(tmp_hyper, table="test")
    pantab.frames_from_hyper(tmp_hyper)

    # Neither should update file stats
    assert last_modified == tmp_hyper.stat().st_mtime
Example #6
    def setup_cache(self):
        nrows = 10_000
        data = [[
            1,
            2,
            3,
            1,
            2,
            3,
            4.0,
            5.0,
            True,
            pd.to_datetime("2018-01-01"),
            pd.to_datetime("2018-01-01", utc=True),
            pd.Timedelta("1 days 2 hours 3 minutes 4 seconds"),
            "foo",
        ]] * nrows

        df = pd.DataFrame(
            data,
            columns=[
                "int16",
                "int32",
                "int64",
                "Int16",
                "Int32",
                "Int64",
                "float32",
                "float64",
                "bool",
                "datetime64",
                "datetime64_utc",
                "timedelta64",
                "object",
            ],
        )

        df = df.astype({
            "int16": np.int16,
            "int32": np.int32,
            "int64": np.int64,
            "float32": np.float32,
            "float64": np.float64,
            "bool": np.bool,
            "datetime64": "datetime64[ns]",
            "datetime64_utc": "datetime64[ns, UTC]",
            "timedelta64": "timedelta64[ns]",
            "object": "object",
        })

        path = "test.hyper"
        pantab.frame_to_hyper(df, path, table="test")
        return df
Example #7
def test_duplicate_columns_raises(tmp_hyper):
    df = pd.DataFrame([[1, 1]], columns=[1, 1])
    with pytest.raises(
            tab_api.hyperexception.HyperException,
            match="column '1' specified more than once",
    ):
        pantab.frame_to_hyper(df, tmp_hyper, table="foo")

    with pytest.raises(
            tab_api.hyperexception.HyperException,
            match="column '1' specified more than once",
    ):
        pantab.frames_to_hyper({"test": df}, tmp_hyper)
Example #8
def test_read_query(df, tmp_hyper):
    pantab.frame_to_hyper(df, tmp_hyper, table="test")

    query = "SELECT int16 AS i, '_' || int32 AS _i2 FROM test"
    result = pantab.frame_from_hyper_query(tmp_hyper, query)

    expected = pd.DataFrame(
        [[1, "_2"], [6, "_7"], [0, "_0"]],
        columns=["i", "_i2"],
    )
    str_type = "string" if compat.PANDAS_100 else "object"
    expected = expected.astype({"i": "Int16", "_i2": str_type})

    tm.assert_frame_equal(result, expected)
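frame_from_hyper_query pushes SQL down to the Hyper engine (which speaks a PostgreSQL-like dialect) instead of filtering in pandas after the fact. A small follow-on sketch, assuming the same "test" table with an int16 column has already been written to "test.hyper" as above:

import pantab

# Aggregate inside Hyper rather than in pandas; "test.hyper" and its
# "test" table are assumed to exist from a prior frame_to_hyper call
agg = pantab.frame_from_hyper_query(
    "test.hyper",
    "SELECT COUNT(*) AS n, SUM(int16) AS total FROM test",
)
print(agg)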
Example #9
def execute_refresh(rest_helper, config, query_text, socketio=None):
    TABLE_NAME = 'google_places'

    # get data
    MAIN_LOGGER.info(
        f'Refreshing Google Places Extract Based on Query: [{query_text}]...')
    if socketio:
        socketio.emit(
            'push-message',
            f'Refreshing Extract Data Based on <br/> Query: [{query_text}]...',
            broadcast=True)
        socketio.emit('push-message',
                      'Querying Google Places API...',
                      broadcast=True)
    extract_data_df = get_google_places_dataframe(config, query_text)

    # create hyper extract and publish
    hyper_file_path = os.path.join(file_paths.DATA_STAGING_DIR,
                                   'GooglePlacesData.hyper')

    if socketio:
        socketio.emit('push-message',
                      'Creating New Hyper File...',
                      broadcast=True)
    pantab.frame_to_hyper(extract_data_df,
                          hyper_file_path,
                          table=TABLE_NAME,
                          table_mode='a')

    target_datasource_name = config['target_datasource_name']
    target_project_name = config['target_project_name']
    success = rest_helper.publish_hyper(hyper_file_path,
                                        target_datasource_name,
                                        target_project_name)
    if socketio:
        socketio.emit(
            'push-message',
            f'Published Datasource as <br/>"{target_datasource_name}"...',
            broadcast=True)

    MAIN_LOGGER.info(
        f'Call to publish {target_datasource_name} datasource returned {success}'
    )

    MAIN_LOGGER.info('Task Execution Completed')
    if socketio:
        socketio.emit('push-message',
                      'Extract Task Completed',
                      broadcast=True)
Example #10
def test_months_in_interval_raises(df, tmp_hyper, monkeypatch):
    # Monkeypatch a new constructor that hard codes months
    def __init__(self, months: int, days: int, microseconds: int):
        self.months = 1
        self.days = days
        self.microseconds = microseconds

    monkeypatch.setattr(pantab._writer.tab_api.Interval, "__init__", __init__)
    pantab.frame_to_hyper(df, tmp_hyper, table="test")
    with pytest.raises(ValueError,
                       match=r"Cannot read Intervals with month components\."):
        pantab.frame_from_hyper(tmp_hyper, table="test")

    with pytest.raises(ValueError,
                       match=r"Cannot read Intervals with month components\."):
        pantab.frames_from_hyper(tmp_hyper)
Example #11
def get_data():
    d = datetime.today() - timedelta(days=1)
    fn = str(d.date()) + '-ourworldindata-org-covid-ecdc-full-data.csv'
    hyper_name = 'COVID-19.hyper'
    print('- Archiving yesterday\'s dataset as {}'.format(fn))
    # !cp 'ourworldindata-org-covid-ecdc-full-data.csv' {fn}
    print('- Downloading today\'s dataset')
    # !curl 'https://covid.ourworldindata.org/data/ecdc/full_data.csv' > 'ourworldindata-org-covid-ecdc-full-data.csv'
    print('- Reading data into dataframe and adding rolling calculations')
    df = pd.read_csv('ourworldindata-org-covid-ecdc-full-data.csv',
                     parse_dates=['date'])

    # add calculations
    df.set_index('date', inplace=True)
    countries = df['location'].unique()
    frames = []
    for c in countries:
        # total cases related
        temp_df = df.loc[df['location'] == c].copy()
        temp_df['total_cases_7day_avg'] = temp_df['total_cases'].rolling(
            7).mean()
        temp_df['total_cases_7day_pct_change'] = temp_df[
            'total_cases_7day_avg'].pct_change()
        temp_df['total_cases_doubling_days'] = temp_df[
            'total_cases_7day_pct_change'].apply(
                lambda x: np.log(2.0) / np.log(1.0 + x))
        # total deaths related
        temp_df['total_deaths_7day_avg'] = temp_df['total_deaths'].rolling(
            7).mean()
        temp_df['total_deaths_7day_pct_change'] = temp_df[
            'total_deaths_7day_avg'].pct_change()
        temp_df['total_deaths_doubling_days'] = temp_df[
            'total_deaths_7day_pct_change'].apply(
                lambda x: np.log(2.0) / np.log(1.0 + x))
        frames.append(temp_df)
    df = pd.concat(frames)
    df.reset_index(inplace=True)

    print('- Exporting dataframe as hyper extract {}'.format(hyper_name))
    pantab.frame_to_hyper(df,
                          hyper_name,
                          table='ECDC Worldwide (via Our World in Data.org)')
    print('- Returning dataframe for further use\n')

    return df
Example #12
def test_basic(df, tmp_hyper, table_name, table_mode):
    # Write twice; depending on mode this should either overwrite or duplicate entries
    pantab.frame_to_hyper(df,
                          tmp_hyper,
                          table=table_name,
                          table_mode=table_mode)
    pantab.frame_to_hyper(df,
                          tmp_hyper,
                          table=table_name,
                          table_mode=table_mode)
    result = pantab.frame_from_hyper(tmp_hyper, table=table_name)

    expected = df.copy()

    if table_mode == "a":
        expected = pd.concat([expected, expected]).reset_index(drop=True)

    assert_roundtrip_equal(result, expected)
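Because table_mode="a" appends to an existing table, a common pattern is building a .hyper file in chunks: write the first chunk with "w" and the rest with "a". A minimal sketch under that assumption; as Examples #2 and #4 show, appended chunks must match the existing column definitions or pantab raises a TypeError.

import pandas as pd
import pantab

chunks = [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3, 4]})]

# First write creates the table; later writes append to it
for i, chunk in enumerate(chunks):
    pantab.frame_to_hyper(
        chunk,
        "chunked.hyper",
        table="data",
        table_mode="w" if i == 0 else "a",
    )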
Example #13
def export_dataframe_tableau(dataframe, filename="dataframe.hyper"):
    """
    Export a Pandas dataframe to a Tableau file.

    :param dataframe: Pandas dataframe.
    :param filename: Python string.
    :return: None.
    """
    global result
    import pantab
    # TODO: Fix ModuleNotFoundError: No module named 'tableauhyperapi'
    result = dataframe.to_json(orient='split')
    pantab.frame_to_hyper(dataframe, filename)
    with open(filename, "rb") as binary_file:
        file_bin = binary_file.read()
    assert file_bin is not None
    result = file_bin
    resultMetadata.put("file.extension", ".hyper")
    resultMetadata.put("file.name", filename)
    resultMetadata.put("content.type", "application/octet-stream")
Example #14
def test_missing_data(tmp_hyper, table_name, table_mode):
    df = pd.DataFrame([[np.nan], [1]], columns=list("a"))
    df["b"] = pd.Series([None, np.nan], dtype=object)  # no inference
    df["c"] = pd.Series([np.nan, "c"])

    pantab.frame_to_hyper(df,
                          tmp_hyper,
                          table=table_name,
                          table_mode=table_mode)
    pantab.frame_to_hyper(df,
                          tmp_hyper,
                          table=table_name,
                          table_mode=table_mode)

    result = pantab.frame_from_hyper(tmp_hyper, table=table_name)
    expected = pd.DataFrame([[np.nan, np.nan, np.nan], [1, np.nan, "c"]],
                            columns=list("abc"))
    if table_mode == "a":
        expected = pd.concat([expected, expected]).reset_index(drop=True)

    tm.assert_frame_equal(result, expected)
Example #15
    def test_roundtrip(self, df):
        test_data = os.path.join(self.data_dir, 'test.hyper')
        with open(test_data, 'rb') as infile:
            data = infile.read()

        # Ideally we could just use a buffer, but the Tableau SDK
        # requires a physical string to be passed to the Extract object.
        # Because it creates more than just the .hyper file, we need to
        # create a temporary directory for it to write to.
        with tempfile.TemporaryDirectory() as tmp:
            fn = os.path.join(tmp, 'test.hyper')
            pantab.frame_to_hyper(df, fn)
            comp = pantab.frame_from_hyper(fn)

        # Because Tableau only supports the 64 bit variants, upcast the
        # particular df dtypes that are lower bit
        df = df.astype({
            'foo': np.int64,
            'bar': np.int64,
            'qux': np.float64,
        })

        tm.assert_frame_equal(df, comp)
Example #16
def test_error_on_first_column(df, tmp_hyper, monkeypatch):
    """
    We had a defect where pantab segfaulted when an error occurred in one of
    the first two columns. This test case is a regression test against that.
    """
    # Monkeypatch a new constructor that hard codes months
    def __init__(self, months: int, days: int, microseconds: int):
        self.months = 1
        self.days = days
        self.microseconds = microseconds

    monkeypatch.setattr(pantab._writer.tab_api.Interval, "__init__", __init__)

    df = pd.DataFrame(
        [[pd.Timedelta("1 days 2 hours 3 minutes 4 seconds")]],
        columns=["timedelta64"],
    ).astype({"timedelta64": "timedelta64[ns]"})
    pantab.frame_to_hyper(df, tmp_hyper, table="test")

    with pytest.raises(
        ValueError, match=r"Cannot read Intervals with month components\."
    ):
        pantab.frame_from_hyper(tmp_hyper, table="test")
Example #17
def test_roundtrip_with_external_hyper_process(df, tmp_hyper):
    default_log_path = Path.cwd() / "hyperd.log"
    if default_log_path.exists():
        default_log_path.unlink()

    # By passing in a pre-spawned HyperProcess, one can e.g. avoid creating a log file
    parameters = {"log_config": ""}
    with HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU,
                      parameters=parameters) as hyper:
        # test frame_to_hyper/frame_from_hyper
        pantab.frame_to_hyper(df, tmp_hyper, table="test", hyper_process=hyper)
        result = pantab.frame_from_hyper(tmp_hyper,
                                         table="test",
                                         hyper_process=hyper)
        assert_roundtrip_equal(result, df)

        # test frame_from_hyper_query
        result = pantab.frame_from_hyper_query(tmp_hyper,
                                               "SELECT * FROM test",
                                               hyper_process=hyper)
        assert result.size == 63

        # test frames_to_hyper/frames_from_hyper
        pantab.frames_to_hyper({
            "test2": df,
            "test": df
        },
                               tmp_hyper,
                               hyper_process=hyper)
        result = pantab.frames_from_hyper(tmp_hyper, hyper_process=hyper)
        assert set(result.keys()) == set(
            (TableName("public", "test"), TableName("public", "test2")))
        for val in result.values():
            assert_roundtrip_equal(val, df)

    assert not default_log_path.exists()
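As the assertion above shows, frames_from_hyper keys its result dict by tableauhyperapi.TableName rather than by plain strings. A minimal sketch without an external process, assuming pantab spawns its own HyperProcess when none is passed:

import pandas as pd
import pantab
from tableauhyperapi import TableName

# Write two tables into one .hyper file in a single call
pantab.frames_to_hyper(
    {"sales": pd.DataFrame({"x": [1]}), "costs": pd.DataFrame({"x": [2]})},
    "multi.hyper",
)

result = pantab.frames_from_hyper("multi.hyper")
sales = result[TableName("public", "sales")]  # keys are TableName objects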
Example #18
def get_data(rest_url):
    '''Ping a REST API, normalize hardcoded fields from the response into a
    DataFrame, and write the result to a .hyper file.

    THIS WILL NEED TO BE MODIFIED BASED ON YOUR API!
    '''

    # Get everything ready and create GET request
    query_string = "?per_page=100"
    print("Call: " + rest_url + query_string)

    # Use requests to grab the data and pull into JSON object
    json_call = requests.get(url=rest_url + query_string, headers=headers)
    response = json.loads(json_call.text)

    # Load data into df
    df = pd.json_normalize(response[json_object])

    # Set the table name for the hyper file
    table_name = hyper_name.replace('.hyper', '')

    # Use pantab library to easily create hyper file
    pantab.frame_to_hyper(df=df, database=hyper_name, table=table_name)
Example #19
def test_failed_write_doesnt_overwrite_file(df, tmp_hyper, monkeypatch,
                                            table_mode):
    pantab.frame_to_hyper(df, tmp_hyper, table="test", table_mode=table_mode)
    last_modified = tmp_hyper.stat().st_mtime

    # Let's patch the Inserter to fail on creation
    def failure(*args, **kwargs):
        raise ValueError("dummy failure")

    monkeypatch.setattr(pantab._writer.tab_api,
                        "Inserter",
                        failure,
                        raising=True)

    # Try out our write methods
    with pytest.raises(ValueError, match="dummy failure"):
        pantab.frame_to_hyper(df,
                              tmp_hyper,
                              table="test",
                              table_mode=table_mode)

    with pytest.raises(ValueError, match="dummy failure"):
        pantab.frames_to_hyper({"test": df}, tmp_hyper, table_mode=table_mode)

    # Neither should update file stats
    assert last_modified == tmp_hyper.stat().st_mtime
Example #20
if OUTPUT_TYPE == "CSV":
    #result = dataframe.to_csv(encoding='utf-8', index=False)
    result = dataframe.to_csv(index=False)
    resultMetadata.put("file.extension", ".csv")
    resultMetadata.put("file.name", "dataframe.csv")
    resultMetadata.put("content.type", "text/csv")

if OUTPUT_TYPE == "JSON":
    result = dataframe.to_json(orient='split')
    resultMetadata.put("file.extension", ".json")
    resultMetadata.put("file.name", "dataframe.json")
    resultMetadata.put("content.type", "application/json")

if OUTPUT_TYPE == "TABLEAU":
    result = dataframe.to_json(orient='split')
    pantab.frame_to_hyper(dataframe, "dataframe.hyper")
    FILE_BIN = None
    with open("dataframe.hyper", "rb") as binary_file:
        FILE_BIN = binary_file.read()
    assert FILE_BIN is not None
    result = FILE_BIN
    resultMetadata.put("file.extension", ".hyper")
    resultMetadata.put("file.name", "dataframe.hyper")
    resultMetadata.put("content.type", "application/octet-stream")

if OUTPUT_TYPE == "HTML":
    LIMIT_OUTPUT_VIEW = variables.get("LIMIT_OUTPUT_VIEW")
    LIMIT_OUTPUT_VIEW = 5 if LIMIT_OUTPUT_VIEW is None else int(
        LIMIT_OUTPUT_VIEW)
    if LIMIT_OUTPUT_VIEW > 0:
        print("task result limited to: ", LIMIT_OUTPUT_VIEW, " rows")
Example #21
    def test_frame_to_file_raises_extract(self, df):
        with pytest.raises(ValueError,
                           match="The Tableau SDK currently only"
                                 " supports a table name of 'Extract'"):
            pantab.frame_to_hyper(df, 'foo.hyper', table='foo')
Example #22
        # create the project
        new_project = server.projects.create(new_project)
        # Create new datasource_item with project id '3a8b6148-493c-11e6-a621-6f3499394a39'

    #save dataframe into a .hyper file
    input_variables = {'task.dataframe_id': None}
    for key in input_variables.keys():
        for res in results:
            value = res.getMetadata().get(key)
            if value is not None:
                input_variables[key] = value
                break

    dataframe_id = input_variables['task.dataframe_id']
    print("dataframe id (in): ", dataframe_id)

    dataframe_json = variables.get(dataframe_id)
    assert dataframe_json is not None
    dataframe_json = bz2.decompress(dataframe_json).decode()

    dataframe = pd.read_json(dataframe_json, orient='split')
    output_file = OUTPUT_FILE_NAME
    pantab.frame_to_hyper(dataframe, output_file)

    # Create new datasource_item
    new_datasource = TSC.DatasourceItem(project_id)
    new_datasource = server.datasources.publish(new_datasource, output_file,
                                                'CreateNew')
    print(new_datasource.__dict__)

print("END " + __file__)
Example #23
def test_unsupported_dtype_raises(dtype, tmp_hyper):
    df = pd.DataFrame([[1]], dtype=dtype)

    msg = re.escape(f"Conversion of '{dtype}' dtypes not supported!")
    with pytest.raises(TypeError, match=msg):
        pantab.frame_to_hyper(df, tmp_hyper, table="test")
Example #24
    def time_write_frame(self):
        pantab.frame_to_hyper(self.df, "dummy.hyper", table="dummy")
Example #25
        # create a new project item
        new_project = TSC.ProjectItem(name=PROJECT_NAME,
                                      content_permissions='LockedToProject',
                                      id='LockedToProject')
        # create the project
        new_project = server.projects.create(new_project)
        # Create new datasource_item with project id '3a8b6148-493c-11e6-a621-6f3499394a39'

    #save dataframe into a .hyper file
    input_variables = {'task.dataframe_id': None}
    for key in input_variables.keys():
        for res in results:
            value = res.getMetadata().get(key)
            if value is not None:
                input_variables[key] = value
                break

    dataframe_id = input_variables['task.dataframe_id']
    print("dataframe id (in): ", dataframe_id)

    dataframe_json = variables.get(dataframe_id)
    assert dataframe_json is not None
    dataframe_json = bz2.decompress(dataframe_json).decode()

    dataframe = pd.read_json(dataframe_json, orient='split')
    pantab.frame_to_hyper(dataframe, INPUT_FILE_NAME)

    # Create new datasource_item
    new_datasource = TSC.DatasourceItem(project_id)
    new_datasource = server.datasources.publish(new_datasource,
                                                INPUT_FILE_NAME,
                                                'CreateNew')
    print(new_datasource.__dict__)

print("END " + __file__)
Example #26
import time

import numpy as np
import pandas as pd

import pantab

if __name__ == '__main__':
    df = pd.DataFrame(np.ones((100_000_000, 1)), columns=["a"])
    start = time.time()
    pantab.frame_to_hyper(df, "test.hyper", table="test")
    end = time.time()
    print(f"Execution took: {end - start}")