def test_bad_table_mode_raises(df, tmp_hyper): msg = "'table_mode' must be either 'w' or 'a'" with pytest.raises(ValueError, match=msg): pantab.frame_to_hyper(df, tmp_hyper, table="test", table_mode="x") with pytest.raises(ValueError, match=msg): pantab.frames_to_hyper({"a": df}, tmp_hyper, table_mode="x")
def test_append_mode_raises_column_dtype_mismatch(df, tmp_hyper, table_name): pantab.frame_to_hyper(df, tmp_hyper, table=table_name) df["int16"] = df["int16"].astype(np.int64) msg = "^Mismatched column definitions:" with pytest.raises(TypeError, match=msg): pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode="a")
def test_bad_value_gives_clear_message(tmp_hyper): df = pd.DataFrame([[{"a": "b"}]], columns=["a"]) msg = r"Invalid value \"{'a': 'b'}\" found \(row 0 column 0\)" with pytest.raises(TypeError, match=msg): pantab.frame_to_hyper(df, tmp_hyper, table="test")
def test_append_mode_raises_column_mismatch(df, tmp_hyper, table_name): pantab.frame_to_hyper(df, tmp_hyper, table=table_name) df = df.drop("object", axis=1) msg = "^Mismatched column definitions:" with pytest.raises(TypeError, match=msg): pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode="a")
def test_read_doesnt_modify_existing_file(df, tmp_hyper): pantab.frame_to_hyper(df, tmp_hyper, table="test") last_modified = tmp_hyper.stat().st_mtime # Try out our read methods pantab.frame_from_hyper(tmp_hyper, table="test") pantab.frames_from_hyper(tmp_hyper) # Neither should not update file stats assert last_modified == tmp_hyper.stat().st_mtime
def setup_cache(self): nrows = 10_000 data = [[ 1, 2, 3, 1, 2, 3, 4.0, 5.0, True, pd.to_datetime("2018-01-01"), pd.to_datetime("2018-01-01", utc=True), pd.Timedelta("1 days 2 hours 3 minutes 4 seconds"), "foo", ]] * nrows df = pd.DataFrame( data, columns=[ "int16", "int32", "int64", "Int16", "Int32", "Int64", "float32", "float64", "bool", "datetime64", "datetime64_utc", "timedelta64", "object", ], ) df = df.astype({ "int16": np.int16, "int32": np.int32, "int64": np.int64, "float32": np.float32, "float64": np.float64, "bool": np.bool, "datetime64": "datetime64[ns]", "datetime64_utc": "datetime64[ns, UTC]", "timedelta64": "timedelta64[ns]", "object": "object", }) path = "test.hyper" pantab.frame_to_hyper(df, path, table="test") return df
def test_duplicate_columns_raises(tmp_hyper): df = pd.DataFrame([[1, 1]], columns=[1, 1]) with pytest.raises( tab_api.hyperexception.HyperException, match="column '1' specified more than once", ): pantab.frame_to_hyper(df, tmp_hyper, table="foo") with pytest.raises( tab_api.hyperexception.HyperException, match="column '1' specified more than once", ): pantab.frames_to_hyper({"test": df}, tmp_hyper)
def test_read_query(df, tmp_hyper): pantab.frame_to_hyper(df, tmp_hyper, table="test") query = "SELECT int16 AS i, '_' || int32 AS _i2 FROM test" result = pantab.frame_from_hyper_query(tmp_hyper, query) expected = pd.DataFrame( [[1, "_2"], [6, "_7"], [0, "_0"]], columns=["i", "_i2"], ) str_type = "string" if compat.PANDAS_100 else "object" expected = expected.astype({"i": "Int16", "_i2": str_type}) tm.assert_frame_equal(result, expected)
def execute_refresh(rest_helper, config, query_text, socketio=None): TABLE_NAME = 'google_places' # get data MAIN_LOGGER.info( f'Refreshing Google Places Extract Based on Query: [{query_text}]...') if socketio: socketio.emit( 'push-message', f'Refreshing Extract Data Based on <br/> Query: [{query_text}]...', broadcast=True) socketio.emit('push-message', f'Querying Google Places API...', broadcast=True) extract_data_df = get_google_places_dataframe(config, query_text) # create hyper extract and publish hyper_file_path = os.path.join(file_paths.DATA_STAGING_DIR, f'GooglePlacesData.hyper') if socketio: socketio.emit('push-message', f'Creating New Hyper File...', broadcast=True) pantab.frame_to_hyper(extract_data_df, hyper_file_path, table=TABLE_NAME, table_mode='a') target_datasource_name = config['target_datasource_name'] target_project_name = config['target_project_name'] success = rest_helper.publish_hyper(hyper_file_path, target_datasource_name, target_project_name) if socketio: socketio.emit( 'push-message', f'Published Datasource as <br/>"{target_datasource_name}"...', broadcast=True) MAIN_LOGGER.info( f'Call to publish {target_datasource_name} datasource returned {success}' ) MAIN_LOGGER.info(f'Task Execution Completed') if socketio: socketio.emit('push-message', f'Extract Task Completed', broadcast=True)
def test_months_in_interval_raises(df, tmp_hyper, monkeypatch): # Monkeypatch a new constructor that hard codes months def __init__(self, months: int, days: int, microseconds: int): self.months = 1 self.days = days self.microseconds = microseconds monkeypatch.setattr(pantab._writer.tab_api.Interval, "__init__", __init__) pantab.frame_to_hyper(df, tmp_hyper, table="test") with pytest.raises(ValueError, match=r"Cannot read Intervals with month components\."): pantab.frame_from_hyper(tmp_hyper, table="test") with pytest.raises(ValueError, match=r"Cannot read Intervals with month components\."): pantab.frames_from_hyper(tmp_hyper)
def get_data(): d = datetime.today() - timedelta(days=1) fn = str(d.date()) + '-ourworldindata-org-covid-ecdc-full-data.csv' hyper_name = 'COVID-19.hyper' print('- Archiving yesterday\'s dataset as {}'.format(fn)) # !cp 'ourworldindata-org-covid-ecdc-full-data.csv' {fn} print('- Downloading today\'s dataset') # !curl 'https://covid.ourworldindata.org/data/ecdc/full_data.csv' > 'ourworldindata-org-covid-ecdc-full-data.csv' print('- Reading data into dataframe and adding rolling calculations') df = pd.read_csv('ourworldindata-org-covid-ecdc-full-data.csv', parse_dates=['date']) # add calculations df.set_index('date', inplace=True) countries = df['location'].unique() frames = [] for c in countries: # total cases related temp_df = df.loc[df['location'] == c].copy() temp_df['total_cases_7day_avg'] = temp_df['total_cases'].rolling( 7).mean() temp_df['total_cases_7day_pct_change'] = temp_df[ 'total_cases_7day_avg'].pct_change() temp_df['total_cases_doubling_days'] = temp_df[ 'total_cases_7day_pct_change'].apply( lambda x: np.log(2.0) / np.log(1.0 + x)) # total deaths related temp_df['total_deaths_7day_avg'] = temp_df['total_deaths'].rolling( 7).mean() temp_df['total_deaths_7day_pct_change'] = temp_df[ 'total_deaths_7day_avg'].pct_change() temp_df['total_deaths_doubling_days'] = temp_df[ 'total_deaths_7day_pct_change'].apply( lambda x: np.log(2.0) / np.log(1.0 + x)) frames.append(temp_df) df = pd.concat(frames) df.reset_index(inplace=True) print('- Exporting dataframe as hyper extract {}'.format(hyper_name)) pantab.frame_to_hyper(df, hyper_name, table='ECDC Worldwide (via Our World in Data.org)') print('- Returning dataframe for further use\n') return df
def test_basic(df, tmp_hyper, table_name, table_mode): # Write twice; depending on mode this should either overwrite or duplicate entries pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode=table_mode) pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode=table_mode) result = pantab.frame_from_hyper(tmp_hyper, table=table_name) expected = df.copy() if table_mode == "a": expected = pd.concat([expected, expected]).reset_index(drop=True) assert_roundtrip_equal(result, expected)
def export_dataframe_tableau(dataframe, filename="dataframe.hyper"): """ Export a Pandas dataframe to a Tableau file. :param dataframe: Pandas dataframe. :param filename: Python string. :return: None. """ global result import pantab # TODO: Fix ModuleNotFoundError: No module named 'tableauhyperapi' result = dataframe.to_json(orient='split') pantab.frame_to_hyper(dataframe, filename) with open(filename, "rb") as binary_file: file_bin = binary_file.read() assert file_bin is not None result = file_bin resultMetadata.put("file.extension", ".hyper") resultMetadata.put("file.name", filename) resultMetadata.put("content.type", "application/octet-stream")
def test_missing_data(tmp_hyper, table_name, table_mode): df = pd.DataFrame([[np.nan], [1]], columns=list("a")) df["b"] = pd.Series([None, np.nan], dtype=object) # no inference df["c"] = pd.Series([np.nan, "c"]) pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode=table_mode) pantab.frame_to_hyper(df, tmp_hyper, table=table_name, table_mode=table_mode) result = pantab.frame_from_hyper(tmp_hyper, table=table_name) expected = pd.DataFrame([[np.nan, np.nan, np.nan], [1, np.nan, "c"]], columns=list("abc")) if table_mode == "a": expected = pd.concat([expected, expected]).reset_index(drop=True) tm.assert_frame_equal(result, expected)
def test_roundtrip(self, df): test_data = os.path.join(self.data_dir, 'test.hyper') with open(test_data, 'rb') as infile: data = infile.read() # Ideally we could just use a buffer, but the Tableau SDK # requires a physical string to be passed to the Extract object # Because it creates more than just the .hyper file, we need to # create a temporary directory for it to write to with tempfile.TemporaryDirectory() as tmp: fn = os.path.join(tmp, 'test.hyper') pantab.frame_to_hyper(df, fn) comp = pantab.frame_from_hyper(fn) # Because Tableau only supports the 64 bit variants, upcast the # particular df dtypes that are lower bit df = df.astype({ 'foo': np.int64, 'bar': np.int64, 'qux': np.float64, }) tm.assert_frame_equal(df, comp)
def test_error_on_first_column(df, tmp_hyper, monkeypatch): """ We had a defect due to which pantab segfaulted when an error occured in one of the first two columns. This test case is a regression test against that. """ # Monkeypatch a new constructor that hard codes months def __init__(self, months: int, days: int, microseconds: int): self.months = 1 self.days = days self.microseconds = microseconds monkeypatch.setattr(pantab._writer.tab_api.Interval, "__init__", __init__) df = pd.DataFrame( [[pd.Timedelta("1 days 2 hours 3 minutes 4 seconds")]], columns=["timedelta64"], ).astype({"timedelta64": "timedelta64[ns]"}) pantab.frame_to_hyper(df, tmp_hyper, table="test") with pytest.raises( ValueError, match=r"Cannot read Intervals with month components\." ): pantab.frame_from_hyper(tmp_hyper, table="test")
def test_roundtrip_with_external_hyper_process(df, tmp_hyper): default_log_path = Path.cwd() / "hyperd.log" if default_log_path.exists(): default_log_path.unlink() # By passing in a pre-spawned HyperProcess, one can e.g. avoid creating a log file parameters = {"log_config": ""} with HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU, parameters=parameters) as hyper: # test frame_to_hyper/frame_from_hyper pantab.frame_to_hyper(df, tmp_hyper, table="test", hyper_process=hyper) result = pantab.frame_from_hyper(tmp_hyper, table="test", hyper_process=hyper) assert_roundtrip_equal(result, df) # test frame_from_hyper_query result = pantab.frame_from_hyper_query(tmp_hyper, "SELECT * FROM test", hyper_process=hyper) assert result.size == 63 # test frames_to_hyper/frames_from_hyper pantab.frames_to_hyper({ "test2": df, "test": df }, tmp_hyper, hyper_process=hyper) result = pantab.frames_from_hyper(tmp_hyper, hyper_process=hyper) assert set(result.keys()) == set( (TableName("public", "test"), TableName("public", "test2"))) for val in result.values(): assert_roundtrip_equal(val, df) assert not default_log_path.exists()
def get_data(rest_url): '''This function pings a REST API and returns hardcoded fields as a flat, specifically ordered, nested list with values and no keys.''' ''' THIS WILL NEED TO BE MODIFIED BASED ON YOUR API! THIS WILL NEED TO BE MODIFIED BASED ON YOUR API! THIS WILL NEED TO BE MODIFIED BASED ON YOUR API! ''' # Get everything ready and create GET request query_string = "?per_page=100" print("Call: " + rest_url + query_string) # Use requests to grab the data and pull into JSON object json_call = requests.get(url=rest_url + query_string, headers=headers) response = json.loads(json_call.text) # Load data into df df = pd.json_normalize(response[json_object]) # Set the table name for the hyper file table_name = hyper_name.replace('.hyper', '') # Use pantab library to easily create hyper file pantab.frame_to_hyper(df=df, database=hyper_name, table=table_name)
def test_failed_write_doesnt_overwrite_file(df, tmp_hyper, monkeypatch, table_mode): pantab.frame_to_hyper(df, tmp_hyper, table="test", table_mode=table_mode) last_modified = tmp_hyper.stat().st_mtime # Let's patch the Inserter to fail on creation def failure(*args, **kwargs): raise ValueError("dummy failure") monkeypatch.setattr(pantab._writer.tab_api, "Inserter", failure, raising=True) # Try out our write methods with pytest.raises(ValueError, match="dummy failure"): pantab.frame_to_hyper(df, tmp_hyper, table="test", table_mode=table_mode) pantab.frames_to_hyper({"test": df}, tmp_hyper, table_mode=table_mode) # Neither should not update file stats assert last_modified == tmp_hyper.stat().st_mtime
if OUTPUT_TYPE == "CSV": #result = dataframe.to_csv(encoding='utf-8', index=False) result = dataframe.to_csv(index=False) resultMetadata.put("file.extension", ".csv") resultMetadata.put("file.name", "dataframe.csv") resultMetadata.put("content.type", "text/csv") if OUTPUT_TYPE == "JSON": result = dataframe.to_json(orient='split', encoding='utf-8') resultMetadata.put("file.extension", ".json") resultMetadata.put("file.name", "dataframe.json") resultMetadata.put("content.type", "application/json") if OUTPUT_TYPE == "TABLEAU": result = dataframe.to_json(orient='split') pantab.frame_to_hyper(dataframe, "dataframe.hyper") FILE_BIN = None with open("dataframe.hyper", "rb") as binary_file: FILE_BIN = binary_file.read() assert FILE_BIN is not None result = FILE_BIN resultMetadata.put("file.extension", ".hyper") resultMetadata.put("file.name", "dataframe.hyper") resultMetadata.put("content.type", "application/octet-stream") if OUTPUT_TYPE == "HTML": LIMIT_OUTPUT_VIEW = variables.get("LIMIT_OUTPUT_VIEW") LIMIT_OUTPUT_VIEW = 5 if LIMIT_OUTPUT_VIEW is None else int( LIMIT_OUTPUT_VIEW) if LIMIT_OUTPUT_VIEW > 0: print("task result limited to: ", LIMIT_OUTPUT_VIEW, " rows")
def test_frame_to_file_raises_extract(self, df): with pytest.raises(ValueError, message="The Tableau SDK currently only" " supports a table name of 'Extract'"): pantab.frame_to_hyper(df, 'foo.hyper', table='foo')
# create the project new_project = server.projects.create(new_project) # Create new datasource_item with project id '3a8b6148-493c-11e6-a621-6f3499394a39' #save dataframe into a .hyper file input_variables = {'task.dataframe_id': None} for key in input_variables.keys(): for res in results: value = res.getMetadata().get(key) if value is not None: input_variables[key] = value break dataframe_id = input_variables['task.dataframe_id'] print("dataframe id (in): ", dataframe_id) dataframe_json = variables.get(dataframe_id) assert dataframe_json is not None dataframe_json = bz2.decompress(dataframe_json).decode() dataframe = pd.read_json(dataframe_json, orient='split') output_file = OUTPUT_FILE_NAME pantab.frame_to_hyper(dataframe, output_file) # Create new datasource_item new_datasource = TSC.DatasourceItem(project_id) new_datasource = server.datasources.publish(new_datasource, output_file, 'CreateNew') print(new_datasource.__dict__) print("END " + __file__)
def test_unsupported_dtype_raises(dtype, tmp_hyper): df = pd.DataFrame([[1]], dtype=dtype) msg = re.escape(f"Conversion of '{dtype}' dtypes not supported!") with pytest.raises(TypeError, match=msg): pantab.frame_to_hyper(df, tmp_hyper, table="test")
def time_write_frame(self): pantab.frame_to_hyper(self.df, "dummy.hyper", table="dummy")
# create a new project item new_project = TSC.ProjectItem(name=PROJECT_NAME, content_permissions='LockedToProject', id='LockedToProject') # create the project new_project = server.projects.create(new_project) # Create new datasource_item with project id '3a8b6148-493c-11e6-a621-6f3499394a39' #save dataframe into a .hyper file input_variables = {'task.dataframe_id': None} for key in input_variables.keys(): for res in results: value = res.getMetadata().get(key) if value is not None: input_variables[key] = value break dataframe_id = input_variables['task.dataframe_id'] print("dataframe id (in): ", dataframe_id) dataframe_json = variables.get(dataframe_id) assert dataframe_json is not None dataframe_json = bz2.decompress(dataframe_json).decode() dataframe = pd.read_json(dataframe_json, orient='split') pantab.frame_to_hyper(dataframe, INPUT_FILE_NAME) # Create new datasource_item new_datasource = TSC.DatasourceItem(project_id) new_datasource = server.datasources.publish(new_datasource, INPUT_FILE_NAME, 'CreateNew') print(new_datasource.__dict__) print("END " + __file__)
import time import numpy as np import pandas as pd import pantab if __name__ == '__main__': df = pd.DataFrame(np.ones((100_000_000, 1)), columns=["a"]) start = time.time() pantab.frame_to_hyper(df, "test.hyper", table="test") end = time.time() print(f"Execution took: {end - start}")