def test_data_frame_type_check():
    """Validation must reject frames with missing or mistyped columns."""
    df = pd.DataFrame({
        'String': pd.Categorical(['a', 'a', 'b', 'a']),
        'Date': pd.Timestamp('20130102'),
        'Value': np.array([3] * 4, dtype='int32')
    })
    config = __generate_cfg(df)

    # Each referenced col must exist in the data frame
    with pytest.raises(Exception) as e_info:
        noDateFrame = pd.DataFrame({
            'String': pd.Categorical(['a', 'a', 'b', 'a']),
            'Value': np.array([3] * 4, dtype='int32')
        })
        validate_data_loader(config, noDateFrame)
    # FIX: match against the exception itself (e_info.value), not the
    # ExceptionInfo wrapper, per pytest docs — str(e_info) may include
    # location/repr text and make the substring match unreliable.
    assert 'data frame missing required field: Date' in str(e_info.value)

    # Each referenced col must be the correct type
    with pytest.raises(Exception) as e_info:
        wrongTypeFrame = pd.DataFrame({
            'String': pd.Timestamp('20130102'),
            'Date': pd.Timestamp('20130102'),
            'Value': np.array([3] * 4, dtype='int32')
        })
        validate_data_loader(config, wrongTypeFrame)
    # Leading character deliberately dropped ('ata frame') so the match is
    # insensitive to the message's initial capitalisation.
    assert 'ata frame field String must be of type StringColumnConfig.' in str(
        e_info.value)
def test_no_tuples():
    """A config whose mapping yields no time/value tuples must fail."""
    with pytest.raises(Exception) as e_info:
        df = pd.DataFrame({
            'String': pd.Categorical(['a', 'a', 'b', 'a']),
            'Date': pd.Timestamp('20130102')
        })
        # BUG FIX: the data frame argument was missing, so pytest.raises
        # captured a TypeError (wrong arity) instead of the validation
        # failure under test. Every other call site passes (config, df).
        validate_data_loader(__generate_cfg(df), df)
    # FIX: inspect the exception message via e_info.value (pytest docs).
    assert 'Time tuples empty. No column loaded.' in str(e_info.value)
def test_valid_config():
    """A config generated from a frame validates against that same frame."""
    frame = pd.DataFrame({
        'Date': pd.Timestamp('20130102'),
        'Value': np.array([3] * 4, dtype='int32'),
    })
    cfg = __generate_cfg(frame)
    assert validate_data_loader(cfg, frame)
def publish(session: dv_auth.Session, dataSourceId, df):
    '''
    Publish a DataFrame to Datavore that was configured on the client

    :param session: Session - The Datavore session to use
    :param dataSourceId: String - The data source to publish to
    :param df: - DataFrame to upload
    :return: response - The successful upload response
    :raises Exception: if the config fails validation, on any non-200/401
        upload status, or when 401 retries are exhausted.
    '''
    # Get the current loader config
    currentConfig = get_data_loader_config(session, dataSourceId)

    # Validate the config against the dataframe
    if not dataload_validate.validate_data_loader(currentConfig, df):
        raise Exception('Could not validate config.')

    # Cancel load if it exists
    try:
        __cancel_data_load(session, dataSourceId)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit
        # and KeyboardInterrupt. Best-effort cancel is still preserved.
        print('No job to cancel. Continuing.')
        # @todo: log debug here -- exception if never loaded before

    # Generate upload url
    uploadUrl = __get_pre_signed_url(session, dataSourceId)
    print('Uploading data frame...')

    # Update the config with the URL @todo: use patch so we don't send the entire data brick back up :/
    # currentConfig['uploadUrl'] = uploadUrl
    # __setDatasourceLoaderConfig(session, dataSourceId, currentConfig)

    # Put data to the uploadUrl
    retries = 2
    # NOTE(review): re-opening temp.name while the NamedTemporaryFile is
    # still open works on POSIX but not on Windows — confirm target OS.
    with tempfile.NamedTemporaryFile(mode='r+') as temp:
        df.to_csv(temp.name, index=False, date_format='%Y-%m-%dT%H:%M:%SZ')
        while retries > 0:
            with open(temp.name, mode='rb') as csvFile:
                res = requests.put(uploadUrl, data=csvFile)
            if res.status_code == 200:
                print('Data frame uploaded. Datavore load started.')
                return res
            elif res.status_code == 401:
                # Stale session: re-authenticate and retry the upload.
                retries -= 1
                print(
                    f'Session upload error. Log in again. Attempts remaining: {retries}'
                )
                session = dv_auth.login(session.user_name, session.env_conf)
            else:
                raise Exception(res.status_code, res.content.decode('ascii'))
    # Ran out of retries and never managed to upload :(
    raise Exception('Failed to upload.')
def set_data_source_sample(session: dv_auth.Session, data_source_id, df):
    '''
    Sets the sample for a data source. Once the sample is set, go to the
    Datavore client to configure the load. Immutable on input DataFrame

    :param session: Session - The Datavore session to use
    :param data_source_id: String - The data source to set the sample for
    :param df: - DataFrame to use as a sample source
    :return: response - The result from setting the sample. Not very useful.
    :raises Exception: on invalid DataFrame config or sample generation.
    '''
    # Derive the sample payload from the frame
    sample = df_util.get_sample(df)

    # Assemble the default loader config piece by piece
    cols = dataload_domain.get_column_configs(df)
    meta = dataload_domain.ds_meta(data_source_id=data_source_id,
                                   datasource='python',
                                   publisher=session.user_name,
                                   dataset='data frame')
    cfg = dataload_domain.csv_loader_config(
        source_settings=dataload_domain.csv_source_settings(cols),
        mapping=dataload_domain.simple_load_mapping(cols),
        data_source_meta=meta,
        sample_data=sample['sampleData'],
        column_samples=sample['columnSamples'])
    loader = dataload_domain.csv_data_loader(data_source_id, cfg)

    # Refuse to persist a config that does not validate against the frame
    if not dataload_validate.validate_data_loader(loader, df):
        raise Exception('Could not validate config.')

    # Save the loader config server-side
    result = set_data_loader_config(session, data_source_id, loader)
    print('Sample uploaded. Go to the Datavore client to review')
    return result
def test_config_type_check():
    """Every mapping reference must name an existing, correctly-typed column."""
    df = pd.DataFrame({
        'String': pd.Categorical(['a', 'a', 'b', 'a']),
        'Date': pd.Timestamp('20130102'),
        'Value': np.array([3] * 4, dtype='int32')
    })
    baseConfig = __generate_cfg(df)
    # BUG FIX: every validate_data_loader() call below was missing the data
    # frame argument, so pytest.raises captured a TypeError (wrong arity)
    # instead of the validation error under test. Other call sites in this
    # file consistently pass (config, df). Assertions also switched to
    # str(e_info.value), the pytest-documented way to check the message.

    # all keyColumns defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['keyColumns'] = ['NotAField']
        validate_data_loader(local, df)
    assert 'key column NotAField not found' in str(e_info.value)

    # all keyColumns are strings
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['keyColumns'] = ['Date']
        validate_data_loader(local, df)
    assert 'key column Date must be a string' in str(e_info.value)

    # all valueLabelColumn are defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['valueLabelColumn'] = ['NotAField']
        validate_data_loader(local, df)
    assert 'value label NotAField not found' in str(e_info.value)

    # all valueLabelColumn are strings
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['valueLabelColumn'] = ['Date']
        validate_data_loader(local, df)
    assert 'value label Date must be a string' in str(e_info.value)

    # all timeColumns are defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeColumns'] = ['NotAField']
        validate_data_loader(local, df)
    assert 'time column NotAField not found' in str(e_info.value)

    # all timeColumns are time
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeColumns'] = ['String']
        validate_data_loader(local, df)
    assert 'time column String must be a time' in str(e_info.value)

    # all timeTuples times are defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeTuples'][0][
            'timeColumn'] = 'NotAField'
        validate_data_loader(local, df)
    assert 'not found' in str(e_info.value)

    # all timeTuples times are time
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeTuples'][0][
            'timeColumn'] = 'String'
        validate_data_loader(local, df)
    assert 'must be a time' in str(e_info.value)

    # all timeTuples values are defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeTuples'][0][
            'valueColumn'] = 'NotAField'
        validate_data_loader(local, df)
    assert 'not found' in str(e_info.value)

    # all timeTuples values are Number
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeTuples'][0][
            'valueColumn'] = 'String'
        validate_data_loader(local, df)
    assert 'must be a number' in str(e_info.value)
def test_no_time():
    """A frame with no time columns must be rejected by validation."""
    with pytest.raises(Exception) as e_info:
        df = pd.DataFrame({})
        # BUG FIX: the data frame argument was missing, so the captured
        # exception was a TypeError from the wrong arity rather than the
        # validation rule under test.
        validate_data_loader(__generate_cfg(df), df)
    # FIX: inspect the exception message via e_info.value (pytest docs).
    assert 'Loader config requires non-empty time columns.' in str(
        e_info.value)
def test_empty():
    """An empty loader config must be rejected outright."""
    with pytest.raises(Exception) as e_info:
        # BUG FIX: supply the required data frame argument (all other call
        # sites pass two args); an empty frame keeps the failure on the
        # empty-config check itself rather than on call arity.
        validate_data_loader({}, pd.DataFrame())
    # FIX: inspect the exception message via e_info.value (pytest docs).
    assert 'Empty loader config' in str(e_info.value)