Example 1
def test_data_frame_type_check():
    df = pd.DataFrame({
        'String': pd.Categorical(['a', 'a', 'b', 'a']),
        'Date': pd.Timestamp('20130102'),
        'Value': np.array([3] * 4, dtype='int32')
    })
    config = __generate_cfg(df)

    # Each referenced col must exist in the data frame
    with pytest.raises(Exception) as e_info:
        noDateFrame = pd.DataFrame({
            'String': pd.Categorical(['a', 'a', 'b', 'a']),
            'Value': np.array([3] * 4, dtype='int32')
        })
        validate_data_loader(config, noDateFrame)
    assert 'data frame missing required field: Date' in str(e_info)

    # Each referenced col must be the correct type
    with pytest.raises(Exception) as e_info:
        wrongTypeFrame = pd.DataFrame({
            'String': pd.Timestamp('20130102'),
            'Date': pd.Timestamp('20130102'),
            'Value': np.array([3] * 4, dtype='int32')
        })
        validate_data_loader(config, wrongTypeFrame)
    assert 'data frame field String must be of type StringColumnConfig.' in str(
        e_info)
Example 2
def test_no_tuples():
    # No time/value tuples
    with pytest.raises(Exception) as e_info:
        df = pd.DataFrame({
            'String': pd.Categorical(['a', 'a', 'b', 'a']),
            'Date': pd.Timestamp('20130102')
        })
        validate_data_loader(__generate_cfg(df))
    assert 'Time tuples empty. No column loaded.' in str(e_info)
Example 3
def test_valid_config():
    # Completely valid config
    df = pd.DataFrame({
        'Date': pd.Timestamp('20130102'),
        'Value': np.array([3] * 4, dtype='int32')
    })
    assert validate_data_loader(__generate_cfg(df), df)
Example 4
def publish(session: dv_auth.Session, dataSourceId, df):
    '''
    Publish a DataFrame to Datavore once its load has been configured in the Datavore client.
    '''
    # Get the current loader config
    currentConfig = get_data_loader_config(session, dataSourceId)

    # Validate the config against the dataframe
    if not dataload_validate.validate_data_loader(currentConfig, df):
        raise Exception('Could not validate config.')

    # Cancel load if it exists
    try:
        __cancel_data_load(session, dataSourceId)
    except Exception:
        print('No job to cancel. Continuing.')
        # @todo: log debug here -- exception if never loaded before

    # Generate upload url
    uploadUrl = __get_pre_signed_url(session, dataSourceId)

    print('Uploading data frame...')

    # Update the config with the URL @todo: use patch so we don't send the entire data brick back up :/
    # currentConfig['uploadUrl'] = uploadUrl
    # __setDatasourceLoaderConfig(session, dataSourceId, currentConfig)

    # Put data to the uploadUrl
    retries = 2
    with tempfile.NamedTemporaryFile(mode='r+') as temp:
        df.to_csv(temp.name, index=False, date_format='%Y-%m-%dT%H:%M:%SZ')

        while retries > 0:
            with open(temp.name, mode='rb') as csvFile:
                res = requests.put(uploadUrl, data=csvFile)
                if res.status_code == 200:
                    print('Data frame uploaded. Datavore load started.')
                    return res
                elif res.status_code == 401:
                    retries -= 1
                    print(
                        f'Session upload error. Log in again. Attempts remaining: {retries}'
                    )
                    session = dv_auth.login(session.user_name,
                                            session.env_conf)
                else:
                    raise Exception(res.status_code,
                                    res.content.decode('ascii'))

        # Ran out of retries and never managed to upload :(
        raise Exception('Failed to upload.')
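
A hedged usage sketch for publish: the user name, environment config, and data source id below are placeholders, and dv_auth.login is assumed to accept a user name plus environment config and return a Session, as the 401 retry branch above suggests.

import numpy as np
import pandas as pd

# Placeholder credentials, environment config, and data source id -- illustrative only.
session = dv_auth.login('analyst@example.com', env_conf)
df = pd.DataFrame({
    'Date': pd.to_datetime(['2013-01-02'] * 4),
    'Value': np.array([3] * 4, dtype='int32')
})
res = publish(session, 'my-data-source-id', df)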
Example 5
def set_data_source_sample(session: dv_auth.Session, data_source_id, df):
    '''
    Sets the sample for a data source.
    Once the sample is set, go to the Datavore client to configure the load.
    The input DataFrame is not modified.

    :param session: Session - The Datavore session to use
    :param data_source_id: String - The data source to set the sample for
    :param df: DataFrame - The DataFrame to use as a sample source
    :return: response - The result from setting the sample. Not very useful.
    :raises Exception: on invalid DataFrame config or sample generation.
    '''
    # get the samples
    sample = df_util.get_sample(df)

    # Build our default config
    column_configs = dataload_domain.get_column_configs(df)
    data_source_meta = dataload_domain.ds_meta(data_source_id=data_source_id,
                                               datasource='python',
                                               publisher=session.user_name,
                                               dataset='data frame')
    loader_config = dataload_domain.csv_loader_config(
        source_settings=dataload_domain.csv_source_settings(column_configs),
        mapping=dataload_domain.simple_load_mapping(column_configs),
        data_source_meta=data_source_meta,
        sample_data=sample['sampleData'],
        column_samples=sample['columnSamples'])
    data_loader = dataload_domain.csv_data_loader(data_source_id,
                                                  loader_config)

    # Validate the config against the dataframe
    if not dataload_validate.validate_data_loader(data_loader, df):
        raise Exception('Could not validate config.')

    # save the loaderConfig
    out = set_data_loader_config(session, data_source_id, data_loader)
    print('Sample uploaded. Go to the Datavore client to review')
    return out
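
Read together with publish above, the intended flow appears to be: set the sample, configure the load in the Datavore client, then publish. A minimal sketch of that flow, reusing the placeholder session and data source id from the previous sketch:

# Assumed end-to-end flow; the data source id is a placeholder.
set_data_source_sample(session, 'my-data-source-id', df)
# ... configure the load in the Datavore client ...
publish(session, 'my-data-source-id', df)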
Example 6
def test_config_type_check():
    df = pd.DataFrame({
        'String': pd.Categorical(['a', 'a', 'b', 'a']),
        'Date': pd.Timestamp('20130102'),
        'Value': np.array([3] * 4, dtype='int32')
    })
    baseConfig = __generate_cfg(df)

    # all keyColumns defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['keyColumns'] = ['NotAField']
        validate_data_loader(local)
    assert 'key column NotAField not found' in str(e_info)

    # all keyColumns are strings
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['keyColumns'] = ['Date']
        validate_data_loader(local)
    assert 'key column Date must be a string' in str(e_info)

    # all valueLabelColumn are defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['valueLabelColumn'] = ['NotAField']
        validate_data_loader(local)
    assert 'value label NotAField not found' in str(e_info)

    # all valueLabelColumn are strings
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['valueLabelColumn'] = ['Date']
        validate_data_loader(local)
    assert 'value label Date must be a string' in str(e_info)

    # all timeColumns are defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeColumns'] = ['NotAField']
        validate_data_loader(local)
    assert 'time column NotAField not found' in str(e_info)

    # all timeColumns are time
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeColumns'] = ['String']
        validate_data_loader(local)
    assert 'time column String must be a time' in str(e_info)

    # all timeTuples times are defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeTuples'][0][
            'timeColumn'] = 'NotAField'
        validate_data_loader(local)
    assert 'not found' in str(e_info)

    # all timeTuples times are time
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeTuples'][0][
            'timeColumn'] = 'String'
        validate_data_loader(local)
    assert 'must be a time' in str(e_info)

    # all timeTuples values are defined
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeTuples'][0][
            'valueColumn'] = 'NotAField'
        validate_data_loader(local)
    assert 'not found' in str(e_info)

    # all timeTuples values are Number
    with pytest.raises(Exception) as e_info:
        local = copy.deepcopy(baseConfig)
        local['loaderConfig']['mapping']['timeTuples'][0][
            'valueColumn'] = 'String'
        validate_data_loader(local)
    assert 'must be a number' in str(e_info)
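
The fields mutated above imply the nested shape of the loader config returned by __generate_cfg. A hedged sketch of that shape, with illustrative values only:

cfg = {
    'loaderConfig': {
        'mapping': {
            'keyColumns': ['String'],      # must name string columns
            'valueLabelColumn': [],        # optional, string columns only
            'timeColumns': ['Date'],       # must name time columns
            'timeTuples': [{
                'timeColumn': 'Date',      # must be a time column
                'valueColumn': 'Value'     # must be a numeric column
            }]
        }
        # ... source settings, metadata, and samples omitted ...
    }
}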
Example 7
def test_no_time():
    # No time columns
    with pytest.raises(Exception) as e_info:
        df = pd.DataFrame({})
        validate_data_loader(__generate_cfg(df))
    assert 'Loader config requires non-empty time columns.' in str(e_info)
Example 8
def test_empty():
    # No config
    with pytest.raises(Exception) as e_info:
        validate_data_loader({})
    assert 'Empty loader config' in str(e_info)
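
The __generate_cfg helper used throughout these tests is not shown in these examples. A plausible sketch, mirroring the builder calls in set_data_source_sample above; the metadata values are placeholders, and passing None for the samples is an assumption:

def __generate_cfg(df):
    # Build a data loader from the frame the same way set_data_source_sample does;
    # metadata values are placeholders and the samples are assumed optional.
    column_configs = dataload_domain.get_column_configs(df)
    loader_config = dataload_domain.csv_loader_config(
        source_settings=dataload_domain.csv_source_settings(column_configs),
        mapping=dataload_domain.simple_load_mapping(column_configs),
        data_source_meta=dataload_domain.ds_meta(data_source_id='test-ds',
                                                 datasource='python',
                                                 publisher='tester',
                                                 dataset='data frame'),
        sample_data=None,
        column_samples=None)
    return dataload_domain.csv_data_loader('test-ds', loader_config)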