Ejemplo n.º 1
0
def example_failing_verification():
    """Verify a dataframe that is expected to violate the constraints.

    Builds a small dataframe, verifies it against the constraints in
    ``TDDA_FILE`` and prints the verification both as a string and as a
    dataframe.

    Returns:
        0 when verification failed with exactly 7 failures and 5 passes,
        1 otherwise (wrong counts, or no failures at all).
    """
    n_failures = 0
    # ``pd.np`` is no longer available in current pandas; a plain float NaN
    # is the same value that ``pd.np.NaN`` used to provide.
    nan = float('nan')
    df = pd.DataFrame({'a': [0, 1, 2, 10, nan],
                       'b': ['one', 'one', 'two', 'three', nan]})
    v = verify_df(df, TDDA_FILE)

    if v.failures > 0:
        print('Correctly failed to verify dataframe that does not satisify '
              'all the constraints in %s' % TDDA_FILE)
        # Either count being off is unexpected, so this must be ``or``:
        # with ``and`` a single wrong count would go unreported.
        if v.failures != 7 or v.passes != 5:
            print('However, expected 7 failures and 5 passes.\n'
                  'Actual: Failures: %d, Passes: %s.\n'
                  '*** Not great!' % (v.failures, v.passes))
            n_failures = 1
    elif v.failures == 0:
        print('*** Incorrectly verified dataframe that should have failed '
              'against constraints in\n %s.' % TDDA_FILE, file=sys.stderr)
        n_failures = 1

    print('\nRESULT AS A STRING:\n')
    print(str(v))
    print('\nRESULT AS A DATAFRAME:\n')
    print(v.to_frame())
    print('\n')
    return n_failures
Ejemplo n.º 2
0
def example_failing_verification():
    """Verify a dataframe that is expected to violate the constraints.

    Builds a small dataframe, verifies it against the constraints in
    ``TDDA_FILE`` and prints the verification both as a string and as a
    dataframe.

    Returns:
        0 when verification failed with exactly 7 failures and 5 passes,
        1 otherwise (wrong counts, or no failures at all).
    """
    n_failures = 0
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias is gone
    # from NumPy 2.0 onwards.
    df = pd.DataFrame({'a': [0, 1, 2, 10, np.nan],
                       'b': ['one', 'one', 'two', 'three', np.nan]})
    v = verify_df(df, TDDA_FILE)

    if v.failures > 0:
        print('Correctly failed to verify dataframe that does not satisify '
              'all the constraints in %s' % TDDA_FILE)
        # Either count being off is unexpected, so this must be ``or``:
        # with ``and`` a single wrong count would go unreported.
        if v.failures != 7 or v.passes != 5:
            print('However, expected 7 failures and 5 passes.\n'
                  'Actual: Failures: %d, Passes: %s.\n'
                  '*** Not great!' % (v.failures, v.passes))
            n_failures = 1
    elif v.failures == 0:
        print('*** Incorrectly verified dataframe that should have failed '
              'against constraints in\n %s.' % TDDA_FILE, file=sys.stderr)
        n_failures = 1

    print('\nRESULT AS A STRING:\n')
    print(str(v))
    print('\nRESULT AS A DATAFRAME:\n')
    print(v.to_frame())
    print('\n')
    return n_failures
Ejemplo n.º 3
0
 def testElements92rex(self):
     """elements92.csv must satisfy elements92rex.tdda with 78 passes."""
     elements = pd.read_csv(os.path.join(TESTDATA_DIR, 'elements92.csv'))
     result = verify_df(elements,
                        os.path.join(TESTDATA_DIR, 'elements92rex.tdda'))
     self.assertEqual(result.passes, 78)
     self.assertEqual(result.failures, 0)
 def test_input_csvs_meet_constraints(self):
     """Check that each raw CSV dataframe meets its required constraints.

     Every name in ``self.filenames`` is looked up in ``self.raw_dfs_dict``
     and verified against the matching entry of ``self.constraint_paths``.
     This is expected to pass trivially as long as neither the files nor
     the constraints have changed.
     """
     self.failures = {}
     for name in self.filenames:
         df = self.raw_dfs_dict[name]
         v = verify_df(df, self.constraint_paths[name])
         # Name the offending input so a failure inside the loop is
         # actionable without re-running each file by hand.
         assert v.failures == 0, (
             '%s: %s constraint(s) failed' % (name, v.failures))
Ejemplo n.º 5
0
 def testDDD_df(self):
     """ddd.csv read via pandas: 58 constraints pass and 3 fail."""
     data = pd.read_csv(os.path.join(TESTDATA_DIR, 'ddd.csv'))
     result = verify_df(data, os.path.join(TESTDATA_DIR, 'ddd.tdda'))
     # The 3 failures are expected, because the pandas CSV reader
     #   - will have read 'elevens' as an int
     #   - will have read the date columns as strings
     self.assertEqual(result.passes, 58)
     self.assertEqual(result.failures, 3)
Ejemplo n.º 6
0
 def testElements118rex(self):
     """elements118.csv against elements92rex.tdda: 61 passes, 17 failures.

     The field-level report, ordered by field name, is also compared with
     the reference file 'elements118rex.df'.
     """
     elements = pd.read_csv(os.path.join(TESTDATA_DIR, 'elements118.csv'))
     rules = os.path.join(TESTDATA_DIR, 'elements92rex.tdda')
     result = verify_df(elements, rules, report='fields')
     self.assertEqual(result.passes, 61)
     self.assertEqual(result.failures, 17)

     # Sort by field so the rendered text has a stable order.
     report = result.to_dataframe()
     report.sort_values('field', inplace=True)
     rendered = report.to_string()
     self.assertStringCorrect(rendered, 'elements118rex.df')
 def _validate(self, data, constraints):
     """
     Verify ``data`` against ``constraints`` using strict type checking.

     Returns None when no constraint fails; raises KeyError carrying the
     number of failures otherwise.
     """
     outcome = verify_df(data, constraints, type_checking='strict')
     if outcome.failures == 0:
         return
     template = (
         "One or more columns were not fitting the validation constraints: failures: {}")
     raise KeyError(template.format(outcome.failures))
Ejemplo n.º 8
0
 def testDetectElements118rexToFile(self):
     """Detection output for elements118.csv is written to a CSV file.

     Verifies against elements92rex.tdda (61 passes, 17 failures expected)
     and compares the written detection file with the reference
     'elements118rex_detect.csv'.
     """
     elements = pd.read_csv(os.path.join(TESTDATA_DIR, 'elements118.csv'))
     rules = os.path.join(TESTDATA_DIR, 'elements92rex.tdda')
     detect_target = os.path.join(self.tmp_dir, 'elements118rex_detect.csv')
     options = {
         'report': 'fields',
         'detect_outpath': detect_target,
         'detect_output_fields': ['Z'],
     }
     result = verify_df(elements, rules, **options)
     self.assertEqual(result.passes, 61)
     self.assertEqual(result.failures, 17)
     self.assertFileCorrect(detect_target, 'elements118rex_detect.csv')
Ejemplo n.º 9
0
def example_positive_verification():
    """Verify a dataframe that is expected to satisfy the constraints.

    Builds a two-row dataframe and verifies it against the constraints in
    ``TDDA_FILE``, reporting the outcome on stdout (success) or stderr
    (unexpected failure, followed by the verification itself on stdout).

    Returns:
        0 if the dataframe verified without failures, 1 otherwise.
    """
    n_failures = 0
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias is gone
    # from NumPy 2.0 onwards.
    df = pd.DataFrame({'a': [2, 4], 'b': ['one', np.nan]})
    v = verify_df(df, TDDA_FILE)

    if v.failures == 0:
        print('Correctly verified dataframe against constraints in %s.'
              % TDDA_FILE)
    else:
        print('*** Unexpectedly failed to verify dataframe against constraints'
              ' in %s.\nSomething is wrong!' % TDDA_FILE, file=sys.stderr)
        print(v)
        n_failures = 1
    return n_failures
Ejemplo n.º 10
0
def example_positive_verification():
    """Verify a dataframe that is expected to satisfy the constraints.

    Builds a two-row dataframe and verifies it against the constraints in
    ``TDDA_FILE``, reporting the outcome on stdout (success) or stderr
    (unexpected failure, followed by the verification itself on stdout).

    Returns:
        0 if the dataframe verified without failures, 1 otherwise.
    """
    n_failures = 0
    # ``pd.np`` is no longer available in current pandas; a plain float NaN
    # is the same value that ``pd.np.NaN`` used to provide.
    df = pd.DataFrame({'a': [2, 4], 'b': ['one', float('nan')]})
    v = verify_df(df, TDDA_FILE)

    if v.failures == 0:
        print('Correctly verified dataframe against constraints in %s.'
              % TDDA_FILE)
    else:
        print('*** Unexpectedly failed to verify dataframe against constraints'
              ' in %s.\nSomething is wrong!' % TDDA_FILE, file=sys.stderr)
        print(v)
        n_failures = 1
    return n_failures
Ejemplo n.º 11
0
def verify_df_from_file(df_path, constraints_path, verbose=True, **kwargs):
    """Load a dataframe and verify it against a constraints file.

    Args:
        df_path: Path of the data to load; '-' or None means the data is
            read from standard input instead.
        constraints_path: Path of the constraints file. When None, it is
            derived from a '.csv' or '.feather' ``df_path`` by swapping
            the extension for '.tdda'; if that is not possible an error
            is printed to stderr and the process exits with status 1.
        verbose: When true, print the verification result.
        **kwargs: Passed through to ``verify_df``.

    Returns:
        The verification object returned by ``verify_df``.
    """
    if df_path == '-' or df_path is None:
        # Slurp stdin so it can be handed on as a file-like object.
        df_path = StringIO(sys.stdin.read())

    if constraints_path is None:
        if not isinstance(df_path, StringIO):
            stem, extension = os.path.splitext(df_path)
            if extension in ('.csv', '.feather'):
                constraints_path = stem + '.tdda'
        if constraints_path is None:
            print('No constraints file specified.', file=sys.stderr)
            sys.exit(1)

    verification = verify_df(load_df(df_path), constraints_path, **kwargs)
    if verbose:
        print(verification)
    return verification
Ejemplo n.º 12
0
Archivo: verify.py Proyecto: tdda/tdda
def verify_df_from_file(df_path, constraints_path, verbose=True, **kwargs):
    """Verify the data at ``df_path`` against ``constraints_path``.

    A ``df_path`` of '-' or None reads the data from standard input.
    When ``constraints_path`` is None it defaults to the data path with
    its '.csv' or '.feather' extension replaced by '.tdda'; if no such
    default can be formed, a message goes to stderr and the process exits
    with status 1.  Extra keyword arguments are forwarded to
    ``verify_df``; the verification is printed when ``verbose`` is true
    and is always returned.
    """
    if df_path == '-' or df_path is None:
        df_path = StringIO(sys.stdin.read())

    needs_default = constraints_path is None
    if needs_default and not isinstance(df_path, StringIO):
        # Only named files can supply a default constraints filename.
        base, suffix = os.path.splitext(df_path)
        if suffix in ('.csv', '.feather'):
            constraints_path = base + '.tdda'
    if constraints_path is None:
        print('No constraints file specified.', file=sys.stderr)
        sys.exit(1)

    frame = load_df(df_path)
    result = verify_df(frame, constraints_path, **kwargs)
    if verbose:
        print(result)
    return result
Ejemplo n.º 13
0
# accounts_verify_25k_against_1k.py

from __future__ import print_function
import pandas as pd

from tdda.constraints.pd.constraints import verify_df

# Verify the 25k accounts data against the 'accounts1k.tdda' constraints
# and print the outcome as a dataframe.
accounts = pd.read_csv('testdata/accounts25k.csv')
verification = verify_df(accounts, 'accounts1k.tdda')
print(verification.to_frame())

Ejemplo n.º 14
0
import pandas as pd
from tdda.constraints.pd.constraints import verify_df

# ``pd.np`` is no longer available in current pandas; a plain float NaN is
# the same value that ``pd.np.NaN`` used to provide.
df = pd.DataFrame({'a': [2, 4], 'b': ['one', float('nan')]})
v = verify_df(df, 'example_constraints.tdda')

# Summary counts first, then the verification as text and as a dataframe.
print('Passes: %d' % v.passes)
print('Failures: %d\n\n\n' % v.failures)
print(str(v))
print('\n\n')
print(v.to_frame())
Ejemplo n.º 15
0
# elements_verify_118_against_92.py

from __future__ import print_function
import pandas as pd

from tdda.constraints.pd.constraints import verify_df

# Verify the 118-element data against the 'elements92.tdda' constraints.
elements = pd.read_csv('testdata/elements118.csv')
verification = verify_df(elements, 'elements92.tdda')
print(verification)
Ejemplo n.º 16
0
import numpy as np
import pandas as pd
from tdda.constraints.pd.constraints import verify_df

# ``np.nan`` is the canonical spelling; the ``np.NaN`` alias is gone from
# NumPy 2.0 onwards.
df = pd.DataFrame({'a': [2, 4], 'b': ['one', np.nan]})
v = verify_df(df, 'example_constraints.tdda')

# Summary counts first, then the verification as text and as a dataframe.
print('Passes: %d' % v.passes)
print('Failures: %d\n\n\n' % v.failures)
print(str(v))
print('\n\n')
print(v.to_frame())
Ejemplo n.º 17
0
def find_with_tdda(df, show=True):
    """Run detection on ``df`` against 'constraints.tdda'.

    The result of ``detected()`` on the verification is handed to
    ``show_df`` under the title 'BAD RECORDS (FOUND WITH TDDA)' (with
    ``show`` forwarded) and then returned.
    """
    detection_options = {
        'detect': True,
        'detect_per_constraint': True,
        'detect_output_fields': [],
    }
    verification = verify_df(df, 'constraints.tdda', **detection_options)
    bad_records = verification.detected()
    show_df(bad_records, 'BAD RECORDS (FOUND WITH TDDA)', show, all_cols=True)
    return bad_records
Ejemplo n.º 18
0
# accounts_verify_25k.py

from __future__ import print_function
import pandas as pd

from tdda.constraints.pd.constraints import verify_df

# Verify the 25k accounts data against the 'accounts25k.tdda' constraints.
accounts = pd.read_csv('testdata/accounts25k.csv')
verification = verify_df(accounts, 'accounts25k.tdda')
print(verification)
Ejemplo n.º 19
0
def verify_df_from_file(df_path, constraints_path, verbose=True, **kwargs):
    """Load the dataframe at ``df_path`` and verify it.

    The loaded dataframe is verified against ``constraints_path``, with
    any extra keyword arguments forwarded to ``verify_df``.  The
    verification is printed when ``verbose`` is true and always returned.
    """
    verification = verify_df(load_df(df_path), constraints_path, **kwargs)
    if verbose:
        print(verification)
    return verification
# accounts_verify_25k_against_1k_feather.py

from __future__ import print_function
import pandas as pd

from tdda.constraints.pd.constraints import verify_df

# Verify the 25k accounts data against the constraints in
# 'testdata/accounts1k.tdda'.
accounts = pd.read_csv('testdata/accounts25k.csv')
result = verify_df(accounts, 'testdata/accounts1k.tdda')

# Show the verification as text first ...
print('Basic Verification:')
print(result)
print('\n')

# ... and then as a dataframe.
print('Verification DataFrame:')
print(result.to_frame())

import os
import pandas as pd
import sys

from tdda.constraints.pd.constraints import verify_df

# Locations of the wrangled data, its constraints, and the outcome file.
inpath = '../data/processed/wrangled_dataframe.csv'
constraint_path = ('../data/interim/constraints_initial_csvs/'
                   'wrangled_dataframe_constraints.tdda')
outpath = '../data/interim/constraints_initial_csvs/wrangled_verification.tdda'

df = pd.read_csv(inpath, low_memory=False)
v = verify_df(df, constraint_path)

print('Constraints passing: %d\n' % v.passes)
print('Constraints failing: %d\n' % v.failures)

# Show the details only when something failed.
if v.failures > 0:
    print('\n', str(v))
    print('\n', v.to_frame())

# Decide what to record, then write the outcome file once.
if v.failures == 0:
    outcome = [
        'Success!',
        '\n',
        f'{inpath} meets all the constraints of {constraint_path}.',
    ]
else:
    outcome = ['There was at least one failure.']

with open(outpath, 'w') as f:
    f.writelines(outcome)
Ejemplo n.º 22
0
# elements_verify_118.py

from __future__ import print_function
import pandas as pd

from tdda.constraints.pd.constraints import verify_df

# Verify the 118-element data against the 'elements118.tdda' constraints.
elements = pd.read_csv('testdata/elements118.csv')
verification = verify_df(elements, 'elements118.tdda')
print(verification)
 def test_wrangled_csv_meets_constraints(self):
     """The wrangled CSV must satisfy every one of its constraints."""
     csv_path = self.raw_csv_paths['wrangled']
     constraints = self.constraint_paths['wrangled_csv']
     frame = pd.read_csv(csv_path, low_memory=False)
     result = verify_df(frame, constraints)
     assert result.failures == 0
# elements_verify_118_against_92_feather.py

from __future__ import print_function
import pandas as pd

from tdda.constraints.pd.constraints import verify_df

# Verify the 118-element data against the constraints in
# 'testdata/elements92.tdda'.
elements = pd.read_csv('testdata/elements118.csv')
result = verify_df(elements, 'testdata/elements92.tdda')

# Show the verification as text first ...
print('Basic Verification:')
print(result)
print('\n')

# ... and then as a dataframe.
print('Verification DataFrame:')
print(result.to_frame())

# elements_verify_118.py

from __future__ import print_function
import pandas as pd

from tdda.constraints.pd.constraints import verify_df

# Check 'testdata/elements118.csv' against 'testdata/elements92.tdda' and
# report the outcome in two forms.
table = pd.read_csv('testdata/elements118.csv')
outcome = verify_df(table, 'testdata/elements92.tdda')

# Textual form.
print('Basic Verification:')
print(outcome)
print('\n')

# Dataframe form.
print('Verification DataFrame:')
outcome_frame = outcome.to_frame()
print(outcome_frame)