def test_data_non_bool_boolean_truefix():
    """Every recognized truthy spelling is coerced to True by clean_data."""
    base = {'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b'}
    testrows = [{'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
                 'ColBool': True}]
    for rid, raw in [('2', '1'), ('4', '2'), ('5', 'True'), ('6', 'true'),
                     ('7', 'Yes'), ('8', 'YES'), ('9', 'Y'), ('10', 'y')]:
        testrows.append(dict(base, _id=rid, ColBool=raw))
    clean_data(testrows, vschema)
    assert all(row['ColBool'] == True for row in testrows)
    validate_data(testrows, vschema)
def test_data_nonvalid_bool_boolean_fix():
    """A boolean value that cannot be interpreted is dropped from the row."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': 'jello'},
    ]
    clean_data(testrows, vschema)
    assert 'ColBool' not in testrows[1]
    validate_data(testrows, vschema)
def test_data_valid_rows_fix():
    """clean_data leaves rows that are already valid completely untouched."""
    def make_rows():
        # Build a fresh copy so the reference set cannot be aliased.
        return [
            {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
             'ColBool': True},
            {'_id': '2', 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
             'ColBool': False},
            {'_id': '3'},
        ]
    refrows = make_rows()
    testrows = make_rows()
    clean_data(testrows, vschema)
    assert testrows == refrows
def test_data_nonstring_id_fix():
    """A non-string _id is converted to its string representation."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': 2, 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
    ]
    clean_data(testrows, vschema)
    assert testrows[1]['_id'] == '2'
    validate_data(testrows, vschema)
def test_read_csv_assign_id():
    """Rows written without an '_id' get sequential string ids on round-trip.

    Fixes two resource bugs in the original: the file descriptor returned by
    mkstemp() was never closed (fd leak), and the temp file was only removed
    when every assertion passed.
    """
    handle, filename = mkstemp()
    os.close(handle)  # mkstemp returns an OPEN fd; close it to avoid a leak
    try:
        refrows = [
            {'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a', 'ColBool': True},
            {'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b', 'ColBool': False},
            {},
        ]
        write_csv(refrows, filename, dialect=csv.excel)
        testrows = read_csv(filename, dialect=csv.excel)
        cschema = {
            'ColInt': {'type': 'count'},
            'ColFloat': {'type': 'real'},
            'ColCat': {'type': 'categorical'},
            'ColBool': {'type': 'boolean'},
        }
        clean_data(testrows, cschema)
        assert len(testrows) == len(refrows)
        for i in range(len(testrows)):
            # read_csv is expected to assign 1-based string ids in file order
            refrows[i]['_id'] = str(i + 1)
            assert testrows[i] == refrows[i]
    finally:
        # remove the temp file even if an assertion above fails
        os.remove(filename)
def test_data_missing_id_fix():
    """With assign_ids=True, a row lacking '_id' receives a distinct id."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b', 'ColBool': False},
    ]
    clean_data(testrows, vschema, assign_ids=True)
    assert testrows[0]['_id'] != testrows[1]['_id']
    validate_data(testrows, vschema)
def test_data_non_str_cat_fix():
    """A non-string categorical value is converted to its string form."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 3,
         'ColBool': False},
    ]
    clean_data(testrows, vschema)
    assert testrows[1]['ColCat'] == '3'
    validate_data(testrows, vschema)
def test_data_int_count_limit_fix():
    """A count value above the allowed limit is removed from the row."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': 100001, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
    ]
    clean_data(testrows, vschema)
    assert 'ColInt' not in testrows[1]
    validate_data(testrows, vschema)
def test_data_nonefield_fix():
    """A field whose value is None is stripped from the row."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': None,
         'ColBool': False},
    ]
    clean_data(testrows, vschema)
    assert 'ColCat' not in testrows[1]
    validate_data(testrows, vschema)
def test_data_extrafield_fix():
    """remove_extra_fields=True drops columns not present in the schema."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColEx': 4, 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
    ]
    clean_data(testrows, vschema, remove_extra_fields=True)
    assert 'ColEx' not in testrows[1]
    validate_data(testrows, vschema)
def test_data_negative_int_count_fix():
    """A negative count raises with remove_invalids=False, else is removed."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': -3, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
    ]
    assert_raises(VeritableError, clean_data, testrows, vschema,
                  remove_invalids=False)
    clean_data(testrows, vschema)
    assert 'ColInt' not in testrows[1]
def test_data_nonvalid_bool_boolean_fixfail():
    """clean_data(remove_invalids=False) pinpoints the bad boolean cell.

    Fixes a silent-pass bug: if clean_data did not raise, the original
    try/except fell through and the test passed without checking anything.
    """
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': 'jello'},
    ]
    assert_raises(VeritableError, validate_data, testrows, vschema)
    try:
        clean_data(testrows, vschema, remove_invalids=False)
    except VeritableError as e:
        # the error must identify the offending row and column
        assert e.row == 1
        assert e.col == 'ColBool'
    else:
        raise AssertionError('clean_data should have raised VeritableError')
def test_data_non_bool_boolean_falsefix():
    """Every recognized falsy spelling is coerced to False by clean_data."""
    base = {'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b'}
    testrows = [{'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
                 'ColBool': False}]
    for rid, raw in [('2', '0'), ('5', 'False'), ('6', 'false'), ('7', 'No'),
                     ('8', 'NO'), ('9', 'N'), ('10', 'n')]:
        testrows.append(dict(base, _id=rid, ColBool=raw))
    clean_data(testrows, vschema)
    assert all(row['ColBool'] == False for row in testrows)
    validate_data(testrows, vschema)
def test_data_negative_int_count_fixfail():
    """clean_data(remove_invalids=False) pinpoints the negative count cell.

    Fixes a silent-pass bug: the try/except would pass without asserting
    anything if clean_data did not raise.
    """
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': -3, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
    ]
    assert_raises(VeritableError, clean_data, testrows, vschema,
                  remove_invalids=False)
    try:
        clean_data(testrows, vschema, remove_invalids=False)
    except VeritableError as e:
        # the error must identify the offending row and column
        assert e.row == 1
        assert e.col == 'ColInt'
    else:
        raise AssertionError('clean_data should have raised VeritableError')
def test_data_valid_rows_fix():
    """Valid rows pass through clean_data unchanged."""
    rows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
        {'_id': '3'},
    ]
    refrows = [dict(r) for r in rows]  # independent reference copies
    testrows = [dict(r) for r in rows]
    clean_data(testrows, vschema)
    assert testrows == refrows
def test_data_non_bool_boolean_falsefix():
    """clean_data maps each falsy string spelling to the value False."""
    testrows = [{'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
                 'ColBool': False}]
    falsy_spellings = [('2', '0'), ('5', 'False'), ('6', 'false'),
                       ('7', 'No'), ('8', 'NO'), ('9', 'N'), ('10', 'n')]
    testrows.extend(
        {'_id': rid, 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': raw}
        for rid, raw in falsy_spellings)
    clean_data(testrows, vschema)
    for row in testrows:
        assert row['ColBool'] == False
    validate_data(testrows, vschema)
def test_data_too_many_cats_fix():
    """Categories beyond the 256-value limit are collapsed into 'Other'."""
    eschema = {'ColCat': {'type': 'categorical'}}
    maxCols = 256
    testrows = []
    rid = 0
    # Two rows per category for the first 255 categories, so the two
    # trailing single-occurrence categories are the rarest.
    for i in range(maxCols - 1):
        testrows.append({'_id': str(rid), 'ColCat': str(i)})
        testrows.append({'_id': str(rid + 1), 'ColCat': str(i)})
        rid += 2
    testrows.append({'_id': str(rid), 'ColCat': str(maxCols - 1)})
    testrows.append({'_id': str(rid + 1), 'ColCat': str(maxCols)})
    clean_data(testrows, eschema)
    # The two singleton categories (rows 510 and 511) collapse to 'Other'.
    assert testrows[510]['ColCat'] == 'Other'
    assert testrows[511]['ColCat'] == 'Other'
    validate_data(testrows, eschema)
def test_data_non_bool_boolean_truefix():
    """clean_data maps each truthy string spelling to the value True."""
    testrows = [{'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
                 'ColBool': True}]
    truthy_spellings = [('2', '1'), ('4', '2'), ('5', 'True'), ('6', 'true'),
                        ('7', 'Yes'), ('8', 'YES'), ('9', 'Y'), ('10', 'y')]
    testrows.extend(
        {'_id': rid, 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': raw}
        for rid, raw in truthy_spellings)
    clean_data(testrows, vschema)
    for row in testrows:
        assert row['ColBool'] == True
    validate_data(testrows, vschema)
def test_data_missing_id_fix():
    """assign_ids=True fills in an id distinct from existing ones."""
    row_with_id = {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
                   'ColBool': True}
    row_without_id = {'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
                      'ColBool': False}
    testrows = [row_with_id, row_without_id]
    clean_data(testrows, vschema, assign_ids=True)
    assert testrows[0]['_id'] != testrows[1]['_id']
    validate_data(testrows, vschema)
def test_data_too_many_cats_fix():
    """Rare categories above the 256-category cap are renamed 'Other'."""
    eschema = {'ColCat': {'type': 'categorical'}}
    limit = 256
    testrows = []
    next_id = 0
    for cat in range(limit - 1):
        # duplicate each of the first 255 categories so they outrank the
        # two singleton categories appended below
        for _ in range(2):
            testrows.append({'_id': str(next_id), 'ColCat': str(cat)})
            next_id += 1
    testrows.append({'_id': str(next_id), 'ColCat': str(limit - 1)})
    testrows.append({'_id': str(next_id + 1), 'ColCat': str(limit)})
    clean_data(testrows, eschema)
    assert testrows[510]['ColCat'] == 'Other'
    assert testrows[511]['ColCat'] == 'Other'
    validate_data(testrows, eschema)
def test_data_int_count_limit_fix():
    """An out-of-range count value (100001) is dropped by clean_data."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': 100001, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
    ]
    clean_data(testrows, vschema)
    assert 'ColInt' not in testrows[1]
    validate_data(testrows, vschema)
def test_data_nonefield_fix():
    """clean_data removes a key whose value is None."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': None,
         'ColBool': False},
    ]
    clean_data(testrows, vschema)
    assert 'ColCat' not in testrows[1]
    validate_data(testrows, vschema)
def test_data_non_int_count_fix():
    """A count supplied as a string ('4') is converted to the int 4."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': '4', 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
    ]
    clean_data(testrows, vschema)
    assert testrows[1]['ColInt'] == 4
    validate_data(testrows, vschema)
def test_data_nonstring_id_fix():
    """An integer _id is stringified by clean_data."""
    good_row = {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
                'ColBool': True}
    bad_row = {'_id': 2, 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
               'ColBool': False}
    testrows = [good_row, bad_row]
    clean_data(testrows, vschema)
    assert testrows[1]['_id'] == '2'
    validate_data(testrows, vschema)
def test_data_nonvalid_bool_boolean_fix():
    """An unrecognized boolean string ('jello') is dropped from the row."""
    good_row = {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
                'ColBool': True}
    bad_row = {'_id': '2', 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
               'ColBool': 'jello'}
    testrows = [good_row, bad_row]
    clean_data(testrows, vschema)
    assert 'ColBool' not in testrows[1]
    validate_data(testrows, vschema)
def test_data_extrafield_fix():
    """A column absent from the schema is stripped when requested."""
    good_row = {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
                'ColBool': True}
    row_with_extra = {'_id': '2', 'ColEx': 4, 'ColInt': 4, 'ColFloat': 4.1,
                      'ColCat': 'b', 'ColBool': False}
    testrows = [good_row, row_with_extra]
    clean_data(testrows, vschema, remove_extra_fields=True)
    assert 'ColEx' not in testrows[1]
    validate_data(testrows, vschema)
def test_write_read_csv():
    """Rows survive a write_csv/read_csv round-trip after cleaning.

    Fixes two resource bugs in the original: the file descriptor returned by
    mkstemp() was never closed (fd leak), and the temp file was only removed
    when every assertion passed.
    """
    handle, filename = mkstemp()
    os.close(handle)  # mkstemp returns an OPEN fd; close it to avoid a leak
    try:
        refrows = [
            {'_id': '7', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a'},
            {'_id': '8', 'ColInt': 4, 'ColCat': 'b', 'ColBool': False},
            {'_id': '9'},
        ]
        write_csv(refrows, filename, dialect=csv.excel)
        testrows = read_csv(filename, dialect=csv.excel)
        cschema = {
            'ColInt': {'type': 'count'},
            'ColFloat': {'type': 'real'},
            'ColCat': {'type': 'categorical'},
            'ColBool': {'type': 'boolean'},
        }
        clean_data(testrows, cschema)
        assert len(testrows) == len(refrows)
        for i in range(len(testrows)):
            assert testrows[i] == refrows[i]
    finally:
        # remove the temp file even if an assertion above fails
        os.remove(filename)
def test_data_nonvalid_bool_boolean_fixfail():
    """The error raised for an invalid boolean names the row and column.

    Fixes a silent-pass bug: the original try/except asserted nothing when
    clean_data failed to raise.
    """
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': 4, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': 'jello'},
    ]
    assert_raises(VeritableError, validate_data, testrows, vschema)
    try:
        clean_data(testrows, vschema, remove_invalids=False)
    except VeritableError as e:
        assert e.row == 1
        assert e.col == 'ColBool'
    else:
        raise AssertionError('clean_data should have raised VeritableError')
def test_data_negative_int_count_fix():
    """A negative count is rejected strictly, or removed in lenient mode."""
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': -3, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
    ]
    # strict mode: the negative count is an error
    assert_raises(VeritableError, clean_data, testrows, vschema,
                  remove_invalids=False)
    # lenient (default) mode: the offending field is simply dropped
    clean_data(testrows, vschema)
    assert 'ColInt' not in testrows[1]
def test_data_negative_int_count_fixfail():
    """The error raised for a negative count names the row and column.

    Fixes a silent-pass bug: the original try/except asserted nothing when
    clean_data failed to raise.
    """
    testrows = [
        {'_id': '1', 'ColInt': 3, 'ColFloat': 3.1, 'ColCat': 'a',
         'ColBool': True},
        {'_id': '2', 'ColInt': -3, 'ColFloat': 4.1, 'ColCat': 'b',
         'ColBool': False},
    ]
    assert_raises(VeritableError, clean_data, testrows, vschema,
                  remove_invalids=False)
    try:
        clean_data(testrows, vschema, remove_invalids=False)
    except VeritableError as e:
        assert e.row == 1
        assert e.col == 'ColInt'
    else:
        raise AssertionError('clean_data should have raised VeritableError')
def main():
    """Upload tabular rows, run a Veritable analysis, and score predictions.

    NOTE(review): relies on module-level constants (DATA_FILE, TRAIN_FRAC,
    TABLE_ID, ANALYSIS_ID, TARGET_COL, PRED_COUNT,
    MAXIMUM_UNCERTAINTY_THRESHOLDS) and helpers (read_csv, clean_data,
    split_rows) defined elsewhere in this file -- confirm they are in scope.
    """
    ##########
    # UPLOAD #
    ##########
    # 1. Define the schema for the table - specify column names and data types
    table_schema = {
        'age': {'type': 'count'},
        'sex': {'type': 'categorical'},
        'region': {'type': 'categorical'},
        'income': {'type': 'real'},
        'married': {'type': 'boolean'},
        'children': {'type': 'count'},
        'car': {'type': 'boolean'},
        'save_act': {'type': 'boolean'},
        'current_act': {'type': 'boolean'},
        'mortgage': {'type': 'boolean'},
        'pep': {'type': 'boolean'},
    }
    # 2. Load the data from csv and divide it into training and test subsets
    rows = read_csv(DATA_FILE)  # Load rows from CSV, returns all row data values as strings
    clean_data(rows, table_schema)  # Convert row data values to correct types based on schema
    training_rows, test_rows = split_rows(rows, TRAIN_FRAC)  # Split into training and test sets
    # 3. Connect to the Veritable API
    api = veritable.connect()
    if api.table_exists(TABLE_ID):
        print("Deleting old table '%s'" % TABLE_ID)
        api.delete_table(TABLE_ID)
    # 4. Create a Veritable Table and upload training rows
    print("Creating table '%s' and uploading rows" % TABLE_ID)
    table = api.create_table(table_id=TABLE_ID)
    table.batch_upload_rows(training_rows)
    ###########
    # ANALYZE #
    ###########
    # 5. Create a Veritable Analysis and wait for it to complete
    print("Creating analysis '%s' and waiting for it to complete" % ANALYSIS_ID)
    analysis = table.create_analysis(schema=table_schema,
                                     analysis_id=ANALYSIS_ID)
    analysis.wait()
    ###########
    # PREDICT #
    ###########
    # 6. For each row in the test set, predict the value and uncertainty for
    # the target column
    print("Making predictions")
    prediction_results = []
    for test_row in test_rows:
        # Prepare the prediction request
        prediction_request = test_row.copy()  # Copy known values from test row
        del prediction_request['_id']  # '_id' should not be present in prediction requests
        prediction_request[TARGET_COL] = None  # None values are predicted by Veritable
        # Make predictions
        prediction = analysis.predict(prediction_request, PRED_COUNT)
        # Derive a single value estimate and uncertainty metric
        estimate = prediction[TARGET_COL]
        uncertainty = prediction.uncertainty[TARGET_COL]
        # Compare estimate to actual value from test row
        is_correct = (estimate == test_row[TARGET_COL])
        # Collect results
        prediction_results.append({'is_correct': is_correct,
                                   'uncertainty': uncertainty})
    # 7. Evaluate prediction accuracy using different maximum uncertainty
    # thresholds
    for maximum_uncertainty in MAXIMUM_UNCERTAINTY_THRESHOLDS:
        # Treat prediction results as unknown if uncertainty is above the
        # maximum_uncertainty threshold
        unknown_prediction_results = [r for r in prediction_results
                                      if r['uncertainty'] > maximum_uncertainty]
        unknown_count = len(unknown_prediction_results)
        # Only look at prediction results if uncertainty is below the
        # maximum_uncertainty threshold
        known_prediction_results = [r for r in prediction_results
                                    if r['uncertainty'] <= maximum_uncertainty]
        known_count = len(known_prediction_results)
        # Identify prediction results we looked at that are correct
        known_correct_prediction_results = [r for r in known_prediction_results
                                            if r['is_correct']]
        known_correct_count = len(known_correct_prediction_results)
        # Accuracy guarded against division by zero when nothing is "known"
        print(
            "Predictions for {0} are {1:.0%} ({2}/{3}) correct with {4:.0%} ({5}/{6}) ignored using a maximum uncertainty of {7}".format(
                TARGET_COL,
                0.0 if known_count == 0 else float(known_correct_count) / known_count,
                known_correct_count,
                known_count,
                float(unknown_count) / (known_count + unknown_count),
                unknown_count,
                known_count + unknown_count,
                maximum_uncertainty))
def main():
    """Upload tabular rows, run a Veritable analysis, and score predictions.

    NOTE(review): relies on module-level constants (DATA_FILE, TRAIN_FRAC,
    TABLE_ID, ANALYSIS_ID, TARGET_COL, PRED_COUNT,
    MAXIMUM_UNCERTAINTY_THRESHOLDS) and helpers (read_csv, clean_data,
    split_rows) defined elsewhere in this file -- confirm they are in scope.
    """
    ##########
    # UPLOAD #
    ##########
    # 1. Define the schema for the table - specify column names and data types
    table_schema = {
        'age': {'type': 'count'},
        'sex': {'type': 'categorical'},
        'region': {'type': 'categorical'},
        'income': {'type': 'real'},
        'married': {'type': 'boolean'},
        'children': {'type': 'count'},
        'car': {'type': 'boolean'},
        'save_act': {'type': 'boolean'},
        'current_act': {'type': 'boolean'},
        'mortgage': {'type': 'boolean'},
        'pep': {'type': 'boolean'},
    }
    # 2. Load the data from csv and divide it into training and test subsets
    rows = read_csv(DATA_FILE)  # Load rows from CSV, returns all row data values as strings
    clean_data(rows, table_schema)  # Convert row data values to correct types based on schema
    training_rows, test_rows = split_rows(rows, TRAIN_FRAC)  # Split into training and test sets
    # 3. Connect to the Veritable API
    api = veritable.connect()
    if api.table_exists(TABLE_ID):
        print("Deleting old table '%s'" % TABLE_ID)
        api.delete_table(TABLE_ID)
    # 4. Create a Veritable Table and upload training rows
    print("Creating table '%s' and uploading rows" % TABLE_ID)
    table = api.create_table(table_id=TABLE_ID)
    table.batch_upload_rows(training_rows)
    ###########
    # ANALYZE #
    ###########
    # 5. Create a Veritable Analysis and wait for it to complete
    print("Creating analysis '%s' and waiting for it to complete" % ANALYSIS_ID)
    analysis = table.create_analysis(schema=table_schema,
                                     analysis_id=ANALYSIS_ID)
    analysis.wait()
    ###########
    # PREDICT #
    ###########
    # 6. For each row in the test set, predict the value and uncertainty for
    # the target column
    print("Making predictions")
    prediction_results = []
    for test_row in test_rows:
        # Prepare the prediction request
        prediction_request = test_row.copy()  # Copy known values from test row
        del prediction_request['_id']  # '_id' should not be present in prediction requests
        prediction_request[TARGET_COL] = None  # None values are predicted by Veritable
        # Make predictions
        prediction = analysis.predict(prediction_request, PRED_COUNT)
        # Derive a single value estimate and uncertainty metric
        estimate = prediction[TARGET_COL]
        uncertainty = prediction.uncertainty[TARGET_COL]
        # Compare estimate to actual value from test row
        is_correct = (estimate == test_row[TARGET_COL])
        # Collect results
        prediction_results.append({'is_correct': is_correct,
                                   'uncertainty': uncertainty})
    # 7. Evaluate prediction accuracy using different maximum uncertainty
    # thresholds
    for maximum_uncertainty in MAXIMUM_UNCERTAINTY_THRESHOLDS:
        # Treat prediction results as unknown if uncertainty is above the
        # maximum_uncertainty threshold
        unknown_prediction_results = [
            r for r in prediction_results
            if r['uncertainty'] > maximum_uncertainty
        ]
        unknown_count = len(unknown_prediction_results)
        # Only look at prediction results if uncertainty is below the
        # maximum_uncertainty threshold
        known_prediction_results = [
            r for r in prediction_results
            if r['uncertainty'] <= maximum_uncertainty
        ]
        known_count = len(known_prediction_results)
        # Identify prediction results we looked at that are correct
        known_correct_prediction_results = [
            r for r in known_prediction_results if r['is_correct']
        ]
        known_correct_count = len(known_correct_prediction_results)
        # Accuracy guarded against division by zero when nothing is "known"
        print(
            "Predictions for {0} are {1:.0%} ({2}/{3}) correct with {4:.0%} ({5}/{6}) ignored using a maximum uncertainty of {7}"
            .format(
                TARGET_COL,
                0.0 if known_count == 0 else float(known_correct_count) / known_count,
                known_correct_count,
                known_count,
                float(unknown_count) / (known_count + unknown_count),
                unknown_count,
                known_count + unknown_count,
                maximum_uncertainty))
def main():
    """Run a multinomial and a binarized analysis on the same dataset and
    compare test error.

    NOTE(review): relies on module-level constants (DATA_FILE, SCHEMA_FILE,
    TABLE_ID) and helpers (split_rows, clean_data, subset_schema,
    validate_test_categoricals, predict_known_target_column, test_error,
    deepcopy, json, veritable) defined/imported elsewhere -- confirm in scope.
    """
    API = veritable.connect(ssl_verify=False)
    print("Loading and preparing data...")
    # load the data and schema describing all column datatypes
    with open(DATA_FILE, 'rb') as fd:
        data = json.loads(fd.read())
    with open(SCHEMA_FILE, 'rb') as fd:
        master_schema = json.loads(fd.read())
    # divide the data into a training and test set, and ensure data is
    # of the correct type for each column
    train_data, test_data = split_rows(data, .8)
    clean_data(train_data, master_schema, remove_extra_fields=True,
               assign_ids=True)
    # we have to account for the possibility that the training data doesn't
    # contain all of the columns in the master schema
    schema = subset_schema(master_schema, train_data)
    # use the subset of the schema to clean the test data - make sure we don't
    # condition test predictions on columns or categorical values that aren't
    # present in the training data
    clean_data(test_data, schema, remove_extra_fields=True, assign_ids=True)
    validate_test_categoricals(test_data, train_data, schema)

    # we'll run the analysis twice: one with the original multinomial target
    # column, and once converting it to a binary column
    def binary_transform(x):
        # target '0' maps to False; every other level ('1'-'4') maps to True
        transform = {'0': False, '1': True, '2': True, '3': True, '4': True}
        return transform[x]

    # make the binary dataset and schema
    binary_train_data = deepcopy(train_data)
    binary_test_data = deepcopy(test_data)
    binary_schema = deepcopy(schema)
    binary_schema['target']['type'] = 'boolean'
    for d in (binary_train_data, binary_test_data):
        for r in d:
            if 'target' in r:
                r['target'] = binary_transform(r['target'])
    # delete existing tables if present
    if API.table_exists(TABLE_ID):
        print("Deleting old table '%s'" % TABLE_ID)
        API.delete_table(TABLE_ID)
    if API.table_exists(TABLE_ID + "-binary"):
        print("Deleting old table '%s'" % (TABLE_ID + "-binary"))
        API.delete_table(TABLE_ID + "-binary")
    # upload the data and start the analyses
    print("Uploading data and running analyses...")
    table = API.create_table(TABLE_ID)
    table.batch_upload_rows(train_data)
    analysis = table.create_analysis(schema)
    binary_table = API.create_table(TABLE_ID + "-binary")
    binary_table.batch_upload_rows(binary_train_data)
    binary_analysis = binary_table.create_analysis(binary_schema)
    # now we'll make predictions for each test row, collecting the
    # predicted values for the target column
    analysis.wait()
    print("Making predictions....")
    results = predict_known_target_column(test_data, analysis, schema,
                                          'target')
    # and for the binary table
    binary_analysis.wait()
    binary_results = predict_known_target_column(binary_test_data,
                                                 binary_analysis,
                                                 binary_schema, 'target')
    # summarize the results
    print("multinomial dataset, raw predictions: " \
          "{0}% test error".format(test_error(results, 'target') * 100))
    print("multinomial dataset, binary transform: " \
          "{0}% test error".format(test_error(results, 'target',
                                              transform=binary_transform) * 100))
    print("binary dataset, raw predictions: " \
          "{0}% test error".format(test_error(binary_results, 'target') * 100))