Example #1
def test_data_non_bool_boolean_truefix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': '1'
    }, {
        '_id': '4',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': '2'
    }, {
        '_id': '5',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'True'
    }, {
        '_id': '6',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'true'
    }, {
        '_id': '7',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'Yes'
    }, {
        '_id': '8',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'YES'
    }, {
        '_id': '9',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'Y'
    }, {
        '_id': '10',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'y'
    }]
    clean_data(testrows, vschema)
    for r in testrows:
        assert r['ColBool'] == True
    validate_data(testrows, vschema)
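Note: these tests rely on a module-level vschema that the snippets do not show. Judging from the column names and from the cschema built in the CSV examples further down (Examples #5 and #27), it presumably looks something like the following; this is an assumption, not code taken from the test suite:

# Assumed definition of vschema, inferred from the cschema used in the CSV tests.
vschema = {
    'ColInt': {'type': 'count'},        # non-negative integer counts
    'ColFloat': {'type': 'real'},       # floating-point values
    'ColCat': {'type': 'categorical'},  # string-valued categories
    'ColBool': {'type': 'boolean'}      # True/False values
}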
Example #2
def test_data_nonvalid_bool_boolean_fix():
    testrows = [{'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a',
        'ColBool':True}, {'_id': '2', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b',
        'ColBool': 'jello'}]
    clean_data(testrows, vschema)
    assert not('ColBool' in testrows[1])
    validate_data(testrows, vschema)
Example #3
def test_data_valid_rows_fix():
    refrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }, {
        '_id': '3'
    }]
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }, {
        '_id': '3'
    }]
    clean_data(testrows, vschema)
    assert testrows == refrows
Example #4
def test_data_nonstring_id_fix():
    testrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'_id':2, 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool':False}]
    clean_data(testrows, vschema)
    assert testrows[1]['_id'] == '2'
    validate_data(testrows, vschema)
Example #5
def test_read_csv_assign_id():
    handle, filename = mkstemp()
    refrows = [{
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }, {}]
    write_csv(refrows, filename, dialect=csv.excel)
    testrows = read_csv(filename, dialect=csv.excel)
    cschema = {
        'ColInt': {
            'type': 'count'
        },
        'ColFloat': {
            'type': 'real'
        },
        'ColCat': {
            'type': 'categorical'
        },
        'ColBool': {
            'type': 'boolean'
        }
    }
    clean_data(testrows, cschema)
    assert len(testrows) == len(refrows)
    for i in range(len(testrows)):
        refrows[i]['_id'] = str(i + 1)
        assert testrows[i] == refrows[i]
    os.remove(filename)
Example #6
def test_data_missing_id_fix():
    testrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool':False}]
    clean_data(testrows, vschema, assign_ids=True)
    assert testrows[0]['_id'] != testrows[1]['_id']
    validate_data(testrows, vschema)
Example #7
def test_data_non_str_cat_fix():
    testrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'_id': '2', 'ColInt':4, 'ColFloat':4.1, 'ColCat':3, 'ColBool':False}]
    clean_data(testrows, vschema)
    assert testrows[1]['ColCat'] == '3'
    validate_data(testrows, vschema)
Example #8
def test_data_int_count_limit_fix():
    testrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'_id': '2', 'ColInt': 100001, 'ColFloat':4.1, 'ColCat': 'b',
         'ColBool':False}]
    clean_data(testrows, vschema)
    assert not('ColInt' in testrows[1])
    validate_data(testrows, vschema)
Example #9
def test_data_nonefield_fix():
    testrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'_id': '2', 'ColInt':4, 'ColFloat':4.1, 'ColCat':None,
         'ColBool':False}]
    clean_data(testrows, vschema)
    assert not('ColCat' in testrows[1])
    validate_data(testrows, vschema)
Example #10
def test_data_extrafield_fix():
    testrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'_id': '2', 'ColEx':4, 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b',
         'ColBool':False}]
    clean_data(testrows, vschema, remove_extra_fields=True)
    assert not('ColEx' in testrows[1])
    validate_data(testrows, vschema)
Example #11
def test_data_negative_int_count_fix():
    testrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'_id': '2', 'ColInt': -3, 'ColFloat':4.1, 'ColCat': 'b',
         'ColBool':False}]
    assert_raises(VeritableError, clean_data, testrows, vschema,
        remove_invalids=False)
    clean_data(testrows, vschema)
    assert not('ColInt' in testrows[1])
Example #12
def test_data_nonvalid_bool_boolean_fixfail():
    testrows = [{'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a',
        'ColBool':True}, {'_id': '2', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b',
        'ColBool': 'jello'}]
    assert_raises(VeritableError, validate_data, testrows, vschema)
    try:
        clean_data(testrows, vschema, remove_invalids=False)
    except VeritableError as e:
        assert e.row == 1
        assert e.col == 'ColBool'
Example #13
def test_data_non_bool_boolean_falsefix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': False
    }, {
        '_id': '2',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': '0'
    }, {
        '_id': '5',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'False'
    }, {
        '_id': '6',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'false'
    }, {
        '_id': '7',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'No'
    }, {
        '_id': '8',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'NO'
    }, {
        '_id': '9',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'N'
    }, {
        '_id': '10',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'n'
    }]
    clean_data(testrows, vschema)
    for r in testrows:
        assert r['ColBool'] == False
    validate_data(testrows, vschema)
Example #14
def test_data_negative_int_count_fixfail():
    testrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'_id': '2', 'ColInt': -3, 'ColFloat':4.1, 'ColCat': 'b',
         'ColBool':False}]
    assert_raises(VeritableError, clean_data, testrows, vschema,
        remove_invalids=False)
    try:
        clean_data(testrows, vschema, remove_invalids=False)
    except VeritableError as e:
        assert e.row == 1
        assert e.col == 'ColInt'
Example #15
def test_data_valid_rows_fix():
    refrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'_id': '2', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b',
         'ColBool':False},
        {'_id': '3'}]
    testrows = [
        {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
        {'_id': '2', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b',
         'ColBool':False},
        {'_id': '3'}]
    clean_data(testrows, vschema)
    assert testrows == refrows
Example #16
def test_data_non_bool_boolean_falsefix():
    testrows = [
    {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':False},
    {'_id': '2', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': '0'},
    {'_id': '5', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'False'},
    {'_id': '6', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'false'},
    {'_id': '7', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'No'},
    {'_id': '8', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'NO'},
    {'_id': '9', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'N'},
    {'_id': '10', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'n'}]
    clean_data(testrows, vschema)
    for r in testrows:
        assert r['ColBool'] == False
    validate_data(testrows, vschema)
Example #17
def test_data_too_many_cats_fix():
    eschema = {'ColCat': {'type': 'categorical'}}
    testrows = []
    rid = 0
    maxCols = 256
    for i in range(maxCols - 1):
        testrows.append({'_id': str(rid), 'ColCat': str(i)})
        testrows.append({'_id': str(rid + 1), 'ColCat': str(i)})
        rid = rid + 2
    testrows.append({'_id': str(rid), 'ColCat': str(maxCols - 1)})
    testrows.append({'_id': str(rid + 1), 'ColCat': str(maxCols)})
    clean_data(testrows, eschema)
    assert testrows[510]['ColCat'] == 'Other'
    assert testrows[511]['ColCat'] == 'Other'
    validate_data(testrows, eschema)
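Note: in this test, categories '0' through '254' each appear twice while '255' and '256' appear only once, so there are 257 distinct values against a limit of 256; clean_data therefore collapses the least-frequent values into 'Other', which is what the two assertions check. A minimal sketch of that capping behavior, assuming a keep-the-most-frequent policy (an illustration, not the library's implementation):

# Illustrative sketch only: keep the most frequent categories and fold the rest
# into 'Other' so that at most max_cats distinct values remain.
from collections import Counter

def cap_categories(rows, col, max_cats=256, other='Other'):
    counts = Counter(r[col] for r in rows if col in r)
    keep = {value for value, _ in counts.most_common(max_cats - 1)}
    for r in rows:
        if col in r and r[col] not in keep:
            r[col] = other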
Example #18
def test_data_non_bool_boolean_truefix():
    testrows = [
    {'_id': '1', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a', 'ColBool':True},
    {'_id': '2', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': '1'},
    {'_id': '4', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': '2'},
    {'_id': '5', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'True'},
    {'_id': '6', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'true'},
    {'_id': '7', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'Yes'},
    {'_id': '8', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'YES'},
    {'_id': '9', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'Y'},
    {'_id': '10', 'ColInt':4, 'ColFloat':4.1, 'ColCat': 'b', 'ColBool': 'y'}]
    clean_data(testrows, vschema)
    for r in testrows:
        assert r['ColBool'] == True
    validate_data(testrows, vschema)
Example #19
def test_data_missing_id_fix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }]
    clean_data(testrows, vschema, assign_ids=True)
    assert testrows[0]['_id'] != testrows[1]['_id']
    validate_data(testrows, vschema)
Example #20
def test_data_too_many_cats_fix():
    eschema = {
        'ColCat': {'type': 'categorical'}
    }
    testrows = []
    rid = 0
    maxCols = 256
    for i in range(maxCols - 1):
        testrows.append({'_id':str(rid), 'ColCat':str(i)})
        testrows.append({'_id':str(rid + 1), 'ColCat':str(i)})
        rid = rid + 2
    testrows.append({'_id':str(rid), 'ColCat':str(maxCols - 1)})
    testrows.append({'_id':str(rid + 1), 'ColCat':str(maxCols)})
    clean_data(testrows, eschema)
    assert testrows[510]['ColCat'] == 'Other'
    assert testrows[511]['ColCat'] == 'Other'
    validate_data(testrows, eschema)
Example #21
def test_data_int_count_limit_fix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': 100001,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }]
    clean_data(testrows, vschema)
    assert not ('ColInt' in testrows[1])
    validate_data(testrows, vschema)
Example #22
def test_data_nonefield_fix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': None,
        'ColBool': False
    }]
    clean_data(testrows, vschema)
    assert not ('ColCat' in testrows[1])
    validate_data(testrows, vschema)
Example #23
def test_data_non_int_count_fix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': '4',
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }]
    clean_data(testrows, vschema)
    assert testrows[1]['ColInt'] == 4
    validate_data(testrows, vschema)
Example #24
def test_data_nonstring_id_fix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': 2,
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }]
    clean_data(testrows, vschema)
    assert testrows[1]['_id'] == '2'
    validate_data(testrows, vschema)
Example #25
def test_data_nonvalid_bool_boolean_fix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'jello'
    }]
    clean_data(testrows, vschema)
    assert not ('ColBool' in testrows[1])
    validate_data(testrows, vschema)
Example #26
def test_data_extrafield_fix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColEx': 4,
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }]
    clean_data(testrows, vschema, remove_extra_fields=True)
    assert not ('ColEx' in testrows[1])
    validate_data(testrows, vschema)
Example #27
def test_write_read_csv():
    handle, filename = mkstemp()
    refrows = [{'_id': '7', 'ColInt':3, 'ColFloat':3.1, 'ColCat': 'a'},
               {'_id': '8', 'ColInt':4, 'ColCat': 'b', 'ColBool':False},
               {'_id': '9'}]
    write_csv(refrows, filename, dialect=csv.excel)
    testrows = read_csv(filename, dialect=csv.excel)
    cschema = {
        'ColInt': {'type': 'count'},
        'ColFloat': {'type': 'real'},
        'ColCat': {'type': 'categorical'},
        'ColBool': {'type': 'boolean'}
        }
    clean_data(testrows, cschema)
    assert len(testrows) == len(refrows)
    for i in range(len(testrows)):
        assert testrows[i] == refrows[i]
    os.remove(filename)
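Note: the rows here carry different sets of fields, so write_csv presumably leaves cells blank for missing values and clean_data drops the empty values that read_csv brings back, which is why the round trip compares equal. Under that assumption (and with no guarantee about column order), the intermediate file would look roughly like this:

# Assumed shape of the CSV produced by write_csv for the rows above (illustrative only).
expected_csv = """_id,ColInt,ColFloat,ColCat,ColBool
7,3,3.1,a,
8,4,,b,False
9,,,,
"""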
Example #28
def test_data_nonvalid_bool_boolean_fixfail():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': 4,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': 'jello'
    }]
    assert_raises(VeritableError, validate_data, testrows, vschema)
    try:
        clean_data(testrows, vschema, remove_invalids=False)
    except VeritableError as e:
        assert e.row == 1
        assert e.col == 'ColBool'
Example #29
def test_data_negative_int_count_fix():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': -3,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }]
    assert_raises(VeritableError,
                  clean_data,
                  testrows,
                  vschema,
                  remove_invalids=False)
    clean_data(testrows, vschema)
    assert not ('ColInt' in testrows[1])
Example #30
def test_data_negative_int_count_fixfail():
    testrows = [{
        '_id': '1',
        'ColInt': 3,
        'ColFloat': 3.1,
        'ColCat': 'a',
        'ColBool': True
    }, {
        '_id': '2',
        'ColInt': -3,
        'ColFloat': 4.1,
        'ColCat': 'b',
        'ColBool': False
    }]
    assert_raises(VeritableError,
                  clean_data,
                  testrows,
                  vschema,
                  remove_invalids=False)
    try:
        clean_data(testrows, vschema, remove_invalids=False)
    except VeritableError as e:
        assert e.row == 1
        assert e.col == 'ColInt'
Example #31
def main():

    ##########
    # UPLOAD #
    ##########

    # 1. Define the schema for the table - specify column names and data types
    table_schema = {
        'age': {'type': 'count'},
        'sex': {'type': 'categorical'},
        'region': {'type': 'categorical'},
        'income': {'type': 'real'},
        'married': {'type': 'boolean'},
        'children': {'type': 'count'},
        'car': {'type': 'boolean'},
        'save_act': {'type': 'boolean'},
        'current_act': {'type': 'boolean'},
        'mortgage': {'type': 'boolean'},
        'pep': {'type': 'boolean'},
    }

    # 2. Load the data from csv and divide it into training and test subsets
    rows = read_csv(DATA_FILE)                                  # Load rows from CSV, returns all row data values as strings
    clean_data(rows, table_schema)                               # Convert row data values to correct types based on schema
    training_rows, test_rows = split_rows(rows, TRAIN_FRAC)     # Split into training and test sets

    # 3. Connect to the Veritable API
    api = veritable.connect()
    if api.table_exists(TABLE_ID):
        print("Deleting old table '%s'" %TABLE_ID)
        api.delete_table(TABLE_ID)

    # 4. Create a Veritable Table and upload training rows
    print("Creating table '%s' and uploading rows" %TABLE_ID)
    table = api.create_table(table_id=TABLE_ID)
    table.batch_upload_rows(training_rows)



    ###########
    # ANALYZE #
    ###########

    # 5. Create a Veritable Analysis and wait for it to complete
    print("Creating analysis '%s' and waiting for it to complete" %ANALYSIS_ID)
    analysis = table.create_analysis(schema=table_schema, analysis_id=ANALYSIS_ID)
    analysis.wait()



    ###########
    # PREDICT #
    ###########


    # 6. For each row in the test set, predict the value and uncertainty for the target column
    print("Making predictions")

    prediction_results = []
    for test_row in test_rows:
        # Prepare the prediction request
        prediction_request = test_row.copy()        # Copy known values from test row
        del prediction_request['_id']               # '_id' should not be present in prediction requests
        prediction_request[TARGET_COL] = None       # None values are predicted by Veritable

        # Make predictions
        prediction = analysis.predict(prediction_request, PRED_COUNT)

        # Derive a single value estimate and uncertainty metric
        estimate = prediction[TARGET_COL]
        uncertainty = prediction.uncertainty[TARGET_COL]

        # Compare estimate to actual value from test row
        is_correct = (estimate == test_row[TARGET_COL])

        # Collect results
        prediction_results.append({'is_correct': is_correct, 'uncertainty': uncertainty})


    # 7. Evaluate prediction accuracy using different maximum uncertainty thresholds
    for maximum_uncertainty in MAXIMUM_UNCERTAINTY_THRESHOLDS:
        # Treat prediction results as unknown if uncertainty is above the maximum_uncertainty threshold
        unknown_prediction_results = [r for r in prediction_results if r['uncertainty'] > maximum_uncertainty]
        unknown_count = len(unknown_prediction_results)

        # Only look at prediction results if uncertainty is below the maximum_uncertainty threshold
        known_prediction_results = [r for r in prediction_results if r['uncertainty'] <= maximum_uncertainty]
        known_count = len(known_prediction_results)

        # Identify prediction results we looked at that are correct
        known_correct_prediction_results = [r for r in known_prediction_results if r['is_correct']]
        known_correct_count = len(known_correct_prediction_results)

        print( "Predictions for {0} are {1:.0%} ({2}/{3}) correct with {4:.0%} ({5}/{6}) ignored using a maximum uncertainty of {7}".format(
                    TARGET_COL,
                    0.0 if known_count == 0 else float(known_correct_count) / known_count,
                    known_correct_count,
                    known_count,
                    float(unknown_count) / (known_count+unknown_count),
                    unknown_count,
                    known_count+unknown_count,
                    maximum_uncertainty ) )
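Note: this script relies on imports and module-level constants that the listing omits. A plausible preamble is sketched below; only the names are taken from the code above, and the values are placeholders, not the original configuration:

# Assumed preamble; the constant values are placeholders.
import veritable
from veritable.utils import read_csv, clean_data, split_rows

DATA_FILE = 'bank-data.csv'        # placeholder path to the input CSV
TABLE_ID = 'bank-example'          # placeholder table id
ANALYSIS_ID = 'bank-analysis'      # placeholder analysis id
TARGET_COL = 'pep'                 # column to predict; any boolean column in the schema
TRAIN_FRAC = 0.8                   # placeholder fraction of rows used for training
PRED_COUNT = 100                   # placeholder number of predictions to request
MAXIMUM_UNCERTAINTY_THRESHOLDS = [0.1, 0.3, 0.5, 1.0]  # placeholder thresholds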
Example #32
def main():

    ##########
    # UPLOAD #
    ##########

    # 1. Define the schema for the table - specify column names and data types
    table_schema = {
        'age': {
            'type': 'count'
        },
        'sex': {
            'type': 'categorical'
        },
        'region': {
            'type': 'categorical'
        },
        'income': {
            'type': 'real'
        },
        'married': {
            'type': 'boolean'
        },
        'children': {
            'type': 'count'
        },
        'car': {
            'type': 'boolean'
        },
        'save_act': {
            'type': 'boolean'
        },
        'current_act': {
            'type': 'boolean'
        },
        'mortgage': {
            'type': 'boolean'
        },
        'pep': {
            'type': 'boolean'
        },
    }

    # 2. Load the data from csv and divide it into training and test subsets
    rows = read_csv(DATA_FILE)  # Load rows from CSV, returns all row data values as strings
    clean_data(rows, table_schema)  # Convert row data values to correct types based on schema
    training_rows, test_rows = split_rows(rows, TRAIN_FRAC)  # Split into training and test sets

    # 3. Connect to the Veritable API
    api = veritable.connect()
    if api.table_exists(TABLE_ID):
        print("Deleting old table '%s'" % TABLE_ID)
        api.delete_table(TABLE_ID)

    # 4. Create a Veritable Table and upload training rows
    print("Creating table '%s' and uploading rows" % TABLE_ID)
    table = api.create_table(table_id=TABLE_ID)
    table.batch_upload_rows(training_rows)

    ###########
    # ANALYZE #
    ###########

    # 5. Create a Veritable Analysis and wait for it to complete
    print("Creating analysis '%s' and waiting for it to complete" %
          ANALYSIS_ID)
    analysis = table.create_analysis(schema=table_schema,
                                     analysis_id=ANALYSIS_ID)
    analysis.wait()

    ###########
    # PREDICT #
    ###########

    # 6. For each row in the test set, predict the value and uncertainty for the target column
    print("Making predictions")

    prediction_results = []
    for test_row in test_rows:
        # Prepare the prediction request
        prediction_request = test_row.copy()  # Copy known values from test row
        del prediction_request['_id']  # '_id' should not be present in prediction requests
        prediction_request[TARGET_COL] = None  # None values are predicted by Veritable

        # Make predictions
        prediction = analysis.predict(prediction_request, PRED_COUNT)

        # Derive a single value estimate and uncertainty metric
        estimate = prediction[TARGET_COL]
        uncertainty = prediction.uncertainty[TARGET_COL]

        # Compare estimate to actual value from test row
        is_correct = (estimate == test_row[TARGET_COL])

        # Collect results
        prediction_results.append({
            'is_correct': is_correct,
            'uncertainty': uncertainty
        })

    # 7. Evaluate prediction accuracy using different maximum uncertainty thresholds
    for maximum_uncertainty in MAXIMUM_UNCERTAINTY_THRESHOLDS:
        # Treat prediction results as unknown if uncertainty is above the maximum_uncertainty threshold
        unknown_prediction_results = [
            r for r in prediction_results
            if r['uncertainty'] > maximum_uncertainty
        ]
        unknown_count = len(unknown_prediction_results)

        # Only look at prediction results if uncertainty is below the maximum_uncertainty threshold
        known_prediction_results = [
            r for r in prediction_results
            if r['uncertainty'] <= maximum_uncertainty
        ]
        known_count = len(known_prediction_results)

        # Identify prediction results we looked at that are correct
        known_correct_prediction_results = [
            r for r in known_prediction_results if r['is_correct']
        ]
        known_correct_count = len(known_correct_prediction_results)

        print(
            "Predictions for {0} are {1:.0%} ({2}/{3}) correct with {4:.0%} ({5}/{6}) ignored using a maximum uncertainty of {7}".format(
                TARGET_COL,
                0.0 if known_count == 0 else float(known_correct_count) / known_count,
                known_correct_count,
                known_count,
                float(unknown_count) / (known_count + unknown_count),
                unknown_count,
                known_count + unknown_count,
                maximum_uncertainty))
Example #33
def main():
    API = veritable.connect(ssl_verify=False)

    print("Loading and preparing data...")
    # load the data and schema describing all column datatypes
    with open(DATA_FILE, 'rb') as fd:
        data = json.loads(fd.read())

    with open(SCHEMA_FILE, 'rb') as fd:
        master_schema = json.loads(fd.read())

    # divide the data into a training and test set, and ensure data is
    # of the correct type for each column
    train_data, test_data = split_rows(data, .8)
    clean_data(train_data, master_schema, remove_extra_fields=True,
        assign_ids=True)

    # we have to account for the possibility that the training data doesn't
    # contain all of the columns in the master schema
    schema = subset_schema(master_schema, train_data)

    # use the subset of the schema to clean the test data - make sure we don't
    # condition test predictions on columns or categorical values that aren't
    # present in the training data
    clean_data(test_data, schema, remove_extra_fields=True, assign_ids=True)
    validate_test_categoricals(test_data, train_data, schema)

    # we'll run the analysis twice: one with the original multinomial target
    # column, and once converting it to a binary column
    def binary_transform(x):
        transform = {'0': False, '1': True, '2': True, '3': True, '4': True}
        return transform[x]

    # make the binary dataset and schema
    binary_train_data = deepcopy(train_data)
    binary_test_data = deepcopy(test_data)
    binary_schema = deepcopy(schema)
    binary_schema['target']['type'] = 'boolean'
    for d in (binary_train_data, binary_test_data):
        for r in d:
            if 'target' in r:
                r['target'] = binary_transform(r['target'])

    # delete existing tables if present
    if API.table_exists(TABLE_ID):
        print("Deleting old table '%s'" %TABLE_ID)
        API.delete_table(TABLE_ID)
    if API.table_exists(TABLE_ID+"-binary"):
        print("Deleting old table '%s'" %(TABLE_ID+"-binary"))
        API.delete_table(TABLE_ID+"-binary")

    # upload the data and start the analyses
    print("Uploading data and running analyses...")
    table = API.create_table(TABLE_ID)
    table.batch_upload_rows(train_data)
    analysis = table.create_analysis(schema)

    binary_table = API.create_table(TABLE_ID+"-binary")
    binary_table.batch_upload_rows(binary_train_data)
    binary_analysis = binary_table.create_analysis(binary_schema)

    # now we'll make predictions for each test row, collecting the
    # predicted values for the target column
    analysis.wait()
    print("Making predictions....")
    results = predict_known_target_column(test_data, analysis, schema,
        'target')

    # and for the binary table
    binary_analysis.wait()
    binary_results = predict_known_target_column(binary_test_data,
        binary_analysis, binary_schema, 'target')

    # summarize the results
    print("multinomial dataset, raw predictions: " \
    "{0}% test error".format(test_error(results, 'target') * 100))
    print("multinomial dataset, binary transform: " \
    "{0}% test error".format(test_error(results, 'target',
        transform=binary_transform) * 100))
    print("binary dataset, raw predictions: " \
    "{0}% test error".format(test_error(binary_results, 'target') * 100))