Example #1
 def setup_class(self):
     self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL,
                                  **connect_kwargs)
     self.t = self.API.create_table()
     self.t.batch_upload_rows([{
         '_id': 'row1',
         'cat': 'a',
         'ct': 0,
         'real': 1.02394,
         'bool': True
     }, {
         '_id': 'row2',
         'cat': 'b',
         'ct': 0,
         'real': 0.92131,
         'bool': False
     }, {
         '_id': 'row3',
         'cat': 'c',
         'ct': 1,
         'real': 1.82812,
         'bool': True
     }, {
         '_id': 'row4',
         'cat': 'c',
         'ct': 1,
         'real': 0.81271,
         'bool': True
     }, {
         '_id': 'row5',
         'cat': 'd',
         'ct': 2,
         'real': 1.14561,
         'bool': False
     }, {
         '_id': 'row6',
         'cat': 'a',
         'ct': 5,
         'real': 1.03412,
         'bool': False
     }])
     self.schema = {
         'cat': {
             'type': 'categorical'
         },
         'ct': {
             'type': 'count'
         },
         'real': {
             'type': 'real'
         },
         'bool': {
             'type': 'boolean'
         }
     }
     self.a = self.t.create_analysis(self.schema,
                                     analysis_id="a1",
                                     force=True)
     self.a.wait()
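Once a fixture like this has run, the analysis can be queried the same way the later examples on this page do, by passing a row with the target column set to None. A minimal sketch of such a test (not from the actual test suite; the asserted behaviour is an assumption):

 def test_predict_missing_real(self):
     # Hypothetical test: ask the analysis to fill in the 'real' column.
     # None marks the value to be predicted, as in the predict examples below.
     row = {'cat': 'a', 'ct': 0, 'real': None, 'bool': True}
     prediction = self.a.predict(row)
     # Assumption: the returned prediction exposes the filled-in value by
     # column name, as the other examples on this page do.
     assert prediction['real'] is not None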
Example #2
def predict():
	API_KEY = '<API_KEY>'
	api = veritable.connect(api_key=API_KEY)
	table = api.get_table('heart_data')
	analysis = table.get_analysis('heart_analysis')
	# request.args keys() is not directly indexable in Python 3, so materialise
	# it as a list; the whole JSON document is expected to arrive as a single
	# query-string key.
	foo = list(request.args.to_dict().keys())
	new_patient = sj.loads(foo[0])
	del new_patient['']
	for k in new_patient:
		new_patient[k] = float(new_patient[k])
	new_patient['num'] = None
	prediction = analysis.predict(new_patient)
	return jsonify(predicted=prediction['num'])
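The route above assumes Flask scaffolding and imports that are not shown in this snippet. A minimal sketch of the module-level setup it would need (the route URL and the simplejson alias are assumptions):

from flask import Flask, request, jsonify
import simplejson as sj   # assumed: the snippet parses the incoming JSON with sj.loads
import veritable

app = Flask(__name__)

# the predict() view above would then be registered with something like:
# @app.route('/predict')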
 def setup_class(self):
     self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL,
         **connect_kwargs)
     self.t = self.API.create_table()
     self.t.batch_upload_rows(
     [{'_id': 'row1', 'cat': 'a', 'ct': 0, 'real': 1.02394, 'bool': True},
      {'_id': 'row2', 'cat': 'b', 'ct': 0, 'real': 0.92131, 'bool': False},
      {'_id': 'row3', 'cat': 'c', 'ct': 1, 'real': 1.82812, 'bool': True},
      {'_id': 'row4', 'cat': 'c', 'ct': 1, 'real': 0.81271, 'bool': True},
      {'_id': 'row5', 'cat': 'd', 'ct': 2, 'real': 1.14561, 'bool': False},
      {'_id': 'row6', 'cat': 'a', 'ct': 5, 'real': 1.03412, 'bool': False}
     ])
     self.connection = self.t._conn
     self.collection = self.t._link("rows")
Example #4
def main(data_file, schema_file):
    rows = json.loads(open(data_file).read())
    schema = json.loads(open(schema_file).read())
    
    api = veritable.connect()
    
    if not api.table_exists(TABLE_NAME):
        print('Creating table')
        table = api.create_table(TABLE_NAME)
    else:
        print('Getting table')
        table = api.get_table(TABLE_NAME)

    print('Uploading rows')
    table.batch_upload_rows(rows)
    print('Creating analysis')
    analysis = table.create_analysis(schema)
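The analysis in this example is created asynchronously and the function returns without waiting for it. A possible follow-up, assuming the wait/predict pattern used by the other examples on this page (the choice of target column here is purely illustrative):

    # Block until the analysis has finished, then predict a value for one of
    # the schema's columns (picking the first column is a hypothetical choice).
    analysis.wait()
    target = sorted(schema.keys())[0]
    prediction = analysis.predict({target: None})
    print('Predicted %s: %s' % (target, prediction[target]))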
 def setup_class(self):
     self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL,
                                  **connect_kwargs)
     self.t = self.API.create_table()
     self.t.batch_upload_rows([{
         '_id': 'row1',
         'cat': 'a',
         'ct': 0,
         'real': 1.02394,
         'bool': True
     }, {
         '_id': 'row2',
         'cat': 'b',
         'ct': 0,
         'real': 0.92131,
         'bool': False
     }, {
         '_id': 'row3',
         'cat': 'c',
         'ct': 1,
         'real': 1.82812,
         'bool': True
     }, {
         '_id': 'row4',
         'cat': 'c',
         'ct': 1,
         'real': 0.81271,
         'bool': True
     }, {
         '_id': 'row5',
         'cat': 'd',
         'ct': 2,
         'real': 1.14561,
         'bool': False
     }, {
         '_id': 'row6',
         'cat': 'a',
         'ct': 5,
         'real': 1.03412,
         'bool': False
     }])
     self.connection = self.t._conn
     self.collection = self.t._link("rows")
 def setup_class(self):
     self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL,
         **connect_kwargs)
     self.t = self.API.create_table()
     self.rows = [{'_id': 'row1', 'cat': 'a', 'ct': 0, 'real': 1.02394, 'bool': True},
      {'_id': 'row2', 'cat': 'b', 'ct': 0, 'real': 0.92131, 'bool': False},
      {'_id': 'row3', 'cat': 'c', 'ct': 1, 'real': 1.82812, 'bool': True},
      {'_id': 'row4', 'cat': 'c', 'ct': 1, 'real': 0.81271, 'bool': True},
      {'_id': 'row5', 'cat': 'd', 'ct': 2, 'real': 1.14561, 'bool': False},
      {'_id': 'row6', 'cat': 'a', 'ct': 5, 'real': 1.03412, 'bool': False}]
     self.t.batch_upload_rows(self.rows)
     self.schema = {'cat': {'type': 'categorical'},
               'ct': {'type': 'count'},
               'real': {'type': 'real'},
               'bool': {'type': 'boolean'}
               }
     self.a = self.t.create_analysis(self.schema, analysis_id="a1",
         force=True)
     self.a.wait()
def main():

    ##########
    # UPLOAD #
    ##########

    # 1. Define the schema for the table - specify column names and data types
    table_schema = {
        'age': {
            'type': 'count'
        },
        'sex': {
            'type': 'categorical'
        },
        'region': {
            'type': 'categorical'
        },
        'income': {
            'type': 'real'
        },
        'married': {
            'type': 'boolean'
        },
        'children': {
            'type': 'count'
        },
        'car': {
            'type': 'boolean'
        },
        'save_act': {
            'type': 'boolean'
        },
        'current_act': {
            'type': 'boolean'
        },
        'mortgage': {
            'type': 'boolean'
        },
        'pep': {
            'type': 'boolean'
        },
    }

    # 2. Load the data from csv and divide it into training and test subsets
    # Load rows from CSV; all row data values come back as strings
    rows = read_csv(DATA_FILE)
    # Convert row data values to the correct types based on the schema
    clean_data(rows, table_schema)
    # Split into training and test sets
    training_rows, test_rows = split_rows(rows, TRAIN_FRAC)

    # 3. Connect to the Veritable API
    api = veritable.connect()
    if api.table_exists(TABLE_ID):
        print("Deleting old table '%s'" % TABLE_ID)
        api.delete_table(TABLE_ID)

    # 4. Create a Veritable Table and upload training rows
    print("Creating table '%s' and uploading rows" % TABLE_ID)
    table = api.create_table(table_id=TABLE_ID)
    table.batch_upload_rows(training_rows)

    ###########
    # ANALYZE #
    ###########

    # 5. Create a Veritable Analysis and wait for it to complete
    print("Creating analysis '%s' and waiting for it to complete" %
          ANALYSIS_ID)
    analysis = table.create_analysis(schema=table_schema,
                                     analysis_id=ANALYSIS_ID)
    analysis.wait()

    ###########
    # PREDICT #
    ###########

    # 6. For each row in the test set, predict the value and uncertainty for the target column
    print("Making predictions")

    prediction_results = []
    for test_row in test_rows:
        # Prepare the prediction request
        prediction_request = test_row.copy()  # Copy known values from test row
        del prediction_request['_id']          # '_id' should not be present in prediction requests
        prediction_request[TARGET_COL] = None  # None values are predicted by Veritable

        # Make predictions
        prediction = analysis.predict(prediction_request, PRED_COUNT)

        # Derive a single value estimate and uncertainty metric
        estimate = prediction[TARGET_COL]
        uncertainty = prediction.uncertainty[TARGET_COL]

        # Compare estimate to actual value from test row
        is_correct = (estimate == test_row[TARGET_COL])

        # Collect results
        prediction_results.append({
            'is_correct': is_correct,
            'uncertainty': uncertainty
        })

    # 7. Evaluate prediction accuracy using different maximum uncertainty thresholds
    for maximum_uncertainty in MAXIMUM_UNCERTAINTY_THRESHOLDS:
        # Treat prediction results as unknown if uncertainty is above the maximum_uncertainty threshold
        unknown_prediction_results = [
            r for r in prediction_results
            if r['uncertainty'] > maximum_uncertainty
        ]
        unknown_count = len(unknown_prediction_results)

        # Only look at prediction results if uncertainty is below the maximum_uncertainty threshold
        known_prediction_results = [
            r for r in prediction_results
            if r['uncertainty'] <= maximum_uncertainty
        ]
        known_count = len(known_prediction_results)

        # Identify prediction results we looked at that are correct
        known_correct_prediction_results = [
            r for r in known_prediction_results if r['is_correct']
        ]
        known_correct_count = len(known_correct_prediction_results)

        print(
            "Predictions for {0} are {1:.0%} ({2}/{3}) correct with {4:.0%} "
            "({5}/{6}) ignored using a maximum uncertainty of {7}".format(
                TARGET_COL,
                0.0 if known_count == 0 else float(known_correct_count) / known_count,
                known_correct_count,
                known_count,
                float(unknown_count) / (known_count + unknown_count),
                unknown_count,
                known_count + unknown_count,
                maximum_uncertainty))
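This script relies on helper functions (read_csv, clean_data, split_rows) defined elsewhere in the example project. As a rough guide, a minimal sketch of what split_rows might look like, assuming it simply shuffles the rows and splits them at the given fraction:

import random

def split_rows(rows, train_frac):
    # Assumed behaviour: shuffle a copy of the rows, then cut the list so that
    # roughly train_frac of the rows land in the training subset and the
    # remainder in the test subset.
    shuffled = list(rows)
    random.shuffle(shuffled)
    cutoff = int(len(shuffled) * train_frac)
    return shuffled[:cutoff], shuffled[cutoff:]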
Example #8
def main():

    ##########
    # UPLOAD #
    ##########

    # 1. Define the schema for the table - specify column names and data types
    table_schema = {
        'age': {'type': 'count'},
        'sex': {'type': 'categorical'},
        'region': {'type': 'categorical'},
        'income': {'type': 'real'},
        'married': {'type': 'boolean'},
        'children': {'type': 'count'},
        'car': {'type': 'boolean'},
        'save_act': {'type': 'boolean'},
        'current_act': {'type': 'boolean'},
        'mortgage': {'type': 'boolean'},
        'pep': {'type': 'boolean'},
    }

    # 2. Load the data from csv and divide it into training and test subsets
    rows = read_csv(DATA_FILE)                                  # Load rows from CSV, returns all row data values as strings
    clean_data(rows, table_schema)                               # Convert row data values to correct types based on schema
    training_rows, test_rows = split_rows(rows, TRAIN_FRAC)     # Split into training and test sets

    # 3. Connect to the Veritable API
    api = veritable.connect()
    if api.table_exists(TABLE_ID):
        print("Deleting old table '%s'" %TABLE_ID)
        api.delete_table(TABLE_ID)

    # 4. Create a Veritable Table and upload training rows
    print("Creating table '%s' and uploading rows" %TABLE_ID)
    table = api.create_table(table_id=TABLE_ID)
    table.batch_upload_rows(training_rows)



    ###########
    # ANALYZE #
    ###########

    # 5. Create a Veritable Analysis and wait for it to complete
    print("Creating analysis '%s' and waiting for it to complete" %ANALYSIS_ID)
    analysis = table.create_analysis(schema=table_schema, analysis_id=ANALYSIS_ID)
    analysis.wait()



    ###########
    # PREDICT #
    ###########


    # 6. For each row in the test set, predict the value and uncertainty for the target column
    print("Making predictions")

    prediction_results = []
    for test_row in test_rows:
        # Prepare the prediction request
        prediction_request = test_row.copy()        # Copy known values from test row
        del prediction_request['_id']               # '_id' should not be present in prediction requests
        prediction_request[TARGET_COL] = None       # None values are predicted by Veritable

        # Make predictions
        prediction = analysis.predict(prediction_request, PRED_COUNT)

        # Derive a single value estimate and uncertainty metric
        estimate = prediction[TARGET_COL]
        uncertainty = prediction.uncertainty[TARGET_COL]

        # Compare estimate to actual value from test row
        is_correct = (estimate == test_row[TARGET_COL])

        # Collect results
        prediction_results.append({'is_correct': is_correct, 'uncertainty': uncertainty})


    # 7. Evaluate prediction accuracy using different maximum uncertainty thresholds
    for maximum_uncertainty in MAXIMUM_UNCERTAINTY_THRESHOLDS:
        # Treat prediction results as unknown if uncertainty is above the maximum_uncertainty threshold
        unknown_prediction_results = [r for r in prediction_results if r['uncertainty'] > maximum_uncertainty]
        unknown_count = len(unknown_prediction_results)

        # Only look at prediction results if uncertainty is below the maximum_uncertainty threshold
        known_prediction_results = [r for r in prediction_results if r['uncertainty'] <= maximum_uncertainty]
        known_count = len(known_prediction_results)

        # Identify prediction results we looked at that are correct
        known_correct_prediction_results = [r for r in known_prediction_results if r['is_correct']]
        known_correct_count = len(known_correct_prediction_results)

        print( "Predictions for {0} are {1:.0%} ({2}/{3}) correct with {4:.0%} ({5}/{6}) ignored using a maximum uncertainty of {7}".format(
                    TARGET_COL,
                    0.0 if known_count == 0 else float(known_correct_count) / known_count,
                    known_correct_count,
                    known_count,
                    float(unknown_count) / (known_count+unknown_count),
                    unknown_count,
                    known_count+unknown_count,
                    maximum_uncertainty ) )
 def setup_class(self):
     self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL,
         **connect_kwargs)
Example #10

MIN_RATINGS = 100
TABLE_NAME = 'movielens'

# load the item metadata from a static file; in a production system this
# information would likely come from a database
ITEMS = [it 
    for it in json.loads(open('recs/static/movie_descriptions.json').read())
    if it['num_ratings'] > MIN_RATINGS]
ITEMS.sort(key=lambda x: x['name'])
ITEM_NAMES = dict([(m['id'], m['name']) for m in ITEMS])

# connect to the Veritable API and perform baseline predictions; this will
# allow us to compute rating "lift" later
api = veritable.connect()
analysis = get_last_successful_analysis(api, TABLE_NAME)
baselines = get_baselines(analysis, ITEMS)
app = Flask(__name__)


def item_filter(per_item_preds, baseline_val):
    '''
    Decides whether an item should be considered for inclusion in the 
    recommendations. This version requires that three conditions be met:
    
    1. The item must have a reasonably high predicted rating (per_item_mean > 3.)
    2. The predictions must indicate that the user will rate the item higher than
       its baseline rating (lift > .2)
    3. The predictions must not be too uncertain regarding the positive 
       lift (conf > .75)
    '''
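The implementation that follows this docstring is not shown in this listing. A hypothetical reconstruction that satisfies the three documented conditions, assuming per_item_preds is a list of sampled rating predictions for one item, lift is the mean prediction minus the baseline, and conf is the fraction of samples above the baseline (all assumptions, not the original code):

def item_filter(per_item_preds, baseline_val):
    # Mean of the sampled predicted ratings for this item (assumption about
    # the structure of per_item_preds).
    per_item_mean = sum(per_item_preds) / float(len(per_item_preds))
    # Lift over the item's baseline rating (assumption).
    lift = per_item_mean - baseline_val
    # Confidence in a positive lift: fraction of samples above the baseline
    # (assumption).
    conf = sum(1. for p in per_item_preds if p > baseline_val) / len(per_item_preds)
    return per_item_mean > 3. and lift > .2 and conf > .75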
 def test_create_api_with_enable_gzip_false(self):
     veritable.connect(TEST_API_KEY, TEST_BASE_URL, enable_gzip=False,
         **connect_kwargs)
 def test_create_api_with_ssl_verify_false(self):
     veritable.connect(TEST_API_KEY, TEST_BASE_URL, ssl_verify=False,
         **connect_kwargs)
 def test_create_api_with_debug(self):
     veritable.connect(TEST_API_KEY, TEST_BASE_URL, debug=True,
         **connect_kwargs)
 def test_print_connection(self):
     api = veritable.connect(TEST_API_KEY, TEST_BASE_URL,
         **connect_kwargs)
     print(api._conn)
 def test_create_api(self):
     veritable.connect(TEST_API_KEY, TEST_BASE_URL, **connect_kwargs)
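Each of the tests above exercises a single keyword option of veritable.connect. For reference, a sketch combining the same options in one call (the option names come from the tests above; combining them like this is an assumption):

 def test_create_api_with_all_options(self):
     api = veritable.connect(TEST_API_KEY, TEST_BASE_URL, enable_gzip=False,
         ssl_verify=False, debug=True, **connect_kwargs)
     print(api._conn)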
Example #16
 def setup_class(self):
     self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL,
                                  **connect_kwargs)
import heart_disease.run
import random
import veritable

API = veritable.connect()


def test_example():
    heart_disease.run.TABLE_ID = 'heart-disease-example_' + str(
        random.randint(0, 100000000))
    heart_disease.run.main()
    API.delete_table(heart_disease.run.TABLE_ID)
    API.delete_table(heart_disease.run.TABLE_ID + "-binary")
def main():
    API = veritable.connect(ssl_verify=False)

    print("Loading and preparing data...")
    # load the data and schema describing all column datatypes
    with open(DATA_FILE, 'rb') as fd:
        data = json.loads(fd.read())

    with open(SCHEMA_FILE, 'rb') as fd:
        master_schema = json.loads(fd.read())

    # divide the data into a training and test set, and ensure data is
    # of the correct type for each column
    train_data, test_data = split_rows(data, .8)
    clean_data(train_data, master_schema, remove_extra_fields=True,
        assign_ids=True)

    # we have to account for the possibility that the training data doesn't
    # contain all of the columns in the master schema
    schema = subset_schema(master_schema, train_data)

    # use the subset of the schema to clean the test data - make sure we don't
    # condition test predictions on columns or categorical values that aren't
    # present in the training data
    clean_data(test_data, schema, remove_extra_fields=True, assign_ids=True)
    validate_test_categoricals(test_data, train_data, schema)

    # we'll run the analysis twice: one with the original multinomial target
    # column, and once converting it to a binary column
    def binary_transform(x):
        transform = {'0': False, '1': True, '2': True, '3': True, '4': True}
        return transform[x]

    # make the binary dataset and schema
    binary_train_data = deepcopy(train_data)
    binary_test_data = deepcopy(test_data)
    binary_schema = deepcopy(schema)
    binary_schema['target']['type'] = 'boolean'
    for d in (binary_train_data, binary_test_data):
        for r in d:
            if 'target' in r:
                r['target'] = binary_transform(r['target'])

    # delete existing tables if present
    if API.table_exists(TABLE_ID):
        print("Deleting old table '%s'" %TABLE_ID)
        API.delete_table(TABLE_ID)
    if API.table_exists(TABLE_ID+"-binary"):
        print("Deleting old table '%s'" %(TABLE_ID+"-binary"))
        API.delete_table(TABLE_ID+"-binary")

    # upload the data and start the analyses
    print("Uploading data and running analyses...")
    table = API.create_table(TABLE_ID)
    table.batch_upload_rows(train_data)
    analysis = table.create_analysis(schema)

    binary_table = API.create_table(TABLE_ID+"-binary")
    binary_table.batch_upload_rows(binary_train_data)
    binary_analysis = binary_table.create_analysis(binary_schema)

    # now we'll make predictions for each test row, collecting the
    # predicted values for the target column
    analysis.wait()
    print("Making predictions....")
    results = predict_known_target_column(test_data, analysis, schema,
        'target')

    # and for the binary table
    binary_analysis.wait()
    binary_results = predict_known_target_column(binary_test_data,
        binary_analysis, binary_schema, 'target')

    # summarize the results
    print("multinomial dataset, raw predictions: " \
    "{0}% test error".format(test_error(results, 'target') * 100))
    print("multinomial dataset, binary transform: " \
    "{0}% test error".format(test_error(results, 'target',
        transform=binary_transform) * 100))
    print("binary dataset, raw predictions: " \
    "{0}% test error".format(test_error(binary_results, 'target') * 100))
import heart_disease.run
import random
import veritable

API = veritable.connect()


def test_example():
    heart_disease.run.TABLE_ID = 'heart-disease-example_'+str(random.randint(0, 100000000))
    heart_disease.run.main()
    API.delete_table(heart_disease.run.TABLE_ID)
    API.delete_table(heart_disease.run.TABLE_ID+"-binary")