def setup_class(self):
    """Build a small sample table and run an analysis to completion.

    Leaves ``self.t`` (table), ``self.schema``, and ``self.a`` (finished
    analysis) available to the tests in this class.
    """
    self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL, **connect_kwargs)
    self.t = self.API.create_table()
    sample_rows = [
        {'_id': 'row1', 'cat': 'a', 'ct': 0, 'real': 1.02394, 'bool': True},
        {'_id': 'row2', 'cat': 'b', 'ct': 0, 'real': 0.92131, 'bool': False},
        {'_id': 'row3', 'cat': 'c', 'ct': 1, 'real': 1.82812, 'bool': True},
        {'_id': 'row4', 'cat': 'c', 'ct': 1, 'real': 0.81271, 'bool': True},
        {'_id': 'row5', 'cat': 'd', 'ct': 2, 'real': 1.14561, 'bool': False},
        {'_id': 'row6', 'cat': 'a', 'ct': 5, 'real': 1.03412, 'bool': False},
    ]
    self.t.batch_upload_rows(sample_rows)
    # One column of each supported datatype.
    self.schema = {
        'cat': {'type': 'categorical'},
        'ct': {'type': 'count'},
        'real': {'type': 'real'},
        'bool': {'type': 'boolean'},
    }
    self.a = self.t.create_analysis(self.schema, analysis_id="a1", force=True)
    self.a.wait()
def predict():
    """Flask endpoint: predict the 'num' column for a patient record.

    The client encodes the patient record as a JSON string that arrives as
    the sole query-parameter *key*, so the record is recovered from the first
    key of ``request.args``.

    Returns:
        A JSON response ``{"predicted": <value>}`` with the predicted 'num'.
    """
    API_KEY = '<API_KEY>'
    api = veritable.connect(api_key=API_KEY)
    table = api.get_table('heart_data')
    analysis = table.get_analysis('heart_analysis')
    # BUG FIX: dict.keys() is a non-subscriptable view in Python 3, so the
    # original `keys()[0]` raised TypeError. next(iter(...)) works on both
    # Python 2 and 3.
    raw_record = next(iter(request.args.to_dict().keys()))
    new_patient = sj.loads(raw_record)
    # NOTE(review): presumably the request's trailing separator yields an
    # empty-string key in the decoded dict; this raises KeyError when ''
    # is absent — confirm against the client, or use pop('', None).
    del new_patient['']
    # All remaining fields are numeric strings; coerce them for the API.
    for k in new_patient:
        new_patient[k] = float(new_patient[k])
    # A None value marks the column Veritable should predict.
    new_patient['num'] = None
    prediction = analysis.predict(new_patient)
    return jsonify(predicted=prediction['num'])
def setup_class(self):
    """Create a sample table and expose its private connection/link handles."""
    self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL, **connect_kwargs)
    self.t = self.API.create_table()
    rows = []
    rows.append({'_id': 'row1', 'cat': 'a', 'ct': 0, 'real': 1.02394, 'bool': True})
    rows.append({'_id': 'row2', 'cat': 'b', 'ct': 0, 'real': 0.92131, 'bool': False})
    rows.append({'_id': 'row3', 'cat': 'c', 'ct': 1, 'real': 1.82812, 'bool': True})
    rows.append({'_id': 'row4', 'cat': 'c', 'ct': 1, 'real': 0.81271, 'bool': True})
    rows.append({'_id': 'row5', 'cat': 'd', 'ct': 2, 'real': 1.14561, 'bool': False})
    rows.append({'_id': 'row6', 'cat': 'a', 'ct': 5, 'real': 1.03412, 'bool': False})
    self.t.batch_upload_rows(rows)
    # Cache the low-level handles that the tests poke at directly.
    self.connection = self.t._conn
    self.collection = self.t._link("rows")
def main(data_file, schema_file):
    """Upload rows from a JSON file to the Veritable table and start an analysis.

    Args:
        data_file: path to a JSON file containing a list of row dicts.
        schema_file: path to a JSON file containing the table schema.
    """
    # FIX: use context managers so the file handles are closed promptly
    # (the original leaked both), and json.load instead of loads(read()).
    with open(data_file) as fd:
        rows = json.load(fd)
    with open(schema_file) as fd:
        schema = json.load(fd)
    api = veritable.connect()
    # Reuse an existing table so repeated runs are idempotent.
    # FIX: print() calls (valid on py2 and py3) replace the Python-2-only
    # print statements, consistent with the other scripts in this project.
    if not api.table_exists(TABLE_NAME):
        print('Creating table')
        table = api.create_table(TABLE_NAME)
    else:
        print('Getting table')
        table = api.get_table(TABLE_NAME)
    print('Uploading rows')
    table.batch_upload_rows(rows)
    print('Creating analysis')
    analysis = table.create_analysis(schema)
def setup_class(self):
    """Connect to the API, upload a fixture table, and cache its internals."""
    self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL, **connect_kwargs)
    self.t = self.API.create_table()
    # Six rows covering every column datatype used by these tests.
    fixture = [
        {'_id': 'row1', 'cat': 'a', 'ct': 0, 'real': 1.02394, 'bool': True},
        {'_id': 'row2', 'cat': 'b', 'ct': 0, 'real': 0.92131, 'bool': False},
        {'_id': 'row3', 'cat': 'c', 'ct': 1, 'real': 1.82812, 'bool': True},
        {'_id': 'row4', 'cat': 'c', 'ct': 1, 'real': 0.81271, 'bool': True},
        {'_id': 'row5', 'cat': 'd', 'ct': 2, 'real': 1.14561, 'bool': False},
        {'_id': 'row6', 'cat': 'a', 'ct': 5, 'real': 1.03412, 'bool': False},
    ]
    self.t.batch_upload_rows(fixture)
    # Private handles exercised directly by the tests below.
    self.connection = self.t._conn
    self.collection = self.t._link("rows")
def setup_class(self):
    """Upload sample rows (kept on self.rows) and finish an analysis "a1"."""
    self.API = veritable.connect(TEST_API_KEY, TEST_BASE_URL, **connect_kwargs)
    self.t = self.API.create_table()
    # The rows stay on the instance so tests can compare against them.
    self.rows = [
        {'_id': 'row1', 'cat': 'a', 'ct': 0, 'real': 1.02394, 'bool': True},
        {'_id': 'row2', 'cat': 'b', 'ct': 0, 'real': 0.92131, 'bool': False},
        {'_id': 'row3', 'cat': 'c', 'ct': 1, 'real': 1.82812, 'bool': True},
        {'_id': 'row4', 'cat': 'c', 'ct': 1, 'real': 0.81271, 'bool': True},
        {'_id': 'row5', 'cat': 'd', 'ct': 2, 'real': 1.14561, 'bool': False},
        {'_id': 'row6', 'cat': 'a', 'ct': 5, 'real': 1.03412, 'bool': False},
    ]
    self.t.batch_upload_rows(self.rows)
    self.schema = {
        'cat': {'type': 'categorical'},
        'ct': {'type': 'count'},
        'real': {'type': 'real'},
        'bool': {'type': 'boolean'},
    }
    self.a = self.t.create_analysis(self.schema, analysis_id="a1", force=True)
    self.a.wait()
def main():
    """Train on the bank data and score predictions of the target column at
    several maximum-uncertainty thresholds."""
    # ---- Upload ----------------------------------------------------------
    # 1. Schema: column names mapped to Veritable datatypes.
    table_schema = {
        'age': {'type': 'count'},
        'sex': {'type': 'categorical'},
        'region': {'type': 'categorical'},
        'income': {'type': 'real'},
        'married': {'type': 'boolean'},
        'children': {'type': 'count'},
        'car': {'type': 'boolean'},
        'save_act': {'type': 'boolean'},
        'current_act': {'type': 'boolean'},
        'mortgage': {'type': 'boolean'},
        'pep': {'type': 'boolean'},
    }
    # 2. Load the CSV (all values arrive as strings), coerce each column to
    #    its schema type, then split into training and test subsets.
    rows = read_csv(DATA_FILE)
    clean_data(rows, table_schema)
    training_rows, test_rows = split_rows(rows, TRAIN_FRAC)
    # 3. Connect and remove any table left over from a previous run.
    api = veritable.connect()
    if api.table_exists(TABLE_ID):
        print("Deleting old table '%s'" % TABLE_ID)
        api.delete_table(TABLE_ID)
    # 4. Create a fresh table and upload the training rows.
    print("Creating table '%s' and uploading rows" % TABLE_ID)
    table = api.create_table(table_id=TABLE_ID)
    table.batch_upload_rows(training_rows)
    # ---- Analyze ---------------------------------------------------------
    # 5. Run the analysis to completion before predicting.
    print("Creating analysis '%s' and waiting for it to complete" % ANALYSIS_ID)
    analysis = table.create_analysis(schema=table_schema, analysis_id=ANALYSIS_ID)
    analysis.wait()
    # ---- Predict ---------------------------------------------------------
    # 6. Predict the target column for every test row, recording whether the
    #    estimate matched and how uncertain the prediction was.
    print("Making predictions")
    prediction_results = []
    for row in test_rows:
        query = row.copy()             # known values copied from the test row
        del query['_id']               # '_id' must not appear in requests
        query[TARGET_COL] = None       # None marks the column to predict
        prediction = analysis.predict(query, PRED_COUNT)
        estimate = prediction[TARGET_COL]
        uncertainty = prediction.uncertainty[TARGET_COL]
        prediction_results.append({
            'is_correct': estimate == row[TARGET_COL],
            'uncertainty': uncertainty,
        })
    # 7. Score accuracy, ignoring predictions above each uncertainty cap.
    for maximum_uncertainty in MAXIMUM_UNCERTAINTY_THRESHOLDS:
        # Results at or below the cap are "known"; everything else is ignored.
        known = [r for r in prediction_results
                 if r['uncertainty'] <= maximum_uncertainty]
        known_count = len(known)
        unknown_count = len(prediction_results) - known_count
        known_correct_count = len([r for r in known if r['is_correct']])
        print(
            "Predictions for {0} are {1:.0%} ({2}/{3}) correct with {4:.0%} ({5}/{6}) ignored using a maximum uncertainty of {7}"
            .format(
                TARGET_COL,
                0.0 if known_count == 0
                else float(known_correct_count) / known_count,
                known_correct_count,
                known_count,
                float(unknown_count) / (known_count + unknown_count),
                unknown_count,
                known_count + unknown_count,
                maximum_uncertainty))
def main():
    """Bank-marketing example: upload, analyze, and evaluate predictions of
    the target column under several maximum-uncertainty thresholds."""
    ##########
    # UPLOAD #
    ##########
    # 1. Table schema: one entry per column with its Veritable datatype.
    table_schema = {
        'age': {'type': 'count'},
        'sex': {'type': 'categorical'},
        'region': {'type': 'categorical'},
        'income': {'type': 'real'},
        'married': {'type': 'boolean'},
        'children': {'type': 'count'},
        'car': {'type': 'boolean'},
        'save_act': {'type': 'boolean'},
        'current_act': {'type': 'boolean'},
        'mortgage': {'type': 'boolean'},
        'pep': {'type': 'boolean'},
    }
    # 2. Read the CSV (string values), convert types per the schema, and
    #    divide into training and test sets.
    rows = read_csv(DATA_FILE)
    clean_data(rows, table_schema)
    training_rows, test_rows = split_rows(rows, TRAIN_FRAC)
    # 3. Connect to the API; drop a stale table from an earlier run if any.
    api = veritable.connect()
    if api.table_exists(TABLE_ID):
        print("Deleting old table '%s'" % TABLE_ID)
        api.delete_table(TABLE_ID)
    # 4. Create the table and upload the training rows.
    print("Creating table '%s' and uploading rows" % TABLE_ID)
    table = api.create_table(table_id=TABLE_ID)
    table.batch_upload_rows(training_rows)
    ###########
    # ANALYZE #
    ###########
    # 5. Kick off the analysis and block until it finishes.
    print("Creating analysis '%s' and waiting for it to complete" % ANALYSIS_ID)
    analysis = table.create_analysis(schema=table_schema, analysis_id=ANALYSIS_ID)
    analysis.wait()
    ###########
    # PREDICT #
    ###########
    # 6. For each test row, predict the target column and keep (correct?,
    #    uncertainty) pairs for the evaluation step.
    print("Making predictions")
    prediction_results = []
    for test_row in test_rows:
        request = test_row.copy()
        # '_id' is metadata, not a predictable column.
        del request['_id']
        # Veritable predicts every column whose value is None.
        request[TARGET_COL] = None
        prediction = analysis.predict(request, PRED_COUNT)
        outcome = {
            'is_correct': prediction[TARGET_COL] == test_row[TARGET_COL],
            'uncertainty': prediction.uncertainty[TARGET_COL],
        }
        prediction_results.append(outcome)
    # 7. Evaluate accuracy at each maximum-uncertainty threshold: predictions
    #    above the threshold are treated as unknown and ignored.
    for maximum_uncertainty in MAXIMUM_UNCERTAINTY_THRESHOLDS:
        unknown_count = sum(
            1 for r in prediction_results
            if r['uncertainty'] > maximum_uncertainty)
        known_results = [
            r for r in prediction_results
            if r['uncertainty'] <= maximum_uncertainty]
        known_count = len(known_results)
        known_correct_count = sum(1 for r in known_results if r['is_correct'])
        accuracy = (0.0 if known_count == 0
                    else float(known_correct_count) / known_count)
        ignored_frac = float(unknown_count) / (known_count + unknown_count)
        print(
            "Predictions for {0} are {1:.0%} ({2}/{3}) correct with {4:.0%} ({5}/{6}) ignored using a maximum uncertainty of {7}".format(
                TARGET_COL, accuracy, known_correct_count, known_count,
                ignored_frac, unknown_count, known_count + unknown_count,
                maximum_uncertainty))
def setup_class(self):
    """Open one shared API connection for every test in this class."""
    connection = veritable.connect(TEST_API_KEY, TEST_BASE_URL, **connect_kwargs)
    self.API = connection
# Only consider movies with enough ratings for the metadata to be reliable.
MIN_RATINGS = 100
TABLE_NAME = 'movielens'
# load the item metadata from a static file; in a production system this
# information would likely come from a database
ITEMS = [it for it in json.loads(open('recs/static/movie_descriptions.json').read()) if it['num_ratings'] > MIN_RATINGS]
# Sort alphabetically by display name for stable presentation.
ITEMS.sort(key=lambda x: x['name'])
# Map movie id -> display name for quick lookup.
ITEM_NAMES = dict([(m['id'], m['name']) for m in ITEMS])
# connect to the Veritable API and perform baseline predictions; this will
# allow us to compute rating "lift" later
api = veritable.connect()
analysis = get_last_successful_analysis(api, TABLE_NAME)
baselines = get_baselines(analysis, ITEMS)
# Flask application object; route handlers are registered on it below.
app = Flask(__name__)
# NOTE(review): item_filter's body is truncated in this view — only its
# docstring is visible here; the predicate logic follows elsewhere.
def item_filter(per_item_preds, baseline_val):
    ''' Decides whether an item should be considered for inclusion in the recommendations. This version requires that three conditions be met: 1. The item must have a reasonably high predicted rating (per_item_mean > 3.) 2. The predictions must indicate that the user will rate the item higher than its baseline rating (lift > .2) 3. The predictions must not be too uncertain regarding the positive lift (conf > .75)
def test_create_api_with_enable_gzip_false(self):
    """Connecting with gzip disabled should not raise."""
    client = veritable.connect(
        TEST_API_KEY, TEST_BASE_URL, enable_gzip=False, **connect_kwargs)
def test_create_api_with_ssl_verify_false(self):
    """Connecting with SSL verification disabled should not raise."""
    client = veritable.connect(
        TEST_API_KEY, TEST_BASE_URL, ssl_verify=False, **connect_kwargs)
def test_create_api_with_debug(self):
    """Connecting in debug mode should not raise."""
    client = veritable.connect(
        TEST_API_KEY, TEST_BASE_URL, debug=True, **connect_kwargs)
def test_print_connection(self):
    """Smoke test: the private connection object should be printable."""
    client = veritable.connect(TEST_API_KEY, TEST_BASE_URL, **connect_kwargs)
    print(client._conn)
def test_create_api(self):
    """Connecting with the default options should not raise."""
    client = veritable.connect(TEST_API_KEY, TEST_BASE_URL, **connect_kwargs)
import heart_disease.run
import random

import veritable

# Shared connection used only for post-test cleanup.
API = veritable.connect()


def test_example():
    """Run the heart-disease example end to end, then delete its tables."""
    # Randomize the table id so concurrent/repeated runs don't collide.
    suffix = str(random.randint(0, 100000000))
    heart_disease.run.TABLE_ID = 'heart-disease-example_' + suffix
    heart_disease.run.main()
    # The example creates both a multinomial and a binary table.
    API.delete_table(heart_disease.run.TABLE_ID)
    API.delete_table(heart_disease.run.TABLE_ID + "-binary")
def main():
    """Heart-disease example: train a multinomial and a binary analysis on
    the same data and report test error for each."""
    API = veritable.connect(ssl_verify=False)
    print("Loading and preparing data...")
    # load the data and schema describing all column datatypes
    with open(DATA_FILE, 'rb') as fd:
        data = json.loads(fd.read())
    with open(SCHEMA_FILE, 'rb') as fd:
        master_schema = json.loads(fd.read())
    # split into training/test sets and coerce each column to its schema type
    train_data, test_data = split_rows(data, .8)
    clean_data(train_data, master_schema, remove_extra_fields=True,
               assign_ids=True)
    # the training data may not contain every column of the master schema
    schema = subset_schema(master_schema, train_data)
    # clean the test data against the reduced schema so predictions are never
    # conditioned on columns or categorical values absent from training
    clean_data(test_data, schema, remove_extra_fields=True, assign_ids=True)
    validate_test_categoricals(test_data, train_data, schema)

    # second pass: collapse the 0-4 multinomial target into a boolean
    def binary_transform(x):
        transform = {'0': False, '1': True, '2': True, '3': True, '4': True}
        return transform[x]

    # build the binary variants of the datasets and schema
    binary_train_data = deepcopy(train_data)
    binary_test_data = deepcopy(test_data)
    binary_schema = deepcopy(schema)
    binary_schema['target']['type'] = 'boolean'
    for dataset in (binary_train_data, binary_test_data):
        for record in dataset:
            if 'target' in record:
                record['target'] = binary_transform(record['target'])
    # drop stale tables from any previous run
    if API.table_exists(TABLE_ID):
        print("Deleting old table '%s'" % TABLE_ID)
        API.delete_table(TABLE_ID)
    if API.table_exists(TABLE_ID + "-binary"):
        print("Deleting old table '%s'" % (TABLE_ID + "-binary"))
        API.delete_table(TABLE_ID + "-binary")
    # upload both datasets and start both analyses
    print("Uploading data and running analyses...")
    table = API.create_table(TABLE_ID)
    table.batch_upload_rows(train_data)
    analysis = table.create_analysis(schema)
    binary_table = API.create_table(TABLE_ID + "-binary")
    binary_table.batch_upload_rows(binary_train_data)
    binary_analysis = binary_table.create_analysis(binary_schema)
    # predict the target column for every test row, multinomial first
    analysis.wait()
    print("Making predictions....")
    results = predict_known_target_column(test_data, analysis, schema,
                                          'target')
    # then for the binary table
    binary_analysis.wait()
    binary_results = predict_known_target_column(
        binary_test_data, binary_analysis, binary_schema, 'target')
    # summarize the results
    print("multinomial dataset, raw predictions: "
          "{0}% test error".format(test_error(results, 'target') * 100))
    print("multinomial dataset, binary transform: "
          "{0}% test error".format(
              test_error(results, 'target',
                         transform=binary_transform) * 100))
    print("binary dataset, raw predictions: "
          "{0}% test error".format(
              test_error(binary_results, 'target') * 100))
import heart_disease.run
import random

import veritable

# Connection used to clean up tables created by the example run.
API = veritable.connect()


def test_example():
    """End-to-end run of the heart-disease example, with table cleanup."""
    table_id = 'heart-disease-example_' + str(random.randint(0, 100000000))
    heart_disease.run.TABLE_ID = table_id
    heart_disease.run.main()
    # Remove both tables the example creates.
    for tid in (table_id, table_id + "-binary"):
        API.delete_table(tid)