def _setup(self, query, host='localhost', user='******', password=None, port=8123, protocol='http'):
    if protocol not in ('https', 'http'):
        raise ValueError('Unexpected protocol {}'.format(protocol))

    if ' format ' in query.lower():
        err_msg = 'Please refrain from adding a "FORMAT" statement to the query'
        log.error(err_msg)
        raise Exception(err_msg)

    # The output format is controlled here: ask ClickHouse for JSON so the
    # response can be parsed below
    query = '{} FORMAT JSON'.format(query.rstrip(" ;\n"))

    log.info(f'Getting data via the query: "{query}"')

    params = {'user': user}
    if password is not None:
        params['password'] = password

    response = requests.post(f'{protocol}://{host}:{port}', data=query, params=params)

    try:
        data = response.json()['data']
    except Exception:
        log.error(f'Got an invalid response from the database: {response.text}')
        raise Exception(response.text)

    df = pd.DataFrame(data)

    col_map = {}
    for col in df.columns:
        col_map[col] = col

    return df, col_map
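# For reference, the request flow above can be reproduced standalone against
# ClickHouse's HTTP interface. A minimal sketch, assuming a local server with
# default credentials; the query itself is just a stand-in:
import requests
import pandas as pd

query = 'SELECT 1 AS x'
query = '{} FORMAT JSON'.format(query.rstrip(' ;\n'))  # ClickHouse wraps rows in {"data": [...]} for FORMAT JSON

response = requests.post('http://localhost:8123', data=query, params={'user': 'default'})
df = pd.DataFrame(response.json()['data'])
print(df)  # one row, one column named 'x'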
def getDS(from_data):
    '''
    Get a datasource given the input

    :param from_data: a string (file path), a DataFrame or a DataSource
    :return: a datasource
    '''
    if isinstance(from_data, DataSource):
        from_ds = from_data
    elif isinstance(from_data, DataFrame):
        from_ds = DataSource(from_data)
    else:
        # assume it is a file
        from_ds = FileDS(from_data)

    if from_ds is None:
        log.error('No data matched the input data')

    return from_ds
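# A short usage sketch; 'data.csv' is a placeholder path, not a file shipped
# with this module:
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

ds_from_file = getDS('data.csv')    # plain string -> wrapped in FileDS
ds_from_df = getDS(df)              # pandas DataFrame -> wrapped in DataSource
ds_passthrough = getDS(ds_from_df)  # existing DataSource -> returned unchanged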
def run(self):
    """
    Runs the model on the validation set in order to fit a probabilistic model
    that will evaluate the accuracy of future predictions
    """
    np.seterr(divide='warn', invalid='warn')

    output_columns = self.transaction.lmd['predict_columns']
    input_columns = [
        col for col in self.transaction.lmd['columns']
        if col not in output_columns
        and col not in self.transaction.lmd['columns_to_ignore']
    ]

    # Make predictions on the validation dataset normally and with various columns missing
    normal_predictions = self.transaction.model_backend.predict('validate')
    normal_predictions_test = self.transaction.model_backend.predict('test')
    normal_accuracy = evaluate_accuracy(
        normal_predictions,
        self.transaction.input_data.validation_df,
        self.transaction.lmd['stats_v2'],
        output_columns,
        backend=self.transaction.model_backend)

    for col in output_columns:
        reals = self.transaction.input_data.validation_df[col]
        preds = normal_predictions[col]

        fails = False
        data_type = self.transaction.lmd['stats_v2'][col]['typing']['data_type']
        data_subtype = self.transaction.lmd['stats_v2'][col]['typing']['data_subtype']

        if data_type == DATA_TYPES.CATEGORICAL:
            if data_subtype == DATA_SUBTYPES.TAGS:
                encoder = self.transaction.model_backend.predictor._mixer.encoders[col]
                if accuracy_score(encoder.encode(reals), encoder.encode(preds)) <= \
                        self.transaction.lmd['stats_v2'][col]['guess_probability']:
                    fails = True
            else:
                if accuracy_score(reals, preds) <= \
                        self.transaction.lmd['stats_v2'][col]['guess_probability']:
                    fails = True
        elif data_type == DATA_TYPES.NUMERIC:
            if r2_score(reals, preds) < 0:
                fails = True
        else:
            pass

        if fails:
            if not self.transaction.lmd['force_predict']:
                def predict_wrapper(*args, **kwargs):
                    raise Exception('Failed to train model')
                self.session.predict = predict_wrapper
                log.error('Failed to train model to predict {}'.format(col))

    empty_input_predictions = {}
    empty_input_accuracy = {}
    empty_input_predictions_test = {}

    ignorable_input_columns = [
        x for x in input_columns
        if self.transaction.lmd['stats_v2'][x]['typing']['data_type'] != DATA_TYPES.FILE_PATH
        and x not in [y[0] for y in self.transaction.lmd['model_order_by']]
    ]

    for col in ignorable_input_columns:
        empty_input_predictions[col] = self.transaction.model_backend.predict('validate', ignore_columns=[col])
        empty_input_predictions_test[col] = self.transaction.model_backend.predict('test', ignore_columns=[col])
        empty_input_accuracy[col] = evaluate_accuracy(
            empty_input_predictions[col],
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            output_columns,
            backend=self.transaction.model_backend)

    # Get some information about the importance of each column
    self.transaction.lmd['column_importances'] = {}
    for col in ignorable_input_columns:
        accuracy_increase = (normal_accuracy - empty_input_accuracy[col])
        # normalize from 0 to 10
        self.transaction.lmd['column_importances'][col] = 10 * max(0, accuracy_increase)

    # Run Probabilistic Validator
    overall_accuracy_arr = []
    self.transaction.lmd['accuracy_histogram'] = {}
    self.transaction.lmd['confusion_matrices'] = {}
    self.transaction.lmd['accuracy_samples'] = {}
    self.transaction.hmd['probabilistic_validators'] = {}

    self.transaction.lmd['train_data_accuracy'] = {}
    self.transaction.lmd['test_data_accuracy'] = {}
    self.transaction.lmd['valid_data_accuracy'] = {}

    for col in output_columns:
        # Training data accuracy
        predictions = self.transaction.model_backend.predict(
            'predict_on_train_data',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['train_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.train_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

        # Testing data accuracy
        predictions = self.transaction.model_backend.predict(
            'test',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['test_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.test_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

        # Validation data accuracy
        predictions = self.transaction.model_backend.predict(
            'validate',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['valid_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

    for col in output_columns:
        pval = ProbabilisticValidator(
            col_stats=self.transaction.lmd['stats_v2'][col],
            col_name=col,
            input_columns=input_columns)
        predictions_arr = [normal_predictions_test] + list(empty_input_predictions_test.values())

        pval.fit(self.transaction.input_data.test_df, predictions_arr,
                 [[ignored_column] for ignored_column in empty_input_predictions_test])
        overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats()
        overall_accuracy_arr.append(overall_accuracy)

        self.transaction.lmd['accuracy_histogram'][col] = accuracy_histogram
        self.transaction.lmd['confusion_matrices'][col] = cm
        self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
        self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(pval)

    self.transaction.lmd['validation_set_accuracy'] = sum(overall_accuracy_arr) / len(overall_accuracy_arr)
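# The column-importance formula above is 10 * max(0, normal_accuracy - empty_input_accuracy[col]).
# A worked example with made-up numbers (not taken from the code above):
normal_accuracy = 0.80
empty_input_accuracy = {'age': 0.65, 'noise_col': 0.82}

importances = {col: 10 * max(0, normal_accuracy - acc)
               for col, acc in empty_input_accuracy.items()}
# -> {'age': ~1.5, 'noise_col': 0}: dropping 'age' costs accuracy, 'noise_col' does not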
def run(self):
    """
    Runs the model on the validation set in order to fit a probabilistic model
    that will evaluate the accuracy of future predictions
    """
    np.seterr(divide='warn', invalid='warn')

    output_columns = self.transaction.lmd['predict_columns']
    input_columns = [
        col for col in self.transaction.lmd['columns']
        if col not in output_columns
        and col not in self.transaction.lmd['columns_to_ignore']
    ]

    # Make predictions on the validation dataset normally and with various columns missing
    normal_predictions = self.transaction.model_backend.predict('validate')
    normal_predictions_test = self.transaction.model_backend.predict('test')
    normal_accuracy = evaluate_accuracy(
        normal_predictions,
        self.transaction.input_data.validation_df,
        self.transaction.lmd['stats_v2'],
        output_columns,
        backend=self.transaction.model_backend)

    for col in output_columns:
        if self.transaction.lmd['tss']['is_timeseries']:
            # For timeseries, only rows flagged for prediction count towards accuracy
            reals = list(self.transaction.input_data.validation_df[
                self.transaction.input_data.validation_df['make_predictions'] == True][col])
        else:
            reals = self.transaction.input_data.validation_df[col]
        preds = normal_predictions[col]

        fails = False
        data_type = self.transaction.lmd['stats_v2'][col]['typing']['data_type']
        data_subtype = self.transaction.lmd['stats_v2'][col]['typing']['data_subtype']

        if data_type == DATA_TYPES.CATEGORICAL:
            if data_subtype == DATA_SUBTYPES.TAGS:
                encoder = self.transaction.model_backend.predictor._mixer.encoders[col]
                if balanced_accuracy_score(
                        encoder.encode(reals).argmax(axis=1),
                        encoder.encode(preds).argmax(axis=1)) <= \
                        self.transaction.lmd['stats_v2'][col]['balanced_guess_probability']:
                    fails = True
            else:
                if balanced_accuracy_score(reals, preds) <= \
                        self.transaction.lmd['stats_v2'][col]['balanced_guess_probability']:
                    fails = True
        elif data_type == DATA_TYPES.NUMERIC:
            if r2_score(reals, preds) < 0:
                fails = True
        else:
            pass

        if fails:
            if not self.transaction.lmd['force_predict']:
                def predict_wrapper(*args, **kwargs):
                    raise Exception('Failed to train model')
                self.session.predict = predict_wrapper
                log.error('Failed to train model to predict {}'.format(col))

    empty_input_predictions = {}
    empty_input_accuracy = {}
    empty_input_predictions_test = {}

    ignorable_input_columns = [
        x for x in input_columns
        if self.transaction.lmd['stats_v2'][x]['typing']['data_type'] != DATA_TYPES.FILE_PATH
        and (not self.transaction.lmd['tss']['is_timeseries']
             or x not in self.transaction.lmd['tss']['order_by'])
    ]

    for col in ignorable_input_columns:
        empty_input_predictions[col] = self.transaction.model_backend.predict('validate', ignore_columns=[col])
        empty_input_predictions_test[col] = self.transaction.model_backend.predict('test', ignore_columns=[col])
        empty_input_accuracy[col] = evaluate_accuracy(
            empty_input_predictions[col],
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            output_columns,
            backend=self.transaction.model_backend)

    # Get some information about the importance of each column
    self.transaction.lmd['column_importances'] = {}
    for col in ignorable_input_columns:
        accuracy_increase = (normal_accuracy - empty_input_accuracy[col])
        # normalize from 0 to 10
        self.transaction.lmd['column_importances'][col] = 10 * max(0, accuracy_increase)

    # Run Probabilistic Validator
    overall_accuracy_arr = []
    self.transaction.lmd['accuracy_histogram'] = {}
    self.transaction.lmd['confusion_matrices'] = {}
    self.transaction.lmd['accuracy_samples'] = {}
    self.transaction.hmd['probabilistic_validators'] = {}

    self.transaction.lmd['train_data_accuracy'] = {}
    self.transaction.lmd['test_data_accuracy'] = {}
    self.transaction.lmd['valid_data_accuracy'] = {}

    for col in output_columns:
        # Training data accuracy
        predictions = self.transaction.model_backend.predict(
            'predict_on_train_data',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['train_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.train_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

        # Testing data accuracy
        predictions = self.transaction.model_backend.predict(
            'test',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['test_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.test_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

        # Validation data accuracy
        predictions = self.transaction.model_backend.predict(
            'validate',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['valid_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

    for col in output_columns:
        pval = ProbabilisticValidator(
            col_stats=self.transaction.lmd['stats_v2'][col],
            col_name=col,
            input_columns=input_columns)
        predictions_arr = [normal_predictions_test] + list(empty_input_predictions_test.values())

        pval.fit(self.transaction.input_data.test_df, predictions_arr,
                 [[ignored_column] for ignored_column in empty_input_predictions_test])
        overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats()
        overall_accuracy_arr.append(overall_accuracy)

        self.transaction.lmd['accuracy_histogram'][col] = accuracy_histogram
        self.transaction.lmd['confusion_matrices'][col] = cm
        self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
        self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(pval)

    self.transaction.lmd['validation_set_accuracy'] = sum(overall_accuracy_arr) / len(overall_accuracy_arr)

    # Conformal prediction confidence estimation
    self.transaction.lmd['stats_v2']['train_std_dev'] = {}
    self.transaction.hmd['label_encoders'] = {}
    self.transaction.hmd['icp'] = {'active': False}

    for target in output_columns:
        data_type = self.transaction.lmd['stats_v2'][target]['typing']['data_type']
        data_subtype = self.transaction.lmd['stats_v2'][target]['typing']['data_subtype']
        is_classification = data_type == DATA_TYPES.CATEGORICAL

        fit_params = {
            'target': target,
            'all_columns': self.transaction.lmd['columns'],
            'columns_to_ignore': []
        }
        fit_params['columns_to_ignore'].extend(self.transaction.lmd['columns_to_ignore'])
        fit_params['columns_to_ignore'].extend([col for col in output_columns if col != target])

        if is_classification:
            if data_subtype != DATA_SUBTYPES.TAGS:
                all_targets = [
                    elt[1][target].values
                    for elt in inspect.getmembers(self.transaction.input_data)
                    if elt[0] in {'test_df', 'train_df', 'validation_df'}
                ]
                all_classes = np.unique(np.concatenate([np.unique(arr) for arr in all_targets]))
                enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
                enc.fit(all_classes.reshape(-1, 1))
                fit_params['one_hot_enc'] = enc
                self.transaction.hmd['label_encoders'][target] = enc
            else:
                fit_params['one_hot_enc'] = None
                self.transaction.hmd['label_encoders'][target] = None

            adapter = ConformalClassifierAdapter
            # better than IPS as we'd need the complete distribution over all classes
            nc_function = MarginErrFunc()
            nc_class = ClassifierNc
            icp_class = IcpClassifier
        else:
            adapter = ConformalRegressorAdapter
            nc_function = AbsErrorErrFunc()
            nc_class = RegressorNc
            icp_class = IcpRegressor

        if (data_type == DATA_TYPES.NUMERIC
                or (is_classification and data_subtype != DATA_SUBTYPES.TAGS)) \
                and not self.transaction.lmd['tss']['is_timeseries']:
            model = adapter(self.transaction.model_backend.predictor, fit_params=fit_params)
            nc = nc_class(model, nc_function)

            X = deepcopy(self.transaction.input_data.train_df)
            y = X.pop(target)

            if is_classification:
                self.transaction.hmd['icp'][target] = icp_class(nc, smoothing=False)
            else:
                self.transaction.hmd['icp'][target] = icp_class(nc)
                self.transaction.lmd['stats_v2']['train_std_dev'][target] = \
                    self.transaction.input_data.train_df[target].std()

            X = clean_df(X, self.transaction.lmd['stats_v2'], output_columns)
            self.transaction.hmd['icp'][target].fit(X.values, y.values)
            self.transaction.hmd['icp']['active'] = True

            # calibrate conformal estimator on the validation set
            X = deepcopy(self.transaction.input_data.validation_df)
            y = X.pop(target).values

            if is_classification:
                if isinstance(enc.categories_[0][0], str):
                    cats = enc.categories_[0].tolist()
                    y = np.array([cats.index(i) for i in y])
                y = y.astype(int)

            X = clean_df(X, self.transaction.lmd['stats_v2'], output_columns)
            self.transaction.hmd['icp'][target].calibrate(X.values, y)
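# To illustrate what the fitted/calibrated `icp` objects stored above do, here
# is a minimal standalone sketch of the nonconformist fit/calibrate/predict
# cycle for regression. The data and the sklearn model are stand-ins; the real
# code wraps the predictor via ConformalRegressorAdapter instead:
import numpy as np
from sklearn.linear_model import LinearRegression
from nonconformist.base import RegressorAdapter
from nonconformist.nc import RegressorNc, AbsErrorErrFunc
from nonconformist.icp import IcpRegressor

rng = np.random.RandomState(0)
X = rng.rand(200, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(0, 0.1, 200)

icp = IcpRegressor(RegressorNc(RegressorAdapter(LinearRegression()), AbsErrorErrFunc()))
icp.fit(X[:100], y[:100])              # train the underlying model
icp.calibrate(X[100:150], y[100:150])  # nonconformity scores on held-out rows

# Each test row gets a [lower, upper] interval at 90% confidence (significance=0.1)
intervals = icp.predict(X[150:], significance=0.1)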
def _getDataIo(self, file):
    """
    This gets a file, either a url or a local file, and defines what the format is as well as the dialect
    :param file: file path or url
    :return: data_io, format, dialect
    """

    ############
    # get file as io object
    ############

    data = BytesIO()

    # get data from either url or file, load in memory
    if file.startswith('http:') or file.startswith('https:'):
        r = requests.get(file, stream=True)
        if r.status_code == 200:
            for chunk in r:
                data.write(chunk)
        data.seek(0)
    # else read file from local file system
    else:
        try:
            data = open(file, 'rb')
        except Exception as e:
            error = 'Could not load file, possible exception : {exception}'.format(exception=e)
            log.error(error)
            raise ValueError(error)

    dialect = None

    ############
    # check for file type
    ############

    # try to guess if it's an excel file
    xlsx_sig = b'\x50\x4B\x05\x06'
    xlsx_sig2 = b'\x50\x4B\x03\x04'
    xls_sig = b'\x09\x08\x10\x00\x00\x06\x05\x00'

    # different whence, offset, size for different types
    excel_meta = [('xls', 0, 512, 8), ('xlsx', 2, -22, 4)]

    for filename, whence, offset, size in excel_meta:
        try:
            data.seek(offset, whence)  # Seek to the offset.
            bytes = data.read(size)  # Capture the specified number of bytes.
            data.seek(0)
            codecs.getencoder('hex')(bytes)

            if bytes == xls_sig:
                return data, 'xls', dialect
            elif bytes == xlsx_sig:
                return data, 'xlsx', dialect
        except Exception:
            data.seek(0)

    # if not excel it can be a json file or a CSV, convert from binary to stringio
    byte_str = data.read()

    # Move it to StringIO
    try:
        # Handle Microsoft's BOM "special" UTF-8 encoding
        if byte_str.startswith(codecs.BOM_UTF8):
            data = StringIO(byte_str.decode('utf-8-sig'))
        else:
            data = StringIO(byte_str.decode('utf-8'))
    except Exception:
        log.error(traceback.format_exc())
        log.error('Could not load into string')

    # see if it's JSON
    buffer = data.read(100)
    data.seek(0)
    text = buffer.strip()
    # analyze first n characters
    if len(text) > 0:
        text = text.strip()
        # if it looks like a json, then try to parse it
        if text.startswith('{') or text.startswith('['):
            try:
                json.loads(data.read())
                data.seek(0)
                return data, 'json', dialect
            except Exception:
                data.seek(0)
                return data, None, dialect

    # let's try to figure out if it's a csv
    try:
        data.seek(0)
        first_few_lines = []
        i = 0
        for line in data:
            if line in ['\r\n', '\n']:
                continue
            first_few_lines.append(line)
            i += 1
            if i > 0:
                break

        accepted_delimiters = [',', '\t', ';']
        dialect = csv.Sniffer().sniff(''.join(first_few_lines), delimiters=accepted_delimiters)
        data.seek(0)

        # if csv dialect identified then return csv
        if dialect:
            return data, 'csv', dialect
        else:
            return data, None, dialect
    except Exception:
        data.seek(0)
        log.error('Could not detect format for this file')
        log.error(traceback.format_exc())

    # No file type identified
    return data, None, dialect
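# A hedged sketch of how the returned (data_io, format, dialect) triple might
# be consumed downstream. This dispatch is illustrative only, not the actual
# FileDS logic; `file_ds` is a hypothetical instance of the enclosing class
# and 'some_file.csv' a placeholder path:
import pandas as pd

data_io, fmt, dialect = file_ds._getDataIo('some_file.csv')

if fmt == 'csv':
    df = pd.read_csv(data_io, sep=dialect.delimiter)
elif fmt == 'json':
    df = pd.read_json(data_io)
elif fmt in ('xls', 'xlsx'):
    df = pd.read_excel(data_io)
else:
    raise ValueError('Could not determine file format')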