def __iter__(self):
    """Yield ``(train, test)`` index pairs, one per fold, with fixed class counts.

    For each of ``self.folds`` folds the target is rebuilt with
    ``build_target(self.config.target, self.context)``.  Rows whose target is
    non-zero are treated as positives, rows equal to zero as negatives.
    ``self.pos_test`` positives and ``self.neg_test`` negatives are sampled
    for the test set; ``self.pos_train`` positives and ``self.neg_train``
    negatives are then sampled for the training set.

    Yields:
        tuple: ``(pd.Index(train), pd.Index(test))``.
    """
    for _ in range(self.folds):
        y = build_target(self.config.target, self.context)
        positives = y[y != 0].index
        negatives = y[y == 0].index

        test_positives = random.sample(positives, self.pos_test)
        test_negatives = random.sample(negatives, self.neg_test)

        # NOTE(review): ``index - list`` is relied on to drop the test rows
        # before sampling the training rows (set difference in the pandas
        # version this file targets) -- confirm when upgrading pandas.
        train_positives = random.sample(positives - test_positives, self.pos_train)
        train_negatives = random.sample(negatives - test_negatives, self.neg_train)

        # random.sample returns lists, so ``+`` concatenates them.
        test = test_positives + test_negatives
        train = train_positives + train_negatives

        if self.verbose:
            # The report shows the requested sizes, not measured ones.
            print("Sampled Folds:")
            print("\tPos\tNeg\tPos pct")
            print("Train:\t%d\t%d\t%0.3f" % (
                self.pos_train, self.neg_train,
                self.pos_train / float(self.pos_train + self.neg_train)))
            print("Test:\t%d\t%d\t%0.3f" % (
                self.pos_test, self.neg_test,
                self.pos_test / float(self.neg_test + self.pos_test)))

        yield pd.Index(train), pd.Index(test)
def get_xy(config, context):
    """Build the feature set and the target for ``context``.

    Features come from ``build_featureset(config.features, context)`` and the
    target from ``build_target(config.target, context)``.  When
    ``config.column_subset`` is truthy the features are indexed by it.

    Returns:
        tuple: ``(x, y)`` -- features and target.
    """
    features = build_featureset(config.features, context)
    target = build_target(config.target, context)

    if not config.column_subset:
        return features, target

    return features[config.column_subset], target
def __iter__(self):
    """Yield ``(train, test)`` index pairs with a controlled share of positives.

    For each of ``self.folds`` folds the target is rebuilt with
    ``build_target(self.config.target, self.context)``.  Rows whose target is
    non-zero are positives, rows equal to zero are negatives.

    * Test set: ``int(n_pos * self.positive_proportion_test)`` positives are
      sampled, plus enough negatives that positives make up
      ``self.positive_ratio_test`` of the test set.
    * Train set: when ``self.positive_ratio_train`` is truthy, all remaining
      positives are used and negatives are sampled to reach that ratio;
      otherwise every row of ``y.index`` not in the test set is used.

    Yields:
        tuple: ``(pd.Index(train), pd.Index(test))``.
    """
    for _ in range(self.folds):
        y = build_target(self.config.target, self.context)
        positives = y[y != 0].index
        negatives = y[y == 0].index
        n_pos = len(positives)
        n_neg = len(negatives)

        test_positives = random.sample(
            positives, int(n_pos * self.positive_proportion_test))
        n_pos_test = len(test_positives)

        # pos / (pos + neg) == ratio  =>  neg == pos * (1 / ratio - 1).
        # 1.0 keeps the division floating-point even for an integer ratio.
        test_negatives = random.sample(
            negatives,
            int(n_pos_test * (1.0 / self.positive_ratio_test - 1)))
        n_neg_test = len(test_negatives)
        test = test_positives + test_negatives

        n_pos_train = n_pos - n_pos_test
        # NOTE(review): ``index - list`` is relied on as a set difference
        # (behaviour of the pandas version this file targets) -- confirm.
        if self.positive_ratio_train:
            train_negatives = random.sample(
                negatives - test_negatives,
                int(n_pos_train * (1.0 / self.positive_ratio_train - 1)))
            train = train_negatives + list(positives - test_positives)
            n_neg_train = len(train_negatives)
        else:
            train = y.index - test
            n_neg_train = n_neg - n_neg_test

        if self.verbose:
            print("Weighted Sample Folds:")
            print("\tPos\tNeg\tPos pct")
            print("Train:\t%d\t%d\t%0.3f" % (
                n_pos_train, n_neg_train,
                n_pos_train / float(n_pos_train + n_neg_train)))
            print("Test:\t%d\t%d\t%0.3f" % (
                n_pos_test, n_neg_test,
                n_pos_test / float(n_neg_test + n_pos_test)))

        yield pd.Index(train), pd.Index(test)
def evaluate(config, ctx, predict_index, predict_method=None, predict_update_column=None):
    """Predict on ``predict_index`` and score the result with ``config.metrics``.

    Args:
        config: object exposing ``metrics`` (iterable of metrics) and,
            optionally, ``actual``.
        ctx: passed through to the predict call and to ``build_target``.
        predict_index: labels to predict; actuals are reindexed to it.
        predict_method: optional replacement for ``predict``.  It is called as
            ``predict_method(config, ctx, predict_index, update_column=...)``.
        predict_update_column: forwarded as ``update_column`` to
            ``predict_method``; unused when ``predict_method`` is None.

    Returns:
        tuple: ``(scores, result)`` where ``scores`` maps each metric name
        (from ``get_metric_name``) to its score and ``result`` is whatever the
        predict call returned (it must support ``result['predictions']`` and
        ``result['actuals']``).
    """
    if predict_method is None:
        result = predict(config, ctx, predict_index)
    else:
        # TODO: hacky!
        result = predict_method(config, ctx, predict_index,
                                update_column=predict_update_column)

    preds = result['predictions']
    y = result['actuals']

    # TODO: HACK -- there may not be an actual attribute on the config.
    # Only the attribute lookup is allowed to be missing; an AttributeError
    # raised while building the target is a real error and must propagate.
    actual_spec = getattr(config, 'actual', None)
    if actual_spec is not None:
        actuals = build_target(actual_spec, ctx).reindex(predict_index)
    else:
        actuals = y.reindex(predict_index)

    scores = {}
    for metric in config.metrics:
        name = get_metric_name(metric)
        # Metrics are either objects with a ``score`` method or plain callables.
        if hasattr(metric, 'score'):
            scores[name] = metric.score(actuals, preds)
        else:
            scores[name] = metric(actuals, preds)
    return scores, result
def __iter__(self):
    """Yield ``(train, test)`` index pairs with a controlled share of positives.

    Each of the ``self.folds`` iterations rebuilds the target with
    ``build_target(self.config.target, self.context)`` and splits its index
    into positives (target non-zero) and negatives (target equal to zero).

    * Test set: ``int(n_pos * self.positive_proportion_test)`` sampled
      positives, plus sampled negatives sized so that positives make up
      ``self.positive_ratio_test`` of the test set.
    * Train set: if ``self.positive_ratio_train`` is truthy, all positives
      not in the test set plus negatives sampled to reach that ratio;
      otherwise every row of ``y.index`` that is not in the test set.

    Yields:
        tuple: ``(pd.Index(train), pd.Index(test))``.
    """
    for _ in range(self.folds):
        y = build_target(self.config.target, self.context)
        positives = y[y != 0].index
        negatives = y[y == 0].index
        n_pos = len(positives)
        n_neg = len(negatives)

        test_positives = random.sample(
            positives, int(n_pos * self.positive_proportion_test))
        n_pos_test = len(test_positives)

        # Solving pos / (pos + neg) == ratio for neg gives
        # neg == pos * (1 / ratio - 1); 1.0 forces float division.
        test_negatives = random.sample(
            negatives,
            int(n_pos_test * (1.0 / self.positive_ratio_test - 1)))
        n_neg_test = len(test_negatives)
        test = test_positives + test_negatives

        n_pos_train = n_pos - n_pos_test
        # NOTE(review): ``index - list`` is used as a set difference here
        # (pandas version this file targets) -- confirm when upgrading.
        if self.positive_ratio_train:
            train_negatives = random.sample(
                negatives - test_negatives,
                int(n_pos_train * (1.0 / self.positive_ratio_train - 1)))
            train = train_negatives + list(positives - test_positives)
            n_neg_train = len(train_negatives)
        else:
            train = y.index - test
            n_neg_train = n_neg - n_neg_test

        if self.verbose:
            print("Weighted Sample Folds:")
            print("\tPos\tNeg\tPos pct")
            print("Train:\t%d\t%d\t%0.3f" % (
                n_pos_train, n_neg_train,
                n_pos_train / float(n_pos_train + n_neg_train)))
            print("Test:\t%d\t%d\t%0.3f" % (
                n_pos_test, n_neg_test,
                n_pos_test / float(n_neg_test + n_pos_test)))

        yield pd.Index(train), pd.Index(test)
def predict_autosequence(config, context, predict_index, fit_model=True, update_column=None):
    """Predict the rows of ``predict_index`` one at a time, in order.

    Features are rebuilt with ``get_x`` before every single-row prediction.
    When ``update_column`` is given, each prediction is written into that
    column of the data before the next row is processed.

    Args:
        config: exposes ``model``, ``prediction`` and ``predictions_name``.
        context: exposes ``train_index``, ``data`` and ``copy()``.
        predict_index: labels to predict.
        fit_model: when true, ``fit(config, context)`` is called first.
        update_column: optional column of ``context.data`` that receives each
            prediction as it is made.

    Returns:
        tuple: ``(preds, x, y)`` -- the predictions indexed by
        ``predict_index``, the most recently built feature set, and the ``y``
        returned by ``fit`` (None when ``fit_model`` is false).
    """
    if len(context.train_index & predict_index):
        logging.warning("Train and predict indices overlap...")

    x, y = None, None
    if fit_model:
        x, y = fit(config, context)

    # Debug output must not break prediction: x is None when nothing was
    # fitted here, and not every model exposes ``coef_``.
    if x is not None:
        logging.debug(x.columns)
    if hasattr(config.model, 'coef_'):
        logging.debug(config.model.coef_)

    ctx = context.copy()
    ps = []
    for i in predict_index:
        # ctx shares the caller's data object, so the write below is made
        # against the same data the next get_x call reads.
        ctx.data = context.data
        x = get_x(config, ctx)
        predict_x = x.reindex([i])

        # make actual predictions
        p = config.model.predict(predict_x.values)
        if update_column is not None:
            # NOTE(review): chained assignment; whether the write reaches the
            # underlying frame depends on pandas returning a view -- confirm.
            ctx.data[update_column][i] = p[0]
        ps.append(p[0])

    # Fall back to a DataFrame when the collected predictions do not fit a Series.
    try:
        preds = Series(ps, index=predict_index)
    except Exception:
        preds = DataFrame(ps, index=predict_index)

    # prediction post-processing
    if config.prediction is not None:
        # Side effect: adds/overwrites the predictions column on context.data.
        context.data[config.predictions_name] = preds
        preds = build_target(config.prediction, context)
        preds = preds.reindex(predict_index)
    preds.name = ''
    return preds, x, y
def predict(config, context, predict_index, fit_model=True, model_name=None):
    """Predict ``predict_index`` and return the feature rows with results attached.

    Args:
        config: exposes ``model``, ``prediction``, ``predictions_name`` and
            ``update_reporters_with_predictions``.
        context: exposes ``train_index``, ``data``, ``store`` and ``copy()``.
        predict_index: labels to predict.
        fit_model: when true (and no ``model_name`` is given),
            ``fit(config, context)`` is called first.
        model_name: when truthy, ``config.model`` is replaced by
            ``context.store.load(model_name)`` and no fitting happens.

    Returns:
        The features reindexed to ``predict_index`` with two added columns,
        ``'predictions'`` and ``'actuals'``.  ``'actuals'`` is all-NaN when
        no target could be built for the predict rows.
    """
    import sys  # local import: only needed for the progress message below

    if len(context.train_index & predict_index):
        print("WARNING: train and predict indices overlap...")

    x, y = None, None

    if model_name:
        config.model = context.store.load(model_name)

    if not model_name and fit_model:
        x, y = fit(config, context)

    # TODO: possible to have x loaded without new prediction rows
    if x is None:
        # rebuild just the necessary x:
        ctx = context.copy()
        ctx.data = context.data.ix[predict_index]
        x = get_x(config, ctx)
        try:
            # we may or may not have y's in predict context
            # we get them if we can for metrics and reporting
            y = get_y(config, ctx)
        except KeyError:
            pass

    if debug:
        print(x.columns)

    predict_x = x.reindex(predict_index)

    # Progress message without a newline; "[OK]" completes the line.
    sys.stdout.write("Making predictions... ")
    # make actual predictions
    ps = config.model.predict(predict_x.values)
    # Fall back to a DataFrame when the model output does not fit a Series.
    try:
        preds = Series(ps, index=predict_x.index)
    except Exception:
        preds = DataFrame(ps, index=predict_x.index)
    print("[OK]")

    # prediction post-processing
    if config.prediction is not None:
        old = context.data
        try:
            # Temporarily narrow context.data to the predicted rows so the
            # target is built from the predictions column only.
            context.data = context.data.reindex(predict_x.index)
            context.data[config.predictions_name] = preds
            preds = build_target(config.prediction, context)
            preds = preds.reindex(predict_x.index)
        finally:
            # Always restore the caller's data, even if post-processing fails.
            context.data = old
    preds.name = ''

    # y stays None when the target could not be built for the predict rows.
    if y is not None:
        actuals = y.reindex(predict_index)
    else:
        actuals = Series(float('nan'), index=predict_index)

    # TODO: handle multi-variate predictions
    predict_x['predictions'] = preds
    predict_x['actuals'] = actuals
    config.update_reporters_with_predictions(context, predict_x, actuals, preds)
    return predict_x
def predict(config, context, predict_index, fit_model=True):
    """Predict ``predict_index`` with ``config.model``, printing progress.

    Args:
        config: exposes ``model``, ``prediction`` and ``predictions_name``.
        context: exposes ``train_index``, ``data`` and ``copy()``.
        predict_index: labels to predict.
        fit_model: when true, ``fit(config, context)`` is called first;
            otherwise features (and, if available, the target) are rebuilt
            for the predict rows only.

    Returns:
        tuple: ``(preds, x, y)`` -- predictions indexed like the predict rows,
        the feature set used, and the target (None if it could not be built).
    """
    import sys  # local import: only needed for the progress message below

    if len(context.train_index & predict_index):
        print("WARNING: train and predict indices overlap...")

    x, y = None, None
    if fit_model:
        x, y = fit(config, context)

    # TODO: possible to have x loaded without new prediction rows
    if x is None:
        # rebuild just the necessary x:
        ctx = context.copy()
        ctx.data = context.data.ix[predict_index]
        x = get_x(config, ctx)
        try:
            # we may or may not have y's in predict context
            # we get them if we can for metrics and reporting
            y = get_y(config, ctx)
        except KeyError:
            pass

    if debug:
        print(x.columns)
        # Not every model exposes coef_; debug output must not raise.
        if hasattr(config.model, 'coef_'):
            print(config.model.coef_)

    predict_x = x.reindex(predict_index)

    # Progress message without a newline; "[OK]" completes the line.
    sys.stdout.write("Making predictions... ")
    # make actual predictions
    ps = config.model.predict(predict_x.values)
    # Fall back to a DataFrame when the model output does not fit a Series.
    try:
        preds = Series(ps, index=predict_x.index)
    except Exception:
        preds = DataFrame(ps, index=predict_x.index)
    print("[OK]")

    # prediction post-processing
    if config.prediction is not None:
        # Side effect: adds/overwrites the predictions column on context.data.
        context.data[config.predictions_name] = preds
        preds = build_target(config.prediction, context)
        preds = get_single_column(preds).reindex(predict_x.index)
    preds.name = ''
    return preds, x, y
def predict(config, context, predict_index, fit_model=True):
    """Predict ``predict_index`` with ``config.model``.

    Args:
        config: exposes ``model``, ``prediction`` and ``predictions_name``.
        context: exposes ``train_index``, ``data`` and ``copy()``.
        predict_index: labels to predict.
        fit_model: when true, ``fit(config, context)`` is called first;
            otherwise features (and, if available, the target) are rebuilt
            for the predict rows only.

    Returns:
        tuple: ``(preds, x, y)`` -- predictions indexed like the predict rows,
        the feature set used, and the target (None if it could not be built).
    """
    if len(context.train_index & predict_index):
        print("WARNING: train and predict indices overlap...")

    x, y = None, None
    if fit_model:
        x, y = fit(config, context)

    # TODO: possible to have x loaded without new prediction rows
    if x is None:
        # rebuild just the necessary x:
        ctx = context.copy()
        ctx.data = context.data.ix[predict_index]
        x = get_x(config, ctx)
        try:
            # we may or may not have y's in predict context
            # we get them if we can for metrics and reporting
            y = get_y(config, ctx)
        except KeyError:
            pass

    if debug:
        print(x.columns)
        # Not every model exposes coef_; debug output must not raise.
        if hasattr(config.model, 'coef_'):
            print(config.model.coef_)

    predict_x = x.reindex(predict_index)

    # make actual predictions
    ps = config.model.predict(predict_x.values)
    # Fall back to a DataFrame when the model output does not fit a Series.
    try:
        preds = Series(ps, index=predict_x.index)
    except Exception:
        preds = DataFrame(ps, index=predict_x.index)

    # prediction post-processing
    if config.prediction is not None:
        # Side effect: adds/overwrites the predictions column on context.data.
        context.data[config.predictions_name] = preds
        preds = build_target(config.prediction, context)
        preds = get_single_column(preds).reindex(predict_x.index)
    preds.name = ''
    return preds, x, y
def __iter__(self):
    """Yield ``(train, test)`` index pairs, one per fold, with fixed class counts.

    Each of the ``self.folds`` iterations rebuilds the target with
    ``build_target(self.config.target, self.context)`` and splits its index
    into positives (target non-zero) and negatives (target equal to zero).
    The test set gets ``self.pos_test`` positives and ``self.neg_test``
    negatives; the training set gets ``self.pos_train`` positives and
    ``self.neg_train`` negatives.

    Yields:
        tuple: ``(pd.Index(train), pd.Index(test))``.
    """
    for _ in range(self.folds):
        y = build_target(self.config.target, self.context)
        positives = y[y != 0].index
        negatives = y[y == 0].index

        test_positives = random.sample(positives, self.pos_test)
        test_negatives = random.sample(negatives, self.neg_test)

        # NOTE(review): ``index - list`` is used as a set difference so that
        # training rows never come from the test sample (pandas version this
        # file targets) -- confirm when upgrading pandas.
        train_positives = random.sample(positives - test_positives, self.pos_train)
        train_negatives = random.sample(negatives - test_negatives, self.neg_train)

        # random.sample returns lists, so ``+`` concatenates them.
        test = test_positives + test_negatives
        train = train_positives + train_negatives

        if self.verbose:
            # The report shows the requested sizes, not measured ones.
            print("Sampled Folds:")
            print("\tPos\tNeg\tPos pct")
            print("Train:\t%d\t%d\t%0.3f" % (
                self.pos_train, self.neg_train,
                self.pos_train / float(self.pos_train + self.neg_train)))
            print("Test:\t%d\t%d\t%0.3f" % (
                self.pos_test, self.neg_test,
                self.pos_test / float(self.neg_test + self.pos_test)))

        yield pd.Index(train), pd.Index(test)
def predict_autosequence(config, context, predict_index, fit_model=True, update_column=None):
    """Predict the rows of ``predict_index`` one at a time, in order.

    Features are rebuilt with ``get_x`` before every single-row prediction.
    When ``update_column`` is given, each prediction is written into that
    column of the data before the next row is processed.

    Args:
        config: exposes ``model``, ``prediction`` and ``predictions_name``.
        context: exposes ``train_index``, ``data`` and ``copy()``.
        predict_index: labels to predict.
        fit_model: when true, ``fit(config, context)`` is called first.
        update_column: optional column of ``context.data`` that receives each
            prediction as it is made.

    Returns:
        tuple: ``(preds, x, y)`` -- the predictions indexed by
        ``predict_index``, the most recently built feature set, and the ``y``
        returned by ``fit`` (None when ``fit_model`` is false).
    """
    if len(context.train_index & predict_index):
        print("WARNING: train and predict indices overlap...")

    x, y = None, None
    if fit_model:
        x, y = fit(config, context)

    if debug:
        # Debug output must not break prediction: x is None when nothing was
        # fitted here, and not every model exposes ``coef_``.
        if x is not None:
            print(x.columns)
        if hasattr(config.model, 'coef_'):
            print(config.model.coef_)

    ctx = context.copy()
    ps = []
    for i in predict_index:
        # ctx shares the caller's data object, so the write below is made
        # against the same data the next get_x call reads.
        ctx.data = context.data
        x = get_x(config, ctx)
        predict_x = x.reindex([i])

        # make actual predictions
        p = config.model.predict(predict_x.values)
        if update_column is not None:
            # NOTE(review): chained assignment; whether the write reaches the
            # underlying frame depends on pandas returning a view -- confirm.
            ctx.data[update_column][i] = p[0]
        ps.append(p[0])

    # Fall back to a DataFrame when the collected predictions do not fit a Series.
    try:
        preds = Series(ps, index=predict_index)
    except Exception:
        preds = DataFrame(ps, index=predict_index)

    # prediction post-processing
    if config.prediction is not None:
        # Side effect: adds/overwrites the predictions column on context.data.
        context.data[config.predictions_name] = preds
        preds = build_target(config.prediction, context)
        preds = preds.reindex(predict_index)
    preds.name = ''
    return preds, x, y
def predict(config, context, predict_index, force_prediction=False):
    """Fit the model, then predict ``predict_index`` with ``config.model``.

    Args:
        config: exposes ``model``, ``prediction`` and ``predictions_name``.
        context: exposes ``train_index``, ``data`` and ``copy()``.
        predict_index: labels to predict.
        force_prediction: accepted for interface compatibility; not read by
            this function.

    Returns:
        tuple: ``(preds, x, y)`` -- predictions indexed like the predict rows,
        plus the features and target that were used.
    """
    # len() instead of truthiness: the truth value of an index is ambiguous.
    if len(context.train_index & predict_index):
        print("WARNING: train and predict indices overlap...")

    x, y = fit(config, context)

    # TODO: possible to have x loaded without new prediction rows
    if x is None:
        # rebuild just the necessary x:
        ctx = context.copy()
        ctx.data = context.data.ix[predict_index]
        x, y = get_xy(config, ctx)

    # NOTE: a column-consistency check between fit and predict (filling
    # missing columns, raising on mismatched column sets, reordering to the
    # columns used at fit time) is currently disabled.

    predict_x = x.reindex(predict_index)

    # make actual predictions
    ps = config.model.predict(predict_x.values)
    preds = Series(ps, index=predict_x.index)

    # prediction post-processing
    if config.prediction is not None:
        # Side effect: adds/overwrites the predictions column on context.data.
        context.data[config.predictions_name] = preds
        preds = build_target(config.prediction, context)
        preds = get_single_column(preds).reindex(predict_x.index)
    preds.name = ''
    return preds, x, y
def get_y(config, context):
    """Return the target built from ``config.target`` for ``context``."""
    target = build_target(config.target, context)
    return target