def generate_bracket_csv(aug_train_set):
    """Fit a fixed + variable cost per tube assembly and write bracket.csv.

    Rows of ``aug_train_set`` whose ``bracketing_pattern`` equals the
    module-level ``brapa`` are grouped by ``tube_assembly_id``.  For every
    group, ``get_fixed_and_var_cost`` is run on the group's quantities and
    its inverse-log-transformed costs.  Each fitted fixed cost is then
    replaced by the entry of the module-level ``fc_vals`` lying within 0.1
    of it, and the variable cost is recomputed with ``get_var_cost_only``
    using that adjusted fixed cost.

    Args:
        aug_train_set: pandas DataFrame; the columns read here are
            ``bracketing_pattern``, ``tube_assembly_id``, ``quantity`` and
            ``log_cost``.

    Returns:
        None.  Writes ``bracket.csv`` (relative path) with the columns
        ``tube_assembly_id``, ``fixed_cost_class``, ``fixed_cost`` and
        ``var_cost``.

    Raises:
        AssertionError: if the assigned fixed-cost classes are not exactly
            0, 1, 2 and 3, or if a recomputed variable cost differs from
            the initially fitted one by 0.01 or more.
    """
    df = aug_train_set[aug_train_set.bracketing_pattern == brapa]
    grouped = df.groupby("tube_assembly_id")

    def quantities_and_costs(row_labels):
        # Costs are stored log-transformed; undo that before fitting.
        quantities = df.quantity[row_labels].values
        costs = inverse_log_transform_y(df.log_cost[row_labels].values)
        return quantities, costs

    taids = []
    fixed_costs = []
    var_costs = []
    # .items() instead of the Python 2-only .iteritems(): same pairs, and it
    # also runs on Python 3.
    for taid, row_labels in grouped.groups.items():
        quantities, costs = quantities_and_costs(row_labels)
        fixed_cost, var_cost, r2 = get_fixed_and_var_cost(quantities, costs)
        if r2 < 0.9999:
            # Only reported; the assembly is still recorded below.
            print("{} has bad r2".format(taid))
        taids.append(taid)
        fixed_costs.append(fixed_cost)
        var_costs.append(var_cost)

    fixed_costs = np.array(fixed_costs)
    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
    # -1 marks "no class assigned yet".
    fc_class = -1 * np.ones(len(taids), dtype=int)
    adj_fixed_costs = np.zeros(len(taids))
    for i, fc_val in enumerate(fc_vals):
        close_to_val = np.abs(fixed_costs - fc_val) < 0.1
        fc_class[close_to_val] = i
        adj_fixed_costs[close_to_val] = fc_val

    # Every assembly must have been assigned a class (no -1 left) and all
    # four classes must occur.  array_equal gives the same verdict as the
    # elementwise comparison but stays well-defined when the shapes differ.
    # NOTE(review): the literal presumes fc_vals has four entries - confirm.
    assert np.array_equal(np.unique(fc_class), [0, 1, 2, 3]), (
        "unexpected fixed-cost classes: {}".format(np.unique(fc_class)))

    adj_var_costs = np.zeros(len(taids))
    for i, taid in enumerate(taids):
        quantities, costs = quantities_and_costs(grouped.groups[taid])
        adj_var_costs[i] = get_var_cost_only(
            quantities, costs, adj_fixed_costs[i])
        # Adjusting the fixed cost should barely move the variable cost.
        assert np.abs(adj_var_costs[i] - var_costs[i]) < 0.01, (
            "{}: variable cost changed after adjusting fixed cost".format(
                taid))

    df = pd.DataFrame(
        {
            "tube_assembly_id": taids,
            "fixed_cost_class": fc_class,
            "fixed_cost": adj_fixed_costs,
            "var_cost": adj_var_costs,
        }
    )
    df.to_csv(
        "bracket.csv",
        index=False,
        columns=["tube_assembly_id", "fixed_cost_class", "fixed_cost", "var_cost"],
    )
def generate_bracket_csv(aug_train_set):
    """Fit a fixed + variable cost per tube assembly and write bracket.csv.

    Rows of ``aug_train_set`` whose ``bracketing_pattern`` equals the
    module-level ``brapa`` are grouped by ``tube_assembly_id``.  For every
    group, ``get_fixed_and_var_cost`` is run on the group's quantities and
    its inverse-log-transformed costs.  Each fitted fixed cost is then
    replaced by the entry of the module-level ``fc_vals`` lying within 0.1
    of it, and the variable cost is recomputed with ``get_var_cost_only``
    using that adjusted fixed cost.

    Args:
        aug_train_set: pandas DataFrame; the columns read here are
            ``bracketing_pattern``, ``tube_assembly_id``, ``quantity`` and
            ``log_cost``.

    Returns:
        None.  Writes ``bracket.csv`` (relative path) with the columns
        ``tube_assembly_id``, ``fixed_cost_class``, ``fixed_cost`` and
        ``var_cost``.

    Raises:
        AssertionError: if the assigned fixed-cost classes are not exactly
            0, 1, 2 and 3, or if a recomputed variable cost differs from
            the initially fitted one by 0.01 or more.
    """
    df = aug_train_set[aug_train_set.bracketing_pattern == brapa]
    grouped = df.groupby('tube_assembly_id')

    def quantities_and_costs(row_labels):
        # Costs are stored log-transformed; undo that before fitting.
        quantities = df.quantity[row_labels].values
        costs = inverse_log_transform_y(df.log_cost[row_labels].values)
        return quantities, costs

    taids = []
    fixed_costs = []
    var_costs = []
    # .items() instead of the Python 2-only .iteritems(): same pairs, and it
    # also runs on Python 3.
    for taid, row_labels in grouped.groups.items():
        quantities, costs = quantities_and_costs(row_labels)
        fixed_cost, var_cost, r2 = get_fixed_and_var_cost(quantities, costs)
        if r2 < 0.9999:
            # Only reported; the assembly is still recorded below.
            print("{} has bad r2".format(taid))
        taids.append(taid)
        fixed_costs.append(fixed_cost)
        var_costs.append(var_cost)

    fixed_costs = np.array(fixed_costs)
    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
    # -1 marks "no class assigned yet".
    fc_class = -1 * np.ones(len(taids), dtype=int)
    adj_fixed_costs = np.zeros(len(taids))
    for i, fc_val in enumerate(fc_vals):
        close_to_val = np.abs(fixed_costs - fc_val) < 0.1
        fc_class[close_to_val] = i
        adj_fixed_costs[close_to_val] = fc_val

    # Every assembly must have been assigned a class (no -1 left) and all
    # four classes must occur.  array_equal gives the same verdict as the
    # elementwise comparison but stays well-defined when the shapes differ.
    # NOTE(review): the literal presumes fc_vals has four entries - confirm.
    assert np.array_equal(np.unique(fc_class), [0, 1, 2, 3]), (
        "unexpected fixed-cost classes: {}".format(np.unique(fc_class)))

    adj_var_costs = np.zeros(len(taids))
    for i, taid in enumerate(taids):
        quantities, costs = quantities_and_costs(grouped.groups[taid])
        adj_var_costs[i] = get_var_cost_only(
            quantities, costs, adj_fixed_costs[i])
        # Adjusting the fixed cost should barely move the variable cost.
        assert np.abs(adj_var_costs[i] - var_costs[i]) < 0.01, (
            "{}: variable cost changed after adjusting fixed cost".format(
                taid))

    df = pd.DataFrame({
        'tube_assembly_id': taids,
        'fixed_cost_class': fc_class,
        'fixed_cost': adj_fixed_costs,
        'var_cost': adj_var_costs,
    })
    df.to_csv('bracket.csv', index=False, columns=[
        'tube_assembly_id', 'fixed_cost_class', 'fixed_cost', 'var_cost'])
# Split the target off the training frame.  X_train is the same object as
# aug_train_set, so pop() also removes 'log_cost' from aug_train_set itself
# before it is handed to get_predictions below.
X_train = aug_train_set
y_train = X_train.pop('log_cost')
X_test = aug_test_set

# print(...) with a single argument behaves the same on Python 2 and also
# parses on Python 3, unlike the print statement.
print("Predicting...")
timer = time()
y_train_pred = get_predictions(
    'all', expert_names, base_get_indices, aug_train_set)
# Root of the mean squared error between the popped log_cost values and the
# predictions.
train_rmsle = np.sqrt(mean_squared_error(y_train.values, y_train_pred))
print("train RMSLE {}".format(train_rmsle))
y_test_pred = get_predictions(
    'all', expert_names, base_get_indices, aug_test_set)
timer = time() - timer
print(" {} seconds elapsed".format(timer))

print("Writing output...")
timer = time()
# Both prediction files share one layout: a 1-based 'id' column followed by
# the inverse-log-transformed prediction as 'cost'.
for out_path, log_pred in (("train_pred.csv", y_train_pred),
                           ("test_pred.csv", y_test_pred)):
    df = pd.DataFrame()
    df['cost'] = inverse_log_transform_y(log_pred)
    df['id'] = df.index + 1
    df.to_csv(out_path, index=False, columns=['id', 'cost'])
timer = time() - timer
print(" {} seconds elapsed".format(timer))

print("Done!")
# Split the target off the training frame.  X_train is the same object as
# aug_train_set, so pop() also removes 'log_cost' from aug_train_set itself
# before it is handed to get_predictions below.
X_train = aug_train_set
y_train = X_train.pop('log_cost')
X_test = aug_test_set

# print(...) with a single argument behaves the same on Python 2 and also
# parses on Python 3, unlike the print statement.
print("Predicting...")
timer = time()
y_train_pred = get_predictions(
    'all', ['base'], base_get_indices, aug_train_set)
# Root of the mean squared error between the popped log_cost values and the
# predictions.
train_rmsle = np.sqrt(mean_squared_error(y_train.values, y_train_pred))
print("train RMSLE {}".format(train_rmsle))
y_test_pred = get_predictions(
    'all', ['base'], base_get_indices, aug_test_set)
timer = time() - timer
print(" {} seconds elapsed".format(timer))

print("Writing output...")
timer = time()
# Both prediction files share one layout: a 1-based 'id' column followed by
# the inverse-log-transformed prediction as 'cost'.
for out_path, log_pred in (("train_pred.csv", y_train_pred),
                           ("test_pred.csv", y_test_pred)):
    df = pd.DataFrame()
    df['cost'] = inverse_log_transform_y(log_pred)
    df['id'] = df.index + 1
    df.to_csv(out_path, index=False, columns=['id', 'cost'])
timer = time() - timer
print(" {} seconds elapsed".format(timer))

print("Done!")