def _predict(ids, X, clf, outputFpath):
    """ Predict on test data and write the results to a csv file.

    @param ids: iterable of row ids; each is cast to int for the output
    @param X: test feature matrix; missing values are imputed here
    @param clf: fitted estimator exposing a predict method
    @param outputFpath: path of the csv file to write (no index column)
    @return: DataFrame with columns 'id' and 'repeatProbability'
    """
    print('Prediction data:')
    print_missing_values_info(X)
    # Impute missing values before predicting, mirroring the training
    # pipeline in classify(). NOTE(review): this fits a fresh Imputer on the
    # test set rather than reusing the training-set statistics — confirm
    # that is intended.
    X = Imputer().fit_transform(X)
    # 'rowId' instead of 'id' to avoid shadowing the builtin.
    # clf is a regressor here (see classify), so predict() — not
    # predict_proba() — is the right call.
    res = pandas.DataFrame({'id': [int(rowId) for rowId in ids],
                            'repeatProbability': clf.predict(X)})
    res.to_csv(outputFpath, index=False)
    return res
def make_data(dataFname, enc, features=None):
    """ reads x and y data (no imputation, yes feature selection)
    also encodes the categorical features f776 and f777
    @param dataFname: name of the training csv file
    @param features: specific features to use. None by default.
    @param enc: the OneHotEncoder. None for training data, not-None for testing data
    @return xdata, ydata (None if test data), ids, enc (OneHotEncoder for f776 and f777)
    """
    origData = pandas.read_csv(dataFname)
    # keep the ids aside before the column is deleted below
    ids = origData['id']
    # remove unused columns ('Unnamed: 0' is a stray index column some csvs carry)
    if 'Unnamed: 0' in origData.columns:
        del origData['Unnamed: 0']
    del origData['id']
    # remove "data leakage" columns (prohobitedFeatures is a module-level list)
    for f in prohobitedFeatures:
        del origData[f]
    # separate into X & y values ('loss' is the target column when present)
    xData = origData[[col for col in origData.columns if not col=='loss']]
    # mark the discrete columns as object dtype (mutates xData in place)
    set_vars_as_type(xData, discreteVars, object)
    # yVec is None for test data, which has no 'loss' column
    yVec = origData.loss if 'loss' in origData.columns else None
    # try f528 - f274: engineered difference feature
    xData['f528f274'] = xData['f528'] - xData['f274']
    # encode the categorical features f776 and f777.
    # Training call (enc is None) fits a fresh encoder; test call reuses the
    # encoder fitted on the training data so columns line up.
    if enc is None:
        # n_values=[2, 2]: f776 and f777 are assumed binary (0/1) —
        # TODO confirm against the data.
        enc = OneHotEncoder(n_values=[2, 2])
        enc.fit(xData[['f776', 'f777']])
    # The wrapped DataFrame has a fresh RangeIndex; assignment aligns on
    # index, which matches xData's index straight after read_csv.
    xData[['f776_isZero', 'f776_isOne', 'f777_isZero', 'f777_isOne']] = pandas.DataFrame(enc.transform(xData[['f776', 'f777']]).toarray())
    # drop the original categorical columns now that they are one-hot encoded
    del xData['f776']
    del xData['f777']
    print_missing_values_info(origData)
    # feature selection: a truthy features list restricts the columns
    if features:
        filteredXData = xData[features]
    else:
        # use ALL features
        filteredXData = xData
    return filteredXData, yVec, ids, enc
def classify(X, y, lossString, fit=True):
    """ Build (and optionally fit) the model on training data.

    Despite the name, this constructs a GradientBoostingRegressor, not a
    classifier (name kept for caller compatibility).

    @param X: training feature matrix; missing values are imputed here
    @param y: training target vector
    @param lossString: loss name forwarded to GradientBoostingRegressor
    @param fit: if True (default), fit the model before returning
    @return: the (possibly fitted) GradientBoostingRegressor
    """
    print('Training data:')
    print_missing_values_info(X)
    # Impute missing values; prediction-time code applies the same step.
    X = Imputer().fit_transform(X)
    clf = GradientBoostingRegressor(learning_rate=0.1, loss=lossString,
                                    n_estimators=1000, subsample=0.9)
    if fit:
        clf.fit(X, y)
    return clf