def load_dataset():
    '''Load the intrusion CSV and split it on a binary "abnormal" label.

    The file is read with the column names listed in ``cols`` and passed
    through ``utils.categorize``. The label column becomes True for every
    row whose label is not 'normal'. The column named by the last entry of
    ``cols`` is dropped before the split.

    Returns whatever ``utils.splitdf(frame, labelcol)`` produces
    (presumably a features/labels pair -- confirm against utils).
    '''
    column_names = [name for name, _ in cols]
    frame = pd.read_csv(intrusioncsvpath, names=column_names)
    frame.index.name = idcol
    frame = utils.categorize(frame, cols)

    # Anything not labelled 'normal' is treated as an abnormal connection.
    is_abnormal = frame[labelcol] != 'normal'
    frame[labelcol] = is_abnormal

    # The last column listed in cols is not a feature, so remove it.
    non_feature_column = cols[-1][0]
    frame = frame.drop(non_feature_column, axis=1)

    return utils.splitdf(frame, labelcol)
def load_dataset():
    '''Return Iris features and a boolean "is virginica" label.

    The CSV's unnamed first column is renamed to ``idcol`` and used as the
    index. The ``labelcol`` values are replaced by True where they equal
    'virginica' and False otherwise, and the 'Species' column is renamed
    to 'virginica' before the frame is split on that column.
    '''
    iris = pd.read_csv(iriscsvpath).rename(columns={'Unnamed: 0': idcol})

    # Binary target: True for virginica, False for every other value.
    iris[labelcol] = iris[labelcol] == 'virginica'

    iris = iris.rename(columns={'Species': 'virginica'}).set_index(idcol)

    X, y = utils.splitdf(iris, 'virginica')
    return X, y
def load_dataset():
    '''Return IBM customers and labels.

    Reads the CSV at ``telco_data_path``, applies ``drop_missing``, and
    resets the index (named 'id'). The frame is split on ``labelcol``.
    The features then go through ``booleanize_senior_citizen``,
    ``utils.drop_non_features`` and ``utils.categorize``. The labels
    become True where the original value equals 'Yes'.
    '''
    raw = pd.read_csv(telco_data_path)
    customers = drop_missing(raw).reset_index()
    customers.index.name = 'id'

    X, y = utils.splitdf(customers, labelcol)

    # Feature preparation, applied in this fixed order.
    X = booleanize_senior_citizen(X)
    X = utils.drop_non_features(X, cols)
    X = utils.categorize(X, cols)

    # Turn the label column into booleans: 'Yes' -> True, anything else False.
    y = y == 'Yes'

    return X, y
def load_dataset():
    '''Return Real Telco customers and labels.

    Selects every row of the Hive table ``jfletcher.churn_test_3`` through
    Spark and converts the result to a pandas DataFrame. The frame goes
    through ``drop_missing``, gets a fresh index named 'id', and is split
    on ``labelcol``.

    Returns:
        A ``(features, labels)`` tuple. ``features`` has been passed
        through ``booleanize_senior_citizen``, ``utils.drop_non_features``
        and ``utils.categorize``; ``labels`` is True where the original
        label equals 'Yes'.
    '''
    conf = SparkConf().setAppName("Telco Churn IRL")
    # Only one SparkContext may be active per process. Constructing one
    # unconditionally made a second call to this function (or a call made
    # while another context was running) raise. getOrCreate reuses the
    # active context when there is one. As in the original code, the
    # context is deliberately left running afterwards.
    sc = SparkContext.getOrCreate(conf=conf)
    sqlContext = HiveContext(sc)

    df = sqlContext.sql("select * from jfletcher.churn_test_3").toPandas()
    df = drop_missing(df).reset_index()
    df.index.name = 'id'

    features, labels = utils.splitdf(df, labelcol)
    features = booleanize_senior_citizen(features)
    features = utils.drop_non_features(features, cols)
    features = utils.categorize(features, cols)

    # Booleanize the label column: 'Yes' -> True, anything else False.
    labels = (labels == 'Yes')
    return features, labels
def load_dataset():
    '''Load the loans dataset and split it on ``labelcol``.

    First tries the cached copy via ``utils.load_processed_dataset('loans')``.
    If that raises IOError, the data is rebuilt from ``read_raw_data()``,
    run through the cleaning steps below, and saved back with
    ``utils.save_processed_dataset`` so the next call can use the cache.

    Returns whatever ``utils.splitdf(loans, labelcol)`` produces
    (presumably a features/labels pair -- confirm against utils).
    '''
    try:
        processed = utils.load_processed_dataset('loans')
    except IOError:
        print('Not found. Regenerating...')
        processed = read_raw_data().set_index(idcol)

        # Single-argument cleaning steps, applied strictly in this order.
        cleaning_steps = (
            remove_incomplete,
            remove_missing_revol_util,
            add_frac_repaid,
            remove_unfully_paid,
            remove_overpaid,
            add_not_repaid,
            parse_term,
            parse_percent,
        )
        for step in cleaning_steps:
            processed = step(processed)

        processed = utils.categorize(processed, cols)
        processed = utils.drop_non_features(processed, cols)

        # Save the rebuilt frame for later calls.
        utils.save_processed_dataset(processed, 'loans')

    return utils.splitdf(processed, labelcol)
def load_dataset():
    '''Load the breast cancer CSV and split it on a boolean label.

    The frame is indexed by ``idcol``. The label column is replaced by
    True where the value is 'M' (malignant) and False otherwise.

    Returns whatever ``utils.splitdf(records, labelcol)`` produces
    (presumably a features/labels pair -- confirm against utils).
    '''
    records = pd.read_csv(breastcancercsvpath).set_index(idcol)

    # 'M' marks a malignant sample; it becomes the True class.
    is_malignant = records[labelcol] == 'M'
    records[labelcol] = is_malignant

    return utils.splitdf(records, labelcol)