예제 #1
0
def load_dataset():
    '''Load the intrusion CSV and split it on a boolean label.

    The file at ``intrusioncsvpath`` is read with column names taken from
    the first element of each pair in ``cols`` (the second element is
    presumably an "is categorical" flag consumed by ``utils.categorize`` --
    confirm against that helper).

    Returns whatever ``utils.splitdf(frame, labelcol)`` returns.
    '''
    column_names = [name for name, _ in cols]
    connections = pd.read_csv(intrusioncsvpath, names=column_names)
    connections.index.name = idcol
    connections = utils.categorize(connections, cols)

    # True marks an abnormal connection, i.e. anything not tagged 'normal'.
    is_abnormal = connections[labelcol] != 'normal'
    connections[labelcol] = is_abnormal

    # The final column listed in cols is not a feature, so discard it.
    trailing_column = cols[-1][0]
    connections = connections.drop(trailing_column, axis=1)

    return utils.splitdf(connections, labelcol)
예제 #2
0
def load_dataset():
    '''Return Iris data and a binary label (not Virginica=0, Virginica=1).

    The CSV's unnamed first column is renamed to ``idcol`` and becomes the
    index. The ``labelcol`` column is replaced by a boolean that is True
    where the value equals 'virginica', and the 'Species' column is then
    renamed to 'virginica' before splitting on it.
    '''
    iris = pd.read_csv(iriscsvpath).rename(columns={'Unnamed: 0': idcol})

    # Booleanize the label before the rename below changes the column name.
    is_virginica = iris[labelcol] == 'virginica'
    iris[labelcol] = is_virginica

    iris = iris.rename(columns={'Species': 'virginica'}).set_index(idcol)

    measurements, targets = utils.splitdf(iris, 'virginica')
    return measurements, targets
예제 #3
0
def load_dataset():
    '''Return IBM customers and labels.

    Reads the CSV at ``telco_data_path``, passes it through
    ``drop_missing``, resets the index and names it 'id', then splits on
    ``labelcol``. Features go through ``booleanize_senior_citizen``,
    ``utils.drop_non_features`` and ``utils.categorize`` (in that order);
    labels become True where the raw value equals 'Yes'.
    '''
    customers = drop_missing(pd.read_csv(telco_data_path))
    customers = customers.reset_index()
    customers.index.name = 'id'

    raw_features, raw_labels = utils.splitdf(customers, labelcol)

    with_senior_flag = booleanize_senior_citizen(raw_features)
    feature_columns_only = utils.drop_non_features(with_senior_flag, cols)
    prepared_features = utils.categorize(feature_columns_only, cols)

    is_yes = raw_labels == 'Yes'
    return prepared_features, is_yes
예제 #4
0
def load_dataset():
    '''Return Real Telco customers and labels.

    The customer table is fetched from Hive through Spark and converted to
    a pandas DataFrame. It then goes through ``drop_missing``, gets a fresh
    index named 'id', and is split on ``labelcol``. Features are passed
    through ``booleanize_senior_citizen``, ``utils.drop_non_features`` and
    ``utils.categorize``; labels become True where the raw value is 'Yes'.

    Returns:
        A ``(features, labels)`` pair.
    '''
    conf = SparkConf().setAppName("Telco Churn IRL")
    # Only one SparkContext may be active per process. getOrCreate reuses a
    # running context instead of raising, so this function can be called
    # more than once (or from a shell/notebook that already owns a context).
    sc = SparkContext.getOrCreate(conf)
    sqlContext = HiveContext(sc)
    df = sqlContext.sql("select * from jfletcher.churn_test_3").toPandas()

    df = drop_missing(df).reset_index()
    df.index.name = 'id'
    features, labels = utils.splitdf(df, labelcol)
    features = booleanize_senior_citizen(features)
    features = utils.drop_non_features(features, cols)
    features = utils.categorize(features, cols)
    labels = (labels == 'Yes')
    return features, labels
예제 #5
0
def load_dataset():
    '''Return the loans dataset split on ``labelcol``.

    A previously processed copy is loaded via
    ``utils.load_processed_dataset('loans')``. If that raises ``IOError``
    the dataset is rebuilt from the raw data, saved with
    ``utils.save_processed_dataset`` and then used.
    '''
    try:
        processed = utils.load_processed_dataset('loans')
    except IOError:
        print('Not found. Regenerating...')
        processed = read_raw_data().set_index(idcol)

        # Single-argument transforms, applied strictly in this order.
        pipeline = (
            remove_incomplete,
            remove_missing_revol_util,
            add_frac_repaid,
            remove_unfully_paid,
            remove_overpaid,
            add_not_repaid,
            parse_term,
            parse_percent,
        )
        for transform in pipeline:
            processed = transform(processed)

        processed = utils.categorize(processed, cols)
        processed = utils.drop_non_features(processed, cols)
        utils.save_processed_dataset(processed, 'loans')

    return utils.splitdf(processed, labelcol)
예제 #6
0
def load_dataset():
    df = pd.read_csv(breastcancercsvpath)
    df = df.set_index(idcol)
    df[labelcol] = (df[labelcol] == 'M')  # Malignant == True
    return utils.splitdf(df, labelcol)