Ejemplo n.º 1
0
def main():
    """Train and persist one model per entry in ``definitions.dataFiles``.

    For every key the matching data file is read, feature and label columns
    are built on the frame (the helpers' return values are discarded, so
    they appear to work in place), the frame is narrowed with
    ``lendingclub_helper.getFinishedLoans``, and a model is trained on the
    feature matrix against the first label column.  The model is written to
    ``svc_<key>.pkl`` and the wall-clock time for that key is printed.
    """
    print("Begin: " + __file__)

    for dataset in definitions.dataFiles:
        started = time.time()
        print("Processing: " + dataset)

        source_path = definitions.dataFiles[dataset]
        frame = pandas_helper.readData(source_path, lendingclub_helper.dtypes)

        # Features are built and checked before any label work is done.
        lendingclub_helper.buildFeatures(frame)
        feature_cols = lendingclub_helper.getFeatureColumns(frame)
        features_have_null = pandas_helper.columnsHaveNull(
            frame, feature_cols.tolist())
        assert not features_have_null

        # Labels are only checked after the frame has been narrowed by
        # getFinishedLoans.
        lendingclub_helper.buildLabels(frame)
        label_cols = lendingclub_helper.getLabelColumns(frame)
        frame = lendingclub_helper.getFinishedLoans(frame)
        labels_have_null = pandas_helper.columnsHaveNull(
            frame, label_cols.tolist())
        assert not labels_have_null

        # Only the first label column is used as the training target.
        feature_matrix = frame[feature_cols].values
        target = frame[label_cols[0]].values
        model = scikit_helper.trainModel(feature_matrix, target)

        scikit_helper.saveModel(model, "svc_{0}.pkl".format(dataset))

        elapsed = time.time() - started
        print("Elapsed {0} seconds".format(elapsed))
Ejemplo n.º 2
0
def getTrainableDataFrame(filename):
    """Read *filename* and return the frame after feature/label preparation.

    The file is read with ``pandas_helper.readData`` using
    ``lendingclub_helper.dtypes``.  ``buildFeatures`` and
    ``buildCategorialLabel`` are then applied to the frame; their return
    values are ignored, so the same frame object that was read is returned.

    Args:
        filename: Path handed straight to ``pandas_helper.readData``.

    Returns:
        The frame produced by ``pandas_helper.readData``.
    """
    frame = pandas_helper.readData(filename, lendingclub_helper.dtypes)

    lendingclub_helper.buildFeatures(frame)
    # The "loan_status" column is processed with parse_loan_status.
    lendingclub_helper.buildCategorialLabel(
        frame,
        "loan_status",
        lendingclub_helper.parse_loan_status,
    )

    return frame
Ejemplo n.º 3
0
def main():
    """Concatenate every configured data file and store the result as HDF5.

    Each path in ``definitions.dataFiles`` is read with
    ``pandas_helper.readData`` (using ``lendingclub_helper.dtypes``), the
    resulting frames are concatenated with ``pd.concat``, the combined shape
    is printed, and the frame is written to ``everything.hdf`` under the key
    ``"everything"``.

    Raises:
        ValueError: From ``pd.concat`` if ``definitions.dataFiles`` is empty.
    """
    dataFrames = []

    for key in definitions.dataFiles:
        filename = definitions.dataFiles[key]
        print("Reading " + filename)
        dataFrames.append(
            pandas_helper.readData(filename, lendingclub_helper.dtypes))

    print("Combining dataFrames")
    df = pd.concat(dataFrames)
    print("Shape: {0}".format(df.shape))

    # Pickling (df.to_pickle) was tried and dropped: it could not cope with
    # a frame this large, hence HDF5.
    # `key` is passed by keyword because pandas 2.1+ deprecates positional
    # arguments to to_hdf other than the path.
    df.to_hdf("everything.hdf", key="everything")
Ejemplo n.º 4
0
def main():
    """Merge every configured data file into a single ``combined.csv``.

    Each file in ``definitions.dataFiles`` is loaded, the shape of every
    loaded frame is printed, and the concatenation of all frames is written
    to ``combined.csv`` without the index column.
    """
    print("Begin: " + __file__)

    frames = []
    for name in definitions.dataFiles:
        print("Loading data for " + name)
        path = definitions.dataFiles[name]
        # NOTE(review): get_dtypes_by_name is passed as an attribute
        # reference, not called.  Whether readData expects a callable or a
        # dtype mapping is not visible here - confirm.
        loaded = pandas_helper.readData(
            path, lendingclub_columns.get_dtypes_by_name)
        frames.append(loaded)

    # Report the individual shapes before the combined one.
    for frame in frames:
        print(frame.shape)

    combined = pd.concat(frames)
    print(combined.shape)

    combined.to_csv("combined.csv", index=False)

    print("End: " + __file__)
Ejemplo n.º 5
0
def main():
    """Train and save a model for one hard-coded dataset key.

    The data file for the selected key is read, features and labels are
    built on the frame, a model is trained on the feature matrix against the
    first label column, and the result is saved as ``model_<key>.pkl``.
    Progress messages are printed along the way.
    """
    # Dataset selection is manual: edit this value to target another file.
    # Alternatives listed alongside it: "2012-2013", "2014", "2015",
    # "2016Q1".."2016Q4", "2017Q1".."2017Q4".
    dataset = "2007-2011"

    print("Generate model for " + dataset)

    print("Reading data...")
    source_path = definitions.dataFiles[dataset]
    frame = pandas_helper.readData(source_path, lendingclub_helper.dtypes)

    print("Building features...")
    # Both builders run before either column list is queried.
    lendingclub_helper.buildFeatures(frame)
    lendingclub_helper.buildLabels(frame)
    feature_cols = lendingclub_helper.getFeatureColumns(frame)
    label_cols = lendingclub_helper.getLabelColumns(frame)

    print("Training model...")
    feature_matrix = frame[feature_cols].values
    # Only the first label column is used as the training target.
    target = frame[label_cols[0]].values
    trained = scikit_helper.trainModel(feature_matrix, target)

    print("Saving model...")
    scikit_helper.saveModel(trained, "model_{0}.pkl".format(dataset))

    print("Done")
Ejemplo n.º 6
0
def main():
    """Load every saved model and data file, print one model key, and return.

    As written, only the first part of this function executes: logging is
    set up, each ``../models/svc_<key>.pkl`` model and each data file in
    ``definitions.dataFiles`` is loaded into a dict, the first model key is
    printed, and the function returns.

    NOTE(review): the cross-evaluation loop after the bare ``return`` (every
    model scored against every data file, with accuracy and timing logged)
    is unreachable.  This looks like a work-in-progress short-circuit -
    confirm whether the dead code should be revived or removed.
    """
    print("Begin: " + __file__)

    setupLogging()
    # getLogger() with no name returns the root logger.
    logger = logging.getLogger()


    # read data
    # Models and frames are both keyed by the definitions.dataFiles key.
    models = {}
    dataFrames = {}
    for key in definitions.dataFiles:
        print("Reading " + key)

        modelFilename = "../models/svc_{0}.pkl".format(key)
        models[key] = scikit_helper.loadModel(modelFilename)

        # NOTE(review): readData is called with the path only here (no
        # second argument) - confirm that is intended.
        dataFrames[key] = pandas_helper.readData(definitions.dataFiles[key])



    for keyModel in models:
        print(keyModel)
        # NOTE(review): this break exits on the first iteration, so the
        # nested loop below never runs.
        break
        for keyDataFrame in dataFrames:
            print("Using model " + keyModel + " with data " + keyDataFrame)


    # NOTE(review): unconditional return - everything below is dead code.
    return


    # Unreachable: score each saved model (key1) against each data file
    # (key2), re-reading the model and data from disk instead of using the
    # dicts built above.
    for key1 in definitions.dataFiles:
        # This timestamp is overwritten by the inner loop before it is read.
        begin = time.time()

        filename = "../models/svc_{0}.pkl".format(key1)
        clf = scikit_helper.loadModel(filename)

        for key2 in definitions.dataFiles:
            begin = time.time()

            df = pandas_helper.readData(definitions.dataFiles[key2])

            lendingclub_helper.buildFeatures(df)
            featureColumns = lendingclub_helper.getFeatureColumns(df)
            assert(not pandas_helper.columnsHaveNull(df, featureColumns.tolist()))

            lendingclub_helper.buildLabels(df)
            labelColumns = lendingclub_helper.getLabelColumns(df)

            # Labels are null-checked only after narrowing to finished loans.
            df = lendingclub_helper.getFinishedLoans(df)
            assert(not pandas_helper.columnsHaveNull(df, labelColumns.tolist()))

            X = df[featureColumns].values
            y = df[labelColumns[0]].values
            
            accuracy = scikit_helper.getAccuracy(clf, X, y)

            # NOTE(review): "accurary" in the log message is a typo for
            # "accuracy"; left as-is because it is runtime output.
            logger.info("model: {0}, data: {1}, accurary: {2}".format(key1, key2, accuracy))

            end = time.time()
            logger.info("Elapsed {0} seconds".format(end-begin))

    print("End: " + __file__)