def performFeatureSelection(maxlag):
    import functions
    import datetime

    target = 'CLASSIFICATION'
    lags = range(2, maxlag) 
    print 'Maximum time lag applied', max(lags)
    print ''

    for maxdelta in range(3,12):
        #datasets = functions.loadDatasets('/home/francesco/Dropbox/DSR/StocksProject/longdatasets')
        #start = datetime.datetime(1990, 1, 1)
        #end = datetime.datetime(2014, 8, 31)
        #out = functions.getStock('AAPL', start, end)
        datasets = functions.loadDatasets('/home/francesco/Dropbox/DSR/StocksProject/longdatasets')
        #datasets.insert(0, out)


        delta = range(2,maxdelta) 
        print 'Delta days accounted: ', max(delta)
    
        for dataset in datasets:
            columns = dataset.columns    
            adjclose = columns[-2]
            returns = columns[-1]
            for n in delta:    
                functions.addFeatures(dataset, adjclose, returns, n)
            #dataset = dataset.iloc[max(delta):,:] 
        
        finance = functions.mergeDataframes(datasets, 6, target)
        #finance = finance.ix[max(delta):]
        print 'Size of data frame: ', finance.shape
        print 'Number of NaN after merging: ', functions.count_missing(finance)
    
        finance = finance.interpolate(method='time')
        print 'Number of NaN after time interpolation: ', functions.count_missing(finance)

        finance = finance.fillna(finance.mean())
        print 'Number of NaN after mean interpolation: ', functions.count_missing(finance)    
        
        back = -1
        finance.Return_Out = finance.Return_Out.shift(back)

        finance = functions.applyTimeLag(finance, lags, delta, back, target)
    
        print 'Number of NaN after temporal shifting: ', functions.count_missing(finance)
    
        print 'Size of data frame after feature creation: ', finance.shape   
    
        if target == 'CLASSIFICATION':
            start_test = datetime.datetime(2014,4,1)
            X_train, y_train, X_test, y_test  = functions.prepareDataForClassification(finance, start_test)
         
            acc = functions.performCV(X_train, y_train, 10, 'GTB', [])           
            print ''            
            print 'Mean Accuracy for (%d, %d): %f' % (max(lags), max(delta), acc)             
            #print functions.performClassification(X, y, X_val, y_val, 'ADA', [100, 1])
            print '============================================================================'
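The snippet above sweeps maxdelta internally (3 to 11) but leaves maxlag to the caller. A minimal driver sketch, assuming functions.py and the dataset directory referenced above are available; the choice of maxlag values is an assumption made to mirror the maxdelta sweep, not taken from the source:

# Hypothetical driver for the grid search above; the maxlag range is assumed.
if __name__ == '__main__':
    for maxlag in range(3, 12):
        performFeatureSelection(maxlag)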
Example #2
import datetime

import functions  # project helper module used throughout these examples


def final():
    target = "CLASSIFICATION"
    lags = range(2, 3)
    print "Maximum time lag applied", max(lags)

    start = datetime.datetime(1990, 1, 1)
    end = datetime.datetime(2014, 8, 31)
    out = functions.getStock("GE", start, end)
    datasets = functions.loadDatasets("/home/francesco/Dropbox/DSR/StocksProject/longdatasets")
    datasets.insert(0, out)

    delta = range(2, 5)
    print "Max Delta days accounted: ", max(delta)

    for dataset in datasets:
        columns = dataset.columns
        adjclose = columns[-2]
        returns = columns[-1]
        for n in delta:
            functions.addFeatures(dataset, adjclose, returns, n)
        # dataset = dataset.iloc[max(delta):,:]
    finance = functions.mergeDataframes(datasets, 6, target)
    # finance = finance.ix[max(delta):]
    print "Size of data frame: ", finance.shape
    print "Number of NaN after merging: ", functions.count_missing(finance)
    print "% of NaN after merging: ", (
        functions.count_missing(finance) / float(finance.shape[0] * finance.shape[1])
    ) * 100, "%"

    finance = finance.interpolate(method="time")
    print "Number of NaN after time interpolation: ", functions.count_missing(finance)

    finance = finance.fillna(finance.mean())
    print "Number of NaN after mean interpolation: ", functions.count_missing(finance)

    back = -1
    # finance.Return_SP500 = finance.Return_SP500.shift(back)
    finance.Return_Out = finance.Return_Out.shift(back)

    finance = functions.applyTimeLag(finance, lags, delta, back, target)
    # finance = functions.mergeSentimenToStocks(finance)
    # print finance.columns
    print "Number of NaN after temporal shifting: ", functions.count_missing(finance)
    print "Size of data frame after feature creation: ", finance.shape
    if target == "CLASSIFICATION":
        start_test = datetime.datetime(2014, 4, 1)
        X_train, y_train, X_test, y_test = functions.prepareDataForClassification(finance, start_test)

        print ""
        # print 'Performing CV...'
        # grid = {'n_estimators': [80, 100, 150], 'learning_rate': [0.01, 0.1, 1, 10]}
        # grid = {'n_estimators': [50, 80, 100, 1000]}
        # functions.performTimeSeriesSearchGrid(finance, 4, 0.8, features, 'ADA', grid)

        print functions.performClassification(X_train, y_train, X_test, y_test, "RF", [])
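The functions module itself is not shown on this page. As orientation only, here is a hedged sketch of what a date-based split in the spirit of prepareDataForClassification could look like; the helper name, the use of Return_Out as the prediction target, and the sign-based label are illustrative assumptions, not the project's actual code:

def prepareDataForClassification_sketch(finance, start_test, target_col='Return_Out'):
    # Illustrative only: the label is the sign of the (already forward-shifted)
    # output return; every other column is used as a feature.
    y = (finance[target_col] > 0).astype(int)
    X = finance.drop([target_col], axis=1)

    # Date-based split: rows before start_test train, the rest test.
    train = finance.index < start_test
    return X[train], y[train], X[~train], y[~train]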
Example #3
import pandas as pd

# loadDatasets, path_counties and path_manure are defined elsewhere in the source module.
path_manure_nonpts = 'GIS_data/final_data/manure_nonpts.csv'
path_AD_pts = 'GIS_data/final_data/AD_pts.csv'
path_COMB_pts = 'GIS_data/final_data/COMB_pts.csv'
path_W2E_pts = 'GIS_data/final_data/W2E_pts.csv'
path_DES_CBG_pts = 'GIS_data/final_data/DES_CBGcntrd.csv'
path_PROC_ZC_pts = 'GIS_data/final_data/PROC_ZCcntrd.csv'
path_msw_CBGcntrd_pts = 'GIS_data/final_data/DES_CBGcntrd.csv'
path_crp2016_pts = 'GIS_data/final_data/DES_CBGcntrd.csv'
path_crp2020_pts = 'GIS_data/final_data/DES_CBGcntrd.csv'
path_crp2050_pts = 'GIS_data/final_data/DES_CBGcntrd.csv'
#path_proc_pts = 'GIS_data/final_data/proc_pts.csv'
path_proc_nonpts = 'GIS_data/final_data/proc_nonpts.csv'
path_thermal = 'GIS_data/final_data/DES_CBGcntrd.csv'

# Load all data as geodataframes
counties_df = loadDatasets(path_counties)
biomass_pts = loadDatasets(path_manure)
AD_pts = loadDatasets(path_AD_pts)
COMB_pts = loadDatasets(path_COMB_pts)
W2E_pts = loadDatasets(path_W2E_pts)
DES_CBG_pts = loadDatasets(path_DES_CBG_pts)
PROC_ZC_pts = loadDatasets(path_PROC_ZC_pts)
msw_CBGcntrd_pts = loadDatasets(path_msw_CBGcntrd_pts)
crp2016_pts = loadDatasets(path_crp2016_pts)
crp2020_pts = loadDatasets(path_crp2020_pts)
crp2050_pts = loadDatasets(path_crp2050_pts)
manure_nonpts = pd.read_csv(path_manure_nonpts)
proc_nonpts = pd.read_csv(path_proc_nonpts)
thermal = loadDatasets(path_thermal)

# Set the preferred result columns for each dataset
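loadDatasets is not shown on this page; the comment above only says each CSV is loaded as a geodataframe. A rough sketch of such a helper, assuming geopandas, hypothetical longitude/latitude columns, and a WGS84 CRS, none of which are confirmed by the source:

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

def loadDatasets_sketch(path, lon_col='longitude', lat_col='latitude'):
    # Read the CSV, then build point geometries from the coordinate columns.
    df = pd.read_csv(path)
    geometry = [Point(xy) for xy in zip(df[lon_col], df[lat_col])]
    return gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')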