def getDataForNeuralNetworkModel(standardScaler=True, squashHoliday=False, dropDescription=True):
    """Build scaled, one-hot encoded Metro traffic features for a neural network.

    standardScaler: use StandardScaler when True, otherwise MinMaxScaler.
    squashHoliday: collapse all named holidays into a single 'Holiday' label.
    dropDescription: drop 'weather_description' instead of one-hot encoding it.
    """
    mt = readMetroTrafficCSV()

    if squashHoliday:
        # Collapse every named holiday into a single 'Holiday' category.
        mt.holiday = np.vectorize(lambda h: 'None' if h == 'None' else 'Holiday')(mt.holiday)

    columnsToEncode = ['holiday', 'weather_main']
    if not dropDescription:
        columnsToEncode.append('weather_description')
    encoder = data_utils.DataEncoder(columnsToEncode, oneHotEncoding=True)
    mt = encoder.encode(mt)

    mt = cleanupMetroTrafficDups(mt, keep='last')
    mt = updateMetroTrafficData(mt, reindex=False, temp='F')

    columnsToDrop = ['date_time', 'rain_1h', 'snow_1h']
    if dropDescription:
        columnsToDrop.append('weather_description')
    mt = mt.drop(columns=columnsToDrop)

    # Scale the numeric columns; the categorical columns were one-hot encoded above.
    scaler = StandardScaler() if standardScaler else MinMaxScaler()
    scaleColumns = ['week_day', 'hour', 'temp', 'clouds_all']
    mt[scaleColumns] = scaler.fit_transform(mt[scaleColumns])

    xl, xt, yl, yt = splitMetroTrafficData(mt, intensity=True, approach='random')
    # Shift the intensity labels down by one so the classes are zero-based.
    return xl, xt, yl - 1, yt - 1
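

# Illustrative usage sketch, not part of the original pipeline: fit a small
# scikit-learn classifier on the prepared features. MLPClassifier and its
# hyperparameters are assumptions chosen only to show the expected call
# pattern; any estimator with fit/score would work the same way.
def exampleTrafficIntensityFit():
    from sklearn.neural_network import MLPClassifier

    xl, xt, yl, yt = getDataForNeuralNetworkModel(standardScaler=True,
                                                  squashHoliday=True)
    clf = MLPClassifier(hidden_layer_sizes=(32, 16), max_iter=500)
    clf.fit(xl, yl)
    return clf.score(xt, yt)
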
def getDowJonesData(stockClusters=0,
                    stockScale=False,
                    scale=None,
                    components=None,
                    window=None):
    """Prepare Dow Jones train/test feature and target splits.

    stockClusters: if > 0, add one-hot encoded k-means cluster features.
    stockScale: min-max scale features per stock before splitting.
    scale, components: forwarded to data_utils.createPipeline for optional preprocessing.
    window: optional rolling-mean window applied to both features and targets.
    """
    data = readDowJonesCSV()

    addDowJonesDerivedData(data)

    columns = [
        'quarter', 'stock', 'volume', 'percent_change_price',
        'percent_change_high', 'percent_change_low', 'days_to_next_dividend',
        'percent_return_next_dividend', 'percent_change_next_weeks_price'
    ]
    data = data[columns].copy()

    if stockScale:
        # Min-max scale each stock's rows independently, leaving 'quarter' untouched.
        gmmScaler = data_utils.GroupMinMaxScaler('stock', keepColumns=['quarter']).fit(data)
        data = gmmScaler.transform(data)

    xl, xt, yl, yt = splitDowJonesData(data)
    # 'quarter' and 'stock' are identifiers, not model features; drop them after the split.
    xl.drop(columns=['quarter', 'stock'], inplace=True)
    xt.drop(columns=['quarter', 'stock'], inplace=True)

    # Build the optional preprocessing pipeline and apply it to both splits.
    pipeline = data_utils.createPipeline(xl,
                                         scale=scale,
                                         components=components)
    if pipeline is not None:
        xl, xt = data_utils.preprocessData(pipeline,
                                           xl,
                                           xt,
                                           copyColumns=(components is None))

    if stockClusters > 0:
        # Cluster the training rows and add the cluster id as a feature; test rows
        # are assigned to the nearest learned centroid.
        km = KMeans(n_clusters=stockClusters).fit(xl)
        xl['cluster'] = km.labels_
        xt['cluster'] = km.predict(xt)
        encoder = data_utils.DataEncoder(columns=['cluster'],
                                         oneHotEncoding=True)
        xl = encoder.encode(xl)
        xt = encoder.encode(xt)

    if window:
        # Smooth features and targets with a rolling mean; the first window-1 rows
        # become NaN and are dropped from both sides.
        xl = xl.rolling(window).mean().dropna()
        xt = xt.rolling(window).mean().dropna()
        yl = yl.rolling(window).mean().dropna()
        yt = yt.rolling(window).mean().dropna()

    xl.reset_index(drop=True, inplace=True)
    xt.reset_index(drop=True, inplace=True)
    yl.reset_index(drop=True, inplace=True)
    yt.reset_index(drop=True, inplace=True)

    return xl, xt, yl, yt
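

# Illustrative usage sketch, not part of the original module: fetch the Dow
# Jones split with per-stock scaling and three k-means cluster features, then
# fit a plain LinearRegression on the returned targets. The estimator choice
# is an assumption used only to show the call pattern.
def exampleDowJonesRegression():
    from sklearn.linear_model import LinearRegression

    xl, xt, yl, yt = getDowJonesData(stockClusters=3, stockScale=True)
    reg = LinearRegression().fit(xl, yl)
    return reg.score(xt, yt)
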
    def testDataEncoder(self):
        df = pd.DataFrame({'A': ['11', '11', '22'], 'B': ['33', '44', '55']})
        de = data_utils.DataEncoder(['A', 'B'])
        self.assertEqual(de.getColumns(), ['A', 'B'])
        self.assertFalse(de.isOneHotEncoding())

        adf = de.encode(df)
        edf = pd.DataFrame({'A': [0, 0, 1], 'B': [0, 1, 2]})
        self.assertTrue(aequal(adf.values, edf.values))

        self.assertEqual(de.getLabel('A', 0), '11')
        self.assertEqual(de.getLabel('A', 1), '22')
        self.assertEqual(de.getLabel('B', 0), '33')
        self.assertEqual(de.getLabel('B', 1), '44')
        self.assertEqual(de.getLabel('B', 2), '55')
        # An unknown column yields the empty string.
        self.assertEqual(de.getLabel('C', 0), '')
def getMetroTrafficData(dupsKeep='last',
                        gapsAction='fill',
                        gapsSubAction=None,
                        dateTimeIndex=False,
                        temp=None):
    """Load the Metro traffic data with label-encoded categorical columns and
    optional duplicate/gap cleanup, datetime reindexing, and temperature conversion."""
    mt = readMetroTrafficCSV()

    encoder = data_utils.DataEncoder(['holiday', 'weather_main', 'weather_description'],
                                     oneHotEncoding=False)
    mt = encoder.encode(mt)

    if dupsKeep is not None:
        mt = cleanupMetroTrafficDups(mt, keep=dupsKeep)

    if gapsAction is not None:
        mt = cleanupMetroTrafficGaps(mt, action=gapsAction, subAction=gapsSubAction)

    mt = updateMetroTrafficData(mt, reindex=dateTimeIndex, temp=temp)

    return mt
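

# Illustrative usage sketch, not part of the original module: load the cleaned
# frame indexed by date_time with Fahrenheit temperatures and summarize the
# hourly traffic volume. The 'traffic_volume' column name comes from the Metro
# Interstate Traffic Volume dataset this module reads.
def exampleMetroTrafficSummary():
    mt = getMetroTrafficData(dupsKeep='last',
                             gapsAction='fill',
                             dateTimeIndex=True,
                             temp='F')
    return mt['traffic_volume'].describe()
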
    def testDataEncoderOneHotEncoding(self):
        df = pd.DataFrame({'A': ['11', '11', '22'], 'B': ['33', '44', '55']})
        de = data_utils.DataEncoder(['A', 'B'], oneHotEncoding=True)
        self.assertEqual(de.getColumns(), ['A', 'B'])
        self.assertTrue(de.isOneHotEncoding())

        adf = de.encode(df)
        edf = pd.DataFrame({
            'A_11': [1, 1, 0],
            'A_22': [0, 0, 1],
            'B_33': [1, 0, 0],
            'B_44': [0, 1, 0],
            'B_55': [0, 0, 1]
        })
        self.assertTrue(aequal(adf.values, edf.values))

        # With one-hot encoding there is no label mapping, so getLabel() returns ''.
        self.assertEqual(de.getLabel('A', 0), '')
        self.assertEqual(de.getLabel('A', 1), '')
        self.assertEqual(de.getLabel('B', 0), '')
        self.assertEqual(de.getLabel('B', 1), '')
        self.assertEqual(de.getLabel('C', 0), '')