Example #1
def getQuantityBasedFeatures(dataName, dataFrequency):
    import pandas as pd
    import numpy as np

    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from config.environment import getAppConfigData

    # Variable to hold the deployment directory path of the current Jupyter node
    # in the operating system, resolved via getJupyterRootDirectory
    jupyterNodePath = getJupyterRootDirectory()

    autoConfigData = getAppConfigData()

    preProcessedDataFilePath = autoConfigData[dataName][dataFrequency][
        'preProcessedDataFilePath']

    # read the raw processed data from csv file
    df = pd.read_csv(jupyterNodePath + preProcessedDataFilePath)

    # normalize quantity: center on the mean, scale by the maximum
    qtyMean = np.mean(df['quantity'])
    qtyMax = np.max(df['quantity'])
    normalizedQuantityDf = ((df['quantity'] - qtyMean) /
                            qtyMax).rename('normalizedQuantity')

    # first-order difference of quantity, normalized the same way
    qtyDiffDf = df['quantity'] - df['quantity'].shift(1)
    qtyDiffMean = np.mean(qtyDiffDf)
    qtyDiffMax = np.max(qtyDiffDf)
    normalizedQtyDiffDf = ((qtyDiffDf - qtyDiffMean) /
                           qtyDiffMax).rename('normalizedQtyDiff')

    # rename before concat so the two feature columns do not both
    # inherit the name 'quantity'
    return pd.concat([normalizedQuantityDf, normalizedQtyDiffDf], axis=1)
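
A minimal usage sketch of the same mean-centering/max-scaling on a toy 'quantity' column, with the config lookup and csv read stripped out (the values are invented for illustration):

# Illustration only: toy data in place of the configured pre-processed csv.
import pandas as pd
import numpy as np

toyDf = pd.DataFrame({'quantity': [100.0, 120.0, 90.0, 150.0]})

qty = toyDf['quantity']
normalizedQuantity = (qty - np.mean(qty)) / np.max(qty)

qtyDiff = qty - qty.shift(1)  # first row becomes NaN
normalizedQtyDiff = (qtyDiff - np.mean(qtyDiff)) / np.max(qtyDiff)

print(pd.concat([normalizedQuantity.rename('normalizedQuantity'),
                 normalizedQtyDiff.rename('normalizedQtyDiff')], axis=1))
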
def getPreprocessedData(dataName, dataFrequency, autoConfigFileRelativePath,
                        KEY_preProcessedDataFilePath):
    import pandas as pd
    import numpy as np

    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from config.environment import getAppConfigData

    # calculate the deployment directory path of the current Jupyter node in the operating system
    jupyterNodePath = getJupyterRootDirectory()
    print("jupyterNodePath >>> " + jupyterNodePath)

    # resolved for diagnostic output only; getAppConfigData() locates the config file itself
    configFilePath = jupyterNodePath + autoConfigFileRelativePath
    print("configFilePath >>> " + configFilePath)

    autoConfigData = getAppConfigData()

    preProcessedDataFilePath = autoConfigData[dataName][dataFrequency][
        KEY_preProcessedDataFilePath]

    # read the raw processed data from the csv file - source of truth, use for
    # reference only and never update it. The configured path is relative to
    # the Jupyter root, so prefix it with jupyterNodePath
    inputRawProcessedDataDF = pd.read_csv(jupyterNodePath +
                                          preProcessedDataFilePath)

    return inputRawProcessedDataDF
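
A hedged invocation sketch; 'nifty50' and '5minute' are placeholder values, and the relative config path mirrors the one used later in this module:

# Hypothetical call - the data name and frequency must exist in config.json.
df = getPreprocessedData(
    dataName='nifty50',
    dataFrequency='5minute',
    autoConfigFileRelativePath='/src/config/autoConfig/config.json',
    KEY_preProcessedDataFilePath='preProcessedDataFilePath')
print(df.shape)
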
def preProcessData(dataName,
                   dataFrequency,
                   outputFileName="processedRawData.csv"):

    import os, sys, traceback

    import pandas as pd
    import glob

    from utilities.fileFolderManipulations import getParentFolder
    from utilities.fileFolderManipulations import createFolder
    from utilities.fileFolderManipulations import getJupyterRootDirectory

    from config.environment import getAppConfigData
    from config.environment import setAppConfigData

    from fastai.tabular import add_datepart

    print(' data pre-processing >> imported dependencies')

    relativeDataFolderPath = 'data/' + dataName + '/raw/' + dataFrequency

    # Variable to hold the deployment directory path of the current Jupyter node
    jupyterNodePath = None

    # Variable to hold a dataframe created with the data from the input data files in the relativeDataFolderPath provided
    inputRawDataDF = None

    # Variable to hold the absolute source folder path, calculated from the input relative path of the source folder (relativeDataFolderPath)
    # using python commands like os.path.abspath and os.path.join
    dataFolderPath = None

    # Variable to hold the glob pattern used to query all json file names in the source folder (dataFolderPath)
    json_pattern = None

    # Variable to contain the list of all input json file names in the source folder (dataFolderPath)
    file_list = None

    # return values of this method
    # -------------------------------------------------------------------------------
    # Current method's return value, initialized to False. Will be marked as True
    # after every line in the method has executed without errors
    returnValue = False
    # complete filepath of the csv file with the processed raw data
    outputFilePath = None
    outputFolderName = None

    # -------------------------------------------------------------------------------
    try:
        # calculate the deployment directory path of the current Jupyter node in the operating system
        jupyterNodePath = getJupyterRootDirectory()

        # show every dataframe column when printing (debugging aid)
        pd.set_option('display.max_columns', None)

        # creating pandas dataframe references for further modification
        inputRawDataDF = pd.DataFrame()

        #calculating the complete data folder path of the relative path provided as parameter
        dataFolderPath = jupyterNodePath + '/' + relativeDataFolderPath

        # build a glob pattern to find json files in the dataFolderPath calculated in the previous step
        json_pattern = os.path.join(dataFolderPath, '*.json')

        # store all the json file paths in the dataFolderPath for further processing
        file_list = glob.glob(json_pattern)

        # execution assertion/ui progress update info
        print('looping through all the files to create input data')
        # loop through all the files in the folder and build the inputRawDataDF pandas dataframe
        for file in file_list:
            print("reading input file >>> " + file + " ...")
            data = pd.read_json(file, lines=True)
            if isinstance(data, str):
                data = data['data'][0]['candles']
            else:
                data = data.values[0][0]['candles']

            # DataFrame.append was removed in pandas 2.0; concat an equivalent frame instead
            inputRawDataDF = pd.concat(
                [inputRawDataDF, pd.DataFrame(data)], ignore_index=True)
            print("File read - SUCCESS")

        inputRawDataDF.columns = [
            'date-time', 'open', 'high', 'low', 'close', 'quantity',
            'dont-know'
        ]

        # add_datepart drops the source column, so keep a copy of 'date-time'
        # and concat it back after the date parts are generated
        buffer = inputRawDataDF['date-time']
        add_datepart(inputRawDataDF, 'date-time')

        inputRawDataDF = pd.concat([buffer, inputRawDataDF], axis=1)

        #create prior_holidays feature
        priorHolidaysStamps = getPriorHoliDaysStamps(
            inputRawDataDF['date-timeDayofyear'])
        priorHolidaysStamps_df = pd.DataFrame(
            {'prior_holidays': priorHolidaysStamps[:]})

        inputRawDataDF = pd.concat([inputRawDataDF, priorHolidaysStamps_df],
                                   axis=1)
        print('added prior_holidays feature in pre-processed data')

        #create following_holidays feature
        followingHolidaysStamps = getFollowingHolidaysDaysStamp(
            inputRawDataDF['date-timeDayofyear'])
        followingHolidaysStamps_df = pd.DataFrame(
            {'following_holidays': followingHolidaysStamps[:]})

        inputRawDataDF = pd.concat(
            [inputRawDataDF, followingHolidaysStamps_df], axis=1)
        print('added following_holidays feature in pre-processed data')
        '''
        w  write mode
        r  read mode
        a  append mode

        w+  create file if it doesn't exist and open it in (over)write mode
            [it overwrites the file if it already exists]
        r+  open an existing file in read+write mode
        a+  create file if it doesn't exist and open it in append mode
        '''

        processFolderName = getParentFolder(dataFolderPath,
                                            2) + '/processed/' + dataFrequency
        print('Attempting to create folder if it does not exist >>>' +
              processFolderName)
        createFolder(processFolderName)

        outputFolderName = processFolderName + '/preProcessedData'
        print('Attempting to create folder if it does not exist >>>' +
              outputFolderName)
        createFolder(outputFolderName)

        outputFilePath = outputFolderName + '/' + outputFileName
        print('Attempting to create/update file >>>' + outputFilePath)
        #f = open(output_file_name, 'w+')  # open file in append mode
        #f.write('')
        #f.close()
        #np.savetxt(output_file_name, inputRawDataDF, delimiter=",")
        inputRawDataDF.to_csv(outputFilePath, sep=',', index=False)

        print(
            'created an easy-to-use csv of the raw data, to be used for preparing training data, in the location >>> '
            + outputFilePath)

        print(' creating/updating autoConfig file')
        configFilePath = jupyterNodePath + '/src/config/autoConfig/config.json'

        autoConfigData = getAppConfigData()

        if not autoConfigData.get(dataName):
            autoConfigData[dataName] = {}

        if not autoConfigData[dataName].get(dataFrequency):
            autoConfigData[dataName][dataFrequency] = {}

        autoConfigData[dataName][dataFrequency] = {
            'preProcessedDataFilePath':
            outputFilePath.replace(jupyterNodePath, '')
        }

        setAppConfigData(autoConfigData)
        print(' creating/updating autoConfig file >>>' + configFilePath)

        returnValue = True
    except:
        print("Error executing method >>> ")

        # http://docs.python.org/2/library/sys.html#sys.exc_info
        exc_type, exc_value, exc_traceback = sys.exc_info()  # most recent (if any)
        '''
        Reason this _can_ be bad: if an (unhandled) exception happens AFTER this,
        or if we do not delete the labels on (not much) older versions of Python,
        the reference we created can linger.

        traceback.format_exc/print_exc do this very thing, BUT note this creates a
        temp scope within the function.
        '''

        traceback_details = {
            'filename': exc_traceback.tb_frame.f_code.co_filename,
            'lineno': exc_traceback.tb_lineno,
            'name': exc_traceback.tb_frame.f_code.co_name,
            'type': exc_type.__name__,
            'message': str(exc_value)
        }

        # So we don't leave our local labels/objects dangling.
        # This still isn't "completely safe", though! Best practice is to use
        # sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2] directly.
        del exc_type, exc_value, exc_traceback

        print()
        print(traceback.format_exc())
        print()
        print('Exception in {filename}:{lineno} ({name}) - {type}: {message}'
              .format(**traceback_details))
        print()

        raise

    finally:
        # note: returning from a finally block suppresses the re-raise above
        return [
            returnValue, outputFolderName, outputFileName, outputFilePath,
            inputRawDataDF
        ]
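
The exception handler above recurs verbatim in the functions below; a sketch of how it could be factored into a single helper (the helper name is an assumption, not part of the module):

# Hypothetical helper showing how the repeated sys.exc_info()/traceback
# handler could be written once and reused by every function in this module.
import sys
import traceback

def printExceptionDetails():
    exc_type, exc_value, exc_traceback = sys.exc_info()
    details = {
        'filename': exc_traceback.tb_frame.f_code.co_filename,
        'lineno': exc_traceback.tb_lineno,
        'name': exc_traceback.tb_frame.f_code.co_name,
        'type': exc_type.__name__,
        'message': str(exc_value),
    }
    print(traceback.format_exc())
    print('Exception in {filename}:{lineno} ({name}) - {type}: {message}'
          .format(**details))

# usage inside any except block in this module:
#     except Exception:
#         printExceptionDetails()
#         raise
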
def getRedGreenCandlesCatogizedBySizeDf(df,
                                        dataName,
                                        dataFrequency,
                                        boundaryValues=None):
    # @Param :: boundaryValues
    #   - should be an array of tuples (the first four entries are used as size boundaries)
    #   - each tuple must be a pair of a negative and a positive float-compatible value, e.g. (-0.44, 0.44)
    #   - the absolute value of each element in a tuple should be less than that of the corresponding element in the previous entry
    #       example [(-3.44,2.44),(-3.32,1.37),(-1.11,1.01),(-0.53,0.76),(-0.02,0.019)]
    import os, sys, traceback
    import json

    import pandas as pd
    import numpy as np

    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from config.environment import getAppConfigData
    from config.environment import setAppConfigData

    redCandlesBySizeDf = None
    greenCandlesBySizeDf = None
    redCandlesBySizeTimesMagnitudeDf = None
    greenCandlesBySizeTimesMagnitudeDf = None
    redGreenCandlesTanhDf = None
    redGreenCandlesTanhTimesMagnitudeDf = None

    try:

        if boundaryValues is None:
            print('boundary values were not supplied as a parameter')
            # calculate the deployment directory path of the current Jupyter node in the operating system
            jupyterNodePath = getJupyterRootDirectory()
            print("jupyterNodePath >>> " + jupyterNodePath)

            autoConfigData = getAppConfigData()

            if not autoConfigData.get(dataName):
                autoConfigData[dataName] = {}

            if not autoConfigData[dataName].get(dataFrequency):
                autoConfigData[dataName][dataFrequency] = {}

            boundaryValues = autoConfigData[dataName][dataFrequency].get(
                'redGreenCandleSizeBoundaries')
            if boundaryValues is None or (isinstance(boundaryValues, str)
                                          and boundaryValues.strip() == ''):
                print('boundary values are not configured')
                closeOpenDiffDf = (df['close'] -
                                   df['open']).rename('close_open_diff')

                candlesByBodyLengthDf = closeOpenDiffDf.sort_values(
                    axis=0,
                    ascending=True,
                    inplace=False,
                    kind='quicksort',
                    na_position='last').reset_index(drop=True)
                sortedRedCandles = candlesByBodyLengthDf.loc[
                    candlesByBodyLengthDf < 0].reset_index(drop=True)

                candlesByBodyLengthDf = closeOpenDiffDf.sort_values(
                    axis=0,
                    ascending=False,
                    inplace=False,
                    kind='quicksort',
                    na_position='last').reset_index(drop=True)
                sortedGreenCandles = candlesByBodyLengthDf.loc[
                    candlesByBodyLengthDf > 0].reset_index(drop=True)

                interval = np.arange(.2, 1, .2)
                indexArr = [(int(sortedRedCandles.shape[0] * interval[itr]),
                             int(sortedGreenCandles.shape[0] * interval[itr]))
                            for itr in range(0, interval.size)]

                boundaryValues = [(sortedRedCandles[indexItr[0] - 1],
                                   sortedGreenCandles[indexItr[1] - 1])
                                  for indexItr in indexArr]

                autoConfigData[dataName][dataFrequency].update(
                    {'redGreenCandleSizeBoundaries': boundaryValues})

                #print('pushing values to autoConfigFile >>> ' + configFilePath + ' with data '+ json.dumps(autoConfigData))
                setAppConfigData(autoConfigData)
            else:
                print(
                    'using configured boundary values - do not update the configuration unless you are absolutely sure of it'
                )
        else:
            print('using boundary values provided as parameter')

        # magnitude proxy: L2 norm of the open/close/high/low vector, divided by 4
        dataMagnitudeDf = np.divide(
            np.sqrt(
                np.sum(np.square(df[['open', 'close', 'high', 'low']]),
                       axis=1)), 4)

        # bucket red candles (close < open) into size categories 1-5;
        # recoding in place relies on the positive category codes falling
        # outside the negative boundary ranges tested afterwards.
        # pandas >= 2.0 requires string values for `inclusive`
        redCandlesBySizeDf = (df['close'] -
                              df['open']).rename('redCandlesBySize')
        redCandlesBySizeDf[redCandlesBySizeDf >= 0] = 0
        redCandlesBySizeDf[redCandlesBySizeDf < boundaryValues[0][0]] = 5
        redCandlesBySizeDf[redCandlesBySizeDf.between(
            boundaryValues[0][0], boundaryValues[1][0], inclusive='both')] = 4
        redCandlesBySizeDf[redCandlesBySizeDf.between(
            boundaryValues[1][0], boundaryValues[2][0],
            inclusive='neither')] = 3
        redCandlesBySizeDf[redCandlesBySizeDf.between(
            boundaryValues[2][0], boundaryValues[3][0], inclusive='both')] = 2
        redCandlesBySizeDf[redCandlesBySizeDf.between(
            boundaryValues[3][0], 0, inclusive='neither')] = 1

        # bucket green candles (close > open) into size categories 1-5
        greenCandlesBySizeDf = (df['close'] -
                                df['open']).rename('greenCandlesBySize')
        greenCandlesBySizeDf[greenCandlesBySizeDf <= 0] = 0
        greenCandlesBySizeDf[greenCandlesBySizeDf > boundaryValues[0][1]] = 5
        greenCandlesBySizeDf[greenCandlesBySizeDf.between(
            boundaryValues[1][1], boundaryValues[0][1], inclusive='both')] = 4
        greenCandlesBySizeDf[greenCandlesBySizeDf.between(
            boundaryValues[2][1], boundaryValues[1][1],
            inclusive='neither')] = 3
        greenCandlesBySizeDf[greenCandlesBySizeDf.between(
            boundaryValues[3][1], boundaryValues[2][1], inclusive='both')] = 2
        greenCandlesBySizeDf[greenCandlesBySizeDf.between(
            0, boundaryValues[3][1], inclusive='neither')] = 1

        # sign of the candle body: -1 for red, +1 for green (0 for a doji)
        redGreenCandlesTanhDf = (df['close'] -
                                 df['open']).rename('redGreenCandlesTanh')
        redGreenCandlesTanhDf[redGreenCandlesTanhDf < 0] = -1
        redGreenCandlesTanhDf[redGreenCandlesTanhDf > 0] = 1
        redGreenCandlesTanhTimesMagnitudeDf = np.multiply(
            redGreenCandlesTanhDf, dataMagnitudeDf)

        redCandlesBySizeTimesMagnitudeDf = -np.multiply(
            redCandlesBySizeDf, dataMagnitudeDf)
        greenCandlesBySizeTimesMagnitudeDf = np.multiply(
            greenCandlesBySizeDf, dataMagnitudeDf)

        redCandlesBySizeTimesMagnitudeDf = redCandlesBySizeTimesMagnitudeDf.rename(
            'redCandlesBySizeTimesMagnitude')
        greenCandlesBySizeTimesMagnitudeDf = greenCandlesBySizeTimesMagnitudeDf.rename(
            'greenCandlesBySizeTimesMagnitude')

    except:
        print("Error executing method >>> ")

        # http://docs.python.org/2/library/sys.html#sys.exc_info
        exc_type, exc_value, exc_traceback = sys.exc_info()  # most recent (if any)
        '''
        Reason this _can_ be bad: if an (unhandled) exception happens AFTER this,
        or if we do not delete the labels on (not much) older versions of Python,
        the reference we created can linger.

        traceback.format_exc/print_exc do this very thing, BUT note this creates a
        temp scope within the function.
        '''

        traceback_details = {
            'filename': exc_traceback.tb_frame.f_code.co_filename,
            'lineno': exc_traceback.tb_lineno,
            'name': exc_traceback.tb_frame.f_code.co_name,
            'type': exc_type.__name__,
            'message': str(exc_value)
        }

        # So we don't leave our local labels/objects dangling.
        # This still isn't "completely safe", though! Best practice is to use
        # sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2] directly.
        del exc_type, exc_value, exc_traceback

        print()
        print(traceback.format_exc())
        print()
        print('Exception in {filename}:{lineno} ({name}) - {type}: {message}'
              .format(**traceback_details))
        print()

        raise

    finally:

        return pd.concat([
            redCandlesBySizeDf, greenCandlesBySizeDf,
            redCandlesBySizeTimesMagnitudeDf,
            greenCandlesBySizeTimesMagnitudeDf, redGreenCandlesTanhDf,
            redGreenCandlesTanhTimesMagnitudeDf.rename(
                'redGreenCandlesTanhTimesMagnitudeDf')
        ], axis=1)
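
A toy check of the boundaryValues contract described at the top of the function; the numbers come from the docstring example, trimmed to the four entries the code actually indexes:

# Each tuple pairs a negative red boundary with a positive green one, and
# magnitudes shrink from entry to entry.
boundaryValues = [(-3.44, 2.44), (-3.32, 1.37), (-1.11, 1.01), (-0.53, 0.76)]

for (prevRed, prevGreen), (red, green) in zip(boundaryValues,
                                              boundaryValues[1:]):
    assert red < 0 < green
    assert abs(red) <= abs(prevRed) and abs(green) <= abs(prevGreen)
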
Example #5
def doBasicOperation(dataName, dataFrequency):
    import os, sys, traceback
    from datetime import datetime, timedelta

    import pandas as pd
    import numpy as np

    from config.environment import getAppConfigData
    from config.environment import setAppConfigData

    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from utilities.fileFolderManipulations import getParentFolder
    from utilities.fileFolderManipulations import createFolder
    print("into method doBasicOperation")

    return_fundamentalFeaturesDf = None

    try:

        # calculate the deployment directory path of the current Jupyter node in the operating system
        jupyterNodePath = getJupyterRootDirectory()

        # holds data from the input data file - source of truth; use for reference only, never update
        inputRawProcessedDataDF = None

        autoConfigData = getAppConfigData()

        preProcessedDataFilePath = autoConfigData[dataName][dataFrequency][
            'preProcessedDataFilePath']

        # read the raw processed data from csv file
        inputRawProcessedDataDF = pd.read_csv(jupyterNodePath +
                                              preProcessedDataFilePath)

        return_fundamentalFeaturesDf = createFundamentalFeatures(
            inputRawProcessedDataDF)

        print("before return statement of method doBasicOperation ")

    except:
        print("Error executing method >>> ")

        # http://docs.python.org/2/library/sys.html#sys.exc_info
        exc_type, exc_value, exc_traceback = sys.exc_info()  # most recent (if any)
        '''
        Reason this _can_ be bad: if an (unhandled) exception happens AFTER this,
        or if we do not delete the labels on (not much) older versions of Python,
        the reference we created can linger.

        traceback.format_exc/print_exc do this very thing, BUT note this creates a
        temp scope within the function.
        '''

        traceback_details = {
            'filename': exc_traceback.tb_frame.f_code.co_filename,
            'lineno': exc_traceback.tb_lineno,
            'name': exc_traceback.tb_frame.f_code.co_name,
            'type': exc_type.__name__,
            'message': str(exc_value)
        }

        # So we don't leave our local labels/objects dangling.
        # This still isn't "completely safe", though! Best practice is to use
        # sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2] directly.
        del exc_type, exc_value, exc_traceback

        print()
        print(traceback.format_exc())
        print()
        print('Exception in {filename}:{lineno} ({name}) - {type}: {message}'
              .format(**traceback_details))
        print()

        raise
    finally:
        return return_fundamentalFeaturesDf
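
doBasicOperation only resolves the configured csv and hands it to createFundamentalFeatures (defined elsewhere in this module); a sketch with placeholder arguments:

# Placeholder arguments - the real values must exist in the auto-config file.
fundamentalFeaturesDf = doBasicOperation('nifty50', '5minute')
if fundamentalFeaturesDf is not None:
    print(fundamentalFeaturesDf.columns.tolist())
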
Example #6
def createFinalTrainingFeatureList(dataName,
                                   dataFrequency,
                                   variation_degree=-1):
    import os
    import glob
    import pandas as pd

    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from utilities.fileFolderManipulations import getParentFolder

    from config.environment import getAppConfigData
    from config.environment import setAppConfigData

    from dataPreparation.featurePreparation import doBasicOperation

    configData = getAppConfigData()

    projectRootFolderPath = getJupyterRootDirectory()

    if not isinstance(variation_degree, int) or variation_degree == -1:
        variation_degree = configData['variationDegreeForFeatureGeneration']

    _basicDf = doBasicOperation(dataName, dataFrequency)

    filteredFeaturesPath = "/data/" + dataName + "/processed/" + dataFrequency + "/features/filteredFeatures"
    outputFinalFeatureListFilePath = "/data/" + dataName + "/processed/" + dataFrequency + "/features/finalTrainingFeatureList.csv"
    print("filteredFeaturesFolderPath >>> " + filteredFeaturesPath)

    # build a glob pattern to find all csv feature files in the filteredFeatures folder
    # (filteredFeaturesPath already starts with '/')
    csv_pattern = os.path.join(projectRootFolderPath + filteredFeaturesPath,
                               '*.csv')
    print("declared csv_pattern")

    # store all the csv file paths in the folder for further processing
    file_list = glob.glob(csv_pattern)
    print("obtained file_list")

    # creating pandas dataframe references for further modification
    trainingFeatureDF = _basicDf
    print('initialized trainingFeatureDF')

    # execution assertion/ui progress update info
    print('looping through all the files to create input data')

    # loop through all the files in the folder and build the trainingFeatureDF pandas dataframe
    for file in file_list:
        print("reading input file >>> " + file + " ...")
        data = pd.read_csv(file)
        # stitch the new feature columns onto the training frame column-wise
        trainingFeatureDF = pd.concat([trainingFeatureDF, data], axis=1)
        print("File read - SUCCESS")

    # create the final training list file
    print("creating finalTrainingFeatureList in location >>> " +
          outputFinalFeatureListFilePath)
    trainingFeatureDF.to_csv(projectRootFolderPath +
                             outputFinalFeatureListFilePath)

    # update auto config file
    autoConfigData = getAppConfigData()
    autoConfigData[dataName][dataFrequency].update(
        {'finalTrainingFeaturesListFile': outputFinalFeatureListFilePath})
    setAppConfigData(autoConfigData)

    print("updated config file with data >>>> finalTrainingFeaturesListFile:" +
          outputFinalFeatureListFilePath)

    return trainingFeatureDF, outputFinalFeatureListFilePath
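
The loop above stitches each filtered-feature csv onto the frame column-wise; a minimal sketch of that pd.concat(axis=1) pattern on toy frames:

# Frames share a row index; each concat appends the new file's feature columns.
import pandas as pd

base = pd.DataFrame({'open': [1.0, 2.0], 'close': [1.5, 2.5]})
featureFile1 = pd.DataFrame({'featA': [0.1, 0.2]})
featureFile2 = pd.DataFrame({'featB': [9.0, 8.0]})

combined = base
for data in (featureFile1, featureFile2):
    combined = pd.concat([combined, data], axis=1)

print(combined)  # columns: open, close, featA, featB
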
Example #7
def prepareFeatureWithData(dataName,
                           dataFrequency,
                           newFeatureDf,
                           basicDf,
                           featureIndexStamp,
                           variation_degree=-1,
                           requiredMinimumCorrelation=-1):
    import os, sys, traceback

    import numpy as np
    import pandas as pd

    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from utilities.fileFolderManipulations import getParentFolder
    from utilities.fileFolderManipulations import createFolder
    from utilities.fileFolderManipulations import deleteFile

    from config.environment import setAppConfigData
    from config.environment import getAppConfigData

    correlation = None
    reasonableCorelation = None
    newTrainingSetDf = None
    trainableFeaturesDf = None

    try:

        configData = getAppConfigData()

        if not (isinstance(requiredMinimumCorrelation, int)
                or isinstance(requiredMinimumCorrelation,
                              float)) or requiredMinimumCorrelation == -1:
            requiredMinimumCorrelation = configData[
                'requiredMinimumFeatureCorrelationWithOutputData']
        print('requiredMinimumCorrelation is >>> ' +
              str(requiredMinimumCorrelation))

        if not isinstance(variation_degree, int) or variation_degree == -1:
            variation_degree = configData[
                'variationDegreeForFeatureGeneration']

        trainableFeaturesDf = None
        featureOfInterest = newFeatureDf.columns
        if variation_degree == 0:
            newTrainingSetDf = pd.concat([basicDf, newFeatureDf], axis=1)
        else:
            newTrainingSetDf = _createNewTrainingSetWithFeatureVariations(
                basicDf, newFeatureDf, featureOfInterest, variation_degree)

        # return newTrainingSetDf
        correlation = newTrainingSetDf.corr()
        correlation = correlation.drop_duplicates(keep='first')

        reasonableCorelation = correlation.loc[
            (np.abs(correlation['open']) > requiredMinimumCorrelation)
            & (np.abs(correlation['high']) > requiredMinimumCorrelation) &
            (np.abs(correlation['low']) > requiredMinimumCorrelation) &
            (np.abs(correlation['close']) > requiredMinimumCorrelation)]

        # drop duplicate features based on their correlation with the 'open' attribute
        # (experimental - could equally be keyed on close, high, or low)
        reasonableCorelation = reasonableCorelation.drop_duplicates(
            subset='open', keep='first')

        # create the necessary file/folder structure for storing and filtering features
        preprocessedFolderPath = '/data/' + dataName + '/processed/' + dataFrequency + '/preProcessedData'
        preProcessedDataFilePath = preprocessedFolderPath + '/processedRawData.csv'
        outputFolderPath = getParentFolder(preprocessedFolderPath)

        print('preprocessedFolderPath interim test >>> ' +
              preprocessedFolderPath)
        print('outputFolderPath interim test >>> ' + outputFolderPath)

        featuresFolder = outputFolderPath + "/features"
        createFolder(featuresFolder)
        print('featuresFolder interim test >>> ' + featuresFolder)

        rawFeaturesFolder = featuresFolder + "/rawFeatures"
        createFolder(rawFeaturesFolder)
        print('rawFeaturesFolder interim test >>> ' + rawFeaturesFolder)

        filteredFeaturesFolder = featuresFolder + "/filteredFeatures"
        createFolder(filteredFeaturesFolder)
        print('filteredFeaturesFolder interim test >>> ' +
              filteredFeaturesFolder)

        correlationsFolder = featuresFolder + "/correlations"
        createFolder(correlationsFolder)
        print('correlationsFolder interim test >>> ' + correlationsFolder)

        reasonableCorrelationsFolder = correlationsFolder + "/reasonableCorrelations"
        createFolder(reasonableCorrelationsFolder)
        print('reasonableCorrelationsFolder interim test >>> ' +
              reasonableCorrelationsFolder)

        trainableFeaturesListFilePath = filteredFeaturesFolder + "/" + featureIndexStamp + featureOfInterest[
            0] + "_trainableFeaturesList.csv"
        currentFeatureListFilePath = rawFeaturesFolder + "/" + featureIndexStamp + featureOfInterest[
            0] + "_variations_list.csv"
        currentFeatureCorrelationListFilePath = correlationsFolder + "/" + featureIndexStamp + featureOfInterest[
            0] + "_variations_correlation_list.csv"
        reasonableCorelationListFilePath = reasonableCorrelationsFolder + "/" + featureIndexStamp + featureOfInterest[
            0] + "_variations_reasonable_correlation_list.csv"

        print('trainableFeaturesListFilePath interim test >>> ' +
              trainableFeaturesListFilePath)
        print('currentFeatureListFilePath interim test >>> ' +
              currentFeatureListFilePath)
        print('currentFeatureCorrelationListFilePath interim test >>> ' +
              currentFeatureCorrelationListFilePath)
        print('reasonableCorelationListFilePath interim test >>> ' +
              reasonableCorelationListFilePath)

        deleteFile(trainableFeaturesListFilePath)
        deleteFile(currentFeatureListFilePath)
        deleteFile(currentFeatureCorrelationListFilePath)
        deleteFile(reasonableCorelationListFilePath)

        # store output information related to current
        print('currentFeatureListFilePath interim test >>> ' +
              currentFeatureListFilePath)
        newTrainingSetDf.to_csv(currentFeatureListFilePath,
                                sep=',',
                                index=False)
        correlation.to_csv(currentFeatureCorrelationListFilePath,
                           sep=',',
                           index=True)
        reasonableCorelation.to_csv(reasonableCorelationListFilePath,
                                    sep=',',
                                    index=True)

        if len(reasonableCorelation.index) > 4:
            # store trainable features in global file - to be used by other training feature creation procedures
            newFilteredTrainableFeaturesDf = newTrainingSetDf[
                list(reasonableCorelation.index)]
            trainableFeaturesDf = newFilteredTrainableFeaturesDf.drop(
                columns=["open", "close", "high", "low"])

            if trainableFeaturesDf is not None and trainableFeaturesDf.shape[
                    1] > 0:
                trainableFeaturesDf.to_csv(trainableFeaturesListFilePath,
                                           sep=',',
                                           index=False)

            # assertions
            print("newTrainingSetDf shape>>>" +
                  str(newTrainingSetDf.shape[0]) + "," +
                  str(newTrainingSetDf.shape[1]))
            print("trainableFeaturesDf shape>>>" +
                  str(trainableFeaturesDf.shape[0]) + "," +
                  str(trainableFeaturesDf.shape[1]))

            autoConfigData = getAppConfigData()
            autoConfigData[dataName][dataFrequency].update(
                {'trainableFeaturesListFile': trainableFeaturesListFilePath})
            setAppConfigData(autoConfigData)
        else:
            trainableFeaturesDf = getTrainableFeaturesListDf(
                trainableFeaturesListFilePath)

    except:
        print("Error executing method >>> ")

        # http://docs.python.org/2/library/sys.html#sys.exc_info
        exc_type, exc_value, exc_traceback = sys.exc_info()  # most recent (if any)
        '''
        Reason this _can_ be bad: if an (unhandled) exception happens AFTER this,
        or if we do not delete the labels on (not much) older versions of Python,
        the reference we created can linger.

        traceback.format_exc/print_exc do this very thing, BUT note this creates a
        temp scope within the function.
        '''

        traceback_details = {
            'filename': exc_traceback.tb_frame.f_code.co_filename,
            'lineno': exc_traceback.tb_lineno,
            'name': exc_traceback.tb_frame.f_code.co_name,
            'type': exc_type.__name__,
            'message': str(exc_value)
        }

        # So we don't leave our local labels/objects dangling.
        # This still isn't "completely safe", though! Best practice is to use
        # sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2] directly.
        del exc_type, exc_value, exc_traceback

        print()
        print(traceback.format_exc())
        print()
        print('Exception in {filename}:{lineno} ({name}) - {type}: {message}'
              .format(**traceback_details))
        print()

        raise
    finally:
        return correlation, reasonableCorelation, newTrainingSetDf, trainableFeaturesDf
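
A compact sketch of the correlation filter at the heart of this function, on toy data (the threshold value is invented):

# Keep features whose absolute correlation with each of open/high/low/close
# exceeds the required minimum.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
base = rng.normal(size=50)
toyDf = pd.DataFrame({
    'open': base, 'high': base + 0.1, 'low': base - 0.1, 'close': base + 0.05,
    'goodFeature': base * 2 + rng.normal(scale=0.01, size=50),
    'noiseFeature': rng.normal(size=50),
})

requiredMinimumCorrelation = 0.5  # invented threshold for the sketch
corr = toyDf.corr()
keep = corr.loc[(np.abs(corr['open']) > requiredMinimumCorrelation)
                & (np.abs(corr['high']) > requiredMinimumCorrelation)
                & (np.abs(corr['low']) > requiredMinimumCorrelation)
                & (np.abs(corr['close']) > requiredMinimumCorrelation)]
print(keep.index.tolist())  # expect open/high/low/close/goodFeature
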