Beispiel #1
0
def main(date, delete, keywords=None, byLeaf=True, saveProportion=0.5):
    """
    Generates ML training and testing data from extracted CSV files

    Looks up the data directory for the given date, collects the matching
    data filenames, optionally removes previously generated ML files, splits
    the data into training and testing sets, and saves both sets.

    :param date: (string) Data collection date YYYY_MMDD; used as a key into
                          DATA_DIRECTORIES (and date + "_ML" when deleting)
    :param delete: (boolean) Determines whether or not to delete the existing
                             training/testing data and sample count files
    :param keywords: (list of strings) Data filename keywords. Defaults to
                                       None, which is treated as an empty list
    :param byLeaf: (boolean) Should we separate the train/test data
                             by leaf, or should we randomly separate
                             the data according to a set proportion?
    :param saveProportion: (float) Amount of each CSV file to save as training
                                   and testing data.

    :return: (None)
    """

    # Build a fresh list on every call instead of sharing one default list
    # object between calls (mutable default argument pitfall).
    if keywords is None:
        keywords = []

    # Get the data files we will be looking at
    dataPath = DATA_DIRECTORIES[date]
    dataFilenames = FileIO.getDatafileNames(dataPath, keywords)

    # If desired, remove the old training data and start fresh
    if delete:

        mlDataPath = DATA_DIRECTORIES[date + "_ML"]

        # Resolve every path before deleting anything, then remove only the
        # files that are actually present.
        stalePaths = [os.path.join(mlDataPath, relativePath)
                      for relativePath in (TRAINING_DATA_PATH,
                                           TESTING_DATA_PATH,
                                           SAMPLE_COUNTS_PATH)]

        for stalePath in stalePaths:
            if os.path.exists(stalePath):
                os.remove(stalePath)

    # Consolidate the CSV files into training and testing data
    (train_X, train_y, test_X, test_y) = DataManipulation.separateTrainTest(
        dataPath,
        dataFilenames,
        byLeaf=byLeaf,
        saveProportion=saveProportion)

    # Save the training and testing data in the proper spot
    FileIO.saveTrainingData(date, train_X, train_y)
    FileIO.saveTestingData(date, test_X, test_y)