Python DataProcessor.discretize Examples

Programming Language: Python

Namespace/Package Name: DataProcessor

Class/Type: DataProcessor

Method/Function: discretize

Examples at hotexamples.com: 1

Python DataProcessor.discretize - 1 examples found. These are the top rated real world Python examples of DataProcessor.DataProcessor.discretize extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

DataProcessor(30)

ProcessTestData(18)

execute(6)

get_model(4)

ProcessTrainData(4)

get_labels(3)

get_dev_examples(3)

get_geo_twi_target_info(2)

create_tuples(2)

create_txt_files(2)

applyPipeline(2)

get_X(2)

create_dataframe(2)

GetTimeStampsFromTestData(2)

ExtractValidationLabels(2)

generate_valtrain_batch(1)

getAuraWithId(1)

generate_valtest_batch(1)

getAuraWithIdLine(1)

getData(1)

fivefold(1)

getDuration(1)

filterAndNormalizeFullDataset(1)

getListOfTuples(1)

CreateGreyPickle(1)

getMaxLength(1)

getPdLocation(1)

getTags(1)

getWords(1)

get_accommodation_stats(1)

get_column_np(1)

get_features(1)

get_matrix(1)

get_medicine_examples(1)

loadInput(1)

newDataProcessor(1)

printOutput(1)

filter(1)

divide_xy(1)

evalfscore_v17(1)

calc_expected_return_probablity_based_on_monte_carlo(1)

GetOutputsFromTestData(1)

GetPitchFromTestData(1)

GetRollFromTestData(1)

ProcessInferenceData(1)

ProcessTestDataGray(1)

_get_valtest_mul_data(1)

_get_valtrain_mul_data(1)

_internal_validate_predict_best_param(1)

add_row(1)

Example #1

Show file

def main():
    """
	Initialization point for running the entire experiment.
	Takes no arguments
	Returns no output
	"""

    dp = DataProcessor(
        '../datasets', 'filtered'
    )  # Create DataProcessor object with a path to the data directory and
    # output directory name.

    dp.clean(
        'breast-cancer-wisconsin.csv')  # Clean breast-cancer-wisconsin data
    dp.discretize('iris.csv', categories=6)
    dp.discretize('glass.csv', categories=10)

    dp.filter(
        'breast-cancer-wisconsin.csv', 2
    )  # Seperate breast-cancer-wisconsin data classes each into their own csv
    dp.mangle('glass_categorized.csv')
    dp.mangle('house-votes-84.csv')
    dp.mangle('iris_categorized.csv')
    dp.mangle('soybean-small.csv')

    dp.filter('breast-cancer-wisconsin.csv', 2)
    dp.filter('breast-cancer-wisconsin.csv', 4)
    dp.filter('breast-cancer-wisconsin_mangled.csv', 2)
    dp.filter('breast-cancer-wisconsin_mangled.csv', 4)

    dp.discretize('glass.csv',
                  categories=10)  # Discretize glass dataset into 10 catagories

    dp.filter('glass_categorized.csv',
              1)  # Seperate glass data classes each into their own csv
    dp.filter('glass_categorized.csv', 2)
    dp.filter('glass_categorized.csv', 3)
    dp.filter('glass_categorized.csv', 5)
    dp.filter('glass_categorized.csv', 6)
    dp.filter('glass_categorized.csv', 7)
    dp.filter('glass_categorized_mangled.csv', 1)
    dp.filter('glass_categorized_mangled.csv', 2)
    dp.filter('glass_categorized_mangled.csv', 3)
    dp.filter('glass_categorized_mangled.csv', 5)
    dp.filter('glass_categorized_mangled.csv', 6)
    dp.filter('glass_categorized_mangled.csv', 7)

    dp.filter('house-votes-84.csv', 'republican'
              )  # Seperate house-votes data classes each into their own csv
    dp.filter('house-votes-84.csv', 'democrat')
    dp.filter('house-votes-84_mangled.csv', 'republican')
    dp.filter('house-votes-84_mangled.csv', 'democrat')

    dp.discretize('iris.csv',
                  categories=6)  # Discretize iris data set into 6 catagories
    dp.filter(
        'iris_categorized.csv',
        'Iris-setosa')  # Seperate iris data classes each into their own csv
    dp.filter('iris_categorized.csv', 'Iris-versicolor')
    dp.filter('iris_categorized.csv', 'Iris-virginica')
    dp.filter('iris_categorized_mangled.csv', 'Iris-setosa')
    dp.filter('iris_categorized_mangled.csv', 'Iris-versicolor')
    dp.filter('iris_categorized_mangled.csv', 'Iris-virginica')

    dp.filter(
        'soybean-small.csv',
        'D1')  # Seperate soybean-small data classes each into their own csv
    dp.filter('soybean-small.csv', 'D2')
    dp.filter('soybean-small.csv', 'D3')
    dp.filter('soybean-small.csv', 'D4')
    dp.filter('soybean-small_mangled.csv', 'D1')
    dp.filter('soybean-small_mangled.csv', 'D2')
    dp.filter('soybean-small_mangled.csv', 'D3')
    dp.filter('soybean-small_mangled.csv', 'D4')

    print("BREAST CANCER WISCONSIN\n")
    # Create a new TrainingSetCalc object with the breast-cancer-wisconsin data set
    tsc = TrainingSetCalc(
        "../datasets/filtered/",
        ['breast-cancer-wisconsin_2.csv', 'breast-cancer-wisconsin_4.csv'])
    bcw_results = Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(10)))

    print("HOUSE VOTES 84\n")
    # Create a new TrainingSetCalc object with the house-votes data set
    tsc = TrainingSetCalc(
        "../datasets/filtered/",
        ['house-votes-84_democrat.csv', 'house-votes-84_republican.csv'])
    hv_results = Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(10)))

    print("GLASS\n")
    # Create a new TrainingSetCalc object with the glass-catagorized data set
    tsc = TrainingSetCalc("../datasets/filtered/", [
        'glass_categorized_1.csv', 'glass_categorized_2.csv',
        'glass_categorized_3.csv', 'glass_categorized_5.csv',
        'glass_categorized_6.csv', 'glass_categorized_7.csv'
    ])
    glass_results = Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(10)))

    print("IRIS\n")
    # Create a new TrainingSetCalc object with the iris-catagorized data set
    tsc = TrainingSetCalc("../datasets/filtered/", [
        'iris_categorized_Iris-setosa.csv',
        'iris_categorized_Iris-versicolor.csv',
        'iris_categorized_Iris-virginica.csv'
    ])
    iris_results = Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(10)))

    print("SOYBEANS\n")
    # Create a new TrainingSetCalc object with the soybean-small data set
    tsc = TrainingSetCalc("../datasets/filtered/", [
        'soybean-small_D1.csv', 'soybean-small_D2.csv', 'soybean-small_D3.csv',
        'soybean-small_D4.csv'
    ])
    soybean_results = Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(10)))

    print("BREAST CANCER WISCONSIN (MANGLED)\n")
    # Create a new TrainingSetCalc object with the mangled breast-cancer-wisconsin data set
    tsc = TrainingSetCalc("../datasets/filtered/", [
        'breast-cancer-wisconsin_mangled_2.csv',
        'breast-cancer-wisconsin_mangled_4.csv'
    ])
    bcw_mangled_results = Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(10)))

    print("HOUSE VOTES 84 (MANGLED)\n")
    # Create a new TrainingSetCalc object with the mangled house-votes data set
    tsc = TrainingSetCalc("../datasets/filtered/", [
        'house-votes-84_mangled_democrat.csv',
        'house-votes-84_mangled_republican.csv'
    ])
    hv_mangled_results = Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(10)))
    Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(
        10)))  # Perform 10-fold cross validation on the algorithm using the
    # mangled house-votes data set and print formatted result

    print("GLASS (MANGLED)\n")
    # Create a new TrainingSetCalc object with the mangled glass-catagorized data set
    tsc = TrainingSetCalc("../datasets/filtered/", [
        'glass_categorized_mangled_1.csv', 'glass_categorized_mangled_2.csv',
        'glass_categorized_mangled_3.csv', 'glass_categorized_mangled_5.csv',
        'glass_categorized_mangled_6.csv', 'glass_categorized_mangled_7.csv'
    ])
    glass_mangled_results = Evaluate.evaluate(
        tsc.kFoldTrain(tsc.createSets(10)))
    Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(
        10)))  # Perform 10-fold cross validation on the algorithm using the
    # mangled glass-catagorized data set and print formatted result

    print("IRIS (MANGLED)\n")
    # Create a new TrainingSetCalc object with the mangled iris-catagorized data set
    tsc = TrainingSetCalc("../datasets/filtered/", [
        'iris_categorized_mangled_Iris-setosa.csv',
        'iris_categorized_mangled_Iris-versicolor.csv',
        'iris_categorized_mangled_Iris-virginica.csv'
    ])
    iris_mangled_results = Evaluate.evaluate(tsc.kFoldTrain(
        tsc.createSets(10)))
    Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(
        10)))  # Perform 10-fold cross validation on the algorithm using the
    # iris-catagorized data set and print formatted result

    print("SOYBEANS (MANGLED)\n")
    # Create a new TrainingSetCalc object with the mangled soybean-small data set
    tsc = TrainingSetCalc("../datasets/filtered/", [
        'soybean-small_mangled_D1.csv', 'soybean-small_mangled_D2.csv',
        'soybean-small_mangled_D3.csv', 'soybean-small_mangled_D4.csv'
    ])
    Evaluate.evaluate(tsc.kFoldTrain(tsc.createSets(
        10)))  # Perform 10-fold cross validation on the algorithm using the
    # mangled soybean-small data set and print formatted result
    soybean_mangled_results = Evaluate.evaluate(
        tsc.kFoldTrain(tsc.createSets(10)))

    bcw_diffs = []
    hv_diffs = []
    glass_diffs = []
    iris_diffs = []
    soybean_diffs = []

    for i in range(len(bcw_results)):
        bcw_diffs.append((bcw_results[i][0] - bcw_mangled_results[i][0],
                          bcw_results[i][1] - bcw_mangled_results[i][1],
                          bcw_results[i][2] - bcw_mangled_results[i][2]))

    for i in range(len(hv_results)):
        hv_diffs.append((hv_results[i][0] - hv_mangled_results[i][0],
                         hv_results[i][1] - hv_mangled_results[i][1],
                         hv_results[i][2] - hv_mangled_results[i][2]))

    for i in range(len(glass_results)):
        glass_diffs.append((glass_results[i][0] - glass_mangled_results[i][0],
                            glass_results[i][1] - glass_mangled_results[i][1],
                            glass_results[i][2] - glass_mangled_results[i][2]))

    for i in range(len(iris_results)):
        iris_diffs.append((iris_results[i][0] - iris_mangled_results[i][0],
                           iris_results[i][1] - iris_mangled_results[i][1],
                           iris_results[i][2] - iris_mangled_results[i][2]))

    for i in range(len(soybean_results)):
        soybean_diffs.append(
            (soybean_results[i][0] - soybean_mangled_results[i][0],
             soybean_results[i][1] - soybean_mangled_results[i][1],
             soybean_results[i][2] - soybean_mangled_results[i][2]))

    print("BREAST CANCER DIFFS")
    for i in range(len(bcw_diffs)):
        print(bcw_diffs[i][0], "\t", bcw_diffs[i][1], "\t", bcw_diffs[i][2])
    print("\n\n")

    print("HOUSE VOTE DIFFS")
    for i in range(len(hv_diffs)):
        print(hv_diffs[i][0], "\t", hv_diffs[i][1], "\t", hv_diffs[i][2])
    print("\n\n")

    print("GLASS DIFFS")
    for i in range(len(glass_diffs)):
        print(glass_diffs[i][0], "\t", glass_diffs[i][1], "\t",
              glass_diffs[i][2])
    print("\n\n")

    print("IRIS DIFFS")
    for i in range(len(iris_diffs)):
        print(iris_diffs[i][0], "\t", iris_diffs[i][1], "\t", iris_diffs[i][2])
    print("\n\n")

    print("SOYBEAN DIFFS")
    for i in range(len(soybean_diffs)):
        print(soybean_diffs[i][0], "\t", soybean_diffs[i][1], "\t",
              soybean_diffs[i][2])
    print("\n\n")