def processingData(*year):
    """Download the requested years of data, then clean every file that
    downloaded successfully.

    :param year: year identifiers forwarded to ``downloadData.downloadYear``
    """
    # downloadYear yields (yearseason, form) pairs for each successful fetch.
    for (yearseason, form) in downloadData.downloadYear(*year):
        cleanData.cleanData(yearseason, form)
    print(
        'Execution complete. For details in cleaning data, please check invalidDataLog.txt and outlierLog.txt. '
    )
def get_data(self):
    """Import ECG data via the cleanData function and record summary stats.

    Finds the max and min voltages as well as the maximum time on the ECG
    strip, storing them on ``self.tuple_min_max_volts`` and
    ``self.max_time_strip``.

    :param self: instance carrying ``inputDataFrame`` (a pandas dataframe)
        and ``timeUnit`` — assumed set by the constructor; TODO confirm
    :raises ImportError: handled internally if the cleanData module is missing
    :raises AttributeError: handled internally if no dataframe was supplied
    """
    import logging
    # Log everything from DEBUG up into a dedicated log file.
    # (Was `str1 = logging.DEBUG` — renamed: it is a log level, not a str.)
    log_level = logging.DEBUG
    logging.basicConfig(filename="bme590hrmlogs.txt",
                        format='%(levelname)s %(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=log_level)
    try:
        from cleanData import cleanData
        try:
            voltageList, timeList = cleanData(self.inputDataFrame)
            volt_max = max(voltageList)
            volt_min = min(voltageList)
            self.tuple_min_max_volts = (volt_max, volt_min)
            self.max_time_strip = max(timeList)
            logging.debug("max volts: " + str(volt_max))
            logging.debug("min volts: " + str(volt_min))
            logging.debug("max time: " + str(self.max_time_strip) +
                          " " + str(self.timeUnit))
        except AttributeError:
            # No usable dataframe on self — report and continue.
            print("no dataframe input")
            logging.warning("no dataframe found")
    except ImportError:
        print("cleanData not found. Check file is in local dir")
        logging.warning("cleanData not found. Check file is in local dir")
def test_correctExcp():
    """Verify the expected exceptions are raised for a bad import and for
    passing a non-dataframe to cleanData."""
    from pickImportData import PickImportData
    from max_min_volts_time_dur import MaxMinVoltsTimeDur
    from cleanData import cleanData
    testClass = PickImportData()
    testClass.FilePath = "test_data1.csv"
    testClass.ImportFile()
    testClassMaxMin2 = MaxMinVoltsTimeDur(testClass.outPutArray)
    import pytest
    # FIX: pytest.raises(message=...) was removed in pytest 5.0; the context
    # manager alone already fails the test if no exception is raised.
    with pytest.raises(ImportError):
        import randomFunc
    with pytest.raises(AttributeError):
        voltageList, timeList = cleanData(2)
def test_cleanData():
    """cleanData must return 10000 finite float samples, with no timestamp
    equal to zero after the first entry."""
    import pandas as pd
    from cleanData import cleanData
    import math as mt
    from pickImportData import PickImportData

    importer = PickImportData()
    importer.FilePath = "test_data30.csv"
    importer.ImportFile()
    voltage, time = cleanData(importer.outPutArray)

    # Every sample must be a real (non-NaN) float in both columns.
    for idx, volt_val in enumerate(voltage):
        time_val = time[idx]
        assert isinstance(volt_val, float) is True
        assert isinstance(time_val, float) is True
        assert mt.isnan(volt_val) is False
        assert mt.isnan(time_val) is False

    # After the first sample no timestamp may be exactly zero.
    for idx in range(1, len(time)):
        assert time[idx] != 0

    assert len(voltage) == 10000
def main(): try: while True: try: # try to catch the invalid input and still run the program x = raw_input("Import data file? \nPlease enter Yes or No: \n") if x in ["Yes", "yes", "Y", "y"]: data = pd.read_csv("DOHMH_New_York_City_Restaurant_Inspection_Results.csv", low_memory=False) data = cl.cleanData(data).process_data() print "\nData is cleaned!" # Q4 part 1 print "Calculating...." print_total_trend(data) # Q4 part 2 result = boro_trend(data) for key in result.keys(): print "Summation of the trending identifiers in {} is {}".format(key, result[key]) # Q5 print "Generating plots...." for area in data["BORO"].unique(): generate_bar_plot(data, area, False) generate_bar_plot(data, area, True) print "Saved plots into figures dictory." break elif x in ["No", "no", "N", "n"]: sys.exit() elif x == "quit": sys.exit() else: raise KeyError("Error: Invalid Command!\n") except KeyError: print "\n Invalid input! \nPlease follow input instruction.\n" except IOError: print "\n ERROR: NO DATA\n" except KeyboardInterrupt, ValueError: print "\n Interrupted!"
def cleaningData(self):
    """Run the full cleanData pipeline: load the combined CSV, explore it,
    clean each column group, and write the cleaned result back to CSV."""
    cleaner = cleanData()
    cleaner.set_dir(path)
    cleaner.loadData("combined.csv")

    # Exploratory passes: missing values, uniqueness, price relationships.
    cleaner.identify_columns_with_missing_values()
    cleaner.uniqueValues()
    cleaner.corr_plots_price_lat_long()
    cleaner.data_price_yearbuilt()
    cleaner.data_price_age()

    # Column-by-column cleaning, then de-duplication.
    cleaner.clean_PropertyType()
    cleaner.keep_necessary_columns()
    cleaner.remove_duplicates()
    cleaner.beds_Baths()
    cleaner.clean_sqft()
    cleaner.clean_ZipCode()
    cleaner.clean_City()

    # Persist the cleaned dataset.
    cleaner.create_csv()
def test_correctExcp():
    """Verify MeanBPM / cleanData raise the expected exceptions on bad input:
    missing module, missing file, non-dataframe input, and out-of-range or
    invalid BPM interval arguments."""
    from pickImportData import PickImportData
    from meanBPM import MeanBPM
    import numpy as np
    import pandas as pd
    from cleanData import cleanData
    testClass = PickImportData()
    testClass.FilePath = "test_data1.csv"
    testClass.ImportFile()
    testClassBPM = MeanBPM(testClass.outPutArray)
    testClassBPM.get_mean_bpm()
    testClassBPM.convert_beat_times_list_to_np_array()
    import pytest
    # FIX: pytest.raises(message=...) was removed in pytest 5.0; the context
    # manager alone already fails the test if no exception is raised.
    with pytest.raises(ImportError):
        import randomFunc
    with pytest.raises(FileNotFoundError):
        standFilePath = "standardPatter.csv"
        sdf = pd.read_csv(standFilePath, header=None)
    with pytest.raises(AttributeError):
        voltageList, timeList = cleanData(2)
    with pytest.raises(IndexError):
        testClass2 = PickImportData()
        testClass2.FilePath = "test_data1.csv"
        testClass2.ImportFile()
        testClassBPM2 = MeanBPM(testClass.outPutArray, 200)
    with pytest.raises(ValueError):
        testClass3 = PickImportData()
        testClass3.FilePath = "test_data1.csv"
        testClass3.ImportFile()
        testClassBPM3 = MeanBPM(testClass.outPutArray, -8)
    # Expecting IndexError: too many heartbeats found over interval.
    with pytest.raises(IndexError):
        testClass4 = PickImportData()
        testClass4.FilePath = "test_data1.csv"
        testClass4.ImportFile()
        testClassBPM4 = MeanBPM(testClass4.outPutArray, 0.5)
# inputNamePath = "iris_data/irisNames.txt" inputNamePath = "abalone_data/abaloneNames.txt" # inputNamePath = "mock_data/mock.txt" if os.path.exists(inputNamePath): hasHeader = False nameFile = open(inputNamePath, "r+") nameFile2 = open(inputNamePath, "r+") # Get k for number of voters k = int(input("k = ")) # Get row name for classification attribute classCol = input("Name of column to be the class attribute: ") # Clean data trainingSet = cleanData(dataFile, nameFile) testSet = cleanData(testFile, nameFile2) # Preprocess datasets preprocessedTrainingSet = preprocess(trainingSet) preprocessedTestSet = preprocess(testSet) # Find nearest neighbor for each test row and compute error rate testCount = 0 errors = 0 totalCount = len(preprocessedTestSet) printProgressBar(0, totalCount, prefix = 'Progress:', suffix = 'Complete', length = 50) for row in preprocessedTestSet.iterrows(): errors += nearestNeighbor(preprocessedTrainingSet, row, classCol, k) testCount += 1 printProgressBar(testCount, totalCount, prefix = 'Progress:', suffix = 'Complete', length = 50)
import dash import dash_core_components as dcc import dash_html_components as html import plotly.graph_objs as go from cleanData import cleanData # GLOBALS: # TODO: Can this be done from the net? This is obviously a local path path_to_spreadsheet = '/Users/alexanderhiller/Downloads/Bednight_Report_2018_180702.xlsx' bednight_graphs = [] df_lz, df_na, df_liu, df_sl = cleanData(path_to_spreadsheet) # Bednight graphs and titles nosy_ankao_title = html.Div(html.H4(children='Nosy Ankao')) nosy_ankao_bn = html.Div( dcc.Graph( id='Nosy Ankao', figure={ 'data': [ go.Scatter( x=df_na.index, y=df_na[i], text='Nosy Ankao', mode='markers', opacity=0.7, marker={ 'size': 15, 'line': {'width': 0.5, 'color': 'white'} },
# Pull the melanoma cohort from the local database, clean it, and split it
# into train / test / cross-validation sets for model fitting.
sql_mela = sql.connect(dbname="melanoma")
query = """SELECT "AGE", "SEX", "DEPTH", "ULCERATION", "MITOSES",
"CS_EXTENSION", "PRIMARY_SITE", "CS_LYMPH_NODE_METS" FROM melanoma"""
#query for matching JCO data
#query = """SELECT "AGE", "SEX", "DEPTH", "ULCERATION", "MITOSES",
#           "CS_EXTENSION", "PRIMARY_SITE", "CS_LYMPH_NODE_METS"
#           FROM melanoma
#           WHERE ("DEPTH" <= 400 AND "DEPTH" >= 100)
#           OR (("DEPTH" < 100 AND "DEPTH" >= 75) AND
#           ("ULCERATION" >= 1 OR "MITOSES" >= 1 OR "AGE" < 40 )) OR
#           (("DEPTH" < 75 AND "DEPTH" >= 54) AND
#           (("MITOSES" >= 1 AND "ULCERATION" >= 1) OR ("ULCERATION" >= 1
#           AND "AGE" < 40) OR ("MITOSES" >= 1 AND "AGE" < 40)));"""
mela = pd.read_sql_query(query, sql_mela)
# cleanData presumably returns (features, labels) — TODO confirm.
X, y = cleanData(mela)
#split the data into training and test sets
# Stratified splits with a fixed seed: 20% held out for test, then 12.5% of
# the remaining training data (i.e. 10% overall) held out for CV.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=20)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.125, stratify=y, random_state=20)
#using functions so easily adjusted
# Numeric attribute columns fed to the downstream pipeline.
num_attribs = ['AGE', 'DEPTH', "CS_EXTENSION", "MITOSES"]
# encoding: UTF-8
"""
Immediately download data into the database and clean it; used for
manually-triggered data updates.
"""
from dataService import downloadAllMinuteBar
from cleanData import cleanData, repairData

if __name__ == '__main__':
    first_day = "20171201"
    last_day = None  # None means "through the latest available date"

    # Download all minute bars from the start date onward.
    downloadAllMinuteBar(start_date=first_day, end_date=last_day)

    # Clean the freshly downloaded data.
    cleanData(first_day, last_day)

    # Repair any remaining gaps after cleaning.
    repairData(first_day, last_day)
def test_test_restaurant_grades(self):
    """test_restaurant_grades must return 0 for both sample CAMIS ids after
    the inspection data has been cleaned."""
    raw = pd.read_csv(
        'DOHMH_New_York_City_Restaurant_Inspection_Results.csv',
        low_memory=False)
    self.data = cl.cleanData(raw).process_data()
    self.assertEqual(0, test_restaurant_grades(self.data, 50042805))
    self.assertEqual(0, test_restaurant_grades(self.data, 30112340))