def processingData(*year):
    """Download the requested years of data, then clean every file that
    downloaded successfully.

    :param year: year identifiers forwarded to ``downloadData.downloadYear``
    """
    # downloadYear yields (yearseason, form) pairs for each successful fetch.
    for (yearseason, form) in downloadData.downloadYear(*year):
        cleanData.cleanData(yearseason, form)
    print(
        'Execution complete. For details in cleaning data, please check invalidDataLog.txt and outlierLog.txt. '
    )
def get_data(self):
    """Import ECG data via the cleanData function and record summary stats.

    Finds the max and min voltages as well as the maximum time on the ECG
    strip, storing them on ``self.tuple_min_max_volts`` and
    ``self.max_time_strip``.

    :param self: instance carrying ``inputDataFrame`` (a pandas dataframe)
        and ``timeUnit`` — assumed set by the constructor; TODO confirm
    :raises ImportError: handled internally if the cleanData module is missing
    :raises AttributeError: handled internally if no dataframe was supplied
    """
    import logging
    # Log everything from DEBUG up into a dedicated log file.
    # (Was `str1 = logging.DEBUG` — renamed: it is a log level, not a str.)
    log_level = logging.DEBUG
    logging.basicConfig(filename="bme590hrmlogs.txt",
                        format='%(levelname)s %(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=log_level)
    try:
        from cleanData import cleanData
        try:
            voltageList, timeList = cleanData(self.inputDataFrame)
            volt_max = max(voltageList)
            volt_min = min(voltageList)
            self.tuple_min_max_volts = (volt_max, volt_min)
            self.max_time_strip = max(timeList)
            logging.debug("max volts: " + str(volt_max))
            logging.debug("min volts: " + str(volt_min))
            logging.debug("max time: " + str(self.max_time_strip) +
                          " " + str(self.timeUnit))
        except AttributeError:
            # No usable dataframe on self — report and continue.
            print("no dataframe input")
            logging.warning("no dataframe found")
    except ImportError:
        print("cleanData not found. Check file is in local dir")
        logging.warning("cleanData not found. Check file is in local dir")
def test_correctExcp():
    """Verify the expected exceptions are raised for a bad import and for
    passing a non-dataframe to cleanData."""
    from pickImportData import PickImportData
    from max_min_volts_time_dur import MaxMinVoltsTimeDur
    from cleanData import cleanData
    testClass = PickImportData()
    testClass.FilePath = "test_data1.csv"
    testClass.ImportFile()
    testClassMaxMin2 = MaxMinVoltsTimeDur(testClass.outPutArray)
    import pytest
    # FIX: pytest.raises(message=...) was removed in pytest 5.0; the context
    # manager alone already fails the test if no exception is raised.
    with pytest.raises(ImportError):
        import randomFunc
    with pytest.raises(AttributeError):
        voltageList, timeList = cleanData(2)
def test_cleanData():
    """cleanData must return 10000 finite float samples, with no timestamp
    equal to zero after the first entry."""
    import pandas as pd
    from cleanData import cleanData
    import math as mt
    from pickImportData import PickImportData

    importer = PickImportData()
    importer.FilePath = "test_data30.csv"
    importer.ImportFile()
    voltage, time = cleanData(importer.outPutArray)

    # Every sample must be a real (non-NaN) float in both columns.
    for idx, volt_val in enumerate(voltage):
        time_val = time[idx]
        assert isinstance(volt_val, float) is True
        assert isinstance(time_val, float) is True
        assert mt.isnan(volt_val) is False
        assert mt.isnan(time_val) is False

    # After the first sample no timestamp may be exactly zero.
    for idx in range(1, len(time)):
        assert time[idx] != 0

    assert len(voltage) == 10000
def main(): try: while True: try: # try to catch the invalid input and still run the program x = raw_input("Import data file? \nPlease enter Yes or No: \n") if x in ["Yes", "yes", "Y", "y"]: data = pd.read_csv("DOHMH_New_York_City_Restaurant_Inspection_Results.csv", low_memory=False) data = cl.cleanData(data).process_data() print "\nData is cleaned!" # Q4 part 1 print "Calculating...." print_total_trend(data) # Q4 part 2 result = boro_trend(data) for key in result.keys(): print "Summation of the trending identifiers in {} is {}".format(key, result[key]) # Q5 print "Generating plots...." for area in data["BORO"].unique(): generate_bar_plot(data, area, False) generate_bar_plot(data, area, True) print "Saved plots into figures dictory." break elif x in ["No", "no", "N", "n"]: sys.exit() elif x == "quit": sys.exit() else: raise KeyError("Error: Invalid Command!\n") except KeyError: print "\n Invalid input! \nPlease follow input instruction.\n" except IOError: print "\n ERROR: NO DATA\n" except KeyboardInterrupt, ValueError: print "\n Interrupted!"
def cleaningData(self):
    """Run the full cleanData pipeline: load the combined CSV, explore it,
    clean each column group, and write the cleaned result back to CSV."""
    cleaner = cleanData()
    cleaner.set_dir(path)
    cleaner.loadData("combined.csv")

    # Exploratory passes: missing values, uniqueness, price relationships.
    cleaner.identify_columns_with_missing_values()
    cleaner.uniqueValues()
    cleaner.corr_plots_price_lat_long()
    cleaner.data_price_yearbuilt()
    cleaner.data_price_age()

    # Column-by-column cleaning, then de-duplication.
    cleaner.clean_PropertyType()
    cleaner.keep_necessary_columns()
    cleaner.remove_duplicates()
    cleaner.beds_Baths()
    cleaner.clean_sqft()
    cleaner.clean_ZipCode()
    cleaner.clean_City()

    # Persist the cleaned dataset.
    cleaner.create_csv()
def test_correctExcp():
    """Verify MeanBPM / cleanData raise the expected exceptions on bad input:
    missing module, missing file, non-dataframe input, and out-of-range or
    invalid BPM interval arguments."""
    from pickImportData import PickImportData
    from meanBPM import MeanBPM
    import numpy as np
    import pandas as pd
    from cleanData import cleanData
    testClass = PickImportData()
    testClass.FilePath = "test_data1.csv"
    testClass.ImportFile()
    testClassBPM = MeanBPM(testClass.outPutArray)
    testClassBPM.get_mean_bpm()
    testClassBPM.convert_beat_times_list_to_np_array()
    import pytest
    # FIX: pytest.raises(message=...) was removed in pytest 5.0; the context
    # manager alone already fails the test if no exception is raised.
    with pytest.raises(ImportError):
        import randomFunc
    with pytest.raises(FileNotFoundError):
        standFilePath = "standardPatter.csv"
        sdf = pd.read_csv(standFilePath, header=None)
    with pytest.raises(AttributeError):
        voltageList, timeList = cleanData(2)
    with pytest.raises(IndexError):
        testClass2 = PickImportData()
        testClass2.FilePath = "test_data1.csv"
        testClass2.ImportFile()
        testClassBPM2 = MeanBPM(testClass.outPutArray, 200)
    with pytest.raises(ValueError):
        testClass3 = PickImportData()
        testClass3.FilePath = "test_data1.csv"
        testClass3.ImportFile()
        testClassBPM3 = MeanBPM(testClass.outPutArray, -8)
    # Expecting IndexError: too many heartbeats found over interval.
    with pytest.raises(IndexError):
        testClass4 = PickImportData()
        testClass4.FilePath = "test_data1.csv"
        testClass4.ImportFile()
        testClassBPM4 = MeanBPM(testClass4.outPutArray, 0.5)
# inputNamePath = "iris_data/irisNames.txt" inputNamePath = "abalone_data/abaloneNames.txt" # inputNamePath = "mock_data/mock.txt" if os.path.exists(inputNamePath): hasHeader = False nameFile = open(inputNamePath, "r+") nameFile2 = open(inputNamePath, "r+") # Get k for number of voters k = int(input("k = ")) # Get row name for classification attribute classCol = input("Name of column to be the class attribute: ") # Clean data trainingSet = cleanData(dataFile, nameFile) testSet = cleanData(testFile, nameFile2) # Preprocess datasets preprocessedTrainingSet = preprocess(trainingSet) preprocessedTestSet = preprocess(testSet) # Find nearest neighbor for each test row and compute error rate testCount = 0 errors = 0 totalCount = len(preprocessedTestSet) printProgressBar(0, totalCount, prefix = 'Progress:', suffix = 'Complete', length = 50) for row in preprocessedTestSet.iterrows(): errors += nearestNeighbor(preprocessedTrainingSet, row, classCol, k) testCount += 1 printProgressBar(testCount, totalCount, prefix = 'Progress:', suffix = 'Complete', length = 50)
import dash import dash_core_components as dcc import dash_html_components as html import plotly.graph_objs as go from cleanData import cleanData # GLOBALS: # TODO: Can this be done from the net? This is obviously a local path path_to_spreadsheet = '/Users/alexanderhiller/Downloads/Bednight_Report_2018_180702.xlsx' bednight_graphs = [] df_lz, df_na, df_liu, df_sl = cleanData(path_to_spreadsheet) # Bednight graphs and titles nosy_ankao_title = html.Div(html.H4(children='Nosy Ankao')) nosy_ankao_bn = html.Div( dcc.Graph( id='Nosy Ankao', figure={ 'data': [ go.Scatter( x=df_na.index, y=df_na[i], text='Nosy Ankao', mode='markers', opacity=0.7, marker={ 'size': 15, 'line': {'width': 0.5, 'color': 'white'} },
# Pull the melanoma cohort from the local database, clean it, and split it
# into train / test / cross-validation sets for model fitting.
sql_mela = sql.connect(dbname="melanoma")
query = """SELECT "AGE", "SEX", "DEPTH", "ULCERATION", "MITOSES",
"CS_EXTENSION", "PRIMARY_SITE", "CS_LYMPH_NODE_METS" FROM melanoma"""
#query for matching JCO data
#query = """SELECT "AGE", "SEX", "DEPTH", "ULCERATION", "MITOSES",
#           "CS_EXTENSION", "PRIMARY_SITE", "CS_LYMPH_NODE_METS"
#           FROM melanoma
#           WHERE ("DEPTH" <= 400 AND "DEPTH" >= 100)
#           OR (("DEPTH" < 100 AND "DEPTH" >= 75) AND
#           ("ULCERATION" >= 1 OR "MITOSES" >= 1 OR "AGE" < 40 )) OR
#           (("DEPTH" < 75 AND "DEPTH" >= 54) AND
#           (("MITOSES" >= 1 AND "ULCERATION" >= 1) OR ("ULCERATION" >= 1
#           AND "AGE" < 40) OR ("MITOSES" >= 1 AND "AGE" < 40)));"""
mela = pd.read_sql_query(query, sql_mela)
# cleanData presumably returns (features, labels) — TODO confirm.
X, y = cleanData(mela)
#split the data into training and test sets
# Stratified splits with a fixed seed: 20% held out for test, then 12.5% of
# the remaining training data (i.e. 10% overall) held out for CV.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=20)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.125, stratify=y, random_state=20)
#using functions so easily adjusted
# Numeric attribute columns fed to the downstream pipeline.
num_attribs = ['AGE', 'DEPTH', "CS_EXTENSION", "MITOSES"]
# encoding: UTF-8
"""
Immediately download data into the database and clean it; used for
manually-triggered data updates.
"""
from dataService import downloadAllMinuteBar
from cleanData import cleanData, repairData

if __name__ == '__main__':
    first_day = "20171201"
    last_day = None  # None means "through the latest available date"

    # Download all minute bars from the start date onward.
    downloadAllMinuteBar(start_date=first_day, end_date=last_day)

    # Clean the freshly downloaded data.
    cleanData(first_day, last_day)

    # Repair any remaining gaps after cleaning.
    repairData(first_day, last_day)
def test_test_restaurant_grades(self):
    """test_restaurant_grades must return 0 for both sample CAMIS ids after
    the inspection data has been cleaned."""
    raw = pd.read_csv(
        'DOHMH_New_York_City_Restaurant_Inspection_Results.csv',
        low_memory=False)
    self.data = cl.cleanData(raw).process_data()
    self.assertEqual(0, test_restaurant_grades(self.data, 50042805))
    self.assertEqual(0, test_restaurant_grades(self.data, 30112340))