def rwCSV(filesAggregated: list, fileOutput: str, needColumns: list, Default: list, pivot='datetime') -> list:
    """Copy the requested columns from each aggregated CSV into a new CSV.

    For every file in ``filesAggregated``, the columns named in
    ``needColumns`` plus the trailing ``Default`` columns are extracted and
    written to ``<fileOutput>_<second path segment of the input>``.

    Returns:
        list: the raw (unresolved) output file paths.
    """
    written = []
    raw_output = fileOutput
    resolved_output = commonUtil.getPath(fileOutput)
    for source in filesAggregated:
        resolved_input = commonUtil.getPath(source)
        frame = fileUtil.readFromFileToData(fileInput=resolved_input)
        # Default columns go in the last columns.
        columns = commonUtil.d(needColumns, Default)
        for name in needColumns + Default:
            columns[name] = frame[name]
        # NOTE(review): index 1 assumes the raw path has at least two
        # os.sep-separated segments — confirm against callers.  # Lost Order
        suffix = source.split(os.sep)[1]
        fileUtil.saveDataToCSV(resolved_output + f"_{suffix}", columns)
        written.append(raw_output + f"_{suffix}")
    return written
def addLabelToCSV(srcFile: str, labelFile: str, row='a'):
    """Merge the 'label' column of labelFile into srcFile, then delete labelFile.

    Args:
        srcFile: CSV receiving the new 'label' column (overwritten in place).
        labelFile: CSV providing the 'label' column; removed afterwards.
        row: row-limit flag forwarded to readFromFileToData ('a' = all rows).
    """
    import os
    src_path = commonUtil.getPath(srcFile)
    label_path = commonUtil.getPath(labelFile)
    src_data = readFromFileToData(src_path, row)
    label_data = readFromFileToData(label_path, row)
    src_data['label'] = label_data['label']
    saveDataToCSV(src_path, src_data)
    os.remove(label_path)
def readFromFileToList(fileInput: str) -> list:
    """Read a text file and return its lines as a list of strings.

    Args:
        fileInput: file path, resolved through commonUtil.getPath.

    Returns:
        list[str]: the file's lines without trailing newline characters.

    Raises:
        OSError: propagated if the file cannot be opened or read.
    """
    fileInput = commonUtil.getPath(fileInput)
    # 'with' guarantees the handle is closed on every path — the original
    # never closed it.  The old try/`raise (e)` was a no-op and is removed;
    # the same exceptions still propagate to the caller.
    with open(fileInput, "r") as f:
        return f.read().splitlines()
def saveDataToCSV(fileOutput: str, data: dict):
    """Write a dict of columns to a comma-separated CSV file (no index column).

    Args:
        fileOutput: output path, resolved through commonUtil.getPath.
        data: mapping of column name -> column values, as accepted by
            pandas.DataFrame.

    Raises:
        OSError / ValueError: propagated from pandas / the filesystem.
    """
    import pandas as pd
    fileOutput = commonUtil.getPath(fileOutput)
    dataframe = pd.DataFrame(data)
    # The original try/`raise (e)` re-raised unchanged — a no-op — so errors
    # are simply allowed to propagate.
    dataframe.to_csv(fileOutput, index=False, sep=',')
def getTimeLabelColumns(srcFile: str, row='a'):
    """Return selected column names (currently only the last one) of srcFile.

    NOTE(review): ``names[:0]`` is always the empty list, so only the final
    column name is ever returned.  The sibling retainTimeColumnsInCSV uses
    ``cols[:2]`` for its leading slice — confirm whether this was meant to
    be non-empty.
    """
    path = commonUtil.getPath(srcFile)
    frame = readFromFileToData(path, row)
    names = frame.columns[:].tolist()
    leading = names[:0]
    trailing = names[-1:]
    return leading + trailing
def loadYAML(fileInput: str) -> dict:
    """Load a YAML file and return its parsed content.

    Args:
        fileInput: path to the YAML file, resolved through commonUtil.getPath.

    Returns:
        dict: the parsed document (safe_load may also yield None for an
        empty file — callers should be prepared for that).

    Raises:
        OSError / yaml.YAMLError: propagated from open / parsing.
    """
    yamlPath = commonUtil.getPath(fileInput)
    # 'with' closes the handle on every path — the original leaked it.
    # safe_load (already used here) cannot execute arbitrary code from
    # untrusted input, unlike yaml.load.
    with open(yamlPath, 'r', encoding='utf-8') as f:
        cfg = f.read()
    # Convert the raw text to a Python dict.
    return yaml.safe_load(cfg)
def __init__(self, fileOutput, databasesetting):
    """Resolve the output path and pull connection info from the YAML settings.

    Args:
        fileOutput: output file path, resolved through commonUtil.getPath.
        databasesetting: settings source passed to LoadYAML.rDatabaseSetting.
    """
    self.fileOutput = commonUtil.getPath(fileOutput)
    info = LoadYAML.rDatabaseSetting(databasesetting)
    self.connectionInfo = info
    # Second-to-last entry is the database name; last entry's first element
    # is the table name (per rDatabaseSetting's layout).
    self.DATABASE_NAME = info[-2]
    self.TABLE_NAME = info[-1][0]
    # Sql head order will be changed later, so both start empty.
    self.head = []
    self.headString = ''
def retainTimeColumnsInCSV(srcFile: str, row='a'):
    """Read srcFile and return it with the first two and the last column dropped.

    NOTE(review): despite the name, the time-ish edge columns are the ones
    *removed*; the middle columns are what is returned (used as model input
    by prediction_flow).
    """
    path = commonUtil.getPath(srcFile)
    frame = readFromFileToData(path, row)
    names = frame.columns[:].tolist()
    doomed = names[:2] + names[-1:]
    frame.drop(doomed, axis=1, inplace=True)
    return frame
def deleteAllFolder(folderName: str):
    """Delete every file and sub-directory inside folderName.

    The folder itself is kept; only its contents are removed.
    """
    import os
    import shutil
    root = commonUtil.getPath(folderName)
    for entry in os.listdir(root):
        target = os.path.join(root, entry)
        if os.path.isfile(target):
            os.remove(target)
        elif os.path.isdir(target):
            shutil.rmtree(target)
def addLinesToCSV(fileInput: str, lines: list):
    """Append rows to an existing CSV file.

    Args:
        fileInput: target CSV path, resolved through commonUtil.getPath.
        lines: iterable of rows, each an iterable of cell values, as
            accepted by csv.writer.writerows.

    Raises:
        OSError / csv.Error: propagated from open / write.
    """
    import csv
    fileInput = commonUtil.getPath(fileInput)
    # 'with' closes the handle even if writerows raises — the original
    # leaked the file object on the error path.  The old try/`raise (e)`
    # was a no-op and is removed; the same exceptions still propagate.
    with open(fileInput, 'a', newline='') as out_f:
        csv.writer(out_f).writerows(lines)
def prediction_flow(csv_filepath, prediction_filepath, model_filepath, labels, YESTERDAY):
    """Run the daily prediction pipeline.

    Reads the aggregated CSV, strips the columns the model must not see
    (via fileUtil.retainTimeColumnsInCSV), predicts one label per row, and
    writes per-label frequency counts for YESTERDAY to prediction_filepath.

    Args:
        csv_filepath: filepath of the aggregated input CSV.
        prediction_filepath: output CSV with columns Date, Label, Frequency.
        model_filepath: filepath of the joblib model file.
        labels: list of label strings; order matters.
        YESTERDAY: date value written into the Date column of every row.
    """
    # Load our pre-trained model.
    model_path = commonUtil.getPath(model_filepath)
    clf = load(model_path)

    # Read the data (header included) and drop the non-model columns.
    input_path = commonUtil.getPath(csv_filepath)
    model_input = fileUtil.retainTimeColumnsInCSV(input_path)

    # One predicted label per input row.
    predictions = [make_predictions(clf, row, labels) for row in model_input.values]

    # Write the header row first, then append one frequency row per label.
    out_path = commonUtil.getPath(prediction_filepath)
    header = commonUtil.d(['Date', 'Label', 'Frequency'])
    fileUtil.saveDataToCSV(out_path, header)
    rows = [[YESTERDAY, label, predictions.count(label)] for label in labels]
    fileUtil.addLinesToCSV(out_path, rows)
def saveListToCSV(fileOutput: str, srcList: list):
    """Write each list item on its own line, with no trailing newline.

    Args:
        fileOutput: output path, resolved through commonUtil.getPath.
        srcList: items to write; each is converted with str().

    Raises:
        OSError: propagated from open / write.
    """
    fileOutput = commonUtil.getPath(fileOutput)
    # 'with' closes the handle even on error — the original leaked the file
    # object if a write raised.  '\n'.join reproduces the original output
    # exactly (newline between items, none after the last, empty file for
    # an empty list) without per-item index bookkeeping.
    with open(fileOutput, "w") as f:
        f.write('\n'.join(str(item) for item in srcList))
def rwCSV(filesAggregated: list, fileOutput: str, UserInputColumns: list, APPENDIX=APPENDIX, normalizeMethod=zscore) -> list:
    """Normalize the user columns of each aggregated CSV and save the result.

    Each input file is written to ``<fileOutput>_<second path segment>``.
    The APPENDIX columns (usually 'Minute', 'Hour', 'Day', 'Month', 'Year')
    are copied through unchanged.

    Returns:
        list: the raw (unresolved) output file paths.
    """
    written = []
    raw_output = fileOutput
    resolved_output = commonUtil.getPath(fileOutput)
    for source in filesAggregated:
        frame = fileUtil.readFromFileToData(commonUtil.getPath(source))
        columns = commonUtil.d(UserInputColumns)
        for name in UserInputColumns:
            columns[name] = normalizeMethod(frame, name)
        # The last columns are always APPENDIX, which is usually
        # 'Minute', 'Hour', 'Day', 'Month', 'Year'.
        for name in APPENDIX:
            columns[name] = frame[name]
        # NaN filtering.
        # NOTE(review): only the FIRST value of each column is NaN-checked,
        # and a NaN column is replaced by the scalar 0 rather than a column
        # of zeros — confirm this is the intended behavior.
        cleaned = {}
        for key, series in columns.items():
            cleaned[key] = series if not np.isnan(series[0]) else 0
        suffix = source.split(os.sep)[1]
        fileUtil.saveDataToCSV(resolved_output + f"_{suffix}", cleaned)
        written.append(raw_output + f"_{suffix}")
    return written
def processCSV(data, fileOutput, UserInputColumns, APPENDIX, timeFormat, timeInterval: int):
    """Digest `data` into per-day slot-coverage rows appended to fileOutput.

    Walks the rows in order, marking which `timeInterval`-minute slot of the
    current day each row falls into; whenever a new day starts (timestamp
    gap >= 86400 s) the accumulated day is flushed via writeToDigestCSV, and
    the trailing partial day is flushed at the end.

    NOTE(review): the starting timestamp is parsed with '%d-%m-%Y' but each
    row's timestamp with `timeFormat` — confirm the two stay consistent.
    """
    # First row fixes the starting day; minute/hour are ignored here.
    startMinute, startHour, startDay, startMonth, startYear = combineAppendixItem(
        data, APPENDIX, rowIndex=0)
    tString = f'{startDay}-{startMonth}-{startYear}'
    startTimeStamp = time.mktime(time.strptime(tString, '%d-%m-%Y'))
    prevStartRow = 0
    # One boolean per timeInterval-minute slot of a 24h day.
    slots = int(24 * 60 / timeInterval)
    dataPlots = [False for i in range(slots)]
    curTimeStamp = startTimeStamp
    #tmpName = raw_name.split(os.sep)[1]
    location = commonUtil.getPath(fileOutput)
    out_f = open(location, 'a', newline='')
    writer = csv.writer(out_f)
    for idx in range(len(data)):
        curMinute, curHour, curDay, curMonth, curYear = combineAppendixItem(
            data, APPENDIX, rowIndex=idx)
        curTime = f'{curMinute}-{curHour}-{curDay}-{curMonth}-{curYear}'
        curTimeStamp = time.mktime(time.strptime(curTime, timeFormat))
        if curTimeStamp - startTimeStamp < 86400:
            # Same day: mark this row's slot as covered.
            dataPlots[int((curHour * 60 + curMinute) / timeInterval)] = True
        else:
            # Day boundary crossed: flush the finished day, then start a
            # fresh slot map seeded with the current row.
            startTimeStamp = curTimeStamp
            writeToDigestCSV(data, UserInputColumns, APPENDIX, prevStartRow,
                             dataPlots, writer, timeInterval)
            dataPlots = [False for i in range(slots)]
            dataPlots[int((curHour * 60 + curMinute) / timeInterval)] = True
            prevStartRow = idx
    # Flush the trailing (possibly partial) day.
    writeToDigestCSV(data, UserInputColumns, APPENDIX, prevStartRow, dataPlots,
                     writer, timeInterval)
    out_f.close()
def readFromFileToData(fileInput: str, row='a'):
    """Read a CSV file into a pandas DataFrame.

    Args:
        fileInput: path, resolved through commonUtil.getPath.
        row: 'a' to read every row; otherwise an int-like row count passed
            to pandas as nrows.

    Returns:
        The DataFrame, or None (implicitly) when the file has no data rows.
    """
    import pandas as pd
    path = commonUtil.getPath(fileInput)
    if row == 'a':
        frame = pd.read_csv(path, encoding='utf-8', delimiter=',')  # nrows=1000
    else:
        frame = pd.read_csv(path, encoding='utf-8', delimiter=',', nrows=int(row))
    # Empty frames fall through and yield None, matching the original contract.
    if len(frame) >= 1:
        return frame
def rwCSV(fileInput: str, fileOutput: str, timeIntervalList: list,
          timeFormat=timeFormat, pivot='datetime', appendColumns=appendColumns) -> list:
    """Aggregate a raw CSV over each requested time interval.

    For every interval in timeIntervalList (default ['1hour']) the non-pivot,
    non-'id' columns are averaged per interval window and written to
    ``<fileOutput>_<interval>.csv`` together with the time appendix columns.

    Returns:
        list: the raw (unresolved) output file paths.
    """
    if len(timeIntervalList) == 0:
        timeIntervalList = ['1hour']
    raw_name = fileOutput
    path = commonUtil.getPath(fileInput)
    fileOutput = commonUtil.getPath(fileOutput)
    data = fileUtil.readFromFileToData(path)
    data_len = len(data)
    # No order
    allColumns = data.columns.tolist()
    time_col = data[pivot]
    len_timeIntervals_sorted, timeIntervals_sorted = commonUtil.getIntervalList(timeIntervalList)
    # Keep every column except the pivot and any whose name contains 'id'.
    selecColumns = []
    for col in allColumns:
        if col != pivot and (re.findall('id', col, flags=re.IGNORECASE) == []):
            selecColumns.append(col)
    # One accumulator dict per interval, keyed by interval index.
    vals = commonUtil.d(timeIntervals_sorted)
    for i in range(len_timeIntervals_sorted):
        vals[i] = commonUtil.d(selecColumns, appendColumns)

    # Not recommend to write inline function like this, but in order to get high speed
    def timeAggregate(startHourTimeStamp, startLine, endLine, interval: int):
        # Flush one closed window [startLine, endLine] into vals[interval].
        localTime = time.localtime(startHourTimeStamp[interval])
        timeSpan = endLine - startLine + 1
        # If >= 75% of the window's values are zero, the window emits 0.
        alertLevel = int(timeSpan * 0.75)
        print(f"aggregating")
        # (1)AVERAGE VALUE CAL
        # (2)75 percent is 0, then 0
        for attr in selecColumns:
            # scan from startLine to endLine
            vList = data[attr][startLine:endLine + 1]
            counter = vList.tolist().count(0)
            if counter < alertLevel:
                value = sum(vList / timeSpan)
                vals[interval][attr].append(value)
            else:
                vals[interval][attr].append(0)
        # HARDCODE
        # TODO Currently no need to modify, not sure in the future version
        vals[interval]['Minute'].append(localTime.tm_min)
        vals[interval]['Hour'].append(localTime.tm_hour)
        vals[interval]['Day'].append(localTime.tm_mday)
        vals[interval]['Month'].append(localTime.tm_mon)
        vals[interval]['Year'].append(localTime.tm_year)

    # Per-interval window bookkeeping; all intervals share one pass over the rows.
    startLine = [0 for i in range(len_timeIntervals_sorted)]
    lastStartLine = [0 for i in range(len_timeIntervals_sorted)]
    lastEndLine = [0 for i in range(len_timeIntervals_sorted)]
    data_raw_time = time_col[0]
    lastHourTimeStamp = [time.mktime(time.strptime(data_raw_time, timeFormat)) for i in range(len_timeIntervals_sorted)]
    currentHour = 0
    for lineIdx in range(data_len):
        data_raw_time = time_col[lineIdx]
        currentHour = time.mktime(time.strptime(data_raw_time, timeFormat))
        # Layer combination
        # Start a new time set
        for idx, timeInterval in enumerate(timeIntervals_sorted):
            # assumes timeIntervals_sorted entries are minutes — TODO confirm
            if currentHour - lastHourTimeStamp[idx] >= timeInterval * 60:
                startLine[idx] = lineIdx
                lastEndLine[idx] = lineIdx - 1
                timeAggregate(lastHourTimeStamp, lastStartLine[idx], lastEndLine[idx], idx)
                lastHourTimeStamp[idx] = currentHour
                lastStartLine[idx] = startLine[idx]
    # NOTE(review): `idx` leaks out of the enumerate loop above, so only the
    # LAST interval's still-open window is flushed here — confirm whether
    # every interval should get a final timeAggregate call.
    timeAggregate(lastHourTimeStamp, lastStartLine[idx], data_len - 1, idx)
    files = []
    for idx, timeInterval in enumerate(timeIntervals_sorted):
        fileUtil.saveDataToCSV(fileOutput + f"_{timeInterval}.csv", vals[idx])
        files.append(raw_name + f"_{timeInterval}.csv")
    return files