import os
import re
import time

import numpy as np
from joblib import load

# NOTE: commonUtil, fileUtil, make_predictions, zscore, APPENDIX, timeFormat,
# and appendColumns are project-local helpers assumed to be in scope here.


def createNewCSV(fileOutput, timeInterval: int):
    # Build the header row: Date, Attr, one column per time slot, then label.
    heads = ['Date', 'Attr']
    counter = int(24 * 60 / timeInterval)
    if timeInterval < 60:
        # Minute style: e.g. "0:00", "0:30", "1:00", ... for a 30-minute interval
        for i in range(counter):
            tmpM = (i * timeInterval) % 60
            tmpH = (i * timeInterval) // 60
            if tmpM == 0:
                heads.append(f"{tmpH}:00")
            else:
                heads.append(f"{tmpH}:{tmpM}")
    else:
        # Hour style: label each slot by its starting hour
        # (i * timeInterval // 60 also handles multi-hour intervals correctly)
        for i in range(counter):
            heads.append(f"{i * timeInterval // 60}:00")
    heads.append('label')
    # commonUtil.d presumably maps each header to an empty column list,
    # so this writes a header-only CSV.
    vals = commonUtil.d(heads)
    fileUtil.saveDataToCSV(fileOutput, vals)
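# Hedged usage sketch for createNewCSV (the output path is hypothetical):
# a 30-minute interval produces 48 time columns, "0:00" through "23:30".
#
#   createNewCSV('output/template_30min.csv', 30)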
def rwCSV(filesAggregated: list, fileOutput: str, needColumns: list, Default: list, pivot='datetime') -> list:
    # Copy the requested columns from each aggregated file into a new
    # per-file CSV and return the list of output paths.
    files = []
    raw_output = fileOutput
    fileOutput = commonUtil.getPath(fileOutput)
    for f in filesAggregated:
        raw_name = f
        fileInput = commonUtil.getPath(f)
        data = fileUtil.readFromFileToData(fileInput=fileInput)
        # The Default columns always come last
        vals = commonUtil.d(needColumns, Default)
        for attr in needColumns + Default:
            vals[attr] = data[attr]
        # Use the second path component (the file name) as a suffix
        tmpName = raw_name.split(os.sep)[1]
        # Ordering is not guaranteed here ("Lost Order")
        fileUtil.saveDataToCSV(fileOutput + f"_{tmpName}", vals)
        files.append(raw_output + f"_{tmpName}")
    return files
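# Hedged usage sketch for the column-selecting rwCSV above (paths and
# column names are hypothetical):
#
#   outputs = rwCSV(
#       filesAggregated=['data/site_a.csv', 'data/site_b.csv'],
#       fileOutput='output/selected',
#       needColumns=['temperature', 'humidity'],
#       Default=['Minute', 'Hour', 'Day', 'Month', 'Year'],
#   )
#   # -> ['output/selected_site_a.csv', 'output/selected_site_b.csv']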
def prediction_flow(csv_filepath, prediction_filepath, model_filepath, labels, YESTERDAY):
    """
    Prediction flow: reads the aggregated file, keeps only the time columns
    the ML model accepts, predicts a label for every row, and writes
    per-label frequency counts to a CSV.

    args:
        csv_filepath        str   Filepath of the aggregated CSV
        prediction_filepath str   Filepath of the output prediction CSV
        model_filepath      str   Filepath of the joblib model file
        labels              list  Label strings; order matters
        YESTERDAY           str   Date stamped on each output row
    """
    # Collect one predicted label per input row
    predictions = []

    # Load our pre-trained model
    model_filepath = commonUtil.getPath(model_filepath)
    clf = load(model_filepath)

    # Read the aggregated data (header included), keeping only the time columns
    csv_filepath = commonUtil.getPath(csv_filepath)
    df_model_input = fileUtil.retainTimeColumnsInCSV(csv_filepath)

    for row in df_model_input.values:
        prediction = make_predictions(clf, row, labels)
        predictions.append(prediction)

    # Write the header, then one (Date, Label, Frequency) row per label
    prediction_filepath = commonUtil.getPath(prediction_filepath)
    vals = commonUtil.d(['Date', 'Label', 'Frequency'])
    fileUtil.saveDataToCSV(prediction_filepath, vals)
    res = []
    for label in labels:
        res.append([YESTERDAY, label, predictions.count(label)])
    fileUtil.addLinesToCSV(prediction_filepath, res)
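# Hedged usage sketch for prediction_flow (paths, labels, and the date are
# hypothetical):
#
#   prediction_flow(
#       csv_filepath='output/aggregated_60.csv',
#       prediction_filepath='output/predictions.csv',
#       model_filepath='models/classifier.joblib',
#       labels=['normal', 'anomaly'],
#       YESTERDAY='2023-01-01',
#   )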
def rwCSV(filesAggregated: list, fileOutput: str, UserInputColumns: list, APPENDIX=APPENDIX, normalizeMethod=zscore) -> list:
    # Normalize the user-selected columns of each aggregated file and append
    # the raw APPENDIX time columns.
    files = []
    raw_output = fileOutput
    fileOutput = commonUtil.getPath(fileOutput)
    for f in filesAggregated:
        raw_name = f
        fileInput = commonUtil.getPath(f)
        data = fileUtil.readFromFileToData(fileInput)
        vals = commonUtil.d(UserInputColumns)
        for attr in UserInputColumns:
            vals[attr] = normalizeMethod(data, attr)
        # The last columns are always APPENDIX, usually
        # 'Minute', 'Hour', 'Day', 'Month', 'Year'
        for attr in APPENDIX:
            vals[attr] = data[attr]
        # Replace NaN columns (e.g. zero-variance input under z-score) with
        # zero-filled columns of the same length
        newVals = {}
        for key in vals.keys():
            if not np.isnan(vals[key][0]):
                newVals[key] = vals[key]
            else:
                newVals[key] = [0] * len(vals[key])
        tmpName = raw_name.split(os.sep)[1]
        fileUtil.saveDataToCSV(fileOutput + f"_{tmpName}", newVals)
        files.append(raw_output + f"_{tmpName}")
    return files
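# The default normalizeMethod is a project-local `zscore`; a minimal
# compatible sketch (assuming `data` is a pandas DataFrame) might be:
#
#   def zscore(data, attr):
#       col = data[attr]
#       return (col - col.mean()) / col.std()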
def createNewCSV(self):
    self.initConnection()
    vals = commonUtil.d(self.head)
    fileUtil.saveDataToCSV(self.fileOutput, vals)
def rwCSV(fileInput: str, fileOutput: str, timeIntervalList: list, timeFormat=timeFormat, pivot='datetime', appendColumns=appendColumns) -> list:
    # Aggregate the raw time series into one CSV per requested interval.
    if len(timeIntervalList) == 0:
        timeIntervalList = ['1hour']
    raw_name = fileOutput
    path = commonUtil.getPath(fileInput)
    fileOutput = commonUtil.getPath(fileOutput)
    data = fileUtil.readFromFileToData(path)
    data_len = len(data)
    # Column order is not guaranteed
    allColumns = data.columns.tolist()
    time_col = data[pivot]
    len_timeIntervals_sorted, timeIntervals_sorted = commonUtil.getIntervalList(timeIntervalList)

    # Keep every column except the pivot and any id-like column
    selecColumns = []
    for col in allColumns:
        if col != pivot and re.findall('id', col, flags=re.IGNORECASE) == []:
            selecColumns.append(col)

    # One dict of empty column lists per interval, keyed by interval index
    vals = {}
    for i in range(len_timeIntervals_sorted):
        vals[i] = commonUtil.d(selecColumns, appendColumns)

    # Not recommended to write an inline function like this, but it keeps the
    # hot loop fast. Aggregates rows [startLine, endLine] into one bucket for
    # the interval at index `interval`.
    def timeAggregate(startHourTimeStamp, startLine, endLine, interval: int):
        localTime = time.localtime(startHourTimeStamp[interval])
        timeSpan = endLine - startLine + 1
        alertLevel = int(timeSpan * 0.75)
        # (1) take the average value of the bucket;
        # (2) if at least 75 percent of the samples are 0, emit 0
        for attr in selecColumns:
            # scan from startLine to endLine
            vList = data[attr][startLine:endLine + 1]
            counter = vList.tolist().count(0)
            if counter < alertLevel:
                vals[interval][attr].append(sum(vList) / timeSpan)
            else:
                vals[interval][attr].append(0)
        # HARDCODE
        # TODO Currently no need to modify, not sure in the future version
        vals[interval]['Minute'].append(localTime.tm_min)
        vals[interval]['Hour'].append(localTime.tm_hour)
        vals[interval]['Day'].append(localTime.tm_mday)
        vals[interval]['Month'].append(localTime.tm_mon)
        vals[interval]['Year'].append(localTime.tm_year)

    startLine = [0 for _ in range(len_timeIntervals_sorted)]
    lastStartLine = [0 for _ in range(len_timeIntervals_sorted)]
    lastEndLine = [0 for _ in range(len_timeIntervals_sorted)]
    data_raw_time = time_col[0]
    lastHourTimeStamp = [time.mktime(time.strptime(data_raw_time, timeFormat))
                         for _ in range(len_timeIntervals_sorted)]
    for lineIdx in range(data_len):
        data_raw_time = time_col[lineIdx]
        currentHour = time.mktime(time.strptime(data_raw_time, timeFormat))
        # Layered combination: whenever one interval's bucket is full,
        # flush it and start a new bucket at the current line
        for idx, timeInterval in enumerate(timeIntervals_sorted):
            if currentHour - lastHourTimeStamp[idx] >= timeInterval * 60:
                startLine[idx] = lineIdx
                lastEndLine[idx] = lineIdx - 1
                timeAggregate(lastHourTimeStamp, lastStartLine[idx], lastEndLine[idx], idx)
                lastHourTimeStamp[idx] = currentHour
                lastStartLine[idx] = startLine[idx]
    # Flush the final (possibly partial) bucket of every interval
    for idx in range(len_timeIntervals_sorted):
        timeAggregate(lastHourTimeStamp, lastStartLine[idx], data_len - 1, idx)

    files = []
    for idx, timeInterval in enumerate(timeIntervals_sorted):
        fileUtil.saveDataToCSV(fileOutput + f"_{timeInterval}.csv", vals[idx])
        files.append(raw_name + f"_{timeInterval}.csv")
    return files
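# Hedged usage sketch for the aggregating rwCSV above (paths and interval
# strings are hypothetical; interval parsing is done by
# commonUtil.getIntervalList, whose exact input format is assumed here):
#
#   files = rwCSV(
#       fileInput='data/raw.csv',
#       fileOutput='output/aggregated',
#       timeIntervalList=['30min', '1hour'],
#   )
#   # one output CSV per interval, e.g. 'output/aggregated_30.csv'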