Code Example #1
def rwCSV(filesAggregated: list,
          fileOutput: str,
          needColumns: list,
          Default: list,
          pivot='datetime') -> list:

    files = []
    raw_output = fileOutput
    fileOutput = commonUtil.getPath(fileOutput)
    for f in filesAggregated:

        raw_name = f
        fileInput = commonUtil.getPath(f)

        data = fileUtil.readFromFileToData(fileInput=fileInput)
        # The Default columns are appended after needColumns, so they end up as the last columns
        vals = commonUtil.d(needColumns, Default)
        for attr in needColumns + Default:
            vals[attr] = data[attr]

        tmpName = raw_name.split(os.sep)[1]

        # NOTE: the output column order follows dict insertion order,
        # so the source file's original column order is not preserved.
        fileUtil.saveDataToCSV(fileOutput + f"_{tmpName}", vals)
        files.append(raw_output + f"_{tmpName}")

    return files
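The commonUtil helpers used throughout these examples are not shown. From their call sites, getPath appears to resolve a (possibly relative) path, and d appears to build a dict mapping each given column name to an empty list. A minimal hypothetical sketch of both, under exactly those assumptions:

import os

# Hypothetical sketches of the commonUtil helpers; the names appear in
# the examples, but these bodies are inferred from usage only.
PROJECT_ROOT = os.getcwd()  # assumption: paths resolve against a project root

def getPath(relativePath: str) -> str:
    # Resolve a possibly relative path to an absolute one.
    return os.path.abspath(os.path.join(PROJECT_ROOT, relativePath))

def d(*columnLists) -> dict:
    # Map every column name to an empty list, preserving argument order.
    vals = {}
    for cols in columnLists:
        for col in cols:
            vals[col] = []
    return vals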
Code Example #2
def addLabelToCSV(srcFile: str, labelFile: str, row='a'):
    # Copy the 'label' column from labelFile into srcFile,
    # then delete the now-redundant label file.
    import os

    srcFile = commonUtil.getPath(srcFile)
    labelFile = commonUtil.getPath(labelFile)

    srcData = readFromFileToData(srcFile, row)
    lbData = readFromFileToData(labelFile, row)
    srcData['label'] = lbData['label']
    saveDataToCSV(srcFile, srcData)
    os.remove(labelFile)
Code Example #3
def readFromFileToList(fileInput: str) -> list:
    fileInput = commonUtil.getPath(fileInput)
    # A context manager closes the file even if reading fails.
    with open(fileInput, "r") as f:
        return f.read().splitlines()
Code Example #4
def saveDataToCSV(fileOutput: str, data: dict):
    import pandas as pd
    fileOutput = commonUtil.getPath(fileOutput)
    # Column order in the CSV follows the dict's insertion order.
    dataframe = pd.DataFrame(data)
    dataframe.to_csv(fileOutput, index=False, sep=',')
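An illustrative call with a hypothetical file name: a dict of equal-length columns becomes a header row plus data rows, with columns in dict order.

saveDataToCSV("out.csv", {
    "datetime": ["2021-01-01 00:00", "2021-01-01 01:00"],
    "value": [1.0, 2.0],
})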
Code Example #5
def getTimeLabelColumns(srcFile: str, row='a'):
    srcFile = commonUtil.getPath(srcFile)
    srcData = readFromFileToData(srcFile, row)

    cols = srcData.columns.tolist()
    # Assumption: the first two columns plus the last column are the
    # time and label columns, matching the ones dropped by
    # retainTimeColumnsInCSV (Code Example #8).
    cols1 = cols[:2]
    cols2 = cols[-1:]

    return cols1 + cols2
Code Example #6
def loadYAML(fileInput: str) -> dict:
    import yaml
    yamlPath = commonUtil.getPath(fileInput)
    # A context manager closes the file even if reading fails.
    with open(yamlPath, 'r', encoding='utf-8') as f:
        cfg = f.read()
    # convert the YAML text to a dict
    return yaml.safe_load(cfg)
Code Example #7
    def __init__(self, fileOutput, databasesetting):
        self.fileOutput = commonUtil.getPath(fileOutput)
        self.connectionInfo = LoadYAML.rDatabaseSetting(databasesetting)

        self.DATABASE_NAME = self.connectionInfo[-2]
        self.TABLE_NAME = self.connectionInfo[-1][0]

        # The SQL header (column order) is built later and may change.
        self.head = []
        self.headString = ''
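LoadYAML.rDatabaseSetting is not shown in these examples; judging from the indexing above, it presumably returns a sequence whose second-to-last element is the database name and whose last element is a list of table names.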
Code Example #8
def retainTimeColumnsInCSV(srcFile: str, row='a'):
    srcFile = commonUtil.getPath(srcFile)
    srcData = readFromFileToData(srcFile, row)

    cols = srcData.columns.tolist()
    cols1 = cols[:2]
    cols2 = cols[-1:]

    # Drop the first two columns and the last (label) column; the
    # remaining columns are the features fed to the model (see its use
    # in prediction_flow, Code Example #11).
    srcData.drop(cols1 + cols2, axis=1, inplace=True)
    return srcData
Code Example #9
def deleteAllFolder(folderName: str):
    import shutil
    import os
    # Remove every file and subdirectory inside folderName,
    # keeping the folder itself.
    folderName = commonUtil.getPath(folderName)
    del_list = os.listdir(folderName)
    for f in del_list:
        file_path = os.path.join(folderName, f)
        if os.path.isfile(file_path):
            os.remove(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
Code Example #10
def addLinesToCSV(fileInput: str, lines: list):
    import csv
    fileInput = commonUtil.getPath(fileInput)
    # Append rows to the CSV; a context manager closes the file even on error.
    with open(fileInput, 'a', newline='') as out_f:
        writer = csv.writer(out_f)
        writer.writerows(lines)
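An illustrative call with a hypothetical file name, matching how prediction_flow (Code Example #11) appends its per-label counts; each inner list becomes one CSV row:

addLinesToCSV("prediction.csv", [
    ["2021-01-01", "normal", 42],
    ["2021-01-01", "anomaly", 3],
])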
Code Example #11
def prediction_flow(csv_filepath, prediction_filepath, model_filepath, labels,
                    YESTERDAY):
    """
    Prediction flows reads the file, removes columns not allowed in ml model,
    makes predictions, appends predictions to original dataframe, filters out necessary columns

    args:
        csv_filepath str Filepath of the aggregated csv
        model_filepath str Filepath of the joblib file
        labels list List of strings for the label, order matters
        remove_columns list List of strings for columns that need to be removed before placing in model
        retain_columns list List of strings for columns that need to be kept overall before storing in DB

    """
    # create list for predictions
    predictions = []

    # Load our pre-trained model
    model_filepath = commonUtil.getPath(model_filepath)
    clf = load(model_filepath)

    # read the aggregated data and keep only the model's input columns
    csv_filepath = commonUtil.getPath(csv_filepath)
    df_model_input = fileUtil.retainTimeColumnsInCSV(csv_filepath)

    # make a list of prediction labels
    for row in df_model_input.values:
        prediction = make_predictions(clf, row, labels)
        predictions.append(prediction)

    prediction_filepath = commonUtil.getPath(prediction_filepath)
    vals = commonUtil.d(['Date', 'Label', 'Frequency'])
    fileUtil.saveDataToCSV(prediction_filepath, vals)

    res = []
    for label in labels:
        tmp = [YESTERDAY, label]
        tmp.append(predictions.count(label))
        res.append(tmp)
    fileUtil.addLinesToCSV(prediction_filepath, res)
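make_predictions is not defined in these examples; from the loop above it must take the classifier, one feature row, and the label list, and return a label string. A minimal sketch under that assumption, for a scikit-learn style model:

# Hypothetical sketch of make_predictions, inferred from its call site:
# predict a single row and return the corresponding label string.
def make_predictions(clf, row, labels):
    # scikit-learn estimators expect a 2D array, hence the extra brackets
    pred = clf.predict([row])[0]
    # if the model emits class indices, map them onto the label strings
    return labels[int(pred)] if not isinstance(pred, str) else pred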
Code Example #12
def saveListToCSV(fileOutput: str, srcList: list):
    fileOutput = commonUtil.getPath(fileOutput)
    # Write one item per line, with no trailing newline after the last item.
    with open(fileOutput, "w") as f:
        f.write('\n'.join(str(item) for item in srcList))
Code Example #13
def rwCSV(filesAggregated: list,
          fileOutput: str,
          UserInputColumns: list,
          APPENDIX=APPENDIX,
          normalizeMethod=zscore) -> list:
    files = []
    raw_output = fileOutput
    fileOutput = commonUtil.getPath(fileOutput)
    for f in filesAggregated:

        raw_name = f
        fileInput = commonUtil.getPath(f)

        data = fileUtil.readFromFileToData(fileInput)

        vals = commonUtil.d(UserInputColumns)

        for attr in UserInputColumns:
            vals[attr] = normalizeMethod(data, attr)
        # The last columns are always APPENDIX, usually
        # 'Minute', 'Hour', 'Day', 'Month', 'Year'.
        for attr in APPENDIX:
            vals[attr] = data[attr]

        # Replace NaN values (e.g. produced by a zero-variance column
        # under zscore) with 0 before writing.
        newVals = {}
        for key in vals:
            newVals[key] = np.nan_to_num(vals[key])

        tmpName = raw_name.split(os.sep)[1]
        fileUtil.saveDataToCSV(fileOutput + f"_{tmpName}", newVals)
        files.append(raw_output + f"_{tmpName}")

    return files
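The default normalizeMethod, zscore, is not defined in these examples; from the call normalizeMethod(data, attr) it presumably standardizes a single column. A minimal sketch under that assumption:

# Hypothetical sketch of the zscore normalizer, inferred from usage:
# standardize one DataFrame column to zero mean and unit variance.
def zscore(data, attr):
    col = data[attr]
    # a zero-variance column yields NaN here, which rwCSV replaces with 0
    return (col - col.mean()) / col.std()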
Code Example #14
def processCSV(data, fileOutput, UserInputColumns, APPENDIX, timeFormat,
               timeInterval: int):
    startMinute, startHour, startDay, startMonth, startYear = combineAppendixItem(
        data, APPENDIX, rowIndex=0)

    # Midnight timestamp of the first record's day.
    tString = f'{startDay}-{startMonth}-{startYear}'
    startTimeStamp = time.mktime(time.strptime(tString, '%d-%m-%Y'))

    prevStartRow = 0

    # Number of timeInterval-minute slots in one day.
    slots = int(24 * 60 / timeInterval)
    dataPlots = [False for i in range(slots)]

    curTimeStamp = startTimeStamp

    location = commonUtil.getPath(fileOutput)

    # One pass over the data: mark which time slots of the current day
    # contain data, and flush a digest row whenever a new day starts.
    with open(location, 'a', newline='') as out_f:
        writer = csv.writer(out_f)
        for idx in range(len(data)):
            curMinute, curHour, curDay, curMonth, curYear = combineAppendixItem(
                data, APPENDIX, rowIndex=idx)
            curTime = f'{curMinute}-{curHour}-{curDay}-{curMonth}-{curYear}'

            curTimeStamp = time.mktime(time.strptime(curTime, timeFormat))

            if curTimeStamp - startTimeStamp < 86400:  # 86400 s = one day
                dataPlots[int((curHour * 60 + curMinute) / timeInterval)] = True
            else:
                # A new day started: write the digest row for the previous day.
                startTimeStamp = curTimeStamp
                writeToDigestCSV(data, UserInputColumns, APPENDIX, prevStartRow,
                                 dataPlots, writer, timeInterval)

                dataPlots = [False for i in range(slots)]
                dataPlots[int((curHour * 60 + curMinute) / timeInterval)] = True
                prevStartRow = idx

        # Flush the final (partial) day.
        writeToDigestCSV(data, UserInputColumns, APPENDIX, prevStartRow, dataPlots,
                         writer, timeInterval)
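combineAppendixItem and writeToDigestCSV are not shown in these examples. From the unpacking above, combineAppendixItem evidently returns the time-part values (Minute, Hour, Day, Month, Year) of one row; a minimal sketch under that assumption:

# Hypothetical sketch of combineAppendixItem, inferred from its call sites:
# pull the APPENDIX time columns (e.g. Minute, Hour, Day, Month, Year)
# out of one row of the data.
def combineAppendixItem(data, APPENDIX, rowIndex: int):
    return tuple(int(data[attr][rowIndex]) for attr in APPENDIX)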
Code Example #15
def readFromFileToData(fileInput: str, row='a'):
    import pandas as pd
    fileInput = commonUtil.getPath(fileInput)

    # row='a' reads the whole file; any other value is parsed as the
    # number of rows to read.
    if row == 'a':
        data = pd.read_csv(fileInput, encoding='utf-8', delimiter=',')
    else:
        data = pd.read_csv(fileInput,
                           encoding='utf-8',
                           delimiter=',',
                           nrows=int(row))
    if len(data) < 1:
        return None  # empty file; callers must handle this case
    return data
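Illustrative calls with a hypothetical file name; row='a' loads the whole file, while a numeric value limits the rows read:

df_all = readFromFileToData("aggregated.csv")               # every row
df_head = readFromFileToData("aggregated.csv", row='100')   # first 100 rows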
Code Example #16
def rwCSV(fileInput: str, fileOutput: str,
          timeIntervalList: list,
          timeFormat=timeFormat, pivot='datetime', appendColumns=appendColumns) -> list:
    if len(timeIntervalList) == 0:
        timeIntervalList = ['1hour']

    raw_name = fileOutput

    path = commonUtil.getPath(fileInput)
    fileOutput = commonUtil.getPath(fileOutput)
    data = fileUtil.readFromFileToData(path)

    data_len = len(data)

    # Columns are taken in whatever order the source file has them.
    allColumns = data.columns.tolist()
    time_col = data[pivot]

    len_timeIntervals_sorted, timeIntervals_sorted = commonUtil.getIntervalList(timeIntervalList)
    # Keep every column except the pivot and any id-like column.
    selecColumns = []
    for col in allColumns:
        if col != pivot and (re.findall('id', col, flags=re.IGNORECASE) == []):
            selecColumns.append(col)

    # One accumulator dict per interval, indexed by interval position.
    vals = commonUtil.d(timeIntervals_sorted)
    for i in range(len_timeIntervals_sorted):
        vals[i] = commonUtil.d(selecColumns, appendColumns)

    # Defining an inline function like this is not recommended style,
    # but it keeps the hot loop fast.
    def timeAggregate(startHourTimeStamp, startLine, endLine, interval: int):

        localTime = time.localtime(startHourTimeStamp[interval])

        timeSpan = endLine - startLine + 1
        alertLevel = int(timeSpan * 0.75)

        print(f"aggregating")

        # (1)AVERAGE VALUE CAL
        # (2)75 percent is 0, then 0
        for attr in selecColumns:
            # scan from startLine to endLine
            vList = data[attr][startLine:endLine + 1]
            counter = vList.tolist().count(0)

            if counter < alertLevel:
                value = sum(vList / timeSpan)
                vals[interval][attr].append(value)
            else:
                vals[interval][attr].append(0)

        # HARDCODED time columns.
        # TODO: no need to change this now, but it may need revisiting
        # in a future version.
        vals[interval]['Minute'].append(localTime.tm_min)
        vals[interval]['Hour'].append(localTime.tm_hour)
        vals[interval]['Day'].append(localTime.tm_mday)
        vals[interval]['Month'].append(localTime.tm_mon)
        vals[interval]['Year'].append(localTime.tm_year)

    startLine = [0 for i in range(len_timeIntervals_sorted)]
    lastStartLine = [0 for i in range(len_timeIntervals_sorted)]
    lastEndLine = [0 for i in range(len_timeIntervals_sorted)]

    data_raw_time = time_col[0]
    lastHourTimeStamp = [time.mktime(time.strptime(data_raw_time, timeFormat)) for i in range(len_timeIntervals_sorted)]
    currentHour = 0
    for lineIdx in range(data_len):

        data_raw_time = time_col[lineIdx]
        currentHour = time.mktime(time.strptime(data_raw_time, timeFormat))

        # For each interval layer: if the current record crosses the
        # interval boundary, aggregate the finished window and start a new one.
        for idx, timeInterval in enumerate(timeIntervals_sorted):
            if currentHour - lastHourTimeStamp[idx] >= timeInterval * 60:
                startLine[idx] = lineIdx
                lastEndLine[idx] = lineIdx - 1
                timeAggregate(lastHourTimeStamp, lastStartLine[idx], lastEndLine[idx], idx)
                lastHourTimeStamp[idx] = currentHour

            lastStartLine[idx] = startLine[idx]

    # Flush the final (partial) window of every interval layer.
    for idx in range(len_timeIntervals_sorted):
        timeAggregate(lastHourTimeStamp, lastStartLine[idx], data_len - 1, idx)

    files = []
    for idx, timeInterval in enumerate(timeIntervals_sorted):
        fileUtil.saveDataToCSV(fileOutput + f"_{timeInterval}.csv", vals[idx])

        files.append(raw_name + f"_{timeInterval}.csv")

    return files
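commonUtil.getIntervalList is also not shown. Given entries like '1hour' in timeIntervalList and the later comparison against timeInterval * 60 seconds, it presumably converts interval labels to minutes and sorts them ascending. A hypothetical sketch:

# Hypothetical sketch of commonUtil.getIntervalList, inferred from usage:
# convert labels such as '1hour' or '30min' to minutes, sorted ascending.
def getIntervalList(timeIntervalList: list):
    minutes = []
    for label in timeIntervalList:
        if label.endswith('hour'):
            minutes.append(int(label[:-len('hour')]) * 60)
        elif label.endswith('min'):
            minutes.append(int(label[:-len('min')]))
    minutes.sort()
    return len(minutes), minutes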