def get_mask_from_interval(times, start, end):
    """
        given a start and end time (epoch or iso string) and a time vector,
        return a boolean mask marking which points lie inside the interval
    """
    start = date2secs(start)
    end = date2secs(end)
    left = numpy.searchsorted(times, start)   # index of the first value >= start
    right = numpy.searchsorted(times, end)    # index of the first value >= end
    mask = numpy.full(len(times), False)
    mask[left:right] = True
    return mask
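# A minimal usage sketch for get_mask_from_interval (illustrative only; it assumes
# `date2secs` converts iso strings to epoch seconds and that `times` is a sorted
# vector of epoch seconds, as used throughout this module):
#
#   times = numpy.asarray([date2secs(f"2018-01-01T08:{m:02d}:00+02:00") for m in range(60)])
#   mask = get_mask_from_interval(times, "2018-01-01T08:10:00+02:00", "2018-01-01T08:20:00+02:00")
#   inside = times[mask]  # only the samples between 08:10 and 08:20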
def create_test_model_large(myModel, noVars=100, blobs=10, lines=1000):
    """
        create a large model: we insert blobs, each containing `lines` rows,
        !! in total we enter blobs*lines rows into the table
        Args:
            noVars: the number of variables to be created
            blobs: the number of blobs to be inserted
            lines: the number of lines to be inserted with EACH blob
        Returns:
            the model
    """
    t = Timer()
    # make a table and put in the variables
    print("make a table with ", noVars, " columns and ", (blobs * lines), " rows")
    vars = []
    for var in range(noVars):
        variableName = "myvariable" + str(var)
        vars.append(variableName)
    modeltemplates.create_table(myModel, variableNames=vars)

    # prepare a row as dataBlob
    rowEntry = {}
    for leaf in myModel.get_leaves('root.newTable.columns'):
        rowEntry[leaf["browsePath"]] = []
    # remove the time field and add the __time
    timeBrowsePath = myModel.get_leaves('root.newTable.timeField')[0]['browsePath']
    rowEntry.pop(timeBrowsePath)
    #rowEntry['__time']=[]
    print(rowEntry)

    # put in some data
    startTime = datetime.datetime.now(tz=pytz.utc)
    deltaseconds = 0
    tt = Timer()
    t.start()
    for blobNo in range(blobs):
        print("blob", blobNo, "of", blobs)
        blob = rowEntry.copy()
        for key in blob:
            blob[key] = list(range(deltaseconds, deltaseconds + lines))
        # the time column is special
        blob['__time'] = []
        for d in range(lines):
            blob['__time'].append(
                model.date2secs(startTime + datetime.timedelta(seconds=deltaseconds + d)))
        deltaseconds += lines
        print("deltasec", deltaseconds)
        tt.remaining(blobNo, blobs)
        t.start()
        myModel.ts_table_add_blob(blob)
        t.stop("entered " + str(lines) + " lines")
    return myModel
def motif_jumper(functionNode):
    logger = functionNode.get_logger()
    logger.info("==>>>> in motif_jumper " + functionNode.get_browse_path())
    minerNode = functionNode.get_child("miner").get_targets()[0]
    widgetNode = minerNode.get_child("widget").get_targets()[0]
    dates = minerNode.get_child("peaks").get_value()
    if type(dates) is not list:
        dates = []
    currentIndex = functionNode.get_child("jumpPos").get_value()
    inc = functionNode.get_child("jumpInc").get_value()
    nextIndex = currentIndex + inc
    if nextIndex >= len(dates):
        nextIndex = 0
    functionNode.get_child("jumpPos").set_value(nextIndex)
    if dates:
        logger.debug(f"jump to index {nextIndex} : {dates[nextIndex]}")
    if len(dates):
        motif = minerNode.get_child("motif").get_targets()[0]
        motifStart = motif.get_child("startTime").get_value()
        motifEnd = motif.get_child("endTime").get_value()
        currentViewStart = widgetNode.get_child("startTime").get_value()
        currentViewEnd = widgetNode.get_child("endTime").get_value()
        currentViewWidth = date2secs(currentViewEnd) - date2secs(currentViewStart)
        windowStart = date2secs(dates[nextIndex]) - currentViewWidth / 2
        windowEnd = date2secs(dates[nextIndex]) + currentViewWidth / 2
        widgetNode.get_child("startTime").set_value(epochToIsoString(windowStart, zone=timezone('Europe/Berlin')))
        widgetNode.get_child("endTime").set_value(epochToIsoString(windowEnd, zone=timezone('Europe/Berlin')))
    return True
def motif_jumper(functionNode):
    """
        jump from one result to the next/prev;
        if the increase/decrease is zero, we jump to the motif instead
    """
    logger = functionNode.get_logger()
    logger.info("==>>>> in motif_jumper " + functionNode.get_browse_path())
    minerNode = functionNode.get_child("miner").get_targets()[0]
    widgetNode = minerNode.get_child("widget").get_targets()[0]
    dates = minerNode.get_child("peaks").get_value()
    if type(dates) is not list:
        dates = []
    currentIndex = functionNode.get_child("jumpPos").get_value()
    inc = functionNode.get_child("jumpInc").get_value()
    if inc != 0:
        nextIndex = currentIndex + inc
        if nextIndex >= len(dates):
            nextIndex = 0
        functionNode.get_child("jumpPos").set_value(nextIndex)
        if dates:
            logger.debug(f"jump to index {nextIndex} : {dates[nextIndex]}")

    motif = minerNode.get_child("motif").get_targets()[0]
    motifStart = motif.get_child("startTime").get_value()
    motifEnd = motif.get_child("endTime").get_value()
    motifWidth = date2secs(motifEnd) - date2secs(motifStart)
    currentViewStart = widgetNode.get_child("startTime").get_value()
    currentViewEnd = widgetNode.get_child("endTime").get_value()
    currentViewWidth = date2secs(currentViewEnd) - date2secs(currentViewStart)

    if inc == 0:
        center = date2secs(motifStart) + 0.5 * motifWidth
        windowStart = center - currentViewWidth / 2
        windowEnd = center + currentViewWidth / 2
        widgetNode.get_child("startTime").set_value(
            epochToIsoString(windowStart, zone=timezone('Europe/Berlin')))
        widgetNode.get_child("endTime").set_value(
            epochToIsoString(windowEnd, zone=timezone('Europe/Berlin')))
    else:
        if len(dates):
            center = date2secs(dates[nextIndex]) - 0.5 * motifWidth
            windowStart = center - currentViewWidth / 2
            windowEnd = center + currentViewWidth / 2
            widgetNode.get_child("startTime").set_value(
                epochToIsoString(windowStart, zone=timezone('Europe/Berlin')))
            widgetNode.get_child("endTime").set_value(
                epochToIsoString(windowEnd, zone=timezone('Europe/Berlin')))
    return True
def write_test(rate, port=6001):
    while True:
        time.sleep(float(rate) / 1000)
        now = datetime.datetime.now(pytz.timezone("Europe/Berlin"))
        epoch = date2secs(now)
        blob = make_blob(epoch)
        body = [blob]
        try:
            startTime = datetime.datetime.now()
            host = "http://127.0.0.1:" + str(port) + "/_appendRow"
            r = requests.post(host, data=json.dumps(body), timeout=5)
            diff = (datetime.datetime.now() - startTime).total_seconds()
            print(f"sent {json.dumps(body)} with result {r.status_code} difftime {diff}")
        except Exception as ex:
            print(f"sent {json.dumps(body)} with exception {ex}")
def show_timeseries_results(functionNode):
    results = functionNode.get_child("results").get_value()
    motifNode = functionNode.get_child("motif").get_target()
    startTime = motifNode.get_child("startTime").get_value()
    endTime = motifNode.get_child("endTime").get_value()
    varNode = motifNode.get_child("variable").get_target()
    motifTimeSeries = varNode.get_time_series(start=startTime, end=endTime)
    varName = varNode.get_property("name")

    for child in functionNode.get_children():
        if child.get_name().endswith("_expected"):
            if not child.get_name().startswith(varName):
                child.delete()

    resultNode = functionNode.create_child(name=varName + '_expected', type="timeseries")
    resultNode.set_time_series([], [])

    cnt = 0
    for result in results:
        resultTimes = motifTimeSeries['__time'] + result['epochStart'] - date2secs(startTime)  # time + offset
        resultValues = (motifTimeSeries['values']).copy()
        lastIdx = len(resultTimes) - 1

        ### for each result
        excerptFullTs = varNode.get_time_series(start=result['startTime'], end=result['endTime'])
        excerptFullTsValues = (excerptFullTs['values'])[:-1]
        # stumpy_print_labeled_2_axis(resultValues, excerptFullTsValues, cnt, str(cnt), varName)
        #resultValuesNorm = mixed_norm_cross(resultValues, excerptFullTsValues)
        resultValuesNorm = std_norm(resultValues, excerptFullTsValues)

        #kna
        resultValuesNormNan = resultValuesNorm.copy()
        resultValuesNormNan = numpy.insert(resultValuesNormNan, 0, numpy.nan)
        resultValuesNormNan = numpy.append(resultValuesNormNan, numpy.nan)
        resultTimesNan = resultTimes.copy()
        resultTimesNan = numpy.insert(resultTimesNan, 0, resultTimes[0] + resultTimes[0] - resultTimes[1])
        resultTimesNan = numpy.append(resultTimesNan, resultTimes[lastIdx] + resultTimes[lastIdx] - resultTimes[lastIdx - 1])
        cnt = cnt + 1

        # to avoid overlaps with old results (which can happen if results actually overlap), delete first
        resultNode.delete_time_series(start=resultTimesNan[0], end=resultTimesNan[-1])
        #kna
        resultNode.insert_time_series(values=resultValuesNormNan, times=resultTimesNan)

    widgetNode = functionNode.get_child("widget").get_target()
    widgetNode.get_child("selectedVariables").add_references(resultNode, allowDuplicates=False)
def import_run(iN):
    _helper_log(f"IMPORT STARTED")
    timeStartImport = dt.datetime.now()

    # --- define vars
    importerNode = iN.get_parent()

    # --- [vars] define
    tablename = iN.get_child("tablename").get_value()
    _helper_log(f"tablename: {tablename}")

    # --- create needed nodes
    importerNode.create_child('imports', type="folder")
    importsNode = importerNode.get_child("imports")
    # TODO importsNode.get_child(tablename).delete()
    importsNode.create_child(tablename, type="table")
    table = importsNode.get_child(tablename)
    table.create_child('variables', type="folder")
    table.create_child('columns', type="referencer")
    table.create_child('metadata', type="const")
    vars = table.get_child("variables")
    cols = table.get_child("columns")

    # --- read metadata and fields
    metadataRaw = iN.get_child("metadata").get_value()
    metadata = json.loads(metadataRaw)
    table.get_child("metadata").set_value(metadata)
    fields = metadata["fields"]
    timefield = int(metadata["timefield"]) - 1
    filename = metadata["filename"]
    headerexists = metadata["headerexists"]
    filepath = 'upload/' + filename

    # --- load csv data
    # * https://www.shanelynn.ie/python-pandas-read_csv-load-data-from-csv-files/
    # * [ ] optimize speed? https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
    # * [ ] vectorize a loop https://stackoverflow.com/questions/27575854/vectorizing-a-function-in-pandas
    df = pd.read_csv(filepath)

    # --- define time list
    # * select rows and columns from dataframe https://thispointer.com/select-rows-columns-by-name-or-index-in-dataframe-using-loc-iloc-python-pandas/
    timeList = df.iloc[:, timefield].to_list()
    epochs = [date2secs(time) for time in timeList]
    print(epochs)

    # --- import data, set vars and columns
    data = {}
    for field in fields:
        fieldno = int(field["no"]) - 1
        fieldname = field["val"]
        fieldvar = vars.create_child(fieldname, type="timeseries")
        if timefield != fieldno:
            data[fieldname] = df.iloc[:, fieldno].to_list()
            fieldvar.set_time_series(values=data[fieldname], times=epochs)
            cols.add_references(fieldvar)
            _helper_log(f"val: {fieldname}")
            print(fieldvar)

    _helper_log(f"IMPORT DONE (seconds: {(dt.datetime.now()-timeStartImport).seconds})")
    return True
def reader(functionNode):
    logger = functionNode.get_logger()
    logger.info("==>>>> in raw_reader_2 " + functionNode.get_browse_path())
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)
    folder = path + r'\..\upload'
    m = functionNode.get_model()
    signalNode = functionNode.get_child("control").get_child("signal")
    signalNode.set_value("nosignal")
    varFolder = functionNode.get_child("variablesFolder").get_targets()[0]  # we expect a direct link to the folder where to place the new variables
    fileNameMatch = functionNode.get_child("fileFilter").get_value()
    progressNode = functionNode.get_child("control").get_child("progress")
    localtz = timezone('Europe/Berlin')

    fileNames = os.listdir(folder)
    #fileNames = [r'20190805_TempSenMachine_1.dat',r'20190805_TempSenMachine_3.dat',r'20190805_TempSenMachine_2.dat']
    for idx, fileName in enumerate(fileNames):
        progressNode.set_value(round(idx / len(fileNames), 2))
        if fileNameMatch not in fileName:
            logger.debug(f"skip file {fileName}, no match")
            continue  # this file will be ignored
        if fileName in functionNode.get_child("processedFiles").get_value():
            logger.debug(f"skip file {fileName}, already done")
            continue
        try:
            # now open the file, read it in
            fullFileName = folder + "\\" + fileName
            logger.info(f"processing {fullFileName}")
            data = pandas.read_csv(fullFileName, sep=",")
            data.rename(columns_renamer, axis="columns", inplace=True)

            # first the times
            cols = list(data.columns)
            times = []
            for dateString in data[cols[0]]:
                mydate = dateutil.parser.parse(dateString, dayfirst=True)
                try:
                    mydateAware = localtz.localize(mydate)
                except:
                    mydateAware = mydate  # is already localized
                epoch = model.date2secs(mydateAware)
                times.append(epoch)

            drops = []
            for col in cols[1:]:
                try:
                    values = numpy.asarray(data[col], dtype=numpy.float64)
                except:
                    print(f"could not convert column {col}")
                    drops.append(col)
                    continue
                # print(values)
                values[numpy.isfinite(values) == False] = numpy.nan  # set all non-finite values to numpy.nan
                data[col] = values
            data = data.drop(columns=drops)
            cols = list(data.columns)

            # in the file, the variables only have a single, unique name; now we need to find the matches in the table,
            # so we iterate over all columns of the table and try to match the sensor names of the file to the variables
            # by looking at the actual name of the node; we don't care where the nodes reside, just the node names must match

            # build a lookup to speed things up
            existingColumns = {}
            for columnNode in functionNode.get_child("variablesFolder").get_leaves():
                existingColumns[columnNode.get_name()] = columnNode

            # now insert
            for col in cols[1:]:
                if signalNode.get_value() == "stop":
                    raise Exception("user stop")
                if col in existingColumns:
                    mynode = existingColumns[col]
                else:
                    mynode = varFolder.create_child(col, properties={"tsAllocSize": 100000, "type": "timeseries"})
                m.disable_observers()
                res = mynode.insert_time_series(times=times, values=data[col])
                m.enable_observers()
                logger.debug(f"inserting {len(data[col])} on node {mynode.get_browse_path()}, result = {res}")
        except Exception as ex:
            print(f"can't import file {fileName}, {ex}")
            if signalNode.get_value() == "stop":
                break
            continue

    # remember these files as done
    doneFilesList = functionNode.get_child("processedFiles").get_value()
    doneFilesList.extend(fileNames)
    functionNode.get_child("processedFiles").set_value(doneFilesList)  # remember the files
    return True
def dates():
    s = "2018-1-1T08:15:00+02:00"
    s2 = "2018-1-1T06:15:00+00:00"
    print(model.date2secs(s), model.date2secs(s2))
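# Note: the two iso strings in dates() denote the same instant (08:15 at UTC+02:00
# equals 06:15 UTC), so, assuming date2secs returns epoch seconds, both printed
# values should be identical, e.g.:
#
#   assert model.date2secs("2018-1-1T08:15:00+02:00") == model.date2secs("2018-1-1T06:15:00+00:00")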
def pps_miner(functionNode):
    logger = functionNode.get_logger()
    logger.info("==>>>> in pps_miner " + functionNode.get_browse_path())
    myWidget = functionNode.get_child("widget").get_target()
    myModel = functionNode.get_model()
    motifs = myWidget.get_child("hasAnnotation").get_child("selectedAnnotations").get_leaves()
    annotations = functionNode.get_child("annotations")
    progressNode = functionNode.get_child("control").get_child("progress")
    progressNode.set_value(0)

    """
        preparation:
        - find the current widget, take the currently selected annotation as my motif
        - if not available, take the motif from last time
        - connect the result to the table
    """
    motif = None
    if motifs:
        motif = motifs[0]
        functionNode.get_child("motif").add_references(motif, deleteAll=True)  # take this annotation
    else:
        # there is currently no selection, we take the motif from last time if we have one
        motifs = functionNode.get_child("motif").get_targets()
        if motifs:
            motif = motifs[0]
    if not motif:
        logger.error("have no motif")
        return False

    # prepare the result: delete previous annotations
    functionNode.get_child("peaks").set_value([])
    try:
        myModel.disable_observers()
        annos = annotations.get_children()
        if annos:
            for anno in annos:
                anno.delete()
    except:
        myModel.enable_observers()
        return False
    myModel.enable_observers()
    myModel.notify_observers(annotations.get_id(), "children")  # trigger the widgets to delete the annotations

    ####################################
    ## get the settings
    ####################################
    preFilter = functionNode.get_child("preFilter").get_value()
    postFilter = functionNode.get_child("postFilter").get_value()
    subtractPolynomOrder = functionNode.get_child("subtractPolynomOrder").get_value()
    differentiate = functionNode.get_child("differentiate").get_value()
    timeRanges = functionNode.get_child("timeRanges").get_value()
    timeRanges = {int(k): v for k, v in timeRanges.items()}
    valueRanges = functionNode.get_child("valueRanges").get_value()
    valueRanges = {int(k): v for k, v in valueRanges.items()}
    typeFilter = functionNode.get_child("typeFilter").get_value()

    ####################################
    # get the motif
    ####################################
    motifStartTime = date2secs(motif.get_child("startTime").get_value())
    motifEndTime = date2secs(motif.get_child("endTime").get_value())
    motifVariable = motif.get_child("variable").get_targets()[0]
    #motifStartTime = motifStartTime+0.2*(motifEndTime-motifStartTime)
    #print(f"{vars[varName].get_name()},{motifStartTime}")
    motif = motifVariable.get_time_series(start=motifStartTime, end=motifEndTime)
    #print(motif)
    motifX = motif["__time"]
    motifY0 = motif["values"]
    motifY1 = mnh.pps_prep(motifY0, filter=preFilter, poly=subtractPolynomOrder,
                           diff=differentiate, postFilter=postFilter)
    motifPPS = mnh.prominent_points(motifY1, motifX)

    ####################################
    # get the time series
    ####################################
    # make the time series data and pps
    series = motifVariable.get_time_series()  # the full time series
    x = series["__time"]
    t0 = series["__time"][0]
    x = x - t0  # start time from 0
    y0 = series["values"]
    y1 = mnh.pps_prep(y0, filter=preFilter, poly=subtractPolynomOrder,
                      diff=differentiate, postFilter=postFilter)
    pps = mnh.prominent_points(y1, x)

    ####################################
    # MINING
    ####################################
    progressNode.set_value(0.3)
    matches = mnh.pps_mining(motifPPS['pps'], pps['pps'],
                             timeRanges=timeRanges, valueRanges=valueRanges,
                             typeFilter=typeFilter, motifStartIndex=0, debug=False)
    print(f"{len(matches)} matches: {[secs2date(t0+m['time']).isoformat() for m in matches]}")

    ####################################
    # create result Annotations
    ####################################
    annoTimeLen = motifEndTime - motifStartTime
    newAnnotations = []
    for m in matches:
        anno = {"type": "time", "startTime": "", "endTime": "", "tags": ["pattern_match"]}
        anno["startTime"] = epochToIsoString(m["time"] + t0 - annoTimeLen, zone=timezone('Europe/Berlin'))
        anno["endTime"] = epochToIsoString(m["time"] + t0, zone=timezone('Europe/Berlin'))
        newAnnotations.append(anno)

    # remove trivial matches (inside a guard area around the motif)
    guard = (motifEndTime - motifStartTime) / 2
    newAnnotations = [
        anno for anno in newAnnotations
        if date2secs(anno["startTime"]) < (motifStartTime - guard)
        or date2secs(anno["startTime"]) > motifEndTime + guard
    ]

    progressNode.set_value(0.6)
    myModel.disable_observers()
    for anno in newAnnotations:
        # create the annotation in the model
        newAnno = annotations.create_child(type="annotation")
        for k, v in anno.items():
            newAnno.create_child(properties={"name": k, "value": v, "type": "const"})
    myModel.enable_observers()
    myModel.notify_observers(annotations.get_id(), "children")

    # also write the peaks
    peaks = [epochToIsoString(m["time"] + t0, zone=timezone('Europe/Berlin')) for m in matches]
    functionNode.get_child("peaks").set_value(peaks)
    progressNode.set_value(1)
    return True
def motif_miner(functionNode):
    logger = functionNode.get_logger()
    logger.info("==>>>> in motif_miner " + functionNode.get_browse_path())
    progressNode = functionNode.get_child("control").get_child("progress")
    progressNode.set_value(0)
    tableNode = functionNode.get_child("table").get_targets()[0]
    timeNode = tableNode.get_child("timeField").get_targets()[0]
    subSamplingFactor = functionNode.get_child("subSamplingFactor").get_value()
    sigmaNoise = functionNode.get_child("addNoise").get_value()
    annotations = functionNode.get_child("annotations")
    myModel = functionNode.get_model()
    signalNode = functionNode.get_child("control").get_child("signal")
    polynomPre = functionNode.get_child("subtractPolynomOrder").get_value()
    if polynomPre != "none":
        polynomPre = "subtract_poly_" + str(int(polynomPre))
    logger.debug(f"polynome pre is {polynomPre}")
    algo = functionNode.get_child("algorithm").get_value()

    """
        preparation:
        - find the current widget, take the currently selected annotation as my motif
        - if not available, take the motif from last time
        - connect the result to the table
    """
    myWidget = functionNode.get_child("widget").get_targets()[0]
    motifs = myWidget.get_child("hasAnnotation").get_child("selectedAnnotations").get_leaves()
    motif = None
    if motifs:
        motif = motifs[0]
        functionNode.get_child("motif").add_references(motif, deleteAll=True)  # take this annotation
    else:
        # there is currently no selection, we take the motif from last time if we have one
        motifs = functionNode.get_child("motif").get_targets()
        if motifs:
            motif = motifs[0]
    if not motif:
        logger.error("have no motif")
        return False

    # prepare the result: delete previous scores and annotations
    scoreNode = functionNode.get_child("score")
    functionNode.get_child("peaks").set_value([])
    scoreNode.connect_to_table(tableNode)  # this will make it part of the table and write it all to numpy.inf
    try:
        myModel.disable_observers()
        annos = annotations.get_children()
        if annos:
            for anno in annos:
                anno.delete()
    except:
        myModel.enable_observers()
        return False
    myModel.enable_observers()
    myModel.notify_observers(annotations.get_id(), "children")  # trigger the widgets to delete the annotations

    # prepare the motif
    motifVariable = motif.get_child("variable").get_targets()[0]
    start = motif.get_child("startTime").get_value()
    end = motif.get_child("endTime").get_value()
    motifEpochStart = date2secs(start)
    motifEpochEnd = date2secs(end)
    logger.debug(f"motif: {motifVariable.get_browse_path()}, {start} .. {end} ")
    timeIndices = timeNode.get_time_indices(start, end)
    logger.debug(f" motif len {len(timeIndices)}")

    y = motifVariable.get_value().copy()  # y holds the full data
    y = gaussian_filter(y, sigma=2)  # smoothen it
    #normalize y
    #y = normalize_range(y,0,1) #normalize
    t = timeIndices[::subSamplingFactor]  # downsampling of the data
    yMotif = y[t].copy()  # yMotif holds the motif data downsampled
    noise = np.random.normal(0, sigmaNoise, len(yMotif))
    yMotif = yMotif + noise  # add a small amount of noise to the template in order to not create pathological results

    # prepare the result
    motifTimeLen = (timeIndices[-1] - timeIndices[0])
    scoreIndices = numpy.arange(timeIndices[0] - 1 * motifTimeLen,
                                timeIndices[-1] + 200 * motifTimeLen)
    scoreTimes = scoreIndices[::subSamplingFactor]
    #y = y[::subSamplingFactor] # full data
    #y = y[scoreTimes]

    # we will now go through the data and every 2 seconds update the results with the new results
    runningR = True
    runningL = True
    blockSize = 10  # times the motif len
    blockStartR = timeIndices[0]  # start is the motif start time for the search to the right
    blockStartL = timeIndices[0] - (blockSize - 1) * motifTimeLen  # start is the motif start time for the search to the left
    signalNode.set_value("none")

    while runningR or runningL:
        percentR = float((blockStartR - timeIndices[0]) / (len(t) - blockStartR))
        percentL = float(timeIndices[0] - blockStartL) / float(timeIndices[0])
        progressNode.set_value(max(percentL, percentR))
        logger.debug(f"processing block {blockStartR} {len(y)}")
        oldResult = scoreNode.get_value().copy()
        resultVector = numpy.full(len(oldResult), numpy.inf, dtype=numpy.float64)

        if runningR:
            # search to the right
            scoreIndices = numpy.arange(blockStartR, blockStartR + blockSize * motifTimeLen)
            scoreTimes = scoreIndices[::subSamplingFactor]
            yWindow = y[scoreTimes]
            result = motif_mining(template=yMotif, y=yWindow,
                                  preprocesssing_method=polynomPre, sim_method=algo,
                                  normalize=False, progressNode=progressNode)
            #resultVector = scoreNode.get_value()
            indices = numpy.asarray(list(range(0, len(resultVector), subSamplingFactor)))
            if 0:
                # for full
                for i in range(subSamplingFactor):
                    resultVector[indices + i] = result
            else:
                for i in range(subSamplingFactor):
                    resultVector[scoreTimes + i] = result
            #scoreNode.set_value(resultVector)
            blockStartR += (blockSize - 1) * motifTimeLen
            if blockStartR > (len(y) - (blockSize * motifTimeLen)):
                runningR = False

        if runningL:
            # search to the left
            scoreIndices = numpy.arange(blockStartL, blockStartL + blockSize * motifTimeLen)
            scoreTimes = scoreIndices[::subSamplingFactor]
            yWindow = y[scoreTimes]
            result = motif_mining(template=yMotif, y=yWindow,
                                  preprocesssing_method=polynomPre, sim_method=algo,
                                  normalize=False, progressNode=progressNode)
            #resultVector = scoreNode.get_value()
            indices = numpy.asarray(list(range(0, len(resultVector), subSamplingFactor)))
            if 0:
                # for full
                for i in range(subSamplingFactor):
                    resultVector[indices + i] = result
            else:
                for i in range(subSamplingFactor):
                    resultVector[scoreTimes + i] = result
            #scoreNode.set_value(resultVector)
            blockStartL -= (blockSize - 1) * motifTimeLen
            if blockStartL < 0:
                runningL = False

        transferIndices = numpy.isfinite(resultVector)  # set the inf to 0 where we have new results
        oldResult[transferIndices] = resultVector[transferIndices]
        scoreNode.set_value(oldResult)
        if signalNode.get_value() == "stop":
            runningR = False
            runningL = False
        generate_peaks(resultVector, functionNode, logger, timeNode,
                       (len(yMotif) * subSamplingFactor), motifEpochStart, motifEpochEnd)
        time.sleep(0.8)  # this makes a yield to other threads

    #generate_peaks(resultVector,functionNode,logger,timeNode,(len(yMotif) * subSamplingFactor),motifEpochStart,motifEpochEnd)
    return True
def logistic_regression2(functionNode):
    """
        new version of the logistic regression:
        we are following the approach of an n-class classification where the n-1 classes
        are given in the categoryMap starting at number 1; all data inside the regions
        (we support only one here) will be treated as category 0
    """
    scaling = True
    algorithm = "rf"  # one of rf, lg
    logger = functionNode.get_logger()
    logger.info("==>>>> in logisticregression 2: " + functionNode.get_browse_path())

    # now get the input and outputs
    inputNodes = functionNode.get_child("input").get_leaves()
    for node in inputNodes:
        logger.debug("input node" + node.get_browse_path())
    outputNode = functionNode.get_child("output").get_leaves()[0]
    logger.debug("outputnode " + outputNode.get_browse_path())
    annotations = functionNode.get_child("annotations").get_leaves()
    logger.debug("no annotations: " + str(len(annotations)))
    timeNode = inputNodes[0].get_table_time_node()  # if the node is a column belonging to a time series table, we now get the time node of that table
    logger.debug("the time node is " + timeNode.get_browse_path())
    tableNode = timeNode.get_table_node()
    tableLen = tableNode.get_table_len()

    """
        now prepare the data for processing:
        learning:
            find all annotations with tag "learn"; inside them, take all annotations that are tagged
            with a tag from the tagsMap/categoryMap (the target values start at 1) and use them for
            learning, where all areas that are not tagged, or tagged with something not in the
            category map, get the tag value 0
        for scoring, we use the annotations tagged with "score"
    """
    # get the annotation map
    tagsMap = functionNode.get_child("categoryMap").get_value()  # pick the category mapping from the model

    learnIndices = []
    learnTimes = []
    scoreIndices = []
    scoreTimes = []
    # collect the "learn" and "score" regions
    for anno in annotations:
        tag = anno.get_child('tags').get_value()[0]
        if tag in ["learn", "score"]:
            regionStartDate = anno.get_child("startTime").get_value()
            regionEndDate = anno.get_child("endTime").get_value()
            logger.debug(tag + " region: " + str(regionStartDate) + " " + str(regionEndDate))
            regionStartTime = date2secs(regionStartDate)
            regionEndTime = date2secs(regionEndDate)
            #regionIndices = timeNode.get_time_indices(regionStartTime, regionEndTime)
            if tag == "learn":
                learnTimes.append({"start": regionStartTime, "end": regionEndTime})
                learnIndices.extend(timeNode.get_time_indices(regionStartTime, regionEndTime))
            elif tag == "score":
                scoreTimes.append({"start": regionStartTime, "end": regionEndTime})
                scoreIndices.extend(timeNode.get_time_indices(regionStartTime, regionEndTime))

    # now prepare the training target values based on the given annotations;
    # the regions where no anno is given are set to 0.
    # for simplicity we start with a training array of the full size of the table; later we cut it
    # for the region
    training = numpy.full(tableLen, 0)  # set zero to all values as a padding
    groundTruth = numpy.full(tableLen, 0)  # set zero, used for the confusion matrix
    allAnnoIndices = []  # all indices that are inside a learn region and inside an annotation region of interest
    #indices = [] # a list of indices which give the points in time that were labelled
    #training = [] # the training values are the tags from the annotations translated via the tagsMap

    for anno in annotations:
        startTime = date2secs(anno.get_child("startTime").get_value())
        endTime = date2secs(anno.get_child("endTime").get_value())
        tag = anno.get_child('tags').get_value()[0]
        if tag in tagsMap:
            # we take annotations that have a tag which was selected in the tagsMap
            # now check if it is inside any learn area
            for region in learnTimes:
                if startTime >= region["start"] and endTime <= region["end"]:
                    # the annotation is inside this region
                    annoIndices = list(timeNode.get_time_indices(startTime, endTime))  # this gives us the indices in the table between start and end time
                    allAnnoIndices.extend(annoIndices)
                    # now set the values with fancy indexing
                    training[annoIndices] = tagsMap[tag]
                    logger.debug("set training target %s as %i", tag, tagsMap[tag])
            for region in scoreTimes:
                if startTime >= region["start"] and endTime <= region["end"]:
                    annoIndices = list(timeNode.get_time_indices(startTime, endTime))
                    groundTruth[annoIndices] = tagsMap[tag]

    # now cut the region if we have one via fancy indexing
    #training = training[learnIndices]
    training = training[allAnnoIndices]

    trainingData = []
    # now grab the values from the columns
    finiteNodes = []
    for node in inputNodes:
        values = numpy.asarray(node.get_value())
        values = values[allAnnoIndices]
        if numpy.isfinite(values).all():
            trainingData.append(list(values))
            finiteNodes.append(node)
            logger.debug("has good finite values" + node.get_name())
        else:
            logger.warning("some infinite values in " + node.get_name())
            # now pad with zero
            values[False == numpy.isfinite(values)] = 0
            trainingData.append(list(values))
            finiteNodes.append(node)
    table = numpy.stack(trainingData, axis=0)

    # now fit the model
    model = LogisticRegression()
    #model = RandomForestClassifier(n_estimators=50,max_features=10)
    scaler = StandardScaler()
    scaler.fit(table.T)
    if scaling:
        data = scaler.transform(table.T)
    else:
        data = table.T
    model.fit(data, numpy.asarray(training))

    #
    # scoring is on the "score" regions
    #
    logger.info("now score")
    scoreData = []
    # now grab the values from the columns
    for node in finiteNodes:
        values = node.get_value()[scoreIndices]
        values[False == numpy.isfinite(values)] = 0  # pad inf as zero
        scoreData.append(values)
    scoreTable = numpy.stack(scoreData, axis=0)
    if scaling:
        data = scaler.transform(scoreTable.T)
    else:
        data = scoreTable.T
    score = model.predict(data)

    # now write the score back to the output
    # we need to fill in the remaining entries with inf
    scoreFill = numpy.full(tableLen, numpy.inf)
    scoreFill[scoreIndices] = score
    outputNode.set_value(list(scoreFill))

    """
    #also get the confusion matrix
    matrix = {'00':0,'01':0,'11':0,'10':0}
    for val,score in zip(groundTruth[scoreIndices],score):
        entry = str(val)+str(score)
        matrix[entry]=matrix[entry]+1
    for entry in matrix:
        matrix[entry]=float(matrix[entry])/float(len(scoreIndices))
    logger.debug("Confusion Matrix"+str(matrix))
    functionNode.get_child("confusion").set_value(matrix)
    """
    return True
def logistic_regression2_old2(functionNode):
    """
        new version of the logistic regression:
        we are following the approach of an n-class classification where the n-1 classes
        are given in the categoryMap starting at number 1; all data inside the regions
        (we support only one here) will be treated as category 0
    """
    scaling = True
    algorithm = "rf"  # one of rf, lg
    logger = functionNode.get_logger()
    logger.info("==>>>> in logisticregression 2: " + functionNode.get_browse_path())

    # now get the input and outputs
    inputNodes = functionNode.get_child("input").get_leaves()
    for node in inputNodes:
        logger.debug("input node" + node.get_browse_path())
    outputNode = functionNode.get_child("output").get_leaves()[0]
    logger.debug("outputnode " + outputNode.get_browse_path())
    annotations = functionNode.get_child("annotations").get_leaves()
    if annotations:
        logger.debug("no annotations: " + str(len(annotations)))
    timeNode = inputNodes[0].get_table_time_node()  # if the node is a column belonging to a time series table, we now get the time node of that table
    logger.debug("the time node is " + timeNode.get_browse_path())

    """
        now prepare the data for processing:
        1.1) take the annotated areas, collect all start and end times and get all indices of time inside the annotations
        1.2) use the map to match the tag labels to values
        1.3) cut out the input data from the variable based on the indices from 1.1)
        1.4) train the model
        1.5) score on the full data
        1.6) write back the score to the output
    """
    # get the annotation map
    tagsMap = functionNode.get_child("categoryMap").get_value()  # pick the category mapping from the model

    hasRegion = False
    # see if we have a "region" annotation (we only support one)
    for anno in annotations:
        if anno.get_child('tags').get_value()[0] == "region":
            regionStartDate = anno.get_child("startTime").get_value()
            regionEndDate = anno.get_child("endTime").get_value()
            hasRegion = True
            logger.debug("has Region:" + str(regionStartDate) + " " + str(regionEndDate))
            regionStartTime = date2secs(regionStartDate)
            regionEndTime = date2secs(regionEndDate)
            regionIndices = timeNode.get_time_indices(regionStartTime, regionEndTime)
            break  # only one supported

    # now prepare the training target values based on the given annotations;
    # the regions where no anno is given are set to 0.
    # for simplicity we start with a training array of the full size of the table; later we cut it
    # for the region
    tableLen = len(timeNode.get_value())
    training = numpy.full(tableLen, 0)  # set zero to all values as a padding
    #indices = [] # a list of indices which give the points in time that were labelled
    #training = [] # the training values are the tags from the annotations translated via the tagsMap
    for anno in annotations:
        startTime = date2secs(anno.get_child("startTime").get_value())
        endTime = date2secs(anno.get_child("endTime").get_value())
        tag = anno.get_child('tags').get_value()[0]
        if tag in tagsMap:
            # we take annotations that have a tag which was selected in the tagsMap
            if hasRegion == False or (startTime >= regionStartTime and endTime <= regionEndTime):
                # we only take annotations inside the region
                annoIndices = list(timeNode.get_time_indices(startTime, endTime))  # this gives us the indices in the table between start and end time
                # now set the values with fancy indexing
                training[annoIndices] = tagsMap[tag]

    # now cut the region if we have one via fancy indexing
    if hasRegion:
        training = training[regionIndices]

    trainingData = []
    # now grab the values from the columns
    finiteNodes = []
    for node in inputNodes:
        values = numpy.asarray(node.get_value())
        if hasRegion:
            values = values[regionIndices]
        if numpy.isfinite(values).all():
            trainingData.append(list(values))
            finiteNodes.append(node)
            logger.debug("has good finite values" + node.get_name())
        else:
            logger.warning("some infinite values in " + node.get_name())
            # now pad with zero
            values[False == numpy.isfinite(values)] = 0
            trainingData.append(list(values))
            finiteNodes.append(node)
    table = numpy.stack(trainingData, axis=0)

    # now fit the model
    #model = LogisticRegression()
    model = RandomForestClassifier(n_estimators=50, max_features=10)
    scaler = StandardScaler()
    scaler.fit(table.T)
    if scaling:
        data = scaler.transform(table.T)
        model.fit(data, numpy.asarray(training))
    else:
        model.fit(table.T, numpy.asarray(training))

    #
    # scoring is on the "score" region
    #
    if hasRegion:
        tableNode = timeNode.get_table_node()
        tableLen = tableNode.get_table_len()

    logger.info("now score")
    scoreData = []
    # now grab the values from the columns
    for node in finiteNodes:
        if hasRegion:
            values = node.get_value()[regionIndices]
        else:
            values = node.get_value()
        values[False == numpy.isfinite(values)] = 0  # pad inf as zero
        scoreData.append(values)
    scoreTable = numpy.stack(scoreData, axis=0)
    if scaling:
        data = scaler.transform(scoreTable.T)
        score = model.predict(data)
    else:
        score = model.predict(scoreTable.T)

    # now write the score back to the output
    if hasRegion:
        # we need to fill in the remaining entries with inf
        scoreFill = numpy.full(tableLen, numpy.inf)
        scoreFill[regionIndices] = score
        outputNode.set_value(list(scoreFill))
    else:
        outputNode.set_value(list(score))

    # also get the confusion matrix
    matrix = {'00': 0, '01': 0, '11': 0, '10': 0}
    for val, score in zip(training, score):
        entry = str(val) + str(score)
        matrix[entry] = matrix[entry] + 1
    for entry in matrix:
        matrix[entry] = float(matrix[entry]) / float(len(training))
    logger.debug("Confusion Matrix" + str(matrix))
    return True
def import_run(functionNode):
    logger = functionNode.get_logger()
    logger.debug(f"import running..")
    timeStartImport = dt.datetime.now()
    model = functionNode.get_model()

    # --- define vars
    importerNode = functionNode.get_parent()
    progressNode = functionNode.get_child("control.progress")

    # --- [vars] define
    #tablename = functionNode.get_child("tablename").get_value()
    #logger.debug(f"tablename: {tablename}")

    # --- create needed nodes
    #importerNode.create_child('imports', type="folder")
    imports = importerNode.get_child("imports")
    # TODO importsNode.get_child(tablename).delete()
    #importsNode.create_child(tablename, type="folder")
    #table = importsNode.get_child(tablename)
    #table.create_child('variables', type="folder")
    #table.create_child('columns', type="referencer")
    #table.create_child('metadata', type="const")
    #vars = table.get_child("variables")
    #cols = table.get_child("columns")

    # --- read metadata and fields
    metadataRaw = functionNode.get_child("metadata").get_value()
    metadata = json.loads(metadataRaw)
    #table.get_child("metadata").set_value(metadata)
    #fields = metadata["variables"]
    #timefield = int(metadata["timefield"]) - 1
    #timefield = 0
    #filenames = metadata["filenames"]
    #headerexists = metadata["headerexists"]
    #csv_data : any

    # --- load csv data
    # * https://www.shanelynn.ie/python-pandas-read_csv-load-data-from-csv-files/
    # * [ ] optimize speed? https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
    # * [ ] vectorize a loop https://stackoverflow.com/questions/27575854/vectorizing-a-function-in-pandas
    # for filename in filenames:
    #     filepath = 'upload/' + filename
    #     df = pd.read_csv(filepath)
    #     csv_data = [df]
    # csv_data = pd.concat(csv_data, axis=1, join='inner').sort_index()

    class Progress():
        def __init__(self, progressNode):
            self.progressNode = progressNode
            self.lastTime = time.time()

        def set_progress(self, value):
            now = time.time()
            if now > self.lastTime + 1:
                self.progressNode.set_value(self.offset + float(value) / self.div)
                self.lastTime = now

        def set_offset(self, offset):
            self.offset = offset

        def set_divisor(self, div):
            self.div = float(div)

    totalProgress = 0
    progressPerFile = 0.8 / len(metadata["filenames"])  # for each file 2 steps: get the length and do the actual import
    progress = Progress(progressNode)
    notify = {}
    model.disable_observers()
    #findTs = FindExistingVariable(functionNode.get_parent().get_child("imports"),typ="timeseries") #prepare the search
    #findEv = FindExistingVariable(functionNode.get_parent().get_child("imports"),typ="eventseries") #prepare the search
    newVarsCreated = True  # trigger the first search for existing vars
    try:
        for fileNo, fileName in enumerate(metadata["filenames"]):
            if newVarsCreated:
                findTs = FindExistingVariable(functionNode.get_parent().get_child("imports"), typ="timeseries")  # prepare the search
                findEv = FindExistingVariable(functionNode.get_parent().get_child("imports"), typ="eventseries")  # prepare the search
                newVarsCreated = False
            filepath = 'upload/' + fileName
            # first check the file size
            numLines = sum(1 for line in open(filepath))
            progress.set_divisor(numLines / progressPerFile * 2)
            progress.set_offset(fileNo * progressPerFile)
            csv_data = pd.read_csv(filepath, skiprows=lambda x: progress.set_progress(x) and False)  # ,encoding="utf-8")
            progress.set_offset(fileNo * progressPerFile + progressPerFile / 2)
            progress.set_divisor(len(csv_data.columns[1:]) / progressPerFile * 2)
            fileFolderName = fileName.replace('.', '_')

            # --- define time list
            # * select rows and columns from dataframe https://thispointer.com/select-rows-columns-by-name-or-index-in-dataframe-using-loc-iloc-python-pandas/
            timeList = csv_data.iloc[:, 0].to_list()
            epochs = [date2secs(time) for time in timeList]
            #print(epochs)

            # --- import data, set vars and columns
            data = {}
            hasEvents = False
            #metadata["variables"]=["*"] #["Temperature","Humidity"] #hack for testing
            for idx, colName in enumerate(csv_data.columns[1:]):
                print(idx)
                progress.set_progress(idx)
                if not (colName in metadata["variables"] or metadata["variables"] == ["*"] or metadata["variables"] == "*"):
                    continue
                fieldname = str(colName).replace('.', '_')
                # now check if the column is data or event
                data = csv_data.loc[:, fieldname].to_list()
                try:
                    isTs = True
                    values = numpy.asarray(data, dtype=numpy.float64)  # if that works, it's time series data
                except Exception as ex:
                    isTs = False
                if isTs:
                    # now check if that variable already exists, and try to put the data there
                    fieldVar = findTs.find(fieldname)  # find_existing_variable(imports,fieldname)
                    if not fieldVar:
                        # need to create it
                        folder = imports.create_child("variables", type="folder").create_child(fileFolderName, type="folder")  # if it exists, we just get the folder
                        fieldVar = folder.create_child(fieldname, type="timeseries")
                        notify[folder.get_id()] = "children"
                        #fieldVar.set_time_series(values=values, times=epochs)
                        #fieldVar.delete
                        fieldVar.insert_time_series(values=values, times=epochs, allowDuplicates=False)  # avoid double times
                        notify[fieldVar.get_id()] = "value"
                        newVarsCreated = True  # trigger the update of the variable lookup
                    else:
                        logger.debug(f"variable {fieldname} exists already as {fieldVar.get_browse_path()}, we use that")
                        fieldVar.insert_time_series(values=values, times=epochs, allowDuplicates=False)
                        notify[fieldVar.get_id()] = "value"
                    logger.debug(f"import val: {fieldname} as timeseries")
                else:
                    # conversion was not possible, this is an event column
                    # now check if it exists already
                    isNewEventVar = False
                    eventVar = findEv.find(fieldname)  # find_existing_variable(imports,fieldname,typ="eventseries")
                    if not eventVar:
                        eventVar = imports.create_child(fieldname, type="eventseries")
                        notify[eventVar.get_id()] = "children"
                        isNewEventVar = True
                        newVarsCreated = True  # trigger the update of the variable lookup
                    # now build up the series, leave out the nans (which were created by rows like 15.2,,,start,,,4)
                    vals = []
                    tims = []
                    for ev, tim in zip(data, epochs):
                        if type(ev) is str:
                            evStr = str(ev)
                        else:
                            # it still might be a number, so try to convert
                            try:
                                number = numpy.float64(ev)
                                if numpy.isfinite(number):
                                    evStr = str(number)
                                else:
                                    logger.error(f"cant convert {ev}")
                                    continue
                            except:
                                logger.error(f"cant convert {ev}")
                                continue
                        vals.append(evStr.strip(" "))  # remove space at start and end
                        tims.append(tim)
                    if isNewEventVar:
                        eventVar.set_event_series(values=vals, times=tims)
                    else:
                        eventVar.insert_event_series(values=vals, times=tims)
                    notify[eventVar.get_id()] = "value"
                    hasEvents = True
                    logger.debug(f"import val: {fieldname} as eventseries")
    except Exception as ex:
        model.enable_observers()
        logger.error(f" in importer ex {ex}")
        return False

    # look for nodes of type widget and ensure variables can be selected
    progressNode.set_value(0.9)
    model.enable_observers()
    progress.set_divisor(len(notify))
    progress.set_offset(0)
    count = 0
    for k, v in notify.items():
        count = count + 1
        progress.set_progress(count)
        model.notify_observers(k, [v])

    workbench_model = functionNode.get_model()
    widget_nodes: List[Node] = workbench_model.find_nodes("root", matchProperty={"type": "widget"})
    try:
        autoTake = functionNode.get_child("autoTakeEvents").get_value()
    except:
        autoTake = True
    logger.debug(f"Autotake the Event:{autoTake}")

    for widget_node in widget_nodes:
        selectable_variables_referencer: Node = widget_node.get_child("selectableVariables")
        selectable_variables_referencer.add_references(imports.get_child("variables").get_children())

        # now hook the events in
        if hasEvents and autoTake:
            # use the event variable as the new standard event variable
            widget_node.get_child("hasEvents.events").add_references(eventVar, deleteAll=True)
            # prepare the visibleEvents: get the old setting and update if there is anything new
            #visibleEvents = {ev:False for ev in set(vals)}
            visibleEvents = widget_node.get_child("hasEvents.visibleEvents").get_value()
            for event in set(vals):
                if event not in visibleEvents:
                    visibleEvents[event] = False
            widget_node.get_child("hasEvents.visibleEvents").set_value(visibleEvents)
            # prepare the colors
            palette = ["#f41fb3", "#b20083", "#a867dd", "#68b8e7", "#04a75e",
                       "#41ff87", "#618a04", "#eaf4c7", "#ffce18", "#8f4504"]  # from https://loading.io/color/random/
            cols = widget_node.get_child("hasEvents.colors").get_value()
            for idx, ev in enumerate(set(vals)):
                if ev not in cols:
                    cols[ev] = {"color": palette[idx % 10]}
            widget_node.get_child("hasEvents.colors").set_value(cols)
            widget_node.get_child("hasEvents.events").add_references(eventVar, deleteAll=True)
            # turn on events
            widget_node.get_child("hasEvents").set_value(True)
            # make the entry in the visible elements
            visibleElements = widget_node.get_child("visibleElements").get_value()
            if not "events" in visibleElements:
                visibleElements["events"] = False
            widget_node.get_child("visibleElements").set_value(visibleElements)
            # watch the state
            widget_node.get_child("observerVisibleElements.targets").add_references(
                widget_node.get_child("hasEvents.visibleEvents"), allowDuplicates=False)
            # trigger the event
            model = functionNode.get_model()
            model.notify_observers(eventVar.get_id(), ["value"])  # trigger the widget to reload the event data

    logger.debug(f"import complete (seconds: {(dt.datetime.now()-timeStartImport).seconds})")
    return True