def __init__(self, data, startIdxs, endIdxs=None, subseqLength=None,
             labels=None, name=None, id=0):
    """Store a data array together with labeled subsequence boundaries.

    Parameters
    ----------
    data : array-like
        The signal. Stored as ``self.data`` after being passed through
        ``ensure2D``.
    startIdxs : sequence of int
        Start index of each annotated subsequence.
    endIdxs : sequence of int, optional
        End index of each annotated subsequence. When given, the indices
        are validated: there must be as many ends as starts, no start may
        exceed its end, no start may be negative, and no end may exceed
        ``len(data)`` (an end equal to ``len(data)`` is accepted).
    subseqLength : int, optional
        Only consulted when ``endIdxs`` is None; the end indices are then
        ``startIdxs + subseqLength``. No bounds validation is performed in
        this mode.
    labels : sequence, optional
        One label per subsequence. If None or empty, every subsequence is
        given ``DEFAULT_LABEL``.
    name : optional
        Stored unchanged as ``self.name``.
    id : optional
        Stored as ``int(id)``.

    Raises
    ------
    ValueError
        If neither ``endIdxs`` nor a truthy ``subseqLength`` is supplied,
        or if the supplied ``endIdxs`` fail any of the checks above.
    """
    self.data = ensure2D(data)
    # The builtin ``int`` is what the removed ``np.int`` alias pointed to.
    self.startIdxs = np.asarray(startIdxs, dtype=int)
    self.name = name
    self.id = int(id)

    if endIdxs is not None:
        self.endIdxs = np.asarray(endIdxs, dtype=int)
        self.subseqLength = None
    elif subseqLength:
        self.endIdxs = self.startIdxs + subseqLength
        self.subseqLength = subseqLength
    else:
        raise ValueError(
            "Either endIdxs or subseqLength must be specified!")

    if labels is None or len(labels) == 0:
        self.labels = (np.zeros(len(self.startIdxs), dtype=int)
                       + DEFAULT_LABEL)
    else:
        self.labels = np.asarray(labels)

    if endIdxs is None:
        return

    # Validate the converted arrays rather than the raw arguments so that
    # plain Python lists are compared elementwise.
    starts, ends = self.startIdxs, self.endIdxs

    # equal lengths
    nStart, nEnd = len(starts), len(ends)
    if nStart != nEnd:
        raise ValueError("Number of start indices must equal number "
                         "of end indices! {0} != {1}".format(nStart, nEnd))

    # starts before or equal to ends
    # NOTE: test ``violators.size`` rather than ``np.any(violators)``; the
    # latter is False when the only offending position is index 0.
    violators = np.where(starts > ends)[0]
    if violators.size:
        raise ValueError("Some start indices exceed end indices! "
                         "Violators at {}".format(str(violators)))

    # valid indices
    violators = np.where(starts < 0)[0]
    if violators.size:
        raise ValueError("Some start indices < 0! "
                         "Violators at {}".format(str(violators)))

    violators = np.where(ends > len(data))[0]
    if violators.size:
        violatorValues = ends[violators]
        raise ValueError("Some end indices > length of data {}! "
                         "Violators {} at {}".format(
                             len(data), str(violatorValues),
                             str(violators)))
def __init__(self, data, startIdxs, endIdxs=None, subseqLength=None,
             labels=None, name=None, id=0):
    """Store a data array together with labeled subsequence boundaries.

    Parameters
    ----------
    data : array-like
        The signal. Stored as ``self.data`` after being passed through
        ``ensure2D``.
    startIdxs : sequence of int
        Start index of each annotated subsequence.
    endIdxs : sequence of int, optional
        End index of each annotated subsequence. When given, the indices
        are validated: there must be as many ends as starts, no start may
        exceed its end, no start may be negative, and no end may exceed
        ``len(data)`` (an end equal to ``len(data)`` is accepted).
    subseqLength : int, optional
        Only consulted when ``endIdxs`` is None; the end indices are then
        ``startIdxs + subseqLength``. No bounds validation is performed in
        this mode.
    labels : sequence, optional
        One label per subsequence. If None or empty, every subsequence is
        given ``DEFAULT_LABEL``.
    name : optional
        Stored unchanged as ``self.name``.
    id : optional
        Stored as ``int(id)``.

    Raises
    ------
    ValueError
        If neither ``endIdxs`` nor a truthy ``subseqLength`` is supplied,
        or if the supplied ``endIdxs`` fail any of the checks above.
    """
    self.data = ensure2D(data)
    # The builtin ``int`` is what the removed ``np.int`` alias pointed to.
    self.startIdxs = np.asarray(startIdxs, dtype=int)
    self.name = name
    self.id = int(id)

    if endIdxs is not None:
        self.endIdxs = np.asarray(endIdxs, dtype=int)
        self.subseqLength = None
    elif subseqLength:
        self.endIdxs = self.startIdxs + subseqLength
        self.subseqLength = subseqLength
    else:
        raise ValueError(
            "Either endIdxs or subseqLength must be specified!")

    if labels is None or len(labels) == 0:
        self.labels = (np.zeros(len(self.startIdxs), dtype=int)
                       + DEFAULT_LABEL)
    else:
        self.labels = np.asarray(labels)

    if endIdxs is None:
        return

    # Validate the converted arrays rather than the raw arguments so that
    # plain Python lists are compared elementwise.
    starts, ends = self.startIdxs, self.endIdxs

    # equal lengths
    nStart, nEnd = len(starts), len(ends)
    if nStart != nEnd:
        raise ValueError("Number of start indices must equal number "
                         "of end indices! {0} != {1}".format(nStart, nEnd))

    # starts before or equal to ends
    # NOTE: test ``violators.size`` rather than ``np.any(violators)``; the
    # latter is False when the only offending position is index 0.
    violators = np.where(starts > ends)[0]
    if violators.size:
        raise ValueError("Some start indices exceed end indices! "
                         "Violators at {}".format(str(violators)))

    # valid indices
    violators = np.where(starts < 0)[0]
    if violators.size:
        raise ValueError("Some start indices < 0! "
                         "Violators at {}".format(str(violators)))

    violators = np.where(ends > len(data))[0]
    if violators.size:
        violatorValues = ends[violators]
        raise ValueError("Some end indices > length of data {}! "
                         "Violators {} at {}".format(
                             len(data), str(violatorValues),
                             str(violators)))
def sectionsOfDataNearAnnotationsImpure(X, startIdxs, endIdxs, labels,
                                        instancesPerTs=10, shuffle=False,
                                        padLen=0, maxPadJitter=0,
                                        keepLabels=None,
                                        datasetName="Dataset"):
    """Build LabeledTimeSeries objects from the annotated regions of X.

    Ranges of X around the annotations are computed by ``unionOfRanges``,
    grouped by ``formGroupsOfSize``, and the data of each group is stacked
    into a single array. Annotation start/end indices are shifted so that
    they index into the stacked array.

    Parameters
    ----------
    X : array-like
        The full signal; sliced along its first axis.
    startIdxs, endIdxs : sequence of int
        Annotation boundaries in X; must have equal length.
    labels : sequence
        One label per annotation; must have the same length as startIdxs.
    instancesPerTs : int, optional
        Passed to ``formGroupsOfSize`` as ``groupSize``.
    shuffle : bool, optional
        Passed to ``formGroupsOfSize``.
    padLen : int, optional
        Passed to ``unionOfRanges``.
    maxPadJitter : int, optional
        If > 0, each range may be extended at either end by a random
        amount of at most this many samples, limited by the gap to the
        neighbouring annotation.
    keepLabels : container, optional
        If truthy, annotations whose label is not in it are discarded.
    datasetName : str, optional
        Prefix for the name of each resulting time series.

    Returns
    -------
    list
        One LabeledTimeSeries per group that contains at least two
        annotations.
    """
    assert(len(startIdxs) == len(endIdxs))
    assert(len(startIdxs) == len(labels))

    startIdxs = np.asarray(startIdxs)
    endIdxs = np.asarray(endIdxs)
    # labels is fancy-indexed with integer arrays below, which a plain
    # list does not support.
    labels = np.asarray(labels)

    # filter out labels we don't care about
    if keepLabels:
        keepIdxs = np.array(
            [i for i, lbl in enumerate(labels) if lbl in keepLabels],
            dtype=int)
        startIdxs = startIdxs[keepIdxs]
        endIdxs = endIdxs[keepIdxs]
        labels = labels[keepIdxs]

    # find sections of nearby annotations in the data and group these
    # sections together; we'll concat these groups together to form a ts
    combinedRanges = unionOfRanges(startIdxs, endIdxs, len(X),
                                   padLen=padLen)
    rangeGroups = formGroupsOfSize(combinedRanges,
                                   groupSize=instancesPerTs,
                                   shuffle=shuffle)

    # now the hard part--create a LabeledTimeSeries from each of these
    # sections of signal; we have to not only find which annotations
    # fall within each range, but also adjust the start and end indices
    # so that they're correct in the new ts formed by concatenating the
    # data in each range together
    tsList = []
    for groupNum, ranges in enumerate(rangeGroups):
        ranges = sorted(ranges, key=lambda r: r[0])  # sort by range start
        dataLenSoFar = 0
        dataInRanges = []
        startsInRanges = []
        endsInRanges = []
        labelsInRanges = []
        for rang in ranges:
            start, end = rang
            firstInRange, lastInRange = whereStartEndPairsInRange(
                startIdxs, endIdxs, start, end)
            idxsInRange = np.arange(firstInRange, lastInRange)

            # move the start and end indices around a bit so that ranges
            # aren't spaced exactly uniformly, which can lead to an
            # artificial semblance of regularity
            if maxPadJitter > 0:
                if firstInRange > 0:
                    firstStartIdx = startIdxs[firstInRange]
                    prevEndIdx = endIdxs[firstInRange - 1]
                    gap = firstStartIdx - prevEndIdx
                    if gap > 1:
                        gap = min(gap - 1, maxPadJitter)
                        offset = int(np.random.rand() * gap)
                        start -= offset
                if lastInRange < (len(startIdxs) - 1):
                    # last idx not inclusive
                    lastEndIdx = endIdxs[lastInRange - 1]
                    nextStartIdx = startIdxs[lastInRange]
                    gap = nextStartIdx - lastEndIdx
                    if gap > 1:
                        gap = min(gap - 1, maxPadJitter)
                        offset = int(np.random.rand() * gap)
                        end += offset

            # re-base the annotation indices onto the concatenated data
            starts = startIdxs[idxsInRange] - start + dataLenSoFar
            ends = endIdxs[idxsInRange] - start + dataLenSoFar
            lbls = labels[idxsInRange]
            startsInRanges += list(starts)
            endsInRanges += list(ends)
            labelsInRanges += list(lbls)

            data = ensure2D(X[start:end])
            dataInRanges.append(data)
            dataLenSoFar += len(data)

        if len(labelsInRanges) < 2:
            # need more than one pattern instance per ts
            continue

        groupData = np.vstack(dataInRanges)
        # builtin int/object replace the removed np.int/np.object aliases
        groupStarts = np.array(startsInRanges, dtype=int)
        groupEnds = np.array(endsInRanges, dtype=int)
        groupLabels = np.array(labelsInRanges, dtype=object)

        name = "{}-group{}".format(datasetName, groupNum)
        # NOTE(review): on Python 3, hash() of a str is salted per process
        # unless PYTHONHASHSEED is fixed, so this id is not stable across
        # runs -- confirm whether callers rely on it being reproducible.
        uniqId = hash(name)
        ts = LabeledTimeSeries(groupData, startIdxs=groupStarts,
                               endIdxs=groupEnds, labels=groupLabels,
                               name=name, id=uniqId)
        tsList.append(ts)

    return tsList