def fitWindowing(self, samples, windowSize, wordLength, symbols, normMean, lowerBounding):
    """Split every sample into disjoint windows of ``windowSize`` and fit on those.

    ``samples`` is a dict keyed 0..n-1 with a "Samples" count entry; the
    windowed sub-sequences are re-packed into the same dict layout before
    delegating to ``fitTransform``.
    """
    self.transformation = MFT(windowSize, normMean, lowerBounding)
    windowed = {}
    count = 0
    for sample_idx in range(samples["Samples"]):
        # each sample contributes zero or more disjoint sub-sequences
        for subsequence in getDisjointSequences(samples[sample_idx], windowSize, normMean):
            windowed[count] = subsequence
            count += 1
    windowed["Samples"] = count
    self.fitTransform(windowed, wordLength, symbols, normMean)
def fitTransformDouble(self, samples, wordLength, symbols, normMean):
    """Compute the Fourier approximations of ``samples`` and learn the
    quantization bin boundaries according to ``self.HistogramType``.

    Returns the per-sample approximations produced by ``fillOrderline``.
    """
    if not self.initialized:
        self.initialize(wordLength, symbols, normMean)
    if self.transformation is None:
        # default transform length = full series length of the first sample
        self.transformation = MFT(len(samples[0].data), normMean,
                                  self.lowerBounding, self.MUSE_Bool)

    transformed = self.fillOrderline(samples, wordLength)

    if self.HistogramType == "EQUI_DEPTH":
        self.divideEquiDepthHistogram()
    elif self.HistogramType == "EQUI_FREQUENCY":
        self.divideEquiWidthHistogram()
    elif self.HistogramType == "INFORMATION_GAIN":
        self.divideHistogramInformationGain()

    return transformed
def fitWindowing(self, samples, Labels, windowSize, wordLength, symbols, normMean, lowerBounding):
    """Split every row of the DataFrame ``samples`` into disjoint windows,
    replicate the matching label for each window, and fit on the result.

    Fixes: the windowed matrix was built by allocating a zero DataFrame and
    assigning every cell through ``iloc`` (O(rows*cols) scalar writes, and an
    IndexError on ``sa[0]`` when no windows are produced); it is now built in
    one DataFrame constructor call with the same float64 dtype and shape.
    """
    self.transformation = MFT(windowSize, normMean, lowerBounding)
    labels = []
    sa = []
    for i in range(samples.shape[0]):
        for seq in getDisjointSequences(samples.iloc[i, :].tolist(), windowSize, normMean):
            sa.append(seq)
            labels.append(Labels[i])
    # Single-shot construction; dtype matches the previous np.zeros(float64) buffer.
    SA = pd.DataFrame(sa, dtype=np.float64)
    if self.SUP:
        self.fitTransformSupervised(SA, labels, wordLength, symbols, normMean)
    else:
        self.fitTransform(SA, labels, wordLength, symbols, normMean)
class SFA():
    """Symbolic Fourier Approximation (SFA).

    Discretises real-valued time series into symbolic words: a truncated
    Fourier transform (project class ``MFT``) produces ``wordLength``
    coefficients, and per-coefficient histogram binning maps each coefficient
    to one of ``symbols`` letters.  Supports supervised coefficient selection
    (ANOVA F-scores) when ``SUP`` is set.

    Relies on project-level helpers ``MFT``, ``getDisjointSequences`` and
    ``cSFA`` that are defined elsewhere in this package.
    """

    def __init__(self, histogram_type, SUP=False, LB=True, MB=False):
        # histogram_type: "EQUI_DEPTH", "EQUI_FREQUENCY" or "INFORMATION_GAIN".
        # SUP: use supervised coefficient selection / quantization.
        # LB:  lower-bounding flag forwarded to MFT.
        # MB:  MUSE flag forwarded to MFT.
        self.initialized = False
        self.HistogramType = histogram_type
        self.SUP = SUP
        self.lowerBounding = LB
        self.MUSE_Bool = MB

    def initialize(self, wordLength, symbols, normMean):
        """Allocate quantization state: ``bins`` has one row per coefficient
        and one column per symbol; boundaries start at +inf except column 0,
        which is the implicit -inf lower bound."""
        self.initialized = True
        self.wordLength = wordLength
        self.maxWordLength = wordLength
        self.symbols = symbols
        self.normMean = normMean
        self.alphabetSize = symbols
        self.transformation = None
        self.orderLine = []
        self.bins = np.zeros((wordLength, self.alphabetSize)) + math.inf
        self.bins[:, 0] = -math.inf

    def printBins(self):
        """Dump the learned bin boundaries to stdout."""
        print(self.bins)

    def mv_fitWindowing(self, samples, windowSize, wordLength, symbols,
                        normMean, lowerBounding):
        """Multivariate fit: flatten every dimension of every sample into
        disjoint windows, then delegate to :meth:`fitWindowing`."""
        sa = {}
        index = 0
        for i in range(samples["Samples"]):
            for k in range(len(samples[i].keys())):
                new_list = getDisjointSequences(samples[i][k], windowSize, normMean)
                for j in range(len(new_list)):
                    sa[index] = new_list[j]
                    index += 1
        sa["Samples"] = index
        self.fitWindowing(sa, windowSize, wordLength, symbols, normMean, lowerBounding)

    def fitWindowing(self, samples, windowSize, wordLength, symbols,
                     normMean, lowerBounding):
        """Split every sample into disjoint windows of ``windowSize`` and fit
        (supervised or unsupervised) on the windowed collection."""
        self.transformation = MFT(windowSize, normMean, lowerBounding, self.MUSE_Bool)
        sa = {}
        index = 0
        for i in range(samples["Samples"]):
            new_list = getDisjointSequences(samples[i], windowSize, normMean)
            for j in range(len(new_list)):
                sa[index] = new_list[j]
                index += 1
        sa["Samples"] = index
        if self.SUP:
            self.fitTransformSupervised(sa, wordLength, symbols, normMean)
        else:
            self.fitTransform(sa, wordLength, symbols, normMean)

    def fitTransform(self, samples, wordLength, symbols, normMean):
        """Learn bins on ``samples`` and return their SFA words."""
        return self.transform(
            samples,
            self.fitTransformDouble(samples, wordLength, symbols, normMean))

    def transform(self, samples, approximate):
        """Quantize every sample; ``approximate[i]`` holds precomputed Fourier
        coefficients (or the sentinel "null" to recompute them)."""
        transformed = []
        for i in range(samples["Samples"]):
            transformed.append(self.transform2(samples[i].data, approximate[i]))
        return transformed

    def transform2(self, series, one_approx):
        """Quantize one series, computing its approximation on demand."""
        # "null" is the project-wide sentinel for "not yet approximated".
        if one_approx == "null":
            one_approx = self.transformation.transform(series, self.maxWordLength)
        if self.SUP:
            return self.quantizationSupervised(one_approx)
        else:
            return self.quantization(one_approx)

    def transformWindowing(self, series):
        """Sliding-window transform: one SFA word per window position."""
        mft = self.transformation.transformWindowing(series, self.maxWordLength)
        words = []
        for i in range(len(mft)):
            if self.SUP:
                words.append(self.quantizationSupervised(mft[i]))
            else:
                words.append(self.quantization(mft[i]))
        return words

    def transformWindowingInt(self, series, wordLength):
        """Sliding-window transform with each word packed into an integer."""
        words = self.transformWindowing(series)
        intWords = []
        for i in range(len(words)):
            intWords.append(
                self.createWord(words[i], wordLength, self.int2byte(self.alphabetSize)))
        return intWords

    def fitTransformDouble(self, samples, wordLength, symbols, normMean):
        """Compute Fourier approximations for all samples and learn the bin
        boundaries according to ``self.HistogramType``."""
        if self.initialized == False:
            self.initialize(wordLength, symbols, normMean)
        if self.transformation == None:
            self.transformation = MFT(len(samples[0].data), normMean,
                                      self.lowerBounding, self.MUSE_Bool)
        transformedSamples = self.fillOrderline(samples, wordLength)
        if self.HistogramType == "EQUI_DEPTH":
            self.divideEquiDepthHistogram()
        elif self.HistogramType == "EQUI_FREQUENCY":
            self.divideEquiWidthHistogram()
        elif self.HistogramType == "INFORMATION_GAIN":
            self.divideHistogramInformationGain()
        return transformedSamples

    def fillOrderline(self, samples, wordLength):
        """Build ``orderLine[j]``: the (value, label) pairs of coefficient j
        over all samples, sorted ascending by value (selection sort preserving
        the original tie-breaking on labels).  Returns the raw approximations."""
        self.orderLine = [[None for _ in range(samples["Samples"])]
                          for _ in range(wordLength)]
        transformedSamples = []
        for i in range(samples["Samples"]):
            transformedSamples_small = self.transformation.transform(
                samples[i].data, wordLength)
            transformedSamples.append(transformedSamples_small)
            for j in range(len(transformedSamples_small)):
                # round/format/+0 normalises -0.0 to 0.0
                value = float(
                    format(round(transformedSamples_small[j], 2), '.2f')) + 0
                obj = (value, samples[i].label)
                self.orderLine[j][i] = obj
        for l, pairs in enumerate(self.orderLine):
            del_list = pairs
            new_list = []
            while len(del_list) != 0:
                current_min_value = math.inf
                current_min_location = -1
                label = -math.inf
                for j in range(len(del_list)):
                    if (del_list[j][0] < current_min_value) | (
                            (del_list[j][0] == current_min_value)
                            & (del_list[j][1] < label)):
                        current_min_value = del_list[j][0]
                        label = del_list[j][1]
                        current_min_location = j
                new_list.append(del_list[current_min_location])
                del del_list[current_min_location]
            self.orderLine[l] = new_list
        return transformedSamples

    def quantization(self, one_approx):
        """Map one coefficient vector to a word via the C implementation."""
        return cSFA.quantization(self.bins, one_approx)

    def divideEquiDepthHistogram(self):
        """Place boundaries so every bin holds roughly the same number of
        points.  (Fix: ``depth`` was computed once unguarded before the
        try/except fallback, defeating the fallback; the duplicate is gone.)"""
        for i in range(self.bins.shape[0]):
            try:
                depth = len(self.orderLine[i]) / self.alphabetSize
            except:
                depth = 0
            pos = 0
            count = 0
            try:
                for j in range(len(self.orderLine[i])):
                    count += 1
                    condition1 = count > math.ceil(depth * (pos))
                    condition2 = pos == 1
                    condition3 = self.bins[i, pos - 1] != self.orderLine[i][j][0]
                    if (condition1) & (condition2 | condition3):
                        self.bins[i, pos] = round(self.orderLine[i][j][0], 2)
                        pos += 1
            except:
                # pos may run past the last column; stop filling this row.
                pass
        self.bins[:, 0] = -math.inf

    def divideEquiWidthHistogram(self):
        """Place ``alphabetSize - 1`` equally spaced boundaries between the
        smallest and largest coefficient value of each row.

        Bug fix: boundaries now go into columns 1..alphabetSize-1 (column 0 is
        reserved for -inf), matching the equi-depth/IG histograms and the
        sibling implementation; previously they were written to columns
        0..alphabetSize-2 and the first boundary was then clobbered by the
        -inf reset below.
        """
        i = 0
        for element in self.orderLine:
            if len(element) != 0:
                first = element[0][0]
                last = element[-1][0]
                intervalWidth = (last - first) / self.alphabetSize
                for c in range(self.alphabetSize - 1):
                    self.bins[i, c + 1] = intervalWidth * (c + 1) + first
            i += 1
        self.bins[:, 0] = -math.inf

    def divideHistogramInformationGain(self):
        """Choose boundaries that maximise information gain on the labels."""
        for i, element in enumerate(self.orderLine):
            self.splitPoints = []
            self.findBestSplit(element, 0, len(element), self.alphabetSize)
            self.splitPoints.sort()
            for j in range(len(self.splitPoints)):
                self.bins[i, j + 1] = element[self.splitPoints[j] + 1][0]

    def findBestSplit(self, element, start, end, remainingSymbols):
        """Recursively pick the split position in [start, end) with the best
        information gain, then recurse into both halves while symbols remain."""
        bestGain = -1
        bestPos = -1
        total = end - start
        self.cOut = {}
        self.cIn = {}
        for pos in range(start, end):
            label = element[pos][1]
            self.cOut[label] = self.cOut[
                label] + 1. if label in self.cOut.keys() else 1.
        class_entropy = self.entropy(self.cOut, total)
        i = start
        lastLabel = element[i][1]
        i += self.moveElement(lastLabel)
        for split in range(start + 1, end - 1):
            label = element[i][1]
            i += self.moveElement(label)
            # only label changes can improve the gain
            if label != lastLabel:
                gain = self.calculateInformationGain(class_entropy, i, total)
                if gain >= bestGain:
                    bestGain = gain
                    bestPos = split
            lastLabel = label
        if bestPos > -1:
            self.splitPoints.append(bestPos)
            remainingSymbols = remainingSymbols / 2
            if remainingSymbols > 1:
                if (bestPos - start > 2) & (end - bestPos > 2):
                    self.findBestSplit(element, start, bestPos, remainingSymbols)
                    self.findBestSplit(element, bestPos, end, remainingSymbols)
                elif end - bestPos > 4:
                    self.findBestSplit(element, bestPos,
                                       int((end - bestPos) / 2), remainingSymbols)
                    self.findBestSplit(element, int((end - bestPos) / 2),
                                       end, remainingSymbols)
                elif bestPos - start > 4:
                    self.findBestSplit(element, start,
                                       int((bestPos - start) / 2), remainingSymbols)
                    self.findBestSplit(element, int((bestPos - start) / 2),
                                       end, remainingSymbols)

    def moveElement(self, label):
        """Move one element across the candidate split (cOut -> cIn);
        returns the step size 1 for the caller's index bookkeeping."""
        self.cIn[label] = self.cIn[label] + 1. if label in self.cIn.keys(
        ) else 1.
        self.cOut[label] = self.cOut[label] - 1. if label in self.cOut.keys(
        ) else -1.
        return 1

    def entropy(self, freq, total):
        """Shannon entropy (base 2) of the label frequency dict ``freq``;
        +inf when ``total`` is zero (empty partition)."""
        e = 0
        if total != 0:
            log2 = 1.0 / math.log(2)
            for k in freq.keys():
                p = freq[k] / total
                if p > 0:
                    e += -1 * p * math.log(p) * log2
        else:
            e = math.inf
        return e

    def calculateInformationGain(self, class_entropy, total_c_in, total):
        """Gain of splitting ``total`` elements into cIn/cOut partitions."""
        total_c_out = total - total_c_in
        return class_entropy - total_c_in / total * self.entropy(
            self.cIn, total_c_in) - total_c_out / total * self.entropy(
                self.cOut, total_c_out)

    def createWord(self, numbers, maxF, bits):
        """Pack up to ``maxF`` symbols of ``bits`` bits each into one int."""
        shortsPerLong = int(round(60 / bits))
        to = min(len(numbers), maxF)
        b = int(0)
        s = 0
        shiftOffset = 1
        for i in range(s, (min(to, shortsPerLong + s))):
            shift = 1
            for j in range(bits):
                if (numbers[i] & shift) != 0:
                    b |= shiftOffset
                shiftOffset <<= 1
                shift <<= 1
        return b

    def int2byte(self, number):
        """floor(log2(number)) via successive halving of bit ranges."""
        log = 0
        if (number & 0xffff0000) != 0:
            number >>= 16
            log = 16
        if number >= 256:
            number >>= 8
            log += 8
        if number >= 16:
            number >>= 4
            log += 4
        if number >= 4:
            number >>= 2
            log += 2
        return log + (number >> 1)

    ## Supervised

    def fitTransformSupervised(self, samples, wordLength, symbols, normMean):
        """Fit using the full series length, rank coefficients by ANOVA
        F-score, keep the ``wordLength`` best, and return the words."""
        length = len(samples[0].data)
        transformedSignal = self.fitTransformDouble(samples, length, symbols, normMean)
        best = self.calcBestCoefficients(samples, transformedSignal)
        self.bestValues = [0 for i in range(min(len(best), wordLength))]
        self.maxWordLength = 0
        for i in range(len(self.bestValues)):
            self.bestValues[i] = best[i][0]
            self.maxWordLength = max(best[i][0] + 1, self.maxWordLength)
        # keep the word length even (real/imaginary coefficient pairs)
        self.maxWordLength += self.maxWordLength % 2
        return self.transform(samples, transformedSignal)

    def calcBestCoefficients(self, samples, transformedSignal):
        """Return [index, F-score] pairs sorted by descending F-score."""
        classes = {}
        for i in range(samples["Samples"]):
            if samples[i].label in classes.keys():
                classes[samples[i].label].append(transformedSignal[i])
            else:
                classes[samples[i].label] = [transformedSignal[i]]
        nSamples = len(transformedSignal)
        nClasses = len(classes.keys())
        # NOTE(review): length is taken from element 1, not 0 — presumably all
        # rows have equal length; confirm intent.
        length = len(transformedSignal[1])
        f = self.getFoneway(length, classes, nSamples, nClasses)
        f_sorted = sorted(f, reverse=True)
        best = []
        inf_index = 0
        for value in f_sorted:
            if value == math.inf:
                # list.index always finds the first inf; offset to walk duplicates
                index = f.index(value) + inf_index
                inf_index += 1
            else:
                index = f.index(value)
            best.append([index, value])
        return best

    def getFoneway(self, length, classes, nSamples, nClasses):
        """One-way ANOVA F-ratio per coefficient (mirrors scipy.stats.f_oneway)."""
        ss_alldata = [0. for i in range(length)]
        sums_args = {}
        keys_class = list(classes.keys())
        for key in keys_class:
            allTs = classes[key]
            sums = [0. for i in range(len(ss_alldata))]
            sums_args[key] = sums
            for ts in allTs:
                for i in range(len(ts)):
                    ss_alldata[i] += ts[i] * ts[i]
                    sums[i] += ts[i]
        square_of_sums_alldata = [0. for i in range(len(ss_alldata))]
        square_of_sums_args = {}
        for key in keys_class:
            sums = sums_args[key]
            for i in range(len(sums)):
                square_of_sums_alldata[i] += sums[i]
            squares = [0. for i in range(len(sums))]
            square_of_sums_args[key] = squares
            for i in range(len(sums)):
                squares[i] += sums[i] * sums[i]
        for i in range(len(square_of_sums_alldata)):
            square_of_sums_alldata[i] *= square_of_sums_alldata[i]
        sstot = [0. for i in range(len(ss_alldata))]
        for i in range(len(sstot)):
            sstot[i] = ss_alldata[i] - square_of_sums_alldata[i] / nSamples
        ssbn = [0. for i in range(len(ss_alldata))]  ## sum of squares between
        sswn = [0. for i in range(len(ss_alldata))]  ## sum of squares within
        for key in keys_class:
            sums = square_of_sums_args[key]
            n_samples_per_class = len(classes[key])
            for i in range(len(sums)):
                ssbn[i] += sums[i] / n_samples_per_class
        for i in range(len(square_of_sums_alldata)):
            ssbn[i] += -square_of_sums_alldata[i] / nSamples
        dfbn = nClasses - 1  ## degrees of freedom between
        dfwn = nSamples - nClasses  ## degrees of freedom within
        msb = [0. for i in range(len(ss_alldata))]  ## mean square between
        msw = [0. for i in range(len(ss_alldata))]  ## mean square within
        f = [0. for i in range(len(ss_alldata))]  ## f-ratio
        for i in range(len(sswn)):
            sswn[i] = sstot[i] - ssbn[i]
            msb[i] = ssbn[i] / dfbn
            msw[i] = sswn[i] / dfwn
            f[i] = msb[i] / msw[i] if msw[i] != 0. else math.inf
        return f

    def quantizationSupervised(self, one_approx):
        """Quantize only the selected best coefficients of ``one_approx``."""
        signal = [0 for _ in range(min(len(one_approx), len(self.bestValues)))]
        for a in range(len(signal)):
            i = self.bestValues[a]
            b = 0
            for beta in range(self.bins.shape[1]):
                if one_approx[i] < self.bins[i, beta]:
                    break
                else:
                    b += 1
            signal[a] = b - 1
        return signal
class SFA():
    """Symbolic Fourier Approximation (SFA), pandas-bins variant with
    optional logging.

    Discretises real-valued time series into symbolic words: a truncated
    Fourier transform (project class ``MFT``) produces ``wordLength``
    coefficients, and per-coefficient histogram binning maps each coefficient
    to one of ``symbols`` letters.

    Relies on project-level helpers ``MFT``, ``getDisjointSequences``,
    ``sfaToWord`` and ``sfaToWordList`` defined elsewhere in this package.
    """

    def __init__(self, histogram_type, LB=True, logger=None, mftUseMaxOrMin=False):
        # histogram_type: "EQUI_DEPTH", "EQUI_FREQUENCY" or "INFORMATION_GAIN".
        # LB: lower-bounding flag forwarded to MFT.
        # Fix: logger defaults to None, but logger.Log was called
        # unconditionally, so the default crashed with AttributeError; the
        # call is now guarded.
        self.initialized = False
        self.HistogramType = histogram_type
        self.lowerBounding = LB
        self.mftUseMaxOrMin = mftUseMaxOrMin
        if logger is not None:
            logger.Log(self.__dict__, level=0)
        self.logger = logger

    def initialize(self, wordLength, symbols, normMean):
        """Allocate quantization state: ``bins`` is a DataFrame with one row
        per coefficient and one column per symbol; boundaries start at +inf
        except column 0, the implicit -inf lower bound."""
        self.initialized = True
        self.wordLength = wordLength
        self.maxWordLength = wordLength
        self.symbols = symbols
        self.normMean = normMean
        self.alphabetSize = symbols
        self.transformation = None
        self.orderLine = []
        self.bins = pd.DataFrame(np.zeros(
            (wordLength, self.alphabetSize))) + math.inf
        self.bins.iloc[:, 0] = -math.inf

    def printBins(self, logger=None):
        """Dump the learned bin boundaries to the logger or stdout."""
        if logger != None:
            logger.Log(str(self.bins), level=0)
        else:
            print(self.bins)

    def mv_fitWindowing(self, samples, windowSize, wordLength, symbols,
                        normMean, lowerBounding, dim):
        """Multivariate fit on one dimension ``dim``: window every sample's
        ``dim``-th series and delegate to :meth:`fitWindowing`."""
        sa = {}
        index = 0
        for i in range(samples["Samples"]):
            new_list = getDisjointSequences(samples[i][dim], windowSize, normMean)
            for j in range(len(new_list)):
                sa[index] = new_list[j]
                index += 1
        sa["Samples"] = index
        self.fitWindowing(sa, windowSize, wordLength, symbols, normMean, lowerBounding)

    def fitWindowing(self, samples, windowSize, wordLength, symbols,
                     normMean, lowerBounding):
        """Split every sample into disjoint windows of ``windowSize`` and fit
        on the windowed collection."""
        self.transformation = MFT(windowSize, normMean, lowerBounding)
        sa = {}
        index = 0
        for i in range(samples["Samples"]):
            new_list = getDisjointSequences(samples[i], windowSize, normMean)
            for j in range(len(new_list)):
                sa[index] = new_list[j]
                index += 1
        sa["Samples"] = index
        self.fitTransform(sa, wordLength, symbols, normMean)

    def fitTransform(self, samples, wordLength, symbols, normMean):
        """Learn bins on ``samples`` and return their SFA words."""
        return self.transform(
            samples,
            self.fitTransformDouble(samples, wordLength, symbols, normMean))

    def transform(self, samples, approximate):
        """Quantize every sample; ``approximate[i]`` holds precomputed Fourier
        coefficients (or the sentinel "null" to recompute them)."""
        transformed = []
        for i in range(samples["Samples"]):
            transformed.append(self.transform2(samples[i].data, approximate[i]))
        return transformed

    def transform2(self, series, one_approx, str_return=False):
        """Quantize one series; optionally render the word as a string."""
        # "null" is the project-wide sentinel for "not yet approximated".
        if one_approx == "null":
            one_approx = self.transformation.transform(series, self.maxWordLength)
        if str_return:
            return sfaToWord(self.quantization(one_approx))
        else:
            return self.quantization(one_approx)

    def transformWindowing(self, series, str_return=False):
        """Sliding-window transform: one SFA word per window position."""
        mft = self.transformation.transformWindowing(series, self.maxWordLength)
        words = []
        for i in range(len(mft)):
            words.append(self.quantization(mft[i]))
        if str_return:
            return sfaToWordList(words)
        else:
            return words

    def transformWindowingInt(self, series, wordLength):
        """Sliding-window transform with each word packed into an integer."""
        words = self.transformWindowing(series)
        intWords = []
        for i in range(len(words)):
            intWords.append(
                self.createWord(words[i], wordLength, self.int2byte(self.alphabetSize)))
        return intWords

    def fitTransformDouble(self, samples, wordLength, symbols, normMean):
        """Compute Fourier approximations for all samples, learn the bin
        boundaries, and release the order line to free memory."""
        if self.initialized == False:
            self.initialize(wordLength, symbols, normMean)
        if self.transformation == None:
            self.transformation = MFT(len(samples[0].data), normMean,
                                      self.lowerBounding)
        transformedSamples = self.fillOrderline(samples, wordLength)
        if self.HistogramType == "EQUI_DEPTH":
            self.divideEquiDepthHistogram()
        elif self.HistogramType == "EQUI_FREQUENCY":
            self.divideEquiWidthHistogram()
        elif self.HistogramType == "INFORMATION_GAIN":
            self.divideHistogramInformationGain()
        # order line is only needed while learning the bins
        self.orderLine = []
        return transformedSamples

    def fillOrderline(self, samples, wordLength):
        """Build ``orderLine[j]``: the (value, label) pairs of coefficient j
        over all samples, sorted ascending by value (selection sort preserving
        the original tie-breaking on labels).  Returns the raw approximations."""
        self.orderLine = [[None for _ in range(samples["Samples"])]
                          for _ in range(wordLength)]
        transformedSamples = []
        for i in range(samples["Samples"]):
            transformedSamples_small = self.transformation.transform(
                samples[i].data, wordLength)
            transformedSamples.append(transformedSamples_small)
            for j in range(len(transformedSamples_small)):
                # round/format/+0 normalises -0.0 to 0.0
                value = float(
                    format(round(transformedSamples_small[j], 2), '.2f')) + 0
                obj = (value, samples[i].label)
                self.orderLine[j][i] = obj
        for l, pairs in enumerate(self.orderLine):
            del_list = pairs
            new_list = []
            while len(del_list) != 0:
                current_min_value = math.inf
                current_min_location = -1
                label = -math.inf
                for j in range(len(del_list)):
                    if (del_list[j][0] < current_min_value) | (
                            (del_list[j][0] == current_min_value)
                            & (del_list[j][1] < label)):
                        current_min_value = del_list[j][0]
                        label = del_list[j][1]
                        current_min_location = j
                new_list.append(del_list[current_min_location])
                del del_list[current_min_location]
            self.orderLine[l] = new_list
        return transformedSamples

    def quantization(self, one_approx):
        """Map one coefficient vector to a word: letter = index of the last
        bin boundary the value is >= (column 0 is -inf)."""
        word = [0 for _ in range(len(one_approx))]
        for i, v in enumerate(one_approx):
            c = 0
            for _ in range(self.bins.shape[1]):
                if v < self.bins.iloc[i, c]:
                    break
                c += 1
            word[i] = c - 1
        return word

    def divideEquiDepthHistogram(self):
        """Place boundaries so every bin holds roughly the same number of
        points.  (Fix: ``depth`` was computed once unguarded before the
        try/except fallback, defeating the fallback; the duplicate is gone.)"""
        for i in range(self.bins.shape[0]):
            try:
                depth = len(self.orderLine[i]) / self.alphabetSize
            except:
                depth = 0
            pos = 0
            count = 0
            try:
                for j in range(len(self.orderLine[i])):
                    count += 1
                    condition1 = count > math.ceil(depth * (pos))
                    condition2 = pos == 1
                    condition3 = self.bins.iloc[i, pos - 1] != self.orderLine[i][j][0]
                    if (condition1) & (condition2 | condition3):
                        self.bins.iloc[i, pos] = round(self.orderLine[i][j][0], 2)
                        pos += 1
            except:
                # pos may run past the last column; stop filling this row.
                pass
        self.bins.iloc[:, 0] = -math.inf

    def divideEquiWidthHistogram(self):
        """Place ``alphabetSize - 1`` equally spaced boundaries between the
        smallest and largest coefficient value of each row."""
        i = 0
        for element in self.orderLine:
            if len(element) != 0:
                first = element[0][0]
                last = element[-1][0]
                intervalWidth = (last - first) / self.alphabetSize
                for c in range(self.alphabetSize - 1):
                    self.bins.iloc[i, c + 1] = intervalWidth * (c + 1) + first
            i += 1
        self.bins.iloc[:, 0] = -math.inf

    def divideHistogramInformationGain(self):
        """Choose boundaries that maximise information gain on the labels."""
        for i, element in enumerate(self.orderLine):
            self.splitPoints = []
            self.findBestSplit(element, 0, len(element), self.alphabetSize)
            self.splitPoints.sort()
            for j in range(len(self.splitPoints)):
                self.bins.iloc[i, j + 1] = element[self.splitPoints[j] + 1][0]

    def findBestSplit(self, element, start, end, remainingSymbols):
        """Recursively pick the split position in [start, end) with the best
        information gain, then recurse into both halves while symbols remain."""
        bestGain = -1
        bestPos = -1
        total = end - start
        self.cOut = {}
        self.cIn = {}
        for pos in range(start, end):
            label = element[pos][1]
            self.cOut[label] = self.cOut[
                label] + 1. if label in self.cOut.keys() else 1.
        class_entropy = self.entropy(self.cOut, total)
        i = start
        lastLabel = element[i][1]
        i += self.moveElement(lastLabel)
        for split in range(start + 1, end - 1):
            label = element[i][1]
            i += self.moveElement(label)
            # only label changes can improve the gain
            if label != lastLabel:
                gain = self.calculateInformationGain(class_entropy, i, total)
                if gain >= bestGain:
                    bestGain = gain
                    bestPos = split
            lastLabel = label
        if bestPos > -1:
            self.splitPoints.append(bestPos)
            remainingSymbols = remainingSymbols / 2
            if remainingSymbols > 1:
                if (bestPos - start > 2) & (end - bestPos > 2):
                    self.findBestSplit(element, start, bestPos, remainingSymbols)
                    self.findBestSplit(element, bestPos, end, remainingSymbols)
                elif end - bestPos > 4:
                    self.findBestSplit(element, bestPos,
                                       int((end - bestPos) / 2), remainingSymbols)
                    self.findBestSplit(element, int((end - bestPos) / 2),
                                       end, remainingSymbols)
                elif bestPos - start > 4:
                    self.findBestSplit(element, start,
                                       int((bestPos - start) / 2), remainingSymbols)
                    self.findBestSplit(element, int((bestPos - start) / 2),
                                       end, remainingSymbols)

    def moveElement(self, label):
        """Move one element across the candidate split (cOut -> cIn);
        returns the step size 1 for the caller's index bookkeeping."""
        self.cIn[label] = self.cIn[label] + 1. if label in self.cIn.keys(
        ) else 1.
        self.cOut[label] = self.cOut[label] - 1. if label in self.cOut.keys(
        ) else -1.
        return 1

    def entropy(self, freq, total):
        """Shannon entropy (base 2) of the label frequency dict ``freq``;
        +inf when ``total`` is zero (empty partition)."""
        e = 0
        if total != 0:
            log2 = 1.0 / math.log(2)
            for k in freq.keys():
                p = freq[k] / total
                if p > 0:
                    e += -1 * p * math.log(p) * log2
        else:
            e = math.inf
        return e

    def calculateInformationGain(self, class_entropy, total_c_in, total):
        """Gain of splitting ``total`` elements into cIn/cOut partitions."""
        total_c_out = total - total_c_in
        return class_entropy - total_c_in / total * self.entropy(
            self.cIn, total_c_in) - total_c_out / total * self.entropy(
                self.cOut, total_c_out)

    def createWord(self, numbers, maxF, bits):
        """Pack up to ``maxF`` symbols of ``bits`` bits each into one int."""
        shortsPerLong = int(round(60 / bits))
        to = min(len(numbers), maxF)
        b = int(0)
        s = 0
        shiftOffset = 1
        for i in range(s, (min(to, shortsPerLong + s))):
            shift = 1
            for j in range(bits):
                if (numbers[i] & shift) != 0:
                    b |= shiftOffset
                shiftOffset <<= 1
                shift <<= 1
        return b

    def int2byte(self, number):
        """floor(log2(number)) via successive halving of bit ranges."""
        log = 0
        if (number & 0xffff0000) != 0:
            number >>= 16
            log = 16
        if number >= 256:
            number >>= 8
            log += 8
        if number >= 16:
            number >>= 4
            log += 4
        if number >= 4:
            number >>= 2
            log += 2
        return log + (number >> 1)