Code Example #1
File: analyse.py Project: chambai/deepstreamce
def setAnalysisParametersMcod(clusterer):
    # read the MCOD parameters from the setup file, apply them to the
    # clusterer, and log the result
    k = util.getParameter('mcod_k')
    radius = util.getParameter('mcod_radius')
    windowSize = util.getParameter('mcod_windowsize')
    result = setAnalysisParameters_Mcod(clusterer, k, radius, windowSize)
    clusterer.prepareForUse()
    util.thisLogger.logInfo(result)
Code Example #2
File: app.py Project: chambai/deepstreamce
    def __init__(self):

        # Setup Parameters
        util.params = None
        self.dnnModelPath = util.getFullFileName(
            util.getParameter('DnnModelPath'))
        self.numTrainingInstances = util.getParameter(
            'NumActivationTrainingInstances')
        self.timestamp = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
        self.outputName = util.getSetupFileDescription() + '--' + self.timestamp
        self.outputDir = 'output/%s' % (self.outputName)
        util.makeDirectory(self.outputDir)
        util.isLoggingEnabled = util.getParameter('LoggingEnabled')
        util.logPath = self.outputDir + '/%s.log' % (self.outputName)
        util.logLevel = util.getParameter('LogLevel')
        util.thisLogger = util.Logger()
        util.storeSetupParamsInLog()

        # Setup memory environment
        self.processorType = util.getParameter('ProcessorType')

        self.startTime = datetime.datetime.now()

        self.streamList = None
        self.clustererList = None
        self.classMaxValues1 = None  # max value of raw activation data
        self.classMaxValues2 = None  # max value of reduced activation data

        self.flatActivations = None
        self.activationBatches = None
        self.batchFlatActivations = None
        self.reducedFlatActivations = None
Code Example #3
File: analyse.py Project: chambai/deepstreamce
def createClusterer(stream):
    # set the clusterer
    clusterer = gateway.entry_point.Moa_Clusterers_Outliers_MCOD_New()
    k = util.getParameter('mcod_k')
    radius = util.getParameter('mcod_radius')
    windowSize = util.getParameter('mcod_windowsize')
    setAnalysisParameters_Mcod(clusterer, k, radius, windowSize)

    clusterer.setModelContext(stream.getHeader())
    clusterer.prepareForUse()
    return clusterer
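
The gateway.entry_point calls in these examples indicate that the MOA clusterer runs on the JVM behind a Py4J bridge. As a rough sketch only (the connection details are assumptions, not taken from the repository), the gateway could be opened like this, provided a Py4J GatewayServer is already running on the Java side:

from py4j.java_gateway import JavaGateway

# Assumes a GatewayServer is listening on the default port (25333).
gateway = JavaGateway()
# entry_point proxies whatever object the Java side registered; it is assumed
# here to expose the MCOD factory used above.
clusterer = gateway.entry_point.Moa_Clusterers_Outliers_MCOD_New()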
Code Example #4
File: dataset.py Project: chambai/deepstreamce
def getOutOfFilterData():
    x_train, y_train, x_test, y_test = getAllData()
    dataDiscrepancyClass = util.getParameter('DataDiscrepancyClass')
    classes = [dataDiscrepancyClass]
    x_train, y_train, x_test, y_test = filterData(x_train, y_train, x_test,
                                                  y_test, classes)
    return x_train, y_train, x_test, y_test
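
filterData itself is not included in these excerpts. A minimal sketch of what such a class filter could look like, assuming NumPy arrays with integer labels (only the signature is taken from the calls above; the body is an assumption):

import numpy as np

def filterData(x_train, y_train, x_test, y_test, classes):
    # Hypothetical implementation: keep only instances whose label is in classes.
    trainMask = np.isin(y_train.ravel(), classes)
    testMask = np.isin(y_test.ravel(), classes)
    return x_train[trainMask], y_train[trainMask], x_test[testMask], y_test[testMask]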
Code Example #5
File: extract.py Project: chambai/deepstreamce
def getActivations(model, inData):
    actDict = {}

    # Activation Layer Extraction
    layers = util.getParameter("IncludedLayers")

    if layers == 'all':
        layers = np.arange(len(model.layers))
    else:
        # parse a string such as '[0,2,4]' into an integer array
        layers = np.asarray(
            layers.replace('[', '').replace(']', '').split(',')).astype(int)

    util.thisLogger.logInfo('Applying activation extraction to layers: %s' %
                            (layers))

    for layerNum in layers:
        # build a Keras backend function mapping the model input to the
        # output of the requested layer
        getLayerOutput = K.function([model.layers[0].input],
                                    [model.layers[layerNum].output])

        # all values from one layer
        layer_output = getLayerOutput([inData])
        actDict["activation" + str(layerNum)] = layer_output[0]

    return actDict
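
The same bracketed-list parsing appears in several of these examples ('IncludedLayers', 'DataClasses'). A small helper that could factor it out, shown as a sketch (the name parseIntList is hypothetical, not from the repository):

import numpy as np

def parseIntList(value):
    # Turn a setup-file string such as '[0,2,4]' into a NumPy integer array.
    return np.asarray(
        value.replace('[', '').replace(']', '').split(',')).astype(int)

print(parseIntList('[0,2,4]'))  # -> [0 2 4]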
Code Example #6
File: reduce.py Project: chambai/deepstreamce
def getModelFileName():
    datasetName = util.getParameter('DatasetName')
    classes = util.getParameter('DataClasses')
    classes = np.asarray(classes.replace('[', '').replace(
        ']', '').split(',')).astype(int)
    classesStr = '%sclasses' % (len(classes))
    for c in classes:
        classesStr += str(c)

    numActivationTrainingInstances = util.getParameter(
        'NumActivationTrainingInstances')

    modelFileName = '%s/models/reduce/%s_%s_single_undercomp_%s.h5' % (
        os.getcwd(), datasetName, classesStr,
        str(numActivationTrainingInstances))
    util.thisLogger.logInfo('Reduction model filename: %s' % (modelFileName))
    #modelFileName = '/home/jupyter/deepactistream/models/reduce/%s_%s_single_undercomp_%s.h5'%(datasetName,classesStr,str(numActivationTrainingInstances))
    return modelFileName
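
For illustration, with assumed parameter values DatasetName='cifar10', DataClasses='[3,5]', and NumActivationTrainingInstances=1000 (example values, not taken from the repository), the construction above works out to:

import os

datasetName = 'cifar10'
classes = [3, 5]
classesStr = '%sclasses' % (len(classes)) + ''.join(str(c) for c in classes)
modelFileName = '%s/models/reduce/%s_%s_single_undercomp_%s.h5' % (
    os.getcwd(), datasetName, classesStr, 1000)
# -> '<cwd>/models/reduce/cifar10_2classes35_single_undercomp_1000.h5'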
Code Example #7
def getInstanceParameters():
    dataDiscrepancyFrequency = util.getParameter(
        'DataDiscrepancyFrequency')  # e.g. '1in1'
    splitData = dataDiscrepancyFrequency.split('in')
    numDiscrepancy = int(
        splitData[0].strip())  # first number is number of discrepancies
    numNonDiscrepancy = int(
        splitData[1].strip())  # second number is the number of non-discrepancies

    return numDiscrepancy, numNonDiscrepancy
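
A quick check of the parsing above, assuming an illustrative frequency string of '1in9' (one discrepancy instance for every nine non-discrepancy instances; the value is an example, not from the repository):

dataDiscrepancyFrequency = '1in9'  # assumed example value
splitData = dataDiscrepancyFrequency.split('in')
numDiscrepancy = int(splitData[0].strip())
numNonDiscrepancy = int(splitData[1].strip())
print(numDiscrepancy, numNonDiscrepancy)  # -> 1 9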
Code Example #8
def startDataInputStream(streamList, clustererList, reductionModel, dnnModel,
                         x_test, maxClassValues1, maxClassValues2, dataDir,
                         filePrefix):
    numUnseenInstances = util.getParameter('NumUnseenInstances')
    util.thisLogger.logInfo("---------- start of data input stream ----------")
    unseenInstances, unseenResults = startDataInputStream_Time(
        streamList, clustererList, reductionModel, dnnModel, x_test,
        numUnseenInstances, maxClassValues1, maxClassValues2, dataDir,
        filePrefix)
    util.thisLogger.logInfo(
        "---------- end of data input stream ----------")

    return unseenInstances, unseenResults
Code Example #9
File: reduce.py Project: chambai/deepstreamce
def train_undercompleteAutoencoder(flatActivations, batchFlatActivations):
    # trains an undercomplete autoencoder on the activations from the training
    # data

    useBatches = True

    data = None
    input_size = None
    processorType = util.getParameter('ProcessorType')
    if processorType == "GPU":
        if not useBatches:
            data = np.array(flatActivations)
        input_size = len(flatActivations[0])
    else:
        # activations are a list or numpy array, so take the length of the
        # first flattened activation vector
        data = np.array(flatActivations)
        input_size = len(flatActivations[0])

    autoencoder = design_autoencoder(input_size)

    epochs = 50

    autoencoder_trained = None
    if useBatches:
        # training in batches
        autoencoder = trainInBatches(autoencoder, batchFlatActivations, epochs)
    else:
        act_train, act_valid = train_test_split(data,
                                                test_size=0.33,
                                                shuffle=True)
        autoencoder_trained = autoencoder.fit(act_train,
                                              act_train,
                                              batch_size=128,
                                              epochs=epochs,
                                              verbose=1,
                                              validation_data=(act_valid,
                                                               act_valid))

        # plot the training and validation loss curves
        loss = autoencoder_trained.history['loss']
        val_loss = autoencoder_trained.history['val_loss']
        epochRange = range(epochs)
        plt.figure()
        plt.plot(epochRange, loss, 'bo', label='Training loss')
        plt.plot(epochRange, val_loss, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.legend()
        plt.show()

    return autoencoder
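
design_autoencoder is referenced above but not shown in these excerpts. A minimal sketch of what an undercomplete Keras builder could look like; the layer widths, activations, and loss are assumptions, and only the bottleneck-at-layer-1 structure is implied by the encoder extraction in Code Example #12:

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

def design_autoencoder(input_size, bottleneck_size=32):
    # Hypothetical sketch: a single bottleneck layer narrower than the input
    # forces the network to learn a compressed (undercomplete) representation.
    inputs = Input(shape=(input_size,))
    encoded = Dense(bottleneck_size, activation='relu')(inputs)   # layer 1: bottleneck
    decoded = Dense(input_size, activation='sigmoid')(encoded)    # layer 2: reconstruction
    autoencoder = Model(inputs=inputs, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder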
Code Example #10
File: dataset.py Project: chambai/deepstreamce
def getFilteredData():
    x_train, y_train, x_test, y_test = getAllData()

    # Only train and test on the classes listed in the DataClasses setup parameter
    classes = util.getParameter('DataClasses')
    classes = np.asarray(classes.replace('[', '').replace(
        ']', '').split(',')).astype(int)
    util.thisLogger.logInfo('Data classes to be used: %s' % (classes))

    x_train, y_train, x_test, y_test = filterData(x_train, y_train, x_test,
                                                  y_test, classes)

    return x_train, y_train, x_test, y_test
Code Example #11
def setPredictions(dnnModel):
    global unseenDataList
    instances = [x.instance for x in unseenDataList]
    instances = np.reshape(instances, (len(unseenDataList), 32, 32, 3))
    predictions = np.argmax(dnnModel.predict(instances), axis=1)
    instances = []  # reset instances to save memory

    # The DNN is trained to output class indices (0, 1, ...) only.
    # Get the original classes it was trained on and transform the outputs.
    classes = util.getParameter('DataClasses')
    classes = np.asarray(classes.replace('[', '').replace(
        ']', '').split(',')).astype(int)
    util.thisLogger.logInfo('Data classes to be used: %s' % (classes))
    count = 0
    for c in classes:
        predictions = np.where(predictions == count, c, predictions)
        count += 1

    for index in range(len(predictions)):
        unseenDataList[index].predictedResult = predictions[index]
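
One caveat about the sequential np.where remapping above: it can misfire when a target class value collides with an index that has not been remapped yet (for example DataClasses='[1,0]' would first turn every 0 into a 1 and then every 1 back into a 0). A collision-free alternative, shown as a sketch with assumed example values, indexes the class array directly:

import numpy as np

classes = np.array([3, 5])            # assumed example: parsed DataClasses
predictions = np.array([0, 1, 1, 0])  # assumed example: raw DNN output indices
predictions = classes[predictions]    # maps index i to classes[i] in one step
print(predictions)                    # -> [3 5 5 3]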
Code Example #12
File: reduce.py Project: chambai/deepstreamce
def reduce_autoenc(flatActivations, batchFlatActivations):
    # reduces the activations for an unseen instance and returns
    # the reduced activations and the neuron labels
    global reductionModel
    encodedOutput = None

    # Keep only the encoder half of the model: the input layer through
    # the bottleneck (middle) layer
    encoder = Model(inputs=reductionModel.input,
                    outputs=reductionModel.layers[1].output)

    processorType = util.getParameter('ProcessorType')
    if processorType == "GPU":
        encodedOutput = encoder.predict(flatActivations, steps=1)
    else:
        encodedOutput = encoder.predict(np.array(flatActivations))

    # change to python lists
    encodedOutput = encodedOutput.tolist()

    util.thisLogger.logInfo("Activations reduced from %s to %s elements" %
                            (len(flatActivations[0]), len(encodedOutput[0])))

    return encodedOutput
Code Example #13
File: app.py Project: chambai/deepstreamce
    def processDataStream(self):
        # start the thread to process the streams so that new instances get clustered
        thread1 = threading.Thread(target=analyse.processStreamInstances,
                                   args=(self.streamList, self.clustererList,
                                         self.numTrainingInstances,
                                         self.outputDir, self.outputName,
                                         False, True),
                                   daemon=True)
        thread1.start()

        unseenInstancesObjList = datastream.startDataInputStream(
            self.streamList, self.clustererList, reduce.reductionModel,
            self.dnnModel, self.x_test, self.classMaxValues1,
            self.classMaxValues2, self.outputDir, self.outputName)

        # reshape into original array types
        unseenInstances = [x.instance for x in unseenInstancesObjList[0]]
        unseenResults = [x.correctResult for x in unseenInstancesObjList[0]]

        dataDiscrepancyClass = util.getParameter("DataDiscrepancyClass")

        # append unseen instances to the training instances
        unseenInstances = np.append(unseenInstances, unseenResults, axis=1)
        classes = np.unique(self.y_train)
        for dataClass in classes:
            # Filter unseen instances to only include CE and data discrepancy class
            filteredInstances = list(
                filter(
                    lambda x:
                    (x[len(unseenInstances[0]) - 1] == dataClass or x[len(
                        unseenInstances[0]) - 1] == dataDiscrepancyClass),
                    unseenInstances))

            trainingActivations = util.readFromCsv(
                '%s/%s_trainingactivations_%s.csv' %
                (self.outputDir, self.outputName, dataClass))
            labels = np.arange(len(trainingActivations[0]))
            labels = np.append(labels, len(labels))
            classValues = np.full((trainingActivations.shape[0], 1),
                                  'Train_' + str(dataClass))

            trainingActivations = np.append(trainingActivations,
                                            classValues,
                                            axis=1)  # axis=1 means add columns
            trainingActivations = np.concatenate(
                (trainingActivations, filteredInstances),
                axis=0)  # axis=0 means add rows

            trainingActivations = np.concatenate(
                ([labels], trainingActivations),
                axis=0)  # axis=0 means add rows

        analyse.stopProcessing()
        thread1.join()

        # capture any unprocessed instances
        analyse.processStreamInstances(self.streamList, self.clustererList,
                                       self.numTrainingInstances,
                                       self.outputDir, self.outputName, True,
                                       True)

        util.thisLogger.logInfo('End of instance processing')

        # get outlier results and store in csv
        if analyse.results is not None:
            util.createResults(unseenInstancesObjList, analyse.results,
                               self.outputDir, self.outputName)

        util.killMoaGateway()
        endTime = datetime.datetime.now()
        util.thisLogger.logInfo('Total run time: ' +
                                str(endTime - self.startTime))
        util.thisLogger.closeLog()
Code Example #14
File: analyse.py Project: chambai/deepstreamce
def processStreamInstance(stream,
                          clusterer,
                          i,
                          numInstances,
                          dataDir,
                          prefix,
                          saveOutlierResults=False):
    global results
    if saveOutlierResults:
        if results is None:
            results = []
            results.append([
                'ID',
                'Instance = class %s' %
                (util.getParameter('DataDiscrepancyClass')), 'Class', 'Outlier'
            ])

    trainFilename = '%s/%s_trainingactivations_%s.csv' % (dataDir, prefix, i)

    numSamples = 1

    try:
        while stream.hasMoreInstances():
            newInstEx = stream.nextIdInstance()
            if (newInstEx is not None):
                instId = newInstEx.getIdAsString()
                newInst = newInstEx.getInstanceExample().getData()

                if saveOutlierResults:
                    # determine if the instance is an outlier
                    outlierResult = gateway.entry_point.Moa_Clusterers_Outliers_MCOD_addAndAnalyse(
                        clusterer, newInst, trainFilename, numCpus)

                    if (numInstances == -1):
                        if (outlierResult[0] == 'DATA,OUTLIER,OUTLIER'):
                            util.thisLogger.logInfoColour(
                                "[%s] Activation data for stream %s, instance %s: %s"
                                % (instId, i, numSamples, outlierResult),
                                'red')
                        elif (outlierResult[0] == 'DATA,OUTLIER,NOT_OUTLIER'):
                            util.thisLogger.logInfoColour(
                                "[%s] Activation data for stream %s, instance %s: %s"
                                % (instId, i, numSamples, outlierResult),
                                'green')
                        else:
                            util.thisLogger.logInfoColour(
                                "[%s] Activation data for stream %s, instance %s: %s"
                                % (instId, i, numSamples, outlierResult),
                                'magenta')

                    # at this point we don't know whether the original instance was an ND instance or a CE, as this runs on a different thread
                    # mark it as ND for now, and update the relevant entries with CE when creating the results
                    result = [
                        instId, 'ND', i,
                        outlierResult[0].replace('DATA,OUTLIER,', '')
                    ]
                    results.append(result)
                else:
                    # add the instance as a training instance and do not perform any outlier analysis on it
                    gateway.entry_point.Moa_Clusterers_Outliers_MCOD_processNewInstanceImplTrain(
                        clusterer, newInst)

                numSamples += 1

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        util.thisLogger.logInfo(
            'problem reading stream: %s, %s, %s, %s, %s' %
            (e, exc_type, exc_obj, exc_tb, exc_tb.tb_lineno))
        if gateway is not None:
            gateway.entry_point.StopCreatingTrainedClusterers()
Code Example #15
File: train.py Project: chambai/deepstreamce
def getActivations(x_train, numActivationTrainingInstances, model, dnnModel,
                   y_train):
    util.thisLogger.logInfo(
        "------ start of activation data extraction for training data -------")
    startTime = datetime.datetime.now()

    # Only get activations from the instances that are correctly classified
    y_predict = np.argmax(dnnModel.predict(x_train), axis=1)

    # The DNN is trained to output class indices (0, 1, ...) only.
    # get the original classes it was trained on and transform the outputs
    classes = util.getParameter('DataClasses')
    classes = np.asarray(classes.replace('[', '').replace(
        ']', '').split(',')).astype(int)
    util.thisLogger.logInfo('Data classes to be used: %s' % (classes))
    count = 0
    for c in classes:
        y_predict = np.where(y_predict == count, c, y_predict)
        count += 1

    incorrectPredictIndexes = []
    for i in range(len(y_predict)):  # check every prediction, including the last
        if y_predict[i] != y_train[i]:
            incorrectPredictIndexes.append(i)

    x_train = np.delete(x_train, incorrectPredictIndexes, axis=0)
    y_train = np.delete(y_train, incorrectPredictIndexes, axis=0)
    y_predict = np.delete(y_predict, incorrectPredictIndexes, axis=0)

    # train in batches
    activationTrainingBatchSize = util.getParameter(
        'ActivationTrainingBatchSize')

    if numActivationTrainingInstances == -1:
        numActivationTrainingInstances = len(x_train)

    xData = x_train[:numActivationTrainingInstances]
    batchData = list(util.chunks(xData, activationTrainingBatchSize))

    activationData = []
    numBatches = len(batchData)
    batchActivationData = [[] for i in range(numBatches)]
    for batchIndex in range(numBatches):
        batch = batchData[batchIndex]
        util.thisLogger.logInfo("Training batch " + str(batchIndex + 1) +
                                " of " + str(len(batchData)) + " (" +
                                str(len(batch)) + " instances)")
        # Get activations and set up streams for the training data
        # get reduced activations for all training data in one go

        # Train in a loop
        util.thisLogger.logInfo(
            str(len(batch)) + " instances selected from training data")

        activations, numLayers = extract.getActivationData(model, batch)
        batchActivationData[batchIndex].append(activations)
        activationData.append(activations)

        util.thisLogger.logInfo(
            "Filter Layers: DNN has %s activation layers, getting activation data for %s instances."
            % (numLayers, len(batch)))

    endTime = datetime.datetime.now()
    util.thisLogger.logInfo('Total training time: ' + str(endTime - startTime))
    util.thisLogger.logInfo(
        "------- end of activation data extraction for training data --------")
    util.thisLogger.logInfo("")

    return numLayers, batchData, activationData, batchActivationData
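
util.chunks is used above to split the training data into fixed-size batches but is not included in these excerpts. A minimal sketch of such a helper, assuming it yields consecutive slices (the body is an assumption):

def chunks(data, chunkSize):
    # Hypothetical implementation: yield successive slices of data,
    # each containing at most chunkSize items.
    for start in range(0, len(data), chunkSize):
        yield data[start:start + chunkSize]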