def test_process_example(self):
     """Train offline, then type-check what ``onlineProcessExample`` returns.

     Only the first ``(x, target)`` pair yielded by
     ``self.fivePercentDataIterator`` is processed; the target is ignored.
     """
     minas = MinasBase(minasAlgorith=self.TimedMinasAlgorith())
     minas.offline(self.onPercentDataFrame)
     for x, _target in self.fivePercentDataIterator:
         outcome = minas.onlineProcessExample(x, [])
         example, isClassified, cluster, dist = outcome
         # Checked in this order: distance, cluster, flag, example.
         expectations = (
             (dist, float),
             (cluster, Cluster),
             (isClassified, bool),
             (example, Example),
         )
         for value, expectedType in expectations:
             self.assertIsInstance(value, expectedType)
         # One example is enough for a type check.
         break
 def test_zz_big_dataset(self):
     """Run ``runDataset`` with the large data set configuration.

     Passes ``self.tenPercentDataFrame`` as ``trainSet`` and
     ``self.allDataIterator`` as ``testSet``.
     """
     model = MinasBase(minasAlgorith=self.TimedMinasAlgorith())
     self.runDataset(
         name='test_zz_big_dataset',
         trainSet=self.tenPercentDataFrame,
         testSet=self.allDataIterator,
         minas=model,
     )
 def test_small_dataset_MinasAlgorithDaskKmeans(self):
     """Run ``runDataset`` on the small data set using ``MinasAlgorithDaskKmeans``.

     Passes ``self.onPercentDataFrame`` as ``trainSet`` and
     ``self.fivePercentDataIterator`` as ``testSet``.
     """
     # Build the algorithm class under test from the Dask k-means variant.
     TimedMinasAlgorith = self.timed.timedClass(MinasAlgorithDaskKmeans)
     # Instantiate the class built above. Using ``self.TimedMinasAlgorith``
     # here would leave the local unused and run the test against whatever
     # algorithm the fixture provides instead of the Dask variant.
     minas = MinasBase(minasAlgorith=TimedMinasAlgorith())
     kwargs = {
         'name': 'test_small_dataset_MinasAlgorithDaskKmeans',
         'trainSet': self.onPercentDataFrame,
         'testSet': self.fivePercentDataIterator,
         'minas': minas,
     }
     self.runDataset(**kwargs)
Exemple #4
0
    def fake_seed(self, seed):
        """Run one offline/online experiment on fake examples built from ``seed``.

        Steps, all writing into ``self.basedir + str(seed) + '/'`` (recreated
        empty on every call):

        1. Generate examples with ``self.setupFakeExamples(seed)``.
        2. Use the first 10% as the training set (dumped to
           ``training_set.csv``), train with ``minas.offline`` and round-trip
           the model through ``minas.yaml``.
        3. Feed the remaining 90% to ``minas.online``.
        4. Classify every example, write ``examples.csv`` and count
           positive / negative / unknown outcomes.
        5. Plot each stage with ``plotExamples2D`` and save the timing plot
           as ``timed-run.png``.

        While steps 1-5 run, root-logger output is also written to
        ``run.log`` in the same directory.

        Args:
            seed: Value forwarded to ``self.setupFakeExamples`` and used to
                name the output directory.

        Returns:
            A tuple ``(result, summary)`` where ``result`` is the formatted
            one-line count summary and ``summary`` is ``describe()`` of the
            object returned by ``timed.statisticSummary()``.
        """
        dirr = self.basedir + str(seed) + '/'
        # Always start from an empty output directory for this seed.
        if os.path.exists(dirr):
            shutil.rmtree(dirr)
        os.makedirs(dirr, exist_ok=True)
        timed = Timed()
        TimedMinasAlgorith = timed.timedClass(MinasAlgorith)
        CONSTS = MinasConsts()
        logging.info('Next seed: {}'.format(seed))
        minas = MinasBase(minasAlgorith=TimedMinasAlgorith(CONSTS=CONSTS))
        # Mirror root-logger output into a per-seed log file, reusing the
        # format of the first root handler when there is one.
        rootLogger = logging.getLogger()
        logHandler = logging.FileHandler(dirr + 'run.log')
        if rootLogger.handlers:
            logHandler.setFormatter(rootLogger.handlers[0].formatter)
        rootLogger.addHandler(logHandler)
        try:
            # ----------------------------------------------------------------
            examples = self.setupFakeExamples(seed)
            plotExamples2D(dirr, '0-fake_base', examples)
            # ----------------------------------------------------------------
            # The first 10% of the examples form the training set.
            training_set = examples[:int(len(examples) * .1)]
            with open(dirr + 'training_set.csv', 'w') as training_set_csv:
                for ex in training_set:
                    training_set_csv.write(
                        ','.join([str(i) for i in ex.item]) + ',' + ex.label +
                        '\n')
            plotExamples2D(dirr, '1-training_set', training_set)

            trainingDf = pd.DataFrame(
                [{'item': x.item, 'label': x.label} for x in training_set])
            logging.info('trainingDf' + '\n' +
                         str(trainingDf.groupby('label').describe()) + '\n')
            minas.offline(trainingDf)
            # Round-trip through the file so the rest of the run uses the
            # restored model.
            minas.storeToFile(dirr + 'minas.yaml')
            minas.restoreFromFile(dirr + 'minas.yaml')
            # NOTE(review): the model is rendered twice in this message;
            # possibly a leftover - confirm before changing the log output.
            logging.info(str(minas) + str(minas))
            self.assertGreater(len(minas.clusters), 0,
                               'model must be trainded after offline call')

            plotExamples2D(dirr, '2-offline_clusters', [], minas.clusters)
            plotExamples2D(dirr, '3-offline_training', training_set,
                           minas.clusters)
            plotExamples2D(dirr, '4-offline_all_data', examples,
                           minas.clusters)
            minas.minasAlgorith.checkTraining(trainingDf, minas.clusters)
            # ----------------------------------------------------------------
            # The remaining 90% is streamed through the online phase.
            testSet = examples[int(len(examples) * .1):]
            minas.online(i.item for i in testSet)
            # ----------------------------------------------------------------
            logging.info('aggregatin resutls')
            results = []
            positiveCount = 0
            negativeCount = 0
            unknownCount = 0
            totalExamples = len(examples)
            with open(dirr + 'examples.csv', 'w') as examplesCsv:
                for ex in examples:
                    # Work on a copy: the label is overwritten below with the
                    # outcome category used for plotting.
                    ex = deepcopy(ex)
                    hasLabel, cluster, d = None, None, None
                    if minas:
                        hasLabel, cluster, d, ex = minas.classify(ex)
                    # NOTE(review): the last CSV column does not look at
                    # ``hasLabel`` while the counters below do, so the two
                    # can disagree - confirm which one is intended.
                    examplesCsv.write(
                        ','.join([str(i) for i in ex.item]) + ',' +
                        ex.label + ',' +
                        (cluster.label if cluster and hasLabel
                         else 'Unknown') + ',' +
                        ('Positive' if cluster and cluster.label == ex.label
                         else 'Negative') + '\n')
                    if hasLabel:
                        if cluster.label == ex.label:
                            ex.label = 'Positive'
                            positiveCount += 1
                        else:
                            ex.label = 'Negative'
                            negativeCount += 1
                    else:
                        ex.label = 'Unknown'
                        unknownCount += 1
                    results.append(ex)
            result = '[seed {seed}] positive: {p}({pp:.2%}), negative: {n}({nn:.2%}), unknown: {u}({uu:.2%})'.format(
                seed=seed,
                p=positiveCount,
                pp=positiveCount / totalExamples,
                n=negativeCount,
                nn=negativeCount / totalExamples,
                u=unknownCount,
                uu=unknownCount / totalExamples,
            )
            logging.info(
                '\n\n\t=== Final Results ===\n{model}\n{result}\n'.format(
                    model=str(minas), result=result))
            plotExamples2D(dirr, '5-online_clusters', [],
                           minas.clusters if minas else [])
            plotExamples2D(dirr, '6-online_resutls', results,
                           minas.clusters if minas else [])
            # Compare by value: identity checks against a string literal rely
            # on interning and raise a SyntaxWarning on Python 3.8+.
            onlyFalses = [x for x in results if x.label != 'Positive']
            plotExamples2D(dirr, '7-online_neg_unk', onlyFalses,
                           minas.clusters if minas else [])
            del minas
        finally:
            # Detach and close the per-seed log file even when a step above
            # fails, so the handler does not leak into later runs.
            rootLogger.removeHandler(logHandler)
            logHandler.close()
        # --------------------------------------------------------------------
        df = timed.statisticSummary()
        logging.info(f'=========== Timed Functions Summary ===========\n{df}')
        fig, ax = timed.mkTimedResumePlot()
        # ``pad`` is keyword-only in current Matplotlib releases.
        plt.tight_layout(pad=.5)
        plt.savefig(dirr + 'timed-run.png')
        plt.close(fig)
        timed.clearTimes()
        return result, df.describe()
    def test_store(self):
        """Check that a store/restore round trip keeps the model's sizes.

        The lengths of ``clusters``, ``sleepClusters`` and ``unknownBuffer``
        are compared across ``storeToFile`` / ``restoreFromFile`` twice:
        right after offline training, and again after every example of
        ``self.fivePercentDataIterator`` went through
        ``onlineProcessExample``.
        """
        filename = 'run/forest-cover-type-dataset/store-test.yaml'
        minas = MinasBase(minasAlgorith=self.TimedMinasAlgorith())
        minas.offline(self.onPercentDataFrame)

        def assertSizesSurviveRoundTrip():
            # Snapshot the sizes, write the model out, read it back, compare.
            sizesBefore = (
                len(minas.clusters),
                len(minas.sleepClusters),
                len(minas.unknownBuffer),
            )
            minas.storeToFile(filename)
            minas.restoreFromFile(filename)
            nClusters, nSleeping, nUnknown = sizesBefore
            self.assertEqual(nClusters, len(minas.clusters))
            self.assertEqual(nSleeping, len(minas.sleepClusters))
            self.assertEqual(nUnknown, len(minas.unknownBuffer))

        assertSizesSurviveRoundTrip()

        for x, _target in self.fivePercentDataIterator:
            processed = minas.onlineProcessExample(x, [])
            example, isClassified, cluster, dist = processed
            self.assertIsInstance(dist, float)
            self.assertIsInstance(cluster, Cluster)
            self.assertIsInstance(isClassified, bool)
            self.assertIsInstance(example, Example)

        assertSizesSurviveRoundTrip()