def test_process_example(self):
    """Check the result types of a single ``onlineProcessExample`` call.

    The model is trained offline on ``self.onPercentDataFrame``; only the
    first item yielded by ``self.fivePercentDataIterator`` is processed.
    If the iterator is empty, no assertion runs.
    """
    model = MinasBase(minasAlgorith=self.TimedMinasAlgorith())
    model.offline(self.onPercentDataFrame)
    for features, _target in self.fivePercentDataIterator:
        outcome = model.onlineProcessExample(features, [])
        example, isClassified, cluster, dist = outcome
        # Same assertion order as before: dist, cluster, isClassified, example.
        expectations = (
            (dist, float),
            (cluster, Cluster),
            (isClassified, bool),
            (example, Example),
        )
        for value, expected_type in expectations:
            self.assertIsInstance(value, expected_type)
        # One example is enough for this smoke test.
        break
def test_zz_big_dataset(self):
    """Run ``self.runDataset`` with the ten-percent frame and the full iterator.

    ``self.tenPercentDataFrame`` is passed as ``trainSet`` and
    ``self.allDataIterator`` as ``testSet``.
    """
    model = MinasBase(minasAlgorith=self.TimedMinasAlgorith())
    self.runDataset(
        name='test_zz_big_dataset',
        trainSet=self.tenPercentDataFrame,
        testSet=self.allDataIterator,
        minas=model,
    )
def test_small_dataset_MinasAlgorithDaskKmeans(self):
    """Run the small dataset through ``runDataset`` using ``MinasAlgorithDaskKmeans``.

    ``self.onPercentDataFrame`` is passed as ``trainSet`` and
    ``self.fivePercentDataIterator`` as ``testSet``. The algorithm class is
    wrapped with ``self.timed.timedClass`` before being instantiated.
    """
    TimedMinasAlgorith = self.timed.timedClass(MinasAlgorithDaskKmeans)
    # Instantiate the wrapper built on the line above, not
    # self.TimedMinasAlgorith: otherwise the Dask k-means variant named by
    # this test would never actually run.
    minas = MinasBase(minasAlgorith=TimedMinasAlgorith())
    kwargs = {
        'name': 'test_small_dataset_MinasAlgorithDaskKmeans',
        'trainSet': self.onPercentDataFrame,
        'testSet': self.fivePercentDataIterator,
        'minas': minas,
    }
    self.runDataset(**kwargs)
def fake_seed(self, seed):
    """Run one offline/online cycle on synthetic examples for ``seed``.

    All artifacts (``run.log``, CSV files, plots, ``minas.yaml``) are written
    under ``self.basedir + str(seed) + '/'``; an existing directory at that
    path is removed first. The first 10% of the examples are used for the
    offline phase and the remainder is fed to the online phase. Afterwards
    every example is classified and counted as positive (classified and the
    cluster label equals the example label), negative (classified with a
    different label) or unknown (not classified).

    Args:
        seed: Forwarded to ``self.setupFakeExamples`` and used to name the
            output directory.

    Returns:
        A tuple ``(result, summary)``: ``result`` is the formatted counts
        line for this seed and ``summary`` is ``describe()`` of the frame
        returned by ``timed.statisticSummary()``.
    """
    dirr = self.basedir + str(seed) + '/'
    if os.path.exists(dirr):
        shutil.rmtree(dirr)
    os.makedirs(dirr, exist_ok=True)
    timed = Timed()
    TimedMinasAlgorith = timed.timedClass(MinasAlgorith)
    CONSTS = MinasConsts()
    logging.info('Next seed: {}'.format(seed))
    minas = MinasBase(minasAlgorith=TimedMinasAlgorith(CONSTS=CONSTS))
    #
    # Bind the root logger locally so the per-seed file handler can be
    # attached and detached without relying on a module-level name.
    rootLogger = logging.getLogger()
    logHandler = logging.FileHandler(dirr + 'run.log')
    # Reuse the first root handler's formatter when there is one; with no
    # handlers configured the file handler keeps the default format.
    if rootLogger.handlers:
        logHandler.setFormatter(rootLogger.handlers[0].formatter)
    rootLogger.addHandler(logHandler)
    try:
        # ------------------------------------------------------------------------------------------------
        examples = self.setupFakeExamples(seed)
        plotExamples2D(dirr, '0-fake_base', examples)
        # ------------------------------------------------------------------------------------------------
        training_set = examples[:int(len(examples) * .1)]
        with open(dirr + 'training_set.csv', 'w') as training_set_csv:
            for ex in training_set:
                training_set_csv.write(','.join([str(i) for i in ex.item]) + ',' + ex.label + '\n')
        plotExamples2D(dirr, '1-training_set', training_set)
        trainingDf = pd.DataFrame(
            [{'item': x.item, 'label': x.label} for x in training_set])
        logging.info('trainingDf' + '\n' + str(trainingDf.groupby('label').describe()) + '\n')
        minas.offline(trainingDf)
        # Round-trip the trained model through its file format before using it.
        minas.storeToFile(dirr + 'minas.yaml')
        minas.restoreFromFile(dirr + 'minas.yaml')
        logging.info(str(minas) + str(minas))
        self.assertGreater(len(minas.clusters), 0, 'model must be trainded after offline call')
        plotExamples2D(dirr, '2-offline_clusters', [], minas.clusters)
        plotExamples2D(dirr, '3-offline_training', training_set, minas.clusters)
        plotExamples2D(dirr, '4-offline_all_data', examples, minas.clusters)
        minas.minasAlgorith.checkTraining(trainingDf, minas.clusters)
        # ------------------------------------------------------------------------------------------------
        testSet = examples[int(len(examples) * .1):]
        minas.online(i.item for i in testSet)
        # ------------------------------------------------------------------------------------------------
        logging.info('aggregatin resutls')
        results = []
        positiveCount = 0
        negativeCount = 0
        unknownCount = 0
        totalExamples = len(examples)
        with open(dirr + 'examples.csv', 'w') as examplesCsv:
            for ex in examples:
                # Work on a copy: the label is overwritten below with the
                # outcome and must not leak back into ``examples``.
                ex = deepcopy(ex)
                hasLabel, cluster, d = None, None, None
                if minas:
                    hasLabel, cluster, d, ex = minas.classify(ex)
                examplesCsv.write(
                    ','.join([str(i) for i in ex.item]) + ',' +
                    ex.label + ',' +
                    (cluster.label if cluster and hasLabel else 'Unknown') + ',' +
                    ('Positive' if cluster and cluster.label == ex.label else 'Negative') +
                    '\n')
                if hasLabel:
                    if cluster.label == ex.label:
                        ex.label = 'Positive'
                        positiveCount += 1
                    else:
                        ex.label = 'Negative'
                        negativeCount += 1
                else:
                    ex.label = 'Unknown'
                    unknownCount += 1
                results.append(ex)
        # end results map
        result = '[seed {seed}] positive: {p}({pp:.2%}), negative: {n}({nn:.2%}), unknown: {u}({uu:.2%})'.format(
            seed=seed,
            p=positiveCount, pp=positiveCount / totalExamples,
            n=negativeCount, nn=negativeCount / totalExamples,
            u=unknownCount, uu=unknownCount / totalExamples,
        )
        logging.info('\n\n\t=== Final Results ===\n{model}\n{result}\n'.format(
            model=str(minas), result=result))
        plotExamples2D(dirr, '5-online_clusters', [], minas.clusters if minas else [])
        plotExamples2D(dirr, '6-online_resutls', results, minas.clusters if minas else [])
        # Compare labels by value; identity comparison against a string
        # literal is implementation-dependent.
        onlyFalses = [x for x in results if x.label != 'Positive']
        plotExamples2D(dirr, '7-online_neg_unk', onlyFalses, minas.clusters if minas else [])
        del minas
    finally:
        # Always detach and close the per-seed log file, even when a step
        # above raises, so handlers do not pile up across seeds.
        rootLogger.removeHandler(logHandler)
        logHandler.close()
    # ------------------------------------------------------------------------------------------------
    df = timed.statisticSummary()
    logging.info(f'=========== Timed Functions Summary ===========\n{df}')
    fig, ax = timed.mkTimedResumePlot()
    # ``pad`` is passed by keyword: newer matplotlib rejects it positionally.
    plt.tight_layout(pad=.5)
    plt.savefig(dirr + 'timed-run.png')
    plt.close(fig)
    timed.clearTimes()
    return result, df.describe()
def test_store(self):
    """Check that a store/restore round trip keeps the model's collection sizes.

    The lengths of ``clusters``, ``sleepClusters`` and ``unknownBuffer`` are
    compared before and after ``storeToFile``/``restoreFromFile``, once right
    after the offline phase and once more after every item of
    ``self.fivePercentDataIterator`` has been processed online.
    """
    filename = 'run/forest-cover-type-dataset/store-test.yaml'
    minas = MinasBase(minasAlgorith=self.TimedMinasAlgorith())
    minas.offline(self.onPercentDataFrame)

    def check_round_trip():
        # Snapshot the sizes, persist and reload, then compare one by one.
        n_clusters = len(minas.clusters)
        n_sleep = len(minas.sleepClusters)
        n_unknown = len(minas.unknownBuffer)
        minas.storeToFile(filename)
        minas.restoreFromFile(filename)
        self.assertEqual(n_clusters, len(minas.clusters))
        self.assertEqual(n_sleep, len(minas.sleepClusters))
        self.assertEqual(n_unknown, len(minas.unknownBuffer))

    check_round_trip()
    for features, _target in self.fivePercentDataIterator:
        outcome = minas.onlineProcessExample(features, [])
        example, isClassified, cluster, dist = outcome
        self.assertIsInstance(dist, float)
        self.assertIsInstance(cluster, Cluster)
        self.assertIsInstance(isClassified, bool)
        self.assertIsInstance(example, Example)
    # Second round trip, now with whatever state the online phase produced.
    check_round_trip()