def test(self, classifier, testData, batchSize=None):
    log.start('Testing classifier')
    inputData, labels = testData
    # Default to a single batch covering the whole test set.
    batchSize = batchSize if batchSize is not None else inputData.shape[0]
    batchesCount = inputData.shape[0] / batchSize + 1
    predictions = None
    for batchIndex in xrange(batchesCount):
        inputBatch = inputData[batchIndex * batchSize:(batchIndex + 1) * batchSize]
        if predictions is None:
            predictions = classifier.classify(inputBatch)
        elif len(inputBatch):
            # Skip the trailing empty batch produced by the "+ 1" above and
            # classify each remaining batch exactly once.
            predictions = numpy.concatenate([predictions, classifier.classify(inputBatch)])
        log.progress('Testing classifier: {0}%'.format((batchIndex + 1) * 100 / batchesCount))
    performance = Case.roc_auc_truncated(labels, predictions)
    testMetrics = TestMetrics(performance)
    log.done(testMetrics)
    return testMetrics

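# A minimal sketch (hypothetical sizes, not part of the case) of the batching
# arithmetic used in test() above, assuming Python 2 integer division: when
# batchSize divides the row count exactly, the "+ 1" yields one trailing empty
# slice, which is why empty batches are skipped before concatenating.
def _batchingSketch():
    rows, batchSize = 1024, 256
    batchesCount = rows / batchSize + 1          # 5 slices, the last one empty
    for batchIndex in xrange(batchesCount):
        lo, hi = batchIndex * batchSize, (batchIndex + 1) * batchSize
        print 'batch {0} covers {1} rows'.format(batchIndex, max(min(hi, rows) - lo, 0))
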
def verify(self, classifier):
    log.start('Verifying classifier')
    # Agreement check: Kolmogorov-Smirnov distance between classifier output
    # on real events (signal == 0) and simulated Monte Carlo events (signal == 1).
    self.checkAgreementData = pandas.read_csv(self.checkAgreementDataFile)
    columns = self.checkAgreementData.columns[1:-2]
    checkAgreementData = self.checkAgreementData[columns][goodFeatures].as_matrix()
    agreementPredictions = classifier.classify(checkAgreementData)
    realPredictions = agreementPredictions[self.checkAgreementData['signal'].values == 0]
    monteCarloPredictions = agreementPredictions[self.checkAgreementData['signal'].values == 1]
    realWeights = self.checkAgreementData[self.checkAgreementData['signal'] == 0]['weight'].values
    monteCarloWeights = self.checkAgreementData[self.checkAgreementData['signal'] == 1]['weight'].values
    agreementCoefficient = Case.getKolmogorovSmirnovDistance(
        realPredictions, monteCarloPredictions, realWeights, monteCarloWeights)
    # Correlation check: Cramer-von Mises coefficient between classifier output
    # and the mass column.
    self.checkCorrelationData = pandas.read_csv(self.checkCorrelationDataFile)
    columns = self.checkCorrelationData.columns[1:-1]
    checkCorrelationData = self.checkCorrelationData[columns][goodFeatures].as_matrix()
    masses = self.checkCorrelationData['mass']
    correlationPredictions = classifier.classify(checkCorrelationData)
    correlationCoefficient = Case.getCramerVonNeimanCoefficient(correlationPredictions, masses)
    verificationMetrics = VerificationMetrics(agreementCoefficient, self.agreementCutoff,
                                              correlationCoefficient, self.correlationCutoff)
    log.done(verificationMetrics)
    return verificationMetrics

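# One common way to compute a weighted Kolmogorov-Smirnov distance like the
# agreement check above: build both weighted empirical CDFs and take their
# largest gap. This is only an illustrative sketch; the project's
# Case.getKolmogorovSmirnovDistance may be implemented differently.
def _weightedKolmogorovSmirnovSketch(realPredictions, monteCarloPredictions,
                                     realWeights, monteCarloWeights):
    realWeights = numpy.asarray(realWeights, dtype=float)
    monteCarloWeights = numpy.asarray(monteCarloWeights, dtype=float)
    realOrder = numpy.argsort(realPredictions)
    monteCarloOrder = numpy.argsort(monteCarloPredictions)
    realSorted = numpy.asarray(realPredictions)[realOrder]
    monteCarloSorted = numpy.asarray(monteCarloPredictions)[monteCarloOrder]
    realCdf = numpy.cumsum(realWeights[realOrder]) / realWeights.sum()
    monteCarloCdf = numpy.cumsum(monteCarloWeights[monteCarloOrder]) / monteCarloWeights.sum()
    # Evaluate both CDFs on the union of prediction values and compare them.
    grid = numpy.union1d(realSorted, monteCarloSorted)
    realAtGrid = numpy.interp(grid, realSorted, realCdf)
    monteCarloAtGrid = numpy.interp(grid, monteCarloSorted, monteCarloCdf)
    return numpy.max(numpy.abs(realAtGrid - monteCarloAtGrid))
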
def dump(self, submission):
    log.start('Dumping data')
    fileName = '{0}/{1}.csv'.format(self.submissionsDirectory, self.seed)
    submission.to_csv(fileName, index=False)
    log.done()

def createSubmission(self, classifier, testData, batchSize=None):
    log.start('Creating submission')
    # Default to a single batch covering the whole test set.
    batchSize = batchSize if batchSize is not None else testData.shape[0]
    batchesCount = testData.shape[0] / batchSize + 1
    predictions = None
    for batchIndex in xrange(batchesCount):
        inputBatch = testData[batchIndex * batchSize:(batchIndex + 1) * batchSize]
        if predictions is None:
            predictions = classifier.classify(inputBatch)
        elif len(inputBatch):
            predictions = numpy.concatenate([predictions, classifier.classify(inputBatch)])
        log.progress('Creating submission: {0}%'.format((batchIndex + 1) * 100 / batchesCount))
    submission = pandas.DataFrame({'id': self.testData['id'], 'prediction': predictions})
    log.done('submission' + str(submission.shape))
    return submission

def loadData(self, minified=False):
    log.start('Loading data')
    # Training set: skip the leading id column and the four trailing
    # non-feature columns, then keep only the selected features.
    self.trainingData = pandas.read_csv(self.trainingDataFile)
    columns = self.trainingData.columns[1:-4]
    trainingInput = self.trainingData[columns][goodFeatures].as_matrix()
    trainingLabels = self.trainingData['signal'].as_matrix()
    trainingData = trainingInput, trainingLabels
    # Validation set: reuse the agreement file, which also carries labels.
    self.validationData = pandas.read_csv(self.checkAgreementDataFile)
    columns = self.validationData.columns[1:-2]
    validationInput = self.validationData[columns][goodFeatures].as_matrix()
    validationLabels = self.validationData['signal'].as_matrix()
    validationData = validationInput, validationLabels
    # Test set: skip the leading id column only.
    self.testData = pandas.read_csv(self.testDataFile)
    columns = self.testData.columns[1:]
    testData = self.testData[columns][goodFeatures].as_matrix()
    message = 'trainingData{0}, testData{1}'.format(trainingInput.shape, testData.shape)
    log.done(message)
    return trainingData, validationData, testData

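# A small illustration (toy data, not the competition files) of the chained
# selection used in loadData() above, assuming goodFeatures is a list of
# feature column names defined elsewhere in the module; .values is the newer
# equivalent of the .as_matrix() call used above.
def _featureSelectionSketch():
    frame = pandas.DataFrame([[1, 0.1, 1.0, 0], [2, 0.2, 2.0, 1]],
                             columns=['id', 'a', 'b', 'signal'])
    columns = frame.columns[1:-1]        # drop the leading id and trailing signal
    goodFeatures = ['a']                 # hypothetical feature subset
    return frame[columns][goodFeatures].values   # shape (2, 1)
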
def train(classifier, trainingData, validationData, batchSize=None):
    log.start('Training classifier')
    inputData, labels = trainingData
    batchSize = batchSize if batchSize is not None else inputData.shape[0]
    # Unlike test() and createSubmission(), there is no "+ 1" here, so any
    # remainder rows after the last full batch are not used for fitting.
    batchesCount = inputData.shape[0] / batchSize
    start = time.time()
    for batchIndex in xrange(batchesCount):
        inputBatch = inputData[batchIndex * batchSize:(batchIndex + 1) * batchSize]
        labelsBatch = labels[batchIndex * batchSize:(batchIndex + 1) * batchSize]
        classifier.fit(inputBatch, labelsBatch)
        log.progress('Training classifier: {0}%'.format((batchIndex + 1) * 100 / batchesCount))
    end = time.time()
    elapsed = end - start
    trainingMetrics = TrainingMetrics(elapsed)
    log.done(trainingMetrics)
    return trainingMetrics

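# Hypothetical end-to-end usage of the pieces above, assuming the surrounding
# class is instantiated as `case` and `classifier` exposes the fit()/classify()
# interface used by train() and test():
#
#   trainingData, validationData, testData = case.loadData()
#   train(classifier, trainingData, validationData, batchSize=5000)
#   case.test(classifier, validationData, batchSize=5000)
#   case.verify(classifier)
#   case.dump(case.createSubmission(classifier, testData, batchSize=5000))
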
def main():
    """Main procedure of DFA minimization problem generator.

    Parses command-line arguments and builds solution and task DFA accordingly.
    Saves result and cleans up.
    """
    # add and check parameters
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.MetavarTypeHelpFormatter,
                      argparse.RawTextHelpFormatter):
        pass

    parser = argparse.ArgumentParser(
        description='Command-line tool to generate DFA minimization problems.',
        formatter_class=MyFormatter,
        epilog=_EPILOG)

    for groupName in _ARGUMENTS:
        group = parser.add_argument_group(groupName)
        for option in _ARGUMENTS[groupName]:
            if len(option) == 4:
                group.add_argument(option[0], type=option[1],
                                   default=option[2], help=option[3])
            else:
                group.add_argument(option[0], type=option[1],
                                   default=option[2], help=option[3],
                                   choices=option[4])

    args = parser.parse_args()

    strToBool = lambda x: x == 'yes'
    args.ps = strToBool(args.ps)
    args.c = strToBool(args.c)
    args.pt = strToBool(args.pt)
    args.dfa = strToBool(args.dfa)
    args.tex = strToBool(args.tex)
    args.pdf = strToBool(args.pdf)
    args.shuf = strToBool(args.shuf)

    args.out = pathlib.Path(args.out)

    if args.k > args.n:
        log.k_too_big()
        return

    if args.n < args.f:
        log.f_too_big()
        return

    if args.pt and not args.ps:
        log.invalid_p_options()
        return

    if args.k == 0 and args.e > 0:
        log.not_extendable()
        return

    if any(map(lambda x: x < 0,
               (args.k, args.n, args.f, args.dmin, args.dmax, args.e, args.u))):
        log.neg_value()
        return

    if not args.out.exists() or not args.out.is_dir():
        log.creating_output_dir()
        args.out.mkdir()
        log.done()

    log.start(args)

    # construct solution dfa
    log.building_solution(args)

    build = next_min_dfa if args.b == 'enum' else rand_min_dfa
    solDFA = build(args.k, args.n, args.f, args.dmin, args.dmax,
                   args.ps, args.out)

    if solDFA is None and args.b == 'enum':
        log.done()
        log.enum_finished()
        return

    log.done()

    # extend dfa
    log.extending_solution(args)

    for i in range(10):
        try:
            reachDFA, taskDFA = extend_dfa(solDFA, args.e, args.u, args.pt, args.c)
        except DFANotExtendable:
            log.failed()
            log.dfa_not_extendable(args)
            return
        except PygraphIndexErrorBug:
            log.failed()
            log.pygraph_bug('extending')
            if i == 9:
                log.pygraph_bug_abort(args)
                return
        else:
            log.done()
            break

    # generate graphical representation of solution and task dfa
    if args.dfa or args.tex or args.pdf:
        log.saving()
        save_exercise(solDFA, reachDFA, taskDFA, args.out,
                      args.dfa, args.tex, args.pdf, args.shuf)
        log.done()
    else:
        log.no_saving()

    # clean up working directory
    log.cleaning()
    for f in args.out.iterdir():
        if f.suffix in ('.toc', '.aux', '.log', '.gz', '.bbl', '.blg', '.out'):
            f.unlink()
    log.done()

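# A hypothetical excerpt of the _ARGUMENTS table the registration loop in
# main() expects: each group name maps to option tuples of
# (flag, type, default, help), with an optional fifth element listing the
# allowed choices. The flag names match attributes used above, but the group
# name, defaults, and help strings here are made up; the real table differs.
_ARGUMENTS_EXAMPLE = {
    'solution DFA': [
        ('-k', int, 2, 'size of the alphabet'),
        ('-n', int, 4, 'number of states'),
        ('-b', str, 'enum', 'construction method', ('enum', 'random')),  # second choice name is hypothetical
    ],
}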