Example #1
0
    def test(self, classifier, testData, batchSize=None):
        """Evaluate the classifier on testData in batches.

        testData is an (inputData, labels) pair.  Predictions from all
        batches are concatenated and scored with Case.roc_auc_truncated;
        returns a TestMetrics wrapping that performance figure.
        """
        log.start('Testing classifier')

        inputData, labels = testData

        # Default: classify everything as a single batch.
        batchSize = batchSize if batchSize is not None else inputData.shape[0]
        # Python 2 integer division; +1 covers the remainder batch
        # (the final, possibly empty, slice is skipped by the len() guard below).
        batchesCount = inputData.shape[0] / batchSize + 1

        predictions = None
        for batchIndex in xrange(batchesCount):
            inputBatch = inputData[batchIndex * batchSize:(batchIndex + 1) * batchSize]

            if predictions is None:
                predictions = classifier.classify(inputBatch)
            else:
                batchPredictions = classifier.classify(inputBatch)
                if len(batchPredictions):
                    # BUG FIX: previously classify() was called a second time
                    # here and its fresh output concatenated, discarding the
                    # predictions just computed (and doubling the work).
                    predictions = numpy.concatenate([predictions, batchPredictions])

            log.progress('Testing classifier: {0}%'.format((batchIndex + 1) * 100 / batchesCount))

        performance = Case.roc_auc_truncated(labels, predictions)

        testMetrics = TestMetrics(performance)
        log.done(testMetrics)

        return testMetrics
Example #2
0
    def verify(self, classifier):
        """Check the classifier against the agreement and correlation constraints.

        Computes a Kolmogorov-Smirnov distance between predictions on real
        (signal == 0) and Monte Carlo (signal == 1) check-agreement samples,
        and a Cramer-von Neiman coefficient between predictions and 'mass' on
        the check-correlation data.  Returns a VerificationMetrics built from
        both coefficients and the configured cutoffs.
        """
        log.start('Verifying classifier')

        self.checkAgreementData = pandas.read_csv(self.checkAgreementDataFile)
        # Drop the first column and the last two — presumably id plus
        # (weight, signal); TODO confirm against the CSV schema.
        columns = self.checkAgreementData.columns[1:-2]
        # NOTE(review): double indexing [columns][goodFeatures] — goodFeatures
        # is a module-level name not visible here; presumably a further column
        # selection.  Verify it selects columns, not rows.
        checkAgreementData = self.checkAgreementData[columns][goodFeatures].as_matrix()

        agreementPredictions = classifier.classify(checkAgreementData)

        # Split predictions and sample weights by the 'signal' flag.
        realPredictions = agreementPredictions[self.checkAgreementData['signal'].values == 0]
        monteCarloPrediction = agreementPredictions[self.checkAgreementData['signal'].values == 1]
        realWeights = self.checkAgreementData[self.checkAgreementData['signal'] == 0]['weight'].values
        monteCarloWeights = self.checkAgreementData[self.checkAgreementData['signal'] == 1]['weight'].values

        agreementCoefficient = Case.getKolmogorovSmirnovDistance(realPredictions, monteCarloPrediction, realWeights, monteCarloWeights)

        self.checkCorrelationData = pandas.read_csv(self.checkCorrelationDataFile)
        # Drop the first column and the last — presumably id and 'mass';
        # TODO confirm against the CSV schema.
        columns = self.checkCorrelationData.columns[1:-1]
        checkCorrelationData = self.checkCorrelationData[columns][goodFeatures].as_matrix()
        masses = self.checkCorrelationData['mass']

        correlationPrediction = classifier.classify(checkCorrelationData)

        correlationCoefficient = Case.getCramerVonNeimanCoefficient(correlationPrediction, masses)

        verificationMetrics = VerificationMetrics(agreementCoefficient, self.agreementCutoff, correlationCoefficient, self.correlationCutoff)
        log.done(verificationMetrics)

        return verificationMetrics
Example #3
0
    def dump(self, submission):
        """Persist the submission frame as '<submissionsDirectory>/<seed>.csv'."""
        log.start('Dumping data')

        # Build the target path from the configured directory and run seed,
        # then write without the index column.
        outputPath = '{0}/{1}.csv'.format(self.submissionsDirectory, self.seed)
        submission.to_csv(outputPath, index=False)

        log.done()
Example #4
0
    def createSubmission(self, classifier, testData, batchSize=None):
        """Classify testData in batches and build the submission DataFrame.

        Returns a DataFrame with an 'id' column taken from the cached
        self.testData frame and a 'prediction' column with the concatenated
        batch predictions.
        """
        log.start('Creating submission')

        # BUG FIX: the default previously read `input.shape[0]` — `input` is
        # the Python builtin function, so this raised at runtime whenever
        # batchSize was omitted.  Use the actual test data, matching test().
        batchSize = batchSize if batchSize is not None else testData.shape[0]
        # Python 2 integer division; +1 covers the remainder batch
        # (an empty final slice is skipped by the len() guard below).
        batchesCount = testData.shape[0] / batchSize + 1

        predictions = None
        for batchIndex in xrange(batchesCount):
            inputBatch = testData[batchIndex * batchSize:(batchIndex + 1) * batchSize]

            if predictions is None:
                predictions = classifier.classify(inputBatch)
            elif len(inputBatch):
                predictions = numpy.concatenate([predictions, classifier.classify(inputBatch)])

            log.progress('Creating submission: {0}%'.format((batchIndex + 1) * 100 / batchesCount))

        submission = pandas.DataFrame({"id": self.testData["id"], "prediction": predictions})

        log.done('submission' + str(submission.shape))

        return submission
Example #5
0
    def loadData(self, minified=False):
        """Load the training, validation and test CSV files.

        Returns (trainingData, validationData, testData) where the first two
        are (inputMatrix, labelVector) pairs and the last is an input matrix.
        The raw frames are also cached on self (createSubmission later reads
        self.testData['id']).

        NOTE(review): the `minified` flag is accepted but never used in this
        body — confirm whether it is dead or handled elsewhere.
        """
        log.start('Loading data')

        self.trainingData = pandas.read_csv(self.trainingDataFile)
        # Drop the first column and the last four — presumably id plus label/
        # weight-style columns; TODO confirm against the CSV schema.
        columns = self.trainingData.columns[1:-4]
        # NOTE(review): [columns][goodFeatures] double indexing — goodFeatures
        # is a module-level selection not visible here; verify it picks columns.
        trainingInput = self.trainingData[columns][goodFeatures].as_matrix()
        trainingLabels = self.trainingData['signal'].as_matrix()
        trainingData = trainingInput, trainingLabels

        # Validation reuses the check-agreement file, which carries a 'signal'
        # column usable as labels.
        self.validationData = pandas.read_csv(self.checkAgreementDataFile)
        columns = self.validationData.columns[1:-2]
        validationInput = self.validationData[columns][goodFeatures].as_matrix()
        validationLabels = self.validationData['signal'].as_matrix()
        validationData = validationInput, validationLabels

        self.testData = pandas.read_csv(self.testDataFile)
        # Test file has no label columns: drop only the leading id column.
        columns = self.testData.columns[1:]
        testData = self.testData[columns][goodFeatures].as_matrix()

        message = 'trainingData{0}, testData{1}'.format(trainingInput.shape, testData.shape)
        log.done(message)

        return trainingData, validationData, testData
Example #6
0
    def train(classifier, trainingData, validationData, batchSize=None):
        """Fit the classifier on trainingData batch by batch; returns TrainingMetrics
        wrapping the elapsed wall-clock time.

        NOTE(review): defined without `self` — presumably intended to be a
        @staticmethod of the enclosing class; confirm how it is called.
        NOTE(review): `validationData` is accepted but never used here.
        NOTE(review): unlike test()/createSubmission(), batchesCount has no
        `+ 1`, so any remainder samples after the last full batch are never
        fit — confirm whether equal-sized batches are intentional.
        """
        log.start('Training classifier')

        inputData, labels = trainingData

        # Default: a single batch over the full training set.
        batchSize = batchSize if batchSize is not None else inputData.shape[0]
        batchesCount = inputData.shape[0] / batchSize

        start = time.time()

        for batchIndex in xrange(batchesCount):
            inputBatch = inputData[batchIndex * batchSize:(batchIndex + 1) * batchSize]
            labelsBatch = labels[batchIndex * batchSize:(batchIndex + 1) * batchSize]

            classifier.fit(inputBatch, labelsBatch)
            log.progress('Training classifier: {0}%'.format((batchIndex + 1) * 100 / batchesCount))

        end = time.time()
        elapsed = end - start

        trainingMetrics = TrainingMetrics(elapsed)
        log.done(trainingMetrics)

        return trainingMetrics
def main():
    """Main procedure of DFA minimization problem generator.
    
    Parses command-line arguments and builds solution and task DFA accordingly.
    Saves result and cleans up.
    """

    # add and check parameters

    # Combine three argparse formatters: keep defaults in help, show types as
    # metavars, and preserve raw text (for the epilog).
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.MetavarTypeHelpFormatter,
                      argparse.RawTextHelpFormatter):
        pass

    parser = argparse.ArgumentParser(
        description='Command-line tool to generate DFA minimization problems.',
        formatter_class=MyFormatter,
        epilog=_EPILOG)

    # _ARGUMENTS maps group name -> list of option tuples:
    # (flag, type, default, help[, choices]) — the 5th element is optional.
    for groupName in _ARGUMENTS:
        group = parser.add_argument_group(groupName)

        for option in _ARGUMENTS[groupName]:
            if len(option) == 4:
                group.add_argument(option[0],
                                   type=option[1],
                                   default=option[2],
                                   help=option[3])
            else:
                group.add_argument(option[0],
                                   type=option[1],
                                   default=option[2],
                                   help=option[3],
                                   choices=option[4])

    args = parser.parse_args()

    # Flag-style options arrive as 'yes'/'no' strings; coerce to bool
    # (anything other than 'yes' becomes False).
    strToBool = lambda x: x == 'yes'

    args.ps = strToBool(args.ps)
    args.c = strToBool(args.c)
    args.pt = strToBool(args.pt)
    args.dfa = strToBool(args.dfa)
    args.tex = strToBool(args.tex)
    args.pdf = strToBool(args.pdf)
    args.shuf = strToBool(args.shuf)

    args.out = pathlib.Path(args.out)

    # Validate parameter combinations; each failure logs and aborts early.
    if args.k > args.n:
        log.k_too_big()
        return

    if args.n < args.f:
        log.f_too_big()
        return

    if args.pt and not args.ps:
        log.invalid_p_options()
        return

    if args.k == 0 and args.e > 0:
        log.not_extendable()
        return

    if any(
            map(lambda x: x < 0, (args.k, args.n, args.f, args.dmin, args.dmax,
                                  args.e, args.u))):
        log.neg_value()
        return

    # NOTE(review): if args.out exists but is a file, mkdir() will raise;
    # also no parents=True, so a missing parent directory fails — confirm
    # whether that is acceptable for this tool.
    if not args.out.exists() or not args.out.is_dir():
        log.creating_output_dir()
        args.out.mkdir()
        log.done()

    log.start(args)

    # construct solution dfa

    log.building_solution(args)

    # Two construction strategies: exhaustive enumeration or random sampling.
    build = next_min_dfa if args.b == 'enum' else rand_min_dfa

    solDFA = build(args.k, args.n, args.f, args.dmin, args.dmax, args.ps,
                   args.out)

    # In enumeration mode, None signals the enumeration space is exhausted.
    if solDFA is None and args.b == 'enum':

        log.done()
        log.enum_finished()
        return

    log.done()

    # extend dfa

    log.extending_solution(args)

    # Retry up to 10 times to work around a known pygraph index-error bug;
    # DFANotExtendable is fatal immediately.
    for i in range(10):

        try:

            reachDFA, taskDFA = extend_dfa(solDFA, args.e, args.u, args.pt,
                                           args.c)

        except DFANotExtendable:

            log.failed()
            log.dfa_not_extendable(args)
            return

        except PygraphIndexErrorBug:

            log.failed()
            log.pygraph_bug('extending')

            # Give up after the 10th (i == 9) failed attempt.
            if i == 9:
                log.pygraph_bug_abort(args)
                return

        else:

            log.done()
            break

    # generate graphical representation of solution and task dfa

    if args.dfa or args.tex or args.pdf:
        log.saving()
        save_exercise(solDFA, reachDFA, taskDFA, args.out, args.dfa, args.tex,
                      args.pdf, args.shuf)
        log.done()
    else:
        log.no_saving()

    # clean up working directory

    log.cleaning()

    # Remove LaTeX build by-products left in the output directory.
    for f in args.out.iterdir():
        if f.suffix in ('.toc', '.aux', '.log', '.gz', '.bbl', '.blg', '.out'):
            f.unlink()

    log.done()