Example #1
 def __init__(self, params):
     '''
     params: an XValFoldParams object.
     '''
     self.params = params
     self.fold_id = 'Fold-%d' % params.foldIdx
     self.confMatrix = ConfusionMatrix()
Example #2
 def __init__(self, params):
     '''
     params: an XValFoldParams object.
     '''
     self.params = params
     self.fold_id = 'Fold-%d' % params.fold_idx
     self.confmat = ConfusionMatrix()
     # write the training and testing databases into a file
     with open(
             os.path.join(params.directory,
                          'train_dbs_%d.db' % params.fold_idx),
             'w+') as dbfile:
         Database.write_dbs(params.learn_dbs, dbfile)
     with open(
             os.path.join(params.directory,
                          'test_dbs_%d.db' % params.fold_idx),
             'w+') as dbfile:
         Database.write_dbs(params.test_dbs, dbfile)
Example #3
 def __init__(self, params):
     '''
     params: an XValFoldParams object.
     '''
     self.params = params
     self.fold_id = 'Fold-%d' % params.fold_idx
     self.confmat = ConfusionMatrix()
     # write the training and testing databases into a file
     with open(os.path.join(params.directory, 'train_dbs_%d.db' % params.fold_idx), 'w+') as dbfile:
         Database.write_dbs(params.learn_dbs, dbfile)
     with open(os.path.join(params.directory, 'test_dbs_%d.db' % params.fold_idx), 'w+') as dbfile:
         Database.write_dbs(params.test_dbs, dbfile)
Example #4
def doXVal(folds,
           percent,
           verbose,
           multicore,
           noisy,
           predName,
           domain,
           mlnfile,
           dbfiles,
           logicLearn,
           logicInfer,
           inverse=False,
           testSetCount=1):
    startTime = time.time()

    directory = time.strftime(
        "%a_%d_%b_%Y_%H:%M:%S_K=" + str(folds) + "_TSC=" + str(testSetCount),
        time.localtime())
    os.mkdir(directory)
    os.mkdir(os.path.join(directory, 'FOL'))
    os.mkdir(os.path.join(directory, 'FUZZY'))
    # set up the logger
    log = logging.getLogger('xval')
    fileLogger = FileHandler(os.path.join(directory, 'xval.log'))
    fileLogger.setFormatter(praclog.formatter)
    log.addHandler(fileLogger)

    log.info('Results will be written into %s' % directory)

    # preparations: Read the MLN and the databases
    mln_ = readMLNFromFile(mlnfile,
                           verbose=verbose,
                           logic='FuzzyLogic',
                           grammar='PRACGrammar')
    log.info('Read MLN %s.' % mlnfile)
    dbs = []
    for dbfile in dbfiles:
        db = readDBFromFile(mln_, dbfile)
        if type(db) is list:
            dbs.extend(db)
        else:
            dbs.append(db)
    log.info('Read %d databases.' % len(dbs))

    cwpreds = [pred for pred in mln_.predicates if pred != predName]

    # create the partition of data
    subsetLen = int(math.ceil(len(dbs) * percent / 100.0))
    if subsetLen < len(dbs):
        log.info('Using only %d of %d DBs' % (subsetLen, len(dbs)))
    dbs = sample(dbs, subsetLen)

    if len(dbs) < folds:
        log.error(
            'Cannot do %d-fold cross validation with only %d databases.' %
            (folds, len(dbs)))
        exit(0)

    shuffle(dbs)
    partSize = int(math.ceil(len(dbs) / float(folds)))
    partition = []
    for i in range(folds):
        partition.append(dbs[i * partSize:(i + 1) * partSize])

    foldRunnables = []
    for foldIdx in range(folds):
        partion_ = list(partition)
        params = XValFoldParams()
        params.mln = mln_.duplicate()
        params.testDBs = []
        params.learnDBs = []

        for i in range(0, testSetCount):
            if (foldIdx >= len(partion_)):
                params.testDBs.extend(partion_[0])
                del partion_[0]
            else:
                params.testDBs.extend(partion_[foldIdx])
                del partion_[foldIdx]

        for part in partion_:
            params.learnDBs.extend(part)
        print 'LEARN DBS :' + str(len(params.learnDBs))
        print 'TEST DBS :' + str(len(params.testDBs))

        params.foldIdx = foldIdx
        params.foldCount = folds
        params.noisyStringDomains = noisy
        params.directory = directory
        params.queryPred = predName
        params.queryDom = domain
        params.logicInfer = logicInfer
        foldRunnables.append(XValFold(params))

    if multicore:
        # set up a pool of worker processes
        try:
            workerPool = Pool()
            log.info('Starting %d-fold Cross-Validation in %d processes.' %
                     (folds, workerPool._processes))
            result = workerPool.map_async(runFold, foldRunnables).get()
            workerPool.close()
            workerPool.join()
            cm = ConfusionMatrix()
            for r in result:
                cm.combine(r.confMatrix)
            elapsedTimeMP = time.time() - startTime
            prepareResults(directory, 'FOL')
            prepareResults(directory, 'FUZZY')
        except (KeyboardInterrupt, SystemExit, SystemError):
            log.critical("Caught KeyboardInterrupt, terminating workers")
            workerPool.terminate()
            workerPool.join()
            exit(1)
        except:
            log.error('\n' +
                      ''.join(traceback.format_exception(*sys.exc_info())))
            exit(1)


#     startTime = time.time()
    else:
        log.info('Starting %d-fold Cross-Validation in 1 process.' % (folds))

        for fold in foldRunnables:
            runFold(fold)

        prepareResults(directory, 'FOL')
        prepareResults(directory, 'FUZZY')

        elapsedTimeSP = time.time() - startTime

    if multicore:
        log.info('%d-fold crossvalidation (MP) took %.2f min' %
                 (folds, elapsedTimeMP / 60.0))
    else:
        log.info('%d-fold crossvalidation (SP) took %.2f min' %
                 (folds, elapsedTimeSP / 60.0))
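
The fold-splitting step in doXVal above can be illustrated in isolation. The following is a minimal, self-contained sketch of the same partitioning logic, using plain integers in place of MLN databases; the helper name make_folds is introduced purely for illustration and does not exist in the code above.

import math
from random import shuffle

def make_folds(dbs, folds, test_set_count=1):
    # shuffle and split the databases into `folds` partitions of
    # (roughly) equal size, as in doXVal
    dbs = list(dbs)
    shuffle(dbs)
    part_size = int(math.ceil(len(dbs) / float(folds)))
    partition = [dbs[i * part_size:(i + 1) * part_size] for i in range(folds)]
    for fold_idx in range(folds):
        parts = list(partition)
        test_dbs, learn_dbs = [], []
        # take test_set_count partitions as the test set ...
        for _ in range(test_set_count):
            idx = fold_idx if fold_idx < len(parts) else 0
            test_dbs.extend(parts[idx])
            del parts[idx]
        # ... and everything that is left as the training set
        for part in parts:
            learn_dbs.extend(part)
        yield fold_idx, learn_dbs, test_dbs

# e.g. ten dummy "databases" split into five folds
for fold_idx, learn, test in make_folds(range(10), 5):
    print 'fold %d: %d train / %d test' % (fold_idx, len(learn), len(test))
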
Example #5
def prepareResults(directory, logic):
    cm = ConfusionMatrix()
    for f in os.listdir(os.path.join(directory, logic)):
        matrix = pickle.load(open(os.path.join(directory, logic, f), 'rb'))
        cm.combine(matrix)
    cm.toFile(os.path.join(directory, logic, 'conf_matrix.cm'))
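
The aggregation that prepareResults performs can be pictured with plain dictionaries: each per-fold confusion matrix is treated as a table of (truth, prediction) counts, and combining them is assumed to sum the counts cell by cell. The dict-based stand-in below is only a sketch of that idea; the real ConfusionMatrix class belongs to the surrounding codebase and is not reproduced here.

from collections import defaultdict

def combine_counts(matrices):
    # stand-in for ConfusionMatrix.combine(): sum the per-cell counts
    total = defaultdict(int)
    for m in matrices:
        for cell, count in m.iteritems():
            total[cell] += count
    return dict(total)

fold_0 = {('is_a', 'is_a'): 8, ('is_a', None): 2}
fold_1 = {('is_a', 'is_a'): 7, ('is_a', None): 3}
print combine_counts([fold_0, fold_1])
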
Example #6
    def run(self):
        '''
        Runs the respective fold of the crossvalidation.
        '''
        log = logging.getLogger(self.fold_id)
        log.info('Running fold %d of %d...' %
                 (self.params.foldIdx + 1, self.params.foldCount))
        directory = self.params.directory
        try:
            # Apply noisy string clustering
            log.debug('Transforming noisy strings...')
            if self.params.noisyStringDomains is not None:
                noisyStrTrans = NoisyStringTransformer(
                    self.params.mln, self.params.noisyStringDomains, True)
                learnDBs_ = noisyStrTrans.materializeNoisyDomains(
                    self.params.learnDBs)
                testDBs_ = noisyStrTrans.transformDBs(self.params.testDBs)
            else:
                learnDBs_ = self.params.learnDBs
                testDBs_ = self.params.testDBs

            # train the MLN
            mln = self.params.mln
            log.debug('Starting learning...')
            learnedMLN = mln.learnWeights(learnDBs_,
                                          method=self.params.learningMethod,
                                          verbose=verbose,
                                          evidencePreds=[
                                              "is_a",
                                              "ac_word",
                                          ],
                                          partSize=2,
                                          optimizer='cg',
                                          maxrepeat=1)

            # store the learned MLN in a file
            learnedMLN.writeToFile(
                os.path.join(directory, 'run_%d.mln' % self.params.foldIdx))
            log.debug('Finished learning.')

            # evaluate the MLN
            log.debug('Evaluating.')
            learnedMLN.setClosedWorldPred(None)
            if self.params.cwPreds is None:
                self.params.cwPreds = [
                    p for p in mln.predicates if p != self.params.queryPred
                ]
            for pred in [
                    pred for pred in self.params.cwPreds
                    if pred in learnedMLN.predicates
            ]:
                learnedMLN.setClosedWorldPred(pred)
            #FOL
            cm = ConfusionMatrix()
            self.evalMLN(learnedMLN, testDBs_, 'FirstOrderLogic', cm)
            cm.toFile(
                os.path.join(directory, 'FOL',
                             'conf_matrix_%d.cm' % self.params.foldIdx))

            #FUZZY
            cm = ConfusionMatrix()
            self.evalMLN(learnedMLN, testDBs_, 'FuzzyLogic', cm)
            cm.toFile(
                os.path.join(directory, 'FUZZY',
                             'conf_matrix_%d.cm' % self.params.foldIdx))

            log.debug('Evaluation finished.')
        except (KeyboardInterrupt, SystemExit):
            log.critical("Exiting...")
            return None
Example #7
class XValFold(object):
    '''
    Class representing and providing methods for a cross validation fold.
    '''
    def __init__(self, params):
        '''
        params: an XValFoldParams object.
        '''
        self.params = params
        self.fold_id = 'Fold-%d' % params.fold_idx
        self.confmat = ConfusionMatrix()
        # write the training and testing databases into a file
        with open(
                os.path.join(params.directory,
                             'train_dbs_%d.db' % params.fold_idx),
                'w+') as dbfile:
            Database.write_dbs(params.learn_dbs, dbfile)
        with open(
                os.path.join(params.directory,
                             'test_dbs_%d.db' % params.fold_idx),
                'w+') as dbfile:
            Database.write_dbs(params.test_dbs, dbfile)

    def eval(self, mln, dbs):
        '''
        Returns a confusion matrix for the given (learned) MLN evaluated on
        the databases given in dbs.
        '''
        querypred = self.params.querypred
        #         query_dom = self.params.query_dom

        #         sig = ['?arg%d' % i for i, _ in enumerate(mln.predicates[query_pred])]
        #         querytempl = '%s(%s)' % (query_pred, ','.join(sig))

        #         dbs = map(lambda db: db.copy(mln), dbs)

        for db_ in dbs:
            # save and remove the query predicates from the evidence
            db = db_.copy()
            gndtruth = mln.ground(db)
            gndtruth.apply_cw()
            for atom, _ in db.gndatoms(querypred):
                out('removing evidence', repr(atom))
                del db.evidence[atom]
            db.write()
            stop()
            try:
                resultdb = MLNQuery(config=self.params.queryconf,
                                    mln=mln,
                                    method=InferenceMethods.WCSPInference,
                                    db=db,
                                    cw_preds=[
                                        p.name for p in mln.predicates
                                        if p.name != self.params.querypred
                                    ],
                                    multicore=False).run().resultdb
                result = mln.ground(db)
                result.set_evidence(resultdb)
                for variable in result.variables:
                    if variable.predicate.name != querypred: continue
                    pvalue = variable.evidence_value()
                    tvalue = variable.evidence_value()
                    prediction = [
                        a for a, v in variable.atomvalues(pvalue) if v == 1
                    ]
                    truth = [
                        a for a, v in variable.atomvalues(tvalue) if v == 1
                    ]
                    prediction = str(prediction[0]) if prediction else None
                    truth = str(truth[0]) if truth else None
                    self.confmat.addClassificationResult(prediction, truth)
#                 sig2 = list(sig)
#                 entityIdx = mln.predicates[query_pred].argdoms.index(query_dom)
#                 for entity in db.domains[]:
#                     sig2[entityIdx] = entity
#                     query = '%s(%s)' % (queryPred, ','.join(sig2))
#                     for truth in trueDB.query(query):
#                         truth = truth.values().pop()
#                     for pred in resultDB.query(query):
#                         pred = pred.values().pop()
#                     self.confMatrix.addClassificationResult(pred, truth)
#                 for e, v in trueDB.evidence.iteritems():
#                     if v is not None:
#                         db.addGroundAtom('%s%s' % ('' if v is True else '!', e))
            except:
                logger.critical(''.join(
                    traceback.format_exception(*sys.exc_info())))

    def run(self):
        '''
        Runs the respective fold of the crossvalidation.
        '''
        logger.info('Running fold %d of %d...' %
                    (self.params.fold_idx + 1, self.params.folds))
        directory = self.params.directory
        try:
            #             # Apply noisy string clustering
            #             log.debug('Transforming noisy strings...')
            #             if self.params.noisyStringDomains is not None:
            #                 noisyStrTrans = NoisyStringTransformer(self.params.mln, self.params.noisyStringDomains, True)
            #                 learnDBs_ = noisyStrTrans.materializeNoisyDomains(self.params.learnDBs)
            #                 testDBs_ = noisyStrTrans.transformDBs(self.params.testDBs)
            #             else:
            #                 learnDBs_ = self.params.learnDBs
            #                 testDBs_ = self.params.testDBs

            # train the MLN
            mln = self.params.mln
            logger.debug('Starting learning...')
            learn_dbs = [db.copy() for db in self.params.learn_dbs]
            # apply closed world for fuzzy atoms
            for db in learn_dbs:
                for a, v in db.gndatoms([
                        p.name for p in mln.predicates
                        if isinstance(p, FuzzyPredicate)
                ]):
                    if v != 1: db[a] = 0

            learned = MLNLearn(config=self.params.learnconf,
                               mln=mln,
                               db=learn_dbs,
                               multicore=False).run()  #200
            # store the learned MLN in a file
            learned.tofile(
                os.path.join(directory, 'run_%d.mln' % self.params.fold_idx))
            logger.debug('Finished learning.')

            # evaluate the MLN
            logger.debug('Evaluating.')
            #             learnedMLN.setClosedWorldPred(None)
            #             if self.params.cwPreds is None:
            #                 self.params.cwPreds = [p for p in mln.predicates if p != self.params.queryPred]
            #             for pred in [pred for pred in self.params.cwPreds if pred in learnedMLN.predicates]:
            #                 learnedMLN.setClosedWorldPred(pred)
            self.eval(learned, self.params.test_dbs)
            self.confmat.toFile(
                os.path.join(directory,
                             'conf_matrix_%d.cm' % self.params.fold_idx))
            logger.debug('Evaluation finished.')
        except (KeyboardInterrupt, SystemExit):
            logger.critical("Exiting...")
            return None
Example #8
        params.learnconf = project.learnconf
        params.queryconf = project.queryconf
        params.querypred = predname
        foldRunnables.append(XValFold(params))
        logger.info('Params for fold %d:\n%s' % (fold_idx, str(params)))

    if multicore:
        # set up a pool of worker processes
        try:
            workerPool = Pool()
            logger.info('Starting %d-fold Cross-Validation in %d processes.' %
                        (folds, workerPool._processes))
            result = workerPool.map_async(runFold, foldRunnables).get()
            workerPool.close()
            workerPool.join()
            cm = ConfusionMatrix()
            for r in result:
                cm.combine(r.confmat)
            elapsedTimeMP = time.time() - startTime
            cm.toFile(os.path.join(expdir, 'conf_matrix.cm'))
            # create the pdf table and move it into the log directory
            # this is a dirty hack since pdflatex apparently
            # does not support arbitrary output paths
            pdfname = 'conf_matrix'
            logger.info('creating pdf of confusion matrix...')
            cm.toPDF(pdfname)
            os.rename('%s.pdf' % pdfname,
                      os.path.join(expdir, '%s.pdf' % pdfname))
        except (KeyboardInterrupt, SystemExit, SystemError):
            logger.critical("Caught KeyboardInterrupt, terminating workers")
            workerPool.terminate()
Example #9
    args = sys.argv[1:]
    path = '.'
    filename = time.strftime("%a_%d_%b_%Y_%H:%M:%S_Compare_Results",
                             time.localtime())

    if len(args) == 2:
        path = args[0]
        filename = args[1]
    elif len(args) == 1:
        path = args[0]

    pattern = r'^\w{3}_\d{2}_\w{3}_\d{4}_\d{2}:\d{2}:\d{2}_K=\d+_TSC=\d+$'
    gen = (f for f in os.listdir(path) if re.search(pattern, f))

    print 'Start compare process...'

    writable = WritableObject()
    sys.stdout = writable
    for f in gen:
        ConfusionMatrix.compareConfusionMatrices(
            os.path.join(path, f, 'FOL', 'conf_matrix.cm'),
            os.path.join(path, f, 'FUZZY', 'conf_matrix.cm'))

    sys.stdout = sys.__stdout__

    f = open(filename, 'w')
    for content in writable.content:
        f.write(content)
    f.close()
    print 'done'
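
The directory names matched by the regular expression above are exactly those produced by the strftime call in doXVal (Example #4). A quick self-contained check (assuming the default C locale, where %a and %b yield three-letter abbreviations):

import re
import time

pattern = r'^\w{3}_\d{2}_\w{3}_\d{4}_\d{2}:\d{2}:\d{2}_K=\d+_TSC=\d+$'
# build a name in the same format as doXVal does, then test the pattern
name = time.strftime("%a_%d_%b_%Y_%H:%M:%S_K=10_TSC=1", time.localtime())
print name, bool(re.search(pattern, name))
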
Example #10
class XValFold(object):
    '''
    Class representing and providing methods for a cross validation fold.
    '''
    
    def __init__(self, params):
        '''
        params: an XValFoldParams object.
        '''
        self.params = params
        self.fold_id = 'Fold-%d' % params.fold_idx
        self.confmat = ConfusionMatrix()
        # write the training and testing databases into a file
        with open(os.path.join(params.directory, 'train_dbs_%d.db' % params.fold_idx), 'w+') as dbfile:
            Database.write_dbs(params.learn_dbs, dbfile)
        with open(os.path.join(params.directory, 'test_dbs_%d.db' % params.fold_idx), 'w+') as dbfile:
            Database.write_dbs(params.test_dbs, dbfile)
        
            
    def eval(self, mln, dbs):
        '''
        Returns a confusion matrix for the given (learned) MLN evaluated on
        the databases given in dbs.
        '''
        querypred = self.params.querypred
#         query_dom = self.params.query_dom
        
#         sig = ['?arg%d' % i for i, _ in enumerate(mln.predicates[query_pred])]
#         querytempl = '%s(%s)' % (query_pred, ','.join(sig))
        
#         dbs = map(lambda db: db.copy(mln), dbs)
        
        for db_ in dbs:
            # save and remove the query predicates from the evidence
            db = db_.copy()
            gndtruth = mln.ground(db)
            gndtruth.apply_cw()
            for atom, _ in db.gndatoms(querypred):
                out('removing evidence', repr(atom))
                del db.evidence[atom]
            db.write()
            stop()
            try:
                resultdb = MLNQuery(config=self.params.queryconf, mln=mln, method=InferenceMethods.WCSPInference, db=db, 
                                  cw_preds=[p.name for p in mln.predicates if p.name != self.params.querypred], multicore=False).run().resultdb
                result = mln.ground(db)
                result.set_evidence(resultdb)
                for variable in result.variables:
                    if variable.predicate.name != querypred: continue
                    pvalue = variable.evidence_value()
                    tvalue = variable.evidence_value()
                    prediction = [a for a, v in variable.atomvalues(pvalue) if v == 1]
                    truth = [a for a, v in variable.atomvalues(tvalue) if v == 1]
                    prediction = str(prediction[0]) if prediction else None
                    truth = str(truth[0]) if truth else None
                    self.confmat.addClassificationResult(prediction, truth)
#                 sig2 = list(sig)
#                 entityIdx = mln.predicates[query_pred].argdoms.index(query_dom)
#                 for entity in db.domains[]:
#                     sig2[entityIdx] = entity
#                     query = '%s(%s)' % (queryPred, ','.join(sig2))
#                     for truth in trueDB.query(query):
#                         truth = truth.values().pop()
#                     for pred in resultDB.query(query):
#                         pred = pred.values().pop()
#                     self.confMatrix.addClassificationResult(pred, truth)
#                 for e, v in trueDB.evidence.iteritems():
#                     if v is not None:
#                         db.addGroundAtom('%s%s' % ('' if v is True else '!', e))
            except:
                logger.critical(''.join(traceback.format_exception(*sys.exc_info())))

    def run(self):
        '''
        Runs the respective fold of the crossvalidation.
        '''
        logger.info('Running fold %d of %d...' % (self.params.fold_idx + 1, self.params.folds))
        directory = self.params.directory
        try:
#             # Apply noisy string clustering
#             log.debug('Transforming noisy strings...')
#             if self.params.noisyStringDomains is not None:
#                 noisyStrTrans = NoisyStringTransformer(self.params.mln, self.params.noisyStringDomains, True)
#                 learnDBs_ = noisyStrTrans.materializeNoisyDomains(self.params.learnDBs)
#                 testDBs_ = noisyStrTrans.transformDBs(self.params.testDBs)
#             else:
#                 learnDBs_ = self.params.learnDBs
#                 testDBs_ = self.params.testDBs

            # train the MLN
            mln = self.params.mln
            logger.debug('Starting learning...')
            learn_dbs = [db.copy() for db in self.params.learn_dbs]
            # apply closed world for fuzzy atoms
            for db in learn_dbs:
                for a, v in db.gndatoms([p.name for p in mln.predicates if isinstance(p, FuzzyPredicate)]):
                    if v != 1: db[a] = 0
                    
            learned = MLNLearn(config=self.params.learnconf, mln=mln, db=learn_dbs, multicore=False).run()#200
            # store the learned MLN in a file
            learned.tofile(os.path.join(directory, 'run_%d.mln' % self.params.fold_idx))
            logger.debug('Finished learning.')
            
            # evaluate the MLN
            logger.debug('Evaluating.')
#             learnedMLN.setClosedWorldPred(None)
#             if self.params.cwPreds is None:
#                 self.params.cwPreds = [p for p in mln.predicates if p != self.params.queryPred]
#             for pred in [pred for pred in self.params.cwPreds if pred in learnedMLN.predicates]:
#                 learnedMLN.setClosedWorldPred(pred)
            self.eval(learned, self.params.test_dbs)
            self.confmat.toFile(os.path.join(directory, 'conf_matrix_%d.cm' % self.params.fold_idx))
            logger.debug('Evaluation finished.')
        except (KeyboardInterrupt, SystemExit):
            logger.critical("Exiting...")
            return None
Example #11
     params.directory = expdir
     params.learnconf = project.learnconf
     params.queryconf = project.queryconf
     params.querypred = predname
     foldRunnables.append(XValFold(params))
     logger.info('Params for fold %d:\n%s' % (fold_idx, str(params)))
 
 if multicore:
     # set up a pool of worker processes
     try:
         workerPool = Pool()
         logger.info('Starting %d-fold Cross-Validation in %d processes.' % (folds, workerPool._processes))
         result = workerPool.map_async(runFold, foldRunnables).get()
         workerPool.close()
         workerPool.join()
         cm = ConfusionMatrix()
         for r in result:
             cm.combine(r.confmat)
         elapsedTimeMP = time.time() - startTime
         cm.toFile(os.path.join(expdir, 'conf_matrix.cm'))
         # create the pdf table and move it into the log directory
         # this is a dirty hack since pdflatex apparently
         # does not support arbitrary output paths
         pdfname = 'conf_matrix'
         logger.info('creating pdf of confusion matrix...')
         cm.toPDF(pdfname)
         os.rename('%s.pdf' % pdfname, os.path.join(expdir, '%s.pdf' % pdfname))
     except (KeyboardInterrupt, SystemExit, SystemError):
         logger.critical("Caught KeyboardInterrupt, terminating workers")
         workerPool.terminate()
         workerPool.join()
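
The multicore branch used in Examples 4, 8 and 11 follows the standard multiprocessing.Pool pattern: dispatch the folds with map_async, collect the results with get(), and terminate the workers on interruption. A minimal, self-contained sketch, with run_fold standing in for runFold and plain fold indices standing in for the XValFold objects:

from multiprocessing import Pool

def run_fold(fold_idx):
    # stand-in for runFold(XValFold); must be defined at module level
    # so it can be pickled and shipped to the worker processes
    return fold_idx * fold_idx

if __name__ == '__main__':
    pool = Pool()
    try:
        results = pool.map_async(run_fold, range(10)).get()
        pool.close()
        pool.join()
        print results
    except (KeyboardInterrupt, SystemExit, SystemError):
        pool.terminate()
        pool.join()
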