def _loadDatabases(mln, dbfiles):
    """Read every file in *dbfiles* and return all databases as one flat list."""
    dbs = []
    for dbfile in dbfiles:
        db = readDBFromFile(mln, dbfile)
        # readDBFromFile may hand back a single database or a list of them
        if isinstance(db, list):
            dbs.extend(db)
        else:
            dbs.append(db)
    return dbs


def _partitionDBs(dbs, folds):
    """Split *dbs* into *folds* parts whose sizes differ by at most one."""
    # Striding (instead of ceil-sized slices) guarantees that no part is
    # empty as long as len(dbs) >= folds.
    return [dbs[i::folds] for i in range(folds)]


def _splitTestAndLearn(partition, foldIdx, testSetCount):
    """Return the (testDBs, learnDBs) pair for the fold with index *foldIdx*."""
    remaining = list(partition)
    testDBs = []
    for _ in range(testSetCount):
        # After a part has been removed, the same index addresses the next
        # part; once the index runs past the end, wrap around to the front.
        idx = foldIdx if foldIdx < len(remaining) else 0
        testDBs.extend(remaining.pop(idx))
    learnDBs = []
    for part in remaining:
        learnDBs.extend(part)
    return testDBs, learnDBs


def doXVal(folds, percent, verbose, multicore, noisy, predName, domain,
           mlnfile, dbfiles, logicLearn, logicInfer, inverse=False,
           testSetCount=1):
    """Run a k-fold cross-validation and write the results to a new directory.

    A time-stamped result directory (with the sub-directories ``FOL`` and
    ``FUZZY`` and a log file ``xval.log``) is created in the current working
    directory. The MLN and all databases are read, the databases are
    shuffled and partitioned into ``folds`` parts, one ``XValFold`` is built
    per fold, and the folds are executed either in a worker pool or
    sequentially. Finally the confusion matrices found in ``FOL`` and
    ``FUZZY`` are merged via ``prepareResults``.

    :param folds:        number of folds (k).
    :param percent:      percentage of the databases to use; values of 100
                         or more use all of them.
    :param verbose:      forwarded to ``readMLNFromFile``.
    :param multicore:    if true, run the folds in a ``Pool`` of worker
                         processes, otherwise one after the other.
    :param noisy:        stored as ``noisyStringDomains`` in the fold params.
    :param predName:     stored as ``queryPred`` in the fold params.
    :param domain:       stored as ``queryDom`` in the fold params.
    :param mlnfile:      path of the MLN file to read.
    :param dbfiles:      iterable of database file paths.
    :param logicLearn:   accepted for interface compatibility; not used by
                         this function.
    :param logicInfer:   stored as ``logicInfer`` in the fold params.
    :param inverse:      accepted for interface compatibility; not used by
                         this function.
    :param testSetCount: number of parts that make up the test set of each
                         fold; the remaining parts form the learning set.

    Terminates the process with exit status 1 if there are fewer databases
    than folds, or if the multi-process run is interrupted or fails.
    """
    startTime = time.time()
    directory = time.strftime(
        "%a_%d_%b_%Y_%H:%M:%S_K=" + str(folds) + "_TSC=" + str(testSetCount),
        time.localtime())
    os.mkdir(directory)
    for subdir in ('FOL', 'FUZZY'):
        os.mkdir(os.path.join(directory, subdir))

    # set up the logger
    log = logging.getLogger('xval')
    fileLogger = FileHandler(os.path.join(directory, 'xval.log'))
    fileLogger.setFormatter(praclog.formatter)
    log.addHandler(fileLogger)
    log.info('Results will be written into %s' % directory)

    # preparations: read the MLN and the databases
    mln_ = readMLNFromFile(mlnfile, verbose=verbose, logic='FuzzyLogic',
                           grammar='PRACGrammar')
    log.info('Read MLN %s.' % mlnfile)
    dbs = _loadDatabases(mln_, dbfiles)
    log.info('Read %d databases.' % len(dbs))

    # optionally restrict the experiment to a random subset of the data;
    # sampling only when the subset is really smaller avoids a ValueError
    # from sample() for percent > 100
    subsetLen = int(math.ceil(len(dbs) * percent / 100.0))
    if subsetLen < len(dbs):
        log.info('Using only %d of %d DBs' % (subsetLen, len(dbs)))
        dbs = sample(dbs, subsetLen)
    if len(dbs) < folds:
        log.error(
            'Cannot do %d-fold cross validation with only %d databases.'
            % (folds, len(dbs)))
        # this is a failure, so do not report success to the caller's shell
        sys.exit(1)

    # create the partition of data
    shuffle(dbs)
    partition = _partitionDBs(dbs, folds)

    foldRunnables = []
    for foldIdx in range(folds):
        params = XValFoldParams()
        params.mln = mln_.duplicate()
        params.testDBs, params.learnDBs = _splitTestAndLearn(
            partition, foldIdx, testSetCount)
        print('LEARN DBS :' + str(len(params.learnDBs)))
        print('TEST DBS :' + str(len(params.testDBs)))
        params.foldIdx = foldIdx
        params.foldCount = folds
        params.noisyStringDomains = noisy
        params.directory = directory
        params.queryPred = predName
        params.queryDom = domain
        params.logicInfer = logicInfer
        foldRunnables.append(XValFold(params))

    if multicore:
        # set up a pool of worker processes
        workerPool = None
        try:
            workerPool = Pool()
            log.info('Starting %d-fold Cross-Validation in %d processes.'
                     % (folds, workerPool._processes))
            result = workerPool.map_async(runFold, foldRunnables).get()
            workerPool.close()
            workerPool.join()
            # NOTE(review): this in-memory aggregate is never read afterwards;
            # the persisted results are merged by prepareResults() below.
            # Kept because it touches the fold results - confirm before
            # removing.
            cm = ConfusionMatrix()
            for r in result:
                cm.combine(r.confMatrix)
            elapsedTimeMP = time.time() - startTime
            prepareResults(directory, 'FOL')
            prepareResults(directory, 'FUZZY')
        except (KeyboardInterrupt, SystemExit, SystemError):
            log.critical("Caught KeyboardInterrupt, terminating workers")
            # the pool may not exist yet if the interrupt hit Pool() itself
            if workerPool is not None:
                workerPool.terminate()
                workerPool.join()
            sys.exit(1)
        except Exception:
            log.error('\n' + ''.join(
                traceback.format_exception(*sys.exc_info())))
            sys.exit(1)
    else:
        log.info('Starting %d-fold Cross-Validation in 1 process.' % (folds))
        for fold in foldRunnables:
            runFold(fold)
        prepareResults(directory, 'FOL')
        prepareResults(directory, 'FUZZY')
        elapsedTimeSP = time.time() - startTime

    if multicore:
        log.info('%d-fold crossvalidation (MP) took %.2f min'
                 % (folds, elapsedTimeMP / 60.0))
    else:
        log.info('%d-fold crossvalidation (SP) took %.2f min'
                 % (folds, elapsedTimeSP / 60.0))
params.querypred = predname foldRunnables.append(XValFold(params)) logger.info('Params for fold %d:\n%s' % (fold_idx, str(params))) if multicore: # set up a pool of worker processes try: workerPool = Pool() logger.info('Starting %d-fold Cross-Validation in %d processes.' % (folds, workerPool._processes)) result = workerPool.map_async(runFold, foldRunnables).get() workerPool.close() workerPool.join() cm = ConfusionMatrix() for r in result: cm.combine(r.confmat) elapsedTimeMP = time.time() - startTime cm.toFile(os.path.join(expdir, 'conf_matrix.cm')) # create the pdf table and move it into the log directory # this is a dirty hack since pdflatex apparently # does not support arbitrary output paths pdfname = 'conf_matrix' logger.info('creating pdf if confusion matrix...') cm.toPDF(pdfname) os.rename('%s.pdf' % pdfname, os.path.join(expdir, '%s.pdf' % pdfname)) except (KeyboardInterrupt, SystemExit, SystemError): logger.critical("Caught KeyboardInterrupt, terminating workers") workerPool.terminate() workerPool.join() exit(1)
def prepareResults(directory, logic):
    """Merge all pickled confusion matrices of one result sub-directory.

    Every file in ``<directory>/<logic>`` is unpickled and combined into a
    single ``ConfusionMatrix``, which is then written to
    ``<directory>/<logic>/conf_matrix.cm``.

    :param directory: root directory of the cross-validation results.
    :param logic:     name of the sub-directory to process (``doXVal`` calls
                      this with ``'FOL'`` and ``'FUZZY'``).
    """
    resultDir = os.path.join(directory, logic)
    outputName = 'conf_matrix.cm'
    cm = ConfusionMatrix()
    for filename in os.listdir(resultDir):
        # Skip the merged output of an earlier run so that it is neither
        # counted twice nor fed to the unpickler.
        if filename == outputName:
            continue
        # NOTE: pickle is only safe on trusted input; the files here are
        # presumably the matrices written by the folds themselves.
        with open(os.path.join(resultDir, filename), 'rb') as stream:
            matrix = pickle.load(stream)
        cm.combine(matrix)
    cm.toFile(os.path.join(resultDir, outputName))
params.queryconf = project.queryconf params.querypred = predname foldRunnables.append(XValFold(params)) logger.info('Params for fold %d:\n%s' % (fold_idx, str(params))) if multicore: # set up a pool of worker processes try: workerPool = Pool() logger.info('Starting %d-fold Cross-Validation in %d processes.' % (folds, workerPool._processes)) result = workerPool.map_async(runFold, foldRunnables).get() workerPool.close() workerPool.join() cm = ConfusionMatrix() for r in result: cm.combine(r.confmat) elapsedTimeMP = time.time() - startTime cm.toFile(os.path.join(expdir, 'conf_matrix.cm')) # create the pdf table and move it into the log directory # this is a dirty hack since pdflatex apparently # does not support arbitrary output paths pdfname = 'conf_matrix' logger.info('creating pdf if confusion matrix...') cm.toPDF(pdfname) os.rename('%s.pdf' % pdfname, os.path.join(expdir, '%s.pdf' % pdfname)) except (KeyboardInterrupt, SystemExit, SystemError): logger.critical("Caught KeyboardInterrupt, terminating workers") workerPool.terminate() workerPool.join() exit(1) except: