Example #1
def makeFileList(name) :
    # nested inside a method, so `self` is captured from the enclosing scope;
    # nothing to do if a cached file list exists and caching is enabled
    if os.path.exists(self.inputFilesListFile(name)) and self.useCachedFileLists() : return
    # the sample's filesCommand is a string of Python code that yields the file names
    fileNames = eval(self.sampleDict[name].filesCommand)
    assert fileNames, "The command '%s' produced an empty list of files" % self.sampleDict[name].filesCommand
    # pickle the list locally, then copy it to the global output area
    tmpDir, localFileName, globalFileName = self.globalToLocal(self.inputFilesListFile(name))
    utils.writePickle(localFileName, fileNames)
    self.localToGlobal(tmpDir, localFileName, globalFileName)
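
Examples #1 through #7 call a helper with the signature utils.writePickle(fileName, payload). A minimal sketch consistent with those call sites, assuming it is a thin wrapper around the standard cPickle module (the projects' actual helpers may differ), would be:

    import cPickle

    def writePickle(fileName, payload) :
        # serialize payload to fileName using the highest binary protocol
        pickleFile = open(fileName, "wb")
        cPickle.dump(payload, pickleFile, cPickle.HIGHEST_PROTOCOL)
        pickleFile.close()

    def readPickle(fileName) :
        # inverse of writePickle: load and return whatever was stored
        pickleFile = open(fileName, "rb")
        payload = cPickle.load(pickleFile)
        pickleFile.close()
        return payload
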
Example #2
def makeFileList(name) :
    # nested inside a method; `self` is captured from the enclosing scope
    fName = self.inputFilesListFile(name)
    if os.path.exists(fName) and self.useCachedFileLists() : return
    fileNames = eval(self.sampleDict[name].filesCommand)
    assert fileNames, "The command '%s' produced an empty list of files" % self.sampleDict[name].filesCommand
    utils.mkdir(os.path.dirname(fName))
    # store (fileName, nEvents) pairs rather than bare file names
    utils.writePickle(fName, zip(fileNames, map(nEventsFile, fileNames)))
Example #3
    def writePickle(self) :
        def pickleJar(step) :
            # steps may not pickle keys that analysisStep reserves for itself
            inter = set(step.varsToPickle()).intersection(set(['nPass', 'nFail', 'outputFileName']))
            assert not inter, "%s is trying to pickle %s, which %s reserved for use by analysisStep." % (step.name, str(inter), "is" if len(inter)==1 else "are")
            # bundle the step's declared attributes plus the reserved bookkeeping keys
            return dict([(item, getattr(step, item)) for item in step.varsToPickle()+['nPass', 'nFail']] +
                        [('outputFileName', getattr(step, 'outputFileName').replace(self.outputDir, self.globalDir))])

        utils.writePickle(self.pickleFileName,
                          [[pickleJar(step) for step in self.steps], self.calculablesUsed, self.leavesUsed])
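
The matching read path is not shown in this example; presumably it unpickles the same triple and restores each jar onto its step. A sketch under that assumption (utils.readPickle and the attribute names are taken from the surrounding code; the loop itself is not):

    jars, calculablesUsed, leavesUsed = utils.readPickle(self.pickleFileName)
    for step, jar in zip(self.steps, jars) :
        # restore every pickled attribute, including the reserved bookkeeping keys
        for item, value in jar.iteritems() :
            setattr(step, item, value)
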
Example #4
    def __init__(self, options) :
        self.__batch   = options.batch
        self.__resubmit= options.resubmit
        self.__loop    = options.loop
        self.__profile = options.profile
        self.__jobId   = options.jobId
        self.__tag     = options.tag
        self.__sample  = options.sample
        self.__site    = options.site if options.site is not None else sites.prefix()
        self.__tags    = options.tags.split(',') if isinstance(options.tags, str) else options.tags
        self.__samples = options.samples.split(',') if isinstance(options.samples, str) else options.samples
        self.__omit    = options.omit.split(',')
        self.__nocheck = options.nocheck
        self.__quiet   = options.quiet
        self.__skip    = options.skip

        self.moveOutputFiles = (self.__jobId is None) or sites.info(site = self.__site, key = "moveOutputFilesBatch")
        self.localStem  = "%s/%s"%(sites.info(site = self.__site, key = "localOutputDir" ), self.name)
        self.globalStem = "%s/%s"%(sites.info(site = self.__site, key = "globalOutputDir"), self.name)
    
        self.sampleDict = samples.SampleHolder()
        # merge every sample dictionary into the single holder
        map(self.sampleDict.update, self.listOfSampleDictionaries())

        if self.__tags is True or self.__samples is True :
            for conf in self.configurations :
                print conf['tag']
                if self.__samples is True :
                    print '\n'.join('  '+sample.weightedName for sample in self.filteredSamples(conf))
            sys.exit(0)
                
        if self.__loop:
            os.system("mkdir -p %s" % self.localStem)
            if self.__jobId is None:
                os.system("mkdir -p %s" % self.globalStem)
                self.makeInputFileLists()

        for conf in self.configurations:
            if options.slices < 0:
                nSlicesFixed = None
                nEventsPerSlice = -options.slices
            else:
                nSlicesFixed = options.slices
                nEventsPerSlice = None
            self.listsOfLoopers[conf['tag']] = self.sampleLoopers(conf, nSlicesFixed=nSlicesFixed, nEventsPerSlice=nEventsPerSlice, byEvents=options.byEvents)
            if self.__jobId is None and self.__loop:
                for looper in self.listsOfLoopers[conf['tag']]:
                    utils.writePickle(self.jobsFile(conf['tag'], looper.name, clean=True), looper.nSlices)
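
The sign convention on options.slices above is easy to miss: a negative value encodes a number of events per slice, a non-negative value a fixed slice count. A hypothetical helper that makes the convention explicit:

    def sliceSpec(slices) :
        # negative: -N events per slice; non-negative: N slices of whatever size
        if slices < 0 :
            return None, -slices
        return slices, None

    assert sliceSpec(4)     == (4, None)     # four slices per sample
    assert sliceSpec(-5000) == (None, 5000)  # 5000 events per slice
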
Example #5
    def __init__(self, options) :
        self.__batch   = options.batch
        self.__loop    = int(options.loop)   if options.loop is not None else None
        self.__nSlices = int(options.slices) if options.slices is not None else 1
        self.__profile = options.profile
        self.__jobId   = options.jobId
        self.__tag     = options.tag
        self.__sample  = options.sample
        self.__site    = options.site if options.site is not None else configuration.sitePrefix()
        self.__tags    = options.tags.split(',') if isinstance(options.tags, str) else options.tags
        self.__samples = options.samples.split(',') if isinstance(options.samples, str) else options.samples
        self.__omit    = options.omit.split(',')
        self.__nocheck = options.nocheck

        self.localStem  = "%s/%s"%(configuration.siteInfo(site = self.__site, key = "localOutputDir" ), self.name)
        self.globalStem = "%s/%s"%(configuration.siteInfo(site = self.__site, key = "globalOutputDir"), self.name)
    
        self.sampleDict = samples.SampleHolder()
        map(self.sampleDict.update,self.listOfSampleDictionaries())

        if self.__tags is True or self.__samples is True :
            for conf in self.configurations :
                print conf['tag']
                if self.__samples is True :
                    print '\n'.join('  '+sample.weightedName for sample in self.filteredSamples(conf))
            sys.exit(0)
                
        if self.__loop is not None :
            os.system("mkdir -p %s" % self.localStem)
            if self.__jobId is None :
                os.system("mkdir -p %s" % self.globalStem)
                self.makeInputFileLists()

        for conf in self.configurations :
            self.listsOfLoopers[conf['tag']] = self.sampleLoopers(conf)
            if self.__jobId is None and self.__loop is not None :
                for looper in self.listsOfLoopers[conf['tag']] :
                    utils.writePickle(self.jobsFile(conf['tag'], looper.name), self.__nSlices)
Example #6
            #t = timeit.Timer(stmt=code,setup='from __main__ import utils')
            #print(filename+'_'+str(i+1)+'.pickle')
            #print ('%f'%float(t.timeit(10/10)))
            sock = getConn()
            # header: token-prefixed pickle file name, newline-terminated
            sock.sendall('token' + filename + '_' + str(i) + '.pickle' + "\n")
            received = sock.recv(1024)
            print "Received: {}".format(received)
            # stream the raw pickle bytes; closing the socket marks end-of-file
            data = open(
                './pickles/token' + filename + '_' + str(i) + '.pickle',
                'rb').read()
            sock.sendall(data)

            sock.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename', '-f', help='Path of file to encrypt')
    parser.add_argument('--window', '-w', help='window size of the bucket')
    parser.add_argument('--chars', '-c', help='number of chars')
    args = parser.parse_args()

    # Same key is used for different files!
    key = nacl.utils.random(nacl.secret.SecretBox.KEY_SIZE)
    nonce = nacl.utils.random(nacl.secret.SecretBox.NONCE_SIZE)
    utils.writePickle('./pickles/key.pickle', key)
    parseRealData(args.filename, int(args.window), int(args.chars))
    #createData()
    #sendFM()
    #createTokens()
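
The wire format in this example is a newline-terminated file name header, an acknowledgement from the server, then the raw pickle bytes, with the socket close marking end-of-file. A minimal receiver consistent with that framing might look as follows; the port, buffer size, 'ok' acknowledgement, and ./received/ directory are assumptions, not taken from the original server:

    import socket

    def receiveOne(host='localhost', port=9999) :
        srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        srv.bind((host, port))
        srv.listen(1)
        conn, addr = srv.accept()
        # read the newline-terminated file name header, one byte at a time
        header = ''
        while not header.endswith('\n') :
            byte = conn.recv(1)
            if not byte : break
            header += byte
        conn.sendall('ok')  # the acknowledgement the client prints
        # stream the raw pickle bytes until the client closes the socket
        chunks = []
        while True :
            chunk = conn.recv(4096)
            if not chunk : break
            chunks.append(chunk)
        open('./received/' + header.strip(), 'wb').write(''.join(chunks))
        conn.close()
        srv.close()
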
Example #7
def worker(pickle, json) :
    # compute-once cache: write the pickle only if it does not already exist
    if not os.path.exists(pickle) : utils.writePickle(pickle, recordedInvMicrobarns(json))
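
This is the compute-once disk-cache pattern in its smallest form. Generalized, and with the read path included, it might look like the following hypothetical helper (reusing the os and utils names already in scope):

    def cachedCompute(pickleName, compute, *args) :
        # run compute only when no cached result exists, then return the cached value
        if not os.path.exists(pickleName) :
            utils.writePickle(pickleName, compute(*args))
        return utils.readPickle(pickleName)
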
Example #8
def train():
    X, Y, testX, testFid = prepare_data()
    X, Y, testX = str2float(X, Y, testX)
    X, testX = scale_data(X, testX)

    if args.select_feature:
        # feature selection placeholder (not yet implemented)
        pass
    for iter in range(args.sample_num):
        np.random.seed(iter)
        logger.info('Iteration %2d, Current random seed: %2d' %
                    (iter, np.random.get_state()[1][0]))
        # sampling data
        # trainX, trainY, validX, validY = sample_data(X, Y)
        trainX = X
        trainY = Y
        global dTrain
        dTrain = xgb.DMatrix(trainX, trainY, nthread=args.nthread)
        global dTest
        dTest = xgb.DMatrix(testX, nthread=args.nthread)

        # bayes_opt selection
        # hyperparameter ranges to be tuned
        logger.info('Setting parameters for BayesianOptimization')
        params = {
            'max_depth': (10, args.xgb_max_depth),
            'subsample': (args.xgb_subsample, 1),
            'min_child_weight': (1, args.xgb_min_child_weight),
            'alpha': (0, args.xgb_alpha),
            'gamma': (0, args.xgb_gamma),
            'colsample_bytree': (args.xgb_colsample_bytree, 1),
            'colsample_bylevel': (args.xgb_colsample_bylevel, 1),
            'learning_rate':
            (args.xgb_learning_rate_lower, args.xgb_learning_rate_upper)
        }
        logger.info('Running BayesianOptimization')
        xgb_bayesopt = BayesianOptimization(train_xgb, params)
        xgb_bayesopt.maximize(init_points=5, n_iter=25)

        # get the best param
        best_params = xgb_bayesopt.res['max']['max_params']
        logger.info('Iteration: %d, XGBoost max auc: %f' %
                    (iter, xgb_bayesopt.res['max']['max_val']))
        for param, val in best_params.items():
            logger.info('Param %s: %r' % (param, val))
        # setting xgboost param
        logger.info('Setting best parameters for BayesianOptimization')
        xgb_params = {
            'nthread': args.nthread,
            'n_estimators': args.xgb_n_estimators,
            'eta': args.xgb_eta,
            'silent': args.xgb_silent,
            # for _train_internal
            'eval_metric': [args.xgb_eval_metric],
            ######################
            'max_depth': int(best_params['max_depth']),
            'subsample': max(min(best_params['subsample'], 1), 0),
            'min_child_weight': int(best_params['min_child_weight']),
            'alpha': max(best_params['alpha'], 0),
            'gamma': max(best_params['gamma'], 0),
            'colsample_bytree': max(min(best_params['colsample_bytree'], 1),
                                    0),
            'colsample_bylevel': max(min(best_params['colsample_bylevel'], 1),
                                     0),
            'learning_rate': max(min(best_params['learning_rate'], 1), 0),
        }
        # training
        model = xgb.train(xgb_params,
                          dTrain,
                          num_boost_round=args.xgb_num_boost_rounds,
                          verbose_eval=args.xgb_verbose_eval,
                          maximize=args.xgb_maximize)
        writePickle(
            model,
            os.path.join(
                'mdl', 'model_iter%d_%dfold_%f.pkl' %
                (iter, args.xgb_nfold, xgb_bayesopt.res['max']['max_val'])))
        # predict valid y
        predY = model.predict(dTest)
        result_df = pd.DataFrame(data={'y': predY})
        joined_df = pd.DataFrame(testFid).join(result_df)

        joined_df.to_csv(os.path.join(
            'result', 'xgb_result%d_%dfold.csv' % (iter, args.xgb_nfold)),
                         index=False)

        # the fids were re-sorted because of the random data splitting
        logger.info(
            '----------------------------------------------------------------------\n\n\n'
        )
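
Note that the writePickle in Example #8 takes (object, path), the reverse of the utils.writePickle(fileName, payload) order seen in Examples #1 through #7, so it is evidently a different helper. A minimal implementation consistent with this call site (an assumption, not the project's actual code):

    import pickle

    def writePickle(obj, path):
        # persist obj to path; note the (object, path) argument order of Example #8
        with open(path, 'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)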