Example #1
0
def test_threshold_detector_returns_expected_results(path_to_config_threshold,
                                                     path_to_output_reference,
                                                     make_tmp_folder):
    """Regression test: threshold detection must reproduce the stored reference.

    Seeds the RNG for determinism, configures yass from the threshold config,
    preprocesses into the temporary folder, runs detection, and compares the
    resulting clear spikes against the saved ``.npy`` reference array.
    """
    util.seed(0)
    yass.set_config(path_to_config_threshold, make_tmp_folder)

    preprocessed = preprocess.run(output_directory=make_tmp_folder)
    std_path, std_params, whitening = preprocessed

    clear_spikes = detect.run(std_path, std_params, whitening)

    reference_file = path.join(path_to_output_reference,
                               'detect_threshold_clear.npy')
    ReferenceTesting.assert_array_equal(clear_spikes, reference_file)
Example #2
0
def main():
    """Run the configured number of runs, keeping and reporting the best result.

    Reads the config file named on the command line (via ``util.gcfg``), seeds
    the RNG either from the config's SEED entry (when non-empty) or from
    ``util.seed()``, then executes ``run()`` cfg[MAIN][RUNS] times, retaining
    the individual with the highest fitness and logging a summary at the end.
    """
    cfg = util.readConfig(util.gcfg())

    # Seed from the config when given; otherwise let util generate one.
    if cfg[MAIN][SEED] == '':
        cseed = util.seed()
    else:
        cseed = float(cfg[MAIN][SEED])

    random.seed(cseed)

    util.loadCSV(cfg[AGENT][CSV_FILE])

    lg = log(cfg, cseed, util.gcfg())
    util.renderHead(cfg)

    best = None

    for i in range(int(cfg[MAIN][RUNS])):
        lg.sep(i)

        nbest = run(cfg, i, lg)

        # Determine if our new potential best is better,
        #   this just uses the average of the two fitness values versus the bad opponents
        # PEP 8: compare to None with `is` / `is not`, never `==` / `!=`.
        if best is None or nbest.fit > best.fit:
            if best is not None:
                best.delete()
            best = nbest

    lg.best(best)
    lg.absBestFinish(cfg, best)
    lg.wrapUp(best)

    print("\n")
Example #3
0
def main():
    """Devel-stage pipeline: load clustered compound-protein data, resample the
    development set via SMOTE, train/test an SVM-family classifier, score the
    unlabeled release set, and write predictions plus run logs to outputDir.

    Usage: python -m scoop devel.py [cloneID] [clusterDir] [outputDir]

    Python 2 code (print statements, dict.iteritems). Relies on module-level
    imports not visible in this chunk: sys, os, time, json, pickle, yaml,
    h5py, numpy as np, scoop futures as fu, plus project modules/objects
    cfg, util, cutil, sh, eSVM, svm, tts, ensembleSmote.
    """
    if len(sys.argv)!=4:
        print 'USAGE:'
        print 'python -m scoop devel.py [cloneID] [clusterDir] [outputDir]'
        print 'see devel_config.py'
        return

    cloneID = sys.argv[1]
    clusterDir = sys.argv[2]; assert clusterDir[-1]=='/',"should be ended with '/'"
    baseOutDir = sys.argv[3]; assert baseOutDir[-1]!='/',"should NOT be ended with '/'"

    # Bind clfParam to the chosen method's config dict; the import statement
    # rebinds the local name clfParam declared just above.
    clfParam = None
    method = cfg['method']
    if method=='esvm':
        from esvm_config import config as clfParam
    elif method=='psvm':
        from psvm_config import config as clfParam
    else:
        print 'FATAL: unknown method'
        return

    outDir = os.path.join(baseOutDir,'devel-'+os.path.basename(baseOutDir))
    if not(os.path.isdir(baseOutDir)): os.makedirs(baseOutDir)
    if not(os.path.isdir(outDir)): os.makedirs(outDir)

    ## Load data ###################################################################################
    dataLog = {}; dataLogFpath = os.path.join(outDir,'data_log_'+os.path.basename(baseOutDir)+'.json')
    # Dataset name is encoded in the cluster directory path segment, e.g.
    # .../something-yamanishi#nr/ -> 'yamanishi#nr'.
    dataset = clusterDir.split('/')[-2].split('-')[-1]; dataLog['dataset'] = dataset
    datasetParams = dataset.split('#')
    assert datasetParams[0]=='yamanishi'

    xyDevFpath = os.path.join(baseOutDir,'_'.join(['xdev','ydev','xrel','yrel']+datasetParams)+'.h5')
    if os.path.exists(xyDevFpath):
        # Fast path: reuse features cached by a previous run (written below).
        print 'loading data from PREVIOUS...'

        with h5py.File(xyDevFpath,'r') as f:
            xdev = f['xdev'][:]
            ydev = f['ydev'][:]
            xrel = f['xrel'][:]
            yrel = f['yrel'][:]
            xrelraw = f['xrelraw'][:]

        # NOTE(review): yaml.load without an explicit Loader is deprecated and
        # unsafe on untrusted input; yaml.safe_load would suffice for this JSON.
        with open(dataLogFpath,'r') as f:
            dataLog = yaml.load(f)

    else:
        print 'loading data FRESHLY...'

        print 'loading cluster result...'
        nUnlabels = []
        statFnames = [i for i in os.listdir(clusterDir) if 'labels_stat.json' in i]
        for i in statFnames:
            with open(os.path.join(clusterDir,i),'r') as f: stat = yaml.load(f)
            nUnlabels.append(stat['0'])

        # use the cluster with minimum numbers of unlabeled samples
        metric = '_'.join(statFnames[ nUnlabels.index(min(nUnlabels)) ].split('_')[0:2])
        dataLog['metric'] = metric

        connFpath = os.path.join(clusterDir,metric+'_labels.pkl')
        with open(connFpath,'r') as f:
            data = pickle.load(f)

        ##
        # Flatten {label: [samples]} into parallel x/y lists. Label 0 marks
        # unlabeled ("release") samples; non-zero labels form the devel set.
        print 'getting devel and release data...'
        xraw = []; yraw = []
        for k,v in data.iteritems():
            for vv in v:
                xraw.append(vv)
                yraw.append(k)

        devIdx = [i for i in range(len(xraw)) if yraw[i]!=0]
        xdev = [xraw[i] for i in devIdx]
        ydev = [yraw[i] for i in devIdx]

        relIdx = [i for i in range(len(xraw)) if yraw[i]==0]
        xrel = [xraw[i] for i in relIdx]
        yrel = [yraw[i] for i in relIdx]

        # Record sample counts and class-balance ratios for the run log.
        dataLog['nDevel'] = len(devIdx); dataLog['nData'] = len(yraw)
        dataLog['rDevel:Data'] = dataLog['nDevel']/float(dataLog['nData'])
        dataLog['nDevel(+)'] = len( [i for i in ydev if i==1] ); assert dataLog['nDevel(+)']!=0
        dataLog['nDevel(-)'] = len( [i for i in ydev if i==-1] ); assert dataLog['nDevel(-)']!=0
        dataLog['rDevel(+):Devel'] = float(dataLog['nDevel(+)'])/dataLog['nDevel']
        dataLog['rDevel(-):Devel'] = float(dataLog['nDevel(-)'])/dataLog['nDevel']
        dataLog['rDevel(+):(-)'] = float(dataLog['nDevel(+)'])/float(dataLog['nDevel(-)'])
        dataLog['nRelease'] = len(relIdx);
        dataLog['rRelease:Data'] = dataLog['nRelease']/float(dataLog['nData'])

        ##
        # Load the per-compound ('klekotaroth') and per-protein
        # ('amino-acid-composition') feature tables from HDF5, keyed by id.
        print 'loading com, pro feature...'
        krFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature',
                               'klekotaroth','klekotaroth-'+datasetParams[1]+'.h5')
        aacFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature',
                                'amino-acid-composition','amino-acid-composition-'+datasetParams[1]+'.h5')

        krDict = {}; aacDict = {}
        with h5py.File(krFpath, 'r') as f:
            for com in [str(i) for i in f.keys()]:
                krDict[com] = f[com][:]
        with h5py.File(aacFpath, 'r') as f:
            for pro in [str(i) for i in f.keys()]:
                aacDict[pro] = f[pro][:]
                # aacDict[pro] = list( fu.map(lambda x: float('%.2f'%(x)),f[pro][:]) ) # rounding

        # Feature vector lengths, taken from an arbitrary entry of each dict.
        comFeaLenOri = len(krDict.values()[0])
        proFeaLenOri = len(aacDict.values()[0])

        ##
        msg = 'extract (com,pro) feature... dims: '+str(comFeaLenOri)+','+str(proFeaLenOri)
        msg += ' of '+str(len(ydev))+' and '+str(len(yrel))
        print msg

        # Publish the feature dicts to scoop workers, then extract per-pair
        # feature vectors in parallel.
        sh.setConst(krDict=krDict)
        sh.setConst(aacDict=aacDict)
        xdevf = list( fu.map(cutil.extractComProFea,xdev) )
        xrelf = list( fu.map(cutil.extractComProFea,xrel) )

        ##
        # Split the devel samples into batches so SMOTE can run per batch;
        # optionally cap the number of batches.
        xyDevList = cutil.divideSamples(xdevf,ydev,cfg['smoteBatchSize'])
        if cfg['maxNumberOfSmoteBatch'] != 0:
            xyDevList = xyDevList[0:cfg['maxNumberOfSmoteBatch']]

        smoteSeed = util.seed(); dataLog['smoteSeed'] = smoteSeed
        sh.setConst(smoteSeed=smoteSeed)

        print 'resampling via Smote FRESHLY... '+str(len(xyDevList))+' smote(s)'+' on '+str(len(ydev))
        smoteTic = time.time()

        # Run SMOTE per batch in parallel and flatten the resampled batches
        # back into flat xdevfr/ydevr lists.
        xdevfr = []; ydevr = []
        xydevfrList = list( fu.map(ensembleSmote,xyDevList) )
        for xdevfri,ydevri in xydevfrList:
            for x in xdevfri: xdevfr.append(x.tolist())
            for y in ydevri: ydevr.append(y)
        assert len(xdevfr)==len(ydevr),'len(xdevfr)!=len(ydevr)'

        dataLog['nSmote'] = len(xyDevList)
        dataLog['nDevelResampled'] = len(ydevr)
        dataLog['rDevelResampled:Data'] = dataLog['nDevelResampled']/float(dataLog['nData'])
        dataLog['nDevelResampled(+)'] = len( [i for i in ydevr if i==1] )
        dataLog['nDevelResampled(-)'] = len( [i for i in ydevr if i==-1] )
        dataLog['rDevelResampled(+):DevelResampled'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled'])
        dataLog['rDevelResampled(-):DevelResampled'] = dataLog['nDevelResampled(-)']/float(dataLog['nDevelResampled'])
        dataLog['rDevelResampled(+):(-)'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled(-)'])
        dataLog['timeSMOTE'] =  str(time.time()-smoteTic)

        ##
        # Replace the raw lists with extracted/resampled versions; keep the
        # unextracted release samples around as xrelraw for the result file.
        print 'update xdev,ydev,xrel... '+str(np.asarray(xdevfr).shape)
        xrelraw = xrel[:] # raw: feature is NOT extracted
        xrel = xrelf[:]
        xdev = xdevfr[:]
        ydev = ydevr[:]

        # Cache everything so subsequent runs take the fast path above.
        print 'writing updated xdev,ydev and xrel,yrel...'
        with h5py.File(xyDevFpath,'w') as f:
            f.create_dataset('xdev',data=xdev,dtype=np.float32)
            f.create_dataset('ydev',data=ydev,dtype=np.int8)
            f.create_dataset('xrel',data=xrel,dtype=np.float32)
            f.create_dataset('yrel',data=yrel,dtype=np.int8)
            f.create_dataset('xrelraw',data=xrelraw)

        print 'writing dataLog...'
        dataLog['nCom'] = len(krDict)
        dataLog['nPro'] = len(aacDict)
        with open(dataLogFpath,'w') as f:
            json.dump(dataLog,f,indent=2,sort_keys=True)

    ## TUNE+TRAIN+TEST #############################################################################
    devLog = {}
    # NOTE(review): 'devSeed' is added to dataLog AFTER dataLog was dumped to
    # JSON above (fresh path), so it never reaches the on-disk log -- confirm.
    devSeed = util.seed(); dataLog['devSeed'] = devSeed
    tag = '_'.join([method+'#'+cloneID,dataset,util.tag()])

    ## split devel dataset
    msg = ' '.join( ['devel',dataset,cloneID])
    xtr,xte,ytr,yte = tts(xdev,ydev,test_size=cfg['testSize'],
                          random_state=devSeed,stratify=ydev)

    # Optionally cap the test set by random sampling (with replacement,
    # since np.random.randint may repeat indices).
    if cfg['maxTestingSamples']>0:
        chosenIdx = np.random.randint(len(xte),size=cfg['maxTestingSamples'])
        xte = [xte[i] for i in chosenIdx]; yte = [yte[i] for i in chosenIdx]

    devLog['nTraining'] = len(xtr)
    devLog['nTraining(+)'] = len([i for i in ytr if i==1])
    devLog['nTraining(-)'] = len([i for i in ytr if i==-1])
    devLog['rTraining(+):(-)'] = devLog['nTraining(+)']/float(devLog['nTraining(-)'])
    devLog['rTraining:Devel'] = devLog['nTraining']/float(dataLog['nDevelResampled'])
    devLog['nTesting'] = len(xte)
    devLog['nTesting(+)'] = len([i for i in yte if i==1])
    devLog['nTesting(-)'] = len([i for i in yte if i==-1])
    devLog['rTesting(+):(-)'] = devLog['nTesting(+)']/float(devLog['nTesting(-)'])
    devLog['rTesting:Devel'] = devLog['nTesting']/float(dataLog['nDevelResampled'])

    ## tuning
    clf = None
    if method=='esvm':
        clf  = eSVM(simMat=None)
    elif method=='psvm':
        clf = svm.SVC(kernel=clfParam['kernel'],probability=True)

    ## training
    print msg+': fitting nTr= '+str(len(ytr))
    trTic = time.time()

    if method=='esvm':
        clf.fit(xtr,ytr)
        devLog['labels'] = clf.labels()
        devLog['nSVM'] = clf.nSVM()
        devLog['xtrDimAllBatches'] = clf.xtrDimAllBatches()
    elif method=='psvm':
        # NOTE(review): cfg['method'] is the method STRING ('psvm' here, see
        # top of function), so cfg['method']['kernel'] looks like it would
        # raise TypeError; clfParam['kernel'] was probably intended -- confirm.
        if cfg['method']['kernel']=='precomputed':
            assert False
            # simMatTr = cutil.makeComProKernelMatFromSimMat(xtr,xtr,simMat)
            # clf.fit(simMatTr,ytr)
        else:
            clf.fit(xtr,ytr)
        devLog['labels'] = clf.classes_.tolist()
    devLog['timeTraining'] = str(time.time()-trTic)

    ## testing
    print msg+': predicting nTe= '+str(len(yte))
    teTic = time.time()

    if method=='esvm':
        ypred,yscore = clf.predict(xte)
    elif method=='psvm':
        if cfg['method']['kernel']=='precomputed':
            assert False
            # simMatTe = cutil.makeComProKernelMatFromSimMat(xte,xtr,simMat)
            # ypred = clf.predict(simMatTe)
            # yscore = clf.predict_proba(simMatTe)
        else:
            ypred = clf.predict(xte)
            yscore = clf.predict_proba(xte)
            # keep only the winning-class probability per sample
            yscore = [max(i.tolist()) for i in yscore]
    devLog['timeTesting'] = str(time.time()-teTic)

    ## TEST RELEASE ################################################################################
    print msg+': predicting RELEASE n= '+str(len(yrel))
    relTic = time.time()

    # The placeholder zero labels in yrel are overwritten with the
    # classifier's predictions for the release set below.
    if method=='esvm':
        yrel,yrelscore = clf.predict(xrel)
    elif method=='psvm':
        if cfg['method']['kernel']=='precomputed':
            assert False
            # simMatTe = cutil.makeComProKernelMatFromSimMat(xrel,xtr,simMat)
            # yrel = clf.predict(simMatTe)
            # yrelscore = clf.predict_proba(simMatTe)
        else:
            yrel = clf.predict(xrel)
            yrelscore = clf.predict_proba(xrel)
            yrelscore = [max(i.tolist()) for i in yrelscore]
    devLog['timeRelease'] = str(time.time()-relTic)

    ## WRITE RESULT ################################################################################
    result = {'yte':yte,'ypred':ypred,'yscore':yscore,
              'xrelraw':xrelraw,'yrel':yrel,'yrelscore':yrelscore}

    print 'writing prediction...'
    with h5py.File(os.path.join(outDir,'result_'+tag+'.h5'),'w') as f:
        for k,v in result.iteritems():
            if 'raw' in k:
                f.create_dataset(k,data=v)
            else:
                # labels stored as int8, scores as float32
                dt = np.int8
                if 'score' in k: dt = np.float32
                f.create_dataset(k,data=v,dtype=dt)

    ##
    print 'writing devLog...'
    devLog['clfParam'] = clfParam
    devLog['devParam'] = cfg
    with open(os.path.join(outDir,'devLog_'+tag+'.json'),'w') as f:
        json.dump(devLog,f,indent=2,sort_keys=True)
Example #4
0
def main():
    """Run a 300-generation two-species (sheep/bear) population simulation.

    Seeds the RNG, configures identical life-history parameters for both
    species, builds the initial populations, then advances the simulation
    writing per-generation counts to 'py_pennaLV_week03.txt' and printing
    final totals.
    """
    # fixed seed -> reproducible runs
    util.seed(0)

    # both species share the same life-history parameters
    for species in (Sheep, Bear):
        species.set_gene_size(32)
        species.prop.repr_age = 8
        species.prop.threshold = 3
        species.prop.mut_rate = 2
        species.prop.N_max = 1000
        species.prop.N_init = 1000

    # initial population: all sheep first, then all bears
    population = []
    for species in (Sheep, Bear):
        for _ in range(species.prop.N_init):
            population.append(species(species.random_age()))
            species.prop.N_t += 1

    with open("py_pennaLV_week03.txt", "w") as ofs:
        ofs.write("time sheep bear\n")

        # simulation parameters, sheep first then bear-prefixed entries
        sheep_line = ("#param seed {} N_init {} N_max {} gene_size {} "
                      "repr_age {} mut_rate {} threshold {}\n").format(
            util.seed(), Sheep.prop.N_init, Sheep.prop.N_max,
            Sheep.prop.gene_size, Sheep.prop.repr_age,
            Sheep.prop.mut_rate, Sheep.prop.threshold)
        ofs.write(sheep_line)

        bear_line = ("b_N_init {} b_N_max {} b_gene_size {} "
                     "b_repr_age {} b_mut_rate {} b_threshold {}\n").format(
            Bear.prop.N_init, Bear.prop.N_max, Bear.prop.gene_size,
            Bear.prop.repr_age, Bear.prop.mut_rate, Bear.prop.threshold)
        ofs.write(bear_line)

        # advance the simulation, logging counts each generation
        for generation in range(300):
            # iterate a snapshot so in-place removal/insertion is safe
            for animal in population[:]:
                if not animal.progress():
                    population.remove(animal)
                    animal.prop.N_t -= 1
                elif animal.adult():
                    population.insert(0, animal.make_child())
                    animal.prop.N_t += 1

            ofs.write("{} {} {}\n".format(generation, Sheep.prop.N_t,
                                          Bear.prop.N_t))

    print("total sheep: {}".format(Sheep.prop.N_t))
    print("total bear:  {}".format(Bear.prop.N_t))
    print(population[-1])
Example #5
0
 def train(self,
           writer=None,
           batch_size=8,
           lr=1 * 10**-5,
           num_epochs=3,
           seed=None):
     """Fine-tune ``self.classifier`` on ``self.train_dataset``.

     Runs up to ``num_epochs`` epochs of AdamW optimization with gradient
     clipping, optional mixed precision (apex ``amp``) and multi-GPU
     ``DataParallel``, evaluating test loss after each epoch and stopping
     early after 3 consecutive epochs without improvement.

     Args:
         writer: optional summary writer; receives the per-batch mean loss.
         batch_size: mini-batch size for the training DataLoader.
         lr: AdamW learning rate.
         num_epochs: maximum number of epochs to run.
         seed: when not None, ``util.seed(seed + epoch)`` is called at the
             start of each epoch so results do not depend on whether tests
             ran in between epochs.
     """
     max_grad_norm = 1.0  # gradient-clipping threshold
     test_losses = []
     # stop after 3 consecutive epochs without test-loss improvement
     should_stop = early_stopping.ConsecutiveNonImprovment(3)
     # record hyper-parameters for reproducibility / later inspection
     self.training_parameters.append({
         "batch_size": batch_size,
         "lr": lr,
         "num_epochs": num_epochs,
         "seed": seed,
         "base_model": self.base_model,
     })
     self.optimizer = AdamW(self.classifier.parameters(), lr=lr)
     self.mixed_precision_setup()
     # wrap once for multi-GPU; the type check avoids double-wrapping on
     # repeated train() calls
     if self.multi_gpu and type(self.classifier) != torch.nn.DataParallel:
         self.classifier = torch.nn.DataParallel(self.classifier)
     # NOTE(review): "warumup" looks like a typo for "warmup", but the name
     # must match the method defined elsewhere in this class.
     scheduler = self.warumup_cooldown_scheduler(self.optimizer, num_epochs,
                                                 batch_size)
     for epoch in range(num_epochs):
         # To not depend on if we run tests after each epoch we need to seed here
         if seed is not None:
             util.seed(seed + epoch)
         print(f"Starting training epoch {epoch + 1}/{num_epochs}")
         self.classifier.train()
         loader = DataLoader(self.train_dataset,
                             batch_size=batch_size,
                             collate_fn=PaddedBatch,
                             shuffle=True)
         for batch in tqdm(loader):
             self.optimizer.zero_grad()
             # model computes the loss internally from the float labels
             loss, logits = self.classifier(
                 batch.token_ids.cuda(),
                 token_type_ids=batch.sequence_ids.cuda(),
                 attention_mask=batch.input_mask.cuda(),
                 labels=batch.labels.float().cuda(),
             )
             self.num_batches += 1
             if writer:
                 writer.add_scalar("cross entropy loss per batch",
                                   loss.mean(), self.num_batches)
             # backward + clip: amp path scales the loss and clips the
             # master (fp32) params; plain path clips model params directly
             if self.mixed_precision:
                 with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                     scaled_loss.mean().backward()
                 torch.nn.utils.clip_grad_norm_(
                     amp.master_params(self.optimizer), max_grad_norm)
             else:
                 loss.mean().backward()
                 torch.nn.utils.clip_grad_norm_(
                     self.classifier.parameters(), max_grad_norm)
             self.optimizer.step()
             scheduler.step()
         # fixed seed so the per-epoch evaluation is deterministic
         util.seed_for_testing()
         test_losses = self.run_test_loss_report(test_losses, epoch, writer)
         # Early stopping when test loss is no longer improving
         if should_stop(test_losses):
             print("Test loss no longer improving, stopping!")
             print(f"(losses were {test_losses})")
             # roll back to the epoch with the lowest test loss
             best_epoch = sorted(enumerate(test_losses),
                                 key=lambda kv: kv[1])[0][0]
             self.early_stopped_at = best_epoch
             self.load_epoch_model(best_epoch)
             return
         del loader
Example #6
0
def setup():
    """Seed the RNG with a fixed value so subsequent runs are reproducible."""
    # NOTE(review): `seed` is imported elsewhere in the file -- presumably
    # random.seed or a project helper; confirm against the imports.
    seed(0)