Example #1
    def prep_labeled_input(self, classif, dataset_trainval):  # classif: class label, dataset_trainval: 'train' or 'val'
        # The input file as yaml
        inpF=self.trainvalid_1hot_dataPath+'/'+self.name+'.%s-scaff-split.yml'%classif
        print('\nprep_labeled scaffolds from:',inpF)
        start=time.time()

        #read it in
        inpD=read_yaml(inpF)
        numSegments=len(inpD)
        if self.verb>1:
            print(' found %d split-segments:'%numSegments,list(inpD.keys()),', kfoldOffset=', self.kfoldOffset)
            print('prep_labeled_input dataset_trainval=',dataset_trainval,'kfoldOffset=', self.kfoldOffset)
        assert numSegments>0

        numTrainingSegments=Constants.numSegm_train
        numTotalSegments=Constants.numSegm_total

        assert numTrainingSegments>0  # need at least one training segment for the train/val/test split
        kFold=self.kfoldOffset
        currentWrkSegment = {}
        jL = None  # segment index (val) or list of indices (train), set below

        #Example: at kFold=1, with 6 total segments and 5 training segments,
        #the training segments for this fold are 1..5 and the validation segment is 0.
        #At kFold=0, train would be 0..4 and val would be 5.
        #(A standalone sketch of this arithmetic follows this example.)
        if dataset_trainval=='train':
            jL=[ (kFold+j)%numTotalSegments for j in range(numTrainingSegments)]
            currentWrkSegment={}
            for k in jL:  currentWrkSegment.update(inpD[k])

        if dataset_trainval=='val':
            jL=(kFold+numTrainingSegments)%numTotalSegments # one element
            currentWrkSegment=inpD[jL]

        #We don't use the test segment at this point
        #if dataset_trainval=='test':
        #    jL=(kFold+numTrainingSegments+1)%self.numSegm # one element
        #    wrkL=inpD[jL]

        print('  class:',classif,'segL:',jL,' dataset_trainval:', dataset_trainval,', numSpec:',len(currentWrkSegment),', kfoldOffset=', self.kfoldOffset)

        assert len(currentWrkSegment)>0
        #print('  dump segg:',segg,wrkL)

        ############################
        ###The number of sequences sampled per scaffold is the length-dependent
        ###sampling_rate times the target (target_samples_per_contig_pred or
        ###target_samples_per_contig_training); a standalone sketch follows Example #2.
        ############################
        sampling_rate=self.compute_sampling_rate(currentWrkSegment)

        if dataset_trainval not in self.trainvalid_data:
            self.trainvalid_data[dataset_trainval]={}

        #For validation and testing it is OK to use fewer samples; it makes no practical difference.
        target_samples=int(Constants.target_samples_per_contig_training /2)
        if dataset_trainval!='train' : target_samples=int(target_samples/8)

        print('   request  target_samples=', target_samples, dataset_trainval,classif)
        # use all scaffolds, sample them to desired quota
        self.trainvalid_data[dataset_trainval][classif]=self.sample_labeled_scaffolds(currentWrkSegment,sampling_rate,classif,target_samples)
        print('prep:',dataset_trainval,classif,' completed, elaT=%.1f sec'%(time.time() - start),', gotSamples=',len(self.trainvalid_data[dataset_trainval][classif][0]))
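The modular arithmetic behind the segment selection above is easy to verify on its own. Below is a minimal, standalone sketch of that fold logic; the helper name fold_segments and the 6-segment/5-training-segment configuration are assumptions chosen to match the comment in Example #1, not part of the original code.

# Standalone sketch (hypothetical helper) of the k-fold segment selection used above.
def fold_segments(kfold_offset, num_total=6, num_train=5):
    """Return (train_segment_ids, val_segment_id) for one fold."""
    train_ids = [(kfold_offset + j) % num_total for j in range(num_train)]
    val_id = (kfold_offset + num_train) % num_total
    return train_ids, val_id

if __name__ == '__main__':
    # Matches the comment in Example #1:
    print(fold_segments(1))  # ([1, 2, 3, 4, 5], 0)
    print(fold_segments(0))  # ([0, 1, 2, 3, 4], 5)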
Example #2
    def prep_labeled_input(self, role, dom):  #dom==domain
        inpF = self.dataPath + '/' + self.name + '.%s-scaff-split.yml' % role
        print('\nprep_labeled scaffolds from:', inpF)
        start = time.time()

        inpD = read_yaml(inpF)
        nSeg = len(inpD)
        if self.verb > 1:
            print(' found %d split-segments:' % nSeg, list(inpD.keys()),
                  ', kfoldOffset=', self.kfoldOffset)
            print('prep_labeled_input dom=', dom, 'kfoldOffset=',
                  self.kfoldOffset)
        assert nSeg > 0

        numTrainSeg = self.numSegm - 1

        assert numTrainSeg > 0  # need at least one training segment for the train/val/test split
        n0 = self.kfoldOffset
        wrkL = {}
        jL = None  # segment index (val/test) or list of indices (train), set below

        if dom == 'val':
            jL = (n0 + numTrainSeg) % self.numSegm  # one element
            wrkL = inpD[jL]

        if dom == 'test':
            jL = (n0 + numTrainSeg + 1) % self.numSegm  # one element
            wrkL = inpD[jL]

        if dom == 'train':
            jL = [(n0 + j) % self.numSegm for j in range(numTrainSeg)]
            wrkL = {}
            for k in jL:
                wrkL.update(inpD[k])

        print('  role:', role, 'segL:', jL, ' dom:', dom, ', numSpec:',
              len(wrkL), ', kfoldOffset=', self.kfoldOffset)

        assert len(wrkL) > 0
        #print('  dump segg:',segg,wrkL)

        dynL = self.compute_sampling_rate(wrkL)

        if dom not in self.bases_data:
            self.bases_data[dom] = {}

        numSamples = int(self.events / 2)
        if dom != 'train': numSamples = int(numSamples / 8)

        print('   request  numSamples=', numSamples, dom, role)
        # use all scaffolds, sample them to desired quota
        self.bases_data[dom][role] = self.partition_labeled_scaffolds(
            wrkL, dynL, role, numSamples)
        print('prep:', dom, role,
              ' completed, elaT=%.1f sec' % (time.time() - start),
              ', gotSamples=', len(self.bases_data[dom][role][0]))
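Both examples turn a length-dependent sampling rate and a per-contig target into a per-fold sample quota (the target is halved, then divided by 8 again for validation). The original compute_sampling_rate and sample_labeled_scaffolds / partition_labeled_scaffolds are not shown here, so the sketch below only illustrates that relationship under an assumed length-proportional rate; scaffold_sampling_plan, the demo scaffold lengths, and the stand-in target value are all hypothetical.

# Hypothetical sketch: combining a length-proportional sampling rate with a
# per-contig target to get per-scaffold sample counts. This is an assumption,
# not the original compute_sampling_rate / sample_labeled_scaffolds code.
def scaffold_sampling_plan(scaffolds, target_samples_per_contig):
    """scaffolds: {name: sequence_length} -> {name: number of samples to draw}."""
    mean_len = sum(scaffolds.values()) / len(scaffolds)
    plan = {}
    for name, length in scaffolds.items():
        rate = length / mean_len                      # length-dependent sampling rate
        plan[name] = max(1, int(rate * target_samples_per_contig))
    return plan

if __name__ == '__main__':
    target_train = 400 // 2      # assumed stand-in for Constants.target_samples_per_contig_training / 2
    target_val = target_train // 8
    demo = {'scaffA': 50_000, 'scaffB': 150_000, 'scaffC': 300_000}
    print(scaffold_sampling_plan(demo, target_train))  # e.g. {'scaffA': 60, 'scaffB': 180, 'scaffC': 360}
    print(scaffold_sampling_plan(demo, target_val))    # far fewer samples for validation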
Example #3
    return args


#=================================
#=================================
#  M A I N
#=================================
#=================================
args = get_parser()
#args.dataPath='dataPlasmSmall/'
given_class = args.given
#This file is written by the function DL_Model::split_into_folds
inpF = args.dataPath + '/' + Constants.project + '.%s-scaff-split.yml' % given_class
assert (os.path.exists(inpF))
bulk = read_yaml(inpF)

# select which scaffolds to process: a single data segment or all of them
if args.dataSegment >= 0:
    all_scaffolds = bulk[args.dataSegment]
else:
    all_scaffolds = {}
    for seg in range(Constants.numSegm_total):
        if seg in bulk:
            #print('ww',seg,type(bulk[seg]),len(all_scaffolds))
            all_scaffolds.update(bulk[seg])
            ###it_will_crash_make_5

score_thr = 0.50
max_scaff = Constants.mxScaf_pred
print('M: seg:', args.dataSegment, ' num scaffolds:', len(all_scaffolds),