def prep_labeled_input(self, classif, dataset_trainval):
    """Load the scaffold split of one class and sample it for one dataset.

    Reads ``<self.trainvalid_1hot_dataPath>/<self.name>.<classif>-scaff-split.yml``
    with ``read_yaml``, selects the split-segments that belong to
    ``dataset_trainval`` for the current ``self.kfoldOffset``, and stores the
    result of ``self.sample_labeled_scaffolds`` under
    ``self.trainvalid_data[dataset_trainval][classif]``.

    Args:
        classif: class label; used in the input file name and as the key
            under which the sampled data is stored.
        dataset_trainval: which dataset to prepare, ``'train'`` or ``'val'``.
            The test segment is not used by this method.

    Raises:
        ValueError: if ``dataset_trainval`` is neither ``'train'`` nor
            ``'val'`` (previously this surfaced as an UnboundLocalError
            after the input file had already been read).
        AssertionError: if the input file holds no segments, if
            ``Constants.numSegm_train`` is not positive, or if the selected
            segments contain no scaffolds.
    """
    # Fail fast: only these two datasets have a segment-selection rule below.
    if dataset_trainval not in ('train', 'val'):
        raise ValueError(
            "dataset_trainval must be 'train' or 'val', got %r"
            % (dataset_trainval,))

    # The input file as yaml
    inpF = (self.trainvalid_1hot_dataPath + '/' + self.name
            + '.%s-scaff-split.yml' % classif)
    print('\nprep_labeled scaffolds from:', inpF)
    start = time.time()

    # read it in
    inpD = read_yaml(inpF)
    numSegments = len(inpD)
    if self.verb > 1:
        print(' found %d split-segments:' % numSegments, list(inpD.keys()),
              ', kfoldOffset=', self.kfoldOffset)
    print('load_labeled_input_yaml dataset_trainval=', dataset_trainval,
          'kfoldOffset=', self.kfoldOffset)
    assert numSegments > 0

    numTrainingSegments = Constants.numSegm_train
    numTotalSegments = Constants.numSegm_total
    assert numTrainingSegments > 0  # makes no sense to split to train/eval/test
    kFold = self.kfoldOffset

    # Example: at kFold=1 with 6 segments in total and 5 training segments,
    # the train segments are 1..5 and the val segment is 0.
    # At kFold=0, train is 0..4 and val is 5.
    if dataset_trainval == 'train':
        jL = [(kFold + j) % numTotalSegments
              for j in range(numTrainingSegments)]
        # Merge all training segments into one mapping.
        currentWrkSegment = {}
        for k in jL:
            currentWrkSegment.update(inpD[k])
    else:  # 'val'
        jL = (kFold + numTrainingSegments) % numTotalSegments  # one element
        currentWrkSegment = inpD[jL]

    print(' class:', classif, 'segL:', jL, ' dataset_trainval:',
          dataset_trainval, ', numSpec:', len(currentWrkSegment),
          ', kfoldOffset=', self.kfoldOffset)
    assert len(currentWrkSegment) > 0

    ############################
    ### The number of sequences to sample depends on sampling_rate
    ### (length-dependent) times the per-contig target sample count.
    ############################
    sampling_rate = self.compute_sampling_rate(currentWrkSegment)

    if dataset_trainval not in self.trainvalid_data:
        self.trainvalid_data[dataset_trainval] = {}

    # For validation it is ok to use fewer samples, makes no difference.
    target_samples = int(Constants.target_samples_per_contig_training / 2)
    if dataset_trainval != 'train':
        target_samples = int(target_samples / 8)
    print(' request target_samples=', target_samples, dataset_trainval,
          classif)

    # use all scaffolds, sample them to desired quota
    self.trainvalid_data[dataset_trainval][classif] = \
        self.sample_labeled_scaffolds(
            currentWrkSegment, sampling_rate, classif, target_samples)

    print('prep:', dataset_trainval, classif,
          ' completed, elaT=%.1f sec' % (time.time() - start),
          ', gotSamples=',
          len(self.trainvalid_data[dataset_trainval][classif][0]))
def prep_labeled_input(self, role, dom):
    """Load the scaffold split of one role and sample it for one domain.

    Reads ``<self.dataPath>/<self.name>.<role>-scaff-split.yml`` with
    ``read_yaml``, selects the split-segments that belong to ``dom`` for the
    current ``self.kfoldOffset``, and stores the result of
    ``self.partition_labeled_scaffolds`` under ``self.bases_data[dom][role]``.

    Args:
        role: label used in the input file name and as the key under which
            the partitioned data is stored.
        dom: domain to prepare: ``'train'``, ``'val'`` or ``'test'``.

    Raises:
        ValueError: if ``dom`` is not one of the three supported domains
            (previously this surfaced as an UnboundLocalError after the
            input file had already been read).
        AssertionError: if the input file holds no segments, if fewer than
            two segments are configured, or if the selected segments
            contain no scaffolds.
    """
    # Fail fast: only these domains have a segment-selection rule below.
    if dom not in ('train', 'val', 'test'):
        raise ValueError(
            "dom must be 'train', 'val' or 'test', got %r" % (dom,))

    inpF = self.dataPath + '/' + self.name + '.%s-scaff-split.yml' % role
    print('\nprep_labeled scaffolds from:', inpF)
    start = time.time()

    inpD = read_yaml(inpF)
    nSeg = len(inpD)
    if self.verb > 1:
        print(' found %d split-segments:' % nSeg, list(inpD.keys()),
              ', kfoldOffset=', self.kfoldOffset)
    print('load_labeled_input_yaml dom=', dom, 'kfoldOffset=',
          self.kfoldOffset)
    assert nSeg > 0

    # All segments but one are used for training.
    numTrainSeg = self.numSegm - 1
    assert numTrainSeg > 0  # makes no sense to split to train/eval/test
    n0 = self.kfoldOffset

    if dom == 'val':
        jL = (n0 + numTrainSeg) % self.numSegm  # one element
        wrkL = inpD[jL]
    elif dom == 'test':
        # NOTE(review): with numTrainSeg == numSegm - 1 this index reduces to
        # n0 % numSegm, i.e. the same segment as the first 'train' segment.
        # Confirm whether that overlap is intended.
        jL = (n0 + numTrainSeg + 1) % self.numSegm  # one element
        wrkL = inpD[jL]
    else:  # 'train'
        jL = [(n0 + j) % self.numSegm for j in range(numTrainSeg)]
        # Merge all training segments into one mapping.
        wrkL = {}
        for k in jL:
            wrkL.update(inpD[k])

    print(' role:', role, 'segL:', jL, ' dom:', dom, ', numSpec:',
          len(wrkL), ', kfoldOffset=', self.kfoldOffset)
    assert len(wrkL) > 0

    dynL = self.compute_sampling_rate(wrkL)

    if dom not in self.bases_data:
        self.bases_data[dom] = {}

    # Non-training domains request 1/8 of the training sample count.
    numSamples = int(self.events / 2)
    if dom != 'train':
        numSamples = int(numSamples / 8)
    print(' request numSamples=', numSamples, dom, role)

    # use all scaffolds, sample them to desired quota
    self.bases_data[dom][role] = self.partition_labeled_scaffolds(
        wrkL, dynL, role, numSamples)

    print('prep:', dom, role,
          ' completed, elaT=%.1f sec' % (time.time() - start),
          ', gotSamples=', len(self.bases_data[dom][role][0]))
return args #================================= #================================= # M A I N #================================= #================================= args = get_parser() #args.dataPath='dataPlasmSmall/' given_class = args.given #This file was written in the function DL_Model::split_into_folds inpF = args.dataPath + '/' + Constants.project + '.%s-scaff-split.yml' % given_class assert (os.path.exists(inpF)) bulk = read_yaml(inpF) # select how many scaffolds to be processed if args.dataSegment >= 0: all_scaffolds = bulk[args.dataSegment] else: all_scaffolds = {} for seg in range(Constants.numSegm_total): if seg in bulk: #print('ww',seg,type(bulk[seg]),len(all_scaffolds)) all_scaffolds.update(bulk[seg]) ###it_will_crash_make_5 score_thr = 0.50 max_scaff = Constants.mxScaf_pred print('M: seg:', args.dataSegment, ' num scaffolds:', len(all_scaffolds),