def split_into_folds(self, inpL, classif, save=True, excludeL=None, info=''):
    """Randomly distribute scaffolds over ``Constants.numSegm_total`` segments.

    Each record of ``inpL`` is assigned to a segment drawn with
    ``np.random.randint``, so the split is only reproducible when the
    caller seeds numpy's global random state. Segments may end up empty.

    Args:
        inpL: sized iterable of ``(scafName, text, fetureV, seq)`` records;
            ``seq`` must support ``len()``.
        classif: label inserted into the output file name.
        save: when truthy, the split is written with ``write_yaml``.
        excludeL: accepted for interface compatibility; not used here.
        info: free-form value stored under the ``'info'`` key.

    Returns:
        dict mapping each segment index to ``{scafName: record}``, where a
        record holds the keys ``'text'``, ``'seq'``, ``'len'`` and
        ``'features'``, plus one extra ``'info'`` entry.
    """
    # one of the segments will be validation and the rest used in training
    numSegm = Constants.numSegm_total
    out_allinfo = {seg: {} for seg in range(numSegm)}
    out_lengths = {seg: 0 for seg in range(numSegm)}

    for scafName, text, fetureV, seq in inpL:
        randomSegment = np.random.randint(numSegm)
        seq_len = len(seq)
        out_allinfo[randomSegment][scafName] = {
            'text': text,
            'seq': seq,
            'len': seq_len,
            'features': fetureV,
        }
        out_lengths[randomSegment] += seq_len

    if self.verb > 0:
        print(' achieved split for %d scaffolds to segments:' % len(inpL),
              [(x, len(out_allinfo[x]), out_lengths[x]) for x in out_allinfo])
        # NOTE(review): the source formatting lost the nesting of this loop;
        # it is treated as part of the verbose report - confirm.
        for seg in range(numSegm):
            # summed sequence length scaled by 1e6; named so that the
            # builtin sum() is no longer shadowed
            size_mb = out_lengths[seg] / 1.e6
            print('seg:', seg,
                  ' size=%.2f (MB) numScaf=%d' % (size_mb, len(out_allinfo[seg])))

    # added after the report so the summary only iterates segment indices
    out_allinfo['info'] = info

    # save segments
    if save:
        out_file = (self.trainvalid_1hot_dataPath + '/' + self.name
                    + '.%s-scaff-split.yml' % classif)
        write_yaml(out_allinfo, out_file)
    return out_allinfo
def split_species(self, inpL, role, save=True, excludeL=None, info=''):
    """Randomly distribute scaffolds over ``self.numSegm + 1`` segments.

    Each record of ``inpL`` is assigned to a segment drawn with
    ``np.random.randint``, so the split is only reproducible when the
    caller seeds numpy's global random state. Segments may end up empty.

    Args:
        inpL: sized iterable of ``(scafN, text, fetureV, seq)`` records;
            ``seq`` must support ``len()``.
        role: label inserted into the output file name.
        save: when truthy, the split is written with ``write_yaml``.
        excludeL: accepted for interface compatibility; not used here.
        info: free-form value stored under the ``'info'`` key.

    Returns:
        dict mapping each segment index to ``{scafN: record}``, where a
        record holds the keys ``'text'``, ``'seq'``, ``'len'`` and
        ``'features'``, plus one extra ``'info'`` entry.
    """
    # the last one is not used in any training
    numSegm = self.numSegm + 1
    segments = {seg: {} for seg in range(numSegm)}
    seg_lengths = {seg: 0 for seg in range(numSegm)}

    for scafN, text, fetureV, seq in inpL:
        ix = np.random.randint(numSegm)
        seq_len = len(seq)
        segments[ix][scafN] = {
            'text': text,
            'seq': seq,
            'len': seq_len,
            'features': fetureV,
        }
        seg_lengths[ix] += seq_len

    if self.verb > 0:
        print(' achieved split for %d scaffolds to segments:' % len(inpL),
              [(x, len(segments[x]), seg_lengths[x]) for x in segments])
        # NOTE(review): the source formatting lost the nesting of this loop;
        # it is treated as part of the verbose report - confirm.
        for seg in range(numSegm):
            # summed sequence length scaled by 1e6; named so that the
            # builtin sum() is no longer shadowed
            size_mb = seg_lengths[seg] / 1.e6
            print('seg:', seg,
                  ' size=%.2f (MB) numScaf=%d' % (size_mb, len(segments[seg])))

    # added after the report so the summary only iterates segment indices
    segments['info'] = info

    # save segments
    if save:
        outF = self.dataPath + '/' + self.name + '.%s-scaff-split.yml' % role
        write_yaml(segments, outF)
    return segments
def save_training_history(self):
    """Write ``self.train_hirD`` to ``<outPath>/<name>.history.yml``.

    The file path is built from ``self.outPath`` and ``self.name``; the
    actual writing is delegated to ``write_yaml``.
    """
    write_yaml(
        self.train_hirD,
        self.outPath + '/' + self.name + '.history.yml',
    )
print( cnt['inp'], scaffN, 'decision=%s, avr score:' % decision, classif_avrg_score_str, ' len=%.1fk samples=%d' % (len(sequenceString) / 1000., len(sampList))) if args.verb > 0: print(Yscores_samples, classif_avrg_score_str, classif_score, scores_samples_list_class1, scores_samples_list_class0, scores_scaffs_list, Yclass, sampListHotEncodedA, featureListXsamplesA) classif_details_yaml['score'] = classif_score classif_details_yaml['model_info'] = deep.info #print('out classif_details_yaml='); pprint(classif_details_yaml) #write under outPR the info for this scaffold prediction in a yaml file write_yaml(classif_details_yaml, args.outPath + '/%s.assayer.yml' % (scaffN), 0) nClass = cnt['plasmid'] + cnt['main'] + cnt['ambig'] #print('M:%s endCnt:'%given_class,cnt,' fraction: Plasm=%.3f Ambig=%.3f Main=%.3f'%(cnt['plasmid']/nClass,cnt['ambig']/nClass,cnt['main']/nClass)) print('Counts: Plasm=%s Ambig=%s Main=%s nCount=%s' % (cnt['plasmid'], cnt['ambig'], cnt['main'], nClass)) f_predix.close() # make plot of all scores #ROC curve #print("Yclass %s " % (Yclass)) #print("scores_scaffs_list %s " % (scores_scaffs_list)) ####Print the AUC-ROC and FP-TP rate list to a file if 1 in Yclass and 0 in Yclass: ###Need both classes for this to be meaningful fpr, tpr, _ = roc_curve(