def tmp_chrom_file(self, tmp_dir=None, genome=True, chrom=None):
    """Write a tab-delimited file of reference names and lengths; return its path.

    The file is created at ``tmp_dir + '<self.id>.genome'``.  The path is built
    by plain string concatenation, so ``tmp_dir`` must be a string that already
    ends with a path separator (the ``None`` default is not usable as-is).

    Args:
        tmp_dir: Directory prefix for the output file; passed to ``make_dir``
            before the file is opened.
        genome: When equal to ``True``, write two columns (name, length).
            When equal to ``False``, write three columns (name, 0, length)
            for every reference.  Any other value leaves the file empty.
        chrom: Only consulted when ``genome`` is true.  ``None`` writes every
            entry of ``self.refs``; otherwise only ``self.refs[chrom]`` is
            written.

    Returns:
        The path of the file that was written.

    Raises:
        KeyError: presumably, if ``chrom`` is not a key of ``self.refs``
            (assumes ``self.refs`` is dict-like -- confirm against the class).
    """
    make_dir(tmp_dir)
    tmp_genome = tmp_dir + '{}.genome'.format(self.id)
    # The context manager guarantees the handle is closed even when a lookup
    # or write below raises.
    with open(tmp_genome, 'w') as tmpfh:
        # Equality (not identity/truthiness) is kept on purpose so values such
        # as 1/0 behave as before and other values still write nothing.
        if genome == True:
            if chrom is None:
                for ref in self.refs:
                    tmpfh.write('{}\t{}\n'.format(format_chrom(ref), self.refs[ref]))
            else:
                tmpfh.write('{}\t{}\n'.format(format_chrom(chrom), self.refs[chrom]))
        elif genome == False:
            for ref in self.refs:
                tmpfh.write('{}\t0\t{}\n'.format(format_chrom(ref), self.refs[ref]))
    return tmp_genome
def _fatal(message):
    """Report a fatal error on both stdout and stderr, then exit with status 1."""
    print(message)
    sys.stderr.write(message + '\n')
    sys.exit(1)


def _sample_ids_in_file(path, column):
    """Return the set of values found in tab-separated `column` of `path`.

    Lines starting with '#' are ignored.  A file with one line or fewer is
    treated as empty and yields an empty set.
    """
    with open(path) as handle:
        rows = handle.readlines()
    if len(rows) <= 1:
        return set()
    return set(
        row.rstrip('\n').split('\t')[column]
        for row in rows
        if not row.startswith('#')
    )


def main():
    """Command-line entry point: preprocess, extract features, write training output.

    Steps, in order:
      1. Parse the command line; print the usage text and exit 0 when help is
         requested or no arguments are given.
      2. Optionally redirect ``sys.stderr`` to the log file given with -L.
      3. Validate the required inputs, exiting with status 1 on failure.
      4. Preprocess each BAM, or reuse the files found in the -pre directory.
      5. Extract features for each BAM, or reuse the files found in the
         -feats directory.
      6. Collect every row of the feature files and hand them to
         ``sv2_train_output``; remove the temporary directory.
    """
    init_time = int(time())
    parser = argparse.ArgumentParser(
        formatter_class=RawTextHelpFormatter,
        usage=splash.replace(" ", "", 1) + __useage___,
        add_help=False,
    )
    inArgs = parser.add_argument_group('input arguments')
    genoArgs = parser.add_argument_group('genotype arguments')
    optArgs = parser.add_argument_group('optional arguments')
    inArgs.add_argument('-i', '-bam', type=str, default=None, nargs='*')
    inArgs.add_argument('-b', '-bed', type=str, default=None, nargs='*')
    inArgs.add_argument('-v', '-vcf', type=str, default=None, nargs='*')
    inArgs.add_argument('-snv', type=str, default=None, nargs='*')
    inArgs.add_argument('-p', '-ped', type=str, default=None, nargs='*')
    genoArgs.add_argument('-g', '-genome', required=False, default='hg19', type=str)
    genoArgs.add_argument('-pcrfree', required=False, default=False, action="store_true")
    genoArgs.add_argument('-M', default=False, required=False, action="store_true")
    genoArgs.add_argument('-pre', required=False, default=None)
    genoArgs.add_argument('-feats', required=False, default=None)
    optArgs.add_argument('-L', '-log', default=None, required=False)
    optArgs.add_argument('-T', '-tmp-dir', default=os.getcwd() + '/sv2_tmp_' + rand_id(), required=False)
    optArgs.add_argument('-s', '-seed', required=False, default=42, type=int)
    optArgs.add_argument('-o', '-out', required=False, default="sv2_training_features", type=str)
    optArgs.add_argument('-O', '-odir', required=False, default=os.getcwd(), type=str)
    optArgs.add_argument('-h', '-help', required=False, action="store_true", default=False)
    args = parser.parse_args()

    bams, bed, vcf, snv, ped = args.i, args.b, args.v, args.snv, args.p
    gen, pcrfree, legacy_m = args.g, args.pcrfree, args.M
    predir, featsdir = args.pre, args.feats
    logfh, tmp_dir, seed, ofh, odir = args.L, args.T, args.s, args.o, args.O

    if args.h or len(sys.argv) == 1:
        print(splash + __useage___)
        sys.exit(0)

    # Redirect stderr to the log file when one was requested.  The previous
    # stream is remembered so it can be restored once the log is closed.
    lfh = None
    saved_stderr = sys.stderr
    if logfh is not None:
        lfh = open(logfh, 'w')
        sys.stderr = lfh

    preprocess_files, feats_files = {}, {}
    gens = ['hg19', 'hg38', 'mm10']
    olog = 'STDOUT' if logfh is None else logfh
    print('sv2 version:{} report bugs to <dantaki at ucsd dot edu> error messages located in {}'.format(__version__, olog))
    Confs = Config()

    # BAM and SNV inputs are only mandatory when neither a preprocessing nor
    # a features directory has been supplied.
    no_cached_results = predir is None and featsdir is None
    if bams is None and no_cached_results:
        _fatal('FATAL ERROR: No BAM file specified <-i, -bam FILE ...>')
    if snv is None and no_cached_results:
        _fatal('FATAL ERROR: No SNV VCF file specified <-snv FILE ...>')
    if ped is None:
        _fatal('FATAL ERROR: No PED file specified <-p, -ped FILE ...>')
    if bed is None and vcf is None:
        _fatal('FATAL ERROR: No SVs provided <-b, -bed BED ...> <-v,-vcf VCF ...>')
    if gen not in gens:
        # NOTE(review): 'mm10' is accepted above but not mentioned in this
        # message -- confirm whether the text should list it.
        _fatal('FATAL ERROR -g must be hg19 or hg38. NOT {}'.format(gen))

    Peds = ped_init(ped)
    # Without -i there is nothing to iterate over below; an empty list keeps
    # the BAM loops from failing on an unbound name.
    Bams = []
    if bams is not None:
        Bams = bam_init(bams, Peds, snv_init(snv), gen)
    SV = sv_init(bed, vcf, gen)
    ofh = ofh.replace('.vcf', '').replace('.out', '').replace('.txt', '')
    make_dir(tmp_dir)
    tmp_dir = slash_check(tmp_dir)
    if not odir.endswith('/'):
        odir = odir + '/'
    make_dir(odir)

    # ---- PREPROCESSING ----
    if predir is None:
        outdir = odir + 'sv2_preprocessing/'
        make_dir(outdir)
        for bam in Bams:
            preofh = outdir + bam.id + '_sv2_preprocessing.txt'
            preprocess_files[bam.id] = preofh
            preprocess(bam, preofh, seed, gen, tmp_dir)
    else:
        predir = slash_check(predir)
        for fh in glob(predir + '*sv2_preprocessing.txt'):
            # The sample id is read from the first column of each data row.
            for iid in _sample_ids_in_file(fh, 0):
                if iid in Peds.ids:
                    preprocess_files[iid] = fh
    report_time(init_time, 'PREPROCESSING COMPLETE')

    # ---- FEATURE EXTRACTION ----
    if featsdir is None:
        outdir = odir + 'sv2_features/'
        make_dir(outdir)
        for bam in Bams:
            prefh = preprocess_files.get(bam.id)
            if prefh is None:
                sys.stderr.write('WARNING: BAM sample id {} not found in preprocessing files. Skipping ...\n'.format(bam.id))
                continue
            featfh = outdir + bam.id + '_sv2_features.txt'
            feats_files[bam.id] = featfh
            extract_feats(bam, SV.raw, prefh, featfh, gen, pcrfree, legacy_m, Confs, tmp_dir)
    else:
        featsdir = slash_check(featsdir)
        for fh in glob(featsdir + '*sv2_features.txt'):
            # The sample id is read from the sixth column of each data row.
            for iid in _sample_ids_in_file(fh, 5):
                if iid in Peds.ids:
                    feats_files[iid] = fh

    # ---- TRAINING OUTPUT ----
    feats = []
    train_dir = odir + 'sv2_training_features/'
    make_dir(train_dir)
    # NOTE(review): every line (including any '#' lines) is collected, and a
    # file mapped to several sample ids is read once per id -- unchanged from
    # the previous behaviour; confirm that this is intended.
    for iid in feats_files:
        with open(feats_files[iid]) as f:
            for l in f:
                feats.append(tuple(l.rstrip('\n').split('\t')))
    sv2_train_output(feats, Peds, gen, train_dir + ofh)
    shutil.rmtree(tmp_dir)
    report_time(init_time, 'FEATURE EXTRACTION COMPLETE')

    # Only close the log if one was opened, and put stderr back so nothing
    # later tries to write to a closed file.
    if lfh is not None:
        sys.stderr = saved_stderr
        lfh.close()