Exemple #1
0
 def tmp_chrom_file(self, tmp_dir=None, genome=True, chrom=None):
     make_dir(tmp_dir)
     tmp_genome = tmp_dir + '{}.genome'.format(self.id)
     tmpfh = open(tmp_genome, 'w')
     if genome == True:
         if chrom == None:
             for ref in self.refs:
                 tmpfh.write('{}\t{}\n'.format(format_chrom(ref),
                                               self.refs[ref]))
         else:
             tmpfh.write('{}\t{}\n'.format(format_chrom(chrom),
                                           self.refs[chrom]))
     if genome == False:
         for ref in self.refs:
             tmpfh.write('{}\t0\t{}\n'.format(format_chrom(ref),
                                              self.refs[ref]))
     tmpfh.close()
     return tmp_genome
Exemple #2
0
def main():
	init_time = int(time())
 	parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter,usage=splash.replace("       ","",1)+__useage___,add_help=False)
	inArgs,genoArgs,optArgs = parser.add_argument_group('input arguments'),parser.add_argument_group('genotype arguments'),parser.add_argument_group('optional arguments')
	inArgs.add_argument('-i','-bam',type=str,default=None,nargs='*')
	inArgs.add_argument('-b','-bed',type=str,default=None,nargs='*')
	inArgs.add_argument('-v','-vcf',type=str,default=None,nargs='*')
	inArgs.add_argument('-snv',type=str,default=None,nargs='*')
	inArgs.add_argument('-p','-ped',type=str,default=None,nargs='*')
	genoArgs.add_argument('-g','-genome',required=False,default='hg19',type=str)
	genoArgs.add_argument('-pcrfree',required=False,default=False,action="store_true")
	genoArgs.add_argument('-M',default=False,required=False,action="store_true")
	genoArgs.add_argument('-pre',required=False,default=None)
	genoArgs.add_argument('-feats',required=False,default=None)
	optArgs.add_argument('-L','-log',default=None,required=False)
	optArgs.add_argument('-T','-tmp-dir',default=os.getcwd()+'/sv2_tmp_'+rand_id(),required=False)
	optArgs.add_argument('-s','-seed',required=False,default=42,type=int)
	optArgs.add_argument('-o','-out',required=False,default="sv2_training_features",type=str)
	optArgs.add_argument('-O','-odir',required=False,default=os.getcwd(),type=str)
	optArgs.add_argument('-h','-help',required=False,action="store_true",default=False)
	args = parser.parse_args()
	bams,bed,vcf,snv,ped = args.i,args.b,args.v,args.snv,args.p
	gen,pcrfree,legacy_m,predir,featsdir= args.g,args.pcrfree,args.M,args.pre,args.feats
	logfh, tmp_dir, seed, ofh, odir = args.L,args.T,args.s,args.o,args.O
	_help = args.h
	if (_help==True or len(sys.argv)==1):
		print splash+__useage___
		sys.exit(0)
	if logfh!=None:
		lfh = open(logfh,'w')
		sys.stderr=lfh
	preprocess_files,feats_files={},{}
	gens = ['hg19','hg38','mm10']
	olog = logfh
	if olog == None: olog = 'STDOUT'
	print 'sv2 version:{}    report bugs to <dantaki at ucsd dot edu>       error messages located in {}'.format(__version__,olog)
	Confs=Config()
	if bams==None and predir==None and featsdir==None:
		print 'FATAL ERROR: No BAM file specified <-i, -bam  FILE ...>'
		sys.stderr.write('FATAL ERROR: No BAM file specified <-i, -bam  FILE ...>\n')
		sys.exit(1)
	if snv==None and predir==None and featsdir==None:
		print 'FATAL ERROR: No SNV VCF file specified <-snv  FILE ...>'
		sys.stderr.write('FATAL ERROR: No SNV VCF file specified <-snv  FILE ...>\n')
		sys.exit(1)
	if ped==None:
		print 'FATAL ERROR: No PED file specified <-p, -ped  FILE ...>'
		sys.stderr.write('FATAL ERROR: No PED file specified <-p, -ped  FILE ...>\n')
		sys.exit(1)
	if bed==None and vcf==None:
		print 'FATAL ERROR: No SVs provided <-b, -bed  BED ...> <-v,-vcf  VCF ...>'
		sys.stderr.write('FATAL ERROR: No SVs provided <-b, -bed  BED ...> <-v,-vcf  VCF ...>\n')
		sys.exit(1)
	if gen not in gens:
		print 'FATAL ERROR -g must be hg19 or hg38. NOT {}'.format(gen)
		sys.stderr.write('FATAL ERROR -g must be hg19 or hg38. NOT {}\n'.format(gen))
		sys.exit(1)
	Peds=ped_init(ped)
	if bams!=None: Bams=bam_init(bams,Peds,snv_init(snv),gen)
	SV = sv_init(bed,vcf,gen)
	ofh = ofh.replace('.vcf','').replace('.out','').replace('.txt','')
	make_dir(tmp_dir)
	tmp_dir=slash_check(tmp_dir)
	if not odir.endswith('/'): odir = odir+'/'
	make_dir(odir)
	"""
	PREPROCESSING
	"""
	if predir == None:
		outdir = odir+'sv2_preprocessing/'
		make_dir(outdir)
		for bam in Bams:
			preofh = outdir+bam.id+'_sv2_preprocessing.txt'
			preprocess_files[bam.id]=preofh
			preprocess(bam,preofh,seed,gen,tmp_dir)
	else:
		predir=slash_check(predir)
		for fh in glob(predir+'*sv2_preprocessing.txt'):
			f = open(fh)
			if sum(1 for l in open(fh)) <= 1: continue
			else:
				preids=[]
				for l in f:
					if l.startswith('#'):continue
					preids.append(l.rstrip('\n').split('\t').pop(0))
			f.close()
			for iid in set(preids):
				if iid in Peds.ids : preprocess_files[iid]=fh
	report_time(init_time,'PREPROCESSING COMPLETE')
	""""
	FEATURE EXTRACTION
	"""
	if featsdir == None:
		outdir = odir+'sv2_features/'
		make_dir(outdir)
		for bam in Bams:
			if preprocess_files.get(bam.id) == None:
				sys.stderr.write('WARNING: BAM sample id {} not found in preprocessing files. Skipping ...\n'.format(bam.id))
				continue
			prefh = preprocess_files[bam.id]
			featfh = outdir+bam.id+'_sv2_features.txt'
			feats_files[bam.id]=featfh
			extract_feats(bam,SV.raw,prefh,featfh,gen,pcrfree,legacy_m,Confs,tmp_dir)
	else:
		featsdir=slash_check(featsdir)
		for fh in glob(featsdir+'*sv2_features.txt'):
			f = open(fh)
			if sum(1 for l in open(fh)) <= 1: continue
			else:
				featsid=[]
				for l in f:
					if l.startswith('#'):continue
					featsid.append(l.rstrip('\n').split('\t').pop(5))
			f.close()
			for iid in set(featsid):
				if iid in Peds.ids : feats_files[iid]=fh
	feats=[]
	train_dir = odir+'sv2_training_features/'
	make_dir(train_dir)
	for iid in feats_files:
		with open(feats_files[iid]) as f:
			for l in f: feats.append(tuple(l.rstrip('\n').split('\t')))
	sv2_train_output(feats,Peds,gen,train_dir+ofh)
	shutil.rmtree(tmp_dir)
	lfh.close()
	report_time(init_time,'FEATURE EXTRACTION COMPLETE')