def main():
    """Extract SNPs or INDELs from a (possibly gzipped) VCF and write them to stdout."""
    parser = argparse.ArgumentParser(
        description='extract SNPs or INDELs from a vcf.gz file and write to stdout')
    parser.add_argument('-type', dest='vtype', type=str, help="SNP|INDEL")
    parser.add_argument('vcfile', type=str, help='*.vcf.gz file')
    args = parser.parse_args()

    # NOTE: the original also derived an output filename (<base>.<type>.vcf)
    # that was never used; all output goes to stdout, so it was dropped.
    vcf_reader = vcf.Reader(open(args.vcfile, 'r'))
    vcf_writer = vcf.Writer(open('/dev/stdout', 'w'), vcf_reader)
    for record in vcf_reader:
        # PyVCF classifies each record; emit only the requested variant class.
        if args.vtype == 'SNP' and record.is_snp:
            vcf_writer.write_record(record)
        elif args.vtype == 'INDEL' and record.is_indel:
            vcf_writer.write_record(record)
def main():
    """Write one shell script per QC-passing BAM in a sequencing index,
    each running samtools mpileup restricted to a bed file."""
    parser = argparse.ArgumentParser(description='generate jobs for mpileup')
    parser.add_argument("-Q", help="mapping quality", dest='mq', default='30')
    parser.add_argument("-q", help="base quality", dest='bq', default='20')
    parser.add_argument("-bin", help="path to samtools", dest='bin',
                        default="/broad/software/free/Linux/redhat_5_x86_64/pkgs/samtools/samtools_0.1.19/bin/samtools")
    parser.add_argument("-bed", dest='bed', help="bedfile")
    parser.add_argument("-ref", dest='ref',
                        default='/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta',
                        help="reference fasta")
    parser.add_argument('seqindex', type=str, help='file.index')
    args = parser.parse_args()
    cwd = os.getcwd()

    SeqIndexRecord = namedtuple(
        'SeqIndexRecord',
        'PDO, SeqProject, Title, SeqCenter, PDOSample, ExternalID, BAMPath, FID, QC')
    bedfile_base = return_file_basename(args.bed)
    fh = open(args.seqindex, 'r')
    for rec in map(SeqIndexRecord._make, csv.reader(fh, delimiter='\t')):
        if rec.QC != 'PASS':  # only QC-passing samples get a job
            continue
        bamfile_base = return_file_basename(rec.BAMPath)
        pileupout = ".".join([bamfile_base, bedfile_base, 'pileup'])
        # samtools mpileup: -q is the MINIMUM MAPPING quality, -Q the minimum
        # BASE quality.  BUG FIX: the original passed args.bq to -q and
        # args.mq to -Q, silently swapping the two thresholds.
        commandline = " ".join([args.bin, 'mpileup',
                                '-q', args.mq, '-Q', args.bq,
                                '-f', args.ref, '-l', args.bed,
                                rec.BAMPath, '>', cwd + "/" + pileupout])
        outfh = open("pileupjob." + bamfile_base + "." + bedfile_base + ".sh", 'w')
        outfh.write(commandline + "\n")
        outfh.close()  # flush each job script before moving on
    fh.close()
def main():
    """ reassign mapping qualities in a bam file """
    usage = "usage: %prog [options] file.bam"
    parser = OptionParser(usage)
    parser.add_option("--DMQ", type="int", dest="dmq", default=60,
                      help="default mapping quality to set")
    (options, args) = parser.parse_args()
    bamfilename = args[0]
    # BUG FIX: check for the index BEFORE creating the output bam; the
    # original created (and abandoned) an empty .dmq.bam when the index
    # was missing.  pysam's fetch() requires the .bai.
    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        exit(1)
    basename = return_file_basename(bamfilename)
    pybamfile = pysam.Samfile(bamfilename, "rb")
    dmqbamfile = pysam.Samfile(basename + ".dmq.bam", "wb", header=pybamfile.header)
    sys.stderr.write("writing new bam file with new default mapping quality tag alignment record ...\n")
    for read in pybamfile.fetch():
        read.mapq = options.dmq
        dmqbamfile.write(read)
    # BUG FIX: close both bams; the BGZF output is truncated without close().
    dmqbamfile.close()
    pybamfile.close()
def main():
    """ reassign mapping qualities in a bam file """
    # NOTE(review): this main() duplicates an identical one elsewhere in this
    # file; consider consolidating the two scripts.
    usage = "usage: %prog [options] file.bam"
    parser = OptionParser(usage)
    parser.add_option("--DMQ", type="int", dest="dmq", default=60,
                      help="default mapping quality to set")
    (options, args) = parser.parse_args()
    bamfilename = args[0]
    # BUG FIX: verify the .bai exists before opening the output bam so a
    # missing index does not leave behind an empty .dmq.bam.
    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        exit(1)
    basename = return_file_basename(bamfilename)
    pybamfile = pysam.Samfile(bamfilename, "rb")
    dmqbamfile = pysam.Samfile(basename + ".dmq.bam", "wb", header=pybamfile.header)
    sys.stderr.write("writing new bam file with new default mapping quality tag alignment record ...\n")
    for read in pybamfile.fetch():
        read.mapq = options.dmq
        dmqbamfile.write(read)
    # BUG FIX: close both bams; BGZF output is unusable without a proper close.
    dmqbamfile.close()
    pybamfile.close()
def main():
    """ get a specified number of paired-end reads from a samfile

    Pairs are assumed to occupy two consecutive lines of a readname-sorted
    SAM; the subset is written to <basename>.subset.sam.
    NOTE(review): assumes the SAM has no header lines - confirm.
    """
    usage = "usage: %prog [options] readname_sorted.sam"
    parser = OptionParser(usage)
    parser.add_option("--N", type="int", default=100, dest="N",
                      help="number of records to randomly select")
    (options, args) = parser.parse_args()
    samfilename = args[0]
    basename = return_file_basename(samfilename)

    total_pairs = sum(1 for _ in open(samfilename)) // 2
    n = min(options.N, total_pairs)  # never request more pairs than exist
    # BUG FIX: the original drew indices with random.randint (duplicates
    # possible) and its skip loop consumed pair rr before writing, so pair
    # rr+1 was emitted instead of rr and the largest index read past EOF.
    # random.sample draws distinct indices; the loop below writes exactly
    # the selected pairs.
    rand_records = sorted(random.sample(range(total_pairs), n))
    sys.stdout.write("total number of read pairs to select:  %d\n" % len(rand_records))

    samfh = open(samfilename, 'r')
    subfh = open(basename + ".subset.sam", "w")
    pos = 0  # index of the pair at the current read position
    records_written = 0
    for rr in rand_records:
        while pos < rr:  # skip unselected pairs (2 lines each)
            samfh.readline()
            samfh.readline()
            pos += 1
        subfh.write(samfh.readline())
        subfh.write(samfh.readline())
        pos += 1
        records_written += 1
    subfh.close()
    samfh.close()
    sys.stderr.write("wrote to %d read pairs to %s \n" % (records_written, subfh.name))
def main():
    """ randomly select N records from paired end fastq files

    code based on this: http://www.biostars.org/p/6544/#6555
    Each fastq record is 4 consecutive lines; the same record indices are
    taken from both mates so pairing is preserved.
    """
    usage = "usage: %prog [options] file_1.fq.gz file_2.fq.gz"
    parser = OptionParser(usage)
    parser.add_option("--N", type="int", default=100, dest="N",
                      help="number of records to randomly select")
    (options, args) = parser.parse_args()
    if len(args) != 2:
        sys.stderr.write("provide two fastq.gz files!\n")
        sys.exit(1)
    fq1, fq2 = args
    fq1_basename = return_file_basename(fq1)
    fq2_basename = return_file_basename(fq2)

    records1 = sum(1 for _ in gzip.open(fq1)) // 4
    if sum(1 for _ in gzip.open(fq2)) // 4 != records1:
        sys.stderr.write("unequal number of fastq records in PE files!\n")
        sys.exit(1)
    n = options.N
    if n > records1:
        # BUG FIX: the original message had the comparison backwards and the
        # run continued with an oversized request; cap at what is available.
        sys.stderr.write("more records requested than present in fastq file!\n")
        n = records1

    suba = gzip.open(fq1_basename + ".subset.gz", "wb")
    subb = gzip.open(fq2_basename + ".subset.gz", "wb")
    fh1 = gzip.open(fq1)
    fh2 = gzip.open(fq2)
    # BUG FIX: random.sample gives distinct indices (randint could repeat),
    # and the rewritten skip loop below emits record rr itself - the original
    # consumed rr while skipping and wrote rr+1, reading past EOF for the
    # largest possible index.
    rand_records = sorted(random.sample(range(records1), n))
    pos = 0  # index of the record at the current read position
    for rr in rand_records:
        while pos < rr:  # skip unselected records (4 lines each)
            pos += 1
            for _ in range(4):
                fh1.readline()
                fh2.readline()
        for _ in range(4):
            suba.write(fh1.readline())
            subb.write(fh2.readline())
        pos += 1
    # BUG FIX: gzip writers must be closed to flush buffers and write the
    # gzip trailer; the original leaked both handles.
    suba.close()
    subb.close()
    fh1.close()
    fh2.close()
    sys.stderr.write("wrote to %s, %s\n" % (suba.name, subb.name))
def main():
    """Write one shell script per sample in a REU index that runs a Picard
    metrics tool (CalculateHsMetrics by default) against a target interval list."""
    today = datetime.datetime.today()
    # BUG FIX: the original used "20%y" which hard-codes the century; %Y is
    # the four-digit year (identical output for 2000-2099).
    datestr = today.strftime("%Y%m%d %X")
    CWD = os.getcwd()
    REF = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"
    BIN = "/seq/software/picard/current/bin"
    # Commonly used interval lists (candidate values for --ti):
    #   /humgen/gsa-pipeline/resources/b37/v5/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list
    #   /humgen/gsa-pipeline/resources/b37/v5/gencode.v12_broad.agilent_merged.interval_list
    parser = argparse.ArgumentParser(description='Program description')
    parser.add_argument("--ti", type=str, dest='ti',
                        help="target interval file (full path)")
    parser.add_argument('indexfile', type=str, help='REU index file')
    parser.add_argument("--picardtool", type=str, dest='picardtool',
                        default="CalculateHsMetrics.jar")
    # NOTE(review): --queue is accepted but never used below; kept so existing
    # invocations don't break.
    parser.add_argument("--queue", type=str, dest='queue', default="week")
    args = parser.parse_args()

    command = "java -Xmx2g -jar " + BIN + "/" + args.picardtool
    TARGETS = args.ti
    sys.stderr.write("using " + TARGETS + "\n")
    ReuIndexRecord = namedtuple(
        'ReuIndexRecord',
        'PDO, SeqProject, Title, BI, PDOSample, ExternalID, BAMPath')
    for rec in map(ReuIndexRecord._make,
                   csv.reader(open(args.indexfile, "rb"), delimiter='\t')):
        if rec.PDO == 'PDO':  # skip the header row of the index
            continue
        samplename = return_file_basename(rec.BAMPath)
        samplemetrics = ".".join([samplename, 'hsmetrics', 'txt'])
        pertargetmetrics = ".".join([samplename, 'hsmetrics', 'pertarget', 'txt'])
        # Baits and targets deliberately use the same interval list.
        commandline = [command,
                       "BAIT_INTERVALS=" + TARGETS,
                       "TARGET_INTERVALS=" + TARGETS,
                       "INPUT=" + rec.BAMPath,
                       "OUTPUT=" + CWD + "/" + samplemetrics,
                       "REFERENCE_SEQUENCE=" + REF,
                       "PER_TARGET_COVERAGE=" + CWD + "/" + pertargetmetrics]
        outstring = " ".join(commandline)
        scriptfile = ".".join([samplename, rec.SeqProject, 'hsmetrics', 'sh'])
        outfh = open(scriptfile, 'w')
        outfh.write("#" + datestr + "\n")  # date-stamp the generated script
        outfh.write(outstring + "\n")
        outfh.close()  # flush each job script
def main():
    """Transpose a csv file (rows become columns) and write it to stdout."""
    parser = argparse.ArgumentParser(description='Program description')
    parser.add_argument(dest='csvfile',
                        help="csvfile to pivot (turn rows into columns)\n")
    args = parser.parse_args()
    # zip(*reader) groups the i-th field of every row -> transposed rows.
    # (zip replaces the Python-2-only itertools.izip; identical output here.)
    transposed = zip(*csv.reader(open(args.csvfile, "rb")))
    # BUG FIX: the original called open(sys.stdout, "wb"), which raises
    # TypeError (open wants a path); csv.writer takes the stream directly.
    # The unused <basename>.transposed.csv name was dropped along with it.
    csv.writer(sys.stdout).writerows(transposed)
def main():
    """ add a the RG tag to the sam/bam record as well as update the header

    right now assumes single sample in the sam/bam file so can only add
    one RG/SM tag
    """
    usage = "usage: %prog [options] file.bam"
    parser = OptionParser(usage)
    parser.add_option("--RG", type="string", dest="rgid", help="readgroup id")
    parser.add_option("--SM", type="string", dest="sm", help="sample name")
    parser.add_option("--PL", type="string", dest="pl", default="illumina",
                      help="platform unit (of sequencing)")
    parser.add_option("--PI", type="string", dest="pi", default="350",
                      help="insert size")
    (options, args) = parser.parse_args()
    bamfilename = args[0]
    # BUG FIX: check for the index BEFORE creating the output bam; fetch()
    # needs the .bai, and the original left an empty .RG.bam behind on error.
    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        exit(1)
    basename = return_file_basename(bamfilename)
    pybamfile = pysam.Samfile(bamfilename, "rb")
    # Single @RG line: the whole file is assumed to be one sample/readgroup.
    newheader = pybamfile.header
    newheader['RG'] = [{'PI': options.pi, 'ID': options.rgid,
                        'PL': options.pl, 'SM': options.sm}]
    rgreads = pysam.Samfile(basename + ".RG.bam", "wb", header=newheader)
    sys.stderr.write("writing new bam file with RG tag in record and header ...\n")
    for read in pybamfile.fetch():
        read.tags = read.tags + [("RG", options.rgid)]
        rgreads.write(read)
    # BUG FIX: close both bams; BGZF output is truncated without close().
    rgreads.close()
    pybamfile.close()
def main():
    """For each tab-separated pair of VCFs listed in the input file, write a
    shell script that merges them with GATK CombineVariants, optionally
    wrapped as a PBS cluster job using node-local scratch."""
    cwd = os.getcwd()
    usage = "usage: %prog [options] vcf.files.list.txt"
    parser = OptionParser(usage)
    parser.add_option("--bin", type="string", dest="binpath",
                      default="/share/home/indapa/software/GenomeAnalysisTK-1.6-5-g557da77/dist",
                      help="GATK path default: /share/home/indapa/software/GenomeAnalysisTK-1.6-5-g557da77/dist")
    parser.add_option("--R", type="string", dest='ref',
                      default='/d1/data/pipeline_resources/references/human_reference.b37.including_decoys/human_reference_v37_decoys.fa',
                      help=" reference fasta (default is /d1/data/pipeline_resources/references/human_reference.b37.including_decoys/human_reference_v37_decoys.fa")
    parser.add_option("--scriptname", type="string", dest="scriptname",
                      default="combineVariants.sh",
                      help="name of shell script (combineVariants.sh default)")
    parser.add_option("--mergeoption", type="string", dest="mergeoption",
                      default="UNIQUIFY", help="default UNIQUIFY")
    parser.add_option("--runlocal", action="store_true", dest="runlocal",
                      help="run local and not as cluster job", default=False)
    parser.add_option("--mem", type="string", dest="mem", default="mem8",
                      help="memory of node, default mem8")
    parser.add_option("--ppn", type="int", dest="ppn", default=1,
                      help="processes per node default 1")
    (options, args) = parser.parse_args()

    # input file: one "variant1.vcf<TAB>variant2.vcf" pair per line
    fh = open(args[0], 'r')
    for line in fh:
        (variant1, variant2) = line.strip().split('\t')
        # BUG FIX: the original referenced the builtin `file` and the
        # undefined names `file2`, `name1`, `name2` and `priority`
        # (NameError at runtime).  The file basenames serve as the -V: tags
        # and build the output/script names; the unused datestring was removed.
        name1 = return_file_basename(variant1)
        name2 = return_file_basename(variant2)
        outputfile = ".".join([name1, name2, 'combineVariants', 'vcf'])
        outdir = outputfile.replace('.vcf', '')
        scriptname = outputfile.replace('.vcf', '.sh')
        # NOTE(review): merge priority assumed to be "name1,name2" - confirm.
        priority = ",".join([name1, name2])

        outfh = open(scriptname, 'w')
        localtime = time.asctime(time.localtime(time.time()))
        outfh.write("#" + localtime + "\n")
        if options.runlocal == False:
            # PBS preamble and node-local scratch setup (cluster runs only).
            # NOTE(review): nesting inferred from the flattened original -
            # the matching scratch cleanup below is also runlocal-guarded.
            outfh.write("#PBS -l nodes=1:" + options.mem + ":ppn=" + str(options.ppn) + '\n')
            outfh.write("#PBS -m abe" + '\n')
            outfh.write("\n")
            outfh.write("NODE_DIR=/scratch/indapa/" + outdir + "\n")
            outfh.write("INPUT_DIR=$NODE_DIR\n")
            outfh.write("if [ -d $NODE_DIR ]\nthen\nrm -rf $NODE_DIR\n fi" + '\n')
            outfh.write("mkdir -p $NODE_DIR \n cd $NODE_DIR\n")
            outfh.write("\n")
        printTransferFunction(outfh)
        outfh.write("\n")
        printFailFunction(outfh, scriptname)
        outfh.write("\n")

        # Assemble the CombineVariants invocation piece by piece.
        jarcommand = "java -jar " + options.binpath + "/GenomeAnalysisTK.jar"
        commandtype = "-T CombineVariants"
        referencefile = " -R " + options.ref
        variantone = "-V:" + name1 + " " + variant1
        variantwo = "-V:" + name2 + " " + variant2
        output = " -o " + outputfile
        mergestring = "-genotypeMergeOptions " + options.mergeoption
        prioritystring = "-priority " + priority
        stdout = " > " + scriptname + ".stdout"
        stderr = "2> " + scriptname + ".stderr"
        commandline = " ".join([jarcommand, commandtype, referencefile,
                                variantone, variantwo, output, mergestring,
                                prioritystring, stdout, stderr])
        outfh.write(commandline + "\n")
        outfh.write("\n")
        outfh.write(" if [ $? -ne 0 ]; then \n")
        outfh.write("Terminate_Script \"CombineVariants\" " + scriptname + ".stderr" + " \n")
        outfh.write("fi\n")
        if options.runlocal == False:
            # copy results back to the submit directory and clean up scratch
            outfh.write("cp " + outputfile + " " + cwd + "\n")
            outfh.write("echo \" " + scriptname + "\" > " + cwd + "/" + scriptname + ".complete\n")
            outfh.write("rm -fr $NODE_DIR\n")
        outfh.close()  # flush each generated job script
    fh.close()