def main():
    """Extract SNPs or INDELs from a vcf(.gz) file and write them to stdout.

    Command line: -type SNP|INDEL  vcfile
    """
    parser = argparse.ArgumentParser(description='extract SNPs or INDELs from a vcf.gz file and write to stdout')
    parser.add_argument('-type', dest='vtype', type=str, help="SNP|INDEL")
    parser.add_argument('vcfile', type=str, help='*.vcf.gz file')

    args = parser.parse_args()

    # Strip the extra .gz layer so the basename reflects the underlying .vcf
    # name (return_file_basename is applied twice for *.vcf.gz inputs).
    vcfroot, ext = os.path.splitext(args.vcfile)
    if ext == '.gz':
        vcf_basename = return_file_basename(return_file_basename(args.vcfile))
    else:
        vcf_basename = return_file_basename(args.vcfile)

    # kept for the (currently disabled) stderr progress message below
    new_vcfname = ".".join([vcf_basename, args.vtype, 'vcf'])

    vcf_reader = vcf.Reader(open(args.vcfile, 'r'))
    vcf_writer = vcf.Writer(open('/dev/stdout', 'w'), vcf_reader)

    #sys.stderr.write("writing "+ args.vtype + " to " + new_vcfname +"\n")
    for record in vcf_reader:
        # A record is never both a SNP and an INDEL, so the two checks are
        # mutually exclusive -> if/elif, and truthiness instead of "== True".
        if args.vtype == 'SNP' and record.is_snp:
            vcf_writer.write_record(record)
        elif args.vtype == 'INDEL' and record.is_indel:
            vcf_writer.write_record(record)
def main():
    """Write one shell script per QC-passing BAM in a sequencing index file,
    each running samtools mpileup restricted to a bed file.

    Input: tab-separated seqindex file with a QC column; only rows whose
    QC field is 'PASS' produce a job script.
    """
    parser = argparse.ArgumentParser(description='generate jobs for mplielup')

    parser.add_argument("-Q", help="mapping quality", dest='mq', default='30')
    parser.add_argument("-q", help="base quality", dest='bq', default='20')
    parser.add_argument("-bin", help="path to samtools", dest='bin', default="/broad/software/free/Linux/redhat_5_x86_64/pkgs/samtools/samtools_0.1.19/bin/samtools")
    parser.add_argument("-bed", dest='bed', help="bedfile")
    parser.add_argument("-ref", dest='ref', default='/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta', help="reference fasta")

    parser.add_argument('seqindex', type=str, help='file.index')

    args = parser.parse_args()

    cwd = os.getcwd()

    SeqIndexRecord = namedtuple('SeqIndexRecord', 'PDO, SeqProject, Title, SeqCenter, PDOSample, ExternalID, BAMPath, FID, QC')
    bedfile_base = return_file_basename(args.bed)

    fh = open(args.seqindex, 'r')
    for rec in map(SeqIndexRecord._make, csv.reader(fh, delimiter='\t')):
        if rec.QC != 'PASS':
            continue
        bamfile_base = return_file_basename(rec.BAMPath)
        pileupout = ".".join([bamfile_base, bedfile_base, 'pileup'])

        # BUG FIX: in samtools mpileup, -q filters on MAPPING quality and
        # -Q on BASE quality.  The original passed args.bq to -q and
        # args.mq to -Q, silently swapping the two thresholds.
        commandline = " ".join([args.bin, 'mpileup',
                                '-q', args.mq, '-Q', args.bq,
                                '-f', args.ref, '-l', args.bed,
                                rec.BAMPath, '>', cwd + "/" + pileupout])

        outfh = open("pileupjob." + bamfile_base + "." + bedfile_base + ".sh", 'w')
        outfh.write(commandline + "\n")
        outfh.close()  # flush each job script before moving to the next record
    fh.close()
def main():
    """ reassign mapping qualities in a bam file

    Rewrites every aligned read of file.bam with mapq set to --DMQ and
    writes the result to <basename>.dmq.bam.
    """
    usage = "usage: %prog [options] file.bam"
    parser = OptionParser(usage)
    parser.add_option("--DMQ", type="int", dest="dmq", default=60, help="default mapping quality to set")

    (options, args) = parser.parse_args()

    bamfilename = args[0]
    basename = return_file_basename(bamfilename)

    # Check for the index BEFORE creating any output: fetch() on a
    # coordinate-sorted bam needs the .bai, and the original only checked
    # after opening the output file, leaving an empty .dmq.bam behind.
    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        exit(1)

    pybamfile = pysam.Samfile(bamfilename, "rb")

    bamfile_withRG_name = basename + ".dmq.bam"
    dmqbamfile = pysam.Samfile(bamfile_withRG_name, "wb", header=pybamfile.header)

    sys.stderr.write("writing new bam file with new default mapping quality tag  alignment record  ...\n")

    for read in pybamfile.fetch():
        read.mapq = options.dmq
        dmqbamfile.write(read)

    # close explicitly so the BGZF EOF marker is written to the new bam
    dmqbamfile.close()
    pybamfile.close()
def main():
    """ reassign mapping qualities in a bam file

    Sets mapq on every fetched read of file.bam to the --DMQ value and
    writes a new <basename>.dmq.bam.
    """
    usage = "usage: %prog [options] file.bam"
    parser = OptionParser(usage)
    parser.add_option("--DMQ", type="int", dest="dmq", default=60, help="default mapping quality to set")

    (options, args) = parser.parse_args()

    bamfilename = args[0]
    basename = return_file_basename(bamfilename)

    # Verify the .bai index before opening anything for output; the
    # original performed this check only after the output bam was created,
    # so a missing index left a stray empty output file.
    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        exit(1)

    pybamfile = pysam.Samfile(bamfilename, "rb")

    bamfile_withRG_name = basename + ".dmq.bam"
    dmqbamfile = pysam.Samfile(bamfile_withRG_name, "wb", header=pybamfile.header)

    sys.stderr.write("writing new bam file with new default mapping quality tag  alignment record  ...\n")

    for read in pybamfile.fetch():
        read.mapq = options.dmq
        dmqbamfile.write(read)

    # explicit close: without it the compressed bam lacks its EOF block
    dmqbamfile.close()
    pybamfile.close()
def main():
    
    """ get a specified number of paired-end reads from a samfile """
    usage = "usage: %prog [options] readname_sorted.sam"
    parser=OptionParser(usage)
    parser.add_option("--N", type="int", default=100, dest="N", help="number of records to randomly select")
    
    (options,args)=parser.parse_args()
    
    samfilename=args[0]
    basename=return_file_basename(samfilename)
    
    subfh=open(basename+".subset.sam", "w")
    
    samfh=open(samfilename, 'r')
 
    records1 = sum(1 for _ in open(samfilename)) / 2
    rand_records=sorted([random.randint(0, records1 - 1) for _ in xrange(options.N)])
    
    print "total number of read pairs to select: ", len(rand_records)
    
    rec_no=-1
    records_written=0
    
    for rr in rand_records:
        while rec_no < rr:
            rec_no+=1
            for i in range(2): samfh.readline()
        for j in range(2):
            subfh.write(samfh.readline())
        rec_no+=1
        records_written+=1
        
    print >>sys.stderr, "wrote to  %d read pairs to %s  " % (records_written, subfh.name)
def main():
    """ randomly select N records from paired end fastq files
        code based on this: http://www.biostars.org/p/6544/#6555  """
    usage = "usage: %prog [options] file_1.fq.gz file_2.fq.gz"
    parser = OptionParser(usage)

    parser.add_option("--N", type="int", default=100, dest="N", help="number of records to randomly select")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        sys.stderr.write("provide two fastq.gz files!\n")
        sys.exit(1)

    fq1 = args[0]
    fq2 = args[1]
    fq1_basename = return_file_basename(fq1)
    fq2_basename = return_file_basename(fq2)
    suba, subb = gzip.open(fq1_basename + ".subset.gz", "wb"), gzip.open(fq2_basename + ".subset.gz", "wb")

    # 4 lines per fastq record
    records1 = sum(1 for _ in gzip.open(fq1)) / 4
    # BUG FIX: the original warning said "less than" when the request
    # actually EXCEEDS the number of records in the file
    if options.N > records1:
        sys.stderr.write("number requested greater than number in fastq file!\n")

    if sum(1 for _ in gzip.open(fq2)) / 4 != records1:
        sys.stderr.write("unequal number of fastq records in PE files!\n")
        sys.exit(1)

    fh1 = gzip.open(fq1)
    fh2 = gzip.open(fq2)

    # sample WITH replacement; duplicates fall through the while-loop and
    # copy the following record instead (behavior inherited from the
    # biostars snippet, preserved here)
    rand_records = sorted([random.randint(0, records1 - 1) for _ in xrange(options.N)])

    rec_no = -1
    for rr in rand_records:
        while rec_no < rr:
            rec_no += 1
            for i in range(4): fh1.readline()
            for i in range(4): fh2.readline()
        for i in range(4):
            suba.write(fh1.readline())
            subb.write(fh2.readline())
        rec_no += 1

    # BUG FIX: gzip outputs must be closed explicitly, otherwise the final
    # compressed block and gzip trailer are never flushed and the .gz
    # files are truncated/corrupt
    suba.close()
    subb.close()
    fh1.close()
    fh2.close()

    print >>sys.stderr, "wrote to %s, %s" % (suba.name, subb.name)
def main():
    """Write one shell script per BAM listed in a REU index file, each
    running a Picard tool (CalculateHsMetrics by default) against a target
    interval list."""
    today = datetime.datetime.today()

    # e.g. "20250102 14:03:55"; "20%y" assumes years 2000-2099
    datestr = today.strftime("20%y%m%d %X")

    CWD = os.getcwd()

    # canonical Broad resource paths (targets kept for reference)
    BROAD_STD_AGILENT_TARGETS = "/humgen/gsa-pipeline/resources/b37/v5/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"
    BROAD_STD_AGILENT_GENCODE = "/humgen/gsa-pipeline/resources/b37/v5/gencode.v12_broad.agilent_merged.interval_list"
    REF = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"
    BIN = "/seq/software/picard/current/bin"
    #bsub -q hour -o aindap.$fbname.out $PWD/$x; done

    parser = argparse.ArgumentParser(description='Program description')

    parser.add_argument("--ti", type=str, dest='ti', help="target interval file (full path)")

    parser.add_argument('indexfile', type=str, help='REU index file')
    parser.add_argument("--picardtool", type=str, dest='picardtool', default="CalculateHsMetrics.jar")
    parser.add_argument("--queue", type=str, dest='queue', default="week")
    args = parser.parse_args()

    command = "java -Xmx2g -jar " + BIN + "/" + args.picardtool

    TARGETS = args.ti

    sys.stderr.write("using " + TARGETS + "\n")

    ReuIndexRecord = namedtuple('ReuIndexRecord', 'PDO, SeqProject, Title, BI, PDOSample, ExternalID, BAMPath')

    #java -Xmx8g -jar  /seq/software/picard/current/bin/CalculateHsMetrics.jar BAIT_INTERVALS=$TARGETS TARGET_INTERVALS=$TARGETS INPUT=$BAM OUTPUT=00428C.hsmetrics.txt REFERENCE_SEQUENCE=$REF PER_TARGET_COVERAGE=00428C.pertarget.txt

    for rec in map(ReuIndexRecord._make, csv.reader(open(args.indexfile, "rb"), delimiter='\t')):
        if rec.PDO == 'PDO':  # skip the header row
            continue
        samplename = return_file_basename(rec.BAMPath)
        samplemetrics = ".".join([samplename, 'hsmetrics', 'txt'])
        pertargetmetrics = ".".join([samplename, 'hsmetrics', 'pertarget', 'txt'])
        commandline = [command, "BAIT_INTERVALS=" + TARGETS, "TARGET_INTERVALS=" + TARGETS,
                       "INPUT=" + rec.BAMPath, "OUTPUT=" + CWD + "/" + samplemetrics,
                       "REFERENCE_SEQUENCE=" + REF, "PER_TARGET_COVERAGE=" + CWD + "/" + pertargetmetrics]
        outstring = " ".join(commandline)
        scriptfile = ".".join([samplename, rec.SeqProject, 'hsmetrics', 'sh'])
        outfh = open(scriptfile, 'w')
        outfh.write("#" + datestr + "\n")
        outfh.write(outstring + "\n")
        outfh.close()  # flush each job script before the next iteration
# Example #8
def main():
    """Transpose a csv file (rows become columns) and write it to stdout."""
    parser = argparse.ArgumentParser(description='Program description')
    parser.add_argument(dest='csvfile', help="csvfile to pivot (turn rows into columns)\n")

    args = parser.parse_args()

    # izip(*rows) yields one tuple per original column, i.e. the transpose,
    # lazily -- only one full pass over the reader is made by writerows.
    a = izip(*csv.reader(open(args.csvfile, "rb")))

    # BUG FIX: the original called open(sys.stdout, "wb"), which raises
    # TypeError -- sys.stdout is already a file object and can be handed
    # to csv.writer directly.
    csv.writer(sys.stdout).writerows(a)
# Example #9
def main():
    """ add a the RG tag to the sam/bam record as well as update the header
        right now assumes single sample in the sam/bam file so can only add one RG/SM tag """

    usage = "usage: %prog [options] file.bam"
    parser = OptionParser(usage)
    parser.add_option("--RG", type="string", dest="rgid", help="readgroup id")

    parser.add_option("--SM", type="string", dest="sm",  help="sample name")
    parser.add_option("--PL", type="string", dest="pl", default="illumina",  help="platform unit (of sequencing)")
    parser.add_option("--PI", type="string", dest="pi", default="350",help="insert size" )
    (options, args) = parser.parse_args()

    bamfilename = args[0]
    basename = return_file_basename(bamfilename)

    # Check for the .bai index BEFORE creating the output bam; the original
    # opened the output first, so a missing index aborted the run but left
    # a stray empty .RG.bam behind.
    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        exit(1)

    pybamfile = pysam.Samfile(bamfilename, "rb")

    # single-sample assumption: one RG entry replaces any existing RG list
    newheader = pybamfile.header
    newheader['RG'] = [{'PI': options.pi, 'ID': options.rgid, 'PL': options.pl, 'SM': options.sm}]

    bamfile_withRG_name = basename + ".RG.bam"
    rgreads = pysam.Samfile(bamfile_withRG_name, "wb", header=newheader)

    sys.stderr.write("writing new bam file with RG tag in record and header ...\n")
    for read in pybamfile.fetch():
        # append the RG tag to each alignment's optional-tag list
        read.tags = read.tags + [("RG", options.rgid)]
        rgreads.write(read)

    # close so the BGZF EOF marker is written to the new bam
    rgreads.close()
    pybamfile.close()
def main():
    """For each tab-separated pair of vcf paths in the argument file, write a
    shell script that runs GATK CombineVariants on the pair, either locally
    or as a PBS cluster job.

    NOTE(review): as written this function references several undefined
    names (see inline BUG comments) and will raise NameError at runtime.
    The function body may continue beyond this view, so the code is left
    unchanged and only annotated.
    """
    # date as YYYYMMDD (unused below -- presumably intended for file naming)
    datestring = str(datetime.now()).split( ' ' )[0]
    datestring=datestring.replace('-','')
    localtime = time.asctime( time.localtime(time.time()) )
    cwd = os.getcwd()

    usage = "usage: %prog [options] vcf.files.list.txt"

    parser = OptionParser(usage)
    parser.add_option("--bin", type="string", dest="binpath", default="/share/home/indapa/software/GenomeAnalysisTK-1.6-5-g557da77/dist", help="GATK path default: /share/home/indapa/software/GenomeAnalysisTK-1.6-5-g557da77/dist")
   
    parser.add_option("--R", type="string", dest='ref', default='/d1/data/pipeline_resources/references/human_reference.b37.including_decoys/human_reference_v37_decoys.fa',
                      help="  reference fasta (default is /d1/data/pipeline_resources/references/human_reference.b37.including_decoys/human_reference_v37_decoys.fa")

    parser.add_option("--scriptname", type="string", dest="scriptname", default="combineVariants.sh", help="name of shell script (combineVariants.sh default)")
    

    parser.add_option("--mergeoption", type="string", dest="mergeoption", default="UNIQUIFY", help="default UNIQUIFY")
    parser.add_option("--runlocal", action="store_true", dest="runlocal", help="run local and not as cluster job", default=False)

    parser.add_option("--mem", type="string", dest="mem", default="mem8", help="memory of node, default mem8")
    parser.add_option("--ppn", type="int", dest="ppn", default=1, help="processes per node default 1")


    (options, args)=parser.parse_args()

    argsfile=args[0]
    fh=open(argsfile, 'r')

    """ file with variant1.vcf variant2.vcf to merge """
    for line in fh:
        # each line: variant1.vcf <TAB> variant2.vcf
        (variant1, variant2)=line.strip().split('\t')
        vcf1=return_file_basename(variant1)
        vcf2=return_file_basename(variant2)
        # BUG(review): `file` and `file2` are undefined here (`file` is the
        # Python 2 builtin type, so this raises TypeError/NameError).
        # Presumably vcf1/vcf2 computed above were intended -- confirm.
        outputfile=".".join([os.path.splitext(file)[0],os.path.splitext(file2)[0], 'combineVariants', 'vcf'])
       
        outdir=outputfile
        outdir=string.replace(outdir,'.vcf', '')

        # script name mirrors the output vcf name, .vcf -> .sh
        scriptname=outputfile
        scriptname=string.replace(scriptname,'.vcf', '.sh')
        outfh=open(scriptname,'w')

        localtime = time.asctime( time.localtime(time.time()) )
        outfh.write("#" + localtime +"\n")


        if options.runlocal == False:
            # PBS preamble: request a node, run on scratch, fail fast
            outfh.write("#PBS -l nodes=1:"+options.mem+":ppn="+str(options.ppn)+'\n')
            outfh.write("#PBS -m abe"+'\n')
            outfh.write("\n")
        
            outfh.write("NODE_DIR=/scratch/indapa/"+outdir+"\n")
            outfh.write("INPUT_DIR=$NODE_DIR\n")
            outfh.write("if [ -d $NODE_DIR ]\nthen\nrm -rf $NODE_DIR\n fi"+'\n')
            outfh.write("mkdir -p  $NODE_DIR \n cd $NODE_DIR\n")
            outfh.write("\n")
            printTransferFunction(outfh)
            outfh.write("\n")
            printFailFunction(outfh, scriptname)
            outfh.write("\n")

    

        jarcommand = "java -jar " + options.binpath+"/GenomeAnalysisTK.jar"
        commandtype="-T CombineVariants"
        referencefile= " -R " + options.ref
        # BUG(review): name1/name2 are never defined (NameError); the -V:tag
        # labels were presumably meant to be vcf1/vcf2 -- confirm and fix.
        variantone= "-V:"+name1 + " " +variant1
        variantwo= "-V:"+name2 + " " +variant2
        output = " -o " + outputfile
        mergestring="-genotypeMergeOptions " + options.mergeoption
        # BUG(review): `priority` is never defined (NameError); GATK expects
        # a comma-separated list of the -V tag names, and -priority is only
        # consulted for the PRIORITIZE merge option -- confirm and fix.
        prioritystring="-priority " + priority
    
    
    
        stdout = " > " + scriptname + ".stdout"
        stderr = "2> " + scriptname + ".stderr"
    
    
        commandline = " ".join( [ jarcommand, commandtype,referencefile, variantone, variantwo, output, mergestring, prioritystring, stdout, stderr  ] )
        outfh.write(commandline + "\n")
        outfh.write("\n")
        outfh.write( " if [ $? -ne 0  ]; then \n")
        outfh.write("Terminate_Script \"CombineVariants\" "  + scriptname+".stderr"  +  " \n")
        outfh.write("fi\n")
        if options.runlocal == False:
    #cp back to homedir
            outfh.write("cp " + outputfile + " " + cwd + "\n")
            outfh.write("echo \" " + scriptname + "\" > " + cwd + "/" +scriptname+".complete\n")
            outfh.write("rm -fr $NODE_DIR\n")