Example #1
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-wga',
                        help='whole genome alignment bed',
                        required=True)
    parser.add_argument('-bed', help='bed file of region', required=True)
    parser.add_argument('-region', help='region label', required=True)
    parser.add_argument('-spp',
                        help='Comma separated species list for output',
                        required=True)
    parser.add_argument('-out_stem',
                        help='output directory and file stem',
                        required=True)
    args = parser.parse_args()

    out_stem = args.out_stem + '_' + args.region

    bed_cmd = 'bedtools intersect -a {wga} -b {bed} | bgzip -c > {out_stem}.wga.bed.gz'.format(
        wga=args.wga, bed=args.bed, out_stem=out_stem)

    fasta_cmd = (
        'zcat {out_stem}.wga.bed.gz | '
        '~/sal_enhancers/divergence/wga2fa.py -out_stem {out_stem}.wga -spp {spp}'
    ).format(out_stem=out_stem, spp=args.spp)

    ape_cmd = 'Rscript ~/sal_enhancers/divergence/k80_div_est.R {out_stem}.wga.fa {region} > {out}'.format(
        out_stem=out_stem, region=args.region, out=out_stem + '.div.txt')

    q_sub([bed_cmd, fasta_cmd, ape_cmd],
          out=out_stem,
          rmem=12,
          mem=12,
          scheduler='SLURM')
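
# Note: q_sub (from the local qsub module, imported in Example #29) appears to
# take a list of shell command strings plus job options - an output stem for
# job scripts/logs, mem/rmem for memory, t for runtime and a scheduler name -
# and submit them as a single batch job. This is inferred from usage across
# these examples, not from the module's documentation.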
Example #2
def main():

    counter = 0
    cmds = ['cd /scratch/project_2002047/barson_mapping_v2/reads']

    for line in open('PRJEB10744.txt'):

        if line.startswith('study'):
            continue

        # only count read lines, not the header line
        counter += 1

        reads = line.split('\t')[9].split(';')
        cmds += ['wget -c ftp://' + reads[0], 'wget -c ftp://' + reads[1]]

        if counter % 10 == 0:
            q_sub(cmds,
                  out='/scratch/project_2002047/barson_mapping_v2/reads/read_download' + str(counter),
                  t=48,
                  scheduler='SLURM')
            cmds = ['cd /scratch/project_2002047/barson_mapping_v2/reads']

    q_sub(cmds,
          out='/scratch/project_2002047/barson_mapping_v2/reads/read_download' + str(counter),
          t=48,
          scheduler='SLURM')
Example #3
def main():
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', help='Reference genome', required=True)
    parser.add_argument('-out', help='Output vcf stem', required=True)
    args = parser.parse_args()

    # vcfs
    vcf_list = [x.rstrip() for x in sys.stdin]

    contigs = [x.split('\t')[0] for x in open(args.ref + '.fai') if not x.startswith('NW')
               and not x.startswith('KT') and not x.startswith('NC_001960.1')]

    for chromo in contigs:

        out = args.out + '_' + chromo + '.gatk.allsites.g.vcf'

        # submit job
        combine_cmd = ('gatk --java-options "-Xmx20g -Xms20g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" CombineGVCFs '
                       '-R {} '
                       '-O {} '
                       '-L {} ').format(args.ref, out, chromo)

        for v in vcf_list:
            combine_cmd += '--variant {} '.format(v)
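
        # the assembled command has the form:
        # gatk ... CombineGVCFs -R <ref> -O <out> -L <chromo> --variant a.g.vcf --variant b.g.vcf ...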

        q_sub([combine_cmd], out=out.replace('.g.vcf', ''), t=60, mem=25, rmem=25, scheduler='SLURM')
Example #4
def main():
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-cds_fa',
                        help='Fasta file of CDS sequences',
                        required=True)
    args = parser.parse_args()

    for i in (0, 2, 3, 4):

        cmd = (
            'python ~/sal_bal_sel/annotation/degen_to_bed.py '
            '-cds_fa {} -degen {} | '
            'sort -T /scratch/tuyida/bartonhe/tmp/ -k1,1 -k2,2n | '
            'bedtools merge -c 4 -o distinct | '
            'bgzip -c > /scratch/tuyida/bartonhe/sal_ref/salmo_salar_{}fold.bed.gz'
            '').format(args.cds_fa, i, i)

        q_sub(
            [cmd],
            out='/users/bartonhe/sal_bal_sel/annotation/{}fold_to_bed'.format(
                i),
            scheduler='SLURM',
            rmem=10,
            mem=10)
Example #5
def main():

    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', help='Reference genome', required=True)
    parser.add_argument('-out', help='Output dir', required=True)
    args = parser.parse_args()

    for vcf in sys.stdin:

        vcf = vcf.rstrip()
        out_vcf = args.out + vcf.replace('.allsites.g.vcf',
                                         '.raw.snps.indels.vcf').split('/')[-1]

        # submit job
        genotyper = (
            'gatk --java-options "-Xmx4g -Djava.io.tmpdir=/scratch/project_2002047/tmp" GenotypeGVCFs '
            '-R {} -V {} -O {} ').format(args.ref, vcf, out_vcf)

        q_sub([genotyper],
              out=out_vcf.replace('.vcf', ''),
              t=60,
              mem=10,
              rmem=10,
              scheduler='SLURM')
Example #6
def main():

    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-maf_dir',
                        help='directory of block maf files',
                        required=True)
    args = parser.parse_args()

    mafs = [x for x in os.listdir(args.maf_dir) if x.endswith('.maf')]

    for maf in mafs:

        block = maf.replace('.multiple.maf', '')
        maf = args.maf_dir + maf

        cmd = (
            'cat {maf} | python /users/bartonhe/sal_enhancers/homeoblock_alignments/maf2fasta.py '
            '-block {block} -out {out}').format(maf=maf,
                                                block=block,
                                                out=args.maf_dir)

        q_sub([cmd],
              out=args.maf_dir + block + '.clean_align',
              scheduler='SLURM')
Example #7
def main():
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-in_dir', help='Top level input directory', required=True)
    parser.add_argument('-ref', help='Reference genome', required=True)
    parser.add_argument('-out_dir', help='Output directory', required=True)
    args = parser.parse_args()

    contigs = [x for x in os.listdir(args.in_dir) if os.path.isdir(args.in_dir + x)]
    
    for chromo in contigs:

        chromo_path = args.in_dir + chromo + '/'

        vcf_list = [chromo_path + x for x in os.listdir(chromo_path) if x.endswith('.g.vcf') and 'SRR' not in x]

        out = args.out_dir + 'salsal_{}.{}.allsites.g.vcf'.format(len(vcf_list), chromo)

        # submit job
        combine_cmd = ('gatk --java-options "-Xmx50g -Xms50g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" CombineGVCFs '
                       '-R {} '
                       '-O {} ').format(args.ref, out)

        for v in vcf_list:
            combine_cmd += '--variant {} '.format(v)

        q_sub([combine_cmd], out=out.replace('.g.vcf', ''), t=60, mem=53, rmem=53, scheduler='SLURM')
Example #8
def main():

    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-cds_fa',
                        help='Fasta file of CDS sequences',
                        required=True)
    parser.add_argument('-vcf', help='SNP vcf path', required=True)
    parser.add_argument('-out', help='output file stem', required=True)
    parser.add_argument('-evolgen',
                        help='If specified will run on evolgen',
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    # get chromosome list
    grep_cmd = 'zgrep -v ^# {} | cut -f 1 | uniq'.format(args.vcf)
    # universal_newlines=True so communicate() returns str, not bytes, in python 3
    chromo_list = subprocess.Popen(
        grep_cmd, stdout=subprocess.PIPE, universal_newlines=True,
        shell=True).communicate()[0].split('\n')[:-1]
    chromo_list = [x for x in chromo_list if x.startswith('chr')]

    # loop through chromo list and submit job for each
    for chromo in chromo_list:
        stem = '_'.join([args.out, chromo])

        nonsense_cmd = ('~/parus_indel/annotation/prem_stops_to_bed.py '
                        '-cds_fa {} '
                        '-vcf {} '
                        '-chr {} '
                        '-out {}').format(args.cds_fa, args.vcf, chromo,
                                          args.out)

        q_sub([nonsense_cmd], out=stem, t=48, evolgen=args.evolgen)
Example #9
def main():
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-indel_vcf', help='Vcf file to get summary stats for', required=True)
    parser.add_argument('-snp_vcf', help='Vcf file to get summary stats for', required=True)
    parser.add_argument('-region_list', help='text file with pairs of labels and bed files', required=True)
    parser.add_argument('-out_pre', help='output path and prefix', required=True)
    parser.add_argument('-correct_sfs', help='Corrects sfs for pol error', default=False, action='store_true')
    parser.add_argument('-evolgen', help='If specified will submit to lab queue', default=False, action='store_true')
    parser.add_argument('-no_sub', help=argparse.SUPPRESS, default=False, action='store_true')
    args = parser.parse_args()

    if args.correct_sfs:
        correct = ' -correct_sfs'
    else:
        correct = ''

    for region in open(args.region_list):

        tag, bed = region.split()

        out_stem = args.out_pre + '_' + tag

        cmd = ('~/parus_indel/summary_analyses/bed_summary_stats.py '
               '-indel_vcf {} -snp_vcf {} '
               '-bed {} -tag {}{} '
               '> {}').format(args.indel_vcf, args.snp_vcf, bed, tag, correct, out_stem + '_stats.txt')

        if args.no_sub:
            q_write([cmd], out=out_stem, mem=10, rmem=10, evolgen=args.evolgen)
        else:
            q_sub([cmd], out=out_stem, mem=10, rmem=10, evolgen=args.evolgen)
Example #10
def main():
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-wga',
                        help='Whole genome alignment bed file',
                        required=True)
    parser.add_argument('-region_list',
                        help='Coordinates to calc divergence for, bed format',
                        required=True)
    parser.add_argument('-out_dir', help='Output directory', required=True)
    parser.add_argument('-evolgen',
                        help='if specified will run on lab queue',
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    for line in open(args.region_list):

        region, bed = line.split()

        out_stem = '{}gt_indel_div_{}'.format(args.out_dir, region)

        div_cmd = ('~/parus_indel/summary_analyses/indel_divergence.py '
                   '-wga {} -bed {} -tag {} > {}').format(
                       args.wga, bed, region, out_stem + '.txt')

        q_sub([div_cmd], out=out_stem, t=24, evolgen=args.evolgen)
Example #11
def main():
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-cds_fa_dir',
                        help='cds fasta directory',
                        required=True)
    parser.add_argument('-out_dir', help='output directory', required=True)
    parser.add_argument('-vcf', help='SNP vcf path', required=True)
    parser.add_argument('-call_fa',
                        help='Callable sites fasta file',
                        required=True)
    parser.add_argument('-evolgen',
                        help='if specified will run on lab queue',
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    out_files = []
    out_dir = args.out_dir
    autos = ('2R', '2RHet', '2L', '2LHet', '3R', '3RHet', '3L', '3LHet', '4')

    # per chromo jobs for extracting nonsense data
    for x in [args.cds_fa_dir + y for y in os.listdir(args.cds_fa_dir)]:
        if not x.endswith('.fasta.gz'):
            continue

        chromo = x.split('-')[1]

        if chromo not in autos:
            continue

        outstem = x.split('/')[-1].replace('.fasta.gz', '')
        out = out_dir + outstem + '.premstops.txt'

        out_files.append(out)

        extract_cmd = ('./extract_prem_stops.py '
                       '-cds_fa {cds_fa} '
                       '-chr {chromo} '
                       '-vcf {vcf} '
                       '-call_fa {c_fa} '
                       '-n 17 '
                       '-unfolded '
                       '-out {output}').format(cds_fa=x,
                                               chromo=chromo,
                                               vcf=args.vcf,
                                               c_fa=args.call_fa,
                                               output=out)

        q_sub([extract_cmd], out=out_dir + outstem, evolgen=args.evolgen)

    # write list file
    list_file = out_dir + 'chromo_nonsense_list.txt'

    with open(list_file, 'w') as list_out:
        print(*out_files, sep='\n', file=list_out)
Example #12
def main():

    for pairwise_dir in sys.stdin:

        in_dir = pairwise_dir.rstrip()

        single_cov = 'python ~/sal_bal_sel/genome_alignment/single_cov.py -dir {} -ref_name BrownTrout'.format(
            in_dir)

        q_sub([single_cov], out=in_dir + 'single_cov', scheduler='SLURM')
Example #13
def main():
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-wga', help='wga bed file', required=True)
    parser.add_argument('-ucne_bed', help='UCNE bed file', required=True)
    parser.add_argument('-out_dir', help='output directory', required=True)
    parser.add_argument('-evolgen',
                        help='If specified will run on evolgen',
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    # get chromosome list
    grep_cmd = 'zcat {} | cut -f 1 | uniq'.format(args.ucne_bed)
    # universal_newlines=True so communicate() returns str, not bytes, in python 3
    chromo_list = subprocess.Popen(
        grep_cmd, stdout=subprocess.PIPE, universal_newlines=True,
        shell=True).communicate()[0].split('\n')[:-1]
    chromo_list = [
        x for x in chromo_list if x.startswith('chr') and 'random' not in x
    ]

    jids = []

    for chromo in chromo_list:

        out = 'gt_ucne_{}.bed'.format(chromo)

        cmd = ('zcat {} | '
               '~/WGAbed/non_ref_intersect.py '
               '-b {} -q Zebrafinch -c {} | '
               'grep -v "?" | '
               'cut -f 1-3 | '
               'sort -k1,1 -k2,2n | '
               'bedtools merge '
               '> {}').format(args.wga, args.ucne_bed, chromo,
                              args.out_dir + out)

        jid = out.replace('.bed', '.sh')
        jids.append(jid)

        q_sub([cmd],
              out=args.out_dir + out.replace('.bed', ''),
              jid=jid,
              evolgen=args.evolgen)

    # gather: runs once all the per-chromosome jobs have finished (hold=jids)
    gather = 'cat {}*.bed | bgzip -c > {}gt_ucne.bed.gz'.format(
        args.out_dir, args.out_dir)
    index = 'tabix -pbed {}gt_ucne.bed.gz'.format(args.out_dir)

    q_sub([gather, index],
          out=args.out_dir + 'ucne_bed_merge',
          hold=jids,
          evolgen=args.evolgen)
Example #14
def main():

    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-top_dir', help='top level directory of chromo grouped mafs', required=True)
    parser.add_argument('-ref', help='Reference species name', required=True)
    parser.add_argument('-out', help='Output directory', required=True)
    parser.add_argument('-no_sub', default=False, action='store_true')
    args = parser.parse_args()

    # number of runs
    n_runs = len([x for x in os.listdir(args.top_dir) if x.startswith('BrownTrout')])

    # loop through queries
    sh_list = []
    for i in range(1, n_runs, 200):

        roast_wrapper = ('python ~/sal_bal_sel/genome_alignment/roast_fish.py -top_dir {} -ref {} '
                         '-chr_tag $SLURM_ARRAY_TASK_ID').format(args.top_dir, args.ref)

        start = i

        if n_runs - i < 200:
            end = n_runs
        else:
            end = i + 199

        # if runs go above 1000, i.e. i == 1001 or higher, switch the flag to add the multiple of 1000 back on
        if i >= 1001:
            multiplier = ' -n_thou 1'
            start -= 1000
            end -= 1000
        else:
            multiplier = ' -n_thou 0'
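        # e.g. i = 1201 gives start = 201 and ' -n_thou 1', so SLURM task ids
        # 201-400 are mapped back onto runs 1201-1400 by roast_fish.py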

        q_write([roast_wrapper + multiplier],
                args.out + 'multiz_start' + str(i),
                t=8,
                rmem=4, mem=4,
                array=[start, end],
                scheduler='SLURM')

        sh_list.append(args.out + 'multiz_start' + str(i) + '_job.sh')

    # submit control script
    control = 'python ~/sal_bal_sel/genome_alignment/pairwise_control.py -sh ' + ' -sh '.join(sh_list)
    if args.no_sub:
        q_write([control], out=args.out + 'all_multiple',
                t=72, rmem=2, mem=2,
                scheduler='SLURM')
    else:
        q_sub([control], out=args.out + 'all_multiple',
              t=72, rmem=2, mem=2,
              scheduler='SLURM')
Example #15
def main():

    for line in sys.stdin:
        vcf = line.rstrip()

        index = 'gatk IndexFeatureFile -F ' + vcf

        q_sub([index],
              out=vcf.replace('.g.vcf', '_indexing'),
              t=1,
              scheduler='SLURM')
Example #16
def main():
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', help='Reference genome', required=True)
    parser.add_argument('-out_dir', help='Output directory', required=True)
    args = parser.parse_args()

    chromo_list = [x.split('\t')[0] for x in open(args.ref + '.fai') if not x.startswith('NW')]
    females = ['Uts_11_53', 'Uts_11_52', 'Uts_11_39', 'Uts_11_29', 'Uts_11_28',
               'Naus_12_0037', 'Nams_12_0071', 'Jols_13_0001', 'Arga_12_0082', 'Alta_12_0124']

    # per bam jobs
    bams = [x.rstrip() for x in sys.stdin]

    for b in bams:

        job_list = []
        sample = b.split('/')[-1].split('.')[0]

        # loop through chromos
        for chromo in chromo_list:

            # set ploidy for mito and sdy
            if chromo == 'NC_001960.1' or chromo.startswith('KT'):
                ploidy = 1
            else:
                ploidy = 2

            chromo_dir = args.out_dir + chromo + '/'

            # create chromo dir if not already there
            if not os.path.isdir(chromo_dir):
                os.makedirs(chromo_dir)

            # skip SDY for relevant females
            if sample in females and chromo.startswith('KT'):
                continue

            out_gvcf = chromo_dir + sample + '.' + chromo + '.allsites.g.vcf'

            hap_caller = ('gatk --java-options "-Xmx4g" HaplotypeCaller '
                          '-R {ref} '
                          '-I {bam} '
                          '-ERC GVCF '
                          '-ploidy {ploidy} '
                          '-O {gvcf} '
                          '-L {chr} ').format(ref=args.ref, bam=b, ploidy=ploidy, gvcf=out_gvcf, chr=chromo)

            job_list.append(hap_caller)

        # submit one job per bam
        out_stem = args.out_dir + sample + '.hap_calling'
        q_sub(job_list, out=out_stem, t=60, rmem=8, mem=8, scheduler='SLURM')
Example #17
def main():

    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-maf', help='MAF file containing alignment, must be compressed', required=True)
    parser.add_argument('-ref_sp', help='Species to use for coordinates in output bed', required=True)
    parser.add_argument('-ref_sizes', help='Sizes file to extract chromosomes for ref species', required=True)
    parser.add_argument('-out', help='Output directory', required=True)
    parser.add_argument('-no_sub', default=False, action='store_true')
    args = parser.parse_args()

    # number of runs
    n_runs = len([x.split()[0] for x in open(args.ref_sizes)])

    # loop through queries
    sh_list = []
    for i in range(1, n_runs, 200):

        wgabed_wrapper = ('python ~/sal_bal_sel/genome_alignment/convert_to_bed.py '
                          '-maf {} -ref_sp {} -ref_sizes {} -out {} '
                          '-chr_tag $SLURM_ARRAY_TASK_ID').format(args.maf, args.ref_sp, args.ref_sizes, args.out)

        # if runs go above 1000, i.e. i == 1001 or higher, switch the flag to add the multiple of 1000 back on
        n_thou = int(i / 1000)
        start = i % 1000

        multiplier = ' -n_thou {}'.format(n_thou)
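
        # e.g. i = 1201 gives n_thou = 1 and start = 201, so SLURM task ids
        # 201-400 map back onto chromosome indices 1201-1400 via the -n_thou flag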

        if n_runs - i < 200:
            # keep end on the same 0-999 scale as start
            end = n_runs - n_thou * 1000
        else:
            end = start + 199

        q_write([wgabed_wrapper + multiplier],
                args.out + 'wgabed_start' + str(i),
                t=8,
                rmem=4, mem=4,
                array=[start, end],
                scheduler='SLURM')

        sh_list.append(args.out + 'wgabed_start' + str(i) + '_job.sh')

    # submit control script
    control = 'python ~/sal_bal_sel/genome_alignment/pairwise_control.py -sh ' + ' -sh '.join(sh_list)
    if args.no_sub:
        q_write([control], out=args.out + 'all_wgabed',
                t=72, rmem=2, mem=2,
                scheduler='SLURM')
    else:
        q_sub([control], out=args.out + 'all_wgabed',
              t=72, rmem=2, mem=2,
              scheduler='SLURM')
Example #18
def main():

    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-out', help='Output vcf', required=True)
    args = parser.parse_args()

    vcfs = ' I='.join([x.rstrip() for x in sys.stdin])

    cmd = ('java -Xmx10G -jar /users/bartonhe/picard.jar GatherVcfs '
           'I={} O={}').format(vcfs, args.out)

    q_sub([cmd], out=args.out.replace('.vcf', ''), mem=12, rmem=12, scheduler='SLURM')
Example #19
def main():
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-out', help='Output dir', required=True)
    args = parser.parse_args()

    # vcfs
    vcf_list = [x.rstrip() for x in sys.stdin]

    for vcf in vcf_list:

        new_vcf = args.out + vcf.replace('.g.vcf', '.autosomes.g.vcf').split('/')[-1]

        cmd = 'cat {} | python ~/sal_enhancers/training_set/extract_autosomes.py > {}'.format(vcf, new_vcf)

        q_sub([cmd], out=new_vcf.replace('.g.vcf', ''), rmem=4, mem=4, scheduler='SLURM')
Example #20
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', help='reference genome', required=True)
    args = parser.parse_args()

    chromo_vcfs = [x.rstrip() for x in sys.stdin]

    for chr_vcf in chromo_vcfs:

        new_vcf = chr_vcf.replace('.allsites', '.raw.snps')

        snp_cmd = ('gatk SelectVariants '
                   '-R {ref} -V {vcf} -O {out} '
                   '--select-type-to-include SNP --exclude-non-variants'
                   '').format(ref=args.ref, vcf=chr_vcf, out=new_vcf)

        q_sub([snp_cmd], out=new_vcf.replace('.vcf', ''), scheduler='SLURM')
Example #21
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-fastq_dir',
                        help='directory containing fastq files',
                        required=True)
    parser.add_argument('-out_dir', help='output directory', required=True)
    args = parser.parse_args()

    # check out dir for complete cleaned samples
    complete = find_complete(args.out_dir)

    # pair reads
    reads = {}
    for file_name in os.listdir(args.fastq_dir):
        if not file_name.endswith('.fastq.gz'):
            continue

        sample = file_name.split('_')[0]

        if sample in complete:
            continue

        if sample not in reads.keys():
            reads[sample] = []

        reads[sample].append(args.fastq_dir + file_name)

    # cleaning
    for s in reads.keys():

        r1, r2 = sorted(reads[s])

        cmd = ('trim_galore --fastqc --output_dir {out} --paired {r1} {r2}'
               '').format(out=args.out_dir, r1=r1, r2=r2)

        print('running: ' + cmd)

        q_sub([cmd],
              out=args.out_dir + s,
              rmem=8,
              mem=8,
              scheduler='SLURM',
              t=2)
Example #22
def main():

    for line in sys.stdin:

        out_dir = line.rstrip() + '/'

        align_dir = out_dir + 'aligned/'
        tmp_dir = align_dir + 'tmp/'

        print('creating: ' + align_dir)
        os.makedirs(align_dir)

        print('creating: ' + tmp_dir)
        os.makedirs(tmp_dir)

        roast = ('~/sal_enhancers/homeoblock_alignments/roast_homeoblock.py '
                 '-maf_dir {} -ref salmon').format(out_dir)

        q_sub([roast], out=out_dir + 'multiple_align', scheduler='SLURM')

        print()
Example #23
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', help='reference genome', required=True)
    parser.add_argument('-train_vcf',
                        help='vcf file of training data',
                        required=True)
    parser.add_argument('-out_dir', help='output directory', required=True)
    args = parser.parse_args()

    bams = [x.rstrip() for x in sys.stdin]

    for bam in bams:

        bam_stem = bam.replace('.bam', '').split('/')[-1]
        out_table = '{}{}.table'.format(args.out_dir, bam_stem)
        recal_bam = out_table.replace('.table', '.bqsr.bam')

        bqsr = ('gatk BaseRecalibrator '
                '-I {bam} '
                '-R {ref} '
                '--known-sites {truth} '
                '-O {table}').format(bam=bam,
                                     ref=args.ref,
                                     truth=args.train_vcf,
                                     table=out_table)

        # renamed from 'apply' to avoid shadowing the python 2 builtin
        apply_bqsr = ('gatk ApplyBQSR '
                      '-R {ref} '
                      '-I {bam} '
                      '--bqsr-recal-file {table} '
                      '-O {new_bam}').format(ref=args.ref,
                                             bam=bam,
                                             table=out_table,
                                             new_bam=recal_bam)

        q_sub([bqsr, apply_bqsr],
              out=out_table.replace('.table', ''),
              t=24,
              scheduler='SLURM')
Example #24
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-sim_data')
    parser.add_argument('-out_dir')
    args = parser.parse_args()

    if not os.path.isdir(args.out_dir):
        os.makedirs(args.out_dir)

    counter = 0
    for line in open(args.sim_data):

        counter += 1

        # strip any directory from the sim data file name before appending to out_dir
        out_stem = '{}{}.rep{}'.format(
            args.out_dir,
            args.sim_data.replace('.txt', '').split('/')[-1], counter)

        sfs = [int(x) for x in line.rstrip().split(',')]

        sfs_dict = {'SNP': (sfs, 475625)}

        ctl = Snp1ControlFile()
        ctl.set_data(sfs_dict, 20, gamma_r=(-250, 50))
        control_contents = ctl.construct()

        ctl_name = out_stem + '.ctl.txt'
        log_name = out_stem + '.log.txt'
        res_name = out_stem + '.res.txt'

        with open(ctl_name, 'w') as o:
            print(control_contents, file=o)

        cmd = 'anavar1.4 {} {} {} {}'.format(ctl_name, res_name, log_name,
                                             counter)

        q_sub([cmd], out=out_stem, evolgen=True)
Example #25
def main():
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-in_dir',
                        help='Top level input directory',
                        required=True)
    parser.add_argument('-out_dir', help='Output directory', required=True)
    args = parser.parse_args()

    contigs = [
        x for x in os.listdir(args.in_dir) if os.path.isdir(args.in_dir + x)
    ]

    for chromo in contigs:

        chromo_path = args.in_dir + chromo + '/'
        out_chromo_path = args.out_dir + chromo + '/'

        if not os.path.isdir(out_chromo_path):
            os.makedirs(out_chromo_path)

        vcf_list = [x for x in os.listdir(chromo_path) if x.endswith('.g.vcf')]

        cmds = []
        for vcf in vcf_list:

            trim_cmd = 'grep -v NW_ {} > {}'.format(chromo_path + vcf,
                                                    out_chromo_path + vcf)
            cmds.append(trim_cmd)

            index = 'gatk IndexFeatureFile -F ' + out_chromo_path + vcf
            cmds.append(index)

        q_sub(cmds,
              out=args.out_dir + chromo + '_trimhead',
              t=10,
              scheduler='SLURM')
Example #26
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-bam_in',
                        help='input directory of unmerged bams',
                        required=True)
    parser.add_argument('-bam_out', help='output directory', required=True)
    parser.add_argument('-info_file',
                        help='file with read group info',
                        required=True)
    parser.add_argument('-ref', help='reference genome', required=True)
    args = parser.parse_args()

    all_read_acc = sample_reads(args.info_file)

    for fish in all_read_acc.keys():

        accessions = all_read_acc[fish]
        in_bams = ' '.join(
            ['I=' + args.bam_in + x + '.dedup.bam' for x in accessions])
        bam_out = args.bam_out + fish + '.dedup.bam'

        merge_cmd = 'java -Xmx12G -jar ~/picard.jar MergeSamFiles {} O={}'.format(
            in_bams, bam_out)
        wgs_metrics = ("java -Xmx12g -jar ~/picard.jar CollectWgsMetrics "
                       "I={} "
                       "O={}.wgsmetrics_file.txt "
                       "R={} INCLUDE_BQ_HISTOGRAM=true"
                       "").format(bam_out, args.bam_out + fish, args.ref)
        #print(merge_cmd)
        #print(wgs_metrics)
        q_sub([merge_cmd, wgs_metrics],
              out=args.bam_out + fish + '_merge',
              mem=14,
              rmem=14,
              scheduler='SLURM')
Example #27
def sel_v_neu_anavar(mode, vcf, call, sel_region, constraint, n, c, dfe, alg,
                     nnoimp, maximp, out_stem, search, degree, spread, evolgen,
                     start_index, given, ar_ref):
    """
    submits anavar jobs to cluster after writing required files etc
    :param mode: str
    :param vcf: str
    :param call: dict
    :param sel_region: str
    :param constraint: str
    :param n: int
    :param c: int
    :param dfe: str
    :param alg: str
    :param nnoimp: int
    :param maximp: int
    :param out_stem: str
    :param search: int
    :param degree: int
    :param spread: int
    :param evolgen: bool
    :param start_index: int
    :param given: bool
    :param ar_ref: bool
    :return: None
    """

    anavar_path = '/shared/evolgen1/shared_data/program_files/sharc/'

    anavar_cmd = '{path}anavar1.4 {ctl} {rslts} {log} {seed}'

    # sort file names
    ctl_name = out_stem + '.control.txt'
    merge_out = out_stem + '.merged.results.txt'

    # catch given on first run
    init = ()
    if given:
        if not os.path.isfile(merge_out):
            sys.exit(
                'Given True but no previous runs completed to take best result from'
            )
        else:
            # get best result from merged out
            best_res = an.ResultsFile(
                open(merge_out)).ml_estimate(as_string=True)
            init = tuple(best_res.split()[3:-1])

    # region combinations
    region_combs = {
        'CDS': ['CDS_frameshift', 'CDS_non_frameshift'],
        'intron': ['intron'],
        'intergenic': ['intergenic'],
        'noncoding': ['intergenic', 'intron']
    }

    # make control file
    if mode == 'snp':
        sfs_data = prepare_snp_sfs(vcf,
                                   call,
                                   n,
                                   sel_sfs_regions=region_combs[sel_region],
                                   call_sel_reg=sel_region)
        ctl = an.SNPNeuSelControlFile()

    else:
        sfs_data = prepare_indel_sfs(vcf,
                                     call,
                                     n,
                                     sel_sfs_regions=region_combs[sel_region],
                                     call_sel_reg=sel_region,
                                     ar_ref=ar_ref)
        ctl = an.IndelNeuSelControlFile()

    ctl.set_alg_opts(search=search,
                     alg=alg,
                     key=3,
                     epsabs=1e-20,
                     epsrel=1e-9,
                     rftol=1e-9,
                     maxtime=3600,
                     optional=True,
                     maximp=maximp,
                     nnoimp=nnoimp,
                     init=init)

    ctl.set_data(sfs_data,
                 n,
                 dfe=dfe,
                 c=c,
                 gamma_r=(-5e4, 1e5),
                 theta_r=(1e-14, 0.1),
                 r_r=(0.01, 100),
                 scale_r=(0.1, 5000.0))
    if degree != 50:
        ctl.set_dfe_optional_opts(degree=degree, optional=True)
    ctl.set_constraint(constraint)
    ctl_contents = ctl.construct()
    with open(ctl_name, 'w') as control:
        control.write(ctl_contents)

    res_file_list = out_stem + '.allres.list.txt'
    hjids = []
    with open(res_file_list, 'a') as res_list:

        # split into requested jobs
        for i in range(start_index, start_index + spread):

            split_stem = '{}.split{}'.format(out_stem, i)

            result_name = split_stem + '.results.txt'
            log_name = split_stem + '.log.txt'

            print(result_name, file=res_list)

            # call anavar
            rep_cmd = anavar_cmd.format(path=anavar_path,
                                        ctl=ctl_name,
                                        rslts=result_name,
                                        log=log_name,
                                        seed=i)

            q_sub([rep_cmd],
                  out=split_stem,
                  jid=split_stem.split('/')[-1] + '.sh',
                  t=48,
                  evolgen=evolgen)
            hjids.append(split_stem.split('/')[-1] + '.sh')

    # hold job to merge outputs
    gather = 'cat {} | ~/parus_indel/anavar_analyses/gather_searches.py {}'.format(
        res_file_list, merge_out)
    q_sub([gather], out=out_stem + '.merge', hold=hjids, evolgen=evolgen)
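
# A minimal usage sketch for sel_v_neu_anavar above. Every value is a
# hypothetical placeholder (the callable-sites dict layout and the algorithm
# name are assumptions, not taken from the code), shown only to make the long
# signature concrete:
#
# sel_v_neu_anavar(mode='snp', vcf='all_sites.vcf.gz', call=call_dict,
#                  sel_region='noncoding', constraint='none', n=20, c=1,
#                  dfe='continuous', alg='NLOPT_LD_SLSQP', nnoimp=10,
#                  maximp=100, out_stem='run1', search=500, degree=50,
#                  spread=25, evolgen=False, start_index=1, given=False,
#                  ar_ref=False)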
Example #28
# list comprehension to build the input list (fas_dir and out_dir are assumed
# to be defined earlier in the script):
# fas_list = [fas for fas in os.listdir(fas_dir) if fas.endswith(".fas")]

# process the files
cmd_list = []
for fas_name in fas_list:
    fas_file = fas_dir + fas_name

    # swap any .phylip extension for .fas; split on '/' and take [-1], the last item - the bare filename
    out_name = fas_file.replace('.phylip', '.fas').split('/')[-1]
    out_file = out_dir + out_name

    # print(fas_file, seq_file, out_file)
    trimal_cmd = ('/data/bop17lhy/trimal/source/trimal '
                  '-in {} -out {}'
                  ' -fasta').format(fas_file, out_file)
    cmd_list.append(trimal_cmd)

    # alternatively, run each command serially:
    # subprocess.call(trimal_cmd, shell=True)

# submit bins of jobs
for i in range(0, len(cmd_list), 100):
    bin_cmds = cmd_list[i:i + 100]
    bin_outs = out_dir + 'jobs{}_{}'.format(i, i + 100)

    q_sub(bin_cmds, out=bin_outs)
Example #29
#!/usr/bin/env python

from qsub import q_sub

vcf = (
    '/fastdata/bop15hjb/drosophila_data/dmel/analysis_ready_data/'
    'dmel_17flys.gatk.raw.snps.exsnpindel.recalibrated.filtered_'
    't95.0.pass.dpfiltered.50bp_max.bial.rmarked.polarised.annotated.ar.degen.vcf.gz'
)

callsites = '/fastdata/bop15hjb/drosophila_data/dmel_ref/dmel.callablesites.summary_with_degen.csv'

out_dir = '/fastdata/bop15hjb/drosophila_data/dmel/anavar/degree_variation/'

for degree in [25, 50, 75, 100, 150, 200, 300]:
    for model in ['full', 'equal_t']:

        out = '{}dmel_cds_v_4fold_snps_continuous_dfe_degree{}.{}'.format(
            out_dir, degree, model)
        cmd = ('cds_vs_neutral_anavar_snps.py '
               '-vcf {} '
               '-n 17 -c 1 -dfe continuous '
               '-call_csv {}  '
               '-neu_type 4fold '
               '-out_pre {} -degree {}'
               '').format(vcf, callsites, out, degree)
        if model == 'equal_t':
            cmd += ' -constraint equal_mutation_rate'

        q_sub([cmd], out=out, t=48)
Example #30
def sel_v_neu_anavar_nonsense(vcf, call, constraint, n, c, dfe, alg, nnoimp,
                              maximp, out_stem, search, degree, spread,
                              evolgen, prem_files):
    """
    submits anavar jobs to cluster after writing required files etc
    :param vcf: str
    :param call: dict
    :param constraint: str
    :param n: int
    :param c: int
    :param dfe: str
    :param alg: str
    :param nnoimp: int
    :param maximp: int
    :param out_stem: str
    :param search: int
    :param degree: int
    :param spread: int
    :param evolgen: bool
    :param prem_files: list
    :return: None
    """

    anavar_path = '/shared/evolgen1/shared_data/program_files/sharc/'

    anavar_cmd = '{path}anavar1.22 {ctl} {rslts} {log} {seed}'

    # sort file names
    ctl_name = out_stem + '.control.txt'

    # get nonsense data in
    nonsense_dict = gather_chromo_prems(prem_files)
    sel_sfs, sel_m = prem_freqs_call(nonsense_dict)

    # make control file
    sfs_data = prepare_nonsense_snp_sfs(vcf, call, n, sel_sfs, sel_m)
    ctl = an.SNPNeuSelControlFile()

    ctl.set_alg_opts(search=search,
                     alg=alg,
                     key=3,
                     epsabs=1e-20,
                     epsrel=1e-9,
                     rftol=1e-9,
                     maxtime=3600,
                     optional=True,
                     maximp=maximp,
                     nnoimp=nnoimp)

    ctl.set_data(sfs_data,
                 n,
                 dfe=dfe,
                 c=c,
                 gamma_r=(-5e4, 1e3),
                 theta_r=(1e-10, 0.1),
                 r_r=(0.01, 100),
                 scale_r=(0.1, 5000.0))
    if degree != 50:
        ctl.set_dfe_optional_opts(degree=degree, optional=True)
    ctl.set_constraint(constraint)
    ctl_contents = ctl.construct()
    with open(ctl_name, 'w') as control:
        control.write(ctl_contents)

    res_file_list = out_stem + '.allres.list.txt'
    hjids = []
    with open(res_file_list, 'w') as res_list:

        # split into requested jobs
        for i in range(1, spread + 1):

            #  seed = random.randint(1, 1e6)
            seed = i

            split_stem = '{}.split{}'.format(out_stem, i)

            result_name = split_stem + '.results.txt'
            log_name = split_stem + '.log.txt'

            print(result_name, file=res_list)

            # call anavar
            rep_cmd = anavar_cmd.format(path=anavar_path,
                                        ctl=ctl_name,
                                        rslts=result_name,
                                        log=log_name,
                                        seed=seed)

            q_sub([rep_cmd],
                  out=split_stem,
                  jid=split_stem.split('/')[-1] + '.sh',
                  t=8,
                  evolgen=evolgen)
            hjids.append(split_stem.split('/')[-1] + '.sh')

    # hold job to merge outputs
    merge_out = out_stem + '.merged.results.txt'
    gather = 'cat {} | gather_searches.py {}'.format(res_file_list, merge_out)
    q_sub([gather], out=out_stem + '.merge', hold=hjids, evolgen=evolgen)