def main():
    """Submit a single cluster job that intersects a WGA bed with a region bed,
    converts the intersection to fasta and estimates K80 divergence in R."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-wga', help='whole genome alignment bed', required=True)
    parser.add_argument('-bed', help='bed file of region', required=True)
    parser.add_argument('-region', help='region label', required=True)
    parser.add_argument('-spp', help='Comma separated species list for output', required=True)
    parser.add_argument('-out_stem', help='output directory and file stem', required=True)
    args = parser.parse_args()

    # all outputs share this stem, e.g. <out_stem>_<region>.wga.bed.gz
    out_stem = args.out_stem + '_' + args.region

    # step 1: restrict the alignment bed to the region of interest
    bed_cmd = 'bedtools intersect -a {wga} -b {bed} | bgzip -c > {out_stem}.wga.bed.gz'.format(
        wga=args.wga, bed=args.bed, out_stem=out_stem)

    # step 2: convert the intersected alignment to fasta for the requested species
    fasta_cmd = (
        'zcat {out_stem}.wga.bed.gz | '
        '~/sal_enhancers/divergence/wga2fa.py -out_stem {out_stem}.wga -spp {spp}'
    ).format(out_stem=out_stem, spp=args.spp)

    # step 3: estimate K80 divergence, output labelled with the region
    ape_cmd = 'Rscript ~/sal_enhancers/divergence/k80_div_est.R {out_stem}.wga.fa {region} > {out}'.format(
        out_stem=out_stem, region=args.region, out=out_stem + '.div.txt')

    # run all three steps sequentially in one SLURM job
    q_sub([bed_cmd, fasta_cmd, ape_cmd], out=out_stem, rmem=12, mem=12, scheduler='SLURM')
def main():
    """Read the ENA report PRJEB10744.txt and submit wget download jobs in
    batches of 10 report lines.

    Fix: the final submission after the loop previously ran unconditionally,
    so when the last line completed a batch of 10 a duplicate job containing
    only the 'cd' command was submitted. The leftover batch is now submitted
    only if it actually holds download commands.
    """
    read_dir = '/scratch/project_2002047/barson_mapping_v2/reads'
    counter = 0
    cmds = ['cd ' + read_dir]
    for line in open('PRJEB10744.txt'):
        counter += 1
        # skip the header row
        if line.startswith('study'):
            continue
        # column 10 holds the ';' separated pair of fastq ftp paths
        reads = line.split('\t')[9].split(';')
        cmds += ['wget -c ftp://' + reads[0], 'wget -c ftp://' + reads[1]]
        # flush a batch every 10 report lines
        if counter % 10 == 0:
            q_sub(
                cmds,
                out=read_dir + '/read_download' + str(counter),
                t=48, scheduler='SLURM')
            cmds = ['cd ' + read_dir]
    # submit the leftover batch only if it contains downloads beyond the 'cd'
    if len(cmds) > 1:
        q_sub(
            cmds,
            out=read_dir + '/read_download' + str(counter),
            t=48, scheduler='SLURM')
def main():
    """Submit one CombineGVCFs job per chromosome, merging all gVCF paths read
    from stdin."""
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', help='Reference genome', required=True)
    parser.add_argument('-out', help='Output vcf stem', required=True)
    args = parser.parse_args()

    # vcfs, one path per stdin line
    vcf_list = [x.rstrip() for x in sys.stdin]

    # chromosome names from the .fai index, excluding NW (unplaced scaffolds),
    # KT contigs and the mitochondrion (NC_001960.1)
    contigs = [x.split('\t')[0] for x in open(args.ref + '.fai')
               if not x.startswith('NW') and not x.startswith('KT') and not x.startswith('NC_001960.1')]

    for chromo in contigs:
        out = args.out + '_' + chromo + '.gatk.allsites.g.vcf'

        # submit job
        combine_cmd = ('gatk --java-options "-Xmx20g -Xms20g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" CombineGVCFs '
                       '-R {} '
                       '-O {} '
                       '-L {} ').format(args.ref, out, chromo)

        # one --variant flag per input gVCF
        for v in vcf_list:
            combine_cmd += '--variant {} '.format(v)

        q_sub([combine_cmd], out=out.replace('.g.vcf', ''), t=60, mem=25, rmem=25, scheduler='SLURM')
def main():
    """Submit one job per degeneracy class (0, 2, 3, 4 fold) converting CDS
    degeneracy calls to merged, bgzipped bed files."""
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-cds_fa', help='Fasta file with CDS sequences in', required=True)
    args = parser.parse_args()

    # degeneracy classes (there is no 1-fold class)
    for i in (0, 2, 3, 4):
        # degen_to_bed -> sort -> merge (keeping column 4 annotations) -> bgzip
        cmd = (
            'python ~/sal_bal_sel/annotation/degen_to_bed.py '
            '-cds_fa {} -degen {} | '
            'sort -T /scratch/tuyida/bartonhe/tmp/ -k1,1 -k2,2n | '
            'bedtools merge -c 4 -o distinct | '
            'bgzip -c > /scratch/tuyida/bartonhe/sal_ref/salmo_salar_{}fold.bed.gz'
            '').format(args.cds_fa, i, i)

        q_sub(
            [cmd],
            out='/users/bartonhe/sal_bal_sel/annotation/{}fold_to_bed'.format(i),
            scheduler='SLURM', rmem=10, mem=10)
def main():
    """Submit one GATK GenotypeGVCFs job per gVCF path read from stdin."""
    # arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-ref', help='Reference genome', required=True)
    arg_parser.add_argument('-out', help='Output dir', required=True)
    opts = arg_parser.parse_args()

    for line in sys.stdin:
        gvcf = line.rstrip()

        # output vcf goes in the output dir, renamed from allsites gvcf
        base_name = gvcf.replace('.allsites.g.vcf', '.raw.snps.indels.vcf').split('/')[-1]
        out_vcf = opts.out + base_name

        # submit job
        genotyper = (
            'gatk --java-options "-Xmx4g -Djava.io.tmpdir=/scratch/project_2002047/tmp" GenotypeGVCFs '
            '-R {} -V {} -O {} ').format(opts.ref, gvcf, out_vcf)

        q_sub([genotyper], out=out_vcf.replace('.vcf', ''), t=60, mem=10, rmem=10, scheduler='SLURM')
def main():
    """Submit a maf2fasta cleaning job for every .maf file in the given directory."""
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-maf_dir', help='directory of block maf files', required=True)
    args = parser.parse_args()

    for file_name in os.listdir(args.maf_dir):
        if not file_name.endswith('.maf'):
            continue

        # block label is the filename without the .multiple.maf suffix
        block = file_name.replace('.multiple.maf', '')
        maf_path = args.maf_dir + file_name

        cmd = (
            'cat {maf} | python /users/bartonhe/sal_enhancers/homeoblock_alignments/maf2fasta.py '
            '-block {block} -out {out}').format(maf=maf_path, block=block, out=args.maf_dir)

        q_sub([cmd], out=args.maf_dir + block + '.clean_align', scheduler='SLURM')
def main():
    """Submit one CombineGVCFs job per chromosome directory, merging all the
    sample gVCFs found in it."""
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-in_dir', help='Top level input directory', required=True)
    parser.add_argument('-ref', help='Reference genome', required=True)
    parser.add_argument('-out_dir', help='Output directory', required=True)
    args = parser.parse_args()

    # each subdirectory of in_dir is one chromosome
    contigs = [x for x in os.listdir(args.in_dir) if os.path.isdir(args.in_dir + x)]

    for chromo in contigs:
        chromo_path = args.in_dir + chromo + '/'

        # per-sample gVCFs for this chromosome, excluding SRR accessions
        vcf_list = [chromo_path + x for x in os.listdir(chromo_path)
                    if x.endswith('.g.vcf') and 'SRR' not in x]

        # output name records how many samples were combined
        out = args.out_dir + 'salsal_{}.{}.allsites.g.vcf'.format(len(vcf_list), chromo)

        # submit job
        combine_cmd = ('gatk --java-options "-Xmx50g -Xms50g -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" CombineGVCFs '
                       '-R {} '
                       '-O {} ').format(args.ref, out)

        # one --variant flag per input gVCF
        for v in vcf_list:
            combine_cmd += '--variant {} '.format(v)

        q_sub([combine_cmd], out=out.replace('.g.vcf', ''), t=60, mem=53, rmem=53, scheduler='SLURM')
def main():
    """Submit a premature-stop-to-bed job for each chromosome present in the
    SNP vcf.

    Fix: Popen.communicate() returns bytes by default on Python 3, so the
    subsequent str .split('\\n') raised TypeError; universal_newlines=True
    makes it return str.
    """
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-cds_fa', help='Fasta file with CDS sequences in', required=True)
    parser.add_argument('-vcf', help='SNP vcf path', required=True)
    parser.add_argument('-out', help='output file stem', required=True)
    parser.add_argument('-evolgen', help='If specified will run on evolgen', default=False, action='store_true')
    args = parser.parse_args()

    # get chromosome list from the vcf body
    grep_cmd = 'zgrep -v ^# {} | cut -f 1 | uniq'.format(args.vcf)
    # universal_newlines=True: decode subprocess output to str (python 3)
    chromo_list = subprocess.Popen(
        grep_cmd, stdout=subprocess.PIPE, shell=True,
        universal_newlines=True).communicate()[0].split('\n')[:-1]
    chromo_list = [x for x in chromo_list if x.startswith('chr')]

    # loop through chromo list and submit job for each
    for chromo in chromo_list:
        stem = '_'.join([args.out, chromo])
        # NOTE(review): -out passes args.out (not stem) for every chromosome;
        # presumably the downstream script adds the chromosome itself - confirm
        nonsense_cmd = ('~/parus_indel/annotation/prem_stops_to_bed.py '
                        '-cds_fa {} '
                        '-vcf {} '
                        '-chr {} '
                        '-out {}').format(args.cds_fa, args.vcf, chromo, args.out)
        q_sub([nonsense_cmd], out=stem, t=48, evolgen=args.evolgen)
def main():
    """Submit (or just write, with -no_sub) a bed_summary_stats job for each
    (label, bed) pair in the region list."""
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-indel_vcf', help='Vcf file to get summary stats for', required=True)
    parser.add_argument('-snp_vcf', help='Vcf file to get summary stats for', required=True)
    parser.add_argument('-region_list', help='text file with pairs of labels and bed files', required=True)
    parser.add_argument('-out_pre', help='output path and prefix', required=True)
    parser.add_argument('-correct_sfs', help='Corrects sfs for pol error', default=False, action='store_true')
    parser.add_argument('-evolgen', help='If specified will submit to lab queue', default=False, action='store_true')
    parser.add_argument('-no_sub', help=argparse.SUPPRESS, default=False, action='store_true')
    args = parser.parse_args()

    # optional flag passed straight through to the stats script
    if args.correct_sfs:
        correct = ' -correct_sfs'
    else:
        correct = ''

    for region in open(args.region_list):
        tag, bed = region.split()
        out_stem = args.out_pre + '_' + tag

        cmd = ('~/parus_indel/summary_analyses/bed_summary_stats.py '
               '-indel_vcf {} -snp_vcf {} '
               '-bed {} -tag {}{} '
               '> {}').format(args.indel_vcf, args.snp_vcf, bed, tag, correct, out_stem + '_stats.txt')

        # -no_sub writes the job script without submitting it
        if args.no_sub:
            q_write([cmd], out=out_stem, mem=10, rmem=10, evolgen=args.evolgen)
        else:
            q_sub([cmd], out=out_stem, mem=10, rmem=10, evolgen=args.evolgen)
def main():
    """Submit one indel divergence job per labelled bed region in the region list."""
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-wga', help='Whole genome alignment bed file', required=True)
    parser.add_argument('-region_list', help='Coordinates to calc divergence for, bed format', required=True)
    parser.add_argument('-out_dir', help='Output path and file', required=True)
    parser.add_argument('-evolgen', help='if specified will run on lab queue', default=False, action='store_true')
    args = parser.parse_args()

    for entry in open(args.region_list):
        # each line holds a region label and its bed path
        label, bed_path = entry.split()
        stem = '{}gt_indel_div_{}'.format(args.out_dir, label)

        div_cmd = ('~/parus_indel/summary_analyses/indel_divergence.py '
                   '-wga {} -bed {} -tag {} > {}').format(
            args.wga, bed_path, label, stem + '.txt')

        q_sub([div_cmd], out=stem, t=24, evolgen=args.evolgen)
def main():
    """Submit per-chromosome premature-stop extraction jobs for autosomal CDS
    fastas, then write a list file of the expected outputs."""
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-cds_fa_dir', help='cds fasta directory', required=True)
    parser.add_argument('-out_dir', help='output directory', required=True)
    parser.add_argument('-vcf', help='SNP vcf path', required=True)
    parser.add_argument('-call_fa', help='Callable sites fasta file', required=True)
    parser.add_argument('-evolgen', help='if specified will run on lab queue', default=False, action='store_true')
    args = parser.parse_args()

    out_files = []
    out_dir = args.out_dir

    # autosomal chromosome arms to keep
    autos = ('2R', '2RHet', '2L', '2LHet', '3R', '3RHet', '3L', '3LHet', '4')

    # per chromo jobs for extracting nonsense data
    for x in [args.cds_fa_dir + y for y in os.listdir(args.cds_fa_dir)]:
        if not x.endswith('.fasta.gz'):
            continue

        # chromosome is the second '-' separated field of the file path
        chromo = x.split('-')[1]
        if chromo not in autos:
            continue

        outstem = x.split('/')[-1].replace('.fasta.gz', '')
        out = out_dir + outstem + '.premstops.txt'
        out_files.append(out)

        extract_cmd = ('./extract_prem_stops.py '
                       '-cds_fa {cds_fa} '
                       '-chr {chromo} '
                       '-vcf {vcf} '
                       '-call_fa {c_fa} '
                       '-n 17 '
                       '-unfolded '
                       '-out {output}').format(cds_fa=x, chromo=chromo, vcf=args.vcf,
                                               c_fa=args.call_fa, output=out)

        q_sub([extract_cmd], out=out_dir + outstem, evolgen=args.evolgen)

    # write list file of per-chromosome outputs for downstream gathering
    list_file = out_dir + 'chromo_nonsense_list.txt'
    with open(list_file, 'w') as list_out:
        print(*out_files, sep='\n', file=list_out)
def main():
    """Submit a single_cov job for every pairwise alignment directory read from stdin."""
    for raw_line in sys.stdin:
        pair_dir = raw_line.rstrip()
        cmd = 'python ~/sal_bal_sel/genome_alignment/single_cov.py -dir {} -ref_name BrownTrout'.format(pair_dir)
        q_sub([cmd], out=pair_dir + 'single_cov', scheduler='SLURM')
def main():
    """Submit per-chromosome jobs intersecting UCNEs with the WGA bed, then a
    held gather-and-index job.

    Fix: Popen.communicate() returns bytes by default on Python 3, so the
    subsequent str .split('\\n') raised TypeError; universal_newlines=True
    makes it return str.
    """
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-wga', help='wga bed file', required=True)
    parser.add_argument('-ucne_bed', help='UCNE bed file', required=True)
    parser.add_argument('-out_dir', help='output directory', required=True)
    parser.add_argument('-evolgen', help='If specified will run on evolgen', default=False, action='store_true')
    args = parser.parse_args()

    # get chromosome list from the UCNE bed
    grep_cmd = 'zcat {} | cut -f 1 | uniq'.format(args.ucne_bed)
    # universal_newlines=True: decode subprocess output to str (python 3)
    chromo_list = subprocess.Popen(
        grep_cmd, stdout=subprocess.PIPE, shell=True,
        universal_newlines=True).communicate()[0].split('\n')[:-1]
    chromo_list = [
        x for x in chromo_list if x.startswith('chr') and 'random' not in x
    ]

    jids = []
    for chromo in chromo_list:
        out = 'gt_ucne_{}.bed'.format(chromo)

        # intersect in zebrafinch coordinates, drop unknown ('?') rows, merge
        cmd = ('zcat {} | '
               '~/WGAbed/non_ref_intersect.py '
               '-b {} -q Zebrafinch -c {} | '
               'grep -v "?" | '
               'cut -f 1-3 | '
               'sort -k1,1 -k2,2n | '
               'bedtools merge '
               '> {}').format(args.wga, args.ucne_bed, chromo, args.out_dir + out)

        jid = out.replace('.bed', '.sh')
        jids.append(jid)
        q_sub([cmd], out=args.out_dir + out.replace('.bed', ''), jid=jid, evolgen=args.evolgen)

    # gather job held until all per-chromosome jobs finish
    gather = 'cat {}*.bed | bgzip -c > {}gt_ucne.bed.gz'.format(args.out_dir, args.out_dir)
    index = 'tabix -pbed {}gt_ucne.bed.gz'.format(args.out_dir)
    q_sub([gather, index], out=args.out_dir + 'ucne_bed_merge', hold=jids, evolgen=args.evolgen)
def main():
    """Write SLURM array jobs (200 tasks each) wrapping roast_fish.py, then
    submit (or write) a control job that runs them.

    Array indices are expressed modulo 1000 with a '-n_thou' flag carrying the
    thousands. Fix: the previous hard-coded '-n_thou 1' and single 1000
    subtraction were wrong once n_runs exceeded 2000; the thousands are now
    computed from i, matching the sibling wgabed wrapper.
    """
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-top_dir', help='top level directory of chromo grouped mafs', required=True)
    parser.add_argument('-ref', help='Reference species name', required=True)
    parser.add_argument('-out', help='Output directory', required=True)
    parser.add_argument('-no_sub', default=False, action='store_true')
    args = parser.parse_args()

    # number of runs
    n_runs = len([x for x in os.listdir(args.top_dir) if x.startswith('BrownTrout')])

    # loop through queries in batches of 200
    sh_list = []
    for i in range(1, n_runs, 200):
        roast_wrapper = ('python ~/sal_bal_sel/genome_alignment/roast_fish.py -top_dir {} -ref {} '
                         '-chr_tag $SLURM_ARRAY_TASK_ID').format(args.top_dir, args.ref)

        # express the array range modulo 1000; -n_thou carries the thousands
        n_thou = i // 1000
        start = i - n_thou * 1000
        end = min(i + 199, n_runs) - n_thou * 1000
        multiplier = ' -n_thou {}'.format(n_thou)

        q_write([roast_wrapper + multiplier], args.out + 'multiz_start' + str(i),
                t=8, rmem=4, mem=4, array=[start, end], scheduler='SLURM')
        sh_list.append(args.out + 'multiz_start' + str(i) + '_job.sh')

    # submit control script that runs the written array jobs
    control = 'python ~/sal_bal_sel/genome_alignment/pairwise_control.py -sh ' + ' -sh '.join(sh_list)
    if args.no_sub:
        q_write([control], out=args.out + 'all_multiple', t=72, rmem=2, mem=2, scheduler='SLURM')
    else:
        q_sub([control], out=args.out + 'all_multiple', t=72, rmem=2, mem=2, scheduler='SLURM')
def main():
    """Submit a GATK IndexFeatureFile job for every gVCF path read from stdin."""
    for raw in sys.stdin:
        gvcf = raw.rstrip()
        index_cmd = 'gatk IndexFeatureFile -F ' + gvcf
        q_sub([index_cmd], out=gvcf.replace('.g.vcf', '_indexing'), t=1, scheduler='SLURM')
def main():
    """Submit one HaplotypeCaller job per bam read from stdin, containing one
    command per chromosome, with haploid calling for mito and KT contigs."""
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', help='Reference genome', required=True)
    parser.add_argument('-out_dir', help='Output directory', required=True)
    args = parser.parse_args()

    # chromosomes from the .fai index, excluding NW (unplaced scaffolds)
    chromo_list = [x.split('\t')[0] for x in open(args.ref + '.fai') if not x.startswith('NW')]

    # samples for which the KT (sdy) contigs are skipped below
    females = ['Uts_11_53', 'Uts_11_52', 'Uts_11_39', 'Uts_11_29', 'Uts_11_28',
               'Naus_12_0037', 'Nams_12_0071', 'Jols_13_0001', 'Arga_12_0082', 'Alta_12_0124']

    # per bam jobs
    bams = [x.rstrip() for x in sys.stdin]
    for b in bams:
        job_list = []
        # sample name is the bam filename up to the first '.'
        sample = b.split('/')[-1].split('.')[0]

        # loop through chromos
        for chromo in chromo_list:

            # set ploidy for mito and sdy (haploid); everything else diploid
            if chromo == 'NC_001960.1' or chromo.startswith('KT'):
                ploidy = 1
            else:
                ploidy = 2

            chromo_dir = args.out_dir + chromo + '/'

            # create chromo dir if not already there
            if not os.path.isdir(chromo_dir):
                os.makedirs(chromo_dir)

            # skip SDY for relevant females
            if sample in females and chromo.startswith('KT'):
                continue

            out_gvcf = chromo_dir + sample + '.' + chromo + '.allsites.g.vcf'

            hap_caller = ('gatk --java-options "-Xmx4g" HaplotypeCaller '
                          '-R {ref} '
                          '-I {bam} '
                          '-ERC GVCF '
                          '-ploidy {ploidy} '
                          '-O {gvcf} '
                          '-L {chr} ').format(ref=args.ref, bam=b, ploidy=ploidy,
                                              gvcf=out_gvcf, chr=chromo)

            job_list.append(hap_caller)

        # submit one job per bam
        out_stem = args.out_dir + sample + '.hap_calling'
        q_sub(job_list, out=out_stem, t=60, rmem=8, mem=8, scheduler='SLURM')
def main():
    """Write SLURM array jobs (200 tasks each) wrapping convert_to_bed.py, then
    submit (or write) a control job that runs them.

    Array indices are expressed modulo 1000 with a '-n_thou' flag carrying the
    thousands. Fix: the final batch previously set end to the absolute n_runs
    while start was already modulo 1000, producing an inconsistent array range
    once i passed 1000; end is now kept on the same modulo-1000 scale.
    """
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-maf', help='MAF file containg alignment, must be compressed', required=True)
    parser.add_argument('-ref_sp', help='Species to use for coordinates in output bed', required=True)
    parser.add_argument('-ref_sizes', help='Sizes file to extract chromosomes for ref species', required=True)
    parser.add_argument('-out', help='Output directory', required=True)
    parser.add_argument('-no_sub', default=False, action='store_true')
    args = parser.parse_args()

    # number of runs - one per chromosome in the sizes file
    n_runs = len([x.split()[0] for x in open(args.ref_sizes)])

    # loop through queries in batches of 200
    sh_list = []
    for i in range(1, n_runs, 200):
        wgabed_wrapper = ('python ~/sal_bal_sel/genome_alignment/convert_to_bed.py '
                          '-maf {} -ref_sp {} -ref_sizes {} -out {} '
                          '-chr_tag $SLURM_ARRAY_TASK_ID').format(args.maf, args.ref_sp,
                                                                  args.ref_sizes, args.out)

        # if runs above 1000, ie i == 1001 or higher, -n_thou adds the thousands back on
        n_thou = int(i / 1000)
        start = i % 1000
        multiplier = ' -n_thou {}'.format(n_thou)
        # end stays on the same modulo-1000 scale as start
        end = min(i + 199, n_runs) - n_thou * 1000

        q_write([wgabed_wrapper + multiplier], args.out + 'wgabed_start' + str(i),
                t=8, rmem=4, mem=4, array=[start, end], scheduler='SLURM')
        sh_list.append(args.out + 'wgabed_start' + str(i) + '_job.sh')

    # submit control script
    control = 'python ~/sal_bal_sel/genome_alignment/pairwise_control.py -sh ' + ' -sh '.join(sh_list)
    if args.no_sub:
        q_write([control], out=args.out + 'all_wgabed', t=72, rmem=2, mem=2, scheduler='SLURM')
    else:
        q_sub([control], out=args.out + 'all_wgabed', t=72, rmem=2, mem=2, scheduler='SLURM')
def main():
    """Submit a picard GatherVcfs job concatenating all vcf paths read from stdin."""
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-out', help='Output vcf', required=True)
    args = parser.parse_args()

    # join inputs so each path gets its own I= flag
    vcf_paths = [x.rstrip() for x in sys.stdin]
    joined = ' I='.join(vcf_paths)

    cmd = ('java -Xmx10G -jar /users/bartonhe/picard.jar GatherVcfs '
           'I={} O={}').format(joined, args.out)

    q_sub([cmd], out=args.out.replace('.vcf', ''), mem=12, rmem=12, scheduler='SLURM')
def main():
    """Submit an autosome-extraction job for every gVCF path read from stdin."""
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-out', help='Output dir', required=True)
    args = parser.parse_args()

    # vcfs, one per stdin line
    for line in sys.stdin:
        vcf = line.rstrip()
        base = vcf.replace('.g.vcf', '.autosomes.g.vcf').split('/')[-1]
        new_vcf = args.out + base

        cmd = 'cat {} | python ~/sal_enhancers/training_set/extract_autosomes.py > {}'.format(vcf, new_vcf)
        q_sub([cmd], out=new_vcf.replace('.g.vcf', ''), rmem=4, mem=4, scheduler='SLURM')
def main():
    """Submit a SelectVariants job extracting SNPs from each all-sites vcf read
    from stdin."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', help='reference genome', required=True)
    args = parser.parse_args()

    for line in sys.stdin:
        chr_vcf = line.rstrip()
        new_vcf = chr_vcf.replace('.allsites', '.raw.snps')

        snp_cmd = ('gatk SelectVariants '
                   '-R {ref} -V {vcf} -O {out} '
                   '--select-type-to-include SNP --exclude-non-variants'
                   '').format(ref=args.ref, vcf=chr_vcf, out=new_vcf)

        q_sub([snp_cmd], out=new_vcf.replace('.vcf', ''), scheduler='SLURM')
def main():
    """Pair fastq files by sample and submit a trim_galore job per sample not
    already cleaned."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-fastq_dir', help='directroy containg fastq files', required=True)
    parser.add_argument('-out_dir', help='output directory', required=True)
    args = parser.parse_args()

    # check out dir for complete cleaned samples
    complete = find_complete(args.out_dir)

    # pair reads: sample name is the filename up to the first underscore
    reads = {}
    for file_name in os.listdir(args.fastq_dir):
        if not file_name.endswith('.fastq.gz'):
            continue
        sample = file_name.split('_')[0]
        if sample in complete:
            continue
        if sample not in reads.keys():
            reads[sample] = []
        reads[sample].append(args.fastq_dir + file_name)

    # cleaning
    for s in reads.keys():
        # assumes each sample has exactly two mates that sort into (R1, R2)
        # order - TODO confirm against the fastq naming scheme
        r1, r2 = sorted(reads[s])
        cmd = ('trim_galore --fastqc --output_dir {out} --paired {r1} {r2}'
               '').format(out=args.out_dir, r1=r1, r2=r2)
        print('running: ' + cmd)
        q_sub([cmd], out=args.out_dir + s, rmem=8, mem=8, scheduler='SLURM', t=2)
def main():
    """For each block directory read from stdin, create aligned/tmp work dirs
    and submit a roast multiple-alignment job."""
    for raw in sys.stdin:
        out_dir = raw.rstrip() + '/'
        align_dir = out_dir + 'aligned/'
        tmp_dir = align_dir + 'tmp/'

        # makedirs creates align_dir and its tmp subdir (fails if they exist)
        print('creating: ' + align_dir)
        os.makedirs(align_dir)
        print('creating: ' + tmp_dir)
        os.makedirs(tmp_dir)

        roast = ('~/sal_enhancers/homeoblock_alignments/roast_homeoblock.py '
                 '-maf_dir {} -ref salmon').format(out_dir)
        q_sub([roast], out=out_dir + 'multiple_align', scheduler='SLURM')
        print()
def main():
    """Submit a two-step BQSR job (BaseRecalibrator then ApplyBQSR) for every
    bam path read from stdin."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-ref', help='reference genome', required=True)
    parser.add_argument('-train_vcf', help='vcf file of training data', required=True)
    parser.add_argument('-out_dir', help='output directory', required=True)
    args = parser.parse_args()

    bams = [x.rstrip() for x in sys.stdin]
    for bam in bams:
        bam_stem = bam.replace('.bam', '').split('/')[-1]
        out_table = '{}{}.table'.format(args.out_dir, bam_stem)
        recal_bam = out_table.replace('.table', '.bqsr.bam')

        # step 1: build the recalibration table from known sites
        bqsr = ('gatk BaseRecalibrator '
                '-I {bam} '
                '-R {ref} '
                '--known-sites {truth} '
                '-O {table}').format(bam=bam, ref=args.ref, truth=args.train_vcf, table=out_table)

        # step 2: apply the table to produce the recalibrated bam
        apply = ('gatk ApplyBQSR '
                 '-R {ref} '
                 '-I {bam} '
                 '--bqsr-recal-file {table} '
                 '-O {new_bam}').format(ref=args.ref, bam=bam, table=out_table, new_bam=recal_bam)

        q_sub([bqsr, apply], out=out_table.replace('.table', ''), t=24, scheduler='SLURM')
def main():
    """For each simulated SFS replicate (one comma separated line per rep in
    -sim_data), write an anavar control file and submit an anavar job."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-sim_data')
    parser.add_argument('-out_dir')
    args = parser.parse_args()

    if not os.path.isdir(args.out_dir):
        os.makedirs(args.out_dir)

    counter = 0
    for line in open(args.sim_data):
        counter += 1
        # NOTE(review): assumes -sim_data is a bare filename, otherwise its
        # path is concatenated into out_stem - confirm
        out_stem = '{}{}.rep{}'.format(args.out_dir, args.sim_data.replace('.txt', ''), counter)

        # sfs for this replicate; 475625 is the hard-coded number of sites
        sfs = [int(x) for x in line.rstrip().split(',')]
        sfs_dict = {'SNP': (sfs, 475625)}

        # control file: sample size 20, gamma range (-250, 50)
        ctl = Snp1ControlFile()
        ctl.set_data(sfs_dict, 20, gamma_r=(-250, 50))
        control_contents = ctl.construct()

        ctl_name = out_stem + '.ctl.txt'
        log_name = out_stem + '.log.txt'
        res_name = out_stem + '.res.txt'
        with open(ctl_name, 'w') as o:
            print(control_contents, file=o)

        # counter doubles as the anavar seed
        cmd = 'anavar1.4 {} {} {} {}'.format(ctl_name, res_name, log_name, counter)
        q_sub([cmd], out=out_stem, evolgen=True)
def main():
    """Submit one job per chromosome directory that strips NW_ lines from each
    gVCF and indexes the trimmed copy."""
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-in_dir', help='Top level input directory', required=True)
    parser.add_argument('-out_dir', help='Output directory', required=True)
    args = parser.parse_args()

    # each subdirectory of in_dir is one chromosome
    contigs = [
        x for x in os.listdir(args.in_dir) if os.path.isdir(args.in_dir + x)
    ]

    for chromo in contigs:
        chromo_path = args.in_dir + chromo + '/'
        out_chromo_path = args.out_dir + chromo + '/'
        if not os.path.isdir(out_chromo_path):
            os.makedirs(out_chromo_path)

        vcf_list = [x for x in os.listdir(chromo_path) if x.endswith('.g.vcf')]

        # one trim + index command pair per gVCF, all run in a single job
        cmds = []
        for vcf in vcf_list:
            # drop NW_ (unplaced scaffold) lines
            trim_cmd = 'grep -v NW_ {} > {}'.format(chromo_path + vcf, out_chromo_path + vcf)
            cmds.append(trim_cmd)
            index = 'gatk IndexFeatureFile -F ' + out_chromo_path + vcf
            cmds.append(index)

        q_sub(cmds, out=args.out_dir + chromo + '_trimhead', t=10, scheduler='SLURM')
def main():
    """Submit one job per fish that merges its per-accession dedup bams and
    collects whole genome coverage metrics."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-bam_in', help='input directory of unmerged bams', required=True)
    parser.add_argument('-bam_out', help='output directory', required=True)
    parser.add_argument('-info_file', help='file with read group info', required=True)
    parser.add_argument('-ref', help='reference genome', required=True)
    args = parser.parse_args()

    # fish id -> list of read accessions
    all_read_acc = sample_reads(args.info_file)

    for fish in all_read_acc.keys():
        accessions = all_read_acc[fish]
        # one picard I= flag per accession bam
        in_bams = ' '.join(
            ['I=' + args.bam_in + x + '.dedup.bam' for x in accessions])
        bam_out = args.bam_out + fish + '.dedup.bam'

        merge_cmd = 'java -Xmx12G -jar ~/picard.jar MergeSamFiles {} O={}'.format(
            in_bams, bam_out)

        # coverage metrics on the merged bam
        wgs_metrics = ("java -Xmx12g -jar ~/picard.jar CollectWgsMetrics "
                       "I={} "
                       "O={}.wgsmetrics_file.txt "
                       "R={} INCLUDE_BQ_HISTOGRAM=true"
                       "").format(bam_out, args.bam_out + fish, args.ref)

        q_sub([merge_cmd, wgs_metrics], out=args.bam_out + fish + '_merge', mem=14, rmem=14, scheduler='SLURM')
def sel_v_neu_anavar(mode, vcf, call, sel_region, constraint, n, c, dfe, alg, nnoimp, maximp,
                     out_stem, search, degree, spread, evolgen, start_index, given, ar_ref):
    """
    submits anavar jobs to cluster after writing required files etc:
    builds the SFS data and control file for a selected-vs-neutral model,
    submits 'spread' split runs (seeded by index) and a held merge job

    :param mode: str - 'snp' or anything else for indels
    :param vcf: str
    :param call: dict
    :param sel_region: str - key into the region combinations below
    :param constraint: str
    :param n: int
    :param c: int
    :param dfe: str
    :param alg: str
    :param nnoimp: int
    :param maximp: int
    :param out_stem: str
    :param search: int
    :param degree: int
    :param spread: int - number of split runs to submit
    :param evolgen: bool
    :param start_index: int - first split index, also used as the anavar seed
    :param given: bool - seed init values from the best previous merged result
    :param ar_ref: bool
    :return: None
    """
    anavar_path = '/shared/evolgen1/shared_data/program_files/sharc/'
    anavar_cmd = '{path}anavar1.4 {ctl} {rslts} {log} {seed}'

    # sort file names
    ctl_name = out_stem + '.control.txt'
    merge_out = out_stem + '.merged.results.txt'

    # catch given on first run - there is nothing to take init values from yet
    init = ()
    if given:
        if not os.path.isfile(merge_out):
            sys.exit(
                'Given True but no previous runs completed to take besty res from'
            )
        else:
            # get best result from merged out; fields 3:-1 are the estimates
            best_res = an.ResultsFile(
                open(merge_out)).ml_estimate(as_string=True)
            init = tuple(best_res.split()[3:-1])

    # region combinations mapping a label to the selected SFS regions it covers
    region_combs = {
        'CDS': ['CDS_frameshift', 'CDS_non_frameshift'],
        'intron': ['intron'],
        'intergenic': ['intergenic'],
        'noncoding': ['intergenic', 'intron']
    }

    # make control file - snp and indel modes use different SFS prep and files
    if mode == 'snp':
        sfs_data = prepare_snp_sfs(vcf, call, n,
                                   sel_sfs_regions=region_combs[sel_region],
                                   call_sel_reg=sel_region)
        ctl = an.SNPNeuSelControlFile()
    else:
        sfs_data = prepare_indel_sfs(vcf, call, n,
                                     sel_sfs_regions=region_combs[sel_region],
                                     call_sel_reg=sel_region, ar_ref=ar_ref)
        ctl = an.IndelNeuSelControlFile()

    ctl.set_alg_opts(search=search, alg=alg, key=3,
                     epsabs=1e-20, epsrel=1e-9, rftol=1e-9,
                     maxtime=3600, optional=True,
                     maximp=maximp, nnoimp=nnoimp, init=init)
    ctl.set_data(sfs_data, n, dfe=dfe, c=c,
                 gamma_r=(-5e4, 1e5), theta_r=(1e-14, 0.1),
                 r_r=(0.01, 100), scale_r=(0.1, 5000.0))
    # 50 is the anavar default degree, only override otherwise
    if degree != 50:
        ctl.set_dfe_optional_opts(degree=degree, optional=True)
    ctl.set_constraint(constraint)
    ctl_contents = ctl.construct()
    with open(ctl_name, 'w') as control:
        control.write(ctl_contents)

    res_file_list = out_stem + '.allres.list.txt'
    hjids = []
    # append mode: result names from earlier start_index batches are kept
    with open(res_file_list, 'a') as res_list:

        # split into requested jobs
        for i in range(start_index, start_index + spread):

            split_stem = '{}.split{}'.format(out_stem, i)

            result_name = split_stem + '.results.txt'
            log_name = split_stem + '.log.txt'

            print(result_name, file=res_list)

            # call anavar - the split index doubles as the seed
            rep_cmd = anavar_cmd.format(path=anavar_path, ctl=ctl_name,
                                        rslts=result_name, log=log_name, seed=i)

            q_sub([rep_cmd], out=split_stem,
                  jid=split_stem.split('/')[-1] + '.sh', t=48, evolgen=evolgen)
            hjids.append(split_stem.split('/')[-1] + '.sh')

    # hold job to merge outputs once all splits finish
    gather = 'cat {} | ~/parus_indel/anavar_analyses/gather_searches.py {}'.format(
        res_file_list, merge_out)
    q_sub([gather], out=out_stem + '.merge', hold=hjids, evolgen=evolgen)
# list comprehension # file_list = [fas for fas in os.listdir(in_dir) if fas.endswith(".fas")] # process the files cmd_list = [] for i in range(0, len(fas_list)): fas_file = fas_dir + fas_list[i] #seq_file = seq_dir + seq_list[i] #print(fas_file) # replaces .fas with .phylip, make list by cutting on '/', [-1] takes last item of list - the filename out_name = fas_file.replace('.phylip', '.fas').split('/')[-1] out_file = out_dir + out_name # print(fas_file, seq_file, out_file) trimal_cmd = ('/data/bop17lhy/trimal/source/trimal ' '-in {} -out {}' ' -fasta').format(fas_file, out_file) cmd_list.append(trimal_cmd) #print(fas_file, out_file) # #subprocess.call(trimal_cmd, shell=True) # # submit bins of jobs for i in range(0, len(cmd_list), 100): bin_cmds = cmd_list[i:i + 100] bin_outs = out_dir + 'jobs' + str(i) + str(i + 100) q_sub(bin_cmds, out=bin_outs)
#!/usr/bin/env python from qsub import q_sub vcf = ( '/fastdata/bop15hjb/drosophila_data/dmel/analysis_ready_data/' 'dmel_17flys.gatk.raw.snps.exsnpindel.recalibrated.filtered_' 't95.0.pass.dpfiltered.50bp_max.bial.rmarked.polarised.annotated.ar.degen.vcf.gz' ) callsites = '/fastdata/bop15hjb/drosophila_data/dmel_ref/dmel.callablesites.summary_with_degen.csv' out_dir = '/fastdata/bop15hjb/drosophila_data/dmel/anavar/degree_variation/' for degree in [25, 50, 75, 100, 150, 200, 300]: for model in ['full', 'equal_t']: out = '{}dmel_cds_v_4fold_snps_continuous_dfe_degree{}.{}'.format( out_dir, degree, model) cmd = ('cds_vs_neutral_anavar_snps.py ' '-vcf {} ' '-n 17 -c 1 -dfe continuous ' '-call_csv {} ' '-neu_type 4fold ' '-out_pre {} -degree {}' '').format(vcf, callsites, out, degree) if model == 'equal_t': cmd += ' -constraint equal_mutation_rate' q_sub([cmd], out=out, t=48)
def sel_v_neu_anavar_nonsense(vcf, call, constraint, n, c, dfe, alg, nnoimp, maximp,
                              out_stem, search, degree, spread, evolgen, prem_files):
    """
    submits anavar jobs to cluster after writing required files etc:
    builds the nonsense (premature stop) SFS data and control file, submits
    'spread' split runs (seeded by index) and a held merge job

    :param vcf: str
    :param call: dict
    :param constraint: str
    :param n: int
    :param c: int
    :param dfe: str
    :param alg: str
    :param nnoimp: int
    :param maximp: int
    :param out_stem: str
    :param search: int
    :param degree: int
    :param spread: int - number of split runs to submit
    :param evolgen: bool
    :param prem_files: list - per-chromosome premature stop files
    :return: None
    """
    anavar_path = '/shared/evolgen1/shared_data/program_files/sharc/'
    anavar_cmd = '{path}anavar1.22 {ctl} {rslts} {log} {seed}'

    # sort file names
    ctl_name = out_stem + '.control.txt'

    # get nonsense data in from the per-chromosome files
    nonsense_dict = gather_chromo_prems(prem_files)
    sel_sfs, sel_m = prem_freqs_call(nonsense_dict)

    # make control file
    sfs_data = prepare_nonsense_snp_sfs(vcf, call, n, sel_sfs, sel_m)
    ctl = an.SNPNeuSelControlFile()

    ctl.set_alg_opts(search=search, alg=alg, key=3,
                     epsabs=1e-20, epsrel=1e-9, rftol=1e-9,
                     maxtime=3600, optional=True,
                     maximp=maximp, nnoimp=nnoimp)
    ctl.set_data(sfs_data, n, dfe=dfe, c=c,
                 gamma_r=(-5e4, 1e3), theta_r=(1e-10, 0.1),
                 r_r=(0.01, 100), scale_r=(0.1, 5000.0))
    # 50 is the anavar default degree, only override otherwise
    if degree != 50:
        ctl.set_dfe_optional_opts(degree=degree, optional=True)
    ctl.set_constraint(constraint)
    ctl_contents = ctl.construct()
    with open(ctl_name, 'w') as control:
        control.write(ctl_contents)

    res_file_list = out_stem + '.allres.list.txt'
    hjids = []
    with open(res_file_list, 'w') as res_list:

        # split into requested jobs
        for i in range(1, spread + 1):

            # seed = random.randint(1, 1e6)
            # deterministic seed: the split index
            seed = i

            split_stem = '{}.split{}'.format(out_stem, i)

            result_name = split_stem + '.results.txt'
            log_name = split_stem + '.log.txt'

            print(result_name, file=res_list)

            # call anavar
            rep_cmd = anavar_cmd.format(path=anavar_path, ctl=ctl_name,
                                        rslts=result_name, log=log_name, seed=seed)

            q_sub([rep_cmd], out=split_stem,
                  jid=split_stem.split('/')[-1] + '.sh', t=8, evolgen=evolgen)
            hjids.append(split_stem.split('/')[-1] + '.sh')

    # hold job to merge outputs once all splits finish
    merge_out = out_stem + '.merged.results.txt'
    gather = 'cat {} | gather_searches.py {}'.format(res_file_list, merge_out)
    q_sub([gather], out=out_stem + '.merge', hold=hjids, evolgen=evolgen)