def vcf2bed(chrom, pop):
    """Template: convert a recoded VCF into a pruned PLINK bed file.

    The job runs ``plink`` twice: once with ``--make-bed`` and
    ``--indep-pairwise 50 10 0.1`` on the VCF, and once more to extract the
    variants listed in ``<base>.prune.in`` into ``<base>.pruned``.

    Args:
        chrom: Chromosome label interpolated into the file names.
        pop: Population label interpolated into the file names.

    Returns:
        AnonymousTarget with the recoded VCF as its only input and the
        pruned bed file as its only output.
    """
    stem = f'chr{chrom}_{pop}'
    vcf_path = f'steps/recode_vcf/{stem}.recode.vcf'
    bed_path = f'steps/plink/{stem}.bed'
    # presumably strips the '.bed' extension to get plink's --out prefix;
    # verify against modpath's definition
    prefix = modpath(bed_path, suffix=('.bed', ''))
    pruned_path = f'steps/plink/{stem}.pruned.bed'

    script = f'''
    mkdir -p steps/plink
    plink --vcf {vcf_path} --make-bed --double-id --geno 0.025 --indep-pairwise 50 10 0.1 \
        --out {prefix}
    plink --bfile {prefix} --extract {prefix}.prune.in --make-bed --out {prefix}.pruned
    '''

    return AnonymousTarget(
        inputs=[vcf_path],
        outputs=[pruned_path],
        options={'memory': '2g', 'walltime': '02:00:00'},
        spec=script,
    )
def argsample(sites_file, times_file, popsize_file, recomb_file, bed_file):
    """Template: run ``arg-sample`` and convert its output with ``smc2bed-all``.

    Args:
        sites_file: Passed to ``arg-sample -s``; declared as an input.
        times_file: Passed to ``--times-file``.
        popsize_file: Passed to ``--popsize-file``.
        recomb_file: Passed to ``--recombmap``; declared as an input.
        bed_file: Declared output; the output prefix, log file and tabix
            index paths are all derived from it via ``modpath``.

    Returns:
        AnonymousTarget whose outputs are keyed ``bed_file``, ``log_file``
        and ``tabix_file``.
    """
    out_dir = os.path.dirname(bed_file)
    # TODO should be: modpath(bed_file, suffix=('.bed.gz', ''))
    run_prefix = modpath(bed_file, suffix='')

    produced = {
        'bed_file': bed_file,
        'log_file': modpath(run_prefix, suffix='.log'),
        'tabix_file': modpath(run_prefix, suffix='.bed.gz.tbi'),
    }
    # NOTE(review): times_file and popsize_file are used by the command below
    # but are not declared as inputs -- confirm that this is intentional.
    required = {'sites_file': sites_file, 'recomb_file': recomb_file}
    resources = {'memory': '40g', 'walltime': '14-00:00:00'}

    script = f'''
    mkdir -p {out_dir}
    arg-sample -s {sites_file} \
        --times-file {times_file} \
        --popsize-file {popsize_file} \
        --recombmap {recomb_file} \
        -m 1.247e-08 \
        -c 25 \
        -n 30000 \
        --overwrite \
        -o {run_prefix} \
    && \
    ./argweaver/bin/smc2bed-all {run_prefix}
    '''

    return AnonymousTarget(inputs=required, outputs=produced, options=resources, spec=script)
def mock_template(mocker):
    """Return a ``MagicMock`` that stands in for a template function.

    The mock is named ``"mock_template"`` and, when called, returns an
    ``AnonymousTarget`` with no inputs, no outputs and empty options.
    """
    fake = mocker.MagicMock()
    fake.__name__ = "mock_template"
    fake.return_value = AnonymousTarget(inputs=[], outputs=[], options={})
    return fake
def fs_master(cp_dir, run_name, i, o):
    """Template for the ``-go`` steps of an ``fs`` run.

    Args:
        cp_dir: Directory the job changes into; also the prefix of the
            output path (plain string concatenation, no separator added).
        run_name: Run name; the job invokes ``fs <run_name>.cp -go``.
        i: Inputs, passed through to the target unchanged.
        o: Suffix appended to ``cp_dir + run_name`` to form the output path.

    Returns:
        AnonymousTarget with a single output file.
    """
    expected_output = cp_dir + run_name + o
    resources = {
        'cores': 2,
        'memory': "8g",
        'walltime': "01:00:00",
        "account": 'baboondiversity',
    }
    script = f'''
    cd {cp_dir}
    fs {run_name}.cp -go
    '''
    return AnonymousTarget(inputs=i, outputs=[expected_output], options=resources, spec=script)
def admixture(k, chrom, pop):
    """Template: run ``admixture --cv`` on a pruned bed file for a given ``k``.

    The job tees the program output into a cross-validation log and then
    moves every file matching ``chr<chrom>_<pop>.pruned.<k>*`` from the
    working directory into ``results/admixture/chr<chrom>_<pop>``.

    Args:
        k: Value passed to ``admixture`` after the bed file.
        chrom: Chromosome label interpolated into the file names.
        pop: Population label interpolated into the file names.

    Returns:
        AnonymousTarget whose outputs are the ``.Q`` file, the ``.P`` file
        and the log file, in that order.
    """
    stem = f'chr{chrom}_{pop}'
    result_dir = f'results/admixture/{stem}'
    bed_path = f'steps/plink/{stem}.pruned.bed'
    q_path = f'{result_dir}/{stem}.pruned.{k}.Q'
    p_path = f'{result_dir}/{stem}.pruned.{k}.P'
    cwd_prefix = f'{stem}.pruned.{k}'
    log_path = f'results/admixture/crossvalidation/log_{stem}.{k}.out'

    script = f'''
    mkdir -p results/admixture/{stem}
    mkdir -p results/admixture/crossvalidation
    admixture --cv {bed_path} {k} | tee {log_path}
    mv {cwd_prefix}* results/admixture/{stem}
    '''

    return AnonymousTarget(
        inputs=[bed_path],
        outputs=[q_path, p_path, log_path],
        options={'memory': '5g', 'walltime': '8:00:00'},
        spec=script,
    )
def command_files(block, block_number, cp_dir, run_name, cf, i):
    """Template: run one slice of a generated command file.

    The shell script counts the lines of the command file, computes the
    first and last line of slice ``block`` out of ``block_number`` with
    integer arithmetic, pipes those lines to ``bash`` and finally touches a
    marker file.

    Args:
        block: Slice number, used in the shell arithmetic and in the
            marker file name.
        block_number: Total number of slices.
        cp_dir: Directory the job changes into; prefix of the output path.
        run_name: Run directory containing ``commandfiles/``.
        cf: Command file name; its fifth-from-last character is used in
            the marker file name.
        i: Inputs, passed through to the target unchanged.

    Returns:
        AnonymousTarget whose ``outputs`` is the marker path as a plain string.
    """
    marker = f'{run_name}/commandfiles/{cf[-5]}_{block}_{block_number}'
    resources = {
        'cores': 4,
        'memory': "16g",
        'walltime': "04:00:00",
        "account": 'baboondiversity',
    }
    script = f'''
    cd {cp_dir}
    file_length=$(wc -l < {run_name}/commandfiles/{cf})
    start=$((file_length*{block-1}/{block_number}+1))
    stop=$((file_length*{block}/{block_number}))
    sed -n "$start,$stop p" {run_name}/commandfiles/{cf} | bash
    touch {marker}
    '''
    return AnonymousTarget(inputs=i, outputs=cp_dir + marker, options=resources, spec=script)
def command_files_single(cp_dir, run_name, cf, i):
    """Template: run a whole generated command file through ``parallel``.

    After the commands have run, a marker file is touched. The generated
    spec is also printed to stdout each time the template is called.

    Args:
        cp_dir: Directory the job changes into; prefix of the output path.
        run_name: Run directory containing ``commandfiles/``.
        cf: Command file name; its fifth-from-last character names the
            marker file.
        i: Inputs, passed through to the target unchanged.

    Returns:
        AnonymousTarget whose ``outputs`` is the marker path as a plain string.
    """
    marker = f'{run_name}/commandfiles/{cf[-5]}'
    resources = {
        'cores': 8,
        'memory': "16g",
        'walltime': "04:00:00",
        "account": 'baboondiversity',
    }
    script = f'''
    cd {cp_dir}
    cat {run_name}/commandfiles/{cf} | parallel
    touch {marker}
    '''
    print(script)
    return AnonymousTarget(inputs=i, outputs=cp_dir + marker, options=resources, spec=script)
def fs_start(cp_dir, run_name, idfile, phasefile, recombfile, s3iters, s4iters, s1minsnps, s1indfrac):
    """Template that initialises an ``fs`` run in hpc mode.

    Extra ``fs`` options should be added to the command line built here.

    Args:
        cp_dir: Directory the job changes into.
        run_name: Run name; the job invokes ``fs <run_name>.cp``.
        idfile: Passed to ``-idfile``, prefixed with ``../../``.
        phasefile: Passed to ``-phasefiles``, prefixed with ``../../``.
        recombfile: Passed to ``-recombfiles``, prefixed with ``../../``.
        s3iters: Value for ``-s3iters``.
        s4iters: Value for ``-s4iters``.
        s1minsnps: Value for ``-s1minsnps``.
        s1indfrac: Value for ``-s1indfrac``.

    Returns:
        AnonymousTarget whose single output is
        ``<cp_dir>/<run_name>/commandfiles/commandfile1.txt``.
    """
    first_commandfile = f'{cp_dir}/{run_name}/commandfiles/commandfile1.txt'
    resources = {
        'cores': 1,
        'memory': '8g',
        'walltime': '01:00:00',
        'account': 'baboondiversity',
    }
    script = f'''
    cd {cp_dir}
    fs {run_name}.cp -hpc 1 -idfile ../../{idfile} -phasefiles ../../{phasefile} -recombfiles ../../{recombfile} \
        -s3iters {s3iters} -s4iters {s4iters} -s1minsnps {s1minsnps} -s1indfrac {s1indfrac} -go
    '''
    return AnonymousTarget(
        inputs=[idfile, phasefile, recombfile],
        outputs=[first_commandfile],
        options=resources,
        spec=script,
    )
def bwa_map(ref_genome, r1, r2, bamfile):
    """Template for mapping reads to a reference genome with `bwa` and `samtools`.

    Args:
        ref_genome: Reference passed to ``bwa mem``; its ``.amb``, ``.ann``
            and ``.pac`` companions are declared as inputs.
        r1: First reads file.
        r2: Second reads file.
        bamfile: Path written by ``samtools rmdup``; the only output.

    Returns:
        AnonymousTarget requesting 16 cores and 1g of memory.
    """
    index_files = [f'{ref_genome}.{ext}' for ext in ('amb', 'ann', 'pac')]
    script = f'''
    bwa mem -t 16 {ref_genome} {r1} {r2} | \
    samtools sort | \
    samtools rmdup -s - {bamfile}
    '''
    return AnonymousTarget(
        inputs=[r1, r2] + index_files,
        outputs=[bamfile],
        options={'cores': 16, 'memory': '1g'},
        spec=script,
    )
def vcf_filter(vcf_file, chrom, popfile, pop):
    """Template: recode a gzipped VCF with ``vcftools``, keeping listed samples.

    Args:
        vcf_file: Gzipped VCF passed to ``--gzvcf``; the only input.
        chrom: Chromosome label interpolated into the output name.
        popfile: File name under ``data/`` passed to ``--keep``.
        pop: Population label interpolated into the output name.

    Returns:
        AnonymousTarget whose single output is
        ``steps/recode_vcf/chr<chrom>_<pop>.recode.vcf``.
    """
    recoded_vcf = f'steps/recode_vcf/chr{chrom}_{pop}.recode.vcf'
    # presumably strips '.recode.vcf' to get vcftools' --out prefix;
    # verify against modpath's definition
    out_prefix = modpath(recoded_vcf, suffix=('.recode.vcf', ''))
    resources = {
        'cores': 1,
        'memory': '2g',
        'walltime': '02:00:00',
        'account': 'baboondiversity',
    }
    script = f'''
    mkdir -p steps/recode_vcf
    vcftools --gzvcf {vcf_file} --recode --keep data/{popfile} \
        --out {out_prefix}
    '''
    return AnonymousTarget(inputs=[vcf_file], outputs=[recoded_vcf], options=resources, spec=script)
def vcf2plink(chrom, pop):
    """Template: convert a recoded VCF to PLINK ped/map files (``--recode12``).

    Args:
        chrom: Chromosome label interpolated into the file names.
        pop: Population label interpolated into the file names.

    Returns:
        AnonymousTarget with the recoded VCF as input and the ``.ped`` and
        ``.map`` files as outputs.
    """
    stem = f'chr{chrom}_{pop}'
    vcf_path = f'steps/recode_vcf/{stem}.recode.vcf'
    ped_path = f'steps/plink/{stem}.ped'
    map_path = f'steps/plink/{stem}.map'
    # presumably strips '.ped' to get plink's --out prefix; verify against modpath
    out_prefix = modpath(ped_path, suffix=('.ped', ''))
    resources = {
        'cores': 1,
        'memory': '2g',
        'walltime': '10:00:00',
        'account': 'baboondiversity',
    }
    script = f'''
    mkdir -p steps/plink
    plink --vcf {vcf_path} --recode12 --double-id --geno 0.025 --out {out_prefix}
    '''
    return AnonymousTarget(
        inputs=[vcf_path],
        outputs=[ped_path, map_path],
        options=resources,
        spec=script,
    )
def plink2finestructure(chrom, pop):
    """Template: convert PLINK ped/map files with ``plink2chromopainter.pl``.

    Args:
        chrom: Chromosome label interpolated into the file names.
        pop: Population label interpolated into the file names.

    Returns:
        AnonymousTarget with the ped and map files as inputs and the phase
        and ids files (in that order) as outputs.
    """
    plink_stem = f'steps/plink/chr{chrom}_{pop}'
    fs_stem = f'steps/finestructure/chr{chrom}_{pop}'
    ped_path = f'{plink_stem}.ped'
    map_path = f'{plink_stem}.map'
    ids_path = f'{fs_stem}.ids'
    phase_path = f'{fs_stem}.phase'  # Fix name
    resources = {
        'cores': 1,
        'memory': '15g',
        'walltime': '10:00:00',
        'account': 'baboondiversity',
    }
    script = f'''
    mkdir -p steps/finestructure
    ../../../software/fs_janne/plink2chromopainter.pl -p={ped_path} -m={map_path} \
        -d={ids_path} -o={phase_path}
    '''
    return AnonymousTarget(
        inputs=[ped_path, map_path],
        outputs=[phase_path, ids_path],
        options=resources,
        spec=script,
    )
def nbconvert(notebook_file_name, dependencies=None, inplace=False,
              output_format='notebook', allow_errors=False, timeout=-1):
    """Template: execute a Jupyter notebook with ``jupyter nbconvert``.

    The job copies the notebook to ``$TMPDIR``, runs ``jupyter nbconvert
    --execute`` on it and then copies the saved copy back over the notebook.

    Args:
        notebook_file_name: Notebook to execute; also the declared output.
        dependencies: Files the notebook depends on, used as the target's
            inputs. Defaults to no inputs.
        inplace: If true, pass ``--inplace`` to nbconvert.
        output_format: Value passed to ``--to``.
        allow_errors: If true, pass ``--allow-errors`` to nbconvert.
        timeout: Value for ``--ExecutePreprocessor.timeout``.

    Returns:
        AnonymousTarget using the module-level ``args.cores`` and
        ``args.total_memory`` as resource options.
    """
    # A fresh list per call instead of a shared mutable default argument.
    inputs = [] if dependencies is None else dependencies
    outputs = [notebook_file_name]
    options = {
        'cores': args.cores,
        'memory': args.total_memory,
    }

    # Separate names for the CLI flags so the boolean parameters are not
    # rebound. The original interpolated `inplace and '--allow-errors' or ''`,
    # which emitted --allow-errors (never --inplace) when inplace was requested.
    allow_errors_flag = '--allow-errors' if allow_errors else ''
    inplace_flag = '--inplace' if inplace else ''

    # The command is invoked directly: the original `nbconvert_cmd = "..."`
    # is not a shell assignment (spaces around `=`) and never ran jupyter,
    # so the whole && chain failed.
    # NOTE(review): the final cp restores the pre-execution copy over the
    # notebook, which discards the result of --inplace -- confirm intent.
    spec = f'''
    cp {notebook_file_name} $TMPDIR/`basename {notebook_file_name}` && \
    jupyter nbconvert --ClearOutputPreprocessor.enabled=True \
        --ExecutePreprocessor.timeout={timeout} {allow_errors_flag} {inplace_flag} \
        --to {output_format} --execute {notebook_file_name} && \
    cp $TMPDIR/`basename {notebook_file_name}` {notebook_file_name}
    '''

    return AnonymousTarget(inputs=inputs, outputs=outputs, options=options, spec=spec)
def sum_ts(t_count_files, output_dir):
    """Template: concatenate count files and pipe them to ``scripts/sum_ts.awk``.

    Args:
        t_count_files: Iterable of file paths, joined with spaces on the
            ``cat`` command line.
        output_dir: Directory in which ``sum.txt`` is written.

    Returns:
        AnonymousTarget with inputs keyed ``t_count_files`` and outputs
        keyed ``sum_file``.
    """
    sum_path = os.path.join(output_dir, 'sum.txt')
    cat_args = ' '.join(t_count_files)
    script = f"""
    cat {cat_args} | awk -f scripts/sum_ts.awk > {sum_path}
    """
    return AnonymousTarget(
        inputs={'t_count_files': t_count_files},
        outputs={'sum_file': sum_path},
        options={},
        spec=script,
    )
def template_returning_anonymous_target():
    """Return a fixed ``AnonymousTarget`` with a working dir and a dummy spec."""
    settings = dict(
        inputs=[],
        outputs=[],
        options={},
        working_dir="/some/dir",
        spec="this is the spec",
    )
    return AnonymousTarget(**settings)
def vcf_to_zarr(chrom, i):
    """Template: run ``scripts/vcf_to_zarr.py`` for one chromosome.

    Paths are built from the module-level ``vcf_dir``, ``vcf_names`` and
    ``zarr_dir`` by plain string concatenation.

    Args:
        chrom: Chromosome label; formatted into ``vcf_names`` and appended
            to ``zarr_dir``.
        i: Inputs, passed through to the target unchanged.

    Returns:
        AnonymousTarget whose ``outputs`` is the zarr path as a plain string.
    """
    vcf_path = vcf_dir + vcf_names.format(chrom)
    zarr_path = zarr_dir + chrom
    return AnonymousTarget(
        inputs=i,
        outputs=zarr_path,
        options={'memory': '10g', 'walltime': '0-08:00:00'},
        spec=f"python scripts/vcf_to_zarr.py {vcf_path} {zarr_path}",
    )
def bam_index(path):
    """Template: index a file with ``samtools index``.

    Args:
        path: File to index.

    Returns:
        AnonymousTarget with input ``path`` and output ``path + '.bai'``,
        both stored under the key ``'path'``.
    """
    index_path = path + '.bai'
    return AnonymousTarget(
        inputs={'path': path},
        outputs={'path': index_path},
        options={'memory': '4g', 'walltime': '0-02:00:00'},
        spec=f'samtools index {path}',
    )
def compute_avg_len(seq_len_file, output_dir):
    """Template: pipe a sequence-length file through ``scripts/compute_avg.awk``.

    Args:
        seq_len_file: Input file fed to ``cat``.
        output_dir: Directory receiving ``<basename of seq_len_file>.avg``.

    Returns:
        AnonymousTarget with inputs keyed ``seq_len_file`` and outputs
        keyed ``avg_file``.
    """
    avg_path = os.path.join(output_dir, os.path.basename(seq_len_file) + '.avg')
    script = f"""
    cat {seq_len_file} | awk -f scripts/compute_avg.awk > {avg_path}
    """
    return AnonymousTarget(
        inputs={'seq_len_file': seq_len_file},
        outputs={'avg_file': avg_path},
        options={},
        spec=script,
    )
def bam2fastq(path):
    """Template: convert a BAM file with ``bamToFastq``.

    Args:
        path: File passed to ``bamToFastq -i``.

    Returns:
        AnonymousTarget whose output (key ``'path'``) is
        ``modpath(path, suffix='.fq')``.
    """
    fastq_path = modpath(path, suffix='.fq')
    return AnonymousTarget(
        inputs={'path': path},
        outputs={'path': fastq_path},
        options={'memory': '4g', 'walltime': '0-02:00:00'},
        spec=f'bamToFastq -i {path} -fq {fastq_path}',
    )
def bam_chr2(path, index):
    """Template: extract region ``2`` from a BAM file with ``samtools view``.

    The output name is built by ``modpath`` from the name of the directory
    containing ``path`` and the suffix ``.chr2.bam``.

    Args:
        path: BAM file to read.
        index: Index file; declared as an input only, not used on the
            command line.

    Returns:
        AnonymousTarget with inputs keyed ``path`` and ``index`` and the
        output keyed ``path``.
    """
    parent_dir_name = os.path.basename(os.path.dirname(path))
    subset_path = modpath(path, base=parent_dir_name, suffix='.chr2.bam')
    return AnonymousTarget(
        inputs={'path': path, 'index': index},
        outputs={'path': subset_path},
        options={'memory': '4g', 'walltime': '0-02:00:00'},
        spec=f'samtools view -f 3 -F 4 -b -h {path} -o {subset_path} 2',
    )
def tabix_index(path):
    """Create a tabix index for a VCF file, overwriting any existing index.

    Args:
        path (str): Path to VCF file.

    Returns:
        gwf.AnonymousTarget: GWF target whose ``outputs`` is the index path
        ``path + '.tbi'`` as a plain string.
    """
    index_path = path + '.tbi'
    resources = {'memory': '4g', 'walltime': '0-01:00:00'}
    return AnonymousTarget(
        inputs={'path': path},
        outputs=index_path,
        options=resources,
        spec=f'tabix -f -p vcf {path}',
    )
def filter_chromosomes(sample_file, output_dir):
    """Template: keep only lines matching ``chrom1`` or ``chrom3``.

    Args:
        sample_file: File searched by ``grep -E``.
        output_dir: Directory receiving ``<basename of sample_file>.filtered``.

    Returns:
        AnonymousTarget whose inputs and outputs are both keyed
        ``sample_file``.
    """
    filtered_path = os.path.join(output_dir, f'{os.path.basename(sample_file)}.filtered')
    script = f"""
    grep -E 'chrom1|chrom3' {sample_file} > {filtered_path}
    """
    return AnonymousTarget(
        inputs={'sample_file': sample_file},
        outputs={'sample_file': filtered_path},
        options={},
        spec=script,
    )
def collapse_matrices_upon_eachother(matrices, output_file):
    """
    Add coverage matrices together value by value with ``transcov collapse``,
    which can be helpful when looking at coverage across different samples.

    Args:
        matrices: Matrix files; used as the inputs and joined with spaces
            on the command line.
        output_file: Path passed to ``--output-file``; the only output.
    """
    matrix_args = " ".join(matrices)
    script = f"""
    transcov collapse {matrix_args} --output-file {output_file}
    """
    return AnonymousTarget(
        inputs=matrices,
        outputs=[output_file],
        options={"walltime": "1:00:00", "memory": "24gb"},
        spec=script,
    )
def generate_matrix(bam_file, bed_file, output_file):
    """
    Given a bam file and a bed file created by the preprocessor, generate a
    coverage matrix showing read depth in a region around the TSS, using
    ``transcov generate``.

    Args:
        bam_file: First positional argument of ``transcov generate``.
        bed_file: Second positional argument of ``transcov generate``.
        output_file: Path passed to ``--output-file``; the only output.
    """
    script = f"""
    transcov generate {bam_file} {bed_file} --output-file {output_file}
    """
    return AnonymousTarget(
        inputs=[bam_file, bed_file],
        outputs=[output_file],
        options={"walltime": "12:00:00", "memory": "6gb"},
        spec=script,
    )
def preprocess_gencode_annotation(annotation_file, bed_file, tss_file, region_size):
    """
    Generate a bed file and a metadata file defining the TSSs to look for
    in the bam files, using ``transcov preprocess``. These TSSs define the
    rows of the matrices.

    Args:
        annotation_file: Annotation passed to ``transcov preprocess``; the
            only input.
        bed_file: Path passed to ``--bed-file``; first output.
        tss_file: Path passed to ``--tss-file``; second output.
        region_size: Value passed to ``--region-size``.
    """
    script = f"""
    transcov preprocess {annotation_file} --bed-file {bed_file} --tss-file {tss_file} --region-size {region_size}
    """
    return AnonymousTarget(
        inputs=[annotation_file],
        outputs=[bed_file, tss_file],
        options={},
        spec=script,
    )
def arg_recomb_file(recomb_map, start, end, recomb_file):
    """Template: run ``scripts/argsample_rec_window.py`` for a window on chrX.

    Args:
        recomb_map: Map file passed to the script; the only input.
        start: Window start passed to the script.
        end: Window end passed to the script.
        recomb_file: File the script is asked to write; its directory is
            created first.

    Returns:
        AnonymousTarget with the output keyed ``recomb_file``.
    """
    target_dir = os.path.dirname(recomb_file)
    script = f'''
    mkdir -p {target_dir}
    python scripts/argsample_rec_window.py {recomb_map} chrX {start} {end} {recomb_file}
    '''
    return AnonymousTarget(
        inputs=[recomb_map],
        outputs={'recomb_file': recomb_file},
        options={'memory': '4g', 'walltime': '01:00:00'},
        spec=script,
    )
def merge_chrom_files(input_files, output_file):
    """Template: concatenate files with ``cat`` into a single output file.

    Args:
        input_files: Files to concatenate, in order; also the inputs.
        output_file: Destination file; its directory is created first.

    Returns:
        AnonymousTarget with ``output_file`` as its only output.
    """
    target_dir = os.path.dirname(output_file)
    sources = " ".join(input_files)
    script = f'''
    mkdir -p {target_dir}
    cat {sources} > {output_file}
    '''
    return AnonymousTarget(
        inputs=input_files,
        outputs=[output_file],
        options={'memory': '4g', 'walltime': '00:10:00'},
        spec=script,
    )
def arg_sites_file(start, end, sites_file, fasta_files):
    """Template: run ``scripts/argsample_sites_file.py`` for a window on X.

    Args:
        start: Window start passed to the script.
        end: Window end passed to the script.
        sites_file: File the script is asked to write; its directory is
            created first.
        fasta_files: FASTA files appended to the command line, separated
            by spaces; also the inputs.

    Returns:
        AnonymousTarget with the output keyed ``sites_file``.
    """
    target_dir = os.path.dirname(sites_file)
    fasta_args = " ".join(fasta_files)
    script = f'''
    mkdir -p {target_dir}
    python scripts/argsample_sites_file.py X {start} {end} {sites_file} {fasta_args}
    '''
    return AnonymousTarget(
        inputs=fasta_files,
        outputs={'sites_file': sites_file},
        options={'memory': '4g', 'walltime': '01:00:00'},
        spec=script,
    )
def compute_stats(sample_file, output_dir):
    """Template: run ``scripts/compute_stats.py`` on a sample file.

    Args:
        sample_file: File passed as the script's first argument.
        output_dir: Directory receiving ``<basename>.tcounts`` and
            ``<basename>.seqlengths``.

    Returns:
        AnonymousTarget with the input keyed ``sample_file`` and outputs
        keyed ``t_count_file`` and ``seq_len_file``.
    """
    base = os.path.basename(sample_file)
    t_count_path = os.path.join(output_dir, f'{base}.tcounts')
    seq_len_path = os.path.join(output_dir, f'{base}.seqlengths')
    script = f"""
    python scripts/compute_stats.py \
        {sample_file} \
        {t_count_path} \
        {seq_len_path}
    """
    return AnonymousTarget(
        inputs={'sample_file': sample_file},
        outputs={'t_count_file': t_count_path, 'seq_len_file': seq_len_path},
        options={},
        spec=script,
    )
def unzip(inputfile, outputfile):
    """A template for unzipping files with ``gzcat``.

    Args:
        inputfile: Compressed file to read; the only input.
        outputfile: File that receives the ``gzcat`` output; the only output.

    Returns:
        AnonymousTarget requesting 1 core and 2g of memory.
    """
    script = f'''
    gzcat {inputfile} > {outputfile}
    '''
    return AnonymousTarget(
        inputs=[inputfile],
        outputs=[outputfile],
        options={'cores': 1, 'memory': '2g'},
        spec=script,
    )