def get_freq_cutoffs(tablefile):
    """
    Determine MAF using ploidy and the number of samples per pool.

    Sums across ploidy values for a given pool to determine MAF.

    Assumes:
    - equal ploidy across samples/pools

    Positional arguments:
    tablefile - path to VariantsToTable output - used to find ploidy etc

    Returns:
    lowfreq  - minimum allele freq to keep (MAF)
    highfreq - maximum allele freq to keep (1-MAF)
    """
    pooldir = op.dirname(op.dirname(tablefile))
    parentdir = op.dirname(pooldir)
    pool = op.basename(pooldir)
    poolsamps = pklload(op.join(parentdir, 'poolsamps.pkl'))[pool]
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    lowfreq = 1 / sum(ploidy.values())
    if lowfreq == 1 or len(poolsamps) == 1:
        # for megagametophyte data
        lowfreq = 0
    pklfile = op.join(parentdir, 'maf.pkl')
    if op.exists(pklfile):
        lowfreq = float(pklload(pklfile))
    highfreq = 1 - lowfreq
    return lowfreq, highfreq
def get_freq_cutoffs(tablefile):
    """
    Determine MAF using ploidy.

    Assumes:
    - equal ploidy across samples/pools

    Positional arguments:
    tablefile - path to VariantsToTable output - used to find ploidy etc

    Returns:
    lowfreq  - minimum allele freq to keep (MAF)
    highfreq - maximum allele freq to keep (1-MAF)
    ploidy   - count of haploid genomes in pool/sample
    """
    pooldir = op.dirname(op.dirname(tablefile))
    parentdir = op.dirname(pooldir)
    pool = op.basename(pooldir)
    poolsamps = pklload(op.join(parentdir, 'poolsamps.pkl'))[pool]
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    lowfreq = 1 / (ploidy * len(poolsamps))
    if lowfreq == 1:
        # for megagametophyte data
        lowfreq = 0
    highfreq = 1 - lowfreq
    return lowfreq, highfreq, ploidy
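# --- illustration (not pipeline code) ---------------------------------------
# A minimal sketch of the cutoff math in the two get_freq_cutoffs() variants
# above, using made-up numbers. Under the equal-ploidy assumption both
# variants agree: with 20 diploid samples in a pool, the rarest detectable
# allele is 1 haploid genome out of 40.
ploidy_per_samp = 2                              # hypothetical: diploid samples
poolsamps = ['samp%d' % i for i in range(20)]    # hypothetical pool of 20 samples
ploidy = {samp: ploidy_per_samp for samp in poolsamps}
lowfreq_v1 = 1 / sum(ploidy.values())                 # first variant: dict of ploidies
lowfreq_v2 = 1 / (ploidy_per_samp * len(poolsamps))   # second variant: int ploidy
assert lowfreq_v1 == lowfreq_v2 == 0.025
highfreq = 1 - lowfreq_v1                             # keep freqs in [0.025, 0.975]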
def remove_paralogs(snps, parentdir, snpspath, pool):
    """
    Remove sites from the snptable where multiple gene copies are thought
    to align to the same position.

    # assumes
    # paralog file has 'CHROM' and 'locus' in the header (best if this is the only data, reads in quicker)
    # where CHROM is the reference chromosome/scaffold
    # where locus is hyphen-separated CHROM-POS
    # paralog file is created from calling SNPs on haplotype data as diploid
    # no need to worry about translating stitched -> unstitched if SNPs called on same reference
    """
    parpkl = op.join(parentdir, 'paralog_snps.pkl')
    if op.exists(parpkl):
        # read in paralogfile
        paralogdict = pklload(parpkl)
        if paralogdict[pool] is not None:
            print('Removing paralog sites ...')
            paralogs = pd.read_csv(paralogdict[pool], sep='\t')
            # remove and isolate paralogs from snps
            truths = snps['locus'].isin(paralogs['locus'])
            found_paralogs = snps[truths].copy()
            snps = snps[~truths].copy()
            snps.index = range(len(snps.index))
            # write paralogs to a file
            parafile = snpspath.replace(".txt", "_PARALOGS.txt")
            found_paralogs = mark_nas(found_paralogs, 'paralog SNPs')
            found_paralogs.to_csv(parafile, sep='\t', index=False)
    print(f'{op.basename(snpspath)} has {len(snps.index)} non-paralog SNPs')
    return snps
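# --- illustration (not pipeline code) ---------------------------------------
# Hedged sketch of the 'locus' membership test used in remove_paralogs(),
# with fabricated loci. As the docstring notes, 'locus' is hyphen-separated
# CHROM-POS, so paralog removal reduces to a set-membership filter.
import pandas as pd

snps = pd.DataFrame({'locus': ['chr1-100', 'chr1-250', 'chr2-7']})
paralogs = pd.DataFrame({'locus': ['chr1-250']})
truths = snps['locus'].isin(paralogs['locus'])
assert snps[~truths]['locus'].tolist() == ['chr1-100', 'chr2-7']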
def get_prereqs(bedfile, pooldir, parentdir, pool, program):
    """Get object names."""
    num = bedfile.split("_")[-1].split(".bed")[0]
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    outdir = makedir(op.join(pooldir, program))
    vcf = op.join(outdir, f'{pool}_{program}_bedfile_{num}.vcf')
    return (num, ref, vcf)
def filter_freq(df, tf, tipe, tablefile):
    """
    Filter out loci with global MAF < 1/(total_ploidy_across_pools).

    Right now this is unnecessary for varscan when setting pool-level
    freq to 1/ploidy.

    Positional arguments:
    df - pandas.dataframe; VariantsToTable output
    tf - str; basename of tablefile
    tipe - str; one of either "SNP" or "INDEL"
    tablefile - path to VariantsToTable output - used to find ploidy etc

    Returns:
    df - pandas.dataframe; freq-filtered VariantsToTable output
    """
    # believe it or not, it's faster to do qual and freq filtering in two steps vs an 'and' statement
    lowfreq, highfreq = get_freq_cutoffs(tablefile)
    print(f'filtering for global frequency ({lowfreq}, {highfreq})...')
    df.reset_index(drop=True, inplace=True)
    # prep for filtering
    freqcols = [col for col in df.columns if '.FREQ' in col]
    pool = op.basename(op.dirname(op.dirname(tablefile)))
    parentdir = op.dirname(op.dirname(op.dirname(tablefile)))
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    # carry on with poolseq data
    filtloci = []
    afs = []
    copy = get_copy(df, freqcols)
    for locus in tqdm(copy.columns):
        freqs = dict((samp.replace(".FREQ", ""), freq)
                     for (samp, freq) in copy[locus].str.rstrip('%').astype('float').items()
                     if not math.isnan(freq))  # faster than .str.rstrip('%').astype('float').dropna()
        if len(freqs) > 0:  # avoid loci with all freqs masked (avoid ZeroDivisionError)
            # calc globfreq using the samps/ploidy that are present for this locus
            globfreq = (sum([ploidy[samp] * (freq / 100) for (samp, freq) in freqs.items()])
                        / sum([ploidy[samp] for samp in freqs]))
            if lowfreq <= globfreq <= highfreq:
                filtloci.append(locus)
                # since we're going in order of rows in df ...
                # ... we can use afs to replace AF col later since we reduce df to filtloci
                afs.append(globfreq)  # about 40x faster than: df.loc[locus, 'AF'] = globfreq
    print(f'{tf} has {len(filtloci)} {tipe}s that have global MAF > {lowfreq*100}%')
    df = df[df.index.isin(filtloci)].copy()
    df.index = range(len(df.index))
    df['AF'] = afs
    return df
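# --- illustration (not pipeline code) ---------------------------------------
# Hedged sketch of the ploidy-weighted global-frequency step in filter_freq(),
# with made-up pool names and ploidies. VarScan's .FREQ columns are percents,
# hence the division by 100 before weighting by haploid genome counts.
ploidy = {'pool1': 40, 'pool2': 20}      # hypothetical haploid genomes per pool
freqs = {'pool1': 10.0, 'pool2': 40.0}   # hypothetical .FREQ values (percent)
globfreq = (sum(ploidy[s] * (f / 100) for s, f in freqs.items())
            / sum(ploidy[s] for s in freqs))
assert abs(globfreq - 0.2) < 1e-9        # (40*0.10 + 20*0.40) / 60 = 0.2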
def checkfiles(pooldir):
    """Call get_bamfiles."""
    # get the list of file names
    print('checking files')
    pool = op.basename(pooldir)
    samps = pklload(op.join(op.dirname(pooldir), 'poolsamps.pkl'))[pool]
    shdir = op.join(pooldir, 'shfiles/05_indelRealign_shfiles')
    files = getfiles(samps, shdir, 'indelRealign')
    check_queue(files.values(), pooldir)  # make sure job isn't in the queue (running or pending)
    check_seff(files.values())  # make sure the jobs didn't die
    return get_bamfiles(samps, pooldir)
def translate_stitched_to_unstitched(df, parentdir, pool):
    """See if user asked regions to be translated from stitched genome to unstitched.

    # assumes
    # that this is run BEFORE removing repeats
    """
    orderpkl = op.join(parentdir, 'orderfile.pkl')
    if op.exists(orderpkl):
        orderdict = pklload(orderpkl)
        if orderdict[pool] is not None:
            # if user selected translation be applied to this pool
            orderfile = orderdict[pool]
            df = translate_stitched.main(df.copy(), orderfile)
    return df
def get_avail_accounts(parentdir=None, save=False):
    """Query slurm with the sshare command to determine available accounts.

    If called with parentdir=None, return all available accounts.
        - Meant to be called from command line outside of pipeline. See also sys.argv input.
    If called with parentdir='choose', allow user to choose accounts.
        - Meant to be called from command line outside of pipeline. See also sys.argv input.
    If called with save=True, confirm each account with user and save .pkl file in parentdir.
        - save=True is only called from 00_start.py

    Returns a list of accounts to balance queue.
    """
    if parentdir is not None and save is False:
        # if the accounts have already been chosen, just return them right away
        # keep 'save is False' so 00_start can overwrite previous pkl and skip here
        pkl = os.path.join(parentdir, 'accounts.pkl')
        if os.path.exists(pkl):
            return pklload(pkl)

    # get a list of all available accounts
    acctout = subprocess.check_output(
        [shutil.which('sshare'), '-U', '--user', os.environ['USER'], '--format=Account']
    ).decode('utf-8').split('\n')
    accts = [acct.split()[0].split("_")[0] for acct in acctout if '_cpu' in acct]

    # for running outside of the pipeline:
    if parentdir is None:
        # to manually run on command line, using all accounts (default + RAC)
        return accts
    elif parentdir == 'choose':
        # to manually run on command line, choose accounts
        return choose_accounts(accts)

    # save if necessary
    if save is True:
        # called from 00_start.py
        keep = choose_accounts(accts)
        pkldump(keep, os.path.join(parentdir, 'accounts.pkl'))
        # no return necessary for 00_start.py
        return

    return accts
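# --- illustration (not pipeline code) ---------------------------------------
# How the sshare parsing above behaves on a hypothetical slice of output;
# the account names are fabricated, and real sshare output varies by cluster.
acctout = ['Account', '--------------------',
           ' def-someuser_cpu ', ' rrg-someuser_cpu ', ' def-someuser_gpu ']
accts = [acct.split()[0].split("_")[0] for acct in acctout if '_cpu' in acct]
assert accts == ['def-someuser', 'rrg-someuser']  # gpu allocations are skipped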
def get_crisp_cmd(bamfiles, bedfile, pool, parentdir, ref, vcf, bednum):
    """Create command to call crisp."""
    smallbams, smallcmds = get_small_bam_cmds(bamfiles, bednum, bedfile)
    bams = ' --bam '.join(smallbams)
    poolsize = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    logfile = vcf.replace(".vcf", ".log")
    convertfile = vcf.replace(".vcf", "_converted.vcf")
    cmds = smallcmds + f'''module load python/2.7.14
$CRISP_DIR/CRISP --bam {bams} --ref {ref} --VCF {vcf} \
--poolsize {poolsize} --mbq 20 --minc 5 --bed {bedfile} > {logfile}
touch $SLURM_TMPDIR/bam_file_list.txt
# assumes equal pool sizes
$CRISP_DIR/scripts/convert_pooled_vcf.py {vcf} $SLURM_TMPDIR/bam_file_list.txt \
{poolsize} > {convertfile}
module unload python
'''
    return (cmds, convertfile, logfile)
def checkjobs():
    """
    Make sure previous realigned bamfiles were created without error.

    Avoids unintentionally combining a subset of all final expected files.

    Calls:
    getfiles from start_crispANDvarscan
    """
    print('checking jobs')
    # note: pooldir, grep, and program are expected to be defined at module scope
    parentdir = op.dirname(pooldir)
    pool = op.basename(pooldir)
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    samps = fs(op.join(op.dirname(ref),
                       'bedfiles_%s' % op.basename(ref).split(".fa")[0]))
    shdir = op.join(pooldir, 'shfiles/crispANDvarscan')
    # files = {f.sh: f.out, ...}
    files = getfiles(samps, shdir, f"{grep}-{program}")
    return files
def get_varscan_cmd(bamfiles, bedfile, bednum, vcf, ref, pooldir, program):
    """Create command to call varscan."""
    smallbams, smallcmds = get_small_bam_cmds(bamfiles, bednum, bedfile)
    smallbams = ' '.join(smallbams)
    # note: parentdir and pool are expected to be defined at module scope
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    # if single-sample then set minfreq to 0, else use min possible allele freq
    minfreq = 1 / sum(ploidy.values()) if len(ploidy.keys()) > 1 else 0
    cmd = f'''samtools mpileup -B -f {ref} {smallbams} | java -Xmx15g -jar \
$VARSCAN_DIR/VarScan.v2.4.3.jar mpileup2cns --min-coverage 8 --p-value 0.05 \
--min-var-freq {minfreq} --strand-filter 1 --min-freq-for-hom 0.80 \
--min-avg-qual 20 --output-vcf 1 > {vcf}
module unload samtools
'''
    # final vcf
    outdir = makedir(op.join(pooldir, program))
    finalvcf = op.join(outdir, op.basename(vcf))
    cmds = smallcmds + cmd
    return (cmds, finalvcf)
def get_varscan_cmd(bamfiles, bedfile, bednum, vcf, ref):
    """Create command to call varscan."""
    smallbams, smallcmds = get_small_bam_cmds(bamfiles, bednum, bedfile)
    smallbams = ' '.join(smallbams)
    # note: parentdir and pool are expected to be defined at module scope
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    # if single-sample then set minfreq to 0, else use min possible allele freq
    minfreq = 1 / (ploidy * len(bamfiles)) if len(bamfiles) > 1 else 0
    # if single-sample then use pileup2cns, else use mpileup2snp
    # tool = 'mpileup2cns' if len(bamfiles) > 1 else 'pileup2cns'
    tool = 'mpileup2cns'
    # --strand-filter not mentioned in docs for pileup2cns
    strand_filter = '' if tool == 'pileup2cns' else '--strand-filter 1'
    cmd = f'''samtools mpileup -B -f {ref} {smallbams} | java -Xmx15g -jar \
$VARSCAN_DIR/VarScan.v2.4.3.jar {tool} --min-coverage 8 --p-value 0.05 \
--min-var-freq {minfreq} {strand_filter} --min-freq-for-hom 0.80 \
--min-avg-qual 20 --output-vcf 1 > {vcf}
module unload samtools'''
    cmds = smallcmds + cmd
    return (cmds, vcf)
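# --- illustration (not pipeline code) ---------------------------------------
# Hedged sketch contrasting the two --min-var-freq computations above,
# with hypothetical numbers. With multiple bams the floor is one haploid
# genome across the pool; a single sample gets no frequency floor at all.
def minfreq(ploidy, n_bams):
    return 1 / (ploidy * n_bams) if n_bams > 1 else 0

assert minfreq(2, 24) == 1 / 48   # ~0.0208 with 24 hypothetical diploid bams
assert minfreq(2, 1) == 0         # single sample: no floor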
def get_varscan_names(df, pooldir):
    """Convert generic sample/pool names from varscan to something meaningful."""
    print('renaming varscan columns ...')
    # get order of samps used to create varscan cmds (same order as datatable)
    pool = op.basename(pooldir)
    samps = pklload(op.join(op.dirname(pooldir), 'poolsamps.pkl'))[pool]
    # create a list of names that varscan gives by default
    generic = ['Sample%s' % (i + 1) for i in range(len(samps))]
    # create a map between generic and true samp names
    dic = dict((gen, samp) for (gen, samp) in zip(generic, samps))
    # rename the columns in df
    cols = []
    for col in df:
        if '.' in col:
            gen, rest = col.split(".")
            samp = dic[gen]
            col = '.'.join([samp, rest])
        cols.append(col)
    df.columns = cols
    return df
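# --- illustration (not pipeline code) ---------------------------------------
# Minimal sketch of the SampleN -> real-name renaming in get_varscan_names(),
# with made-up sample names. VarScan emits columns like 'Sample1.FREQ'.
samps = ['pop1', 'pop2']
generic = ['Sample%s' % (i + 1) for i in range(len(samps))]
dic = dict(zip(generic, samps))
gen, rest = 'Sample2.FREQ'.split(".")
assert '.'.join([dic[gen], rest]) == 'pop2.FREQ'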
### purpose
# use picard to mark/remove duplicates, build bam index for GATK
###

### usage
# 03_mark_build.py /path/to/sortfile /path/to/pooldir/
###
"""

import sys, os, balance_queue, subprocess, shutil
from os import path as op
from coadaptree import makedir, get_email_info, pklload

thisfile, pooldir, samp = sys.argv

sortfiles = pklload(op.join(pooldir, '%s_sortfiles.pkl' % samp))

# MarkDuplicates
dupdir = op.join(pooldir, '03_dedup_rg_filtered_indexed_sorted_bamfiles')
pool = op.basename(pooldir)
dupfile = op.join(dupdir, "%s_rd.bam" % samp)
dupflag = dupfile.replace(".bam", ".bam.flagstats")
dupstat = op.join(dupdir, "%s_rd_dupstat.txt" % samp)

# create sh file
email_text = get_email_info(op.dirname(pooldir), '03')
joined = ' I='.join(sortfiles)
text = f"""#!/bin/bash
#SBATCH --time=11:59:00
#SBATCH --mem=30000M
#SBATCH --ntasks=1
""" import os import sys import time import shutil import subprocess from os import path as op from coadaptree import fs, pklload, pkldump, get_email_info # args thisfile, pooldir, ref = sys.argv parentdir = op.dirname(pooldir) pool = op.basename(pooldir) f2samp = pklload(op.join(parentdir, 'f2samp.pkl')) adaptors = pklload(op.join(parentdir, 'adaptors.pkl')) bash_variables = op.join(parentdir, 'bash_variables') for arg, path in [('pooldir', pooldir), ('ref', ref)]: if not op.exists(path): print("The argument does not exist in the specified path:\narg = %s\npath =%s" % (arg, path)) sys.exit(1) # make some dirs shdir = op.join(pooldir, 'shfiles') shtrimDIR = op.join(shdir, '01_trimmed_shfiles') # cmd.sh files trimDIR = op.join(pooldir, '01_trimmed') # outfiles for d in [shtrimDIR, trimDIR]: if not op.exists(d): os.makedirs(d)
""" import os, sys, balance_queue, subprocess, shutil from os import path as op from coadaptree import makedir, pklload, get_email_info thisfile, pooldir, samp, dupfile = sys.argv # RealignerTargetCreator aligndir = op.join(pooldir, '04_realign') listfile = op.join(aligndir, '%s_realingment_targets.list' % samp) # get ref parentdir = op.dirname(pooldir) pool = op.basename(pooldir) ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool] email_text = get_email_info(parentdir, '04') text = '''#!/bin/bash #SBATCH --time=7-00:00:00 #SBATCH --mem=30000M #SBATCH --nodes=1 #SBATCH --ntasks=32 #SBATCH --cpus-per-task=1 #SBATCH --job-name=%(pool)s-%(samp)s-realign #SBATCH --output=%(pool)s-%(samp)s-realign_%%j.out %(email_text)s # realign using the GATK module load gatk/3.8 module load java
    return md5


def get_cmds(srcfiles, md5files, remotedir, createmd5):
    subcmds = []
    for src in srcfiles:
        if createmd5 is True:
            md5 = check_md5(src, md5files)
            md5dst = op.join(remotedir, op.basename(md5))
            subcmds.append(f'rsync -avz {hostname}:{md5} {md5dst}')
        dst = op.join(remotedir, op.basename(src))
        subcmds.append(f'rsync -avz {hostname}:{src} {dst}')
    return subcmds


pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys())
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all rsync commands

# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']

# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p) + '-gatk'))

# get pkl files
print(Bcolors.BOLD + '\nBundling .pkl files ...' + Bcolors.ENDC)
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
### usage
# 02_bwa-map_view_sort_index_flagstat.py parentdir samp
###

### assumes
# outfiles from "bwa index ref.fasta"
###
"""

import sys, os, subprocess, shutil
from os import path as op
from coadaptree import pklload, pkldump, get_email_info, makedir

# get argument inputs
thisfile, parentdir, samp = sys.argv
pool = pklload(op.join(parentdir, 'samp2pool.pkl'))[samp]
pooldir = op.join(parentdir, pool)
shdir = op.join(pooldir, 'shfiles')
ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
r1r2outs = pklload(op.join(pooldir, 'samp2_r1r2out.pkl'))[samp]
bash_variables = op.join(parentdir, 'bash_variables')

# create dirs
bwashdir = op.join(shdir, '02_bwa_shfiles')
samdir = op.join(pooldir, '02a_samfiles')
bamdir = op.join(pooldir, '02b_bamfiles')
sortdir = op.join(pooldir, '02c_sorted_bamfiles')
for d in [bwashdir, samdir, bamdir, sortdir]:
    makedir(d)

# get rginfo - THIS CAN STAY EVEN WITH SAMPS SEQUENCED MULTIPLE TIMES - RGID and RGPU are defined with file
# imports
import os, sys, json, pandas as pd
from tqdm import tqdm
from os import path as op
from collections import OrderedDict
from coadaptree import fs, uni, pklload

# args
thisfile, parentdir, engines = sys.argv
if parentdir.endswith("/"):
    parentdir = parentdir[:-1]

# reqs
print('getting reqs')
samp2pool = pklload(op.join(parentdir, 'samp2pool.pkl'))
pools = uni(list(samp2pool.values()))

# get a list of subdirectory pool dirs created earlier in pipeline
print('getting pooldirs')
pooldirs = []
for p in pools:
    pooldir = op.join(parentdir, p)
    pooldirs.append(pooldir)

# TRIMMING DATA
# get the json data from trimming
print('getting trim data')
data = {}
count = 0
for p in pooldirs:
def get_bedfiles(parentdir, pool):
    """Get a list of paths to all of the bed files for ref.fa."""
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    beddir = op.join(op.dirname(ref),
                     'bedfiles_%s' % op.basename(ref).split(".fa")[0])
    return [f for f in fs(beddir) if f.endswith('.bed')]
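# --- illustration (not pipeline code) ---------------------------------------
# For a hypothetical reference at /data/ref.fa, the bed directory resolved
# by get_bedfiles() above would be /data/bedfiles_ref.
from os import path as op

ref = '/data/ref.fa'
beddir = op.join(op.dirname(ref), 'bedfiles_%s' % op.basename(ref).split(".fa")[0])
assert beddir == '/data/bedfiles_ref'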
""" Create rsync command between src_server and remote_server. Create .md5 if necessary. """ subcmds = [] for src in srcfiles: if createmd5 is True: md5 = check_md5(src, md5files) md5dst = op.join(remotedir, op.basename(md5)) subcmds.append(f'rsync -azv {hostname}:{md5} {md5dst}') dst = op.join(remotedir, op.basename(src)) subcmds.append(f'rsync -azv {hostname}:{src} {dst}') return subcmds pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys()) pooldirs = [op.join(parentdir, p) for p in pools] newdirs = [] # keep track of directories to easily make on remote server cmds = [] # keep track of all rsync commands # get hostname (eg beluga, cedar, graham) hostname = os.environ['CC_CLUSTER'] # add remote and subdirs to newdirs list newdirs.append(remote) for p in pooldirs: newdirs.append(op.join(remote, op.basename(p))) # get pkl files print(Bcolors.BOLD + '\nBundling .pkl files ...' + Bcolors.ENDC)
def remove_repeats(snps, parentdir, snpspath, pool):
    """
    Remove SNPs that are found to be in repeat-masked regions.

    # assumes
    # that the positions have been translated BEFORE removing repeats
    # (it took forever to create unstitched repeat regions; rather than translate
    #  the repeat file, we can just use the unstitched chrom if the reference is stitched)
    # repeat file has a header ('CHROM', 'start', 'stop')
    # start and stop positions of repeat regions are 1-based
    """
    reppkl = op.join(parentdir, 'repeat_regions.pkl')
    if op.exists(reppkl):
        # read in repeat regions
        repeatdict = pklload(reppkl)
        if repeatdict[pool] is not None:
            # user selected that repeat filtering be applied to this pool
            print('Removing repeat regions ...')
            repeats = pd.read_csv(repeatdict[pool], sep='\t')
            # figure out if data is from stitched or not
            if 'unstitched_chrom' in snps.columns:
                # then the snps have been translated: stitched -> unstitched
                chromcol = 'unstitched_chrom'
                poscol = 'unstitched_pos'
                print('\tsnps have been translated')
            else:
                # otherwise SNPs were called on unstitched reference
                chromcol = 'CHROM'
                poscol = 'POS'
                print('\tsnps have not been translated')
            # reduce repeats to the chroms that matter (helps speed up lookups)
            repeats = repeats[repeats['CHROM'].isin(snps[chromcol].tolist())].copy()
            # isolate SNPs in repeat regions
            repeat_snps = []
            for chrom in tqdm(uni(snps[chromcol])):
                reps = repeats[repeats['CHROM'] == chrom].copy()
                mysnps = snps[snps[chromcol] == chrom].copy()
                if len(reps.index) > 0 and len(mysnps.index) > 0:
                    for row in mysnps.index:
                        pos = snps.loc[row, poscol]  # index is maintained from snps to mysnps
                        df = reps[reps['stop'].astype(int) >= int(pos)].copy()
                        df = df[df['start'].astype(int) <= int(pos)].copy()
                        if len(df.index) > 0:
                            assert len(df.index) == 1
                            repeat_snps.append(row)
            # save repeats
            print(f'\tSaving {len(repeat_snps)} repeat regions')
            repeat_path = snpspath.replace(".txt", "_REPEATS.txt")
            myrepeats = snps[snps.index.isin(repeat_snps)].copy()
            myrepeats = mark_nas(myrepeats, 'repeat SNPs')
            myrepeats.to_csv(repeat_path, sep='\t', index=False)
            # remove SNPs in repeat regions
            snps = snps[~snps.index.isin(repeat_snps)].copy()
            snps.index = range(len(snps.index))
    print(f'{op.basename(snpspath)} has {len(snps.index)} SNPs outside of repeat regions')
    return snps
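# --- illustration (not pipeline code) ---------------------------------------
# Toy example (fabricated coordinates) of the 1-based, inclusive interval
# test used in remove_repeats(): a SNP is masked when start <= POS <= stop.
import pandas as pd

reps = pd.DataFrame({'start': [100, 500], 'stop': [200, 600]})
pos = 150
df = reps[reps['stop'].astype(int) >= int(pos)]
df = df[df['start'].astype(int) <= int(pos)]
assert len(df.index) == 1  # pos 150 falls inside [100, 200] only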
# """ ### imports import sys, os, pickle, subprocess from os import path as op import numpy as np from coadaptree import fs, createdirs, pklload, get_email_info from genotyping_scheduler import startscheduler, bigbrother, delsched ### ### args thisfile, parentdir = sys.argv if parentdir.endswith("/"): parentdir = parentdir[:-1] poolref = pklload(op.join(parentdir, 'poolref.pkl')) email_info = get_email_info(parentdir, 'final') bash_variables = op.join(parentdir, 'bash_variables') maf = pklload(op.join(parentdir, 'maf.pkl')) ### # make a reservation file so other jobs don't call 05.py resfile = op.join(parentdir, 'shfiles/06_reservation.txt') if not op.exists(resfile): startscheduler(resfile) else: print('06.py was running') bigbrother(resfile, DIR=None) ### dirs shdir = op.join(parentdir, 'shfiles/concat')