def handle_dict_fails(pool2repeatsfile, pool2translate, pool2paralogfile,
                      repeats, translate, paralogs, data, parentdir):
    flagexit = False
    for dic, flag, word in zip(
            [pool2repeatsfile, pool2translate, pool2paralogfile],
            [repeats, translate, paralogs],
            ['remove repeats', 'translate stitched positions', 'remove paralogs']):
        # if a flag was specified but none of the pools were selected:
        if flag is True and sum([1 for v in dic.values() if v is not None]) == 0:
            flagexit = True
            text = 'FAIL: You have indicated that you would like to %s from final SNPs.\n' % word
            text = text + 'FAIL: But the user has not specified at least one pool to %s. \n' % word
            text = text + 'FAIL: You need to respond "yes" to at least one of the prompts above \n'
            text = text + 'FAIL: for assigning a file to a pool - i.e., to use the \n'
            text = text + 'FAIL: %s flag, you must apply it to at least one pool. \n' % word
            if 'repeats' in word:
                text = text + 'FAIL: The file containing repeat regions should be one of the following:\n'
                for ref in uni(data['ref']):
                    repeatfile = ref.split(".fa")[0] + '_repeats.txt'
                    text = text + "\t %s \n" % repeatfile
            elif 'stitched' in word:
                text = text + 'FAIL: The file to translate stitched to unstitched positions should \n'
                text = text + 'FAIL: be one of the following:\n'
                for ref in uni(data['ref']):
                    orderfile = ref.split(".fa")[0] + '.order'
                    text = text + "\t %s \n" % orderfile
            elif 'paralogs' in word:
                text = text + 'FAIL: The file(s) to remove paralogs must be in %s \n' % parentdir
                text = text + 'FAIL: and end with "_paralog_snps.txt".'
            print(Bcolors.FAIL + text + Bcolors.ENDC)
    if flagexit is True:
        exit()
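# Hedged usage sketch (not part of the pipeline source): illustrates the dict shape
# handle_dict_fails() expects - key = pool name, value = a file path or None - and the
# condition that triggers the FAIL text above. Pool names and values are hypothetical.
def _example_flag_without_pool_assignment():
    pool2repeatsfile = {'pool_A': None, 'pool_B': None}  # --rm_repeats flagged, but no pool selected
    repeats = True
    # this is the per-flag check applied in handle_dict_fails():
    none_assigned = sum([1 for v in pool2repeatsfile.values() if v is not None]) == 0
    return repeats is True and none_assigned  # True -> the pipeline prints the FAIL text and exits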
def add_freq_cols(df, tf, tipe, tablefile):
    """
    Adding in .FREQ columns for crisp file.

    Positional arguments:
    df - pandas.dataframe; current filtered VariantsToTable output
    tablefile - path to VariantsToTable output - used to find ploidy etc
    tf - basename of tablefile
    tipe - one of either "SNP" or "INDEL"

    Returns:
    df - pandas.dataframe; current filtered VariantsToTable output + freqcols
    """
    print('Adding in .FREQ columns for crisp file ...')
    # remove bednum from column names so we can pd.concat() later
    bednum = tf.split("file_")[-1].split("_converted")[0]
    df.columns = [col.replace("_" + bednum, "") for col in df.columns]
    # add in a .FREQ column for pool-level freqs
    gtcols = [col for col in df.columns if '.GT' in col]
    print('len(gtcols) = ', len(gtcols))
    freqcols = []
    for col in tqdm(gtcols):
        refcol = col.replace(".GT", ".REFCOUNT")
        altcol = col.replace(".GT", ".ALTCOUNT")
        freqcol = col.replace(".GT", ".FREQ")
        freqcols.append(freqcol)
        for alt in uni(df['ALT']):
            df.loc[df['ALT'] == alt, altcol] = df[col].str.count(alt)
        for ref in uni(df['REF']):
            df.loc[df['REF'] == ref, refcol] = df[col].str.count(ref)
        df[freqcol] = df[altcol] / (df[altcol] + df[refcol])
    # remove count cols
    print('Removing unnecessary cols ...')
    df = df[[
        col for col in df.columns
        if '.REFCOUNT' not in col and '.ALTCOUNT' not in col
    ]].copy()
    # recalculate global AF
    df = recalc_global_freq(df, tf, freqcols)
    # sort columns to group data together for each pool
    datacols = sorted([col for col in df.columns if '.' in col])
    othercols = [
        col for col in df.columns
        if '.' not in col and col != 'locus' and 'crisp' not in col
    ]
    othercols.insert(othercols.index('AF') + 1, 'crisp_AF')
    df = df[['locus'] + othercols + datacols].copy()
    df.index = range(len(df.index))
    return df
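# Hedged sketch (not part of the pipeline source): a minimal, self-contained illustration
# of the per-pool frequency calculation used in add_freq_cols() - counting REF and ALT
# characters in a pooled .GT string and taking ALT / (ALT + REF). The column name
# 'pool1.GT' and the genotype string are hypothetical.
def _example_pool_freq():
    import pandas as pd
    demo = pd.DataFrame({'REF': ['A'], 'ALT': ['T'], 'pool1.GT': ['A/T/T/A/T/T/T/A']})
    refcount = demo['pool1.GT'].str.count('A')  # 3 REF alleles in the pooled call
    altcount = demo['pool1.GT'].str.count('T')  # 5 ALT alleles in the pooled call
    demo['pool1.FREQ'] = altcount / (altcount + refcount)  # 0.625
    return demo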
def get_refn_snps(df, tipe, ndfs=None):
    """
    Isolate polymorphisms with REF=N but two ALT single nucleotide alleles.

    Positional arguments:
    df - pandas.dataframe; current filtered VariantsToTable output

    Returns:
    dfs - list of loci (pandas.dataframes) with REF=N and two ALT alleles, counts with respect to second ALT
    ndfs - return from pd.concat(dfs)
    """
    ndf = df[df['REF'] == 'N'].copy()
    ndf = ndf[ndf['TYPE'] == tipe].copy()
    ncount = table(ndf['locus'])
    nloci = [locus for locus in ncount if ncount[locus] == 2]
    ndf = ndf[ndf['locus'].isin(nloci)].copy()
    dfs = []
    for locus in uni(ndf['locus']):
        smalldf = ndf[ndf['locus'] == locus].copy()
        if len(smalldf.index) == 2:
            smalldf.index = range(len(smalldf.index))
            smalldf = adjust_freqs(smalldf)
            smalldf.loc[0, 'ALT'] = "%s+%s" % (smalldf.loc[0, 'ALT'], smalldf.loc[1, "ALT"])
            dfs.append(pd.DataFrame(smalldf.loc[0, :]).T)
    if len(dfs) > 0:
        ndfs = pd.concat(dfs)
    return (dfs, ndfs)
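# Hedged sketch (not from the pipeline source): the shape of input that get_refn_snps()
# collapses - one locus called twice with REF=N and two different single-nucleotide ALT
# alleles - which ends up reported as a single row with ALT = 'A+G'. The locus name is
# hypothetical; adjust_freqs() (defined elsewhere in the pipeline) is assumed to
# re-express counts relative to the second ALT.
def _example_refn_locus():
    import pandas as pd
    return pd.DataFrame({
        'locus': ['chr1-101', 'chr1-101'],  # same locus appears on two rows
        'REF': ['N', 'N'],
        'TYPE': ['SNP', 'SNP'],
        'ALT': ['A', 'G'],                  # collapsed to ALT = 'A+G'
    })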
def make_pooldirs(data, parentdir):
    """Create subdirectories of parentdir.

    Positional arguments:
    data - datatable.txt with info for pipeline
    parentdir - directory with datatable.txt and (symlinks to) fastq data
    """
    # make pool dirs
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    pools = uni(data['pool_name'].tolist())
    pooldirs = []
    for p in pools:
        pooldir = op.join(parentdir, p)
        if op.exists(pooldir):
            text = "\tWARN: The pooldir already exists, this WILL overwrite and/or delete previous data: %s" % pooldir
            print(Bcolors.WARNING + text + Bcolors.ENDC)
            askforinput(tab='\t', newline='')
            # first unlink fastq files
            for f in fs(pooldir):
                if f.endswith('.gz'):
                    os.unlink(f)
            # then just delete the directory
            shutil.rmtree(pooldir)
        pooldirs.append(makedir(pooldir))
    return pooldirs
def create_all_bedfiles(poolref, numpools):
    """For each unique ref.fa in datatable.txt, create bedfiles for varscan.

    Positional arguments:
    poolref - dictionary with key = pool, val = /path/to/ref
    """
    # create bedfiles for varscan
    print(Bcolors.BOLD + "\ncreating bedfiles" + Bcolors.ENDC)
    for ref in uni(poolref.values()):
        create_bedfiles.main(ref, numpools)
def make_pooldirs(data, parentdir):
    """Create a subdirectory of parentdir for each pool, plus a shfiles dir within each."""
    # make pool dirs
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    pools = uni(data['pool_name'].tolist())
    pooldirs = []
    for p in pools:
        DIR = op.join(parentdir, p)
        if op.exists(DIR):
            print("The pooldir already exists, this could overwrite previous data: %s" % DIR)
            askforinput()
        pooldirs.append(makedir(DIR))
        makedir(op.join(DIR, 'shfiles'))
    return pooldirs
def make_pooldirs(data, parentdir):
    """Create subdirectories of parentdir.

    Positional arguments:
    data - datatable.txt with info for pipeline
    parentdir - directory with datatable.txt and (symlinks to) fastq data
    """
    # make pool dirs
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    pools = uni(data['pool_name'].tolist())
    pooldirs = []
    for p in pools:
        DIR = op.join(parentdir, p)
        if op.exists(DIR):
            print("The pooldir already exists, this could overwrite previous data: %s" % DIR)
            print("Do you want to proceed?")
            askforinput()
        pooldirs.append(makedir(DIR))
    return pooldirs
def remove_repeats(snps, parentdir, snpspath, pool):
    """
    Remove SNPs that are found to be in repeat-masked regions.

    Notes:
    - assumes that the positions have been translated BEFORE removing repeats
    - it took forever to create unstitched repeat regions, so don't translate the repeat file;
      this way I can just use the unstitched chrom if the reference is stitched
    - the repeat file has a header ('CHROM', 'start', 'stop')
    - start and stop positions of repeat regions are 1-based
    """
    reppkl = op.join(parentdir, 'repeat_regions.pkl')
    if op.exists(reppkl):
        # read in repeat regions
        repeatdict = pklload(reppkl)
        if repeatdict[pool] is not None:
            print('Removing repeat regions ...')
            # if user selected that repeat removal be applied to this pool
            repeats = pd.read_csv(repeatdict[pool], sep='\t')
            # figure out if data is from stitched or not
            if 'unstitched_chrom' in snps.columns:
                # then the snps have been translated: stitched -> unstitched
                chromcol = 'unstitched_chrom'
                poscol = 'unstitched_pos'
                print('\tsnps have been translated')
            else:
                # otherwise SNPs were called on unstitched reference
                chromcol = 'CHROM'
                poscol = 'POS'
                print('\tsnps have not been translated')
            # reduce repeats to the chroms that matter (helps speed up lookups)
            repeats = repeats[repeats['CHROM'].isin(snps[chromcol].tolist())].copy()
            # isolate SNPs in repeat regions
            repeat_snps = []
            for chrom in tqdm(uni(snps[chromcol])):
                reps = repeats[repeats['CHROM'] == chrom].copy()
                mysnps = snps[snps[chromcol] == chrom].copy()
                if len(reps.index) > 0 and len(mysnps.index) > 0:
                    for row in mysnps.index:
                        pos = snps.loc[row, poscol]  # index is maintained from snps to mysnps
                        df = reps[reps['stop'].astype(int) >= int(pos)].copy()
                        df = df[df['start'].astype(int) <= int(pos)].copy()
                        if len(df.index) > 0:
                            assert len(df.index) == 1
                            repeat_snps.append(row)
            # save SNPs found in repeat regions
            print(f'\tSaving {len(repeat_snps)} SNPs found in repeat regions')
            repeat_path = snpspath.replace(".txt", "_REPEATS.txt")
            myrepeats = snps[snps.index.isin(repeat_snps)].copy()
            myrepeats = mark_nas(myrepeats, 'repeat SNPs')
            myrepeats.to_csv(repeat_path, sep='\t', index=False)
            # remove SNPs in repeat regions
            snps = snps[~snps.index.isin(repeat_snps)].copy()
            snps.index = range(len(snps.index))
            print(f'{op.basename(snpspath)} has {len(snps.index)} SNPs outside of repeat regions')
    return snps
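# Hedged sketch (not from the pipeline source): a minimal repeat-region table and the
# 1-based, inclusive containment test remove_repeats() applies to each SNP position.
# Chromosome names and coordinates are hypothetical.
def _example_repeat_overlap():
    import pandas as pd
    repeats = pd.DataFrame({'CHROM': ['chr1', 'chr1'],
                            'start': [100, 500],
                            'stop': [200, 650]})
    pos = 150  # SNP position on chr1
    hits = repeats[(repeats['start'].astype(int) <= pos) & (repeats['stop'].astype(int) >= pos)]
    return len(hits.index) > 0  # True -> the SNP falls inside a repeat-masked region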
def parse_datatable(data, parentdir, translate, repeats, paralogs):
    """
    Check some assumptions of datatable.txt; create files and dirs for downstream.

    translate, repeats, and paralogs are boolean.
    parentdir is a path.
    """
    print(Bcolors.BOLD + '\nReading datatable, getting fastq info' + Bcolors.ENDC)

    # initiate dictionaries for downstream pipeline
    rginfo = {}  # key=samp vals=rginfo
    samp2pool = {}  # key=samp val=pool
    poolref = {}  # key=pool val=ref.fa
    ploidy = {}  # key=pool val=dict(key=sample: val=sample_ploidy)
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}  # key=f val=samp
    f2pool = {}  # key=f val=pool
    adaptors = OrderedDict()  # key=samp val={'r1','r2'} val=adaptor
    warning = []  # whether to print out warning about optional RG info
    failing = []  # whether to print out failing about required RG info
    pool2paralogfile = {}  # if --rm_paralogs flagged, store file based on pool
    pool2repeatsfile = {}  # if --rm_repeats flagged, store file based on pool
    pool2translate = {}  # if --translate flagged, store file based on pool

    # make sure there are no blanks where there shouldn't be
    badcols = []
    for column in data.columns:
        if column not in ['rgid', 'rgpu', 'adaptor_1', 'adaptor_2']:
            if data[column].isnull().sum() > 0:
                badcols.append(column)
    if len(badcols) > 0:
        print(Bcolors.FAIL +
              "\tFAIL: Some rows in datatable.txt have blank entries in the following columns: " +
              Bcolors.ENDC)
        for col in badcols:
            print(Bcolors.FAIL + "\tFAIL: %s" % col + Bcolors.ENDC)
        print('exiting 00_start-pipeline.py')
        exit()

    # make sure specific words are not in a pool name
    badnames = []
    for pool in uni(data['pool_name']):
        for keyword in ['SNP', 'REPEAT', 'PARALOG']:
            if keyword in pool:
                badnames.append((pool, keyword))
    if len(badnames) > 0:
        print(Bcolors.FAIL +
              "\tFAIL: Some pool names have characters that could cause errors downstream." +
              Bcolors.ENDC)
        print(Bcolors.FAIL +
              "\tFAIL: Remove the bad characters from pool_names to continue." +
              Bcolors.ENDC)
        for pool, keyword in badnames:
            print(Bcolors.FAIL + "\tFAIL: Remove '%s' from pool_name '%s'." % (keyword, pool))
        print('exiting 00_start-pipeline.py')
        exit()

    # iterate through datatable
    for row in data.index:
        # get variables
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {
            'r1': data.loc[row, 'adaptor_1'],
            'r2': data.loc[row, 'adaptor_2']
        }
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL + 'FAIL: there are duplicate sample names with \
different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool

        # get ploidy info
        if pool not in ploidy:
            ploidy[pool] = {}
        if samp in ploidy[pool].keys():
            if ploidy[pool][samp] != int(data.loc[row, 'ploidy']):
                text = "FAIL: the ploidy values for sample_name '%s' are not the same" % samp
                print(Bcolors.FAIL + text + Bcolors.ENDC)
                exit()
        ploidy[pool][samp] = int(data.loc[row, 'ploidy'])

        # get ref.fasta info
        ref = data.loc[row, 'ref']
        if pool in poolref:
            # make sure each row for a pool specifies the same reference.fa
            if poolref[pool] != ref:
                text = "FAIL: Ref genome for samples in %s pool seems to have different paths in datatable" % pool
                print(Bcolors.FAIL + text + Bcolors.ENDC)
                print('exiting 00_start-pipeline.py')
                exit()
        else:
            # check assumptions about ref
            poolref[pool] = check_ref_assumptions(samp, ref)

        # handle RG info
        rginfo[samp] = {}
        # required RG info
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            if not data.loc[row, col] == data.loc[row, col]:  # if nan
                failing.append('%s\t%s' % (samp, col))
            rginfo[samp][col] = data.loc[row, col]
        # optional RG info
        for col in ['rgid', 'rgpu']:
            if data.loc[row, col] != data.loc[row, col]:  # if nan
                rginfo[samp][col] = None
                if samp not in warning:
                    warning.append(samp)
            else:
                rginfo[samp][col] = data.loc[row, col]

        # map between file and pool/samp
        for f in [
                data.loc[row, 'file_name_r1'],
                data.loc[row, 'file_name_r2']
        ]:
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp

    # handle --rm_paralogs, --translate, --rm_repeats
    for pool in uni(data['pool_name']):
        # handle translating stitched genome to unstitched positions
        pool2translate[pool] = handle_translate(translate, pool2translate, poolref[pool], data, pool)
        # handle removing SNPs from repeat regions
        pool2repeatsfile[pool] = handle_repeats(repeats, pool2repeatsfile, poolref[pool], data, pool)
        # handle removing paralogs
        pool2paralogfile[pool] = handle_paralogs(paralogs, pool2paralogfile, data, pool, parentdir)

    # handle fails for rm_repeats/translate/rm_paralogs
    handle_dict_fails(pool2repeatsfile, pool2translate, pool2paralogfile,
                      repeats, translate, paralogs, data, parentdir)

    # RG info failing/warnings
    handle_rg_fails(failing, warning, parentdir, data)

    pkldump(pool2repeatsfile, op.join(parentdir, 'repeat_regions.pkl'))
    pkldump(pool2paralogfile, op.join(parentdir, 'paralog_snps.pkl'))
    pkldump(pool2translate, op.join(parentdir, 'translate_snps.pkl'))
    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))

    return f2pool, poolref
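# Hedged sketch (not from the pipeline source): a minimal datatable.txt row with the
# columns parse_datatable() reads. Values are hypothetical; rgid, rgpu, adaptor_1, and
# adaptor_2 may be blank (optional), all other columns must be filled in.
def _example_datatable():
    import pandas as pd
    return pd.DataFrame([{
        'sample_name': 'samp_1', 'pool_name': 'pool_A', 'ploidy': 40,
        'ref': '/path/to/ref.fa',
        'adaptor_1': 'AGATCGGAAGAGC', 'adaptor_2': 'AGATCGGAAGAGC',
        'rglb': 'lib1', 'rgpl': 'ILLUMINA', 'rgsm': 'samp_1',
        'rgid': None, 'rgpu': None,
        'file_name_r1': 'samp_1_R1.fastq.gz', 'file_name_r2': 'samp_1_R2.fastq.gz',
    }])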
def read_datatable(parentdir):
    # read in the datatable, save info for later
    datatable = op.join(parentdir, 'datatable.txt')
    if not op.exists(datatable):
        print(Bcolors.FAIL + '''FAIL: the datatable is not in the necessary path: %s
FAIL: exiting 00_start-gatk_pipeline.py''' % datatable + Bcolors.ENDC)
        sys.exit(3)
    print(Bcolors.BOLD + 'reading datatable, getting fastq info' + Bcolors.ENDC)
    data = pd.read_csv(datatable, sep='\t')
    rginfo = {}  # key=sampname vals=rginfo
    samp2pool = {}  # key=samp val=pool
    poolref = {}  # key=pool val=ref.fa
    ploidy = {}  # key=samp val=ploidy
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}  # key=f val=samp
    f2pool = {}  # key=f val=pool
    adaptors = {}  # key=samp val={'r1','r2'} val=adaptor
    for row in data.index:
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {'r1': data.loc[row, 'adaptor_1'],
                          'r2': data.loc[row, 'adaptor_2']}
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL + 'FAIL: there are duplicate sample names with \
different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool
        df = data[data['pool_name'] == pool].copy()
        if not luni(df['ploidy']) == 1:
            print(Bcolors.WARNING +
                  "The ploidy values for some elements with pool name '%s' are not the same." % pool +
                  "\n\tHere are the ploidy values: %s" % uni(df['ploidy']) +
                  Bcolors.ENDC)
            askforinput()
        if samp not in ploidy:
            ploidy[samp] = data.loc[row, 'ploidy']
        if pool in poolref:
            if not poolref[pool] == data.loc[row, 'ref']:
                print("ref genome for samples in %s pool seems to have different paths in datatable.txt" % pool)
                sys.exit(1)
        else:
            ref = data.loc[row, 'ref']
            if not op.exists(ref):
                print('ref for %s does not exist in path: %s' % (samp, ref))
                print('exiting 00_start-gatk_pipeline.py')
                exit()
            needed = []
            for suffix in ['.dict', '.amb', '.ann', '.bwt', '.fai', '.pac', '.sa']:
                refext = ref + suffix if suffix != '.dict' else ref.split('.fa')[0] + suffix
                if not op.exists(refext):
                    needed.append(refext)
            if len(needed) > 0:
                print(Bcolors.FAIL + 'FAIL: the following extensions of the reference are needed to continue, \
please create these files' + Bcolors.ENDC)
                for n in needed:
                    print(Bcolors.FAIL + n + Bcolors.ENDC)
                print('exiting')
                exit()
            printneeded = False
            intdir = op.join(op.dirname(ref), 'intervals')
            if not op.exists(intdir):
                printneeded = True
            elif len([f for f in fs(intdir) if '.list' in f]) == 0:
                printneeded = True
            if printneeded is True:
                print(Bcolors.FAIL + 'FAIL: either the intervals dir does not exist or there are no interval.list files\
\nFAIL: intdir should be here: %s' % intdir + Bcolors.ENDC)
                exit()
            poolref[pool] = ref
        rginfo[samp] = {}
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            rginfo[samp][col] = data.loc[row, col]
        for f in [data.loc[row, 'file_name_r1'], data.loc[row, 'file_name_r2']]:
            if "__" in f:
                print(Bcolors.BOLD + Bcolors.FAIL +
                      "FAIL: file names cannot have double underscores, replace __ with _ (single)" +
                      Bcolors.ENDC)
                exit()
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp
    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))
    return data, f2pool, poolref
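# Hedged sketch (not from the pipeline source): the companion files read_datatable()
# expects next to each reference fasta before the pipeline will start. The path is a
# hypothetical placeholder; the suffix logic mirrors the check above (only .dict drops
# the .fa extension). These are typically produced by GATK/Picard CreateSequenceDictionary,
# bwa index, and samtools faidx, respectively.
def _example_ref_index_files(ref='/path/to/ref.fa'):
    needed = [ref.split('.fa')[0] + '.dict']                              # sequence dictionary
    needed += [ref + s for s in ['.amb', '.ann', '.bwt', '.pac', '.sa']]  # bwa index files
    needed += [ref + '.fai']                                              # fasta index
    # plus at least one *.list file inside op.join(op.dirname(ref), 'intervals')
    return needed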
# imports
import os, sys, json, pandas as pd
from tqdm import tqdm
from os import path as op
from collections import OrderedDict
from coadaptree import fs, uni, pklload

# args
thisfile, parentdir, engines = sys.argv
if parentdir.endswith("/"):
    parentdir = parentdir[:-1]

# reqs
print('getting reqs')
samp2pool = pklload(op.join(parentdir, 'samp2pool.pkl'))
pools = uni(list(samp2pool.values()))

# get a list of subdirectory pool dirs created earlier in pipeline
print('getting pooldirs')
pooldirs = []
for p in pools:
    pooldir = op.join(parentdir, p)
    pooldirs.append(pooldir)

# TRIMMING DATA
# get the json data from trimming
print('getting trim data')
data = {}
count = 0
for p in pooldirs:
    trimdir = op.join(p, '01_trimmed')