def getfiles(samps, shdir, grep):
    """Determine if all realign bam jobs have been created and sbatched.

    Positional arguments:
    samps - list of sample names (length is number of expected shfiles)
    shdir - directory where .sh and .out files are
    grep  - program name; keyword used to find correct files

    Returns:
    files - dictionary where key = sh file, val = most recent outfile
    """
    found = [sh for sh in fs(shdir) if sh.endswith(".sh") and grep in sh]
    outs = [out for out in fs(shdir) if out.endswith('.out') and grep in out]
    if len(found) != len(samps):
        print('not all shfiles have been created, exiting %s' % sys.argv[0])
        exit()
    files = dict((f, getmostrecent([out for out in outs
                                    if op.basename(f).replace(".sh", "") in out]))
                 for f in found)
    if None in files.values():
        print('not all shfiles have been sbatched, exiting %s' % sys.argv[0])
        exit()
    return files
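# A standalone sketch of the sh -> out pairing rule in getfiles(), using an
# invented directory listing; getmostrecent() would pick among multiple logs
# per script, so a single log per script stands in for that choice here.
import os.path as op

outs = ['/shdir/samp_1-varscan_101.out', '/shdir/samp_2-varscan_102.out']
found = ['/shdir/samp_1-varscan.sh', '/shdir/samp_2-varscan.sh']
files = dict((f, [out for out in outs
                  if op.basename(f).replace(".sh", "") in out][0])
             for f in found)
assert files == {'/shdir/samp_1-varscan.sh': '/shdir/samp_1-varscan_101.out',
                 '/shdir/samp_2-varscan.sh': '/shdir/samp_2-varscan_102.out'}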
def make_pooldirs(data, parentdir):
    """Create subdirectories of parentdir.

    Positional arguments:
    data - datatable.txt with info for pipeline
    parentdir - directory with datatable.txt and (symlinks to) fastq data
    """
    # make pool dirs
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    pools = uni(data['pool_name'].tolist())
    pooldirs = []
    for p in pools:
        pooldir = op.join(parentdir, p)
        if op.exists(pooldir):
            text = "\tWARN: The pooldir already exists, this WILL overwrite and/or delete previous data: %s" % pooldir
            print(Bcolors.WARNING + text + Bcolors.ENDC)
            askforinput(tab='\t', newline='')
            # first unlink fastq files
            for f in fs(pooldir):
                if f.endswith('.gz'):
                    os.unlink(f)
            # then just delete the directory
            shutil.rmtree(pooldir)
        pooldirs.append(makedir(pooldir))
    return pooldirs
def main():
    # make sure all of the varscan jobs have finished
    files = checkjobs()

    # combine table files from output of VariantsToTable
    tablefiles = get_tables(files)

    # get SNPs and indels
    for tipe in ['SNP', 'INDEL']:
        get_types(tablefiles, tipe, program, pooldir, grep)

    # combine repeats and paralogs
    tabledir = op.dirname(tablefiles[0])
    for tipe in ['PARALOGS', 'REPEATS']:
        tablefiles = [f for f in fs(tabledir)
                      if tipe in f
                      and 'all' not in f
                      and f.endswith('.txt')]
        if len(tablefiles) > 0:
            dfs = [pd.read_csv(t, sep='\t') for t in tablefiles]
            df = pd.concat(dfs)
            df.to_csv(op.join(tabledir,
                              f'{op.basename(pooldir)}-{program}_all_bedfiles_{tipe}.txt'),
                      sep='\t', index=False)
def get_parafile(parentdir, pool):
    """Obtain the file containing paralog SNPs to be removed from the final SNPs."""
    parafiles = [f for f in fs(parentdir) if f.endswith('_paralog_snps.txt')]
    if len(parafiles) > 1:
        parafile = choose_file(parafiles, pool, 'remove paralogs')
    elif len(parafiles) == 1:
        parafile = parafiles[0]
    else:
        parafile = None
    return parafile
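# A standalone sketch of the suffix rule used by get_parafile(), with an
# invented listing; the function itself defers to choose_file() when more
# than one file matches, and returns None when none do.
listing = ['/parentdir/pool_A_paralog_snps.txt', '/parentdir/datatable.txt']
parafiles = [f for f in listing if f.endswith('_paralog_snps.txt')]
assert parafiles == ['/parentdir/pool_A_paralog_snps.txt']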
def get_datafiles(parentdir, f2pool, data):
    """Get list of files from datatable, make sure they exist in parentdir.

    Create symlinks in /parentdir/<pool_name>/.

    Positional arguments:
    parentdir - directory with datatable.txt and (symlinks to) fastq data
    f2pool - dictionary with key = file.fastq, val = pool_name
    data - datatable.txt with info for pipeline
    """
    print(Bcolors.BOLD +
          '\nchecking for existence of fastq files in datatable.txt' +
          Bcolors.ENDC)
    files = [f for f in fs(parentdir) if 'fastq' in f and 'md5' not in f]
    datafiles = data['file_name_r1'].tolist()
    for x in data['file_name_r2'].tolist():
        datafiles.append(x)
    if len(files) != len(datafiles):
        desc = 'more' if len(files) > len(datafiles) else 'fewer'
        print(Bcolors.WARNING +
              'WARN: there are %s fastq files in %s than in datatable.txt' % (desc, parentdir) +
              Bcolors.ENDC)
        print(Bcolors.BOLD + 'Here are the files in %s' % parentdir + Bcolors.ENDC)
        for x in files:
            print(op.basename(x))
        print(Bcolors.BOLD + 'Here are the files in datatable.txt' + Bcolors.ENDC)
        for x in datafiles:
            print(x)
        askforinput(newline='')

    # create symlinks in pooldirs for visualization
    for f in datafiles:
        src = op.join(parentdir, f)
        if not op.exists(src):
            # make sure the file in the datatable exists
            print("could not find %s in %s\nmake sure file_name in datatable is its basename" % (f, parentdir))
            print("(symlinks in parentdir to fastq files in other dirs work fine, and are the intended use)")
            sys.exit(1)
        pooldir = op.join(parentdir, f2pool[f])
        dst = op.join(pooldir, f)
        if not op.exists(dst):
            # easy to visualize in cmdline if script is finding correct group of files by ls-ing pooldir
            os.symlink(src, dst)
def check_beddir():
    """Avoid accidentally using incorrect bedfiles by removing any that exist."""
    bname, beddir = make_beddir()
    files = [f for f in fs(beddir) if f.endswith('.bed')]
    if len(files) > 0:
        text = '\tThere are already existing bedfiles in %s. These will be deleted.' % beddir
        print(Bcolors.WARNING + text + Bcolors.ENDC)
        askforinput(tab='\t', newline='')
        for f in files:
            os.remove(f)
        print('\t\tRemoved %s bedfiles.' % len(files))
def get_bamfiles(samps, pooldir):
    """Using a list of sample names, find the realigned bamfiles.

    Returns:
    files - dictionary with key = samp_name, val = /path/to/bamfile
    """
    print('getting bamfiles')
    found = fs(op.join(pooldir, '04_realign'))
    files = dict((samp, f.replace(".bai", ".bam"))
                 for samp in samps
                 for f in found
                 if samp in f and f.endswith('.bai'))
    if not len(files) == len(samps):
        print('len(files) != len(samps)')
        print('files = ', files)
        print('samps = ', samps)
        exit()
    return files
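# A standalone sketch of the matching rule in get_bamfiles(), with an invented
# directory listing standing in for fs(op.join(pooldir, '04_realign')): the
# dict is keyed off the .bai index files and maps each sample to its .bam.
samps = ['samp_1', 'samp_2']
found = ['/pooldir/04_realign/samp_1_realigned.bai',
         '/pooldir/04_realign/samp_1_realigned.bam',
         '/pooldir/04_realign/samp_2_realigned.bai',
         '/pooldir/04_realign/samp_2_realigned.bam']
files = dict((samp, f.replace(".bai", ".bam"))
             for samp in samps
             for f in found
             if samp in f and f.endswith('.bai'))
assert files == {'samp_1': '/pooldir/04_realign/samp_1_realigned.bam',
                 'samp_2': '/pooldir/04_realign/samp_2_realigned.bam'}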
def get_tables(files):
    """Find all existing .txt files; exit if the number doesn't match expectations.

    Positional arguments:
    files - list of shfiles, should be same length as tablefiles
            (if all jobs are sbatched and done)
    """
    print('getting tablefiles')
    tablefiles = [f for f in fs(op.join(pooldir, program))
                  if f.endswith('.txt')
                  and 'all_bedfiles' not in f
                  and 'SNP' not in f
                  and 'INDEL' not in f
                  and grep in f]
    if not len(tablefiles) == len(files):
        print('for some reason tablefiles != files. exiting.')
        exit()
    return tablefiles
def checkjobs():
    """Make sure previous realigned bamfiles were created without error.

    Avoids unintentionally combining a subset of all final expected files.

    Calls:
    getfiles from start_crispANDvarscan
    """
    print('checking jobs')
    parentdir = op.dirname(pooldir)
    pool = op.basename(pooldir)
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    samps = fs(op.join(op.dirname(ref),
                       'bedfiles_%s' % op.basename(ref).split(".fa")[0]))
    shdir = op.join(pooldir, 'shfiles/crispANDvarscan')
    # files = {f.sh: f.out, ...}
    files = getfiles(samps, shdir, f"{grep}-{program}")
    return files
def make_bed_from_intervals(intdir):
    """If intervals.list files exist, use these instead of the ref.fa.length file.

    Positional arguments:
    intdir - path to intervals.list files
    """
    intfiles = [f for f in fs(intdir) if f.endswith('.list')]
    for intfile in intfiles:
        num = intfile.split("_")[-1].replace(".list", "")
        lines = []
        with open(intfile, 'r') as o:
            text = o.read().split("\n")
        for line in text:
            if line == '':  # skip blank lines (e.g. from a trailing newline)
                continue
            scaff, span = line.split(":")
            start, stop = span.split("-")
            start, stop = (int(start) - 1, int(stop) - 1)
            lines.append((scaff, start, stop))
        make_bed(lines, num)
    # ref is defined at module level
    print('\t\tcreated %s bedfiles for %s from interval files' % (len(intfiles), ref))
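# To make the coordinate handling concrete: a minimal, standalone example of
# the conversion applied above, assuming intervals.list entries are 1-based
# 'scaffold:start-stop' strings; the scaffold name is invented.
line = 'scaffold_1:101-200'
scaff, span = line.split(":")
start, stop = (int(x) - 1 for x in span.split("-"))
assert (scaff, start, stop) == ('scaffold_1', 100, 199)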
pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys())
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all scp commands

# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']

# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p)))

# get pkl files
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
    for pkl in pkls:
        pkldst = op.join(remote, f'{op.basename(p)}/{op.basename(pkl)}')
        cmds.append(f"scp {hostname}:{pkl} {pkldst}")
    newpkls = [f for f in fs(p) if f.endswith('.pkl')]
    for newpkl in newpkls:
        newdst = op.join(remote, f'{op.basename(p)}/{op.basename(newpkl)}')
        cmds.append(f"scp {hostname}:{newpkl} {newdst}")

# get shfiles
for p in pooldirs:
    shdir = op.join(p, 'shfiles')
    remotesh = op.join(remote, f'{op.basename(p)}/sh_and_outfiles')
    newdirs.append(remotesh)
    dirs = [d for d in fs(shdir) if op.isdir(d)]
def get_bedfiles(parentdir, pool):
    """Get a list of paths to all of the bed files for ref.fa."""
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    beddir = op.join(op.dirname(ref),
                     'bedfiles_%s' % op.basename(ref).split(".fa")[0])
    return [f for f in fs(beddir) if f.endswith('.bed')]
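# A hedged illustration of the naming convention used above: the bedfiles for
# a reference live in a sibling directory named after the reference basename.
# The reference path is hypothetical.
import os.path as op

ref = '/refs/genome.fa'
beddir = op.join(op.dirname(ref), 'bedfiles_%s' % op.basename(ref).split(".fa")[0])
assert beddir == '/refs/bedfiles_genome'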
# get a list of subdirectory pool dirs created earlier in pipeline
print('getting pooldirs')
pooldirs = []
for p in pools:
    pooldir = op.join(parentdir, p)
    pooldirs.append(pooldir)

# TRIMMING DATA
# get the json data from trimming
print('getting trim data')
data = {}
count = 0
for p in pooldirs:
    trimdir = op.join(p, '01_trimmed')
    jsons = [f for f in fs(trimdir) if f.endswith('.json')]
    count += len(jsons)
    for j in jsons:
        with open(j, 'r') as f:
            data[op.basename(j)] = json.load(f)

# put data into a dataframe, and sort columns
print('reading data')
readinfo = OrderedDict()
samps = []
for j in sorted(data):
    jsplit = j.split("__trimmed")[0]
    splits = jsplit.split(".")
    samp = '.'.join([splits[-1]] + splits[:-1])
    samps.append(samp)  # (reconstructed: collect sample order for the dataframe)
    for when in ['before_filtering', 'after_filtering']:
        for which in ['total_reads', 'total_bases', 'q20_bases', 'q30_bases']:
            # (body reconstructed; key layout assumed from the 'summary'
            # section of the json files written by the trimming program)
            key = f'{which}_{when}'
            if key not in readinfo:
                readinfo[key] = {}
            readinfo[key][samp] = data[j]['summary'][when][which]
    startscheduler(resfile)
else:
    print('06.py was running')
    bigbrother(resfile, DIR=None)

### dirs
shdir = op.join(parentdir, 'shfiles/concat')
catdir = op.join(parentdir, 'concatenated_vcfs')
filtdir = op.join(parentdir, 'filtered_snps')
createdirs([shdir, catdir, filtdir])
###

# get the snpfiles
snpdir = op.join(parentdir, 'snps')
snpfiles = [f.replace('.tbi', '') for f in fs(snpdir)
            if 'snp' in op.basename(f) and f.endswith('.tbi')]
os.system('echo "len(snpfiles) = %s"' % str(len(snpfiles)))

# sort snpfiles by pool
pools = list(poolref.keys())
combdict = {}
for i, snp in enumerate(snpfiles):
    for p in pools:
        if p in op.basename(snp):
            pool = p
            break
    if pool not in combdict:
        combdict[pool] = []
    combdict[pool].append(snp)
pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys())
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all scp commands

# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']

# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p) + '-gatk'))

# get pkl files
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
    for pkl in pkls:
        pkldst = op.join(remote, f'{op.basename(p)}-gatk/{op.basename(pkl)}')
        cmds.append(f"scp {hostname}:{pkl} {pkldst}")
    newpkls = [f for f in fs(p) if f.endswith('.pkl')]
    for newpkl in newpkls:
        newdst = op.join(remote, f'{op.basename(p)}-gatk/{op.basename(newpkl)}')
        cmds.append(f"scp {hostname}:{newpkl} {newdst}")

# get shfiles
for p in pooldirs:
    shdir = op.join(p, 'shfiles')
    remotesh = op.join(remote, f'{op.basename(p)}-gatk/sh_and_outfiles')
    newdirs.append(remotesh)
if parentdir.endswith("/"):
    parentdir = parentdir[:-1]
poolref = pklload(op.join(parentdir, 'poolref.pkl'))
email_info = get_email_info(parentdir, 'concat')
###

### dirs
shdir = op.join(parentdir, 'shfiles/concat')
catdir = op.join(parentdir, 'concatenated_vcfs')
filtdir = op.join(parentdir, 'filtered_snps')
createdirs([shdir, catdir, filtdir])
###

# get the snpfiles
snpdir = op.join(parentdir, 'snps')
snpfiles = [f.replace('.tbi', '') for f in fs(snpdir)
            if 'snp' in op.basename(f) and f.endswith('.tbi')]
os.system('echo "len(snpfiles) = %s"' % str(len(snpfiles)))

# sort snpfiles by pool
pools = list(poolref.keys())
combdict = {}
for i, snp in enumerate(snpfiles):
    for p in pools:
        if p in op.basename(snp):
            pool = p
            break
    if pool not in combdict:
        combdict[pool] = []
    combdict[pool].append(snp)
    del pool  # will cause script to error if pool isn't found in snpfile
for pool, poolfiles in combdict.items():
pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys())
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all rsync commands

# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']

# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p) + '-gatk'))

# get pkl files
print(Bcolors.BOLD + '\nBundling .pkl files ...' + Bcolors.ENDC)
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
    for pkl in pkls:
        pkldst = op.join(remote, f'{op.basename(p)}-gatk/{op.basename(pkl)}')
        cmds.append(f"rsync -avz {hostname}:{pkl} {pkldst}")
    newpkls = [f for f in fs(p) if f.endswith('.pkl')]
    for newpkl in newpkls:
        newdst = op.join(remote, f'{op.basename(p)}-gatk/{op.basename(newpkl)}')
        cmds.append(f"rsync -avz {hostname}:{newpkl} {newdst}")

# get shfiles
print(Bcolors.BOLD + '\nBundling .sh and .out files ...' + Bcolors.ENDC)
for p in pooldirs:
    shdir = op.join(p, 'shfiles')
    remotesh = op.join(remote, f'{op.basename(p)}-gatk/sh_and_outfiles')
def read_datatable(parentdir):
    # read in the datatable, save info for later
    datatable = op.join(parentdir, 'datatable.txt')
    if not op.exists(datatable):
        print(Bcolors.FAIL + '''FAIL: the datatable is not in the necessary path: %s
FAIL: exiting 00_start-gatk_pipeline.py''' % datatable + Bcolors.ENDC)
        sys.exit(3)
    print(Bcolors.BOLD + 'reading datatable, getting fastq info' + Bcolors.ENDC)
    data = pd.read_csv(datatable, sep='\t')
    rginfo = {}     # key=sampname val=read-group info
    samp2pool = {}  # key=samp val=pool
    poolref = {}    # key=pool val=ref.fa
    ploidy = {}     # key=samp val=ploidy
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}     # key=f val=samp
    f2pool = {}     # key=f val=pool
    adaptors = {}   # key=samp val={'r1': adaptor, 'r2': adaptor}
    for row in data.index:
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {'r1': data.loc[row, 'adaptor_1'],
                          'r2': data.loc[row, 'adaptor_2']}
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL + 'FAIL: there are duplicate sample names with '
                      'different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool
        df = data[data['pool_name'] == pool].copy()
        if not luni(df['ploidy']) == 1:
            print(Bcolors.WARNING +
                  "The ploidy values for some elements with pool name '%s' are not the same." % pool +
                  "\n\tHere are the ploidy values: %s" % uni(df['ploidy']) +
                  Bcolors.ENDC)
            askforinput()
        if samp not in ploidy:
            ploidy[samp] = data.loc[row, 'ploidy']
        if pool in poolref:
            if not poolref[pool] == data.loc[row, 'ref']:
                print("ref genome for samples in %s pool seems to have different paths in datatable.txt" % pool)
                sys.exit(1)
        else:
            ref = data.loc[row, 'ref']
            if not op.exists(ref):
                print('ref for %s does not exist in path: %s' % (samp, ref))
                print('exiting 00_start-gatk_pipeline.py')
                exit()
            needed = []
            for suffix in ['.dict', '.amb', '.ann', '.bwt', '.fai', '.pac', '.sa']:
                refext = ref + suffix if suffix != '.dict' else ref.split('.fa')[0] + suffix
                if not op.exists(refext):
                    needed.append(refext)
            if len(needed) > 0:
                print(Bcolors.FAIL +
                      'FAIL: the following extensions of the reference are needed to continue, '
                      'please create these files' + Bcolors.ENDC)
                for n in needed:
                    print(Bcolors.FAIL + n + Bcolors.ENDC)
                print('exiting')
                exit()
            printneeded = False
            intdir = op.join(op.dirname(ref), 'intervals')
            if not op.exists(intdir):
                printneeded = True
            elif len([f for f in fs(intdir) if '.list' in f]) == 0:
                printneeded = True
            if printneeded is True:
                print(Bcolors.FAIL +
                      'FAIL: either the intervals dir does not exist or there are no intervals.list files'
                      '\nFAIL: intdir should be here: %s' % intdir +
                      Bcolors.ENDC)
                exit()
            poolref[pool] = ref
        rginfo[samp] = {}
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            rginfo[samp][col] = data.loc[row, col]
        for f in [data.loc[row, 'file_name_r1'], data.loc[row, 'file_name_r2']]:
            if "__" in f:
                print(Bcolors.BOLD + Bcolors.FAIL +
                      "FAIL: file names cannot have double underscores, replace __ with _ (single)" +
                      Bcolors.ENDC)
                exit()
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp
    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))
    return data, f2pool, poolref
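# As a hedged illustration of the expected input, a minimal datatable.txt
# containing only the columns read_datatable() actually reads; every value
# below is hypothetical.
import pandas as pd

rows = [{'sample_name': 'samp_1',
         'pool_name': 'pool_A',
         'ploidy': 2,
         'ref': '/refs/genome.fa',
         'adaptor_1': 'AGATCGGAAGAGC',
         'adaptor_2': 'AGATCGGAAGAGC',
         'rglb': 'lib1',
         'rgpl': 'ILLUMINA',
         'rgsm': 'samp_1',
         'file_name_r1': 'samp_1_R1.fastq.gz',
         'file_name_r2': 'samp_1_R2.fastq.gz'}]
pd.DataFrame(rows).to_csv('datatable.txt', sep='\t', index=False)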
trimDIR = op.join(pooldir, '01_trimmed')  # outfiles

for d in [shtrimDIR, trimDIR]:
    if not op.exists(d):
        os.makedirs(d)

mfile = op.join(parentdir, 'msgs.txt')
###


def writetomfile(text):
    with open(mfile, 'a') as m:
        m.write("%s\n" % text)


# get the fastq.gz files
os.chdir(pooldir)
gzfiles = [f for f in fs(pooldir) if 'R1' in f]
lgz = len(gzfiles)
text = 'found %(lgz)s R1 fastq.gz files in %(pooldir)s' % locals()
print('\t%s' % text)
writetomfile(text)

# match seq pairs to samp, alert if pair not found
seq_pairs = {}
for f in gzfiles:
    samp = f2samp[f]
    if samp not in seq_pairs:
        seq_pairs[samp] = []
    read2 = f.replace("_R1", "_R2")
    if op.exists(read2):
        seq_pairs[samp].append((op.abspath(f), op.abspath(read2)))
    else:
        # read 2 was not found; alert the user (message text reconstructed/assumed)
        text = 'WARN: could not find read 2 (R2) for %s' % f
        print('\t%s' % text)
        writetomfile(text)
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all rsync commands

# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']

# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p)))

# get pkl files
print(Bcolors.BOLD + '\nBundling .pkl files ...' + Bcolors.ENDC)
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
    for pkl in pkls:
        pkldst = op.join(remote, f'{op.basename(p)}/{op.basename(pkl)}')
        cmds.append(f"rsync -azv {hostname}:{pkl} {pkldst}")
    newpkls = [f for f in fs(p) if f.endswith('.pkl')]
    for newpkl in newpkls:
        newdst = op.join(remote, f'{op.basename(p)}/{op.basename(newpkl)}')
        cmds.append(f"rsync -azv {hostname}:{newpkl} {newdst}")

# get shfiles
print(Bcolors.BOLD + '\nBundling .sh and .out files ...' + Bcolors.ENDC)
for p in pooldirs:
    shdir = op.join(p, 'shfiles')
    remotesh = op.join(remote, f'{op.basename(p)}/sh_and_outfiles')
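# For reference, a standalone sketch of the command strings the loops above
# accumulate; the hostname and paths below are invented for illustration.
import os.path as op

hostname = 'beluga'                      # hypothetical CC_CLUSTER value
remote = '/remote/backup'                # hypothetical destination root
pkl = '/scratch/parentdir/poolref.pkl'   # hypothetical .pkl in parentdir
p = '/scratch/parentdir/pool_A'          # hypothetical pooldir

pkldst = op.join(remote, f'{op.basename(p)}/{op.basename(pkl)}')
cmd = f"rsync -azv {hostname}:{pkl} {pkldst}"
assert cmd == 'rsync -azv beluga:/scratch/parentdir/poolref.pkl /remote/backup/pool_A/poolref.pkl'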