def handle_dict_fails(pool2repeatsfile, pool2translate, pool2paralogfile,
                      repeats, translate, paralogs, data, parentdir):
    flagexit = False
    for dic, flag, word in zip(
        [pool2repeatsfile, pool2translate, pool2paralogfile],
        [repeats, translate, paralogs],
        ['remove repeats', 'translate stitched positions', 'remove paralogs']):
        # if a flag was specified but none of the pools were selected:
        if flag is True and all(v is None for v in dic.values()):
            flagexit = True
            text = 'FAIL: You have indicated that you would like to %s from final SNPs.\n' % word
            text = text + 'FAIL: But you have not specified at least one pool to %s. \n' % word
            text = text + 'FAIL: You need to respond "yes" to at least one of the prompts above \n'
            text = text + 'FAIL: for assigning a file to a pool - i.e., to use the \n'
            text = text + 'FAIL: %s flag, you must apply it to at least one pool. \n' % word
            if 'repeats' in word:
                text = text + 'FAIL: The file containing repeat regions should be one of the following:\n'
                for ref in uni(data['ref']):
                    repeatfile = ref.split(".fa")[0] + '_repeats.txt'
                    text = text + "\t %s \n" % repeatfile
            elif 'stitched' in word:
                text = text + 'FAIL: The file to translate stitched to unstitched positions should \n'
                text = text + 'FAIL: be one of the following:\n'
                for ref in uni(data['ref']):
                    orderfile = ref.split(".fa")[0] + '.order'
                    text = text + "\t %s \n" % orderfile
            elif 'paralogs' in word:
                text = text + 'FAIL: The file(s) to remove paralogs must be in %s \n' % parentdir
                text = text + 'FAIL: and end with "_paralog_snps.txt".'
            print(Bcolors.FAIL + text + Bcolors.ENDC)
    if flagexit is True:
        exit()
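# --- Hypothetical sketch (not part of the pipeline) of the check above ---
# A flag such as --rm_repeats only makes sense if at least one pool was assigned
# a file, i.e. at least one value in the corresponding pool->file dict is not None.
# The pool names below are invented for the demo.
example_pool2repeatsfile = {'pool_A': None, 'pool_B': None}
example_flag = True  # e.g., repeats
would_fail = example_flag is True and all(v is None for v in example_pool2repeatsfile.values())
# would_fail -> True, so handle_dict_fails() would print the FAIL text and exit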
def add_freq_cols(df, tf, tipe, tablefile):
    """
    Adding in .FREQ columns for crisp file.
    
    Positional arguments:
    df - pandas.dataframe; current filtered VariantsToTable output
    tablefile - path to VariantsToTable output - used to find ploidy etc
    tf - basename of tablefile
    tipe - one of either "SNP" or "INDEL"
    
    Returns:
    df - pandas.dataframe; current filtered VariantsToTable output + freqcols
    """
    print('Adding in .FREQ columns for crisp file ...')
    # remove bednum from column names so we can pd.concat() later
    bednum = tf.split("file_")[-1].split("_converted")[0]
    df.columns = [col.replace("_" + bednum, "") for col in df.columns]
    # add in a .FREQ column for pool-level freqs
    gtcols = [col for col in df.columns if '.GT' in col]
    print('len(gtcols) = ', len(gtcols))
    freqcols = []
    for col in tqdm(gtcols):
        refcol = col.replace(".GT", ".REFCOUNT")
        altcol = col.replace(".GT", ".ALTCOUNT")
        freqcol = col.replace(".GT", ".FREQ")
        freqcols.append(freqcol)
        for alt in uni(df['ALT']):
            df.loc[df['ALT'] == alt, altcol] = df[col].str.count(alt)
        for ref in uni(df['REF']):
            df.loc[df['REF'] == ref, refcol] = df[col].str.count(ref)
        df[freqcol] = df[altcol] / (df[altcol] + df[refcol])
    # remove count cols
    print('Removing unnecessary cols ...')
    df = df[[
        col for col in df.columns
        if '.REFCOUNT' not in col and '.ALTCOUNT' not in col
    ]].copy()
    # recalculate global AF
    df = recalc_global_freq(df, tf, freqcols)
    # sort columns to group data together for each pool
    datacols = sorted([col for col in df.columns if '.' in col])
    othercols = [
        col for col in df.columns
        if '.' not in col and col != 'locus' and 'crisp' not in col
    ]
    othercols.insert(othercols.index('AF') + 1, 'crisp_AF')
    df = df[['locus'] + othercols + datacols].copy()
    df.index = range(len(df.index))
    return df
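# --- Hypothetical sketch (not part of the pipeline) of the .FREQ math used above ---
# Each pool's ".GT" column holds that pool's concatenated allele calls, so counting
# REF and ALT characters per row gives FREQ = ALTCOUNT / (ALTCOUNT + REFCOUNT).
# The column name "pool1.GT" and the values are invented for the demo.
import pandas as pd

demo = pd.DataFrame({'REF': ['A', 'C'],
                     'ALT': ['G', 'T'],
                     'pool1.GT': ['A/A/G/G', 'C/T/T/T']})
demo['pool1.ALTCOUNT'] = [gt.count(alt) for gt, alt in zip(demo['pool1.GT'], demo['ALT'])]
demo['pool1.REFCOUNT'] = [gt.count(ref) for gt, ref in zip(demo['pool1.GT'], demo['REF'])]
demo['pool1.FREQ'] = demo['pool1.ALTCOUNT'] / (demo['pool1.ALTCOUNT'] + demo['pool1.REFCOUNT'])
# expected pool1.FREQ -> [0.50, 0.75]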
def get_refn_snps(df, tipe, ndfs=None):
    """
    Isolate polymorphisms with REF=N but two ALT single nucleotide alleles.

    Positional arguments:
    df - pandas.dataframe; current filtered VariantsToTable output
    tipe - one of either "SNP" or "INDEL"

    Returns:
    dfs - list of loci (pandas.dataframes) with REF=N and two ALT alleles, counts with respect to the second ALT
    ndfs - return from pd.concat(dfs); unchanged if no such loci are found
    """
    ndf = df[df['REF'] == 'N'].copy()
    ndf = ndf[ndf['TYPE'] == tipe].copy()
    ncount = table(ndf['locus'])
    nloci = [locus for locus in ncount if ncount[locus] == 2]
    ndf = ndf[ndf['locus'].isin(nloci)].copy()
    dfs = []
    for locus in uni(ndf['locus']):
        smalldf = ndf[ndf['locus'] == locus].copy()
        if len(smalldf.index) == 2:
            smalldf.index = range(len(smalldf.index))
            smalldf = adjust_freqs(smalldf)
            smalldf.loc[0, 'ALT'] = "%s+%s" % (smalldf.loc[0, 'ALT'],
                                               smalldf.loc[1, "ALT"])
            dfs.append(pd.DataFrame(smalldf.loc[0, :]).T)
    if len(dfs) > 0:
        ndfs = pd.concat(dfs)
    return (dfs, ndfs)
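# --- Hypothetical sketch (not part of the pipeline) of the REF=N case handled above ---
# When the reference base is N, a single locus appears as two biallelic rows (one per
# observed allele).  get_refn_snps() keeps loci with exactly two such rows and collapses
# them into one row whose ALT is "allele1+allele2"; adjust_freqs() (defined elsewhere
# in the pipeline) re-expresses the counts with respect to the second ALT.
import pandas as pd

ndf_demo = pd.DataFrame({'locus': ['chr1-100', 'chr1-100'],
                         'REF': ['N', 'N'],
                         'ALT': ['A', 'G'],
                         'TYPE': ['SNP', 'SNP']})
combined = ndf_demo.loc[[0]].copy()
combined.loc[0, 'ALT'] = '%s+%s' % (ndf_demo.loc[0, 'ALT'], ndf_demo.loc[1, 'ALT'])  # "A+G"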
def make_pooldirs(data, parentdir):
    """Create subdirectories of parentdir.

    Positional arguments:
    data - datatable.txt with info for pipeline
    parentdir - directory with datatable.txt and (symlinks to) fastq data
    """
    # make pool dirs
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    pools = uni(data['pool_name'].tolist())
    pooldirs = []
    for p in pools:
        pooldir = op.join(parentdir, p)
        if op.exists(pooldir):
            text = "\tWARN: The pooldir already exists, this WILL overwrite and/or delete previous data: %s" % pooldir
            print(Bcolors.WARNING + text + Bcolors.ENDC)
            askforinput(tab='\t', newline='')
            # first unlink fastq files
            for f in fs(pooldir):
                if f.endswith('.gz'):
                    os.unlink(f)
            # then just delete the directory
            shutil.rmtree(pooldir)
        pooldirs.append(makedir(pooldir))
    return pooldirs
def create_all_bedfiles(poolref, numpools):
    """For each unique ref.fa in datatable.txt, create bedfiles for varscan.

    Positional arguments:
    poolref - dictionary with key = pool, val = /path/to/ref
    numpools - number of pools (passed through to create_bedfiles.main)
    """
    # create bedfiles for varscan
    print(Bcolors.BOLD + "\ncreating bedfiles" + Bcolors.ENDC)
    for ref in uni(poolref.values()):
        create_bedfiles.main(ref, numpools)
def make_pooldirs(data, parentdir):
    # make pool dirs
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    pools = uni(data['pool_name'].tolist())
    pooldirs = []
    for p in pools:
        DIR = op.join(parentdir, p)
        if op.exists(DIR):
            print("The pooldir already exists, this could overwrite previous data: %s" % DIR)
            askforinput()
        pooldirs.append(makedir(DIR))
        makedir(op.join(DIR, 'shfiles'))
    return pooldirs
def make_pooldirs(data, parentdir):
    """Create subdirectories of parentdir.

    Positional arguments:
    data - datatable.txt with info for pipeline
    parentdir - directory with datatable.txt and (symlinks to) fastq data
    """
    # make pool dirs
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    pools = uni(data['pool_name'].tolist())
    pooldirs = []
    for p in pools:
        DIR = op.join(parentdir, p)
        if op.exists(DIR):
            print(
                "The pooldir already exists, this could overwrite previous data: %s"
                % DIR)
            print("Do you want to proceed?")
            askforinput()
        pooldirs.append(makedir(DIR))
    return pooldirs
def remove_repeats(snps, parentdir, snpspath, pool):
    """
    Remove SNPs that are found to be in repeat-masked regions.

    Assumes:
    - positions have been translated BEFORE removing repeats
      (creating unstitched repeat regions took forever, so the repeat file itself is not
      translated; the unstitched chrom can be used directly if the reference is stitched)
    - the repeat file has a header ('CHROM', 'start', 'stop')
    - start and stop positions of repeat regions are 1-based
    """
    reppkl = op.join(parentdir, 'repeat_regions.pkl')
    if op.exists(reppkl):
        # read in repeat regions
        repeatdict = pklload(reppkl)
        if repeatdict[pool] is not None:
            print('Removing repeat regions ...')
            # if user selected translation be applied to this pool
            repeats = pd.read_csv(repeatdict[pool], sep='\t')
            # figure out if data is from stitched or not
            if 'unstitched_chrom' in snps.columns:
                # then the snps have been translated: stitched -> unstitched
                chromcol = 'unstitched_chrom'
                poscol = 'unstitched_pos'
                print('\tsnps have been translated')
            else:
                # otherwise SNPs were called on unstitched reference
                chromcol = 'CHROM'
                poscol = 'POS'
                print('\tsnps have not been translated')
            # reduce repeats to the chroms that matter (helps speed up lookups)
            repeats = repeats[repeats['CHROM'].isin(
                snps[chromcol].tolist())].copy()

            # isolate SNPs in repeat regions
            repeat_snps = []
            for chrom in tqdm(uni(snps[chromcol])):
                reps = repeats[repeats['CHROM'] == chrom].copy()
                mysnps = snps[snps[chromcol] == chrom].copy()
                if len(reps.index) > 0 and len(mysnps.index) > 0:
                    for row in mysnps.index:
                        pos = snps.loc[
                            row,
                            poscol]  # index is maintained from snps to mysnps
                        df = reps[reps['stop'].astype(int) >= int(pos)].copy()
                        df = df[df['start'].astype(int) <= int(pos)].copy()
                        if len(df.index) > 0:
                            assert len(df.index) == 1
                            repeat_snps.append(row)

            # save SNPs that fall in repeat regions
            print(f'\tSaving {len(repeat_snps)} SNPs from repeat regions')
            repeat_path = snpspath.replace(".txt", "_REPEATS.txt")
            myrepeats = snps[snps.index.isin(repeat_snps)].copy()
            myrepeats = mark_nas(myrepeats, 'repeat SNPs')
            myrepeats.to_csv(repeat_path, sep='\t', index=False)

            # remove SNPs in repeat regions
            snps = snps[~snps.index.isin(repeat_snps)].copy()
            snps.index = range(len(snps.index))

            print(
                f'{op.basename(snpspath)} has {len(snps.index)} SNPs outside of repeat regions'
            )

    return snps
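# --- Hypothetical sketch (not part of the pipeline) of the repeat-overlap test above ---
# A SNP is flagged when its position falls inside a 1-based, inclusive [start, stop]
# repeat interval on the same chromosome.  The interval values are invented for the demo.
import pandas as pd

reps_demo = pd.DataFrame({'CHROM': ['chr1', 'chr1'],
                          'start': [100, 500],
                          'stop': [200, 600]})
pos = 150
hits = reps_demo[(reps_demo['start'].astype(int) <= pos) & (reps_demo['stop'].astype(int) >= pos)]
in_repeat = len(hits.index) > 0  # True: 100 <= 150 <= 200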
def parse_datatable(data, parentdir, translate, repeats, paralogs):
    """
    Check some assumptions of datatable.txt; create files and dirs for downstream steps.

    translate, repeats, and paralogs are boolean.
    parentdir is a path.
    """
    print(Bcolors.BOLD + '\nReading datatable, getting fastq info' +
          Bcolors.ENDC)

    # initiate dictionaries for downstream pipeline
    rginfo = {}  # key=samp vals=rginfo
    samp2pool = {}  # key=samp val=pool
    poolref = {}  # key=pool val=ref.fa
    ploidy = {}  # key=pool val=dict(key=sample: val=sample_ploidy)
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}  # key=f val=samp
    f2pool = {}  # key=f val=pool
    adaptors = OrderedDict()  # key=samp val={'r1','r2'} val=adaptor
    warning = []  # samples missing optional RG info (warn later)
    failing = []  # sample/column pairs missing required RG info (fail later)
    pool2paralogfile = {}  # if --rm_paralogs flagged, store file based on pool
    pool2repeatsfile = {}  # if --rm_repeats flagged, store file based on pool
    pool2translate = {}  # if --translate flagged, store file based on pool

    # make sure there are no blanks where there shouldn't be
    badcols = []
    for column in data.columns:
        if column not in ['rgid', 'rgpu', 'adaptor_1', 'adaptor_2']:
            if data[column].isnull().sum() > 0:
                badcols.append(column)
    if len(badcols) > 0:
        print(
            Bcolors.FAIL +
            "\tFAIL: Some rows in datable.txt have blank entries in the following columns: "
            + Bcolors.ENDC)
        for col in badcols:
            print(Bcolors.FAIL + "\tFAIL: %s" % col + Bcolors.ENDC)
        print('exiting 00_start-pipeline.py')
        exit()

    # make sure specific words are not in a pool name
    badnames = []
    for pool in uni(data['pool_name']):
        for keyword in ['SNP', 'REPEAT', 'PARALOG']:
            if keyword in pool:
                badnames.append((pool, keyword))
    if len(badnames) > 0:
        print(
            Bcolors.FAIL +
            "\tFAIL: Some pool names have characters that could cause errors downstream."
            + Bcolors.ENDC)
        print(
            Bcolors.FAIL +
            "\tFAIL: Remove the bad characters from pool_names to continue." +
            Bcolors.ENDC)
        for pool, keyword in badnames:
            print(Bcolors.FAIL + "\tFAIL: Remove '%s' from pool_name '%s'." %
                  (keyword, pool))
        print('exiting 00_start-pipeline.py')
        exit()

    # iterate through datatable
    for row in data.index:
        # get variables
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {
            'r1': data.loc[row, 'adaptor_1'],
            'r2': data.loc[row, 'adaptor_2']
        }
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL +
                      'FAIL: there are duplicate sample names with \
different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool

        # get ploidy info
        if pool not in ploidy:
            ploidy[pool] = {}
        if samp in ploidy[pool].keys():
            if ploidy[pool][samp] != int(data.loc[row, 'ploidy']):
                text = "FAIL: the ploidy values for sample_name '%s' are not the same" % samp
                print(Bcolors.FAIL + text + Bcolors.ENDC)
                exit()
        ploidy[pool][samp] = int(data.loc[row, 'ploidy'])

        # get ref.fasta info
        ref = data.loc[row, 'ref']
        if pool in poolref:
            # make sure each row for a pool specifies the same reference.fa
            if poolref[pool] != ref:
                text = "FAIL: Ref genome for samples in %s pool seem to have different paths in datatable" % pool
                print(Bcolors.FAIL + text + Bcolors.ENDC)
                print('exiting 00_start-pipeline.py')
                exit()
        else:
            # check assumptions about ref
            poolref[pool] = check_ref_assumptions(samp, ref)

        # handle RG info
        rginfo[samp] = {}
        # required RG info (NaN != NaN, so a failed self-comparison flags a missing value)
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            if not data.loc[row, col] == data.loc[row, col]:
                failing.append('%s\t%s' % (samp, col))
            rginfo[samp][col] = data.loc[row, col]
        # optional RG info
        for col in ['rgid', 'rgpu']:
            if data.loc[row, col] != data.loc[row, col]:
                # if nan
                rginfo[samp][col] = None
                if samp not in warning:
                    warning.append(samp)
            else:
                rginfo[samp][col] = data.loc[row, col]

        # map between file and pool/samp
        for f in [
                data.loc[row, 'file_name_r1'], data.loc[row, 'file_name_r2']
        ]:
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp

    # handle --rm_paralogs, --translate, --rm_repeats
    for pool in uni(data['pool_name']):
        # handle translating stitched genome to unstitched positions
        pool2translate[pool] = handle_translate(translate, pool2translate,
                                                poolref[pool], data, pool)

        # handle removing SNPs from repeat regions
        pool2repeatsfile[pool] = handle_repeats(repeats, pool2repeatsfile,
                                                poolref[pool], data, pool)

        # handle removing paralogs
        pool2paralogfile[pool] = handle_paralogs(paralogs, pool2paralogfile,
                                                 data, pool, parentdir)

    # handle fails for rm_repeats/translate/rm_paralogs
    handle_dict_fails(pool2repeatsfile, pool2translate, pool2paralogfile,
                      repeats, translate, paralogs, data, parentdir)

    # RG info failing/warnings
    handle_rg_fails(failing, warning, parentdir, data)

    pkldump(pool2repeatsfile, op.join(parentdir, 'repeat_regions.pkl'))
    pkldump(pool2paralogfile, op.join(parentdir, 'paralog_snps.pkl'))
    pkldump(pool2translate, op.join(parentdir, 'translate_snps.pkl'))
    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))
    return f2pool, poolref
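# --- Hypothetical sketch (not part of the pipeline) of the datatable.txt columns that
# parse_datatable() reads.  Only the column names come from the code above; the values
# are invented, and the rgid/rgpu/adaptor columns may be left blank.
import pandas as pd

example_row = {'sample_name': 'samp_1', 'pool_name': 'pool_A', 'ploidy': 40,
               'ref': '/path/to/ref.fa', 'rglb': 'lib-1', 'rgpl': 'ILLUMINA',
               'rgsm': 'samp_1', 'rgid': None, 'rgpu': None,
               'adaptor_1': 'AGATCGGAAGAGC', 'adaptor_2': 'AGATCGGAAGAGC',
               'file_name_r1': 'samp_1_R1.fastq.gz', 'file_name_r2': 'samp_1_R2.fastq.gz'}
example_datatable = pd.DataFrame([example_row])
# example_datatable.to_csv('datatable.txt', sep='\t', index=False)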
def read_datatable(parentdir):
    # read in the datatable, save info for later
    datatable = op.join(parentdir, 'datatable.txt')
    if not op.exists(datatable):
        print(Bcolors.FAIL + '''FAIL: the datatable is not in the necessary path: %s
FAIL: exiting 00_start-gatk_pipeline.py''' % datatable + Bcolors.ENDC)
        sys.exit(3)
    print(Bcolors.BOLD + 'reading datatable, getting fastq info' + Bcolors.ENDC)
    data = pd.read_csv(datatable, sep='\t')
    rginfo = {}     # key=sampname vals=rginfo
    samp2pool = {}  # key=samp val=pool
    poolref = {}    # key=pool val=ref.fa
    ploidy = {}     # key=samp val=ploidy
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}     # key=f val=samp
    f2pool = {}     # key=f val=pool
    adaptors = {}   # key=samp val={'r1','r2'} val=adaptor
    for row in data.index:
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {'r1': data.loc[row, 'adaptor_1'],
                          'r2': data.loc[row, 'adaptor_2']}
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL + 'FAIL: there are duplicate sample names with \
different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool
        df = data[data['pool_name'] == pool].copy()
        if not luni(df['ploidy']) == 1:
            print(Bcolors.WARNING + 
                  "The ploidy values for some elements with pool name '%s' are not the same." % pool +
                  "\n\tHere are the ploidy values: %s" % uni(df['ploidy']) +
                  Bcolors.ENDC)
            askforinput()
        if samp not in ploidy:
            ploidy[samp] = data.loc[row, 'ploidy']
        if pool in poolref:
            if not poolref[pool] == data.loc[row, 'ref']:
                print("ref genome for samples in %s pool seems to have different paths in datatable.txt" % pool)
                sys.exit(1)
        else:
            ref = data.loc[row, 'ref']
            if not op.exists(ref):
                print('ref for %s does not exist in path: %s' % (samp, ref))
                print('exiting 00_start-gatk_pipeline.py')
                exit()
            needed = []
            for suffix in ['.dict', '.amb', '.ann', '.bwt', '.fai', '.pac', '.sa']:
                refext = ref + suffix if suffix != '.dict' else ref.split('.fa')[0] + suffix
                if not op.exists(refext):
                    needed.append(refext)
            if len(needed) > 0:
                print(Bcolors.FAIL + 
                      'FAIL: the following extensions of the reference are needed to continue, \
please create these files' + 
                      Bcolors.ENDC)
                for n in needed:
                    print(Bcolors.FAIL + n + Bcolors.ENDC)
                print('exiting')
                exit()
            printneeded = False
            intdir = op.join(op.dirname(ref), 'intervals')
            if not op.exists(intdir):
                printneeded = True
            elif len([f for f in fs(intdir) if '.list' in f]) == 0:
                printneeded = True
            if printneeded is True:
                print(Bcolors.FAIL + 
                      'FAIL: either the intervals dir does not exist or there are no interval.list files\
\nFAIL: intdir should be here: %s' % intdir +
                      Bcolors.ENDC)
                exit()
            poolref[pool] = ref
        rginfo[samp] = {}
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            rginfo[samp][col] = data.loc[row, col]
        for f in [data.loc[row, 'file_name_r1'], data.loc[row, 'file_name_r2']]:
            if "__" in f:
                print(Bcolors.BOLD + 
                      Bcolors.FAIL + 
                      "FAIL: file names cannot have double underscores, replace __ with _ (single)" + 
                      Bcolors.ENDC)
                exit()
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp
    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))
    return data, f2pool, poolref
# imports
import os, sys, json, pandas as pd
from tqdm import tqdm
from os import path as op
from collections import OrderedDict
from coadaptree import fs, uni, pklload

# args
thisfile, parentdir, engines = sys.argv
if parentdir.endswith("/"):
    parentdir = parentdir[:-1]

# reqs
print('getting reqs')
samp2pool = pklload(op.join(parentdir, 'samp2pool.pkl'))
pools = uni(list(samp2pool.values()))

# get a list of subdirectory pool dirs created earlier in pipeline
print('getting pooldirs')
pooldirs = []
for p in pools:
    pooldir = op.join(parentdir, p)
    pooldirs.append(pooldir)

# TRIMMING DATA
# get the json data from trimming
print('getting trim data')
data = {}
count = 0
for p in pooldirs:
    trimdir = op.join(p, '01_trimmed')