Example #1
import os
import shutil
import subprocess
# pklload/pkldump are the repo's pickle helpers (a sketch follows this example)


def get_avail_accounts(parentdir=None, save=False):
    """Query slurm with sshare command to determine accounts available.
    
    If called with parentdir=None, return all available accounts.
        - Meant to be called from command line outside of pipeline. See also sys.argv input.
    If called with parentdir='choose', allow user to choose accounts.
        - Meant to be called from command line outside of pipeline. See also sys.argv input.
    If called with save=True, confirm each account with user and save .pkl file in parentdir.
        - save=True is only called from 00_start.py
    
    Returns a list of accounts to balance queue.
    """

    if parentdir is not None and save is False:
        # if the accounts have already been chosen, just return them right away
        # keep 'save is False' so 00_start can overwrite previous pkl and skip here
        pkl = os.path.join(parentdir, 'accounts.pkl')
        if os.path.exists(pkl):
            return pklload(pkl)

    # get a list of all available accounts
    acctout = subprocess.check_output([
        shutil.which('sshare'), '-U', '--user', os.environ['USER'],
        '--format=Account'
    ]).decode('utf-8').split('\n')
    accts = [
        acct.split()[0].split("_")[0] for acct in acctout if '_cpu' in acct
    ]  # keep '_cpu' account entries, stripping the '_cpu' suffix from the name

    # for running outside of the pipeline:
    if parentdir is None:
        # to manually run on command line, using all accounts (default + RAC)
        return accts
    elif parentdir == 'choose':
        # to manually run on command line, choose accounts
        return choose_accounts(accts)

    # save if necessary
    if save is True:
        # called from 00_start.py
        keep = choose_accounts(accts)
        pkldump(keep, os.path.join(parentdir, 'accounts.pkl'))
        # no return necessary for 00_start.py
        return

    return accts
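
These examples call pklload/pkldump helpers that the excerpt never defines. A minimal sketch consistent with how they are used (the names come from the source; the implementation below is assumed):

import pickle

def pkldump(obj, path):
    """Write obj to path as a pickle (assumed implementation of the repo's helper)."""
    with open(path, 'wb') as o:
        pickle.dump(obj, o, protocol=pickle.HIGHEST_PROTOCOL)

def pklload(path):
    """Return the object pickled at path (assumed implementation)."""
    with open(path, 'rb') as o:
        return pickle.load(o)

# Hypothetical usage of get_avail_accounts outside the pipeline:
# accounts = get_avail_accounts()          # all available accounts
# accounts = get_avail_accounts('choose')  # interactively choose accounts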
Example #2
-g --cut_window_size 5 --cut_mean_quality 30 --n_base_limit 20 --length_required 75 \
-h %(html)s.html --cut_by_quality3 --thread 16 --json %(json)s.json \
%(adaptorflag)s > %(logfile)s

''' % locals()
        newtext = newtext + text
        
    suffix = '''# once finished, map using bwa mem 
python $HOME/gatk_pipeline/02_bwa-map_view_sort_index_flagstat.py %(parentdir)s %(samp)s

''' % locals()
    
    text = header + newtext + suffix
    
    filE = op.join(shtrimDIR, '%(pool)s-%(samp)s-trim.sh' % locals())
    shfiles.append(filE)
    with open(filE, 'w') as o:
        o.write("%s" % text)
pkldump(samp2_r1r2out, op.join(pooldir, 'samp2_r1r2out.pkl'))


print('\tshcount =', len(shfiles))
print('\tshdir = ', shtrimDIR)
# qsub the files
for sh in shfiles:
    os.chdir(op.dirname(sh))     # want sbatch outfiles in same folder as sh file
    print('\tshfile=', sh)
    subprocess.call([shutil.which('sbatch'), sh])
    # os.system('sbatch %s' % sh)
    time.sleep(2)
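
The loop above discards sbatch's stdout. A hedged variant (not from the source) that captures the job id sbatch reports, should downstream dependency handling need it:

    out = subprocess.check_output([shutil.which('sbatch'), sh]).decode('utf-8')
    jobid = out.strip().split()[-1]  # sbatch prints "Submitted batch job <jobid>"
    print('\tsubmitted job', jobid)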
module load bedtools/2.27.1
echo -e "\\ncreating coordfile"
bedtools bamtobed -i {sortfile} > {coordfile}

''')


# get bwatext
bwatext = ''
sortfiles = []
for r1, r2 in r1r2outs:
    sortfile, text = getbwatext(r1, r2)
    bwatext = bwatext + text
    sortfiles.append(sortfile)
pkldump(sortfiles, op.join(pooldir, '%s_sortfiles.pkl' % samp))

# send it off
email_text = get_email_info(parentdir, '02')
text = f'''#!/bin/bash
#SBATCH --time=23:59:00
#SBATCH --mem=55000M
#SBATCH --nodes=1
#SBATCH --ntasks=32
#SBATCH --cpus-per-task=1
#SBATCH --job-name={pool}-{samp}-bwa
#SBATCH --output={pool}-{samp}-bwa_%j.out 
{email_text}

{bwatext}
def get_pars():
    choices = ['all', 'fail', 'begin', 'end', 'pipeline-finish']
    parser = argparse.ArgumentParser(
        description=mytext,
        add_help=False,
        formatter_class=argparse.RawTextHelpFormatter)
    requiredNAMED = parser.add_argument_group('required arguments')
    requiredNAMED.add_argument("-p",
                               required=True,
                               default=argparse.SUPPRESS,
                               dest="parentdir",
                               type=str,
                               help="/path/to/directory/with/fastq.gz-files/")
    parser.add_argument(
        "-e",
        required=False,
        dest="email",
        help='''the email address you would like to have notifications
sent to''')
    parser.add_argument(
        "-n",
        default=None,
        nargs='+',
        required=False,
        dest="email_options",
        help='''the type(s) of email notifications you would like to
receive from the pipeline. Requires --email-address.
These options are used to fill out the #SBATCH flags.
Must be one (or multiple) of 
%s
(default: None)''' % [x for x in choices])
    parser.add_argument(
        "-maf",
        required=False,
        dest="maf",
        help='''At the end of the pipeline, VCF files will be filtered
for MAF. If the pipeline is run on a single
population/pool, the user can set MAF to 0.0 so as to
filter variants based on global allele frequency
across populations/pools at a later time. (If the
number of sample_names in a pool == 1, the default is
maf = 0; otherwise the default is maf = 1/sum(ploidy column).)''')
    parser.add_argument(
        '--translate',
        required=False,
        action='store_true',
        dest="translate",
        help='''Boolean: true if used, false otherwise. If a stitched
genome is used for mapping, this option will look for
a ref.order file in the same directory as the
ref.fasta - where ref is the basename of the ref.fasta
(without the .fasta). The pipeline will use this
.order file to translate mapped positions to
unstitched positions at the end of the pipeline while
filtering. Positions in .order file are assumed to be
1-based indexing. Assumes .order file has no header,
and is of the format (contig name from unstitched
genome, start/stop are positions in the stitched genome):
ref_scaffold<tab>contig_name<tab>start_pos<tab>stop_pos<tab>contig_length
(default: False)''')
    parser.add_argument(
        '--rm_repeats',
        required=False,
        action='store_true',
        dest='repeats',
        help='''Boolean: true if used, false otherwise. If repeat
regions are available, remove SNPs that fall within
these regions from final SNP table and write to 
a REPEATS table. This option will look for a .txt file
in the same directory as the ref.fasta. Assumes the
filename is of the form: ref_repeats.txt - where ref
is the basename of the ref.fasta (without the .fasta).
This file should have 1-based indexing and should be
located in the same directory as the reference. The
file should have a header ('CHROM', 'start', 'stop').
The CHROM column can be names in the reference (if
using unstitched reference), or names of contigs that
were stitched to form the reference. If using a
stitched genome, --translate is required. (default:
False)''')
    parser.add_argument(
        '--rm_paralogs',
        required=False,
        action='store_true',
        dest='paralogs',
        help='''Boolean: true if used, false otherwise. If candidate
sites have been isolated within the reference where
distinct gene copies (paralogs) map to the same
position (and thus create erroneous SNPs), remove any
SNPs that fall on these exact sites and write to a
PARALOGS file. The pipeline assumes this file is
located in the parentdir, and ends with
'_paralog_snps.txt'. This file is tab-delimited, and
must have a column called 'locus' that contains
hyphen-separated CHROM-POS sites for paralogs. These
sites should be found in the current ref.fa being
used to call SNPs (otherwise SNPs cannot be filtered
by these sites). (default: False)''')
    parser.add_argument('-h',
                        '--help',
                        action='help',
                        default=argparse.SUPPRESS,
                        help='Show this help message and exit.\n')
    args = parser.parse_args()
    # trim path
    if args.parentdir.endswith('/'):
        args.parentdir = args.parentdir[:-1]
    # save command
    pkldump(args, op.join(args.parentdir, 'pipeline_start_command.pkl'))
    # assess arguments
    if args.email and args.email_options is None:
        print(Bcolors.FAIL +
              'FAIL: --notification-types are required when specifying email' +
              Bcolors.ENDC)
        print(Bcolors.FAIL + 'FAIL: choices = {%s}\n' % [x for x in choices] +
              Bcolors.ENDC)
        exit()
    if args.email_options and args.email is None:
        print(Bcolors.FAIL +
              'FAIL: specifying --notification-types requires specifying \
--email-address\n' + Bcolors.ENDC)
        exit()
    if args.email_options:
        for choice in args.email_options:
            if not choice.lower() in choices:
                print(
                    Bcolors.FAIL +
                    '''FAIL: There can be multiple options, but they must be from the set:'''
                    + Bcolors.ENDC)
                print(Bcolors.FAIL + '''\t%s\n''' % choices + Bcolors.ENDC)
                exit()
    if args.email:
        if '@' not in args.email:
            print(Bcolors.FAIL +
                  'FAIL: email address does not have an "@" symbol in it, \
please check input\n' + Bcolors.ENDC)
            exit()
        if 'all' in args.email_options:
            args.email_options = ['all']
        # save email
        epkl = {'email': args.email, 'opts': args.email_options}
        pkldump(epkl, op.join(args.parentdir, 'email_opts.pkl'))

    if args.maf:
        pkldump(args.maf, op.join(args.parentdir, 'maf.pkl'))

    if args.repeats:
        text = 'WARN: You have indicated that you want to remove repeats.\n'
        text = text + 'WARN: Make sure --translate is used if using a stitched reference.\n'
        text = text + 'WARN: Otherwise this will cause an error.\n'
        text = text + 'WARN: --repeats assumes that the first column in the repeats file ...\n'
        text = text + 'WARN: ... are the exact chromosome names found in the ref.fasta, ...\n'
        text = text + 'WARN: ... or if used with --translate this assumes that the first ...\n'
        text = text + 'WARN: ... column of the repeats file are names found in the second ...\n'
        text = text + 'WARN: ... column of the ref.order file used to translate positions.'
        print(Bcolors.WARNING + text + Bcolors.ENDC)
        askforinput()

    return args
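
Since get_pars() reads sys.argv through argparse, a hypothetical invocation of the start script looks like the following (paths and address are illustrative, not from the source; assumes the module globals mytext, Bcolors, pkldump, and askforinput are defined):

import sys

sys.argv = ['00_start-pipeline.py',
            '-p', '/path/to/fastq_dir',      # hypothetical parentdir
            '-e', 'user@example.com',        # hypothetical address
            '-n', 'fail', 'pipeline-finish',
            '--translate']
args = get_pars()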
def parse_datatable(data, parentdir, translate, repeats, paralogs):
    """
    Check some assumptions about datatable.txt; create files and dirs for downstream steps.

    translate, repeats, and paralogs are booleans.
    parentdir is a path.
    """
    print(Bcolors.BOLD + '\nReading datatable, getting fastq info' +
          Bcolors.ENDC)

    # initiate dictionaries for downstream pipeline
    rginfo = {}  # key=samp vals=rginfo
    samp2pool = {}  # key=samp val=pool
    poolref = {}  # key=pool val=ref.fa
    ploidy = {}  # key=pool val=dict(key=sample: val=sample_ploidy)
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}  # key=f val=samp
    f2pool = {}  # key=f val=pool
    adaptors = OrderedDict()  # key=samp val={'r1': adaptor_1, 'r2': adaptor_2}
    warning = []  # samples missing optional RG info (warn only)
    failing = []  # sample/column pairs missing required RG info
    pool2paralogfile = {}  # if --rm_paralogs flagged, store file based on pool
    pool2repeatsfile = {}  # if --rm_repeats flagged, store file based on pool
    pool2translate = {}  # if --translate flagged, store file based on pool

    # make sure there are no blanks where there shouldn't be
    badcols = []
    for column in data.columns:
        if column not in ['rgid', 'rgpu', 'adaptor_1', 'adaptor_2']:
            if data[column].isnull().sum() > 0:
                badcols.append(column)
    if len(badcols) > 0:
        print(
            Bcolors.FAIL +
            "\tFAIL: Some rows in datable.txt have blank entries in the following columns: "
            + Bcolors.ENDC)
        for col in badcols:
            print(Bcolors.FAIL + "\tFAIL: %s" % col + Bcolors.ENDC)
        print('exiting 00_start-pipeline.py')
        exit()

    # make sure specific words are not in a pool name
    badnames = []
    for pool in uni(data['pool_name']):
        for keyword in ['SNP', 'REPEAT', 'PARALOG']:
            if keyword in pool:
                badnames.append((pool, keyword))
    if len(badnames) > 0:
        print(
            Bcolors.FAIL +
            "\tFAIL: Some pool names contain keywords that could cause errors downstream."
            + Bcolors.ENDC)
        print(
            Bcolors.FAIL +
            "\tFAIL: Remove the offending keywords from pool_names to continue." +
            Bcolors.ENDC)
        for pool, keyword in badnames:
            print(Bcolors.FAIL + "\tFAIL: Remove '%s' from pool_name '%s'." %
                  (keyword, pool) + Bcolors.ENDC)
        print('exiting 00_start-pipeline.py')
        exit()

    # iterate through datatable
    for row in data.index:
        # get variables
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {
            'r1': data.loc[row, 'adaptor_1'],
            'r2': data.loc[row, 'adaptor_2']
        }
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL +
                      'FAIL: there are duplicate sample names with \
different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool

        # get ploidy info
        if pool not in ploidy:
            ploidy[pool] = {}
        if samp in ploidy[pool]:
            if ploidy[pool][samp] != int(data.loc[row, 'ploidy']):
                text = "FAIL: the ploidy values for sample_name '%s' are not the same" % samp
                print(Bcolors.FAIL + text + Bcolors.ENDC)
                exit()
        ploidy[pool][samp] = int(data.loc[row, 'ploidy'])

        # get ref.fasta info
        ref = data.loc[row, 'ref']
        if pool in poolref:
            # make sure each row for a pool specifies the same reference.fa
            if poolref[pool] != ref:
                text = "FAIL: Ref genome for samples in %s pool seem to have different paths in datatable" % pool
                print(Bcolors.FAIL + text + Bcolors.ENDC)
                print('exiting 00_start-pipeline.py')
                exit()
        else:
            # check assumptions about ref
            poolref[pool] = check_ref_assumptions(samp, ref)

        # handle RG info
        rginfo[samp] = {}
        # required RG info
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            if not data.loc[row, col] == data.loc[row, col]:
                # NaN != NaN, so a failed self-comparison flags a blank required field
                failing.append('%s\t%s' % (samp, col))
            rginfo[samp][col] = data.loc[row, col]
        # optional RG info
        for col in ['rgid', 'rgpu']:
            if data.loc[row, col] != data.loc[row, col]:
                # if nan
                rginfo[samp][col] = None
                if samp not in warning:
                    warning.append(samp)
            else:
                rginfo[samp][col] = data.loc[row, col]

        # map between file and pool/samp
        for f in [
                data.loc[row, 'file_name_r1'], data.loc[row, 'file_name_r2']
        ]:
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp

    # handle --rm_paralogs, --translate, --rm_repeats
    for pool in uni(data['pool_name']):
        # handle translating stitched genome to unstitched positions
        pool2translate[pool] = handle_translate(translate, pool2translate,
                                                poolref[pool], data, pool)

        # handle removing SNPs from repeat regions
        pool2repeatsfile[pool] = handle_repeats(repeats, pool2repeatsfile,
                                                poolref[pool], data, pool)

        # handle removing paralogs
        pool2paralogfile[pool] = handle_paralogs(paralogs, pool2paralogfile,
                                                 data, pool, parentdir)

    # handle fails for rm_repeats/translate/rm_paralogs
    handle_dict_fails(pool2repeatsfile, pool2translate, pool2paralogfile,
                      repeats, translate, paralogs, data, parentdir)

    # RG info failing/warnings
    handle_rg_fails(failing, warning, parentdir, data)

    pkldump(pool2repeatsfile, op.join(parentdir, 'repeat_regions.pkl'))
    pkldump(pool2paralogfile, op.join(parentdir, 'paralog_snps.pkl'))
    pkldump(pool2translate, op.join(parentdir, 'translate_snps.pkl'))
    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))
    return f2pool, poolref
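
For reference, parse_datatable() reads the datatable.txt columns listed in the code above; a minimal conforming row, sketched with hypothetical values:

import pandas as pd

data = pd.DataFrame({
    'sample_name':  ['samp_1'],
    'pool_name':    ['pool_1'],
    'ploidy':       [2],
    'ref':          ['/path/to/ref.fasta'],   # hypothetical reference path
    'rglb':         ['lib1'],                 # required RG fields
    'rgpl':         ['ILLUMINA'],
    'rgsm':         ['samp_1'],
    'rgid':         [float('nan')],           # optional; blank cells read as NaN
    'rgpu':         [float('nan')],
    'adaptor_1':    ['AGATCGGAAGAGC'],        # hypothetical adaptor sequences
    'adaptor_2':    ['AGATCGGAAGAGC'],
    'file_name_r1': ['samp_1_R1.fastq.gz'],
    'file_name_r2': ['samp_1_R2.fastq.gz'],
})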
Example #6
def read_datatable(parentdir):
    # read in the datatable, save info for later
    datatable = op.join(parentdir, 'datatable.txt')
    if not op.exists(datatable):
        print(Bcolors.FAIL + '''FAIL: the datatable is not in the necessary path: %s
FAIL: exiting 00_start-gatk_pipeline.py''' % datatable + Bcolors.ENDC)
        sys.exit(3)
    print(Bcolors.BOLD + 'reading datatable, getting fastq info' + Bcolors.ENDC)
    data = pd.read_csv(datatable, sep='\t')
    rginfo = {}     # key=sampname vals=rginfo
    samp2pool = {}  # key=samp val=pool
    poolref = {}    # key=pool val=ref.fa
    ploidy = {}     # key=samp val=ploidy
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}     # key=f val=samp
    f2pool = {}     # key=f val=pool
    adaptors = {}   # key=samp val={'r1': adaptor_1, 'r2': adaptor_2}
    for row in data.index:
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {'r1': data.loc[row, 'adaptor_1'],
                          'r2': data.loc[row, 'adaptor_2']}
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL + 'FAIL: there are duplicate sample names with \
different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool
        df = data[data['pool_name'] == pool].copy()
        if not luni(df['ploidy']) == 1:
            print(Bcolors.WARNING + 
                  "The ploidy values for some elements with pool name '%s' are not the same." % pool +
                  "\n\tHere are the ploidy values: %s" % uni(df['ploidy']) +
                  Bcolors.ENDC)
            askforinput()
        if samp not in ploidy:
            ploidy[samp] = data.loc[row, 'ploidy']
        if pool in poolref:
            if not poolref[pool] == data.loc[row, 'ref']:
                print("ref genome for samples in %s pool seems to have different paths in datatable.txt" % pool)
                sys.exit(1)
        else:
            ref = data.loc[row, 'ref']
            if not op.exists(ref):
                print('ref for %s does not exist in path: %s' % (samp, ref))
                print('exiting 00_start-gatk_pipeline.py')
                exit()
            needed = []
            for suffix in ['.dict', '.amb', '.ann', '.bwt', '.fai', '.pac', '.sa']:
                refext = ref + suffix if suffix != '.dict' else ref.split('.fa')[0] + suffix
                if not op.exists(refext):
                    needed.append(refext)
            if len(needed) > 0:
                print(Bcolors.FAIL + 
                      'FAIL: the following extensions of the reference are needed to continue, \
please create these files' + 
                      Bcolors.ENDC)
                for n in needed:
                    print(Bcolors.FAIL + n + Bcolors.ENDC)
                print('exiting')
                exit()
            printneeded = False
            intdir = op.join(op.dirname(ref), 'intervals')
            if not op.exists(intdir):
                printneeded = True
            elif len([f for f in fs(intdir) if '.list' in f]) == 0:
                printneeded = True
            if printneeded is True:
                print(Bcolors.FAIL + 
                      'FAIL: either the intervals dir does not exist or there are no interval .list files\
\nFAIL: intdir should be here: %s' % intdir +
                      Bcolors.ENDC)
                exit()
            poolref[pool] = ref
        rginfo[samp] = {}
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            rginfo[samp][col] = data.loc[row, col]
        for f in [data.loc[row, 'file_name_r1'], data.loc[row, 'file_name_r2']]:
            if "__" in f:
                print(Bcolors.BOLD + 
                      Bcolors.FAIL + 
                      "FAIL: file names cannot have double underscores; replace __ with _ (single)" + 
                      Bcolors.ENDC)
                exit()
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp
    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))
    return data, f2pool, poolref
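
The companion files checked above are the outputs of standard indexing tools; a hedged sketch of creating them (the commands are standard bwa/samtools/gatk calls, not part of the source):

import shutil
import subprocess

def index_reference(ref):
    """Create the reference companion files the check above expects."""
    subprocess.check_call([shutil.which('bwa'), 'index', ref])        # .amb .ann .bwt .pac .sa
    subprocess.check_call([shutil.which('samtools'), 'faidx', ref])   # .fai
    subprocess.check_call([shutil.which('gatk'),
                           'CreateSequenceDictionary', '-R', ref])    # ref.dict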
Example #7
def get_pars():
    choices = ['all', 'fail', 'begin', 'end', 'pipeline-finish']
    parser = argparse.ArgumentParser(description=mytext,
                                     add_help=False,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    requiredNAMED = parser.add_argument_group('required arguments')
    requiredNAMED.add_argument("-p",
                               required=True,
                               default=argparse.SUPPRESS,
                               dest="parentdir",
                               type=str,
                               help="/path/to/directory/with/fastq.gz-files/")
    parser.add_argument("-e",
                        required=False,
                        dest="email",
                        help="the email address you would like to have notifications sent to")
    parser.add_argument("-n",
                        default=argparse.SUPPRESS,
                        nargs='+',
                        required=False,
                        dest="email_options",
                        help='''the type(s) of email notifications you would like to receive from the pipeline.\
                        Requires --email-address. These options are used to fill out the #SBATCH flags.
must be one (or multiple) of %s''' % [x for x in choices])
    parser.add_argument('-h', '--help',
                        action='help',
                        default=argparse.SUPPRESS,
                        help='Show this help message and exit.\n')
    args = parser.parse_args()
    if args.parentdir.endswith('/'):
        args.parentdir = args.parentdir[:-1]
    if args.email and args.email_options is None:
        print(Bcolors.FAIL + 'FAIL: --notification-types are required when specifying email' + Bcolors.ENDC)
        print(Bcolors.FAIL + 'FAIL: choices = {%s}\n' % [x for x in choices] + Bcolors.ENDC)
        exit()
    if args.email_options and args.email is None:
        print(Bcolors.FAIL + 'FAIL: specifying --notification-types requires specifying \
--email-address\n' + Bcolors.ENDC)
        exit()
    if args.email_options:
        for choice in args.email_options:
            if not choice.lower() in choices:
                print(Bcolors.FAIL +
                      '''FAIL: There can be multiple options, but they must be from the set:''' +
                      Bcolors.ENDC)
                print(Bcolors.FAIL +
                      '''\t%s\n''' % choices +
                      Bcolors.ENDC)
                exit()
    if args.email:
        if '@' not in args.email:
            print(Bcolors.FAIL + 'FAIL: email address does not have an "@" symbol in it, \
please check input\n' + Bcolors.ENDC)
            exit()
        if 'all' in args.email_options:
            args.email_options = ['all']
        # save email
        epkl = {'email': args.email,
                'opts': args.email_options}
        pkldump(epkl, op.join(args.parentdir, 'email_opts.pkl'))

    return args
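
All of these examples color their console output through a Bcolors class that the excerpt never shows. A minimal sketch using standard ANSI escape codes, consistent with the attributes referenced above (assumed implementation):

class Bcolors:
    BOLD = '\033[1m'      # bold text
    WARNING = '\033[93m'  # yellow
    FAIL = '\033[91m'     # red
    ENDC = '\033[0m'      # reset formatting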