Example #1
def getfiles(samps, shdir, grep):
    """Determine if all realign bam jobs have been created and sbatched.

    Positional arguments:
    samps - list of sample names (length is number of expected shfiles)
    shdir - directory where .sh and .out files are
    grep - program name - keyword used to find correct files

    Returns:
    files - dictionary where key = sh file, val = most recent outfile
    """
    found = [sh for sh in fs(shdir) if sh.endswith(".sh") and grep in sh]
    outs = [out for out in fs(shdir) if out.endswith('.out') and grep in out]
    if len(found) != len(samps):
        print('not all shfiles have been created, exiting %s' % sys.argv[0])
        exit()
    files = dict(
        (f,
         getmostrecent(
             [out for out in outs
              if op.basename(f).replace(".sh", "") in out])) for f in found)
    if None in files.values():
        print('not all shfiles have been sbatched, exiting %s' % sys.argv[0])
        exit()
    return files
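# `fs` and `getmostrecent` are helpers not shown in these examples; below is a
# minimal sketch (an assumption, not the pipeline's actual implementation) of
# what they are taken to do: list a directory's full paths, and pick the
# newest .out file by modification time.
import os
import os.path as op


def _fs(directory):
    """Return full paths of everything in directory (stand-in for fs)."""
    return sorted(op.join(directory, f) for f in os.listdir(directory))


def _getmostrecent(outfiles):
    """Return the newest file by mtime, or None for an empty list (stand-in for getmostrecent)."""
    return max(outfiles, key=op.getmtime) if outfiles else None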
def make_pooldirs(data, parentdir):
    """Create subdirectories of parentdir.

    Positional arguments:
    data - datatable.txt with info for pipeline
    parentdir - directory with datatable.txt and (symlinks to) fastq data
    """
    # make pool dirs
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    pools = uni(data['pool_name'].tolist())
    pooldirs = []
    for p in pools:
        pooldir = op.join(parentdir, p)
        if op.exists(pooldir):
            text = "\tWARN: The pooldir already exists, this WILL overwrite and/or delete previous data: %s" % pooldir
            print(Bcolors.WARNING + text + Bcolors.ENDC)
            askforinput(tab='\t', newline='')
            # first unlink fastq files
            for f in fs(pooldir):
                if f.endswith('.gz'):
                    os.unlink(f)
            # then just delete the directory
            shutil.rmtree(pooldir)
        pooldirs.append(makedir(pooldir))
    return pooldirs
def main():
    # make sure all of the varscan jobs have finished
    files = checkjobs()

    # combine table files from output of VariantsToTable
    tablefiles = get_tables(files)

    # get SNP and indels
    for tipe in ['SNP', 'INDEL']:
        get_types(tablefiles, tipe, program, pooldir, grep)

    # combine repeats and paralogs
    tabledir = op.dirname(tablefiles[0])
    for tipe in ['PARALOGS', 'REPEATS']:
        tablefiles = [
            f for f in fs(tabledir)
            if tipe in f and 'all' not in f and f.endswith('.txt')
        ]
        if len(tablefiles) > 0:
            dfs = []
            for t in tablefiles:
                dfs.append(pd.read_csv(t, sep='\t'))
            df = pd.concat(dfs)
            df.to_csv(op.join(
                tabledir,
                f'{op.basename(pooldir)}-{program}_all_bedfiles_{tipe}.txt'),
                      sep='\t',
                      index=False)
def get_parafile(parentdir, pool):
    """Obtain file containing paralog SNPs to be removed from final SNPs."""
    parafiles = [f for f in fs(parentdir) if f.endswith('_paralog_snps.txt')]
    if len(parafiles) > 1:
        parafile = choose_file(parafiles, pool, 'remove paralogs')
    elif len(parafiles) == 0:
        parafile = None
    elif len(parafiles) == 1:
        parafile = parafiles[0]
    return parafile
def get_datafiles(parentdir, f2pool, data):
    """Get list of files from datatable, make sure they exist in parentdir.
    Create symlinks in /parentdir/<pool_name>/.

    Positional arguments:
    parentdir - directory with datatable.txt and (symlinks to) fastq data
    f2pool - dictionary with key = file.fastq, val = pool_name
    data - datatable.txt with info for pipeline
    """
    print(Bcolors.BOLD +
          '\nchecking for existence of fastq files in datatable.txt' +
          Bcolors.ENDC)
    files = [f for f in fs(parentdir) if 'fastq' in f and 'md5' not in f]
    datafiles = data['file_name_r1'].tolist()
    for x in data['file_name_r2'].tolist():
        datafiles.append(x)
    if len(files) > len(datafiles):
        desc = 'more'
    elif len(files) < len(datafiles):
        desc = 'fewer'
    try:
        print(Bcolors.WARNING +
              'WARN: there are %s fastq files in %s than in datatable.txt' %
              (desc, parentdir) + Bcolors.ENDC)
        print(Bcolors.BOLD + 'Here are the files in %s' % parentdir +
              Bcolors.ENDC)
        for x in files:
            print(op.basename(x))
        print(Bcolors.BOLD + 'Here are the files in datatable.txt' +
              Bcolors.ENDC)
        for x in datafiles:
            print(x)
        askforinput(newline='')

    except NameError:
        pass

    # create symlinks in pooldirs for visualization
    for f in datafiles:
        src = op.join(parentdir, f)
        if not op.exists(src):
            # make sure file in datatable exists
            print(
                "could not find %s in %s\nmake sure file_name in datatable is its basename"
                % (f, parentdir))
            print(
                "(symlinks in parentdir to fastq files in other dirs work fine, and are the intended use)"
            )
            sys.exit(1)
        pooldir = op.join(parentdir, f2pool[f])
        dst = op.join(pooldir, f)
        if not op.exists(dst):
            # easy to visualize in cmdline if script is finding correct group of files by ls-ing pooldir
            os.symlink(src, dst)
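# Minimal, self-contained illustration of the symlinking step above (all paths
# here are made up): in the pipeline, `src` already exists in parentdir and
# `dst` sits inside the pool directory, so `ls <pooldir>` shows which fastq
# files were assigned to that pool.
import os
import os.path as op
import tempfile

with tempfile.TemporaryDirectory() as parent:
    src = op.join(parent, 'sample_R1.fastq.gz')
    open(src, 'w').close()                 # pretend this is the fastq file
    pool_dir = op.join(parent, 'pool_1')
    os.makedirs(pool_dir)
    dst = op.join(pool_dir, op.basename(src))
    if not op.exists(dst):
        os.symlink(src, dst)
    print(os.readlink(dst))                # -> .../sample_R1.fastq.gz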
Example #6
def check_beddir():
    """Avoid accidentally using incorrect bedfiles by removing any that exist."""
    bname, beddir = make_beddir()
    files = [f for f in fs(beddir) if f.endswith('.bed')]
    if len(files) > 0:
        text = '\tThere are already existing bedfiles in %s. These will be deleted.' % beddir
        print(Bcolors.WARNING + text + Bcolors.ENDC)
        askforinput(tab='\t', newline='')
        for f in files:
            os.remove(f)
        print('\t\tRemoved %s bedfiles.' % len(files))
def get_bamfiles(samps, pooldir):
    """Using a list of sample names, find the realigned bamfiels.

    Returns:
    files - dictionary with key = samp_name, val = /path/to/bamfile
    """
    print('getting bamfiles')
    found = fs(op.join(pooldir, '04_realign'))
    files = dict((samp, f.replace(".bai", ".bam"))
                 for samp in samps
                 for f in found
                 if samp in f and f.endswith('.bai'))
    if not len(files) == len(samps):
        print('len(files) != len(samps)')
        print('files = ', files)
        print('samps = ', samps)
        exit()
    return files
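# Sketch of the pairing logic above with invented file names: a sample is only
# picked up once its index (.bai) exists, and the .bam path is derived from
# the index path.
_found = ['/pooldir/04_realign/samp_A_realigned.bai',
          '/pooldir/04_realign/samp_A_realigned.bam',
          '/pooldir/04_realign/samp_B_realigned.bai',
          '/pooldir/04_realign/samp_B_realigned.bam']
_samps = ['samp_A', 'samp_B']
_files = {samp: f.replace('.bai', '.bam')
          for samp in _samps
          for f in _found
          if samp in f and f.endswith('.bai')}
# _files == {'samp_A': '/pooldir/04_realign/samp_A_realigned.bam',
#            'samp_B': '/pooldir/04_realign/samp_B_realigned.bam'}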
def get_tables(files):
    """Find all existing .txt files, exit if the number doesn't match expectations.

    Positional arguments:
    files - list of shfiles, should be same length as tablefiles (if all jobs are sbatched and done).
    """
    print('getting tablefiles')
    tablefiles = [f for f in fs(op.join(pooldir, program))
                  if f.endswith('.txt')
                  and 'all_bedfiles' not in f
                  and 'SNP' not in f
                  and 'INDEL' not in f
                  and grep in f]
    if not len(tablefiles) == len(files):
        print('for some reason tablefiles != files. exiting.')
        exit()
    return tablefiles
def checkjobs():
    """
    Make sure previous realigned bamfiles were created without error.
    Avoids unintentionally combining a subset of all final expected files.

    Calls:
    getfiles from start_crispANDvarscan
    """
    print('checking jobs')
    parentdir = op.dirname(pooldir)
    pool = op.basename(pooldir)
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    samps = fs(op.join(op.dirname(ref),
                       'bedfiles_%s' % op.basename(ref).split(".fa")[0]))
    shdir = op.join(pooldir, 'shfiles/crispANDvarscan')
    # files = {f.sh: f.out, ...}
    files = getfiles(samps, shdir, f"{grep}-{program}")
    return files
Example #10
def make_bed_from_intervals(intdir):
    """If intervals.list files exist, use these instead of ref.fa.length file.

    Positional arguments:
    intdir - path to intervals.list files
    """
    intfiles = [f for f in fs(intdir) if f.endswith('.list')]
    for intfile in intfiles:
        num = intfile.split("_")[-1].replace(".list", "")
        lines = []
        with open(intfile, 'r') as o:
            text = o.read().split("\n")
        for line in text:
            if line == '':
                # skip blank lines (e.g. the trailing newline at end of file)
                continue
            scaff, span = line.split(":")
            start, stop = span.split("-")
            start, stop = (int(start) - 1, int(stop) - 1)
            lines.append((scaff, start, stop))
        make_bed(lines, num)
    print('\t\tcreated %s bedfiles for %s from interval files' %
          (len(intfiles), ref))
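# Worked illustration of the interval parsing above (the line is invented): an
# intervals.list entry like 'scaffold_1:101-200' becomes a (scaff, start, stop)
# tuple with both coordinates shifted by -1, exactly as the loop above does.
_line = 'scaffold_1:101-200'
_scaff, _span = _line.split(':')
_start, _stop = (int(x) - 1 for x in _span.split('-'))
print((_scaff, _start, _stop))   # -> ('scaffold_1', 100, 199)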
Example #11

pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys())
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all scp commands
# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']

# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p)))

# get pkl files
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
    for pkl in pkls:
        pkldst = op.join(remote, f'{op.basename(p)}/{op.basename(pkl)}')
        cmds.append(f"scp {hostname}:{pkl} {pkldst}")
    newpkls = [f for f in fs(p) if f.endswith('.pkl')]
    for newpkl in newpkls:
        newdst = op.join(remote, f'{op.basename(p)}/{op.basename(newpkl)}')
        cmds.append(f"scp {hostname}:{newpkl} {newdst}")

# get shfiles
for p in pooldirs:
    shdir = op.join(p, 'shfiles')
    remotesh = op.join(remote, f'{op.basename(p)}/sh_and_outfiles')
    newdirs.append(remotesh)
    dirs = [d for d in fs(shdir) if op.isdir(d)]
def get_bedfiles(parentdir, pool):
    """Get a list of paths to all of the bed files for ref.fa."""
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    beddir = op.join(op.dirname(ref), 'bedfiles_%s' % op.basename(ref).split(".fa")[0])
    return [f for f in fs(beddir) if f.endswith('.bed')]
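# Quick illustration of the beddir naming convention used above (the reference
# path is hypothetical): for ref = '/data/genomes/spruce.fa' the bedfiles are
# expected in '/data/genomes/bedfiles_spruce'.
import os.path as op

_ref = '/data/genomes/spruce.fa'
_beddir = op.join(op.dirname(_ref),
                  'bedfiles_%s' % op.basename(_ref).split(".fa")[0])
print(_beddir)   # -> /data/genomes/bedfiles_spruce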
Example #13
# get a list of subdirectory pool dirs created earlier in pipeline
print('getting pooldirs')
pooldirs = []
for p in pools:
    pooldir = op.join(parentdir, p)
    pooldirs.append(pooldir)

# TRIMMING DATA
# get the json data from trimming
print('getting trim data')
data = {}
count = 0
for p in pooldirs:
    trimdir = op.join(p, '01_trimmed')
    jsons = [f for f in fs(trimdir) if f.endswith('.json')]
    count += len(jsons)
    for j in jsons:
        with open(j, 'r') as f:
            data[op.basename(j)] = json.load(f)

# put data into a dataframe, and sort columns
print('reading data')
readinfo = OrderedDict()
samps = []
for j in sorted(data):
    jsplit = j.split("__trimmed")[0]
    splits = jsplit.split(".")
    samp = '.'.join([splits[-1]] + splits[:-1])
    for when in ['before_filtering', 'after_filtering']:
        for which in ['total_reads', 'total_bases', 'q20_bases', 'q30_bases']:
    startscheduler(resfile)
else:
    print('06.py was running')
    bigbrother(resfile, DIR=None)

### dirs
shdir = op.join(parentdir, 'shfiles/concat')
catdir = op.join(parentdir, 'concatenated_vcfs')
filtdir = op.join(parentdir, 'filtered_snps')
createdirs([shdir, catdir, filtdir])
###

# get the snpfiles
snpdir = op.join(parentdir, 'snps')
snpfiles = [
    f.replace('.tbi', '') for f in fs(snpdir)
    if 'snp' in op.basename(f) and f.endswith('.tbi')
]
os.system('echo "len(snpfiles) = %s"' % str(len(snpfiles)))

# sort snpfiles by pool
pools = list(poolref.keys())
combdict = {}
for i, snp in enumerate(snpfiles):
    for p in pools:
        if p in op.basename(snp):
            pool = p
            break
    if pool not in combdict:
        combdict[pool] = []
    combdict[pool].append(snp)
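# Toy illustration of the grouping above (file names are invented): each
# snpfile goes to the first pool whose name appears in its basename.
import os.path as op

_snpfiles = ['/snps/pool_A_chr1_snps.vcf.gz',
             '/snps/pool_B_chr1_snps.vcf.gz',
             '/snps/pool_A_chr2_snps.vcf.gz']
_pools = ['pool_A', 'pool_B']
_combdict = {}
for _snp in _snpfiles:
    _pool = next(p for p in _pools if p in op.basename(_snp))
    _combdict.setdefault(_pool, []).append(_snp)
# _combdict == {'pool_A': ['...chr1...', '...chr2...'], 'pool_B': ['...chr1...']}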
Example #15
pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys())
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all scp commands
# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']


# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p) + '-gatk'))


# get pkl files
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
    for pkl in pkls:
        pkldst = op.join(remote, f'{op.basename(p)}-gatk/{op.basename(pkl)}')
        cmds.append(f"scp {hostname}:{pkl} {pkldst}")
    newpkls = [f for f in fs(p) if f.endswith('.pkl')]
    for newpkl in newpkls:
        newdst = op.join(remote, f'{op.basename(p)}-gatk/{op.basename(newpkl)}')
        cmds.append(f"scp {hostname}:{newpkl} {newdst}")


# get shfiles
for p in pooldirs:
    shdir = op.join(p, 'shfiles')
    remotesh = op.join(remote, f'{op.basename(p)}-gatk/sh_and_outfiles')
    newdirs.append(remotesh)
if parentdir.endswith("/"):
    parentdir = parentdir[:-1]
poolref = pklload(op.join(parentdir, 'poolref.pkl'))
email_info = get_email_info(parentdir, 'concat')
###

### dirs
shdir   = op.join(parentdir, 'shfiles/concat')
catdir  = op.join(parentdir, 'concatenated_vcfs')
filtdir = op.join(parentdir, 'filtered_snps')
createdirs([shdir, catdir, filtdir])
###

# get the snpfiles
snpdir = op.join(parentdir, 'snps')
snpfiles = [f.replace('.tbi', '') for f in fs(snpdir) if 'snp' in op.basename(f) and f.endswith('.tbi')]
os.system('echo "len(snpfiles) = %s"' % str(len(snpfiles)))

# sort snpfiles by pool
pools = list(poolref.keys())
combdict = {}
for i, snp in enumerate(snpfiles):
    for p in pools:
        if p in op.basename(snp):
            pool = p
            break
    if pool not in combdict:
        combdict[pool] = []
    combdict[pool].append(snp)
    del pool  # will cause script to error if pool isn't found in snpfile
for pool, poolfiles in combdict.items():
pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys())
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all rsync commands
# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']

# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p) + '-gatk'))

# get pkl files
print(Bcolors.BOLD + '\nBundling .pkl files ...' + Bcolors.ENDC)
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
    for pkl in pkls:
        pkldst = op.join(remote, f'{op.basename(p)}-gatk/{op.basename(pkl)}')
        cmds.append(f"rsync -avz {hostname}:{pkl} {pkldst}")
    newpkls = [f for f in fs(p) if f.endswith('.pkl')]
    for newpkl in newpkls:
        newdst = op.join(remote,
                         f'{op.basename(p)}-gatk/{op.basename(newpkl)}')
        cmds.append(f"rsync -avz {hostname}:{newpkl} {newdst}")

# get shfiles
print(Bcolors.BOLD + '\nBundling .sh and .out files ...' + Bcolors.ENDC)
for p in pooldirs:
    shdir = op.join(p, 'shfiles')
    remotesh = op.join(remote, f'{op.basename(p)}-gatk/sh_and_outfiles')
Example #18
def read_datatable(parentdir):
    # read in the datatable, save info for later
    datatable = op.join(parentdir, 'datatable.txt')
    if not op.exists(datatable):
        print(Bcolors.FAIL + '''FAIL: the datatable is not in the necessary path: %s
FAIL: exiting 00_start-gatk_pipeline.py''' % datatable + Bcolors.ENDC)
        sys.exit(3)
    print(Bcolors.BOLD + 'reading datatable, getting fastq info' + Bcolors.ENDC)
    data = pd.read_csv(datatable, sep='\t')
    rginfo = {}     # key=sampname vals=rginfo
    samp2pool = {}  # key=samp val=pool
    poolref = {}    # key=pool val=ref.fa
    ploidy = {}     # key=pool val=ploidy
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}     # key=f val=samp
    f2pool = {}     # key=f val=pool
    adaptors = {}   # key=samp val={'r1','r2'} val=adaptor
    for row in data.index:
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {'r1': data.loc[row, 'adaptor_1'],
                          'r2': data.loc[row, 'adaptor_2']}
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL + 'FAIL: there are duplicate sample names with \
different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool
        df = data[data['pool_name'] == pool].copy()
        if luni(df['ploidy']) != 1:
            print(Bcolors.WARNING + 
                  "The ploidy values for some elements with pool name '%s' are not the same." % pool +
                  "\n\tHere are the ploidy values: %s" % uni(df['ploidy']) +
                  Bcolors.ENDC)
            askforinput()
        if samp not in ploidy:
            ploidy[samp] = data.loc[row, 'ploidy']
        if pool in poolref:
            if poolref[pool] != data.loc[row, 'ref']:
                print("ref genome for samples in %s pool seems to have different paths in datatable.txt" % pool)
                sys.exit(1)
        else:
            ref = data.loc[row, 'ref']
            if not op.exists(ref):
                print('ref for %s does not exist in path: %s' % (samp, ref))
                print('exiting 00_start-gatk_pipeline.py')
                exit()
            needed = []
            for suffix in ['.dict', '.amb', '.ann', '.bwt', '.fai', '.pac', '.sa']:
                refext = ref + suffix if suffix != '.dict' else ref.split('.fa')[0] + suffix
                if not op.exists(refext):
                    needed.append(refext)
            if len(needed) > 0:
                print(Bcolors.FAIL + 
                      'FAIL: the following extensions of the reference are needed to continue, \
please create these files' + 
                      Bcolors.ENDC)
                for n in needed:
                    print(Bcolors.FAIL + n + Bcolors.ENDC)
                print('exiting')
                exit()
            printneeded = False
            intdir = op.join(op.dirname(ref), 'intervals')
            if not op.exists(intdir):
                printneeded = True
            elif len([f for f in fs(intdir) if '.list' in f]) == 0:
                printneeded = True
            if printneeded is True:
                print(Bcolors.FAIL + 
                      'FAIL: either the intervals dir does not exist or there are no intervals.list files\
\nFAIL: intdir should be here: %s' % intdir +
                      Bcolors.ENDC)
                exit()
            poolref[pool] = ref
        rginfo[samp] = {}
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            rginfo[samp][col] = data.loc[row, col]
        for f in [data.loc[row, 'file_name_r1'], data.loc[row, 'file_name_r2']]:
            if "__" in f:
                print(Bcolors.BOLD + 
                      Bcolors.FAIL + 
                      "FAIL: file names cannot have double underscores, replace __ with _ (single)" + 
                      Bcolors.ENDC)
                exit()
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp
    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))
    return data, f2pool, poolref
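# `pkldump` and `pklload` are not defined in these examples; a minimal sketch
# of what they are assumed to be (plain pickle round-trip helpers):
import pickle


def _pkldump(obj, path):
    """Write obj to path as a pickle (assumed behaviour of pkldump)."""
    with open(path, 'wb') as o:
        pickle.dump(obj, o)


def _pklload(path):
    """Load a pickled object back from path (assumed behaviour of pklload)."""
    with open(path, 'rb') as o:
        return pickle.load(o)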
Example #19
trimDIR = op.join(pooldir, '01_trimmed')          # outfiles
for d in [shtrimDIR, trimDIR]:
    if not op.exists(d):
        os.makedirs(d)
mfile = op.join(parentdir, 'msgs.txt')
###


def writetomfile(text):
    with open(mfile, 'a') as m:
        m.write("%s\n" % text)


# get the fastq.gz files
os.chdir(pooldir)
gzfiles = [f for f in fs(pooldir) if 'R1' in f]
lgz = len(gzfiles)
text = 'found %(lgz)s R1 fastq.gz files in %(pooldir)s' % locals()
print('\t%s' % text)
writetomfile(text)

# match seq pairs to samp, alert if pair not found
seq_pairs = {}
for f in gzfiles:
    samp = f2samp[f]
    if samp not in seq_pairs:
        seq_pairs[samp] = []
    read2 = f.replace("_R1", "_R2")
    if op.exists(read2):
        seq_pairs[samp].append((op.abspath(f), op.abspath(read2)))
    else:
Example #20
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all rsync commands
# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']


# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p)))


# get pkl files
print(Bcolors.BOLD + '\nBundling .pkl files ...' + Bcolors.ENDC)
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
    for pkl in pkls:
        pkldst = op.join(remote, f'{op.basename(p)}/{op.basename(pkl)}')
        cmds.append(f"rsync -azv {hostname}:{pkl} {pkldst}")
    newpkls = [f for f in fs(p) if f.endswith('.pkl')]
    for newpkl in newpkls:
        newdst = op.join(remote, f'{op.basename(p)}/{op.basename(newpkl)}')
        cmds.append(f"rsync -azv {hostname}:{newpkl} {newdst}")


# get shfiles
print(Bcolors.BOLD + '\nBundling .sh and .out files ...' + Bcolors.ENDC)
for p in pooldirs:
    shdir = op.join(p, 'shfiles')
    remotesh = op.join(remote, f'{op.basename(p)}/sh_and_outfiles')