def get_freq_cutoffs(tablefile):
    """
    Determine MAF using ploidy and the number of samples per pool.

    Sums across ploidy values for a given pool to determine MAF.

    Assumes:
    - equal ploidy across samples/pools

    Positional arguments:
    tablefile - path to VariantsToTable output - used to find ploidy etc

    Returns:
    lowfreq  - minimum allele freq to keep (MAF)
    highfreq - maximum allele freq to keep (1-MAF)
    """
    pooldir = op.dirname(op.dirname(tablefile))
    parentdir = op.dirname(pooldir)
    pool = op.basename(pooldir)
    poolsamps = pklload(op.join(parentdir, 'poolsamps.pkl'))[pool]
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    lowfreq = 1 / sum(ploidy.values())
    if lowfreq == 1 or len(poolsamps) == 1:
        # for megagametophyte data
        lowfreq = 0
    pklfile = op.join(parentdir, 'maf.pkl')
    if op.exists(pklfile):
        lowfreq = float(pklload(pklfile))
    highfreq = 1 - lowfreq
    return lowfreq, highfreq
def get_freq_cutoffs(tablefile):
    """
    Determine MAF using ploidy.

    Assumes:
    - equal ploidy across samples/pools

    Positional arguments:
    tablefile - path to VariantsToTable output - used to find ploidy etc

    Returns:
    lowfreq  - minimum allele freq to keep (MAF)
    highfreq - maximum allele freq to keep (1-MAF)
    ploidy   - count of haploid genomes in pool/sample
    """
    pooldir = op.dirname(op.dirname(tablefile))
    parentdir = op.dirname(pooldir)
    pool = op.basename(pooldir)
    poolsamps = pklload(op.join(parentdir, 'poolsamps.pkl'))[pool]
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    lowfreq = 1 / (ploidy * len(poolsamps))
    if lowfreq == 1:
        # for megagametophyte data
        lowfreq = 0
    highfreq = 1 - lowfreq
    return lowfreq, highfreq, ploidy
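# --- illustration (not pipeline code) ---------------------------------------
# A minimal sketch of the cutoff math in the two get_freq_cutoffs() variants
# above, using made-up numbers. Under the equal-ploidy assumption both
# variants agree: with 20 diploid samples in a pool, the rarest detectable
# allele is 1 haploid genome out of 40.
ploidy_per_samp = 2                              # hypothetical: diploid samples
poolsamps = ['samp%d' % i for i in range(20)]    # hypothetical pool of 20 samples
ploidy = {samp: ploidy_per_samp for samp in poolsamps}
lowfreq_v1 = 1 / sum(ploidy.values())                 # first variant: dict of ploidies
lowfreq_v2 = 1 / (ploidy_per_samp * len(poolsamps))   # second variant: int ploidy
assert lowfreq_v1 == lowfreq_v2 == 0.025
highfreq = 1 - lowfreq_v1                             # keep freqs in [0.025, 0.975]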
def remove_paralogs(snps, parentdir, snpspath, pool):
    """
    Remove sites from the snptable where multiple gene copies are thought
    to align to the same position.

    # assumes
    # paralog file has 'CHROM' and 'locus' in the header (best if this is the only data, reads in quicker)
    # where CHROM is the reference chromosome/scaffold
    # where locus is hyphen-separated CHROM-POS
    # paralog file is created from calling SNPs on haplotype data as diploid
    # no need to worry about translating stitched -> unstitched if SNPs called on same reference
    """
    parpkl = op.join(parentdir, 'paralog_snps.pkl')
    if op.exists(parpkl):
        # read in paralogfile
        paralogdict = pklload(parpkl)
        if paralogdict[pool] is not None:
            print('Removing paralog sites ...')
            paralogs = pd.read_csv(paralogdict[pool], sep='\t')
            # remove and isolate paralogs from snps
            truths = snps['locus'].isin(paralogs['locus'])
            found_paralogs = snps[truths].copy()
            snps = snps[~truths].copy()
            snps.index = range(len(snps.index))
            # write paralogs to a file
            parafile = snpspath.replace(".txt", "_PARALOGS.txt")
            found_paralogs = mark_nas(found_paralogs, 'paralog SNPs')
            found_paralogs.to_csv(parafile, sep='\t', index=False)
    print(f'{op.basename(snpspath)} has {len(snps.index)} non-paralog SNPs')
    return snps
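# --- illustration (not pipeline code) ---------------------------------------
# Hedged sketch of the 'locus' membership test used in remove_paralogs(),
# with fabricated loci. As the docstring notes, 'locus' is hyphen-separated
# CHROM-POS, so paralog removal reduces to a set-membership filter.
import pandas as pd

snps = pd.DataFrame({'locus': ['chr1-100', 'chr1-250', 'chr2-7']})
paralogs = pd.DataFrame({'locus': ['chr1-250']})
truths = snps['locus'].isin(paralogs['locus'])
assert snps[~truths]['locus'].tolist() == ['chr1-100', 'chr2-7']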
def get_prereqs(bedfile, pooldir, parentdir, pool, program):
    """Get object names."""
    num = bedfile.split("_")[-1].split(".bed")[0]
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    outdir = makedir(op.join(pooldir, program))
    vcf = op.join(outdir, f'{pool}_{program}_bedfile_{num}.vcf')
    return (num, ref, vcf)
def filter_freq(df, tf, tipe, tablefile):
    """
    Filter out loci with global MAF < 1/(total_ploidy_across_pools).

    Right now this is unnecessary for varscan when setting pool-level
    freq to 1/ploidy.

    Positional arguments:
    df - pandas.dataframe; VariantsToTable output
    tf - str; basename of tablefile
    tipe - str; one of either "SNP" or "INDEL"
    tablefile - path to VariantsToTable output - used to find ploidy etc

    Returns:
    df - pandas.dataframe; freq-filtered VariantsToTable output
    """
    # believe it or not, it's faster to do qual and freq filtering in two steps vs an 'and' statement
    lowfreq, highfreq = get_freq_cutoffs(tablefile)
    print(f'filtering for global frequency ({lowfreq}, {highfreq})...')
    df.reset_index(drop=True, inplace=True)
    # prep for filtering
    freqcols = [col for col in df.columns if '.FREQ' in col]
    pool = op.basename(op.dirname(op.dirname(tablefile)))
    parentdir = op.dirname(op.dirname(op.dirname(tablefile)))
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    # carry on with poolseq data
    filtloci = []
    afs = []
    copy = get_copy(df, freqcols)
    for locus in tqdm(copy.columns):
        freqs = dict((samp.replace(".FREQ", ""), freq)
                     for (samp, freq) in copy[locus].str.rstrip('%').astype('float').items()
                     if not math.isnan(freq))  # faster than .str.rstrip('%').astype('float').dropna()
        if len(freqs) > 0:  # avoid loci with all freqs masked (avoid ZeroDivisionError)
            # calc globfreq using the samps/ploidy that are present for this locus
            globfreq = (sum([ploidy[samp] * (freq / 100) for (samp, freq) in freqs.items()])
                        / sum([ploidy[samp] for samp in freqs]))
            if lowfreq <= globfreq <= highfreq:
                filtloci.append(locus)
                # since we're going in order of rows in df ...
                # ... we can use afs to replace AF col later since we reduce df to filtloci
                afs.append(globfreq)  # about 40x faster than: df.loc[locus, 'AF'] = globfreq
    print(f'{tf} has {len(filtloci)} {tipe}s that have global MAF > {lowfreq*100}%')
    df = df[df.index.isin(filtloci)].copy()
    df.index = range(len(df.index))
    df['AF'] = afs
    return df
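# --- illustration (not pipeline code) ---------------------------------------
# Hedged sketch of the ploidy-weighted global-frequency step in filter_freq(),
# with made-up pool names and ploidies. VarScan's .FREQ columns are percents,
# hence the division by 100 before weighting by haploid genome counts.
ploidy = {'pool1': 40, 'pool2': 20}      # hypothetical haploid genomes per pool
freqs = {'pool1': 10.0, 'pool2': 40.0}   # hypothetical .FREQ values (percent)
globfreq = (sum(ploidy[s] * (f / 100) for s, f in freqs.items())
            / sum(ploidy[s] for s in freqs))
assert abs(globfreq - 0.2) < 1e-9        # (40*0.10 + 20*0.40) / 60 = 0.2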
def checkfiles(pooldir):
    """Call get_bamfiles."""
    # get the list of file names
    print('checking files')
    pool = op.basename(pooldir)
    samps = pklload(op.join(op.dirname(pooldir), 'poolsamps.pkl'))[pool]
    shdir = op.join(pooldir, 'shfiles/05_indelRealign_shfiles')
    files = getfiles(samps, shdir, 'indelRealign')
    check_queue(files.values(), pooldir)  # make sure job isn't in the queue (running or pending)
    check_seff(files.values())  # make sure the jobs didn't die
    return get_bamfiles(samps, pooldir)
def translate_stitched_to_unstitched(df, parentdir, pool):
    """See if user asked regions to be translated from stitched genome to unstitched.

    # assumes
    # that this is run BEFORE removing repeats
    """
    orderpkl = op.join(parentdir, 'orderfile.pkl')
    if op.exists(orderpkl):
        orderdict = pklload(orderpkl)
        if orderdict[pool] is not None:
            # if user selected translation be applied to this pool
            orderfile = orderdict[pool]
            df = translate_stitched.main(df.copy(), orderfile)
    return df
def get_avail_accounts(parentdir=None, save=False):
    """Query slurm with the sshare command to determine available accounts.

    If called with parentdir=None, return all available accounts.
        - Meant to be called from command line outside of pipeline. See also sys.argv input.
    If called with parentdir='choose', allow user to choose accounts.
        - Meant to be called from command line outside of pipeline. See also sys.argv input.
    If called with save=True, confirm each account with user and save .pkl file in parentdir.
        - save=True is only called from 00_start.py

    Returns a list of accounts to balance queue.
    """
    if parentdir is not None and save is False:
        # if the accounts have already been chosen, just return them right away
        # keep 'save is False' so 00_start can overwrite previous pkl and skip here
        pkl = os.path.join(parentdir, 'accounts.pkl')
        if os.path.exists(pkl):
            return pklload(pkl)

    # get a list of all available accounts
    acctout = subprocess.check_output(
        [shutil.which('sshare'), '-U', '--user', os.environ['USER'], '--format=Account']
    ).decode('utf-8').split('\n')
    accts = [acct.split()[0].split("_")[0] for acct in acctout if '_cpu' in acct]

    # for running outside of the pipeline:
    if parentdir is None:
        # to manually run on command line, using all accounts (default + RAC)
        return accts
    elif parentdir == 'choose':
        # to manually run on command line, choose accounts
        return choose_accounts(accts)

    # save if necessary
    if save is True:
        # called from 00_start.py
        keep = choose_accounts(accts)
        pkldump(keep, os.path.join(parentdir, 'accounts.pkl'))
        # no return necessary for 00_start.py
        return

    return accts
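# --- illustration (not pipeline code) ---------------------------------------
# How the sshare parsing above behaves on a hypothetical slice of output;
# the account names are fabricated, and real sshare output varies by cluster.
acctout = ['Account', '--------------------',
           ' def-someuser_cpu ', ' rrg-someuser_cpu ', ' def-someuser_gpu ']
accts = [acct.split()[0].split("_")[0] for acct in acctout if '_cpu' in acct]
assert accts == ['def-someuser', 'rrg-someuser']  # gpu allocations are skipped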
def get_crisp_cmd(bamfiles, bedfile, pool, parentdir, ref, vcf, bednum):
    """Create command to call crisp."""
    smallbams, smallcmds = get_small_bam_cmds(bamfiles, bednum, bedfile)
    bams = ' --bam '.join(smallbams)
    poolsize = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    logfile = vcf.replace(".vcf", ".log")
    convertfile = vcf.replace(".vcf", "_converted.vcf")
    cmds = smallcmds + f'''module load python/2.7.14
$CRISP_DIR/CRISP --bam {bams} --ref {ref} --VCF {vcf} \
--poolsize {poolsize} --mbq 20 --minc 5 --bed {bedfile} > {logfile}
touch $SLURM_TMPDIR/bam_file_list.txt
# assumes equal pool sizes
$CRISP_DIR/scripts/convert_pooled_vcf.py {vcf} $SLURM_TMPDIR/bam_file_list.txt \
{poolsize} > {convertfile}
module unload python
'''
    return (cmds, convertfile, logfile)
def checkjobs():
    """
    Make sure previous realigned bamfiles were created without error.

    Avoids unintentionally combining a subset of all final expected files.

    Calls:
    getfiles from start_crispANDvarscan
    """
    print('checking jobs')
    # note: pooldir, grep, and program are expected to be defined at module scope
    parentdir = op.dirname(pooldir)
    pool = op.basename(pooldir)
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    samps = fs(op.join(op.dirname(ref),
                       'bedfiles_%s' % op.basename(ref).split(".fa")[0]))
    shdir = op.join(pooldir, 'shfiles/crispANDvarscan')
    # files = {f.sh: f.out, ...}
    files = getfiles(samps, shdir, f"{grep}-{program}")
    return files
def get_varscan_cmd(bamfiles, bedfile, bednum, vcf, ref, pooldir, program):
    """Create command to call varscan."""
    smallbams, smallcmds = get_small_bam_cmds(bamfiles, bednum, bedfile)
    smallbams = ' '.join(smallbams)
    # note: parentdir and pool are expected to be defined at module scope
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    # if single-sample then set minfreq to 0, else use min possible allele freq
    minfreq = 1 / sum(ploidy.values()) if len(ploidy.keys()) > 1 else 0
    cmd = f'''samtools mpileup -B -f {ref} {smallbams} | java -Xmx15g -jar \
$VARSCAN_DIR/VarScan.v2.4.3.jar mpileup2cns --min-coverage 8 --p-value 0.05 \
--min-var-freq {minfreq} --strand-filter 1 --min-freq-for-hom 0.80 \
--min-avg-qual 20 --output-vcf 1 > {vcf}
module unload samtools
'''
    # final vcf
    outdir = makedir(op.join(pooldir, program))
    finalvcf = op.join(outdir, op.basename(vcf))
    cmds = smallcmds + cmd
    return (cmds, finalvcf)
def get_varscan_cmd(bamfiles, bedfile, bednum, vcf, ref):
    """Create command to call varscan."""
    smallbams, smallcmds = get_small_bam_cmds(bamfiles, bednum, bedfile)
    smallbams = ' '.join(smallbams)
    # note: parentdir and pool are expected to be defined at module scope
    ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    # if single-sample then set minfreq to 0, else use min possible allele freq
    minfreq = 1 / (ploidy * len(bamfiles)) if len(bamfiles) > 1 else 0
    # if single-sample then use pileup2cns, else use mpileup2snp
    # tool = 'mpileup2cns' if len(bamfiles) > 1 else 'pileup2cns'
    tool = 'mpileup2cns'
    # --strand-filter not mentioned in docs for pileup2cns
    strand_filter = '' if tool == 'pileup2cns' else '--strand-filter 1'
    cmd = f'''samtools mpileup -B -f {ref} {smallbams} | java -Xmx15g -jar \
$VARSCAN_DIR/VarScan.v2.4.3.jar {tool} --min-coverage 8 --p-value 0.05 \
--min-var-freq {minfreq} {strand_filter} --min-freq-for-hom 0.80 \
--min-avg-qual 20 --output-vcf 1 > {vcf}
module unload samtools'''
    cmds = smallcmds + cmd
    return (cmds, vcf)
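# --- illustration (not pipeline code) ---------------------------------------
# Hedged sketch contrasting the two --min-var-freq computations above,
# with hypothetical numbers. With multiple bams the floor is one haploid
# genome across the pool; a single sample gets no frequency floor at all.
def minfreq(ploidy, n_bams):
    return 1 / (ploidy * n_bams) if n_bams > 1 else 0

assert minfreq(2, 24) == 1 / 48   # ~0.0208 with 24 hypothetical diploid bams
assert minfreq(2, 1) == 0         # single sample: no floor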
def get_varscan_names(df, pooldir):
    """Convert generic sample/pool names from varscan to something meaningful."""
    print('renaming varscan columns ...')
    # get order of samps used to create varscan cmds (same order as datatable)
    pool = op.basename(pooldir)
    samps = pklload(op.join(op.dirname(pooldir), 'poolsamps.pkl'))[pool]
    # create a list of names that varscan gives by default
    generic = ['Sample%s' % (i + 1) for i in range(len(samps))]
    # create a map between generic and true samp names
    dic = dict((gen, samp) for (gen, samp) in zip(generic, samps))
    # rename the columns in df
    cols = []
    for col in df:
        if '.' in col:
            gen, rest = col.split(".")
            samp = dic[gen]
            col = '.'.join([samp, rest])
        cols.append(col)
    df.columns = cols
    return df
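# --- illustration (not pipeline code) ---------------------------------------
# Minimal sketch of the SampleN -> real-name renaming in get_varscan_names(),
# with made-up sample names. VarScan emits columns like 'Sample1.FREQ'.
samps = ['pop1', 'pop2']
generic = ['Sample%s' % (i + 1) for i in range(len(samps))]
dic = dict(zip(generic, samps))
gen, rest = 'Sample2.FREQ'.split(".")
assert '.'.join([dic[gen], rest]) == 'pop2.FREQ'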
### purpose
# use picard to mark/remove duplicates, build bam index for GATK
###

### usage
# 03_mark_build.py /path/to/sortfile /path/to/pooldir/
###
"""

import sys, os, balance_queue, subprocess, shutil
from os import path as op
from coadaptree import makedir, get_email_info, pklload

thisfile, pooldir, samp = sys.argv

sortfiles = pklload(op.join(pooldir, '%s_sortfiles.pkl' % samp))

# MarkDuplicates
dupdir = op.join(pooldir, '03_dedup_rg_filtered_indexed_sorted_bamfiles')
pool = op.basename(pooldir)
dupfile = op.join(dupdir, "%s_rd.bam" % samp)
dupflag = dupfile.replace(".bam", ".bam.flagstats")
dupstat = op.join(dupdir, "%s_rd_dupstat.txt" % samp)

# create sh file
email_text = get_email_info(op.dirname(pooldir), '03')
joined = ' I='.join(sortfiles)
text = f"""#!/bin/bash
#SBATCH --time=11:59:00
#SBATCH --mem=30000M
#SBATCH --ntasks=1
""" import os import sys import time import shutil import subprocess from os import path as op from coadaptree import fs, pklload, pkldump, get_email_info # args thisfile, pooldir, ref = sys.argv parentdir = op.dirname(pooldir) pool = op.basename(pooldir) f2samp = pklload(op.join(parentdir, 'f2samp.pkl')) adaptors = pklload(op.join(parentdir, 'adaptors.pkl')) bash_variables = op.join(parentdir, 'bash_variables') for arg, path in [('pooldir', pooldir), ('ref', ref)]: if not op.exists(path): print("The argument does not exist in the specified path:\narg = %s\npath =%s" % (arg, path)) sys.exit(1) # make some dirs shdir = op.join(pooldir, 'shfiles') shtrimDIR = op.join(shdir, '01_trimmed_shfiles') # cmd.sh files trimDIR = op.join(pooldir, '01_trimmed') # outfiles for d in [shtrimDIR, trimDIR]: if not op.exists(d): os.makedirs(d)
""" import os, sys, balance_queue, subprocess, shutil from os import path as op from coadaptree import makedir, pklload, get_email_info thisfile, pooldir, samp, dupfile = sys.argv # RealignerTargetCreator aligndir = op.join(pooldir, '04_realign') listfile = op.join(aligndir, '%s_realingment_targets.list' % samp) # get ref parentdir = op.dirname(pooldir) pool = op.basename(pooldir) ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool] email_text = get_email_info(parentdir, '04') text = '''#!/bin/bash #SBATCH --time=7-00:00:00 #SBATCH --mem=30000M #SBATCH --nodes=1 #SBATCH --ntasks=32 #SBATCH --cpus-per-task=1 #SBATCH --job-name=%(pool)s-%(samp)s-realign #SBATCH --output=%(pool)s-%(samp)s-realign_%%j.out %(email_text)s # realign using the GATK module load gatk/3.8 module load java
    return md5


def get_cmds(srcfiles, md5files, remotedir, createmd5):
    subcmds = []
    for src in srcfiles:
        if createmd5 is True:
            md5 = check_md5(src, md5files)
            md5dst = op.join(remotedir, op.basename(md5))
            subcmds.append(f'rsync -avz {hostname}:{md5} {md5dst}')
        dst = op.join(remotedir, op.basename(src))
        subcmds.append(f'rsync -avz {hostname}:{src} {dst}')
    return subcmds


pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys())
pooldirs = [op.join(parentdir, p) for p in pools]
newdirs = []  # keep track of directories to easily make on remote server
cmds = []  # keep track of all rsync commands

# get hostname (eg beluga, cedar, graham)
hostname = os.environ['CC_CLUSTER']

# add remote and subdirs to newdirs list
newdirs.append(remote)
for p in pooldirs:
    newdirs.append(op.join(remote, op.basename(p) + '-gatk'))

# get pkl files
print(Bcolors.BOLD + '\nBundling .pkl files ...' + Bcolors.ENDC)
pkls = [f for f in fs(parentdir) if f.endswith('.pkl')]
for p in pooldirs:
### usage
# 02_bwa-map_view_sort_index_flagstat.py parentdir samp
###

### assumes
# outfiles from "bwa index ref.fasta"
###
"""

import sys, os, subprocess, shutil
from os import path as op
from coadaptree import pklload, pkldump, get_email_info, makedir

# get argument inputs
thisfile, parentdir, samp = sys.argv
pool = pklload(op.join(parentdir, 'samp2pool.pkl'))[samp]
pooldir = op.join(parentdir, pool)
shdir = op.join(pooldir, 'shfiles')
ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
r1r2outs = pklload(op.join(pooldir, 'samp2_r1r2out.pkl'))[samp]
bash_variables = op.join(parentdir, 'bash_variables')

# create dirs
bwashdir = op.join(shdir, '02_bwa_shfiles')
samdir = op.join(pooldir, '02a_samfiles')
bamdir = op.join(pooldir, '02b_bamfiles')
sortdir = op.join(pooldir, '02c_sorted_bamfiles')
for d in [bwashdir, samdir, bamdir, sortdir]:
    makedir(d)

# get rginfo - THIS CAN STAY EVEN WITH SAMPS SEQUENCED MULTIPLE TIMES - RGID and RGPU are defined with file
# imports
import os, sys, json, pandas as pd
from tqdm import tqdm
from os import path as op
from collections import OrderedDict
from coadaptree import fs, uni, pklload

# args
thisfile, parentdir, engines = sys.argv
if parentdir.endswith("/"):
    parentdir = parentdir[:-1]

# reqs
print('getting reqs')
samp2pool = pklload(op.join(parentdir, 'samp2pool.pkl'))
pools = uni(list(samp2pool.values()))

# get a list of subdirectory pool dirs created earlier in pipeline
print('getting pooldirs')
pooldirs = []
for p in pools:
    pooldir = op.join(parentdir, p)
    pooldirs.append(pooldir)

# TRIMMING DATA
# get the json data from trimming
print('getting trim data')
data = {}
count = 0
for p in pooldirs:
def get_bedfiles(parentdir, pool):
    """Get a list of paths to all of the bed files for ref.fa."""
    ref = pklload(op.join(parentdir, 'poolref.pkl'))[pool]
    beddir = op.join(op.dirname(ref),
                     'bedfiles_%s' % op.basename(ref).split(".fa")[0])
    return [f for f in fs(beddir) if f.endswith('.bed')]
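# --- illustration (not pipeline code) ---------------------------------------
# For a hypothetical reference at /data/ref.fa, the bed directory resolved
# by get_bedfiles() above would be /data/bedfiles_ref.
from os import path as op

ref = '/data/ref.fa'
beddir = op.join(op.dirname(ref), 'bedfiles_%s' % op.basename(ref).split(".fa")[0])
assert beddir == '/data/bedfiles_ref'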
""" Create rsync command between src_server and remote_server. Create .md5 if necessary. """ subcmds = [] for src in srcfiles: if createmd5 is True: md5 = check_md5(src, md5files) md5dst = op.join(remotedir, op.basename(md5)) subcmds.append(f'rsync -azv {hostname}:{md5} {md5dst}') dst = op.join(remotedir, op.basename(src)) subcmds.append(f'rsync -azv {hostname}:{src} {dst}') return subcmds pools = list(pklload(op.join(parentdir, 'poolref.pkl')).keys()) pooldirs = [op.join(parentdir, p) for p in pools] newdirs = [] # keep track of directories to easily make on remote server cmds = [] # keep track of all rsync commands # get hostname (eg beluga, cedar, graham) hostname = os.environ['CC_CLUSTER'] # add remote and subdirs to newdirs list newdirs.append(remote) for p in pooldirs: newdirs.append(op.join(remote, op.basename(p))) # get pkl files print(Bcolors.BOLD + '\nBundling .pkl files ...' + Bcolors.ENDC)
def remove_repeats(snps, parentdir, snpspath, pool):
    """
    Remove SNPs that are found to be in repeat-masked regions.

    # assumes
    # that the positions have been translated BEFORE removing repeats
    # (it took forever to create unstitched repeat regions; rather than translate
    #  the repeat file, we can just use the unstitched chrom if the reference is stitched)
    # repeat file has a header ('CHROM', 'start', 'stop')
    # start and stop positions of repeat regions are 1-based
    """
    reppkl = op.join(parentdir, 'repeat_regions.pkl')
    if op.exists(reppkl):
        # read in repeat regions
        repeatdict = pklload(reppkl)
        if repeatdict[pool] is not None:
            # user selected that repeat filtering be applied to this pool
            print('Removing repeat regions ...')
            repeats = pd.read_csv(repeatdict[pool], sep='\t')
            # figure out if data is from stitched or not
            if 'unstitched_chrom' in snps.columns:
                # then the snps have been translated: stitched -> unstitched
                chromcol = 'unstitched_chrom'
                poscol = 'unstitched_pos'
                print('\tsnps have been translated')
            else:
                # otherwise SNPs were called on unstitched reference
                chromcol = 'CHROM'
                poscol = 'POS'
                print('\tsnps have not been translated')
            # reduce repeats to the chroms that matter (helps speed up lookups)
            repeats = repeats[repeats['CHROM'].isin(snps[chromcol].tolist())].copy()
            # isolate SNPs in repeat regions
            repeat_snps = []
            for chrom in tqdm(uni(snps[chromcol])):
                reps = repeats[repeats['CHROM'] == chrom].copy()
                mysnps = snps[snps[chromcol] == chrom].copy()
                if len(reps.index) > 0 and len(mysnps.index) > 0:
                    for row in mysnps.index:
                        pos = snps.loc[row, poscol]  # index is maintained from snps to mysnps
                        df = reps[reps['stop'].astype(int) >= int(pos)].copy()
                        df = df[df['start'].astype(int) <= int(pos)].copy()
                        if len(df.index) > 0:
                            assert len(df.index) == 1
                            repeat_snps.append(row)
            # save repeats
            print(f'\tSaving {len(repeat_snps)} repeat regions')
            repeat_path = snpspath.replace(".txt", "_REPEATS.txt")
            myrepeats = snps[snps.index.isin(repeat_snps)].copy()
            myrepeats = mark_nas(myrepeats, 'repeat SNPs')
            myrepeats.to_csv(repeat_path, sep='\t', index=False)
            # remove SNPs in repeat regions
            snps = snps[~snps.index.isin(repeat_snps)].copy()
            snps.index = range(len(snps.index))
    print(f'{op.basename(snpspath)} has {len(snps.index)} SNPs outside of repeat regions')
    return snps
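# --- illustration (not pipeline code) ---------------------------------------
# Toy example (fabricated coordinates) of the 1-based, inclusive interval
# test used in remove_repeats(): a SNP is masked when start <= POS <= stop.
import pandas as pd

reps = pd.DataFrame({'start': [100, 500], 'stop': [200, 600]})
pos = 150
df = reps[reps['stop'].astype(int) >= int(pos)]
df = df[df['start'].astype(int) <= int(pos)]
assert len(df.index) == 1  # pos 150 falls inside [100, 200] only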
# """ ### imports import sys, os, pickle, subprocess from os import path as op import numpy as np from coadaptree import fs, createdirs, pklload, get_email_info from genotyping_scheduler import startscheduler, bigbrother, delsched ### ### args thisfile, parentdir = sys.argv if parentdir.endswith("/"): parentdir = parentdir[:-1] poolref = pklload(op.join(parentdir, 'poolref.pkl')) email_info = get_email_info(parentdir, 'final') bash_variables = op.join(parentdir, 'bash_variables') maf = pklload(op.join(parentdir, 'maf.pkl')) ### # make a reservation file so other jobs don't call 05.py resfile = op.join(parentdir, 'shfiles/06_reservation.txt') if not op.exists(resfile): startscheduler(resfile) else: print('06.py was running') bigbrother(resfile, DIR=None) ### dirs shdir = op.join(parentdir, 'shfiles/concat')