Esempio n. 1
0
def repair_task_list():
    """Yield the job parameter sets for the repair step.

    File lists are built with taskHelpers.CreateFileList and combined by
    taskHelpers.CreateTaskList; every entry of that result is yielded in
    order.
    """
    (sentinel_dir, _results_dir, _haplotype_dir, _cancer_dir,
     tmpbams_dir, _finalbams_dir) = taskHelpers.GetProjectNamePathRunID()

    # Sentinel of the preceding (find-ROI) step, handed over as the
    # "previous sentinel" argument of CreateTaskList.
    upstream_sentinels = [
        taskHelpers.CreateFileList('{0}_findroi.sentinel', 1, sentinel_dir)
    ]
    step_sentinels = taskHelpers.CreateFileList('{0}_repair.sentinel', 1,
                                                sentinel_dir)

    # Sorted ROI BAMs go in, repaired BAMs come out (both for "gain").
    bam_inputs = [
        taskHelpers.CreateFileList('{0}.{1}.roi.sorted.bam', 88, tmpbams_dir,
                                   "gain")
    ]
    bam_outputs = [
        taskHelpers.CreateFileList('{0}.{1}.repaired.bam', 88, tmpbams_dir,
                                   "gain")
    ]

    ids = taskHelpers.CreateFileList('{0}', 1, '')
    for task in taskHelpers.CreateTaskList(bam_inputs, step_sentinels,
                                           bam_outputs, ids,
                                           upstream_sentinels):
        yield task
Esempio n. 2
0
def find_roi_bam_task_list():
    """Yield the job parameter sets for the find-ROI step.

    Inputs are the by-name BAMs under ``<results_path>/splitbams``; outputs
    are ROI BAMs under the temporary BAM directory.
    """
    (sentinel_dir, results_dir, _haplotype_dir, _cancer_dir,
     tmpbams_dir, _finalbams_dir) = taskHelpers.GetProjectNamePathRunID()
    split_dir = "/".join([results_dir, "splitbams"])

    # Sentinel of the preceding (sort-by-name) step.
    upstream_sentinels = [
        taskHelpers.CreateFileList('{0}_sortn.sentinel', 1, sentinel_dir)
    ]
    step_sentinels = taskHelpers.CreateFileList('{0}_findroi.sentinel', 1,
                                                sentinel_dir)

    bam_inputs = [
        taskHelpers.CreateFileList('{0}.byname.bam', 22, split_dir,
                                   "extractROI")
    ]
    # Original author's note: max number of outputs is
    # chr*events*haplotypes (8 for 2 chromosomes).
    bam_outputs = [
        taskHelpers.CreateFileList('{0}.{1}.roi.bam', 88, tmpbams_dir,
                                   "extractROI")
    ]

    ids = taskHelpers.CreateFileList('{0}', 1, '')
    for task in taskHelpers.CreateTaskList(bam_inputs, step_sentinels,
                                           bam_outputs, ids,
                                           upstream_sentinels):
        yield task
Esempio n. 3
0
def complete_pipeline_gain_task_list():
    """Yield the job parameter sets for the final sort/merge step.

    Inputs are the per-event BAMs in the final BAM directory; the single
    output is named by params.GetOutputFileName().
    """
    (sentinel_dir, _results_dir, _haplotype_dir, _cancer_dir,
     _tmpbams_dir, finalbams_dir) = taskHelpers.GetProjectNamePathRunID()

    # Sentinel of the preceding (subsample gain) step.
    upstream_sentinels = [
        taskHelpers.CreateFileList('{0}_subsample_gain.sentinel', 1,
                                   sentinel_dir)
    ]
    step_sentinels = taskHelpers.CreateFileList('{0}_sortmerge.sentinel', 1,
                                                sentinel_dir)

    bam_inputs = [
        taskHelpers.CreateFileList('{0}_{1}_{2}.bam', 88, finalbams_dir,
                                   "FINAL")
    ]
    bam_outputs = [
        taskHelpers.CreateFileList(params.GetOutputFileName(), 1,
                                   finalbams_dir)
    ]

    ids = taskHelpers.CreateFileList('{0}', 1, '')
    for task in taskHelpers.CreateTaskList(bam_inputs, step_sentinels,
                                           bam_outputs, ids,
                                           upstream_sentinels):
        yield task
Esempio n. 4
0
def sort_by_name_task_list():
    """Yield the job parameter sets for the sort-by-name step.

    Both inputs (``chr{1}.bam``) and outputs (``chr{1}.byname.bam``) live in
    ``<results_path>/splitbams/``.
    """
    (sentinel_dir, results_dir, _haplotype_dir, _cancer_dir,
     _tmpbams_dir, _finalbams_dir) = taskHelpers.GetProjectNamePathRunID()
    split_dir = "/".join([results_dir, "splitbams"])

    # Sentinel of the preceding (split) step.
    upstream_sentinels = [
        taskHelpers.CreateFileList('{0}_split.sentinel', 1, sentinel_dir)
    ]
    step_sentinels = taskHelpers.CreateFileList('{0}_sortn.sentinel', 1,
                                                sentinel_dir)

    bam_inputs = [
        taskHelpers.CreateFileList('chr{1}.bam', 22, split_dir + "/")
    ]
    bam_outputs = [
        taskHelpers.CreateFileList('chr{1}.byname.bam', 22, split_dir + "/")
    ]

    ids = taskHelpers.CreateFileList('{0}', 1, '')
    for task in taskHelpers.CreateTaskList(bam_inputs, step_sentinels,
                                           bam_outputs, ids,
                                           upstream_sentinels):
        yield task
Esempio n. 5
0
def subsample_loss_task_list():
    """Yield the job parameter sets for the loss-event subsampling step.

    Inputs are the mutated, merged and sorted loss BAMs in the temporary
    BAM directory.
    """
    (sentinel_dir, _results_dir, _haplotype_dir, _cancer_dir,
     tmpbams_dir, _finalbams_dir) = taskHelpers.GetProjectNamePathRunID()

    # Sentinel of the preceding (mutate loss) step.
    upstream_sentinels = [
        taskHelpers.CreateFileList('{0}_mutate_loss.sentinel', 1,
                                   sentinel_dir)
    ]
    step_sentinels = taskHelpers.CreateFileList('{0}_subsample_loss.sentinel',
                                                1, sentinel_dir)

    bam_inputs = [
        taskHelpers.CreateFileList('{0}.{1}.mutated.merged.sorted.bam', 12,
                                   tmpbams_dir, "loss")
    ]
    # NOTE(review): the output pattern says "_GAIN" although this is the
    # loss task list -- kept as found; confirm whether this is intended.
    bam_outputs = [
        taskHelpers.CreateFileList('{0}{1}_GAIN.bam', 88, tmpbams_dir,
                                   "loss")
    ]

    ids = taskHelpers.CreateFileList('{0}', 1, '')
    for task in taskHelpers.CreateTaskList(bam_inputs, step_sentinels,
                                           bam_outputs, ids,
                                           upstream_sentinels):
        yield task
Esempio n. 6
0
def subsample_loss(inputs, output_sentinel, outputs, sample_id, prev_sentinel):
    """Write and launch one subsampling shell script per loss-event BAM.

    Args:
        inputs: sequence whose first element is the iterable of input BAM
            paths to process.
        output_sentinel: forwarded to pipelineHelpers.CheckTaskStatus.
        outputs: not used by this function; part of the shared task
            signature.
        sample_id: sample identifier, used in log messages, for the script
            directory lookup and when launching the task.
        prev_sentinel: checked with pipelineHelpers.CheckSentinel; nothing
            is launched unless that check passes.
    """
    task_list = []
    log_msg = ' [subsample loss events] ' + '[' + sample_id + '] '
    if pipelineHelpers.CheckSentinel(prev_sentinel, log, log_msg):
        pipelineHelpers.Logging('INFO', log, log_msg + 'Starting')
        current_path = params.GetProgramPath()
        script_path = pipelineHelpers.GetScriptPath(
            sample_id, bamhelp.name)
        bamgineer_mem = bamhelp.GetBamgineerMem('med')
        # The project paths do not depend on the input, so resolve them once
        # instead of once per BAM.
        (sentinel_path, results_path, haplotype_path, cancer_dir_path,
         tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()

        for inp in inputs[0]:
            # The chromosome id is the first run of digits in the part of the
            # file name that precedes the first underscore.
            chrevent = os.path.basename(inp).strip().split("_")[0]
            chrom = re.split(r'(\d+)', chrevent)[1]
            loss_final = "/".join(
                [finalbams_path, 'CHR' + str(chrom).upper() + '_LOSS.bam'])

            script_name = '{0}sample_{1}_{2}.sh'.format(
                script_path, 'chr' + str(chrom), "loss")
            # "with" guarantees the script is flushed and closed even when a
            # write fails, before the task is launched.
            with open(script_name, 'w') as script:
                script.write('#!/bin/bash\n')
                script.write('#\n')
                script.write('#$ -cwd \n')
                script.write('module load samtools/1.2 \n')
                script.write('python {path}/subsample_loss.py {inbam} {fl} \n'.format(
                    path=current_path, inbam=inp, fl=loss_final))

            process = pipelineHelpers.RunTask(
                os.path.abspath(script_name), 4, bamgineer_mem,
                sample_id, bamhelp.name)
            task_list.append(process)

            # NOTE(review): the status check runs after every launch (inside
            # the loop), unlike find_roi_bam which checks once after the
            # loop. Kept as found because moving it may change how jobs are
            # scheduled -- confirm the intent.
            pipelineHelpers.CheckTaskStatus(
                task_list, output_sentinel, log, log_msg)
    pipelineHelpers.Logging('INFO', log, log_msg + 'Finished Sampling Loss Event')
Esempio n. 7
0
def mutate_loss(inputs, output_sentinel, outputs, sample_id, prev_sentinel):
    """Write and launch one mutation shell script per loss-event ROI BAM.

    Each generated script de-duplicates the heterozygous-SNP bed file, runs
    mutate.py, sorts the mutated BAM, diffs it against the input BAM, merges
    the results and removes the intermediates.

    Args:
        inputs: sequence whose first element is the iterable of input BAM
            paths (names ending in ``.roi.sorted.bam``).
        output_sentinel: forwarded to pipelineHelpers.CheckTaskStatus.
        outputs: not used by this function; part of the shared task
            signature.
        sample_id: sample identifier, used in log messages, for the script
            directory lookup and when launching the task.
        prev_sentinel: checked with pipelineHelpers.CheckSentinel; nothing
            is launched unless that check passes.
    """
    task_list = []
    log_msg = ' [implement_cnv] ' + '[' + sample_id + '] '

    if pipelineHelpers.CheckSentinel(prev_sentinel, log, log_msg):
        pipelineHelpers.Logging('INFO', log, log_msg + 'Starting')

        current_path = params.GetProgramPath()
        script_path = pipelineHelpers.GetScriptPath(
            sample_id, bamhelp.name)
        bamgineer_mem = bamhelp.GetBamgineerMem('med')
        (sentinel_path, results_path, haplotype_path, cancer_dir_path,
         tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()

        for inp in inputs[0]:
            bam_name = os.path.basename(inp)
            # The text before the first dot of the file name identifies the
            # chromosome and selects the matching bed file.
            chrom = bam_name.strip().split(".")[0]

            bedfn = "/".join([haplotype_path, 'loss_het_snp_' + chrom + '.bed'])
            diffn = "/".join([tmpbams_path, "diff.bam"])
            nonhet = "/".join([tmpbams_path, 'diff_only1_' + bam_name])
            hetfn = sub('.roi.sorted.bam$', ".mutated.het.bam", inp)
            hetfnsorted = sub('.roi.sorted.bam$', ".mutated.het.sorted.bam", inp)
            mergedsortfn = sub('.roi.sorted.bam$', ".mutated.merged.sorted.bam", inp)

            script_name = '{0}mutate_{1}_{2}.sh'.format(script_path, chrom, "loss")
            # "with" guarantees the script is flushed and closed even when a
            # write fails, before the task is launched.
            with open(script_name, 'w') as script:
                script.write('#!/bin/bash\n')
                # The newline matters: without it the next line became
                # "##$ -cwd", which is no longer a "#$" scheduler directive.
                script.write('#\n')
                script.write('#$ -cwd \n')
                script.write('module load samtools/1.2 \n')
                script.write('module load sambamba \n')
                script.write('module load bamUtil \n')

                script.write('sort -u {bf} -o {bf}\n\n'.format(bf=bedfn))
                script.write('python {path}/mutate.py {inp1} {bf} {happath}\n\n'.format(
                    inp1=inp, bf=bedfn, path=current_path, happath=haplotype_path))
                script.write('sambamba sort {het} -o {hetsort}\n\n'.format(
                    het=hetfn, hetsort=hetfnsorted))
                script.write('bam diff --in1 {repairedbam} --in2 {hetsort} --out {dif}\n\n'.format(
                    repairedbam=inp, hetsort=hetfnsorted, dif=diffn))
                script.write('sambamba merge {merged} {hetonly} {nonhetonly}\n\n'.format(
                    merged=mergedsortfn, hetonly=hetfnsorted, nonhetonly=nonhet))
                script.write('rm {het} {nonhetonly}  \n\n'.format(
                    het=hetfn, nonhetonly=nonhet))

            process = pipelineHelpers.RunTask(
                os.path.abspath(script_name), 4, bamgineer_mem,
                sample_id, bamhelp.name)
            task_list.append(process)

            # NOTE(review): the status check runs after every launch (inside
            # the loop). Every script writes the same diff.bam path, so this
            # placement may be what keeps the jobs from overlapping -- kept
            # as found; confirm before moving it out of the loop.
            pipelineHelpers.CheckTaskStatus(
                task_list, output_sentinel, log, log_msg)
    pipelineHelpers.Logging('INFO', log, log_msg + 'Finished Mutating')
Esempio n. 8
0
def find_roi_bam(inputs, output_sentinel, outputs, sample_id, prev_sentinel):
    """Write and launch one ROI-extraction shell script per input/output pair.

    Each generated script de-duplicates the exon bed file, extracts the
    read pairs overlapping it with ``bedtools pairtobed``, sorts the result
    with sambamba and removes the unsorted BAM. All tasks are launched
    first; their status is checked once afterwards.

    Args:
        inputs: sequence whose first element is the iterable of input BAM
            paths.
        output_sentinel: forwarded to pipelineHelpers.CheckTaskStatus.
        outputs: sequence whose first element is the iterable of output BAM
            paths, paired positionally with the inputs. The chromosome and
            event are taken from the first two dot-separated fields of each
            output file name.
        sample_id: sample identifier, used in log messages, for the script
            directory lookup and when launching the task.
        prev_sentinel: checked with pipelineHelpers.CheckSentinel; nothing
            is launched unless that check passes.
    """
    task_list = []
    log_msg = ' [FindRoiBam] ' + '[' + sample_id + '] '

    pipelineHelpers.Logging('INFO', log, log_msg + 'Starting')
    if pipelineHelpers.CheckSentinel(prev_sentinel, log, log_msg):
        script_path = pipelineHelpers.GetScriptPath(
            sample_id, bamhelp.name)
        bamgineer_mem = bamhelp.GetBamgineerMem('med')
        (sentinel_path, results_path, haplotype_path, cancer_dir_path,
         tmpbams_path, finalbams_path) = taskHelpers.GetProjectNamePathRunID()

        for inp, op in izip(inputs[0], outputs[0]):
            opsorted = sub('.bam$', ".sorted.bam", op)
            # Split the output file name once; field 0 is the chromosome and
            # field 1 the event.
            name_fields = os.path.basename(op).strip().split(".")
            chrom = name_fields[0]
            event = name_fields[1]
            exonsinroibed = "/".join(
                [haplotype_path, event + "_exons_in_roi_" + str(chrom) + '.bed'])

            script_name = '{0}find_roi_{1}_{2}.sh'.format(script_path,
                                                          chrom, event)
            # "with" guarantees the script is flushed and closed even when a
            # write fails, before the task is launched.
            with open(script_name, 'w') as script:
                script.write('#!/bin/bash\n\n')
                script.write('#\n')
                script.write('#$ -cwd \n')
                script.write('module load bedtools \n')
                script.write('module load sambamba \n')

                script.write('sort -u {exonbed} -o {exonbed} \n'.format(
                    exonbed=exonsinroibed))
                script.write('bedtools pairtobed -abam {inp} '
                             '-b {bf} -type either > {outp} \n'.format(
                                 inp=inp, bf=exonsinroibed, outp=op))
                script.write('sambamba sort {outp} -o '
                             '{outpsorted} \n'.format(outp=op,
                                                      outpsorted=opsorted))
                script.write('rm {outp} \n'.format(outp=op))

            process = pipelineHelpers.RunTask(
                os.path.abspath(script_name), 1, bamgineer_mem,
                sample_id, bamhelp.name)

            task_list.append(process)
        pipelineHelpers.CheckTaskStatus(
            task_list, output_sentinel, log, log_msg)
    pipelineHelpers.Logging('INFO', log, log_msg + 'Finished FindROI')
Esempio n. 9
0
import itertools
import re
import subprocess

# The four base letters. Assigning at module level already creates a
# module global, so no `global` statement is needed here.
bases = ('A', 'T', 'C', 'G')

# Module-level log handle obtained from pipelineHelpers for the name
# 'Bamgineer'.
log = pipelineHelpers.GetLogFile('Bamgineer')
# NOTE(review): these imports sit below executable statements; whether that
# order is required cannot be told from here, so it is left untouched.
import utils
import vcf
import gzip
import shutil
# Integers 1 through 22 inclusive -- presumably the autosome numbers; TODO
# confirm.
chr_list = range(1,23)
# The two event labels used by this module.
event_list=['gain','loss']

# Project directories, resolved once when the module is imported.
sentinel_path, results_path,haplotype_path,cancer_dir_path,tmpbams_path,finalbams_path = taskHelpers.GetProjectNamePathRunID()


def initPool(queue, level, terminating_):
    """
    Pool initializer: apply *level* to the root logger and publish
    *terminating_* as the module-level name ``terminating``.

    *queue* is accepted but not used by this function.
    """
    global terminating
    root_logger = logging.getLogger('')
    root_logger.setLevel(level)
    terminating = terminating_


def initialize():
    try:
        utils.createDirectory(results_path)