def run_rmarkdown(start=0, sample="", theme="", workdir=None, outdir=None, timeout=TIMEOUT, nthreads=1, kobas=True):
    """Create and render the R Markdown report for ``sample``.

    The annotation results under ``<outdir>/annotation`` decide the value of
    the ``-e`` (engine) flag handed to ``rmd_creator.py``:

    * ``"0"`` when both ``<sample>-go.tsv`` and ``<sample>-ko.tsv`` exist;
    * ``"1"`` when both ``<sample>-sma3s-summary.tsv`` and
      ``<sample>-sma3s-table.tsv`` exist.

    If neither pair is complete, an error is logged and the process is
    terminated with ``os._exit(-1)``.

    Args:
        start: Steps with an index lower than ``start`` are skipped (this
            function has a single step, index 0).
        sample: Sample name; used in file names and log messages.
        theme: Value passed to ``rmd_creator.py`` via ``-m``.
        workdir: Working directory; ``report.log`` is written here.
        outdir: Output directory holding ``annotation/`` and receiving the
            rendered report.
        timeout: Timeout forwarded to ``TimedExternalCmd.run``.
        nthreads: Exported to the command as ``OMP_NUM_THREADS``.
        kobas: Truthy -> ``-k 1``, falsy -> ``-k 0``.

    Returns:
        None.
    """
    logger.info("Get rmarkdown report for %s" % sample)
    report_log = os.path.join(workdir, "report.log")
    step = 0
    msg = "Get report for %s" % sample

    annotation_dir = os.path.join(outdir, "annotation")
    sma3s_summary = "%s/%s-sma3s-summary.tsv" % (annotation_dir, sample)
    sma3s_table = "%s/%s-sma3s-table.tsv" % (annotation_dir, sample)
    sma3s_go = "%s/%s-go.tsv" % (annotation_dir, sample)
    sma3s_ko = "%s/%s-ko.tsv" % (annotation_dir, sample)

    report_log_fd = open(report_log, "w")
    try:
        # Each file of a pair must be tested on its own:
        # os.path.exists(a and b) would only ever look at ``b``.
        if os.path.exists(sma3s_go) and os.path.exists(sma3s_ko):
            engine = "0"
        elif os.path.exists(sma3s_summary) and os.path.exists(sma3s_table):
            engine = "1"
        else:
            logger.error("The files of annotation did not existed!")
            # Hard exit kept for backward compatibility with existing callers.
            os._exit(-1)

        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            # The command line needs the flag as a string.
            kobas_flag = "1" if kobas else "0"
            command = (
                "python /opt/Auxtools/rmd_creator.py -i %(in)s -w %(work)s "
                "-t /opt/Auxtools/rmarkdown/template.Rmd "
                "-s %(sam)s -o %(in)s/%(sam)s.Rmd -m %(theme)s -k %(kobas)s -e %(engine)s && "
                "R -e 'rmarkdown::render(\\\"%(in)s/%(sam)s.Rmd\\\")' && "
                "rm %(in)s/%(sam)s.Rmd"
            ) % {
                'in': outdir,
                'work': workdir,
                'sam': sample,
                'theme': theme,
                'kobas': kobas_flag,
                'engine': engine
            }
            logger.info(command)
            # The escaped double quotes above survive this outer quoting layer.
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True,
                                   env_dict={"OMP_NUM_THREADS": str(nthreads)})
            cmd.run(cmd_log_fd_out=report_log_fd, cmd_log=report_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
    finally:
        report_log_fd.close()
def run_fusioncatcher(data_dir="", input="", start=0, fusioncatcher=FUSIONCATCHER, fusioncatcher_opts="",
                      sample="", nthreads=1, workdir=None, outdir=None, timeout=TIMEOUT):
    """Run FusionCatcher for ``sample`` and copy its candidate list to ``outdir``.

    Args:
        data_dir: FusionCatcher data directory (``-d``); must exist.
        input: Input passed to FusionCatcher via ``-i``.
        start: Value passed to FusionCatcher via ``--start``.
        fusioncatcher: FusionCatcher executable.
        fusioncatcher_opts: Extra command-line options. When ``nthreads > 1``
            and the options contain no ``-p `` flag, ``-p <nthreads>`` is
            appended.
        sample: Sample name; used for sub-directories and log messages.
        nthreads: Number of threads.
        workdir: Work root; results go to ``<workdir>/fusioncatcher/<sample>``.
        outdir: Output root; the final list is copied to
            ``<outdir>/fusioncatcher/<sample>``.
        timeout: Timeout forwarded to ``TimedExternalCmd.run``.

    Returns:
        Path of ``final-list_candidate-fusion-genes.txt`` in the output
        directory, or ``""`` if that file does not exist afterwards.

    Raises:
        Exception: if ``data_dir`` does not exist.
    """
    logger.info("Running RNA fusion detection (FusionCatcher) for %s" % sample)
    if not os.path.exists(data_dir):
        logger.error("Aborting!")
        raise Exception("No data directory %s" % data_dir)

    work_fusioncatcher = os.path.join(workdir, "fusioncatcher", sample)
    create_dirs([work_fusioncatcher])
    fusioncatcher_log = os.path.join(work_fusioncatcher, "fusioncatcher.log")

    if nthreads > 1 and "-p " not in fusioncatcher_opts:
        fusioncatcher_opts += " -p %d" % nthreads

    out_fusioncatcher = os.path.join(outdir, "fusioncatcher", sample)
    result_name = "final-list_candidate-fusion-genes.txt"

    with open(fusioncatcher_log, "w") as fusioncatcher_log_fd:
        msg = "Run FusionCatcher for %s" % sample
        # The options (user-supplied plus the thread flag) must be part of the
        # command line; previously they were computed and then dropped.
        command = "%s -d %s -i %s --start %d -o %s %s" % (
            fusioncatcher, data_dir, input, start, work_fusioncatcher, fusioncatcher_opts)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        # cmd_log takes the log path here, matching the copy step below.
        cmd.run(cmd_log_fd_out=fusioncatcher_log_fd, cmd_log=fusioncatcher_log, msg=msg, timeout=timeout)

        create_dirs([out_fusioncatcher])
        msg = "Copy predictions to output directory for %s." % sample
        if os.path.exists("%s/%s" % (work_fusioncatcher, result_name)):
            command = "cp %s/%s %s/%s" % (work_fusioncatcher, result_name, out_fusioncatcher, result_name)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=fusioncatcher_log_fd, cmd_log=fusioncatcher_log, msg=msg, timeout=timeout)

    fusions = ""
    if os.path.exists("%s/%s" % (out_fusioncatcher, result_name)):
        logger.info("FusionCatcher was successfull!")
        logger.info("Output fusions: %s/%s" % (out_fusioncatcher, result_name))
        fusions = "%s/%s" % (out_fusioncatcher, result_name)
    else:
        logger.info("FusionCatcher failed!")
    return fusions
def run_age_single(intervals_bed=None, region_list=[], contig_dict={}, reference=None, assembly=None, pad=AGE_PAD,
                   age=None, truncation_pad_read_age=AGE_TRUNCATION_PAD,
                   max_interval_len_truncation_age=AGE_MAX_INTERVAL_TRUNCATION,
                   dist_to_expected_bp=AGE_DIST_TO_BP, min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH,
                   min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH, age_window=AGE_WINDOW_SIZE,
                   age_workdir=None, timeout=AGE_TIMEOUT, keep_temp=False, myid=0):
    """Align assembled contigs to the reference with AGE and emit breakpoints as BED.

    NOTE(review): this file defines ``run_age_single`` a second time further
    down; at import time the later definition replaces this one.

    For each region in ``region_list`` that has contigs in ``contig_dict``:
    the reference sequence padded by ``pad`` is written to ``age_workdir``,
    every contig is written next to it and aligned with the ``age``
    executable, the parsed ``AgeRecord`` objects are de-duplicated and handed
    to ``process_age_records``, and the resulting breakpoint fields are
    appended to the matching interval from ``intervals_bed``.

    Args:
        intervals_bed: BED file whose intervals are matched to regions by
            chrom/start/end.
        region_list: Regions to process. Each is indexed ``[0]``..``[3]`` and
            used as a key of ``contig_dict``. Never mutated here.
        contig_dict: Maps a region to its contigs; contigs expose
            ``raw_name``, ``sequence_len`` and ``sv_type``. Never mutated here.
        reference: Reference FASTA path.
        assembly: Assembly FASTA path holding the contig sequences.
        pad: Padding added on both sides of the region when fetching the
            reference.
        age: AGE executable.
        truncation_pad_read_age, max_interval_len_truncation_age,
        dist_to_expected_bp: Control truncation of the middle of the
            reference for INV/DEL/DUP regions longer than
            ``max_interval_len_truncation_age``.
        min_del_subalign_len, min_inv_subalign_len, age_window: Forwarded to
            ``process_age_records``.
        age_workdir: Directory for all temporary and output files.
        timeout: Timeout for each AGE invocation.
        keep_temp: When false, temporary FASTA/err files are deleted.
        myid: Used in the output file name ``<myid>_breakpoints.bed``.

    Returns:
        Path of the written breakpoints BED file, or ``None`` when no interval
        was produced.

    Raises:
        Exception: any exception raised while processing is logged, its
            traceback printed, and then re-raised unchanged.
    """
    thread_logger = logging.getLogger("%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly) if assembly else None
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))

    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1], region[3])
            matching_intervals = [interval for interval in intervals_bedtool if (
                interval.start == bedtools_interval.start and interval.end == bedtools_interval.end
                and interval.chrom == bedtools_interval.chrom)]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" % (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" % (str(matching_interval)))

            # Soft-clip locations are optional: any failure to decode them
            # simply leaves the list empty. list() forces the conversion to
            # happen inside the try on Python 3 as well.
            sc_locations = []
            try:
                sc_locations = list(map(int, json.loads(
                    base64.b64decode(matching_interval.name.split(",")[0]))["SC_LOCATIONS"].split(",")))
            except Exception:
                pass

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2], region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error("Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(reference=region_object.chrom1,
                                                       start=region_object.pos1 - pad,
                                                       end=region_object.pos2 + pad)
            region_name = "%s.%d.%d" % (region_object.chrom1, region_object.pos1, region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" % region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name, reference_sequence))

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" % (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembeled sequence %s of length %s" % (contig.raw_name, contig.sequence_len))

                tr_region = []
                if region_object.length() > max_interval_len_truncation_age and contig.sv_type in ["INV", "DEL", "DUP"]:
                    # For large SVs the middle of the reference has no effect
                    # on genotyping, so it is cut out to speed AGE up.
                    thread_logger.info("Truncate the reference sequence.")

                    truncate_start = pad + dist_to_expected_bp + truncation_pad_read_age + 1
                    truncate_end = len(reference_sequence) - (pad + dist_to_expected_bp + truncation_pad_read_age)
                    reference_sequence_tr = reference_sequence[0:truncate_start - 1] + reference_sequence[truncate_end:]
                    region_name_tr = "%s.%d.%d.tr_%d_%d" % (region_object.chrom1, region_object.pos1,
                                                           region_object.pos2, truncate_start, truncate_end)
                    ref_name_tr = os.path.join(age_workdir, "%s.ref.fa" % region_name_tr)

                    thread_logger.info("Writing the truncated ref sequence for region %s, contig %s" % (
                        region_name_tr, contig.raw_name))
                    with open(ref_name_tr, "w") as file_handle:
                        file_handle.write(">{}.ref\n{}".format(region_name_tr, reference_sequence_tr))

                    ref_len = len(reference_sequence_tr)
                    ref_f_name = ref_name_tr
                    tr_region = [truncate_start, truncate_end - truncate_start + 1]
                else:
                    ref_len = region_object.length()
                    ref_f_name = ref_name

                # Skip alignments whose contig_len * ref_len product is too large.
                if contig.sequence_len * ref_len >= 100000000:
                    thread_logger.info("Skipping contig because AGE problem is large (contig_len = %d , ref_len= %d)" % (
                        contig.sequence_len, ref_len))
                    continue

                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(region_name, contig_sequence))

                age_cmd = "%s %s -both -go=-6 %s %s" % (
                    age, "-inv" if contig.sv_type == "INV" else "-tdup" if contig.sv_type == "DUP" else "-indel",
                    ref_f_name, asm_name)
                cmd_runner = TimedExternalCmd(age_cmd, thread_logger)
                # The with-block closes both handles even if run() raises.
                with open(out, "w") as fd_out, open(err, "w") as fd_err:
                    retcode = cmd_runner.run(timeout=timeout, cmd_log_fd_out=fd_out, cmd_log_fd_err=fd_err)

                if retcode == 0:
                    age_record = AgeRecord(out, tr_region_1=tr_region)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_record.set_assembly_contig(contig_sequence)
                        age_records.append(age_record)
                    else:
                        thread_logger.error("Number of inputs != 2 in age output file %s. Skipping." % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)
                    if tr_region:
                        os.remove(ref_name_tr)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" % (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(set([age_record.contig.sv_type for age_record in unique_age_records]))
            if len(sv_types) != 1:
                thread_logger.error("Some problem. Mixed SV types for this interval %s" % (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(unique_age_records, sv_type=sv_type,
                                                             pad=pad, dist_to_expected_bp=dist_to_expected_bp,
                                                             min_del_subalign_len=min_del_subalign_len,
                                                             min_inv_subalign_len=min_inv_subalign_len,
                                                             age_window=age_window, sc_locations=sc_locations)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [breakpoints[0][0], breakpoints[0][0] + 1,
                                                 breakpoints[0][1], breakpoints[0][2]])
                elif len(breakpoints) == 2 and (sv_type in ["DEL", "INV", "DUP"]):
                    bedtools_fields += map(str, breakpoints + [breakpoints[1] - breakpoints[0]] + ["."])
                else:
                    # No usable breakpoints: fall back to the interval's own
                    # coordinates and a length of -1.
                    bedtools_fields += map(str, [bedtools_fields[1], bedtools_fields[2], -1, "."])
                bedtools_fields[3] += ";AS"
                bedtools_fields.append(base64.b64encode(json.dumps(info_dict)))
                thread_logger.info("Writing out fields %s" % (str(bedtools_fields)))
                bedtools_intervals.append(pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback.
        raise
    finally:
        # Release the FASTA handles on both the success and the error path.
        if assembly_fasta is not None:
            assembly_fasta.close()
        reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir, "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed
def run_spades_single(intervals=[], bams=[], spades=None, spades_options="", work=None, pad=SPADES_PAD,
                      timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX, stop_on_fail=False,
                      max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    """Assemble read pairs around each interval with SPAdes and merge the contigs.

    NOTE(review): this file defines ``run_spades_single`` a second time
    further down; at import time the later definition replaces this one.

    For every interval, read pairs are extracted from ``bams`` with each of
    the extraction functions ``all_pair_hq`` and ``non_perfect_hq``. Each
    extraction with at least 5 pairs is assembled by ``spades`` and the
    resulting contigs are appended to ``<work>/merged.fa``. Extractions after
    the first are skipped when they yield the same count as the first.

    Args:
        intervals: Intervals to assemble; each exposes ``chrom``, ``start``,
            ``end`` and a comma-separated ``name`` whose second field is the
            SV type. Never mutated here.
        bams: BAM file paths. Never mutated here.
        spades: SPAdes executable.
        spades_options: Extra options appended to the SPAdes command.
        work: Working directory (created if missing).
        pad: Padding forwarded to ``extract_read_pairs``.
        timeout: Timeout for each SPAdes run.
        isize_min, isize_max: Accepted for interface compatibility; not used
            in this function.
        stop_on_fail: Raise when SPAdes fails for a reason other than timeout.
        max_read_pairs: Forwarded to ``extract_read_pairs``.

    Returns:
        Absolute path of ``merged.fa``.

    Raises:
        Exception: on SPAdes failure when ``stop_on_fail`` is set; any other
            exception is logged, its traceback printed, and re-raised.
    """
    thread_logger = logging.getLogger("%s-%s" % (run_spades_single.__name__, multiprocessing.current_process()))

    if not os.path.isdir(work):
        thread_logger.info("Creating %s" % work)
        os.makedirs(work)

    merged_contigs = open(os.path.join(work, "merged.fa"), "w")
    spades_log_fd = open(os.path.join(work, "spades.log"), "w")

    extract_fns = [extract_pairs.all_pair_hq, extract_pairs.non_perfect_hq]

    bam_handles = []
    try:
        # Opened one by one so already-open handles are closed in ``finally``
        # if a later BAM cannot be opened.
        for bam in bams:
            bam_handles.append(pysam.Samfile(bam, "rb"))

        for interval in intervals:
            region = "%s:%d-%d" % (str(interval.chrom), interval.start, interval.end)
            thread_logger.info("Processing interval %s" % (str(interval).strip()))

            sv_type = interval.name.split(",")[1]

            extraction_counts = extract_pairs.extract_read_pairs(bam_handles, region, "%s/" % work, extract_fns,
                                                                 pad=pad, max_read_pairs=max_read_pairs,
                                                                 sv_type=sv_type)
            all_pair_count = extraction_counts[0][1]

            for fn_id, ((end1, end2), extracted_count) in enumerate(extraction_counts):
                extract_fn_name = extract_fns[fn_id].__name__

                if fn_id > 0 and extracted_count == all_pair_count:
                    thread_logger.info(
                        "Skipping assembly from %s since read count same as all_pairs" % extract_fn_name)
                    continue

                if extracted_count >= 5:
                    extra_opt = "--sc" if not fn_id == 0 else ""
                    spades_log_fd.write("Running spades for interval %s with extraction function %s\n" % (
                        str(interval).strip(), extract_fn_name))
                    cmd = TimedExternalCmd("%s -1 %s -2 %s -o %s/spades_%s/ -m 4 -t 1 --phred-offset 33 %s %s" % (
                        spades, end1, end2, work, extract_fn_name, extra_opt, spades_options), thread_logger)
                    retcode = cmd.run(cmd_log_fd_out=spades_log_fd, timeout=timeout)
                    if retcode == 0:
                        # Format the directory name before joining so that a
                        # '%' in ``work`` cannot break the string formatting.
                        contigs_fasta = os.path.join(work, "spades_%s" % extract_fn_name, "contigs.fasta")
                        append_contigs(contigs_fasta, interval, merged_contigs, fn_id, sv_type)
                    elif not cmd.did_timeout:
                        thread_logger.error("Spades failed")
                        if stop_on_fail:
                            thread_logger.error("Aborting!")
                            raise Exception("Spades failure on interval %s for extraction function %s\n" % (
                                str(interval).strip(), extract_fn_name))
                else:
                    thread_logger.info("Too few read pairs (%d) extracted. Skipping assembly." % extracted_count)
    except Exception:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback.
        raise
    finally:
        # Release every handle on both the success and the error path.
        for bam_handle in bam_handles:
            bam_handle.close()
        spades_log_fd.close()
        merged_contigs.close()

    return os.path.abspath(merged_contigs.name)
def run_spades_single(intervals=[], bams=[], spades=None, spades_options="", work=None, pad=SPADES_PAD,
                      timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX, stop_on_fail=False,
                      max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    """Assemble read pairs around each interval with SPAdes and merge the contigs.

    NOTE(review): an earlier definition of ``run_spades_single`` exists
    above in this file; this later definition is the one in effect after
    import.

    For every interval, read pairs are extracted from ``bams`` with each of
    the extraction functions ``all_pair_hq`` and ``non_perfect_hq``. Each
    extraction with at least 5 pairs is assembled by ``spades`` and the
    resulting contigs are appended to ``<work>/merged.fa``. Extractions after
    the first are skipped when they yield the same count as the first.

    Args:
        intervals: Intervals to assemble; each exposes ``chrom``, ``start``,
            ``end`` and a comma-separated ``name`` whose second field is the
            SV type. Never mutated here.
        bams: BAM file paths. Never mutated here.
        spades: SPAdes executable.
        spades_options: Extra options appended to the SPAdes command.
        work: Working directory (created if missing).
        pad: Padding forwarded to ``extract_read_pairs``.
        timeout: Timeout for each SPAdes run.
        isize_min, isize_max: Accepted for interface compatibility; not used
            in this function.
        stop_on_fail: Raise when SPAdes fails for a reason other than timeout.
        max_read_pairs: Forwarded to ``extract_read_pairs``.

    Returns:
        Absolute path of ``merged.fa``.

    Raises:
        Exception: on SPAdes failure when ``stop_on_fail`` is set; any other
            exception is logged, its traceback printed, and re-raised.
    """
    thread_logger = logging.getLogger("%s-%s" % (run_spades_single.__name__, multiprocessing.current_process()))

    if not os.path.isdir(work):
        thread_logger.info("Creating %s" % work)
        os.makedirs(work)

    merged_contigs = open(os.path.join(work, "merged.fa"), "w")
    spades_log_fd = open(os.path.join(work, "spades.log"), "w")

    extract_fns = [extract_pairs.all_pair_hq, extract_pairs.non_perfect_hq]

    bam_handles = []
    try:
        # Opened one by one so already-open handles are closed in ``finally``
        # if a later BAM cannot be opened.
        for bam in bams:
            bam_handles.append(pysam.Samfile(bam, "rb"))

        for interval in intervals:
            region = "%s:%d-%d" % (str(interval.chrom), interval.start, interval.end)
            thread_logger.info("Processing interval %s" % (str(interval).strip()))

            sv_type = interval.name.split(",")[1]

            extraction_counts = extract_pairs.extract_read_pairs(bam_handles, region, "%s/" % work, extract_fns,
                                                                 pad=pad, max_read_pairs=max_read_pairs,
                                                                 sv_type=sv_type)
            all_pair_count = extraction_counts[0][1]

            for fn_id, ((end1, end2), extracted_count) in enumerate(extraction_counts):
                extract_fn_name = extract_fns[fn_id].__name__

                if fn_id > 0 and extracted_count == all_pair_count:
                    thread_logger.info(
                        "Skipping assembly from %s since read count same as all_pairs" % extract_fn_name)
                    continue

                if extracted_count >= 5:
                    extra_opt = "--sc" if not fn_id == 0 else ""
                    spades_log_fd.write("Running spades for interval %s with extraction function %s\n" % (
                        str(interval).strip(), extract_fn_name))
                    cmd = TimedExternalCmd("%s -1 %s -2 %s -o %s/spades_%s/ -m 4 -t 1 --phred-offset 33 %s %s" % (
                        spades, end1, end2, work, extract_fn_name, extra_opt, spades_options), thread_logger)
                    retcode = cmd.run(cmd_log_fd_out=spades_log_fd, timeout=timeout)
                    if retcode == 0:
                        # Format the directory name before joining so that a
                        # '%' in ``work`` cannot break the string formatting.
                        contigs_fasta = os.path.join(work, "spades_%s" % extract_fn_name, "contigs.fasta")
                        append_contigs(contigs_fasta, interval, merged_contigs, fn_id, sv_type)
                    elif not cmd.did_timeout:
                        thread_logger.error("Spades failed")
                        if stop_on_fail:
                            thread_logger.error("Aborting!")
                            raise Exception("Spades failure on interval %s for extraction function %s\n" % (
                                str(interval).strip(), extract_fn_name))
                else:
                    thread_logger.info("Too few read pairs (%d) extracted. Skipping assembly." % extracted_count)
    except Exception:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback.
        raise
    finally:
        # Release every handle on both the success and the error path.
        for bam_handle in bam_handles:
            bam_handle.close()
        spades_log_fd.close()
        merged_contigs.close()

    return os.path.abspath(merged_contigs.name)
def run_age_single(intervals_bed=None, region_list=[], contig_dict={}, reference=None, assembly=None, pad=AGE_PAD,
                   age=None, truncation_pad_read_age=AGE_TRUNCATION_PAD,
                   max_interval_len_truncation_age=AGE_MAX_INTERVAL_TRUNCATION,
                   dist_to_expected_bp=AGE_DIST_TO_BP, min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH,
                   min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH, age_window=AGE_WINDOW_SIZE,
                   age_workdir=None, timeout=AGE_TIMEOUT, keep_temp=False, myid=0):
    """Align assembled contigs to the reference with AGE and emit breakpoints as BED.

    NOTE(review): an earlier definition of ``run_age_single`` exists above in
    this file; this later definition is the one in effect after import.

    For each region in ``region_list`` that has contigs in ``contig_dict``:
    the reference sequence padded by ``pad`` is written to ``age_workdir``,
    every contig is written next to it and aligned with the ``age``
    executable, the parsed ``AgeRecord`` objects are de-duplicated and handed
    to ``process_age_records``, and the resulting breakpoint fields are
    appended to the matching interval from ``intervals_bed``.

    Args:
        intervals_bed: BED file whose intervals are matched to regions by
            chrom/start/end.
        region_list: Regions to process. Each is indexed ``[0]``..``[3]`` and
            used as a key of ``contig_dict``. Never mutated here.
        contig_dict: Maps a region to its contigs; contigs expose
            ``raw_name``, ``sequence_len`` and ``sv_type``. Never mutated here.
        reference: Reference FASTA path.
        assembly: Assembly FASTA path holding the contig sequences.
        pad: Padding added on both sides of the region when fetching the
            reference.
        age: AGE executable.
        truncation_pad_read_age, max_interval_len_truncation_age,
        dist_to_expected_bp: Control truncation of the middle of the
            reference for INV/DEL/DUP regions longer than
            ``max_interval_len_truncation_age``.
        min_del_subalign_len, min_inv_subalign_len, age_window: Forwarded to
            ``process_age_records``.
        age_workdir: Directory for all temporary and output files.
        timeout: Timeout for each AGE invocation.
        keep_temp: When false, temporary FASTA/err files are deleted.
        myid: Used in the output file name ``<myid>_breakpoints.bed``.

    Returns:
        Path of the written breakpoints BED file, or ``None`` when no interval
        was produced.

    Raises:
        Exception: any exception raised while processing is logged, its
            traceback printed, and then re-raised unchanged.
    """
    thread_logger = logging.getLogger("%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly) if assembly else None
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))

    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1], region[3])
            matching_intervals = [interval for interval in intervals_bedtool if (
                interval.start == bedtools_interval.start and interval.end == bedtools_interval.end
                and interval.chrom == bedtools_interval.chrom)]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" % (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" % (str(matching_interval)))

            # Soft-clip locations are optional: any failure to decode them
            # simply leaves the list empty. list() forces the conversion to
            # happen inside the try on Python 3 as well.
            sc_locations = []
            try:
                sc_locations = list(map(int, json.loads(
                    base64.b64decode(matching_interval.name.split(",")[0]))["SC_LOCATIONS"].split(",")))
            except Exception:
                pass

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2], region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error("Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(reference=region_object.chrom1,
                                                       start=region_object.pos1 - pad,
                                                       end=region_object.pos2 + pad)
            region_name = "%s.%d.%d" % (region_object.chrom1, region_object.pos1, region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" % region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name, reference_sequence))

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" % (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembeled sequence %s of length %s" % (contig.raw_name, contig.sequence_len))

                tr_region = []
                if region_object.length() > max_interval_len_truncation_age and contig.sv_type in ["INV", "DEL", "DUP"]:
                    # For large SVs the middle of the reference has no effect
                    # on genotyping, so it is cut out to speed AGE up.
                    thread_logger.info("Truncate the reference sequence.")

                    truncate_start = pad + dist_to_expected_bp + truncation_pad_read_age + 1
                    truncate_end = len(reference_sequence) - (pad + dist_to_expected_bp + truncation_pad_read_age)
                    reference_sequence_tr = reference_sequence[0:truncate_start - 1] + reference_sequence[truncate_end:]
                    region_name_tr = "%s.%d.%d.tr_%d_%d" % (region_object.chrom1, region_object.pos1,
                                                           region_object.pos2, truncate_start, truncate_end)
                    ref_name_tr = os.path.join(age_workdir, "%s.ref.fa" % region_name_tr)

                    thread_logger.info("Writing the truncated ref sequence for region %s, contig %s" % (
                        region_name_tr, contig.raw_name))
                    with open(ref_name_tr, "w") as file_handle:
                        file_handle.write(">{}.ref\n{}".format(region_name_tr, reference_sequence_tr))

                    ref_len = len(reference_sequence_tr)
                    ref_f_name = ref_name_tr
                    tr_region = [truncate_start, truncate_end - truncate_start + 1]
                else:
                    ref_len = region_object.length()
                    ref_f_name = ref_name

                # Skip alignments whose contig_len * ref_len product is too large.
                if contig.sequence_len * ref_len >= 100000000:
                    thread_logger.info("Skipping contig because AGE problem is large (contig_len = %d , ref_len= %d)" % (
                        contig.sequence_len, ref_len))
                    continue

                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(region_name, contig_sequence))

                age_cmd = "%s %s -both -go=-6 %s %s" % (
                    age, "-inv" if contig.sv_type == "INV" else "-tdup" if contig.sv_type == "DUP" else "-indel",
                    ref_f_name, asm_name)
                cmd_runner = TimedExternalCmd(age_cmd, thread_logger)
                # The with-block closes both handles even if run() raises.
                with open(out, "w") as fd_out, open(err, "w") as fd_err:
                    retcode = cmd_runner.run(timeout=timeout, cmd_log_fd_out=fd_out, cmd_log_fd_err=fd_err)

                if retcode == 0:
                    age_record = AgeRecord(out, tr_region_1=tr_region)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_record.set_assembly_contig(contig_sequence)
                        age_records.append(age_record)
                    else:
                        thread_logger.error("Number of inputs != 2 in age output file %s. Skipping." % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)
                    if tr_region:
                        os.remove(ref_name_tr)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" % (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(set([age_record.contig.sv_type for age_record in unique_age_records]))
            if len(sv_types) != 1:
                thread_logger.error("Some problem. Mixed SV types for this interval %s" % (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(unique_age_records, sv_type=sv_type,
                                                             pad=pad, dist_to_expected_bp=dist_to_expected_bp,
                                                             min_del_subalign_len=min_del_subalign_len,
                                                             min_inv_subalign_len=min_inv_subalign_len,
                                                             age_window=age_window, sc_locations=sc_locations)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [breakpoints[0][0], breakpoints[0][0] + 1,
                                                 breakpoints[0][1], breakpoints[0][2]])
                elif len(breakpoints) == 2 and (sv_type in ["DEL", "INV", "DUP"]):
                    bedtools_fields += map(str, breakpoints + [breakpoints[1] - breakpoints[0]] + ["."])
                else:
                    # No usable breakpoints: fall back to the interval's own
                    # coordinates and a length of -1.
                    bedtools_fields += map(str, [bedtools_fields[1], bedtools_fields[2], -1, "."])
                bedtools_fields[3] += ";AS"
                bedtools_fields.append(base64.b64encode(json.dumps(info_dict)))
                thread_logger.info("Writing out fields %s" % (str(bedtools_fields)))
                bedtools_intervals.append(pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback.
        raise
    finally:
        # Release the FASTA handles on both the success and the error path.
        if assembly_fasta is not None:
            assembly_fasta.close()
        reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir, "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed
def run_oases(assmebly_hash=DNV_HASH, seq_1="", seq_2="", seq_u="", seq_i="",
              file_format=DNV_FORMAT, read_type=DNV_READTYPE,
              oases=OASES, velvetg=VELVETG, velveth=VELVETH,
              oases_opts="", velvetg_opts="", velveth_opts="",
              start=0, sample="", nthreads=1, workdir=None, outdir=None, timeout=TIMEOUT):
    """Run the velveth -> velvetg -> oases de novo assembly for ``sample``.

    Steps (a step is skipped when its index is lower than ``start``):
        0. erase ``<workdir>/oases/<sample>``
        1. ``velveth``
        2. ``velvetg`` with ``-read_trkg yes``
        3. ``oases``
        4. copy ``transcripts.fa`` to ``<outdir>/oases/<sample>``

    Args:
        assmebly_hash: Hash length passed to ``velveth``.
        seq_1, seq_2: Comma-separated mate-1 / mate-2 files (used when both
            are given).
        seq_u: Comma-separated unpaired files (used when no pair is given).
        seq_i: Comma-separated input files used when neither of the above is
            given.
        file_format, read_type: Emitted as ``-<file_format> -<read_type>``
            before the sequence argument.
        oases, velvetg, velveth: Executables.
        oases_opts, velvetg_opts, velveth_opts: Extra options per tool.
        start: First step to execute.
        sample: Sample name; used for sub-directories and log messages.
        nthreads: Exported to ``velveth`` as ``OMP_NUM_THREADS``.
        workdir, outdir: Work and output roots.
        timeout: Timeout forwarded to ``TimedExternalCmd.run``.

    Returns:
        Path of ``transcripts.fa`` in the output directory, or ``""`` if it
        does not exist afterwards.

    Raises:
        Exception: if any listed input file is missing, or if no input
            sequences were given at all.
    """
    logger.info("Running de novo assembly (OASES) for %s" % sample)

    if seq_1 and seq_2:
        for s1 in seq_1.split(","):
            if not os.path.exists(s1):
                logger.error("Aborting!")
                raise Exception("No Mate 1 sequence file %s" % s1)
        for s2 in seq_2.split(","):
            if not os.path.exists(s2):
                logger.error("Aborting!")
                raise Exception("No Mate 2 sequence file %s" % s2)
        seq_argument = "-separate %s %s" % (seq_1, seq_2)
    elif seq_u:
        seq_argument = seq_u
        for su in seq_u.split(","):
            if not os.path.exists(su):
                logger.error("Aborting!")
                raise Exception("No unpaired sequence file %s" % su)
    elif seq_i:
        seq_argument = seq_i
        for sr in seq_i.split(","):
            # Check each listed file, not the whole comma-separated string.
            if not os.path.exists(sr):
                logger.error("Aborting!")
                raise Exception("No sra sequence file %s" % sr)
    else:
        # Without this guard the function failed later with an unbound
        # ``seq_argument`` after already wiping the work directory.
        logger.error("Aborting!")
        raise Exception("No input sequence file given for %s" % sample)

    work_oases = os.path.join(workdir, "oases", sample)
    create_dirs([work_oases])

    step = 0
    if start <= step:
        logger.info("--------------------------STEP %s--------------------------" % step)
        msg = "Erase Oases work directory for %s" % sample
        command = "rm -rf %s/*" % (work_oases)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    oases_log = os.path.join(work_oases, "oases.log")
    out_oases = os.path.join(outdir, "oases", sample)

    seq_argument = "-%s -%s %s " % (file_format, read_type, seq_argument)

    with open(oases_log, "w") as oases_log_fd:
        msg = "velveth for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = "%s %s %d %s %s" % (velveth, work_oases, assmebly_hash, velveth_opts, seq_argument)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True,
                                   env_dict={"OMP_NUM_THREADS": str(nthreads)})
            cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "velvetg for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = "%s %s %s -read_trkg yes " % (velvetg, work_oases, velvetg_opts)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "oases for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = "%s %s %s " % (oases, work_oases, oases_opts)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        create_dirs([out_oases])
        msg = "Copy predictions to output directory for %s." % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            if os.path.exists("%s/transcripts.fa" % work_oases):
                command = "cp %s/transcripts.fa %s/transcripts.fa" % (work_oases, out_oases)
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))

    transcripts = ""
    if os.path.exists("%s/transcripts.fa" % out_oases):
        logger.info("Oases was successfull!")
        logger.info("Output transcripts: %s/transcripts.fa" % out_oases)
        transcripts = "%s/transcripts.fa" % out_oases
    else:
        logger.info("Oases failed!")
    return transcripts
def run_afterqc(fqdir="", r1_flag="", r2_flag="", start=0, sample="", afterqc_opts="",
                workdir=None, outdir=None, timeout=TIMEOUT, nthreads=1):
    """Run AfterQC on the FASTQ files in ``fqdir`` and publish its HTML reports.

    Steps (a step is skipped when its index is lower than ``start``):
        0. erase ``<workdir>/qc``
        1. run ``/opt/AfterQC/after.py`` writing to ``good/``, ``bad/`` and
           ``qc_report/`` under ``<workdir>/qc``
        2. copy every ``*html`` file from ``qc_report/`` to ``<outdir>/qc``

    Args:
        fqdir: Directory passed to AfterQC via ``-d``.
        r1_flag, r2_flag: Passed via ``--read1_flag`` / ``--read2_flag``.
        start: First step to execute.
        sample: Sample name; used in log messages only.
        afterqc_opts: Extra options appended to the AfterQC command.
        workdir, outdir: Work and output roots.
        timeout: Timeout forwarded to ``TimedExternalCmd.run``.
        nthreads: Exported to AfterQC as ``OMP_NUM_THREADS``.

    Returns:
        None.
    """
    logger.info(
        "Automatic Filtering, Trimming, Error Removing and Quality Control for fastq data for %s" % sample)

    work_qc = os.path.join(workdir, "qc")
    create_dirs([work_qc])

    step = 0
    if start <= step:
        logger.info("--------------------------STEP %s--------------------------" % step)
        msg = "Erase QC work directory for %s" % sample
        command = "rm -rf %s/*" % (work_qc)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    qc_log = os.path.join(work_qc, "qc.log")

    with open(qc_log, "w") as qc_log_fd:
        msg = "QC for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = "%s %s -d %s --read1_flag %s --read2_flag %s -g %s -b %s -r %s %s" % (
                "python", "/opt/AfterQC/after.py", fqdir, r1_flag, r2_flag,
                work_qc + "/good/", work_qc + "/bad/", work_qc + "/qc_report/", afterqc_opts)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True,
                                   env_dict={"OMP_NUM_THREADS": str(nthreads)})
            cmd.run(cmd_log_fd_out=qc_log_fd, cmd_log=qc_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        out_qc = os.path.join(outdir, "qc")
        create_dirs([out_qc])
        msg = "Copy qc html report to output directory for %s." % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            # A real list (not a lazy filter object) so the emptiness check
            # also works on Python 3.
            qclist = [name for name in os.listdir("%s/qc_report/" % work_qc) if name.endswith("html")]
            if qclist:
                sources = " ".join("%s/qc_report/%s" % (work_qc, name) for name in qclist)
                command = "cp %s %s/" % (sources, out_qc)
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                cmd.run(cmd_log_fd_out=qc_log_fd, cmd_log=qc_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
def run_lordec(kmer=23, solid=3, long="", short="", lordec=LORDEC, lordec_opts="",
               start=0, sample="", nthreads=1, workdir=None, outdir=None, timeout=TIMEOUT):
    """Correct long reads with LoRDEC using short reads.

    Steps (a step is skipped when its index is lower than ``start``):
        0. erase ``<workdir>/lordec/<sample>``
        1. run ``lordec`` writing ``long_corrected.fa`` into the work directory
        2. copy ``long_corrected.fa`` to ``<outdir>/lordec/<sample>``

    Args:
        kmer: Passed to LoRDEC via ``-k``.
        solid: Passed to LoRDEC via ``-s``.
        long: Long-read sequence file (``-i``); must exist.
        short: Short-read sequence file (``-2``); must exist.
        lordec: LoRDEC executable.
        lordec_opts: Extra options. ``-T <nthreads>`` is appended when the
            options contain no ``-T `` flag.
        start: First step to execute.
        sample: Sample name; used for sub-directories and log messages.
        nthreads: Thread count used for the ``-T`` flag.
        workdir, outdir: Work and output roots.
        timeout: Timeout forwarded to ``TimedExternalCmd.run``.

    Returns:
        Path of ``long_corrected.fa`` in the output directory, or ``""`` if
        it does not exist afterwards.

    Raises:
        Exception: if ``long`` or ``short`` does not exist.
    """
    logger.info("Running long read error correction (LoRDEC) for %s" % sample)
    if not os.path.exists(long):
        logger.error("Aborting!")
        raise Exception("No long read sequence file %s" % long)

    if not os.path.exists(short):
        logger.error("Aborting!")
        raise Exception("No short read sequence file %s" % short)

    work_lordec = os.path.join(workdir, "lordec", sample)
    create_dirs([work_lordec])

    step = 0
    if start <= step:
        logger.info("--------------------------STEP %s--------------------------" % step)
        msg = "Erase LoRDEC work directory for %s" % sample
        command = "rm -rf %s/*" % (work_lordec)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    lordec_log = os.path.join(work_lordec, "lordec.log")
    out_lordec = os.path.join(outdir, "lordec", sample)

    if "-T " not in lordec_opts:
        lordec_opts += " -T %d" % nthreads

    with open(lordec_log, "w") as lordec_log_fd:
        msg = "LoRDEC for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = "%s %s -k %d -s %d -i %s -2 %s -O %s -o %s/long_corrected.fa" % (
                lordec, lordec_opts, kmer, solid, long, short, work_lordec, work_lordec)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=lordec_log_fd, cmd_log=lordec_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        create_dirs([out_lordec])
        msg = "Copy predictions to output directory for %s." % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            if os.path.exists("%s/long_corrected.fa" % work_lordec):
                command = "cp %s/long_corrected.fa %s/long_corrected.fa" % (work_lordec, out_lordec)
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                cmd.run(cmd_log_fd_out=lordec_log_fd, cmd_log=lordec_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))

    corrected = ""
    if os.path.exists("%s/long_corrected.fa" % out_lordec):
        logger.info("LoRDEC was successfull!")
        logger.info("Output corrected reads: %s/long_corrected.fa" % out_lordec)
        corrected = "%s/long_corrected.fa" % out_lordec
    else:
        logger.info("LoRDEC failed!")
    return corrected
def annotate_ver2(sample="",
                  start=0,
                  nettype="0",
                  dcutt="0.5",
                  dcutnt="0.45",
                  orgtype="0",
                  minsglen="10",
                  trunc="70",
                  orgname="",
                  blastp_opts="",
                  evalue="",
                  msa="",
                  workdir=None,
                  outdir=None,
                  timeout=TIMEOUT,
                  nthreads=1):
    """Annotate the precursor protein sequences found for ``sample``.

    The input is ``<workdir>/msalign/<sample>-sequence.fa``. A step is
    executed only when ``start`` is less than or equal to its number:

      0. erase ``<workdir>/annotation``,
      1. SignalP prediction through the casperjs spider,
      2. Sma3s functional annotation (needs exactly one FASTA and one
         ``.annot`` file, plain or gzipped, in ``SMA3SDB_DIR``),
      3. multiple sequence alignment (``msa.py``),
      4. VenomKB annotation,
      5. blastp search (needs exactly one FASTA in ``BLASTDB_DIR``),
      6. copy of the results to ``<outdir>/annotation``.

    Args:
        sample: sample name, used in file names and log messages.
        start: number of the first step to execute.
        nettype: SignalP ``--method`` value.
        dcutt: SignalP ``--tm`` value.
        dcutnt: SignalP ``--notm`` value.
        orgtype: SignalP ``--orgtype`` value.
        minsglen: SignalP ``--minlen`` value.
        trunc: SignalP ``--trunc`` value.
        orgname: accepted for interface compatibility; no command in this
            function uses it.
        blastp_opts: extra options appended to the blastp command.
        evalue: blastp ``-evalue`` value.
        msa: value passed to ``msa.py -m``.
        workdir: root of the working directory tree.
        outdir: root of the output directory tree.
        timeout: timeout handed to every external command.
        nthreads: thread count for Sma3s, blastp and ``OMP_NUM_THREADS``.

    Returns:
        ``os.EX_OK``.
    """
    logger.info("Annotation for precursor proteins of polypeptides for %s" % sample)
    work_annot = os.path.join(workdir, "annotation")
    create_dirs([work_annot])
    work_msalign = os.path.join(workdir, "msalign")

    step = 0
    if start <= step:
        logger.info("--------------------------STEP %s--------------------------" % step)
        msg = "Erase annotation work directory for %s" % sample
        command = "rm -rf %s/*" % (work_annot)
        command = "bash -c \"%s\"" % command
        # Best effort: a failed cleanup must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    annot_log = os.path.join(work_annot, "annotation.log")
    # The context manager guarantees the log is closed even if a command raises.
    with open(annot_log, "w") as annot_log_fd:

        def run_logged(command, msg):
            """Run ``command`` through bash, logging to the annotation log."""
            cmd = TimedExternalCmd("bash -c \"%s\"" % command,
                                   logger,
                                   raise_exception=True,
                                   env_dict={"OMP_NUM_THREADS": str(nthreads)})
            cmd.run(cmd_log_fd_out=annot_log_fd,
                    cmd_log=annot_log,
                    msg=msg,
                    timeout=timeout)

        # ---- step 1: SignalP ------------------------------------------
        msg = "Predicting the presence and location of signal peptide cleavage sites in amino acid sequences for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = (
                "casperjs /opt/Auxtools/spider/signalP.js --fasta=%(msdir)s/%(sam)s-sequence.fa "
                "--outfile=%(atdir)s/%(sam)s-signalP.txt --minlen=\"%(mlen)s\" "
                "--method=\"%(nettype)s\" --orgtype=\"%(orgtype)s\" "
                "--dcut=\"user\" --notm=%(dcutnt)s --tm=%(dcutt)s --trunc=%(trunc)s"
            ) % {
                'msdir': work_msalign,
                'atdir': work_annot,
                'sam': sample,
                'mlen': minsglen,
                'nettype': nettype,
                'orgtype': orgtype,
                'dcutt': dcutt,
                'dcutnt': dcutnt,
                'trunc': trunc
            }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        # ---- step 2: Sma3s --------------------------------------------
        msg = "Annotating precursor protein function for %s" % sample
        # List comprehensions (not filter()) so len() and indexing also
        # work on Python 3, where filter() returns an iterator.
        sma3s_files = os.listdir(SMA3SDB_DIR)
        sma3sfa = [x for x in sma3s_files if x.endswith(("fa", "fasta"))]
        sma3sat = [x for x in sma3s_files if x.endswith("annot")]
        sma3sfagz = [x for x in sma3s_files if x.endswith(("fa.gz", "fasta.gz"))]
        sma3satgz = [x for x in sma3s_files if x.endswith("annot.gz")]

        sma3s_fetch = None  # shell fragment that stages the database files
        sma3s_names = {}
        if len(sma3sfa) == 1 and len(sma3sat) == 1:
            sma3s_fetch = (
                "ln -sf /data/%(ssdir)s/%(dbfa)s %(atdir)s/sma3s/%(dbfa)s && "
                "ln -sf /data/%(ssdir)s/%(dbat)s %(atdir)s/sma3s/%(dbat)s && ")
            sma3s_names = {'dbfa': sma3sfa[0], 'dbat': sma3sat[0]}
        elif len(sma3sfagz) == 1 and len(sma3satgz) == 1:
            sma3s_fetch = (
                "gzip -dc %(ssdir)s/%(dbatgz)s > %(atdir)s/sma3s/%(dbat)s && "
                "gzip -dc %(ssdir)s/%(dbfagz)s > %(atdir)s/sma3s/%(dbfa)s && ")
            sma3s_names = {
                'dbfagz': sma3sfagz[0],
                'dbatgz': sma3satgz[0],
                'dbfa': os.path.splitext(sma3sfagz[0])[0],
                'dbat': os.path.splitext(sma3satgz[0])[0]
            }

        if sma3s_fetch is None:
            # Report the directory that was actually searched.
            logger.warning(
                "Two Sma3s database files(uniref90.annot and uniref90.fasta) did not exist in the '%s' directory! Please obtain it from http://www.bioinfocabd.upo.es/sma3s/db/"
                % (SMA3SDB_DIR))
            logger.warning("Skipping step %d: %s" % (step, msg))
        elif start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            template = (
                "mkdir -p %(atdir)s/sma3s/ && " + sma3s_fetch +
                "cp %(msdir)s/%(sam)s-sequence.fa %(atdir)s/sma3s/ &&"
                "cd %(atdir)s/sma3s/ && "
                "perl /opt/Auxtools/sma3s_v2.pl -i %(sam)s-sequence.fa -d %(dbfa)s -go -goslim -p 0.00001 -num_threads %(nthreads)s && "
                "cd -")
            values = {
                'msdir': work_msalign,
                'atdir': work_annot,
                'ssdir': SMA3SDB_DIR,
                'sam': sample,
                'nthreads': nthreads
            }
            values.update(sma3s_names)
            run_logged(template % values, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        # ---- step 3: multiple sequence alignment ----------------------
        msg = "Multiple sequence alignment for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = (
                "python /opt/Auxtools/msa.py -i %(msdir)s/%(sam)s-sequence.fa "
                "-m %(msa)s -o %(atdir)s/%(sam)s-msa.html ") % {
                    'msdir': work_msalign,
                    'atdir': work_annot,
                    'sam': sample,
                    'msa': msa
                }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        # ---- step 4: VenomKB ------------------------------------------
        msg = "Venom annotation for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = (
                "python /opt/Auxtools/venomkb/venomkb_annot.py -i %(msdir)s/%(sam)s-sequence.fa "
                "-c /opt/Auxtools/venomkb/venomkb_proteins_06272017.json.gz -o %(atdir)s/%(sam)s "
            ) % {
                'msdir': work_msalign,
                'atdir': work_annot,
                'sam': sample
            }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        # ---- step 5: blastp -------------------------------------------
        msg = "Similar sequence blast for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            falist = [
                x for x in os.listdir(BLASTDB_DIR)
                if x.endswith(("fa", "fasta"))
            ]
            if not falist:
                logger.warning(
                    "There is no blast database file(*.fa or *.fasta) in the '%s' directory!"
                    % (BLASTDB_DIR))
                logger.warning("Skipping step %d: %s" % (step, msg))
            elif len(falist) > 1:
                logger.warning(
                    "Only one blast database file is allowed in the '%s' directory!"
                    % (BLASTDB_DIR))
                logger.warning("Skipping step %d: %s" % (step, msg))
            else:
                # Really skip in the two cases above: the step used to go on
                # and index falist[0] even after logging that it was skipped.
                # NOTE(review): the index check looks at a hard-coded
                # directory while the database itself is built in
                # BLASTDB_DIR -- confirm both always name the same place.
                dbext = [
                    x for x in os.listdir("./config/blastdb/")
                    if x.endswith(("phr", "pin", "psq"))
                ]
                if len(dbext) < 3:
                    # Index files are missing: build the database first.
                    cmd_chip1 = "makeblastdb -dbtype prot -in %(path)s/%(db)s -out %(path)s/%(db)s && " % {
                        'path': BLASTDB_DIR,
                        'db': falist[0]
                    }
                else:
                    cmd_chip1 = ""
                cmd_chip2 = (
                    "blastp -db %(path)s/%(db)s -num_threads %(nthreads)s -query %(msdir)s/%(sam)s-sequence.fa "
                    "-out %(atdir)s/%(sam)s.asn -outfmt 11 -evalue %(evalue)s %(opts)s && "
                    "blast_formatter -archive %(atdir)s/%(sam)s.asn -outfmt 0 > %(atdir)s/%(sam)s-pairwise.txt && "
                    "blast_formatter -archive %(atdir)s/%(sam)s.asn -outfmt '7 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle' > %(atdir)s/%(sam)s-tabular.txt && "
                    "python /opt/Auxtools/BlasterJS/src/blast2html.py -i %(atdir)s/%(sam)s-pairwise.txt "
                    "-o %(atdir)s/blast_html/") % {
                        'path': BLASTDB_DIR,
                        'db': falist[0],
                        'msdir': work_msalign,
                        'atdir': work_annot,
                        'sam': sample,
                        'evalue': evalue,
                        'opts': blastp_opts,
                        'nthreads': nthreads
                    }
                run_logged(cmd_chip1 + cmd_chip2, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        # ---- step 6: copy results -------------------------------------
        out_annot = os.path.join(outdir, "annotation")
        create_dirs([out_annot])
        msg = "Copy annotation result to output directory for %s." % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            if os.path.exists("%s/%s-signalP.txt" % (work_annot, sample)):
                command = (
                    "cp %(indir)s/%(sam)s-msa.html %(indir)s/%(sam)s-venom.tsv "
                    "%(indir)s/%(sam)s-signalP.txt %(outdir)s/") % {
                        "indir": work_annot,
                        "sam": sample,
                        "outdir": out_annot
                    }
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                cmd.run(cmd_log_fd_out=annot_log_fd,
                        cmd_log=annot_log,
                        msg=msg,
                        timeout=timeout)
            if os.path.exists("%s/blast_html" % (work_annot)):
                copy_and_overwrite("%s/blast_html" % (work_annot),
                                   "%s/blast_html/" % (out_annot))
            tmpin = "%s/%s-tabular.txt" % (work_annot, sample)
            tmpout = "%s/%s-tabular.txt" % (out_annot, sample)
            if os.path.exists(tmpin):
                copyfile(tmpin, tmpout)

            # Sma3s may have been skipped (no database), in which case its
            # directory or its tables do not exist; copy only what is there.
            sma3s_dir = os.path.join(work_annot, "sma3s")
            if os.path.isdir(sma3s_dir):
                tsvlist = [x for x in os.listdir(sma3s_dir) if x.endswith("tsv")]
                logger.info(tsvlist)
                summaries = [x for x in tsvlist if x.endswith("summary.tsv")]
                tables = [x for x in tsvlist if not x.endswith("summary.tsv")]
                if summaries:
                    tmpin = os.path.join(sma3s_dir, summaries[0])
                    tmpout = "%s/%s-sma3s-summary.tsv" % (out_annot, sample)
                    if os.path.exists(tmpin):
                        copyfile(tmpin, tmpout)
                if tables:
                    tmpin = os.path.join(sma3s_dir, tables[0])
                    tmpout = "%s/%s-sma3s-table.tsv" % (out_annot, sample)
                    if os.path.exists(tmpin):
                        copyfile(tmpin, tmpout)
            else:
                logger.warning("No Sma3s result directory %s; nothing to copy" % sma3s_dir)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1
    return os.EX_OK
def run_comet(input="",
              longest=False,
              spectrum="",
              start=0,
              sample="",
              nthreads=1,
              workdir=None,
              outdir=None,
              timeout=TIMEOUT):
    """Identify peptides with Comet against a database built from ``input``.

    A step is executed only when ``start`` is less than or equal to its
    number:

      0. erase ``<workdir>/msalign``,
      1. build the search database: ``run_dbcreator.R`` for nucleotide
         input, a plain copy for amino-acid input,
      2. run Comet,
      3. post-process the identifications with ``comet_fdr.R``,
      4. copy the summaries and the database to ``outdir``.

    Args:
        input: FASTA file; its second line decides whether it is treated
            as nucleotide (at most 4 distinct characters) or amino acid.
        longest: forwarded verbatim to ``run_dbcreator.R``.
        spectrum: spectrum file(s) handed to Comet.
        start: number of the first step to execute.
        sample: sample name, used in file names and log messages.
        nthreads: exported to the commands as ``OMP_NUM_THREADS``.
        workdir: root of the working directory tree.
        outdir: root of the output directory tree.
        timeout: timeout handed to every external command.

    Returns:
        ``os.EX_OK``.

    Raises:
        Exception: if ``input`` has fewer than two lines.
    """
    logger.info("Running mass spectra alignment (Comet) for %s" % sample)
    work_msalign = os.path.join(workdir, "msalign")
    create_dirs([work_msalign])

    step = 0
    if start <= step:
        logger.info("--------------------------STEP %s--------------------------" % step)
        msg = "Erase msalign work directory for %s" % sample
        command = "rm -rf %s/*" % (work_msalign)
        command = "bash -c \"%s\"" % command
        # Best effort: a failed cleanup must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    msalign_log = os.path.join(work_msalign, "msalign.log")
    # The context manager guarantees the log is closed even if a command raises.
    with open(msalign_log, "w") as msalign_log_fd:

        def run_logged(command, msg, use_bash=True):
            """Run ``command`` and send its output to the msalign log."""
            if use_bash:
                cmd = TimedExternalCmd("bash -c \"%s\"" % command,
                                       logger,
                                       raise_exception=True,
                                       env_dict={"OMP_NUM_THREADS": str(nthreads)})
            else:
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=msalign_log_fd,
                    cmd_log=msalign_log,
                    msg=msg,
                    timeout=timeout)

        # Determine whether the FASTA is nucleotide or amino acid from its
        # second line; only two lines are read and the handle is closed.
        with open(input) as fasta:
            fasta.readline()
            second_line = fasta.readline()
        if not second_line:
            logger.error("Aborting!")
            raise Exception("Input FASTA file %s has fewer than two lines" % input)
        is_na = len(set(second_line.strip())) <= 4

        msg = "Run PGA database creator for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            if is_na:
                command = "Rscript /opt/Auxtools/run_dbcreator.R %s %s %s %s" % (
                    input, longest, work_msalign, sample)
            else:
                # Protein input is used as the database as-is.
                command = ("mkdir -p %(wk)s/database/ && cp %(db)s %(wk)s/database/%(sam)s.ntx.fasta") % {
                    'db': input,
                    'wk': work_msalign,
                    'sam': sample
                }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Run Comet for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = ("/opt/comet.2018012.linux.exe -Pconfig/par/comet.params -N%(dir)s/%(sam)s "
                       "-D%(dir)s/database/%(sam)s.ntx.fasta %(spectrum)s ") % {
                           'spectrum': spectrum,
                           'dir': work_msalign,
                           'sam': sample
                       }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Tidy identification result and get precursor protein for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = ("Rscript /opt/Auxtools/comet_fdr.R %(dir)s/%(sam)s %(dir)s/database/%(sam)s.ntx.fasta ") % {
                'dir': work_msalign,
                'sam': sample
            }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        out_msalign = os.path.join(outdir, "msalign")
        out_database = os.path.join(outdir, "database")
        create_dirs([out_msalign, out_database])
        msg = "Copy novel sequence database and MS identification result(s) to output directory for %s." % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            if os.path.exists("%s/%s-pepSummary.tsv" % (work_msalign, sample)):
                command = "cp %(dir)s/%(sam)s-pepSummary.tsv %(dir)s/%(sam)s-psmSummary.tsv %(dir)s/%(sam)s-sequence.fa %(out)s/" % {
                    'dir': work_msalign,
                    'sam': sample,
                    'out': out_msalign
                }
                run_logged(command, msg, use_bash=False)
            if os.path.exists("%s/database/%s.ntx.fasta" % (work_msalign, sample)):
                command = "cp %(dir)s/database/%(sam)s.ntx.fasta %(out)s/" % {
                    'dir': work_msalign,
                    'sam': sample,
                    'out': out_database
                }
                run_logged(command, msg, use_bash=False)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1
    return os.EX_OK
def run_msgfplus(input="",
                 longest=False,
                 spectrum="",
                 instrument="3",
                 enzyme="0",
                 decoy="1",
                 fragid="0",
                 pretol="20ppm",
                 minlen=6,
                 maxlen=50,
                 modfile="",
                 ntt="0",
                 start=0,
                 sample="",
                 nthreads=1,
                 msgfplus_opts="",
                 max_mem="10G",
                 workdir=None,
                 outdir=None,
                 timeout=TIMEOUT):
    """Identify peptides with MS-GF+ against a database built from ``input``.

    A step is executed only when ``start`` is less than or equal to its
    number:

      0. erase ``<workdir>/msalign``,
      1. build the search database: ``run_dbcreator.R`` for nucleotide
         input, a plain copy for amino-acid input,
      2. run MS-GF+,
      3. convert the ``.mzid`` result to TSV,
      4. post-process the TSV with ``fasta_preparation.py``,
      5. copy the summaries and the database to ``outdir``.

    Args:
        input: FASTA file; its second line decides whether it is treated
            as nucleotide (at most 4 distinct characters) or amino acid.
        longest: forwarded verbatim to ``run_dbcreator.R``.
        spectrum: spectrum file passed as ``-s``.
        instrument: MS-GF+ ``-inst`` value.
        enzyme: MS-GF+ ``-e`` value.
        decoy: MS-GF+ ``-tda`` value.
        fragid: MS-GF+ ``-m`` value.
        pretol: MS-GF+ ``-t`` value.
        minlen: MS-GF+ ``-minLength`` value.
        maxlen: MS-GF+ ``-maxLength`` value.
        modfile: MS-GF+ ``-mod`` file; the flag is omitted when empty.
        ntt: MS-GF+ ``-ntt`` value.
        start: number of the first step to execute.
        sample: sample name, used in file names and log messages.
        nthreads: MS-GF+ ``-thread`` value, also exported as
            ``OMP_NUM_THREADS``.
        msgfplus_opts: accepted for interface compatibility; no command in
            this function uses it.
        max_mem: accepted for interface compatibility; no command in this
            function uses it.
        workdir: root of the working directory tree.
        outdir: root of the output directory tree.
        timeout: timeout handed to every external command.

    Returns:
        ``os.EX_OK``.

    Raises:
        Exception: if ``input`` has fewer than two lines.
    """
    logger.info("Running mass spectra alignment (MSGFPlus) for %s" % sample)
    work_msalign = os.path.join(workdir, "msalign")
    create_dirs([work_msalign])

    step = 0
    if start <= step:
        logger.info("--------------------------STEP %s--------------------------" % step)
        msg = "Erase msalign work directory for %s" % sample
        command = "rm -rf %s/*" % (work_msalign)
        command = "bash -c \"%s\"" % command
        # Best effort: a failed cleanup must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    msalign_log = os.path.join(work_msalign, "msalign.log")
    # The context manager guarantees the log is closed even if a command raises.
    with open(msalign_log, "w") as msalign_log_fd:

        def run_logged(command, msg, use_bash=True):
            """Run ``command`` and send its output to the msalign log."""
            if use_bash:
                cmd = TimedExternalCmd("bash -c \"%s\"" % command,
                                       logger,
                                       raise_exception=True,
                                       env_dict={"OMP_NUM_THREADS": str(nthreads)})
            else:
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=msalign_log_fd,
                    cmd_log=msalign_log,
                    msg=msg,
                    timeout=timeout)

        # Determine whether the FASTA is nucleotide or amino acid from its
        # second line; only two lines are read and the handle is closed.
        with open(input) as fasta:
            fasta.readline()
            second_line = fasta.readline()
        if not second_line:
            logger.error("Aborting!")
            raise Exception("Input FASTA file %s has fewer than two lines" % input)
        is_na = len(set(second_line.strip())) <= 4

        msg = "Run PGA database creator for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            if is_na:
                command = "Rscript /opt/Auxtools/run_dbcreator.R %s %s %s %s" % (
                    input, longest, work_msalign, sample)
            else:
                # Protein input is used as the database as-is.
                command = ("mkdir -p %(wk)s/database/ && cp %(db)s %(wk)s/database/%(sam)s.ntx.fasta") % {
                    'db': input,
                    'wk': work_msalign,
                    'sam': sample
                }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Run MSGFPlus for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            template = (
                "java -jar /opt/MSGFPlus/MSGFPlus.jar -s %(spectrum)s "
                "-o %(dir)s/%(sam)s.mzid -d %(dir)s/database/%(sam)s.ntx.fasta -m %(fragid)s "
                "-t %(pretol)s -inst %(inst)s -e %(eyz)s -ntt %(ntt)s -tda %(tda)s -minLength %(minl)s "
                "-maxLength %(maxl)s -thread %(th)s")
            if modfile:
                # Only pass -mod with a value; an empty modfile would leave
                # a dangling flag at the end of the command line.
                template += " -mod %(modfile)s"
            command = template % {
                'spectrum': spectrum,
                'dir': work_msalign,
                'sam': sample,
                "pretol": pretol,
                "inst": instrument,
                "eyz": enzyme,
                "tda": decoy,
                "ntt": ntt,
                "minl": minlen,
                "maxl": maxlen,
                'modfile': modfile,
                "fragid": fragid,
                'th': nthreads
            }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Converts MS-GF+ output (.mzid) into the tsv format (.tsv) for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = ("java -cp /opt/MSGFPlus/MSGFPlus.jar edu.ucsd.msjava.ui.MzIDToTsv "
                       "-i %(dir)s/%(sam)s.mzid -o %(dir)s/%(sam)s-rawSummary.tsv "
                       "-showQValue 1 -showDecoy 0 -unroll 1") % {
                           'dir': work_msalign,
                           'sam': sample,
                       }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Tidy identification result and get precursor protein for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = ("python /opt/Auxtools/fasta_preparation.py -i %(dir)s/%(sam)s-rawSummary.tsv "
                       "-d %(dir)s/database/%(sam)s.ntx.fasta -o %(dir)s/%(sam)s") % {
                           'dir': work_msalign,
                           'sam': sample
                       }
            run_logged(command, msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        out_msalign = os.path.join(outdir, "msalign")
        out_database = os.path.join(outdir, "database")
        create_dirs([out_msalign, out_database])
        msg = "Copy novel sequence database and MS identification result(s) to output directory for %s." % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            if os.path.exists("%s/%s-pepSummary.tsv" % (work_msalign, sample)):
                command = "cp %(dir)s/%(sam)s-pepSummary.tsv %(dir)s/%(sam)s-psmSummary.tsv %(dir)s/%(sam)s-sequence.fa %(out)s/" % {
                    'dir': work_msalign,
                    'sam': sample,
                    'out': out_msalign
                }
                run_logged(command, msg, use_bash=False)
            if os.path.exists("%s/database/%s.ntx.fasta" % (work_msalign, sample)):
                command = "cp %(dir)s/database/%(sam)s.ntx.fasta %(out)s/" % {
                    'dir': work_msalign,
                    'sam': sample,
                    'out': out_database
                }
                run_logged(command, msg, use_bash=False)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1
    return os.EX_OK
def run_stringtie(alignment_bam="",
                  ref_gtf="",
                  stringtie_opts="",
                  stringtie=STRINGTIE,
                  start=0,
                  sample="",
                  nthreads=1,
                  workdir=None,
                  outdir=None,
                  timeout=TIMEOUT):
    """Reconstruct transcripts with StringTie and publish the results.

    A step is executed only when ``start`` is less than or equal to its
    number:

      0. erase ``<workdir>/stringtie/<sample>``,
      1. run StringTie,
      2. copy ``transcripts.gtf`` and ``gene_abund.tab`` to
         ``<outdir>/stringtie/<sample>``.

    Args:
        alignment_bam: input alignment BAM file; must exist.
        ref_gtf: optional reference annotation, added as ``-G`` when set;
            must exist when given.
        stringtie_opts: extra StringTie options; ``-p <nthreads>`` is
            appended when the string does not already contain ``"-p "``.
        stringtie: StringTie executable.
        start: number of the first step to execute.
        sample: sample name, used for directory names and log messages.
        nthreads: thread count used for the default ``-p`` option.
        workdir: root of the working directory tree.
        outdir: root of the output directory tree.
        timeout: timeout handed to every external command.

    Returns:
        Tuple ``(transcripts, abundances)`` with the paths of the two files
        in the output directory, or ``("", "")`` unless both exist.

    Raises:
        Exception: if ``alignment_bam`` or a given ``ref_gtf`` is missing.
    """
    logger.info("Running transcriptome reconstruction (StringTie) for %s" % sample)
    if not os.path.exists(alignment_bam):
        logger.error("Aborting!")
        raise Exception("No input alignment BAM file %s" % alignment_bam)

    work_stringtie = "%s/stringtie/%s/" % (workdir, sample)
    create_dirs([work_stringtie])

    step = 0
    if start <= step:
        logger.info("--------------------------STEP %s--------------------------" % step)
        msg = "Erase StringTie work directory for %s" % sample
        command = "rm -rf %s/*" % (work_stringtie)
        command = "bash -c \"%s\"" % command
        # Best effort: a failed cleanup must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    stringtie_log = os.path.join(work_stringtie, "stringtie.log")
    # The context manager guarantees the log is closed even if a command raises.
    with open(stringtie_log, "w") as stringtie_log_fd:
        if ref_gtf:
            if not os.path.exists(ref_gtf):
                logger.error("Aborting!")
                raise Exception("No reference GTF file %s" % ref_gtf)
            stringtie_opts += " -G %s" % ref_gtf
        if "-p " not in stringtie_opts:
            stringtie_opts += " -p %d" % nthreads

        msg = "StringTie for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            command = "%s %s %s -o %s/transcripts.gtf -A %s/gene_abund.tab -v" % (
                stringtie, alignment_bam, stringtie_opts, work_stringtie,
                work_stringtie)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=stringtie_log_fd,
                    cmd_log=stringtie_log,
                    msg=msg,
                    timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        out_stringtie = os.path.join(outdir, "stringtie", sample)
        create_dirs([out_stringtie])
        msg = "Copy predictions to output directory for %s." % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            # Publish only a complete result: both files or neither.
            if os.path.exists("%s/transcripts.gtf" % work_stringtie) and \
               os.path.exists("%s/gene_abund.tab" % work_stringtie):
                for command in (
                        "cp %s/transcripts.gtf %s/transcripts.gtf" % (work_stringtie, out_stringtie),
                        "cp %s/gene_abund.tab %s/gene_abund.tab" % (work_stringtie, out_stringtie)):
                    cmd = TimedExternalCmd(command, logger, raise_exception=True)
                    cmd.run(cmd_log_fd_out=stringtie_log_fd,
                            cmd_log=stringtie_log,
                            msg=msg,
                            timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    transcripts = ""
    abundances = ""
    if os.path.exists("%s/transcripts.gtf" % out_stringtie) and \
       os.path.exists("%s/gene_abund.tab" % out_stringtie):
        logger.info("StringTie was successfull!")
        logger.info("Output isoforms: %s/transcripts.gtf" % out_stringtie)
        logger.info("Output expressions: %s/gene_abund.tab" % out_stringtie)
        transcripts = "%s/transcripts.gtf" % out_stringtie
        abundances = "%s/gene_abund.tab" % out_stringtie
    else:
        logger.info("StringTie failed!")
    return transcripts, abundances
def run_trinity(seq_1="",
                seq_2="",
                seq_u="",
                start=0,
                sample="",
                nthreads=1,
                trinity_opts="",
                max_mem="20G",
                workdir=None,
                outdir=None,
                timeout=TIMEOUT):
    """Assemble transcripts de novo with Trinity and publish ``Trinity.fasta``.

    A step is executed only when ``start`` is less than or equal to its
    number:

      0. erase ``<workdir>/trinity`` (including hidden files),
      1. run Trinity,
      2. copy ``Trinity.fasta`` to ``<outdir>/trinity``.

    Args:
        seq_1: comma-separated mate 1 FASTQ files (used with ``seq_2``).
        seq_2: comma-separated mate 2 FASTQ files (used with ``seq_1``).
        seq_u: comma-separated unpaired FASTQ files, used only when
            ``seq_1``/``seq_2`` are not both given.
        start: number of the first step to execute.
        sample: sample name, used in log messages.
        nthreads: Trinity ``--CPU`` value, also exported as
            ``OMP_NUM_THREADS``.
        trinity_opts: extra options appended to the Trinity command.
        max_mem: Trinity ``--max_memory`` value.
        workdir: root of the working directory tree.
        outdir: root of the output directory tree.
        timeout: timeout handed to every external command.

    Returns:
        ``os.EX_OK``.

    Raises:
        Exception: if a read file is missing or does not end in ``.fq.gz``,
            or if the Trinity step has to run but no reads were supplied.
    """
    logger.info("Running de novo assembly (TRINITY) for %s" % sample)
    work_trinity = os.path.join(workdir, "trinity")
    create_dirs([work_trinity])

    def check_fastqs(paths, label):
        """Ensure every comma-separated path exists and ends in .fq.gz."""
        for path in paths.split(","):
            if not os.path.exists(path):
                logger.error("Aborting!")
                raise Exception("No %s sequence file %s" % (label, path))
            if not path.endswith(".fq.gz"):
                logger.error(
                    "Aborting! Please ensure the suffix of fastq files is <*>.fq.gz"
                )
                raise Exception("Fastq format error %s" % path)

    # Stays None when neither a read pair nor unpaired reads were supplied.
    seq_argument = None
    if seq_1 and seq_2:
        check_fastqs(seq_1, "Mate 1")
        check_fastqs(seq_2, "Mate 2")
        seq_argument = "--left %s --right %s" % (seq_1, seq_2)
    elif seq_u:
        check_fastqs(seq_u, "unpaired")
        seq_argument = "--single %s" % (seq_u)

    step = 0
    if start <= step:
        logger.info("--------------------------STEP %s--------------------------" % step)
        msg = "Erase Trinity work directory for %s" % sample
        # Trinity's running status is stored in hidden files, which must be
        # deleted before a rerun; '*' alone does not match hidden files.
        command = "rm -rf %(wk)s/* %(wk)s/.*" % {'wk': work_trinity}
        command = "bash -c \"%s\"" % command
        # Best effort: a failed cleanup must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    trinity_log = os.path.join(work_trinity, "trinity.log")
    # The context manager guarantees the log is closed even if a command raises.
    with open(trinity_log, "w") as trinity_log_fd:
        msg = "Run Trinity for %s" % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            if seq_argument is None:
                # Fail with a clear message instead of a NameError.
                logger.error("Aborting!")
                raise Exception(
                    "No input reads for %s: provide seq_1 and seq_2, or seq_u" % sample)
            command = "Trinity --seqType fq --max_memory %s --CPU %s --output %s %s %s" % (
                max_mem, nthreads, work_trinity, seq_argument, trinity_opts)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command,
                                   logger,
                                   raise_exception=True,
                                   env_dict={"OMP_NUM_THREADS": str(nthreads)})
            cmd.run(cmd_log_fd_out=trinity_log_fd,
                    cmd_log=trinity_log,
                    msg=msg,
                    timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        out_trinity = os.path.join(outdir, "trinity")
        create_dirs([out_trinity])
        msg = "Copy trinity transcripts to output directory for %s." % sample
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            if os.path.exists("%s/Trinity.fasta" % work_trinity):
                command = "cp %s/Trinity.fasta %s/" % (work_trinity, out_trinity)
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                cmd.run(cmd_log_fd_out=trinity_log_fd,
                        cmd_log=trinity_log,
                        msg=msg,
                        timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1
    return os.EX_OK
def _idpfusion_trim_soft_clips(sam_in, sam_out):
    """Rewrite a SAM file with soft-clipped bases removed.

    Header lines (first field starting with "@") are copied unchanged, records
    whose CIGAR column is "*" are dropped, and for records with an "S" in the
    CIGAR the leading/trailing clip is stripped from the CIGAR and from
    columns 10 and 11 (SEQ and QUAL in the SAM format).
    """
    with open(sam_in, "r") as sam_i, open(sam_out, "w") as sam_o:
        reader = csv.reader(sam_i, delimiter='\t', quotechar='|')
        writer = csv.writer(sam_o, delimiter='\t', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            if row[0][0] == "@":
                writer.writerow(row)
                continue
            if row[5] == "*":
                continue
            if "S" in row[5]:
                cigar = cigarstring_to_tuple(row[5])
                # Operation code 4 is handled as the clip to remove
                # (presumably the numeric code for "S" -- confirm against
                # cigarstring_to_tuple / CIGAR_OP_DICT_rev).
                if cigar[0][0] == 4:
                    clip = cigar[0][1]
                    row[9] = row[9][clip:]
                    row[10] = row[10][clip:]
                    cigar = cigar[1:]
                if cigar[-1][0] == 4:
                    clip = cigar[-1][1]
                    row[9] = row[9][:-clip]
                    row[10] = row[10][:-clip]
                    cigar = cigar[:-1]
                row[5] = "".join(
                    "%d%s" % (x[1], CIGAR_OP_DICT_rev[x[0]]) for x in cigar)
            writer.writerow(row)


def _idpfusion_fix_junctions(bed_in, bed_out):
    """Copy a junction BED file, normalising the 4th column.

    Rows with at least four columns whose 4th column has no "]" get that
    column replaced by the fixed placeholder "(2)[2_2](2/0)"; every other row
    is copied unchanged.
    """
    with open(bed_in, "r") as bed_i, open(bed_out, "w") as bed_o:
        reader = csv.reader(bed_i, delimiter='\t', quotechar='|')
        writer = csv.writer(bed_o, delimiter='\t', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            if len(row) >= 4 and "]" not in row[3]:
                row[3] = "(2)[2_2](2/0)"
            writer.writerow(row)


def _idpfusion_read_cfg(cfg_path):
    """Return the ``key = value`` pairs of a run.cfg file as a dict.

    Blank lines, lines starting with "#" and lines without "=" are ignored.
    """
    settings = {}
    with open(cfg_path, 'r') as cfg_file:
        for line in cfg_file:
            line = line.strip()
            if line == '':
                continue
            if "=" in line and not line[0] == '#':
                # Split on the first "=" only so values may contain "=".
                k, v = line.split("=", 1)
                settings[k.strip()] = v.strip()
    return settings


def _idpfusion_write_cfg(cfg_path, settings, defaults):
    """Write ``settings`` to cfg_path, then every default whose key is absent.

    ``defaults`` is an ordered list of ``(key, fmt, value)``; ``fmt`` is the
    %-format used for the value, or None to write an empty value.
    """
    with open(cfg_path, 'w') as cfg_file:
        for k, v in settings.items():
            cfg_file.write("%s = %s \n" % (k, v))
        for key, fmt, value in defaults:
            if key in settings:
                continue
            if fmt is None:
                cfg_file.write("%s = \n" % key)
            else:
                cfg_file.write(("%s = " + fmt + " \n") % (key, value))


def run_idpfusion(alignment="", short_junction="", long_alignment="", mode_number=0,
                  short_fasta="", long_fasta="", ref_genome="", ref_all_gpd="",
                  ref_gpd="", uniqueness_bedgraph="", genome_bowtie2_idx="",
                  transcriptome_bowtie2_idx="", read_length=100, idpfusion_cfg="",
                  idpfusion=IDPFUSION, samtools=SAMTOOLS, gmap=GMAP, gmap_idx="",
                  star_dir=STAR_DIR, bowtie2_dir=BOWTIE2_DIR, start=0, sample="",
                  nthreads=1, workdir=None, outdir=None, timeout=TIMEOUT):
    """Run the IDP-fusion long-read fusion detection pipeline for one sample.

    Steps (a step runs only when ``start <= step``):
      0 erase the work directory, 1 convert a .bam alignment to SAM,
      2 trim soft clips in the SAM, 3 normalise the junction BED,
      4 prepare run.cfg (user cfg values win, missing keys get defaults),
      5 run IDP-fusion, 6 convert isoform.gpd to GTF,
      7 copy fusion_report.tsv to the output directory.

    A ``mode_number`` greater than 0 forces ``start`` to 4.

    Returns:
        Path of ``<outdir>/idpfusion/<sample>/fusion_report.tsv`` if it exists
        after the run, otherwise an empty string.

    Raises:
        Exception: if the alignment, the junction BED or a given
            ``idpfusion_cfg`` file does not exist.
    """
    banner = "--------------------------STEP %s--------------------------"
    logger.info("Running long read fusion Detection (IDP-fusion) for %s" % sample)
    if not os.path.exists(alignment):
        logger.error("Aborting!")
        raise Exception("No input short read alignment BAM/SAM file %s" % alignment)
    if not os.path.exists(short_junction):
        logger.error("Aborting!")
        raise Exception("No input short read junction BED file %s" % short_junction)
    if idpfusion_cfg and not os.path.exists(idpfusion_cfg):
        logger.error("Aborting!")
        raise Exception("No input .cfg file %s" % idpfusion_cfg)

    if mode_number > 0:
        start = 4

    work_idpfusion = "%s/idpfusion/%s/" % (workdir, sample)
    create_dirs([work_idpfusion])
    cfg_path = "%s/run.cfg" % work_idpfusion

    step = 0
    if start <= step:
        logger.info(banner % step)
        msg = "Erase IDP-fusion work directory for %s" % sample
        command = "bash -c \"%s\"" % ("rm -rf %s/*" % (work_idpfusion))
        # Best effort: a failed cleanup must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    idpfusion_log = os.path.join(work_idpfusion, "idpfusion.log")
    # The log is opened only after the erase step (which would delete it) and
    # is closed when the pipeline finishes or fails.
    with open(idpfusion_log, "w") as idpfusion_log_fd:

        def run_logged(command, msg, shell=True):
            """Run one command, failing on error, with output sent to the log."""
            if shell:
                command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log,
                    msg=msg, timeout=timeout)

        msg = "converting BAM to SAM for %s" % sample
        logger.info(banner % step)
        if start <= step:
            if alignment.endswith('.bam'):
                run_logged("%s view -h -o %s/alignments.sam %s " % (
                    samtools, work_idpfusion, alignment), msg)
                alignment = "%s/alignments.sam" % (work_idpfusion)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Fix soft-clipped reads in SAM for %s" % sample
        logger.info(banner % step)
        if start <= step:
            logger.info("Task: %s" % msg)
            corrected_alignment = "%s/alignments_corrected.sam" % (work_idpfusion)
            _idpfusion_trim_soft_clips(alignment, corrected_alignment)
            alignment = corrected_alignment
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Fix junction bed for %s" % sample
        logger.info(banner % step)
        if start <= step:
            logger.info("Task: %s" % msg)
            corrected_junction = "%s/splicesites_corrected.bed" % (work_idpfusion)
            _idpfusion_fix_junctions(short_junction, corrected_junction)
            short_junction = corrected_junction
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Preparing run.cfg for %s" % sample
        if start <= step:
            logger.info(banner % step)
            logger.info("Task: %s" % msg)
            if idpfusion_cfg:
                msg = "copy IDP-fusion .cfg file for %s" % sample
                run_logged("cp %s %s/run.cfg" % (idpfusion_cfg, work_idpfusion), msg)
            else:
                # Start from an empty cfg so every key falls back to a default.
                open(cfg_path, 'w').close()
            cgf_dict = _idpfusion_read_cfg(cfg_path)

            defaults = [
                ("temp_foldername", "%s", "%s/tmp/" % work_idpfusion),
                ("output_foldername", "%s", "%s/out/" % work_idpfusion),
                ("Nthread", "%d", nthreads),
            ]
            # The long-read PSL is only referenced when it actually exists.
            if long_alignment and os.path.exists(long_alignment):
                defaults.append(("LR_psl_pathfilename", "%s", long_alignment))
            defaults += [
                ("LR_pathfilename", "%s", long_fasta),
                ("SR_sam_pathfilename", "%s", alignment),
                ("SR_jun_pathfilename", "%s", short_junction),
                ("SR_pathfilename", "%s", short_fasta),
                ("SR_aligner_choice", "%s", "STAR"),
                ("star_path", "%s", star_dir),
                ("gmap_executable_pathfilename", "%s", gmap),
                ("gmap_index_pathfoldername", "%s", gmap_idx),
                ("genome_bowtie2_index_pathfilename", "%s", genome_bowtie2_idx),
                ("transcriptome_bowtie2_index_pathfilename", "%s",
                 transcriptome_bowtie2_idx),
                ("allref_annotation_pathfilename", "%s", ref_all_gpd),
                ("ref_annotation_pathfilename", "%s", ref_gpd),
                ("genome_pathfilename", "%s", ref_genome),
                ("estimator_choice", "%s", "MAP"),
                ("FPR", "%s", "0.1"),
                ("Njun_limit", "%s", "10"),
                ("Niso_limit", "%s", "20"),
                ("L_exon_limit", "%s", "1700"),
                ("L_min_intron", "%s", "68"),
                ("Bfile_Npt", "%s", "50"),
                ("Bfile_Nbin", "%s", "5"),
                ("min_LR_overlap_len", "%s", "100"),
                ("LR_fusion_point_err_margin", "%s", "100"),
                ("min_LR_fusion_point_search_distance", "%s", "20"),
                ("uniq_LR_alignment_margin_perc", "%s", "20"),
                ("Niso_fusion_limit", "%s", "1000"),
                ("psl_type", "%s", "0"),
                ("read_length", "%d", read_length),
                ("min_junction_overlap_len", "%s", "10"),
                ("I_refjun_isoformconstruction", "%s", "1"),
                ("I_ref5end_isoformconstruction", "%s", "1"),
                ("I_ref3end_isoformconstruction", "%s", "1"),
                ("fusion_mode", "%s", "1"),
                ("uniqueness_bedGraph_pathfilename", "%s", uniqueness_bedgraph),
                ("exon_construction_junction_span", "%s", "1"),
                ("aligner_choice", "%s", "gmap"),
                ("three_primer", None, None),
                ("five_primer", None, None),
            ]
            _idpfusion_write_cfg(cfg_path, cgf_dict, defaults)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        # Make the aligners reachable for the IDP-fusion process.
        if star_dir:
            os.environ["PATH"] += ":%s/" % star_dir
        if bowtie2_dir:
            os.environ["PATH"] += ":%s/" % bowtie2_dir

        msg = "IDP-fusion for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged("%s %s/run.cfg %d" % (idpfusion, work_idpfusion, mode_number),
                       msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Convert transcript GPD file to GTF for %s" % sample
        if start <= step:
            logger.info(banner % step)
            if os.path.exists("%s/out/isoform.gpd" % work_idpfusion):
                sort_gpd("%s/out/isoform.gpd" % work_idpfusion,
                         "%s/out/isoform_sorted.gpd" % work_idpfusion)
                run_logged(
                    "gpd2gtf.py %s/out/isoform_sorted.gpd %s/out/isoform.exp "
                    "%s/out/isoform.gtf IDP" % (
                        work_idpfusion, work_idpfusion, work_idpfusion), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        out_idpfusion = os.path.join(outdir, "idpfusion", sample)
        create_dirs([out_idpfusion])
        msg = "Copy predictions to output directory for %s." % sample
        if start <= step:
            logger.info(banner % step)
            if os.path.exists("%s/out/fusion_report.tsv" % work_idpfusion):
                run_logged("cp %s/out/fusion_report.tsv %s/fusion_report.tsv" % (
                    work_idpfusion, out_idpfusion), msg, shell=False)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    fusions = ""
    if os.path.exists("%s/fusion_report.tsv" % out_idpfusion):
        logger.info("IDP-fusion was successfull!")
        logger.info("Output fusions: %s/fusion_report.tsv" % out_idpfusion)
        fusions = "%s/fusion_report.tsv" % out_idpfusion
    else:
        logger.info("IDP-fusion failed!")
    return fusions
def run_hisat2(align_idx=None, seq_1="", seq_2="", seq_u="", seq_sra="", ref_gtf="",
               hisat2_opts="", hisat2=HISAT2, hisat2_sps=HISAT2_SPS,
               samtools=SAMTOOLS, start=0, sample="", nthreads=1, workdir=None,
               outdir=None, timeout=TIMEOUT):
    """Align short reads with HISAT2 and publish a sorted BAM plus junctions.

    Steps (a step runs only when ``start <= step``):
      0 erase the work directory, 1 prepare the known-splicesite file,
      2 run HISAT2, 3 SAM -> BAM, 4 sort the BAM, 5 junctions -> BED,
      6 remove the temporary SAM/BAM, 7 copy results to the output directory.

    Reads are taken from ``seq_1``/``seq_2`` (paired), else ``seq_u``
    (unpaired), else ``seq_sra``; each is a comma-separated list whose
    entries must exist on disk.

    Returns:
        ``(alignments_bam, junctions_tab, junctions_bed)`` paths under
        ``<outdir>/hisat2/<sample>``, or three empty strings when the sorted
        BAM is not present in the output directory.

    Raises:
        Exception: if the index, an input read file or the reference GTF is
            missing, or if the HISAT2 step has to run without any reads.
    """
    banner = "--------------------------STEP %s--------------------------"
    logger.info("Running alignment (HISAT2) for %s" % sample)
    # Guard against the None default before building the index file name.
    if not align_idx or not os.path.exists(align_idx + ".1.ht2"):
        logger.error("Aborting!")
        raise Exception("No HISAT index file %s.1.ht2" % align_idx)

    # Stays None when no reads were given; only the HISAT2 step needs it, so
    # restarting from a later step without reads keeps working.
    seq_argument = None
    if seq_1 and seq_2:
        for s1 in seq_1.split(","):
            if not os.path.exists(s1):
                logger.error("Aborting!")
                raise Exception("No Mate 1 sequence file %s" % s1)
        for s2 in seq_2.split(","):
            if not os.path.exists(s2):
                logger.error("Aborting!")
                raise Exception("No Mate 2 sequence file %s" % s2)
        seq_argument = "-1 %s -2 %s" % (seq_1, seq_2)
    elif seq_u:
        seq_argument = "-U %s" % (seq_u)
        for su in seq_u.split(","):
            if not os.path.exists(su):
                logger.error("Aborting!")
                raise Exception("No unpaired sequence file %s" % su)
    elif seq_sra:
        seq_argument = "--sra-acc %s" % (seq_sra)
        for sr in seq_sra.split(","):
            if not os.path.exists(sr):
                logger.error("Aborting!")
                raise Exception("No sra sequence file %s" % sr)

    work_hisat2 = os.path.join(workdir, "hisat2", sample)
    create_dirs([work_hisat2])

    step = 0
    if start <= step:
        logger.info(banner % step)
        msg = "Erase HISAT2 work directory for %s" % sample
        command = "bash -c \"%s\"" % ("rm -rf %s/*" % (work_hisat2))
        # Best effort: a failed cleanup must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    hisat2_log = os.path.join(work_hisat2, "hisat2.log")
    # Opened after the erase step (which would delete it); closed on exit.
    with open(hisat2_log, "w") as hisat2_log_fd:

        def run_logged(command, msg, shell=True):
            """Run one command, failing on error, with output sent to the log."""
            if shell:
                command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log,
                    msg=msg, timeout=timeout)

        ksps = ""
        msg = "Prepare known-splicesites for %s" % sample
        if start <= step:
            logger.info(banner % step)
            if ref_gtf:
                if not os.path.exists(ref_gtf):
                    logger.error("Aborting!")
                    raise Exception("No reference GTF file %s" % ref_gtf)
                # A precomputed file named <ref_gtf>known-splicesite.txt is
                # preferred over recomputing the splice sites.
                ksps = ref_gtf.strip() + "known-splicesite.txt"
                if os.path.exists(ksps):
                    logger.info(
                        "Will use the precomputed %s as --known-splicesite-infile for HISAT2"
                        % ksps)
                else:
                    msg = "compute --known-splicesite-infile for HISAT2"
                    ksps = os.path.join(work_hisat2, "known-splicesite.txt")
                    command = "bash -c \"%s\"" % ("%s %s" % (hisat2_sps, ref_gtf))
                    # The extractor's stdout is the splice-site file itself.
                    with open(ksps, "w") as ksps_fd:
                        cmd = TimedExternalCmd(command, logger, raise_exception=True)
                        cmd.run(cmd_log_fd_out=ksps_fd, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        # Add defaults only for options the caller did not already supply.
        if "--dta " not in hisat2_opts:
            hisat2_opts += " --dta"
        if "--rg-id " not in hisat2_opts:
            hisat2_opts += " --rg-id hisat2"
        if "--rg " not in hisat2_opts:
            hisat2_opts += " --rg SM:%s" % sample
        if "--threads " not in hisat2_opts:
            hisat2_opts += " --threads %d" % nthreads
        if ksps:
            hisat2_opts += " --known-splicesite-infile %s" % ksps

        msg = "HISAT2 for %s" % sample
        if start <= step:
            logger.info(banner % step)
            if seq_argument is None:
                logger.error("Aborting!")
                raise Exception(
                    "No input reads given: provide seq_1 and seq_2, seq_u or seq_sra")
            run_logged(
                "%s %s -x %s %s -S %s/alignments.sam --novel-splicesite-outfile %s/splicesites.tab" % (
                    hisat2, hisat2_opts, align_idx, seq_argument,
                    work_hisat2, work_hisat2), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "converting SAM to BAM for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged("%s view -Su %s/alignments.sam -@ %d -o %s/alignments.bam" % (
                samtools, work_hisat2, nthreads, work_hisat2), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "sorting BAM for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged(
                "%s sort -@ %d -T %s/alignments.sorted -o %s/alignments.sorted.bam %s/alignments.bam " % (
                    samtools, nthreads, work_hisat2, work_hisat2, work_hisat2), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Converting junctions to BED for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged("hisat2_jun2bed.py %s/splicesites.tab %s/splicesites.bed " % (
                work_hisat2, work_hisat2), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Clean temp alignment files for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged("rm %s/alignments.sam %s/alignments.bam" % (
                work_hisat2, work_hisat2), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        out_hisat2 = os.path.join(outdir, "hisat2", sample)
        create_dirs([out_hisat2])
        msg = "Copy predictions to output directory for %s." % sample
        if start <= step:
            logger.info(banner % step)
            results = ("alignments.sorted.bam", "splicesites.tab", "splicesites.bed")
            # Publish only a complete result set.
            if all(os.path.exists("%s/%s" % (work_hisat2, name)) for name in results):
                for name in results:
                    run_logged("cp %s/%s %s/%s" % (
                        work_hisat2, name, out_hisat2, name), msg, shell=False)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    alignments_bam = ""
    junctions_tab = ""
    junctions_bed = ""
    if os.path.exists("%s/alignments.sorted.bam" % out_hisat2):
        logger.info("HISAT2 was successfull!")
        logger.info("Output alignment: %s/alignments.sorted.bam" % out_hisat2)
        logger.info("Output junction tab: %s/splicesites.tab" % out_hisat2)
        logger.info("Output junction bed: %s/splicesites.bed" % out_hisat2)
        alignments_bam = "%s/alignments.sorted.bam" % out_hisat2
        junctions_tab = "%s/splicesites.tab" % out_hisat2
        junctions_bed = "%s/splicesites.bed" % out_hisat2
    else:
        logger.info("HISAT2 failed!")
    return alignments_bam, junctions_tab, junctions_bed
def run_giremi(alignment="", variant="", strand_pos="", genes_pos="", ref_genome="",
               knownsites="", giremi_dir="", htslib_dir="", samtools=SAMTOOLS,
               gatk=GATK, java=JAVA, giremi_opts="", java_opts="",
               VariantAnnotator_opts="", start=0, sample="", nthreads=1,
               workdir=None, outdir=None, timeout=TIMEOUT):
    """Run the GIREMI RNA-editing detection pipeline for one sample.

    Steps (a step runs only when ``start <= step``):
      0 erase the work directory, 1 name-sort the BAM, 2 filter alignments
      mapped to multiple chromosomes, 3 position-sort the BAM,
      4 GATK VariantAnnotator, 5 find variant strands, 6 run GIREMI,
      7 collect variants flagged "N", 8 remove them from the SNV list,
      9 rerun GIREMI, 10 copy the result to the output directory.
    Steps 7-9 are only attempted when GIREMI wrote giremi_out.txt but no
    giremi_out.txt.res; the step counter advances by three either way.

    Returns:
        Path of ``<outdir>/giremi/<sample>/giremi_out.txt.res`` if it exists
        after the run, otherwise an empty string.

    Raises:
        Exception: if any required input file, or a given ``giremi_dir``,
            does not exist.
    """
    banner = "--------------------------STEP %s--------------------------"
    logger.info("Running RNA editing detection (GIREMI) for %s"%sample)
    required_inputs = (
        (alignment, "No alignment file %s"),
        (variant, "No variant VCF file %s"),
        (strand_pos, "No strand position BED file %s"),
        (genes_pos, "No genes position BED file %s"),
        (ref_genome, "No reference genome FASTA file %s"),
        (knownsites, "No VCF knownsites file %s"),
    )
    for path, error_fmt in required_inputs:
        if not os.path.exists(path):
            logger.error("Aborting!")
            raise Exception(error_fmt % path)
    if giremi_dir and not os.path.exists(giremi_dir):
        logger.error("Aborting!")
        raise Exception("No GIREMI directory %s"%giremi_dir)

    work_giremi = os.path.join(workdir, "giremi", sample)
    create_dirs([work_giremi])

    # Add defaults only for options the caller did not already supply.
    if nthreads > 1 and "-nt " not in VariantAnnotator_opts:
        VariantAnnotator_opts += " -nt %d"%nthreads
    if "-Xms" not in java_opts:
        java_opts += " %s"%JAVA_XMS
    if "-Xmx" not in java_opts:
        java_opts += " %s"%JAVA_XMG
    if "-Djava.io.tmpdir" not in java_opts:
        java_opts += " -Djava.io.tmpdir=%s/javatmp/"%(work_giremi)

    step = 0
    if start <= step:
        logger.info(banner % step)
        msg = "Erase GIREMI work directory for %s"%sample
        command = "bash -c \"%s\"" % ("rm -rf %s/*" % (work_giremi))
        # Best effort: a failed cleanup must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    giremi_log = os.path.join(work_giremi, "giremi.log")
    # Opened after the erase step (which would delete it); closed on exit.
    with open(giremi_log, "w") as giremi_log_fd:

        def run_logged(command, msg, shell=True, raise_exception=True):
            """Run one command with its output sent to the GIREMI log."""
            if shell:
                command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=raise_exception)
            cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log,
                    msg=msg, timeout=timeout)

        msg = "Sort BAM by name for %s"%sample
        if start <= step:
            logger.info(banner % step)
            run_logged("%s sort -n -@ %d %s %s/alignments.name_sorted" % (
                samtools, nthreads, alignment, work_giremi), msg)
        else:
            logger.info("Skipping step %d: %s"%(step,msg))
        step += 1

        msg = "Filter alignments mapped to multiple chromosoms for %s"%sample
        if start <= step:
            logger.info(banner % step)
            logger.info(msg)
            filter_multi_chr_alignments("%s/alignments.name_sorted.bam"%work_giremi,
                                        "%s/alignments.chr_unique.bam"%work_giremi)
        else:
            logger.info("Skipping step %d: %s"%(step,msg))
        step += 1

        msg = "Sort BAM by pos for %s"%sample
        if start <= step:
            logger.info(banner % step)
            run_logged("%s sort -@ %d %s/alignments.chr_unique.bam %s/alignments.pos_sorted " % (
                samtools, nthreads, work_giremi, work_giremi), msg)
        else:
            logger.info("Skipping step %d: %s"%(step,msg))
        step += 1

        msg = "GATK VariantAnnotator for %s"%sample
        if start <= step:
            logger.info(banner % step)
            # The variant file is passed both as -V and as the -L interval list.
            run_logged(
                "%s %s -jar %s -T VariantAnnotator -R %s -V %s -L %s -o %s/annotated.vcf --dbsnp %s %s" % (
                    java, java_opts, gatk, ref_genome, variant, variant,
                    work_giremi, knownsites, VariantAnnotator_opts), msg)
        else:
            logger.info("Skipping step %d: %s"%(step,msg))
        step += 1

        msg = "Find variant strands for %s"%sample
        if start <= step:
            logger.info(banner % step)
            logger.info(msg)
            find_SNV_strands(strand_pos, genes_pos,
                             "%s/annotated.vcf"%work_giremi,
                             "%s/SNV_annotated.bed"%work_giremi)
        else:
            logger.info("Skipping step %d: %s"%(step,msg))
        step += 1

        # Make htslib and the GIREMI directory visible to the child process.
        if htslib_dir:
            if "LD_LIBRARY_PATH" in os.environ:
                os.environ["LD_LIBRARY_PATH"] += ":%s/"%htslib_dir
            else:
                os.environ["LD_LIBRARY_PATH"] = htslib_dir
        if giremi_dir:
            os.environ["PATH"] += ":%s/"%giremi_dir

        abs_work = os.path.abspath(work_giremi)
        giremi_cmd_fmt = "cd %s && %s %s -f %s -l %s/{bed} -o %s/giremi_out.txt %s/alignments.pos_sorted.bam" % (
            giremi_dir, GIREMI, giremi_opts, os.path.abspath(ref_genome),
            abs_work, abs_work, abs_work)

        msg = "Run GIREMI for %s"%sample
        if start <= step:
            logger.info(banner % step)
            # GIREMI failures are tolerated here; the N-variant retry below
            # handles the case where no .res file was produced.
            run_logged(giremi_cmd_fmt.replace("{bed}", "SNV_annotated.bed"), msg,
                       raise_exception=False)
        else:
            logger.info("Skipping step %d: %s"%(step,msg))
        step += 1

        giremi_out = "%s/giremi_out.txt"%work_giremi
        n_bed = "%s/N.bed"%work_giremi
        filtered_bed = "%s/SNV_annotated_filtered.bed"%work_giremi
        if os.path.exists(giremi_out) and not os.path.exists("%s.res" % giremi_out):
            msg = "Identify N variants for %s"%sample
            if start <= step:
                logger.info(banner % step)
                logger.info(msg)
                with open(giremi_out) as csv_file_i:
                    spamreader = csv.reader(csv_file_i, delimiter='\t', quotechar='|')
                    with open(n_bed, 'wb') as csvfile_o:
                        spamwriter = csv.writer(csvfile_o, delimiter='\t',
                                                quotechar='|',
                                                quoting=csv.QUOTE_MINIMAL)
                        for row in spamreader:
                            if row[5] == "N" or row[8] == "N":
                                # Emit a one-base interval ending at column 2.
                                spamwriter.writerow([row[0], int(row[1])-1, row[1]])
            else:
                logger.info("Skipping step %d: %s"%(step,msg))
            step += 1

            # N.bed is absent when the step above was skipped on a fresh work
            # directory; treat that as "no N variants" instead of crashing.
            cnt = len(pybedtools.BedTool(n_bed)) if os.path.exists(n_bed) else 0
            if cnt > 0:
                msg = "Remove N variants for %s"%sample
                if start <= step:
                    logger.info(banner % step)
                    logger.info(msg)
                    pybedtools.BedTool("%s/SNV_annotated.bed"%work_giremi).intersect(
                        n_bed, r=True, f=1, v=True).saveas(filtered_bed)
                else:
                    logger.info("Skipping step %d: %s"%(step,msg))
                step += 1

                msg = "Rerun GIREMI for %s"%sample
                if start <= step:
                    logger.info(banner % step)
                    if os.path.exists(filtered_bed):
                        run_logged(
                            giremi_cmd_fmt.replace("{bed}", "SNV_annotated_filtered.bed"),
                            msg, raise_exception=False)
                    else:
                        logger.info("No file %s/SNV_annotated_filtered.bed"%work_giremi)
                else:
                    logger.info("Skipping step %d: %s"%(step,msg))
                step += 1
            else:
                # Keep step numbering stable when steps 8-9 are not needed.
                step += 2
        else:
            # Keep step numbering stable when steps 7-9 are not needed.
            step += 3

        out_giremi = os.path.join(outdir, "giremi", sample)
        create_dirs([out_giremi])
        msg = "Copy predictions to output directory for %s."%sample
        if start <= step:
            logger.info(banner % step)
            if os.path.exists("%s/giremi_out.txt.res"%work_giremi):
                run_logged("cp %s/giremi_out.txt.res %s/giremi_out.txt.res"%(
                    work_giremi, out_giremi), msg, shell=False)
        else:
            logger.info("Skipping step %d: %s"%(step,msg))
        step += 1

    edits = ""
    if os.path.exists("%s/giremi_out.txt.res"%out_giremi):
        logger.info("GIREMI was successfull!")
        logger.info("Output edits: %s/giremi_out.txt.res"%out_giremi)
        edits = "%s/giremi_out.txt.res"%out_giremi
    else:
        logger.info("GIREMI failed!")
    return edits
def run_starlong(long="", genome_dir="", ref_gtf="", starlong=STARLONG,
                 sam2psl=SAM2PSL, samtools=SAMTOOLS, starlong_opts="", start=0,
                 sample="", nthreads=1, workdir=None, outdir=None, timeout=TIMEOUT):
    """Align long reads with STARlong and publish the alignments as PSL.

    Steps (a step runs only when ``start <= step``):
      0 erase the work directory, 1 run STARlong, 2 SAM -> PSL,
      3 SAM -> BAM, 4 copy Aligned.out.psl to the output directory.

    Options missing from ``starlong_opts`` are filled in from the sample
    name, ``nthreads``, ``ref_gtf`` and ``STARLONG_DEFAULTS``.

    Returns:
        Path of ``<outdir>/starlong/<sample>/Aligned.out.psl`` if it exists
        after the run, otherwise an empty string.

    Raises:
        Exception: if ``genome_dir`` has no SAindex entry, or a given long
            read file or reference GTF does not exist.
    """
    banner = "--------------------------STEP %s--------------------------"
    logger.info("Running long read alignment (STARlong) for %s" % sample)
    # os.path.join works whether or not genome_dir ends with a separator.
    if not os.path.exists(os.path.join(genome_dir, "SAindex")):
        logger.error("Aborting!")
        raise Exception("No SAindex directory in %s" % genome_dir)
    if long and not os.path.exists(long):
        logger.error("Aborting!")
        raise Exception("No long read sequence file %s" % long)

    work_starlong = os.path.join(workdir, "starlong", sample)
    create_dirs([work_starlong])

    step = 0
    if start <= step:
        logger.info(banner % step)
        msg = "Erase STARlong work directory for %s" % sample
        command = "bash -c \"%s\"" % ("rm -rf %s/*" % (work_starlong))
        # Best effort: a failed cleanup must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    starlong_log = os.path.join(work_starlong, "starlong.log")
    # Opened after the erase step (which would delete it); closed on exit.
    with open(starlong_log, "w") as starlong_log_fd:

        def run_logged(command, msg, shell=True):
            """Run one command, failing on error, with output sent to the log."""
            if shell:
                command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log,
                    msg=msg, timeout=timeout)

        if ref_gtf and not os.path.exists(ref_gtf):
            logger.error("Aborting!")
            raise Exception("No reference GTF file %s" % ref_gtf)

        # Add defaults only for options the caller did not already supply.
        if "--outSAMattrRGline" not in starlong_opts:
            starlong_opts += " --outSAMattrRGline ID:STARlong SM:%s" % sample
        if "--runThreadN " not in starlong_opts:
            starlong_opts += " --runThreadN %d" % nthreads
        if ref_gtf:
            starlong_opts += " --sjdbGTFfile %s" % ref_gtf
        for k, v in STARLONG_DEFAULTS.items():
            if k not in starlong_opts:
                starlong_opts += " --%s %s" % (k, v)

        msg = "STARlong for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged(
                "%s --runMode alignReads %s --genomeDir %s --readFilesIn %s --outFileNamePrefix %s/" % (
                    starlong, starlong_opts, genome_dir, long, work_starlong), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "converting SAM to PSL for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged("%s -i %s/Aligned.out.sam -o %s/Aligned.out.psl" % (
                sam2psl, work_starlong, work_starlong), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "converting SAM to BAM for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged("%s view -Su %s/Aligned.out.sam -@ %d -o %s/Aligned.out.bam" % (
                samtools, work_starlong, nthreads, work_starlong), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        # NOTE: there is no cleanup step here; Aligned.out.sam is left in the
        # work directory.

        out_starlong = os.path.join(outdir, "starlong", sample)
        create_dirs([out_starlong])
        msg = "Copy predictions to output directory for %s." % sample
        if start <= step:
            logger.info(banner % step)
            if os.path.exists("%s/Aligned.out.psl" % work_starlong):
                run_logged("cp %s/Aligned.out.psl %s/Aligned.out.psl" % (
                    work_starlong, out_starlong), msg, shell=False)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    alignments_psl = ""
    if os.path.exists("%s/Aligned.out.psl" % out_starlong):
        logger.info("STARlong was successfull!")
        logger.info("Output alignment: %s/Aligned.out.psl" % out_starlong)
        alignments_psl = "%s/Aligned.out.psl" % out_starlong
    else:
        logger.info("STARlong failed!")
    return alignments_psl
def run_idp(alignment="", short_junction="", long_alignment="", mode_number=0,
            ref_genome="", ref_all_gpd="", ref_gpd="", read_length=100,
            idp_cfg="", idp=IDP, samtools=SAMTOOLS,
            start=0, sample="", nthreads=1,
            workdir=None, outdir=None, timeout=TIMEOUT):
    """Run long-read transcriptome reconstruction (IDP) for one sample.

    The work is split into numbered steps; every step whose number is lower
    than ``start`` is skipped:

      0. erase the IDP work directory (``<workdir>/idp/<sample>/``)
      1. convert the short-read alignment to SAM when it ends with ``.bam``
      2. prepare ``run.cfg`` (copied from ``idp_cfg`` when given; any key that
         is missing is filled in from the function arguments / defaults)
      3. run IDP on ``run.cfg`` with ``mode_number``
      4. sort ``out/isoform.gpd`` and convert it to GTF with ``gpd2gtf.py``
      5. copy ``isoform.gtf`` and ``isoform.exp`` to ``<outdir>/idp/<sample>``

    Returns:
        A ``(transcripts, abundances)`` tuple with the paths of the copied
        ``isoform.gtf`` and ``isoform.exp``; both are ``""`` when either file
        is missing from the output directory.

    Raises:
        Exception: if ``alignment``, ``short_junction``, ``long_alignment`` or
            a given ``idp_cfg`` does not exist.
    """
    logger.info("Running long-read transcriptome reconstruction (IDP) for %s" % sample)

    required_inputs = (
        (alignment, "No input short read alignment BAM/SAM file %s"),
        (short_junction, "No input short read junction BED file %s"),
        (long_alignment, "No input long read alignment PSL file %s"),
    )
    for path, error_template in required_inputs:
        if not os.path.exists(path):
            logger.error("Aborting!")
            raise Exception(error_template % path)
    if idp_cfg and not os.path.exists(idp_cfg):
        logger.error("Aborting!")
        raise Exception("No input .cfg file %s" % idp_cfg)

    if mode_number > 0:
        # NOTE(review): with the step numbering below this also skips the IDP
        # run itself (step 3) -- confirm this is the intended resume point.
        start = 4

    work_idp = "%s/idp/%s/" % (workdir, sample)
    create_dirs([work_idp])
    banner = "--------------------------STEP %s--------------------------"

    step = 0
    if start <= step:
        logger.info(banner % step)
        msg = "Erase IDP work directory for %s" % sample
        command = "bash -c \"%s\"" % ("rm -rf %s/*" % work_idp)
        # Best effort: a failed clean-up must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    idp_log = os.path.join(work_idp, "idp.log")
    # The context manager guarantees the log is closed even when a step raises.
    with open(idp_log, "w") as idp_log_fd:

        def run_logged(command, msg, shell=True):
            """Run one external command, sending its output to idp.log."""
            if shell:
                command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            return cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log,
                           msg=msg, timeout=timeout)

        msg = "converting BAM to SAM for %s" % sample
        converted_sam = "%s/alignments.sam" % work_idp
        if start <= step:
            logger.info(banner % step)
            if alignment.endswith('.bam'):
                run_logged("%s view -h -o %s/alignments.sam %s " % (
                    samtools, work_idp, alignment), msg)
                alignment = converted_sam
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
            # When resuming, keep pointing run.cfg at the SAM produced by an
            # earlier run instead of the original BAM.
            if alignment.endswith('.bam') and os.path.exists(converted_sam):
                alignment = converted_sam
        step += 1

        msg = "Preparing run.cfg for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_cfg = "%s/run.cfg" % work_idp
            if idp_cfg:
                msg = "copy IDP .cfg file for %s" % sample
                run_logged("cp %s %s/run.cfg" % (idp_cfg, work_idp), msg)
            else:
                # Start from an empty configuration.
                open(run_cfg, 'w').close()

            # Keep every "key = value" line of the user's configuration.
            cfg_dict = {}
            with open(run_cfg, 'r') as cfg_file:
                for line in cfg_file:
                    line = line.strip()
                    if line == '':
                        continue
                    if "=" in line and not line[0] == '#':
                        # Split on the first "=" only: values may contain "=".
                        key, value = line.split("=", 1)
                        cfg_dict[key.strip()] = value.strip()

            # Written only for keys the user's configuration did not set.
            cfg_defaults = [
                ("temp_foldername", "%s/tmp/" % work_idp),
                ("output_foldername", "%s/out/" % work_idp),
                ("Nthread", "%d" % nthreads),
                ("LR_psl_pathfilename", long_alignment),
                ("SR_sam_pathfilename", alignment),
                ("SR_jun_pathfilename", short_junction),
                ("genome_pathfilename", ref_genome),
                ("allref_annotation_pathfilename", ref_all_gpd),
                ("ref_annotation_pathfilename", ref_gpd),
                ("estimator_choice", "MLE"),
                ("FPR", "0.05"),
                ("Njun_limit", "10"),
                ("Niso_limit", "100"),
                ("aligner_choice", "gmap"),
                ("exon_construction_junction_span", "1"),
                ("read_length", "%d" % read_length),
            ]
            with open(run_cfg, 'w') as cfg_file:
                for key, value in cfg_dict.items():
                    cfg_file.write("%s = %s \n" % (key, value))
                for key, value in cfg_defaults:
                    if key not in cfg_dict:
                        cfg_file.write("%s = %s \n" % (key, value))
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "IDP for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged("%s %s/run.cfg %d" % (idp, work_idp, mode_number), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "Convert transcript GPD file to GTF for %s" % sample
        if start <= step:
            logger.info(banner % step)
            if os.path.exists("%s/out/isoform.gpd" % work_idp):
                sort_gpd("%s/out/isoform.gpd" % work_idp,
                         "%s/out/isoform_sorted.gpd" % work_idp)
                run_logged(
                    "gpd2gtf.py "
                    "%s/out/isoform_sorted.gpd %s/out/isoform.exp %s/out/isoform.gtf IDP" % (
                        work_idp, work_idp, work_idp), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        out_idp = os.path.join(outdir, "idp", sample)
        create_dirs([out_idp])
        msg = "Copy predictions to output directory for %s." % sample
        if start <= step:
            logger.info(banner % step)
            if os.path.exists("%s/out/isoform.gtf" % work_idp) and \
                    os.path.exists("%s/out/isoform.exp" % work_idp):
                run_logged("cp %s/out/isoform.gtf %s/isoform.gtf" % (
                    work_idp, out_idp), msg, shell=False)
                run_logged("cp %s/out/isoform.exp %s/isoform.exp" % (
                    work_idp, out_idp), msg, shell=False)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    transcripts = ""
    abundances = ""
    if os.path.exists("%s/isoform.gtf" % out_idp) and \
            os.path.exists("%s/isoform.exp" % out_idp):
        logger.info("IDP was successfull!")
        logger.info("Output isoforms: %s/isoform.gtf" % out_idp)
        logger.info("Output expressions: %s/isoform.exp" % out_idp)
        transcripts = "%s/isoform.gtf" % out_idp
        abundances = "%s/isoform.exp" % out_idp
    else:
        logger.info("IDP failed!")
    return transcripts, abundances
def run_gatk(alignment="", ref_genome="", knownsites="",
             picard=PICARD, gatk=GATK,
             java=JAVA, java_opts="",
             CleanSam=False, IndelRealignment=False, no_BaseRecalibrator=False,
             AddOrReplaceReadGroups_opts="", MarkDuplicates_opts="",
             SplitNCigarReads_opts="",
             RealignerTargetCreator_opts="",
             IndelRealigner_opts="",
             BaseRecalibrator_opts="",
             PrintReads_opts="",
             HaplotypeCaller_opts="",
             VariantFiltration_opts="",
             start=0, sample="", nthreads=1,
             workdir=None, outdir=None, timeout=TIMEOUT):
    """Run the Picard/GATK variant-calling workflow for one sample.

    The work is split into numbered steps; every step whose number is lower
    than ``start`` is skipped:

      0. erase the GATK work directory (``<workdir>/gatk/<sample>``)
      1. Picard CleanSam (only when ``CleanSam`` is true)
      2. Picard AddOrReplaceReadGroups
      3. Picard MarkDuplicates
      4. GATK SplitNCigarReads
      5. GATK RealignerTargetCreator (only when ``IndelRealignment``)
      6. GATK IndelRealigner (only when ``IndelRealignment``)
      7. GATK BaseRecalibrator (unless ``no_BaseRecalibrator``)
      8. GATK PrintReads (unless ``no_BaseRecalibrator``)
      9. GATK HaplotypeCaller
     10. GATK VariantFiltration
     11. copy ``variants_filtered.vcf`` to ``<outdir>/gatk/<sample>``

    Each ``*_opts`` string is passed to the matching tool; defaults are
    appended for options the caller did not set.

    Returns:
        Path of the copied ``variants_filtered.vcf``, or ``""`` when it is
        missing from the output directory.

    Raises:
        Exception: if ``alignment``, ``ref_genome`` or a given ``knownsites``
            file does not exist.
    """
    logger.info("Running variant calling (GATK) for %s" % sample)
    if not os.path.exists(alignment):
        logger.error("Aborting!")
        raise Exception("No alignment file %s" % alignment)
    if not os.path.exists(ref_genome):
        logger.error("Aborting!")
        raise Exception("No reference genome FASTA file %s" % ref_genome)
    # Validate before step 0 so a bad path cannot cost the previous results.
    if knownsites and not os.path.exists(knownsites):
        logger.error("Aborting!")
        raise Exception("No VCF knownsites file %s" % knownsites)

    work_gatk = os.path.join(workdir, "gatk", sample)
    create_dirs([work_gatk])
    banner = "--------------------------STEP %s--------------------------"

    step = 0
    if start <= step:
        logger.info(banner % step)
        msg = "Erase GATK work directory for %s" % sample
        command = "bash -c \"%s\"" % ("rm -rf %s/*" % work_gatk)
        # Best effort: a failed clean-up must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    def with_default(opts, marker, default):
        """Append `default` to `opts` unless `marker` already occurs in it."""
        if marker in opts:
            return opts
        return opts + default

    for marker, default in (("SO=", " SO=coordinate"),
                            ("RGLB=", " RGLB=lib1"),
                            ("RGPL=", " RGPL=illumina"),
                            ("RGPU=", " RGPU=unit1"),
                            ("RGSM=", " RGSM=%s" % sample)):
        AddOrReplaceReadGroups_opts = with_default(
            AddOrReplaceReadGroups_opts, marker, default)

    for marker, default in (("CREATE_INDEX=", " CREATE_INDEX=true"),
                            ("VALIDATION_STRINGENCY=",
                             " VALIDATION_STRINGENCY=SILENT")):
        MarkDuplicates_opts = with_default(MarkDuplicates_opts, marker, default)

    for marker, default in (("-rf ", " -rf %s" % GATK_SN_RF),
                            ("-RMQF ", " -RMQF %d" % GATK_SN_RMQF),
                            ("-RMQT ", " -RMQT %d" % GATK_SN_RMQT),
                            ("-U ", " -U ALLOW_N_CIGAR_READS")):
        SplitNCigarReads_opts = with_default(
            SplitNCigarReads_opts, marker, default)

    if knownsites:
        RealignerTargetCreator_opts = with_default(
            RealignerTargetCreator_opts, "--known ", " --known %s" % knownsites)
        if "-known " not in IndelRealigner_opts and \
                "--knownAlleles " not in IndelRealigner_opts:
            IndelRealigner_opts += " -known %s" % knownsites
        BaseRecalibrator_opts = with_default(
            BaseRecalibrator_opts, "-knownSites ", " -knownSites %s" % knownsites)

    # This is a value-less flag, so it may be the last token of the user's
    # options; pad with a space so that case is detected and not duplicated.
    if "-dontUseSoftClippedBases " not in HaplotypeCaller_opts + " ":
        HaplotypeCaller_opts += " -dontUseSoftClippedBases"
    for marker, default in (
            ("-stand_call_conf ", " -stand_call_conf %f" % GATK_HC_STANDCALLCONF),
            ("-stand_emit_conf ", " -stand_emit_conf %f" % GATK_HC_STANDEMITCONF)):
        HaplotypeCaller_opts = with_default(HaplotypeCaller_opts, marker, default)

    for marker, default in (
            ("-window ", " -window %d" % GATK_VF_WINDOW),
            ("-cluster ", " -cluster %d" % GATK_VF_CLUSTER),
            ("-filterName FS ", " -filterName FS -filter 'FS > %f'" % GATK_VF_FSMIN),
            ("-filterName QD ", " -filterName QD -filter 'QD < %f'" % GATK_VF_QDMAX)):
        VariantFiltration_opts = with_default(
            VariantFiltration_opts, marker, default)

    if nthreads > 1:
        BaseRecalibrator_opts = with_default(
            BaseRecalibrator_opts, "-nct ", " -nct %d" % nthreads)
        PrintReads_opts = with_default(
            PrintReads_opts, "-nct ", " -nct %d" % nthreads)

    for marker, default in (
            ("-Xms", " %s" % JAVA_XMS),
            ("-Xmx", " %s" % JAVA_XMG),
            ("-Djava.io.tmpdir", " -Djava.io.tmpdir=%s/javatmp/" % work_gatk)):
        java_opts = with_default(java_opts, marker, default)

    picard_prefix = "%s %s -cp %s picard.cmdline.PicardCommandLine" % (
        java, java_opts, picard)
    gatk_prefix = "%s %s -jar %s" % (java, java_opts, gatk)

    gatk_log = os.path.join(work_gatk, "gatk.log")
    # The context manager guarantees the log is closed even when a step raises.
    with open(gatk_log, "w") as gatk_log_fd:

        def run_logged(command, msg, shell=True):
            """Run one external command, sending its output to gatk.log."""
            if shell:
                command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            return cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log,
                           msg=msg, timeout=timeout)

        msg = "picard CleanSam for %s" % sample
        clean_bam = "%s/alignments_clean.bam" % work_gatk
        if start <= step:
            logger.info(banner % step)
            if CleanSam:
                run_logged("%s CleanSam I=%s O=%s/alignments_clean.bam" % (
                    picard_prefix, alignment, work_gatk), msg)
                alignment = clean_bam
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
            # When resuming, keep using the cleaned BAM of an earlier run.
            if CleanSam and os.path.exists(clean_bam):
                alignment = clean_bam
        step += 1

        msg = "picard AddOrReplaceReadGroups for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged(
                "%s AddOrReplaceReadGroups I=%s O=%s/rg_added_sorted.bam %s" % (
                    picard_prefix, alignment, work_gatk,
                    AddOrReplaceReadGroups_opts), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "picard MarkDuplicates for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged(
                "%s MarkDuplicates I=%s/rg_added_sorted.bam O=%s/dedupped.bam %s M=%s/output.metrics" % (
                    picard_prefix, work_gatk, work_gatk, MarkDuplicates_opts,
                    work_gatk), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "GATK SplitNCigarReads for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged(
                "%s -T SplitNCigarReads -R %s -I %s/dedupped.bam -o %s/split.bam %s" % (
                    gatk_prefix, ref_genome, work_gatk, work_gatk,
                    SplitNCigarReads_opts), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        # BAM handed to the next stage; replaced as optional stages produce
        # their own output.
        split_bam = "%s/split.bam" % work_gatk

        if IndelRealignment:
            msg = "GATK RealignerTargetCreator for %s" % sample
            if start <= step:
                logger.info(banner % step)
                run_logged(
                    "%s -T RealignerTargetCreator -R %s -I %s/split.bam -o %s/forIndelRealigner.intervals %s" % (
                        gatk_prefix, ref_genome, work_gatk, work_gatk,
                        RealignerTargetCreator_opts), msg)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "GATK IndelRealigner for %s" % sample
            if start <= step:
                logger.info(banner % step)
                run_logged(
                    "%s -T IndelRealigner -R %s -I %s/split.bam -targetIntervals %s/forIndelRealigner.intervals -o %s/split_realigned.bam %s" % (
                        gatk_prefix, ref_genome, work_gatk, work_gatk,
                        work_gatk, IndelRealigner_opts), msg)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1
            split_bam = "%s/split_realigned.bam" % work_gatk
        else:
            # Keep the step numbers aligned even though both stages are off.
            for stage in ("GATK RealignerTargetCreator", "GATK IndelRealigner"):
                msg = "%s for %s" % (stage, sample)
                logger.info("Skipping step %d: %s" % (step, msg))
                step += 1

        if not no_BaseRecalibrator:
            msg = "GATK BaseRecalibrator for %s" % sample
            if start <= step:
                logger.info(banner % step)
                run_logged(
                    "%s -T BaseRecalibrator -R %s -I %s -o %s/recal_data.table %s" % (
                        gatk_prefix, ref_genome, split_bam, work_gatk,
                        BaseRecalibrator_opts), msg)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "GATK PrintReads for %s" % sample
            if start <= step:
                logger.info(banner % step)
                run_logged(
                    "%s -T PrintReads -R %s -I %s -BQSR %s/recal_data.table -o %s/bsqr.bam %s" % (
                        gatk_prefix, ref_genome, split_bam, work_gatk,
                        work_gatk, PrintReads_opts), msg)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1
            split_bam = "%s/bsqr.bam" % work_gatk
        else:
            # Keep the step numbers aligned even though both stages are off.
            for stage in ("GATK BaseRecalibrator", "GATK PrintReads"):
                msg = "%s for %s" % (stage, sample)
                logger.info("Skipping step %d: %s" % (step, msg))
                step += 1

        msg = "GATK HaplotypeCaller for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged(
                "%s -T HaplotypeCaller -R %s -I %s -o %s/variants.vcf %s" % (
                    gatk_prefix, ref_genome, split_bam, work_gatk,
                    HaplotypeCaller_opts), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "GATK VariantFiltration for %s" % sample
        if start <= step:
            logger.info(banner % step)
            run_logged(
                "%s -T VariantFiltration -R %s -V %s/variants.vcf -o %s/variants_filtered.vcf %s" % (
                    gatk_prefix, ref_genome, work_gatk, work_gatk,
                    VariantFiltration_opts), msg)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        out_gatk = os.path.join(outdir, "gatk", sample)
        create_dirs([out_gatk])
        msg = "Copy predictions to output directory for %s." % sample
        if start <= step:
            logger.info(banner % step)
            if os.path.exists("%s/variants_filtered.vcf" % work_gatk):
                run_logged(
                    "cp %s/variants_filtered.vcf %s/variants_filtered.vcf" % (
                        work_gatk, out_gatk), msg, shell=False)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    variants = ""
    if os.path.exists("%s/variants_filtered.vcf" % out_gatk):
        logger.info("GATK was successfull!")
        logger.info("Output variants: %s/variants_filtered.vcf" % out_gatk)
        variants = "%s/variants_filtered.vcf" % out_gatk
    else:
        logger.info("GATK failed!")
    return variants
def run_deseq2(quant_files="", alignments="",
               transcripts_gtfs="", ref_gtf="",
               featureCounts_opts="", featureCounts=FEATURECOUNTS,
               stringtie=STRINGTIE, stringtie_merge_opts="",
               mincount=DESeq2_MINCNT, alpha=DESeq2_ALPHA,
               R=R_CMD,
               start=0, samples=[], nthreads=1,
               workdir=None, outdir=None, timeout=TIMEOUT):
    """Run differential expression analysis (DESeq2) between samples.

    ``samples``, ``quant_files``, ``alignments`` and ``transcripts_gtfs`` are
    lists with one entry per sample; each entry is a comma-separated string
    with one item per replicate.

    One of three modes is selected from the inputs:

      * ``quant_files`` + ``ref_gtf``: build/reuse a tx2gene map, summarise
        the quantifications with tximport, then run DESeq2.
      * ``alignments`` + ``ref_gtf`` (no ``transcripts_gtfs``): count reads
        with featureCounts against ``ref_gtf``, then run DESeq2.
      * ``alignments`` + ``transcripts_gtfs``: merge the GTFs with
        ``stringtie --merge`` (guided by ``ref_gtf`` when given), count reads
        with featureCounts against the merged GTF, then run DESeq2.

    Step 0 erases the work directory, the last step copies the
    ``deseq2_res*.tab`` tables to ``<outdir>/deseq2/<samples>``; steps whose
    number is lower than ``start`` are skipped.

    Returns:
        List of the copied ``deseq2_res*.tab`` paths, or ``""`` when there is
        none in the output directory.

    Raises:
        Exception: if the inputs do not select a mode, if their counts do not
            match ``samples``, or if an input file does not exist.
    """
    def flatten(groups):
        """Concatenate the per-sample replicate lists into one flat list."""
        return [item for group in groups for item in group]

    def ordinal(number):
        """Return `number` with its English ordinal suffix (1st, 2nd, 11th)."""
        if 10 <= number % 100 <= 20:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
        return "%d%s" % (number, suffix)

    def check_replicates(groups, error_template):
        """Require each sample's entry to have one item per replicate."""
        for i, group in enumerate(groups):
            if len(group) != n_replicates[i]:
                logger.error("Aborting!")
                raise Exception(error_template % (
                    ordinal(i + 1), len(group), n_replicates[i]))

    def check_exist(groups, error_template):
        """Require every replicate file of every sample to exist."""
        for path in flatten(groups):
            if not os.path.exists(path):
                logger.error("Aborting!")
                raise Exception(error_template % path)

    # Real lists (not map objects) so len() and indexing work on any Python.
    samples = [entry.split(",") for entry in samples]
    samples_txt = "-".join(",".join(replicates) for replicates in samples)
    logger.info("Running differential analysis (DESeq2) for %s" % samples_txt)

    n_samples = len(samples)
    n_replicates = [len(replicates) for replicates in samples]
    use_quant = True
    use_refgtf = False
    if quant_files and ref_gtf:
        if len(quant_files) != n_samples:
            logger.error("Aborting!")
            raise Exception(
                "Number of input quantification files does not match the number of samples (%s != %s)"
                % (len(quant_files), n_samples))
        quant_files = [entry.split(",") for entry in quant_files]
        check_replicates(
            quant_files,
            "Number of input quantification replicate files does not match the number of replicates in %s sample (%s != %s)")
        check_exist(quant_files, "No qantification file %s")
    elif alignments and (transcripts_gtfs or ref_gtf):
        use_quant = False
        if len(alignments) != n_samples:
            logger.error("Aborting!")
            raise Exception(
                "Number of input alignment files does not match the number of samples (%s != %s)"
                % (len(alignments), n_samples))
        alignments = [entry.split(",") for entry in alignments]
        check_replicates(
            alignments,
            "Number of input alignment replicate files does not match the number of replicates in %s sample (%s != %s)")
        check_exist(alignments, "No aligment file %s")
        if transcripts_gtfs:
            transcripts_gtfs = [entry.split(",") for entry in transcripts_gtfs]
            check_replicates(
                transcripts_gtfs,
                "Number of input gtf files does not match the total number of replicates in %s sample (%s != %s)")
        elif ref_gtf:
            use_refgtf = True
        if ref_gtf:
            if not os.path.exists(ref_gtf):
                logger.error("Aborting!")
                raise Exception("No reference GTF file %s" % ref_gtf)
    else:
        logger.error("Aborting!")
        raise Exception(
            "Either (quantification files + ref_gtf) or (Alignment files + transcripts_gtfs or ref_gtf) is needed."
        )

    work_deseq2 = os.path.join(workdir, "deseq2", samples_txt)
    create_dirs([work_deseq2])
    banner = "--------------------------STEP %s--------------------------"

    step = 0
    if start <= step:
        logger.info(banner % step)
        msg = "Erase DESeq2 work directory for %s" % samples_txt
        command = "bash -c \"%s\"" % ("rm -rf %s/*" % work_deseq2)
        # Best effort: a failed clean-up must not abort the run.
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    # R fragments shared by the three modes.  Conditions are named
    # sample0, sample1, ... with one entry per replicate.
    r_conditions = "samples <- c(%s); condition <- factor(c(%s)); " % (
        ",".join("'sample%d'" % i for i in range(n_samples)),
        ",".join("rep('sample%d', %d)" % (i, count)
                 for i, count in enumerate(n_replicates)))
    # Filter low-count genes, fit the model and write one result table per
    # pair of conditions.
    r_fit_and_contrasts = (
        "dds <- dds[ rowSums(counts(dds)) >= %d, ]; "
        "dds <- DESeq(dds); "
        "for (i in seq_along(samples)){ "
        "for (j in seq_along(samples)){ "
        "if (i < j){"
        "sample1 <- samples[i]; "
        "sample2 <- samples[j]; "
        "res <- results(dds, contrast=c('condition',sample1,sample2), alpha=%f); "
        "(summary(res)); "
        "res_file= sprintf('%s/deseq2_res_%%s_vs_%%s.tab',sample1,sample2);"
        "write.table(res, file = res_file, "
        "quote = FALSE, sep='\\t'); "
        "} "
        "} "
        "}; "
    ) % (mincount, alpha, work_deseq2)
    # NOTE: the fragments below are concatenated, never %-formatted, so R's
    # %in% operator is written with single percent signs.
    r_dds_from_tximport = (
        "load('%s/txi.rda'); " % work_deseq2
        + r_conditions
        + "(colData <- data.frame(row.names=colnames(txi$count), condition));"
          "counts <- round(txi$counts); mode(counts) <- 'integer'; "
          "dds <- DESeqDataSetFromMatrix(countData=counts, colData=colData, design=~ condition);"
          "stopifnot(txi$countsFromAbundance %in% c('no','scaledTPM','lengthScaledTPM')); "
          "if (txi$countsFromAbundance %in% c('scaledTPM','lengthScaledTPM')) "
          "{ message('using just counts from tximport'); } else "
          "{ message('using counts and average transcript lengths from tximport'); "
          "lengths <- txi$length; dimnames(lengths) <- dimnames(dds);"
          "assays(dds)[['avgTxLength']] <- lengths; }; "
    )
    r_dds_from_featurecounts = (
        "countData <- read.table('%s/featureCounts.txt', "
        "header=TRUE, row.names=1); countData <- countData[ ,6:ncol(countData)]; "
        "countData <- as.matrix(countData); " % work_deseq2
        + r_conditions
        + "(colData <- data.frame(row.names=colnames(countData), condition));"
          "dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~ condition);"
    )

    def deseq2_command(r_build_dds, saved_objects):
        """Assemble the full `R -e` DESeq2 command line."""
        return ("%s -e \"library('DESeq2'); " % R
                + r_build_dds
                + r_fit_and_contrasts
                + "save(%s, file='%s/deseq2.rda');\"" % (
                    saved_objects, work_deseq2))

    deseq2_log = os.path.join(work_deseq2, "deseq2.log")
    # The context manager guarantees the log is closed even when a step raises.
    with open(deseq2_log, "w") as deseq2_log_fd:

        def run_logged(command, msg, shell=True):
            """Run one external command, sending its output to deseq2.log."""
            if shell:
                command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            return cmd.run(cmd_log_fd_out=deseq2_log_fd, cmd_log=deseq2_log,
                           msg=msg, timeout=timeout)

        def count_features(annotation_gtf, msg):
            """Run featureCounts, then rename the count columns to samples."""
            run_logged(
                "%s %s -o %s/featureCounts.txt -T %d -a %s -g gene_id %s" % (
                    featureCounts, featureCounts_opts, work_deseq2, nthreads,
                    annotation_gtf, " ".join(flatten(alignments))), msg)
            run_logged(
                "sed -i -e '2s/.*/Geneid\\tChr\\tStart\\tEnd\\tStrand\\tLength\\t%s/' %s/featureCounts.txt" % (
                    "\\t".join(flatten(samples)), work_deseq2), msg)

        if use_quant:
            # Resolve the tx2gene path up front so that resuming at the next
            # step still knows which file to read.
            tx2gene_file = ref_gtf.strip() + "tx2gene.csv"
            has_precomputed = os.path.exists(tx2gene_file)
            if not has_precomputed:
                tx2gene_file = os.path.join(work_deseq2, "tx2gene.csv")

            msg = "prepare tx2gene for %s." % samples_txt
            if start <= step:
                logger.info(banner % step)
                if has_precomputed:
                    logger.info(
                        "Will use the precomputed %s as tx2gene.csv for %s" %
                        (tx2gene_file, samples_txt))
                else:
                    logger.info("Will computed %s as tx2gene.csv for %s" %
                                (tx2gene_file, samples_txt))
                    tx2gene_map(ref_gtf, tx2gene_file)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "compute gene level abundances for %s." % samples_txt
            if start <= step:
                logger.info(banner % step)
                fixed_quant_files = []
                for i, replicate_files in enumerate(quant_files):
                    fixed_replicates = []
                    for j, quant_file in enumerate(replicate_files):
                        fixed_file = os.path.join(
                            work_deseq2,
                            "{}.fixed_quant.sf".format(samples[i][j]))
                        fix_quant_file(quant_file, fixed_file)
                        fixed_replicates.append(fixed_file)
                    fixed_quant_files.append(fixed_replicates)

                command = (
                    "%s -e \"library('readr'); library('tximport'); "
                    "samples=c(%s); (files <- file.path(c(%s))); names(files) <- samples; "
                    "tx2gene <- read.csv(file.path('%s'),sep='\\t'); "
                    "txi <- tximport(files, type = 'salmon', tx2gene = tx2gene); "
                    "save(txi, file='%s/txi.rda'); "
                    "write.table(txi$abundance, file = '%s/txi.abundances',"
                    "quote = FALSE, sep='\\t'); "
                    "write.table(txi$length, file = '%s/txi.length', quote = FALSE, "
                    "sep='\\t'); write.table(txi$counts, file = '%s/txi.counts',"
                    "quote = FALSE, sep='\\t');\""
                ) % (R,
                     ",".join("'%s'" % name for name in flatten(samples)),
                     ",".join("'%s'" % path
                              for path in flatten(fixed_quant_files)),
                     tx2gene_file,
                     work_deseq2, work_deseq2, work_deseq2, work_deseq2)
                run_logged(command, msg, shell=False)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "DESeq2 for %s" % samples_txt
            if start <= step:
                logger.info(banner % step)
                run_logged(
                    deseq2_command(r_dds_from_tximport,
                                   "txi,colData,condition,dds,res"),
                    msg, shell=False)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1
        elif use_refgtf:
            msg = "featureCounts for %s" % samples_txt
            if start <= step:
                logger.info(banner % step)
                count_features(ref_gtf, msg)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "DESeq2 for %s" % samples_txt
            if start <= step:
                logger.info(banner % step)
                run_logged(
                    deseq2_command(r_dds_from_featurecounts,
                                   "countData,colData,condition,dds,res"),
                    msg, shell=False)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1
        else:
            msg = "Merge transcripts GTFs for %s" % samples_txt
            if start <= step:
                logger.info(banner % step)
                if ref_gtf:
                    stringtie_merge_opts += " -G %s" % ref_gtf
                if "-p " not in stringtie_merge_opts:
                    stringtie_merge_opts += " -p %d" % nthreads
                with open("%s/gtfs_list.txt" % work_deseq2, 'w') as gtfs_list:
                    gtfs_list.write("\n".join(flatten(transcripts_gtfs)))
                run_logged(
                    "%s --merge %s -o %s/merged.gtf -v %s/gtfs_list.txt" % (
                        stringtie, stringtie_merge_opts, work_deseq2,
                        work_deseq2), msg)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "featureCounts for %s" % samples_txt
            if start <= step:
                logger.info(banner % step)
                count_features("%s/merged.gtf" % work_deseq2, msg)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "DESeq2 for %s" % samples_txt
            if start <= step:
                logger.info(banner % step)
                # Unlike the other two modes, `res` is not saved here.
                run_logged(
                    deseq2_command(r_dds_from_featurecounts,
                                   "countData,colData,condition,dds"),
                    msg, shell=False)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

        out_deseq2 = os.path.join(outdir, "deseq2", samples_txt)
        create_dirs([out_deseq2])
        msg = "Copy predictions to output directory for %s." % samples_txt
        if start <= step:
            logger.info(banner % step)
            for out_file in glob.glob("%s/deseq2_res*.tab" % work_deseq2):
                run_logged("cp %s %s/" % (out_file, out_deseq2), msg,
                           shell=False)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    diff = ""
    result_tables = glob.glob("%s/deseq2_res*.tab" % out_deseq2)
    if len(result_tables) > 0:
        logger.info("DESeq2 was successfull!")
        logger.info("Output differential expressions: %s" % (result_tables,))
        diff = result_tables
    else:
        logger.info("DESeq2 failed!")
    return diff
def _salmon_smem_reads_argument(seq_1, seq_2, seq_u, unzip):
    """Build the read-input part of the ``salmon quant`` command line.

    Each of ``seq_1``, ``seq_2`` and ``seq_u`` is a comma-separated string of
    file paths. Paired-end input (both ``seq_1`` and ``seq_2`` non-empty)
    takes precedence over unpaired input (``seq_u``).

    Returns the ``-1 ... -2 ...`` or ``-r ...`` fragment, or ``None`` when no
    reads were supplied.
    """

    def _stream(files, reader):
        # Several files (or gzipped files) are fed to salmon through a single
        # bash process substitution.
        return "<(%s %s)" % (reader, " ".join(files.split(",")))

    if seq_1 and seq_2:
        if unzip:
            return "-1 %s -2 %s" % (_stream(seq_1, "gunzip -c"),
                                    _stream(seq_2, "gunzip -c"))
        mate_1 = _stream(seq_1, "cat") if "," in seq_1 else seq_1
        mate_2 = _stream(seq_2, "cat") if "," in seq_2 else seq_2
        return "-1 %s -2 %s" % (mate_1, mate_2)

    if seq_u:
        if unzip:
            return "-r %s" % _stream(seq_u, "gunzip -c")
        if "," in seq_u:
            # The original referenced an undefined name (seq_u1) here.
            return "-r %s" % _stream(seq_u, "cat")
        return "-r %s" % seq_u

    return None


def run_salmon_smem(quantifier_idx=None, seq_1="", seq_2="", seq_u="",
                    salmon_k=SALMON_SMEM_k, libtype="", salmon_smem_opts="",
                    salmon=SALMON, start=0, sample="", nthreads=1,
                    unzip=False, workdir=None, outdir=None, timeout=TIMEOUT):
    """Run ``salmon quant`` for one sample and copy ``quant.sf`` to the output.

    Steps (a step runs only when ``start`` is <= its index):
      0. erase the contents of ``<workdir>/salmon_smem/<sample>``
      1. run ``<salmon> quant`` on the given reads
      2. copy ``quant.sf`` to ``<outdir>/salmon_smem/<sample>``

    Args:
        quantifier_idx: path of the Salmon index; must exist.
        seq_1, seq_2: comma-separated mate 1 / mate 2 read files.
        seq_u: comma-separated unpaired read files (used only when the
            paired inputs are not both given).
        salmon_k: value passed to salmon as ``-k``.
        libtype: value passed to salmon as ``-l``.
        salmon_smem_opts: extra salmon options; ``-p <nthreads>`` is appended
            unless the string already contains ``"-p "``.
        salmon: salmon executable.
        start: index of the first step to execute.
        sample: sample name, used for directory names and log messages.
        nthreads: thread count for the default ``-p`` option.
        unzip: when true, reads are streamed through ``gunzip -c``.
        workdir, outdir: base working / output directories.
        timeout: timeout handed to every external command.

    Returns:
        Path of the copied ``quant.sf``, or ``""`` if it does not exist.

    Raises:
        Exception: if the index or any read file is missing, or if the
            quantification step is requested without any input reads.
    """
    logger.info("Running quantification (Salmon-SMEM) for %s" % sample)
    if not os.path.exists(quantifier_idx):
        logger.error("Aborting!")
        raise Exception("No Salmon FMD index directory %s" % quantifier_idx)

    # Validate every input file before any command fragment is built.
    if seq_1 and seq_2:
        read_sets = [("Mate 1", seq_1), ("Mate 2", seq_2)]
    elif seq_u:
        read_sets = [("unpaired", seq_u)]
    else:
        read_sets = []
    for label, files in read_sets:
        for path in files.split(","):
            if not os.path.exists(path):
                logger.error("Aborting!")
                raise Exception("No %s sequence file %s" % (label, path))

    seq_argument = _salmon_smem_reads_argument(seq_1, seq_2, seq_u, unzip)
    # Only step 1 (quantification) consumes the reads. Fail before the work
    # directory is erased instead of crashing on an unbound name later, but
    # still allow a copy-only resume (start > 1) without reads.
    if seq_argument is None and start <= 1:
        logger.error("Aborting!")
        raise Exception("No input sequence files given for %s" % sample)

    work_salmon_smem = os.path.join(workdir, "salmon_smem", sample)
    create_dirs([work_salmon_smem])

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" % step)
        msg = "Erase Salmon-SMEM work directory for %s" % sample
        command = "rm -rf %s/*" % (work_salmon_smem)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        cmd.run(msg=msg, timeout=timeout)
    step += 1

    if "-p " not in salmon_smem_opts:
        salmon_smem_opts += " -p %d" % nthreads
    salmon_smem_opts += " -k %d" % salmon_k
    salmon_smem_opts += " -l %s" % libtype

    out_salmon_smem = os.path.join(outdir, "salmon_smem", sample)
    salmon_smem_log = os.path.join(work_salmon_smem, "salmon_smem.log")

    # The log is opened only after step 0 so the erase does not remove it;
    # the context manager guarantees the handle is closed on every exit path.
    with open(salmon_smem_log, "w") as salmon_smem_log_fd:
        msg = "Salmon-SMEM for %s" % sample
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" % step)
            command = "%s quant -i %s %s %s -o %s" % (
                salmon, quantifier_idx, salmon_smem_opts, seq_argument,
                work_salmon_smem)
            # bash is required for the <( ... ) process substitutions.
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            cmd.run(cmd_log_fd_out=salmon_smem_log_fd,
                    cmd_log=salmon_smem_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        create_dirs([out_salmon_smem])
        msg = "Copy predictions to output directory for %s." % sample
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" % step)
            if os.path.exists("%s/quant.sf" % work_salmon_smem):
                command = "cp %s/quant.sf %s/quant.sf" % (work_salmon_smem,
                                                          out_salmon_smem)
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                cmd.run(cmd_log_fd_out=salmon_smem_log_fd,
                        cmd_log=salmon_smem_log, msg=msg, timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    quant = ""
    if os.path.exists("%s/quant.sf" % out_salmon_smem):
        logger.info("Salmon-SMEM was successfull!")
        logger.info("Output expressions: %s/quant.sf" % out_salmon_smem)
        quant = "%s/quant.sf" % out_salmon_smem
    else:
        logger.info("Salmon-SMEM failed!")
    return quant