Example #1
def run_fusioncatcher(data_dir="",
                      input="",
                      start=0,
                      fusioncatcher=FUSIONCATCHER,
                      fusioncatcher_opts="",
                      sample="",
                      nthreads=1,
                      workdir=None,
                      outdir=None,
                      timeout=TIMEOUT):

    logger.info("Running RNA fusion detection (FusionCatcher) for %s" % sample)
    if not os.path.exists(data_dir):
        logger.error("Aborting!")
        raise Exception("No data directory %s" % data_dir)

    work_fusioncatcher = os.path.join(workdir, "fusioncatcher", sample)
    create_dirs([work_fusioncatcher])
    fusioncatcher_log = os.path.join(work_fusioncatcher, "fusioncatcher.log")
    fusioncatcher_log_fd = open(fusioncatcher_log, "w")

    if nthreads > 1:
        if "-p " not in fusioncatcher_opts:
            fusioncatcher_opts += " -p %d" % nthreads
    msg = "Run FusionCatcher for %s" % sample
    command = "%s -d %s -i %s --start %d -o %s" % (
        fusioncatcher, data_dir, input, start, work_fusioncatcher)
    command = "bash -c \"%s\"" % command
    cmd = TimedExternalCmd(command, logger, raise_exception=True)
    retcode = cmd.run(cmd_log_fd_out=fusioncatcher_log_fd,
                      cmd_log=fusioncatcher_log,
                      msg=msg,
                      timeout=timeout)

    out_fusioncatcher = os.path.join(outdir, "fusioncatcher", sample)
    create_dirs([out_fusioncatcher])
    msg = "Copy predictions to output directory for %s." % sample
    if os.path.exists("%s/final-list_candidate-fusion-genes.txt" %
                      work_fusioncatcher):
        command = "cp %s/final-list_candidate-fusion-genes.txt %s/final-list_candidate-fusion-genes.txt" % (
            work_fusioncatcher, out_fusioncatcher)
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=fusioncatcher_log_fd,
                          cmd_log=fusioncatcher_log,
                          msg=msg,
                          timeout=timeout)

    fusions = ""
    if os.path.exists("%s/final-list_candidate-fusion-genes.txt" %
                      out_fusioncatcher):
        logger.info("FusionCatcher was successfull!")
        logger.info(
            "Output fusions: %s/final-list_candidate-fusion-genes.txt" %
            out_fusioncatcher)
        fusions = "%s/final-list_candidate-fusion-genes.txt" % out_fusioncatcher
    else:
        logger.info("FusionCatcher failed!")
    return fusions
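All of these examples follow the same pattern: build a shell command string, wrap it in bash -c "...", and hand it to TimedExternalCmd together with a log file descriptor, a message, and a timeout. The real TimedExternalCmd API is project-specific; the following is only a minimal, hypothetical stand-in sketch (class name and run() signature mirror the usage above but are assumptions) built on the standard subprocess module, assuming Python 3.

import logging
import subprocess

class TimedCmdSketch(object):
    """Hypothetical stand-in for TimedExternalCmd: run a shell command with a timeout."""

    def __init__(self, command, logger, raise_exception=True):
        self.command = command
        self.logger = logger
        self.raise_exception = raise_exception
        self.did_timeout = False

    def run(self, cmd_log_fd_out=None, cmd_log=None, msg="", timeout=None):
        if msg:
            self.logger.info(msg)
        try:
            # stdout/stderr of the external tool go to the provided log file descriptor
            proc = subprocess.run(self.command, shell=True, stdout=cmd_log_fd_out,
                                  stderr=subprocess.STDOUT, timeout=timeout)
            retcode = proc.returncode
        except subprocess.TimeoutExpired:
            self.did_timeout = True
            retcode = -1
        if retcode != 0 and self.raise_exception and not self.did_timeout:
            raise Exception("Command failed: %s" % self.command)
        return retcode

# usage sketch
logger = logging.getLogger(__name__)
with open("example.log", "w") as log_fd:
    TimedCmdSketch("echo hello", logger).run(cmd_log_fd_out=log_fd, msg="say hello", timeout=10)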
Example #2
def run_rmarkdown(start=0, sample="", theme="",
                  workdir=None, outdir=None, timeout=TIMEOUT, nthreads=1, kobas=True):
    logger.info("Get rmarkdown report for %s"%sample)

    report_log = os.path.join(workdir, "report.log")
    report_log_fd = open(report_log, "w")

    step=0
    msg = "Get report for %s"%sample

    sma3s_summary = "%s/%s-sma3s-summary.tsv" %(os.path.join(outdir,"annotation"), sample)
    sma3s_table = "%s/%s-sma3s-table.tsv" %(os.path.join(outdir,"annotation"), sample)
    sma3s_go = "%s/%s-go.tsv" %(os.path.join(outdir,"annotation"), sample)
    sma3s_ko = "%s/%s-ko.tsv" %(os.path.join(outdir,"annotation"), sample)
    engine=""
    if os.path.exists(sma3s_go) and os.path.exists(sma3s_ko):
        engine="0"
    elif os.path.exists(sma3s_summary) and os.path.exists(sma3s_table):
        engine="1"
    else:
        logger.error("The annotation files do not exist!")
        os._exit(-1)

    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        
        if kobas:  #convert to str
            kobas="1"
        else:
            kobas="0"

        command='''\
        python /opt/Auxtools/rmd_creator.py -i %(in)s -w %(work)s \
        -t /opt/Auxtools/rmarkdown/template.Rmd \
        -s %(sam)s -o %(in)s/%(sam)s.Rmd -m %(theme)s -k %(kobas)s -e %(engine)s && \
        R -e 'rmarkdown::render(\\"%(in)s/%(sam)s.Rmd\\")' && \
        rm %(in)s/%(sam)s.Rmd \
        ''' % {
                    'in': outdir,
                    'work': workdir,
                    'sam': sample,
                    'theme': theme,
                    'kobas': kobas,
                    'engine': engine
            }
        logger.info(command)
        command="bash -c \"%s\""%command
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=report_log_fd, cmd_log=report_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1
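The fiddliest part of this example is the quoting: the doubled backslashes in the Python source leave a literal \" in the command string, so that after everything is wrapped in bash -c "...", R still receives a properly double-quoted path inside the single-quoted -e expression. A reduced illustration of the same formatting and wrapping (sample name and paths are made up):

# Reduced illustration of the quoting above (illustrative values, not the real pipeline).
sample, outdir = "S1", "/data/out"
command = "R -e 'rmarkdown::render(\\\"%(in)s/%(sam)s.Rmd\\\")'" % {"in": outdir, "sam": sample}
command = "bash -c \"%s\"" % command
print(command)
# bash -c "R -e 'rmarkdown::render(\"/data/out/S1.Rmd\")'"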
Example #3
def run_age_single(intervals_bed=None, region_list=[], contig_dict={}, reference=None, assembly=None, pad=AGE_PAD,
                   age=None, truncation_pad_read_age = AGE_TRUNCATION_PAD,
                   max_interval_len_truncation_age = AGE_MAX_INTERVAL_TRUNCATION,
                   dist_to_expected_bp = AGE_DIST_TO_BP, min_del_subalign_len = MIN_DEL_SUBALIGN_LENGTH, 
                   min_inv_subalign_len = MIN_INV_SUBALIGN_LENGTH, age_window = AGE_WINDOW_SIZE,
                   age_workdir=None, timeout=AGE_TIMEOUT, keep_temp=False, myid=0):
    thread_logger = logging.getLogger("%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly) if assembly else None
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))

    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1], region[3])
            matching_intervals = [interval for interval in intervals_bedtool if (
                interval.start == bedtools_interval.start and interval.end == bedtools_interval.end and interval.chrom == bedtools_interval.chrom)]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" % (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" % (str(matching_interval)))
            sc_locations = []
            try:
                sc_locations = map(int, json.loads(base64.b64decode(matching_interval.name.split(",")[0]))["SC_LOCATIONS"].split(","))
            except:
                pass

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2], region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error("Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(reference=region_object.chrom1, start=region_object.pos1 - pad,
                                                       end=region_object.pos2 + pad)
            region_name = "%s.%d.%d" % (region_object.chrom1, region_object.pos1, region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" % region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name, reference_sequence))


            

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" % (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembeled sequence %s of length %s" % (contig.raw_name, contig.sequence_len))
                
                tr_region=[]
                if region_object.length()>max_interval_len_truncation_age and contig.sv_type in ["INV","DEL","DUP"]:
                    # For large SVs, the middle of the sequence has no effect on genotyping, so we truncate the middle region of the reference to speed things up.
                    thread_logger.info("Truncate the reference sequence.")

                    truncate_start = pad + dist_to_expected_bp + truncation_pad_read_age + 1
                    truncate_end = len(reference_sequence) -  (pad + dist_to_expected_bp + truncation_pad_read_age)
                    reference_sequence_tr=reference_sequence[0:truncate_start-1]+reference_sequence[truncate_end:]
                    region_name_tr = "%s.%d.%d.tr_%d_%d" % (region_object.chrom1, region_object.pos1, region_object.pos2,truncate_start,truncate_end)
                    ref_name_tr = os.path.join(age_workdir, "%s.ref.fa" % region_name_tr)

                    thread_logger.info("Writing the truncated ref sequence for region %s, contig %s" % (region_name_tr, contig.raw_name))
                    with open(ref_name_tr, "w") as file_handle:
                        file_handle.write(">{}.ref\n{}".format(region_name_tr, reference_sequence_tr))
                        
                    ref_len = len(reference_sequence_tr)
                    ref_f_name = ref_name_tr
                    tr_region = [truncate_start,truncate_end-truncate_start+1]
                    
                else:
                    ref_len = region_object.length()
                    ref_f_name = ref_name
                    
                if contig.sequence_len * ref_len >= 100000000:
                    thread_logger.info("Skipping contig because AGE problem is large (contig_len = %d , ref_len= %d)"%(contig.sequence_len, ref_len))
                    continue

                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)
                fd_out = open(out, "w")
                fd_err = open(err, "w")

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(region_name, contig_sequence))

                age_cmd = "%s %s -both -go=-6 %s %s" % (
                    age, "-inv" if contig.sv_type == "INV" else "-tdup" if contig.sv_type == "DUP" else "-indel",
                    ref_f_name, asm_name)
                cmd_runner = TimedExternalCmd(age_cmd, thread_logger)
                retcode = cmd_runner.run(timeout=timeout, cmd_log_fd_out=fd_out, cmd_log_fd_err=fd_err)
                fd_out.close()
                fd_err.close()

                if retcode == 0:
                    age_record = AgeRecord(out,tr_region_1=tr_region)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_record.set_assembly_contig(contig_sequence)
                        age_records.append(age_record)
                    else:
                        thread_logger.error("Number of inputs != 2 in age output file %s. Skipping." % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)
                    if tr_region:
                        os.remove(ref_name_tr)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" % (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(set([age_record.contig.sv_type for age_record in unique_age_records]))
            if len(sv_types) != 1:
                thread_logger.error("Some problem. Mixed SV types for this interval %s" % (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(unique_age_records, sv_type=sv_type, 
                                                             pad=pad, dist_to_expected_bp=dist_to_expected_bp,
                                                             min_del_subalign_len=min_del_subalign_len,
                                                             min_inv_subalign_len=min_inv_subalign_len,
                                                             age_window=age_window, sc_locations=sc_locations)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [breakpoints[0][0], breakpoints[0][0] + 1, breakpoints[0][1], breakpoints[0][2]])
                elif len(breakpoints) == 2 and (sv_type in ["DEL","INV","DUP"]):
                    bedtools_fields += map(str, breakpoints + [breakpoints[1] - breakpoints[0]] + ["."])
                else:
                    bedtools_fields += map(str, [bedtools_fields[1], bedtools_fields[2], -1, "."])
                bedtools_fields[3] += ";AS"
                bedtools_fields.append(base64.b64encode(json.dumps(info_dict)))
                thread_logger.info("Writing out fields %s" % (str(bedtools_fields)))
                bedtools_intervals.append(pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception as e:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise e

    if assembly_fasta:
        assembly_fasta.close()
    reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir, "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed
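The reference-truncation logic above (removing the middle of a long reference so AGE only aligns around the expected breakpoints) is easy to get off by one. A small self-contained check of the same slicing, with toy values standing in for pad, dist_to_expected_bp and truncation_pad_read_age, shows which bases are kept and what tr_region records:

# Toy reproduction of the truncation slice used above (all values are made up).
reference_sequence = "".join(chr(ord("A") + i % 26) for i in range(60))  # 60-base dummy reference
pad, dist_to_expected_bp, truncation_pad_read_age = 5, 3, 2

truncate_start = pad + dist_to_expected_bp + truncation_pad_read_age + 1                        # 11 (1-based)
truncate_end = len(reference_sequence) - (pad + dist_to_expected_bp + truncation_pad_read_age)  # 50

# Keep the first truncate_start-1 bases and everything from truncate_end on (0-based slicing).
reference_sequence_tr = reference_sequence[0:truncate_start - 1] + reference_sequence[truncate_end:]
removed = len(reference_sequence) - len(reference_sequence_tr)

# tr_region records [1-based start of the removed block, number of bases removed].
tr_region = [truncate_start, truncate_end - truncate_start + 1]
assert removed == tr_region[1]                # 40 bases removed from the middle
print(tr_region, len(reference_sequence_tr))  # [11, 40] 20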
Example #4
def run_spades_single(intervals=[],
                      bams=[],
                      spades=None,
                      spades_options="",
                      work=None,
                      pad=SPADES_PAD,
                      timeout=SPADES_TIMEOUT,
                      isize_min=ISIZE_MIN,
                      isize_max=ISIZE_MAX,
                      stop_on_fail=False,
                      max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    thread_logger = logging.getLogger(
        "%s-%s" %
        (run_spades_single.__name__, multiprocessing.current_process()))

    if not os.path.isdir(work):
        thread_logger.info("Creating %s" % work)
        os.makedirs(work)

    merged_contigs = open(os.path.join(work, "merged.fa"), "w")
    spades_log_fd = open(os.path.join(work, "spades.log"), "w")

    extract_fns = [extract_pairs.all_pair_hq, extract_pairs.non_perfect_hq]

    try:
        bam_handles = [pysam.Samfile(bam, "rb") for bam in bams]

        for interval in intervals:
            region = "%s:%d-%d" % (str(
                interval.chrom), interval.start, interval.end)
            thread_logger.info("Processing interval %s" %
                               (str(interval).strip()))

            sv_type = interval.name.split(",")[1]

            extraction_counts = extract_pairs.extract_read_pairs(
                bam_handles,
                region,
                "%s/" % work,
                extract_fns,
                pad=pad,
                max_read_pairs=max_read_pairs,
                sv_type=sv_type)
            all_pair_count = extraction_counts[0][1]

            for fn_id, ((end1, end2),
                        extracted_count) in enumerate(extraction_counts):
                extract_fn_name = extract_fns[fn_id].__name__

                if fn_id > 0 and extracted_count == all_pair_count:
                    thread_logger.info(
                        "Skipping assembly from %s since read count same as all_pairs"
                        % extract_fn_name)
                    continue

                if extracted_count >= 5:
                    extra_opt = "--sc" if not fn_id == 0 else ""
                    spades_log_fd.write(
                        "Running spades for interval %s with extraction function %s\n"
                        % (str(interval).strip(), extract_fn_name))
                    cmd = TimedExternalCmd(
                        "%s -1 %s -2 %s -o %s/spades_%s/ -m 4 -t 1 --phred-offset 33 %s %s"
                        % (spades, end1, end2, work, extract_fn_name,
                           extra_opt, spades_options), thread_logger)
                    retcode = cmd.run(cmd_log_fd_out=spades_log_fd,
                                      timeout=timeout)
                    if retcode == 0:
                        append_contigs(
                            os.path.join(work, "spades_%s/contigs.fasta") %
                            extract_fn_name, interval, merged_contigs, fn_id,
                            sv_type)
                    elif not cmd.did_timeout:
                        thread_logger.error("Spades failed")
                        if stop_on_fail:
                            thread_logger.error("Aborting!")
                            raise Exception(
                                "Spades failure on interval %s for extraction function %s\n"
                                % (str(interval).strip(), extract_fn_name))
                else:
                    thread_logger.info(
                        "Too few read pairs (%d) extracted. Skipping assembly."
                        % extracted_count)

        for bam_handle in bam_handles:
            bam_handle.close()

    except Exception as e:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise e

    merged_contigs.close()

    return os.path.abspath(merged_contigs.name)
Example #5
def run_spades_single(intervals=[], bams=[], spades=None, spades_options="", work=None, pad=SPADES_PAD, timeout=SPADES_TIMEOUT,
                      isize_min=ISIZE_MIN,
                      isize_max=ISIZE_MAX, stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    thread_logger = logging.getLogger("%s-%s" % (run_spades_single.__name__, multiprocessing.current_process()))

    if not os.path.isdir(work):
        thread_logger.info("Creating %s" % work)
        os.makedirs(work)

    merged_contigs = open(os.path.join(work, "merged.fa"), "w")
    spades_log_fd = open(os.path.join(work, "spades.log"), "w")

    extract_fns = [extract_pairs.all_pair_hq, extract_pairs.non_perfect_hq]

    try:
        bam_handles = [pysam.Samfile(bam, "rb") for bam in bams]

        for interval in intervals:
            region = "%s:%d-%d" % (str(interval.chrom), interval.start, interval.end)
            thread_logger.info("Processing interval %s" % (str(interval).strip()))

            sv_type = interval.name.split(",")[1]

            extraction_counts = extract_pairs.extract_read_pairs(bam_handles, region, "%s/" % work, extract_fns, pad=pad,
                                                                 max_read_pairs=max_read_pairs, sv_type=sv_type)
            all_pair_count = extraction_counts[0][1]

            for fn_id, ((end1, end2), extracted_count) in enumerate(extraction_counts):
                extract_fn_name = extract_fns[fn_id].__name__

                if fn_id > 0 and extracted_count == all_pair_count:
                    thread_logger.info("Skipping assembly from %s since read count same as all_pairs" % extract_fn_name)
                    continue

                if extracted_count >= 5:
                    extra_opt = "--sc" if not fn_id == 0 else ""
                    spades_log_fd.write("Running spades for interval %s with extraction function %s\n" % (
                        str(interval).strip(), extract_fn_name))
                    cmd = TimedExternalCmd("%s -1 %s -2 %s -o %s/spades_%s/ -m 4 -t 1 --phred-offset 33 %s %s" % (
                        spades, end1, end2, work, extract_fn_name, extra_opt, spades_options), thread_logger)
                    retcode = cmd.run(cmd_log_fd_out=spades_log_fd, timeout=timeout)
                    if retcode == 0:
                        append_contigs(os.path.join(work, "spades_%s/contigs.fasta") % extract_fn_name, interval,
                                       merged_contigs, fn_id, sv_type)
                    elif not cmd.did_timeout:
                        thread_logger.error("Spades failed")
                        if stop_on_fail:
                            thread_logger.error("Aborting!")
                            raise Exception("Spades failure on interval %s for extraction function %s\n" % (
                            str(interval).strip(), extract_fn_name))
                else:
                    thread_logger.info("Too few read pairs (%d) extracted. Skipping assembly." % extracted_count)

        for bam_handle in bam_handles:
            bam_handle.close()

    except Exception as e:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise e

    merged_contigs.close()

    return os.path.abspath(merged_contigs.name)
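Both spades examples iterate over extraction_counts with a nested unpacking pattern, for fn_id, ((end1, end2), extracted_count) in enumerate(...). The structure assumed here is [((fastq1, fastq2), pair_count), ...], one entry per extraction function; a toy list of that (assumed) shape makes the loop easy to follow:

# Toy data in the shape the loop above assumes: one entry per extraction function,
# each pairing the two FASTQ paths with the number of read pairs extracted.
extraction_counts = [
    (("work/all_pair_hq_1.fq", "work/all_pair_hq_2.fq"), 120),
    (("work/non_perfect_hq_1.fq", "work/non_perfect_hq_2.fq"), 37),
]
all_pair_count = extraction_counts[0][1]

for fn_id, ((end1, end2), extracted_count) in enumerate(extraction_counts):
    if fn_id > 0 and extracted_count == all_pair_count:
        continue  # same reads as the all-pairs extraction, nothing new to assemble
    if extracted_count >= 5:
        print("would assemble", end1, end2, "with", extracted_count, "pairs")
    else:
        print("too few pairs, skipping", end1, end2)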
Example #6
def run_age_single(intervals_bed=None,
                   region_list=[],
                   contig_dict={},
                   reference=None,
                   assembly=None,
                   pad=AGE_PAD,
                   age=None,
                   truncation_pad_read_age=AGE_TRUNCATION_PAD,
                   max_interval_len_truncation_age=AGE_MAX_INTERVAL_TRUNCATION,
                   dist_to_expected_bp=AGE_DIST_TO_BP,
                   min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH,
                   min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH,
                   age_window=AGE_WINDOW_SIZE,
                   age_workdir=None,
                   timeout=AGE_TIMEOUT,
                   keep_temp=False,
                   myid=0):
    thread_logger = logging.getLogger(
        "%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly) if assembly else None
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))

    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1],
                                                    region[3])
            matching_intervals = [
                interval for interval in intervals_bedtool
                if (interval.start == bedtools_interval.start
                    and interval.end == bedtools_interval.end
                    and interval.chrom == bedtools_interval.chrom)
            ]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" %
                                   (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" %
                               (str(matching_interval)))
            sc_locations = []
            try:
                sc_locations = map(
                    int,
                    json.loads(
                        base64.b64decode(matching_interval.name.split(",")[0]))
                    ["SC_LOCATIONS"].split(","))
            except:
                pass

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2],
                                     region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error(
                    "Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(
                reference=region_object.chrom1,
                start=region_object.pos1 - pad,
                end=region_object.pos2 + pad)
            region_name = "%s.%d.%d" % (region_object.chrom1,
                                        region_object.pos1, region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" %
                               region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name,
                                                       reference_sequence))

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" %
                               (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembeled sequence %s of length %s" %
                    (contig.raw_name, contig.sequence_len))

                tr_region = []
                if region_object.length(
                ) > max_interval_len_truncation_age and contig.sv_type in [
                        "INV", "DEL", "DUP"
                ]:
                    # For large SVs, the middle of the sequence has no effect on genotyping, so we truncate the middle region of the reference to speed things up.
                    thread_logger.info("Truncate the reference sequence.")

                    truncate_start = pad + dist_to_expected_bp + truncation_pad_read_age + 1
                    truncate_end = len(reference_sequence) - (
                        pad + dist_to_expected_bp + truncation_pad_read_age)
                    reference_sequence_tr = reference_sequence[
                        0:truncate_start -
                        1] + reference_sequence[truncate_end:]
                    region_name_tr = "%s.%d.%d.tr_%d_%d" % (
                        region_object.chrom1, region_object.pos1,
                        region_object.pos2, truncate_start, truncate_end)
                    ref_name_tr = os.path.join(age_workdir,
                                               "%s.ref.fa" % region_name_tr)

                    thread_logger.info(
                        "Writing the truncated ref sequence for region %s, contig %s"
                        % (region_name_tr, contig.raw_name))
                    with open(ref_name_tr, "w") as file_handle:
                        file_handle.write(">{}.ref\n{}".format(
                            region_name_tr, reference_sequence_tr))

                    ref_len = len(reference_sequence_tr)
                    ref_f_name = ref_name_tr
                    tr_region = [
                        truncate_start, truncate_end - truncate_start + 1
                    ]

                else:
                    ref_len = region_object.length()
                    ref_f_name = ref_name

                if contig.sequence_len * ref_len >= 100000000:
                    thread_logger.info(
                        "Skipping contig because AGE problem is large (contig_len = %d , ref_len= %d)"
                        % (contig.sequence_len, ref_len))
                    continue

                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)
                fd_out = open(out, "w")
                fd_err = open(err, "w")

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(
                        region_name, contig_sequence))

                age_cmd = "%s %s -both -go=-6 %s %s" % (
                    age, "-inv" if contig.sv_type == "INV" else
                    "-tdup" if contig.sv_type == "DUP" else "-indel",
                    ref_f_name, asm_name)
                cmd_runner = TimedExternalCmd(age_cmd, thread_logger)
                retcode = cmd_runner.run(timeout=timeout,
                                         cmd_log_fd_out=fd_out,
                                         cmd_log_fd_err=fd_err)
                fd_out.close()
                fd_err.close()

                if retcode == 0:
                    age_record = AgeRecord(out, tr_region_1=tr_region)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_record.set_assembly_contig(contig_sequence)
                        age_records.append(age_record)
                    else:
                        thread_logger.error(
                            "Number of inputs != 2 in age output file %s. Skipping."
                            % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)
                    if tr_region:
                        os.remove(ref_name_tr)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" %
                               (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(
                set([
                    age_record.contig.sv_type
                    for age_record in unique_age_records
                ]))
            if len(sv_types) != 1:
                thread_logger.error(
                    "Some problem. Mixed SV types for this interval %s" %
                    (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(
                    unique_age_records,
                    sv_type=sv_type,
                    pad=pad,
                    dist_to_expected_bp=dist_to_expected_bp,
                    min_del_subalign_len=min_del_subalign_len,
                    min_inv_subalign_len=min_inv_subalign_len,
                    age_window=age_window,
                    sc_locations=sc_locations)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [
                        breakpoints[0][0], breakpoints[0][0] + 1,
                        breakpoints[0][1], breakpoints[0][2]
                    ])
                elif len(breakpoints) == 2 and (sv_type
                                                in ["DEL", "INV", "DUP"]):
                    bedtools_fields += map(
                        str, breakpoints + [breakpoints[1] - breakpoints[0]] +
                        ["."])
                else:
                    bedtools_fields += map(
                        str, [bedtools_fields[1], bedtools_fields[2], -1, "."])
                bedtools_fields[3] += ";AS"
                bedtools_fields.append(base64.b64encode(json.dumps(info_dict)))
                thread_logger.info("Writing out fields %s" %
                                   (str(bedtools_fields)))
                bedtools_intervals.append(
                    pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception as e:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise e

    if assembly_fasta:
        assembly_fasta.close()
    reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir,
                                       "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed
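In both run_age_single variants the BED name column doubles as a data channel: a JSON dict is base64-encoded into the name field when intervals are written out, and decoded again to recover SC_LOCATIONS. The snippets call b64encode/b64decode on plain strings (Python 2 semantics); a minimal round-trip of the same idea, with an explicit .encode() so it also runs under Python 3:

import base64
import json

info_dict = {"SC_LOCATIONS": "1054,1187"}                    # illustrative payload
encoded = base64.b64encode(json.dumps(info_dict).encode())   # .encode() needed under Python 3
decoded = json.loads(base64.b64decode(encoded))
sc_locations = list(map(int, decoded["SC_LOCATIONS"].split(",")))
print(sc_locations)                                          # [1054, 1187]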
Example #7
def run_oases(assembly_hash=DNV_HASH,
              seq_1="",
              seq_2="",
              seq_u="",
              seq_i="",
              file_format=DNV_FORMAT,
              read_type=DNV_READTYPE,
              oases=OASES,
              velvetg=VELVETG,
              velveth=VELVETH,
              oases_opts="",
              velvetg_opts="",
              velveth_opts="",
              start=0,
              sample="",
              nthreads=1,
              workdir=None,
              outdir=None,
              timeout=TIMEOUT):

    logger.info("Running de novo assembly (OASES) for %s" % sample)

    if seq_1 and seq_2:
        for s1 in seq_1.split(","):
            if not os.path.exists(s1):
                logger.error("Aborting!")
                raise Exception("No Mate 1 sequence file %s" % s1)
        for s2 in seq_2.split(","):
            if not os.path.exists(s2):
                logger.error("Aborting!")
                raise Exception("No Mate 2 sequence file %s" % s2)
        seq_argument = "-separate %s %s" % (seq_1, seq_2)
    elif seq_u:
        seq_argument = seq_u
        for su in seq_u.split(","):
            if not os.path.exists(su):
                logger.error("Aborting!")
                raise Exception("No unpaired sequence file %s" % su)

    elif seq_i:
        seq_argument = seq_i
        for sr in seq_i.split(","):
            if not os.path.exists(sr):
                logger.error("Aborting!")
                raise Exception("No sra sequence file %s" % sr)

    work_oases = os.path.join(workdir, "oases", sample)
    create_dirs([work_oases])

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase Oases work directory for %s" % sample
        command = "rm -rf %s/*" % (work_oases)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    oases_log = os.path.join(work_oases, "oases.log")
    oases_log_fd = open(oases_log, "w")

    seq_argument = "-%s -%s %s " % (file_format, read_type, seq_argument)

    msg = "velveth for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s %d  %s %s" % (velveth, work_oases, assmebly_hash,
                                       velveth_opts, seq_argument)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command,
                               logger,
                               raise_exception=True,
                               env_dict={"OMP_NUM_THREADS": str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=oases_log_fd,
                          cmd_log=oases_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "velvetg for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s %s -read_trkg yes " % (velvetg, work_oases,
                                                velvetg_opts)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=oases_log_fd,
                          cmd_log=oases_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "oases for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s %s " % (oases, work_oases, oases_opts)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=oases_log_fd,
                          cmd_log=oases_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_oases = os.path.join(outdir, "oases", sample)
    create_dirs([out_oases])
    msg = "Copy predictions to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/transcripts.fa" % work_oases):
            command = "cp %s/transcripts.fa %s/transcripts.fa" % (work_oases,
                                                                  out_oases)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=oases_log_fd,
                              cmd_log=oases_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    transcripts = ""
    if os.path.exists("%s/transcripts.fa" % out_oases):
        logger.info("Oases was successfull!")
        logger.info("Output transcripts: %s/transcripts.fa" % out_oases)
        transcripts = "%s/transcripts.fa" % out_oases
    else:
        logger.info("Oases failed!")
    return transcripts
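run_oases, like most functions on this page, uses a simple resume mechanism: a step counter is incremented after every stage, and each stage only runs when start <= step, so a failed run can be restarted at a later stage by passing a larger start value. Stripped of the external commands, the pattern looks like this (stage names are illustrative):

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def run_pipeline(start=0):
    step = 0
    for msg in ["velveth", "velvetg", "oases", "copy results"]:
        if start <= step:
            logger.info("--------------------------STEP %s--------------------------" % step)
            logger.info("Running: %s" % msg)   # the real code launches a TimedExternalCmd here
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

run_pipeline(start=2)  # skips velveth and velvetg, runs oases and the copy step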
Example #8
def run_afterqc(fqdir="",
                r1_flag="",
                r2_flag="",
                start=0,
                sample="",
                afterqc_opts="",
                workdir=None,
                outdir=None,
                timeout=TIMEOUT,
                nthreads=1):
    logger.info(
        "Automatic Filtering, Trimming, Error Removing and Quality Control for fastq data for %s"
        % sample)

    work_qc = os.path.join(workdir, "qc")
    create_dirs([work_qc])

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase QC work directory for %s" % sample
        command = "rm -rf %s/*" % (work_qc)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    qc_log = os.path.join(work_qc, "qc.log")
    qc_log_fd = open(qc_log, "w")

    #seq_argument="-%s -%s %s "%(file_format,read_type,seq_argument)

    msg = "QC for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s -d %s --read1_flag %s --read2_flag %s -g %s -b %s -r %s %s" % (
            "python", "/opt/AfterQC/after.py", fqdir, r1_flag, r2_flag,
            work_qc + "/good/", work_qc + "/bad/", work_qc + "/qc_report/",
            afterqc_opts)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command,
                               logger,
                               raise_exception=True,
                               env_dict={"OMP_NUM_THREADS": str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=qc_log_fd,
                          cmd_log=qc_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_qc = os.path.join(outdir, "qc")
    create_dirs([out_qc])
    msg = "Copy qc html report to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        qclist = filter(lambda x: x.endswith(("html")),
                        os.listdir("%s/qc_report/" % work_qc))
        if len(qclist) > 0:
            #command=" && ".join(map(lambda x: "cp %s/qc_report/%s %s/" % (work_qc, x, out_qc), qclist))
            command = "cp %s %s/" % (" ".join(
                map(lambda x: "%s/qc_report/%s" %
                    (work_qc, x), qclist)), out_qc)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=qc_log_fd,
                              cmd_log=qc_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1
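Note that the report-copying step assumes Python 2 semantics: filter() returns a list there, so len(qclist) works directly. Under Python 3, filter() returns an iterator and the same logic is usually written with a list comprehension; a small sketch (paths are illustrative):

import os

work_qc, out_qc = "work/qc", "out/qc"   # illustrative paths
report_dir = "%s/qc_report/" % work_qc
qclist = [x for x in os.listdir(report_dir) if x.endswith("html")] if os.path.isdir(report_dir) else []
if qclist:
    command = "cp %s %s/" % (" ".join("%s/qc_report/%s" % (work_qc, x) for x in qclist), out_qc)
    print(command)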
Example #9
def run_lordec(kmer=23,
               solid=3,
               long="",
               short="",
               lordec=LORDEC,
               lordec_opts="",
               start=0,
               sample="",
               nthreads=1,
               workdir=None,
               outdir=None,
               timeout=TIMEOUT):

    logger.info("Running long read error correction (LoRDEC) for %s" % sample)
    if not os.path.exists(long):
        logger.error("Aborting!")
        raise Exception("No long read sequence file %s" % long)

    if not os.path.exists(short):
        logger.error("Aborting!")
        raise Exception("No short read sequence file %s" % short)

    work_lordec = os.path.join(workdir, "lordec", sample)
    create_dirs([work_lordec])

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase LoRDEC work directory for %s" % sample
        command = "rm -rf %s/*" % (work_lordec)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    lordec_log = os.path.join(work_lordec, "lordec.log")
    lordec_log_fd = open(lordec_log, "w")
    ksps = ""

    if "-T " not in lordec_opts:
        lordec_opts += " -T %d" % nthreads

    msg = "LoRDEC for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s  -k %d -s %d -i %s -2 %s -O %s -o %s/long_corrected.fa" % (
            lordec, lordec_opts, kmer, solid, long, short, work_lordec,
            work_lordec)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=lordec_log_fd,
                          cmd_log=lordec_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_lordec = os.path.join(outdir, "lordec", sample)
    create_dirs([out_lordec])
    msg = "Copy predictions to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/long_corrected.fa" % work_lordec):
            command = "cp %s/long_corrected.fa %s/long_corrected.fa" % (
                work_lordec, out_lordec)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=lordec_log_fd,
                              cmd_log=lordec_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    corrected = ""
    if os.path.exists("%s/long_corrected.fa" % out_lordec):
        logger.info("LoRDEC was successfull!")
        logger.info("Output corrected reads: %s/long_corrected.fa" %
                    out_lordec)
        corrected = "%s/long_corrected.fa" % out_lordec
    else:
        logger.info("LoRDEC failed!")
    return corrected
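Example #1 and this example both inject a thread-count flag into the tool options only if the caller has not already supplied one ("-p " for FusionCatcher, "-T " for LoRDEC). A tiny illustrative helper capturing that pattern (not part of the original code):

def with_thread_option(opts, flag, nthreads):
    """Append e.g. ' -T 4' to opts unless the flag is already present (illustrative only)."""
    if nthreads > 1 and ("%s " % flag) not in opts:
        opts += " %s %d" % (flag, nthreads)
    return opts

print(with_thread_option("", "-T", 8))            # ' -T 8'
print(with_thread_option("-T 2 -k 19", "-T", 8))  # unchanged, caller already set -T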
Example #10
def annotate_ver2(sample="",
                  start=0,
                  nettype="0",
                  dcutt="0.5",
                  dcutnt="0.45",
                  orgtype="0",
                  minsglen="10",
                  trunc="70",
                  orgname="",
                  blastp_opts="",
                  evalue="",
                  msa="",
                  workdir=None,
                  outdir=None,
                  timeout=TIMEOUT,
                  nthreads=1):
    logger.info("Annotation for precursor proteins of polypeptides for %s" %
                sample)

    work_annot = os.path.join(workdir, "annotation")
    create_dirs([work_annot])
    work_msalign = os.path.join(workdir, "msalign")

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase annotation work directory for %s" % sample
        command = "rm -rf %s/*" % (work_annot)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    annot_log = os.path.join(work_annot, "annotation.log")
    annot_log_fd = open(annot_log, "w")

    msg = "Predicting the presence and location of signal peptide cleavage sites in amino acid sequences for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = (
            "casperjs /opt/Auxtools/spider/signalP.js --fasta=%(msdir)s/%(sam)s-sequence.fa "
            "--outfile=%(atdir)s/%(sam)s-signalP.txt --minlen=\"%(mlen)s\" "
            "--method=\"%(nettype)s\" --orgtype=\"%(orgtype)s\" "
            "--dcut=\"user\" --notm=%(dcutnt)s --tm=%(dcutt)s --trunc=%(trunc)s"
        ) % {
            'msdir': work_msalign,
            'atdir': work_annot,
            'sam': sample,
            'mlen': minsglen,
            'nettype': nettype,
            'orgtype': orgtype,
            'dcutt': dcutt,
            'dcutnt': dcutnt,
            'trunc': trunc
        }
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command,
                               logger,
                               raise_exception=True,
                               env_dict={"OMP_NUM_THREADS": str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=annot_log_fd,
                          cmd_log=annot_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Annotating precursor protein function for %s" % sample
    sma3sfa = filter(lambda x: x.endswith(("fa", "fasta")),
                     os.listdir(SMA3SDB_DIR))
    sma3sat = filter(lambda x: x.endswith(("annot")), os.listdir(SMA3SDB_DIR))
    sma3sfagz = filter(lambda x: x.endswith(("fa.gz", "fasta.gz")),
                       os.listdir(SMA3SDB_DIR))
    sma3satgz = filter(lambda x: x.endswith(("annot.gz")),
                       os.listdir(SMA3SDB_DIR))

    if (len(sma3sfa) == 1 and len(sma3sat) == 1):
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" %
                step)
            command = (
                "mkdir -p %(atdir)s/sma3s/ && "
                "ln -sf /data/%(ssdir)s/%(dbfa)s %(atdir)s/sma3s/%(dbfa)s && "
                "ln -sf /data/%(ssdir)s/%(dbat)s %(atdir)s/sma3s/%(dbat)s && "
                "cp %(msdir)s/%(sam)s-sequence.fa %(atdir)s/sma3s/ &&"
                "cd %(atdir)s/sma3s/ && "
                "perl /opt/Auxtools/sma3s_v2.pl -i %(sam)s-sequence.fa -d %(dbfa)s -go -goslim -p 0.00001 -num_threads %(nthreads)s && "
                "cd -") % {
                    'msdir': work_msalign,
                    'atdir': work_annot,
                    'ssdir': SMA3SDB_DIR,
                    'dbfa': sma3sfa[0],
                    'dbat': sma3sat[0],
                    'sam': sample,
                    'nthreads': nthreads
                }
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command,
                                   logger,
                                   raise_exception=True,
                                   env_dict={"OMP_NUM_THREADS": str(nthreads)})
            retcode = cmd.run(cmd_log_fd_out=annot_log_fd,
                              cmd_log=annot_log,
                              msg=msg,
                              timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))

    elif (len(sma3sfagz) == 1 and len(sma3satgz) == 1):
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" %
                step)
            command = (
                "mkdir -p %(atdir)s/sma3s/ && "
                "gzip -dc %(ssdir)s/%(dbatgz)s > %(atdir)s/sma3s/%(dbat)s && "
                "gzip -dc %(ssdir)s/%(dbfagz)s > %(atdir)s/sma3s/%(dbfa)s && "
                "cp %(msdir)s/%(sam)s-sequence.fa %(atdir)s/sma3s/ &&"
                "cd %(atdir)s/sma3s/ && "
                "perl /opt/Auxtools/sma3s_v2.pl -i %(sam)s-sequence.fa -d %(dbfa)s -go -goslim -p 0.00001 -num_threads %(nthreads)s && "
                "cd -") % {
                    'msdir': work_msalign,
                    'atdir': work_annot,
                    'ssdir': SMA3SDB_DIR,
                    'dbfagz': sma3sfagz[0],
                    'dbatgz': sma3satgz[0],
                    'dbfa': os.path.splitext(sma3sfagz[0])[0],
                    'dbat': os.path.splitext(sma3satgz[0])[0],
                    'sam': sample,
                    'nthreads': nthreads
                }
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command,
                                   logger,
                                   raise_exception=True,
                                   env_dict={"OMP_NUM_THREADS": str(nthreads)})
            retcode = cmd.run(cmd_log_fd_out=annot_log_fd,
                              cmd_log=annot_log,
                              msg=msg,
                              timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
    else:
        logger.warn(
            "The two Sma3s database files (uniref90.annot and uniref90.fasta) do not exist in the '%s' directory! Please obtain them from http://www.bioinfocabd.upo.es/sma3s/db/"
            % (SMA3SDB_DIR))
        logger.warn("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Multiple sequence alignment for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = (
            "python /opt/Auxtools/msa.py -i %(msdir)s/%(sam)s-sequence.fa "
            "-m %(msa)s -o %(atdir)s/%(sam)s-msa.html ") % {
                'msdir': work_msalign,
                'atdir': work_annot,
                'sam': sample,
                'msa': msa
            }
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command,
                               logger,
                               raise_exception=True,
                               env_dict={"OMP_NUM_THREADS": str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=annot_log_fd,
                          cmd_log=annot_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Venom annotation for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = (
            "python /opt/Auxtools/venomkb/venomkb_annot.py -i %(msdir)s/%(sam)s-sequence.fa "
            "-c /opt/Auxtools/venomkb/venomkb_proteins_06272017.json.gz -o %(atdir)s/%(sam)s "
        ) % {
            'msdir': work_msalign,
            'atdir': work_annot,
            'sam': sample
        }
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command,
                               logger,
                               raise_exception=True,
                               env_dict={"OMP_NUM_THREADS": str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=annot_log_fd,
                          cmd_log=annot_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Similar sequence blast for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        falist = filter(lambda x: x.endswith(("fa", "fasta")),
                        os.listdir(BLASTDB_DIR))
        if (len(falist) == 0):
            logger.warn(
                "There is no blast database file(*.fa or *.fasta) in the '%s' directory!"
                % (BLASTDB_DIR))
            logger.warn("Skipping step %d: %s" % (step, msg))
        if (len(falist) > 1):
            logger.warn(
                "Only one blast database file is allowed in the '%s' directory!"
                % (BLASTDB_DIR))
            logger.warn("Skipping step %d: %s" % (step, msg))

        dbext = filter(lambda x: x.endswith(("phr", "pin", "psq")),
                       os.listdir("./config/blastdb/"))
        if (len(dbext) < 3):  # check whether the db has already been built
            cmd_chip1 = "makeblastdb -dbtype prot  -in %(path)s/%(db)s  -out %(path)s/%(db)s && \
                " % {
                'path': BLASTDB_DIR,
                'db': falist[0]
            }
        else:
            cmd_chip1 = ""
        cmd_chip2 = (
            "blastp -db %(path)s/%(db)s -num_threads %(nthreads)s -query %(msdir)s/%(sam)s-sequence.fa "
            "-out %(atdir)s/%(sam)s.asn -outfmt 11 -evalue %(evalue)s %(opts)s && "
            "blast_formatter -archive %(atdir)s/%(sam)s.asn -outfmt 0 > %(atdir)s/%(sam)s-pairwise.txt && "
            "blast_formatter -archive %(atdir)s/%(sam)s.asn -outfmt '7 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle' > %(atdir)s/%(sam)s-tabular.txt && "
            "python /opt/Auxtools/BlasterJS/src/blast2html.py -i %(atdir)s/%(sam)s-pairwise.txt "
            "-o %(atdir)s/blast_html/") % {
                'path': BLASTDB_DIR,
                'db': falist[0],
                'msdir': work_msalign,
                'atdir': work_annot,
                'sam': sample,
                'orgname': orgname,
                'evalue': evalue,
                'opts': blastp_opts,
                'nthreads': nthreads
            }

        command = cmd_chip1 + cmd_chip2
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command,
                               logger,
                               raise_exception=True,
                               env_dict={"OMP_NUM_THREADS": str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=annot_log_fd,
                          cmd_log=annot_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_annot = os.path.join(outdir, "annotation")
    create_dirs([out_annot])
    msg = "Copy annotation result to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/%s-signalP.txt" % (work_annot, sample)):
            command = (
                "cp %(indir)s/%(sam)s-msa.html %(indir)s/%(sam)s-venom.tsv "
                "%(indir)s/%(sam)s-signalP.txt %(outdir)s/") % {
                    "indir": work_annot,
                    "sam": sample,
                    "outdir": out_annot
                }
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=annot_log_fd,
                              cmd_log=annot_log,
                              msg=msg,
                              timeout=timeout)

        if os.path.exists("%s/blast_html" % (work_annot)):
            copy_and_overwrite("%s/blast_html" % (work_annot),
                               "%s/blast_html/" % (out_annot))

        tmpin = "%s/%s-tabular.txt" % (work_annot, sample)
        tmpout = "%s/%s-tabular.txt" % (out_annot, sample)
        if os.path.exists(tmpin):
            copyfile(tmpin, tmpout)

        tsvlist = filter(lambda x: x.endswith(("tsv")),
                         os.listdir(os.path.join(work_annot, "sma3s")))
        logger.info(tsvlist)
        summary = filter(lambda x: x.endswith(("summary.tsv")), tsvlist)[0]
        tsvtab = filter(lambda x: not x.endswith(("summary.tsv")), tsvlist)[0]

        tmpin = "%s" % os.path.join(work_annot, "sma3s", summary)
        tmpout = "%s/%s-sma3s-summary.tsv" % (out_annot, sample)
        if os.path.exists(tmpin):
            copyfile(tmpin, tmpout)
        tmpin = "%s" % os.path.join(work_annot, "sma3s", tsvtab)
        tmpout = "%s/%s-sma3s-table.tsv" % (out_annot, sample)
        if os.path.exists(tmpin):
            copyfile(tmpin, tmpout)

    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    return os.EX_OK
Example #11
0
def run_comet(input="", longest=False, spectrum="",
              start=0, sample= "", nthreads=1,
              workdir=None, outdir=None, timeout=TIMEOUT):

    logger.info("Running mass spectra alignment (Comet) for %s"%sample)
    
    work_msalign=os.path.join(workdir,"msalign")
    create_dirs([work_msalign])

    step=0
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        msg = "Erase msalign work directory for %s"%sample
        command="rm -rf %s/*" % (
            work_msalign)
        command="bash -c \"%s\""%command        
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step+=1

    msalign_log = os.path.join(work_msalign, "msalign.log")
    msalign_log_fd = open(msalign_log, "w")

    # determine whether the FASTA is in nucleotide or amino acid format:
    # a pure nucleotide line contains at most 4 distinct characters (A/C/G/T)
    tmpfile = open(input)
    tmpline = tmpfile.readlines()[1]  # get the second line (first sequence line)
    tmpfile.close()
    is_na = True
    if len(set(tmpline.strip())) > 4:
        is_na = False

    msg = "Run PGA database creator for %s"%sample
    if start<=step and is_na:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command="Rscript /opt/Auxtools/run_dbcreator.R %s %s %s %s" % (
            input, longest, work_msalign, sample)
        command="bash -c \"%s\""%command      
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)   
    elif start<=step and not is_na:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command=("mkdir -p %(wk)s/database/ && cp %(db)s %(wk)s/database/%(sam)s.ntx.fasta") % {
                'db': input,
                'wk' : work_msalign,
                'sam' : sample
            }
        command="bash -c \"%s\""%command      
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    msg = "Run Comet for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command=("/opt/comet.2018012.linux.exe -Pconfig/par/comet.params -N%(dir)s/%(sam)s "
                "-D%(dir)s/database/%(sam)s.ntx.fasta  %(spectrum)s ") % {
                        'spectrum': spectrum,
                        'dir':work_msalign,
                        'sam':sample
                        }
        command="bash -c \"%s\""%command      
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    msg = "Tidy identification result and get precursor protein for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command=("Rscript /opt/Auxtools/comet_fdr.R %(dir)s/%(sam)s %(dir)s/database/%(sam)s.ntx.fasta ") % {
                'dir': work_msalign,
                'sam': sample
                }
        command="bash -c \"%s\""%command
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1


    out_msalign=os.path.join(outdir,"msalign")
    out_database=os.path.join(outdir,"database")
    create_dirs([out_msalign, out_database])
    msg="Copy novel sequence database and  MS identification result(s) to output directory for %s."%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        if os.path.exists("%s/%s-pepSummary.tsv"% (work_msalign, sample)):
            command = "cp %(dir)s/%(sam)s-pepSummary.tsv %(dir)s/%(sam)s-psmSummary.tsv %(dir)s/%(sam)s-sequence.fa %(out)s/"%{
                    'dir': work_msalign,
                    'sam': sample,
                    'out': out_msalign
                    }
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
        if os.path.exists("%s/database/%s.ntx.fasta"% (work_msalign, sample)):
            command = "cp %(dir)s/database/%(sam)s.ntx.fasta %(out)s/" % {
                'dir': work_msalign,
                'sam': sample,
                'out': out_database
            }
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
 
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1
 

    return os.EX_OK
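
# A minimal usage sketch for run_comet (editor's illustration, not part of the original
# source); the FASTA, spectrum path, sample name, and directories are hypothetical
# placeholders.
if __name__ == "__main__":
    run_comet(input="out/trinity/Trinity.fasta",
              longest=False,
              spectrum="data/sample_A.mzML",
              start=0,
              sample="sample_A",
              nthreads=4,
              workdir="work",
              outdir="out")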
Example #12
0
def run_msgfplus(input="", longest=False,
              spectrum="",instrument="3",enzyme="0",decoy="1", fragid="0",
              pretol="20ppm",minlen=6,maxlen=50,modfile="",ntt="0",
              start=0, sample= "", nthreads=1,
              msgfplus_opts="", max_mem="10G",
              workdir=None, outdir=None, timeout=TIMEOUT):

    logger.info("Running mass spectra alignment (MSGFPlus) for %s"%sample)
    
    work_msalign=os.path.join(workdir,"msalign")
    create_dirs([work_msalign])

    step=0
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        msg = "Erase msalign work directory for %s"%sample
        command="rm -rf %s/*" % (
            work_msalign)
        command="bash -c \"%s\""%command        
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step+=1

    msalign_log = os.path.join(work_msalign, "msalign.log")
    msalign_log_fd = open(msalign_log, "w")

    # determine whether the FASTA is in nucleotide or amino acid format:
    # a pure nucleotide line contains at most 4 distinct characters (A/C/G/T)
    tmpfile = open(input)
    tmpline = tmpfile.readlines()[1]  # get the second line (first sequence line)
    tmpfile.close()
    is_na = True
    if len(set(tmpline.strip())) > 4:
        is_na = False

    msg = "Run PGA database creator for %s"%sample
    if start<=step and is_na:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command="Rscript /opt/Auxtools/run_dbcreator.R %s %s %s %s" % (
            input, longest, work_msalign, sample)
        command="bash -c \"%s\""%command      
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)   
    elif start<=step and not is_na:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command=("mkdir -p %(wk)s/database/ && cp %(db)s %(wk)s/database/%(sam)s.ntx.fasta") % {
                'db': input,
                'wk' : work_msalign,
                'sam' : sample
            }
        command="bash -c \"%s\""%command      
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    msg = "Run MSGFPlus for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command=("java -jar /opt/MSGFPlus/MSGFPlus.jar  -s %(spectrum)s "
                "-o %(dir)s/%(sam)s.mzid -d %(dir)s/database/%(sam)s.ntx.fasta -m %(fragid)s "
                "-t %(pretol)s -inst %(inst)s -e %(eyz)s -ntt %(ntt)s -tda %(tda)s -minLength %(minl)s "
                "-maxLength %(maxl)s -thread %(th)s -mod %(modfile)s") % {
                        'spectrum': spectrum,
                        'dir':work_msalign,
                        'sam':sample,
                        "pretol": pretol,
                        "inst": instrument,
                        "eyz": enzyme,
                        "tda": decoy,
                        "ntt": ntt,
                        "minl": minlen,
                        "maxl": maxlen,
                        'modfile': modfile,
                        "fragid": fragid,
                        'th': nthreads
                        }
        command="bash -c \"%s\""%command      
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    msg = "Converts MS-GF+ output (.mzid) into the tsv format (.tsv) for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command=("java -cp /opt/MSGFPlus/MSGFPlus.jar  edu.ucsd.msjava.ui.MzIDToTsv "
                "-i %(dir)s/%(sam)s.mzid -o %(dir)s/%(sam)s-rawSummary.tsv "
                "-showQValue 1 -showDecoy 0 -unroll 1") % {
                    'dir':work_msalign,
                    'sam':sample,
                    }
        command="bash -c \"%s\""%command      
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    msg = "Tidy identification result and get precursor protein for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command=("python /opt/Auxtools/fasta_preparation.py -i %(dir)s/%(sam)s-rawSummary.tsv "
                "-d %(dir)s/database/%(sam)s.ntx.fasta -o %(dir)s/%(sam)s") % {
                'dir': work_msalign,
                'sam': sample
                }
        command="bash -c \"%s\""%command
        cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1


    out_msalign=os.path.join(outdir,"msalign")
    out_database=os.path.join(outdir,"database")
    create_dirs([out_msalign, out_database])
    msg="Copy novel sequence database and  MS identification result(s) to output directory for %s."%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        if os.path.exists("%s/%s-pepSummary.tsv"% (work_msalign, sample)):
            command = "cp %(dir)s/%(sam)s-pepSummary.tsv %(dir)s/%(sam)s-psmSummary.tsv %(dir)s/%(sam)s-sequence.fa %(out)s/"%{
                    'dir': work_msalign,
                    'sam': sample,
                    'out': out_msalign
                    }
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
        if os.path.exists("%s/database/%s.ntx.fasta"% (work_msalign, sample)):
            command = "cp %(dir)s/database/%(sam)s.ntx.fasta %(out)s/" % {
                'dir': work_msalign,
                'sam': sample,
                'out': out_database
            }
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=msalign_log_fd, cmd_log=msalign_log, msg=msg, timeout=timeout)
 
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1
 

    return os.EX_OK
Example #13
0
def run_stringtie(alignment_bam="",
                  ref_gtf="",
                  stringtie_opts="",
                  stringtie=STRINGTIE,
                  start=0,
                  sample="",
                  nthreads=1,
                  workdir=None,
                  outdir=None,
                  timeout=TIMEOUT):

    logger.info("Running transcriptome reconstruction (StringTie) for %s" %
                sample)
    if not os.path.exists(alignment_bam):
        logger.error("Aborting!")
        raise Exception("No input alignment BAM file %s" % alignment_bam)

    work_stringtie = "%s/stringtie/%s/" % (workdir, sample)
    create_dirs([work_stringtie])
    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase StringTie work directory for %s" % sample
        command = "rm -rf %s/*" % (work_stringtie)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1
    stringtie_log = os.path.join(work_stringtie, "stringtie.log")
    stringtie_log_fd = open(stringtie_log, "w")

    if ref_gtf:
        if not os.path.exists(ref_gtf):
            logger.error("Aborting!")
            raise Exception("No reference GTF file %s" % ref_gtf)

    if ref_gtf:
        stringtie_opts += " -G %s" % ref_gtf
    if "-p " not in stringtie_opts:
        stringtie_opts += " -p %d" % nthreads

    msg = "StringTie for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s %s -o %s/transcripts.gtf -A %s/gene_abund.tab -v" % (
            stringtie, alignment_bam, stringtie_opts, work_stringtie,
            work_stringtie)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=stringtie_log_fd,
                          cmd_log=stringtie_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_stringtie = os.path.join(outdir, "stringtie", sample)
    create_dirs([out_stringtie])
    msg = "Copy predictions to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/transcripts.gtf"%work_stringtie) and \
           os.path.exists("%s/gene_abund.tab"%work_stringtie):
            command = "cp %s/transcripts.gtf %s/transcripts.gtf" % (
                work_stringtie, out_stringtie)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=stringtie_log_fd,
                              cmd_log=stringtie_log,
                              msg=msg,
                              timeout=timeout)

            command = "cp %s/gene_abund.tab %s/gene_abund.tab" % (
                work_stringtie, out_stringtie)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=stringtie_log_fd,
                              cmd_log=stringtie_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    transcripts = ""
    abundances = ""
    if os.path.exists("%s/transcripts.gtf"%out_stringtie) and \
       os.path.exists("%s/gene_abund.tab"%out_stringtie):
        logger.info("StringTie was successfull!")
        logger.info("Output isoforms: %s/transcripts.gtf" % out_stringtie)
        logger.info("Output expressions: %s/gene_abund.tab" % out_stringtie)
        transcripts = "%s/transcripts.gtf" % out_stringtie
        abundances = "%s/gene_abund.tab" % out_stringtie
    else:
        logger.info("StringTie failed!")
    return transcripts, abundances
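
# A minimal usage sketch for run_stringtie (editor's illustration, not part of the
# original source); the BAM, GTF, sample name, and directories are hypothetical
# placeholders.
if __name__ == "__main__":
    transcripts_gtf, gene_abund = run_stringtie(
        alignment_bam="out/hisat2/sample_A/alignments.sorted.bam",
        ref_gtf="refs/genes.gtf",
        start=0,
        sample="sample_A",
        nthreads=4,
        workdir="work",
        outdir="out")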
Example #14
0
def run_trinity(seq_1="",
                seq_2="",
                seq_u="",
                start=0,
                sample="",
                nthreads=1,
                trinity_opts="",
                max_mem="20G",
                workdir=None,
                outdir=None,
                timeout=TIMEOUT):
    logger.info("Running de novo assembly (TRINITY) for %s" % sample)

    #dirname="trinity_"+sample  #Trinity's output directory must contain the word 'trinity' as a safety precaution
    work_trinity = os.path.join(workdir, "trinity")
    create_dirs([work_trinity])

    #check the fq
    if seq_1 and seq_2:
        for s1 in seq_1.split(","):
            if not os.path.exists(s1):
                logger.error("Aborting!")
                raise Exception("No Mate 1 sequence file %s" % s1)
            if not s1.endswith(".fq.gz"):
                logger.error(
                    "Aborting! Please ensure the suffix of fastq files is <*>.fq.gz"
                )
                raise Exception("Fastq format error %s" % s1)

        for s2 in seq_2.split(","):
            if not os.path.exists(s2):
                logger.error("Aborting!")
                raise Exception("No Mate 2 sequence file %s" % s2)
            if not s2.endswith(".fq.gz"):
                logger.error(
                    "Aborting! Please ensure the suffix of fastq files is <*>.fq.gz"
                )
                raise Exception("Fastq format error %s" % s2)

        seq_argument = "--left %s --right %s" % (seq_1, seq_2)
        '''
        cor_1=chainMap(seq_1.split(",")).
            map(lambda x: os.path.basename(x)).
            map(lambda x: re.sub(r'(.+)\.(fq|fastq)(\.gz)?', r'\1', x)).
            map(lambda x: work_trinity + "/corfq/"+ x +".cor.fq.gz")
        scor_1=",".join(cor_1)
        cor_2=chainMap(seq_2.split(",")).
            map(lambda x: os.path.basename(x)).
            map(lambda x: re.sub(r'(.+)\.(fq|fastq)(\.gz)?', r'\1', x)).
            map(lambda x: work_trinity + "/corfq/"+ x +".cor.fq.gz")
        scor_2=",".join(cor_2)
        '''
    elif seq_u:
        for su in seq_u.split(","):
            if not os.path.exists(su):
                logger.error("Aborting!")
                raise Exception("No unpaired sequence file %s" % su)
            if not su.endswith(".fq.gz"):
                logger.error(
                    "Aborting! Please ensure the suffix of fastq files is <*>.fq.gz"
                )
                raise Exception("Fastq format error %s" % su)

        seq_argument = "--single %s" % (seq_u)

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase Trinity work directory for %s" % sample
        #Trinity's running status is stored in hidden files. It's necessary to delete these hidden files before rerunning Trinity. Note that '*' does not match hidden files.
        command = "rm -rf %(wk)s/* %(wk)s/.*" % {'wk': work_trinity}
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    trinity_log = os.path.join(work_trinity, "trinity.log")
    trinity_log_fd = open(trinity_log, "w")

    #seq_argument="-%s -%s %s "%(file_format,read_type,seq_argument)

    msg = "Run Trinity for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "Trinity --seqType fq --max_memory %s --CPU %s  --output %s %s %s" % (
            max_mem, nthreads, work_trinity, seq_argument, trinity_opts)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command,
                               logger,
                               raise_exception=True,
                               env_dict={"OMP_NUM_THREADS": str(nthreads)})
        retcode = cmd.run(cmd_log_fd_out=trinity_log_fd,
                          cmd_log=trinity_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_trinity = os.path.join(outdir, "trinity")
    create_dirs([out_trinity])
    msg = "Copy trinity transcripts to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/Trinity.fasta" % work_trinity):
            command = "cp %s/Trinity.fasta %s/" % (work_trinity, out_trinity)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=trinity_log_fd,
                              cmd_log=trinity_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    return os.EX_OK
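
# A minimal usage sketch for run_trinity (editor's illustration, not part of the
# original source); the paired FASTQ paths (which must end in .fq.gz), sample name,
# and directories are hypothetical placeholders.
if __name__ == "__main__":
    run_trinity(seq_1="data/sample_A_1.fq.gz",
                seq_2="data/sample_A_2.fq.gz",
                start=0,
                sample="sample_A",
                nthreads=8,
                max_mem="20G",
                workdir="work",
                outdir="out")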
Example #15
0
def run_idpfusion(alignment="",
                  short_junction="",
                  long_alignment="",
                  mode_number=0,
                  short_fasta="",
                  long_fasta="",
                  ref_genome="",
                  ref_all_gpd="",
                  ref_gpd="",
                  uniqueness_bedgraph="",
                  genome_bowtie2_idx="",
                  transcriptome_bowtie2_idx="",
                  read_length=100,
                  idpfusion_cfg="",
                  idpfusion=IDPFUSION,
                  samtools=SAMTOOLS,
                  gmap=GMAP,
                  gmap_idx="",
                  star_dir=STAR_DIR,
                  bowtie2_dir=BOWTIE2_DIR,
                  start=0,
                  sample="",
                  nthreads=1,
                  workdir=None,
                  outdir=None,
                  timeout=TIMEOUT):

    logger.info("Running long read fusion Detection (IDP-fusion) for %s" %
                sample)
    if not os.path.exists(alignment):
        logger.error("Aborting!")
        raise Exception("No input short read alignment BAM/SAM file %s" %
                        alignment)
    if not os.path.exists(short_junction):
        logger.error("Aborting!")
        raise Exception("No input short read junction BED file %s" %
                        short_junction)

    if idpfusion_cfg:
        if not os.path.exists(idpfusion_cfg):
            logger.error("Aborting!")
            raise Exception("No input .cfg file %s" % idpfusion_cfg)

    if mode_number > 0:
        start = 4

    work_idpfusion = "%s/idpfusion/%s/" % (workdir, sample)
    create_dirs([work_idpfusion])

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase IDP-fusion work directory for %s" % sample
        command = "rm -rf %s/*" % (work_idpfusion)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    idpfusion_log = os.path.join(work_idpfusion, "idpfusion.log")
    idpfusion_log_fd = open(idpfusion_log, "w")

    msg = "converting BAM to SAM for %s" % sample
    logger.info("--------------------------STEP %s--------------------------" %
                step)
    if start <= step:
        if alignment.endswith('.bam'):
            command = "%s view -h -o %s/alignments.sam %s " % (
                samtools, work_idpfusion, alignment)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd,
                              cmd_log=idpfusion_log,
                              msg=msg,
                              timeout=timeout)
            alignment = "%s/alignments.sam" % (work_idpfusion)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Fix soft-clipped reads in SAM for %s" % sample
    logger.info("--------------------------STEP %s--------------------------" %
                step)
    if start <= step:
        logger.info("Task: %s" % msg)
        corrected_alignment = "%s/alignments_corrected.sam" % (work_idpfusion)
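        # Leading and trailing soft clips (CIGAR op "S", numeric code 4) are trimmed from
        # the read sequence and base qualities and removed from the CIGAR string, so the
        # SAM fed to IDP-fusion contains only fully aligned bases.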
        with open(alignment, "r") as csv_file_i:
            with open(corrected_alignment, "w") as csv_file_o:
                spamreader = csv.reader(csv_file_i,
                                        delimiter='\t',
                                        quotechar='|')
                spamwriter = csv.writer(csv_file_o,
                                        delimiter='\t',
                                        quotechar='|',
                                        quoting=csv.QUOTE_MINIMAL)
                for row in spamreader:
                    if row[0][0] == "@":
                        spamwriter.writerow(row)
                        continue
                    if row[5] == "*":
                        continue
                    if "S" in row[5]:
                        cigartuple = cigarstring_to_tuple(row[5])
                        if cigartuple[0][0] == 4:
                            row[9] = row[9][cigartuple[0][1]:]
                            row[10] = row[10][cigartuple[0][1]:]
                            cigartuple = cigartuple[1:]
                        if cigartuple[-1][0] == 4:
                            row[9] = row[9][:-cigartuple[-1][1]]
                            row[10] = row[10][:-cigartuple[-1][1]]
                            cigartuple = cigartuple[:-1]
                        row[5] = "".join([
                            "%d%s" % (x[1], CIGAR_OP_DICT_rev[x[0]])
                            for x in cigartuple
                        ])
                    spamwriter.writerow(row)
        alignment = corrected_alignment
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Fix junction bed for %s" % sample
    logger.info("--------------------------STEP %s--------------------------" %
                step)
    if start <= step:
        logger.info("Task: %s" % msg)
        corrected_junction = "%s/splicesites_corrected.bed" % (work_idpfusion)
        with open(short_junction, "r") as csv_file_i:
            with open(corrected_junction, "w") as csv_file_o:
                spamreader = csv.reader(csv_file_i,
                                        delimiter='\t',
                                        quotechar='|')
                spamwriter = csv.writer(csv_file_o,
                                        delimiter='\t',
                                        quotechar='|',
                                        quoting=csv.QUOTE_MINIMAL)
                for row in spamreader:
                    if len(row) < 4:
                        spamwriter.writerow(row)
                        continue
                    if "]" in row[3]:
                        spamwriter.writerow(row)
                        continue
                    row[3] = "(2)[2_2](2/0)"
                    spamwriter.writerow(row)
        short_junction = corrected_junction
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Preparing run.cfg for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        logger.info("Task: %s" % msg)
        if idpfusion_cfg:
            msg = "copy IDP-fusion .cfg file for %s" % sample
            command = "cp  %s %s/run.cfg" % (idpfusion_cfg, work_idpfusion)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd,
                              cmd_log=idpfusion_log,
                              msg=msg,
                              timeout=timeout)
        else:
            f = open("%s/run.cfg" % work_idpfusion, 'w')
            f.close()
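        # Merge the configuration: key=value pairs already present in run.cfg are kept
        # as-is; any key still missing gets a pipeline default written below.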

        cgf_dict = {}
        with open("%s/run.cfg" % work_idpfusion, 'r') as cfg_file:
            for line in cfg_file:
                line = line.strip()
                if line == '':
                    continue
                if "=" in line and not line[0] == '#':
                    k, v = line.split("=", 1)
                    k = k.strip()
                    v = v.strip()
                    cgf_dict[k] = v

        with open("%s/run.cfg" % work_idpfusion, 'w') as cfg_file:
            for k, v in cgf_dict.iteritems():
                cfg_file.write("%s = %s \n" % (k, v))
            if "temp_foldername" not in cgf_dict:
                cfg_file.write("temp_foldername = %s/tmp/ \n" % work_idpfusion)
            if "output_foldername" not in cgf_dict:
                cfg_file.write("output_foldername = %s/out/ \n" %
                               work_idpfusion)
            if "Nthread" not in cgf_dict:
                cfg_file.write("Nthread = %d \n" % nthreads)
            if "LR_psl_pathfilename" not in cgf_dict:
                if long_alignment and os.path.exists(long_alignment):
                    cfg_file.write("LR_psl_pathfilename = %s \n" %
                                   long_alignment)
            if "LR_pathfilename" not in cgf_dict:
                cfg_file.write("LR_pathfilename = %s \n" % long_fasta)
            if "SR_sam_pathfilename" not in cgf_dict:
                cfg_file.write("SR_sam_pathfilename = %s \n" % alignment)
            if "SR_jun_pathfilename" not in cgf_dict:
                cfg_file.write("SR_jun_pathfilename = %s \n" % short_junction)
            if "SR_pathfilename" not in cgf_dict:
                cfg_file.write("SR_pathfilename = %s \n" % short_fasta)
            if "SR_aligner_choice" not in cgf_dict:
                cfg_file.write("SR_aligner_choice = STAR \n")
            if "star_path" not in cgf_dict:
                cfg_file.write("star_path = %s \n" % star_dir)
            if "gmap_executable_pathfilename" not in cgf_dict:
                cfg_file.write("gmap_executable_pathfilename = %s \n" % gmap)
            if "gmap_index_pathfoldername" not in cgf_dict:
                cfg_file.write("gmap_index_pathfoldername = %s \n" % gmap_idx)
            if "genome_bowtie2_index_pathfilename" not in cgf_dict:
                cfg_file.write("genome_bowtie2_index_pathfilename = %s \n" %
                               genome_bowtie2_idx)
            if "transcriptome_bowtie2_index_pathfilename" not in cgf_dict:
                cfg_file.write(
                    "transcriptome_bowtie2_index_pathfilename = %s \n" %
                    transcriptome_bowtie2_idx)
            if "allref_annotation_pathfilename" not in cgf_dict:
                cfg_file.write("allref_annotation_pathfilename = %s \n" %
                               ref_all_gpd)
            if "ref_annotation_pathfilename" not in cgf_dict:
                cfg_file.write("ref_annotation_pathfilename = %s \n" % ref_gpd)
            if "genome_pathfilename" not in cgf_dict:
                cfg_file.write("genome_pathfilename = %s \n" % ref_genome)
            if "estimator_choice" not in cgf_dict:
                cfg_file.write("estimator_choice = MAP \n")
            if "FPR" not in cgf_dict:
                cfg_file.write("FPR = 0.1 \n")
            if "Njun_limit" not in cgf_dict:
                cfg_file.write("Njun_limit = 10 \n")
            if "Niso_limit" not in cgf_dict:
                cfg_file.write("Niso_limit = 20 \n")
            if "L_exon_limit" not in cgf_dict:
                cfg_file.write("L_exon_limit = 1700 \n")
            if "L_min_intron" not in cgf_dict:
                cfg_file.write("L_min_intron = 68 \n")
            if "Bfile_Npt" not in cgf_dict:
                cfg_file.write("Bfile_Npt = 50 \n")
            if "Bfile_Nbin" not in cgf_dict:
                cfg_file.write("Bfile_Nbin = 5 \n")
            if "min_LR_overlap_len" not in cgf_dict:
                cfg_file.write("min_LR_overlap_len = 100 \n")
            if "LR_fusion_point_err_margin" not in cgf_dict:
                cfg_file.write("LR_fusion_point_err_margin = 100 \n")
            if "min_LR_fusion_point_search_distance" not in cgf_dict:
                cfg_file.write("min_LR_fusion_point_search_distance = 20 \n")
            if "uniq_LR_alignment_margin_perc" not in cgf_dict:
                cfg_file.write("uniq_LR_alignment_margin_perc = 20 \n")
            if "Niso_fusion_limit" not in cgf_dict:
                cfg_file.write("Niso_fusion_limit = 1000 \n")
            if "psl_type" not in cgf_dict:
                cfg_file.write("psl_type = 0 \n")
            if "read_length" not in cgf_dict:
                cfg_file.write("read_length = %d \n" % read_length)
            if "min_junction_overlap_len" not in cgf_dict:
                cfg_file.write("min_junction_overlap_len = 10 \n")
            if "I_refjun_isoformconstruction" not in cgf_dict:
                cfg_file.write("I_refjun_isoformconstruction = 1 \n")
            if "I_ref5end_isoformconstruction" not in cgf_dict:
                cfg_file.write("I_ref5end_isoformconstruction = 1 \n")
            if "I_ref3end_isoformconstruction" not in cgf_dict:
                cfg_file.write("I_ref3end_isoformconstruction = 1 \n")
            if "fusion_mode" not in cgf_dict:
                cfg_file.write("fusion_mode = 1 \n")
            if "uniqueness_bedGraph_pathfilename" not in cgf_dict:
                cfg_file.write("uniqueness_bedGraph_pathfilename = %s \n" %
                               uniqueness_bedgraph)
            if "exon_construction_junction_span" not in cgf_dict:
                cfg_file.write("exon_construction_junction_span = 1 \n")
            if "aligner_choice" not in cgf_dict:
                cfg_file.write("aligner_choice = gmap \n")
            if "aligner_choice" not in cgf_dict:
                cfg_file.write("aligner_choice = gmap \n")
            if "three_primer" not in cgf_dict:
                cfg_file.write("three_primer =  \n")
            if "five_primer" not in cgf_dict:
                cfg_file.write("five_primer =  \n")
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    if star_dir:
        os.environ["PATH"] += ":%s/" % star_dir
    if bowtie2_dir:
        os.environ["PATH"] += ":%s/" % bowtie2_dir

    msg = "IDP-fusion for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s/run.cfg %d" % (idpfusion, work_idpfusion, mode_number)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd,
                          cmd_log=idpfusion_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Convert transcript GPD file to GTF for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/out/isoform.gpd" % work_idpfusion):
            sort_gpd("%s/out/isoform.gpd" % work_idpfusion,
                     "%s/out/isoform_sorted.gpd" % work_idpfusion)
            command = "gpd2gtf.py \
                  %s/out/isoform_sorted.gpd %s/out/isoform.exp %s/out/isoform.gtf IDP" % (
                work_idpfusion, work_idpfusion, work_idpfusion)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd,
                              cmd_log=idpfusion_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_idpfusion = os.path.join(outdir, "idpfusion", sample)
    create_dirs([out_idpfusion])
    msg = "Copy predictions to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/out/fusion_report.tsv" % work_idpfusion):
            command = "cp %s/out/fusion_report.tsv %s/fusion_report.tsv" % (
                work_idpfusion, out_idpfusion)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd,
                              cmd_log=idpfusion_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    fusions = ""
    if os.path.exists("%s/fusion_report.tsv" % out_idpfusion):
        logger.info("IDP-fusion was successfull!")
        logger.info("Output fusions: %s/fusion_report.tsv" % out_idpfusion)
        fusions = "%s/fusion_report.tsv" % out_idpfusion
    else:
        logger.info("IDP-fusion failed!")
    return fusions
Example #16
0
def run_hisat2(align_idx=None,
               seq_1="",
               seq_2="",
               seq_u="",
               seq_sra="",
               ref_gtf="",
               hisat2_opts="",
               hisat2=HISAT2,
               hisat2_sps=HISAT2_SPS,
               samtools=SAMTOOLS,
               start=0,
               sample="",
               nthreads=1,
               workdir=None,
               outdir=None,
               timeout=TIMEOUT):

    logger.info("Running alignment (HISAT2) for %s" % sample)
    if not os.path.exists(align_idx + ".1.ht2"):
        logger.error("Aborting!")
        raise Exception("No HISAT index file %s.1.ht2" % align_idx)

    if seq_1 and seq_2:
        for s1 in seq_1.split(","):
            if not os.path.exists(s1):
                logger.error("Aborting!")
                raise Exception("No Mate 1 sequence file %s" % s1)
        for s2 in seq_2.split(","):
            if not os.path.exists(s2):
                logger.error("Aborting!")
                raise Exception("No Mate 2 sequence file %s" % s2)
        seq_argument = "-1 %s -2 %s" % (seq_1, seq_2)
    elif seq_u:
        seq_argument = "-U %s" % (seq_u)
        for su in seq_u.split(","):
            if not os.path.exists(su):
                logger.error("Aborting!")
                raise Exception("No unpaired sequence file %s" % su)

    elif seq_sra:
        seq_argument = "--sra-acc %s" % (seq_sra)
        for sr in seq_sra.split(","):
            if not os.path.exists(sr):
                logger.error("Aborting!")
                raise Exception("No sra sequence file %s" % sr)

    work_hisat2 = os.path.join(workdir, "hisat2", sample)
    create_dirs([work_hisat2])

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase HISAT2 work directory for %s" % sample
        command = "rm -rf %s/*" % (work_hisat2)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    hisat2_log = os.path.join(work_hisat2, "hisat2.log")
    hisat2_log_fd = open(hisat2_log, "w")

    ksps = ""
    msg = "Prepare known-splicesites for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if ref_gtf:
            if not os.path.exists(ref_gtf):
                logger.error("Aborting!")
                raise Exception("No reference GTF file %s" % ref_gtf)
            else:
                ksps = ref_gtf.strip() + "known-splicesite.txt"
                if os.path.exists(ksps):
                    logger.info(
                        "Will use the precomputed %s as --known-splicesite-infile for HISAT2"
                        % ksps)
                else:
                    msg = "compute --known-splicesite-infile for HISAT2"
                    ksps = os.path.join(work_hisat2, "known-splicesite.txt")
                    ksps_fd = open(ksps, "w")

                    command = "%s %s" % (hisat2_sps, ref_gtf)
                    command = "bash -c \"%s\"" % command
                    cmd = TimedExternalCmd(command,
                                           logger,
                                           raise_exception=True)
                    retcode = cmd.run(cmd_log_fd_out=ksps_fd,
                                      msg=msg,
                                      timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    if "--dta " not in hisat2_opts:
        hisat2_opts += " --dta"
    if "--rg-id " not in hisat2_opts:
        hisat2_opts += " --rg-id hisat2"
    if "--rg " not in hisat2_opts:
        hisat2_opts += " --rg SM:%s" % sample
    if "--threads " not in hisat2_opts:
        hisat2_opts += " --threads %d" % nthreads
    if ksps:
        hisat2_opts += " --known-splicesite-infile %s" % ksps

    msg = "HISAT2 for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s  -x %s %s -S %s/alignments.sam --novel-splicesite-outfile %s/splicesites.tab" % (
            hisat2, hisat2_opts, align_idx, seq_argument, work_hisat2,
            work_hisat2)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd,
                          cmd_log=hisat2_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "converting SAM to BAM for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s view -Su %s/alignments.sam -@ %d -o %s/alignments.bam" % (
            samtools, work_hisat2, nthreads, work_hisat2)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd,
                          cmd_log=hisat2_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "sorting BAM for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s sort  -@ %d -T %s/alignments.sorted -o %s/alignments.sorted.bam %s/alignments.bam  " % (
            samtools, nthreads, work_hisat2, work_hisat2, work_hisat2)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd,
                          cmd_log=hisat2_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Converting junctions to BED for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "hisat2_jun2bed.py %s/splicesites.tab %s/splicesites.bed " % (
            work_hisat2, work_hisat2)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd,
                          cmd_log=hisat2_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "Clean temp alignment files for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "rm %s/alignments.sam %s/alignments.bam" % (work_hisat2,
                                                              work_hisat2)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd,
                          cmd_log=hisat2_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_hisat2 = os.path.join(outdir, "hisat2", sample)
    create_dirs([out_hisat2])
    msg = "Copy predictions to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/alignments.sorted.bam"%work_hisat2) and \
           os.path.exists("%s/splicesites.tab"%work_hisat2) and \
           os.path.exists("%s/splicesites.bed"%work_hisat2):
            command = "cp %s/alignments.sorted.bam %s/alignments.sorted.bam" % (
                work_hisat2, out_hisat2)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd,
                              cmd_log=hisat2_log,
                              msg=msg,
                              timeout=timeout)
            command = "cp %s/splicesites.tab %s/splicesites.tab" % (
                work_hisat2, out_hisat2)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd,
                              cmd_log=hisat2_log,
                              msg=msg,
                              timeout=timeout)
            command = "cp %s/splicesites.bed %s/splicesites.bed" % (
                work_hisat2, out_hisat2)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd,
                              cmd_log=hisat2_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    alignments_bam = ""
    junctions_tab = ""
    junctions_bed = ""
    if os.path.exists("%s/alignments.sorted.bam" % out_hisat2):
        logger.info("HISAT2 was successfull!")
        logger.info("Output alignment: %s/alignments.sorted.bam" % out_hisat2)
        logger.info("Output junction tab: %s/splicesites.tab" % out_hisat2)
        logger.info("Output junction bed: %s/splicesites.bed" % out_hisat2)
        alignments_bam = "%s/alignments.sorted.bam" % out_hisat2
        junctions_tab = "%s/splicesites.tab" % out_hisat2
        junctions_bed = "%s/splicesites.bed" % out_hisat2
    else:
        logger.info("HISAT2 failed!")
    return alignments_bam, junctions_tab, junctions_bed
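
# A minimal usage sketch for run_hisat2 (editor's illustration, not part of the original
# source); the index prefix, FASTQ paths, GTF, sample name, and directories are
# hypothetical placeholders.
if __name__ == "__main__":
    bam, junc_tab, junc_bed = run_hisat2(align_idx="refs/genome_hisat2",
                                         seq_1="data/sample_A_1.fq.gz",
                                         seq_2="data/sample_A_2.fq.gz",
                                         ref_gtf="refs/genes.gtf",
                                         start=0,
                                         sample="sample_A",
                                         nthreads=4,
                                         workdir="work",
                                         outdir="out")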
Example #17
0
def run_giremi(alignment="", variant="", 
                  strand_pos="", genes_pos="",
                  ref_genome="", knownsites="",
                  giremi_dir="", htslib_dir="",
                  samtools=SAMTOOLS, gatk=GATK,                  
                  java=JAVA, giremi_opts="", java_opts="",
                  VariantAnnotator_opts="",  
                  start=0, sample= "", nthreads=1,
                  workdir=None, outdir=None, timeout=TIMEOUT):


    logger.info("Running RNA editing detection (GIREMI) for %s"%sample)
    if not os.path.exists(alignment):
        logger.error("Aborting!")
        raise Exception("No alignment file %s"%alignment)
    if not os.path.exists(variant):
        logger.error("Aborting!")
        raise Exception("No variant VCF file %s"%variant)
    if not os.path.exists(strand_pos):
        logger.error("Aborting!")
        raise Exception("No strand position BED file %s"%strand_pos)
    if not os.path.exists(genes_pos):
        logger.error("Aborting!")
        raise Exception("No genes position BED file %s"%genes_pos)
    if not os.path.exists(ref_genome):
        logger.error("Aborting!")
        raise Exception("No reference genome FASTA file %s"%ref_genome)
    if not os.path.exists(knownsites):
        logger.error("Aborting!")
        raise Exception("No VCF knownsites file %s"%knownsites)
    if giremi_dir:
        if not os.path.exists(giremi_dir):
            logger.error("Aborting!")
            raise Exception("No GIREMI directory %s"%giremi_dir)

    work_giremi=os.path.join(workdir,"giremi",sample)
    create_dirs([work_giremi])
    
    if nthreads>1:
        if "-nt " not in VariantAnnotator_opts:
            VariantAnnotator_opts += " -nt %d"%nthreads 

    if "-Xms" not in java_opts:
        java_opts += " %s"%JAVA_XMS
    if "-Xmx" not in java_opts:
        java_opts += " %s"%JAVA_XMG
    if "-Djava.io.tmpdir" not in java_opts:
        java_opts += " -Djava.io.tmpdir=%s/javatmp/"%(work_giremi)



    step=0
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        msg = "Erase GIREMI work directory for %s"%sample
        command="rm -rf %s/*" % (
            work_giremi)
        command="bash -c \"%s\""%command        
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg,timeout=timeout)
    step+=1
    
    giremi_log = os.path.join(work_giremi, "giremi.log")
    giremi_log_fd = open(giremi_log, "w")
    
    
    msg = "Sort BAM by name for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command="%s sort -n -@ %d %s %s/alignments.name_sorted" % (
            samtools, nthreads, alignment, work_giremi)
        command="bash -c \"%s\""%command        
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1
        

    msg = "Filter alignments mapped to multiple chromosoms for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        logger.info(msg)
        filter_multi_chr_alignments("%s/alignments.name_sorted.bam"%work_giremi,"%s/alignments.chr_unique.bam"%work_giremi)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    msg = "Sort BAM by pos for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command="%s sort -@ %d %s/alignments.chr_unique.bam %s/alignments.pos_sorted " % (
            samtools, nthreads, work_giremi, work_giremi)
        command="bash -c \"%s\""%command        
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    msg = "GATK VariantAnnotator for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command="%s %s -jar %s -T VariantAnnotator -R %s -V %s -L %s -o %s/annotated.vcf --dbsnp %s %s" % (
            java, java_opts, gatk, ref_genome,variant,variant,work_giremi,knownsites,VariantAnnotator_opts)
        command="bash -c \"%s\""%command      
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)   
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    msg="Find variant strands for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        logger.info(msg)
        find_SNV_strands(strand_pos, genes_pos,  "%s/annotated.vcf"%work_giremi, "%s/SNV_annotated.bed"%work_giremi)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    if htslib_dir:
        if "LD_LIBRARY_PATH" in os.environ:
            os.environ["LD_LIBRARY_PATH"] += ":%s/"%htslib_dir
        else:
            os.environ["LD_LIBRARY_PATH"] = htslib_dir

    if giremi_dir:
        os.environ["PATH"] += ":%s/"%giremi_dir
                
    msg = "Run GIREMI for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command="cd %s && %s %s -f %s -l %s/SNV_annotated.bed -o %s/giremi_out.txt %s/alignments.pos_sorted.bam" % (
            giremi_dir,GIREMI, giremi_opts, os.path.abspath(ref_genome), os.path.abspath(work_giremi), os.path.abspath(work_giremi),os.path.abspath(work_giremi))
        command="bash -c \"%s\""%command        
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1
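    # If GIREMI wrote giremi_out.txt but no .res result, the retry below assumes "N"
    # bases are to blame: it collects positions whose reference or read base is "N",
    # removes them from the annotated SNV BED, and runs GIREMI once more.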

        
    if os.path.exists("%s/giremi_out.txt"%work_giremi) and not os.path.exists("%s/giremi_out.txt.res"%work_giremi):

        msg="Identify N variants for %s"%sample
        if start<=step:
            logger.info("--------------------------STEP %s--------------------------"%step)
            logger.info(msg)
            with open("%s/giremi_out.txt"%work_giremi) as csv_file_i:
                spamreader = csv.reader(csv_file_i, delimiter='\t', quotechar='|')
                with open("%s/N.bed"%work_giremi, 'wb') as csvfile_o:
                    spamwriter = csv.writer(csvfile_o, delimiter='\t',
                                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
                    for row in spamreader:
                        if (row[5]=="N" or row[8]=="N"):
                            spamwriter.writerow([row[0],int(row[1])-1,row[1]])
        else:
            logger.info("Skipping step %d: %s"%(step,msg))
        step+=1

        cnt=len(pybedtools.BedTool("%s/N.bed"%work_giremi))
        if cnt>0:
            msg="Remove N variants for %s"%sample
            if start<=step:
                logger.info("--------------------------STEP %s--------------------------"%step)
                logger.info(msg)
                pybedtools.BedTool("%s/SNV_annotated.bed"%work_giremi).intersect(
                "%s/N.bed"%work_giremi,r=True, f=1, v=True).saveas("%s/SNV_annotated_filtered.bed"%work_giremi)
            else:
                logger.info("Skipping step %d: %s"%(step,msg))
            step+=1
            
            msg = "Rerun GIREMI for %s"%sample
            if start<=step:
                logger.info("--------------------------STEP %s--------------------------"%step)
                if os.path.exists("%s/SNV_annotated_filtered.bed"%work_giremi):
                    command="cd %s && %s %s -f %s -l %s/SNV_annotated_filtered.bed -o %s/giremi_out.txt %s/alignments.pos_sorted.bam" % (
                        giremi_dir,GIREMI, giremi_opts, os.path.abspath(ref_genome), os.path.abspath(work_giremi), os.path.abspath(work_giremi),os.path.abspath(work_giremi))
                    command="bash -c \"%s\""%command        
                    cmd = TimedExternalCmd(command, logger, raise_exception=False)
                    retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
                else:
                    logger.info("No file %s/SNV_annotated_filtered.bed"%work_giremi)
            else:
                logger.info("Skipping step %d: %s"%(step,msg))
            step+=1
        else:
            step+=2
    else:
        step+=3

    out_giremi=os.path.join(outdir,"giremi",sample)
    create_dirs([out_giremi])
    msg="Copy predictions to output directory for %s."%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        if os.path.exists("%s/giremi_out.txt.res"%work_giremi):
            command = "cp %s/giremi_out.txt.res %s/giremi_out.txt.res"%(
                       work_giremi, out_giremi)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)   
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1


    edits = ""
    if os.path.exists("%s/giremi_out.txt.res"%out_giremi):
        logger.info("GIREMI was successfull!")
        logger.info("Output edits: %s/giremi_out.txt.res"%out_giremi)
        edits = "%s/giremi_out.txt.res"%out_giremi   
    else:            
        logger.info("GIREMI failed!")
    return edits
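
# A hedged, illustrative sketch (not part of the original example): one way the
# returned "edits" path could be read downstream. It only assumes the GIREMI
# result is plain tab-delimited text; no specific column layout is assumed.
def _load_giremi_table(edits_path):
    """Read a tab-delimited result file into a list of row lists."""
    rows = []
    with open(edits_path) as res_fd:
        for line in res_fd:
            rows.append(line.rstrip("\n").split("\t"))
    return rows
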
Beispiel #18
0
def run_starlong(long="",
                 genome_dir="",
                 ref_gtf="",
                 starlong=STARLONG,
                 sam2psl=SAM2PSL,
                 samtools=SAMTOOLS,
                 starlong_opts="",
                 start=0,
                 sample="",
                 nthreads=1,
                 workdir=None,
                 outdir=None,
                 timeout=TIMEOUT):

    logger.info("Running long read alignment (STARlong) for %s" % sample)
    if not os.path.exists(os.path.join(genome_dir, "SAindex")):
        logger.error("Aborting!")
        raise Exception("No SAindex file in %s" % genome_dir)

    if long:
        if not os.path.exists(long):
            logger.error("Aborting!")
            raise Exception("No long read sequence file %s" % long)

    work_starlong = os.path.join(workdir, "starlong", sample)
    create_dirs([work_starlong])

    step = 0
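    # Checkpoint pattern used throughout these examples: "step" numbers the
    # pipeline stages and "start" selects where to resume; a stage executes
    # only when start <= step and is otherwise skipped.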
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase STARlong work directory for %s" % sample
        command = "rm -rf %s/*" % (work_starlong)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    starlong_log = os.path.join(work_starlong, "starlong.log")
    starlong_log_fd = open(starlong_log, "w")

    if ref_gtf:
        if not os.path.exists(ref_gtf):
            logger.error("Aborting!")
            raise Exception("No reference GTF file %s" % ref_gtf)

    if "--outSAMattrRGline" not in starlong_opts:
        starlong_opts += " --outSAMattrRGline ID:STARlong SM:%s" % sample
    if "--runThreadN " not in starlong_opts:
        starlong_opts += " --runThreadN %d" % nthreads
    if ref_gtf:
        starlong_opts += " --sjdbGTFfile %s" % ref_gtf
    for k, v in STARLONG_DEFAULTS.iteritems():
        if k not in starlong_opts:
            starlong_opts += " --%s %s" % (k, v)

    msg = "STARlong for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s --runMode alignReads %s --genomeDir %s  --readFilesIn %s  --outFileNamePrefix %s/" % (
            starlong, starlong_opts, genome_dir, long, work_starlong)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=starlong_log_fd,
                          cmd_log=starlong_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "converting SAM to PSL for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s -i %s/Aligned.out.sam -o %s/Aligned.out.psl" % (
            sam2psl, work_starlong, work_starlong)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=starlong_log_fd,
                          cmd_log=starlong_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "converting SAM to BAM for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s view -Su %s/Aligned.out.sam -@ %d -o %s/Aligned.out.bam" % (
            samtools, work_starlong, nthreads, work_starlong)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=starlong_log_fd,
                          cmd_log=starlong_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    #
    #     msg = "Clean temp alignment files for %s"%sample
    #     if start<=step:
    #         logger.info("--------------------------STEP %s--------------------------"%step)
    #         command="rm %s/Aligned.out.sam" % (work_starlong)
    #         command="bash -c \"%s\""%command
    #         cmd = TimedExternalCmd(command, logger, raise_exception=True)
    #         retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout)
    #     else:
    #         logger.info("Skipping step %d: %s"%(step,msg))
    #     step+=1

    out_starlong = os.path.join(outdir, "starlong", sample)
    create_dirs([out_starlong])
    msg = "Copy predictions to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/Aligned.out.psl" % work_starlong):
            command = "cp %s/Aligned.out.psl %s/Aligned.out.psl" % (
                work_starlong, out_starlong)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=starlong_log_fd,
                              cmd_log=starlong_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    alignments_psl = ""
    if os.path.exists("%s/Aligned.out.psl" % out_starlong):
        logger.info("STARlong was successfull!")
        logger.info("Output alignment: %s/Aligned.out.psl" % out_starlong)
        alignments_psl = "%s/Aligned.out.psl" % out_starlong
    else:
        logger.info("STARlong failed!")
    return alignments_psl
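
# Hedged usage sketch (not part of the original example). All paths, the sample
# name, and the thread count below are hypothetical placeholders; only the
# keyword arguments themselves come from the run_starlong signature above.
def _example_run_starlong():
    return run_starlong(long="/path/to/long_reads.fastq",
                        genome_dir="/path/to/star_index/",
                        ref_gtf="/path/to/annotation.gtf",
                        sample="sampleA",
                        nthreads=8,
                        workdir="/path/to/workdir",
                        outdir="/path/to/outdir")
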
def run_idp(alignment="", short_junction="", long_alignment="",mode_number=0, 
                  ref_genome="", ref_all_gpd="", ref_gpd="",read_length=100,
                  idp_cfg="", idp=IDP, samtools=SAMTOOLS,
                  start=0, sample= "", nthreads=1,
                  workdir=None, outdir=None, timeout=TIMEOUT):

    logger.info("Running long-read transcriptome reconstruction (IDP) for %s"%sample)
    if not os.path.exists(alignment):
        logger.error("Aborting!")
        raise Exception("No input short read alignment BAM/SAM file %s"%alignment)
    if not os.path.exists(short_junction):
        logger.error("Aborting!")
        raise Exception("No input short read junction BED file %s"%short_junction)
    if not os.path.exists(long_alignment):
        logger.error("Aborting!")
        raise Exception("No input long read alignment PSL file %s"%long_alignment)
        
    if idp_cfg:
        if not os.path.exists(idp_cfg):
            logger.error("Aborting!")
            raise Exception("No input .cfg file %s"%idp_cfg)
        

    
    if mode_number>0:
        start=4
    
    work_idp="%s/idp/%s/"%(workdir,sample)
    create_dirs([work_idp])

    step=0
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        msg = "Erase IDP work directory for %s"%sample
        command="rm -rf %s/*" % (
            work_idp)
        command="bash -c \"%s\""%command        
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg,timeout=timeout)
    step+=1



    idp_log = os.path.join(work_idp, "idp.log")
    idp_log_fd = open(idp_log, "w")

    msg = "converting BAM to SAM for %s"%sample
    logger.info("--------------------------STEP %s--------------------------"%step)
    if start<=step:
        if alignment.endswith('.bam'):
            command = "%s view -h -o %s/alignments.sam %s " % (samtools,work_idp,alignment)
            command="bash -c \"%s\""%command       
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)
            alignment =  "%s/alignments.sam"%(work_idp)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1


    msg = "Preparing run.cfg for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        if idp_cfg:
            msg = "copy IDP .cfg file for %s"%sample
            command="cp  %s %s/run.cfg" % (
                idp_cfg, work_idp)
            command="bash -c \"%s\""%command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)   
        else:
            f=open("%s/run.cfg"%work_idp, 'w')
            f.close()

        cgf_dict={}
        with open("%s/run.cfg"%work_idp , 'r') as cfg_file:
            for line in cfg_file:
                line = line.strip()
                if line=='':
                    continue
                if "=" in line and not line[0]=='#' :
                    k,v=line.split("=")
                    k=k.strip()
                    v=v.strip()
                    cgf_dict[k]=v
                    
        with open("%s/run.cfg"%work_idp , 'w') as cfg_file:
            for k,v in cgf_dict.iteritems():
                cfg_file.write("%s = %s \n"%(k,v))
            if "temp_foldername" not in cgf_dict:
                cfg_file.write("temp_foldername = %s/tmp/ \n"%work_idp)
            if "output_foldername" not in cgf_dict:
                cfg_file.write("output_foldername = %s/out/ \n"%work_idp)
            if "Nthread" not in cgf_dict:
                cfg_file.write("Nthread = %d \n"%nthreads)
            if "LR_psl_pathfilename" not in cgf_dict:
                cfg_file.write("LR_psl_pathfilename = %s \n"%long_alignment)
            if "SR_sam_pathfilename" not in cgf_dict:
                cfg_file.write("SR_sam_pathfilename = %s \n"%alignment)
            if "SR_jun_pathfilename" not in cgf_dict:
                cfg_file.write("SR_jun_pathfilename = %s \n"%short_junction)
            if "genome_pathfilename" not in cgf_dict:       
                cfg_file.write("genome_pathfilename = %s \n"%ref_genome)
            if "allref_annotation_pathfilename" not in cgf_dict:       
                cfg_file.write("allref_annotation_pathfilename = %s \n"%ref_all_gpd)
            if "ref_annotation_pathfilename" not in cgf_dict:       
                cfg_file.write("ref_annotation_pathfilename = %s \n"%ref_gpd)
            if "estimator_choice" not in cgf_dict:       
                cfg_file.write("estimator_choice = MLE \n")
            if "FPR" not in cgf_dict:       
                cfg_file.write("FPR = 0.05 \n")
            if "Njun_limit" not in cgf_dict:       
                cfg_file.write("Njun_limit = 10 \n")
            if "Niso_limit" not in cgf_dict:       
                cfg_file.write("Niso_limit = 100 \n")
            if "aligner_choice" not in cgf_dict:       
                cfg_file.write("aligner_choice = gmap \n")
            if "exon_construction_junction_span" not in cgf_dict:
                cfg_file.write("exon_construction_junction_span = 1 \n")
            if "read_length" not in cgf_dict:
                cfg_file.write("read_length = %d \n"%read_length)
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1
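    # For illustration only: when no user-supplied idp_cfg is given, the
    # generated run.cfg ends up as simple "key = value" lines, e.g.
    # (all paths below are hypothetical placeholders):
    #   temp_foldername = /work/idp/sampleA/tmp/
    #   output_foldername = /work/idp/sampleA/out/
    #   Nthread = 8
    #   LR_psl_pathfilename = /out/starlong/sampleA/Aligned.out.psl
    #   SR_sam_pathfilename = /work/idp/sampleA/alignments.sam
    #   SR_jun_pathfilename = /out/junctions.bed
    #   genome_pathfilename = /ref/genome.fa
    #   estimator_choice = MLE
    #   aligner_choice = gmap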


    
    msg = "IDP for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        command="%s %s/run.cfg %d" % (
            idp, work_idp, mode_number)
        command="bash -c \"%s\""%command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)   
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1
    
    msg = "Convert transcript GPD file to GTF for %s"%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        if os.path.exists("%s/out/isoform.gpd"%work_idp):
            sort_gpd("%s/out/isoform.gpd"%work_idp,"%s/out/isoform_sorted.gpd"%work_idp)
            command="gpd2gtf.py \
                  %s/out/isoform_sorted.gpd %s/out/isoform.exp %s/out/isoform.gtf IDP"%(work_idp,work_idp,work_idp)
            command="bash -c \"%s\""%command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)   
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1

    out_idp=os.path.join(outdir,"idp",sample)
    create_dirs([out_idp])
    msg="Copy predictions to output directory for %s."%sample
    if start<=step:
        logger.info("--------------------------STEP %s--------------------------"%step)
        if os.path.exists("%s/out/isoform.gtf"%work_idp) and \
           os.path.exists("%s/out/isoform.exp"%work_idp):
            command = "cp %s/out/isoform.gtf %s/isoform.gtf"%(
                       work_idp, out_idp)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)   
            
            command = "cp %s/out/isoform.exp %s/isoform.exp"%(
                       work_idp, out_idp)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)   
    else:
        logger.info("Skipping step %d: %s"%(step,msg))
    step+=1



    transcripts = ""
    abundances = ""
    if os.path.exists("%s/isoform.gtf"%out_idp) and \
       os.path.exists("%s/isoform.exp"%out_idp):
        logger.info("IDP was successfull!")
        logger.info("Output isoforms: %s/isoform.gtf"%out_idp)
        logger.info("Output expressions: %s/isoform.exp"%out_idp)
        transcripts = "%s/isoform.gtf"%out_idp   
        abundances = "%s/isoform.exp"%out_idp   
    else:            
        logger.info("IDP failed!")
    return transcripts,abundances
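
# Hedged usage sketch (not part of the original example). Every path and the
# sample name below are hypothetical placeholders; the keyword arguments are
# the ones accepted by run_idp above.
def _example_run_idp():
    transcripts, abundances = run_idp(
        alignment="/path/to/short_reads.bam",
        short_junction="/path/to/splice_junctions.bed",
        long_alignment="/path/to/long_reads.psl",
        ref_genome="/path/to/genome.fa",
        ref_all_gpd="/path/to/all_isoforms.gpd",
        ref_gpd="/path/to/ref_isoforms.gpd",
        sample="sampleA",
        nthreads=8,
        workdir="/path/to/workdir",
        outdir="/path/to/outdir")
    return transcripts, abundances
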
Beispiel #20
0
def run_gatk(alignment="",
             ref_genome="",
             knownsites="",
             picard=PICARD,
             gatk=GATK,
             java=JAVA,
             java_opts="",
             CleanSam=False,
             IndelRealignment=False,
             no_BaseRecalibrator=False,
             AddOrReplaceReadGroups_opts="",
             MarkDuplicates_opts="",
             SplitNCigarReads_opts="",
             RealignerTargetCreator_opts="",
             IndelRealigner_opts="",
             BaseRecalibrator_opts="",
             PrintReads_opts="",
             HaplotypeCaller_opts="",
             VariantFiltration_opts="",
             start=0,
             sample="",
             nthreads=1,
             workdir=None,
             outdir=None,
             timeout=TIMEOUT):

    logger.info("Running variant calling (GATK) for %s" % sample)
    if not os.path.exists(alignment):
        logger.error("Aborting!")
        raise Exception("No alignment file %s" % alignment)
    if not os.path.exists(ref_genome):
        logger.error("Aborting!")
        raise Exception("No reference genome FASTA file %s" % ref_genome)

    work_gatk = os.path.join(workdir, "gatk", sample)
    create_dirs([work_gatk])

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase GATK work directory for %s" % sample
        command = "rm -rf %s/*" % (work_gatk)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    gatk_log = os.path.join(work_gatk, "gatk.log")
    gatk_log_fd = open(gatk_log, "w")

    if "SO=" not in AddOrReplaceReadGroups_opts:
        AddOrReplaceReadGroups_opts += " SO=coordinate"
    if "RGLB=" not in AddOrReplaceReadGroups_opts:
        AddOrReplaceReadGroups_opts += " RGLB=lib1"
    if "RGPL=" not in AddOrReplaceReadGroups_opts:
        AddOrReplaceReadGroups_opts += " RGPL=illumina"
    if "RGPU=" not in AddOrReplaceReadGroups_opts:
        AddOrReplaceReadGroups_opts += " RGPU=unit1"
    if "RGSM=" not in AddOrReplaceReadGroups_opts:
        AddOrReplaceReadGroups_opts += " RGSM=%s" % sample

    if "CREATE_INDEX=" not in MarkDuplicates_opts:
        MarkDuplicates_opts += " CREATE_INDEX=true"
    if "VALIDATION_STRINGENCY=" not in MarkDuplicates_opts:
        MarkDuplicates_opts += " VALIDATION_STRINGENCY=SILENT"

    if "-rf " not in SplitNCigarReads_opts:
        SplitNCigarReads_opts += " -rf %s" % GATK_SN_RF
    if "-RMQF " not in SplitNCigarReads_opts:
        SplitNCigarReads_opts += " -RMQF %d" % GATK_SN_RMQF
    if "-RMQT " not in SplitNCigarReads_opts:
        SplitNCigarReads_opts += " -RMQT %d" % GATK_SN_RMQT
    if "-U " not in SplitNCigarReads_opts:
        SplitNCigarReads_opts += " -U ALLOW_N_CIGAR_READS"

    if knownsites:
        if not os.path.exists(knownsites):
            logger.error("Aborting!")
            raise Exception("No VCF knownsites file %s" % knownsites)
        if "--known " not in RealignerTargetCreator_opts:
            RealignerTargetCreator_opts += " --known %s" % knownsites
        if "-known " not in IndelRealigner_opts and "--knownAlleles " not in IndelRealigner_opts:
            IndelRealigner_opts += " -known %s" % knownsites
        if "-knownSites " not in BaseRecalibrator_opts:
            BaseRecalibrator_opts += " -knownSites %s" % knownsites

    if "-dontUseSoftClippedBases " not in HaplotypeCaller_opts:
        HaplotypeCaller_opts += " -dontUseSoftClippedBases"
    if "-stand_call_conf " not in HaplotypeCaller_opts:
        HaplotypeCaller_opts += " -stand_call_conf %f" % GATK_HC_STANDCALLCONF
    if "-stand_emit_conf " not in HaplotypeCaller_opts:
        HaplotypeCaller_opts += " -stand_emit_conf %f" % GATK_HC_STANDEMITCONF

    if "-window " not in VariantFiltration_opts:
        VariantFiltration_opts += " -window %d" % GATK_VF_WINDOW
    if "-cluster " not in VariantFiltration_opts:
        VariantFiltration_opts += " -cluster %d" % GATK_VF_CLUSTER
    if "-filterName FS " not in VariantFiltration_opts:
        VariantFiltration_opts += " -filterName FS -filter 'FS > %f'" % GATK_VF_FSMIN
    if "-filterName QD " not in VariantFiltration_opts:
        VariantFiltration_opts += " -filterName QD -filter 'QD < %f'" % GATK_VF_QDMAX

    if nthreads > 1:
        if "-nct " not in BaseRecalibrator_opts:
            BaseRecalibrator_opts += " -nct %d" % nthreads
        if "-nct " not in PrintReads_opts:
            PrintReads_opts += " -nct %d" % nthreads

    if "-Xms" not in java_opts:
        java_opts += " %s" % JAVA_XMS
    if "-Xmx" not in java_opts:
        java_opts += " %s" % JAVA_XMG
    if "-Djava.io.tmpdir" not in java_opts:
        java_opts += " -Djava.io.tmpdir=%s/javatmp/" % (work_gatk)

    msg = "picard CleanSam for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if CleanSam:
            command = "%s %s -cp %s picard.cmdline.PicardCommandLine CleanSam I=%s O=%s/alignments_clean.bam" % (
                java, java_opts, picard, alignment, work_gatk)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                              cmd_log=gatk_log,
                              msg=msg,
                              timeout=timeout)
            alignment = "%s/alignments_clean.bam" % work_gatk
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "picard AddOrReplaceReadGroups for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s -cp %s picard.cmdline.PicardCommandLine AddOrReplaceReadGroups I=%s O=%s/rg_added_sorted.bam %s" % (
            java, java_opts, picard, alignment, work_gatk,
            AddOrReplaceReadGroups_opts)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                          cmd_log=gatk_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "picard MarkDuplicates for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s -cp %s picard.cmdline.PicardCommandLine MarkDuplicates I=%s/rg_added_sorted.bam O=%s/dedupped.bam %s M=%s/output.metrics" % (
            java, java_opts, picard, work_gatk, work_gatk, MarkDuplicates_opts,
            work_gatk)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                          cmd_log=gatk_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "GATK SplitNCigarReads for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s -jar %s -T SplitNCigarReads -R %s -I %s/dedupped.bam -o %s/split.bam %s" % (
            java, java_opts, gatk, ref_genome, work_gatk, work_gatk,
            SplitNCigarReads_opts)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                          cmd_log=gatk_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    split_bam = "%s/split.bam" % work_gatk
    if IndelRealignment:
        msg = "GATK RealignerTargetCreator for %s" % sample
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" %
                step)
            command = "%s %s -jar %s -T RealignerTargetCreator -R %s -I %s/split.bam -o %s/forIndelRealigner.intervals %s" % (
                java, java_opts, gatk, ref_genome, work_gatk, work_gatk,
                RealignerTargetCreator_opts)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                              cmd_log=gatk_log,
                              msg=msg,
                              timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "GATK IndelRealigner for %s" % sample
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" %
                step)
            command = "%s %s -jar %s -T IndelRealigner -R %s -I %s/split.bam -targetIntervals %s/forIndelRealigner.intervals -o %s/split_realigned.bam %s" % (
                java, java_opts, gatk, ref_genome, work_gatk, work_gatk,
                work_gatk, IndelRealigner_opts)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                              cmd_log=gatk_log,
                              msg=msg,
                              timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1
        split_bam = "%s/split_realigned.bam" % work_gatk
    else:
        msg = "GATK RealignerTargetCreator for %s" % sample
        logger.info("Skipping step %d: %s" % (step, msg))
        step += 1
        msg = "GATK IndelRealigner for %s" % sample
        logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    if not no_BaseRecalibrator:
        msg = "GATK BaseRecalibrator for %s" % sample
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" %
                step)
            command = "%s %s -jar %s -T BaseRecalibrator -R %s -I %s  -o %s/recal_data.table %s" % (
                java, java_opts, gatk, ref_genome, split_bam, work_gatk,
                BaseRecalibrator_opts)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                              cmd_log=gatk_log,
                              msg=msg,
                              timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "GATK PrintReads for %s" % sample
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" %
                step)
            command = "%s %s -jar %s -T PrintReads -R %s -I %s -BQSR %s/recal_data.table -o %s/bsqr.bam %s" % (
                java, java_opts, gatk, ref_genome, split_bam, work_gatk,
                work_gatk, PrintReads_opts)
            command = "bash -c \"%s\"" % command
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                              cmd_log=gatk_log,
                              msg=msg,
                              timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1
        split_bam = "%s/bsqr.bam" % work_gatk
    else:
        msg = "GATK BaseRecalibrator for %s" % sample
        logger.info("Skipping step %d: %s" % (step, msg))
        step += 1
        msg = "GATK PrintReads for %s" % sample
        logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    msg = "GATK HaplotypeCaller for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s -jar %s -T HaplotypeCaller -R %s -I %s -o %s/variants.vcf %s" % (
            java, java_opts, gatk, ref_genome, split_bam, work_gatk,
            HaplotypeCaller_opts)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                          cmd_log=gatk_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    msg = "GATK VariantFiltration for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s %s -jar %s -T VariantFiltration -R %s -V %s/variants.vcf -o %s/variants_filtered.vcf %s" % (
            java, java_opts, gatk, ref_genome, work_gatk, work_gatk,
            VariantFiltration_opts)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                          cmd_log=gatk_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_gatk = os.path.join(outdir, "gatk", sample)
    create_dirs([out_gatk])
    msg = "Copy predictions to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/variants_filtered.vcf" % work_gatk):
            command = "cp %s/variants_filtered.vcf %s/variants_filtered.vcf" % (
                work_gatk, out_gatk)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=gatk_log_fd,
                              cmd_log=gatk_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    variants = ""
    if os.path.exists("%s/variants_filtered.vcf" % out_gatk):
        logger.info("GATK was successfull!")
        logger.info("Output variants: %s/variants_filtered.vcf" % out_gatk)
        variants = "%s/variants_filtered.vcf" % out_gatk
    else:
        logger.info("GATK failed!")
    return variants
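
# Hedged usage sketch (not part of the original example). The BAM, FASTA, and
# dbSNP paths plus the sample name are hypothetical placeholders; the keyword
# arguments come from the run_gatk signature above.
def _example_run_gatk():
    return run_gatk(alignment="/path/to/alignments.bam",
                    ref_genome="/path/to/genome.fa",
                    knownsites="/path/to/dbsnp.vcf",
                    sample="sampleA",
                    nthreads=8,
                    workdir="/path/to/workdir",
                    outdir="/path/to/outdir")
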
Beispiel #21
0
def run_deseq2(quant_files="",
               alignments="",
               transcripts_gtfs="",
               ref_gtf="",
               featureCounts_opts="",
               featureCounts=FEATURECOUNTS,
               stringtie=STRINGTIE,
               stringtie_merge_opts="",
               mincount=DESeq2_MINCNT,
               alpha=DESeq2_ALPHA,
               R=R_CMD,
               start=0,
               samples=[],
               nthreads=1,
               workdir=None,
               outdir=None,
               timeout=TIMEOUT):

    samples = map(lambda x: x.split(","), samples)
    samples_txt = "-".join(map(lambda x: ",".join(x), samples))

    logger.info("Running differential analysis (DESeq2) for %s" % samples_txt)

    n_samples = len(samples)
    n_replicates = map(len, samples)
    use_quant = True
    use_refgtf = False
    if quant_files and ref_gtf:
        if len(quant_files) != n_samples:
            logger.error("Aborting!")
            raise Exception(
                "Number of input quantification files does not match the number of samples (%s != %s)"
                % (len(quant_files), n_samples))
        quant_files = map(lambda x: x.split(","), quant_files)
        for i, q in enumerate(quant_files):
            if len(q) != n_replicates[i]:
                logger.error("Aborting!")
                raise Exception(
                    "Number of input quantification replicate files does not match the number of replicates in %d%s sample (%s != %s)"
                    % (i + 1, {1: "st", 2: "nd", 3: "rd"}.get(i + 1, "th"),
                       len(q), n_replicates[i]))
            for r in q:
                if not os.path.exists(r):
                    logger.error("Aborting!")
                    raise Exception("No qantification file %s" % r)
    elif alignments and (transcripts_gtfs or ref_gtf):
        use_quant = False
        if len(alignments) != n_samples:
            logger.error("Aborting!")
            raise Exception(
                "Number of input alignment files does not match the number of samples (%s != %s)"
                % (len(alignments), n_samples))

        alignments = map(lambda x: x.split(","), alignments)
        for i, a in enumerate(alignments):
            if len(a) != n_replicates[i]:
                logger.error("Aborting!")
                raise Exception(
                    "Number of input alignment replicate files does not match the number of replicates in %d%s sample (%s != %s)"
                    % (i + 1, {1: "st", 2: "nd", 3: "rd"}.get(i + 1, "th"),
                       len(a), n_replicates[i]))

            for r in a:
                if not os.path.exists(r):
                    logger.error("Aborting!")
                    raise Exception("No aligment file %s" % r)
        if transcripts_gtfs:
            transcripts_gtfs = map(lambda x: x.split(","), transcripts_gtfs)
            for i, a in enumerate(transcripts_gtfs):
                if len(a) != n_replicates[i]:
                    logger.error("Aborting!")
                    raise Exception(
                        "Number of input GTF files does not match the total number of replicates in %d%s sample (%s != %s)"
                        % (i + 1, {1: "st", 2: "nd", 3: "rd"}.get(i + 1, "th"),
                           len(a), n_replicates[i]))
        elif ref_gtf:
            use_refgtf = True

        if ref_gtf:
            if not os.path.exists(ref_gtf):
                logger.error("Aborting!")
                raise Exception("No reference GTF file %s" % ref_gtf)
    else:
        logger.error("Aborting!")
        raise Exception(
            "Either (quantification files + ref_gtf) or (Alignment files + transcripts_gtfs or ref_gtf) is needed."
        )

    work_deseq2 = os.path.join(workdir, "deseq2", samples_txt)
    create_dirs([work_deseq2])

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase DESeq2 work directory for %s" % samples_txt
        command = "rm -rf %s/*" % (work_deseq2)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    deseq2_log = os.path.join(work_deseq2, "deseq2.log")
    deseq2_log_fd = open(deseq2_log, "w")

    if use_quant:

        msg = "prepare tx2gene for %s." % samples_txt
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" %
                step)
            tx2gene_file = ref_gtf.strip() + "tx2gene.csv"
            if os.path.exists(tx2gene_file):
                logger.info(
                    "Will use the precomputed %s as tx2gene.csv for %s" %
                    (tx2gene_file, samples_txt))
            else:
                tx2gene_file = os.path.join(work_deseq2, "tx2gene.csv")
                logger.info("Will computed %s as tx2gene.csv for %s" %
                            (tx2gene_file, samples_txt))
                tx2gene_map(ref_gtf, tx2gene_file)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "compute gene level abundances for %s." % samples_txt
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" %
                step)

            fixed_quant_files = []
            for i, qs in enumerate(quant_files):
                fixed_qs = []
                for j, q in enumerate(qs):
                    fixed_q = os.path.join(
                        work_deseq2, "{}.fixed_quant.sf".format(samples[i][j]))
                    fix_quant_file(q, fixed_q)
                    fixed_qs.append(fixed_q)
                fixed_quant_files.append(fixed_qs)
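            # The inline R script below (passed via "R -e") loads the per-replicate
            # Salmon quant.sf files with tximport, collapses them to gene level
            # using the tx2gene table, and writes abundance, length, and count
            # matrices plus an .rda snapshot into the work directory.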

            command = "%s -e \"library('readr'); library('tximport'); \
                       samples=c(%s); (files <- file.path(c(%s))); names(files) <- samples; \
                       tx2gene <- read.csv(file.path('%s'),sep='\\t'); \
                       txi <- tximport(files, type = 'salmon', tx2gene = tx2gene); \
                       save(txi, file='%s/txi.rda'); \
                       write.table(txi$abundance, file = '%s/txi.abundances',\
                       quote = FALSE, sep='\\t'); \
                       write.table(txi$length, file = '%s/txi.length', quote = FALSE, \
                       sep='\\t'); write.table(txi$counts, file = '%s/txi.counts',\
                       quote = FALSE, sep='\\t');\"" % (
                R, ",".join(
                    map(lambda x: "'%s'" % x,
                        reduce(lambda x, y: x + y, samples))), ",".join(
                            map(lambda x: "'%s'" % x,
                                reduce(lambda x, y: x + y,
                                       fixed_quant_files))), tx2gene_file,
                work_deseq2, work_deseq2, work_deseq2, work_deseq2)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                              cmd_log=deseq2_log,
                              msg=msg,
                              timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

        msg = "DESeq2 for %s" % samples_txt
        if start <= step:
            logger.info(
                "--------------------------STEP %s--------------------------" %
                step)
            command = "%s -e \"library('DESeq2'); load('%s/txi.rda'); \
                       samples <- c(%s); \
                       condition <- factor(c(%s)); \
                       (colData <- data.frame(row.names=colnames(txi$count), condition));\
                        counts <- round(txi$counts); mode(counts) <- 'integer'; \
                        dds <- DESeqDataSetFromMatrix(countData=counts, colData=colData, design=~ condition);\
                         stopifnot(txi$countsFromAbundance %%in%% c('no','scaledTPM','lengthScaledTPM')); \
                         if (txi$countsFromAbundance %%in%% c('scaledTPM','lengthScaledTPM')) \
                         {    message('using just counts from tximport');  } else \
                         {    message('using counts and average transcript lengths from tximport'); \
                         lengths <- txi$length;    dimnames(lengths) <- dimnames(dds);\
                         assays(dds)[['avgTxLength']] <- lengths;  }; \
                         dds <- dds[ rowSums(counts(dds)) >= %d, ]; \
                         dds <- DESeq(dds); \
                         for (i in seq_along(samples)){ \
                         for (j in seq_along(samples)){ \
                         if (i < j){\
                         sample1 <- samples[i]; \
                         sample2 <- samples[j]; \
                         res <- results(dds, contrast=c('condition',sample1,sample2), alpha=%f); \
                         (summary(res)); \
                         res_file= sprintf('%s/deseq2_res_%%s_vs_%%s.tab',sample1,sample2);\
                         write.table(res, file = res_file, \
                         quote = FALSE, sep='\\t'); \
                         } \
                         } \
                         }; \
                         save(txi,colData,condition,dds,res, \
                         file='%s/deseq2.rda');\"" % (R, work_deseq2, ",".join(
                map(lambda i: "'sample%d'" %
                    (i), range(len(samples)))), ",".join(
                        map(
                            lambda i: "rep('sample%d', %d)" %
                            (i, n_replicates[i]), range(
                                len(samples)))), mincount, alpha, work_deseq2,
                                                      work_deseq2)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                              cmd_log=deseq2_log,
                              msg=msg,
                              timeout=timeout)
        else:
            logger.info("Skipping step %d: %s" % (step, msg))
        step += 1

    else:
        if use_refgtf:
            msg = "featureCounts for %s" % samples_txt
            if start <= step:
                logger.info(
                    "--------------------------STEP %s--------------------------"
                    % step)
                command = "%s %s -o %s/featureCounts.txt -T %d -a %s -g gene_id %s" % (
                    featureCounts, featureCounts_opts, work_deseq2, nthreads,
                    ref_gtf, " ".join(reduce(lambda x, y: x + y, alignments)))
                command = "bash -c \"%s\"" % command
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                                  cmd_log=deseq2_log,
                                  msg=msg,
                                  timeout=timeout)

                command = "sed -i -e '2s/.*/Geneid\\tChr\\tStart\\tEnd\\tStrand\\tLength\\t%s/' %s/featureCounts.txt" % (
                    "\\t".join(reduce(lambda x, y: x + y,
                                      samples)), work_deseq2)
                command = "bash -c \"%s\"" % command
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                                  cmd_log=deseq2_log,
                                  msg=msg,
                                  timeout=timeout)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "DESeq2 for %s" % samples_txt
            if start <= step:
                logger.info(
                    "--------------------------STEP %s--------------------------"
                    % step)
                command = "%s -e \"library('DESeq2'); countData <- read.table('%s/featureCounts.txt', \
                           header=TRUE, row.names=1);  countData <- countData[ ,6:ncol(countData)]; \
                            countData <- as.matrix(countData); \
                           samples <- c(%s); \
                           condition <- factor(c(%s)); \
                           (colData <- data.frame(row.names=colnames(countData), condition));\
                            dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~ condition);\
                             dds <- dds[ rowSums(counts(dds)) >= %d, ]; \
                             dds <- DESeq(dds); \
                             for (i in seq_along(samples)){ \
                             for (j in seq_along(samples)){ \
                             if (i < j){\
                             sample1 <- samples[i]; \
                             sample2 <- samples[j]; \
                             res <- results(dds, contrast=c('condition',sample1,sample2), alpha=%f); \
                             (summary(res)); \
                             res_file= sprintf('%s/deseq2_res_%%s_vs_%%s.tab',sample1,sample2);\
                             write.table(res, file = res_file, \
                             quote = FALSE, sep='\\t'); \
                             } \
                             } \
                             }; \
                             save(countData,colData,condition,dds,res, \
                             file='%s/deseq2.rda');\"" % (
                    R, work_deseq2, ",".join(
                        map(lambda i: "'sample%d'" %
                            (i), range(len(samples)))), ",".join(
                                map(
                                    lambda i: "rep('sample%d', %d)" %
                                    (i, n_replicates[i]), range(
                                        len(samples)))), mincount, alpha,
                    work_deseq2, work_deseq2)
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                                  cmd_log=deseq2_log,
                                  msg=msg,
                                  timeout=timeout)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

        else:

            msg = "Merge transcripts GTFs for %s" % samples_txt
            if start <= step:
                logger.info(
                    "--------------------------STEP %s--------------------------"
                    % step)

                if ref_gtf:
                    stringtie_merge_opts += " -G %s" % ref_gtf
                if "-p " not in stringtie_merge_opts:
                    stringtie_merge_opts += " -p %d" % nthreads

                gtfs_list = open("%s/gtfs_list.txt" % work_deseq2, 'w')
                gtfs_list.write("\n".join(
                    reduce(lambda x, y: x + y, transcripts_gtfs)))
                gtfs_list.close()

                command = "%s --merge %s -o %s/merged.gtf -v %s/gtfs_list.txt" % (
                    stringtie, stringtie_merge_opts, work_deseq2, work_deseq2)
                command = "bash -c \"%s\"" % command
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                                  cmd_log=deseq2_log,
                                  msg=msg,
                                  timeout=timeout)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "featureCounts for %s" % samples_txt
            if start <= step:
                logger.info(
                    "--------------------------STEP %s--------------------------"
                    % step)
                command = "%s %s -o %s/featureCounts.txt -T %d -a %s/merged.gtf -g gene_id %s" % (
                    featureCounts, featureCounts_opts, work_deseq2, nthreads,
                    work_deseq2, " ".join(
                        reduce(lambda x, y: x + y, alignments)))
                command = "bash -c \"%s\"" % command
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                                  cmd_log=deseq2_log,
                                  msg=msg,
                                  timeout=timeout)

                command = "sed -i -e '2s/.*/Geneid\\tChr\\tStart\\tEnd\\tStrand\\tLength\\t%s/' %s/featureCounts.txt" % (
                    "\\t".join(reduce(lambda x, y: x + y,
                                      samples)), work_deseq2)
                command = "bash -c \"%s\"" % command
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                                  cmd_log=deseq2_log,
                                  msg=msg,
                                  timeout=timeout)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

            msg = "DESeq2 for %s" % samples_txt
            if start <= step:
                logger.info(
                    "--------------------------STEP %s--------------------------"
                    % step)
                command = "%s -e \"library('DESeq2'); countData <- read.table('%s/featureCounts.txt', \
                           header=TRUE, row.names=1);  countData <- countData[ ,6:ncol(countData)]; \
                            countData <- as.matrix(countData); \
                           samples <- c(%s); \
                           condition <- factor(c(%s)); \
                           (colData <- data.frame(row.names=colnames(countData), condition));\
                            dds <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~ condition);\
                             dds <- dds[ rowSums(counts(dds)) >= %d, ]; \
                             dds <- DESeq(dds); \
                             for (i in seq_along(samples)){ \
                             for (j in seq_along(samples)){ \
                             if (i < j){\
                             sample1 <- samples[i]; \
                             sample2 <- samples[j]; \
                             res <- results(dds, contrast=c('condition',sample1,sample2), alpha=%f); \
                             (summary(res)); \
                             res_file= sprintf('%s/deseq2_res_%%s_vs_%%s.tab',sample1,sample2);\
                             write.table(res, file = res_file, \
                             quote = FALSE, sep='\\t'); \
                             } \
                             } \
                             }; \
                             save(countData,colData,condition,dds, \
                             file='%s/deseq2.rda');\"" % (
                    R, work_deseq2, ",".join(
                        map(lambda i: "'sample%d'" %
                            (i), range(len(samples)))), ",".join(
                                map(
                                    lambda i: "rep('sample%d', %d)" %
                                    (i, n_replicates[i]), range(
                                        len(samples)))), mincount, alpha,
                    work_deseq2, work_deseq2)
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                                  cmd_log=deseq2_log,
                                  msg=msg,
                                  timeout=timeout)
            else:
                logger.info("Skipping step %d: %s" % (step, msg))
            step += 1

    out_deseq2 = os.path.join(outdir, "deseq2", samples_txt)
    create_dirs([out_deseq2])
    msg = "Copy predictions to output directory for %s." % samples_txt
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if len(glob.glob("%s/deseq2_res*.tab" % work_deseq2)) > 0:
            for out_file in glob.glob("%s/deseq2_res*.tab" % work_deseq2):
                command = "cp %s %s/" % (out_file, out_deseq2)
                cmd = TimedExternalCmd(command, logger, raise_exception=True)
                retcode = cmd.run(cmd_log_fd_out=deseq2_log_fd,
                                  cmd_log=deseq2_log,
                                  msg=msg,
                                  timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    diff = ""
    if len(glob.glob("%s/deseq2_res*.tab" % out_deseq2)) > 0:
        logger.info("DESeq2 was successfull!")
        logger.info("Output differential expressions: %s" %
                    (glob.glob("%s/deseq2_res*.tab" % out_deseq2)))
        diff = glob.glob("%s/deseq2_res*.tab" % out_deseq2)
    else:
        logger.info("DESeq2 failed!")
    return diff
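
# Hedged usage sketch (not part of the original example). Sample names and file
# paths are hypothetical placeholders; what matters is the convention visible in
# the code above: each entry in "samples" (and in "quant_files") groups the
# replicates of one condition as a single comma-separated string.
def _example_run_deseq2():
    return run_deseq2(
        samples=["tumorRep1,tumorRep2", "normalRep1,normalRep2"],
        quant_files=["/path/t1/quant.sf,/path/t2/quant.sf",
                     "/path/n1/quant.sf,/path/n2/quant.sf"],
        ref_gtf="/path/to/annotation.gtf",
        nthreads=8,
        workdir="/path/to/workdir",
        outdir="/path/to/outdir")
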
Beispiel #22
0
def run_salmon_smem(quantifier_idx=None,
                    seq_1="",
                    seq_2="",
                    seq_u="",
                    salmon_k=SALMON_SMEM_k,
                    libtype="",
                    salmon_smem_opts="",
                    salmon=SALMON,
                    start=0,
                    sample="",
                    nthreads=1,
                    unzip=False,
                    workdir=None,
                    outdir=None,
                    timeout=TIMEOUT):

    logger.info("Running quantification (Salmon-SMEM) for %s" % sample)
    if not os.path.exists(quantifier_idx):
        logger.error("Aborting!")
        raise Exception("No Salmon FMD index directory %s" % quantifier_idx)

    if seq_1 and seq_2:
        for s1 in seq_1.split(","):
            if not os.path.exists(s1):
                logger.error("Aborting!")
                raise Exception("No Mate 1 sequence file %s" % s1)

        for s2 in seq_2.split(","):
            if not os.path.exists(s2):
                logger.error("Aborting!")
                raise Exception("No Mate 2 sequence file %s" % s2)

        if unzip:
            seq_argument = "-1 <(gunzip -c %s) -2 <(gunzip -c %s)" % (" ".join(
                seq_1.split(",")), " ".join(seq_2.split(",")))
        else:
            if "," in seq_1:
                seq_1 = "<(cat %s)" % (" ".join(seq_1.split(",")))
            if "," in seq_2:
                seq_2 = "<(cat %s)" % (" ".join(seq_2.split(",")))
            seq_argument = "-1 %s -2 %s" % (seq_1, seq_2)
    elif seq_u:
        if unzip:
            seq_argument = "-r <(gunzip -c %s)" % (" ".join(seq_u.split(",")))
        elif "," in seq_u:
            seq_argument = "-r <(cat %s)" % (" ".join(seq_u1.split(",")))
        else:
            seq_argument = "-r %s" % (seq_u)
        for su in seq_u.split(","):
            if not os.path.exists(su):
                logger.error("Aborting!")
                raise Exception("No unpaired sequence file %s" % su)

    work_salmon_smem = os.path.join(workdir, "salmon_smem", sample)
    create_dirs([work_salmon_smem])

    step = 0
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        msg = "Erase Salmon-SMEM work directory for %s" % sample
        command = "rm -rf %s/*" % (work_salmon_smem)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=False)
        retcode = cmd.run(msg=msg, timeout=timeout)
    step += 1

    salmon_smem_log = os.path.join(work_salmon_smem, "salmon_smem.log")
    salmon_smem_log_fd = open(salmon_smem_log, "w")

    if "-p " not in salmon_smem_opts:
        salmon_smem_opts += " -p %d" % nthreads

    salmon_smem_opts += " -k %d" % salmon_k
    salmon_smem_opts += " -l %s" % libtype

    msg = "Salmon-SMEM for %s" % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        command = "%s quant -i %s %s %s -o %s" % (
            salmon, quantifier_idx, salmon_smem_opts, seq_argument,
            work_salmon_smem)
        command = "bash -c \"%s\"" % command
        cmd = TimedExternalCmd(command, logger, raise_exception=True)
        retcode = cmd.run(cmd_log_fd_out=salmon_smem_log_fd,
                          cmd_log=salmon_smem_log,
                          msg=msg,
                          timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    out_salmon_smem = os.path.join(outdir, "salmon_smem", sample)
    create_dirs([out_salmon_smem])
    msg = "Copy predictions to output directory for %s." % sample
    if start <= step:
        logger.info(
            "--------------------------STEP %s--------------------------" %
            step)
        if os.path.exists("%s/quant.sf" % work_salmon_smem):
            command = "cp %s/quant.sf %s/quant.sf" % (work_salmon_smem,
                                                      out_salmon_smem)
            cmd = TimedExternalCmd(command, logger, raise_exception=True)
            retcode = cmd.run(cmd_log_fd_out=salmon_smem_log_fd,
                              cmd_log=salmon_smem_log,
                              msg=msg,
                              timeout=timeout)
    else:
        logger.info("Skipping step %d: %s" % (step, msg))
    step += 1

    quant = ""
    if os.path.exists("%s/quant.sf" % out_salmon_smem):
        logger.info("Salmon-SMEM was successfull!")
        logger.info("Output expressions: %s/quant.sf" % out_salmon_smem)
        quant = "%s/quant.sf" % out_salmon_smem
    else:
        logger.info("Salmon-SMEM failed!")
    return quant
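
# Hedged usage sketch (not part of the original example). Index path, FASTQ
# paths, library type, and sample name are hypothetical placeholders; the
# keyword arguments are those accepted by run_salmon_smem above.
def _example_run_salmon_smem():
    return run_salmon_smem(quantifier_idx="/path/to/salmon_fmd_index",
                           seq_1="/path/to/reads_1.fastq",
                           seq_2="/path/to/reads_2.fastq",
                           libtype="IU",
                           sample="sampleA",
                           nthreads=8,
                           workdir="/path/to/workdir",
                           outdir="/path/to/outdir")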