Example #1
def run(num_threads, outdir, compress, phred, trimmomatic_args, input_files):
    """Start the run.

    Args:
        num_threads (int): How many threads to run with.
        outdir (str): Output directory for results.
        compress (bool): Whether or not to compress the output.
        phred (int): Phred encoding of fastq file(s). (33 or 64)
        trimmomatic_args (str): Arguments (in string format) to pass to
            Trimmomatic (e.g. TRAILING:10 MINLEN:50).
        input_files ([str]): Input files.
    """

    if len(input_files) == 2:
        in1, in2 = input_files
        out1 = filename_in_to_out_fqgz(in1, SUFFIX_QTRIM, compress, outdir)
        out2 = filename_in_to_out_fqgz(in2, SUFFIX_QTRIM, compress, outdir)
        out_files = [out1, out2]
        out_log = pe_log_filename(SUFFIX_QTRIM, out2)
        trim_log = pe_log_filename(SUFFIX_QTRIM, out2, 'trimlog')
        # i.e. for reads that end up unpaired because their partner was trimmed too much
        unpaired_out1 = filename_in_to_out_fqgz(out1, SUFFIX_QTRIM_UNPAIRED,
                                                compress, outdir)
        unpaired_out2 = filename_in_to_out_fqgz(out2, SUFFIX_QTRIM_UNPAIRED,
                                                compress, outdir)

        tmp_out1, tmp_out2, tmp_unpaired_out1, tmp_unpaired_out2 = tmpf_start(
            out1, out2, unpaired_out1, unpaired_out2)

        qtrim(num_threads, phred, trimmomatic_args, in1, in2, tmp_out1,
              tmp_out2, tmp_unpaired_out1, tmp_unpaired_out2, out_log,
              trim_log)
        tmpf_finish(tmp_out1, tmp_out2, tmp_unpaired_out1, tmp_unpaired_out2)

    elif len(input_files) == 1:
        in1 = input_files[0]
        out1 = filename_in_to_out_fqgz(in1, SUFFIX_QTRIM, compress, outdir)
        out_files = [out1]
        out_log = se_log_filename(SUFFIX_QTRIM, out1)
        trim_log = se_log_filename(SUFFIX_QTRIM, out1, 'trimlog')

        tmp_out1 = tmpf_start(out1)[0]

        qtrim(num_threads, phred, trimmomatic_args, in1, tmp_out1, out_log,
              trim_log)
        tmpf_finish(tmp_out1)

    else:
        raise ControlFlowException("""ERR911: Not possible to be here.""")

    return out_files
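Every run() in these examples leans on the same temp-file pattern: tmpf_start() appears to hand back a temporary path for each final output filename, the work is done against those temporary paths, and tmpf_finish() promotes them to their final names only once the step succeeds, so a crash never leaves a truncated final output behind. The project's real helpers are imported from elsewhere and may differ in detail; the following is only a minimal sketch of the assumed semantics:

import os

def tmpf_start(*final_paths):
    """Return a temporary working path for each requested final path (sketch)."""
    return [p + '.tmp' for p in final_paths]

def tmpf_finish(*tmp_paths):
    """Promote each '<final>.tmp' file to its final name (sketch)."""
    for tmp in tmp_paths:
        os.rename(tmp, tmp[:-len('.tmp')])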
Example #2
def run(kit, store, outdir, input_file, paired, build_read_and_loc_dbs,
        reject_umi_errors, correct_umis, write_dedupped_sam, write_flagged_sam,
        write_dup_only_sam, write_dup_group_sam_like, write_umi_error_rejects,
        write_sam_headers, random_seed, debug_switch, dump_rg_db, dump_loc_db,
        dump_dup_group_db, dump_dup_db, dump_umi_error_db):
    """Start the run.

    Args:
        kit (str): kit...
        store (str): Which storage backend to use.
        outdir (str): Output directory for results.
        input_file (str): ...
        build_read_and_loc_dbs (bool): Whether or not to build the
            read_group_db and location_db.
    """

    ## Set the random seed (if not set by user).
    if sys.version_info[0:2] in ((3, 0), (3, 1)):
        print(
            "WARNING: You are using python v{}.{}, whose random number "
            "generator is not stable between runs.  Dupligänger output "
            "will not be the same between runs.  To fix this, switch to "
            "python version 2.7 or version >= 3.2.".format(
                sys.version_info[0], sys.version_info[1]))
    random.seed(RANDOM_SEED if random_seed is None else random_seed)

    # report_db:
    #   A regular 'dict'.
    #   key: string describing a given (count) metric (I think they're all counts)
    #   val: (int) a count
    #   purpose: To provide a report at the end of the day on various metrics
    #       collected.
    # umi_error_db:
    #   A regular 'dict'.
    #   key: (str) A ReadGroup name.
    #   val: Not used (just doing 'True')
    #   purpose: To keep track of which reads have errors in one or more of
    #       their UMIs (and may optionally be rejected outright).
    # dup_group_db:
    #   A regular 'dict'.
    #   key: read_group_id
    #   val: list of read_group_ids that are duplicates of read_group_id
    #   purpose: To store stats, generate logs, debugging, etc. Maybe to
    #       also choose which duplicate to use.
    # dup_db:
    #   A SimpleBucketDB.
    #   key: ReadName of a PCR duplicate to be removed.
    #   val: Always just "True"
    #   purpose: When walking over the SAM/BAM file during last pass, any
    #       ReadName found in this DB will be removed from the final output.
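    # Illustrative (hypothetical) example entries, to make the shapes above
    # concrete:
    #   report_db:    {'total_reads': 1000}
    #   umi_error_db: {'READ_GROUP_X': True}
    #   dup_group_db: {17: [23, 41]}
    #   dup_db:       {'READ_NAME_Y': True}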

    ## Setup databases
    report_db = setup_report_db()
    umi_error_db = {}
    dup_group_db = {}
    if store == STORE_OPTION_LMDB:
        db_file = os.path.join(outdir,
                               os.path.split(input_file)[1] + '.sdd.db')
        parent_db = ParentDbLmdb(db_file, LMDB_MAX_DBS, LMDB_DB_SIZE)
        read_group_db = SimpleObjectDbLmdb('read_group', parent_db,
                                           parent_db.env, ReadGroup)
        location_bucket_store = SimpleBucketLmdb('location_bucket_store',
                                                 parent_db, parent_db.env,
                                                 DELIM_BUCKET_LIST, str)
        dup_db = SimpleBucketLmdb('duplicate', parent_db, parent_db.env,
                                  DELIM_BUCKET_LIST, str)
    elif store == STORE_OPTION_MEMORY:
        parent_db = ParentDbDict()
        read_group_db = SimpleObjectDbDict()
        location_bucket_store = SimpleBucketDict()
        dup_db = SimpleBucketDict()
    loc_db = LocationBucketDb(location_bucket_store,
                              to_location_key_with_5p_trimming)

    ## Which fp_process_location() and write_output_files() to use?
    if kit == KIT_BIOO:
        if paired:
            fp_process_location = functools.partial(
                process_location_pe_bioo_1nt, report_db, umi_error_db,
                reject_umi_errors, correct_umis)
            fp_write_output_files = write_output_files_pe
        else:
            raise CannotContinueException(
                """Doesn't support single-end yet.""")
            # NOTE: the assignments below are unreachable; they are kept as a
            # stub for future single-end support.
            fp_process_location = functools.partial(
                process_location_sr_bioo_1nt, report_db, umi_error_db,
                reject_umi_errors, correct_umis)
            fp_write_output_files = write_output_files_sr
    else:
        if paired:
            fp_process_location = functools.partial(
                process_location_pe_randomer_1nt, report_db)
            fp_write_output_files = write_output_files_pe
        else:
            raise CannotContinueException(
                """Doesn't support single-end yet.""")
            # NOTE: the assignments below are unreachable; they are kept as a
            # stub for future single-end support.
            fp_process_location = functools.partial(
                process_location_sr_randomer_1nt, report_db)
            fp_write_output_files = write_output_files_sr

    ## Winners and Losers
    # TODO: HARDCODED for now.
    fp_choose_winner_and_losers = choose_winner_and_losers_random_fixed_seed
    fp_log_winner_losers = log_winner_losers
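    # Conceptually (a hypothetical sketch, not necessarily the real signature),
    # a chooser of this kind picks one read group of a duplicate group to keep
    # and returns the remainder as losers, e.g.:
    #   winner = random.choice(dup_group)
    #   losers = [rg for rg in dup_group if rg is not winner]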

    ## Setup output filenames
    # TODO: Offer BAM output?
    if write_dedupped_sam:
        dedupped_sam = filename_in_to_out_sambam(input_file,
                                                 'dups_removed.sam', outdir)
    else:
        dedupped_sam = os.devnull
    if write_flagged_sam:
        flagged_sam = filename_in_to_out_sambam(input_file, 'dups_flagged.sam',
                                                outdir)
    else:
        flagged_sam = os.devnull
    if write_dup_only_sam:
        dup_only_sam = filename_in_to_out_sambam(input_file, 'duplicates.sam',
                                                 outdir)
    else:
        dup_only_sam = os.devnull
    if write_dup_group_sam_like:
        dup_group_sam_like = filename_in_to_out_sambam(input_file,
                                                       'dup_groups.samlike',
                                                       outdir)
    else:
        dup_group_sam_like = os.devnull
    if write_umi_error_rejects:
        rejects_sam = filename_in_to_out_sambam(input_file, 'umi_errors.sam',
                                                outdir)
    else:
        rejects_sam = os.devnull

    ### Go ###

    if build_read_and_loc_dbs:
        time1 = time.time()
        with sambamopen(input_file) as fin:

            ## First op, build read and location dbs
            write_to_read_and_location_dbs(report_db, fin, parent_db,
                                           read_group_db, loc_db,
                                           RECORDS_PER_TXN)
        time2 = time.time()
        print(
            "Building read_group_db and loc_db took: {}s, current mem (MBs): {}"
            .format(time2 - time1, memory_info(True)))

    ## Second op, build DupGroupDB
    time1 = time.time()
    write_to_dup_group_db(report_db, parent_db, read_group_db, loc_db,
                          dup_group_db, RECORDS_PER_TXN, fp_process_location)
    time2 = time.time()
    print("Building dup_group_db took: {}s, current mem (MBs): {}".format(
        time2 - time1, memory_info(True)))

    # Third op, resolve duplicates and build DupDB
    time1 = time.time()
    random.seed(RANDOM_SEED)
    write_to_dup_db(report_db, parent_db, read_group_db, dup_group_db, dup_db,
                    RECORDS_PER_TXN, fp_choose_winner_and_losers,
                    fp_log_winner_losers)
    time2 = time.time()
    print("Building dup_db took: {}s, current mem (MBs): {}".format(
        time2 - time1, memory_info(True)))

    ## Setup tmp filenames (now that most of the work is complete)
    out_files = [
        dedupped_sam, flagged_sam, dup_only_sam, dup_group_sam_like,
        rejects_sam
    ]
    (tmp_dedupped_sam, tmp_flagged_sam, tmp_dup_only_sam,
     tmp_dup_group_sam_like, tmp_rejects_sam) = tmpf_start(*out_files)

    ## Fourth op: Write the dup_group_sam_like file
    time1 = time.time()
    write_dup_group_sam_like_file(parent_db, read_group_db, dup_group_db,
                                  tmp_dup_group_sam_like)
    time2 = time.time()
    print("Writing DupGroup file took: {}s, current mem (MBs): {}".format(
        time2 - time1, memory_info(True)))

    ## Fifth op: Walk through SAM/BAM input_file and write output files.
    time1 = time.time()
    fp_write_output_files(parent_db, read_group_db, dup_db, umi_error_db,
                          input_file, reject_umi_errors, tmp_dedupped_sam,
                          tmp_flagged_sam, tmp_dup_only_sam, tmp_rejects_sam,
                          write_dedupped_sam, write_flagged_sam,
                          write_dup_only_sam, write_dup_group_sam_like,
                          write_umi_error_rejects, write_sam_headers)
    time2 = time.time()
    print("Writing output files took: {}s, current mem (MBs): {}".format(
        time2 - time1, memory_info(True)))

    # Write report_db
    for k, v in sorted(iteritems(report_db)):
        print("{}: {}".format(k, v))

    # Finish temp files
    tmpf_finish(tmp_dedupped_sam, tmp_flagged_sam, tmp_dup_only_sam,
                tmp_dup_group_sam_like, tmp_rejects_sam)

    # TEMPORARY!  Just for bootstrapping the testing of dedup.
    if dump_rg_db:
        sys.stderr.write(str(read_group_db))
        sys.stderr.write('\n')
    if dump_loc_db:
        sys.stderr.write(str(loc_db))
        sys.stderr.write('\n')
    if dump_dup_group_db:
        sys.stderr.write(str(dup_group_db))
        sys.stderr.write('\n')
    if dump_dup_db:
        sys.stderr.write(str(dup_db))
        sys.stderr.write('\n')
    if dump_umi_error_db:
        sys.stderr.write(str(umi_error_db))
        sys.stderr.write('\n')

    return out_files
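A note on the os.devnull pattern in Example #2's output-filename setup above: outputs the user did not request are pointed at os.devnull, presumably so the downstream writing code can open and write every output uniformly instead of branching at each write site. The trick itself is just:

import os

# Anything written to os.devnull is simply discarded by the OS.
with open(os.devnull, 'w') as sink:
    sink.write('this line goes nowhere\n')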
Example #3
def open_out_pe_barcode_fastq_files(outdir, delete_temp_files_upon_failure,
                                    fp_write, barcode_list_dict):
    """Context manager to help manage the variable number of output paired-end
    FASTQ files.  Takes as input a list of barcodes.  Based upon the list of
    barcodes (or optionally mapped sample names if provided in
    barcode_list_dict), does the following:
        * opens *temporary* paired-end output files for writing, two for each
          barcode.
        * yields those back for writing
        * when finished writing, closes the files.
        * renames the temporary files to be named after barcodes (or sample
          names if provided).

    Args:
        outdir (str): Place output fastq files in directory outdir.
        delete_temp_files_upon_failure (bool): If there is a failure and this
            arg is True, delete the temp files.
        fp_write (function): Function pointer for writing output files.
        barcode_list_dict (dict): Keys are expected barcodes, vals are either
            None or sample names.
    Yields:
        (dict): Each key is a barcode, each value is a two-tuple of file handles
            for that barcode (first fh is R1 output file for that barcode,
            second fh is for R2).
    """
    # keys: barcodes, vals: [read1_tmp_filehandle, read2_tmp_filehandle]
    output_files = {}
    tmp_output_filenames = {}

    try:
        ## This 'try' block: open output files for writing, yield.

        for barcode, opt_sample_name in barcode_list_dict.items():
            # First, determine if we're using barcodes or sample names for
            # final file names ('pretend' is here just so we can reuse
            # filename_in_to_out_fqgz...)
            if opt_sample_name is not None:
                in_pretend_r1 = "{}.R1.fq".format(opt_sample_name)
                in_pretend_r2 = "{}.R2.fq".format(opt_sample_name)
            else:
                in_pretend_r1 = "{}.R1.fq".format(barcode)
                in_pretend_r2 = "{}.R2.fq".format(barcode)

            # Come up with final filenames, open temp versions of those files,
            # record file names for later use....
            out_r1 = filename_in_to_out_fqgz(in_pretend_r1, SUFFIX_REMOVE_UMI,
                                             False, outdir)
            out_r2 = filename_in_to_out_fqgz(in_pretend_r2, SUFFIX_REMOVE_UMI,
                                             False, outdir)
            tmp_out_r1, tmp_out_r2 = tmpf_start(out_r1, out_r2)

            tmp_output_filenames[barcode] = (tmp_out_r1, tmp_out_r2)
            output_files[barcode] = (fp_write(tmp_out_r1),
                                     fp_write(tmp_out_r2))

        yield output_files

    except Exception as e:
        ## This 'except' block: try to recover from failure (close/delete files).
        for barcode in barcode_list_dict:
            # Close tmp files first.  Use .get() because the failure may have
            # happened before every barcode's files were opened.
            for out_f in output_files.get(barcode, ()):
                try:
                    out_f.close()
                except Exception:
                    pass
            if delete_temp_files_upon_failure:
                for tmp_filename in tmp_output_filenames.get(barcode, ()):
                    try:
                        os.remove(tmp_filename)
                    except Exception:
                        pass
        raise e

    finally:
        ## This 'finally' block: close/rename files.  Note that it runs on both
        ## normal exit and after the 'except' block above.
        for barcode in barcode_list_dict:
            # Close tmp files first (closing an already-closed file is a no-op).
            for out_f in output_files.get(barcode, ()):
                out_f.close()
            # Rename from tmp to final...
            if barcode not in tmp_output_filenames:
                continue
            tmp_out_r1, tmp_out_r2 = tmp_output_filenames[barcode]
            try:
                tmpf_finish(tmp_out_r1, tmp_out_r2)
            except Exception:
                # Catch and ignore so as to not mask a previous exception.
                pass
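A hypothetical usage sketch for the context manager above, assuming it is wrapped with contextlib.contextmanager (as its docstring and the yield imply) and that the output directory already exists; open_writer here is a stand-in for whatever fp_write callable the caller supplies:

def open_writer(path):
    # Stand-in for fp_write: open a temporary output file for text writing.
    return open(path, 'w')

# barcode -> optional sample name (None means "name outputs after the barcode")
barcodes = {'ATCACG': None, 'CGATGT': 'sample_B'}

with open_out_pe_barcode_fastq_files('outdir', True, open_writer,
                                     barcodes) as out_fhs:
    r1_fh, r2_fh = out_fhs['ATCACG']
    r1_fh.write('@read1/1\nACGT\n+\n!!!!\n')
    r2_fh.write('@read1/2\nTGCA\n+\n!!!!\n')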
def run(write_func, outdir, compress, opt_trimlog, input_files):
    """Start the run.

    Args:
        write_func (function): Function used to open output files for writing.
        outdir (str): Output directory for results.
        compress (bool): Whether or not to compress the output.
        opt_trimlog (str): Optional path to the Trimmomatic trim log; if None,
            the trim log location is derived from the input filenames.
        input_files ([str]): Input files.
    """

    # Some fastq files have an index read in them. Detect.
    if is_gzipped(input_files[0]):
        with gzip.open(input_files[0], 'rb') as in1:
            first_line = in1.readline()
    else:
        with open(input_files[0], 'r') as in1:
            first_line = in1.readline()
    has_index = len(first_line.split()) > 1

    if len(input_files) == 2:
        in1, in2 = input_files
        out1 = filename_in_to_out_fqgz(in1, SUFFIX_ANNOTATE_QTRIM, compress,
                                       outdir)
        out2 = filename_in_to_out_fqgz(in2, SUFFIX_ANNOTATE_QTRIM, compress,
                                       outdir)
        out_files = [out1, out2]

        if opt_trimlog is not None:
            trim_log = opt_trimlog
        else:
            # NOTE: qtrim_out2_file appears to be unused below.
            qtrim_out2_file = out2.replace(SUFFIX_QTRIM + '.', '')
            trim_log = pe_log_filename(SUFFIX_QTRIM, in2, 'trimlog')

        tmp_out1, tmp_out2 = tmpf_start(out1, out2)

        with    pgopen(1, in1) as fin1, \
                pgopen(1, in2) as fin2, \
                pgopen(1, trim_log) as ftrim_log, \
                write_func(tmp_out1) as fout1, \
                write_func(tmp_out2) as fout2:
            create_annotated_files(fin1, fin2, ftrim_log, fout1, fout2,
                                   has_index)

        tmpf_finish(tmp_out1, tmp_out2)

    elif len(input_files) == 1:
        in1 = input_files[0]
        out1 = filename_in_to_out_fqgz(in1, SUFFIX_ANNOTATE_QTRIM, compress,
                                       outdir)
        out_files = [out1]

        if opt_trimlog is not None:
            trim_log = opt_trimlog
        else:
            # NOTE: qtrim_out_file appears to be unused below.
            qtrim_out_file = out1.replace(SUFFIX_QTRIM + '.', '')
            trim_log = se_log_filename(SUFFIX_QTRIM, in1, 'trimlog')

        tmp_out1 = tmpf_start(out1)[0]
        with    pgopen(1, in1) as fin1, \
                pgopen(1, trim_log) as ftrim_log, \
                write_func(tmp_out1) as fout1:
            create_annotated_file(fin1, ftrim_log, fout1, has_index)
        tmpf_finish(tmp_out1)

    else:
        raise ControlFlowException("""ERR911: Not possible to be here.""")

    return out_files
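The has_index detection at the top of this run() keys off the FASTQ header line: headers that carry an index read have a second whitespace-separated field, headers without one do not (the two forms are also shown in a comment in Example #5 below). A quick illustration:

with_index = "@NS500451:139:H5TV5AFXX:1:11101:3928:1111 1:N:0:ATCACGTT\n"
without_index = "@NS500451:139:H5TV5AFXX:1:11101:3928:1111\n"

print(len(with_index.split()) > 1)     # True  -> has_index
print(len(without_index.split()) > 1)  # False -> no index read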
Example #5
def run(fp_extract_umi, fp_anno, fp_write, outdir, compress, force_paired,
        input_files):
    """Start the run.

    Args:
        fp_extract_umi (function): Function to extract UMIs from single/paired
            reads; also returns the length(s) to clip.
        fp_anno (function): Function to be used to parse the FASTQ or BAM file(s).
        fp_write (function): Function to be used to write output file(s).
        outdir (str): Output directory for results.
        compress (bool): Whether or not to compress the output.
        force_paired (bool): Whether the user wants to force paired-end, even
            if we didn't detect it.
        input_files ([str]): Array of input files to be parsed.
    """

    if fp_anno == create_annotated_files_from_bam or force_paired:
        # BAM, paired-end
        infile1 = input_files[0]
        out1, out2 = filename_in_bam_to_out_fqgz(infile1, SUFFIX_REMOVE_UMI,
                                                 compress, True, outdir)
        out_files = [out1, out2]
        tmp_out1, tmp_out2 = tmpf_start(out1, out2)

        with bamopen(infile1) as in1, \
                fp_write(tmp_out1) as out1, \
                fp_write(tmp_out2) as out2:
            fp_anno(fp_extract_umi, in1, out1, out2)
        tmpf_finish(tmp_out1, tmp_out2)

    elif fp_anno == create_annotated_file_from_bam:
        # BAM, single-end
        infile1 = input_files[0]
        out1 = filename_in_bam_to_out_fqgz(infile1, SUFFIX_REMOVE_UMI,
                                           compress, False, outdir)[0]
        out_files = [out1]
        tmp_out1 = tmpf_start(out1)[0]

        with bamopen(infile1) as in1, \
                fp_write(tmp_out1) as out1:
            fp_anno(fp_extract_umi, in1, out1)
        tmpf_finish(tmp_out1)

    else:

        # Some datasets have read headers that look like this:
        #       @NS500451:139:H5TV5AFXX:1:11101:3928:1111 1:N:0:ATCACGTT
        # Others like this:
        #       @NS500451:139:H5TV5AFXX:1:11101:3928:1111
        # Detect.
        if is_gzipped(input_files[0]):
            with gzip.open(input_files[0], mode='rt') as in1:
                first_line = in1.readline()
        else:
            with io.open(input_files[0], mode='r', encoding='latin-1') as in1:
                first_line = in1.readline()
        has_index = len(first_line.split()) > 1

        # FASTQ
        if len(input_files) == 2:
            # FASTQ, paired-end
            infile1, infile2 = input_files
            out1 = filename_in_to_out_fqgz(infile1, SUFFIX_REMOVE_UMI,
                                           compress, outdir)
            out2 = filename_in_to_out_fqgz(infile2, SUFFIX_REMOVE_UMI,
                                           compress, outdir)
            out_files = [out1, out2]
            tmp_out1, tmp_out2 = tmpf_start(out1, out2)

            with pgopen(1, infile1) as in1, \
                    pgopen(1, infile2) as in2, \
                    fp_write(tmp_out1) as out1, \
                    fp_write(tmp_out2) as out2:
                fp_anno(fp_extract_umi, in1, in2, out1, out2, has_index)
            tmpf_finish(tmp_out1, tmp_out2)

        elif len(input_files) == 1:
            # FASTQ, single-end
            infile1 = input_files[0]
            out1 = filename_in_to_out_fqgz(infile1, SUFFIX_REMOVE_UMI,
                                           compress, outdir)
            out_files = [out1]
            tmp_out = tmpf_start(out1)[0]

            with pgopen(1, infile1) as in1, \
                    fp_write(tmp_out) as out1:
                fp_anno(fp_extract_umi, in1, out1, has_index)
            tmpf_finish(tmp_out)

        else:
            raise ControlFlowException("""ERR911: Not possible to be here.""")

    return out_files
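Note that this run() reads the first FASTQ line in text mode (gzip.open(..., mode='rt') and io.open(..., encoding='latin-1')), while the earlier run() that calls create_annotated_files reads it in binary ('rb'/'r'); either works for the whitespace-split test, since bytes.split() splits on whitespace just like str.split(). A hypothetical consolidated helper, assuming the caller already knows whether the file is gzipped, might look like:

import gzip
import io

def read_first_line(path, gzipped):
    """Return the first line of a FASTQ file, transparently handling gzip."""
    if gzipped:
        with gzip.open(path, mode='rt') as fh:
            return fh.readline()
    with io.open(path, mode='r', encoding='latin-1') as fh:
        return fh.readline()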
def run(compress, outdir, cutadapt_args, adapters, input_files):
    """Start the run.

    Args:
        compress (bool): Whether or not to compress output.
        outdir (str): Output directory for results.
        cutadapt_args (str): Arguments to pass to cutadapt.
        adapters ([str]): List of Illumina adapters to pass to cutadapt.
        input_files ([str]): Array of input fastq file(s) to be parsed.
    Returns:
        out_files ([str]): Array of output fastq file(s).

    """
    if not which('cutadapt'):
        raise PrerequisitesException("""Cannot find 'cutadapt'.  Install with:

                pip install cutadapt

                """)

    if len(input_files) == 2:
        in1, in2 = input_files
        out1 = filename_in_to_out_fqgz(in1, SUFFIX_REMOVE_ADAPTER, compress,
                                       outdir)
        out2 = filename_in_to_out_fqgz(in2, SUFFIX_REMOVE_ADAPTER, compress,
                                       outdir)
        out_files = [out1, out2]
        adapter1, adapter2 = adapters
        out_log = pe_log_filename(SUFFIX_REMOVE_ADAPTER, out2)
        # i.e. for those that are too short...
        short_out1 = filename_in_to_out_fqgz(out1,
                                             SUFFIX_REMOVE_ADAPTER_TOO_SHORT,
                                             compress, outdir)
        short_out2 = filename_in_to_out_fqgz(out2,
                                             SUFFIX_REMOVE_ADAPTER_TOO_SHORT,
                                             compress, outdir)

        tmp_out1, tmp_out2, tmp_short_out1, tmp_short_out2 = tmpf_start(
            out1, out2, short_out1, short_out2)

        pe_remove_adapters(in1, in2, tmp_out1, tmp_out2, tmp_short_out1,
                           tmp_short_out2, out_log, adapter1, adapter2,
                           cutadapt_args)
        tmpf_finish(tmp_out1, tmp_out2, tmp_short_out1, tmp_short_out2)

    elif len(input_files) == 1:
        in1 = input_files[0]
        out1 = filename_in_to_out_fqgz(in1, SUFFIX_REMOVE_ADAPTER, compress,
                                       outdir)
        out_files = [out1]
        adapter1 = adapters[0]
        out_log = se_log_filename(SUFFIX_REMOVE_ADAPTER, out1)
        # i.e. for those that are too short...
        short_out1 = filename_in_to_out_fqgz(out1,
                                             SUFFIX_REMOVE_ADAPTER_TOO_SHORT,
                                             compress, outdir)

        tmp_out1, tmp_short_out1 = tmpf_start(out1, short_out1)

        se_remove_adapter(in1, tmp_out1, tmp_short_out1, out_log, adapter1,
                          cutadapt_args)
        tmpf_finish(tmp_out1, tmp_short_out1)

    else:
        raise ControlFlowException("""ERR911: Not possible to be here.""")

    return out_files
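The prerequisite check at the top of this run() uses a which() helper to confirm cutadapt is on the PATH before any work is done. For reference, the standard-library equivalent (Python 3.3+) is shutil.which, which returns None when the executable cannot be found; a minimal standalone version of the same check:

import shutil

if shutil.which('cutadapt') is None:
    raise RuntimeError("Cannot find 'cutadapt'.  Install with: pip install cutadapt")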