Example #1
def prog_umi_group_ec(args):
    import os
    from fileio import zopen, FastqFormat
    from seq import QSequence

    assert args.k > 0
    assert args.max_offset >= 0

    fn, fext = os.path.splitext(args.o)
    with zopen(fn + ".discarded" + fext, 'w') as discard_file:
        umi_groups = run_umi_group_ec(
            FastqFormat.records_in(zopen(args.i, 'r')), args.k,
            args.max_offset, args.min_score,
            FastqFormat.records_out(discard_file, None))

    FastqFormat.records_out(zopen(args.o, 'w'),
                            (QSequence("UMI:%s:%s" %
                                       (umi, cqs.count), cqs.seq, cqs.qual)
                             for umi, cqs in umi_groups.iteritems()))
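
# A minimal usage sketch (not from the original source): prog_umi_group_ec
# only needs an object exposing the attributes it reads, so an
# argparse.Namespace works. File names and parameter values are hypothetical.
from argparse import Namespace

example_args = Namespace(i='reads.fastq.gz',  # input FASTQ (zopen handles .gz)
                         o='grouped.fastq',   # discards are written to
                                              # grouped.discarded.fastq
                         k=12,                # assumed UMI length
                         max_offset=3,
                         min_score=10)
prog_umi_group_ec(example_args)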
Example #2
def prog_umi_group_ec(args):
    import os
    from sys import stdin, stdout
    from fileio import zopen, FastqFormat
    from seq import QSequence

    assert args.k > 0
    assert args.max_offset >= 0
    assert args.min_score_for_offset <= args.k
    assert args.min_score_for_merge <= args.k

    fn, fext = os.path.splitext(args.o)
    discarded = FastqFormat.records_out(zopen(fn + '.discarded' + fext, 'w'),
                                        None)

    input_file = stdin if args.i is None else zopen(args.i, 'r')
    _progress_indicator(0, 0, input_file)
    umi_groups = run_umi_group_ec(records=FastqFormat.records_in(input_file),
                                  k=args.k,
                                  moff=args.max_offset,
                                  min_score4offset=args.min_score_for_offset,
                                  min_score4merge=args.min_score_for_merge,
                                  discarded=discarded,
                                  callback=_progress_indicator)
    stdout.write('\n')

    out = FastqFormat.records_out(zopen(args.o, 'w'), None)

    print 'Writing results to file'
    for name, grouplist in umi_groups.iteritems():
        grouplist = sorted(grouplist,
                           key=lambda cqs: (-cqs.count, -sum(cqs.qual)))
        cqs = grouplist[0]
        out += QSequence(
            'UMI:%s:%s:%s' % (name, '%s/%s' %
                              ('1', len(grouplist)), cqs.count), cqs.seq,
            cqs.qual)
        for i, cqs in enumerate(grouplist[1:]):
            discarded += QSequence(
                'UMI:%s:%s:%s' % (name, '%s/%s' %
                                  (i + 2, len(grouplist)), cqs.count), cqs.seq,
                cqs.qual)
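
# Hedged sketch of the _progress_indicator helper this example assumes; it is
# not part of the original source. The (n_records, n_groups, input_file)
# signature is inferred from the priming call above and is an assumption about
# how run_umi_group_ec invokes its callback.
from sys import stdout

def _progress_indicator(n_records, n_groups, input_file):
    # Overwrite the current line with a running tally.
    stdout.write('\rProcessed %s records, %s UMI groups' %
                 (n_records, n_groups))
    stdout.flush()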
Example #3
def prog_umi_group_ec(args):
    import os
    from sys import stdin, stdout
    from fileio import zopen, FastqFormat
    from seq import QSequence

    assert args.k > 0
    assert args.max_offset >= 0
    assert args.min_score_for_offset <= args.k
    assert args.min_score_for_merge <= args.k

    fn, fext = os.path.splitext(args.o)
    discarded = FastqFormat.records_out(
            zopen(fn + '.discarded' + fext, 'w'), None)

    input_file = stdin if args.i is None else zopen(args.i, 'r')
    _progress_indicator(0, 0, input_file)
    umi_groups = run_umi_group_ec(
            records=FastqFormat.records_in(input_file),
            k=args.k,
            moff=args.max_offset,
            min_score4offset=args.min_score_for_offset,
            min_score4merge=args.min_score_for_merge,
            discarded=discarded,
            callback=_progress_indicator)
    stdout.write('\n')

    out = FastqFormat.records_out(zopen(args.o, 'w'), None)

    print 'Writing results to file'
    for name, grouplist in umi_groups.iteritems():
        grouplist = sorted(grouplist,
                           key=lambda cqs: (-cqs.count, -sum(cqs.qual)))
        cqs = grouplist[0]
        out += QSequence(
            'UMI:%s:%s:%s' % (name, '%s/%s' % ('1', len(grouplist)),
                              cqs.count), cqs.seq, cqs.qual)
        for i, cqs in enumerate(grouplist[1:]):
            discarded += QSequence(
                'UMI:%s:%s:%s' % (name, '%s/%s' % (i + 2, len(grouplist)),
                                  cqs.count), cqs.seq, cqs.qual)
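
# Hedged sketch of an argparse parser that would produce the namespace the
# variants above consume; it is not from the original source, and the option
# names, defaults, and help strings are assumptions.
import argparse

parser = argparse.ArgumentParser(description='UMI-based grouping and '
                                             'error correction of reads')
parser.add_argument('-i', help='input FASTQ (stdin if omitted)')
parser.add_argument('-o', required=True, help='output FASTQ')
parser.add_argument('-k', type=int, required=True, help='UMI length')
parser.add_argument('--max_offset', type=int, default=0)
parser.add_argument('--min_score_for_offset', type=int, default=0)
parser.add_argument('--min_score_for_merge', type=int, default=0)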
Example #4
def prog_pipeline(args):
    # [Defaults] section
    confidence = float(config.get("Defaults", "confidence"))
    min_phred_threshold = int(config.get("Defaults", "min_phred_threshold"))
    clone_classname = config.get("Defaults", "clone_classname")
    min_seqlen = int(config.get("Defaults", "min_seqlen"))
    include_cysphe = str2bool(config.get("Defaults", "include_cysphe"))
    species = config.get("Defaults", "species")
    gene = config.get("Defaults", "gene")
    update_interval = float(config.get("Defaults", "update_interval"))
    ref_fn = path.join(here, config.get("Defaults", "ref_fn"))
    alignments_fn = config.get("Defaults", "alignments_fn")
    alignment_stats_fn = config.get("Defaults", "alignment_stats_fn")
    Q_mm_stats_fn = config.get("Defaults", "Q_mm_stats_fn")
    Q_mm_stats_plot_fn = config.get("Defaults", "Q_mm_stats_plot_fn")
    qplot_fn = config.get("Defaults", "qplot_fn")
    output_fn = config.get('Defaults', 'output_fn')
    output_hdr = config.get('Defaults', 'output_hdr').decode('string_escape')
    output_fmt = config.get('Defaults', 'output_fmt').decode('string_escape')

    # [Aligner] section
    location = config.get("Aligner", "location")
    cmd_build_index = config.get("Aligner", "cmd_build_index")
    args_build_index = config.get("Aligner", "args_build_index")
    cmd_align = config.get("Aligner", "cmd_align")
    args_align_base = config.get("Aligner", "args_align_base")
    args_align_v = args_align_base + " " + \
            config.get("Aligner", "args_align_v")
    args_align_j = args_align_base + " " + \
            config.get("Aligner", "args_align_j")

    if args.no_VJ_collapsing:
        clone_classname = "AnnotatedCloneDistinctAllele"

    ref_fn = path.realpath(ref_fn) \
            if args.ref is None else path.realpath(args.ref)
    reads_fn = path.realpath(args.reads)
    phred_encoding = determine_phred_encoding(reads_fn) \
            if args.phred_encoding is None else args.phred_encoding
    assert phred_encoding in (33, 64)
    species = species if args.species is None else args.species
    species = species.split(",")
    gene = gene if args.gene is None else args.gene
    gene = gene.split(",")
    n_threads = mp.cpu_count() if args.threads is None else args.threads

    # Prepare aligner commands and check existence of aligner
    cmd_build_index = path.realpath(path.join(location, cmd_build_index))
    cmd_align = path.realpath(path.join(location, cmd_align))

    if not path.isfile(cmd_build_index):
        raise ValueError(
                "Executable to build an index (\"%s\") does not exist.\n"
                "Please use \"rtcr Config\" to see if the Aligner is "
                "properly configured" % cmd_build_index)

    if not path.isfile(cmd_align):
        raise ValueError(
                "Executable to align sequences (\"%s\") does not exist.\n"
                "Please use \"rtcr Config\" to see if the Aligner is "
                "properly configured" % cmd_align)

    init_logging()
    if args.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("log level set to DEBUG")
    if args.verbose:
        logger.addHandler(logging.StreamHandler(stdout))
        logger.info("Writing log statements to stdout")
    # Note, delaying import of modules that have a logger until after logging
    # has been initialised.
    from fileio import read_reference, zopen
    from pipeline import Pipeline

    ref = read_reference(ref_fn).get_slice(species=species, gene=gene)

    for s in species:
        if s not in ref.species:
            logger.error("species \"%s\" does not exist in reference" % s)
            sys.exit(1)
    for g in gene:
        if g not in ref.genes:
            logger.error("gene \"%s\" does not exist in reference" % g)
            sys.exit(1)

    version = __version__
    preamble = '\nRTCR version: %(version)s\n' % locals()
    preamble += '\n[Command line arguments]\n' + \
            '\n'.join(['%s : %s' % (i, v) for i, v in enumerate(sys.argv)])
    preamble += ('\n'
                 '[Files]\n'
                 'Reference: %(ref_fn)s\n'
                 'Reads: %(reads_fn)s\n'
                 'Output: %(output_fn)s\n'
                 '\n'
                 '[Settings]\n'
                 'PHRED encoding: %(phred_encoding)s\n'
                 'Species: %(species)s\n'
                 'Gene: %(gene)s\n'
                 'confidence: %(confidence)s\n'
                 '\n'
                 '[Immune receptor reference]\n' % locals())
    for species in sorted(ref.species):
        for gene in sorted(ref.genes):
            for region in sorted(ref.regions):
                alleles = ref.get_alleles(species=species,
                                          gene=gene,
                                          region=region)
                n = len(alleles)
                if n > 0:
                    preamble += "%s,%s,%s: %s alleles\n" % (species, gene,
                                                            region, n)
                s = ""
                for allele in alleles:
                    s += "%s, %s\n" % (allele.species, allele.name)
                logger.debug("species, allele\n" + s)
    preamble += "\n[Pipeline run]"
    logger.info(preamble)

    # Make sure exceptions are logged, even when not caught
    sys.excepthook = handle_uncaught_exception

    pipeline = Pipeline(ref=ref,
                        reads=zopen(reads_fn, 'r'),
                        phred_encoding=phred_encoding,
                        cmd_build_index=cmd_build_index,
                        args_build_index=args_build_index,
                        cmd_align=cmd_align,
                        args_align_v=args_align_v,
                        args_align_j=args_align_j,
                        alignments_fn=alignments_fn,
                        alignment_stats_fn=alignment_stats_fn,
                        Q_mm_stats_fn=Q_mm_stats_fn,
                        Q_mm_stats_plot_fn=Q_mm_stats_plot_fn,
                        output_fn=output_fn,
                        output_hdr=output_hdr,
                        output_fmt=output_fmt,
                        clone_classname=clone_classname,
                        confidence=confidence,
                        min_seqlen=min_seqlen,
                        include_cysphe=include_cysphe,
                        min_phred_threshold=min_phred_threshold,
                        n_threads=n_threads,
                        update_interval=update_interval,
                        listener=Listener())
    pipeline.daemon = True
    pipeline.name = 'Pipeline'
    try:
        pipeline.start()
        while pipeline.is_alive():
            pipeline.join(1)
    except KeyboardInterrupt:
        logger.error('Caught keyboard interrupt. Shutting down.')
        pipeline.stop()
        pipeline.join(1)
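
# Hedged sketch (not from the original source) of the module-level config
# object prog_pipeline assumes, using the Python 2 stdlib ConfigParser.
# Section and option names come from the lookups above; the values and the
# subset of options shown are illustrative only.
from ConfigParser import ConfigParser
from StringIO import StringIO

config = ConfigParser()
config.readfp(StringIO("""\
[Defaults]
confidence = 0.99
min_phred_threshold = 20
min_seqlen = 30

[Aligner]
location = /opt/aligner
cmd_align = bowtie2
"""))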
Example #5
def prog_checkout(args):
    search_rc = args.reverse_complement
    barcodes_fn = args.barcodes

    adapters = list(BarcodeFormat.records_in(open(barcodes_fn, 'r')))

    outfiles = {sample_id : open("%s.fastq"%sample_id,'w') \
            for (sample_id, master, slave) in adapters}

    fq1_fn = args.i
    fq2_fn = args.i2

    max_mm = args.max_mm

    fq1_file = zopen(fq1_fn, 'r')
    if isinstance(fq1_file, gzip.GzipFile):
        fq1_filesize = os.path.getsize(fq1_file.name)
        fq1_filepos = fq1_file.fileobj.tell
    else:
        fq1_filesize = filesize(fq1_file)
        fq1_filepos = fq1_file.tell

    fq1 = FastqFormat.records_in(fq1_file, encoding=None)
    if fq2_fn:
        fq2 = FastqFormat.records_in(zopen(fq2_fn, 'r'), encoding=None)
    else:
        fq2 = False

    n_accepted = 0
    prev_time = time()
    for i, r1 in enumerate(fq1):
        if time() - prev_time > .5:
            prev_time = time()
            frac = float(fq1_filepos()) / fq1_filesize
            # i + 1 records have been seen at this point; using it also
            # avoids a ZeroDivisionError on the very first record.
            stdout.write(term.EL(2) + term.CHA(0) +
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)" %
                    (i + 1, frac * 100, n_accepted,
                     100 * float(n_accepted) / (i + 1)))
            stdout.flush()

        if fq2:
            r2 = next(fq2)
            assert (r1.id == r2.id)

        # Demultiplex
        best_match = None
        for (sample_id, master, slave) in adapters:
            matches, matches_rc = master.locate_in(r1.seq, max_mm, search_rc)

            master_match, is_rc = get_best_match(matches, matches_rc)

            # look for master on the mate
            if (not master_match or master_match[0] < len(master.seq)) and \
                    fq2:
                # master not found or no full length match found
                matches2, matches_rc2 = master.locate_in(
                    r2.seq, max_mm, search_rc)

                master_match2, is_rc2 = get_best_match(matches2, matches_rc2)

                if not master_match2 and not master_match:
                    # master not found on r1 nor on r2
                    continue

                if not master_match or (master_match2 and \
                        master_match2[0] < master_match[0]):
                    master_match = master_match2
                    is_rc = is_rc2
                    # apparently strands are swapped
                    r1, r2 = r2, r1

            if master_match is None:
                continue

            if is_rc:
                master_match = list(master_match)
                master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                master_match = tuple(master_match)
                r1 = FastqRecord(id=r1.id + " rc",
                                 desc=r1.desc,
                                 seq=revcomp(r1.seq),
                                 qual_str=r1.qual_str[::-1])
                if fq2:
                    r2 = FastqRecord(id=r2.id + " rc",
                                     desc=r2.desc,
                                     seq=revcomp(r2.seq),
                                     qual_str=r2.qual_str[::-1])

            # Master adapter has been found, retrieve its UMI (if it has one)
            master_umi = ("", "")
            if master.has_UMI():  # get umi
                master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                            master_match[1])
                if master.UMI_length != len(master_umi[0]):
                    # failed to retrieve UMI from master adapter
                    continue

            # Look for slave adapter
            slave_match = None

            slave_umi = ("", "")
            if slave:  # has slave adapter
                slave_matches, slave_matches_rc = slave.locate_in(
                    r2.seq, max_mm, search_rc=False)
                # get_best_match returns a (match, is_rc) pair; unpack it so
                # that slave_match[1] below is the match position rather than
                # the reverse-complement flag.
                slave_match, slave_is_rc = get_best_match(
                    slave_matches, slave_matches_rc)

                if not slave_match:  # no slave adapter found on the mate
                    continue

                if slave.has_UMI():  # get umi
                    slave_umi = slave.get_UMI(r2.seq, r2.qual_str,
                                              slave_match[1])
                    if slave.UMI_length != len(slave_umi[0]):
                        continue

            if not best_match or best_match[0][0] > master_match[0]:
                umi = [x + y for x, y in zip(master_umi, slave_umi)]
                best_match = (master_match, sample_id, umi)

        if best_match:
            master_match, sample_id, umi = best_match
            out = outfiles[sample_id]
            out.write("@%s UMI:%s:%s\n" % (r1.id, umi[0], umi[1]))
            out.write("%s\n+\n%s\n" % (r1.seq, r1.qual_str))
            n_accepted += 1
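
# Hedged sketch (not from the original source) of get_best_match as the
# demultiplexing loop appears to use it: match tuples are assumed to be
# (score, start) with lower scores better, and the function returns
# (best_match_or_None, found_on_reverse_complement).
def get_best_match(matches, matches_rc):
    best, is_rc = None, False
    for candidates, rc in ((matches, False), (matches_rc, True)):
        for m in candidates or ():
            if best is None or m[0] < best[0]:
                best, is_rc = m, rc
    return best, is_rc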
Example #6
    def run(self):
        if self._alignments_fn is None or self._alignments_fn == "":
            output_alignments = False
        else:
            output_alignments = True

        if output_alignments and os.path.isfile(self._alignments_fn):
            logger.info("SKIPPING creation of %s"%self._alignments_fn)
            output_alignments = False
            alignment_file = zopen(self._alignments_fn, 'r')
            vj_recs = SAMFormat.records_in(alignment_file)
            # Get two (rows/)alignments at a time from vj_recs
            alns = ((rec, next(vj_recs)) for rec in vj_recs)
            self._listener.notify("Reading alignments from %s"%
                    self._alignments_fn)
        else:
            alns = get_vj_alignments(self._ref, self._reads,
                    self._cmd_build_index,
                    self._args_build_index,
                    self._cmd_align,
                    self._args_align_v,
                    self._args_align_j,
                    phred_encoding=self._phred_encoding,
                    n_threads=self._n_threads)
            self._listener.notify("Aligning reference sequences to reads")

        # Keep track of the quality scores of the bases that went into the
        # sequences of the clones.
        Q_counts = {}

        # Build clones and use alignments to count mismatches and indels
        cs = CloneSet()
        alnstats = {"V":{}, "J":{}}
        if self._include_cysphe:
            v_refpos_offset = -3
            j_refpos_offset = 3
        else:
            v_refpos_offset = 0
            j_refpos_offset = 0

        try:
            if output_alignments:
                out = zopen(self._alignments_fn, 'w')
                infile = self._reads
            else:
                infile = alignment_file

            prev_time = time()
            if isinstance(infile, gzip.GzipFile):
                infile_size = os.path.getsize(infile.name)
                infile_pos = infile.fileobj.tell
            else:
                infile_size = filesize(infile)
                infile_pos = infile.tell

            self._listener.notify(("PROGRESSBAR", "Alignments", "start"))

            for v_rec, j_rec in alns:
                if self.stopped():
                    logger.warning("Pipeline stopped")
                    return

                if time() - prev_time >= self._update_interval:
                    prev_time = time()
                    if not infile.closed:
                        pos = infile_pos()
                    else:
                        # assuming a closed infile means the entire infile has
                        # been processed.
                        pos = infile_size
                    frac = float(pos) / infile_size
                    self._listener.notify(("PROGRESSBAR", "Alignments", frac))

                if output_alignments:
                    out.write("\t".join(map(str, v_rec)) + "\n" + \
                            "\t".join(map(str, j_rec)) + "\n")

                clone = build_clone(self._ref, v_rec, j_rec,
                        self._include_cysphe, self._clone_classname)

                if clone is None:
                    continue

                seqlen = len(clone.seq)
                if seqlen < self._min_seqlen:
                    continue

                # Count base qualities in the clone (which is possible because
                # at this point the clone is based on a single read)
                lenfam_Q_counts = Q_counts.setdefault(seqlen, [0] * 42)
                for i in xrange(clone.v.end, clone.j.start):
                    lenfam_Q_counts[clone.qual[i]] += 1
                
                cs.add(clone, merge=True)

                v_allele = self._ref[v_rec.RNAME]
                j_allele = self._ref[j_rec.RNAME]
                # Count errors in the alignments
                for (rec, r_roi_start, r_roi_end) in \
                        ((v_rec, v_allele.refpos + v_refpos_offset, 0),
                        (j_rec, 0, j_allele.refpos + j_refpos_offset)):
                    allele = self._ref[rec.RNAME]
                    lenfam_alnstats = alnstats[allele.region].setdefault(
                            seqlen, {
                        "n"     : 0,
                        "mm"    : 0,
                        "ins"   : 0,
                        "dels"  : 0,
                        "Q_mm"  : [0] * 42,
                        "Q_n"   : [0] * 42})
                    n, mm, ins, dels, r_roi_as, r_roi_ae = get_error_stats(rec,
                            allele.seq,
                            lenfam_alnstats["Q_mm"], lenfam_alnstats["Q_n"],
                            r_roi_start, r_roi_end)
                    lenfam_alnstats["n"] += n
                    lenfam_alnstats["mm"] += mm
                    lenfam_alnstats["ins"] += ins
                    lenfam_alnstats["dels"] += dels
        finally:
            if output_alignments:
                out.close()
        self._listener.notify(("PROGRESSBAR", "Alignments", "end"))

        if len(cs) == 0:
            msg = "No clones found in alignments. \
Was the correct germline reference used?"
            logger.error(msg)
            raise Exception(msg)

        if self._alignment_stats_fn is not None and \
                self._alignment_stats_fn != "":
            logger.info("Writing alignment stats to \"%s\"" %
                    self._alignment_stats_fn)
            with zopen(self._alignment_stats_fn, 'w') as out:
                out.write("seqlen,region,n,mm,ins,dels\n")
                for region in alnstats:
                    for seqlen, lenfam_alnstats in \
                            alnstats[region].iteritems():
                        out.write(",".join(map(str,[
                            seqlen, region,
                            lenfam_alnstats["n"],
                            lenfam_alnstats["mm"],
                            lenfam_alnstats["ins"],
                            lenfam_alnstats["dels"]])) + "\n")

        self._save_cloneset(cs, "r")

        # Sum all the counts in the V and J regions separately, and calculate
        # average error rates
        tot_err = {"V":{}, "J":{}}
        for region in ("V", "J"):
            region_stats = alnstats[region]
            x = tot_err[region]
            x["n"] = sum([y["n"] for y in region_stats.itervalues()])
            x["mm"] = sum([y["mm"] for y in region_stats.itervalues()])
            x["ins"] = sum([y["ins"] for y in region_stats.itervalues()])
            x["dels"]= sum([y["dels"] for y in region_stats.itervalues()])

            n = x["n"]
            if n > 0:
                x["mmr"] = float(x["mm"]) / n 
                x["insr"] = float(x["ins"]) / n
                x["delsr"] = float(x["dels"]) / n
            else:
                x["mmr"] = 0
                x["insr"] = 0
                x["delsr"] = 0
        global_mmr = max(tot_err["V"]["mmr"], tot_err["J"]["mmr"])
        global_insr = max(tot_err["V"]["insr"], tot_err["J"]["insr"])
        global_delsr = max(tot_err["V"]["delsr"], tot_err["J"]["delsr"])
        logger.info("global error rates: mmr: %(global_mmr)s, \
insr: %(global_insr)s, delsr: %(global_delsr)s"%locals())

        # Calculate observed error rates for Phred scores
        Q_mm_stats = {"V":{}, "J":{}}
        for region, region_stats in alnstats.iteritems():
            Q_mm = Q_mm_stats[region].setdefault("Q_mm", [0] * 42)
            Q_n = Q_mm_stats[region].setdefault("Q_n", [0] * 42)
            for lenfam_alnstats in region_stats.itervalues():
                for i in xrange(42):
                    Q_mm[i] += lenfam_alnstats["Q_mm"][i]
                    Q_n[i] += lenfam_alnstats["Q_n"][i]

        if self._Q_mm_stats_fn is not None and self._Q_mm_stats_fn != "":
            with zopen(self._Q_mm_stats_fn, 'w') as out:
                out.write("region,Q,n,mm\n")
                for region in Q_mm_stats:
                    for Q, (mm, n) in enumerate(
                            izip(Q_mm_stats[region]["Q_mm"],
                                 Q_mm_stats[region]["Q_n"])):
                        out.write("%s,%s,%s,%s\n" % (region, Q, n, mm))

        # Calculate ratio between base quality score assigned by the sequencer
        # and observed base quality (based on alignments with germline
        # reference).
        sum_ratios = 0
        n_ratios = 0
        for region in Q_mm_stats:
            Q_mm = Q_mm_stats[region]["Q_mm"]
            Q_n = Q_mm_stats[region]["Q_n"]
            for q in xrange(42):
                mm = Q_mm[q]
                n = Q_n[q]
                if mm > 0 and n > 0:
                    q_obs = p2q(float(mm) / n)
                    if q_obs > 0:
                        sum_ratios += (q / q_obs) * n
                        n_ratios += n
        if n_ratios > 0:
            alpha = float(sum_ratios) / n_ratios
        else:
            logger.warning('No instances found of a Phred score associated ' +\
                    'with mismatches.')
            alpha = 1.0

        logger.info("Ratio between base quality and observed quality: %s"%
                alpha)

        if self._Q_mm_stats_plot_fn is not None and \
                self._Q_mm_stats_plot_fn != "":
            plot_Q_mm_stats(Q_mm_stats, self._Q_mm_stats_plot_fn)

        # Get median quality score
        Q_n = [0] * 42 # count number of bases for every Q score
        for lenfam_Q_counts in Q_counts.itervalues():
            for q, count in enumerate(lenfam_Q_counts):
                Q_n[q] += count
        i = ((sum(Q_n) + 1) // 2) - 1 # index of median element in Q_n
        j = 0
        for max_Q, count in enumerate(Q_n):
            j += count
            if j > i:
                break
        logger.info("max_Q = %s"%max_Q)

        pool = ConnectedConsumerPool(n_consumers=self._n_threads)
        by_seqlen = lambda clone: len(clone.seq)
        confidence = self._confidence
        for seqlen, clones in groupby(sorted(cs, key=by_seqlen), by_seqlen):
            if self.stopped():
                logger.warning("Pipeline stopped")
                return
            cs2 = CloneSet(clones)
            # Calculate expected number of errors based on Q scores
            lenfam_Q_counts = Q_counts[seqlen]

            # get total number of bases between V and J region
            n_o = sum(lenfam_Q_counts)
            mm_o = 0
            for q, count in enumerate(lenfam_Q_counts):
                q /= alpha
                mm_o += q2p(q) * count

            mm_v = alnstats["V"][seqlen]["mm"]
            n_v = alnstats["V"][seqlen]["n"]

            mm_j = alnstats["J"][seqlen]["mm"]
            n_j = alnstats["J"][seqlen]["n"]

            mm_tot = mm_v + mm_o + mm_j
            n_tot = n_v + n_o + n_j
            logger.info("Mismatch stats for seqlen %s: mm_v (%s, %s),\
mm_o (%s, %s), mm_j (%s, %s), mm_tot (%s, %s)"%(seqlen,
                mm_v, float(mm_v)/n_v if n_v > 0 else 0,
                mm_o, float(mm_o)/n_o if n_o > 0 else 0,
                mm_j, float(mm_j)/n_j if n_j > 0 else 0,
                mm_tot, float(mm_tot)/n_tot if n_tot > 0 else 0))
            local_mmr = float(mm_tot) / n_tot
            mmr = max(local_mmr, global_mmr)
            logger.info("Adding task: seqlen: %(seqlen)s, mismatch_rate: \
%(mmr)s, confidence: %(confidence)s, max_Q: %(max_Q)s"%locals())
            pool.add_task(run_ec_on_bin, (cs2, mmr, confidence, max_Q))
    
        self._listener.notify("Running QMerge and IMerge on bins.")
        self.run_pool(pool, desc='QMerge, IMerge')
        results = pool.results
        cloneset = CloneSet(chain.from_iterable([x[0] for x in results]))
        self._save_cloneset(cloneset, "rqi")

        self._listener.notify("Running LMerge")
        cloneset = run_lmerge(cloneset, global_mmr, global_insr, global_delsr,
                confidence)
        self._save_cloneset(cloneset, "rqil")

        pool = ConnectedConsumerPool(n_consumers=self._n_threads)
        for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen),
                by_seqlen):
            cs2 = CloneSet(clones)
            pool.add_task(wrapper_run_nmerge_on_bin, args=(cs2,))
        self._listener.notify("Running NMerge on bins.")
        self.run_pool(pool, desc='NMerge')
        results = pool.results
        cloneset = CloneSet(chain.from_iterable(results))
        self._save_cloneset(cloneset, "rqiln")

        ########################
        # Write clones to file #
        ########################
        self._listener.notify("Writing clones")
        with open(self._output_fn, 'w') as res_ok:
            with open("discarded_clones.tsv", 'w') as res_not_ok:
                header = self._output_hdr
                res_ok.write(header)
                res_not_ok.write(header)

                n_discarded = 0
                for clone in sorted(cloneset,
                        key=lambda clone: (-clone.count, clone.seq)):
                    min_phred = min(clone.qual)
                    aa_seq = nt2aa(clone.seq)
                    n_stop_codons = sum([aa == '*' for aa in aa_seq])
                    frame = len(clone.seq) % 3
                    if min_phred < self._min_phred_threshold \
                            or n_stop_codons > 0 or frame != 0:
                        n_discarded += 1
                        out = res_not_ok
                    else:
                        out = res_ok
                    out.write(clone2str(clone, fmt=self._output_fmt))
        self._listener.notify("Discarded %s clones"%n_discarded)
Example #7
def prog_pipeline(args):
    # [Defaults] section
    confidence = float(config.get("Defaults", "confidence"))
    min_phred_threshold = int(config.get("Defaults", "min_phred_threshold"))
    clone_classname = config.get("Defaults", "clone_classname")
    min_seqlen = int(config.get("Defaults", "min_seqlen"))
    include_cysphe = str2bool(config.get("Defaults", "include_cysphe"))
    species = config.get("Defaults", "species")
    gene = config.get("Defaults", "gene")
    update_interval = float(config.get("Defaults", "update_interval"))
    ref_fn = path.join(here, config.get("Defaults", "ref_fn"))
    alignments_fn = config.get("Defaults", "alignments_fn")
    alignment_stats_fn = config.get("Defaults", "alignment_stats_fn")
    Q_mm_stats_fn = config.get("Defaults", "Q_mm_stats_fn")
    Q_mm_stats_plot_fn = config.get("Defaults", "Q_mm_stats_plot_fn")
    qplot_fn = config.get("Defaults", "qplot_fn")
    output_fn = config.get('Defaults', 'output_fn')
    output_hdr = config.get('Defaults', 'output_hdr').decode('string_escape')
    output_fmt = config.get('Defaults', 'output_fmt').decode('string_escape')

    # [Aligner] section
    location = config.get("Aligner", "location")
    cmd_build_index = config.get("Aligner", "cmd_build_index")
    args_build_index = config.get("Aligner", "args_build_index")
    cmd_align = config.get("Aligner", "cmd_align")
    args_align_base = config.get("Aligner", "args_align_base")
    args_align_v = args_align_base + " " + \
            config.get("Aligner", "args_align_v")
    args_align_j = args_align_base + " " + \
            config.get("Aligner", "args_align_j")

    if args.no_VJ_collapsing:
        clone_classname = "AnnotatedCloneDistinctAllele"

    ref_fn = path.realpath(ref_fn) \
            if args.ref is None else path.realpath(args.ref)
    reads_fn = path.realpath(args.reads)
    phred_encoding = determine_phred_encoding(reads_fn) \
            if args.phred_encoding is None else args.phred_encoding
    assert phred_encoding in (33, 64)
    species = species if args.species is None else args.species
    species = species.split(",")
    gene = gene if args.gene is None else args.gene
    gene = gene.split(",")
    n_threads = mp.cpu_count() if args.threads is None else args.threads

    # Prepare aligner commands and check existence of aligner
    cmd_build_index = path.realpath(path.join(location, cmd_build_index))
    cmd_align = path.realpath(path.join(location, cmd_align))

    if not path.isfile(cmd_build_index):
        raise ValueError(
                "Executable to build an index (\"%s\") does not exist.\n"
                "Please use \"rtcr Config\" to see if the Aligner is "
                "properly configured" % cmd_build_index)

    if not path.isfile(cmd_align):
        raise ValueError(
                "Executable to align sequences (\"%s\") does not exist.\n"
                "Please use \"rtcr Config\" to see if the Aligner is "
                "properly configured" % cmd_align)

    init_logging()
    if args.debug:
        logging.root.setLevel(logging.DEBUG)
        logging.root.debug("log level set to DEBUG")
    if args.verbose:
        logging.root.addHandler(logging.StreamHandler(stdout))
        logging.root.info("Writing log statements to stdout")
    # Note, delaying import of modules that have a logger until after logging
    # has been initialised.
    from fileio import read_reference, zopen
    from pipeline import Pipeline

    ref = read_reference(ref_fn).get_slice(species=species, gene=gene)

    for s in species:
        if s not in ref.species:
            logger.error("species \"%s\" does not exist in reference" % s)
            sys.exit(1)
    for g in gene:
        if g not in ref.genes:
            logger.error("gene \"%s\" does not exist in reference" % g)
            sys.exit(1)

    version = __version__
    preamble = '\nRTCR version: %(version)s\n' % locals()
    preamble += '\n[Command line arguments]\n' + \
            '\n'.join(['%s : %s' % (i, v) for i, v in enumerate(sys.argv)])
    preamble += ('\n'
                 '[Files]\n'
                 'Reference: %(ref_fn)s\n'
                 'Reads: %(reads_fn)s\n'
                 'Output: %(output_fn)s\n'
                 '\n'
                 '[Settings]\n'
                 'PHRED encoding: %(phred_encoding)s\n'
                 'Species: %(species)s\n'
                 'Gene: %(gene)s\n'
                 'confidence: %(confidence)s\n'
                 '\n'
                 '[Immune receptor reference]\n' % locals())
    for species in sorted(ref.species):
        for gene in sorted(ref.genes):
            for region in sorted(ref.regions):
                alleles = ref.get_alleles(species=species, gene=gene,
                        region=region)
                n = len(alleles)
                if n > 0:
                    preamble += "%s,%s,%s: %s alleles\n" % (
                            species, gene, region, n)
                s = ""
                for allele in alleles:
                    s += "%s, %s\n" % (allele.species, allele.name)
                logger.debug("species, allele\n" + s)
    preamble += "\n[Pipeline run]"
    logger.info(preamble)

    # Make sure exceptions are logged, even when not caught
    sys.excepthook = handle_uncaught_exception

    pipeline = Pipeline(
            ref=ref,
            reads=zopen(reads_fn, 'r'),
            phred_encoding=phred_encoding,
            cmd_build_index=cmd_build_index,
            args_build_index=args_build_index,
            cmd_align=cmd_align,
            args_align_v=args_align_v,
            args_align_j=args_align_j,
            alignments_fn=alignments_fn,
            alignment_stats_fn=alignment_stats_fn,
            Q_mm_stats_fn=Q_mm_stats_fn,
            Q_mm_stats_plot_fn=Q_mm_stats_plot_fn,
            output_fn=output_fn,
            output_hdr=output_hdr,
            output_fmt=output_fmt,
            clone_classname=clone_classname,
            confidence=confidence,
            min_seqlen=min_seqlen,
            include_cysphe=include_cysphe,
            min_phred_threshold=min_phred_threshold,
            n_threads=n_threads,
            update_interval=update_interval,
            listener=Listener())
    pipeline.daemon = True
    pipeline.name = 'Pipeline'
    try:
        pipeline.start()
        while pipeline.is_alive():
            pipeline.join(1)
    except KeyboardInterrupt:
        logger.error('Caught keyboard interrupt. Shutting down.')
        pipeline.stop()
        pipeline.join(1)
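
# Hedged sketch of the str2bool helper the [Defaults] section relies on; it
# is not part of the original source and the accepted spellings are an
# assumption.
def str2bool(s):
    if s.lower() in ('true', 'yes', 'on', '1'):
        return True
    if s.lower() in ('false', 'no', 'off', '0'):
        return False
    raise ValueError('cannot interpret %r as a boolean' % s)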
Example #8
def checkout(fq1_fn, fq2_fn, adapters, max_mm, search_rc, paired=False):
    assert fq1_fn is not None
    assert not (paired and fq2_fn is not None)

    print 'Handling file(s): %s' % ''.join(
        [fq1_fn, '' if fq2_fn is None else ', %s' % fq2_fn])

    fq1_file = zopen(fq1_fn, 'r')
    if isinstance(fq1_file, gzip.GzipFile):
        fq1_filesize = os.path.getsize(fq1_file.name)
        fq1_filepos = fq1_file.fileobj.tell
    else:
        fq1_filesize = filesize(fq1_file)
        fq1_filepos = fq1_file.tell

    fq1 = FastqFormat.records_in(fq1_file, encoding=None)
    if fq2_fn is not None:
        fq2 = FastqFormat.records_in(zopen(fq2_fn, 'r'), encoding=None)
    else:
        fq2 = None

    outfiles = {}
    for (sample_id, master, slave) in adapters:
        outfiles[sample_id] = {
                "out1": (open("%s_R1.fastq" % sample_id, 'w'), 'R1')
                        if not paired else
                        (open("%s_R12.fastq" % sample_id, 'w'), 'R12'),
                "out2": (None, None) if fq2 is None else
                        (open("%s_R2.fastq" % sample_id, 'w'), 'R2')}

    n_accepted = 0
    prev_time = time()
    for i, r1 in enumerate(fq1):
        if fq2:
            r2 = next(fq2)
            assert (r1.id == r2.id)
        else:
            r2 = None

        # Demultiplex
        best_match = None
        for (sample_id, master, slave) in adapters:
            matches, matches_rc = master.locate_in(r1.seq, max_mm, search_rc)

            master_match, is_rc = get_best_match(matches, matches_rc)

            # look for master on the mate
            if (not master_match or master_match[0] < len(master.seq)) and \
                    fq2:
                # master not found or no full length match found
                matches2, matches_rc2 = master.locate_in(
                    r2.seq, max_mm, search_rc)

                master_match2, is_rc2 = get_best_match(matches2, matches_rc2)

                if not master_match2 and not master_match:
                    # master not found on r1 nor on r2
                    continue

                if not master_match or (master_match2 and \
                        master_match2[0] < master_match[0]):
                    master_match = master_match2
                    is_rc = is_rc2
                    # apparently strands are swapped
                    r1, r2 = r2, r1

            if master_match is None:
                continue

            if is_rc:
                master_match = list(master_match)
                master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                master_match = tuple(master_match)
                r1 = FastqRecord(id=r1.id + " rc",
                                 desc=r1.desc,
                                 seq=revcomp(r1.seq),
                                 qual_str=r1.qual_str[::-1])
                if fq2:
                    r2 = FastqRecord(id=r2.id + " rc",
                                     desc=r2.desc,
                                     seq=revcomp(r2.seq),
                                     qual_str=r2.qual_str[::-1])

            # Master adapter has been found, retrieve its UMI (if it has one)
            master_umi = ("", "")
            if master.has_UMI():  # get umi
                master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                            master_match[1])
                if master.UMI_length != len(master_umi[0]):
                    # failed to retrieve UMI from master adapter
                    continue

            # Look for slave adapter
            slave_match = None

            slave_umi = ("", "")
            if slave:  # has slave adapter
                if paired:
                    r = r1
                else:
                    r = r2

                slave_matches, slave_matches_rc = slave.locate_in(
                    r.seq, max_mm, search_rc=search_rc)
                slave_match, slave_is_rc = get_best_match(
                    slave_matches, slave_matches_rc)

                if not slave_match:  # No slave found
                    continue

                if slave.has_UMI():  # get umi
                    if slave_is_rc:
                        slave_umi_start = len(
                            r.seq) - (slave_match[1] + len(slave.seq))
                        slave_umi = slave.get_UMI(revcomp(r.seq),
                                                  r.qual_str[::-1],
                                                  slave_umi_start)
                    else:
                        slave_umi = slave.get_UMI(r.seq, r.qual_str,
                                                  slave_match[1])
                    if slave.UMI_length != len(slave_umi[0]):
                        continue

            if not best_match or best_match[0][0] > master_match[0] or \
               (best_match[0][0] == master_match[0] and \
                slave_match and \
                (not best_match[1] or not best_match[1][0] or \
                 best_match[1][0] > slave_match[0])):
                umi = [x + y for x, y in zip(master_umi, slave_umi)]
                best_match = (master_match, slave_match, sample_id, umi)

        if best_match:
            master_match, slave_match, sample_id, umi = best_match
            for (r, (out, typename)) in ((r1, outfiles[sample_id]["out1"]),
                                         (r2, outfiles[sample_id]["out2"])):
                if not out:
                    continue
                out.write("@%s UMI:%s:%s:%s\n" %
                          (r.id, typename, umi[0], umi[1]))
                out.write("%s\n+\n%s\n" % (r.seq, r.qual_str))
            n_accepted += 1

        frac = float(fq1_filepos()) / fq1_filesize
        if time() - prev_time > .5 or frac == 1.0:
            prev_time = time()
            stdout.write(term.EL(2) + term.CHA(0) + \
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)"%(i + 1,
                        frac*100, n_accepted,
                        (100*float(n_accepted)/(i+1))))
            stdout.flush()

    stdout.write('\n')
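
# Hedged sketch (not from the original source) of the revcomp helper that
# checkout depends on, using the Python 2 string.maketrans; a real
# implementation may also handle IUPAC ambiguity codes.
from string import maketrans

_COMPLEMENT = maketrans('ACGTNacgtn', 'TGCANtgcan')

def revcomp(seq):
    # Reverse complement of a DNA sequence.
    return seq.translate(_COMPLEMENT)[::-1]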
Example #9
    def run(self):
        if self._alignments_fn is None or self._alignments_fn == "":
            output_alignments = False
        else:
            output_alignments = True

        if output_alignments and os.path.isfile(self._alignments_fn):
            logger.info("SKIPPING creation of %s" % self._alignments_fn)
            output_alignments = False
            alignment_file = zopen(self._alignments_fn, 'r')
            vj_recs = SAMFormat.records_in(alignment_file)
            # Get two (rows/)alignments at a time from vj_recs
            alns = ((rec, next(vj_recs)) for rec in vj_recs)
            self._listener.notify("Reading alignments from %s" %
                                  self._alignments_fn)
        else:
            alns = get_vj_alignments(self._ref,
                                     self._reads,
                                     self._cmd_build_index,
                                     self._args_build_index,
                                     self._cmd_align,
                                     self._args_align_v,
                                     self._args_align_j,
                                     phred_encoding=self._phred_encoding,
                                     n_threads=self._n_threads)
            self._listener.notify("Aligning reference sequences to reads")

        # Keep track of the quality scores of the bases that went into the
        # sequences of the clones.
        Q_counts = {}

        # Build clones and use alignments to count mismatches and indels
        cs = CloneSet()
        alnstats = {"V": {}, "J": {}}
        v_refpos_offset = -3
        j_refpos_offset = 3

        try:
            if output_alignments:
                out = zopen(self._alignments_fn, 'w')

            if output_alignments:
                infile = self._reads
            else:
                infile = alignment_file

            prev_time = time()
            if isinstance(infile, gzip.GzipFile):
                infile_size = os.path.getsize(infile.name)
                infile_pos = infile.fileobj.tell
            else:
                infile_size = filesize(infile)
                infile_pos = infile.tell

            self._listener.notify(("PROGRESSBAR", "Alignments", "start"))

            for v_rec, j_rec in alns:
                if self.stopped():
                    logger.warning("Pipeline stopped")
                    return

                if time() - prev_time >= self._update_interval:
                    prev_time = time()
                    if not infile.closed:
                        pos = infile_pos()
                    else:
                        # assuming a closed infile means the entire infile has
                        # been processed.
                        pos = infile_size
                    frac = float(pos) / infile_size
                    self._listener.notify(("PROGRESSBAR", "Alignments", frac))

                if output_alignments:
                    out.write("\t".join(map(str, v_rec)) + "\n" + \
                            "\t".join(map(str, j_rec)) + "\n")

                clone = build_clone(self._ref, v_rec, j_rec,
                                    self._clone_classname)

                if clone is None:
                    continue

                seqlen = len(clone.seq)
                if seqlen < self._min_seqlen:
                    continue

                # Count base qualities in the clone (which is possible because
                # at this point the clone is based on a single read)
                lenfam_Q_counts = Q_counts.setdefault(seqlen, [0] * 42)
                for i in xrange(clone.v.end, clone.j.start):
                    lenfam_Q_counts[clone.qual[i]] += 1

                cs.add(clone, merge=True)

                v_allele = self._ref[v_rec.RNAME]
                j_allele = self._ref[j_rec.RNAME]
                # Count errors in the alignments
                for (rec, r_roi_start, r_roi_end) in \
                        ((v_rec, v_allele.refpos + v_refpos_offset, 0),
                        (j_rec, 0, j_allele.refpos + j_refpos_offset)):
                    allele = self._ref[rec.RNAME]
                    lenfam_alnstats = alnstats[allele.region].setdefault(
                        seqlen, {
                            "n": 0,
                            "mm": 0,
                            "ins": 0,
                            "dels": 0,
                            "Q_mm": [0] * 42,
                            "Q_n": [0] * 42
                        })
                    n, mm, ins, dels, r_roi_as, r_roi_ae = get_error_stats(
                        rec, allele.seq, lenfam_alnstats["Q_mm"],
                        lenfam_alnstats["Q_n"], r_roi_start, r_roi_end)
                    lenfam_alnstats["n"] += n
                    lenfam_alnstats["mm"] += mm
                    lenfam_alnstats["ins"] += ins
                    lenfam_alnstats["dels"] += dels
        finally:
            if output_alignments:
                out.close()
        self._listener.notify(("PROGRESSBAR", "Alignments", "end"))

        if len(cs) == 0:
            msg = "No clones found in alignments. \
Was the correct germline reference used?"

            logger.error(msg)
            raise Exception(msg)

        if self._alignment_stats_fn is not None and \
                self._alignment_stats_fn != "":
            logger.info("Writing alignment stats to \"%s\"" %
                        self._alignment_stats_fn)
            with zopen(self._alignment_stats_fn, 'w') as out:
                out.write("seqlen,region,n,mm,ins,dels\n")
                for region in alnstats:
                    for seqlen, lenfam_alnstats in \
                            alnstats[region].iteritems():
                        out.write(",".join(
                            map(str, [
                                seqlen, region, lenfam_alnstats["n"],
                                lenfam_alnstats["mm"], lenfam_alnstats["ins"],
                                lenfam_alnstats["dels"]
                            ])) + "\n")

        self._save_cloneset(cs, "r")

        # Sum all the counts in the V and J regions separately, and calculate
        # average error rates
        tot_err = {"V": {}, "J": {}}
        for region in ("V", "J"):
            region_stats = alnstats[region]
            x = tot_err[region]
            x["n"] = sum([y["n"] for y in region_stats.itervalues()])
            x["mm"] = sum([y["mm"] for y in region_stats.itervalues()])
            x["ins"] = sum([y["ins"] for y in region_stats.itervalues()])
            x["dels"] = sum([y["dels"] for y in region_stats.itervalues()])

            n = x["n"]
            if n > 0:
                x["mmr"] = float(x["mm"]) / n
                x["insr"] = float(x["ins"]) / n
                x["delsr"] = float(x["dels"]) / n
            else:
                x["mmr"] = 0
                x["insr"] = 0
                x["delsr"] = 0
        global_mmr = max(tot_err["V"]["mmr"], tot_err["J"]["mmr"])
        global_insr = max(tot_err["V"]["insr"], tot_err["J"]["insr"])
        global_delsr = max(tot_err["V"]["delsr"], tot_err["J"]["delsr"])
        logger.info("global error rates: mmr: %(global_mmr)s, \
insr: %(global_insr)s, delsr: %(global_delsr)s" % locals())

        # Calculate observed error rates for Phred scores
        Q_mm_stats = {"V": {}, "J": {}}
        for region, region_stats in alnstats.iteritems():
            Q_mm = Q_mm_stats[region].setdefault("Q_mm", [0] * 42)
            Q_n = Q_mm_stats[region].setdefault("Q_n", [0] * 42)
            for lenfam_alnstats in region_stats.itervalues():
                for i in xrange(42):
                    Q_mm[i] += lenfam_alnstats["Q_mm"][i]
                    Q_n[i] += lenfam_alnstats["Q_n"][i]

        if self._Q_mm_stats_fn is not None and self._Q_mm_stats_fn != "":
            with zopen(self._Q_mm_stats_fn, 'w') as out:
                out.write("region,Q,n,mm\n")
                for region in Q_mm_stats:
                    for Q, (mm, n) in enumerate(
                            izip(Q_mm_stats[region]["Q_mm"],
                                 Q_mm_stats[region]["Q_n"])):
                        out.write("%s,%s,%s,%s\n" % (region, Q, n, mm))

        # Calculate ratio between base quality score assigned by the sequencer
        # and observed base quality (based on alignments with germline
        # reference).
        sum_ratios = 0
        n_ratios = 0
        for region in Q_mm_stats:
            Q_mm = Q_mm_stats[region]["Q_mm"]
            Q_n = Q_mm_stats[region]["Q_n"]
            for q in xrange(42):
                mm = Q_mm[q]
                n = Q_n[q]
                if mm > 0 and n > 0:
                    q_obs = p2q(float(mm) / n)
                    if q_obs > 0:
                        sum_ratios += (q / q_obs) * n
                        n_ratios += n
        if n_ratios > 0:
            alpha = float(sum_ratios) / n_ratios
        else:
            logger.warning('No instances found of a Phred score associated ' +\
                    'with mismatches.')
            alpha = 1.0

        logger.info("Ratio between base quality and observed quality: %s" %
                    alpha)

        if self._Q_mm_stats_plot_fn is not None and \
                self._Q_mm_stats_plot_fn != "":
            plot_Q_mm_stats(Q_mm_stats, self._Q_mm_stats_plot_fn)

        # Get median quality score
        Q_n = [0] * 42  # count number of bases for every Q score
        for lenfam_Q_counts in Q_counts.itervalues():
            for q, count in enumerate(lenfam_Q_counts):
                Q_n[q] += count
        i = ((sum(Q_n) + 1) // 2) - 1  # index of median element in Q_n
        j = 0
        for max_Q, count in enumerate(Q_n):
            j += count
            if j > i:
                break
        logger.info("max_Q = %s" % max_Q)

        pool = ConnectedConsumerPool(n_consumers=self._n_threads)
        by_seqlen = lambda clone: len(clone.seq)
        confidence = self._confidence
        for seqlen, clones in groupby(sorted(cs, key=by_seqlen), by_seqlen):
            if self.stopped():
                logger.warning("Pipeline stopped")
                return
            cs2 = CloneSet(clones)
            # Calculate expected number of errors based on Q scores
            lenfam_Q_counts = Q_counts[seqlen]

            # get total number of bases between V and J region
            n_o = sum(lenfam_Q_counts)
            mm_o = 0
            for q, count in enumerate(lenfam_Q_counts):
                q /= alpha
                mm_o += q2p(q) * count

            mm_v = alnstats["V"][seqlen]["mm"]
            n_v = alnstats["V"][seqlen]["n"]

            mm_j = alnstats["J"][seqlen]["mm"]
            n_j = alnstats["J"][seqlen]["n"]

            mm_tot = mm_v + mm_o + mm_j
            n_tot = n_v + n_o + n_j
            logger.info("Mismatch stats for seqlen %s: mm_v (%s, %s),\
mm_o (%s, %s), mm_j (%s, %s), mm_tot (%s, %s)" %
                        (seqlen, mm_v, float(mm_v) / n_v if n_v > 0 else 0,
                         mm_o, float(mm_o) / n_o if n_o > 0 else 0, mm_j,
                         float(mm_j) / n_j if n_j > 0 else 0, mm_tot,
                         float(mm_tot) / n_tot if n_tot > 0 else 0))
            local_mmr = float(mm_tot) / n_tot
            mmr = max(local_mmr, global_mmr)
            logger.info("Adding task: seqlen: %(seqlen)s, mismatch_rate: \
%(mmr)s, confidence: %(confidence)s, max_Q: %(max_Q)s" % locals())
            pool.add_task(run_ec_on_bin, (cs2, mmr, confidence, max_Q))

        self._listener.notify("Running QMerge and IMerge on bins.")
        self.run_pool(pool, desc='QMerge, IMerge')
        results = pool.results
        cloneset = CloneSet(chain.from_iterable([x[0] for x in results]))
        self._save_cloneset(cloneset, "rqi")

        self._listener.notify("Running LMerge")
        cloneset = run_lmerge(cloneset, global_mmr, global_insr, global_delsr,
                              confidence)
        self._save_cloneset(cloneset, "rqil")

        pool = ConnectedConsumerPool(n_consumers=self._n_threads)
        for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen),
                                      by_seqlen):
            cs2 = CloneSet(clones)
            pool.add_task(wrapper_run_nmerge_on_bin, args=(cs2, ))
        self._listener.notify("Running NMerge on bins.")
        self.run_pool(pool, desc='NMerge')
        results = pool.results
        cloneset = CloneSet(chain.from_iterable(results))
        self._save_cloneset(cloneset, "rqiln")

        ########################
        # Write clones to file #
        ########################
        self._listener.notify("Writing clones")
        sequence_id = 0
        with open(self._output_fn, 'w') as res_ok:
            with open(self._output_not_ok_fn, 'w') as res_not_ok:
                header = '\t'.join(
                        clone2AIRRDict(clone=None, ref=None).keys()) + '\n'
                res_ok.write(header)
                res_not_ok.write(header)

                n_discarded = 0
                for clone in sorted(cloneset,
                                    key=lambda clone:
                                    (-clone.count, clone.seq)):
                    record = clone2AIRRDict(clone=clone, ref=self._ref)
                    min_phred = int(record['junction_minimum_quality_score'])
                    if min_phred < self._min_phred_threshold \
                            or record['stop_codon'] == 'T' or \
                            record['vj_in_frame'] == 'F':
                        n_discarded += 1
                        out = res_not_ok
                    else:
                        out = res_ok
                    sequence_id += 1
                    record['sequence_id'] = str(sequence_id)
                    out.write('\t'.join([v for k, v in record.iteritems()]) +
                              '\n')
        self._listener.notify("Discarded %s clones" % n_discarded)