コード例 #1
0
ファイル: umi_group_ec.py プロジェクト: Rosemary94/RTCR
def _progress_indicator(rec_nr, n_discarded, handle=None):
    """Print a throttled, single-line progress report to stdout.

    State is kept in attributes on the function object itself, so the
    function is used in two modes:

    * Initialization -- called with ``handle``: remembers how to query the
      current read position and the total size of the input, resets the
      throttle timer and returns without printing.
    * Reporting -- called without ``handle``: rewrites the progress line when
      more than 0.5 seconds passed since the last report, or when the input
      has been read completely. Requires a prior initializing call.

    Args:
        rec_nr: Zero-based index of the record just processed; it is reported
            as ``rec_nr + 1``.
        n_discarded: Accepted but not used by this function.
        handle: File object to track. For a ``gzip.GzipFile`` the size and
            position of the underlying compressed file are used. For any
            other handle, ``filesize(handle)`` and ``handle.tell`` are used;
            if that raises ``IOError`` the percentage is reported as ``?%``.
    """
    if handle is not None:
        if isinstance(handle, gzip.GzipFile):
            # Track progress through the compressed file on disk.
            _progress_indicator.endpos = os.path.getsize(handle.name)
            _progress_indicator.get_pos = handle.fileobj.tell
        else:
            try:
                _progress_indicator.endpos = filesize(handle)
                _progress_indicator.get_pos = handle.tell
            except IOError:
                # Size cannot be determined; report an unknown percentage.
                _progress_indicator.endpos = None
                _progress_indicator.get_pos = lambda: None

        _progress_indicator.prev_time = time()
        return

    get_pos = _progress_indicator.get_pos
    endpos = _progress_indicator.endpos

    # endpos is None when the size is unknown and 0 for an empty input; in
    # both cases no meaningful fraction exists (and 0 would divide by zero).
    frac = float(get_pos()) / endpos if endpos else None
    if time() - _progress_indicator.prev_time > .5 or frac == 1.0:
        _progress_indicator.prev_time = time()
        perc_str = '?%' if frac is None else '%.2f%%' % (frac * 100)
        # NOTE(review): term.EL(2) + term.CHA(0) presumably clears the line
        # and moves the cursor to its start -- confirm against the term module.
        stdout.write(term.EL(2) + term.CHA(0) +
                     'Processed %s records (%s)' % (rec_nr + 1, perc_str))
        stdout.flush()
コード例 #2
0
def prog_checkout(args):
    """Demultiplex FASTQ records by adapter into one file per sample.

    Adapter definitions, each a ``(sample_id, master, slave)`` triple, are
    read from the barcode file. Every record of the first FASTQ file (with
    its mate from the second file, if one is given) is tested against all
    adapters; the adapter whose master match has the smallest first element
    wins. The winning record is appended to ``<sample_id>.fastq`` with
    ``UMI:<umi[0]>:<umi[1]>`` added to its header line, where ``umi`` joins
    the master and slave UMI fields. Progress is written to stdout at most
    every 0.5 seconds.

    Args:
        args: Object providing the attributes ``reverse_complement``,
            ``barcodes`` (barcode file name), ``i`` (first FASTQ file name),
            ``i2`` (second FASTQ file name, falsy if absent) and ``max_mm``
            (passed to the adapters' ``locate_in``).

    Raises:
        AssertionError: If a record and its mate have different ids.
    """
    search_rc = args.reverse_complement
    barcodes_fn = args.barcodes
    fq1_fn = args.i
    fq2_fn = args.i2
    max_mm = args.max_mm

    # The adapter list is materialized, so the barcode file can be closed
    # right away.
    with open(barcodes_fn, 'r') as barcodes_file:
        adapters = list(BarcodeFormat.records_in(barcodes_file))

    outfiles = {}
    fq1_file = None
    fq2_file = None
    try:
        for (sample_id, master, slave) in adapters:
            if sample_id not in outfiles:
                outfiles[sample_id] = open("%s.fastq" % sample_id, 'w')

        fq1_file = zopen(fq1_fn, 'r')
        if isinstance(fq1_file, gzip.GzipFile):
            # Progress is measured on the compressed file on disk.
            fq1_filesize = os.path.getsize(fq1_file.name)
            fq1_filepos = fq1_file.fileobj.tell
        else:
            fq1_filesize = filesize(fq1_file)
            fq1_filepos = fq1_file.tell

        fq1 = FastqFormat.records_in(fq1_file, encoding=None)
        if fq2_fn:
            fq2_file = zopen(fq2_fn, 'r')
            fq2 = FastqFormat.records_in(fq2_file, encoding=None)
        else:
            fq2 = False

        n_accepted = 0
        prev_time = time()
        for i, r1 in enumerate(fq1):
            if time() - prev_time > .5:
                prev_time = time()
                frac = float(fq1_filepos()) / fq1_filesize
                # i records are complete at this point; i is 0 when the first
                # report fires on the first record, so avoid dividing by it.
                accepted_perc = 100 * float(n_accepted) / i if i else 0.0
                stdout.write(
                    term.EL(2) + term.CHA(0) +
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)" % (
                        i, frac * 100, n_accepted, accepted_perc))
                stdout.flush()

            if fq2:
                r2 = next(fq2)
                assert (r1.id == r2.id)

            # Demultiplex
            # NOTE(review): r1/r2 are rebound inside the adapter loop (mate
            # swap, reverse complement). The rebinding carries over to later
            # adapters and to the final write, which therefore uses the
            # records as left by the last adapter tried -- confirm intended.
            best_match = None
            for (sample_id, master, slave) in adapters:
                matches, matches_rc = master.locate_in(
                    r1.seq, max_mm, search_rc)

                master_match, is_rc = get_best_match(matches, matches_rc)

                # look for master on the mate
                if (not master_match or
                        master_match[0] < len(master.seq)) and fq2:
                    # master not found or no full length match found
                    matches2, matches_rc2 = master.locate_in(
                        r2.seq, max_mm, search_rc)

                    master_match2, is_rc2 = get_best_match(
                        matches2, matches_rc2)

                    if not master_match2 and not master_match:
                        # master not found on r1 nor on r2
                        continue

                    if not master_match or (
                            master_match2 and
                            master_match2[0] < master_match[0]):
                        master_match = master_match2
                        is_rc = is_rc2
                        # apparently strands are swapped
                        r1, r2 = r2, r1

                if master_match is None:
                    continue

                if is_rc:
                    # Translate the match position to the reverse-complemented
                    # read, then reverse-complement the record(s).
                    master_match = list(master_match)
                    master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                    master_match = tuple(master_match)
                    r1 = FastqRecord(id=r1.id + " rc",
                                     desc=r1.desc,
                                     seq=revcomp(r1.seq),
                                     qual_str=r1.qual_str[::-1])
                    if fq2:
                        r2 = FastqRecord(id=r2.id + " rc",
                                         desc=r2.desc,
                                         seq=revcomp(r2.seq),
                                         qual_str=r2.qual_str[::-1])

                # Master adapter has been found, retrieve its UMI (if it has
                # one)
                master_umi = ("", "")
                if master.has_UMI():  # get umi
                    master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                                master_match[1])
                    if master.UMI_length != len(master_umi[0]):
                        # failed to retrieve UMI from master adapter
                        continue

                # Look for slave adapter
                slave_umi = ("", "")
                if slave:  # has slave adapter
                    slave_matches, slave_matches_rc = slave.locate_in(
                        r2.seq, max_mm, search_rc=False)
                    # get_best_match returns a (match, is_rc) pair; only the
                    # match itself carries the position needed for the UMI.
                    slave_match, _slave_is_rc = get_best_match(
                        slave_matches, slave_matches_rc)

                    if slave.has_UMI():  # get umi
                        if not slave_match:
                            # slave not found, so its UMI cannot be retrieved
                            continue
                        slave_umi = slave.get_UMI(r2.seq, r2.qual_str,
                                                  slave_match[1])
                        if slave.UMI_length != len(slave_umi[0]):
                            continue

                if not best_match or best_match[0][0] > master_match[0]:
                    umi = [x + y for x, y in zip(master_umi, slave_umi)]
                    best_match = (master_match, sample_id, umi)

            if best_match:
                master_match, sample_id, umi = best_match
                out = outfiles[sample_id]
                out.write("@%s UMI:%s:%s\n" % (r1.id, umi[0], umi[1]))
                out.write("%s\n+\n%s\n" % (r1.seq, r1.qual_str))
                n_accepted += 1
    finally:
        # Close every handle opened above, also when demultiplexing fails.
        for handle in [fq1_file, fq2_file] + list(outfiles.values()):
            if handle is not None:
                handle.close()
コード例 #3
0
ファイル: checkout.py プロジェクト: Rosemary94/RTCR
def checkout(fq1_fn, fq2_fn, adapters, max_mm, search_rc, paired=False):
    """Demultiplex FASTQ records by adapter into per-sample output files.

    Every record of ``fq1_fn`` (with its mate from ``fq2_fn``, if given) is
    tested against all adapters. The adapter whose master match has the
    smallest first element wins; ties are broken in favour of the candidate
    whose slave match has the smaller first element. Winning records are
    written with ``UMI:<R1|R12|R2>:<umi[0]>:<umi[1]>`` added to the header
    line. Progress is written to stdout at most every 0.5 seconds and once
    the input has been read completely.

    Output files, created in the current directory for every sample id:
    ``<sample_id>_R12.fastq`` if ``paired`` else ``<sample_id>_R1.fastq``,
    plus ``<sample_id>_R2.fastq`` when ``fq2_fn`` is given.

    Args:
        fq1_fn: Name of the first FASTQ file; must not be None.
        fq2_fn: Name of the second FASTQ file, or None.
        adapters: Iterable of ``(sample_id, master, slave)`` triples; it is
            iterated once per record, so it must be re-iterable.
        max_mm: Passed to the adapters' ``locate_in``.
        search_rc: Passed to the adapters' ``locate_in``.
        paired: If true, the slave adapter is searched on the same record as
            the master instead of on the mate. Must not be combined with
            ``fq2_fn``.

    Raises:
        AssertionError: If ``fq1_fn`` is None, if ``paired`` is combined with
            ``fq2_fn``, or if a record and its mate have different ids.
    """
    assert fq1_fn is not None
    assert not (paired and fq2_fn is not None)

    # Single-argument call form: prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print('Handling file(s): %s' % ''.join(
        [fq1_fn, '' if fq2_fn is None else ', %s' % fq2_fn]))

    outfiles = {}
    fq1_file = None
    fq2_file = None
    try:
        fq1_file = zopen(fq1_fn, 'r')
        if isinstance(fq1_file, gzip.GzipFile):
            # Progress is measured on the compressed file on disk.
            fq1_filesize = os.path.getsize(fq1_file.name)
            fq1_filepos = fq1_file.fileobj.tell
        else:
            fq1_filesize = filesize(fq1_file)
            fq1_filepos = fq1_file.tell

        fq1 = FastqFormat.records_in(fq1_file, encoding=None)
        if fq2_fn is not None:
            fq2_file = zopen(fq2_fn, 'r')
            fq2 = FastqFormat.records_in(fq2_file, encoding=None)
        else:
            fq2 = None

        for (sample_id, master, slave) in adapters:
            if sample_id in outfiles:
                continue
            if paired:
                out1 = (open("%s_R12.fastq" % sample_id, 'w'), 'R12')
            else:
                out1 = (open("%s_R1.fastq" % sample_id, 'w'), 'R1')
            # Register out1 before opening out2 so that it is closed even if
            # opening the second file fails.
            outfiles[sample_id] = {"out1": out1, "out2": (None, None)}
            if fq2 is not None:
                outfiles[sample_id]["out2"] = (
                    open("%s_R2.fastq" % sample_id, 'w'), 'R2')

        n_accepted = 0
        prev_time = time()
        for i, r1 in enumerate(fq1):
            if fq2:
                r2 = next(fq2)
                assert (r1.id == r2.id)
            else:
                r2 = None

            # Demultiplex
            # NOTE(review): r1/r2 are rebound inside the adapter loop (mate
            # swap, reverse complement). The rebinding carries over to later
            # adapters and to the final write, which therefore uses the
            # records as left by the last adapter tried -- confirm intended.
            best_match = None
            for (sample_id, master, slave) in adapters:
                matches, matches_rc = master.locate_in(
                    r1.seq, max_mm, search_rc)

                master_match, is_rc = get_best_match(matches, matches_rc)

                # look for master on the mate
                if (not master_match or
                        master_match[0] < len(master.seq)) and fq2:
                    # master not found or no full length match found
                    matches2, matches_rc2 = master.locate_in(
                        r2.seq, max_mm, search_rc)

                    master_match2, is_rc2 = get_best_match(
                        matches2, matches_rc2)

                    if not master_match2 and not master_match:
                        # master not found on r1 nor on r2
                        continue

                    if not master_match or (
                            master_match2 and
                            master_match2[0] < master_match[0]):
                        master_match = master_match2
                        is_rc = is_rc2
                        # apparently strands are swapped
                        r1, r2 = r2, r1

                if master_match is None:
                    continue

                if is_rc:
                    # Translate the match position to the reverse-complemented
                    # read, then reverse-complement the record(s).
                    master_match = list(master_match)
                    master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                    master_match = tuple(master_match)
                    r1 = FastqRecord(id=r1.id + " rc",
                                     desc=r1.desc,
                                     seq=revcomp(r1.seq),
                                     qual_str=r1.qual_str[::-1])
                    if fq2:
                        r2 = FastqRecord(id=r2.id + " rc",
                                         desc=r2.desc,
                                         seq=revcomp(r2.seq),
                                         qual_str=r2.qual_str[::-1])

                # Master adapter has been found, retrieve its UMI (if it has
                # one)
                master_umi = ("", "")
                if master.has_UMI():  # get umi
                    master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                                master_match[1])
                    if master.UMI_length != len(master_umi[0]):
                        # failed to retrieve UMI from master adapter
                        continue

                # Look for slave adapter
                slave_match = None

                slave_umi = ("", "")
                if slave:  # has slave adapter
                    # NOTE(review): with paired false and no second file, r2
                    # is None here and r.seq raises AttributeError -- confirm
                    # that slave adapters are never used in that setup.
                    if paired:
                        r = r1
                    else:
                        r = r2

                    slave_matches, slave_matches_rc = slave.locate_in(
                        r.seq, max_mm, search_rc=search_rc)
                    slave_match, slave_is_rc = get_best_match(
                        slave_matches, slave_matches_rc)

                    if not slave_match:  # No slave found
                        continue

                    if slave.has_UMI():  # get umi
                        if slave_is_rc:
                            slave_umi_start = len(
                                r.seq) - (slave_match[1] + len(slave.seq))
                            slave_umi = slave.get_UMI(revcomp(r.seq),
                                                      r.qual_str[::-1],
                                                      slave_umi_start)
                        else:
                            slave_umi = slave.get_UMI(r.seq, r.qual_str,
                                                      slave_match[1])
                        if slave.UMI_length != len(slave_umi[0]):
                            continue

                # Prefer the smaller master value; on a tie prefer the
                # candidate with the smaller slave value.
                if not best_match or best_match[0][0] > master_match[0] or \
                   (best_match[0][0] == master_match[0] and
                    slave_match and
                    (not best_match[1] or not best_match[1][0] or
                     best_match[1][0] > slave_match[0])):
                    umi = [x + y for x, y in zip(master_umi, slave_umi)]
                    best_match = (master_match, slave_match, sample_id, umi)

            if best_match:
                master_match, slave_match, sample_id, umi = best_match
                for (r, (out, typename)) in (
                        (r1, outfiles[sample_id]["out1"]),
                        (r2, outfiles[sample_id]["out2"])):
                    if not out:
                        continue
                    out.write("@%s UMI:%s:%s:%s\n" %
                              (r.id, typename, umi[0], umi[1]))
                    out.write("%s\n+\n%s\n" % (r.seq, r.qual_str))
                n_accepted += 1

            frac = float(fq1_filepos()) / fq1_filesize
            if time() - prev_time > .5 or frac == 1.0:
                prev_time = time()
                stdout.write(
                    term.EL(2) + term.CHA(0) +
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)" % (
                        i + 1, frac * 100, n_accepted,
                        (100 * float(n_accepted) / (i + 1))))
                stdout.flush()

        stdout.write('\n')
    finally:
        # Close every handle opened above, also when demultiplexing fails.
        handles = [fq1_file, fq2_file]
        for sample_outs in outfiles.values():
            handles.extend(out for (out, typename) in sample_outs.values())
        for handle in handles:
            if handle is not None:
                handle.close()