Example #1
def main():
    import re

    from Bio import SeqIO

    log = CreateLogger()
    params = ParseCommandLineParams(log)
    log.info("Reading reads from %s" % params.input_path)
    with smart_open(params.input_path) as reads_file:
        records = list(SeqIO.parse(reads_file, "fastq"))
    log.info("Read %d reads" % len(records))

    log.info("Converting records")
    cnt = 0
    for record in records:
        id = record.id
        import re
        match = re.match(r"^(.*)_UMI:(.*)$", id)
        record.description = ""
        groups = match.groups()
        record.id = "%s UMI:%s:%s" % (groups[0], groups[1],
                                      "S" * len(groups[1]))
        cnt += 1

    log.info("Writing output")
    with smart_open(params.output_path, "w") as output_file:
        for record in records:
            SeqIO.write(record, output_file, "fastq")
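Every example opens its files through a smart_open helper that is not defined in these snippets. Below is a minimal sketch of what it is assumed to do, namely open .gz files transparently and plain files otherwise; the implementation is an assumption, not the original helper:

def smart_open(path, mode="r"):
    # Hypothetical sketch: pick gzip for .gz paths, plain open() otherwise.
    # On Python 3 you may want mode + "t" for gzip to get a text-mode handle.
    import gzip
    if path.endswith(".gz"):
        return gzip.open(path, mode)
    return open(path, mode)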
Example #2
def main():
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    log = CreateLogger()
    params = ParseCommandLineParams(log)
    log.info("Reading reads from %s" % params.repertoire_path)
    records = []
    with open(params.repertoire_path) as input_file:
        header = input_file.readline().rstrip("\n").split("\t")
        sequence_column = header.index("Clonal sequence(s)")
        size_column = header.index("Clone count")
        cluster_id = 0
        for line in input_file:
            # Stop at the first blank line; iterating over a file never
            # yields a zero-length string, so strip before checking.
            if not line.strip():
                break
            info = line.rstrip("\n").split("\t")
            record = SeqRecord(seq=Seq(info[sequence_column]),
                               id="cluster___%d___size___%d" %
                               (cluster_id, int(info[size_column])),
                               description="")
            records.append(record)
            cluster_id += 1

    log.info("Read %d reads" % len(records))

    log.info("Writing output")
    with smart_open(params.output_path, "w") as output_file:
        for record in records:
            SeqIO.write(record, output_file, "fasta")
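CreateLogger is another helper the examples leave undefined. A plausible minimal version built on the standard logging module; the logger name, format string, and level are assumptions:

def CreateLogger():
    # Hypothetical sketch: a console logger like the one the examples assume.
    import logging
    log = logging.getLogger("umi_examples")
    log.setLevel(logging.INFO)
    if not log.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
        log.addHandler(handler)
    return log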
Example #3
def ReadInput(log, params):
    from Bio import SeqIO
    from rcm_utils import read_rcm_list

    log.info("Reading reads from %s" % params.reads_path)
    with smart_open(params.reads_path) as reads_file:
        read_id_to_read = {record.id: record.seq
                           for record in SeqIO.parse(reads_file, "fasta")}
    log.info("Read %d reads" % len(read_id_to_read))

    log.info("Reading rcm from %s" % params.rcm_path)
    from rcm_utils import read_rcm_list
    rcm = read_rcm_list(params.rcm_path)
    log.info("Read %d mappings" % len(rcm))

    log.info("Reading UMIs from %s" % params.umi_path)
    with smart_open(params.umi_path) as umis_file:
        umis = list(SeqIO.parse(umis_file, "fastq"))
    log.info("Read %d UMIs" % len(umis))

    return read_id_to_read, rcm, umis
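read_rcm_list from rcm_utils is not shown either. Since Example #5 consumes its result as "for read, cluster in rcm", it presumably returns (read_id, cluster_id) pairs; a sketch under the assumption that the RCM file is tab-separated with one mapping per line:

def read_rcm_list(rcm_path):
    # Hypothetical sketch: parse "read_id<TAB>cluster_id" lines into pairs.
    rcm = []
    with smart_open(rcm_path) as rcm_file:
        for line in rcm_file:
            line = line.strip()
            if not line:
                continue
            read_id, cluster_id = line.split("\t")
            rcm.append((read_id, cluster_id))
    return rcm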
Example #4
def main():
    import re

    from Bio import SeqIO

    log = CreateLogger()
    params = ParseCommandLineParams(log)
    log.info("Reading reads from %s" % params.repertoire_path)
    with smart_open(params.repertoire_path) as reads_file:
        records = list(SeqIO.parse(reads_file, "fasta"))
    log.info("Read %d reads" % len(records))

    log.info("Converting records")
    cnt = 0
    for record in records:
        id = record.id
        import re
        match = re.match(r"^.*CONSCOUNT=(\d+).*$", id)
        record.description = ""
        record.id = "cluster___%d___size___%d" % (cnt, int(match.groups()[0]))
        cnt += 1

    log.info("Writing output")
    with smart_open(params.output_path, "w") as output_file:
        for record in records:
            SeqIO.write(record, output_file, "fasta")
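ParseCommandLineParams is the remaining undefined helper. The attribute names used across the examples (input_path, output_path, repertoire_path, reads_path, rcm_path, umi_path) suggest an argparse namespace; a minimal sketch in which every option name is an assumption:

def ParseCommandLineParams(log):
    # Hypothetical sketch: the real option names and defaults may differ.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", dest="input_path")
    parser.add_argument("--output", dest="output_path")
    parser.add_argument("--repertoire", dest="repertoire_path")
    parser.add_argument("--reads", dest="reads_path")
    parser.add_argument("--rcm", dest="rcm_path")
    parser.add_argument("--umi", dest="umi_path")
    params = parser.parse_args()
    log.info("Command line parameters parsed")
    return params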
Example #5
def ReportCorrectedUmiErrors(log, read_id_to_read, rcm, umis, output_path):
    from umi_tools.count_equal_reads_with_close_umis import dist

    # Map each read to its UMI and group read ids by UMI.
    read_id_to_umi = dict()
    umi_to_read_ids = dict()
    for record in umis:
        sequence = str(record.seq)
        read_id_to_umi[record.id] = sequence
        if sequence not in umi_to_read_ids:
            umi_to_read_ids[sequence] = []
        umi_to_read_ids[sequence].append(record.id)
    assert len(umis) == len(read_id_to_umi)

    # Group read ids by cluster.
    cluster_to_read_id = dict()
    for read, cluster in rcm:
        if cluster not in cluster_to_read_id:
            cluster_to_read_id[cluster] = []
        cluster_to_read_id[cluster].append(read)

    output_file = smart_open(output_path, "w")
    adjacent_pairs = 0
    large_pairs = 0
    become_large = 0
    both_significant = 0
    corrected_reads = set()
    for cluster, ids in cluster_to_read_id.iteritems():
        cluster_umis = set()
        for read_id in ids:
            cluster_umis.add(read_id_to_umi[read_id])
        # Examine every unordered pair of distinct UMIs within the cluster.
        for umi1 in cluster_umis:
            for umi2 in cluster_umis:
                if umi1 >= umi2:
                    continue
                if dist(umi1, umi2) == 1:
                    adjacent_pairs += 1
                    # Collect reads from the two UMIs that have at least one
                    # close counterpart (within MAX_READ_DIST) in the other.
                    read_ids1 = set()
                    read_ids2 = set()
                    for read_id1 in umi_to_read_ids[umi1]:
                        for read_id2 in umi_to_read_ids[umi2]:
                            if dist(read_id_to_read[read_id1],
                                    read_id_to_read[read_id2],
                                    MAX_READ_DIST) <= MAX_READ_DIST:
                                read_ids1.add(read_id1)
                                read_ids2.add(read_id2)

                    size1, size2 = len(read_ids1), len(read_ids2)
                    if (max(size1, size2) < LARGE_CLUSTER_SIZE
                            and size1 + size2 >= LARGE_CLUSTER_SIZE):
                        become_large += 1
                    if min(size1, size2) >= SIGNIFICANT_CLUSTER_SIZE:
                        both_significant += 1
                    if (size1 + size2 < SUM_SIZE_THRESHOLD
                            or max(size1, size2) < MAX_SIZE_THRESHOLD):
                        continue
                    large_pairs += 1
                    output_file.write("New pair: %s %s\n" % (umi1, umi2))
                    output_file.write(
                        "%d reads from first + %d reads from second\n" %
                        (size1, size2))
                    for read_id1 in read_ids1:
                        output_file.write(
                            ">%s\n%s\n" %
                            (read_id1, read_id_to_read[read_id1]))
                    output_file.write("\n")
                    for read_id2 in read_ids2:
                        output_file.write(
                            ">%s\n%s\n" %
                            (read_id2, read_id_to_read[read_id2]))
                    output_file.write("-----------------\n")

                    not_in_cluster = 0
                    for read_id in list(read_ids1) + list(read_ids2):
                        if read_id not in ids:
                            not_in_cluster += 1
                    log.info("%d out of %d reads are actually not in the "
                             "cluster (sizes: %d and %d)" %
                             (not_in_cluster, size1 + size2, size1, size2))

                    # Treat the smaller side of the pair as corrected.
                    read_ids = read_ids1 if size1 < size2 else read_ids2
                    for read_id in read_ids:
                        log.info(">%s\n%s" %
                                 (read_id, read_id_to_read[read_id]))
                    corrected_reads.update(read_ids)

    output_file.close()

    log.info("Total %d of adjacent pairs found" % adjacent_pairs)
    log.info(
        "Total %d of them have at least %d in total and at least %d in the largest"
        % (large_pairs, SUM_SIZE_THRESHOLD, MAX_SIZE_THRESHOLD))
    log.info("%d become large >= %d" % (become_large, LARGE_CLUSTER_SIZE))
    log.info("%d have both at least %d" %
             (both_significant, SIGNIFICANT_CLUSTER_SIZE))
    log.info("%d total reads corrected" % len(corrected_reads))