def main():
    """Scatter-plot per-cluster read count against UMI (barcode) count.

    sys.argv[1]: FASTA whose record ids encode (cluster, read multiplicity).
    sys.argv[2]: FASTA whose record ids encode (cluster, UMI multiplicity).
    The figure is saved next to sys.argv[1] as read_cnt_to_umi_cnt.png.
    """
    with smart_open(sys.argv[1], "r") as fin:
        cluster_to_read_cnt = dict(
            parse_cluster_mult(rec.id) for rec in SeqIO.parse(fin, "fasta"))
    with smart_open(sys.argv[2], "r") as fin:
        cluster_to_umi_cnt = dict(
            parse_cluster_mult(rec.id) for rec in SeqIO.parse(fin, "fasta"))

    # Align both arrays on the same cluster order.
    clusters = list(cluster_to_read_cnt)
    read_cnt = np.array([cluster_to_read_cnt[c] for c in clusters])
    umi_cnt = np.array([cluster_to_umi_cnt[c] for c in clusters])

    plot = sns.regplot(umi_cnt, read_cnt, fit_reg=False)
    plot.set_ylabel("Read count")
    plot.set_xlabel("Barcode count")

    # Pad the x axis proportionally; the y axis is log-scaled, so pad it
    # multiplicatively instead.
    margin_coef = 0.01
    xmargin = margin_coef * (max(umi_cnt) - min(umi_cnt))
    ymargin = 0.1
    plt.ylim(1.0 / (1 + ymargin), max(read_cnt) * (1 + ymargin))
    plt.xlim(min(umi_cnt) - xmargin, max(umi_cnt) + xmargin)
    plt.yscale("log", nonposy="clip")
    plt.savefig(
        os.path.join(os.path.dirname(sys.argv[1]), "read_cnt_to_umi_cnt.png"))
    plt.close()
def run_presto(input_file, output_dir, log=None, remove_tmp=True):
    """Run pRESTO's CollapseSeq on *input_file* and reformat its output.

    The collapsed unique sequences are renamed to the repertoire id
    convention (cluster___<i>___size___<n>) and written to
    <output_dir>/final_repertoire.fa. The CollapseSeq wall-clock time is
    recorded in <output_dir>/time.txt. Temporary files are deleted unless
    remove_tmp is False.
    """
    if log is None:
        log = FakeLog()

    mkdir_p(output_dir)

    # CollapseSeq expects a plain FASTA; convert/gunzip the input first.
    converted_input = "%s/input_reads.fasta" % output_dir
    fastx2fastx(input_file, converted_input)

    cmd_params = {"input_file": converted_input, "output_dir": output_dir}

    timer = Timer()
    support.sys_call(
        "CollapseSeq.py -s %(input_file)s --outdir %(output_dir)s --outname presto"
        % cmd_params,
        log=log)
    timer.stamp(output_dir + "/time.txt")

    presto_output = output_dir + "/presto_collapse-unique.fasta"
    repertoire_fa = output_dir + "/final_repertoire.fa"
    with smart_open(presto_output) as fin, smart_open(repertoire_fa,
                                                      "w") as fout:
        records = SeqIO.parse(fin, idFormatByFileName(presto_output))
        for index, record in enumerate(records):
            # pRESTO encodes the cluster size in the record description.
            size = parse_presto_id(record.description)
            record.id = record.description = "cluster___%d___size___%d" % (
                index, size)
            SeqIO.write(record, fout, "fasta")

    if remove_tmp:
        os.remove(converted_input)
        os.remove(presto_output)
def convert_mixcr_output_to_igrec(input_file, output_file):
    """Convert a MiXCR clone table to IgReC-style FASTA.

    Each data line is expected to hold a sequence and its abundance,
    whitespace-separated; the first (header) line is discarded.
    """
    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        next(fin)  # drop the header line

        for index, line in enumerate(fin):
            seq, abundance = line.strip().split()
            fout.write(">cluster___%d___size___%d\n" % (index, int(abundance)))
            fout.write(seq + "\n")
def multiplex_repertoire(input_file, output_file):
    """Expand a repertoire into per-read copies ("multiplex" it).

    Each record whose description encodes (cluster, multiplicity) is
    written `multiplicity` times, every copy getting a unique antibody id.
    """
    output_format = idFormatByFileName(output_file)
    input_format = idFormatByFileName(input_file)
    # fastq output needs quality values, hence requires fastq input
    assert output_format == "fasta" or input_format == "fastq"

    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        for record in SeqIO.parse(fin, input_format):
            cluster, mult = parse_cluster_mult(str(record.description))
            for copy_no in xrange(mult):
                record.id = record.description = "antibody_%s_multiplicity_%d_copy_%d" % (
                    cluster, mult, copy_no + 1)
                SeqIO.write(record, fout, output_format)
def convert_abvitro_to_repertoire(input_file, output_file):
    """Rewrite AbVitro assembled-consensus headers as cluster___N___size___M ids."""
    output_format = idFormatByFileName(output_file)
    input_format = idFormatByFileName(input_file)
    # fastq output needs quality values, hence requires fastq input
    assert output_format == "fasta" or input_format == "fastq"

    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        for record in SeqIO.parse(fin, input_format):
            cluster, mult = parse_abvitro_assembled_header(
                str(record.description))
            record.id = record.description = "cluster___%s___size___%d" % (
                cluster, mult)
            SeqIO.write(record, fout, output_format)
def jit_fx_file(input_file,
                output_file,
                error_rate=2,
                random_errors=True,
                min_error=0,
                erroneous_site_len=10005000,
                seed=None):
    """Copy a FASTA/FASTQ file, injecting substitution errors into each read.

    For every record the number of errors is drawn from Poisson(error_rate)
    (or is exactly error_rate when random_errors is False), clamped to at
    least min_error.  Error positions are sampled without replacement from
    the first min(len(read), erroneous_site_len) bases; each chosen base is
    replaced via RC() -- presumably its complement, TODO confirm.  Formats
    are inferred from file extensions; qualities are carried over for
    fastq->fastq and invented for fasta->fastq output.
    """
    import numpy as np
    from Bio import Seq
    import random

    output_format = idFormatByFileName(output_file)
    input_format = idFormatByFileName(input_file)
    print seed  # NOTE(review): debug print left in place
    random.seed(seed)
    np.random.seed(seed)

    print np.random.ranf(1)  # NOTE(review): debug print left in place

    with smart_open(input_file) as fh, smart_open(output_file, "w") as fout:
        for record in SeqIO.parse(fh, input_format):
            # Number of errors for this read: Poisson draw or fixed rate.
            n_errors = np.random.poisson(error_rate,
                                         1)[0] if random_errors else error_rate
            if n_errors < min_error:
                n_errors = min_error

            # Distinct error positions within the eligible read prefix.
            # NOTE(review): raises ValueError when n_errors exceeds the read
            # length -- assumed not to happen for realistic inputs.
            positions = random.sample(
                range(min(len(record.seq), erroneous_site_len)), n_errors)
            s = list(str(record.seq))
            for pos in positions:
                s[pos] = RC(s[pos])

            if input_format == "fastq":
                # Save qualities and clear per-letter annotations before the
                # sequence is replaced below.
                phred_quality = record.letter_annotations["phred_quality"]
                record.letter_annotations = {}

            record.seq = Seq.Seq("".join(s))

            if output_format == "fastq":
                if input_format == "fastq":
                    # Keep the original qualities for the mutated read.
                    record.letter_annotations["phred_quality"] = phred_quality
                else:
                    # fasta input carries no qualities: fabricate plausible ones.
                    record.letter_annotations["phred_quality"] = [
                        random.randint(30, 50) for _ in xrange(len(record))
                    ]  # TODO(review): check whether random qualities suffice

            SeqIO.write(record, fout, output_format)
def convert_mixcr2_output_to_igrec(input_file, output_file, initial_reads,
                                   output_rcm):
    """Convert a MiXCR v2 clone table to an IgReC repertoire plus RCM map.

    input_file: tab-separated table with a header; each data line holds the
        clone sequence, a reported size, and a comma-separated list of
        0-based indices of the initial reads belonging to the clone.
    output_file: FASTA of clusters named cluster___<i>___size___<n>.
    initial_reads: original reads; reads not claimed by any clone are
        emitted as their own singleton clusters.
    output_rcm: read-to-cluster map, one tab-separated line per read.
    """
    with smart_open(initial_reads) as fh:
        record_ids = [
            str(record.description)
            for record in SeqIO.parse(fh, idFormatByFileName(initial_reads))
        ]

    # targets[j] = cluster index of the j-th initial read (None = unassigned).
    targets = [None] * len(record_ids)
    with smart_open(input_file) as fh, smart_open(output_file, "w") as fout:
        # Skip header
        fh.next()

        for i, line in enumerate(fh):
            seq, size, ids = line.strip().split("\t")
            ids = ids.strip().split(",")
            ids = map(int, ids)
            for id in ids:
                targets[id] = i
            size = int(size)
            # MiXCR's reported size may undercount the id list; the id list
            # is trusted instead. TODO(review): confirm why they disagree.
            assert size <= len(ids)
            size = len(ids)
            fout.write(">cluster___%d___size___%d\n" % (i, size))
            fout.write(seq + "\n")

        # Reads not claimed by any clone become singleton clusters numbered
        # after the last used cluster index.
        # NOTE(review): max() raises ValueError if the table had no data rows.
        empty_num = max(target for target in targets if target is not None) + 1
        with smart_open(initial_reads) as fh:
            for j, record in enumerate(
                    SeqIO.parse(fh, idFormatByFileName(initial_reads))):
                if targets[j] is None:
                    targets[j] = empty_num
                    empty_num += 1
                    fout.write(">cluster___%d___size___%d\n" % (targets[j], 1))
                    fout.write(str(record.seq) + "\n")

    # Emit the read-to-cluster map; by now every read must be assigned.
    with smart_open(output_rcm, "w") as rcm:
        for id, target_cluster in zip(record_ids, targets):
            assert target_cluster is not None
            rcm.write("%s\t%d\n" % (id, target_cluster))
# --- Example #8 (scraper artifact; original separator "Beispiel #8" / "0") ---
def generate_rcm(reads_file_name, compressed_file_name, cliques_ids_file_name,
                 out_file):
    """Write a read-to-clique map (RCM) file.

    reads_file_name: reads whose ids label the RCM rows.
    compressed_file_name: per-read index into the compressed read set.
    cliques_ids_file_name: per-compressed-read clique id.
    out_file: output, one tab-separated "<read id> <clique>" line per read.
    """
    # Read ids, in file order.
    with smart_open(reads_file_name, "r") as fin:
        read_ids = [
            str(rec.id)
            for rec in SeqIO.parse(fin, idFormatByFileName(reads_file_name))
        ]

    # compressed read index -> clique id
    with smart_open(cliques_ids_file_name, "r") as fin:
        compread2clique = [int(line) for line in fin]

    # original read index -> compressed read index
    with smart_open(compressed_file_name, "r") as fin:
        idmap = [int(line) for line in fin]

    with smart_open(out_file, "w") as fout:
        for i, read_id in enumerate(read_ids):
            fout.write("%s\t%d\n" % (read_id, compread2clique[idmap[i]]))
def simulated_repertoire_to_final_repertoire(input_file, output_file):
    """Keep only copy #1 of each simulated antibody, renamed to the
    cluster___<id>___size___<n> convention; fake qualities for fastq output."""
    import random

    output_format = idFormatByFileName(output_file)

    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        for record in SeqIO.parse(fin, "fasta"):
            cluster, size, copy = parse_final_repertoire_id(record.description)
            if copy != 1:
                continue
            record.id = record.description = "cluster___%s___size___%d" % (
                cluster, size)
            record.letter_annotations = {}

            if output_format == "fastq":
                # Input is fasta, so invent plausible quality scores.
                record.letter_annotations["phred_quality"] = [
                    random.randint(30, 50) for _ in xrange(len(record))
                ]  # TODO(review): check whether random qualities suffice

            SeqIO.write(record, fout, output_format)
# --- Example #10 (scraper artifact; original separator "Beispiel #10" / "0") ---
def parse_vjf_output(filename, readfile):
    """Parse a VJ Finder tab-separated hit table.

    Returns a defaultdict keyed both by read description (with spaces
    replaced by underscores, matching *readfile*) and by the read's
    0-based index in *readfile*; each value maps "V"/"J" to a
    HitTableRowVJF for that read.
    """
    from collections import defaultdict

    # read description -> position of the read in the reads file
    with smart_open(readfile, "rU") as fh:
        records = SeqIO.parse(fh, idFormatByFileName(readfile))
        descr_to_ind = {
            str(rec.description).replace(" ", "_"): i
            for i, rec in enumerate(records)
        }

    result = defaultdict(dict)
    with open(filename) as csv_file:
        reader = csv.reader(csv_file, delimiter="\t")
        headers = next(reader)

        # Columns: Read_name Chain_type V_hit V_start_pos V_end_pos V_score
        #          J_hit J_start_pos J_end_pos J_score
        id_col = linear_search(headers, "Read_name")
        Vstart_col = linear_search(headers, "V_start_pos")
        Vend_col = linear_search(headers, "V_end_pos")
        Vgene_col = linear_search(headers, "V_hit")
        Jgene_col = linear_search(headers, "J_hit")
        Jstart_col = linear_search(headers, "J_start_pos")
        Jend_col = linear_search(headers, "J_end_pos")

        for row in reader:
            desc = row[id_col]
            ind = descr_to_ind[desc]

            hits = result[desc]
            hits["V"] = HitTableRowVJF("V", desc, row[Vgene_col],
                                       int(row[Vstart_col]),
                                       int(row[Vend_col]))
            hits["J"] = HitTableRowVJF("J", desc, row[Jgene_col],
                                       int(row[Jstart_col]),
                                       int(row[Jend_col]))
            # Alias the same hit dict under the read's index as well.
            result[ind] = hits

        return result
# --- Example #11 (scraper artifact; original separator "Beispiel #11" / "0") ---
                        type=str,
                        help="input FASTA/FASTQ file with abundances in ids")
    parser.add_argument("output", type=str, help="output FASTA/FASTQ file")
    parser.add_argument("--limit",
                        "-l",
                        type=int,
                        default=5,
                        help="size limit (default: %(default)s)")

    args = parser.parse_args()

    print "Supernode reporter started..."
    print "Command line: %s" % " ".join(sys.argv)

    input_size = output_size = 0
    with smart_open(args.input, "r") as fin, smart_open(args.output,
                                                        "w") as fout:
        for record in SeqIO.parse(fin, idFormatByFileName(args.input)):
            input_size += 1
            id = str(record.description)
            size = parse_size(id)
            assert id is not None
            if size >= args.limit:
                SeqIO.write(record, fout, idFormatByFileName(args.output))
                output_size += 1

    print "%d antibody clusters have abundance >= %d" % (output_size,
                                                         args.limit)
    print "%d lowly abundant antibody clusters will be discarded" % (
        input_size - output_size, )
    print "Highly abundant clusters were written to " + args.output
    def stamp(self, filename):
        """Write the elapsed time (seconds, via self.delta()) to *filename*
        and return it."""
        elapsed = self.delta()
        with smart_open(filename, "w") as out:
            out.write("%f\n" % elapsed)
        return elapsed
    parser.add_argument("--output-repertoire",
                        "-r",
                        type=str,
                        help="output file with repertoire sequences")
    parser.add_argument("--output-rcm",
                        "-R",
                        type=str,
                        help="output file with repertoire RCM")

    args = parser.parse_args()

    print "Construct repertoire from TrieCompressor output..."
    print "Command line: %s" % " ".join(sys.argv)

    # Fix ids
    with smart_open(args.input_compressed) as fin, smart_open(
            args.output_repertoire, "w") as fout:
        for i, record in enumerate(
                SeqIO.parse(fin, idFormatByFileName(args.input_compressed))):
            id = record.description
            size = parse_size(id)
            record.id = record.description = "cluster___%d___size___%d" % (
                i, size)
            SeqIO.write(record, fout,
                        idFormatByFileName(args.output_repertoire))

    with smart_open(args.input_reads) as fin_reads, smart_open(
            args.input_map) as fin_map, smart_open(args.output_rcm,
                                                   "w") as fout_rcm:
        for record, cluster in izip(
                SeqIO.parse(fin_reads, idFormatByFileName(args.input_reads)),
# --- Example #14 (scraper artifact; original separator "Beispiel #14" / "0") ---
                        type=int,
                        default=10,
                        help="distance threshold [default %(default)d]")
    parser.add_argument("--lengths",
                        type=str,
                        help="file for read length stats")
    # parser.add_argument("--subs-map", "-M",
    #                     type=str,
    #                     help="file for subs table")

    args = parser.parse_args()

    barcodes_count = defaultdict(int)

    print("Reading library...")
    with smart_open(args.input, "r") as fh:
        data = list(SeqIO.parse(fh, idFormatByFileName(args.input)))

    if not args.no_fix_spaces:
        for record in data:
            record.description = str(record.description).replace(" ", "_")
            record.id = record.name = record.description

    # Omit reads with Ns
    data = [record for record in data if record.seq.count("N") == 0]

    data = [
        record for record in data if extract_barcode(record.id) is not None
    ]

    clusters = defaultdict(list)
# --- Example #15 (scraper artifact; original separator "Beispiel #15" / "0") ---
if __name__ == "__main__":
    # Benchmark VJ Finder hits against IgBLAST on the same read set.
    args = parse_command_line()

    log = CreateLogger("VJF benchmark")
    if args.log:
        AttachFileLogger(log, args.log)

    # Germline J segment id -> sequence.
    with open(args.germline_J_file, "rU") as fh:
        germline_J_map = {
            str(rec.id): str(rec.seq)
            for rec in SeqIO.parse(fh, "fasta")
        }

    igblast_hits = get_igblast_output(args)
    vjf_hits = get_vjf_output(args)

    with smart_open(args.input, "rU") as fh:
        reads = list(SeqIO.parse(fh, idFormatByFileName(args.input)))

    # Sanity checks: one IgBLAST hit per read, unique descriptions.
    assert len(reads) == len(igblast_hits)
    ids = [str(rec.description) for rec in reads]
    assert len(set(ids)) == len(reads)

    stats = benchmark_stats(igblast_hits, vjf_hits, germline_J_map)

    if args.bad_reads:
        with smart_open(args.bad_reads, "w") as fout:
            misbehaving = [reads[i] for i in stats.bad_reads]
            SeqIO.write(misbehaving, fout, idFormatByFileName(args.bad_reads))
        log.info("Bad reads were written to " + args.bad_reads)

    log.info("Overall reads %d" % stats.all)
# --- Example #16 (scraper artifact; original separator "Beispiel #16" / "0") ---
    parser.add_argument("--input", "-i",
                        required=True,
                        type=str,
                        help="input FASTA/FASTQ file with abundances in ids")
    parser.add_argument("--output", "-o",
                        required=True,
                        type=str,
                        help="output FASTA/FASTQ file")
    parser.add_argument("--id-map", "-m",
                        type=str,
                        default="",
                        help="map file name; empty (default) for non-producing")

    args = parser.parse_args()

    print "Fake trie_compressor started"
    read_count = 0
    with smart_open(args.input, "r") as fin, smart_open(args.output, "w") as fout:
        for record in SeqIO.parse(fin, idFormatByFileName(args.input)):
            id = str(record.description)
            record.id = record.description = id + "__size__1"
            SeqIO.write(record, fout, idFormatByFileName(args.output))
            read_count += 1

    if args.id_map:
        with smart_open(args.id_map, "w") as f_id_map:
            for i in xrange(read_count):
                f_id_map.write("%d\n" % i)

    print "Fake trie_compressor finished"