def multiplex_repertoire(input_file, output_file):
    """Expand every cluster record into one copy per unit of multiplicity.

    The cluster name and multiplicity are parsed from each record's header,
    and the record is then emitted `mult` times with headers of the form
    "antibody_<cluster>_multiplicity_<mult>_copy_<k>".
    """
    out_fmt = idFormatByFileName(output_file)
    in_fmt = idFormatByFileName(input_file)
    # fastq output needs per-base qualities, hence it requires fastq input.
    assert out_fmt == "fasta" or in_fmt == "fastq"

    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        for rec in SeqIO.parse(fin, in_fmt):
            cluster, mult = parse_cluster_mult(str(rec.description))
            for k in xrange(1, mult + 1):
                rec.id = rec.description = (
                    "antibody_%s_multiplicity_%d_copy_%d" % (cluster, mult, k))
                SeqIO.write(rec, fout, out_fmt)
def run_mixcr2_alignment_only(input_file,
                              output_dir,
                              log=None,
                              loci="all",
                              enforce_fastq=False,
                              threads=16,
                              remove_tmp=True,
                              species="hsa"):
    """Run only the MiXCR v2 'align' stage and stamp its wall-clock time.

    Writes mixcr.vdjca and align_report.txt into output_dir and records the
    elapsed time in output_dir/time.txt.  When remove_tmp is True (the
    default) both alignment outputs and any converted input copy are deleted
    afterwards, leaving only the timing file -- i.e. with default flags this
    function effectively benchmarks the alignment step.

    :param input_file: reads in FASTA/FASTQ (smart-open-compatible path)
    :param output_dir: output directory, created if it does not exist
    :param log: logger passed to sys_call; a FakeLog() is used when None
    :param loci: value for MiXCR's --chains option ("all" by default)
    :param enforce_fastq: convert FASTA input to FASTQ before aligning
    :param threads: MiXCR worker thread count (-t)
    :param remove_tmp: delete temporary and intermediate files at the end
    :param species: MiXCR --species identifier (e.g. "hsa" for human)
    """
    if log is None:
        log = FakeLog()

    mkdir_p(output_dir)

    # Re-materialize FASTA input inside output_dir: up-converted to FASTQ
    # when enforce_fastq is set, otherwise re-written as plain FASTA
    # (e.g. to decompress).  input_file_tmp remembers what to delete later.
    if enforce_fastq and idFormatByFileName(input_file) == "fasta":
        input_file_fq = "%s/input_reads.fq" % output_dir
        fastx2fastx(input_file, input_file_fq)
        input_file = input_file_tmp = input_file_fq
    elif idFormatByFileName(input_file) == "fasta":
        input_file_fasta = "%s/input_reads.fasta" % output_dir
        fastx2fastx(input_file, input_file_fasta)
        input_file = input_file_tmp = input_file_fasta
    else:
        input_file_tmp = None

    path = path_to_mixcr2
    args = {
        "path": path,
        "compress_eq_clusters_cmd":
        path_to_igrec + "/py/ig_compress_equal_clusters.py",
        "mixcr_cmd": "java -jar %s/mixcr.jar" % path,
        "threads": threads,
        "input_file": input_file,
        "output_dir": output_dir,
        "species": species,
        "loci": loci,
        "loci_arg": "chains"
    }

    # Previous invocation (without kaligner2 / partial alignments), kept
    # for reference:
    # support.sys_call("%(mixcr_cmd)s align -t %(threads)d -f -g -r %(output_dir)s/align_report.txt --%(loci_arg)s %(loci)s --noMerge --species %(species)s %(input_file)s %(output_dir)s/mixcr.vdjca" % args,
    #                  log=log)
    timer = Timer()
    support.sys_call(
        "%(mixcr_cmd)s align -p kaligner2 --species %(species)s -t %(threads)d -f -g -r %(output_dir)s/align_report.txt --noMerge --%(loci_arg)s %(loci)s -OreadsLayout=Collinear -OvParameters.geneFeatureToAlign=VTranscript -OallowPartialAlignments=true %(input_file)s %(output_dir)s/mixcr.vdjca"
        % args,
        log=log)
    timer.stamp(output_dir + "/time.txt")

    if remove_tmp:
        if input_file_tmp is not None:
            os.remove(input_file_tmp)

        os.remove(output_dir + "/align_report.txt")
        os.remove(output_dir + "/mixcr.vdjca")
def convert_abvitro_to_repertoire(input_file, output_file):
    """Rename AbVitro assembled-consensus headers to repertoire-style ids.

    For each record, the cluster name and multiplicity are parsed from the
    AbVitro header and the record is re-emitted with the header
    "cluster___<cluster>___size___<mult>".
    """
    out_fmt = idFormatByFileName(output_file)
    in_fmt = idFormatByFileName(input_file)
    # fastq output needs per-base qualities, hence it requires fastq input.
    assert out_fmt == "fasta" or in_fmt == "fastq"

    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        for rec in SeqIO.parse(fin, in_fmt):
            cluster, mult = parse_abvitro_assembled_header(
                str(rec.description))
            new_header = "cluster___%s___size___%d" % (cluster, mult)
            rec.id = rec.description = new_header
            SeqIO.write(rec, fout, out_fmt)
def run_presto(input_file, output_dir, log=None, remove_tmp=True):
    """Run pRESTO's CollapseSeq.py and convert its output to a repertoire.

    The input is first re-written as plain FASTA into output_dir (this also
    handles gzipped input), collapsed into unique sequences by
    CollapseSeq.py, and the collapsed records are renumbered into
    output_dir/final_repertoire.fa with headers
    "cluster___<i>___size___<size>".  Elapsed time of the CollapseSeq call
    is stamped into output_dir/time.txt.

    :param input_file: reads in FASTA/FASTQ (possibly gzipped)
    :param output_dir: output directory, created if it does not exist
    :param log: logger passed to sys_call; a FakeLog() is used when None
    :param remove_tmp: delete the converted input and raw pRESTO output
    """
    if log is None:
        log = FakeLog()

    mkdir_p(output_dir)

    # Re-write the input as plain FASTA (also decompresses gzipped input).
    input_file_new = "%s/input_reads.fasta" % output_dir
    fastx2fastx(input_file, input_file_new)

    args = {"input_file": input_file_new, "output_dir": output_dir}

    timer = Timer()
    support.sys_call(
        "CollapseSeq.py -s %(input_file)s --outdir %(output_dir)s --outname presto"
        % args,
        log=log)
    timer.stamp(output_dir + "/time.txt")

    presto_output = output_dir + "/presto_collapse-unique.fasta"
    repertoire_fa = output_dir + "/final_repertoire.fa"
    with smart_open(presto_output) as fin, smart_open(repertoire_fa,
                                                      "w") as fout:
        for i, record in enumerate(
                SeqIO.parse(fin, idFormatByFileName(presto_output))):
            id = record.description
            # Cluster abundance extracted from the pRESTO header --
            # presumably the DUPCOUNT annotation; see parse_presto_id.
            size = parse_presto_id(id)
            record.id = record.description = "cluster___%d___size___%d" % (
                i, size)
            SeqIO.write(record, fout, "fasta")

    if remove_tmp:
        os.remove(input_file_new)
        os.remove(presto_output)
def jit_fx_file(input_file,
                output_file,
                error_rate=2,
                random_errors=True,
                min_error=0,
                erroneous_site_len=10005000,
                seed=None):
    """Jitter a FASTA/FASTQ file by injecting substitution errors per read.

    For each record, n_errors distinct positions among the first
    erroneous_site_len bases are substituted; n_errors is a Poisson draw
    with mean error_rate when random_errors is True, otherwise exactly
    error_rate, and never below min_error.  A substituted base is replaced
    via RC() applied to the single character (on one base this yields its
    complement).  Qualities are carried over for fastq->fastq, and
    fabricated (uniform 30..50) for fasta->fastq output.

    :param seed: seeds both random and numpy.random for reproducibility
    :raises ValueError: from random.sample when n_errors exceeds the number
        of eligible positions in a (short) read
    """
    import numpy as np
    from Bio import Seq
    import random

    output_format = idFormatByFileName(output_file)
    input_format = idFormatByFileName(input_file)
    print seed
    random.seed(seed)
    np.random.seed(seed)

    # Debug print: one RNG draw to make the seeded stream visible in logs.
    # NOTE(review): this consumes one value from the numpy RNG stream, so
    # removing it would change all subsequent Poisson draws.
    print np.random.ranf(1)

    with smart_open(input_file) as fh, smart_open(output_file, "w") as fout:
        for record in SeqIO.parse(fh, input_format):
            n_errors = np.random.poisson(error_rate,
                                         1)[0] if random_errors else error_rate
            if n_errors < min_error:
                n_errors = min_error

            # Distinct error positions, restricted to the read prefix.
            positions = random.sample(
                range(min(len(record.seq), erroneous_site_len)), n_errors)
            s = list(str(record.seq))
            for pos in positions:
                s[pos] = RC(s[pos])

            # Stash qualities: Biopython forbids replacing .seq while
            # letter_annotations are present.
            if input_format == "fastq":
                phred_quality = record.letter_annotations["phred_quality"]
                record.letter_annotations = {}

            record.seq = Seq.Seq("".join(s))

            if output_format == "fastq":
                if input_format == "fastq":
                    record.letter_annotations["phred_quality"] = phred_quality
                else:
                    record.letter_annotations["phred_quality"] = [
                        random.randint(30, 50) for _ in xrange(len(record))
                    ]  # TODO Check it out

            SeqIO.write(record, fout, output_format)
def convert_mixcr2_output_to_igrec(input_file, output_file, initial_reads,
                                   output_rcm):
    """Convert a MiXCR v2 exportClones table to an IgReC repertoire + RCM.

    input_file is a tab-separated table (one header line, then
    seq<TAB>size<TAB>comma-separated read indices) as produced by
    "exportClones -sequence -count -readIds".  Clones are written to
    output_file as ">cluster___<i>___size___<n>" FASTA records; every read
    from initial_reads that MiXCR left unassigned is appended as its own
    singleton cluster.  The final read->cluster map is written to
    output_rcm, one "<read id>\\t<cluster>" line per read.
    """
    with smart_open(initial_reads) as fh:
        record_ids = [
            str(record.description)
            for record in SeqIO.parse(fh, idFormatByFileName(initial_reads))
        ]

    # targets[j] = cluster index of the j-th initial read (None = unassigned).
    targets = [None] * len(record_ids)
    with smart_open(input_file) as fh, smart_open(output_file, "w") as fout:
        # Skip header
        fh.next()

        for i, line in enumerate(fh):
            seq, size, ids = line.strip().split("\t")
            ids = ids.strip().split(",")
            ids = map(int, ids)
            for id in ids:
                targets[id] = i
            size = int(size)
            # The clone size reported by MiXCR is at most the number of
            # constituent reads; the read count is used as the size below.
            assert size <= len(ids)  # WHY?????????????
            # if size != len(ids):
            #     print size
            #     print ids
            size = len(ids)
            fout.write(">cluster___%d___size___%d\n" % (i, size))
            fout.write(seq + "\n")

        # First free cluster index for unassigned reads.
        # NOTE(review): raises ValueError when no read was assigned at all
        # (empty generator passed to max) -- confirm that case cannot occur.
        empty_num = max(target for target in targets if target is not None) + 1
        with smart_open(initial_reads) as fh:
            for j, record in enumerate(
                    SeqIO.parse(fh, idFormatByFileName(initial_reads))):
                if targets[j] is None:
                    # Emit the unassigned read as a singleton cluster.
                    targets[j] = empty_num
                    empty_num += 1
                    fout.write(">cluster___%d___size___%d\n" % (targets[j], 1))
                    fout.write(str(record.seq) + "\n")

    with smart_open(output_rcm, "w") as rcm:
        for id, target_cluster in zip(record_ids, targets):
            assert target_cluster is not None
            rcm.write("%s\t%d\n" % (id, target_cluster))
# --- Esempio n. 7 (scraper artifact: example separator) ---
def generate_rcm(reads_file_name, compressed_file_name, cliques_ids_file_name,
                 out_file):
    """Write a read->clique RCM file for the original (uncompressed) reads.

    compressed_file_name holds, one integer per line, the compressed-read
    index of each original read; cliques_ids_file_name holds the clique id
    of each compressed read.  Their composition is written to out_file as
    one "<read id> TAB <clique>" line per original read.
    """
    # Ids of the original reads, in file order.
    with smart_open(reads_file_name, "r") as fh:
        read_ids = [
            str(rec.id)
            for rec in SeqIO.parse(fh, idFormatByFileName(reads_file_name))
        ]

    # Clique id of each compressed read.
    with smart_open(cliques_ids_file_name, "r") as fh:
        compread2clique = [int(line) for line in fh]

    # Compressed-read index of each original read.
    with smart_open(compressed_file_name, "r") as fh:
        idmap = [int(line) for line in fh]

    with smart_open(out_file, "w") as fh:
        for read_id, comp_ind in zip(read_ids, idmap):
            fh.write("%s\t%d\n" % (read_id, compread2clique[comp_ind]))
def simulated_repertoire_to_final_repertoire(input_file, output_file):
    """Collapse a multiplexed simulated repertoire back to one record each.

    Only the first copy (copy == 1) of every simulated cluster is kept; its
    header is rewritten as "cluster___<cluster>___size___<size>".  When the
    output format is fastq, artificial qualities are fabricated since the
    fasta input carries none.
    """
    import random

    out_fmt = idFormatByFileName(output_file)

    with smart_open(input_file) as fin, smart_open(output_file, "w") as fout:
        for rec in SeqIO.parse(fin, "fasta"):
            cluster, size, copy = parse_final_repertoire_id(rec.description)
            if copy != 1:
                continue
            rec.id = rec.description = ("cluster___%s___size___%d" %
                                        (cluster, size))
            rec.letter_annotations = {}

            if out_fmt == "fastq":
                rec.letter_annotations["phred_quality"] = [
                    random.randint(30, 50) for _ in xrange(len(rec))
                ]  # TODO Check it out

            SeqIO.write(rec, fout, out_fmt)
# --- Esempio n. 9 (scraper artifact: example separator) ---
def parse_vjf_output(filename, readfile):
    """Parse a VJFinder tab-separated alignment report.

    Returns a defaultdict keyed both by the space-sanitized read
    description and by the read's ordinal index in readfile; each value is
    a dict with "V" and "J" HitTableRowVJF entries.  Both keys alias the
    same inner dict.
    """
    from collections import defaultdict

    # Ordinal position of every read, keyed by its sanitized description
    # (spaces replaced by underscores, matching VJFinder's output).
    with smart_open(readfile, "rU") as handle:
        descr_to_ind = {}
        for pos, rec in enumerate(
                SeqIO.parse(handle, idFormatByFileName(readfile))):
            descr_to_ind[str(rec.description).replace(" ", "_")] = pos

    hits = defaultdict(dict)
    with open(filename) as tsv:
        rows = csv.reader(tsv, delimiter="\t")
        header = rows.next()

        # Header columns:
        # Read_name Chain_type V_hit V_start_pos V_end_pos V_score
        # J_hit J_start_pos J_end_pos J_score
        id_col = linear_search(header, "Read_name")
        vstart_col = linear_search(header, "V_start_pos")
        vend_col = linear_search(header, "V_end_pos")
        vgene_col = linear_search(header, "V_hit")
        jgene_col = linear_search(header, "J_hit")
        jstart_col = linear_search(header, "J_start_pos")
        jend_col = linear_search(header, "J_end_pos")

        for row in rows:
            desc = row[id_col]

            vstart, vend = int(row[vstart_col]), int(row[vend_col])
            jstart, jend = int(row[jstart_col]), int(row[jend_col])

            vgene = row[vgene_col]
            jgene = row[jgene_col]

            ind = descr_to_ind[desc]
            entry = hits[desc]
            entry["V"] = HitTableRowVJF("V", desc, vgene, vstart, vend)
            entry["J"] = HitTableRowVJF("J", desc, jgene, jstart, jend)
            # Index key shares the very same dict as the description key.
            hits[ind] = entry

        return hits
# --- Esempio n. 10 (scraper artifact: example separator) ---
    parser.add_argument("output", type=str, help="output FASTA/FASTQ file")
    parser.add_argument("--limit",
                        "-l",
                        type=int,
                        default=5,
                        help="size limit (default: %(default)s)")

    args = parser.parse_args()

    print "Supernode reporter started..."
    print "Command line: %s" % " ".join(sys.argv)

    input_size = output_size = 0
    with smart_open(args.input, "r") as fin, smart_open(args.output,
                                                        "w") as fout:
        for record in SeqIO.parse(fin, idFormatByFileName(args.input)):
            input_size += 1
            id = str(record.description)
            size = parse_size(id)
            assert id is not None
            if size >= args.limit:
                SeqIO.write(record, fout, idFormatByFileName(args.output))
                output_size += 1

    print "%d antibody clusters have abundance >= %d" % (output_size,
                                                         args.limit)
    print "%d lowly abundant antibody clusters will be discarded" % (
        input_size - output_size, )
    print "Highly abundant clusters were written to " + args.output
    print "Supernode reporter done"
def run_mixcr2(input_file,
               output_dir,
               log=None,
               loci="all",
               enforce_fastq=False,
               threads=16,
               remove_tmp=True,
               species="hsa",
               region_from="FR1Begin",
               region_to="FR4Begin"):
    """Run the full MiXCR v2 pipeline and convert its output for IgReC.

    Stages: align (kaligner2, partial alignments allowed) -> assemble over
    the {region_from:region_to} feature -> exportClones (with read ids) ->
    conversion to an IgReC-style repertoire (final_repertoire.fa) plus RCM
    (final_repertoire.rcm) via convert_mixcr2_output_to_igrec and
    ig_compress_equal_clusters.py.  Align+assemble+export timing is stamped
    into output_dir/time.txt.

    :param input_file: reads in FASTA/FASTQ (smart-open-compatible path)
    :param output_dir: output directory, created if it does not exist
    :param log: logger passed to sys_call; a FakeLog() is used when None
    :param loci: value for MiXCR's --chains option ("all" by default)
    :param enforce_fastq: convert FASTA input to FASTQ before aligning
    :param threads: MiXCR worker thread count (-t)
    :param remove_tmp: delete temporary and intermediate files at the end
    :param species: MiXCR --species identifier (e.g. "hsa" for human)
    :param region_from: start anchor of the assembling feature
    :param region_to: end anchor of the assembling feature
    """
    if log is None:
        log = FakeLog()

    mkdir_p(output_dir)

    # Re-materialize FASTA input inside output_dir: up-converted to FASTQ
    # when enforce_fastq is set, otherwise re-written as plain FASTA
    # (e.g. to decompress).  input_file_tmp remembers what to delete later.
    if enforce_fastq and idFormatByFileName(input_file) == "fasta":
        input_file_fq = "%s/input_reads.fq" % output_dir
        fastx2fastx(input_file, input_file_fq)
        input_file = input_file_tmp = input_file_fq
    elif idFormatByFileName(input_file) == "fasta":
        input_file_fasta = "%s/input_reads.fasta" % output_dir
        fastx2fastx(input_file, input_file_fasta)
        input_file = input_file_tmp = input_file_fasta
    else:
        input_file_tmp = None

    path = path_to_mixcr2
    args = {
        "path": path,
        "compress_eq_clusters_cmd":
        path_to_igrec + "/py/ig_compress_equal_clusters.py",
        "mixcr_cmd": "java -jar %s/mixcr.jar" % path,
        "threads": threads,
        "input_file": input_file,
        "output_dir": output_dir,
        "species": species,
        "loci": loci,
        "from": region_from,
        "to": region_to,
        "loci_arg": "chains"
    }

    # Previous align invocation kept for reference:
    # support.sys_call("%(mixcr_cmd)s align -t %(threads)d -f -g -r %(output_dir)s/align_report.txt --%(loci_arg)s %(loci)s --noMerge --species %(species)s %(input_file)s %(output_dir)s/mixcr.vdjca" % args,
    #                  log=log)
    timer = Timer()
    support.sys_call(
        "%(mixcr_cmd)s align -p kaligner2 --species %(species)s -t %(threads)d -f -g -r %(output_dir)s/align_report.txt --noMerge --%(loci_arg)s %(loci)s -OreadsLayout=Collinear -OvParameters.geneFeatureToAlign=VTranscript -OallowPartialAlignments=true %(input_file)s %(output_dir)s/mixcr.vdjca"
        % args,
        log=log)
    # Alternative assemble invocations kept for reference:
    # support.sys_call("%(mixcr_cmd)s assemble -p default_affine -OassemblingFeatures=VDJRegion -OseparateByC=true -OqualityAggregationType=Average -OclusteringFilter.specificMutationProbability=1E-5 -OmaxBadPointsPercent=0 -t %(threads)d -r %(output_dir)s/assemble_report.txt --index %(output_dir)s/index_file %(output_dir)s/mixcr.vdjca %(output_dir)s/mixcr.clns" % args,
    # support.sys_call("%(mixcr_cmd)s assemble -f -p default_affine -OassemblingFeatures=VDJRegion -OseparateByC=true -OqualityAggregationType=Average -OclusteringFilter.specificMutationProbability=1E-5 -OmaxBadPointsPercent=0 -r %(output_dir)s/assemble_report.txt --index %(output_dir)s/index_file %(output_dir)s/mixcr.vdjca %(output_dir)s/mixcr.clns" % args,
    #                  log=log)
    # support.sys_call("%(mixcr_cmd)s assemble -t %(threads)d -f -r %(output_dir)s/assemble_report.txt --index %(output_dir)s/index_file %(output_dir)s/mixcr.vdjca %(output_dir)s/mixcr.clns" % args,
    #                  log=log)
    support.sys_call(
        "%(mixcr_cmd)s assemble -t %(threads)d -f -r %(output_dir)s/assemble_report.txt --index %(output_dir)s/index_file -OassemblingFeatures=\"{%(from)s:%(to)s}\" %(output_dir)s/mixcr.vdjca %(output_dir)s/mixcr.clns"
        % args,
        log=log)
    # Minimal export: sequence, count and read ids -- everything the IgReC
    # conversion below needs.
    args[
        "small_features"] = "-sequence -count -readIds %(output_dir)s/index_file" % args
    support.sys_call(
        "%(mixcr_cmd)s exportClones %(small_features)s -f --no-spaces %(output_dir)s/mixcr.clns %(output_dir)s/mixcr.txt"
        % args,
        log=log)
    timer.stamp(output_dir + "/time.txt")

    # Richer export (CDR3, V/J hits and alignments) for downstream analysis;
    # intentionally not included in the timing above.
    args[
        "features"] = "-count -sequence -nFeature CDR3 -vHit -jHit -vAlignment -jAlignment -aaFeature CDR3 -readIds %(output_dir)s/index_file" % args
    support.sys_call(
        "%(mixcr_cmd)s exportClones %(features)s -f --no-spaces %(output_dir)s/mixcr.clns %(output_dir)s/features.txt"
        % args,
        log=log)
    # convert_mixcr_output_to_igrec("%(output_dir)s/mixcr.txt" % args, "%(output_dir)s/mixcr_uncompressed.fa" % args)

    convert_mixcr2_output_to_igrec(
        "%(output_dir)s/mixcr.txt" % args,
        "%(output_dir)s/mixcr_uncompressed.fa" % args, input_file,
        "%(output_dir)s/mixcr_uncompressed.rcm" % args)
    support.sys_call(
        "%(compress_eq_clusters_cmd)s %(output_dir)s/mixcr_uncompressed.fa %(output_dir)s/final_repertoire.fa -r %(output_dir)s/mixcr_uncompressed.rcm -R %(output_dir)s/final_repertoire.rcm"
        % args)

    if remove_tmp:
        if input_file_tmp is not None:
            os.remove(input_file_tmp)

        os.remove(output_dir + "/align_report.txt")
        os.remove(output_dir + "/assemble_report.txt")
        os.remove(output_dir + "/mixcr.clns")
        os.remove(output_dir + "/mixcr.txt")
        os.remove(output_dir + "/features.txt")
        os.remove(output_dir + "/mixcr.vdjca")
        os.remove(output_dir + "/mixcr_uncompressed.fa")
        os.remove(output_dir + "/mixcr_uncompressed.rcm")
        os.remove(output_dir + "/index_file")
def run_mixcr(input_file,
              output_dir,
              log=None,
              loci="all",
              enforce_fastq=False,
              threads=16,
              remove_tmp=True,
              species="hsa",
              version=1):
    """Run the MiXCR pipeline (v1 or v2 binary) and convert for IgReC.

    Stages: align -> assemble over {FR1Begin:FR4Begin} -> exportClones ->
    conversion to output_dir/final_repertoire.fa via
    convert_mixcr_output_to_igrec and ig_compress_equal_clusters.py.
    Align+assemble+export timing is stamped into output_dir/time.txt.
    Unlike run_mixcr2 this variant does not track read ids, so no RCM file
    is produced.

    :param input_file: reads in FASTA/FASTQ (smart-open-compatible path)
    :param output_dir: output directory, created if it does not exist
    :param log: logger passed to sys_call; a FakeLog() is used when None
    :param loci: value for MiXCR's --loci/--chains option
    :param enforce_fastq: convert FASTA input to FASTQ before aligning
    :param threads: MiXCR worker thread count (-t)
    :param remove_tmp: delete temporary and intermediate files at the end
    :param species: MiXCR --species identifier (e.g. "hsa" for human)
    :param version: 1 selects path_to_mixcr and the "--loci" flag;
        any other value selects path_to_mixcr2 and "--chains"
    """
    if log is None:
        log = FakeLog()

    mkdir_p(output_dir)

    # Re-materialize FASTA input inside output_dir: up-converted to FASTQ
    # when enforce_fastq is set, otherwise re-written as plain FASTA
    # (e.g. to decompress).  input_file_tmp remembers what to delete later.
    if enforce_fastq and idFormatByFileName(input_file) == "fasta":
        input_file_fq = "%s/input_reads.fq" % output_dir
        fastx2fastx(input_file, input_file_fq)
        input_file = input_file_tmp = input_file_fq
    elif idFormatByFileName(input_file) == "fasta":
        input_file_fasta = "%s/input_reads.fasta" % output_dir
        fastx2fastx(input_file, input_file_fasta)
        input_file = input_file_tmp = input_file_fasta
    else:
        input_file_tmp = None

    path = path_to_mixcr if version == 1 else path_to_mixcr2
    args = {
        "path": path,
        "compress_eq_clusters_cmd":
        path_to_igrec + "/py/ig_compress_equal_clusters.py",
        "mixcr_cmd": "java -jar %s/mixcr.jar" % path,
        "threads": threads,
        "input_file": input_file,
        "output_dir": output_dir,
        "species": species,
        "loci": loci,
        # MiXCR v1 calls the option "--loci"; v2 renamed it to "--chains".
        "loci_arg": "loci" if version == 1 else "chains"
    }

    timer = Timer()
    support.sys_call(
        "%(mixcr_cmd)s align -t %(threads)d -f -g -r %(output_dir)s/align_report.txt --%(loci_arg)s %(loci)s --noMerge -OvParameters.geneFeatureToAlign=VTranscript --species %(species)s %(input_file)s %(output_dir)s/mixcr.vdjca"
        % args,
        log=log)
    support.sys_call(
        "%(mixcr_cmd)s assemble -t %(threads)d -f -r %(output_dir)s/assemble_report.txt -OassemblingFeatures=\"{FR1Begin:FR4Begin}\" %(output_dir)s/mixcr.vdjca %(output_dir)s/mixcr.clns"
        % args,
        log=log)
    support.sys_call(
        "%(mixcr_cmd)s exportClones -sequence -count -f --no-spaces %(output_dir)s/mixcr.clns %(output_dir)s/mixcr.txt"
        % args,
        log=log)
    timer.stamp(output_dir + "/time.txt")

    # Richer export (CDR3, V/J hits and alignments) for downstream analysis;
    # intentionally not included in the timing above.
    args[
        "features"] = "-count -sequence -nFeature CDR3 -vHit -jHit -vAlignment -jAlignment -aaFeature CDR3"
    support.sys_call(
        "%(mixcr_cmd)s exportClones %(features)s -f --no-spaces %(output_dir)s/mixcr.clns %(output_dir)s/features.txt"
        % args,
        log=log)
    convert_mixcr_output_to_igrec(
        "%(output_dir)s/mixcr.txt" % args,
        "%(output_dir)s/mixcr_uncompressed.fa" % args)
    support.sys_call(
        "%(compress_eq_clusters_cmd)s %(output_dir)s/mixcr_uncompressed.fa %(output_dir)s/final_repertoire.fa"
        % args)

    if remove_tmp:
        if input_file_tmp is not None:
            os.remove(input_file_tmp)

        # NOTE(review): unlike run_mixcr2, the align/assemble report files
        # and features.txt are kept here -- confirm whether intentional.
        os.remove(output_dir + "/mixcr.clns")
        os.remove(output_dir + "/mixcr.txt")
        os.remove(output_dir + "/mixcr.vdjca")
        os.remove(output_dir + "/mixcr_uncompressed.fa")
                        help="output file with repertoire sequences")
    parser.add_argument("--output-rcm",
                        "-R",
                        type=str,
                        help="output file with repertoire RCM")

    args = parser.parse_args()

    print "Construct repertoire from TrieCompressor output..."
    print "Command line: %s" % " ".join(sys.argv)

    # Fix ids
    with smart_open(args.input_compressed) as fin, smart_open(
            args.output_repertoire, "w") as fout:
        for i, record in enumerate(
                SeqIO.parse(fin, idFormatByFileName(args.input_compressed))):
            id = record.description
            size = parse_size(id)
            record.id = record.description = "cluster___%d___size___%d" % (
                i, size)
            SeqIO.write(record, fout,
                        idFormatByFileName(args.output_repertoire))

    with smart_open(args.input_reads) as fin_reads, smart_open(
            args.input_map) as fin_map, smart_open(args.output_rcm,
                                                   "w") as fout_rcm:
        for record, cluster in izip(
                SeqIO.parse(fin_reads, idFormatByFileName(args.input_reads)),
                fin_map):
            id = record.description
            cluster = cluster.strip()
# --- Esempio n. 14 (scraper artifact: example separator) ---
                        default=10,
                        help="distance threshold [default %(default)d]")
    parser.add_argument("--lengths",
                        type=str,
                        help="file for read length stats")
    # parser.add_argument("--subs-map", "-M",
    #                     type=str,
    #                     help="file for subs table")

    args = parser.parse_args()

    barcodes_count = defaultdict(int)

    print("Reading library...")
    with smart_open(args.input, "r") as fh:
        data = list(SeqIO.parse(fh, idFormatByFileName(args.input)))

    if not args.no_fix_spaces:
        for record in data:
            record.description = str(record.description).replace(" ", "_")
            record.id = record.name = record.description

    # Omit reads with Ns
    data = [record for record in data if record.seq.count("N") == 0]

    data = [
        record for record in data if extract_barcode(record.id) is not None
    ]

    clusters = defaultdict(list)
# --- Esempio n. 15 (scraper artifact: example separator) ---
if __name__ == "__main__":
    args = parse_command_line()

    log = CreateLogger("VJF benchmark")
    if args.log:
        AttachFileLogger(log, args.log)

    with open(args.germline_J_file, "rU") as fh:
        germline_J_parser = SeqIO.parse(fh, "fasta")
        germline_J_map = { str(record.id): str(record.seq) for record in germline_J_parser }

    igblast_hits = get_igblast_output(args)
    vjf_hits = get_vjf_output(args)

    with smart_open(args.input, "rU") as fh:
        parser = SeqIO.parse(fh, idFormatByFileName(args.input))
        reads = list(parser)

    assert len(reads) == len(igblast_hits)
    ids = [str(record.description) for record in reads]
    assert len(set(ids)) == len(reads)

    stats = benchmark_stats(igblast_hits, vjf_hits, germline_J_map)

    if args.bad_reads:
        with smart_open(args.bad_reads, "w") as f:
            SeqIO.write([reads[_] for _ in stats.bad_reads], f, idFormatByFileName(args.bad_reads))
        log.info("Bad reads were written to " + args.bad_reads)

    log.info("Overall reads %d" % stats.all)
    log.info("Identified contaminations %d from %d" % (stats.identified_contaminations, stats.contaminations))