def simulate_data_wo_errors(input_file, output_dir, log=None):
    """Derive an error-free repertoire and read set from ``input_file``.

    IgReC is run into a scratch directory; the
    ``vj_finder/cleaned_reads.fa`` file found there afterwards is then used
    to produce, inside ``output_dir``:

    * ``final_repertoire.rcm`` via ``simulated_repertoire_to_rcm``;
    * ``final_repertoire.fa.gz`` via
      ``simulated_repertoire_to_final_repertoire``; the file is then passed
      as both positional arguments to ``ig_compress_equal_clusters.py``
      together with the RCM file;
    * ``error_free_reads.fa.gz`` via ``fastx2fastx``.

    Args:
        input_file: Path of the reads handed to ``run_igrec``.
        output_dir: Directory receiving the results (passed through
            ``mkdir_p`` first).
        log: Logger forwarded to ``support.sys_call``; a ``FakeLog`` is
            substituted when omitted.
    """
    import shutil
    import tempfile

    if log is None:
        log = FakeLog()

    mkdir_p(output_dir)

    rcm_path = "%s/final_repertoire.rcm" % output_dir
    repertoire_path = "%s/final_repertoire.fa.gz" % output_dir

    temp_dir = tempfile.mkdtemp()
    try:
        # Run IgReC for VJF output; its temporaries are kept because the
        # cleaned reads are read from them below.
        run_igrec(input_file, temp_dir, remove_tmp=False, tau=1)

        cleaned_reads = temp_dir + "/vj_finder/cleaned_reads.fa"

        simulated_repertoire_to_rcm(cleaned_reads, rcm_path)
        simulated_repertoire_to_final_repertoire(cleaned_reads,
                                                 repertoire_path)

        args = {
            "path": igrec_dir,
            "repertoire": repertoire_path,
            "rcm": rcm_path
        }
        support.sys_call(
            "%(path)s/py/ig_compress_equal_clusters.py %(repertoire)s %(repertoire)s -r %(rcm)s"
            % args,
            log=log)

        fastx2fastx(cleaned_reads, output_dir + "/error_free_reads.fa.gz")
    finally:
        # Remove the scratch directory even when one of the steps above
        # raises; previously a failure left it behind.
        shutil.rmtree(temp_dir)
def run_presto(input_file, output_dir, log=None, remove_tmp=True):
    """Run pRESTO ``CollapseSeq.py`` and write ``final_repertoire.fa``.

    The input is first copied to ``<output_dir>/input_reads.fasta`` with
    ``fastx2fastx``. ``CollapseSeq.py`` is run on that copy, and every record
    of ``presto_collapse-unique.fasta`` is re-emitted with the header
    ``cluster___<index>___size___<size>``, where the size comes from
    ``parse_presto_id`` applied to the record description.

    Args:
        input_file: Path to the input reads.
        output_dir: Working/result directory (passed through ``mkdir_p``).
        log: Logger forwarded to ``support.sys_call``; a ``FakeLog`` is used
            when omitted.
        remove_tmp: When true, delete the FASTA copy and the raw pRESTO
            output once the repertoire has been written.
    """
    log = FakeLog() if log is None else log

    mkdir_p(output_dir)

    # The original code labels this conversion "gunzip".
    plain_reads = "%s/input_reads.fasta" % output_dir
    fastx2fastx(input_file, plain_reads)

    stopwatch = Timer()
    support.sys_call(
        "CollapseSeq.py -s %(input_file)s --outdir %(output_dir)s --outname presto"
        % {"input_file": plain_reads, "output_dir": output_dir},
        log=log)
    stopwatch.stamp(output_dir + "/time.txt")

    collapsed = output_dir + "/presto_collapse-unique.fasta"
    repertoire = output_dir + "/final_repertoire.fa"
    with smart_open(collapsed) as source, smart_open(repertoire,
                                                     "w") as sink:
        records = SeqIO.parse(source, idFormatByFileName(collapsed))
        for index, rec in enumerate(records):
            cluster_size = parse_presto_id(rec.description)
            header = "cluster___%d___size___%d" % (index, cluster_size)
            rec.id = header
            rec.description = header
            SeqIO.write(rec, sink, "fasta")

    if remove_tmp:
        for leftover in (plain_reads, collapsed):
            os.remove(leftover)
def run_mixcr2_alignment_only(input_file,
                              output_dir,
                              log=None,
                              loci="all",
                              enforce_fastq=False,
                              threads=16,
                              remove_tmp=True,
                              species="hsa"):
    """Run only the MiXCR 2 ``align`` step and stamp the timer afterwards.

    Input whose format is reported as ``"fasta"`` by ``idFormatByFileName``
    is first re-written into ``output_dir`` with ``fastx2fastx``: as
    ``input_reads.fq`` when ``enforce_fastq`` is set, otherwise as
    ``input_reads.fasta``. Any other input is passed to MiXCR untouched.

    Args:
        input_file: Path to the reads.
        output_dir: Working directory (passed through ``mkdir_p``).
        log: Logger forwarded to ``support.sys_call``; a ``FakeLog`` is used
            when omitted.
        loci: Value given to the ``--chains`` option.
        enforce_fastq: Write the staged copy of FASTA input as ``.fq``.
        threads: Value given to ``-t``.
        remove_tmp: When true, delete the staged input copy (if any), the
            alignment report and ``mixcr.vdjca`` afterwards.
        species: Value given to ``--species``.
    """
    log = FakeLog() if log is None else log

    mkdir_p(output_dir)

    # Stage FASTA input inside output_dir; remember the copy for cleanup.
    input_file_tmp = None
    if idFormatByFileName(input_file) == "fasta":
        staged_template = ("%s/input_reads.fq"
                           if enforce_fastq else "%s/input_reads.fasta")
        input_file_tmp = staged_template % output_dir
        fastx2fastx(input_file, input_file_tmp)
        input_file = input_file_tmp

    mixcr_home = path_to_mixcr2
    substitutions = {
        "path": mixcr_home,
        "compress_eq_clusters_cmd":
        path_to_igrec + "/py/ig_compress_equal_clusters.py",
        "mixcr_cmd": "java -jar %s/mixcr.jar" % mixcr_home,
        "threads": threads,
        "input_file": input_file,
        "output_dir": output_dir,
        "species": species,
        "loci": loci,
        "loci_arg": "chains"
    }

    stopwatch = Timer()
    align_command = (
        "%(mixcr_cmd)s align -p kaligner2 --species %(species)s -t %(threads)d -f -g -r %(output_dir)s/align_report.txt --noMerge --%(loci_arg)s %(loci)s -OreadsLayout=Collinear -OvParameters.geneFeatureToAlign=VTranscript -OallowPartialAlignments=true %(input_file)s %(output_dir)s/mixcr.vdjca"
        % substitutions)
    support.sys_call(align_command, log=log)
    stopwatch.stamp(output_dir + "/time.txt")

    if not remove_tmp:
        return

    if input_file_tmp is not None:
        os.remove(input_file_tmp)

    for suffix in ("/align_report.txt", "/mixcr.vdjca"):
        os.remove(output_dir + suffix)
def run_mixcr2(input_file,
               output_dir,
               log=None,
               loci="all",
               enforce_fastq=False,
               threads=16,
               remove_tmp=True,
               species="hsa",
               region_from="FR1Begin",
               region_to="FR4Begin"):
    """Run MiXCR 2 (align, assemble, exportClones) and convert to IgReC format.

    Pipeline, all inside ``output_dir``:

    1. Input reported as ``"fasta"`` by ``idFormatByFileName`` is re-written
       with ``fastx2fastx`` (``input_reads.fq`` when ``enforce_fastq`` is
       set, ``input_reads.fasta`` otherwise).
    2. ``mixcr align`` -> ``mixcr.vdjca``.
    3. ``mixcr assemble`` over ``{region_from:region_to}`` -> ``mixcr.clns``
       plus ``index_file``.
    4. ``mixcr exportClones`` -> ``mixcr.txt``; ``timer.stamp`` is called
       right after this step.
    5. A second ``exportClones`` with extra columns -> ``features.txt``.
    6. ``convert_mixcr2_output_to_igrec`` -> ``mixcr_uncompressed.fa`` /
       ``mixcr_uncompressed.rcm``.
    7. ``ig_compress_equal_clusters.py`` -> ``final_repertoire.fa`` /
       ``final_repertoire.rcm``.

    Args:
        input_file: Path to the reads.
        output_dir: Working/result directory (passed through ``mkdir_p``).
        log: Logger forwarded to every ``support.sys_call``; a ``FakeLog`` is
            used when omitted.
        loci: Value given to the ``--chains`` option of ``align``.
        enforce_fastq: Write the staged copy of FASTA input as ``.fq``.
        threads: Value given to ``-t``.
        remove_tmp: When true, delete all intermediate files at the end.
        species: Value given to ``--species``.
        region_from: Left bound used in ``-OassemblingFeatures``.
        region_to: Right bound used in ``-OassemblingFeatures``.
    """
    if log is None:
        log = FakeLog()

    mkdir_p(output_dir)

    # Stage FASTA input inside output_dir; input_file_tmp remembers the copy
    # so that it can be deleted during cleanup.
    if enforce_fastq and idFormatByFileName(input_file) == "fasta":
        input_file_fq = "%s/input_reads.fq" % output_dir
        fastx2fastx(input_file, input_file_fq)
        input_file = input_file_tmp = input_file_fq
    elif idFormatByFileName(input_file) == "fasta":
        input_file_fasta = "%s/input_reads.fasta" % output_dir
        fastx2fastx(input_file, input_file_fasta)
        input_file = input_file_tmp = input_file_fasta
    else:
        input_file_tmp = None

    path = path_to_mixcr2
    args = {
        "path": path,
        "compress_eq_clusters_cmd":
        path_to_igrec + "/py/ig_compress_equal_clusters.py",
        "mixcr_cmd": "java -jar %s/mixcr.jar" % path,
        "threads": threads,
        "input_file": input_file,
        "output_dir": output_dir,
        "species": species,
        "loci": loci,
        "from": region_from,
        "to": region_to,
        "loci_arg": "chains"
    }

    timer = Timer()
    support.sys_call(
        "%(mixcr_cmd)s align -p kaligner2 --species %(species)s -t %(threads)d -f -g -r %(output_dir)s/align_report.txt --noMerge --%(loci_arg)s %(loci)s -OreadsLayout=Collinear -OvParameters.geneFeatureToAlign=VTranscript -OallowPartialAlignments=true %(input_file)s %(output_dir)s/mixcr.vdjca"
        % args,
        log=log)
    support.sys_call(
        "%(mixcr_cmd)s assemble -t %(threads)d -f -r %(output_dir)s/assemble_report.txt --index %(output_dir)s/index_file -OassemblingFeatures=\"{%(from)s:%(to)s}\" %(output_dir)s/mixcr.vdjca %(output_dir)s/mixcr.clns"
        % args,
        log=log)
    args["small_features"] = (
        "-sequence -count -readIds %(output_dir)s/index_file" % args)
    support.sys_call(
        "%(mixcr_cmd)s exportClones %(small_features)s -f --no-spaces %(output_dir)s/mixcr.clns %(output_dir)s/mixcr.txt"
        % args,
        log=log)
    # The stamp is taken here, before the feature export and the conversion
    # to IgReC format below.
    timer.stamp(output_dir + "/time.txt")

    args["features"] = (
        "-count -sequence -nFeature CDR3 -vHit -jHit -vAlignment -jAlignment -aaFeature CDR3 -readIds %(output_dir)s/index_file"
        % args)
    support.sys_call(
        "%(mixcr_cmd)s exportClones %(features)s -f --no-spaces %(output_dir)s/mixcr.clns %(output_dir)s/features.txt"
        % args,
        log=log)

    convert_mixcr2_output_to_igrec(
        "%(output_dir)s/mixcr.txt" % args,
        "%(output_dir)s/mixcr_uncompressed.fa" % args, input_file,
        "%(output_dir)s/mixcr_uncompressed.rcm" % args)
    # Pass the caller's log like every other external call in this function
    # (it was previously omitted for this step only).
    support.sys_call(
        "%(compress_eq_clusters_cmd)s %(output_dir)s/mixcr_uncompressed.fa %(output_dir)s/final_repertoire.fa -r %(output_dir)s/mixcr_uncompressed.rcm -R %(output_dir)s/final_repertoire.rcm"
        % args,
        log=log)

    if remove_tmp:
        if input_file_tmp is not None:
            os.remove(input_file_tmp)

        for suffix in ("/align_report.txt", "/assemble_report.txt",
                       "/mixcr.clns", "/mixcr.txt", "/features.txt",
                       "/mixcr.vdjca", "/mixcr_uncompressed.fa",
                       "/mixcr_uncompressed.rcm", "/index_file"):
            os.remove(output_dir + suffix)
def run_mixcr(input_file,
              output_dir,
              log=None,
              loci="all",
              enforce_fastq=False,
              threads=16,
              remove_tmp=True,
              species="hsa",
              version=1):
    """Run MiXCR (align, assemble, exportClones) and convert to IgReC format.

    Pipeline, all inside ``output_dir``:

    1. Input reported as ``"fasta"`` by ``idFormatByFileName`` is re-written
       with ``fastx2fastx`` (``input_reads.fq`` when ``enforce_fastq`` is
       set, ``input_reads.fasta`` otherwise).
    2. ``mixcr align`` -> ``mixcr.vdjca``.
    3. ``mixcr assemble`` over ``{FR1Begin:FR4Begin}`` -> ``mixcr.clns``.
    4. ``mixcr exportClones`` -> ``mixcr.txt``; ``timer.stamp`` is called
       right after this step.
    5. A second ``exportClones`` with extra columns -> ``features.txt``.
    6. ``convert_mixcr_output_to_igrec`` -> ``mixcr_uncompressed.fa``.
    7. ``ig_compress_equal_clusters.py`` -> ``final_repertoire.fa``.

    Args:
        input_file: Path to the reads.
        output_dir: Working/result directory (passed through ``mkdir_p``).
        log: Logger forwarded to every ``support.sys_call``; a ``FakeLog`` is
            used when omitted.
        loci: Value given to the loci/chains option of ``align``.
        enforce_fastq: Write the staged copy of FASTA input as ``.fq``.
        threads: Value given to ``-t``.
        remove_tmp: When true, delete the staged input copy (if any),
            ``mixcr.clns``, ``mixcr.txt``, ``mixcr.vdjca`` and
            ``mixcr_uncompressed.fa`` at the end.
        species: Value given to ``--species``.
        version: ``1`` selects ``path_to_mixcr`` and the ``--loci`` option;
            any other value selects ``path_to_mixcr2`` and ``--chains``.
    """
    if log is None:
        log = FakeLog()

    mkdir_p(output_dir)

    # Stage FASTA input inside output_dir; input_file_tmp remembers the copy
    # so that it can be deleted during cleanup.
    if enforce_fastq and idFormatByFileName(input_file) == "fasta":
        input_file_fq = "%s/input_reads.fq" % output_dir
        fastx2fastx(input_file, input_file_fq)
        input_file = input_file_tmp = input_file_fq
    elif idFormatByFileName(input_file) == "fasta":
        input_file_fasta = "%s/input_reads.fasta" % output_dir
        fastx2fastx(input_file, input_file_fasta)
        input_file = input_file_tmp = input_file_fasta
    else:
        input_file_tmp = None

    path = path_to_mixcr if version == 1 else path_to_mixcr2
    args = {
        "path": path,
        "compress_eq_clusters_cmd":
        path_to_igrec + "/py/ig_compress_equal_clusters.py",
        "mixcr_cmd": "java -jar %s/mixcr.jar" % path,
        "threads": threads,
        "input_file": input_file,
        "output_dir": output_dir,
        "species": species,
        "loci": loci,
        # The option name differs between the two supported versions.
        "loci_arg": "loci" if version == 1 else "chains"
    }

    timer = Timer()
    support.sys_call(
        "%(mixcr_cmd)s align -t %(threads)d -f -g -r %(output_dir)s/align_report.txt --%(loci_arg)s %(loci)s --noMerge -OvParameters.geneFeatureToAlign=VTranscript --species %(species)s %(input_file)s %(output_dir)s/mixcr.vdjca"
        % args,
        log=log)
    support.sys_call(
        "%(mixcr_cmd)s assemble -t %(threads)d -f -r %(output_dir)s/assemble_report.txt -OassemblingFeatures=\"{FR1Begin:FR4Begin}\" %(output_dir)s/mixcr.vdjca %(output_dir)s/mixcr.clns"
        % args,
        log=log)
    support.sys_call(
        "%(mixcr_cmd)s exportClones -sequence -count -f --no-spaces %(output_dir)s/mixcr.clns %(output_dir)s/mixcr.txt"
        % args,
        log=log)
    # The stamp is taken here, before the feature export and the conversion
    # to IgReC format below.
    timer.stamp(output_dir + "/time.txt")

    args["features"] = (
        "-count -sequence -nFeature CDR3 -vHit -jHit -vAlignment -jAlignment -aaFeature CDR3"
    )
    support.sys_call(
        "%(mixcr_cmd)s exportClones %(features)s -f --no-spaces %(output_dir)s/mixcr.clns %(output_dir)s/features.txt"
        % args,
        log=log)
    convert_mixcr_output_to_igrec(
        "%(output_dir)s/mixcr.txt" % args,
        "%(output_dir)s/mixcr_uncompressed.fa" % args)
    # Pass the caller's log like every other external call in this function
    # (it was previously omitted for this step only).
    support.sys_call(
        "%(compress_eq_clusters_cmd)s %(output_dir)s/mixcr_uncompressed.fa %(output_dir)s/final_repertoire.fa"
        % args,
        log=log)

    if remove_tmp:
        if input_file_tmp is not None:
            os.remove(input_file_tmp)

        for suffix in ("/mixcr.clns", "/mixcr.txt", "/mixcr.vdjca",
                       "/mixcr_uncompressed.fa"):
            os.remove(output_dir + suffix)
# Example no. 6
# 0
def GetMigecSteps(params, run_params):
    """Build the ordered step list for the MiGEC + MiXCR evaluation run.

    The returned list contains, in order:

    1. ``vj_finder`` on ``amplified/amplified.fasta``;
    2. conversion of the cleaned reads from FASTA to FASTQ;
    3. ``convert_sim_to_migec.py`` on the FASTQ reads;
    4. ``migec.jar Assemble``;
    5. ``run_mixcr2`` on ``migec/migec.t5.fastq.gz``;
    6. ``fix_migec_mixcr_cluster_sizes`` on the MiXCR repertoire;
    7. ``igquast.py`` on the resulting ``migec/final_repertoire.fa``.

    All paths are rooted at ``run_params.data_path``. Shell command lines are
    formatted when this function is called; the Python steps read
    ``run_params.data_path`` only when their callable is invoked.

    Args:
        params: Object providing ``threads``.
        run_params: Object providing ``data_path``.

    Returns:
        A list of ``ShStep`` / ``PyStep`` objects.
    """
    data_path = run_params.data_path

    find_vj = ShStep(igrec_dir, [
        "%s/vj_finder" % igrec_bin,
        "--input-file %s/amplified/amplified.fasta" % data_path,
        "--output-dir %s/vjf_amplified" % data_path,
        "--loci IG",
        "--threads %d" % params.threads
    ])

    cleaned_fa = "%s/vjf_amplified/cleaned_reads.fa" % data_path
    cleaned_fq = "%s/vjf_amplified/cleaned_reads.fastq" % data_path
    fasta_to_fastq = PyStep(
        "converting fasta file (%s) to fastq format (%s)" %
        (cleaned_fa, cleaned_fq),
        lambda: fastx2fastx(
            "%s/vjf_amplified/cleaned_reads.fa" % run_params.data_path,
            "%s/vjf_amplified/cleaned_reads.fastq" % run_params.data_path,
            50, True))

    to_migec_format = ShStep(None, [
        "python -u %s/py/convert_sim_to_migec.py" % igrec_dir,
        "-r %s/vjf_amplified/cleaned_reads.fastq" % data_path,
        "-o %s/vjf_amplified/migec.fastq" % data_path
    ])

    migec_assemble = ShStep(None, [
        "java -jar %s/migec.jar Assemble" % migec_path,
        "-c %s/vjf_amplified/migec.fastq" % data_path, ".",
        "%s/migec" % data_path
    ])

    mixcr = PyStep(
        "running MIXCR", lambda: run_mixcr2(
            input_file="%s/migec/migec.t5.fastq.gz" % run_params.data_path,
            output_dir="%s/migec/mixcr" % run_params.data_path,
            threads=params.threads,
            remove_tmp=False))

    fix_sizes = PyStep(
        "fixing cluster sizes", lambda: fix_migec_mixcr_cluster_sizes(
            input_file="%s/migec/mixcr/final_repertoire.fa" % run_params.
            data_path,
            rcm_file="%s/migec/mixcr/final_repertoire.rcm" % run_params.
            data_path,
            output_file="%s/migec/final_repertoire.fa" % run_params.
            data_path,
        ))

    quast = ShStep(None, [
        "python -u %s/igquast.py" % igrec_dir,
        "-s %s/amplified/amplified.fasta" % data_path,
        "-c %s/migec/final_repertoire.fa" % data_path,
        "-r %s/vjf_reference/cleaned_reads.fa" % data_path,
        "-o %s/quast_migec" % data_path,
        "--json %s/quast_migec/aimquast.json" % data_path,
        "--reference-free",
        "--rcm-based"
    ])

    return [
        find_vj,
        fasta_to_fastq,
        to_migec_format,
        migec_assemble,
        mixcr,
        fix_sizes,
        quast,
    ]
# Example no. 7
# 0
threads = 32

steps = [
    ShStep(igrec_dir, [
        "%s/vj_finder" % igrec_bin,
        "--input-file %s" % "/Marx/serg/data/age/3/age_ig_s3_R12.fastq",
        "--output-dir %s/vjf_amplified" % word_dir, "--loci IG",
        "--threads %d" % threads
    ]),
    PyStep(
        "converting fasta file (%s) to fastq format (%s)" %
        ("%s/vjf_amplified/cleaned_reads.fa" % word_dir,
         "%s/vjf_amplified/cleaned_reads.fastq" % word_dir),
        lambda: fastx2fastx("%s/vjf_amplified/cleaned_reads.fa" % word_dir,
                            "%s/vjf_amplified/cleaned_reads.fastq" % word_dir,
                            50, True)),
    ShStep(None, [
        "python -u %s/py/convert_sim_to_migec.py" % igrec_dir,
        "-r %s/vjf_amplified/cleaned_reads.fastq" % word_dir,
        "-o %s/vjf_amplified/migec.fastq" % word_dir
    ]),
    ShStep(None, [
        "java -jar %s/migec.jar Assemble" % migec_path,
        "-c %s/vjf_amplified/migec.fastq" % word_dir, ".",
        "%s/migec" % word_dir
    ]),
    PyStep(
        "running MIXCR",
        lambda: run_mixcr2(input_file="%s/migec/migec.t5.fastq.gz" % word_dir,
                           output_dir="%s/migec/mixcr" % word_dir,