Beispiel #1
0
def main(args):
    """Run the iterative "Empire" error-correction pipeline.

    Each of the ``args.cycles`` cycles:

    1. calls ``write_degenerate_reference_set`` on the current reference and
       indexes that reference with ``get_bwa_index``;
    2. queues one alignment job per selected ``.fast5`` file and runs them in
       ``args.nb_jobs`` ``aligner`` worker processes;
    3. for every register in ``range(STEP)``, queues one variant-calling job
       per alignment file, runs them with ``run_methyl_caller`` workers and
       applies the calls to the working sequence via ``update_reference``;
    4. writes the working sequence to ``iteration.<cycle>.fa``, which becomes
       the reference for the next cycle.

    Exits with status 1 when ``args.ref`` is not an existing file.

    :param args: ignored -- the arguments are always re-read with
        ``parse_args()``.
    :return: None
    """
    # the incoming value is discarded; arguments always come from parse_args()
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
#   Starting Empire Error-Correction
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files, banding=args.banded,
               inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm, model=args.stateMachineType, regions=args.target_regions,
               tHdp=args.templateHDP, cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    workers = args.nb_jobs

    # path of the reference used by the current cycle; replaced after every cycle
    reference_sequence = args.ref

    for cycle in range(0, args.cycles):
        check, reference_sequence_length = write_degenerate_reference_set(input_fasta=reference_sequence,
                                                                          out_path=temp_dir_path, step=STEP)
        assert check, "Problem making degenerate reference sequence set"

        # index the reference for bwa
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = get_bwa_index(reference_sequence, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

        # fresh queues for this cycle's worker processes
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()

        # list of alignment files
        fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

        # take only a random subset when there are more files than requested
        if args.nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:args.nb_files]

        # NOTE: for serial debugging, SignalAlignment(**alignment_args).run()
        # can be used instead of queueing the job
        for fast5 in fast5s:
            alignment_args = {
                "forward_reference": None,
                "backward_reference": None,
                "path_to_EC_refs": temp_dir_path,
                "destination": temp_dir_path,
                "stateMachineType": args.stateMachineType,
                "bwa_index": bwa_ref_index,
                "in_templateHmm": args.in_T_Hmm,
                "in_complementHmm": args.in_C_Hmm,
                "in_templateHdp": args.templateHDP,
                "in_complementHdp": args.complementHDP,
                "banded": args.banded,
                "sparse_output": True,
                # os.path.join works whether or not files_dir ends with a separator
                "in_fast5": os.path.join(args.files_dir, fast5),
                "threshold": args.threshold,
                "diagonal_expansion": args.diag_expansion,
                "constraint_trim": args.constraint_trim,
                "target_regions": None,
                "degenerate": degenerate_enum(args.degenerate),
            }
            work_queue.put(alignment_args)

        # one 'STOP' sentinel is queued per worker, behind all the jobs
        alignment_jobs = []
        for _ in range(workers):
            p = Process(target=aligner, args=(work_queue, done_queue))
            p.start()
            alignment_jobs.append(p)
            work_queue.put('STOP')

        for p in alignment_jobs:
            p.join()

        done_queue.put('STOP')

        print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
        print("\n#  signalAlign - finished alignments\n", file=sys.stdout)

        # working sequence is a string, that has the reference we're going to update this cycle
        working_sequence = get_first_sequence(reference_sequence)

        # register is the relative position that is being N-ed:
        # ACGTAGACAATA --> NCGTAGNCAATA = register 0
        # ACGTAGACAATA --> ANGTAGANAATA = register 1 ...
        for register in range(0, STEP):
            print("#  Starting Variant Calling, register: {}...".format(register), file=sys.stdout, end='\n')
            print("#  Starting Variant Calling, register: {}...".format(register), file=sys.stderr, end='')
            # cull the alignment files for this register
            alns, forward_mask = get_alignments_labels_and_mask(
                path_to_alignments=temp_dir_path + "*.tsv.{}".format(register),
                max=args.nb_files,
                suffix=".{}".format(register)
            )
            # this is the list of positions that we're going to look at, based on this register
            degenerate_positions = {'forward': range(register, reference_sequence_length, STEP),
                                    'backward': range(register, reference_sequence_length, STEP)
                                    }

            # place to put the marginal probs
            variant_call_file = temp_folder.add_file_path("variants.{cycle}.{reg}.calls".format(cycle=cycle,
                                                                                                reg=register))
            # NOTE: for serial debugging, CallMethylation(**call_methyl_args).write()
            # can be used instead of queueing the job
            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                work_queue.put(call_methyl_args)

            # a separate list per phase, so only this register's workers are joined
            caller_jobs = []
            for _ in range(workers):
                p = Process(target=run_methyl_caller, args=(work_queue, done_queue))
                p.start()
                caller_jobs.append(p)
                work_queue.put('STOP')

            for p in caller_jobs:
                p.join()

            done_queue.put('STOP')

            # this is where the per-register update happens
            working_sequence = update_reference(variant_call_file, working_sequence, register,
                                                min_depth=0, get_sites=False)

            # remove alignments for this register
            for f in glob.glob(temp_dir_path + "*.tsv.{}".format(register)):
                os.remove(f)
            print("done", file=sys.stdout, end="\n")
            print("done", file=sys.stderr, end="\n")

        # add a file for this cycle
        ref_path = temp_folder.add_file_path("iteration.{cycle}.fa".format(cycle=cycle))
        # write it out; the handle is closed before the next cycle reads the file back
        with open(ref_path, 'w') as fasta_handle:
            write_fasta("iteration.{cycle}.fa".format(cycle=cycle), working_sequence, fasta_handle)
        # update the path to the reference for the next cycle
        reference_sequence = ref_path
    return
def main(args):
    """Align .fast5 reads to a reference using ``aligner`` worker processes.

    The reads come from the file-of-filenames ``args.fofn`` when it is given,
    otherwise from the ``.fast5`` files found in ``args.files_dir``; at most
    ``args.nb_files`` of them (a random subset) are aligned.  Forward and
    backward reference files are prepared in a temporary folder, either with
    ``add_ambiguity_chars_to_reference`` (when ``args.substitution_file`` is
    set) or with ``make_temp_sequence``.  When ``args.outFmt`` is
    ``"variantCaller"`` the outputs are finally passed to
    ``concat_variant_call_files``.

    Exits with status 1 when neither a read directory nor a fofn is given,
    when ``args.ref`` is not an existing file, or when ``args.error_correct``
    is True (not implemented).

    :param args: ignored -- the arguments are always re-read with
        ``parse_args()``.
    :return: None
    """
    # the incoming value is discarded; arguments always come from parse_args()
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
          file=sys.stderr)

    # banding is hard-coded to True in this entry point (message and job arguments)
    start_message = """
#   Starting Signal Align
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: True
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(
        fileDir=args.files_dir,
        reference=args.ref,
        nbFiles=args.nb_files,
        inThmm=args.in_T_Hmm,
        inChmm=args.in_C_Hmm,
        model=args.stateMachineType,
        regions=args.target_regions,
        tHdp=args.templateHDP,
        cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if args.files_dir is None and args.fofn is None:
        print("Need to provide directory with .fast5 files of fofn",
              file=sys.stderr)
        sys.exit(1)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file, looked for it {here}".format(
            here=args.ref),
              file=sys.stderr)
        sys.exit(1)

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_alignment")

    if args.error_correct is True:
        print(
            "[runSignalAlign]:ERROR: Error correction not implemented, yet\n",
            file=sys.stderr)
        sys.exit(1)

    # reference files for the two strands, with substitutions applied if given
    plus_strand_sequence = temp_folder.add_file_path("forward_reference.txt")
    minus_strand_sequence = temp_folder.add_file_path("backward_reference.txt")
    if args.substitution_file is not None:
        add_ambiguity_chars_to_reference(
            input_fasta=args.ref,
            substitution_file=args.substitution_file,
            sequence_outfile=plus_strand_sequence,
            rc_sequence_outfile=minus_strand_sequence,
            degenerate_type=args.degenerate,
            sub_char=args.ambig_char)
    else:
        make_temp_sequence(fasta=args.ref,
                           sequence_outfile=plus_strand_sequence,
                           rc_sequence_outfile=minus_strand_sequence)

    # index the reference for bwa
    print("signalAlign - indexing reference", file=sys.stderr)
    bwa_ref_index = get_bwa_index(args.ref, temp_dir_path)
    print("signalAlign - indexing reference, done", file=sys.stderr)

    # parse the target regions, if provided
    # TODO make this the same as the 'labels' file
    if args.target_regions is not None:
        target_regions = TargetRegions(args.target_regions)
    else:
        target_regions = None

    # setup workers for multiprocessing
    workers = args.nb_jobs
    work_queue = Manager().Queue()
    done_queue = Manager().Queue()
    jobs = []

    # list of read files (full paths)
    if args.fofn is not None:
        fast5s = [x for x in parse_fofn(args.fofn) if x.endswith(".fast5")]
    else:
        # os.path.join works whether or not files_dir ends with a separator
        fast5s = [
            os.path.join(args.files_dir, x) for x in os.listdir(args.files_dir)
            if x.endswith(".fast5")
        ]

    # take only a random subset when there are more files than requested
    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]
    print("[runSignalAlign]:NOTICE: Got {} files to align".format(len(fast5s)),
          file=sys.stdout)

    # NOTE: for serial debugging, SignalAlignment(**alignment_args).run()
    # can be used instead of queueing the job
    for fast5 in fast5s:
        alignment_args = {
            "forward_reference": plus_strand_sequence,
            "backward_reference": minus_strand_sequence,
            "path_to_EC_refs": (temp_dir_path if args.error_correct else None),
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "banded": True,
            "output_format": args.outFmt,
            "in_fast5": fast5,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": target_regions,
            "degenerate": degenerate_enum(args.degenerate),
        }
        work_queue.put(alignment_args)

    # one 'STOP' sentinel is queued per worker, behind all the jobs
    for _ in range(workers):
        p = Process(target=aligner, args=(work_queue, done_queue))
        p.start()
        jobs.append(p)
        work_queue.put('STOP')

    for p in jobs:
        p.join()

    done_queue.put('STOP')
    print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
    print("\n#  signalAlign - finished alignments\n", file=sys.stdout)

    if args.outFmt == "variantCaller":
        concat_variant_call_files(temp_dir_path)
Beispiel #3
0
def main(args):
    """Run the iterative "Jamison" error-correction pipeline.

    The read set is chosen once with ``cull_fast5_files``.  Each of the
    ``args.cycles`` cycles indexes the current reference for bwa, collects
    candidate sites with ``scan_for_proposals``, groups them with
    ``group_sites_in_window(proposals, 6)``, and obtains an updated sequence
    from ``update_reference_with_marginal_probs``.  That sequence is written
    to ``cycle_snapshot.<cycle>.fa`` and used as the reference for the next
    cycle.  The last reference path is finally copied to
    ``temp_dir_path + args.corrected``; with zero cycles that is the input
    reference itself.

    Exits with status 1 when ``args.ref`` is not an existing file.

    :param args: ignored -- the arguments are always re-read with
        ``parse_args()``.
    :return: None
    """
    # the incoming value is discarded; arguments always come from parse_args()
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
#   Starting Jamison Error-Correction
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
#   Performing {cycles} cycles
    """.format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files, banding=args.banded,
               inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm, model=args.stateMachineType, regions=args.target_regions,
               tHdp=args.templateHDP, cHdp=args.complementHDP, cycles=args.cycles)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    # initialize to input fasta
    reference_sequence_path = args.ref

    # list of alignment files
    fast5s = cull_fast5_files(args.files_dir, args.nb_files)

    for cycle in range(0, args.cycles):
        # index the reference for bwa this is a string with the path to the index
        bwa_ref_index = get_bwa_index(reference_sequence_path, temp_dir_path)

        # unpack the reference sequence
        reference_sequence_string = get_first_sequence(reference_sequence_path)

        # settings shared by both helper calls of this cycle
        alignment_args = {
            "path_to_EC_refs": None,
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "banded": args.banded,
            "sparse_output": True,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": None,
            "degenerate": degenerate_enum(args.degenerate),
        }

        proposals = scan_for_proposals(temp_folder, STEP, reference_sequence_string, fast5s, alignment_args,
                                       args.nb_jobs)

        proposals = group_sites_in_window(proposals, 6)

        print("Cycle {cycle} - Got {nb} sites to check: {sites}".format(nb=len(proposals),
                                                                        sites=proposals,
                                                                        cycle=cycle))

        updated_reference_string = update_reference_with_marginal_probs(temp_folder, proposals,
                                                                        reference_sequence_string, fast5s,
                                                                        alignment_args, args.nb_jobs)

        updated_reference_path = temp_folder.add_file_path("cycle_snapshot.{cycle}.fa".format(cycle=cycle))

        # close the snapshot before it is read back (next cycle) or copied (below)
        with open(updated_reference_path, 'w') as fasta_handle:
            write_fasta("jamison{}".format(cycle), updated_reference_string, fasta_handle)

        reference_sequence_path = updated_reference_path

    # copy final file
    copyfile(reference_sequence_path, temp_dir_path + args.corrected)

    return
Beispiel #4
0
def main(args):
    """Run the iterative "Empire" error-correction pipeline.

    For every cycle in ``range(args.cycles)`` the current reference is passed
    to ``write_degenerate_reference_set`` and indexed for bwa, the selected
    ``.fast5`` files are aligned by ``args.nb_jobs`` ``aligner`` processes,
    and then, register by register (``range(STEP)``), variants are called by
    ``run_methyl_caller`` processes and applied to the working sequence with
    ``update_reference``.  The result of a cycle is written to
    ``iteration.<cycle>.fa`` and becomes the next cycle's reference.

    Exits with status 1 when ``args.ref`` is not an existing file.

    :param args: ignored -- the arguments are always re-read with
        ``parse_args()``.
    :return: None
    """
    # the incoming value is discarded; arguments always come from parse_args()
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
          file=sys.stderr)

    start_message = """
#   Starting Empire Error-Correction
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir,
               reference=args.ref,
               nbFiles=args.nb_files,
               banding=args.banded,
               inThmm=args.in_T_Hmm,
               inChmm=args.in_C_Hmm,
               model=args.stateMachineType,
               regions=args.target_regions,
               tHdp=args.templateHDP,
               cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out +
                                            "tempFiles_errorCorrection")

    workers = args.nb_jobs

    # path of the reference used by the current cycle; replaced after every cycle
    reference_sequence = args.ref

    for cycle in range(0, args.cycles):
        check, reference_sequence_length = write_degenerate_reference_set(
            input_fasta=reference_sequence, out_path=temp_dir_path, step=STEP)
        assert check, "Problem making degenerate reference sequence set"

        # index the reference for bwa
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = get_bwa_index(reference_sequence, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

        # fresh queues for this cycle's worker processes
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()

        # list of alignment files
        fast5s = [
            x for x in os.listdir(args.files_dir) if x.endswith(".fast5")
        ]

        # take only a random subset when there are more files than requested
        if args.nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:args.nb_files]

        # NOTE: for serial debugging, SignalAlignment(**alignment_args).run()
        # can be used instead of queueing the job
        for fast5 in fast5s:
            alignment_args = {
                "forward_reference": None,
                "backward_reference": None,
                "path_to_EC_refs": temp_dir_path,
                "destination": temp_dir_path,
                "stateMachineType": args.stateMachineType,
                "bwa_index": bwa_ref_index,
                "in_templateHmm": args.in_T_Hmm,
                "in_complementHmm": args.in_C_Hmm,
                "in_templateHdp": args.templateHDP,
                "in_complementHdp": args.complementHDP,
                "banded": args.banded,
                "sparse_output": True,
                # os.path.join works whether or not files_dir ends with a separator
                "in_fast5": os.path.join(args.files_dir, fast5),
                "threshold": args.threshold,
                "diagonal_expansion": args.diag_expansion,
                "constraint_trim": args.constraint_trim,
                "target_regions": None,
                "degenerate": degenerate_enum(args.degenerate),
            }
            work_queue.put(alignment_args)

        # one 'STOP' sentinel is queued per worker, behind all the jobs
        alignment_jobs = []
        for _ in range(workers):
            p = Process(target=aligner, args=(work_queue, done_queue))
            p.start()
            alignment_jobs.append(p)
            work_queue.put('STOP')

        for p in alignment_jobs:
            p.join()

        done_queue.put('STOP')

        print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
        print("\n#  signalAlign - finished alignments\n", file=sys.stdout)

        # working sequence is a string, that has the reference we're going to update this cycle
        working_sequence = get_first_sequence(reference_sequence)

        # register is the relative position that is being N-ed:
        # ACGTAGACAATA --> NCGTAGNCAATA = register 0
        # ACGTAGACAATA --> ANGTAGANAATA = register 1 ...
        for register in range(0, STEP):
            print("#  Starting Variant Calling, register: {}...".format(
                register),
                  file=sys.stdout,
                  end='\n')
            print("#  Starting Variant Calling, register: {}...".format(
                register),
                  file=sys.stderr,
                  end='')
            # cull the alignment files for this register
            alns, forward_mask = get_alignments_labels_and_mask(
                path_to_alignments=temp_dir_path + "*.tsv.{}".format(register),
                max=args.nb_files,
                suffix=".{}".format(register))
            # this is the list of positions that we're going to look at, based on this register
            degenerate_positions = {
                'forward': range(register, reference_sequence_length, STEP),
                'backward': range(register, reference_sequence_length, STEP)
            }

            # place to put the marginal probs
            variant_call_file = temp_folder.add_file_path(
                "variants.{cycle}.{reg}.calls".format(cycle=cycle,
                                                      reg=register))
            # NOTE: for serial debugging, CallMethylation(**call_methyl_args).write()
            # can be used instead of queueing the job
            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                work_queue.put(call_methyl_args)

            # a separate list per phase, so only this register's workers are joined
            caller_jobs = []
            for _ in range(workers):
                p = Process(target=run_methyl_caller,
                            args=(work_queue, done_queue))
                p.start()
                caller_jobs.append(p)
                work_queue.put('STOP')

            for p in caller_jobs:
                p.join()

            done_queue.put('STOP')

            # this is where the per-register update happens
            working_sequence = update_reference(variant_call_file,
                                                working_sequence,
                                                register,
                                                min_depth=0,
                                                get_sites=False)

            # remove alignments for this register
            for f in glob.glob(temp_dir_path + "*.tsv.{}".format(register)):
                os.remove(f)
            print("done", file=sys.stdout, end="\n")
            print("done", file=sys.stderr, end="\n")

        # add a file for this cycle
        ref_path = temp_folder.add_file_path(
            "iteration.{cycle}.fa".format(cycle=cycle))
        # write it out; the handle is closed before the next cycle reads the file back
        with open(ref_path, 'w') as fasta_handle:
            write_fasta("iteration.{cycle}.fa".format(cycle=cycle),
                        working_sequence, fasta_handle)
        # update the path to the reference for the next cycle
        reference_sequence = ref_path
    return
Beispiel #5
0
def main(args):
    """Align the .fast5 reads in ``args.files_dir`` to ``args.ref``.

    A temporary folder is opened, ``make_temp_sequence`` is called to fill
    ``reference_seq.txt`` from the reference, the reference is indexed for
    bwa, and one alignment job per selected read (a random subset of at most
    ``args.nb_files``) is handed to ``args.nb_jobs`` ``aligner`` worker
    processes.

    Exits with status 1 when ``args.ref`` is not an existing file.

    :param args: ignored -- the arguments are always re-read with
        ``parse_args()``.
    :return: None
    """
    # the incoming value is discarded; arguments always come from parse_args()
    args = parse_args()

    start_message = """
#   Starting Signal Align
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning {nbFiles}
#   Using model: {model}
#   Using banding: {banding} DEPRECIATE this ASAP
#   Aligning to regions in: {regions}
#   Input template HMM: {inThmm}
#   Input complement HMM: {inChmm}
    """.format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files, banding=args.banded,
               inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm, model=args.stateMachineType, regions=args.target_regions)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_alignment")
    reference_seq = temp_folder.add_file_path("reference_seq.txt")
    make_temp_sequence(args.ref, True, reference_seq)

    # index the reference for bwa
    print("signalAlign - indexing reference", file=sys.stderr)
    bwa_ref_index = get_bwa_index(args.ref, temp_dir_path)
    print("signalAlign - indexing reference, done", file=sys.stderr)

    # parse the target regions, if provided
    if args.target_regions is not None:
        target_regions = TargetRegions(args.target_regions)
    else:
        target_regions = None

    # setup workers for multiprocessing
    workers = args.nb_jobs
    work_queue = Manager().Queue()
    done_queue = Manager().Queue()
    jobs = []

    fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

    # take only a random subset when there are more files than requested
    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]

    # NOTE: for serial debugging, SignalAlignment(**alignment_args).run()
    # can be used instead of queueing the job
    for fast5 in fast5s:
        alignment_args = {
            "reference": reference_seq,
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "banded": args.banded,
            # os.path.join works whether or not files_dir ends with a separator
            "in_fast5": os.path.join(args.files_dir, fast5),
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": target_regions,
        }
        work_queue.put(alignment_args)

    # one 'STOP' sentinel is queued per worker, behind all the jobs
    for _ in range(workers):
        p = Process(target=aligner, args=(work_queue, done_queue))
        p.start()
        jobs.append(p)
        work_queue.put('STOP')

    for p in jobs:
        p.join()

    done_queue.put('STOP')
    print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
    print("\n#  signalAlign - finished alignments\n", file=sys.stdout)
Beispiel #6
0
def main(args):
    """Run the iterative "Zayante" error-correction pipeline.

    Eight cycles are run with a local ``STEP`` that starts at 10 and is
    decremented after every cycle (10 down to 3).  Within a cycle, each
    iteration ``it`` in ``range(STEP)``:

    1. builds forward/backward reference files with
       ``make_degenerate_reference`` for offset ``it``;
    2. aligns the selected ``.fast5`` files with ``args.nb_jobs`` ``aligner``
       worker processes;
    3. calls variants at positions ``range(it, length, STEP)`` with
       ``run_methyl_caller`` workers;
    4. gets a new reference from ``update_reference``, writes it to
       ``iteration.<cycle>.<it>.fa`` and uses that file as the reference for
       the next iteration;
    5. deletes the ``*.tsv`` alignment files.

    Exits with status 1 when ``args.ref`` is not an existing file.

    :param args: ignored -- the arguments are always re-read with
        ``parse_args()``.
    :return: None
    """
    # the incoming value is discarded; arguments always come from parse_args()
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
#   Starting Zayante Error-Correction
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files, banding=args.banded,
               inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm, model=args.stateMachineType, regions=args.target_regions,
               tHdp=args.templateHDP, cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    workers = args.nb_jobs

    # path of the reference used by the current iteration; replaced after every iteration
    reference_sequence = args.ref

    STEP = 10
    for cycle in range(0, 8):
        for it in range(0, STEP):
            # make paths for reference files
            forward_reference = temp_folder.add_file_path(
                "forward_reference.{cycle}.{iter}.txt".format(cycle=cycle, iter=it))
            backward_reference = temp_folder.add_file_path(
                "backward_reference.{cycle}.{iter}.txt".format(cycle=cycle, iter=it))

            # make N-ed reference sequence for this iteration
            deg, reference_sequence_length = make_degenerate_reference(reference_sequence, it,
                                                                       forward_reference, backward_reference,
                                                                       step=STEP)
            assert deg, "Problem making degenerate reference for cycle {cycle} iteration {iter}" \
                        "".format(cycle=cycle, iter=it)

            # index the reference for bwa
            # NOTE(review): this indexes args.ref, not the updated reference_sequence -- confirm intended
            print("signalAlign - indexing reference", file=sys.stderr)
            bwa_ref_index = get_bwa_index(args.ref, temp_dir_path)
            print("signalAlign - indexing reference, done", file=sys.stderr)

            # fresh queues for this iteration's worker processes
            work_queue = Manager().Queue()
            done_queue = Manager().Queue()

            # list of alignment files
            fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

            # take only a random subset when there are more files than requested
            if args.nb_files < len(fast5s):
                shuffle(fast5s)
                fast5s = fast5s[:args.nb_files]

            # NOTE: for serial debugging, SignalAlignment(**alignment_args).run()
            # can be used instead of queueing the job
            for fast5 in fast5s:
                alignment_args = {
                    "forward_reference": forward_reference,
                    "backward_reference": backward_reference,
                    "path_to_EC_refs": None,
                    "destination": temp_dir_path,
                    "stateMachineType": args.stateMachineType,
                    "bwa_index": bwa_ref_index,
                    "in_templateHmm": args.in_T_Hmm,
                    "in_complementHmm": args.in_C_Hmm,
                    "in_templateHdp": args.templateHDP,
                    "in_complementHdp": args.complementHDP,
                    "banded": args.banded,
                    "sparse_output": True,
                    # os.path.join works whether or not files_dir ends with a separator
                    "in_fast5": os.path.join(args.files_dir, fast5),
                    "threshold": args.threshold,
                    "diagonal_expansion": args.diag_expansion,
                    "constraint_trim": args.constraint_trim,
                    "target_regions": None,
                    "degenerate": degenerate_enum(args.degenerate),
                }
                work_queue.put(alignment_args)

            # one 'STOP' sentinel is queued per worker, behind all the jobs
            alignment_jobs = []
            for _ in range(workers):
                p = Process(target=aligner, args=(work_queue, done_queue))
                p.start()
                alignment_jobs.append(p)
                work_queue.put('STOP')

            for p in alignment_jobs:
                p.join()

            done_queue.put('STOP')

            print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
            print("\n#  signalAlign - finished alignments\n", file=sys.stdout)

            print("\n#  Starting Variant Calling\n", file=sys.stdout)
            print("\n#  Starting Variant Calling\n", file=sys.stderr)

            # cull the alignment files
            alns, forward_mask = get_alignments_labels_and_mask(temp_dir_path + "*.tsv", args.nb_files)

            # positions examined in this iteration: every STEP-th base starting at `it`
            degenerate_positions = {
                'forward': range(it, reference_sequence_length, STEP),
                'backward': range(it, reference_sequence_length, STEP)}

            variant_call_file = temp_folder.add_file_path("variants.{cycle}.{iter}.calls".format(cycle=cycle, iter=it))

            # NOTE: for serial debugging, CallMethylation(**call_methyl_args).write()
            # can be used instead of queueing the job
            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                work_queue.put(call_methyl_args)

            # a separate list per phase, so only the variant-calling workers are joined
            caller_jobs = []
            for _ in range(workers):
                p = Process(target=run_methyl_caller, args=(work_queue, done_queue))
                p.start()
                caller_jobs.append(p)
                work_queue.put('STOP')

            for p in caller_jobs:
                p.join()

            done_queue.put('STOP')

            print("\n#  Finished Variant Calling\n", file=sys.stdout)
            print("\n#  Finished Variant Calling\n", file=sys.stderr)

            # NOTE(review): reference_sequence is a file path here -- confirm update_reference accepts a path
            new_ref = update_reference(variant_call_file, reference_sequence, 0)

            ref_path = temp_folder.add_file_path("iteration.{cycle}.{iter}.fa".format(cycle=cycle, iter=it))

            # close the handle before the next iteration uses the file as its reference
            with open(ref_path, 'w') as fasta_handle:
                write_fasta("iteration.{cycle}.{iter}.fa".format(cycle=cycle, iter=it), new_ref, fasta_handle)

            reference_sequence = ref_path

            # remove old alignments
            for f in glob.glob(temp_dir_path + "*.tsv"):
                os.remove(f)
        # the next cycle uses a window one position narrower
        STEP -= 1
    return
Beispiel #7
0
def main(args):
    """Run the cyclic "Jamison" error-correction pipeline.

    For each of ``args.cycles`` cycles the current reference is indexed for
    bwa, scanned for proposal sites, the proposals are grouped in windows of
    6, and the reference is updated from the marginal probabilities. The
    updated reference of each cycle is written to a snapshot fasta in the
    temporary folder and becomes the input of the next cycle. The last
    reference (the input reference when no cycle ran) is finally copied to
    ``temp_dir_path + args.corrected``.

    :param args: ignored; it is immediately replaced by the result of
        ``parse_args()`` (kept for interface compatibility)
    :return: None. Exits the process with status 1 if ``args.ref`` is not an
        existing file.
    """
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv)
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
          file=sys.stderr)

    start_message = """
#   Starting Jamison Error-Correction
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
#   Performing {cycles} cycles
    """.format(fileDir=args.files_dir,
               reference=args.ref,
               nbFiles=args.nb_files,
               banding=args.banded,
               inThmm=args.in_T_Hmm,
               inChmm=args.in_C_Hmm,
               model=args.stateMachineType,
               regions=args.target_regions,
               tHdp=args.templateHDP,
               cHdp=args.complementHDP,
               cycles=args.cycles)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out +
                                            "tempFiles_errorCorrection")

    # initialize to input fasta
    reference_sequence_path = args.ref

    # list of alignment files
    fast5s = cull_fast5_files(args.files_dir, args.nb_files)

    for cycle in range(0, args.cycles):
        # index the reference for bwa this is a string with the path to the index
        bwa_ref_index = get_bwa_index(reference_sequence_path, temp_dir_path)

        # unpack the reference sequence
        reference_sequence_string = get_first_sequence(reference_sequence_path)

        alignment_args = {
            "path_to_EC_refs": None,
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "banded": args.banded,
            "sparse_output": True,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": None,
            "degenerate": degenerate_enum(args.degenerate),
        }

        proposals = scan_for_proposals(temp_folder, STEP,
                                       reference_sequence_string, fast5s,
                                       alignment_args, args.nb_jobs)

        proposals = group_sites_in_window(proposals, 6)

        print("Cycle {cycle} - Got {nb} sites to check: {sites}".format(
            nb=len(proposals), sites=proposals, cycle=cycle))

        updated_reference_string = update_reference_with_marginal_probs(
            temp_folder, proposals, reference_sequence_string, fast5s,
            alignment_args, args.nb_jobs)

        updated_reference_path = temp_folder.add_file_path(
            "cycle_snapshot.{cycle}.fa".format(cycle=cycle))

        # Close the snapshot explicitly: the file is read back at the top of
        # the next cycle (and by copyfile below), so it must be fully flushed
        # to disk rather than left to garbage collection.
        with open(updated_reference_path, 'w') as snapshot_fasta:
            write_fasta("jamison{}".format(cycle), updated_reference_string,
                        snapshot_fasta)

        reference_sequence_path = updated_reference_path

    # copy final file
    copyfile(reference_sequence_path, temp_dir_path + args.corrected)

    return
Beispiel #8
0
    def run(self, get_expectations=False):
        """Align the events of one fast5 read to the reference with vanillaAlign.

        Extracts the npRead, 2D fasta and models from ``self.in_fast5`` into a
        temporary folder, obtains a guide alignment (cigar string and strand)
        from ``exonerated_bwa``, assembles the vanillaAlign command line from
        the instance settings and runs it through the shell. The temporary
        folder is removed after the command has run and when the read does
        not map.

        An explicitly supplied match model (``self.in_templateModel`` /
        ``self.in_complementModel``) takes precedence over the model returned
        by ``get_npRead_2dseq_and_models``; the latter is used as a fallback.

        :param get_expectations: when truthy, ask vanillaAlign for template
            and complement expectations files (``-t`` / ``-c``) instead of a
            posteriors file (``-u``)
        :return: False if the fast5 path is not a file, if
            ``get_npRead_2dseq_and_models`` reports failure, or if the read
            maps to neither strand; True once the command has been issued
            (the exit status of the command is not checked)
        """
        def optional_flag(switch, value):
            # An unset option contributes nothing to the command line; a set
            # one carries its own trailing space so flags can be concatenated.
            if value is None:
                return ""
            return "{switch} {value} ".format(switch=switch, value=value)

        # file checks
        if not os.path.isfile(self.in_fast5):
            print("signalAlign - problem with file path {file}".format(file=self.in_fast5))
            return False

        # Preamble set up before doing the alignment

        # containers and defaults
        read_label = self.in_fast5.split("/")[-1]  # used in the posteriors file as identifier
        read_name = read_label[:-6]  # get the name without the '.fast5'

        # object for handling temporary files
        temp_folder = FolderHandler()
        temp_dir_path = temp_folder.open_folder(self.destination + "tempFiles_{readLabel}".format(readLabel=read_label))

        # read-specific files, could be removed later but are kept right now to make it easier to rerun commands
        temp_np_read = temp_folder.add_file_path("temp_{read}.npRead".format(read=read_label))
        temp_2d_read = temp_folder.add_file_path("temp_2Dseq_{read}.fa".format(read=read_label))
        temp_t_model = temp_folder.add_file_path("template_model.model")
        temp_c_model = temp_folder.add_file_path("complement_model.model")

        # make the npRead and fasta todo make this assert
        success, temp_t_model, temp_c_model = get_npRead_2dseq_and_models(
            fast5=self.in_fast5,
            npRead_path=temp_np_read,
            twod_read_path=temp_2d_read,
            template_model_path=temp_t_model,
            complement_model_path=temp_c_model,
        )

        if success is False:
            return False

        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = "--s "
        elif self.stateMachineType == "fourState":
            model_label = ".4s"
            stateMachineType_flag = "--f "
        elif self.stateMachineType == "echelon":
            model_label = ".e"
            stateMachineType_flag = "--e "
        else:
            model_label = ".vl"
            stateMachineType_flag = ""

        # get orientation and cigar from BWA this serves as the guide alignment
        cigar_string, strand = exonerated_bwa(
            bwa_index=self.bwa_index, query=temp_2d_read, target_regions=self.target_regions
        )

        # the strand decides the suffix of the posteriors file
        if strand == "+":
            orientation_suffix = ".forward.tsv"
        elif strand == "-":
            orientation_suffix = ".backward.tsv"
        else:
            # didn't map
            print("signalAlign - {} didn't map".format(read_label), file=sys.stderr)
            temp_folder.remove_folder()
            return False

        # this gives the format: /directory/for/files/file.model.orientation.tsv
        posteriors_file_path = self.destination + read_name + model_label + orientation_suffix

        # Alignment routine

        path_to_vanillaAlign = "./vanillaAlign"  # todo could require this in path

        # alignment flags

        # input (match) models: an explicitly supplied model wins, otherwise
        # fall back to the one extracted from the fast5. The previous code
        # used two independent `if`s, so the supplied model was always
        # overwritten (by the extracted model, or by "" when there was none).
        if self.in_templateModel is not None:
            template_model = self.in_templateModel
        else:
            template_model = temp_t_model
        template_model_flag = optional_flag("-T", template_model)

        if self.in_complementModel is not None:
            complement_model = self.in_complementModel
        else:
            complement_model = temp_c_model
        complement_model_flag = optional_flag("-C", complement_model)

        # input HMMs
        template_hmm_flag = optional_flag("-y", self.in_templateHmm)
        complement_hmm_flag = optional_flag("-z", self.in_complementHmm)

        # threshold, diagonal expansion and constraint trim
        threshold_flag = optional_flag("-d", self.threshold)
        diag_expansion_flag = optional_flag("-x", self.diagonal_expansion)
        trim_flag = optional_flag("-m", self.constraint_trim)

        # banded alignment
        if self.banded is True:
            banded_flag = "--b "
        else:
            banded_flag = ""

        # commands: both modes share everything up to the output options
        # NOTE(review): values are interpolated into a shell string unquoted,
        # so paths containing spaces or shell metacharacters will break the
        # command.
        command = (
            "echo {cigar} | {vA} {banded}{model}-r {ref} -q {npRead} {t_model}{c_model}{t_hmm}{c_hmm}{thresh}"
            "{expansion}{trim}".format(
                cigar=cigar_string,
                vA=path_to_vanillaAlign,
                model=stateMachineType_flag,
                banded=banded_flag,
                ref=self.reference,
                npRead=temp_np_read,
                t_model=template_model_flag,
                c_model=complement_model_flag,
                t_hmm=template_hmm_flag,
                c_hmm=complement_hmm_flag,
                thresh=threshold_flag,
                expansion=diag_expansion_flag,
                trim=trim_flag,
            )
        )

        if get_expectations:
            template_expectations_file_path = self.destination + read_name + ".template.expectations"
            complement_expectations_file_path = self.destination + read_name + ".complement.expectations"
            command += " -L {readLabel} -t {templateExpectations} -c {complementExpectations}".format(
                readLabel=read_label,
                templateExpectations=template_expectations_file_path,
                complementExpectations=complement_expectations_file_path,
            )
        else:
            command += " -u {posteriors} -L {readLabel}".format(
                posteriors=posteriors_file_path,
                readLabel=read_label,
            )

        # run (the exit status of the command is deliberately left unchecked,
        # as before, so callers keep receiving True once it has been issued)
        print("signalAlign - running command: ", command, end="\n", file=sys.stderr)
        os.system(command)
        temp_folder.remove_folder()
        return True