コード例 #1
0
def extract_fastq(input_f, ref_f, mode=0):
    """Extract the raw signal, basecalled fastq and mapped reference sequence.

    Args:
        input_f: input fast5 file name.
        ref_f: file name of the reference.
        mode: 0-dna, 1-rna, -1-rna 180mV (RNA signals get reversed).

    Returns:
        (raw_signal, raw_seq, ref_seq) tuple.

    Raises:
        ValueError: if the read could not be mapped to the reference.
    """
    with h5py.File(input_f, 'r') as input_fh:
        raw_signal = list(input_fh['/Raw/Reads'].values())[0]['Signal'].value
        raw_seq = input_fh[
            '/Analyses/Basecall_1D_000/BaseCalled_template/Fastq'].value
        # NOTE: the original built a second throwaway aligner and mapped the
        # whole fastq record; both results were immediately overwritten, so
        # that dead code is removed here.
        ref = mappy.Aligner(ref_f, preset="map-ont", best_n=5)
        # Map only the sequence line (second line) of the fastq record.
        aligns = ref.map(raw_seq.split(b'\n')[1])
        # Keep the hit with the highest mapping quality.
        maxmapq = -np.inf
        align = None
        for aln in aligns:
            if aln.mapq > maxmapq:
                maxmapq = aln.mapq
                align = aln
        if align is None:
            # BUG FIX: previously printed a message and then crashed with
            # AttributeError on `align.strand`; fail loudly instead.
            raise ValueError("FAIL MAPPING " + input_f)
        if align.strand == -1:
            ref_seq = mappy.revcomp(
                ref.seq(align.ctg, start=align.r_st, end=align.r_en))
        else:
            ref_seq = ref.seq(align.ctg, start=align.r_st, end=align.r_en)
        if (mode == 1) or (mode == -1):
            # RNA is sequenced 3'->5'; reverse the signal to match the seq.
            raw_signal = raw_signal[::-1]
    if ref_seq is None:
        print(input_f)
        print(aligns)
    return raw_signal, raw_seq, ref_seq
コード例 #2
0
ファイル: parse.py プロジェクト: kepbod/pacbio
def parse_target(fn):
    """Parse a target description file into a nested mapping.

    Each line describes one target.  Targets whose name ends with 'gDNA'
    get a five-part interval layout and a PacBio aligner; all others are
    treated as donors with a three-part layout and a short-read aligner.
    """
    target = defaultdict(dict)
    with open(fn, 'r') as handle:
        for row in handle:
            (name, left, donor, right, total, cut, in_s,
             in_e, fa) = row.rstrip().split()
            left = int(left)
            right = int(right)
            donor = int(donor)
            total = int(total)
            cut = int(cut)
            in_s = int(in_s)
            in_e = int(in_e)
            entry = target[name]
            entry['left_bond'] = left
            entry['right_bond'] = right
            entry['donor'] = donor
            entry['total'] = total
            if name.endswith('gDNA'):
                # gDNA: window of +/-10 around the cut site, five regions.
                entry['cut_left'] = cut - 10
                entry['cut_right'] = cut + 10
                interval = [
                    [0, left, 'L'],
                    [left, cut - 25, 'LH'],
                    [cut - 25, cut + 25, 'C'],
                    [cut + 25, left + donor, 'RH'],
                    [left + donor, total, 'R'],
                ]
                entry['fa'] = mp.Aligner(fa, preset='map-pb')
            else:
                # Donor: homology arms around the insert.
                interval = [
                    [0, in_s, 'LH'],
                    [in_s, in_e, 'I'],
                    [in_e, total, 'RH'],
                ]
                entry['fa'] = mp.Aligner(fa, preset='sr')
            entry['interval'] = interval
            # Mirror the intervals onto the reverse strand.
            entry['rev_interval'] = [[total - e, total - s, tag]
                                     for s, e, tag in reversed(interval)]
    return target
コード例 #3
0
ファイル: minimap2.py プロジェクト: iMetOsaka/UNAGI
def main(argv):
    """Command-line driver mirroring the minimap2 binary's basic options.

    Args:
        argv: full argument vector (argv[0] is the program name).

    Exits with status 1 (after printing usage) when fewer than two
    positional arguments are supplied.
    """
    opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
    if len(args) < 2:
        print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
        print("Options:")
        print("  -x STR      preset: sr, map-pb, map-ont, asm5, asm10 or splice")
        print("  -n INT      mininum number of minimizers")
        print("  -m INT      mininum chaining score")
        print("  -k INT      k-mer length")
        print("  -w INT      minimizer window length")
        print("  -r INT      band width")
        print("  -c          output the cs tag")
        sys.exit(1)

    preset = min_cnt = min_sc = k = w = bw = None
    out_cs = False
    for opt, arg in opts:
        if opt == '-x': preset = arg
        elif opt == '-n': min_cnt = int(arg)
        # BUG FIX: -m was stored in an unused `min_chain_score` local, so
        # the option never reached the aligner; assign to `min_sc`.
        elif opt == '-m': min_sc = int(arg)
        elif opt == '-r': bw = int(arg)
        elif opt == '-k': k = int(arg)
        elif opt == '-w': w = int(arg)
        elif opt == '-c': out_cs = True

    a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt,
                   min_chain_score=min_sc, k=k, w=w, bw=bw)
    if not a:
        raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
    for name, seq, qual in mp.fastx_read(args[1]):  # read one sequence
        for h in a.map(seq, cs=out_cs):  # traverse hits
            print('{}\t{}\t{}'.format(name, len(seq), h))
コード例 #4
0
def getIndex(reference, thread):
    """Build a mappy index for the reference and return the aligner.

    Falls back to the bundled reference.fa next to the package when no
    reference is supplied; keeps only the single best alignment and
    defaults to 2 indexing threads.
    """
    if reference:
        reffa = reference
    else:
        pkg_root = path.dirname(path.abspath(path.dirname(__file__)))
        reffa = path.join(pkg_root, "reference.fa")
    if not path.isfile(reffa):
        logging.error("Could not find reference.fa")
        sys.exit(
            "ERROR: Could not find reference.fa! Programme exit due to reference.fa problem."
        )
    # Default to 2 threads when the caller did not specify a count.
    thread = 2 if thread is None else thread
    # Invoke the minimap2 API (Heng Li); keep only the best alignment.
    aligner = mp.Aligner(
        reffa, preset="map-ont", best_n=1, n_threads=int(thread)
    )
    if not aligner:
        logging.error("Failed to load/build index")
        raise Exception(
            "ERROR: failed to load/build index! Programme exit due to mappy problem."
        )
    return aligner
コード例 #5
0
ファイル: patcher2.py プロジェクト: zooniikayler/Branco-Diss
def runsingle(reffile, reads1, reads2, fname, distance, cut_site, min_len,
              output_type):
    """Run single processor version of PAtChER"""
    print("Loading in Reference")
    reference = mp.Aligner(reffile, preset="sr")
    if not reference:
        raise Exception("ERROR: failed to load/build index file")
    print("Done.")
    sambam_output = SAMBAMWriter(fname, reference, output_type)
    print("Running Alignment")
    while True:
        try:
            # Pull and preprocess the next read pair; exhausting either
            # iterator ends the run via StopIteration.
            first = process_reads2.Read(next(reads1))
            first.split_read(cut_site, min_len)
            first.qual_trim(10, 10)
            second = process_reads2.Read(next(reads2))
            second.split_read(cut_site, min_len)
            second.qual_trim(10, 10)
            # Either sequence may have been trimmed away entirely.
            if first.seq and second.seq:
                res = alignment2.map_reads(reference, first, second, distance)
                if res:
                    sambam_output.process_output(res, first, second, distance)
        except StopIteration:
            break
コード例 #6
0
    def process_chunk(virtuals, ref):
        """Align every virtual h5 block in `virtuals` and collect results.

        Args:
            virtuals: iterable of (virtual, key) pairs; entries may be None.
            ref: a mappy.Aligner, a path to a reference (an index is built
                here), or None.

        Returns:
            dict mapping key -> processing result, or [[], [], error_dict]
            when processing a block raised IndexError.
        """
        # BUG FIX: `Al` was unbound when ref is None; default it explicitly.
        Al = None
        if ref is not None:
            if isinstance(ref, str):
                # Build the aligner once for the whole chunk.
                Al = mappy.Aligner(ref, preset="map-ont")
            else:
                Al = ref
        res = {}
        if virtuals is None or len(virtuals) == 0:
            return res
        for block in virtuals:
            if block is None:
                continue
            virtual, k = block
            if virtual is not None:
                try:
                    res[k] = virtual_h5_to_processing(virtual, Al)
                except IndexError as err:
                    # BUG FIX: was `error["IndexError"] += 1` on an empty
                    # dict (KeyError); record a count of 1 instead.
                    msg = err.args[0] if err.args else "IndexError"
                    res[k] = [[], [], {msg: 1}]

        return res
コード例 #7
0
def get_mapping(pred):
    """Return the first minimap2 hit for `pred`, or None on any failure."""
    try:
        hits = mp.Aligner(reference_file).map(pred)
        return next(hits)
    except Exception as e:
        # Best effort: unmappable reads (StopIteration from next) and
        # index problems are both reported as None.
        print(e)
        return None
コード例 #8
0
def align_to_chromosomes(teloes):
    """Align the non-telomeric parts of each telomere read to the genome,
    tally per-chromosome statistics, plot per-chromosome dot figures and
    print summary averages.

    Args:
        teloes: iterable of telomere objects exposing `non_telomeric_parts`,
            `rec_num` and `longest_telomere_len` (project type — confirm).
    """
    aligner = mp.Aligner('../../GRCh38_latest_genomic.fna.gz',
                         preset='map-ont')
    if not aligner: raise Exception("ERROR: failed to load/build index")

    # Per-chromosome counters and scatter data: autosomes 1..NUM_CHROMOSOMES
    # plus X, Y and an "Unknown" bucket.
    chromosome_dict = {}
    chromosome_graph = {}
    for i in range(1, NUM_CHROMOSOMES + 1):
        chromosome_dict[str(i)] = [[0, 0, 0], [0, 0, 0]]
        chromosome_graph[str(i)] = [[], []]
    chromosome_dict["X"] = [[0, 0, 0], [0, 0, 0]]
    chromosome_graph["X"] = [[], []]
    chromosome_dict["Y"] = [[0, 0, 0], [0, 0, 0]]
    chromosome_graph["Y"] = [[], []]
    chromosome_dict["Unknown"] = [[0, 0, 0], [0, 0, 0]]
    chromosome_graph["Unknown"] = [[], []]
    chromo_count = [[0, 0], [0, 0]]

    for telo in teloes:
        first_print = True
        aligns_dict = {}
        for to_align in telo.non_telomeric_parts:
            # Skip fragments too short to align reliably.
            if len(to_align) < MIN_ALIGNMENT_LENGTH:
                continue
            for hit in aligner.map(to_align):
                if hit.is_primary:
                    # Keyed by matched length so max() below picks the best.
                    aligns_dict[hit.mlen] = hit
                    if (first_print):
                        print(telo.rec_num)
                        first_print = False
                    print(hit.ctg + " : " + str(hit.r_st) + " - " +
                          str(hit.r_en) + " starnd: " + str(hit.strand) +
                          " blen: " + str(hit.blen) + " mlen: " +
                          str(hit.mlen) + " NM: " + str(hit.NM))
        if aligns_dict:
            # Best hit = largest matched length across all fragments.
            best = aligns_dict[max(aligns_dict)]
            chromosome_matcher(best, chromosome_dict, chromosome_graph,
                               telo.longest_telomere_len, chromo_count)

    for j in chromosome_dict:
        # NOTE(review): chromosome_dict[j] holds two lists, so the first
        # comparison (list != 0) is always True and every entry is printed;
        # the [2] index would be out of range but is never reached because
        # of short-circuiting.  Likely meant to test the inner counters —
        # confirm intent before changing.
        if (chromosome_dict[j][0] != 0) or (chromosome_dict[j][1] !=
                                            0) or (chromosome_dict[j][2] != 0):
            print(j + " " + str(chromosome_dict[j]))

    for chromosome in chromosome_graph:
        # Horizontal reference lines for the per-chromosome dot plot.
        plt.axhline(y=1, color='b', linestyle='-')
        plt.axhline(y=0, color='b', linestyle='-')
        plt.axhline(y=8, color='b', linestyle='-')
        plt.axhline(y=-7, color='b', linestyle='-')
        for dot in chromosome_graph[chromosome][1]:
            plt.plot(dot, 0, 'ro')
        for dot in chromosome_graph[chromosome][0]:
            plt.plot(dot, 1, 'ro')
        plt.savefig('Chromosome_' + chromosome + '.jpg')
        plt.close()

    # NOTE(review): raises ZeroDivisionError when no reads were counted.
    print("Average telo legth on edges - " +
          str(chromo_count[0][1] / chromo_count[0][0]) + "\n")
    print("Average telo legth in center - " +
          str(chromo_count[1][1] / chromo_count[1][0]) + "\n")
コード例 #9
0
def _main(args):
    """Run variant calling over a directory of fast5 reads.

    Args:
        args: parsed CLI namespace; extra attributes required by the guppy
            backend (but not valid options of this script) are added below.
    """
    try:
        mh.mkdir(args.guppy_logs_output_directory, False)
    except mh.MegaError:
        # Directory already exists — continue, but warn about overwriting.
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting " +
            "guppy logs.")
    logging.init_logger(args.guppy_logs_output_directory)
    # add required attributes for loading guppy, but not valid options for
    # this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    args.outputs = [mh.PR_VAR_NAME]

    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        LOGGER.info("Loading reference.")
        # Keep only the single best alignment per read.
        aligner = mappy.Aligner(str(args.reference),
                                preset=str("map-ont"),
                                best_n=1)

        process_all_reads(
            args.fast5s_dir,
            not args.not_recursive,
            args.num_reads,
            args.read_ids_filename,
            model_info,
            aligner,
            args.processes,
            args.output,
            args.suppress_progress,
            args.compute_false_reference_scores,
        )
コード例 #10
0
ファイル: extract.py プロジェクト: Global19/apps-scripts
def getFlankAligner(ref, ctg, start, stop, **kwargs):
    """Write the left/right flank sequences to a temporary fasta and
    return a short-read aligner over them.

    Returns:
        (aligner, tmp_file) — the caller must remove the temp file itself
        (it is created with delete=False).
    """
    tmpRef = NamedTemporaryFile(mode='w', delete=False)
    flanks = getFlanks(ref, ctg, start, stop, **kwargs)
    for side, seq in zip(['L', 'R'], flanks):
        tmpRef.write(f'>{"_".join([str(ctg),side])}\n{seq}\n')
    tmpRef.close()
    return mp.Aligner(tmpRef.name, preset='sr'), tmpRef
コード例 #11
0
def align_contigs(**kwargs):
    """Align contigs from a fasta file against a genome and write a TSV.

    Keyword Args:
        infile_fasta: input contigs fasta path.
        out: output TSV path.
        genome: reference genome path.
        preset: minimap2 preset string.
        nthreads: number of alignment threads.
    """
    if 'infile_fasta' in kwargs:
        infile = kwargs['infile_fasta']
    if 'out' in kwargs:
        outfile = kwargs['out']
    if 'genome' in kwargs:
        genome = kwargs['genome']
    if 'preset' in kwargs:
        preset = kwargs['preset']
    if 'nthreads' in kwargs:
        nthreads = kwargs['nthreads']

    a = mp.Aligner(str(genome), preset=preset, n_threads=nthreads)

    if not a:
        raise Exception("ERROR: failed to load/build index")

    outfile = open(outfile, 'w')

    outfile.write(
        "read\tchr\tpos\tr_st\tr_en\tq_st\tq_en\tq_len\tprimary\tstrand\tcs\tcigstr\tcigtup\n"
    )

    for name, seq, qual in mp.fastx_read(infile):
        seq_len = len(seq)
        # BUG FIX: was Python 2 `print name` — a SyntaxError on Python 3.
        print(name)
        for hit in a.map(seq, cs=True):
            # BUG FIX: the format string had 11 placeholders for 13 values,
            # so the cigstr/cigtup columns promised by the header were
            # silently dropped; use 13 placeholders.
            outfile.write(
                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    name, hit.ctg, hit.r_st, hit.r_st, hit.r_en, hit.q_st,
                    hit.q_en, seq_len, hit.is_primary, hit.strand, hit.cs,
                    hit.cigar_str, hit.cigar))

    outfile.close()
コード例 #12
0
def get_relation(seq1, seq2):
    """Classify the relation between two sequences using minimap2.

    Returns:
        (relation, forward_identity, reverse_identity) where relation is:
        ''   lengths differ by more than ~10% (not comparable),
        'NA' best identity below 0.8,
        'ID' identical, 'RC' reverse complement, 'R' reversed,
        'C' complement.
    """
    len1, len2 = len(seq1), len(seq2)
    # Give up early when lengths differ by more than roughly 10%.
    if 0.9 * len1 > len2 or 1.1 * len1 < len2:
        return '', 0, 0
    min_len = min(len1, len2)
    a = mp.Aligner(seq=seq1)
    f_iden = r_iden = 0
    f_strand = r_strand = 0
    # Forward orientation: identity of the primary hit of seq2 vs seq1.
    for h in a.map(seq2):
        if not h.is_primary:
            continue
        f_iden = h.mlen / (min_len + 0.0)
        f_strand = h.strand
        break
    # Reversed orientation: identity of reversed seq2 vs seq1.
    for h in a.map(seq2[::-1]):
        if not h.is_primary:
            continue
        r_iden = h.mlen / (min_len + 0.0)
        r_strand = h.strand
        break
    if max(f_iden, r_iden) < 0.8:
        return 'NA', f_iden, r_iden
    if f_iden > r_iden:
        res = 'ID' if f_strand == 1 else 'RC'
    else:
        res = 'R' if r_strand == 1 else 'C'
    # BUG FIX: previously returned only `res` here while every other path
    # returns a 3-tuple; keep the return shape consistent for callers.
    return res, f_iden, r_iden
コード例 #13
0
def filter_fastq(TE, R1, R2, out_fastq):
    '''Filter reads with a single read aligning to a given sequence
    '''
    reference = mp.Aligner(TE, preset="sr")  # load or build index
    if not reference:
        raise Exception("ERROR: failed to load/build index")

    out_handle = open(out_fastq, "w")

    iterator1 = SeqIO.parse(R1, "fastq")
    iterator2 = SeqIO.parse(R2, "fastq")

    for r1 in iterator1:
        r2 = next(iterator2)

        # A mate's hits only count if every hit covers >= 95% of the read;
        # otherwise all of its hits are discarded.
        r1_maps = list(reference.map(r1.seq))
        for h in [get_output_var(x) for x in r1_maps]:
            if h["blen"] < 0.95 * len(r1.seq):
                r1_maps = []
        r2_maps = list(reference.map(r2.seq))
        for h in [get_output_var(x) for x in r2_maps]:
            if h["blen"] < 0.95 * len(r2.seq):
                r2_maps = []

        # Write the mate whose partner (but not itself) hit the target.
        if len(r1_maps) >= 1 and len(r2_maps) < 1:
            SeqIO.write(r2, out_handle, 'fastq')
        elif len(r1_maps) < 1 and len(r2_maps) >= 1:
            SeqIO.write(r1, out_handle, 'fastq')
    out_handle.close()
コード例 #14
0
def get_minimap_cigar(genome, sequence, preset='map-ont', cigar_string=True):
    """Get the cigar of the primary alignment of `sequence` to `genome`.

    :param genome: fasta file to genome
    :param sequence: sequence to align
    :param preset: sr for single-end short reads;
                   map-pb for PacBio read-to-reference mapping;
                   map-ont for Oxford Nanopore read mapping;
                   splice for long-read spliced alignment;
                   asm5 for assembly-to-assembly alignment;
                   asm10 for full genome alignment of closely related species.
    :param cigar_string: if True return normal cigar string, if false return array of shape (n_cigar, 2)
                        The two numbers give the length and the operator of each CIGAR operation.
    :return: cigar of the primary hit, or None when nothing maps.
    :raises FileNotFoundError: when the genome path does not exist.
    :raises ValueError: on a bad preset or a too-short sequence.
    """
    # Explicit raises instead of `assert` — asserts vanish under `python -O`.
    if not os.path.exists(genome):
        raise FileNotFoundError("Genome path does not exist: {}".format(genome))
    if preset not in ("sr", "map-pb", "map-ont", "splice", "asm5", "asm10"):
        raise ValueError("Unsupported preset: {}".format(preset))
    if len(sequence) <= 60:
        raise ValueError("minimap does not find alignments for small reads")
    a = mp.Aligner(genome, preset=preset)  # load or build index
    if not a:
        raise Exception("ERROR: failed to load/build index")
    for hit in a.map(sequence):
        if hit.is_primary:
            print(hit)
            return str(hit.cigar_str) if cigar_string else hit.cigar
    return None
コード例 #15
0
def get_mp_error_rate(ref_seq, read_seq):
    """Return the error rate NM / (NM + mlen) of the primary alignment of
    `ref_seq` against `read_seq`, or -1 when nothing aligns."""
    a = mp.Aligner(seq=read_seq)
    error = -1
    for h in a.map(ref_seq):
        if not h.is_primary:
            continue
        # BUG FIX: `mlen` was an undefined name (NameError at runtime);
        # the hit's matched length `h.mlen` was intended.
        error = h.NM / (h.NM + h.mlen + 0.0)
        break
    return error
コード例 #16
0
 def __init__(self, index):
     """Create a read mapper from a minimap2 index path.

     When no index is supplied the object stays uninitialised with no
     mapper attached.
     """
     self.index = index
     if not self.index:
         self.mapper = None
         self.initialised = False
     else:
         self.mapper = mp.Aligner(self.index, preset="map-ont")
         self.initialised = True
コード例 #17
0
def read_in_contigs_as_reference():
    """Build a mappy index over the assembly contigs (fasta input only).

    Sets the module-level `contigs_as_reference` global.
    """
    global contigs_as_reference
    if guess_fileformat(arguments.contigs) != "fasta":
        raise ValueError("Contigs are in weird format, I refuse to cooperate.")
    # Imported lazily so non-fasta runs never need mappy installed.
    import mappy
    contigs_as_reference = mappy.Aligner(arguments.contigs)
コード例 #18
0
 def __init__(self, model_config, test_config, model_filepath):
     """Set up the test-data generator, reference aligner and result store."""
     self._generator = get_generator(model_config, test_config, kind="testing")
     self._reads = test_config['reads']
     self._batch_size = test_config['batch_size']
     self._model_file_path = model_filepath
     # Reference index used to evaluate basecalls.
     self._aligner = mp.Aligner("../useful_files/zymo-ref-uniq_2019-03-15.fa")
     # The assembler is only engaged when chunks do not overlap, i.e. the
     # stride equals the encoder window.
     self._with_assembler = (
         model_config['encoder_max_length'] == test_config['stride'])
     self._result_dic = self._get_result_dic(self._model_file_path)
コード例 #19
0
def _main(args):
    """Atomize variants from an input VCF against a reference and write a
    new, indexed VCF.

    Args:
        args: parsed CLI namespace with `reference`, `in_vcf`, `out_vcf`
            and `max_indel_size` attributes.
    """
    logging.init_logger()
    LOGGER.info("Loading reference")
    # Single best alignment; the aligner also serves as a sequence store
    # via aligner.seq().
    aligner = mappy.Aligner(str(args.reference),
                            preset=str("map-ont"),
                            best_n=1)
    LOGGER.info("Loading variants")
    var_data = variants.VarInfo(args.in_vcf,
                                aligner,
                                args.max_indel_size,
                                keep_var_fp_open=True)
    contigs = var_data.variants_idx.header.contigs.values()
    LOGGER.info("Atomizing variants")
    with open(args.out_vcf, "w") as out_vars:
        # preprocess contigs to set contig lengths for VCF header
        ctg_lens = {}
        for ctg in contigs:
            chrm_seq = aligner.seq(ctg.name)
            if len(chrm_seq) != ctg.length:
                # Trust the reference over the VCF header when they disagree.
                LOGGER.warning(
                    ("Mismatched contig lengths ({}) between " +
                     "reference ({}) and input VCF ({}) using length from "
                     "reference").format(ctg.name, len(chrm_seq), ctg.length))
            ctg_lens[ctg.name] = len(chrm_seq)

        # Emit the full header: fixed lines, per-contig lines, provenance
        # (command line) and the column/fields line.
        out_vars.write("\n".join(HEADER + [
            CONTIG_HEADER_LINE.format(ctg, ctg_len)
            for ctg, ctg_len in ctg_lens.items()
        ] + [
            variants.CONTEXT_BASE_MI_LINE,
            COMMAND_HEADER_LINE.format(" ".join(sys.argv)),
            FIELDS_LINE,
        ]) + "\n")
        for ctg in contigs:
            chrm_seq = aligner.seq(ctg.name)
            # Whole-contig position record used to fetch every variant.
            map_pos = mapping.MAP_POS(
                chrm=ctg.name,
                strand=None,
                start=0,
                end=len(chrm_seq),
                q_trim_start=None,
                q_trim_end=None,
            )
            for var in var_data.fetch_read_variants(map_pos,
                                                    mh.seq_to_int(chrm_seq)):
                # VCF positions are 1-based, hence the +1.
                out_vars.write(
                    RECORD_LINE.format(
                        chrm=ctg.name,
                        pos=var.ref_start + 1,
                        rid=var.id,
                        ref=var.ref,
                        alts=",".join(var.alts),
                        info=variants.HAS_CONTEXT_BASE_TAG
                        if var.has_context_base else ".",
                    ))

    LOGGER.info("Indexing output variant file")
    variants.index_variants(args.out_vcf)
コード例 #20
0
def process_unique_one(reference, hits, read1, read2, distance):
    """Rescue the non-unique mate of a pair whose other mate maps uniquely.

    Args:
        reference: mappy aligner over the full reference (provides .seq()).
        hits: (hits_for_read1, hits_for_read2); exactly one side must have
            a single hit and the other side at least one.
        read1, read2: read objects exposing `.seq`.
        distance: window (bp) around the unique hit to search for the mate.

    Returns:
        [[hit_dict, tag], [hit_dict, tag]] with tag "u" (unique),
        "r" (rescued uniquely) or "p" (rescued probabilistically).
    """
    if len(hits[0]) == 1 and len(hits[1]) > 0:
        indx = 0
        seq = read2.seq
        out = [[get_output_var(hits[0][0]), "u"], []]
    elif len(hits[1]) == 1 and len(hits[0]) > 0:
        indx = 1
        seq = read1.seq
        out = [[], [get_output_var(hits[1][0]), "u"]]
    else:
        # BUG FIX: previously fell through and crashed with
        # UnboundLocalError on `indx`; fail with a clear message.
        raise ValueError("process_unique_one requires one uniquely mapped mate")
    # Get the reference sequence around the unique hit.
    refseq = reference.seq(hits[indx][0].ctg, hits[indx][0].r_st - distance,
                           hits[indx][0].r_en + distance)
    if refseq:
        local_reference = mp.Aligner(seq=refseq, preset="sr", n_threads=1)
        new_hits = []
        for hit in local_reference.map(seq):  # traverse alignments
            # Keep only hits matching >80% of the mate's length.
            if hit.mlen / (len(seq) * 1.0) > 0.8:
                new_hits.append(hit)
        # Need to fix hit.ctg and hit.r_st???
        if len(new_hits) == 1:
            if indx == 0:
                out[1] = [get_output_var(new_hits[0]), "r"]
                out[1][0]["ctg"] = out[0][0]["ctg"]
            else:
                out[0] = [get_output_var(new_hits[0]), "r"]
                out[0][0]["ctg"] = out[1][0]["ctg"]
        elif len(new_hits) > 0:
            # Weight each candidate by an exponential decay in its distance
            # from the unique mate's window.
            distance_list = []
            for new_hit in new_hits:
                if new_hit.r_st >= distance:
                    distance_list.append(new_hit.r_st - distance)
                else:
                    distance_list.append(distance - new_hit.r_en)
            cumulative_probability = 0
            probability_list = []
            for dist in distance_list:
                scale_probability = math.exp(-0.8 * dist / 50 - 0.6618)
                if scale_probability < 0:
                    scale_probability = 0
                cumulative_probability += scale_probability
                probability_list.append(cumulative_probability)

            # BUG FIX: the random draw was duplicated (two consecutive
            # random.random() calls); keep a single draw.
            selected_probability = random.random() * cumulative_probability
            pindex = 0
            while selected_probability >= probability_list[pindex]:
                pindex += 1
            # NOTE(review): `pindex - 1` selects the previous candidate and
            # wraps to the last one when pindex == 0 — confirm whether
            # `new_hits[pindex]` was intended; kept as-is to preserve
            # existing behaviour.
            if indx == 0:
                out[1] = [get_output_var(new_hits[pindex - 1]), "p"]
                out[1][0]["ctg"] = out[0][0]["ctg"]
            else:
                out[0] = [get_output_var(new_hits[pindex - 1]), "p"]
                out[0][0]["ctg"] = out[1][0]["ctg"]
    return out
コード例 #21
0
 def __init__(self, reference, preset=None):
     """Wrap a mappy aligner over `reference`, keeping only the best hit.

     A preset takes priority; otherwise a tuned custom scoring is used.
     """
     kwargs = {'fn_idx_in': reference, 'best_n': 1}
     if preset:
         kwargs['preset'] = preset
     else:
         # Tuned scoring tuple (A,B,o,e,O,E); earlier candidates kept for
         # reference: (2,5,5,4,56,0) and (1,2,2,1,32,0).
         kwargs['scoring'] = (1, 2, 2, 1, 18, 0)
     self.kwargs = kwargs
     self._aligner = mp.Aligner(**kwargs)
コード例 #22
0
def prep(args):
    """Prepare training data: pair fast5 signals with their basecalls and,
    for ligation kits, with reference alignments, per barcoding kit.

    Args:
        args: parsed CLI namespace with fast5_dir, fastq,
            sequencing_summary, kit, ref, signal_size and read_limit.
    """
    # Accept either a directory of fast5 files or a single fast5 file.
    if pathlib.Path(args.fast5_dir).is_dir():
        fast5s = find_all_fast5s(args.fast5_dir)
    else:
        fast5s = [args.fast5_dir]

    read_seqs = load_fastq(args.fastq)

    albacore_barcodes = load_albacore_barcodes_from_sequencing_summary(
        args.sequencing_summary)

    # For the ligation kit we need to align to reference (but not for the rapid kit).
    if args.kit == 'EXP-NBD103_start' or args.kit == 'EXP-NBD103_end':
        mappy_aligner = mp.Aligner(args.ref)
    else:
        mappy_aligner = None

    read_count = 0
    for fast5_file in fast5s:
        try:
            read_id, signal = get_read_id_and_signal(fast5_file)
        except KeyError:
            # Malformed fast5 without the expected datasets; skip it.
            continue
        # Only process reads that have a basecalled sequence available.
        if read_id not in read_seqs:
            continue

        print('', file=sys.stderr)
        print(fast5_file, file=sys.stderr)
        print('  read ID: {}'.format(read_id), file=sys.stderr)

        # Look up the barcode Albacore assigned, if a summary was given.
        if albacore_barcodes is not None:
            try:
                albacore_barcode = albacore_barcodes[read_id]
            except KeyError:
                albacore_barcode = None
        else:
            albacore_barcode = None

        # NOTE(review): the first branch is a standalone `if`, so the
        # `elif` below chains only with the EXP-NBD103_end test; behaviour
        # is still correct because the kit values are mutually exclusive.
        if args.kit == 'EXP-NBD103_start':
            prep_native_read_start(signal, read_seqs[read_id], mappy_aligner,
                                   args.signal_size, albacore_barcode)

        if args.kit == 'EXP-NBD103_end':
            prep_native_read_end(signal, read_seqs[read_id], mappy_aligner,
                                 args.signal_size, albacore_barcode)

        elif args.kit == 'SQK-RBK004_start':
            prep_rapid_read_start()

        read_count += 1
        if args.read_limit is not None:
            if read_count >= args.read_limit:
                break

    print('', file=sys.stderr)
コード例 #23
0
def extract_fastq(input_f, ref_f, mode=0, trans_start=None):
    """Extract raw signal, basecalled fastq, reference sequence and events.

    Args:
        input_f: input fast5 file name.
        ref_f: file name of the reference.
        mode: 0-dna, 1-rna, -1-rna 180mV (RNA signals get reversed).
        trans_start: Start position of the transcription(required in RNA mode).

    Returns:
        (raw_signal, raw_seq, ref_seq, decap_event)

    Raises:
        ValueError: when the read cannot be mapped to the reference or
            trans_start is missing in RNA mode.
    """
    with h5py.File(input_f, 'r') as input_fh:
        raw_entry = list(input_fh['/Raw/Reads'].values())[0]
        raw_signal = raw_entry['Signal'].value
        raw_seq = input_fh[BASECALL_ENTRY + '/BaseCalled_template/Fastq'].value
        if mode != 0:
            # BUG FIX: was a bare `assert` (stripped under python -O).
            if trans_start is None:
                raise ValueError("trans_start is required in RNA mode")
            raw_signal, raw_seq, decap_event = _decap(input_fh, trans_start,
                                                      raw_signal, raw_seq)
        else:
            decap_event = input_fh[BASECALL_ENTRY +
                                   '/BaseCalled_template/Events'].value
        # NOTE: a first throwaway aligner whose result seeded `align` was
        # removed — it made the `align is None` failure check unreachable.
        ref = mappy.Aligner(ref_f, preset="map-ont", best_n=5)
        aligns = ref.map(raw_seq.split(b'\n')[1])
        # Keep the hit with the highest mapping quality.
        align = None
        maxmapq = -np.inf
        for aln in aligns:
            if aln.mapq > maxmapq:
                maxmapq = aln.mapq
                align = aln
        if align is None:
            # BUG FIX: previously printed and crashed on align.strand.
            raise ValueError("FAIL MAPPING " + input_f)
        if align.strand == -1:
            ref_seq = mappy.revcomp(
                ref.seq(align.ctg, start=align.r_st, end=align.r_en))
        else:
            ref_seq = ref.seq(align.ctg, start=align.r_st, end=align.r_en)
        if (mode == 1) or (mode == -1):
            # RNA is sequenced 3'->5'; reverse the signal to match the seq.
            raw_signal = raw_signal[::-1]
    if ref_seq is None:
        # BUG FIX: a bare `raise` outside an except block raises a
        # confusing RuntimeError; raise a descriptive error instead.
        raise ValueError("No Reference sequence found in %s" % (input_f))
    return raw_signal, raw_seq, ref_seq, decap_event
コード例 #24
0
def test(config, experiment_name, new_testing=False):
    """Evaluate the trained model on every configured bacterium."""
    if new_testing:
        # Start this experiment from a clean slate.
        discard_existing_testing(experiment_name)
    model = get_trained_model(config, experiment_name)
    controller = TestingController(config, experiment_name, model, new_testing)

    for bacteria in config['testing']['bacteria']:
        bact_name = bacteria['name']
        raw_generator = data_api.get_raw_generator(config, bacteria['data'])
        ref_aligner = mp.Aligner(bacteria['reference'])
        controller.test(bact_name, raw_generator, ref_aligner)
コード例 #25
0
ファイル: get_align_info.py プロジェクト: quentin0515/TT-Mars
def align_before_after(output_dir, sv, query_seq, ref_seq_1, ref_seq_2):
    """Score the query sequence against the reference before and after the
    structural variant.

    Short SVs (not sv.is_third_fil) are scored with a Biopython global
    pairwise alignment; long ones go through minimap2 with the score
    derived from the matched length of the first hit.

    Returns:
        (score_before, score_after), or (None, None) when minimap2 finds
        no alignment for either side.
    """
    #within length limit
    if not sv.is_third_fil:
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        #aligner.mode = 'local'
        # Unit match/mismatch scoring with mildly affine gap costs.
        aligner.match_score = 1
        aligner.mismatch_score = -1
        aligner.open_gap_score = -1
        aligner.extend_gap_score = -0.5
        #aligner.score_only = True
        alignment_beforeSV = aligner.score(query_seq, ref_seq_1)
        alignment_afterSV = aligner.score(query_seq, ref_seq_2)
    else:
        # Long SV: write the query to a temporary fasta, index it with
        # mappy and map both reference windows against it.
        h = open(output_dir + "tmp_query.fasta", "w")
        h.write('>' + str(sv.idx) + "\n")
        h.write(query_seq + "\n")
        h.close()
        #         aligner = mappy.Aligner(fn_idx_in=output_dir+"tmp_query.fasta", scoring=[1,1,2,1])
        aligner = mappy.Aligner(fn_idx_in=output_dir + "tmp_query.fasta")
        #if not alignment: raise Exception("ERROR: failed to load/build index")
        aligner_beforeSV = aligner.map(ref_seq_1,
                                       seq2=None,
                                       cs=False,
                                       MD=False)
        aligner_afterSV = aligner.map(ref_seq_2, seq2=None, cs=False, MD=False)

        #test
        #         for agt in aligner_beforeSV:
        #             alignment_beforeSV = len(query_seq) - (len(ref_seq_1) - agt.mlen)
        #             break
        #         for agt in aligner_afterSV:
        #             alignment_afterSV = len(query_seq) - (len(ref_seq_2) - agt.mlen)
        #             break

        # No hit before the SV: clean up and report failure for both sides.
        try:
            agt_before = next(aligner_beforeSV)
        except:
            os.remove(output_dir + "tmp_query.fasta")
            return None, None

        # Same for the window after the SV.
        try:
            agt_after = next(aligner_afterSV)
        except:
            os.remove(output_dir + "tmp_query.fasta")
            return None, None

        # Score = query length minus the unmatched part of the reference.
        alignment_beforeSV = len(query_seq) - (len(ref_seq_1) -
                                               agt_before.mlen)
        alignment_afterSV = len(query_seq) - (len(ref_seq_2) - agt_after.mlen)

        os.remove(output_dir + "tmp_query.fasta")

    return alignment_beforeSV, alignment_afterSV
コード例 #26
0
def remove_by_alignment(fq, ref, out, mapq, preset, human_out, threads,
                        logger):
    """Split reads in `fq` by whether they map to `ref` with quality >= mapq.

    Reads with at least one alignment at or above `mapq` are treated as
    host ("human") reads and optionally written to `human_out`; all other
    reads are written to `out`.
    """
    fout = smart_open(filename=out, mode="w")

    if human_out:
        hout = smart_open(filename=human_out, mode="w")
    else:
        hout = None

    logger.info(f"Starting to map reads against: {ref}")

    logger.info(f"Initiating aligner: {ref}")
    aligner = mp.Aligner(str(ref), preset=preset, n_threads=threads)

    logger.info(f"Opening file handle: {fq}")
    if fq:
        reads = mp.fastx_read(str(fq))
    else:
        reads = None  # PE  NOTE(review): iterating None below will raise

    ref_maps = 0
    total_reads = 0

    logger.info(f"Filtering mapped reads [Q >= {mapq}]")

    # PERF FIX: sets give O(1) membership tests; these were lists, making
    # the loop O(n^2) over the read count.
    human = set()
    not_human = set()
    for name, seq, qual in reads:
        mapped = aligner.map(seq)
        for aln in mapped:
            if aln.mapq >= mapq:
                ref_maps += 1
                human.add(name)
                if hout is not None:
                    hout.write(str(f"@{name}\n{seq}\n+\n{qual}\n"))

        if name not in human:
            fout.write(str(f"@{name}\n{seq}\n+\n{qual}\n"))
            not_human.add(name)

        total_reads += 1

    fout.close()

    if hout is not None:
        hout.close()

    logger.info(f"Computed {ref_maps} mappings against reference: {ref}")
    logger.info(f"Recovered  {len(not_human)} / {total_reads} reads from {fq}")
コード例 #27
0
def create_index(reference_file):
    """Index the reference with mappy (best hit only) and record contig
    names and lengths in the module globals `reference_names` /
    `reference_lengths`."""
    aligner = mp.Aligner(reference_file, best_n=1)

    for contig_name, contig_seq, _ in mp.fastx_read(reference_file,
                                                    read_comment=False):
        reference_names.append(contig_name)
        reference_lengths[contig_name] = len(contig_seq)

    if not aligner:
        raise Exception("ERROR: failed to load/build index file '{}'".format(
            reference_file))

    return aligner
コード例 #28
0
def hdf_to_sam_worker(reference, fname):
    """Extract and align basecall and methylation data from `.fast5`.

    :param reference: `.fasta` file containing reference sequence(s).
    :param fname: `.'fast5` file containing read data.

    :returns: list of SAM record strings (one per aligned read), each
        carrying custom MA/MC tags with per-base 6mA/5mC scores.
    """
    logger = medaka.common.get_named_logger('ModExtract')
    logger.info("Processing {}.".format(fname))
    results = list()
    aligner = mappy.Aligner(reference, preset='map-ont')
    with get_fast5_file(fname, mode="r") as f5:
        reads = list(f5.get_read_ids())
        logger.info("Found {} reads for {}.".format(len(reads), fname))
        for read_id in reads:
            read = f5.get_read(read_id)
            tool = Basecall1DTools(read)
            name, sequence, qstring = tool.get_called_sequence('template',
                                                               fastq=False)
            try:
                # Take only the first (best) alignment for the read.
                align = next(aligner.map(sequence, MD=True, cs=True))
            except StopIteration:
                # Unmapped read: skip it entirely.
                continue
            else:
                if align.strand == +1:
                    flag = '0'
                    seq = sequence
                else:
                    # SAM stores reverse-strand reads reverse-complemented
                    # with flag 16.
                    flag = '16'
                    seq = medaka.common.reverse_complement(sequence)
                rname = align.ctg
                # SAM positions are 1-based.
                pos = str(align.r_st + 1)
                mapq = str(align.mapq)
                # Soft-clip the unaligned query prefix/suffix.
                clip = [
                    '' if x == 0 else '{}S'.format(x)
                    for x in (align.q_st, len(sequence) - align.q_en)
                ]
                if align.strand == -1:
                    clip = clip[::-1]
                cigar = clip[0] + align.cigar_str + clip[1]
                NM = 'NM:i:' + str(align.NM)

            # Per-base modified-base scores stored by the basecaller.
            latest = read.get_latest_analysis('Basecall_1D')
            mod_base = read.get_analysis_dataset(latest, MODBASEPATH)
            mod_base = mod_base.view(dtype=MODTYPE)
            mA = 'MA:B:C,{}'.format(','.join(
                str(x) for x in mod_base['6mA'].reshape(-1)))
            mC = 'MC:B:C,{}'.format(','.join(
                str(x) for x in mod_base['5mC'].reshape(-1)))

            results.append('\t'.join(
                (read_id, flag, rname, pos, mapq, cigar, '*', '0', '0', seq,
                 qstring, NM, mA, mC)))
    return results
コード例 #29
0
ファイル: peak_utils.py プロジェクト: wckdouglas/cfNA
def is_mt(seq, rnr=False):
    """Return 'is_MT' when `seq` maps to the mitochondrial reference
    (the mt rRNA reference when rnr=True), otherwise 'not_MT'."""
    chrom_path = '/stor/work/Lambowitz/ref/hg19'
    if rnr:
        genome = chrom_path + '/new_genes/mt_rnr.fa'
    else:
        genome = chrom_path + '/genome/chrM.minimap2_idx'

    aligner = mappy.Aligner(genome, preset='sr')
    hits = list(aligner.map(seq))
    return 'is_MT' if hits else 'not_MT'
コード例 #30
0
def generate_coverage(read1, read2, mapping, ref, pwid=0.95, ncpu=1, chunk_size=500000, quiet=False):
    """Compute per-base coverage of `ref` from paired short reads.

    Args:
        read1, read2: fastq paths of the two read files.
        mapping: dict contig name -> grouping key; only hits within one
            group per read are counted.
        ref: reference fasta path.
        pwid: minimum percent identity, used to derive the mismatch cap.
        ncpu: alignment / thread-pool size.
        chunk_size: reads per pool batch.
        quiet: suppress progress output.

    Returns:
        dict contig name -> numpy int array of per-base coverage.
    """
    if not quiet: print("Building index and data structures...")

    # One zeroed coverage vector per reference sequence.
    seq_cov = {}
    for name, seq in pyfastx.Fasta(ref, build_index=False):
        seq_cov[name] = np.zeros(len(seq), dtype=int)

    # Mean read length drives the chaining-score and mismatch thresholds.
    # NOTE(review): raises ZeroDivisionError when read1 is empty.
    nreads = 0
    read_len = 0
    for r in mp.fastx_read(read1):
        nreads+=1
        read_len += len(r[1])
    read_len /= nreads
    min_chain_score = int(0.9*read_len)
    min_mis_match = int(read_len-pwid*read_len)

    a = mp.Aligner(ref, preset='sr', n_threads=ncpu, best_n=1000, min_chain_score=min_chain_score)  # load or build index 
    if not a: raise Exception("ERROR: failed to load/build index")

    def mpile(seqs):
        # Map one read; keep full-length (no clipping), low-mismatch hits
        # from a single mapping group, stopping at the first hit that
        # belongs to a different group.
        if seqs is None: return([])
        thrbuf = mp.ThreadBuffer()
        hits = []
        chrom=None
        for hit in a.map(seqs[1], buf=thrbuf):
            if (hit.NM<=min_mis_match) and ('S' not in hit.cigar_str) and ('H' not in hit.cigar_str):
                if chrom is None:
                    chrom=mapping[hit.ctg]
                    # NOTE(review): mappy's r_st is already 0-based, so
                    # r_st-1 may widen the interval by one — confirm.
                    hits.append((hit.ctg, hit.r_st-1, hit.r_en))
                elif mapping[hit.ctg] == chrom:
                    hits.append((hit.ctg, hit.r_st-1, hit.r_en))
                else:
                    break
        return(hits)

    if not quiet: print("Aligning reads...")
    # Stream both fastq files through the pool in fixed-size chunks.
    pool = ThreadPool(ncpu)
    for reads in tqdm(grouper(chain(
        mp.fastx_read(read1),
        mp.fastx_read(read2)), chunk_size), 
        total=int(1+2*nreads/chunk_size), disable=quiet):
        hits = pool.map(mpile, reads)
        for hit in chain.from_iterable(hits):
            if hit is None: continue
            seq_cov[hit[0]][hit[1]:hit[2]] += 1

    #close the pool and wait for the work to finish
    pool.close()
    pool.join()

    return(seq_cov)