Example #1
    def map(self, name, seq, qual):
        seq = seq.replace('U', 'T')
        seqmaps = list(self.aligner.map(seq))
        if not seqmaps:
            yield (name, int(FUNMAP), '*', 0, 0, '*', '*', 0, 0, seq, qual)
            return

        for i, h in enumerate(seqmaps):
            if i > 0:
                flag = int(FSECONDARY)
            elif not h.is_primary:
                flag = int(FSUPPLEMENTARY)
            else:
                flag = 0

            leftclip = '{}S'.format(h.q_st) if h.q_st > 0 else ''
            rightclip = '{}S'.format(len(seq) -
                                     h.q_en) if h.q_en < len(seq) else ''

            if h.strand > 0:
                seq_f = seq
                qual_f = qual
            else:
                seq_f = mappy.revcomp(seq)
                qual_f = qual[::-1]
                leftclip, rightclip = rightclip, leftclip
                flag |= FREVERSE

            fullcigar = leftclip + h.cigar_str + rightclip

            yield (name, flag, h.ctg, h.r_st + 1, h.mapq, fullcigar, '*', 0, 0,
                   seq_f, qual_f, 'NM:i:{}'.format(h.NM))
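
The flag constants above are assumed to come from the surrounding module; a minimal sketch of compatible definitions and a driver, using the standard SAM flag values (not part of the original snippet):

# Hypothetical setup for the map() method above.
import mappy

FUNMAP, FREVERSE, FSECONDARY, FSUPPLEMENTARY = 4, 16, 256, 2048

# self.aligner would be built once, e.g.:
#   self.aligner = mappy.Aligner('ref.fa', preset='map-ont')
# and each yielded tuple printed as a SAM line:
#   for fields in mapper.map(name, seq, qual):
#       print('\t'.join(map(str, fields)))
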
Example #2
def extract_fastq(input_f, ref_f, mode=0):
    """
    Args:
        input_f: path to the input fast5 file
        ref_f: file name of the reference
        mode: 0-dna, 1-rna, -1-rna 180mV
    """
    with h5py.File(input_f, 'r') as input_fh:
        raw_signal = list(input_fh['/Raw/Reads'].values())[0]['Signal'].value
        raw_seq = input_fh[
            '/Analyses/Basecall_1D_000/BaseCalled_template/Fastq'].value
        ref = mappy.Aligner(ref_f, preset="map-ont", best_n=5)
        # Map only the sequence line of the FASTQ record, then keep the
        # highest-mapq hit.
        aligns = ref.map(raw_seq.split(b'\n')[1])
        maxmapq = -np.inf
        align = None
        ref_seq = None
        for aln in aligns:
            if aln.mapq > maxmapq:
                maxmapq = aln.mapq
                align = aln
        if align is None:
            print("FAIL MAPPING " + input_f)
        elif align.strand == -1:
            ref_seq = mappy.revcomp(
                ref.seq(align.ctg, start=align.r_st, end=align.r_en))
        else:
            ref_seq = ref.seq(align.ctg, start=align.r_st, end=align.r_en)
        if (mode == 1) or (mode == -1):
            raw_signal = raw_signal[::-1]
    if ref_seq is None:
        print(input_f)
        print(aligns)
    return raw_signal, raw_seq, ref_seq
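
The best-hit loop above can be expressed more compactly; an equivalent sketch, assuming aligns holds the mappy hits:

# align = max(aligns, key=lambda a: a.mapq, default=None)
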
Example #3
def sam_record(read_id, sequence, qstring, mapping, tags=None, sep='\t'):
    """
    Format a string sam record.
    """
    if mapping:
        softclip = [
            '%sS' % mapping.q_st if mapping.q_st else '',
            mapping.cigar_str,
            '%sS' % (len(sequence) - mapping.q_en) if len(sequence) - mapping.q_en else ''
        ]
        record = [
            read_id,
            0 if mapping.strand == +1 else 16,
            mapping.ctg,
            mapping.r_st + 1,
            mapping.mapq,
            ''.join(softclip if mapping.strand == +1 else softclip[::-1]),
            '*', 0, 0,
            sequence if mapping.strand == +1 else mappy.revcomp(sequence),
            qstring if mapping.strand == +1 else qstring[::-1],
            'NM:i:%s' % mapping.NM,
            'MD:Z:%s' % mapping.MD,
        ]
    else:
        record = [
            read_id, 4, '*', 0, 0, '*', '*', 0, 0, sequence, qstring, 'NM:i:0'
        ]

    if tags is not None:
        record.extend(tags)

    return sep.join(map(str, record))
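
A usage sketch for sam_record(), with placeholder paths; note that mapping.MD is only populated when MD=True is passed to map():

import mappy

aligner = mappy.Aligner('ref.fa', preset='map-ont')      # placeholder reference
for name, seq, qual in mappy.fastx_read('reads.fastq'):  # placeholder reads
    hit = next(aligner.map(seq, MD=True), None)          # MD=True fills hit.MD
    print(sam_record(name, seq, qual, hit))
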
Example #4
def write_sam(read_id, sequence, qstring, mapping, fd=sys.stdout, unaligned=False, sep='\t'):
    """
    Write a sam record to a file descriptor.
    """
    if unaligned:
        fd.write("%s\n" % sep.join(map(str, [
            read_id, 4, '*', 0, 0, '*', '*', 0, 0, sequence, qstring, 'NM:i:0'
        ])))
    else:
        softclip = [
            '%sS' % mapping.q_st if mapping.q_st else '',
            mapping.cigar_str,
            '%sS' % (len(sequence) - mapping.q_en) if len(sequence) - mapping.q_en else ''
        ]
        fd.write("%s\n" % sep.join(map(str, [
            read_id,
            0 if mapping.strand == +1 else 16,
            mapping.ctg,
            mapping.r_st + 1,
            mapping.mapq,
            ''.join(softclip if mapping.strand == +1 else softclip[::-1]),
            '*', 0, 0,
            sequence if mapping.strand == +1 else revcomp(sequence),
            qstring if mapping.strand == +1 else qstring[::-1],
            'NM:i:%s' % mapping.NM,
        ])))
    fd.flush()
Example #5
    def __init__(self, fn):
        self.name = fn
        self.min = 100
        self.max = 0
        self.seqs = {}
        _seqs = {}

        for l in open(fn):
            amplicon, name, seq, left, forward, pos = l.strip().split("\t")
            pos = int(pos)
            forward = forward.lower() in ["t", "+", "forward", "true"]
            left = left.lower() in ["left", "true", "t"]

            length = len(seq)
            if length > self.max:
                self.max = length
            if length < self.min:
                self.min = length

            _seqs[seq] = Primer(amplicon, name, left, forward, True, pos, length)
            _seqs[mp.revcomp(seq)] = Primer(
                amplicon, name, left, forward, False, pos, length
            )

        for k, v in _seqs.items():
            self.seqs[k[: self.min]] = v
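
Primer is not defined in this snippet; a minimal sketch of a compatible record type, matching the positional arguments used above (the field names are assumptions):

from collections import namedtuple

# The fifth field marks whether the key sequence is the original primer
# (True) or its reverse complement (False).
Primer = namedtuple(
    'Primer',
    ['amplicon', 'name', 'left', 'forward', 'original', 'pos', 'length'])
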
Example #6
def extract_fa(fa, rep1, rep2, known_rep1_5_fa, known_rep2_5_fa, novel_rep1_5_fa, novel_rep2_5_fa, known_rep1_3_fa, known_rep2_3_fa, novel_rep1_3_fa, novel_rep2_3_fa):
    bsj_dict = dd(lambda: dd(lambda:0))

    rep = 0
    for fn in [rep1, rep2]:
        rep += 1
        with open(fn) as fp:
            for line in fp:
                if line.startswith('#'): continue
                ele = line.rsplit()
                bsj = (ele[idx['chrom']], ele[idx['startCoor0base']], ele[idx['endCoor']], ele[idx['canoBSJMotif']][0])
                is_known = ele[idx['isKnownBSJ']]
                if 'True' not in is_known:
                    bsj_dict[bsj]['Cate'] = 'Novel'
                else:
                    bsj_dict[bsj]['Cate'] = 'Known'
                bsj_dict[bsj][rep] = 1

    with open(known_rep1_5_fa, 'w') as k1_5, open(known_rep2_5_fa, 'w') as k2_5, open(novel_rep1_5_fa, 'w') as n1_5, open(novel_rep2_5_fa, 'w') as n2_5, open(known_rep1_3_fa, 'w') as k1_3, open(known_rep2_3_fa, 'w') as k2_3, open(novel_rep1_3_fa, 'w') as n1_3, open(novel_rep2_3_fa, 'w') as n2_3:
        for bsj in bsj_dict:
            (chrom, start, end, strand) = bsj
            start, end = int(start), int(end)
            ref_seq = fa[chrom]
            # 5'
            if strand == '+':
                seq = ref_seq[end-3:end+6].seq.upper()
                _5_str = '>{}:{}-{} {}\n{}\n'.format(chrom, end-2, end+6, strand, seq)
                seq = ref_seq[start-20:start+3].seq.upper()
                _3_str = '>{}:{}-{} {}\n{}\n'.format(chrom, start-19, start+3, strand, seq)
            elif strand == '-':
                seq = ref_seq[start-6:start+3].seq.upper()
                _5_str = '>{}:{}-{} {}\n{}\n'.format(chrom, start-5, start+3, strand, mp.revcomp(seq))
                seq = ref_seq[end-3:end+20].seq.upper()
                _3_str = '>{}:{}-{} {}\n{}\n'.format(chrom, end-2, end+20, strand, mp.revcomp(seq))
            else:
                print('Unexpected strand: {}'.format(strand))
                continue

            cate = bsj_dict[bsj]['Cate']
            rep_cnt = bsj_dict[bsj][1] + bsj_dict[bsj][2]
            if cate == 'Known':
                if rep_cnt == 1:
                    k1_5.write(_5_str)
                    k1_3.write(_3_str)
                elif rep_cnt == 2:
                    k2_5.write(_5_str)
                    k2_3.write(_3_str)
                else:
                    print('Unexpected rep_cnt: {}'.format(rep_cnt))
            elif cate == 'Novel':
                if rep_cnt == 1:
                    n1_5.write(_5_str)
                    n1_3.write(_3_str)
                elif rep_cnt == 2:
                    n2_5.write(_5_str)
                    n2_3.write(_3_str)
                else:
                    print('Unexpected rep_cnt: {}'.format(rep_cnt))
            else:
                print('Unexpected cate: {}'.format(cate))
Example #7
def match_single(fq, primersets):
    for r in mp.fastx_read(fq, read_comment=True):
        r = Read(*r)
        rc = mp.revcomp(r.seq)
        matches = {}
        for pset in primersets:
            matches[pset.name] = Matched(r, pset.match(r.seq), r,
                                         pset.match(rc))
        yield r, matches
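
Read and Matched are assumed container types; a minimal sketch consistent with their use here (fastx_read with read_comment=True yields 4-tuples):

from collections import namedtuple

Read = namedtuple('Read', ['name', 'seq', 'qual', 'comment'])
Matched = namedtuple('Matched',
                     ['fwd_read', 'fwd_match', 'rev_read', 'rev_match'])
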
Example #8
def get_seqs(ref_fa, chrom, strand, up_site, down_site):
    _5_name, _3_name, _5_seq, _3_seq = '', '', '', ''
    ref_seq = ref_fa[chrom]
    if strand == '+':
        _5_name = '{}:{}-{} {}'.format(chrom, up_site - 2, up_site + 6, strand)
        _5_seq = ref_seq[up_site - 3:up_site + 6].seq.upper()
        _3_name = '{}:{}-{} {}'.format(chrom, down_site - 20, down_site + 2,
                                       strand)
        _3_seq = ref_seq[down_site - 21:down_site + 2].seq.upper()
    else:  # '-'
        _5_name = '{}:{}-{} {}'.format(chrom, down_site - 6, down_site + 2,
                                       strand)
        _5_seq = ref_seq[down_site - 7:down_site + 2].seq.upper()
        _5_seq = mp.revcomp(_5_seq)
        _3_name = '{}:{}-{} {}'.format(chrom, up_site - 2, up_site + 20,
                                       strand)
        _3_seq = ref_seq[up_site - 3:up_site + 20].seq.upper()
        _3_seq = mp.revcomp(_3_seq)
    if _5_seq[3:5] != 'GT' or _3_seq[-5:-3] != 'AG':
        _5_name = ''
    return _5_name, _5_seq, _3_name, _3_seq
Example #9
def _align(dna_pred):
    mapped = get_mapping(dna_pred)
    if mapped is None:
        raise Exception("Unable to map prediction.")

    dna_cigar = mapped.cigar_str
    dna_true = get_reference(mapped.ctg)
    dna_true = dna_true[mapped.r_st:mapped.r_en]
    dna_pred = dna_pred[mapped.q_st:mapped.q_en]

    if mapped.strand == -1:
        dna_pred = mp.revcomp(dna_pred)

    return dna_pred, dna_true, dna_cigar
Example #10
def update_transcript(a, h):
        ''' 
        for each matched regex group in the cs string, concatenate to a string
        representing the aligned sequence from the reference.
        for matches, insertions, and mismatches, add to the string. 
        for deletions (in the reference), do nothing.
        skip over introns called by minimap2.
        '''

        my_transcript = ""
        idx = 0 

        ## retrieve the matching subsequence from the index.
        s = a.seq(h.ctg, h.r_st, h.r_en)

        ''' 
        see the documentation for the cs string regex at the minimap2 manpage, 
        online at: https://lh3.github.io/minimap2/minimap2.html 
        '''
        cs_regex = re.compile(r'(=[ACGTN]+|:[0-9]+|\*[acgtn][acgtn]|\-[acgtn]+|\+[acgtn]+|~[acgtn]{2}[0-9]+[acgtn]{2})')

        for m in cs_regex.findall(h.cs):
                if m.startswith(':'): ## get the length of the match and add to the transcript.
                        seq_match_len = int(m[1:])
                        my_transcript = my_transcript + s[idx:idx + seq_match_len]
                        idx = idx + seq_match_len
                elif m.startswith('*'): ## mismatch of a single base. add to the transcript.
                        my_transcript = my_transcript + s[idx:idx + 1]
                        idx = idx + 1
                elif m.startswith('-'):
                        ## deletion in the transcript; add the genome sequence to the transcript.
                        deletion_len = len(m[1:])
                        my_transcript = my_transcript + s[idx:idx + deletion_len]
                        idx = idx + deletion_len
                elif m.startswith('+'):
                        ## insertion in the transcript that is not in the genome. skip over.
                        insertion_len = len(m[1:])
                        continue
                elif m.startswith('~'): ## in an intron: skip over genome sequence and update idx.
                        ## because pattern is something like '~ag123ag' where 123 is intron length, with 4bp of flanking splice signal 'ag'.
                        intron_len = int(m[3:-2]) + 4
                        idx = idx + intron_len
                else: ## note that we don't deal with '=' for the long-form cs string.
                      raise Exception("ERROR: failed to match cs string group '{}'".format(m))

        ## check the strand, take the reverse complement if needed.
        if h.strand == -1: my_transcript = mp.revcomp(my_transcript)

        return my_transcript
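
A usage sketch for update_transcript(); the cs string is only attached when requested at map time, and intron-aware alignment needs a spliced preset (paths are placeholders):

import mappy as mp

a = mp.Aligner('genome.fa', preset='splice')  # spliced alignment for introns
for name, seq, qual in mp.fastx_read('reads.fq'):
    for h in a.map(seq, cs=True):             # cs=True populates h.cs
        print('>' + name)
        print(update_transcript(a, h))
        break
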
Example #11
def align_mappy(dir_in, file_out, file_fasta):
    a = mp.Aligner(file_fasta, preset='map-ont')  # Load or build index
    if not a:
        raise Exception("ERROR: failed to load/build index")

    reads = get_files(dir_in)
    files_fastq = {}
    data = []

    for read in tqdm(reads):
        with h5py.File(read, 'r', libver='latest') as fd:
            no_alignment = True
            fastq = read_fastq(fd)
            files_fastq[fastq.id] = len(fastq.seq)

            for hit in a.map(fastq.seq):  # Traverse alignments
                if hit.is_primary:  # Check if the alignment is primary
                    # Reference
                    for seq_record in SeqIO.parse(file_fasta, 'fasta'):
                        if seq_record.id == hit.ctg:
                            ref = seq_record.seq[hit.r_st:hit.r_en]
                    r_CG_num = len(re.findall(r'(CG)', str(ref)))

                    # Query
                    query = fastq.seq[hit.q_st:hit.q_en]
                    if hit.strand == -1:
                        query = mp.revcomp(query)
                    q_CG_num = len(re.findall(r'(CG)', str(query)))

                    no_alignment = False
                    data.append([
                        fastq.id, hit.r_st, hit.r_en, hit.q_st, hit.q_en,
                        r_CG_num, q_CG_num, hit.cigar_str
                    ])
                    break

        if no_alignment:
            data.append([fastq.id, '', '', '', '', 0, 0, ''])

    data = pd.DataFrame(data,
                        columns=[
                            'read_id', 'r_st', 'r_en', 'q_st', 'q_en',
                            'r_CG_num', 'q_CG_num', 'cigar_str'
                        ])
    data.sort_values('read_id', inplace=True)
    data.to_csv(file_out, index=False)

    print("Average length of fastq files:",
          sum(files_fastq.values()) / len(files_fastq.values()))
Example #12
    def process(self) -> Optional[List[ResegmentationData]]:
        read, called = self.basecall_data

        alignment = self.align(called.seq)
        if not alignment:
            return None

        relevant_motif_positions = self._get_relevant_motif_positions(
            alignment)
        if not relevant_motif_positions:
            return None

        seq_to_raw = sequence_to_raw(read, called)

        signal_intervals, deletion_idx = CustomProcessor.resolve_insertions(
            alignment, seq_to_raw)
        signal_intervals = CustomProcessor.resolve_deletions(
            signal_intervals, deletion_idx)

        resegmentation_data = []

        for motif_position in relevant_motif_positions:
            r_len = alignment.r_en - alignment.r_st
            if motif_position - self.window < 0 or motif_position + self.window >= r_len:
                continue

            position = alignment.r_st + motif_position if alignment.strand == 1 else alignment.r_en - 1 - motif_position

            event_intervals = signal_intervals[motif_position -
                                               self.window:motif_position +
                                               self.window + 1]
            event_lens = np.array([
                interval.end - interval.start for interval in event_intervals
            ])

            reference = get_reference(self.reference_file, alignment.ctg)
            region = reference[position - self.window:position + self.window +
                               1]
            bases = region if alignment.strand == 1 else mappy.revcomp(region)

            assert len(event_intervals) == len(event_lens) == len(bases)

            resegmentation_data.append(
                ResegmentationData(position, event_intervals, event_lens,
                                   bases))

        return resegmentation_data
Example #13
def analyse_CG(dir_in, file_out, file_fasta):
    a = mp.Aligner(file_fasta, preset='map-ont')  # Load or build index
    if not a:
        raise Exception("ERROR: failed to load/build index")

    reads = get_files(dir_in)
    data = []

    for read in tqdm(reads):
        with h5py.File(read, 'r', libver='latest') as fd:
            matches = {'M': 0, 'X': 0, 'D': 0, 'I': 0}
            CG_cnt = {'M': 0, 'X': 0, 'D': 0, 'I': 0}

            fastq = read_fastq(fd)
            ref = ''
            mapq = 0

            for hit in a.map(fastq.seq, cs=True):  # Traverse alignments
                if hit.is_primary:  # Check if the alignment is primary
                    # Alignment
                    matches = count_matches(hit.cs)

                    # Reference
                    for seq_record in SeqIO.parse(file_fasta, 'fasta'):
                        if seq_record.id == hit.ctg:
                            ref = seq_record.seq[hit.r_st: hit.r_en]

                    # Query
                    query = fastq.seq[hit.q_st: hit.q_en]
                    if hit.strand == -1:
                        query = mp.revcomp(query)

                    # Normalize
                    ref, query = normalize(ref, query, hit.cigar_str)

                    # Analyse CG motif
                    CG_cnt = count_CG(ref, query)

                    mapq = hit.mapq
                    break

            data.append([fastq.id, len(ref), matches['M'], matches['X'], matches['D'], matches['I'],
                         CG_cnt['M'], CG_cnt['X'], CG_cnt['D'], CG_cnt['I'], mapq])

    data = pd.DataFrame(data, columns=['read_id', 'alignment_len', 'M', 'X', 'D', 'I',
                                       'M_CG', 'X_CG', 'D_CG', 'I_CG', 'mapq'])
    data.sort_values('read_id', inplace=True)
    data.to_csv(file_out, index=False)
Example #14
    def _get_mod_likelihoods(self, read):
        """Return the modification likelihood of each base of the given read as a numpy array.
        The parameter `read` is a pysam AlignedSegment and must contain the full read sequence,
        i.e. no secondary alignments or hard clips."""
        import numpy as np
        import logging as log
        import mappy

        meth = self.db.get(read.query_name)

        if meth is None:
            log.info(f"Couldn't find modification data for {read.query_name}")
            return None

        # The mapped read is given in reference orientation, but the extraction
        # was done in read orientation, where the modified base is always C.
        if read.is_reverse:
            read_sequence = mappy.revcomp(read.query_sequence)
        else:
            read_sequence = read.query_sequence

        if len(meth) != read_sequence.count("C"):
            log.info(
                "Unexpected number of methylation observations {} instead of {}.\n Read {}"
                .format(len(meth), read_sequence.count("C"), read.tostring()))
            log.info(
                "Query length: {} len(read sequence): {} Inferred read length: {}"
                .format(read.query_length, len(read_sequence),
                        read.infer_read_length()))
            return None

        # Tabulate methylation values to an array
        meth_like = np.zeros(read.query_length, np.uint8)

        read_sequence_arr = np.frombuffer(read_sequence.encode(), dtype="|S1")
        mod_indices = np.where(read_sequence_arr == b"C")[0]
        meth_like[
            mod_indices] = meth  # 255*P(Data | base modified, base called) Likelihood of modification for each base of the called sequence (read_sequence).

        return meth_like
Example #15
def extract_fastq(input_f, ref_f, mode=0, trans_start=None, alignment=True):
    """
    Args:
        input_f: path to the input fast5 file
        ref_f: file name of the reference
        mode: 0-dna, 1-rna, -1-rna 180mV
        trans_start: start position of the transcription (required in RNA modes).
        alignment: whether to align the basecalled read against the reference.
    """
    with h5py.File(input_f, 'r') as input_fh:
        raw_entry = list(input_fh['/Raw/Reads'].values())[0]
        raw_signal = raw_entry['Signal'].value
        raw_seq = input_fh[BASECALL_ENTRY + '/BaseCalled_template/Fastq'].value
        if mode != 0:
            assert trans_start is not None
            raw_signal, raw_seq, decap_event = _decap(input_fh, trans_start,
                                                      raw_signal, raw_seq)
        else:
            decap_event = input_fh[BASECALL_ENTRY +
                                   '/BaseCalled_template/Events'].value
        align = None
        ref_seq = None
        if alignment:
            ref = mappy.Aligner(ref_f, preset="map-ont", best_n=5)
            aligns = ref.map(raw_seq.split(b'\n')[1])
            maxmapq = -np.inf
            for aln in aligns:
                if aln.mapq > maxmapq:
                    maxmapq = aln.mapq
                    align = aln
            if align is None:
                print("FAIL MAPPING " + input_f)
            else:
                if align.strand == -1:
                    ref_seq = mappy.revcomp(
                        ref.seq(align.ctg, start=align.r_st, end=align.r_en))
                else:
                    ref_seq = ref.seq(align.ctg,
                                      start=align.r_st,
                                      end=align.r_en)
        if (mode == 1) or (mode == -1):
            raw_signal = raw_signal[::-1]
    if ref_seq is None and alignment:
        print("No Reference sequence found in %s" % (input_f))
    return raw_signal, raw_seq, ref_seq, decap_event
Example #16
def get_motif_positions(reference_file: str, motif: str, index: int) -> Dict[str, Tuple[Set[int], Set[int]]]:
    chromosomes = SeqIO.to_dict(SeqIO.parse(reference_file, 'fasta'))
    motif_positions = dict()

    for chromosome, record in chromosomes.items():
        reference = str(record.seq)

        # Forward strand
        fwd_matches = re.finditer(motif, reference, re.I)
        fwd_pos = set(m.start() + index for m in fwd_matches)

        # Reverse strand
        rev_matches = re.finditer(motif, mappy.revcomp(reference), re.I)
        rev_pos = set(len(reference) - (m.start() + index) - 1 for m in rev_matches)

        motif_positions[chromosome] = fwd_pos, rev_pos

    return motif_positions
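
A usage sketch; because CG is its own reverse complement, with index 0 every reverse-strand position is the matching forward-strand position shifted by one:

# positions = get_motif_positions('ref.fa', 'CG', 0)  # placeholder path
# fwd, rev = positions['chr1']
# assert rev == {p + 1 for p in fwd}
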
Example #17
def test_write_sequence_coverage_minimap_hits_reversed(projects,
                                                       sequence_report):
    hxb2_name = 'HIV1-B-FR-K03455-seed'
    ref = projects.getReference(hxb2_name)
    seq = ref[1000:1100] + revcomp(ref[2000:2100])
    expected_minimap_hits = """\
contig,ref_name,start,end,ref_start,ref_end
1-my-contig,HIV1-B-FR-K03455-seed,1,100,1001,1100
1-my-contig,HIV1-B-FR-K03455-seed,101,200,2100,2001
"""
    report_file = StringIO()
    sequence_report.projects = projects
    sequence_report.write_genome_coverage_header(StringIO())
    sequence_report.write_minimap_hits_header(report_file)
    sequence_report.write_sequence_coverage_counts('1-my-contig', hxb2_name,
                                                   seq)

    assert report_file.getvalue() == expected_minimap_hits
Example #18
def custom_processor(basecall_data: Tuple[ReadData, CalledReadData], aligner: mappy.Aligner, reference_file: str,
                     motif_positions: Dict[str, Tuple[Set[int], Set[int]]], mapq: int, window: int) -> Optional[List[ResegmentationData]]:
    read, called = basecall_data

    alignment = align(aligner, called.seq, mapq)
    if not alignment:
        return None

    relevant_motif_positions = get_relevant_motif_positions(motif_positions, alignment)
    if not relevant_motif_positions:
        return None

    seq_to_raw = sequence_to_raw(read, called)

    signal_intervals, deletion_idx = resolve_insertions(alignment, seq_to_raw)
    signal_intervals = resolve_deletions(signal_intervals, deletion_idx)

    resegmentation_data = []

    for motif_position in relevant_motif_positions:
        r_len = alignment.r_en - alignment.r_st
        if motif_position - window < 0 or motif_position + window >= r_len:
            continue

        position = alignment.r_st + motif_position if alignment.strand == 1 else alignment.r_en - 1 - motif_position

        event_intervals = signal_intervals[motif_position - window: motif_position + window + 1]
        event_lens = np.array([interval.end - interval.start for interval in event_intervals])

        reference = get_reference(reference_file, alignment.ctg)
        region = reference[position - window: position + window + 1]
        bases = region if alignment.strand == 1 else mappy.revcomp(region)

        assert len(event_intervals) == len(event_lens) == len(bases)

        resegmentation_data.append(ResegmentationData(position, event_intervals, event_lens, bases))

    return resegmentation_data
Example #19
    def run(self):

        chunks = []
        targets = []
        target_lens = []

        while True:

            job = self.queue.get()
            if job is None: break
            chunks_, predictions = job

            # convert logprobs to probs
            predictions = np.exp(predictions.astype(np.float32))

            for chunk, pred in zip(chunks_, predictions):

                try:
                    sequence = self.model.decode(pred)
                except Exception:
                    continue

                if not sequence:
                    continue

                for mapping in self.aligner.map(sequence):
                    cov = (mapping.q_en - mapping.q_st) / len(sequence)
                    acc = mapping.mlen / mapping.blen
                    # mappy's Aligner.seq() takes 0-based coordinates.
                    refseq = self.aligner.seq(mapping.ctg, mapping.r_st,
                                              mapping.r_en)
                    if 'N' in refseq: continue
                    if mapping.strand == -1: refseq = revcomp(refseq)
                    break
                else:
                    continue

                if acc > self.min_accuracy and cov > self.min_coverage:
                    chunks.append(chunk.squeeze())
                    targets.append([
                        int(x) for x in refseq.translate({
                            65: '1',
                            67: '2',
                            71: '3',
                            84: '4'
                        })
                    ])
                    target_lens.append(len(refseq))

        if len(chunks) == 0: return

        chunks = np.array(chunks, dtype=np.float32)
        chunk_lens = np.full(chunks.shape[0], chunks.shape[1], dtype=np.int16)

        targets_ = np.zeros((chunks.shape[0], max(target_lens)),
                            dtype=np.uint8)
        for idx, target in enumerate(targets):
            targets_[idx, :len(target)] = target
        target_lens = np.array(target_lens, dtype=np.uint16)

        training = ChunkDataSet(chunks, chunk_lens, targets_, target_lens)
        training = filter_chunks(training)

        output_directory = '.' if sys.stdout.isatty() else dirname(
            realpath('/dev/fd/1'))
        np.save(os.path.join(output_directory, "chunks.npy"),
                training.chunks.squeeze(1))
        np.save(os.path.join(output_directory, "chunk_lengths.npy"),
                training.chunk_lengths)
        np.save(os.path.join(output_directory, "references.npy"),
                training.targets)
        np.save(os.path.join(output_directory, "reference_lengths.npy"),
                training.target_lengths)

        sys.stderr.write("> written ctc training data\n")
        sys.stderr.write("  - chunks.npy with shape (%s)\n" %
                         ','.join(map(str,
                                      training.chunks.squeeze(1).shape)))
        sys.stderr.write("  - chunk_lengths.npy with shape (%s)\n" %
                         ','.join(map(str, training.chunk_lengths.shape)))
        sys.stderr.write("  - references.npy with shape (%s)\n" %
                         ','.join(map(str, training.targets.shape)))
        sys.stderr.write("  - reference_lengths.npy shape (%s)\n" %
                         ','.join(map(str, training.target_lengths.shape)))
Example #20
    def run(self):

        chunks = []
        targets = []
        lengths = []

        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, ctc_data in self.iterator:

                seq = ctc_data['sequence']
                qstring = ctc_data['qstring']
                mean_qscore = ctc_data['mean_qscore']
                mapping = ctc_data.get('mapping', None)

                self.log.append((read.read_id, len(read.signal)))

                if len(seq) == 0 or mapping is None:
                    continue

                cov = (mapping.q_en - mapping.q_st) / len(seq)
                acc = mapping.mlen / mapping.blen
                refseq = self.aligner.seq(mapping.ctg, mapping.r_st,
                                          mapping.r_en)

                if acc < self.min_accuracy or cov < self.min_coverage or 'N' in refseq:
                    continue

                write_sam(read.read_id,
                          seq,
                          qstring,
                          mapping,
                          fd=self.fd,
                          unaligned=mapping is None)
                summary.append(
                    summary_row(read, len(seq), mean_qscore,
                                alignment=mapping))

                if mapping.strand == -1:
                    refseq = revcomp(refseq)

                target = [
                    int(x) for x in refseq.translate({
                        65: '1',
                        67: '2',
                        71: '3',
                        84: '4'
                    })
                ]
                targets.append(target)
                chunks.append(read.signal)
                lengths.append(len(target))

        if len(chunks) == 0:
            sys.stderr.write("> no suitable ctc data to write\n")
            return

        chunks = np.array(chunks, dtype=np.float16)
        targets_ = np.zeros((chunks.shape[0], max(lengths)), dtype=np.uint8)
        for idx, target in enumerate(targets):
            targets_[idx, :len(target)] = target
        lengths = np.array(lengths, dtype=np.uint16)
        indices = np.random.permutation(typical_indices(lengths))

        chunks = chunks[indices]
        targets_ = targets_[indices]
        lengths = lengths[indices]

        summary = pd.read_csv(summary_file(), sep='\t')
        summary.iloc[indices].to_csv(summary_file(), sep='\t', index=False)

        output_directory = '.' if sys.stdout.isatty() else dirname(
            realpath('/dev/fd/1'))
        np.save(os.path.join(output_directory, "chunks.npy"), chunks)
        np.save(os.path.join(output_directory, "references.npy"), targets_)
        np.save(os.path.join(output_directory, "reference_lengths.npy"),
                lengths)

        sys.stderr.write("> written ctc training data\n")
        sys.stderr.write("  - chunks.npy with shape (%s)\n" %
                         ','.join(map(str, chunks.shape)))
        sys.stderr.write("  - references.npy with shape (%s)\n" %
                         ','.join(map(str, targets_.shape)))
        sys.stderr.write("  - reference_lengths.npy shape (%s)\n" %
                         ','.join(map(str, lengths.shape)))
Example #21
def StrandSim(w, c):
    '''
    Perform the first part of the strand-seq simulation and re-align reads to the original haplotype.
    '''

    hfa = pyfaidx.Fasta(c.ffile)

    if w.chrom not in hfa.keys():

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Warning] Chromosome ' + w.chrom +
              ' not found in ' + c.ffile + '. Skipped simulation')

    else:

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Preparing simulation from ' + c.ffile +
              '. Haplotype ' + str(c.hapnumber))

        chr_ = hfa[w.chrom]
        seq_ = chr_[w.start - 1:w.end].seq
        tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
        region = w.chrom + '_' + str(w.start) + '_' + str(w.end)

        with open(tmpfa,
                  'w') as tmpfout:  #write temporary fa for sampling reads

            tmpfout.write('>' + region + '\n' +
                          '\n'.join(re.findall('.{1,60}', seq_)) + '\n')

        Ns = seq_.count('N')  #normalize coverage on Ns
        Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) /
                       2)  #for paired-end sequencing

        mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
        mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')

        hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now +
              '][Message] Simulated coverage for this region will be ' +
              str(hapcov))

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Simulating')

        wgsim.core(r1=mate1h,
                   r2=mate2h,
                   ref=tmpfa,
                   err_rate=c.error,
                   mut_rate=c.mutation,
                   indel_frac=c.indels,
                   indel_ext=c.extindels,
                   N=Nreads,
                   dist=c.distance,
                   stdev=c.stdev,
                   size_l=c.length,
                   size_r=c.length,
                   max_n=0.05,
                   is_hap=0,
                   is_fixed=0,
                   seed=0)

        os.remove(tmpfa)

        mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
        mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')

        with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:

            for (name1, seq1, qual1), (name2, seq2,
                                       qual2) in zip(mp.fastx_read(mate1h),
                                                     mp.fastx_read(mate2h)):

                #change name1/name2

                newname1 = '@c' + str(c.singlecellnum) + 'h' + str(
                    c.hapnumber) + 'fh_' + name1
                newname2 = '@c' + str(c.singlecellnum) + 'h' + str(
                    c.hapnumber) + 'fh_' + name2

                read1 = [newname1, seq1, '+', qual1]
                read2 = [newname2, seq2, '+', qual2]

                out1.write('\n'.join(read1) + '\n')
                out2.write('\n'.join(read2) + '\n')

        os.remove(mate1h)
        os.remove(mate2h)

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Message] Mapping simulated reads to the corresponding haplotype'
        )

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), c.ffile, mate1hnew, mate2hnew
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(mate1hnew)
        os.remove(mate2hnew)

        #now re-parse BAM file to keep only Watson/Crick reads
        #Watson reads: read1 forward, read2 reverse
        #Crick reads: read2 forward, read1 reverse

        ivf = None

        if len(c.sce_bedregion) != 0:

            sce_string = ''

            for s in c.sce_bedregion:

                if s[3] == c.cellid and s[4] == c.hapid:

                    sce_string += s.chrom + '\t' + str(s.start) + '\t' + str(
                        s.end) + '\n'

            if sce_string != '':

                sce_fromscratch = pybedtools.BedTool(sce_string.rstrip(),
                                                     from_string=True)
                ivf = sce_fromscratch.as_intervalfile(
                )  #intervals where to perform SCE events

                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print(
                    '[' + now +
                    '][Message] Detected one or more SCE events for current cell/haplotype'
                )

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Message] Extracting Watson (R1F,R2R) and Crick (R1R,R2F) reads')

        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(
            BAM, 'rb', require_index=False)  #until-eof consumes the bamfile
        pysam.set_verbosity(save)
        Wreads = list(WR(bamstrand, ivf))
        bamstrand.close()

        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(
            BAM, 'rb', require_index=False)  #re-open for second round
        pysam.set_verbosity(save)
        Creads = list(CR(bamstrand, ivf))
        bamstrand.close()

        os.remove(BAM)

        if c.noise > 0:

            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Adding noise to strands')

            # Sample both swaps before extending, so Crick reads moved to
            # Watson cannot be sampled back into Crick.
            CtoW = random.sample(Creads, round(len(Wreads) / 100 * c.noise))
            WtoC = random.sample(Wreads, round(len(Creads) / 100 * c.noise))
            Wreads += CtoW
            Creads += WtoC

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Writing Watson and Crick FASTQ')

        w1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w1.fq')
        w2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w2.fq')

        c1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c1.fq')
        c2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c2.fq')

        with open(w1, 'w') as wout1, open(w2, 'w') as wout2:

            for r1, r2 in Wreads:

                if r1.get_tag('OS') == 'W':  #this is true W

                    read1 = [
                        '@' + r1.query_name, r1.query_sequence, '+',
                        '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name,
                        mp.revcomp(r2.query_sequence), '+', '2' * c.length
                    ]

                else:  #write to Watson, but is Crick

                    read1 = [
                        '@' + r1.query_name,
                        mp.revcomp(r1.query_sequence), '+', '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name, r2.query_sequence, '+',
                        '2' * c.length
                    ]

                wout1.write('\n'.join(read1) + '\n')
                wout2.write('\n'.join(read2) + '\n')

        with open(c1, 'w') as cout1, open(c2, 'w') as cout2:

            for r1, r2 in Creads:

                if r1.get_tag('OS') == 'C':  #this is true C

                    read1 = [
                        '@' + r1.query_name,
                        mp.revcomp(r1.query_sequence), '+', '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name, r2.query_sequence, '+',
                        '2' * c.length
                    ]

                else:  #write to Crick, but is Watson

                    read1 = [
                        '@' + r1.query_name, r1.query_sequence, '+',
                        '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name,
                        mp.revcomp(r2.query_sequence), '+', '2' * c.length
                    ]

                cout1.write('\n'.join(read1) + '\n')
                cout2.write('\n'.join(read2) + '\n')

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Message] Mapping Watson and Crick reads to the original reference'
        )

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) +
                              '.W.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand', c.REF, w1,
            w2
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(w1)
        os.remove(w2)

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) +
                              '.C.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand', c.REF, c1,
            c2
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(c1)
        os.remove(c2)
Example #22
def determine_consensus(name, fasta, fastq_reads_full, fastq_reads_partial,
                        counter):
    '''Aligns and returns the consensus'''
    corrected_consensus = ''
    repeats = '0'

    fasta_read_dict = fasta
    fasta_reads = []
    for read, seq in fasta_read_dict.items():
        fasta_reads.append((read, seq))
    repeats = str(len(fasta_reads))

    out_Fq = temp_folder + '/' + counter + '_subsampled.fastq'
    out_F = temp_folder + '/' + counter + '_subsampled.fasta'
    combined_consensus_file = open(temp_folder + '/' + counter + '.fasta', 'w')
    out = open(out_Fq, 'w')

    poa_cons = temp_folder + '/consensus.' + counter + '.fasta'
    output_cons = temp_folder + '/corrected_consensus.' + counter + '.fasta'
    overlap = temp_folder + '/overlaps.' + counter + '.paf'
    overlap_fh = open(overlap, 'w')

    fastq_reads = fastq_reads_full + fastq_reads_partial
    if len(fastq_reads) > 0:
        if len(fastq_reads_full) < subsample:
            subsample_fastq_reads = fastq_reads
        else:
            indeces = np.random.choice(np.arange(0, len(fastq_reads_full)),
                                       min(len(fastq_reads_full), subsample),
                                       replace=False)
            subsample_fastq_reads = []
            for index in indeces:
                subsample_fastq_reads.append(fastq_reads_full[index])

        subread_counter = 0

        subsample_fastq_reads_numbered = []
        for read in subsample_fastq_reads:
            subread_counter += 1
            out.write('@' + read[0] + '_' + str(subread_counter) + '\n' +
                      read[1] + '\n+\n' + read[2] + '\n')
            subsample_fastq_reads_numbered.append(
                (read[0] + '_' + str(subread_counter), read[1], read[2],
                 read[3]))
        out.close()
        subsample_fastq_reads = list(subsample_fastq_reads_numbered)

        indeces = np.random.choice(np.arange(0, len(fasta_reads)),
                                   min(len(fasta_reads), 20),
                                   replace=False)
        subsample_fasta_reads = []
        for index in indeces:
            subsample_fasta_reads.append(fasta_reads[index])

        first = subsample_fasta_reads[0][1]
        sequences = []
        mm_align = mm.Aligner(seq=first, preset='map-ont')
        for read, sequence in subsample_fasta_reads:
            for hit in mm_align.map(sequence):
                if hit.is_primary:
                    if hit.strand == 1:
                        sequences.append(sequence)
                    elif hit.strand == -1:
                        sequences.append(mm.revcomp(sequence))

        res = poa_aligner.msa(sequences, out_cons=True, out_msa=False)
        if len(sequences) <= 2:
            consensus_sequence = sequences[0]
        elif not res.cons_seq:
            consensus_sequence = sequences[0]
        else:
            consensus_sequence = res.cons_seq[0]

        out_cons_file = open(poa_cons, 'w')
        out_cons_file.write('>Consensus\n' + consensus_sequence + '\n')
        out_cons_file.close()

        final = poa_cons
        mm_align = mm.Aligner(seq=consensus_sequence, preset='map-ont')
        for name, sequence, q, le in subsample_fastq_reads:
            for hit in mm_align.map(sequence):
                if hit.is_primary:
                    overlap_fh.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".
                        format(name, str(len(sequence)), hit.q_st, hit.q_en,
                               hit.strand, 'Consensus', hit.ctg_len, hit.r_st,
                               hit.r_en, hit.mlen, hit.blen, hit.mapq))

        overlap_fh.close()

        os.system('%s -q 5 -t 1 --no-trimming %s %s %s >%s 2>./racon_messages.txt' \
                   %(racon,out_Fq, overlap, poa_cons, output_cons))
        final = output_cons

        reads = read_fasta(final)
        if len(reads) == 0:
            print('racon no')
            reads = read_fasta(poa_cons)

        forMedaka = open(output_cons, 'w')
        for read in reads:
            corrected_consensus = reads[read]
            forMedaka.write('>Corrected_Consensus\n' + corrected_consensus +
                            '\n')
        forMedaka.close()

        os.system('mkdir ' + temp_folder + '/' + counter)
        os.system('%s -f -i %s -d %s -o %s > %s_medaka_messages.txt 2>&1' %
                  (medaka, out_Fq, final, temp_folder + '/' + counter,
                   temp_folder + '/' + counter))
        final = temp_folder + '/' + counter + '/consensus.fasta'
        reads = read_fasta(final)
        for read in reads:
            corrected_consensus = reads[
                read]  # if no read in file, corrected_consensus from racon output is used implicitly
        return corrected_consensus
Example #23
def write_fasta_file(args, path, adapter_dict, reads, seq_to_idx, idx_to_seq):
    undirectional = args.undirectional
    barcoded = args.barcoded
    trim = args.trim

    odT = bool(seq_to_idx)

    if barcoded:
        out10X = open(path + 'R2C2_full_length_consensus_reads_10X_sequences.fasta', 'w')
    if odT:
        outdT = open(path + 'R2C2_oligodT_multiplexing.tsv', 'w')
        for idx in idx_to_seq:
            if os.path.exists(path + idx):
                shutil.rmtree(path + idx)
    else:
        out = open(path + 'R2C2_full_length_consensus_reads.fasta', 'w')
        out3 = open(path + 'R2C2_full_length_consensus_reads_left_splint.fasta', 'w')
        out5 = open(path + 'R2C2_full_length_consensus_reads_right_splint.fasta', 'w')

    for name, sequence in (tqdm(reads.items()) if args.threads == 1 else reads.items()):
        adapter_plus = sorted(adapter_dict[name]['+'],
                              key=lambda x: x[2], reverse=False)
        adapter_minus = sorted(adapter_dict[name]['-'],
                               key=lambda x: x[2], reverse=False)
        plus_list_name, plus_positions = [], []
        minus_list_name, minus_positions = [], []

        for adapter in adapter_plus:
            if adapter[0] != '-':
                plus_list_name.append(adapter[0])
                plus_positions.append(adapter[2])
        for adapter in adapter_minus:
            if adapter[0] != '-':
                minus_list_name.append(adapter[0])
                minus_positions.append(adapter[2])

        if len(plus_list_name) != 1 or len(minus_list_name) != 1:
            continue
        if minus_positions[0] <= plus_positions[0]:
            continue

        if undirectional:
            direction = '+'
        elif plus_list_name[0] != minus_list_name[0]:
            if plus_list_name[0] == '5Prime_adapter':
                direction = '+'
            else:
                direction = '-'
        else:
            continue

        if odT:
            outdT.write('%s\t%s\t%s\n' %(
                name,
                mm.revcomp(sequence[minus_positions[0]-16:minus_positions[0]+4]),
                sequence[plus_positions[0]-4:plus_positions[0]+16])
            )
            forward_index = match_index(sequence[plus_positions[0]-4:plus_positions[0]+16], seq_to_idx)
            reverse_index = match_index(mm.revcomp(sequence[minus_positions[0]-16:minus_positions[0]+4]), seq_to_idx)

            demux = False
            if forward_index in idx_to_seq and reverse_index not in idx_to_seq:
                direction, idx_name, demux = '-', forward_index, True
            if reverse_index in idx_to_seq and forward_index not in idx_to_seq:
                direction, idx_name, demux = '+', reverse_index, True
            if not demux:
                idx_name = 'no_index_found'

            demux_path = path + idx_name + '/'
            if not os.path.isdir(demux_path):
                os.mkdir(demux_path)

            out = open(demux_path + 'R2C2_full_length_consensus_reads.fasta', 'a+')
            out3 = open(demux_path + 'R2C2_full_length_consensus_reads_left_splint.fasta', 'a+')
            out5 = open(demux_path + 'R2C2_full_length_consensus_reads_right_splint.fasta', 'a+')

        seq = sequence[plus_positions[0]:minus_positions[0]]
        ada = sequence[max(plus_positions[0]-40, 0):minus_positions[0]+40]
        name += '_' + str(len(seq))
        if direction == '+':
            if trim:
                out.write('>%s\n%s\n' %(name, seq))
            else:
                out.write('>%s\n%s\n' %(name, ada))
            out5.write('>%s\n%s\n' %(name, mm.revcomp(sequence[:plus_positions[0]])))
            out3.write('>%s\n%s\n' %(name, sequence[minus_positions[0]:]))
            if barcoded:
                out10X.write('>%s\n%splus\n' %(name, mm.revcomp(sequence[minus_positions[0]-40:minus_positions[0]])))
        elif direction == '-':
            if trim:
                out.write('>%s\n%s\n' %(name, mm.revcomp(seq)))
            else:
                out.write('>%s\n%s\n' %(name, mm.revcomp(ada)))
            out3.write('>%s\n%s\n' %(name, mm.revcomp(sequence[:plus_positions[0]+40])))
            out5.write('>%s\n%s\n' %(name, sequence[minus_positions[0]:]))
            if barcoded:
                out10X.write('>%s\n%sminus\n' %(name, sequence[plus_positions[0]:plus_positions[0]+40]))

        if odT:
            out.close()
            out3.close()
            out5.close()

    if not odT:
        out.close()
        out3.close()
        out5.close()
    if barcoded:
        out10X.close()
    if odT:
        outdT.close()
Example #24
def main(args):
    if not args.out_path.endswith('/'):
        args.out_path += '/'
    if not os.path.exists(args.out_path):
        os.mkdir(args.out_path)
    log_file = open(args.out_path + 'c3poa.log', 'w+')

    if args.config:
        progs = configReader(args.out_path, args.config)
        racon = progs['racon']
        blat = progs['blat']
    else:
        racon = 'racon'
        blat = 'blat'

    tmp_dir = args.out_path + 'tmp/'
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)

    # read in the file and preprocess
    read_list, total_reads = [], 0
    short_reads = 0
    tmp_fasta = tmp_dir + 'R2C2_temp_for_BLAT.fasta'
    align_psl = tmp_dir + 'splint_to_read_alignments.psl'

    tmp_adapter_dict = {}
    for read in mm.fastx_read(args.reads, read_comment=False):
        if len(read[1]) < args.lencutoff:
            short_reads += 1
            continue
        tmp_adapter_dict[read[0]] = [[None, 1,
                                      None]]  # [adapter, matches, strand]
        total_reads += 1
    adapter_dict, adapter_set, no_splint = preprocess(blat, args, tmp_dir,
                                                      tmp_adapter_dict,
                                                      total_reads)

    for adapter in adapter_set:
        if not os.path.exists(args.out_path + adapter):
            os.mkdir(args.out_path + adapter)

    all_reads = total_reads + short_reads
    print('C3POa version:', VERSION, file=log_file)
    print('No splint reads:',
          no_splint,
          '({:.2f}%)'.format((no_splint / all_reads) * 100),
          file=log_file)
    print('Under len cutoff:',
          short_reads,
          '({:.2f}%)'.format((short_reads / all_reads) * 100),
          file=log_file)
    print('Total thrown away reads:',
          short_reads + no_splint,
          '({:.2f}%)'.format(((short_reads + no_splint) / all_reads) * 100),
          file=log_file)
    print('Total reads:', all_reads, file=log_file)
    log_file.close()

    splint_dict = {}
    for splint in mm.fastx_read(args.splint_file, read_comment=False):
        splint_dict[splint[0]] = [splint[1]]
        splint_dict[splint[0]].append(mm.revcomp(splint[1]))

    pool = mp.Pool(args.numThreads, maxtasksperchild=1)
    pbar = tqdm(total=total_reads // args.groupSize + 1,
                desc='Calling consensi')
    iteration, current_num, tmp_reads, target = 1, 0, [], args.groupSize
    for read in mm.fastx_read(args.reads, read_comment=False):
        if len(read[1]) < args.lencutoff:
            continue
        tmp_reads.append(read)
        current_num += 1
        if current_num == target:
            pool.apply_async(analyze_reads,
                             args=(args, tmp_reads, splint_dict, adapter_dict,
                                   adapter_set, iteration, racon),
                             callback=lambda _: pbar.update(1))
            iteration += 1
            target = args.groupSize * iteration
            if target >= total_reads:
                target = total_reads
            tmp_reads = []
            gc.collect()
    pool.close()
    pool.join()
    pbar.close()

    for adapter in adapter_set:
        cat_files(args.out_path + adapter, '/tmp*/R2C2_Consensus.fasta',
                  args.out_path + adapter + '/R2C2_Consensus.fasta',
                  'Catting consensus reads')
        cat_files(args.out_path + adapter, '/tmp*/subreads.fastq',
                  args.out_path + adapter + '/R2C2_Subreads.fastq',
                  'Catting subreads')
        remove_files(args.out_path + adapter, '/tmp*')
Example #25
#print(orientation)
if args.debug:
    for k, v in natsorted(flags.items()):
        print(k, v)
Complete = basename + '_complete_facs.fasta'
Unassigned = basename + '_unassigned_facs.fasta'
complete_count = 0
the_rest = 0
with open(Complete, 'w') as comp:
    with open(Unassigned, 'w') as bad:
        for seq in mappy.fastx_read(os.path.abspath(args.assembly),
                                    read_comment=True):
            if seq[0] in completeFACs:
                if seq[0] in orientation:
                    if orientation[seq[0]] == -1:
                        FinalSeq = mappy.revcomp(seq[1])
                    else:
                        FinalSeq = seq[1]
                else:
                    FinalSeq = seq[1]
                comments = seq[3].split(' ')
                facname = completeFACs[seq[0]][0]
                complete_count += len(completeFACs[seq[0]])
                comp.write('>{:};organism={:};{:};\n{:}\n'.format(
                    '|'.join(completeFACs[seq[0]]),
                    inputDict[facname]['organism'], ';'.join(comments),
                    FinalSeq))
            else:
                the_rest += 1
                bad.write('>{:} {:}\n{:}\n'.format(seq[0], seq[3], seq[1]))
print('[{:}] Found {:,} full-length FACs corresponding to {:} unique sequences'.