Esempio n. 1
0
def realign_filter(rec, inslib):
    """Filter a record by realigning its consensus sequences to the insertion library.

    Looks up the record's reference sequence (``Superfamily:Subfamily``) in
    ``inslib`` and locally aligns each of the four consensus sequences to it.
    Returns False if the reference is unknown or if ANY consensus aligns with
    > 0.9 identity over > 25 aligned positions (i.e. the record is filtered
    out); True only if every consensus fails to realign convincingly.
    """
    # Substitution matrix over all byte values: +1 on the diagonal (match),
    # -1 elsewhere (mismatch).
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    seqn = rec['Superfamily'] + ':' + rec['Subfamily']
    if seqn not in inslib:
        return False

    seq_headers = ['Genomic_Consensus_5p', 'Genomic_Consensus_3p', 'Insert_Consensus_5p', 'Insert_Consensus_3p']

    for seqtype in seq_headers:
        s1 = align.string_to_alignment(rec[seqtype])
        s2 = align.string_to_alignment(inslib[seqn])

        # Local alignment with gap open/extend penalties of -2.
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        a2 = ''.join([b for b in list(align.alignment_to_string(a2)) if b != '-'])

        # Identity-like score; algebraically equal to s / len(a1).
        score = 0.0
        if len(a1) > 0:
            score = float(len(a1) - (len(a1)-s)) / float(len(a1))

        if score > 0.9 and len(a1) > 25:
            return False

    # BUGFIX: the original returned True inside the loop, so only the first
    # consensus sequence was ever examined. Return after checking all four.
    return True
Esempio n. 2
0
def align_test(s, m):
    """Locally align ``m`` against ``s``.

    Returns ``(score, index)`` where ``index`` is the position in ``s`` at
    which the gap-stripped aligned fragment of ``s`` first occurs (-1 if it
    does not occur verbatim).
    """
    # +1 match / -1 mismatch substitution matrix over all 256 byte values.
    subs = (2 * np.identity(256) - np.ones((256, 256))).astype(np.short)
    score, aligned_s, _ = align.align(list(align.string_to_alignment(s)),
                                      list(align.string_to_alignment(m)),
                                      -1, -1, subs, True, True)
    fragment = align.alignment_to_string(aligned_s).replace('-', '')
    return score, s.find(fragment)
Esempio n. 3
0
def realign_filter(rec, inslib):
    """Filter a record by realigning its consensus sequences to the insertion library.

    Looks up the record's reference sequence (``Superfamily:Subfamily``) in
    ``inslib`` and locally aligns each of the four consensus sequences to it.
    Returns False if the reference is unknown or if ANY consensus aligns with
    > 0.9 identity over > 25 aligned positions; True only when every
    consensus fails to realign convincingly.
    """
    # Substitution matrix over all byte values: +1 match, -1 mismatch.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    seqn = rec['Superfamily'] + ':' + rec['Subfamily']
    if seqn not in inslib:
        return False

    seq_headers = [
        'Genomic_Consensus_5p', 'Genomic_Consensus_3p', 'Insert_Consensus_5p',
        'Insert_Consensus_3p'
    ]

    for seqtype in seq_headers:
        s1 = align.string_to_alignment(rec[seqtype])
        s2 = align.string_to_alignment(inslib[seqn])

        # Local alignment with gap open/extend penalties of -2.
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        a2 = ''.join(
            [b for b in list(align.alignment_to_string(a2)) if b != '-'])

        # Identity-like score; algebraically equal to s / len(a1).
        score = 0.0
        if len(a1) > 0:
            score = float(len(a1) - (len(a1) - s)) / float(len(a1))

        if score > 0.9 and len(a1) > 25:
            return False

    # BUGFIX: the original returned True inside the loop, so only the first
    # consensus sequence was ever examined. Return after checking all four.
    return True
Esempio n. 4
0
    def consensus(self, minscore = 0.9):
        ''' build consensus from sorted aligned reads iteratively

        Returns (consensus_string, mean_score); score is 0.0 when no read
        was merged onto the scaffold.
        '''

        # Substitution matrix over all byte values: +1 on the diagonal
        # (match), -1 elsewhere (mismatch).
        S = -np.ones((256, 256)) + 2 * np.identity(256)
        S = S.astype(np.int16)

        minqual = self.reads[0].minqual

        # Quality-trim reads in sorted order and discard very short ones.
        sortable_reads = [SortableRead(sr.read) for sr in self.reads]
        seqs = [qualtrim(sorted_read.read, minqual=minqual) for sorted_read in sorted(sortable_reads)]
        seqs = [s for s in seqs if len(s) > 20]

        if len(seqs) == 0:
            return '', 0.0

        if len(seqs) == 1: # no consensus necessary
            return seqs[0], 1.0

        # Collapse runs of identical consecutive sequences.
        uniq_seqs = [seqs[0]]
        for i, seq in enumerate(seqs[1:], start=1):
            if seq != seqs[i-1]:
                uniq_seqs.append(seq)

        if len(uniq_seqs) == 1: # all seqs were the same!
            return uniq_seqs[0], 1.0

        cons = uniq_seqs[0]
        scores = []

        # Cap the workload: subsample to 1000 sequences, preserving order.
        if len(uniq_seqs) > 1000:
            uniq_seqs = [uniq_seqs[u] for u in sorted(np.random.choice(range(len(uniq_seqs)), size=1000))]

        for seq in uniq_seqs[1:]:

            s1 = align.string_to_alignment(cons)
            s2 = align.string_to_alignment(seq)

            # Local alignment with gap open/extend penalties of -2.
            (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
            a1 = align.alignment_to_string(a1)
            # Strip gap characters from the second aligned sequence.
            a2 = ''.join([b for b in list(align.alignment_to_string(a2)) if b != '-'])

            # Identity-like score; algebraically equal to s / len(a1).
            score = 0.0
            if len(a1) > 0:
                score = float(len(a1) - (len(a1)-s)) / float(len(a1))

            #print seqtype, score, len(a1)

            # NOTE(review): a1 is used as a regex pattern; assumes the
            # sequence alphabet has no regex metacharacters — confirm.
            if re.search(a1, cons):
                cons_start, cons_end = locate_subseq(cons, a1)

                # Extend only on a high-identity overlap that reaches
                # (within 5 bases of) the current consensus 3' end.
                if score >= minscore and cons_end > len(cons)-5:
                    scores.append(score)
                    align_end = locate_subseq(seq, a2)[1]
                    cons += seq[align_end:]
                    #print self.start, self.end, cons

        if scores:
            return cons, np.mean(scores)

        else:
            return cons, 0.0
Esempio n. 5
0
def align_shortlist(seq, shortlist):
    """Align ``seq`` against every candidate in ``shortlist``.

    Uses the module-level substitution matrix ``S``. Returns a list of
    ``(aligned_seq, aligned_candidate)`` string pairs, one per candidate,
    in the same order as ``shortlist``.
    """
    encoded = list(align.string_to_alignment(seq))
    pairs = []
    for candidate in shortlist:
        _, aligned_a, aligned_b = align.align(
            encoded, list(align.string_to_alignment(candidate)),
            -1, -1, S, True, True)
        pairs.append((align.alignment_to_string(aligned_a),
                      align.alignment_to_string(aligned_b)))
    return pairs
Esempio n. 6
0
def local_check(*zipped):
    """Return True iff every (x, y) pair re-aligns within Hamming distance 1.

    For each pair, gap characters are stripped from ``x``, the result is
    locally aligned against ``y`` (module-level matrix ``S``), and the pair
    passes when the re-aligned x differs from ``y`` in at most one position.
    """
    for x, y in list(*zipped):
        gapless_x = list(align.string_to_alignment(x.replace("-", "")))
        encoded_y = list(align.string_to_alignment(y))
        _, realigned, _ = align.align(gapless_x, encoded_y, -1, -1, S, True, True)
        realigned = align.alignment_to_string(realigned)
        if hamming_dist(y, realigned) > 1:
            return False
    return True
Esempio n. 7
0
    def align(self, seq1, seq2, local = False):
        """Align two strings with this object's penalties and substitution matrix.

        Returns ``(res1, res2, score)``. For local alignments both result
        strings are padded with '-' so they line up against the full input
        sequences.
        """
        enc1 = align.string_to_alignment(seq1)
        enc2 = align.string_to_alignment(seq2)
        score, raw1, raw2 = align.align(enc1, enc2, self.gap_open,
                                        self.gap_extend, self.subs, local)
        res1 = align.alignment_to_string(raw1)
        res2 = align.alignment_to_string(raw2)

        if local:
            core1 = res1.replace("-", "")
            core2 = res2.replace("-", "")
            off1, off2 = seq1.index(core1), seq2.index(core2)
            lead = max(off1, off2)
            tail = max(len(seq1) - len(core1) - off1,
                       len(seq2) - len(core2) - off2)
            res1 = "-" * lead + res1 + "-" * tail
            res2 = "-" * lead + res2 + "-" * tail
        return res1, res2, score
Esempio n. 8
0
def _build_stream_and_connection_data(msgs, limit):
    """Group (truncated) message payloads by stream and by connection.

    Each payload is limited to its first ``limit`` bytes and encoded via
    ``align.string_to_alignment``. Groups holding fewer than two messages
    carry no comparative signal and are dropped. Returns the pair
    ``(stream_data, conn_data)`` of defaultdicts.
    """
    stream_data = defaultdict(list)
    conn_data = defaultdict(list)

    # Bucket every message's encoded payload under its stream and connection.
    for msg in msgs:
        encoded = align.string_to_alignment(msg.data[:limit])
        stream_data[msg.stream].append(encoded)
        conn_data[msg.conn].append(encoded)

    # Prune singleton groups (materialize keys first, then delete).
    for key in [k for k, v in stream_data.items() if len(v) <= 1]:
        del stream_data[key]
    for key in [k for k, v in conn_data.items() if len(v) <= 1]:
        del conn_data[key]

    return (stream_data, conn_data)
Esempio n. 9
0
def _classify_cluster_fields(msgs, limit, sizes, max_num_flag_values,
        noise_ratio, num_iters, labels):
    """Estimate per-cluster field types for messages grouped by FD labels.

    For each cluster (with at least two members) and each field size in
    ``sizes``, computes constant/flag/uniform/number/incremental/length
    estimates. Returns a dict: label -> field-kind -> size -> estimate.
    """
    data = [align.string_to_alignment(msg.data) for msg in msgs]

    # Create clusters from FD labels
    clusters = defaultdict(list)
    for (i, label) in enumerate(labels):
        clusters[label].append(i)

    est = defaultdict(lambda: defaultdict(dict))
    for label in clusters:
        cluster_data = [data[i] for i in clusters[label]]
        # A singleton cluster provides no comparative statistics; skip it.
        if len(cluster_data) <= 1:
            continue
        aligned_data = _build_aligned_data(cluster_data, limit)
        samples = _build_samples(aligned_data)
        for size in sizes:
            est[label]['constants'][size]       = constant(samples, size)
            est[label]['flags'][size]           = flag(samples, size, max_num_flag_values)
            est[label]['uniforms'][size]        = uniform(samples, size)
            est[label]['numbers'][size]         = number(aligned_data, size)
            est[label]['incrementals'][size]    = incremental(aligned_data, size)
            est[label]['lengths'][size]         = length(cluster_data, size,
                    noise_ratio, num_iters)

    return dict(est)
Esempio n. 10
0
    def align(self, seq1, seq2, local=False):
        """Run a pairwise alignment of ``seq1`` against ``seq2``.

        Uses this instance's ``gap_open`` / ``gap_extend`` penalties and
        substitution matrix ``subs``. Returns ``(res1, res2, score)``; when
        ``local`` is true the aligned substrings are flanked with '-' so
        they map back onto the original sequence coordinates.
        """
        score, a1, a2 = align.align(align.string_to_alignment(seq1),
                                    align.string_to_alignment(seq2),
                                    self.gap_open, self.gap_extend,
                                    self.subs, local)
        res1 = align.alignment_to_string(a1)
        res2 = align.alignment_to_string(a2)

        if local:
            strip1 = res1.replace("-", "")
            strip2 = res2.replace("-", "")
            start1 = seq1.index(strip1)
            start2 = seq2.index(strip2)
            pad_left = max(start1, start2)
            pad_right = max(len(seq1) - len(strip1) - start1,
                            len(seq2) - len(strip2) - start2)
            res1 = "".join(["-" * pad_left, res1, "-" * pad_right])
            res2 = "".join(["-" * pad_left, res2, "-" * pad_right])
        return res1, res2, score
Esempio n. 11
0
def consensus(seqs, minscore=0.95):
    ''' build consensus from sorted aligned reads iteratively, expects seqs to be sorted in ref genome order

    Returns (consensus_string, mean_score). An empty input yields ('', 0.0).
    '''

    # Substitution matrix over all byte values: +1 match, -1 mismatch.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    # BUGFIX: guard empty input; seqs[0] below raised IndexError (the
    # sibling implementation in this file already has this check).
    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1: # no consensus necessary
        return seqs[0], 1.0

    # Collapse runs of identical consecutive sequences.
    uniq_seqs = [seqs[0]]
    for i, seq in enumerate(seqs[1:], start=1):
        if seq != seqs[i-1]:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1: # all seqs were the same!
        return uniq_seqs[0], 1.0

    cons = uniq_seqs[0]
    scores = []

    # Cap the workload for very deep pileups.
    if len(uniq_seqs) > 1000: uniq_seqs = np.random.choice(uniq_seqs, size=1000)

    for seq in uniq_seqs[1:]:

        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)

        # Local alignment with gap open/extend penalties of -2.
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        a2 = ''.join([b for b in list(align.alignment_to_string(a2)) if b != '-'])

        # BUGFIX: an empty local alignment caused ZeroDivisionError here;
        # score it 0.0 instead (matches the sibling implementations).
        score = 0.0
        if len(a1) > 0:
            score = float(len(a1) - (len(a1)-s)) / float(len(a1))
        scores.append(score)

        # NOTE(review): a1 is used as a regex pattern; assumes the sequence
        # alphabet has no regex metacharacters — confirm. Skip empty a1 so
        # the empty pattern can't spuriously match.
        if len(a1) > 0 and re.search(a1, cons):
            cons_start, cons_end = locate_subseq(cons, a1)

            # Extend only on a high-identity overlap near the 3' end.
            if score >= minscore and cons_end > len(cons)-5:
                align_end = locate_subseq(seq, a2)[1]
                cons += seq[align_end:]

    return cons, np.mean(scores)
Esempio n. 12
0
def _classify_global_fields(msgs, limit, sizes, max_num_flag_values,
        noise_ratio, num_iters):
    """Estimate message field types at global, per-connection and per-stream scope.

    Returns {'global': ..., 'connection': ..., 'stream': ...} where each
    value maps field-kind -> size -> estimate.
    """
    data = [align.string_to_alignment(msg.data) for msg in msgs]
    # Only the first `limit` bytes of each message are considered for most
    # estimators; `length` sees the full payloads.
    limited_data = [d[:limit] for d in data]
    (stream_data, conn_data) = _build_stream_and_connection_data(msgs, limit)

    global_samples  = _build_samples(limited_data)
    conn_samples    = {}
    stream_samples  = {}

    # Per-connection / per-stream samples skip the zero/flag/uniform
    # statistics, which are only estimated globally.
    for key in conn_data:
        conn_samples[key] = _build_samples(conn_data[key], zero=False,
                flag=False, uniform=False)
    for key in stream_data:
        stream_samples[key] = _build_samples(stream_data[key], zero=False,
                flag=False, uniform=False)

    global_est  = defaultdict(dict)
    conn_est    = defaultdict(dict)
    stream_est  = defaultdict(dict)

    for size in sizes:
        global_est['constants'][size]       = constant(global_samples, size)
        global_est['flags'][size]           = flag(global_samples, size, max_num_flag_values)
        global_est['uniforms'][size]        = uniform(global_samples, size)
        global_est['numbers'][size]         = number(limited_data, size)
        global_est['lengths'][size]         = length(data, size, noise_ratio, num_iters)
        conn_est['constants'][size]         = _field_consensus(conn_samples, limit, size, constant)
        stream_est['constants'][size]       = _field_consensus(stream_samples, limit, size, constant)
        stream_est['incrementals'][size]    = _field_consensus(stream_data, limit, size, incremental)

    est = {
        'global':       global_est,
        'connection':   conn_est,
        'stream':       stream_est,
    }

    return dict(est)
Esempio n. 13
0
def nwalign_wrapper(seq1, seq2, matrix=NUCMATRIX):
    """Align ``seq1`` and ``seq2`` (gap penalty -1, substitution ``matrix``)
    and return the alignment score normalized by the alignment length."""
    enc1 = align.string_to_alignment(seq1)
    enc2 = align.string_to_alignment(seq2)
    score, aligned1, _ = align.align(enc1, enc2, -1, -1, matrix)
    return float(score) / len(aligned1)
Esempio n. 14
0
def consensus(seqs, minscore=0.95):
    ''' build consensus from sorted aligned reads iteratively, expects seqs to be sorted in ref genome order

    Returns (consensus_string, mean_score). If the initial seed sequence
    never anchors an extension, the scaffold is re-seeded from the next
    unique sequence.
    '''

    # Substitution matrix over all byte values: +1 match, -1 mismatch.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1:  # no consensus necessary
        return seqs[0], 1.0

    # Collapse runs of identical consecutive sequences.
    uniq_seqs = [seqs[0]]
    for i, seq in enumerate(seqs[1:], start=1):
        if seq != seqs[i - 1]:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1:  # all seqs were the same!
        return uniq_seqs[0], 1.0

    # start_index tracks which unique sequence currently seeds the scaffold.
    start_index = 0
    cons = uniq_seqs[start_index]
    scores = []

    # Becomes True once the scaffold has been extended at least once.
    align_init = False

    for i, seq in enumerate(uniq_seqs[1:]):

        #print 'oldcons:', cons
        #print 'seq    :', seq

        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)

        # Local alignment with gap open/extend penalties of -2.
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Strip gap characters from the second aligned sequence.
        a2 = ''.join(
            [b for b in list(align.alignment_to_string(a2)) if b != '-'])

        # Identity-like score; algebraically equal to s / len(a1).
        score = 0.0

        if len(a1) > 0:
            score = float(len(a1) - (len(a1) - s)) / float(len(a1))

        #print 'score  :', score

        scores.append(score)

        # NOTE(review): a1 is used as a regex pattern; assumes the sequence
        # alphabet has no regex metacharacters — confirm.
        if re.search(a1, cons):
            cons_start, cons_end = locate_subseq(cons, a1)

            # Extend only on a high-identity overlap near the 3' end.
            if score >= minscore and cons_end > len(cons) - 5:
                align_end = locate_subseq(seq, a2)[1]
                cons += seq[align_end:]
                align_init = True
                #print 'newcons:', cons

            elif not align_init:  # haven't found a scaffold yet
                start_index += 1
                cons = uniq_seqs[start_index]

        #print '****'

    return cons, np.mean(scores)
Esempio n. 15
0
    def consensus(self, minscore=0.9):
        ''' build consensus from sorted aligned reads iteratively

        Returns (consensus_string, mean_score); score is 0.0 when no read
        was merged onto the scaffold.
        '''

        # Substitution matrix over all byte values: +1 match, -1 mismatch.
        S = -np.ones((256, 256)) + 2 * np.identity(256)
        S = S.astype(np.int16)

        minqual = self.reads[0].minqual

        # Quality-trim reads in sorted order and discard very short ones.
        sortable_reads = [SortableRead(sr.read) for sr in self.reads]
        seqs = [
            qualtrim(sorted_read.read, minqual=minqual)
            for sorted_read in sorted(sortable_reads)
        ]
        seqs = [s for s in seqs if len(s) > 20]

        if len(seqs) == 0:
            return '', 0.0

        if len(seqs) == 1:  # no consensus necessary
            return seqs[0], 1.0

        # Collapse runs of identical consecutive sequences.
        uniq_seqs = [seqs[0]]
        for i, seq in enumerate(seqs[1:], start=1):
            if seq != seqs[i - 1]:
                uniq_seqs.append(seq)

        if len(uniq_seqs) == 1:  # all seqs were the same!
            return uniq_seqs[0], 1.0

        cons = uniq_seqs[0]
        scores = []

        # Cap the workload: subsample to 1000 sequences, preserving order.
        if len(uniq_seqs) > 1000:
            uniq_seqs = [
                uniq_seqs[u] for u in sorted(
                    np.random.choice(range(len(uniq_seqs)), size=1000))
            ]

        for seq in uniq_seqs[1:]:

            s1 = align.string_to_alignment(cons)
            s2 = align.string_to_alignment(seq)

            # Local alignment with gap open/extend penalties of -2.
            (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
            a1 = align.alignment_to_string(a1)
            # Strip gap characters from the second aligned sequence.
            a2 = ''.join(
                [b for b in list(align.alignment_to_string(a2)) if b != '-'])

            # Identity-like score; algebraically equal to s / len(a1).
            score = 0.0
            if len(a1) > 0:
                score = float(len(a1) - (len(a1) - s)) / float(len(a1))

            # NOTE(review): a1 is used as a regex pattern; assumes the
            # sequence alphabet has no regex metacharacters — confirm.
            if re.search(a1, cons):
                cons_start, cons_end = locate_subseq(cons, a1)

                # Extend only on a high-identity overlap near the 3' end.
                if score >= minscore and cons_end > len(cons) - 5:
                    scores.append(score)
                    align_end = locate_subseq(seq, a2)[1]
                    cons += seq[align_end:]
                    #print self.start, self.end, cons

        if scores:
            return cons, np.mean(scores)

        else:
            return cons, 0.0
Esempio n. 16
0
def consensus(seqs, minscore=0.95):
    ''' build consensus from sorted aligned reads iteratively, expects seqs to be sorted in ref genome order

    Returns (consensus_string, mean_score). If the initial seed sequence
    never anchors an extension, the scaffold is re-seeded from the next
    unique sequence.
    '''

    # Substitution matrix over all byte values: +1 match, -1 mismatch.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1: # no consensus necessary
        return seqs[0], 1.0

    # Collapse runs of identical consecutive sequences.
    uniq_seqs = [seqs[0]]
    for i, seq in enumerate(seqs[1:], start=1):
        if seq != seqs[i-1]:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1: # all seqs were the same!
        return uniq_seqs[0], 1.0

    # start_index tracks which unique sequence currently seeds the scaffold.
    start_index = 0
    cons = uniq_seqs[start_index]
    scores = []

    # Becomes True once the scaffold has been extended at least once.
    align_init = False

    for i, seq in enumerate(uniq_seqs[1:]):

        #print 'oldcons:', cons
        #print 'seq    :', seq

        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)

        # Local alignment with gap open/extend penalties of -2.
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Strip gap characters from the second aligned sequence.
        a2 = ''.join([b for b in list(align.alignment_to_string(a2)) if b != '-'])


        # Identity-like score; algebraically equal to s / len(a1).
        score = 0.0

        if len(a1) > 0:
            score = float(len(a1) - (len(a1)-s)) / float(len(a1))

        #print 'score  :', score

        scores.append(score)

        # NOTE(review): a1 is used as a regex pattern; assumes the sequence
        # alphabet has no regex metacharacters — confirm.
        if re.search(a1, cons):
            cons_start, cons_end = locate_subseq(cons, a1)

            # Extend only on a high-identity overlap near the 3' end.
            if score >= minscore and cons_end > len(cons)-5:
                align_end = locate_subseq(seq, a2)[1]
                cons += seq[align_end:]
                align_init = True
                #print 'newcons:', cons

            elif not align_init: # haven't found a scaffold yet
                start_index += 1
                cons = uniq_seqs[start_index]

        #print '****'

    return cons, np.mean(scores)