Beispiel #1
0
def structure_score(dom, pep, models, model_seqs, mat):
    '''
    Return the best normalised contact score of a domain/peptide pair over
    all structural models.

    For every model name in ``model_seqs`` the domain ``dom`` is globally
    aligned against the model's joined "D" sequence and the peptide ``pep``
    against its joined "P" sequence (gap_open=-100, gap_extend=-1,
    substitution matrix ``mat``).  ``contact_map_helper`` converts each
    alignment into a collection that is used here only for membership tests.

    ``models[model]`` maps contact keys to values; a contact contributes its
    value when ``key[0]`` is among the domain matches and ``key[1]`` is among
    the peptide matches.  The model's score is
    ``(sum of contributing values / model_max[model]) / number of contacts``,
    or 0 when no contact contributes.

    Returns the maximum score over all models, or 0 when ``model_seqs`` is
    empty (previously this case raised ValueError from ``max()``).

    Raises KeyError if a model name is missing from the hard-coded
    ``model_max`` table or from ``models``.
    '''
    # Per-model normalisation constants, keyed by model name.
    model_max = {"1SSH": 49.900, "1ZUK": 61.800, "1N5Z": 48.100,
                 "2KYM": 47.100, "2RQW": 56.800, "2VKN": 51.100}
    model_scr = []
    for model in model_seqs:
        dom_align = nw.global_align(dom, "".join(model_seqs[model]["D"]),
                                    gap_open=-100, gap_extend=-1,
                                    matrix=mat)
        dom_matches = contact_map_helper(dom_align)

        pep_align = nw.global_align(pep, "".join(model_seqs[model]["P"]),
                                    gap_open=-100, gap_extend=-1,
                                    matrix=mat)
        pep_matches = contact_map_helper(pep_align)

        contacts = models[model]
        score = 0
        count = 0
        for contact in contacts:
            if contact[0] in dom_matches and contact[1] in pep_matches:
                score += contacts[contact]
                count += 1

        if count == 0:
            model_scr.append(0)
        else:
            model_scr.append((score / float(model_max[model])) / count)

    # No models means nothing matched: report the same 0 used for a model
    # without contributing contacts instead of crashing in max().
    if not model_scr:
        return 0
    return max(model_scr)
def worker(seq1, seq2):
    """Compare seq1 with seq2 on both strands and return the better result.

    seq1 is globally aligned (nuc44 matrix) against seq2 and against
    ``rev_compliment(seq2)``.  Each alignment is passed to ``sim`` and the
    larger of the two values is divided by the length of the shorter input
    sequence.
    """
    forward = nwalign.global_align(seq1, seq2, matrix='deps/nuc44')
    reverse = nwalign.global_align(seq1, rev_compliment(seq2),
                                   gap_open=-1, gap_extend=-1,
                                   matrix='deps/nuc44')

    forward_sim = sim(forward[0], forward[1])
    reverse_sim = sim(reverse[0], reverse[1])

    best = max(forward_sim, reverse_sim)
    shortest = min(len(seq1), len(seq2))
    return float(best) / float(shortest)
Beispiel #3
0
def score_phonetic_alignment(srcw,
                             tgtw,
                             slang,
                             tlang,
                             sim_matrix_path,
                             gap_start_p=-1.0,
                             gap_extend_p=-1.0):
    """Globally align two words and return the score of that alignment.

    Each word is first turned into a plain string: through ``make_ascii``
    when its language is listed in ``langinfo.SCRIPT_RANGES``, otherwise by
    applying ``str`` to every character.  The same matrix file and gap
    penalties are used for aligning and for scoring.
    """

    def _as_align_input(word, lang):
        # The align library needs ASCII input.
        if lang in langinfo.SCRIPT_RANGES:
            pieces = make_ascii(word, lang)
        else:
            pieces = [str(c) for c in word]
        return ''.join(pieces)

    nsrcw = _as_align_input(srcw, slang)
    ntgtw = _as_align_input(tgtw, tlang)

    scoring = dict(matrix=sim_matrix_path,
                   gap_open=gap_start_p,
                   gap_extend=gap_extend_p)

    # Global alignment, then score it with the same parameters.
    src_aln, tgt_aln = nw.global_align(nsrcw, ntgtw, **scoring)
    return nw.score_alignment(src_aln, tgt_aln, **scoring)
Beispiel #4
0
 def sound_seq_distance_str(self,seq1_str, seq2_str):
     """Return a length-normalised alignment distance between two sequences.

     Both inputs are converted to arrays and serialised to raw bytes, the
     byte strings are globally aligned with the cost matrix in
     '/tmp/som.costs' (gap_open=0, gap_extend=-5), and the negated
     alignment score is divided by the combined byte length.

     ``ndarray.tobytes()`` replaces the deprecated alias ``tostring()``;
     each input is serialised once instead of twice.

     Raises ZeroDivisionError if both inputs serialise to zero bytes.
     """
     seq1_bytes = np.asanyarray(seq1_str).tobytes()
     seq2_bytes = np.asanyarray(seq2_str).tobytes()

     scoring = dict(gap_open=0, gap_extend=-5, matrix='/tmp/som.costs')
     align = nw.global_align(seq1_bytes, seq2_bytes, **scoring)
     score = nw.score_alignment(*align, **scoring)
     return (-score) / (len(seq1_bytes) + len(seq2_bytes) + 0.0)
def score_phonetic_alignment(srcw,tgtw,slang,tlang,sim_matrix_path,gap_start_p=-1.0,gap_extend_p=-1.0):
    """Score the global alignment of a source word and a target word.

    Words whose language appears in ``langinfo.SCRIPT_RANGES`` are converted
    with ``make_ascii``; other words are joined character by character via
    ``str``.  The alignment and its score both use ``sim_matrix_path`` and
    the given gap penalties.
    """
    # Convert the source word to the ASCII form required by the align library.
    if slang in langinfo.SCRIPT_RANGES:
        src_chars = make_ascii(srcw, slang)
    else:
        src_chars = [str(c) for c in srcw]
    nsrcw = ''.join(src_chars)

    # Same conversion for the target word.
    if tlang in langinfo.SCRIPT_RANGES:
        tgt_chars = make_ascii(tgtw, tlang)
    else:
        tgt_chars = [str(c) for c in tgtw]
    ntgtw = ''.join(tgt_chars)

    # Global alignment followed by scoring with identical parameters.
    aligned = nw.global_align(nsrcw, ntgtw,
                              matrix=sim_matrix_path,
                              gap_open=gap_start_p,
                              gap_extend=gap_extend_p)
    src_aln, tgt_aln = aligned
    return nw.score_alignment(src_aln, tgt_aln,
                              matrix=sim_matrix_path,
                              gap_open=gap_start_p,
                              gap_extend=gap_extend_p)
Beispiel #6
0
 def _align(self):
     """Return the global alignment of the query and target sequences.

     The substitution matrix comes from ``self._get_matrix_file`` (built
     from the instance's match, mismatch and matrix settings); gap
     penalties come from ``self._gap_open`` and ``self._gap_extend``.
     """
     scoring_matrix = self._get_matrix_file(
         match=self._match,
         mismatch=self._mismatch,
         matrix=self._matrix)
     return nw.global_align(
         self.query.sequence,
         self.target.sequence,
         gap_open=self._gap_open,
         gap_extend=self._gap_extend,
         matrix=scoring_matrix)
Beispiel #7
0
 def _align(self):
     """Globally align ``self.query.sequence`` with ``self.target.sequence``.

     Uses the matrix file returned by ``self._get_matrix_file`` and the
     instance's gap-open / gap-extend penalties; returns whatever
     ``nw.global_align`` returns.
     """
     matrix_file = self._get_matrix_file(match=self._match,
                                         mismatch=self._mismatch,
                                         matrix=self._matrix)
     query_seq = self.query.sequence
     target_seq = self.target.sequence
     return nw.global_align(query_seq, target_seq,
                            gap_open=self._gap_open,
                            gap_extend=self._gap_extend,
                            matrix=matrix_file)
Beispiel #8
0
def alignMotifs(junctionSeqStart, junctionSeqEnd):
    """Make the 'side1' and 'side2' entries of two motifs equal in length.

    For each side whose sequences differ in length, the two sequences are
    globally aligned (gap_open=-10) and written back into both inputs with
    alignment gaps ('-') replaced by '_'.  Sides that already have equal
    length are left untouched.  The inputs are modified in place and also
    returned.
    """
    for side in ('side1', 'side2'):
        start_seq = junctionSeqStart.loc[side]
        end_seq = junctionSeqEnd.loc[side]
        if len(start_seq) == len(end_seq):
            continue
        aligned_start, aligned_end = nw.global_align(start_seq, end_seq,
                                                     gap_open=-10)
        junctionSeqStart.loc[side] = aligned_start.replace('-', '_')
        junctionSeqEnd.loc[side] = aligned_end.replace('-', '_')
    return junctionSeqStart, junctionSeqEnd
Beispiel #9
0
def calc_distance(x, y):
    """Return the fraction of mismatches among ungapped aligned columns.

    When the module-level ``aln_fn`` is the empty string, x and y are first
    globally aligned (gap_open=-5, gap_extend=-1, match=1); otherwise they
    are compared as given.  Columns where either sequence has '-' are
    ignored.  Raises ZeroDivisionError when no column is left to compare.
    """
    if aln_fn == '':
        x, y = nw.global_align(x, y, gap_open=-5, gap_extend=-1, match=1)
    compared = 0
    differing = 0
    for cx, cy in zip(x, y):
        if cx == '-' or cy == '-':
            continue
        compared += 1
        if cx != cy:
            differing += 1
    return 1. * differing / compared
Beispiel #10
0
def alignMotifs(junctionSeqStart, junctionSeqEnd):
    """Align the sequences of two motifs, side by side.

    Only sides ('side1', 'side2') whose two sequences differ in length are
    aligned (global alignment, gap_open=-10).  The aligned strings, with
    '-' turned into '_', overwrite the corresponding entries of both
    arguments, which are returned as a pair.
    """
    sides = ['side1', 'side2']
    unequal = lambda side: (len(junctionSeqStart.loc[side]) !=
                            len(junctionSeqEnd.loc[side]))
    for side in sides:
        if unequal(side):
            pair = nw.global_align(junctionSeqStart.loc[side],
                                   junctionSeqEnd.loc[side],
                                   gap_open=-10)
            first, second = pair
            junctionSeqStart.loc[side] = first.replace('-', '_')
            junctionSeqEnd.loc[side] = second.replace('-', '_')
    return junctionSeqStart, junctionSeqEnd
Beispiel #11
0
def calc_distance(x,y):
    """Proportion of differing positions among columns without a gap.

    If the module-level ``aln_fn`` equals '', the two inputs are globally
    aligned first (gap_open=-5, gap_extend=-1, match=1).  Positions where
    x or y holds '-' do not count.  A ZeroDivisionError is raised when every
    position is excluded.
    """
    if aln_fn == '':
        x, y = nw.global_align(x, y, gap_open=-5, gap_extend=-1, match=1)
    # Keep only the columns in which neither sequence has a gap.
    columns = [(cx, cy) for cx, cy in zip(x, y) if cx != '-' and cy != '-']
    dxy = sum(1 for cx, cy in columns if cx != cy)
    lxy = len(columns)
    return 1.*dxy/lxy
Beispiel #12
0
def getPathsFromSequence(seqInit, seqEnd):
    """Align two sequences and derive step paths between two vectors.

    NOTE(review): the visible body cannot run to completion.  ``vecInit``
    and ``vecEnd`` are assigned inside this function (``+=``), which makes
    them local names, yet they are read before any assignment, so the first
    ``len(vecInit)`` raises UnboundLocalError.  The alignment result
    ``a, b`` is never used -- presumably the vectors were meant to be
    derived from it; confirm against the original project.  ``numSteps`` is
    not defined here and must come from an enclosing scope.  No value is
    returned in the visible body; ``paths_parsed`` is computed and dropped.
    """
    # Global alignment of the two input sequences (result currently unused).
    a, b = nw.global_align(seqInit, seqEnd, gap_open=-10)
    # add bases to vecInit
    # Pad the shorter vector with '_' until both have the same length.
    while len(vecInit) < len(vecEnd):
        vecInit += '_'

    while len(vecInit) > len(vecEnd):
        vecEnd += '_'

    # Build the steps, turn them into a graph and enumerate every path the
    # depth-first search yields from vecInit to vecEnd.
    steps = getSteps(vecInit, vecEnd, numSteps)
    graph = getGraphFromSteps(steps)
    paths = np.array(list(dfs_paths(graph, vecInit, vecEnd)))
    paths_parsed = formatPaths(paths, numSteps / 2)
Beispiel #13
0
def getPathsFromSequence(seqInit, seqEnd):
    """Align two sequences and derive step paths between two vectors.

    NOTE(review): as written this raises UnboundLocalError at the first
    ``len(vecInit)``: ``vecInit`` and ``vecEnd`` are local (they are targets
    of ``+=`` below) but are never assigned before being read.  The aligned
    pair ``a, b`` is unused, so the vectors were presumably meant to come
    from it -- verify against the original project.  ``numSteps`` must be
    supplied by an enclosing scope, and the visible body returns nothing
    even though ``paths_parsed`` is computed.
    """
    # Global alignment of the two input sequences (result currently unused).
    a, b = nw.global_align(seqInit, seqEnd, gap_open=-10)
    # add bases to vecInit
    # Pad whichever vector is shorter with '_' until the lengths match.
    while len(vecInit) < len(vecEnd):
        vecInit += '_'
    
    while len(vecInit) > len(vecEnd):
        vecEnd += '_'
        
    # Steps -> graph -> all depth-first paths from vecInit to vecEnd.
    steps = getSteps(vecInit, vecEnd, numSteps)
    graph = getGraphFromSteps(steps)
    paths = np.array(list(dfs_paths(graph, vecInit, vecEnd)))
    paths_parsed = formatPaths(paths, numSteps/2)    
Beispiel #14
0
def testNeedleman(N):
    """Compare ``myNeedleman`` with nwalign on N random pairs of strings.

    Each round draws a length in [1, SIZEMAX] for the first string and a
    length between half and twice that for the second, fills both with
    random upper-case letters, and aligns them with nwalign and with
    ``myNeedleman`` (matrix 'atiam-fpa_alpha.dist', gap penalties -3/-3).
    Prints the percentage of rounds with an identical score and the
    percentage with an identical (alignment, score) triple.  A RuntimeError
    from ``myNeedleman`` is reported by printing the two inputs.
    """
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    scoring = {
        'matrix': 'atiam-fpa_alpha.dist',
        'gap_open': -3,
        'gap_extend': -3,
    }
    Validated = 0
    SamePath = 0
    for epoch in tqdm(range(N)):
        sizeA = random.randint(1, SIZEMAX)
        sizeB = random.randint(sizeA // 2, 2 * sizeA)
        # Draw the letters of A first, then those of B.
        A = "".join(
            [alpha[random.randint(0, len(alpha) - 1)] for _ in range(sizeA)])
        B = "".join(
            [alpha[random.randint(0, len(alpha) - 1)] for _ in range(sizeB)])

        # Reference result from nwalign.
        aligned = nw.global_align(A, B, **scoring)
        score = nw.score_alignment(aligned[0], aligned[1], **scoring)
        res = (aligned[0], aligned[1], score)

        try:
            (a, b, s) = myNeedleman(A, B, **scoring)
        except RuntimeError:
            print(A, B)
        else:
            if s == score:
                Validated += 1
            if res == (a, b, s):
                SamePath += 1

    print(str(100 * Validated / N) + "% are validated.")
    print(str(100 * SamePath / N) + "% have the exact same path.")
Beispiel #15
0
def sim(a, b):
    """Return the fraction of aligned columns in which a and b are equal.

    The inputs are expected to be aligned already.  If their lengths differ
    they are realigned with ``nw.global_align`` and a notice is printed.
    Columns where both sequences have a gap ('-') are ignored; a column with
    a gap on one side only counts as a mismatch.

    Returns a float in [0, 1].  When there is no column to compare (empty
    input or nothing but shared gaps) 0.0 is returned instead of raising
    ZeroDivisionError.
    """
    # This data _has_ been aligned, but sometimes they're not the same length?
    # If that's the case, send it through NW
    if len(a) != len(b):
        (a, b) = nw.global_align(a, b)
        # Single-argument call form prints the same text on Python 2 and 3.
        print("Length mismatch, realigned using nw")
    matches = 0
    columns = 0
    for x, y in zip(a, b):
        if x == '-' and y == '-':
            continue
        if x == y:
            matches += 1
        columns += 1
    if columns == 0:
        return 0.0
    return float(matches) / float(columns)
Beispiel #16
0
def merge_overlaps(seq1, qual1, seq2, qual2):
    """Merge two sequences that overlap at the ends.

    This assumes both sequences are given in forward orientation. The second
    sequence is first reverse-complemented (and its quality string
    reversed), then the two sequences are aligned to find the coordinates of
    overlap. For each aligned column:

    * if one side is a gap ('-') or an 'N', the other side's character is
      used;
    * if both sides agree, that base is used;
    * otherwise the base with the higher quality character wins (ties go to
      seq1), comparing the ASCII values of the quality characters:

    >>> ord('B')
    66

    The alignment is done using the nwalign package which implements the
    Needleman-Wunsch algorithm in C via Cython.

    Returns the merged sequence as a string.

    Two defects of the earlier version are fixed here: characters were
    compared with ``is`` (identity) instead of ``==``, and the quality
    indices advanced twice on gap/'N' columns, so quality lookups drifted
    away from the bases they belong to.
    """
    # reverse-complement the second sequence
    seq1 = seq1.upper()     # to make sure
    seq2 = quick_revcom(seq2)
    qual2 = qual2[::-1]
    # run the alignment
    alignment = nw.global_align(seq1, seq2, gap_open=-50, gap_extend=-2)
    seqln1 = alignment[0]
    seqln2 = alignment[1]
    # compose the merged sequence
    merge_list = []
    # Positions in the *unaligned* reads; they index the quality strings.
    qpos1 = 0
    qpos2 = 0
    for base1, base2 in zip(seqln1, seqln2):
        if base1 == '-' or base1 == 'N':
            merge_list.append(base2)
        elif base2 == '-' or base2 == 'N':
            merge_list.append(base1)
        elif base1 == base2:
            merge_list.append(base1)
        else:  # determine the consensus of the overlap using quality scores
            if ord(qual1[qpos1]) >= ord(qual2[qpos2]):
                merge_list.append(base1)
            else:
                merge_list.append(base2)
        # A gap consumes no base of that read, so its quality index stays.
        if base1 != '-':
            qpos1 += 1
        if base2 != '-':
            qpos2 += 1
    merged_seq = ''.join(merge_list)
    return merged_seq
Beispiel #17
0
def align_transliterations(src_wordlist, tgt_wordlist, lang):
    """Yield an aligned (source, target) pair for each pair of input words.

    The two word lists are walked in lockstep.  Every word is converted with
    ``make_ascii`` (the align library needs ASCII), the two converted words
    are globally aligned, and both aligned strings are mapped back with
    ``restore_from_ascii`` before being yielded as a tuple.
    """
    for srcw, tgtw in itertools.izip(src_wordlist, tgt_wordlist):
        ascii_src = ''.join(make_ascii(srcw, lang))
        ascii_tgt = ''.join(make_ascii(tgtw, lang))

        aligned_src, aligned_tgt = nw.global_align(ascii_src, ascii_tgt)

        # Make both sides readable again, source first.
        readable_src = restore_from_ascii(aligned_src, lang)
        readable_tgt = restore_from_ascii(aligned_tgt, lang)
        yield (readable_src, readable_tgt)
Beispiel #18
0
def gs_align(left, right):
    """Globally align two label strings and pad their per-item lists.

    ``left`` and ``right`` are 4-tuples ``(string, duration, start, end)``.
    The strings are aligned with ``nw.global_align``.  Wherever an aligned
    string has a gap ('-'), ``None`` is inserted at that index into the
    matching duration, start and end lists; these lists are modified in
    place.

    Returns ``(left_aligned, right_aligned, align_ix)`` where ``align_ix``
    lists the indices at which both aligned strings hold the same character.
    """
    lstring, lduration, lstart, lend = left
    rstring, rduration, rstart, rend = right

    lalign, ralign = nw.global_align(lstring, rstring)

    padding_jobs = (
        (lalign, (lduration, lstart, lend)),
        (ralign, (rduration, rstart, rend)),
    )
    for aligned, tracks in padding_jobs:
        for pos, ch in enumerate(aligned):
            if ch != '-':
                continue
            for track in tracks:
                track.insert(pos, None)

    align_ix = [pos for pos, (lch, rch) in enumerate(zip(lalign, ralign))
                if lch == rch]
    return lalign, ralign, align_ix
def align_transliterations(src_wordlist,tgt_wordlist,lang):
    """Generate aligned word pairs from two parallel word lists.

    For every (source, target) pair: convert both words to ASCII with
    ``make_ascii``, align them globally with ``nw.global_align``, convert
    the aligned strings back with ``restore_from_ascii`` and yield them as
    ``(source_alignment, target_alignment)``.
    """
    def _encode(word):
        # ASCII form required by the align library.
        return ''.join(make_ascii(word, lang))

    for word_pair in itertools.izip(src_wordlist, tgt_wordlist):
        srcw, tgtw = word_pair
        nsrcw = _encode(srcw)
        ntgtw = _encode(tgtw)

        alignment = nw.global_align(nsrcw, ntgtw)
        src_aln, tgt_aln = alignment

        # Back to the readable form; source is restored before target.
        src_aln = restore_from_ascii(src_aln, lang)
        tgt_aln = restore_from_ascii(tgt_aln, lang)
        yield (src_aln, tgt_aln)
Beispiel #20
0
def gs_align(left, right):
    """Align two strings and keep their duration/start/end lists in step.

    Each argument unpacks to ``(string, duration, start, end)``.  After the
    global alignment of the two strings, every gap position ('-') in an
    aligned string gets a ``None`` inserted at the same index in that side's
    three lists (in-place modification of the caller's lists).

    Returns the two aligned strings and the list of indices where they
    agree.
    """
    lstring, lduration, lstart, lend = left
    rstring, rduration, rstart, rend = right

    lalign, ralign = nw.global_align(lstring, rstring)

    def _pad_gaps(aligned, *tracks):
        gap_positions = (pos for pos, ch in enumerate(aligned) if ch == '-')
        for pos in gap_positions:
            for track in tracks:
                track.insert(pos, None)

    _pad_gaps(lalign, lduration, lstart, lend)
    _pad_gaps(ralign, rduration, rstart, rend)

    align_ix = []
    for pos in range(len(lalign)):
        if lalign[pos] == ralign[pos]:
            align_ix.append(pos)
    return lalign, ralign, align_ix
Beispiel #21
0
def cl2string(recorded, target):
    """Align two (time, label) sequences via a one-letter-per-label encoding.

    Every distinct label found in ``recorded`` or ``target`` is assigned a
    character from ``string.ascii_letters``; an IndexError results if there
    are more distinct labels than letters.  The two encoded strings are
    globally aligned, and ``None`` is inserted into each time list at the
    positions where that side's aligned string has a gap ('-').

    Returns ``(recorded_alignment, target_alignment, recorded_times,
    target_times)``.
    """
    all_labels = list(set([l for _, l in recorded] + [l for _, l in target]))
    l2char = {}
    for idx, label in enumerate(all_labels):
        l2char[label] = string.ascii_letters[idx]

    rec_s = "".join([l2char[label] for _, label in recorded])
    tgt_s = "".join([l2char[label] for _, label in target])

    recalign, tgtalign = nw.global_align(rec_s, tgt_s)
    rectimes = [t for t, _ in recorded]
    tgttimes = [t for t, _ in target]

    # Walk the alignment once; a gap on one side pads that side's times.
    for pos, chars in enumerate(zip(recalign, tgtalign)):
        recchar, tgtchar = chars
        if tgtchar == '-':
            tgttimes.insert(pos, None)
        if recchar == '-':
            rectimes.insert(pos, None)
    return recalign, tgtalign, rectimes, tgttimes
Beispiel #22
0
def cl2string(recorded, target):
    """Encode labels as letters, align the encodings, and pad the times.

    ``recorded`` and ``target`` are sequences of ``(time, label)`` pairs.
    Distinct labels are mapped to successive characters of
    ``string.ascii_letters`` (IndexError when the letters run out).  After
    the global alignment of the two encoded strings, a ``None`` is inserted
    into a side's time list wherever that side's alignment shows '-'.

    Returns the two aligned strings followed by the two padded time lists.
    """
    all_labels = list(set([l for _, l in recorded] + [l for _, l in target]))
    l2char = dict(
        (all_labels[i], string.ascii_letters[i])
        for i in range(len(all_labels)))

    def _encode(pairs):
        return "".join(l2char[label] for _, label in pairs)

    rec_s = _encode(recorded)
    tgt_s = _encode(target)

    recalign, tgtalign = nw.global_align(rec_s, tgt_s)
    rectimes = [pair[0] for pair in recorded]
    tgttimes = [pair[0] for pair in target]

    for i, (recchar, tgtchar) in enumerate(zip(recalign, tgtalign)):
        target_gap = tgtchar == '-'
        recorded_gap = recchar == '-'
        if target_gap:
            tgttimes.insert(i, None)
        if recorded_gap:
            rectimes.insert(i, None)
    return recalign, tgtalign, rectimes, tgttimes
def align():
    """Write the differing alignment columns between hg19 and YRI to a file.

    Reads 'hg19.fa' and 'YRIref.fasta' with ``Fasta``.  For every key of the
    hg19 file (in reverse sorted order) the first 10000 positions of both
    references are globally aligned, and each alignment column where the two
    characters differ is written to 'hg19_YRI_diff.bed' as
    ``chrom,index,index+1,hg19_char,YRI_char``.  Progress messages go to
    standard output.

    Changes from the earlier version: the output file is managed by a
    ``with`` block so it is closed even when a lookup or the alignment
    fails; columns are walked pairwise instead of indexing up to the longer
    length (which could raise IndexError); the duplicated debug preview is
    printed once; print calls use the single-argument form that behaves the
    same on Python 2 and 3.
    """
    hg19 = Fasta('hg19.fa')
    print(hg19.keys())

    hg19Chr = sorted(hg19.keys(), reverse=True)

    YRI = Fasta('YRIref.fasta')
    print(YRI.keys())
    YRIChr = sorted(YRI.keys())

    # Debug preview of the first entry of each (differently sorted) key list.
    print(hg19[hg19Chr[0]][:20])
    print(YRI[YRIChr[0]][:20])

    with open('hg19_YRI_diff.bed', 'w') as fhout:
        header = 'chrom, chromStart, chromEnd, hg19, YRI \n'
        fhout.write(header)
        for each in hg19Chr:
            seq1 = hg19[each][:10000]
            seq2 = YRI[each][:10000]
            print('reached 1')
            print('doing alignment for  %s' % (each,))
            alignment = nw.global_align(seq1, seq2, gap=-2, matrix=None,
                                        match=1, mismatch=-1)
            print('reached 2')

            columns = zip(alignment[0], alignment[1])  # (hg19, YRI)
            for i, (hg19_char, yri_char) in enumerate(columns):
                if hg19_char != yri_char:
                    fields = [each, str(i), str(i + 1), hg19_char, yri_char]
                    fhout.write(','.join(fields) + '\n')
Beispiel #24
0
    def pairwise_from_seq(self, in_seq):
        """Map this object's pairwise values onto positions of ``in_seq``.

        ``self.pairwise`` is computed on demand.  ``self.seq`` and ``in_seq``
        are globally aligned; for every non-gap character of the aligned
        ``in_seq`` its running index is paired with the current running index
        in ``self.seq`` (pairs are only recorded once the ``self.seq`` index
        has left its initial -1).  For every ordered pair of recorded
        positions that refer to different ``self.seq`` indices, the value
        ``self.pairwise[(s1, s2)]`` is stored under the ``in_seq`` index
        pair.

        Returns the resulting dict keyed by ``(in_seq index, in_seq index)``.
        """
        if self.pairwise is None:
            self.calculate_pairwise()
        own_aligned, in_aligned = global_align(self.seq, in_seq)

        own_pos = -1
        in_pos = -1
        index_pairs = []
        for own_char, in_char in zip(own_aligned, in_aligned):
            if own_char != '-':
                own_pos += 1
            if in_char == '-':
                continue
            in_pos += 1
            if own_pos != -1 and in_pos != -1:
                index_pairs.append((in_pos, own_pos))

        pdict = {}
        for first, second in product(index_pairs, repeat=2):
            if first[1] == second[1]:
                continue
            pdict[(first[0], second[0])] = self.pairwise[(first[1], second[1])]
        return pdict
Beispiel #25
0
def alignSequences(targetsite_sequence, window_sequence, max_errors=7):
    """Find the best fuzzy match of a target site inside a sequence window.

    The window is upper-cased and searched on both strands (as given and
    reverse-complemented) with the two patterns returned by
    ``regexFromSequence`` ("standard" and "gapped"), using
    ``regex.BESTMATCH``.  Candidates are ranked by
    ``substitutions + 3 * (insertions + deletions)``; the first candidate
    with the lowest score below 100 wins.

    Returns a 7-element list
    ``[match_sequence, distance, length, strand, start, end, target]``
    where ``distance`` is the sum of the fuzzy counts.  If the chosen match
    contains insertions or deletions, the match and the target are globally
    realigned (NUC_SIMPLE matrix next to this file) and the realigned
    strings are returned in the first and last positions.  When nothing
    matches, ``[''] * 6 + ['none']`` is returned.

    Cleanups relative to the earlier version: ``is not None`` instead of
    ``!= None``, the reverse complement is computed once, the matrix
    directory is only looked up when a realignment is needed, and unused
    locals are gone.
    """
    window_sequence = window_sequence.upper()
    query_regex_standard, query_regex_gap = regexFromSequence(
        targetsite_sequence, errors=max_errors)

    # Try both strands
    reverse_window = reverseComplement(window_sequence)
    alignments = [
        ('+', 'standard',
         regex.search(query_regex_standard, window_sequence,
                      regex.BESTMATCH)),
        ('-', 'standard',
         regex.search(query_regex_standard, reverse_window,
                      regex.BESTMATCH)),
        ('+', 'gapped',
         regex.search(query_regex_gap, window_sequence, regex.BESTMATCH)),
        ('-', 'gapped',
         regex.search(query_regex_gap, reverse_window, regex.BESTMATCH)),
    ]

    lowest_distance_score = 100
    chosen_alignment = None
    chosen_alignment_strand = None
    for strand, alignment_type, match in alignments:
        if match is None:
            continue
        substitutions, insertions, deletions = match.fuzzy_counts
        # Indels are weighted three times as heavily as substitutions.
        distance_score = substitutions + (insertions + deletions) * 3
        # Strict comparison: on ties the earlier candidate is kept.
        if distance_score < lowest_distance_score:
            chosen_alignment = match
            chosen_alignment_strand = strand
            lowest_distance_score = distance_score

    if chosen_alignment is None:
        return [''] * 6 + ['none']

    match_sequence = chosen_alignment.group()
    _, match_insertions, match_deletions = chosen_alignment.fuzzy_counts
    distance = sum(chosen_alignment.fuzzy_counts)
    length = len(match_sequence)
    start = chosen_alignment.start()
    end = chosen_alignment.end()

    if match_insertions or match_deletions:
        path = os.path.dirname(os.path.abspath(__file__))
        realigned_match_sequence, realigned_target = nw.global_align(
            match_sequence,
            targetsite_sequence,
            gap_open=-10,
            gap_extend=-100,
            matrix='{0}/NUC_SIMPLE'.format(path))
        return [
            realigned_match_sequence, distance, length,
            chosen_alignment_strand, start, end, realigned_target
        ]

    return [
        match_sequence, distance, length, chosen_alignment_strand,
        start, end, targetsite_sequence
    ]
 def align_sequences(self, seq1, seq2):
     """Return the global alignment of seq1 and seq2 using 'BLOSUM62.txt'."""
     return nw.global_align(seq1, seq2, matrix='BLOSUM62.txt')
Beispiel #27
0
            kw[k] = int(kwargs[2 * i + 1])
    return a, b, kw


# Close the listening socket when the interpreter exits.  This must be
# registered before the accept loop: the loop below only ends through
# sys.exit() or an uncaught exception, so a registration placed after it
# would never execute.  Closing an already-closed socket is harmless.
atexit.register(server.close)

# Serve one client at a time: read a request, align, send the result back.
while True:
    client, address = server.accept()
    data = True
    while data:
        try:
            data = client.recv(CHUNK).strip()
            if data == "EXIT":
                # Shut the whole service down on request.
                client.close()
                server.close()
                print("EXITING service")
                sys.exit(0)

            a, b, kwargs = get_args(data)
            r = global_align(a, b, **kwargs)
            client.send(" ".join(r))
        except Exception as e:
            # Report the failure to the client; SystemExit is not an
            # Exception subclass, so the EXIT path is not intercepted here.
            try:
                client.send("ERROR:" + str(e))
            except socket.error:
                # they already closed...
                client.close()
                break

    client.close()
Beispiel #28
0


# Register the cleanup first.  The accept loop below never finishes
# normally (it ends via sys.exit() or an uncaught exception), so a
# registration placed after it is unreachable.  Calling close() on a socket
# that is already closed is harmless.
atexit.register(server.close)

# Handle clients sequentially; each message is one alignment request.
while True:
    client, address = server.accept()
    data = True
    while data:
        try:
            data = client.recv(CHUNK).strip()
            if data == "EXIT":
                # Explicit shutdown request from the client.
                client.close()
                server.close()
                print("EXITING service")
                sys.exit(0)

            a, b, kwargs = get_args(data)
            r = global_align(a, b, **kwargs)
            client.send(" ".join(r))
        except Exception as e:
            # SystemExit does not derive from Exception, so sys.exit(0)
            # above passes straight through this handler.
            try:
                client.send("ERROR:" + str(e))
            except socket.error:
                # they already closed...
                client.close()
                break

    client.close()
    def compare_datapoints(p1, p2):
        """Return a distance between two gesture datapoints (lower is closer).

        The "gist" of a datapoint is everything except its first two and
        last two fields.  Each gist field of ``p1`` is globally aligned with
        the same field of ``p2`` (matrix file 'alignment.matrix' next to
        this module) and the alignment scores (gap_open=0, gap_extend=-5)
        are summed; the last three fields count double.  The same is done
        for a mirrored copy of ``p1``'s gist so that left- and right-handed
        gestures can be compared (not very accurate).

        The larger of the two sums is reduced by the absolute differences of
        the last two fields (read as ints; crossed over for the mirrored
        case) and returned negated.

        Index arithmetic uses ``//`` so the function also works on Python 3,
        where ``/`` yields floats that neither ``range`` nor list indexing
        accept.  The matrix path is computed once instead of four times per
        field.

        NOTE(review): the guard compares ``gist_1[i]`` with the integer 0
        but ``gist_2[i]`` with the empty string; this is kept as found --
        confirm whether ``!= ''`` was intended for both.
        """
        gist_1 = p1[2:len(p1) - 2]
        gist_2 = p2[2:len(p2) - 2]
        gist_1_flipped = list(gist_1)

        # Mirror the gesture: swap each entry of the first half with its
        # partner in the second half, flipping quadrants on the way.
        # Need to take into account not to flip spine (so we drop the last
        # 3 parts).
        half = len(gist_1_flipped) // 2
        for i in range(0, (len(gist_1_flipped) - 3) // 2):
            partner = half + i - 3 + 1
            tmp = GestureComparison._flip_quadrants(gist_1_flipped[i])
            gist_1_flipped[i] = GestureComparison._flip_quadrants(
                gist_1_flipped[partner])
            gist_1_flipped[partner] = tmp

        matrix_path = (os.path.dirname(os.path.realpath(__file__)) +
                       '/alignment.matrix')

        def _alignment_score(seq_a, seq_b):
            # Align two fields and score the resulting alignment.
            res = nwalign.global_align(seq_a, seq_b, matrix=matrix_path)
            return nwalign.score_alignment(res[0], res[1],
                                           gap_open=0,
                                           gap_extend=-5,
                                           matrix=matrix_path)

        score = 0.0
        score_flipped = 0.0
        # The last three gist fields are weighted twice.
        double_weight_from = len(gist_1) - 3

        for i in range(0, len(gist_1)):
            if gist_1[i] != 0 and gist_2[i] != '':
                weight = 2 if i >= double_weight_from else 1
                score += _alignment_score(gist_1[i], gist_2[i]) * weight
                score_flipped += _alignment_score(gist_1_flipped[i],
                                                  gist_2[i]) * weight

        if score_flipped > score:
            # Mirrored comparison: the two trailing fields are crossed over.
            score_flipped -= abs(int(p1[-2]) -
                                 int(p2[-1])) + abs(int(p1[-1]) - int(p2[-2]))
            return -score_flipped
        else:
            score -= abs(int(p1[-2]) -
                         int(p2[-2])) + abs(int(p1[-1]) - int(p2[-1]))
            return -score
Beispiel #30
0
        singreps=line.split("#")
        rep_count.append(len(singreps)-1)
        poslist=[]
        for singrep in singreps:
            if singrep != "\n":
                repinfo=singrep.split(":")
                poslist.append(int(repinfo[1]))
        rep_pos.append(poslist)                
        locus_num.append(i)
        locus.append(curloc)
        
fullscore=0
array=()
score=0

for k in range(len(locus)):
    print "Processing repeat", k, "of", locus[k]
    fullscore=nw.score_alignment(rep[k], rep[k], gap_open=-5,\
        gap_extend=-2, matrix='/home/CT/server/pybin/BLOSUM62')
    print >> fileout, ">", k, locus[k], rep[k], rep_count[k], rep_pos[k]
    for j in range(len(rep)):
        array=nw.global_align(rep[k], rep[j], gap_open=-5,\
            gap_extend=-2, matrix='/home/CT/server/pybin/BLOSUM62')
        score=nw.score_alignment(array[0], array[1], gap_open=-5,\
            gap_extend=-2, matrix='/home/CT/server/pybin/BLOSUM62')
        if score>0 and score/float(fullscore)>=threshold and j!=k:
            print >> fileout, j, locus[j], rep[j], rep_count[j], rep_pos[j]

filein.close()
fileout.close()
Beispiel #31
0
def run_align_nw(data):
    """Globally align the first two items of every entry in ``data``.

    The alignment results are discarded; nothing is returned.
    """
    for entry in data:
        first, second = entry[0], entry[1]
        nw.global_align(first, second)
Beispiel #32
0
    0:MAX_SEQUENCES]  # only do the first 50 for speed..

# Square result matrices, one row/column per entry of unique_sequences.
similarity_matrix = np.zeros((len(unique_sequences), len(unique_sequences)))
dist_matrix = np.zeros((len(unique_sequences), len(unique_sequences)))

scoring = sw.ScoringMatrix('scoring_matrix.txt')
# NOTE(review): this rebinds the name `sw` from the module to an aligner
# instance; kept as found.
sw = sw.LocalAlignment(scoring)

match = 2
n = 0

# Fill both matrices for every ordered pair of sequences.
for x, seq1 in enumerate(unique_sequences):
    for y, seq2 in enumerate(unique_sequences):
        alignment = nw.global_align(allsequences[seq1], allsequences[seq2])

        raw_score = nw.score_alignment(
            alignment[0], alignment[1],
            gap_open=-5, gap_extend=-2,
            matrix='scoring_matrix.txt')
        score = float(raw_score)

        # Reference magnitude: alignment length times the match value.
        n = float(len(alignment[0]) * match)
        # Scores whose magnitude exceeds that reference are reset to 0.
        if abs(score) > n:
            score = 0

        similarity_matrix[x, y] = int(score)
        dist_matrix[x, y] = float(score / n)
Beispiel #33
0
    '''Lee un fasta y devuelve la secuencia que contiene'''
    F = open(fastafile)
    #Definimos la variable "secuencia"
    secuencia = ''
    #Iniciamos el bucle de lectura
    for linea in F:
        #Asociamos a la variable "linea" cada línea del archivo, pero sin el salto de línea
        linea = linea.strip('\n')
        #Comprobamos que la línea no está vacía, ni empieza por '>' (1ª linea de info)
        if linea != '' and linea[0] != '>' :
            #Añadimos a la variable "secuencia" cada iteración de linea
            secuencia = secuencia + linea
    return secuencia

# Read the two protein sequences and align them globally with BLOSUM62.
D = fasta2sec('Q9V429.fasta')
H = fasta2sec('P10599.fasta')
a, b = nw.global_align(D, H, matrix='BLOSUM62.txt')

# Number of non-gap characters in each aligned sequence.
acounter = sum(1 for aa in a if aa != '-')
bcounter = sum(1 for aa in b if aa != '-')

# Mean percentage of non-gap positions over both aligned sequences.  The
# whole expression is inside the print call: written as `print (...) / 2`
# it only works as a Python 2 print statement, while on Python 3 it prints
# the undivided sum and then fails dividing None by 2.
print(((acounter*100./len(a))+(bcounter*100./len(b))) / 2)

            matrix='atiam-fpa_alpha.dist',
            gap_open=-5,
            gap_extend=-2)

# Reference code for testing
import nwalign as nw
# Run the hand-written aligner and nwalign on the same fixed word pair.
print("myNeedleman")
print(myNeedleman("CEELECANTH", "PELICAN",
                  matrix='atiam-fpa_alpha.dist',
                  gap_open=-1, gap_extend=-1))

print("Nwalign")
# No gap penalties are passed to the alignment call, so the library's
# defaults apply; the scoring call below uses -1/-1 explicitly.
aligned = nw.global_align("CEELECANTH", "PELICAN",
                          matrix='atiam-fpa_alpha.dist')
score = nw.score_alignment(aligned[0], aligned[1],
                           gap_open=-1, gap_extend=-1,
                           matrix='atiam-fpa_alpha.dist')
print('Results for basic gap costs (linear)')
print(aligned[0])
print(aligned[1])
print('Score : ' + str(score))

print("myNeedleman")
print(
    myNeedleman("CEELECANTH",
                "PELICAN",
Beispiel #35
0
    def _RealignRead(self, read):
        # Realign one read against the reference window it maps to and
        # return the same read object with updated cigar and tags.
        #
        # Unmapped reads are returned untouched.  When self.only_gapped is
        # True, reads whose cigar has no operation coded 1 or 2 are also
        # returned untouched.  Otherwise the read sequence is globally
        # aligned to the reference slice, the 'AS' tag is set when
        # self.compute_scores is True, read.cigar is replaced, and 'OC' and
        # 'OP' tags are appended.
        has_score = False

        if read.is_unmapped is True:
            #self._out.write(read)
            return read
    
        tags = read.tags
        '''If any of the read tags are AS, then remember the read has an
        existing score.'''
        for i in range(0, len(tags)):
            if tags[i][0] == 'AS':
                has_score = True
                continue

        if self.only_gapped is True:
            has_indel = False
            # Cigar operation codes 1 and 2 are treated as indels here.
            for c in read.cigar:
                if c[0] == 1 or c[0] == 2:
                    # read has an indel
                    has_indel = True
                    break
            if has_indel == False:
                # Read must not have an indel
                '''
                If the read is a perfect match then don't realign
                '''
                print(read.qname + ', ' + str(read.cigar) + " does not have an indel")
                #self._out.write(read)
                return read

        # Lengths of cigar operations coded 4 at either end of the read
        # (soft clips, going by the variable names); 0 when absent.
        fivep_soft_clip = 0
        threep_soft_clip = 0
        cigar_last = len(read.cigar) - 1
        if read.cigar[0][0] == 4:
            fivep_soft_clip = read.cigar[0][1]
        if read.cigar[cigar_last][0] == 4:
            threep_soft_clip = read.cigar[cigar_last][1]

        # Reference slice covering the aligned span of the read, widened by
        # the clipped lengths on both sides.
        ref = self.ref.fetch(reference=self.refnames[read.tid],
                                      start=read.aend - read.alen - fivep_soft_clip,
                                      end=read.aend + threep_soft_clip)


        # Realign sense strand reads
        query = ''
        subject = ''
        # With reverse_sense enabled, forward-strand reads are aligned after
        # passing both sequences through self.ReverseSeq; the resulting cigar
        # is reversed again further down.
        if self.reverse_sense is True and read.is_reverse is False:
            query = self.ReverseSeq(read.seq)
            subject = self.ReverseSeq(ref.upper())
            #query = Seq(read.seq).complement().tostring()
            #subject = Seq(ref.upper()).complement().tostring()
        else:
            query = read.seq
            subject = ref.upper()

        # NOTE(review): unconditional debug output using the Python 2 print
        # statement, unlike the print() call used earlier in this method.
        print query, subject
        aln = nw.global_align(query, subject,
                              gap_open=self.gap_open,
                              gap_extend=self.gap_extend,
                              matrix=self.matrix)

        if self.compute_scores is True:
            score = nw.score_alignment(aln[0], aln[1],
                                       gap_open=self.gap_open,
                                       gap_extend=self.gap_extend,
                                       matrix=self.matrix)

            if has_score is True:
                # Overwrite the existing 'AS' tag; if several are present the
                # last one found is the one replaced.
                as_index = None
                tags = read.tags
                for i in range(0, len(tags)):
                    if tags[i][0] == 'AS':
                        as_index = i
                if as_index is None:
                    raise ValueError("Read " + read.qname +
                    " is missing an alignment score.")
                tags[as_index] = ('AS', score)
                read.tags = tags
            else:
                # No previous score: put the new 'AS' tag in front.
                read.tags = [('AS', score)] + read.tags 
        
        bam_cigar = self._MakeBamCigar(aln, read)
        if self.reverse_sense is True and read.is_reverse is False:
            # Undo the sequence reversal applied before the alignment.
            bam_cigar.reverse()
        
        if self.verbose is True:
            self.PrettyPrint(read, aln, bam_cigar)

        # New read
        read.cigar = bam_cigar
        # NOTE(review): 'OC' is built from read.cigar after it was replaced
        # on the previous line, so it records the new cigar -- confirm
        # whether the pre-realignment cigar was intended.
        read.tags = read.tags + [('OC', self._MakeSamCigar(read.cigar)),
                                 ('OP', read.pos)]
        #self._out.write(read)
        return read
Beispiel #36
0
    type = str, 
    help = "Which allele to align against"
)


if __name__ == '__main__':
    # Align every sequence of the input FASTA against one reference allele
    # and keep, for each, only the columns where the reference has no gap.
    args = parser.parse_args()
    print(args)

    with open(args.input_filename,'r') as f:
        d = parse_fasta(f.read())

    allele = args.reference_allele

    # Fall back to the allele name without '*' when the exact key is absent.
    refseq = d[allele] if allele in d else d[allele.replace("*", "")]

    result = {}
    for k,v in d.iteritems():
        x, y = nw.global_align(refseq, v, gap_open=-40, gap_extend=-20, matrix='BLOSUM50')
        # Characters of the aligned sequence at non-gap reference columns.
        result[k] = "".join(y[i] for i, xi in enumerate(x) if xi != "-")

    if args.output_filename:
        with open(args.output_filename, 'w') as f:
            for k,v in result.iteritems():
                record = ">%s\n%s\n" % (k, v)
                f.write(record)
Beispiel #37
0
    def RemapReadsSingle(self, count=None):
        """Realign mapped reads from ``self.sam_in`` and write them out.

        Opens ``self.sam_out`` for writing and iterates over
        ``self.sam_in.fetch()``.  Each realigned read is globally aligned
        (``nw.global_align``) against the reference window it covers, gets a
        new CIGAR built by ``self._MakeBamCigar``, optionally a recomputed
        ``AS`` tag, and ``OC`` / ``OP`` tags, and is then written.  Unmapped
        reads, and reads without an insertion/deletion when
        ``self.only_gapped`` is True, are written unchanged.  The output file
        is closed when the loop finishes or fails.

        Args:
            count: Optional maximum number of reads to realign.  Iteration
                stops as soon as this many reads have been realigned (reads
                after that point are not written).  ``None`` processes every
                read.

        Raises:
            ValueError: If ``AS`` tags are being replaced (an ``AS`` tag was
                seen on an earlier read) and the current read has none.
        """
        counter = 0
        has_score = False
        write_mode = 'wb'
        if self.binary_mode is False:
            write_mode = 'wh'
        self._out = pysam.Samfile(self.sam_out,
                                  mode=write_mode,
                                  referencenames=self.sam_in.references,
                                  referencelengths=self.sam_in.lengths,
                                  header=self._MakeHeader(self.sam_in.header)
                                  )

        try:
            for read in self.sam_in.fetch():
                # Optional cap: stop once `count` reads have been realigned.
                # `>=` (not `>`) so that exactly `count` reads are realigned.
                if count is not None and counter >= count:
                    break
                if read.is_unmapped is True:
                    self._out.write(read)
                    continue
                if counter == 0:
                    # Until the first read has been realigned, check whether
                    # an alignment score is already present and record that
                    # in the has_score flag (it is never reset afterwards).
                    if any(tag[0] == 'AS' for tag in read.tags):
                        has_score = True
                if self.only_gapped is True:
                    # CIGAR operations 1 and 2 are treated as indels.
                    has_indel = any(op[0] == 1 or op[0] == 2
                                    for op in read.cigar)
                    if not has_indel:
                        # A read without an indel is not realigned; it is
                        # passed through unchanged.
                        print(read.qname + ', ' + str(read.cigar) + " does not have an indel")
                        self._out.write(read)
                        continue

                # Lengths of the soft clips (CIGAR operation 4) at either end
                # of the read; they widen the reference window fetched below.
                fivep_soft_clip = 0
                threep_soft_clip = 0
                cigar = read.cigar
                if cigar[0][0] == 4:
                    fivep_soft_clip = cigar[0][1]
                if cigar[-1][0] == 4:
                    threep_soft_clip = cigar[-1][1]

                ref = self.ref.fetch(reference=self.refnames[read.tid],
                                     start=read.aend - read.alen - fivep_soft_clip,
                                     end=read.aend + threep_soft_clip)

                # Forward-strand reads are aligned on reversed sequences when
                # reverse_sense is set; the resulting CIGAR is flipped back
                # further down.
                flip = self.reverse_sense is True and read.is_reverse is False
                if flip:
                    query = self.ReverseSeq(read.seq)
                    subject = self.ReverseSeq(ref.upper())
                else:
                    query = read.seq
                    subject = ref.upper()

                aln = nw.global_align(query, subject,
                                      gap_open=self.gap_open,
                                      gap_extend=self.gap_extend,
                                      matrix=self.matrix)

                if self.compute_scores is True:
                    score = nw.score_alignment(aln[0], aln[1],
                                               gap_open=self.gap_open,
                                               gap_extend=self.gap_extend,
                                               matrix=self.matrix)

                    if has_score is True:
                        # Replace the existing AS tag (the last one, if the
                        # read carries several).
                        as_index = None
                        tags = read.tags
                        for i, tag in enumerate(tags):
                            if tag[0] == 'AS':
                                as_index = i
                        if as_index is None:
                            raise ValueError("Read " + read.qname +
                                             " is missing an alignment score.")
                        tags[as_index] = ('AS', score)
                        read.tags = tags
                    else:
                        read.tags = [('AS', score)] + read.tags

                bam_cigar = self._MakeBamCigar(aln, read)
                if flip:
                    bam_cigar.reverse()

                if self.verbose is True:
                    self.PrettyPrint(read, aln, bam_cigar)

                # New read
                read.cigar = bam_cigar
                # NOTE(review): 'OC' is built from read.cigar *after* it was
                # overwritten above, so it records the new CIGAR rather than
                # the pre-realignment one -- confirm this is intended.
                read.tags = read.tags + [('OC', self._MakeSamCigar(read.cigar)),
                                         ('OP', read.pos)]
                self._out.write(read)
                counter += 1
        finally:
            # The original referenced `self._out.close` without calling it,
            # so the output was never explicitly closed.
            self._out.close()
Beispiel #38
0
def nwalign(s1,s2):
    """Return ``nw.global_align(s1, s2)`` using that function's defaults."""
    alignment = nw.global_align(s1, s2)
    return alignment
Beispiel #39
0
def get_rmsd_rotation_and_translations(file1, file2):
    """Compute a rotation matrix and two translations from two PDB files.

    The residue sequences of both files are globally aligned; residues that
    sit in the same alignment column and are identical are paired.  The atoms
    of the paired residues (matched by atom name) are handed to
    ``pyqcprot.CalcRMSDRotationalMatrix``.

    Args:
        file1: Path of the first PDB file.
        file2: Path of the second PDB file.

    Returns:
        A tuple ``(rotation, trans1, trans2)``: ``rotation`` is the 9-element
        array passed to pyqcprot, reshaped to 3x3; ``trans1`` / ``trans2``
        are, for each structure, the coordinates of the first paired atom
        after the pyqcprot call minus its coordinates before the call.

    Exits via ``sys.exit`` when a pair of matched residues does not contain
    the same set of atom names.
    """
    nres1 = get_total_residue_number(file1)
    nres2 = get_total_residue_number(file2)

    seq1, ind1 = get_sequence_from_PDB(file1)
    seq2, ind2 = get_sequence_from_PDB(file2)

    alignment = nw.global_align(seq1, seq2)
    aligned1, aligned2 = alignment[0], alignment[1]

    # Walk the *aligned* sequences column by column.  idx1 / idx2 track the
    # position in the ungapped sequences so that the residue identifiers in
    # ind1 / ind2 can be looked up.  (The previous version indexed the
    # unaligned seq1 / seq2 here, which never contain '-', so gaps were
    # ignored and residues were paired purely by position.)
    ind1new = []
    ind2new = []
    idx1, idx2 = 0, 0

    for res1, res2 in zip(aligned1, aligned2):
        if res1 == res2 and res1 != '-':
            ind1new.append(ind1[idx1])
            ind2new.append(ind2[idx2])
        if res1 != '-':
            idx1 += 1
        if res2 != '-':
            idx2 += 1

    ind1 = ind1new
    ind2 = ind2new

    #TODO: add a threshold for the number of residues considered
    # Percentage of each structure's residues that were paired; not consumed
    # yet, kept for the threshold mentioned in the TODO above.
    frac1 = len(ind1) * 100.0 / nres1
    frac2 = len(ind2) * 100.0 / nres2

    # get coordinates of specific residues
    coords1, ind1 = get_residues_coordinates(file1, ind1)
    coords2, ind2 = get_residues_coordinates(file2, ind2)

    new_coords1 = []
    new_coords2 = []

    # check if there is consistency in atom names
    for idx, coords1_res in enumerate(coords1):
        coords2_res = coords2[idx]

        atomnames1 = [item[0] for item in coords1_res]
        atomnames2 = [item[0] for item in coords2_res]
        if set(atomnames1) != set(atomnames2):
            sys.exit(
                "Inconsistency found in residue %s in file %s and residue %s in file %s! Missing atom suspected..."
                % (ind1[idx], file1, ind2[idx], file2))

        # create new coordinates: pair every atom of residue 1 with the first
        # atom of residue 2 carrying the same name, in residue-1 atom order
        for an1, x1, y1, z1 in coords1_res:
            for an2, x2, y2, z2 in coords2_res:
                if an1 == an2:
                    new_coords1.append([x1, y1, z1])
                    new_coords2.append([x2, y2, z2])
                    break

    # One column per atom (rows are x, y, z).
    new_coords1 = np.array(new_coords1).T
    new_coords2 = np.array(new_coords2).T

    rotation = np.zeros(9)
    # Remember (negated) where the first atom was before the pyqcprot call.
    trans1 = -new_coords1[:, 0]
    trans2 = -new_coords2[:, 0]

    # NOTE(review): presumably pyqcprot fills `rotation` in place and shifts
    # both coordinate arrays in place (the translations below rely on the
    # arrays having changed) -- confirm against the pyqcprot documentation.
    # The returned RMSD value itself is not used.
    pyqcprot.CalcRMSDRotationalMatrix(new_coords1, new_coords2,
                                      rotation, None)

    rotation = rotation.reshape((3, 3))
    # first-atom position after the call minus its position before the call
    trans1 += new_coords1[:, 0]
    trans2 += new_coords2[:, 0]

    return rotation, trans1, trans2
    idB = record.id
    seqB = str(record.seq)
    bSeqs[idB] = seqB
handle.close()


# For every tab-separated gene pair listed in fileNameToAlign, globally align
# the two sequences (looked up in aSeqs / bSeqs) and print each aligned column
# followed by the number of identical columns and the alignment length.
# The `with` block closes the file, which the previous version never did.
# NOTE: "rU" is the Python 2 universal-newlines mode; on Python 3 plain "r"
# is the equivalent (the "U" flag was removed in Python 3.11).
with open(fileNameToAlign, "rU") as handle:
    for line in handle:
        line = line.rstrip()
        genesList = line.split("\t")
        geneA = genesList[0]
        geneB = genesList[1]

        seqTxtA = aSeqs[geneA].upper()
        seqTxtB = bSeqs[geneB].upper()
        alignment = nwalign.global_align(seqTxtA, seqTxtB, matrix='NUC.4.4',
                                         gap_open=-11, gap_extend=-1)

        lenOfSeqs = len(alignment[0])
        matched = 0

        for currLetterA, currLetterB in zip(alignment[0], alignment[1]):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(currLetterA)
            print(currLetterB)

            if currLetterA == currLetterB:
                matched = matched + 1

        print(str(matched))
        print(str(lenOfSeqs))
Beispiel #41
0
	norm = float(d[i-1][j-1]/((len(s1)+len(s2))/2.0))
	return norm

# 1
# Mode "1" (first command-line argument): start from one generated sequence,
# apply mutacion() repeatedly, align the starting sequence against every
# mutated version and plot the resulting distances against m_list.
if sys.argv[1] == "1":
	print "Pregunta 1..."
	sec_ini = generar_secuencia()
#	d, d_matrix, sec_mut  = [], [], []
	# d collects one distance per loop iteration.
	d, sec_mut  = [], []
	# NOTE(review): sec_mut is bound to the same object as sec_ini here; if
	# mutacion() modifies its argument in place, sec_ini changes as well.
	sec_mut = sec_ini
	# String form of the starting sequence, fixed before any mutation.
	tmp1 = "".join(sec_ini)
	# m + 1 iterations (i = 0 .. m); each one mutates the result of the
	# previous iteration, so the changes accumulate.
	for i in range(m+1):
		sec_mut = mutacion(sec_mut,i)
		tmp2 = "".join(sec_mut)
		# Align the starting sequence against the current mutated sequence
		# and measure the distance between the two aligned strings.
		align = nw.global_align(tmp1,tmp2)
		distance =distancia(align[0],align[1])
#		matrix,distance =distancia(align[0],align[1])
		d.append(distance)
#		d_matrix.append(matrix)
		tmp2 = ""

	# NOTE(review): show() is called here before anything has been plotted,
	# and again after the plot is set up -- confirm the first call is wanted.
	p.show()
	# Distances (y) against m_list (x); m_list is defined outside this block.
	plot1 = p.plot(m_list,d)
	title(u'Distancia vs Número de mutaciones')
	grid(True)
	# Axis limits: x from 0 to m, y from 0 to max(d) + 1.
	p.axis([0,m,0,max(d)+1],0.01)
	xlabel(u'Número de mutaciones')
	ylabel('Distancia')
	p.show()