def name_align_score(n1, n2):
    """Score the similarity of two names, as full names and as initials.

    Returns a pair ``(full_score, initials_score)``.  Each score is the
    largest ``alignment[2] / alignment[4]`` ratio over the alignments that
    ``align.globalxx`` returns (in Biopython's pairwise2 tuples these are
    presumably the score and the end position -- confirm), or ``0`` when no
    ratio exceeds zero.

    Missing names are reported through sentinel pairs instead of scores:
    ``(-3, -3)`` when both names are empty, ``(-1, -1)`` when only ``n1`` is
    empty and ``(-2, -2)`` when only ``n2`` is empty.
    """
    if n1 == '':
        # Distinguish "both missing" from "only the first one missing".
        return (-3, -3) if n2 == '' else (-1, -1)
    if n2 == '':
        return -2, -2

    def best_ratio(left, right):
        # The leading 0 is the floor the running maximum starts from.
        ratios = [float(aln[2]) / float(aln[4])
                  for aln in align.globalxx(left, right)]
        return max([0] + ratios)

    # Full (cleaned) names first, then the initialised forms.
    full_score = best_ratio(clean_name(n1), clean_name(n2))
    initials_score = best_ratio(initialize_name(n1), initialize_name(n2))
    return full_score, initials_score
def find_common_atoms(chain_fix, chain_mov):
    '''
    Return two equally long lists of paired atoms for superimposition.

    Takes two chain objects that were previously filtered and are considered
    the same chain in different .pdb files.  The residues of all peptides
    built from each chain are aligned globally, and for every alignment
    column where neither sequence has a gap the CA atoms of the two residues
    are paired.

    NOTE(review): assumes the module-level ``ppb`` is a Bio.PDB polypeptide
    builder whose peptides are residue lists exposing ``get_sequence()``.

    :param chain_fix: chain whose atoms stay fixed
    :param chain_mov: chain whose atoms will be moved
    :return: (atoms_fix, atoms_mov); both lists are empty when either chain
             yields no peptide residues or no alignment is produced
    '''
    peptides_fix = ppb.build_peptides(chain_fix)
    peptides_mov = ppb.build_peptides(chain_mov)

    # Use every peptide of the chain, not only the last one built.
    residues_fix = [res for pep in peptides_fix for res in pep]
    residues_mov = [res for pep in peptides_mov for res in pep]
    seq_fix = ''.join(str(pep.get_sequence()) for pep in peptides_fix)
    seq_mov = ''.join(str(pep.get_sequence()) for pep in peptides_mov)

    atoms_fix = []
    atoms_mov = []
    if not seq_fix or not seq_mov:
        return atoms_fix, atoms_mov

    alignments = align.globalxx(seq_fix, seq_mov)  # align sequences
    if not alignments:
        return atoms_fix, atoms_mov
    aligned_fix, aligned_mov = alignments[0][0], alignments[0][1]

    # Running residue indices: a gap column consumes no residue on that side.
    idx_fix = 0
    idx_mov = 0
    for sym_fix, sym_mov in zip(aligned_fix, aligned_mov):
        if sym_fix != '-' and sym_mov != '-':
            res_fix = residues_fix[idx_fix]
            res_mov = residues_mov[idx_mov]
            # Pair only when both residues have a CA atom, so the two lists
            # always stay the same length.
            if 'CA' in res_fix and 'CA' in res_mov:
                atoms_fix.append(res_fix['CA'])
                atoms_mov.append(res_mov['CA'])
        if sym_fix != '-':
            idx_fix += 1
        if sym_mov != '-':
            idx_mov += 1
    return atoms_fix, atoms_mov
def title_align_score(t1, t2):
    """Return the best normalised global-alignment ratio of two titles.

    For every alignment returned by ``align.globalxx(t1, t2)`` the ratio
    ``alignment[2] / alignment[4]`` is computed; the largest ratio is
    returned, or ``0`` when no ratio exceeds zero.
    """
    ratios = [float(aln[2]) / float(aln[4])
              for aln in align.globalxx(t1, t2)]
    # The leading 0 is the starting value of the running maximum.
    return max([0] + ratios)
def affil_align_score(t1, t2):
    """Return the best normalised global-alignment ratio of two affiliations.

    Each alignment from ``align.globalxx(t1, t2)`` contributes the ratio
    ``alignment[2] / alignment[4]``.  The result is the largest such ratio,
    or ``0`` when none is greater than zero.
    """
    best = 0
    for candidate in align.globalxx(t1, t2):
        ratio = float(candidate[2]) / float(candidate[4])
        # Strictly-greater comparison keeps the first maximum encountered.
        best = ratio if ratio > best else best
    return best
def string_align_score(s1, s2):
    """Return the best normalised global-alignment ratio of two strings.

    Both inputs are passed through ``clean`` before being aligned with
    ``align.globalxx``.  Each alignment contributes the ratio
    ``alignment[2] / alignment[4]``; the largest ratio is returned, or
    ``0.0`` when no ratio exceeds zero.

    The function no longer prints the raw alignment list: that was debug
    output written to stdout on every call.
    """
    alignments = align.globalxx(clean(s1), clean(s2))
    max_score = 0.0
    for aln in alignments:
        score = float(aln[2]) / float(aln[4])
        if score > max_score:
            max_score = score
    return max_score
def kw_align_score(kws1, kws2):
    """Return the best alignment ratio between any pair of long keywords.

    Only keywords longer than 5 characters are considered.  Every such
    keyword of ``kws1`` is aligned against every such keyword of ``kws2``
    with ``align.globalxx``; each alignment contributes the ratio
    ``alignment[2] / alignment[4]``.  The largest ratio is returned, or
    ``0.0`` when there is no eligible pair or no ratio exceeds zero.
    """
    # Filter both inputs once up front: this avoids rebuilding the second
    # list for every keyword of the first, and keeps the pairing complete
    # even when ``kws2`` is a one-shot iterable.
    long_kws1 = [k for k in kws1 if len(k) > 5]
    long_kws2 = [k for k in kws2 if len(k) > 5]

    max_score = 0.0
    for kw1 in long_kws1:
        for kw2 in long_kws2:
            for aln in align.globalxx(kw1, kw2):
                score = float(aln[2]) / float(aln[4])
                if score > max_score:
                    max_score = score
    return max_score
def affil_align_score(t1, t2):
    """Return the best normalised global-alignment ratio of two affiliations.

    An empty string on either side short-circuits to ``0`` without aligning.
    Otherwise each alignment from ``align.globalxx(t1, t2)`` contributes the
    ratio ``alignment[2] / alignment[4]`` and the largest one is returned
    (``0`` when none is greater than zero).
    """
    # Guard clauses: nothing to align when either affiliation is missing.
    if t1 == '':
        return 0
    if t2 == '':
        return 0

    ratios = [float(aln[2]) / float(aln[4])
              for aln in align.globalxx(t1, t2)]
    # The leading 0 is the starting value of the running maximum.
    return max([0] + ratios)
def has_homolgs(target_seq, known_seqs, min_identity=0.95):
    """Return the first known sequence that is a homolog of ``target_seq``.

    Each known sequence is aligned globally against ``target_seq`` and the
    first alignment returned is inspected.  Identity is the number of
    alignment columns holding the same symbol in both rows, divided by the
    alignment length.

    :param target_seq: sequence to test
    :param known_seqs: iterable of candidate sequences
    :param min_identity: minimum identity fraction required (default 0.95)
    :return: the first element of ``known_seqs`` reaching ``min_identity``,
             or ``None`` when there is none
    """
    for k_seq in known_seqs:
        alignments = align.globalxx(target_seq, k_seq)
        if not alignments:
            # No alignment could be produced, so there is nothing to compare.
            continue
        aln_seq_1 = alignments[0][0]  # first aligned row ('-' marks gaps)
        aln_seq_2 = alignments[0][1]  # second aligned row
        al_length = len(aln_seq_1)
        if not al_length:
            continue
        ident = sum(base1 == base2
                    for base1, base2 in zip(aln_seq_1, aln_seq_2))
        # float() keeps this a true division on Python 2 as well.
        if float(ident) / al_length >= min_identity:
            return k_seq
    return None
def align_fasta_seqs(fasta_file, output_name):
    """Write all pairwise global alignments of a FASTA file to a text file.

    Every ordered pair of records with different IDs is aligned with
    ``align.globalxx``.  For each pair a header line ``"<ID1>-<ID2>:"`` is
    written to ``alignments-<output_name>.txt``, followed by the formatted
    first alignment (omitted when no alignment is produced).

    :param fasta_file: input passed to ``FASTA_iterator``; it is iterated
                       once per record, so it must be re-iterable
    :param output_name: suffix used to build the output file name
    :return: None
    """
    out_path = "alignments-" + output_name + ".txt"
    # The context manager closes the file even if alignment or formatting
    # raises part-way through.
    with open(out_path, "w") as out_file:
        for ID1, sequence1 in FASTA_iterator(fasta_file):
            for ID2, sequence2 in FASTA_iterator(fasta_file):
                if ID1 == ID2:
                    # Do not align a record against itself.
                    continue
                alignments = align.globalxx(sequence1, sequence2)
                out_file.write(ID1 + "-" + ID2 + ":" + "\n")
                if alignments:
                    out_file.write(format_alignment(*alignments[0]))
def locate_reference(sequences, pdb_sequence):
    """
    Get the index of the reference sequence in sequences using a global
    pairwise alignment.

    :param sequences: iterable of candidate sequences
    :param pdb_sequence: sequence each candidate is aligned against
    :return: index of the first candidate with the highest alignment score
    :raises ValueError: if ``sequences`` is empty
    """
    # score_only makes globalxx return just the score for each candidate.
    scores = [
        align.globalxx(pdb_sequence, candidate,
                       one_alignment_only=1, score_only=1)
        for candidate in sequences
    ]
    top_score = max(scores)
    return scores.index(top_score)
def find_common_atoms(chain_fix, chain_mov):
    '''
    Build two position-coherent atom lists ready to be superimposed.

    The sequences of both chains are aligned globally.  For every alignment
    column where neither sequence has a gap, atoms of the two corresponding
    residues are appended to the output lists:

    * protein chains: the residue backbone, falling back to
      ``backbone(True, 'C')`` for both residues when the full backbone of
      either one cannot be obtained;
    * DNA / RNA chains: all children of the residue.

    If the chains are of an unknown or different chemical type, an error is
    written to stderr and the process exits with status 1 as soon as the
    first gap-free column is reached.

    :param chain_fix: Chain object
    :param chain_mov: Chain object
    :return: two lists of Atom objects (fixed, moving); both are empty when
             no alignment is produced
    '''
    # First check which kind of molecule the chains are, and that both are
    # of the same kind.
    chem_type = None
    fix_class = chain_fix.sequence.__class__
    if fix_class == chain_mov.sequence.__class__:
        chem_type = {
            'ProteinSequence': 'P',
            'DNASequence': 'D',
            'RNASequence': 'R',
        }.get(fix_class.__name__)

    seq_fix = chain_fix.get_sequence_str()
    seq_mov = chain_mov.get_sequence_str()
    atoms_fix = []
    atoms_mov = []

    alignments = align.globalxx(seq_fix, seq_mov)  # align sequences
    if not alignments:
        return atoms_fix, atoms_mov
    aligned_fix, aligned_mov = alignments[0][0], alignments[0][1]

    # Running residue indices: a gap column consumes no residue on that side.
    idx_fix = 0
    idx_mov = 0
    for sym_fix, sym_mov in zip(aligned_fix, aligned_mov):
        if sym_fix != '-' and sym_mov != '-':
            if chem_type is None:
                sys.stderr.write('Tried to align chains with an unknown or different chemical structure')
                sys.exit(1)
            res_fix = chain_fix.childs[idx_fix]
            res_mov = chain_mov.childs[idx_mov]
            if chem_type == 'P':
                # Collect both backbones before touching the output lists so
                # a failure on one residue cannot leave them out of step.
                try:  # try to get the full backbone
                    part_fix = list(res_fix.backbone())
                    part_mov = list(res_mov.backbone())
                except Exception:
                    # Maybe a backbone atom is missing, so use C only.
                    part_fix = list(res_fix.backbone(True, 'C'))
                    part_mov = list(res_mov.backbone(True, 'C'))
            else:  # DNA or RNA
                part_fix = res_fix.childs
                part_mov = res_mov.childs
            atoms_fix.extend(part_fix)
            atoms_mov.extend(part_mov)
        if sym_fix != '-':
            idx_fix += 1
        if sym_mov != '-':
            idx_mov += 1
    return atoms_fix, atoms_mov
def _f(a, b):
    """Run one generic global alignment of ``a`` and ``b``; return the longer length.

    The alignment result itself is discarded; only
    ``max(len(a), len(b))`` is returned.
    NOTE(review): presumably a timing/benchmark helper -- confirm with callers.
    """
    align.globalxx(a, b, one_alignment_only=True, force_generic=True)
    longest = max(len(a), len(b))
    return longest
def junction_distance(seq1, seq2, n, dist_mat, norm, sym, tol=3, c=35.,
                      length_constraint=True):
    """Calculate a distance between two input sequences.

    .. note:: Deprecated.
          `junction_distance` will be removed in icing 0.2. It is replaced by
          `string_distance`.

    Parameters
    ----------
    seq1, seq2 : str
        String sequences.
    n : int
        Choose how to break down sequences. Usually is 1 or 5.
    dist_mat : pandas.DataFrame
        Matrix which defines the distance between the single characters
        (rows) and the n-mers (columns).
    norm : ('len', 'mut', 'max', 'min', 'none')
        Normalisation method. Any other value disables normalisation.
    sym : ('avg', 'min', 'sum')
        Choose how to symmetrise distances between seq1 and seq2 or
        seq2 and seq1. Any other value behaves like 'sum'.
    tol : int, optional, default: 3
        Tolerance in the length of the sequences. Default is 3
        (3 nucleotides form an amino acid. If seq1 and seq2 represent
        amino acid sequences, use tol = 1).
    c : float, optional, default: 35.0, deprecated
        Constant used with mutations. Now ignored. Will be removed.
    length_constraint : boolean, optional, default: True
        Insert the constraint on the difference between the lengths of
        seq1 and seq2. If False, `tol` is ignored.

    Returns
    -------
    distance : float
        A normalised distance between seq1 and seq2. It is 0.0 when the
        compared positions show no mismatch. When the length constraint is
        violated it is ``min(len(seq1), len(seq2))`` divided by the
        normalising factor.

    Raises
    ------
    ZeroDivisionError
        If the length constraint is violated while the normalising factor
        is zero (e.g. ``norm='mut'`` with no mismatching position).
    """
    if length_constraint and 0 < abs(len(seq1) - len(seq2)) <= tol:
        # Lengths differ, but within tolerance: align the sequences and
        # continue with the two rows of the first alignment.
        seq1, seq2 = map(extra.junction_re,
                         align.globalxx(seq1, seq2)[0][:2])

    # Positions where the sequences differ (zip stops at the shorter one).
    mutated = [i for i, (a, b) in enumerate(zip(seq1, seq2)) if a != b]

    # Determine normalising factor.
    if norm == 'len':
        norm_by = len(seq1)
    elif norm == 'mut':
        norm_by = len(mutated)
    elif norm == 'max':
        norm_by = max(len(seq1), len(seq2))
    elif norm == 'min':
        norm_by = min(len(seq1), len(seq2))
    else:
        norm_by = 1

    # Checked before any n-mer work, which this branch never needs.
    # float() keeps the result a true division on Python 2 as well.
    if length_constraint and abs(len(seq1) - len(seq2)) > tol:
        return min(len(seq1), len(seq2)) / float(norm_by)

    # No mismatch means an empty sum: the distance is zero, and this also
    # avoids dividing 0 by a zero normalising factor.
    if not mutated:
        return 0.0

    # Determine symmetry function.
    if sym == 'avg':
        sym_fun = np.mean
    elif sym == 'min':
        sym_fun = min
    else:
        sym_fun = sum

    nmers = get_nmers([seq1, seq2], n)
    nmers1, nmers2 = nmers[seq1], nmers[seq2]
    # Each mismatch contributes the symmetrised pair of lookups: character
    # of one sequence against the n-mer of the other at the same position.
    total = sum(
        sym_fun([float(dist_mat.at[seq1[m], nmers2[m]]),
                 float(dist_mat.at[seq2[m], nmers1[m]])])
        for m in mutated
    )
    return total / float(norm_by)
def junction_distance(seq1, seq2, n, dist_mat, norm, sym, tol=3, c=35.,
                      length_constraint=True):
    """Calculate a distance between two input sequences.

    .. note:: Deprecated.
          `junction_distance` will be removed in icing 0.2. It is replaced by
          `string_distance`.

    Parameters
    ----------
    seq1, seq2 : str
        String sequences.
    n : int
        Choose how to break down sequences. Usually is 1 or 5.
    dist_mat : pandas.DataFrame
        Matrix which defines the distance between the single characters
        (rows) and the n-mers (columns).
    norm : ('len', 'mut', 'max', 'min', 'none')
        Normalisation method. Any other value disables normalisation.
    sym : ('avg', 'min', 'sum')
        Choose how to symmetrise distances between seq1 and seq2 or
        seq2 and seq1. Any other value behaves like 'sum'.
    tol : int, optional, default: 3
        Tolerance in the length of the sequences. Default is 3
        (3 nucleotides form an amino acid. If seq1 and seq2 represent
        amino acid sequences, use tol = 1).
    c : float, optional, default: 35.0, deprecated
        Constant used with mutations. Now ignored. Will be removed.
    length_constraint : boolean, optional, default: True
        Insert the constraint on the difference between the lengths of
        seq1 and seq2. If False, `tol` is ignored.

    Returns
    -------
    distance : float
        A normalised distance between seq1 and seq2. It is 0.0 when the
        compared positions show no mismatch. When the length constraint is
        violated it is ``min(len(seq1), len(seq2))`` divided by the
        normalising factor.

    Raises
    ------
    ZeroDivisionError
        If the length constraint is violated while the normalising factor
        is zero (e.g. ``norm='mut'`` with no mismatching position).
    """
    if length_constraint and 0 < abs(len(seq1) - len(seq2)) <= tol:
        # Lengths differ, but within tolerance: align the sequences and
        # continue with the two rows of the first alignment.
        seq1, seq2 = map(extra.junction_re,
                         align.globalxx(seq1, seq2)[0][:2])

    # Positions where the sequences differ (zip stops at the shorter one).
    mutated = [i for i, (a, b) in enumerate(zip(seq1, seq2)) if a != b]

    # Determine normalising factor.
    if norm == 'len':
        norm_by = len(seq1)
    elif norm == 'mut':
        norm_by = len(mutated)
    elif norm == 'max':
        norm_by = max(len(seq1), len(seq2))
    elif norm == 'min':
        norm_by = min(len(seq1), len(seq2))
    else:
        norm_by = 1

    # Checked before any n-mer work, which this branch never needs.
    # float() keeps the result a true division on Python 2 as well.
    if length_constraint and abs(len(seq1) - len(seq2)) > tol:
        return min(len(seq1), len(seq2)) / float(norm_by)

    # No mismatch means an empty sum: the distance is zero, and this also
    # avoids dividing 0 by a zero normalising factor.
    if not mutated:
        return 0.0

    # Determine symmetry function.
    if sym == 'avg':
        sym_fun = np.mean
    elif sym == 'min':
        sym_fun = min
    else:
        sym_fun = sum

    nmers = get_nmers([seq1, seq2], n)
    nmers1, nmers2 = nmers[seq1], nmers[seq2]
    # Each mismatch contributes the symmetrised pair of lookups: character
    # of one sequence against the n-mer of the other at the same position.
    total = sum(
        sym_fun([float(dist_mat.at[seq1[m], nmers2[m]]),
                 float(dist_mat.at[seq2[m], nmers1[m]])])
        for m in mutated
    )
    return total / float(norm_by)