コード例 #1
0
ファイル: name.py プロジェクト: seeslab/kaggle_kdd
def name_align_score(n1, n2):
    """Score the similarity of two names via global alignment.

    Returns a pair ``(full_score, initials_score)``. The first value is
    computed on ``clean_name`` of both inputs, the second on
    ``initialize_name`` of both inputs. Each value is the largest ratio
    ``alignment[2] / alignment[4]`` over the alignments returned by
    ``align.globalxx`` (presumably Bio.pairwise2, where those fields are the
    alignment score and end position -- confirm), never lower than 0.

    Sentinel pairs are returned when a name is the empty string:
    ``(-3, -3)`` if both are empty, ``(-1, -1)`` if only ``n1`` is empty and
    ``(-2, -2)`` if only ``n2`` is empty.
    """
    if n1 == '':
        return (-3, -3) if n2 == '' else (-1, -1)
    if n2 == '':
        return -2, -2

    def _best_ratio(left, right):
        # Largest score/end ratio among the alignments, floored at 0.
        ratios = [float(aln[2]) / float(aln[4])
                  for aln in align.globalxx(left, right)]
        return max([0] + ratios)

    full_score = _best_ratio(clean_name(n1), clean_name(n2))
    initials_score = _best_ratio(initialize_name(n1), initialize_name(n2))
    return full_score, initials_score
コード例 #2
0
def find_common_atoms(chain_fix, chain_mov):
    """Collect atoms from two chains for superimposition.

    Takes two chain objects that were previously filtered and are
    considered the same chain in different .pdb files, aligns their
    sequences and returns two lists of atoms: the ones that stay fixed and
    the ones that will be moved when superimposed.

    The sequence used for each chain is the one of the last peptide yielded
    by ``ppb.build_peptides``.
    """
    # Only the sequence of the last peptide survives each loop.
    for peptide in ppb.build_peptides(chain_fix):
        seq_fix = peptide.get_sequence()
    for peptide in ppb.build_peptides(chain_mov):
        seq_mov = peptide.get_sequence()

    atoms_fix = []
    atoms_mov = []
    first_alignment = align.globalxx(seq_fix, seq_mov)[0]
    for sym_fix, sym_mov in zip(first_alignment[0], first_alignment[1]):
        # Columns holding a gap on either side contribute nothing.
        if sym_fix == '-' or sym_mov == '-':
            continue
        # NOTE(review): every gap-free column appends *all* atoms of both
        # chains again -- looks unintended, confirm against the caller.
        atoms_fix.extend(chain_fix.get_atoms())
        atoms_mov.extend(chain_mov.get_atoms())

    return atoms_fix, atoms_mov
コード例 #3
0
ファイル: titles.py プロジェクト: seeslab/kaggle_kdd
def title_align_score(t1, t2):
    """Return the best alignment ratio between two titles.

    For every alignment produced by ``align.globalxx(t1, t2)`` the ratio
    ``alignment[2] / alignment[4]`` is computed (presumably score over
    alignment end position in Bio.pairwise2 -- confirm); the largest ratio
    is returned, or 0 when there is no alignment with a positive ratio.
    """
    ratios = [float(aln[2]) / float(aln[4])
              for aln in align.globalxx(t1, t2)]
    # Leading 0 acts as the floor and as the result for an empty list.
    return max([0] + ratios)
コード例 #4
0
ファイル: affiliation.py プロジェクト: seeslab/kaggle_kdd
def affil_align_score(t1, t2):
    """Return the best alignment ratio between two affiliation strings.

    Each alignment returned by ``align.globalxx(t1, t2)`` contributes the
    ratio ``alignment[2] / alignment[4]``; the maximum is returned, with 0
    as the lower bound.
    """
    best = 0
    for aln in align.globalxx(t1, t2):
        ratio = float(aln[2]) / float(aln[4])
        best = max(best, ratio)
    return best
コード例 #5
0
ファイル: alignment.py プロジェクト: seeslab/kaggle_kdd
def string_align_score(s1, s2):
    """Return the best alignment ratio between two cleaned strings.

    Both inputs are passed through ``clean`` and globally aligned with
    ``align.globalxx``. For every returned alignment the ratio
    ``alignment[2] / alignment[4]`` is computed (presumably score over
    alignment end position in Bio.pairwise2 -- confirm).

    Returns:
        The largest ratio as a float, or 0.0 if no alignment has a positive
        ratio.

    Side effects:
        Prints the raw alignment list to stdout.
    """
    alignment = align.globalxx(clean(s1), clean(s2))
    # Function-call form prints the same on Python 2 and is valid on
    # Python 3, where the old `print alignment` statement is a SyntaxError.
    # NOTE(review): looks like leftover debug output -- confirm it is wanted.
    print(alignment)
    ratios = [float(an_al[2]) / float(an_al[4]) for an_al in alignment]
    return max([0.0] + ratios)
コード例 #6
0
ファイル: keywords.py プロジェクト: seeslab/kaggle_kdd
def kw_align_score(kws1, kws2):
    """Return the best alignment ratio over all pairs of long keywords.

    Only keywords longer than five characters are considered. Every pair
    (one keyword from ``kws1``, one from ``kws2``) is globally aligned with
    ``align.globalxx`` and the ratio ``alignment[2] / alignment[4]`` is
    computed per alignment. The largest ratio found is returned; 0.0 is
    returned when no pair qualifies.
    """
    best = 0.0
    long_first = [kw for kw in kws1 if len(kw) > 5]
    for left in long_first:
        long_second = [kw for kw in kws2 if len(kw) > 5]
        for right in long_second:
            for aln in align.globalxx(left, right):
                best = max(best, float(aln[2]) / float(aln[4]))
    return best
コード例 #7
0
ファイル: affiliation.py プロジェクト: seeslab/kaggle_kdd
def affil_align_score(t1, t2):
    """Return the best alignment ratio between two affiliation strings.

    If either input is the empty string the score is 0 and no alignment is
    attempted. Otherwise the maximum of ``alignment[2] / alignment[4]`` over
    the alignments from ``align.globalxx(t1, t2)`` is returned, floored
    at 0.
    """
    either_blank = t1 == '' or t2 == ''
    if either_blank:
        return 0

    ratios = [float(aln[2]) / float(aln[4])
              for aln in align.globalxx(t1, t2)]
    return max([0] + ratios)
コード例 #8
0
def has_homolgs(target_seq, known_seqs):
    """Return the first known sequence that is a homolog of ``target_seq``.

    Each known sequence is globally aligned against ``target_seq`` with
    ``align.globalxx`` and only the first returned alignment is inspected.
    A known sequence counts as a homolog when at least 95% of the alignment
    columns hold identical symbols.

    Args:
        target_seq: Sequence to look up.
        known_seqs: Iterable of candidate sequences.

    Returns:
        The first element of ``known_seqs`` reaching 95% identity, or
        ``None`` when there is none.
    """
    for k_seq in known_seqs:
        alignments = align.globalxx(target_seq, k_seq)
        if not alignments:
            # No alignment produced (e.g. an empty sequence): not a homolog.
            continue
        first = alignments[0]
        aln_seq_1 = first[0]  # first aligned sequence ('-' marks gaps)
        aln_seq_2 = first[1]  # second aligned sequence
        al_length = len(aln_seq_1)
        if not al_length:
            continue
        ident = sum(1 for base1, base2 in zip(aln_seq_1, aln_seq_2)
                    if base1 == base2)
        # float() keeps this a true division on Python 2 as well.
        if float(ident) / al_length >= 0.95:
            return k_seq
    return None
コード例 #9
0
def align_fasta_seqs(fasta_file, output_name):
    """Write all pairwise global alignments of a FASTA file to a text file.

    Every ordered pair of records with different identifiers is aligned
    with ``align.globalxx`` (so both ``A-B`` and ``B-A`` are produced). For
    each pair a header line ``<ID1>-<ID2>:`` is written, followed by the
    first alignment rendered by ``format_alignment``.

    Args:
        fasta_file: Passed to ``FASTA_iterator``, which yields
            ``(identifier, sequence)`` pairs.
        output_name: Used to build the output file name
            ``alignments-<output_name>.txt``.

    Returns:
        None. The result is the file written to disk.
    """
    # `with` guarantees the file is closed even if an alignment fails.
    with open("alignments-" + output_name + ".txt", "w") as out_file:
        for ID1, sequence1 in FASTA_iterator(fasta_file):
            for ID2, sequence2 in FASTA_iterator(fasta_file):
                if ID1 == ID2:
                    # Do not align a record against itself.
                    continue
                alignment = align.globalxx(sequence1, sequence2)
                out_file.write(ID1 + "-" + ID2 + ":" + "\n")
                out_file.write(format_alignment(*alignment[0]))
コード例 #10
0
def locate_reference(sequences, pdb_sequence):
    """Find which sequence aligns best against the reference.

    Each entry of ``sequences`` is globally aligned against ``pdb_sequence``
    with ``align.globalxx`` in score-only mode.

    :param sequences: sequence of candidate sequences
    :param pdb_sequence: reference sequence
    :return: index of the first candidate with the highest alignment score
    """
    scores = [
        align.globalxx(pdb_sequence,
                       candidate,
                       one_alignment_only=1,
                       score_only=1)
        for candidate in sequences
    ]
    return scores.index(max(scores))
コード例 #11
0
def find_common_atoms(chain_fix, chain_mov):
    '''
    Collect position-matched atoms of two chains for superimposition.

    The chain sequences are globally aligned and, for every alignment column
    without a gap, the atoms of the corresponding residue in each chain are
    appended to the output lists. For proteins the residue backbone is used,
    falling back to ``backbone(True, 'C')`` for both residues when the
    default backbone cannot be obtained from either one. For DNA/RNA all
    children of the residue are used.

    :param chain_fix: Chain object
    :param chain_mov: Chain object
    :return: Two lists of Atom objects prepared to be superimposed.
    That means the atoms are coherent in their position with the other list.

    Writes a message to stderr and exits with status 1 if an aligned column
    is reached while the chains have different or unrecognised sequence
    classes.
    '''
    # First check which kind of molecule the chains are (and that both are
    # the same kind); chem_type stays None otherwise.
    chem_type = None
    if chain_fix.sequence.__class__ == chain_mov.sequence.__class__:
        chem_type = {
            'ProteinSequence': 'P',
            'DNASequence': 'D',
            'RNASequence': 'R',
        }.get(chain_fix.sequence.__class__.__name__)

    seq_fix = chain_fix.get_sequence_str()
    seq_mov = chain_mov.get_sequence_str()
    atoms_fix = list()
    atoms_mov = list()
    best = align.globalxx(seq_fix, seq_mov)[0]  # first alignment
    aligned_fix, aligned_mov = best[0], best[1]
    # Negative offsets that map an alignment column back to a residue index
    # by discounting the gaps seen so far in each aligned sequence.
    count_fix = 0
    count_move = 0
    for i in range(len(aligned_fix)):
        if aligned_fix[i] == '-':
            count_fix -= 1
        if aligned_mov[i] == '-':
            count_move -= 1
        elif aligned_fix[i] != '-':
            if chem_type == 'P':
                res_fix = chain_fix.childs[i + count_fix]
                res_mov = chain_mov.childs[i + count_move]
                # Fetch both backbones before touching the output lists so a
                # failure on one residue cannot leave the lists out of sync.
                try:  # try to get the full backbone
                    bb_fix = list(res_fix.backbone())
                    bb_mov = list(res_mov.backbone())
                except Exception:
                    # Maybe a backbone atom is missing, so use C only.
                    bb_fix = list(res_fix.backbone(True, 'C'))
                    bb_mov = list(res_mov.backbone(True, 'C'))
                atoms_fix.extend(bb_fix)
                atoms_mov.extend(bb_mov)
            elif chem_type is not None:  # DNA or RNA
                atoms_fix.extend(chain_fix.childs[i + count_fix].childs)
                atoms_mov.extend(chain_mov.childs[i + count_move].childs)
            else:
                sys.stderr.write('Tried to align chains with an unknown or different chemical structure')
                # sys.exit is always available; the bare `exit` builtin is
                # only injected by the `site` module.
                sys.exit(1)
    return atoms_fix, atoms_mov
コード例 #12
0
def _f(a, b):
    """Run a generic global alignment of ``a`` and ``b`` and return the
    length of the longer input.

    The alignment result itself is discarded.
    """
    align.globalxx(a, b, force_generic=True, one_alignment_only=True)
    longest = max(len(a), len(b))
    return longest
コード例 #13
0
ファイル: distances.py プロジェクト: vijaykarthik123/icing
def junction_distance(seq1,
                      seq2,
                      n,
                      dist_mat,
                      norm,
                      sym,
                      tol=3,
                      c=35.,
                      length_constraint=True):
    """Compute a normalised, symmetrised distance between two sequences.

    .. note:: Deprecated.
          `junction_distance` will be removed in icing 0.2. It is replaced by
          `string_distance`.

    When ``length_constraint`` is set and the lengths differ by at most
    ``tol`` (but are not equal), the sequences are first globally aligned and
    each aligned string is passed through ``extra.junction_re``. Positions
    at which the two sequences differ are then scored through ``dist_mat``
    in both directions, the two values are combined according to ``sym``,
    summed over all differing positions and divided by the factor selected
    by ``norm``.

    Parameters
    ----------
    seq1, seq2 : str
        String sequences.
    n : int
        Forwarded to ``get_nmers``; chooses how sequences are broken down.
    dist_mat : pandas.DataFrame
        Matrix defining the distance between single characters; accessed
        with ``dist_mat.at[character, nmer]``.
    norm : ('len', 'mut', 'max', 'min', 'none')
        Normalisation: length of ``seq1``, number of differing positions,
        longer length, shorter length; any other value divides by 1.
    sym : ('avg', 'min', 'sum')
        How the two directional distances are combined: ``np.mean``,
        ``min``; any other value uses ``sum``.
    tol : int, optional, default: 3
        Tolerance on the length difference of the sequences.
    c : float, optional, default: 35.0, deprecated
        Unused by this function.
    length_constraint : boolean, optional, default: True
        Enforce the constraint on the length difference. If False, `tol` is
        ignored.

    Returns
    -------
    distance : float
        The normalised distance. If ``length_constraint`` is set and the
        lengths still differ by more than ``tol``, the shorter length
        divided by the normalisation factor is returned instead.
    """
    if length_constraint and 0 < abs(len(seq1) - len(seq2)) <= tol:
        # Lengths differ within tolerance: work on the aligned sequences.
        first_alignment = align.globalxx(seq1, seq2)[0]
        seq1, seq2 = map(extra.junction_re, first_alignment[:2])

    nmers = get_nmers([seq1, seq2], n)
    # Indices at which the two sequences carry different characters.
    mutated = np.array([
        pos for pos, (ch1, ch2) in enumerate(izip(seq1, seq2)) if ch1 != ch2
    ])
    # Per differing position: both characters and both n-mers.
    pairs = [(seq1[m], seq2[m], nmers[seq1][m], nmers[seq2][m])
             for m in mutated]

    # Normalising factor
    if norm == 'len':
        norm_by = len(seq1)
    elif norm == 'mut':
        norm_by = len(mutated)
    elif norm == 'max':
        norm_by = max(len(seq1), len(seq2))
    elif norm == 'min':
        norm_by = min(len(seq1), len(seq2))
    else:
        norm_by = 1

    # Symmetrising function
    sym_fun = np.mean if sym == 'avg' else (min if sym == 'min' else sum)

    if length_constraint and abs(len(seq1) - len(seq2)) > tol:
        return min(len(seq1), len(seq2)) / norm_by

    total = sum(
        sym_fun([float(dist_mat.at[char1, kmer2]),
                 float(dist_mat.at[char2, kmer1])])
        for char1, char2, kmer1, kmer2 in pairs)
    return total / (norm_by)
コード例 #14
0
ファイル: distances.py プロジェクト: slipguru/icing
def junction_distance(seq1, seq2, n, dist_mat, norm, sym, tol=3, c=35.,
                      length_constraint=True):
    """Return a normalised distance between two input sequences.

    .. note:: Deprecated.
          `junction_distance` will be removed in icing 0.2. It is replaced by
          `string_distance`.

    If ``length_constraint`` is set and the sequence lengths differ by a
    non-zero amount not exceeding ``tol``, both sequences are replaced by
    their globally aligned versions (post-processed with
    ``extra.junction_re``). The distance is then accumulated over the
    positions where the sequences differ, looking up ``dist_mat`` in both
    directions and combining the two values as requested by ``sym``. The
    accumulated value is divided by the factor requested by ``norm``.

    Parameters
    ----------
    seq1, seq2 : str
        String sequences.
    n : int
        Passed to ``get_nmers`` to choose how sequences are broken down.
    dist_mat : pandas.DataFrame
        Distance between single characters, read via
        ``dist_mat.at[character, nmer]``.
    norm : ('len', 'mut', 'max', 'min', 'none')
        Normalisation method: ``len(seq1)``, number of differing positions,
        the longer or the shorter length. Any other value means no
        normalisation (division by 1).
    sym : ('avg', 'min', 'sum')
        Symmetrisation of the two directional distances. 'avg' uses
        ``np.mean``, 'min' uses ``min``, anything else uses ``sum``.
    tol : int, optional, default: 3
        Tolerance on the difference of the sequence lengths.
    c : float, optional, default: 35.0, deprecated
        Not used by this function.
    length_constraint : boolean, optional, default: True
        Apply the constraint on the length difference. If False, `tol` is
        ignored.

    Returns
    -------
    distance : float
        The normalised distance; when ``length_constraint`` is set and the
        lengths differ by more than ``tol``, the shorter length divided by
        the normalisation factor.
    """
    if length_constraint and 0 < abs(len(seq1)-len(seq2)) <= tol:
        # Different lengths within tolerance: align, then clean both strings.
        aligned_pair = align.globalxx(seq1, seq2)[0][:2]
        seq1, seq2 = map(extra.junction_re, aligned_pair)

    nmers = get_nmers([seq1, seq2], n)
    # Positions where the sequences disagree.
    mutated = np.array(
        [idx for idx, (a, b) in enumerate(izip(seq1, seq2)) if a != b])
    # For each such position keep (char1, char2, nmer1, nmer2).
    records = [(seq1[m], seq2[m], nmers[seq1][m], nmers[seq2][m])
               for m in mutated]

    # Determine normalising factor
    if norm == 'len':
        norm_by = len(seq1)
    elif norm == 'mut':
        norm_by = len(mutated)
    elif norm == 'max':
        norm_by = max(len(seq1), len(seq2))
    elif norm == 'min':
        norm_by = min(len(seq1), len(seq2))
    else:
        norm_by = 1

    # Determine symmetry function
    sym_fun = np.mean if sym == 'avg' else (min if sym == 'min' else sum)

    if length_constraint and abs(len(seq1)-len(seq2)) > tol:
        return min(len(seq1), len(seq2)) / norm_by

    accumulated = sum(
        sym_fun([float(dist_mat.at[ch_a, mer_b]),
                 float(dist_mat.at[ch_b, mer_a])])
        for ch_a, ch_b, mer_a, mer_b in records)
    return accumulated / (norm_by)
コード例 #15
0
def _f(a, b):
    """Align ``a`` and ``b`` globally with the generic algorithm, ignore the
    result, and return the larger of the two input lengths.
    """
    align.globalxx(a, b, force_generic=True, one_alignment_only=True)
    size_a, size_b = len(a), len(b)
    return max(size_a, size_b)