Ejemplo n.º 1
0
 def test_seq1_seq3(self):
     """Check seq1/seq3 conversions in both directions and as round trips.

     The three-letter input deliberately mixes upper and lower case, so
     every comparison against seq3 output is made on upper-cased text.
     """
     three_letter = "MetAlaTyrtrpcysthrLYSLEUILEGlYPrOGlNaSnaLapRoTyRLySSeRHisTrpLysThr"
     one_letter = "MAYWCTKLIGPQNAPYKSHWKT"
     three_letter_upper = three_letter.upper()

     # Direct conversions.
     self.assertEqual(seq1(three_letter), one_letter)
     self.assertEqual(seq3(one_letter).upper(), three_letter_upper)
     # Round trips starting from either notation.
     self.assertEqual(seq1(seq3(one_letter)), one_letter)
     self.assertEqual(seq3(seq1(three_letter)).upper(), three_letter_upper)
Ejemplo n.º 2
0
def residue_set_aln(self, structure, chain_name, offset=0):
    """Build a ResidueSet with the chain residues covered by the alignment hit.

    The ungapped hit sequence (``self.aln_hit.seq`` without ``-``) is walked
    residue by residue, starting at ``self.aln_hit.start + offset`` in
    ``structure.chain(chain_name).residues``. If the first residue does not
    match the first aligned letter, every index is shifted by -1.

    Args:
        structure: object exposing ``name`` and ``chain(chain_name)``.
        chain_name: chain identifier; also used as prefix of each entry.
        offset: extra shift added to every residue index.

    Returns:
        A ResidueSet named ``aln_<structure.name>_<chain_name>`` whose
        ``residues`` hold ``<chain_name>_<resid>`` strings. An empty set with
        the same name is returned if any lookup fails or more than ten
        residues disagree with the alignment (the error is logged).
    """
    # TODO: should this take fewer parameters?
    set_name = "aln_" + structure.name + "_" + chain_name
    aligned = ResidueSet(name=set_name)
    chain = structure.chain(chain_name)

    first_mol = chain.residues[self.aln_hit.start + offset]
    first_code = seq1(first_mol.compound).lower()
    ungapped = self.aln_hit.seq.replace("-", "")
    # A mismatch on the very first residue is treated as an off-by-one start.
    shift = -1 if first_code != ungapped[0].lower() else 0

    mismatches = 0
    for position, expected in enumerate(ungapped):
        try:
            mol = chain.residues[self.aln_hit.start + position + shift +
                                 offset]
            if seq1(mol.compound).lower() != expected.lower():
                mismatches += 1
                if mismatches > 10:
                    raise Exception("too many mismaches")
            aligned.residues.append(chain_name + "_" + str(mol.resid))
        except Exception as ex:
            _log.error("pdb %s mal alineado con residuos: %s" %
                       (self.aln_hit.name, ex))
            return ResidueSet(name=set_name)

    return aligned
Ejemplo n.º 3
0
def langage(seq):
    """
    Determine whether a peptide sequence uses one- or three-letter codes.

    Arguments:
        seq: the sequence to test (str, Seq object or SeqRecord object)

    Returns:
        1 (int) for one-letter notation
        3 (int) for three-letter notation
        None if the sequence is not recognised as a protein sequence
    """
    text = str(toSeq(seq))

    # One-letter notation: entirely upper case and already a protein alphabet.
    alphabet = testalpha(text)
    if text.isupper() and (alphabet == IUPAC.protein
                           or alphabet == IUPAC.extended_protein):
        return 1

    # Three-letter notation is only considered for mixed-case text.
    if text.isupper() or text.islower():
        return None

    if (testalpha(seq1(text)) == IUPAC.protein
            or testalpha(seq1(text)) == IUPAC.extended_protein):
        return 3

    return None
Ejemplo n.º 4
0
 def test_seq1_seq3(self):
     """Verify that seq1 and seq3 agree with each other on a mixed-case input.

     seq3 output is compared after upper-casing because the three-letter
     fixture uses irregular capitalisation.
     """
     mixed_case_3 = "MetAlaTyrtrpcysthrLYSLEUILEGlYPrOGlNaSnaLapRoTyRLySSeRHisTrpLysThr"
     expected_1 = "MAYWCTKLIGPQNAPYKSHWKT"
     expected_3 = mixed_case_3.upper()

     # three-letter -> one-letter
     self.assertEqual(seq1(mixed_case_3), expected_1)
     # one-letter -> three-letter
     self.assertEqual(seq3(expected_1).upper(), expected_3)
     # one -> three -> one
     self.assertEqual(seq1(seq3(expected_1)), expected_1)
     # three -> one -> three
     self.assertEqual(seq3(seq1(mixed_case_3)).upper(), expected_3)
Ejemplo n.º 5
0
def _iter_standard_residues(chain):
    """Yield ``(position, one_letter_code)`` for each usable residue of *chain*."""
    for residue in chain:
        if residue.id[0] != ' ':
            continue  # Filter HETATMs

        position = int(residue.id[1])
        amino_acid = seq1(residue.get_resname())

        # Filter non-standard AAs, required when processing FoldX repaired
        # PDBs as they turn HETATMs into regular ATOMs.
        if amino_acid not in AA_ALPHABET:
            continue

        yield position, amino_acid


def _substitutions(amino_acid, chain_id, position):
    """Return all substitutions of one residue as ``<wt><chain><pos><mut>``."""
    return [
        f"{amino_acid}{chain_id}{position}{x}" for x in AA_ALPHABET
        if x != amino_acid
    ]


def main(args):
    """Print every single amino-acid substitution for a PDB structure.

    ``args.pdb`` is the structure file and ``args.yaml`` is passed to
    ``import_sections`` together with the PDB name (a ``_Repair`` marker from
    FoldX is removed from the name first). When sections are returned, only
    the listed chains are processed, optionally restricted to the inclusive
    ``section['region']`` position range; otherwise all chains of the first
    model are used. Variants are written to stdout, each terminated by
    ``;`` and a newline.
    """
    pdb_parser = PDBParser()

    pdb_name = Path(args.pdb).stem
    # deal with FoldX repaired PDBs
    if pdb_name.endswith('_Repair'):
        pdb_name = pdb_name.replace('_Repair', '')

    structure = pdb_parser.get_structure(pdb_name, args.pdb)

    sections = import_sections(args.yaml, pdb_name)

    variants = []
    if sections is not None:
        for section in sections:
            filter_region = 'region' in section
            chain_id = section['chain']
            for position, amino_acid in _iter_standard_residues(
                    structure[0][chain_id]):
                if (filter_region and (position > section['region'][1]
                                       or position < section['region'][0])):
                    continue
                variants.extend(
                    _substitutions(amino_acid, chain_id, position))
    else:
        for chain in structure[0]:
            for position, amino_acid in _iter_standard_residues(chain):
                variants.extend(
                    _substitutions(amino_acid, chain.id, position))

    print(*variants, sep=';\n', end=';\n', file=sys.stdout)
Ejemplo n.º 6
0
def HGVS_p_to_AA_abrev(HGVS_p):
    """Abbreviate a three-letter HGVS protein change with one-letter codes.

    The text between ``p.(`` and the following ``)`` is read as
    ``<Ref3><position><Alt3>``, optionally followed by a frameshift suffix
    ``fs<Xaa3><length>``. Each three-letter code is converted with ``seq1``.

    Args:
        HGVS_p: string containing a ``p.(...)`` protein description.

    Returns:
        ``<ref><position><alt>``, followed by ``fs<aa><length>`` when a
        frameshift suffix is present.

    Raises:
        IndexError: if ``p.(`` is missing, if the description contains no
            number, or if a frameshift suffix has no second number.
    """
    mut = HGVS_p.split("p.(")[1].split(")")[0]
    numbers = re.findall(r"[+-]?\d+(?:\.\d+)?", mut)
    position = numbers[0]

    # Reference code (3 letters) + position + alternate code (3 letters).
    first_part = mut[:len(position) + 6]
    aa_ref = seq1(first_part[:3])
    aa_alt = seq1(first_part[-3:])

    fs_part = ''
    # Raw string: "\w" is not a valid escape in an ordinary string literal.
    frameshift = re.search(r'fs\w+', mut)
    if frameshift is not None:
        fs_length = numbers[1]
        # The three letters just before the trailing number name the residue
        # reported after the frameshift marker.
        fs_aa = seq1(mut[-(len(fs_length) + 3):-len(fs_length)])
        fs_part = frameshift.group(0)[:2] + fs_aa + fs_length

    return aa_ref + position + aa_alt + fs_part
Ejemplo n.º 7
0
def _extract_translation_exception(translation_exception):
    output = []

    if isinstance(translation_exception, str):
        translation_exception = [translation_exception]

    for t_e in translation_exception:
        pos, aa = t_e.strip("()").split(",")

        if "complement" in pos:
            strand = -1
            pos_start, pos_end = pos.split("(")[1].strip(")").split("..")
        else:
            strand = 1
            pos_start, pos_end = pos.split(":")[1].split("..")
        pos_start = int(pos_start) - 1
        pos_end = int(pos_end)

        if ":" in aa:
            aa = aa.split(":")[1]
        elif "=" in aa:
            aa = aa.split("=")[1]

        output.append({
            "location": make_location(pos_start, pos_end, strand),
            "amino_acid": seq1(aa),
        })
    return output
Ejemplo n.º 8
0
def aa_seq(pdb_file):
    """
    Gets the full sequence of each protein chain from the SEQRES section of a
    PDB file, if present. If it isn't present, returns None.

    The chain id is read from column index 11 of each SEQRES line and the
    space-separated three-letter codes from column index 19 onwards; the
    codes are converted with ``seq1`` and appended per chain.

    :param pdb_file: path of the PDB file to read.
    :return:
        A dictionary mapping each protein chain to its full sequence from the
        pdb file, irrespective of whether or not a residue has a coordinate,
        or None when the file has no SEQRES lines.
    :rtype: defaultdict(str)
    """
    # TODO: Try to get sequence from PDB API if the SEQRES section is missing
    seqs = defaultdict(str)
    found_seqres = False
    with open(pdb_file, 'r') as f:
        for line in f:
            if line[:6] != 'SEQRES':
                continue
            found_seqres = True
            # Strip only the newline, so a final line without one keeps its
            # last residue letter.
            record = line.rstrip('\n')
            chain_id = record[11]
            seq_3letters = record.rstrip(' ')[19:]
            seqs[chain_id] += seq1(''.join(seq_3letters.split(' ')))

    return seqs if found_seqres else None
Ejemplo n.º 9
0
 def __init__(self, chain, index, chainID, resNum):
     """The constructor of a Residue class.

     Args:
         chain: container indexed by residue number; ``chain[resNum]`` must
             provide ``get_resname()`` and atom lookup by name.
         index: stored unchanged as ``self.index``.
         chainID: stored unchanged as ``self.chainID``.
         resNum: key of the residue inside ``chain``.

     The CA position is stored as a vector; C, O and N are stored as atom
     objects. ``self.H`` is only set when the residue has an ``H`` atom.
     """
     residue = chain[resNum]

     self.index = index
     self.chainID = chainID
     self.resNum = resNum
     self.resName = seq1(residue.get_resname())
     self.structure = NONE  # 'H', 'B', 'E', 'G', 'I', 'T', 'S' or ' '
     self.nturns = {3: Nturn(), 4: Nturn(), 5: Nturn()}
     self.bend = NONE
     self.chirality = NONE
     self.bridge_1 = NONE
     self.bridge_2 = NONE
     self.bp1 = 0
     self.bp2 = 0
     self.sheet = NONE
     self.tco = 0
     self.kappa = 360
     self.alpha = 360
     self.phi = 360
     self.psi = 360
     self.CA = residue['CA'].get_vector()
     self.C = residue['C']
     self.O = residue['O']
     self.N = residue['N']
     try:
         self.H = residue['H']
     except KeyError:
         # No hydrogen on this residue: the attribute is deliberately left
         # unset, as before.
         # NOTE(review): assumes atom lookup raises KeyError, as Bio.PDB
         # entities do -- confirm for the chain type used here.
         pass
Ejemplo n.º 10
0
def struct_to_seq(structure, chains=None):
    """Return one-letter sequences for the chains of the first model.

    Args:
        structure: object whose ``child_list`` holds models, each with a
            ``child_list`` of chains.
        chains: optional iterable of chain ids to look up in the first model;
            all chains of the first model are used when None.

    Returns:
        dict mapping chain id to its sequence. Residues with a blank hetero
        flag are converted with ``seq1``; other residues contribute only if
        their name is in ``allowed_het_res``. Chains in which every residue
        has a non-blank hetero flag are left out.

    Raises:
        PDBAlignError: if the structure has no models or the first model has
            no chains.
    """
    if not structure.child_list:
        raise PDBAlignError("No models in %s" % structure)
    first_model = structure.child_list[0]
    if not first_model.child_list:
        raise PDBAlignError("No chains in %s" % structure)

    if chains is None:
        selected = first_model.child_list
    else:
        selected = [first_model[chain_id] for chain_id in chains]

    sequences = {}
    for chain in selected:
        # Don't include all-het chains
        if all(residue.get_id()[0].strip() for residue in chain.child_list):
            continue

        letters = []
        for residue in chain.child_list:
            if residue.get_id()[0] == " ":
                letters.append(seq1(residue.resname))
                continue
            het_letter = allowed_het_res.get(residue.resname)
            if het_letter is not None:
                letters.append(het_letter)
        sequences[chain.id] = "".join(letters)
    return sequences
Ejemplo n.º 11
0
def write_backbone_angles(chain,
                          region=None,
                          offset=0,
                          outfile=sys.stdout,
                          header=False):
    """
    Write the Phi/Psi backbone angles of a chain as tab-separated rows.

    chain: chain passed to PPBuilder.build_peptides
    region: (first, last) inclusive residue positions to report; all
            positions when None
    offset: added to the position to produce the fourth column
    outfile: file object the rows are printed to
    header: print HEADER before the rows when true

    Each row holds: chain id, position, one-letter residue, position + offset,
    phi and psi. Angles are multiplied by RAD_FACTOR; a missing angle is
    written as 'NA'.
    """

    def _scaled(angle):
        return 'NA' if angle is None else angle * RAD_FACTOR

    if region is None:
        region = (0, float('inf'))

    polypeptides = PPBuilder().build_peptides(chain)

    if header:
        print(HEADER, file=outfile)

    for peptide in polypeptides:
        for residue, (phi, psi) in zip(peptide, peptide.get_phi_psi_list()):
            position = residue.get_id()[1]
            if not region[0] <= position <= region[1]:
                continue
            row = (chain.id, position, seq1(residue.get_resname()),
                   position + offset, _scaled(phi), _scaled(psi))
            print(*row, sep='\t', file=outfile)
Ejemplo n.º 12
0
def retrieveAtomicStructureMapping(pdb_sequence,
                                   translation_to_structure_mapping):
    """Map aligned-sequence columns to the seq ids of the measured structure.

    The non-gap symbols of
    ``translation_to_structure_mapping['secondary_sequence']`` are paired, in
    order, with the sorted keys of ``retrieveAtomicStructure(pdb_sequence)``.

    Returns:
        dict from column index in the aligned sequence to structure seq id.

    Raises:
        AtomicSequenceIDMappingFailedException: if a residue of the measured
            structure (converted with ``seq1``) differs from the aligned
            symbol it is paired with.
    """
    measured_structure = retrieveAtomicStructure(pdb_sequence)
    ordered_seq_ids = sorted(measured_structure.keys())

    # aligned_sequence to atomic structure mapping
    atomic_structure_mapping = {}

    aligned = translation_to_structure_mapping['secondary_sequence']
    cursor = 0  # next unused entry of ordered_seq_ids
    for column, symbol in enumerate(aligned):
        if symbol == '-':
            continue

        seq_id = ordered_seq_ids[cursor]
        if seq1(measured_structure[seq_id]) != symbol:
            raise AtomicSequenceIDMappingFailedException(
                "Alternative mapping for atomic sequence to atomic seqids failed for pdb structure "
                + pdb_sequence['pdb_id'] + ", chain " +
                pdb_sequence['chain_id'])
        atomic_structure_mapping[column] = seq_id
        cursor += 1

    return atomic_structure_mapping
Ejemplo n.º 13
0
    def get_sequence( self, chain_id ):
        '''
            Input:
                self: uses the Biopython.PDB structure stored in self.structure
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                The amino acid sequence (single-letter alphabet) of the given chain
                in the first model, as a string. Only residues whose hetero field
                starts with a blank are included.
        '''
        chain = self.structure[0][chain_id]
        # Materialise the chain first, then keep the non-hetero residues.
        standard_residues = [
            residue for residue in list(chain)
            if residue.get_id()[0][0] == " "
        ]
        return ''.join(seq1(residue.get_resname())
                       for residue in standard_residues)
Ejemplo n.º 14
0
    def create_sequence(self, pdb_code, pdb_path):
        """Store one Bioentry/Biosequence per chain of the first model.

        For every chain, the standard amino acids (``is_aa(..., standard=True)``)
        are converted to a one-letter sequence. The entry identifier is
        ``<pdb_code>_<chain id>_<first residue number>_<last residue number>``;
        nothing is saved when an entry with that identifier already exists in
        ``self.biodb`` or when the chain has no standard amino acids.

        Args:
            pdb_code: code used to fetch the PDB record and to name the entries.
            pdb_path: path of the structure file to parse.
        """
        pdb = PDB.objects.get(code=pdb_code)

        parser = PDBParser(PERMISSIVE=1, QUIET=1)
        struct = parser.get_structure(pdb_code, pdb_path)

        for chain in struct[0].get_chains():
            # Residues with alternate locations are kept as well; filtering
            # them out was tried and left disabled.
            residues = [
                residue for residue in chain.get_residues()
                if is_aa(residue, standard=True)
            ]
            if not residues:
                continue

            seq = "".join(seq1(residue.resname) for residue in residues)
            first_number = str(residues[0].id[1])
            last_number = str(residues[-1].id[1])
            seqid = "_".join([pdb_code, chain.id, first_number, last_number])

            already_stored = Bioentry.objects.filter(
                biodatabase=self.biodb, identifier=seqid).exists()
            if already_stored:
                continue

            be = Bioentry(biodatabase=self.biodb,
                          accession=seqid,
                          identifier=seqid,
                          name=pdb.code)
            be.save()
            Biosequence(bioentry=be, seq=seq, length=len(seq)).save()
Ejemplo n.º 15
0
def struct_to_seq(structure, chains=None):
    """Build a ``{chain id: one-letter sequence}`` dict from the first model.

    Args:
        structure: object whose ``child_list`` holds models; each model has a
            ``child_list`` of chains and supports lookup by chain id.
        chains: optional iterable of chain ids; defaults to every chain of
            the first model.

    Returns:
        dict with one entry per processed chain. Residues with a blank hetero
        flag go through ``seq1``; hetero residues are included only when
        ``allowed_het_res`` has a letter for their name. A chain is skipped
        when all of its residues carry a non-blank hetero flag.

    Raises:
        PDBAlignError: when there are no models, or no chains in the first
            model.
    """
    models = structure.child_list
    if not models:
        raise PDBAlignError("No models in %s" % structure)
    model = models[0]
    if not model.child_list:
        raise PDBAlignError("No chains in %s" % structure)

    chain_list = (model.child_list
                  if chains is None else [model[ch] for ch in chains])

    atom_seq_dict = {}
    for ch in chain_list:
        # Don't include all-het chains
        only_het = all(res.get_id()[0].strip() for res in ch.child_list)
        if only_het:
            continue

        codes = []
        for res in ch.child_list:
            is_standard = res.get_id()[0] == " "
            code = (seq1(res.resname)
                    if is_standard else allowed_het_res.get(res.resname))
            # Standard residues are always kept; het residues only when
            # they have an allowed one-letter code.
            if is_standard or code is not None:
                codes.append(code)
        atom_seq_dict[ch.id] = "".join(codes)
    return atom_seq_dict
Ejemplo n.º 16
0
def distance_to_nearest(residues, distance_matrix=None):
    """
    Yields chemical environments parameterised as the distance to the nearest residue
    of each type. Hetero atoms are included so must be dropped separately if desired.

    residues: list of residues to consider
    distance_matrix: numpy matrix of distances between residues, with rows/columns in
                     that order. Calculated if not supplied

    yields: chemical environment profiles (np.array), one per residue, with one
            entry per letter of protein_alphabet.letters; np.inf where no other
            residue of that type exists
    """

    if distance_matrix is None:
        distance_matrix = residue_distance_matrix(residues)

    # Convert each residue name once rather than once per alphabet letter.
    one_letter_codes = [seq1(r.get_resname()) for r in residues]
    residue_indices = [
        np.array([code == aa for code in one_letter_codes])
        for aa in protein_alphabet.letters
    ]

    for res_index in range(len(residues)):
        dists = distance_matrix[res_index, ]

        # A residue must not count as its own nearest neighbour.
        non_self = np.ones_like(dists, dtype=bool)
        non_self[res_index] = False

        profile = []
        for is_type in residue_indices:
            candidates = is_type & non_self
            profile.append(
                min(dists[candidates]) if any(candidates) else np.inf)
        yield np.array(profile)
Ejemplo n.º 17
0
def k_nearest_residues(residues, k=10, distance_matrix=None):
    """
    Yields chemical environments parameterised by the make up of the k nearest AAs.
    Hetero atoms are included so must be dropped separately if desired.

    residues: list of residues to consider
    k: count the k nearest residues
    distance_matrix: numpy matrix of distances between residues, with rows/columns in
                     that order. Calculated if not supplied

    yields: chemical environment profiles (np.array), one per residue, counting
            the k nearest other residues per letter of protein_alphabet.letters

    raises: ValueError if k >= number of residues
    """
    if k >= len(residues):
        raise ValueError('k >= number of residues')

    if distance_matrix is None:
        distance_matrix = residue_distance_matrix(residues)

    for res_index in range(len(residues)):
        dists = distance_matrix[res_index, ]

        non_self = np.ones_like(dists, dtype=bool)
        non_self[res_index] = False

        # Keep the original residue indices of the candidates: positions in
        # the filtered distance array are shifted by one after res_index and
        # must not be used to index `residues` directly.
        candidates = np.flatnonzero(non_self)
        if k < len(candidates):
            nearest = candidates[np.argpartition(dists[candidates], k)[:k]]
        else:
            # k == len(residues) - 1: every other residue is a neighbour
            # (argpartition would reject kth == array length).
            nearest = candidates

        counts = defaultdict(int)
        for i in nearest:
            counts[seq1(residues[i].get_resname())] += 1

        yield np.array([counts[aa] for aa in protein_alphabet.letters])
Ejemplo n.º 18
0
def within_distance(residues, max_dist=10, distance_matrix=None):
    """
    Yields chemical environments parameterised as the residues within max_dist
    angstroms. Hetero atoms are included so must be dropped separately if desired.

    residues: list of residues to consider
    max_dist: maximum distance to count within (in Angstroms); the comparison
              is strict (distance < max_dist)
    distance_matrix: numpy matrix of distances between residues, with rows/columns in
                     that order. Calculated if not supplied

    yields: chemical environment profiles (np.array), one per residue, counting
            the other residues in range per letter of protein_alphabet.letters
    """

    if distance_matrix is None:
        distance_matrix = residue_distance_matrix(residues)

    for res_index in range(len(residues)):
        dists = distance_matrix[res_index, ]

        counts = defaultdict(lambda: 0)
        for neighbour_index in np.argwhere(dists < max_dist)[:, 0]:
            # The residue itself is always in range; leave it out.
            if neighbour_index == res_index:
                continue
            neighbour = residues[neighbour_index]
            counts[seq1(neighbour.get_resname())] += 1

        yield np.array([counts[aa] for aa in protein_alphabet.letters])
Ejemplo n.º 19
0
def read_in_experiment():
    """
    Read in raw data from Nisthal_2019.xlsx, making sure
    it matches FoldX read in

    Returns:
        array-like
            56 x 20 array of negated ddG values, indexed by residue number
            minus one and by the position of the amino acid in the one-letter
            conversion of `aminoacids`. Entries without usable data stay NaN.
    """

    AAs = [seq1(aa_3) for aa_3 in aminoacids]

    data_file = 'Nisthal_2019.xlsx'
    df2 = pd.read_excel(data_file)
    ddGs = np.full((56, 20), np.nan)  #match Tokuriki dataset read in
    for res, ddG in zip(df2['MUT_LBL'], df2['ddG(mAvg)_mean']):
        res_num, AA = parse(res)
        try:
            ddG = float(ddG)
            if ddG != -4:  #Nisthal includes the black squares in fig 2 as -4 kcal/mol
                ddGs[res_num - 1][AAs.index(AA)] = -ddG
        except (ValueError, TypeError, IndexError):
            # Best effort: skip rows with a non-numeric ddG, an unknown amino
            # acid or an out-of-range residue number. Narrower than a bare
            # except so interrupts and real bugs are no longer swallowed.
            continue
    return ddGs
def get_info(pdb_file, fasta_file=None, verbose=True):
    """Gather CDR indices, chain sequences and the distance/angle matrix.

    Args:
        pdb_file: structure file passed to the PDB parser and the helpers.
        fasta_file: optional file passed to ``get_chain_seqs``; when None the
            sequences are taken from the residues of ``pdb_file``.
        verbose: print a warning when falling back to the PDB file.

    Returns:
        The dict returned by ``get_cdr_indices(pdb_file)``, updated in place
        with the chain sequences and the keys ``dist_angle_mat`` and ``id``.

    Raises:
        ValueError: when sequences are read from the PDB file and a chain id
            is neither ``'H'`` nor ``'L'``.
    """
    if fasta_file is None:
        if verbose:
            print(
                'WARNING: No fasta file given to get_info(), getting sequence '
                'from PDB file')
        chain_seqs = {}
        structure = PDBParser().get_structure(get_id(pdb_file), pdb_file)
        for chain in structure.get_chains():
            chain_id = chain.id
            one_letter = seq1(''.join(res.resname for res in chain))
            if chain_id not in ('H', 'L'):
                template = (
                    "Expected a heavy chain or light chain, marked as 'H' "
                    " or 'L'. Got a chain id of :{} from protein {}")
                raise ValueError(template.format(chain_id, get_id(pdb_file)))

            chain_seqs[chain_id] = letter_to_num(one_letter, _aa_dict)
    else:
        chain_seqs = get_chain_seqs(fasta_file)

    protein_id = get_id(pdb_file)
    info = get_cdr_indices(pdb_file)
    dist_angle_mat = protein_dist_angle_matrix(pdb_file)

    info.update(chain_seqs)
    info.update(dict(dist_angle_mat=dist_angle_mat, id=protein_id))
    return info
Ejemplo n.º 21
0
def align_nodes(foldx_nodes_features, pdb_list, seq, pdb_chain):
    """Build the float32 node-feature matrix for the residues in ``pdb_list``.

    Rows of ``foldx_nodes_features`` whose ``pdb_seq_num`` is in ``pdb_list``
    are selected. Their three-letter codes must agree with ``seq`` (positions
    that convert to ``X`` are not compared). The selected feature columns are
    extended with ``phobos`` and ``radius`` values looked up per letter of
    ``seq``.

    Args:
        foldx_nodes_features: DataFrame with ``pdb_seq_num``, ``three_letter``
            and the feature columns listed below.
        pdb_list: residue numbers to keep.
        seq: expected one-letter sequence of the kept residues.
        pdb_chain: not used by this function.

    Returns:
        numpy float32 array with one row per selected residue.

    Raises:
        AssertionError: if the feature count does not match
            ``hparams['in_dim_n']`` or the sequences disagree.
    """
    # selected features
    feats = [
        'phi', 'psi', 'total', 'backHbond', 'sideHbond', 'energy_VdW',
        'electro', 'energy_SolvP', 'energy_SolvH', 'energy_vdwclash',
        'entrop_sc', 'entrop_mc', 'cis_bond', 'energy_torsion',
        'backbone_vdwclash', 'energy_dipole', 'Sidechain Contact Ratio',
        'Mainchain Contact Ratio'
    ]

    # Two more columns (phobos, radius) are appended below.
    assert len(feats) + 2 == hparams['in_dim_n']

    selected_rows = foldx_nodes_features.pdb_seq_num.isin(pdb_list)

    # check sequence
    three_letter_codes = foldx_nodes_features.loc[
        selected_rows].three_letter.tolist()
    tmp_seq = "".join(seq1(code.title()) for code in three_letter_codes)
    assert len(seq) == len(tmp_seq)
    for expected, observed in zip(seq, tmp_seq):
        if observed != "X":
            assert expected == observed

    # add foldx features
    nodes_feats = foldx_nodes_features.loc[selected_rows, feats].copy()

    # add more features
    nodes_feats['phobos'] = [phobos[aa] for aa in seq]
    nodes_feats['radius'] = [radius[aa] for aa in seq]

    return nodes_feats.values.astype(np.float32)
Ejemplo n.º 22
0
def read_naccess_rsa(model):
    """
    Import a Naccess output RSA table, based on a model row imported by parse_model_table

    The file ``<uniprot>_<name>_<model>.rsa`` is parsed with process_rsa_data.
    The result is restricted to the comma-separated positions listed in
    ``model.positions`` and returned with the wild-type residue as a
    one-letter code.
    """
    rsa_path = f'{model.uniprot}_{model.name}_{model.model}.rsa'
    with open(rsa_path, 'r') as rsa_file:
        rsa_data = process_rsa_data(rsa_file)

    column_names = {
        'level_0': 'chain',
        'level_1': 'position',
        'res_name': 'wt'
    }
    df = pd.DataFrame.from_dict(rsa_data, orient='index').reset_index()
    df = df.rename(columns=column_names)

    # Each position entry is a sequence; its second element is kept.
    df['position'] = [entry[1] for entry in df['position']]
    df['uniprot'] = model.uniprot
    df['name'] = model.name
    df['template'] = model.template
    df['wt'] = [seq1(resname) for resname in df['wt']]

    positions = [int(text) for text in model.positions.split(',')]
    df = df[df['position'].isin(positions)]

    output_columns = [
        'uniprot', 'name', 'position', 'wt', 'template', 'chain',
        'all_atoms_abs', 'all_atoms_rel', 'side_chain_abs', 'side_chain_rel',
        'main_chain_abs', 'main_chain_rel', 'non_polar_abs', 'non_polar_rel',
        'all_polar_abs', 'all_polar_rel'
    ]
    return df[output_columns]
Ejemplo n.º 23
0
    def add_struc_path(self, struc_path):
        """Load a single-chain structure file and derive its sequence.

        Sets ``self.struc_path``, ``self.seq_ix_mapping`` (1-based sequence
        index -> residue number from the file) and ``self.struc_seq`` (a
        SeqRecord). Residues with an insertion code, hetero residues (``H_``)
        and waters (``W``) are skipped.

        Args:
            struc_path: path to a ``.pdb`` or ``.cif`` file.

        Raises:
            IOError: for an unsupported extension, a file without exactly one
                chain, or a chain with no usable residues.
        """
        from Bio.SeqRecord import SeqRecord
        from Bio.Seq import Seq
        from Bio.PDB import PDBParser, MMCIFParser
        from Bio.SeqUtils import seq1

        self.struc_path = struc_path
        extension = ntpath.splitext(self.struc_path)[1]
        if extension == ".pdb":
            parser = PDBParser()
        elif extension == ".cif":
            parser = MMCIFParser()
        else:
            raise IOError(
                "Unrecognized structure file type! Please use .pdb or .cif files!"
            )

        structure = parser.get_structure("none", self.struc_path)
        chains = list(structure.get_chains())
        if len(chains) != 1:
            raise IOError(
                "When using structure files, they need to have a single chain!"
            )

        resnames = []
        seq_ix_mapping = dict()
        for resi in chains[0].get_residues():
            resi_id = resi.get_id()
            if not resi_id[2].startswith(' '):
                continue  # residue carries an insertion code
            if resi_id[0].startswith('H_'):
                continue  # hetero residue
            if resi_id[0].startswith('W'):
                continue  # water
            resnames.append(resi.get_resname().replace(' ', ''))
            seq_ix_mapping[len(resnames)] = int(resi_id[1])

        if not resnames:
            raise IOError("No usable residues found in the structure file!")

        sequence = ''.join(resnames)
        # Probe the first kept residue itself. The previous code indexed the
        # full residue list with that residue's file number, which can pick
        # an unrelated residue or run past the end of the list.
        # NOTE(review): seq1 works on three-character chunks, so shorter
        # (e.g. nucleotide) names presumably convert to '' and the joined
        # names are then kept as they are -- confirm this is the intent.
        if len(seq1(resnames[0])) != 0:
            sequence = seq1(sequence)
        self.seq_ix_mapping = seq_ix_mapping
        self.struc_seq = SeqRecord(Seq(sequence))
Ejemplo n.º 24
0
def _adjust_aa_seq(fraglist):
    """Transforms three-letter amino acid codes into one-letters in the
    given HSPFragments."""
    hsp_hstart = fraglist[0].hit_start
    hsp_qstart = fraglist[0].query_start
    for frag in fraglist:
        assert frag.query_strand == 0 or frag.hit_strand == 0
        # fragment should have a length that is a multiple of 3
        assert len(frag) % 3 == 0
        # hit step may be -1 as we're aligning to DNA
        hstep = 1 if frag.hit_strand >= 0 else -1
        # get one letter codes
        # and replace gap codon markers and termination characters
        custom_map = {'***': '*', '<->': '-'}

        hseq1 = seq1(str(frag.hit.seq), custom_map=custom_map)
        hstart = hsp_hstart
        hend = hstart + len(hseq1.replace('-', '')) * hstep

        qseq1 = seq1(str(frag.query.seq), custom_map=custom_map)
        qstart = hsp_qstart
        qend = qstart + len(qseq1.replace('-', ''))

        # replace the old frag sequences with the new ones
        frag.hit = None
        frag.query = None
        frag.hit = hseq1
        frag.query = qseq1

        # set coordinates for the protein sequence
        if frag.query_strand == 0:
            frag.query_start, frag.query_end = qstart, qend
        elif frag.hit_strand == 0:
            frag.hit_start, frag.hit_end = hstart, hend

        # update alignment annotation
        # by turning them into list of triplets
        for annot, annotseq in frag.aln_annotation.items():
            frag.aln_annotation[annot] = _make_triplets(annotseq)

        # update values for next iteration
        hsp_hstart, hsp_qstart = hend, qend

    return fraglist
Ejemplo n.º 25
0
def _adjust_aa_seq(fraglist):
    """Transforms three-letter amino acid codes into one-letters in the
    given HSPFragments."""
    hsp_hstart = fraglist[0].hit_start
    hsp_qstart = fraglist[0].query_start
    for frag in fraglist:
        assert frag.query_strand == 0 or frag.hit_strand == 0
        # fragment should have a length that is a multiple of 3
        assert len(frag) % 3 == 0
        # hit step may be -1 as we're aligning to DNA
        hstep = 1 if frag.hit_strand >= 0 else -1
        # get one letter codes
        # and replace gap codon markers and termination characters
        custom_map = {"***": "*", "<->": "-"}

        hseq1 = seq1(str(frag.hit.seq), custom_map=custom_map)
        hstart = hsp_hstart
        hend = hstart + len(hseq1.replace("-", "")) * hstep

        qseq1 = seq1(str(frag.query.seq), custom_map=custom_map)
        qstart = hsp_qstart
        qend = qstart + len(qseq1.replace("-", ""))

        # replace the old frag sequences with the new ones
        frag.hit = None
        frag.query = None
        frag.hit = hseq1
        frag.query = qseq1

        # set coordinates for the protein sequence
        if frag.query_strand == 0:
            frag.query_start, frag.query_end = qstart, qend
        elif frag.hit_strand == 0:
            frag.hit_start, frag.hit_end = hstart, hend

        # update alignment annotation
        # by turning them into list of triplets
        for annot, annotseq in frag.aln_annotation.items():
            frag.aln_annotation[annot] = _make_triplets(annotseq)

        # update values for next iteration
        hsp_hstart, hsp_qstart = hend, qend

    return fraglist
Ejemplo n.º 26
0
    def assesment(pdb_path, fasta_path=None, output_dir=None,
                  accpro_path=None, psipred_path=None,
                  cpus=multiprocessing.cpu_count()):
        """Run the model quality assessment on a PDB file and collect scores.

        Args:
            pdb_path: structure file to assess.
            fasta_path: sequence file for the predictors; when missing and a
                predictor is requested, ``seq.fasta`` is written to
                ``output_dir`` from the residues of the structure.
            output_dir: working directory; a temporary one is created when
                not given (it is not removed afterwards).
            accpro_path: if set, ``QMean.accpro`` is run and passed on.
            psipred_path: if set, ``QMean.psipred`` is run and passed on.
            cpus: forwarded to ``QMean.psipred``.

        Returns:
            dict with ``<score>_norm`` and ``<score>_zscore`` entries for the
            global scores and a ``"residues"`` dict keyed by
            ``<col0>_<col2>_<col3>`` of the local score table, holding the
            columns from index 4 onwards.
        """
        if not output_dir:
            output_dir = tempfile.mkdtemp(suffix="_qmean")

        data = dict()
        needs_generated_fasta = not fasta_path and (accpro_path
                                                    or psipred_path)
        if needs_generated_fasta:
            fasta_path = output_dir + "/seq.fasta"

            parser = PDBParser(PERMISSIVE=True, QUIET=True)
            # every residue is converted, hetero residues included
            structure_residues = parser.get_structure(
                "x", pdb_path).get_residues()
            seq = "".join(
                [seq1(residue.resname) for residue in structure_residues])
            with open(fasta_path, "w") as h:
                h.write(">seq\n")
                h.write(seq)

            data["seq"] = seq

        psipred_handler = None
        accpro_handler = None

        if accpro_path:
            data["acc"] = QMean.accpro(fasta_path, output_dir, accpro_path)
            accpro_handler = ACCPROHandler(data)

        if psipred_path:
            data["ss"], data["conf"] = QMean.psipred(fasta_path, output_dir,
                                                     psipred_path, cpus)
            psipred_handler = PSIPREDHandler(data)

        pdb = LoadPDB(pdb_path)

        # Pass along only the handlers that are available.
        handler_kwargs = {}
        if psipred_handler:
            handler_kwargs["psipred"] = psipred_handler
        if accpro_handler:
            handler_kwargs["accpro"] = accpro_handler
        assessment = AssessModelQuality(pdb, output_dir=output_dir,
                                        **handler_kwargs)

        result = {}
        for score in assessment[0].all_scores:
            result[score.name + "_norm"] = score.norm
            result[score.name + "_zscore"] = score.z_score

        score_table = assessment[1].score_table
        result["residues"] = {}
        for row in score_table.rows:
            residue_key = row[0] + "_" + str(row[2]) + "_" + str(row[3])
            result["residues"][residue_key] = {
                column: row[i]
                for i, column in enumerate(score_table.col_names[4:], 4)
            }
        return result
Ejemplo n.º 27
0
    def strucToSeq(chain: Bio.PDB.Entity.Entity) -> str:
        '''
        Returns the sequence of a structure entity as 1-letter AA codes.

        Every residue from chain.get_residues() is converted with seq1, in
        order, and the resulting letters are concatenated.
        '''
        residues = list(chain.get_residues())  # residue list from the structure
        one_letter_codes = [seq1(residue.get_resname()) for residue in residues]
        return "".join(one_letter_codes)
Ejemplo n.º 28
0
 def get_sequence(self, chain_id):
     """
         Input:
             self: provides get_residues(chain_id) for the stored structure
             chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                     depends on the specific protein and the resulting structure)
         Return:
             The amino acid sequence (single-letter alphabet) of the given chain
             as a string, built from every residue get_residues returns.
     """
     letters = []
     for residue in self.get_residues(chain_id):
         letters.append(seq1(residue.get_resname()))
     return ''.join(letters)
Ejemplo n.º 29
0
 def seq(self, selected_chain=None, standard_aa=True):
     """Return one SeqRecord per chain with its one-letter sequence.

     Args:
         selected_chain: if given, only the chain with this id is used.
         standard_aa: forwarded to ``is_aa`` as ``standard``.

     Returns:
         list of SeqRecord objects with id
         ``<code>_<chain id>_<first residue number>_<last residue number>``;
         chains without matching residues produce no record.
     """
     records = []
     for chain in self.struct.get_chains():
         if selected_chain and selected_chain != chain.id:
             continue

         residues = [
             residue for residue in chain.get_residues()
             if is_aa(residue, standard=standard_aa)
         ]
         if not residues:
             continue

         sequence = "".join(seq1(residue.resname) for residue in residues)
         first_number = str(residues[0].id[1])
         last_number = str(residues[-1].id[1])
         record_id = "_".join([self.code, chain.id, first_number, last_number])
         records.append(
             SeqRecord(id=record_id, description="", seq=Seq(sequence)))
     return records
Ejemplo n.º 30
0
def retrieveAtomicStructureSequence(pdb_sequence):
    """Return the one-letter sequence of the measured structure of a PDB entry.

    The mapping returned by ``retrieveAtomicStructure(pdb_sequence)`` is walked
    from its smallest to its largest key. Keys that are present contribute the
    ``seq1`` conversion of their value; missing keys contribute ``"-"``.
    """
    measured_structure = retrieveAtomicStructure(pdb_sequence)

    positions = list(measured_structure.keys())
    first, last = np.min(positions), np.max(positions)
    known_positions = measured_structure.keys()

    symbols = [
        seq1(measured_structure[position]) if position in known_positions else "-"
        for position in range(first, last + 1)
    ]
    return "".join(symbols)
Ejemplo n.º 31
0
def extract_seq_from_pdb(pedxxxx, ensemble, confomer, chain):
    """Return the one-letter sequence of the non-hetero residues of a PDB file.

    The file name is built as ``<pedxxxx>_<ensemble>-<confomer>_<chain>.pdb``
    and parsed with ``PDBParser``. Residues whose hetero flag (``id[0]``) is a
    blank are kept; their residue names are concatenated and converted with
    ``seq1``.

    Args:
        pedxxxx: first component of the file name.
        ensemble: second component of the file name.
        confomer: third component of the file name. The parameter keeps its
            original spelling so existing keyword callers still work.
        chain: last component of the file name.

    Returns:
        The one-letter sequence as a string.
    """
    # The original body referenced an undefined name ``conformer`` here, which
    # raised NameError; the parameter is actually called ``confomer``.
    pdb = "%s_%s-%s_%s.pdb" % (pedxxxx, ensemble, confomer, chain)
    structure = PDBParser().get_structure("pdb", pdb)

    resnames = [
        residue.get_resname()
        for residue in structure.get_residues()
        if residue.id[0] == " "
    ]
    # NOTE(review): seq1 reads the joined string three characters at a time, so
    # this assumes every kept residue name is exactly three characters long.
    return seq1("".join(resnames))
Ejemplo n.º 32
0
def CB_coords(pdb_file, include_masks=False):
    """
    Collect one representative coordinate per residue for every chain.

    For each residue the C-beta atom is used; if it is absent the C-alpha atom
    is used instead; if both are absent the coordinate is [0, 0, 0]. Residues
    whose name ``seq1`` converts to 'X' are ignored.

    The per-chain masks come from ``mask_aa_coords(pdb_file)``; each chain gets
    a zero-filled (len(mask), 3) array and the collected coordinates are
    written to the positions selected by the mask (presumably a boolean array
    marking residues that have coordinates -- confirm against mask_aa_coords).

    Returns the ``coords`` dictionary keyed by chain id, or the tuple
    ``(coords, masks)`` when ``include_masks`` is true. Returns None, after a
    warning, if a chain's coordinate count does not match the number of
    positions selected by its mask. Chains missing from the masks are skipped
    with a warning.
    """
    def pick_atom_coord(res):
        # Try the preferred atom first, then the fallback.
        for atom_name in ('CB', 'CA'):
            if atom_name in res:
                return res[atom_name].get_coord()
        return [0, 0, 0]

    parser = PDBParser()
    structure = parser.get_structure(filename_no_extension(pdb_file), pdb_file)

    coords = {}
    masks = mask_aa_coords(pdb_file)
    for chain in structure.get_chains():
        chain_id = chain.get_id()

        if chain_id not in masks:
            msg = (
                'WARNING: Chain ID mismatch between the full sequences and the '
                'sequences derived from coordinates in the {} file. Chain {} in '
                'the full sequence is not in the sequences derived from '
                'coordinates. Skipping chain {}')
            warnings.warn(msg.format(pdb_file, chain_id, chain_id))
            continue

        chain_mask = masks[chain_id]
        chain_array = np.zeros((len(chain_mask), 3))
        coords[chain_id] = chain_array

        found = [
            pick_atom_coord(res) for res in chain.get_residues()
            if seq1(res.get_resname()) != 'X'
        ]
        if not found:
            # Nothing to place; the chain keeps its all-zero array.
            continue

        if len(found) != len(chain_array[chain_mask]):
            msg = ('WARNING: In {}, chain {} the mask is not equal to '
                   'the number of coordinates. Returning None')
            warnings.warn(msg.format(pdb_file, chain_id))
            return None
        chain_array[chain_mask] = found

    if include_masks:
        return coords, masks
    return coords
Ejemplo n.º 33
0
    def records_from_pdb(self, pdb, pdb_file_path, standard_aa=True, selected_chain=None):
        """Return one SeqRecord per chain of the first model of a PDB file.

        Args:
            pdb: structure identifier; passed to the parser and used as the
                first component of every record id.
            pdb_file_path: file handed to ``PDBParser.get_structure``.
            standard_aa: forwarded to ``is_aa`` as its ``standard`` argument.
            selected_chain: when truthy, only the chain with this id is
                processed; otherwise every chain is.

        Returns:
            List of SeqRecord objects whose id is
            ``<pdb>_<chain id>_<start>_<end>``; start and end are ``id[1]`` of
            the first and last residue kept by ``is_aa``. Chains without any
            kept residue produce no record.
        """
        parser = PDBParser(PERMISSIVE=1, QUIET=1)
        first_model = parser.get_structure(pdb, pdb_file_path)[0]

        collected = []
        for chain in first_model.get_chains():
            # Skip chains the caller did not ask for.
            if selected_chain and selected_chain != chain.id:
                continue
            amino_acids = [res for res in chain.get_residues() if is_aa(res, standard=standard_aa)]
            if not amino_acids:
                continue
            letters = "".join(seq1(res.resname) for res in amino_acids)
            first_number = str(amino_acids[0].id[1])
            last_number = str(amino_acids[-1].id[1])
            record_id = "_".join([pdb, chain.id, first_number, last_number])
            collected.append(SeqRecord(id=record_id, description="", seq=Seq(letters)))
        return collected
Ejemplo n.º 34
0
    def write_chain(self, key):
        """Write one chain to ``self.buf`` as a header line plus wrapped sequence.

        The header has the form ``>code:key|PDBID|CHAIN|SEQUENCE``. The
        residues of ``self.chains[key]`` are converted with ``seq1`` and
        printed in groups of ``LINE_LENGTH`` residues per line.
        """
        # The header goes out first, before the chain is even looked up.
        print(">{0}:{1}|PDBID|CHAIN|SEQUENCE".format(self.code, key), file=self.buf)

        # One seq1() result per residue, in iteration order.
        letters = [seq1(residue.resname) for residue in self.chains[key].get_residues()]

        # Emit the sequence wrapped at LINE_LENGTH entries per line.
        for begin in range(0, len(letters), LINE_LENGTH):
            chunk = letters[begin:begin + LINE_LENGTH]
            print("".join(chunk), file=self.buf)
Ejemplo n.º 35
0
def get_pdb_chain_partial_seq(model, chain, _1lc, with_h2o):
    """Collect the residues of one chain, optionally dropping water.

    Args:
        model: object indexable by chain identifier (``model[chain]``).
        chain: identifier of the chain to read.
        _1lc: when exactly ``True``, each kept residue's ``resname`` is
            replaced by its ``seq1`` conversion.
        with_h2o: when exactly ``False``, residues named ``"HOH"`` are skipped.

    Returns:
        ``[chain_seq, chain_seq_raw]`` where ``chain_seq`` is the list of kept
        residue objects and ``chain_seq_raw`` the list of their ``resname``
        values after any conversion.

    Note:
        With ``_1lc=True`` the residue objects are modified in place: their
        ``resname`` attribute is overwritten with the one-letter code.
    """
    chain = model[chain]
    chain_seq = []
    chain_seq_raw = []

    for residue in chain.get_residues():
        residue_name = residue.get_resname()

        # Identity comparisons are kept on purpose: only the literal booleans
        # toggle these options, exactly as before.
        if with_h2o is False and residue_name == "HOH":
            continue

        # Previously this conversion was nested under the water filter and was
        # therefore skipped whenever water molecules were kept.
        # See http://biopython.org/DIST/docs/api/Bio.SeqUtils-module.html#seq1
        if _1lc is True:
            residue.resname = seq1(residue_name)

        chain_seq.append(residue)
        chain_seq_raw.append(residue.resname)
    return [chain_seq, chain_seq_raw]
from Bio.SeqUtils.CodonUsage import SynonymousCodons
from Bio.SeqUtils import seq1

import sys
import os

# Script: count how many RNA strings could encode the protein in the input
# file (last command-line argument, resolved against the working directory).
fpath = os.path.join(os.getcwd(), sys.argv[-1])

# Only the first line of the file is used; the context manager guarantees the
# handle is closed even if reading fails.
with open(fpath, 'r') as f:
    protein = f.readline().rstrip()

# Codon dictionary of just possibility counts (e.g. Met = 1, Ala = 4)
codonTable = {}
for key in SynonymousCodons.keys():
    # Use seq1 to convert three letter codes to one letter
    codonTable[seq1(key)] = len(SynonymousCodons[key])

# Amino acid combinations: multiply the codon counts of every residue.
aa_comb = 1
for aa in protein:
    aa_comb *= codonTable[aa]

# Times 3 for the 3 possible stop codons
# Modulo 1000000 to make final number reasonable sized
# (call form of print works on both Python 2 and Python 3)
print(aa_comb * 3 % 1000000)
Ejemplo n.º 37
0
    def __init__(self, ref_seq_id, start, stop, ref, alt, edit_type, predicted=False):
        """Convert three-letter ``ref``/``alt`` codes to one letter, then delegate.

        When both ``ref`` and ``alt`` match ``self.three_letter_regex`` they are
        converted with ``seq1``; otherwise they are passed on unchanged. All
        arguments are then forwarded to the parent initialiser.
        """
        pattern = self.three_letter_regex
        both_three_letter = all(pattern.match(code) for code in (ref, alt))
        if both_three_letter:
            ref, alt = seq1(ref), seq1(alt)
        super().__init__(ref_seq_id, start, stop, ref, alt, edit_type, predicted)
Ejemplo n.º 38
0
def _adjust_aa_seq(fraglist):
    """Transform 3-letter AA codes of input fragments to one-letter codes (PRIVATE).

    Argument fraglist should be a list of HSPFragments objects.

    Each fragment is modified in place: its phase is set, its hit and query
    sequences are replaced by their one-letter form, the start/end coordinates
    of the side whose strand is 0 are recomputed, and every alignment
    annotation is replaced by a list of triplets. The same list is returned.
    """
    # Extra three-character symbols seq1 should translate: '***' becomes '*'
    # and '<->' becomes '-'.
    custom_map = {'***': '*', '<->': '-'}
    # Running start coordinates, seeded from the first fragment and advanced
    # at the end of every iteration.
    hsp_hstart = fraglist[0].hit_start
    hsp_qstart = fraglist[0].query_start
    frag_phases = _get_fragments_phase(fraglist)
    for frag, phase in zip(fraglist, frag_phases):
        # At least one side must have strand 0.
        assert frag.query_strand == 0 or frag.hit_strand == 0
        # hit step may be -1 as we're aligning to DNA
        hstep = 1 if frag.hit_strand >= 0 else -1

        # set fragment phase
        frag.phase = phase

        # fragment should have a length that is a multiple of 3
        # assert len(frag) % 3 == 0
        # _make_triplets is unpacked into (leading part, triplets, trailing part)
        qseq = str(frag.query.seq)
        q_triplets_pre, q_triplets, q_triplets_post = \
            _make_triplets(qseq, phase)

        hseq = str(frag.hit.seq)
        h_triplets_pre, h_triplets, h_triplets_post = \
            _make_triplets(hseq, phase)

        # get one letter codes
        # and replace gap codon markers and termination characters
        # A non-empty leading/trailing part is represented by a single "X".
        hseq1_pre = "X" if h_triplets_pre else ""
        hseq1_post = "X" if h_triplets_post else ""
        hseq1 = seq1("".join(h_triplets), custom_map=custom_map)
        # The hit start is shifted past the leading "X" (if any); the end is
        # the start plus the number of non-gap letters, in the hit direction.
        hstart = hsp_hstart + (len(hseq1_pre) * hstep)
        hend = hstart + len(hseq1.replace('-', '')) * hstep

        qseq1_pre = "X" if q_triplets_pre else ""
        qseq1_post = "X" if q_triplets_post else ""
        qseq1 = seq1("".join(q_triplets), custom_map=custom_map)
        # Same bookkeeping for the query, which always advances by +1.
        qstart = hsp_qstart + len(qseq1_pre)
        qend = qstart + len(qseq1.replace('-', ''))

        # replace the old frag sequences with the new ones
        # NOTE(review): the attributes are cleared to None before the new
        # strings are assigned -- presumably required by the fragment's
        # setters; confirm before changing the order.
        frag.hit = None
        frag.query = None
        frag.hit = hseq1_pre + hseq1 + hseq1_post
        frag.query = qseq1_pre + qseq1 + qseq1_post

        # set coordinates for the protein sequence
        # (only one side is updated; the query takes precedence)
        if frag.query_strand == 0:
            frag.query_start, frag.query_end = qstart, qend
        elif frag.hit_strand == 0:
            frag.hit_start, frag.hit_end = hstart, hend

        # update alignment annotation
        # by turning them into list of triplets
        # (empty leading/trailing parts are dropped by filter(None, ...))
        for annot, annotseq in frag.aln_annotation.items():
            pre, intact, post = _make_triplets(annotseq, phase)
            frag.aln_annotation[annot] = \
                list(filter(None, [pre])) + intact + list(filter(None, [post]))

        # update values for next iteration
        hsp_hstart, hsp_qstart = hend, qend

    return fraglist
Ejemplo n.º 39
0
def PdbAtomIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Residues whose name converts to "X" (e.g. unrecognised three letter codes
    such as "CSD" from HETATM entries) are left out of the sequence.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)
    record.annotations["start"] = number of the first residue kept
    record.annotations["end"] = number of the last residue kept

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the size
    of the missing region. If the numbering goes backwards after a gap, a
    warning is issued and the rest of the chain is dropped.

    Only the first model of the structure is read. Chains without any
    recognised residue yield no record.

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.
    """
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB import PDBParser
    from Bio.SeqUtils import seq1
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=to_one_letter_code)

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    from Bio.File import UndoHandle
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn("First line is not a 'HEADER'; can't determine PDB ID")
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    model = struct[0]
    # items() (rather than the Python 2 only iteritems()) works on both
    # Python 2 and Python 3.
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [res for res in chain.get_unpacked_list()
                    if seq1(res.get_resname().upper(),
                            custom_map=to_one_letter_code) != "X"]
        if not residues:
            continue
        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            if rnumbers[i + 1] != rnum + 1:
                # It's a gap!
                gaps.append((i + 1, rnum, rnumbers[i + 1]))
        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    prev_idx = i
                    res_out.append('X' * gapsize)
                else:
                    warnings.warn("Ignoring out-of-order residues after a gap",
                                  UserWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    break
            else:
                # Last segment: appended exactly once, after all gaps have
                # been processed. Doing this inside the loop duplicated the
                # tail of the chain whenever there was more than one gap.
                res_out.extend(restype(x) for x in residues[prev_idx:])
        else:
            # No gaps
            res_out = [restype(x) for x in residues]
        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(Seq(''.join(res_out), generic_protein),
                           id=record_id,
                           description=record_id,
                           )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
Ejemplo n.º 40
0
def PdbSeqresIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the SEQRES lines in the
    PDB file header, not the atoms of the 3D structure.

    Only SEQRES and DBREF records are read. SEQRES supplies the residues of
    each chain; DBREF supplies the PDB ID and the database cross-references.
    SEQADV and MODRES records are not handled yet.

    Records are yielded in sorted chain ID order. A chain without any DBREF
    record gets its chain ID as record id.

    See: http://www.wwpdb.org/documentation/format23/sect3.html
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    # chain ID -> list of one-letter residue codes
    chains = collections.defaultdict(list)
    # chain ID -> list of DBREF dictionaries, in file order
    metadata = collections.defaultdict(list)
    for line in handle:
        rec_name = line[0:6].strip()
        if rec_name == 'SEQRES':
            # NB: We only actually need chain ID and the residues here;
            # commented bits are placeholders from the wwPDB spec.
            # Serial number of the SEQRES record for the current chain.
            # Starts at 1 and increments by one each line.
            # Reset to 1 for each chain.
            # ser_num = int(line[8:10])
            # Chain identifier. This may be any single legal character,
            # including a blank which is used if there is only one chain.
            chn_id = line[11]
            # Number of residues in the chain (repeated on every record)
            # num_res = int(line[13:17])
            residues = [seq1(res, custom_map=to_one_letter_code)
                        for res in line[19:].split()]
            chains[chn_id].extend(residues)
        elif rec_name == 'DBREF':
            #  ID code of this entry (PDB ID)
            pdb_id = line[7:11]
            # Chain identifier.
            chn_id = line[12]
            # Initial sequence number of the PDB sequence segment.
            # seq_begin = int(line[14:18])
            # Initial insertion code of the PDB sequence segment.
            # icode_begin = line[18]
            # Ending sequence number of the PDB sequence segment.
            # seq_end = int(line[20:24])
            # Ending insertion code of the PDB sequence segment.
            # icode_end = line[24]
            # Sequence database name.
            database = line[26:32].strip()
            # Sequence database accession code.
            db_acc = line[33:41].strip()
            # Sequence database identification code.
            db_id_code = line[42:54].strip()
            # Initial sequence number of the database segment.
            # db_seq_begin = int(line[55:60])
            # Insertion code of initial residue of the segment, if PDB is the
            # reference.
            # db_icode_begin = line[60]
            # Ending sequence number of the database segment.
            # db_seq_end = int(line[62:67])
            # Insertion code of the ending residue of the segment, if PDB is the
            # reference.
            # db_icode_end = line[67]
            metadata[chn_id].append({'pdb_id': pdb_id, 'database': database,
                                    'db_acc': db_acc, 'db_id_code': db_id_code})
        # ENH: 'SEQADV' 'MODRES'

    # items() (rather than the Python 2 only iteritems()) works on both
    # Python 2 and Python 3.
    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id in metadata:
            # The first DBREF of the chain names the record ...
            m = metadata[chn_id][0]
            record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id)
            record.description = ("%s:%s %s" % (m['database'],
                                                m['db_acc'],
                                                m['db_id_code']))
            # ... and every DBREF contributes two cross-references.
            for melem in metadata[chn_id]:
                record.dbxrefs.extend([
                    "%s:%s" % (melem['database'], melem['db_acc']),
                    "%s:%s" % (melem['database'], melem['db_id_code'])])
        else:
            record.id = chn_id
        yield record
Ejemplo n.º 41
0
    def restype(residue):
        """Translate ``residue.resname`` into a one-letter code with ``seq1``.

        ``to_one_letter_code`` is supplied as the custom map. Non-standard
        residues (e.g. CSD, ANP) come back as 'X'.
        """
        three_letter_name = residue.resname
        one_letter = seq1(three_letter_name, custom_map=to_one_letter_code)
        return one_letter
Ejemplo n.º 42
0
def CifSeqresIterator(handle):
    """Return SeqRecord objects for each chain in an mmCIF file.

    The sequences are taken from the polymer sequence records of the mmCIF
    file, not from the atoms of the 3D structure.

    Three groups of mmCIF records are read: _pdbx_poly_seq_scheme supplies the
    residues of every chain, while _struct_ref and _struct_ref_seq supply the
    database cross-references.

    See:
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v40.dic/Categories/pdbx_poly_seq_scheme.html
    and
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html

    This gets called internally via Bio.SeqIO for the sequence-based
    interpretation of the mmCIF file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Note the chain is recorded in the annotations dictionary, and any mmCIF
    _struct_ref_seq entries are recorded in the database cross-references list.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1

    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    records = MMCIF2Dict(handle)

    # Make sure every field we read is a list (See #1533): a missing field
    # becomes an empty list, a lone value becomes a one-element list.
    wanted_fields = (
        PDBX_POLY_SEQ_SCHEME_FIELDS
        + STRUCT_REF_SEQ_FIELDS
        + STRUCT_REF_FIELDS)
    for field in wanted_fields:
        if field not in records:
            records[field] = []
            continue
        value = records[field]
        if not isinstance(value, list):
            records[field] = [value]

    # Collect the one-letter residues of each chain.
    residue_rows = zip(records["_pdbx_poly_seq_scheme.asym_id"],
                       records["_pdbx_poly_seq_scheme.mon_id"])
    for asym_id, mon_id in residue_rows:
        chains[asym_id].append(seq1(mon_id, custom_map=protein_letters_3to1))

    # Index the _struct_ref records by their id field.
    struct_refs = {
        ref_id: {"database": db_name, "db_id_code": db_code, "db_acc": db_acc}
        for ref_id, db_name, db_code, db_acc in zip(
            records["_struct_ref.id"],
            records["_struct_ref.db_name"],
            records["_struct_ref.db_code"],
            records["_struct_ref.pdbx_db_accession"])
    }

    # Each _struct_ref_seq row adds one metadata entry to its chain: the PDB
    # id plus the fields of the _struct_ref record it points to.
    # The key names here mirror those in PdbIO.
    ref_seq_rows = zip(records["_struct_ref_seq.ref_id"],
                       records["_struct_ref_seq.pdbx_PDB_id_code"],
                       records["_struct_ref_seq.pdbx_strand_id"])
    for ref_id, pdb_id, chain_id in ref_seq_rows:
        referenced = struct_refs[ref_id]
        entry = {'pdb_id': pdb_id}
        entry.update(referenced)
        metadata[chain_id].append(entry)

    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in metadata:
            # No cross-reference for this chain: fall back to the chain id.
            record.id = chn_id
        else:
            entries = metadata[chn_id]
            first = entries[0]
            full_id = "%s:%s" % (first['pdb_id'], chn_id)
            record.id = full_id
            record.name = full_id
            record.description = ("%s:%s %s" % (first['database'],
                                                first['db_acc'],
                                                first['db_id_code']))
            for melem in entries:
                record.dbxrefs.append(
                    "%s:%s" % (melem['database'], melem['db_acc']))
                record.dbxrefs.append(
                    "%s:%s" % (melem['database'], melem['db_id_code']))
        yield record
Ejemplo n.º 43
0
def AtomIterator(pdb_id, struct):
    """Return SeqRecords from Structure objects.

    Base function for sequence parsers that read structures Bio.PDB parsers.

    Once a parser from Bio.PDB has been used to load a structure into a
    Bio.PDB.Structure.Structure object, there is no difference in how the
    sequence parser interprets the residue sequence. The functions in this
    module may be used by SeqIO modules wishing to parse sequences from lists
    of residues.

    Calling functions must pass a Bio.PDB.Structure.Structure object.

    Only the first model (``struct[0]``) is read. One record is yielded per
    chain, in sorted chain id order, with id and description set to
    ``<pdb_id>:<chain id>`` and the annotations "model", "chain", "start" and
    "end" filled in. Chains without any recognised residue are skipped.

    See Bio.SeqIO.PdbIO.PdbAtomIterator and Bio.SeqIO.PdbIO.CifAtomIterator for
    details.
    """
    from Bio.SeqUtils import seq1

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=protein_letters_3to1)

    model = struct[0]
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        # (any residue whose upper-cased name converts to "X" is left out)
        residues = [res for res in chain.get_unpacked_list()
                    if seq1(res.get_resname().upper(),
                            custom_map=protein_letters_3to1) != "X"]
        if not residues:
            continue
        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        # Each gap is stored as (index of the residue after the gap,
        # residue number before the gap, residue number after the gap).
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            if rnumbers[i + 1] != rnum + 1:
                # It's a gap!
                gaps.append((i + 1, rnum, rnumbers[i + 1]))
        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    # Forward jump in numbering: emit the residues before the
                    # gap, then one 'X' per missing residue number.
                    gapsize = postgap - pregap - 1
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    prev_idx = i
                    res_out.append('X' * gapsize)
                else:
                    warnings.warn("Ignoring out-of-order residues after a gap",
                                  BiopythonParserWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    # Stop here: the break also skips the for-else below, so
                    # nothing after the out-of-order point is emitted.
                    break
            else:
                # Last segment
                # (runs only when the loop finished without a break)
                res_out.extend(restype(x) for x in residues[prev_idx:])
        else:
            # No gaps
            res_out = [restype(x) for x in residues]
        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(Seq(''.join(res_out), generic_protein),
                           id=record_id, description=record_id)

        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # First and last residue numbers of the residues kept for this chain.
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])
        yield record
def get_sequence_from_pdb_structure(structure):
    """Return the one-letter sequence of the non-hetero residues of a structure.

    Residues whose hetero flag (``id[0]``) is a single blank are converted
    with ``seq1`` one at a time and concatenated in iteration order; all other
    residues are ignored.
    """
    return ''.join(
        seq1(residue.resname)
        for residue in structure.get_residues()
        if residue.id[0] == ' '
    )
Ejemplo n.º 45
0
def load_cdna_and_polyA(paths,organism,pep_dict,exon_info,failed):
    organism_out = open("./Data/"+organism+"_polyA.data",'w')
    cdna = list(SeqIO.parse(open(paths["cdna"],'r'),"fasta"))
    cdna_size = len(cdna)
    cdna_counter = 0
    next_step = 0
    for cd in cdna:
        cdna_counter += 1
        if  (float(cdna_counter*100)/cdna_size) >= next_step:
            next_step += 10
            print str(int(float(cdna_counter*100)/cdna_size))+"%"
        description =  dict([z.split(":",1) for z in cd.description.split()[1:] if ":" in z])
        cdna_transcript_id = cd.id
        cdna_gene = description["gene"]
        
        gt = (cdna_gene,cdna_transcript_id)
        if gt in pep_dict:
            
            for p in pep_dict[gt]:
                
                protein_id = p.id
                gene_id = str(gt[0])
                pep_sequence = str(seq3(p.seq))
                cdna_sequence = str(cd.seq)
                cdna_translated_list = []
                
                cdna_start = 0
                cdna_stop = 0
                
                for x in range(3):
                    cdna_translated_list.append(seq3(str(Seq(cdna_sequence[x:]+"N"*(3-len(cdna_sequence[x:])%3)).translate())))
                cut_found = [v for v in range(len(cdna_translated_list)) if pep_sequence in cdna_translated_list[v]]
                
                #easy
                if pep_sequence == cdna_translated_list[0]:
                    cdna_start = 0
                    cdna_stop = len(cdna_sequence)
                    proper_seq  = cdna_sequence
                    AAA_list = findPolyA(proper_seq)
                    grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id])
                        
                #cutting
                elif cut_found:
                    for c in cut_found:
                        idx = c+ cdna_translated_list[c].find(pep_sequence)
                        cdna_start = idx
                        cdna_stop = idx+len(pep_sequence)
                        proper_seq = cdna_sequence[idx:(idx+len(pep_sequence))]
                        AAA_list = findPolyA(proper_seq)
                        grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id])
                        
                #alignment
                else:
                    prot_seq = SeqRecord(Seq(seq1(pep_sequence)),id = "prot_seq")
                    y = open(organism+"prot.fasta",'w')
                    SeqIO.write(prot_seq, y, "fasta")
                    y.close()
                    best = []
                    for i in range(3):
                        cdna_seq = SeqRecord(Seq(cdna_sequence[i:len(cdna_sequence)-((len(cdna_sequence)-i)%3)]).translate(stop_symbol="W"),id="cdna_seq")
                        k = open(organism+"cdna.fasta",'w')
                        SeqIO.write(cdna_seq, k , "fasta")
                        k.close()
                        output = NcbiblastpCommandline(query=organism+"prot.fasta", subject=organism+"cdna.fasta", outfmt=5)()[0]
                        blast_result_records = list(NCBIXML.parse(StringIO(output)))
                        for bl_res in blast_result_records:
                            for z in bl_res.alignments:
                                for h in z.hsps:
                                    best.append((h.query,h.sbjct,i,h.sbjct_start, h.query_start,h.score))
                    if best:
                        l = sorted(best,key=lambda x:x[-1])[-1]
                        proper_seq = cdna_sequence[l[2]+(int(l[3])-1)*3:l[2]+((int(l[3])-1)+len(l[1]))*3]
                        AAA_list = findPolyA(proper_seq)
                        cdna_start = l[2]+(int(l[3])-1)*3
                        cdna_stop = l[2]+((int(l[3])-1)+len(l[1]))*3
                        grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id])
                    else:
                        failed.write(",".join([protein_id,cdna_transcript_id,gene_id,pep_sequence,cdna_sequence])+"\n")
                    
                    
                    
                    os.remove(organism+"cdna.fasta")
                    os.remove(organism+"prot.fasta")
        
                cdna = schema.Cdna(transcript_id = cdna_transcript_id, gene_id = cdna_gene, nucleotide_sequence=str(cd.seq),organism_name =organism, cdna_start = cdna_start, cdna_stop =cdna_stop)
                db.session.add(cdna)