def test_seq1_seq3(self):
    """Check seq1/seq3 conversions and round trips on one peptide.

    The three-letter input is written in mixed case; every comparison of
    three-letter output is upper-cased on both sides, so the test does not
    depend on the casing that ``seq3`` emits.
    """
    three_letter = "MetAlaTyrtrpcysthrLYSLEUILEGlYPrOGlNaSnaLapRoTyRLySSeRHisTrpLysThr"
    one_letter = "MAYWCTKLIGPQNAPYKSHWKT"
    three_letter_upper = three_letter.upper()

    # Single conversions, one in each direction.
    self.assertEqual(seq1(three_letter), one_letter)
    self.assertEqual(seq3(one_letter).upper(), three_letter_upper)

    # Round trips must give back the starting sequence.
    self.assertEqual(seq1(seq3(one_letter)), one_letter)
    self.assertEqual(seq3(seq1(three_letter)).upper(), three_letter_upper)
def residue_set_aln(self, structure, chain_name, offset=0):
    """Collect the residues of one chain that are covered by ``self.aln_hit``.

    The ungapped hit sequence is walked letter by letter and matched against
    ``chain.residues[self.aln_hit.start + i + delta + offset]``. ``delta`` is
    -1 when the residue at ``start + offset`` does not match the first hit
    letter, otherwise 0. Up to 10 mismatching positions are tolerated.

    Args:
        structure: object exposing ``name`` and ``chain(chain_name)``.
        chain_name: identifier of the chain to inspect.
        offset: extra shift added to every residue index.

    Returns:
        A ResidueSet named ``aln_<structure.name>_<chain_name>`` whose
        ``residues`` are ``"<chain_name>_<resid>"`` strings. If any lookup
        fails or more than 10 mismatches are seen, the error is logged and an
        empty ResidueSet with the same name is returned instead.
    """
    # TODO: should this take fewer parameters?
    set_name = "aln_" + structure.name + "_" + chain_name
    rs = ResidueSet(name=set_name)
    chain = structure.chain(chain_name)
    ungapped = self.aln_hit.seq.replace("-", "")

    # Decide whether the numbering is shifted by one relative to the hit.
    first_residue = chain.residues[self.aln_hit.start + offset]
    first_matches = seq1(first_residue.compound).lower() == ungapped[0].lower()
    delta = 0 if first_matches else -1

    mismatches = 0
    for i, aa in enumerate(ungapped):
        try:
            mol = chain.residues[self.aln_hit.start + i + delta + offset]
            if seq1(mol.compound).lower() != aa.lower():
                mismatches += 1
            if mismatches > 10:
                raise Exception("too many mismaches")
            rs.residues.append(chain_name + "_" + str(mol.resid))
        except Exception as ex:
            _log.error("pdb %s mal alineado con residuos: %s" % (self.aln_hit.name, ex))
            # Give up on the whole chain: return a fresh, empty set.
            return ResidueSet(name=set_name)
    return rs
def langage(seq):
    """Determine whether a peptide sequence uses 1- or 3-letter codes.

    Args:
        seq: the sequence to test (str, Seq object or SeqRecord object).

    Returns:
        1 (int) for one-letter notation, 3 (int) for three-letter notation,
        or None if the sequence is not recognised as a protein sequence.
    """
    def is_protein(alphabet):
        return alphabet == IUPAC.protein or alphabet == IUPAC.extended_protein

    text = str(toSeq(seq))
    alpha = testalpha(text)

    all_upper = text.isupper()
    if all_upper and is_protein(alpha):
        return 1

    # Three-letter notation is expected in mixed case (neither all upper nor
    # all lower); it is validated after converting it to one-letter codes.
    mixed_case = not all_upper and not text.islower()
    if mixed_case and is_protein(testalpha(seq1(text))):
        return 3
    return None
def main(args):
    """Print every single amino-acid substitution for the selected residues.

    Reads the structure at ``args.pdb`` and the sections returned by
    ``import_sections(args.yaml, pdb_name)``. When sections are given, only
    the listed chains (optionally restricted to ``section['region']``) are
    used; otherwise every chain of the first model is used. Each variant is
    written to stdout as ``<wt><chain><position><mutant>`` followed by ``;``.
    """
    def substitutions(residue, chain_id, section=None):
        """Yield all substitutions for one residue, or nothing if filtered."""
        if residue.id[0] != ' ':
            return  # Filter HETATMs
        position = int(residue.id[1])
        wild_type = seq1(residue.get_resname())
        if wild_type not in AA_ALPHABET:
            # Filter non-standard AAs, required when processing
            # foldx repaired PDBs as they turn HETATMs to regular ATOMs
            # for regular proteins
            return
        if section is not None and 'region' in section:
            if (position > section['region'][1]
                    or position < section['region'][0]):
                return
        for mutant in AA_ALPHABET:
            if mutant != wild_type:
                yield f"{wild_type}{chain_id}{position}{mutant}"

    pdb_parser = PDBParser()
    pdb_name = Path(args.pdb).stem
    # deal with FoldX repaired PDBs
    if pdb_name.endswith('_Repair'):
        pdb_name = pdb_name.replace('_Repair', '')

    structure = pdb_parser.get_structure(pdb_name, args.pdb)
    sections = import_sections(args.yaml, pdb_name)

    variants = []
    if sections is None:
        for chain in structure[0]:
            for residue in chain:
                variants.extend(substitutions(residue, chain.id))
    else:
        for section in sections:
            for residue in structure[0][section['chain']]:
                variants.extend(
                    substitutions(residue, section['chain'], section))

    print(*variants, sep=';\n', end=';\n', file=sys.stdout)
def HGVS_p_to_AA_abrev(HGVS_p):
    """Abbreviate an HGVS protein change to one-letter amino-acid codes.

    The change is taken from between ``"p.("`` and the following ``")"``. It
    is expected to start with a three-letter residue, a number and another
    three-letter residue (e.g. ``Arg97Pro``). If a frameshift suffix matching
    ``fs\\w+`` is present, the three letters just before the second number
    are converted as well and ``fs<aa><number>`` is appended.

    Args:
        HGVS_p: string containing ``p.(<change>)``.

    Returns:
        ``<aa1><position><aa2>`` optionally followed by the frameshift part.

    Raises:
        IndexError: if ``"p.("`` or a number is missing from the input, or if
            a frameshift is present without a second number.
    """
    mut = HGVS_p.split("p.(")[1].split(")")[0]
    num = re.findall(r"[+-]?\d+(?:\.\d+)?", mut)

    # Leading "Xxx<number>Yyy": two 3-letter codes (6 chars) plus the number.
    first_part = mut[0:(len(num[0]) + 6)]

    # Raw string: "\w" in a normal string literal is an invalid escape.
    frameshift = re.search(r'fs\w+', mut)
    if frameshift is not None:
        fs = frameshift.group(0)
        # Assumes the change ends with the second number, so the residue
        # code is the three characters right before it.
        aa3 = seq1(mut[-(len(num[1]) + 3):-len(num[1])])
        fs_part = fs[0:2] + aa3 + num[1]
    else:
        fs_part = ''

    aa1 = seq1(first_part[0:3])
    aa2 = seq1(first_part[-3:])
    return aa1 + num[0] + aa2 + fs_part
def _extract_translation_exception(translation_exception):
    """Parse translation-exception strings into location/amino-acid dicts.

    Each entry is expected to look like ``(pos:START..END,aa:Xxx)`` or
    ``(pos:complement(START..END),aa:Xxx)``; the amino acid may be introduced
    with ``:`` or ``=``. A single string is treated as a one-element list.

    Returns:
        A list of dicts with keys ``"location"`` (built by ``make_location``
        from the zero-based start, the end and the strand) and
        ``"amino_acid"`` (one-letter code from ``seq1``).
    """
    if isinstance(translation_exception, str):
        entries = [translation_exception]
    else:
        entries = translation_exception

    parsed = []
    for entry in entries:
        pos, aa = entry.strip("()").split(",")

        on_complement = "complement" in pos
        if on_complement:
            span = pos.split("(")[1].strip(")")
        else:
            span = pos.split(":")[1]
        first, last = span.split("..")
        strand = -1 if on_complement else 1

        # Drop the "aa:" / "aa=" style prefix when present.
        for separator in (":", "="):
            if separator in aa:
                aa = aa.split(separator)[1]
                break

        # The start is shifted down by one; the end is used as written.
        location = make_location(int(first) - 1, int(last), strand)
        parsed.append({
            "location": location,
            "amino_acid": seq1(aa),
        })
    return parsed
def aa_seq(pdb_file):
    """
    Gets the full sequence of each protein chain from the SEQRES section of a
    PDB file, if present. If it isn't present, returns None.

    The chain identifier is read from column index 11 of each SEQRES line and
    the three-letter residue names from index 19 onwards; spaces between the
    names are removed before the conversion to one-letter codes.

    :param pdb_file: path of the PDB file to read.
    :return: A dictionary mapping each protein chain to its full sequence from
        the pdb file, irrespective of whether or not a residue has a
        coordinate, or None when the file has no SEQRES lines.
    :rtype: defaultdict(str) or None
    """
    # TODO: Try to get sequence from PDB API if the SEQRES section is missing
    with open(pdb_file, 'r') as f:
        # Strip only the newline: slicing off the last character would eat a
        # residue letter when the final line has no trailing newline.
        seq_lines = [line.rstrip('\n') for line in f if line[:6] == 'SEQRES']

    if not seq_lines:
        return None

    seqs = defaultdict(str)
    for line in seq_lines:
        chain_id = line[11]
        # Removing every space also takes care of the trailing padding.
        seqs[chain_id] += seq1(line[19:].replace(' ', ''))
    return seqs
def __init__(self, chain, index, chainID, resNum):
    """The constructor of a Residue class.

    Args:
        chain: mapping from residue number to a residue object that provides
            ``get_resname()`` and atom lookup by name.
        index: stored as ``self.index``.
        chainID: stored as ``self.chainID``.
        resNum: key of this residue in ``chain``.

    The backbone atoms ``CA``, ``C``, ``O`` and ``N`` are required; ``H`` is
    optional and ``self.H`` is left unset when the residue has no such atom.
    """
    residue = chain[resNum]

    self.index = index
    self.chainID = chainID
    self.resNum = resNum
    self.resName = seq1(residue.get_resname())

    self.structure = NONE  # 'H', 'B', 'E', 'G', 'I', 'T', 'S' or ' '
    self.nturns = {3: Nturn(), 4: Nturn(), 5: Nturn()}
    self.bend = NONE
    self.chirality = NONE
    self.bridge_1 = NONE
    self.bridge_2 = NONE
    self.bp1 = 0
    self.bp2 = 0
    self.sheet = NONE
    self.tco = 0
    # NOTE(review): 360 looks like a "not computed yet" placeholder for the
    # angles - confirm against the code that fills them in.
    self.kappa = 360
    self.alpha = 360
    self.phi = 360
    self.psi = 360

    # CA is kept as a vector, the other atoms as atom objects.
    self.CA = residue['CA'].get_vector()
    self.C = residue['C']
    self.O = residue['O']
    self.N = residue['N']
    try:
        self.H = residue['H']
    except KeyError:
        # No hydrogen recorded for this residue: leave self.H unset.
        pass
def struct_to_seq(structure, chains=None):
    """Return the one-letter sequence of each chain in the first model.

    Args:
        structure: structure whose ``child_list`` holds the models.
        chains: optional iterable of chain ids; all chains of the first
            model are used when it is None.

    Returns:
        dict mapping chain id to its sequence. Regular residues (blank hetero
        flag) are converted with ``seq1``; hetero residues are included only
        when ``allowed_het_res`` has an entry for them. Chains in which every
        residue has a non-blank hetero flag are left out.

    Raises:
        PDBAlignError: if the structure has no models or the first model has
            no chains.
    """
    if not structure.child_list:
        raise PDBAlignError("No models in %s" % structure)
    model = structure.child_list[0]
    if not model.child_list:
        raise PDBAlignError("No chains in %s" % structure)

    selected = model.child_list if chains is None else [model[c] for c in chains]

    def one_letter(res):
        """Return the letter for a residue, or None when it is dropped."""
        if res.get_id()[0] == " ":
            return seq1(res.resname)
        return allowed_het_res.get(res.resname)

    sequences = {}
    for chain in selected:
        # Don't include all-het chain
        if all(res.get_id()[0].strip() for res in chain.child_list):
            continue
        letters = (one_letter(res) for res in chain.child_list)
        sequences[chain.id] = "".join(c for c in letters if c is not None)
    return sequences
def write_backbone_angles(chain, region=None, offset=0, outfile=sys.stdout,
                          header=False):
    """
    Print the phi/psi angles of a chain as tab-separated rows.

    Each row holds: chain id, residue position, one-letter residue code,
    position + offset, phi and psi. Angles are multiplied by RAD_FACTOR and
    printed as 'NA' when the builder reports None.

    chain: chain to build peptides from
    region: (first, last) positions to include, inclusive; everything when None
    offset: value added to the position in the fourth column
    outfile: file object to print to
    header: print HEADER before the rows
    """
    def scaled(angle):
        return 'NA' if angle is None else angle * RAD_FACTOR

    if region is None:
        region = (0, float('inf'))

    peptides = PPBuilder().build_peptides(chain)

    if header:
        print(HEADER, file=outfile)

    for peptide in peptides:
        for residue, (phi, psi) in zip(peptide, peptide.get_phi_psi_list()):
            position = residue.get_id()[1]
            if not region[0] <= position <= region[1]:
                continue
            print(chain.id, position, seq1(residue.get_resname()),
                  position + offset, scaled(phi), scaled(psi),
                  sep='\t', file=outfile)
def retrieveAtomicStructureMapping(pdb_sequence, translation_to_structure_mapping):
    """Map aligned-sequence columns to the sequence ids of measured residues.

    The non-gap characters of
    ``translation_to_structure_mapping['secondary_sequence']`` are paired, in
    order, with the sorted sequence ids returned by
    ``retrieveAtomicStructure(pdb_sequence)``.

    Returns:
        dict mapping the column index of every non-gap character to the
        corresponding sequence id.

    Raises:
        AtomicSequenceIDMappingFailedException: if a measured residue does
            not match the aligned character it is paired with.
    """
    measured_structure = retrieveAtomicStructure(pdb_sequence)
    seq_ids = sorted(measured_structure.keys())
    secondary = translation_to_structure_mapping['secondary_sequence']

    # aligned_sequence to atomic structure mapping
    mapping = {}
    next_measured = 0
    for column, symbol in enumerate(secondary):
        if symbol == '-':
            continue
        seq_id = seq_ids[next_measured]
        if seq1(measured_structure[seq_id]) != symbol:
            raise AtomicSequenceIDMappingFailedException(
                "Alternative mapping for atomic sequence to atomic seqids failed for pdb structure "
                + pdb_sequence['pdb_id'] + ", chain " + pdb_sequence['chain_id'])
        mapping[column] = seq_id
        next_measured += 1
    return mapping
def get_sequence(self, chain_id):
    '''
    Return the one-letter amino acid sequence of one chain.

    Input:
        self: uses the Biopython.PDB structure stored in ``self.structure``
        chain_id: String identifying the chain in the first model
            (usually 'A', 'B', 'C' ...)
    Return:
        The sequence as a string. Only residues whose hetero field starts
        with a blank are included.
    '''
    letters = []
    for residue in list(self.structure[0][chain_id]):
        hetfield = residue.get_id()[0]
        if hetfield[0] == " ":
            letters.append(seq1(residue.get_resname()))
    return ''.join(letters)
def create_sequence(self, pdb_code, pdb_path):
    """Store one Bioentry/Biosequence per chain of a PDB file.

    For every chain of the first model, the standard amino acids are
    converted to a one-letter sequence. The entry identifier is
    ``<pdb_code>_<chain id>_<first residue number>_<last residue number>``;
    chains without standard residues, and identifiers already present in
    ``self.biodb``, are skipped.
    """
    pdb = PDB.objects.get(code=pdb_code)
    struct = PDBParser(PERMISSIVE=1, QUIET=1).get_structure(pdb_code, pdb_path)

    for chain in struct[0].get_chains():
        residues = [r for r in chain.get_residues() if is_aa(r, standard=True)]
        if not residues:
            continue

        seq = "".join(seq1(r.resname) for r in residues)
        first_number = str(residues[0].id[1])
        last_number = str(residues[-1].id[1])
        seqid = "_".join([pdb_code, chain.id, first_number, last_number])

        already_stored = Bioentry.objects.filter(
            biodatabase=self.biodb, identifier=seqid).exists()
        if already_stored:
            continue

        be = Bioentry(biodatabase=self.biodb, accession=seqid,
                      identifier=seqid, name=pdb.code)
        be.save()
        Biosequence(bioentry=be, seq=seq, length=len(seq)).save()
def distance_to_nearest(residues, distance_matrix=None):
    """
    Yields chemical environments parameterised as the distance to the nearest
    residue of each type. Hetero atoms are included so must be dropped
    separately if desired.

    residues: list of residues to consider
    distance_matrix: numpy matrix of distances between residues, with
                     rows/columns in that order. Calculated if not supplied

    yields: chemical environment profiles (np.array), one per residue, with
            one entry per letter of protein_alphabet.letters; np.inf when no
            other residue of that type exists
    """
    if distance_matrix is None:
        distance_matrix = residue_distance_matrix(residues)

    # One boolean mask per amino-acid letter marking residues of that type.
    one_letter = [seq1(r.get_resname()) for r in residues]
    type_masks = [
        np.array([code == aa for code in one_letter])
        for aa in protein_alphabet.letters
    ]

    for res_index in range(len(residues)):
        dists = distance_matrix[res_index, ]

        others = np.ones_like(dists, dtype=bool)
        others[res_index] = False

        profile = []
        for mask in type_masks:
            candidates = mask & others
            if any(candidates):
                profile.append(min(dists[candidates]))
            else:
                profile.append(np.inf)
        yield np.array(profile)
def k_nearest_residues(residues, k=10, distance_matrix=None):
    """
    Yields chemical environments parameterised by the make up of the k nearest
    AAs. Hetero atoms are included so must be dropped separately if desired.

    residues: list of residues to consider
    k: count the k nearest residues (the residue itself is never counted)
    distance_matrix: numpy matrix of distances between residues, with
                     rows/columns in that order. Calculated if not supplied

    yields: chemical environment profiles (np.array), one per residue, with
            the count of each letter of protein_alphabet.letters among the k
            nearest residues

    raises: ValueError if k >= number of residues
    """
    if k >= len(residues):
        raise ValueError('k >= number of residues')

    if distance_matrix is None:
        distance_matrix = residue_distance_matrix(residues)

    for res_index in range(len(residues)):
        dists = distance_matrix[res_index, ]

        non_self = np.ones_like(dists, dtype=bool)
        non_self[res_index] = False

        # Positions in dists[non_self] are shifted by one after res_index, so
        # keep the original indices alongside to map the result back.
        other_indices = [i for i in range(len(residues)) if i != res_index]
        other_dists = dists[non_self]

        if k > 0:
            # kth = k - 1 places the k smallest distances in the first k
            # slots and stays in bounds when k == len(residues) - 1.
            picked = np.argpartition(other_dists, k - 1)[:k]
        else:
            picked = []

        counts = defaultdict(int)
        for position in picked:
            neighbour = residues[other_indices[position]]
            counts[seq1(neighbour.get_resname())] += 1

        yield np.array([counts[aa] for aa in protein_alphabet.letters])
def within_distance(residues, max_dist=10, distance_matrix=None):
    """
    Yields chemical environments parameterised as the residues within max_dist
    angstroms. Hetero atoms are included so must be dropped separately if
    desired.

    residues: list of residues to consider
    max_dist: maximum distance to count within (in Angstroms); the comparison
              is strict (distance < max_dist)
    distance_matrix: numpy matrix of distances between residues, with
                     rows/columns in that order. Calculated if not supplied

    yields: chemical environment profiles (np.array), one per residue, with
            the count of each letter of protein_alphabet.letters among the
            other residues within range
    """
    if distance_matrix is None:
        distance_matrix = residue_distance_matrix(residues)

    for res_index in range(len(residues)):
        dists = distance_matrix[res_index, ]

        counts = defaultdict(int)
        for i in np.argwhere(dists < max_dist)[:, 0]:
            if i == res_index:
                continue  # the residue itself is not part of its environment
            counts[seq1(residues[i].get_resname())] += 1

        yield np.array([counts[aa] for aa in protein_alphabet.letters])
def read_in_experiment():
    """
    Read in raw data from Nisthal_2019.xlsx, making sure it matches FoldX
    read in

    The 'MUT_LBL' column is decoded with ``parse`` into a residue number and
    an amino acid; the 'ddG(mAvg)_mean' column supplies the value, which is
    stored with its sign flipped. Rows whose value cannot be converted, or
    whose residue/amino acid falls outside the table, are skipped.

    Returns:
        array-like organized ddG data by residue and amino acid type, shape
        (56, 20), NaN where no value was stored
    """
    AAs = [seq1(aa_3) for aa_3 in aminoacids]

    data_file = 'Nisthal_2019.xlsx'
    df2 = pd.read_excel(data_file)

    ddGs = np.zeros((56, 20))
    ddGs.fill(np.nan)  # match Tokuriki dataset read in

    for res, ddG in zip(df2['MUT_LBL'], df2['ddG(mAvg)_mean']):
        res_num, AA = parse(res)
        try:
            value = float(ddG)
            # Nisthal includes the black squares in fig 2 as -4 kcal/mol
            if value != -4:
                ddGs[res_num - 1][AAs.index(AA)] = -value
        except (ValueError, TypeError, IndexError):
            # Non-numeric value, unknown amino acid or position outside the
            # table: leave the cell as NaN.
            continue
    return ddGs
def get_info(pdb_file, fasta_file=None, verbose=True):
    """Gather sequence, CDR and distance/angle information for one PDB file.

    Chain sequences come from ``fasta_file`` when it is given; otherwise they
    are read from the residues of the PDB file (a warning is printed when
    ``verbose`` is true) and encoded with ``letter_to_num``.

    Returns:
        The dict returned by ``get_cdr_indices(pdb_file)``, updated in place
        with the chain sequences and the keys ``dist_angle_mat`` and ``id``.

    Raises:
        ValueError: when reading from the PDB file and a chain id is neither
            'H' nor 'L'.
    """
    if fasta_file is None:
        if verbose:
            print(
                'WARNING: No fasta file given to get_info(), getting sequence '
                'from PDB file')
        chain_seqs = {}
        structure = PDBParser().get_structure(get_id(pdb_file), pdb_file)
        for chain in structure.get_chains():
            chain_id = chain.id
            one_letter = seq1(''.join(residue.resname for residue in chain))
            if chain_id not in ('H', 'L'):
                msg = (
                    'Expected a heavy chain or light chain, marked as \'H\' '
                    ' or \'L\'. Got a chain id of :{} from protein {}')
                raise ValueError(msg.format(chain_id, get_id(pdb_file)))
            chain_seqs[chain_id] = letter_to_num(one_letter, _aa_dict)
    else:
        chain_seqs = get_chain_seqs(fasta_file)

    protein_id = get_id(pdb_file)
    info = get_cdr_indices(pdb_file)
    dist_angle_mat = protein_dist_angle_matrix(pdb_file)

    info.update(chain_seqs)
    info.update({'dist_angle_mat': dist_angle_mat, 'id': protein_id})
    return info
def align_nodes(foldx_nodes_features, pdb_list, seq, pdb_chain):
    """Return the per-residue node feature matrix for the selected residues.

    Rows of ``foldx_nodes_features`` whose ``pdb_seq_num`` is in ``pdb_list``
    are selected. Their ``three_letter`` names, converted to one-letter
    codes, must agree with ``seq`` except where the converted letter is "X".
    The selected FoldX columns are extended with ``phobos`` and ``radius``
    values looked up per letter of ``seq``.

    ``pdb_chain`` is accepted but not used here.

    Returns:
        float32 array with one row per selected residue and
        ``len(feats) + 2`` columns.

    Raises:
        AssertionError: if the feature count does not match
            ``hparams['in_dim_n']`` or the sequences disagree.
    """
    # selected features
    # (alternative: 'phi', 'psi', 'Sidechain Accessibility',
    #  'Mainchain Accessibility')
    feats = [
        'phi', 'psi', 'total', 'backHbond', 'sideHbond', 'energy_VdW',
        'electro', 'energy_SolvP', 'energy_SolvH', 'energy_vdwclash',
        'entrop_sc', 'entrop_mc', 'cis_bond', 'energy_torsion',
        'backbone_vdwclash', 'energy_dipole', 'Sidechain Contact Ratio',
        'Mainchain Contact Ratio'
    ]
    assert len(feats) + 2 == hparams['in_dim_n']

    selected = foldx_nodes_features.pdb_seq_num.isin(pdb_list)

    # check sequence
    three_letter_names = foldx_nodes_features.loc[selected].three_letter.tolist()
    tmp_seq = "".join(seq1(name.title()) for name in three_letter_names)
    assert len(seq) == len(tmp_seq)
    for expected, observed in zip(seq, tmp_seq):
        if observed != "X":
            assert expected == observed

    # add foldx features
    nodes_feats = foldx_nodes_features.loc[selected, feats].copy()

    # add more features
    nodes_feats['phobos'] = [phobos[aa] for aa in seq]
    nodes_feats['radius'] = [radius[aa] for aa in seq]

    return nodes_feats.values.astype(np.float32)
def read_naccess_rsa(model):
    """
    Import a Naccess output RSA table, based on a model row imported by
    parse_model_table.

    The file ``<uniprot>_<name>_<model>.rsa`` is parsed with
    ``process_rsa_data``. The result is restricted to the positions listed in
    ``model.positions`` (comma separated), the residue name is converted to a
    one-letter code in column ``wt``, and the model's uniprot, name and
    template are added as columns.
    """
    output_columns = [
        'uniprot', 'name', 'position', 'wt', 'template', 'chain',
        'all_atoms_abs', 'all_atoms_rel', 'side_chain_abs', 'side_chain_rel',
        'main_chain_abs', 'main_chain_rel', 'non_polar_abs', 'non_polar_rel',
        'all_polar_abs', 'all_polar_rel'
    ]

    rsa_path = f'{model.uniprot}_{model.name}_{model.model}.rsa'
    with open(rsa_path, 'r') as rsa_file:
        rsa_data = process_rsa_data(rsa_file)

    df = pd.DataFrame.from_dict(rsa_data, orient='index').reset_index()
    df = df.rename(columns={
        'level_0': 'chain',
        'level_1': 'position',
        'res_name': 'wt'
    })

    # Each position entry is a sequence; keep its second element.
    df['position'] = [entry[1] for entry in df['position']]
    df['uniprot'] = model.uniprot
    df['name'] = model.name
    df['template'] = model.template
    df['wt'] = [seq1(name) for name in df['wt']]

    wanted_positions = [int(p) for p in model.positions.split(',')]
    df = df[df['position'].isin(wanted_positions)]
    return df[output_columns]
def add_struc_path(self, struc_path):
    """Load a single-chain structure file and record its sequence.

    Sets ``self.struc_path``, ``self.seq_ix_mapping`` (1-based position in
    the extracted sequence -> residue number in the file) and
    ``self.struc_seq`` (a SeqRecord).

    Residues with an insertion code, hetero residues (``H_`` flag) and water
    (``W`` flag) are skipped. If the name of the first kept residue converts
    to a non-empty one-letter code, the whole sequence is converted with
    ``seq1``; otherwise the concatenated residue names are kept as they are.

    Args:
        struc_path: path to a ``.pdb`` or ``.cif`` file.

    Raises:
        IOError: for any other file extension, or when the file does not
            contain exactly one chain.
    """
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.PDB import PDBParser, MMCIFParser
    from Bio.SeqUtils import seq1

    self.struc_path = struc_path
    extension = ntpath.splitext(self.struc_path)[1]
    if extension == ".pdb":
        parser = PDBParser()
    elif extension == ".cif":
        parser = MMCIFParser()
    else:
        raise IOError(
            "Unrecognized structure file type! Please use .pdb or .cif files!"
        )

    structure = parser.get_structure("none", self.struc_path)
    chains = list(structure.get_chains())
    if len(chains) != 1:
        raise IOError(
            "When using structure files, they need to have a single chain!"
        )

    residue_names = []
    seq_ix_mapping = dict()
    for resi in chains[0].get_residues():
        resi_id = resi.get_id()
        if not re.match(r' ', resi_id[2]):
            continue  # residue carries an insertion code
        if re.match(r'^H_', resi_id[0]):
            continue  # hetero residue
        if re.match(r'W', resi_id[0]):
            continue  # water
        residue_names.append(resi.get_resname().replace(' ', ''))
        seq_ix_mapping[len(residue_names)] = int(resi_id[1])

    sequence = ''.join(residue_names)
    # Look at the first *kept* residue. Indexing the residue list with a
    # residue number from the file picks an unrelated residue or fails.
    # NOTE(review): seq1 yields '' for names shorter than three letters,
    # presumably to leave nucleotide names unconverted - confirm.
    if residue_names and len(seq1(residue_names[0])) != 0:
        sequence = seq1(sequence)

    self.seq_ix_mapping = seq_ix_mapping
    self.struc_seq = SeqRecord(Seq(sequence))
def _adjust_aa_seq(fraglist):
    """Convert three-letter amino acid codes to one-letter codes in HSPFragments.

    Each fragment's hit and query sequences are replaced by their one-letter
    form ('***' becomes '*', '<->' becomes '-'). Coordinates are updated on
    whichever side has strand 0, counting non-gap letters, and every
    alignment annotation is replaced by ``_make_triplets`` of itself.
    Fragments are chained: each one starts where the previous one ended.

    Returns the same list, modified in place.
    """
    # gap codon markers and termination characters
    codon_map = {'***': '*', '<->': '-'}

    next_hit_start = fraglist[0].hit_start
    next_query_start = fraglist[0].query_start

    for frag in fraglist:
        assert frag.query_strand == 0 or frag.hit_strand == 0
        # fragment should have a length that is a multiple of 3
        assert len(frag) % 3 == 0

        # hit step may be -1 as we're aligning to DNA
        hit_step = -1 if frag.hit_strand < 0 else 1

        hit_letters = seq1(str(frag.hit.seq), custom_map=codon_map)
        hit_start = next_hit_start
        hit_end = hit_start + (len(hit_letters) - hit_letters.count('-')) * hit_step

        query_letters = seq1(str(frag.query.seq), custom_map=codon_map)
        query_start = next_query_start
        query_end = query_start + (len(query_letters) - query_letters.count('-'))

        # Both sequences are cleared before the new ones are assigned.
        frag.hit = None
        frag.query = None
        frag.hit = hit_letters
        frag.query = query_letters

        # set coordinates for the protein sequence
        if frag.query_strand == 0:
            frag.query_start, frag.query_end = query_start, query_end
        elif frag.hit_strand == 0:
            frag.hit_start, frag.hit_end = hit_start, hit_end

        # turn the alignment annotations into lists of triplets
        for name, annotation in frag.aln_annotation.items():
            frag.aln_annotation[name] = _make_triplets(annotation)

        # the next fragment continues from here
        next_hit_start, next_query_start = hit_end, query_end

    return fraglist
def _adjust_aa_seq(fraglist):
    """Rewrite the given HSPFragments from three-letter to one-letter codes.

    For every fragment the hit and query sequences are converted with
    ``seq1`` (mapping "***" to "*" and "<->" to "-"), the start/end of the
    side whose strand is 0 is recomputed from the number of non-gap letters,
    and the alignment annotations are split with ``_make_triplets``. The end
    coordinates of one fragment are the start coordinates of the next.

    The input list is modified in place and returned.
    """
    def ungapped_length(letters):
        return len(letters.replace("-", ""))

    hstart = fraglist[0].hit_start
    qstart = fraglist[0].query_start

    for frag in fraglist:
        assert frag.query_strand == 0 or frag.hit_strand == 0
        # fragment should have a length that is a multiple of 3
        assert len(frag) % 3 == 0

        # hit step may be -1 as we're aligning to DNA
        if frag.hit_strand >= 0:
            hstep = 1
        else:
            hstep = -1

        # get one letter codes
        # and replace gap codon markers and termination characters
        custom_map = {"***": "*", "<->": "-"}
        new_hit = seq1(str(frag.hit.seq), custom_map=custom_map)
        hend = hstart + ungapped_length(new_hit) * hstep
        new_query = seq1(str(frag.query.seq), custom_map=custom_map)
        qend = qstart + ungapped_length(new_query)

        # replace the old frag sequences with the new ones; both are reset
        # first, then assigned
        frag.hit = None
        frag.query = None
        frag.hit = new_hit
        frag.query = new_query

        # set coordinates for the protein sequence
        if frag.query_strand == 0:
            frag.query_start, frag.query_end = qstart, qend
        elif frag.hit_strand == 0:
            frag.hit_start, frag.hit_end = hstart, hend

        # update alignment annotation
        # by turning them into list of triplets
        for key, value in frag.aln_annotation.items():
            frag.aln_annotation[key] = _make_triplets(value)

        # update values for next iteration
        hstart, qstart = hend, qend

    return fraglist
def assesment(pdb_path, fasta_path=None, output_dir=None, accpro_path=None, psipred_path=None,cpus=multiprocessing.cpu_count()):
    """Run the model quality assessment on a PDB file and collect the scores.

    Args:
        pdb_path: structure to assess.
        fasta_path: sequence file for ACCpro/PSIPRED. When missing and one of
            those tools is requested, ``<output_dir>/seq.fasta`` is written
            from the residues of the structure.
        output_dir: working directory; a temporary one is created if empty.
        accpro_path: when set, ``QMean.accpro`` is run and its handler is
            passed to the assessment.
        psipred_path: when set, ``QMean.psipred`` is run and its handler is
            passed to the assessment.
        cpus: forwarded to ``QMean.psipred``.

    Returns:
        dict with ``<score>_norm`` and ``<score>_zscore`` for every global
        score, plus ``"residues"``: a dict keyed by
        ``<col0>_<col2>_<col3>`` of each score-table row, holding the values
        of the columns from index 4 onwards by column name.
    """
    if not output_dir:
        output_dir = tempfile.mkdtemp(suffix="_qmean")

    data = {}
    needs_sequence = accpro_path or psipred_path
    if not fasta_path and needs_sequence:
        fasta_path = output_dir + "/seq.fasta"
        parser = PDBParser(PERMISSIVE=True, QUIET=True)
        residues = parser.get_structure("x", pdb_path).get_residues()
        seq = "".join(seq1(residue.resname) for residue in residues)
        with open(fasta_path, "w") as handle:
            handle.write(">seq\n")
            handle.write(seq)
        data["seq"] = seq

    accpro_handler = None
    if accpro_path:
        data["acc"] = QMean.accpro(fasta_path, output_dir, accpro_path)
        accpro_handler = ACCPROHandler(data)

    psipred_handler = None
    if psipred_path:
        data["ss"], data["conf"] = QMean.psipred(fasta_path, output_dir, psipred_path, cpus)
        psipred_handler = PSIPREDHandler(data)

    pdb = LoadPDB(pdb_path)

    # Only pass the handlers that were actually created.
    optional = {}
    if psipred_handler:
        optional["psipred"] = psipred_handler
    if accpro_handler:
        optional["accpro"] = accpro_handler
    assessment = AssessModelQuality(pdb, output_dir=output_dir, **optional)

    result = {}
    for score in assessment[0].all_scores:
        result[score.name + "_norm"] = score.norm
        result[score.name + "_zscore"] = score.z_score

    result["residues"] = {}
    table = assessment[1].score_table
    for row in table.rows:
        key = row[0] + "_" + str(row[2]) + "_" + str(row[3])
        result["residues"][key] = {
            name: row[i] for i, name in enumerate(table.col_names[4:], 4)
        }
    return result
def strucToSeq(chain: Bio.PDB.Entity.Entity) -> str:
    '''
    Return the one-letter amino acid sequence of a structure entity.

    Every residue yielded by ``chain.get_residues()`` contributes the
    one-letter code of its residue name, in iteration order.
    '''
    return "".join(
        seq1(residue.get_resname()) for residue in chain.get_residues()
    )
def get_sequence(self, chain_id):
    """
    Return the amino acid sequence of one chain in single-letter code.

    Input:
        self: provides ``get_residues(chain_id)`` for the stored structure
        chain_id: String (usually in ['A','B', 'C' ...]. The number of
            chains depends on the specific protein and the resulting
            structure)
    Return:
        The sequence of the given chain as a string.
    """
    letters = []
    for residue in self.get_residues(chain_id):
        letters.append(seq1(residue.get_resname()))
    return ''.join(letters)
def seq(self, selected_chain=None, standard_aa=True):
    """Return one SeqRecord per chain of ``self.struct``.

    Args:
        selected_chain: when set, only the chain with this id is used.
        standard_aa: forwarded to ``is_aa`` as ``standard``.

    Returns:
        list of SeqRecord objects with id
        ``<code>_<chain id>_<first residue number>_<last residue number>``;
        chains without any amino-acid residue are skipped.
    """
    records = []
    for chain in self.struct.get_chains():
        if selected_chain and selected_chain != chain.id:
            continue

        residues = [r for r in chain.get_residues()
                    if is_aa(r, standard=standard_aa)]
        if not residues:
            continue

        sequence = "".join(seq1(r.resname) for r in residues)
        first_number = str(residues[0].id[1])
        last_number = str(residues[-1].id[1])
        record_id = "_".join([self.code, chain.id, first_number, last_number])
        records.append(SeqRecord(id=record_id, description="", seq=Seq(sequence)))
    return records
def retrieveAtomicStructureSequence(pdb_sequence):
    """Return the sequence of a PDB file, based on the measured structure.

    Every sequence id from the smallest to the largest measured one
    contributes a character: the one-letter code of the measured residue, or
    "-" when that id was not measured.
    """
    measured_structure = retrieveAtomicStructure(pdb_sequence)
    seq_ids = list(measured_structure.keys())
    first_id = np.min(seq_ids)
    last_id = np.max(seq_ids)

    symbols = []
    for seq_id in range(first_id, last_id + 1):
        if seq_id in measured_structure.keys():
            symbols.append(seq1(measured_structure[seq_id]))
        else:
            symbols.append("-")
    return "".join(symbols)
def extract_seq_from_pdb(pedxxxx, ensemble, confomer, chain):
    """Return the one-letter sequence of a conformer PDB file.

    The file name is built as ``<pedxxxx>_<ensemble>-<confomer>_<chain>.pdb``
    and is opened relative to the current directory. Only residues with a
    blank hetero flag are included.

    Args:
        pedxxxx: first component of the file name.
        ensemble: ensemble component of the file name.
        confomer: conformer component of the file name (the parameter keeps
            its original spelling so keyword callers still work).
        chain: chain component of the file name.

    Returns:
        The sequence as a string of one-letter codes.
    """
    # Use the parameter itself: the body used to read an undefined name.
    pdb = "%s_%s-%s_%s.pdb" % (pedxxxx, ensemble, confomer, chain)
    structure = PDBParser().get_structure("pdb", pdb)

    three_letter = [
        residue.get_resname()
        for residue in structure.get_residues()
        if residue.id[0] == " "
    ]
    return seq1(''.join(three_letter))
def CB_coords(pdb_file, include_masks=False):
    """
    Gets the coordinates of the C-Beta atom in each residue, or of the
    C-Alpha atom if the residue does not have a C-beta coordinate. If a
    residue has neither, its coordinates are set to [0, 0, 0].

    Per chain, the result has one row per entry of the chain's mask from
    ``mask_aa_coords``; rows where the mask is unset stay at zero. Residues
    whose one-letter code is 'X' are ignored. Chains missing from the masks
    are skipped with a warning.

    :param pdb_file: path of the PDB file.
    :param include_masks: also return the masks, where mask[i] denotes
        whether or not (1 or 0) the i-th residue has a coordinate.
    :return: dict of chain id -> coordinate array (plus the masks when
        requested), or None when a chain's mask and coordinates disagree in
        length.
    """
    p = PDBParser()
    file_name = filename_no_extension(pdb_file)
    structure = p.get_structure(file_name, pdb_file)

    def get_cb_or_ca(residue):
        # Prefer CB, fall back to CA, then to the origin.
        for atom_name in ('CB', 'CA'):
            if atom_name in residue:
                return residue[atom_name].get_coord()
        return [0, 0, 0]

    coords = {}
    masks = mask_aa_coords(pdb_file)

    for chain in structure.get_chains():
        chain_id = chain.get_id()

        if chain_id not in masks:
            msg = (
                'WARNING: Chain ID mismatch between the full sequences and the '
                'sequences derived from coordinates in the {} file. Chain {} in '
                'the full sequence is not in the sequences derived from '
                'coordinates. Skipping chain {}')
            warnings.warn(msg.format(pdb_file, chain_id, chain_id))
            continue

        chain_mask = masks[chain_id]
        coords[chain_id] = np.zeros((len(chain_mask), 3))
        chain_coords = [
            get_cb_or_ca(r) for r in chain.get_residues()
            if seq1(r.get_resname()) != 'X'
        ]
        if not chain_coords:
            continue

        if len(chain_coords) != len(coords[chain_id][chain_mask]):
            msg = ('WARNING: In {}, chain {} the mask is not equal to '
                   'the number of coordinates. Returning None')
            warnings.warn(msg.format(pdb_file, chain_id))
            return None
        coords[chain_id][chain_mask] = chain_coords

    if include_masks:
        return coords, masks
    return coords
def records_from_pdb(self, pdb, pdb_file_path, standard_aa=True, selected_chain=None):
    """Return one SeqRecord per chain of the first model of a PDB file.

    Args:
        pdb: structure id; also the first component of each record id.
        pdb_file_path: file to parse.
        standard_aa: forwarded to ``is_aa`` as ``standard``.
        selected_chain: when set, only the chain with this id is used.

    Returns:
        list of SeqRecord objects with id
        ``<pdb>_<chain id>_<first residue number>_<last residue number>``;
        chains without any amino-acid residue are skipped.
    """
    model = PDBParser(PERMISSIVE=1, QUIET=1).get_structure(pdb, pdb_file_path)[0]

    records = []
    for chain in model.get_chains():
        if selected_chain and selected_chain != chain.id:
            continue

        residues = [r for r in chain.get_residues()
                    if is_aa(r, standard=standard_aa)]
        if not residues:
            continue

        sequence = "".join(seq1(r.resname) for r in residues)
        first_number = str(residues[0].id[1])
        last_number = str(residues[-1].id[1])
        record_id = "_".join([pdb, chain.id, first_number, last_number])
        records.append(SeqRecord(id=record_id, description="", seq=Seq(sequence)))
    return records
def write_chain(self, key):
    """Write one chain to ``self.buf`` as a FASTA-style record.

    A header line ``><code>:<key>|PDBID|CHAIN|SEQUENCE`` is printed first,
    followed by the one-letter sequence of ``self.chains[key]`` wrapped at
    LINE_LENGTH characters per line.
    """
    # The header is emitted before the chain is looked up.
    print(">{0}:{1}|PDBID|CHAIN|SEQUENCE".format(self.code, key), file=self.buf)

    chain = self.chains[key]
    letters = [seq1(res.resname) for res in chain.get_residues()]

    for start in range(0, len(letters), LINE_LENGTH):
        line = "".join(letters[start : start + LINE_LENGTH])
        print(line, file=self.buf)
def get_pdb_chain_partial_seq(model, chain, _1lc, with_h2o):
    """Return the residues of one chain and their names.

    Args:
        model: model to take the chain from.
        chain: id of the chain in ``model``.
        _1lc: when exactly ``True``, each residue's ``resname`` is replaced
            by its one-letter code. This modifies the residue objects of
            ``model`` in place.
            See http://biopython.org/DIST/docs/api/Bio.SeqUtils-module.html#seq1
        with_h2o: when exactly ``False``, residues named "HOH" are left out.

    Returns:
        ``[residues, names]``: the kept residue objects and their ``resname``
        values (after the optional conversion), in chain order.
    """
    selected_chain = model[chain]
    kept_residues = []
    kept_names = []

    for residue in selected_chain.get_residues():
        residue_name = residue.get_resname()

        # Drop water molecules unless they were asked for.
        if with_h2o is False and residue_name == "HOH":
            continue

        # Convert the 3 letter code to the 1 letter code on the residue itself.
        if _1lc is True:
            residue.resname = seq1(residue_name)

        kept_residues.append(residue)
        kept_names.append(residue.resname)

    return [kept_residues, kept_names]
from Bio.SeqUtils.CodonUsage import SynonymousCodons
from Bio.SeqUtils import seq1
import sys
import os

# Count the RNA strings that could encode the protein read from the first
# line of the file named by the last command-line argument.
fpath = os.path.join(os.getcwd(), sys.argv[-1])
with open(fpath, 'r') as f:
    protein = f.readline().rstrip()

# Codon dictionary of just possibility counts (e.g. Met = 1, Ala = 4)
codonTable = {}
for key in SynonymousCodons.keys():
    # Use seq1 to convert three letter codes to one letter
    codonTable[seq1(key)] = len(SynonymousCodons[key])

# Amino acid combinations
aa_comb = 1
for aa in protein:
    aa_comb *= codonTable[aa]

# Times 3 for the 3 possible stop codons
# Modulo 1000000 to make final number reasonable sized
# print() as a function call works on both Python 2 and Python 3.
print(aa_comb * 3 % 1000000)
def __init__(self, ref_seq_id, start, stop, ref, alt, edit_type, predicted=False):
    """Initialise the edit, normalising amino acids to one-letter codes.

    When both ``ref`` and ``alt`` match ``self.three_letter_regex`` they are
    converted with ``seq1``; otherwise both are passed on unchanged. All
    arguments are then forwarded to the parent constructor.
    """
    both_three_letter = (self.three_letter_regex.match(ref)
                         and self.three_letter_regex.match(alt))
    if both_three_letter:
        ref, alt = seq1(ref), seq1(alt)
    super().__init__(ref_seq_id, start, stop, ref, alt, edit_type, predicted)
def _adjust_aa_seq(fraglist):
    """Transform 3-letter AA codes of input fragments to one-letter codes (PRIVATE).

    Argument fraglist should be a list of HSPFragments objects.

    Each fragment gets its phase from ``_get_fragments_phase``. Its hit and
    query sequences are split by ``_make_triplets`` into an optional leading
    part, the intact triplets and an optional trailing part; the intact
    triplets are converted with ``seq1`` and each leading/trailing part that
    is present becomes a single "X". Coordinates are updated on the side
    whose strand is 0, and annotations are split the same way. The list is
    modified in place and returned.
    """
    codon_map = {'***': '*', '<->': '-'}

    next_hit_start = fraglist[0].hit_start
    next_query_start = fraglist[0].query_start
    phases = _get_fragments_phase(fraglist)

    for frag, phase in zip(fraglist, phases):
        assert frag.query_strand == 0 or frag.hit_strand == 0
        # hit step may be -1 as we're aligning to DNA
        hit_step = -1 if frag.hit_strand < 0 else 1

        # set fragment phase
        frag.phase = phase

        # The fragment length is not required to be a multiple of 3 here;
        # incomplete codons end up in the leading/trailing parts.
        query_head, query_triplets, query_tail = _make_triplets(
            str(frag.query.seq), phase)
        hit_head, hit_triplets, hit_tail = _make_triplets(
            str(frag.hit.seq), phase)

        # get one letter codes
        # and replace gap codon markers and termination characters
        hit_prefix = "X" if hit_head else ""
        hit_suffix = "X" if hit_tail else ""
        hit_core = seq1("".join(hit_triplets), custom_map=codon_map)
        hit_start = next_hit_start + (len(hit_prefix) * hit_step)
        hit_end = hit_start + (len(hit_core) - hit_core.count('-')) * hit_step

        query_prefix = "X" if query_head else ""
        query_suffix = "X" if query_tail else ""
        query_core = seq1("".join(query_triplets), custom_map=codon_map)
        query_start = next_query_start + len(query_prefix)
        query_end = query_start + (len(query_core) - query_core.count('-'))

        # replace the old frag sequences with the new ones; both are reset
        # first, then assigned
        frag.hit = None
        frag.query = None
        frag.hit = hit_prefix + hit_core + hit_suffix
        frag.query = query_prefix + query_core + query_suffix

        # set coordinates for the protein sequence
        if frag.query_strand == 0:
            frag.query_start, frag.query_end = query_start, query_end
        elif frag.hit_strand == 0:
            frag.hit_start, frag.hit_end = hit_start, hit_end

        # update alignment annotation
        # by turning them into list of triplets
        for name, annotation in frag.aln_annotation.items():
            head, intact, tail = _make_triplets(annotation, phase)
            leading = [head] if head else []
            trailing = [tail] if tail else []
            frag.aln_annotation[name] = leading + intact + trailing

        # update values for next iteration
        next_hit_start, next_query_start = hit_end, query_end

    return fraglist
def PdbAtomIterator(handle):
    """Return SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Residues whose three-letter name translates to "X" (via ``seq1`` with
    ``to_one_letter_code``) are left out of the sequence altogether. Chains
    with no remaining residues produce no record.

    Where residues are missing from the structure, as indicated by a jump in
    residue numbering, the sequence is filled in with 'X' characters to match
    the size of the missing region. If the numbering goes backwards after a
    gap, a warning is issued and the rest of the chain is dropped.

    Each record's annotations start as a copy of the parsed PDB header (the
    same for all records) and then receive these chain specific entries:

        record.annotations["model"] = Model ID (typically zero)
        record.annotations["chain"] = Chain ID (typically A, B ,...)
        record.annotations["start"] = number of the first residue kept
        record.annotations["end"] = number of the last residue kept

    The PDB ID is taken from columns 63-66 of the first line when it is a
    HEADER record; otherwise a warning is issued and '????' is used.

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.
    """
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB import PDBParser
    from Bio.SeqUtils import seq1
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=to_one_letter_code)

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    from Bio.File import UndoHandle
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn("First line is not a 'HEADER'; can't determine PDB ID")
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    model = struct[0]
    # items() rather than iteritems(): identical result once sorted, and it
    # also works on Python 3.
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [res for res in chain.get_unpacked_list()
                    if seq1(res.get_resname().upper(),
                            custom_map=to_one_letter_code) != "X"]
        if not residues:
            continue
        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            if rnumbers[i + 1] != rnum + 1:
                # It's a gap!
                gaps.append((i + 1, rnum, rnumbers[i + 1]))
        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    prev_idx = i
                    res_out.append('X' * gapsize)
                else:
                    warnings.warn("Ignoring out-of-order residues after a gap",
                                  UserWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    # Stop here: later gaps lie inside the dropped segment.
                    break
            else:
                # Last segment. Added exactly once, after all gaps have been
                # processed; adding it per gap would repeat the chain's tail
                # for every gap.
                res_out.extend(restype(x) for x in residues[prev_idx:])
        else:
            # No gaps
            res_out = [restype(x) for x in residues]
        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(Seq(''.join(res_out), generic_protein),
                           id=record_id,
                           description=record_id,
                           )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
def PdbSeqresIterator(handle):
    """Return SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the SEQRES lines in the PDB file header,
    not the atoms of the 3D structure.

    Two kinds of PDB records are read:

    - SEQRES: the residue names of each chain, translated to one-letter
      codes with ``seq1`` and ``to_one_letter_code``.
    - DBREF: the PDB ID and the sequence database cross-reference of each
      chain. The first DBREF of a chain provides the record's id, name and
      description; every DBREF of the chain contributes two dbxrefs entries.

    SEQADV and MODRES records are not interpreted yet.

    Records are yielded in sorted chain ID order, with the chain ID stored in
    ``record.annotations["chain"]``. A chain without any DBREF record gets
    the bare chain ID as its record id.

    See: http://www.wwpdb.org/documentation/format23/sect3.html
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    for line in handle:
        rec_name = line[0:6].strip()
        if rec_name == 'SEQRES':
            # NB: We only actually need chain ID and the residues here.
            # Unused fields from the wwPDB spec:
            #   line[8:10]   serial number of the SEQRES record for the
            #                current chain (starts at 1 for each chain)
            #   line[13:17]  number of residues in the chain (repeated on
            #                every record)
            # Chain identifier. This may be any single legal character,
            # including a blank which is used if there is only one chain.
            chn_id = line[11]
            residues = [seq1(res, custom_map=to_one_letter_code)
                        for res in line[19:].split()]
            chains[chn_id].extend(residues)
        elif rec_name == 'DBREF':
            # Unused fields from the wwPDB spec:
            #   line[14:18], line[18]  initial sequence number / insertion
            #                          code of the PDB sequence segment
            #   line[20:24], line[24]  ending sequence number / insertion
            #                          code of the PDB sequence segment
            #   line[55:60], line[60]  initial sequence number / insertion
            #                          code of the database segment
            #   line[62:67], line[67]  ending sequence number / insertion
            #                          code of the database segment
            # ID code of this entry (PDB ID)
            pdb_id = line[7:11]
            # Chain identifier.
            chn_id = line[12]
            # Sequence database name.
            database = line[26:32].strip()
            # Sequence database accession code.
            db_acc = line[33:41].strip()
            # Sequence database identification code.
            db_id_code = line[42:54].strip()
            metadata[chn_id].append({'pdb_id': pdb_id,
                                     'database': database,
                                     'db_acc': db_acc,
                                     'db_id_code': db_id_code})
        # ENH: 'SEQADV' 'MODRES'

    # items() rather than iteritems(): identical result once sorted, and it
    # also works on Python 3.
    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id in metadata:
            m = metadata[chn_id][0]
            record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id)
            record.description = ("%s:%s %s" % (m['database'],
                                                m['db_acc'],
                                                m['db_id_code']))
            for melem in metadata[chn_id]:
                record.dbxrefs.extend([
                    "%s:%s" % (melem['database'], melem['db_acc']),
                    "%s:%s" % (melem['database'], melem['db_id_code'])])
        else:
            record.id = chn_id
        yield record
def restype(residue):
    """Translate a residue's three-letter name into a one-letter code.

    The residue's ``resname`` is passed to ``seq1`` with
    ``to_one_letter_code`` as the custom map. Names that cannot be
    translated (e.g. CSD, ANP) are expected to come back as 'X'.
    """
    three_letter_name = residue.resname
    return seq1(three_letter_name, custom_map=to_one_letter_code)
def CifSeqresIterator(handle):
    """Return SeqRecord objects for each chain in an mmCIF file.

    The sequences come from the polymer sequence listing of the mmCIF file,
    not from the atoms of the 3D structure. The residue names are read from
    the _pdbx_poly_seq_scheme records (asym_id and mon_id fields), and the
    database cross-references from the _struct_ref_seq records together with
    the _struct_ref records they point to.

    See:
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v40.dic/Categories/pdbx_poly_seq_scheme.html
    and
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html

    This gets called internally via Bio.SeqIO for the sequence-based
    interpretation of the mmCIF file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    The chain is stored in the annotations dictionary, and every
    _struct_ref_seq entry of the chain adds two items to the database
    cross-references list. A chain without such entries gets the bare chain
    ID as its record id.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    records = MMCIF2Dict(handle)

    # Make sure every field we read is a list (See #1533): a missing field
    # becomes an empty list and a single value becomes a one-item list.
    wanted_fields = (PDBX_POLY_SEQ_SCHEME_FIELDS +
                     STRUCT_REF_SEQ_FIELDS +
                     STRUCT_REF_FIELDS)
    for field in wanted_fields:
        if field in records:
            if not isinstance(records[field], list):
                records[field] = [records[field]]
        else:
            records[field] = []

    # Collect the one-letter codes of each chain in file order.
    scheme_rows = zip(records["_pdbx_poly_seq_scheme.asym_id"],
                      records["_pdbx_poly_seq_scheme.mon_id"])
    for asym_id, mon_id in scheme_rows:
        chains[asym_id].append(seq1(mon_id, custom_map=protein_letters_3to1))

    # Index the _struct_ref records by their id field.
    struct_refs = {
        ref_id: {"database": db_name,
                 "db_id_code": db_code,
                 "db_acc": db_acc}
        for ref_id, db_name, db_code, db_acc in zip(
            records["_struct_ref.id"],
            records["_struct_ref.db_name"],
            records["_struct_ref.db_code"],
            records["_struct_ref.pdbx_db_accession"])
    }

    # For each _struct_ref_seq record, merge the PDB ID with the matching
    # _struct_ref entry and file the result under the chain.
    ref_seq_rows = zip(records["_struct_ref_seq.ref_id"],
                       records["_struct_ref_seq.pdbx_PDB_id_code"],
                       records["_struct_ref_seq.pdbx_strand_id"])
    for ref_id, pdb_id, chain_id in ref_seq_rows:
        # The key names here mirror those in PdbIO
        entry = {'pdb_id': pdb_id}
        entry.update(struct_refs[ref_id])
        metadata[chain_id].append(entry)

    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in metadata:
            record.id = chn_id
        else:
            chain_refs = metadata[chn_id]
            # The first cross-reference names and describes the record.
            first = chain_refs[0]
            record.id = record.name = "%s:%s" % (first['pdb_id'], chn_id)
            record.description = ("%s:%s %s" % (first['database'],
                                                first['db_acc'],
                                                first['db_id_code']))
            for ref in chain_refs:
                record.dbxrefs.extend([
                    "%s:%s" % (ref['database'], ref['db_acc']),
                    "%s:%s" % (ref['database'], ref['db_id_code'])])
        yield record
def AtomIterator(pdb_id, struct):
    """Yield one SeqRecord per chain of the first model of a structure.

    Base function for the sequence parsers that read structures through the
    Bio.PDB parsers. Once such a parser has loaded a file into a
    Bio.PDB.Structure.Structure object, the residue sequence is interpreted
    the same way whatever the file format was, so that shared logic lives
    here.

    Calling functions must pass a Bio.PDB.Structure.Structure object as
    ``struct``; ``pdb_id`` is used as the prefix of every record id.

    Residues whose name translates to "X" are left out. A jump in residue
    numbering is filled with one 'X' per missing residue. If the numbering
    goes backwards after a gap, a warning is issued and the rest of the chain
    is dropped. Chains with no usable residues produce no record.

    See Bio.SeqIO.PdbIO.PdbAtomIterator and Bio.SeqIO.PdbIO.CifAtomIterator
    for details.
    """
    from Bio.SeqUtils import seq1

    def one_letter(residue):
        """Return the one-letter code of a residue.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=protein_letters_3to1)

    first_model = struct[0]
    for chn_id, chain in sorted(first_model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = []
        for res in chain.get_unpacked_list():
            code = seq1(res.get_resname().upper(),
                        custom_map=protein_letters_3to1)
            if code != "X":
                residues.append(res)
        if not residues:
            continue

        # Find the places where consecutive residue numbers are not
        # consecutive integers. Each gap is recorded as (index of the residue
        # after the gap, number before the gap, number after the gap).
        rnumbers = [res.id[1] for res in residues]
        gaps = [
            (idx, before, after)
            for idx, (before, after)
            in enumerate(zip(rnumbers, rnumbers[1:]), start=1)
            if after != before + 1
        ]

        letters = []
        segment_start = 0
        for gap_idx, pregap, postgap in gaps:
            if postgap <= pregap:
                warnings.warn("Ignoring out-of-order residues after a gap",
                              BiopythonParserWarning)
                # Keep the normal part, drop the out-of-order segment
                # (presumably modified or hetatm residues, e.g. 3BEG)
                letters.extend(one_letter(res)
                               for res in residues[segment_start:gap_idx])
                break
            missing = postgap - pregap - 1
            letters.extend(one_letter(res)
                           for res in residues[segment_start:gap_idx])
            segment_start = gap_idx
            letters.append('X' * missing)
        else:
            # Reached when the loop was not cut short, which includes the
            # case of a chain without gaps: add whatever is left.
            letters.extend(one_letter(res)
                           for res in residues[segment_start:])

        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(Seq(''.join(letters), generic_protein),
                           id=record_id, description=record_id)

        # Chain specific annotations; start/end are the numbers of the first
        # and last residue that were kept.
        record.annotations["model"] = first_model.id
        record.annotations["chain"] = chain.id
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])
        yield record
def get_sequence_from_pdb_structure(structure):
    """Return the one-letter residue sequence of a structure as a string.

    Residues are taken in the order ``structure.get_residues()`` yields them.
    Only those whose ``id[0]`` is a single blank are translated with ``seq1``
    and included (presumably this skips hetero residues and waters -- confirm
    against the Bio.PDB residue id convention).
    """
    return ''.join(
        seq1(residue.resname)
        for residue in structure.get_residues()
        if residue.id[0] == ' '
    )
def load_cdna_and_polyA(paths,organism,pep_dict,exon_info,failed): organism_out = open("./Data/"+organism+"_polyA.data",'w') cdna = list(SeqIO.parse(open(paths["cdna"],'r'),"fasta")) cdna_size = len(cdna) cdna_counter = 0 next_step = 0 for cd in cdna: cdna_counter += 1 if (float(cdna_counter*100)/cdna_size) >= next_step: next_step += 10 print str(int(float(cdna_counter*100)/cdna_size))+"%" description = dict([z.split(":",1) for z in cd.description.split()[1:] if ":" in z]) cdna_transcript_id = cd.id cdna_gene = description["gene"] gt = (cdna_gene,cdna_transcript_id) if gt in pep_dict: for p in pep_dict[gt]: protein_id = p.id gene_id = str(gt[0]) pep_sequence = str(seq3(p.seq)) cdna_sequence = str(cd.seq) cdna_translated_list = [] cdna_start = 0 cdna_stop = 0 for x in range(3): cdna_translated_list.append(seq3(str(Seq(cdna_sequence[x:]+"N"*(3-len(cdna_sequence[x:])%3)).translate()))) cut_found = [v for v in range(len(cdna_translated_list)) if pep_sequence in cdna_translated_list[v]] #easy if pep_sequence == cdna_translated_list[0]: cdna_start = 0 cdna_stop = len(cdna_sequence) proper_seq = cdna_sequence AAA_list = findPolyA(proper_seq) grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id]) #cutting elif cut_found: for c in cut_found: idx = c+ cdna_translated_list[c].find(pep_sequence) cdna_start = idx cdna_stop = idx+len(pep_sequence) proper_seq = cdna_sequence[idx:(idx+len(pep_sequence))] AAA_list = findPolyA(proper_seq) grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id]) #alignment else: prot_seq = SeqRecord(Seq(seq1(pep_sequence)),id = "prot_seq") y = open(organism+"prot.fasta",'w') SeqIO.write(prot_seq, y, "fasta") y.close() best = [] for i in range(3): cdna_seq = SeqRecord(Seq(cdna_sequence[i:len(cdna_sequence)-((len(cdna_sequence)-i)%3)]).translate(stop_symbol="W"),id="cdna_seq") k = open(organism+"cdna.fasta",'w') SeqIO.write(cdna_seq, 
k , "fasta") k.close() output = NcbiblastpCommandline(query=organism+"prot.fasta", subject=organism+"cdna.fasta", outfmt=5)()[0] blast_result_records = list(NCBIXML.parse(StringIO(output))) for bl_res in blast_result_records: for z in bl_res.alignments: for h in z.hsps: best.append((h.query,h.sbjct,i,h.sbjct_start, h.query_start,h.score)) if best: l = sorted(best,key=lambda x:x[-1])[-1] proper_seq = cdna_sequence[l[2]+(int(l[3])-1)*3:l[2]+((int(l[3])-1)+len(l[1]))*3] AAA_list = findPolyA(proper_seq) cdna_start = l[2]+(int(l[3])-1)*3 cdna_stop = l[2]+((int(l[3])-1)+len(l[1]))*3 grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id]) else: failed.write(",".join([protein_id,cdna_transcript_id,gene_id,pep_sequence,cdna_sequence])+"\n") os.remove(organism+"cdna.fasta") os.remove(organism+"prot.fasta") cdna = schema.Cdna(transcript_id = cdna_transcript_id, gene_id = cdna_gene, nucleotide_sequence=str(cd.seq),organism_name =organism, cdna_start = cdna_start, cdna_stop =cdna_stop) db.session.add(cdna)