def align(seq1, seq2, debug=False):
    """Locally align two sequences and report the leading-gap offsets.

    Each input is joined into one string, converted with ``seq.seq1`` and
    every 'X' in the converted string is replaced by '-' before aligning.

    seq1, seq2: iterables of strings (presumably three-letter residue
        names -- TODO confirm against callers)
    debug: if true, print the formatted alignment of the offset-trimmed
        sequences

    Returns the tuple (-offset[0], -offset[1]): the negated offsets counted
    for seq1 and seq2 respectively.
    """
    flat1 = seq.seq1(''.join(seq1)).replace('X', '-')
    flat2 = seq.seq1(''.join(seq2)).replace('X', '-')
    flats = [flat1, flat2]
    # aligning 2 to 1 seems to give better results
    # NOTE: this local deliberately or accidentally shadows the function's
    # own name for the rest of the body.
    align = pairwise2.align.localxs(flat2, flat1, -1000, -1000, one_alignment_only=True)
    # NOTE(review): `start` is assigned but never read below.
    start = align[0][3]
    offset = [0, 0]
    # compute how many gaps had to be inserted at beginning to align
    for i in range(2):
        assert len(align[0][0]) == len(align[0][1])
        for j in range(len(align[0][0])):
            # account for the fact that 2 and 1 are switched in alignment results
            # if there is a gap in 1
            if align[0][(i + 1) % 2][j] == '-':
                # but not the other
                if flats[i][j - offset[i]] != '-':
                    offset[i] += 1
            # NOTE(review): this `else` is paired with the outer gap test so
            # that scanning stops at the first non-gap column; the pairing
            # was reconstructed from the comments above -- confirm.
            else:
                break
    if debug:
        print(
            pairwise2.format_alignment(flat2[offset[0]:], flat1[offset[1]:], 10, 0, len(flat1) - offset[1]))
    return -offset[0], -offset[1]
def var3_to_var1(var3):
    """Rewrite a three-letter amino-acid variant in one-letter notation.

    The reference residue, the new residue and the position are read from
    ``var3`` through the ``variant`` helpers. A "*" residue is kept as "*";
    any other residue is converted with ``SeqUtils.seq1``.

    Returns the string "<ref><pos><new>".
    """
    def to_one_letter(aa3):
        # "*" is passed through instead of being handed to seq1
        if aa3 == "*":
            return "*"
        return SeqUtils.seq1(aa3)

    ref_three = variant.get_refAA(var3)
    new_three = variant.get_newAA(var3)
    position = variant.get_pos(var3)

    ref_one = to_one_letter(ref_three)
    new_one = to_one_letter(new_three)
    return ref_one + str(position) + new_one
def get_distances(res_pairs, get_coords):
    '''
    Compute the distance for every residue pair taken from two chains.

    res_pairs: iterable over tuples ((res_a, res_b), ...)
    get_coords: function to get residue coordinates; it is handed on to
        distances.calc_residue_distance unchanged

    Returns a list of 5-tuples, one per pair, in this order:
        [(resn_a, resn_b, dist, aa_a, aa_b), ...]
    where resn_* is id[1] of the residue and aa_* is SeqUtils.seq1 of its
    resname.
    '''
    rows = []
    for res_a, res_b in res_pairs:
        resn_a = res_a.id[1]
        resn_b = res_b.id[1]
        dist = distances.calc_residue_distance(res_a, res_b, get_coords)
        aa_a = SeqUtils.seq1(res_a.resname)
        aa_b = SeqUtils.seq1(res_b.resname)
        rows.append((resn_a, resn_b, dist, aa_a, aa_b))
    return rows
def get_sequences(pdb_id, chain=None):
    '''
    Gets the sequences in a PDB file.

    pdb_id: identifier handed to get_structure
    chain: optional chain id; when given, only chains whose get_id()
        equals it are reported, otherwise every chain is

    Returns a list with one SeqUtils.seq1-converted string per selected
    chain. Only residues that have a 'CA' entry in child_dict contribute.
    '''
    sequences = []
    for chn in get_structure(pdb_id).get_chains():
        if chain is None or chain == chn.get_id():
            # keep only residues that carry an alpha carbon
            resnames = [residue.get_resname()
                        for residue in chn
                        if 'CA' in residue.child_dict]
            sequences.append(SeqUtils.seq1(''.join(resnames)))
    return sequences
def extractPDBdata(structure, adjustChains, substitutionData, verbose):
    """Map residue sequence ids of selected chains to substitution values.

    structure: iterable of models, each an iterable of chains
    adjustChains: container of chain ids to process
    substitutionData: indexed as substitutionData[chainID][n], where entry
        [0] is compared with the one-letter residue name and entry [1] is
        the value to store (presumably a list of pairs per chain -- TODO
        confirm)
    verbose: if true, print a line per stored value

    Returns a dict {chainID: {sequenceID: value}}. Values equal to "-" are
    not stored.
    """
    print('Extracting atoms details from PDB...')
    pdbData = {}
    for model in structure:
        for chain in model:
            chainID = chain.get_id()
            if chainID in adjustChains:
                # Re-created for every model, so a later model overwrites
                # the entries collected for the same chain id earlier.
                pdbData[chainID] = {}
                # Index into substitutionData[chainID]; it only advances
                # when a residue matched the expected letter.
                residueID = 0
                for residue in chain:
                    residueName = SeqUtils.seq1(residue.get_resname())
                    # Skip residues that differ from the expected letter
                    # without consuming the expected entry.
                    if residueName != substitutionData[chainID][residueID][0]:
                        continue
                    (heteroFlag, sequenceID, insertionCode) = residue.get_id()
                    # Only residues with a blank hetero flag are used.
                    if heteroFlag != ' ':
                        continue
                    value = substitutionData[chainID][residueID][1]
                    if value != "-":
                        pdbData[chainID][sequenceID] = value
                        # NOTE(review): nesting of this verbose print under
                        # the `value != "-"` test was reconstructed --
                        # confirm against the original layout.
                        if verbose:
                            print("Chain: " + chainID + "\t residue: " + residueName + " " + str(sequenceID) + "\t value: " + value)
                    residueID += 1
                    # Stop once every expected entry has been consumed.
                    if (residueID >= len(substitutionData[chainID])):
                        break
    print('OK')
    return pdbData
def get_bfactors(self, chain_id):
    '''
    Return the normalised per-residue B-factors of one chain.

    Input:
        self: object whose ``structure`` attribute is indexed with chain_id
        chain_id: String (usually in ['A','B', 'C' ...]. The number of
            chains depends on the specific protein and the resulting
            structure)
    Return:
        A numpy integer array with one entry per residue of the chain.

    For every residue whose name SeqUtils.seq1 does not map to 'X', the
    B-factors of all its atoms are averaged. The per-residue means are then
    converted to standard scores (zero mean, unit variance). Mean and
    standard deviation are computed with the NaN-aware numpy functions so
    that a residue without a usable B-factor does not turn every score into
    NaN. The scores are finally cast to the builtin ``int`` (the removed
    ``np.int`` alias meant the same type), which truncates towards zero.

    NOTE: an integer array cannot hold NaN, so the entry of a residue whose
    mean is NaN has no meaningful value after the cast.
    '''
    residue_means = np.array(
        [
            np.mean([a.get_bfactor() for a in r])
            for r in self.structure[chain_id]
            if SeqUtils.seq1(r.get_resname()) != 'X'
        ],
        dtype=float,
    )
    # NaN-aware statistics: identical to mean/std when no NaN is present.
    mean_bf = np.nanmean(residue_means)
    std_bf = np.nanstd(residue_means)
    z_scores = (residue_means - mean_bf) / std_bf
    return z_scores.astype(int)
def get_distances(res_pairs, get_coords):
    '''
    Get distances for all pairs of residues between two chains.

    res_pairs: generator over tuples ((res_a, res_b), ...)
    get_coords: function to get residue coordinates, forwarded to
        distances.calc_residue_distance

    Returns a list over 5-tuples in the order the code builds them:
        [(resn_a, resn_b, dist, aa_a, aa_b), ...]
    '''
    def describe(res_a, res_b):
        # residue numbers, then the distance, then the one-letter codes
        return (
            res_a.id[1],
            res_b.id[1],
            distances.calc_residue_distance(res_a, res_b, get_coords),
            SeqUtils.seq1(res_a.resname),
            SeqUtils.seq1(res_b.resname),
        )

    return [describe(first, second) for (first, second) in res_pairs]
def generate_wt_seqs(peptides):
    """Reconstruct wild-type sequences for mutated peptides.

    For every peptide and every transcript returned by its
    get_all_transcripts(), the peptide's letters are copied and the
    variants found at its protein positions are undone where possible.

    peptides: iterable of peptide objects providing get_all_transcripts(),
        get_protein_positions() and get_variants_by_protein_position()

    Returns a dict keyed by '<str(peptide)>_<transcript_id>'. The value is
    np.nan when any variant could not be reverted, otherwise the
    reconstructed sequence string. No entry is written when the last
    inspected position carried no variants.
    """
    wt_dict = {}
    substitution_re = re.compile("([a-zA-Z]+)([0-9]+)([a-zA-Z]+)")
    deletion_re = re.compile("([a-zA-Z]+)([0-9]+)")
    for x in peptides:
        for t in x.get_all_transcripts():
            mut_seq = list(x)
            result_key = '{}_{}'.format(str(x), t.transcript_id)
            # the coding annotation is keyed by the part before the first ':'
            coding_id = t.transcript_id.split(':')[0]
            not_available = False
            variant_available = False
            for p in x.get_protein_positions(t.transcript_id):
                variant_dic = x.get_variants_by_protein_position(
                    t.transcript_id, p)
                # Overwritten on every position (kept as in the original):
                # only the last position decides whether an entry is made.
                variant_available = bool(variant_dic)
                for key in variant_dic:
                    for v in variant_dic[key]:
                        mut_syntax = v.coding[coding_id].aaMutationSyntax
                        # presumably types 3/4/5 are frameshift/unknown
                        # variants -- TODO confirm against the enum
                        if v.type in [3, 4, 5] or '?' in mut_syntax:
                            not_available = True
                        elif v.type in [1]:
                            m = deletion_re.match(mut_syntax.split('.')[1])
                            if m is None:
                                # Unparseable syntax: treat it like the
                                # substitution branch does instead of
                                # crashing on m.groups().
                                not_available = True
                            else:
                                # re-insert the removed residue
                                mut_seq.insert(key, SeqUtils.seq1(m.groups()[0]))
                        elif v.type in [2]:
                            not_available = True
                        else:
                            m = substitution_re.match(mut_syntax.split('.')[1])
                            if m is None:
                                not_available = True
                            else:
                                # put the reference residue back in place
                                mut_seq[key] = SeqUtils.seq1(m.groups()[0])
            if not_available:
                wt_dict[result_key] = np.nan
            elif variant_available:
                wt_dict[result_key] = ''.join(mut_seq)
    return wt_dict
def changeBfactors(structure, adjustChains, substitutionData, verbose, noDataValue):
    """Overwrite atom B-factors with values from substitutionData.

    structure: iterable of models, each an iterable of chains
    adjustChains: container of chain ids whose residues get substituted
        values; atoms of every other chain get float(noDataValue)
    substitutionData: indexed as substitutionData[chainID][n], entry [0]
        being the expected one-letter residue name and entry [1] the new
        B-factor
    verbose: if true, print warnings and every B-factor change
    noDataValue: B-factor for chains that are not in adjustChains

    Returns the same structure object after modification.
    """
    def assign(chain_id, residue_name, target, new_value):
        # report (when verbose) and then store the new B-factor
        if verbose:
            print("Chain: " + chain_id + "\t residue: " + residue_name + "\t atom: " + str(target.get_full_id()) + " \t b-factor: " + str(target.get_bfactor()) + " => " + str(new_value))
        target.set_bfactor(new_value)

    print('Changing b-factors of atoms in PDB...')
    for model in structure:
        for chain in model:
            chainID = chain.get_id()
            if chainID not in adjustChains:
                # chains without data get the fallback value everywhere
                for residue in chain:
                    for atom in residue:
                        atom.set_bfactor(float(noDataValue))
                continue

            expected_index = 0
            for residue in chain:
                residueName = SeqUtils.seq1(residue.get_resname())
                expected_name = substitutionData[chainID][expected_index][0]
                if residueName != expected_name:
                    if verbose:
                        print("WARNING: Unexpected residue " + str(expected_index) + " in chain " + str(chainID))
                        print("Expected residue " + substitutionData[chainID][expected_index][0] + " got " + residueName)
                    continue
                (heteroFlag, sequenceID, insertionCode) = residue.get_id()
                if heteroFlag != ' ':
                    continue
                for atom in residue:
                    value = float(substitutionData[chainID][expected_index][1])
                    if not atom.is_disordered():
                        assign(chainID, residueName, atom, value)
                        continue
                    # disordered atoms: update every alternative location
                    atom.__class__ = Atom.DisorderedAtom
                    for altloc in atom.disordered_get_id_list():
                        assign(chainID, residueName, atom.disordered_get(altloc), value)
                expected_index += 1
                if expected_index >= len(substitutionData[chainID]):
                    break
    print('OK')
    return structure
def get_number_of_water_molecules(self, chain_id):
    '''
    Count the water molecules of one chain.

    Input:
        self: object whose ``structure`` attribute is indexed with chain_id
            and yields Bio.PDB residues
        chain_id: String (usually in ['A','B', 'C' ...]. The number of
            chains depends on the specific protein and the resulting
            structure)
    Return:
        The number of water molecules of the given chain as an integer.

    Waters are recognised by the hetero flag "W" in the first field of the
    residue id. Testing for a residue name that SeqUtils.seq1 maps to "X"
    would also count ligands, ions and other non-standard residues.
    '''
    return sum(1 for r in self.structure[chain_id] if r.get_id()[0] == "W")
def get_dihedral( residue_list ):
    '''
    Compute the phi and psi angles of the middle residue of a triplet.

    residue_list - []Bio.PDB.Residue - list of 3 *hopefully* continuous
        residues

    Returns ( [phi, psi], res_name ) where res_name is the SeqUtils.seq1
    code of the middle residue.

    Raises BackboneError when consecutive residue numbers do not differ by
    exactly one, when is_aa rejects the middle residue, or when check_dict
    reports a backbone atom as missing.
    '''
    # neighbouring residues must be numbered consecutively
    for previous, following in zip( residue_list[:-1], residue_list[1:] ):
        gap = following.get_id()[1] - previous.get_id()[1]
        if gap != 1:
            raise BackboneError( "Discontinuous residues", following.get_id()[1] )

    # backbone atoms needed from the previous, middle and next residue;
    # False marks an atom that has not been found yet
    atoms = (
        {"C": False},
        {"N": False, "CA": False, "C": False},
        {"N": False},
    )

    for i, residue in enumerate( residue_list ):
        if i == 1:
            res_name = SeqUtils.seq1( residue.get_resname() )
            if not is_aa( res_name ):
                raise BackboneError( "Not a valid amino acid", residue.get_id()[1] )
        for atom in residue.get_unpacked_list():
            if atom.name in atoms[i]:
                atoms[i][ atom.name ] = atom.get_vector()

    if False in map( check_dict, atoms ):
        raise BackboneError( "Missing backbone atoms", residue.get_id()[1] )

    before, middle, after = atoms
    phi = PDB.calc_dihedral( before["C"], middle["N"], middle["CA"], middle["C"] )
    psi = PDB.calc_dihedral( middle["N"], middle["CA"], middle["C"], after["N"] )
    return ( [ phi, psi ], res_name )
def get_sequence(self, chain_id):
    '''
    Return the one-letter amino acid sequence of a chain.

    Input:
        self: object whose ``structure`` attribute is indexed with chain_id
        chain_id: String (usually in ['A','B', 'C' ...]. The number of
            chains depends on the specific protein and the resulting
            structure)
    Return:
        The sequence (single-letter alphabet) as a string. Residues that
        SeqUtils.seq1 maps to "X" are left out.
    '''
    codes = (SeqUtils.seq1(r.get_resname()) for r in self.structure[chain_id])
    return ''.join(code for code in codes if code != "X")
def get_codon_usage(cuspp):
    """
    Build the codon usage table from a cusp output file.

    :param cuspp: path to cusp generated file
    :returns: codon usage table (pandas dataframe) indexed by 'Codon', with
        an extra 'amino acid' column holding SeqUtils.seq1 of '#AA'
    """
    table = pd.read_csv(cuspp, sep='\t', header=5)
    # the header row is read as column labels; joined and split on spaces,
    # the last name labels the data column and the rest the index levels
    header_names = ''.join(table.columns.tolist()).split(' ')
    table.columns = [header_names[-1]]
    table.index.names = header_names[:-1]
    table = table.reset_index()
    table = table.set_index('Codon')
    table['amino acid'] = list(map(SeqUtils.seq1, table['#AA']))
    return table
def get_sequence(self, chain_id):
    """
    Return the one-letter amino acid sequence of a chain.

    Input:
        self: object whose ``structure`` attribute provides get_chains()
        chain_id: String (usually in ['A','B', 'C' ...]. The number of
            chains depends on the specific protein and the resulting
            structure)
    Return:
        The sequence (single-letter alphabet) of the first chain whose id
        equals chain_id, with every "X" removed, or None when no chain
        matches.
    """
    for chain in self.structure.get_chains():
        if chain.id != chain_id:
            continue
        letters = "".join(SeqUtils.seq1(res.resname) for res in chain.get_list())
        return letters.replace("X", "")
    return None
def Chain_to_SeqRecord(chain):
    '''
    Build a SeqRecord from a Chain entity.

    chain: a Bio.PDB.Chain object

    Only the residues returned by get_nonhet_residues(chain) are used.

    Returns
        seqr: a Bio.SeqRecord object whose id is chain.id and whose
        letter_annotations['resnum'] lists id[1] of every residue kept.
    '''
    # materialise once so the residues can be walked twice
    residues = list(get_nonhet_residues(chain))
    one_letter = ''.join(SeqUtils.seq1(res.get_resname()) for res in residues)
    numbers = [res.id[1] for res in residues]
    return SeqRecord.SeqRecord(
        Seq.Seq(one_letter),
        id=chain.id,
        letter_annotations={"resnum": numbers},
    )
def Chain_to_SeqRecord(chain):
    '''
    Turn a Chain entity into a SeqRecord.

    chain: a Bio.PDB.Chain object

    The residues come from get_nonhet_residues(chain), so only those are
    represented in the record.

    Returns
        seqr: a Bio.SeqRecord object with id chain.id and the residue
        numbers (id[1]) stored in letter_annotations['resnum'].
    '''
    letters = []
    resnums = []
    for res in get_nonhet_residues(chain):
        letters.append(SeqUtils.seq1(res.get_resname()))  # 1-letter resname
        resnums.append(res.id[1])
    sequence = Seq.Seq(''.join(letters))
    annotations = {"resnum": resnums}
    return SeqRecord.SeqRecord(sequence, id=chain.id, letter_annotations=annotations)
def parse_change(self, change):
    """Split a "<ref>/<alt> -> ..." change string into one-letter codes.

    Only the text before the first "->" is used. After stripping, it must
    contain exactly one "/" separating the reference and the alternative
    residue; otherwise the unpacking raises ValueError.

    Returns the tuple (ref, alt), both converted with SeqUtils.seq1.
    """
    left_side = change.split("->")[0]
    ref_three, alt_three = left_side.strip().split("/")
    ref_one = SeqUtils.seq1(ref_three)
    alt_one = SeqUtils.seq1(alt_three)
    return (ref_one, alt_one)
if verbose: print '\n', linia asequence = sequence_lines[i][:f_length].replace('.', 'n') bsequence = sequence[start - 1:end] if strand == "-": bsequence = str(Seq(bsequence).reverse_complement()) mfe = RNA.energy_of_structure(bsequence, f_struct, 0) if verbose: print f_struct, mfe print asequence print bsequence score = str(-1 * mfe) # score = elementy_linii[3] #this was actually wrong - it is relative anticodon position! gname = elementy_linii[1].split("-")[1] label = SeqUtils.seq1(gname) anticodon_position = int(elementy_linii[3]) - 1 # anticodon = elementy_linii[4] if True: ###gname in ['Ser','Leu','Met']: # print elementy_linii[4] if gname in ['Ser', 'Arg', 'Gly' ] and elementy_linii[4][2:4] == 'ct': label = label + '2' if ttable_id == '4' and gname == 'Arg' and elementy_linii[4][ 2:4] == 'cg': label = label + '1' if ttable_id == '13' and gname == 'Gly' and elementy_linii[4][ 2:4] == 'cc': label = label + '1' if gname == 'Ser' and elementy_linii[4][2:4] == 'ga': label = label + '1'
def load(self):
    '''
    Read the CSV file at self.csv_db_path into the dataframe self._df and
    add derived columns (codon, change, ref, mutation, mut_type, rv, gene,
    nu_pos, nu_ref, nu_alt, raw, rna).

    The first two lines of the file are skipped. Lines with fewer than 14
    comma-separated fields are collected as errors and only counted in the
    log.

    NOTE: this is Python 2 code (print statement, map() results assigned
    directly as columns).

    Example of some of the columns:

       GeneID Drug GeneName NucleotidePosition Polymorphism EstimatedCodonPosition ReportedCodonPosition AminoAcid
    0  Rv3795 EMB  embB     1121-1122          GGC/GTG      374                    374                   Gly/Val
    1  Rv3795 EMB  embB     1123-1124          CCG/GCG      375                    375                   Pro/Ala
    '''
    # Column names, assigned positionally to the comma-separated fields.
    cols = ['ID', 'GeneID', 'SeqNo', 'Drug', 'GeneName', 'ApprovalLevel', 'AddedByUser', 'PrimaryReference', 'ReviewReferece', 'SecondaryReference', 'NucleotidePosition', 'Polymorphism', 'EstimatedCodonPosition', 'ReportedCodonPosition', 'AminoAcid', 'Note', 'TimePeriod', 'StudyPopulation', 'Country', 'MolecularDetectionMethod', 'GeneCoverage', 'ResistancePattern', 'MIC', 'SusceptibilityTestingMethod', 'RTotalIsolates', 'RSIsolatesWMutation', 'AdditionalMutations', 'HighQuality', 'PMID', 'OverallSequentialNumber', 'NoHC', 'Temp1', 'Temp2', 'Temp3', 'Temp4', 'Temp5', 'MutationType']
    self._df = pd.DataFrame()
    errors = []
    i = 0
    rvs = []
    with open(self.csv_db_path, "r") as handle:
        lines = handle.readlines()
        _log.debug("Total lines count: " + str(len(lines)))
        for idx, line in enumerate(lines):
            i = i + 1
            # skip the first two lines of the file
            if i < 3:
                continue
            # NOTE(review): a plain split does not honour quoted commas.
            split = line.split(",")
            if len(split) >= 14:
                dict_snp = {"line": line}
                j = 0
                # NOTE(review): cols has more than 14 entries, so a line
                # that passes the length test above can still raise
                # IndexError here -- confirm the intended minimum.
                for col in cols:
                    dict_snp[col] = split[j]
                    j = j + 1
                # Report records whose Polymorphism and AminoAcid fields
                # both lack "/", "ins" and "del".
                if (not (("/" in dict_snp["Polymorphism"]) or ("ins" in dict_snp["Polymorphism"]) or (
                        "del" in dict_snp["Polymorphism"])) and not (("/" in dict_snp["AminoAcid"]) or ("ins" in dict_snp["AminoAcid"]) or (
                        "del" in dict_snp["AminoAcid"]))
                ):
                    print dict_snp["ID"]
                # NOTE(review): the append is placed outside the `if`
                # above; that nesting was reconstructed -- confirm.
                self._df = self._df.append(dict_snp, ignore_index=True)
                rvs.append(dict_snp["GeneID"])
                # i = i -1
                # if not i:
                #     break
                # (rv,nucleotide,polimorphism,codon_aa,change_aa,drug) = (split[1],split[10],split[11],split[14],split[12],split[3])
                # if isinstance( codon_aa,int):
                #     snps.append((rv,nucleotide,polimorphism,codon_aa,change_aa,drug))
                # else:
                #     errors.append(line)
            else:
                errors.append(line)
    # Derived columns, each computed from the rows or from a raw column.
    self._df["codon"] = map(self._codon_position, [x for i, x in self._df.iterrows()])
    self._df["change"] = map(self._mutated_aa, [x for i, x in
                                                 self._df.iterrows()])
    # One-letter reference / mutated residue, None when there is no change.
    self._df["ref"] = map(lambda x: bpsutils.seq1(x[0]) if x else None, self._df["change"])
    self._df["mutation"] = map(lambda x: bpsutils.seq1(x[1]) if x else None, self._df["change"])
    self._df["mut_type"] = map(self._mutation_type, [x for i, x in self._df.iterrows()])
    self._df["rv"] = map(lambda x: x.lower(), self._df["GeneID"])
    self._df["gene"] = map(lambda x: x.lower(), self._df["GeneName"])
    self._df["nu_pos"] = map(self.nu_pos, self._df["NucleotidePosition"])
    self._df["nu_ref"] = map(lambda x: x.split("/")[0].strip(), self._df["Polymorphism"])
    self._df["nu_alt"] = map(self.nu_alt, self._df["Polymorphism"])
    # NOTE(review): `line` is the last line read, so every row gets the
    # same "raw" value; the per-row text is already in the "line" column.
    self._df["raw"] = line
    # Rows without an amino-acid change are flagged in the "rna" column.
    self._df["rna"] = map(lambda x: True if x == None else False, self._df["change"])
    _log.info("SNPs loaded:" + str(len(self._df)))
    _log.info("Errors: " + str(len(errors)))
    _log.info("RV count: " + str(len(set(rvs))))
def load_in_sndg(self, organism="H37Rv"):
    """Store the variants held in self._df as "tbdream" protein features.

    organism: value used to select Protein documents and, by name, the
        SeqCollection to update

    Steps performed:
      1. remove existing features of type "tbdream" from the organism's
         proteins;
      2. reset the "search.<name>" flags to False and register any missing
         druggability parameter on the collection ("resistance" plus one
         per entry of TBDream.drugs);
      3. for every "rv" group of self._df with a matching protein, append
         one Feature per row and set the protein's search flags.

    Returns nothing; rows whose position cannot be determined are skipped
    with a warning.
    """
    # Imported here rather than at module level.
    from SNDG.BioMongo.Model.Protein import Protein
    from SNDG.BioMongo.Model.Feature import Feature, Location
    from SNDG.BioMongo.Model.SeqCollection import SeqCollection
    from SNDG.BioMongo.Model.SeqColDruggabilityParam import SeqColDruggabilityParamTypes, SeqColDruggabilityParam
    from bson.objectid import ObjectId
    # Each tuple: name, description, target, type, options, default value,
    # default operation, default group operation.
    search_params = [("resistance", "Associated with resistance", "variant-db",
                      SeqColDruggabilityParamTypes.value, ["true", "false"], "true", "equal", "avg")
                     ]
    search_params = search_params + [
        (x, "Associated with " + x + " resistance", "variant-db",
         SeqColDruggabilityParamTypes.value, ["true", "false"], "true", "equal", "avg")
        for x in TBDream.drugs
    ]
    # Drop features written by a previous run.
    Protein.objects(organism=organism).update(__raw__={"$pull": {"features": {"type": "tbdream"}}})
    collection = SeqCollection.objects(name=organism).get()
    for name, description, target, _type, options, defaultValue, defaultOperation, defaultGroupOperation in search_params:
        # Reset the flag on every protein of the organism.
        Protein.objects(organism=organism).update(__raw__={"$set": {"search."
                                                                    + name: False}})
        if not collection.has_druggability_param(name):
            dp = SeqColDruggabilityParam(name=name, description=description, target=target,
                                         type=_type, uploader="demo")
            dp.options = options
            dp.defaultValue = defaultValue
            dp.defaultOperation = defaultOperation
            dp.defaultGroupOperation = defaultGroupOperation
            collection.druggabilityParams.append(dp)
    # NOTE(review): saving once after the loop is a reconstruction of the
    # original nesting -- confirm.
    collection.save()
    for rv, rows in self._df.groupby("rv"):
        prot = list(Protein.objects(organism=organism, gene__iexact=rv))
        # Groups without a matching protein are ignored.
        if prot:
            prot = prot[0]
            for _, r in rows.iterrows():
                mut = None
                if r.change:
                    change = str(r.change[0]) + "/" + str(r.change[1])
                    mut = SeqUtils.seq1(r.change[1])
                else:
                    change = r.AminoAcid
                # Without a codon, try to read the position from AminoAcid.
                if math.isnan(r.codon):
                    try:
                        pos = int(r.AminoAcid)
                    # NOTE(review): bare except also hides unrelated errors.
                    except:
                        _log.warn("couldnt find the variant position")
                        continue
                else:
                    pos = int(r.codon)
                # RTotalIsolates is expected as "<resistant>/<total>"; any
                # failure to parse it leaves both values as None.
                try:
                    res, t = r.RTotalIsolates.strip().split("/")
                    r_div_total_coef = int(res) * 1.0 / int(t)
                    r_div_total = r.RTotalIsolates.strip()
                except:
                    r_div_total = None
                    r_div_total_coef = None
                quals = {
                    "drug": r.Drug, "change": change, "gene": r.GeneID,
                    "pattern": r.ResistancePattern, "additional": r.AdditionalMutations,
                    "r_div_total": r_div_total, "r_div_total_coef": r_div_total_coef,
                    "mic": r.MIC}
                if mut:
                    quals["mut"] = mut
                fvariant = Feature(_id=ObjectId(), location=Location(start=pos, end=pos),
                                   type="tbdream", identifier="TBDream id " + r.ID,
                                   qualifiers=quals)
                prot.features.append(fvariant)
                prot.search.resistance = True
                prot.search[r.Drug] = True
            # NOTE(review): saving once per protein after its rows is a
            # reconstruction of the original nesting -- confirm.
            prot.save()