Esempio n. 1
0
def align(seq1, seq2, debug=False):
    """Locally align two sequences and report their leading gap offsets.

    Each input is joined into one string, converted with ``seq.seq1`` and
    every ``'X'`` in the result is replaced by ``'-'``.  The second
    flattened sequence is then aligned against the first with
    ``pairwise2.align.localxs`` (gap open and gap extend penalties of
    -1000) and only the first alignment returned is used.

    Args:
        seq1: iterable of strings accepted by ``''.join`` (presumably
            three-letter residue names -- confirm against callers).
        seq2: same as ``seq1``, for the second sequence.
        debug: when true, print the alignment formatted by
            ``pairwise2.format_alignment``.

    Returns:
        A 2-tuple ``(-offset_1, -offset_2)``.  ``offset_k`` is the number
        of leading gap columns of the aligned form of sequence ``k`` that
        face a non-``'-'`` character of the flattened input.
    """
    flat1 = seq.seq1(''.join(seq1)).replace('X', '-')
    flat2 = seq.seq1(''.join(seq2)).replace('X', '-')
    flats = [flat1, flat2]
    # aligning 2 to 1 seems to give better results
    alignments = pairwise2.align.localxs(flat2,
                                         flat1,
                                         -1000,
                                         -1000,
                                         one_alignment_only=True)
    best = alignments[0]
    # both aligned strings must span the same number of columns
    assert len(best[0]) == len(best[1])
    offset = [0, 0]
    # compute how many gaps had to be inserted at beginning to align
    for i in range(2):
        # account for the fact that 2 and 1 are switched in alignment results
        aligned = best[(i + 1) % 2]
        for j, symbol in enumerate(aligned):
            if symbol != '-':
                # first non-gap column ends the leading run
                break
            # a gap in the alignment, but not in the flattened input
            if flats[i][j - offset[i]] != '-':
                offset[i] += 1
    if debug:
        print(
            pairwise2.format_alignment(flat2[offset[0]:], flat1[offset[1]:],
                                       10, 0,
                                       len(flat1) - offset[1]))
    return -offset[0], -offset[1]
 def var3_to_var1(var3):
     """Rewrite a variant that uses three-letter codes with one-letter codes.

     The reference residue, the new residue and the position are read from
     ``var3`` through the ``variant`` helpers.  Both residues are converted
     with ``SeqUtils.seq1``; a residue equal to ``"*"`` is kept as ``"*"``.

     Returns the string ``<ref><pos><new>``.
     """
     ref_three = variant.get_refAA(var3)
     new_three = variant.get_newAA(var3)
     position = variant.get_pos(var3)

     if ref_three == "*":
         ref_one = "*"
     else:
         ref_one = SeqUtils.seq1(ref_three)

     if new_three == "*":
         new_one = "*"
     else:
         new_one = SeqUtils.seq1(new_three)

     return ''.join([ref_one, str(position), new_one])
Esempio n. 3
0
def get_distances(res_pairs, get_coords):
    ''' Compute the distance for every residue pair that is passed in.

        res_pairs: iterable over tuples (res_a, res_b)
        get_coords: function to get residue coordinates; forwarded
                    unchanged to distances.calc_residue_distance

        Returns a list of 5-tuples, in this order:
            [(resn_a, resn_b, dist, aa_a, aa_b), ...]
        where resn_* is res.id[1] and aa_* is SeqUtils.seq1(res.resname).

    '''
    rows = []
    for res_a, res_b in res_pairs:
        resn_a = res_a.id[1]
        resn_b = res_b.id[1]
        dist = distances.calc_residue_distance(res_a, res_b, get_coords)
        aa_a = SeqUtils.seq1(res_a.resname)
        aa_b = SeqUtils.seq1(res_b.resname)
        rows.append((resn_a, resn_b, dist, aa_a, aa_b))
    return rows
Esempio n. 4
0
def get_sequences(pdb_id, chain=None):
    '''Return one-letter sequences for the chains of a PDB structure.

    The structure is obtained with get_structure(pdb_id).  For every chain
    (or only the chain whose id equals ``chain`` when it is given) the
    residue names of residues that have a 'CA' child are joined and
    converted with SeqUtils.seq1.

    Returns a list with one sequence string per selected chain.
    '''
    sequences = []
    for chn in get_structure(pdb_id).get_chains():
        if chain is None or chain == chn.get_id():
            resnames = []
            for residue in chn:
                # residues without a 'CA' child are left out
                if 'CA' in residue.child_dict:
                    resnames.append(residue.get_resname())
            sequences.append(SeqUtils.seq1(''.join(resnames)))
    return sequences
Esempio n. 5
0
def extractPDBdata(structure, adjustChains, substitutionData, verbose):
    """Collect per-residue substitution values for the selected chains.

    Every chain whose id is in ``adjustChains`` is walked in step with
    ``substitutionData[chainID]``, a sequence of entries whose item 0 is
    the expected one-letter residue code and whose item 1 is the value for
    that residue.  A residue that does not match the expected code, or
    whose hetero flag is not blank, is skipped without advancing through
    the substitution entries.

    Args:
        structure: iterable of models, each an iterable of chains.
        adjustChains: container of chain ids to process.
        substitutionData: mapping chain id -> sequence of (code, value, ...).
        verbose: when true, print one line per accepted residue.

    Returns:
        dict mapping chain id -> {residue sequence number: value}.  Entries
        whose value is ``"-"`` are omitted.  A chain with no substitution
        entries maps to an empty dict.
    """
    print('Extracting atoms details from PDB...')
    pdbData = {}
    for model in structure:
        for chain in model:
            chainID = chain.get_id()
            if chainID not in adjustChains:
                continue
            # NOTE: a chain id seen in several models is reset for each model
            pdbData[chainID] = {}
            entries = substitutionData[chainID]
            if len(entries) == 0:
                # nothing to map onto this chain; indexing entry 0 below
                # would otherwise raise IndexError on the first residue
                continue
            residueID = 0
            for residue in chain:
                residueName = SeqUtils.seq1(residue.get_resname())
                if residueName != entries[residueID][0]:
                    continue
                (heteroFlag, sequenceID, insertionCode) = residue.get_id()
                if heteroFlag != ' ':
                    continue
                value = entries[residueID][1]
                if value != "-":
                    pdbData[chainID][sequenceID] = value
                if verbose:
                    print("Chain: " + chainID + "\t residue: " + residueName + " " + str(sequenceID) + "\t value: " + value)
                residueID += 1
                if residueID >= len(entries):
                    # all substitution entries consumed for this chain
                    break
    print('OK')
    return pdbData
Esempio n. 6
0
    def get_bfactors(self, chain_id):
        '''
            Return standardized per-residue B-factors of one chain.

            Input:
                self: object whose ``structure`` attribute is indexable by
                      chain id (a Biopython.PDB structure/model is expected)
                chain_id  : String (usually in ['A','B', 'C' ...])
            Return:
                A numpy integer array with one entry per residue of the
                chain, skipping residues that SeqUtils.seq1 maps to 'X'.

                For each residue the B-factors of its atoms are averaged.
                The per-residue means are then converted to standard scores
                (zero mean, unit variance) using the nan-aware statistics,
                so a residue whose mean is nan does not turn every score
                into nan.

                The scores are cast with ``astype(int)``, which truncates
                toward zero rather than rounding to nearest.
                An empty integer array is returned when no residue
                qualifies.
        '''
        residue_means = [
            np.mean([atom.get_bfactor() for atom in residue])
            for residue in self.structure[chain_id]
            if SeqUtils.seq1(residue.get_resname()) != 'X'
        ]
        if not residue_means:
            # avoid "mean of empty slice" warnings on an empty selection
            return np.array([], dtype=int)
        bfactors = np.array(residue_means, dtype=float)
        scores = (bfactors - np.nanmean(bfactors)) / np.nanstd(bfactors)
        # np.int was removed in NumPy 1.24; the builtin int is equivalent
        return scores.astype(int)
def get_distances(res_pairs, get_coords):
    ''' Get the distance for each residue pair that is supplied.

        res_pairs: iterable over tuples (res_a, res_b)
        get_coords: function to get residue coordinates; handed on to
                    distances.calc_residue_distance

        Returns a list of 5-tuples ordered as
            (resn_a, resn_b, dist, aa_a, aa_b)
        with resn_* taken from res.id[1] and aa_* produced by
        SeqUtils.seq1(res.resname).

    '''

    def describe(pair):
        # one output row for one (res_a, res_b) pair
        res_a, res_b = pair
        return (res_a.id[1], res_b.id[1],
                distances.calc_residue_distance(res_a, res_b, get_coords),
                SeqUtils.seq1(res_a.resname), SeqUtils.seq1(res_b.resname))

    return [describe(pair) for pair in res_pairs]
Esempio n. 8
0
def generate_wt_seqs(peptides):
    """Build sequences with variant positions reverted to the reference.

    For every peptide and each of its transcripts, the variants reported by
    ``get_variants_by_protein_position`` are undone on a copy of the
    peptide's residues, using the residue name parsed from the variant's
    ``aaMutationSyntax`` (converted with ``SeqUtils.seq1``):

    * type 3, 4 or 5, type 2, a ``'?'`` in the syntax, or a syntax that
      does not parse: the result for that transcript is ``np.nan``;
    * type 1: the parsed residue is inserted at the variant key;
    * any other type: the residue at the variant key is replaced.

    Returns a dict keyed by ``'<peptide>_<transcript_id>'``.  A transcript
    gets no entry when nothing was marked unavailable and the last
    position inspected had no variants.
    """
    ref_pos_alt_re = re.compile("([a-zA-Z]+)([0-9]+)([a-zA-Z]+)")
    ref_pos_re = re.compile("([a-zA-Z]+)([0-9]+)")

    wt_dict = {}
    for peptide in peptides:
        for transcript in peptide.get_all_transcripts():
            residues = list(peptide)
            positions = peptide.get_protein_positions(transcript.transcript_id)
            unavailable = False
            has_variants = False
            for pos in positions:
                by_key = peptide.get_variants_by_protein_position(
                    transcript.transcript_id, pos)
                # only the last inspected position decides this flag
                has_variants = bool(by_key)
                for key in by_key:
                    for var in by_key[key]:
                        coding_key = transcript.transcript_id.split(':')[0]
                        syntax = var.coding[coding_key].aaMutationSyntax
                        if var.type in (3, 4, 5) or '?' in syntax:
                            unavailable = True
                        elif var.type == 1:
                            hit = ref_pos_re.match(syntax.split('.')[1])
                            residues.insert(key, SeqUtils.seq1(hit.group(1)))
                        elif var.type == 2:
                            unavailable = True
                        else:
                            hit = ref_pos_alt_re.match(syntax.split('.')[1])
                            if hit is None:
                                unavailable = True
                            else:
                                residues[key] = SeqUtils.seq1(hit.group(1))
            if unavailable or has_variants:
                label = '{}_{}'.format(str(peptide), transcript.transcript_id)
                wt_dict[label] = np.nan if unavailable else ''.join(residues)
    return wt_dict
Esempio n. 9
0
def changeBfactors(structure, adjustChains, substitutionData, verbose,
                   noDataValue):
    """Overwrite atom b-factors with per-residue substitution values.

    Chains whose id is not in ``adjustChains`` get ``float(noDataValue)``
    on every atom.  The other chains are walked in step with
    ``substitutionData[chainID]``, a sequence of entries whose item 0 is
    the expected one-letter residue code and whose item 1 is the new
    b-factor.  Residues that do not match the expected code, or whose
    hetero flag is not blank, are skipped without advancing through the
    substitution entries.  For disordered atoms every alternate location
    is updated.

    Returns the same ``structure`` object, modified in place.
    """

    def assign(target, chain_name, residue_name, new_value):
        # report (when verbose), then overwrite one atom's b-factor
        if verbose:
            print("Chain: " + chain_name + "\t residue: " + residue_name +
                  "\t atom: " + str(target.get_full_id()) +
                  " \t b-factor: " + str(target.get_bfactor()) + " => " +
                  str(new_value))
        target.set_bfactor(new_value)

    print('Changing b-factors of atoms in PDB...')
    for model in structure:
        for chain in model:
            chainID = chain.get_id()
            if chainID not in adjustChains:
                # no data for this chain: stamp the placeholder everywhere
                for residue in chain:
                    for atom in residue:
                        atom.set_bfactor(float(noDataValue))
                continue

            position = 0
            for residue in chain:
                residueName = SeqUtils.seq1(residue.get_resname())
                if residueName != substitutionData[chainID][position][0]:
                    if verbose:
                        print("WARNING: Unexpected residue " +
                              str(position) + " in chain " + str(chainID))
                        print("Expected residue " +
                              substitutionData[chainID][position][0] +
                              " got " + residueName)
                    continue
                heteroFlag, _sequenceID, _insertionCode = residue.get_id()
                if heteroFlag != ' ':
                    continue
                for atom in residue:
                    value = float(substitutionData[chainID][position][1])
                    if not atom.is_disordered():
                        assign(atom, chainID, residueName, value)
                        continue
                    atom.__class__ = Atom.DisorderedAtom
                    for altloc in atom.disordered_get_id_list():
                        assign(atom.disordered_get(altloc), chainID,
                               residueName, value)
                position += 1
                if position >= len(substitutionData[chainID]):
                    # every substitution entry has been consumed
                    break
    print('OK')
    return structure
Esempio n. 10
0
 def get_number_of_water_molecules(self, chain_id):
     '''
         Input:
             self: Use Biopython.PDB structure which has been stored in an object variable
             chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                     depends on the specific protein and the resulting structure)
         Return:
             Return the number of water molecules of a given chain (chain_id)
             in a Biopython.PDB structure as an integer.

         A residue counts as water when the hetero flag (first field of
         its id) is 'W'.  Checking for a one-letter code of 'X' instead
         would also count ligands and other non-standard residues.
     '''
     return sum(1 for residue in self.structure[chain_id]
                if residue.get_id()[0] == 'W')
Esempio n. 11
0
def get_dihedral(residue_list):
    '''
    Compute the phi and psi dihedrals of the middle residue of a triplet.

    residue_list - []Bio.PDB.Residue - list of 3 *hopefully* continuous residues

    Returns ([phi, psi], res_name) where res_name is the SeqUtils.seq1
    code of the residue at index 1.

    Raises BackboneError when neighbouring residue numbers do not differ
    by exactly 1, when is_aa rejects the middle residue, or when
    check_dict reports False for any of the collected atom tables.
    '''
    # neighbouring residues must carry consecutive sequence numbers
    for previous, current in zip(residue_list[:-1], residue_list[1:]):
        gap = current.get_id()[1] - previous.get_id()[1]
        if gap != 1:
            raise BackboneError("Discontinuous residues", current.get_id()[1])

    # atoms needed from each residue; False marks "not found yet"
    atoms = (
        {"C": False},
        {"N": False, "CA": False, "C": False},
        {"N": False},
    )

    for i, residue in enumerate(residue_list):
        if i == 1:
            res_name = SeqUtils.seq1(residue.get_resname())
            if not is_aa(res_name):
                raise BackboneError("Not a valid amino acid", residue.get_id()[1])

        for atom in residue.get_unpacked_list():
            if atom.name in atoms[i]:
                atoms[i][atom.name] = atom.get_vector()

    if False in map(check_dict, atoms):
        # reports the number of the last residue visited above
        raise BackboneError("Missing backbone atoms", residue.get_id()[1])

    prev_c = atoms[0]["C"]
    next_n = atoms[2]["N"]
    mid = atoms[1]
    phi = PDB.calc_dihedral(prev_c, mid["N"], mid["CA"], mid["C"])
    psi = PDB.calc_dihedral(mid["N"], mid["CA"], mid["C"], next_n)

    return ([phi, psi], res_name)
Esempio n. 12
0
 def get_sequence(self, chain_id):
     '''
         Input:
             self: object whose ``structure`` attribute is indexable by
                   chain id (a Biopython.PDB structure/model is expected)
             chain_id  : String (usually in ['A','B', 'C' ...])
         Return:
             The one-letter amino acid sequence of the chain as a string.
             Each residue name is converted with SeqUtils.seq1 and
             residues that convert to "X" are left out.
     '''
     letters = (SeqUtils.seq1(residue.get_resname())
                for residue in self.structure[chain_id])
     return ''.join(letter for letter in letters if letter != "X")
Esempio n. 13
0
def get_codon_usage(cuspp):
    """
    Build a codon usage table from a cusp output file.

    The file is read as tab-separated with the header on row 5.  The
    column labels are joined and split on single spaces; the last piece
    names the value column and the remaining pieces name the index levels.
    The result is re-indexed by 'Codon' and gains an 'amino acid' column
    holding SeqUtils.seq1 of each '#AA' entry.

    :param cuspp: path to cusp generated file
    :returns: codon usage table (pandas dataframe)
    """
    table = pd.read_csv(cuspp, sep='\t', header=5)

    header_fields = ''.join(table.columns.tolist()).split(' ')
    table.columns = [header_fields[-1]]
    table.index.names = header_fields[:-1]

    table = table.reset_index()
    table = table.set_index('Codon')
    table['amino acid'] = list(map(SeqUtils.seq1, table['#AA']))
    return table
 def get_sequence(self, chain_id):
     """
         Input:
             self: object whose ``structure`` attribute offers get_chains()
                   (a Biopython.PDB structure is expected)
             chain_id  : String (usually in ['A','B', 'C' ...])
         Return:
             The one-letter amino acid sequence of the first chain whose
             id equals chain_id, with every "X" removed, or None when no
             chain has that id.
     """
     for chain in self.structure.get_chains():
         if chain.id != chain_id:
             continue
         letters = [SeqUtils.seq1(residue.resname)
                    for residue in chain.get_list()]
         return "".join(letters).replace("X", "")
     return None
def Chain_to_SeqRecord(chain):
    ''' Build a SeqRecord from a Chain entity.

        chain: a Bio.PDB.Chain object

        Only the residues yielded by get_nonhet_residues(chain) are used
        (presumably those with a blank hetero flag -- see that helper).

        Returns a Bio.SeqRecord object whose sequence holds the one-letter
        residue codes, whose id is chain.id, and whose
        letter_annotations['resnum'] lists res.id[1] for each residue.

    '''
    pairs = [(SeqUtils.seq1(res.get_resname()), res.id[1])
             for res in get_nonhet_residues(chain)]

    aas = ''.join(letter for letter, _ in pairs)
    resns = [resnum for _, resnum in pairs]

    return SeqRecord.SeqRecord(Seq.Seq(aas), id=chain.id,
                               letter_annotations={"resnum": resns})
Esempio n. 16
0
def Chain_to_SeqRecord(chain):
    ''' Convert a Chain entity into a SeqRecord.

        chain: a Bio.PDB.Chain object

        Residues come from get_nonhet_residues(chain) (presumably the ones
        with a blank hetero flag -- see that helper).

        Returns a Bio.SeqRecord object: the sequence is the concatenation
        of SeqUtils.seq1(resname) for each residue, the id is chain.id and
        letter_annotations['resnum'] holds the matching res.id[1] values.

    '''
    letters = []
    resnums = []
    for residue in get_nonhet_residues(chain):
        letters.append(SeqUtils.seq1(residue.get_resname()))
        resnums.append(residue.id[1])

    sequence = Seq.Seq(''.join(letters))
    record = SeqRecord.SeqRecord(sequence,
                                 id=chain.id,
                                 letter_annotations={"resnum": resnums})
    return record
Esempio n. 17
0
 def parse_change(self, change):
     """Split a ``ref/alt -> ...`` change string into one-letter codes.

     The text before the first "->" is stripped and split on "/"; it must
     yield exactly two parts, otherwise the unpacking raises ValueError.
     Both parts are converted with SeqUtils.seq1.

     Returns the tuple (ref, alt).
     """
     before_arrow = change.split("->")[0].strip()
     ref_three, alt_three = before_arrow.split("/")
     ref_one = SeqUtils.seq1(ref_three)
     alt_one = SeqUtils.seq1(alt_three)
     return (ref_one, alt_one)
Esempio n. 18
0
        if verbose: print '\n', linia
        asequence = sequence_lines[i][:f_length].replace('.', 'n')
        bsequence = sequence[start - 1:end]
        if strand == "-":
            bsequence = str(Seq(bsequence).reverse_complement())
        mfe = RNA.energy_of_structure(bsequence, f_struct, 0)
        if verbose:
            print f_struct, mfe
            print asequence
            print bsequence
        score = str(-1 * mfe)

        #	score = elementy_linii[3] #this was actually wrong - it is relative anticodon position!

        gname = elementy_linii[1].split("-")[1]
        label = SeqUtils.seq1(gname)
        anticodon_position = int(elementy_linii[3]) - 1
        #        anticodon = elementy_linii[4]
        if True:  ###gname in ['Ser','Leu','Met']:
            #          print elementy_linii[4]
            if gname in ['Ser', 'Arg', 'Gly'
                         ] and elementy_linii[4][2:4] == 'ct':
                label = label + '2'
            if ttable_id == '4' and gname == 'Arg' and elementy_linii[4][
                    2:4] == 'cg':
                label = label + '1'
            if ttable_id == '13' and gname == 'Gly' and elementy_linii[4][
                    2:4] == 'cc':
                label = label + '1'
            if gname == 'Ser' and elementy_linii[4][2:4] == 'ga':
                label = label + '1'
Esempio n. 19
0
    def load(self):
        '''
        Parse the CSV file at ``self.csv_db_path`` into ``self._df``.

        Sample of the relevant columns:

            GeneID    Drug    GeneName    NucleotidePosition    Polymorphism    EstimatedCodonPosition    ReportedCodonPosition    AminoAcid
        0    Rv3795    EMB    embB        1121-1122            GGC/GTG            374                        374                    Gly/Val
        1    Rv3795    EMB    embB        1123-1124            CCG/GCG            375                        375                    Pro/Ala

        The first two lines of the file are skipped.  Every other line is
        split on "," (quoted fields containing commas are not handled) and
        must supply a value for each name in ``cols``; shorter lines are
        counted as errors instead of aborting the load.

        After the raw columns are loaded the derived columns ``codon``,
        ``change``, ``ref``, ``mutation``, ``mut_type``, ``rv``, ``gene``,
        ``nu_pos``, ``nu_ref``, ``nu_alt``, ``raw`` and ``rna`` are added.
        '''
        cols = ['ID', 'GeneID', 'SeqNo', 'Drug', 'GeneName', 'ApprovalLevel', 'AddedByUser', 'PrimaryReference',
                'ReviewReferece',
                'SecondaryReference', 'NucleotidePosition', 'Polymorphism', 'EstimatedCodonPosition',
                'ReportedCodonPosition',
                'AminoAcid', 'Note', 'TimePeriod', 'StudyPopulation', 'Country', 'MolecularDetectionMethod',
                'GeneCoverage',
                'ResistancePattern', 'MIC', 'SusceptibilityTestingMethod', 'RTotalIsolates', 'RSIsolatesWMutation',
                'AdditionalMutations',
                'HighQuality', 'PMID', 'OverallSequentialNumber', 'NoHC', 'Temp1', 'Temp2', 'Temp3', 'Temp4', 'Temp5',
                'MutationType']
        # a line needs one field per column (and never fewer than 14)
        min_fields = max(14, len(cols))
        markers = ("/", "ins", "del")

        rows = []
        errors = []
        rvs = []
        with open(self.csv_db_path, "r") as handle:
            lines = handle.readlines()
        _log.debug("Total lines count: " + str(len(lines)))

        for line in lines[2:]:
            split = line.split(",")
            if len(split) < min_fields:
                errors.append(line)
                continue

            dict_snp = {"line": line}
            dict_snp.update(zip(cols, split))

            # report rows that describe neither a substitution nor an indel
            if (not any(m in dict_snp["Polymorphism"] for m in markers)
                    and not any(m in dict_snp["AminoAcid"] for m in markers)):
                print(dict_snp["ID"])

            rows.append(dict_snp)
            rvs.append(dict_snp["GeneID"])

        # build the frame once instead of appending row by row
        self._df = pd.DataFrame(rows)

        # each derived column is computed from rows that already contain
        # the columns added before it, so the frame is re-iterated each time
        self._df["codon"] = [self._codon_position(row) for _, row in self._df.iterrows()]
        self._df["change"] = [self._mutated_aa(row) for _, row in self._df.iterrows()]

        self._df["ref"] = [bpsutils.seq1(x[0]) if x else None for x in self._df["change"]]
        self._df["mutation"] = [bpsutils.seq1(x[1]) if x else None for x in self._df["change"]]
        self._df["mut_type"] = [self._mutation_type(row) for _, row in self._df.iterrows()]

        self._df["rv"] = [x.lower() for x in self._df["GeneID"]]
        self._df["gene"] = [x.lower() for x in self._df["GeneName"]]
        self._df["nu_pos"] = [self.nu_pos(x) for x in self._df["NucleotidePosition"]]
        self._df["nu_ref"] = [x.split("/")[0].strip() for x in self._df["Polymorphism"]]
        self._df["nu_alt"] = [self.nu_alt(x) for x in self._df["Polymorphism"]]
        # each row keeps its own source line (not the last line read)
        self._df["raw"] = self._df["line"]
        self._df["rna"] = [x is None for x in self._df["change"]]

        _log.info("SNPs loaded:" + str(len(self._df)))
        _log.info("Errors: " + str(len(errors)))
        _log.info("RV count: " + str(len(set(rvs))))
Esempio n. 20
0
    def load_in_sndg(self, organism="H37Rv"):
        """Store the loaded variants as "tbdream" features on proteins.

        Steps, all scoped to ``organism``:

        1. Existing "tbdream" features are pulled from every protein.
        2. One search parameter for "resistance" and one per entry of
           ``TBDream.drugs`` is reset to False on every protein and
           registered on the sequence collection when it is missing.
        3. Rows of ``self._df`` are grouped by ``rv``; for the first
           protein whose gene matches, one feature per row is appended
           and the protein's resistance search flags are set.

        Rows whose position cannot be determined are skipped with a
        warning.  An ``RTotalIsolates`` value that is not of the form
        ``"<resistant>/<total>"`` yields ``None`` for both ratio fields.

        Args:
            organism: organism / collection name to update.
        """
        from SNDG.BioMongo.Model.Protein import Protein
        from SNDG.BioMongo.Model.Feature import Feature, Location
        from SNDG.BioMongo.Model.SeqCollection import SeqCollection
        from SNDG.BioMongo.Model.SeqColDruggabilityParam import SeqColDruggabilityParamTypes, SeqColDruggabilityParam

        from bson.objectid import ObjectId

        search_params = [("resistance", "Associated with resistance", "variant-db",
                          SeqColDruggabilityParamTypes.value, ["true", "false"], "true", "equal", "avg")

                         ]
        search_params = search_params + [
            (x, "Associated with " + x + " resistance", "variant-db",
             SeqColDruggabilityParamTypes.value, ["true", "false"], "true", "equal", "avg")
            for x in TBDream.drugs
        ]

        Protein.objects(organism=organism).update(__raw__={"$pull": {"features": {"type": "tbdream"}}})
        collection = SeqCollection.objects(name=organism).get()
        for name, description, target, _type, options, defaultValue, defaultOperation, defaultGroupOperation in search_params:
            Protein.objects(organism=organism).update(__raw__={"$set": {"search." + name: False}})
            if not collection.has_druggability_param(name):
                dp = SeqColDruggabilityParam(name=name, description=description, target=target,
                                             type=_type, uploader="demo")
                dp.options = options
                dp.defaultValue = defaultValue
                dp.defaultOperation = defaultOperation
                dp.defaultGroupOperation = defaultGroupOperation
                collection.druggabilityParams.append(dp)
        collection.save()

        for rv, rows in self._df.groupby("rv"):
            prot = list(Protein.objects(organism=organism, gene__iexact=rv))
            if prot:
                # only the first matching protein is annotated
                prot = prot[0]
                for _, r in rows.iterrows():
                    mut = None
                    if r.change:
                        change = str(r.change[0]) + "/" + str(r.change[1])
                        mut = SeqUtils.seq1(r.change[1])
                    else:
                        change = r.AminoAcid
                    if math.isnan(r.codon):
                        # no codon position: fall back to the AminoAcid field
                        try:
                            pos = int(r.AminoAcid)
                        except (TypeError, ValueError):
                            _log.warning("couldnt find the variant position")
                            continue
                    else:
                        pos = int(r.codon)

                    try:
                        res, t = r.RTotalIsolates.strip().split("/")
                        r_div_total_coef = int(res) * 1.0 / int(t)
                        r_div_total = r.RTotalIsolates.strip()

                    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
                        # missing value, wrong shape, non-numeric or zero total
                        r_div_total = None
                        r_div_total_coef = None

                    quals = {
                        "drug": r.Drug,
                        "change": change,
                        "gene": r.GeneID,
                        "pattern": r.ResistancePattern,
                        "additional": r.AdditionalMutations,
                        "r_div_total": r_div_total,
                        "r_div_total_coef": r_div_total_coef,
                        "mic": r.MIC}
                    if mut:
                        quals["mut"] = mut
                    fvariant = Feature(_id=ObjectId(), location=Location(start=pos, end=pos), type="tbdream",
                                       identifier="TBDream id " + r.ID,
                                       qualifiers=quals)
                    prot.features.append(fvariant)
                    prot.search.resistance = True
                    prot.search[r.Drug] = True
                prot.save()