def extractStep(self):
    """Build a set of PDB entries from the input list and record, for each
    PDB entry, which small molecules interact with it.

    Every input item carrying an ``_iteractsWithPDBId`` attribute is read as
    a ``;``-separated list of PDB ids. For each PDB id the item's chemical
    id is collected: ``_PDBChemId`` when the item has it, otherwise the
    item's own database id. One ``DatabaseID`` per distinct PDB id is
    written to the ``outputPDBs`` output, with the collected ids joined by
    ``" ; "``.
    """
    listIds = {}
    smallMolID = "none"
    for item in self.inputListID.get():
        if not hasattr(item, "_iteractsWithPDBId"):
            continue
        interacting = item._iteractsWithPDBId.get()
        if not interacting:
            # Nothing to split (None or empty string): the item contributes
            # no PDB entry.
            continue

        # The chemical id does not depend on the token, so resolve it once
        # per item.
        chemId = item.getDbId()
        if hasattr(item, "_PDBChemId"):
            chemId = item._PDBChemId.get()
            smallMolID = "pdbchem"

        for token in interacting.split(";"):
            pdbId = token.strip()
            if not pdbId:
                # A trailing or doubled ';' yields an empty token; skip it
                # so no PDB entry with an empty id is created.
                continue
            listIds.setdefault(pdbId, []).append(chemId)

    outputDatabaseID = SetOfDatabaseID().create(path=self._getPath(),
                                                suffix='PDBs')
    for pdbId, chemIds in listIds.items():
        pdb = DatabaseID()
        pdb.setDatabase("pdb")
        pdb.setDbId(pdbId)
        pdb._pdbId = pwobj.String(pdbId)
        pdb._PDBLink = pwobj.String(
            "https://www.rcsb.org/structure/%s" % pdbId)
        aux = " ; ".join(chemIds)
        # The attribute name depends on whether any input item provided a
        # PDB chemical id.
        if smallMolID == "none":
            pdb._interactsWithChemId = pwobj.String(aux)
        elif smallMolID == "pdbchem":
            pdb._interactsWithPDBChemId = pwobj.String(aux)
        outputDatabaseID.append(pdb)

    self._defineOutputs(outputPDBs=outputDatabaseID)
    self._defineSourceRelation(self.inputListID, outputDatabaseID)
def searchStep(self):
    """Annotate every input PDB entry with its UniProt accession.

    For each item of the input list a copy is created with ``_uniprotId``
    and ``_uniprotLink`` preset to ``"Not available"``. The PDB-to-UniProt
    alignment XML is downloaded (up to three attempts) into the extra path
    unless it is already there. The first ``alignObject`` whose
    ``dbSource`` is ``"UniProt"`` supplies the accession. The copies are
    registered as the ``outputUniprot`` output.
    """
    outputDatabaseID = SetOfDatabaseID().create(path=self._getPath())
    for item in self.inputListID.get():
        newItem = DatabaseID()
        newItem.copy(item)
        newItem._uniprotId = pwobj.String("Not available")
        newItem._uniprotLink = pwobj.String("Not available")

        pdbId = item._pdbId.get()
        print("Processing %s" % pdbId)

        # NOTE(review): this looks like the legacy RCSB REST/DAS endpoint;
        # confirm it is still served.
        urlId = ("https://www.rcsb.org/pdb/rest/das/pdb_uniprot_mapping/"
                 "alignment?query=%s" % pdbId)
        cacheName = pdbId
        if hasattr(item, "_chain"):
            chain = item._chain.get().upper()
            urlId += "." + chain
            # The query is chain specific, so the cached file must be too;
            # otherwise a second chain of the same PDB entry would reuse
            # the mapping downloaded for the first one.
            cacheName = "%s.%s" % (pdbId, chain)
        fnXml = self._getExtraPath("%s.xml" % cacheName)

        if not os.path.exists(fnXml):
            print("Fetching uniprot: %s" % urlId)
            for _ in range(3):
                try:
                    urllib.request.urlretrieve(urlId, fnXml)
                    break
                except Exception:
                    # Best effort: the library raises when the page is not
                    # found; retry and fall through to "Not available".
                    pass

        if os.path.exists(fnXml):
            try:
                tree = ET.parse(fnXml)
                uniprotId = None
                for child in tree.getroot().iter():
                    # .get() so that an alignObject lacking dbSource is
                    # skipped instead of aborting the whole parse.
                    if (child.tag.endswith("alignObject")
                            and child.attrib.get('dbSource') == "UniProt"):
                        uniprotId = child.attrib['dbAccessionId']
                        break
                if uniprotId:
                    newItem._uniprotId = pwobj.String(uniprotId)
                    newItem._uniprotLink = pwobj.String(
                        "https://www.uniprot.org/uniprot/%s" % uniprotId)
            except Exception:
                print("    Cannot parse the Uniprot XML: %s" % fnXml)

        outputDatabaseID.append(newItem)

    self._defineOutputs(outputUniprot=outputDatabaseID)
    self._defineSourceRelation(self.inputListID, outputDatabaseID)
def searchStep(self):
    """Download the FASTA sequence of every input UniProt entry.

    For each item of the input list a copy is created with
    ``_uniprotFile`` preset to ``"Not available"`` and
    ``_unitprotSeqLength`` preset to ``-1``. The FASTA file is downloaded
    (up to three attempts) into the extra path unless it is already there.
    When the file exists, its path and sequence length are stored on the
    copy. All distinct FASTA files are concatenated into
    ``sequences.fasta``. The copies are registered as ``outputUniprot`` and
    the concatenated file as ``outputSequence``.
    """
    outputDatabaseID = SetOfDatabaseID().create(path=self._getPath())
    fnList = []
    for item in self.inputListID.get():
        newItem = DatabaseID()
        newItem.copy(item)
        newItem._uniprotFile = pwobj.String("Not available")
        newItem._unitprotSeqLength = pwobj.Integer(-1)

        uniprotId = item._uniprotId.get()
        print("Processing %s" % uniprotId)

        urlId = "https://www.uniprot.org/uniprot/%s.fasta" % uniprotId
        fnFasta = self._getExtraPath("%s.fasta" % uniprotId)
        if not os.path.exists(fnFasta):
            print("Fetching uniprot: %s" % urlId)
            for _ in range(3):
                try:
                    urllib.request.urlretrieve(urlId, fnFasta)
                    break
                except Exception:
                    # Best effort: the library raises when the page is not
                    # found; retry and fall through to "Not available".
                    pass

        if os.path.exists(fnFasta):
            # Register the file whether it was just downloaded or was
            # already cached, so cached sequences are not left out of
            # sequences.fasta.
            if fnFasta not in fnList:
                fnList.append(fnFasta)
            newItem._uniprotFile = pwobj.String(fnFasta)
            newItem._unitprotSeqLength = pwobj.Integer(
                sequenceLength(fnFasta))

        outputDatabaseID.append(newItem)

    # Concatenate every distinct FASTA file, separated by blank lines.
    fnAll = self._getPath("sequences.fasta")
    with open(fnAll, 'w') as outfile:
        for fname in fnList:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
                outfile.write('\n\n')

    seqFile = ProteinSequenceFile()
    seqFile.setFileName(fnAll)

    self._defineOutputs(outputUniprot=outputDatabaseID)
    self._defineSourceRelation(self.inputListID, outputDatabaseID)
    self._defineOutputs(outputSequence=seqFile)
    self._defineSourceRelation(self.inputListID, seqFile)
def constructOutput(self, fnTxt):
    """Parse a results text file into a set of PDB database ids.

    The output suffix is the part of the file name between the first ``-``
    and the following ``.`` (empty when the name contains no ``-``). Blank
    lines and lines starting with ``#`` are ignored, and parsing stops at
    the line starting with ``# Structural equivalences``. Every remaining
    line is split on whitespace: token 1 is the entry id (``pdb`` or
    ``pdb-chain``), tokens 2-6 are stored as Z-score, RMSD, superposition
    length, sequence length and sequence identity, and the rest is the
    description.

    :param fnTxt: path of the results text file to parse.
    """
    fnResults = os.path.basename(fnTxt)
    nameParts = fnResults.split('-')
    subset = nameParts[1].split('.')[0] if len(nameParts) > 1 else ""

    outputSet = SetOfDatabaseID.create(path=self._getPath(), suffix=subset)

    # Context manager so the file handle is closed even if a line fails to
    # parse.
    with open(fnTxt, "r") as fhResults:
        for line in fhResults:
            line = line.strip()
            if line == "":
                continue
            if line.startswith("# Structural equivalences"):
                # Hits are listed before this section; nothing else to read.
                break
            if line.startswith("#"):
                continue

            tokens = line.split()
            idParts = tokens[1].split('-')

            pdbId = DatabaseID()
            pdbId.setDatabase("pdb")
            pdbId.setDbId(tokens[1])
            pdbId._pdbId = pwobj.String(idParts[0])
            if len(idParts) > 1:
                pdbId._chain = pwobj.String(idParts[1])
            pdbId._PDBLink = pwobj.String(
                "https://www.rcsb.org/structure/%s" % idParts[0])
            pdbId._DaliZscore = pwobj.Float(float(tokens[2]))
            pdbId._DaliRMSD = pwobj.Float(float(tokens[3]))
            pdbId._DaliSuperpositionLength = pwobj.Integer(int(tokens[4]))
            pdbId._DaliSeqLength = pwobj.Integer(int(tokens[5]))
            pdbId._DaliSeqIdentity = pwobj.Float(float(tokens[6]))
            pdbId._DaliDescription = pwobj.String(" ".join(tokens[7:]))
            outputSet.append(pdbId)

    outputDict = {'outputDatabaseIds%s' % subset: outputSet}
    self.protocol._defineOutputs(**outputDict)
    self.protocol._defineSourceRelation(self.protocol.inputStructure,
                                        outputSet)
def operateStep(self):
    """Apply the selected operation to the input list(s) of database ids.

    ``self.operation`` selects the behaviour:

    * 0 - Unique: keep the first entry seen for every database id.
    * 1 - Union: merge all the lists in ``multipleInputListID``.
    * 2 - Intersection: keep entries whose id is also in ``inputListID2``.
    * 3 - Difference: keep entries whose id is not in ``inputListID2``.
    * 4 - Change ID: take the database and id from the ``newDb`` and
      ``newDbId`` columns.
    * 5 - Keep columns: copy only the columns named in ``keepColumns``
      (plus ``database`` and ``dbId``).
    * 6 - Filter: keep entries whose ``filterColumn`` value satisfies
      ``filterOp`` against ``filterValue``.

    Entries are collected in a dictionary keyed by database id and written
    to the ``output`` set.
    """
    operation = self.operation.get()
    outputDict = {}

    if operation == 1:  # Union
        for database in self.multipleInputListID:
            for databaseEntry in database.get():
                dbId = databaseEntry.getDbId()
                add = True
                if self.removeDuplicates.get():
                    add = dbId not in outputDict
                if add:
                    dbEntry = DatabaseID()
                    dbEntry.copy(databaseEntry, copyId=False)
                    outputDict[dbId] = dbEntry

    elif operation in (0, 2, 3):  # Unique, Intersection, Difference
        # Ids of the second list; only needed for intersection/difference.
        otherIds = set()
        if operation in (2, 3):
            otherIds = {entry.getDbId()
                        for entry in self.inputListID2.get()}

        for databaseEntry in self.inputListID.get():
            dbId = databaseEntry.getDbId()
            if operation == 0:  # Unique
                add = dbId not in outputDict
            else:
                if operation == 2:  # Intersection
                    add = dbId in otherIds
                else:  # Difference
                    add = dbId not in otherIds
                if self.removeDuplicates.get():
                    add = add and dbId not in outputDict
            if add:
                dbEntry = DatabaseID()
                dbEntry.copy(databaseEntry)
                outputDict[dbId] = dbEntry

    elif operation == 4:  # Change ID
        # If newDb names an existing column, the database is read from that
        # column; otherwise newDb is used literally as the database label.
        newLabel = True
        for name, _ in self.inputListID.get().getFirstItem().getAttributes():
            if self.newDb.get() == name:
                newLabel = False
                break

        for databaseEntry in self.inputListID.get():
            dbEntry = DatabaseID()
            dbEntry.copy(databaseEntry)
            if hasattr(dbEntry, self.newDbId.get()):
                if newLabel:
                    dbEntry.setDatabase(self.newDb.get())
                else:
                    dbEntry.setDatabase(
                        dbEntry.getAttributeValue(self.newDb.get()))
                dbEntry.setDbId(
                    dbEntry.getAttributeValue(self.newDbId.get()))
            add = True
            if self.removeDuplicates.get():
                add = dbEntry.getDbId() not in outputDict
            if add:
                outputDict[dbEntry.getDbId()] = dbEntry

    elif operation == 5:  # Keep columns
        keepList = [x.strip() for x in self.keepColumns.get().split()]
        keepList.append("database")
        keepList.append("dbId")

        # Every attribute of the first item that is not kept is ignored
        # when copying.
        ignoreList = [
            name
            for name, _ in
            self.inputListID.get().getFirstItem().getAttributes()
            if name not in keepList
        ]

        for databaseEntry in self.inputListID.get():
            dbEntry = DatabaseID()
            dbEntry.copy(databaseEntry, ignoreAttrs=ignoreList)
            add = True
            if self.removeDuplicates.get():
                add = dbEntry.getDbId() not in outputDict
            if add:
                outputDict[dbEntry.getDbId()] = dbEntry

    elif operation == 6:  # Filter columns
        # Coerce the reference value to the type found in the first item so
        # that numeric columns are compared numerically.
        referenceValue = self.filterValue.get()
        value = self.inputListID.get().getFirstItem().getAttributeValue(
            self.filterColumn.get())
        if isinstance(value, float):
            referenceValue = float(referenceValue)
        elif isinstance(value, int):
            referenceValue = int(referenceValue)

        for databaseEntry in self.inputListID.get():
            dbEntry = DatabaseID()
            dbEntry.copy(databaseEntry)
            add = False

            value = dbEntry.getAttributeValue(self.filterColumn.get())
            if isinstance(value, Float):
                value = float(value)
            elif isinstance(value, Integer):
                value = int(value)

            filterOp = self.filterOp.get()
            if filterOp == 0:  # ==
                add = value == referenceValue
            elif filterOp == 1:  # >
                add = value > referenceValue
            elif filterOp == 2:  # >=
                # Must include equality; a strict '>' here would drop
                # entries equal to the reference.
                add = value >= referenceValue
            elif filterOp == 3:  # <
                add = value < referenceValue
            elif filterOp == 4:  # <=
                add = value <= referenceValue
            elif filterOp == 5:  # !=
                add = value != referenceValue
            elif filterOp == 6:  # startswith
                add = value.startswith(referenceValue)
            elif filterOp == 7:  # endswith
                add = value.endswith(referenceValue)
            elif filterOp == 8:  # contains
                add = referenceValue in value
            elif filterOp == 9:  # does not startswith
                add = not value.startswith(referenceValue)
            elif filterOp == 10:  # does not endswith
                add = not value.endswith(referenceValue)
            elif filterOp == 11:  # does not contains
                add = referenceValue not in value

            if self.removeDuplicates.get():
                add = add and dbEntry.getDbId() not in outputDict
            if add:
                outputDict[dbEntry.getDbId()] = dbEntry

    outputDatabaseID = SetOfDatabaseID().create(path=self._getPath())
    for dbEntry in outputDict.values():
        outputDatabaseID.append(dbEntry)

    self._defineOutputs(output=outputDatabaseID)
    self._defineSourceRelation(self.inputListID, outputDatabaseID)