def get_dataset(structure, label, length=None):
    '''Returns a dataset of continuous segments of protein sequence with the
    specified DSSP secondary structure code (E, H, C) of a minimum length.

    Parameters
    ----------
    structure : structure data
    label : char
        DSSP secondary structure label (E, H, C)
    length : int, optional
        minimum length of secondary structure segment; if None, segments of
        any length are returned

    Returns
    -------
    dataset
        dataset of continuous segments of protein sequence
    '''
    # Construct the extractor once; only the optional minimum-length argument
    # differs between the two cases. (Use "is None" for the sentinel check.)
    if length is None:
        extractor = StructureToSecondaryStructureElements(label)
    else:
        extractor = StructureToSecondaryStructureElements(label, length)

    rows = secondaryStructureExtractor.get_python_rdd(structure) \
        .flatMap(extractor)
    colNames = ["sequence", "label"]
    return pythonRDDToDataset.get_dataset(rows, colNames)
def get_dataset(structureRDD, length):
    '''Returns a dataset of sequence segments of the specified length and the
    DSSP Q8 and Q3 code of the center residue in a segment.

    Parameters
    ----------
    structureRDD : structure
    length : int
        segment length, must be an odd number

    Returns
    -------
    dataset
        dataset of segments

    Raises
    ------
    Exception
        Segment length must be an odd number
    '''
    # A unique center residue exists only for odd-length segments.
    if length % 2 == 0:
        raise Exception("Segment length must be an odd number %i" % length)

    segmenter = StructureToSecondaryStructureSegments(length)
    rows = secondaryStructureExtractor.get_python_rdd(structureRDD).flatMap(segmenter)
    columns = ["structureChainId", "sequence", "labelQ8", "labelQ3"]
    return pythonRDDToDataset.get_dataset(rows, columns)
def get_dataset(structure):
    '''Returns a dataset of phi/psi torsion angles per residue.

    Parameters
    ----------
    structure : mmtfStructure
        single protein chain

    Returns
    -------
    dataset
        dataset with pdbId, chain, residue number, residue name, and the
        phi/psi torsion angles for each residue
    '''
    # flatMap: _get_phi_psi yields one row per residue for each structure.
    rows = structure.flatMap(lambda x: _get_phi_psi(x))

    # convert to dataset (debug prints and dead commented-out code removed)
    colNames = ["pdbId", "chain", "resi", "resn", "phi", "psi"]
    return pythonRDDToDataset.get_dataset(rows, colNames)
def get_dataset():
    '''Gets JPred 4/JNet (v.2.3.1) secondary structure dataset.

    Downloads the JPred training/blind archive, parses the per-entry .dssp
    and .fasta members, and returns one row per SCOP id with its sequence,
    DSSP secondary structure string ('-' mapped to 'C'), and whether the
    entry came from the training set.

    Returns
    -------
    dataset
        secondaryStructure dataset with columns
        [scopID, sequence, secondaryStructure, trained]
    '''
    URL = "http://www.compbio.dundee.ac.uk/jpred/downloads/retr231.tar.gz"
    # Stream the gzipped tar directly from the HTTP response.
    instream = urllib.request.urlopen(URL)
    secondaryStructures, sequences, trained = {}, {}, {}
    scopIds = set()
    res = []
    with tarfile.open(fileobj=instream, mode="r:gz") as tf:
        for entry in tf:
            if entry.isdir():
                continue
            br = tf.extractfile(entry)
            if ".dssp" in entry.name:
                # NOTE(review): str(bytes) produces "b'...'" — the slices
                # strip the b'>'/b' prefix and the trailing \n' artifacts.
                # Fragile; assumes first line is the id, second the data.
                scopID = str(br.readline())[3:-3]  # Remove newline and byte
                secondaryStructure = str(
                    br.readline())[2:-3]  # Remove newline and byte
                # JPred uses '-' for coil; normalize to DSSP 'C'.
                secondaryStructure = secondaryStructure.replace('-', 'C')
                secondaryStructures[scopID] = secondaryStructure
            if ".fasta" in entry.name:
                scopID = str(br.readline())[3:-3]  # Remove newline and byte
                sequence = str(br.readline())[2:-3]  # Remove newline and byte
                scopIds.add(scopID)
                sequences[scopID] = sequence
            # Directory within the archive tells us the split membership.
            # NOTE(review): relies on scopID set by a preceding branch above.
            if "training/" in entry.name:
                trained[scopID] = "true"
            elif "blind/" in entry.name:
                trained[scopID] = "false"
    # Assemble one Row per SCOP id; assumes every id seen in a .fasta also
    # appeared in a .dssp member (KeyError otherwise).
    for scopId in scopIds:
        row = Row(scopId, sequences[scopId], secondaryStructures[scopId],
                  trained[scopId])
        res.append(row)
    sc = SparkContext.getOrCreate()
    data = sc.parallelize(res)
    colNames = ["scopID", "sequence", "secondaryStructure", "trained"]
    return pythonRDDToDataset.get_dataset(data, colNames)
def get_dataset(self, structures):
    '''Returns a dataset of residues that interact with the specified group
    within a specified cutoff distance.

    Parameters
    ----------
    structures : pythonRdd
        a set of PDB structures

    Returns
    -------
    dataset
        dataset with interacting residue and atom information
    '''
    # One row per residue/atom pair within self.distance of self.groupName.
    interactions = StructureToAllInteractions(self.groupName, self.distance)
    rows = structures.flatMap(interactions)

    # convert to a dataset
    columns = ["structureId",
               "residue1", "atom1", "element1", "index1",
               "residue2", "atom2", "element2", "index2",
               "distance"]
    return pythonRDDToDataset.get_dataset(rows, columns)
def get_dataset(structures):
    '''Returns a dataset of polymer sequences contained in PDB entries using
    the full sequence used in the experiment (i.e., the "SEQRES" record in
    PDB files).

    Parameters
    ----------
    structures : pythonRDD
        a set of PDB structures

    Returns
    -------
    dataset
        dataset with structureChainId and sequence columns
    '''
    sequences = structures.flatMap(StructureToPolymerSequences())
    # Each element is a (chainId, sequence) pair; wrap it in a Row.
    rows = sequences.map(lambda pair: Row(pair[0], pair[1]))
    return pythonRDDToDataset.get_dataset(rows, ["structureChainId", "sequence"])
def get_dataset(structure, parameters=None, classifier=None, options=None):
    '''Returns a dataset with the total solvent-accessible surface area per
    chain, computed by _get_free_sasa.

    Parameters
    ----------
    structure : mmtfStructure
        single protein chain
    parameters : optional
        forwarded to _get_free_sasa
    classifier : optional
        forwarded to _get_free_sasa
    options : optional
        forwarded to _get_free_sasa

    Returns
    -------
    dataset
        dataset with structureChainId and totalArea columns
    '''
    # One result row per input structure (map, not flatMap).
    sasa = structure.map(
        lambda x: _get_free_sasa(x, parameters, classifier, options))

    # convert to dataset
    return pythonRDDToDataset.get_dataset(sasa, ["structureChainId", "totalArea"])
def get_dataset(structure):
    '''Returns a dataset with protein sequence, secondary structure fractions
    (alpha, beta, coil), and DSSP Q8/Q3 codes per chain.

    Parameters
    ----------
    structure : mmtfStructure
        single protein chain

    Returns
    -------
    dataset
        dataset with sequence and secondary structure assignments
    '''
    # One result row per input chain (map, not flatMap).
    fractions = structure.map(lambda x: _get_sec_struct_fractions(x))

    # convert to dataset
    columns = ["structureChainId", "sequence",
               "alpha", "beta", "coil",
               "dsspQ8Code", "dsspQ3Code"]
    return pythonRDDToDataset.get_dataset(fractions, columns)