Example #1
0
def get_dataset(structure, label, length=None):
    '''Returns a dataset of continuous segments of protein sequence with the
    specified DSSP secondary structure code (E, H, C) of a minimum length.

    Parameters
    ----------
    structure : structure data
    label : char
       DSSP secondary structure label (E, H, C)
    length : int, optional
       minimum length of secondary structure segment; when omitted, the
       element extractor is constructed from the label alone

    Returns
    -------
    dataset
       dataset of continuous segments of protein sequence, with the columns
       "sequence" and "label"
    '''

    colNames = ["sequence", "label"]

    secondaryStructure = secondaryStructureExtractor.get_python_rdd(structure)

    # Compare against None by identity; only an omitted length selects the
    # single-argument constructor (a length of 0 is still passed through).
    if length is None:
        elementMapper = StructureToSecondaryStructureElements(label)
    else:
        elementMapper = StructureToSecondaryStructureElements(label, length)

    rows = secondaryStructure.flatMap(elementMapper)

    return pythonRDDToDataset.get_dataset(rows, colNames)
def get_dataset(structureRDD, length):
    '''Builds a dataset of fixed-length sequence segments, each annotated
    with the DSSP Q8 and Q3 code of the residue at the segment's center.

    Parameters
    ----------
    structureRDD : structure
    length : int
       segment length, must be an odd number

    Returns
    -------
    dataset
       dataset of segments with the columns "structureChainId", "sequence",
       "labelQ8" and "labelQ3"

    Raises
    ------
    Exception
        Segment length must be an odd number

    '''

    # Reject even lengths up front, before any extraction work is started.
    is_even = length % 2 == 0
    if is_even:
        raise Exception("Segment length must be an odd number %i" % length)

    column_names = ["structureChainId", "sequence", "labelQ8", "labelQ3"]

    secondary_structure = secondaryStructureExtractor.get_python_rdd(structureRDD)
    segment_rows = secondary_structure.flatMap(
        StructureToSecondaryStructureSegments(length))

    return pythonRDDToDataset.get_dataset(segment_rows, column_names)