def __init__(self, fnames, keepExons=False, labels=None, verbose=False):
        """
        Driver function to actually parse files. The steps are as follows:

        1) skip to the first non-comment line
        2) Infer the type from that
        3) Call a type-specific processing function accordingly
          * These call the underlying C code for storage
          * These handle chromosome name conversions (python-level)

        Required inputs are as follows:

        fnames:       A list of (possibly compressed with gzip or bzip2) GTF or BED files.
                      A single non-list value is wrapped into a one-element list.

        Optional input is:

        keepExons:    For BED12 files, exons are ignored by default.
        labels:       Override the feature labels supplied in the file(s).
                      Labels are looked up by the position of the file in
                      `fnames` and are only used for non-GTF files; without
                      labels the basename of the file is used instead.
                      Note that this might instead be replaced later in the .features attribute.
        verbose:      Whether to print warnings (default: False)

        Raises:

        RuntimeError: If no regions were stored in the tree, or if no feature
                      labels were collected after parsing all files.
        """
        self.fname = []
        self.filename = ""
        self.chroms = []
        self.features = []
        self.tree = tree.initTree()
        self.keepExons = keepExons
        self.verbose = verbose

        if not isinstance(fnames, list):
            fnames = [fnames]

        # Load the files
        for labelIdx, fname in enumerate(fnames):
            self.filename = fname
            fp = openPossiblyCompressed(fname)
            # try/finally guarantees the handle is closed on every path,
            # including the early `continue` below and parser exceptions.
            try:
                line, labelColumn = self.firstNonComment(fp)
                if line is None:
                    # This will only ever happen if a file is empty or just has a header/comment
                    continue
                line = line.strip()

                ftype = self.inferType(fp, line, labelColumn)

                if ftype != 'GTF' and labels is not None:
                    # labels are positional: one entry per input file
                    assert len(labels) > labelIdx, \
                        "Fewer labels were given than input files"
                    bname = labels[labelIdx]
                else:
                    bname = basename(fname)

                if ftype == 'GTF':
                    self.parseGTF(fp, line)
                elif ftype == 'BED3':
                    self.parseBED(fp, line, 3, feature=bname, labelColumn=labelColumn)
                elif ftype == 'BED6':
                    self.parseBED(fp, line, 6, feature=bname, labelColumn=labelColumn)
                else:
                    # Any other inferred type is handled as BED12
                    self.parseBED(fp, line, 12, feature=bname, labelColumn=labelColumn)
            finally:
                fp.close()

        # Sanity check
        if self.tree.countEntries() == 0:
            raise RuntimeError("None of the input BED/GTF files had valid regions")

        if len(self.features) == 0:
            raise RuntimeError("There were no valid feature labels!")

        # vine -> tree
        self.tree.finish()
Beispiel #2
0
    def __init__(self, fnames, keepExons=False, labels=None, verbose=False):
        """
        Driver function to actually parse files. The steps are as follows:

        1) skip to the first non-comment line
        2) Infer the type from that
        3) Call a type-specific processing function accordingly
          * These call the underlying C code for storage
          * These handle chromosome name conversions (python-level)

        Required inputs are as follows:

        fnames:       A list of (possibly compressed with gzip or bzip2) GTF or BED files.
                      A single non-list value is wrapped into a one-element list.

        Optional input is:

        keepExons:    For BED12 files, exons are ignored by default.
        labels:       Override the feature labels supplied in the file(s).
                      Labels are looked up by the position of the file in
                      `fnames` and are only used for non-GTF files; without
                      labels the basename of the file is used instead.
                      Note that this might instead be replaced later in the .features attribute.
        verbose:      Whether to print warnings (default: False)

        Raises:

        RuntimeError: If no regions were stored in the tree, or if no feature
                      labels were collected after parsing all files.
        """
        self.fname = []
        self.filename = ""
        self.chroms = []
        self.features = []
        self.tree = tree.initTree()
        self.keepExons = keepExons
        self.verbose = verbose

        if not isinstance(fnames, list):
            fnames = [fnames]

        # Load the files
        for labelIdx, fname in enumerate(fnames):
            self.filename = fname
            fp = openPossiblyCompressed(fname)
            # try/finally guarantees the handle is closed on every path,
            # including the early `continue` below and parser exceptions.
            try:
                line, labelColumn = self.firstNonComment(fp)
                if line is None:
                    # This will only ever happen if a file is empty or just has a header/comment
                    continue
                line = line.strip()

                ftype = self.inferType(fp, line, labelColumn)

                if ftype != 'GTF' and labels is not None:
                    # labels are positional: one entry per input file
                    assert len(labels) > labelIdx, \
                        "Fewer labels were given than input files"
                    bname = labels[labelIdx]
                else:
                    bname = basename(fname)

                if ftype == 'GTF':
                    self.parseGTF(fp, line)
                elif ftype == 'BED3':
                    self.parseBED(fp, line, 3, feature=bname, labelColumn=labelColumn)
                elif ftype == 'BED6':
                    self.parseBED(fp, line, 6, feature=bname, labelColumn=labelColumn)
                else:
                    # Any other inferred type is handled as BED12
                    self.parseBED(fp, line, 12, feature=bname, labelColumn=labelColumn)
            finally:
                fp.close()

        # Sanity check
        if self.tree.countEntries() == 0:
            raise RuntimeError("None of the input BED/GTF files had valid regions")

        if len(self.features) == 0:
            raise RuntimeError("There were no valid feature labels!")

        # vine -> tree
        self.tree.finish()
    def __init__(self, fnames, exonID="exon", transcriptID="transcript", keepExons=False, labels=[], transcript_id_designator="transcript_id", defaultGroup=None, verbose=False):
        """
        Driver function to actually parse files. The steps are as follows:

        1) skip to the first non-comment line
        2) Infer the type from that
        3) Call a type-specific processing function accordingly
          * These call the underlying C code for storage
          * These handle chromosome name conversions (python-level)
          * These handle labels (python-level, with a C-level numeric attribute)
        4) Sanity checking (do the number of labels make sense?)

        Required inputs are as follows:

        fnames:       A list of (possibly compressed with gzip or bzip2) GTF or BED files.
                      A single non-list value is wrapped into a one-element list.

        Optional input is:

        exonID:       For GTF files, the feature column (column 3) label for
                      exons, or whatever else should be stored as exons. The
                      default is 'exon', though one could use 'CDS' instead.
        transcriptID: As above, but for transcripts. The default is
                      'transcript'.
        keepExons:    For BED12 and GTF files, exons are ignored by default.
        labels:       A list of group labels. If non-empty, it replaces the
                      labels found while parsing and must have the same length.
        transcript_id_designator: For gtf files, this is the key used in a searching for the transcript ID.
                      If one sets transcriptID to 'gene', then
                      transcript_id_designator would need to be changed to
                      'gene_id' or 'gene_name' to extract the gene ID/name from
                      the attributes.
        defaultGroup: The default group name. If None, the file name is used.
        verbose:      Whether to produce warning messages (default: False)

        Raises:

        RuntimeError: If no regions were stored in the tree, or if the number
                      of supplied labels differs from the number found.
        """
        # NOTE: the `labels=[]` default is only read here, never mutated, and
        # is never stored on the instance (self.labels is only replaced when
        # `labels` is non-empty), so the shared default list is harmless.
        self.fname = []
        self.filename = ""
        self.chroms = []
        self.exons = []
        self.labels = []
        self.transcriptIDduplicated = []
        self.tree = tree.initTree()
        self.labelIdx = 0
        self.transcript_id_designator = transcript_id_designator
        self.exonID = exonID
        self.transcriptID = transcriptID
        self.keepExons = keepExons
        self.defaultGroup = defaultGroup
        self.verbose = verbose

        # NOTE(review): this attribute is only assigned when labels were
        # supplied; presumably a class-level default exists - confirm.
        if labels != []:
            self.already_input_labels = True

        if not isinstance(fnames, list):
            fnames = [fnames]

        # Load the files
        for fname in fnames:
            self.filename = fname
            fp = openPossiblyCompressed(fname)
            # try/finally guarantees the handle is closed on every path,
            # including the early `continue` below and parser exceptions.
            try:
                line, labelColumn = self.firstNonComment(fp)
                if line is None:
                    # This will only ever happen if a file is empty or just has a header/comment
                    continue
                line = line.strip()

                ftype = self.inferType(fp, line, labelColumn)
                if ftype == 'GTF':
                    self.parseGTF(fp, line)
                elif ftype == 'BED3':
                    self.parseBED(fp, line, 3, labelColumn)
                elif ftype == 'BED6':
                    self.parseBED(fp, line, 6, labelColumn)
                else:
                    # Any other inferred type is handled as BED12
                    self.parseBED(fp, line, 12, labelColumn)
            finally:
                fp.close()

        # Sanity check
        if self.tree.countEntries() == 0:
            raise RuntimeError("None of the input BED/GTF files had valid regions")

        # Replace labels
        if len(labels) > 0:
            if len(labels) != len(self.labels):
                # Report the counts, as the message promises, not the lists
                raise RuntimeError("The number of labels found ({0}) does not match the number input ({1})!".format(len(self.labels), len(labels)))
            self.labels = labels

        # vine -> tree
        self.tree.finish()
Beispiel #4
0
    def __init__(self,
                 fnames,
                 exonID="exon",
                 transcriptID="transcript",
                 keepExons=False,
                 labels=[],
                 transcript_id_designator="transcript_id",
                 defaultGroup=None,
                 verbose=False):
        """
        Driver function to actually parse files. The steps are as follows:

        1) skip to the first non-comment line
        2) Infer the type from that
        3) Call a type-specific processing function accordingly
          * These call the underlying C code for storage
          * These handle chromosome name conversions (python-level)
          * These handle labels (python-level, with a C-level numeric attribute)
        4) Sanity checking (do the number of labels make sense?)

        Required inputs are as follows:

        fnames:       A list of (possibly compressed with gzip or bzip2) GTF or BED files.
                      A single non-list value is wrapped into a one-element list.

        Optional input is:

        exonID:       For GTF files, the feature column (column 3) label for
                      exons, or whatever else should be stored as exons. The
                      default is 'exon', though one could use 'CDS' instead.
        transcriptID: As above, but for transcripts. The default is
                      'transcript'.
        keepExons:    For BED12 and GTF files, exons are ignored by default.
        labels:       A list of group labels. If non-empty, it replaces the
                      labels found while parsing and must have the same length.
        transcript_id_designator: For gtf files, this is the key used in a searching for the transcript ID.
                      If one sets transcriptID to 'gene', then
                      transcript_id_designator would need to be changed to
                      'gene_id' or 'gene_name' to extract the gene ID/name from
                      the attributes.
        defaultGroup: The default group name. If None, the file name is used.
        verbose:      Whether to produce warning messages (default: False)

        Raises:

        RuntimeError: If no regions were stored in the tree, or if the number
                      of supplied labels differs from the number found.
        """
        # NOTE: the `labels=[]` default is only read here, never mutated, and
        # is never stored on the instance (self.labels is only replaced when
        # `labels` is non-empty), so the shared default list is harmless.
        self.fname = []
        self.filename = ""
        self.chroms = []
        self.exons = []
        self.labels = []
        self.transcriptIDduplicated = []
        self.tree = tree.initTree()
        self.labelIdx = 0
        self.transcript_id_designator = transcript_id_designator
        self.exonID = exonID
        self.transcriptID = transcriptID
        self.keepExons = keepExons
        self.defaultGroup = defaultGroup
        self.verbose = verbose

        # NOTE(review): this attribute is only assigned when labels were
        # supplied; presumably a class-level default exists - confirm.
        if labels != []:
            self.already_input_labels = True

        if not isinstance(fnames, list):
            fnames = [fnames]

        # Load the files
        for fname in fnames:
            self.filename = fname
            fp = openPossiblyCompressed(fname)
            # try/finally guarantees the handle is closed on every path,
            # including the early `continue` below and parser exceptions.
            try:
                line, labelColumn = self.firstNonComment(fp)
                if line is None:
                    # This will only ever happen if a file is empty or just has a header/comment
                    continue
                line = line.strip()

                ftype = self.inferType(fp, line, labelColumn)
                if ftype == 'GTF':
                    self.parseGTF(fp, line)
                elif ftype == 'BED3':
                    self.parseBED(fp, line, 3, labelColumn)
                elif ftype == 'BED6':
                    self.parseBED(fp, line, 6, labelColumn)
                else:
                    # Any other inferred type is handled as BED12
                    self.parseBED(fp, line, 12, labelColumn)
            finally:
                fp.close()

        # Sanity check
        if self.tree.countEntries() == 0:
            raise RuntimeError(
                "None of the input BED/GTF files had valid regions")

        # Replace labels
        if len(labels) > 0:
            if len(labels) != len(self.labels):
                # Report the counts, as the message promises, not the lists
                raise RuntimeError(
                    "The number of labels found ({0}) does not match the number input ({1})!"
                    .format(len(self.labels), len(labels)))
            self.labels = labels

        # vine -> tree
        self.tree.finish()