Beispiel #1
0
    def read_transfac(cls, fin, alphabet=None):
        """Parse a TRANSFAC-format PWM from a file.

        Reading starts at the first line whose first field is "PO" or "P0"
        and stops at the first subsequent line whose first field is listed
        in ``cls._TRANSFAC_DELIM_LINES``. Blank lines and lines starting
        with "#" are ignored.

        Args:
            fin: An iterable of text lines (e.g. an open file).
            alphabet: Optional alphabet to reindex the motif to. When it is
                not given, the RNA, DNA and protein unambiguous alphabets
                are tried in that order, falling back to the symbols found
                in the file.

        Returns:
            The result of ``Motif(defacto_alphabet, matrix).reindex(alphabet)``,
            i.e. the parsed PWM together with the inferred or provided
            alphabet.

        Raises:
            ValueError: If fewer than two usable lines are found, if the
                rows have inconsistent lengths, if a header item or row
                label is malformed, or if a matrix entry is not a number.
        """
        header_tags = ("PO", "P0")

        items = []
        started = False
        for line in fin:
            # strip() also covers an empty string, which would otherwise
            # blow up on an index lookup of the first character.
            if not line.strip() or line.startswith("#"):
                continue  # pragma: no cover

            fields = line.split()

            if fields[0] in header_tags:
                started = True

            # 'XX' delimiters may precede the first motif
            if not started:
                continue
            if fields[0] in cls._TRANSFAC_DELIM_LINES:
                break
            items.append(fields)

        if len(items) < 2:
            raise ValueError("Vacuous file.")

        # Is the first line a header line?
        header = items.pop(0)
        hcols = len(header)
        cols = len(items[0])
        if not (header[0] in header_tags or hcols == cols - 1
                or hcols == cols - 2):
            raise ValueError("Missing header line!")  # pragma: no cover

        # Do all lines (except the first) contain the same number of items?
        for i, row in enumerate(items[1:], start=1):
            if len(row) != cols:
                raise ValueError("Inconsistant length, row: {}".format(
                    i))  # pragma: no cover

        # Drop the "PO"/"P0" tag so only the column labels remain.
        if header[0] in header_tags:
            header.pop(0)

        # Vertical or horizontal arrangement? The header holds positions
        # only if every item passes isint(); otherwise it is taken to hold
        # the alphabet symbols.
        position_header = True
        for h in header:
            if not ischar(h):
                raise ValueError("Expected a single character per header "
                                 'item, but got "{}" as one item'.format(
                                     h))  # pragma: no cover
            if not isint(h):
                position_header = False

        alphabet_header = not position_header

        # Check row headers and strip them off, leaving only the counts.
        if alphabet_header:
            for i, r in enumerate(items):
                if not isint(r[0]) and r[0][0] != "P":
                    raise ValueError("Expected position "
                                     "as first item on line {}".format(
                                         i))  # pragma: no cover
                r.pop(0)
            # Loop-invariant, so computed once after the row labels are gone.
            defacto_alphabet = "".join(header)
        else:
            row_symbols = []  # pragma: no cover
            for i, r in enumerate(items):  # pragma: no cover
                if not ischar(r[0]) and r[0][0] != "P":  # pragma: no cover
                    raise ValueError("Expected position "  # pragma: no cover
                                     "as first item on line {}".format(
                                         i))  # pragma: no cover
                row_symbols.append(r.pop(0))  # pragma: no cover
            defacto_alphabet = "".join(row_symbols)  # pragma: no cover

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(defacto_alphabet)

        if alphabet:
            alphabet = Alphabet(alphabet)
            # Keep the requested alphabet only when
            # defacto_alphabet.alphabetic() accepts it; otherwise fall back
            # to the symbols actually present in the file.
            if not defacto_alphabet.alphabetic(alphabet):
                alphabet = defacto_alphabet
        else:
            candidates = (
                unambiguous_rna_alphabet,
                unambiguous_dna_alphabet,
                unambiguous_protein_alphabet,
            )
            for candidate in candidates:
                if defacto_alphabet.alphabetic(candidate):
                    alphabet = candidate
                    break
            if not alphabet:
                alphabet = defacto_alphabet  # pragma: no cover

        # The last item of each row may be extra cruft. Remove
        if len(items[0]) == len(header) + 1:
            for r in items:
                r.pop()

        # items should now be a list of lists of numbers (as strings)
        matrix = np.array(
            [[float(value) for value in row] for row in items],
            dtype=np.float64,
        )

        if position_header:
            # ndarray.transpose() returns a new view rather than modifying
            # the array in place, so the result must be rebound.
            matrix = matrix.transpose()  # pragma: no cover

        return Motif(defacto_alphabet, matrix).reindex(alphabet)
Beispiel #2
0
 def test_alphabet_alphabetic(self):
     """Check alphabetic() on the alphabet's own letters and on a string
     that carries one extra letter."""
     alpha = Alphabet("alphbet")

     own_letters_ok = alpha.alphabetic("alphbet")
     self.assertTrue(own_letters_ok)

     extra_letter_ok = alpha.alphabetic("alphbetX")
     self.assertTrue(not extra_letter_ok)