Esempio n. 1
0
def parse(handle):
    # Generator over the hit lines of a report whose first line starts
    # with "COMMAND LINE".
    #
    # handle is passed to filelib.openfh (presumably a filename or an
    # open file object is accepted -- verify against filelib).
    #
    # Yields: name, position (1-based), strand, score, nlp
    #   position  int; a trailing "C" in the file is removed.
    #   strand    "-" if the position carried a trailing "C", else "+".
    #   score     float.
    #   nlp       float; the NEGATED ln(p-value) from the file.
    #
    # Raises AssertionError if the first line is not the expected
    # header, if the output contains a known error message, or if a
    # hit line does not have the expected layout.
    from filelib import openfh

    handle = openfh(handle)
    line = handle.readline()
    assert line.startswith("COMMAND LINE"), "Unexpected: %s" % line.strip()

    for line in handle:
        # Check for errors.
        if line.find("cannot execute binary file") >= 0:
            raise AssertionError(line.strip())
        # "-l" (item 9 on the command line) does not match any of the
        #    legal options.
        if line.find("does not match any of the legal options") >= 0:
            raise AssertionError(line.strip())

        # Bug: can discard useful information.
        if line.find("position") < 0:
            continue
        cols = line.strip().split()

        # Validate the layout before indexing, so that a short line is
        # reported as a format problem rather than as an IndexError.
        assert len(cols) >= 7, "Unexpected: %s" % line.strip()
        assert cols[1] == "position=" and cols[5] == "ln(p-value)="
        name, position, score, ln_p_value = \
              cols[0], cols[2], cols[4], cols[6]

        if position.endswith("C"):
            position = position[:-1]
            strand = "-"
        else:
            strand = "+"
        position = int(position)
        score, nlp = float(score), -float(ln_p_value)

        yield name, position, strand, score, nlp
Esempio n. 2
0
def _clean_blat_psl(handle):
    # Yields cleaned up lines: first one merged header line, then one
    # line per hit.  Every yielded line is tab-delimited and ends with
    # a newline.
    #
    # handle is passed to filelib.openfh.
    #
    # Raises AssertionError if the preamble does not have the expected
    # layout, or if a hit does not have the same number of columns as
    # the header.
    import filelib

    handle = filelib.openfh(handle)

    # Format:
    # psLayout version 4 DNA DNA   (OR psLayout version 3)
    #
    # <header 1>
    # <header 2>
    # ----------
    # hits

    # Read the lines outside of the assert statements.  Asserts are
    # removed under "python -O", and the lines must be consumed either
    # way or everything that follows is parsed from the wrong line.
    version_line = handle.readline()
    assert version_line.startswith("psLayout version")
    blank_line = handle.readline()
    assert blank_line.strip() == ""

    # Read the 2 header lines and join them together.
    # header2 contains fewer columns than header1.
    header1 = handle.readline().rstrip("\r\n").split("\t")
    header2 = handle.readline().rstrip("\r\n").split("\t")
    header1 = [x.strip() for x in header1]
    header2 = [x.strip() for x in header2]
    assert len(header1) >= len(header2)
    # Pad header2 so that the two rows can be zipped column by column.
    header2 = header2 + [""]*(len(header1)-len(header2))
    header = ["%s %s" % (x1, x2) for (x1, x2) in zip(header1, header2)]
    header = [x.strip() for x in header]
    yield "\t".join(header)+"\n"

    # The separator line should consist only of dashes.
    x = handle.readline().strip()
    assert x == "-"*len(x)

    for cols in filelib.read_cols(handle):
        assert len(cols) == len(header), "%d %d" % (len(header), len(cols))
        yield "\t".join(cols)+"\n"
Esempio n. 3
0
def read_fastq(fh):
    # Yield tuples of (title, sequence, quality)
    # Title contains the "@" character.
    #
    # fh is passed to filelib.openfh.  If fh is a string, it is
    # included in the error message for a length mismatch.
    #
    # Each record consists of exactly 4 lines:
    # <@title>
    # <sequence>
    # +
    # <quality>
    #
    # Raises AssertionError if a record is truncated, if the third
    # line is not "+", or if the sequence and quality differ in length.
    import filelib

    handle = filelib.openfh(fh)
    while True:
        title_line = handle.readline()
        if not title_line:
            # End of file at a record boundary.
            break
        seq_line = handle.readline()
        plus_line = handle.readline()
        qual_line = handle.readline()
        # readline returns an empty string only at end of file, so
        # each of the remaining three lines must be non-empty.
        assert seq_line, "Truncated record: %s" % title_line.strip()
        assert plus_line, "Truncated record: %s" % title_line.strip()
        assert qual_line, "Truncated record: %s" % title_line.strip()
        assert plus_line.strip() == "+", "Missing + line"
        title = title_line.strip()
        sequence = seq_line.strip()
        quality = qual_line.strip()
        if len(sequence) != len(quality):
            err = "Mismatch sequence [%d] and quality [%d] lengths: %s" % (
                len(sequence), len(quality), title)
            if type(fh) is type(""):
                err += " (%s)" % fh
            raise AssertionError(err)
        yield title, sequence, quality
Esempio n. 4
0
def scan_celv3(filename):
    # Generator over the contents of a text (version 3) CEL file.
    #
    # Yields:
    # (SECTION, NAME, VALUE)
    # ("INTENSITY", "DATA", (X, Y, MEAN, STDEV, NPIXELS))
    # ("MASKS", "DATA", [X, Y])
    # ("OUTLIERS", "DATA", [X, Y])
    # ("MODIFIED", "DATA", (X, Y, <float>))
    #
    # SECTION is the text between the brackets of the most recent
    # "[...]" line.  Lines of the form NAME=VALUE are yielded with
    # both parts stripped and VALUE left as a string.
    #
    # Raises AssertionError if filename is not a string, or if a line
    # does not have the expected layout.
    import filelib

    assert type(filename) is type(""), "Need actual filename."
    handle = filelib.openfh(filename)  # in case of GZ file
    try:
        section = None
        for line in handle:
            line = line.strip()
            if not line:
                continue
            if line.startswith("[") and line.endswith("]"):
                section = line[1:-1]
                continue

            # Within the data sections, lines without "=" are rows of
            # whitespace-delimited numbers.
            is_data = line.find("=") < 0
            if is_data and section == "INTENSITY":
                x = line.split()
                assert len(x) == 5, "Broken INTENSITY line: %s" % line
                x = int(x[0]), int(x[1]), float(x[2]), float(x[3]), int(x[4])
                yield section, "DATA", x
            elif is_data and section in ("MASKS", "OUTLIERS"):
                x = line.split()
                assert len(x) == 2
                yield section, "DATA", [int(v) for v in x]
            elif is_data and section == "MODIFIED":
                x = line.split()
                assert len(x) == 3
                x = int(x[0]), int(x[1]), float(x[2])
                yield section, "DATA", x
            else:
                assert section
                assert line.find("=") >= 0, line
                name, value = [x.strip() for x in line.split("=", 1)]
                yield section, name, value
    finally:
        # If I opened this file, then close it.  gunzip might not die.
        # Done in a finally clause so that it also happens when the
        # caller stops iterating early or a line fails to parse.
        handle.close()
Esempio n. 5
0
def guess_cel_version(filename):
    # Guess the version of a CEL file from its first 100 bytes.
    #
    # Returns:
    # v3   Version 3 from MAS software.
    # v4   Version 4 from GCOS software.
    # cc1  Command Console version 1.
    #
    # Raises AssertionError if filename is not a string, if the file
    # is empty or shorter than 100 bytes, or if the version cannot be
    # determined.
    #
    # NOTE(review): the version 3 check compares against str literals,
    # i.e. it assumes Python 2 str semantics for the data read.
    import struct
    import filelib

    # Guess the version from the beginning of the file.

    # I need to be able to read from the start of the file.  If I
    # accept a file handle, it's not guaranteed to be at the start of
    # the file.  I can try to seek to the beginning of the file, but
    # this will fail for some files, e.g. gzip'd files.  It's easiest
    # just to not allow file handles.
    assert type(filename) is type("")
    handle = filelib.openfh(filename, "rb")
    try:
        data = handle.read(100)
    finally:
        handle.close()  # close or gunzip may not die
    assert data, "Empty CEL file: %s" % filename
    assert len(data) == 100, "CEL file is truncated: %s" % filename

    # Check to see if it has the magic numbers for version 4.
    size = struct.calcsize("<ii")
    magic, version = struct.unpack("<ii", data[:size])
    if magic == 64 and version == 4:
        return "v4"

    # Check to see if it has the magic numbers for Command Console
    # version 1.
    size = struct.calcsize(">BB")
    magic, version = struct.unpack(">BB", data[:size])
    if magic == 59 and version == 1:
        return "cc1"

    # See if it looks like version 3.
    # [CEL]
    # Version=3
    s = "[CEL]\nVersion=3"
    # Normalize line endings before comparing.
    d = data
    d = d.replace("\r\n", "\n")
    d = d.replace("\r", "\n")
    if d[:len(s)] == s:
        return "v3"

    raise AssertionError(
        "Unable to guess CEL version for file %s" % filename)
Esempio n. 6
0
def read_cls_file(filename):
    # Return tuple (class_names, classes).  class_names is a list
    # containing the names of the classes.  classes is a list of the
    # classes given in the file.  Classes are either from class_names
    # (kept as strings), or an integer from [0, num_classes).
    #
    # Limitations:
    # Only handles the 3-line categorical CLS layout shown below.
    #
    # Raises AssertionError if the file does not match that layout or
    # contains a class that is neither a class name nor a valid index.
    import filelib

    # Space or tab-delimited format.
    # <num samples> <num classes> 1
    # # <class name 0> <class name 1> ...
    # <0/1 or class name> ...
    handle = filelib.openfh(filename)
    try:
        # Blank lines are ignored.
        lines = [x for x in handle if x.strip()]
    finally:
        # Only close the handle if I opened it myself.
        if type(filename) is type(""):
            handle.close()
    assert len(lines) == 3, \
        "CLS file should contain 3 lines.  Found %d." % len(lines)
    line1, line2, line3 = lines

    # Parse the first line.
    x = line1.strip().split()
    assert len(x) == 3
    assert x[2] == "1"
    num_samples, num_classes = int(x[0]), int(x[1])

    # Parse the second line.
    x = line2.strip().split()
    assert x
    assert x[0] == "#"
    assert len(x) == num_classes + 1, "Class mismatch %d %s: %s" % (
        num_classes + 1, x, filename)
    class_names = x[1:]

    # Parse the third line.
    classes = line3.strip().split()
    assert len(classes) == num_samples
    for i, label in enumerate(classes):
        if label in class_names:
            continue
        try:
            index = int(label)
        except ValueError:
            # Raise explicitly; "assert False" is removed under -O.
            raise AssertionError("Invalid class: %s" % label)
        assert index >= 0 and index < num_classes
        classes[i] = index
    return class_names, classes
Esempio n. 7
0
def read_fasta_many(fh):
    # Generator over the records of a FASTA file.
    #
    # Yields tuples of (title, sequence).  The title does not include
    # the leading ">" character.  The sequence is the concatenation of
    # the stripped lines that follow the title line.
    #
    # Lines that appear before the first title line are yielded as a
    # record with an empty title.
    import filelib

    current_title = ""
    chunks = []
    for raw in filelib.openfh(fh):
        if not raw.startswith(">"):
            chunks.append(raw.strip())
            continue
        # A new record starts here.  Emit the one collected so far.
        if current_title or chunks:
            yield current_title, "".join(chunks)
        current_title, chunks = raw[1:].strip(), []
    # Emit the last record in the file.
    if current_title or chunks:
        yield current_title, "".join(chunks)
Esempio n. 8
0
def parse_fastq(filename):
    # Generator over the records of a FASTQ file.  Yields tuples of
    # (title, sequence, quality), each stripped of whitespace.
    #
    # Every record takes up 4 lines:
    # @4GEOU:00042:00049                          Title
    # ACTGCTAATTCACACTGGATTAGTTGGGCTACTTCATCGT    Sequence
    # +                                           Always "+"
    # =<>>A77.7.54>4444.46-444,44*3333:9:44443    Quality
    #
    # Iteration stops at the first record whose title line is empty
    # after stripping (which includes the end of the file).
    #
    # Raises AssertionError if the third line is not "+", if the
    # sequence and quality differ in length, or if the quality is
    # empty.
    from genomicode import filelib

    handle = filelib.openfh(filename)
    while True:
        # Always consume 4 lines, even at the end of the file.
        record = []
        for _ in range(4):
            record.append(handle.readline().strip())
        title, sequence, plus, quality = record
        if not title:
            break
        assert plus == "+"
        assert len(sequence) == len(quality)
        assert quality

        yield title, sequence, quality
Esempio n. 9
0
def scan_celv4(filename):
    # Generator over the contents of a binary (version 4) CEL file.
    #
    # Yields, in this order:
    # (SECTION, NAME, VALUE)
    # ("CEL", "Version", 4)
    # ("HEADER", "Cols", NUM_COLS)
    # ("HEADER", "Rows", NUM_ROWS)
    # ("HEADER", "Header", <string>)
    # ("HEADER", "Algorithm", <string>)
    # ("HEADER", "AlgorithmParameters", <string>)
    # ("INTENSITY", "DATA", (X, Y, MEAN, STDEV, NPIXELS))
    # ("MASKS", "DATA", (X, Y))
    # ("OUTLIERS", "DATA", (X, Y))
    #
    # The sub grid records at the end of the file are read but not
    # yielded.
    #
    # Raises AssertionError if filename is not a string, or if the
    # magic number, version, or cell count are not as expected.
    import struct
    import filelib

    # integer   32-bit signed integer
    # DWORD     32-bit unsigned integer
    # float     32-bit floating-point number
    # short     16-bit signed integer
    # little-endian
    def read(fmt):
        # Read and unpack exactly as many bytes as fmt describes.
        size = struct.calcsize(fmt)
        return struct.unpack(fmt, handle.read(size))

    assert type(filename) is type("")
    handle = filelib.openfh(filename, "rb")
    try:
        magic, version = read("<ii")
        assert magic == 64
        assert version == 4
        yield "CEL", "Version", version

        num_cols, num_rows, num_cells = read("<iii")
        assert num_cells == num_cols * num_rows
        yield "HEADER", "Cols", num_cols
        yield "HEADER", "Rows", num_rows

        # The entire HEADER section of the CEL v3 files.
        length, = read("<i")
        header, = read("<%ds" % length)
        yield "HEADER", "Header", header

        length, = read("<i")
        algorithm, = read("<%ds" % length)
        yield "HEADER", "Algorithm", algorithm

        length, = read("<i")
        parameters, = read("<%ds" % length)
        yield "HEADER", "AlgorithmParameters", parameters

        cell_margin, num_outliers, num_masked, num_sub_grids = read("<iIIi")

        # Optimize the unpacking here.  Unpack many cells with a
        # single struct call instead of one call per cell.
        READ_SIZE = 100000  # 10 bytes each
        total_to_read = num_cells
        index = 0
        while total_to_read:
            n = min(total_to_read, READ_SIZE)
            data = read("<" + "ffh" * n)
            total_to_read -= n
            for i in range(0, len(data), 3):
                # Cells are numbered consecutively, row by row.  Use
                # floor division so that y stays an integer even where
                # "/" is true division.
                y, x = divmod(index, num_cols)
                mean, stdev, npixels = data[i:i + 3]
                yield "INTENSITY", "DATA", (x, y, mean, stdev, npixels)
                index += 1

        for i in range(num_masked):
            x, y = read("<hh")
            yield "MASKS", "DATA", (x, y)

        for i in range(num_outliers):
            x, y = read("<hh")
            yield "OUTLIERS", "DATA", (x, y)

        # Read past the sub grids.  The values are not used.
        for i in range(num_sub_grids):
            read("<ii")    # row, col
            read("<ffff")  # upper left x, y; upper right x, y
            read("<ffff")  # lower left x, y; lower right x, y
            read("<ffff")  # left, top, right, bottom cell position
    finally:
        # Close the file even if the caller stops iterating early or
        # the file turns out to be malformed.
        handle.close()
Esempio n. 10
0
def scan_cdf(filename):
    # Generator over the contents of a text CDF file.
    #
    # Yields:
    # (SECTION, NAME, VALUE)
    # ("CDF", "Version", VERSION)
    # ("Chip", "Name", NAME_OF_ARRAY)
    # ("Chip", "Rows", NUM_ROWS)
    # ("Chip", "Cols", NUM_COLS)
    #
    # ("Unit<x>", "UnitType", <type>)    <type> 3 is Expression.
    # ("Unit<x>", "NumAtoms", <num>)     Num probes, match/mismatch count as 1.
    # ("Unit<x>", "NumCells", <num>)     Num probes, match/mismatch count as 2.
    # ("Unit<x>", "NumberBlocks", <num>) Num blocks in probe set.
    # ("Unit_Block<x>", "CellHeader", <values>)  Header of table.
    #   X      X coordinate of cell.
    #   Y      Y coordinate of cell.
    #   QUAL   Probe set name.  For Genotyping units, includes allele.
    #   EXPOS  For Expression, ranges from [0, NumAtoms-1]
    #   ATOM   For Expression, same as EXPOS.  Groups together match/mismatch.
    #   INDEX  Used to look up cell data in CEL file.
    #   POS    Indexes within probe where mismatch occurs.
    #   PBASE  Base of probe at substitution position.
    #   TBASE  Base of target.  Is a PM probe if PBASE != TBASE.
    #          Otherwise, is a MM probe.  Seems flipped to me?  I guess
    #          it's a PM because it interrogates the complement.
    # ("Unit_Block<x>", "Cell<i>", <values>)
    #
    # SECTION is the text between the brackets of the most recent
    # "[...]" line.  VALUE is a string unless it is one of the values
    # converted below: selected names become ints, and the CellHeader
    # and Cell<i> rows become tuples.
    #
    # Raises AssertionError if filename is not a string, or if a line
    # does not have the expected layout.
    import filelib

    # Do some cleaning up of the results.  These are the names whose
    # values are converted to int, for each kind of section.
    Chip_to_int = [
        "Rows", "Cols", "NumberOfUnits", "MaxUnit", "NumQCUnits"
    ]
    Unit_to_int = [
        "Direction", "NumAtoms", "NumCells", "UnitNumber", "UnitType",
        "NumberBlocks"
    ]
    Unit_Block_to_int = [
        "BlockNumber", "NumAtoms", "NumCells", "StartPosition",
        "StopPosition"
    ]
    Unit_Block_Cell_to_int = [
        "X", "Y", "EXPOS", "POS", "ATOM", "INDEX", "CODONIND", "CODON",
        "REGIONTYPE"
    ]

    section = None
    # Positions of the integer columns, from the most recent
    # CellHeader seen in a Unit_Block section.
    Unit_Block_Cell_int_indexes = None
    assert type(filename) is type("")
    handle = filelib.openfh(filename)
    try:
        for line in handle:
            line = line.strip("\r\n")
            if not line:
                continue

            if line.startswith("[") and line.endswith("]"):
                section = line[1:-1]
                continue

            assert section
            assert line.find("=") >= 0, line
            name, value = [x.strip() for x in line.split("=", 1)]

            in_qc = section.startswith("QC")
            in_unit = (
                section.startswith("Unit") and section.find("Block") < 0)
            in_unit_block = (
                section.startswith("Unit") and section.find("Block") >= 0)

            if section == "Chip" and name in Chip_to_int:
                value = int(value)
            elif in_qc and name == "CellHeader":
                value = tuple(value.split("\t"))
                assert len(value) == 6
            elif in_qc and name.startswith("Cell"):
                value = value.split("\t")
                assert len(value) == 6
                x0, x1, x2, x3, x4, x5 = value
                value = int(x0), int(x1), x2, int(x3), int(x4), int(x5)
            elif in_unit and name in Unit_to_int:
                value = int(value)
            elif in_unit_block and name in Unit_Block_to_int:
                value = int(value)
            elif in_unit_block and name == "CellHeader":
                value = tuple(value.split("\t"))
                Unit_Block_Cell_int_indexes = [
                    j for (j, x) in enumerate(value)
                    if x in Unit_Block_Cell_to_int
                ]
            elif in_unit_block and name.startswith("Cell"):
                value = value.split("\t")
                # A header must have been seen.  A header without any
                # integer columns (empty list) is acceptable.
                assert Unit_Block_Cell_int_indexes is not None
                for j in Unit_Block_Cell_int_indexes:
                    value[j] = int(value[j])
                value = tuple(value)

            yield section, name, value
    finally:
        # filename is always a string here, so I opened the file.
        # Close it even if the caller stops iterating early or a line
        # fails to parse.
        handle.close()
Esempio n. 11
0
def scan_bpmapv3(filename):
    # Generator over the contents of a binary BPMAP version 3 file.
    #
    # Yields:
    # (SECTION, NAME, VALUE)
    # ("DESCRIPTION", "SEQUENCE_ID", <data>)
    # ("DESCRIPTION", "NAME", <data>)
    # ("DESCRIPTION", "TYPE", <data>)             0 (PM/MM); 1 (PM-only)
    # ("DESCRIPTION", "OFFSET", <data>)           file offset of POSITION_INFO
    # ("DESCRIPTION", "GROUP", <data>)
    # ("DESCRIPTION", "VERSION", <data>)
    # ("DESCRIPTION", "PARAMETER", (<name>, <value>))
    # ("POSITION_INFO", "SEQUENCE_ID", <data>)
    # ("POSITION_INFO", "PM_COORD", (<x>, <y>))   0-based coordinate
    # ("POSITION_INFO", "MM_COORD", (<x>, <y>))
    # ("POSITION_INFO", "PROBE_SEQ", <data>)
    # ("POSITION_INFO", "MATCH_SCORE", <data>)    always 1
    # ("POSITION_INFO", "PROBE_POS", <data>)      0-based position
    # ("POSITION_INFO", "STRAND", <data>)         1 target on +, 0 target on -
    #
    # All DESCRIPTION records for all sequences are yielded before the
    # first POSITION_INFO record.  MM_COORD is only yielded for
    # sequences whose TYPE is 0.
    #
    # Raises AssertionError if filename is not a string, or if the
    # magic string or the file version are not as expected.
    import struct
    import filelib

    def read(fmt):
        # Read and unpack exactly as many bytes as fmt describes.
        size = struct.calcsize(fmt)
        return struct.unpack(fmt, handle.read(size))

    def read_string():
        # A string is stored as a 4-byte length followed by the data.
        length, = read(">I")
        return read(">%ds" % length)[0]

    # big-endian
    assert type(filename) is type("")
    handle = filelib.openfh(filename, "rb")
    try:
        magic, = read(">8s")
        assert magic == "PHT7\r\n\x1a\n"

        # Bug in format: sometimes not stored as big-endian float.
        s = handle.read(4)
        version, = struct.unpack(">f", s)
        if int(version) not in [1, 2, 3]:
            version, = struct.unpack("<f", s)
        assert int(version) in [1, 2, 3]
        assert version == 3

        num_sequences, = read(">I")

        # SEQUENCE DESCRIPTION
        # Remember the probe count and type of each sequence.  They
        # are needed to read the position information below.
        seq2nprobes = [None] * num_sequences
        seq2types = [None] * num_sequences
        for i in range(num_sequences):
            yield "DESCRIPTION", "SEQUENCE_ID", i
            sequence_name = read_string()
            yield "DESCRIPTION", "NAME", sequence_name

            probe_type, offset, num_probes = read(">III")
            seq2types[i] = probe_type
            seq2nprobes[i] = num_probes
            yield "DESCRIPTION", "TYPE", probe_type
            yield "DESCRIPTION", "OFFSET", offset
            group_name = read_string()
            yield "DESCRIPTION", "GROUP", group_name
            seq_version = read_string()
            yield "DESCRIPTION", "VERSION", seq_version

            num_params, = read(">I")
            for j in range(num_params):
                name = read_string()
                value = read_string()
                yield "DESCRIPTION", "PARAMETER", (name, value)

        # SEQUENCES
        bases = "ACGT"
        for i in range(num_sequences):
            sequence_id, = read(">I")
            yield "POSITION_INFO", "SEQUENCE_ID", sequence_id
            for j in range(seq2nprobes[i]):
                pm_x, pm_y = read(">II")
                yield "POSITION_INFO", "PM_COORD", (pm_x, pm_y)
                if seq2types[i] == 0:
                    mm_x, mm_y = read(">II")
                    yield "POSITION_INFO", "MM_COORD", (mm_x, mm_y)

                probe_length, = read(">B")
                seq_code = read(">7B")
                match_score, probe_pos, strand = read(">fIB")

                # Convert the code to a sequence.  Each byte holds 4
                # bases of 2 bits each, starting at the high bits.
                seq_str = []
                for code in seq_code:
                    for shift in (6, 4, 2, 0):
                        seq_str.append(bases[(code >> shift) & 3])
                seq_str = "".join(seq_str[:probe_length])
                yield "POSITION_INFO", "PROBE_SEQ", seq_str
                yield "POSITION_INFO", "MATCH_SCORE", match_score
                yield "POSITION_INFO", "PROBE_POS", probe_pos
                yield "POSITION_INFO", "STRAND", strand
    finally:
        # Close the file even if the caller stops iterating early or
        # the file turns out to be malformed.
        handle.close()