def parse(handle):
    """Parse the output of a scanning program into one tuple per hit.

    handle is passed to filelib.openfh, so it may be whatever that
    helper accepts (presumably a filename or an open file object).
    The first line of the output must start with "COMMAND LINE".

    Yields tuples of (name, position, strand, score, nlp):
      name      first column of the hit line
      position  integer position (1-based), without the strand marker
      strand    "-" if the position carried a trailing "C", else "+"
      score     float
      nlp       NEGATED ln(p-value), i.e. -float(ln_p_value)

    Raises AssertionError if the header is missing, if the output
    contains a known fatal error message, or if a hit line does not
    have the expected "position=" / "ln(p-value)=" labels.
    """
    from filelib import openfh

    handle = openfh(handle)
    line = handle.readline()
    assert line.startswith("COMMAND LINE"), "Unexpected: %s" % line.strip()

    # Messages that show up in the output when the program failed to
    # run, e.g.:
    # "-l" (item 9 on the command line) does not match any of the
    # legal options.
    fatal_messages = (
        "cannot execute binary file",
        "does not match any of the legal options",
    )

    for line in handle:
        # Check for errors.
        for message in fatal_messages:
            if message in line:
                raise AssertionError(line.strip())

        # Bug: can discard useful information.
        if "position" not in line:
            continue

        cols = line.strip().split()
        name, position, score, ln_p_value = (
            cols[0], cols[2], cols[4], cols[6])
        assert cols[1] == "position=" and cols[5] == "ln(p-value)="

        # A trailing "C" on the position marks a hit on the "-" strand.
        if position.endswith("C"):
            position = position[:-1]
            strand = "-"
        else:
            strand = "+"
        position = int(position)
        score, nlp = float(score), -float(ln_p_value)
        yield name, position, strand, score, nlp
def _clean_blat_psl(handle):
    """Yield cleaned-up, tab-delimited lines from a BLAT PSL file.

    Expected input format:
      psLayout version 4 DNA DNA   (OR psLayout version 3)
      <blank line>
      <header 1>
      <header 2>
      ----------
      hits

    The two header lines are merged column-by-column into a single
    header line, which is yielded first.  Each hit is then yielded as
    one tab-joined line.  Every yielded line ends with a newline.

    Raises AssertionError if the preamble does not match the format
    above or if a hit has a different number of columns than the
    header.
    """
    import filelib

    handle = filelib.openfh(handle)

    # Read each line BEFORE asserting on it.  If the readline() call
    # lives inside the assert, running under "python -O" strips the
    # assert and the line is never consumed.
    version_line = handle.readline()
    assert version_line.startswith("psLayout version")
    blank_line = handle.readline()
    assert blank_line.strip() == ""

    # Read the 2 header lines and join them together.
    # header2 contains fewer columns than header1.
    header1 = [x.strip() for x in handle.readline().rstrip("\r\n").split("\t")]
    header2 = [x.strip() for x in handle.readline().rstrip("\r\n").split("\t")]
    assert len(header1) >= len(header2)
    header2 = header2 + [""] * (len(header1) - len(header2))
    header = [("%s %s" % (x1, x2)).strip() for (x1, x2) in zip(header1, header2)]
    yield "\t".join(header) + "\n"

    # The separator line consists only of dashes.
    separator = handle.readline().strip()
    assert separator == "-" * len(separator)

    # filelib.read_cols presumably yields one list of column strings
    # per remaining line -- confirm against filelib.
    for cols in filelib.read_cols(handle):
        assert len(cols) == len(header), "%d %d" % (len(header), len(cols))
        yield "\t".join(cols) + "\n"
def read_fastq(fh):
    """Yield (title, sequence, quality) tuples from a FASTQ file.

    fh is passed to filelib.openfh.  If fh is a string, it is also
    included in the error message for length mismatches.

    Each record is 4 lines:
      <@title>
      <sequence>
      +
      <quality>

    The yielded title still contains the leading "@" character.  All
    three values are stripped of surrounding whitespace.

    Raises AssertionError if a record is truncated, if the third line
    is not exactly "+", or if the sequence and quality strings differ
    in length.
    """
    import filelib

    handle = filelib.openfh(fh)
    while True:
        title_line = handle.readline()
        if not title_line:
            break
        sequence_line = handle.readline()
        plus_line = handle.readline()
        quality_line = handle.readline()

        # An empty string from readline() means the file ended in the
        # middle of a record.
        assert sequence_line, "Truncated FASTQ record"
        assert plus_line, "Truncated FASTQ record"
        assert quality_line, "Truncated FASTQ record"
        assert plus_line.strip() == "+", "Missing + line"

        title = title_line.strip()
        sequence = sequence_line.strip()
        quality = quality_line.strip()
        if len(sequence) != len(quality):
            err = "Mismatch sequence [%d] and quality [%d] lengths: %s" % (
                len(sequence), len(quality), title)
            if isinstance(fh, str):
                err += " (%s)" % fh
            assert len(sequence) == len(quality), err
        yield title, sequence, quality
def scan_celv3(filename):
    """Scan a text (version 3) CEL file and yield its contents.

    filename must be a string; file handles are rejected.

    Yields tuples of (SECTION, NAME, VALUE):
      (SECTION, NAME, VALUE)         "NAME=VALUE" lines; VALUE is a string
      ("INTENSITY", "DATA", (X, Y, MEAN, STDEV, NPIXELS))
                                     tuple of (int, int, float, float, int)
      ("MASKS", "DATA", [X, Y])      list of 2 ints
      ("OUTLIERS", "DATA", [X, Y])   list of 2 ints
      ("MODIFIED", "DATA", (X, Y, VALUE))
                                     tuple of (int, int, float)

    SECTION is the text between the brackets of the most recent
    "[...]" line.  Blank lines are skipped.

    Raises AssertionError for malformed data lines, or for a line
    that appears before any section header.
    """
    import filelib

    assert isinstance(filename, str), "Need actual filename."
    handle = filelib.openfh(filename)   # in case of GZ file
    try:
        section = None
        for line in handle:
            line = line.strip()
            if not line:
                continue
            has_equals = "=" in line

            if line.startswith("[") and line.endswith("]"):
                section = line[1:-1]
            elif section == "INTENSITY" and not has_equals:
                x = line.split()
                assert len(x) == 5, "Broken INTENSITY line: %s" % line
                x = int(x[0]), int(x[1]), float(x[2]), float(x[3]), int(x[4])
                yield section, "DATA", x
            elif section in ("MASKS", "OUTLIERS") and not has_equals:
                x = line.split()
                assert len(x) == 2
                yield section, "DATA", [int(v) for v in x]
            elif section == "MODIFIED" and not has_equals:
                x = line.split()
                assert len(x) == 3
                x = int(x[0]), int(x[1]), float(x[2])
                yield section, "DATA", x
            else:
                assert section
                assert has_equals, line
                name, value = [x.strip() for x in line.split("=", 1)]
                yield section, name, value
    finally:
        # I opened this file, so close it, even if parsing fails or
        # the caller abandons the generator.  gunzip might not die.
        handle.close()
def guess_cel_version(filename):
    """Guess the version of a CEL file from its first 100 bytes.

    Returns one of:
      "v3"   Version 3 from MAS software.
      "v4"   Version 4 from GCOS software.
      "cc1"  Command Console version 1.

    filename must be a string.  I need to be able to read from the
    start of the file.  If I accept a file handle, it's not
    guaranteed to be at the start of the file.  I can try to seek to
    the beginning of the file, but this will fail for some files,
    e.g. gzip'd files.  It's easiest just to not allow file handles.

    Raises AssertionError if the file is empty, shorter than 100
    bytes, or does not look like any of the known versions.
    """
    import struct
    import filelib

    assert isinstance(filename, str)
    handle = filelib.openfh(filename, "rb")
    try:
        data = handle.read(100)
    finally:
        handle.close()   # close or gunzip may not die
    assert data, "Empty CEL file: %s" % filename
    assert len(data) == 100, "CEL file is truncated: %s" % filename

    # Check to see if it has the magic numbers for version 4.
    magic, version = struct.unpack_from("<ii", data)
    if magic == 64 and version == 4:
        return "v4"

    # Check to see if it has the magic numbers for Command Console
    # version 1.
    magic, version = struct.unpack_from(">BB", data)
    if magic == 59 and version == 1:
        return "cc1"

    # See if it looks like version 3.
    # [CEL]
    # Version=3
    # The data was read in binary mode, so compare against byte
    # strings.  Normalize the line endings first.
    signature = b"[CEL]\nVersion=3"
    normalized = data.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
    if normalized[:len(signature)] == signature:
        return "v3"

    raise AssertionError(
        "Unable to guess CEL version for file %s" % filename)
def read_cls_file(filename):
    """Read a categorical CLS file and return (class_names, classes).

    Space or tab-delimited format (blank lines are ignored):
      <num samples> <num classes> 1
      # <class name 0> <class name 1> ...
      <0/1 or class name> ...

    class_names is the list of names from the second line.  classes
    has one entry per sample from the third line: an entry that
    matches a class name is kept as that string; anything else must
    be an integer in [0, num classes) and is returned as an int.

    Raises AssertionError if the file does not follow this format.
    """
    import filelib

    nonblank = []
    for raw in filelib.openfh(filename):
        if raw.strip():
            nonblank.append(raw)
    assert len(nonblank) == 3, \
        "CLS file should contain 3 lines. Found %d." % len(nonblank)
    size_line, names_line, classes_line = nonblank

    # First line: the sample and class counts, followed by a literal 1.
    sizes = size_line.strip().split()
    assert len(sizes) == 3
    assert sizes[2] == "1"
    num_samples = int(sizes[0])
    num_classes = int(sizes[1])

    # Second line: "#" followed by one name per class.
    names = names_line.strip().split()
    assert names
    assert names[0] == "#"
    assert len(names) == num_classes + 1, "Class mismatch %d %s: %s" % (
        num_classes + 1, names, filename)
    class_names = names[1:]

    # Third line: one class per sample.
    classes = classes_line.strip().split()
    assert len(classes) == num_samples
    for i, label in enumerate(classes):
        if label in class_names:
            continue
        try:
            label = int(label)
        except ValueError:
            assert False, "Invalid class: %s" % label
        assert 0 <= label < num_classes
        classes[i] = label
    return class_names, classes
def read_fasta_many(fh):
    """Yield (title, sequence) tuples for the records in a FASTA file.

    fh is passed to filelib.openfh.  The title does not have the ">"
    character.  The sequence is the concatenation of the stripped
    lines that follow the title line.
    """
    import filelib

    current_title = ""
    chunks = []
    for raw in filelib.openfh(fh):
        if not raw.startswith(">"):
            chunks.append(raw.strip())
            continue
        # A new title line ends the previous record, if there was one.
        if current_title or chunks:
            yield current_title, "".join(chunks)
        current_title = raw[1:].strip()
        chunks = []

    # Emit whatever is left over once the file ends.
    if current_title or chunks:
        yield current_title, "".join(chunks)
def parse_fastq(filename):
    """Iterate over a FASTQ file, yielding (title, sequence, quality).

    Format of FASTQ files:
      @4GEOU:00042:00049                          Title
      ACTGCTAATTCACACTGGATTAGTTGGGCTACTTCATCGT    Sequence
      +                                           Always "+"
      =<>>A77.7.54>4444.46-444,44*3333:9:44443    Quality

    Lines are read 4 at a time and stripped.  Iteration stops at the
    first record whose title line is empty after stripping (which
    includes the end of the file).

    Raises AssertionError if the third line is not "+", if the
    sequence and quality lengths differ, or if the quality is empty.
    """
    from genomicode import filelib

    handle = filelib.openfh(filename)
    while True:
        # Always consume a full 4-line record before looking at it.
        title = handle.readline().strip()
        sequence = handle.readline().strip()
        separator = handle.readline().strip()
        quality = handle.readline().strip()
        if not title:
            break
        assert separator == "+"
        assert len(sequence) == len(quality)
        assert quality
        yield title, sequence, quality
def scan_celv4(filename):
    """Scan a binary (version 4) CEL file and yield its contents.

    filename must be a string; file handles are rejected.

    Yields tuples of (SECTION, NAME, VALUE), in this order:
      ("CEL", "Version", 4)
      ("HEADER", "Cols", NUM_COLS)
      ("HEADER", "Rows", NUM_ROWS)
      ("HEADER", "Header", <raw string>)
      ("HEADER", "Algorithm", <raw string>)
      ("HEADER", "AlgorithmParameters", <raw string>)
      ("INTENSITY", "DATA", (X, Y, MEAN, STDEV, NPIXELS))   one per cell
      ("MASKS", "DATA", (X, Y))
      ("OUTLIERS", "DATA", (X, Y))

    The sub-grid records at the end of the file are read but not
    yielded.

    Raises AssertionError if the magic number or version is wrong or
    the cell count does not equal rows * cols, and struct.error if
    the file ends before a complete field can be read.
    """
    import struct
    import filelib

    # integer   32-bit signed integer
    # DWORD     32-bit unsigned integer
    # float     32-bit floating-point number
    # short     16-bit signed integer
    # little-endian

    def read(fmt):
        size = struct.calcsize(fmt)
        return struct.unpack(fmt, handle.read(size))

    assert isinstance(filename, str)
    handle = filelib.openfh(filename, "rb")
    try:
        magic, version = read("<ii")
        assert magic == 64
        assert version == 4
        yield "CEL", "Version", version

        num_cols, num_rows, num_cells = read("<iii")
        assert num_cells == num_cols * num_rows
        yield "HEADER", "Cols", num_cols
        yield "HEADER", "Rows", num_rows

        # Three length-prefixed strings.  The first is the entire
        # HEADER section of the CEL v3 files.
        length, = read("<i")
        header, = read("<%ds" % length)
        yield "HEADER", "Header", header
        length, = read("<i")
        algorithm, = read("<%ds" % length)
        yield "HEADER", "Algorithm", algorithm
        length, = read("<i")
        parameters, = read("<%ds" % length)
        yield "HEADER", "AlgorithmParameters", parameters

        cell_margin, num_outliers, num_masked, num_sub_grids = read("<iIIi")

        # Optimize the unpacking here: unpack many cells per call
        # rather than one struct.unpack per cell.
        READ_SIZE = 100000    # cells per read, 10 bytes each
        remaining = num_cells
        index = 0
        while remaining > 0:
            n = min(remaining, READ_SIZE)
            data = read("<" + "ffh" * n)
            remaining -= n
            for i in range(0, len(data), 3):
                # Cells are stored row by row.  divmod uses floor
                # division, so y stays an integer on Python 3 too.
                y, x = divmod(index, num_cols)
                mean, stdev, npixels = data[i:i + 3]
                yield "INTENSITY", "DATA", (x, y, mean, stdev, npixels)
                index += 1

        for i in range(num_masked):
            x, y = read("<hh")
            yield "MASKS", "DATA", (x, y)

        for i in range(num_outliers):
            x, y = read("<hh")
            yield "OUTLIERS", "DATA", (x, y)

        # Consume the sub-grids so that a truncated file is detected.
        # Nothing is yielded for them.
        for i in range(num_sub_grids):
            read("<ii")     # row, col
            read("<ffff")   # upper left x, y; upper right x, y
            read("<ffff")   # lower left x, y; lower right x, y
            read("<ffff")   # left, top, right, bottom cell positions
    finally:
        # I opened this file, so close it even if parsing fails or
        # the caller abandons the generator.
        handle.close()
def scan_cdf(filename):
    """Scan a text CDF file and yield its contents.

    filename must be a string; file handles are rejected.

    Yields tuples of (SECTION, NAME, VALUE), for example:
      ("CDF", "Version", VERSION)
      ("Chip", "Name", NAME_OF_ARRAY)
      ("Chip", "Rows", NUM_ROWS)
      ("Chip", "Cols", NUM_COLS)

      ("Unit<x>", "UnitType", <type>)   <type> 3 is Expression.
      ("Unit<x>", "NumAtoms", <num>)    Num probes, match/mismatch count as 1.
      ("Unit<x>", "NumCells", <num>)    Num probes, match/mismatch count as 2.
      ("Unit<x>", "NumberBlocks", <num>)  Num blocks in probe set.
      ("Unit_Block<x>", "CellHeader", <values>)  Header of table.
        X      X coordinate of cell.
        Y      Y coordinate of cell.
        QUAL   Probe set name.  For Genotyping units, includes allele.
        EXPOS  For Expression, ranges from [0, NumAtoms-1]
        ATOM   For Expression, same as EXPOS.  Groups together
               match/mismatch.
        INDEX  Used to look up cell data in CEL file.
        POS    Indexes within probe where mismatch occurs.
        PBASE  Base of probe at substitution position.
        TBASE  Base of target.  Is a PM probe if PBASE != TBASE.
               Otherwise, is a MM probe.  Seems flipped to me?  I guess
               it's a PM because it interrogates the complement.
      ("Unit_Block<x>", "Cell<i>", <values>)

    VALUE is a string unless one of the clean-up rules below applies:
    known numeric fields are converted to int, and tab-delimited
    header/cell rows are converted to tuples.

    Raises AssertionError for a line that appears before any section
    header, a line without "=", a QC row that does not have 6
    columns, or a Unit_Block cell row that appears before its
    CellHeader.
    """
    import filelib

    # Names whose values are converted to int.  These never change,
    # so build them once rather than once per line.
    chip_to_int = frozenset([
        "Rows", "Cols", "NumberOfUnits", "MaxUnit", "NumQCUnits"])
    unit_to_int = frozenset([
        "Direction", "NumAtoms", "NumCells", "UnitNumber", "UnitType",
        "NumberBlocks"])
    unit_block_to_int = frozenset([
        "BlockNumber", "NumAtoms", "NumCells", "StartPosition",
        "StopPosition"])
    unit_block_cell_to_int = frozenset([
        "X", "Y", "EXPOS", "POS", "ATOM", "INDEX", "CODONIND", "CODON",
        "REGIONTYPE"])

    section = None
    # Column indexes of the int fields in the most recent Unit_Block
    # CellHeader.  None until a CellHeader has been seen.
    cell_int_indexes = None

    assert isinstance(filename, str)
    handle = filelib.openfh(filename)
    try:
        for line in handle:
            line = line.strip("\r\n")
            if not line:
                continue
            if line.startswith("[") and line.endswith("]"):
                section = line[1:-1]
                continue
            assert section
            assert line.find("=") >= 0, line
            name, value = [x.strip() for x in line.split("=", 1)]

            # Do some cleaning up of the results.
            in_qc = section.startswith("QC")
            in_unit = section.startswith("Unit")
            in_unit_block = in_unit and section.find("Block") >= 0

            if section == "Chip" and name in chip_to_int:
                value = int(value)
            elif in_qc and name == "CellHeader":
                value = tuple(value.split("\t"))
                assert len(value) == 6
            elif in_qc and name.startswith("Cell"):
                value = value.split("\t")
                assert len(value) == 6
                x0, x1, x2, x3, x4, x5 = value
                value = int(x0), int(x1), x2, int(x3), int(x4), int(x5)
            elif in_unit and not in_unit_block and name in unit_to_int:
                value = int(value)
            elif in_unit_block and name in unit_block_to_int:
                value = int(value)
            elif in_unit_block and name == "CellHeader":
                value = tuple(value.split("\t"))
                cell_int_indexes = [
                    j for (j, x) in enumerate(value)
                    if x in unit_block_cell_to_int]
            elif in_unit_block and name.startswith("Cell"):
                value = value.split("\t")
                # Compare against None: a header with no int columns
                # gives an empty list, which is still a valid header.
                assert cell_int_indexes is not None, \
                    "Cell data before CellHeader"
                for j in cell_int_indexes:
                    value[j] = int(value[j])
                value = tuple(value)

            yield section, name, value
    finally:
        # I opened this file, so close it even if parsing fails or
        # the caller abandons the generator.
        handle.close()
def scan_bpmapv3(filename):
    """Scan a binary BPMAP (version 3) file and yield its contents.

    filename must be a string; file handles are rejected.

    Yields tuples of (SECTION, NAME, VALUE):
      ("DESCRIPTION", "SEQUENCE_ID", <data>)
      ("DESCRIPTION", "NAME", <data>)
      ("DESCRIPTION", "TYPE", <data>)      0 (PM/MM); 1 (PM-only)
      ("DESCRIPTION", "OFFSET", <data>)    file offset of POSITION_INFO
      ("DESCRIPTION", "GROUP", <data>)
      ("DESCRIPTION", "VERSION", <data>)
      ("DESCRIPTION", "PARAMETER", (<name>, <value>))
      ("POSITION_INFO", "SEQUENCE_ID", <data>)
      ("POSITION_INFO", "PM_COORD", (<x>, <y>))   0-based coordinate
      ("POSITION_INFO", "MM_COORD", (<x>, <y>))   only when TYPE is 0
      ("POSITION_INFO", "PROBE_SEQ", <data>)
      ("POSITION_INFO", "MATCH_SCORE", <data>)    always 1
      ("POSITION_INFO", "PROBE_POS", <data>)      0-based position
      ("POSITION_INFO", "STRAND", <data>)  1 target on +, 0 target on -

    All DESCRIPTION records for every sequence come first, followed
    by the POSITION_INFO records.

    Raises AssertionError if the magic string or the file version is
    not as expected, and struct.error if the file ends before a
    complete field can be read.
    """
    import struct
    import filelib

    def read(fmt):
        size = struct.calcsize(fmt)
        return struct.unpack(fmt, handle.read(size))

    def read_string():
        # Strings are stored as a 4-byte length followed by the data.
        length, = read(">I")
        return read(">%ds" % length)[0]

    # big-endian
    assert isinstance(filename, str)
    handle = filelib.openfh(filename, "rb")
    try:
        # The file is read in binary mode, so compare against a byte
        # string.
        magic, = read(">8s")
        assert magic == b"PHT7\r\n\x1a\n"

        # Bug in format: sometimes not stored as big-endian float.
        # Try big-endian first, and fall back to little-endian if the
        # result is not a plausible version number.
        raw_version = handle.read(4)
        file_version, = struct.unpack(">f", raw_version)
        if int(file_version) not in [1, 2, 3]:
            file_version, = struct.unpack("<f", raw_version)
        assert int(file_version) in [1, 2, 3]
        assert file_version == 3

        num_sequences, = read(">I")

        # SEQUENCE DESCRIPTION
        # Remember the probe type and probe count of each sequence.
        # They are needed to decode the POSITION_INFO section.
        seq2nprobes = [None] * num_sequences
        seq2types = [None] * num_sequences
        for i in range(num_sequences):
            yield "DESCRIPTION", "SEQUENCE_ID", i
            sequence_name = read_string()
            yield "DESCRIPTION", "NAME", sequence_name

            probe_type, offset, num_probes = read(">III")
            seq2types[i] = probe_type
            seq2nprobes[i] = num_probes
            yield "DESCRIPTION", "TYPE", probe_type
            yield "DESCRIPTION", "OFFSET", offset

            group_name = read_string()
            yield "DESCRIPTION", "GROUP", group_name
            sequence_version = read_string()
            yield "DESCRIPTION", "VERSION", sequence_version

            num_params, = read(">I")
            for j in range(num_params):
                name = read_string()
                value = read_string()
                yield "DESCRIPTION", "PARAMETER", (name, value)

        # SEQUENCES
        bases = "ACGT"
        for i in range(num_sequences):
            sequence_id, = read(">I")
            yield "POSITION_INFO", "SEQUENCE_ID", sequence_id

            for j in range(seq2nprobes[i]):
                pm_x, pm_y = read(">II")
                yield "POSITION_INFO", "PM_COORD", (pm_x, pm_y)
                if seq2types[i] == 0:
                    mm_x, mm_y = read(">II")
                    yield "POSITION_INFO", "MM_COORD", (mm_x, mm_y)
                probe_length, = read(">B")
                seq_code = read(">7B")
                match_score, probe_pos, strand = read(">fIB")

                # Convert the code to a sequence.  Each byte packs 4
                # bases at 2 bits per base, highest bits first.
                seq_chars = []
                for code in seq_code:
                    for shift in (6, 4, 2, 0):
                        seq_chars.append(bases[(code >> shift) & 3])
                seq_str = "".join(seq_chars[:probe_length])

                yield "POSITION_INFO", "PROBE_SEQ", seq_str
                yield "POSITION_INFO", "MATCH_SCORE", match_score
                yield "POSITION_INFO", "PROBE_POS", probe_pos
                yield "POSITION_INFO", "STRAND", strand
    finally:
        # I opened this file, so close it even if parsing fails or
        # the caller abandons the generator.
        handle.close()