def is_format(locator_str, hrows=None, hcols=None):
    """Return True if locator_str names a file that matches this format.

    The file must exist, its first line must start with the columns
    listed in the module-level ROW_HEADERS, and util.num_headers must
    report at most 4 header columns.  If given, hrows must be 1 and
    hcols must be 4.
    """
    from genomicode import filelib
    import util

    # Reject header counts that this format cannot have.
    if hrows not in (None, 1) or hcols not in (None, 4):
        return False
    # This will only work if locator_str is a string.
    if not filelib.exists(locator_str):
        return False

    # Peek at up to 5 lines.  Once a short file is exhausted,
    # readline() yields empty strings; those are dropped below.
    handle = filelib.openfh(locator_str)
    peeked = []
    for _ in range(5):
        peeked.append(handle.readline())
    handle.close()   # need to close it properly, or gunzip might not die.
    rows = [text.rstrip("\r\n").split("\t") for text in peeked if text]

    # Need at least the header line.
    if not rows:
        return False
    first_row = rows[0]
    if first_row[:len(ROW_HEADERS)] != ROW_HEADERS:
        return False

    # Check if there's extraneous stuff.
    num_hrows, num_hcols = util.num_headers(rows)
    return not (num_hcols > 4)
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if locator_str looks like a "#1.2" (GCT-style) file.

    The file must exist, contain at least 3 lines, start with "#1.2"
    in the first column of the first line, and have at least 2
    tab-separated columns on the second line.  If given, hrows must be
    1 and hcols must be 2.  The NAME/DESCRIPTION labels on the third
    line are not checked.
    """
    from genomicode import filelib

    if not filelib.exists(locator_str):
        return False
    if hrows not in (None, 1) or hcols not in (None, 2):
        return False

    # Peek at up to 5 lines; exhausted reads give "" and are dropped.
    handle = filelib.openfh(locator_str)
    peeked = []
    for _ in range(5):
        peeked.append(handle.readline())
    handle.close()   # need to close it properly, or gunzip might not die.
    rows = [text.rstrip("\r\n").split("\t") for text in peeked if text]

    if len(rows) < 3:
        return False
    version_row, dims_row = rows[0], rows[1]
    # First line could be just one column, or could be many columns.
    # Second line must have at least 2 columns.
    return (len(version_row) >= 1 and
            len(dims_row) >= 2 and
            version_row[0] == "#1.2")
def parse_sam(file_or_handle):
    """Yield one SAMAlignment per line read from file_or_handle.

    Each line is split on tabs and must have at least 11 columns.  The
    flag, pos, mapq and tlen columns are converted to int; the others
    are passed through as strings.  Columns after the 11th are parsed
    as TAG:TYPE:VALUE and collected into a dict of
    tag -> (type, value); a repeated tag fails an assertion.
    """
    from genomicode import filelib

    # Somehow, csv raises errors on some BAM files read directly with
    # "samtools view" (via the subprocess module), so the columns are
    # split by hand instead.
    for line in filelib.openfh(file_or_handle):
        cols = line.rstrip("\r\n").split("\t")
        assert len(cols) >= 11, "Invalid line (%d):\n%s" % (len(cols), line)

        (qname, flag, rname, pos, mapq, cigar,
         rnext, pnext, tlen, seq, qual) = cols[:11]
        flag, pos, mapq, tlen = int(flag), int(pos), int(mapq), int(tlen)

        tags = {}
        for field in cols[11:]:
            # Split at most twice so the value may itself contain ":".
            parts = field.split(":", 2)
            assert len(parts) == 3, field
            tag, type_, value = parts
            assert tag not in tags
            tags[tag] = (type_, value)

        yield SAMAlignment(
            qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen,
            seq, qual, tags)
def count_reads(fastq_filename):
    """Return the number of reads in an uncompressed FASTQ file.

    The first record is sanity-checked (sequence and quality strings
    must have equal length, and the third line must be the "+"
    separator), then the lines are counted with "wc -l" and divided by
    4.

    Raises AssertionError if the first record does not look like FASTQ
    or the output of "wc -l" cannot be understood.
    """
    # Requires an uncompressed fastq file.
    from genomicode import filelib
    from genomicode import parallel

    # Make sure it's a fastq file.
    # @M03807:17:000000000-AHGYH:1:1101:20554:1508 1:N:0:16
    # CTTTACACCCAGTGGAGAAGCTCCCAACCAAGCTCTCTTGAGGATCTTGAAGGAAACTGA
    # +
    # <BCC@FAFEC8,C<8968<@EEEFFCCFEC@EDEFGGGGA,@,@EFGGF9,,88,@FFA<
    handle = filelib.openfh(fastq_filename)
    try:
        record = [handle.readline().strip() for i in range(4)]
    finally:
        handle.close()
    assert len(record[1]) == len(record[3]), \
        "Sequence and quality lengths differ: %s" % fastq_filename
    # The separator line may optionally repeat the read identifier
    # after the "+".
    assert record[2].startswith("+"), \
        "Missing '+' separator line: %s" % fastq_filename

    wc_out = parallel.sshell("wc -l %s" % parallel.quote(fastq_filename))
    # velocitron:biocore$ wc -l test01.txt
    #   22278 test01.txt
    #       0 test 1.txt
    fields = wc_out.strip().split()
    # The original message had no placeholder, so a failure here
    # raised TypeError instead of reporting the wc output.
    assert len(fields) >= 2, "Unknown format from wc -l\n%s" % wc_out
    num_lines = int(fields[0])
    # Four lines per read; floor division keeps the result an integer.
    return num_lines // 4
def merge_or_symlink_files(in_filenames, out_filename):
    """Concatenate in_filenames into out_filename, or symlink a lone file.

    If exactly one input is given and its extension is .fa or .fasta
    (case-insensitive), out_filename is created as a symlink to it
    rather than a copy.  Otherwise the inputs are opened with
    filelib.openfh and appended, in order, to a newly created
    out_filename.  An empty in_filenames produces an empty file.

    Raises AssertionError if out_filename already exists.
    """
    import os
    from genomicode import filelib

    CHUNK_SIZE = 1024 * 1024
    assert not os.path.exists(out_filename)

    # If only one file, and it has a plain FASTA extension, then just
    # symlink it.
    if len(in_filenames) == 1:
        in_filename = in_filenames[0]
        ext = os.path.splitext(in_filename)[1]
        if ext.lower() in [".fa", ".fasta"]:
            os.symlink(in_filename, out_filename)
            return

    # Open the output once and make sure every handle is closed, even
    # on error, so the data is flushed and no reader is left dangling.
    out_handle = open(out_filename, 'w')
    try:
        for in_filename in in_filenames:
            in_handle = filelib.openfh(in_filename)
            try:
                while True:
                    chunk = in_handle.read(CHUNK_SIZE)
                    if not chunk:
                        break
                    out_handle.write(chunk)
            finally:
                in_handle.close()
    finally:
        out_handle.close()
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if locator_str looks like a rectangular tab-delimited file.

    Up to 5 lines are read, split on tabs and passed through
    _clean_tdf.  The result must be non-empty, every row must be
    non-empty, and all rows must have the same number of columns.
    hrows and hcols are accepted but not used.
    """
    from genomicode import filelib

    if not filelib.exists(locator_str):
        return False

    # Read 5 lines.  If the file is small, fewer lines survive the
    # filter below because exhausted reads return "".
    handle = filelib.openfh(locator_str)
    peeked = []
    for _ in range(5):
        peeked.append(handle.readline())
    handle.close()   # need to close it properly, or gunzip might not die.
    rows = [text.rstrip("\r\n").split("\t") for text in peeked if text]
    rows = _clean_tdf(rows)

    # Make sure there's at least 1 line.
    if not rows:
        return False
    # All rows should contain at least one column.
    if not all(rows):
        return False
    # All rows should contain the same number of columns.
    width = len(rows[0])
    return all(len(row) == width for row in rows)
def resolve_symbol_or_file(name):
    """Return a list of symbols given either a symbol or a file of symbols.

    If name is not an existing path, it is treated as a single symbol
    and returned as [name].  Otherwise the file is read and each line,
    stripped of surrounding whitespace, becomes one entry.
    """
    # os is expected to be imported at module level.
    from genomicode import filelib

    if not os.path.exists(name):
        return [name]
    handle = filelib.openfh(name)
    try:
        return [line.strip() for line in handle]
    finally:
        # Close explicitly so the reader is not left dangling.
        handle.close()
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the first lines of locator_str look like a RES file.

    Requirements, checked on the first (up to 5) lines:
    - at least 3 lines are present;
    - line 3 has exactly 1 column;
    - line 1 has exactly one more column than line 2, and at least 2;
    - the first two columns of line 1 are ACCESSION and DESCRIPTION,
      in either order and any case.
    hrows and hcols are accepted but not used.
    """
    from genomicode import filelib

    if not filelib.exists(locator_str):
        return False

    # Read 5 lines and count the headers.
    handle = filelib.openfh(locator_str)
    peeked = []
    for _ in range(5):
        peeked.append(handle.readline())
    handle.close()   # need to close it properly, or gunzip might not die.
    rows = [text.rstrip("\r\n").split("\t") for text in peeked if text]

    if len(rows) < 3:
        return False
    names_row, descs_row, count_row = rows[0], rows[1], rows[2]

    # Line 3 should contain only 1 column.
    if len(count_row) != 1:
        return False
    # Line 1 contains 1 more column than line 2.
    if len(names_row) != len(descs_row) + 1:
        return False
    if len(names_row) < 2:
        return False

    leading = sorted(name.upper() for name in names_row[:2])
    return leading == sorted(["ACCESSION", "DESCRIPTION"])
def uncompress_file(in_filename, out_filename):
    """Copy the contents of in_filename, as read by openfh, to out_filename.

    The input is opened with filelib.openfh (presumably decompressing
    transparently -- see filelib) and copied in 16 MiB chunks to
    out_filename, which is created or truncated.
    """
    from genomicode import filelib

    CHUNK_SIZE = 16 * 1024 * 1024
    in_handle = filelib.openfh(in_filename)
    try:
        out_handle = open(out_filename, 'w')
        try:
            while True:
                chunk = in_handle.read(CHUNK_SIZE)
                if not chunk:
                    break
                out_handle.write(chunk)
        finally:
            # Closing flushes buffered output to disk.
            out_handle.close()
    finally:
        # Close explicitly so the reader is not left dangling.
        in_handle.close()
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the first line of locator_str contains a comma.

    Returns False if the file does not exist or its first line is
    empty.  hrows and hcols are accepted but not used.
    """
    from genomicode import filelib

    if not filelib.exists(locator_str):
        return False

    handle = filelib.openfh(locator_str)
    first_line = handle.readline()
    handle.close()   # need to close it properly, or gunzip might not die.

    # A blank file gives "", which contains no comma either.
    return "," in first_line
def read(handle, hrows=None, hcols=None, datatype=float):
    """Read a GCT ("#1.2") file and return the matrix object.

    The first line must be "#1.2" and the second must start with the
    number of genes and the number of samples.  The remainder is
    parsed by tab_delimited_format.read with 1 header row and 2 header
    columns, and the resulting dimensions must match the declared
    counts.  Synonyms are registered so that "NAME" and "DESCRIPTION"
    resolve to the first two row headers, and const.ROW_ID resolves to
    the first.

    hrows, if given, must be 1; hcols, if given, must be 2.
    """
    import const
    import tab_delimited_format
    from genomicode import filelib

    assert hrows is None or hrows == 1
    assert hcols is None or hcols == 2

    handle = filelib.openfh(handle)
    assert handle.readline().strip() == "#1.2"
    dims = handle.readline().rstrip("\r\n").split("\t")
    assert len(dims) >= 2
    num_genes, num_samples = map(int, dims[:2])

    X = tab_delimited_format.read(
        handle, hrows=1, hcols=2, datatype=datatype)
    assert X.dim() == (num_genes, num_samples), (
        "Matrix size mismatch.\n"
        "The GCT headers indicate a matrix with %d rows and %d columns.\n"
        "However, I found %d rows and %d columns." %
        (num_genes, num_samples, X.nrow(), X.ncol()))

    # The first two row headers play the roles of NAME and
    # DESCRIPTION; add a synonym wherever the file used another label.
    name_header, desc_header = X.row_names()[:2]
    synonyms = {}
    for canonical, actual in (
            ("NAME", name_header), ("DESCRIPTION", desc_header)):
        if actual != canonical:
            synonyms[canonical] = actual
    synonyms[const.ROW_ID] = name_header
    X._synonyms.update(synonyms)
    assert is_matrix(X)

    # The GCT File Format description at the Broad Institute does not
    # require the NAMEs to be unique, so no uniqueness check is made.
    return X
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if locator_str has the GID/AID style header layout.

    Up to 20 lines are read.  All rows must have the same number of
    columns, there must be 1-4 header rows and 1-5 header columns
    (taken from hrows/hcols when given, otherwise from
    util.num_headers), and the header cells that fall inside that
    region must carry the expected labels (GID, NAME, GWEIGHT, GORDER,
    AID, EWEIGHT, EORDER).
    NOTE(review): these labels look like Cluster's CDT format -- confirm.
    """
    from genomicode import filelib
    import util

    if not filelib.exists(locator_str):
        return False

    # Sometimes 5 lines are not enough: a matrix with 13 lines of
    # header has been seen, so look at up to 20.
    handle = filelib.openfh(locator_str)
    peeked = []
    for _ in range(20):
        peeked.append(handle.readline())
    handle.close()   # need to close it properly, or gunzip might not die.
    rows = [text.rstrip("\r\n").split("\t") for text in peeked if text]

    # Make sure there's at least 1 line.
    if not rows:
        return False
    # All rows should contain the same number of columns.
    width = len(rows[0])
    if any(len(row) != width for row in rows):
        return False

    detected_rows, detected_cols = util.num_headers(rows)
    nrow = hrows or detected_rows
    ncol = hcols or detected_cols
    if not (1 <= nrow <= 4):
        return False
    if not (1 <= ncol <= 5):
        return False

    # (row, column, expected label) for each recognised header cell.
    expected_labels = (
        (0, 0, "GID"),
        (0, 2, "NAME"),
        (0, 3, "GWEIGHT"),
        (0, 4, "GORDER"),
        (1, 0, "AID"),
        (2, 0, "EWEIGHT"),
        (3, 0, "EORDER"),
    )
    for row, col, label in expected_labels:
        # Only cells inside the header region are checked.
        if row >= nrow or col >= ncol:
            continue
        if rows[row][col].strip().upper() != label:
            return False
    return True
def read(handle, hrows=None, hcols=None, datatype=float):
    """Read a comma-separated file by converting it to tab-delimited text.

    The input is parsed with the csv module, re-joined with tabs into
    an in-memory buffer, and handed to tab_delimited_format.read with
    the same hrows, hcols and datatype.  Returns whatever that
    function returns.
    """
    import csv
    from StringIO import StringIO
    from genomicode import filelib
    import tab_delimited_format

    # Convert this to tab-delimited format and let the other module
    # deal with it.
    buf = StringIO()
    for fields in csv.reader(filelib.openfh(handle)):
        buf.write("\t".join(fields))
        buf.write("\n")
    buf.seek(0)
    return tab_delimited_format.read(
        buf, hrows=hrows, hcols=hcols, datatype=datatype)
def read(handle, hrows=None, hcols=None, datatype=float):
    """Read a matrix, fixing the header counts before delegating.

    The whole input is read into memory.  When hcols is None it is set
    to 1 plus the number of cells in columns 2-4 of the first line
    whose upper-cased text is in ROW_HEADERS.  When hrows is None it
    is set to 1 plus the number of lines 2-3 whose first cell is in
    COL_HEADERS.  The text is then parsed by
    tab_delimited_format.read, and the result must satisfy is_matrix.
    """
    import StringIO
    import tab_delimited_format
    from genomicode import filelib

    # Figure out the number of headers for tab_delimited_format.  If
    # sample names are numbers, then tab_delimited_format might
    # mistake the first row(s) for non-headers.
    contents = filelib.openfh(handle).read()

    # Look at the first 5 lines.  A small file gives fewer, because
    # exhausted reads return "" and are dropped.
    peek = StringIO.StringIO(contents)
    peeked = []
    for _ in range(5):
        peeked.append(peek.readline())
    rows = [text.rstrip("\r\n").split("\t") for text in peeked if text]
    assert len(rows) >= 1
    header = rows[0]
    assert len(header) >= 2

    if hcols is None:
        # Column 1 is always a header; columns 2-4 count if labelled.
        hcols = 1 + sum(
            1 for col in (1, 2, 3)
            if len(header) > col and
            header[col].strip().upper() in ROW_HEADERS)
    if hrows is None:
        # Row 1 is always a header; rows 2-3 count if labelled.
        hrows = 1 + sum(
            1 for row in (1, 2)
            if len(rows) > row and
            rows[row][0].strip().upper() in COL_HEADERS)

    X = tab_delimited_format.read(
        StringIO.StringIO(contents), hrows=hrows, hcols=hcols,
        datatype=datatype)
    assert is_matrix(X)
    return X
def read_normal_cancer_file(file_or_handle):
    """Return a list of (normal_sample, tumor_sample) pairs.

    The input is read with filelib.read_row using its first line as
    the header, and must have "Normal" and "Cancer" columns.  Both
    values are stripped of whitespace, and the two must differ on
    every row.
    """
    import os
    from genomicode import filelib

    # A plain string is treated as a path and must exist.
    if type(file_or_handle) is str:
        assert os.path.exists(file_or_handle)
    handle = filelib.openfh(file_or_handle)

    pairs = []
    for d in filelib.read_row(handle, header=1, pad_cols=""):
        assert hasattr(d, "Normal"), "Missing header: Normal"
        assert hasattr(d, "Cancer"), "Missing header: Cancer"
        normal = d.Normal.strip()
        tumor = d.Cancer.strip()
        assert normal != tumor
        pairs.append((normal, tumor))
    return pairs
def _read_coverage_file(filename):
    """Return a dict of key -> value (as string) from a two-row table.

    Lines starting with "#" and blank lines are skipped.  Exactly two
    tab-delimited lines must remain: the first holds the keys and the
    second the values.  Both are stripped of whitespace, and keys must
    be unique.
    """
    from genomicode import filelib

    rows = [
        line.rstrip("\r\n").split("\t")
        for line in filelib.openfh(filename)
        if not line.startswith("#") and line.strip()]
    assert len(rows) == 2
    keys, values = rows
    assert len(keys) == len(values)

    data = {}
    for raw_key, raw_value in zip(keys, values):
        key = raw_key.strip()
        assert key not in data
        data[key] = raw_value.strip()
    return data
def merge_parsed_files(parsed_files, outfile):
    """Merge several parsed files into outfile, dropping repeated lines.

    Every file in parsed_files must begin with the same header row (as
    returned by filelib.read_cols).  The files are then written to
    outfile in order, skipping any line that was already written; this
    also collapses the repeated header lines into one.

    Raises AssertionError if parsed_files is empty or the headers
    differ.
    """
    from genomicode import filelib

    # First, make sure each of the parsed files has the same header.
    assert parsed_files
    header = None
    for f in parsed_files:
        cols = next(filelib.read_cols(f))
        if not header:
            header = cols
        assert header == cols, "Mismatched headers"
    assert header

    seen = set()
    out_handle = open(outfile, 'w')
    try:
        for f in parsed_files:
            in_handle = filelib.openfh(f)
            try:
                for line in in_handle:
                    # A final line without a newline would otherwise
                    # be glued to the next file's first line, and
                    # would not match its terminated duplicates.
                    if not line.endswith("\n"):
                        line = line + "\n"
                    if line in seen:
                        continue
                    seen.add(line)
                    out_handle.write(line)
            finally:
                in_handle.close()
    finally:
        # Closing flushes the merged output to disk.
        out_handle.close()
def parse_trimmomatic_output(filename):
    """Parse the read counts out of a Trimmomatic log.

    Returns a dictionary with keys:
      reads_processed
      dropped_reads

    For paired ends, this refers to pairs of reads.  dropped_reads
    indicates how many pairs were dropped, because one or both were
    dropped (input pairs minus pairs where both survived).

    Raises AssertionError if no summary line could be parsed or a
    summary line has an unexpected number of fields.
    """
    from genomicode import filelib

    # Single end reads:
    # Input Reads: 1764254 Surviving: 1764160 (99.99%) Dropped: 94 (0.01%)
    #
    # Paired end reads:
    # Input Read Pairs: 60032406 Both Surviving: 59093198 (98.44%)
    # Forward Only Surviving: 891164 (1.48%) Reverse Only Surviving:
    # 20511 (0.03%) Dropped: 27533 (0.05%)
    results = {}
    handle = filelib.openfh(filename)
    try:
        for line in handle:
            if not line.startswith("Input Read"):
                continue
            cols = line.strip().split()
            # Compare case-insensitively: the log prints "Input
            # Reads:", which the original lowercase test never
            # matched.
            lowered = line.lower()
            if lowered.startswith("input reads:"):  # Single end.
                assert len(cols) == 9, "Parse error: %s" % line
                reads = int(cols[2])
                dropped = int(cols[7])
            elif lowered.startswith("input read pairs:"):  # Paired end.
                assert len(cols) == 21, "Parse error: %s" % line
                reads = int(cols[3])
                dropped = reads - int(cols[6])
            else:
                # Not a summary line this parser understands.
                continue
            assert dropped < reads
            results["reads_processed"] = reads
            results["dropped_reads"] = dropped
    finally:
        handle.close()
    assert results, "Parse error: %s" % filename
    return results
def extract_sample2desc(filename):
    """Return (title_dict, description_dict) keyed by sample ID.

    The file is scanned line by line.  A "^SAMPLE" line starts a
    sample, whose ID is the third whitespace-separated token.  Within
    a sample, the text after the second token of a "!Sample_title" or
    "!Sample_description" line is stored under that ID.  A
    "!sample_table_end" line ends the sample.

    Raises AssertionError if samples are nested or a title or
    description appears outside a sample.
    """
    from genomicode.filelib import openfh

    title_dict = {}
    description_dict = {}
    # Line prefix -> dict that receives the value.
    targets = (
        ("!Sample_description", description_dict),
        ("!Sample_title", title_dict),
    )

    current_id = None
    for line in openfh(filename):
        if line.startswith("^SAMPLE"):
            assert current_id is None, "problem with %s" % filename
            current_id = line.strip().split()[2]
            continue
        if line.startswith("!sample_table_end"):
            current_id = None
            continue
        for prefix, target in targets:
            if line.startswith(prefix):
                assert current_id is not None, \
                    "problem with %s" % filename
                # Split at most twice so the value keeps its spaces.
                target[current_id] = line.strip().split(None, 2)[2]
                break
    return title_dict, description_dict
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if locator_str looks like a PCL file.

    Up to 100 lines are read.  All rows must have the same number of
    columns, there must be 1-4 header columns and at most 3 header
    rows (taken from hrows/hcols when given, otherwise from
    util.num_headers), and the header cells that fall inside that
    region must carry the expected labels (NAME, GWEIGHT, GORDER,
    EWEIGHT, EORDER).
    """
    from genomicode import filelib
    import util

    if not filelib.exists(locator_str):
        return False

    # Previously only 5 lines were read, and that caused problems: in
    # one matrix an annotation column had spaces in the first 5 lines,
    # so it was mistaken for part of the matrix rather than the
    # annotations.  U133Av2 has 62 AFFX genes that may or may not have
    # annotations, so look at 100 lines.
    NUM_LINES = 100
    handle = filelib.openfh(locator_str)
    peeked = []
    for _ in range(NUM_LINES):
        peeked.append(handle.readline())
    handle.close()   # need to close it properly, or gunzip might not die.
    rows = [text.rstrip("\r\n").split("\t") for text in peeked if text]

    # Has to have at least a header line.
    if not rows:
        return False
    # All rows should contain the same number of columns.
    width = len(rows[0])
    if any(len(row) != width for row in rows):
        return False

    detected_rows, detected_cols = util.num_headers(rows)
    nrow = hrows or detected_rows
    ncol = hcols or detected_cols

    # PCL requires at least the gene IDs.
    if ncol == 0:
        return False
    # Treat the first line as a header row even if none was detected.
    nrow = max(nrow, 1)
    if not (1 <= nrow <= 3):
        return False
    # PCL format has at most 4 header columns.
    if ncol > 4:
        return False

    # (row, column, expected label) for each recognised header cell.
    expected_labels = (
        (0, 1, "NAME"),
        (0, 2, "GWEIGHT"),
        (0, 3, "GORDER"),
        (1, 0, "EWEIGHT"),
        (2, 0, "EORDER"),
    )
    for row, col, label in expected_labels:
        # Only cells inside the header region are checked.
        if row >= nrow or col >= ncol:
            continue
        if rows[row][col].strip().upper() != label:
            return False
    return True
def read(handle, hrows=None, hcols=None, datatype=float):
    """Parse a tab-delimited gene x experiment table.

    handle may be a filename or an open handle.  hrows and hcols give
    the number of header rows and columns; either may be None, in
    which case it is guessed with util.num_headers.  datatype selects
    the conversion applied to the values: None for no conversion, int
    or float for jmath.safe_int / jmath.safe_float, or any callable.

    The visible code collects row annotations (row_names, row_order),
    column annotations (col_names, col_order) and the converted value
    matrix.  For an input with no rows it returns an empty
    Matrix.InMemoryMatrix.

    NOTE(review): the body shown here ends after the conversion step
    without building or returning a matrix for non-empty input; the
    rest of the function may be missing -- confirm against the full
    file.
    """
    import math
    from genomicode import filelib
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import iolib
    import util
    import const

    # Format:
    # - gene x experiment
    # - optional header row
    # - optional rows of sample annotations (requires header row)
    # - optional columns of gene annotations

    # Remember the filename, if one was given, for error messages.
    filename = None
    if type(handle) is type(""):
        filename = handle
    handle = filelib.openfh(handle)
    data = filelib.read_all_cols(handle)
    #data = [x for x in filelib.read_cols(handle)]
    #x = handle.read()
    #data = iolib.split_tdf(x, strip=True)
    #handle = filelib.read_cols(handle)
    #data = [handle.next() for i in range(100)]
    data = _clean_tdf(data)

    # Every line must have as many columns as the first one.
    # NOTE(review): data[0] is indexed before the "if not data" check
    # below, so empty data would raise IndexError here unless
    # _clean_tdf guarantees at least one row -- confirm.
    num_cols = len(data[0])
    for i, x in enumerate(data):
        nc = len(data[i])
        f = ""
        if filename:
            f = " [%s]" % filename
        error_msg = "Header%s has %d columns but line %d has %d." % (
            f, num_cols, i + 1, nc)
        assert nc == num_cols, error_msg
    if not data:
        return Matrix.InMemoryMatrix([])

    # If the rows and cols not explicitly specified, then try to guess
    # them from the file.
    #print "HEADERS 1", hrows, hcols
    if hrows is None or hcols is None:
        hr, hc = util.num_headers(data)
        if hrows is None:
            hrows = hr
        if hcols is None:
            hcols = hc
    #print "HEADERS 2", hrows, hcols
    #num_genes, num_arrays = num_rows-hrows, num_cols-hcols

    # Pull out the row names from the columns.
    row_names = {}  # header -> list of names (1 for each gene)
    row_order = []  # in-order list of the headers
    if hcols:
        if hrows:
            # If a header row is provided, then the names of these
            # annotations are provided in the header.
            row_order = data[0][:hcols]
        else:
            # No header row.  Make default name for these annotations.
            ndigits = int(math.ceil(math.log(hcols, 10)))
            row_order = ["ANNOT%*d" % (ndigits, i + 1) for i in range(hcols)]
        # Strip extraneous whitespace from the header names.
        # Not necessary.  Handled now in split_tdf.
        #row_order = [x.strip() for x in row_order]

        # Sometimes the format detection can go wrong and a GCT file
        # will slip through to here.  If this occurs, a "duplicate
        # header" exception will be generated.  Check for this and
        # generate a more meaningful error message.
        if (row_order[0] == "#1.2" and len(row_order) > 1 and
            row_order[1] == "" and row_order[-1] == ""):
            raise AssertionError("ERROR: It looks like a GCT file was missed.")
        for i, header in enumerate(row_order):
            names = [x[i] for x in data[hrows:]]
            assert header not in row_names, "duplicate header: %s" % header
            row_names[header] = names

    # Pull out the column names.  The first header row holds the
    # sample names (handled below); each later header row is one
    # column annotation, named by the cell in its first column.
    col_names = {}  # header -> list of names (1 for each array)
    col_order = []
    if hrows:
        for i in range(1, hrows):
            header = data[i][0]
            names = data[i][hcols:]
            assert header not in col_names, "duplicate name: %s" % header
            # Strip extraneous whitespace from the header names.
            # Not necessary.  Handled now in split_tdf.
            #header = header.strip()
            col_order.append(header)
            col_names[header] = names

    # Now extract the expression values.
    matrix = data
    if hrows or hcols:
        matrix = [x[hcols:] for x in matrix[hrows:]]

    # Pull out the sample names.
    sample_names = None
    if hrows:
        # If a header is provided, then use these as the column names.
        sample_names = data[0][hcols:]
    if sample_names:
        col_names[SAMPLE_NAME] = sample_names
        col_order.insert(0, SAMPLE_NAME)

    if datatype is None:
        convert_fn = None   # no conversion
    elif datatype is int:
        convert_fn = jmath.safe_int
    elif datatype is float:
        convert_fn = jmath.safe_float
    else:
        # Assume that I was passed a function.
        convert_fn = datatype

    if convert_fn == jmath.safe_float:
        # Try and convert to an integer instead.  This is done only
        # if every value passes jmath.is_int.
        is_int = True
        for i in range(len(matrix)):
            for j in range(len(matrix[i])):
                if not jmath.is_int(matrix[i][j]):
                    is_int = False
                    break
            if not is_int:
                break
        if is_int:
            convert_fn = jmath.safe_int

    if convert_fn:
        check_each_row = False
        try:
            matrix = [map(convert_fn, x) for x in matrix]
        except ValueError, err1:
            # Only the known float-conversion messages trigger the
            # row-by-row diagnosis; anything else is re-raised as is.
            if str(err1) == "empty string for float()":
                check_each_row = True
            elif str(err1).startswith("invalid literal for float()"):
                check_each_row = True
            elif str(err1).startswith("could not convert string to float"):
                check_each_row = True
            else:
                raise
        if check_each_row:
            # If there was an exception, then check each row carefully
            # to try to pinpoint the problem.
            for i, x in enumerate(matrix):
                try:
                    map(convert_fn, x)
                except ValueError, err2:
                    row = data[hrows + i]
                    raise ValueError("%s\nProblem with row %d: %s" % (
                        str(err2), i + 1, row))
            # The bulk conversion failed but no single row did.
            raise AssertionError("Error converting values.")
def extract_signal(filename, outhandle):
    """Write the merged signal table of a series matrix file to outhandle.

    The lines between each "!series_matrix_table_begin" and
    "!series_matrix_table_end" are collected, with quotes removed via
    remove_quotes.  They are split into separate tables at every line
    starting with "ID_REF", and the tables are then merged side by
    side on the ID in the first column.  IDs are sorted, with the
    "ID_REF" header row first, and tables lacking an ID contribute
    blank values.  The merged table is written to outhandle as
    tab-delimited text.

    Intermediate data is kept in temporary files in the current
    directory, which are removed before returning.

    Raises AssertionError if no table is found, if the merged rows are
    not aligned, or if there is no data.
    """
    import os
    import tempfile
    from genomicode import filelib

    # Write stuff to file to handle large data sets.
    tmpfile1 = tmpfile2 = tmpfile3 = None
    num_tables = 0
    handle = None
    try:
        # tmpfile1        Raw signal data from series matrix file.
        # tmpfile2.<num>  Raw data split into separate tables.
        # tmpfile3        Final merged signal table.
        x, tmpfile1 = tempfile.mkstemp(dir=".")
        os.close(x)
        x, tmpfile2 = tempfile.mkstemp(dir=".")
        os.close(x)
        x, tmpfile3 = tempfile.mkstemp(dir=".")
        os.close(x)

        # Get a list of all lines in the series matrix tables.
        handle = open(tmpfile1, 'w')
        in_matrix_table = 0
        for cols in filelib.read_cols(filename):
            # Some files can have blank lines.
            if not cols:
                continue
            if cols[0] == "!series_matrix_table_begin":
                in_matrix_table = 1
            elif cols[0] == "!series_matrix_table_end":
                in_matrix_table = 0
            elif in_matrix_table:
                cols = [remove_quotes(x).strip() for x in cols]
                handle.write("\t".join(cols) + "\n")
        handle.close()
        handle = None

        # Split the data into separate tables.
        raw_handle = filelib.openfh(tmpfile1)
        try:
            for line in raw_handle:
                if line.startswith("ID_REF"):
                    # Finish the previous table before starting the
                    # next one.
                    if handle is not None:
                        handle.close()
                    handle = open("%s.%d" % (tmpfile2, num_tables), 'w')
                    num_tables += 1
                assert handle
                handle.write(line)
        finally:
            raw_handle.close()
        if handle is not None:
            handle.close()
            handle = None
        assert num_tables

        # Sometimes the tables will not be aligned.
        # E.g. GSE9899-GPL570 contains two tables, and the 2nd is
        # missing some probe sets.  Get a list of the probe sets in
        # the tables.
        files = ["%s.%d" % (tmpfile2, i) for i in range(num_tables)]
        matrices = [FileMatrix(x) for x in files]
        id2indexes = []
        for matrix in matrices:
            id2index = {}
            for i, row in enumerate(matrix):
                id2index[row[0]] = i
            id2indexes.append(id2index)

        # Make a list of all the IDs, header row first.
        all_ids = set()
        for id2index in id2indexes:
            all_ids.update(id2index)
        all_ids.discard("ID_REF")
        all_ids = ["ID_REF"] + sorted(all_ids)

        # Merge all the pieces together into one big table.
        handle = open(tmpfile3, 'w')
        for id_ in all_ids:
            cols = []
            for table_num, (matrix, id2index) in enumerate(
                    zip(matrices, id2indexes)):
                if id_ in id2index:
                    x = matrix[id2index[id_]]
                else:
                    # If this ID is missing, then just insert blank
                    # values, but keep the ID itself so the row is
                    # still labelled when the first table lacks it.
                    x = [id_] + [""] * (len(matrix[0]) - 1)
                if table_num > 0:
                    # If this is not the first matrix, then delete the
                    # row names.
                    x = x[1:]
                cols.extend(x)
            handle.write("\t".join(cols) + "\n")
        handle.close()
        handle = None

        num_rows = len(all_ids)
        num_cols = len(next(filelib.read_cols(tmpfile3)))

        # Make sure the merged table is rectangular.
        for i, cols in enumerate(filelib.read_cols(tmpfile3)):
            assert len(cols) == num_cols, "line %d unaligned [%d:%d]" % (
                i, len(cols), num_cols)

        # Filtering of samples and genes with missing values is
        # currently disabled, so every row and column is kept.  Sets
        # keep the membership tests below constant-time.
        good_cols = set(range(num_cols))
        good_rows = set(range(num_rows))
        assert len(good_cols) > 1, "no data"
        assert len(good_rows) > 1, "no data"

        # Write out the data.
        for i, cols in enumerate(filelib.read_cols(tmpfile3)):
            if i not in good_rows:
                continue
            kept = [v for (j, v) in enumerate(cols) if j in good_cols]
            outhandle.write("\t".join(kept) + "\n")
    finally:
        if handle is not None:
            handle.close()
        # Remove the temporary files.  This is best effort: a failure
        # to delete one must not hide an error raised above.
        leftovers = [tmpfile1, tmpfile2, tmpfile3]
        if tmpfile2:
            leftovers.extend(
                "%s.%d" % (tmpfile2, i) for i in range(num_tables))
        for path in leftovers:
            if not path or not os.path.exists(path):
                continue
            try:
                os.unlink(path)
            except OSError:
                pass
def read(handle, hrows=None, hcols=None, datatype=float):
    """Read a RES file and return it as a Matrix.InMemoryMatrix.

    Layout expected by the checks below: line 1 holds the Accession
    and Description headers followed by sample names in every other
    column; line 2 holds the sample descriptions in every other
    column; line 3 holds the number of genes; the remaining lines hold
    one gene each, as accession, description, then alternating value
    and call (A, P or M) per sample.

    datatype selects the conversion applied to the values: None for
    no conversion, int or float for jmath.safe_int /
    jmath.safe_float, or any callable.  hrows and hcols are accepted
    but not used.

    The calls are stored as the row annotation "CALL" (one string per
    gene with one character per sample), and any "scale factor" found
    in a sample description is moved into the column annotation
    "SCALE_FACTOR".

    Raises AssertionError if the file does not follow this layout.
    """
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import Matrix
    import tab_delimited_format as tdf
    import const

    handle = filelib.openfh(handle)
    # Can't use iolib.split_tdf here because it does not handle empty
    # lines properly (which can occur if there is a file with no
    # samples).
    #data = iolib.split_tdf(handle.read())
    data = [x.rstrip("\r\n").split("\t") for x in handle]
    assert len(data) >= 3, "Invalid RES file."

    # Do some checking on the format.
    assert len(data[0]) == len(data[1]) + 1
    x = sorted([x.upper() for x in data[0][:2]])
    assert x == ["ACCESSION", "DESCRIPTION"]
    assert len(data[2]) == 1, "%d: %s" % (len(data[2]), repr(data[2]))

    # Parse out the number of genes and delete the row.
    num_genes = int(data[2][0])
    del data[2]
    assert len(data) == num_genes + 2  # data + 2 headers

    # GenePattern creates files where the last column is all blank.
    # If this is the case, then delete it.
    #blank_last_col = True
    x = [x[-1] for x in data if x[-1]]
    if not x:
        # Last column is all blank so delete it.
        data = [x[:-1] for x in data]

    # Parse the names of the samples.  After the two annotation
    # columns, names sit at even offsets and odd offsets are blank.
    sample_names = []
    for i, x in enumerate(data[0][2:]):
        if i % 2:
            assert not x
        else:
            assert x
            sample_names.append(x)

    # Parse out the sample_description.  Here the even offsets are
    # blank and the descriptions sit at odd offsets.
    sample_description = []
    for i, x in enumerate(data[1]):
        if i % 2 == 0:
            assert not x
        else:
            assert x
            sample_description.append(x)
    assert len(sample_description) == len(sample_names)

    # Pull the scale factors out of the sample_description.
    # Some of the descriptions can be missing scale factors.
    scale_factors = [""] * len(sample_description)
    for i in range(len(sample_description)):
        x = sample_description[i]
        sf = "scale factor"
        j = x.lower().find(sf)
        if j < 0:
            continue
        # The text must read "<description>/scale factor=<number>".
        assert x[j - 1] == "/"
        assert x[j + len(sf)] == "="
        scale_factors[i] = float(sample_description[i][j + len(sf) + 1:])
        sample_description[i] = sample_description[i][:j - 1]

    # Parse out the description and accession columns.
    accession_header = data[0][0]
    description_header = data[0][1]
    accession = [x[0] for x in data[2:]]
    description = [x[1] for x in data[2:]]
    x = [x.upper() for x in data[0][:2]]
    if x == ["DESCRIPTION", "ACCESSION"]:
        # The two columns are in the other order, so swap them.
        accession_header, description_header = \
            description_header, accession_header
        accession, description = description, accession
    assert (accession_header.upper(), description_header.upper()) == \
        ("ACCESSION", "DESCRIPTION")

    # Accession should be unique.
    x = {}.fromkeys(accession).keys()
    assert len(x) == len(accession)

    # Parse out the matrix and calls.  In each gene row the values
    # sit at even offsets and the calls at odd offsets.
    matrix = []
    calls = []
    for row in data[2:]:
        row = row[2:]
        x0 = [x for (i, x) in enumerate(row) if i % 2 == 0]
        x1 = [x for (i, x) in enumerate(row) if i % 2 == 1]
        assert len(x0) == len(x1)
        for x in x1:
            assert x.upper() in ["A", "P", "M"], x
        matrix.append(x0)
        calls.append(x1)
    assert len(matrix) == num_genes

    # Should have some way of specifying no conversion.
    if datatype is None:
        convert_fn = None   # default
    elif datatype is int:
        convert_fn = jmath.safe_int
    elif datatype is float:
        convert_fn = jmath.safe_float
    else:
        convert_fn = datatype

    if convert_fn:
        matrix = [map(convert_fn, x) for x in matrix]

    row_names = {}
    col_names = {}
    # row_order keeps the two headers in the order found in the file.
    row_order = data[0][:2] + ["CALL"]
    col_order = [tdf.SAMPLE_NAME, "DESCRIPTION", "SCALE_FACTOR"]

    row_names[accession_header] = accession
    row_names[description_header] = description
    # Store the calls as row annotations.  The gene annotation "CALL"
    # is a string of A, P, or M, with one call per sample.
    row_names["CALL"] = ["".join(x) for x in calls]

    col_names[tdf.SAMPLE_NAME] = sample_names
    col_names["DESCRIPTION"] = sample_description
    col_names["SCALE_FACTOR"] = scale_factors

    synonyms = {}
    synonyms[const.COL_ID] = tdf.SAMPLE_NAME
    synonyms[const.ROW_ID] = accession_header

    X = Matrix.InMemoryMatrix(
        matrix, row_names=row_names, col_names=col_names,
        row_order=row_order, col_order=col_order, synonyms=synonyms)
    #X = Matrix.add_synonyms(X, synonyms)
    #is_matrix(X); print DIAGNOSIS
    assert is_matrix(X)
    return X
def read_sample_group_file(file_or_handle):
    """Return a list of (filename, sample, pair) from a sample group file.

    pair is "", "1", or "2".  filename is a relative path.

    Reads can be split across multiple files (e.g. for multiple
    lanes), or across pairs.
    Headers:
    Filename  Sample  Pair
    F1        A       1
    F3        A       2
    F2        A       1
    F4        A       2
    F5        B       1
    F6        B       2

    - Filenames should be unique.
    - Filename should be relative.  No full path information.
    - Pair should be 1 or 2.  If single end reads, just leave blank.
    - There can be many Filenames per Sample.  There can be many
      Pairs per Sample (if the reads for one pair are split).
    - The pairs that match (1 to its 2 partner) should be next to
      each other in the file.

    If every Pair in the file is "1", they are all returned as "".

    Raises AssertionError if any of the rules above is violated.
    """
    import os
    from genomicode import filelib

    # A plain string is treated as a path and must exist.
    handle = file_or_handle
    if type(handle) is type(""):
        assert os.path.exists(file_or_handle)
    handle = filelib.openfh(handle)

    data = []
    for d in filelib.read_row(handle, header=1, pad_cols=""):
        assert hasattr(d, "Pair"), "Missing column: Pair"
        pair = d.Pair.strip()
        assert pair in ["", "1", "2"], "Invalid pair: %s" % d.Pair
        data.append((d.Filename, d.Sample, pair))

    # Make sure filenames are unique and carry no directory part.
    seen = set()
    for filename, sample, pair in data:
        dirname = os.path.split(filename)[0]
        assert not dirname, "Filename should not contain a path: %s" % \
            filename
        assert filename not in seen, "Filename is not unique: %s" % filename
        seen.add(filename)

    # If all the Pairs are "1", then make them all blank.
    if set(pair for (filename, sample, pair) in data) == set(["1"]):
        data = [(filename, sample, "") for (filename, sample, pair) in data]

    # Collect the pairs of each sample, in file order.
    sample2pairs = {}
    for filename, sample, pair in data:
        sample2pairs.setdefault(sample, []).append(pair)
    all_samples = sorted(sample2pairs)

    # For each sample, make sure there isn't a mix of paired and
    # single ended files.  It must be all single ended or all paired.
    for sample in all_samples:
        unique = sorted(set(sample2pairs[sample]))
        if unique == [""] or unique == ["1"]:  # All single
            continue
        elif unique == ["1", "2"]:  # All paired
            continue
        raise AssertionError(
            "Weird pairing [%s]: %s" % (repr(unique), sample))

    # Make sure each pair is next to each other.
    for sample in all_samples:
        pairs = sample2pairs[sample]
        # Should be all "", or a pattern of "1", "2".
        unique = sorted(set(pairs))
        if unique == [""] or unique == ["1"]:  # all ""
            continue
        # Check the pairs in file order.  The original checked the
        # sorted unique values instead, which could never fail.
        assert len(pairs) % 2 == 0, "Weird pairing: %s" % sample
        for i in range(0, len(pairs), 2):
            assert pairs[i] == "1", "Weird pairing: %s" % sample
            assert pairs[i + 1] == "2", "Weird pairing: %s" % sample
    return data
def _diagnose_format_problem(filename):
    """Try to diagnose problems with the format of a matrix file.

    Looks at the first 5 lines of filename and returns a description
    of the first problem found, or None if nothing looks wrong.

    Raises AssertionError if filename does not exist.
    """
    # os is expected to be imported at module level.
    import csv
    from genomicode import filelib

    assert os.path.exists(filename)

    # Read the first 5 lines of the file.  Can usually diagnose from
    # that.
    NUM_LINES = 5
    handle = filelib.openfh(filename)
    try:
        lines = [handle.readline() for i in range(NUM_LINES)]
    finally:
        handle.close()
    # readline() returns "" (never None) once the file is exhausted,
    # so drop empty strings.  Otherwise a short file would look as if
    # it had lines without any delimiter.
    lines = [x for x in lines if x]
    if not lines:
        return "The file is empty."

    # Figure out whether this is a tab-delimited or comma-delimited
    # file.  Every line must contain the delimiter.
    # NOTE(review): the original comment said the first line should
    # not be checked for tabs, because GCT format might not have a tab
    # in the first line, but the code has always checked every line.
    # That behavior is kept; confirm which one is intended.
    is_tdf = all("\t" in line for line in lines)
    is_csv = (not is_tdf) and all("," in line for line in lines)
    assert not (is_tdf and is_csv)

    # Problem: Not tab-delimited and not comma-delimited format.
    if not is_tdf and not is_csv:
        return "File does not appear to be delimited by tabs or commas."

    # Split the lines into columns based on the delimiter.
    if is_tdf:
        cols = [line.rstrip("\r\n").split("\t") for line in lines]
    else:
        # csv.reader accepts any iterable of lines.
        cols = [x for x in csv.reader(lines)]

    # Problem: Sometimes people provide a file where the first row
    # contains one fewer column than the remaining rows.  For example,
    # if they used R to create the file.
    num_cols = [len(x) for x in cols]
    if len(num_cols) > 1 and num_cols[0] == num_cols[1] - 1:
        return "First row has 1 fewer column than second row."
    if min(num_cols) != max(num_cols):
        return "Rows have different numbers of columns."
    return None