Beispiel #1
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str matches this format.

    A file matches when it exists, has at least one line, its first
    line begins with the columns listed in the module-level ROW_HEADERS,
    and util.num_headers reports no more than 4 header columns.

    hrows and hcols, if given, must be 1 and 4 respectively; any other
    value means the caller expects a different layout, so the answer is
    False.
    """
    from genomicode import filelib
    import util

    if hrows not in (None, 1):
        return False
    if hcols not in (None, 4):
        return False

    # filelib.exists only works if locator_str is a string.
    if not filelib.exists(locator_str):
        return False

    # Peek at the first 5 lines.  A short file yields empty strings
    # once the end is reached; those are dropped below.
    handle = filelib.openfh(locator_str)
    head = []
    for _ in range(5):
        head.append(handle.readline())
    handle.close()   # need to close it properly, or gunzip might not die.
    matrix = [text.rstrip("\r\n").split("\t") for text in head if text]

    # An empty file cannot match.
    if not matrix:
        return False

    first_row = matrix[0]
    if first_row[:len(ROW_HEADERS)] != ROW_HEADERS:
        return False

    # Anything beyond 4 header columns is extraneous.
    _num_header_rows, num_header_cols = util.num_headers(matrix)
    return num_header_cols <= 4
Beispiel #2
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str starts with a "#1.2" marker.

    The file must exist, contain at least 3 lines, have at least 2
    tab-separated columns on its second line, and have "#1.2" as the
    first column of its first line.  hrows and hcols, if given, must be
    1 and 2 respectively.  The contents of the third line are not
    checked.
    """
    from genomicode import filelib

    if not filelib.exists(locator_str):
        return False
    if hrows not in (None, 1) or hcols not in (None, 2):
        return False

    # Only the first 5 lines are needed to recognize the headers.
    handle = filelib.openfh(locator_str)
    head = []
    for _ in range(5):
        head.append(handle.readline())
    handle.close()  # need to close it properly, or gunzip might not die.
    matrix = [text.rstrip("\r\n").split("\t") for text in head if text]

    if len(matrix) < 3:
        return False
    first_row, second_row = matrix[0], matrix[1]
    # The first line may hold one column or many; the second line needs
    # at least 2.
    if len(first_row) < 1 or len(second_row) < 2:
        return False
    return first_row[0] == "#1.2"
Beispiel #3
0
def parse_sam(file_or_handle):
    """Yield a SAMAlignment object for each line of file_or_handle.

    Each line is split on tabs and must have at least 11 columns.  The
    flag, pos, mapq, and tlen columns are converted to int; the rest
    stay strings.  Columns after the 11th are optional fields of the
    form TAG:TYPE:VALUE and are collected into a dict of
    tag -> (type, value).  A malformed line or a repeated tag raises
    AssertionError.
    """
    from genomicode import filelib

    # csv raises errors on some BAM files read directly with "samtools
    # view" (via the subprocess module), so the columns are split by
    # hand instead of going through filelib.read_cols.
    for line in filelib.openfh(file_or_handle):
        fields = line.rstrip("\r\n").split("\t")
        assert len(fields) >= 11, "Invalid line (%d):\n%s" % (len(fields), line)

        (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq,
         qual) = fields[:11]
        flag = int(flag)
        pos = int(pos)
        mapq = int(mapq)
        tlen = int(tlen)

        tags = {}
        for optional in fields[11:]:
            # Split at most twice so that a VALUE may itself contain ":".
            parts = optional.split(":", 2)
            assert len(parts) == 3, optional
            tag, type_, value = parts
            assert tag not in tags
            tags[tag] = (type_, value)

        yield SAMAlignment(
            qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq,
            qual, tags)
Beispiel #4
0
def count_reads(fastq_filename):
    """Return the number of reads in an uncompressed fastq file.

    The first four lines are checked to look like one fastq record
    (sequence and quality of equal length, "+" on the third line), then
    the lines are counted with "wc -l" and divided by 4.

    Raises AssertionError if the first record is malformed or the
    output of "wc -l" cannot be understood.
    """
    from genomicode import filelib
    from genomicode import parallel

    sq = parallel.quote

    # Make sure it's a fastq file.
    # @M03807:17:000000000-AHGYH:1:1101:20554:1508 1:N:0:16
    # CTTTACACCCAGTGGAGAAGCTCCCAACCAAGCTCTCTTGAGGATCTTGAAGGAAACTGA
    # +
    # <BCC@FAFEC8,C<8968<@EEEFFCCFEC@EDEFGGGGA,@,@EFGGF9,,88,@FFA<
    handle = filelib.openfh(fastq_filename)
    try:
        record = [handle.readline().strip() for _ in range(4)]
    finally:
        # Close explicitly so the handle is not leaked.
        handle.close()
    assert len(record[1]) == len(record[3])
    assert record[2] == "+"

    wc_out = parallel.sshell("wc -l %s" % sq(fastq_filename))
    # velocitron:biocore$ wc -l test01.txt
    # 22278 test01.txt
    # 0 test 1.txt
    fields = wc_out.strip().split()
    # The message needs a %s placeholder; without one, a failing check
    # raised TypeError from the formatting instead of this assertion.
    assert len(fields) >= 2, "Unknown format from wc -l\n%s" % wc_out
    num_lines = int(fields[0])
    # Floor division keeps the result an integer on every Python version.
    return num_lines // 4
Beispiel #5
0
def merge_or_symlink_files(in_filenames, out_filename):
    """Concatenate in_filenames into out_filename, or symlink a lone file.

    If in_filenames holds exactly one file whose extension is .fa or
    .fasta (case-insensitive), out_filename is created as a symlink to
    it.  Otherwise the inputs are opened with filelib.openfh and their
    contents are appended to out_filename in order.  An empty
    in_filenames produces an empty out_filename.

    out_filename must not exist; AssertionError is raised if it does.
    """
    import os
    from genomicode import filelib

    CHUNK_SIZE = 1024 * 1024
    assert not os.path.exists(out_filename)

    # If only one file, and it's not compressed, then just symlink it
    # rather than copy.
    if len(in_filenames) == 1:
        in_filename = in_filenames[0]
        ext = os.path.splitext(in_filename)[1]
        if ext.lower() in [".fa", ".fasta"]:
            os.symlink(in_filename, out_filename)
            return

    # Open the output once and close every handle deterministically,
    # so data is flushed and no handle is left open.
    with open(out_filename, 'w') as out_handle:
        for in_filename in in_filenames:
            in_handle = filelib.openfh(in_filename)
            try:
                while True:
                    chunk = in_handle.read(CHUNK_SIZE)
                    if not chunk:
                        break
                    out_handle.write(chunk)
            finally:
                in_handle.close()
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str is a rectangular table.

    The first 5 lines are split on tabs and passed through _clean_tdf.
    The file matches when at least one row remains, no row is empty,
    and every row has as many columns as the first.  hrows and hcols
    are accepted but not used.
    """
    from genomicode import filelib

    if not filelib.exists(locator_str):
        return False

    # Peek at the first 5 lines.  A short file yields empty strings
    # once the end is reached; those are dropped below.
    handle = filelib.openfh(locator_str)
    head = []
    for _ in range(5):
        head.append(handle.readline())
    handle.close()  # need to close it properly, or gunzip might not die.
    matrix = [text.rstrip("\r\n").split("\t") for text in head if text]
    matrix = _clean_tdf(matrix)

    # Need at least one line to judge.
    if not matrix:
        return False

    # Every row must be non-empty and as wide as the first row.
    for row in matrix:
        if not row:
            return False
    return all(len(row) == len(matrix[0]) for row in matrix)
Beispiel #7
0
def resolve_symbol_or_file(name):
    """Return a list of symbols for name.

    If name is not an existing path, it is taken to be a symbol itself
    and returned as a one-element list.  Otherwise the file is read and
    each line, stripped of surrounding whitespace, is returned as one
    symbol.
    """
    from genomicode import filelib

    if not os.path.exists(name):
        return [name]
    handle = filelib.openfh(name)
    try:
        return [line.strip() for line in handle]
    finally:
        # Close explicitly rather than leaving the handle to be
        # garbage collected.
        handle.close()
Beispiel #8
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file has an ACCESSION/DESCRIPTION header layout.

    Looking at the first 5 lines, the file matches when it has at least
    3 lines, the third line has exactly 1 column, the first line has
    exactly one more column than the second, and the first two columns
    of the first line are "ACCESSION" and "DESCRIPTION" in either order
    (case-insensitive).  hrows and hcols are accepted but not used.
    """
    from genomicode import filelib

    if not filelib.exists(locator_str):
        return False

    # Read 5 lines; short files yield empty strings that are dropped.
    handle = filelib.openfh(locator_str)
    head = []
    for _ in range(5):
        head.append(handle.readline())
    handle.close()  # need to close it properly, or gunzip might not die.
    matrix = [text.rstrip("\r\n").split("\t") for text in head if text]

    if len(matrix) < 3:
        return False
    line1, line2, line3 = matrix[0], matrix[1], matrix[2]

    # Line 3 should contain only 1 column.
    if len(line3) != 1:
        return False
    # Line 1 contains 1 more column than line 2.
    if len(line1) != len(line2) + 1:
        return False
    if len(line1) < 2:
        return False

    leading = sorted(field.upper() for field in line1[:2])
    return leading == sorted(["ACCESSION", "DESCRIPTION"])
def uncompress_file(in_filename, out_filename):
    """Copy the contents of in_filename to out_filename in chunks.

    The input is opened with filelib.openfh and the output with the
    builtin open, overwriting any existing out_filename.  Both handles
    are closed before returning, even if the copy fails.
    """
    from genomicode import filelib
    CHUNK_SIZE = 16 * 1024 * 1024

    in_handle = filelib.openfh(in_filename)
    try:
        with open(out_filename, 'w') as out_handle:
            while True:
                chunk = in_handle.read(CHUNK_SIZE)
                if not chunk:
                    break
                out_handle.write(chunk)
    finally:
        # Close explicitly so the input handle is not leaked.
        in_handle.close()
Beispiel #10
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the first line of the file contains a comma.

    A missing or blank file does not match.  hrows and hcols are
    accepted but not used.
    """
    from genomicode import filelib

    if not filelib.exists(locator_str):
        return False

    handle = filelib.openfh(locator_str)
    first_line = handle.readline()
    handle.close()  # need to close it properly, or gunzip might not die.

    # An empty string here means a blank file.
    return bool(first_line) and "," in first_line
Beispiel #11
0
def read(handle, hrows=None, hcols=None, datatype=float):
    """Read a matrix from a file that starts with a "#1.2" line.

    The first line must be "#1.2".  The second line holds the number of
    genes and the number of samples in its first two tab-separated
    columns.  The rest is parsed by tab_delimited_format.read with
    1 header row and 2 header columns, and its dimensions must agree
    with the counts on the second line.

    handle is passed to filelib.openfh.  hrows must be None or 1 and
    hcols must be None or 2.  datatype is forwarded to
    tab_delimited_format.read.

    Returns the matrix, with NAME, DESCRIPTION, and const.ROW_ID
    registered as synonyms for the first two row headers where they
    differ.  Raises AssertionError on any format violation.
    """
    import const
    import tab_delimited_format
    from genomicode import filelib

    assert hrows is None or hrows == 1
    assert hcols is None or hcols == 2

    handle = filelib.openfh(handle)
    # Consume each line outside of the assert statement.  Asserts are
    # removed under "python -O", and a readline() inside one would then
    # never run, leaving the handle positioned on the wrong line.
    version = handle.readline().strip()
    assert version == "#1.2"
    dims = handle.readline().rstrip("\r\n").split("\t")
    assert len(dims) >= 2
    num_genes, num_samples = map(int, dims[:2])

    X = tab_delimited_format.read(handle, hrows=1, hcols=2, datatype=datatype)
    assert X.dim() == (num_genes, num_samples), (
        "Matrix size mismatch.\n"
        "The GCT headers indicate a matrix with %d rows and %d columns.\n"
        "However, I found %d rows and %d columns." %
        (num_genes, num_samples, X.nrow(), X.ncol()))

    # The first two row headers are not required to be literally NAME
    # and DESCRIPTION; register synonyms so either spelling works.
    header0, header1 = X.row_names()[:2]
    synonyms = {}
    NAME, DESCRIPTION = "NAME", "DESCRIPTION"
    if header0 != NAME:
        synonyms[NAME] = header0
    if header1 != DESCRIPTION:
        synonyms[DESCRIPTION] = header1
    synonyms[const.ROW_ID] = header0
    X._synonyms.update(synonyms)
    assert is_matrix(X)

    # The GCT File Format description at the Broad Institute does not
    # require the NAMEs to be unique, so duplicates are not rejected.
    return X
Beispiel #12
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file has a GID/AID style header layout.

    The first 20 lines are examined.  They must all have the same
    number of columns, there must be 1-4 header rows and 1-5 header
    columns, and each header cell that falls inside the header region
    must carry the expected name (see the table in the body).

    hrows and hcols, when truthy, override the counts guessed by
    util.num_headers.
    """
    from genomicode import filelib
    import util

    if not filelib.exists(locator_str):
        return False

    # 5 lines are sometimes not enough (one matrix had 13 lines of
    # header), so read 20.
    handle = filelib.openfh(locator_str)
    head = []
    for _ in range(20):
        head.append(handle.readline())
    handle.close()  # need to close it properly, or gunzip might not die.
    matrix = [text.rstrip("\r\n").split("\t") for text in head if text]

    # Need at least 1 line.
    if not matrix:
        return False

    # All rows should contain the same number of columns.
    if any(len(row) != len(matrix[0]) for row in matrix):
        return False

    guessed_rows, guessed_cols = util.num_headers(matrix)
    nrow = hrows or guessed_rows
    ncol = hcols or guessed_cols

    if not 1 <= nrow <= 4:
        return False
    if not 1 <= ncol <= 5:
        return False

    # (row, column, expected name) for each header cell.
    expected_headers = (
        (0, 0, "GID"),
        (0, 2, "NAME"),
        (0, 3, "GWEIGHT"),
        (0, 4, "GORDER"),
        (1, 0, "AID"),
        (2, 0, "EWEIGHT"),
        (3, 0, "EORDER"),
    )
    for row, col, name in expected_headers:
        # Only cells inside the header region are checked.
        if row >= nrow or col >= ncol:
            continue
        if matrix[row][col].strip().upper() != name:
            return False
    return True
Beispiel #13
0
def read(handle, hrows=None, hcols=None, datatype=float):
    """Read a comma-separated file by converting it to tab-delimited.

    handle is opened with filelib.openfh and parsed with csv.reader.
    Each row is rewritten as a tab-joined line into an in-memory
    buffer, which is then handed to tab_delimited_format.read together
    with hrows, hcols, and datatype.  Returns whatever that call
    returns.
    """
    import csv
    from StringIO import StringIO
    from genomicode import filelib
    import tab_delimited_format

    # Build the tab-delimited text in memory and let the other module
    # deal with it.
    buf = StringIO()
    for fields in csv.reader(filelib.openfh(handle)):
        buf.write("\t".join(fields))
        buf.write("\n")
    buf.seek(0)
    return tab_delimited_format.read(
        buf, hrows=hrows, hcols=hcols, datatype=datatype)
Beispiel #14
0
def read(handle, hrows=None, hcols=None, datatype=float):
    """Read a matrix, working out the header counts from known names.

    The whole input is read into memory.  When hcols is None it is set
    to 1 plus the number of cells among columns 2-4 of the first line
    whose upper-cased text is in ROW_HEADERS.  When hrows is None it is
    set to 1 plus the number of lines among lines 2-3 whose first cell
    is in COL_HEADERS.  The text is then parsed by
    tab_delimited_format.read with those counts.

    Raises AssertionError if the input has no lines, if its first line
    has fewer than 2 columns, or if the result fails is_matrix.
    """
    import StringIO
    import tab_delimited_format
    from genomicode import filelib

    # The header counts are worked out here because, if sample names
    # are numbers, tab_delimited_format might mistake the first row(s)
    # for non-headers.
    contents = filelib.openfh(handle).read()

    # Look at the first 5 lines only.  A small input may have fewer.
    preview = StringIO.StringIO(contents)
    head = []
    for _ in range(5):
        head.append(preview.readline())
    matrix = [text.rstrip("\r\n").split("\t") for text in head if text]

    assert len(matrix) >= 1
    assert len(matrix[0]) >= 2

    if hcols is None:
        # The first column always counts; columns 2-4 count when they
        # carry a known row header.
        first_row = matrix[0]
        hcols = 1 + sum(
            1 for cell in first_row[1:4]
            if cell.strip().upper() in ROW_HEADERS)
    if hrows is None:
        # The first row always counts; rows 2-3 count when they start
        # with a known column header.
        hrows = 1 + sum(
            1 for row in matrix[1:3]
            if row[0].strip().upper() in COL_HEADERS)

    X = tab_delimited_format.read(
        StringIO.StringIO(contents), hrows=hrows, hcols=hcols,
        datatype=datatype)
    assert is_matrix(X)
    return X
Beispiel #15
0
def read_normal_cancer_file(file_or_handle):
    """Return a list of (normal_sample, tumor_sample) tuples.

    file_or_handle is either a path, which must exist, or an object
    that filelib.read_row can read from.  The table must have "Normal"
    and "Cancer" headers.  Each value is stripped of surrounding
    whitespace, and the two values in a row must differ.
    """
    import os
    from genomicode import filelib

    source = file_or_handle
    if type(source) is str:
        assert os.path.exists(source)
        source = filelib.openfh(source)

    pairs = []
    for row in filelib.read_row(source, header=1, pad_cols=""):
        assert hasattr(row, "Normal"), "Missing header: Normal"
        assert hasattr(row, "Cancer"), "Missing header: Cancer"
        normal = row.Normal.strip()
        cancer = row.Cancer.strip()
        assert normal != cancer
        pairs.append((normal, cancer))
    return pairs
def _read_coverage_file(filename):
    """Return a dict of key -> value (as string) from a two-line table.

    Lines starting with "#" and blank lines are skipped.  Exactly two
    lines must remain: tab-separated keys, then tab-separated values,
    with the same number of columns.  Keys and values are stripped of
    surrounding whitespace, and keys must be unique.
    """
    from genomicode import filelib

    rows = []
    for line in filelib.openfh(filename):
        if line.startswith("#") or not line.strip():
            continue
        rows.append(line.rstrip("\r\n").split("\t"))
    assert len(rows) == 2
    keys, values = rows[0], rows[1]
    assert len(keys) == len(values)

    result = {}
    for index, raw_key in enumerate(keys):
        key = raw_key.strip()
        value = values[index].strip()
        assert key not in result
        result[key] = value
    return result
def merge_parsed_files(parsed_files, outfile):
    """Merge parsed_files into outfile, writing each distinct line once.

    Every file must begin with the same header columns, as read by
    filelib.read_cols.  The files are then copied line by line, in
    order, skipping any line already written.  This also collapses the
    repeated header lines into one.

    parsed_files must be non-empty.  Each entry is read twice, so it is
    presumably a filename rather than an open handle.  Raises
    AssertionError if a header is missing or the headers disagree.
    """
    from genomicode import filelib

    assert parsed_files

    # First, make sure each of the parsed files has the same header.
    header = None
    for f in parsed_files:
        cols = next(filelib.read_cols(f), None)
        assert cols, "Missing header: %s" % f
        # Compare against None so that every later file is checked
        # against the first header.
        if header is None:
            header = cols
        assert header == cols, "Mismatched headers"
    assert header

    seen = set()
    with open(outfile, 'w') as out_handle:
        for f in parsed_files:
            in_handle = filelib.openfh(f)
            try:
                for line in in_handle:
                    # A final line with no newline would otherwise run
                    # into the first line of the next file, and would
                    # not be recognized as a duplicate.
                    if not line.endswith("\n"):
                        line += "\n"
                    if line in seen:
                        continue
                    seen.add(line)
                    out_handle.write(line)
            finally:
                in_handle.close()
def parse_trimmomatic_output(filename):
    """Parse the read counts from a Trimmomatic log.

    Returns a dictionary with keys:
    reads_processed
    dropped_reads

    For paired ends, this refers to pairs of reads.  dropped_reads
    indicates how many pairs were dropped, because one or both were
    dropped.  If the log has several summary lines, the last one wins.

    Raises AssertionError if no summary line is found or a summary
    line is not in one of the two known layouts.
    """
    from genomicode import filelib

    # Single end reads:
    # Input Reads: 1764254 Surviving: 1764160 (99.99%) Dropped: 94 (0.01%)
    #
    # Paired end reads:
    # Input Read Pairs: 60032406 Both Surviving: 59093198 (98.44%)
    #   Forward Only Surviving: 891164 (1.48%) Reverse Only Surviving:
    #   20511 (0.03%) Dropped: 27533 (0.05%)

    results = {}
    handle = filelib.openfh(filename)
    try:
        for line in handle:
            if not line.startswith("Input Read"):
                continue
            cols = line.strip().split()
            # The prefix is capitalized "Reads:", matching the filter
            # above and the sample line.  A lowercase "reads:" never
            # matched, leaving reads and dropped unbound.
            if line.startswith("Input Reads:"):
                # Single end.
                assert len(cols) == 9
                reads = int(cols[2])
                dropped = int(cols[7])
            elif line.startswith("Input Read Pairs:"):
                # Paired end.  Pairs where both reads survived are in
                # cols[6]; everything else counts as dropped.
                assert len(cols) == 21
                reads = int(cols[3])
                dropped = reads - int(cols[6])
            else:
                raise AssertionError("Unknown format: %s" % line.strip())
            assert dropped < reads
            results["reads_processed"] = reads
            results["dropped_reads"] = dropped
    finally:
        handle.close()
    assert results, "Parse error: %s" % filename
    return results
Beispiel #19
0
def extract_sample2desc(filename):
    """Return (title_dict, description_dict) parsed from filename.

    Both dicts are keyed by sample ID.  A line starting with "^SAMPLE"
    opens a sample, whose ID is the third whitespace-separated token.
    "!Sample_title" and "!Sample_description" lines store the rest of
    the line after the second token for the open sample.  A
    "!sample_table_end" line closes the sample.

    Raises AssertionError if a sample opens while another is still
    open, or a title or description appears outside a sample.
    """
    from genomicode.filelib import openfh

    titles = {}
    descriptions = {}
    # Which dict receives the text for each recognized line prefix.
    targets = (
        ("!Sample_description", descriptions),
        ("!Sample_title", titles),
    )

    current_id = None
    for line in openfh(filename):
        if line.startswith("^SAMPLE"):
            assert current_id is None, "problem with %s" % filename
            current_id = line.strip().split()[2]
            continue
        if line.startswith("!sample_table_end"):
            current_id = None
            continue
        for prefix, target in targets:
            if line.startswith(prefix):
                assert current_id is not None, "problem with %s" % filename
                target[current_id] = line.strip().split(None, 2)[2]
                break

    return titles, descriptions
Beispiel #20
0
def is_format(locator_str, hrows=None, hcols=None):
    """Return True if the file at locator_str looks like a PCL file.

    The first 100 lines are examined.  They must all have the same
    number of columns, there must be 1-4 header columns and at most 3
    header rows, and each header cell that falls inside the header
    region must carry the expected name (see the table in the body).

    hrows and hcols, when truthy, override the counts guessed by
    util.num_headers.
    """
    from genomicode import filelib
    import util

    if not filelib.exists(locator_str):
        return False

    # Previously only 5 lines were read, and that caused problems.  In
    # one matrix, an annotation column had spaces in the first 5 lines,
    # so it was mistakenly treated as part of the matrix rather than
    # part of the annotations.  Probably should look at least at the
    # first 100 lines.  U133Av2 has 62 AFFX genes that may or may not
    # have annotations.
    NUM_LINES = 100
    handle = filelib.openfh(locator_str)
    head = []
    for _ in range(NUM_LINES):
        head.append(handle.readline())
    handle.close()  # need to close it properly, or gunzip might not die.
    matrix = [text.rstrip("\r\n").split("\t") for text in head if text]

    # Has to have at least a header line.
    if not matrix:
        return False

    # All rows should contain the same number of columns.
    if any(len(row) != len(matrix[0]) for row in matrix):
        return False

    guessed_rows, guessed_cols = util.num_headers(matrix)
    ncol = hcols or guessed_cols

    # PCL requires at least the gene IDs.
    if ncol == 0:
        return False

    # Treat a file with no detected header rows as having one.
    nrow = max(hrows or guessed_rows, 1)
    if nrow > 3:
        return False
    # PCL format has at most 4 header columns.
    if ncol > 4:
        return False

    # (row, column, expected name) for each header cell.
    expected_headers = (
        (0, 1, "NAME"),
        (0, 2, "GWEIGHT"),
        (0, 3, "GORDER"),
        (1, 0, "EWEIGHT"),
        (2, 0, "EORDER"),
    )
    for row, col, name in expected_headers:
        # Only cells inside the header region are checked.
        if row >= nrow or col >= ncol:
            continue
        if matrix[row][col].strip().upper() != name:
            return False
    return True
def read(handle, hrows=None, hcols=None, datatype=float):
    """Parse a tab-delimited matrix into headers, annotations, and values.

    handle is a filename or a file-like object; it is passed to
    filelib.openfh.  When it is a str, the name is also kept for use in
    error messages.

    hrows and hcols are the number of header rows and header columns.
    Any that is None is guessed with util.num_headers.

    datatype selects how the values are converted: None for no
    conversion, int, float, or any other callable, which is applied to
    each value.  With float, integers are used instead if every value
    is an integer according to jmath.is_int.

    Raises AssertionError if the lines differ in their number of
    columns, if a header name is duplicated, or if the file looks like
    a GCT file.  Raises ValueError naming the offending row if a value
    cannot be converted.

    NOTE(review): no return statement is visible in this block, and the
    names built here (row_names, col_names, row_order, col_order, the
    converted matrix) are never used to construct a result.  The block
    appears to be truncated -- confirm against the full source.
    """
    import math
    from genomicode import filelib
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import iolib
    import util
    import const
    # Format:
    # - gene x experiment
    # - optional header row
    # - optional rows of sample annotations (requires header row)
    # - optional columns of gene annotations

    filename = None
    if type(handle) is type(""):
        filename = handle
    handle = filelib.openfh(handle)
    data = filelib.read_all_cols(handle)
    #data = [x for x in filelib.read_cols(handle)]
    #x = handle.read()
    #data = iolib.split_tdf(x, strip=True)
    #handle = filelib.read_cols(handle)
    #data = [handle.next() for i in range(100)]
    data = _clean_tdf(data)

    # NOTE(review): data[0] is indexed here, before the "if not data"
    # check below.  If data is an empty list this raises IndexError, and
    # the empty-matrix branch is never reached.
    num_cols = len(data[0])
    # Every line must have as many columns as the first one.
    for i, x in enumerate(data):
        nc = len(data[i])
        f = ""
        if filename:
            f = " [%s]" % filename
        error_msg = "Header%s has %d columns but line %d has %d." % (
            f, num_cols, i + 1, nc)
        assert nc == num_cols, error_msg
    if not data:
        return Matrix.InMemoryMatrix([])

    # If the rows and cols not explicitly specified, then try to guess
    # them from the file.
    #print "HEADERS 1", hrows, hcols
    if hrows is None or hcols is None:
        hr, hc = util.num_headers(data)
        if hrows is None:
            hrows = hr
        if hcols is None:
            hcols = hc
    #print "HEADERS 2", hrows, hcols
    #num_genes, num_arrays = num_rows-hrows, num_cols-hcols

    # Pull out the row names from the columns.
    row_names = {}  # header -> list of names (1 for each gene)
    row_order = []  # in-order list of the headers
    if hcols:
        if hrows:
            # If a header row is provided, then the names of these
            # annotations are provided in the header.
            row_order = data[0][:hcols]
        else:
            # No header row.  Make default name for these annotations.
            # The names are "ANNOT" followed by a 1-based index, padded
            # to a common width.
            ndigits = int(math.ceil(math.log(hcols, 10)))
            row_order = ["ANNOT%*d" % (ndigits, i + 1) for i in range(hcols)]
        # Strip extraneous whitespace from the header names.
        # Not necessary.  Handled now in split_tdf.
        #row_order = [x.strip() for x in row_order]

        # Sometimes the format detection can go wrong and a GCT file
        # will slip through to here.  If this occurs, a "duplicate
        # header" exception will be generated.  Check for this and
        # generate a more meaningful error message.
        if (row_order[0] == "#1.2" and len(row_order) > 1
                and row_order[1] == "" and row_order[-1] == ""):
            raise AssertionError("ERROR: It looks like a GCT file was missed.")
        for i, header in enumerate(row_order):
            names = [x[i] for x in data[hrows:]]
            assert header not in row_names, "duplicate header: %s" % header
            row_names[header] = names

    # Pull out the column names.
    col_names = {}  # header -> list of names (1 for each array)
    col_order = []
    if hrows:
        # Row 0 holds the sample names and is handled further below;
        # the remaining header rows are sample annotations.
        for i in range(1, hrows):
            header = data[i][0]
            names = data[i][hcols:]
            assert header not in col_names, "duplicate name: %s" % header
            # Strip extraneous whitespace from the header names.
            # Not necessary.  Handled now in split_tdf.
            #header = header.strip()
            col_order.append(header)
            col_names[header] = names

    # Now extract the expression values.
    matrix = data
    if hrows or hcols:
        matrix = [x[hcols:] for x in matrix[hrows:]]

    # Pull out the sample names.
    sample_names = None
    if hrows:
        # If a header is provided, then use these as the column names.
        sample_names = data[0][hcols:]
    if sample_names:
        # SAMPLE_NAME is a module-level name; it goes first in the order.
        col_names[SAMPLE_NAME] = sample_names
        col_order.insert(0, SAMPLE_NAME)

    if datatype is None:
        convert_fn = None  # no conversion
    elif datatype is int:
        convert_fn = jmath.safe_int
    elif datatype is float:
        convert_fn = jmath.safe_float
    else:
        # Assume that I was passed a function.
        convert_fn = datatype

    if convert_fn == jmath.safe_float:
        # Try and convert to an integer instead.
        is_int = True
        for i in range(len(matrix)):
            for j in range(len(matrix[i])):
                if not jmath.is_int(matrix[i][j]):
                    is_int = False
                    break
            if not is_int:
                break
        if is_int:
            convert_fn = jmath.safe_int

    if convert_fn:
        check_each_row = False
        try:
            matrix = [map(convert_fn, x) for x in matrix]
        except ValueError, err1:
            # Only the known float-conversion messages trigger the
            # row-by-row diagnosis; any other ValueError is re-raised.
            if str(err1) == "empty string for float()":
                check_each_row = True
            elif str(err1).startswith("invalid literal for float()"):
                check_each_row = True
            elif str(err1).startswith("could not convert string to float"):
                check_each_row = True
            else:
                raise
        if check_each_row:
            # If there was an exception, then check each row carefully
            # to try to pinpoint the problem.
            for i, x in enumerate(matrix):
                try:
                    map(convert_fn, x)
                except ValueError, err2:
                    row = data[hrows + i]
                    raise ValueError("%s\nProblem with row %d: %s" %
                                     (str(err2), i + 1, row))
            # Reached only if no single row reproduced the error.
            raise AssertionError("Error converting values.")
Beispiel #22
0
def extract_signal(filename, outhandle):
    """Extract the signal table from a series matrix file.

    The lines between "!series_matrix_table_begin" and
    "!series_matrix_table_end" in filename are collected, split into
    separate tables at each line starting with "ID_REF", aligned by the
    ID in the first column, and merged side by side.  The merged table
    is written to outhandle as tab-delimited text.  An ID missing from
    one table gets blank values for that table's columns.

    Intermediate results go to temporary files created in the current
    directory so that large data sets do not have to fit in memory.

    Raises AssertionError if no table is found, if the merged lines are
    unaligned, or if there is no data.

    NOTE(review): the try block below has no except or finally clause in
    the visible source, so the block appears to be truncated.
    Presumably the missing part removes the temporary files -- confirm
    against the full source.
    """
    import os
    import tempfile
    from genomicode import filelib

    # Write stuff to file to handle large data sets.
    tmpfile1 = tmpfile2 = tmpfile3 = None
    try:
        # tmpfile1        Raw signal data from series matrix file.
        # tmpfile2.<num>  Raw data split into separate tables.
        # tmpfile3        Final merged signal table.
        # mkstemp returns an open descriptor; only the path is needed.
        x, tmpfile1 = tempfile.mkstemp(dir=".")
        os.close(x)
        x, tmpfile2 = tempfile.mkstemp(dir=".")
        os.close(x)
        x, tmpfile3 = tempfile.mkstemp(dir=".")
        os.close(x)

        # Get a list of all lines in the series matrix tables.
        handle = open(tmpfile1, 'w')
        in_matrix_table = 0
        for cols in filelib.read_cols(filename):
            # Some files can have blank lines.
            if not cols:
                continue
            if cols[0] == "!series_matrix_table_begin":
                in_matrix_table = 1
            elif cols[0] == "!series_matrix_table_end":
                in_matrix_table = 0
            elif in_matrix_table:
                cols = [remove_quotes(x).strip() for x in cols]
                print >> handle, "\t".join(cols)
        handle.close()
        handle = None

        # Split the data into separate tables.
        # Each "ID_REF" line starts a new file named tmpfile2.<num>.
        num_tables = 0
        for line in filelib.openfh(tmpfile1):
            if line.startswith("ID_REF"):
                handle = open("%s.%d" % (tmpfile2, num_tables), 'w')
                num_tables += 1
            assert handle
            print >> handle, line,
        if handle:
            handle.close()
        assert num_tables

        # Sometimes the tables will not be aligned.
        # E.g. GSE9899-GPL570 contains two tables, and the 2nd is
        # missing some probe sets.  Get a list of the probe sets in
        # the tables.
        files = ["%s.%d" % (tmpfile2, i) for i in range(num_tables)]
        matrices = [FileMatrix(x) for x in files]
        # One dict per table, mapping the ID in column 0 to its row index.
        id2indexes = []
        for matrix in matrices:
            id2index = {}
            for i, row in enumerate(matrix):
                id_ = row[0]
                id2index[id_] = i
            id2indexes.append(id2index)

        # Make a list of all the IDs.
        # The IDs are sorted, with the "ID_REF" header kept first.
        all_ids = {}
        for id2index in id2indexes:
            for id_ in id2index:
                all_ids[id_] = 1
        del all_ids["ID_REF"]
        all_ids = all_ids.keys()
        all_ids.sort()
        all_ids = ["ID_REF"] + all_ids

        # Align the indexes.
        #num_rows = row_names = None
        #for i in range(num_tables):
        #    filename = "%s.%d" % (tmpfile2, i)
        #    rname, nrow = [], 0
        #    for line in openfh(filename):
        #        x = line.split("\t", 1)[0]
        #        rname.append(x)
        #        nrow += 1
        #    if num_rows is None:
        #        num_rows = nrow
        #    if row_names is None:
        #        row_names = rname
        #    assert num_rows == nrow, "table is unaligned"
        #    assert row_names == rname

        # Merge all the pieces together into one big table.
        handle = open(tmpfile3, 'w')
        for id_ in all_ids:
            cols = []
            for matrix, id2index in zip(matrices, id2indexes):
                if id_ in id2index:
                    x = matrix[id2index[id_]]
                else:
                    # If this ID is missing, then just insert blank values.
                    x = [""] * len(matrix[0])
                if cols:
                    # If this is not the first matrix, then delete the
                    # row names.
                    x = x[1:]
                cols.extend(x)
            print >> handle, "\t".join(cols)
        handle.close()

        num_rows = len(all_ids)
        num_cols = len(filelib.read_cols(tmpfile3).next())

        # Figure out which expression values are missing.
        # A value counts as missing if float() rejects it or it is the
        # string "nan".  The header line (i == 0) is skipped.
        data_missing = {}
        for i, cols in enumerate(filelib.read_cols(tmpfile3)):
            assert len(cols) == num_cols, "line %d unaligned [%d:%d]" % (
                i, len(cols), num_cols)
            if i == 0:
                continue
            for j in range(1, len(cols)):
                try:
                    float(cols[j])
                except ValueError, x:
                    data_missing[(i, j)] = 1
                if cols[j] == "nan":
                    data_missing[(i, j)] = 1

        ## Remove the samples where >50% values are missing.
        #col_missing = [0] * num_cols   # number of values missing in each col
        #for i, j in data_missing:
        #    col_missing[j] += 1

        # With the filter above commented out, every column is kept.
        good_cols = [0]
        for i in range(1, num_cols):
            #if col_missing[i] > 0.50*(num_rows-1):  # -1 for the row names
            #    continue
            good_cols.append(i)

        ## Remove the genes where any value is missing.
        #row_missing = [0] * num_rows
        #for i, j in data_missing:
        #    if j not in good_cols:   # ignore samples that are already dropped
        #        continue
        #    row_missing[i] += 1

        # With the filter above commented out, every row is kept.
        good_rows = [0]
        for i in range(1, num_rows):
            #if row_missing[i] > 0:  # a value is missing.
            #    continue
            good_rows.append(i)

        assert len(good_cols) > 1, "no data"
        assert len(good_rows) > 1, "no data"

        # Write out the data.
        for i, cols in enumerate(filelib.read_cols(tmpfile3)):
            if i not in good_rows:
                continue
            x = [x for (i, x) in enumerate(cols) if i in good_cols]
            print >> outhandle, "\t".join(x)
Beispiel #23
0
def read(handle, hrows=None, hcols=None, datatype=float):
    """Parse a RES file into a Matrix.InMemoryMatrix.

    Layout enforced by the assertions below (tab-delimited):
      row 0   two header cells, "Accession" and "Description" (either
              order, case-insensitive), then the sample names, each
              followed by an empty cell.
      row 1   sample descriptions in the odd-numbered cells; the
              even-numbered cells must be empty.  A description may end
              in "/scale factor=<number>".
      row 2   a single cell holding the number of genes.
      rest    one row per gene: the two annotation cells, then
              alternating expression value and call (A, P, or M).

    handle    passed to filelib.openfh.
    hrows     not used by this function.
    hcols     not used by this function.
    datatype  None leaves the values as strings; int and float select
              jmath.safe_int and jmath.safe_float; anything else is
              called on each value.

    Returns the matrix, with the calls stored as the row annotation
    "CALL" (one character per sample) and the scale factors as the
    column annotation "SCALE_FACTOR" ("" where none was given).
    Raises AssertionError if the file violates the layout.

    """
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import Matrix
    import tab_delimited_format as tdf
    import const

    handle = filelib.openfh(handle)
    # Split by hand rather than with iolib.split_tdf, which does not
    # handle empty lines properly (they occur when a file has no
    # samples).
    rows = []
    for line in handle:
        rows.append(line.rstrip("\r\n").split("\t"))
    assert len(rows) >= 3, "Invalid RES file."

    # Sanity check the three header rows.
    header_row, descr_row, count_row = rows[0], rows[1], rows[2]
    assert len(header_row) == len(descr_row) + 1
    leading = sorted([name.upper() for name in header_row[:2]])
    assert leading == ["ACCESSION", "DESCRIPTION"]
    assert len(count_row) == 1, "%d: %s" % (len(count_row), repr(count_row))

    # The third row only carries the gene count; drop it once read.
    num_genes = int(count_row[0])
    rows = rows[:2] + rows[3:]
    assert len(rows) == num_genes + 2  # data + 2 headers

    # GenePattern creates files where the last column is all blank.
    # Strip that column if no row has anything in it.
    if not any(row[-1] for row in rows):
        rows = [row[:-1] for row in rows]

    # Sample names sit at even offsets after the two annotation
    # headers; the cells in between must be empty.
    sample_names = []
    for offset, cell in enumerate(rows[0][2:]):
        if offset % 2 == 0:
            assert cell
            sample_names.append(cell)
        else:
            assert not cell

    # Sample descriptions sit at the odd indexes of the second row.
    sample_description = []
    for index, cell in enumerate(rows[1]):
        if index % 2:
            assert cell
            sample_description.append(cell)
        else:
            assert not cell
    assert len(sample_description) == len(sample_names)

    # Split "<description>/scale factor=<number>" into its two parts.
    # Descriptions without a scale factor keep "" as the factor.
    marker = "scale factor"
    scale_factors = []
    for index, descr in enumerate(sample_description):
        start = descr.lower().find(marker)
        if start < 0:
            scale_factors.append("")
            continue
        assert descr[start - 1] == "/"
        assert descr[start + len(marker)] == "="
        scale_factors.append(float(descr[start + len(marker) + 1:]))
        sample_description[index] = descr[:start - 1]

    # Pull out the two annotation columns, swapping them if the file
    # lists the description first.
    gene_rows = rows[2:]
    accession_header = rows[0][0]
    description_header = rows[0][1]
    accession = [row[0] for row in gene_rows]
    description = [row[1] for row in gene_rows]
    if [name.upper() for name in rows[0][:2]] == ["DESCRIPTION", "ACCESSION"]:
        accession_header, description_header = \
                          description_header, accession_header
        accession, description = description, accession
    assert (accession_header.upper(), description_header.upper()) == \
           ("ACCESSION", "DESCRIPTION")

    # Accession should be unique.
    assert len(set(accession)) == len(accession)

    # Separate the expression values from the calls.
    matrix = []
    calls = []
    for row in gene_rows:
        cells = row[2:]
        values = cells[0::2]
        flags = cells[1::2]
        assert len(values) == len(flags)
        for flag in flags:
            assert flag.upper() in ["A", "P", "M"], flag
        matrix.append(values)
        calls.append(flags)
    assert len(matrix) == num_genes

    # Choose the converter.  None means leave the values untouched.
    convert_fn = datatype
    if datatype is int:
        convert_fn = jmath.safe_int
    elif datatype is float:
        convert_fn = jmath.safe_float

    if convert_fn:
        matrix = [map(convert_fn, values) for values in matrix]

    row_order = rows[0][:2] + ["CALL"]
    col_order = [tdf.SAMPLE_NAME, "DESCRIPTION", "SCALE_FACTOR"]

    # The gene annotation "CALL" is a string of A, P, or M, with one
    # call per sample.
    row_names = {
        accession_header: accession,
        description_header: description,
        "CALL": ["".join(flags) for flags in calls],
    }
    col_names = {
        tdf.SAMPLE_NAME: sample_names,
        "DESCRIPTION": sample_description,
        "SCALE_FACTOR": scale_factors,
    }
    synonyms = {
        const.COL_ID: tdf.SAMPLE_NAME,
        const.ROW_ID: accession_header,
    }

    X = Matrix.InMemoryMatrix(matrix,
                              row_names=row_names,
                              col_names=col_names,
                              row_order=row_order,
                              col_order=col_order,
                              synonyms=synonyms)
    assert is_matrix(X)
    return X
Beispiel #24
0
def read_sample_group_file(file_or_handle):
    """Read and validate a sample group file.

    Reads can be split across multiple files (e.g. for multiple
    lanes), or across pairs.  The file has the headers:

        Filename  Sample  Pair
        F1         A       1
        F3         A       2
        F2         A       1
        F4         A       2
        F5         B       1
        F6         B       2

    - Filenames should be unique.
    - Filename should be relative.  No full path information.
    - Pair should be 1 or 2.  If single end reads, just leave blank.
    - There can be many Filenames per Sample.  There can be many
      Pairs per Sample (if the reads for one pair are split).
    - The pairs that match (1 to its 2 partner) should be next to
      each other in the file.

    file_or_handle  a filename (checked for existence, then opened
                    with filelib.openfh) or an object that
                    filelib.read_row can read from.

    Returns a list of (filename, sample, pair) tuples in file order.
    pair is one of the strings "", "1", or "2".  If every row in the
    file has Pair "1", all pairs are returned as "".

    Raises AssertionError if any of the rules above is violated.

    """
    import os
    from genomicode import filelib

    handle = file_or_handle
    if isinstance(handle, str):
        assert os.path.exists(file_or_handle)
        handle = filelib.openfh(handle)

    data = []
    # NOTE(review): read_row is assumed to yield one object per row
    # with the column headers as attributes -- confirm in filelib.
    for d in filelib.read_row(handle, header=1, pad_cols=""):
        assert hasattr(d, "Pair"), "Missing column: Pair"
        pair = d.Pair.strip()
        assert pair in ["", "1", "2"], "Invalid pair: %s" % d.Pair
        data.append((d.Filename, d.Sample, pair))

    # Make sure filenames are unique and carry no directory part.
    seen = set()
    for filename, sample, pair in data:
        dirname = os.path.split(filename)[0]
        assert not dirname, \
            "Filename should not contain a path: %s" % filename
        assert filename not in seen, "Filename is not unique: %s" % filename
        seen.add(filename)

    # If all the Pairs are "1", then make them all blank.
    if sorted(set([row[2] for row in data])) == ["1"]:
        data = [(filename, sample, "") for (filename, sample, pair) in data]

    # Collect the pair values of each sample, in file order.
    pairs_by_sample = {}
    for filename, sample, pair in data:
        pairs_by_sample.setdefault(sample, []).append(pair)
    all_samples = sorted(pairs_by_sample)

    # For each sample, make sure there isn't a mix of paired and
    # single ended files.  It must be all single ended or all paired.
    for sample in all_samples:
        x = sorted(set(pairs_by_sample[sample]))
        if x == [""] or x == ["1"]:  # All single
            continue
        elif x == ["1", "2"]:  # All paired
            continue
        raise AssertionError("Weird pairing [%s]: %s" % (repr(x), sample))

    # Make sure each pair is next to each other.  This has to look at
    # the values in file order; the sorted unique values are always
    # ["1", "2"] for a paired sample and would make the check vacuous.
    for sample in all_samples:
        pairs = pairs_by_sample[sample]
        if "2" not in pairs:  # single ended; nothing to match up
            continue
        # Should be a repeating pattern of "1", "2".
        assert len(pairs) % 2 == 0, "Weird pairing: %s" % sample
        for i in range(0, len(pairs), 2):
            assert pairs[i] == "1", "Weird pairing: %s" % sample
            assert pairs[i + 1] == "2", "Weird pairing: %s" % sample

    return data
Beispiel #25
0
def _diagnose_format_problem(filename):
    """Try to diagnose a problem with the format of a matrix file.

    Only the first few lines of the file are examined, which is
    usually enough for a diagnosis.

    filename  path of an existing file; opened with filelib.openfh.

    Returns a string describing the problem, or None if no problem
    was recognized.  Raises AssertionError if filename does not exist.

    """
    import csv
    import os
    from genomicode import filelib

    assert os.path.exists(filename)

    # Read the first 5 lines of the file.
    NUM_LINES = 5
    handle = filelib.openfh(filename)
    try:
        lines = [handle.readline() for i in range(NUM_LINES)]
    finally:
        handle.close()
    # readline returns "" (not None) once the end of the file has been
    # reached, so drop those to handle files shorter than NUM_LINES.
    lines = [x for x in lines if x]

    if not lines:
        return "The file is empty."

    # Figure out whether this is a tab-delimited or comma-delimited
    # file.  Don't check the first line for tabs, because GCT format
    # might not have a tab in the first line.  If there is only one
    # line, then that line is all there is to check.
    tab_lines = lines[1:] or lines
    is_tdf = all("\t" in line for line in tab_lines)
    is_csv = (not is_tdf) and all("," in line for line in lines)

    # Problem: Not tab-delimited and not comma-delimited format.
    if not is_tdf and not is_csv:
        return "File does not appear to be delimited by tabs or commas."

    # Split each line into columns.
    if is_tdf:
        cols = [line.rstrip("\r\n").split("\t") for line in lines]
    else:
        # csv.reader accepts any iterable of lines.
        cols = list(csv.reader(lines))

    # Problem: Sometimes people provide a file where the first row
    # contains one fewer column than the remaining rows.  For example,
    # if they used R to create the file.
    num_cols = [len(x) for x in cols]
    if len(num_cols) >= 2 and num_cols[0] == num_cols[1] - 1:
        return "First row has 1 fewer column than second row."
    if min(num_cols) != max(num_cols):
        return "Rows have different numbers of columns."

    return None