Exemple #1
0
def _res_to_gct(X):
    """Convert a RES-format matrix to a GCT-format matrix.

    The conversion is lossy: the CALL information, the SCALE_FACTOR and
    the column DESCRIPTIONS are not carried over.

    X must pass res_format.is_matrix.  The result is a new
    Matrix.InMemoryMatrix, built on X._X, that passes
    gct_format.is_matrix.
    """
    from genomicode import Matrix
    assert res_format.is_matrix(X)

    # The row annotations come as three headers.  Of the first two, the
    # one registered as ROW_ID is the accession and the other one is the
    # description.  The third (call) header is not used.
    first, second, _unused_call_header = X.row_names()
    if X._synonyms[ROW_ID] == first:
        acc_header, desc_header = first, second
    else:
        acc_header, desc_header = second, first
    assert X._synonyms[ROW_ID] == acc_header

    # Only the first column annotation is kept.
    sample_header = X._col_order[0]
    gct_row_names = {
        "Name": X.row_names(acc_header),
        "Description": X.row_names(desc_header),
    }
    gct_col_names = {sample_header: X.col_names(sample_header)}
    gct_synonyms = {ROW_ID: "Name", COL_ID: sample_header}

    gct_matrix = Matrix.InMemoryMatrix(
        X._X,
        row_names=gct_row_names,
        col_names=gct_col_names,
        row_order=["Name", "Description"],
        col_order=[sample_header],
        synonyms=gct_synonyms)
    assert gct_format.is_matrix(gct_matrix)
    return gct_matrix
Exemple #2
0
def merge_two_files(A_file, B_file, handle):
    """Merge two matrix files side by side and write the result to handle.

    Both files are read with arrayio and must be tab-delimited-format
    matrices.  Their rows are aligned with matrixlib.align_rows, and each
    output row is the row of A followed by the matching row of B.

    Only column annotations present in both matrices are kept.  Within
    such an annotation, a value from B that also occurs among the values
    from A gets the suffix "_2".
    """
    import arrayio
    from genomicode import Matrix
    from genomicode import matrixlib

    M_A = arrayio.read(A_file)
    M_B = arrayio.read(B_file)
    assert arrayio.tab_delimited_format.is_matrix(M_A)
    assert arrayio.tab_delimited_format.is_matrix(M_B)
    M_A, M_B = matrixlib.align_rows(M_A, M_B)
    assert M_A.nrow() > 0, 'there is no common genes between two files'

    # Concatenate the aligned rows.
    merged_X = [M_A._X[i] + M_B._X[i] for i in range(M_A.dim()[0])]

    merged_col_names = {}
    for header in M_A._col_names:
        if header in M_B._col_names:
            a_samples = M_A._col_names[header]
            # Mark the samples from B whose name clashes with one from A.
            b_samples = [
                (sample + '_2') if sample in a_samples else sample
                for sample in M_B._col_names[header]]
            merged_col_names[header] = a_samples + b_samples

    merged = Matrix.InMemoryMatrix(
        merged_X, M_A._row_names, merged_col_names, M_A._row_order)
    arrayio.tab_delimited_format.write(merged, handle)
Exemple #3
0
def _jeff_to_pcl(X):
    """Convert a matrix in Jeff's format to a PCL-format matrix.

    Most row annotations are lost: only "Probe.Set.ID" (kept under the
    same header) and "Gene.Symbol" (stored under "NAME") are carried
    over.  The column annotations are copied.

    X must pass jeffs_format.is_matrix, have exactly one column
    annotation, and that annotation must be the tab-delimited
    SAMPLE_NAME.  The result passes pcl_format.is_matrix.
    """
    from genomicode import Matrix
    assert jeffs_format.is_matrix(X)

    assert len(X.col_names()) == 1
    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME

    pcl_col_order = X._col_order[:]
    pcl_col_names = X._col_names.copy()
    pcl_row_names = {
        "Probe.Set.ID": X._row_names["Probe.Set.ID"],
        "NAME": X._row_names["Gene.Symbol"],
    }
    pcl_synonyms = {ROW_ID: "Probe.Set.ID", COL_ID: pcl_col_order[0]}

    pcl_matrix = Matrix.InMemoryMatrix(
        X._X,
        row_names=pcl_row_names,
        col_names=pcl_col_names,
        row_order=["Probe.Set.ID", "NAME"],
        col_order=pcl_col_order,
        synonyms=pcl_synonyms)
    assert pcl_format.is_matrix(pcl_matrix)
    return pcl_matrix
Exemple #4
0
def _gct_to_pcl(X):
    """Convert a GCT-format matrix to a PCL-format matrix.

    The first row annotation of X is stored under "GeneID" and the
    second under "NAME".  The column annotations are copied.

    X must pass gct_format.is_matrix, have exactly one column annotation
    (the tab-delimited SAMPLE_NAME) and exactly two row annotations.  The
    result passes pcl_format.is_matrix.
    """
    from genomicode import Matrix

    assert gct_format.is_matrix(X)
    assert len(X.col_names()) == 1
    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME
    gct_headers = X.row_names()
    assert len(gct_headers) == 2
    id_header, label_header = gct_headers

    pcl_col_order = X._col_order[:]
    pcl_col_names = X._col_names.copy()
    pcl_row_names = {
        "GeneID": X._row_names[id_header],
        "NAME": X._row_names[label_header],
    }
    pcl_synonyms = {ROW_ID: "GeneID", COL_ID: pcl_col_order[0]}

    pcl_matrix = Matrix.InMemoryMatrix(
        X._X,
        row_names=pcl_row_names,
        col_names=pcl_col_names,
        row_order=["GeneID", "NAME"],
        col_order=pcl_col_order,
        synonyms=pcl_synonyms)
    assert pcl_format.is_matrix(pcl_matrix)
    return pcl_matrix
Exemple #5
0
def read_geneset_scores(filename):
    """Read the output from score_geneset.py and return a Matrix object.

    The file is read with filelib.read_cols and transposed, giving a
    list of lines whose first element is a label and whose remaining
    elements are values.  The expected lines are:

        SAMPLE
        FILE
        [Score ...]
        [Direction ...]    label ends with " direction"
        [p value ...]      label ends with " pvalue"
        [significant ...]  label ends with " significant"

    Only the scores are kept; the direction, pvalue and significant
    lines are discarded.  The first two remaining lines are treated as
    headers: the sample names are taken from the second one if its label
    is "SAMPLE" (case-insensitive) and from the first one otherwise.
    Every later line becomes one matrix row whose label is stored under
    the "geneset" row annotation and whose values are converted with
    jmath.safe_float.

    Raises AssertionError if the file does not exist, is empty, contains
    an empty line, or has fewer than two lines left after filtering.
    """
    import os
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import Matrix
    from arrayio import const
    from arrayio import tab_delimited_format as tdf

    assert os.path.exists(filename), "File not found: %s" % filename
    matrix = jmath.transpose(list(filelib.read_cols(filename)))
    assert matrix, "No data in %s" % filename
    for line in matrix:
        assert line

    # Drop the metadata lines in one pass.
    metadata_suffixes = (" direction", " pvalue", " significant")
    matrix = [
        line for line in matrix if not line[0].endswith(metadata_suffixes)]

    # The two header lines are indexed unconditionally below, so fail
    # with a clear message instead of an IndexError.
    assert len(matrix) >= 2, (
        "Too few columns in %s: need at least 2, found %d." %
        (filename, len(matrix)))

    sample_row = 0
    if matrix[1][0].upper() == "SAMPLE":
        sample_row = 1
    col_names = {tdf.SAMPLE_NAME: matrix[sample_row][1:]}
    synonyms = {const.COL_ID: tdf.SAMPLE_NAME}

    geneset_names = []
    data = []
    for line in matrix[2:]:
        geneset_names.append(line[0])
        data.append([jmath.safe_float(value) for value in line[1:]])
    row_names = {'geneset': geneset_names}

    M = Matrix.InMemoryMatrix(data,
                              row_names=row_names,
                              col_names=col_names,
                              synonyms=synonyms)
    return M
Exemple #6
0
def _tdf_to_gct(X):
    """Convert a tab-delimited-format matrix to a GCT-format matrix.

    The GCT row annotations "NAME" and "DESCRIPTION" are chosen as
    follows:
      - no row annotations in X: both are made-up ("NAME<i>", "DESC<i>");
      - one row annotation: it becomes NAME, DESCRIPTION is made up;
      - two or more: NAME is the annotation registered as ROW_ID (the
        first one if no ROW_ID is set) and DESCRIPTION is the first
        annotation that is not NAME.

    Only the first column annotation of X is kept.  The result passes
    gct_format.is_matrix.
    """
    from genomicode import Matrix
    from genomicode import parselib
    assert tab_delimited_format.is_matrix(X)

    assert len(X.col_names()) >= 1
    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME

    name_header = "NAME"
    desc_header = "DESCRIPTION"

    # Placeholder values, replaced below when X has usable annotations.
    name = ["NAME%s" % x for x in parselib.pretty_range(0, X.nrow())]
    desc = ["DESC%s" % x for x in parselib.pretty_range(0, X.nrow())]

    headers = X.row_names()
    if headers:
        if len(headers) == 1:
            # A single annotation can only serve as the name.
            name = X.row_names(headers[0])
        else:
            # Default to the first two annotations, but let ROW_ID pick
            # the name if it is set.
            name_pos = 0
            if ROW_ID in X._synonyms:
                name_pos = headers.index(X._synonyms[ROW_ID])
            # The description is the first annotation not used as name.
            desc_pos = 0 if name_pos == 1 else 1
            assert name_pos != desc_pos
            name = X.row_names(headers[name_pos])
            desc = X.row_names(headers[desc_pos])

    sample_header = X._col_order[0]
    gct_row_names = {name_header: name, desc_header: desc}
    gct_col_names = {
        tab_delimited_format.SAMPLE_NAME: X._col_names[sample_header]}
    gct_synonyms = {ROW_ID: name_header, COL_ID: sample_header}

    gct_matrix = Matrix.InMemoryMatrix(
        X._X,
        row_names=gct_row_names,
        col_names=gct_col_names,
        row_order=[name_header, desc_header],
        col_order=[sample_header],
        synonyms=gct_synonyms)
    assert gct_format.is_matrix(gct_matrix)
    return gct_matrix
Exemple #7
0
def _convert_to_pcl(MATRIX):
    """Convert MATRIX to PCL format.

    The PCL matrix has two row annotations, <ID> and NAME, and a single
    column annotation holding the sample names.  The ID and NAME sources
    are picked with _choose_gene_id and _choose_gene_label.

    Raises AssertionError if a gene ID is blank after stripping, if two
    stripped gene IDs are equal, or if the result is not a PCL matrix.
    """
    import arrayio
    from genomicode import Matrix

    # Select from the row names an ID and a NAME.
    id_name = _choose_gene_id(MATRIX)
    name_name = _choose_gene_label(MATRIX)

    # Blank or duplicated gene IDs make cluster complain, so reject
    # them up front.
    seen_ids = set()
    for raw_id in MATRIX.row_names(id_name):
        gene_id = raw_id.strip()
        assert gene_id, "Missing gene IDs (header %s)." % id_name
        assert gene_id not in seen_ids, "Duplicate gene ID %s." % gene_id
        seen_ids.add(gene_id)

    # Headers that must not be used for the ID column of a PCL file:
    #   "GID"   cluster adds its own "GID" column when clustering, which
    #           would leave two columns with that name.
    #   "NAME"  GCT files use it for the ID, which conflicts with the
    #           PCL definition.
    reserved = {"GID": "GID.OLD", "NAME": "ID.NAME"}
    pretty_id_name = reserved.get(id_name, id_name)
    pretty_name_name = "NAME"

    SAMPLE_NAME = arrayio.tab_delimited_format.SAMPLE_NAME

    row_names = {}
    row_names[pretty_id_name] = MATRIX.row_names(id_name)
    row_names[pretty_name_name] = MATRIX.row_names(name_name)
    col_names = {SAMPLE_NAME: MATRIX.col_names(arrayio.COL_ID)}
    synonyms = {}
    synonyms[arrayio.ROW_ID] = pretty_id_name
    synonyms[arrayio.COL_ID] = SAMPLE_NAME

    pcl_matrix = Matrix.InMemoryMatrix(
        MATRIX.slice(),
        row_names=row_names,
        col_names=col_names,
        row_order=[pretty_id_name, pretty_name_name],
        col_order=[SAMPLE_NAME],
        synonyms=synonyms)
    assert arrayio.pcl_format.is_matrix(pcl_matrix)
    return pcl_matrix
Exemple #8
0
def format_rsem_isoforms(txt_file, outfile):
    """Annotate an RSEM isoform file and write it as a tab-delimited matrix.

    Steps:
      1. Read txt_file and identify the platform of its probe IDs
         (minimum score 0.8).
      2. If the platform is UCSC_human_hg19_kg5, add the kg7 IDs as a
         row annotation and use them as the probe IDs from then on.
      3. Add 'Entrez_ID_human' and 'Entrez_Symbol_human' row annotations.
      4. Keep only the even-indexed data columns and sample names, and
         drop the 'isoform_id' column annotation.
      5. Write the result to outfile.

    Raises AssertionError if no platform can be identified or if the
    'isoform_id' column annotation does not contain both
    'scaled_estimate' and 'raw_count'.

    NOTE(review): Matrix, arrayannot and make_matrix_new_ids are not
    imported here and are expected to be available at module level.
    """
    import arrayio
    from genomicode import arrayplatformlib

    M = arrayio.read(txt_file)
    # Detect the platform.
    scored = arrayplatformlib.score_matrix(M, min_score=0.8)
    assert scored, "Cannot identify platform."
    header, platform = scored.header, scored.platform_name

    probe_ids = M.row_names(header)
    # If kg5, convert to kg7.
    if platform == 'UCSC_human_hg19_kg5':
        new_platform = 'UCSC_human_hg19_kg7'
        kg7_ids = arrayannot.convert_probe_ids(probe_ids, new_platform)
        kg7_header = 'Hybridization REF kg7'
        M = make_matrix_new_ids(M, kg7_ids, kg7_header, 1)
        probe_ids = M.row_names(kg7_header)

    # Add LocusLink IDs and gene symbols.
    LocusLink_ids = arrayannot.convert_probe_ids(probe_ids, 'Entrez_ID_human')
    gene_symbol_ids = arrayannot.convert_probe_ids(probe_ids,
                                                   'Entrez_Symbol_human')
    newMatrix = make_matrix_new_ids(M, LocusLink_ids, 'Entrez_ID_human', 2)
    newMatrix = make_matrix_new_ids(newMatrix, gene_symbol_ids,
                                    'Entrez_Symbol_human', 3)

    # Get rid of scaled_estimate.  Presumably the columns alternate
    # raw_count / scaled_estimate, so keeping the even-indexed ones
    # keeps raw_count -- TODO confirm against the RSEM file layout.
    assert 'scaled_estimate' in newMatrix._col_names['isoform_id']
    assert 'raw_count' in newMatrix._col_names['isoform_id']
    col_names = {
        '_SAMPLE_NAME': list(newMatrix._col_names['_SAMPLE_NAME'][::2]),
    }
    row_names = newMatrix._row_names.copy()
    row_order = newMatrix._row_order[:]
    col_order = newMatrix._col_order[:]
    col_order.remove('isoform_id')
    synonyms = newMatrix._synonyms.copy()
    X = [list(line[::2]) for line in newMatrix._X]

    out_matrix = Matrix.InMemoryMatrix(X,
                                       row_names=row_names,
                                       col_names=col_names,
                                       row_order=row_order,
                                       col_order=col_order,
                                       synonyms=synonyms)
    # Close the output file even if writing fails.
    with open(outfile, 'w') as f:
        arrayio.tab_delimited_format.write(out_matrix, f)
Exemple #9
0
def _pcl_to_gct(X):
    """Convert a PCL-format matrix to a GCT-format matrix.

    Column annotations other than the sample names are lost (PCL files
    can carry several, e.g. EWEIGHT).

    The first row annotation of X becomes "NAME".  The second one, if
    there is one, becomes "DESCRIPTION"; otherwise the descriptions are
    made up as "DESC<i>".  The result passes gct_format.is_matrix.
    """
    from genomicode import Matrix
    from genomicode import parselib
    assert pcl_format.is_matrix(X)

    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME
    headers = X.row_names()
    assert len(headers) > 0

    # Pick the annotations used for the row name and the description.
    row_name = headers[0]
    row_desc = None
    if len(headers) != 1:
        row_desc = headers[1]

    sample_header = tab_delimited_format.SAMPLE_NAME

    gct_row_names = {}
    gct_row_names["NAME"] = X._row_names[row_name]
    if row_desc:
        gct_row_names["DESCRIPTION"] = X._row_names[row_desc]
    else:
        # No usable description annotation, so make up default values.
        gct_row_names["DESCRIPTION"] = [
            "DESC%s" % i for i in parselib.pretty_range(0, X.nrow())]
    gct_col_names = {sample_header: X._col_names[sample_header]}
    gct_synonyms = {ROW_ID: "NAME", COL_ID: sample_header}

    gct_matrix = Matrix.InMemoryMatrix(
        X._X,
        row_names=gct_row_names,
        col_names=gct_col_names,
        row_order=["NAME", "DESCRIPTION"],
        col_order=[sample_header],
        synonyms=gct_synonyms)
    assert gct_format.is_matrix(gct_matrix)
    return gct_matrix
Exemple #10
0
def make_matrix_new_ids(DATA, output_ids, header, index):
    """Return a new matrix like DATA with one more row annotation.

    output_ids is stored under the row annotation `header`, and `header`
    is inserted into the row order at position `index`.  The annotation
    dicts and order lists of DATA are copied before being changed; the
    data itself (DATA._X) is passed through as is.  Nothing is written
    to disk.
    """
    new_row_order = DATA._row_order[:]
    new_row_order.insert(index, header)
    new_row_names = DATA._row_names.copy()
    new_row_names[header] = output_ids

    return Matrix.InMemoryMatrix(
        DATA._X,
        row_names=new_row_names,
        col_names=DATA._col_names.copy(),
        row_order=new_row_order,
        col_order=DATA._col_order[:],
        synonyms=DATA._synonyms.copy())
Exemple #11
0
def _res_to_pcl(X):
    """Convert a RES-format matrix to a PCL-format matrix.

    The conversion is lossy: the CALL information, the SCALE_FACTOR and
    the column DESCRIPTIONS are not carried over.

    The accession annotation keeps its header, unless that header is
    "NAME", in which case it is stored under "ORIGINAL_NAME".  The
    description annotation is stored under "NAME".  The result passes
    pcl_format.is_matrix.
    """
    from genomicode import Matrix
    assert res_format.is_matrix(X)

    # The row annotations come as three headers.  Of the first two, the
    # one registered as ROW_ID is the accession and the other one is the
    # description.  The third (call) header is not used.
    first, second, _unused_call_header = X.row_names()
    if X._synonyms[ROW_ID] == first:
        acc_header, desc_header = first, second
    else:
        acc_header, desc_header = second, first
    assert X._synonyms[ROW_ID] == acc_header

    # "NAME" is reserved for the description, so avoid a clash.
    if acc_header == "NAME":
        row_name = "ORIGINAL_NAME"
    else:
        row_name = acc_header

    sample_header = X._col_order[0]
    pcl_row_names = {}
    pcl_row_names[row_name] = X.row_names(acc_header)
    pcl_row_names["NAME"] = X.row_names(desc_header)
    pcl_col_names = {sample_header: X.col_names(sample_header)}
    pcl_synonyms = {ROW_ID: row_name, COL_ID: sample_header}

    pcl_matrix = Matrix.InMemoryMatrix(
        X._X,
        row_names=pcl_row_names,
        col_names=pcl_col_names,
        row_order=[row_name, "NAME"],
        col_order=[sample_header],
        synonyms=pcl_synonyms)
    assert pcl_format.is_matrix(pcl_matrix)
    return pcl_matrix
Exemple #12
0
def _res_to_tdf(X):
    """Convert a RES-format matrix to a tab-delimited-format matrix.

    All annotations of X are reused as they are (not copied); only the
    row order is rebuilt so that the accession comes first, then the
    description, then the call annotation.  The result passes
    tab_delimited_format.is_matrix.
    """
    from genomicode import Matrix
    assert res_format.is_matrix(X)

    # Of the first two row headers, the one registered as ROW_ID is the
    # accession and the other one is the description.
    first, second, call_header = X.row_names()
    if X._synonyms[ROW_ID] == first:
        acc_header, desc_header = first, second
    else:
        acc_header, desc_header = second, first
    assert X._synonyms[ROW_ID] == acc_header

    tdf_matrix = Matrix.InMemoryMatrix(
        X._X,
        row_names=X._row_names,
        col_names=X._col_names,
        row_order=[acc_header, desc_header, call_header],
        col_order=X._col_order,
        synonyms=X._synonyms)
    assert tab_delimited_format.is_matrix(tdf_matrix)
    return tdf_matrix
Exemple #13
0
def _tdf_to_pcl(X):
    """Convert a tab-delimited-format matrix to a PCL-format matrix.

    The gene ID and NAME row annotations are chosen as follows:
      - no row annotations in X: IDs are made up ("GENE<i>") under the
        header "GENE_ID", and there is no NAME annotation;
      - one row annotation: it is the gene ID and keeps its header, and
        there is no NAME annotation;
      - two or more: the gene ID is the annotation registered as ROW_ID
        (the first one if no ROW_ID is set) and NAME is the first
        annotation that is not the gene ID.

    A gene ID header equal to "NAME" is replaced by "GENE_ID".  Only the
    first column annotation of X is kept.  The result passes
    pcl_format.is_matrix.
    """
    from genomicode import Matrix
    from genomicode import parselib
    assert tab_delimited_format.is_matrix(X)

    assert len(X.col_names()) >= 1
    assert X._col_order and X._col_order[0] == tab_delimited_format.SAMPLE_NAME

    name_header = "NAME"
    # Defaults, used as is when X has no row annotations.
    id_header = "GENE_ID"
    geneid = ["GENE%s" % x for x in parselib.pretty_range(0, X.nrow())]
    name = None

    headers = X.row_names()
    if headers:
        if len(headers) == 1:
            # A single annotation can only serve as the gene ID.
            id_header = headers[0]
            geneid = X.row_names(id_header)
        else:
            # Default to the first two annotations, but let ROW_ID pick
            # the gene ID if it is set.
            id_pos = 0
            if ROW_ID in X._synonyms:
                id_header = X._synonyms[ROW_ID]
                id_pos = headers.index(id_header)
            # The name is the first annotation not used as gene ID.
            name_pos = 0 if id_pos == 1 else 1
            assert id_pos != name_pos
            geneid = X.row_names(headers[id_pos])
            name = X.row_names(headers[name_pos])

    # The ID header must not collide with the reserved NAME header.
    if id_header == name_header:
        id_header = "GENE_ID"
    assert id_header != name_header

    pcl_row_order = [id_header]
    pcl_row_names = {id_header: geneid}
    if name is not None:
        pcl_row_order.append(name_header)
        pcl_row_names[name_header] = name

    sample_header = X._col_order[0]
    pcl_col_names = {
        tab_delimited_format.SAMPLE_NAME: X._col_names[sample_header]}
    pcl_synonyms = {ROW_ID: id_header, COL_ID: sample_header}

    pcl_matrix = Matrix.InMemoryMatrix(
        X._X,
        row_names=pcl_row_names,
        col_names=pcl_col_names,
        row_order=pcl_row_order,
        col_order=[sample_header],
        synonyms=pcl_synonyms)
    assert pcl_format.is_matrix(pcl_matrix)
    return pcl_matrix
Exemple #14
0
def summarize_factor_scores(file_layout, factor_cutoff, python, arrayplot,
                            cluster, libpath):
    """Write the factor scores of the model as a PCL file and a heatmap.

    The factor matrix is model["F"], where the model is read with
    _read_model(file_layout, factor_cutoff).  The factor names are read
    from file_layout.BFRM_FACTOR_IDS (one per line) and reordered with
    model["FACTOR_O"].  The scores are written to
    file_layout.FACTOR_SCORES, the heatmap to
    file_layout.FACTOR_SCORES_PNG, and any cluster files left behind
    (FACTOR_CDT, FACTOR_ATR, FACTOR_GTR) are moved to file_layout.ATTIC.

    python, arrayplot, cluster and libpath are passed through to
    graphlib.plot_heatmap.

    If the model has no factors, a message is printed and nothing is
    generated.
    """
    import os
    import arrayio
    from genomicode import Matrix
    from genomicode import graphlib

    DATA = arrayio.read(file_layout.DATASET)
    model = _read_model(file_layout, factor_cutoff)

    F = model["F"]
    # If there were no factors, then don't generate any files.
    if not F.nrow():
        print("Not generating factor scores file.  No factors detected.")
        return
    assert F.ncol() == DATA.ncol()

    # Read the factor names, closing the file when done.
    with open(file_layout.BFRM_FACTOR_IDS) as handle:
        factor_names = [line.strip() for line in handle]
    assert len(factor_names) == F.nrow()
    # The factor names are in the same order as the data files.  Sort
    # them so they'll be in the same order as the clean model.
    factor_names = [factor_names[i] for i in model["FACTOR_O"]]

    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {"xID": factor_names}
    col_names = {SAMPLE_NAME: DATA.col_names(SAMPLE_NAME)}
    M = Matrix.InMemoryMatrix(F._X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    xpix, ypix = graphlib.find_wide_heatmap_size(M.nrow(),
                                                 M.ncol(),
                                                 min_box_height=10,
                                                 min_box_width=10,
                                                 max_total_height=768,
                                                 max_total_width=1024)
    ypix = min(ypix, xpix * 4)
    # TODO: Don't show array label if there are too many samples.
    graphlib.plot_heatmap(file_layout.FACTOR_SCORES,
                          file_layout.FACTOR_SCORES_PNG,
                          xpix,
                          ypix,
                          color="bild",
                          show_colorbar=True,
                          show_grid=True,
                          gene_label=True,
                          cluster_genes=True,
                          gene_center="mean",
                          gene_normalize="var",
                          array_label=True,
                          cluster_arrays=True,
                          python=python,
                          arrayplot=arrayplot,
                          cluster=cluster,
                          libpath=libpath)

    # Clean up some of the cluster files by moving them to the attic.
    files = [
        file_layout.FACTOR_CDT, file_layout.FACTOR_ATR, file_layout.FACTOR_GTR
    ]
    for filename in files:
        if not os.path.exists(filename):
            continue
        basename = os.path.split(filename)[1]
        os.rename(filename, os.path.join(file_layout.ATTIC, basename))
Exemple #15
0
def summarize_gene_factor_probs(file_layout, factor_cutoff, python, arrayplot,
                                cluster, libpath):
    """Write the gene x factor probabilities of the model, plus a heatmap.

    model["PostPib"] is written to file_layout.FACTOR_PROBS with the row
    annotations of the genes in the model (the rows of the data set
    selected by model["VariablesIn"]) and plotted to
    file_layout.FACTOR_PROBS_PNG.  If the model has a truthy
    "ExternalProb" entry, it is written to file_layout.FACTOR_PROBS_ALL
    with the row annotations of the whole data set.

    The factor names are the row IDs of file_layout.FACTOR_SCORES, which
    must already exist.  python, arrayplot, cluster and libpath are
    passed through to graphlib.plot_heatmap.

    If the model has no factors, a message is printed and nothing is
    generated.
    """
    import arrayio
    from genomicode import Matrix
    from genomicode import graphlib

    model = _read_model(file_layout, factor_cutoff)
    PostPib = model["PostPib"]
    ExternalProb = model.get("ExternalProb")

    # Without factors there is nothing to report.
    if not PostPib.ncol():
        print("Not generating factor probabilities file.  No factors detected.")
        return

    # The genes in the model are a subset of the rows of the data set.
    DATA = arrayio.read(file_layout.DATASET)
    DATA_m = DATA.matrix(model["VariablesIn"], None)

    # The factor names come from the factor scores file.
    assert os.path.exists(file_layout.FACTOR_SCORES)
    D_scores = arrayio.read(file_layout.FACTOR_SCORES)
    factor_names = D_scores.row_names(arrayio.ROW_ID)
    assert len(factor_names) == PostPib.ncol()

    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME

    def write_probs(genes, probs_X, outfile):
        # Write probs_X with the row annotations of `genes` and the
        # factor names as the column names.
        headers = genes.row_names()
        annots = dict((h, genes.row_names(h)) for h in headers)
        probs = Matrix.InMemoryMatrix(
            probs_X, annots, {SAMPLE_NAME: factor_names}, headers)
        arrayio.tab_delimited_format.write(probs, outfile)

    # Probabilities for the genes in the model.
    write_probs(DATA_m, PostPib._X, file_layout.FACTOR_PROBS)

    # Heatmap of the factor probs, with fixed 20x20 pixel boxes.
    graphlib.plot_heatmap(
        file_layout.FACTOR_PROBS,
        file_layout.FACTOR_PROBS_PNG,
        20,
        20,
        color="red",
        array_label=True,
        gene_label=True,
        scale=-0.5,
        gain=2.0,
        python=python,
        arrayplot=arrayplot,
        cluster=cluster,
        libpath=libpath)

    # Probabilities for all genes in the data set, if available.
    if ExternalProb:
        write_probs(DATA, ExternalProb._X, file_layout.FACTOR_PROBS_ALL)
Exemple #16
0
def summarize_factor_scores(file_layout, python, arrayplot, cluster, libpath):
    """Write the projected factor scores as a PCL file and a heatmap.

    The clean model is read from the zip archive file_layout.BFRM_MODEL,
    which must contain factorids.txt (one factor name per line).  The
    sample x factor matrix is read from file_layout.BFRM_AF, transposed,
    cut down to the last num_factors rows (the latent factors) and
    reordered with model["FACTOR_O"].  The scores are written to
    file_layout.FACTOR_SCORES, the heatmap to
    file_layout.FACTOR_SCORES_PNG, and any cluster files left behind
    (FACTOR_CDT, FACTOR_ATR, FACTOR_GTR) are moved to file_layout.ATTIC.

    python, arrayplot, cluster and libpath are passed through to
    graphlib.plot_heatmap.
    """
    import os
    import zipfile
    import arrayio
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import archive
    from genomicode import graphlib
    from genomicode import bfrm

    DATA = arrayio.read(file_layout.DATASET)

    param_file = "parameters.txt"
    model = bfrm.read_clean_model(file_layout.BFRM_MODEL,
                                  param_file=param_file)
    num_factors = model["F"].nrow()

    # Load the factor names.
    assert zipfile.is_zipfile(file_layout.BFRM_MODEL)
    s2f = archive.unzip_dict(file_layout.BFRM_MODEL)
    assert "factorids.txt" in s2f, "Missing: factorids.txt"
    zfile = zipfile.ZipFile(file_layout.BFRM_MODEL)
    try:
        factor_names = [x.strip() for x in zfile.open(s2f["factorids.txt"])]
    finally:
        # Release the archive even if reading the member fails.
        zfile.close()
    assert len(factor_names) == num_factors

    # sample x factor matrix
    F = arrayio.read(file_layout.BFRM_AF)
    assert F.nrow() == DATA.ncol()
    F_X = jmath.transpose(F._X)

    # F_X contains all factors, including intercept and design.
    # Remove all but the latent factors.
    F_X = F_X[-num_factors:]

    # Sort the factors so they'll be in the same order as the clean
    # model.
    assert len(F_X) == len(model["FACTOR_O"])
    F_X = [F_X[i] for i in model["FACTOR_O"]]
    factor_names = [factor_names[i] for i in model["FACTOR_O"]]

    # Write out the projected factor scores.
    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {"xID": factor_names}
    col_names = {SAMPLE_NAME: DATA.col_names(SAMPLE_NAME)}
    M = Matrix.InMemoryMatrix(F_X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    xpix, ypix = graphlib.find_wide_heatmap_size(M.nrow(),
                                                 M.ncol(),
                                                 min_box_height=10,
                                                 min_box_width=10,
                                                 max_total_height=768,
                                                 max_total_width=1024)
    ypix = min(ypix, xpix * 4)
    graphlib.plot_heatmap(file_layout.FACTOR_SCORES,
                          file_layout.FACTOR_SCORES_PNG,
                          xpix,
                          ypix,
                          color="bild",
                          show_colorbar=True,
                          show_grid=True,
                          gene_center="mean",
                          gene_normalize="var",
                          gene_label=True,
                          cluster_genes=True,
                          array_label=True,
                          cluster_arrays=True,
                          python=python,
                          arrayplot=arrayplot,
                          cluster=cluster,
                          libpath=libpath)

    # Clean up the cluster files by moving them to the attic.
    files = [
        file_layout.FACTOR_CDT, file_layout.FACTOR_ATR, file_layout.FACTOR_GTR
    ]
    for filename in files:
        if not os.path.exists(filename):
            continue
        basename = os.path.split(filename)[1]
        os.rename(filename, os.path.join(file_layout.ATTIC, basename))
Exemple #17
0
def convert_matrix(filename, header, header_and_platform, in_delim, out_delim,
                   keep_dups, keep_emptys, no_na, out_platforms,
                   min_match_score, debug):
    """Add converted gene IDs to a matrix and write it to sys.stdout.

    The matrix is read from filename.  The row annotation holding the
    gene IDs and the platform of those IDs are determined in one of
    three ways:
      - header: the annotation is given and the platform is the best
        match for its (normalized) IDs;
      - header_and_platform: a "header,platform" string giving both;
      - neither: the best-scoring header/platform pair for the whole
        matrix is used, which must score at least MIN_SCORE.
    header and header_and_platform must not both be set.

    The IDs are then converted to each platform in out_platforms with
    arrayannot.convert_gene_ids, and each result is appended as a new
    row annotation named after the platform (with a numeric suffix if
    that name is already taken).  The matrix is written to sys.stdout in
    tab-delimited format.

    in_delim, out_delim, keep_dups, keep_emptys and no_na are passed
    through to arrayannot.convert_gene_ids; in_delim is also used when
    normalizing and scoring the IDs.  With debug set, a table of the
    candidate platforms is printed when automatic detection falls below
    MIN_SCORE.

    Raises AssertionError if no platform can be identified or if the
    match score is below min_match_score.
    """
    import arrayio
    from genomicode import Matrix
    from genomicode import arrayplatformlib as apl
    from genomicode import arrayannot

    # Minimum score required when the platform is detected automatically.
    MIN_SCORE = 0.80
    REMOVE_VERSION = True

    assert not (header and header_and_platform)

    DATA = arrayio.read(filename)

    if header:
        # The header is known; score its IDs against the platforms.
        x = DATA.row_names(header)
        gene_ids = apl.normalize_ids(x,
                                     delimiter=in_delim,
                                     remove_version_number=REMOVE_VERSION)
        x = apl.score_annotations(gene_ids, min_score=0.5)
        assert x, "I could not identify the platform for %s." % header
        best_score = x[0]
        in_platform, score = best_score.platform_name, best_score.max_score
    elif header_and_platform:
        # Both are given as "header,platform"; trust them fully.
        x = header_and_platform.split(",", 1)
        assert len(x) == 2
        header, in_platform = x
        score = 1.0
        x = DATA.row_names(header)
        gene_ids = apl.normalize_ids(x,
                                     delimiter=in_delim,
                                     remove_version_number=REMOVE_VERSION)
        assert apl.find_platform_by_name(in_platform), \
               "Unknown platform: %s" % in_platform
    else:
        # Take the platform with the highest match score.
        scores = apl.score_matrix(DATA,
                                  annot_delim=in_delim,
                                  min_score=None,
                                  remove_version=REMOVE_VERSION)
        best_score = 0
        if scores:
            best_score = scores[0].max_score
        if best_score < MIN_SCORE and debug and scores:
            # Print a table of the candidates before the assertion below
            # fails.  Note that `header` temporarily holds the column
            # titles of that table here.
            header = ("Header", "Platform", "Score", "Matrix Only",
                      "Plat Only", "Shared", "Matrix Only", "Plat Only",
                      "Shared")
            print "\t".join(header)
            for s in scores:
                # Show the counts and up to three example IDs per group.
                x1 = sorted(s.mine_only)[:3]
                x2 = sorted(s.platform_only)[:3]
                x3 = sorted(s.shared)[:3]
                x1 = ", ".join(x1)
                x2 = ", ".join(x2)
                x3 = ", ".join(x3)
                x = (s.header, s.platform_name, s.max_score, len(s.mine_only),
                     len(s.platform_only), len(s.shared), x1, x2, x3)
                assert len(x) == len(header)
                print "\t".join(map(str, x))
        assert best_score >= MIN_SCORE, "No platforms found"
        # From here on best_score is the score object, then its value.
        best_score = scores[0]
        header = best_score.header
        in_platform = best_score.platform_name
        score = best_score = best_score.max_score
    err = "I could not find any platforms.  The best was %s (%g)." % (
        in_platform, score)
    assert score >= min_match_score, err
    # The IDs are re-read from the matrix here, so the conversion below
    # gets the original IDs rather than the normalized ones from above.
    gene_ids = DATA.row_names(header)

    # Convert each of the platforms.
    output_ids_list = []
    for out_platform in out_platforms:
        x = arrayannot.convert_gene_ids(gene_ids, in_platform, out_platform,
                                        in_delim, out_delim, keep_dups,
                                        keep_emptys, no_na)
        output_ids_list.append(x)

    # Make a matrix with the new IDs.  The annotations are copied so
    # that DATA itself is not changed.
    X = DATA._X
    row_names = DATA._row_names.copy()
    row_order = DATA._row_order[:]
    col_names = DATA._col_names.copy()
    col_order = DATA._col_order[:]
    synonyms = DATA._synonyms.copy()

    for (out_platform, output_ids) in zip(out_platforms, output_ids_list):
        # Name the new annotation after the platform, adding _1, _2, ...
        # until the name is not already in use.
        header = out_platform
        i = 1
        while header in row_order:
            header = "%s_%d" % (out_platform, i)
            i += 1
        row_order.append(header)
        row_names[header] = output_ids

    # Write the matrix to standard output.
    x = Matrix.InMemoryMatrix(X,
                              row_names=row_names,
                              col_names=col_names,
                              row_order=row_order,
                              col_order=col_order,
                              synonyms=synonyms)
    arrayio.tab_delimited_format.write(x, sys.stdout)
Exemple #18
0
def read(handle, hrows=None, hcols=None, datatype=float):
    """Read a RES file and return it as a Matrix.InMemoryMatrix.

    handle    Opened with filelib.openfh.
    hrows     Not used by this reader.
    hcols     Not used by this reader.
    datatype  How to convert the expression values.  None means no
              conversion, int and float select jmath.safe_int and
              jmath.safe_float, and anything else is called on each
              value.

    The matrix gets three row annotations (the accession and
    description columns, plus "CALL", which holds the A/P/M calls of
    a gene joined into one string with one character per sample) and
    three column annotations (the sample name, "DESCRIPTION" and
    "SCALE_FACTOR").  A file that does not look like a RES file fails
    one of the assertions.

    """
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import Matrix
    import tab_delimited_format as tdf
    import const

    # iolib.split_tdf is not used to split the file because it does
    # not handle empty lines properly.  Those can occur in a file
    # without samples.
    rows = [
        line.rstrip("\r\n").split("\t") for line in filelib.openfh(handle)]
    assert len(rows) >= 3, "Invalid RES file."

    # Check the shape of the two header lines and the gene count line.
    assert len(rows[0]) == len(rows[1]) + 1
    leading = [name.upper() for name in rows[0][:2]]
    assert sorted(leading) == ["ACCESSION", "DESCRIPTION"]
    assert len(rows[2]) == 1, "%d: %s" % (len(rows[2]), repr(rows[2]))

    # The third line is the number of genes.  Take it out so that only
    # the two header lines and the genes remain.
    num_genes = int(rows.pop(2)[0])
    assert len(rows) == num_genes + 2  # data + 2 headers

    # GenePattern writes files whose last column is completely blank.
    # Drop that column if there is one.
    if not any(row[-1] for row in rows):
        rows = [row[:-1] for row in rows]

    name_line, description_line = rows[0], rows[1]
    gene_rows = rows[2:]

    # After the two annotation columns, the first header line
    # alternates between a sample name and a blank.
    sample_names = []
    for pos, name in enumerate(name_line[2:]):
        if pos % 2 == 0:
            assert name
            sample_names.append(name)
        else:
            assert not name

    # The second header line alternates between a blank and the
    # description of a sample.
    sample_description = []
    for pos, text in enumerate(description_line):
        if pos % 2:
            assert text
            sample_description.append(text)
        else:
            assert not text
    assert len(sample_description) == len(sample_names)

    # A description may end with "/scale factor=<number>".  Move the
    # number into its own annotation.  Samples without a scale factor
    # keep an empty string.
    marker = "scale factor"
    scale_factors = [""] * len(sample_description)
    for idx, text in enumerate(sample_description):
        start = text.lower().find(marker)
        if start < 0:
            continue
        assert text[start - 1] == "/"
        assert text[start + len(marker)] == "="
        scale_factors[idx] = float(text[start + len(marker) + 1:])
        sample_description[idx] = text[:start - 1]

    # The accession and description columns can come in either order.
    acc_col, desc_col = 0, 1
    if [name.upper() for name in name_line[:2]] == [
            "DESCRIPTION", "ACCESSION"]:
        acc_col, desc_col = 1, 0
    accession_header = name_line[acc_col]
    description_header = name_line[desc_col]
    accession = [row[acc_col] for row in gene_rows]
    description = [row[desc_col] for row in gene_rows]
    assert (accession_header.upper(), description_header.upper()) == \
           ("ACCESSION", "DESCRIPTION")

    # Accession should be unique.
    assert len(set(accession)) == len(accession)

    # Each sample takes up two columns: the value and then the call.
    matrix = []
    calls = []
    for row in gene_rows:
        cells = row[2:]
        values = cells[0::2]
        flags = cells[1::2]
        assert len(values) == len(flags)
        for flag in flags:
            assert flag.upper() in ["A", "P", "M"], flag
        matrix.append(values)
        calls.append(flags)
    assert len(matrix) == num_genes

    # Choose the conversion function.  None means no conversion.
    convert_fn = datatype
    if datatype is int:
        convert_fn = jmath.safe_int
    elif datatype is float:
        convert_fn = jmath.safe_float
    if convert_fn:
        matrix = [map(convert_fn, values) for values in matrix]

    row_order = name_line[:2] + ["CALL"]
    col_order = [tdf.SAMPLE_NAME, "DESCRIPTION", "SCALE_FACTOR"]

    row_names = {}
    row_names[accession_header] = accession
    row_names[description_header] = description
    row_names["CALL"] = ["".join(flags) for flags in calls]

    col_names = {}
    col_names[tdf.SAMPLE_NAME] = sample_names
    col_names["DESCRIPTION"] = sample_description
    col_names["SCALE_FACTOR"] = scale_factors

    synonyms = {}
    synonyms[const.COL_ID] = tdf.SAMPLE_NAME
    synonyms[const.ROW_ID] = accession_header

    X = Matrix.InMemoryMatrix(matrix,
                              row_names=row_names,
                              col_names=col_names,
                              row_order=row_order,
                              col_order=col_order,
                              synonyms=synonyms)
    assert is_matrix(X)
    return X
def read(handle, hrows=None, hcols=None, datatype=float):
    """Read a tab-delimited gene x experiment file.

    Format:
    - gene x experiment
    - optional header row
    - optional rows of sample annotations (requires header row)
    - optional columns of gene annotations

    handle    Opened with filelib.openfh.  If it is a string, it is
              also shown as the file name in error messages.
    hrows     Number of header rows.  If None, it is guessed with
              util.num_headers.
    hcols     Number of header columns (gene annotations).  If None,
              it is guessed with util.num_headers.
    datatype  How to convert the values.  None means no conversion,
              int and float select jmath.safe_int and
              jmath.safe_float, and anything else is called on each
              value.  With float, jmath.safe_int is used instead if
              jmath.is_int accepts every value.

    Returns an empty Matrix.InMemoryMatrix if there are no data.

    Raises an AssertionError if a line does not have the same number
    of columns as the first one, or if a header is duplicated.  Raises
    a ValueError that names the row if a value cannot be converted.

    """
    import math
    from genomicode import filelib
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import iolib
    import util
    import const

    filename = None
    if isinstance(handle, str):
        filename = handle
    handle = filelib.openfh(handle)
    data = filelib.read_all_cols(handle)
    data = _clean_tdf(data)

    # This has to be checked before looking at the first line, or an
    # empty file will raise an IndexError.
    if not data:
        return Matrix.InMemoryMatrix([])

    # Every line should have as many columns as the first one.
    num_cols = len(data[0])
    f = ""
    if filename:
        f = " [%s]" % filename
    for i, x in enumerate(data):
        nc = len(x)
        error_msg = "Header%s has %d columns but line %d has %d." % (
            f, num_cols, i + 1, nc)
        assert nc == num_cols, error_msg

    # If the rows and cols not explicitly specified, then try to guess
    # them from the file.
    if hrows is None or hcols is None:
        hr, hc = util.num_headers(data)
        if hrows is None:
            hrows = hr
        if hcols is None:
            hcols = hc

    # Pull out the row names from the columns.
    row_names = {}  # header -> list of names (1 for each gene)
    row_order = []  # in-order list of the headers
    if hcols:
        if hrows:
            # If a header row is provided, then the names of these
            # annotations are provided in the header.
            row_order = data[0][:hcols]
        else:
            # No header row.  Make default name for these annotations.
            # NOTE(review): if hcols is an exact power of ten, the
            # width is one less than the number of digits in hcols.
            ndigits = int(math.ceil(math.log(hcols, 10)))
            row_order = ["ANNOT%*d" % (ndigits, i + 1) for i in range(hcols)]

        # Sometimes the format detection can go wrong and a GCT file
        # will slip through to here.  If this occurs, a "duplicate
        # header" exception will be generated.  Check for this and
        # generate a more meaningful error message.
        if (row_order[0] == "#1.2" and len(row_order) > 1
                and row_order[1] == "" and row_order[-1] == ""):
            raise AssertionError("ERROR: It looks like a GCT file was missed.")
        for i, header in enumerate(row_order):
            names = [x[i] for x in data[hrows:]]
            assert header not in row_names, "duplicate header: %s" % header
            row_names[header] = names

    # Pull out the column names.  The header rows after the first one
    # are sample annotations, named by their first column.
    col_names = {}  # header -> list of names (1 for each array)
    col_order = []
    if hrows:
        for i in range(1, hrows):
            header = data[i][0]
            names = data[i][hcols:]
            assert header not in col_names, "duplicate name: %s" % header
            col_order.append(header)
            col_names[header] = names

    # Now extract the expression values.
    matrix = data
    if hrows or hcols:
        matrix = [x[hcols:] for x in matrix[hrows:]]

    # Pull out the sample names.
    sample_names = None
    if hrows:
        # If a header is provided, then use these as the column names.
        sample_names = data[0][hcols:]
    if sample_names:
        col_names[SAMPLE_NAME] = sample_names
        col_order.insert(0, SAMPLE_NAME)

    if datatype is None:
        convert_fn = None  # no conversion
    elif datatype is int:
        convert_fn = jmath.safe_int
    elif datatype is float:
        convert_fn = jmath.safe_float
    else:
        # Assume that I was passed a function.
        convert_fn = datatype

    if convert_fn == jmath.safe_float:
        # Try and convert to an integer instead.
        if all(jmath.is_int(value) for row in matrix for value in row):
            convert_fn = jmath.safe_int

    if convert_fn:
        check_each_row = False
        try:
            # Build the lists right here, so that a bad value raises
            # the exception inside of this try block.
            matrix = [[convert_fn(value) for value in x] for x in matrix]
        except ValueError as err1:
            msg = str(err1)
            is_float_error = msg == "empty string for float()" or \
                msg.startswith(("invalid literal for float()",
                                "could not convert string to float"))
            if not is_float_error:
                raise
            check_each_row = True
        if check_each_row:
            # If there was an exception, then check each row carefully
            # to try to pinpoint the problem.
            for i, x in enumerate(matrix):
                try:
                    [convert_fn(value) for value in x]
                except ValueError as err2:
                    row = data[hrows + i]
                    raise ValueError("%s\nProblem with row %d: %s" %
                                     (str(err2), i + 1, row))
            raise AssertionError("Error converting values.")
Exemple #20
0
def read(handle, datatype=None):
    """Read an ODF 1.0 file and return it as a Matrix.InMemoryMatrix.

    handle    Opened with filefns.openfh.
    datatype  Not used here.  The type of each column is explicitly
              specified in the file (COLUMN_TYPES).

    The format description doesn't specify whether the header names
    are case sensitive, so they are accepted case insensitively and
    stored in uppercase.  The first two columns of the data are taken
    as the row names and descriptions, and the remaining ones as the
    matrix.  Columns of type FLOAT are converted with float, and
    columns of type STRING are left alone.

    A file that does not follow the format fails one of the
    assertions.  A value in a FLOAT column that float cannot parse
    raises a ValueError.

    """
    from genomicode import filefns
    from genomicode import Matrix
    import const

    handle = filefns.openfh(handle)

    # Read the version line, the number of header lines, and then the
    # header lines themselves.
    x = handle.readline().strip()
    assert x == "ODF 1.0", "Missing ODF version."
    x = handle.readline().strip().split("=")
    assert len(x) == 2
    assert x[0].upper() == "HEADERLINES"
    header_lines = int(x[1])
    assert header_lines >= 3 and header_lines <= 7, \
           "Invalid number of header lines."
    lines = [handle.readline() for i in range(header_lines)]
    lines = [x for x in lines if x]   # remove blank lines if EOF
    assert len(lines) == header_lines, "Wrong number of lines in header."

    # Parse the header lines.
    header = {}  # name (uppercase) -> value
    num_cols = None   # just the data, not the annotation headers.
    for line in lines:
        # COLUMN_* headers are "name:value".  The rest are "name=value".
        delimiter = "="
        if line.upper().startswith("COLUMN"):
            delimiter = ":"
        assert delimiter in line, "Header missing delimiter '%s': %s" % (
            delimiter, line)
        # Split only on the first delimiter.  It can occur again in
        # the value.
        name, value = line.split(delimiter, 1)
        name, value = name.strip(" \r\n").upper(), value.strip(" \r\n")
        if name.startswith("COLUMN"):
            value = value.split("\t")
            num_data = len(value)
            if name in ["COLUMN_TYPES", "COLUMN_NAMES"]:
                # Contains metadata describing the annotations.
                num_data = len(value)-2
            if num_cols is None:
                num_cols = num_data
            assert num_data == num_cols
        header[name] = value

    # Make sure the required headers exist before using them.
    assert "MODEL" in header, 'Missing "Model" header.'
    assert "DATALINES" in header, 'Missing "DataLines" header.'
    assert "COLUMN_TYPES" in header, 'Missing "COLUMN_TYPES" header.'
    assert num_cols is not None  # Should come from COLUMN_TYPES.
    header["DATALINES"] = int(header["DATALINES"])
    assert header["DATALINES"] >= 0 and header["DATALINES"] < 1E6

    # Read the data block.
    lines = [handle.readline() for i in range(header["DATALINES"])]
    lines = [x for x in lines if not x.startswith("#")]  # no comments
    data = [x.rstrip("\r\n").split("\t") for x in lines]
    # There might be leftover information in the file.  The format
    # does not describe how to handle this case.

    # Parse the column names out of the header.
    col_types = header["COLUMN_TYPES"]                    # required
    col_names = header.get("COLUMN_NAMES")                # optional
    col_descriptions = header.get("COLUMN_DESCRIPTIONS")  # optional

    # Make sure each line has the right number of columns.
    for x in data:
        assert len(x) == num_cols+2  # add the 2 columns of annotations

    # Convert the types of the data.  col_types is parallel to the
    # columns of the data and includes the 2 columns of annotations,
    # so go through all of them.
    for j, coltype in enumerate(col_types):
        ucoltype = coltype.upper()
        if ucoltype == "STRING":
            continue
        if ucoltype != "FLOAT":
            raise AssertionError("Unknown column type: %s" % coltype)
        for row in data:
            row[j] = float(row[j])

    # The first two columns are for the row names and description.
    col0 = [x[0] for x in data]
    col1 = [x[1] for x in data]
    head0, head1 = "Name", "Description"
    if col_names:
        head0, head1 = col_names[0], col_names[1]
    row_names, row_descriptions = col0, col1
    row_name_header, row_description_header = head0, head1
    # These headers can point the names or the descriptions at the
    # other one of the two columns.
    if "ROWNAMESCOLUMN" in header:
        col = int(header["ROWNAMESCOLUMN"])
        assert col in [0, 1]
        if col == 1:
            row_names = col1
            row_name_header = head1
    if "ROWDESCRIPTIONSCOLUMN" in header:
        col = int(header["ROWDESCRIPTIONSCOLUMN"])
        assert col in [0, 1]
        if col == 0:
            row_descriptions = col0
            row_description_header = head0

    # Cut off the headers for the annotations.
    col_types = col_types[2:]
    if col_names:
        col_names = col_names[2:]

    matrix = [x[2:] for x in data]
    row_headers = [head0, head1]
    col_headers = None
    row_annots = {}
    col_annots = {}
    synonyms = {}

    row_annots[row_name_header] = row_names
    row_annots[row_description_header] = row_descriptions
    col_annots["TYPE"] = col_types
    if col_descriptions:
        col_annots["DESCRIPTION"] = col_descriptions
    synonyms[const.ROW_ID] = row_name_header

    X = Matrix.InMemoryMatrix(
        matrix, row_names=None, col_names=col_names,
        row_headers=row_headers, col_headers=col_headers,
        row_annots=row_annots, col_annots=col_annots, synonyms=synonyms)
    assert is_matrix(X)
    return X
Exemple #21
0
def clean_model(model, factor_cutoff=None):
    """Process the model for simpler handling.

    model          Dictionary-like model.  The code reads the keys
                   "Psi", "Tau", "A", "Bnz", "PostPib", "F",
                   "NObservations", "NVariables", "NControlVariables"
                   and "NDesigns", and optionally "VariablesIn" and
                   "ExternalProb".
    factor_cutoff  Threshold on PostPib for a gene to count as being
                   in a factor.  A false value (None, 0) means 0.99.

    Returns a dictionary of the cleaned up model:
    A              pxk
    Bnz            pxk
    PostPib        pxk
    factors        pxk    1/0, based on factor_cutoff
    ExternalProb   mxk    Parallel to data set.  (May be missing.)
    F              kxn
    Psi            p
    Tau            k
    VariablesIn    p      Indexes (0-based) of genes, relative to data set.
    GENE_O         p      For sorting original model to this order.  0-based
    FACTOR_O       k      For sorting original model to this order.  0-based

    p  num_genes_in_model
    n  num_samples
    k  num_factors              Just latent factors.
    m  num_genes_in_dataset

    The matrix variables are all sorted according to FACTOR_O and
    GENE_O, which are provided to help convert the original data set
    to the same order.

    Changes from the original model:
    o Remove the designs and controls.
    o Make VariablesIn, if it doesn't exist.
    o Make VariablesIn 0-based, instead of 1-based.
    o Sort the genes and factors.

    """
    from genomicode import jmath
    from genomicode import Matrix

    if not factor_cutoff:
        factor_cutoff = 0.99

    # Dimensions of the model.  q counts every factor, including the
    # controls and designs.  k is just the latent ones.
    p = len(model["Psi"])
    n = model["NObservations"]
    q = len(model["Tau"])
    m = model["NVariables"]
    k = q - model["NControlVariables"] - model["NDesigns"]

    def latent_index():
        # Index of the last k factors.  k == 0 is a boundary case
        # because a slice from -0 would not select nothing.  A new
        # object is made for each call.
        if k > 0:
            return (-k, None)
        return []

    # Keep only the latent factors, which drops the design variables.
    A = model["A"].matrix(None, latent_index())
    Bnz = model["Bnz"].matrix(None, latent_index())
    PostPib = model["PostPib"].matrix(None, latent_index())
    F = model["F"].matrix(latent_index(), None)
    Tau = []
    if k > 0:
        Tau = model["Tau"][-k:]

    # factors is a copy of PostPib, thresholded on factor_cutoff.
    factors = PostPib.matrix()
    for row in factors._X:
        for j, probability in enumerate(row):
            row[j] = 1 if probability >= factor_cutoff else 0

    # VariablesIn is 1-based in the model.  If it is missing, then
    # number the genes from 1 to p.  Either way, make it 0-based.
    one_based = model.get("VariablesIn") or range(1, p + 1)
    VariablesIn = [index - 1 for index in one_based]

    # Make the ExternalProb variable.
    ExternalProb = None
    if "ExternalProb" in model:
        external = model["ExternalProb"]

        # The first column is the (1-based) index of the genes.  These
        # should not overlap with VariablesIn, and together they
        # should account for every gene in the data set.
        seen = set(VariablesIn)
        for raw in external.value(None, 0):
            gene = int(raw) - 1
            assert gene not in seen
            seen.add(gene)
        assert len(seen) == m

        prob = [[0] * k for _ in range(m)]
        # Genes in the model get their probabilities from PostPib.
        for i, gene in enumerate(VariablesIn):
            prob[gene] = PostPib.value(i, None)
        # The other genes get theirs from the rest of the columns.
        for entry in external._X:
            gene = int(entry[0]) - 1
            values = entry[1:]
            assert len(values) == k
            prob[gene] = values
        ExternalProb = Matrix.InMemoryMatrix(prob)

    # Order the factors based on decreasing number of genes.  Stays
    # empty if there are no factors.
    FACTOR_O = []
    if factors._X:
        genes_per_factor = jmath.mysum(factors._X, byrow=0)
        FACTOR_O = jmath.order(genes_per_factor, decreasing=1)

    # Order the genes based on decreasing number of factors.  Earlier
    # factors should get much higher weights.  If there are no
    # factors, then no genes can be specified, or the indexes will be
    # out of range of an empty matrix.
    GENE_O = []
    X = factors.slice(None, FACTOR_O)
    if X:
        weights = [2 ** exponent for exponent in range(k - 1, -1, -1)]
        scores = [
            sum([a * b for (a, b) in zip(X[i], weights)])
            for i in range(p)]
        GENE_O = jmath.order(scores, decreasing=1)

    cmod = {}
    gene_by_factor = [
        ("A", A), ("Bnz", Bnz), ("PostPib", PostPib), ("factors", factors)]
    for name, matrix in gene_by_factor:
        cmod[name] = matrix.matrix(GENE_O, FACTOR_O)
    if ExternalProb:
        cmod["ExternalProb"] = ExternalProb.matrix(None, FACTOR_O)
    cmod["F"] = F.matrix(FACTOR_O, None)
    cmod["Psi"] = [model["Psi"][i] for i in GENE_O]
    cmod["Tau"] = [Tau[i] for i in FACTOR_O]
    cmod["VariablesIn"] = [VariablesIn[i] for i in GENE_O]
    cmod["GENE_O"] = GENE_O
    cmod["FACTOR_O"] = FACTOR_O

    # If no factors, then the number of rows of the matrices won't be
    # preserved, so the dimensions can't be checked.
    if not k:
        return cmod

    # Check the dimensions of the matrices.
    assert cmod["A"].dim() == (p, k), "%s %s" % (A.dim(), (p, k))
    for name in ("Bnz", "PostPib", "factors"):
        assert cmod[name].dim() == (p, k)
    if "ExternalProb" in cmod:
        assert cmod["ExternalProb"].dim() == (m, k)
    assert cmod["F"].dim() == (k, n)
    assert len(cmod["Psi"]) == p
    assert len(cmod["Tau"]) == k
    assert len(cmod["VariablesIn"]) == p
    assert len(cmod["GENE_O"]) == p
    assert len(cmod["FACTOR_O"]) == k

    return cmod
                    row = data[hrows + i]
                    raise ValueError("%s\nProblem with row %d: %s" %
                                     (str(err2), i + 1, row))
            raise AssertionError("Error converting values.")

    # Set ROW_ID and COL_ID to reasonable defaults.
    synonyms = {}
    if SAMPLE_NAME in col_names:
        synonyms[const.COL_ID] = SAMPLE_NAME
    if row_order:
        # Bug: This should be the first column with unique values.
        synonyms[const.ROW_ID] = row_order[0]

    X = Matrix.InMemoryMatrix(matrix,
                              row_names=row_names,
                              col_names=col_names,
                              row_order=row_order,
                              col_order=col_order,
                              synonyms=synonyms)
    #X = Matrix.add_synonyms(X, synonyms)
    return X


CLEAN_RE = None
CLEAN_DISALLOWED = None


def _clean(s, disallowed=None):
    # Make sure there are no disallowed characters in the string s.
    global CLEAN_RE
    global CLEAN_DISALLOWED