Example #1
def write_prism_file(filename, hist):
    # hist is the R list returned by the hist() function.
    from genomicode import jmath

    # XY plot in Prism.

    # Get "breaks" out of histogram return value.
    breaks = [x for x in hist.rx2("breaks")]
    breaks = breaks[:-1]
    counts = [x for x in hist.rx2("counts")]
    density = [x for x in hist.rx2("density")]
    mids = [x for x in hist.rx2("mids")]
    assert len(breaks) == len(counts)
    assert len(breaks) == len(density)
    assert len(breaks) == len(mids)

    # The density values from R don't necessarily sum to 1 (e.g., they
    # may sum to 2).  Recalculate them so that they sum to 1.
    total = sum(counts)
    for i in range(len(density)):
        density[i] = counts[i] / float(total)
    header = ["Mids", "Left", "Counts", "Density"]
    x = [mids, breaks, counts, density]
    x = jmath.transpose(x)
    x = [header] + x
    handle = open(filename, 'w')
    for row in x:
        print >>handle, "\t".join(map(str, row))
    handle.close()
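
The pattern in this example recurs throughout the page: build parallel column lists, transpose them into rows with jmath.transpose, prepend a header row, and write tab-delimited lines. A minimal, self-contained sketch of the same idea follows; transpose() here is a plain-Python stand-in for jmath.transpose, and the histogram numbers are invented for illustration.

def transpose(matrix):
    # Row/column transpose of a list of equal-length lists.
    return [list(row) for row in zip(*matrix)]

def write_tab_delimited(filename, header, columns):
    # columns is a list of equal-length lists, one per output column.
    rows = [header] + transpose(columns)
    handle = open(filename, 'w')
    try:
        for row in rows:
            handle.write("\t".join(map(str, row)) + "\n")
    finally:
        handle.close()

# Invented histogram-like values, for illustration only.
mids = [0.5, 1.5, 2.5]
breaks = [0, 1, 2]
counts = [4, 10, 6]
density = [c / float(sum(counts)) for c in counts]
write_tab_delimited("hist_prism.txt", ["Mids", "Left", "Counts", "Density"],
                    [mids, breaks, counts, density])
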
Example #2
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        from genomicode import geolib
        from genomicode import jmath

        # Input should be a GEOSeriesMatrixFile.
        filename = in_data.identifier

        convert_NA = user_options.get("set_NA_to")

        # Get the sample data and write it out.
        matrix = geolib._extract_sm_sample_meta(filename)
        matrix = geolib._clean_sm_sample_meta(matrix)
        matrix = geolib._prettify_sm_sample_meta(matrix)
        matrix = jmath.transpose(matrix)

        # each column is an annotation
        if convert_NA != "NA":
            for i in range(1, len(matrix)):
                for j in range(len(matrix[i])):
                    # Should this comparison be case-sensitive?
                    if matrix[i][j] == "NA":
                        matrix[i][j] = convert_NA

        outhandle = open(outfile, 'w')
        for x in matrix:
            print >> outhandle, "\t".join(map(str, x))
        outhandle.close()
Example #3
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        from genomicode import filelib
        from genomicode import jmath
        from genomicode import mplgraph
        in_data = antecedents
        matrix = [x for x in filelib.read_cols(in_data.identifier)]
        matrix = [x[1:] for x in matrix]
        matrix = jmath.transpose(matrix)
        sample = matrix[0][1:]
        data = matrix[1:]
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        for one_data in data:
            value = one_data[1:]
            value = [float(i) for i in value]
            pair = [(value[i], sample[i]) for i in range(len(value))]
            pair.sort()
            gene_value = [i[0] for i in pair]
            label = [i[1] for i in pair]
            ylabel = one_data[0]
            fig = mplgraph.barplot(gene_value,
                                   box_label=label,
                                   xtick_rotation=90,
                                   xlabel='sample',
                                   ylabel=ylabel)
            output = os.path.join(outfile, ylabel)
            fig.savefig(output + '.png')

        assert filelib.exists_nz(outfile), (
            'The output directory %s for plot_geneset_score_bar is missing '
            'or empty.' % outfile)
Example #4
def read_geneset_scores(filename):
    # Read the output from score_geneset.py and return a Matrix
    # object.
    import os
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import Matrix
    from arrayio import const
    from arrayio import tab_delimited_format as tdf

    assert os.path.exists(filename)
    matrix = [x for x in filelib.read_cols(filename)]
    matrix = jmath.transpose(matrix)

    # Only want the scores.  Get rid of the direction, pvalue, and
    # significance lines.
    # Columns:
    # SAMPLE
    # FILE
    # [Score ...]
    # [Direction ...] " direction"
    # [p value ...] " pvalue"
    # [significant ...] " significant"
    assert matrix
    i = 0
    while i < len(matrix):
        assert matrix[i]
        metadata = False
        if matrix[i][0].endswith(" direction"):
            metadata = True
        elif matrix[i][0].endswith(" pvalue"):
            metadata = True
        elif matrix[i][0].endswith(" significant"):
            metadata = True
        if not metadata:
            i += 1
            continue
        del matrix[i]

    # BUG: Need more checks on size and format of matrix.
    col_names = {}
    sample_row = 0
    if matrix[1][0].upper() == "SAMPLE":
        sample_row = 1
    col_names[tdf.SAMPLE_NAME] = matrix[sample_row][1:]
    row_names = {}
    row_names['geneset'] = []
    synonyms = {}
    synonyms[const.COL_ID] = tdf.SAMPLE_NAME
    data = []
    for line in matrix[2:]:
        single_data = [jmath.safe_float(i) for i in line[1:]]
        data.append(single_data)
        row_names['geneset'].append(line[0])
    M = Matrix.InMemoryMatrix(data,
                              row_names=row_names,
                              col_names=col_names,
                              synonyms=synonyms)
    return M
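
The filtering step above walks the transposed matrix and deletes every row whose label ends in " direction", " pvalue", or " significant", keeping only the score rows. A stripped-down sketch of the same step, written as a list comprehension over an invented in-memory matrix laid out like the score_geneset.py output described in the comments:

def drop_metadata_rows(matrix):
    # Keep only the score rows; drop the direction/pvalue/significant rows.
    suffixes = (" direction", " pvalue", " significant")
    return [row for row in matrix if not row[0].endswith(suffixes)]

# Invented transposed matrix: the first element of each row is its label.
matrix = [
    ["SAMPLE", "S1", "S2"],
    ["FILE", "data.txt", "data.txt"],
    ["geneset1", "0.5", "-0.2"],
    ["geneset1 direction", "UP", "DN"],
    ["geneset1 pvalue", "0.01", "0.30"],
    ["geneset1 significant", "UP", ""],
]
print(drop_metadata_rows(matrix))
# Keeps the SAMPLE, FILE, and geneset1 score rows only.
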
Example #5
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        import arrayio
        from Betsy import read_label_file
        from genomicode import jmath

        cls_node_train, data_node = antecedents
        result, label_line, second_line = read_label_file.read(
            cls_node_train.identifier)
        y = [second_line[int(i)] for i in label_line]
        R = jmath.start_R()
        M = arrayio.read(data_node.identifier)
        M_train = M.matrix(None, range(0, len(label_line)))
        M_test = M.matrix(None, range(len(label_line), M.dim()[1]))
        M1 = M_train.slice()
        M_train = jmath.transpose(M1)
        jmath.R_equals_matrix(M_train, 'data')
        M2 = M_test.slice()
        M2 = jmath.transpose(M2)
        jmath.R_equals_matrix(M2, 'test')
        jmath.R_equals(y, 'y')
        R('y<-as.factor(y)')
        R('require(randomForest, quietly=TRUE)')
        R('library(randomForest)')
        R('model <- randomForest(data,y=y,importance=TRUE)')
        R('predict_result <- predict(model, test)')
        predict_result = R['predict_result']
        levels = predict_result.levels
        predict_labels = predict_result[:]
        predict_labels = [levels[i - 1] for i in predict_labels]
        name = M_test._col_names.keys()[0]
        sample_name = M_test._col_names[name]
        result = [['Sample_name', 'Predicted_class', 'Confidence']]
        for i in range(len(sample_name)):
            result.append([str(sample_name[i]), predict_labels[i], ''])

        f = open(outfile, 'w')
        for row in result:
            f.write('\t'.join(row))
            f.write('\n')
        f.close()
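
One detail worth noting in this example: R factors come back through rpy2 as 1-based integer codes, so each prediction has to be shifted by one before indexing into the levels vector. A tiny sketch of that mapping in plain Python, with invented class labels and codes rather than output from a real randomForest run:

# Hypothetical factor levels and 1-based codes, as rpy2 would expose them.
levels = ["ClassA", "ClassB"]
codes = [1, 2, 2, 1]                    # 1-based, as in R
labels = [levels[i - 1] for i in codes]
print(labels)                           # ['ClassA', 'ClassB', 'ClassB', 'ClassA']
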
Example #6
def write_selap_dataset(file_layout):
    import arrayio
    from genomicode import jmath

    matrix = arrayio.read(file_layout.DATASET)

    # Align the matrix to the SELAP model.

    # Make a matrix for SELAP.
    X_selap = jmath.transpose(matrix._X)
    handle = open(file_layout.SELAP_DATASET, 'w')
    for x in X_selap:
        print >> handle, "\t".join(map(str, x))
    handle.close()
Example #7
def write_from_am(handle_or_file, svm_matrix):
    from genomicode import jmath

    headers0 = [None] * len(svm_matrix.headers)
    headers1 = [None] * len(svm_matrix.headers)
    headers2 = [None] * len(svm_matrix.headers)
    for i, header in enumerate(svm_matrix.headers):
        x = header.split("___")
        assert len(x) == 3, "Invalid header format: %s" % x
        headers0[i] = x[0]
        headers1[i] = x[1]
        headers2[i] = x[2]

    for i in range(len(headers0) - 1, 0, -1):
        # If headers1[i] is the same as headers1[i-1], then do not
        # write it out again.
        #
        # Exception: If headers0[i] != headers0[i-1], then we're
        # starting a new "block", and headers1[i] should still be
        # written out.
        # Example: If there's only one <Caller>, then the <Sample>
        # will not be blank, but the <Caller> should still be copied
        # over (because they are the same).
        # <Sample1>    <Sample2>
        # <Caller>     <Caller>
        # Ref/Alt/VAF  Ref/Alt/VAF
        if headers1[i] == headers1[i - 1] and headers0[i] == headers0[i - 1]:
            headers1[i] = ""
        if headers0[i] == headers0[i - 1]:
            headers0[i] = ""

    matrix = []
    for i, header_h in enumerate(svm_matrix.headers_h):
        h0 = headers0[i]
        h1 = headers1[i]
        h2 = headers2[i]
        annots = svm_matrix.header2annots[header_h]
        x = [h0, h1, h2] + annots
        matrix.append(x)
    # Transpose the matrix.
    matrix = jmath.transpose(matrix)

    handle = handle_or_file
    if type(handle) is type(""):
        handle = open(handle, 'w')
    for x in svm_matrix.headerlines:
        print >> handle, x
    for x in matrix:
        print >> handle, "\t".join(map(str, x))
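
The loop above walks the three-level headers from right to left and blanks a repeated sample or caller name so each one is printed only once per block. A small stand-alone sketch of that rule on invented header strings (collapse_headers is a hypothetical helper, not part of the library):

def collapse_headers(headers0, headers1):
    # Blank repeated entries, right to left.  headers1[i] is blanked only
    # when headers0[i] also repeats, so a new block always keeps its label.
    h0, h1 = list(headers0), list(headers1)
    for i in range(len(h0) - 1, 0, -1):
        if h1[i] == h1[i - 1] and h0[i] == h0[i - 1]:
            h1[i] = ""
        if h0[i] == h0[i - 1]:
            h0[i] = ""
    return h0, h1

samples = ["Sample1", "Sample1", "Sample1", "Sample2", "Sample2", "Sample2"]
callers = ["Caller"] * 6
print(collapse_headers(samples, callers))
# (['Sample1', '', '', 'Sample2', '', ''],
#  ['Caller', '', '', 'Caller', '', ''])
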
Example #8
def write_prism_file(filename, MATRIX, gene_names):
    # Format the data for a Prism XY plot.  Each gene is a different
    # series.
    from genomicode import jmath
    num_samples = MATRIX.ncol()

    m = []  # Make a row-based matrix and transpose it.

    # Write the sample name.
    sample_names = MATRIX.col_names(MATRIX.col_names()[0])
    x = []
    for i in range(len(gene_names)):
        x.extend(sample_names)
    m.append(x)

    # Write the X-coordinate.
    x = []
    for i in range(len(gene_names)):
        for j in range(num_samples):
            x.append(j + 1)
    m.append(x)

    # Add each series.
    for i in range(len(gene_names)):
        # Pre-pad blanks for the other series.
        x1 = [""] * num_samples * i
        x2 = MATRIX._X[i]
        # Post-pad blanks to fill in the matrix.
        x3 = [""] * (len(m[0]) - len(x1) - len(x2))
        x = x1 + x2 + x3
        m.append(x)

    # Transpose to column-major format.
    m = jmath.transpose(m)

    # Add the gene names as the column headers.
    x = ["Sample", "X"] + gene_names
    m = [x] + m

    # Write the matrix to the file.
    handle = open(filename, 'w')
    for x in m:
        print >> handle, "\t".join(map(str, x))
    handle.close()
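
Because Prism expects each series in its own block, the code above staggers each gene's values with leading and trailing blanks before transposing, so every gene ends up in its own column with values only in its own rows. A toy sketch of the resulting layout for two invented genes measured in two samples:

def transpose(matrix):
    return [list(row) for row in zip(*matrix)]

sample_names = ["S1", "S2"]
gene_names = ["GeneA", "GeneB"]
values = [[1.0, 2.0], [3.0, 4.0]]   # one row of expression values per gene
num_samples = len(sample_names)

m = [sample_names * len(gene_names),                         # Sample column
     list(range(1, num_samples + 1)) * len(gene_names)]      # X column
for i, row in enumerate(values):
    pad_left = [""] * num_samples * i
    pad_right = [""] * (len(m[0]) - len(pad_left) - len(row))
    m.append(pad_left + row + pad_right)

for row in [["Sample", "X"] + gene_names] + transpose(m):
    print("\t".join(map(str, row)))
# GeneA occupies the first two data rows of its column, GeneB the last two.
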
Example #9
def write(handle_or_file, annot_matrix, delim=None):
    from genomicode import jmath

    if delim is None:
        delim = "\t"

    matrix = []
    for i, header_h in enumerate(annot_matrix.headers_h):
        header = annot_matrix.headers[i]
        annots = annot_matrix.header2annots[header_h]
        x = [header] + annots
        matrix.append(x)
    # Transpose the matrix.
    matrix = jmath.transpose(matrix)

    handle = handle_or_file
    if type(handle) is type(""):
        handle = open(handle, 'w')
    for x in annot_matrix.headerlines:
        print >> handle, x
    for x in matrix:
        print >> handle, delim.join(map(str, x))
Example #10
def write_prism_file(filename, hist):
    # hist is the R list returned by the hist() function.
    from genomicode import jmath

    # XY plot in Prism.

    # Get "breaks" out of histogram return value.
    breaks = [x for x in hist.rx2("breaks")]
    breaks = breaks[:-1]
    counts = [x for x in hist.rx2("counts")]
    density = [x for x in hist.rx2("density")]
    mids = [x for x in hist.rx2("mids")]
    assert len(breaks) == len(counts)
    assert len(breaks) == len(density)
    assert len(breaks) == len(mids)

    header = ["Mids", "Left", "Counts", "Density"]
    x = [mids, breaks, counts, density]
    x = jmath.transpose(x)
    x = [header] + x
    handle = open(filename, 'w')
    for row in x:
        print >> handle, "\t".join(map(str, row))
    handle.close()
Example #11
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        from genomicode import filelib
        from genomicode import hashlib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        simple_node = in_data
        filelib.assert_exists_nz(simple_node.identifier)

        gene_file = mlib.get_user_option(
            user_options, "cancer_genes_file", not_empty=True, check_file=True)

        # Read the cancer genes file.
        # <Gene ID>  <Gene Symbol>  <Dataset>  ...
        symbol2info = {}  # symbol -> d
        gene_iter = filelib.read_row(gene_file, header=1)
        header = None
        for d in gene_iter:
            assert "Gene Symbol" in d._header
            if header is None:
                header = [
                    x for x in d._header
                    if x not in ["Gene ID", "Gene Symbol"]]
            if not d.Gene_Symbol:
                continue
            symbol2info[d.Gene_Symbol] = d

        # Read the variant file.
        SVM = SimpleVariantMatrix.read_as_am(simple_node.identifier)

        GENE_H = "Annovar______Gene.refGene"
        assert GENE_H in SVM.headers, "Missing annotation: %s" % GENE_H
        GENES = SVM[GENE_H]

        # Align the matrix to the simple variant matrix.
        gene_headers = header
        gene_annotations = []
        for i, gene_str in enumerate(GENES):
            # Format of genes:
            # PFN1P2
            # PMS2P2,PMS2P7
            values = [""] * len(gene_headers)
            genes = gene_str.split(",")
            for gene in genes:
                if gene not in symbol2info:
                    continue
                d = symbol2info[gene]
                for j, h in enumerate(gene_headers):
                    h = hashlib.hash_var(h)
                    assert hasattr(d, h)
                    x = getattr(d, h)
                    assert x in ["", "1"]
                    if x == "1":
                        values[j] = 1
            gene_annotations.append(values)
        # Convert the headers and annotations to SVM format.
        gene_headers = ["Cancer Genes______%s" % x for x in gene_headers]
        gene_annotations = jmath.transpose(gene_annotations)

        # Make the new SimpleVariantMatrix.
        # Figure out where to put these annotations.
        INDEX = 4
        # If Annovar exists, put after.
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith("ANNOVAR")]
        if I:
            INDEX = max(INDEX, max(I)+1)
        # If SnpEff exists, put after.
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith("SNPEFF")]
        if I:
            INDEX = max(INDEX, max(I)+1)
        # If COSMIC exists, put after.
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith("COSMIC")]
        if I:
            INDEX = max(INDEX, max(I)+1)
        headers = SVM.headers[:INDEX] + gene_headers + SVM.headers[INDEX:]
        x = [SVM.header2annots[x] for x in SVM.headers_h]
        all_annots = x[:INDEX] + gene_annotations + x[INDEX:]
        merged = AnnotationMatrix.create_from_annotations(
            headers, all_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
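
The placement logic above (start at column 4, then move past any existing Annovar, SnpEff, or COSMIC blocks) recurs in several of these modules. A compact sketch of the same idea as a helper; the function name and the header strings are invented for illustration:

def find_insert_index(headers, prefixes=("ANNOVAR", "SNPEFF", "COSMIC"),
                      minimum=4):
    # Return the column index where a new annotation block should go: at
    # least `minimum`, and after any block whose header starts with one of
    # the given prefixes.
    index = minimum
    for prefix in prefixes:
        positions = [i for i, h in enumerate(headers)
                     if h.upper().startswith(prefix)]
        if positions:
            index = max(index, max(positions) + 1)
    return index

headers = ["______Chrom", "______Pos", "______Ref", "______Alt",
           "Annovar______Func.refGene", "Annovar______Gene.refGene"]
print(find_insert_index(headers))   # 6: the new block goes after Annovar
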
Example #12
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
        import os
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import alignlib
        #from Betsy import module_utils as mlib

        rsem_path = in_data.identifier
        assert os.path.exists(rsem_path)
        assert os.path.isdir(rsem_path)
        result_files = alignlib.find_rsem_result_files(rsem_path)
        assert result_files, "No .results files found."
        metadata = {}

        preprocess = out_attributes.get("preprocess")
        assert preprocess in ["tpm", "fpkm"]

        #x = mlib.get_user_option(
        #    user_options, "genes_or_isoforms", not_empty=True,
        #    allowed_values=["genes", "isoforms"])
        #get_genes = x == "genes"

        # Figure out whether to report gene-level or isoform-level expression.
        x = out_attributes["expression_of"]
        assert x in ["gene", "isoform"]
        get_genes = x == "gene"

        transcript_header = "transcript_id(s)"
        if not get_genes:
            transcript_header = "transcript_id"

        # For each of the gene files, get the expression data.
        sample2matrix = {}  # sample -> AnnotationMatrix
        for x in result_files:
            sample, gene_filename, isoform_filename = x
            # Get the gene results.
            # TODO: Implement isoforms.
            filename = gene_filename
            if not get_genes:
                filename = isoform_filename
            assert filename is not None, "Missing: %s" % filename
            #if filename is None:
            #    continue
            assert os.path.exists(filename)
            matrix = AnnotationMatrix.read(filename)
            # Do some checking on the matrix.
            assert "gene_id" in matrix.headers
            assert transcript_header in matrix.headers
            assert "TPM" in matrix.headers
            assert "FPKM" in matrix.headers
            sample2matrix[sample] = matrix
        assert sample2matrix, "No samples"

        gene_id = transcript_id = None
        # Pull out the gene and transcript IDs.
        for matrix in sample2matrix.itervalues():
            x1 = matrix["gene_id"]
            x2 = matrix[transcript_header]
            if gene_id is None:
                gene_id = x1
            if transcript_id is None:
                transcript_id = x2
            assert x1 == gene_id
            assert x2 == transcript_id
        assert gene_id
        assert transcript_id
        assert len(gene_id) == len(transcript_id)

        # Assemble into a gene expression matrix.
        header = "TPM"
        if preprocess == "fpkm":
            header = "FPKM"
        t_data = []  # matrix, where each row is a sample.
        t_data.append(gene_id)
        t_data.append(transcript_id)
        samples = []
        for sample in sorted(sample2matrix):
            matrix = sample2matrix[sample]
            exp = matrix[header]
            assert len(exp) == len(gene_id)
            t_data.append(exp)
            samples.append(sample)

        data = jmath.transpose(t_data)
        header = ["gene_id", transcript_header] + samples
        data = [header] + data

        # Write out the data file.
        handle = open(out_filename, 'w')
        for x in data:
            print >>handle, "\t".join(map(str, x))

        return metadata
Example #13
def add_coverage_to_svm(svm_file, coverage_file, outfile, is_rna_cov):
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    
    # Read the variant file.
    SVM = SimpleVariantMatrix.read(svm_file)
    AM = SVM.annot_matrix
    assert "Chrom" in AM.headers
    assert "Pos" in AM.headers
    CHROM = AM["Chrom"]
    POS = AM["Pos"]
    POS = [int(x) for x in POS]

    # Read the coverage matrix.
    # Chrom  Pos  <Sample>  [<Sample> ...]
    # Pos is 1-based.
    coord2sample2cov = {}  # (chrom, pos) -> sample -> ref/alt/vaf
    cov_samples = {}
    for d in filelib.read_row(coverage_file, header=1):
        coord = d.Chrom, int(d.Pos)
        if coord not in coord2sample2cov:
            coord2sample2cov[coord] = {}
        for i in range(2, len(d._header)):
            sample = d._header[i]
            cov = d._cols[i]
            if not cov:
                continue
            #coord2sample2cov[coord][sample] = int(cov)
            coord2sample2cov[coord][sample] = cov
            cov_samples[sample] = 1

    # Make sure the samples from the variant matrix can be found
    # in the coverage matrix.
    missing = [x for x in SVM.samples if x not in cov_samples]
    assert len(missing) < len(SVM.samples), (
        "SimpleVariantMatrix and coverage file have "
        "no common samples.")
    # If the samples aren't sequenced at high coverage, it's
    # possible they just don't have reads at these positions.  Be
    # a little lenient here, and accept the file if some of the
    # samples overlap.
    #x = missing
    #if len(x) > 5:
    #    x = x[:5] + ["..."]
    #msg = "Samples (%d) not found in coverage file: %s" % (
    #    len(missing), ", ".join(x))
    #assert not missing, msg
    # Report the coverage for the samples at the intersection.
    SAMPLES = [x for x in SVM.samples if x in cov_samples]

    # Align the matrix to the simple variant matrix.
    #matrix = [[None]*len(SVM.samples) for i in range(AM.num_annots())]
    matrix = [[None]*len(SAMPLES) for i in range(AM.num_annots())]
    for i in range(AM.num_annots()):
        coord = CHROM[i], POS[i]
        sample2cov = coord2sample2cov.get(coord, {})
        x = [sample2cov.get(x, "") for x in SAMPLES]
        #x = map(str, x)
        matrix[i] = x

    # Add the matrix back to the simple variant matrix.
    headers = SAMPLES
    all_annots = jmath.transpose(matrix)
    name = "Coverage"
    # If this is being used to add RNA coverage, use a different
    # name.
    if is_rna_cov:
        name = "RNA Coverage"
    x = AnnotationMatrix.create_from_annotations(headers, all_annots)
    SVM.named_matrices.append((name, x))

    # Write to file.
    SimpleVariantMatrix.write(outfile, SVM)
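
The alignment step above builds a nested lookup, (chrom, pos) -> sample -> coverage string, then pulls one row per variant and leaves blanks for samples with no reads at that position. A small sketch of that lookup pattern with invented coordinates and coverage strings:

# Hypothetical coverage lookup: (chrom, pos) -> sample -> "ref/alt/vaf".
coord2sample2cov = {
    ("chr1", 100): {"S1": "10/5/0.33", "S2": "20/0/0.00"},
    ("chr2", 250): {"S1": "8/8/0.50"},
}
samples = ["S1", "S2"]
variants = [("chr1", 100), ("chr2", 250), ("chr3", 999)]

matrix = []
for coord in variants:
    sample2cov = coord2sample2cov.get(coord, {})
    # One row per variant; blank where a sample has no coverage data.
    matrix.append([sample2cov.get(s, "") for s in samples])

for coord, row in zip(variants, matrix):
    print("%s %s" % (coord, row))
# ('chr1', 100) ['10/5/0.33', '20/0/0.00']
# ('chr2', 250) ['8/8/0.50', '']
# ('chr3', 999) ['', '']
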
Example #14
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import math
        from genomicode import filelib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        svm_node = in_data
        filelib.assert_exists_nz(svm_node.identifier)

        linked_file = mlib.get_user_option(user_options,
                                           "linked_variants_file",
                                           not_empty=True,
                                           check_file=True)

        # Read the variant file.
        SVM = SimpleVariantMatrix.read_as_am(svm_node.identifier)
        CHROM = SVM["______Chrom"]
        POS = SVM["______Pos"]
        POS = [int(x) for x in POS]
        all_coords = {}  # (chrom, pos) -> 1
        for x in zip(CHROM, POS):
            all_coords[x] = 1

        # Read the linked variant file.
        # Chrom  Pos  Perc Linked  p
        coord2info = {}  # (chrom, pos) -> d
        for d in filelib.read_row(linked_file, header=1):
            pos = int(d.Pos)
            if (d.Chrom, pos) not in all_coords:
                continue
            coord2info[(d.Chrom, pos)] = d

        # Align the linked annotations to the matrix.
        MAX_SCORE = 1000
        min_p = 10**-(MAX_SCORE / 10)
        linked_headers = ["Perc Linked", "Score"]
        annotations = []
        for (chrom, pos) in zip(CHROM, POS):
            if (chrom, pos) not in coord2info:
                x = [""] * len(linked_headers)
                annotations.append(x)
                continue
            d = coord2info[(chrom, pos)]
            score = MAX_SCORE
            if float(d.p) >= min_p:
                score = -10 * math.log(float(d.p), 10)
            x = d.Perc_Linked, score
            assert len(x) == len(linked_headers)
            annotations.append(x)
        # Convert the headers and annotations to SVM format.
        linked_headers = ["Linkage______%s" % x for x in linked_headers]
        linked_annotations = jmath.transpose(annotations)

        # Make the new SimpleVariantMatrix.
        # Figure out where to put these annotations.
        INDEX = 4
        ## If Annovar exists, put after.
        #I = [i for (i, x) in enumerate(SVM.headers)
        #     if x.upper().startswith("ANNOVAR")]
        #if I:
        #    INDEX = max(INDEX, max(I)+1)
        headers = SVM.headers[:INDEX] + linked_headers + SVM.headers[INDEX:]
        x = [SVM.header2annots[x] for x in SVM.headers_h]
        all_annots = x[:INDEX] + linked_annotations + x[INDEX:]
        merged = AnnotationMatrix.create_from_annotations(
            headers, all_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
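
The score computed above is a Phred-style transform of the linkage p-value, -10 * log10(p), capped at 1000 (which corresponds to p = 1e-100). A worked sketch of just that conversion:

import math

MAX_SCORE = 1000
min_p = 10 ** -(MAX_SCORE / 10.0)      # 1e-100; smaller p-values are capped

def linkage_score(p):
    # Phred-like score, capped at MAX_SCORE for vanishingly small p-values.
    if p < min_p:
        return MAX_SCORE
    return -10 * math.log(p, 10)

print(linkage_score(0.01))     # 20.0
print(linkage_score(1e-200))   # 1000 (capped)
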
Example #15
def find_diffexp_genes(outfile, gmt_file, algorithm, paired, MATRIX,
                       geneid_header, genename_header, genename_delim, name1,
                       name2, classes, filter_fold_change, fold_change,
                       p_cutoff, fdr_cutoff, bonf_cutoff, sam_DELTA,
                       sam_qq_file, edger_tagwise_dispersion, num_procs):
    # classes must be 0, 1, None.
    import os
    import sys
    import math
    import StringIO
    import warnings

    from rpy2 import rinterface

    from genomicode import config
    from genomicode import jmath
    from genomicode import genesetlib

    algorithm2function_unpaired = {
        "fold_change": "find.de.genes.fc",
        "ttest": "find.de.genes.ttest",
        "sam": "find.de.genes.sam",
        "ebayes": "find.de.genes.ebayes",
        "deseq2": "find.de.genes.deseq2",
        "edger": "find.de.genes.edgeR",
    }
    algorithm2function_paired = {
        "ebayes": "find.de.genes.paired.ebayes",
    }
    algorithm2function = algorithm2function_unpaired
    if paired:
        algorithm2function = algorithm2function_paired
        assert algorithm in algorithm2function_paired, \
               "No paired version of %s" % algorithm
    assert algorithm in algorithm2function, "Unknown algorithm: %s" % algorithm

    # Select the relevant columns from MATRIX.
    I = [i for (i, x) in enumerate(classes) if x in [0, 1]]
    assert len(I)
    MATRIX = MATRIX.matrix(None, I)
    classes = [classes[i] for i in I]

    # All algorithms except "fold_change" and "deseq2" need at least 2
    # samples of each class.
    counts = {}
    for x in classes:
        counts[x] = counts.get(x, 0) + 1
    assert sorted(counts) == [0, 1], "Only one class represented."

    if algorithm not in ["fold_change", "deseq2"]:
        assert counts[0] >= 2, "There must be at least 2 of each class."
        assert counts[1] >= 2, "There must be at least 2 of each class."

    names = [name1, name2]
    X = MATRIX._X
    Y = [names[x] for x in classes]
    sample_name = None
    if MATRIX.col_names():
        sample_name = MATRIX.col_names(MATRIX.col_names()[0])

    x = choose_gene_names(MATRIX)
    if not geneid_header:
        geneid_header = x[0]
    if not genename_header:
        genename_header = x[1]
    assert not geneid_header or geneid_header in MATRIX.row_names()
    assert not genename_header or genename_header in MATRIX.row_names()

    R = jmath.start_R()
    de_lib = os.path.join(config.changlab_Rlib, "diffexp.R")
    stat_lib = os.path.join(config.changlab_Rlib, "statlib.R")
    assert os.path.exists(de_lib), "I could not find file: %s" % de_lib
    assert os.path.exists(stat_lib), "I could not find file: %s" % stat_lib
    R('source("%s")' % de_lib)
    R('source("%s")' % stat_lib)

    jmath.R_equals(X, "X")
    jmath.R_equals(Y, "Y")
    if sample_name:
        jmath.R_equals(sample_name, "sample.name")
        jmath.R('colnames(X) <- sample.name')

    geneid = genenames = None
    if geneid_header:
        geneid = MATRIX.row_names(geneid_header)
        jmath.R_equals(geneid, "geneid")
    if genename_header:
        genenames = MATRIX.row_names(genename_header)
        jmath.R_equals(genenames, "genenames")

    # Set up the arguments.
    args = ["X", "Y"]
    if algorithm == "sam":
        args.append("%g" % sam_DELTA)
    if geneid:
        args.append("geneid=geneid")
    if genenames:
        args.append("genenames=genenames")
    # Pass the fold change to the algorithm, because it can affect the
    # multiple hypothesis correction.
    if filter_fold_change is not None:
        args.append("FOLD.CHANGE=%g" % filter_fold_change)
    if algorithm in ["ttest", "deseq2"]:
        args.append("NPROCS=%d" % num_procs)  # t-test only
    #if show_all_genes and algorithm != "sam":
    if algorithm not in ["sam", "fold_change"]:
        args.append("filter.p05=FALSE")
    if algorithm == "edger":
        if edger_tagwise_dispersion:
            args.append("tagwise.dispersion=TRUE")
        else:
            args.append("tagwise.dispersion=FALSE")

    # Prevent SAM from writing junk to the screen.
    handle = StringIO.StringIO()
    old_stdout = sys.stdout
    sys.stdout = handle

    # Call the proper R function.  DESeq2 throws off a lot of
    # warnings.  Turn them off temporarily.
    fn = algorithm2function[algorithm]
    x = ", ".join(args)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        R("x <- %s(%s)" % (fn, x))
    R("DATA <- x$DATA")
    DATA_R = R["DATA"]

    sys.stdout = old_stdout

    # Write out a QQ file for SAM.
    if algorithm == "sam" and sam_qq_file:
        R('S <- x$S')
        jmath.R_fn("bitmap",
                   sam_qq_file,
                   type="png256",
                   height=1600,
                   width=1600,
                   units="px",
                   res=300)
        jmath.R_fn("samr.plot", jmath.R_var("S"), sam_DELTA)
        jmath.R_fn("dev.off")

    # Convert this DataFrame into a Python object.  Columns of floats
    # can be StrVector objects if there are NA embedded within them.
    # NA are special objects of either type
    # rpy2.rinterface.NACharacterType or type
    # rpy2.rinterface.NARealType.
    tDATA_py = []
    header = [DATA_R.colnames[i] for i in range(DATA_R.ncol)]
    for col_R in DATA_R:  # iterate over the columns of the data frame
        col_py = [col_R[i] for i in range(len(col_R))]

        if col_R.__class__.__name__ == "StrVector":
            pass
        elif col_R.__class__.__name__ == "FloatVector":
            col_py = [float(x) for x in col_py]
        elif col_R.__class__.__name__ == "IntVector":
            col_py = [int(x) for x in col_py]
        tDATA_py.append(col_py)
    DATA_py = jmath.transpose(tDATA_py)

    #handle = open('test01.txt', 'w')
    #for x in DATA_py:
    #    print >>handle, "\t".join(map(str, x))

    # Convert NA to None.
    for i in range(len(DATA_py)):
        for j in range(len(DATA_py[i])):
            if type(DATA_py[i][j]) in [
                    rinterface.NACharacterType, rinterface.NARealType
            ]:
                DATA_py[i][j] = None

    # Sort by increasing p-value, then decreasing fold change.
    name = "p.value"
    direction = 1
    #if algorithm == "sam":
    #    name = "Score(d)"
    if name not in header:
        name = "Log_2 Fold Change"
        direction = -1
    assert name in header, 'I could not find the "%s" column.' % name

    I = header.index(name)
    #schwartz = [(direction*float(x[I]), x) for x in DATA_py]
    values = [x[I] for x in DATA_py]
    for i in range(len(values)):
        if values[i] is None:
            values[i] = direction * 1E10
        else:
            values[i] = direction * float(values[i])
    schwartz = zip(values, DATA_py)
    schwartz.sort()
    DATA_py = [x[-1] for x in schwartz]

    # Filter based on user criteria.
    if fold_change is not None:
        log_2_fc = math.log(fold_change, 2)
        name = "Log_2 Fold Change"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py if x[I] is not None and abs(x[I]) >= log_2_fc
        ]
    if p_cutoff is not None:
        name = "p.value"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py if x[I] is not None and float(x[I]) < p_cutoff
        ]
    if fdr_cutoff is not None:
        name = "FDR"
        # This might be missing if all the genes have already been
        # filtered.
        #assert name in header, 'I could not find the "%s" column.' % name
        if name in header:
            I = header.index(name)
            DATA_py = [
                x for x in DATA_py
                if x[I] is not None and float(x[I]) < fdr_cutoff
            ]
    if bonf_cutoff is not None:
        name = "Bonf"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py
            if x[I] is not None and float(x[I]) < bonf_cutoff
        ]

    ## If no significant genes, then don't produce any output.
    ##if not DATA_py:
    ##    return

    # Write to the outhandle.
    _write_matrix(outfile, header, DATA_py)
    # Don't close someone else's file handle.
    #outhandle.close()

    # Write out the gene sets in GMT format, if requested.
    if not gmt_file:
        return
    assert "Direction" in header, 'I could not find the "Direction" column.'
    assert "Gene ID" in header, 'I could not find the "Gene ID" column.'
    assert "Gene Name" in header, 'I could not find the "Gene Name" column.'
    I_direction = header.index("Direction")
    I_geneid = header.index("Gene ID")
    I_genename = header.index("Gene Name")

    # "Higher in <name1>"
    # "Higher in <name2>"
    # "SAME"
    possible_directions = [
        "Higher in %s" % name1,
        "Higher in %s" % name2, "SAME"
    ]
    direction = [x[I_direction] for x in DATA_py]
    for x in direction:
        assert x.startswith("Higher in ") or x == "SAME"
        assert x in possible_directions
    samples = [x.replace("Higher in ", "") for x in direction]

    genesets = []  # list of (<SAMPLE>, [UP|DN])
    for s in samples:
        if s == "SAME":
            continue
        assert s in [name1, name2]
        # Make genesets relative to name2.  (Assume name1 is control).
        d = "UP"
        if s == name1:
            s, d = name2, "DN"
        genesets.append((s, d))
    genesets_all = sorted({}.fromkeys(genesets))

    outhandle = open(gmt_file, 'w')
    for geneset in genesets_all:
        sample, direct = geneset
        I = [i for (i, gs) in enumerate(genesets) if gs == geneset]
        gid = [DATA_py[i][I_geneid] for i in I]
        gn = [DATA_py[i][I_genename] for i in I]
        # gn might be float.  genesetlib expects array of strings.
        #import sys; sys.exit(0)
        gid = genesetlib.clean_genes(gid)
        gn = genesetlib.clean_genes(gn, delim=genename_delim)
        # <SAMPLE>_[ID|NAME]_[UP|DN]
        if gid:
            x = "%s_%s_%s" % (sample, "ID", direct)
            x = [x, "na"] + gid
            print >> outhandle, "\t".join(x)
        if gn:
            x = "%s_%s_%s" % (sample, "NAME", direct)
            x = [x, "na"] + gn
            print >> outhandle, "\t".join(x)
    outhandle.close()
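
The sort in this example is a decorate-sort-undecorate pass: rows with a missing value in the sort column get a huge sentinel so they fall to the end, and a direction of -1 flips the order when sorting by fold change instead of p-value. A minimal sketch of that step with invented rows (sort_by_column is a hypothetical helper, not part of the library):

def sort_by_column(rows, index, direction=1, sentinel=1e10):
    # Decorate-sort-undecorate; None values are pushed to the end when
    # direction is 1 (ascending sort).
    decorated = []
    for row in rows:
        value = row[index]
        key = sentinel if value is None else float(value)
        decorated.append((direction * key, row))
    decorated.sort(key=lambda pair: pair[0])
    return [row for _, row in decorated]

rows = [["geneA", 0.20], ["geneB", None], ["geneC", 0.01]]
print(sort_by_column(rows, 1))
# [['geneC', 0.01], ['geneA', 0.2], ['geneB', None]]
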
Example #16
def main():
    import argparse
    import glob
    import itertools

    DEF_PVALUE = 0.05

    parser = argparse.ArgumentParser(
        description="Score a gene set on a gene expression data set.")

    parser.add_argument("expression_files",
                        nargs="+",
                        help="Data set(s) to score.")
    parser.add_argument("-o",
                        dest="outfile",
                        default=None,
                        help="Name of file for results.")
    parser.add_argument("--transpose",
                        action="store_true",
                        help="Transpose the output matrix.")
    parser.add_argument(
        "--pvalue",
        type=float,
        default=DEF_PVALUE,
        help="p-value cutoff for determining significant changes "
        "(default %g)." % DEF_PVALUE)

    parser.add_argument("--libpath",
                        dest="libpath",
                        action="append",
                        default=[],
                        help="Add to the Python library search path.")
    parser.add_argument("-j",
                        dest="num_procs",
                        type=int,
                        default=1,
                        help="Number of jobs to run in parallel.")

    # Assumes that there are no commas in names of gene sets.
    group = parser.add_argument_group(title="Gene Set")
    group.add_argument(
        "--geneset_file",
        dest="geneset_files",
        action="append",
        default=[],
        help="File(s) with gene sets.  Should be in gmx or gmt format.")
    group.add_argument(
        "-g",
        dest="gene_set",
        action="append",
        default=[],
        help="Name of the gene set to score.  If you want to score both "
        "the positively and negatively correlated genes, specify both "
        "gene sets using the format: <positive_geneset>,<negative_geneset>.  "
        "You can use this option multiple times to score more than one gene "
        "set.")
    group.add_argument("--all",
                       dest="all_gene_sets",
                       action="store_true",
                       default=False,
                       help="Score all gene sets in the files.")
    group.add_argument(
        "--any_matching",
        dest="any_matching_gene_sets",
        action="store_true",
        default=False,
        help="Score gene sets in the files that matches these genes.")
    group.add_argument("--automatch",
                       action="store_true",
                       default=False,
                       help="Will match _UP with _DN (or _DOWN).")

    group = parser.add_argument_group(
        title="Genes", description="Add gene expression profiles to output.")
    group.add_argument(
        "--genes",
        default=[],
        action="append",
        help="Comma-separated list of IDs (e.g. probes, gene names) "
        "to include.")

    args = parser.parse_args()
    assert args.expression_files, \
           "Please specify an expression data set to score."
    expression_files = []
    for x in args.expression_files:
        xg = glob.glob(x)
        assert xg, "I could not find the expression file: %s" % x
        expression_files.extend(xg)
    for x in expression_files:
        assert os.path.exists(x), \
           "I could not find the expression file: %s" % x
    assert args.outfile, "Please specify the name of an outfile."

    if args.num_procs < 1 or args.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    assert args.pvalue > 0 and args.pvalue <= 1, \
           "Invalid pvalue %g" % args.pvalue

    assert args.geneset_files, "Please specify one or more geneset files."
    for x in args.geneset_files:
        assert os.path.exists(x), "I could not find the gene set file: %s" % x
    assert args.all_gene_sets or args.gene_set or args.any_matching_gene_sets,\
           "Please specify one or more gene sets to score."
    if args.all_gene_sets:
        assert not args.gene_set and not args.any_matching_gene_sets
    if args.any_matching_gene_sets:
        assert not args.gene_set and not args.all_gene_sets

    #if args.num_procs > 1:
    #    raise NotImplementedError, "Doesn't work.  Matrix class decorator."

    if args.libpath:
        sys.path = args.libpath + sys.path
    # Import after the library path is set.
    #import time
    import multiprocessing
    from genomicode import genesetlib
    from genomicode import genepattern
    from genomicode import jmath

    #start_time = time.time()

    genepattern.fix_environ_path()

    gene_names = _parse_gene_names(args.genes)

    msg = "Reading gene set file."
    if len(args.geneset_files) > 1:
        msg = "Reading gene set files."
    print msg
    sys.stdout.flush()
    geneset2genes = {}  # name -> list of genes
    for filename in args.geneset_files:
        for x in genesetlib.read_genesets(filename):
            name, description, genes = x
            assert name not in geneset2genes, "Duplicate geneset: %s." % name
            geneset2genes[name] = genes

    genesets = args.gene_set
    if args.all_gene_sets or args.any_matching_gene_sets:
        genesets = sorted(geneset2genes)
    if args.automatch:
        genesets = match_gene_sets(genesets)
    #genesets = genesets[:10]

    matrix_names = [os.path.split(x)[1] for x in expression_files]

    print "Setting up jobs."
    sys.stdout.flush()
    ignore_gene_not_found = args.any_matching_gene_sets
    # list of gs_name, pos_genes, neg_genes, matrix_name, matrix_file
    # list of gene_name, None, None, matrix_name, matrix_file
    jobs = []
    for geneset in genesets:
        pos_gs, neg_gs = _parse_geneset(geneset)
        assert pos_gs in geneset2genes, \
               "I could not find gene set: %s" % pos_gs
        if neg_gs:
            assert neg_gs in geneset2genes, \
                   "I could not find gene set: %s" % neg_gs
        gs_name = pos_gs
        if neg_gs:
            gs_name = "%s/%s" % (pos_gs, neg_gs)

        pos_genes = geneset2genes[pos_gs]
        neg_genes = geneset2genes.get(neg_gs, [])

        if not pos_genes and not neg_genes:
            print "Empty gene set: %s.  Skipping." % gs_name
            continue

        for matrix_name, matrix_file in zip(matrix_names, expression_files):
            x = gs_name, pos_genes, neg_genes, matrix_name, matrix_file, \
                ignore_gene_not_found
            jobs.append(x)
    for name in gene_names:
        for matrix_name, matrix_file in zip(matrix_names, expression_files):
            x = name, None, None, matrix_name, matrix_file, None
            jobs.append(x)

    # Group the jobs into batches such that jobs that use the same
    # matrix are in the same batch.
    batched_jobs = {}  # matrix_file -> list of jobs
    for i in range(len(jobs)):
        batch = jobs[i][4]
        if batch not in batched_jobs:
            batched_jobs[batch] = []
        batched_jobs[batch].append(jobs[i])
    batched_jobs = batched_jobs.values()  # list of list of jobs

    # If there are too many gene sets to score for a file, split it up
    # into multiple batches.  Don't know the tradeoff between reading
    # a file twice and calculating more gene sets.
    while len(batched_jobs) < args.num_procs:
        # Find the largest job and split it into two.
        largest = i_largest = None
        for i in range(len(batched_jobs)):
            nj = len(batched_jobs[i])
            if nj > 1 and nj > largest:
                largest = nj
                i_largest = i
        if largest is None:
            break
        # Split i_largest in half.
        bj = batched_jobs[i_largest]
        i = len(bj) / 2
        j1, j2 = bj[:i], bj[i:]
        batched_jobs[i_largest] = j1
        batched_jobs.append(j2)

    job_str = "jobs"
    if len(jobs) == 1:
        job_str = "job"
    print "Scoring %d %s." % (len(jobs), job_str)
    sys.stdout.flush()
    manager = multiprocessing.Manager()
    lock = manager.Lock()
    pool = multiprocessing.Pool(args.num_procs)

    # (matrix, geneset, index, sample) -> GeneSetScore or GeneScore
    score_dict = {}
    results = []  # AsyncResults
    for batch in batched_jobs:
        fn_args = (batch, )
        fn_keywds = {}
        fn_keywds["lock"] = lock
        if args.num_procs == 1:
            x = score_many(batch)
            score_dict.update(x)
        else:
            x = pool.apply_async(score_many, fn_args, fn_keywds)
            results.append(x)
    pool.close()
    pool.join()
    for x in results:
        x = x.get()
        score_dict.update(x)

    all_matrix_samples = []
    all_genesets = []
    all_genes = []
    for (x, score) in score_dict.iteritems():
        matrix_name, gene_name, index, sample = x
        x = matrix_name, index, sample
        all_matrix_samples.append(x)
        if isinstance(score, GeneSetScore):
            all_genesets.append(gene_name)
        elif isinstance(score, GeneScore):
            all_genes.append(gene_name)
        else:
            raise AssertionError
    all_matrix_samples = sorted({}.fromkeys(all_matrix_samples))
    all_genesets = sorted({}.fromkeys(all_genesets))
    all_genes = sorted({}.fromkeys(all_genes))

    # Format the output.  Columns should be in order:
    # <SAMPLE> <FILE>
    # <GS SCORES> ... <GS DIRECTION> ... <GS PVALUE> ... <GS SIGNIFICANT> ...
    # <GENES> ...
    header = ["SAMPLE", "FILE"]
    x = ["", "direction", "pvalue", "significant"]
    for x in itertools.product(x, all_genesets):
        suffix, name = x
        x = "%s %s" % (name, suffix)
        x = x.strip()
        header = header + [x]
    for g in all_genes:
        header = header + [g]

    output = []
    output.append(header)
    for x in all_matrix_samples:
        matrix, index, sample = x
        #x = [scores[(matrix, x, index, sample)] for x in all_genesets]

        # Get the scores for the gene sets.
        keys = [(matrix, x, index, sample) for x in all_genesets]
        default = GeneSetScore("", "", "", "")
        scores = [score_dict.get(x, default).score for x in keys]
        directs = [score_dict.get(x, default).direction for x in keys]
        pvalues = [score_dict.get(x, default).pvalue for x in keys]
        signifs = []
        for x in zip(directs, pvalues):
            direct, pvalue = x
            x = ""
            if type(pvalue) is type(0.0) and pvalue < args.pvalue:
                x = direct
            signifs.append(x)

        # Get the scores for the genes.
        keys = [(matrix, x, index, sample) for x in all_genes]
        default = GeneScore("")
        gene_scores = [score_dict.get(x, default).score for x in keys]

        x = [sample, matrix] + \
            scores + directs + pvalues + signifs + gene_scores
        assert len(x) == len(header)
        output.append(x)

    if args.transpose:
        output = jmath.transpose(output)

    outhandle = open(args.outfile, 'w')
    for x in output:
        print >> outhandle, "\t".join(map(str, x))
    outhandle.close()

    print "Done."
Example #17
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        from genomicode import filelib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        svm_node = in_data
        filelib.assert_exists_nz(svm_node.identifier)

        cosmic_file = mlib.get_user_option(
            user_options, "cosmic_variants_file", not_empty=True,
            check_file=True)
        
        # Read the variant file.
        SVM = SimpleVariantMatrix.read_as_am(svm_node.identifier)
        CHROM = SVM["______Chrom"]
        POS = SVM["______Pos"]
        POS = [int(x) for x in POS]
        all_coords = {}  # (chrom, pos) -> 1
        for x in zip(CHROM, POS):
            all_coords[x] = 1

        # Read the COSMIC variant file.
        # Chrom  Start  End  GRCh  Count  SNP
        # Mutation CDS  Mutation AA
        # FATHMM prediction  FATHMM score  Mutation somatic status
        coord2info = {}  # (chrom, pos) -> d
        for d in filelib.read_row(cosmic_file, header=1):
            start, end = int(d.Start), int(d.End)
            in_svm = False
            for pos in range(start, end+1):
                if (d.Chrom, pos) in all_coords:
                    in_svm = True
                    break
            if not in_svm:
                continue
            coord2info[(d.Chrom, pos)] = d

        # Align the COSMIC annotations to the matrix.
        cosmic_headers = [
            "SNP", "Num Tumors", "Mutation CDS", "Mutation AA",
            "FATHMM prediction", "FATHMM score", "Mutation somatic status"]
        annotations = []
        for (chrom, pos) in zip(CHROM, POS):
            if (chrom, pos) not in coord2info:
                x = [""] * len(cosmic_headers)
                annotations.append(x)
                continue
            d = coord2info[(chrom, pos)]
            x = d.SNP, d.Count, d.Mutation_CDS, d.Mutation_AA, \
                d.FATHMM_prediction, d.FATHMM_score, \
                d.Mutation_somatic_status
            annotations.append(x)
        # Convert the headers and annotations to SVM format.
        cosmic_headers = ["COSMIC______%s" % x for x in cosmic_headers]
        cosmic_annotations = jmath.transpose(annotations)

        # Make the new SimpleVariantMatrix.
        # Figure out where to put these annotations.
        INDEX = 4
        # If Annovar exists, put after.
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith("ANNOVAR")]
        if I:
            INDEX = max(INDEX, max(I)+1)
        # If SnpEff exists, put after.
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith("SNPEFF")]
        if I:
            INDEX = max(INDEX, max(I)+1)
        headers = SVM.headers[:INDEX] + cosmic_headers + SVM.headers[INDEX:]
        x = [SVM.header2annots[x] for x in SVM.headers_h]
        all_annots = x[:INDEX] + cosmic_annotations + x[INDEX:]
        merged = AnnotationMatrix.create_from_annotations(
            headers, all_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
Example #18
def summarize_factor_scores(file_layout, python, arrayplot, cluster, libpath):
    import os
    import zipfile
    import arrayio
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import archive
    from genomicode import graphlib
    from genomicode import bfrm

    DATA = arrayio.read(file_layout.DATASET)

    param_file = "parameters.txt"
    model = bfrm.read_clean_model(file_layout.BFRM_MODEL,
                                  param_file=param_file)
    num_factors = model["F"].nrow()

    # Load the factor names.
    assert zipfile.is_zipfile(file_layout.BFRM_MODEL)
    s2f = archive.unzip_dict(file_layout.BFRM_MODEL)
    assert "factorids.txt" in s2f, "Missing: factorids.txt"
    zfile = zipfile.ZipFile(file_layout.BFRM_MODEL)
    factor_names = [x.strip() for x in zfile.open(s2f["factorids.txt"])]
    assert len(factor_names) == num_factors

    # sample x factor matrix
    F = arrayio.read(file_layout.BFRM_AF)
    assert F.nrow() == DATA.ncol()
    F_X = jmath.transpose(F._X)

    # F_X contains all factors, including intercept and design.
    # Remove all but the latent factors.
    F_X = F_X[-num_factors:]

    # Sort the factors so they'll be in the same order as the clean
    # model.
    assert len(F_X) == len(model["FACTOR_O"])
    F_X = [F_X[i] for i in model["FACTOR_O"]]
    factor_names = [factor_names[i] for i in model["FACTOR_O"]]

    # Write out the projected factor scores.
    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {}
    col_names = {}
    row_names["xID"] = factor_names
    col_names[SAMPLE_NAME] = DATA.col_names(SAMPLE_NAME)
    M = Matrix.InMemoryMatrix(F_X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    x = graphlib.find_wide_heatmap_size(M.nrow(),
                                        M.ncol(),
                                        min_box_height=10,
                                        min_box_width=10,
                                        max_total_height=768,
                                        max_total_width=1024)
    xpix, ypix = x
    ypix = min(ypix, xpix * 4)
    x = graphlib.plot_heatmap(file_layout.FACTOR_SCORES,
                              file_layout.FACTOR_SCORES_PNG,
                              xpix,
                              ypix,
                              color="bild",
                              show_colorbar=True,
                              show_grid=True,
                              gene_center="mean",
                              gene_normalize="var",
                              gene_label=True,
                              cluster_genes=True,
                              array_label=True,
                              cluster_arrays=True,
                              python=python,
                              arrayplot=arrayplot,
                              cluster=cluster,
                              libpath=libpath)

    # Clean up the cluster files.
    files = [
        file_layout.FACTOR_CDT, file_layout.FACTOR_ATR, file_layout.FACTOR_GTR
    ]
    for filename in files:
        if not os.path.exists(filename):
            continue
        src = filename
        x = os.path.split(filename)[1]
        dst = os.path.join(file_layout.ATTIC, x)
        os.rename(src, dst)