Example #1
def relabel(data_file, rename_file, outfile, user_options):
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    sample_header = mlib.get_user_option(
        user_options, "sample_labels_header", not_empty=True)
    # Make sure sample_header in rename file.
    x = open(rename_file).readline()
    x = x.rstrip("\r\n").split("\t")
    assert sample_header in x, "Missing header (%s): %s" % (
        sample_header, rename_file)

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
    x = "'%s,%s'" % (rename_file, sample_header)
    cmd = [
        "python",
        sq(slice_matrix),
        '--relabel_col_ids', x,
        sq(data_file),
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    return cmd
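A minimal usage sketch for relabel (the file names and the "Sample" header value are hypothetical assumptions, not from the source):
# Hypothetical inputs: a signal matrix, a tab-delimited rename file that
# contains a "Sample" column, and the user_options dict Betsy passes in.
user_options = {"sample_labels_header": "Sample"}
cmd = relabel(
    "signal.txt", "rename.txt", "signal.relabeled.txt", user_options)
print cmd  # the shell command that was run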
Example #2
def count_duplicates(bam_filename):
    # Return a tuple of (total_reads, duplicated_reads).
    import subprocess
    from genomicode import samtools
    from Betsy import module_utils as mlib

    samtools_bin = mlib.get_config("samtools", which_assert_file=True)
    cmd = [
        samtools_bin,
        "view",
        bam_filename,
        ]
    total = num_dup = 0
    p = subprocess.Popen(
        cmd, bufsize=0, stdin=subprocess.PIPE,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    w, r = p.stdin, p.stdout
    w.close()
    for align in samtools.parse_sam(r):
        if align.flag & 0x400:
            num_dup += 1
        total += 1
    return total, num_dup
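A short sketch of calling count_duplicates (the BAM path is a hypothetical assumption; flag 0x400 is the SAM duplicate bit):
total, num_dup = count_duplicates("sample01.bam")  # hypothetical BAM file
if total:
    print "%d of %d reads (%.1f%%) are duplicates" % (
        num_dup, total, 100.0*num_dup/total)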
Example #3
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        """Preprocess the input file with RMA using preprocess.py,
        generating an output file."""
        import os
        import subprocess
        from Betsy import module_utils as mlib

        in_data = antecedents
        # Preprocess the CEL files to a text signal file.
        PREPROCESS_BIN = mlib.get_config("preprocess", which_assert_file=True)
        #PREPROCESS_path = config.preprocess
        #PREPROCESS_BIN = filelib.which(PREPROCESS_path)
        #assert PREPROCESS_BIN, 'cannot find the %s' % PREPROCESS_path
        command = ['python', PREPROCESS_BIN, 'RMA', in_data.identifier]
        process = subprocess.Popen(
            command, shell=False,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        error_message = process.communicate()[1]
        if error_message:
            if "Loading required package: Biobase" not in error_message:
                raise ValueError(error_message)

        outputfiles = os.listdir(".")
        outputfile = None
        for i in outputfiles:
            if i.endswith('.rma'):
                outputfile = i

        assert outputfile, "No output file created."
        os.rename(outputfile, outfile)
Example #4
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import os
        from genomicode import filelib
        from genomicode import sortlib
        from Betsy import module_utils as mlib

        # Should be a folder of fastqc results.
        fastqc_path = in_data.identifier

        # Find all the FASTQC results.
        x = filelib.list_files_in_path(fastqc_path, endswith="summary.txt")
        x = [os.path.split(x)[0] for x in x]
        paths = x
        assert paths, "No FASTQC files found."

        # Read the results.
        all_results = [read_fastqc_results(x) for x in paths]
        assert all_results

        # Make table where the rows are the samples and the columns
        # are the statistics.
        sample2results = {}
        for x in all_results:
            assert x.sample not in sample2results
            sample2results[x.sample] = x
        all_statistics = all_results[0].statistics_order
        all_samples = sortlib.sort_natural(sample2results)

        table = []
        header = [
            "Sample", "Total Sequences", "Filtered Sequences",
            "Sequence length", "GC"
        ] + all_statistics
        table.append(header)
        for sample in all_samples:
            results = sample2results[sample]
            x1 = [sample]
            x2 = [
                results.total_sequences, results.filtered_sequences,
                results.sequence_length, results.percent_gc
            ]
            x3 = [results.statistics[x] for x in all_statistics]
            x = x1 + x2 + x3
            assert len(x) == len(header)
            table.append(x)

        # Write out the table as text file.
        TXT_FILE = "fastqc_summary.txt"
        handle = open(TXT_FILE, 'w')
        for x in table:
            print >> handle, "\t".join(map(str, x))
        handle.close()

        x = mlib.get_config("txt2xls", which_assert_file=True, quote=True)
        os.system("%s -b %s > %s" % (x, TXT_FILE, outfile))
        filelib.assert_exists_nz(outfile)
Example #5
def make_gsea_command(expression_file, class_label_file, gsea_path, name1,
                      name2, indexes1, indexes2, permutation_type, database):
    # indexes should be 1-based, not including headers.
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import parselib
    from Betsy import module_utils as mlib
    from Betsy.rules import GSEAAnalysis

    filelib.assert_exists_nz(expression_file)
    filelib.assert_exists_nz(class_label_file)
    assert permutation_type in GSEAAnalysis.GSEA_PERMUTATION
    assert database in GSEAAnalysis.GSEA_DATABASE

    ranges1 = [(i, i + 1) for i in indexes1]
    ranges2 = [(i, i + 1) for i in indexes2]
    indexes1_str = parselib.unparse_ranges(ranges1)
    indexes2_str = parselib.unparse_ranges(ranges2)

    gsea = mlib.get_config("gsea", which_assert_file=True)

    sq = parallel.quote
    cmd = [
        sq(gsea),
        "--name1",
        name1,
        "--name2",
        name2,
        "--indexes1",
        indexes1_str,
        "--indexes2",
        indexes2_str,
        "--permutation_type",
        sq(permutation_type),
        "--database",
        sq(database),
        "--min_match_score",
        0.80,
        "--clobber",
        sq(expression_file),
        sq(gsea_path),
    ]
    cmd = " ".join(map(str, cmd))
    return cmd
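A hedged sketch of running the command string returned above (all arguments are hypothetical placeholders; permutation_type and database must be members of GSEAAnalysis.GSEA_PERMUTATION and GSEAAnalysis.GSEA_DATABASE):
from genomicode import parallel

# Hypothetical inputs.  indexes1/indexes2 are 1-based column numbers,
# not including headers.
cmd = make_gsea_command(
    "signal.txt", "class_labels.cls", "gsea_out", "normal", "tumor",
    [1, 2, 3], [4, 5, 6], "phenotype", "c2.all")
parallel.sshell(cmd)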
Example #6
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import os
        import arrayio
        from genomicode import jmath
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        metadata = {}

        norm_para = ["variance", "sum_of_squares"]
        assert "gene_normalize" in out_attributes
        normalize = out_attributes["gene_normalize"]
        assert normalize in norm_para, \
               "Invalid normalize option: %s" % normalize

        if normalize == "variance":
            f = open(outfile, 'w')
            M = arrayio.read(in_data.identifier, format=arrayio.pcl_format)
            M_n = jmath.safe_norm_mv(M.slice())
            M._X = M_n
            M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
            arrayio.pcl_format.write(M_c, f)
            f.close()
        elif normalize == "sum_of_squares":
            cluster = mlib.get_config("cluster", which_assert_file=True)
            sq = parallel.quote
            cmd = [
                sq(cluster),
                "-f",
                sq(in_data.identifier),
                "-ng",
                "-u",
                outfile,
            ]
            parallel.sshell(cmd)
            metadata["command"] = cmd
            outputfile = outfile + '.nrm'
            filelib.assert_exists_nz(outputfile)
            os.rename(outputfile, outfile)

        filelib.assert_exists_nz(outfile)
        return metadata
Example #7
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        in_data = antecedents
        metadata = {}

        lineplot = mlib.get_config("lineplot", which_assert_file=True)

        gene_names = [
            "ACTB",
            60,  # Human beta actin.
            "TUBB",
            203068,  # Human beta tubulin.
            "Actb",
            22461,  # Mouse beta actin.
            "Tubb4a",
            22153,  # Mouse beta tubulin.
        ]

        infile = in_data.identifier

        sq = parallel.quote
        cmd = [
            sq(lineplot),
            "--gene_names",
            ",".join(map(str, gene_names)),
            "--mar_bottom",
            1.50,
            sq(infile),
            sq(outfile),
        ]
        cmd = " ".join(map(str, cmd))
        parallel.sshell(cmd)
        metadata["commands"] = [cmd]

        filelib.assert_exists_nz(outfile)
        return metadata
Example #8
def _make_config_file(config_filename, skip_depth_filter=False):
    import os
    from genomicode import filelib
    from Betsy import module_utils as mlib

    strelka_path = mlib.get_config("strelka", assert_exists=True)

    src_config = os.path.join(strelka_path, "etc",
                              "strelka_config_bwa_default.ini")
    filelib.assert_exists_nz(src_config)
    lines = open(src_config).readlines()
    assert lines

    # Edit configure options.
    for i in range(len(lines)):
        x = lines[i]
        x = x.strip()
        line = x

        # Make sure skip_depth_filter is correct.
        # isSkipDepthFilters should be set to 1 to skip depth
        # filtration for whole exome or other targeted sequencing data
        #
        # isSkipDepthFilters = 0
        if line.startswith("isSkipDepthFilters"):
            # isSkipDepthFilters = 0
            x = line.split()
            assert len(x) == 3
            assert x[1] == "="
            assert x[2] in ["0", "1"]
            if skip_depth_filter:
                x[2] = "1"
            else:
                x[2] = "0"
            line = " ".join(x)
        lines[i] = line

    lines = [x + "\n" for x in lines]  # replace newline that was stripped.
    open(config_filename, 'w').writelines(lines)
Example #9
def make_snpeff_command(in_file,
                        genome,
                        out_file,
                        log_file,
                        is_cancer=False,
                        cancer_samples_file=None):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    if is_cancer:
        filelib.assert_exists_nz(cancer_samples_file)

    path = mlib.get_config("snp_eff_path", which_assert_file=True)
    snpeff = os.path.join(path, "snpEff.jar")
    filelib.assert_exists_nz(snpeff)

    sq = parallel.quote
    cmd = [
        "java",
        "-Xmx16g",
        "-jar",
        sq(snpeff),
    ]
    if is_cancer:
        cmd += [
            "-cancer",
            "-cancerSamples",
            sq(cancer_samples_file),
        ]
    cmd += [
        sq(genome),
        sq(in_file),
    ]
    cmd = " ".join(cmd)
    cmd = "%s 1> %s 2> %s" % (cmd, sq(out_file), sq(log_file))
    return cmd
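A usage sketch (the genome name and file paths are hypothetical; the returned string contains shell redirection, so it is run through a shell as in the other examples):
from genomicode import parallel

# "GRCh37.75" is a hypothetical stand-in for an installed snpEff genome.
cmd = make_snpeff_command(
    "variants.vcf", "GRCh37.75", "variants.annotated.vcf", "snpeff.log")
parallel.sshell(cmd)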
Example #10
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib
        
        in_data = antecedents
        metadata = {}


        #M = arrayio.read(in_data.identifier)
        #data = jmath.transpose(M._X)
        #tickname = M._col_names['_SAMPLE_NAME']
        #fig = mplgraph.boxplot(
        #    data,
        #    xlabel='Sample Name',
        #    ylabel='Signal',
        #    title='Signal Intensity',
        #    box_label=tickname)
        #fig.savefig(outfile)

        boxplot = mlib.get_config("boxplot", which_assert_file=True)

        sq = parallel.quote
        cmd = [
            sq(boxplot),
            sq(in_data.identifier),
            sq(outfile),
            ]
        cmd = " ".join(map(str, cmd))
        parallel.sshell(cmd)

        metadata["commands"] = [cmd]
        
        filelib.assert_exists_nz(outfile)
        
        return metadata
Example #11
def list_snpeff_databases():
    import os
    import StringIO
    from genomicode import parallel
    from genomicode import filelib
    from Betsy import module_utils as mlib

    path = mlib.get_config("snp_eff_path", which_assert_file=True)
    snpeff = os.path.join(path, "snpEff.jar")
    filelib.assert_exists_nz(snpeff)

    # Genome    Organism    Status    Bundle    Database download link
    # ------    --------    ------    ------    ----------------------
    sq = parallel.quote
    cmd = [
        "java",
        "-Xmx16g",
        "-jar",
        sq(snpeff),
        "databases",
    ]
    output = parallel.sshell(cmd)
    header = i_db = None
    databases = []
    for cols in filelib.read_cols(StringIO.StringIO(output)):
        cols = [x.strip() for x in cols]
        if header is None:
            header = cols
            assert "Genome" in header
            i_db = header.index("Genome")
            continue
        assert len(cols) == len(header)
        if cols[0].startswith("---"):
            continue
        db_name = cols[i_db]
        databases.append(db_name)
    return databases
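A short sketch using the returned list (the genome name checked is a hypothetical assumption):
databases = list_snpeff_databases()
# Hypothetical check that a particular snpEff genome is available.
assert "GRCh37.75" in databases, "Missing snpEff database: GRCh37.75"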
Example #12
def _make_analysis_directory(analysis_path, config_file, reference_fa,
                             normal_bam, tumor_bam):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    filelib.assert_exists_nz(config_file)
    filelib.assert_exists_nz(reference_fa)
    filelib.assert_exists_nz(normal_bam)
    filelib.assert_exists_nz(tumor_bam)

    strelka_path = mlib.get_config("strelka", assert_exists=True)
    config_pl = os.path.join(strelka_path, "bin",
                             "configureStrelkaWorkflow.pl")
    filelib.assert_exists_nz(config_pl)

    # $STRELKA/bin/configureStrelkaWorkflow.pl \
    #   --normal=../test31.bam --tumor=../test32.bam \
    #   --ref=../genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   --config=./config.ini --output-dir=./myAnalysis
    sq = mlib.sq
    cmd = [
        sq(config_pl),
        "--normal",
        sq(normal_bam),
        "--tumor",
        sq(tumor_bam),
        "--ref",
        sq(reference_fa),
        "--config",
        sq(config_file),
        "--output-dir",
        sq(analysis_path),
    ]
    cmd = " ".join(cmd)
    parallel.sshell(cmd)
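A sketch chaining this helper with _make_config_file from Example #8 (the file names are hypothetical; skip_depth_filter=True follows that function's comment about whole-exome or targeted data):
# Hypothetical Strelka setup: write the config, then configure the
# analysis directory from it.
_make_config_file("config.ini", skip_depth_filter=True)
_make_analysis_directory(
    "myAnalysis", "config.ini", "Homo_sapiens_assembly19.fa",
    "normal.bam", "tumor.bam")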
Example #13
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        metadata = {}
        
        center_alg = {
            'mean': 'a',
            'median': 'm',
            }
        assert "gene_center" in out_attributes
        center = out_attributes['gene_center']
        assert center in center_alg, "Invalid center option: %s" % center
        center_parameter = center_alg[center]

        cluster = mlib.get_config("cluster", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            sq(cluster),
            "-f", sq(in_data.identifier),
            "-cg", center_parameter,
            "-u", outfile,
            ]
        cmd = " ".join(map(str, cmd))
        parallel.sshell(cmd)
        metadata["commands"] = [cmd]
        
        outputfile = outfile + '.nrm'
        filelib.assert_exists_nz(outputfile)
        os.rename(outputfile, outfile)

        return metadata
Example #14
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        import shutil
        #from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        ref_node, gene_node = antecedents
        # Don't copy the whole path.  Just get the fasta file.
        #ref = alignlib.standardize_reference_genome(
        #    ref_node.identifier, out_path, use_symlinks=True)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        gtf_file = gene_node.identifier
        filelib.assert_exists_nz(gtf_file)

        # Symlink the fasta file into the out path.
        filelib.safe_mkdir(out_path)
        x = os.path.join(out_path, ref.fasta_file)
        os.symlink(ref.fasta_file_full, x)

        # rsem-prepare-reference --bowtie --bowtie2 --gtf gtf02.gtf
        #   <reference.fa> <reference_name>
        # <reference_name>.[1234].ebwt    # Bowtie1.
        # <reference_name>.rev.[12].ebwt
        # <reference_name>.[1234].bt2     # Bowtie2.
        # <reference_name>.rev.[12].bt2
        # <reference_name>.chrlist        # RSEM.
        # <reference_name>.grp
        # <reference_name>.idx.fa
        # <reference_name>.n2g.idx.fa
        # <reference_name>.seq
        # <reference_name>.ti
        # <reference_name>.transcripts.fa
        # chrLength.txt                   # STAR
        # chrNameLength.txt
        # chrName.txt
        # chrStart.txt
        # exonGeTrInfo.tab
        # exonInfo.tab
        # gencode.vM8.annotation.gtf
        # geneInfo.tab
        # Genome
        # genomeParameters.txt
        # SA
        # SAindex
        # sjdbInfo.txt
        # sjdbList.fromGTF.out.tab
        # sjdbList.out.tab
        # transcriptInfo.tab

        rsem_prepare = mlib.get_config("rsem_prepare", which_assert_file=True)
        bowtie = mlib.get_config("bowtie", which_assert_file=True)
        bowtie2 = mlib.get_config("bowtie2", which_assert_file=True)
        STAR = mlib.get_config("STAR", which_assert_file=True)

        # RSEM wants the path that contains the executables.
        bowtie = os.path.split(bowtie)[0]
        bowtie2 = os.path.split(bowtie2)[0]
        STAR = os.path.split(STAR)[0]

        sq = parallel.quote
        cmd = [
            sq(rsem_prepare),
            "--num-threads", num_cores,
            "--bowtie",
            "--bowtie-path", sq(bowtie),
            "--bowtie2",
            "--bowtie2-path", sq(bowtie2),
            "--star",
            "--star-path", sq(STAR),
            "--gtf", sq(gtf_file),
            sq(ref.fasta_file_full),
            ref.name,
            ]
        parallel.sshell(cmd, path=out_path)

        # Copy the GTF file into the output path.
        shutil.copy2(gtf_file, out_path)

        assembly = ref.name
        # Check to make sure index was created successfully.
        x1 = ["%s.%d.ebwt" % (assembly, i+1) for i in range(4)]
        x2 = ["%s.rev.%d.ebwt" % (assembly, i+1) for i in range(2)]
        x3 = ["%s.%d.bt2" % (assembly, i+1) for i in range(4)]
        x4 = ["%s.rev.%d.bt2" % (assembly, i+1) for i in range(2)]
        x5 = [
            "%s.chrlist" % assembly,
            "%s.grp" % assembly,
            "%s.idx.fa" % assembly,
            "%s.n2g.idx.fa" % assembly,
            "%s.seq" % assembly,
            "%s.ti" % assembly,
            "%s.transcripts.fa" % assembly,
            ]
        x6 = [
            "chrLength.txt",
            "chrNameLength.txt",
            "chrName.txt",
            "chrStart.txt",
            "exonGeTrInfo.tab",
            "exonInfo.tab",
            "gencode.vM8.annotation.gtf",
            "geneInfo.tab",
            "Genome",
            "genomeParameters.txt",
            "SA",
            "SAindex",
            "sjdbInfo.txt",
            "sjdbList.fromGTF.out.tab",
            "sjdbList.out.tab",
            "transcriptInfo.tab",
            ]
        x = x1 + x2 + x3 + x4 + x5 + x6
        index_files = [os.path.join(out_path, f) for f in x]
        filelib.assert_exists_nz_many(index_files)
Example #15
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out version.

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #          vcf_outfile)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                vcf_outfile
            jobs.append(x)

        # bam-somaticsniper -q 1 -Q 15 -G -L -F vcf \
        #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   test31/tumor.bam test31/normal.bam test41.vcf
        somaticsniper = mlib.get_config("somaticsniper",
                                        which_assert_file=True)

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           vcf_outfile = x

            x = [
                sq(somaticsniper),
                "-q",
                1,
                "-Q",
                15,
                "-G",
                "-L",
                "-F",
                "vcf",
                "-f",
                sq(ref.fasta_file_full),
                sq(cancer_bamfile),
                sq(normal_bamfile),
                sq(vcf_outfile),
            ]
            x = " ".join(map(str, x))
            commands.append(x)
        # Not sure how much RAM this takes.
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # SomaticSniper names the samples "NORMAL" and "TUMOR".
        # Replace them with the actual names.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           vcf_outfile = x
            call_somatic_varscan._fix_normal_cancer_names(
                vcf_outfile, normal_sample, cancer_sample)

        x = [x[-1] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
Example #16
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import StringIO
        import arrayio
        from genomicode import arrayplatformlib
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        M = arrayio.read(in_data.identifier)
        metadata = {}

        # Add GENE_ID, GENE_SYMBOL, and DESCRIPTION.  Figure out which
        # platforms provide each one of this.
        CATEGORIES = [
            arrayplatformlib.GENE_ID,
            arrayplatformlib.GENE_SYMBOL,
            # biomaRt doesn't convert description.  So just ignore it
            # for now.
            # TODO: implement DESCRIPTION.
            #arrayplatformlib.DESCRIPTION,
        ]

        #all_platforms = arrayplatformlib.identify_all_platforms_of_matrix(M)
        #assert all_platforms, "Unknown platform: %s" % in_data.identifier
        #header, platform_name = all_platforms[0]
        scores = arrayplatformlib.score_matrix(M)
        scores = [x for x in scores if x.max_score >= 0.75]
        assert scores, "I could not identify any platforms."

        # Find all the platforms not in the matrix.
        platforms = [
            arrayplatformlib.find_platform_by_name(x.platform_name)
            for x in scores
        ]
        categories = [x.category for x in platforms]
        missing = [x for x in CATEGORIES if x not in categories]

        score = scores[0]
        platform = platforms[0]
        to_add = []  # list of platform names
        for category in missing:
            x = arrayplatformlib.PLATFORMS
            x = [x for x in x if x.category == category]
            x = [x for x in x if x.bm_organism == platform.bm_organism]
            x = [x for x in x if x.name != score.platform_name]
            # Take the first one, if any.
            if x:
                to_add.append(x[0].name)

        if to_add:
            annotate = mlib.get_config("annotate_matrix",
                                       which_assert_file=True)
            sq = parallel.quote
            cmd = [
                "python",
                sq(annotate),
                "--no_na",
                "--header",
                sq(score.header),
            ]
            for x in to_add:
                x = ["--platform", sq(x)]
                cmd.extend(x)
            cmd.append(in_data.identifier)
            cmd = " ".join(cmd)
            data = parallel.sshell(cmd)
            metadata["commands"] = [cmd]
            assert data.find("Traceback") < 0, data
        else:
            data = open(in_data.identifier).read()

        # Clean up the headers.
        platform2pretty = {
            "Entrez_ID_human": "Gene ID",
            "Entrez_Symbol_human": "Gene Symbol",
            "Entrez_ID_mouse": "Gene ID",
            "Entrez_Symbol_mouse": "Gene Symbol",
        }
        handle = open(outfile, 'w')
        header_written = False
        for cols in filelib.read_cols(StringIO.StringIO(data)):
            if not header_written:
                cols = [platform2pretty.get(x, x) for x in cols]
                cols = AnnotationMatrix.uniquify_headers(cols)
                header_written = True
            print >> handle, "\t".join(cols)

        handle.close()

        return metadata