Example #1
0
def find_cluster_files(cluster_path):
    """Locate the clustering result files stored under cluster_path.

    Returns a dictionary that maps a short type name to the full path
    of the matching file:
      "cdt" : <cluster_path>/signal.cdt
      "atr" : <cluster_path>/array_tree.atr
      "gtr" : <cluster_path>/gene_tree.gtr
      "kag" : <cluster_path>/array_cluster.kag
      "kgg" : <cluster_path>/gene_cluster.kgg
    A key is only present when filelib.exists_nz accepts its file.  The
    "cdt" entry is mandatory; an AssertionError is raised without it.
    """
    import os
    from genomicode import filelib

    filelib.assert_exists(cluster_path)

    # (dictionary key, file name expected inside cluster_path)
    expected = [
        ("cdt", "signal.cdt"),
        ("atr", "array_tree.atr"),
        ("gtr", "gene_tree.gtr"),
        ("kag", "array_cluster.kag"),
        ("kgg", "gene_cluster.kgg"),
    ]

    cluster_files = {}
    for key, basename in expected:
        full_path = os.path.join(cluster_path, basename)
        if filelib.exists_nz(full_path):
            cluster_files[key] = full_path

    assert "cdt" in cluster_files, "No clustered file."

    return cluster_files
Example #2
0
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Annotate a VCF file with the script configured as annotate_vcf.

     antecedents is the single input node; its identifier is passed to
     the script as the input file.  out_attributes['ref'] is passed as
     the -species argument and outfile as the -o argument.

     Raises ValueError (carrying the captured stderr) when the text
     'error' appears in the script's stderr, and AssertionError when
     filelib.exists_nz rejects outfile afterwards.
     """
     import subprocess
     from genomicode import config
     from genomicode import filelib

     in_data = antecedents
     species = out_attributes['ref']
     annotate_BIN = config.annotate_vcf
     command = ['python', annotate_BIN, in_data.identifier, '-o', outfile,
                '-species', species]
     process = subprocess.Popen(command,
                                shell=False,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
     # communicate() drains both pipes while it waits for the child.
     # Calling wait() first can deadlock once the child fills a pipe
     # buffer, so communicate() is the only wait we do.
     error_message = process.communicate()[1]
     if 'error' in error_message:
         raise ValueError(error_message)

     assert filelib.exists_nz(outfile), (
         'the output file %s for annot_vcf_file fails' % outfile
     )
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Build a bowtie2 index for the reference genome in in_data.

        The reference is first standardized into out_path (using
        symlinks), then bowtie2-build is run from out_path with the
        reference name as the output stem.  Raises AssertionError when
        filelib.exists_nz rejects <out_path>/<name>.1.bt2 afterwards.
        """
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        builder = filelib.which_assert(config.bowtie2_build)
        genome = alignlib.standardize_reference_genome(
            in_data.identifier, out_path, use_symlinks=True)

        # Command line:
        #   bowtie2-build <ref.fa> <output_stem>
        # Files it makes:
        #   <output_stem>.[1234].bt2
        #   <output_stem>.rev.[12].bt2
        quote = parallel.quote
        build_cmd = [quote(builder), quote(genome.fasta_file_full)]
        build_cmd.append(genome.name)
        parallel.sshell(build_cmd, path=out_path)

        # The first index file stands in for the whole index when
        # checking that the build worked.
        first_index = os.path.join(out_path, "%s.1.bt2" % genome.name)
        assert filelib.exists_nz(first_index)
Example #4
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Copy the CEL files of version cc1, v3 or v4 into outfile.

        antecedents is the single input node; its identifier is a
        folder or zip file (unpacked with module_utils.unzip_if_zip).
        outfile is a directory, created here if it does not exist.
        Every entry except '.DS_Store' is passed to
        affyio.guess_cel_version and copied when the reported version
        is one of 'cc1', 'v3', 'v4'.

        Raises AssertionError when the input is empty or when
        filelib.exists_nz rejects outfile, and ValueError when no entry
        was recognized as a CEL file.
        """
        import os
        import shutil
        from Betsy import module_utils
        from genomicode import affyio
        from genomicode import filelib

        in_data = antecedents
        directory = module_utils.unzip_if_zip(in_data.identifier)
        filenames = os.listdir(directory)
        assert filenames, 'The input folder or zip file is empty.'
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        found_cel = False
        for filename in filenames:
            if filename == '.DS_Store':
                continue
            fileloc = os.path.join(directory, filename)
            if affyio.guess_cel_version(fileloc) in ('cc1', 'v3', 'v4'):
                shutil.copyfile(fileloc, os.path.join(outfile, filename))
                found_cel = True

        if not found_cel:
            # This used to be "assert ValueError(...)", which always
            # passes because an exception instance is truthy.  Raise so
            # the missing input is actually reported.
            raise ValueError('There is no cel file in the input.')

        assert filelib.exists_nz(outfile), (
            'the output file %s for extract_CEL_files fails' % outfile)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Shift-scale normalize the first class against the second.

        antecedents is a (data_node, cls_node) pair.  The class label
        file must describe exactly two classes.  The columns of the
        first class are normalized against the columns of the second
        with shiftscalenorm.normalize, written back into the full
        matrix, and the matrix is saved to outfile in tab-delimited
        format.  Nothing is written when either node is false.

        Always returns False.  Raises AssertionError when the label
        file does not hold two classes or when filelib.exists_nz
        rejects outfile.
        """
        from genomicode import shiftscalenorm
        import arrayio
        from Betsy import read_label_file
        from genomicode import filelib

        data_node, cls_node = antecedents
        if data_node and cls_node:
            result, label_line, second_line = read_label_file.read(
                cls_node.identifier)
            assert len(
                result) == 2, 'for shiftscale,there should be only 2 classes'
            M = arrayio.read(data_node.identifier)
            index1 = result[0][0]
            index2 = result[1][0]
            M_1 = M.matrix(None, index1)
            M_2 = M.matrix(None, index2)
            M_y = shiftscalenorm.normalize(M_1, M_2)

            # Replace values that came out as NaN with the value in the
            # first column of the same row of the second class.
            num_rows, num_cols = M_y.dim()[0], M_y.dim()[1]
            for i in range(num_rows):
                for j in range(num_cols):
                    if str(M_y._X[i][j]) == 'nan':
                        M_y._X[i][j] = M_2._X[i][0]

            # Put the normalized columns back into the original matrix.
            for j in range(M.nrow()):
                for i, column in enumerate(index1):
                    M._X[j][column] = M_y._X[j][i]

            # "with" closes the handle even if the write fails.
            with open(outfile, 'w') as f:
                arrayio.tab_delimited_format.write(M, f)
            assert filelib.exists_nz(outfile), (
                'the output file %s for shiftscale fails' % outfile)

        return False
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Run the configured analyze_phenotype script for phenotype EMT.

        antecedents is a (data_node, cel_node) pair whose identifiers
        are passed as the two positional arguments.  outfile is a
        directory (created if absent) and results use the stem
        <outfile>/EMT.  user_options must contain "geneset_value".

        Raises AssertionError when the script is missing, the option is
        absent, the script writes anything to stderr, or
        filelib.exists_nz rejects outfile.
        """
        import os
        import subprocess
        from genomicode import config
        from genomicode import filelib

        data_node, cel_node = antecedents
        phenotype_BIN = config.analyze_phenotype
        assert os.path.exists(phenotype_BIN)
        assert "geneset_value" in user_options, 'no geneset are provided'
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        command = ['python', phenotype_BIN]
        command += ['--phenotype', 'EMT']
        command += ['--ignore_samples', 'shCDH1,1']
        command += ['--gene', user_options['geneset_value']]
        command += ['-o', outfile + '/EMT']
        command += [data_node.identifier, cel_node.identifier]

        child = subprocess.Popen(
            command, shell=False,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        captured = child.communicate()
        stderr_text = captured[1]
        # Any output on stderr is treated as a failure.
        assert not stderr_text, stderr_text
        assert filelib.exists_nz(outfile), (
            'the output file %s for analyze_phenotype fails' % outfile)
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Normalize a data set with the script configured as combatnorm.

     antecedents is a (data_node, cls_node) pair.  When either node is
     false nothing is run.  Otherwise the class label file must hold at
     least two classes, and the script is called with the data file
     (-f), the output file (-o) and the label file (-label).

     Always returns False.  Raises AssertionError for too few classes,
     a missing script, or an output that filelib.exists_nz rejects, and
     ValueError when the script writes anything to stderr.
     """
     import subprocess
     from Betsy import read_label_file
     from Betsy import module_utils
     from genomicode import filelib
     from genomicode import config

     data_node, cls_node = antecedents
     if not (data_node and cls_node):
         return False

     classes, label_line, second_line = read_label_file.read(
         cls_node.identifier)
     assert len(
         classes) >= 2, 'for combat,there should be equal or larger than 2 classes'

     combat_path = config.combatnorm
     combat_BIN = module_utils.which(combat_path)
     assert combat_BIN, 'cannot find the %s' % combat_path

     command = ['python', combat_BIN]
     command.extend(['-f', data_node.identifier])
     command.extend(['-o', outfile])
     command.extend(['-label', cls_node.identifier])
     child = subprocess.Popen(
         command, shell=False,
         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     stderr_text = child.communicate()[1]
     if stderr_text:
         raise ValueError(stderr_text)
     assert filelib.exists_nz(outfile), (
         'the output file %s for combat fails' % outfile
     )
     return False
Example #8
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Filter variants with "bcftools view <in> | vcfutils varFilter".

        antecedents is the single input node; its identifier is the
        file handed to "bcftools view".  The output of that command is
        piped into "<vcfutils> varFilter -D500", whose stdout is written
        to outfile.

        Raises AssertionError when bcftools cannot be found or when
        filelib.exists_nz rejects outfile, and ValueError (carrying the
        captured stderr of both commands) when the text 'error' appears
        in it.
        """
        import subprocess
        import tempfile
        from genomicode import filelib
        from Betsy import module_utils
        from genomicode import config

        in_data = antecedents
        bcftools_BIN = config.bcftools
        bcftools_module = module_utils.which(bcftools_BIN)
        assert bcftools_module, 'cannot find the %s' % bcftools_BIN
        vcfutils_BIN = config.vcfutils

        # With shell=False a literal '|' is just another argument to
        # bcftools, so the pipeline has to be built from two processes.
        view_cmd = [bcftools_BIN, 'view', in_data.identifier]
        filter_cmd = [vcfutils_BIN, 'varFilter', '-D500']

        # stderr of the first command goes to a temporary file rather
        # than a pipe, so it cannot block on a full pipe buffer while we
        # are waiting for the second command.
        view_stderr = tempfile.TemporaryFile()
        try:
            with open(outfile, 'w') as f:
                view = subprocess.Popen(view_cmd,
                                        shell=False,
                                        stdout=subprocess.PIPE,
                                        stderr=view_stderr)
                varfilter = subprocess.Popen(filter_cmd,
                                             shell=False,
                                             stdin=view.stdout,
                                             stdout=f,
                                             stderr=subprocess.PIPE)
                # Close our copy of the read end so bcftools receives
                # SIGPIPE if varFilter exits early.
                view.stdout.close()
                error_message = varfilter.communicate()[1]
                view.wait()
            view_stderr.seek(0)
            error_message = view_stderr.read() + error_message
        finally:
            view_stderr.close()

        if 'error' in error_message:
            raise ValueError(error_message)

        assert filelib.exists_nz(outfile), (
            'the output file %s for filter_vcf_file does not exist' % outfile)
Example #9
0
def plot_line_keywd(filename, keyword, outfile):
    """Draw a line plot of every row of filename that matches keyword.

    A row matches when its annotation under the second row header
    equals keyword.  Each matching row becomes one line, labelled
    "<keyword> (<annotation under the first row header>)".  The figure
    is saved to outfile.

    Raises AssertionError when no row matches or when
    filelib.exists_nz rejects outfile.
    """
    import arrayio
    from genomicode import mplgraph
    from genomicode import filelib

    M = arrayio.read(filename)
    header = M.row_names()
    label = M._col_names['_SAMPLE_NAME']

    data = []
    legend_name = []
    for row in range(M.dim()[0]):
        if M.row_names(header[1])[row] != keyword:
            continue
        data.append(M.slice()[row])
        legend_name.append(
            "%s (%s)" % (keyword, M.row_names(header[0])[row]))
    assert len(data) > 0, 'cannot find the keyword %s in the file %s' % (
        keyword, filename)

    # One list of (x, y) points per matching row.
    lines = [
        [(col, values[col]) for col in range(len(values))]
        for values in data
    ]
    fig = mplgraph.lineplot(
        *lines,
        box_label=label,
        legend=legend_name,
        ylim_min=0,
        ylabel="Signal",
        left=0.1)
    fig.savefig(outfile)
    assert filelib.exists_nz(outfile), 'the plot_line_keywd fails'
Example #10
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Score gene sets with the script configured as score_geneset.

        antecedents is a (data_node, geneset_node) pair.  The script is
        called with -o outfile, --geneset_file, the data file and --all;
        --automatch is appended when out_attributes['automatch'] is
        'yes'.

        Raises AssertionError when the script cannot be found or when
        filelib.exists_nz rejects outfile, and ValueError when the
        script writes anything to stderr.
        """
        import subprocess
        from Betsy import module_utils
        from genomicode import config
        from genomicode import filelib

        data_node, geneset_node = antecedents
        score_geneset_path = config.score_geneset
        score_geneset_BIN = module_utils.which(score_geneset_path)
        assert score_geneset_BIN, 'cannot find the %s' % score_geneset_path

        use_automatch = out_attributes['automatch'] == 'yes'
        command = ['python', score_geneset_BIN, '-o', outfile]
        command += ['--geneset_file', geneset_node.identifier]
        command += [data_node.identifier, '--all']
        if use_automatch:
            command += ['--automatch']

        child = subprocess.Popen(
            command, shell=False,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stderr_text = child.communicate()[1]
        if stderr_text:
            raise ValueError(stderr_text)

        assert filelib.exists_nz(outfile), (
            'the output file %s for score_pathway_with_geneset fails' %
            outfile)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Download TCGA data with the script configured as download_tcga.

        user_options must contain 'disease' and may contain 'date';
        out_attributes['preprocess'] is passed as --data.  After the
        script finishes, every entry of the current directory whose name
        ends with 'tar.gz' is renamed to outfile.

        Raises AssertionError when 'disease' is missing or when
        filelib.exists_nz rejects outfile, and ValueError when the
        script writes anything to stderr.
        """
        import os
        import subprocess
        from genomicode import config
        from genomicode import filelib

        TCGA_BIN = config.download_tcga
        assert 'disease' in user_options

        command = [
            'python', TCGA_BIN,
            '--disease', user_options['disease'],
            '--data', out_attributes['preprocess'],
            '--download_only',
        ]
        if 'date' in user_options:
            command.extend(['--date', user_options['date']])

        child = subprocess.Popen(
            command, shell=False,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stderr_text = child.communicate()[1]
        if stderr_text:
            raise ValueError(stderr_text)

        # The download lands in the current working directory.
        archives = [
            name for name in os.listdir(".") if name.endswith('tar.gz')]
        for archive in archives:
            os.rename(archive, outfile)

        assert filelib.exists_nz(outfile), (
            'the output file %s for download_tcga fails' % outfile)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Extract the fa/fastq files of the input into outfile.

        antecedents is the single input node; its identifier is a
        folder or zip file (unpacked with module_utils.unzip_if_zip).
        outfile is a directory, created here if it does not exist.

        Files ending in 'fa' or 'fastq' are copied as they are.  Files
        ending in 'fa.gz' or 'fastq.gz' are decompressed with
        module_utils.gunzip, copied without the '.gz' suffix, and the
        decompressed temporary is deleted.  Everything else is ignored.

        Raises AssertionError when the input is empty or when
        filelib.exists_nz rejects outfile.
        """
        import os
        import shutil
        from genomicode import filelib
        from Betsy import module_utils

        in_data = antecedents
        directory = module_utils.unzip_if_zip(in_data.identifier)
        filenames = os.listdir(directory)
        assert filenames, 'The input folder or zip file is empty.'
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        plain_suffixes = ('fa', 'fastq')
        gzip_suffixes = tuple(x + '.gz' for x in plain_suffixes)
        for filename in filenames:
            if filename == '.DS_Store':
                continue
            # The names come from listing `directory`, so join against
            # it; in_data.identifier may be the zip file itself.
            fileloc = os.path.join(directory, filename)
            if filename.endswith(gzip_suffixes):
                # Assumes gunzip returns the path of the decompressed
                # file, as the original code used it -- TODO confirm.
                unzipped = module_utils.gunzip(fileloc)
                newfname = os.path.splitext(filename)[0]
                shutil.copyfile(unzipped, os.path.join(outfile, newfname))
                # Only delete what we decompressed ourselves.
                os.remove(unzipped)
            elif filename.endswith(plain_suffixes):
                shutil.copyfile(fileloc, os.path.join(outfile, filename))

        assert filelib.exists_nz(outfile), (
            'the output file %s for extract_rna_files_fastq fails' % outfile)
Example #13
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Sort a SAM/BAM file by coordinate with the configured sortsam jar.

        antecedents is the single input node; its identifier is passed
        as I= and outfile as O=.  The jar is run through
        "java -Xmx5g -jar" with SO=coordinate,
        VALIDATION_STRINGENCY=LENIENT and CREATE_INDEX=true.

        Raises AssertionError when the jar is missing or when
        filelib.exists_nz rejects outfile, and ValueError (carrying the
        captured stderr) when the text 'error' appears in stderr.
        """
        import os
        import subprocess
        from genomicode import config
        from genomicode import filelib

        in_data = antecedents
        sortsam_BIN = config.sortsam
        assert os.path.exists(sortsam_BIN), 'cannot find the %s' % sortsam_BIN
        command = [
            'java', '-Xmx5g', '-jar', sortsam_BIN, 'I=' + in_data.identifier,
            'O=' + outfile, 'SO=coordinate', 'VALIDATION_STRINGENCY=LENIENT',
            'CREATE_INDEX=true'
        ]
        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # communicate() reads both pipes until the child exits.  A
        # wait() before it can deadlock once a pipe buffer fills up.
        error_message = process.communicate()[1]
        if 'error' in error_message:
            raise ValueError(error_message)

        assert filelib.exists_nz(outfile), (
            'the output file %s for sort_sam_file does not exist' % outfile)
Example #14
0
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Run the script configured as scoresig on the input file.

     antecedents is the single input node; its identifier is passed to
     the script as both the -r and the -m argument, together with
     -j 20 and -o outfile.

     Raises AssertionError when the script cannot be found or when
     filelib.exists_nz rejects outfile, and ValueError when the script
     writes anything to stderr.
     """
     import subprocess
     from Betsy import module_utils
     from genomicode import config
     from genomicode import filelib

     source = antecedents.identifier
     scoresig_path = config.scoresig
     scoresig_BIN = module_utils.which(scoresig_path)
     assert scoresig_BIN, 'cannot find the %s' % scoresig_path

     command = ['python', scoresig_BIN]
     command += ['-r', source, '-m', source]
     command += ['-j', '20', '-o', outfile]
     child = subprocess.Popen(
         command, shell=False,
         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     stderr_text = child.communicate()[1]
     if stderr_text:
         raise ValueError(stderr_text)

     assert filelib.exists_nz(outfile), (
         'the output file %s for run_scoresig does not exists' % outfile
     )
Example #15
0
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Drop the rows that have too many missing values.

     antecedents is the single input node.  user_options['filter_value']
     is a percentage; a row is kept when the fraction of its values
     that are None or 'NA' is strictly below that percentage.  The
     remaining rows are written to outfile in tab-delimited format.

     Raises AssertionError when filelib.exists_nz rejects outfile.
     """
     from genomicode import filelib
     import arrayio

     in_data = antecedents
     M = arrayio.read(in_data.identifier)
     # Convert the percentage into a fraction.
     percent = float(user_options['filter_value']) / 100

     num_rows, num_cols = M.dim()[0], M.dim()[1]
     I_good = []
     for i in range(num_rows):
         missing_count = sum(
             1 for j in range(num_cols) if M._X[i][j] in [None, 'NA'])
         if float(missing_count) / num_cols < percent:
             I_good.append(i)

     M_c = M.matrix(I_good, None)
     # Open the output only once the result is ready, so a failure
     # above does not leave a truncated file, and let "with" close it.
     with open(outfile, 'w') as f_out:
         arrayio.tab_delimited_format.write(M_c, f_out)
     assert filelib.exists_nz(outfile), (
         'the output file %s for gene_filter fails' % outfile
     )
Example #16
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Convert every file of the input into a .bam file in outfile.

        antecedents is the single input node; its identifier is a
        folder or zip file (unpacked with module_utils.unzip_if_zip).
        outfile is a directory, created here if it does not exist.
        Each input file is run through "samtools view -S -b" and written
        to <outfile>/<name without extension>.bam.

        Raises AssertionError when the input is empty, samtools is
        missing, or filelib.exists_nz rejects an output file, and
        ValueError (carrying the captured stderr) when the text 'error'
        appears in samtools' stderr.
        """
        import os
        import subprocess
        from genomicode import config
        from genomicode import filelib
        from Betsy import module_utils

        in_data = antecedents
        directory = module_utils.unzip_if_zip(in_data.identifier)
        filenames = os.listdir(directory)
        assert filenames, 'The input folder or zip file is empty.'
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        samtools_BIN = config.samtools
        assert os.path.exists(
            samtools_BIN), 'cannot find the %s' % samtools_BIN
        for filename in filenames:
            infile = os.path.join(directory, filename)
            stem = os.path.splitext(filename)[0]
            outname = os.path.join(outfile, stem + '.bam')
            command = [samtools_BIN, 'view', '-S', '-b', '-o', outname, infile]
            process = subprocess.Popen(command,
                                       shell=False,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            # communicate() drains the pipes while waiting; a wait()
            # before it can deadlock on a full pipe buffer.
            error_message = process.communicate()[1]
            if 'error' in error_message:
                # Report the stderr text, not the (stdout, stderr) tuple.
                raise ValueError(error_message)
            assert filelib.exists_nz(outname), (
                'the output file %s for convert_sam_to_bam does not exist' %
                outname)
Example #17
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Build a bwa index for the reference genome in in_data.

        The reference is first standardized into out_path (using
        symlinks), then "bwa index" is run on its FASTA file from
        out_path.  Raises AssertionError naming the first index file
        that filelib.exists_nz rejects.
        """
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        bwa_bin = filelib.which_assert(config.bwa)
        genome = alignlib.standardize_reference_genome(
            in_data.identifier, out_path, use_symlinks=True)

        # Command line:
        #   bwa index <out_stem.fa>
        # Files it makes:
        #   <out_stem>.fa.amb .ann .bwt .pac .sa
        quote = parallel.quote
        index_cmd = [quote(bwa_bin), "index", quote(genome.fasta_file_full)]
        parallel.sshell(index_cmd, path=out_path)

        # Every index file must be there for the index to be usable.
        for suffix in (".amb", ".ann", ".bwt", ".pac", ".sa"):
            index_file = "%s%s" % (genome.fasta_file_full, suffix)
            assert filelib.exists_nz(index_file), "Missing: %s" % index_file
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Draw one bar plot per gene set score into the directory outfile.

        antecedents is the single input node.  Its file is read as
        columns, the first entry of every column is dropped, and the
        result is transposed.  After the transpose the first row (minus
        its first entry) gives the sample names; every later row holds a
        label followed by one value per sample.  For each such row the
        values are sorted in ascending order (ties broken by sample
        name) and plotted to <outfile>/<label>.png.

        Raises AssertionError when filelib.exists_nz rejects outfile.
        """
        from genomicode import filelib
        import os
        from genomicode import jmath

        columns = [col[1:] for col in filelib.read_cols(antecedents.identifier)]
        table = jmath.transpose(columns)
        sample = table[0][1:]
        score_rows = table[1:]
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        for score_row in score_rows:
            ylabel = score_row[0]
            scores = [float(text) for text in score_row[1:]]
            # Sort the (score, sample) pairs so the bars ascend.
            ranked = sorted(
                (score, sample[k]) for k, score in enumerate(scores))
            gene_value = [pair[0] for pair in ranked]
            label = [pair[1] for pair in ranked]
            from genomicode import mplgraph
            fig = mplgraph.barplot(gene_value,
                                   box_label=label,
                                   xtick_rotation=90,
                                   xlabel='sample',
                                   ylabel=ylabel)
            fig.savefig(os.path.join(outfile, ylabel) + '.png')

        assert filelib.exists_nz(outfile), (
            'the output file %s for plot_geneset_score_bar fails' % outfile)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Normalize a data set with the script configured as bfrmnorm.

        antecedents is the single input node.  The number of factors
        defaults to 1 and can be set with user_options['num_factors'];
        it must be at least 1 and no larger than the number of columns
        of the input matrix.  The script writes into the directory
        'tmp_dir' (relative to the current directory); its
        normalized.gct is converted to PCL format and written to
        outfile.

        Raises AssertionError when the script cannot be found, the
        number of factors is out of range, or the expected outputs are
        rejected by filelib.exists_nz, and ValueError when the script
        writes anything to stderr.
        """
        import os
        import subprocess
        import arrayio
        from Betsy import module_utils
        from genomicode import filelib
        from genomicode import config

        in_data = antecedents
        bfrm_path = config.bfrmnorm
        bfrm_BIN = module_utils.which(bfrm_path)
        assert bfrm_BIN, 'cannot find the %s' % bfrm_path

        num_factor = 1
        if 'num_factors' in user_options:
            num_factor = int(user_options['num_factors'])
            assert num_factor >= 1, 'the num_factor should be >=1'
            M = arrayio.read(in_data.identifier)
            col_num = M.ncol()
            assert num_factor <= col_num, (
                'the num_factor should be less than %d' % col_num)

        tmp = 'tmp_dir'
        command = [
            'python', bfrm_BIN, in_data.identifier, '-f',
            str(num_factor), '-o', tmp
        ]
        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        error_message = process.communicate()[1]
        if error_message:
            raise ValueError(error_message)

        normalized = os.path.join(tmp, 'normalized.gct')
        assert filelib.exists_nz(tmp), (
            'the output dir %s for bfrm_normalize fails' % tmp)
        assert filelib.exists_nz(normalized), (
            'the output gct file for bfrm_normalize fails')

        M = arrayio.read(normalized)
        M_new = arrayio.convert(M, to_format=arrayio.pcl_format)
        # "with" closes the handle even if the write fails.
        with open(outfile, 'w') as f:
            arrayio.tab_delimited_format.write(M_new, f)
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Run the commands from make_commands on every .bam file.

        antecedents is a (bam_node, ref_node) pair.  For each input
        <name>.bam a job is created that writes <out_path>/<name>.bam
        and logs to <out_path>/<name>.log.  All jobs are first run with
        5 Gb of RAM; jobs whose output fails filelib.exists_nz are run a
        second time with MAX_RAM Gb.

        Returns a metadata dictionary with the commands that were run
        ("commands") and the number of processes used for the first
        pass ("num_procs").  Raises AssertionError when no .bam files
        are found.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        START_RAM = 5  # amount of ram for the first attempt, in Gb.
        MAX_RAM = 64   # maximum amount of ram to use in Gb.

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Each job is a tuple of (in_filename, log_filename, out_filename).
        jobs = []
        for in_filename in bam_filenames:
            basename = os.path.basename(in_filename)
            stem = os.path.splitext(basename)[0]
            jobs.append((
                in_filename,
                os.path.join(out_path, "%s.log" % stem),
                os.path.join(out_path, basename),
            ))

        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar
        #   -T SplitNCigarReads -R ../hg19.fa -I $i -o $j
        #   -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60
        #   -U ALLOW_N_CIGAR_READS

        # First pass with the small amount of RAM.
        commands = make_commands(jobs, ref.fasta_file_full, START_RAM)
        nc = mlib.calc_max_procs_from_ram(START_RAM, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = commands
        metadata["num_procs"] = nc

        # Second pass, with more RAM, for the jobs that left no output.
        unfinished = [job for job in jobs if not filelib.exists_nz(job[2])]
        if unfinished:
            commands = make_commands(unfinished, ref.fasta_file_full, MAX_RAM)
            nc = mlib.calc_max_procs_from_ram(MAX_RAM, upper_max=num_cores)
            parallel.pshell(commands, max_procs=nc)
            metadata["commands"] += commands

        # Every job must have produced its output by now.
        filelib.assert_exists_nz_many([job[-1] for job in jobs])

        return metadata
Example #21
0
def plot_line_keywds(filename, keywords, outfile):
    """Draw one line plot per keyword and stack them into one image.

    For every keyword, the rows of filename whose annotation under the
    second row header equals the keyword are plotted as lines (with the
    annotation under the first row header as legend) and saved to
    "<keyword>.png" in the current directory.  The saved plots are then
    pasted top to bottom, in keyword order, onto a white background
    that is saved to outfile.

    Raises AssertionError when a keyword matches no row or when
    filelib.exists_nz rejects outfile.
    """
    import arrayio
    from genomicode import mplgraph
    from genomicode import filelib

    M = arrayio.read(filename)
    header = M.row_names()
    label = M._col_names['_SAMPLE_NAME']

    outfiles = []
    for keyword in keywords:
        out = keyword + '.png'
        data = []
        legend_name = []
        for row in range(M.dim()[0]):
            if M.row_names(header[1])[row] != keyword:
                continue
            data.append(M.slice()[row])
            legend_name.append(M.row_names(header[0])[row])
        assert len(data) > 0, 'cannot find the keywords %s in the file %s' % (
            keywords, filename)

        # One list of (x, y) points per matching row.
        lines = [
            [(col, values[col]) for col in range(len(values))]
            for values in data
        ]
        fig = mplgraph.lineplot(
            *lines,
            box_label=label,
            legend=legend_name,
            ylim_min=0,
            ylabel=keyword,
            left=0.1)
        fig.savefig(out)
        outfiles.append(out)

    import Image
    imgs = [Image.open(path, 'r') for path in outfiles]
    widths = [img.size[0] for img in imgs]
    heights = [img.size[1] for img in imgs]

    # The canvas is a little wider and taller than the stacked plots.
    total_w = max(widths) + 30
    total_h = sum(heights) + 10
    background = Image.new('RGBA', (total_w, total_h), (255, 255, 255, 255))
    bg_w, bg_h = background.size
    offset_w = (bg_w - max(widths)) / 2
    for k, img in enumerate(imgs):
        # Distance from the top so that plot k and all later plots end
        # exactly at the bottom edge of the canvas.
        offset_h = bg_h - sum(heights[k:])
        background.paste(img, (offset_w, offset_h))
    background.save(outfile)
    assert filelib.exists_nz(outfile), 'the plot_line_keywds fails'
 def run(self, network, antecedents, out_attributes, user_options,
         num_cores, outfile):
     """Copy the input file to outfile unchanged.

     antecedents is the single input node; its identifier is the file
     that gets copied.  Raises AssertionError when filelib.exists_nz
     rejects outfile after the copy.
     """
     import shutil
     from genomicode import filelib

     source_path = antecedents.identifier
     shutil.copyfile(source_path, outfile)
     assert filelib.exists_nz(outfile), (
         'the output file %s for convert_postprocess_impute fails' %
         outfile)
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Keep the rows whose probe pair is a 'Best for Both' match.

     The mapping file configured as HumanHT_12_to_HG_u133_Plus_2 lists
     Affymetrix/Illumina probe pairs.  Pairs with Distance <= 1000 and
     Match == 'Best for Both' are collected.  A row of the input matrix
     is kept when its (HG_U133_Plus_2 id, HumanHT_12 id) pair is one of
     them; the kept rows are written to outfile in tab-delimited format.

     Returns None in every case.  Nothing is written when the matrix
     lacks one of the two platforms or when no row matches.  Raises
     AssertionError when the mapping file is missing or when
     filelib.exists_nz rejects outfile.
     """
     from genomicode import filelib
     import os
     import arrayio
     from genomicode import config
     from genomicode import arrayplatformlib

     in_data = antecedents
     mapfile = config.HumanHT_12_to_HG_u133_Plus_2
     assert os.path.exists(mapfile), 'mapping file %s does not exist' % mapfile

     # A set makes the per-row membership test below constant time.
     best_pairs = set()
     for d in filelib.read_row(mapfile, header=True):
         if int(d.Distance) <= 1000 and d.Match == 'Best for Both':
             best_pairs.add((d.Affymetrix_Probe_Set_ID, d.Illumina_Probe_ID))

     M = arrayio.read(in_data.identifier)
     platform_list = arrayplatformlib.score_all_platforms_of_matrix(M)
     illu_id = None
     probe_id = None
     for platform in platform_list:
         if 'HumanHT_12' in platform:
             illu_id = M._row_names[platform[0]]
         if 'HG_U133_Plus_2' in platform:
             probe_id = M._row_names[platform[0]]

     if not illu_id or not probe_id:
         return None

     index = [
         i for i in range(M.nrow())
         if (probe_id[i], illu_id[i]) in best_pairs
     ]
     if not index:
         return None

     M_new = M.matrix(index, None)
     # "with" closes the handle even if the write fails.
     with open(outfile, 'w') as f:
         arrayio.tab_delimited_format.write(M_new, f)
     assert filelib.exists_nz(outfile), (
         'the output file %s for best_match_both fails' % outfile
     )
     return None
Example #24
0
def _make_config_file(config_filename, skip_depth_filter=False):
    """Write a strelka configuration file to config_filename.

    The file is a copy of etc/strelka_config_bwa_default.ini from the
    configured strelka installation, with every line stripped of
    surrounding whitespace and the isSkipDepthFilters option set to 1
    when skip_depth_filter is true and to 0 otherwise.

    Raises AssertionError when the default configuration is rejected by
    filelib.exists_nz, is empty, or contains an isSkipDepthFilters line
    that is not of the form "isSkipDepthFilters = 0|1".
    """
    import os
    from genomicode import filelib
    from Betsy import module_utils as mlib

    strelka_path = mlib.get_config("strelka", assert_exists=True)

    src_config = os.path.join(strelka_path, "etc",
                              "strelka_config_bwa_default.ini")
    # The result of this check used to be thrown away.
    assert filelib.exists_nz(src_config), "Missing: %s" % src_config
    with open(src_config) as handle:
        lines = [x.strip() for x in handle]
    assert lines

    # isSkipDepthFilters should be set to 1 to skip depth filtration
    # for whole exome or other targeted sequencing data.
    #
    # isSkipDepthFilters = 0
    wanted = "1" if skip_depth_filter else "0"
    for i, line in enumerate(lines):
        if not line.startswith("isSkipDepthFilters"):
            continue
        x = line.split()
        assert len(x) == 3
        assert x[1] == "="
        assert x[2] in ["0", "1"]
        x[2] = wanted
        lines[i] = " ".join(x)

    # Put back the newline that was stripped.
    with open(config_filename, 'w') as handle:
        handle.writelines([x + "\n" for x in lines])
Example #25
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        """Copy the input directory tree to outfile unchanged.

        antecedents is the single input node; its identifier is the
        directory that gets copied.  Raises AssertionError when
        filelib.exists_nz rejects outfile after the copy.
        """
        import shutil
        from genomicode import filelib

        source_dir = antecedents.identifier
        shutil.copytree(source_dir, outfile)
        assert filelib.exists_nz(outfile), (
            'the output file %s for detect_CEL_version' % outfile
        )
Example #26
0
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Normalize Agilent files through R and write a tab-delimited table.

     antecedents is the single input node; its identifier is a
     directory whose files are read with read.Agilent.  The data is
     normalized with maNorm (loess) and maNormScale, dumped by R to the
     temporary file 'tmp.txt' in the current directory, and reduced to
     the "ProbeName" column plus every column after "Sequence" (or after
     "ControlType" when there is no "Sequence" column).  Each output
     line ends with a tab before the newline.

     Raises AssertionError when R writes nothing or when
     filelib.exists_nz rejects outfile, and ValueError when the
     expected column headers are missing.
     """
     import os
     from genomicode import jmath
     from genomicode import filelib

     in_data = antecedents
     cwd = os.getcwd()
     R = jmath.start_R()
     R('require(limma,quietly=TRUE)')
     R('library(marray)')
     # read.Agilent is given bare file names, so run it from inside the
     # input directory and always come back afterwards.
     os.chdir(in_data.identifier)
     try:
         R('dir<-getwd()')
         R('files<-list.files(dir)')
         R('x.read<-read.Agilent(files)')
     finally:
         os.chdir(cwd)

     R('xnorm.loc <- maNorm(x.read, norm = "loess")')
     R('x.norm <- maNormScale(xnorm.loc, norm = "p")')
     tmpfile = 'tmp.txt'
     jmath.R_equals(tmpfile, 'tmpfile')
     try:
         R('write.marray(x.norm,tmpfile)')
         with open(tmpfile, 'r') as f:
             text = f.readlines()
     finally:
         # Do not leave the temporary file behind, even on failure.
         if os.path.exists(tmpfile):
             os.remove(tmpfile)
     assert text, 'write.marray produced no output'

     firstline = text[0].split()
     firstindex = firstline.index('"ProbeName"')
     if '"Sequence"' in firstline:
         secondindex = firstline.index('"Sequence"')
     else:
         secondindex = firstline.index('"ControlType"')

     # The sample columns are everything after the last annotation column.
     sample = range(secondindex + 1, len(firstline))
     with open(outfile, 'w') as f:
         for i in text:
             line = i.split()
             cells = [line[firstindex]] + [line[j] for j in sample]
             # Keep the trailing tab the original format had.
             f.write('\t'.join(cells) + '\t\n')

     assert filelib.exists_nz(outfile), (
         'the output file %s for preprocess_agilent fails' % outfile
     )
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Copy the '-controls' file of the input directory to outfile.

        antecedents is the single input node; its identifier is a
        directory.  Every entry whose name contains '-controls' is
        copied to outfile, so with several matches the last one listed
        wins.  Raises AssertionError when filelib.exists_nz rejects
        outfile afterwards.
        """
        import os
        import shutil
        from genomicode import filelib

        source_dir = antecedents.identifier
        for name in os.listdir(source_dir):
            if '-controls' not in name:
                continue
            shutil.copyfile(os.path.join(source_dir, name), outfile)

        assert filelib.exists_nz(outfile), (
            'the output file %s for illu_control fails' % outfile)
def add_snpeff_to_svm(svm_file, snpeff_file, outfile):
    """Insert SnpEff annotation columns into a SimpleVariantMatrix file.

    svm_file     Path of the SimpleVariantMatrix file to annotate.
    snpeff_file  Path of the SnpEff annotation table.  Its first four
                 columns are skipped when copying annotations (they are
                 taken to be the Chrom, Pos, Ref, Alt coordinates).
    outfile      Path the merged matrix is written to.

    The annotation columns are renamed "SnpEff______<name>" and placed
    right after the first four columns of the matrix.  Variants without a
    matching annotation row get empty strings.

    If snpeff_file fails the filelib.exists_nz check, or holds no data
    rows, there is nothing to merge and svm_file is copied to outfile
    unchanged.
    """
    import shutil
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix

    if not filelib.exists_nz(snpeff_file):
        shutil.copy2(svm_file, outfile)
        return

    # Read the annotations, keyed by variant coordinate.
    header = None  # includes Chrom, Pos, Ref, Alt
    coord2d = {}
    for d in filelib.read_row(snpeff_file, header=1):
        if header is None:
            header = d._header
        coord2d[(d.Chrom, d.Pos, d.Ref, d.Alt)] = d

    if header is None:
        # The file produced no rows, so the header was never seen.  Slicing
        # None below would raise a TypeError; treat this like a missing
        # annotation file instead.
        shutil.copy2(svm_file, outfile)
        return

    svm = SimpleVariantMatrix.read_as_am(svm_file)
    CHROM = svm.header2annots["______Chrom"]
    POS = svm.header2annots["______Pos"]
    REF = svm.header2annots["______Ref"]
    ALT = svm.header2annots["______Alt"]

    snpeff_header = header[4:]
    blank_row = [""] * len(snpeff_header)
    snpeff_matrix = []  # Row major.
    for coord in zip(CHROM, POS, REF, ALT):
        d = coord2d.get(coord)
        # Compare against None explicitly: the truth value of a row object
        # is not something this function should depend on.
        row = blank_row if d is None else d._cols[4:]
        assert len(row) == len(snpeff_header)
        snpeff_matrix.append(row)
    assert len(snpeff_matrix) == len(CHROM)

    # AnnotationMatrix is column major.  Iterate over the header so that
    # every column exists even when there are no variants.
    snpeff_annots = [
        [row[j] for row in snpeff_matrix]
        for j in range(len(snpeff_header))]

    # Convert the headers to SVM format.
    snpeff_header = ["SnpEff______%s" % name for name in snpeff_header]

    # Make the new SimpleVariantMatrix.
    headers = svm.headers[:4] + snpeff_header + svm.headers[4:]
    svm_annots = [svm.header2annots[h] for h in svm.headers_h]
    all_annots = svm_annots[:4] + snpeff_annots + svm_annots[4:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=svm.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
Example #29
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        """Save a per-sample bar plot of prediction results to outfile.

        The table at antecedents.identifier is read with filelib.read_cols.
        Its first row is the header, which must contain a 'Confidence'
        column; the first column of each following row is used as the
        sample label.

        If every 'Confidence' value is blank, or the header has a
        'Correct?' column, the 'Predicted_class' column is plotted instead:
        each distinct class is numbered in order of first appearance and
        the class names are used as y tick labels.  Otherwise the
        confidence values are converted to float and plotted directly.

        Raises ValueError when a required column is missing or a plotted
        confidence value is not numeric, and AssertionError when outfile
        does not pass the filelib.exists_nz check afterwards.
        """
        from genomicode import mplgraph
        from genomicode import filelib

        matrix = [x for x in filelib.read_cols(antecedents.identifier)]
        header = matrix[0]
        rows = matrix[1:]
        index = header.index('Confidence')
        # Keep the raw strings here.  Converting to float first would fail
        # on blank values before the "all blank" test could ever match.
        raw_confidence = [row[index] for row in rows]
        sample = [row[0] for row in rows]

        if raw_confidence == [''] * len(rows) or 'Correct?' in header:
            index = header.index('Predicted_class')
            label_dict = {}   # class name -> bar height / tick position
            yticks = []       # class names, in order of first appearance
            label_list = []
            for row in rows:
                label = row[index]
                if label not in label_dict:
                    label_dict[label] = len(yticks)
                    yticks.append(label)
                label_list.append(label_dict[label])
            # Real lists, so ticks and positions are guaranteed to line up.
            ytick_pos = list(range(len(yticks)))
            fig = mplgraph.barplot(label_list,
                                   box_label=sample,
                                   ylim=(-0.5, 1.5),
                                   ytick_pos=ytick_pos,
                                   yticks=yticks,
                                   xtick_rotation='vertical',
                                   ylabel='Prediction',
                                   xlabel='Sample')
        else:
            confidence = [float(value) for value in raw_confidence]
            fig = mplgraph.barplot(confidence,
                                   box_label=sample,
                                   ylim=(-1.5, 1.5),
                                   xtick_rotation='vertical',
                                   ylabel='Prediction',
                                   xlabel='Sample')
        fig.savefig(outfile)

        assert filelib.exists_nz(outfile), (
            'the output file %s for plot_prediction_bar fails' % outfile
        )
Example #30
0
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Normalize the input matrix with quantnorm and write it to outfile.

     The matrix at antecedents.identifier is read with arrayio, passed
     through quantnorm.normalize, converted to arrayio.pcl_format and
     written with arrayio.tab_delimited_format.

     Raises AssertionError when outfile does not pass the
     filelib.exists_nz check afterwards.  The other parameters are not
     used here.
     """
     from genomicode import quantnorm
     import arrayio
     from genomicode import filelib

     M = arrayio.read(antecedents.identifier)
     Y = quantnorm.normalize(M)
     Y_c = arrayio.convert(Y, to_format=arrayio.pcl_format)
     # Open the file only once the data is ready, and let the with-block
     # close it even if the write raises.  open() is used because the
     # file() builtin exists only in Python 2.
     with open(outfile, 'w') as f:
         arrayio.tab_delimited_format.write(Y_c, f)
     assert filelib.exists_nz(outfile), (
         'the output file %s for quantile fails' % outfile
     )