def find_cluster_files(cluster_path):
    """Locate the clustering result files stored under cluster_path.

    Returns a dictionary that maps a short key to a full filename:
      "cdt" : <cluster_path>/signal.cdt
      "atr" : <cluster_path>/array_tree.atr
      "gtr" : <cluster_path>/gene_tree.gtr
      "kag" : <cluster_path>/array_cluster.kag
      "kgg" : <cluster_path>/gene_cluster.kgg
    A key is only present when filelib.exists_nz accepts the file.  Any
    of the files can be missing except the "cdt" one, whose absence
    triggers an AssertionError.
    """
    import os
    from genomicode import filelib

    filelib.assert_exists(cluster_path)

    # (key, basename) pairs, checked in this order.
    expected = [
        ("cdt", "signal.cdt"),
        ("atr", "array_tree.atr"),
        ("gtr", "gene_tree.gtr"),
        ("kag", "array_cluster.kag"),
        ("kgg", "gene_cluster.kgg"),
    ]

    cluster_files = {}
    for key, basename in expected:
        full_name = os.path.join(cluster_path, basename)
        if filelib.exists_nz(full_name):
            cluster_files[key] = full_name

    assert "cdt" in cluster_files, "No clustered file."
    return cluster_files
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Annotate a VCF file with the script configured as annotate_vcf.

    Runs "python <annotate_vcf> <input> -o <outfile> -species <ref>",
    where <input> is antecedents.identifier and <ref> is
    out_attributes['ref'].

    Raises ValueError (carrying the captured stderr) when stderr
    contains the text 'error', and AssertionError when
    filelib.exists_nz rejects outfile afterwards.
    """
    import subprocess
    from genomicode import config
    from genomicode import filelib
    from Betsy import module_utils

    in_data = antecedents
    species = out_attributes['ref']
    annotate_BIN = config.annotate_vcf
    command = ['python', annotate_BIN, in_data.identifier, '-o', outfile,
               '-species', species]
    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # Do not call process.wait() first: with both streams piped, the
    # child can block on a full pipe and wait() would never return.
    # communicate() drains the pipes and waits for the exit itself.
    error_message = process.communicate()[1]
    if 'error' in error_message:
        raise ValueError(error_message)
    assert filelib.exists_nz(outfile), (
        'the output file %s for annot_vcf_file fails' % outfile
    )
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Build a bowtie2 index for the reference genome in in_data.

    The reference is first standardized into out_path (using
    symlinks), then bowtie2-build is run from out_path:

        bowtie2-build <ref.fa> <output_stem>

    which makes the files:
        <output_stem>.[1234].bt2
        <output_stem>.rev.[12].bt2

    Raises AssertionError when <output_stem>.1.bt2 is not accepted by
    filelib.exists_nz afterwards.
    """
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    bowtie2_build = filelib.which_assert(config.bowtie2_build)
    ref = alignlib.standardize_reference_genome(
        in_data.identifier, out_path, use_symlinks=True)

    quote = parallel.quote
    cmd = [quote(bowtie2_build), quote(ref.fasta_file_full), ref.name]
    parallel.sshell(cmd, path=out_path)

    # The first index file serves as the success marker.
    first_index_file = os.path.join(out_path, "%s.1.bt2" % ref.name)
    assert filelib.exists_nz(first_index_file)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """extract the cel files with cc or v3_4

    Every file in the (possibly unzipped) input folder whose version,
    as reported by affyio.guess_cel_version, is 'cc1', 'v3' or 'v4' is
    copied into the directory outfile, which is created if needed.
    '.DS_Store' entries are skipped.

    Raises:
      AssertionError: the input folder is empty, or filelib.exists_nz
        rejects outfile after copying.
      ValueError: none of the input files is a recognised CEL file.
    """
    import os
    import shutil
    from Betsy import module_utils
    from genomicode import affyio
    from genomicode import filelib

    in_data = antecedents
    directory = module_utils.unzip_if_zip(in_data.identifier)
    filenames = os.listdir(directory)
    assert filenames, 'The input folder or zip file is empty.'
    if not os.path.exists(outfile):
        os.mkdir(outfile)

    found_cel = False
    for filename in filenames:
        if filename == '.DS_Store':
            continue
        fileloc = os.path.join(directory, filename)
        cel_v = affyio.guess_cel_version(fileloc)
        if cel_v in ('cc1', 'v3', 'v4'):
            shutil.copyfile(fileloc, os.path.join(outfile, filename))
            found_cel = True

    if not found_cel:
        # This used to be "assert ValueError(...)", which is always
        # true and therefore never reported the problem.
        raise ValueError('There is no cel file in the input.')
    assert filelib.exists_nz(outfile), (
        'the output file %s for extract_CEL_files fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Shift-scale normalize the samples of the first class.

    antecedents is a (data_node, cls_node) pair.  The label file must
    describe exactly two classes.  The columns of the first class are
    normalized with shiftscalenorm.normalize against the columns of
    the second class and written back into the full matrix, which is
    saved tab-delimited to outfile.  A normalized value whose string
    form is 'nan' is replaced by the value in the first column of the
    second-class sub-matrix for the same row.

    Nothing is written when either node is false.  Always returns
    False.
    """
    from genomicode import shiftscalenorm
    import arrayio
    from Betsy import read_label_file
    from genomicode import filelib

    data_node, cls_node = antecedents
    if data_node and cls_node:
        result, label_line, second_line = read_label_file.read(
            cls_node.identifier)
        assert len(
            result) == 2, 'for shiftscale,there should be only 2 classes'
        M = arrayio.read(data_node.identifier)
        index1 = result[0][0]
        index2 = result[1][0]
        M_1 = M.matrix(None, index1)
        M_2 = M.matrix(None, index2)
        M_y = shiftscalenorm.normalize(M_1, M_2)

        nrow_y, ncol_y = M_y.dim()[0], M_y.dim()[1]
        for i in range(nrow_y):
            for j in range(ncol_y):
                if str(M_y._X[i][j]) == 'nan':
                    M_y._X[i][j] = M_2._X[i][0]

        # Copy the normalized columns back to their original positions.
        for j in range(M.nrow()):
            for i, col in enumerate(index1):
                M._X[j][col] = M_y._X[j][i]

        # "with" closes the handle even when the write raises.
        with open(outfile, 'w') as f:
            arrayio.tab_delimited_format.write(M, f)
        assert filelib.exists_nz(outfile), (
            'the output file %s for shiftscale fails' % outfile)

    return False
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Run the configured analyze_phenotype script for the EMT phenotype.

    antecedents is a (data_node, cel_node) pair; both identifiers are
    passed to the script as positional arguments.  The gene to analyze
    comes from user_options['geneset_value'].  Results are written with
    the prefix <outfile>/EMT; the directory outfile is created if it
    does not exist.

    Raises AssertionError when the script is missing, when no
    geneset_value was given, when the script writes anything to
    stderr, or when filelib.exists_nz rejects outfile afterwards.
    """
    import os
    import subprocess
    from genomicode import config
    from genomicode import filelib

    data_node, cel_node = antecedents

    phenotype_BIN = config.analyze_phenotype
    assert os.path.exists(phenotype_BIN)
    assert "geneset_value" in user_options, 'no geneset are provided'

    if not os.path.exists(outfile):
        os.mkdir(outfile)

    options = [
        '--phenotype', 'EMT',
        '--ignore_samples', 'shCDH1,1',
        '--gene', user_options['geneset_value'],
        '-o', outfile + '/EMT',
    ]
    inputs = [data_node.identifier, cel_node.identifier]
    command = ['python', phenotype_BIN] + options + inputs

    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    captured = process.communicate()
    stderr_text = captured[1]
    # Any stderr output at all is treated as a failure.
    assert not stderr_text, stderr_text
    assert filelib.exists_nz(outfile), (
        'the output file %s for analyze_phenotype fails' % outfile)
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Normalize a data set with the configured combat script.

    antecedents is a (data_node, cls_node) pair.  When both are true,
    the label file must describe at least two classes and the script
    is run as:

        python <combat> -f <data> -o <outfile> -label <cls>

    Raises ValueError with the captured stderr when the script writes
    anything to stderr, AssertionError when the script cannot be found
    or filelib.exists_nz rejects outfile.  Always returns False.
    """
    import subprocess
    from Betsy import read_label_file
    from Betsy import module_utils
    from genomicode import filelib
    from genomicode import config

    data_node, cls_node = antecedents
    if not (data_node and cls_node):
        # Nothing to normalize.
        return False

    result, label_line, second_line = read_label_file.read(
        cls_node.identifier)
    assert len(
        result) >= 2, 'for combat,there should be equal or larger than 2 classes'

    combat_path = config.combatnorm
    combat_BIN = module_utils.which(combat_path)
    assert combat_BIN, 'cannot find the %s' % combat_path

    command = ['python', combat_BIN]
    command += ['-f', data_node.identifier]
    command += ['-o', outfile]
    command += ['-label', cls_node.identifier]

    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout_text, stderr_text = process.communicate()
    if stderr_text:
        raise ValueError(stderr_text)
    assert filelib.exists_nz(outfile), (
        'the output file %s for combat fails' % outfile
    )
    return False
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Filter variants with "bcftools view <input> | vcfutils varFilter -D500".

    The filtered output is written to outfile.

    The two programs are started as separate processes and connected
    with a pipe.  The previous version put a literal '|' into a single
    argument list run with shell=False, which hands '|' and everything
    after it to bcftools as ordinary arguments instead of building a
    pipeline.

    Raises ValueError (carrying the captured stderr) when the stderr of
    either program contains the text 'error', and AssertionError when
    bcftools cannot be found or filelib.exists_nz rejects outfile.
    """
    import subprocess
    import tempfile
    from genomicode import filelib
    from Betsy import module_utils
    from genomicode import config

    in_data = antecedents
    bcftools_BIN = config.bcftools
    bcftools_module = module_utils.which(bcftools_BIN)
    assert bcftools_module, 'cannot find the %s' % bcftools_BIN
    vcfutils_BIN = config.vcfutils

    view_command = [bcftools_BIN, 'view', in_data.identifier]
    filter_command = [vcfutils_BIN, 'varFilter', '-D500']

    # The stderr of the first stage goes to a temporary file rather
    # than a pipe, so it cannot fill up and stall the pipeline while
    # the second stage is being drained.
    view_errors = tempfile.TemporaryFile()
    try:
        with open(outfile, 'w') as out_handle:
            view = subprocess.Popen(view_command, shell=False,
                                    stdout=subprocess.PIPE,
                                    stderr=view_errors)
            varfilter = subprocess.Popen(filter_command, shell=False,
                                         stdin=view.stdout,
                                         stdout=out_handle,
                                         stderr=subprocess.PIPE)
            # Drop our copy of the pipe so the first stage receives
            # SIGPIPE if the second stage exits early.
            view.stdout.close()
            filter_message = varfilter.communicate()[1]
            view.wait()
        view_errors.seek(0)
        view_message = view_errors.read()
    finally:
        view_errors.close()

    for error_message in (view_message, filter_message):
        if 'error' in error_message:
            raise ValueError(error_message)
    assert filelib.exists_nz(outfile), (
        'the output file %s for filter_vcf_file does not exist' % outfile)
def plot_line_keywd(filename, keyword, outfile):
    """Draw a line plot of every row whose second annotation is keyword.

    filename is read with arrayio.  Rows are selected by comparing the
    second row-name column against keyword; each selected row becomes
    one line, labelled "<keyword> (<first row-name column>)" in the
    legend.  The figure is saved to outfile.

    Raises AssertionError when no row matches or when
    filelib.exists_nz rejects outfile after saving.
    """
    import arrayio
    from genomicode import mplgraph
    from genomicode import filelib

    M = arrayio.read(filename)
    header = M.row_names()
    label = M._col_names['_SAMPLE_NAME']

    data = []
    legend_name = []
    num_rows = M.dim()[0]
    for row in range(num_rows):
        if M.row_names(header[1])[row] != keyword:
            continue
        data.append(M.slice()[row])
        legend_name.append(
            "%s (%s)" % (keyword, M.row_names(header[0])[row]))
    assert len(data) > 0, 'cannot find the keyword %s in the file %s' % (
        keyword, filename)

    # Each line is a list of (x, y) points with x the column index.
    lines = [[(j, value) for j, value in enumerate(values)]
             for values in data]

    params = dict(
        box_label=label,
        legend=legend_name,
        ylim_min=0,
        ylabel="Signal",
        left=0.1,
    )
    fig = mplgraph.lineplot(*lines, **params)
    fig.savefig(outfile)
    assert filelib.exists_nz(outfile), 'the plot_line_keywd fails'
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """analyze geneset

    antecedents is a (data_node, geneset_node) pair.  Runs the
    configured score_geneset script over all gene sets ('--all') and
    adds '--automatch' when out_attributes['automatch'] is 'yes'.

    Raises ValueError with the captured stderr when the script writes
    anything to stderr, AssertionError when the script cannot be found
    or filelib.exists_nz rejects outfile.
    """
    import subprocess
    from Betsy import module_utils
    from genomicode import config
    from genomicode import filelib

    data_node, geneset_node = antecedents

    score_geneset_path = config.score_geneset
    score_geneset_BIN = module_utils.which(score_geneset_path)
    assert score_geneset_BIN, 'cannot find the %s' % score_geneset_path

    command = ['python', score_geneset_BIN]
    command.extend(['-o', outfile])
    command.extend(['--geneset_file', geneset_node.identifier])
    command.append(data_node.identifier)
    command.append('--all')
    if out_attributes['automatch'] == 'yes':
        command.append('--automatch')

    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout_text, stderr_text = process.communicate()
    if stderr_text:
        raise ValueError(stderr_text)
    assert filelib.exists_nz(outfile), (
        'the output file %s for score_pathway_with_geneset fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Download a TCGA data set with the configured download_tcga script.

    The disease comes from user_options['disease'] (required), the
    data type from out_attributes['preprocess'], and an optional
    '--date' from user_options['date'].  The script is run with
    '--download_only'; afterwards every file in the current directory
    whose name ends with 'tar.gz' is renamed to outfile.

    Raises ValueError with the captured stderr when the script writes
    anything to stderr, AssertionError when 'disease' is missing or
    filelib.exists_nz rejects outfile.
    """
    import os
    import subprocess
    from genomicode import config
    from genomicode import filelib

    TCGA_BIN = config.download_tcga
    assert 'disease' in user_options

    date_args = []
    if 'date' in user_options:
        date_args = ['--date', user_options['date']]

    command = [
        'python', TCGA_BIN,
        '--disease', user_options['disease'],
        '--data', out_attributes['preprocess'],
        '--download_only',
    ]
    command = command + date_args

    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout_text, stderr_text = process.communicate()
    if stderr_text:
        raise ValueError(stderr_text)

    # Pick up the downloaded archive(s) from the working directory.
    result_format = 'tar.gz'
    for result_file in os.listdir("."):
        if not result_file.endswith(result_format):
            continue
        os.rename(result_file, outfile)

    assert filelib.exists_nz(outfile), (
        'the output file %s for download_tcga fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """extract the fastq rna seq files

    Every file in the (possibly unzipped) input folder whose name ends
    with 'fa' or 'fastq', optionally followed by '.gz', is placed
    uncompressed into the directory outfile, which is created if
    needed.  '.DS_Store' and files with other suffixes are skipped.

    Fixes over the previous version:
      * Paths are built from the directory that was listed (the result
        of unzip_if_zip) rather than from in_data.identifier.
      * A file that matches neither suffix is skipped.  It used to
        reuse new_file/newfname from an earlier iteration, or fail
        with NameError when it came first.
      * Gzipped inputs are decompressed, copied, and only then is the
        decompressed temporary removed.

    Raises AssertionError when the input folder is empty or
    filelib.exists_nz rejects outfile afterwards.
    """
    import os
    import shutil
    from genomicode import filelib
    from Betsy import module_utils

    in_data = antecedents
    directory = module_utils.unzip_if_zip(in_data.identifier)
    filenames = os.listdir(directory)
    assert filenames, 'The input folder or zip file is empty.'
    if not os.path.exists(outfile):
        os.mkdir(outfile)

    format_types = ('fa', 'fastq')
    for filename in filenames:
        if filename == '.DS_Store':
            continue
        is_gzipped = filename.endswith('.gz')
        # Name of the file once any trailing ".gz" is dropped.
        newfname = os.path.splitext(filename)[0] if is_gzipped else filename
        if not newfname.endswith(format_types):
            continue

        fileloc = os.path.join(directory, filename)
        destination = os.path.join(outfile, newfname)
        if is_gzipped:
            # NOTE(review): assumes module_utils.gunzip returns the path
            # of a newly written decompressed file, as the original
            # code's os.remove(new_file) implies -- confirm.
            new_file = module_utils.gunzip(fileloc)
            shutil.copyfile(new_file, destination)
            os.remove(new_file)
        else:
            shutil.copyfile(fileloc, destination)

    assert filelib.exists_nz(outfile), (
        'the output file %s for extract_rna_files_fastq fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Sort a SAM file by coordinate with the configured sortsam jar.

    Runs "java -Xmx5g -jar <sortsam> I=<input> O=<outfile>
    SO=coordinate VALIDATION_STRINGENCY=LENIENT CREATE_INDEX=true".

    Raises ValueError (carrying the captured stderr) when stderr
    contains the text 'error', and AssertionError when the jar does
    not exist or filelib.exists_nz rejects outfile afterwards.
    """
    import os
    import subprocess
    from Betsy import module_utils
    from genomicode import config
    from genomicode import filelib

    in_data = antecedents
    sortsam_BIN = config.sortsam
    assert os.path.exists(sortsam_BIN), 'cannot find the %s' % sortsam_BIN
    command = [
        'java', '-Xmx5g', '-jar', sortsam_BIN,
        'I=' + in_data.identifier,
        'O=' + outfile,
        'SO=coordinate',
        'VALIDATION_STRINGENCY=LENIENT',
        'CREATE_INDEX=true',
    ]
    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # Do not call process.wait() first: with both streams piped, the
    # child can block on a full pipe and wait() would never return.
    # communicate() drains the pipes and waits for the exit itself.
    error_message = process.communicate()[1]
    if 'error' in error_message:
        raise ValueError(error_message)
    assert filelib.exists_nz(outfile), (
        'the output file %s for sort_sam_file does not exist' % outfile)
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Run the configured scoresig script on the input data.

    The input identifier is passed for both the '-r' and the '-m'
    option; '-j 20' and '-o <outfile>' complete the command line.

    Raises ValueError with the captured stderr when the script writes
    anything to stderr, AssertionError when the script cannot be found
    or filelib.exists_nz rejects outfile.
    """
    import subprocess
    from Betsy import module_utils
    from genomicode import config
    from genomicode import filelib

    in_data = antecedents

    scoresig_path = config.scoresig
    scoresig_BIN = module_utils.which(scoresig_path)
    assert scoresig_BIN, 'cannot find the %s' % scoresig_path

    command = ['python', scoresig_BIN]
    command += ['-r', in_data.identifier]
    command += ['-m', in_data.identifier]
    command += ['-j', '20']
    command += ['-o', outfile]

    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout_text, stderr_text = process.communicate()
    if stderr_text:
        raise ValueError(stderr_text)
    assert filelib.exists_nz(outfile), (
        'the output file %s for run_scoresig does not exists' % outfile
    )
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Drop the genes that have too many missing values.

    user_options['filter_value'] is a percentage.  A row is kept when
    the fraction of its values that are None or 'NA' is strictly below
    that percentage.  The kept rows are written tab-delimited to
    outfile.

    The output file is now opened only after the input has been read
    and filtered, and is closed by a "with" block, so a failure while
    reading no longer leaves behind an empty output file or an open
    handle.

    Raises AssertionError when filelib.exists_nz rejects outfile
    afterwards.
    """
    from genomicode import filelib
    import arrayio

    in_data = antecedents
    M = arrayio.read(in_data.identifier)

    # Fraction of missing values a gene must stay below.
    percent = float(user_options['filter_value']) / 100

    dims = M.dim()
    nrow, ncol = dims[0], dims[1]
    I_good = []
    for i in range(nrow):
        missing_count = 0
        for j in range(ncol):
            if M._X[i][j] in [None, 'NA']:
                missing_count += 1
        if float(missing_count) / ncol < percent:
            I_good.append(i)

    M_c = M.matrix(I_good, None)
    with open(outfile, 'w') as f_out:
        arrayio.tab_delimited_format.write(M_c, f_out)
    assert filelib.exists_nz(outfile), (
        'the output file %s for gene_filter fails' % outfile
    )
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Convert every SAM file of the input folder to BAM with samtools.

    For each file in the (possibly unzipped) input folder, runs
    "samtools view -S -b -o <outfile>/<stem>.bam <file>", where <stem>
    is the file name without its extension.  The directory outfile is
    created if needed.

    Raises:
      AssertionError: the input folder is empty, samtools does not
        exist, or filelib.exists_nz rejects one of the .bam files.
      ValueError: the stderr of samtools contains the text 'error'.
        The exception carries that stderr text (it used to carry the
        whole (stdout, stderr) tuple).
    """
    import os
    import subprocess
    from genomicode import config
    from genomicode import filelib
    from Betsy import module_utils

    in_data = antecedents
    directory = module_utils.unzip_if_zip(in_data.identifier)
    filenames = os.listdir(directory)
    assert filenames, 'The input folder or zip file is empty.'
    if not os.path.exists(outfile):
        os.mkdir(outfile)
    samtools_BIN = config.samtools
    assert os.path.exists(
        samtools_BIN), 'cannot find the %s' % samtools_BIN

    for filename in filenames:
        infile = os.path.join(directory, filename)
        outname = os.path.splitext(filename)[0] + '.bam'
        outname = os.path.join(outfile, outname)
        command = [samtools_BIN, 'view', '-S', '-b', '-o', outname, infile]
        process = subprocess.Popen(command, shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # Do not call process.wait() first: with both streams piped,
        # the child can block on a full pipe and wait() would never
        # return.  communicate() drains the pipes and waits itself.
        error_message = process.communicate()[1]
        if 'error' in error_message:
            raise ValueError(error_message)
        assert filelib.exists_nz(outname), (
            'the output file %s for convert_sam_to_bam does not exist' %
            outname)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Build a bwa index for the reference genome in in_data.

    The reference is first standardized into out_path (using
    symlinks), then bwa is run from out_path:

        bwa index <out_stem.fa>

    which makes the files:
        <out_stem>.fa.amb .ann .bwt .pac .sa

    Raises AssertionError naming the first of those files that
    filelib.exists_nz rejects afterwards.
    """
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    bwa = filelib.which_assert(config.bwa)
    ref = alignlib.standardize_reference_genome(
        in_data.identifier, out_path, use_symlinks=True)

    quote = parallel.quote
    cmd = [quote(bwa), "index", quote(ref.fasta_file_full)]
    parallel.sshell(cmd, path=out_path)

    # Every index file must be present for the indexing to count as
    # successful.
    for extension in (".amb", ".ann", ".bwt", ".pac", ".sa"):
        index_file = "%s%s" % (ref.fasta_file_full, extension)
        assert filelib.exists_nz(index_file), "Missing: %s" % index_file
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Draw one bar plot per score series of a geneset score file.

    The input is read with filelib.read_cols; the first field of every
    record is dropped and the remainder is transposed with
    jmath.transpose.  The first transposed row (minus its first entry)
    supplies the sample names.  Every following row starts with the
    series name, which is used as the y label and as the file name
    <outfile>/<name>.png, followed by one value per sample.  Bars are
    ordered by increasing value.

    Raises AssertionError when filelib.exists_nz rejects outfile
    afterwards.
    """
    from genomicode import filelib
    import os
    from genomicode import jmath

    in_data = antecedents
    records = [record for record in filelib.read_cols(in_data.identifier)]
    trimmed = [record[1:] for record in records]
    matrix = jmath.transpose(trimmed)
    sample = matrix[0][1:]
    data = matrix[1:]

    if not os.path.exists(outfile):
        os.mkdir(outfile)

    for one_data in data:
        ylabel = one_data[0]
        values = [float(text) for text in one_data[1:]]
        # Sort the (value, sample) pairs so the bars rise left to right.
        pairs = sorted((val, sample[k]) for k, val in enumerate(values))
        gene_value = [pair[0] for pair in pairs]
        label = [pair[1] for pair in pairs]

        from genomicode import mplgraph
        fig = mplgraph.barplot(gene_value,
                               box_label=label,
                               xtick_rotation=90,
                               xlabel='sample',
                               ylabel=ylabel)
        fig.savefig(os.path.join(outfile, ylabel) + '.png')

    assert filelib.exists_nz(outfile), (
        'the output file %s for plot_geneset_score_bar fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Normalize the input matrix with the configured bfrmnorm script.

    The number of factors defaults to 1 and can be overridden with
    user_options['num_factors']; it must be between 1 and the number
    of columns of the input matrix.  The script writes into the
    directory 'tmp_dir' (relative to the current directory); its
    'normalized.gct' is converted to pcl format and written
    tab-delimited to outfile.

    Raises ValueError with the captured stderr when the script writes
    anything to stderr, AssertionError when the script cannot be
    found, num_factors is out of range, or the expected output is
    missing.
    """
    import os
    import subprocess
    import arrayio
    from Betsy import module_utils
    from genomicode import filelib
    from genomicode import config

    in_data = antecedents
    bfrm_path = config.bfrmnorm
    bfrm_BIN = module_utils.which(bfrm_path)
    assert bfrm_BIN, 'cannot find the %s' % bfrm_path

    num_factor = 1
    if 'num_factors' in user_options:
        num_factor = int(user_options['num_factors'])
        assert num_factor >= 1, 'the num_factor should be >=1'
        M = arrayio.read(in_data.identifier)
        col_num = M.ncol()
        assert num_factor <= col_num, (
            'the num_factor should be less than %d' % col_num)

    tmp = 'tmp_dir'
    command = [
        'python', bfrm_BIN, in_data.identifier,
        '-f', str(num_factor),
        '-o', tmp,
    ]
    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    error_message = process.communicate()[1]
    if error_message:
        raise ValueError(error_message)
    assert filelib.exists_nz(tmp), (
        'the output dir %s for bfrm_normalize fails' % tmp)
    out = os.path.join(tmp, 'normalized.gct')
    assert filelib.exists_nz(out), (
        'the output gct file for bfrm_normalize fails')

    M = arrayio.read(out)
    M_new = arrayio.convert(M, to_format=arrayio.pcl_format)
    # "with" closes the handle even when the write raises.
    with open(outfile, 'w') as f:
        arrayio.tab_delimited_format.write(M_new, f)
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Run one command per .bam file, retrying failures with more RAM.

    antecedents is a (bam_node, ref_node) pair.  For every .bam file
    a job (in_filename, log_filename, out_filename) is prepared, with
    the log and the output placed in out_path.  The commands come from
    make_commands and are first built with 5 Gb of RAM; jobs whose
    output is missing afterwards are run once more with MAX_RAM Gb.

    The commands are expected to look like:
        java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar
          -T SplitNCigarReads -R ../hg19.fa -I $i -o $j
          -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60
          -U ALLOW_N_CIGAR_READS

    Returns a metadata dictionary with "commands" (all commands that
    were run) and "num_procs" (the process count of the first pass).
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    MAX_RAM = 64      # maximum amount of ram to use in Gb.
    START_RAM = 5     # amount of ram for the first attempt, in Gb.

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # list of (in_filename, log_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        basename = os.path.split(in_filename)[1]
        stem = os.path.splitext(basename)[0]
        log_filename = os.path.join(out_path, "%s.log" % stem)
        out_filename = os.path.join(out_path, basename)
        jobs.append((in_filename, log_filename, out_filename))

    # First pass.
    commands = make_commands(jobs, ref.fasta_file_full, START_RAM)
    nc = mlib.calc_max_procs_from_ram(START_RAM, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = commands
    metadata["num_procs"] = nc

    # If any of the analyses didn't finish, try again with more RAM.
    unfinished = [job for job in jobs if not filelib.exists_nz(job[2])]
    if unfinished:
        commands = make_commands(unfinished, ref.fasta_file_full, MAX_RAM)
        nc = mlib.calc_max_procs_from_ram(MAX_RAM, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] += commands

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([job[-1] for job in jobs])
    return metadata
def plot_line_keywds(filename, keywords, outfile):
    """Draw one line plot per keyword and stack them into one image.

    filename is read with arrayio.  For every keyword, the rows whose
    second row-name column equals the keyword are plotted as lines
    (legend: first row-name column) and saved as '<keyword>.png' in
    the current directory.  The PNGs are then pasted top to bottom on
    a white background and saved to outfile.

    Changes over the previous version:
      * The horizontal paste offset uses floor division, so it stays
        an integer under true division as well.
      * The "cannot find" assertion names the keyword that is missing
        instead of the whole keyword list.
      * The imaging module is also looked up as PIL.Image.

    Raises AssertionError when a keyword matches no row or when
    filelib.exists_nz rejects outfile afterwards.
    """
    import arrayio
    from genomicode import mplgraph
    from genomicode import filelib
    try:
        import Image
    except ImportError:
        # Pillow only installs the module inside the PIL package.
        from PIL import Image

    M = arrayio.read(filename)
    header = M.row_names()
    label = M._col_names['_SAMPLE_NAME']

    outfiles = []
    for keyword in keywords:
        out = keyword + '.png'
        data = []
        legend_name = []
        for i in range(M.dim()[0]):
            if M.row_names(header[1])[i] == keyword:
                data.append(M.slice()[i])
                legend_name.append(M.row_names(header[0])[i])
        assert len(data) > 0, 'cannot find the keyword %s in the file %s' % (
            keyword, filename)
        # Each line is a list of (x, y) points with x the column index.
        lines = [[(j, value) for j, value in enumerate(row)]
                 for row in data]
        params = {
            "box_label": label,
            "legend": legend_name,
            "ylim_min": 0,
            "ylabel": keyword,
            "left": 0.1,
        }
        fig = mplgraph.lineplot(*lines, **params)
        fig.savefig(out)
        outfiles.append(out)

    imgs = [Image.open(name, 'r') for name in outfiles]
    img_w_list = [img.size[0] for img in imgs]
    img_h_list = [img.size[1] for img in imgs]

    total_w = max(img_w_list) + 30
    total_h = sum(img_h_list) + 10
    background = Image.new('RGBA', (total_w, total_h), (255, 255, 255, 255))
    bg_w, bg_h = background.size

    # Centre horizontally; paste() needs integer coordinates.
    offset_w = (bg_w - max(img_w_list)) // 2
    # Image i starts where the combined height of images i.. fits
    # exactly above the bottom edge.
    offset_h_list = [bg_h - sum(img_h_list[i:])
                     for i in range(len(img_h_list))]
    for img, offset_h in zip(imgs, offset_h_list):
        background.paste(img, (offset_w, offset_h))
    background.save(outfile)
    assert filelib.exists_nz(outfile), 'the plot_line_keywds fails'
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Copy the input file unchanged to outfile.

    Raises AssertionError when filelib.exists_nz rejects outfile after
    the copy.
    """
    import shutil
    from genomicode import filelib

    source = antecedents.identifier
    shutil.copyfile(source, outfile)

    copied = filelib.exists_nz(outfile)
    assert copied, (
        'the output file %s for convert_postprocess_impute fails' % outfile)
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Keep the rows whose Affymetrix/Illumina probe pair is a best match.

    The mapping file configured as HumanHT_12_to_HG_u133_Plus_2 lists
    probe pairs; a pair counts when its Distance is at most 1000 and
    its Match is 'Best for Both'.  The input matrix must carry both a
    HumanHT_12 and a HG_U133_Plus_2 annotation column, as reported by
    arrayplatformlib.score_all_platforms_of_matrix.  Rows whose
    (Affymetrix, Illumina) ids form an accepted pair are written
    tab-delimited to outfile.

    The accepted pairs are kept in a set, so the per-row lookup no
    longer scans a list, and the output handle is closed by a "with"
    block.

    Returns None in all cases; nothing is written when one of the two
    annotation columns is missing or no row matches.
    """
    from genomicode import filelib
    import os
    import arrayio
    from genomicode import config
    from genomicode import arrayplatformlib

    in_data = antecedents
    mapfile = config.HumanHT_12_to_HG_u133_Plus_2
    assert os.path.exists(mapfile), 'mapping file %s does not exist' % mapfile

    best_pairs = set()
    for d in filelib.read_row(mapfile, header=True):
        if int(d.Distance) <= 1000 and d.Match == 'Best for Both':
            best_pairs.add((d.Affymetrix_Probe_Set_ID, d.Illumina_Probe_ID))

    M = arrayio.read(in_data.identifier)
    platform_list = arrayplatformlib.score_all_platforms_of_matrix(M)
    illu_id = None
    probe_id = None
    for platform in platform_list:
        # platform[0] is used as the key into M._row_names.
        if 'HumanHT_12' in platform:
            illu_id = M._row_names[platform[0]]
        if 'HG_U133_Plus_2' in platform:
            probe_id = M._row_names[platform[0]]
    if not illu_id or not probe_id:
        return None

    index = [i for i in range(M.nrow())
             if (probe_id[i], illu_id[i]) in best_pairs]
    if not index:
        return None

    M_new = M.matrix(index, None)
    with open(outfile, 'w') as f:
        arrayio.tab_delimited_format.write(M_new, f)
    assert filelib.exists_nz(outfile), (
        'the output file %s for best_match_both fails' % outfile
    )
def _make_config_file(config_filename, skip_depth_filter=False):
    """Write a strelka configuration file derived from the bwa default.

    The template <strelka>/etc/strelka_config_bwa_default.ini is read,
    every line is stripped of surrounding whitespace, and the
    "isSkipDepthFilters = <0|1>" line is set to 1 when
    skip_depth_filter is true and to 0 otherwise.  The result is
    written to config_filename.

    Changes over the previous version:
      * The result of filelib.exists_nz(src_config) used to be thrown
        away; a missing or empty template is now reported by an
        assertion.
      * Both files are closed by "with" blocks, so the new
        configuration is flushed to disk before the function returns.

    Raises AssertionError when the template is missing or empty, or
    when the isSkipDepthFilters line does not have the expected
    "name = value" form.
    """
    import os
    from genomicode import filelib
    from Betsy import module_utils as mlib

    strelka_path = mlib.get_config("strelka", assert_exists=True)
    src_config = os.path.join(
        strelka_path, "etc", "strelka_config_bwa_default.ini")
    assert filelib.exists_nz(src_config), (
        "Missing or empty file: %s" % src_config)
    with open(src_config) as handle:
        lines = handle.readlines()
    assert lines

    # isSkipDepthFilters should be set to 1 to skip depth filtration
    # for whole exome or other targeted sequencing data.
    wanted = "1" if skip_depth_filter else "0"

    # Edit configure options.
    for i in range(len(lines)):
        line = lines[i].strip()
        if line.startswith("isSkipDepthFilters"):
            # isSkipDepthFilters = 0
            x = line.split()
            assert len(x) == 3
            assert x[1] == "="
            assert x[2] in ["0", "1"]
            x[2] = wanted
            line = " ".join(x)
        lines[i] = line

    # Replace the newline that was stripped.
    lines = [x + "\n" for x in lines]
    with open(config_filename, 'w') as handle:
        handle.writelines(lines)
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """convert the cel file with ccl or v3_4 to v3_4

    The input directory tree is copied unchanged to outfile.

    Raises AssertionError when filelib.exists_nz rejects outfile after
    the copy.
    """
    import shutil
    from genomicode import filelib

    source_dir = antecedents.identifier
    shutil.copytree(source_dir, outfile)

    copied = filelib.exists_nz(outfile)
    assert copied, (
        'the output file %s for detect_CEL_version' % outfile
    )
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Normalize the Agilent files of the input folder through R.

    The files are read with read.Agilent, normalized with maNorm
    (norm = "loess") followed by maNormScale (norm = "p"), and written
    by R to the temporary file 'tmp.txt' in the current directory.
    That file is then reduced to the "ProbeName" column plus every
    column after "Sequence" (or after "ControlType" when there is no
    "Sequence" column) and written to outfile, each field followed by
    a tab.

    Raises AssertionError when filelib.exists_nz rejects outfile
    afterwards.
    """
    import os
    from genomicode import jmath
    from genomicode import filelib
    in_data = antecedents
    cwd = os.getcwd()
    R = jmath.start_R()
    R('require(limma,quietly=TRUE)')
    R('library(marray)')
    # The R code lists the files of its working directory, so switch
    # into the input folder for the read and always switch back.
    os.chdir(in_data.identifier)
    try:
        R('dir<-getwd()')
        R('files<-list.files(dir)')
        R('x.read<-read.Agilent(files)')
    finally:
        os.chdir(cwd)
    R('xnorm.loc <- maNorm(x.read, norm = "loess")')
    R('x.norm <- maNormScale(xnorm.loc, norm = "p")')
    tmpfile = 'tmp.txt'
    # Make the file name available inside R as the variable "tmpfile".
    jmath.R_equals(tmpfile, 'tmpfile')
    R('write.marray(x.norm,tmpfile)')
    f = open(tmpfile, 'r')
    text = f.readlines()
    firstline = text[0].split()
    f.close()
    # Header fields are compared including their double quotes.
    firstindex = firstline.index('"ProbeName"')
    if '"Sequence"' in firstline:
        secondindex = firstline.index('"Sequence"')
    else:
        secondindex = firstline.index('"ControlType"')
    # Columns after the last annotation column are kept as samples.
    sample = range(secondindex + 1, len(firstline))
    f = open(outfile, 'w')
    for i in text:
        line = i.split()
        f.write(line[firstindex] + '\t')
        for j in sample:
            f.write(line[j] + '\t')
        f.write('\n')
    f.close()
    os.remove(tmpfile)
    assert filelib.exists_nz(outfile), (
        'the output file %s for preprocess_agilent fails' % outfile
    )
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Copy the control file of the input folder to outfile.

    Every entry of the input folder whose name contains '-controls' is
    copied to outfile in listing order, so with several matches the
    last one copied is the one that remains.

    Raises AssertionError when filelib.exists_nz rejects outfile
    afterwards.
    """
    import os
    import shutil
    from genomicode import filelib

    folder = antecedents.identifier
    control_files = [name for name in os.listdir(folder)
                     if '-controls' in name]
    for name in control_files:
        shutil.copyfile(os.path.join(folder, name), outfile)

    assert filelib.exists_nz(outfile), (
        'the output file %s for illu_control fails' % outfile)
def add_snpeff_to_svm(svm_file, snpeff_file, outfile):
    """Insert SnpEff annotation columns into a simple variant matrix.

    When filelib.exists_nz rejects snpeff_file, svm_file is copied to
    outfile unchanged.  Otherwise the annotations are keyed by their
    (Chrom, Pos, Ref, Alt) fields, and every column after the first
    four is added to the matrix as "SnpEff______<name>", placed right
    after the first four headers of the matrix.  Variants without an
    annotation get empty strings.
    """
    import shutil
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix

    if not filelib.exists_nz(snpeff_file):
        shutil.copy2(svm_file, outfile)
        return

    # Read the annotations.
    header = None   # includes Chrom, Pos, Ref, Alt
    coord2d = {}
    for d in filelib.read_row(snpeff_file, header=1):
        if header is None:
            header = d._header
        coord2d[(d.Chrom, d.Pos, d.Ref, d.Alt)] = d

    svm = SimpleVariantMatrix.read_as_am(svm_file)
    CHROM = svm.header2annots["______Chrom"]
    POS = svm.header2annots["______Pos"]
    REF = svm.header2annots["______Ref"]
    ALT = svm.header2annots["______Alt"]

    snpeff_header = header[4:]
    num_annots = len(snpeff_header)

    # One row of annotation values per variant (row major).
    snpeff_matrix = []
    for i in range(len(CHROM)):
        d = coord2d.get((CHROM[i], POS[i], REF[i], ALT[i]))
        if d:
            row = d._cols[4:]
        else:
            row = [""] * num_annots
        assert len(row) == num_annots
        snpeff_matrix.append(row)
    assert len(snpeff_matrix) == len(CHROM)

    # AnnotationMatrix is column major.
    snpeff_annots = [[row[j] for row in snpeff_matrix]
                     for j in range(num_annots)]

    # Convert the headers to SVM format.
    snpeff_header = ["SnpEff______%s" % name for name in snpeff_header]

    # Make the new SimpleVariantMatrix.
    headers = svm.headers[:4] + snpeff_header + svm.headers[4:]
    svm_annots = [svm.header2annots[h] for h in svm.headers_h]
    all_annots = svm_annots[:4] + snpeff_annots + svm_annots[4:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=svm.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Draw a bar plot of the predictions in a prediction file.

    The input is read with filelib.read_cols; the first record is the
    header and the first field of every other record is the sample
    name.  When every 'Confidence' value is blank, or the header has a
    'Correct?' column, the 'Predicted_class' of each sample is plotted
    as an integer code (classes are numbered in order of first
    appearance).  Otherwise the confidence values are plotted.

    The confidence values used to be converted with float() before the
    blank check, so a file with a blank Confidence column failed with
    ValueError and the check itself could never match.  The check now
    runs on the raw text, and the conversion happens only when the
    confidences are plotted.

    Raises AssertionError when filelib.exists_nz rejects outfile
    afterwards.
    """
    from genomicode import mplgraph
    from genomicode import filelib

    in_data = antecedents
    matrix = [x for x in filelib.read_cols(in_data.identifier)]
    header = matrix[0]
    index = header.index('Confidence')
    matrix = matrix[1:]
    raw_confidence = [row[index] for row in matrix]
    sample = [row[0] for row in matrix]

    no_confidence = all(value == '' for value in raw_confidence)
    if no_confidence or 'Correct?' in header:
        index = header.index('Predicted_class')
        class_value = [row[index] for row in matrix]
        # Number the classes in order of first appearance.
        label_dict = {}
        label_list = []
        for label in class_value:
            if label not in label_dict:
                label_dict[label] = len(label_dict)
            label_list.append(label_dict[label])
        yticks = list(label_dict.keys())
        ytick_pos = [label_dict[label] for label in yticks]
        fig = mplgraph.barplot(label_list,
                               box_label=sample,
                               ylim=(-0.5, 1.5),
                               ytick_pos=ytick_pos,
                               yticks=yticks,
                               xtick_rotation='vertical',
                               ylabel='Prediction',
                               xlabel='Sample')
    else:
        confidence = [float(value) for value in raw_confidence]
        fig = mplgraph.barplot(confidence,
                               box_label=sample,
                               ylim=(-1.5, 1.5),
                               xtick_rotation='vertical',
                               ylabel='Prediction',
                               xlabel='Sample')
    fig.savefig(outfile)
    assert filelib.exists_nz(outfile), (
        'the output file %s for plot_prediction_bar fails' % outfile
    )
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Quantile normalize the input matrix.

    The matrix is read with arrayio, normalized with
    quantnorm.normalize, converted to pcl format and written
    tab-delimited to outfile.

    The output is opened only once the normalized matrix is ready and
    is closed by a "with" block, so a failure no longer leaves an open
    handle behind.

    Raises AssertionError when filelib.exists_nz rejects outfile
    afterwards.
    """
    from genomicode import quantnorm
    import arrayio
    from genomicode import filelib

    in_data = antecedents
    M = arrayio.read(in_data.identifier)
    Y = quantnorm.normalize(M)
    Y_c = arrayio.convert(Y, to_format=arrayio.pcl_format)
    with open(outfile, 'w') as f:
        arrayio.tab_delimited_format.write(Y_c, f)
    assert filelib.exists_nz(outfile), (
        'the output file %s for quantile fails' % outfile
    )