def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Normalize a data file with the external ComBat script.

    ``antecedents`` unpacks into ``(data_node, cls_node)``.  When both are
    truthy, the class-label file is read, the script named by
    ``config.combatnorm`` is located and run under ``python`` with the data
    file (``-f``), the label file (``-label``) and ``outfile`` (``-o``).
    When either node is falsy nothing is run.

    Raises:
        AssertionError: the first value returned by
            ``read_label_file.read`` has fewer than two entries, the
            script cannot be located, or ``outfile`` fails the
            ``filelib.exists_nz`` check after the run.
        ValueError: the script wrote anything to stderr; the stderr text
            is the exception message.

    Returns:
        ``False`` in every non-raising case.
    """
    import subprocess
    from Betsy import read_label_file
    from Betsy import module_utils
    from genomicode import filelib
    from genomicode import config

    data_node, cls_node = antecedents
    if not (data_node and cls_node):
        return False

    label_info = read_label_file.read(cls_node.identifier)
    result, label_line, second_line = label_info
    assert len(result) >= 2, (
        'for combat,there should be equal or larger than 2 classes')

    combat_path = config.combatnorm
    combat_BIN = module_utils.which(combat_path)
    assert combat_BIN, 'cannot find the %s' % combat_path

    argv = ['python', combat_BIN]
    argv += ['-f', data_node.identifier]
    argv += ['-o', outfile]
    argv += ['-label', cls_node.identifier]
    child = subprocess.Popen(
        argv, shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, stderr_text = child.communicate()
    # Any stderr output at all is treated as a failure of the script.
    if stderr_text:
        raise ValueError(stderr_text)

    assert filelib.exists_nz(outfile), (
        'the output file %s for combat fails' % outfile)
    return False
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Filter variants: ``bcftools view <input> | vcfutils varFilter -D500``.

    The standard output of ``bcftools view`` is piped into
    ``vcfutils varFilter -D500`` and the filter's standard output is
    written to ``outfile``.

    The two programs are started as separate processes connected by a
    pipe.  A literal ``'|'`` inside an argument list run with
    ``shell=False`` is handed to the first program as an ordinary argument
    and never creates a pipeline.

    Args:
        antecedents: single input node; its ``identifier`` is the file
            passed to ``bcftools view``.
        outfile: path that receives the filtered output.
        network, out_attributes, user_options, num_cores: unused here.

    Raises:
        AssertionError: ``config.bcftools`` cannot be located, or
            ``outfile`` fails the ``filelib.exists_nz`` check afterwards.
        ValueError: the combined stderr of the two programs contains the
            substring ``'error'``.
    """
    import subprocess
    import tempfile
    from genomicode import filelib
    from Betsy import module_utils
    from genomicode import config

    in_data = antecedents
    bcftools_BIN = config.bcftools
    bcftools_module = module_utils.which(bcftools_BIN)
    assert bcftools_module, 'cannot find the %s' % bcftools_BIN
    # NOTE(review): vcfutils is not looked up with module_utils.which();
    # that check was already disabled in the original code.
    vcfutils_BIN = config.vcfutils

    view_command = [bcftools_BIN, 'view', in_data.identifier]
    filter_command = [vcfutils_BIN, 'varFilter', '-D500']

    # stderr of the first stage goes to a temporary file rather than a
    # pipe: nothing drains it while we wait on the second stage, so a full
    # pipe buffer could otherwise stall the whole pipeline.
    view_stderr = tempfile.TemporaryFile()
    try:
        with open(outfile, 'w') as out_handle:
            view_process = subprocess.Popen(
                view_command, shell=False,
                stdout=subprocess.PIPE, stderr=view_stderr)
            try:
                filter_process = subprocess.Popen(
                    filter_command, shell=False,
                    stdin=view_process.stdout, stdout=out_handle,
                    stderr=subprocess.PIPE)
            except Exception:
                # Do not leave the first stage running without a reader.
                view_process.stdout.close()
                view_process.wait()
                raise
            # Drop our copy of the read end so the first stage receives
            # SIGPIPE if the filter exits early.
            view_process.stdout.close()
            filter_errors = filter_process.communicate()[1]
        view_process.wait()
        view_stderr.seek(0)
        view_errors = view_stderr.read()
    finally:
        view_stderr.close()

    error_message = view_errors + filter_errors
    if 'error' in error_message:
        raise ValueError(error_message)
    assert filelib.exists_nz(outfile), (
        'the output file %s for filter_vcf_file does not exist' % outfile)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Score a data file against a gene set file with an external script.

    ``antecedents`` unpacks into ``(data_node, geneset_node)``.  The script
    named by ``config.score_geneset`` is run under ``python`` with
    ``-o outfile``, ``--geneset_file``, the data file and ``--all``;
    ``--automatch`` is appended when ``out_attributes['automatch']`` equals
    ``'yes'``.

    Raises:
        AssertionError: the script cannot be located, or ``outfile`` fails
            the ``filelib.exists_nz`` check after the run.
        ValueError: the script wrote anything to stderr.
    """
    import subprocess
    from Betsy import module_utils
    from genomicode import config
    from genomicode import filelib

    data_node, geneset_node = antecedents

    score_geneset_path = config.score_geneset
    score_geneset_BIN = module_utils.which(score_geneset_path)
    assert score_geneset_BIN, 'cannot find the %s' % score_geneset_path

    wants_automatch = out_attributes['automatch'] == 'yes'
    argv = ['python', score_geneset_BIN, '-o', outfile]
    argv += ['--geneset_file', geneset_node.identifier]
    argv += [data_node.identifier, '--all']
    if wants_automatch:
        argv += ['--automatch']

    child = subprocess.Popen(
        argv, shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, stderr_text = child.communicate()
    # Any stderr output at all is treated as a failure of the script.
    if stderr_text:
        raise ValueError(stderr_text)

    assert filelib.exists_nz(outfile), (
        'the output file %s for score_pathway_with_geneset fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Run the external scoresig script on a single input file.

    The script named by ``config.scoresig`` is run under ``python``; the
    same input file is supplied for both the ``-r`` and the ``-m``
    argument, together with ``-j 20`` and ``-o outfile``.

    Raises:
        AssertionError: the script cannot be located, or ``outfile`` fails
            the ``filelib.exists_nz`` check after the run.
        ValueError: the script wrote anything to stderr.
    """
    import subprocess
    from Betsy import module_utils
    from genomicode import config
    from genomicode import filelib

    in_data = antecedents

    scoresig_path = config.scoresig
    scoresig_BIN = module_utils.which(scoresig_path)
    assert scoresig_BIN, 'cannot find the %s' % scoresig_path

    source = in_data.identifier
    argv = ['python', scoresig_BIN]
    argv.extend(['-r', source, '-m', source])
    argv.extend(['-j', '20', '-o', outfile])

    child = subprocess.Popen(
        argv, shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, stderr_text = child.communicate()
    # Any stderr output at all is treated as a failure of the script.
    if stderr_text:
        raise ValueError(stderr_text)

    assert filelib.exists_nz(outfile), (
        'the output file %s for run_scoresig does not exists' % outfile)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Normalize a matrix with the external BFRM script.

    The script named by ``config.bfrmnorm`` is run under ``python`` on the
    input file with ``-f <num_factor>`` and ``-o tmp_dir``.  The
    ``normalized.gct`` file it leaves in ``tmp_dir`` is read, converted to
    ``arrayio.pcl_format`` and written to ``outfile`` with
    ``arrayio.tab_delimited_format``.

    ``num_factor`` defaults to 1.  When ``user_options`` contains
    ``'num_factors'`` it is converted with ``int`` and must lie between 1
    and the number of columns of the input matrix (inclusive).

    Raises:
        AssertionError: the script cannot be located, ``num_factors`` is
            out of range, or an expected output fails the
            ``filelib.exists_nz`` check.
        ValueError: the script wrote anything to stderr (also raised by
            ``int`` for a non-integer ``num_factors``).
    """
    import os
    import subprocess
    import arrayio
    from Betsy import module_utils
    from genomicode import filelib
    from genomicode import config

    in_data = antecedents
    bfrm_path = config.bfrmnorm
    bfrm_BIN = module_utils.which(bfrm_path)
    assert bfrm_BIN, 'cannot find the %s' % bfrm_path

    num_factor = 1
    if 'num_factors' in user_options:
        num_factor = int(user_options['num_factors'])
        assert num_factor >= 1, 'the num_factor should be >=1'
        col_num = arrayio.read(in_data.identifier).ncol()
        assert num_factor <= col_num, (
            'the num_factor should be less than %d' % col_num)

    # The script writes into a directory relative to the current working
    # directory.
    tmp = 'tmp_dir'
    command = [
        'python', bfrm_BIN, in_data.identifier,
        '-f', str(num_factor),
        '-o', tmp,
    ]
    process = subprocess.Popen(
        command, shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    error_message = process.communicate()[1]
    # Any stderr output at all is treated as a failure of the script.
    if error_message:
        raise ValueError(error_message)

    assert filelib.exists_nz(tmp), (
        'the output dir %s for bfrm_normalize fails' % tmp)
    out = os.path.join(tmp, 'normalized.gct')
    assert filelib.exists_nz(out), (
        'the output gct file for bfrm_normalize fails')

    M = arrayio.read(out)
    M_new = arrayio.convert(M, to_format=arrayio.pcl_format)
    # The context manager closes the handle even if the write raises.
    with open(outfile, 'w') as handle:
        arrayio.tab_delimited_format.write(M_new, handle)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Run ``bwa aln`` on the input against a configured reference.

    ``out_attributes['ref']`` selects the reference: ``hg18``, ``hg19``,
    ``dm3`` or ``mm9`` map to ``config.hg18_ref``, ``config.hg19_ref``,
    ``config.dm3_ref`` and ``config.mm9_ref``.  The standard output of
    ``bwa aln <ref> <input>`` is written to ``outfile``.

    Raises:
        ValueError: the reference name is not one of the four above, or
            the program's stderr contains the substring ``'error'``.
        AssertionError: the reference file does not exist, ``config.bwa``
            cannot be located, or ``outfile`` fails the
            ``filelib.exists_nz`` check after the run.
    """
    import os
    import subprocess
    from genomicode import config
    from genomicode import filelib
    from Betsy import module_utils

    in_data = antecedents

    species = out_attributes['ref']
    reference_attributes = (
        ('hg18', 'hg18_ref'),
        ('hg19', 'hg19_ref'),
        ('dm3', 'dm3_ref'),
        ('mm9', 'mm9_ref'),
    )
    for known_species, attribute in reference_attributes:
        if species == known_species:
            # Only the selected attribute is read from config.
            ref_file = getattr(config, attribute)
            break
    else:
        raise ValueError('cannot handle %s' % species)
    assert os.path.exists(ref_file), "File not found: %s" % ref_file

    bwa_BIN = config.bwa
    bwa_module = module_utils.which(bwa_BIN)
    assert bwa_module, 'cannot find the %s' % bwa_BIN

    argv = [bwa_BIN, 'aln', ref_file, in_data.identifier]
    # The child keeps its own copy of the descriptor, so our handle can be
    # closed as soon as the process has been started.
    with open(outfile, 'w') as out_handle:
        child = subprocess.Popen(
            argv, shell=False, stdout=out_handle, stderr=subprocess.PIPE)
    _, stderr_text = child.communicate()
    # Only stderr output mentioning 'error' counts as a failure here.
    if 'error' in stderr_text:
        raise ValueError(stderr_text)

    assert filelib.exists_nz(outfile), (
        'the output file %s for align_sequence does not exist' % outfile)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Run the external cluster program with its ``-pg`` option.

    ``out_attributes['distance']`` must be ``'correlation'`` or
    ``'euclidean'``; it is translated to the ``-g`` code ``'1'`` or ``'7'``.
    After the program finishes, every entry of the current directory whose
    name ends with ``pca_gene.coords.txt`` is renamed to ``outfile``.

    Raises:
        KeyError: the distance attribute is not one of the two above.
        AssertionError: ``config.cluster`` cannot be located, or
            ``outfile`` fails the ``filelib.exists_nz`` check at the end.
        ValueError: the program wrote anything to stderr.
    """
    import os
    import subprocess
    from genomicode import filelib
    from Betsy import module_utils
    from genomicode import config

    in_data = antecedents

    CLUSTER_BIN = config.cluster
    cluster = module_utils.which(CLUSTER_BIN)
    assert cluster, 'cannot find the %s' % CLUSTER_BIN

    distance_codes = {'correlation': '1', 'euclidean': '7'}
    dist = distance_codes[out_attributes['distance']]

    argv = [CLUSTER_BIN, '-f', in_data.identifier, '-u', outfile]
    argv.extend(["-g", dist, '-pg', '-e', '1'])

    child = subprocess.Popen(
        argv, shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, stderr_text = child.communicate()
    # Any stderr output at all is treated as a failure of the program.
    if stderr_text:
        raise ValueError(stderr_text)

    # Pick up the coordinates file from the working directory.
    suffix = 'pca_gene.coords.txt'
    for entry in os.listdir("."):
        if not entry.endswith(suffix):
            continue
        os.rename(entry, outfile)

    assert filelib.exists_nz(outfile), (
        'the output file %s for cluster_genes_by_pca fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Run the external scoresig script on an RMA/MAS5 pair of files.

    ``antecedents`` unpacks into ``(rma_node, mas5_node)``.  Both files are
    first passed through ``module_utils.convert_to_same_platform``; the two
    returned paths become the ``-r`` and ``-m`` arguments of the script
    named by ``config.scoresig``, alongside ``-j 20`` and ``-o outfile``.
    The output file is not checked by this method.

    Raises:
        AssertionError: the script cannot be located.
        ValueError: the script wrote anything to stderr.
    """
    import subprocess
    from Betsy import module_utils
    from genomicode import config

    rma_node, mas5_node = antecedents

    scoresig_path = config.scoresig
    scoresig_BIN = module_utils.which(scoresig_path)
    assert scoresig_BIN, 'cannot find the %s' % scoresig_path

    converted = module_utils.convert_to_same_platform(
        rma_node.identifier, mas5_node.identifier)
    file1, file2 = converted

    argv = ['python', scoresig_BIN]
    argv += ['-r', file1, '-m', file2]
    argv += ['-j', '20', '-o', outfile]

    child = subprocess.Popen(
        argv, shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, stderr_text = child.communicate()
    # Any stderr output at all is treated as a failure of the script.
    if stderr_text:
        raise ValueError(stderr_text)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Produce ``outfile`` annotated for ``user_options['platform_name']``.

    The platform of the input matrix is identified with
    ``arrayplatformlib.identify_platform_of_matrix``.  When it already
    equals the requested platform the input file is copied to ``outfile``.
    Otherwise the script named by ``config.annotate_matrix`` is run under
    ``python`` with ``--platform <platform>`` and its standard output is
    written to ``outfile``.

    Raises:
        KeyError: ``user_options`` has no ``'platform_name'``.
        AssertionError: ``arrayplatformlib.get_bm_attribute`` returns a
            falsy value for the requested platform, or the annotation
            script cannot be located.
        ValueError: the annotation script wrote anything to stderr.
        Whatever ``filelib.assert_exists_nz`` raises for ``outfile``.
    """
    import subprocess
    import shutil
    import arrayio
    from genomicode import config
    from genomicode import arrayplatformlib
    from genomicode import filelib
    from Betsy import module_utils

    in_data = antecedents
    source = in_data.identifier

    DATA = arrayio.read(source)
    chipname = arrayplatformlib.identify_platform_of_matrix(DATA)
    platform = user_options['platform_name']
    assert arrayplatformlib.get_bm_attribute(platform), (
        'the desire platform %s is not recognized by Betsy' % platform)

    if chipname == platform:
        # Already on the requested platform: a plain copy is enough.
        shutil.copyfile(source, outfile)
    else:
        Annot_path = config.annotate_matrix
        Annot_BIN = module_utils.which(Annot_path)
        assert Annot_BIN, 'cannot find the %s' % Annot_path

        argv = ['python', Annot_BIN, source, "--platform", platform]
        # The child keeps its own copy of the descriptor, so our handle
        # can be closed as soon as the process has been started.
        with open(outfile, 'w') as out_handle:
            child = subprocess.Popen(
                argv, shell=False,
                stdout=out_handle, stderr=subprocess.PIPE)
        _, stderr_text = child.communicate()
        # Any stderr output at all is treated as a failure of the script.
        if stderr_text:
            raise ValueError(stderr_text)

    filelib.assert_exists_nz(outfile)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Rank genes with the GenePattern ``ClassNeighbors`` module.

    ``antecedents`` unpacks into ``(data_node, cls_node)``.  The data file
    is converted to GCT and written to ``./tmp.txt``; the program named by
    ``config.genepattern`` is then run with one ``--parameters key:value``
    pair per collected setting, downloading into
    ``./class_neighbors_result``.  From each ``.odf`` file found there, the
    eleventh tab-separated field of the first ``NumNeighbors`` data rows is
    collected, and the collected values are written tab-joined to
    ``outfile``.

    Settings taken from ``user_options`` (each optional):
    ``cn_num_neighbors``; ``cn_num_perm`` (used only if all digits); and
    ``cn_user_pval``, ``cn_abs_diff``, ``cn_min_threshold``,
    ``cn_max_threshold``, ``cn_min_folddiff`` (each used only if
    ``module_utils.is_number`` accepts it).

    Settings taken from ``out_attributes`` (each required):
    ``cn_mean_or_median`` (``mean`` -> ``''``, ``median`` -> ``'-d'``),
    ``cn_ttest_or_snr`` (``t_test`` -> ``''``, ``snr`` -> ``'-S'``) and
    ``cn_filter_data`` (``yes``/``no``).  Other values are ignored.

    Raises:
        AssertionError: the GenePattern program cannot be located, the
            download directory is missing or contains ``stderr.txt``, an
            ``.odf`` file does not have the expected layout, or
            ``outfile`` fails the ``filelib.exists_nz`` check.
        ValueError: the GenePattern program wrote anything to stderr.
    """
    import os
    import subprocess
    from Betsy import module_utils
    from genomicode import config
    from genomicode import filelib
    import arrayio

    data_node, cls_node = antecedents

    # GenePattern is given the data in GCT format via a scratch file.
    tmp = os.path.join(".", 'tmp.txt')
    M = arrayio.read(data_node.identifier)
    M_c = arrayio.convert(M, to_format=arrayio.gct_format)
    with open(tmp, 'w') as handle:
        arrayio.gct_format.write(M_c, handle)

    module_name = 'ClassNeighbors'
    gp_parameters = dict()
    gp_parameters['data.filename'] = tmp
    gp_parameters['class.filename'] = cls_node.identifier

    if 'cn_num_neighbors' in user_options:
        gp_parameters['num.neighbors'] = str(
            user_options['cn_num_neighbors'])
    if 'cn_num_perm' in user_options:
        if user_options['cn_num_perm'].isdigit():
            gp_parameters['num.permutations'] = str(
                user_options['cn_num_perm'])
    if 'cn_user_pval' in user_options:
        if module_utils.is_number(user_options['cn_user_pval']):
            gp_parameters['user.pval'] = str(user_options['cn_user_pval'])

    mean_median = {'mean': '', 'median': '-d'}
    mean_choice = out_attributes['cn_mean_or_median']
    if mean_choice in mean_median:
        gp_parameters['mean.or.median'] = mean_median[mean_choice]

    # Membership must be tested against the KEYS of the table.  Testing
    # against its values meant 't_test'/'snr' were never forwarded, and a
    # value of '' or '-S' would have raised KeyError on the lookup.
    ttest_snr = {'t_test': '', 'snr': '-S'}
    stat_choice = out_attributes['cn_ttest_or_snr']
    if stat_choice in ttest_snr:
        gp_parameters['ttest.or.snr'] = ttest_snr[stat_choice]

    if out_attributes['cn_filter_data'] in ['yes', 'no']:
        gp_parameters['filter.data'] = str(
            out_attributes['cn_filter_data'])

    # Remaining numeric thresholds: forwarded only when present and
    # accepted by module_utils.is_number; otherwise silently skipped.
    numeric_options = (
        ('cn_abs_diff', 'min.abs.diff'),
        ('cn_min_threshold', 'min.threshold'),
        ('cn_max_threshold', 'max.threshold'),
        ('cn_min_folddiff', 'min.fold.diff'),
    )
    for option, parameter in numeric_options:
        if option in user_options:
            if module_utils.is_number(user_options[option]):
                gp_parameters[parameter] = str(user_options[option])

    gp_path = config.genepattern
    gp_module = module_utils.which(gp_path)
    assert gp_module, 'cannot find the %s' % gp_path

    download_directory = os.path.join(".", 'class_neighbors_result')
    command = [gp_module, module_name, '-o', download_directory]
    for key, value in gp_parameters.items():
        command.extend(['--parameters', key + ':' + value])

    process = subprocess.Popen(
        command, shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() both drains the pipes and waits for exit.  Calling
    # wait() first can deadlock once the child fills a pipe buffer.
    error_message = process.communicate()[1]
    if error_message:
        raise ValueError(error_message)

    assert os.path.exists(download_directory), (
        'there is no output directory for class_neighbors')
    result_files = os.listdir(download_directory)
    assert 'stderr.txt' not in result_files, 'gene_pattern get error'
    os.remove(tmp)

    # Fixed positions inside the ODF file: the 'NumNeighbors=' header line
    # and the first data row.
    numline = 8
    startline = 14
    gene_list = []
    for result_file in result_files:
        if not result_file.endswith('.odf'):
            continue
        odf_path = os.path.join(download_directory, result_file)
        with open(odf_path, 'r') as handle:
            text = handle.read().split('\n')
        assert text[numline].startswith(
            'NumNeighbors'), 'the odf file format is not right'
        number_gene = int(text[numline].split('=')[1])
        assert text[startline].startswith(
            '1'), 'the start line is not right'
        for line in text[startline:startline + number_gene]:
            fields = line.split('\t')
            gene_list.append(fields[10])

    with open(outfile, 'w') as handle:
        handle.write('\t'.join(gene_list))
    assert filelib.exists_nz(outfile), (
        'the output file %s for rank_genes_by_class_neighbors fails' %
        outfile)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Keep only tumor samples by chaining four slice_matrix passes.

    The program named by ``config.slice_matrix`` is run four times, each
    pass reading the previous pass's output and writing its own standard
    output to the next file:

        1. ``--reorder_col_alphabetical``     -> ``temp.txt``
        2. ``--tcga_solid_tumor_only``         -> ``temp1.txt``
        3. ``--tcga_relabel_patient_barcodes`` -> ``temp2.txt``
        4. ``--remove_duplicate_cols``         -> ``outfile``

    An input whose name ends with ``tar.gz`` is first passed through
    ``extract_files``.  The three temporary files are removed once all four
    passes have succeeded.

    Raises:
        AssertionError: the program cannot be located, or the output of a
            pass fails the ``filelib.exists_nz`` check.
        ValueError: any pass wrote anything to stderr.
    """
    import os
    import subprocess
    from Betsy import module_utils
    from genomicode import config
    from genomicode import filelib

    in_data = antecedents
    infile = in_data.identifier
    if in_data.identifier.endswith('tar.gz'):
        infile = extract_files(in_data.identifier)

    slice_matrix_BIN = config.slice_matrix
    slice_matrix = module_utils.which(slice_matrix_BIN)
    assert slice_matrix, 'cannot find the %s' % slice_matrix_BIN

    scratch_files = ('temp.txt', 'temp1.txt', 'temp2.txt')
    # (option, destination, assertion message template) for every pass.
    stages = (
        ('--reorder_col_alphabetical', scratch_files[0],
         'the temp file %s for select_tumor_only fails'),
        ('--tcga_solid_tumor_only', scratch_files[1],
         'the temp file %s for select_tumor_only fails'),
        ('--tcga_relabel_patient_barcodes', scratch_files[2],
         'the output file %s for select_tumor_only fails'),
        ('--remove_duplicate_cols', outfile,
         'the output file %s for select_tumor_only fails'),
    )

    current_input = infile
    for option, destination, failure_template in stages:
        argv = [slice_matrix_BIN, option, current_input]
        with open(destination, 'w') as sink:
            child = subprocess.Popen(
                argv, shell=False, stdout=sink, stderr=subprocess.PIPE)
            stderr_text = child.communicate()[1]
        # Any stderr output at all is treated as a failure of the pass.
        if stderr_text:
            raise ValueError(stderr_text)
        assert filelib.exists_nz(destination), (
            failure_template % destination)
        current_input = destination

    for scratch in scratch_files:
        os.remove(scratch)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Classify test samples with the GenePattern ``WeightedVoting`` module.

    ``antecedents`` unpacks into
    ``(data_node_train, data_node_test, cls_node_train)``.  The train and
    test files are passed through ``module_utils.convert_to_same_platform``
    and a placeholder class file ``temp_test.cls`` (label ``'0'`` for every
    test column) is written.  The program named by ``config.genepattern``
    is run, downloading into ``./wv_result``.  The ``*pred.odf`` file found
    there is read and renamed to ``prediction.odf``, and a three-column
    table (``Sample_name``, ``Predicted_class``, ``Confidence``) is written
    tab-separated to ``outfile``.

    Optional ``user_options``: ``wv_num_features``, ``wv_minstd`` (must
    satisfy ``module_utils.is_number``).  Required ``out_attributes``:
    ``wv_feature_stat``, whose position in the list of known statistics is
    sent as ``feature.selection.statistic``.

    Raises:
        AssertionError: an option is invalid, the GenePattern program
            cannot be located, the download directory is missing or
            contains ``stderr.txt``, no ``*pred.odf`` file was produced,
            the prediction file lacks the ``HeaderLines=`` line, or
            ``outfile`` fails the ``filelib.exists_nz`` check.
        ValueError: the GenePattern program wrote anything to stderr.
    """
    import os
    import subprocess
    import arrayio
    from genomicode import filelib
    from Betsy import read_label_file
    from Betsy import module_utils
    from genomicode import config

    data_node_train, data_node_test, cls_node_train = antecedents
    module_name = 'WeightedVoting'
    gp_parameters = dict()

    file1, file2 = module_utils.convert_to_same_platform(
        data_node_train.identifier, data_node_test.identifier)

    # Only the class names of the training labels are needed; the test
    # samples all get the placeholder label '0'.
    _result, _train_labels, class_name = read_label_file.read(
        cls_node_train.identifier)
    M = arrayio.read(data_node_test.identifier)
    label_line = ['0'] * M.dim()[1]
    read_label_file.write('temp_test.cls', class_name, label_line)

    gp_parameters['train.filename'] = file1
    gp_parameters['train.class.filename'] = cls_node_train.identifier
    gp_parameters['test.filename'] = file2
    gp_parameters['test.class.filename'] = 'temp_test.cls'
    if 'wv_num_features' in user_options:
        gp_parameters['num.features'] = str(user_options['wv_num_features'])
    if 'wv_minstd' in user_options:
        assert module_utils.is_number(
            user_options['wv_minstd']), 'the wv_minstd should be number'
        gp_parameters['min.std'] = str(user_options['wv_minstd'])

    wv_feature_stat = [
        'wv_snr', 'wv_ttest', 'wv_snr_median', 'wv_ttest_median',
        'wv_snr_minstd', 'wv_ttest_minstd', 'wv_snr_median_minstd',
        'wv_ttest_median_minstd',
    ]
    assert out_attributes['wv_feature_stat'] in wv_feature_stat, (
        'the wv_feature_stat is invalid')
    gp_parameters['feature.selection.statistic'] = str(
        wv_feature_stat.index(out_attributes['wv_feature_stat']))

    gp_path = config.genepattern
    gp_module = module_utils.which(gp_path)
    assert gp_module, 'cannot find the %s' % gp_path

    download_directory = os.path.join(".", 'wv_result')
    command = [gp_module, module_name, '-o', download_directory]
    for key, value in gp_parameters.items():
        command.extend(['--parameters', key + ':' + value])

    process = subprocess.Popen(
        command, shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() both drains the pipes and waits for exit.  Calling
    # wait() first can deadlock once the child fills a pipe buffer.
    error_message = process.communicate()[1]
    if error_message:
        raise ValueError(error_message)

    assert os.path.exists(download_directory), (
        'there is no output directory for weightedVoting')
    result_files = os.listdir(download_directory)
    assert 'stderr.txt' not in result_files, 'gene_pattern get error'

    text = None
    for gp_file in result_files:
        if not gp_file.endswith('pred.odf'):
            continue
        prediction_path = os.path.join(download_directory, gp_file)
        with open(prediction_path, 'r') as handle:
            text = handle.readlines()
        # Rename using the path joined ONCE.  Joining download_directory
        # a second time yields ./wv_result/./wv_result/<name>, which does
        # not exist.
        os.rename(prediction_path,
                  os.path.join(download_directory, 'prediction.odf'))
    assert text is not None, (
        'there is no pred.odf file in %s' % download_directory)

    assert text[1][0:12] == 'HeaderLines='
    start = int(text[1][12:-1])
    newresult = [['Sample_name', 'Predicted_class', 'Confidence']]
    for row in text[start + 2:]:
        fields = row.split()
        if not fields:
            # A blank line carries no prediction.
            continue
        n = len(fields)
        newresult.append(
            [' '.join(fields[0:n - 4]), fields[n - 3], fields[n - 2]])

    with open(outfile, 'w') as handle:
        for record in newresult:
            handle.write('\t'.join(record))
            handle.write('\n')
    assert filelib.exists_nz(outfile), (
        'the output file %s for classify_with_weighted_voting fails' %
        outfile)
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    """Draw a heatmap of the input matrix with the external arrayplot script.

    The script named by ``config.arrayplot`` is run under ``python`` with
    ``-o outfile``, ``--label_arrays`` and ``--label_genes``.  When
    ``out_attributes`` has ``'color'`` it is passed as ``--color`` with
    underscores replaced by hyphens.  The ``-x``/``-y`` box sizes come from
    ``graphlib.find_tall_heatmap_size`` when the matrix has at least four
    times as many rows as columns, and from
    ``graphlib.find_wide_heatmap_size`` otherwise.

    Optional ``user_options``: ``hm_width`` and ``hm_height`` override the
    maximum box width (default 60) and height (default 20).  They are
    converted with ``int`` before use, because user options arrive as
    strings.

    Raises:
        AssertionError: the script cannot be located, or ``outfile`` fails
            the ``filelib.exists_nz`` check after the run.
        ValueError: ``hm_width``/``hm_height`` is not an integer, or the
            script wrote anything to stderr.
        ZeroDivisionError: the input matrix has no columns.
    """
    from genomicode import graphlib
    from genomicode import filelib
    import subprocess
    import arrayio
    from Betsy import module_utils
    from genomicode import config

    in_data = antecedents
    Heatmap_path = config.arrayplot
    Heatmap_BIN = module_utils.which(Heatmap_path)
    assert Heatmap_BIN, 'cannot find the %s' % Heatmap_path

    command = [
        'python', Heatmap_BIN, in_data.identifier,
        '-o', outfile,
        "--label_arrays", "--label_genes",
    ]
    if 'color' in out_attributes:
        command.extend(['--color', out_attributes['color'].replace('_', '-')])

    M = arrayio.read(in_data.identifier)
    nrow = M.nrow()
    ncol = M.ncol()
    ratio = float(nrow) / ncol

    max_box_height = 20
    max_box_width = 60
    # Convert explicitly so the size search receives numbers, not strings.
    if 'hm_width' in user_options:
        max_box_width = int(user_options['hm_width'])
    if 'hm_height' in user_options:
        max_box_height = int(user_options['hm_height'])

    # Both helpers take the same arguments; only the choice depends on the
    # shape of the matrix.
    if ratio >= 4:
        find_heatmap_size = graphlib.find_tall_heatmap_size
    else:
        find_heatmap_size = graphlib.find_wide_heatmap_size
    x, y = find_heatmap_size(
        nrow, ncol,
        max_box_height=max_box_height,
        max_box_width=max_box_width,
        min_box_height=20,
        min_box_width=20,
        max_megapixels=128)
    command.extend(['-x', str(x), '-y', str(y)])

    process = subprocess.Popen(
        command, shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    error_message = process.communicate()[1]
    # Any stderr output at all is treated as a failure of the script.
    if error_message:
        raise ValueError(error_message)
    assert filelib.exists_nz(outfile), (
        'the output file %s for plot_signature_prediction_comparison fails'
        % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores, outfile):
    """Reorder the rows of a matrix to follow a gene list and write it out.

    ``antecedents`` unpacks into ``(data_node, gene_node)``.  The gene list
    is the whitespace-separated content of the gene file.  The rows of the
    data matrix are reordered with ``gene_ranking.find_sorted_index`` and
    the result is written to ``outfile`` with
    ``arrayio.tab_delimited_format``.

    When the platform identified for the matrix differs from the one
    identified for the gene list, either the matrix is re-annotated with
    the external ``config.annotate_matrix`` script or the gene list is
    converted with ``module_utils.convert_gene_list_platform``, depending
    on ``user_options['platform_name']``.

    NOTE(review): the source formatting of this method was lost; the block
    nesting below (in particular which ``if`` the final ``else`` belongs
    to) is a reconstruction and should be confirmed against history.

    Raises:
        AssertionError: the annotation script cannot be located, the
            re-annotated file fails ``filelib.exists_nz``, or ``outfile``
            fails ``filelib.exists_nz``.
        ValueError: the annotation script wrote anything to stderr.
    """
    from Betsy import gene_ranking
    import arrayio
    from Betsy import module_utils
    from genomicode import filelib
    from genomicode import config
    from genomicode import arrayplatformlib
    data_node, gene_node = antecedents
    # Read the gene order list.
    # NOTE(review): this file handle is never closed explicitly.
    gene_list = open(gene_node.identifier, 'r').read().split()
    M = arrayio.read(data_node.identifier)
    x = arrayplatformlib.identify_all_platforms_of_matrix(M)
    if x:
        # Use the first identified entry: x[0][0] is used later as the key
        # into M._row_names, x[0][1] as the platform.
        id = x[0][0]
        platform = x[0][1]
        chip = arrayplatformlib.identify_platform_of_annotations(gene_list)
        if not chip:
            chip = []
        signal_file = data_node.identifier
        #if platform == chip:
        #    tmpfile = data_node.identifier
        #else:
        # NOTE(review): `chip` may be a list here (see the fallback above)
        # while `platform` presumably is a string -- confirm that this
        # comparison behaves as intended.
        if platform != chip:
            platform_name = 'unknown_platform'
            if 'platform_name' in user_options:
                platform_name = user_options['platform_name']
            if platform_name in chip:  #, 'unknown_platform':
                # Re-annotate the matrix with the external script.
                import subprocess
                Annot_path = config.annotate_matrix
                Annot_BIN = module_utils.which(Annot_path)
                assert Annot_BIN, 'cannot find the %s' % Annot_path
                signal_file = 'tmp'
                command = [
                    'python', Annot_BIN,
                    # Needs to be tested.
                    #'-f', single_object.identifier,
                    '-f', data_node.identifier,
                    '-o', signal_file,
                    # NOTE(review): `chip` is passed as a single argv
                    # element; verify it is a string at this point.
                    "--platform", chip,
                ]
                process = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                error_message = process.communicate()[1]
                # Any stderr output at all is treated as a failure.
                if error_message:
                    raise ValueError(error_message)
                assert filelib.exists_nz(
                    signal_file), 'the platform conversion fails'
                # The re-annotated matrix replaces M, and rows are looked
                # up under the requested platform attribute instead.
                id = out_attributes['platform']
                M = arrayio.read(signal_file)
            elif platform_name == platform:
                # Needs to be tested.
                # Convert the gene list to the platform of the matrix.
                infile = gene_node.identifier
                #infile = gene_list_file.identifier
                f = file(infile, 'rU')
                genes = f.readlines()
                f.close()
                gene_list = module_utils.convert_gene_list_platform(
                    genes, platform)
    else:
        # No platform identified: fall back to the first entry of the
        # matrix's row order.
        id = M._row_order[0]
    original_list = M._row_names[id]
    # Get the order index and write to the output file.
    indexlist = gene_ranking.find_sorted_index(original_list, gene_list)
    M_new = M.matrix(indexlist, None)
    f = open(outfile, 'w')
    arrayio.tab_delimited_format.write(M_new, f)
    f.close()
    assert filelib.exists_nz(outfile), (
        'the output file %s for reorder_genes fails' % outfile)