def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Keep the rows whose two-class mean difference reaches a cutoff.

    antecedents is a (data_node, cls_node) pair.  The class label file
    must describe exactly two classes.  For every row of the data matrix
    the absolute difference between the mean over the first class columns
    and the mean over the second class columns is compared with
    log2(fc); rows that reach the cutoff are written to `outfile` in
    tab-delimited format.

    fc defaults to 1 and is replaced by int(user_options['group_fc_num'])
    when that option is present.

    Raises AssertionError when the label file does not hold two classes
    or when no row reaches the cutoff.
    """
    import math
    from Betsy import read_label_file
    from genomicode import jmath
    import arrayio

    data_node, cls_node = antecedents
    # obtain the class label
    label, _label_line, _second_line = read_label_file.read(
        cls_node.identifier)
    assert len(label) == 2, 'the number of class is not 2'

    fc = 1
    if 'group_fc_num' in user_options:
        fc = int(user_options['group_fc_num'])

    M = arrayio.read(data_node.identifier)
    first = M.slice(None, label[0][0])
    second = M.slice(None, label[1][0])

    # The cutoff is the same for every row, so compute it only once.
    # NOTE(review): comparing a difference of means with log2(fc) presumes
    # the matrix holds log2 values -- confirm against the callers.
    min_difference = math.log(fc, 2)
    I_good = [
        i for i in range(M.nrow())
        if abs(jmath.mean(first[i]) - jmath.mean(second[i])) >= min_difference
    ]
    assert I_good, 'there is no gene is significant in fold change with 2'

    M_c = M.matrix(I_good, None)
    # The context manager closes the file even if the writer raises.
    with open(outfile, 'w') as handle:
        arrayio.tab_delimited_format.write(M_c, handle)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Normalize the data file by running the external ComBat script.

    antecedents is a (data_node, cls_node) pair.  When both are truthy the
    class label file must describe at least two classes; the script
    configured as config.combatnorm is then run through `python` with the
    data file, `outfile` and the label file as arguments.

    Raises AssertionError when there are fewer than two classes, when the
    script cannot be located, or when `outfile` is missing or empty
    afterwards.  Raises ValueError carrying the script's stderr when
    anything was written to it.

    Always returns False.
    """
    import subprocess
    from Betsy import read_label_file
    from Betsy import module_utils
    from genomicode import filelib
    from genomicode import config

    data_node, cls_node = antecedents
    if not (data_node and cls_node):
        return False

    classes, _label_line, _class_names = read_label_file.read(
        cls_node.identifier)
    assert len(classes) >= 2, (
        'for combat,there should be equal or larger than 2 classes')

    script_name = config.combatnorm
    script_path = module_utils.which(script_name)
    assert script_path, 'cannot find the %s' % script_name

    argv = ['python', script_path]
    argv += ['-f', data_node.identifier]
    argv += ['-o', outfile]
    argv += ['-label', cls_node.identifier]
    child = subprocess.Popen(
        argv, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _stdout_text, stderr_text = child.communicate()
    # Any output on stderr is treated as a failure of the script.
    if stderr_text:
        raise ValueError(stderr_text)

    assert filelib.exists_nz(outfile), (
        'the output file %s for combat fails' % outfile)
    return False
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Shift-scale normalize the first class against the second class.

    antecedents is a (data_node, cls_node) pair.  When both are truthy the
    label file must describe exactly two classes.  The columns of the
    first class are normalized against the columns of the second class
    with shiftscalenorm.normalize, the normalized values are copied back
    into the full matrix, and the matrix is written to `outfile` in
    tab-delimited format.

    Raises AssertionError when the label file does not hold two classes
    or when `outfile` is missing or empty after writing.

    Always returns False.
    """
    from genomicode import shiftscalenorm
    import arrayio
    from Betsy import read_label_file
    from genomicode import filelib

    data_node, cls_node = antecedents
    if data_node and cls_node:
        result, _label_line, _second_line = read_label_file.read(
            cls_node.identifier)
        assert len(result) == 2, (
            'for shiftscale,there should be only 2 classes')

        M = arrayio.read(data_node.identifier)
        index1 = result[0][0]
        index2 = result[1][0]
        M_1 = M.matrix(None, index1)
        M_2 = M.matrix(None, index2)
        M_y = shiftscalenorm.normalize(M_1, M_2)

        # The dimensions do not change while patching values, so read
        # them once instead of on every loop iteration.
        num_rows, num_cols = M_y.dim()[0], M_y.dim()[1]
        for i in range(num_rows):
            for j in range(num_cols):
                # A value that prints as 'nan' is replaced by the first
                # second-class value of the same row.
                if str(M_y._X[i][j]) == 'nan':
                    M_y._X[i][j] = M_2._X[i][0]

        # Copy the normalized first-class columns back into place.
        for j in range(M.nrow()):
            for i, column in enumerate(index1):
                M._X[j][column] = M_y._X[j][i]

        # The context manager closes the file even if the writer raises.
        with open(outfile, 'w') as handle:
            arrayio.tab_delimited_format.write(M, handle)
        assert filelib.exists_nz(outfile), (
            'the output file %s for shiftscale fails' % outfile)
    return False
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Predict classes for the test samples with a saved libsvm model.

    antecedents is a (svm_model, data_node_test, cls_node_train) triple.
    The columns of the data matrix that follow the first
    len(training labels) columns are used as test samples.  `outfile`
    receives a tab-delimited table with the columns Sample_name,
    Predicted_class and Confidence, one row per test sample.

    Raises AssertionError when `outfile` is missing or empty after
    writing.
    """
    import svmutil
    import arrayio
    from genomicode import filelib
    from Betsy import read_label_file
    from Betsy import module_utils

    svm_model, data_node_test, cls_node_train = antecedents
    # The label file is read once; the original repeated this call with
    # the same argument further down.
    _classes, train_label, second_line = read_label_file.read(
        cls_node_train.identifier)
    M = arrayio.read(data_node_test.identifier)

    # convert to the format libsvm accept
    test = M.matrix(None, list(range(len(train_label), M.dim()[1])))
    x_test = module_utils.format_convert(test)
    model = svmutil.svm_load_model(svm_model.identifier)

    # Placeholder labels are passed because svm_predict takes a label
    # list as its first argument.
    y_test = [0] * len(x_test)
    p_label, _p_acc, p_val = svmutil.svm_predict(y_test, x_test, model)
    prediction = [second_line[int(i)] for i in p_label]

    name = list(test._col_names.keys())[0]
    sample_name = test._col_names[name]
    result = [['Sample_name', 'Predicted_class', 'Confidence']]
    for i, sample in enumerate(sample_name):
        result.append([str(sample), prediction[i], str(p_val[i][0])])

    # The context manager closes the file even if a write raises.
    with open(outfile, 'w') as handle:
        for row in result:
            handle.write('\t'.join(row))
            handle.write('\n')
    assert filelib.exists_nz(outfile), (
        'the output file %s for classify_with_svm fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Train a random forest in R and predict the unlabelled samples.

    antecedents is a (cls_node_train, data_node) pair.  The first
    len(label_line) columns of the data matrix are the training samples
    and the remaining columns are the test samples.  The model is fitted
    with R's randomForest and `outfile` receives a tab-delimited table
    with the columns Sample_name, Predicted_class and Confidence; the
    Confidence column is left empty.
    """
    import arrayio
    from Betsy import read_label_file
    from genomicode import jmath

    cls_node_train, data_node = antecedents
    _classes, label_line, second_line = read_label_file.read(
        cls_node_train.identifier)
    y = [second_line[int(i)] for i in label_line]

    R = jmath.start_R()
    M = arrayio.read(data_node.identifier)
    num_train = len(label_line)
    M_train = M.matrix(None, list(range(0, num_train)))
    M_test = M.matrix(None, list(range(num_train, M.dim()[1])))

    # The matrices are transposed before being handed to R.
    jmath.R_equals_matrix(jmath.transpose(M_train.slice()), 'data')
    jmath.R_equals_matrix(jmath.transpose(M_test.slice()), 'test')
    jmath.R_equals(y, 'y')
    R('y<-as.factor(y)')
    R('require(randomForest, quietly=TRUE)')
    R('library(randomForest)')
    R('model <- randomForest(data,y=y,importance=TRUE)')
    R('predict_result <- predict(model, test)')

    predict_result = R['predict_result']
    levels = predict_result.levels
    # The predicted values are 1-based positions into `levels`.
    predict_labels = [levels[i - 1] for i in predict_result[:]]

    name = list(M_test._col_names.keys())[0]
    sample_name = M_test._col_names[name]
    rows = [['Sample_name', 'Predicted_class', 'Confidence']]
    for i, sample in enumerate(sample_name):
        rows.append([str(sample), predict_labels[i], ''])

    # The context manager closes the file even if a write raises.
    with open(outfile, 'w') as handle:
        for row in rows:
            handle.write('\t'.join(row))
            handle.write('\n')
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Normalize the data matrix with DWD using a two-class label file.

    antecedents is a (data_node, cls_node) pair.  When both are truthy
    the label file must describe exactly two classes and every label
    must be '0' or '1'.  Labels of '0' are rewritten as '-1' before being
    passed to dwdnorm.normalize, and the normalized matrix is written to
    `outfile` in tab-delimited format.

    Raises AssertionError when the labels are not valid or when `outfile`
    is missing or empty after writing.
    """
    from genomicode import dwdnorm
    import arrayio
    from Betsy import read_label_file
    from genomicode import filelib

    data_node, cls_node = antecedents
    # BUG: What happens if no antecedents?
    if data_node and cls_node:
        M = arrayio.read(data_node.identifier)
        result, label_line, _second_line = read_label_file.read(
            cls_node.identifier)
        assert len(result) == 2, 'for dwd,there should be only 2 classes'
        assert all(i in ('0', '1') for i in label_line), (
            'the label of class shoul be 0 and 1')

        y = [i.replace('0', '-1') for i in label_line]
        M_y = dwdnorm.normalize(M, y)

        # The context manager closes the file even if the writer raises.
        with open(outfile, 'w') as handle:
            arrayio.tab_delimited_format.write(M_y, handle)
        assert filelib.exists_nz(outfile), (
            'the output file %s for dwd fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Append the actual class and a yes/no verdict to a prediction table.

    antecedents is a (cls_node_test, data_node) pair.  The data file is a
    tab-delimited table with one header line; its second column is
    compared with the actual class of the sample on the same row.
    `outfile` receives every input row extended with the columns
    Actual_class and Correct? ('yes' or 'no').
    """
    from Betsy import read_label_file

    cls_node_test, data_node = antecedents
    # Close the prediction file as soon as its lines have been read.
    with open(data_node.identifier) as handle:
        text = handle.readlines()
    _classes, test_label, second_line = read_label_file.read(
        cls_node_test.identifier)
    actual_label = [second_line[int(i)] for i in test_label]

    header = text[0].replace('\n', '').split('\t')
    header.extend(['Actual_class', 'Correct?'])

    # The context manager closes the file even if a row fails.
    with open(outfile, 'w') as out:
        out.write('\t'.join(header) + '\n')
        for index, raw_line in enumerate(text[1:]):
            line = raw_line.replace('\n', '').split('\t')
            actual = actual_label[index]
            correct = 'yes' if line[1] == actual else 'no'
            line.extend([actual, correct])
            out.write('\t'.join(line) + '\n')
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Leave-one-out cross-validation with the random forest classifier.

    antecedents is a (cls_node, data_node) pair.  For every column of the
    data matrix, that sample is held out, the classifier module is run on
    the remaining samples, the prediction is scored by the evaluation
    module, and the second line of the evaluation output is appended to
    `outfile`.  The temporary files of each iteration are created in the
    current directory and removed at the end of the iteration.
    """
    import os
    import arrayio
    from genomicode import filelib
    from Betsy import bie3
    from Betsy import rulebase
    from Betsy import read_label_file

    cls_node, data_node = antecedents
    M = arrayio.read(data_node.identifier)
    x = read_label_file.read(cls_node.identifier)
    _classes, training_label, second_line = x

    # NOTE(review): -2 is not a documented `level` value for __import__;
    # it is passed through unchanged -- confirm it is intended.
    predict_model = __import__(
        'Betsy.modules.' + 'classify_with_random_forest', globals(),
        locals(), ['classify_with_random_forest'], -2)
    evaluate_model = __import__(
        'Betsy.modules.' + 'evaluate_prediction', globals(), locals(),
        ['evaluate_prediction'], -2)

    full_index = list(range(M.ncol()))
    header = [
        'sample_name', 'Predicted_class', 'Confidence', 'Actual_class',
        'Correct?'
    ]
    # Save the output of the prediction and evaluation.
    predict_file = "predict.txt"
    evaluate_file = "evaluate.txt"

    # The context manager closes the results file even if a fold fails.
    with open(outfile, 'w') as f:
        f.write('\t'.join(header))
        f.write('\n')
        for i in full_index:
            # Make filenames
            # gene expression for N samples.
            merge_file = 'merge' + '_' + str(i)
            # class label file for the training samples (samples 1-(N-1)).
            train_label = 'train_label' + '_' + str(i)
            # class label file for the test sample (sample N).
            test_label = 'test_label' + '_' + str(i)

            test_index = i
            train_index = [k for k in full_index if k != test_index]
            # The held-out sample goes last, after the training samples.
            merge_index = train_index + [test_index]
            y_training = [training_label[k] for k in train_index]
            y_test = [training_label[test_index]]

            # Write the files for this iteration.  The merge file is
            # closed explicitly so it is complete before it is read back.
            M_merge = M.matrix(None, merge_index)
            with open(merge_file, 'w') as merge_handle:
                arrayio.gct_format.write(M_merge, merge_handle)
            read_label_file.write(train_label, second_line, y_training)
            read_label_file.write(test_label, second_line, y_test[0])

            # Make objects to be used in this analysis.
            x = rulebase.SignalFile.output(
                format='gct', contents='class0,class1,test')
            merge_data = bie3.IdentifiedDataNode(x, identifier=merge_file)
            x = rulebase.ClassLabelFile.output(contents='class0,class1')
            train_label_data = bie3.IdentifiedDataNode(
                x, identifier=train_label)
            x = rulebase.ClassLabelFile.output(contents='test')
            test_label_data = bie3.IdentifiedDataNode(
                x, identifier=test_label)

            # Make a fake object to pass to evaluate_model.run.
            out_node = filelib.GenericObject()
            out_node.identifier = predict_file

            # Run the predictions.
            x = train_label_data, merge_data
            predict_model.Module().run(
                network, x, out_attributes, user_options, num_cores,
                predict_file)

            # Run the evaluation.
            new_parameters = out_attributes.copy()
            x = test_label_data, out_node
            evaluate_model.Module().run(
                network, x, new_parameters, user_options, num_cores,
                evaluate_file)

            # Is this the right line?
            with open(evaluate_file) as evaluate_handle:
                lines = evaluate_handle.readlines()
            f.write(lines[1])

            os.remove(merge_file)
            os.remove(train_label)
            os.remove(test_label)
            os.remove(predict_file)
            os.remove(evaluate_file)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Classify the test samples with the GenePattern WeightedVoting module.

    antecedents is a (data_node_train, data_node_test, cls_node_train)
    triple.  A placeholder class file 'temp_test.cls' (every label '0')
    is written for the test samples, the GenePattern client configured as
    config.genepattern is run with results downloaded to ./wv_result, and
    the '*pred.odf' result is renamed to 'prediction.odf' and converted
    into a tab-delimited table in `outfile` with the columns Sample_name,
    Predicted_class and Confidence.

    Recognized user options: 'wv_num_features' and 'wv_minstd'.
    out_attributes['wv_feature_stat'] selects the feature statistic.

    Raises AssertionError on invalid options, a missing client, a missing
    or failed GenePattern result, or an empty `outfile`.  Raises
    ValueError carrying the client's stderr when anything was written to
    it.
    """
    import os
    import subprocess
    import arrayio
    from genomicode import filelib
    from Betsy import read_label_file
    from Betsy import module_utils
    from genomicode import config

    data_node_train, data_node_test, cls_node_train = antecedents
    module_name = 'WeightedVoting'
    gp_parameters = dict()
    file1, file2 = module_utils.convert_to_same_platform(
        data_node_train.identifier, data_node_test.identifier)
    _classes, _train_labels, class_name = read_label_file.read(
        cls_node_train.identifier)

    # The test samples get a placeholder class file with every label '0'.
    M = arrayio.read(data_node_test.identifier)
    label_line = ['0'] * M.dim()[1]
    read_label_file.write('temp_test.cls', class_name, label_line)

    gp_parameters['train.filename'] = file1
    gp_parameters['train.class.filename'] = cls_node_train.identifier
    gp_parameters['test.filename'] = file2
    gp_parameters['test.class.filename'] = 'temp_test.cls'
    if 'wv_num_features' in user_options:
        gp_parameters['num.features'] = str(user_options['wv_num_features'])
    if 'wv_minstd' in user_options:
        assert module_utils.is_number(
            user_options['wv_minstd']), 'the sv_minstd should be number'
        gp_parameters['min.std'] = str(user_options['wv_minstd'])

    # The position in this list is the value passed to GenePattern.
    wv_feature_stat = [
        'wv_snr', 'wv_ttest', 'wv_snr_median', 'wv_ttest_median',
        'wv_snr_minstd', 'wv_ttest_minstd', 'wv_snr_median_minstd',
        'wv_ttest_median_minstd'
    ]
    assert out_attributes['wv_feature_stat'] in wv_feature_stat, (
        'the wv_feature_stat is invalid')
    gp_parameters['feature.selection.statistic'] = str(
        wv_feature_stat.index(out_attributes['wv_feature_stat']))

    gp_path = config.genepattern
    gp_module = module_utils.which(gp_path)
    assert gp_module, 'cannot find the %s' % gp_path

    download_directory = os.path.join(".", 'wv_result')
    command = [gp_module, module_name, '-o', download_directory]
    for key in gp_parameters:
        command.extend(['--parameters', key + ':' + gp_parameters[key]])

    process = subprocess.Popen(
        command, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the child.  Calling
    # wait() first can deadlock once the child fills a pipe buffer.
    error_message = process.communicate()[1]
    if error_message:
        raise ValueError(error_message)

    assert os.path.exists(download_directory), (
        'there is no output directory for weightedVoting')
    result_files = os.listdir(download_directory)
    assert 'stderr.txt' not in result_files, 'gene_pattern get error'

    text = None
    for gp_file in result_files:
        if not gp_file.endswith('pred.odf'):
            continue
        # Join with the directory exactly once; the same path is used for
        # both reading and renaming.
        prediction_path = os.path.join(download_directory, gp_file)
        with open(prediction_path, 'r') as handle:
            text = handle.readlines()
        os.rename(prediction_path,
                  os.path.join(download_directory, 'prediction.odf'))
    assert text is not None, (
        'there is no prediction file in %s' % download_directory)

    assert text[1][0:12] == 'HeaderLines='
    start = int(text[1][12:-1])
    newresult = [['Sample_name', 'Predicted_class', 'Confidence']]
    for row in text[start + 2:]:
        line = row.split()
        n = len(line)
        # Everything before the last four fields is the sample name.
        newresult.append(
            [' '.join(line[0:n - 4]), line[n - 3], line[n - 2]])

    # The context manager closes the file even if a write raises.
    with open(outfile, 'w') as handle:
        for row in newresult:
            handle.write('\t'.join(row))
            handle.write('\n')
    assert filelib.exists_nz(outfile), (
        'the output file %s for classify_with_weighted_voting fails' %
        outfile)
def main(filename, label_file, outfile, delta, foldchange):
    """Run SAM on a data file and write a per-gene report.

    `outfile` is a directory (created when missing) that receives the
    plot 'sam_plot.png' and the tab-delimited report 'sam_reult.txt'.
    The report always has a header line; when sam() returns gene ids,
    one row per gene follows with the mean and standard deviation of each
    class, the five result columns returned by sam(), and the label of
    the class with the higher (or equal) mean.
    """
    label, label_line, second_line = read_label_file.read(label_file)
    M = arrayio.read(filename)
    data = M.slice()
    # The labels passed to sam() are shifted up by one.
    label_list = [int(i) + 1 for i in label_line]
    first_key = list(M._row_names.keys())[0]
    genenames = [str(i) for i in M._row_names[first_key]]

    if not os.path.exists(outfile):
        os.mkdir(outfile)
    pngfig = os.path.join(outfile, 'sam_plot.png')
    # The file name spelling is kept as is; other code may rely on it.
    out_result = os.path.join(outfile, 'sam_reult.txt')

    gene_ids, result = sam(
        data, label_list, genenames, delta, foldchange, pngfig)

    label1 = second_line[int(label[0][1])]
    label2 = second_line[int(label[1][1])]

    rows = []
    if gene_ids:
        # Locate the selected genes in the annotation that contains them;
        # when several annotations match, the last one is used.
        index_list = []
        for key in M._row_order:
            full_gene_list = M._row_names[key]
            if gene_ids[0] not in full_gene_list:
                continue
            index_list = [full_gene_list.index(i) for i in gene_ids]
        M_select = M.slice(index_list, None)

        for i in range(len(M_select)):
            first = [M_select[i][j] for j in label[0][0]]
            second = [M_select[i][j] for j in label[1][0]]
            group1 = sum(first) / float(len(first))
            group2 = sum(second) / float(len(second))
            sd1 = jmath.stddev_list(first)
            sd2 = jmath.stddev_list(second)
            higher = label1 if group1 >= group2 else label2
            fields = [gene_ids[i], group1, group2, sd1, sd2]
            fields.extend(result[k][i] for k in range(5))
            fields.append(higher)
            rows.append('\t'.join(str(value) for value in fields))

    header = [
        'Gene Name', 'Ave_' + label1, 'Ave_' + label2, 'SD_' + label1,
        'SD_' + label2, 'Score(d)', 'Numerator(r)', 'Denominator(s+s0)',
        'Fold Change', 'q_value', 'higher_expression'
    ]
    # The context manager closes the file even if a write raises.
    with open(out_result, 'w') as handle:
        handle.write('\t'.join(header))
        handle.write('\n')
        for row in rows:
            handle.write(row + '\n')
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Leave-one-out cross-validation with the SVM classifier.

    antecedents is a (data_node, cls_node) pair.  For every column of the
    data matrix, that sample is held out, an SVM model is trained on the
    remaining samples, the held-out sample is predicted and scored, and
    the second line of the evaluation output is appended to `outfile`.
    The temporary files of each iteration are removed at the end of the
    iteration.

    Raises AssertionError when `outfile` is missing or empty at the end,
    and KeyError when out_attributes lacks 'loocv', 'actual_label' or
    'wv_feature_stat'.
    """
    import os
    import arrayio
    from genomicode import filelib
    from Betsy import bie3
    from Betsy import rulebase
    from Betsy import read_label_file

    data_node, cls_node = antecedents
    M = arrayio.read(data_node.identifier)
    _classes, training_label, second_line = read_label_file.read(
        cls_node.identifier)
    full_index = list(range(M.ncol()))

    predict_model = __import__(
        'Betsy.modules.' + 'classify_with_svm', globals(), locals(),
        ['classify_with_svm'], -2)
    train_model = __import__(
        'Betsy.modules.' + 'train_svm_model', globals(), locals(),
        ['train_svm_model'], -2)
    evaluate_model = __import__(
        'Betsy.modules.' + 'evaluate_prediction', globals(), locals(),
        ['evaluate_prediction'], -2)

    header = [
        'sample_name', 'Predicted_class', 'Confidence', 'Actual_class',
        'Correct?'
    ]
    # The context manager closes the results file even if a fold fails.
    with open(outfile, 'w') as f:
        f.write('\t'.join(header))
        f.write('\n')
        for i in full_index:
            test_index = i
            train_index = [k for k in full_index if k != test_index]
            # The held-out sample goes last, after the training samples.
            merge_index = train_index + [test_index]
            y_training = [training_label[k] for k in train_index]
            y_test = [training_label[test_index]]

            M_merge = M.matrix(None, merge_index)
            merge_file = 'merge' + '_' + str(i)
            with open(merge_file, 'w') as f_out:
                arrayio.gct_format.write(M_merge, f_out)

            train_label = 'train_label' + '_' + str(i)
            test_label = 'test_label' + '_' + str(i)
            read_label_file.write(train_label, second_line, y_training)
            read_label_file.write(test_label, second_line, y_test[0])

            merge_node = rulebase.SignalFile.output(
                format='gct', contents='class0,class1,test')
            merge_data = bie3.IdentifiedDataNode(
                merge_node, identifier=merge_file)
            train_label_node = rulebase.ClassLabelFile.output(
                contents='class0,class1')
            train_label_data = bie3.IdentifiedDataNode(
                train_label_node, identifier=train_label)
            test_label_node = rulebase.ClassLabelFile.output(contents='test')
            test_label_data = bie3.IdentifiedDataNode(
                test_label_node, identifier=test_label)

            new_parameters = out_attributes.copy()
            del new_parameters['loocv']
            del new_parameters['actual_label']
            del new_parameters['wv_feature_stat']

            # NOTE(review): these calls expect module-level functions
            # run(antecedents, parameters, user_options, network) that
            # return a node with an `identifier`.  The run() methods in
            # this file take (network, antecedents, out_attributes,
            # user_options, num_cores, outfile) instead -- confirm that
            # the target modules still expose the interface used here.
            x1 = merge_data, train_label_data
            svm_model = train_model.run(
                x1, new_parameters, user_options, network)
            x = svm_model, merge_data, train_label_data
            out_node = predict_model.run(
                x, out_attributes, user_options, network)
            out_node_label = evaluate_model.run(
                (out_node, test_label_data), out_attributes, user_options,
                network)

            with open(out_node_label.identifier, 'r') as f1:
                lines = f1.readlines()
            f.write(lines[1])

            os.remove(merge_file)
            os.remove(train_label)
            os.remove(test_label)
            os.remove(out_node.identifier)
            os.remove(out_node_label.identifier)
            os.remove(svm_model.identifier)

    assert filelib.exists_nz(outfile), (
        'the output file %s for loocv fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Select genes whose two-class t-test score is below a threshold.

    antecedents is a (data_node, cls_node) pair; the label file must
    describe exactly two classes.  A t-test is run between the columns of
    the two classes, and a p-value of None is treated as 1.
    out_attributes['gene_order'] chooses the score: 'ttest_p' uses the
    p-values, 'ttest_fdr' uses jmath.cmh_fdr_bh of the p-values.  Genes
    whose score is below the user option 'gene_select_threshold' are
    written to `outfile` as one tab-separated line, in ascending score
    order, using the names of the first row annotation.

    Raises AssertionError for a malformed label file, an unknown
    gene_order, or when no gene passes the threshold.
    """
    import arrayio
    from genomicode import jmath
    from Betsy import gene_ranking
    from Betsy import read_label_file
    from Betsy import module_utils as mlib

    data_node, cls_node = antecedents
    M = arrayio.read(data_node.identifier)
    x = read_label_file.read(cls_node.identifier)
    label, _label_line, _second_line = x
    assert len(label) == 2, \
        'the length of label in %s should be 2' % cls_node.identifier
    assert len(label[0]) == 2
    assert len(label[1]) == 2

    threshold = mlib.get_user_option(
        user_options, "gene_select_threshold", not_empty=True, type=float)

    first = M.slice(None, label[0][0])
    second = M.slice(None, label[1][0])
    _t, p = gene_ranking.t_test(first, second)
    # A missing p-value is replaced by 1.
    for i, value in enumerate(p):
        if value is None:
            p[i] = 1

    key = M._row_order[0]
    gene_order = out_attributes["gene_order"]
    if gene_order == 'ttest_p':
        scores = p
    elif gene_order == 'ttest_fdr':
        scores = jmath.cmh_fdr_bh(p)
    else:
        # Call syntax is valid on both Python 2 and Python 3.
        raise AssertionError("Unknown gene_order: %s" % gene_order)

    # Pair each score with its row so the rows can be listed in ascending
    # score order.
    ranked = sorted((scores[index], index) for index in range(len(scores)))
    gene_names = M._row_names[key]
    gene_list = [
        gene_names[index] for score, index in ranked if score < threshold
    ]
    assert gene_list, 'there is no significant genes can be found in ttest'

    # The context manager closes the file even if the write raises.
    with open(outfile, 'w') as handle:
        handle.write('\t'.join(gene_list))