def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Keep the rows whose two class means differ by a minimum amount.

        ``antecedents`` is ``(data_node, cls_node)``.  The class label file
        must describe exactly two classes.  For every row of the data matrix
        the absolute difference between the two class means is compared to
        ``log2(fc)``, where ``fc`` is the integer user option
        ``group_fc_num`` (default 1, i.e. a threshold of 0).  Rows that pass
        are written to ``outfile`` in tab-delimited format.

        NOTE(review): comparing a difference of means to log2(fc) presumes
        the matrix is already log2-transformed -- confirm with callers.

        Raises AssertionError if there are not two classes or no row passes.
        """
        import math
        from Betsy import read_label_file
        from genomicode import jmath
        import arrayio

        data_node, cls_node = antecedents
        # obtain the class label
        label, label_line, second_line = read_label_file.read(
            cls_node.identifier)
        assert len(label) == 2, 'the number of class is not 2'

        fc = 1
        if 'group_fc_num' in user_options:
            fc = int(user_options['group_fc_num'])
        # The threshold does not depend on the row, so compute it once.
        min_difference = math.log(fc, 2)

        M = arrayio.read(data_node.identifier)
        first = M.slice(None, label[0][0])
        second = M.slice(None, label[1][0])
        I_good = [
            i for i in range(M.nrow())
            if abs(jmath.mean(first[i]) - jmath.mean(second[i]))
            >= min_difference
        ]

        assert I_good, 'there is no gene is significant in fold change with 2'
        M_c = M.matrix(I_good, None)
        # The context manager closes the handle even if the write fails.
        with open(outfile, 'w') as f:
            arrayio.tab_delimited_format.write(M_c, f)
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Run the external combat normalization script on a data file.

     ``antecedents`` is ``(data_node, cls_node)``.  When both are truthy,
     the script located through ``config.combatnorm`` is executed with the
     data file, the class label file and ``outfile`` as arguments.  Anything
     the script prints on stderr is raised as a ValueError.  The method
     always returns False.
     """
     import subprocess
     from Betsy import read_label_file
     from Betsy import module_utils
     from genomicode import filelib
     from genomicode import config

     data_node, cls_node = antecedents
     if not (data_node and cls_node):
         return False

     classes, _label_line, _second_line = read_label_file.read(
         cls_node.identifier)
     assert len(classes) >= 2, (
         'for combat,there should be equal or larger than 2 classes')

     combat_path = config.combatnorm
     combat_BIN = module_utils.which(combat_path)
     assert combat_BIN, 'cannot find the %s' % combat_path

     command = [
         'python', combat_BIN,
         '-f', data_node.identifier,
         '-o', outfile,
         '-label', cls_node.identifier,
     ]
     process = subprocess.Popen(
         command,
         shell=False,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE)
     _stdout_text, stderr_text = process.communicate()
     if stderr_text:
         raise ValueError(stderr_text)

     assert filelib.exists_nz(outfile), (
         'the output file %s for combat fails' % outfile
     )
     return False
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Shift-scale normalize the first class against the second class.

        ``antecedents`` is ``(data_node, cls_node)``.  When both are truthy,
        the columns of the first class are normalized against the columns of
        the second class with ``shiftscalenorm.normalize`` and written back
        into the full matrix, which is then saved to ``outfile`` in
        tab-delimited format.  The class label file must contain exactly two
        classes.  The method always returns False.
        """
        from genomicode import shiftscalenorm
        import arrayio
        from Betsy import read_label_file
        from genomicode import filelib

        data_node, cls_node = antecedents
        if data_node and cls_node:
            result, label_line, second_line = read_label_file.read(
                cls_node.identifier)
            assert len(
                result) == 2, 'for shiftscale,there should be only 2 classes'
            M = arrayio.read(data_node.identifier)
            index1 = result[0][0]
            index2 = result[1][0]
            M_1 = M.matrix(None, index1)
            M_2 = M.matrix(None, index2)
            M_y = shiftscalenorm.normalize(M_1, M_2)

            # The dimensions are fixed, so query them once instead of on
            # every loop iteration.
            dims = M_y.dim()
            num_rows = dims[0]
            num_cols = dims[1]
            for i in range(num_rows):
                for j in range(num_cols):
                    # A value that prints as 'nan' is replaced with the
                    # first second-class value of the same row.
                    if str(M_y._X[i][j]) == 'nan':
                        M_y._X[i][j] = M_2._X[i][0]

            # Copy the normalized values back into the first class's columns.
            for j in range(M.nrow()):
                for i, column in enumerate(index1):
                    M._X[j][column] = M_y._X[j][i]

            # The context manager closes the handle even if the write fails.
            with open(outfile, 'w') as f:
                arrayio.tab_delimited_format.write(M, f)
            assert filelib.exists_nz(outfile), (
                'the output file %s for shiftscale fails' % outfile)

        return False
Example #4
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Predict classes of the test samples with a saved libsvm model.

        ``antecedents`` is ``(svm_model, data_node_test, cls_node_train)``.
        The columns of the data matrix after the first ``len(train_label)``
        columns are treated as test samples.  A tab-delimited table with the
        columns ``Sample_name``, ``Predicted_class`` and ``Confidence`` is
        written to ``outfile``.

        Raises AssertionError if ``outfile`` is missing or empty afterwards.
        """
        import svmutil
        import arrayio
        from genomicode import filelib
        from Betsy import read_label_file
        from Betsy import module_utils

        svm_model, data_node_test, cls_node_train = antecedents
        # The label file only needs to be read once; it is not modified
        # between here and the prediction step.
        a, train_label, second_line = read_label_file.read(
            cls_node_train.identifier)
        M = arrayio.read(data_node_test.identifier)
        # convert to the format libsvm accept
        test = M.matrix(None, range(len(train_label), M.dim()[1]))
        x_test = module_utils.format_convert(test)
        model = svmutil.svm_load_model(svm_model.identifier)

        # The true labels are not known here, so dummy zeros are passed.
        y_test = [0] * len(x_test)
        p_label, p_acc, p_val = svmutil.svm_predict(y_test, x_test, model)
        # Predicted labels index into the class names from the label file.
        prediction = [second_line[int(i)] for i in p_label]

        name = test._col_names.keys()[0]
        sample_name = test._col_names[name]
        rows = [['Sample_name', 'Predicted_class', 'Confidence']]
        for i, sample in enumerate(sample_name):
            rows.append([str(sample), prediction[i], str(p_val[i][0])])

        # The context manager closes the handle even if a write fails.
        with open(outfile, 'w') as f:
            for row in rows:
                f.write('\t'.join(row))
                f.write('\n')

        assert filelib.exists_nz(outfile), (
            'the output file %s for classify_with_svm fails' % outfile)
Example #5
0
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Classify test samples with an R randomForest model.

     ``antecedents`` is ``(cls_node_train, data_node)``.  The first
     ``len(label_line)`` columns of the data matrix are the training
     samples; the remaining columns are the test samples.  A tab-delimited
     table with the columns ``Sample_name``, ``Predicted_class`` and
     ``Confidence`` is written to ``outfile``.  The confidence column is
     always empty.
     """
     import arrayio
     from Betsy import read_label_file
     from genomicode import jmath

     cls_node_train, data_node = antecedents
     classes, label_line, second_line = read_label_file.read(
         cls_node_train.identifier)
     # Translate each training label into its class name.
     y = [second_line[int(i)] for i in label_line]

     R = jmath.start_R()
     M = arrayio.read(data_node.identifier)
     num_train = len(label_line)
     M_train = M.matrix(None, range(0, num_train))
     M_test = M.matrix(None, range(num_train, M.dim()[1]))

     # The matrices are transposed before being handed to R.
     jmath.R_equals_matrix(jmath.transpose(M_train.slice()), 'data')
     jmath.R_equals_matrix(jmath.transpose(M_test.slice()), 'test')
     jmath.R_equals(y, 'y')
     R('y<-as.factor(y)')
     R('require(randomForest, quietly=TRUE)')
     R('library(randomForest)')
     R('model <- randomForest(data,y=y,importance=TRUE)')
     R('predict_result <- predict(model, test)')

     predict_result = R['predict_result']
     levels = predict_result.levels
     # The predicted codes are offset by one relative to ``levels``
     # (presumably because R factor codes are 1-based).
     predict_labels = [levels[i - 1] for i in predict_result[:]]

     name = M_test._col_names.keys()[0]
     sample_name = M_test._col_names[name]
     rows = [['Sample_name', 'Predicted_class', 'Confidence']]
     for i, sample in enumerate(sample_name):
         rows.append([str(sample), predict_labels[i], ''])

     # The context manager closes the handle even if a write fails.
     with open(outfile, 'w') as f:
         for row in rows:
             f.write('\t'.join(row))
             f.write('\n')
 def run(self, network, antecedents, out_attributes, user_options,
         num_cores, outfile):
     """Normalize a two-class data matrix with DWD.

     ``antecedents`` is ``(data_node, cls_node)``.  When both are truthy,
     the class labels (which must all be '0' or '1') have every '0'
     replaced with '-1', ``dwdnorm.normalize`` is applied, and the result
     is written to ``outfile`` in tab-delimited format.

     Raises AssertionError if the label file does not contain exactly two
     classes, if a label is not '0' or '1', or if ``outfile`` is missing or
     empty afterwards.
     """
     from genomicode import dwdnorm
     import arrayio
     from Betsy import read_label_file
     from genomicode import filelib

     data_node, cls_node = antecedents
     # BUG: What happens if no antecedents?
     if data_node and cls_node:
         M = arrayio.read(data_node.identifier)
         result, label_line, second_line = read_label_file.read(
             cls_node.identifier)
         assert len(result) == 2, 'for dwd,there should be only 2 classes'
         assert all(i in ('0', '1') for i in label_line), (
             'the label of class shoul be 0 and 1')
         y = [i.replace('0', '-1') for i in label_line]
         M_y = dwdnorm.normalize(M, y)
         # The context manager closes the handle even if the write fails.
         with open(outfile, 'w') as f:
             arrayio.tab_delimited_format.write(M_y, f)
         assert filelib.exists_nz(outfile), (
             'the output file %s for dwd fails' % outfile)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Append the actual class and a correctness flag to a prediction.

        ``antecedents`` is ``(cls_node_test, data_node)``.  ``data_node``
        points to a tab-delimited prediction table with a header line.  For
        each following line, the second column is compared with the actual
        class name of the sample at the same position in the label file.
        Two columns, ``Actual_class`` and ``Correct?`` ('yes' or 'no'), are
        appended and the table is written to ``outfile``.
        """
        from Betsy import read_label_file

        cls_node_test, data_node = antecedents
        # Read the whole prediction table and release the handle at once.
        with open(data_node.identifier) as handle:
            text = handle.readlines()
        a, test_label, second_line = read_label_file.read(
            cls_node_test.identifier)
        actual_label = [second_line[int(i)] for i in test_label]

        header = text[0].replace('\n', '').split('\t')
        header.extend(['Actual_class', 'Correct?'])
        # The context manager closes the handle even if a write fails.
        with open(outfile, 'w') as f:
            f.write('\t'.join(header) + '\n')
            for index, raw_line in enumerate(text[1:]):
                line = raw_line.replace('\n', '').split('\t')
                actual = actual_label[index]
                correct = 'yes' if line[1] == actual else 'no'
                line.extend([actual, correct])
                f.write('\t'.join(line) + '\n')
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Leave-one-out cross-validation using the random forest module.

        ``antecedents`` is ``(cls_node, data_node)``.  For every column
        (sample) of the data matrix, the sample is held out, a merged matrix
        is written with the held-out sample as the last column, the
        ``classify_with_random_forest`` module predicts it and the
        ``evaluate_prediction`` module scores it.  The second line of each
        evaluation file is appended to ``outfile`` under a fixed header.
        The temporary files of each iteration are created in the current
        working directory and removed at the end of that iteration.
        """
        import os
        import arrayio
        from genomicode import filelib
        from Betsy import bie3
        from Betsy import rulebase
        from Betsy import read_label_file

        cls_node, data_node = antecedents
        M = arrayio.read(data_node.identifier)
        a, training_label, second_line = read_label_file.read(
            cls_node.identifier)

        predict_model = __import__(
            'Betsy.modules.' + 'classify_with_random_forest', globals(),
            locals(), ['classify_with_random_forest'], -2)
        evaluate_model = __import__('Betsy.modules.' + 'evaluate_prediction',
                                    globals(), locals(),
                                    ['evaluate_prediction'], -2)

        # A real list so that it can be sliced and concatenated below.
        full_index = list(range(M.ncol()))

        # Save the output of the prediction and evaluation.
        predict_file = "predict.txt"
        evaluate_file = "evaluate.txt"

        # The context manager closes the output even if an iteration fails.
        with open(outfile, 'w') as f:
            f.write('\t'.join([
                'sample_name', 'Predicted_class', 'Confidence',
                'Actual_class', 'Correct?'
            ]))
            f.write('\n')
            for test_index in full_index:
                # Make filenames
                # gene expression for N samples.
                merge_file = 'merge' + '_' + str(test_index)
                # class label file for the training samples
                # (samples 1-(N-1)).
                train_label = 'train_label' + '_' + str(test_index)
                # class label file for the test sample (sample N).
                test_label = 'test_label' + '_' + str(test_index)

                # Every sample except the held-out one, in original order.
                train_index = (
                    full_index[:test_index] + full_index[test_index + 1:])
                merge_index = train_index + [test_index]
                y_training = [training_label[j] for j in train_index]
                y_test = [training_label[test_index]]

                # Write the files for this iteration.  The merge file is
                # closed explicitly so that it is flushed before the
                # prediction module reads it.
                M_merge = M.matrix(None, merge_index)
                with open(merge_file, 'w') as merge_handle:
                    arrayio.gct_format.write(M_merge, merge_handle)
                read_label_file.write(train_label, second_line, y_training)
                read_label_file.write(test_label, second_line, y_test[0])

                # Make objects to be used in this analysis.
                x = rulebase.SignalFile.output(format='gct',
                                               contents='class0,class1,test')
                merge_data = bie3.IdentifiedDataNode(
                    x, identifier=merge_file)
                x = rulebase.ClassLabelFile.output(contents='class0,class1')
                train_label_data = bie3.IdentifiedDataNode(
                    x, identifier=train_label)
                x = rulebase.ClassLabelFile.output(contents='test')
                test_label_data = bie3.IdentifiedDataNode(
                    x, identifier=test_label)

                # Make a fake object to pass to evaluate_model.run.
                out_node = filelib.GenericObject()
                out_node.identifier = predict_file

                # Run the predictions.
                x = train_label_data, merge_data
                predict_model.Module().run(network, x, out_attributes,
                                           user_options, num_cores,
                                           predict_file)

                # Run the evaluation.
                new_parameters = out_attributes.copy()
                x = test_label_data, out_node
                evaluate_model.Module().run(network, x, new_parameters,
                                            user_options, num_cores,
                                            evaluate_file)

                # Is this the right line?
                with open(evaluate_file) as evaluate_handle:
                    lines = evaluate_handle.readlines()

                f.write(lines[1])
                os.remove(merge_file)
                os.remove(train_label)
                os.remove(test_label)
                os.remove(predict_file)
                os.remove(evaluate_file)
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        """Classify test samples with the GenePattern WeightedVoting module.

        ``antecedents`` is
        ``(data_node_train, data_node_test, cls_node_train)``.  The two data
        files are converted to the same platform, a placeholder class file
        ('temp_test.cls', every test sample labelled '0') is written, and
        the GenePattern command found through ``config.genepattern`` is run
        with its output in './wv_result'.  Every result file ending in
        'pred.odf' is renamed to 'prediction.odf' and converted into a
        tab-delimited table (``Sample_name``, ``Predicted_class``,
        ``Confidence``) written to ``outfile``.

        Raises ValueError with the command's stderr text if it is non-empty.
        Raises AssertionError on invalid options or missing output.
        """
        import os
        import subprocess
        import arrayio
        from genomicode import filelib
        from Betsy import read_label_file
        from Betsy import module_utils
        from genomicode import config

        data_node_train, data_node_test, cls_node_train = antecedents
        module_name = 'WeightedVoting'
        gp_parameters = dict()
        file1, file2 = module_utils.convert_to_same_platform(
            data_node_train.identifier, data_node_test.identifier)
        result, label_line, class_name = read_label_file.read(
            cls_node_train.identifier)
        M = arrayio.read(data_node_test.identifier)
        # The test classes are unknown; label every test sample '0'.
        label_line = ['0'] * M.dim()[1]
        read_label_file.write('temp_test.cls', class_name, label_line)
        gp_parameters['train.filename'] = file1
        gp_parameters['train.class.filename'] = cls_node_train.identifier
        gp_parameters['test.filename'] = file2
        gp_parameters['test.class.filename'] = 'temp_test.cls'
        if 'wv_num_features' in user_options:
            gp_parameters['num.features'] = str(
                user_options['wv_num_features'])

        if 'wv_minstd' in user_options:
            assert module_utils.is_number(
                user_options['wv_minstd']), 'the wv_minstd should be number'
            gp_parameters['min.std'] = str(user_options['wv_minstd'])

        wv_feature_stat = ['wv_snr', 'wv_ttest', 'wv_snr_median',
                           'wv_ttest_median', 'wv_snr_minstd',
                           'wv_ttest_minstd', 'wv_snr_median_minstd',
                           'wv_ttest_median_minstd']

        assert out_attributes['wv_feature_stat'] in wv_feature_stat, (
            'the wv_feature_stat is invalid'
        )
        # The statistic is passed as its position in the list above.
        gp_parameters['feature.selection.statistic'] = str(
            wv_feature_stat.index(out_attributes['wv_feature_stat']))
        gp_path = config.genepattern
        gp_module = module_utils.which(gp_path)
        assert gp_module, 'cannot find the %s' % gp_path
        download_directory = os.path.join(".", 'wv_result')
        command = [gp_module, module_name, '-o', download_directory]
        for key, value in gp_parameters.items():
            command.extend(['--parameters', key + ':' + value])

        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # communicate() drains both pipes and waits for the child to exit.
        # Calling wait() first can deadlock once the child fills a pipe.
        error_message = process.communicate()[1]
        if error_message:
            raise ValueError(error_message)

        assert os.path.exists(download_directory), (
            'there is no output directory for weightedVoting'
        )
        result_files = os.listdir(download_directory)
        assert 'stderr.txt' not in result_files, 'gene_pattern get error'
        for gp_file in result_files:
            if not gp_file.endswith('pred.odf'):
                continue
            pred_path = os.path.join(download_directory, gp_file)
            with open(pred_path, 'r') as handle:
                text = handle.readlines()
            # Rename using the path joined once; joining the directory onto
            # it a second time would point at a file that does not exist.
            os.rename(pred_path,
                      os.path.join(download_directory, 'prediction.odf'))
            assert text[1][0:12] == 'HeaderLines='
            start = int(text[1][12:-1])
            newresult = [['Sample_name', 'Predicted_class', 'Confidence']]
            for i in text[start + 2:]:
                line = i.split()
                n = len(line)
                # The sample name may contain spaces, so fields are counted
                # from the end of the line.
                newline = [' '.join(line[0:n - 4]), line[n - 3], line[n - 2]]
                newresult.append(newline)
            with open(outfile, 'w') as f:
                for row in newresult:
                    f.write('\t'.join(row))
                    f.write('\n')

        assert filelib.exists_nz(outfile), (
            'the output file %s for classify_with_weighted_voting fails'
            % outfile
        )
Example #10
0
def main(filename, label_file, outfile, delta, foldchange):
    """Run SAM on a two-class data set and write a per-gene summary table.

    ``outfile`` is a directory (created if missing) that receives the plot
    'sam_plot.png' and the table 'sam_reult.txt'.  For every gene returned
    by ``sam`` the table holds the mean and standard deviation of each
    class, five statistics taken from ``sam``'s result, and the name of the
    class with the higher mean (the first class on a tie).  When ``sam``
    returns no genes, only the header line is written.
    """
    label, label_line, second_line = read_label_file.read(label_file)
    M = arrayio.read(filename)
    data = M.slice()
    label_list = [int(i) + 1 for i in label_line]
    row_keys = M._row_names.keys()
    genenames = [str(i) for i in M._row_names[row_keys[0]]]
    if not os.path.exists(outfile):
        os.mkdir(outfile)
    pngfig = os.path.join(outfile, 'sam_plot.png')
    # NOTE(review): 'sam_reult.txt' looks like a typo for 'sam_result.txt';
    # it is left unchanged because other code may read this path.
    out_result = os.path.join(outfile, 'sam_reult.txt')
    gene_ids, result = sam(data, label_list, genenames, delta, foldchange,
                           pngfig)
    group1s = []
    group2s = []
    sd1s = []
    sd2s = []
    higher_group = []
    label1 = second_line[int(label[0][1])]
    label2 = second_line[int(label[1][1])]
    if gene_ids:
        # Locate the returned genes in whichever row annotation contains
        # them; the last matching annotation wins.
        index_list = []
        for row_key in M._row_order:
            full_gene_list = M._row_names[row_key]
            if gene_ids[0] not in full_gene_list:
                continue
            index_list = [full_gene_list.index(i) for i in gene_ids]
        M_select = M.slice(index_list, None)
        for i in range(len(M_select)):
            row = M_select[i]
            values1 = [row[j] for j in label[0][0]]
            values2 = [row[j] for j in label[1][0]]
            group1 = sum(values1) / float(len(values1))
            group2 = sum(values2) / float(len(values2))
            group1s.append(group1)
            group2s.append(group2)
            sd1s.append(jmath.stddev_list(values1))
            sd2s.append(jmath.stddev_list(values2))
            if group1 >= group2:
                higher_group.append(label1)
            else:
                higher_group.append(label2)

    header = [
        'Gene Name', 'Ave_' + label1, 'Ave_' + label2, 'SD_' + label1,
        'SD_' + label2, 'Score(d)', 'Numerator(r)', 'Denominator(s+s0)',
        'Fold Change', 'q_value', 'higher_expression'
    ]
    # The context manager closes the handle even if a write fails.
    with open(out_result, 'w') as f:
        f.write('\t'.join(header))
        f.write('\n')
        if gene_ids:
            for i in range(len(gene_ids)):
                fields = [gene_ids[i], group1s[i], group2s[i],
                          sd1s[i], sd2s[i]]
                # The first five entries of ``result`` supply the statistic
                # columns of the header, in order.
                fields.extend(result[k][i] for k in range(5))
                fields.append(higher_group[i])
                f.write('\t'.join(str(field) for field in fields) + '\n')
Example #11
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Leave-one-out cross-validation using the SVM modules.

        ``antecedents`` is ``(data_node, cls_node)``.  For every column
        (sample) of the data matrix, the sample is held out, a merged matrix
        is written with the held-out sample as the last column, an SVM model
        is trained, the held-out sample is predicted and the prediction is
        evaluated.  The second line of each evaluation file is appended to
        ``outfile`` under a fixed header.  The temporary files of each
        iteration are removed at the end of that iteration.

        NOTE(review): the helper modules are called through module-level
        ``run(antecedents, attributes, user_options, network)`` functions,
        which differs from the argument order of this method -- confirm
        that the target modules still expose that interface.

        Raises AssertionError if ``outfile`` is missing or empty afterwards.
        """
        import os
        import arrayio
        from genomicode import filelib
        from Betsy import bie3
        from Betsy import rulebase
        from Betsy import read_label_file

        data_node, cls_node = antecedents
        M = arrayio.read(data_node.identifier)
        a, training_label, second_line = read_label_file.read(
            cls_node.identifier)
        # A real list so that it can be sliced and concatenated below.
        full_index = list(range(M.ncol()))
        predict_model = __import__('Betsy.modules.' + 'classify_with_svm',
                                   globals(), locals(), ['classify_with_svm'],
                                   -2)
        train_model = __import__('Betsy.modules.' + 'train_svm_model',
                                 globals(), locals(), ['train_svm_model'], -2)
        evaluate_model = __import__('Betsy.modules.' + 'evaluate_prediction',
                                    globals(), locals(),
                                    ['evaluate_prediction'], -2)

        # The context manager closes the output even if an iteration fails.
        with open(outfile, 'w') as f:
            f.write('\t'.join([
                'sample_name', 'Predicted_class', 'Confidence',
                'Actual_class', 'Correct?'
            ]))
            f.write('\n')
            for test_index in full_index:
                # Every sample except the held-out one, in original order.
                train_index = (
                    full_index[:test_index] + full_index[test_index + 1:])
                merge_index = train_index + [test_index]
                y_training = [training_label[j] for j in train_index]
                y_test = [training_label[test_index]]

                M_merge = M.matrix(None, merge_index)
                merge_file = 'merge' + '_' + str(test_index)
                with open(merge_file, 'w') as f_out:
                    arrayio.gct_format.write(M_merge, f_out)
                train_label = 'train_label' + '_' + str(test_index)
                test_label = 'test_label' + '_' + str(test_index)
                read_label_file.write(train_label, second_line, y_training)
                read_label_file.write(test_label, second_line, y_test[0])

                merge_node = rulebase.SignalFile.output(
                    format='gct', contents='class0,class1,test')
                merge_data = bie3.IdentifiedDataNode(
                    merge_node, identifier=merge_file)
                train_label_node = rulebase.ClassLabelFile.output(
                    contents='class0,class1')
                train_label_data = bie3.IdentifiedDataNode(
                    train_label_node, identifier=train_label)
                test_label_node = rulebase.ClassLabelFile.output(
                    contents='test')
                test_label_data = bie3.IdentifiedDataNode(
                    test_label_node, identifier=test_label)

                # The training step gets a copy of the attributes without
                # these three keys.
                new_parameters = out_attributes.copy()
                del new_parameters['loocv']
                del new_parameters['actual_label']
                del new_parameters['wv_feature_stat']

                x1 = merge_data, train_label_data
                svm_model = train_model.run(x1, new_parameters, user_options,
                                            network)
                x = svm_model, merge_data, train_label_data
                out_node = predict_model.run(x, out_attributes, user_options,
                                             network)
                out_node_label = evaluate_model.run(
                    (out_node, test_label_data), out_attributes,
                    user_options, network)

                with open(out_node_label.identifier, 'r') as f1:
                    lines = f1.readlines()
                f.write(lines[1])

                os.remove(merge_file)
                os.remove(train_label)
                os.remove(test_label)
                os.remove(out_node.identifier)
                os.remove(out_node_label.identifier)
                os.remove(svm_model.identifier)

        assert filelib.exists_nz(outfile), (
            'the output file %s for loocv fails' % outfile)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Select genes that differ between two classes by a t-test.

        ``antecedents`` is ``(data_node, cls_node)``.  A t-test is run
        between the columns of the two classes.  Depending on
        ``out_attributes["gene_order"]`` the genes are ranked by p-value
        ('ttest_p') or by the values returned by ``jmath.cmh_fdr_bh``
        ('ttest_fdr').  Genes whose score is below the user option
        ``gene_select_threshold`` are written to ``outfile`` as one
        tab-separated line, lowest score first.  Gene names come from the
        first row annotation of the matrix.

        Raises AssertionError if the label file does not describe two
        classes, if ``gene_order`` is unknown, or if no gene passes.
        """
        import arrayio
        from genomicode import jmath
        from Betsy import gene_ranking
        from Betsy import read_label_file
        from Betsy import module_utils as mlib

        data_node, cls_node = antecedents
        M = arrayio.read(data_node.identifier)
        label, label_line, second_line = read_label_file.read(
            cls_node.identifier)
        assert len(label) == 2, \
               'the length of label in %s should be 2' % cls_node.identifier
        assert len(label[0]) == 2
        assert len(label[1]) == 2

        threshold = mlib.get_user_option(user_options,
                                         "gene_select_threshold",
                                         not_empty=True,
                                         type=float)

        first = M.slice(None, label[0][0])
        second = M.slice(None, label[1][0])
        t, p = gene_ranking.t_test(first, second)
        # A missing p-value is treated as not significant.
        for i in range(len(p)):
            if p[i] is None:
                p[i] = 1

        key = M._row_order[0]

        gene_order = out_attributes["gene_order"]
        if gene_order == 'ttest_p':
            scores = p
        elif gene_order == 'ttest_fdr':
            scores = jmath.cmh_fdr_bh(p)
        else:
            # The call form of raise is valid on both Python 2 and 3.
            raise AssertionError("Unknown gene_order: %s" % gene_order)

        # Rank by (score, index) so that ties keep the original row order.
        ranked = sorted((score, index) for index, score in enumerate(scores))
        gene_list = [
            M._row_names[key][index]
            for score, index in ranked
            if score < threshold
        ]

        assert gene_list, 'there is no significant genes can be found in ttest'

        # The context manager closes the handle even if the write fails.
        with open(outfile, 'w') as f:
            f.write('\t'.join(gene_list))