Esempio n. 1
0
 def test_get_bicluster(self):
     """Check that Bicluster.array() yields the rows x cols sub-matrix."""
     data = np.arange(60).reshape(10, 6)
     # data[r, c] == 6 * r + c, so rows (4, 6, 9) x cols (1, 3, 4) give:
     expected = np.array([[25, 27, 28], [37, 39, 40], [55, 57, 58]])
     rows = (4, 6, 9)
     cols = (1, 3, 4)
     bicluster = Bicluster(rows, cols, data)
     # np.alltrue was removed in NumPy 2.0; array_equal also compares
     # shapes instead of silently broadcasting.
     self.assertTrue(np.array_equal(expected, bicluster.array()))
Esempio n. 2
0
def _read_result_file_(filename, data):
    """
    Parse a biclustering result file into a list of Biclusters.

    The first line is a whitespace-separated header; its fields 5, 7, 11
    and 13 are read as 'nstable', 'likelihood', 'nparams' and 'bic'.

    The remaining lines are interpreted as follows:

    * a line starting with "bicluster" opens a new bicluster; the line
      immediately after it is skipped.
    * a line starting with "row" / "col" switches collection to the
      rows / columns of the current bicluster.
    * any other line contributes its first token, a 1-based index that
      is stored 0-based.

    Args:
        * filename: path of the result file.
        * data: dataset attached to every returned Bicluster.

    Returns:
        The tuple (biclusters, properties), where properties is the dict
        parsed from the header line.

    """
    biclusters = []
    rows = []
    cols = []
    with open(filename, 'r') as f:
        header = f.readline().split()
        properties = dict(nstable=int(header[5]),
                          likelihood=float(header[7]),
                          nparams=int(float(header[11])),
                          bic=float(header[13]))

        target = rows
        # Track whether a bicluster header was already seen instead of
        # testing for the character '1' after "bicluster": that test also
        # matches bicluster 10-19 and would merge them with bicluster 9.
        seen_bicluster = False
        for line in f:
            if line.startswith("bicluster"):
                if seen_bicluster:
                    biclusters.append(Bicluster(rows, cols, data=data))
                    rows = []
                    cols = []
                    # never keep appending to the finished bicluster
                    target = rows
                seen_bicluster = True
                # skip the line after the header; next() works on both
                # Python 2 and 3 and tolerates a header on the last line.
                next(f, None)
            elif line.startswith("row"):
                target = rows
            elif line.startswith("col"):
                target = cols
            else:
                target.append(int(line.split()[0]) - 1)
    # ensure we get the last bicluster.
    biclusters.append(Bicluster(rows, cols, data=data))
    return biclusters, properties
Esempio n. 3
0
 def test_get_bicluster(self):
     """Check that Bicluster.array() yields the rows x cols sub-matrix."""
     data = np.arange(60).reshape(10, 6)
     # data[r, c] == 6 * r + c, so rows (4, 6, 9) x cols (1, 3, 4) give:
     expected = np.array([[25, 27, 28],
                          [37, 39, 40],
                          [55, 57, 58]])
     rows = (4, 6, 9)
     cols = (1, 3, 4)
     bicluster = Bicluster(rows, cols, data)
     # np.alltrue was removed in NumPy 2.0; array_equal also compares
     # shapes instead of silently broadcasting.
     self.assertTrue(np.array_equal(expected, bicluster.array()))
Esempio n. 4
0
class TestValidation(unittest.TestCase):
    """
    Contains test cases for testing the validation functions in
    the 'validation' module.

    Every test compares a list of biclusters with itself (expecting a
    perfect score of 1) and, where an expected value is known, with a
    partially overlapping list.

    """
    # Shared 10x10 dataset (100 cells) for all biclusters below.
    data = numpy.random.randn(10, 10)
    # list1 covers rows/cols 0-3 and list2 rows/cols 2-5: 16 cells each,
    # sharing the 2x2 block of rows/cols 2-3 (4 cells, union of 28 cells).
    list1 = [Bicluster([0, 1, 2, 3], [0, 1, 2, 3], data)]
    list2 = [Bicluster([2, 3, 4, 5], [2, 3, 4, 5], data)]

    # NOTE: expected values use float literals so that '/' is a true
    # division on Python 2 as well, with or without the __future__ import.

    def test_prelic(self):
        rel, rec = bb.prelic_list(self.list1, self.list1)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

        rel, rec = bb.prelic_list(self.list1, self.list2)
        self.assertAlmostEqual(rel, 1.0 / 3)
        self.assertAlmostEqual(rec, 1.0 / 3)

    def test_fmeasure(self):
        rel, rec = bb.f_measure_list(self.list1, self.list1, modified=False)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

        sens = 4.0 / 16
        spec = (100.0 - 28) / (100 - 16)

        expected = 2 * (sens * spec) / (sens + spec)

        rel, rec = bb.f_measure_list(self.list1, self.list2, modified=False)
        self.assertAlmostEqual(rel, expected)
        self.assertAlmostEqual(rec, expected)

    def test_modified_fmeasure(self):
        rel, rec = bb.f_measure_list(self.list1, self.list1, modified=True)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

    def test_bicluster_jaccard(self):
        rel, rec = bb.jaccard_list(self.list1, self.list1)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

        # shared cells / cells in the union
        expected = 4.0 / 28

        rel, rec = bb.jaccard_list(self.list1, self.list2)
        self.assertAlmostEqual(rel, expected)
        self.assertAlmostEqual(rec, expected)

    def test_recovery_and_relevance(self):
        rel, rec = bb.recovery_relevance_list(self.list1, self.list1)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

        rel, rec = bb.recovery_relevance_list(self.list1, self.list2)
        self.assertAlmostEqual(rel, 0.25)
        self.assertAlmostEqual(rec, 0.25)
0
def _read_result_file_(filename, data):
    """
    Reads the bicluster in a single CPB output file.

    The file format is:

    ROWS
    [row index]     [row score]
    [row index]     [row score]
    ...
    [row index]     [row score]
    COLS
    [col index]     [col score]
    [col index]     [col score]
    ...
    [col index]     [col score]

    Only the index (first token) of each entry is used; scores are
    ignored. Blank lines are skipped.

    Args:
        * filename: path of the CPB output file.
        * data: dataset attached to the returned Bicluster.

    Returns:
        A Bicluster whose row and column indices are sorted ascending.

    """
    rows, cols = [], []
    with open(filename, 'r') as f:
        target = rows
        for line in f:
            if not line.strip():
                # a blank (e.g. trailing) line has no index to parse
                continue
            if line[0] == 'R':
                # section header for rows; rows are already the target
                continue
            if line[0] == 'C':
                # section header for columns: collect columns from now on
                target = cols
                continue
            target.append(int(line.split()[0]))
    rows.sort()
    cols.sort()
    return Bicluster(rows, cols, data=data)
Esempio n. 6
0
def _shuffle_(data, expected, new_rows=None, new_cols=None):
    """
    Shuffles the dataset while preserving biclusters.

    Args:
        * data: numpy.ndarray
        * expected: list of biclusters.
        * new_rows: Shuffled row indices; if None, randomly generated.
        * new_cols: Shuffled column indices; if None, randomly generated.

    Returns:
        The tuple (shuffled_data, shuffled_biclusters) where shuffled_data
        is a shuffled version of the input dataset, and shuffled_biclusters
        is a list of biclusters corresponding to the new biclusters in
        the shuffled dataset.

    """
    def first_positions(order):
        # Map each original index to its first position in 'order';
        # one pass instead of a linear list.index() scan per lookup.
        positions = {}
        for position, original in enumerate(order):
            positions.setdefault(original, position)
        return positions

    nrows, ncols = data.shape
    if new_rows is None:
        # random.shuffle needs a mutable sequence; on Python 3 a bare
        # range() object is not one.
        new_rows = list(range(nrows))
        random.shuffle(new_rows)
    if new_cols is None:
        new_cols = list(range(ncols))
        random.shuffle(new_cols)

    # reorder the rows, then the columns (via the transpose)
    shuffled_data = data[new_rows].T[new_cols].T

    row_position = first_positions(new_rows)
    col_position = first_positions(new_cols)

    shuffled_biclusters = []
    for b in expected:
        # original index -> where that row/column ended up after shuffling
        new_b_rows = [row_position[r] for r in b.rows]
        new_b_cols = [col_position[c] for c in b.cols]
        shuffled_biclusters.append(
            Bicluster(new_b_rows, new_b_cols, shuffled_data))
    return shuffled_data, shuffled_biclusters
Esempio n. 7
0
def _extract_biclusters_(fact, thresZ=0.5, thresL=None):
    """
    Call R's 'extractBic' on 'fact' and convert the result to Biclusters.

    Args:
        * fact: object handed to 'extractBic' as its first argument.
        * thresZ: forwarded to 'extractBic' as 'thresZ'.
        * thresL: forwarded as 'thresL' only when it is not None.

    Returns:
        A list of Biclusters, one per row of the 'bic' matrix in the
        result, all sharing the numpy copy of the result's 'X' matrix.

    """
    extract_kwargs = {'thresZ': thresZ}
    if thresL is not None:
        extract_kwargs['thresL'] = thresL
    extraction = robjects.r['extractBic'](fact, **extract_kwargs)

    r_data = extraction.rx('X')[0]
    numpy_data = numpy.array(r_data)
    # name -> index lookups for the row and column names of 'X'
    row_index_of = util.make_index_map(list(r_data.names[0]))
    col_index_of = util.make_index_map(list(r_data.names[1]))

    # 'bic' is an R matrix in which each row describes one bicluster
    r_biclusters = extraction.rx('bic')[0]
    biclusters = []
    for offset in range(r_biclusters.nrow):
        # R matrices are 1-indexed
        entry = r_biclusters.rx(offset + 1, True)

        member_rows = list(entry.rx('bixn')[0])
        member_cols = list(entry.rx('biypn')[0])
        biclusters.append(
            Bicluster([row_index_of[name] for name in member_rows],
                      [col_index_of[name] for name in member_cols],
                      numpy_data))
    return biclusters
Esempio n. 8
0
def _run_biclust_(function_name, data, **kwargs):
    """Convenience function for the various methods implemented in 'biclust'.

    Performs biclustering on the dataset and returns a list of biclusters.

    Args:
        * function_name: name of the 'biclust' method; passed to R's
            biclust() as its 'method' argument.
        * data: dataset handed to biclust() and attached to every
            returned Bicluster; must expose a two-element 'shape'.
        * kwargs: extra arguments for the method. Underscores in the
            names are replaced by dots before they are passed to R.

    Returns:
        A list of Biclusters. The list is empty when R raises an error,
        which is logged and treated as "no biclusters found".

    Raises:
        Exception: if the row and column membership matrices disagree
            on the number of biclusters.

    """
    # Replace underscores with dots. A new dict is built because renaming
    # keys in place mutates kwargs while iterating over its keys, which
    # raises RuntimeError on Python 3.
    r_kwargs = {key.replace("_", "."): value for key, value in kwargs.items()}

    robjects.r.library('biclust')

    #run biclustering
    biclust = robjects.r["biclust"]
    # The looked-up object itself is unused; the lookup is kept from the
    # original code. NOTE(review): presumably it makes an unknown method
    # name fail early -- confirm before removing.
    robjects.r[function_name]

    try:
        result = biclust(data, method=function_name, **r_kwargs)
    except RRuntimeError as e:
        # format the exception itself: 'e.message' no longer exists on
        # Python 3 exceptions.
        logging.error(
            '{0} caught an R exception. Assuming no biclusters were found. Message: {1}'
            .format(function_name, e))
        return []

    #get rowXnumber array
    row_matrix = numpy.array(result.do_slot("RowxNumber"))

    #get numberXcolumn array
    col_matrix = numpy.array(result.do_slot("NumberxCol"))

    num_biclusters = row_matrix.shape[1]

    # a hack for Cheng and Church, which appears to sometimes get the transpose of
    # the column matrix
    if not num_biclusters == col_matrix.shape[0]:
        if num_biclusters == col_matrix.shape[1] and \
                row_matrix.shape[0] == data.shape[0] and \
                col_matrix.shape[0] == data.shape[1]:
            col_matrix = col_matrix.T

    if not num_biclusters == col_matrix.shape[0]:
        raise Exception(
            'There is a problem with the results returned by {0}'.format(
                function_name))

    #make list of biclusters: column i of row_matrix and row i of
    #col_matrix flag the members of bicluster i with non-zero entries
    biclusters = []
    for i in range(num_biclusters):
        rows = numpy.flatnonzero(row_matrix[:, i]).tolist()
        cols = numpy.flatnonzero(col_matrix[i, :]).tolist()
        biclusters.append(Bicluster(rows, cols, data=data))

    return biclusters
Esempio n. 9
0
def _createBicluster_(geneLine, conditionLine, data):
    """
    Extracts the rows and columns of the bicluster from the given gene
    and condition lines.

    Args:
        * geneLine: whitespace-separated integer row indices.
        * conditionLine: whitespace-separated integer column indices.
        * data: dataset attached to the returned Bicluster.

    Returns:
        A Bicluster built from the parsed rows and columns.

    """
    # Real lists, not map(): on Python 3 map() returns a one-shot iterator.
    # split() without a separator also tolerates repeated or trailing
    # whitespace, where split(" ") would produce empty tokens.
    genes = [int(token) for token in geneLine.split()]
    conditions = [int(token) for token in conditionLine.split()]
    return Bicluster(genes, conditions, data)
Esempio n. 10
0
    def test__get_r_biclust_(self):
        """Check the membership matrices of the R object built from Biclusters."""
        # One row per data row / data column, one column per bicluster.
        # The builtin bool replaces np.bool8, which was removed in NumPy 2.0.
        exp_rows = np.array([[1, 0, 1], [1, 1, 0]], dtype=bool)
        exp_cols = np.array([[1, 1, 0], [1, 0, 1]], dtype=bool)
        data = np.random.randn(2, 2)
        biclusters = [
            Bicluster([0, 1], [0, 1], data),
            Bicluster([1], [0], data),
            Bicluster([0], [1], data)
        ]

        result = _get_r_biclust_(biclusters)

        rows = np.array(result.do_slot("RowxNumber"))
        cols = np.array(result.do_slot("NumberxCol"))
        # transpose so that cols has the same orientation as exp_cols
        cols = cols.T

        self.assertTrue((rows == exp_rows).all())
        self.assertTrue((cols == exp_cols).all())
Esempio n. 11
0
    def test_get_row_col_matrices(self):
        """Check the membership columns produced for a single bicluster."""
        # one bicluster in a 3x3 dataset: rows 0 and 1, column 1
        dataset = np.random.randn(3, 3)
        found = [Bicluster([0, 1], [1], dataset)]

        # expected 3x1 indicator columns for rows and for columns
        want_rows = np.array([1, 1, 0]).reshape(-1, 1)
        want_cols = np.array([0, 1, 0]).reshape(-1, 1)

        got_rows, got_cols = get_row_col_matrices(found)

        self.assertTrue((got_rows == want_rows).all())
        self.assertTrue((got_cols == want_cols).all())
Esempio n. 12
0
def _make_expected_biclusters_(row_matrix, col_matrix, data):
    """
    Build one Bicluster per column of the two membership matrices.

    Column i of row_matrix and column i of col_matrix mark the rows and
    columns of bicluster i with entries greater than zero. Both matrices
    must have the same number of columns.

    """
    nclust = row_matrix.shape[1]
    assert nclust == col_matrix.shape[1]

    def members(indicator):
        # positions whose indicator value is positive
        return list(numpy.where(indicator > 0)[0])

    return [Bicluster(members(row_line), members(col_line), data)
            for row_line, col_line in zip(row_matrix.T, col_matrix.T)]
Esempio n. 13
0
    def test_bicluster_eq(self):
        """Check how Bicluster equality reacts to the attached data."""
        # assertEqual / assertNotEqual replace the deprecated
        # assertEquals / assertNotEquals aliases (removed in Python 3.12).
        bic_a = Bicluster([1, 2, 3], [1, 2, 3])
        bic_b = Bicluster([1, 2, 3], [1, 2, 3])
        self.assertEqual(bic_a, bic_b)

        # data on only one side: expected to differ
        data = np.arange(10)
        bic_b.data = data
        self.assertNotEqual(bic_a, bic_b)

        # the very same array on both sides: expected to be equal
        bic_a.data = data
        self.assertEqual(bic_a, bic_b)

        # a distinct array with identical contents: expected to differ
        bic_b.data = np.arange(10)
        self.assertNotEqual(bic_a, bic_b)
Esempio n. 14
0
def _parse_bicluster_(string, gene_dict, cond_dict, data):
    """
    Parse the text describing one bicluster into a Bicluster.

    The text is cut at the gene header and then at the condition header;
    the gene part and the first line of the condition part are converted
    to row and column indices by the gene/condition line handlers. The
    number of parsed rows and columns must match the counts announced in
    'string'.

    """
    n_genes_announced = _get_expected_(string, _gene_regex_)
    n_conds_announced = _get_expected_(string, _cond_regex_)

    # keep what follows the gene header
    tail = re.split(_gene_regex_, string)[1]

    # the condition header separates the gene part from the condition part
    gene_part, cond_part = re.split(_cond_regex_, tail)

    # only the first line after the condition header is used
    first_cond_line = cond_part.partition('\n')[0]

    rows = _handle_gene_lines_(gene_part, gene_dict)
    cols = _handle_cond_lines_(first_cond_line, cond_dict)

    assert len(rows) == n_genes_announced
    assert len(cols) == n_conds_announced

    return Bicluster(rows, cols, data)
Esempio n. 15
0
def make_isa_data(nrows=300,
                  ncols=50,
                  nclusts=3,
                  nclustrows=None,
                  nclustcols=None,
                  noise=0,
                  bicluster_signals=None,
                  bicluster_noise=None,
                  noverlap_rows=0,
                  noverlap_cols=None,
                  shuffle=None):
    """
    Make ISA-style data.

    Generates a dataset using the 'isa2' R package's
    isa.in.silico function.

    If an argument is None, it is not included, and isa2's defaults are used.

    Requires that 'isa2' be installed.

    Args:
        * nrows: Number of rows in the data matrix.
        * ncols: Number of columns in the data matrix.
        * nclusts: Number of biclusters.
        * nclustrows: Rows in each bicluster.
            Defaults to round(0.5 * num_rows/num_fact)
        * nclustcols: Cols in each bicluster.
            Defaults to round(0.5 * num_cols/num_fact)
        * noise: Standard deviation of normal noise in background.
        * bicluster_signals: List of base signals for each bicluster.
            Defaults to 1's.
        * bicluster_noise: List of noise standard deviations for each bicluster.
            Defaults to 0's.
        * noverlap_rows: Number of bicluster rows that overlap.
        * noverlap_cols: Number of bicluster columns that overlap.
            Defaults to 'overlap_row'.
        * shuffle: If True, shuffle rows and columns.

    Returns:
        The tuple (data, expected): the generated numpy array and the
        list of Biclusters planted in it.

    """
    # Python argument -> isa.in.silico argument. Spelled out explicitly
    # rather than read from locals(); 'shuffle' is handled in Python and
    # never sent to R.
    candidate_args = dict(
        num_rows=nrows,
        num_cols=ncols,
        num_fact=nclusts,
        mod_row_size=nclustrows,
        mod_col_size=nclustcols,
        noise=noise,
        mod_signal=bicluster_signals,
        mod_noise=bicluster_noise,
        overlap_row=noverlap_rows,
        overlap_col=noverlap_cols,
    )

    # drop unset arguments so that isa2 applies its own defaults
    isa_args = {}
    for key, value in candidate_args.items():
        if value is not None:
            isa_args[key] = value

    # per-bicluster lists must be handed over as R numeric vectors
    for key in ['mod_signal', 'mod_noise']:
        if key in isa_args:
            isa_args[key] = robjects.FloatVector(list(isa_args[key]))

    robjects.r.library('isa2')

    #get data
    func = robjects.r['isa.in.silico']
    result = func(**isa_args)

    #convert to python
    data = numpy.array(robjects.Matrix(result[0])).copy()
    rows = numpy.array(robjects.Matrix(result[1])).copy()
    cols = numpy.array(robjects.Matrix(result[2])).copy()

    # column i of 'rows' / 'cols' marks the members of bicluster i with
    # non-zero entries
    nbiclusters = rows.shape[1]
    expected = []
    for i in range(nbiclusters):
        bicluster_rows = list(rows[:, i].nonzero()[0])
        bicluster_cols = list(cols[:, i].nonzero()[0])
        expected.append(Bicluster(bicluster_rows, bicluster_cols, data))

    if shuffle:
        data, expected = _shuffle_(data, expected)
    return data, expected
Esempio n. 16
0
def isa(data,
        thr_row=None,
        thr_col=None,
        no_seeds=100,
        direction=('updown', 'updown')):
    """
    ISA biclustering algorithm.

    Runs the 'isa' function of the 'isa2' R package, which must be
    installed.

    Args:
        * data: numpy.ndarray.
        * thr_row: threshold value(s) for rows; a single value or an
            iterable. If None, R's seq(1, 3, by=0.5) is used.
        * thr_col: threshold value(s) for cols; handled like thr_row.
        * no_seeds: number of seeds to generate biclusters.
        * direction: pair of strings, each either 'up' for upregulated,
            'down' for downregulated, 'updown' for both(default).

    Returns:
        A list of biclusters.

    """
    # NOTE: the default for 'direction' is a tuple rather than a list so
    # that no mutable object is shared between calls; lists still work.

    #load the isa library
    robjects.r.library('isa2')

    #get an R object for the data
    r_data = robjects.Matrix(data)

    def handle_threshold(x):
        # None -> R's default sequence; otherwise wrap scalars and
        # convert to an R numeric vector
        if x is None:
            return robjects.r['seq'](1, 3, by=0.5)
        if not isiterable(x):
            x = [x]
        return robjects.FloatVector(list(x))

    thr_row = handle_threshold(thr_row)
    thr_col = handle_threshold(thr_col)

    direction = robjects.StrVector(list(direction))

    #run biclustering
    func = robjects.r('isa')
    result = func(r_data, thr_row, thr_col, no_seeds, direction)

    #get the row membership array (one column per bicluster)
    row_matrix = numpy.array(robjects.Matrix(result[0]))

    #get the column membership array (one column per bicluster)
    col_matrix = numpy.array(robjects.Matrix(result[1]))

    num_biclusters = row_matrix.shape[1]
    assert num_biclusters == col_matrix.shape[1]

    #make list of biclusters: non-zero entries mark members
    biclusters = []
    for i in range(num_biclusters):
        rows = numpy.flatnonzero(row_matrix[:, i]).tolist()
        cols = numpy.flatnonzero(col_matrix[:, i]).tolist()
        biclusters.append(Bicluster(rows, cols, data=data))

    return biclusters
Esempio n. 17
0
def make_fabia_data(nrows,
                    ncols,
                    nclusts,
                    f1,
                    f2,
                    of1,
                    of2,
                    sd_noise,
                    sd_z_noise,
                    mean_z,
                    sd_z,
                    sd_l_noise,
                    mean_l,
                    sd_l,
                    shuffle=True,
                    pos=False):
    """
    Make FABIA-style data.

    An interface to the Bioconductor 'fabia' library's
    makeFabiaDataset functions.

    Requires that 'fabia' be installed.

    Args:
        * nrows: number of observations.
        * ncols: number of samples.
        * nclusts: number of biclusters.
        * f1: ncols/f1 max. additional samples are active in a bicluster.
        * f2: nrows/f2 max. additional observations that form a pattern
            in a bicluster.
        * of1: minimal active samples in a bicluster.
        * of2: minimal observations that form a pattern in a bicluster.
        * sd_noise: Gaussian zero mean noise std on data matrix.
        * sd_z_noise: Gaussian zero mean noise std for deactivated hidden factors.
        * mean_z: Gaussian mean for activated factors.
        * sd_z: Gaussian std for activated factors.
        * sd_l_noise: Gaussian zero mean noise std if no observation patterns
            are present.
        * mean_l: Gaussian mean for observation patterns.
        * sd_l: Gaussian std for observation patterns.
        * shuffle: If True, shuffle dataset.
        * pos: Use the MakeFabiaDataPos functions

    Returns:
        The tuple (noisy_data, biclusters): the generated numpy array and
        the list of Biclusters planted in it.

    """
    robjects.r.library('fabia')

    # pick the R generator: makeFabiaData[Blocks][Pos]
    function = 'makeFabiaData'
    if not shuffle:
        function += "Blocks"
    if pos:
        function += "Pos"
    func = robjects.r[function]

    result = func(nrows, ncols, nclusts, f1, f2, of1, of2, sd_noise,
                  sd_z_noise, mean_z, sd_z, sd_l_noise, mean_l, sd_l)

    # result[1] is not needed here; only the noisy data is returned
    noisy_data = numpy.array(result[0]).copy()
    cols_vector = result[2]
    rows_vector = result[3]

    def zero_based(r_indices):
        # R indices are 1-based. A real list is built because map() would
        # return a one-shot iterator on Python 3.
        return [int(index) - 1 for index in r_indices]

    rows = [zero_based(r) for r in rows_vector]
    cols = [zero_based(c) for c in cols_vector]

    biclusters = [Bicluster(r, c, noisy_data) for r, c in zip(rows, cols)]

    return noisy_data, biclusters