def test(self, baits, gene_families):
        """
        ``alg._create_bait_nodes`` returns one node row per bait.

        Verifies that neither ``baits`` nor ``gene_families`` is modified by
        the call, and that the returned table matches the expected bait rows
        (including the ``family`` column) regardless of row/column order.
        """
        baits_before = baits.copy()
        families_before = gene_families.copy()

        actual = alg._create_bait_nodes(baits, gene_families)

        # The call must leave both inputs exactly as they were
        assert_series_equals(baits, baits_before)
        assert_df_equals(gene_families, families_before)

        # Every expected row shares the same colour and partition id
        node_colour = RGB((255, 255, 255))
        bait_partition = hash(frozenset())
        column_names = [
            'genes', 'type', 'family', 'label', 'colour', 'partition_id'
        ]
        # bait1 is expected without a family (NaN), bait2 with 'fam2'
        bait1_row = [
            frozenset({'bait1'}), 'bait', np.nan, 'bait1', node_colour,
            bait_partition
        ]
        bait2_row = [
            frozenset({'bait2'}), 'bait', 'fam2', 'bait2', node_colour,
            bait_partition
        ]
        wanted = pd.DataFrame([bait1_row, bait2_row], columns=column_names)

        assert_df_equals(
            actual,
            wanted,
            ignore_indices={0},
            ignore_order={0, 1},
        )
Ejemplo n.º 2
0
 def test_happy_days(self):
     """Valid gene families must validate without raising or mutating."""
     column_names = ['family', 'gene']
     rows = [
         ['fam1', 'geneA1'],
         ['fam1', 'geneA3'],
         ['fam2', 'geneA2'],
         ['fam2', 'geneB3'],
     ]
     pristine = pd.DataFrame(rows, columns=column_names)
     candidate = pristine.copy()

     # Should simply return; any exception fails the test
     _validate_gene_families(candidate)

     # The frame handed to the validator must be left as it was
     assert_df_equals(candidate, pristine)
    def test(self, baits, cors, gene_families):
        """
        ``alg._create_non_bait_nodes`` yields family nodes and gene nodes.

        Checks that the three inputs are untouched by the call and that the
        two returned tables match the expected gene and family node rows,
        ignoring index values and row/column order.
        """
        baits_before = baits.copy()
        cors_before = cors.copy()
        families_before = gene_families.copy()

        nodes = alg._create_non_bait_nodes(baits, cors, gene_families)
        family_nodes, gene_nodes = nodes

        # None of the inputs may have been modified
        assert_series_equals(baits, baits_before)
        assert_df_equals(cors, cors_before)
        assert_df_equals(gene_families, families_before)

        # Gene nodes: a single row for gene3, linked to bait2
        gene3_row = [
            frozenset({'bait2'}), 'gene3', 'gene', frozenset({'gene3'})
        ]
        wanted_gene_nodes = pd.DataFrame(
            [gene3_row],
            columns=['baits', 'label', 'type', 'genes'],
        )
        assert_df_equals(
            gene_nodes,
            wanted_gene_nodes,
            ignore_indices={0},
            ignore_order={0, 1},
        )

        # Family nodes: a single row for family 'fam' holding gene1 and gene2
        fam_row = [
            frozenset({'bait1', 'bait2'}), 'fam', 'family',
            frozenset({'gene1', 'gene2'}), 'fam'
        ]
        wanted_family_nodes = pd.DataFrame(
            [fam_row],
            columns=['baits', 'label', 'type', 'genes', 'family'],
        )
        assert_df_equals(
            family_nodes,
            wanted_family_nodes,
            ignore_indices={0},
            ignore_order={0, 1},
        )
    def test(self, matrix, baits, pearson_df_mock, estimate_cutoffs_mock,
             percentiles, cor_matrix, cutoffs):
        """
        ``alg._correlate_matrix`` correlates the matrix against the baits.

        Asserts that the inputs are unchanged, that the pearson mock received
        the full matrix data and the baits, that the matrix info carries the
        expected values, and that the returned correlations match the
        expected long-format table.
        """
        matrix_data_before = matrix.data.copy()
        baits_before = baits.copy()
        percentiles_before = percentiles.copy()

        cors, matrix_info = alg._correlate_matrix(matrix, baits, percentiles)

        # Inputs are left as they were
        assert_df_equals(matrix.data, matrix_data_before)
        assert_series_equals(baits, baits_before)
        assert np.allclose(percentiles, percentiles_before)

        # The whole matrix (no rows dropped due to low std) is correlated
        # against the baits that are present
        pearson_args = pearson_df_mock.call_args.args
        assert_df_equals(pearson_args[0], matrix_data_before)
        assert list(pearson_args[1].index) == ['bait1', 'bait2']

        # Matrix info is handed through unchanged
        assert_df_equals(matrix_info.cor_matrix, cor_matrix)
        assert np.allclose(matrix_info.percentile_values, cutoffs)
        assert matrix_info.sample == 'sample'

        # Only the insignificant correlations were removed, and the result
        # comes back as (gene, bait, correlation) rows
        wanted_rows = [
            ['bait1', 'bait1', 0.0],
            ['bait1', 'bait2', 1.0],
            ['gene1', 'bait2', 5.0],
        ]
        wanted_cors = pd.DataFrame(
            wanted_rows, columns=['gene', 'bait', 'correlation'])
        assert_df_equals(
            cors,
            wanted_cors,
            ignore_indices={0},
            ignore_order={0, 1},
        )
Ejemplo n.º 5
0
    def test(self, mock_stdin, output_dir):
        """
        Run ``main()`` and compare the files it writes to ``output_dir``.

        Each tabular output file is read back with ``pd.read_table`` and
        compared (approximately) to a hand-written expected frame; the two
        sample graph files are only checked for existence.
        """
        main()

        def read_output(file_name, index_col):
            # Load one of the tab-separated files from the output directory
            return pd.read_table(str(output_dir / file_name),
                                 index_col=index_col)

        genes = ['gene1', 'gene2', 'gene3', 'gene4']
        cor_gene2_gene4 = 0.59603956067926978
        cor_gene3_gene4 = -0.802955

        # Sample matrix file
        wanted = pd.DataFrame(
            [
                [1, -1, 0, -cor_gene2_gene4],
                [-1, 1, 0, cor_gene2_gene4],
                [0, 0, 1, cor_gene3_gene4],
                [-cor_gene2_gene4, cor_gene2_gene4, cor_gene3_gene4, 1],
            ],
            index=genes,
            columns=genes,
        )
        found = read_output('matrix1.sample_matrix.txt', index_col=0)
        assert_df_equals(found, wanted, ignore_order={0, 1}, all_close=True)

        # Correlation matrix file
        wanted = pd.DataFrame(
            [
                [1, -1],
                [-1, 1],
                [0, 0],
                [-cor_gene2_gene4, cor_gene2_gene4],
            ],
            index=genes,
            columns=genes[:2],
        )
        found = read_output('matrix1.correlation_matrix.txt', index_col=0)
        assert_df_equals(found, wanted, ignore_order={0, 1}, all_close=True)

        # Percentile values file
        wanted = pd.DataFrame(
            [['matrix1', -0.95073877, 0.44702967]],
            columns=['expression_matrix', 'lower', 'upper'],
        )
        found = read_output('percentile_values.txt', index_col=None)
        assert_df_equals(
            found,
            wanted,
            ignore_order={0, 1},
            ignore_indices={0},
            all_close=True,
        )

        # Significant correlations file
        wanted = pd.DataFrame(
            [
                ['gene1', 'gene2', -1],
                ['gene2', 'gene4', cor_gene2_gene4],
            ],
            columns=['bait', 'gene', 'correlation'],
        )
        found = read_output('significant_correlations.txt', index_col=None)
        assert_df_equals(
            found,
            wanted,
            ignore_order={0, 1},
            ignore_indices={0},
            all_close=True,
        )

        # Sample graphs: only their presence is verified
        graph_files = (
            'matrix1.sample_histogram.png',
            'matrix1.sample_cdf.png',
        )
        for graph_file in graph_files:
            assert (output_dir / graph_file).exists()
    def test(self, nodes):
        """
        ``alg._create_homology_edges`` links bait nodes of the same family.

        The input table must stay unchanged and the result must contain the
        expected node-id pairs, without self or symmetric edges.
        """
        nodes_before = nodes.copy()

        actual = alg._create_homology_edges(nodes)

        # Input is left as it was
        assert_df_equals(nodes, nodes_before)

        # One edge per unordered pair of same-family baits
        wanted_pairs = [
            [2, 3],
            [2, 4],
            [3, 4],
        ]
        wanted = pd.DataFrame(
            wanted_pairs,
            columns=('bait_node1', 'bait_node2'),
        )
        assert_df_equals(
            actual,
            wanted,
            ignore_indices={0},
            ignore_order={0, 1},
        )
    def test(self, correlate_matrix_mock):
        """
        ``alg._correlate_matrices`` merges the per-matrix results.

        ``_correlate_matrix`` is mocked, so the arguments passed here are
        deliberately not valid matrices/baits/percentiles.
        """
        result = alg._correlate_matrices([1, 2], None, None)
        cors, matrix_infos = result

        # cors is the concatenation of each matrix's cors, with self
        # comparisons and symmetrical cors dropped (i.e. pearson(x, y) is
        # only returned for x < y)
        wanted_rows = [
            ['bait1', 'gene1', 0.5],
            ['bait2', 'gene2', 1.0],
        ]
        wanted = pd.DataFrame(
            wanted_rows,
            columns=['bait', 'gene', 'correlation'],
        )
        assert_df_equals(
            cors,
            wanted,
            ignore_indices={0},
            ignore_order={0, 1},
        )

        # matrix_infos is a tuple with one info per matrix; the mock returns
        # plain ints rather than MatrixInfo objects
        assert matrix_infos == (3, 4)
    def test(self, nodes, cors):
        """
        ``alg._create_cor_edges`` builds correlation edges between nodes.

        Neither input may be modified, and the returned edges (bait node,
        other node, max correlation) must match the expected rows, which
        include bait-to-bait edges.
        """
        nodes_before = nodes.copy()
        cors_before = cors.copy()

        actual = alg._create_cor_edges(nodes, cors)

        # Inputs are left as they were
        assert_df_equals(nodes, nodes_before)
        assert_df_equals(cors, cors_before)

        # Expected edges
        bait_to_other_rows = [
            [1, 2, 2.0],
            [3, 2, 3.0],
            [3, 4, 4.0],
        ]
        # Correlations between two baits produce edges too
        bait_to_bait_rows = [
            [1, 3, 5.0],
            [3, 5, 5.0],
        ]
        wanted = pd.DataFrame(
            bait_to_other_rows + bait_to_bait_rows,
            columns=('bait_node', 'node', 'max_correlation'),
        )
        assert_df_equals(
            actual,
            wanted,
            ignore_indices={0},
            ignore_order={0, 1},
        )
    def test(self, bait_nodes, family_nodes, gene_nodes,
             distinct_colours_mock):
        """
        ``alg._concat_nodes`` combines bait, family and gene nodes.

        Checks that the three input tables are unchanged, that the result
        holds no ``np.nan`` objects, that exactly two distinct colours were
        requested, and that (apart from the ``id`` column) the result equals
        the expected concatenated table.
        """
        bait_nodes_before = bait_nodes.copy()
        family_nodes_before = family_nodes.copy()
        gene_nodes_before = gene_nodes.copy()

        nodes = alg._concat_nodes(bait_nodes, family_nodes, gene_nodes)

        # Inputs are left as they were
        assert_df_equals(bait_nodes, bait_nodes_before)
        assert_df_equals(family_nodes, family_nodes_before)
        assert_df_equals(gene_nodes, gene_nodes_before)

        # Every NaN should have become None (e.g. so that a missing family
        # turns into `null` rather than `NaN` when converted to json later).
        #
        # np.isnan raises on None, hence the identity comparison.
        # NOTE(review): `is np.nan` only matches the np.nan singleton itself
        # - confirm that is sufficient for the columns involved.
        is_nan_object = nodes.applymap(lambda value: value is np.nan)
        assert not is_nan_object.any(axis=None)

        # Two partitions exist besides the bait partition, so two colours
        # are requested
        distinct_colours_mock.assert_called_once_with(2)

        # Otherwise a plain concat, with partition ids being hashes of the
        # baits; the id column is not compared
        del nodes['id']
        both_baits_partition = hash(frozenset({'bait1', 'bait2'}))
        bait1_partition = hash(frozenset({'bait1'}))
        wanted_rows = [
            ['bait1', 'bait', frozenset({'bait1'}), None, 1, 1],
            [
                'fam', 'family', frozenset({'gene1', 'gene2'}), 'fam',
                both_baits_partition, 2
            ],
            [
                'fam2', 'family', frozenset({'gene3'}), 'fam2',
                bait1_partition, 3
            ],
            [
                'gene4', 'gene', frozenset({'gene4'}), None,
                both_baits_partition, 2
            ],
        ]
        wanted = pd.DataFrame(
            wanted_rows,
            columns=('label', 'type', 'genes', 'family', 'partition_id',
                     'colour'),
        )
        assert_df_equals(
            nodes,
            wanted,
            ignore_indices={0},
            ignore_order={0, 1},
        )
    def test(self, pearson_df_mock, matrix, expected_cors, percentile_mock):
        """
        ``alg._estimate_cutoffs`` correlates the full matrix with itself.

        Verifies the inputs are unchanged, that the pearson mock was called
        with the matrix data for both arguments, that the returned
        correlations are the expected ones, and which values were passed to
        the percentile mock.
        """
        matrix_data_before = matrix.data.copy()
        percentiles = np.array([20.0, 80.0])
        percentiles_before = percentiles.copy()

        cors, _ = alg._estimate_cutoffs(matrix, percentiles=percentiles)

        # Inputs are left as they were
        assert_df_equals(matrix.data, matrix_data_before)
        assert np.allclose(percentiles, percentiles_before)

        # The entire matrix is correlated against itself (rather than only a
        # sample of it)
        pearson_args = pearson_df_mock.call_args.args
        assert_df_equals(pearson_args[0], matrix.data)
        assert_df_equals(pearson_args[1], matrix.data)

        # The sample cors returned are exactly what pearson produced
        assert_df_equals(cors, expected_cors)

        # Percentiles are computed over one triangle, excluding the diagonal
        percentile_args = percentile_mock.call_args.args
        triangle_values = np.array([1.0, 2.0, 3.0])
        assert np.allclose(percentile_args[0], triangle_values)
        assert np.allclose(percentile_args[1], percentiles)