def text_normalize_expression_structures():
    """Check `norm='center'` normalization when `structures` is supplied.

    Two scenarios are run against the same two-column frame (whose first row
    is NaN): one where every row carries the same structure label, and one
    where the rows are split between labels 'a' and 'b'. In both, the result
    of `correct.normalize_expression` must equal the hand-computed frame.
    """
    # NOTE(review): the `text_` prefix means pytest's default `test*`
    # discovery will not collect this function -- confirm whether the name
    # is intentional before relying on it for coverage.
    raw = [np.nan, 1, 2, 3, 4]
    data = pd.DataFrame({'a': raw, 'b': raw})

    # (structure labels, expected centered values for each column)
    scenarios = [
        # one shared label: values are centered on the overall mean (2.5)
        ('a', [np.nan, -1.5, -0.5, 0.5, 1.5]),
        # two labels: each group is centered on its own mean (1.5 and 3.5)
        (['a', 'a', 'a', 'b', 'b'], [np.nan, -0.5, 0.5, -0.5, 0.5]),
    ]

    for labels, centered in scenarios:
        structures = pd.DataFrame({'structure': labels}, index=data.index)
        expected = pd.DataFrame({'a': centered, 'b': centered})
        normalized = correct.normalize_expression(
            data, norm='center', structures=structures
        )
        pd.testing.assert_frame_equal(expected, normalized)
def test_normalize_expression_real(testfiles, method):
    """Run `correct.normalize_expression` on real microarray data.

    Parameters
    ----------
    testfiles
        Fixture passed to `flatten_dict(testfiles, 'microarray')` to obtain
        the microarray files to load.
    method
        Value forwarded as `norm=`; presumably supplied by a parametrize
        decorator that is not visible here -- TODO confirm.
    """
    # load the (transposed) microarray frames, one per file
    micro = []
    for fname in flatten_dict(testfiles, 'microarray').values():
        micro.append(io.read_microarray(fname).T)

    # blank out a few rows in the first two frames for "realness"
    nan_rows = [[5, 15, 25], [0, 10, 20]]
    for n, rows in enumerate(nan_rows):
        micro[n].iloc[rows] = np.nan

    # methods whose output must span exactly [0, 1] in every column
    unit_range = (
        'minmax',
        'scaled_sigmoid',
        'scaled_sigmoid_quantiles',
        'scaled_robust_sigmoid',
        'mixed_sigmoid',
    )
    # methods whose output must have zero mean in every column
    zero_mean = ('center', 'zscore')

    out = correct.normalize_expression(micro, norm=method)
    for normed, rows in zip(out, nan_rows):
        # rows that went in as NaN must come out as NaN
        assert np.all(np.isnan(normed.iloc[rows]))

        # ignore columns that are entirely NaN when checking statistics
        kept = normed.dropna(axis=1, how='all')
        if method in unit_range:
            assert np.allclose(kept.max(axis=0), 1)
            assert np.allclose(kept.min(axis=0), 0)
        elif method == 'robust_sigmoid':
            # only bounded by [0, 1]; the extremes need not be reached
            assert np.all(kept.max(axis=0) <= 1)
            assert np.all(kept.min(axis=0) >= 0)
        elif method in zero_mean:
            assert np.allclose(kept.mean(axis=0), 0)
            if method == 'zscore':
                assert np.allclose(kept.std(axis=0, ddof=1), 1)

    # NOTE(review): checks for `norm='batch'` (identical per-column means
    # across frames, NaN rows preserved) existed here only as commented-out
    # code -- restore or drop them once the status of 'batch' is confirmed.

    # invalid norm parameter
    with pytest.raises(ValueError):
        correct.normalize_expression(micro, norm='notanorm')
def test_normalize_expression_real(testfiles):
    """Run the 'srs', 'zscore' and 'batch' norms on real microarray data.

    Parameters
    ----------
    testfiles
        Fixture whose 'microarray' entry is iterated to obtain the files
        that are loaded with `io.read_microarray`.
    """
    # NOTE(review): another function with this exact name (taking an extra
    # `method` argument) is visible earlier in the source; if both live in
    # the same module this later definition shadows it -- confirm.

    # load the (transposed) microarray frames, one per file
    micro = []
    for fname in testfiles['microarray']:
        micro.append(io.read_microarray(fname).T)

    # blank out a few rows in the first two frames for "realness"
    nan_rows = [[5, 15, 25], [0, 10, 20]]
    for n, rows in enumerate(nan_rows):
        micro[n].iloc[rows] = np.nan

    def _checked_and_trimmed(frames):
        # Assert the NaN rows survived, then yield each frame with its
        # all-NaN columns dropped so column statistics can be checked.
        for frame, rows in zip(frames, nan_rows):
            assert np.all(np.isnan(frame.iloc[rows]))
            yield frame.dropna(axis=1, how='all')

    # min-max scaling (with some extra pizzazz): columns span exactly [0, 1]
    srs = correct.normalize_expression(micro, norm='srs')
    for kept in _checked_and_trimmed(srs):
        assert np.allclose(kept.max(axis=0), 1)
        assert np.allclose(kept.min(axis=0), 0)

    # z-scoring: mean = 0, std = 1
    zscore = correct.normalize_expression(micro, norm='zscore')
    for kept in _checked_and_trimmed(zscore):
        assert np.allclose(kept.mean(axis=0), 0)
        assert np.allclose(kept.std(axis=0, ddof=1), 1)

    # batch correct: force means identical
    batch = correct.normalize_expression(micro, norm='batch')
    frame_means = [frame.mean(axis=0, skipna=True) for frame in batch]
    assert np.allclose(*frame_means)
    # the NaN values should still be there, though
    for frame, rows in zip(batch, nan_rows):
        assert np.all(np.isnan(frame.iloc[rows]))

    # invalid norm parameter
    with pytest.raises(ValueError):
        correct.normalize_expression(micro, norm='notanorm')

    # can't do batch correction with only one donor
    with pytest.raises(ValueError):
        correct.normalize_expression(micro[0], norm='batch')