Example #1
0
def test_gtex_reindex(expression_data):
    """Check that the transforms reindex to the normalizer's gene set.

    TPM is computed from the raw counts and CLR from that TPM (imputing
    along the way); both results must expose the same columns as the
    index of ``Normalizer.gene_lengths``.
    """
    normalizer = normalize.Normalizer(identifier='symbol')
    tpm_frame = normalizer.tpm_from_counts(expression_data.counts)
    clr_frame = normalizer.clr_from_tpm(tpm_frame, imputer=normalize.impute)

    expected_columns = tpm_frame.columns
    assert (expected_columns == clr_frame.columns).all()
    assert (expected_columns == normalizer.gene_lengths.index).all()
Example #2
0
def test_clr_functions(expression_data):
    """Round-trip imputed TPM through CLR and back, expecting the input."""
    normalizer = normalize.Normalizer(identifier='symbol')

    imputed_tpm = normalize.impute(expression_data.tpm)
    clr_values = normalizer.clr_from_tpm(
        imputed_tpm, gene_list=imputed_tpm.columns)
    recovered_tpm = normalizer.tpm_from_clr(
        clr_values, gene_list=clr_values.columns)

    # TPM -> CLR -> TPM should reproduce the starting values.
    assert np.allclose(imputed_tpm, recovered_tpm)
Example #3
0
def test_normalizer_tpm_from_rpkm_allgenes(expression_data):
    """Check the RPKM -> TPM conversion against the fixture's TPM values.

    No gene list is passed, so all genes are included in the conversion.
    """
    normalizer = normalize.Normalizer(identifier='symbol')

    expected_tpm = expression_data.tpm
    computed_tpm = normalizer.tpm_from_rpkm(expression_data.rpkm)

    assert (expected_tpm.index == computed_tpm.index).all()
    # Select the fixture's genes from the result so the columns line up.
    aligned_tpm = computed_tpm[expected_tpm.columns]
    assert np.allclose(expected_tpm.values, aligned_tpm.values)
Example #4
0
def test_zscore_from_clr(expression_data):
    """Check that z-scoring CLR data keeps the shape of the CLR frame.

    Every sample is labelled with the same tissue, ``'Liver'``.
    """
    normalizer = normalize.Normalizer(identifier='symbol')

    imputed_tpm = normalize.impute(expression_data.tpm)
    clr_values = normalizer.clr_from_tpm(
        imputed_tpm, gene_list=imputed_tpm.columns)

    tissue_labels = pd.Series('Liver', index=clr_values.index)
    z_scores = normalizer.z_score_from_clr(clr_values, tissue_labels)

    assert z_scores.shape == clr_values.shape
Example #5
0
def test_normalizer_tpm_from_counts(expression_data):
    """Check the counts -> TPM conversion against the fixture's TPM values.

    Column labels, row labels and numeric values must all agree.
    """
    normalizer = normalize.Normalizer(identifier='symbol')

    expected_tpm = expression_data.tpm
    computed_tpm = normalizer.tpm_from_counts(expression_data.counts)

    # Labels first (columns, then index), then the numbers themselves.
    for axis_name in ('columns', 'index'):
        expected_labels = getattr(expected_tpm, axis_name)
        computed_labels = getattr(computed_tpm, axis_name)
        assert (expected_labels == computed_labels).all()
    assert np.allclose(expected_tpm.values, computed_tpm.values)
Example #6
0
def test_normalizer_tpm_from_subset(expression_data):
    """Test the TPM -> TPM subset conversion for some expression data.

    Without a gene list the values are expected to come back unchanged;
    with a subset of genes every sample is expected to sum to one million
    again.
    """
    identifier = 'symbol'
    norm = normalize.Normalizer(identifier=identifier)

    tpm = expression_data.tpm
    tpm_fullset_calc = norm.tpm_from_subset(tpm)
    assert np.allclose(tpm.values, tpm_fullset_calc.values)

    tpm_subset_calc = norm.tpm_from_subset(tpm, tpm.columns[:100])
    tpm_subset_norm = tpm_subset_calc.sum(axis=1).values
    # Compare against 1e6 directly. Testing the difference against zeros
    # multiplies the relative tolerance by zero, leaving only atol=1e-8,
    # which is brittle for floating-point sums of this magnitude.
    assert np.allclose(tpm_subset_norm, 1e6)
Example #7
0
def test_ordinalize(expression_data):
    """Check ordinalization of CLR data with a single cutoff.

    Entries at or below the cutoff must map to ``min_value`` and entries
    above it to ``min_value + 1``.
    """
    normalizer = normalize.Normalizer(identifier='symbol')

    imputed_tpm = normalize.impute(expression_data.tpm)
    clr_values = normalizer.clr_from_tpm(
        imputed_tpm, gene_list=imputed_tpm.columns)

    threshold = 0.37
    lowest_ordinal = 5
    ordinals = normalizer.ordinalize(
        clr_values, [threshold], min_value=lowest_ordinal)

    at_or_below = clr_values <= threshold
    above = clr_values > threshold
    assert (at_or_below == (ordinals == lowest_ordinal)).all().all()
    assert (above == (ordinals == 1 + lowest_ordinal)).all().all()
Example #8
0
def test_alr_functions_dirimpute(expression_data):
    """Check the TPM -> ALR transform with imputation done inside the call.

    The first gene is used as the reference; the ALR result must contain
    exactly the remaining genes, in their original order.
    """
    normalizer = normalize.Normalizer(identifier='symbol')

    gene_names = list(expression_data.tpm.columns)
    reference, remaining = gene_names[:1], gene_names[1:]

    alr_frame = normalizer.alr_from_tpm(
        expression_data.tpm,
        reference,
        gene_list=gene_names,
        imputer=normalize.impute,
    )

    assert remaining == list(alr_frame.columns)
Example #9
0
def expression_data():
    """Build random read counts plus the TPM and RPKM values derived from them.

    Returns an ``ExpressionData`` holding, in order, the counts, TPM and
    RPKM frames (100 samples by 1000 genes).
    """
    n_samples, n_genes = 100, 1000
    read_count_ceiling = 1234
    random_fractions = np.random.rand(n_samples, n_genes)
    counts = pd.DataFrame(np.round(read_count_ceiling * random_fractions))

    # Label the count columns with the first n_genes entries of the
    # normalizer's gene-length table.
    gene_lengths = normalize.Normalizer(identifier='symbol').gene_lengths[:n_genes]
    counts.columns = gene_lengths.index

    scaled_lengths = gene_lengths / 1e3

    # TPM: scale by gene length first, then by each sample's total.
    per_length = counts.divide(scaled_lengths, axis='columns')
    tpm = per_length.divide(per_length.sum(axis=1) / 1e6, axis='index')

    # RPKM: scale by each sample's total first, then by gene length.
    per_million = counts.divide(counts.sum(axis=1) / 1e6, axis='index')
    rpkm = per_million.divide(scaled_lengths, axis='columns')

    return ExpressionData(counts, tpm, rpkm)