Python examples of ananke._tabulate.read_metadata

Example #1

0

Show file

File: test_tabulate.py Project: z2e2/ananke

def test_read_metadata_missing_column():
    """Check that read_metadata raises KeyError for a missing column.

    Two cases are exercised against the same metadata file: an unknown
    name passed as the time_points column, and an unknown name passed as
    the time_mask column.
    """
    metadata_path = "./tests/data/metadata_mapping.txt"

    # Unknown time_points column, no time_mask requested.
    with pytest.raises(KeyError):
        read_metadata(metadata_path, "something_else", None)

    # Known time_points column, unknown time_mask column.
    with pytest.raises(KeyError):
        read_metadata(metadata_path, "time_points", "something_else")

Example #2

0

Show file

File: test_tabulate.py Project: z2e2/ananke

def test_read_metadata_no_header():
    """Check that read_metadata raises KeyError when the metadata file has
    no header."""
    # The return value is irrelevant: the call is expected to raise.
    with pytest.raises(KeyError):
        read_metadata("./tests/data/metadata_mapping_noheader.txt",
                      "time_points", None)

Example #3

0

Show file

File: test_tabulate.py Project: z2e2/ananke

def test_tabulate_skipped_sample_warning():
    """Check that tabulate emits a UserWarning when the FASTA file contains
    sequences that go unused because their sample has no entry in the
    metadata mapping."""
    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", "time_mask")
    # The context manager closes the FASTA handle even when the warning
    # check fails or tabulate raises.
    with open("./tests/data/sequences_extra_sample.fasta") as seqf:
        with pytest.warns(UserWarning):
            tabulate(seqf, metadata_mapping, False)

Example #4

0

Show file

File: test_tabulate.py Project: z2e2/ananke

def test_tabulate_wrapped_FASTA():
    """Check that tabulate raises AssertionError for a wrapped FASTA file.

    The input FASTA must use one line for the label and one line for the
    sequence.
    """
    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", "time_mask")
    # The context manager closes the FASTA handle even if the expected
    # exception is not raised and pytest.raises itself fails.
    with open("./tests/data/sequences_wrapped.fasta") as seqf:
        with pytest.raises(AssertionError):
            tabulate(seqf, metadata_mapping, False)

Example #5

0

Show file

File: test_tabulate.py Project: z2e2/ananke

def test_read_metadata_without_mask():
    """Check read_metadata output when no time_mask column is specified.

    Verifies the returned data type, that the expected columns exist, and
    that the rows are sorted by time_points.
    """
    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", None)

    # Use the public pandas name with isinstance instead of comparing
    # against the private module path pd.core.frame.DataFrame.
    assert isinstance(metadata_mapping, pd.DataFrame)
    assert "#SampleID" in metadata_mapping
    assert "time_points" in metadata_mapping

    expected_time_points = [0, 0, 4, 4, 10, 10, 12, 12]
    assert list(metadata_mapping["time_points"]) == expected_time_points

Example #6

0

Show file

File: test_tabulate.py Project: z2e2/ananke

def test_write_csr(temp_dir):
    """Exercise write_csr and validate both of its outputs.

    Tabulates a FASTA file, loads the sample metadata into a
    TimeSeriesData object, writes the counts with write_csr, and then
    checks the data stored in the TimeSeriesData object and the
    unique-sequence FASTA file that was written.

    Args:
        temp_dir: Directory path (string) in which the ananke.h5 file and
            the output FASTA file are created.
    """
    unique_fasta_path = temp_dir + "/seq.unique.fasta"

    # Both handles are closed when the block exits, even on failure. The
    # output FASTA must be closed before it is compared further below.
    with open("./tests/data/sequences_no_size.fasta") as seqf, \
            open(unique_fasta_path, 'w') as outseqf:
        timeseriesdb = TimeSeriesData(temp_dir + "/ananke.h5")

        metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                         "time_points", "time_mask")
        seqcount = tabulate(seqf, metadata_mapping, False)
        sample_name_array = np.array(metadata_mapping["#SampleID"])

        ngenes = len(seqcount)
        nsamples = len(sample_name_array)
        # Total number of (sequence, sample) entries across all sequences.
        nobs = sum(len(abundance_dict)
                   for abundance_dict in seqcount.values())

        timeseriesdb.resize_data(ngenes, nsamples, nobs)
        timeseriesdb.insert_array_by_chunks("samples/names",
                                            sample_name_array)
        timeseriesdb.insert_array_by_chunks("samples/time",
                                            metadata_mapping["time_points"],
                                            transform_func=float)
        timeseriesdb.insert_array_by_chunks("samples/mask",
                                            metadata_mapping["time_mask"])

        write_csr(timeseriesdb, seqcount, outseqf, sample_name_array)

    # Check the table in the TimeSeriesData object
    true_data = [[2.0, 1.0, 3.0, 4.0],
                 [1.0, 1.0, 1.0, 1.0],
                 [0.0, 0.0, 1.0, 0.0],
                 [0.0, 2.0, 0.0, 0.0]]

    generated_data = timeseriesdb.get_sparse_matrix().todense()
    # Note: the inner lists have a specific (temporal) ordering, but the
    #       order of those lists (i.e., the order of the sequences) is
    #       not important.
    # NOTE(review): if generated_data is a NumPy matrix, `in` evaluates an
    #       element-wise comparison reduced with any() rather than whole-row
    #       membership -- confirm this check is as strict as intended.
    for data in true_data:
        assert data in generated_data

    del timeseriesdb

    # Check the output FASTA file
    assert compare_fasta_unordered(unique_fasta_path,
                                   "tests/data/seq.unique.fasta")

Example #7

0

Show file

File: test_tabulate.py Project: z2e2/ananke

def test_tabulate_size_labels():
    """Check that a FASTA file with size labels is properly tabulated.

    The expected result maps each unique sequence to a dictionary of
    per-sample counts.
    """
    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", "time_mask")
    # The context manager closes the FASTA handle even if tabulate raises.
    with open("./tests/data/sequences_size.fasta") as seqf:
        seqcount = tabulate(seqf, metadata_mapping, True)

    true_data = {'ATGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
                 {'15': 4, '72c': 2, 'qpa': 3, '29': 1},
                 'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
                 {'15': 1, '72c': 1, 'qpa': 1, '29': 1},
                 'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCT':
                 {'qpa': 1},
                 'GATGTCGTAGCTGTAGCTAGCTAGCTAGCTGCTG':
                 {'29': 2}}
    assert seqcount == true_data

Example #8

0

Show file

File: test_tabulate.py Project: z2e2/ananke

def test_tabulate_unordered():
    """Check that a FASTA file with sequences in random order is properly
    tabulated.

    "Random order" means that sequences next to one another do not
    necessarily belong to the same sample/time point. The expected result
    maps each unique sequence to a dictionary of per-sample counts.
    """
    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", "time_mask")
    # The context manager closes the FASTA handle even if tabulate raises.
    with open("./tests/data/sequences_unordered.fasta") as seqf:
        seqcount = tabulate(seqf, metadata_mapping, False)

    true_data = {'ATGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
                 {'15': 4, '72c': 2, 'qpa': 3, '29': 1},
                 'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
                 {'15': 1, '72c': 1, 'qpa': 1, '29': 1},
                 'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCT':
                 {'qpa': 1},
                 'GATGTCGTAGCTGTAGCTAGCTAGCTAGCTGCTG':
                 {'29': 2}}
    assert seqcount == true_data

Example #9

0

Show file

File: test_tabulate.py Project: z2e2/ananke

def test_read_metadata_no_hash():
    """Check that read_metadata raises KeyError when the metadata file does
    not start with a hash."""
    # The return value is irrelevant: the call is expected to raise.
    with pytest.raises(KeyError):
        read_metadata("./tests/data/metadata_mapping_nohash.txt",
                      "time_points", None)