Ejemplo n.º 1
0
def test_tabulate_wrapped_FASTA():
    """Test that the proper exception is raised if a wrapped FASTA is
    provided. Input FASTA must be one line for label, one line for sequence"""
    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", "time_mask")
    # The context manager closes the input file on every path, including
    # when the expected AssertionError is not raised and the test fails.
    with open("./tests/data/sequences_wrapped.fasta") as seqf:
        with pytest.raises(AssertionError):
            tabulate(seqf, metadata_mapping, False)
Ejemplo n.º 2
0
def test_tabulate_skipped_sample_warning():
    """Test that a warning is provided if sequences in the FASTA file are not
    used because there is no corresponding sample in the metadata mapping"""
    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", "time_mask")
    # The context manager closes the input file on every path, including
    # when no UserWarning is emitted and the test fails.
    with open("./tests/data/sequences_extra_sample.fasta") as seqf:
        with pytest.warns(UserWarning):
            tabulate(seqf, metadata_mapping, False)
Ejemplo n.º 3
0
def test_write_csr(temp_dir):
    """Test of the write_csr function, and verify it produces a proper FASTA
    file output, and Ananke TimeSeriesData object.
    Validate the CSR data in the TimeSeriesData object.

    temp_dir is the path of a writable scratch directory; the test creates
    "ananke.h5" and "seq.unique.fasta" inside it (presumably supplied by a
    pytest fixture -- confirm against conftest).
    """
    timeseriesdb = TimeSeriesData(temp_dir + "/ananke.h5")

    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", "time_mask")
    # Scope the input handle to the only call that reads from it, so it is
    # closed even if tabulate raises.
    with open("./tests/data/sequences_no_size.fasta") as seqf:
        seqcount = tabulate(seqf, metadata_mapping, False)
    sample_name_array = np.array(metadata_mapping["#SampleID"])

    ngenes = len(seqcount)
    nsamples = len(sample_name_array)
    # Total number of (sequence, sample) entries across all sequences
    nobs = sum(len(abundance_dict) for abundance_dict in seqcount.values())

    timeseriesdb.resize_data(ngenes, nsamples, nobs)
    timeseriesdb.insert_array_by_chunks("samples/names", sample_name_array)
    timeseriesdb.insert_array_by_chunks("samples/time",
                                        metadata_mapping["time_points"],
                                        transform_func=float)

    timeseriesdb.insert_array_by_chunks("samples/mask",
                                        metadata_mapping["time_mask"])

    # The output handle must be closed (and therefore flushed) before the
    # FASTA file is compared below; the with-block guarantees that, and
    # also closes it if write_csr raises.
    with open(temp_dir + "/seq.unique.fasta", 'w') as outseqf:
        write_csr(timeseriesdb, seqcount, outseqf, sample_name_array)

    # Check the table in the TimeSeriesData object
    true_data = [[2.0, 1.0, 3.0, 4.0],
                 [1.0, 1.0, 1.0, 1.0],
                 [0.0, 0.0, 1.0, 0.0],
                 [0.0, 2.0, 0.0, 0.0]]

    # Convert the dense matrix to a plain list of rows before testing
    # membership. `row in <numpy matrix>` evaluates `(matrix == row).any()`,
    # which is true as soon as a single element matches, so it would not
    # actually verify that the whole row is present.
    generated_rows = np.asarray(
        timeseriesdb.get_sparse_matrix().todense()).tolist()
    # Note: the inner lists have a specific ordering (temporal)
    #       but the order of those lists (i.e., the order of the sequences)
    #       is not important
    for data in true_data:
        assert data in generated_rows

    # Drop the reference to the TimeSeriesData object before the final
    # check (presumably to release the underlying file -- confirm).
    del timeseriesdb

    # Check the output FASTA file
    assert compare_fasta_unordered(temp_dir + "/seq.unique.fasta",
                                   "tests/data/seq.unique.fasta")
Ejemplo n.º 4
0
def test_tabulate_size_labels():
    """Test that a FASTA file with size labels is properly tabulated"""
    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", "time_mask")
    # The context manager closes the input file even if tabulate raises.
    with open("./tests/data/sequences_size.fasta") as seqf:
        seqcount = tabulate(seqf, metadata_mapping, True)

    # Expected result: sequence -> {sample ID: count}
    true_data = {'ATGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
                 {'15': 4, '72c': 2, 'qpa': 3, '29': 1},
                 'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
                 {'15': 1, '72c': 1, 'qpa': 1, '29': 1},
                 'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCT':
                 {'qpa': 1},
                 'GATGTCGTAGCTGTAGCTAGCTAGCTAGCTGCTG':
                 {'29': 2}}
    assert seqcount == true_data
Ejemplo n.º 5
0
def test_tabulate_unordered():
    """Test that a FASTA file with sequences presented in a random order
    (i.e., sequences next to one another don't necessarily belong to the same
    sample/time point) is properly tabulated"""
    metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                     "time_points", "time_mask")
    # The context manager closes the input file even if tabulate raises.
    with open("./tests/data/sequences_unordered.fasta") as seqf:
        seqcount = tabulate(seqf, metadata_mapping, False)

    # Expected result: sequence -> {sample ID: count}
    true_data = {'ATGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
                 {'15': 4, '72c': 2, 'qpa': 3, '29': 1},
                 'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
                 {'15': 1, '72c': 1, 'qpa': 1, '29': 1},
                 'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCT':
                 {'qpa': 1},
                 'GATGTCGTAGCTGTAGCTAGCTAGCTAGCTGCTG':
                 {'29': 2}}
    assert seqcount == true_data