def test_tabulate_wrapped_FASTA():
    """Test that the proper exception is raised if a wrapped FASTA is provided.

    The input FASTA must use one line for the label and one line for the
    sequence; passing a wrapped file to ``tabulate`` is expected to raise
    ``AssertionError``.
    """
    # The context manager guarantees the handle is closed even when the
    # expectation below fails (the original leaked it in that case).
    with open("./tests/data/sequences_wrapped.fasta") as seqf:
        metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                         "time_points", "time_mask")
        # Only the call under test lives inside pytest.raises so that an
        # AssertionError from the setup code cannot satisfy the check.
        with pytest.raises(AssertionError):
            tabulate(seqf, metadata_mapping, False)
def test_tabulate_skipped_sample_warning():
    """Test that a warning is issued for sequences without a matching sample.

    The FASTA fixture contains sequences whose sample has no corresponding
    entry in the metadata mapping; ``tabulate`` is expected to emit a
    ``UserWarning`` for them.
    """
    # The context manager guarantees the handle is closed even when the
    # expected warning is not raised (the original leaked it in that case).
    with open("./tests/data/sequences_extra_sample.fasta") as seqf:
        metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                         "time_points", "time_mask")
        with pytest.warns(UserWarning):
            tabulate(seqf, metadata_mapping, False)
def test_write_csr(temp_dir):
    """Test write_csr: check its FASTA output and the stored CSR data.

    Tabulates the ``sequences_no_size.fasta`` fixture, sizes and fills a
    ``TimeSeriesData`` object created under ``temp_dir``, runs ``write_csr``
    and then validates both the dense form of the stored matrix and the
    unique-sequence FASTA file that was written.

    Args:
        temp_dir: Path (string) of a writable scratch directory; supplied
            by a fixture defined outside this block.
    """
    # Both handles are managed by the with-statement so they are closed even
    # if any of the setup steps or write_csr itself raises.
    with open("./tests/data/sequences_no_size.fasta") as seqf, \
            open(temp_dir + "/seq.unique.fasta", 'w') as outseqf:
        timeseriesdb = TimeSeriesData(temp_dir + "/ananke.h5")
        metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                         "time_points", "time_mask")
        seqcount = tabulate(seqf, metadata_mapping, False)
        sample_name_array = np.array(metadata_mapping["#SampleID"])

        ngenes = len(seqcount)
        nsamples = len(sample_name_array)
        # One observation per (sequence, sample) entry in the tabulation.
        nobs = sum(len(abundance_dict) for abundance_dict in seqcount.values())

        timeseriesdb.resize_data(ngenes, nsamples, nobs)
        timeseriesdb.insert_array_by_chunks("samples/names", sample_name_array)
        timeseriesdb.insert_array_by_chunks("samples/time",
                                            metadata_mapping["time_points"],
                                            transform_func=float)
        timeseriesdb.insert_array_by_chunks("samples/mask",
                                            metadata_mapping["time_mask"])
        write_csr(timeseriesdb, seqcount, outseqf, sample_name_array)

    # Check the table in the TimeSeriesData object
    true_data = [[2.0, 1.0, 3.0, 4.0],
                 [1.0, 1.0, 1.0, 1.0],
                 [0.0, 0.0, 1.0, 0.0],
                 [0.0, 2.0, 0.0, 0.0]]
    generated_data = timeseriesdb.get_sparse_matrix().todense()
    # Convert to plain nested lists before testing membership: `row in array`
    # on a NumPy array/matrix is evaluated as (array == row).any(), which is
    # satisfied by a single matching element and therefore would not verify
    # that a complete row is present.
    generated_rows = np.asarray(generated_data).tolist()
    # Note: the inner lists have a specific ordering (temporal)
    # but the order of those lists (i.e., the order of the sequences)
    # is not important
    for data in true_data:
        assert data in generated_rows
    # NOTE(review): presumably dropping the reference releases the backing
    # ananke.h5 file before the remaining checks — confirm.
    del timeseriesdb

    # Check the output FASTA file
    assert compare_fasta_unordered(temp_dir + "/seq.unique.fasta",
                                   "tests/data/seq.unique.fasta")
def test_tabulate_size_labels():
    """Test that a FASTA file with size labels is properly tabulated.

    ``tabulate`` is called with its third argument set to ``True`` and the
    result is compared against the expected sequence -> {sample: count}
    mapping.
    """
    # The context manager closes the handle even if tabulate raises, which
    # the original open()/close() pair did not.
    with open("./tests/data/sequences_size.fasta") as seqf:
        metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                         "time_points", "time_mask")
        seqcount = tabulate(seqf, metadata_mapping, True)

    true_data = {
        'ATGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
            {'15': 4, '72c': 2, 'qpa': 3, '29': 1},
        'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
            {'15': 1, '72c': 1, 'qpa': 1, '29': 1},
        'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCT':
            {'qpa': 1},
        'GATGTCGTAGCTGTAGCTAGCTAGCTAGCTGCTG':
            {'29': 2},
    }
    assert seqcount == true_data
def test_tabulate_unordered():
    """Test that a FASTA file with randomly ordered sequences is tabulated.

    In the fixture, sequences next to one another do not necessarily belong
    to the same sample/time point; the tabulated counts must still match the
    expected sequence -> {sample: count} mapping.
    """
    # The context manager closes the handle even if tabulate raises, which
    # the original open()/close() pair did not.
    with open("./tests/data/sequences_unordered.fasta") as seqf:
        metadata_mapping = read_metadata("./tests/data/metadata_mapping.txt",
                                         "time_points", "time_mask")
        seqcount = tabulate(seqf, metadata_mapping, False)

    true_data = {
        'ATGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
            {'15': 4, '72c': 2, 'qpa': 3, '29': 1},
        'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCA':
            {'15': 1, '72c': 1, 'qpa': 1, '29': 1},
        'TTGCTATCGATCGATGCATCGATCGATGCTATCGTACGTGGCT':
            {'qpa': 1},
        'GATGTCGTAGCTGTAGCTAGCTAGCTAGCTGCTG':
            {'29': 2},
    }
    assert seqcount == true_data