def test_broken_fasta(tmp_path: Path):
    """Ensure that we print a reasonable message when the user feeds in a broken fasta file.

    Unfortunately, we can't detect if the user fed in a markdown file instead
    of a fasta, because we could parse that markdown file as fasta:

    > Following the initial line (used for a unique description of the
    > sequence) was the actual sequence itself in standard one-letter
    > character string. Anything other than a valid character would
    > be ignored (including spaces, tabulators, asterisks, etc...).
    > It was also common to end the sequence with an "*" (asterisk)
    > character (in analogy with use in PIR formatted sequences) and,
    > for the same reason, to leave a blank line between the description
    > and the sequence.

    From https://www.wikiwand.com/en/FASTA_format

    NCBI is a bit stricter with their definition by barring blank lines, but
    is otherwise still extremely lenient.
    (https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp)

    The input here is a numpy archive, i.e. something that is not a fasta
    file at all, and processing it is expected to raise a ValueError.
    """
    input_file = "test-data/embeddings.npz"
    # `match` is used as a regular expression (re.search), so the trailing
    # "?" has to be escaped; unescaped it would merely make the preceding
    # "e" optional and the question mark itself would never be checked.
    with pytest.raises(
        ValueError, match=r"Are you sure this is a valid fasta file\?"
    ):
        _process_fasta_file(
            sequences_file=input_file,
            prefix=str(tmp_path),
        )
def test_missing_fasta(tmp_path: Path):
    """A sequences file that does not exist must surface as FileNotFoundError."""
    # Nothing is ever written to this path, so it cannot exist inside the
    # fresh per-test temporary directory.
    missing_fasta = tmp_path / "non_existant.fasta"
    expect_not_found = pytest.raises(
        FileNotFoundError, match="No such file or directory"
    )
    with expect_not_found:
        _process_fasta_file(prefix=str(tmp_path), sequences_file=missing_fasta)
def test_unparsable_fasta(caplog, pytestconfig, tmp_path: Path):
    """An unparsable fasta file must raise a ValueError naming the file.

    The test also checks that nothing was logged by the time the error
    has been raised.
    """
    import re

    input_file = pytestconfig.rootpath.joinpath("test-data/unparsable.fasta")
    expected_message = (
        f"Could not parse '{input_file}'. Are you sure this is a valid fasta file?"
    )
    # `match` is a regular expression. The message embeds an absolute path
    # (which may contain regex metacharacters such as "+", "(" or Windows
    # backslashes) as well as "." and "?", so escape it to compare literally.
    with pytest.raises(ValueError, match=re.escape(expected_message)):
        _process_fasta_file(sequences_file=str(input_file), prefix=str(tmp_path))
    assert caplog.messages == []
def test_illegal_amino_acids(caplog, pytestconfig, tmp_path: Path):
    """https://github.com/sacdallago/bio_embeddings/issues/54

    Processing the fixture file must log exactly one message: the
    lower-case warning for the entry named 'lowercase'.
    """
    # NOTE(review): another function with this exact name is defined further
    # down in this file; if both live in the same module, the later binding
    # replaces this one and pytest never collects it -- confirm and rename.
    fasta_path = pytestconfig.rootpath / "test-data/illegal_amino_acids.fasta"
    lowercase_warning = (
        f"The entry 'lowercase' in {fasta_path} contains lower "
        "case amino acids. Lower case letters are uninterpretable by most language "
        "models, and their embedding will be nonesensical. Protein LMs available "
        "through bio_embeddings have been trained on upper case, single letter code "
        "sequence representations only "
        "(https://en.wikipedia.org/wiki/Amino_acid#Table_of_standard_amino_acid_abbreviations_and_properties)."
    )

    _process_fasta_file(prefix=str(tmp_path), sequences_file=str(fasta_path))

    assert caplog.messages == [lowercase_warning]
def test_illegal_amino_acids(caplog, tmp_path: Path):
    """ https://github.com/sacdallago/bio_embeddings/issues/54

    The fixture file is expected to make processing fail with a ValueError
    that names the entry 'illegal' and its offending characters, while the
    only logged message is the lower-case warning for the entry 'lowercase'.
    """
    import re

    input_file = "test-data/illegal_amino_acids.fasta"
    expected_error = (
        f"The entry 'illegal' in {input_file} contains the characters 'ä', 'ö', "
        "while only single letter code is allowed"
    )
    # `match` is a regular expression; escape the message so that the "." in
    # the file name is compared literally instead of matching any character.
    with pytest.raises(ValueError, match=re.escape(expected_error)):
        _process_fasta_file(
            sequences_file=input_file,
            prefix=str(tmp_path),
        )
    assert caplog.messages == [
        f"The entry 'lowercase' in {input_file} contains lower "
        "case amino acids. Lower case letters are uninterpretable by most language "
        "models, and their embedding will be nonesensical. Protein LMs available "
        "through bio_embeddings have been trained on upper case, single letter code "
        "sequence representations only "
        "(https://en.wikipedia.org/wiki/Amino_acid#Table_of_standard_amino_acid_abbreviations_and_properties)."
    ]
def test_simple_remapping(tmp_path: Path):
    """ https://github.com/sacdallago/bio_embeddings/issues/50

    Runs fasta processing with `simple_remapping` enabled and feeds the
    returned parameters straight into the batched embedding writer. There
    are no explicit assertions; the test passes if nothing raises.
    """
    # NOTE(review): another function with this exact name is defined further
    # down in this file; if both live in the same module, the later binding
    # replaces this one and pytest never collects it -- confirm and rename.
    processed_parameters = _process_fasta_file(
        sequences_file="test-data/seqwence-protein.fasta",
        prefix=str(tmp_path),
        simple_remapping=True,
    )
    embed_and_write_batched(
        FakeEmbedder(), FileSystemFileManager(), processed_parameters
    )
def test_simple_remapping():
    """ https://github.com/sacdallago/bio_embeddings/issues/50

    Runs fasta processing with `simple_remapping` enabled inside a throwaway
    directory, then passes the returned parameters, extended with two
    embedding options, to the batched embedding writer. There are no
    explicit assertions; the test passes if nothing raises.
    """
    # NOTE(review): an earlier function in this file carries the same name;
    # if both live in the same module, only this later one is collected.
    with TemporaryDirectory() as prefix:
        processed_parameters = _process_fasta_file(
            sequences_file="test-data/seqwence-protein.fasta",
            prefix=prefix,
            simple_remapping=True,
        )

        # Copy before extending so the processed parameters stay untouched.
        embed_parameters = dict(processed_parameters)
        embed_parameters["max_amino_acids"] = 10000
        embed_parameters["discard_per_amino_acid_embeddings"] = False

        embed_and_write_batched(
            FakeEmbedder(), FileSystemFileManager(), embed_parameters
        )