def test_pipeline_global(tmp_path):
    """Run the pipeline with only global options and verify the files it writes.

    The version file is only produced (and only checkable) for a dev
    install of bio_embeddings, so that part is conditional.
    """
    prefix = Path(tmp_path) / "prefix"
    config = {
        "global": {
            "prefix": str(prefix),
            "sequences_file": "test-data/seqwence-protein.fasta",
        }
    }
    execute_pipeline_from_config(config)

    expected_files = [
        "input_parameters_file.yml",
        "mapping_file.csv",
        "ouput_parameters_file.yml",
        "remapped_sequences_file.fasta",
        "sequences_file.fasta",
    ]

    try:
        importlib_metadata.version("bio_embeddings")
    except importlib_metadata.PackageNotFoundError:
        pass  # No dev install
    else:
        # Dev install: the pipeline also records its own version; it must
        # match the version declared in pyproject.toml.
        actual = (prefix / "bio_embeddings_version.txt").read_text()
        pyproject = toml.loads(Path("pyproject.toml").read_text())
        expected = pyproject["tool"]["poetry"]["version"]
        assert actual == expected
        expected_files.append("bio_embeddings_version.txt")

    assert sorted(expected_files) == sorted(entry.name for entry in prefix.iterdir())
# Example #2
def run_pipeline(job_identifier: str, sequences: List[Tuple[str, str]],
                 pipeline_type: str):
    """Run the configured pipeline type on *sequences*, storing selected
    result files to the database after each stage.
    """
    from bio_embeddings.utilities.pipeline import execute_pipeline_from_config

    config = deepcopy(_CONFIGS[pipeline_type])

    def _post_stage_save(stage_out_config):
        # After every stage, persist each known result file it produced.
        for file_name in _FILES_TO_STORE:
            stage_file = stage_out_config.get(file_name)
            if stage_file:
                logger.info(f"Copying {file_name} to database.")
                write_file(job_identifier, file_name, stage_file)

    with TemporaryDirectory() as workdir:
        workdir_path = Path(workdir)
        fasta_path = workdir_path / "sequences.fasta"
        with fasta_path.open("w") as fp:
            fp.writelines(f">{seq_id}\n{sequence}\n"
                          for seq_id, sequence in sequences)

        # Add last job details
        config['global']['prefix'] = str(workdir_path / "bio_embeddings_job")
        config['global']['sequences_file'] = str(fasta_path)

        logger.info("------ Starting pipeline execution...")
        execute_pipeline_from_config(config, post_stage=_post_stage_save)
        logger.info("------ Finished pipeline execution.")
def test_pipeline_global(tmp_path):
    """Run the pipeline with only global options; assumes a dev install."""
    prefix = Path(tmp_path) / "prefix"
    config = {
        "global": {
            "prefix": str(prefix),
            "sequences_file": "test-data/seqwence-protein.fasta",
        }
    }
    execute_pipeline_from_config(config)

    # The recorded version must agree with pyproject.toml.
    actual = (prefix / "bio_embeddings_version.txt").read_text()
    pyproject = toml.loads(Path("pyproject.toml").read_text())
    expected = pyproject["tool"]["poetry"]["version"]
    assert actual == expected

    expected_files = [
        "bio_embeddings_version.txt",
        "input_parameters_file.yml",
        "mapping_file.csv",
        "ouput_parameters_file.yml",
        "remapped_sequences_file.fasta",
        "sequences_file.fasta",
    ]
    assert expected_files == sorted(entry.name for entry in prefix.iterdir())
def test_wrong_model_param(pytestconfig, tmp_path: Path, caplog):
    """In this config, the protocol esm1b is chosen, but instead of a model_file a model_directory for T5 is given"""
    root = pytestconfig.rootpath
    pipeline_config = read_config_file(
        str(root.joinpath("test-data/embed_config_mixup.yml"))
    )
    global_config = pipeline_config["global"]
    # Resolve the relative paths from the config against the repo / tmp dir.
    global_config["sequences_file"] = str(
        root / "test-data" / global_config["sequences_file"]
    )
    global_config["prefix"] = str(tmp_path / global_config["prefix"])

    embedder_patch = mock.patch(
        "bio_embeddings.embed.pipeline.name_to_embedder", {"esm1b": MockESM1bEmbedder}
    )
    model_file_patch = mock.patch(
        "bio_embeddings.embed.embedder_interfaces.get_model_file",
        return_value="/dev/null",
    )
    with embedder_patch, model_file_patch:
        execute_pipeline_from_config(pipeline_config)

    # The pipeline should warn about the mismatched option exactly once.
    assert caplog.messages == [
        "You set an unknown option for esm1b: model_directory (value: /mnt/project/bio_embeddings/models/lms/t5)"
    ]
# Example #5
def test_esm1v_missing_ensemble_id(pytestconfig, tmp_path: Path):
    """Omitting `ensemble_id` for esm1v must abort with a helpful error."""
    pipeline_config = read_and_patch_config(
        pytestconfig, tmp_path, "test-data/esm1v_missing_ensemble_id.yml"
    )
    expected_message = re.escape(
        "You must set `ensemble_id` to select which of the five models you want to use [1-5]"
    )
    with pytest.raises(InvalidParameterError, match=expected_message):
        execute_pipeline_from_config(pipeline_config)
# Example #6
def test_pipeline_global(tmp_path):
    """Run the pipeline with only global options and verify files and version.

    The version checks only apply to a dev install of bio_embeddings.
    """
    prefix = Path(tmp_path) / "prefix"
    out_config = execute_pipeline_from_config(
        {
            "global": {
                "prefix": str(prefix),
                "sequences_file": "test-data/seqwence-protein.fasta",
            }
        }
    )

    try:
        installed = importlib_metadata.version("bio_embeddings")
    except importlib_metadata.PackageNotFoundError:
        pass  # No dev install
    else:
        installed_version = version.parse(installed)
        pyproject = toml.loads(Path("pyproject.toml").read_text())
        expected = version.parse(pyproject["tool"]["poetry"]["version"])
        # That can actually happen
        assert expected == installed_version, "Please run `poetry install`"
        print(out_config["global"])
        assert version.parse(out_config["global"]["version"]) == expected

    expected_files = [
        "input_parameters_file.yml",
        "mapping_file.csv",
        "output_parameters_file.yml",
        "remapped_sequences_file.fasta",
        "sequences_file.fasta",
    ]

    assert sorted(expected_files) == sorted(entry.name for entry in prefix.iterdir())
# Sampling parameters, with defaults when absent from the config.
max_number_of_sequences = config['global'].get('max_number_of_sequences', 250)
max_len = config['global'].get('max_len', 100)
min_len = config['global'].get('min_len', 50)

# Keep only sequences whose length lies strictly between min_len and max_len.
filtered_sequences = [
    seq_record
    for seq_record in SeqIO.parse(sequence_path, "fasta")
    if max_len > len(seq_record) > min_len
]

# random.sample raises ValueError when asked for more items than are
# available, so cap the sample size at the number of filtered sequences.
sample_size = min(max_number_of_sequences, len(filtered_sequences))
random_sample = random.sample(filtered_sequences, sample_size)

total_aa = sum(len(sequence) for sequence in random_sample)

logger.info(f"Total AA={total_aa}.")
logger.info(f"Total per-AA embedding size={4*total_aa*1024*3*pow(10, -6)}MB")

with TemporaryDirectory() as workdir:
    seq_path = Path(workdir).joinpath("sequences.fasta")

    SeqIO.write(random_sample, seq_path, "fasta")

    # Add sampled sequences
    config['global']['sequences_file'] = str(seq_path)

    logger.info("------ Starting pipeline execution...")
    execute_pipeline_from_config(config)
    logger.info("------ Finished pipeline execution.")