Example #1
0
def run(**kwargs):
    """
    Run embedding protocol

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        remapped_sequences_file: Where sequences live
        prefix: Output prefix for all generated files
        protocol: Which embedder to use
        mapping_file: the mapping file generated by the pipeline when remapping indexes
        stage_name: The stage name
        half_precision*: Compute and store embeddings as half-precision floats

    Returns
    -------
    Dictionary with results of stage
    """
    check_required(
        kwargs,
        [
            "protocol", "prefix", "stage_name", "remapped_sequences_file",
            "mapping_file"
        ],
    )

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(PROTOCOLS.keys())))

    embedder_class = PROTOCOLS[kwargs["protocol"]]

    # UniRep ignores device selection, so silently accepting `use_cpu`
    # would mislead the user; reject it explicitly.
    if embedder_class == UniRepEmbedder and kwargs.get("use_cpu") is not None:
        raise InvalidParameterError(
            "UniRep does not support configuring `use_cpu`")

    # Work on a copy: the incoming kwargs double as the stage's input config.
    result_kwargs = deepcopy(kwargs)

    # Download necessary files if needed
    # noinspection PyProtectedMember
    for file in embedder_class._necessary_files:
        if not result_kwargs.get(file):
            result_kwargs[file] = get_model_file(model=embedder_class.name,
                                                 file=file)

    # noinspection PyProtectedMember
    for directory in embedder_class._necessary_directories:
        if not result_kwargs.get(directory):
            result_kwargs[directory] = get_model_directories_from_zip(
                model=embedder_class.name, directory=directory)

    # Per-protocol default batch cap (see DEFAULT_MAX_AMINO_ACIDS).
    result_kwargs.setdefault("max_amino_acids",
                             DEFAULT_MAX_AMINO_ACIDS[kwargs["protocol"]])

    file_manager = get_file_manager(**kwargs)
    embedder: EmbedderInterface = embedder_class(**result_kwargs)
    _check_transform_embeddings_function(embedder, result_kwargs)

    return embed_and_write_batched(embedder, file_manager, result_kwargs,
                                   kwargs.get("half_precision", False))
Example #2
0
def run(**kwargs):
    """
    Run project protocol

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        reduced_embeddings_file: Where per-protein embeddings live
        prefix: Output prefix for all generated files
        stage_name: The stage name
        protocol: Which projection technique to use
        mapping_file: the mapping file generated by the pipeline when remapping indexes

    Returns
    -------
    Dictionary with results of stage
    """
    check_required(kwargs, [
        'protocol', 'prefix', 'stage_name', 'reduced_embeddings_file',
        'mapping_file'
    ])

    protocol = kwargs["protocol"]
    if protocol not in PROTOCOLS:
        valid_protocols = ", ".join(PROTOCOLS.keys())
        raise InvalidParameterError(
            f"Invalid protocol selection: {protocol}. "
            f"Valid protocols are: {valid_protocols}")

    # Dispatch to the selected projection implementation.
    return PROTOCOLS[protocol](**kwargs)
Example #3
0
def run(**kwargs):
    """
    Run visualize protocol

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        projected_embeddings_file: A csv with columns: (index), original_id, x, y, z
        prefix: Output prefix for all generated files
        stage_name: The stage name
        protocol: Which plot to generate

    Returns
    -------
    Dictionary with results of stage
    """
    check_required(kwargs, ['protocol', 'prefix', 'stage_name', 'projected_embeddings_file'])

    protocol = kwargs["protocol"]
    if protocol not in PROTOCOLS:
        valid_protocols = ", ".join(PROTOCOLS.keys())
        raise InvalidParameterError(
            f"Invalid protocol selection: {protocol}. "
            f"Valid protocols are: {valid_protocols}")

    # Hand all parameters to the selected plotting protocol.
    return PROTOCOLS[protocol](**kwargs)
Example #4
0
def run(**kwargs):
    """
    Run visualize protocol

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        projected_reduced_embeddings_file: A csv with columns: (index), original_id, x, y, z
        prefix: Output prefix for all generated files
        stage_name: The stage name
        protocol: Which plot to generate

    For plotly:
        projected_reduced_embeddings_file: The projected (dimensionality reduced) embeddings, normally coming from the project stage
        annotation_file: csv file with annotations
        display_unknown: Hide proteins for which there is no annotation in the annotation file (only relevant if annotation file is provided)
        merge_via_index: Set to True if in annotation_file identifiers correspond to sequence MD5 hashes
        n_components: 2D vs 3D plot

    For plot_mutagenesis:
        residue_probabilities_file: The csv with the probabilities, normally coming from the mutagenesis stage

    Returns
    -------
    Dictionary with results of stage
    """
    check_required(kwargs, ['protocol', 'prefix', 'stage_name'])

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: " +
            "{}. Valid protocols are: {}".format(kwargs["protocol"], ", ".join(
                PROTOCOLS.keys())))

    result_kwargs = deepcopy(kwargs)

    if kwargs["protocol"] == "plotly":
        # Support legacy projected_embeddings_file
        projected_reduced_embeddings_file = (
            kwargs.get("projected_reduced_embeddings_file")
            or kwargs.get("projected_embeddings_file"))
        if not projected_reduced_embeddings_file:
            # Fix: the old message also offered `reduced_embeddings_file`,
            # but only the two keys read above are actually accepted here.
            raise InvalidParameterError(
                f"You need to provide either projected_reduced_embeddings_file or "
                f"projected_embeddings_file for {kwargs['protocol']}")
        result_kwargs[
            "projected_reduced_embeddings_file"] = projected_reduced_embeddings_file

    return PROTOCOLS[kwargs["protocol"]](result_kwargs)
Example #5
0
def run(**kwargs):
    """
    Run project protocol

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        projected_reduced_embeddings_file or projected_embeddings_file or reduced_embeddings_file: Where per-protein embeddings live
        prefix: Output prefix for all generated files
        stage_name: The stage name
        protocol: Which projection technique to use
        mapping_file: the mapping file generated by the pipeline when remapping indexes

    Returns
    -------
    Dictionary with results of stage
    """
    check_required(kwargs, ["protocol", "prefix", "stage_name", "mapping_file"])

    protocol = kwargs["protocol"]
    if protocol not in PROTOCOLS:
        valid_protocols = ", ".join(PROTOCOLS.keys())
        raise InvalidParameterError(
            f"Invalid protocol selection: {protocol}. "
            f"Valid protocols are: {valid_protocols}"
        )

    result_kwargs = deepcopy(kwargs)

    # Protocols may be chained (e.g. tucker followed by umap), so previously
    # projected embeddings are accepted as input alongside reduced ones.
    embeddings_input_file = None
    for input_key in ("projected_reduced_embeddings_file",
                      "projected_embeddings_file",
                      "reduced_embeddings_file"):
        candidate = kwargs.get(input_key)
        if candidate:
            embeddings_input_file = candidate
            break

    if not embeddings_input_file:
        raise InvalidParameterError(
            f"You need to provide either projected_reduced_embeddings_file or projected_embeddings_file or "
            f"reduced_embeddings_file for {protocol}"
        )
    result_kwargs["reduced_embeddings_file"] = embeddings_input_file

    file_manager = get_file_manager(**kwargs)

    # The protocol implementation mutates/extends the stage parameters.
    return PROTOCOLS[protocol](file_manager, result_kwargs)
Example #6
0
def run(**kwargs):
    """
    Run visualize protocol

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        projected_reduced_embeddings_file: A csv with columns: (index), original_id, x, y, z
        prefix: Output prefix for all generated files
        stage_name: The stage name
        protocol: Which plot to generate

    Returns
    -------
    Dictionary with results of stage
    """
    check_required(kwargs, ['protocol', 'prefix', 'stage_name'])

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: " +
            "{}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(PROTOCOLS.keys())
            )
        )

    result_kwargs = deepcopy(kwargs)

    # Support legacy projected_embeddings_file
    projected_reduced_embeddings_file = (
        kwargs.get("projected_reduced_embeddings_file")
        or kwargs.get("projected_embeddings_file")
    )
    if not projected_reduced_embeddings_file:
        # Fix: the old message also offered `reduced_embeddings_file`,
        # but only the two keys read above are actually accepted here.
        raise InvalidParameterError(
            f"You need to provide either projected_reduced_embeddings_file or "
            f"projected_embeddings_file for {kwargs['protocol']}"
        )
    result_kwargs["projected_reduced_embeddings_file"] = projected_reduced_embeddings_file

    return PROTOCOLS[kwargs["protocol"]](result_kwargs)
Example #7
0
def plot_mutagenesis(result_kwargs):
    """BETA: visualize in-silico mutagenesis as a heatmap with plotly

    mandatory:
    * residue_probabilities_file

    Writes one standalone html plot per protein into the stage directory and
    returns the (unmodified) stage parameters.
    """
    required_kwargs = [
        "protocol",
        "prefix",
        "stage_name",
        "residue_probabilities_file",
    ]
    check_required(result_kwargs, required_kwargs)
    file_manager = get_file_manager()
    file_manager.create_stage(result_kwargs["prefix"], result_kwargs["stage_name"])

    probabilities_all = pandas.read_csv(result_kwargs["residue_probabilities_file"])
    # `assert` is stripped under `python -O`, so validate user input with an
    # explicit exception instead of an assertion.
    if list(probabilities_all.columns) != PROBABILITIES_COLUMNS:
        raise ValueError(
            f"probabilities file is expected to have the following columns: {PROBABILITIES_COLUMNS}"
        )
    # Number of distinct proteins, used only for the progress bar total.
    number_of_proteins = probabilities_all["id"].nunique()

    for sequence_id, probabilities in tqdm(
        probabilities_all.groupby("id"), total=number_of_proteins
    ):
        fig = plot(probabilities)
        plotly.offline.plot(
            fig,
            filename=file_manager.create_file(
                result_kwargs.get("prefix"),
                result_kwargs.get("stage_name"),
                sequence_id,
                extension=".html",
            ),
        )

    return result_kwargs
Example #8
0
def execute_pipeline_from_config(config: Dict,
                                 post_stage: Callable[[Dict],
                                                      None] = _null_function,
                                 **kwargs) -> Dict:
    """Run every stage of the pipeline described by ``config``.

    Parameters
    ----------
    config:
        Parsed pipeline configuration. Must contain a ``global`` section with
        at least ``prefix`` and ``sequences_file``; every other top-level key
        is treated as a stage definition. The dict is mutated: each stage's
        entry is replaced by that stage's output parameters and ``global`` is
        re-inserted at the end.
    post_stage:
        Callback invoked with each stage's output parameters after the stage
        finishes; defaults to a no-op.
    kwargs:
        ``overwrite``: reuse an existing prefix directory instead of failing.

    Returns
    -------
    The mutated config, with stage entries replaced by stage outputs.

    Raises
    ------
    FileExistsError
        If the prefix already exists and ``overwrite`` was not set.

    Note: a failing stage prints an error-report URL and calls
    ``sys.exit(1)`` rather than raising.
    """
    # Keep a pristine copy: `config` is mutated below (pop + stage rewrites).
    original_config = deepcopy(config)

    check_required(config, ["global"])

    # !! pop = remove from config!
    global_parameters = config.pop('global')

    check_required(global_parameters, ["prefix", "sequences_file"])

    file_manager = get_file_manager(**global_parameters)

    # Make sure prefix exists
    prefix = global_parameters['prefix']

    # If prefix already exists
    if file_manager.exists(prefix):
        if not kwargs.get('overwrite'):
            raise FileExistsError(
                "The prefix already exists & no overwrite option has been set.\n"
                "Either set --overwrite, or move data from the prefix.\n"
                "Prefix: {}".format(prefix))
    else:
        # create the prefix
        file_manager.create_prefix(prefix)

    # Copy original config to prefix
    global_in = file_manager.create_file(prefix,
                                         None,
                                         _IN_CONFIG_NAME,
                                         extension='.yml')
    write_config_file(global_in, original_config)

    # This downloads sequences_file if required
    download_files_for_stage(global_parameters, file_manager, prefix)

    global_parameters = _process_fasta_file(**global_parameters)

    for stage_name in config:
        stage_parameters = config[stage_name]
        # Snapshot before mutation -- used for the error report below.
        original_stage_parameters = dict(**stage_parameters)

        check_required(stage_parameters, ["protocol", "type"])

        stage_type = stage_parameters['type']
        stage_runnable = _STAGES.get(stage_type)

        if not stage_runnable:
            raise Exception(
                "No type defined, or invalid stage type defined: {}".format(
                    stage_type))

        # Prepare to run stage
        stage_parameters['stage_name'] = stage_name
        file_manager.create_stage(prefix, stage_name)

        stage_parameters = download_files_for_stage(stage_parameters,
                                                    file_manager, prefix,
                                                    stage_name)

        stage_dependency = stage_parameters.get('depends_on')

        if stage_dependency:
            if stage_dependency not in config:
                raise Exception(
                    "Stage {} depends on {}, but dependency not found in config."
                    .format(stage_name, stage_dependency))

            # Later dicts win: stage's own values override the dependency's
            # outputs, which override globals.
            stage_dependency_parameters = config.get(stage_dependency)
            stage_parameters = {
                **global_parameters,
                **stage_dependency_parameters,
                **stage_parameters
            }
        else:
            stage_parameters = {**global_parameters, **stage_parameters}

        # Register start time
        start_time = datetime.now().astimezone()
        stage_parameters['start_time'] = str(start_time)

        stage_in = file_manager.create_file(prefix,
                                            stage_name,
                                            _IN_CONFIG_NAME,
                                            extension='.yml')
        write_config_file(stage_in, stage_parameters)

        try:
            stage_output_parameters = stage_runnable(**stage_parameters)
        except Exception as e:
            # Tell the user which stage failed and show an url to report an error on github
            try:
                version = importlib_metadata.version("bio_embeddings")
            except PackageNotFoundError:
                version = "unknown"

            # Make a github flavored markdown table; the header is in the template
            parameter_table = "\n".join(
                f"{key}|{value}"
                for key, value in original_stage_parameters.items())
            params = {
                # https://stackoverflow.com/a/35498685/3549270
                "title":
                f"Protocol {original_stage_parameters['protocol']}: {type(e).__name__}: {e}",
                "body":
                _ERROR_REPORTING_TEMPLATE.format(
                    version,
                    torch.cuda.is_available(),
                    parameter_table,
                    traceback.format_exc(10),
                ),
            }
            print(traceback.format_exc(), file=sys.stderr)
            print(
                f"Consider reporting this error at this url: {_ISSUE_URL}?{urllib.parse.urlencode(params)}\n\n"
                f"Stage {stage_name} failed.",
                file=sys.stderr,
            )

            sys.exit(1)

        # Register end time
        end_time = datetime.now().astimezone()
        stage_output_parameters['end_time'] = str(end_time)

        # Register elapsed time
        stage_output_parameters['elapsed_time'] = str(end_time - start_time)

        stage_out = file_manager.create_file(prefix,
                                             stage_name,
                                             _OUT_CONFIG_NAME,
                                             extension='.yml')
        write_config_file(stage_out, stage_output_parameters)

        # Store in global_out config for later retrieval (e.g. depends_on)
        config[stage_name] = stage_output_parameters

        # Execute post-stage function, if provided
        post_stage(stage_output_parameters)

    config['global'] = global_parameters

    try:
        config['global']['version'] = importlib_metadata.version(
            "bio_embeddings")
    except PackageNotFoundError:
        pass  # :(

    global_out = file_manager.create_file(prefix,
                                          None,
                                          _OUT_CONFIG_NAME,
                                          extension='.yml')
    write_config_file(global_out, config)

    return config
Example #9
0
def prepare_kwargs(**kwargs):
    """Validate embed-stage parameters and resolve the embedder class.

    Checks required parameters, rejects unknown protocols (with a hint when
    only the optional extra is missing), warns about unrecognized options and
    returns ``(embedder_class, result_kwargs)``, where ``result_kwargs`` is a
    deep copy of ``kwargs`` with ``max_amino_acids`` defaulted per protocol.

    Raises
    ------
    InvalidParameterError
        On an unknown protocol, a missing extra, or an invalid option
        combination.
    """
    required_kwargs = [
        "protocol",
        "prefix",
        "stage_name",
        "remapped_sequences_file",
        "mapping_file",
    ]
    check_required(kwargs, required_kwargs)

    if kwargs["protocol"] not in name_to_embedder:
        if kwargs["protocol"] in ALL_PROTOCOLS:
            raise InvalidParameterError(
                f"The extra for the protocol {kwargs['protocol']} is missing. "
                "See https://docs.bioembeddings.com/#installation on how to install all extras"
            )
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(name_to_embedder.keys())))

    embedder_class = name_to_embedder[kwargs["protocol"]]

    if kwargs["protocol"] == "unirep" and kwargs.get("use_cpu") is not None:
        raise InvalidParameterError(
            "UniRep does not support configuring `use_cpu`")
    # See parameter_blueprints.yml
    global_options = {"sequences_file", "simple_remapping", "start_time"}
    embed_options = {
        "decoder",
        "device",
        "discard_per_amino_acid_embeddings",
        "half_precision_model",
        "half_precision",
        "max_amino_acids",
        "reduce",
        "type",
    }
    known_parameters = (set(required_kwargs)
                        | global_options
                        | embed_options
                        | set(embedder_class.necessary_files)
                        | set(embedder_class.necessary_directories))
    # Bug fix: `embedder_class` is a class, so comparing it to the string
    # "seqvec" was always False and `model_directory` was wrongly flagged
    # as an unknown option. Compare the protocol name instead.
    if kwargs["protocol"] == "seqvec":
        # We support two ways of configuration for seqvec
        known_parameters.add("model_directory")
    if not set(kwargs) <= known_parameters:
        # Complain louder if the input looks fishier
        for option in set(kwargs) - known_parameters:
            logger.warning(
                f"You set an unknown option for {embedder_class.name}: {option} (value: {kwargs[option]})"
            )

    if kwargs.get("half_precision_model"):
        if kwargs["protocol"] not in [
                "prottrans_t5_bfd", "prottrans_t5_uniref50"
        ]:
            raise InvalidParameterError(
                "`half_precision_model` is only supported with prottrans_t5_bfd and prottrans_t5_uniref50"
            )

        if kwargs.get("half_precision") is False:  # None remains allowed
            raise InvalidParameterError(
                "You can't have `half_precision_model` be true and `half_precision` be false. "
                "We suggest also setting `half_precision` to true, "
                "which will compute and save embeddings as half-precision floats"
            )

    result_kwargs = deepcopy(kwargs)
    result_kwargs.setdefault("max_amino_acids",
                             DEFAULT_MAX_AMINO_ACIDS[kwargs["protocol"]])

    return embedder_class, result_kwargs
Example #10
0
def execute_pipeline_from_config(config: Dict,
                                 post_stage: Callable[[Dict],
                                                      None] = _null_function,
                                 **kwargs) -> Dict:
    """Run every stage of the pipeline described by ``config``.

    Parameters
    ----------
    config:
        Parsed pipeline configuration. Must contain a ``global`` section with
        at least ``prefix`` and ``sequences_file``; every other top-level key
        is treated as a stage definition. The dict is mutated: each stage's
        entry is replaced by that stage's output parameters and ``global`` is
        re-inserted at the end.
    post_stage:
        Callback invoked with each stage's output parameters after the stage
        finishes; defaults to a no-op.
    kwargs:
        ``overwrite``: reuse an existing prefix directory instead of failing.

    Returns
    -------
    The mutated config, with stage entries replaced by stage outputs.

    Raises
    ------
    FileExistsError
        If the prefix already exists and ``overwrite`` was not set.
    """
    # Keep a pristine copy: `config` is mutated below (pop + stage rewrites).
    original_config = deepcopy(config)

    check_required(config, ["global"])

    # !! pop = remove from config!
    global_parameters = config.pop('global')

    check_required(global_parameters, ["prefix", "sequences_file"])

    file_manager = get_file_manager(**global_parameters)

    # Make sure prefix exists
    prefix = global_parameters['prefix']

    # If prefix already exists
    if file_manager.exists(prefix):
        if not kwargs.get('overwrite'):
            raise FileExistsError(
                "The prefix already exists & no overwrite option has been set.\n"
                "Either set --overwrite, or move data from the prefix.\n"
                "Prefix: {}".format(prefix))
    else:
        # create the prefix
        file_manager.create_prefix(prefix)

    # Record the package version for reproducibility; best-effort only.
    try:
        Path(prefix).joinpath("bio_embeddings_version.txt").write_text(
            importlib_metadata.version("bio_embeddings"))
    except PackageNotFoundError:
        pass  # :(

    # Copy original config to prefix
    global_in = file_manager.create_file(prefix,
                                         None,
                                         _IN_CONFIG_NAME,
                                         extension='.yml')
    write_config_file(global_in, original_config)

    global_parameters = _process_fasta_file(**global_parameters)

    for stage_name in config:
        stage_parameters = config[stage_name]

        check_required(stage_parameters, ["protocol", "type"])

        stage_type = stage_parameters['type']
        stage_runnable = _STAGES.get(stage_type)

        if not stage_runnable:
            raise Exception(
                "No type defined, or invalid stage type defined: {}".format(
                    stage_type))

        # Prepare to run stage
        stage_parameters['stage_name'] = stage_name
        file_manager.create_stage(prefix, stage_name)

        stage_dependency = stage_parameters.get('depends_on')

        if stage_dependency:
            if stage_dependency not in config:
                raise Exception(
                    "Stage {} depends on {}, but dependency not found in config."
                    .format(stage_name, stage_dependency))

            # Later dicts win: stage's own values override the dependency's
            # outputs, which override globals.
            stage_dependency_parameters = config.get(stage_dependency)
            stage_parameters = {
                **global_parameters,
                **stage_dependency_parameters,
                **stage_parameters
            }
        else:
            stage_parameters = {**global_parameters, **stage_parameters}

        # Register start time
        start_time = datetime.now().astimezone()
        stage_parameters['start_time'] = str(start_time)

        stage_in = file_manager.create_file(prefix,
                                            stage_name,
                                            _IN_CONFIG_NAME,
                                            extension='.yml')
        write_config_file(stage_in, stage_parameters)

        stage_output_parameters = stage_runnable(**stage_parameters)

        # Register end time
        end_time = datetime.now().astimezone()
        stage_output_parameters['end_time'] = str(end_time)

        # Register elapsed time
        stage_output_parameters['elapsed_time'] = str(end_time - start_time)

        stage_out = file_manager.create_file(prefix,
                                             stage_name,
                                             _OUT_CONFIG_NAME,
                                             extension='.yml')
        write_config_file(stage_out, stage_output_parameters)

        # Store in global_out config for later retrieval (e.g. depends_on)
        config[stage_name] = stage_output_parameters

        # Execute post-stage function, if provided
        post_stage(stage_output_parameters)

    config['global'] = global_parameters
    global_out = file_manager.create_file(prefix,
                                          None,
                                          _OUT_CONFIG_NAME,
                                          extension='.yml')
    write_config_file(global_out, config)

    return config
Example #11
0
def run(**kwargs):
    """BETA: in-silico mutagenesis using BertForMaskedLM

    mandatory:
     * protocol
     * prefix
     * stage_name
     * remapped_sequences_file
     * mapping_file

    optional (see extract stage for details):
     * model_directory
     * device
     * half_precision
     * half_precision_model
     * temperature: temperature for softmax

    Returns the input kwargs plus `residue_probabilities_file`, a csv with
    per-residue substitution probabilities for every input sequence.
    """
    required_kwargs = [
        "protocol",
        "prefix",
        "stage_name",
        "remapped_sequences_file",
        "mapping_file",
    ]
    check_required(kwargs, required_kwargs)
    result_kwargs = deepcopy(kwargs)
    if result_kwargs["protocol"] not in _PROTOCOLS:
        raise RuntimeError(
            f"Passed protocol {result_kwargs['protocol']}, but allowed are: {', '.join(_PROTOCOLS)}"
        )
    # Softmax temperature; 1 leaves the model's distribution unchanged.
    temperature = result_kwargs.setdefault("temperature", 1)
    device = get_device(result_kwargs.get("device"))
    model_class: Type[ProtTransBertBFDMutagenesis] = _PROTOCOLS[
        result_kwargs["protocol"]
    ]
    model = model_class(
        device,
        result_kwargs.get("model_directory"),
        result_kwargs.get("half_precision_model"),
    )

    file_manager = get_file_manager()
    file_manager.create_stage(result_kwargs["prefix"], result_kwargs["stage_name"])

    # The mapping file contains the corresponding ids in the same order
    sequences = [
        str(entry.seq)
        for entry in SeqIO.parse(result_kwargs["remapped_sequences_file"], "fasta")
    ]
    mapping_file = read_mapping_file(result_kwargs["mapping_file"])

    probabilities_all = dict()
    # Progress is tracked per residue, hence the summed sequence lengths.
    with tqdm(total=int(mapping_file["sequence_length"].sum())) as progress_bar:
        for sequence_id, original_id, sequence in zip(
            mapping_file.index, mapping_file["original_id"], sequences
        ):
            # Inference only; disable autograd bookkeeping.
            with torch.no_grad():
                probabilities = model.get_sequence_probabilities(
                    sequence, temperature, progress_bar=progress_bar
                )

            # Sanity check: each position's entries (minus the bookkeeping
            # "position" key) form a probability distribution.
            for p in probabilities:
                assert math.isclose(
                    1, (sum(p.values()) - p["position"]), rel_tol=1e-6
                ), "softmax values should add up to 1"

            probabilities_all[sequence_id] = probabilities
    residue_probabilities = probabilities_as_dataframe(
        mapping_file, probabilities_all, sequences
    )

    probabilities_file = file_manager.create_file(
        result_kwargs.get("prefix"),
        result_kwargs.get("stage_name"),
        "residue_probabilities_file",
        extension=".csv",
    )
    residue_probabilities.to_csv(probabilities_file, index=False)
    result_kwargs["residue_probabilities_file"] = probabilities_file
    return result_kwargs