Beispiel #1
0
def tmbed(**kwargs) -> Dict[str, Any]:
    '''
    Protocol extracts membrane residues from "embeddings_file".
    Embeddings must have been generated with ProtT5-XL-U50.
    '''

    check_required(kwargs, ['embeddings_file', 'remapped_sequences_file'])

    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    for file in TmbedAnnotationExtractor.necessary_files:
        if not result_kwargs.get(file):
            result_kwargs[file] = get_model_file(model='tmbed', file=file)

    tmbed_extractor = TmbedAnnotationExtractor(**result_kwargs)

    # Try to create final file (if this fails, now is better than later)
    membrane_residues_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                       result_kwargs.get('stage_name'),
                                                                       'membrane_residues_predictions_file',
                                                                       extension='.fasta')

    result_kwargs['membrane_residues_predictions_file'] = membrane_residues_predictions_file_path

    tmbed_sequences = []

    with h5py.File(result_kwargs['embeddings_file'], 'r') as embedding_file:
        for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
            embedding = np.array(embedding_file[protein_sequence.id])

            # Add batch dimension (until we support batch processing)
            embedding = embedding[None, ]

            # Sequence lengths (only a single sequence for now)
            lengths = [len(protein_sequence.seq)]

            annotations = tmbed_extractor.get_membrane_residues(embedding, lengths)

            # Gratuitous loop (only a single item for now)
            # Needs to be changed for batch mode to deepcopy different protein sequences
            for annotation in annotations:
                tmbed_sequence = deepcopy(protein_sequence)
                tmbed_sequence.seq = Seq(convert_list_of_enum_to_string(annotation.membrane_residues))

                tmbed_sequences.append(tmbed_sequence)

    # Write file
    write_fasta_file(tmbed_sequences, membrane_residues_predictions_file_path)

    return result_kwargs
Beispiel #2
0
def light_attention(model, **kwargs) -> Dict[str, Any]:
    """
    Protocol extracts subcellular locationfrom "embeddings_file".
    Embeddings can be generated with ProtBert.

    :param model: either "la_protbert" or "la_prott5". Used to download files
    """

    check_required(
        kwargs, ['embeddings_file', 'mapping_file', 'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    for file in LightAttentionAnnotationExtractor.necessary_files:
        if not result_kwargs.get(file):
            result_kwargs[file] = get_model_file(model=model, file=file)

    annotation_extractor = LightAttentionAnnotationExtractor(**result_kwargs)

    # mapping file will be needed for protein-wide annotations
    mapping_file = read_csv(result_kwargs['mapping_file'], index_col=0)

    # Try to create final files (if this fails, now is better than later
    per_sequence_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'per_sequence_predictions_file',
        extension='.csv')
    result_kwargs[
        'per_sequence_predictions_file'] = per_sequence_predictions_file_path

    with h5py.File(result_kwargs['embeddings_file'], 'r') as embedding_file:
        for protein_sequence in read_fasta(
                result_kwargs['remapped_sequences_file']):
            embedding = np.array(embedding_file[protein_sequence.id])

            annotations = annotation_extractor.get_subcellular_location(
                embedding)

            # Per-sequence annotations, e.g. subcell loc & membrane boundness
            mapping_file.at[
                protein_sequence.id,
                'subcellular_location'] = annotations.localization.value
            mapping_file.at[protein_sequence.id,
                            'membrane_or_soluble'] = annotations.membrane.value

    # Write files
    mapping_file.to_csv(per_sequence_predictions_file_path)

    return result_kwargs
Beispiel #3
0
def prott5cons(model: str, **kwargs) -> Dict[str, Any]:
    """
    Protocol extracts conservation from "embeddings_file".
    Embeddings can only be generated with ProtT5-XL-U50.

    :param model: "t5_xl_u50_conservation". Used to download files
    """

    check_required(kwargs, ['embeddings_file', 'mapping_file', 'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    for file in ProtT5consAnnotationExtractor.necessary_files:
        if not result_kwargs.get(file):
            result_kwargs[file] = get_model_file(model=model, file=file)

    annotation_extractor = ProtT5consAnnotationExtractor(**result_kwargs)

    # mapping file will be needed for protein-wide annotations
    mapping_file = read_mapping_file(result_kwargs["mapping_file"])

    # Try to create final files (if this fails, now is better than later
    conservation_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                  result_kwargs.get('stage_name'),
                                                                  'conservation_predictions_file',
                                                                  extension='.fasta')
    result_kwargs['conservation_predictions_file'] = conservation_predictions_file_path
    cons_sequences = list()
    with h5py.File(result_kwargs['embeddings_file'], 'r') as embedding_file:
        for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
            embedding = np.array(embedding_file[protein_sequence.id])

            annotations = annotation_extractor.get_conservation(embedding)
            cons_sequence = deepcopy(protein_sequence)
            cons_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.conservation))
            cons_sequences.append(cons_sequence)

    # Write files
    write_fasta_file(cons_sequences, conservation_predictions_file_path)
    return result_kwargs
Beispiel #4
0
def predict_annotations_using_basic_models(model, **kwargs) -> Dict[str, Any]:
    """
    Protocol extracts secondary structure (DSSP3 and DSSP8), disorder, subcellular location and membrane boundness
    from "embeddings_file". Embeddings can either be generated with SeqVec or ProtBert.
    SeqVec models are used in this publication: https://doi.org/10.1186/s12859-019-3220-8
    ProtTrans models are used in this publication: https://doi.org/10.1101/2020.07.12.199554

    :param model: either "bert_from_publication" or "seqvec_from_publication". Used to download files
    """

    check_required(
        kwargs, ['embeddings_file', 'mapping_file', 'remapped_sequences_file'])
    necessary_files = [
        'secondary_structure_checkpoint_file',
        'subcellular_location_checkpoint_file'
    ]
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    for file in necessary_files:
        if not result_kwargs.get(file):
            result_kwargs[file] = get_model_file(
                model=f'{model}_annotations_extractors', file=file)

    annotation_extractor = BasicAnnotationExtractor(model, **result_kwargs)

    # mapping file will be needed for protein-wide annotations
    mapping_file = read_csv(result_kwargs['mapping_file'], index_col=0)

    # Try to create final files (if this fails, now is better than later
    DSSP3_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'DSSP3_predictions_file',
        extension='.fasta')
    result_kwargs['DSSP3_predictions_file'] = DSSP3_predictions_file_path
    DSSP8_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'DSSP8_predictions_file',
        extension='.fasta')
    result_kwargs['DSSP8_predictions_file'] = DSSP8_predictions_file_path
    disorder_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'disorder_predictions_file',
        extension='.fasta')
    result_kwargs['disorder_predictions_file'] = disorder_predictions_file_path
    per_sequence_predictions_file_path = file_manager.create_file(
        result_kwargs.get('prefix'),
        result_kwargs.get('stage_name'),
        'per_sequence_predictions_file',
        extension='.csv')
    result_kwargs[
        'per_sequence_predictions_file'] = per_sequence_predictions_file_path

    # Create sequence containers
    DSSP3_sequences = list()
    DSSP8_sequences = list()
    disorder_sequences = list()

    with h5py.File(result_kwargs['embeddings_file'], 'r') as embedding_file:
        for protein_sequence in read_fasta(
                result_kwargs['remapped_sequences_file']):

            # Per-AA annotations: DSSP3, DSSP8 and disorder
            embedding = np.array(embedding_file[protein_sequence.id])

            annotations = annotation_extractor.get_annotations(embedding)

            DSSP3_sequence = deepcopy(protein_sequence)
            DSSP3_sequence.seq = Seq(
                convert_list_of_enum_to_string(annotations.DSSP3))
            DSSP3_sequences.append(DSSP3_sequence)

            DSSP8_sequence = deepcopy(protein_sequence)
            DSSP8_sequence.seq = Seq(
                convert_list_of_enum_to_string(annotations.DSSP8))
            DSSP8_sequences.append(DSSP8_sequence)

            disorder_sequence = deepcopy(protein_sequence)
            disorder_sequence.seq = Seq(
                convert_list_of_enum_to_string(annotations.disorder))
            disorder_sequences.append(disorder_sequence)

            # Per-sequence annotations, e.g. subcell loc & membrane boundness
            mapping_file.at[
                protein_sequence.id,
                'subcellular_location'] = annotations.localization.value
            mapping_file.at[protein_sequence.id,
                            'membrane_or_soluble'] = annotations.membrane.value

    # Write files
    mapping_file.to_csv(per_sequence_predictions_file_path)
    write_fasta_file(DSSP3_sequences, DSSP3_predictions_file_path)
    write_fasta_file(DSSP8_sequences, DSSP8_predictions_file_path)
    write_fasta_file(disorder_sequences, disorder_predictions_file_path)

    return result_kwargs
Beispiel #5
0
def predict_annotations_using_basic_models(model: str, **kwargs) -> Dict[str, Any]:
    """
    Protocol extracts secondary structure (DSSP3 and DSSP8), disorder, subcellular location and membrane boundness
    from "embeddings_file". Embeddings can either be generated with SeqVec or ProtBert.
    SeqVec models are used in this publication: https://doi.org/10.1186/s12859-019-3220-8
    ProtTrans models are used in this publication: https://doi.org/10.1101/2020.07.12.199554

    :param model: either "bert_from_publication" or "seqvec_from_publication". Used to download files
    """

    check_required(kwargs, ['embeddings_file', 'mapping_file', 'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    annotation_extractor = BasicAnnotationExtractor(model, **result_kwargs)

    # mapping file will be needed for protein-wide annotations
    mapping_file = read_mapping_file(result_kwargs["mapping_file"])

    # Try to create final files (if this fails, now is better than later)
    DSSP3_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                           result_kwargs.get('stage_name'),
                                                           'DSSP3_predictions_file',
                                                           extension='.fasta')
    result_kwargs['DSSP3_predictions_file'] = DSSP3_predictions_file_path

    DSSP8_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                           result_kwargs.get('stage_name'),
                                                           'DSSP8_predictions_file',
                                                           extension='.fasta')
    result_kwargs['DSSP8_predictions_file'] = DSSP8_predictions_file_path

    disorder_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                              result_kwargs.get('stage_name'),
                                                              'disorder_predictions_file',
                                                              extension='.fasta')
    result_kwargs['disorder_predictions_file'] = disorder_predictions_file_path

    per_sequence_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                  result_kwargs.get('stage_name'),
                                                                  'per_sequence_predictions_file',
                                                                  extension='.csv')
    result_kwargs['per_sequence_predictions_file'] = per_sequence_predictions_file_path

    if 'get_activations' in kwargs and kwargs['get_activations']:
        DSSP3_raw_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                   result_kwargs.get('stage_name'),
                                                                   'DSSP3_raw_predictions_file',
                                                                   extension='.csv')
        result_kwargs['DSSP3_raw_predictions_file'] = DSSP3_raw_predictions_file_path
        DSSP8_raw_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                   result_kwargs.get('stage_name'),
                                                                   'DSSP8_raw_predictions_file',
                                                                   extension='.csv')
        result_kwargs['DSSP8_raw_predictions_file'] = DSSP8_raw_predictions_file_path
        disorder_raw_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                      result_kwargs.get('stage_name'),
                                                                      'disorder_raw_predictions_file',
                                                                      extension='.csv')
        result_kwargs['disorder_raw_predictions_file'] = disorder_raw_predictions_file_path

    # Create sequence containers
    DSSP3_sequences = list()
    DSSP8_sequences = list()
    disorder_sequences = list()

    DSSP3_raw = []
    DSSP8_raw = []
    disorder_raw = []

    with h5py.File(result_kwargs['embeddings_file'], 'r') as embedding_file:
        for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
            # Per-AA annotations: DSSP3, DSSP8 and disorder
            embedding = np.array(embedding_file[protein_sequence.id])

            annotations = annotation_extractor.get_annotations(embedding)

            DSSP3_sequence = deepcopy(protein_sequence)
            DSSP3_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.DSSP3))
            DSSP3_sequences.append(DSSP3_sequence)
            DSSP3_raw_df = DataFrame(annotations.DSSP3_raw[:, :, 0].detach().cpu().numpy().transpose(),
                                     columns=['H', 'E', 'C'])
            DSSP3_raw_df.insert(0, 'residue', range(1, 1 + len(DSSP3_raw_df)))
            DSSP3_raw_df.insert(0, 'seqID', DSSP3_sequence.id)
            DSSP3_raw.append(DSSP3_raw_df)

            DSSP8_sequence = deepcopy(protein_sequence)
            DSSP8_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.DSSP8))
            DSSP8_sequences.append(DSSP8_sequence)
            DSSP8_raw_df = DataFrame(annotations.DSSP8_raw[:, :, 0].detach().cpu().numpy().transpose(),
                                     columns=['G', 'H', 'I', 'B', 'E', 'S', 'T', 'C'])
            DSSP8_raw_df.insert(0, 'residue', range(1, 1 + len(DSSP8_raw_df)))
            DSSP8_raw_df.insert(0, 'seqID', DSSP8_sequence.id)
            DSSP8_raw.append(DSSP8_raw_df)

            disorder_sequence = deepcopy(protein_sequence)
            disorder_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.disorder))
            disorder_sequences.append(disorder_sequence)
            disorder_raw_df = DataFrame(annotations.disorder_raw[:, :, 0].detach().cpu().numpy().transpose(),
                                        columns=['Order', 'Disorder'])
            disorder_raw_df.insert(0, 'residue', range(1, 1 + len(disorder_raw_df)))
            disorder_raw_df.insert(0, 'seqID', disorder_sequence.id)
            disorder_raw.append(disorder_raw_df)

            # Per-sequence annotations, e.g. subcell loc & membrane boundness
            mapping_file.at[protein_sequence.id, 'subcellular_location'] = annotations.localization.value
            mapping_file.at[protein_sequence.id, 'membrane_or_soluble'] = annotations.membrane.value

    # Write files
    mapping_file.to_csv(per_sequence_predictions_file_path)
    write_fasta_file(DSSP3_sequences, DSSP3_predictions_file_path)
    write_fasta_file(DSSP8_sequences, DSSP8_predictions_file_path)
    write_fasta_file(disorder_sequences, disorder_predictions_file_path)

    if 'get_activations' in kwargs and kwargs['get_activations']:
        # create files with activations for each multiclass prediction
        concatenate_dataframe(DSSP3_raw).set_index('seqID').rename_axis(None).to_csv(DSSP3_raw_predictions_file_path)
        concatenate_dataframe(DSSP8_raw).set_index('seqID').rename_axis(None).to_csv(DSSP8_raw_predictions_file_path)
        concatenate_dataframe(disorder_raw).set_index('seqID').rename_axis(None).to_csv(
            disorder_raw_predictions_file_path)

    return result_kwargs
Beispiel #6
0
def bindembed21(**kwargs) -> Dict[str, Any]:
    """
    Protocol extracts binding residues from "alignment_result_file" if possible, and from "embeddings_file", otherwise.
    :param kwargs:
    :return:
    """

    check_required(kwargs, ['alignment_results_file', 'embeddings_file', 'mapping_file', 'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    # for HBI
    for directory in BindEmbed21HBIAnnotationExtractor.necessary_directories:
        if not result_kwargs.get(directory):
            result_kwargs[directory] = get_model_directories_from_zip(model="bindembed21hbi", directory=directory)
    # for DL
    for file in BindEmbed21DLAnnotationExtractor.necessary_files:
        if not result_kwargs.get(file):
            result_kwargs[file] = get_model_file(model="bindembed21dl", file=file)

    hbi_extractor = BindEmbed21HBIAnnotationExtractor(**result_kwargs)
    dl_extractor = BindEmbed21DLAnnotationExtractor(**result_kwargs)

    # Try to create final files (if this fails, now is better than later
    metal_binding_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                   result_kwargs.get('stage_name'),
                                                                   'metal_binding_predictions_file',
                                                                   extension='.fasta')
    result_kwargs['metal_binding_predictions_file'] = metal_binding_predictions_file_path
    nuc_binding_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                 result_kwargs.get('stage_name'),
                                                                 'nucleic_acid_binding_predictions_file',
                                                                 extension='.fasta')
    result_kwargs['binding_residue_predictions_file'] = nuc_binding_predictions_file_path
    small_binding_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                   result_kwargs.get('stage_name'),
                                                                   'small_molecule_binding_predictions_file',
                                                                   extension='.fasta')
    result_kwargs['binding_residue_predictions_file'] = small_binding_predictions_file_path

    metal_sequences = list()
    nuc_sequences = list()
    small_sequences = list()

    alignment_results = read_csv(result_kwargs['alignment_results_file'], sep='\t',
                                 dtype={'query': 'str', 'target': 'str'})
    alignment_results = alignment_results[alignment_results['eval'] < 1E-3].copy()

    with h5py.File(result_kwargs['embeddings_file'], 'r') as embedding_file:
        for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
            # get HBI hit for this query
            hits = alignment_results[alignment_results['query'].str.match(str(protein_sequence.id))].copy()
            hits_min_eval = hits[hits['eval'] == min(hits['eval'])]
            hit_max_pide = hits_min_eval[hits_min_eval['fident'] == max(hits_min_eval['fident'])]

            metal_sequence = deepcopy(protein_sequence)
            nuc_sequence = deepcopy(protein_sequence)
            small_sequence = deepcopy(protein_sequence)

            hbi_annotations = hbi_extractor.get_binding_residues(hit_max_pide.iloc[0].to_dict())
            metal_inference = convert_list_of_enum_to_string(hbi_annotations.metal_ion)
            nuc_inference = convert_list_of_enum_to_string(hbi_annotations.nucleic_acids)
            small_inference = convert_list_of_enum_to_string(hbi_annotations.small_molecules)

            # some part of the sequence was predicted using HBI --> save output and don't run DL method
            if 'M' in metal_inference or 'N' in nuc_inference or 'S' in small_inference:
                metal_sequence.seq = Seq(metal_inference)
                nuc_sequence.seq = Seq(nuc_inference)
                small_sequence.seq = Seq(small_inference)
            # no inference containing binding annotations was made --> run bindEmbed21DL
            else:
                embedding = np.array(embedding_file[protein_sequence.id])
                annotations = dl_extractor.get_binding_residues(embedding)
                metal_sequence = deepcopy(protein_sequence)
                nuc_sequence = deepcopy(protein_sequence)
                small_sequence = deepcopy(protein_sequence)

                metal_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.metal_ion))
                nuc_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.nucleic_acids))
                small_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.small_molecules))

            metal_sequences.append(metal_sequence)
            nuc_sequences.append(nuc_sequence)
            small_sequences.append(small_sequence)

    # Write files
    write_fasta_file(metal_sequences, metal_binding_predictions_file_path)
    write_fasta_file(nuc_sequences, nuc_binding_predictions_file_path)
    write_fasta_file(small_sequences, small_binding_predictions_file_path)

    return result_kwargs
Beispiel #7
0
def bindembed21hbi(**kwargs) -> Dict[str, Any]:
    """
    Protocol extracts binding residues from "alignment_results_file".

    :return:
    """

    check_required(kwargs, ['alignment_results_file', 'mapping_file', 'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    for directory in BindEmbed21HBIAnnotationExtractor.necessary_directories:
        if not result_kwargs.get(directory):
            result_kwargs[directory] = get_model_directories_from_zip(model="bindembed21hbi", directory=directory)

    annotation_extractor = BindEmbed21HBIAnnotationExtractor(**result_kwargs)

    # Try to create final files (if this fails, now is better than later
    metal_binding_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                   result_kwargs.get('stage_name'),
                                                                   'metal_binding_inference_file',
                                                                   extension='.fasta')
    result_kwargs['metal_binding_inference_file'] = metal_binding_predictions_file_path
    nuc_binding_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                 result_kwargs.get('stage_name'),
                                                                 'nucleic_acid_binding_inference_file',
                                                                 extension='.fasta')
    result_kwargs['binding_residue_inference_file'] = nuc_binding_predictions_file_path
    small_binding_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                   result_kwargs.get('stage_name'),
                                                                   'small_molecule_binding_inference_file',
                                                                   extension='.fasta')
    result_kwargs['binding_residue_inference_file'] = small_binding_predictions_file_path

    metal_sequences = list()
    nuc_sequences = list()
    small_sequences = list()

    alignment_results = read_csv(result_kwargs['alignment_results_file'], sep='\t',
                                 dtype={'query': 'str', 'target': 'str'})
    alignment_results = alignment_results[alignment_results['eval'] < 1E-3].copy()

    for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
        # get hits for this query
        hits = alignment_results[alignment_results['query'].str.match(str(protein_sequence.id))].copy()
        # get hits with minimal E-value
        hits_min_eval = hits[hits['eval'] == min(hits['eval'])]
        # get hit with maximal PIDE
        hit_max_pide = hits_min_eval[hits_min_eval['fident'] == max(hits_min_eval['fident'])]

        annotations = annotation_extractor.get_binding_residues(hit_max_pide.iloc[0].to_dict())
        metal_sequence = deepcopy(protein_sequence)
        nuc_sequence = deepcopy(protein_sequence)
        small_sequence = deepcopy(protein_sequence)

        metal_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.metal_ion))
        nuc_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.nucleic_acids))
        small_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.small_molecules))

        metal_sequences.append(metal_sequence)
        nuc_sequences.append(nuc_sequence)
        small_sequences.append(small_sequence)

    # Write files
    write_fasta_file(metal_sequences, metal_binding_predictions_file_path)
    write_fasta_file(nuc_sequences, nuc_binding_predictions_file_path)
    write_fasta_file(small_sequences, small_binding_predictions_file_path)

    return result_kwargs
Beispiel #8
0
def bindembed21dl(**kwargs) -> Dict[str, Any]:
    """
    Protocol extracts binding residues from "embeddings_file".
    Results guaranteed only with ProtT5-XL-U50 embeddings.

    :return:
    """

    check_required(kwargs, ['embeddings_file', 'mapping_file', 'remapped_sequences_file'])
    result_kwargs = deepcopy(kwargs)
    file_manager = get_file_manager(**kwargs)

    # Download necessary files if needed
    for file in BindEmbed21DLAnnotationExtractor.necessary_files:
        if not result_kwargs.get(file):
            result_kwargs[file] = get_model_file(model="bindembed21dl", file=file)

    annotation_extractor = BindEmbed21DLAnnotationExtractor(**result_kwargs)

    # Try to create final files (if this fails, now is better than later
    metal_binding_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                   result_kwargs.get('stage_name'),
                                                                   'metal_binding_predictions_file',
                                                                   extension='.fasta')
    result_kwargs['metal_binding_predictions_file'] = metal_binding_predictions_file_path
    nuc_binding_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                 result_kwargs.get('stage_name'),
                                                                 'nucleic_acid_binding_predictions_file',
                                                                 extension='.fasta')
    result_kwargs['binding_residue_predictions_file'] = nuc_binding_predictions_file_path
    small_binding_predictions_file_path = file_manager.create_file(result_kwargs.get('prefix'),
                                                                   result_kwargs.get('stage_name'),
                                                                   'small_molecule_binding_predictions_file',
                                                                   extension='.fasta')
    result_kwargs['binding_residue_predictions_file'] = small_binding_predictions_file_path

    metal_sequences = list()
    nuc_sequences = list()
    small_sequences = list()

    with h5py.File(result_kwargs['embeddings_file'], 'r') as embedding_file:
        for protein_sequence in read_fasta(result_kwargs['remapped_sequences_file']):
            embedding = np.array(embedding_file[protein_sequence.id])

            annotations = annotation_extractor.get_binding_residues(embedding)
            metal_sequence = deepcopy(protein_sequence)
            nuc_sequence = deepcopy(protein_sequence)
            small_sequence = deepcopy(protein_sequence)

            metal_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.metal_ion))
            nuc_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.nucleic_acids))
            small_sequence.seq = Seq(convert_list_of_enum_to_string(annotations.small_molecules))

            metal_sequences.append(metal_sequence)
            nuc_sequences.append(nuc_sequence)
            small_sequences.append(small_sequence)

    # Write files
    write_fasta_file(metal_sequences, metal_binding_predictions_file_path)
    write_fasta_file(nuc_sequences, nuc_binding_predictions_file_path)
    write_fasta_file(small_sequences, small_binding_predictions_file_path)

    return result_kwargs