Example #1
0
    def test_gather_residues(self):
        """Check that find_modified_residues() collects residues of all sites.

        Two methylation sites are stored: one attached to a protein without
        an explicit residue, and one with an explicit residue but no protein.
        Both residues are expected in the resulting set.
        """
        methylation = SiteType(name='methylation')
        protein = Protein(refseq='NM_007', id=1, sequence='ABCD')

        # No residue given: expected to default to 'B',
        # i.e. the 2nd letter of the protein sequence 'ABCD'.
        site_with_protein = Site(
            position=2, types={methylation}, protein=protein
        )
        # Residue given explicitly, no protein attached.
        site_with_residue = Site(
            position=4, types={methylation}, residue='D'
        )

        db.session.add_all([site_with_protein, site_with_residue])
        db.session.commit()

        expected_residues = {'B', 'D'}
        assert methylation.find_modified_residues() == expected_residues
Example #2
0
def train_model(site_type: SiteType,
                sequences_dir='.tmp',
                sampling_n=10000,
                enzyme_type='kinase',
                output_path=None,
                **kwargs):
    """Train MIMP model for given site type.

    NOTE: Natively MIMP works on phosphorylation sites only,
    so a special, forked version [reimandlab/rmimp] is needed
    for this function to work at all.

    Args:
        site_type: Type of the site for which the model is to be trained
        sequences_dir: path to dir where sequences for trainModel should be
            dumped; its 'positive' and 'negative' subdirectories are removed
            and re-created on every call
        sampling_n: number of sampling iterations for negative sequence set
        enzyme_type: how the sites are grouped into sequence sets:
            'kinase' - one set per kinase which is involved in given site
            type and has at least one site of this type;
            'catch-all' - a single set with all the sites of given type,
            saved under the name 'all_enzymes_for_<site type name>'
        output_path: path to .mimp file where the model should be saved;
            defaults to '<site type name>.mimp'
        **kwargs: will be passed to trainModel

    Returns:
        trained MIMP model for all kinases affecting sites of given SiteType

    Raises:
        ValueError: if enzyme_type is neither 'kinase' nor 'catch-all'
    """
    # Validate before doing anything else: the code below wipes the
    # sequences directories, so an invalid argument must be rejected first.
    # An explicit exception is used (rather than an assert) so that the check
    # is not stripped when running with `python -O`.
    if enzyme_type not in ('kinase', 'catch-all'):
        raise ValueError(
            f"Unsupported enzyme_type: {enzyme_type!r}; "
            f"expected 'kinase' or 'catch-all'"
        )

    if not output_path:
        output_path = f'{site_type.name}.mimp'

    mimp = load_mimp()

    sites_of_this_type = set(site_type.sites)
    modified_residues = site_type.find_modified_residues()

    # candidates for the negative set: sites of the same residues,
    # excluding the sites which are known to be of the trained type
    negative_sites = gather_negative_sites(modified_residues,
                                           exclude=sites_of_this_type)

    sequences_path = Path(sequences_dir)

    positive_path = sequences_path / 'positive'
    negative_path = sequences_path / 'negative'

    # start from empty directories, so that sequences
    # dumped by previous runs do not leak into this training
    for path in [positive_path, negative_path]:
        shutil.rmtree(str(path), ignore_errors=True)
        path.mkdir(parents=True)

    if enzyme_type == 'kinase':

        enzymes = Kinase.query.filter(
            Kinase.is_involved_in.any(SiteType.name == site_type.name)).filter(
                Kinase.sites.any(Site.types.contains(site_type)))
        enzymes = tqdm(enzymes, total=enzymes.count())

    else:  # 'catch-all' (guaranteed by the validation above)
        # a stand-in object exposing only `sites` and `name`
        enzymes = [
            SimpleNamespace(sites=Site.query.filter(
                Site.types.contains(site_type)),
                            name=f'all_enzymes_for_{site_type.name}')
        ]

    for enzyme in enzymes:

        # an enzyme may have sites of other types too: keep only relevant ones
        sites = [site for site in enzyme.sites if site_type in site.types]

        positive_sequences = [site.sequence for site in sites]
        negative_sequences = sample_random_negative_sequences(
            negative_sites, sampling_n)

        save_kinase_sequences(enzyme, positive_sequences, positive_path)
        save_kinase_sequences(enzyme, negative_sequences, negative_path)

    priors = mimp.PRIORS.rx2('human')

    # debugging hint for the R side: r.debug(mimp.trainModel)

    return mimp.trainModel(
        str(positive_path),
        str(negative_path),
        file=output_path,
        priors=priors,  # or calculate_background_frequency(),
        # both give the same values (within rounding error), the custom
        # func might come in handy in future
        residues_groups=residues_groups(site_type, modified_residues),
        **kwargs)