Example #1
0
    def test_types(self):
        """Proteins and sites can be filtered by the types assigned to sites."""
        methylation = SiteType(name='methylation')

        protein = Protein(refseq='NM_007', id=1, sequence='ABCD')
        db.session.add(protein)

        methylated_site = Site(
            position=2, types={methylation}, residue='B', protein=protein
        )
        db.session.add(methylated_site)

        db.session.commit()

        proteins = Protein.query

        # the only stored site is methylated: positive lookups have to match
        # exactly one record, while negated lookups have to come back empty
        with_methylation = proteins.filter(
            Protein.sites.any(Site.types.contains(methylation))
        )
        assert with_methylation.one()

        without_methylation = proteins.filter(
            Protein.sites.any(~Site.types.contains(methylation))
        )
        assert not without_methylation.all()

        methylated_sites = Site.query.filter(Site.types.contains(methylation))
        assert methylated_sites.count() == 1

        not_methylated_sites = Site.query.filter(
            ~Site.types.contains(methylation)
        )
        assert not not_methylated_sites.all()

        # a type which was not assigned to any site: the expectations flip
        phosphorylation = SiteType(name='phosphorylation')

        with_phosphorylation = proteins.filter(
            Protein.sites.any(Site.types.contains(phosphorylation))
        )
        assert not with_phosphorylation.all()

        without_phosphorylation = proteins.filter(
            Protein.sites.any(~Site.types.contains(phosphorylation))
        )
        assert without_phosphorylation.one()

        phosphorylated_sites = Site.query.filter(
            Site.types.contains(phosphorylation)
        )
        assert phosphorylated_sites.count() == 0
    def test_sites(self):
        """The sites endpoint responds with JSON having one entry per site."""
        protein = Protein(**test_protein_data())

        phosphorylated = Site(
            position=3,
            residue='R',
            types={SiteType(name='phosphorylation')}
        )
        methylated = Site(
            position=4,
            residue='T',
            types={SiteType(name='methylation')}
        )

        db.session.add(protein)
        protein.sites = [phosphorylated, methylated]

        response = self.client.get('/protein/sites/NM_000123')

        assert response.status_code == 200
        assert response.content_type == 'application/json'

        assert len(response.json) == 2

        matching_reprs = []

        for site_repr in response.json:
            print(site_repr)
            if site_repr['type'] == 'phosphorylation':
                matching_reprs.append(site_repr)

        # keep the last match (if any), there should be something to show
        phosphorylation_site_repr = (
            matching_reprs[-1] if matching_reprs else None
        )
        assert phosphorylation_site_repr
Example #3
0
    def test_gather_residues(self):
        """A site type reports the set of residues seen on its sites."""
        methylation = SiteType(name='methylation')
        protein = Protein(refseq='NM_007', id=1, sequence='ABCD')

        # no residue given: expected to default to 'B' (taken from the protein)
        site_with_default_residue = Site(
            position=2, types={methylation}, protein=protein
        )
        # residue given explicitly, no protein attached
        site_with_explicit_residue = Site(
            position=4, types={methylation}, residue='D'
        )

        db.session.add_all(
            [site_with_default_residue, site_with_explicit_residue]
        )
        db.session.commit()

        assert methylation.find_modified_residues() == {'B', 'D'}
Example #4
0
 def glycosylations_without_subtype_ratio(self):
     """Return the share of glycosylation sites not counted as having a subtype.

     The denominator is the number of sites matched by the fuzzy filter
     of the 'glycosylation' site type; the numerator is that number less
     the result of `glycosylations_with_subtype()`.

     NOTE(review): raises ZeroDivisionError if the filter matches no sites
     - confirm that this cannot happen for the databases in use.
     """
     from models import Site, SiteType

     generic_type = SiteType.query.filter_by(name='glycosylation').one()
     any_glycosylation = SiteType.fuzzy_filter(generic_type)
     total = Site.query.filter(any_glycosylation).count()

     without_subtype = total - self.glycosylations_with_subtype()
     return without_subtype / total
Example #5
0
def create_test_models():
    """Build a small graph of interconnected test models.

    Returns:
        dict of all local variables (as given by `locals()`), i.e. with the
        keys: 'protein', 'mutation', 'protein_kinase', 'kinase' and 'site'.
        Renaming any local variable changes the keys of the result.
    """
    protein = Protein(refseq='NM_0001',
                      gene=Gene(name='SOMEGENE'),
                      sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')
    protein.gene.preferred_isoform = protein

    # mutation details are not bound to any name (so those are absent from
    # the result); each of them is linked to the mutation on creation
    MC3Mutation(mutation=mutation,
                cancer=Cancer(code='CAN'),
                samples='Sample A,Sample B',
                count=2)
    InheritedMutation(mutation=mutation,
                      clin_data=[
                          ClinicalData(disease=Disease(name='Some disease'),
                                       sig_code=5),
                          ClinicalData(disease=Disease(name='Other disease'),
                                       sig_code=2)
                      ])

    # a second protein, acting as the kinase of the site created below
    protein_kinase = Protein(refseq='NM_0002',
                             gene=Gene(name='OTHERGENE'),
                             sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    # the site is at the same position as the mutation (position 1)
    site = Site(protein=protein,
                position=1,
                residue='A',
                kinases={kinase},
                pmid={1, 2},
                types={SiteType(name='glycosylation')})
    protein.sites = [site]

    return locals()
Example #6
0
def ptm_muts_of_gene(
    path_template='exported/{site_type}_muts_of_{gene}_-_{protein}.tsv', gene='EGFR',
    site_type='glycosylation', mutation_source='mc3', to_csv=True, show_progress=False, **kwargs
):
    """Export mutations of a gene which affect sites of the given type.

    Only the mutations in the preferred isoform of the gene are exported.

    Args:
        path_template: template of the output path; it is formatted with
            `site_type` and `gene` names and with `protein` refseq
        gene: name of the gene
        site_type: name of the site type
        mutation_source: key of the importer in `MutationImportManager().importers`
        to_csv: whether the data frame should be saved (tab-separated)
        show_progress: passed to `export_to_df` of the importer
        **kwargs: passed to the constructor of the importer

    Returns:
        the result of `export_to_df` of the chosen importer
    """
    importer_class = MutationImportManager().importers[mutation_source]
    importer = importer_class(**kwargs)

    chosen_type = SiteType.query.filter_by(name=site_type).one()
    chosen_gene = Gene.query.filter_by(name=gene).one()
    isoform = chosen_gene.preferred_isoform

    affects_site_of_type = Mutation.affected_sites.any(
        SiteType.fuzzy_filter(chosen_type)
    )
    in_isoform = Mutation.protein_id == isoform.id

    mutations = importer.export_to_df(
        mutation_filter=and_(affects_site_of_type, in_isoform),
        protein_filter=Protein.id == isoform.id,
        show_progress=show_progress
    )

    # the path is formatted even if nothing is to be saved
    path = path_template.format(
        protein=isoform.refseq,
        gene=chosen_gene.name,
        site_type=chosen_type.name
    )
    if to_csv:
        mutations.to_csv(path, sep='\t', index=False)

    return mutations
Example #7
0
    def __init__(self, **kwargs):
        """Create the filters and update them from the current request.

        NOTE: `kwargs` are accepted, but not used in this method.
        """
        sources_filter = Filter(
            Mutation, 'sources', comparators=['in'],
            choices=list(source_manager.visible_fields.keys()),
            default=None, nullable=True,
            as_sqlalchemy=sqlalchemy_filter_from_source_name
        )

        site_types_by_name = {
            site_type.name: site_type
            for site_type in SiteType.available_types()
        }
        site_types_filter = Filter(
            Site, 'types', comparators=['in'],
            choices=site_types_by_name,
            as_sqlalchemy=SiteType.fuzzy_filter,
            as_sqlalchemy_joins=[SiteType]
        )

        ptm_muts_filter = Filter(
            Gene, 'has_ptm_muts',
            comparators=['eq'],
            as_sqlalchemy=lambda value: text('ptm_muts_cnt > 0') if value else text('true')
        )

        # NOTE(review): the value is ignored by this lambda, so the same
        # condition is produced for True and for False - confirm the intent
        known_kinase_filter = Filter(
            Gene, 'is_known_kinase',
            comparators=['eq'],
            as_sqlalchemy=lambda value: Protein.kinase.any()
        )

        all_filters = [
            sources_filter,
            site_types_filter,
            ptm_muts_filter,
            known_kinase_filter
        ]

        for source_filter in source_dependent_filters():
            # filters without sqlalchemy interface are not supported for table views
            if source_filter.has_sqlalchemy:
                all_filters.append(source_filter)

        super().__init__(all_filters)
        self.update_from_request(request)
Example #8
0
    def test_search_mutations(self):
        """Mutations search works for text queries and for VCF uploads."""
        site = Site(position=13, types={SiteType(name='methylation')})
        protein = Protein(
            refseq='NM_007', id=7, sites=[site], sequence='XXXXXXXXXXXXV'
        )

        m_in_site = Mutation(protein=protein, position=13, alt='V')
        m_out_site = Mutation(protein=protein, position=50, alt='K')

        db.session.add(protein)

        from database import bdb

        # points to the same location as first record in VCF_FILE_CONTENT
        test_query = 'chr20 14370 G A'

        # map the first genomic mutation from VCF_FILE_CONTENT
        # to some (mocked) protein mutation
        bdb.add_genomic_mut('20', 14370, 'G', 'A', m_in_site, is_ptm=True)

        table_cell = '<td>{0}</td>'

        # 1. basic test - is appropriate mutation in results?
        response = self.search_mutations(mutations=test_query)
        assert response.status_code == 200

        # this mutation is exactly at a PTM site and should be included in results
        cell_in_site = table_cell.format(m_in_site.alt).encode()
        assert cell_in_site in response.data

        # this mutation lies outside of a PTM site - by default should be filtered out
        cell_out_site = table_cell.format(m_out_site.alt).encode()
        assert cell_out_site not in response.data

        # 2. count test - is mutation for this query annotated as shown twice?
        doubled_query = '{0}\n{0}'.format(test_query)
        response = self.search_mutations(mutations=doubled_query)

        assert response.status_code == 200
        assert b'<td>2</td>' in response.data

        # 3. VCF file test
        vcf_upload = (BytesIO(VCF_FILE_CONTENT), 'exemplar_vcf.vcf')
        response = self.client.post(
            '/search/mutations',
            content_type='multipart/form-data',
            data={'vcf-file': vcf_upload}
        )

        assert response.status_code == 200
        assert b'NM_007' in response.data
Example #9
0
    def glycosylations_with_subtype(self):
        """Count sites having at least one glycosylation subtype.

        A subtype is a site type with 'glycosylation' in its name,
        other than the plain 'glycosylation' type itself.
        """
        from models import Site, SiteType

        subtype_ids = []

        for type_name, type_id in SiteType.id_by_name().items():
            if 'glycosylation' in type_name and type_name != 'glycosylation':
                subtype_ids.append(type_id)

        has_subtype = Site.types.any(SiteType.id.in_(subtype_ids))
        sites_with_subtype = Site.query.filter(has_subtype)

        return sites_with_subtype.count()
Example #10
0
    def test_default_residue(self):
        """A site created without a residue gets it from the protein sequence."""
        protein = Protein(refseq='NM_007', id=1, sequence='ABCD')
        methylation = SiteType(name='methylation')

        # note: for sites, positions are 1-based,
        # so the second residue ('B') is the expected one
        site_without_residue = Site(
            position=2, types={methylation}, protein=protein
        )

        db.session.add(protein)
        db.session.commit()

        assert site_without_residue.residue == 'B'
Example #11
0
    def test_train_model(self):
        """Training on a minimal dataset gives one set of params with a PWM."""
        phosphorylation = SiteType(name='phosphorylation')

        # non-phosphorylated serine residues are needed to generate negative sites
        background_protein = Protein(
            refseq='NM_007',
            sequence='--------SLPA-----------SVIT-------'
        )
        background_gene = Gene(
            isoforms=[background_protein],
            preferred_isoform=background_protein
        )
        db.session.add(background_gene)

        # phosphorylated, with sites
        phosphorylated_protein = Protein(
            refseq='NM_001',
            sequence='--------SPAK-----------SPAR-------'
        )
        phosphorylated_gene = Gene(
            isoforms=[phosphorylated_protein],
            preferred_isoform=phosphorylated_protein
        )
        db.session.add(phosphorylated_gene)

        cdk1 = Kinase(name='CDK1', is_involved_in={phosphorylation})

        # one-based positions of both serines followed by proline
        for serine_position in (9, 24):
            phosphosite = Site(
                position=serine_position,
                types={phosphorylation},
                residue='S',
                protein=phosphorylated_protein,
                kinases={cdk1}
            )
            db.session.add(phosphosite)

        db.session.commit()

        with TemporaryDirectory() as temp_dir:
            model = train_model(
                phosphorylation,
                sequences_dir=temp_dir,
                sampling_n=2,
                threshold=2
            )

        # the model should have one set of params - for CDK1 kinase
        assert len(model) == 1

        pwm = model.rx2('CDK1').rx2('pwm')

        # and the position-specific weight matrix should be created
        assert pwm

        # the very detailed testing should be performed by rMIMP,
        # but why not test the basics?

        weights_of_central_aa = dict(zip(pwm.rownames, pwm.rx(True, 8)))
        highest_weight = max(weights_of_central_aa.values())

        assert weights_of_central_aa['S'] == highest_weight
Example #12
0
def most_mutated_sites(sources: List[MutationSource],
                       site_type: SiteType = None,
                       limit=25,
                       intersection=True,
                       exclusive=None,
                       mutation_filter=None):
    """Build a query for sites of preferred isoforms with the most mutations.

    Sources must have the same value_type (counts/frequencies).

    Args:
        sources: mutation sources; one count column is created per source
            and the sum of those columns is used for ordering
        site_type: if given, only sites matching the fuzzy filter
            of this type are considered
        limit: the limit applied to the final query
        intersection: join the sources with inner joins if true,
            with outer joins otherwise
        exclusive: sources to exclude mutations of (used only with outer
            joins; can not be combined with `intersection`)
        mutation_filter: an additional filter to apply, if not None

    Returns:
        query of (site, 'mutations_count') ordered by the count, descending
    """

    assert not (intersection and exclusive)

    counts = prepare_for_summing(sources)
    labelled_counts = [
        count.label(f'count_{i}')
        for i, count in enumerate(counts)
    ]

    per_site = db.session.query(Site, *labelled_counts).select_from(Mutation)

    for source in sources:
        if intersection:
            per_site = per_site.join(source)
        else:
            per_site = per_site.outerjoin(source)

    if exclusive and not intersection:
        per_site = per_site.filter(~Mutation.in_sources(*exclusive))

    if mutation_filter is not None:
        per_site = per_site.filter(mutation_filter)

    per_site = per_site.join(Mutation.affected_sites)
    per_site = per_site.filter(Site.protein.has(Protein.is_preferred_isoform))

    if site_type:
        per_site = per_site.filter(SiteType.fuzzy_filter(site_type, join=True))

    per_site = per_site.group_by(Site).having(and_(*counts))

    counted_sites = per_site.subquery()

    # sum up the per-source columns of the subquery
    count_columns = [
        getattr(counted_sites.c, f'count_{i}')
        for i in range(len(counts))
    ]
    total_muts_count = reduce(operator.add, count_columns)
    total_muts_count = total_muts_count.label('mutations_count')

    ranked_sites = db.session.query(
        aliased(Site, counted_sites),
        total_muts_count,
    )
    ranked_sites = ranked_sites.order_by(desc(total_muts_count))

    return ranked_sites.limit(limit)
Example #13
0
def site_type_filter_from_str(query, site=Site):
    """Translate a textual site type query into a filter.

    Args:
        query: 'any', a name of a site type, or such a name prefixed
            with 'not' and a single separator character
        site: passed to `SiteType.fuzzy_filter` as `site`

    Returns:
        None for 'any', otherwise the fuzzy filter of the named site type
        (negated if the query started with 'not')
    """
    if query == 'any':
        return None

    # NOTE(review): only the three letters are tested, but four characters
    # are removed - a type name which itself starts with 'not' would be
    # mishandled; confirm that no such site type exists
    negate = query.startswith('not')
    type_name = query[4:] if negate else query

    site_type = SiteType.query.filter_by(name=type_name).one()
    site_filter = SiteType.fuzzy_filter(site_type, join=True, site=site)

    return ~site_filter if negate else site_filter
Example #14
0
def are_glycosylation_sites_mutated_more_often(source_name: str,
                                               disordered=None,
                                               alternative='greater'):
    """Compare mutated/not mutated glycosylation vs other sites (Fisher's exact test).

    Args:
        source_name: name of the mutation source, a key of
            `source_manager.source_by_name`
        disordered: passed to `count_mutated_sites`
        alternative: passed to `fisher_exact`

    Returns:
        (odds ratio, p-value) as returned by `fisher_exact`
    """
    from stats.table import count_mutated_sites

    glycosylation = SiteType.query.filter_by(name='glycosylation').one()
    is_glyco_name = SiteType.name.contains('glycosylation')
    non_glycosylation = SiteType.query.filter(~is_glyco_name).all()

    print(f'Comparing {glycosylation} against {non_glycosylation}')

    source = source_manager.source_by_name[source_name]

    count = partial(
        count_mutated_sites,
        model=source,
        only_primary=True,
        disordered=disordered
    )

    glyco_filter = SiteType.fuzzy_filter(glycosylation, join=True)

    glycosylation_types = SiteType.query.filter(
        SiteType.name.contains('glycosylation')
    ).all()
    glycosylation_ids = [site_type.id for site_type in glycosylation_types]
    # sites with at least one type which is not a glycosylation (sub)type
    non_glyco_filter = Site.types.any(~SiteType.id.in_(glycosylation_ids))

    mutated_glycosylation = count(custom_filter=glyco_filter)
    mutated_non_glycosylation = count(custom_filter=non_glyco_filter)

    total_glycosylation = Site.query.filter(glyco_filter).count()
    total_non_glycosylation = Site.query.filter(non_glyco_filter).count()

    #         mutated | not_mutated
    #  glyc |
    # other |
    glycosylation_row = [
        mutated_glycosylation,
        total_glycosylation - mutated_glycosylation
    ]
    other_row = [
        mutated_non_glycosylation,
        total_non_glycosylation - mutated_non_glycosylation
    ]
    contingency_table = [glycosylation_row, other_row]
    print(contingency_table)

    oddsratio, pvalue = fisher_exact(
        contingency_table, alternative=alternative
    )
    print(source_name, oddsratio, pvalue)

    return oddsratio, pvalue
Example #15
0
    def test_gather_negative_sites(self):
        """Residues matching the request are gathered, except for excluded sites."""
        protein = Protein(
            refseq='NM_007',
            sequence='X---------X------------YXY--------'
        )
        gene = Gene(isoforms=[protein], preferred_isoform=protein)

        # one-based: the second 'X' of the sequence
        known_site = Site(
            position=11,
            types={SiteType(name='methylation')},
            residue='X',
            protein=protein
        )

        db.session.add_all([gene, protein, known_site])

        negative_sites = gather_negative_sites(
            residues={'X'}, exclude={known_site}
        )

        # zero-based: the first and the last 'X' of the sequence
        expected_sites = {NegativeSite(protein, 0), NegativeSite(protein, 24)}
        assert negative_sites == expected_sites
Example #16
0
def enrichment_of_ptm_genes(reference_set,
                            site_type_name: str,
                            only_mutated_sites=False):
    """Run `genes_enrichment` for genes having sites of the given type.

    The sites are looked up in the preferred isoforms of genes only.

    Args:
        reference_set: passed to `genes_enrichment` as the second argument
        site_type_name: name of the site type to look for
        only_mutated_sites:
            whether only genes with mutated sites should be considered,
            True, False or an SQLAlchemy filter, e.g. Mutation.in_sources(MC3Mutation)

    Returns:
        the result of `genes_enrichment`
    """
    site_type = SiteType.query.filter_by(name=site_type_name).one()

    genes_query = Gene.query.join(Gene.preferred_isoform)
    genes_query = genes_query.join(Protein.sites)
    genes_query = genes_query.filter(SiteType.fuzzy_filter(site_type))

    # anything but the literal False (so True as well) is used as a filter
    if only_mutated_sites is not False:
        genes_query = genes_query.join(Site.mutations)
        genes_query = genes_query.filter(only_mutated_sites)

    return genes_enrichment(set(genes_query), reference_set)
Example #17
0
    def test_in_disordered(self):
        """`in_disordered_region` is consistent between Python and SQL."""
        methylation = SiteType(name='methylation')

        protein = Protein(
            refseq='NM_007',
            id=1,
            disorder_map='10000000001000000000000001',
            sequence='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        )
        db.session.add(protein)

        # zero-based positions: with '1' in the disorder map
        # and (a selection of those) with '0' in the disorder map
        sites_in_disordered = {0, 10, 25}
        sites_not_disordered = {1, 9, 11, 24}

        sites = {}

        for position in sites_in_disordered | sites_not_disordered:
            residue = protein.sequence[position]
            print(position)
            print(len(residue))
            # positions of sites are one-based
            sites[position] = Site(
                position=position + 1,
                types={methylation},
                residue=residue,
                protein=protein
            )

        expectations = [
            (True, sites_in_disordered),
            (False, sites_not_disordered)
        ]

        # Python side
        for expected_flag, positions in expectations:
            for position in positions:
                in_disordered = bool(sites[position].in_disordered_region)
                assert in_disordered is expected_flag

        # SQL side
        for expected_flag, positions in expectations:
            matched_sites = Site.query.filter_by(
                in_disordered_region=expected_flag
            )
            assert {site.position - 1 for site in matched_sites} == positions
Example #18
0
def are_glycosylation_sites_enriched(source_name: str,
                                     population_name: str,
                                     disordered=None,
                                     alternative='greater'):
    """Compare mutated glycosylation vs other sites between two sources.

    Fisher's exact test is run on counts of mutated sites.

    Args:
        source_name: name of the first mutation source
        population_name: name of the mutation source to compare against
            (both are keys of `source_manager.source_by_name`)
        disordered: passed to `count_mutated_sites`
        alternative: passed to `fisher_exact`

    Returns:
        (odds ratio, p-value) as returned by `fisher_exact`
    """
    from stats.table import count_mutated_sites

    glycosylation = SiteType.query.filter_by(name='glycosylation').one()
    glyco_filter = SiteType.fuzzy_filter(glycosylation, join=True)

    glycosylation_types = SiteType.query.filter(
        SiteType.name.contains('glycosylation')
    ).all()
    glycosylation_ids = [site_type.id for site_type in glycosylation_types]
    # sites with at least one type which is not a glycosylation (sub)type
    non_glyco_filter = Site.types.any(~SiteType.id.in_(glycosylation_ids))

    source = source_manager.source_by_name[source_name]
    population = source_manager.source_by_name[population_name]

    shared_options = {'only_primary': True, 'disordered': disordered}
    count_glyc = partial(
        count_mutated_sites, custom_filter=glyco_filter, **shared_options
    )
    count_not_glyc = partial(
        count_mutated_sites, custom_filter=non_glyco_filter, **shared_options
    )

    #         mutated glyc | mutated not glyc
    # cancer |
    # popula |
    source_row = [count_glyc(model=source), count_not_glyc(model=source)]
    population_row = [
        count_glyc(model=population),
        count_not_glyc(model=population)
    ]
    contingency_table = [source_row, population_row]
    print(contingency_table)

    oddsratio, pvalue = fisher_exact(
        contingency_table, alternative=alternative
    )
    print(source_name, population_name, oddsratio, pvalue)

    return oddsratio, pvalue
Example #19
0
    def test_consistency(self):
        """Sites are validated against the sequence of their protein."""
        methylation = SiteType(name='methylation')

        protein = Protein(refseq='NM_007', id=1, sequence='ABCD')
        db.session.add(protein)

        # matching residue (note: for sites, positions are 1-based)
        matching_site = Site(
            position=2, types={methylation}, residue='B', protein=protein
        )
        assert matching_site

        # mismatched residue
        with pytest.raises(ValidationError):
            Site(position=3, types={methylation}, residue='B', protein=protein)

        # no residue and position in range
        site_in_range = Site(position=2, protein=protein)
        assert site_in_range

        # no residue and position outside of range
        for position_out_of_range in (5, -5):
            with pytest.raises(ValidationError):
                Site(position=position_out_of_range, protein=protein)
Example #20
0
    def show(self, refseq):
        """Render the protein view.

        The view consists of:

        + needleplot
        + tracks (sequence + data tracks)
        """
        protein, filter_manager = self.get_protein_and_manager(refseq)

        user_datasets = current_user.datasets_names_by_uri()

        widgets = create_widgets(
            protein,
            filter_manager.filters,
            custom_datasets_names=user_datasets.values()
        )
        # 'multi_ptm' is offered in addition to the available site types
        site_types = ['multi_ptm'] + SiteType.available_types()

        return template(
            'protein/show.html',
            protein=protein,
            filters=filter_manager,
            widgets=widgets,
            site_types=site_types,
            mutation_types=Mutation.types,
        )
Example #21
0
def common_filters(protein,
                   default_source='MC3',
                   source_nullable=False,
                   custom_datasets_ids=()):
    """Create the list of filters shared by the views of a protein.

    Args:
        protein: passed to `source_dependent_filters`
        default_source: default value of the mutation sources filter
        source_nullable: whether the mutation sources filter is nullable
        custom_datasets_ids: iterable with choices for the filter of user
            mutations; it is copied into a new list and never modified
            (the default is an immutable empty tuple, so that no mutable
            object is shared between calls)

    Returns:
        list of filters: the five common ones followed by the result
        of `source_dependent_filters(protein)`
    """
    site_types_by_name = {
        site_type.name: site_type
        for site_type in SiteType.available_types()
    }

    return [
        Filter(Mutation,
               'sources',
               comparators=['in'],
               choices=list(source_manager.visible_fields.keys()),
               default=default_source,
               nullable=source_nullable,
               as_sqlalchemy=sqlalchemy_filter_from_source_name),
        Filter(UserMutations,
               'sources',
               comparators=['in'],
               choices=list(custom_datasets_ids),
               default=None,
               nullable=True),
        Filter(Mutation, 'is_ptm', comparators=['eq']),
        Filter(Drug,
               'groups.name',
               comparators=['in'],
               nullable=False,
               choices=cached_queries.drug_groups,
               default=['approved'],
               multiple='all',
               as_sqlalchemy=True),
        Filter(Site,
               'types',
               comparators=['in'],
               choices=site_types_by_name,
               custom_comparators={'in': SiteType.fuzzy_comparator},
               as_sqlalchemy=SiteType.fuzzy_filter,
               as_sqlalchemy_joins=[Site.types])
    ] + source_dependent_filters(protein)
Example #22
0
    def test_sequence(self):
        """`Site.sequence` is consistent between Python and SQL."""
        methylation = SiteType(name='methylation')

        protein = Protein(
            refseq='NM_007',
            id=1,
            sequence='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        )
        db.session.add(protein)

        # zero-based position of a site -> expected sequence of the site;
        # the first and the last case are padded with dashes
        expected_sequences = {
            0: '-------ABCDEFGH',
            10: 'DEFGHIJKLMNOPQR',
            25: 'STUVWXYZ-------'
        }

        # positions of sites are one-based
        sites = {
            position: Site(
                position=position + 1,
                types={methylation},
                residue=protein.sequence[position],
                protein=protein
            )
            for position in expected_sequences
        }

        db.session.add_all(sites.values())
        db.session.commit()

        for position, expected_sequence in expected_sequences.items():
            site = sites[position]
            # Python side
            assert site.sequence == expected_sequence
            # SQL side
            found_site = Site.query.filter_by(sequence=expected_sequence).one()
            assert found_site == site

        sequences_query = db.session.query(Site.sequence)
        sequences_query = sequences_query.select_from(Site).join(Protein)

        queried_sequences = {sequence for (sequence, ) in sequences_query}
        assert queried_sequences == set(expected_sequences.values())
Example #23
0
    def test_edge_cases(self):
        """Two sites are loaded by the `EdgeSitesCase` importer."""
        sequence = 'MASKGLQDLKQQVEGTAQEAVSAAGAAAQQVVDQATEAGQKAMDQLAKTTQETIDKTANQASDTFSGIGKKFGLLK*'
        protein = Protein(refseq='NM_006829', sequence=sequence)

        db.session.add(protein)

        with initialized_importer(EdgeSitesCase, 'My_test') as importer:

            # register the test dataset together with its site type
            importer.site_datasets['My_test'] = 'mixed_type'
            importer.site_types_map['mixed_type'] = SiteType(name='mixed_type')

            sites = importer.load_sites(site_datasets=['My_test'])

            assert len(sites) == 2

            sites_by_pos = {}
            for site in sites:
                sites_by_pos[site.position] = site

            for position, expected_residue in [(3, 'S'), (70, 'K')]:
                assert sites_by_pos[position].residue == expected_residue
Example #24
0
def get_site_filters(exclude=None, glycosylation='together'):
    """Yields (site type name, type filter) tuples.

    Args:
        exclude: site types to exclude
        glycosylation: 'together', 'only' or 'separate':
            - 'together': subtypes of glycosylation are skipped and the
              plain type is yielded as 'glycosylation (all subtypes)',
              with fuzzy filters used for all the types,
            - 'only': only the types with 'glycosylation' in the name are
              yielded; the plain one as 'glycosylation (unknown subtype)',
            - any other value: all types are yielded under their own names.
    """
    together = glycosylation == 'together'
    only_glycosylation = glycosylation == 'only'

    for site_type in tqdm(SiteType.query, total=SiteType.query.count()):
        name = site_type.name
        is_glyco_kind = 'glycosylation' in name
        is_plain_glycosylation = name == 'glycosylation'

        # subtypes are represented by the plain type when handled together
        if together and is_glyco_kind and not is_plain_glycosylation:
            continue

        if only_glycosylation and not is_glyco_kind:
            continue

        if exclude and name in exclude:
            continue

        type_name = name
        if is_plain_glycosylation:
            if together:
                type_name = 'glycosylation (all subtypes)'
            elif only_glycosylation:
                type_name = 'glycosylation (unknown subtype)'

        if together:
            site_filter = SiteType.fuzzy_filter(site_type, join=True)
        else:
            site_filter = Site.types.contains(site_type)

        yield type_name, site_filter
Example #25
0
def train_model(site_type: SiteType,
                sequences_dir='.tmp',
                sampling_n=10000,
                enzyme_type='kinase',
                output_path=None,
                **kwargs):
    """Train MIMP model for given site type.

    NOTE: Natively MIMP works on phosphorylation sites only,
    so a special, forked version [reimandlab/rmimp] is needed
    for this function to work at all.

    Args:
        site_type: Type of the site for which the model is to be trained
        sequences_dir: path to dir where sequences for trainModel should be dumped;
            its 'positive' and 'negative' subdirectories are removed
            (if present) and created anew
        sampling_n: number of sampling iterations for negative sequence set
        enzyme_type: 'kinase' to train on sites of each of the kinases
            involved in the given site type, or 'catch-all' to train
            on all sites of the given type as if those had a single enzyme
        output_path: path to .mimp file where the model should be saved;
            defaults to '<name of the site type>.mimp'
        **kwargs: will be passed to trainModel

    Returns:
        trained MIMP model for all kinases affecting sites of given SiteType

    Raises:
        ValueError: if `enzyme_type` is neither 'kinase' nor 'catch-all'
    """
    # validate up-front (and not with an assert, which is stripped under
    # `python -O`), so that an unsupported value is reported clearly
    # and before the sequences directories are wiped
    if enzyme_type not in ('kinase', 'catch-all'):
        raise ValueError(
            f"Unsupported enzyme_type: {enzyme_type!r}; "
            f"expected 'kinase' or 'catch-all'"
        )

    if not output_path:
        output_path = f'{site_type.name}.mimp'

    mimp = load_mimp()

    sites_of_this_type = set(site_type.sites)
    modified_residues = site_type.find_modified_residues()

    negative_sites = gather_negative_sites(modified_residues,
                                           exclude=sites_of_this_type)

    sequences_path = Path(sequences_dir)

    positive_path = sequences_path / 'positive'
    negative_path = sequences_path / 'negative'

    # start from empty directories, so that no sequences
    # from previous runs are left behind
    for path in [positive_path, negative_path]:
        shutil.rmtree(str(path), ignore_errors=True)
        path.mkdir(parents=True)

    if enzyme_type == 'kinase':

        # kinases involved in modifications of this type
        # which have at least one site of this type
        enzymes = Kinase.query.filter(
            Kinase.is_involved_in.any(SiteType.name == site_type.name)).filter(
                Kinase.sites.any(Site.types.contains(site_type)))
        enzymes = tqdm(enzymes, total=enzymes.count())

    else:
        # 'catch-all': a single stand-in "enzyme" with all sites of this type
        enzymes = [
            SimpleNamespace(sites=Site.query.filter(
                Site.types.contains(site_type)),
                            name=f'all_enzymes_for_{site_type.name}')
        ]

    for enzyme in enzymes:

        # an enzyme may have sites of other types too
        sites = [site for site in enzyme.sites if site_type in site.types]

        positive_sequences = [site.sequence for site in sites]
        negative_sequences = sample_random_negative_sequences(
            negative_sites, sampling_n)

        save_kinase_sequences(enzyme, positive_sequences, positive_path)
        save_kinase_sequences(enzyme, negative_sequences, negative_path)

    priors = mimp.PRIORS.rx2('human')

    # debugging hint: r.debug(mimp.trainModel)

    return mimp.trainModel(
        str(positive_path),
        str(negative_path),
        file=output_path,
        priors=priors,  # or calculate_background_frequency(),
        # both give the same values (within rounding error), the custom
        # func might come in handy in future
        residues_groups=residues_groups(site_type, modified_residues),
        **kwargs)
Example #26
0
    def test_counting(self):
        """Check counting and gathering of sites and mutations by motifs.

        A custom motifs database for a made-up 'xation' site type is used.
        """

        motifs_db = {
            # xation happens whenever there is X which is not preceded with or followed by another X
            'xation': {
                'canonical': '.{6}[^X]X[^X].{6}',
                'non-canonical': 'XXY'
            }
        }

        p = Protein(refseq='NM_007', id=1, sequence='_X_X_______X________XXY')

        # the list itself is not used later on, but the mutations
        # are created with the protein `p` given, which is added
        # to the session and committed below
        mutations = [
            Mutation(protein=p, position=1, alt='X'),  # proximal, breaking
            Mutation(protein=p, position=1, alt='o'),  # proximal, non-breaking
            Mutation(protein=p, position=2, alt='Y'),  # direct, breaking
            Mutation(protein=p, position=3,
                     alt='X'),  # proximal for two sites, breaking
        ]

        xation = SiteType(name='xation')

        canonical_sites = [
            Site(protein=p, position=2,
                 types={xation}),  # canonical, seriously mutated and broken
            Site(protein=p, position=4, types={xation}),  # canonical, mutated
            Site(protein=p, position=12,
                 types={xation}),  # canonical, not mutated
        ]
        other_sites = [
            Site(protein=p, position=22,
                 types={xation}),  # non-canonical motif, not mutated
        ]
        all_sites = canonical_sites + other_sites

        db.session.add(p)
        db.session.commit()

        counter = MotifsCounter(xation, motifs_db=motifs_db)
        counts = counter.count_muts_and_sites(Mutation.query, Site.query)

        # expected numbers follow from the annotations
        # of mutations and sites given above
        assert counts.muts_around_sites_with_motif['canonical'] == 4
        assert counts.muts_breaking_sites_motif['canonical'] == 3
        assert counts.sites_with_broken_motif['canonical'] == 2
        assert counts.sites_with_motif['canonical'] == len(canonical_sites)

        assert counts.sites_with_broken_motif['non-canonical'] == 0
        assert counts.muts_around_sites_with_motif['non-canonical'] == 0

        x_motifs = motifs_db['xation']
        selection = select_sites_with_motifs(Site.query, x_motifs)

        assert selection['canonical'] == set(canonical_sites)
        # the selection is expected to be the same for a query and for a list
        assert select_sites_with_motifs(all_sites, x_motifs) == selection

        data = counter.gather_muts_and_sites(Mutation.query, Site.query)

        # the gathered objects should correspond to the counts checked above
        assert data.sites_with_broken_motif['canonical'] == {
            canonical_sites[0], canonical_sites[1]
        }
        assert data.sites_with_motif['canonical'] == set(canonical_sites)
Example #27
0
            comments.append(
                f'{breaking_muts} mutations breaking this motif '
                f'({muts_percentage:.2f}% of PTM muts close to that motif).'
                f'<br>'
                f'{broken_sites} sites with broken motif ({sites_percentage:.2f}% of sites with this motif).'
                if broken_sites else None)

        data[motif] = genes_ordered, y, comments
    return data


# combinations of mutation sources: each of the two sources alone and both together
sources_combinations = [[InheritedMutation], [InheritedMutation, MC3Mutation],
                        [MC3Mutation]]

# cases for the glycosylation site type, using the sources combinations above
# and two counting methods; the 'product' mode is set on the result.
# NOTE: a SiteType object is created here, when the module is executed.
motifs_cases = cases(site_type=[SiteType(name='glycosylation')],
                     sources=sources_combinations,
                     count_method=['occurrences',
                                   'distinct']).set_mode('product')


def calc_motifs(sources, site_type, count_method, y_axis: str):
    """Count mutations per gene for the given sources and site type.

    The keyword arguments forwarded to `count_by_sources` depend on the inputs:
    `occurrences_in` is passed only when `count_method` is 'occurrences', and
    `intersection` only when more than one source is given.

    NOTE(review): in the visible code `y_axis` is unused and the result is only
    bound to a local name - the remainder of this function may be missing.
    """
    options = {'by_genes': True}

    if count_method == 'occurrences':
        options['occurrences_in'] = sources

    if len(sources) > 1:
        options['intersection'] = sources

    counts_by_gene = count_by_sources(sources, site_type, **options)
Example #28
0
    def test_mutation(self):
        """Query the chromosome mutation endpoint for a novel and a known mutation."""
        methylated_site = Site(position=13,
                               types={SiteType(name='methylation')})
        protein = Protein(
            refseq='NM_007',
            id=1,
            sites=[methylated_site],
            sequence='A' * 15,
            gene=Gene(name='SomeGene'),
        )
        db.session.add(protein)

        from database import bdb

        # amino acid position -> genomic position (chromosome 20)
        genomic_positions = {13: 14370, 15: 14376}
        mutations = {}

        for aa_pos, dna_pos in genomic_positions.items():
            mutation = Mutation(protein=protein, position=aa_pos, alt='V')
            mutations[aa_pos] = mutation
            bdb.add_genomic_mut('20', dna_pos, 'G', 'A', mutation, is_ptm=True)

        url_template = '/chromosome/mutation/{chrom}/{pos}/{ref}/{alt}'

        # --- a mutation without any dataset annotations (novel) ---
        novel_url = url_template.format(chrom='chr20',
                                        pos=14370,
                                        ref='G',
                                        alt='A')
        response = self.client.get(novel_url)

        expected_site = {
            'kinases': [],
            'position': 13,
            'residue': 'A',
            'kinase_groups': [],
            'type': 'methylation',
        }
        expected_novel = {
            'alt': 'V',
            'gene': 'SomeGene',
            'in_datasets': {},
            'pos': 13,
            'ptm_impact': 'direct',
            'cnt_ptm': 1,
            'closest_sites': ['13 A'],
            'protein': 'NM_007',
            'sites': [expected_site],
            'ref': 'A',
        }

        assert response.status_code == 200
        assert response.json == [expected_novel]

        # --- a mutation present in MC3 and ESP6500 (known) ---
        known = mutations[15]
        breast_cancer = Cancer(name='Breast invasive carcinoma', code='BRCA')
        mc3 = MC3Mutation(mutation=known, cancer=breast_cancer, count=1)
        esp = ExomeSequencingMutation(mutation=known,
                                      maf_all=0.02,
                                      maf_aa=0.02)

        db.session.add_all([known, mc3, esp])
        db.session.commit()

        mutation_a15v_query = url_template.format(chrom='chr20',
                                                  pos=14376,
                                                  ref='G',
                                                  alt='A')
        response = self.client.get(mutation_a15v_query)

        mc3_meta = {
            'Cancers': [{
                'Cancer': 'Breast invasive carcinoma',
                'Value': 1
            }]
        }
        esp_meta = {'MAF': 0.02, 'MAF AA': 0.02, 'MAF EA': None}
        metadata = {'MC3': mc3_meta, 'ESP6500': esp_meta}

        assert response.json[0]['in_datasets'] == metadata

        expected_values = {'MC3': 1, 'ESP6500': 0.02}

        # restricting the sources filter to one dataset should return the
        # metadata (and the value) of that dataset only
        sources_filter = '?filters=Mutation.sources:in:'
        for source, meta in metadata.items():
            response = self.client.get(mutation_a15v_query + sources_filter +
                                       source)
            record = response.json[0]
            assert record['in_datasets'] == {source: meta}
            assert record['value'] == expected_values[source]

        # dataset-specific filters: (query string, whether results are expected)
        filtered_queries = [
            ('?filters=Mutation.sources:in:MC3;Mutation.mc3_cancer_code:in:BRCA',
             True),
            ('?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:African American',
             True),
            ('?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:European American',
             False),
        ]
        for query_string, expect_results in filtered_queries:
            response = self.client.get(mutation_a15v_query + query_string)
            assert bool(response.json) is expect_results
Example #29
0
def create_network():
    """Store a small test network in the database.

    The fixture consists of a test protein with two phosphorylation sites, a
    kinase assigned to the first site (itself mutated and targeted by an
    approved drug), a second kinase referenced only by a MIMP 'gain' entry,
    and a protein mutation carrying MC3 and MIMP annotations. Everything is
    committed and the cached filter queries are reloaded afterwards.
    """
    protein = create_test_protein()
    ovarian_cancer = Cancer(name='Ovarian', code='OV')

    known_interactor_of_x = create_test_kinase('Kinase Y', 'NM_0009')
    known_interactor_of_x.protein.mutations = [
        Mutation(position=1,
                 alt='T',
                 meta_MC3=[MC3Mutation(cancer=ovarian_cancer)])
    ]

    # by default only approved drugs are shown
    approved = DrugGroup(name='approved')
    drug = Drug(name='Drug targeting ' + known_interactor_of_x.name,
                drug_bank_id='DB01',
                target_genes=[known_interactor_of_x.protein.gene],
                groups={approved})

    kinase_group = KinaseGroup(name='Group of kinases')
    phosphorylation = SiteType(name='phosphorylation')

    site_with_kinase = Site(position=1,
                            types={phosphorylation},
                            residue='T',
                            kinases={known_interactor_of_x},
                            kinase_groups={kinase_group})
    site_with_group_only = Site(position=2,
                                types={phosphorylation},
                                residue='R',
                                kinase_groups={kinase_group})
    protein.sites = [site_with_kinase, site_with_group_only]

    predicted_interactor = create_test_kinase('Kinase Z', 'NM_0002')

    # (kinase whose PWM is referenced, effect recorded for that kinase)
    mimp_effects = (
        (known_interactor_of_x, 'loss'),
        (predicted_interactor, 'gain'),
    )
    protein.mutations = [
        Mutation(position=2,
                 alt='T',
                 meta_MC3=[MC3Mutation(cancer=ovarian_cancer)],
                 meta_MIMP=[
                     MIMPMutation(pwm=kinase.name,
                                  effect=effect,
                                  site=site_with_kinase,
                                  probability=0.1,
                                  position_in_motif=1)
                     for kinase, effect in mimp_effects
                 ])
    ]

    db.session.add_all([protein, drug, predicted_interactor])
    db.session.commit()

    # a new cancer was added, so the cached queries have to be reloaded
    # (this should not happen during normal app usage)
    from website.views.filters import cached_queries
    cached_queries.reload()
Example #30
0
def ptm_on_random(source=MC3Mutation,
                  site_type='glycosylation',
                  same_proteins=False,
                  only_preferred=True,
                  mode='occurrences',
                  repeats=10000,
                  ptm_proteins=False,
                  same_ptm_proteins=False,
                  exclude_genes=None,
                  mutation_filter=None,
                  sample_ptm_muts=True):
    """Compare frequencies of PTM mutations of given type with random proteome mutations

    from protein sequence regions of the same size as analysed PTM regions.

    The PTM regions are the unions (per protein) of +/- 7 residue windows
    around sites of the requested type. The background is built by drawing,
    `repeats` times, as many positions as the total PTM region size from an
    array covering all positions of the background proteins.

    Args:
        source: mutation details model queried together with `Mutation`;
            its `count` attribute is used in 'occurrences' mode.
        site_type: name of the `SiteType` to analyse.
        same_proteins: draw the background only from proteins having sites of
            the analysed type; mutually exclusive with `ptm_proteins`.
        only_preferred: restrict proteins to `Protein.is_preferred_isoform`.
        mode: 'occurrences' (sum the `count` of mutation details) or
            'distinct' (each mutation row counts once).
        repeats: number of random draws.
        ptm_proteins: join the background proteins with `Protein.sites`.
        same_ptm_proteins: additionally filter the background proteins by the
            analysed site type; requires `ptm_proteins`.
        exclude_genes: names of genes whose proteins are left out of both the
            PTM mutations and the background.
        mutation_filter: optional filter expression applied to the mutation
            queries.
        sample_ptm_muts: when true the PTM mutation count is the mean of
            `repeats` draws from the PTM regions; otherwise it is counted
            directly from the query.

    Returns:
        A tuple `(ptm_counts, counts, region_size, p_value)`, where `p_value`
        is the fraction of background draws exceeding the PTM mutation count.
    """
    # numpy's sum is aliased so that the builtin `sum` stays available for
    # generators and lists (passing a generator to numpy.sum is deprecated)
    from numpy import sum as array_sum
    from numpy import zeros
    from numpy.random import choice

    assert mode in {'distinct', 'occurrences'}
    distinct = mode == 'distinct'

    assert not (same_proteins and ptm_proteins)
    assert not (same_ptm_proteins and not ptm_proteins)

    def measure(x):
        """See https://github.com/taschini/pyinterval/issues/2"""
        return int(fpu.up(lambda: sum((c.sup - c.inf for c in x), 0)))

    site_type = SiteType.query.filter_by(name=site_type).one()
    only_preferred = Protein.is_preferred_isoform if only_preferred else True

    # all muts

    all_muts = defaultdict(lambda: defaultdict(int))
    q = (db.session.query(
        source, Mutation).select_from(source).join(Mutation).filter(
            mutation_filter if mutation_filter is not None else True).join(
                Protein).filter(only_preferred))

    for mutation_details, mutation in tqdm(q, total=q.count()):
        # a position beyond the protein end would later be written into the
        # array slice of another protein, so skip it in both counting modes
        if mutation.position > mutation.protein.length:
            print(f'Faulty mutation: {mutation}')
            continue
        all_muts[mutation.protein][mutation.position] += (
            1 if distinct else mutation_details.count)

    # region size

    glyco_sequence_region_size = 0

    intervals_by_protein = defaultdict(interval)

    sites = (Site.query.filter(
        SiteType.fuzzy_filter(site_type)).join(Protein).filter(only_preferred))

    for site in tqdm(sites, total=sites.count()):
        # +/- 7 residues around the site, clipped to the protein sequence
        intervals_by_protein[site.protein] |= interval[
            max(site.position - 7, 0),
            min(site.position + 7, site.protein.length)]

    for sequence_interval in intervals_by_protein.values():
        glyco_sequence_region_size += measure(sequence_interval)

    # ptm muts

    ptm_muts = (db.session.query(
        source, Mutation
    ).select_from(source).join(Mutation).join(
        Mutation.affected_sites
    ).filter(mutation_filter if mutation_filter is not None else True).filter(
        SiteType.fuzzy_filter(site_type)).join(Protein).filter(only_preferred))

    if exclude_genes:
        exclude_proteins = Protein.query.select_from(Gene).join(
            Gene.isoforms).filter(Gene.name.in_(exclude_genes)).all()
        ptm_muts = ptm_muts.filter(
            ~Protein.id.in_([p.id for p in exclude_proteins]))

    ptm_muts = ptm_muts.group_by(source)

    if sample_ptm_muts:
        ptm_muts_by_protein = defaultdict(list)
        for mutation_details, mutation in ptm_muts:
            ptm_muts_by_protein[mutation.protein].append(
                (mutation_details, mutation))

        # one cell per residue of the concatenated PTM regions
        ptm_mutations_array = zeros(glyco_sequence_region_size)
        pos = 0

        for protein, protein_interval in tqdm(intervals_by_protein.items(),
                                              total=len(intervals_by_protein)):
            for mutation_details, mutation in ptm_muts_by_protein[protein]:
                # offset of the mutation within the protein's PTM regions:
                # sizes of the preceding sub-intervals plus the position
                # inside the sub-interval containing the mutation
                offset = 0
                for subinterval in protein_interval.components:
                    if mutation.position in subinterval:
                        offset += mutation.position - 1 - int(
                            subinterval[0].inf)  # position in interval
                        break
                    offset += measure(subinterval)

                if distinct:
                    ptm_mutations_array[pos + offset] += 1
                else:
                    ptm_mutations_array[pos + offset] += mutation_details.count
            pos += measure(protein_interval)

        ptm_counts = [
            array_sum(
                choice(ptm_mutations_array, size=glyco_sequence_region_size))
            for _ in tqdm(range(repeats), total=repeats)
        ]
        print(Series(ptm_counts).describe())
        ptm_muts_count = mean(ptm_counts)
    else:
        if distinct:
            ptm_muts_count = ptm_muts.count()
        else:
            ptm_muts_count = 0
            # the occurrences are stored on the source details row, not on
            # the Mutation row (the rows are (details, mutation) pairs)
            for mutation_details, _mutation in ptm_muts:
                assert mutation_details.count != 0
                ptm_muts_count += mutation_details.count

            assert ptm_muts_count >= ptm_muts.count()
        ptm_counts = [ptm_muts_count]

    ptm_ratio = ptm_muts_count / glyco_sequence_region_size

    if same_proteins:
        proteins = list(intervals_by_protein.keys())
    else:
        proteins = Protein.query.filter(only_preferred)
        if ptm_proteins:
            proteins = proteins.join(Protein.sites)
        if same_ptm_proteins:
            proteins = proteins.filter(SiteType.fuzzy_filter(site_type))
        proteins = proteins.all()

    proteins = [
        protein for protein in proteins
        if not exclude_genes or protein.gene.name not in exclude_genes
    ]

    weights = [p.length for p in proteins]

    # one cell per residue of the concatenated background proteins
    mutations_array = zeros(sum(weights))
    pos = 0

    for protein in tqdm(proteins):
        for position, count in all_muts[protein].items():
            try:
                mutations_array[pos + position - 1] = count
            except Exception:
                # report the offending coordinates before propagating
                print(protein, pos, position)
                raise
        pos += protein.length

    counts = [
        array_sum(choice(mutations_array, size=glyco_sequence_region_size))
        for _ in tqdm(range(repeats), total=repeats)
    ]

    p_value = sum(1 for count in counts if count > ptm_muts_count) / repeats
    count_of_sampled_muts = mean(counts)
    random_ratio = count_of_sampled_muts / glyco_sequence_region_size

    explanation = '(only the same proteins)' if same_proteins else ''

    print(
        f'count of {site_type.name} mutations: {ptm_muts_count},\n'
        f'count of random mutations from protein sequence regions of the same size: {count_of_sampled_muts}'
        f' {explanation}.')
    print(f'region size: {glyco_sequence_region_size}; source: {source}')
    print(f'frequency of {site_type.name} mutations: {ptm_ratio * 100}%,\n'
          f'frequency of random mutations: {random_ratio * 100}%.')
    print(f'p-value = {p_value} (permutation test, {repeats} repeats)')
    print('Permutation test values:')
    print(Series(counts).describe())

    return ptm_counts, counts, glyco_sequence_region_size, p_value