Exemple #1
0
 def glycosylations_without_subtype_ratio(self):
     """Return the share of glycosylation sites not counted as having a subtype.

     Computed as ``(all - with_subtype) / all`` where ``all`` is the number of
     sites matched by ``SiteType.fuzzy_filter`` for the 'glycosylation' type
     and ``with_subtype`` is ``self.glycosylations_with_subtype()``.

     Raises:
         ZeroDivisionError: when no site matches the glycosylation filter.
     """
     from models import Site, SiteType

     base_type = SiteType.query.filter_by(name='glycosylation').one()
     matching_sites = Site.query.filter(SiteType.fuzzy_filter(base_type))
     total = matching_sites.count()
     with_subtype = self.glycosylations_with_subtype()
     return (total - with_subtype) / total
Exemple #2
0
def ptm_muts_of_gene(
    path_template='exported/{site_type}_muts_of_{gene}_-_{protein}.tsv', gene='EGFR',
    site_type='glycosylation', mutation_source='mc3', to_csv=True, show_progress=False, **kwargs
):
    """Export mutations of one gene which affect sites of the given type.

    Args:
        path_template: format string for the output path; receives
            ``site_type``, ``gene`` and ``protein`` (the isoform's refseq)
        gene: name of the gene; its preferred isoform is analysed
        site_type: name of the site type (matched with ``SiteType.fuzzy_filter``)
        mutation_source: key of the importer in ``MutationImportManager().importers``
        to_csv: whether to write the data frame as a tab-separated file
        show_progress: forwarded to the importer's ``export_to_df``
        **kwargs: forwarded to the importer's constructor

    Returns:
        whatever the importer's ``export_to_df`` returned (the exported mutations)
    """
    importers = MutationImportManager().importers
    importer = importers[mutation_source](**kwargs)

    # from here on the names refer to database objects, not to their names
    site_type = SiteType.query.filter_by(name=site_type).one()
    gene = Gene.query.filter_by(name=gene).one()
    protein = gene.preferred_isoform

    affects_site_of_type = Mutation.affected_sites.any(
        SiteType.fuzzy_filter(site_type)
    )
    in_this_protein = Mutation.protein_id == protein.id

    mutations = importer.export_to_df(
        mutation_filter=and_(affects_site_of_type, in_this_protein),
        protein_filter=Protein.id == protein.id,
        show_progress=show_progress
    )

    # the path is formatted regardless of to_csv, so a malformed template
    # fails in both cases
    path = path_template.format(
        protein=protein.refseq,
        gene=gene.name,
        site_type=site_type.name
    )
    if to_csv:
        mutations.to_csv(path, sep='\t', index=False)

    return mutations
Exemple #3
0
def most_mutated_sites(sources: List[MutationSource],
                       site_type: SiteType = None,
                       limit=25,
                       intersection=True,
                       exclusive=None,
                       mutation_filter=None):
    """Build a query for sites ranked by the summed mutation counts.

    Sources must have the same value_type (counts/frequencies).

    Args:
        sources: mutation sources; each is joined to Mutation
        site_type: if given, restricts sites with ``SiteType.fuzzy_filter``
        limit: maximal number of rows in the result
        intersection: inner-join all sources if True, outer-join otherwise
        exclusive: sources to exclude (applied only without intersection;
            cannot be combined with ``intersection``)
        mutation_filter: optional additional SQLAlchemy filter

    Returns:
        a limited query yielding (Site, mutations_count) rows,
        ordered by mutations_count, descending
    """

    assert not (intersection and exclusive)

    counts = prepare_for_summing(sources)
    labelled_counts = [
        count.label(f'count_{i}')
        for i, count in enumerate(counts)
    ]

    per_site = db.session.query(Site, *labelled_counts).select_from(Mutation)

    for source in sources:
        if intersection:
            per_site = per_site.join(source)
        else:
            per_site = per_site.outerjoin(source)

    if not intersection and exclusive:
        per_site = per_site.filter(~Mutation.in_sources(*exclusive))

    if mutation_filter is not None:
        per_site = per_site.filter(mutation_filter)

    per_site = per_site.join(Mutation.affected_sites)
    per_site = per_site.filter(Site.protein.has(Protein.is_preferred_isoform))

    if site_type:
        per_site = per_site.filter(SiteType.fuzzy_filter(site_type, join=True))

    per_site = per_site.group_by(Site).having(and_(*counts))
    per_site = per_site.subquery()

    # sum of the per-source count columns of the subquery
    count_columns = [
        getattr(per_site.c, f'count_{i}')
        for i in range(len(counts))
    ]
    mutations_count = reduce(operator.add, count_columns)
    mutations_count = mutations_count.label('mutations_count')

    ranked = db.session.query(aliased(Site, per_site), mutations_count)
    ranked = ranked.order_by(desc(mutations_count))

    return ranked.limit(limit)
Exemple #4
0
def site_type_filter_from_str(query, site=Site):
    """Translate a textual site type selector into a site filter.

    Args:
        query: 'any', a site type name, or a site type name prefixed
            with 'not' and a single separator character (e.g. 'not x')
        site: the site entity passed to ``SiteType.fuzzy_filter``

    Returns:
        None for 'any'; otherwise the (possibly negated) fuzzy filter
        of the named site type.

    NOTE(review): every string starting with 'not' is treated as a negation
    and its first four characters are dropped, whatever the fourth one is.
    """
    if query == 'any':
        return None

    negate = query.startswith('not')
    type_name = query[4:] if negate else query

    site_type = SiteType.query.filter_by(name=type_name).one()
    site_filter = SiteType.fuzzy_filter(site_type, join=True, site=site)

    return ~site_filter if negate else site_filter
Exemple #5
0
def are_glycosylation_sites_mutated_more_often(source_name: str,
                                               disordered=None,
                                               alternative='greater'):
    """Run Fisher's exact test: mutated vs not mutated, glycosylation vs other sites.

    Args:
        source_name: key of the mutation source in ``source_manager.source_by_name``
        disordered: forwarded to ``count_mutated_sites``
        alternative: forwarded to ``fisher_exact``

    Returns:
        (oddsratio, pvalue) as unpacked from ``fisher_exact``
    """
    from stats.table import count_mutated_sites

    glycosylation = SiteType.query.filter_by(name='glycosylation').one()
    # fetched only for the informative message below
    non_glycosylation = SiteType.query.filter(
        ~SiteType.name.contains('glycosylation')
    ).all()

    print(f'Comparing {glycosylation} against {non_glycosylation}')

    source = source_manager.source_by_name[source_name]
    count = partial(
        count_mutated_sites,
        model=source,
        only_primary=True,
        disordered=disordered
    )

    glyco_filter = SiteType.fuzzy_filter(glycosylation, join=True)

    # sites having at least one type whose name does not contain 'glycosylation'
    glycosylation_types = SiteType.query.filter(
        SiteType.name.contains('glycosylation')
    ).all()
    glycosylation_ids = [site_type.id for site_type in glycosylation_types]
    non_glyco_filter = Site.types.any(~SiteType.id.in_(glycosylation_ids))

    mutated_glyco = count(custom_filter=glyco_filter)
    mutated_other = count(custom_filter=non_glyco_filter)

    total_glyco = Site.query.filter(glyco_filter).count()
    total_other = Site.query.filter(non_glyco_filter).count()

    #         mutated | not_mutated
    #  glyc |
    # other |
    glyco_row = [mutated_glyco, total_glyco - mutated_glyco]
    other_row = [mutated_other, total_other - mutated_other]
    contingency_table = [glyco_row, other_row]

    print(contingency_table)
    oddsratio, pvalue = fisher_exact(
        contingency_table,
        alternative=alternative
    )
    print(source_name, oddsratio, pvalue)
    return oddsratio, pvalue
Exemple #6
0
def enrichment_of_ptm_genes(reference_set,
                            site_type_name: str,
                            only_mutated_sites=False):
    """Run ``genes_enrichment`` for genes having sites of the given type.

    Only sites of the preferred isoform of each gene are considered.

    Args:
        reference_set: passed to ``genes_enrichment`` as the second argument
        site_type_name: name of the site type (matched with
            ``SiteType.fuzzy_filter``)
        only_mutated_sites:
            whether only genes with mutated sites should be considered,
            True, False or an SQLAlchemy filter, e.g. Mutation.in_sources(MC3Mutation)

    Returns:
        the result of ``genes_enrichment(observed_genes, reference_set)``
    """
    site_type = SiteType.query.filter_by(name=site_type_name).one()

    genes_query = Gene.query.join(Gene.preferred_isoform)
    genes_query = genes_query.join(Protein.sites)
    genes_query = genes_query.filter(SiteType.fuzzy_filter(site_type))

    if only_mutated_sites is not False:
        # True is passed to filter() as-is; a filter expression narrows the mutations
        genes_query = genes_query.join(Site.mutations)
        genes_query = genes_query.filter(only_mutated_sites)

    return genes_enrichment(set(genes_query), reference_set)
Exemple #7
0
def are_glycosylation_sites_enriched(source_name: str,
                                     population_name: str,
                                     disordered=None,
                                     alternative='greater'):
    """Run Fisher's exact test on mutated sites: glycosylation vs other, source vs population.

    Args:
        source_name: key of the first mutation source in
            ``source_manager.source_by_name`` (first row of the table)
        population_name: key of the second mutation source (second row)
        disordered: forwarded to ``count_mutated_sites``
        alternative: forwarded to ``fisher_exact``

    Returns:
        (oddsratio, pvalue) as unpacked from ``fisher_exact``
    """
    from stats.table import count_mutated_sites

    glycosylation = SiteType.query.filter_by(name='glycosylation').one()
    glyco_filter = SiteType.fuzzy_filter(glycosylation, join=True)

    # sites having at least one type whose name does not contain 'glycosylation'
    glycosylation_types = SiteType.query.filter(
        SiteType.name.contains('glycosylation')
    ).all()
    glycosylation_ids = [site_type.id for site_type in glycosylation_types]
    non_glyco_filter = Site.types.any(~SiteType.id.in_(glycosylation_ids))

    source = source_manager.source_by_name[source_name]
    population = source_manager.source_by_name[population_name]

    count = partial(
        count_mutated_sites,
        only_primary=True,
        disordered=disordered
    )

    #         mutated glyc | mutated not glyc
    # cancer |
    # popula |
    contingency_table = [
        [
            count(custom_filter=site_filter, model=model)
            for site_filter in (glyco_filter, non_glyco_filter)
        ]
        for model in (source, population)
    ]

    print(contingency_table)
    oddsratio, pvalue = fisher_exact(
        contingency_table,
        alternative=alternative
    )
    print(source_name, population_name, oddsratio, pvalue)
    return oddsratio, pvalue
Exemple #8
0
def get_site_filters(exclude=None, glycosylation='together'):
    """Yields (site type name, type filter) tuples.

    Args:
        exclude: site types to exclude
        glycosylation: 'together', 'only' or 'separate':
            - 'together': glycosylation subtypes are skipped and the generic
              'glycosylation' type is yielded with a fuzzy filter, labelled
              'glycosylation (all subtypes)',
            - 'only': only types with 'glycosylation' in the name are yielded;
              the generic one is labelled 'glycosylation (unknown subtype)',
            - any other value: each type is yielded under its own name.
    """
    together = glycosylation == 'together'
    only_glyco = glycosylation == 'only'

    for site_type in tqdm(SiteType.query, total=SiteType.query.count()):
        name = site_type.name
        is_glyco = 'glycosylation' in name
        is_generic_glyco = name == 'glycosylation'

        label = name

        if together:
            if is_generic_glyco:
                label = 'glycosylation (all subtypes)'
            elif is_glyco:
                # subtypes are covered by the fuzzy filter of the generic type
                continue
        elif only_glyco:
            if not is_glyco:
                continue
            if is_generic_glyco:
                label = 'glycosylation (unknown subtype)'

        # exclusion is checked against the original name, not the label
        if exclude and name in exclude:
            continue

        if together:
            yield label, SiteType.fuzzy_filter(site_type, join=True)
        else:
            yield label, Site.types.contains(site_type)
Exemple #9
0
def ptm_on_random(source=MC3Mutation,
                  site_type='glycosylation',
                  same_proteins=False,
                  only_preferred=True,
                  mode='occurrences',
                  repeats=10000,
                  ptm_proteins=False,
                  same_ptm_proteins=False,
                  exclude_genes=None,
                  mutation_filter=None,
                  sample_ptm_muts=True):
    """Compare frequencies of PTM mutations of given type with random proteome mutations

    from protein sequence regions of the same size as analysed PTM regions.

    Args:
        source: mutation details model; joined with Mutation, its ``count``
            is used in the 'occurrences' mode
        site_type: name of the site type (matched with ``SiteType.fuzzy_filter``)
        same_proteins: draw the random mutations only from the proteins having
            sites of the analysed type; cannot be combined with ``ptm_proteins``
        only_preferred: restrict everything to ``Protein.is_preferred_isoform``
        mode: 'distinct' (each mutation counts as one) or 'occurrences'
            (each mutation counts ``count`` times)
        repeats: number of random draws
        ptm_proteins: draw the random mutations only from proteins joined
            with ``Protein.sites``
        same_ptm_proteins: additionally filter those proteins by the analysed
            site type; requires ``ptm_proteins``
        exclude_genes: names of genes to be excluded from both the PTM
            mutations and the random background
        mutation_filter: optional SQLAlchemy filter for mutations
        sample_ptm_muts: if True the PTM mutations count is the mean of
            ``repeats`` random draws from the PTM regions, otherwise it is
            counted directly

    Returns:
        tuple: (PTM mutation counts, counts from random draws,
        size of the PTM sequence region, p-value)
    """
    # numpy's sum is imported under an alias, so that the builtin sum remains
    # available for generators (calling numpy.sum on a generator is deprecated)
    from numpy import sum as array_sum
    from numpy import zeros
    from numpy.random import choice

    assert mode in {'distinct', 'occurrences'}
    distinct = mode == 'distinct'

    assert not (same_proteins and ptm_proteins)
    assert not (same_ptm_proteins and not ptm_proteins)

    def measure(x):
        """See https://github.com/taschini/pyinterval/issues/2"""
        return int(fpu.up(lambda: sum((c.sup - c.inf for c in x), 0)))

    site_type = SiteType.query.filter_by(name=site_type).one()
    only_preferred = Protein.is_preferred_isoform if only_preferred else True

    # all muts

    all_muts = defaultdict(lambda: defaultdict(int))
    q = (db.session.query(
        source, Mutation).select_from(source).join(Mutation).filter(
            mutation_filter if mutation_filter is not None else True).join(
                Protein).filter(only_preferred))

    for mutation_details, mutation in tqdm(q, total=q.count()):
        # positions beyond the protein end would land in the slice of another
        # protein in the flat array built below, so skip them in both modes
        if mutation.position > mutation.protein.length:
            print(f'Faulty mutation: {mutation}')
            continue
        all_muts[mutation.protein][mutation.position] += (
            1 if distinct else mutation_details.count)

    # region size

    glyco_sequence_region_size = 0

    intervals_by_protein = defaultdict(interval)

    sites = (Site.query.filter(
        SiteType.fuzzy_filter(site_type)).join(Protein).filter(only_preferred))

    for site in tqdm(sites, total=sites.count()):
        # 7 positions on each side of the site, clipped to the protein
        intervals_by_protein[site.protein] |= interval[
            max(site.position - 7, 0),
            min(site.position + 7, site.protein.length)]

    for sequence_interval in intervals_by_protein.values():
        glyco_sequence_region_size += measure(sequence_interval)

    # ptm muts

    ptm_muts = (db.session.query(
        source, Mutation
    ).select_from(source).join(Mutation).join(
        Mutation.affected_sites
    ).filter(mutation_filter if mutation_filter is not None else True).filter(
        SiteType.fuzzy_filter(site_type)).join(Protein).filter(only_preferred))

    if exclude_genes:
        exclude_proteins = Protein.query.select_from(Gene).join(
            Gene.isoforms).filter(Gene.name.in_(exclude_genes)).all()
        ptm_muts = ptm_muts.filter(
            ~Protein.id.in_([p.id for p in exclude_proteins]))

    ptm_muts = ptm_muts.group_by(source)

    if sample_ptm_muts:
        ptm_muts_by_protein = defaultdict(list)
        for mutation_details, mutation in ptm_muts:
            ptm_muts_by_protein[mutation.protein].append(
                (mutation_details, mutation))

        # flat array covering the PTM regions of all proteins, one after another
        ptm_mutations_array = zeros(glyco_sequence_region_size)
        pos = 0

        for protein, protein_interval in tqdm(intervals_by_protein.items(),
                                              total=len(intervals_by_protein)):
            for mutation_details, mutation in ptm_muts_by_protein[protein]:
                p = 0
                for subinterval in protein_interval.components:
                    if mutation.position in subinterval:
                        p += mutation.position - 1 - int(
                            subinterval[0].inf)  # position in interval
                        break
                    p += measure(subinterval)

                if distinct:
                    ptm_mutations_array[pos + p] += 1
                else:
                    ptm_mutations_array[pos + p] += mutation_details.count
            pos += measure(protein_interval)

        ptm_counts = [
            array_sum(
                choice(ptm_mutations_array, size=glyco_sequence_region_size))
            for _ in tqdm(range(repeats), total=repeats)
        ]
        print(Series(ptm_counts).describe())
        ptm_muts_count = mean(ptm_counts)
    else:
        if distinct:
            ptm_muts_count = ptm_muts.count()
        else:
            ptm_muts_count = 0
            # the occurrences count lives on the source details row,
            # not on the Mutation object
            for mutation_details, _ in ptm_muts:
                assert mutation_details.count != 0
                ptm_muts_count += mutation_details.count

            assert ptm_muts_count >= ptm_muts.count()
        ptm_counts = [ptm_muts_count]

    ptm_ratio = ptm_muts_count / glyco_sequence_region_size

    if same_proteins:
        proteins = list(intervals_by_protein.keys())
    else:
        proteins = Protein.query.filter(only_preferred)
        if ptm_proteins:
            proteins = proteins.join(Protein.sites)
        if same_ptm_proteins:
            proteins = proteins.filter(SiteType.fuzzy_filter(site_type))
        proteins = proteins.all()

    proteins = [
        protein for protein in proteins
        if not exclude_genes or protein.gene.name not in exclude_genes
    ]

    weights = [p.length for p in proteins]

    # flat array covering full sequences of all background proteins
    mutations_array = zeros(sum(weights))
    pos = 0

    for protein in tqdm(proteins):
        for position, count in all_muts[protein].items():
            try:
                mutations_array[pos + position - 1] = count
            except Exception:
                print(protein, pos, position)
                raise
        pos += protein.length

    counts = [
        array_sum(choice(mutations_array, size=glyco_sequence_region_size))
        for _ in tqdm(range(repeats), total=repeats)
    ]

    # fraction of random draws with more mutations than observed in PTM regions
    p_value = sum(1 for count in counts if count > ptm_muts_count) / repeats
    count_of_sampled_muts = mean(counts)
    random_ratio = count_of_sampled_muts / glyco_sequence_region_size

    explanation = '(only the same proteins)' if same_proteins else ''

    print(
        f'count of {site_type.name} mutations: {ptm_muts_count},\n'
        f'count of random mutations from protein sequence regions of the same size: {count_of_sampled_muts}'
        f' {explanation}.')
    print(f'region size: {glyco_sequence_region_size}; source: {source}')
    print(f'frequency of {site_type.name} mutations: {ptm_ratio * 100}%,\n'
          f'frequency of random mutations: {random_ratio * 100}%.')
    print(f'p-value = {p_value} (permutation test, {repeats} repeats)')
    print('Permutation test values:')
    print(Series(counts).describe())

    return ptm_counts, counts, glyco_sequence_region_size, p_value
def gather_ptm_muts_impacts(source: MutationSource,
                            site_type: SiteType,
                            limit_to_genes: List[str] = None,
                            occurrences=True,
                            limit_to_muts=False,
                            muts_filter=None):
    """Sum PTM-affecting mutations by impact type and by gene.

    Args:
        source: mutation source to gather mutations from
        site_type: PTM site type for which affecting mutations will be gathered
        limit_to_genes: list of gene names for which mutations of primary isoforms will be gathered
        occurrences: whether to count occurrences or distinct mutations
        limit_to_muts: False, or a data frame defining mutations and counts
            (like the AD data frame), with `position`, `mut_residue`,
            `isoform` and `count` columns;
            providing custom mutations lists overrides "occurrences" setting
        muts_filter: SQLAlchemy filter for mutations

    Returns:
        a dict mapping impact ('direct', 'motif-changing', 'proximal',
        'distal') to a mapping of gene name to the summed value;
        an empty dict if there are no known motifs for the site type.
    """

    try:
        motifs_counter = MotifsCounter(site_type, mode='change_of_motif')
    except NoKnownMotifs as error:
        warn(f'Impacts collection failed, due to: {error}')
        return {}

    sites = (Site.query.filter(SiteType.fuzzy_filter(
        site_type,
        join=True)).join(Protein).filter(Protein.is_preferred_isoform))

    def fuzzy_site_filter(sites):
        return [
            site for site in sites
            # matches 'O-glycosylation' for site_type 'glycosylation'
            if any(
                site_type.name in type_name for type_name in site.types_names)
        ]

    mutations_by_impact_by_gene = {
        # order matters
        'direct': defaultdict(int),
        'motif-changing': defaultdict(int),
        'proximal': defaultdict(int),
        'distal': defaultdict(int)
    }

    mutations = (Mutation.query.filter(
        Mutation.in_sources(source)).join(Protein).join(
            Gene, Gene.preferred_isoform_id == Protein.id))
    if muts_filter is not None:
        mutations = mutations.filter(muts_filter)

    motifs_data = motifs_counter.gather_muts_and_sites(mutations,
                                                       sites,
                                                       occurrences_in=[source])

    # mutations breaking any of the motifs, regardless of the motif name
    all_breaking_muts = set()
    for breaking_muts in motifs_data.muts_breaking_sites_motif.values():
        all_breaking_muts.update(breaking_muts)

    mutations = mutations.filter(
        Mutation.affected_sites.any(SiteType.fuzzy_filter(site_type,
                                                          join=True)))
    if limit_to_genes is not None:
        protein_rows = (db.session.query(
            Protein.id).select_from(Gene).join(Gene.preferred_isoform).filter(
                Gene.name.in_(limit_to_genes)).all())
        # the query yields one-element rows; in_() needs scalar ids
        proteins_ids = [protein_id for (protein_id,) in protein_rows]
        mutations = mutations.filter(Protein.id.in_(proteins_ids))

    mutations = mutations.with_entities(Gene.name, Mutation)

    if limit_to_muts is not False:
        # maps each Mutation object to the count given in the data frame
        muts = {
            Mutation.query.filter_by(position=mut.position,
                                     alt=mut.mut_residue,
                                     protein=Protein.query.filter_by(
                                         refseq=mut.isoform).one()).one():
            int(mut.count)
            for mut in limit_to_muts.itertuples(index=False)
        }

    for gene_name, mutation in tqdm(mutations, total=mutations.count()):

        if limit_to_muts is not False:
            if mutation not in muts:
                continue
            value = muts[mutation]
        else:
            value = mutation.sources_map[
                source.name].get_value() if occurrences else 1

        impact = mutation.impact_on_ptm(fuzzy_site_filter)
        # a motif-breaking mutation is reported as 'motif-changing',
        # unless it hits the site directly
        if impact != 'direct' and mutation in all_breaking_muts:
            mutations_by_impact_by_gene['motif-changing'][gene_name] += value
            continue
        assert impact != 'none'
        mutations_by_impact_by_gene[impact][gene_name] += value

    return mutations_by_impact_by_gene