def test_annotations(self):
    """Check that single-valued ``SET`` annotations end up on the parsed edge."""
    self.add_default_provenance()
    self.parser.parse_lines([
        'SET TestAnnotation1 = "A"',
        'SET TestAnnotation2 = "X"',
        'g(TESTNS:1) -> g(TESTNS:2)',
    ])

    source = Gene(namespace='TESTNS', name='1')
    target = Gene(namespace='TESTNS', name='2')

    # Exactly the two genes and the single edge between them should exist
    self.assertEqual(2, self.graph.number_of_nodes())
    for node in (source, target):
        self.assertIn(node, self.graph)
    self.assertEqual(1, self.parser.graph.number_of_edges())

    expected_annotations = {
        'TestAnnotation1': {'A': True},
        'TestAnnotation2': {'X': True},
    }
    expected_edge_data = {
        ANNOTATIONS: expected_annotations,
        EVIDENCE: test_evidence_text,
        CITATION: test_citation_dict,
    }
    self.assert_has_edge(source, target, **expected_edge_data)
def test_gene_fusion_missing_implicit(self):
    """Test serializing a gene fusion to BEL when no fusion ranges are given."""
    tmprss2 = Gene('HGNC', 'TMPRSS2')
    erg = Gene('HGNC', 'ERG')
    fusion = GeneFusion(tmprss2, erg)

    # Omitted ranges are expected to be rendered as "?" placeholders
    expected_bel = 'g(fus(HGNC:TMPRSS2, "?", HGNC:ERG, "?"))'
    self.assertEqual(expected_bel, fusion.as_bel())
def test_annotations_with_multilist(self):
    """Check that a mix of list-valued and single-valued annotations lands on the edge."""
    self.add_default_provenance()
    self.parser.parse_lines([
        'SET TestAnnotation1 = {"A","B"}',
        'SET TestAnnotation2 = "X"',
        'SET TestAnnotation3 = {"D","E"}',
        'g(TESTNS:1) -> g(TESTNS:2)',
    ])

    source = Gene(namespace='TESTNS', name='1')
    target = Gene(namespace='TESTNS', name='2')

    self.assertEqual(2, self.parser.graph.number_of_nodes())
    for node in (source, target):
        self.assertIn(node, self.graph)
    self.assertEqual(1, self.parser.graph.number_of_edges())

    expected_annotations = {
        'TestAnnotation1': {'A': True, 'B': True},
        'TestAnnotation2': {'X': True},
        'TestAnnotation3': {'D': True, 'E': True},
    }
    expected_edge_data = {
        ANNOTATIONS: expected_annotations,
        CITATION: test_citation_dict,
    }
    self.assert_has_edge(source, target, **expected_edge_data)
def test_fusion_specified(self, mock):
    """Reconstitute a gene fusion whose two partners both have enumerated ranges."""
    tmprss2 = Gene('HGNC', 'TMPRSS2')
    erg = Gene('HGNC', 'ERG')
    fusion = GeneFusion(
        tmprss2,
        erg,
        EnumeratedFusionRange('c', 1, 79),
        EnumeratedFusionRange('c', 312, 5034),
    )
    # NOTE(review): 1 and 0 are presumably the expected node and edge counts - confirm in _help_reconstitute
    self._help_reconstitute(fusion, 1, 0)
def test_GeneFusion(self):
    """Test serializing a gene fusion with explicit fusion ranges to BEL."""
    tmprss2 = Gene('HGNC', 'TMPRSS2')
    erg = Gene('HGNC', 'ERG')
    fusion = GeneFusion(
        tmprss2,
        erg,
        EnumeratedFusionRange('c', 1, 79),
        EnumeratedFusionRange('c', 312, 5034),
    )

    expected_bel = 'g(fus(HGNC:TMPRSS2, "c.1_79", HGNC:ERG, "c.312_5034"))'
    self.assertEqual(expected_bel, fusion.as_bel())
def test_single_variant(self):
    """Adding a gene with one variant should also add its parent and one connecting edge."""
    variant_gene = Gene('HGNC', 'AKT1', variants=Hgvs('p.Phe508del'))
    parent_gene = variant_gene.get_parent()

    self.graph.add_node_from_data(variant_gene)

    for expected_node in (variant_gene, parent_gene):
        self.assertIn(expected_node, self.graph)
    self.assertEqual(2, self.graph.number_of_nodes())
    self.assertEqual(1, self.graph.number_of_edges())
def test_gene_fusion_specified(self):
    """A gene fusion with enumerated ranges should stringify to its BEL term."""
    tmprss2 = Gene(namespace='HGNC', name='TMPRSS2')
    tmprss2_range = EnumeratedFusionRange('c', 1, 79)
    erg = Gene(namespace='HGNC', name='ERG')
    erg_range = EnumeratedFusionRange('c', 312, 5034)

    fusion = GeneFusion(
        partner_5p=tmprss2,
        range_5p=tmprss2_range,
        partner_3p=erg,
        range_3p=erg_range,
    )

    expected_bel = 'g(fus(HGNC:TMPRSS2, "c.1_79", HGNC:ERG, "c.312_5034"))'
    self.assertEqual(expected_bel, str(fusion))
def test_fusion(self):
    """Adding a gene fusion should create exactly one node and no edges."""
    node_data = GeneFusion(
        partner_5p=Gene('HGNC', 'TMPRSS2'),
        partner_3p=Gene('HGNC', 'ERG'),
        range_5p=EnumeratedFusionRange('c', 1, 79),
        range_3p=EnumeratedFusionRange('c', 312, 5034),
    )

    self.graph.add_node_from_data(node_data)

    self.assertIn(node_data, self.graph)
    # The assertions below expect the fusion partners not to be added as separate nodes
    self.assertEqual(1, self.graph.number_of_nodes())
    self.assertEqual(0, self.graph.number_of_edges())
def test_multiple_variants(self):
    """Adding a gene with two variants should add it, its parent, and one edge."""
    hgvs_variants = [Hgvs('p.Phe508del'), Hgvs('p.Phe509del')]
    variant_gene = Gene('HGNC', 'AKT1', variants=hgvs_variants)
    parent_gene = variant_gene.get_parent()

    self.graph.add_node_from_data(variant_gene)

    for expected_node in (variant_gene, parent_gene):
        self.assertIn(expected_node, self.graph)
    self.assertEqual(2, self.graph.number_of_nodes())
    self.assertEqual(1, self.graph.number_of_edges())
def test_annotations_with_multilist(self):
    """Check that three annotations (two list-valued) are tracked by the parser and stored on the only edge."""
    self.add_default_provenance()
    self.parser.parse_lines([
        'SET TestAnnotation1 = {"A","B"}',
        'SET TestAnnotation2 = "X"',
        'SET TestAnnotation3 = {"D","E"}',
        'g(TESTNS:1) -> g(TESTNS:2)',
    ])

    # State held by the control parser after the SET statements
    parsed_annotations = self.parser.control_parser.annotations
    self.assertEqual(3, len(parsed_annotations))
    for key in ('TestAnnotation1', 'TestAnnotation2', 'TestAnnotation3'):
        self.assertIn(key, parsed_annotations)

    source = Gene(namespace='TESTNS', name='1')
    target = Gene(namespace='TESTNS', name='2')

    self.assertEqual(2, self.parser.graph.number_of_nodes())
    for node in (source, target):
        self.assertIn(node, self.graph)
    self.assertEqual(1, self.parser.graph.number_of_edges())

    expected_edge_data = {
        RELATION: INCREASES,
        EVIDENCE: test_evidence_text,
        ANNOTATIONS: {
            'TestAnnotation1': {'A': True, 'B': True},
            'TestAnnotation2': {'X': True},
            'TestAnnotation3': {'D': True, 'E': True},
        },
        CITATION: test_citation_dict,
    }
    self.assert_has_edge(source, target, only=True, **expected_edge_data)
def test_regex_lookup(self, mock):  # FIXME this test needs to be put somewhere else
    """Test that nodes from a regular-expression namespace can be looked up by hash after insertion."""
    namespace_keyword = 'dbSNP'
    namespace_pattern = 'rs[0-9]+'

    graph = BELGraph(
        name='Regular Expression Test Graph',
        description='Help test regular expression namespaces',
        version='1.0.0',
    )
    graph.namespace_pattern[namespace_keyword] = namespace_pattern

    # (entry name, expected BEL term) for each SNP under test
    expected_snps = [
        ('rs1234', 'g(dbSNP:rs1234)'),
        ('rs1235', 'g(dbSNP:rs1235)'),
    ]

    snp_nodes = [
        Gene(namespace=namespace_keyword, name=snp_name)
        for snp_name, _ in expected_snps
    ]
    for snp_node in snp_nodes:
        graph.add_node_from_data(snp_node)

    # Hashes are read before the graph is handed to the manager
    snp_hashes = [snp_node.md5 for snp_node in snp_nodes]

    self.manager.insert_graph(graph)

    for (snp_name, snp_bel), snp_hash in zip(expected_snps, snp_hashes):
        lookup = self.manager.get_node_by_hash(snp_hash)
        self.assertIsNotNone(lookup)
        self.assertEqual('Gene', lookup.type)
        self.assertEqual(snp_bel, lookup.bel)
        self.assertEqual(snp_hash, lookup.md5)

        entry = lookup.namespace_entry
        self.assertIsNotNone(entry)
        self.assertEqual(snp_name, lookup.namespace_entry.name)
        self.assertEqual('dbSNP', lookup.namespace_entry.namespace.keyword)
        self.assertEqual(namespace_pattern, lookup.namespace_entry.namespace.pattern)
def test_gmod_default(self, mock):
    """Test a gene modification that uses the BEL default namespace."""
    namespace, name = n(), n()
    methylated_gene = Gene(
        namespace=namespace,
        name=name,
        variants=[GeneModification('Me')],
    )
    # NOTE(review): 2 and 1 are presumably the expected node and edge counts - confirm in _help_reconstitute
    self._help_reconstitute(methylated_gene, 2, 1)
def get_neurommsig_scores(
    graph: BELGraph,
    genes: List[Gene],
    annotation: str = 'Subgraph',
    ora_weight: Optional[float] = None,
    hub_weight: Optional[float] = None,
    top_percent: Optional[float] = None,
    topology_weight: Optional[float] = None,
    preprocess: bool = False,
    use_tqdm: bool = False,
    tqdm_kwargs: Optional[Mapping] = None,
) -> Optional[Mapping[str, float]]:
    """Preprocess the graph, stratify by the given annotation, then run the NeuroMMSig algorithm on each.

    :param graph: A BEL graph
    :param genes: A list of gene nodes. If every element is a string, the strings are
     wrapped as ``Gene('HGNC', <string>)`` nodes first.
    :param annotation: The annotation to use to stratify the graph to subgraphs
    :param ora_weight: The relative weight of the over-enrichment analysis score from
     :py:func:`neurommsig_gene_ora`. Passed through unchanged; per the original
     documentation the effective default is 1.0.
    :param hub_weight: The relative weight of the hub analysis score from :py:func:`neurommsig_hubs`.
     Passed through unchanged; per the original documentation the effective default is 1.0.
    :param top_percent: The percentage of top genes to use as hubs. Passed through unchanged;
     per the original documentation the effective default is 5% (0.05).
    :param topology_weight: The relative weight of the topological analysis score from
     :py:func:`neurommsig_topology`. Passed through unchanged; per the original
     documentation the effective default is 1.0.
    :param preprocess: If true, run ``neurommsig_graph_preprocessor`` on the graph first.
    :param use_tqdm: Forwarded to :func:`get_neurommsig_scores_prestratified`.
    :param tqdm_kwargs: Forwarded to :func:`get_neurommsig_scores_prestratified`.
    :return: A dictionary from {annotation value: NeuroMMSig composite score}, or ``None``
     if none of the genes are in the graph (which includes an empty ``genes`` list).

    Pre-processing steps (as listed in the original documentation):

    1. Infer the central dogma
    2. Collapse all proteins, RNAs and miRNAs to genes
    3. Collapse variants to genes
    """
    if preprocess:
        graph = neurommsig_graph_preprocessor.run(graph)

    # Callers may pass plain strings; promote them to HGNC gene nodes.
    if all(isinstance(gene, str) for gene in genes):
        genes = [Gene('HGNC', gene) for gene in genes]

    # Nothing to score when no gene is present in the graph.
    if all(gene not in graph for gene in genes):
        logger.warning('no genes mapping to graph')
        return None

    subgraphs = get_subgraphs_by_annotation(graph, annotation=annotation)

    return get_neurommsig_scores_prestratified(
        subgraphs=subgraphs,
        genes=genes,
        ora_weight=ora_weight,
        hub_weight=hub_weight,
        top_percent=top_percent,
        topology_weight=topology_weight,
        use_tqdm=use_tqdm,
        tqdm_kwargs=tqdm_kwargs,
    )
def test_overlay(self):
    """Test overlaying data in a BEL graph."""
    graph = BELGraph()

    gene_nodes = [Gene(HGNC, symbol) for symbol in ('a', 'b', 'c', 'd')]
    other_nodes = [Rna(HGNC, 'e'), Protein(HGNC, 'f')]

    for node in gene_nodes + other_nodes:
        graph.add_node_from_data(node)
    self.assertEqual(6, graph.number_of_nodes())

    label = 'dgxp'
    overlay_type_data(graph, {'a': 1, 'b': 2, 'c': -1}, Gene, HGNC, label=label, impute=0)

    # Only the gene nodes should carry the overlaid label
    for node in gene_nodes:
        self.assertIn(label, graph.nodes[node])
    for node in other_nodes:
        self.assertNotIn(label, graph.nodes[node])

    # 'd' has no entry in the data, so the imputed value (0) is expected
    for node, expected_value in zip(gene_nodes, (1, 2, -1, 0)):
        self.assertEqual(expected_value, graph.nodes[node][label])
def test_gmod_custom(self, mock):
    """Test a gene modification that uses a non-default namespace."""
    methylation = GeneModification(name='DNA Methylation', namespace='GO')
    modified_gene = Gene(namespace='HGNC', name='AKT1', variants=[methylation])
    # NOTE(review): 2 and 1 are presumably the expected node and edge counts - confirm in _help_reconstitute
    self._help_reconstitute(modified_gene, 2, 1)
def rewire_variants_to_genes(graph: BELGraph) -> None:
    """Relabel every protein node that carries variants as the equivalent gene node, in place.

    Each replacement :class:`Gene` keeps the protein's name, namespace, identifier
    and variants. Proteins without variants and all non-protein nodes are left alone.

    A use case is after running :func:`collapse_to_genes`.
    """
    protein_to_gene = {
        protein: Gene(
            name=protein.name,
            namespace=protein.namespace,
            identifier=protein.identifier,
            variants=protein.variants,
        )
        for protein in graph
        if isinstance(protein, Protein) and protein.variants
    }

    # copy=False makes networkx modify the given graph rather than return a new one
    nx.relabel_nodes(graph, protein_to_gene, copy=False)
def test_multiple_variants(self, mock):
    """Reconstitute a gene that carries two HGVS variants."""
    hgvs_variants = [Hgvs('p.Phe508del'), Hgvs('p.Phe509del')]
    variant_gene = Gene(namespace='HGNC', name='AKT1', variants=hgvs_variants)
    # NOTE(review): 2 and 1 are presumably the expected node and edge counts - confirm in _help_reconstitute
    self._help_reconstitute(variant_gene, 2, 1)
# -*- coding: utf-8 -*- """Tests for PyNPA.""" import unittest import pandas as pd from pybel import BELGraph from pybel.dsl import ComplexAbundance, Gene, Protein, Rna from pybel.io.pynpa import to_npa_dfs, to_npa_layers from pybel.struct.getters import get_tf_pairs from pybel.testing.utils import n g1 = Gene('hgnc', '1') r1 = Rna('hgnc', '1') p1 = Protein('hgnc', '1') g2 = Gene('hgnc', '2') r2 = Rna('hgnc', '2') p2 = Protein('hgnc', '2') g3 = Gene('hgnc', '3') p3 = Protein('hgnc', '3') class TestPyNPA(unittest.TestCase): """Tests for PyNPA.""" def setUp(self) -> None: """Set up a small test graph.""" self.graph = BELGraph() self.graph.add_increases(ComplexAbundance([p1, g2]), r2, citation=n(),
def get_graph() -> BELGraph:
    """Build a BEL graph from the table returned by ``df_getter``.

    For every row with a mapped trait URI, the dbSNP variant is associated with the
    EFO pathology. When the row's ``intergenic`` value is zero, each mapped gene
    symbol that resolves to an HGNC identifier is additionally linked to the variant
    and associated with the pathology.

    :return: The assembled GWAS Catalog graph.
    """
    catalog_df = df_getter()

    graph = BELGraph(name='GWAS Catalog', version='1.0.2')
    graph.namespace_pattern.update({
        'dbsnp': r'^rs\d+$',
        'efo': r'^\d{7}$',
        'hgnc': r'^((HGNC|hgnc):)?\d{1,5}$',
    })

    # Zero may arrive either as a string or as a number
    zero_values = {'0', '0.0', 0, 0.0}

    for row in tqdm(catalog_df.values, desc='Mapping GWAS Catalog to BEL'):
        (
            pmid,
            mapped_gene,
            dbsnp_id,
            context,
            intergenic,
            minus_log_p_value,
            risk_allele_frequency,
            or_or_beta,
            confidence_interval,
            mapped_trait,
            mapped_trait_uri,
        ) = row

        # Rows without a trait URI cannot be turned into a pathology node
        if pd.isna(mapped_trait_uri):
            continue

        annotations = {
            'minus_log_p_value': minus_log_p_value,
            'risk_allele_frequency': risk_allele_frequency,
            'odds_ratio_or_beta': or_or_beta,
            'confidence_interval': confidence_interval,
        }
        if pd.notna(context):
            annotations['gwascatalog_context'] = {
                part.strip() for part in context.split(';')
            }

        dbsnp_node = Gene(namespace='dbsnp', identifier=dbsnp_id)

        # The last URI segment minus its first four characters is used as the identifier
        trait_local_id = mapped_trait_uri.split('/')[-1]
        pathology_node = Pathology(
            namespace='efo',
            name=mapped_trait,
            identifier=trait_local_id[4:],
        )

        graph.add_association(
            dbsnp_node,
            pathology_node,
            citation=str(pmid),
            evidence=MODULE_NAME,
            annotations=annotations,
        )

        if intergenic not in zero_values:
            continue

        for gene_symbol in [symbol.strip() for symbol in mapped_gene.split(',')]:
            hgnc_id = hgnc_name_to_id.get(gene_symbol)
            if hgnc_id is None:
                # TODO lookup for ensembl identifiers
                # (e.g. Gene(namespace='ensembl', name=gene_symbol))
                continue

            gene_node = Gene(
                namespace='hgnc',
                identifier=hgnc_id,
                name=gene_symbol,
            )
            graph.add_has_variant(gene_node, dbsnp_node)
            graph.add_association(
                gene_node,
                pathology_node,
                citation=str(pmid),
                evidence=MODULE_NAME,
                annotations=annotations,
            )

    return graph
def test_gene_reference(self):
    """A gene given by namespace and name should stringify to its BEL term."""
    egid_gene = Gene(namespace='EGID', name='780')
    expected_bel = 'g(EGID:780)'
    self.assertEqual(expected_bel, str(egid_gene))
def get_neurommsig_bel(
    df: pd.DataFrame,
    disease: str,
    nift_values: Mapping[str, str],
) -> BELGraph:
    """Generate the NeuroMMSig BEL graph.

    Each row contributes associations between the row's HGNC gene and its SNPs
    (as ``DBSNP`` genes), between the gene and its clinical features (as ``NIFT``
    abundances), and between each clinical SNP and those clinical features.
    Clinical features that are not found in ``nift_values`` are skipped and
    reported in a warning at the end.

    :param df: The input table. It is grouped by ``PATHWAY_COLUMN_NAME``, sorted by
     ``GENE_COLUMN_NAME`` and restricted to ``columns`` before its rows are iterated.
    :param disease: The disease name, used in the graph's name and description and as
     the ``MeSHDisease`` annotation on every edge.
    :param nift_values: a dictionary of lower-cased to normal names in NIFT
    :return: The assembled BEL graph.
    """
    # Clinical features with no NIFT entry, collected for the warning at the end
    missing_features = set()
    # (as written, canonical) pairs for features whose capitalization was corrected
    fixed_caps = set()
    nift_value_originals = set(nift_values.values())

    graph = BELGraph(
        name=f'NeuroMMSigDB for {disease}',
        description=f'SNP and Clinical Features for Subgraphs in {disease}',
        authors='Daniel Domingo-Fernández, Charles Tapley Hoyt, Mufassra Naz, Aybuge Altay, Anandhi Iyappan',
        contact='*****@*****.**',
        version=time.strftime('%Y%m%d'),
    )

    # NOTE: the group key ``pathway`` is not used below; grouping only fixes the row order
    for pathway, pathway_df in df.groupby(PATHWAY_COLUMN_NAME):
        sorted_pathway_df = pathway_df.sort_values(GENE_COLUMN_NAME)
        sliced_df = sorted_pathway_df[columns].itertuples()

        # The first tuple element is the row index; ``pubmeds`` is unpacked but not used
        for _, gene, pubmeds, lit_snps, gwas_snps, ld_block_snps, clinical_features, clinical_snps in sliced_df:
            gene = ensure_quotes(gene)

            # ``or []`` guards against falsy (e.g. empty or None) SNP cells
            for snp in itt.chain(lit_snps or [], gwas_snps or [], ld_block_snps or [], clinical_snps or []):
                if not snp.strip():
                    continue
                graph.add_association(
                    Gene('HGNC', gene),
                    Gene('DBSNP', snp),
                    evidence=CANNED_EVIDENCE,
                    citation=CANNED_CITATION,
                    annotations={
                        'MeSHDisease': disease,
                    },
                )

            for clinical_feature in clinical_features or []:
                if not clinical_feature.strip():
                    continue

                # Lookup is case-insensitive: nift_values is keyed by lower-cased names
                if clinical_feature.lower() not in nift_values:
                    missing_features.add(clinical_feature)
                    continue

                if clinical_feature not in nift_value_originals:
                    fixed_caps.add((clinical_feature, nift_values[clinical_feature.lower()]))
                    clinical_feature = nift_values[clinical_feature.lower()]  # fix capitalization

                graph.add_association(
                    Gene('HGNC', gene),
                    Abundance('NIFT', clinical_feature),
                    evidence=CANNED_EVIDENCE,
                    citation=CANNED_CITATION,
                    annotations={
                        'MeSHDisease': disease,
                    },
                )

                # Every clinical SNP of the row is also associated with this feature
                if clinical_snps:
                    for clinical_snp in clinical_snps:
                        graph.add_association(
                            Gene('DBSNP', clinical_snp),
                            Abundance('NIFT', clinical_feature),
                            evidence=CANNED_EVIDENCE,
                            citation=CANNED_CITATION,
                            annotations={
                                'MeSHDisease': disease,
                            },
                        )

    # Report data-quality findings once, after the whole table has been processed
    if missing_features:
        logger.warning('Missing Features in %s', disease)
        for feature in missing_features:
            logger.warning(feature)

    if fixed_caps:
        logger.warning('Fixed capitalization')
        for broken, fixed in fixed_caps:
            logger.warning('%s -> %s', broken, fixed)

    return graph
def test_annotations_with_list(self):
    """Check that a list-valued and a single-valued annotation are tracked by the parser and stored on the only edge."""
    self.assertIsNotNone(self.parser.graph)
    self.add_default_provenance()
    self.parser.parse_lines([
        'SET TestAnnotation1 = {"A","B"}',
        'SET TestAnnotation2 = "X"',
        'g(TESTNS:1) -> g(TESTNS:2)',
    ])

    # Annotation key -> identifiers expected in the control parser's state
    expected_identifiers = (
        ('TestAnnotation1', ('A', 'B')),
        ('TestAnnotation2', ('X',)),
    )

    parsed_annotations = self.parser.control_parser.annotations
    self.assertEqual(2, len(parsed_annotations))
    for key, _ in expected_identifiers:
        self.assertIn(key, parsed_annotations)

    for key, identifiers in expected_identifiers:
        self.assertEqual(len(identifiers), len(parsed_annotations[key]))
        self.assertEqual(
            [Entity(namespace=key, identifier=identifier) for identifier in identifiers],
            parsed_annotations[key],
        )

    source = Gene(namespace='TESTNS', name='1')
    target = Gene(namespace='TESTNS', name='2')

    self.assertEqual(2, self.parser.graph.number_of_nodes())
    for node in (source, target):
        self.assertIn(node, self.graph)
    self.assertEqual(1, self.parser.graph.number_of_edges())

    expected_edge_data = {
        RELATION: INCREASES,
        EVIDENCE: test_evidence_text,
        ANNOTATIONS: {
            'TestAnnotation1': {'A': True, 'B': True},
            'TestAnnotation2': {'X': True},
        },
        CITATION: test_citation_dict,
    }
    self.assert_has_edge(source, target, only=True, **expected_edge_data)
def test_regex_match(self):
    """Parse a statement between two dbSNP genes and check that both are in the graph."""
    self.add_default_provenance()
    self.parser.parseString('g(dbSNP:rs10234) -- g(dbSNP:rs10235)')

    for snp_name in ('rs10234', 'rs10235'):
        self.assertIn(Gene('dbSNP', snp_name), self.parser.graph)
# PyBEL manager cls.pybel_manager = pybel.Manager(engine=cls.engine, session=cls.session) cls.pybel_manager.create_all() @classmethod def tearDownClass(cls): """Close the connection in the manager and deletes the temporary database.""" cls.session.close() super().tearDownClass() protein_a = Protein(namespace=HGNC, identifier='2976', name='DNMT1') protein_b = Protein(namespace=HGNC, identifier='9173', name='POLA1') gene_c = Gene(namespace=HGNC, identifier='8903', name='PGLS') pathway_a = BiologicalProcess(namespace=WIKIPATHWAYS, identifier='WP1604', name='Codeine and Morphine Metabolism') def get_enrichment_graph(): """Build a simple test graph with 2 proteins, one gene, and one pathway all contained in HGNC.""" graph = BELGraph( name='My test graph for enrichment', version='0.0.1', ) graph.add_increases(protein_a, protein_b, citation='1234', evidence='') graph.add_decreases(protein_b, gene_c, citation='1234', evidence='') graph.add_part_of(gene_c, pathway_a) return graph
def test_Hgvs(self, mock):
    """Reconstitute a gene that carries a single HGVS variant."""
    deletion = Hgvs('p.Phe508del')
    variant_gene = Gene(namespace='HGNC', name='AKT1', variants=deletion)
    # NOTE(review): 2 and 1 are presumably the expected node and edge counts - confirm in _help_reconstitute
    self._help_reconstitute(variant_gene, 2, 1)
import unittest

from pybel import BELGraph
from pybel.constants import (
    ASSOCIATION, DECREASES, DIRECTLY_INCREASES, INCREASES, POSITIVE_CORRELATION,
)
from pybel.dsl import Abundance, Gene, MicroRna, Pathology, Protein, Rna
from pybel.testing.utils import n
from pybel_tools.mutation.collapse import collapse_to_protein_interactions

# Namespace keywords shared by the fixture nodes below
HGNC = 'HGNC'
GO = 'GO'
CHEBI = 'CHEBI'

# Gene / RNA / protein triples that share a name within HGNC
g1 = Gene(namespace=HGNC, name='1')
r1 = Rna(namespace=HGNC, name='1')
p1 = Protein(namespace=HGNC, name='1')

g2 = Gene(namespace=HGNC, name='2')
r2 = Rna(namespace=HGNC, name='2')
p2 = Protein(namespace=HGNC, name='2')

g3 = Gene(namespace=HGNC, name='3')
r3 = Rna(namespace=HGNC, name='3')
p3 = Protein(namespace=HGNC, name='3')

# A gene with a microRNA product instead of an RNA / protein pair
g4 = Gene(namespace=HGNC, name='4')
m4 = MicroRna(namespace=HGNC, name='4')

# A chemical abundance outside of HGNC
a5 = Abundance(namespace=CHEBI, name='5')
def _preprocess_dtis(
    dtis: Mapping[str, List[str]],
) -> Mapping[str, List[Gene]]:
    """Wrap each drug's target names as HGNC gene nodes.

    :param dtis: A mapping from a drug to the list of its target names
    :return: A mapping from the same drugs to lists of ``Gene`` nodes in the ``HGNC`` namespace
    """
    drug_to_genes = {}
    for drug, targets in dtis.items():
        drug_to_genes[drug] = [
            Gene(namespace='HGNC', name=target)
            for target in targets
        ]
    return drug_to_genes
# RNA and gene derived from ``akt1``, plus two gene-level variants
akt1_rna = akt1.get_rna()
akt1_gene = akt1_rna.get_gene()
akt_methylated = akt1_gene.with_variants(GeneModification('Me'))
akt1_phe_508_del = akt1_gene.with_variants(Hgvs('p.Phe508del'))

# CFTR with an unspecified variant and with a specific HGVS deletion
cftr = hgnc('CFTR')
cftr_protein_unspecified_variant = cftr.with_variants(HgvsUnspecified())
cftr_protein_phe_508_del = cftr.with_variants(Hgvs('p.Phe508del'))

# Miscellaneous named entities
adenocarcinoma = Pathology(namespace='MESHD', name='Adenocarcinoma')
interleukin_23_complex = NamedComplexAbundance('GO', 'interleukin-23 complex')
oxygen_atom = Abundance(namespace='CHEBI', name='oxygen atom')
hydrogen_peroxide = Abundance(namespace='CHEBI', name='hydrogen peroxide')

# Gene fusions: one with fully enumerated ranges, one with open-ended ("?") bounds
tmprss2_gene = Gene(namespace='HGNC', name='TMPRSS2')

tmprss2_erg_gene_fusion = GeneFusion(
    partner_5p=tmprss2_gene,
    range_5p=EnumeratedFusionRange('c', 1, 79),
    partner_3p=Gene(namespace='HGNC', name='ERG'),
    range_3p=EnumeratedFusionRange('c', 312, 5034),
)

bcr_jak2_gene_fusion = GeneFusion(
    partner_5p=Gene(namespace='HGNC', name='BCR'),
    range_5p=EnumeratedFusionRange('c', '?', 1875),
    partner_3p=Gene(namespace='HGNC', name='JAK2'),
    range_3p=EnumeratedFusionRange('c', 2626, '?'),
)
def test_fusion_unspecified(self, mock):
    """Reconstitute a gene fusion that was built without any fusion ranges."""
    tmprss2 = Gene('HGNC', 'TMPRSS2')
    erg = Gene('HGNC', 'ERG')
    fusion = GeneFusion(tmprss2, erg)
    # NOTE(review): 1 and 0 are presumably the expected node and edge counts - confirm in _help_reconstitute
    self._help_reconstitute(fusion, 1, 0)