Example #1
0
    def parse(self, limit=None):
        """
        Parse each of the raw files into the graph.

        When ``self.version_num`` is None, the version is first read off the
        name of the "letter.WS###" file found in ``self.rawdir`` and handed
        to ``update_wsnum_in_files``.  In test mode (``testOnly`` or
        ``testMode``) the properties are loaded into the test graph instead
        of the main graph.

        :param limit: passed on to every per-file processor; None means
            no row limit
        :return: None
        :raises ValueError: if the version must be probed but the raw
            directory holds no "letter" file whose name carries a WS number
        """
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.version_num is None:
            import os
            logger.info("Figuring out version num for files")
            # probe the raw directory for the WSnumber on
            # the "letter.WS###" file.
            # this is the only one that we keep the version number on
            vernum = None
            for fname in os.listdir(self.rawdir):
                if re.match(r'letter', fname):
                    vernum = re.search(r'(WS\d+)', fname)
                    if vernum is not None:
                        break
            if vernum is None:
                # fail with a readable message rather than a bare
                # StopIteration / AttributeError
                raise ValueError(
                    "No letter.WS### file found in {}; cannot determine "
                    "the version number".format(self.rawdir))
            self.update_wsnum_in_files(vernum.group(1))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        self.nobnodes = True  # FIXME
        # to hold any label for a given id
        self.id_label_map = {}
        # to hold the mappings between genotype and background
        self.genotype_backgrounds = {}
        self.extrinsic_id_to_enviro_id_hash = {}
        # to hold the genes variant due to a seq alt
        self.variant_loci_genes = {}
        # to hold the parts of an environment
        self.environment_hash = {}
        self.wildtype_genotypes = []
        # stores the rnai_reagent to gene targets
        self.rnai_gene_map = {}

        self.process_gene_ids(limit)
        # self.process_gene_desc(limit)   #TEC input file is mia 2016-Mar-03
        self.process_allele_phenotype(limit)
        self.process_rnai_phenotypes(limit)
        self.process_pub_xrefs(limit)
        self.process_feature_loc(limit)
        self.process_disease_association(limit)
        # TODO add this when complete
        # self.process_gene_interaction(limit)

        logger.info("Finished parsing.")

        self.load_bindings()
        gu = GraphUtils(curie_map.get())
        gu.loadAllProperties(g)
        gu.loadObjectProperties(g, Genotype.object_properties)

        logger.info("Found %d nodes in graph", len(self.graph))
        logger.info("Found %d nodes in testgraph", len(self.testgraph))
Example #2
0
    def __init__(self,
                 graph,
                 definedby,
                 entity_id,
                 phenotype_id,
                 rel=None,
                 entity_category=None,
                 phenotype_category=None):
        """
        Build an association whose subject is an entity and whose object
        is a phenotype.

        :param graph: handed to the parent constructor
        :param definedby: handed to the parent constructor
        :param entity_id: id used as the subject of the association
        :param phenotype_id: id used as the object of the association
        :param rel: relationship id; when None the 'has phenotype' entry
            of self.globaltt is used
        :param entity_category: stored as subject_category
        :param phenotype_category: stored as object_category
        """
        super().__init__(graph, definedby)
        self.entity_id = entity_id
        self.phenotype_id = phenotype_id

        # fall back on the default relationship only when none was given
        relationship = \
            rel if rel is not None else self.globaltt['has phenotype']

        # optional qualifiers, all unset at construction time
        self.start_stage_id = None
        self.end_stage_id = None
        self.environment_id = None
        self.stage_process_id = None

        self.set_subject(entity_id)
        self.set_object(phenotype_id)
        self.set_relationship(relationship)

        self.subject_category = entity_category
        self.object_category = phenotype_category
        self.gut = GraphUtils(None)
Example #3
0
    def _get_phenotypicseries_parents(entry, g):
        """
        Extract the phenotypic series parent relationship out of the entry
        and add the entry as a subclass of each of its series.

        Series numbers are only collected when the entry's
        'phenotypicSeriesExists' value is True.  They are read from the
        entry's own 'phenotypeMapList' and from the 'phenotypeMapList'
        nested under 'geneMap'; mappings without a
        'phenotypicSeriesNumber' are ignored in both lists.

        :param entry: mapping for one entry; must hold 'mimNumber'
        :param g: the graph that the classes and subclass links are added to
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        omimid = 'OMIM:'+str(entry['mimNumber'])
        # the phenotypic series mappings
        serieslist = []
        if 'phenotypicSeriesExists' in entry \
                and entry['phenotypicSeriesExists'] is True:
            phenolists = []
            if 'phenotypeMapList' in entry:
                phenolists.append(entry['phenotypeMapList'])
            if 'geneMap' in entry and 'phenotypeMapList' in entry['geneMap']:
                phenolists.append(entry['geneMap']['phenotypeMapList'])
            for phenolist in phenolists:
                for p in phenolist:
                    # not every mapping carries a series number; guard both
                    # lists the same way instead of raising KeyError
                    if 'phenotypicSeriesNumber' in p['phenotypeMap']:
                        serieslist.append(
                            p['phenotypeMap']['phenotypicSeriesNumber'])
        # add this entry as a subclass of the series entry
        for ser in serieslist:
            series_id = 'OMIM:'+ser
            gu.addClassToGraph(g, series_id, None)
            gu.addSubclass(g, series_id, omimid)
Example #4
0
    def _parse_curated_chem_disease(self, limit):
        """
        Read the tab-delimited 'publications' static file and, for every
        non-comment row, add the chemical and disease classes, the
        publication reference (when the row has one) and an association
        between the chemical and the disease.

        :param limit: outside of test mode, stop after this many
            non-comment rows; None means no limit
        :return: None
        """
        line_counter = 0
        file_path = '/'.join((self.rawdir, self.static_files['publications']['file']))
        gu = GraphUtils(curie_map.get())
        with open(file_path, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                # catch comment lines
                if re.match('^#', ' '.join(row)):
                    continue
                line_counter += 1
                self._check_list_len(row, 10)
                (pub_id, disease_label, disease_id, disease_cat, evidence,
                 chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

                rel_id = self._get_relationship_id(evidence)
                chem_id = 'MESH:'+chem_id
                gu.addClassToGraph(self.g, chem_id, chem_label)
                gu.addClassToGraph(self.g, disease_id, None)

                pub_ids = []
                if pub_id != '':
                    pub_id = 'PMID:'+pub_id
                    r = Reference(pub_id, Reference.ref_types['journal_article'])
                    r.addRefToGraph(self.g)
                    pub_ids.append(pub_id)

                # chem_id and pub_id already carry their prefixes, so they
                # are passed as-is (prefixing again gave 'MESH:MESH:...' and
                # 'PMID:PMID:...', and 'PMID:'+None raised a TypeError).
                # NOTE(review): assumes _make_association accepts an empty
                # publication list for rows without a pub id -- confirm
                self._make_association(chem_id, disease_id, rel_id, pub_ids)

                if not self.testMode and limit is not None and line_counter >= limit:
                    break
Example #5
0
    def _process_phenotypicseries(self, limit):
        """
        Creates classes from the OMIM phenotypic series list.  These are
        grouping classes to hook the more granular OMIM diseases.

        Everything up to and including the first line that starts with
        'Phenotypic Series' is treated as header and skipped.  Each
        remaining non-blank line is split on a tab into a label and a
        series number.

        :param limit: accepted for a uniform signature; it is not applied
            here, the whole file is always read
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        logger.info("getting phenotypic series titles")
        gu = GraphUtils(curie_map.get())
        start = False
        with open('/'.join((self.rawdir, self.files['phenotypicSeries']['file']))) as f:
            for line in f:
                # there's several lines of header in the file,
                # so need to skip several lines:
                if not start:
                    if re.match('Phenotypic Series', line):
                        start = True
                    continue
                # skip blank or whitespace-only lines and, as before,
                # lines holding nothing but a single bare word
                if not line.strip() or re.match(r'\w*$', line):
                    continue
                line = line.strip()
                (ps_label, ps_num) = line.split('\t')
                omim_id = 'OMIM:'+ps_num
                gu.addClassToGraph(g, omim_id, ps_label)
Example #6
0
    def _process_collection(self, collection_id, label, page):
        """
        Describe one Coriell collection (the cell line repository) using
        the internally supplied data, in both the main and the test graph.

        For each graph the repository is added as an individual typed with
        self.terms['collection'] and carrying the given label, and the
        given page is attached to it.

        :param collection_id: appended to the 'CoriellCollection:' prefix
            to form the repository id
        :param label: label of the repository
        :param page: page attached to the repository
        :return:
        """
        # FIXME: How to devise a label for each repository?
        repo_id = 'CoriellCollection:'+collection_id
        collection_type = 'collection'
        for target_graph in (self.graph, self.testgraph):
            gu = GraphUtils(curie_map.get())
            gu.addIndividualToGraph(
                target_graph, repo_id, label, self.terms[collection_type])
            gu.addPage(target_graph, repo_id, page)
Example #7
0
    def _map_eom_terms(self, raw, limit=None):
        """
        Read the local tsv file of HP ID mappings and, for each row whose
        HP column holds an HP id, declare the morphology term equivalent
        to that HP class.

        Triples:
            <eom id> owl:equivalentClass <hp id>

        :param raw: path of the tab-delimited mapping file; its first line
            is a header and is discarded
        :param limit: stop once more than this many rows have been read;
            None means no limit
        :return:
        """
        gu = GraphUtils(curie_map.get())

        with open(raw, 'r') as f1:
            f1.readline()  # header row, not data
            for line_counter, line in enumerate(f1, start=1):
                (morphology_term_id, morphology_term_label,
                 hp_id, hp_label, notes) = line.split('\t')

                # the file writes ids with underscores; turn them into colons
                hp_id = re.sub('_', ':', hp_id)
                if not re.match(".*HP:.*", hp_id):
                    logger.warning('No matching HP term for %s', morphology_term_label)
                else:
                    # the HP term as a class, then the equivalence to it
                    gu.addClassToGraph(self.graph, hp_id, None)
                    gu.addEquivalentClass(self.graph, morphology_term_id, hp_id)

                if limit is not None and line_counter > limit:
                    break
Example #8
0
    def __init__(
            self,
            graph,
            feature_id=None,
            label=None,
            feature_type=None,
            description=None,
            feature_category=None
    ):
        """
        Record the description of a feature and bind it to a graph.

        :param graph: must be a Graph instance
        :param feature_id: stored as fid
        :param label: stored as label
        :param feature_type: stored as ftype
        :param description: stored as description
        :param feature_category: stored as feature_category
        :raises ValueError: if graph is not a Graph instance
        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))

        # graph-derived helpers and lookup tables
        self.graph = graph
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gfxutl = GraphUtils(self.curie_map)

        # what the caller told us about the feature
        self.fid = feature_id
        self.feature_category = feature_category
        self.label = label
        self.ftype = feature_type
        self.description = description

        # position and taxon are unset at construction time
        self.start = None
        self.stop = None
        self.taxon = None
Example #9
0
    def process_gene_desc(self, limit):
        """
        Read the gzipped, tab-delimited gene description file and attach
        the descriptions to each gene.

        The concise description becomes the gene's definition unless it is
        'none available'.  The provisional, automated, detailed and gene
        class descriptions are each added as a description, tagged with
        their kind in square brackets, unless they are empty, start with
        'none' or repeat the concise description.

        :param limit: outside of test mode, stop once more than this many
            non-comment rows have been read; None means no limit
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
        g = self.testgraph if self.testMode else self.graph
        gu = GraphUtils(curie_map.get())

        logger.info("Processing Gene descriptions")
        line_counter = 0
        # geno = Genotype(g)  # TODO unused
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                # comment rows do not count as lines
                if re.match(r'\#', ''.join(row)):
                    continue
                line_counter += 1
                # the first counted row is skipped
                if line_counter == 1:
                    continue
                (gene_num, public_name, molecular_name, concise_description,
                 provisional_description, detailed_description,
                 automated_description, gene_class_description) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:'+gene_num

                if concise_description != 'none available':
                    gu.addDefinition(g, gene_id, concise_description)

                # each remaining description, paired with the tag it gets
                tagged_descriptions = (
                    ('provisional', provisional_description),
                    ('automated', automated_description),
                    ('detailed', detailed_description),
                    ('gene class', gene_class_description),
                )
                for kind, text in tagged_descriptions:
                    # drop anything identical to the concise description,
                    # flagged as 'none...', or empty
                    if text == concise_description \
                            or re.match(r'none', text) or text == '':
                        continue
                    gu.addDescription(
                        g, gene_id, ' '.join((text, '['+kind+']')))

                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break
Example #10
0
 def __init__(self, graph):
     """
     Bind this object to a graph and copy the graph's lookup tables.

     :param graph: must be a Graph instance
     :raises ValueError: if graph is not a Graph instance
     """
     if not isinstance(graph, Graph):
         raise ValueError("{} is not a graph".format(graph))

     self.graph = graph
     self.model = Model(self.graph)
     # translation tables and curie map are taken straight from the graph
     self.globaltt = self.graph.globaltt
     self.globaltcid = self.graph.globaltcid
     self.curie_map = self.graph.curie_map
     self.gut = GraphUtils(self.curie_map)
Example #11
0
    def _get_gene_history(self, limit):
        """
        Walk the gene_history file, adding each discontinued gene id as a
        deprecated class that is replaced by the current gene id.  The
        discontinued symbol is also attached to the current gene as a
        synonym.

        Rows are skipped when either gene id is '-', when (in test mode)
        the gene is not among self.gene_ids, or when the taxon is not
        among self.tax_ids.

        :param limit: outside of test mode, stop once more than this many
            rows have been kept; None means no limit
        :return:
        """
        gu = GraphUtils(curie_map.get())
        g = self.testgraph if self.testMode else self.graph

        logger.info("Processing Gene records")
        kept_rows = 0
        myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for raw_line in f:
                line = raw_line.decode().strip()
                # skip comments
                if re.match('^#', line):
                    continue
                (tax_num, gene_num, discontinued_num,
                 discontinued_symbol, discontinued_date) = line.split('\t')

                # (a self.filter based taxid/geneid filter used to sit
                # here; it is disabled)

                # rows lacking either gene id carry nothing to link
                if '-' in (gene_num, discontinued_num):
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                kept_rows += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

                # both genes go in; only the old one gets its symbol as label
                gu.addClassToGraph(g, gene_id, None)
                gu.addClassToGraph(g, discontinued_gene_id, discontinued_symbol)

                # the current gene id replaces the discontinued one
                gu.addDeprecatedClass(g, discontinued_gene_id, [gene_id])

                # the old symbol lives on as a synonym of the current gene
                gu.addSynonym(g, gene_id, discontinued_symbol)

                if self.testMode or limit is None:
                    continue
                if kept_rows > limit:
                    break
Example #12
0
    def process_pub_xrefs(self, limit=None):
        """
        Read the tab-delimited publication xref file and link each
        WormBase paper to its PubMed or DOI identifier.

        Example rows:
            WBPaper00000009 pmid8805<BR>
            WBPaper00000011 doi10.1139/z78-244<BR>
            WBPaper00000012 cgc12<BR>

        Only 'pmid' and 'doi' xrefs are used.  Non-pmid xrefs containing
        brackets or whitespace, 'cgc' xrefs and anything else are skipped.

        :param limit: outside of test mode, stop at the first used row
            whose line number exceeds this; None means no limit
        :return: None
        """

        raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing publication xrefs")
        line_counter = 0
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (wb_ref, xref) = row

                if self.testMode and wb_ref not in self.test_ids['pub']:
                    continue

                ref_id = 'WormBase:'+wb_ref
                xref = re.sub(r'<BR>', '', xref)
                xref = xref.strip()
                if re.match(r'pmid', xref):
                    # strip only the leading tag
                    xref_id = 'PMID:'+re.sub(r'^pmid\s*', '', xref)
                    r = Reference(
                        xref_id, Reference.ref_types['journal_article'])
                elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                    continue
                elif re.match(r'doi', xref):
                    # anchored, so a 'doi' substring inside the identifier
                    # itself is left intact
                    xref_id = 'DOI:'+re.sub(r'^doi', '', xref)
                    r = Reference(xref_id)
                elif re.match(r'cgc', xref):
                    # TODO not sure what to do here with cgc xrefs
                    continue
                else:
                    # logger.debug("Other xrefs like %s", xref)
                    continue

                r.addRefToGraph(g)
                gu.addSameIndividual(g, ref_id, xref_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break
Example #13
0
    def _process_pathways(self, limit=None):
        """
        Add the KEGG pathway IDs, i.e. the canonical pathways as defined
        in KEGG, together with their graphical depiction, which maps 1:1
        with the identifier.

        Triples created:
        <pathway_id> is a GO:signal_transduction
        <pathway_id> rdfs:label <pathway_name>
        <gene_id> RO:involved_in <pathway_id>

        :param limit: outside of test mode, stop once more than this many
            rows have been read; None means no limit
        :return:

        """

        logger.info("Processing pathways")
        g = self.testgraph if self.testMode else self.graph
        path = Pathway(g, self.nobnodes)
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['pathway']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                (pathway_id, pathway_name) = row

                # in test mode only the configured pathways are kept
                if self.testMode and \
                        pathway_id not in self.test_ids['pathway']:
                    continue

                pathway_id = 'KEGG-'+pathway_id.strip()
                path.addPathway(pathway_id, pathway_name)

                # the kegg pathway images map 1:1 onto the ids,
                # so the image url can be derived from the id
                image_filename = re.sub(r'KEGG-path:', '', pathway_id) + '.png'
                gu.addDepiction(
                    g, pathway_id,
                    'http://www.genome.jp/kegg/pathway/map/'+image_filename)

                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break

        logger.info("Done with pathways")
Example #14
0
    def write(self, format='rdfxml', stream=None):
        """
        This convenience method will write out all of the graphs
        associated with the source: the dataset description graph plus
        either the test graph (in test mode) or the regular graph.

        With stream=None the graphs are written to files under
        self.outdir, named after self.name with an extension derived from
        the format ('xml' for rdfxml, 'ttl' for turtle, otherwise the
        format name itself).  With stream='stdout' (any case, surrounding
        whitespace ignored), or when self.name is None, the graphs are
        written without a file argument.

        :param format: serialization format name
        :param stream: None to write files, 'stdout' to write to stdout;
            anything else is logged as an error and nothing is written
        :return: None
        """
        format_to_xtn = {
            'rdfxml': 'xml', 'turtle': 'ttl'
        }
        # unknown formats use the format name itself as the extension
        extension = format_to_xtn.get(format, format)

        # both stay None when there is no name to build the paths from
        graphfile = None
        datasetfile = None
        if self.name is not None:
            # make the regular graph output file
            graphfile = '.'.join(
                ('/'.join((self.outdir, self.name)), extension))
            # make the datasetfile name
            datasetfile = '.'.join(
                ('/'.join((self.outdir, self.name+'_dataset')), extension))
        else:
            logger.warning("No output file set. Using stdout")
            stream = 'stdout'

        # start off with only the dataset descriptions
        graphs = [
            {'g': self.dataset.getGraph(), 'file': datasetfile},
        ]

        # add the other graphs to the set to write, if not in the test mode
        if self.testMode:
            graphs += [{'g': self.testgraph, 'file': self.testfile}]
        else:
            graphs += [{'g': self.graph, 'file': graphfile}]

        gu = GraphUtils(None)
        # loop through each of the graphs and print them out
        for g in graphs:
            if stream is None:
                gu.write(g['g'], format, file=g['file'])
            elif stream.lower().strip() == 'stdout':
                gu.write(g['g'], format)
            else:
                logger.error("I don't understand your stream.")
Example #15
0
    def _process_genes_kegg2ncbi(self, limit=None):
        """
        Map the KEGG human gene IDs onto the corresponding NCBI Gene IDs.

        Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>

        :param limit: outside of test mode, stop once more than this many
            rows have been read; None means no limit
        :return:

        """

        logger.info("Processing KEGG gene IDs to NCBI gene IDs")
        g = self.testgraph if self.testMode else self.graph

        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ncbi']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                (kegg_gene_id, ncbi_gene_id, link_type) = row

                # in test mode only the configured genes are kept
                if self.testMode and \
                        kegg_gene_id not in self.test_ids['genes']:
                    continue

                # swap the NCBI prefix and prepend the KEGG one
                ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
                kegg_gene_id = 'KEGG-'+kegg_gene_id

                # The KEGG gene is added again here in case this table
                # lists gene IDs that are not present in the genes table;
                # otherwise the addition is redundant.
                for class_id in (kegg_gene_id, ncbi_gene_id):
                    gu.addClassToGraph(g, class_id, None)
                gu.addEquivalentClass(g, kegg_gene_id, ncbi_gene_id)

                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break

        logger.info("Done with KEGG gene IDs to NCBI gene IDs")
Example #16
0
    def __init__(self):
        """
        Set up the 'mpd' source: its namespaces, dataset description,
        empty lookup tables and the genotype / graph helpers.
        """
        Source.__init__(self, 'mpd')
        # @N, not sure if this step is required
        self.namespaces.update(curie_map.get())
        self.stdevthreshold = 2

        self.nobnodes = True  # FIXME

        # describe this resource in the dataset object
        # @N: Note that there is no license as far as I can tell
        mpd_dataset = Dataset(
            'mpd', 'MPD', 'http://phenome.jax.org', None, None)
        self.dataset = mpd_dataset

        # TODO add a citation for mpd dataset as a whole
        self.dataset.set_citation('PMID:15619963')

        # lookup tables, all filled in later
        self.assayhash = {}
        self.idlabel_hash = {}
        # mean/zscore of each measure, by strain+sex
        self.score_means_by_measure = {}
        # mean value for each measure, by strain+sex
        self.strain_scores_by_measure = {}

        self.geno = Genotype(self.graph)
        self.gu = GraphUtils(curie_map.get())
Example #17
0
    def __init__(self):
        """
        Set up the 'ctd' source: its dataset description, the gene and
        disease test ids taken from the configuration (empty lists, with a
        warning, when they are not configured) and the graph helpers.
        """
        Source.__init__(self, 'ctd')
        self.dataset = Dataset(
            'ctd', 'CTD', 'http://ctdbase.org', None,
            'http://ctdbase.org/about/legal.jsp')

        # gene test ids, when configured
        if 'test_ids' in config.get_config() \
                and 'gene' in config.get_config()['test_ids']:
            self.test_geneids = config.get_config()['test_ids']['gene']
        else:
            logger.warning("not configured with gene test ids.")
            self.test_geneids = []

        # disease test ids, when configured
        if 'test_ids' in config.get_config() \
                and 'disease' in config.get_config()['test_ids']:
            self.test_diseaseids = config.get_config()['test_ids']['disease']
        else:
            logger.warning("not configured with disease test ids.")
            self.test_diseaseids = []

        self.gu = GraphUtils(curie_map.get())
        # self.g is an alias of the main graph
        self.g = self.graph
        self.geno = Genotype(self.g)
Example #18
0
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        Create an identifier for an OBAN-style association out of all the
        parts of the association.

        The parts, followed by any extra attributes, are joined with '+'
        and digested; parts that are None are left out of the joined
        string.  Subclasses of Assoc can submit an additional array of
        attributes that will be appended before digesting.

        Note this is equivalent to a RDF blank node

        :param definedby: The (data) resource that provided the annotation
        :param sub: the subject of the association
        :param pred: the predicate of the association
        :param obj: the object of the association
        :param attributes: optional extra items to include in the digest

        :return: 'MONARCH:' followed by the digest

        """

        parts = [definedby, sub, pred, obj]
        if attributes is not None and len(attributes) > 0:
            parts += attributes

        digest_input = '+'.join([part for part in parts if part is not None])
        return ':'.join(('MONARCH', GraphUtils.digest_id(digest_input)))
Example #19
0
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        Build the identifier of an OBAN-style association from every part
        of that association.

        All parts plus the optional attributes (which subclasses of Assoc
        may supply) are concatenated with '+', skipping any that are None,
        and the resulting string is digested.

        Note this is equivalent to a RDF blank node

        :param definedby: The (data) resource that provided the annotation
        :param sub: subject of the association
        :param pred: predicate of the association
        :param obj: object of the association
        :param attributes: optional array of further items to hash

        :return: the digest, prefixed with 'MONARCH:'

        """

        components = [definedby, sub, pred, obj]
        if attributes is not None and len(attributes) > 0:
            components.extend(attributes)

        present = [item for item in components if item is not None]
        digest = GraphUtils.digest_id('+'.join(present))

        assoc_id = ':'.join(('MONARCH', digest))
        assert assoc_id is not None
        return assoc_id
Example #20
0
    def test_therapeutic_relationship(self):
        """
        Check that the test graph holds the expected therapeutic
        association between a chemical and a disease, backed by a
        publication.
        """
        from dipper.utils.TestUtils import TestUtils
        from dipper.utils.GraphUtils import GraphUtils
        from dipper import curie_map

        # query helper over the source graph, with the test ttl loaded
        test_query = TestUtils(self.source.graph)
        test_query.load_testgraph_from_turtle(self.source)

        # Shape of the association that should be present
        # TODO can this be unified OBAN and the Annot models to be automatically generated?
        sparql_query = """
                       SELECT ?assoc ?pubmed ?disease ?chemical
                       WHERE {
                       ?assoc a Annotation: ;
                           dc:evidence OBO:ECO_0000033 ;
                           dc:source ?pubmed ;
                           :hasObject ?disease ;
                           :hasPredicate OBO:RO_0002606 ;
                           :hasSubject ?chemical .}
                       """

        # the values each SPARQL variable should be bound to
        gu = GraphUtils(curie_map.get())
        chem_id = 'MESH:D009538'
        chem_uri = gu.getNode(chem_id)
        disease_id = 'OMIM:188890'
        disease_uri = gu.getNode(disease_id)
        eco = 'ECO:0000033'
        rel_id = gu.object_properties['substance_that_treats']
        pubmed_id = 'PMID:16785264'
        pubmed_uri = gu.getNode(pubmed_id)

        # consider replacing with make_ctd_chem_disease_assoc_id()
        # NOTE(review): six positional arguments are passed here -- confirm
        # against the signature of the source's make_association_id
        assoc_id = self.source.make_association_id(
            'ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
        assoc_uri = gu.getNode(assoc_id)

        # one row that the query is expected to return
        expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

        sparql_output = test_query.query_graph(sparql_query)

        failure_message = "did not find expected association: " + assoc_id + \
            " found: " + pprint.pformat(sparql_output)
        self.assertTrue(expected_output in sparql_output, failure_message)

        logger.info("Test query data finished.")
Example #21
0
    def _process_diseases(self, limit=None):
        """
        Process the KEGG disease IDs.

        Every disease label is remembered in self.label_hash (first label
        seen wins), also for diseases that are filtered out in test mode.

        Triples created:
        <disease_id> is a class
        <disease_id> rdfs:label <disease_name>

        :param limit: outside of test mode, stop once more than this many
            rows have been read; None means no limit
        :return:

        """

        logger.info("Processing diseases")
        g = self.testgraph if self.testMode else self.graph
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['disease']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                (disease_id, disease_name) = row

                disease_id = 'KEGG-'+disease_id.strip()
                # keep the label before any test-mode filtering
                if disease_id not in self.label_hash:
                    self.label_hash[disease_id] = disease_name

                if self.testMode and\
                        disease_id not in self.test_ids['disease']:
                    continue

                # The disease goes in as a class;
                # we don't get all of these from MONDO yet see:
                # https://github.com/monarch-initiative/human-disease-ontology/issues/3
                # They are not typed as DOID:4 yet, to avoid bulking up
                # the graph unnecessarily.
                gu.addClassToGraph(g, disease_id, disease_name)

                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break

        logger.info("Done with diseases")
    def test_therapeutic_relationship(self):
        """
        Check that the ctd graph holds the expected therapeutic
        association between a chemical and a disease, backed by a
        publication.  The test fails unless the expected row is among the
        query results.
        """
        from dipper.utils.TestUtils import TestUtils
        from dipper.utils.GraphUtils import GraphUtils

        # Make testutils object and load bindings
        test_query = TestUtils(self.ctd.graph)
        self.ctd.load_bindings()

        # Expected structure
        sparql_query = """
                       SELECT ?assoc ?pubmed ?disease ?chemical
                       WHERE {
                       ?assoc a Annotation: ;
                           dc:evidence OBO:ECO_0000033 ;
                           dc:source ?pubmed ;
                           :hasObject ?disease ;
                           :hasPredicate OBO:RO_0002606 ;
                           :hasSubject ?chemical .}
                       """

        # SPARQL variables to check
        gu = GraphUtils(curie_map.get())
        chem_id = 'MESH:D009538'
        chem_uri = gu.getNode(chem_id)
        disease_id = 'OMIM:188890'
        disease_uri = gu.getNode(disease_id)
        pubmed_id = 'PMID:16785264'
        pubmed_uri = gu.getNode(pubmed_id)
        rel_id = gu.object_properties['substance_that_treats']
        eco = 'ECO:0000033'
        # TODO PYLINT  make_association_id() does not exist in CTD
        # there is "_make_association()" with a different sig

        assoc_id = self.ctd.make_association_id(
            'ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
        assoc_uri = gu.getNode(assoc_id)

        # Expected output from query
        expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

        # Query graph
        sparql_output = test_query.query_graph(sparql_query)

        # assertIn reports both operands on failure, where
        # assertTrue(x in y) would only say "False is not true"
        self.assertIn(expected_output, sparql_output)

        logger.info("Test finished.")
Example #23
0
    def _get_gene2pubmed(self, limit):
        """
        Loop through the gzipped gene2pubmed file and add a triple saying
        that a given publication is_about a gene.
        Genes are added as classes, publications as individuals.

        A row is skipped when it is blank or a comment, when its gene or
        publication column is '-', when (in test mode) its gene is not in
        self.gene_ids, or when its taxon is not in self.tax_ids.

        :param limit: outside test mode, stop once more than this many
            rows have been added; None means no limit
        :return: None
        """

        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        is_about = gu.getNode(gu.object_properties['is_about'])
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip blank lines (they cannot be unpacked) and comments
                if not line or line.startswith('#'):
                    continue
                (tax_num, gene_num, pubmed_num) = line.split('\t')

                # NOTE: an optional self.filter ('taxids' / 'geneids') check
                # used to live here; it is currently disabled.

                # '-' marks a missing value; test for it before the int()
                # conversions below so that they cannot raise ValueError
                if gene_num == '-' or pubmed_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                pubmed_id = ':'.join(('PMID', pubmed_num))

                # add the gene, in case it hasn't before
                gu.addClassToGraph(g, gene_id, None)
                # add the publication as a NamedIndividual
                # (TODO: add type publication)
                gu.addIndividualToGraph(g, pubmed_id, None, None)
                # write to the selected graph (g), so that test-mode output
                # lands in the test graph like the two additions above
                g.add((gu.getNode(pubmed_id), is_about, gu.getNode(gene_id)))

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Example #24
0
 def _add_variant_gene_relationship(self, variant_id, hgnc_symbol):
     """
     Add the gene as a class to self.graph and link the variant to it
     through Genotype.addAlleleOfGene.

     The gene id comes from self.gene_map when the symbol is known there;
     otherwise an id is made with self.make_cgd_id from the variant id
     plus the symbol, and a warning is logged.

     :param variant_id: id of the variant
     :param hgnc_symbol: gene symbol, also used as the gene's label
     :return: None
     """
     gu = GraphUtils(curie_map.get())
     geno = Genotype(self.graph)
     if hgnc_symbol in self.gene_map:
         gene_id = self.gene_map[hgnc_symbol]
     else:
         gene_id = self.make_cgd_id("{0}{1}".format(variant_id, hgnc_symbol))
         # Logger.warn is a deprecated alias; use warning() with lazy args
         logger.warning(
             "Can't map gene symbol %s to entrez ID", hgnc_symbol)
     gu.addClassToGraph(self.graph, gene_id, hgnc_symbol)
     geno.addAlleleOfGene(variant_id, gene_id)
     return
Example #25
0
    def declareAsOntology(self, graph):
        """
        Declare the output file as an ontology and attach its version
        information to the supplied graph.

        The ontology id is 'MonarchData:<name>.ttl'.  The version is the
        source name followed by the current local time
        (YYYY-MM-DD-HH-MM); it is added both as a version IRI
        ('MonarchArchive:<version>.ttl') and as version info.

        :param graph: the graph that receives the declaration
        :return: None
        """
        # e.g. <http://data.monarchinitiative.org/ttl/biogrid.ttl>
        #          a owl:Ontology ;
        #      owl:versionInfo
        #   <http://archive.monarchinitiative.org/ttl/biogrid-YYYY-MM-DD.ttl>

        gu = GraphUtils(curie_map.get())

        ontology_file_id = ''.join(('MonarchData:', self.name, '.ttl'))
        gu.addOntologyDeclaration(graph, ontology_file_id)

        # the timestamp doubles as the version
        stamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
        ontology_version = '-'.join((self.name, stamp))
        archive_url = ''.join(('MonarchArchive:', ontology_version, '.ttl'))
        gu.addOWLVersionIRI(graph, ontology_file_id, archive_url)
        gu.addOWLVersionInfo(graph, ontology_file_id, ontology_version)

        # TODO make sure this is synced with the Dataset class
Example #26
0
    def _process_pathway_disease(self, limit):
        """
        Link KEGG pathway identifiers to the diseases associated with them.
        Diseases are modelled as processes, so each row of the
        tab-separated 'pathway_disease' file becomes a triple saying that
        the pathway is causally upstream of or within the disease.

        :param limit: outside test mode, stop once more than this many
            rows have been read; None means no limit
        :return: None

        """
        logger.info("Processing KEGG pathways to disease ids")
        g = self.testgraph if self.testMode else self.graph

        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['pathway_disease']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                (disease_num, kegg_pathway_num) = row

                # in test mode only the designated test pathways are kept
                if self.testMode:
                    if kegg_pathway_num not in self.test_ids['pathway']:
                        continue

                disease_id = 'KEGG-' + disease_num
                # will look like KEGG-path:map04130 or KEGG-path:hsa04130
                pathway_id = 'KEGG-' + kegg_pathway_num

                relation = GraphUtils.object_properties[
                    'causally_upstream_of_or_within']
                gu.addTriple(g, pathway_id, relation, disease_id)

                # the row limit is never applied in test mode
                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break

        return
Example #27
0
    def __init__(self, graph):
        """
        Remember the target graph and load this class's object properties
        into it.

        :param graph: the graph that the object properties are loaded into
        """
        self.graph = graph
        self.gu = GraphUtils(curie_map.get())

        self.gu.loadProperties(
            self.graph, self.object_properties, self.gu.OBJPROP)
Example #28
0
    def test_curieprefixes(self):
        """
        Ensure that GraphUtils.getNode() can create an identifier
        for every prefix defined in the curie map.
        :return:
        """
        from dipper.utils.GraphUtils import GraphUtils

        gu = GraphUtils(self.curie_map)

        # build one made-up identifier per prefix and ask for its node
        for prefix in self.curie_map:
            node = gu.getNode(prefix + ':testme')
            failure_msg = ''.join((
                "prefix \"", prefix,
                "\" has an error...can't create graph node"))
            self.assertTrue(node is not None, failure_msg)
Example #29
0
    def parse(self, limit=None):
        """
        MPD data is delivered in four separate csv files and one xml file,
        which we process iteratively and write out as
        one large graph.

        :param limit: handed to every processing step below; when it is
            not None it is logged as the number of rows parsed per file
        :return: None
        """
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        logger.info("Parsing files...")

        # testOnly switches test mode on and targets the test graph
        if self.testOnly:
            self.testMode = True
            g = self.testgraph
            self.geno = Genotype(self.testgraph)
        else:
            g = self.graph

        self._process_straininfo(limit)
        # the following will provide us the hash-lookups
        # These must be processed in a specific order

        # mapping between assays and ontology terms
        self._process_ontology_mappings_file(limit)
        # this is the metadata about the measurements
        self._process_measurements_file(limit)
        # get all the measurements per strain
        self._process_strainmeans_file(limit)

        # The following will use the hash populated above
        # to lookup the ids when filling in the graph
        self._fill_provenance_graph(limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        gu = GraphUtils(curie_map.get())
        gu.loadAllProperties(g)
        gu.loadProperties(g, G2PAssoc.object_properties, GraphUtils.OBJPROP)
        # NOTE(review): the datatype properties are registered with OBJPROP,
        # the same kind as the object properties above -- confirm whether a
        # datatype-property kind was intended here.
        gu.loadProperties(g, G2PAssoc.datatype_properties, GraphUtils.OBJPROP)
        gu.loadProperties(
            g, G2PAssoc.annotation_properties, GraphUtils.ANNOTPROP)

        # report on the graph selected above (the test graph under testOnly)
        logger.info("Found %d nodes", len(g))
        return
Example #30
0
 def __init__(self, id, label, type, description=None):
     """
     Store the identifying attributes of this object; start and stop
     are left unset (None).

     :param id: stored as self.id
     :param label: stored as self.label
     :param type: stored as self.type
     :param description: optional, stored as self.description
     """
     self.gu = GraphUtils(curie_map.get())
     self.id, self.label, self.type = id, label, type
     self.description = description
     self.start = self.stop = None
     self.nobnodes = True  # TODO remove this before official release
Example #31
0
    def _get_pubs(self, entry, g):
        """
        Extract the publications mentioned in an entry's reference list,
        add each one to the graph, and link the entry to it with the
        'mentions' property.

        References with a 'pubmedID' become PMID journal articles.
        The others get an internal identifier built from the entry number
        and the reference number, plus whatever title, author list and
        short citation the reference provides.

        :param entry: mapping for one entry; 'mimNumber' is required and
            'referenceList' is optional
        :param g: the graph to add the references and triples to
        :return: dict mapping each referenceNumber to the publication id
            that was used for it
        """

        ref_to_pmid = {}
        du = DipperUtil()
        entry_num = entry['mimNumber']
        gu = GraphUtils(curie_map.get())
        # the same for every reference of this entry
        omim_id = 'OMIM:' + str(entry_num)

        reflist = entry['referenceList'] if 'referenceList' in entry else []
        for r in reflist:
            reference = r['reference']
            if 'pubmedID' in reference:
                pub_id = 'PMID:' + str(reference['pubmedID'])
                ref = Reference(
                    pub_id, Reference.ref_types['journal_article'])
            else:
                # make blank node for internal reference
                pub_id = '_OMIM' + str(entry_num) + 'ref' + \
                    str(reference['referenceNumber'])
                if self.nobnodes:
                    pub_id = ':' + pub_id
                ref = Reference(pub_id)
                title = author_list = source = citation = None
                if 'title' in reference:
                    title = reference['title']
                    ref.setTitle(title)
                if 'authors' in reference:
                    author_list = reference['authors']
                    ref.setAuthorList(author_list)
                    # first author only; raw string keeps the regex escapes
                    # from being read as (invalid) string escapes
                    citation = re.split(r'\.\,', author_list)[0] + ' et al'
                if 'source' in reference:
                    source = reference['source']
                # Any of the three parts may still be None.  flatten() is
                # not visible here, so drop None values defensively:
                # str.join raises TypeError on a non-string item.
                parts = [
                    part for part in du.flatten([citation, title, source])
                    if part is not None]
                if parts:
                    ref.setShortCitation('; '.join(parts))
            ref.addRefToGraph(g)
            ref_to_pmid[reference['referenceNumber']] = pub_id

            # the entry mentions the publication
            gu.addTriple(
                g, omim_id, gu.object_properties['mentions'], pub_id)

        return ref_to_pmid
Example #32
0
    def __init__(self, prov_type=None):
        """
        Set up an empty provenance record.

        :param prov_type: type of the provenance; defaults to
            'OBAN:provenance' when None
        """

        # always define the attribute: previously it was only set when no
        # prov_type was supplied, so a caller-supplied type was lost
        self.prov_type = 'OBAN:provenance' if prov_type is None else prov_type
        self.prov_id = None
        self.measurement_datums = {}
        self.agent = None
        self.reference = None  # TODO this will be papers in the future
        self.gu = GraphUtils(curie_map.get())

        return
Example #33
0
    def _process_pathway_pubmed(self, limit):
        """
        Indicate that a paper is about a pathway.  Each row of the
        tab-separated 'pathway_pubmed' file yields a journal-article
        reference for the (upper-cased) pubmed id and an is_about triple
        from that reference to the KEGG pathway.

        :param limit: outside test mode, stop once more than this many
            rows have been read; None means no limit
        :return: None
        """
        logger.info("Processing KEGG pathways to pubmed ids")
        g = self.testgraph if self.testMode else self.graph

        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['pathway_pubmed']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                (pubmed_num, kegg_pathway_num) = row

                # in test mode only the designated test pathways are kept
                if self.testMode:
                    if kegg_pathway_num not in self.test_ids['pathway']:
                        continue

                pubmed_id = pubmed_num.upper()
                # will look like KEGG-path:map04130
                kegg_id = 'KEGG-' + kegg_pathway_num

                article = Reference(
                    pubmed_id, Reference.ref_types['journal_article'])
                article.addRefToGraph(g)
                about = GraphUtils.object_properties['is_about']
                gu.addTriple(g, pubmed_id, about, kegg_id)

                # the row limit is never applied in test mode
                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break

        return
Example #34
0
    def addRefToGraph(self, g):
        """
        Add this reference to the graph as an individual of its ref_type.
        The label is the short citation, or the title when no short
        citation is set; the title is also added when there is one.

        :param g: the graph to add the reference to
        :return: None
        """

        gu = GraphUtils(curie_map.get())

        label = self.title if self.short_citation is None \
            else self.short_citation

        gu.addIndividualToGraph(g, self.ref_id, label, self.ref_type)

        if self.title is not None:
            gu.addTitle(g, self.ref_id, self.title)

        # TODO decide which property should carry the date (self.year) and
        # add one triple per author in self.author_list using
        # self.props['has_author']; neither is written to the graph yet.
Example #35
0
    def addRefToGraph(self, g):
        """
        Add this reference to the graph, keyed either by its URL or by its
        identifier.

        The label is the short citation, or the title when no short
        citation is set.  When ref_url is set the triples (title, type,
        label) are written directly on the URL node; otherwise ref_id is
        used through GraphUtils.  With neither set, an error is logged and
        nothing is added.

        :param g: the graph to add the reference to
        :return: None
        """

        gu = GraphUtils(curie_map.get())

        n = self.short_citation
        if n is None:
            n = self.title

        if self.ref_url is not None:
            ref_uri = URIRef(self.ref_url)
            # Only emit title/label when there is a value, mirroring the
            # guard in the ref_id branch, so that no literal is ever
            # created from a missing (None) value.
            if self.title is not None:
                g.add((ref_uri, DC['title'], Literal(self.title)))
            g.add((ref_uri, RDF['type'], gu.getNode(self.ref_type)))
            if n is not None:
                g.add((ref_uri, RDFS['label'], Literal(n)))
        elif self.ref_id is not None:
            gu.addIndividualToGraph(g, self.ref_id, n, self.ref_type)
            if self.title is not None:
                gu.addTitle(g, self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
        return
Example #36
0
    def _add_therapy_drug_association(self, drug_id, disease_id, therapy_status_id):
        """
        Link a drug to a disease with substance_that_treats, both as a
        plain triple and as an association, then attach the therapy
        status to that association.

        :param drug_id: Id as curie of the drug
        :param disease_id: Id as curie of the disease
        :param therapy_status_id: id of the therapy approval status;
            becomes the object of the status triple
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        treats = gu.object_properties['substance_that_treats']
        # Placeholder relationship, note this does not exist in RO
        status_relation = "RO:has_approval_status"

        # the direct drug -> disease triple
        gu.addTriple(self.graph, drug_id, treats, disease_id)

        # the same statement again, as an association with its own id
        assoc_id = self.make_cgd_id(
            "assoc{0}{1}".format(drug_id, disease_id))
        assoc = Assoc(self.name)
        assoc.set_subject(drug_id)
        assoc.set_relationship(treats)
        assoc.set_object(disease_id)
        assoc.set_association_id(assoc_id)
        assoc.add_association_to_graph(self.graph)

        # hang the approval status off the association
        gu.addTriple(
            self.graph, assoc_id, status_relation, therapy_status_id)
Example #37
0
    def test_parse(self):
        """
        Parse the source, add property axioms for the properties found in
        its graph, and write the graph out as turtle.  Nothing is done
        when no source is configured.
        """
        if self.source is None:  # don't test the abstract class
            return

        # parse() is deliberately not wrapped in try/except: letting the
        # exception propagate seems to give a better stack trace than
        # catching it, logging it and failing with "Parsing failed".
        self.source.parse()

        try:
            graph_properties = GraphUtils.get_properties_from_graph(
                self.source.graph)
            GraphUtils.add_property_axioms(
                self.source.graph, graph_properties)
            self.source.write(format='turtle')
        except Exception as write_error:
            logger.error(write_error)
            self.assertFalse(True, "Write failed")
Example #38
0
    def get_uniprot_entrez_id_map(self):
        """
        Build the mapping from UniProtKB accessions to gene ids for the
        taxa in self.tax_ids.

        A per-taxa YAML cache in self.rawdir is used when it exists and is
        newer than the big id-mapping file.  Otherwise the big file is
        fetched and scanned, and the cache is (re)written.  Only
        unambiguous ids are kept: an NCBI gene id is preferred, an ENSEMBL
        id is the fallback, and values containing ';' are skipped.

        :return: dict of uniprot accession -> 'NCBIGene:<id>' or
            'ENSEMBL:<id>'
        """
        taxon_digest = GraphUtils.digest_id(str(self.tax_ids))
        id_map = {}
        smallfile = '/'.join((self.rawdir, 'id_map_' + taxon_digest + '.yaml'))
        bigfile = '/'.join((self.rawdir, self.files['id-map']['file']))

        # if the processed smallfile exists and is newer, use it instead
        # NOTE(review): getctime(bigfile) raises if the big file is absent
        # while the cache exists -- confirm whether that case can occur.
        if os.path.isfile(smallfile) and \
                os.path.getctime(smallfile) > os.path.getctime(bigfile):
            LOG.info("Using the cheap mapping file %s", smallfile)
            with open(smallfile, 'r') as fh:
                id_map = yaml.safe_load(fh)
        else:
            LOG.info(
                "Expensive Mapping from Uniprot ids to Entrez/ENSEMBL gene ids for %s",
                str(self.tax_ids))
            self.fetch_from_url(self.files['id-map']['url'], bigfile)
            with gzip.open(bigfile, 'rb') as csvfile:
                csv.field_size_limit(sys.maxsize)
                filereader = csv.reader(  # warning this file is over 10GB unzipped
                    io.TextIOWrapper(csvfile, newline=""),
                    delimiter='\t',
                    quotechar='\"')
                for row in filereader:
                    # unpacking all 22 columns also checks the row width
                    (uniprotkb_ac, uniprotkb_id, geneid, refseq, gi, pdb, go,
                     uniref100, unifref90, uniref50, uniparc, pir, ncbitaxon,
                     mim, unigene, pubmed, embl, embl_cds, ensembl,
                     ensembl_trs, ensembl_pro, other_pubmed) = row
                    if str(ncbitaxon) not in self.tax_ids:
                        continue
                    genid = geneid.strip()
                    # test the stripped value: a whitespace-only field must
                    # not produce a bare 'NCBIGene:' entry
                    if genid != '' and ';' not in genid:
                        id_map[uniprotkb_ac.strip()] = 'NCBIGene:' + genid
                    elif ensembl.strip() != '' and ';' not in ensembl:
                        id_map[uniprotkb_ac.strip(
                        )] = 'ENSEMBL:' + ensembl.strip()

            LOG.info("Writing id_map out as %s", smallfile)
            with open(smallfile, 'w') as fh:
                yaml.dump(id_map, fh)

        LOG.info("Acquired %i 1:1 uniprot to [entrez|ensembl] mappings",
                 len(id_map))

        return id_map
Example #39
0
class Genotype():
    """
    These methods provide convenient methods to
    add items related to a genotype and it's parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.

    """
    def __init__(self, graph):
        """
        Bind this helper to a graph and cache the graph's lookup tables.

        :param graph: the Graph that every add* method writes to
        :raises ValueError: when graph is not a Graph instance
        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)
        # shortcuts to tables carried by the graph
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gut = GraphUtils(self.curie_map)

    def addGenotype(self,
                    genotype_id,
                    genotype_label,
                    genotype_type=None,
                    genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:

        """
        if genotype_type is None:
            genotype_type = self.globaltt['intrinsic genotype']

        self.model.addIndividualToGraph(genotype_id, genotype_label,
                                        genotype_type, genotype_description)

    def addAllele(self,
                  allele_id,
                  allele_label,
                  allele_type=None,
                  allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional,
        recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:

        """

        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.globaltt['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(allele_id, allele_label, allele_type,
                                        allele_description)

    def addGene(self,
                gene_id,
                gene_label=None,
                gene_type=None,
                gene_description=None):
        ''' genes are classes '''
        if gene_type is None:
            gene_type = self.globaltt['gene']
        self.model.addClassToGraph(gene_id, gene_label, gene_type,
                                   gene_description)

    def addConstruct(self,
                     construct_id,
                     construct_label,
                     construct_type=None,
                     construct_description=None,
                     construct_category=None,
                     construct_type_category=None):
        """
        :param construct_id:
        :param construct_label:
        :param construct_type:
        :param construct_description:
        :param construct_category: a biolink category CURIE for construct_id
        :param construct_type_category: a biolink category CURIE for construct_type
        :return:

        """
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    construct_type=self.construct_base_type
        self.model.addIndividualToGraph(
            construct_id,
            construct_label,
            construct_type,
            construct_description,
            ind_category=construct_category,
            ind_type_category=construct_type_category)

    def addDerivesFrom(self,
                       child_id,
                       parent_id,
                       child_category=None,
                       parent_category=None):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and it's parent genotype.  Adding the parent and child to
        the graph should happen outside of this function call to ensure graph
        integrity.
        :param child_id:
        :param parent_id:
        :return:

        """

        self.graph.addTriple(child_id,
                             self.globaltt['derives_from'],
                             parent_id,
                             subject_category=child_category,
                             object_category=parent_category)

    def addSequenceDerivesFrom(self,
                               child_id,
                               parent_id,
                               child_category=None,
                               parent_category=None):
        self.graph.addTriple(child_id,
                             self.globaltt['sequence_derives_from'],
                             parent_id,
                             subject_category=child_category,
                             object_category=parent_category)

        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:is_allele_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt["is_allele_of"]
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:has_affected_feature.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt['has_affected_feature']
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addGeneProduct(self,
                       sequence_id,
                       product_id,
                       product_label=None,
                       product_type=None,
                       sequence_category=None,
                       product_category=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein
        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :param sequence_category: bl category CURIE for seq_id [blv.terms.Gene].value
        :param product_category: biolink category CURIE for product_id
        :return:

        """
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(product_id,
                                            product_label,
                                            product_type,
                                            ind_category=product_category)
        self.graph.addTriple(sequence_id,
                             self.globaltt['has gene product'],
                             product_id,
                             subject_category=sequence_category,
                             object_category=product_category)

    def addPolypeptide(self,
                       polypeptide_id,
                       polypeptide_label=None,
                       transcript_id=None,
                       polypeptide_type=None):
        """
        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:

        """
        if polypeptide_type is None:
            polypeptide_type = self.globaltt['polypeptide']
        self.model.addIndividualToGraph(polypeptide_id, polypeptide_label,
                                        polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(transcript_id, self.globaltt['translates_to'],
                                 polypeptide_id)

    def addPartsToVSLC(self,
                       vslc_id,
                       allele1_id,
                       allele2_id,
                       zygosity_id=None,
                       allele1_rel=None,
                       allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are traditionally added, you can add any
        node (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:

        """

        # vslc has parts allele1/allele2

        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.globaltt['homozygous']
            else:
                zygosity_id = self.globaltt['heterozygous']

        if zygosity_id is not None:
            self.graph.addTriple(vslc_id, self.globaltt['has_zygosity'],
                                 zygosity_id)

    def addVSLCtoParent(self,
                        vslc_id,
                        parent_id,
                        part_category=None,
                        parent_category=None):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :param part_category: a biolink category CURIE for part
        :param parent_category: a biolink category CURIE for parent
        :return:
        """

        self.addParts(vslc_id,
                      parent_id,
                      self.globaltt['has_variant_part'],
                      part_category=part_category,
                      parent_category=parent_category)

    def addParts(self,
                 part_id,
                 parent_id,
                 part_relationship=None,
                 part_category=None,
                 parent_category=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :param part_category: a biolink vocab curie for part_id
        :param parent_category: a biolink vocab curie for parent_id
        :return:

        """
        if part_relationship is None:
            part_relationship = self.globaltt['has_part']
        # Fail loudly if parent or child identifiers are None
        if parent_id is None:
            raise TypeError('Attempt to pass None as parent')
        elif part_id is None:
            raise TypeError('Attempt to pass None as child')
        elif part_relationship is None:
            part_relationship = self.globaltt['has_part']

        self.graph.addTriple(parent_id,
                             part_relationship,
                             part_id,
                             subject_category=parent_category,
                             object_category=part_category)

    def addSequenceAlteration(self,
                              sa_id,
                              sa_label,
                              sa_type=None,
                              sa_description=None):

        if sa_type is None:
            sa_type = self.globaltt['sequence_alteration']

        self.model.addIndividualToGraph(sa_id, sa_label, sa_type,
                                        sa_description)

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        self.addParts(sa_id, vl_id, self.globaltt['has_variant_part'])

    def addGenomicBackground(self,
                             background_id,
                             background_label,
                             background_type=None,
                             background_description=None):
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addIndividualToGraph(background_id, background_label,
                                        background_type,
                                        background_description)

    def addGenomicBackgroundToGenotype(
            self, background_id, genotype_id, background_type=None):
        """
        Type a genomic background and attach it to a genotype.

        First asserts the type of background_id, then delegates to addParts
        with the 'has_reference_part' relationship, passing background_id
        first and genotype_id second.

        :param background_id: identifier of the genomic background
        :param genotype_id: identifier of the genotype
        :param background_type: type to assert for background_id; when None,
            the global translation table entry 'genomic_background' is used
        :return: None

        """
        resolved_type = (
            self.globaltt['genomic_background'] if background_type is None
            else background_type)
        self.model.addType(background_id, resolved_type)

        reference_part = self.globaltt['has_reference_part']
        self.addParts(background_id, genotype_id, reference_part)

    def addTaxon(self, taxon_id, genopart_id, genopart_category=None):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).

        Emits the single triple
        <genopart_id> in taxon <taxon_id>

        :param taxon_id: identifier of the taxon (object of the triple)
        :param genopart_id: identifier of the genotype part (subject)
        :param genopart_category: a biolink term for genopart_id; forwarded
            to the graph as the subject category of the triple
        :return: None

        """
        # forward the category instead of silently dropping it, so callers
        # that supply one actually get it recorded for the subject
        self.graph.addTriple(genopart_id,
                             self.globaltt['in taxon'],
                             taxon_id,
                             subject_category=genopart_category)

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Attach a gene-targeting reagent (for example a morphant reagent) to
        a genotype, which is assumed to be an extrinsic genotype.

        Emits the single triple
        <genotype_id> has_variant_part <reagent_id>

        :param reagent_id: identifier of the reagent
        :param genotype_id: identifier of the genotype
        :return: None

        """
        variant_part = self.globaltt['has_variant_part']
        self.graph.addTriple(genotype_id, variant_part, reagent_id)

    def addGeneTargetingReagent(
            self,
            reagent_id,
            reagent_label,
            reagent_type,
            gene_id,
            description=None,
            reagent_category=None):
        """
        Add a gene-targeting reagent as an individual and link it to the
        gene it targets.

        Emits the individual for reagent_id, followed by
        <reagent_id> targets_gene <gene_id>

        :param reagent_id: identifier of the reagent
        :param reagent_label: label for the reagent
        :param reagent_type: type for the reagent individual
        :param gene_id: identifier of the targeted gene
        :param description: optional description of the reagent
        :param reagent_category: category passed on as ind_category
        :return: None

        """

        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(
            reagent_id, reagent_label, reagent_type, description,
            ind_category=reagent_category)

        targets_gene = self.globaltt['targets_gene']
        self.graph.addTriple(reagent_id, targets_gene, gene_id)

    def addReagentTargetedGene(
            self,
            reagent_id,
            gene_id,
            targeted_gene_id=None,
            targeted_gene_label=None,
            description=None,
            reagent_category=None):
        """
        Create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).

        When no instance id is supplied, one is built from the gene and
        reagent ids: '_' + gene_id + '-' + reagent_id, with every colon
        removed.  The instance is typed as 'reagent_targeted_gene'.

        <targeted_gene_id> a reagent_targeted_gene
        rdfs:label targeted_gene_label
        dc:description description
        <targeted_gene_id> is_expression_variant_of <gene_id>  (gene_id set)
        <targeted_gene_id> is_targeted_by <reagent_id>

        :param reagent_id: identifier of the reagent
        :param gene_id: identifier of the gene; must not be None when
            targeted_gene_id is None, since it is used to build the id
        :param targeted_gene_id: optional identifier for the instance
        :param targeted_gene_label: optional label for the instance
        :param description: optional description for the instance
        :param reagent_category: category passed on as ind_category
        :return: None

        """

        # akin to a variant locus
        # is this some sort of pseudo bnode?
        if targeted_gene_id is None:
            pseudo_id = ''.join(('_', gene_id, '-', reagent_id))
            targeted_gene_id = pseudo_id.replace(":", "")

        targeted_type = self.globaltt['reagent_targeted_gene']
        self.model.addIndividualToGraph(
            targeted_gene_id, targeted_gene_label, targeted_type,
            description, ind_category=reagent_category)

        if gene_id is not None:
            self.graph.addTriple(
                targeted_gene_id,
                self.globaltt['is_expression_variant_of'],
                gene_id)

        self.graph.addTriple(
            targeted_gene_id, self.globaltt['is_targeted_by'], reagent_id)

    def addTargetedGeneSubregion(
            self, tgs_id, tgs_label, tgs_type=None, tgs_description=None):
        """
        Add a targeted gene subregion to the graph as an individual.

        :param tgs_id: identifier of the subregion
        :param tgs_label: label for the subregion
        :param tgs_type: type for the individual; when None, the global
            translation table entry 'targeted_gene_subregion' is used
        :param tgs_description: optional description, passed through
        :return: None

        """
        resolved_type = (
            self.globaltt['targeted_gene_subregion'] if tgs_type is None
            else tgs_type)
        self.model.addIndividualToGraph(
            tgs_id, tgs_label, resolved_type, tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Record that a population has the given member.

        Emits the single triple
        <population_id> has_member_with_allelotype <member_id>

        :param member_id: identifier of the member (object of the triple)
        :param population_id: identifier of the population (subject)
        :return: None

        """
        has_member = self.globaltt['has_member_with_allelotype']
        self.graph.addTriple(population_id, has_member, member_id)

    def addTargetedGeneComplement(
            self, tgc_id, tgc_label, tgc_type=None, tgc_description=None):
        """
        Add a targeted gene complement to the graph as an individual.

        :param tgc_id: identifier of the complement
        :param tgc_label: label for the complement
        :param tgc_type: type for the individual; when None, the global
            translation table entry 'targeted_gene_complement' is used
        :param tgc_description: optional description, passed through
        :return: None

        """
        resolved_type = tgc_type
        if resolved_type is None:
            resolved_type = self.globaltt['targeted_gene_complement']
        self.model.addIndividualToGraph(
            tgc_id, tgc_label, resolved_type, tgc_description)

    def addGenome(self, taxon_num, taxon_label=None, genome_id=None):
        """
        Add a genome class for a taxon, labelled '<taxon label> genome'.

        The taxon label is resolved against self.globaltcid under the key
        'NCBITaxon:' + taxon_num:
        * no label supplied and the key is known: the table value is used
        * no label supplied and the key is unknown: a warning is logged and
          taxon_num itself becomes the label
        * a label is supplied that differs from the table value: two
          warnings are logged and the supplied label is kept

        :param taxon_num: taxon number as a string, without the prefix
        :param taxon_label: optional label for the taxon
        :param genome_id: optional genome identifier; when None it is
            derived with makeGenomeID(taxon_num)
        :return: None

        """
        taxon_curie = ''.join(('NCBITaxon:', taxon_num))
        known_labels = self.globaltcid

        if taxon_label is not None:
            if (taxon_curie in known_labels
                    and taxon_label != known_labels[taxon_curie]):
                table_label = known_labels[taxon_curie]
                logging.warning(''.join((
                    '"', table_label, '" may need updating from "',
                    taxon_label, '" in global translation table')))
                logging.warning(''.join((
                    '"', taxon_label, '": " ', table_label, '"',
                    ' may need to be added to a local translation table')))
        elif taxon_curie in known_labels:
            taxon_label = known_labels[taxon_curie]
        else:
            logging.warning(''.join((
                'Add ', taxon_curie, ' to global translation table')))
            taxon_label = taxon_num

        genome_label = ''.join((taxon_label, ' genome'))

        if genome_id is None:
            genome_id = self.makeGenomeID(taxon_num)

        self.model.addClassToGraph(
            genome_id, genome_label, self.globaltt['genome'])

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add a genome build as a reference genome individual, type it with
        the genome of its taxon, and attach the taxon to it.

        :param build_id: identifier of the genome build
        :param build_label: label for the genome build
        :param taxon_id: taxon identifier; a value that starts with a digit
            is prefixed with 'NCBITaxon:' before the taxon is attached
            (the genome id is derived from the value as passed in)
        :return: None

        """
        build_category = blv.terms['GenomeBuild']
        genome_id = self.makeGenomeID(taxon_id)
        # the category must go in by keyword: the fourth positional slot of
        # addIndividualToGraph is the description, so passing it positionally
        # would record the category CURIE as a description literal
        self.model.addIndividualToGraph(build_id,
                                        build_label,
                                        self.globaltt['reference_genome'],
                                        ind_category=build_category)
        self.model.addType(build_id,
                           genome_id,
                           subject_category=build_category)
        if re.match(r'[0-9]+', taxon_id):
            taxon_id = 'NCBITaxon:' + taxon_id

        self.addTaxon(taxon_id,
                      build_id,
                      genopart_category=build_category)

    @staticmethod
    def makeGenomeID(taxon_id):
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as yet another BNODE?
        # should never be called if a real genome iri exists
        # should create the opaque bode and label together
        # genome_id = re.sub(r'.*\:', '_:', taxon_id) + 'genome'
        genome_id = '_:' + taxon_id + 'genome'
        return genome_id

    def addChromosome(
            self,
            chrom,
            tax_id,
            tax_label=None,
            build_id=None,
            build_label=None):
        """
        Add a chromosome class for a taxon and, when a build is given, a
        build-specific chromosome individual.

        The taxon-level chromosome is added as a class with the
        'chromosome' term as its class type, and the taxon is attached to
        the genome id derived from tax_id.  If build_id is supplied, a
        build-specific chromosome is added as an individual typed by the
        taxon-level chromosome class, and membership between it and the
        build is asserted in both directions.

        :param chrom: chromosome name or number
        :param tax_id: taxon identifier the chromosome belongs to
        :param tax_label: optional taxon label used in the chromosome label
        :param build_id: optional identifier of a genome build
        :param build_label: optional build label; defaults to build_id
        :return: None

        """
        family = Family(self.graph)

        # taxon-level chromosome class
        chr_id = makeChromID(str(chrom), tax_id)
        label_args = (chrom,) if tax_label is None else (chrom, tax_label)
        chr_label = makeChromLabel(*label_args)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(
            chr_id, chr_label, self.globaltt['chromosome'])
        # add the taxon to the genome
        self.addTaxon(tax_id, genome_id)

        if build_id is None:
            return

        # build-specific chromosome, an instance of the taxon-level class
        chrinbuild_id = makeChromID(chrom, build_id)
        if build_label is None:
            build_label = build_id
        chrinbuild_label = makeChromLabel(chrom, build_label)
        self.model.addIndividualToGraph(
            chrinbuild_id, chrinbuild_label, chr_id)

        # membership is recorded from both sides
        family.addMember(
            build_id, chrinbuild_id,
            group_category=blv.terms['GenomeBuild'])
        family.addMemberOf(
            chrinbuild_id, build_id,
            group_category=blv.terms['GenomeBuild'])

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic (taxon-level) chromosome as a class.

        Every occurrence of 'NCBITaxon:' is removed from taxon_id before
        the chromosome id is built with the 'CHR' prefix.

        :param chrom_num: chromosome name or number
        :param taxon_id: taxon identifier, with or without 'NCBITaxon:'
        :param taxon_label: taxon label used in the chromosome label
        :return: None

        """
        bare_taxon = re.sub('NCBITaxon:', '', taxon_id)
        class_id = makeChromID(chrom_num, bare_taxon, 'CHR')
        class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(
            class_id, class_label, self.globaltt['chromosome'])

    def addChromosomeInstance(
            self, chr_num, reference_id, reference_label, chr_type=None):
        """
        Add the supplied chromosome as an instance within the given
        reference, and record membership between the two in both directions.

        :param chr_num: chromosome name or number (stringified before use)
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label: label of the reference, used in the
            chromosome label
        :param chr_type: optional class this chromosome is also an instance
            of, typically a genome-specific chromosome
        :return: None

        """
        family = Family(self.graph)
        chrom_name = str(chr_num)
        chr_id = makeChromID(chrom_name, reference_id, 'MONARCH')
        chr_label = makeChromLabel(chrom_name, reference_label)

        self.model.addIndividualToGraph(
            chr_id, chr_label, self.globaltt['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # membership is recorded from both sides
        family.addMember(
            reference_id, chr_id, group_category=blv.terms['GenomeBuild'])
        family.addMemberOf(chr_id, reference_id)

    @staticmethod
    def make_variant_locus_label(gene_label, allele_label):
        if gene_label is None:
            gene_label = ''
        label = gene_label.strip() + '<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) label in monarch-style:
        the variant locus label of the first allele, a '/', then the variant
        locus label of the second allele (empty when there is none).

        :param gene_label: gene label shared by both alleles
        :param allele1_label: label of the first allele
        :param allele2_label: label of the second allele, or None
        :return: the label, or None (after logging an error) when all three
            arguments are None

        """
        supplied = (gene_label, allele1_label, allele2_label)
        if all(part is None for part in supplied):
            LOG.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = (
            '' if allele2_label is None
            else self.make_variant_locus_label(gene_label, allele2_label))

        return '/'.join((top, bottom))

    def make_experimental_model_with_genotype(
            self, genotype_id, genotype_label, taxon_id, taxon_label):
        """
        Create an individual of the given taxon that carries a genotype.

        The individual's id is a blank node: '_:' followed by the digest of
        '<taxon_id>-with-<genotype_id>' with all colons removed.  Its label
        is '<genotype_label> <taxon_label>' and its type is taxon_id.

        :param genotype_id: identifier of the genotype
        :param genotype_label: label of the genotype
        :param taxon_id: identifier of the taxon, used as the type
        :param taxon_label: label of the taxon
        :return: the identifier of the created individual

        """
        raw_key = (taxon_id + '-with-' + genotype_id).replace(':', '')
        # bnode
        animal_id = '_:' + self.gut.digest_id(raw_key)

        animal_label = genotype_label + ' ' + taxon_label
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)

        has_genotype = self.globaltt['has_genotype']
        self.graph.addTriple(animal_id, has_genotype, genotype_id)
        return animal_id
Example #40
0
class G2PAssoc(Assoc):
    """
    A specific association class for defining Genotype-to-Phenotype
    relationships. This assumes that a graph is created outside of this class,
    and nodes get added.
    By default, an association will assume the "has_phenotype" relationship,
    unless otherwise specified.
    Note that genotypes are expected to be
    created and defined outside of this association,
    most likely by calling methods in the Genotype() class.

    """
    def __init__(self,
                 graph,
                 definedby,
                 entity_id,
                 phenotype_id,
                 rel=None,
                 entity_category=None,
                 phenotype_category=None):
        """
        :param graph: graph the association is added to
        :param definedby: source that defines the association
        :param entity_id: subject of the association
        :param phenotype_id: object of the association
        :param rel: relationship to use; when None, the global translation
            table entry 'has phenotype' is used
        :param entity_category: stored as self.subject_category
        :param phenotype_category: stored as self.object_category
        """
        super().__init__(graph, definedby)
        self.entity_id = entity_id
        self.phenotype_id = phenotype_id

        if rel is None:
            rel = self.globaltt['has phenotype']

        # optional qualifiers; filled in by set_stage / set_environment
        self.start_stage_id = None
        self.end_stage_id = None
        self.environment_id = None
        self.stage_process_id = None

        self.set_subject(entity_id)
        self.set_object(phenotype_id)
        self.set_relationship(rel)

        self.subject_category = entity_category
        self.object_category = phenotype_category
        self.gut = GraphUtils(None)

    def set_stage(self, start_stage_id, end_stage_id):
        """
        Record the start and/or end stage of the association.
        A value that is None or blank leaves the current setting untouched.

        :param start_stage_id: identifier of the start stage
        :param end_stage_id: identifier of the end stage
        """
        if start_stage_id is not None and start_stage_id.strip() != '':
            self.start_stage_id = start_stage_id
        if end_stage_id is not None and end_stage_id.strip() != '':
            self.end_stage_id = end_stage_id

    def set_environment(self, environment_id):
        """
        Record the environment of the association.
        A value that is None or blank leaves the current setting untouched.

        :param environment_id: identifier of the environment
        """
        if environment_id is not None and environment_id.strip() != '':
            self.environment_id = environment_id

    def set_association_id(self, assoc_id=None):
        """
        Set the association identifier.

        :param assoc_id: identifier to use; when None, one is generated
            with make_g2p_id()
        """
        if assoc_id is None:
            self.assoc_id = self.make_g2p_id()
        else:
            self.assoc_id = assoc_id

    def add_association_to_graph(self,
                                 entity_category=None,
                                 phenotype_category=None):
        """
        Overrides  Association by including bnode support

        The reified relationship between a genotype (or any genotype part)
        and a phenotype is decorated with some provenance information.
        This makes the assumption that
        both the genotype and phenotype are classes.

        After the base association is added, a blank-node developmental
        process is created when a start or end stage is set, and attached to
        the association as a qualifier; a set environment is attached as a
        qualifier as well.

        :param entity_category: a biolink category CURIE for self.sub
            (accepted for interface compatibility; not used by this method)
        :param phenotype_category: a biolink category CURIE for self.obj
            (accepted for interface compatibility; not used by this method)
        :return:
        """
        # explicit base call, kept as-is so the base implementation always
        # runs exactly as before
        Assoc.add_association_to_graph(self)

        # make a blank stage
        # Either bound on its own is enough; test both against None
        # explicitly rather than relying on truthiness and precedence.
        has_start = self.start_stage_id is not None
        has_end = self.end_stage_id is not None
        if has_start or has_end:
            # the string form (including a literal 'None' for a missing
            # bound) feeds the digest, so it is left unchanged to keep
            # identifiers stable
            stage_process_str = '-'.join(
                (str(self.start_stage_id), str(self.end_stage_id)))
            stage_process_id = ':'.join(  # bnode
                ('_', self.gut.digest_id(stage_process_str)))
            self.model.addIndividualToGraph(
                stage_process_id, None, self.globaltt['developmental_process'])
            self.graph.addTriple(stage_process_id, self.globaltt['label'],
                                 stage_process_str)

            # only assert the bounds that are actually known, so that no
            # triple is emitted with None as its object
            if has_start:
                self.graph.addTriple(stage_process_id,
                                     self.globaltt['starts during'],
                                     self.start_stage_id)
            if has_end:
                self.graph.addTriple(stage_process_id,
                                     self.globaltt['ends during'],
                                     self.end_stage_id)

            self.stage_process_id = stage_process_id
            self.graph.addTriple(self.assoc_id, self.globaltt['has_qualifier'],
                                 self.stage_process_id)

        if self.environment_id is not None:
            self.graph.addTriple(self.assoc_id, self.globaltt['has_qualifier'],
                                 self.environment_id)

    def make_g2p_id(self):
        """
        Make an association id for phenotypic associations that is defined by:
        source of association +
        (Annot subject) +
        relationship +
        phenotype/disease +
        environment +
        start stage +
        end stage

        :return: the association identifier produced by make_association_id

        """

        attributes = [
            self.environment_id, self.start_stage_id, self.end_stage_id
        ]
        assoc_id = self.make_association_id(self.definedby, self.entity_id,
                                            self.rel, self.phenotype_id,
                                            attributes)

        return assoc_id
Example #41
0
class Pathway():
    """
    Convenience methods for placing genes, gene products and other
    components into pathways.
    """
    def __init__(self, graph):
        """
        :param graph: the Graph instance triples are written to
        :raises ValueError: if graph is not a Graph
        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))

        self.graph = graph
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gut = GraphUtils(self.curie_map)

    def addPathway(self,
                   pathway_id,
                   pathway_label,
                   pathway_type=None,
                   pathway_description=None):
        """
        Add a pathway as a class, and make it a subclass of 'pathway'.

        :param pathway_id: identifier of the pathway
        :param pathway_label: label for the pathway
        :param pathway_type: class type for the pathway; when None, the
            global translation table entry 'cellular_process' is used
        :param pathway_description: optional description, passed through
        :return: None
        """
        resolved_type = (
            self.globaltt['cellular_process'] if pathway_type is None
            else pathway_type)
        self.model.addClassToGraph(
            pathway_id, pathway_label, resolved_type, pathway_description)
        self.model.addSubClass(pathway_id, self.globaltt['pathway'])

    def addGeneToPathway(self, gene_id, pathway_id):
        """
        Place a gene in a pathway by way of an intermediate 'gene product'
        blank node.

        gene_id has_gene_product _gene_product
        _gene_product involved_in pathway_id

        The blank node id is '_:' plus the digest of the gene id (colons
        removed) followed by 'product'.

        :param gene_id: identifier of the gene
        :param pathway_id: identifier of the pathway
        :return: None
        """
        # bnode
        digest = self.gut.digest_id(gene_id.replace(':', '') + 'product')
        gene_product = '_:' + digest

        self.model.addIndividualToGraph(
            gene_product, None, self.globaltt['gene_product'])
        # the blank node is labelled with the pathway id
        self.graph.addTriple(
            gene_product, self.globaltt['label'], pathway_id)

        self.graph.addTriple(
            gene_id, self.globaltt['has gene product'], gene_product)
        self.addComponentToPathway(gene_product, pathway_id)

    def addComponentToPathway(self, component_id, pathway_id):
        """
        Record that a component is directly involved in a pathway.
        When a transforming event happens to the component first,
        addGeneToPathway should be used instead.

        component_id involved_in pathway_id

        :param component_id: identifier of the component
        :param pathway_id: identifier of the pathway
        :return: None
        """
        involved_in = self.globaltt['involved in']
        self.graph.addTriple(component_id, involved_in, pathway_id)
Example #42
0
    def _add_variant_protein_variant_assoc_to_graph(self, row):
        """
        Generates relationships between variants and protein variants
        given a row of data.

        The variant is added as a sequence alteration together with its
        transcript and any mapped polypeptides.  For missense variants
        whose amino acid change parses as 'p.<ref><position><alt>', the
        reference and altered amino acids are attached to the variant as
        literals and a one-position feature is added on the amino acid
        sequence.

        :param row: row of data whose first 11 columns are consumed, see
                    add_variant_info_to_graph() docstring for expected
                    structure
        :return None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)

        # columns that are not used below are still unpacked so that a row
        # of the wrong width fails here
        (variant_key, variant_label, amino_acid_variant, _aa_position,
         transcript_id, _transcript_priority, protein_variant_type,
         _functional_impact, _stop_gain_loss, transcript_gene,
         _protein_variant_source) = row[0:11]

        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

        transcript_curie = self._make_transcript_curie(transcript_id)
        uniprot_curie = self._make_uniprot_polypeptide_curie(transcript_id)
        ncbi_protein_curie = self._make_ncbi_polypeptide_curie(transcript_id)
        has_ncbi = ncbi_protein_curie is not None
        has_uniprot = uniprot_curie is not None

        geno.addGenotype(
            variant_id, variant_label, geno.genoparts['sequence_alteration'])

        # Make fake amino acid sequence in case we
        # can't get a CCDS to Uniprot and/or NCBI Protein mapping
        aa_seq_id = self.make_cgd_id(
            'transcript{0}'.format(amino_acid_variant))

        # Add Transcript:
        geno.addTranscript(
            variant_id, transcript_curie, transcript_id,
            geno.genoparts['transcript'])

        # Add polypeptide
        if has_ncbi:
            geno.addPolypeptide(
                ncbi_protein_curie,
                self.transcript_xrefs['RefSeq'][transcript_id],
                transcript_curie)
            aa_seq_id = ncbi_protein_curie
        if has_uniprot:
            geno.addPolypeptide(
                uniprot_curie,
                self.transcript_xrefs['UniProt'][transcript_id],
                transcript_curie)
            # Overrides ncbi_protein_curie,
            # but we set them as equal individuals below
            aa_seq_id = uniprot_curie

        if has_ncbi and has_uniprot:
            gu.addSameIndividual(self.graph, ncbi_protein_curie, uniprot_curie)
        else:
            # NOTE(review): this also replaces a real polypeptide id when
            # only one of the two mappings exists -- confirm that is intended
            aa_seq_id = self.make_cgd_id(
                'transcript{0}'.format(amino_acid_variant))

        is_missense = bool(
            protein_variant_type == 'nonsynonymous - missense'
            or re.search(r'missense', variant_label))
        if is_missense:
            geno.addGenotype(
                variant_id, variant_label, geno.genoparts['missense_variant'])

        # Get gene ID from gene map
        self._add_variant_gene_relationship(variant_id, transcript_gene)

        amino_acid_regex = re.compile(
            r'^p\.([A-Za-z]{1,3})(\d+)([A-Za-z]{1,3})$')

        # the amino acid change is only parsed for missense variants
        match = None
        if is_missense:
            match = amino_acid_regex.match(amino_acid_variant.rstrip())

        if match is None:
            logger.debug("Could not parse amino acid information"
                         " from {0} variant:"
                         " {1} type: {2}".format(amino_acid_variant,
                                                 variant_label,
                                                 protein_variant_type))
            return

        # Add amino acid change to model
        ref_amino_acid, position, altered_amino_acid = match.groups()
        gu.addTriple(
            self.graph, variant_id,
            geno.properties['reference_amino_acid'],
            ref_amino_acid, True)
        gu.addTriple(
            self.graph, variant_id,
            geno.properties['results_in_amino_acid_change'],
            altered_amino_acid, True)

        aa_region_id = ":_{0}{1}{2}Region".format(
            position, position, aa_seq_id)
        self._add_feature_with_coords(
            variant_id, position, position, aa_seq_id, aa_region_id)

        return
Example #43
0
class Model():
    """
    Utility class to add common triples to a graph
    (subClassOf, type, label, sameAs)
    """
    def __init__(self, graph):
        """
        :param graph: the Graph instance triples are written to; its
            translation tables and curie map are copied onto this object
        :raises ValueError: if graph is not a Graph
        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))

        self.graph = graph
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        self.gut = GraphUtils(None)  # self.curie_map

    def addTriple(
            self,
            subject_id,
            predicate_id,
            obj,
            object_is_literal=False,
            literal_type=None,
            subject_category=None,
            object_category=None):
        """
        Pass a triple straight through to the underlying graph.

        :param subject_id: subject of the triple
        :param predicate_id: predicate of the triple
        :param obj: object of the triple
        :param object_is_literal: forwarded to the graph as given
        :param literal_type: forwarded to the graph as given
        :param subject_category: category for the subject
        :param object_category: category for the object
        :return: None
        """
        categories = {
            'subject_category': subject_category,
            'object_category': object_category,
        }
        self.graph.addTriple(
            subject_id, predicate_id, obj, object_is_literal, literal_type,
            **categories)

    def addType(
            self,
            subject_id,
            subject_type,
            subject_category=None,
            subject_type_category=None):
        """
        Assert the type of a subject.

        subject_id type subject_type

        :param subject_id: subject of the triple
        :param subject_type: the type (object of the triple)
        :param subject_category: category for subject_id
        :param subject_type_category: category for subject_type
        :return: None
        """
        type_predicate = self.globaltt['type']
        self.graph.addTriple(
            subject_id, type_predicate, subject_type,
            subject_category=subject_category,
            object_category=subject_type_category)

    def addLabel(self, subject_id, label, subject_category=None):
        """
        Add a literal label to a subject.

        Nothing is added when the label is None or the empty string, the
        same rule the other label-writing methods of this class apply.

        :param subject_id: subject to label
        :param label: label text
        :param subject_category: category for subject_id
        :return: None
        """
        # a missing label must not become a literal built from None
        if label is None or label == '':
            # TODO warn
            return
        self.graph.addTriple(subject_id,
                             self.globaltt['label'],
                             label,
                             object_is_literal=True,
                             subject_category=subject_category)

    def addClassToGraph(
            self,
            class_id,
            label=None,
            class_type=None,
            description=None,
            class_category=None,
            class_type_category=None):
        """
        Add a class to the graph.

        The following triples are written, in this order:
        *(class_id, type, class) always
        *(class_id, label, literal(label)) when a non-empty label is given
        *(class_id, subclass_of, class_type) when a class type is given
        *(class_id, description, literal(description)) when a non-empty
            description is given

        :param class_id: identifier of the class; must not be None
        :param label: optional label
        :param class_type: optional parent class
        :param description: optional description
        :param class_category: a biolink category CURIE for class
        :param class_type_category: a biolink category CURIE for class type
        :return: None
        :raises ValueError: if class_id is None

        """
        if class_id is None:
            raise ValueError("class_id is None")

        has_label = not (label is None or label == '')
        has_description = not (description is None or description == '')

        self.graph.addTriple(
            class_id, self.globaltt['type'], self.globaltt['class'],
            subject_category=class_category)

        if has_label:
            self.graph.addTriple(
                class_id, self.globaltt['label'], label,
                object_is_literal=True)

        if class_type is not None:
            self.graph.addTriple(
                class_id, self.globaltt['subclass_of'], class_type,
                object_category=class_type_category)

        if has_description:
            self.graph.addTriple(
                class_id, self.globaltt['description'], description,
                object_is_literal=True)

    def addIndividualToGraph(
            self,
            ind_id,
            label,
            ind_type=None,
            description=None,
            ind_category=None,
            ind_type_category=None):
        """
        Add an individual to the graph.

        The following triples are written, in this order:
        *(ind_id, label, literal(label)) when a non-empty label is given
        *(ind_id, type, ind_type), or (ind_id, type, named_individual)
            when no type is given
        *(ind_id, description, literal(description)) when a non-empty
            description is given

        :param ind_id: identifier of the individual
        :param label: label, may be None or empty
        :param ind_type: optional type of the individual
        :param description: optional description
        :param ind_category: category for ind_id
        :param ind_type_category: category for ind_type
        :return: None
        """
        has_label = not (label is None or label == '')
        has_description = not (description is None or description == '')

        if has_label:
            self.graph.addTriple(
                ind_id, self.globaltt['label'], label,
                object_is_literal=True)

        if ind_type is None:
            # untyped individuals fall back to the generic individual type
            self.graph.addTriple(
                ind_id, self.globaltt['type'],
                self.globaltt['named_individual'],
                subject_category=ind_category)
        else:
            self.graph.addTriple(
                ind_id, self.globaltt['type'], ind_type,
                object_is_literal=False,
                subject_category=ind_category,
                object_category=ind_type_category)

        if has_description:
            self.graph.addTriple(
                ind_id, self.globaltt['description'], description,
                object_is_literal=True)

    def addEquivalentClass(
            self, sub, obj, subject_category=None, object_category=None):
        """
        Assert that two classes are equivalent.

        sub equivalent_class obj

        :param sub: subject class
        :param obj: object class
        :param subject_category: category for sub
        :param object_category: category for obj
        :return: None
        """
        equivalent_class = self.globaltt['equivalent_class']
        self.graph.addTriple(
            sub, equivalent_class, obj,
            object_is_literal=False,
            subject_category=subject_category,
            object_category=object_category)

    def addSameIndividual(
            self, sub, obj, subject_category=None, object_category=None):
        """
        Assert that two individuals are the same.

        sub same_as obj

        :param sub: subject individual
        :param obj: object individual
        :param subject_category: category for sub
        :param object_category: category for obj
        :return: None
        """
        same_as = self.globaltt['same_as']
        self.graph.addTriple(
            sub, same_as, obj,
            object_is_literal=False,
            subject_category=subject_category,
            object_category=object_category)

    def addOWLPropertyClassRestriction(
            self,
            class_id,
            property_id,
            property_value,
            class_category=None,
            property_id_category=None,
            property_value_category=None):
        """
        Make class_id a subclass of a restriction on a property.

        A blank node holds the restriction; its id is '_:' plus the digest
        of '<property_id>-<property_value>', and that same string is written
        as its label.

        _bnode type restriction
        _bnode on_property property_id
        _bnode some_values_from property_value
        class_id subclass_of _bnode

        :param class_id: class that receives the restriction
        :param property_id: the restricted property
        :param property_value: the value the property is restricted to
        :param class_category: category for class_id
        :param property_id_category: category for property_id
        :param property_value_category: category for property_value
        :return: None
        """
        # make a bnode to hold the property restrictions
        restriction_key = property_id + '-' + property_value
        restriction_node = '_:' + self.gut.digest_id(restriction_key)

        self.graph.addTriple(
            restriction_node, self.globaltt['type'],
            self.globaltt['restriction'])
        self.graph.addTriple(
            restriction_node, self.globaltt['label'], restriction_key)
        self.graph.addTriple(
            restriction_node, self.globaltt['on_property'], property_id,
            object_category=property_id_category)
        self.graph.addTriple(
            restriction_node, self.globaltt['some_values_from'],
            property_value,
            object_category=property_value_category)
        self.graph.addTriple(
            class_id, self.globaltt['subclass_of'], restriction_node,
            object_is_literal=False,
            subject_category=class_category)

    def addPerson(self, person_id, person_label=None):
        """
        Type ``person_id`` as a person and, if given, attach its label.

        :param person_id: id of the person node
        :param person_label: optional label; skipped when None or ''
        :return: None
        """
        terms = self.globaltt
        self.graph.addTriple(person_id, terms['type'], terms['person'])

        if person_label is None or person_label == '':
            return
        self.graph.addTriple(
            person_id, terms['label'], person_label, object_is_literal=True)

    def addDeprecatedClass(self,
                           old_id,
                           new_ids=None,
                           old_id_category=None,
                           new_ids_category=None):
        """
        Flag ``old_id`` as a deprecated class.

        The id is first typed as a class; the deprecation flag and any
        replacement links are then added by ``_addReplacementIds``
        (one replacement -> "term replaced by", several -> "consider").
        :param old_id: str - the class id to deprecate
        :param new_ids: list - the replacement class id(s).  Not required.
        :param old_id_category: a biolink category CURIE for old id
        :param new_ids_category: a biolink category CURIE for new ids
        :return: None

        """
        terms = self.globaltt
        self.graph.addTriple(
            old_id, terms['type'], terms['class'],
            subject_category=old_id_category)

        self._addReplacementIds(
            old_id, new_ids, new_ids_category=new_ids_category)

    def _addReplacementIds(self, old_id, new_ids, new_ids_category=None):
        """
        Mark ``old_id`` as deprecated and link it to its replacement(s).

        Always adds the boolean ``deprecated`` literal.  Then:
        exactly one replacement -> one "term replaced by" triple;
        more than one -> one "consider" triple per replacement;
        None or an empty collection -> nothing further.
        :param old_id: the id being deprecated
        :param new_ids: a single id (str) or an iterable of ids, or None
        :param new_ids_category: category applied to every replacement id
        :return: None
        """
        self.graph.addTriple(old_id,
                             self.globaltt['deprecated'],
                             True,
                             object_is_literal=True,
                             literal_type='xsd:boolean')

        if new_ids is None:
            return

        # A bare string is one replacement, not a sequence of characters.
        # Normalising here lets the string form receive the category too,
        # and lets non-indexable collections (e.g. sets) be handled.
        if isinstance(new_ids, str):
            replacements = [new_ids]
        else:
            replacements = list(new_ids)

        if len(replacements) == 1:
            self.graph.addTriple(old_id,
                                 self.globaltt['term replaced by'],
                                 replacements[0],
                                 object_category=new_ids_category)
        else:
            for new_id in replacements:
                self.graph.addTriple(old_id,
                                     self.globaltt['consider'],
                                     new_id,
                                     object_category=new_ids_category)

    def addDeprecatedIndividual(self,
                                old_id,
                                new_ids=None,
                                old_id_category=None,
                                new_id_category=None):
        """
        Flag ``old_id`` as a deprecated individual.

        The id is first typed as a named individual; the deprecation flag
        and any replacement links are then added by ``_addReplacementIds``
        (one replacement -> "term replaced by", several -> "consider").
        :param old_id: the individual id to deprecate
        :param new_ids: the individual id list that is the replacement(s)
                        of the old individual.  Not required.
        :param old_id_category: a biolink category CURIE for old id
        :param new_id_category: a biolink category CURIE for new ids
        :return: None

        """
        terms = self.globaltt
        self.graph.addTriple(
            old_id, terms['type'], terms['named_individual'],
            subject_category=old_id_category)

        self._addReplacementIds(
            old_id, new_ids, new_ids_category=new_id_category)

    def addSubClass(self,
                    child_id,
                    parent_id,
                    child_category=None,
                    parent_category=None):
        """
        Record that ``child_id`` is a subclass of ``parent_id``.

        Adds a single triple to ``self.graph`` whose predicate is
        ``self.globaltt['subclass_of']``; the object is a node, not a literal.
        :param child_id: id of the subclass
        :param parent_id: id of the superclass
        :param child_category: category for child_id, forwarded to addTriple
        :param parent_category: category for parent_id, forwarded to addTriple
        :return: None
        """
        predicate = self.globaltt['subclass_of']
        triple_options = {
            'object_is_literal': False,
            'subject_category': child_category,
            'object_category': parent_category,
        }
        self.graph.addTriple(child_id, predicate, parent_id, **triple_options)

    def addSynonym(self,
                   class_id,
                   synonym,
                   synonym_type=None,
                   class_category=None):
        """
        Attach ``synonym`` to ``class_id`` as a literal.

        When no synonym type is given the predicate defaults to
        ``self.globaltt['has_exact_synonym']``.  A synonym that is None
        or the empty string is silently ignored.
        :param class_id: class id
        :param synonym: the literal synonym label
        :param synonym_type: the CURIE of the synonym type (not the URI)
        :param class_category: biolink category CURIE for class_id
        (the synonym itself is a literal, so it takes no category)
        :return: None

        """
        predicate = (self.globaltt['has_exact_synonym']
                     if synonym_type is None else synonym_type)

        if synonym is None or synonym == '':
            # todo warn
            return

        self.graph.addTriple(class_id,
                             predicate,
                             synonym,
                             object_is_literal=True,
                             subject_category=class_category)

    def addDefinition(self, class_id, definition, class_category=None):
        """
        Attach ``definition`` to ``class_id`` as a literal.

        :param class_id: id of the class being defined
        :param definition: the definition text, added unmodified
        :param class_category: category for class_id, forwarded to addTriple
        :return: None
        """
        predicate = self.globaltt['definition']
        self.graph.addTriple(
            class_id, predicate, definition,
            object_is_literal=True, subject_category=class_category)

    def addXref(self,
                class_id,
                xref_id,
                xref_as_literal=False,
                class_category=None,
                xref_category=None):
        """
        Add a database cross reference from ``class_id`` to ``xref_id``.

        :param class_id: id of the subject
        :param xref_id: the cross-referenced id
        :param xref_as_literal: when True the xref is written as a literal
        :param class_category: category for class_id, forwarded to addTriple
        :param xref_category: category for xref_id, forwarded to addTriple
        :return: None
        """
        predicate = self.globaltt['database_cross_reference']
        triple_options = {
            'object_is_literal': xref_as_literal,
            'subject_category': class_category,
            'object_category': xref_category,
        }
        self.graph.addTriple(class_id, predicate, xref_id, **triple_options)

    def addDepiction(self, subject_id, image_url):
        """
        Link ``subject_id`` to ``image_url`` with the depiction predicate.

        :param subject_id: id of the thing depicted
        :param image_url: location of the image
        :return: None
        """
        predicate = self.globaltt['depiction']
        self.graph.addTriple(subject_id, predicate, image_url)

    def addComment(self, subject_id, comment, subject_category=None):
        """
        Attach ``comment`` to ``subject_id`` as a literal.

        Surrounding whitespace is stripped from the comment first.
        :param subject_id: id of the node being commented on
        :param comment: str - the comment text
        :param subject_category: category for subject_id
        :return: None
        """
        predicate = self.globaltt['comment']
        trimmed = comment.strip()
        self.graph.addTriple(
            subject_id, predicate, trimmed,
            object_is_literal=True, subject_category=subject_category)

    def addDescription(self, subject_id, description, subject_category=None):
        """
        Attach ``description`` to ``subject_id`` as a literal.

        Surrounding whitespace is stripped first.  Nothing is added when
        the description is None or is empty after stripping.
        :param subject_id: id of the node being described
        :param description: str or None - the description text
        :param subject_category: category for subject_id
        :return: None
        """
        # test for None before calling .strip(); stripping first would
        # raise AttributeError and make the None check unreachable
        if description is None:
            return

        description = description.strip()
        if description == '':
            # todo: warn; but only when we can say where it came from
            return

        self.graph.addTriple(subject_id,
                             self.globaltt['description'],
                             description,
                             object_is_literal=True,
                             subject_category=subject_category)

    def addOntologyDeclaration(self, ontology_id):
        """
        Type ``ontology_id`` as an ontology.

        :param ontology_id: id of the ontology
        :return: None
        """
        terms = self.globaltt
        self.graph.addTriple(ontology_id, terms['type'], terms['ontology'])

    def addOWLVersionIRI(self, ontology_id, version_iri):
        """
        Attach a version IRI to ``ontology_id``.

        The IRI is added as a node, not as a literal.
        :param ontology_id: id of the ontology
        :param version_iri: the version IRI
        :return: None
        """
        predicate = self.globaltt['version_iri']
        self.graph.addTriple(
            ontology_id, predicate, version_iri, object_is_literal=False)

    def addOWLVersionInfo(self, ontology_id, version_info):
        """
        Attach version info to ``ontology_id`` as a literal.

        :param ontology_id: id of the ontology
        :param version_info: the version information
        :return: None
        """
        predicate = self.globaltt['version_info']
        self.graph.addTriple(
            ontology_id, predicate, version_info, object_is_literal=True)

    def makeLeader(self, node_id):
        """
        Annotate the given ```node_id``` as the clique_leader
        by adding a boolean true literal.
        This is a monarchism.
        :param node_id: id of the node to flag
        :return: None
        """
        predicate = self.globaltt['clique_leader']
        literal_options = {
            'object_is_literal': True,
            'literal_type': 'xsd:boolean',
        }
        self.graph.addTriple(node_id, predicate, True, **literal_options)

    def addBlankNodeAnnotation(self, node_id):
        """
        Annotate the given ```node_id``` as a pseudo blank node
        by adding a boolean true ``is_anonymous`` literal.
        This is a monarchism.
        :param node_id: id of the node to flag
        :return: None
        """
        predicate = self.globaltt['is_anonymous']
        literal_options = {
            'object_is_literal': True,
            'literal_type': 'xsd:boolean',
        }
        self.graph.addTriple(node_id, predicate, True, **literal_options)

    def _addSexSpecificity(self, subject_id, sex, subject_category=None):
        """
        Qualify a subject (eg an association node) as sex-specific.

        In our modeling this qualifies a triple: for example, a
        genotype-to-phenotype association that holds only for one sex
        (see MGI, IMPC).

        The caller supplies the ontology term for the sex (eg PATO);
        the object is always categorised as ``blv.terms['BiologicalSex']``.

        Note this class is probably not the right home for this
        method; it lives here until a better one is found.
        :param subject_id: id of the node being qualified
        :param sex: ontology term id for the sex
        :param subject_category: a biolink category CURIE for subject_id
        :return: None
        """
        # NOTE: the key spelling 'has_sex_specificty' is what the
        # translation table is looked up with; do not "correct" it here
        predicate = self.globaltt['has_sex_specificty']
        categories = {
            'subject_category': subject_category,
            'object_category': blv.terms['BiologicalSex'],
        }
        self.graph.addTriple(subject_id, predicate, sex, **categories)
Example #44
0
    def write(self, fmt='turtle', stream=None, write_metadata_in_main_graph=False):
        """
        Convenience method that writes out the graphs held by this source.

        The dataset graph is always written as turtle to
        ``<outdir>/<name>_dataset.ttl``; when ``self.test_mode`` is set the
        test graph is written as turtle to ``self.testfile``; the main
        graph is written last in the requested format.
        Unless stream='stdout' is supplied (or ``self.name`` is None, which
        forces stdout) the main graph goes to ``<outdir>/<name>.<ext>``.

        :param fmt: serialization format of the main graph; known formats
                    are mapped to a file extension, anything else is used
                    as the extension verbatim
        :param stream: None to write to file, or 'stdout'; any other value
                       logs an error and the main graph is not written
        :param write_metadata_in_main_graph: when True the dataset graph is
                       added to ``self.graph`` before the main graph is written
        :return: None

        """
        # serialization format -> file extension
        extension_for = {
            'rdfxml': 'xml',
            'turtle': 'ttl',
            'nt': 'nt',         # ntriples
            'nquads': 'nq',
            'n3': 'n3'          # notation3
        }

        main_path = None
        if self.name is None:
            LOG.warning("No output file set. Using stdout")
            stream = 'stdout'
        else:
            # make the regular graph output file
            stem = '/'.join((self.outdir, self.name))
            main_path = '.'.join((stem, extension_for.get(fmt, fmt)))
            LOG.info("Setting outfile to %s", main_path)

            # make the dataset_file name, always format as turtle
            self.datasetfile = '/'.join(
                (self.outdir, self.name + '_dataset.ttl'))
            LOG.info("Setting dataset file to %s", self.datasetfile)

        writer = GraphUtils(None)

        # the  _dataset description is always turtle
        writer.write(self.dataset.get_graph(), 'turtle', filename=self.datasetfile)

        if self.test_mode:
            # unless we stop hardcoding, the test dataset is always turtle
            LOG.info("Setting testfile to %s", self.testfile)
            writer.write(self.testgraph, 'turtle', filename=self.testfile)

        if write_metadata_in_main_graph:
            self.graph = self.graph + self.dataset.get_graph()

        # print graph out
        if stream is not None:
            if stream.lower().strip() != 'stdout':
                LOG.error("I don't understand our stream.")
                return
            # a filename of None is what is handed over for stdout
            main_path = None

        writer.write(self.graph, fmt, filename=main_path)
Example #45
0
def main():
    """
    Command line entry point.

    Parses the command line, optionally runs a SPARQL query against the
    requested sources and exits, otherwise runs the initial test suite and
    then, for every requested source: fetch, run the source's tests,
    parse, add property axioms and write the graph.
    Exits the interpreter on an invalid ``--dest_fmt`` or after ``--query``.
    :return: None
    """
    # TODO this should be generated by looking in the dipper/sources directory
    # or read from a sources/dataset/config yaml or dir of yamls
    source_to_class_map = {
        # 'facebase_alpha': 'FaceBase_alpha',
        'hpoa': 'HPOAnnotations',   # ~3 min
        'zfin': 'ZFIN',
        'omim': 'OMIM',  # full file takes ~15 min, due to required throttling
        'biogrid': 'BioGrid',  # interactions file takes <10 minutes
        'mgi': 'MGI',
        'impc': 'IMPC',
        # Panther takes ~1hr to map 7 species-worth of associations
        'panther': 'Panther',
        'oma': 'OMA',
        'ncbigene': 'NCBIGene',  # takes about 4 minutes to process 2 species
        'ucscbands': 'UCSCBands',
        'ctd': 'CTD',
        'genereviews': 'GeneReviews',
        'eom': 'EOM',  # Takes about 5 seconds.
        'coriell': 'Coriell',
        # 'clinvar': 'ClinVar',                   # takes ~ half hour
        # 'clinvarxml_alpha': 'ClinVarXML_alpha', # takes ~ five minutes
        'monochrom': 'Monochrom',
        'kegg': 'KEGG',
        'animalqtldb': 'AnimalQTLdb',
        'ensembl': 'Ensembl',
        'hgnc': 'HGNC',
        'orphanet': 'Orphanet',
        'omia': 'OMIA',
        'flybase': 'FlyBase',
        'mmrrc': 'MMRRC',
        'wormbase': 'WormBase',
        'mpd': 'MPD',
        'gwascatalog': 'GWASCatalog',
        'monarch': 'Monarch',
        'go': 'GeneOntology',
        'reactome': 'Reactome',
        'udp': 'UDP',
        'mgi-slim': 'MGISlim',
        'zfin-slim': 'ZFINSlim',
        'bgee': 'Bgee',
        'mydrug': 'MyDrug',
        'stringdb': 'StringDB',
        'rgd': 'RGD',
        'sgd': 'SGD'
    }

    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion Pipeline for SciGraph',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-g', '--graph', type=str, default="rdf_graph",
        help='graph type: rdf_graph, streamed_graph')
    parser.add_argument(
        '-s', '--sources', type=str, required=True,
        help='comma separated list of sources')
    parser.add_argument(
        '-l', '--limit', type=int,
        help='limit number of rows')
    parser.add_argument(
        '--parse_only', action='store_true',
        help='parse files without writing')
    parser.add_argument(
        '--fetch_only', action='store_true',
        help='fetch sources without parsing')
    parser.add_argument('-f', '--force', action='store_true',
                        help='force re-download of files')
    parser.add_argument(
        '--no_verify',
        help='ignore the verification step', action='store_true')
    parser.add_argument('--query', help='enter in a sparql query', type=str)
    parser.add_argument(
        '-q', '--quiet',
        help='turn off info logging', action="store_true")
    parser.add_argument(
        '--debug', help='turn on debug logging', action="store_true")
    parser.add_argument(
        '--skip_tests', help='skip any testing', action="store_true")

    # Blank Nodes can't be visualized in Protege, default to Skolemizing them
    parser.add_argument(
        '-b', '--use_bnodes',
        help="use blank nodes instead of skolemizing", action="store_true",
        default=False)

    # TODO this should live in a global data file
    #   and the same filter be applied to all sources
    parser.add_argument(
        '-t', '--taxon', type=str,
        help='Add a taxon constraint on a source. Enter 1+ NCBITaxon numbers,'
        ' comma delimited\n'
        'Implemented taxa per source\n'
        'NCBIGene: 9606,10090,7955\n'
        'Panther: 9606,10090,10116,7227,7955,6239,8355\n'
        'BioGrid: 9606,10090,10116,7227,7955,6239,8355\n'
        'UCSCBands: 9606\n'
        'GO: 9606,10090,10116,7227,7955,6239,9615,9823,9031,9913')
    parser.add_argument(
        '-o', '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")

    parser.add_argument(
        '--dest_fmt',
        help='serialization format: [turtle], nt, nquads, rdfxml, n3, raw',
        type=str)

    parser.add_argument(
        '--version', '-v',
        help='version of source',
        type=str)

    args = parser.parse_args()
    tax_ids = None
    if args.taxon is not None:
        tax_ids = [int(t) for t in args.taxon.split(',')]

    # source classes that accept a tax_ids constraint
    # (these are class names, not taxa)
    taxa_supported = [
        'Panther', 'NCBIGene', 'BioGrid', 'UCSCBands', 'GeneOntology',
        'Bgee', 'Ensembl', 'StringDB', 'OMA']

    formats_supported = [
        'turtle', 'ttl',
        'ntriples', 'nt',
        'nquads', 'nq',
        'rdfxml', 'xml',
        'notation3', 'n3',
        'raw']

    if args.quiet:
        logging.basicConfig(level=logging.ERROR)
    else:
        if args.debug:
            logging.basicConfig(level=logging.DEBUG)
        else:
            logging.basicConfig(level=logging.INFO)

    if not args.use_bnodes:
        logger.info("Will Skolemize Blank Nodes")

    if args.query is not None:
        test_query = TestUtils()
        for source in args.sources.split(','):
            source = source.lower()
            # the map holds class *names*; the class itself is looked up
            # on the imported module (calling the name string is a TypeError)
            src = source_to_class_map[source]

            # import source lib
            module = "dipper.sources.{0}".format(src)
            imported_module = importlib.import_module(module)
            source_class = getattr(imported_module, src)

            test_query.check_query_syntax(args.query, source_class)
            test_query.load_graph_from_turtle(source_class)

        print(test_query.query_graph(args.query, True))
        exit(0)

    # run initial tests
    if (args.no_verify or args.skip_tests) is not True:
        unittest.TextTestRunner(verbosity=2).run(test_suite)

    # set serializer: fold the accepted aliases onto one canonical name
    if args.dest_fmt is not None:
        if args.dest_fmt in formats_supported:
            if args.dest_fmt == 'ttl':
                args.dest_fmt = 'turtle'
            elif args.dest_fmt == 'ntriples':
                args.dest_fmt = 'nt'
            elif args.dest_fmt == 'nq':
                args.dest_fmt = 'nquads'
            elif args.dest_fmt == 'xml':
                args.dest_fmt = 'rdfxml'
            elif args.dest_fmt == 'notation3':
                args.dest_fmt = 'n3'
        else:
            logger.error(
                "You have specified an invalid serializer: %s", args.dest_fmt)

            exit(0)
    else:
        args.dest_fmt = 'turtle'

    # iterate through all the sources
    for source in args.sources.split(','):
        logger.info("\n******* %s *******", source)
        source = source.lower()
        src = source_to_class_map[source]

        # import source lib
        module = "dipper.sources.{0}".format(src)
        imported_module = importlib.import_module(module)
        source_class = getattr(imported_module, src)
        mysource = None
        # arg factory
        source_args = dict(
            graph_type=args.graph
        )
        source_args['are_bnodes_skolemized'] = not args.use_bnodes
        if src in taxa_supported:
            source_args['tax_ids'] = tax_ids
        if args.version:
            source_args['version'] = args.version

        mysource = source_class(**source_args)
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed intervals
        if args.parse_only is False:
            start_fetch = time.perf_counter()
            mysource.fetch(args.force)
            end_fetch = time.perf_counter()
            logger.info("Fetching time: %d sec", end_fetch-start_fetch)

        mysource.settestonly(args.test_only)

        # run tests first
        if (args.no_verify or args.skip_tests) is not True:
            suite = mysource.getTestSuite()
            if suite is None:
                logger.warning(
                    "No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            logger.info("Skipping Tests for source: %s", source)

        if args.test_only is False and args.fetch_only is False:
            start_parse = time.perf_counter()
            mysource.parse(args.limit)
            end_parse = time.perf_counter()
            logger.info("Parsing time: %d sec", end_parse-start_parse)
            if args.graph == 'rdf_graph':
                logger.info("Found %d nodes", len(mysource.graph))

                # Add property axioms
                start_axiom_exp = time.perf_counter()
                logger.info("Adding property axioms")

                properties = GraphUtils.get_properties_from_graph(mysource.graph)
                GraphUtils.add_property_axioms(mysource.graph, properties)
                end_axiom_exp = time.perf_counter()
                logger.info("Property axioms added: %d sec",
                            end_axiom_exp-start_axiom_exp)

                start_write = time.perf_counter()
                mysource.write(fmt=args.dest_fmt)
                end_write = time.perf_counter()
                logger.info("Writing time: %d sec", end_write-start_write)
        # if args.no_verify is not True:

        #    status = mysource.verify()
        #    if status is not True:
        #        logger.error(
        #            'Source %s did not pass verification tests.', source)
        #        exit(1)
        # else:
        #    logger.info('skipping verification step')
        logger.info('***** Finished with %s *****', source)
    # load configuration parameters
    # for example, keys

    logger.info("All done.")
Example #46
0
def main():
    """
    Command line entry point.

    Parses the command line, runs the initial test suite unless told to
    skip it, validates the requested sources and then, for every source:
    fetch, parse, add property axioms, write the graph and finally run
    the source's own tests.
    When any requested source is unknown (including the default '?')
    the known sources are listed and the interpreter exits.
    :return: None
    """
    # TODO this should be generated by looking in the dipper/sources directory
    source_to_class_map = {
        # 'facebase_alpha': 'FaceBase_alpha',
        'hpoa': 'HPOAnnotations',
        'zfin': 'ZFIN',
        'omim': 'OMIM',
        'biogrid': 'BioGrid',
        'mgi': 'MGI',
        'impc': 'IMPC',
        'panther': 'Panther',
        'ncbigene': 'NCBIGene',
        'ucscbands': 'UCSCBands',
        'ctd': 'CTD',
        'genereviews': 'GeneReviews',
        'eom': 'EOM',
        'coriell': 'Coriell',
        # 'clinvar': 'ClinVar',      # needs integrating here
        'monochrom': 'Monochrom',
        'kegg': 'KEGG',
        'animalqtldb': 'AnimalQTLdb',
        'ensembl': 'Ensembl',
        'hgnc': 'HGNC',
        'orphanet': 'Orphanet',
        'omia': 'OMIA',
        'flybase': 'FlyBase',
        'mmrrc': 'MMRRC',
        'wormbase': 'WormBase',
        'mpd': 'MPD',
        'gwascatalog': 'GWASCatalog',
        'monarch': 'Monarch',
        'go': 'GeneOntology',
        'reactome': 'Reactome',
        'udp': 'UDP',
        'mgislim': 'MGISlim',
        'zfinslim': 'ZFINSlim',
        'bgee': 'Bgee',
        'mydrug': 'MyDrug',
        'stringdb': 'StringDB',
        'rgd': 'RGD',
        'sgd': 'SGD',
        'mychem': 'MyChem',
        'ebi': 'EBIGene2Phen',
        'xenbase': 'Xenbase'
    }

    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion Pipeline for Monarch',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-g',
                        '--graph',
                        type=str,
                        default="rdf_graph",
                        help='graph type: rdf_graph, streamed_graph')
    parser.add_argument('-s',
                        '--sources',
                        type=str,
                        default='?',
                        help='comma separated list of sources')
    parser.add_argument('-l',
                        '--limit',
                        type=int,
                        help='limit number of rows used')
    parser.add_argument('--parse_only',
                        action='store_true',
                        help='parse files without writing RDF')
    parser.add_argument('--fetch_only',
                        action='store_true',
                        help='fetch sources without parsing')
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        help='force re-download of files')
    parser.add_argument('--no_verify',
                        help='ignore the verification step',
                        action='store_true')
    # parser.add_argument( '--query', help='enter in a sparql query', type=str)
    parser.add_argument('-q',
                        '--quiet',
                        help='turn off info logging',
                        action="store_true")
    parser.add_argument('--debug',
                        help='turn on debug logging',
                        action="store_true")
    parser.add_argument('--skip_tests',
                        help='skip any testing',
                        action="store_true")
    # Blank Nodes can't be visualized in Protege, default to Skolemizing them
    parser.add_argument('-b',
                        '--use_bnodes',
                        help="use blank nodes instead of skolemizing",
                        action="store_true",
                        default=False)
    #
    parser.add_argument(  # TODO help needs revisiting, push constraints off the the src
        '-t',
        '--taxon',
        type=str,
        help='''
            Constrain Source to supplied taxon identifier(s).
            Please enter comma delimited NCBITaxon numbers:
            Implemented taxa per source
            NCBIGene: 9606,10090,7955
            Panther: 9606,10090,10116,7227,7955,6239,8355
            BioGrid: 9606,10090,10116,7227,7955,6239,8355
            UCSCBands: 9606
            GO: 9606,10090,10116,7227,7955,6239,9615,9823,9031,9913,4896,5782,5052
    ''')
    parser.add_argument(
        '-o',
        '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")

    parser.add_argument(
        '--dest_fmt',
        help='serialization format: [turtle], nt, nquads, rdfxml, n3, raw',
        type=str)

    parser.add_argument('-v',
                        '--version',
                        help='version of source (deprecated)',
                        type=str)

    parser.add_argument('-d',
                        '--data_release_version',
                        help='''
            string indicating the version of data release, e.g. '\'201908\' (YYYYMM),
            used to construct metadata, including version and distribution IRIs
            and downloadURLs
            [defaults to date at start of runtime in ISO 8601 format]
        ''',
                        type=str)

    args = parser.parse_args()
    tax_ids = None
    if args.taxon is not None:
        # non-numeric entries are silently dropped
        tax_ids = [str(t) for t in args.taxon.split(',') if t.isdigit()]

    # source classes that accept a tax_ids constraint
    species_specific = [
        'Panther', 'NCBIGene', 'BioGrid', 'UCSCBands', 'GeneOntology', 'Bgee',
        'StringDB', 'Ensembl'
    ]

    formats_supported = [
        'turtle', 'ttl', 'ntriples', 'nt', 'nquads', 'nq', 'rdfxml', 'xml',
        'notation3', 'n3', 'raw'
    ]

    if args.quiet:
        logging.getLogger().setLevel(logging.WARNING)
    else:
        if args.debug:
            logging.getLogger().setLevel(logging.DEBUG)
        else:
            logging.getLogger().setLevel(logging.INFO)

    if not args.use_bnodes:
        LOG.info("Will Skolemize Blank Nodes")

    # None of these query test utils exist in ./dipper/utils/TestUtils.py
    # if args.query is not None:
    #    test_query = TestUtils()
    #    for source in args.sources.split(','):
    #        source = source.lower()
    #        mysource = source_to_class_map[source]()
    #        # import source lib
    #        module = "dipper.sources.{0}".format(mysource)
    #        imported_module = importlib.import_module(module)
    #        source_class = getattr(imported_module, mysource)
    #        test_query.check_query_syntax(args.query, source_class)
    #        test_query.load_graph_from_turtle(source_class)
    #
    #    print(test_query.query_graph(args.query, True))
    #    exit(0)

    # run initial tests
    if (args.no_verify or args.skip_tests) is not True:
        unittest.TextTestRunner(verbosity=2).run(TEST_SUITE)

    # set serializer: fold the accepted aliases onto one canonical name
    if args.dest_fmt is not None:
        if args.dest_fmt in formats_supported:
            if args.dest_fmt == 'ttl':
                args.dest_fmt = 'turtle'
            elif args.dest_fmt == 'ntriples':
                args.dest_fmt = 'nt'
            elif args.dest_fmt == 'nq':
                args.dest_fmt = 'nquads'
            elif args.dest_fmt == 'xml':
                args.dest_fmt = 'rdfxml'
            elif args.dest_fmt == 'notation3':
                args.dest_fmt = 'n3'
        else:
            LOG.error("You have specified an invalid serializer: %s",
                      args.dest_fmt)
            exit(0)
    else:
        args.dest_fmt = 'turtle'

    # Normalise the requested sources once, the same way the ingest loop
    # keys the map (lower case), and validate every one of them up front
    # so a bad second source cannot surface later as a KeyError after the
    # first source has already been fetched and parsed.
    requested = []
    if args.sources is not None:
        requested = [name.strip().lower() for name in args.sources.split(',')]
    unknown = [name for name in requested if name not in source_to_class_map]

    # Provide feedback if we can't proceed
    if not requested or unknown:
        LOG.info('Unknown Source %s',
                 ', '.join(unknown) if unknown else args.sources)
        LOG.info('Sources Known are limited to:')
        for key in sorted(source_to_class_map):
            LOG.info('\t%s\t%s', key, source_to_class_map[key])
        exit(0)

    # iterate through all the sources
    for source in requested:
        LOG.info("\n******* %s *******", source)
        src = source_to_class_map[source]

        # import source lib
        module = "dipper.sources.{0}".format(src)
        imported_module = importlib.import_module(module)
        source_class = getattr(imported_module, src)
        mysource = None

        LOG.info(
            'Command line arguments available to dipper-etl:\n%s',
            "\n".join(['\t{}: {}'.format(k, v)
                       for k, v in vars(args).items()]))

        source_args = dict(graph_type=args.graph)
        source_args['are_bnodes_skolemized'] = not args.use_bnodes
        if src in species_specific:
            source_args['tax_ids'] = tax_ids
        if args.version:
            source_args['version'] = args.version
        if args.data_release_version:
            source_args['data_release_version'] = args.data_release_version

        mysource = source_class(**source_args)

        # WIP cli args should be available to source
        if hasattr(mysource, 'ARGV'):
            mysource.ARGV = vars(args)
        else:
            LOG.error('no where to to put args in %s', mysource.__class__)

        if args.parse_only is False:
            start_fetch = time.perf_counter()
            mysource.fetch(args.force)

            end_fetch = time.perf_counter()
            LOG.info("Fetching time: %d sec", end_fetch - start_fetch)

        mysource.settestonly(args.test_only)

        # create source ingest graph first (with pristine arguments)
        if args.test_only is False and args.fetch_only is False:
            start_parse = time.perf_counter()
            mysource.parse(args.limit)

            end_parse = time.perf_counter()
            LOG.info("Parsing time: %d sec", end_parse - start_parse)

            if args.graph == 'rdf_graph':
                LOG.info("Found %d nodes", len(mysource.graph))

                # Add property axioms
                start_axiom_exp = time.perf_counter()
                LOG.info("Adding property axioms")

                properties = GraphUtils.get_properties_from_graph(
                    mysource.graph)
                GraphUtils.add_property_axioms(mysource.graph, properties)
                LOG.info("Property axioms added: %d sec",
                         time.perf_counter() - start_axiom_exp)

                start_write = time.perf_counter()
                mysource.write(fmt=args.dest_fmt)
                LOG.info("Writing time: %d sec",
                         time.perf_counter() - start_write)
            # elif args.graph == 'streamed_graph': ...

        # '*_test.ttl' graphs if requested
        if (args.no_verify or args.skip_tests) is False:
            suite = mysource.getTestSuite()
            if suite is None:
                LOG.warning("No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            LOG.info("Skipping Tests for source: %s", source)

        LOG.info('***** Finished with %s *****', source)

    LOG.info("All done.")
Example #47
0
    def _add_variant_cdna_variant_assoc_to_graph(self, row):
        """
        Add the variant / cDNA-variant relationships described by one row.

        The row is unpacked into 27 fields (see the tuple below); a row of
        any other length raises ValueError from the unpacking.
        For the row, this adds: the variant-gene relationship, the human
        genome and reference build, the chromosome class and instance,
        the variant coordinates on both the chromosome and the transcript,
        the reference/altered nucleotide literals, and COSMIC / dbSNP
        cross references.

        :param row: iterable of data, see add_variant_info_to_graph()
                    docstring for expected structure.
                    Only applicable for structure 2.
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)

        # Every column is named so that the row width is enforced,
        # even though several of the values are not used below.
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna,
         cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
         variant_base, primary_transcript_exons,
         primary_transcript_variant_sub_types, variant_type, chromosome,
         genome_build, build_version, build_date) = row

        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

        # Gene affected by the variant
        self._add_variant_gene_relationship(variant_id, variant_gene)

        # The transcript is the reference for the nucleotide position
        transcript_curie = self._make_transcript_curie(transcript_id)

        # Identifiers for the two regions the variant is located on
        cdna_region_id = ":_{0}Region".format(transcript_curie)
        chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(
            genome_build, chromosome, genome_pos_start, genome_pos_end)

        # Genome and the build it is taken from
        human_taxon = 'NCBITaxon:9606'
        build_id = "UCSC:{0}".format(genome_build)
        geno.addGenome(human_taxon, "Human")
        geno.addReferenceGenome(build_id, genome_build, human_taxon)

        # Chromosome: the generic class (in the taxon) first,
        # then the instance belonging to the given build
        chrom_class_id = makeChromID(chromosome, '9606', 'CHR')
        chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH')
        geno.addChromosomeClass(chromosome, human_taxon, 'Human')
        geno.addChromosomeInstance(
            chromosome, build_id, genome_build, chrom_class_id)

        # Variant coordinates relative to the chromosome ...
        self._add_feature_with_coords(
            variant_id, genome_pos_start, genome_pos_end,
            chrom_instance_id, chrom_region_id)
        # ... and relative to the transcript (a single base position)
        self._add_feature_with_coords(
            variant_id, bp_pos, bp_pos, transcript_curie, cdna_region_id)

        # Nucleotide change, stored as literals
        for property_key, base in (
                ('reference_nucleotide', ref_base),
                ('altered_nucleotide', variant_base)):
            gu.addTriple(
                self.graph, variant_id, geno.properties[property_key],
                base, True)

        # Here we update any internal cgd variant IDs with a cosmic ID
        # or dbSNP ID.  Alternatively we could do this using sql rather
        # than a sparql update which may be safer

        # COSMIC xrefs: the field may hold several ', ' separated ids
        cosmic_curies = []
        if cosmic_id is not None:
            for raw_cosmic in cosmic_id.split(', '):
                curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', raw_cosmic)
                cosmic_curies.append(curie)
                gu.addIndividualToGraph(
                    self.graph, curie, raw_cosmic,
                    geno.genoparts['missense_variant'])

            # If there are multiple ids set them equivalent to the first
            primary_cosmic = cosmic_curies[0]
            for alias in cosmic_curies[1:]:
                gu.addSameIndividual(self.graph, primary_cosmic, alias)

            self._replace_entity(
                self.graph, variant_id, primary_cosmic, self.bindings)

        # dbSNP xref
        if db_snp_id is not None:
            db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id)
            gu.addIndividualToGraph(
                self.graph, db_snp_curie, db_snp_id,
                geno.genoparts['missense_variant'])

            if cosmic_id is None:
                # no COSMIC id took over the variant, so dbSNP does
                self._replace_entity(
                    self.graph, variant_id, db_snp_curie, self.bindings)
            else:
                # otherwise every COSMIC id is the same individual as the SNP
                for curie in cosmic_curies:
                    gu.addSameIndividual(self.graph, curie, db_snp_curie)

        return
Example #48
0
    def get_uniprot_entrez_id_map(self):
        """
        Build (or reload) the UniProt accession to gene identifier mapping
        restricted to the taxa in ``self.tax_ids``.

        A taxon-specific YAML cache file in ``self.rawdir`` is used when it
        exists and is newer than the full id-mapping file; otherwise the
        full file is fetched, scanned row by row and the cache rewritten.

        Only unambiguous (1:1) mappings are kept: an Entrez gene id is
        preferred, an Ensembl id is the fallback, and rows whose candidate
        field is empty or a ';' separated list are counted as unmapped.

        :return: dict of UniProtKB accession -> 'NCBIGene:<id>' or
                 'ENSEMBL:<id>'
        """
        src_key = 'id-map'
        taxon_digest = GraphUtils.digest_id(str(self.tax_ids))
        id_map = {}
        smallfile = '/'.join((self.rawdir, 'id_map_' + taxon_digest + '.yaml'))
        bigfile = '/'.join((self.rawdir, self.files[src_key]['file']))

        # if processed smallfile exists and is newer than bigfile
        # then use it instead
        # NOTE(review): getctime() raises if bigfile is absent, and on Unix
        # it is the metadata-change time; getmtime() may be what is wanted.
        if os.path.isfile(smallfile) and \
                os.path.getctime(smallfile) > os.path.getctime(bigfile):
            LOG.info("Using the cheap mapping file %s", smallfile)
            with open(smallfile, 'r') as yamlreader:
                # an empty YAML document loads as None;
                # keep the "always a dict" contract for callers
                id_map = yaml.safe_load(yamlreader) or {}
        else:
            LOG.info(
                "Expensive Mapping from Uniprot IDs to Entrez/ENSEMBL gene ids for %s",
                self.tax_ids)
            self.fetch_from_url(self.files[src_key]['url'], bigfile)
            col = self.files[src_key]['columns']

            # Resolve the column positions once, not once per row:
            # the file is huge and list.index() is a linear scan.
            # Columns other than these four are not used here.
            uniprotkb_ac_idx = col.index('UniProtKB-AC')
            geneid_idx = col.index('GeneID (EntrezGene)')
            ncbitaxon_idx = col.index('NCBI-taxon')
            ensembl_idx = col.index('Ensembl')

            unmapped_uniprot = 0
            with gzip.open(bigfile, 'rb') as csvfile:
                csv.field_size_limit(sys.maxsize)
                reader = csv.reader(  # warning this file is over 10GB unzipped
                    io.TextIOWrapper(csvfile, newline=""),
                    delimiter='\t', quotechar='\"')
                for row in reader:
                    uniprotkb_ac = row[uniprotkb_ac_idx].strip()
                    geneid = row[geneid_idx].strip()
                    ncbitaxon = row[ncbitaxon_idx].strip()
                    ensembl = row[ensembl_idx].strip()

                    if ncbitaxon not in self.tax_ids:
                        continue

                    # neither empty nor a list
                    if geneid != '' and ';' not in geneid:
                        id_map[uniprotkb_ac] = 'NCBIGene:' + geneid
                    elif ensembl != '' and ';' not in ensembl:
                        id_map[uniprotkb_ac] = 'ENSEMBL:' + ensembl
                    else:
                        unmapped_uniprot += 1

            LOG.info("Writing id_map out as %s", smallfile)
            with open(smallfile, 'w') as yamlwriter:
                yaml.dump(id_map, yamlwriter)
            LOG.warning(
                'Did not find 1:1 gene IDs for %i uniprots', unmapped_uniprot)
        LOG.info(
            "Acquired %i 1:1 uniprot to [entrez|ensembl] mappings", len(id_map))

        return id_map
Example #49
0
    def add_disease_drug_variant_to_graph(self, table):
        """
        Add variant / disease / drug-response associations to the graph.

        Each row of the table is an 11-item iterable:
        [variant_key, variant_label, diagnoses_key, diagnoses,
         specific_diagnosis, organ, relationship,
         drug_key, drug, therapy_status (optional), pubmed_id (optional)]
        Optional items may be None.

        See ongoing discussion of how to best model here:
        https://github.com/monarch-initiative/mckb/issues/9

        :param table: iterable of iterables, for example, a tuple of tuples
                      from _get_disease_drug_variant_relationship
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)

        for record in table:
            (variant_key, variant_label, diagnoses_key, diagnoses,
             specific_diagnosis, organ, relationship,
             drug_key, drug_label, therapy_status, pubmed_id) = record

            # the more specific diagnosis wins when there is one
            diagnoses_label = (
                specific_diagnosis if specific_diagnosis is not None
                else diagnoses)

            # Arbitrary IDs to be replaced by ontology mappings
            variant_id = self.make_cgd_id('variant{0}'.format(variant_key))
            disease_id = self._get_disease_id(diagnoses_key, diagnoses_label)
            therapy_status_id = self.make_cgd_id('{0}'.format(therapy_status))
            relationship_id = "RO:has_environment"
            disease_quality = "CGD:{0}".format(relationship).replace(" ", "_")
            has_quality_property = "BFO:0000159"
            drug_id = self._get_drug_id(drug_key, drug_label)

            geno.addGenotype(
                variant_id, variant_label,
                geno.genoparts['sequence_alteration'])

            disease_instance_id = self.make_cgd_id(
                'disease{0}{1}'.format(diagnoses_label, variant_key))
            phenotype_instance_id = self.make_cgd_id(
                'phenotype{0}{1}{2}'.format(
                    diagnoses_label, variant_key, relationship))

            # a detrimental effect is worded differently from the rest
            if relationship == "detrimental effect":
                phenotype_instance_label = \
                    "{0} with therapeutic response {1} to health".format(
                        diagnoses_label, relationship)
            else:
                phenotype_instance_label = "{0} with {1} to therapy".format(
                    diagnoses_label, relationship)

            # Reified association for disease caused_by genotype
            variant_disease_annot = self.make_cgd_id(
                "assoc{0}{1}".format(variant_key, diagnoses_label))

            # Add individuals/classes
            gu.addClassToGraph(
                self.graph, disease_id, diagnoses_label, 'DOID:4')
            gu.addClassToGraph(self.graph, drug_id, drug_label, 'CHEBI:23888')
            gu.addIndividualToGraph(
                self.graph, phenotype_instance_id, phenotype_instance_label,
                disease_id)
            gu.loadObjectProperties(
                self.graph, {relationship: relationship_id})

            # a publication, when present, supplies source and evidence
            has_publication = pubmed_id is not None
            if has_publication:
                source_id = "PMID:{0}".format(pubmed_id)
                reference = Reference(
                    source_id, Reference.ref_types['journal_article'])
                reference.addRefToGraph(self.graph)

            variant_phenotype_assoc = G2PAssoc(
                self.name, variant_id, phenotype_instance_id,
                gu.object_properties['has_phenotype'])
            variant_phenotype_assoc.set_association_id(variant_disease_annot)
            if has_publication:
                variant_phenotype_assoc.add_evidence('ECO:0000033')
                variant_phenotype_assoc.add_source(source_id)
            variant_phenotype_assoc.add_association_to_graph(self.graph)

            gu.addTriple(
                self.graph, variant_disease_annot, relationship_id, drug_id)
            gu.addTriple(
                self.graph, phenotype_instance_id, has_quality_property,
                disease_quality)

            # Add therapy-disease association and approval status
            disease_instance_label = "{0} with biomarker {1}".format(
                diagnoses_label, variant_label)
            gu.addIndividualToGraph(
                self.graph, disease_instance_id, disease_instance_label,
                disease_id)
            gu.addTriple(
                self.graph, disease_instance_id, "RO:has_biomarker",
                variant_id)

            gu.addClassToGraph(self.graph, therapy_status_id, therapy_status)
            self._add_therapy_drug_association(
                drug_id, disease_instance_id, therapy_status_id)

        return
Example #50
0
    def write(self, fmt='turtle', stream=None):
        """
        This convenience method will write out all of the graphs
        associated with the source.
        Right now these are hardcoded to be a single "graph"
        and a "src_dataset.ttl" and a "src_test.ttl"
        If you do not supply stream='stdout'
        it will default write these to files.

        In addition, if the version number isn't yet set in the dataset,
        it will be set to the date on file.

        When ``self.name`` is None no file names can be derived, so both
        the dataset description and the graph are written with
        ``file=None`` (the same value used for stream='stdout').

        :param fmt: serialization format of the main graph; known formats
                    are mapped to a file extension, any other value is
                    used as the extension verbatim
        :param stream: None to write to files, or 'stdout'
        :return: None

        """
        fmt_ext = {
            'rdfxml': 'xml',
            'turtle': 'ttl',
            'nt': 'nt',  # ntriples
            'nquads': 'nq',
            'n3': 'n3'
        }

        # make the regular graph output file
        dest = None
        # must be bound even when there is no name; previously the
        # nameless path hit an UnboundLocalError further down
        datasetfile = None
        if self.name is not None:
            dest = '/'.join((self.outdir, self.name))
            dest = '.'.join((dest, fmt_ext.get(fmt, fmt)))
            logger.info("Setting outfile to %s", dest)

            # make the datasetfile name, always format as turtle
            datasetfile = '/'.join((self.outdir, self.name + '_dataset.ttl'))

            if self.dataset is not None and self.dataset.version is None:
                self.dataset.set_version_by_date()
                logger.info(
                    "No version for %s setting to date issued.", self.name)
        else:
            logger.warning("No output file set. Using stdout")
            stream = 'stdout'

        gu = GraphUtils(None)

        # the  _dataset descriptions is always turtle
        if self.dataset is not None:
            gu.write(self.dataset.getGraph(), 'turtle', file=datasetfile)
        else:
            # the version check above already allows for a missing dataset,
            # so do not crash on it here either
            logger.warning("No dataset description to write.")

        # unless we stop hardcoding above, the test dataset is always turtle
        if self.testMode:
            gu.write(self.testgraph, 'turtle', file=self.testfile)

        # print graph out
        if stream is None:
            f = dest
        elif stream.lower().strip() == 'stdout':
            f = None
        else:
            logger.error("I don't understand your stream.")
            return

        gu.write(self.graph, fmt, file=f)
        return
Example #51
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.

    """

    def __init__(
            self,
            graph,
            feature_id=None,
            label=None,
            feature_type=None,
            description=None,
            feature_category=None
    ):
        """
        Set up a feature bound to the given graph.

        The translation tables and curie map are taken from the graph;
        start, stop and taxon start out unset.

        :param graph: the Graph the feature will be written to
        :param feature_id: identifier of the feature
        :param label: label of the feature
        :param feature_type: type identifier of the feature
        :param description: free text description
        :param feature_category: category passed along when the feature
                                 is the subject of a triple
        :raises ValueError: if graph is not a Graph instance
        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))

        # graph and everything derived from it
        self.graph = graph
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gfxutl = GraphUtils(self.curie_map)

        # what the caller told us about the feature
        self.fid = feature_id
        self.feature_category = feature_category
        self.label = label
        self.ftype = feature_type
        self.description = description

        # filled in later by the add* methods
        self.start = None
        self.stop = None
        self.taxon = None

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand, position_types)

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:

        """

        self.stop = self._getLocation(coordinate, reference_id, strand, position_types)

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.globaltt['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        :param strand:
        """
        strand_id = None
        if strand == '+':
            strand_id = self.globaltt['plus_strand']
        elif strand == '-':
            strand_id = self.globaltt['minus_strand']
        elif strand == '.':
            strand_id = self.globaltt['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            LOG.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False,
            feature_category=None):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param add_region [True]
        :param region_id [None]
        :param feature_as_class [False]
        :param feature_category: a biolink category CURIE for feature
        """

        if feature_category is None:
            feature_category = self.feature_category

        if feature_as_class:
            self.model.addClassToGraph(
                self.fid, self.label, self.ftype, self.description,
                class_category=feature_category)
        else:
            self.model.addIndividualToGraph(
                self.fid, self.label, self.ftype, self.description,
                ind_category=feature_category)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            regionchr = re.sub(r'\w+\:_?', '', self.start['reference'])
            if region_id is None:
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(self.start['type'])
                if self.stop is not None and self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    if strand is not None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                # blank node, bnode
                rid = rid + "-Region"
                curie = '_:' + self.gfxutl.digest_id(rid)
                self.model.addLabel(curie, rid)
                region_id = curie

            self.graph.addTriple(
                self.fid,
                self.globaltt['location'],
                region_id,
                subject_category=feature_category
            )
            self.model.addIndividualToGraph(region_id, None, self.globaltt['Region'])
        else:
            region_id = self.fid
            self.model.addType(region_id, self.globaltt['region'])

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(
                self.start['reference'], self.start['coordinate'], self.start['type'])
            self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'], self.start['type'],
            )

        if self.stop is not None:
            endp = self._makePositionId(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])
            self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

    def _getStrandStringFromPositionTypes(self, tylist):
        strand = None
        if self.globaltt['plus_strand'] in tylist:
            strand = 'plus'
        elif self.globaltt['minus_strand'] in tylist:
            strand = 'minus'
        elif self.globaltt['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: bnode_curie
        """
        # blank node, bnode
        if reference is None:
            LOG.error("Trying to make position with no reference.")
            return None

        reference = re.sub(r'\w+\:', '', reference, 1)
        if reference[0] == '_':
            # in this case the reference is a bnode curie as well
            # ... this is a bad smell of over modleing
            reference = reference[1:]
        unique_words = reference
        if coordinate is not None:
            # just in case it isn't a string already
            unique_words = '-'.join((unique_words, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                unique_words = '-'.join((unique_words, tstring))

        curie = '_:' + self.gfxutl.digest_id(unique_words)

        # attach the wordage via a label
        # I want to see more of this (TEC 201905)
        # including a type should be mandatory as well
        self.model.addLabel(curie, unique_words)
        return curie

    def addRegionPositionToGraph(self, region_id, begin_position_id, end_position_id):

        if begin_position_id is None:
            pass
            # LOG.warn("No begin position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['begin'], begin_position_id)

        if end_position_id is None:
            pass
            # LOG.warn("No end position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['end'], end_position_id)

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None
    ):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param graph:
        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id,
                self.globaltt['position'],
                position,
                object_is_literal=True,
                literal_type="xsd:integer"
            )
        self.graph.addTriple(
            pos_id, self.globaltt['reference'], reference_id
        )
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        strnd = None
        if strand is not None:
            strnd = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                strnd = self._getStrandType(strand)
        # else:
        #    strnd = self.globaltt['both_strand']
        if strnd is None and (position_types is None or position_types == []):
            strnd = self.globaltt['Position']

        if strnd is not None:
            self.model.addType(pos_id, strnd)

        return pos_id

    def addSubsequenceOfFeature(
            self, parentid, subject_category=None, object_category=None
    ):
        """
        This will add reciprocal triples like:
        feature <is subsequence of> parent
        parent has_subsequence feature
        :param graph:
        :param parentid:

        :return:

        """
        self.graph.addTriple(
            self.fid,
            self.globaltt['is subsequence of'],
            parentid,
            subject_category=subject_category,
            object_category=object_category
        )
        # this should be expected to be done in reasoning not ETL
        self.graph.addTriple(
            parentid,
            self.globaltt['has subsequence'],
            self.fid,
            subject_category=object_category,
            object_category=subject_category
        )

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param graph:
        :param taxonid:
        :return:
        """
        self.taxon = taxonid
        self.graph.addTriple(
            self.fid,
            self.globaltt['in taxon'],
            self.taxon,
            subject_category=self.feature_category
        )

    def addFeatureProperty(self, property_type, feature_property):

        self.graph.addTriple(
            self.fid,
            property_type,
            feature_property,
            subject_category=self.feature_category
        )