Ejemplo n.º 1
0
    def _add_snp_to_graph(
            self, snp_id, snp_label, chrom_num, chrom_pos, context,
            risk_allele_frequency=None):
        """
        Add a SNP feature, its taxon and its context-derived types
        to the graph (the test graph when self.testMode is set).

        :param snp_id: identifier used for the SNP feature
        :param snp_label: label of the SNP; surrounding whitespace is removed
        :param chrom_num: chromosome, or '' when not known
        :param chrom_pos: position on the chromosome, or '' when not known
        :param context: ';'-separated variant context terms
        :param risk_allele_frequency: optional; None, '' and 'NR' are
            ignored, any other value becomes the feature description
        :return: None
        """
        # constants
        tax_id = 'NCBITaxon:9606'
        genome_version = 'GRCh38'

        g = self.testgraph if self.testMode else self.graph
        model = Model(g)

        # a location is only known when both coordinates are present
        is_positioned = chrom_num != '' and chrom_pos != ''

        location = None
        if is_positioned:
            location = self._make_location_curie(chrom_num, chrom_pos)
            self.id_location_map.setdefault(location, set())

        allele_suffix = re.search(r'-(.*)$', snp_id)
        if allele_suffix is not None \
                and re.match(r'[ATGC]', allele_suffix.group(1)):
            # add variation to snp
            pass  # TODO

        # remember which snps share this location
        if location is not None:
            self.id_location_map[location].add(snp_id)

        # create the chromosome
        chrom_id = makeChromID(chrom_num, genome_version, 'CHR')

        # the frequency, when reported, is kept as the description
        has_frequency = (
            risk_allele_frequency is not None
            and risk_allele_frequency != ''
            and risk_allele_frequency != 'NR')
        if has_frequency:
            snp_description = ' '.join(
                (str(risk_allele_frequency), '[risk allele frequency]'))
        else:
            snp_description = None

        # add the feature to the graph
        f = Feature(
            g, snp_id, snp_label.strip(),
            Feature.types['SNP'], snp_description)
        if is_positioned:
            # a SNP starts and ends on the same position
            f.addFeatureStartLocation(chrom_pos, chrom_id)
            f.addFeatureEndLocation(chrom_pos, chrom_id)
        f.addFeatureToGraph()
        f.addTaxonToFeature(tax_id)
        # TODO consider adding allele frequency as property;
        # but would need background info to do that

        # also want to add other descriptive info about
        # the variant from the context
        context_terms = (part.strip() for part in re.split(r';', context))
        for term in context_terms:
            variant_type = self._map_variant_type(term)
            if variant_type is not None:
                model.addType(snp_id, variant_type)
Ejemplo n.º 2
0
    def _add_snp_to_graph(self,
                          snp_id,
                          snp_label,
                          chrom_num,
                          chrom_pos,
                          context,
                          risk_allele_frequency=None):
        """
        Add a SNP feature, its taxon and its context-derived types
        to the graph (the test graph when self.test_mode is set).

        :param snp_id: identifier used for the SNP feature
        :param snp_label: label of the SNP; surrounding whitespace is removed
        :param chrom_num: chromosome, or '' when not known
        :param chrom_pos: position on the chromosome, or '' when not known
        :param context: ';'-separated variant context terms
        :param risk_allele_frequency: optional; None, '' and 'NR' are
            ignored, any other value becomes the feature description
        :return: None
        """
        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)

        # a location is only known when both coordinates are present
        has_position = chrom_num != '' and chrom_pos != ''

        location = None
        if has_position:
            location = self._make_location_curie(chrom_num, chrom_pos)
            self.id_location_map.setdefault(location, set())

        alteration = re.search(r'-(.*)$', snp_id)
        if alteration is not None and re.match(r'[ATGC]', alteration.group(1)):
            # add variation to snp
            pass  # TODO

        # remember which snps share this location
        if location is not None:
            self.id_location_map[location].add(snp_id)

        # create the chromosome
        chrom_id = makeChromID(chrom_num, self.localtt['reference assembly'],
                               'CHR')

        # the frequency, when reported, is kept as the description
        snp_description = None
        if risk_allele_frequency is not None\
                and risk_allele_frequency != ''\
                and risk_allele_frequency != 'NR':
            snp_description = str(
                risk_allele_frequency) + ' [risk allele frequency]'

        # add the feature to the graph
        feat = Feature(graph, snp_id, snp_label.strip(), self.globaltt['SNP'],
                       snp_description)
        if has_position:
            # a SNP starts and ends on the same position
            feat.addFeatureStartLocation(chrom_pos, chrom_id)
            feat.addFeatureEndLocation(chrom_pos, chrom_id)
        feat.addFeatureToGraph()
        # the taxon label had been mangled to 'H**o sapiens',
        # which is not a usable translation-table key
        feat.addTaxonToFeature(self.globaltt['Homo sapiens'])
        # TODO consider adding allele frequency as property;
        # but would need background info to do that

        # also want to add other descriptive info about
        # the variant from the context
        for ctx in re.split(r';', context):
            ctx = ctx.strip()
            cid = self.resolve(ctx, False)
            # a term handed back unchanged is treated as "no mapping"
            if cid != ctx:
                model.addType(snp_id, cid)
Ejemplo n.º 3
0
    def _add_snp_to_graph(
            self, snp_id, snp_label, chrom_num, chrom_pos, context,
            risk_allele_frequency=None):
        """
        Add a SNP feature, its taxon and its context-derived types
        to the graph (the test graph when self.test_mode is set).

        :param snp_id: identifier used for the SNP feature
        :param snp_label: label of the SNP; surrounding whitespace is removed
        :param chrom_num: chromosome, or '' when not known
        :param chrom_pos: position on the chromosome, or '' when not known
        :param context: ';'-separated variant context terms
        :param risk_allele_frequency: optional; None, '' and 'NR' are
            ignored, any other value becomes the feature description
        :return: None
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        # a location is only known when both coordinates are present
        located = chrom_num != '' and chrom_pos != ''

        if located:
            location = self._make_location_curie(chrom_num, chrom_pos)
            if location not in self.id_location_map:
                self.id_location_map[location] = set()
        else:
            location = None

        alteration = re.search(r'-(.*)$', snp_id)
        if alteration is not None and re.match(r'[ATGC]', alteration.group(1)):
            # add variation to snp
            pass  # TODO

        # remember which snps share this location
        if location is not None:
            self.id_location_map[location].add(snp_id)

        # create the chromosome
        chrom_id = makeChromID(chrom_num, self.localtt['reference assembly'], 'CHR')

        # the frequency, when reported, is kept as the description
        snp_description = None
        if risk_allele_frequency is not None\
                and risk_allele_frequency != ''\
                and risk_allele_frequency != 'NR':
            snp_description = str(risk_allele_frequency) + ' [risk allele frequency]'

        # add the feature to the graph
        feat = Feature(
            graph, snp_id, snp_label.strip(), self.globaltt['SNP'], snp_description)
        if located:
            # a SNP starts and ends on the same position
            feat.addFeatureStartLocation(chrom_pos, chrom_id)
            feat.addFeatureEndLocation(chrom_pos, chrom_id)
        feat.addFeatureToGraph()
        # the taxon label had been mangled to 'H**o sapiens',
        # which is not a usable translation-table key
        feat.addTaxonToFeature(self.globaltt['Homo sapiens'])
        # TODO consider adding allele frequency as property;
        # but would need background info to do that

        # also want to add other descriptive info about
        # the variant from the context
        for ctx in re.split(r';', context):
            ctx = ctx.strip()
            cid = self.resolve(ctx, False)
            # a term handed back unchanged is treated as "no mapping"
            if cid != ctx:
                model.addType(snp_id, cid)
Ejemplo n.º 4
0
    def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
        """
        Add a gene "expressed in" anatomy association to the graph and
        attach the rank to that association as an xsd:float literal.

        :param gene_id: str Non curified ID; "ENSEMBL:" is prepended here
        :param anatomy_curie: str curified anatomy term
        :param rank: rank; stringified and stripped of commas
            before being converted to float
        :return: None
        """
        g2a_association = Assoc(self.graph, self.name)
        model = Model(self.graph)
        gene_curie = "ENSEMBL:{}".format(gene_id)

        # rank may arrive as a number or as text with thousands separators
        rank_text = str(rank).replace(',', '')

        model.addType(gene_curie, self.globaltt['gene'])

        g2a_association.sub = gene_curie
        g2a_association.obj = anatomy_curie
        g2a_association.rel = self.globaltt['expressed in']
        g2a_association.add_association_to_graph()

        # the quantifier hangs off the association,
        # so it is added after the association itself
        quantifier = self.globaltt['has_quantifier']
        g2a_association.add_predicate_object(
            quantifier, float(rank_text), 'Literal', 'xsd:float')
Ejemplo n.º 5
0
    def _process_pathway(self, row):
        """
        Turn one row of CTD_genes_pathways.tsv.gz into triples:
        the gene is typed as a gene, the pathway is added,
        and the gene is added to that pathway.
        Args:
            :param row (list): four fields - gene symbol, gene id,
                               pathway name, pathway id
        Returns:
            :return None
        """
        model = Model(self.graph)
        self._check_list_len(row, 4)
        gene_symbol, gene_id, pathway_name, pathway_id = row

        # in test mode only the configured test genes are kept
        outside_test_set = \
            self.test_mode and (int(gene_id) not in self.test_geneids)
        if outside_test_set:
            return

        entrez_id = ''.join(('NCBIGene:', gene_id))

        # these are lame "pathways" like generic
        # "disease" and "developmental biology"
        pathways_to_scrub = (
            'REACT:REACT_116125',  # disease
            "REACT:REACT_111045",  # developmental biology
            "REACT:REACT_200794",  # Mus musculus biological processes
            "REACT:REACT_13685",  # neuronal system ?
        )
        if pathway_id in pathways_to_scrub:
            return

        # convert KEGG pathway ids... KEGG:12345 --> KEGG-path:map12345
        if pathway_id.startswith('KEGG'):
            pathway_id = pathway_id.replace('KEGG:', 'KEGG-path:map')

        # just in case, add it as a class
        model.addType(entrez_id, self.globaltt['gene'])

        self.pathway.addPathway(pathway_id, pathway_name)
        self.pathway.addGeneToPathway(entrez_id, pathway_id)
Ejemplo n.º 6
0
    def _process_protein_links(
            self, dataframe, p2gene_map, taxon, limit=None, rank_min=700
    ):
        """
        Add gene-gene "interacts with" triples for the protein pairs
        whose combined score is above rank_min.

        :param dataframe: table with 'protein1', 'protein2' and
            'combined_score' columns
        :param p2gene_map: maps a protein id (taxon prefix removed)
            to an iterable of gene curies
        :param taxon: taxon id; the "<taxon>." text is removed
            from the protein ids
        :param limit: optional; stop once the index of a mapped row
            reaches this value
        :param rank_min: rows scoring at or below this value are dropped
        :return: None
        """
        model = Model(self.graph)

        taxon_prefix = '{}.'.format(taxon)
        high_scoring = dataframe[dataframe['combined_score'] > rank_min]
        filtered_out_count = 0

        for index, row in high_scoring.iterrows():
            # drop the taxon prefix to get the bare protein ids
            protein1 = row['protein1'].replace(taxon_prefix, '')
            protein2 = row['protein2'].replace(taxon_prefix, '')

            # Keep orientation the same since RO!"interacts with" is symmetric
            # TEC: symeteric expansion is the job of post processing not ingest
            if protein1 >= protein2:
                first, second = protein1, protein2
            else:
                first, second = protein2, protein1

            try:
                gene1_curies = p2gene_map[first]
                gene2_curies = p2gene_map[second]
            except KeyError:
                # at least one protein has no gene mapping
                filtered_out_count += 1
                continue

            if gene1_curies is None or gene2_curies is None:
                continue

            for gene1 in gene1_curies:
                for gene2 in gene2_curies:
                    model.addType(gene1, self.globaltt['gene'])
                    model.addType(gene2, self.globaltt['gene'])
                    self.graph.addTriple(
                        gene1, self.globaltt['interacts with'], gene2)

            # the limit is only examined for rows that were mapped
            if limit is not None and index >= limit:
                break

        LOG.info(
            "Finished parsing p-p interactions for %s, "
            "%i rows filtered out based on checking ensembl proteins",
            taxon, filtered_out_count)
Ejemplo n.º 7
0
    def process_gaf(self, gaffile, limit, id_map=None):
        """
        Parse a gzipped, tab-delimited gene association (GAF) file and add
        its gene-to-GO associations to the graph
        (the test graph when self.test_mode is set).

        For each accepted row the gene receives a label, a type, an optional
        description, synonyms and a taxon, and an association between the
        gene and the GO term is built with its evidence and references.
        Rows with 'IMP' evidence and a non-empty "With (or) From" column
        additionally produce G2PAssoc associations whose phenotype
        identifier is the GO id followed by 'PHENOTYPE'.

        Rows are skipped when they are blank or comments, lack one of the
        required values read here, carry a 'NOT' qualifier, or hold a
        UniProtKB accession that is absent from id_map.

        :param gaffile: path of the gzipped GAF file
        :param limit: stop once reader.line_num exceeds this value;
            None means no limit; not applied in test mode
        :param id_map: optional mapping of UniProtKB accession to a gene
            curie of the form 'prefix:local_id'
        :return: None
        :raises SystemExit: when a row does not have as many columns
            as self.gaf_columns
        """
        graph = self.testgraph if self.test_mode else self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", gaffile)
        uniprot_hit = 0
        uniprot_miss = 0
        col = self.gaf_columns

        # column positions are the same for every row; look them up once
        ix_dbase = col.index('DB')
        ix_gene_num = col.index('DB_Object_ID')
        ix_gene_symbol = col.index('DB_Object_Symbol')
        ix_qualifier = col.index('Qualifier')
        ix_go_id = col.index('GO_ID')
        ix_ref = col.index('DB:Reference')
        ix_eco_symbol = col.index('Evidence Code')
        ix_with_or_from = col.index('With (or) From')
        ix_aspect = col.index('Aspect')
        ix_gene_name = col.index('DB_Object_Name')
        ix_gene_synonym = col.index('DB_Object_Synonym')
        ix_taxon = col.index('Taxon and Interacting taxon')
        # columns not used (yet): DB_Object_Type, Date, Assigned_By,
        # Annotation_Extension, Gene_Product_Form_ID

        with gzip.open(gaffile, 'rb') as csvfile:
            reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter='\t',
                                quotechar='\"')
            for row in reader:
                # csv.reader hands back an empty list for a blank line
                if not row:
                    continue
                # comments start with exclamation
                if row[0].startswith('!'):
                    continue

                if len(row) != len(col):
                    LOG.error(
                        "Wrong number of columns %i, expected ... got:\n\t%s",
                        len(col), row)
                    raise SystemExit(1)

                dbase = row[ix_dbase].strip()
                gene_num = row[ix_gene_num].strip()
                gene_symbol = row[ix_gene_symbol].strip()
                qualifier = row[ix_qualifier]
                go_id = row[ix_go_id].strip()
                ref_field = row[ix_ref].strip()
                eco_symbol = row[ix_eco_symbol].strip()
                with_or_from = row[ix_with_or_from]
                aspect = row[ix_aspect].strip()
                gene_name = row[ix_gene_name]
                gene_synonym = row[ix_gene_synonym]
                taxon = row[ix_taxon].strip()

                # test for required fields.
                # (Comparing '' against a slice of the row can never match,
                # so the values that are actually needed are tested.)
                required = (
                    dbase, gene_num, gene_symbol, go_id,
                    ref_field, eco_symbol, aspect, taxon)
                if '' in required:
                    LOG.error(
                        "Missing required part of annotation on row %i:\n%s",
                        reader.line_num, str(row[:-4]))
                    continue

                # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    # NOTE(review): without an id_map, gene_id stays None for
                    # UniProtKB rows and is used as-is below -- confirm
                    # whether such rows should be skipped or keep the
                    # UniProtKB curie
                    if id_map is not None:
                        # try/except much faster than checking
                        # for dict key membership
                        try:
                            gene_id = id_map[gene_num]
                            uniprotid = ':'.join((dbase, gene_num))
                            (dbase, gene_num) = gene_id.split(':')
                            uniprot_hit += 1
                        except KeyError:
                            # UniProt id is without a mapping in id_map
                            uniprot_miss += 1
                            continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                        gene_num not in self.test_ids:
                    continue

                model.addLabel(gene_id, gene_symbol)
                model.addType(gene_id, self.globaltt['gene'])

                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        syn = syn.strip()
                        if syn[:10] == 'UniProtKB:':
                            model.addTriple(gene_id,
                                            self.globaltt['has gene product'],
                                            syn)
                        elif re.fullmatch(graph.curie_regexp, syn) is not None and\
                                syn.split(':')[0] not in self.wont_prefix:
                            LOG.warning(
                                'possible curie "%s" as a literal synomym for %s',
                                syn, gene_id)
                            if syn != '':
                                model.addSynonym(gene_id, syn)
                        elif syn != '':
                            model.addSynonym(gene_id, syn)

                # First taxon is for the gene, after the pipe are interacting taxa
                tax_curie = taxon.split('|')[0].replace('taxon', 'NCBITaxon')
                # this is a required field but good to safe
                if tax_curie:
                    geno.addTaxon(tax_curie, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)
                if uniprotid is not None:
                    # has to be set before the association is added to the
                    # graph, otherwise it is never written
                    assoc.set_description('Mapped from ' + uniprotid)

                try:
                    eco_id = self.gaf_eco[eco_symbol]
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)
                else:
                    assoc.add_evidence(eco_id)

                # normalised reference curies, reused for the
                # phenotype associations further down
                ref_curies = []
                for raw_ref in re.split(r'\|', ref_field):
                    raw_ref = raw_ref.strip()
                    if raw_ref == '':
                        continue
                    ref_parts = raw_ref.split(':')
                    prefix = ref_parts[-2]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref_curie = ':'.join((prefix, ref_parts[-1]))
                    ref_curies.append(ref_curie)
                    refg = Reference(graph, ref_curie)
                    if prefix == 'PMID':
                        refg.setType(self.globaltt['journal article'])
                    refg.addRefToGraph()
                    assoc.add_source(ref_curie)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    # the aspect came back unchanged
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        # NOTE(review): the relationship is set but the
                        # association is not added to the graph in this
                        # branch -- confirm that this is intended
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                ########################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'[|,]', with_or_from)  # OR + AND
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for itm in withitems:
                        if itm == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                            LOG.warning("Skipping  %s from or with %s",
                                        uniprotid, itm)
                            continue
                        # sanity check/conversion on go curie prefix
                        (pfx, lclid) = itm.split(':')[-2:]  # last prefix wins
                        if pfx in self.localtt:
                            pfx = self.localtt[pfx]
                        itm = ':'.join((pfx, lclid))

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', itm):
                            targeted_gene_id = self.zfin.make_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', itm):
                            targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, itm,
                                             phenotypeid)
                        for ref_curie in ref_curies:
                            assoc.add_source(ref_curie)
                            # experimental phenotypic evidence
                            # (added once per reference)
                            assoc.add_evidence(self.globaltt[
                                'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be the evidence for the GO assoc?

                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the idmapping_selected download",
                uniprot_per, uniprot_tot)
Ejemplo n.º 8
0
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    An instance collects a subject, a relation and an object together with
    optional description, evidence, sources, provenance, dates and a score;
    add_association_to_graph() then writes the statement and an association
    node carrying those annotations to the graph.

    """

    assoc_types = {
        'association': 'OBAN:association'
    }

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
        'inchi_key': 'CHEBI:InChIKey',
        'probabalistic_quantifier': 'GENO:0000867'
    }

    object_properties = {
        'has disposition': 'RO:0000091',
        'has_phenotype': 'RO:0002200',
        'expressed_in': 'RO:0002206',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_predicate',
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance',
        'causes_or_contributes': 'RO:0003302'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004',
        'has_quantifier': 'GENO:0000866',
        'created_on': 'pav:createdOn'
    }

    # all three property tables merged into one lookup
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: the Graph the association is written to
        :param definedby: the (data) resource that provided the annotation;
            it is part of the generated association id
        :param sub: subject of the statement
        :param obj: object of the statement
        :param pred: relation between subject and object
        :raises ValueError: when graph is not a Graph
        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

    def get_properties(self):
        """Return the merged property table."""
        return self.properties

    def _is_valid(self):
        """
        Check that subject, object and relation have all been set.

        :return: True
        :raises ValueError: naming the first part that is missing
        """
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self):
        """
        Write the statement itself, then the association node with its
        subject/object/predicate links and every annotation collected so far
        (description, evidence, sources, provenance, dates, score).
        An association id is generated when none has been set.

        :raises ValueError: when subject, object or relation is missing
        """
        if not self._is_valid():
            return

        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        self.model.addType(self.assoc_id, self.assoc_types['association'])

        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_subject'], self.sub)
        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_object'], self.obj)
        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        # the "or ()" keeps a None (unset) collection from breaking the loop
        for evidence in self.evidence or ():
            self.graph.addTriple(
                self.assoc_id, self.object_properties['has_evidence'],
                evidence)

        for source in self.source or ():
            if re.match('http', source):
                # TODO assume that the source is a publication?
                # use Reference class here
                self.graph.addTriple(
                    self.assoc_id, self.object_properties['has_source'],
                    source, True)
            else:
                self.graph.addTriple(
                    self.assoc_id, self.object_properties['has_source'],
                    source)

        for provenance in self.provenance or ():
            self.graph.addTriple(
                self.assoc_id, self.object_properties['has_provenance'],
                provenance)

        for date in self.date or ():
            self.graph.addTriple(
                object_is_literal=True,
                subject_id=self.assoc_id,
                predicate_id=self.datatype_properties['created_on'],
                obj=date)

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.properties['has_measurement'],
                self.score, True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

    def add_association_to_graph(self):
        """
        Write the association and its annotations to the graph.

        :raises ValueError: when subject, object or relation is missing
        """
        self._add_basic_association_to_graph()

    def add_predicate_object(self, predicate, object_node,
                             object_type=None, datatype=None):
        """
        Add one more triple whose subject is this association's id.

        :param predicate: predicate of the triple
        :param object_node: object of the triple
        :param object_type: 'Literal' to add the object as a literal;
            anything else adds it as a non-literal
        :param datatype: optional datatype, only used for literals
        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(self.assoc_id, predicate,
                                     object_node, True, datatype)
            else:
                self.graph.addTriple(self.assoc_id, predicate,
                                     object_node, True)
        else:
            self.graph.addTriple(self.assoc_id, predicate,
                                 object_node, False)

    # This isn't java, but if we must,
    # prefer use of property decorator
    def set_subject(self, identifier):
        """Set the subject of the statement."""
        self.sub = identifier

    def set_object(self, identifier):
        """Set the object of the statement."""
        self.obj = identifier

    def set_relationship(self, identifier):
        """Set the relation between subject and object."""
        self.rel = identifier

    def set_association_id(self, assoc_id=None):
        """
        Set the association ID.
        Without an argument the id is generated from the internal parts
        of the association (definedby, subject, relation and object).
        Pass assoc_id in cases where an external association identifier
        should be used.

        :param assoc_id: optional externally supplied identifier

        :return:

        """
        if assoc_id is None:
            assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        self.assoc_id = assoc_id

    def get_association_id(self):
        """Return the association id; None when it has not been set yet."""
        return self.assoc_id

    def set_description(self, description):
        """Set the description written with the association."""
        self.description = description

    def set_score(self, score, unit=None, score_type=None):
        """
        Store a score with optional unit and type.
        Only the score itself is written to the graph at present.
        """
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        None and blank identifiers are ignored.

        :param identifier:

        :return:

        """
        if identifier is not None and identifier.strip() != '':
            self.evidence.append(identifier)

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier:

        :return:

        """
        if identifier is not None and identifier.strip() != '':
            self.source.append(identifier)

    def add_date(self, date):
        """Add a date (maintained as a list); None and blank are ignored."""
        if date is not None and date.strip() != '':
            self.date.append(date)

    def add_provenance(self, identifier):
        """
        Add a provenance identifier (maintained as a list);
        None and blank identifiers are ignored.
        """
        if identifier is not None and identifier.strip() != '':
            self.provenance.append(identifier)

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is None, it will convert it to blank.
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes: optional extra strings included in the digest

        :return: 'MONARCH:' followed by the first 16 hex characters
            of the sha1 digest of the '+'-joined parts

        """

        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        # (although the point of a digest is to render such details moot).

        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash.extend(attributes)

        items_to_hash = ['' if val is None else val for val in items_to_hash]

        byte_string = '+'.join(items_to_hash).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(('MONARCH', hashlib.sha1(byte_string).hexdigest()[0:16]))
Ejemplo n.º 9
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """
    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph that the reference will be written to
        :param ref_id: identifier of the reference; when it begins with
            'http' it is also kept as the reference's url
        :param ref_type: type of the reference;
            defaults to the global term for 'document'
        :raises ValueError: if graph is not a Graph
        """
        if not isinstance(graph, Graph):
            # exceptions do not interpolate %-style arguments,
            # so the message has to be formatted here
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph

        # assert ref_id is not None

        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            # only a warning; the unexpected type is still used
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of:  %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

    def setTitle(self, title):
        self.title = title

    def setYear(self, year):
        self.year = year

    def setType(self, reference_type):
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """

        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author to the author list,
        starting a new list if none has been set yet.

        :param author:
        :return:
        """
        if self.author_list is None:
            # author_list starts out as None
            self.author_list = []
        self.author_list.append(author)

    def setShortCitation(self, citation):
        self.short_citation = citation

    def addPage(self,
                subject_id,
                page_url,
                subject_category=None,
                page_category=None):
        """
        Add a triple linking subject_id to page_url with the 'page' predicate.

        :param subject_id:
        :param page_url: added as a non-literal object
        :param subject_category:
        :param page_category: passed on as the object's category
        :return:
        """
        self.graph.addTriple(
            subject_id,
            self.globaltt['page'],  # foaf:page  not  <sio:web page>
            page_url,
            object_is_literal=False,  # URL is not a literal
            subject_category=subject_category,
            object_category=page_category)

    def addTitle(self, subject_id, title):
        """
        Add title as a literal 'title' of subject_id.
        Nothing is added when title is None or empty.

        :param subject_id:
        :param title:
        :return:
        """
        if title is not None and title != '':
            self.graph.addTriple(subject_id,
                                 self.globaltt['title'],
                                 title,
                                 object_is_literal=True)

    def addRefToGraph(self):
        """
        Write this reference to the graph.

        The label is the short citation if there is one, else the title.
        A reference that has a url is typed, labelled and titled on the url;
        otherwise it is added as an individual under its ref_id.
        Having neither is logged as an error and nothing is added.

        :return:
        """

        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")
Ejemplo n.º 10
0
class Genotype():
    """
    These methods provide convenient methods to
    add items related to a genotype and its parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.

    """
    def __init__(self, graph):
        """
        :param graph: the Graph that the genotype parts will be written to
        :raises ValueError: if graph is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

    def addGenotype(self,
                    genotype_id,
                    genotype_label,
                    genotype_type=None,
                    genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:

        """
        if genotype_type is None:
            genotype_type = self.globaltt['intrinsic_genotype']

        self.model.addIndividualToGraph(genotype_id, genotype_label,
                                        genotype_type, genotype_description)

    def addAllele(self,
                  allele_id,
                  allele_label,
                  allele_type=None,
                  allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional,
        recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:

        """

        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.globaltt['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(allele_id, allele_label, allele_type,
                                        allele_description)

    def addGene(self,
                gene_id,
                gene_label=None,
                gene_type=None,
                gene_description=None):
        ''' genes are classes '''
        if gene_type is None:
            gene_type = self.globaltt['gene']
        self.model.addClassToGraph(gene_id, gene_label, gene_type,
                                   gene_description)

    def addConstruct(self,
                     construct_id,
                     construct_label,
                     construct_type=None,
                     construct_description=None):
        """
        Add a construct as an individual.
        No default type is applied; construct_type is passed on as given.
        """
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    constrcut_type=self.construct_base_type
        self.model.addIndividualToGraph(construct_id, construct_label,
                                        construct_type, construct_description)

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and its parent genotype.  Adding the parent and child to
        the graph should happen outside of this function call to ensure graph
        integrity.
        :param child_id:
        :param parent_id:
        :return:

        """

        self.graph.addTriple(child_id, self.globaltt['derives_from'],
                             parent_id)

    def addSequenceDerivesFrom(self, child_id, parent_id):
        """
        Add a sequence_derives_from relationship
        between the child and parent id.
        """
        self.graph.addTriple(child_id, self.globaltt['sequence_derives_from'],
                             parent_id)

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:is_allele_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt["is_allele_of"]
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:has_affected_feature.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt['has_affected_feature']
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addGeneProduct(self,
                       sequence_id,
                       product_id,
                       product_label=None,
                       product_type=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein.
        The product itself is only added as an individual
        when both its label and its type are supplied.
        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :return:

        """
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(product_id, product_label,
                                            product_type)
        self.graph.addTriple(sequence_id, self.globaltt['has gene product'],
                             product_id)

    def addPolypeptide(self,
                       polypeptide_id,
                       polypeptide_label=None,
                       transcript_id=None,
                       polypeptide_type=None):
        """
        Add a polypeptide as an individual and, when a transcript is given,
        a translates_to relationship from the transcript to the polypeptide.
        :param polypeptide_id:
        :param polypeptide_label:
        :param transcript_id:
        :param polypeptide_type: defaults to the global term 'polypeptide'
        :return:

        """
        if polypeptide_type is None:
            polypeptide_type = self.globaltt['polypeptide']
        self.model.addIndividualToGraph(polypeptide_id, polypeptide_label,
                                        polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(transcript_id, self.globaltt['translates_to'],
                                 polypeptide_id)

    def addPartsToVSLC(self,
                       vslc_id,
                       allele1_id,
                       allele2_id,
                       zygosity_id=None,
                       allele1_rel=None,
                       allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While alleles
        (reference or variant loci) are traditionally added, you can add any
        node (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id: when not supplied it is taken to be homozygous
        if the two allele ids are equal, and heterozygous otherwise
        :param allele1_rel:
        :param allele2_rel:
        :return:

        """

        # vslc has parts allele1/allele2

        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.globaltt['homozygous']
            else:
                zygosity_id = self.globaltt['heterozygous']

        # still guarded, in case the translation table maps to None
        if zygosity_id is not None:
            self.graph.addTriple(vslc_id, self.globaltt['has_zygosity'],
                                 zygosity_id)

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:
        """

        self.addParts(vslc_id, parent_id, self.globaltt['has_variant_part'])

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:
        :raises TypeError: if either part_id or parent_id is None

        """
        # Fail loudly if parent or child identifiers are None
        if parent_id is None:
            raise TypeError('Attempt to pass None as parent')
        if part_id is None:
            raise TypeError('Attempt to pass None as child')
        if part_relationship is None:
            part_relationship = self.globaltt['has_part']

        self.graph.addTriple(parent_id, part_relationship, part_id)

    def addSequenceAlteration(self,
                              sa_id,
                              sa_label,
                              sa_type=None,
                              sa_description=None):
        """
        Add a sequence alteration as an individual;
        sa_type defaults to the global term 'sequence_alteration'.
        """

        if sa_type is None:
            sa_type = self.globaltt['sequence_alteration']

        self.model.addIndividualToGraph(sa_id, sa_label, sa_type,
                                        sa_description)

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        """
        Add the sequence alteration as a has_variant_part
        of the variant locus.
        """
        self.addParts(sa_id, vl_id, self.globaltt['has_variant_part'])

    def addGenomicBackground(self,
                             background_id,
                             background_label,
                             background_type=None,
                             background_description=None):
        """
        Add a genomic background as an individual;
        background_type defaults to the global term 'genomic_background'.
        """
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addIndividualToGraph(background_id, background_label,
                                        background_type,
                                        background_description)

    def addGenomicBackgroundToGenotype(self,
                                       background_id,
                                       genotype_id,
                                       background_type=None):
        """
        Type the background (by default as 'genomic_background')
        and add it as a has_reference_part of the genotype.
        """
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addType(background_id, background_type)
        self.addParts(background_id, genotype_id,
                      self.globaltt['has_reference_part'])

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:

        :return:

        """
        self.graph.addTriple(genopart_id, self.globaltt['in taxon'], taxon_id)

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Add the reagent as a has_variant_part of the genotype.
        """
        # for example, add a morphant reagent thingy to the genotype,
        # assuming it's a extrinsic_genotype
        self.graph.addTriple(genotype_id, self.globaltt['has_variant_part'],
                             reagent_id)

    def addGeneTargetingReagent(self,
                                reagent_id,
                                reagent_label,
                                reagent_type,
                                gene_id,
                                description=None):
        """
        Here, a gene-targeting reagent is added,
        together with a targets_gene relationship to gene_id.
        The actual targets of this reagent should be added separately.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:
        :param gene_id: the gene the reagent targets
        :param description:

        :return:

        """

        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(reagent_id, reagent_label,
                                        reagent_type, description)

        self.graph.addTriple(reagent_id, self.globaltt['targets_gene'],
                             gene_id)

    def addReagentTargetedGene(self,
                               reagent_id,
                               gene_id,
                               targeted_gene_id=None,
                               targeted_gene_label=None,
                               description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied,
        we will create it as an anonymous individual which is of the type
        GENO:reagent_targeted_gene.
        We will also add the targets relationship between the reagent and
        gene class.

        <targeted_gene_id> a GENO:reagent_targeted_gene
        rdf:label targeted_gene_label
        dc:description description
        <reagent_id> GENO:targets_gene <gene_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id: when omitted it is built from the
        gene and reagent ids, so both have to be strings in that case
        :param targeted_gene_label:
        :param description:
        :return:

        """

        # akin to a variant locus
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
            targeted_gene_id = targeted_gene_id.replace(":", "")
        self.model.addIndividualToGraph(targeted_gene_id, targeted_gene_label,
                                        self.globaltt['reagent_targeted_gene'],
                                        description)

        if gene_id is not None:
            self.graph.addTriple(targeted_gene_id,
                                 self.globaltt['is_expression_variant_of'],
                                 gene_id)

        self.graph.addTriple(targeted_gene_id, self.globaltt['is_targeted_by'],
                             reagent_id)

    def addTargetedGeneSubregion(self,
                                 tgs_id,
                                 tgs_label,
                                 tgs_type=None,
                                 tgs_description=None):
        """
        Add a targeted gene subregion as an individual;
        tgs_type defaults to the global term 'targeted_gene_subregion'.
        """
        if tgs_type is None:
            tgs_type = self.globaltt['targeted_gene_subregion']

        self.model.addIndividualToGraph(tgs_id, tgs_label, tgs_type,
                                        tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Add a has_member_with_allelotype relationship
        from the population to the member.
        """
        self.graph.addTriple(population_id,
                             self.globaltt['has_member_with_allelotype'],
                             member_id)

    def addTargetedGeneComplement(self,
                                  tgc_id,
                                  tgc_label,
                                  tgc_type=None,
                                  tgc_description=None):
        """
        Add a targeted gene complement as an individual;
        tgc_type defaults to the global term 'targeted_gene_complement'.
        """
        if tgc_type is None:
            tgc_type = self.globaltt['targeted_gene_complement']
        self.model.addIndividualToGraph(tgc_id, tgc_label, tgc_type,
                                        tgc_description)

    def addGenome(self, taxon_num, taxon_label=None, genome_id=None):
        """
        Add a genome class for the taxon, labelled '<taxon label> genome'.

        :param taxon_num: the taxon number as a string,
        without the 'NCBITaxon:' prefix
        :param taxon_label: when omitted it is looked up in globaltcid,
        falling back (with a warning) to taxon_num itself
        :param genome_id: defaults to makeGenomeID(taxon_num)
        :return:

        """
        ncbitaxon = 'NCBITaxon:' + taxon_num
        if taxon_label is None:
            if ncbitaxon in self.globaltcid:
                taxon_label = self.globaltcid[ncbitaxon]
            else:
                LOG.warning('Add %s to global translation table', ncbitaxon)
                taxon_label = taxon_num
        elif ncbitaxon in self.globaltcid and taxon_label != self.globaltcid[
                ncbitaxon]:
            # the supplied label disagrees with the known one;
            # the supplied label is still the one that gets used
            known_label = self.globaltcid[ncbitaxon]
            LOG.warning(
                '"%s" may need updating from "%s" in global translation table',
                known_label, taxon_label)
            LOG.warning(
                '"%s": " %s" may need to be added to a local translation table',
                taxon_label, known_label)

        genome_label = taxon_label + ' genome'
        if genome_id is None:
            genome_id = self.makeGenomeID(taxon_num)
        self.model.addClassToGraph(genome_id, genome_label,
                                   self.globaltt['genome'])

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add a genome build as an individual of type reference_genome,
        also typed by the taxon's genome, and attach the taxon to it.

        :param build_id:
        :param build_label:
        :param taxon_id: when it starts with a digit
        it gets the 'NCBITaxon:' prefix before being used as the taxon
        :return:

        """
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addIndividualToGraph(build_id, build_label,
                                        self.globaltt['reference_genome'])
        self.model.addType(build_id, genome_id)
        # re.match only anchors at the start of the string
        if re.match(r'[0-9]+', taxon_id):
            taxon_id = 'NCBITaxon:' + taxon_id
        self.addTaxon(taxon_id, build_id)

    @staticmethod
    def makeGenomeID(taxon_id):
        """
        Make the blank-node style genome id '_:<taxon_id>genome'.
        """
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as yet another BNODE?
        # should never be called if a real genome iri exists
        # should create the opaque bode and label together
        # genome_id = re.sub(r'.*\:', '_:', taxon_id) + 'genome'
        genome_id = '_:' + taxon_id + 'genome'
        return genome_id

    def addChromosome(self,
                      chrom,
                      tax_id,
                      tax_label=None,
                      build_id=None,
                      build_label=None):
        """
        if it's just the chromosome, add it as an instance of a SO:chromosome,
        and add it to the genome. If a build is included,
        punn the chromosome as a subclass of SO:chromsome, and make the
        build-specific chromosome an instance of the supplied chr.
        The chr then becomes part of the build or genome.
        """
        family = Family(self.graph)
        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chrom), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chrom, tax_label)
        else:
            chr_label = makeChromLabel(chrom)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(chr_id, chr_label,
                                   self.globaltt['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chrom, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chrom, build_label)
            # add the build-specific chromosome as an instance of the chr class

            self.model.addIndividualToGraph(chrinbuild_id, chrinbuild_label,
                                            chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            family.addMember(build_id, chrinbuild_id)
            family.addMemberOf(chrinbuild_id, build_id)

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic (taxon level) chromosome class.
        Any 'NCBITaxon:' text is removed from taxon_id before making the id.
        """
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(chrom_class_id, chrom_class_label,
                                   self.globaltt['chromosome'])

    def addChromosomeInstance(self,
                              chr_num,
                              reference_id,
                              reference_label,
                              chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
        typically a genome-specific chr

        :return:

        """
        family = Family(self.graph)
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.model.addIndividualToGraph(chr_id, chr_label,
                                        self.globaltt['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        family.addMember(reference_id, chr_id)
        family.addMemberOf(chr_id,
                           reference_id)  # usage dependent, todo: ommit

    @staticmethod
    def make_variant_locus_label(gene_label, allele_label):
        """
        Make a label of the form 'gene<allele>' from the stripped labels.
        A label that is None is treated as an empty string.
        :param gene_label:
        :param allele_label:
        :return: the variant locus label
        """
        if gene_label is None:
            gene_label = ''
        if allele_label is None:
            # make_vslc_label can get here with only one of the alleles known
            allele_label = ''

        return gene_label.strip() + '<' + allele_label.strip() + '>'

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.
        The two variant locus labels are joined with '/';
        the part after the '/' is empty when there is no second allele.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return: the label, or None if all three labels are None
        """

        if gene_label is None and allele1_label is None and allele2_label is None:
            LOG.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        return '/'.join((top, bottom))

    def make_experimental_model_with_genotype(self, genotype_id,
                                              genotype_label, taxon_id,
                                              taxon_label):
        """
        Add an individual of type taxon_id that has_genotype genotype_id.

        Its id is '_:' followed by '<taxon_id>-with-<genotype_id>' with the
        colons removed, and its label is '<genotype_label> <taxon_label>'.
        :return: the id of the new individual
        """

        animal_id = '-'.join((taxon_id, 'with', genotype_id))
        animal_id = re.sub(r':', '', animal_id)
        animal_id = '_:' + animal_id

        animal_label = ' '.join((genotype_label, taxon_label))
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)
        self.graph.addTriple(animal_id, self.globaltt['has_genotype'],
                             genotype_id)
        return animal_id
Ejemplo n.º 11
0
class Dataset:
    """
     this will produce the metadata about a dataset
     following the example laid out here:
     http://htmlpreview.github.io/?
     https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
     (mind the wrap)

    """

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None, graph_type=None,
                 file_handle=None):
        """
        :param identifier: the dataset is identified as ':' + identifier
        :param title:
        :param url: added as the dataset's foaf:page
        :param description:
        :param license_url:
        :param data_rights:
        :param graph_type: None, 'streamed_graph' or 'rdf_graph'
        :param file_handle: passed to the streamed graph
        :raises ValueError: if graph_type is not one of the known values
        """
        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        else:
            # without a graph nothing below can work; say so plainly
            raise ValueError("Unknown graph_type: {}".format(graph_type))
        self.model = Model(self.graph)
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None

        # The date_accessed value is later used as a literal of properties
        # such as dct:issued, which needs to conform to xsd:dateTime format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(
            self.identifier, 'dct:identifier',
            identifier, object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(
                self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(
                self.identifier, 'dct:rights',
                data_rights, object_is_literal=True)
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.model.addDescription(self.identifier, description)

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        The issued date is recorded if given.  The version is set from
        version_id if given, otherwise from the date.
        With neither, an error is logged and nothing is set.

        :param date_issued:
        :param version_id:
        :return:

        """

        if date_issued is None and version_id is None:
            logger.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # set the version exactly once
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)

    def set_date_issued(self, date_issued):
        """
        Remember the issued date and add it as a literal dct:issued.
        """

        self.date_issued = date_issued
        self.graph.addTriple(
            self.identifier, 'dct:issued', date_issued, object_is_literal=True)
        logger.info("setting date to %s", date_issued)

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return:
        """

        if date_issued is not None:
            d = date_issued
        elif self.date_issued is not None:
            d = self.date_issued
        else:
            d = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(d)

    def set_version_by_num(self, version_num):
        """
        Set the version to the dataset identifier followed by version_num,
        and add it as a version of the dataset (dct:isVersionOf) with
        version_num as its pav:version.

        Unless version_num is the access timestamp itself, a further node
        ':' + date_accessed is added as a version of that version,
        with the access timestamp as its pav:version and dct:issued.

        :param version_num: a string
        :return:
        """

        self.version = self.identifier+version_num
        self.graph.addTriple(self.version, 'dct:isVersionOf', self.identifier)
        self.graph.addTriple(self.version, 'pav:version', version_num,
                             object_is_literal=True)

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(
                dipperized_version, 'dct:isVersionOf',
                self.version)
            self.graph.addTriple(
                dipperized_version, 'pav:version',
                self.date_accessed, object_is_literal=True)
            self.graph.addTriple(
                dipperized_version, 'dct:issued',
                self.date_accessed, object_is_literal=True,
                literal_type="xsd:dateTime")

    def setFileAccessUrl(self, url, is_object_literal=False):
        """
        Add url as the dcat:accessURL of the dataset.
        """
        self.graph.addTriple(self.identifier, 'dcat:accessURL',
                             url, is_object_literal)

    def getGraph(self):
        return self.graph

    def set_license(self, license):
        """
        Only updates the stored value; no triple is added here.
        """
        self.license = license

    def get_license(self):

        return self.license

    def set_citation(self, citation_id):
        """
        Add citation_id to the set of citations kept on this object.
        """

        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
Ejemplo n.º 12
0
class Genotype():
    """
    These methods provide convenient methods to
    add items related to a genotype and its parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.

    """

    # special genotype parts mapped to their
    # GENO and SO classes that we explicitly reference here
    genoparts = {
        'intrinsic_genotype': 'GENO:0000000',
        'extrinsic_genotype': 'GENO:0000524',
        'effective_genotype': 'GENO:0000525',
        'sex_qualified_genotype': 'GENO:0000645',
        'male_genotype': 'GENO:0000646',
        'female_genotype': 'GENO:0000647',
        'genomic_background': 'GENO:0000611',
        'unspecified_genomic_background': 'GENO:0000649',
        'genomic_variation_complement': 'GENO:0000009',
        'karyotype_variation_complement': 'GENO:0000644',
        'variant_single_locus_complement': 'GENO:0000030',
        'variant_locus': 'GENO:0000002',
        'reference_locus': 'GENO:0000036',
        'allele': 'GENO:0000512',
        'gene': 'SO:0000704',
        'QTL': 'SO:0000771',
        'transgene': 'SO:0000902',  # not really used any more
        'transgenic_insertion': 'SO:0001218',
        'pseudogene': 'SO:0000336',
        'cytogenetic marker': 'SO:0000341',
        'sequence_feature': 'SO:0000110',
        'sequence_alteration': 'SO:0001059',
        'insertion': 'SO:0000667',
        'deletion': 'SO:0000159',
        'substitution': 'SO:1000002',
        'duplication': 'SO:1000035',
        'translocation': 'SO:0000199',
        'inversion': 'SO:1000036',
        'tandem_duplication': 'SO:1000173',
        'point_mutation': 'SO:1000008',
        'population': 'PCO:0000001',  # population
        'family': 'PCO:0000020',  # family
        'wildtype': 'GENO:0000511',
        'reagent_targeted_gene': 'GENO:0000504',
        'targeted_gene_subregion': 'GENO:0000534',
        'targeted_gene_complement': 'GENO:0000527',
        'biological_region': 'SO:0001411',
        'missense_variant': 'SO:0001583',
        'transcript': 'SO:0000233',
        'polypeptide': 'SO:0000104',
        'cDNA': 'SO:0000756',
        'sequence_variant_causing_loss_of_function_of_polypeptide':
            'SO:1000118',
        'sequence_variant_causing_gain_of_function_of_polypeptide':
            'SO:1000125',
        'sequence_variant_causing_inactive_catalytic_site': 'SO:1000120',
        'sequence_variant_affecting_polypeptide_function': 'SO:1000117',
        'regulatory_transgene_feature': 'GENO:0000637',
        'coding_transgene_feature': 'GENO:0000638',
        'protein_coding_gene': 'SO:0001217',
        'ncRNA_gene': 'SO:0001263',
        'RNAi_reagent': 'SO:0000337',
        'heritable_phenotypic_marker': 'SO:0001500'
    }

    object_properties = {
        'is_mutant_of': 'GENO:0000440',
        'derives_from': 'RO:0001000',
        'has_alternate_part': 'GENO:0000382',
        'has_reference_part': 'GENO:0000385',
        'has_sex_agnostic_genotype_part': 'GENO:0000650',
        'in_taxon': 'RO:0002162',
        'has_zygosity': 'GENO:0000608',
        # is_seq_var_inst_of links a alternate locus (instance)
        # to a gene (class)
        'is_sequence_variant_instance_of': 'GENO:0000408',
        'targets_instance_of': 'GENO:0000414',
        'is_reference_instance_of': 'GENO:0000610',
        'has_part': 'BFO:0000051',
        # use has_member_with_allelotype when relating populations
        'has_member_with_allelotype': 'GENO:0000225',
        'is_allelotype_of': 'GENO:0000206',
        'has_genotype': 'GENO:0000222',
        'has_phenotype': 'RO:0002200',
        'has_gene_product': 'RO:0002205',
        'translates_to': 'RO:0002513',
        'is_targeted_expression_variant_of': 'GENO:0000443',
        'is_transgene_variant_of': 'GENO:0000444',
        'has_variant_part': 'GENO:0000382',
        # targeted_by isa between a (reagent-targeted gene) and a morpholino
        'targeted_by': 'GENO:0000634',
        # FIXME should derives_sequence_from_gene just be subsequence of?
        'derives_sequence_from_gene': 'GENO:0000639',
        'has_affected_locus': 'GENO:0000418'
    }

    annotation_properties = {
        # TODO change properties with
        # https://github.com/monarch-initiative/GENO-ontology/issues/21
        # FIXME
        # reference_nucleotide, reference_amino_acid, altered_nucleotide
        # results_in_amino_acid_change are FIXME Made up terms
        'reference_nucleotide': 'GENO:reference_nucleotide',
        'reference_amino_acid': 'GENO:reference_amino_acid',
        'altered_nucleotide': 'GENO:altered_nucleotide',
        'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change'
    }

    zygosity = {
        'homoplasmic': 'GENO:0000602',
        'heterozygous': 'GENO:0000135',
        'indeterminate': 'GENO:0000137',
        'heteroplasmic': 'GENO:0000603',
        'hemizygous-y': 'GENO:0000604',
        'hemizygous-x': 'GENO:0000605',
        'homozygous': 'GENO:0000136',
        'hemizygous': 'GENO:0000606',
        'complex_heterozygous': 'GENO:0000402',
        'simple_heterozygous': 'GENO:0000458'
    }

    # object and annotation properties merged into a single lookup table
    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        """
        :param graph: the Graph instance that all triples are written to
        :raises ValueError: if the supplied object is not a Graph

        """
        if not isinstance(graph, Graph):
            # str.format is required here; looking up ".graph" on the
            # message string raised AttributeError instead of ValueError
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)

    def addGenotype(
            self, genotype_id, genotype_label, genotype_type=None,
            genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:

        """

        if genotype_type is None:
            genotype_type = self.genoparts['intrinsic_genotype']

        self.model.addIndividualToGraph(
            genotype_id, genotype_label, genotype_type, genotype_description)

    def addAllele(
            self, allele_id, allele_label, allele_type=None,
            allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional,
        recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:

        """

        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.genoparts['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(
            allele_id, allele_label, allele_type, allele_description)

    def addGene(
            self, gene_id, gene_label, gene_type=None,
            gene_description=None):
        """
        Add a gene to the graph as a class (not an individual).
        If no gene_type is supplied, it defaults to genoparts['gene'].
        :param gene_id:
        :param gene_label:
        :param gene_type:
        :param gene_description:
        :return:

        """
        if gene_type is None:
            gene_type = self.genoparts['gene']
        # genes are classes
        self.model.addClassToGraph(
            gene_id, gene_label, gene_type, gene_description)

    def addConstruct(self, construct_id, construct_label, construct_type=None,
                     construct_description=None):
        """
        Add a construct to the graph as an individual.
        No default type is applied; construct_type is passed on as given.
        :param construct_id:
        :param construct_label:
        :param construct_type:
        :param construct_description:
        :return:

        """
        # TODO add base type for construct
        self.model.addIndividualToGraph(construct_id, construct_label,
                                        construct_type, construct_description)

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and its parent genotype.  Adding the parent and child to
        the graph should happen outside of this function call to ensure graph
        integrity.
        :param child_id:
        :param parent_id:
        :return:

        """

        self.graph.addTriple(
            child_id, self.properties['derives_from'], parent_id)

    def addSequenceDerivesFrom(self, child_id, parent_id):
        """
        Add a derives_sequence_from_gene relationship
        from the child to the parent.
        :param child_id:
        :param parent_id:
        :return:

        """
        self.graph.addTriple(
            child_id, self.properties['derives_sequence_from_gene'], parent_id)

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:is_sequence_variant_instance_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.properties['is_sequence_variant_instance_of']
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        Relate an allele to the locus (gene) it affects.
        If the relationship is not provided,
        it defaults to GENO:has_affected_locus.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.properties['has_affected_locus']
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addGeneProduct(
            self, sequence_id, product_id,
            product_label=None, product_type=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein.
        The product itself is only added as an individual when both
        a label and a type are supplied.
        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :return:

        """
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(
                product_id, product_label, product_type)
        self.graph.addTriple(
            sequence_id, self.properties['has_gene_product'], product_id)

    def addPolypeptide(
            self, polypeptide_id, polypeptide_label=None,
            transcript_id=None, polypeptide_type=None, ):
        """
        Add a polypeptide as an individual; the type defaults to
        genoparts['polypeptide'].  If a transcript is supplied,
        a translates_to relationship from the transcript
        to the polypeptide is added as well.
        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:

        """

        if polypeptide_type is None:
            polypeptide_type = self.genoparts['polypeptide']
        self.model.addIndividualToGraph(
            polypeptide_id, polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(
                transcript_id, self.properties['translates_to'],
                polypeptide_id)

    def addPartsToVSLC(
            self, vslc_id, allele1_id, allele2_id, zygosity_id=None,
            allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While alleles
        (reference or variant loci) are traditionally added, you can add any
        node (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.

        If no zygosity is supplied, it is set to homozygous when the two
        allele ids are equal and to heterozygous otherwise.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:

        """

        # vslc has parts allele1/allele2
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.zygosity['homozygous']
            else:
                zygosity_id = self.zygosity['heterozygous']

        # zygosity_id is always set by this point
        self.graph.addTriple(
            vslc_id, self.properties['has_zygosity'], zygosity_id)

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:
        """

        self.addParts(
            vslc_id, parent_id, self.properties['has_alternate_part'])

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:

        """

        if part_relationship is None:
            part_relationship = self.properties['has_part']

        self.graph.addTriple(parent_id, part_relationship, part_id)

    def addSequenceAlteration(
            self, sa_id, sa_label, sa_type=None, sa_description=None):
        """
        Add a sequence alteration as an individual;
        the type defaults to genoparts['sequence_alteration'].
        :param sa_id:
        :param sa_label:
        :param sa_type:
        :param sa_description:
        :return:

        """
        if sa_type is None:
            sa_type = self.genoparts['sequence_alteration']
        self.model.addIndividualToGraph(
            sa_id, sa_label, sa_type, sa_description)

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        """
        Add the sequence alteration as a has_alternate_part
        of the variant locus.
        :param sa_id:
        :param vl_id:
        :return:

        """
        self.addParts(sa_id, vl_id, self.properties['has_alternate_part'])

    def addGenomicBackground(
            self, background_id, background_label,
            background_type=None, background_description=None):
        """
        Add a genomic background as an individual;
        the type defaults to genoparts['genomic_background'].
        :param background_id:
        :param background_label:
        :param background_type:
        :param background_description:
        :return:

        """
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.model.addIndividualToGraph(
            background_id, background_label, background_type,
            background_description)

    def addGenomicBackgroundToGenotype(
            self, background_id, genotype_id, background_type=None):
        """
        Type the background (default genoparts['genomic_background'])
        and add it as a has_reference_part of the genotype.
        :param background_id:
        :param genotype_id:
        :param background_type:
        :return:

        """
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.model.addType(background_id, background_type)
        self.addParts(background_id, genotype_id,
                      self.object_properties['has_reference_part'])

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:

        :return:

        """
        self.graph.addTriple(
            genopart_id, self.properties['in_taxon'], taxon_id)

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Add the reagent as a has_variant_part of the genotype.
        For example, add a morphant reagent to the genotype,
        assuming it's an extrinsic_genotype.
        :param reagent_id:
        :param genotype_id:
        :return:

        """
        self.graph.addTriple(
            genotype_id, self.properties['has_variant_part'], reagent_id)

    def addGeneTargetingReagent(
            self, reagent_id, reagent_label, reagent_type, gene_id,
            description=None):
        """
        Here, a gene-targeting reagent is added as an individual,
        together with a targets_instance_of relationship
        from the reagent to the supplied gene.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:
        :param gene_id: the gene targeted by the reagent
        :param description:

        :return:

        """

        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(
            reagent_id, reagent_label, reagent_type, description)

        self.graph.addTriple(
            reagent_id, self.object_properties['targets_instance_of'], gene_id)

    def addReagentTargetedGene(
            self, reagent_id, gene_id, targeted_gene_id=None,
            targeted_gene_label=None, description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied, one is built from the gene and
        reagent ids ('_' + gene_id + '-' + reagent_id, with colons removed);
        the instance is of the type GENO:reagent_targeted_gene.

        <targeted_gene_id> a GENO:reagent_targeted_gene
        rdf:label targeted_gene_label
        dc:description description
        <targeted_gene_id> GENO:is_targeted_expression_variant_of <gene_id>
        <targeted_gene_id> GENO:targeted_by <reagent_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :param targeted_gene_label:
        :param description:
        :return:

        """

        # akin to a variant locus
        if targeted_gene_id is None:
            # NOTE(review): this concatenation needs gene_id to be a string,
            # although gene_id is checked against None further down
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
            targeted_gene_id = targeted_gene_id.replace(":", "")
        self.model.addIndividualToGraph(
            targeted_gene_id, targeted_gene_label,
            self.genoparts['reagent_targeted_gene'], description)

        if gene_id is not None:
            self.graph.addTriple(
                targeted_gene_id,
                self.object_properties['is_targeted_expression_variant_of'],
                gene_id)

        self.graph.addTriple(
            targeted_gene_id, self.properties['targeted_by'], reagent_id)

    def addTargetedGeneSubregion(
            self, tgs_id, tgs_label, tgs_type=None, tgs_description=None):
        """
        Add a targeted gene subregion as an individual;
        the type defaults to genoparts['targeted_gene_subregion'].
        :param tgs_id:
        :param tgs_label:
        :param tgs_type:
        :param tgs_description:
        :return:

        """
        if tgs_type is None:
            tgs_type = self.genoparts['targeted_gene_subregion']

        self.model.addIndividualToGraph(
            tgs_id, tgs_label, tgs_type, tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Add a has_member_with_allelotype relationship
        from the population to the member.
        :param member_id:
        :param population_id:
        :return:

        """
        self.graph.addTriple(
            population_id,
            self.properties['has_member_with_allelotype'],
            member_id)

    def addTargetedGeneComplement(
            self, tgc_id, tgc_label, tgc_type=None, tgc_description=None):
        """
        Add a targeted gene complement as an individual;
        the type defaults to genoparts['targeted_gene_complement'].
        :param tgc_id:
        :param tgc_label:
        :param tgc_type:
        :param tgc_description:
        :return:

        """
        if tgc_type is None:
            tgc_type = self.genoparts['targeted_gene_complement']
        self.model.addIndividualToGraph(
            tgc_id, tgc_label, tgc_type, tgc_description)

    def addGenome(self, taxon_id, taxon_label=None):
        """
        Add the genome of a taxon to the graph as a class.
        The genome label is the taxon label (or the taxon id when
        no label is given) followed by ' genome'.
        :param taxon_id:
        :param taxon_label:
        :return:

        """
        if taxon_label is None:
            taxon_label = taxon_id
        genome_label = taxon_label+' genome'
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addClassToGraph(
            genome_id, genome_label, Feature.types['genome'])

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add a genome build as an individual reference genome,
        typed additionally with the taxon's genome id,
        and associate it with the taxon.
        :param build_id:
        :param build_label:
        :param taxon_id:
        :return:

        """
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addIndividualToGraph(
            build_id, build_label, Feature.types['reference_genome'])
        self.model.addType(build_id, genome_id)
        self.addTaxon(taxon_id, build_id)

    def makeGenomeID(self, taxon_id):
        """
        Build a genome identifier from a taxon curie by replacing everything
        up to and including the last colon with ':' and appending 'genome'.
        :param taxon_id:
        :return: the genome id

        """
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as BNODE?

        genome_id = re.sub(r'.*\:', ':', taxon_id) + 'genome'

        return genome_id

    def addChromosome(
            self, chr, tax_id, tax_label=None, build_id=None,
            build_label=None):
        """
        Add the chromosome as a class at the taxon level and
        associate the taxon with its genome id.
        If a build is included, the build-specific chromosome is added
        as an instance of that chromosome class,
        and is linked to the build as a member (both ways).
        :param chr: the chromosome name/number
        :param tax_id:
        :param tax_label:
        :param build_id:
        :param build_label: defaults to build_id when not supplied
        :return:

        """
        # Family needs the graph it writes to, as in addChromosomeInstance
        family = Family(self.graph)
        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chr), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chr, tax_label)
        else:
            chr_label = makeChromLabel(chr)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(
            chr_id, chr_label, Feature.types['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chr, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chr, build_label)
            # add the build-specific chromosome as an instance of the chr class

            self.model.addIndividualToGraph(
                chrinbuild_id, chrinbuild_label, chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            family.addMember(build_id, chrinbuild_id)
            family.addMemberOf(chrinbuild_id, build_id)

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic (taxon-level) chromosome class to the graph.
        The 'NCBITaxon:' prefix is removed from the taxon id
        before the chromosome id is made.
        :param chrom_num:
        :param taxon_id:
        :param taxon_label:
        :return:

        """
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(
            chrom_class_id, chrom_class_label, Feature.types['chromosome'])

    def addChromosomeInstance(
            self, chr_num, reference_id, reference_label, chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
        typically a genome-specific chr

        :return:

        """
        family = Family(self.graph)
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.model.addIndividualToGraph(
            chr_id, chr_label, Feature.types['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        family.addMember(reference_id, chr_id)
        family.addMemberOf(chr_id, reference_id)

    def make_variant_locus_label(self, gene_label, allele_label):
        """
        Make a label of the form gene<allele>, stripping surrounding
        whitespace from both parts.  A missing (None) gene or allele label
        is treated as an empty string.
        :param gene_label:
        :param allele_label:
        :return: the variant locus label

        """
        if gene_label is None:
            gene_label = ''
        if allele_label is None:
            # make_vslc_label may pass a missing allele label through
            allele_label = ''
        label = gene_label.strip()+'<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style:
        the variant locus label of allele1, a '/', and the variant locus
        label of allele2 (empty when allele2_label is None).
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return: the label, or None if all three inputs are None
        """

        if gene_label is None and \
                allele1_label is None and allele2_label is None:
            logger.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        return '/'.join((top, bottom))

    def make_experimental_model_with_genotype(
             self, genotype_id, genotype_label, taxon_id, taxon_label):
        """
        Add an individual, typed with the taxon, that has the given genotype.
        Its id is '_:' followed by '<taxon_id>-with-<genotype_id>'
        with the colons removed; its label is the genotype label
        followed by the taxon label.
        :param genotype_id:
        :param genotype_label:
        :param taxon_id:
        :param taxon_label:
        :return: the id of the individual that was added

        """

        animal_id = '-'.join((taxon_id, 'with', genotype_id))
        animal_id = re.sub(r':', '', animal_id)
        animal_id = '_:'+animal_id

        animal_label = ' '.join((genotype_label, taxon_label))
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)
        self.graph.addTriple(
            animal_id, Genotype.object_properties['has_genotype'], genotype_id)
        return animal_id
Ejemplo n.º 13
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """
    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph instance that all triples are written to
        :param ref_id: identifier of the reference; when it begins with
        'http' it is also kept as the reference's url
        :param ref_type: type of the reference;
        defaults to the graph's globaltt['document']
        :raises ValueError: if the supplied graph is not a Graph

        """
        if not isinstance(graph, Graph):
            # str.format is required here; looking up ".graph" on the
            # message string raised AttributeError instead of ValueError
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type

        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id

    def setTitle(self, title):
        """
        :param title: the title of the reference
        :return:
        """
        self.title = title

    def setYear(self, year):
        """
        :param year: the year of the reference
        :return:
        """
        self.year = year

    def setType(self, reference_type):
        """
        :param reference_type: replaces the type given at construction
        :return:
        """
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """

        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author to the author list, starting a new list
        if none has been set yet.
        :param author:
        :return:
        """
        if self.author_list is None:
            # author_list starts out as None; "None += [...]" is a TypeError
            self.author_list = []
        self.author_list += [author]

    def setShortCitation(self, citation):
        """
        :param citation: used as the label of the reference when set
        :return:
        """
        self.short_citation = citation

    def addPage(self, subject_id, page_url):
        """
        Add the page url to the subject as a literal,
        using the globaltt['page'] predicate.
        :param subject_id:
        :param page_url:
        :return:
        """
        self.graph.addTriple(subject_id,
                             self.globaltt['page'],
                             page_url,
                             object_is_literal=True)

    def addTitle(self, subject_id, title):
        """
        Add the title to the subject as a literal,
        using the globaltt['title (dce)'] predicate.
        :param subject_id:
        :param title:
        :return:
        """
        self.graph.addTriple(subject_id,
                             self.globaltt['title (dce)'],
                             title,
                             object_is_literal=True)

    def addRefToGraph(self):
        """
        Write this reference to the graph.
        The label is the short citation, or the title if there is none.
        A reference with a url is typed and labelled on that url;
        otherwise it is added as an individual on its ref_id.
        The title and label are only written when they have been set.
        An error is logged if the reference has neither url nor id.
        :return:
        """

        cite = self.short_citation
        if cite is None:
            cite = self.title

        if self.ref_url is not None:
            # guard against writing None, as the ref_id branch does for title
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date (self.year)
        # and the authors (self.author_list)?
Ejemplo n.º 14
0
    def _get_orthologs(self, src_key, limit):
        """
        This will process the specified pairwise orthology file,
        creating orthology associations based on the specified orthology code.
        Relationships are made between genes here.

        There is also a nominal amount of identifier re-formatting:
        MGI:MGI --> MGI
        Ensembl --> ENSEMBL

        we skip any genes where we don't know how to map the gene identifiers.
        For example, Gene:Huwe1 for RAT is not an identifier, so we skip any
        mappings to this identifier.  Often, there are two entries for the
        same gene (based on equivalent Uniprot id), and so we are not actually
        losing any information.

        When self.tax_ids is set, a relationship is kept if the taxon of
        at least one gene of the pair is found in self.tax_ids;
        it is skipped only when neither is.

        Genes are also added to a grouping class defined with a PANTHER id.

        Triples:
        <gene1_id> RO:orthologous <gene2_id>
        <assoc_id> :hasSubject <gene1_id>
        <assoc_id> :hasObject <gene2_id>
        <assoc_id> :hasPredicate <RO:orthologous>
        <assoc_id> dcterms:evidence ECO:phylogenetic_evidence

        <panther_id> rdf:type DATA:gene_family
        <panther_id> RO:has_member <gene1_id>
        <panther_id> RO:has_member <gene2_id>

        :param src_key: key into self.files; also used as the name of the
        member that is read from the gzipped tar archive
        :param limit: stop once more than this many rows have passed
        the taxon filter; None for no limit
        :return:

        """
        LOG.info("reading orthologs")

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        unprocessed_gene_ids = []

        src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
        matchcounter = 0

        # the column layout is the same for every row: look it up once
        col = self.files[src_key]['columns']
        gene_col = col.index('Gene')
        ortholog_col = col.index('Ortholog')
        type_col = col.index('Type of ortholog')
        panther_col = col.index('Panther Ortholog ID')
        # 'Common ancestor for the orthologs' (ancestor taxons) is unused

        gene_type = self.globaltt['gene']
        in_taxon = self.globaltt['in taxon']
        evidence_id = self.globaltt['phylogenetic evidence']

        LOG.info("Parsing %s", src_key)

        # both the archive and the extracted member are closed on exit
        with tarfile.open(src_file, 'r:gz') as reader, \
                reader.extractfile(src_key) as csvfile:
            # there are no comments or headers
            for line in csvfile:
                # parse each row, for example:
                # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                #   	MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                #       	LDO	Euarchontoglires	PTHR15964

                row = line.decode().split('\t')
                thing1 = row[gene_col].strip()
                thing2 = row[ortholog_col].strip()
                orthology_type = row[type_col].strip()
                panther_id = row[panther_col].strip()

                (species_a, gene_a, protein_a) = thing1.split('|')
                (species_b, gene_b, protein_b) = thing2.split('|')

                # for testing skip entries without homolog relationships to test ids
                # NOTE(review): for 'UniProtKB=Q0VD83' the slice [9:] yields
                # '=Q0VD83' (the '=' is kept) -- confirm that self.test_ids
                # holds ids in that form
                if self.test_mode and not (
                        protein_a[9:] in self.test_ids or
                        protein_b[9:] in self.test_ids):
                    continue

                # map the species abbreviations to ncbi taxon id numbers
                taxon_a = self.resolve(species_a).split(':')[1].strip()
                taxon_b = self.resolve(species_b).split(':')[1].strip()

                # ###
                # keep orthologous relationships to genes in the given tax_ids
                # this is the "some" filter: a row is dropped only when
                # NEITHER gene1 NOR gene2 is in the taxid list
                # (to require both, the two "not in" tests would be OR'd)
                if self.tax_ids is not None and (
                        taxon_a not in self.tax_ids) and (
                        taxon_b not in self.tax_ids):
                    continue

                matchcounter += 1
                if limit is not None and matchcounter > limit:
                    break
                # ### end code block for filtering on taxon

                # fix the gene identifiers
                gene_a = gene_a.replace('=', ':')
                gene_b = gene_b.replace('=', ':')

                clean_gene = self._clean_up_gene_id(gene_a, species_a)
                if clean_gene is None:
                    unprocessed_gene_ids.append(gene_a)
                    continue
                gene_a = clean_gene
                clean_gene = self._clean_up_gene_id(gene_b, species_b)
                if clean_gene is None:
                    unprocessed_gene_ids.append(gene_b)
                    continue
                gene_b = clean_gene

                rel = self.resolve(orthology_type)

                # add the association and relevant nodes to graph
                assoc = OrthologyAssoc(graph, self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence_id)

                # add genes to graph;  assume labels will be taken care of elsewhere
                model.addType(gene_a, gene_type)
                model.addType(gene_b, gene_type)

                # might as well add the taxon info for completeness
                graph.addTriple(gene_a, in_taxon, 'NCBITaxon:' + taxon_a)
                graph.addTriple(gene_b, in_taxon, 'NCBITaxon:' + taxon_b)

                assoc.add_association_to_graph(
                    blv.terms['GeneToGeneHomologyAssociation']
                )

                # note this is incomplete...
                # it won't construct the full family hierarchy,
                # just the top-grouping
                assoc.add_gene_family_to_graph('PANTHER:' + panther_id)

            LOG.info("finished processing %s", src_file)
            if unprocessed_gene_ids:
                LOG.warning(
                    "The following gene ids were unable to be processed: %s",
                    str(set(unprocessed_gene_ids)))
Ejemplo n.º 15
0
class Dataset:
    """
     This class produces metadata about a dataset that is compliant with the
     HCLS dataset specification:
     https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/#s4_4

     Summary level: The summary level provides a description of a dataset that is
     independent of a specific version or format. (e.g. the Monarch ingest of CTD)
     CURIE for this is [dataset_curie_prefix]:#[SOURCE IDENTIFIER]
     (dataset_curie_prefix defaults to MonarchArchive)

     Version level: The version level captures version-specific characteristics of a
     dataset. (e.g. the 01-02-2018 ingest of CTD)
     CURIE for this is
     [dataset_curie_prefix]:[DATA RELEASE VERSION]/#[SOURCE IDENTIFIER]

     Distribution level: The distribution level captures metadata about a specific form
     and version of a dataset (e.g. turtle file for 01-02-2018 ingest of CTD). There is
     a [distribution level resource] for each different downloadable file we emit,
     i.e. one for the TTL file, one for the ntriples file, etc.
     CURIE for this is like
     [dataset_curie_prefix]:[DATA RELEASE VERSION]/rdf/[SOURCE IDENTIFIER].ttl
     or
     [dataset_curie_prefix]:[DATA RELEASE VERSION]/rdf/[SOURCE IDENTIFIER].nt
     or
     [dataset_curie_prefix]:[DATA RELEASE VERSION]/rdf/[SOURCE IDENTIFIER].[whatever
     file format]

     We write out at least the following triples:

     SUMMARY LEVEL TRIPLES:
     [summary level resource] - rdf:type -> dctypes:Dataset
     [summary level resource] - dc:title -> title (literal)
     [summary level resource] - dc:description -> description (literal)
                                                (use docstring from Source class)
     [summary level resource] - dc:source -> [source web page, e.g. omim.org]
     [summary level resource] - schema:logo -> [source logo IRI]
                                                (only when a logo was supplied)
     [summary level resource] - dc:publisher -> monarchinitiative.org
        n.b: about summary level resource triples:
        -- HCLS spec says we "should" link to our logo and web page, but I'm not,
        because it would confuse the issue of whether we are pointing to our logo/page
        or the logo/page of the data source for this ingest. Same below for
        [version level resource] and [distribution level resource] - I'm not linking
        to our page/logo down there either.
        - spec says we "should" include summary level triples describing Update
        frequency and SPARQL endpoint but I'm omitting this for now, because these are
        not clearly defined at the moment

     VERSION LEVEL TRIPLES:
     [version level resource] - rdf:type -> dctypes:Dataset
     [version level resource] - dc:title -> version title (literal)
     [version level resource] - dc:description -> version description (literal)
     [version level resource] - dc:created -> ingest timestamp [ISO 8601 compliant]
     [version level resource] - pav:version -> ingest timestamp (same one above)
     [version level resource] - dc:creator -> monarchinitiative.org
     [version level resource] - dc:publisher -> monarchinitiative.org
     [version level resource] - dc:isVersionOf -> [summary level resource]
     [version level resource] - dc:source -> [source file 1 IRI]
     [version level resource] - dc:source -> [source file 2 IRI]
     ...

     [source file 1 IRI] - pav:retrievedOn -> [download date timestamp]
     [source file 1 IRI] - pav:version -> [source version (if set, optional)]
     [source file 2 IRI] - pav:retrievedOn -> [download date timestamp]
     [source file 2 IRI] - pav:version -> [source version (if set, optional)]
     ...

     [version level resource] - pav:createdWith -> [Dipper github URI]
     [version level resource] - void:dataset -> [distribution level resource]

     [version level resource] - cito:citesAsAuthority -> [citation id 1]
     [version level resource] - cito:citesAsAuthority -> [citation id 2]
     [version level resource] - cito:citesAsAuthority -> [citation id 3]

        n.b: about version level resource triples:
        - spec says we "should" include Date of issue/dc:issued triple, but I'm not
        because it is redundant with this triple above:
        [version level resource] - dc:created -> time stamp
        and would introduce ambiguity and confusion if the two disagree. Same below
        for [distribution level resource] - dc:created -> time stamp below
        Also omitting:
          - triples linking to our logo and page, see above.
          - License/dc:license triple, because we will make this triple via the
            [distribution level resource] below
          - Language/dc:language triple b/c it seems superfluous. Same below for
            [distribution level resource] - no language triple.
        - [version level resource] - pav:version triple is also a bit redundant
        with the pav:version triple below, but the spec requires both these triples
        - I'm omitting the [version level resource] -> pav:previousVersion because
        Dipper doesn't know this info for certain at run time. Same below for
        [distribution level resource] - pav:previousVersion.


     DISTRIBUTION LEVEL TRIPLES:
     [distribution level resource] - rdf:type -> dctypes:Dataset
     [distribution level resource] - rdf:type -> dcat:Distribution
     [distribution level resource] - dc:title -> distribution title (literal)
     [distribution level resource] - dc:description -> distribution description (lit.)
     [distribution level resource] - dc:created -> ingest timestamp[ISO 8601 compliant]
     [distribution level resource] - pav:version -> ingest timestamp (same as above)
     [distribution level resource] - dc:creator -> monarchinitiative.org
     [distribution level resource] - dc:publisher -> monarchinitiative.org
     [distribution level resource] - dc:license -> [license info, if available
                    otherwise indicate unknown]
     [distribution level resource] - dc:rights -> [data rights IRI]
     [distribution level resource] - pav:createdWith -> [Dipper github URI]
     [distribution level resource] - dc:format -> [IRI of ttl|nt|whatever spec]
     [distribution level resource] - dcat:downloadURL -> [ttl|nt URI]
     [distribution level resource] - void:triples -> [triples count (literal)]
     [distribution level resource] - void:entities -> [entities count (literal)]
     [distribution level resource] - void:distinctSubjects -> [subject count (literal)]
     [distribution level resource] - void:distinctObjects -> [object count (literal)]
     [distribution level resource] - void:properties -> [properties count (literal)]
     ...

        n.b: about distribution level resource triples:
        - omitting Vocabularies used/void:vocabulary and Standards
        used/dc:conformTo triples, because they are described in the ttl file
        - also omitting Example identifier/idot:exampleIdentifier and
        Example resource/void:exampleResource, because we don't really have one
        canonical example of either - they're all very different.
        - [distribution level resource] - dc:created should have the exact same
        time stamp as this triple above:
        [version level resource] - dc:created -> time stamp
        - this [distribution level resource] - pav:version triple should have the
        same object as [version level resource] - pav:version triple above
        - Data source provenance/dc:source triples are above in the
        [version level resource]
        - omitting Byte size/dc:byteSize, RDF File URL/void:dataDump, and
        Linkset/void:subset triples because they probably aren't necessary for MI right
        now
        - these triples "should" be emitted, but we will do this in a later iteration:
        # of classes    void:classPartition    IRI
        # of literals   void:classPartition    IRI
        # of RDF graphs void:classPartition    IRI

     Note: Do not use blank nodes in the dataset graph. This dataset graph is added to
     the main Dipper graph in Source.write() like so

        $ mainGraph = mainGraph + datasetGraph

     which apparently in theory could lead to blank node ID collisions between the two
     graphs.

     Note also that this implementation currently does not support producing metadata
     for StreamedGraph graphs (see dipper/graph/StreamedGraph.py). StreamedGraph is
     currently not being used for any ingests, so this isn't a problem. There was
     talk of using StreamedGraph for a rewrite/refactor of the Clinvar ingest, which
     would probably require adding support here for StreamedGraph's.
    """
    def __init__(
            self,
            identifier,
            data_release_version,
            ingest_name,
            ingest_title,
            ingest_url,
            ingest_logo=None,
            ingest_description=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None,
            distribution_type='ttl',
            dataset_curie_prefix='MonarchArchive'):
        """
        Build the dataset graph and immediately emit the summary, version and
        distribution level triples.

        :param identifier: source identifier used to build the dataset CURIEs
        :param data_release_version: release version string; when None, today's
            date formatted as YYYYMMDD is used
        :param ingest_name: name of the ingest, used to build the download URL
        :param ingest_title: title literal; when None,
            [dataset_curie_prefix]:[identifier] is used
        :param ingest_url: source web page; the dc:source triple is skipped
            when None
        :param ingest_logo: logo file name appended to the 'MonarchLogoRepo'
            curie_map entry; the schema:logo triple is skipped when None
        :param ingest_description: description literal, skipped when None
        :param license_url: license IRI; an "unknown license" IRI is emitted
            when None
        :param data_rights: data rights IRI, skipped when None
        :param graph_type: 'rdf_graph', 'streamed_graph' or None
        :param file_handle: passed through to StreamedGraph
        :param distribution_type: file extension of the distribution, e.g. 'ttl'
        :param dataset_curie_prefix: CURIE prefix for all three HCLS resources
        :raises ValueError: if graph_type is not one of the supported values
        """
        graph_id = ':'.join([dataset_curie_prefix, identifier])

        if graph_type is None:
            self.graph = RDFGraph(None, graph_id)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, graph_id, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, graph_id)
        else:
            # previously fell through and failed later with an AttributeError
            # on self.graph; fail early with an explicit message instead
            raise ValueError(
                "Unsupported graph_type %r; expected 'rdf_graph', "
                "'streamed_graph' or None" % (graph_type,))

        # Compute the ingest date exactly once so that the dc:created triples
        # of the version and distribution level resources cannot disagree
        # (e.g. when the object is built across midnight).
        self._ingest_date = datetime.today().strftime("%Y%m%d")

        if data_release_version is not None:
            self.data_release_version = data_release_version
        else:
            self.data_release_version = self._ingest_date

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.identifier = graph_id
        self.citation = set()

        self.ingest_name = ingest_name
        self.ingest_title = ingest_title
        if self.ingest_title is None:
            self.ingest_title = graph_id

        self.ingest_url = ingest_url
        # ingest_logo is optional (defaults to None); only build the logo IRI
        # when a logo file name was actually supplied
        if ingest_logo is None:
            self.ingest_logo = None
        else:
            self.ingest_logo = \
                self.curie_map.get('MonarchLogoRepo') + ingest_logo
        self.ingest_description = ingest_description

        self.date_issued = None

        self.license_url = license_url
        self.data_rights = data_rights
        self.distribution_type = distribution_type

        # set HCLS resource CURIEs
        self.summary_level_curie = ':'.join(
            [dataset_curie_prefix, '#' + identifier])
        self.version_level_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/#' + identifier
        self.distribution_level_turtle_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/rdf/' + \
            identifier + "." + self.distribution_type

        # The following might seem a little odd, but we need to set downloadURLs this
        # way in order for them to point to where they will end up in archive.MI.org as
        # of Sept 2019. URL is:
        #  https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
        self.download_url = \
            self.curie_map.get("MonarchArchive") + self.data_release_version + \
            "/rdf/" + self.ingest_name + "." + self.distribution_type

        self._set_summary_level_triples()
        self._set_version_level_triples()
        self._set_distribution_level_triples()

    def _set_summary_level_triples(self):
        """
        Emit the triples describing the summary level resource: type, title,
        publisher, identifier and, when available, logo, source page and
        description.
        :return: None
        """
        self.model.addType(self.summary_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(self.summary_level_curie, self.globaltt['title'],
                             self.ingest_title, True)
        self.model.addTriple(self.summary_level_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))
        if self.ingest_logo is not None:
            self.model.addTriple(self.summary_level_curie, "schema:logo",
                                 self.ingest_logo)
        self.graph.addTriple(self.summary_level_curie,
                             self.globaltt['identifier'],
                             self.summary_level_curie)
        if self.ingest_url is not None:
            self.graph.addTriple(self.summary_level_curie,
                                 self.globaltt["Source"], self.ingest_url)
        if self.ingest_description is not None:
            self.model.addDescription(self.summary_level_curie,
                                      self.ingest_description)

    def _set_version_level_triples(self):
        """
        Emit the triples describing the version level resource: type, title,
        optional description, creation date, version, creator, publisher,
        and the links to the summary and distribution level resources.
        :return: None
        """
        self.model.addType(self.version_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['title'],
            self.ingest_title + " Monarch version " +
            self.data_release_version, True)
        if self.ingest_description is not None:
            self.model.addDescription(self.version_level_curie,
                                      self.ingest_description)
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['Date Created'],
            Literal(self._ingest_date, datatype=XSD.date))
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['isVersionOf'],
                             self.summary_level_curie,
                             object_is_literal=False)
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['distribution'],
                             self.distribution_level_turtle_curie,
                             object_is_literal=False)

    def _set_distribution_level_triples(self):
        """
        Emit the triples describing the distribution level resource: types,
        title, optional description, version, creation date, creator,
        publisher, created-with, format, download URL, license and optional
        data rights. Finishes by calling _declare_as_ontology().
        :return: None
        """
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['Dataset'])
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['Distribution'])
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['title'],
            self.ingest_title + " distribution " + self.distribution_type,
            True)
        if self.ingest_description is not None:
            self.model.addDescription(self.distribution_level_turtle_curie,
                                      self.ingest_description)
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(
            self.distribution_level_turtle_curie,
            self.globaltt['Date Created'],
            Literal(self._ingest_date, datatype=XSD.date))
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['created_with'],
                             "https://github.com/monarch-initiative/dipper")
        # NOTE(review): the format IRI is hard-coded to the Turtle spec even
        # when distribution_type is not 'ttl' - confirm whether that is intended
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['format'],
                             "https://www.w3.org/TR/turtle/")
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['downloadURL'], self.download_url)
        if self.license_url is None:
            self.graph.addTriple(
                self.distribution_level_turtle_curie, self.globaltt['license'],
                'https://project-open-data.cio.gov/unknown-license/')
        else:
            self.graph.addTriple(self.distribution_level_turtle_curie,
                                 self.globaltt['license'], self.license_url)

        if self.data_rights is not None:
            self.graph.addTriple(self.distribution_level_turtle_curie,
                                 self.globaltt['rights'], self.data_rights)

        self._declare_as_ontology()

    def set_ingest_source_file_version_num(self, file_iri, version):
        """
        This method sets the version of a remote file or resource that is used in the
        ingest. It writes this triple:

        file_iri - 'pav:version' -> version

        Version is an untyped literal

        Note: if your version is a date or timestamp, use
        set_ingest_source_file_version_date()
        instead

        :param file_iri: a remote file or resource used in ingest
        :param version: a number or string (e.g. v1.2.3) that the source (OMIM, CTD)
        uses to refer to this version of the file/resource used during the ingest
        :return: None
        """
        self.graph.addTriple(file_iri,
                             self.globaltt['version'],
                             version,
                             object_is_literal=True)

    def set_ingest_source_file_version_date(self,
                                            file_iri,
                                            date,
                                            datatype=XSD.date):
        """
        This method sets the version that the source (OMIM, CTD, whatever) uses to
        refer to this version of the remote file/resource that was used in the ingest

        It writes this triple:

        file_iri - 'pav:version' -> date or timestamp

        Version is added as a literal of datatype XSD date by default

        Note: if file_iri was retrieved using get_files(), then the following triple
        was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: a date in YYYYMMDD format that the source (OMIM, CTD) uses to
        refer to this version of the file/resource used during the ingest. You can
        add a timestamp as a version by using a different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(file_iri,
                             self.globaltt['version'],
                             date,
                             object_is_literal=True,
                             literal_type=datatype)

    def set_ingest_source_file_version_retrieved_on(self,
                                                    file_iri,
                                                    date,
                                                    datatype=XSD.date):
        """
        This method sets the date on which a remote file/resource (from OMIM, CTD, etc)
        was retrieved.

        It writes this triple:

        file_iri - 'pav:retrievedOn' -> date or timestamp

        The date is added as a literal of datatype XSD date by default

        Note: if file_iri was retrieved using get_files(), then the following triple
        was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: the date, in YYYYMMDD format, on which the file/resource was
        retrieved. You can add a timestamp instead by using a different datatype
        (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(file_iri,
                             self.globaltt['retrieved_on'],
                             date,
                             object_is_literal=True,
                             literal_type=datatype)

    def set_ingest_source(self, url, predicate=None, is_object_literal=False):
        """
        This method writes a triple to the dataset graph indicating that the ingest
        used a file or resource at [url] during the ingest.

        Triple emitted is version_level_curie dc:source [url]

        This triple is likely to be redundant if Source.get_files() is used to retrieve
        the remote files/resources, since this triple should also be emitted
        as files/resources are being retrieved. This method is provided as a convenience
        method for sources that do their own downloading of files.

        :param url: a remote resource used as a source during ingest
        :param predicate: the predicate to use for the triple ["dc:source"]
                from spec (https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/)
                "Use dc:source when the source dataset was used in whole or in part.
                Use pav:retrievedFrom when the source dataset was used in whole and was
                not modified from its original distribution. Use prov:wasDerivedFrom
                when the source dataset was in whole or in part and was modified from
                its original distribution."
        :param is_object_literal: passed to addTriple as object_is_literal;
                set True when [url] should be written as a literal [False]
        :return: None
        """
        if predicate is None:
            predicate = self.globaltt["Source"]
        self.graph.addTriple(self.version_level_curie,
                             predicate,
                             url,
                             object_is_literal=is_object_literal,
                             subject_category=blv.terms['DataSetVersion'])

    def get_graph(self):
        """
        This method returns the dataset graph
        :return: dataset graph
        """
        return self.graph

    def get_license(self):
        """
        This method returns the license info
        :return: license info (the license_url given at construction, may be None)
        """
        return self.license_url

    def set_citation(self, citation_id):
        """
        This method adds [citation_id] argument to the set of citations, and also
        adds a triple indicating that version level cito:citesAsAuthority [citation_id]
        :param citation_id: identifier of the citation to record
        :return: None
        """
        self.citation.add(citation_id)
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['citesAsAuthority'], citation_id)

    def _declare_as_ontology(self, version_info=None):
        """
        Declare the summary level IRI as an ontology, and also make triple
        summary level IRI - version_iri -> version level IRI

        When version_info is given, it is attached as OWL version info to the
        distribution level IRI.

        TEC: I am not convinced dipper reformatting external data as RDF triples
        makes an OWL ontology (nor that it should be considered a goal).

        Proper ontologies are built by ontologists. Dipper reformats data
        and annotates/decorates it with a minimal set of carefully arranged
        terms drawn from multiple proper ontologies.
        Which allows the whole (dipper's RDF triples and parent ontologies)
        to function as a single ontology we can reason over when combined
        in a store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        :param version_info: a string describing version info for the ontology
        :return: None

        """
        model = Model(self.graph)
        model.addOntologyDeclaration(self.summary_level_curie)
        model.addOWLVersionIRI(self.summary_level_curie,
                               self.version_level_curie)
        if version_info is not None:
            model.addOWLVersionInfo(self.distribution_level_turtle_curie,
                                    version_info)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        A method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1
        Duplicated from Source.py to avoid circular imports.
        :param long_string: string to use to generate identifier
        :param prefix: prefix to prepend to identifier [MONARCH]
        :return: a Monarch identifier
        """
        return ':'.join((prefix, Dataset.hash_id(long_string)))

    @staticmethod
    def hash_id(word):  # same as graph/GraphUtils.digest_id(wordage)
        """
        Given a string, make a hash
        Duplicated from Source.py.

        The result is the letter 'b' followed by characters 1 through 19 of
        the sha1 hex digest of the utf-8 encoded string (20 characters total).

        :param word: str string to be hashed
        :return: hash of id
        """
        return 'b' + hashlib.sha1(word.encode('utf-8')).hexdigest()[1:20]
Ejemplo n.º 16
0
    def _process_data(self, raw, limit=None):
        logger.info("Processing Data from %s", raw)

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        geno = Genotype(g)
        line_counter = 0

        impc_map = self.open_and_parse_yaml(self.map_files['impc_map'])
        impress_map = json.loads(
            self.fetch_from_url(
                self.map_files['impress_map']).read().decode('utf-8'))

        # Add the taxon as a class
        taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                (marker_accession_id, marker_symbol, phenotyping_center,
                 colony, sex, zygosity, allele_accession_id, allele_symbol,
                 allele_name, strain_accession_id, strain_name, project_name,
                 project_fullname, pipeline_name, pipeline_stable_id,
                 procedure_stable_id, procedure_name, parameter_stable_id,
                 parameter_name, top_level_mp_term_id, top_level_mp_term_name,
                 mp_term_id, mp_term_name, p_value, percentage_change,
                 effect_size, statistical_method, resource_name) = row

                if self.testMode and marker_accession_id not in self.test_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity_id = self._map_zygosity(zygosity)

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = \
                        '_:IMPC-'+re.sub(r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    logger.info("Found a strange strain accession...%s",
                                strain_accession_id)
                    strain_accession_id = 'IMPC:' + strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = \
                        re.match(r'.*<(.*)>', allele_symbol).group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and \
                        marker_accession_id == '':
                    logger.warning("Marker unspecified on row %d",
                                   line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = geno.genoparts['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 geno.genoparts['gene'])
                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    sequence_alteration_id = \
                        '_:seqalt'+re.sub(r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,
                # with unknown zygosity
                stem_cell_class = 'ERO:0002002'
                model.addIndividualToGraph(colony_id, colony, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = \
                    '_:'+re.sub(r':', '', allele_accession_id+geno.zygosity['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'])
                g.addTriple(colony_id, geno.object_properties['has_genotype'],
                            colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = geno.object_properties['has_alternate_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    logger.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:' + vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    geno.genoparts['variant_single_locus_complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    geno.object_properties['has_alternate_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(
                    vslc_id,
                    Genotype.genoparts['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(genomic_background_id, strain_name,
                                     geno.genoparts['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = \
                        strain_name + '-' + phenotyping_center + '-' + colony
                    pheno_center_strain_id = \
                        '-'.join((re.sub(r':', '', genomic_background_id),
                                  re.sub(r'\s', '_', phenotyping_center),
                                  re.sub(r'\W+', '', colony)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     geno.genoparts['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name+' ['+pheno_center_strain_label+']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id+sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'
                if sex == 'male':
                    sq_type_id = geno.genoparts['male_genotype']
                elif sex == 'female':
                    sq_type_id = geno.genoparts['female_genotype']
                else:
                    sq_type_id = geno.genoparts['sex_qualified_genotype']

                geno.addGenotype(sex_qualified_genotype_id,
                                 sex_qualified_genotype_label, sq_type_id)
                geno.addParts(genotype_id, sex_qualified_genotype_id,
                              geno.object_properties['has_alternate_part'])

                if genomic_background_id is not None and \
                        genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    logger.warning("No phenotype id specified for row %d: %s",
                                   line_counter, str(row))
                    continue
                # hard coded ECO code
                eco_id = "ECO:0000015"

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(g, self.name, sex_qualified_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                # add a free-text description
                try:
                    description = \
                        ' '.join((mp_term_name, 'phenotype determined by',
                                  phenotyping_center, 'in an',
                                  procedure_name, 'assay where',
                                  parameter_name.strip(),
                                  'was measured with an effect_size of',
                                  str(round(float(effect_size), 5)),
                                  '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = \
                        ' '.join((mp_term_name, 'phenotype determined by',
                                  phenotyping_center, 'in an',
                                  procedure_name, 'assay where',
                                  parameter_name.strip(),
                                  'was measured with an effect_size of',
                                  str(effect_size),
                                  '(p =', "{0}".format(p_value), ').'))

                study_bnode = \
                    self._add_study_provenance(
                        impc_map, impress_map, phenotyping_center, colony,
                        project_fullname, pipeline_name, pipeline_stable_id,
                        procedure_stable_id, procedure_name,
                        parameter_stable_id, parameter_name,
                        statistical_method, resource_name)

                evidence_line_bnode = \
                    self._add_evidence(
                        assoc_id, eco_id, impc_map, p_value, percentage_change,
                        effect_size, study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode,
                                               impc_map)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(g, assoc_id, resource_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Ejemplo n.º 17
0
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    An association wraps a ``subject predicate object`` statement in an
    association node, so that evidence, sources, provenance, dates and a
    score can be attached to the statement itself.  Callers populate the
    parts (through the constructor and the ``set_*`` / ``add_*`` methods)
    and then call :meth:`add_association_to_graph` to emit the triples.

    """

    assoc_types = {'association': 'OBAN:association'}

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
        'inchi_key': 'CHEBI:InChIKey',
        'probabalistic_quantifier': 'GENO:0000867'
    }

    object_properties = {
        'has disposition': 'RO:0000091',
        'has_phenotype': 'RO:0002200',
        'expressed_in': 'RO:0002206',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_predicate',
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance',
        'causes_or_contributes': 'RO:0003302'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004',
        'has_quantifier': 'GENO:0000866',
        'created_on': 'pav:createdOn'
    }

    # the union of all three property dictionaries above;
    # on a key clash the later update would win
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: the Graph instance the triples are written to
        :param definedby: the (data) resource providing the annotation;
            it is folded into the generated association identifier
        :param sub: identifier of the subject of the statement
        :param obj: identifier of the object of the statement
        :param pred: identifier of the predicate (relationship)

        :raises ValueError: if ``graph`` is not a Graph instance

        """
        if not isinstance(graph, Graph):
            # the message has to be built with str.format();
            # attribute access on the template string raised
            # AttributeError instead of the intended ValueError
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

    def get_properties(self):
        """
        :return: the merged annotation/object/datatype property dictionary

        """
        return self.properties

    def _is_valid(self):
        """
        Check that subject, object and relation have all been set.

        :return: True when all three parts are present
        :raises ValueError: naming the first part found to be None

        """
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self):
        """
        Write the plain statement, the association node describing it,
        and every collected description/evidence/source/provenance/date/
        score onto the graph.

        An association identifier is generated first if none was set.

        :raises ValueError: if subject, object or relation is missing

        """
        if not self._is_valid():
            return

        # the bare statement itself
        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        # the association node pointing back at the statement's parts
        self.model.addType(self.assoc_id, self.assoc_types['association'])

        self.graph.addTriple(self.assoc_id,
                             self.object_properties['has_subject'], self.sub)
        self.graph.addTriple(self.assoc_id,
                             self.object_properties['has_object'], self.obj)
        self.graph.addTriple(self.assoc_id,
                             self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence:
            for evidence_id in self.evidence:
                self.graph.addTriple(self.assoc_id,
                                     self.object_properties['has_evidence'],
                                     evidence_id)

        if self.source:
            for source_id in self.source:
                if source_id.startswith('http'):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    # sources that look like URLs are written
                    # with the literal flag set
                    self.graph.addTriple(self.assoc_id,
                                         self.object_properties['has_source'],
                                         source_id, True)
                else:
                    self.graph.addTriple(self.assoc_id,
                                         self.object_properties['has_source'],
                                         source_id)

        if self.provenance:
            for provenance_id in self.provenance:
                self.graph.addTriple(self.assoc_id,
                                     self.object_properties['has_provenance'],
                                     provenance_id)

        if self.date:
            for created in self.date:
                self.graph.addTriple(
                    object_is_literal=True,
                    subject_id=self.assoc_id,
                    predicate_id=self.datatype_properties['created_on'],
                    obj=created)

        if self.score is not None:
            self.graph.addTriple(self.assoc_id,
                                 self.properties['has_measurement'],
                                 self.score, True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

    def add_association_to_graph(self):
        """
        Public entry point: add this association to the graph.

        :raises ValueError: if subject, object or relation is missing

        """
        self._add_basic_association_to_graph()

    def add_predicate_object(self,
                             predicate,
                             object_node,
                             object_type=None,
                             datatype=None):
        """
        Attach an extra ``predicate object`` pair to the association node.

        :param predicate: identifier of the predicate
        :param object_node: the object identifier or literal value
        :param object_type: pass 'Literal' to write the object as a literal;
            anything else writes it as a non-literal
        :param datatype: optional datatype, only used for literals

        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(self.assoc_id, predicate, object_node,
                                     True, datatype)
            else:
                self.graph.addTriple(self.assoc_id, predicate, object_node,
                                     True)
        else:
            self.graph.addTriple(self.assoc_id, predicate, object_node, False)

    # This isn't java, but if we must,
    # prefer use of property decorator
    def set_subject(self, identifier):
        """Set the subject of the statement."""
        self.sub = identifier

    def set_object(self, identifier):
        """Set the object of the statement."""
        self.obj = identifier

    def set_relationship(self, identifier):
        """Set the predicate (relationship) of the statement."""
        self.rel = identifier

    def set_association_id(self, assoc_id=None):
        """
        Set the association identifier.

        With no argument the identifier is derived from the internal parts
        of the association (definedby, subject, relation, object).
        Pass ``assoc_id`` in cases where an external association
        identifier should be used instead.

        :param assoc_id: an externally supplied identifier, or None

        :return:

        """
        if assoc_id is None:
            assoc_id = self.make_association_id(self.definedby, self.sub,
                                                self.rel, self.obj)
        self.assoc_id = assoc_id

    def get_association_id(self):
        """
        :return: the association identifier, or None if not yet set

        """
        return self.assoc_id

    def set_description(self, description):
        """Set the free-text description attached to the association."""
        self.description = description

    def set_score(self, score, unit=None, score_type=None):
        """
        Record a score for the association.

        Only ``score`` is written to the graph at present;
        ``unit`` and ``score_type`` are stored on the instance.

        """
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        None and blank identifiers are ignored.

        :param identifier:

        :return:

        """
        if identifier is not None and identifier.strip() != '':
            self.evidence.append(identifier)

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier:

        :return:

        """
        if identifier is not None and identifier.strip() != '':
            self.source.append(identifier)

    def add_date(self, date):
        """
        Add a creation date to the association object
        (maintained as a list).  None and blank values are ignored.

        :param date: the date, as a string

        """
        if date is not None and date.strip() != '':
            self.date.append(date)

    def add_provenance(self, identifier):
        """
        Add a provenance identifier to the association object
        (maintained as a list).  None and blank identifiers are ignored.

        :param identifier:

        """
        if identifier is not None and identifier.strip() != '':
            self.provenance.append(identifier)

    @staticmethod
    def make_association_id(definedby,
                            subject,
                            predicate,
                            object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is None, it will convert it to blank.
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes: optional list of extra strings to fold in

        :return: 'MONARCH:' followed by the first 16 hex digits
            of the SHA-1 digest of the '+'-joined parts

        """

        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        # (although the point of a digest is to render such details moot).

        parts = [definedby, subject, predicate, object]
        if attributes is not None:
            parts.extend(attributes)

        # a missing part keeps its slot (as an empty string) so that
        # e.g. a missing subject and a missing object digest differently
        parts = ['' if part is None else part for part in parts]

        byte_string = '+'.join(parts).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(
            ('MONARCH', hashlib.sha1(byte_string).hexdigest()[0:16]))
Ejemplo n.º 18
0
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    An association wraps a ``subject predicate object`` statement in an
    association node, so that evidence, sources, provenance, dates and a
    score can be attached to the statement itself.  Callers populate the
    parts (through the constructor and the ``set_*`` / ``add_*`` methods)
    and then call :meth:`add_association_to_graph` to emit the triples.

    """

    # identifier prefixes accepted without an entry in the curie map:
    # '_' for blank nodes, the rest for IRIs
    _non_curie_prefixes = ('_', 'http', 'https', 'ftp')

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: the Graph instance the triples are written to; its
            ``globaltt``, ``globaltcid`` and ``curie_map`` are reused here
        :param definedby: the (data) resource providing the annotation;
            it is folded into the generated association identifier
        :param sub: identifier of the subject of the statement
        :param obj: identifier of the object of the statement
        :param pred: identifier of the predicate (relationship)

        :raises ValueError: if ``graph`` is not a Graph instance

        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

    def _invalid(self, problem):
        """
        Build (not raise) the ValueError describing a validation failure.

        ValueError does not interpolate logging-style '%s' arguments,
        so the message is formatted here before the exception is created.

        :param problem: leading part of the message, e.g. 'No subject set'
        :return: a ValueError whose message includes the current triple

        """
        return ValueError(
            '%s for this association <%s> <%s> <%s>' % (
                problem, self.sub, self.rel, self.obj))

    def _has_known_prefix(self, identifier):
        """
        Tell whether the text before the first ':' of ``identifier`` is a
        prefix in the curie map, or one of the blank-node/IRI prefixes.

        """
        pfx = identifier.split(':')[0]
        return pfx in self.curie_map or pfx in self._non_curie_prefixes

    def _is_valid(self):
        """
        Check that subject, object and predicate are set, and that
        subject and predicate are each either a curie or an IRI.

        :return: True when every check passes
        :raises ValueError: describing the first check that fails

        """
        # check if sub/obj/rel are none...raise error
        if self.sub is None:
            raise self._invalid('No subject set')
        if self.obj is None:
            raise self._invalid('No object set')
        if self.rel is None:
            raise self._invalid('No predicate set')
        # Are subject & predicate, either a curie or IRI
        if not self._has_known_prefix(self.sub):
            raise self._invalid('Invalid Subject')
        if not self._has_known_prefix(self.rel):
            raise self._invalid('Invalid Predicate')
        return True

    def add_association_to_graph(self):
        """
        Write the plain statement, the association node describing it,
        and every collected description/evidence/source/provenance/date/
        score onto the graph.

        An association identifier is generated first if none was set.

        :raises ValueError: if the association fails validation

        """
        if not self._is_valid():
            return

        # the bare statement itself
        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        assert self.assoc_id is not None

        # the association node pointing back at the statement's parts
        self.model.addType(self.assoc_id, self.model.globaltt['association'])

        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has subject'], self.sub)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has object'], self.obj)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has predicate'], self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence:
            for evi in self.evidence:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has evidence'], evi)

        if self.source:
            for src in self.source:
                # TODO assume that the source is a publication? use Reference class
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['source'], src)

        # NOTE(review): 'has_provenance' and 'created_on' are the only
        # underscore-style keys looked up in globaltt here; the others use
        # spaces -- confirm that both keys exist in the translation table
        if self.provenance:
            for prov in self.provenance:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has_provenance'], prov)

        if self.date:
            for dat in self.date:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['created_on'], dat,
                    object_is_literal=True)

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has measurement value'], self.score,
                True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):
        """
        Attach an extra ``predicate object`` pair to the association node.

        :param predicate: identifier of the predicate
        :param object_node: the object identifier or literal value
        :param object_type: pass 'Literal' to write the object as a literal;
            anything else writes it as a non-literal
        :param datatype: optional datatype, only used for literals

        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype)
            else:
                self.graph.addTriple(self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(self.assoc_id, predicate, object_node, False)

    # This isn't java, but predecessors favored the use of property decorators
    # and CamelCase and ...
    def set_subject(self, identifier):
        """Set the subject of the statement."""
        self.sub = identifier

    def set_object(self, identifier):
        """Set the object of the statement."""
        self.obj = identifier

    def set_relationship(self, identifier):
        """Set the predicate (relationship) of the statement."""
        self.rel = identifier

    def set_association_id(self, assoc_id=None):
        """
        Set the association identifier.

        With no argument the identifier is derived from the internal parts
        of the association (definedby, subject, predicate, object).
        Pass ``assoc_id`` in cases where an external association
        identifier should be used instead.

        :param assoc_id: an externally supplied identifier, or None

        :return: the identifier now in effect

        """
        if assoc_id is None:
            assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        self.assoc_id = assoc_id

        return self.assoc_id

    def get_association_id(self):
        """
        :return: the association identifier, generating it from the
            current parts first if it has not been set yet

        """
        if self.assoc_id is None:
            self.set_association_id()

        return self.assoc_id

    def set_description(self, description):
        """Set the free-text description attached to the association."""
        self.description = description

    def set_score(self, score, unit=None, score_type=None):
        """
        Record a score for the association.

        Only ``score`` is written to the graph at present;
        ``unit`` and ``score_type`` are stored on the instance.

        """
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        None and blank identifiers are ignored.

        :param identifier:

        :return:

        """
        if identifier is not None and identifier.strip() != '':
            self.evidence.append(identifier)

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier:

        :return:

        """
        if identifier is not None and identifier.strip() != '':
            self.source.append(identifier)

    def add_date(self, date):
        """
        Add a creation date to the association object
        (maintained as a list).  None and blank values are ignored.

        :param date: the date, as a string

        """
        if date is not None and date.strip() != '':
            self.date.append(date)

    def add_provenance(self, identifier):
        """
        Add a provenance identifier to the association object
        (maintained as a list).  None and blank identifiers are ignored.

        :param identifier:

        """
        if identifier is not None and identifier.strip() != '':
            self.provenance.append(identifier)

    @staticmethod
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        Any item that is None is left out before the parts are joined
        (it does not leave an empty slot behind).
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.

        Note this is equivalent to a RDF blank node

        :param definedby: The (data) resource that provided the annotation
        :param sub: the subject identifier
        :param pred: the predicate identifier
        :param obj: the object identifier
        :param attributes: optional list of extra strings to fold in

        :return: 'MONARCH:' followed by the digest of the '+'-joined parts

        """
        parts = [definedby, sub, pred, obj]
        if attributes:
            parts.extend(attributes)

        present = [part for part in parts if part is not None]

        return ':'.join(('MONARCH', GraphUtils.digest_id('+'.join(present))))
Ejemplo n.º 19
0
    def _process_data(self, raw, limit=None):
        LOG.info("Processing Data from %s", raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)
        line_counter = 0

        # Add the taxon as a class
        taxon_id = self.globaltt['Mus musculus']
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                (
                    marker_accession_id,
                    marker_symbol,
                    phenotyping_center,
                    colony_raw,
                    sex,
                    zygosity,
                    allele_accession_id,
                    allele_symbol,
                    allele_name,
                    strain_accession_id,
                    strain_name,
                    project_name,
                    project_fullname,
                    pipeline_name,
                    pipeline_stable_id,
                    procedure_stable_id,
                    procedure_name,
                    parameter_stable_id,
                    parameter_name,
                    top_level_mp_term_id,
                    top_level_mp_term_name,
                    mp_term_id,
                    mp_term_name,
                    p_value,
                    percentage_change,
                    effect_size,
                    statistical_method,
                    resource_name
                    ) = row

                if self.test_mode and marker_accession_id not in self.gene_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity = zygosity.strip()
                zygosity_id = self.resolve(zygosity)
                if zygosity_id == zygosity:
                    LOG.warning(
                        "Zygosity '%s' unmapped. detting to indeterminate", zygosity)
                    zygosity_id = self.globaltt['indeterminate']

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = '_:IMPC-'+re.sub(
                        r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    # TODO blank nodes do not maintain identifiers
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    LOG.info(
                        "Found a strange strain accession...%s", strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = re.match(
                        r'.*<(.*)>', allele_symbol).group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and marker_accession_id == '':
                    LOG.warning("Marker unspecified on row %d", line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = self.globaltt['variant_locus']
                    geno.addGene(
                        marker_accession_id, marker_symbol, self.globaltt['gene'])

                    geno.addAllele(
                        variant_locus_id, variant_locus_name, variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    # TAG bnode
                    sequence_alteration_id = '_:seqalt' + re.sub(
                        r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(
                    sequence_alteration_id, sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,  with unknown zygosity

                stem_cell_class = self.globaltt['embryonic stem cell line']

                if colony_id is None:
                    print(colony_raw, stem_cell_class, "\nline:\t", line_counter)
                model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:'+re.sub(
                    r':', '', allele_accession_id + self.globaltt['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(
                    allele_accession_id, colony_genotype_id,
                    self.globaltt['has_variant_part'])

                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    self.globaltt['indeterminate'], self.globaltt['has_variant_part'])
                graph.addTriple(
                    colony_id, self.globaltt['has_genotype'], colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id(
                    (colony_id + phenotyping_center + zygosity + strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = self.globaltt['has_variant_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    LOG.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:'+vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    self.globaltt['variant single locus complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    self.globaltt['has_variant_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(vslc_id, self.globaltt['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        self.globaltt['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                        + '-' + colony_raw
                    pheno_center_strain_id = '-'.join((
                        re.sub(r':', '', genomic_background_id),
                        re.sub(r'\s', '_', phenotyping_center),
                        re.sub(r'\W+', '', colony_raw)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        # Tag bnode
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(
                        pheno_center_strain_id, pheno_center_strain_label,
                        self.globaltt['genomic_background'])
                    geno.addSequenceDerivesFrom(
                        pheno_center_strain_id, genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id((
                        colony_id + phenotyping_center + zygosity +
                        strain_accession_id + sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

                sq_type_id = self.resolve(sex, False)

                if sq_type_id == sex:
                    sq_type_id = self.globaltt['intrinsic_genotype']
                    LOG.warning(
                        "Unknown sex qualifier %s, adding as intrinsic_genotype",
                        sex)

                geno.addGenotype(
                    sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    self.globaltt['has_variant_part'])

                if genomic_background_id is not None and genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    LOG.warning(
                        "No phenotype id specified for row %d: %s",
                        line_counter, str(row))
                    continue
                # hard coded ECO code
                eco_id = self.globaltt['mutant phenotype evidence']

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(
                    graph, self.name, sex_qualified_genotype_id, phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                model._addSexSpecificity(assoc_id, self.resolve(sex))

                # add a free-text description
                try:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of',
                        str(round(float(effect_size), 5)),
                        '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of', str(effect_size),
                        '(p =', "{0}".format(p_value), ').'))

                study_bnode = self._add_study_provenance(
                    phenotyping_center, colony_raw, project_fullname, pipeline_name,
                    pipeline_stable_id, procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name, statistical_method,
                    resource_name, line_counter)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, p_value, percentage_change, effect_size,
                    study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(graph, assoc_id, resource_id)

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        return
Ejemplo n.º 20
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.

    """

    def __init__(
            self, graph, feature_id=None, label=None,
            feature_type=None, description=None):
        """
        Bind a feature to ``graph`` and remember its identifying details.

        :param graph: Graph instance the feature's triples are added to
        :param feature_id: identifier of the feature
        :param label: label of the feature
        :param feature_type: type identifier of the feature
        :param description: optional description of the feature

        :raises ValueError: when ``graph`` is not a Graph instance

        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))

        self.graph = graph
        self.model = Model(graph)
        # translation tables and prefix map are the graph's own
        self.globaltt = graph.globaltt
        self.globaltcid = graph.globaltcid
        self.curie_map = graph.curie_map

        # what the feature is
        self.fid, self.label = feature_id, label
        self.ftype, self.description = feature_type, description

        # set later by addFeatureStartLocation, addFeatureEndLocation
        # and addTaxonToFeature
        self.start = self.stop = self.taxon = None

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None, position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand, position_types)

        return

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None, position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:

        :return:

        """

        self.stop = self._getLocation(coordinate, reference_id, strand, position_types)

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.globaltt['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        :param strand:
        :return:
        """

        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.globaltt['plus_strand']
        elif strand == '-':
            strand_id = self.globaltt['minus_strand']
        elif strand == '.':
            strand_id = self.globaltt['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            LOG.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param graph:

        :return:

        """

        if feature_as_class:
            self.model.addClassToGraph(
                self.fid, self.label, self.ftype, self.description)
        else:
            self.model.addIndividualToGraph(
                self.fid, self.label, self.ftype, self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            regionchr = re.sub(r'\w+\:_?', '', self.start['reference'])
            if region_id is None:
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(self.start['type'])
                if self.stop is not None and self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    if strand is not None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                rid = '_:'+rid+"-Region"
                region_id = rid

            self.graph.addTriple(self.fid, self.globaltt['location'], region_id)
            self.model.addIndividualToGraph(region_id, None, self.globaltt['Region'])
        else:
            region_id = self.fid
            self.model.addType(region_id, self.globaltt['region'])

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(
                self.start['reference'], self.start['coordinate'], self.start['type'])
            self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'], self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])
            self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

        return

    def _getStrandStringFromPositionTypes(self, tylist):
        strand = None
        if self.globaltt['plus_strand'] in tylist:
            strand = 'plus'
        elif self.globaltt['minus_strand'] in tylist:
            strand = 'minus'
        elif self.globaltt['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return:
        """

        if reference is None:
            LOG.error("Trying to make position with no reference.")
            return None

        curie = '_:'
        reference = re.sub(r'\w+\:', '', reference, 1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        curie += reference
        if coordinate is not None:
            # just in case it isn't a string already
            curie = '-'.join((curie, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                curie = '-'.join((curie, tstring))

        return curie

    def addRegionPositionToGraph(self, region_id, begin_position_id, end_position_id):

        if begin_position_id is None:
            pass
            # LOG.warn("No begin position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['begin'], begin_position_id)

        if end_position_id is None:
            pass
            # LOG.warn("No end position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['end'], end_position_id)

        return

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param graph:
        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id, self.globaltt['position'], position, object_is_literal=True,
                literal_type="xsd:integer")
        self.graph.addTriple(pos_id, self.globaltt['reference'], reference_id)
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        strnd = None
        if strand is not None:
            strnd = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                strnd = self._getStrandType(strand)
        # else:
        #    strnd = self.globaltt['both_strand']
        if strnd is None and (position_types is None or position_types == []):
            strnd = self.globaltt['Position']

        if strnd is not None:
            self.model.addType(pos_id, strnd)

        return pos_id

    def addSubsequenceOfFeature(self, parentid):
        """
        This will add reciprocal triples like:
        feature <is subsequence of> parent
        parent has_subsequence feature
        :param graph:
        :param parentid:

        :return:

        """
        self.graph.addTriple(self.fid, self.globaltt['is subsequence of'], parentid)
        # this should be expected to be done in reasoning not ETL
        self.graph.addTriple(parentid, self.globaltt['has subsequence'], self.fid)

        return

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param graph:
        :param taxonid:
        :return:
        """
        self.taxon = taxonid
        self.graph.addTriple(self.fid, self.globaltt['in taxon'], self.taxon)

        return

    def addFeatureProperty(self, property_type, feature_property):
        self.graph.addTriple(self.fid, property_type, feature_property)
        return
Ejemplo n.º 21
0
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    """

    def __init__(
            self,
            graph,
            definedby,
            sub=None,
            obj=None,
            pred=None,
            subject_category=None,
            object_category=None
    ):
        """
        Bind an association to ``graph`` and record its core parts.

        :param graph: Graph instance the association's triples are added to
        :param definedby: the (data) resource that provided the annotation
        :param sub: subject of the association
        :param obj: object of the association
        :param pred: predicate of the association
        :param subject_category: category passed along with the subject
        :param object_category: category passed along with the object

        :raises ValueError: when ``graph`` is not a Graph instance

        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))

        self.graph = graph
        self.model = Model(graph)
        # translation tables and prefix map are the graph's own
        self.globaltt = graph.globaltt
        self.globaltcid = graph.globaltcid
        self.curie_map = graph.curie_map

        # core parts of the association
        self.definedby = definedby
        self.sub, self.rel, self.obj = sub, pred, obj
        self.subject_category = subject_category
        self.object_category = object_category
        self.assoc_id = None

        # optional details, filled in through the set_*/add_* methods
        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = self.score_type = self.score_unit = None


    def _is_valid(self):
        # check if sub/obj/rel are none...raise error
        if self.sub is None:
            raise ValueError(
                'No subject set for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        if self.obj is None:
            raise ValueError(
                'No object set for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        if self.rel is None:
            raise ValueError(
                'No predicate set for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        # Are subject & predicate, either a curie or IRI
        pfx = self.sub.split(':')[0]
        if pfx not in self.curie_map.keys() and \
                pfx not in ['_', 'http', 'https', 'ftp']:
            raise ValueError(
                'Invalid Subject for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        pfx = self.rel.split(':')[0]
        if pfx not in self.curie_map.keys() and \
                pfx not in ['_', 'http', 'https', 'ftp']:
            raise ValueError(
                'Invalid Predicate for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        return True

    def add_association_to_graph(self, association_category=None):

        # Assume null and iri checks happen downstream
        #if not self._is_valid():
        #    return

        self.graph.addTriple(self.sub, self.rel, self.obj,
                             subject_category=self.subject_category,
                             object_category=self.object_category)

        if self.assoc_id is None:
            self.set_association_id()

        # assert self.assoc_id is not None

        self.model.addType(self.assoc_id, self.model.globaltt['association'])

        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has subject'], self.sub
        )
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has object'], self.obj
        )
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has predicate'], self.rel
        )

        if association_category is not None:
            self.graph.addTriple(
                self.assoc_id,
                blv.terms['category'],
                association_category
            )

        if self.description:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence:
            for evi in self.evidence:
                self.graph.addTriple(self.assoc_id, self.globaltt['has evidence'], evi)

        if self.source:
            for src in self.source:
                # TODO assume that the source is a publication? use Reference class
                self.graph.addTriple(self.assoc_id, self.globaltt['Source'], src)

        if self.provenance:
            for prov in self.provenance:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has_provenance'], prov)

        if self.date:
            for dat in self.date:
                self.graph.addTriple(
                    self.assoc_id,
                    self.globaltt['created_on'],
                    dat,
                    object_is_literal=True
                )

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id,
                self.globaltt['has measurement value'],
                self.score,
                True, 'xsd:float'
            )
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):

        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype
                )
            else:
                self.graph.addTriple(self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(self.assoc_id, predicate, object_node, False)


    # This isn't java, but predecessors favored the use of property decorators
    # and CamelCase and ...
    def set_subject(self, identifier):
        self.sub = identifier

    def set_object(self, identifier):
        self.obj = identifier

    def set_relationship(self, identifier):
        self.rel = identifier

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
        of the association.
        To be used in cases where an external association identifier
        should be used.

        :param assoc_id:

        :return:

        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return self.assoc_id

    def get_association_id(self):
        if self.assoc_id is None:
            self.set_association_id()

        return self.assoc_id

    def set_description(self, description):
        self.description = description

    def set_score(self, score, unit=None, score_type=None):
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        :param identifier:

        :return:

        """

        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        TODO we need to greatly expand this function!

        :param identifier:

        :return:

        """

        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

    def add_date(self, date):
        if date is not None and date.strip() != '':
            self.date += [date]

    def add_provenance(self, identifier):

        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

    @staticmethod
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association.
        Parts that are None are left out; the remaining ones are
        concatenated with '+' and that string is digested.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended before digesting.

        Note this is equivalent to a RDF blank node

        :param definedby: The (data) resource that provided the annotation
        :param sub: subject of the association
        :param pred: predicate of the association
        :param obj: object of the association
        :param attributes: optional extra values to include

        :return: the identifier, 'MONARCH:' followed by the digest

        """
        parts = [definedby, sub, pred, obj]
        if attributes is not None and len(attributes) > 0:
            parts.extend(attributes)

        digest_input = '+'.join([part for part in parts if part is not None])
        return 'MONARCH:' + GraphUtils.digest_id(digest_input)
Ejemplo n.º 22
0
class Dataset:
    """
     this will produce the metadata about a dataset
     following the example laid out here:
     http://htmlpreview.github.io/?
     https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
     (mind the wrap)

    """
    def __init__(
            self,
            identifier,  # name? should be Archive url via Source
            title,
            url,
            ingest_desc=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None):
        """
        Create the dataset graph and write the basic descriptive triples.

        :param identifier: identifier of the dataset; also used as the
            title when no title is given
        :param title: human readable title, may be None
        :param url: page describing the dataset (foaf:page), may be None
        :param ingest_desc: optional description of the ingest
        :param license_url: optional license URL (dcterms:license)
        :param data_rights: optional rights statement (dcterms:rights)
        :param graph_type: None, 'rdf_graph' or 'streamed_graph'
        :param file_handle: passed to StreamedGraph for 'streamed_graph'
        :raises ValueError: if graph_type is not one of the known values
        """

        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True,
                                       identifier,
                                       file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, identifier)
        else:
            # without this, self.graph stays unset and the failure shows up
            # later as an unrelated looking AttributeError
            raise ValueError(
                "unknown graph_type: {!r}".format(graph_type))

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # TODO: move hard coded curies to translation table calls
        self.identifier = identifier
        if title is None:
            self.title = identifier
        else:
            self.title = title
        self.version = None
        self.date_issued = None

        # The data_accesed value is later used as an literal of properties
        # such as dcterms:issued, which needs to conform xsd:dateTime format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license_url = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        # use the defaulted title so a missing title falls back to the
        # identifier instead of writing None
        self.graph.addTriple(
            self.identifier, 'dcterms:title', self.title, True)
        self.graph.addTriple(self.identifier, 'dcterms:identifier', identifier,
                             True)
        if url is not None:
            self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo  <uri>
        # TODO add the license info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(self.identifier, 'dcterms:license',
                                 license_url)
        else:
            LOG.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(self.identifier,
                                 'dcterms:rights',
                                 data_rights,
                                 object_is_literal=True)
        else:
            LOG.debug('No rights provided.')

        if ingest_desc is not None:
            self.model.addDescription(self.identifier, ingest_desc)

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        Records the issue date when one is given, then sets the version
        from version_id when given, otherwise from the date.
        When neither is given an error is logged and nothing is set.

        :param date_issued:
        :param version_id:
        :return:

        """

        if date_issued is None and version_id is None:
            LOG.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # set the version exactly once
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        LOG.info("set version to %s", self.version)

    def set_date_issued(self, date_issued):
        """
        Remember the issue date and write it as a dcterms:issued literal.

        :param date_issued:
        :return:
        """

        self.date_issued = date_issued
        self.graph.addTriple(self.identifier,
                             'dcterms:issued',
                             date_issued,
                             object_is_literal=True)
        LOG.info("setting date to %s", date_issued)

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return:
        """

        if date_issued is not None:
            dat = date_issued
        elif self.date_issued is not None:
            dat = self.date_issued
        else:
            dat = self.date_accessed
            LOG.info(
                "No date supplied, using download timestamp for date_issued")
        LOG.info("setting version by date to: %s", dat)
        self.set_version_by_num(dat)

    def set_version_by_num(self, version_num):
        """
        Set the version to the dataset identifier followed by version_num
        and write the dcterms:isVersionOf / pav:version triples for it.

        When version_num differs from the access timestamp, a second
        version node keyed on that timestamp is written as well.

        :param version_num: version string appended to the identifier
        :return:
        """

        self.version = self.identifier + version_num
        self.graph.addTriple(self.version, 'dcterms:isVersionOf',
                             self.identifier)
        self.graph.addTriple(self.version,
                             'pav:version',
                             version_num,
                             object_is_literal=True)

        LOG.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(dipperized_version, 'dcterms:isVersionOf',
                                 "MonarchData:" + self.identifier +
                                 ".ttl")  # fix suffix
            self.graph.addTriple(dipperized_version,
                                 'pav:version',
                                 self.date_accessed,
                                 object_is_literal=True)
            self.graph.addTriple(dipperized_version,
                                 'dcterms:issued',
                                 self.date_accessed,
                                 object_is_literal=True,
                                 literal_type="xsd:dateTime")

    def setFileAccessUrl(self, url, is_object_literal=False):
        """
        Write a dcat:accessURL triple for the dataset.

        :param url: the access URL
        :param is_object_literal: passed on to addTriple as the
            literal flag for the object
        """
        self.graph.addTriple(self.identifier, 'dcat:accessURL', url,
                             is_object_literal)

    def getGraph(self):
        """
        :return: the graph holding the dataset description
        """
        return self.graph

    def set_license(self, license_url):
        """
        Store the license URL on the object (no triple is written here).

        :param license_url:
        """
        self.license_url = license_url

    def get_license(self):
        """
        :return: the stored license URL (may be None)
        """
        return self.license_url

    def set_citation(self, citation_id):
        """
        Add a citation identifier to the set of citations kept on the
        object (no triple is written yet).

        :param citation_id:
        """

        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
Ejemplo n.º 23
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """

    ref_types = {
        'person': 'foaf:Person',
        'journal_article': 'IAO:0000013',
        'publication': 'IAO:0000311',  # book
        'document': 'IAO:0000310',  # document???
        'photograph': 'IAO:0000185',
        'webpage': 'SIO:000302',
    }

    annotation_properties = {
        'page': 'foaf:page',
        'title': 'dc:title'
    }

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph the reference will be written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as the reference url
        :param ref_type: type of the reference,
            defaults to ref_types['document']
        :raises ValueError: if graph is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)

        if ref_type is None:
            self.ref_type = self.ref_types['document']
        else:
            self.ref_type = ref_type

        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id

    def setTitle(self, title):
        """
        :param title: title of the reference
        """
        self.title = title

    def setYear(self, year):
        """
        :param year: year of the reference
        """

        self.year = year

    def setType(self, reference_type):
        """
        :param reference_type: type to use in place of the current one
        """

        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """

        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author to the author list.

        :param author: the author to append
        """

        # the list starts out as None until the first author arrives
        if self.author_list is None:
            self.author_list = []
        self.author_list.append(author)

    def setShortCitation(self, citation):
        """
        :param citation: short citation, used as the label when set
        """
        self.short_citation = citation

    def addPage(self, subject_id, page_url):
        """
        Write a foaf:page literal for subject_id.

        :param subject_id:
        :param page_url:
        """
        self.graph.addTriple(
            subject_id, self.annotation_properties['page'],
            page_url, object_is_literal=True)

    def addTitle(self, subject_id, title):
        """
        Write a dc:title literal for subject_id.

        :param subject_id:
        :param title:
        """
        self.graph.addTriple(
            subject_id, self.annotation_properties['title'],
            title, object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation if set, otherwise the title.
        A reference with a url is typed and labelled on that url;
        otherwise it is added as an individual under its ref_id.
        An error is logged when neither is available.
        """

        n = self.short_citation
        if n is None:
            n = self.title

        if self.ref_url is not None:
            # same guard as in the ref_id branch: no title, no title triple
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            self.model.addLabel(self.ref_url, n)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, n, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
Ejemplo n.º 24
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.

    """

    def __init__(
            self,
            graph,
            feature_id=None,
            label=None,
            feature_type=None,
            description=None,
            feature_category=None
    ):
        """
        :param graph: the Graph the feature will be written to
        :param feature_id: identifier of the feature
        :param label: label of the feature
        :param feature_type: type of the feature
        :param description: description of the feature
        :param feature_category: category used for the feature's triples
        :raises ValueError: if graph is not a Graph
        """

        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gfxutl = GraphUtils(self.curie_map)
        self.fid = feature_id
        self.feature_category = feature_category
        self.label = label
        self.ftype = feature_type
        self.description = description
        self.start = None
        self.stop = None
        self.taxon = None

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand, position_types)

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """

        self.stop = self._getLocation(coordinate, reference_id, strand, position_types)

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return: the location dict

        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        # an explicitly empty list of types gets the generic Position
        if position_types == []:
            loc['type'].append(self.globaltt['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        Map a strand symbol ('+', '-', '.') to its translation table term.
        :param strand:
        :return: the mapped term, or None when the strand is None
            or could not be mapped (the latter is logged)
        """
        strand_id = None
        if strand == '+':
            strand_id = self.globaltt['plus_strand']
        elif strand == '-':
            strand_id = self.globaltt['minus_strand']
        elif strand == '.':
            strand_id = self.globaltt['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            LOG.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False,
            feature_category=None):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param add_region [True]
        :param region_id [None]
        :param feature_as_class [False]
        :param feature_category: a biolink category CURIE for feature
        """

        if feature_category is None:
            feature_category = self.feature_category

        if feature_as_class:
            self.model.addClassToGraph(
                self.fid, self.label, self.ftype, self.description,
                class_category=feature_category)
        else:
            self.model.addIndividualToGraph(
                self.fid, self.label, self.ftype, self.description,
                ind_category=feature_category)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                # Only one of start/stop may be known at this point,
                # so take the reference from whichever one is set.
                anchor = self.start if self.start is not None else self.stop
                regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(self.start['type'])
                if self.stop is not None and self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    # NOTE(review): this replaces a known start strand with
                    # the stop's and never consults the stop when the start
                    # strand is unknown; possibly meant `is None` - confirm.
                    # Left as is because it affects the generated region id.
                    if strand is not None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                # blank node, bnode
                rid = rid + "-Region"
                curie = '_:' + self.gfxutl.digest_id(rid)
                self.model.addLabel(curie, rid)
                region_id = curie

            self.graph.addTriple(
                self.fid,
                self.globaltt['location'],
                region_id,
                subject_category=feature_category
            )
            self.model.addIndividualToGraph(region_id, None, self.globaltt['Region'])
        else:
            region_id = self.fid
            self.model.addType(region_id, self.globaltt['region'])

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(
                self.start['reference'], self.start['coordinate'], self.start['type'])
            self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'], self.start['type'],
            )

        if self.stop is not None:
            endp = self._makePositionId(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])
            self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        Look for one of the strand terms in a list of position types.
        :param tylist: list of position types
        :return: 'plus', 'minus' or 'both', else None
        """
        strand = None
        if self.globaltt['plus_strand'] in tylist:
            strand = 'plus'
        elif self.globaltt['minus_strand'] in tylist:
            strand = 'minus'
        elif self.globaltt['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: bnode_curie, or None when there is no reference
        """
        # blank node, bnode
        if reference is None:
            LOG.error("Trying to make position with no reference.")
            return None

        reference = re.sub(r'\w+\:', '', reference, 1)
        # startswith, so that an empty remainder does not raise
        if reference.startswith('_'):
            # in this case the reference is a bnode curie as well
            # ... this is a bad smell of over modleing
            reference = reference[1:]
        unique_words = reference
        if coordinate is not None:
            # just in case it isn't a string already
            unique_words = '-'.join((unique_words, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                unique_words = '-'.join((unique_words, tstring))

        curie = '_:' + self.gfxutl.digest_id(unique_words)

        # attach the wordage via a label
        # I want to see more of this (TEC 201905)
        # including a type should be mandatory as well
        self.model.addLabel(curie, unique_words)
        return curie

    def addRegionPositionToGraph(self, region_id, begin_position_id, end_position_id):
        """
        Attach the begin and end positions to the region;
        a position id of None is skipped.
        :param region_id:
        :param begin_position_id:
        :param end_position_id:
        """

        if begin_position_id is None:
            pass
            # LOG.warn("No begin position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['begin'], begin_position_id)

        if end_position_id is None:
            pass
            # LOG.warn("No end position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['end'], end_position_id)

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None
    ):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id,
                self.globaltt['position'],
                position,
                object_is_literal=True,
                literal_type="xsd:integer"
            )
        self.graph.addTriple(
            pos_id, self.globaltt['reference'], reference_id
        )
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        strnd = None
        if strand is not None:
            strnd = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                strnd = self._getStrandType(strand)
        # else:
        #    strnd = self.globaltt['both_strand']
        if strnd is None and (position_types is None or position_types == []):
            strnd = self.globaltt['Position']

        if strnd is not None:
            self.model.addType(pos_id, strnd)

        return pos_id

    def addSubsequenceOfFeature(
            self, parentid, subject_category=None, object_category=None
    ):
        """
        This will add reciprocal triples like:
        feature <is subsequence of> parent
        parent has_subsequence feature
        :param parentid:
        :param subject_category: category for the feature
        :param object_category: category for the parent

        :return:

        """
        self.graph.addTriple(
            self.fid,
            self.globaltt['is subsequence of'],
            parentid,
            subject_category=subject_category,
            object_category=object_category
        )
        # this should be expected to be done in reasoning not ETL
        self.graph.addTriple(
            parentid,
            self.globaltt['has subsequence'],
            self.fid,
            subject_category=object_category,
            object_category=subject_category
        )

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param taxonid:
        :return:
        """
        self.taxon = taxonid
        self.graph.addTriple(
            self.fid,
            self.globaltt['in taxon'],
            self.taxon,
            subject_category=self.feature_category
        )

    def addFeatureProperty(self, property_type, feature_property):
        """
        Add the triple: feature property_type feature_property
        :param property_type: the predicate
        :param feature_property: the object
        """

        self.graph.addTriple(
            self.fid,
            property_type,
            feature_property,
            subject_category=self.feature_category
        )
Ejemplo n.º 25
0
    def _parse_g2p_file(self, limit=None):
        """
        Parse gene to XPO file, currently custom for Monarch

        For every data row the gene and its taxon are added, and a
        G2PAssoc between the gene and the phenotype is written, with the
        evidence and source attached when the row provides them.

        :param limit: stop once the reader has passed this many lines
            (ignored in test mode or when None)
        :return:
        """
        src_key = 'g2p_assertions'
        geno = Genotype(self.graph)
        model = Model(self.graph)

        columns = self.files[src_key]['columns']
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))

        # the column positions do not change from row to row,
        # so look them up once instead of once per field per row
        subject_col = columns.index('SUBJECT')
        subject_label_col = columns.index('SUBJECT_LABEL')
        subject_taxon_col = columns.index('SUBJECT_TAXON')
        object_col = columns.index('OBJECT')
        relation_col = columns.index('RELATION')
        evidence_col = columns.index('EVIDENCE')
        source_col = columns.index('SOURCE')
        # columns present in the file but not used here:
        # SUBJECT_TAXON_LABEL, OBJECT_LABEL, RELATION_LABEL,
        # EVIDENCE_LABEL, IS_DEFINED_BY, QUALIFIER

        LOG.info("Processing Gene to XPO associations")

        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile)

            # File has headers
            row = next(reader)
            if not self.check_fileheader(columns, row):
                # keep going as before, but leave a trace of the mismatch
                LOG.warning("Unexpected header in %s", raw)

            for row in reader:

                gene = row[subject_col]
                gene_label = row[subject_label_col]
                gene_taxon = row[subject_taxon_col]
                phenotype_curie = row[object_col]
                relation = row[relation_col]
                evidence = row[evidence_col]
                source = row[source_col]

                gene_curie = 'Xenbase:' + gene
                relation_curie = relation.replace('_', ':')

                geno.addGene(gene_curie, gene_label)
                geno.addTaxon(gene_taxon, gene_curie)

                assoc = G2PAssoc(
                    self.graph,
                    self.name,
                    entity_id=gene_curie,
                    phenotype_id=phenotype_curie,
                    rel=relation_curie
                )

                if evidence:
                    assoc.add_evidence(evidence)

                if source:
                    model.addType(source, self.globaltt['journal article'])
                    assoc.add_source(source)

                assoc.add_association_to_graph()

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Ejemplo n.º 26
0
    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms

        :param limit: stop once more than this many lines of the
            selected species were seen (ignored in test mode or when None)
        :return:

        """

        LOG.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))

        # TODO align this species filter with the one above
        # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')
        speciesfilters = set('Homo sapiens,Mus musculus'.split(','))

        # TODO make these filters available as commandline options
        # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
        #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
        geneidtypefilters = set(
            'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC,WormBase,XenBase,FlyBase'.split(
                ','))
        # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

        # the context manager closes the archive even if a line fails to parse
        with ZipFile(f, 'r') as myzip:
            # assume that the first entry is the item
            fname = myzip.namelist()[0]
            foundheader = False

            with myzip.open(fname, 'r') as csvfile:
                for line in csvfile:
                    # skip header lines
                    if not foundheader:
                        if re.match(r'BIOGRID_ID', line.decode()):
                            foundheader = True
                        continue

                    line = line.decode().strip()
                    # BIOGRID_ID
                    # IDENTIFIER_VALUE
                    # IDENTIFIER_TYPE
                    # ORGANISM_OFFICIAL_NAME
                    # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                    (biogrid_num, id_num, id_type,
                     organism_label) = line.split('\t')

                    if self.test_mode:
                        graph = self.testgraph
                        # skip any genes that don't match our test set
                        if int(biogrid_num) not in self.biogrid_ids:
                            continue
                    else:
                        graph = self.graph

                    model = Model(graph)

                    # for each one of these,
                    # create the node and add equivalent classes
                    biogrid_id = 'BIOGRID:' + biogrid_num
                    prefix = self.localtt[id_type]

                    if organism_label.strip() in speciesfilters:
                        line_counter += 1
                        if prefix in geneidtypefilters:
                            mapped_id = ':'.join((prefix, id_num))
                            model.addEquivalentClass(biogrid_id, mapped_id)
                        # this symbol will only get attached to the biogrid class
                        elif id_type == 'OFFICIAL_SYMBOL':
                            model.addLabel(biogrid_id, id_num)
                            model.addType(biogrid_id, self.globaltt['gene'])

                        # elif (id_type == 'SYNONYM'):
                        #   FIXME - i am not sure these are synonyms, altids?
                        #   gu.addSynonym(g,biogrid_id,id_num)

                    if not self.test_mode and limit is not None \
                            and line_counter > limit:
                        break

        return
Ejemplo n.º 27
0
class Dataset:
    """
    Produces the metadata about a dataset,
    following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap)

    The metadata is written as triples into ``self.graph``; the dataset
    itself is identified by ``':' + identifier``.

    """
    def __init__(self,
                 identifier,
                 title,
                 url,
                 description=None,
                 license_url=None,
                 data_rights=None,
                 graph_type=None,
                 file_handle=None):
        """
        Create the dataset node and add its basic descriptive triples.

        :param identifier: local id of the dataset (string); it is stored
            with a leading ':' as the dataset's subject identifier
        :param title: literal used for dct:title
        :param url: object of the foaf:page triple
        :param description: optional description passed to the model
        :param license_url: optional object of the dct:license triple
        :param data_rights: optional literal used for dct:rights
        :param graph_type: None or 'rdf_graph' for an RDFGraph,
            'streamed_graph' for a StreamedGraph
        :param file_handle: handed to StreamedGraph when
            graph_type == 'streamed_graph'
        :raises ValueError: if graph_type is not one of the supported values

        """
        if graph_type is None or graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        else:
            # Previously an unknown graph_type left self.graph unset and
            # failed later with an unrelated AttributeError.
            raise ValueError(
                "Unsupported graph_type {!r}; expected None, 'rdf_graph' "
                "or 'streamed_graph'".format(graph_type))
        self.model = Model(self.graph)
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None

        # The date_accessed value is later used as an object literal of
        # properties such as dct:issued,
        # which needs to conform to the xsd:dateTime format.
        # self.date_accessed = datetime.now().strftime('%Y-%m-%d-%H-%M')
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(self.identifier,
                             'dct:identifier',
                             identifier,
                             object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(self.identifier,
                                 'dct:rights',
                                 data_rights,
                                 object_is_literal=True)
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.model.addDescription(self.identifier, description)
        return

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        Records the issue date (when given) and then sets the version:
        from version_id when it is given, otherwise from the date.
        When neither argument is given an error is logged and nothing
        is added.

        :param date_issued: issue date, or None
        :param version_id: explicit version string, or None
        :return:

        """

        if date_issued is None and version_id is None:
            logger.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # Set the version exactly once; the version triples used to be
        # added twice when only version_id was supplied.
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)

        return

    def set_date_issued(self, date_issued):
        """
        Remember the issue date and add it as a dct:issued literal.

        :param date_issued:
        :return:
        """

        self.date_issued = date_issued
        self.graph.addTriple(self.identifier,
                             'dct:issued',
                             date_issued,
                             object_is_literal=True)
        logger.info("setting date to %s", date_issued)

        return

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return:
        """

        if date_issued is not None:
            d = date_issued
        elif self.date_issued is not None:
            d = self.date_issued
        else:
            d = self.date_accessed
            logger.info("No date supplied for setting version; "
                        "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(d)

        return

    def set_version_by_num(self, version_num):
        """
        Set the version identifier to the dataset identifier followed by
        version_num, and add the dct:isVersionOf / pav:version triples.

        Unless version_num equals the access timestamp, a second version
        node named after the access timestamp is added as a version of
        the first one.

        :param version_num: version string (concatenated to the identifier)
        :return:
        """

        self.version = self.identifier + version_num
        self.graph.addTriple(self.version, 'dct:isVersionOf', self.identifier)
        self.graph.addTriple(self.version,
                             'pav:version',
                             version_num,
                             object_is_literal=True)

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(dipperized_version, 'dct:isVersionOf',
                                 self.version)
            self.graph.addTriple(dipperized_version,
                                 'pav:version',
                                 self.date_accessed,
                                 object_is_literal=True)
            self.graph.addTriple(dipperized_version,
                                 'dct:issued',
                                 self.date_accessed,
                                 object_is_literal=True,
                                 literal_type="xsd:dateTime")
        return

    def setFileAccessUrl(self, url, is_object_literal=False):
        """
        Add a dcat:accessURL triple for this dataset.

        :param url:
        :param is_object_literal: forwarded to addTriple as its
            fourth positional argument
        :return:
        """
        self.graph.addTriple(self.identifier, 'dcat:accessURL', url,
                             is_object_literal)

    def getGraph(self):
        """Return the graph holding the dataset metadata."""
        return self.graph

    def set_license(self, license):
        """
        Store the license on the instance.
        Note that this does not add a triple to the graph.
        """
        self.license = license
        return

    def get_license(self):
        """Return the stored license (None when none was set)."""

        return self.license

    def set_citation(self, citation_id):
        """
        Add citation_id to the set of citations kept on the instance;
        no triple is added yet (see TODO).
        """

        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)

        return
Ejemplo n.º 28
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.

    """

    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        'is_about': 'IAO:0000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        'has_staining_intensity': 'GENO:0000207',
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'
    }

    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    # merged lookup of every property kind above
    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        'Position': 'faldo:Position',
        # big P for Position type.  little p for position property
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        'score': 'SO:0001685',
        # FIXME - score is not a good solution, too generic
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',
        'haplotype': 'GENO:0000871',

        # the following are sequence attributes:
        'band_intensity':  'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self, graph, feature_id=None, label=None,
                 feature_type=None, description=None):
        """
        :param graph: the Graph instance the feature is written to
        :param feature_id: identifier of the feature
        :param label:
        :param feature_type: type identifier, e.g. one of Feature.types
        :param description:
        :raises ValueError: if graph is not a Graph instance

        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # was `"...".graph`, which raised AttributeError instead
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.id = feature_id
        self.label = label
        self.type = feature_type
        self.description = description
        self.start = None
        self.stop = None
        # filled in by addTaxonToFeature()
        self.taxon = None
        return

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand,
                                       position_types)

        return

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        self.stop = self._getLocation(coordinate, reference_id, strand,
                                      position_types)

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, type = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return: the location dict

        """

        loc = dict()
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        # an explicitly empty list of position types
        # gets the generic Position type
        if position_types == []:
            loc['type'].append(self.types['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        Map a strand symbol ('+', '-', '.') to its faldo position type.
        :param strand:
        :return: the type id, or None when strand is None or unmapped
        """

        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, add_region=True, region_id=None,
            feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param add_region: when True (and a start or end location is set)
            a separate region node is created and linked with faldo:location;
            otherwise the feature itself is typed as the region
        :param region_id: identifier to use for the region;
            generated from the locations when None
        :param feature_as_class: add the feature as a class
            instead of an individual

        :return:

        """

        if feature_as_class:
            self.model.addClassToGraph(self.id, self.label, self.type,
                                       self.description)
        else:
            self.model.addIndividualToGraph(self.id, self.label, self.type,
                                            self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                region_id = self._makeRegionId()

            self.graph.addTriple(self.id, self.properties['location'],
                                 region_id)
            self.model.addIndividualToGraph(region_id, None, 'faldo:Region')
        else:
            region_id = self.id
            self.model.addType(region_id, 'faldo:Region')

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(self.start['reference'],
                                          self.start['coordinate'],
                                          self.start['type'])
            self.addPositionToGraph(self.start['reference'],
                                    self.start['coordinate'],
                                    self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(self.stop['reference'],
                                        self.stop['coordinate'],
                                        self.stop['type'])
            self.addPositionToGraph(self.stop['reference'],
                                    self.stop['coordinate'],
                                    self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

        return

    def _makeRegionId(self):
        """
        Build the blank-node id of the region spanned by start/stop, as
        '_:<reference>-<start>-<stop>[-<strand>]-Region'.
        At least one of self.start / self.stop must be set.
        :return: the region id
        """
        # The reference is taken from the start location when there is one,
        # else from the stop location (a feature with only an end location
        # used to fail here).
        anchor = self.start if self.start is not None else self.stop
        regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])

        # in case the values are undefined
        # if we know only one of the coordinates,
        # then we'll add an "unknown" other.
        st = sp = 'UN'
        strand = None
        if self.start is not None and \
                self.start['coordinate'] is not None:
            st = str(self.start['coordinate'])
            strand = self._getStrandStringFromPositionTypes(
                self.start['type'])
        if self.stop is not None and\
                self.stop['coordinate'] is not None:
            sp = str(self.stop['coordinate'])
            # NOTE(review): the stop strand is only consulted when a start
            # strand was found, so a strand known only from the stop is
            # dropped. Kept as-is because it affects the generated ids;
            # confirm whether `is None` was intended.
            if strand is not None:
                strand = self._getStrandStringFromPositionTypes(
                    self.stop['type'])
        # assume that the strand is the same for both start and stop.
        # this will need to be fixed in the future
        region_items = [regionchr, st, sp]
        if strand is not None:
            region_items += [strand]
        rid = '-'.join(region_items)
        rid = re.sub(r'\w+\:', '', rid, count=1)  # replace the id prefix
        return '_:' + rid + "-Region"

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        :param tylist: collection of position type ids
        :return: 'plus', 'minus' or 'both' for the first strand type
            found in tylist (checked in that order), else None
        """
        strand = None
        if self.types['plus_strand'] in tylist:
            strand = 'plus'
        elif self.types['minus_strand'] in tylist:
            strand = 'minus'
        elif self.types['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: blank-node id '_:<reference>[-<coordinate>][-<strand>]',
            or None (after logging an error) when reference is None
        """

        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        curie = '_:'
        # strip the leading id prefix (count passed by keyword; the
        # positional form is deprecated in recent Python versions)
        reference = re.sub(r'\w+\:', '', reference, count=1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        curie += reference
        if coordinate is not None:
            # just in case it isn't a string already
            curie = '-'.join((curie, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                curie = '-'.join((curie, tstring))

        return curie

    def addRegionPositionToGraph(
            self, region_id, begin_position_id,
            end_position_id):
        """
        Link the region to its begin and/or end position;
        a position id of None is silently skipped.
        :param region_id:
        :param begin_position_id:
        :param end_position_id:
        :return:
        """

        if begin_position_id is None:
            pass
            # logger.warn(
            #   "No begin position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.properties['begin'],
                                 begin_position_id)

        if end_position_id is None:
            pass
            # logger.warn("No end position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.properties['end'],
                                 end_position_id)

        return

    def addPositionToGraph(
            self, reference_id, position,
            position_types=None, strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(pos_id, self.properties['position'],
                                 position, object_is_literal=True,
                                 literal_type="xsd:integer")
        self.graph.addTriple(
            pos_id, self.properties['reference'], reference_id)
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        s = None
        if strand is not None:
            s = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                s = self._getStrandType(strand)
        # else:
        #    s = self.types['both_strand']
        if s is None and (position_types is None or position_types == []):
            s = self.types['Position']

        if s is not None:
            self.model.addType(pos_id, s)

        return pos_id

    def addSubsequenceOfFeature(self, parentid):
        """
        This will add reciprocal triples like:
        feature is_subsequence_of parent
        parent has_subsequence feature
        :param parentid:

        :return:

        """
        self.graph.addTriple(
            self.id, self.properties['is_subsequence_of'], parentid)
        self.graph.addTriple(
            parentid, self.properties['has_subsequence'], self.id)

        return

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param taxonid:
        :return:
        """
        self.taxon = taxonid
        self.graph.addTriple(
            self.id, Assoc.properties['in_taxon'], self.taxon)

        return

    def addFeatureProperty(self, property_type, property):
        """
        Add the triple: feature property_type property
        :param property_type: predicate
        :param property: object
        :return:
        """
        self.graph.addTriple(self.id, property_type, property)
        return
Ejemplo n.º 29
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.

    """

    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        'is_about': 'IAO:0000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        'has_staining_intensity': 'GENO:0000207',
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'
    }

    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        'Position': 'faldo:Position',
        # big P for Position type.  little p for position property
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        'score': 'SO:0001685',
        # FIXME - score is not a good solution, too generic
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',
        'haplotype': 'GENO:0000871',

        # the following are sequence attributes:
        'band_intensity': 'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self,
                 graph,
                 feature_id=None,
                 label=None,
                 feature_type=None,
                 description=None):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.model = Model(self.graph)
        self.id = feature_id
        self.label = label
        self.type = feature_type
        self.description = description
        self.start = None
        self.stop = None
        return

    def addFeatureStartLocation(self,
                                coordinate,
                                reference_id,
                                strand=None,
                                position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand,
                                       position_types)

        return

    def addFeatureEndLocation(self,
                              coordinate,
                              reference_id,
                              strand=None,
                              position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:

        :return:

        """

        self.stop = self._getLocation(coordinate, reference_id, strand,
                                      position_types)

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        loc = dict()
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.types['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        :param strand:
        :return:
        """

        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(self,
                          add_region=True,
                          region_id=None,
                          feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param graph:

        :return:

        """

        if feature_as_class:
            self.model.addClassToGraph(self.id, self.label, self.type,
                                       self.description)
        else:
            self.model.addIndividualToGraph(self.id, self.label, self.type,
                                            self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            regionchr = re.sub(r'\w+\:_?', '', self.start['reference'])
            if region_id is None:
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and \
                        self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(
                        self.start['type'])
                if self.stop is not None and\
                        self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    if strand is not None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                rid = '_:' + rid + "-Region"
                region_id = rid

            self.graph.addTriple(self.id, self.properties['location'],
                                 region_id)
            self.model.addIndividualToGraph(region_id, None, 'faldo:Region')
        else:
            region_id = self.id
            self.model.addType(region_id, 'faldo:Region')

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(self.start['reference'],
                                          self.start['coordinate'],
                                          self.start['type'])
            self.addPositionToGraph(self.start['reference'],
                                    self.start['coordinate'],
                                    self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(self.stop['reference'],
                                        self.stop['coordinate'],
                                        self.stop['type'])
            self.addPositionToGraph(self.stop['reference'],
                                    self.stop['coordinate'], self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

        return

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        Translate a collection of position types into a short strand name.

        The plus, minus and both strand types registered in self.types are
        checked in that order; the first one present in tylist wins.

        :param tylist: collection of position type identifiers
        :return: 'plus', 'minus' or 'both'; None when none of the three
            strand types is listed
        """
        strand_names = (
            ('plus_strand', 'plus'),
            ('minus_strand', 'minus'),
            ('both_strand', 'both'),
        )
        for type_key, strand_name in strand_names:
            if self.types[type_key] in tylist:
                return strand_name

        # it may be stranded, but we don't know what it is
        return None

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Build the blank-node identifier for a position.

        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.

        The identifier is '_:' followed by the reference with its first
        "prefix:" removed, then, joined by '-', the coordinate (if any)
        and the strand name derived from the types (if any).

        :param reference: identifier of the sequence the position is on
        :param coordinate: numeric position; omitted from the id when None
        :param types: optional collection of position types, used only to
            derive a strand name for the id
        :return: the position identifier, or None (after logging an error)
            when reference is None
        """

        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        # strip the first "prefix:" found; count is passed by keyword because
        # passing it positionally is deprecated as of Python 3.13
        local_ref = re.sub(r'\w+\:', '', reference, count=1)
        if local_ref.startswith('_'):
            # this is in the case if the reference is a bnode
            local_ref = local_ref[1:]

        parts = ['_:' + local_ref]
        if coordinate is not None:
            # just in case it isn't a string already
            parts.append(str(coordinate))
        if types is not None:
            strand_name = self._getStrandStringFromPositionTypes(types)
            if strand_name is not None:
                parts.append(strand_name)

        return '-'.join(parts)

    def addRegionPositionToGraph(self, region_id, begin_position_id,
                                 end_position_id):
        """
        Link a region to its begin and end positions.

        A triple is added for each boundary that is not None; a missing
        boundary is skipped silently.

        :param region_id: identifier of the region
        :param begin_position_id: identifier of the begin position, or None
        :param end_position_id: identifier of the end position, or None
        :return: None
        """
        boundaries = (
            ('begin', begin_position_id),
            ('end', end_position_id),
        )
        for property_key, position_id in boundaries:
            if position_id is None:
                # no position specified for this end of the region
                continue
            self.graph.addTriple(
                region_id, self.properties[property_key], position_id)

    def addPositionToGraph(self,
                           reference_id,
                           position,
                           position_types=None,
                           strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        When neither a strand nor any position types are given,
        the position is typed as a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param reference_id: identifier of the sequence the position is on
        :param position: numeric position; no faldo:position triple is
            written when None
        :param position_types: optional collection of types to give the
            position
        :param strand: optional strand; a value that does not start with
            'faldo' is mapped through self._getStrandType first

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)

        if position is not None:
            self.graph.addTriple(
                pos_id, self.properties['position'], position,
                object_is_literal=True, literal_type="xsd:integer")
        self.graph.addTriple(
            pos_id, self.properties['reference'], reference_id)

        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)

        # work out the strand type, if a strand was supplied at all
        if strand is None:
            strand_type = None
        elif re.match(r'faldo', strand):
            # already a faldo type
            strand_type = strand
        else:
            # not already mapped to faldo, so expect we need to map it
            strand_type = self._getStrandType(strand)

        untyped = position_types is None or position_types == []
        if strand_type is None and untyped:
            strand_type = self.types['Position']

        if strand_type is not None:
            self.model.addType(pos_id, strand_type)

        return pos_id

    def addSubsequenceOfFeature(self, parentid):
        """
        Relate this feature to a parent in both directions:
        feature is_subsequence_of parent
        parent has_subsequence feature

        :param parentid: identifier of the parent feature

        :return: None

        """
        reciprocal_triples = (
            (self.id, 'is_subsequence_of', parentid),
            (parentid, 'has_subsequence', self.id),
        )
        for subject_id, property_key, object_id in reciprocal_triples:
            self.graph.addTriple(
                subject_id, self.properties[property_key], object_id)

    def addTaxonToFeature(self, taxonid):
        """
        Record the taxon on this feature and add the triple:
        feature in_taxon taxonid

        :param taxonid: identifier of the taxon; also stored as self.taxon
        :return: None
        """
        # TEC: should taxon be set in __init__()?
        self.taxon = taxonid
        self.graph.addTriple(
            self.id,
            Assoc.properties['in_taxon'],
            self.taxon)

    def addFeatureProperty(self, property_type, property):
        """
        Add the triple: feature property_type property

        :param property_type: predicate to use
        :param property: object of the triple
        :return: None
        """
        feature_id = self.id
        self.graph.addTriple(feature_id, property_type, property)
Ejemplo n.º 30
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label;
        otherwise the title (if any) is used.
        We may wish to subclass this later.

    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph the reference will be written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as the reference's url
        :param ref_type: type of the reference; defaults to the global
            term for 'document'
        :raises ValueError: if graph is not a Graph
        """
        if not isinstance(graph, Graph):
            # exceptions do not interpolate logging-style arguments,
            # so the message has to be formatted here
            raise ValueError("%s is not a graph" % (graph,))
        self.graph = graph

        # assert ref_id is not None

        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            # only IAO: and SIO: prefixed types pass without a warning
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of:  %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

    def setTitle(self, title):
        """Set the title of the reference."""
        self.title = title

    def setYear(self, year):
        """Set the year of the reference (stored; not yet written out)."""
        self.year = year

    def setType(self, reference_type):
        """Override the type the reference will be given in the graph."""
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author to the author list,
        starting a new list if none has been set yet.

        :param author: the author to add
        :return:
        """
        # author_list starts out as None; appending to it directly
        # would raise a TypeError
        if self.author_list is None:
            self.author_list = []
        self.author_list.append(author)

    def setShortCitation(self, citation):
        """Set the short citation, which is preferred as the label."""
        self.short_citation = citation

    def addPage(self, subject_id, page_url):
        """
        Add page_url, as a literal, as the page of subject_id.

        :param subject_id: identifier the page belongs to
        :param page_url: url of the page
        :return:
        """
        self.graph.addTriple(
            subject_id, self.globaltt['page'],  # foaf:page  not  <sio:web page>
            page_url, object_is_literal=True)

    def addTitle(self, subject_id, title):
        """
        Add title, as a literal, to subject_id.
        Nothing is added when the title is None or empty.

        :param subject_id: identifier the title belongs to
        :param title: the title text
        :return:
        """
        if title is not None and title != '':
            self.graph.addTriple(
                subject_id, self.globaltt['title (dce)'], title,
                object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation, falling back to the title.
        A reference with a url is typed, titled and labelled on that url;
        otherwise the reference id is added as a typed individual.
        An error is logged when the reference has neither.

        :return:
        """
        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for auth in self.author_list:
        #        gu.addTriple(
        #           graph, self.ref_id, self.props['has_author'], auth, True)
Ejemplo n.º 31
0
    def _process_data(self, source, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param source: key into self.files naming the file to process
        :param limit: when not None (and not in test mode), stop once the
            line counter exceeds this value
        :return:
        :raises AssertionError: if the header row of the file differs from
            the expected columns (an empty file included)

        """
        raw = '/'.join((self.rawdir, self.files[source]['file']))

        LOG.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[source]['columns']
        # affords access with
        # x = row[col.index('x')].strip()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats
            # (an empty file has no header and is reported the same way)
            fileheader = next(filereader, None)
            fileheader = [c.lower() for c in fileheader or []]
            if col != fileheader:  # assert
                LOG.error('Expected  %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning('Expected %i values but find %i in  row %i',
                                len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,Homo sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning('Novel Affected status  %s at row: %i of %s',
                                affected, line_counter, raw)
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join(
                        (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join(
                        (patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(cell_line_id, line_label,
                                           cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(equiv_cell_line, None,
                                               cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                # this would give a BNode that is an instance of Age.
                # but i don't know how to connect
                # the age node to the cell line? we need to ask @mbrush
                # age_id = '_'+re.sub('\s+','_',age)
                # gu.addIndividualToGraph(
                #   graph,age_id,age,self.globaltt['age'])
                # gu.addTriple(
                #   graph,age_id,self.globaltt['has measurement value'],age,
                #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(
                        ('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(family_comp_id, family_label,
                                               self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species is None or species == '':
                    species = 'Homo sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                # will hold gene/locus id to variant list
                omim_map = {}
                gvc_id = None
                gvc_label = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:' + re.sub('MONARCH:', '',
                                                 self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(graph, karyotype_feature_id,
                                       karyotype_feature_label,
                                       self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(karyotype_feature_id, karyotype_id,
                                      self.globaltt['has_variant_part'])

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                # the variant label must be rebuilt for every row: when it
                # was only assigned for rows with a gene, rows without one
                # either failed on the unbound name or silently reused the
                # label of an earlier row.  It may be empty.
                if gene != '':
                    vl = gene + '(' + mutation + ')'
                else:
                    vl = mutation

                # fix the variant_id so it's always in the same order
                variant_id = row[col.index('variant_id')].strip()
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                # evaluated once per row; it is needed several times below
                normal_karyotype = self._is_normal_karyotype(karyotype)

                if karyotype.strip() != '' and not normal_karyotype:

                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((vl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = vl
                else:
                    # wildtype?
                    pass

                # add the (abnormal) karyotype to the gvc.
                # a normal karyotype is added to the genotype as a
                # reference part further down instead
                if karyotype_id is not None \
                        and not normal_karyotype \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id,
                                  self.globaltt['has_variant_part'])

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var.strip())
                        if mch is None:
                            # without a locus number there is nothing to
                            # file the variant under; skip it rather than
                            # fail or repeat the previous variant
                            LOG.warning(
                                'Unrecognized variant id "%s" at row: %i of %s',
                                var, line_counter, raw)
                            continue
                        (locus_num, var_num) = mch.groups()
                        omim_map.setdefault(locus_num, []).append(var_num)

                    for omim in omim_map:
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in omim_map.get(omim)])
                        # an empty label is passed on as "no label"
                        vslc_label = vl or None
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in omim_map.get(omim):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    # an empty label is passed on as "no label"
                    model.addIndividualToGraph(
                        gvc_id, gvc_label or None,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None and normal_karyotype:
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        elif karyotype is not None:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(karyotype_id, genotype_id,
                                          self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                    # use the catalog id as the background
                    background = '[' + catalog_id.strip() + ']'
                    if genotype_label:
                        genotype_label += ' ' + background
                    else:
                        genotype_label = background

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(genotype_id, genotype_label,
                                     self.globaltt['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    graph.addTriple(patient_id, self.globaltt['has_genotype'],
                                    genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()
                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for d in omim_num.split(';'):
                        # compare and use the same, stripped, number
                        d = d.strip()
                        if d == '':
                            continue

                        # if the omim number is in omim_map,
                        # then it is a gene not a pheno

                        # TEC - another place to use the mimTitle omim
                        # classifier omia & genereviews are using
                        if d in omim_map:
                            LOG.info('drop gene %s from disease list', d)
                            continue

                        disease_id = 'OMIM:' + d
                        # assume the label is taken care of in OMIM
                        model.addClassToGraph(disease_id, None)

                        # add the association:
                        #   the patient has the disease
                        assoc = G2PAssoc(graph, self.name, patient_id,
                                         disease_id)
                        assoc.add_association_to_graph()

                        # this line is a model of this disease
                        # TODO abstract out model into
                        # it's own association class?
                        graph.addTriple(cell_line_id,
                                        self.globaltt['is model of'],
                                        disease_id)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:' + s.strip()
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(pubmed_id, self.globaltt['mentions'],
                                        cell_line_id)

                if not self.testMode and (limit is not None
                                          and line_counter > limit):
                    break
        return
Ejemplo n.º 32
0
    def _process_data(self, src_key, limit=None):
        """
        Process one Coriell catalog file and add its rows to the graph.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param src_key: key into self.files; the entry supplies the file name
            (joined onto self.rawdir) and the expected list of column names
        :param limit: when not None and not in test mode, stop once the
            line counter (header included) exceeds this value
        :return: None
        :raises AssertionError: when the file header (lower-cased) differs
            from the expected columns, including when the file is empty

        """

        raw = '/'.join((self.rawdir, self.files[src_key]['file']))

        LOG.info("Processing Data from %s", raw)

        if self.test_mode:      # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1  # the header counts as line one
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[src_key]['columns']
        # affords access with
        # x = row[col.index('x')].strip()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats.
            # an empty file has no header at all; treat that as a mismatch
            # instead of failing on None below
            fileheader = [c.lower() for c in next(filereader, None) or []]
            if col != fileheader:  # assert
                LOG.error('Expected  %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning(
                        'Expected %i values but find %i in  row %i',
                        len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,Homo sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.test_mode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning(
                        'Novel Affected status  %s at row: %i of %s',
                        affected, line_counter, raw)
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join((
                        patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join((
                        patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(
                    cell_line_id, line_label, cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(
                        equiv_cell_line, None, cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                    # this would give a BNode that is an instance of Age.
                    # but i don't know how to connect
                    # the age node to the cell line? we need to ask @mbrush
                    # age_id = '_'+re.sub('\s+','_',age)
                    # gu.addIndividualToGraph(
                    #   graph,age_id,age,self.globaltt['age'])
                    # gu.addTriple(
                    #   graph,age_id,self.globaltt['has measurement value'],age,
                    #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(
                        family_comp_id, family_label, self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species == '':
                    species = 'Homo sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                # will hold gene/locus id to variant list; it is also used
                # further down to keep "genes" out of the disease list
                omim_map = {}
                gvc_id = None
                gvc_label = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:'+re.sub(
                        'MONARCH:', '', self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(
                            graph, karyotype_feature_id, karyotype_feature_label,
                            self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(
                            karyotype_feature_id, karyotype_id,
                            self.globaltt['has_variant_part'])

                # evaluated once per row; every later branch reuses it
                normal_karyotype = self._is_normal_karyotype(karyotype)

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                # the variant label must be rebuilt for every row:
                # without a gene symbol fall back to the bare mutation text
                # (possibly empty) rather than reusing the previous row's label
                if gene != '':
                    varl = gene + '(' + mutation + ')'
                else:
                    varl = mutation

                # fix the variant_id so it's always in the same order;
                # pieces are stripped and empty ones (e.g. from a trailing
                # ';') dropped so they cannot leak into the blank node ids
                variant_id = row[col.index('variant_id')].strip()
                vids = {v.strip() for v in variant_id.split(';')}
                vids.discard('')
                variant_id = ';'.join(sorted(vids))

                if karyotype_id is not None and not normal_karyotype:

                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation != '':
                        gvc_label = '; '.join((varl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = varl
                else:
                    # wildtype?
                    pass

                # add the abnormal karyotype to the gvc.
                # (a normal karyotype is never part of the gvc; it is
                # attached to the genotype as a reference part further down)
                if karyotype_id is not None \
                        and not normal_karyotype \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(
                        karyotype_id, gvc_id, self.globaltt['has_variant_part'])

                if variant_id != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var)
                        if mch is None:
                            # no locus can be derived from this piece;
                            # skip it instead of filing it under a stale
                            # (or missing) locus number
                            LOG.warning(
                                'Unrecognized variant id "%s" at row: %i of %s',
                                var, line_counter, raw)
                            continue
                        (locus_num, var_num) = mch.groups()
                        omim_map.setdefault(locus_num, []).append(var_num)

                    for omim, var_nums in omim_map.items():
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in var_nums])
                        # an empty label is passed as None (no label)
                        vslc_label = varl or None
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in var_nums:
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    model.addIndividualToGraph(
                        gvc_id, gvc_label or None,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None and normal_karyotype:
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        else:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(
                                karyotype_id, genotype_id,
                                self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                    # use the catalog id as the background
                    # (joined so an empty label does not leave a stray space)
                    genotype_label = ' '.join(
                        part for part in (
                            genotype_label, '[' + catalog_id.strip() + ']')
                        if part)

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(
                        genotype_id, genotype_label,
                        self.globaltt['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    graph.addTriple(
                        patient_id, self.globaltt['has_genotype'], genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()
                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for disease in omim_num.split(';'):
                        # compare and build ids from the same, stripped value
                        disease = disease.strip()
                        if disease == '':
                            continue

                        # if the omim number is in omim_map,
                        # then it is a gene not a pheno

                        # TEC - another place to use the mimTitle omim
                        # classifier omia & genereviews are using
                        if disease in omim_map:
                            LOG.info('drop gene %s from disease list', disease)
                            continue

                        disease_id = 'OMIM:' + disease
                        # assume the label is taken care of in OMIM
                        model.addClassToGraph(disease_id, None)

                        # add the association:
                        #   the patient has the disease
                        assoc = G2PAssoc(
                            graph, self.name,
                            patient_id, disease_id)
                        assoc.add_association_to_graph()

                        # this line is a model of this disease
                        # TODO abstract out model into
                        # it's own association class?
                        graph.addTriple(
                            cell_line_id,
                            self.globaltt['is model of'],
                            disease_id)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for pmid in pubmed_ids.split(';'):
                        pmid = pmid.strip()
                        if pmid == '':
                            # e.g. a trailing ';' must not create 'PMID:'
                            continue
                        pubmed_id = 'PMID:' + pmid
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(
                            pubmed_id, self.globaltt['mentions'], cell_line_id)

                if not self.test_mode and (
                        limit is not None and line_counter > limit):
                    break
Ejemplo n.º 33
0
    def _process_data(self, raw, limit=None):
        LOG.info("Processing Data from %s", raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)

        # Add the taxon as a class
        taxon_id = self.globaltt['Mus musculus']
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        col = self.files['all']['columns']
        with gzip.open(raw, 'rt') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            row = next(reader)  # presumed header
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
                marker_accession_id = row[col.index('marker_accession_id')].strip()
                marker_symbol = row[col.index('marker_symbol')].strip()
                phenotyping_center = row[col.index('phenotyping_center')].strip()
                colony_raw = row[col.index('colony_id')].strip()
                sex = row[col.index('sex')].strip()
                zygosity = row[col.index('zygosity')].strip()
                allele_accession_id = row[col.index('allele_accession_id')].strip()
                allele_symbol = row[col.index('allele_symbol')].strip()
                # allele_name = row[col.index('allele_name')]
                strain_accession_id = row[col.index('strain_accession_id')].strip()
                strain_name = row[col.index('strain_name')].strip()
                # project_name = row[col.index('project_name')]
                project_fullname = row[col.index('project_fullname')].strip()
                pipeline_name = row[col.index('pipeline_name')].strip()
                pipeline_stable_id = row[col.index('pipeline_stable_id')].strip()
                procedure_stable_id = row[col.index('procedure_stable_id')].strip()
                procedure_name = row[col.index('procedure_name')].strip()
                parameter_stable_id = row[col.index('parameter_stable_id')].strip()
                parameter_name = row[col.index('parameter_name')].strip()
                # top_level_mp_term_id = row[col.index('top_level_mp_term_id')]
                # top_level_mp_term_name = row[col.index('top_level_mp_term_name')]
                mp_term_id = row[col.index('mp_term_id')].strip()
                mp_term_name = row[col.index('mp_term_name')].strip()
                p_value = row[col.index('p_value')].strip()
                percentage_change = row[col.index('percentage_change')].strip()
                effect_size = row[col.index('effect_size')].strip()
                statistical_method = row[col.index('statistical_method')].strip()
                resource_name = row[col.index('resource_name')].strip()

                if self.test_mode and marker_accession_id not in self.gene_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity = zygosity.strip()
                zygosity_id = self.resolve(zygosity)
                if zygosity_id == zygosity:
                    LOG.warning(
                        "Zygosity '%s' unmapped. detting to indeterminate", zygosity)
                    zygosity_id = self.globaltt['indeterminate']

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = '_:IMPC-'+re.sub(
                        r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    # TODO blank nodes do not maintain identifiers
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    LOG.info(
                        "Found a strange strain accession...%s", strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = re.match(
                        r'.*<(.*)>', allele_symbol)
                    if sequence_alteration_name is not None:
                        sequence_alteration_name = sequence_alteration_name.group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and marker_accession_id == '':
                    LOG.warning("Marker unspecified on row %d", reader.line_num)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = self.globaltt['variant_locus']
                    geno.addGene(
                        marker_accession_id, marker_symbol, self.globaltt['gene'])

                    geno.addAllele(
                        variant_locus_id, variant_locus_name, variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    # TAG bnode
                    sequence_alteration_id = '_:seqalt' + re.sub(
                        r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(
                    sequence_alteration_id, sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,  with unknown zygosity

                stem_cell_class = self.globaltt['embryonic stem cell line']

                if colony_id is None:
                    print(colony_raw, stem_cell_class, "\nline:\t", reader.line_num)
                model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:'+re.sub(
                    r':', '', allele_accession_id + self.globaltt['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(
                    allele_accession_id, colony_genotype_id,
                    self.globaltt['has_variant_part'])

                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    self.globaltt['indeterminate'], self.globaltt['has_variant_part'])
                graph.addTriple(
                    colony_id, self.globaltt['has_genotype'], colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id(
                    (colony_id + phenotyping_center + zygosity + strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = self.globaltt['has_variant_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    LOG.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:'+vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    self.globaltt['variant single locus complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    self.globaltt['has_variant_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(vslc_id, self.globaltt['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        self.globaltt['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                        + '-' + colony_raw
                    pheno_center_strain_id = '-'.join((
                        re.sub(r':', '', genomic_background_id),
                        re.sub(r'\s', '_', phenotyping_center),
                        re.sub(r'\W+', '', colony_raw)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        # Tag bnode
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(
                        pheno_center_strain_id, pheno_center_strain_label,
                        self.globaltt['genomic_background'])
                    geno.addSequenceDerivesFrom(
                        pheno_center_strain_id, genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id((
                        colony_id + phenotyping_center + zygosity +
                        strain_accession_id + sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

                sq_type_id = self.resolve(sex, False)

                if sq_type_id == sex:
                    sq_type_id = self.globaltt['intrinsic_genotype']
                    LOG.warning(
                        "Unknown sex qualifier %s, adding as intrinsic_genotype",
                        sex)

                geno.addGenotype(
                    sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    self.globaltt['has_variant_part'])

                if genomic_background_id is not None and genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                # sometimes phenotype ids are missing.  (about 711 early 2020)
                if mp_term_id is None or mp_term_id == '':
                    LOG.warning(
                        "No phenotype id specified for row %d", reader.line_num)
                    continue
                # hard coded ECO code
                eco_id = self.globaltt['mutant phenotype evidence']

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(
                    graph, self.name, sex_qualified_genotype_id, mp_term_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                model._addSexSpecificity(assoc_id, self.resolve(sex))

                # add a free-text description
                try:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of',
                        str(round(float(effect_size), 5)),
                        '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of', str(effect_size),
                        '(p =', "{0}".format(p_value), ').'))

                study_bnode = self._add_study_provenance(
                    phenotyping_center, colony_raw, project_fullname, pipeline_name,
                    pipeline_stable_id, procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name, statistical_method,
                    resource_name)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, p_value, percentage_change, effect_size,
                    study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(graph, assoc_id, resource_id)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Ejemplo n.º 34
0
    def parse(self, limit=None):
        """
        Query the MouseMine web service for gene-to-phenotype annotations
        and add each one to ``self.graph`` as a G2P association.

        The annotations are fetched in slices: one "OntologyAnnotation"
        query is issued per two-digit MGI id prefix (``MGI:10*`` through
        ``MGI:99*``).  For every returned row the subject is typed as a
        gene, tagged with the taxon built from ``self.txid``, and
        associated with the row's phenotype term using the
        'experimental phenotypic evidence' code.  When the row carries
        a PubMed id, that publication is added as a journal article
        and attached as the source of the association.

        :param limit: optional maximum number of MGI id prefixes to query;
            a falsy value (None or 0) means all ninety prefixes are queried
        :return: None
        """
        model = Model(self.graph)
        geno = Genotype(self.graph)

        # The service handle and the logger levels are identical for every
        # prefix, so set them up once instead of on each loop iteration.
        service = Service("http://www.mousemine.org/mousemine/service")
        logging.getLogger('Model').setLevel(logging.ERROR)
        logging.getLogger('JSONIterator').setLevel(logging.ERROR)

        count = 0
        for num in range(10, 100):
            fuzzy_gene = "MGI:{0}*".format(num)
            gene = "MGI:{0}".format(num)

            query = service.new_query("OntologyAnnotation")
            query.add_constraint("subject", "SequenceFeature")
            query.add_constraint("ontologyTerm", "MPTerm")
            query.add_view("subject.primaryIdentifier", "subject.symbol",
                           "subject.sequenceOntologyTerm.name",
                           "ontologyTerm.identifier", "ontologyTerm.name",
                           "evidence.publications.pubMedId",
                           "evidence.comments.type",
                           "evidence.comments.description")
            query.add_constraint("subject.organism.taxonId",
                                 "=",
                                 self.txid,
                                 code="A")
            query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
            query.add_constraint("subject.primaryIdentifier",
                                 "CONTAINS",
                                 gene,
                                 code="C")
            query.outerjoin("evidence.comments")

            for row in query.rows():
                mgi_curie = row["subject.primaryIdentifier"]
                mp_curie = row["ontologyTerm.identifier"]
                pubmed_id = row["evidence.publications.pubMedId"]

                model.addType(mgi_curie, self.globaltt['gene'])
                geno.addTaxon('NCBITaxon:' + self.txid, mgi_curie)

                assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)
                if pubmed_id:
                    # only build the publication curie when there is an id,
                    # so a "PMID:None" string is never constructed
                    pub_curie = "PMID:{0}".format(pubmed_id)
                    reference = Reference(self.graph, pub_curie,
                                          self.globaltt['journal article'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)

                assoc.add_evidence(
                    self.globaltt['experimental phenotypic evidence'])
                assoc.add_association_to_graph()

            if not count % 10 and count != 0:
                # Report the prefixes that were actually queried (num);
                # the zero-based counter is offset from them by 10 and
                # would name prefixes such as MGI:0* that are never fetched.
                LOG.info("%s processed ids from MGI:%i* to MGI:%i*",
                         datetime.datetime.now(), num - 10, num)

            count += 1
            if limit and count >= limit:
                break