Beispiel #1
0
    def _add_snp_to_graph(
            self, snp_id, snp_label, chrom_num, chrom_pos, context,
            risk_allele_frequency=None):
        # constants
        tax_id = 'NCBITaxon:9606'
        genome_version = 'GRCh38'

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        if chrom_num != '' and chrom_pos != '':
            location = self._make_location_curie(chrom_num, chrom_pos)
            if location not in self.id_location_map:
                self.id_location_map[location] = set()
        else:
            location = None

        alteration = re.search(r'-(.*)$', snp_id)
        if alteration is not None \
                and re.match(r'[ATGC]', alteration.group(1)):
            # add variation to snp
            pass  # TODO

        if location is not None:
            self.id_location_map[location].add(snp_id)

        # create the chromosome
        chrom_id = makeChromID(chrom_num, genome_version, 'CHR')

        # add the feature to the graph
        snp_description = None
        if risk_allele_frequency is not None\
                and risk_allele_frequency != ''\
                and risk_allele_frequency != 'NR':
            snp_description = \
                str(risk_allele_frequency) + \
                ' [risk allele frequency]'

        f = Feature(
            g, snp_id, snp_label.strip(),
            Feature.types['SNP'], snp_description)
        if chrom_num != '' and chrom_pos != '':
            f.addFeatureStartLocation(chrom_pos, chrom_id)
            f.addFeatureEndLocation(chrom_pos, chrom_id)
        f.addFeatureToGraph()
        f.addTaxonToFeature(tax_id)
        # TODO consider adding allele frequency as property;
        # but would need background info to do that

        # also want to add other descriptive info about
        # the variant from the context
        for c in re.split(r';', context):
            cid = self._map_variant_type(c.strip())
            if cid is not None:
                model.addType(snp_id, cid)

        return
Beispiel #2
0
    def _get_phenotypicseries_parents(entry, g):
        """
        Extract the phenotypic series parent relationship out of the entry
        :param entry:
        :return:
        """
        model = Model(g)
        omimid = 'OMIM:'+str(entry['mimNumber'])
        # the phenotypic series mappings
        serieslist = []
        if 'phenotypicSeriesExists' in entry:
            if entry['phenotypicSeriesExists'] is True:
                if 'phenotypeMapList' in entry:
                    phenolist = entry['phenotypeMapList']
                    for p in phenolist:
                        serieslist.append(
                            p['phenotypeMap']['phenotypicSeriesNumber'])
                if 'geneMap' in entry and \
                        'phenotypeMapList' in entry['geneMap']:
                    phenolist = entry['geneMap']['phenotypeMapList']
                    for p in phenolist:
                        if 'phenotypicSeriesNumber' in p['phenotypeMap']:
                            serieslist.append(
                                p['phenotypeMap']['phenotypicSeriesNumber'])
        # add this entry as a subclass of the series entry
        for ser in serieslist:
            series_id = 'OMIM:'+ser
            model.addClassToGraph(series_id, None)
            model.addSubClass(omimid, series_id)

        return
Beispiel #3
0
    def _add_assertion_provenance(
            self, assoc_id, evidence_line_bnode, impc_map):
        """
        Add assertion level provenance, currently always IMPC
        :param assoc_id:
        :param evidence_line_bnode:
        :return:
        """
        provenance_model = Provenance(self.graph)
        model = Model(self.graph)
        assertion_bnode = self.make_id("assertion{0}{1}".format(
            assoc_id, impc_map['asserted_by']['IMPC']),  '_')

        model.addIndividualToGraph(
            assertion_bnode, None,
            provenance_model.provenance_types['assertion'])

        provenance_model.add_assertion(
            assertion_bnode, impc_map['asserted_by']['IMPC'],
            'International Mouse Phenotyping Consortium')

        self.graph.addTriple(
            assoc_id, provenance_model.object_properties['is_asserted_in'],
            assertion_bnode)

        self.graph.addTriple(
            assertion_bnode,
            provenance_model.object_properties['is_assertion_supported_by'],
            evidence_line_bnode)

        return
Beispiel #4
0
    def _parse_patient_phenotypes(self, file, limit=None):
        """
        :param file: file handler
        :param limit: limit rows processed
        :return:
        """
        model = Model(self.graph)
        line_counter = 0
        reader = csv.reader(file, delimiter="\t")
        for row in reader:
            (patient_id, hpo_curie, present) = row
            patient_curie = ':{0}'.format(patient_id)
            if patient_id == 'Patient':  # skip header
                line_counter += 1
                continue

            model.addPerson(patient_curie, patient_id)

            self.graph.addTriple(
                patient_curie, self.globaltt['has phenotype'], self.globaltt['disease'])
            if present == 'yes':
                self.graph.addTriple(
                    patient_curie, self.globaltt['has phenotype'], hpo_curie)

            line_counter += 1
            if not self.test_mode and limit is not None \
                    and line_counter >= limit:
                break
Beispiel #5
0
    def parse(self, limit=None):
        zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
        model = Model(self.graph)
        zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
        g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
        zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

        with open(g2p_file, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:

                (internal_id, symbol, gene_id, subterm1_id, subterm1_label,
                 pc_rel_id, pc_rel_label, superterm1_id, superterm1_label,
                 quality_id, quality_name, modifier, subterm2_id,
                 subterm2_label, pc_rel2_id, pc_rel2_id, superterm2_id,
                 superterm2_label, fish_id, fish_label, start_stage, end_stage,
                 environment, pub_id, figure_id, unknown_field) = row

                zp_id = zfin_parser._map_sextuple_to_phenotype(
                    superterm1_id, subterm1_id, quality_id, superterm2_id,
                    subterm2_id, modifier)

                gene_curie = "ZFIN:{0}".format(gene_id)
                model.makeLeader(gene_curie)
                pub_curie = "ZFIN:{0}".format(pub_id)
                if zp_id:
                    assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                    if pub_id:
                        reference = Reference(self.graph, pub_curie,
                                              Reference.ref_types['document'])
                        reference.addRefToGraph()
                        assoc.add_source(pub_curie)

                    assoc.add_evidence('ECO:0000059')
                    assoc.add_association_to_graph()
Beispiel #6
0
    def _add_assertion_provenance(
            self,
            assoc_id,
            evidence_line_bnode
    ):
        """
        Add assertion level provenance, currently always IMPC
        :param assoc_id:
        :param evidence_line_bnode:
        :return:
        """
        provenance_model = Provenance(self.graph)
        model = Model(self.graph)
        assertion_bnode = self.make_id(
            "assertion{0}{1}".format(assoc_id, self.localtt['IMPC']), '_')

        model.addIndividualToGraph(assertion_bnode, None, self.globaltt['assertion'])

        provenance_model.add_assertion(
            assertion_bnode, self.localtt['IMPC'],
            'International Mouse Phenotyping Consortium')

        self.graph.addTriple(
            assoc_id, self.globaltt['proposition_asserted_in'], assertion_bnode)

        self.graph.addTriple(
            assertion_bnode,
            self.resolve('is_assertion_supported_by_evidence'),  # "SEPIO:0000111"
            evidence_line_bnode)

        return
Beispiel #7
0
    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id:
        :param label:
        :param page:
        :return:
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        for graph in [self.graph, self.testgraph]:
            # TODO: How to devise a label for each repository?
            model = Model(graph)
            reference = Reference(graph)
            repo_id = 'CoriellCollection:' + collection_id
            repo_label = label
            repo_page = page

            model.addIndividualToGraph(
                repo_id, repo_label, self.globaltt['collection'])
            reference.addPage(repo_id, repo_page)

        return
Beispiel #8
0
    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1
                row = line.split('\t')
                (
                    morphology_term_id, morphology_term_label, hp_id, hp_label,
                    notes) = row

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    model.addClassToGraph(hp_id, None)
                    # Add the HP ID as an equivalent class
                    model.addEquivalentClass(morphology_term_id, hp_id)
                else:
                    LOG.warning('No matching HP term for %s', morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return
Beispiel #9
0
    def process_disease_association(self, limit):

        raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        logger.info("Processing disease models")
        geno = Genotype(g)
        line_counter = 0
        worm_taxon = 'NCBITaxon:6239'
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, disease_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                # WB	WBGene00000001	aap-1		DOID:2583	PMID:19029536	IEA	ENSEMBL:ENSG00000145675|OMIM:615214	D		Y110A7A.10	gene	taxon:6239	20150612	WB
                gene_id = 'WormBase:'+gene_num

                # make a variant of the gene
                vl = '_:'+'-'.join((gene_num, 'unspecified'))
                vl_label = 'some variant of '+gene_symbol
                geno.addAffectedLocus(vl, gene_id)
                model.addBlankNodeAnnotation(vl)
                animal_id = geno.make_experimental_model_with_genotype(
                    vl, vl_label, worm_taxon, 'worm')

                assoc = G2PAssoc(
                    g, self.name, animal_id,
                    disease_id, model.object_properties['model_of'])
                ref = re.sub(r'WB_REF:', 'WormBase:', ref)
                if ref != '':
                    assoc.add_source(ref)
                eco_id = None
                if eco_symbol == 'IEA':
                    eco_id = 'ECO:0000501'  # IEA is this now
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                assoc.add_association_to_graph()

        return
Beispiel #10
0
    def _get_mappedids(self, entry, g):
        """
        Extract the Orphanet and UMLS ids as equivalences from the entry
        :param entry:
        :return:
        """
        model = Model(g)
        omimid = 'OMIM:'+str(entry['mimNumber'])
        orpha_mappings = []
        if 'externalLinks' in entry:
            links = entry['externalLinks']
            if 'orphanetDiseases' in links:
                # triple semi-colon delimited list of
                # double semi-colon delimited orphanet ID/disease pairs
                # 2970;;566;;Prune belly syndrome
                items = links['orphanetDiseases'].split(';;;')
                for i in items:
                    # note 'internal_num unused
                    (orpha_num, internal_num, orpha_label) = i.split(';;')
                    orpha_id = 'Orphanet:'+orpha_num.strip()
                    orpha_mappings.append(orpha_id)
                    model.addClassToGraph(orpha_id, orpha_label.strip())
                    model.addXref(omimid, orpha_id)

            if 'umlsIDs' in links:
                umls_mappings = links['umlsIDs'].split(',')
                for i in umls_mappings:
                    umls_id = 'UMLS:'+i
                    model.addClassToGraph(umls_id, None)
                    model.addXref(omimid, umls_id)

        return
Beispiel #11
0
    def process_gene_desc(self, limit):
        raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing Gene descriptions")
        line_counter = 0
        # geno = Genotype(g)  # TODO unused
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                line_counter += 1
                if line_counter == 1:
                    continue
                (gene_num, public_name, molecular_name, concise_description,
                 provisional_description, detailed_description,
                 automated_description, gene_class_description) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:'+gene_num

                if concise_description != 'none available':
                    model.addDefinition(gene_id, concise_description)

                # remove the description if it's identical to the concise
                descs = {
                    'provisional': provisional_description,
                    'automated': automated_description,
                    'detailed': detailed_description,
                    'gene class': gene_class_description
                }
                for d in descs:
                    text = descs.get(d)
                    if text == concise_description \
                            or re.match(r'none', text) or text == '':
                        pass  # don't use it
                    else:
                        text = ' '.join((text, '['+d+']'))
                        descs[d] = text
                        model.addDescription(gene_id, text)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Beispiel #12
0
    def _add_snp_to_graph(
            self, snp_id, snp_label, chrom_num, chrom_pos, context,
            risk_allele_frequency=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        if chrom_num != '' and chrom_pos != '':
            location = self._make_location_curie(chrom_num, chrom_pos)
            if location not in self.id_location_map:
                self.id_location_map[location] = set()
        else:
            location = None

        alteration = re.search(r'-(.*)$', snp_id)
        if alteration is not None and re.match(r'[ATGC]', alteration.group(1)):
            # add variation to snp
            pass  # TODO

        if location is not None:
            self.id_location_map[location].add(snp_id)

        # create the chromosome
        chrom_id = makeChromID(chrom_num, self.localtt['reference assembly'], 'CHR')

        # add the feature to the graph
        snp_description = None
        if risk_allele_frequency is not None\
                and risk_allele_frequency != ''\
                and risk_allele_frequency != 'NR':
            snp_description = str(risk_allele_frequency) + ' [risk allele frequency]'

        feat = Feature(
            graph, snp_id, snp_label.strip(), self.globaltt['SNP'], snp_description)
        if chrom_num != '' and chrom_pos != '':
            feat.addFeatureStartLocation(chrom_pos, chrom_id)
            feat.addFeatureEndLocation(chrom_pos, chrom_id)
        feat.addFeatureToGraph()
        feat.addTaxonToFeature(self.globaltt['H**o sapiens'])
        # TODO consider adding allele frequency as property;
        # but would need background info to do that

        # also want to add other descriptive info about
        # the variant from the context
        for ctx in re.split(r';', context):
            ctx = ctx.strip()
            cid = self.resolve(ctx, False)
            if cid != ctx:
                model.addType(snp_id, cid)

        return
Beispiel #13
0
    def process_pub_xrefs(self, limit=None):

        raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        logger.info("Processing publication xrefs")
        line_counter = 0
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (wb_ref, xref) = row
                # WBPaper00000009 pmid8805<BR>
                # WBPaper00000011 doi10.1139/z78-244<BR>
                # WBPaper00000012 cgc12<BR>

                if self.testMode and wb_ref not in self.test_ids['pub']:
                    continue

                ref_id = 'WormBase:'+wb_ref
                xref_id = None
                r = None
                xref = re.sub(r'<BR>', '', xref)
                xref = xref.strip()
                if re.match(r'pmid', xref):
                    xref_id = 'PMID:'+re.sub(r'pmid\s*', '', xref)
                    reference = Reference(
                        g, xref_id, Reference.ref_types['journal_article'])
                elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                    continue
                elif re.match(r'doi', xref):
                    xref_id = 'DOI:'+re.sub(r'doi', '', xref.strip())
                    reference = Reference(g, xref_id)
                elif re.match(r'cgc', xref):
                    # TODO not sure what to do here with cgc xrefs
                    continue
                else:
                    # logger.debug("Other xrefs like %s", xref)
                    continue

                if xref_id is not None:
                    reference.addRefToGraph()
                    model.addSameIndividual(ref_id, xref_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Beispiel #14
0
class Environment():
    """
    These methods provide convenient methods
    to add items related to an experimental environment
    and it's parts to a supplied graph.

    This is a stub.
    """


    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        return

    def addEnvironment(
            self, env_id, env_label, env_type=None, env_description=None):
        if env_type is None:
            env_type = self.globaltt['environmental_system']

        self.model.addIndividualToGraph(
            env_id, env_label, env_type, env_description)

        return

    def addEnvironmentalCondition(
            self, cond_id, cond_label, cond_type=None, cond_description=None):
        if cond_type is None:
            cond_type = self.globaltt['environmental_condition']

        self.model.addIndividualToGraph(
            cond_id, cond_label, cond_type, cond_description)

        return

    def addComponentToEnvironment(self, env_id, component_id):

        self.graph.addTriple(env_id, self.globaltt['has_part'], component_id)

        return

    def addComponentAttributes(self, component_id, entity_id, value=None, unit=None):

        self.graph.addTriple(
            component_id, self.globaltt['has_part'], entity_id)
        # TODO add value and units

        return
Beispiel #15
0
    def _add_variant_trait_association(self, variant_id, mapped_trait_uri,
                                       efo_ontology, pubmed_id,
                                       description=None):
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        # make associations to the EFO terms; there can be >1
        if mapped_trait_uri.strip() != '':
            for trait in re.split(r',', mapped_trait_uri):
                trait = trait.strip()

                trait_curie = trait.replace("http://www.ebi.ac.uk/efo/EFO_", "EFO:")

                phenotype_query = """
                    SELECT ?trait
                    WHERE {{
                        <{0}> rdfs:subClassOf+ <http://www.ebi.ac.uk/efo/EFO_0000651> .
                        <{0}> rdfs:label ?trait .
                    }}
                """.format(trait)

                query_result = efo_ontology.query(phenotype_query)
                if len(list(query_result)) > 0:
                    if re.match(r'^EFO', trait_curie):
                        model.addClassToGraph(
                            trait_curie,
                            list(query_result)[0][0],
                            'UPHENO:0001001')

                pubmed_curie = 'PMID:' + pubmed_id

                ref = Reference(
                    g, pubmed_curie, Reference.ref_types['journal_article'])
                ref.addRefToGraph()

                assoc = G2PAssoc(
                    g, self.name, variant_id, trait_curie,
                    model.object_properties['contributes_to'])
                assoc.add_source(pubmed_curie)
                # combinatorial evidence
                # used in automatic assertion
                eco_id = 'ECO:0000213'
                assoc.add_evidence(eco_id)

                if description is not None:
                    assoc.set_description(description)

                # FIXME score should get added to provenance/study
                # assoc.set_score(pvalue)
                if trait_curie is not None:
                    assoc.add_association_to_graph()
Beispiel #16
0
    def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                          disorder_label, phene_key):

        geno = Genotype(g)
        model = Model(g)
        disorder_id = ':'.join(('OMIM', disorder_num))
        rel_id = model.object_properties['has_phenotype']  # default
        rel_label = 'causes'
        if re.match(r'\[', disorder_label):
            rel_id = model.object_properties['is_marker_for']
            rel_label = 'is a marker for'
        elif re.match(r'\{', disorder_label):
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'
        elif re.match(r'\?', disorder_label):
            # this is a questionable mapping!  skip?
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'

        evidence = self._map_phene_mapping_code_to_eco(phene_key)

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself.
        # so we make an anonymous alternate locus,
        # and put that in the association.
        # but we only need to do that in the cases when it's not an NCBIGene
        # (as that is a sequence feature itself)
        if re.match(r'OMIM:', gene_id):
            alt_locus = '_:'+re.sub(r':', '', gene_id)+'-'+disorder_num+'VL'
            alt_label = gene_symbol.strip()
            if alt_label is not None and alt_label != '':
                alt_label = \
                    ' '.join(('some variant of', alt_label,
                              'that', rel_label, disorder_label))
            else:
                alt_label = None

            model.addIndividualToGraph(
                alt_locus, alt_label, Genotype.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus, gene_id)
            model.addBlankNodeAnnotation(alt_locus)

        else:
            # assume it's already been added
            alt_locus = gene_id

        assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
        assoc.add_evidence(evidence)
        assoc.add_association_to_graph()

        return
Beispiel #17
0
 def _add_gene_to_graph(self, gene, variant_bnode, gene_id, relation):
     """
     :param gene:
     :param variant_bnode:
     :return:
     """
     model = Model(self.graph)
     if gene_id:
         self.graph.addTriple(variant_bnode, relation, gene_id)
     elif gene:
         LOG.info("gene %s not mapped to NCBI gene, making blank node", gene)
         gene_bnode = self.make_id("{0}".format(gene), "_")
         model.addIndividualToGraph(gene_bnode, gene)
         self.graph.addTriple(variant_bnode, relation, gene_bnode)
Beispiel #18
0
    def __init__(self, graph, ref_id=None, ref_type=None):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("%s is not a graph", graph)

        # assert ref_id is not None

        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of:  %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

        return
Beispiel #19
0
    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(
            graph_type,
            are_bnodes_skolemized,
            'decipher',
            ingest_title='Development Disorder Genotype Phenotype Database',
            ingest_url='https://decipher.sanger.ac.uk/',
            license_url='https://decipher.sanger.ac.uk/legal',
            data_rights='https://decipher.sanger.ac.uk/datasharing',
            # file_handle=None
        )

        if 'disease' not in self.all_test_ids:
            LOG.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = self.all_test_ids['disease']

        self.graph = self.graph
        self.geno = Genotype(self.graph)
        self.model = Model(self.graph)
        self.graph_type = graph_type
        self.are_bnodes_skolemized = are_bnodes_skolemized

        return
Beispiel #20
0
    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.model = Model(self.graph)

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

        return
Beispiel #21
0
 def __init__(self, graph):
     if isinstance(graph, Graph):
         self.graph = graph
     else:
         raise ValueError("{} is not a graph".graph)
     self.model = Model(self.graph)
     return
Beispiel #22
0
    def _get_titles(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Where each of the rows represents a mapping between
        a gr id and an omim id. These are a 1:many relationship,
        and some of the omim ids are genes (not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit:
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = next(filereader)
            line_counter = 1
            colcount = len(header)
            if colcount != 4:  # ('GR_shortname', 'GR_Title', 'NBK_id', 'PMID')
                logger.error("Unexpected Header ", header)
                exit(-1)
            for row in filereader:
                line_counter += 1
                if len(row) != colcount:
                    logger.error("Unexpected row. got: ", row)
                    logger.error("Expected data for: ", header)
                    exit(-1)
                (shortname, title, nbk_num, pmid) = row
                gr_id = 'GeneReviews:'+nbk_num

                self.book_ids.add(nbk_num)  # a global set of the book nums

                if limit is None or line_counter < limit:
                    model.addClassToGraph(gr_id, title)
                    model.addSynonym(gr_id, shortname)
                # TODO include the new PMID?

        return
Beispiel #23
0
    def _get_mapped_gene_ids(self, entry, g):

        gene_ids = []
        model = Model(g)
        omimid = 'OMIM:'+str(entry['mimNumber'])
        if 'externalLinks' in entry:
            links = entry['externalLinks']
            omimtype = self._get_omimtype(entry)
            if 'geneIDs' in links:
                entrez_mappings = links['geneIDs']
                gene_ids = entrez_mappings.split(',')
                self.omim_ncbigene_idmap[omimid] = gene_ids
                if omimtype == Genotype.genoparts['gene']:
                    for i in gene_ids:
                        model.addEquivalentClass(omimid, 'NCBIGene:'+str(i))

        return gene_ids
Beispiel #24
0
    def _process_interactions(self, row):
        """
        Process row of CTD data from CTD_chemicals_diseases.tsv.gz
        and generate triples. Only create associations based on direct evidence
        (not using the inferred-via-gene), and unambiguous relationships.
        (Ambiguous ones will be processed in the sister method using the
        disambiguated file). There are no OMIM ids for diseases in these cases,
        so we associate with only the mesh disease ids.
        Args:
            :param row (list): row of CTD data
        Returns:
            :return None
        """
        model = Model(self.graph)
        self._check_list_len(row, 10)
        (chem_name, chem_id, cas_rn, disease_name, disease_id, direct_evidence,
         inferred_gene_symbol, inference_score, omim_ids, pubmed_ids) = row

        if direct_evidence == '':
            return

        evidence_pattern = re.compile(r'^therapeutic|marker\/mechanism$')
        # dual_evidence = re.compile(r'^marker\/mechanism\|therapeutic$')

        # filter on those diseases that are mapped to omim ids in the test set
        intersect = list(
            set(['OMIM:' + str(i) for i in omim_ids.split('|')] +
                [disease_id]) & set(self.test_diseaseids))
        if self.test_mode and len(intersect) < 1:
            return
        chem_id = 'MESH:' + chem_id
        reference_list = self._process_pubmed_ids(pubmed_ids)
        if re.match(evidence_pattern, direct_evidence):
            rel_id = self.resolve(direct_evidence)
            model.addClassToGraph(chem_id, chem_name)
            model.addClassToGraph(disease_id, None)
            self._make_association(chem_id, disease_id, rel_id, reference_list)
        else:
            # there's dual evidence, but haven't mapped the pubs
            pass
            # LOG.debug(
            #   "Dual evidence for %s (%s) and %s (%s)",
            #   chem_name, chem_id, disease_name, disease_id)

        return
Beispiel #25
0
    def make_association(self, record):
        """
        contstruct the association
        :param record:
        :return: modeled association of  genotype to mammalian phenotype
        """
        model = Model(self.graph)

        record['relation']['id'] = self.resolve("has phenotype")
        # define the triple
        gene = record['subject']['id']
        relation = record['relation']['id']
        phenotype = record['object']['id']

        # instantiate the association
        g2p_assoc = Assoc(self.graph, self.name, sub=gene, obj=phenotype, pred=relation)

        # add the references
        references = record['evidence']['has_supporting_reference']
        # created RGDRef prefix in curie map to route to proper reference URL in RGD
        references = [
            x.replace('RGD', 'RGDRef') if 'PMID' not in x else x for x in references]

        if len(references) > 0:
            # make first ref in list the source
            g2p_assoc.add_source(identifier=references[0])
            ref_model = Reference(
                self.graph, references[0],
                self.globaltt['publication']
            )
            ref_model.addRefToGraph()

        if len(references) > 1:
            # create equivalent source for any other refs in list
            # This seems to be specific to this source and
            # there could be non-equivalent references in this list
            for ref in references[1:]:
                model.addSameIndividual(sub=references[0], obj=ref)

        # add the date created on
        g2p_assoc.add_date(date=record['date'])
        g2p_assoc.add_evidence(self.resolve(record['evidence']['type']))  # ?set where?
        g2p_assoc.add_association_to_graph()

        return
Beispiel #26
0
    def _process_genes_kegg2ncbi(self, limit=None):
        """
        This method maps the KEGG human gene IDs
            to the corresponding NCBI Gene IDs.

        Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>
        :param limit:
        :return:

        """

        LOG.info("Processing KEGG gene IDs to NCBI gene IDs")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files['ncbi']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (kegg_gene_id, ncbi_gene_id, link_type) = row

                if self.test_mode and kegg_gene_id not in self.test_ids['genes']:
                    continue

                # Adjust the NCBI gene ID prefix.
                ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
                kegg_gene_id = 'KEGG-' + kegg_gene_id

                # Adding the KEGG gene ID to the graph here is redundant,
                # unless there happens to be additional gene IDs in this table
                # not present in the genes table.
                model.addClassToGraph(kegg_gene_id, None)
                model.addClassToGraph(ncbi_gene_id, None)
                model.addEquivalentClass(kegg_gene_id, ncbi_gene_id)

                if not self.test_mode and (
                        limit is not None and reader.line_num > limit):
                    break
        LOG.info("Done with KEGG gene IDs to NCBI gene IDs")
Beispiel #27
0
 def _add_deprecated_snp(
         self, snp_id, snp_id_current, merged, chrom_num, chrom_pos):
     if self.test_mode:
         graph = self.testgraph
     else:
         graph = self.graph
     model = Model(graph)
     location = self._make_location_curie(chrom_num, chrom_pos)
     # add deprecation information
     if merged == '1' and str(snp_id_current.strip()) != '':
         # get the current rs_id
         current_rs_id = 'dbSNP:'
         if not re.match(r'rs', snp_id_current):
             current_rs_id += 'rs'
         current_rs_id += str(snp_id_current)
         if location is not None:
             if location not in self.id_location_map:
                 self.id_location_map[location] = set(current_rs_id)
             else:
                 self.id_location_map[location].add(current_rs_id)
         model.addDeprecatedIndividual(snp_id, current_rs_id)
         # TODO check on this
         # should we add the annotations to the current
         # or orig?
         model.makeLeader(current_rs_id)
     else:
         model.makeLeader(snp_id)
Beispiel #28
0
    def _process_orthologs(self, raw, limit=None):
        """
        This method maps orthologs for a species to the KEGG orthology classes.

        Triples created:
        <gene_id> is a class
        <orthology_class_id> is a class

        <assoc_id> has subject <gene_id>
        <assoc_id> has object <orthology_class_id>
        :param limit:
        :return:

        """

        LOG.info("Processing orthologs")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in reader:
                (gene_id, orthology_class_id) = row

                orthology_class_id = 'KEGG:'+orthology_class_id.strip()
                gene_id = 'KEGG:' + gene_id.strip()

                # note that the panther_id references a group of orthologs,
                # and is not 1:1 with the rest

                # add the KO id as a gene-family grouping class
                OrthologyAssoc(
                    graph, self.name, gene_id, None).add_gene_family_to_graph(
                        orthology_class_id)

                # add gene and orthology class to graph;
                # assume labels will be taken care of elsewhere
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(orthology_class_id, None)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
        LOG.info("Done with orthologs")
Beispiel #29
0
    def __init__(self, graph, association):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.association = association

        return
Beispiel #30
0
    def _process_all(self, limit):
        """
        This takes the list of omim identifiers from the omim.txt.Z file,
        and iteratively queries the omim api for the json-formatted data.
        This will create OMIM classes, with the label,
        definition, and some synonyms.
        If an entry is "removed",
            it is added as a deprecated class.
        If an entry is "moved",
            it is deprecated and consider annotations are added.

        Additionally, we extract:
        *phenotypicSeries ids as superclasses
        *equivalent ids for Orphanet and UMLS

        If set to testMode,
            it will write only those items in the test_ids to the testgraph.

        :param limit:
        :return:
        """
        omimids = self._get_omim_ids()  # store the set of omim identifiers

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        geno = Genotype(g)
        model = Model(g)
        # tax_num = '9606'   # TODO PYLINT unused
        tax_id = 'NCBITaxon:9606'
        tax_label = 'Human'

        # add genome and taxon
        geno.addGenome(tax_id, tax_label)   # tax label can get added elsewhere
        model.addClassToGraph(tax_id, None)   # label added elsewhere

        includes = set()
        includes.add('all')

        self.process_entries(
            omimids, self._transform_entry, includes, g, limit)

        return
Beispiel #31
0
    def _process_qtls_genetic_location(
            self, raw, src_key, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        aql_curie = self.files[src_key]['curie']

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']

        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # no header in these files, so no header checking

            col = self.files[src_key]['columns']

            for row in filereader:
                line_counter += 1

                if len(row) != len(self.qtl_columns):
                    LOG.warning("Problem parsing in %s row %s\n"
                                "got %s cols but expected %s",
                                raw, row, len(row), len(col))
                    continue
                else:
                    qtl_id = row[col.index('qtl_id')].strip()
                    qtl_symbol = row[col.index('qtl_symbol')].strip()
                    trait_name = row[col.index('trait_name')].strip()
                    # assotype = row[col.index('assotype')].strip()
                    # empty = row[col.index('empty')].strip()
                    chromosome = row[col.index('chromosome')].strip()
                    position_cm = row[col.index('position_cm')].strip()
                    range_cm = row[col.index('range_cm')].strip()
                    # flankmark_a2 = row[col.index('flankmark_a2')].strip()
                    # flankmark_a1 = row[col.index('flankmark_a1')].strip()
                    peak_mark = row[col.index('peak_mark')].strip()
                    # flankmark_b1 = row[col.index('flankmark_b1')].strip()
                    # flankmark_b2 = row[col.index('flankmark_b2')].strip()
                    # exp_id = row[col.index('exp_id')].strip()
                    # model_id = row[col.index('model_id')].strip()
                    # test_base = row[col.index('test_base')].strip()
                    # sig_level = row[col.index('sig_level')].strip()
                    # lod_score = row[col.index('lod_score')].strip()
                    # ls_mean = row[col.index('ls_mean')].strip()
                    p_values = row[col.index('p_values')].strip()
                    # f_statistics = row[col.index('f_statistics')].strip()
                    # variance = row[col.index('variance')].strip()
                    # bayes_value = row[col.index('bayes_value')].strip()
                    # likelihood_ratio = row[col.index('likelihood_ratio')].strip()
                    trait_id = row[col.index('trait_id')].strip()
                    # dom_effect = row[col.index('dom_effect')].strip()
                    # add_effect = row[col.index('add_effect')].strip()
                    pubmed_id = row[col.index('pubmed_id')].strip()
                    gene_id = row[col.index('gene_id')].strip()
                    gene_id_src = row[col.index('gene_id_src')].strip()
                    # gene_id_type = row[col.index('gene_id_type')].strip()
                    # empty2 = row[col.index('empty2')].strip()

                if self.test_mode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = ':'.join((aql_curie, trait_id.strip()))

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None,
                        self.globaltt['sequence_alteration'])
                    model.addXref(qtl_id, dbsnp_id)

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_:' + re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip()
                            geno.addSequenceAlterationToVariantLocus(
                                dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(trait_id, trait_name)

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                # off by one - the following actually gives us (limit + 1) records
                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        LOG.info("Done with QTL genetic info")
        return
Beispiel #32
0
    def __init__(
            self,
            identifier,
            data_release_version,
            ingest_name,
            ingest_title,
            ingest_url,
            ingest_logo=None,
            ingest_description=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None,
            distribution_type='ttl',
            dataset_curie_prefix='MonarchArchive'):

        if graph_type is None:
            self.graph = RDFGraph(None,
                                  ":".join([dataset_curie_prefix, identifier]))
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True,
                                       ":".join(
                                           [dataset_curie_prefix, identifier]),
                                       file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True,
                                  ':'.join([dataset_curie_prefix, identifier]))

        if data_release_version is not None:
            self.data_release_version = data_release_version
        else:
            self.data_release_version = datetime.today().strftime("%Y%m%d")

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.identifier = ':'.join([dataset_curie_prefix, identifier])
        self.citation = set()

        self.ingest_name = ingest_name
        self.ingest_title = ingest_title
        if self.ingest_title is None:
            self.ingest_title = ":".join([dataset_curie_prefix, identifier])

        self.ingest_url = ingest_url
        self.ingest_logo = self.curie_map.get('MonarchLogoRepo') + ingest_logo
        self.ingest_description = ingest_description

        self.date_issued = None

        self.license_url = license_url
        self.data_rights = data_rights
        self.distribution_type = distribution_type

        # set HCLS resource CURIEs
        self.summary_level_curie = ':'.join(
            [dataset_curie_prefix, '#' + identifier])
        self.version_level_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/#' + identifier
        self.distribution_level_turtle_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/rdf/' + \
            identifier + "." + self.distribution_type

        # The following might seem a little odd, but we need to set downloadURLs this
        # way in order for them to point to where they will end up in archive.MI.org as
        # of Sept 2019. URL is:
        #  https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
        self.download_url = \
            self.curie_map.get("MonarchArchive") + self.data_release_version + \
            "/rdf/" + self.ingest_name + "." + self.distribution_type

        self._set_summary_level_triples()
        self._set_version_level_triples()
        self._set_distribution_level_triples()
Beispiel #33
0
    def process_gaf(self, file, limit, id_map=None):

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        elif 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    logger.error(
                        "Missing required part of annotation " +
                        "on row %d:\n" + '\t'.join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    mapped_ids = id_map.get(gene_num)
                    if id_map is not None and mapped_ids is not None:
                        if len(mapped_ids) == 1:
                            gene_id = mapped_ids[0]
                            uniprotid = ':'.join((db, gene_num))
                            gene_num = re.sub(r'\w+\:', '', gene_id)
                        elif len(mapped_ids) > 1:
                            # logger.warning(
                            #   "Skipping gene id mapped for >1 gene %s -> %s",
                            #    gene_num, str(mapped_ids))
                            continue
                    else:
                        continue
                elif db == 'MGI':
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(g, self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                refs = re.split(r'\|', ref)
                for r in refs:
                    r = r.strip()
                    if r != '':
                        prefix = re.split(r':', r)[0]
                        r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                        r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                        ref = Reference(g, r)
                        if re.match(r'PMID', r):
                            ref_type = Reference.ref_types['journal_article']
                            ref.setType(ref_type)
                        ref.addRefToGraph()
                        assoc.add_source(r)

                # TODO add the source of the annotations from assigned by?

                aspect_rel_map = {
                    'P': model.object_properties['involved_in'],  # involved in
                    'F': model.object_properties['enables'],  # enables
                    'C': model.object_properties['part_of']  # part of
                }

                if aspect not in aspect_rel_map:
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    rel = model.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph()

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or \
                                re.match(
                                    r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                    i):
                            logger.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this:
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        else:
                            assoc = G2PAssoc(g, self.name, i, phenotypeid)
                        for r in refs:
                            r = r.strip()
                            if r != '':
                                prefix = re.split(r':', r)[0]
                                r = re.sub(prefix,
                                           self.clean_db_prefix(prefix), r)
                                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                                assoc.add_source(r)
                                # experimental phenotypic evidence
                                assoc.add_evidence("ECO:0000059")
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Beispiel #34
0
    def _process_omim2disease(self, limit=None):
        """
        This method maps the KEGG disease IDs to
        the corresponding OMIM disease IDs.
        Currently this only maps KEGG diseases and OMIM diseases that are 1:1.

        Triples created:
        <kegg_disease_id> is a class
        <omim_disease_id> is a class
        <kegg_disease_id> hasXref <omim_disease_id>
        :param limit:

        :return:

        """

        logger.info("Processing 1:1 KEGG disease to OMIM disease mappings")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        model = Model(g)
        raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                (omim_disease_id, kegg_disease_id, link_type) = row

                kegg_disease_id = 'KEGG-'+kegg_disease_id.strip()
                omim_disease_id = re.sub(r'omim', 'OMIM', omim_disease_id)

                # Create hash for the links from OMIM ID -> KEGG ID
                if omim_disease_id not in self.omim_disease_hash:
                    self.omim_disease_hash[omim_disease_id] = [kegg_disease_id]
                else:
                    self.omim_disease_hash[
                        omim_disease_id].append(kegg_disease_id)

                # Create hash for the links from KEGG ID -> OMIM ID
                if kegg_disease_id not in self.kegg_disease_hash:
                    self.kegg_disease_hash[kegg_disease_id] = [omim_disease_id]
                else:
                    self.kegg_disease_hash[
                        kegg_disease_id].append(omim_disease_id)

        # Now process the disease hashes
        # and only pass 1:1 omim disease:KEGG disease entries.
        for omim_disease_id in self.omim_disease_hash:
            if self.testMode and \
                    omim_disease_id not in self.test_ids['disease']:
                continue

            if (not self.testMode) and (
                    limit is not None and line_counter > limit):
                break
            line_counter += 1

            if len(self.omim_disease_hash[omim_disease_id]) == 1:
                kegg_disease_id = \
                    ''.join(self.omim_disease_hash.get(omim_disease_id))
                if len(self.kegg_disease_hash[kegg_disease_id]) == 1:
                    # add ids, and deal with the labels separately
                    model.addClassToGraph(kegg_disease_id, None)
                    model.addClassToGraph(omim_disease_id, None)
                    # TODO is this safe?
                    model.addEquivalentClass(kegg_disease_id, omim_disease_id)
            else:
                pass
                # gu.addXref(g, omim_disease_id, kegg_disease_id)
                # TODO add xrefs if >1:1 mapping?

        logger.info("Done with KEGG disease to OMIM disease mappings.")
        return
Beispiel #35
0
class Decipher(Source):
    """
    Deprecated - please see the EBIGene2Phen class, which parses the same
    file but fetches it from EBI which has clearer terms for redistribution,
    while Decipher has restrictive terms due to containing patient data in
    password protected datasets.

    The Decipher group curates and assembles the Development Disorder Genotype
    Phenotype Database (DDG2P) which is a curated list of genes reported to be
    associated with developmental disorders, compiled by clinicians as part of
    the DDD study to facilitate clinical feedback of likely causal variants.

    Beware that the redistribution of this data is a bit unclear from the
    [license](https://decipher.sanger.ac.uk/legal).
    If you intend to distribute this data, be sure to have the appropriate
    licenses in place.

    """

    files = {
        'annot': {
            'file': 'ddg2p.zip',
            'url': 'https://decipher.sanger.ac.uk/files/ddd/ddg2p.zip',
            'headers': []
        }
    }

    def __init__(self,
                 graph_type,
                 are_bnodes_skolemized,
                 data_release_version=None):
        super().__init__(
            graph_type=graph_type,
            are_bnodes_skized=are_bnodes_skolemized,
            data_release_version=data_release_version,
            name='decipher',
            ingest_title='Development Disorder Genotype Phenotype Database',
            ingest_url='https://decipher.sanger.ac.uk/',
            ingest_logo='source-decipher.png',
            license_url='https://decipher.sanger.ac.uk/legal',
            data_rights='https://decipher.sanger.ac.uk/datasharing',
            # file_handle=None
        )

        if 'disease' not in self.all_test_ids:
            LOG.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = self.all_test_ids['disease']

        self.graph = self.graph
        self.geno = Genotype(self.graph)
        self.model = Model(self.graph)
        self.graph_type = graph_type
        self.are_bnodes_skolemized = are_bnodes_skolemized

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)

        # since there's a dependency on HGNC files; fetch those too

        hgnc = HGNC(self.graph_type, self.are_bnodes_skolemized)
        hgnc.fetch(is_dl_forced)

        return

    def parse(self, limit=None):
        if limit is not None:
            LOG.info("Only parsing first %s rows", limit)

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True
            self.graph = self.testgraph
        else:
            self.graph = self.graph

        self.geno = Genotype(self.graph)

        # rare disease-phenotype associations
        self._process_ddg2p_annotations(limit)

        LOG.info("Finished parsing.")

        return

    def _process_ddg2p_annotations(self, limit):
        """
        The ddg2p annotations associate a gene symbol to an omim disease,
        along with some HPO ids and pubs. The gene symbols come from gencode,
        which in turn come from HGNC official gene symbols.  Therefore,
        we use the HGNC source class to get the id/symbol mapping for
        use in our annotations here.

        According to http://www.gencodegenes.org/faq.html,
        "Gene names are usually HGNC or MGI-approved gene symbols mapped
        to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
        when there is no official gene symbol, the Havana clone-based
        name is used."

        The kind of variation that is linked to a disease is indicated
        (LOF, GOF, CNV, etc) in the source data.
        Here, we create an anonymous variant of the specified gene of
        the indicated type (mapped to the sequence ontology (SO)).

        :param limit:
        :return:

        """

        line_counter = 0
        if self.graph is not None:
            graph = self.graph
        else:
            graph = self.graph

        # in order for this to work, we need to map the HGNC id-symbol;
        # hgnc = HGNC(self.graph_type, self.are_bnodes_skolemized)
        # hgnc_symbol_id_map = hgnc.get_symbol_id_map()  # Does Not Exists in hgnc

        myzip = ZipFile('/'.join((self.rawdir, self.files['annot']['file'])),
                        'r')

        # use the ddg2p.txt file
        fname = 'ddg2p.txt'

        unmapped_omim_counter = 0
        unmapped_gene_count = 0
        with myzip.open(fname, 'r') as f:
            f = io.TextIOWrapper(f)
            reader = csv.reader(f, delimiter='\t', quotechar='\"')
            # score_means_by_measure = {}
            # strain_scores_by_measure = {}   # TODO theseare unused
            for row in reader:
                if re.match(r'#', row[0]):  # skip comments
                    continue

                (gencode_gene_name, mode, category, consequence, disease, omim,
                 ddg2p_id, pubmed_ids, hpo_codes) = row

                # hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
                # if hgnc_id is None:
                if True:
                    LOG.error("FIXME Couldn't map the gene symbol %s to HGNC.",
                              gencode_gene_name)
                    unmapped_gene_count += 1
                    continue
                # add the gene
                # self.model.addClassToGraph(hgnc_id, gencode_gene_name)

                # TODO make VSLC with the variation
                #   to associate with the disorder
                # TODO use the Inheritance and Mutation consequence
                #   to classify the VSLCs

                # allele_id = self.make_allele_by_consequence(
                #    consequence, hgnc_id, gencode_gene_name)

                if omim.strip() != '':
                    omim_id = 'OMIM:' + str(omim.strip())
                    # assume this is declared elsewhere in ontology
                    self.model.addClassToGraph(omim_id, None)

                    # ??? rel is never used
                    # if category.strip() == 'Confirmed DD gene':
                    #     rel = self.self.globaltt['has phenotype']
                    # elif category.strip() == 'Probable DD gene':
                    #    rel = self.self.globaltt['has phenotype']
                    # elif category.strip() == 'Possible DD gene':
                    #    rel = self.self.globaltt['contributes to']
                    # elif category.strip() == 'Not DD gene':
                    #    # TODO negative annotation
                    #    continue
                    # assoc = G2PAssoc(graph, self.name, allele_id, omim_id)
                    # TODO 'rel' is assigned to but never used

                    for p in re.split(r';', pubmed_ids):
                        p = p.strip()
                        if p != '':
                            pmid = 'PMID:' + str(p)
                            r = Reference(graph, pmid,
                                          self.globaltt['journal article'])
                            r.addRefToGraph()
                            assoc.add_source(pmid)

                    assoc.add_association_to_graph()
                else:
                    # these are unmapped to a disease id.
                    # note that some match OMIM disease labels
                    # but the identifiers are just not included.
                    # TODO consider mapping to OMIM or DOIDs in other ways
                    LOG.warning("No omim id on line %d\n%s", line_counter,
                                str(row))
                    unmapped_omim_counter += 1

                # TODO hpo phenotypes
                # since the DDG2P file is not documented,
                # I don't know what the HPO annotations are actually about
                # are they about the gene?  the omim disease?  something else?
                # So, we wont create associations until this is clarified

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

        myzip.close()
        LOG.warning("gene-disorder associations with no omim id: %d",
                    unmapped_omim_counter)
        LOG.warning("unmapped gene count: %d", unmapped_gene_count)

        return

    def make_allele_by_consequence(self, consequence, gene_id, gene_symbol):
        """
        Given a "consequence" label that describes a variation type,
        create an anonymous variant of the specified gene as an instance of
        that consequence type.

        :param consequence:
        :param gene_id:
        :param gene_symbol:
        :return: allele_id
        """

        allele_id = None

        # Loss of function : Nonsense, frame-shifting indel,
        #   essential splice site mutation, whole gene deletion or any other
        #   mutation where functional analysis demonstrates clear reduction
        #   or loss of function
        # All missense/in frame : Where all the mutations described in the data
        #   source are either missense or in frame deletions and there is no
        #   evidence favoring either loss-of-function, activating or
        #   dominant negative effect
        # Dominant negative : Mutation within one allele of a gene that creates
        #   a significantly greater deleterious effect on gene product
        #   function than a monoallelic loss of function mutation
        # Activating : Mutation, usually missense that results in
        #   a constitutive functional activation of the gene product
        # Increased gene dosage : Copy number variation that increases
        #   the functional dosage of the gene
        # Cis-regulatory or promotor mutation : Mutation in cis-regulatory
        #   elements that lies outwith the known transcription unit and
        #   promotor of the controlled gene
        # Uncertain : Where the exact nature of the mutation is unclear or
        #   not recorded

        type_id = self.resolve(consequence, mandatory=False)
        if type_id == consequence:
            LOG.warning("Consequence type unmapped: %s", str(consequence))
            type_id = self.globaltt['sequence_variant']

        # make the allele
        allele_id = ''.join((gene_id, type_id))
        allele_id = re.sub(r':', '', allele_id)
        allele_id = self.make_id(allele_id)  # make this a BNode
        allele_label = ' '.join((consequence, 'allele in', gene_symbol))

        self.model.addIndividualToGraph(allele_id, allele_label, type_id)
        self.geno.addAlleleOfGene(allele_id, gene_id)

        return allele_id
Beispiel #36
0
    def process_gaf(self, gaffile, limit, id_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", gaffile)
        uniprot_hit = 0
        uniprot_miss = 0
        col = self.gaf_columns

        with gzip.open(gaffile, 'rb') as csvfile:
            reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter='\t',
                                quotechar='\"')
            for row in reader:
                # comments start with exclamation
                if row[0][0] == '!':
                    continue

                if len(row) != len(col):
                    LOG.error(
                        "Wrong number of columns %i, expected ... got:\n\t%s",
                        len(col), row)
                    exit(1)

                dbase = row[col.index('DB')].strip()
                gene_num = row[col.index('DB_Object_ID')].strip()
                gene_symbol = row[col.index('DB_Object_Symbol')].strip()
                qualifier = row[col.index('Qualifier')]
                go_id = row[col.index('GO_ID')].strip()
                ref = row[col.index('DB:Reference')].strip()
                eco_symbol = row[col.index('Evidence Code')].strip()
                with_or_from = row[col.index('With (or) From')]
                aspect = row[col.index('Aspect')].strip()
                gene_name = row[col.index('DB_Object_Name')]
                gene_synonym = row[col.index('DB_Object_Synonym')]
                # object_type = row[col.index('DB_Object_Type')].strip()
                taxon = row[col.index('Taxon and Interacting taxon')].strip()
                # date = row[col.index('Date')].strip()
                # assigned_by = row[col.index('Assigned_By')].strip()
                # annotation_extension = row[col.index('Annotation_Extension')]
                # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

                # test for required fields
                if '' in [row[:10], row[12]]:
                    LOG.error(
                        "Missing required part of annotation on row %i:\n%s",
                        reader.line_num, str(row[:-4]))
                    continue

                # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None:
                        # try/except much faster than checking
                        # for dict key membership
                        try:
                            gene_id = id_map[gene_num]
                            uniprotid = ':'.join((dbase, gene_num))
                            (dbase, gene_num) = gene_id.split(':')
                            uniprot_hit += 1
                        except KeyError:
                            # LOG.warning(
                            #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                            #    gene_num)
                            uniprot_miss += 1
                            continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                        gene_num not in self.test_ids:
                    continue

                model.addLabel(gene_id, gene_symbol)
                model.addType(gene_id, self.globaltt['gene'])

                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        syn = syn.strip()
                        if syn[:10] == 'UniProtKB:':
                            model.addTriple(gene_id,
                                            self.globaltt['has gene product'],
                                            syn)
                        elif re.fullmatch(graph.curie_regexp, syn) is not None and\
                                syn.split(':')[0] not in self.wont_prefix:
                            syn = syn.strip()
                            LOG.warning(
                                'possible curie "%s" as a literal synomym for %s',
                                syn, gene_id)
                            if syn != '':
                                model.addSynonym(gene_id, syn)
                        elif syn != '':
                            model.addSynonym(gene_id, syn)

                # First taxon is for the gene, after the pipe are interacting taxa
                tax_curie = taxon.split('|')[0].replace('taxon', 'NCBITaxon')
                # this is a required field but good to safe
                if tax_curie:
                    geno.addTaxon(tax_curie, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = self.gaf_eco[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[-2]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                ########################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = with_or_from.split('|')
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for itm in withitems:
                        if itm == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                            LOG.warning("Skipping  %s from or with %s",
                                        uniprotid, itm)
                            continue
                        # sanity check/conversion on go curie prefix
                        (pfx, lclid) = itm.split(':')[-2:]  # last prefix wins
                        if pfx in self.localtt:
                            pfx = self.localtt[pfx]
                        itm = ':'.join((pfx, lclid))

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', itm):
                            targeted_gene_id = self.zfin.make_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', itm):
                            targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, itm,
                                             phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[-2]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(self.globaltt[
                                    'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be the evidence for the GO assoc?

                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the mapping download",
                uniprot_per, uniprot_tot)
Beispiel #37
0
    def _add_study_provenance(self, phenotyping_center, colony,
                              project_fullname, pipeline_name,
                              pipeline_stable_id, procedure_stable_id,
                              procedure_name, parameter_stable_id,
                              parameter_name, statistical_method,
                              resource_name):
        """
        :param phenotyping_center: str, from self.files['all']
        :param colony: str, from self.files['all']
        :param project_fullname: str, from self.files['all']
        :param pipeline_name: str, from self.files['all']
        :param pipeline_stable_id: str, from self.files['all']
        :param procedure_stable_id: str, from self.files['all']
        :param procedure_name: str, from self.files['all']
        :param parameter_stable_id: str, from self.files['all']
        :param parameter_name: str, from self.files['all']
        :param statistical_method: str, from self.files['all']
        :param resource_name: str, from self.files['all']
        :return: study bnode
        """

        provenance_model = Provenance(self.graph)
        model = Model(self.graph)

        # Add provenance
        # A study is a blank node equal to its parts
        study_bnode = self.make_id(
            "{0}{1}{2}{3}{4}{5}{6}{7}".format(
                phenotyping_center, colony, project_fullname,
                pipeline_stable_id, procedure_stable_id, parameter_stable_id,
                statistical_method, resource_name), '_')

        model.addIndividualToGraph(study_bnode, None, self.globaltt['study'])

        # List of nodes linked to study with has_part property
        study_parts = []

        pipeline_curie = 'IMPC-pipe:' + pipeline_stable_id
        procedure_curie = 'IMPC-proc:' + procedure_stable_id
        parameter_curie = 'IMPC-param:' + procedure_stable_id + '#' + parameter_stable_id

        # Add study parts

        model.addIndividualToGraph(procedure_curie, procedure_name)
        study_parts.append(procedure_curie)  # ? stable or curie

        study_parts.append(self.resolve(statistical_method))
        provenance_model.add_study_parts(study_bnode, study_parts)

        # Add parameter/measure statement: study measures parameter
        parameter_label = "{0} ({1})".format(parameter_name, procedure_name)

        # logging.info("Adding Provenance for %s", project_fullname)
        model.addIndividualToGraph(parameter_curie, parameter_label)
        provenance_model.add_study_measure(study_bnode,
                                           parameter_curie,
                                           object_is_literal=False)

        # Add Colony
        colony_bnode = self.make_id("{0}".format(colony), '_')
        model.addIndividualToGraph(colony_bnode, colony)

        # Add study agent
        phenotyping_center_id = self.localtt[phenotyping_center]
        model.addIndividualToGraph(phenotyping_center_id, phenotyping_center,
                                   self.globaltt['organization'])

        # self.graph
        model.addTriple(study_bnode, self.globaltt['has_agent'],
                        phenotyping_center_id)

        # add pipeline and project
        model.addIndividualToGraph(pipeline_curie, pipeline_name)
        # self.graph
        model.addTriple(study_bnode, self.globaltt['part_of'], pipeline_curie)

        if project_fullname in self.localtt:
            project_fullname_id = self.localtt[project_fullname]
        else:
            project_fullname_id = self.resolve(project_fullname)

        model.addIndividualToGraph(project_fullname_id, project_fullname,
                                   self.globaltt['project'])

        # self.graph
        model.addTriple(study_bnode, self.globaltt['part_of'],
                        project_fullname_id)

        return study_bnode
Beispiel #38
0
    def _process_QTLs_genomic_location(
            self, raw, taxon_id, build_id, build_label, limit=None):
        """
        This method

        Triples created:

        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        geno = Genotype(g)
        # assume that chrs get added to the genome elsewhere
        # genome_id = geno.makeGenomeID(taxon_id)  # TODO unused

        eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence
        logger.info("Processing QTL locations for %s", taxon_id)
        with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            # bad_attr_flag = False  # TODO unused
            for row in reader:
                line_counter += 1
                if re.match(r'^#', ' '.join(row)):
                    continue

                (chromosome, qtl_source, qtl_type, start_bp, stop_bp, frame,
                 strand, score, attr) = row

                # Chr.Z   Animal QTLdb    Production_QTL  33954873      34023581        .       .       .
                # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
                # trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
                # CMO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
                # Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
                # Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01"

                # make dictionary of attributes
                # keys are:
                # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
                # VTO_name,Map_Type,Significance,P-value,Model,
                # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
                # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
                # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
                # trait (duplicate with Name),Variance,Bayes-value,
                # F-Stat,LOD-score,Additive_Effect,Dominance_Effect,
                # Likelihood_Ratio,LS-means

                # deal with poorly formed attributes
                if re.search(r'"FlankMarkers";', attr):
                    attr = re.sub(r'FlankMarkers;', '', attr)
                attr_items = re.sub(r'"', '', attr).split(";")
                bad_attrs = set()
                for a in attr_items:
                    if not re.search(r'=', a):
                        # bad_attr_flag = True  # TODO unused
                        # remove this attribute from the list
                        bad_attrs.add(a)

                attr_set = set(attr_items) - bad_attrs
                attribute_dict = dict(item.split("=") for item in attr_set)

                qtl_num = attribute_dict.get('QTL_ID')
                if self.testMode and int(qtl_num) not in self.test_ids:
                    continue

                # make association between QTL and trait
                qtl_id = 'AQTL:' + str(qtl_num)
                model.addIndividualToGraph(qtl_id, None, geno.genoparts['QTL'])
                geno.addTaxon(taxon_id, qtl_id)

                trait_id = 'AQTLTrait:'+attribute_dict.get('trait_ID')

                # if pub is in attributes, add it to the association
                pub_id = None
                if 'PUBMED_ID' in attribute_dict.keys():
                    pub_id = attribute_dict.get('PUBMED_ID')
                    if re.match(r'ISU.*', pub_id):
                        pub_id = 'AQTLPub:' + pub_id.strip()
                        reference = Reference(g, pub_id)
                    else:
                        pub_id = 'PMID:' + pub_id.strip()
                        reference = Reference(
                            g, pub_id, Reference.ref_types['journal_article'])
                    reference.addRefToGraph()

                # Add QTL to graph
                assoc = G2PAssoc(
                    g, self.name, qtl_id, trait_id,
                    model.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)
                if 'P-value' in attribute_dict.keys():
                    s = re.sub(r'<', '', attribute_dict.get('P-value'))
                    if ',' in s:
                        s = re.sub(r',', '.', s)
                    if s.isnumeric():
                        score = float(s)
                        assoc.set_score(score)

                assoc.add_association_to_graph()
                # TODO make association to breed
                # (which means making QTL feature in Breed background)

                # get location of QTL
                chromosome = re.sub(r'Chr\.', '', chromosome)
                chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

                chrom_in_build_id = \
                    makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                qtl_feature = Feature(g, qtl_id, None, geno.genoparts['QTL'])
                if start_bp == '':
                    start_bp = None
                qtl_feature.addFeatureStartLocation(
                    start_bp, chrom_in_build_id, strand,
                    [Feature.types['FuzzyPosition']])
                if stop_bp == '':
                    stop_bp = None
                qtl_feature.addFeatureEndLocation(
                    stop_bp, chrom_in_build_id, strand,
                    [Feature.types['FuzzyPosition']])
                qtl_feature.addTaxonToFeature(taxon_id)
                qtl_feature.addFeatureToGraph()

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.warning("Bad attribute flags in this file")
        logger.info("Done with QTL genomic mappings for %s", taxon_id)
        return
Beispiel #39
0
    def _process_data(self, src_key, limit=None):

        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("Processing Data from %s", raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)

        # Add the taxon as a class
        taxon_id = self.globaltt['Mus musculus']
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        col = self.files['all']['columns']
        with gzip.open(raw, 'rt') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            row = next(reader)  # presumed header
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
                marker_accession_id = row[col.index(
                    'marker_accession_id')].strip()
                marker_symbol = row[col.index('marker_symbol')].strip()
                phenotyping_center = row[col.index(
                    'phenotyping_center')].strip()
                colony_raw = row[col.index('colony_id')].strip()
                sex = row[col.index('sex')].strip()
                zygosity = row[col.index('zygosity')].strip()
                allele_accession_id = row[col.index(
                    'allele_accession_id')].strip()
                allele_symbol = row[col.index('allele_symbol')].strip()
                # allele_name = row[col.index('allele_name')]
                strain_accession_id = row[col.index(
                    'strain_accession_id')].strip()
                strain_name = row[col.index('strain_name')].strip()
                # project_name = row[col.index('project_name')]
                project_fullname = row[col.index('project_fullname')].strip()
                pipeline_name = row[col.index('pipeline_name')].strip()
                pipeline_stable_id = row[col.index(
                    'pipeline_stable_id')].strip()
                procedure_stable_id = row[col.index(
                    'procedure_stable_id')].strip()
                procedure_name = row[col.index('procedure_name')].strip()
                parameter_stable_id = row[col.index(
                    'parameter_stable_id')].strip()
                parameter_name = row[col.index('parameter_name')].strip()
                # top_level_mp_term_id = row[col.index('top_level_mp_term_id')]
                # top_level_mp_term_name = row[col.index('top_level_mp_term_name')]
                mp_term_id = row[col.index('mp_term_id')].strip()
                mp_term_name = row[col.index('mp_term_name')].strip()
                p_value = row[col.index('p_value')].strip()
                percentage_change = row[col.index('percentage_change')].strip()
                effect_size = row[col.index('effect_size')].strip()
                statistical_method = row[col.index(
                    'statistical_method')].strip()
                resource_name = row[col.index('resource_name')].strip()

                if self.test_mode and marker_accession_id not in self.gene_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity = zygosity.strip()
                zygosity_id = self.resolve(zygosity)
                if zygosity_id == zygosity:
                    LOG.warning(
                        "Zygosity '%s' unmapped. detting to indeterminate",
                        zygosity)
                    zygosity_id = self.globaltt['indeterminate']

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = '_:IMPC-' + re.sub(
                        r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    # TODO blank nodes do not maintain identifiers
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    LOG.info("Found a strange strain accession...%s",
                             strain_accession_id)
                    strain_accession_id = 'IMPC:' + strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = re.match(
                        r'.*<(.*)>', allele_symbol)
                    if sequence_alteration_name is not None:
                        sequence_alteration_name = sequence_alteration_name.group(
                            1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and marker_accession_id == '':
                    LOG.warning("Marker unspecified on row %d",
                                reader.line_num)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = self.globaltt['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 self.globaltt['gene'])

                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    # TAG bnode
                    sequence_alteration_id = '_:seqalt' + re.sub(
                        r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,  with unknown zygosity

                stem_cell_class = self.globaltt['embryonic stem cell line']

                if colony_id is None:
                    print(colony_raw, stem_cell_class, "\nline:\t",
                          reader.line_num)
                model.addIndividualToGraph(colony_id, colony_raw,
                                           stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:' + re.sub(
                    r':', '',
                    allele_accession_id + self.globaltt['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              self.globaltt['has_variant_part'])

                geno.addPartsToVSLC(vslc_colony, allele_accession_id, None,
                                    self.globaltt['indeterminate'],
                                    self.globaltt['has_variant_part'])
                graph.addTriple(colony_id, self.globaltt['has_genotype'],
                                colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id((colony_id + phenotyping_center +
                                            zygosity + strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = self.globaltt['has_variant_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    LOG.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:' + vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    self.globaltt['variant single locus complement'])
                geno.addPartsToVSLC(vslc_id, allele1_id, allele2_id,
                                    zygosity_id,
                                    self.globaltt['has_variant_part'],
                                    allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(vslc_id,
                              self.globaltt['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(genomic_background_id, strain_name,
                                     self.globaltt['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                        + '-' + colony_raw
                    pheno_center_strain_id = '-'.join(
                        (re.sub(r':', '', genomic_background_id),
                         re.sub(r'\s', '_', phenotyping_center),
                         re.sub(r'\W+', '', colony_raw)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        # Tag bnode
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     self.globaltt['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id((
                        colony_id + phenotyping_center + zygosity +
                        strain_accession_id + sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

                sq_type_id = self.resolve(sex, False)

                if sq_type_id == sex:
                    sq_type_id = self.globaltt['intrinsic_genotype']
                    LOG.warning(
                        "Unknown sex qualifier %s, adding as intrinsic_genotype",
                        sex)

                geno.addGenotype(sex_qualified_genotype_id,
                                 sex_qualified_genotype_label, sq_type_id)
                geno.addParts(genotype_id, sex_qualified_genotype_id,
                              self.globaltt['has_variant_part'])

                if genomic_background_id is not None and genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                # sometimes phenotype ids are missing.  (about 711 early 2020)
                if mp_term_id is None or mp_term_id == '':
                    LOG.warning("No phenotype id specified for row %d",
                                reader.line_num)
                    continue
                # hard coded ECO code
                eco_id = self.globaltt['mutant phenotype evidence']

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and a parameter tested

                assoc = G2PAssoc(graph, self.name, sex_qualified_genotype_id,
                                 mp_term_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                model._addSexSpecificity(assoc_id, self.resolve(sex))

                # add a free-text description
                try:
                    description = ' '.join(
                        (mp_term_name, 'phenotype determined by',
                         phenotyping_center, 'in an', procedure_name,
                         'assay where', parameter_name.strip(),
                         'was measured with an effect_size of',
                         str(round(float(effect_size), 5)), '(p =',
                         "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = ' '.join(
                        (mp_term_name, 'phenotype determined by',
                         phenotyping_center, 'in an', procedure_name,
                         'assay where', parameter_name.strip(),
                         'was measured with an effect_size of',
                         str(effect_size), '(p =', "{0}".format(p_value),
                         ').'))

                study_bnode = self._add_study_provenance(
                    phenotyping_center, colony_raw, project_fullname,
                    pipeline_name, pipeline_stable_id, procedure_stable_id,
                    procedure_name, parameter_stable_id, parameter_name,
                    statistical_method, resource_name)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, p_value, percentage_change, effect_size,
                    study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(graph, assoc_id, resource_id)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Beispiel #40
0
    def _process_QTLs_genetic_location(
            self, raw, taxon_id, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        model = Model(g)
        eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence

        logger.info(
            "Processing genetic location for %s from %s", taxon_id, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id,
                 qtl_symbol,
                 trait_name,
                 assotype,
                 empty,
                 chromosome,
                 position_cm,
                 range_cm,
                 flankmark_a2,
                 flankmark_a1,
                 peak_mark,
                 flankmark_b1,
                 flankmark_b2,
                 exp_id,
                 model_id,
                 test_base,
                 sig_level,
                 lod_score,
                 ls_mean,
                 p_values,
                 f_statistics,
                 variance,
                 bayes_value,
                 likelihood_ratio,
                 trait_id, dom_effect,
                 add_effect,
                 pubmed_id,
                 gene_id,
                 gene_id_src,
                 gene_id_type,
                 empty2) = row

                if self.testMode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = 'AQTL:'+qtl_id.strip()
                trait_id = 'AQTLTrait:'+trait_id.strip()

                # Add QTL to graph
                feature = Feature(g, qtl_id, qtl_symbol, geno.genoparts['QTL'])
                feature.addTaxonToFeature(taxon_id)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_id)
                chrom_in_build_id = makeChromID(
                    chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = \
                            [int(float(x.strip()))
                             for x in re.split(r'-', range_cm)]
                    else:
                        logger.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [Feature.types['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [Feature.types['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None,
                        geno.genoparts['sequence_alteration'])
                    model.addXref(qtl_id, dbsnp_id)

                gene_id = gene_id.replace('uncharacterized ', '')
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided
                    # and gene_id is an integer, it's NCBI
                    if (gene_id_src == 'NCBIgene' or gene_id_src == '') and \
                            gene_id.strip().isdigit() :
                        gene_id = 'NCBIGene:' + gene_id.strip()
                        # we will expect that these labels provided elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = \
                                '_:' + re.sub(
                                    r':', '', gene_id) + '-' + peak_mark.strip()
                            geno.addSequenceAlterationToVariantLocus(
                                dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(trait_id, trait_name)

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    reference = Reference(g, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:'+pubmed_id.strip()
                    reference = Reference(g,
                        pub_id, Reference.ref_types['journal_article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    g, self.name, qtl_id, trait_id,
                    model.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    s = re.sub(r'<', '', p_values)
                    s = re.sub(r',', '.', s)  # international notation
                    if s.isnumeric():
                        score = float(s)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        g, self.name, dbsnp_id, trait_id,
                        model.object_properties['is_marker_for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        s = re.sub(r'<', '', p_values)
                        s = re.sub(r',', '.', s)
                        if s.isnumeric():
                            score = float(s)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info("Done with QTL genetic info")
        return
Beispiel #41
0
    def _process_genes(self, limit=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        src_key = 'genes'
        geno = Genotype(graph)
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        col = self.files[src_key]['columns']
        LOG.info("Processing HGNC genes")

        chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
        band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

            row = next(reader)
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # To generate:
                # head -1 hgnc_complete_set.txt.1 | tr '\t' '\n' |
                # sed "s/\(.*\)/\1 = row[col.index(\'\1\')]/g"

                hgnc_id = row[col.index('hgnc_id')].strip()
                symbol = row[col.index('symbol')].strip()
                name = row[col.index('name')].strip()
                # locus_group = row[col.index('locus_group')]
                locus_type = row[col.index('locus_type')].strip()
                status = row[col.index('status')].strip()
                # 41622 Approved  & 1752 Entry Withdrawn
                location = row[col.index('location')].strip()
                # location_sortable = row[col.index('location_sortable')]
                # alias_symbol = row[col.index('alias_symbol')]
                # alias_name = row[col.index('alias_name')]
                # prev_symbol = row[col.index('prev_symbol')]
                # prev_name = row[col.index('prev_name')]
                # gene_family = row[col.index('gene_family')]
                # gene_family_id = row[col.index('gene_family_id')]
                # date_approved_reserved = row[col.index('date_approved_reserved')]
                # date_symbol_changed = row[col.index('date_symbol_changed')]
                # date_name_changed = row[col.index('date_name_changed')]
                # date_modified = row[col.index('date_modified')]
                entrez_id = row[col.index('entrez_id')].strip()
                ensembl_gene_id = row[col.index('ensembl_gene_id')].strip()
                # vega_id = row[col.index('vega_id')]
                # ucsc_id = row[col.index('ucsc_id')]
                # ena = row[col.index('ena')]
                # refseq_accession = row[col.index('refseq_accession')]
                # ccds_id = row[col.index('ccds_id')]
                # uniprot_ids = row[col.index('uniprot_ids')]
                pubmed_ids = row[col.index('pubmed_id')].strip(
                    '"')  # pipe separated!
                # mgd_id = row[col.index('mgd_id')]
                # rgd_id = row[col.index('rgd_id')]
                # lsdb = row[col.index('lsdb')]
                # cosmic = row[col.index('cosmic')]
                omim_ids = row[col.index('omim_id')].strip()  # pipe separated!
                # mirbase = row[col.index('mirbase')]
                # homeodb = row[col.index('homeodb')]
                # snornabase = row[col.index('snornabase')]
                # bioparadigms_slc = row[col.index('bioparadigms_slc')]
                # orphanet = row[col.index('orphanet')]
                # pseudogene.org = row[col.index('pseudogene.org')]
                # horde_id = row[col.index('horde_id')]
                # merops = row[col.index('merops')]
                # imgt = row[col.index('imgt')]
                # iuphar = row[col.index('iuphar')]
                # kznf_gene_catalog = row[col.index('kznf_gene_catalog')]
                # mamit_trnadb = row[col.index('mamit-trnadb')]
                # cd = row[col.index('cd')]
                # lncrnadb = row[col.index('lncrnadb')]
                # enzyme_id = row[col.index('enzyme_id')]
                # intermediate_filament_db = row[col.index('intermediate_filament_db')]
                # rna_central_ids = row[col.index('rna_central_ids')]
                # lncipedia = row[col.index('lncipedia')]
                # gtrnadb = row[col.index('gtrnadb')]
                # agr = row[col.index('agr')]

                if status != 'Approved':
                    self.withdrawn[hgnc_id] = symbol
                    continue

                if (self.test_mode and entrez_id != ''
                        and entrez_id not in self.gene_ids):
                    continue

                if name == '':
                    name = None

                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id,
                                             old_id_category=blv.terms['Gene'])
                elif symbol[
                        -1] == '@':  # 10)  region (HOX), RNA cluster, gene (PCDH)
                    continue

                else:
                    gene_type_id = self.resolve(locus_type, mandatory=False)
                    if gene_type_id != locus_type:
                        model.addClassToGraph(hgnc_id, symbol, gene_type_id,
                                              name)
                    model.makeLeader(hgnc_id)

                if entrez_id != '':
                    model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

                if ensembl_gene_id != '':
                    model.addEquivalentClass(hgnc_id,
                                             'ENSEMBL:' + ensembl_gene_id)

                for omim_id in omim_ids.split('|'):
                    if omim_id in self.omim_replaced:
                        repl = self.omim_replaced[omim_id]
                        LOG.warning('%s is replaced with %s', omim_id, repl)
                        for omim in repl:
                            if self.omim_type[omim] == self.globaltt['gene']:
                                omim_id = omim

                    if omim_id in self.omim_type and \
                            self.omim_type[omim_id] == self.globaltt['gene']:
                        model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

                geno.addTaxon(self.taxon_curie, hgnc_id)

                # add pubs as "is about"
                for pubmed_id in pubmed_ids.split('|'):
                    pmid = pubmed_id.strip()
                    if pmid is not None and pmid != '':
                        graph.addTriple('PMID:' + pmid,
                                        self.globaltt['is_about'], hgnc_id)

                # add the default taxon to the gene
                graph.addTriple(hgnc_id, self.globaltt['in taxon'],
                                self.taxon_curie)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                band = chrom = None
                chr_match = chr_pattern.match(location)
                if chr_match is not None and chr_match.groups():
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.taxon_curie, 'CHR')
                    band_match = band_pattern.search(location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is not None and band_match.groups():
                        band = band_match.group(1)
                        band = chrom + band
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, self.taxon_curie, 'CHR')
                        model.addClassToGraph(band_id, None)
                        feat.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        feat.addSubsequenceOfFeature(chrom_id)

                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break
Beispiel #42
0
    def process_omia_phenotypes(self, limit):

        # process the whole directory
        # TODO get the file listing
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)

        LOG.info(
            "Processing Monarch OMIA Animal disease-phenotype associations")

        src_key = 'omia_d2p'

        # get file listing
        mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
        file_list = [
            f for f in listdir(mypath)
            if isfile(join(mypath, f)) and re.search(r'.txt$', f)
        ]

        col = self.files[src_key]['columns']
        # reusable initial code generator
        # for c in col:
        #   print(
        #    '# '+str.lower(c.replace(" ",""))+" = row[col.index('"+c+"')].strip()")

        for filename in file_list:
            LOG.info("Processing %s", filename)
            count_missing = 0
            bad_rows = list()
            fname = '/'.join((mypath, filename))
            with open(fname, 'r') as csvfile:
                filereader = csv.reader(csvfile,
                                        delimiter='\t',
                                        quotechar='\"')
                fileheader = next(filereader)
                if fileheader != col:
                    LOG.error('Expected  %s to have columns: %s', fname, col)
                    LOG.error('But Found %s to have columns: %s', fname,
                              fileheader)
                    raise AssertionError(
                        'Incomming data headers have changed.')

                for row in filereader:
                    if len(row) != len(col):
                        LOG.info("Not enough cols %d in %s - please fix",
                                 len(row), filename)
                        continue

                    disease_num = row[col.index('Disease ID')].strip()
                    species_id = row[col.index('Species ID')].strip()
                    breed_name = row[col.index('Breed Name')].strip()
                    # variant = row[col.index('Variant')]
                    # inheritance = row[col.index('Inheritance')]
                    phenotype_id = row[col.index('Phenotype ID')].strip()
                    # phenotype_name = row[col.index('Phenotype Name')]
                    entity_id = row[col.index('Entity ID')].strip()
                    entity_name = row[col.index('Entity Name')]
                    quality_id = row[col.index('Quality ID')].strip()
                    quality_name = row[col.index('Quality Name')]
                    # related_entity_id = row[col.index('Related Entity ID')]
                    # related_entity_name = row[col.index('Related Entity Name')]
                    # abnormal_id = row[col.index('Abnormal ID')]
                    # abnormal_name = row[col.index('Abnormal Name')]
                    # phenotype_desc = row[col.index('Phenotype Desc')]
                    assay = row[col.index('Assay')].strip()
                    # frequency = row[col.index('Frequency')]
                    pubmed_id = row[col.index('Pubmed ID')].strip()
                    phenotype_description = row[col.index('Pub Desc')].strip()
                    curator_notes = row[col.index('Curator Notes')].strip()
                    # date_created = row[col.index('Date Created')]

                    if phenotype_id == '':
                        # LOG.warning('Missing phenotype in row:\n%s', row)
                        count_missing += 1
                        bad_rows.append(row)
                        continue
                    if len(str(disease_num)) < 6:
                        disease_num = str(disease_num).zfill(6)
                    disease_id = 'OMIA:' + disease_num
                    if species_id != '':
                        disease_id = '-'.join((disease_id, species_id))
                    assoc = D2PAssoc(graph, self.name, disease_id,
                                     phenotype_id)
                    if pubmed_id != '':
                        for pnum in re.split(r'[,;]', pubmed_id):
                            pnum = re.sub(r'[^0-9]', '', pnum)
                            pmid = 'PMID:' + pnum
                            assoc.add_source(pmid)
                    else:
                        assoc.add_source('/'.join(
                            (self.curie_map['OMIA'] + disease_num,
                             species_id)))
                    assoc.add_association_to_graph()
                    aid = assoc.get_association_id()
                    if phenotype_description != '':
                        model.addDescription(aid, phenotype_description)
                    if breed_name != '':
                        model.addDescription(aid,
                                             breed_name + ' [observed in]')
                    if assay != '':
                        model.addDescription(aid, assay + ' [assay]')
                    if curator_notes != '':
                        model.addComment(aid, curator_notes)

                    if entity_id != '' or quality_id != '':
                        LOG.info("EQ not empty for %s: %s + %s", disease_id,
                                 entity_name, quality_name)
            if count_missing > 0:
                LOG.warning(
                    "We are missing %d of %d D2P annotations from id %s",
                    count_missing, filereader.line_num - 1, filename)
                LOG.warning("Bad rows:\n%s",
                            '\n'.join([str(x) for x in bad_rows]))
            # finish loop through all files

        return
Beispiel #43
0
    def _get_chrbands(self, limit, taxon, genome_id=None):
        """
        For the given taxon, it will fetch the chr band file.
        We will not deal with the coordinate information with this parser.
        Here, we only are concerned with building the partonomy.
        :param limit:
        :param: taxon:
        :param: genome
        :return:

        """
        model = Model(self.graph)
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        LOG.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:' + taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        if genome_id is None:
            genome_id = geno.makeGenomeID(
                taxon_id)  # makes a blank node allways
        geno.addGenome(taxon_id, genome_label, genome_id)
        model.addOWLPropertyClassRestriction(genome_id,
                                             self.globaltt['in taxon'],
                                             taxon_id)

        placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
        # currently unused patterns
        # unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
        # unplaced_scaffold_pattern = r'chrUn_(\w+)'

        col = ['chrom', 'start', 'stop', 'band', 'rtype']
        with gzip.open(myfile, 'rb') as reader:
            for line in reader:
                line_counter += 1
                # skip comments
                line = line.decode().strip()
                if line[0] == '#':
                    continue
                # chr13	4500000	10000000	p12	stalk
                row = line.split('\t')
                chrom = row[col.index('chrom')]
                band = row[col.index('band')]
                rtype = row[col.index('rtype')]
                # NOTE
                # some less-finished genomes have placed and unplaced scaffolds
                # * Placed scaffolds:
                #    Scaffold has an oriented location within a chromosome.
                # * Unlocalized scaffolds:
                #     scaffold 's chromosome  is known,
                #     scaffold's position, orientation or both is not known.
                # *Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to.
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1

                mch = re.match(placed_scaffold_pattern + r'$', chrom)
                if mch is not None and len(mch.groups()) == 1:
                    # the chromosome is the first match of the pattern
                    # chrom = m.group(1)  # TODO unused
                    pass
                else:
                    # let's skip over anything that isn't a placed_scaffold
                    LOG.info("Skipping non-placed chromosome %s", chrom)
                    continue
                # the chrom class, taxon as the reference
                cclassid = makeChromID(chrom, taxon, 'CHR')

                # add the chromosome as a class
                geno.addChromosomeClass(chrom, taxon_id, genome_label)
                model.addOWLPropertyClassRestriction(
                    cclassid, self.globaltt['member of'], genome_id)

                # add the band(region) as a class
                maplocclass_id = cclassid + band
                maplocclass_label = makeChromLabel(chrom + band, genome_label)
                if band is not None and band.strip() != '':

                    region_type_id = self.map_type_of_region(rtype)
                    model.addClassToGraph(maplocclass_id, maplocclass_label,
                                          region_type_id)
                else:
                    region_type_id = self.globaltt['chromosome']

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    if region_type_id in [
                            self.globaltt['chromosome_band'],
                            self.globaltt['chromosome_subband']
                    ]:
                        stain_type = self.resolve(rtype)
                        if stain_type is not None:
                            model.addOWLPropertyClassRestriction(
                                maplocclass_id,
                                self.globaltt['has_sequence_attribute'],
                                self.resolve(rtype))
                    else:
                        # usually happens if it's a chromosome because
                        # they don't actually have banding info
                        LOG.info("feature type %s != chr band", region_type_id)
                else:
                    LOG.info('staining type not found for: %s', rtype)

                # get the parent bands, and make them unique
                parents = list(self.make_parent_bands(band, set()))
                # alphabetical sort will put them in smallest to biggest
                parents.sort(reverse=True)

                # print("PARENTS of", maplocclass_id, "=", parents)
                # add the parents to the graph, in hierarchical order
                # TODO this is somewhat inefficient due to
                # re-adding upper-level nodes when iterating over the file
                for prnt in parents:
                    parent = prnt.strip()
                    if parent is None or parent == "":
                        continue
                    pclassid = cclassid + parent  # class chr parts
                    pclass_label = makeChromLabel(chrom + parent, genome_label)
                    rti = getChrPartTypeByNotation(parent, self.graph)
                    model.addClassToGraph(pclassid, pclass_label, rti)

                    # for canonical chromosomes,
                    # then the subbands are subsequences of the full band
                    # add the subsequence stuff as restrictions

                    if prnt != parents[-1]:
                        grandparent = 1 + parents.index(prnt)
                        pid = cclassid + parents[grandparent]  # the instance
                        model.addOWLPropertyClassRestriction(
                            pclassid, self.globaltt['is subsequence of'], pid)
                        model.addOWLPropertyClassRestriction(
                            pid, self.globaltt['has subsequence'], pclassid)
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        model.addOWLPropertyClassRestriction(
                            pclassid, self.globaltt['is subsequence of'],
                            cclassid)
                        model.addOWLPropertyClassRestriction(
                            cclassid, self.globaltt['has subsequence'],
                            pclassid)

                # connect the band here to the first one in the parent list
                if len(parents) > 0:
                    model.addOWLPropertyClassRestriction(
                        maplocclass_id, self.globaltt['is subsequence of'],
                        cclassid + parents[0])
                    model.addOWLPropertyClassRestriction(
                        cclassid + parents[0],
                        self.globaltt['has subsequence'], maplocclass_id)

                if limit is not None and line_counter > limit:
                    break
Beispiel #44
0
    def process_feature_loc(self, limit):

        raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        logger.info("Processing Feature location and attributes")
        line_counter = 0
        geno = Genotype(graph)
        strain_to_variant_map = {}
        build_num = self.version_num
        build_id = 'WormBase:' + build_num
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                (chrom, db, feature_type_label, start, end, score, strand,
                 phase, attributes) = row

                # I	interpolated_pmap_position	gene	1	559768	.	.	.	ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
                # I	WormBase	gene	3747	3909	.	-	.	ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
                # I	absolute_pmap_position	gene	4119	10230	.	.	.	ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

                # dbs = re.split(
                #   r' ', 'assembly_component expressed_sequence_match Coding_transcript Genomic_canonical Non_coding_transcript Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda ncRNA operon polyA_signal_sequence polyA_site snlRNA')
                #
                # if db not in dbs:
                #     continue

                if feature_type_label not in [
                        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
                        'duplication', 'enhancer', 'binding_site',
                        'biological_region', 'complex_substitution',
                        'substitution', 'insertion', 'inverted_repeat'
                ]:
                    # note biological_regions include balancers
                    # other options here: promoter, regulatory_region, reagent
                    continue
                line_counter += 1

                attribute_dict = {}
                if attributes != '':
                    attribute_dict = dict(
                        item.split("=")
                        for item in re.sub(r'"', '', attributes).split(";"))

                fid = flabel = desc = None
                if 'ID' in attribute_dict:
                    fid = attribute_dict.get('ID')
                    if re.search(r'WB(Gene|Var|sf)', fid):
                        fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                    elif re.match(r'(gmap|landmark)', fid):
                        continue
                    else:
                        logger.info('other identifier %s', fid)
                        fid = None
                elif 'variation' in attribute_dict:
                    fid = 'WormBase:' + attribute_dict.get('variation')
                    flabel = attribute_dict.get('public_name')
                    sub = attribute_dict.get('substitution')
                    ins = attribute_dict.get('insertion')
                    # if it's a variation:
                    # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                    desc = ''
                    if sub is not None:
                        desc = 'substitution=' + sub
                    if ins is not None:
                        desc = 'insertion=' + ins

                    # keep track of the strains with this variation,
                    # for later processing
                    strain_list = attribute_dict.get('strain')
                    if strain_list is not None:
                        for s in re.split(r',', strain_list):
                            if s.strip() not in strain_to_variant_map:
                                strain_to_variant_map[s.strip()] = set()
                            strain_to_variant_map[s.strip()].add(fid)

                # if feature_type_label == 'RNAi_reagent':
                # Target=WBRNAi00096030 1 4942
                # this will tell us where the RNAi is actually binding
                # target = attribute_dict.get('Target') # TODO unused
                # rnai_num = re.split(r' ', target)[0]  # TODO unused
                # it will be the reagent-targeted-gene that has a position,
                # (i think)
                # TODO finish the RNAi binding location

                name = attribute_dict.get('Name')
                polymorphism = attribute_dict.get('polymorphism')

                if fid is None:
                    if name is not None and re.match(r'WBsf', name):
                        fid = 'WormBase:' + name
                        name = None
                    else:
                        continue

                if self.testMode \
                        and re.sub(r'WormBase:', '', fid) \
                        not in self.test_ids['gene']+self.test_ids['allele']:
                    continue

                # these really aren't that interesting
                if polymorphism is not None:
                    continue

                if name is not None and not re.search(name, fid):
                    if flabel is None:
                        flabel = name
                    else:
                        model.addSynonym(fid, name)

                if desc is not None:
                    model.addDescription(fid, desc)

                alias = attribute_dict.get('Alias')

                biotype = attribute_dict.get('biotype')
                note = attribute_dict.get('Note')
                other_name = attribute_dict.get('other_name')
                for n in [alias, other_name]:
                    if n is not None:
                        model.addSynonym(fid, other_name)

                if feature_type_label == 'gene':
                    ftype_id = self.resolve(biotype)
                else:
                    # so far, they all come with SO label syntax. resolve if need be.
                    ftype_id = self.globaltt[feature_type_label]
                chr_id = makeChromID(chrom, build_id, 'CHR')
                geno.addChromosomeInstance(chrom, build_id, build_num)

                feature = Feature(graph, fid, flabel, ftype_id)
                feature.addFeatureStartLocation(start, chr_id, strand)
                feature.addFeatureEndLocation(start, chr_id, strand)

                feature_is_class = False
                if feature_type_label == 'gene':
                    feature_is_class = True

                feature.addFeatureToGraph(True, None, feature_is_class)

                if note is not None:
                    model.addDescription(fid, note)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

                # RNAi reagents:
# I	RNAi_primary	RNAi_reagent	4184	10232	.	+	.	Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
# I	RNAi_primary	RNAi_reagent	4223	10147	.	+	.	Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
# I	RNAi_primary	RNAi_reagent	5693	9391	.	+	.	Target=WBRNAi00066135 1 3699 +;laboratory=CH

# TODO TF bindiing sites and network:
# I	TF_binding_site_region	TF_binding_site	1861	2048	.	+	.	Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
# I	TF_binding_site_region	TF_binding_site	3403	4072	.	+	.	Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

        return
Beispiel #45
0
    def _get_orthologs(self, limit):
        """
        This will process each of the specified pairwise orthology files,
        creating orthology associations based on the specified orthology code.
        this currently assumes that each of the orthology files is identically
        formatted. Relationships are made between genes here.

        There is also a nominal amount of identifier re-formatting:
        MGI:MGI --> MGI
        Ensembl --> ENSEMBL

        we skip any genes where we don't know how to map the gene identifiers.
        For example, Gene:Huwe1 for RAT is not an identifier, so we skip any
        mappings to this identifier.  Often, the there are two entries for the
        same gene (base on equivalent Uniprot id), and so we are not actually
        losing any information.

        We presently have a hard-coded filter to select only orthology
        relationships where one of the pair is in our species of interest
        (Mouse and Human, for the moment).
        This will be added as a configurable parameter in the future.

        Genes are also added to a grouping class defined with a PANTHER id.

        Triples:
        <gene1_id> RO:othologous <gene2_id>
        <assoc_id> :hasSubject <gene1_id>
        <assoc_id> :hasObject <gene2_id>
        <assoc_id> :hasPredicate <RO:orthologous>
        <assoc_id> dc:evidence ECO:phylogenetic_evidence

        <panther_id> a DATA:gene_family
        <panther_id> RO:has_member <gene1_id>
        <panther_id> RO:has_member <gene2_id>

        :param limit:
        :return:

        """
        LOG.info("getting orthologs")

        if self.test_mode:
            graph = self.testgraph

        else:
            graph = self.graph
        model = Model(graph)
        unprocessed_gene_ids = set()  # may be faster to make a set after

        for src_key in self.files:
            src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
            matchcounter = line_counter = 0
            col = self.files[src_key]['columns']
            reader = tarfile.open(src_file, 'r:gz')

            # assume that the first entry is the item
            fname = reader.getmembers()[0]
            LOG.info("Parsing %s", fname.name)

            with reader.extractfile(fname) as csvfile:
                for line in csvfile:
                    # skip comment lines
                    if re.match(r'^#', line.decode()):
                        LOG.info("Skipping header line")
                        continue
                    line_counter += 1

                    # a little feedback to the user since there's so many
                    if line_counter % 1000000 == 0:
                        LOG.info("Processed %d lines from %s", line_counter,
                                 fname.name)

                    line = line.decode().strip()
                    row = line.split('\t')
                    # parse each row. ancestor_taxons is unused
                    # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                    #   	MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                    #       	LDO	Euarchontoglires	PTHR15964
                    thing1 = row[col.index('thing1')].strip()
                    thing2 = row[col.index('thing2')].strip()
                    orthology_class = row[col.index('orthology_class')].strip()
                    # ancestor_taxons  = row[col.index('')].strip()
                    panther_id = row[col.index('panther_id')].strip()

                    (species_a, gene_a, protein_a) = thing1.split('|')
                    (species_b, gene_b, protein_b) = thing2.split('|')

                    # skip the entries that don't have homolog relationships
                    # with the test ids
                    if self.test_mode and not (
                            re.sub(r'UniProtKB=', '',
                                   protein_a) in self.test_ids
                            or re.sub(r'UniProtKB=', '',
                                      protein_b) in self.test_ids):
                        continue

                    # map the taxon abbreviations to ncbi taxon id numbers
                    taxon_a = self.resolve(species_a).split(':')[1].strip()
                    taxon_b = self.resolve(species_b).split(':')[1].strip()

                    # ###uncomment the following code block
                    # if you want to filter based on taxid of favorite animals
                    # taxids = [9606,10090,10116,7227,7955,6239,8355]
                    # taxids = [9606] #human only
                    # retain only those orthologous relationships to genes
                    # in the specified taxids
                    # using AND will get you only those associations where
                    # gene1 AND gene2 are in the taxid list (most-filter)
                    # using OR will get you any associations where
                    # gene1 OR gene2 are in the taxid list (some-filter)
                    if self.tax_ids is not None and \
                            (taxon_a not in self.tax_ids) and \
                            (taxon_b not in self.tax_ids):
                        continue
                    else:
                        matchcounter += 1
                        if limit is not None and matchcounter > limit:
                            break

                    # ### end code block for filtering on taxon

                    # fix the gene identifiers
                    gene_a = re.sub(r'=', ':', gene_a)
                    gene_b = re.sub(r'=', ':', gene_b)

                    clean_gene = self._clean_up_gene_id(gene_a, species_a)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_a)
                    gene_a = clean_gene
                    clean_gene = self._clean_up_gene_id(gene_b, species_b)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_b)
                    gene_b = clean_gene

                    # a special case here; mostly some rat genes
                    # they use symbols instead of identifiers.  will skip
                    if gene_a is None or gene_b is None:
                        continue

                    rel = self.resolve(orthology_class)

                    evidence_id = self.globaltt['phylogenetic evidence']

                    # add the association and relevant nodes to graph
                    assoc = OrthologyAssoc(graph, self.name, gene_a, gene_b,
                                           rel)
                    assoc.add_evidence(evidence_id)

                    # add genes to graph;
                    # assume labels will be taken care of elsewhere
                    model.addClassToGraph(gene_a, None)
                    model.addClassToGraph(gene_b, None)

                    # might as well add the taxon info for completeness
                    graph.addTriple(gene_a, self.globaltt['in taxon'],
                                    'NCBITaxon:' + taxon_a)
                    graph.addTriple(gene_b, self.globaltt['in taxon'],
                                    'NCBITaxon:' + taxon_b)

                    assoc.add_association_to_graph()

                    # note this is incomplete...
                    # it won't construct the full family hierarchy,
                    # just the top-grouping
                    assoc.add_gene_family_to_graph('PANTHER:' + panther_id)

                    if not self.test_mode \
                            and limit is not None and line_counter > limit:
                        break
                # make report on unprocessed_gene_ids

            LOG.info("finished processing %s", src_file)
            LOG.warning(
                "The following gene ids were unable to be processed: %s",
                str(unprocessed_gene_ids))
Beispiel #46
0
    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is inverse of foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        Turtle:
            <eom id> a owl:Class
                rdf:label Literal(eom label)
                OIO:hasRelatedSynonym Literal(synonym list)
                IAO:definition Literal(objective_def. subjective def)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)


        :param raw:
        :param limit:
        :return:
        """

        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in filereader:
                line_counter += 1
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
                 v_status, v_lastmodified_epoch) = line

                # note:
                # e_uid v_uuid v_last_modified terminology_category_url
                # subcategory v_uid morphology_term_num
                # terminology_category_label hp_label notes
                # are currently unused.

                # Add morphology term to graph as a class
                # with label, type, and description.
                model.addClassToGraph(morphology_term_id,
                                      morphology_term_label)

                # Assemble the description text

                if subjective_definition != '' and not (re.match(
                        r'.+\.$', subjective_definition)):
                    # add a trailing period.
                    subjective_definition = subjective_definition.strip() + '.'
                if objective_definition != '' and not (re.match(
                        r'.+\.$', objective_definition)):
                    # add a trailing period.
                    objective_definition = objective_definition.strip() + '.'

                definition = \
                    '  '.join(
                        (objective_definition, subjective_definition)).strip()

                model.addDefinition(morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    model.addDepiction(morphology_term_id, small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    model.addDepiction(morphology_term_id, large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    model.addComment(morphology_term_id, comments.strip())

                if synonyms != '':
                    for s in synonyms.split(';'):
                        model.addSynonym(morphology_term_id, s.strip(),
                                         self.globaltt['hasExactSynonym'])

                # morphology_term_id hasRelatedSynonym replaces (; delimited)
                if replaces != '' and replaces != synonyms:
                    for s in replaces.split(';'):
                        model.addSynonym(morphology_term_id, s.strip(),
                                         self.globaltt['hasRelatedSynonym'])

                # morphology_term_id has page morphology_term_url
                reference = Reference(self.graph)
                reference.addPage(morphology_term_id, morphology_term_url)

                if limit is not None and line_counter > limit:
                    break
        return
Beispiel #47
0
    def process_gene_interaction(self, limit):
        """
        The gene interaction file includes identified interactions,
        that are between two or more gene (products).
        In the case of interactions with >2 genes, this requires creating
        groups of genes that are involved in the interaction.
        From the wormbase help list: In the example WBInteraction000007779
        it would likely be misleading to suggest that lin-12 interacts with
        (suppresses in this case) smo-1 ALONE or that lin-12 suppresses let-60
        ALONE; the observation in the paper; see Table V in paper PMID:15990876
        was that a lin-12 allele (heterozygous lin-12(n941/+)) could suppress
        the "multivulva" phenotype induced synthetically by simultaneous
        perturbation of BOTH smo-1 (by RNAi) AND let-60 (by the n2021 allele).
        So this is necessarily a three-gene interaction.

        Therefore, we can create groups of genes based on their "status" of
        Effector | Effected.

        Status:  IN PROGRESS

        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        logger.info("Processing gene interaction associations")
        line_counter = 0

        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar="'")

            for row in filereader:
                line_counter += 1
                if re.match(r'#', ''.join(row)):
                    continue

                (interaction_num, interaction_type, interaction_subtype,
                 summary, citation) = row[0:5]
                # print(row)
                interaction_id = 'WormBase:' + interaction_num

                # TODO deal with subtypes
                interaction_type_id = None
                if interaction_type == 'Genetic':
                    interaction_type_id = self.globaltt[
                        'genetically interacts with']
                elif interaction_type == 'Physical':
                    interaction_type_id = self.globaltt[
                        'molecularly_interacts_with']
                elif interaction_type == 'Regulatory':
                    interaction_type_id = self.globaltt['regulates']
                else:
                    logger.info("An interaction type I don't understand %s",
                                interaction_type)

                num_interactors = (len(row) - 5) / 3
                if num_interactors != 2:
                    logger.info(
                        "Skipping interactions with !=2 participants:\n %s",
                        str(row))
                    continue

                gene_a_id = 'WormBase:' + row[5]
                gene_b_id = 'WormBase:' + row[8]

                if self.testMode \
                        and gene_a_id not in self.test_ids['gene'] \
                        and gene_b_id not in self.test_ids['gene']:
                    continue

                assoc = InteractionAssoc(graph, self.name, gene_a_id,
                                         gene_b_id, interaction_type_id)
                assoc.set_association_id(interaction_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                # citation is not a pmid or WBref - get this some other way
                model.addDescription(assoc_id, summary)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        return
Beispiel #48
0
    def _process_omim2gene(self, limit=None):
        """
        This method maps the OMIM IDs and KEGG gene ID.
        Currently split based on the link_type field.
        Equivalent link types are mapped as gene XRefs.
        Reverse link types are mapped as disease to gene associations.
        Original link types are currently skipped.

        Triples created:
        <kegg_gene_id> is a Gene
        <omim_gene_id> is a Gene
        <kegg_gene_id>> hasXref <omim_gene_id>

        <assoc_id> has subject <omim_disease_id>
        <assoc_id> has object <kegg_gene_id>
        :param limit:

        :return:
        """

        logger.info("Processing OMIM to KEGG gene")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        geno = Genotype(g)
        raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (kegg_gene_id, omim_id, link_type) = row

                if self.testMode and \
                        kegg_gene_id not in self.test_ids['genes']:
                    continue

                kegg_gene_id = 'KEGG-'+kegg_gene_id.strip()
                omim_id = re.sub(r'omim', 'OMIM', omim_id)
                if link_type == 'equivalent':
                    # these are genes!
                    # so add them as a class then make equivalence
                    model.addClassToGraph(omim_id, None)
                    geno.addGene(kegg_gene_id, None)
                    model.addEquivalentClass(kegg_gene_id, omim_id)
                elif link_type == 'reverse':
                    # make an association between an OMIM ID & the KEGG gene ID
                    # we do this with omim ids because
                    # they are more atomic than KEGG ids

                    alt_locus_id = self._make_variant_locus_id(kegg_gene_id,
                                                               omim_id)
                    alt_label = self.label_hash[alt_locus_id]
                    model.addIndividualToGraph(alt_locus_id, alt_label,
                                               geno.genoparts['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, kegg_gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)

                    # Add the disease to gene relationship.
                    rel = model.object_properties['is_marker_for']
                    assoc = G2PAssoc(g, self.name, alt_locus_id, omim_id, rel)
                    assoc.add_association_to_graph()

                elif link_type == 'original':
                    # these are sometimes a gene, and sometimes a disease
                    logger.info('Unable to handle original link for %s-%s',
                                kegg_gene_id, omim_id)
                else:
                    # don't know what these are
                    logger.warning('Unhandled link type for %s-%s: %s',
                                   kegg_gene_id, omim_id, link_type)

                if (not self.testMode) and (
                        limit is not None and line_counter > limit):
                    break

        logger.info("Done with OMIM to KEGG gene")

        return
Beispiel #49
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """

    ref_types = {
        'person': 'foaf:Person',
        'journal_article': 'IAO:0000013',
        'publication': 'IAO:0000311',  # book
        'document': 'IAO:0000310',  # document???
        'photograph': 'IAO:0000185',
        'webpage': 'SIO:000302',
    }

    annotation_properties = {'page': 'foaf:page', 'title': 'dc:title'}

    def __init__(self, graph, ref_id=None, ref_type=None):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)

        if ref_type is None:
            self.ref_type = self.ref_types['document']
        else:
            self.ref_type = ref_type

        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id

        return

    def setTitle(self, title):
        self.title = title
        return

    def setYear(self, year):

        self.year = year

        return

    def setType(self, reference_type):

        self.ref_type = reference_type

        return

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """

        self.author_list = author_list
        return

    def addAuthor(self, author):

        self.author_list += [author]

        return

    def setShortCitation(self, citation):
        self.short_citation = citation
        return

    def addPage(self, subject_id, page_url):
        self.graph.addTriple(subject_id,
                             self.annotation_properties['page'],
                             page_url,
                             object_is_literal=True)
        return

    def addTitle(self, subject_id, title):
        self.graph.addTriple(subject_id,
                             self.annotation_properties['title'],
                             title,
                             object_is_literal=True)
        return

    def addRefToGraph(self):

        n = self.short_citation
        if n is None:
            n = self.title

        if self.ref_url is not None:
            self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            self.model.addLabel(self.ref_url, n)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, n, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
        return
Beispiel #50
0
    def _process_ortholog_classes(self, limit=None):
        """
        This method add the KEGG orthology classes to the graph.

        If there's an embedded enzyme commission number,
        that is added as an xref.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has description <orthology_description>
        :param limit:

        :return:
        """

        logger.info("Processing ortholog classes")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (orthology_class_id, orthology_class_name) = row

                if self.testMode and \
                        orthology_class_id not in \
                        self.test_ids['orthology_classes']:
                    continue

                # The orthology class is essentially a KEGG gene ID
                # that is species agnostic.
                # Add the ID and label as a gene family class

                other_labels = re.split(r'[;,]', orthology_class_name)
                # the first one is the label we'll use
                orthology_label = other_labels[0]

                orthology_class_id = 'KEGG-'+orthology_class_id.strip()

                orthology_type = OrthologyAssoc.terms['gene_family']
                model.addClassToGraph(orthology_class_id, orthology_label,
                                      orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms
                    # todo skip the first
                    for s in other_labels:
                        model.addSynonym(orthology_class_id, s.strip())

                    # add the last one as the description
                    d = other_labels[len(other_labels)-1]
                    model.addDescription(orthology_class_id, d)

                    # add the enzyme commission number (EC:1.2.99.5)as an xref
                    # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
                    # can also have a dash, like EC:1.10.3.-
                    ec_matches = re.findall(r'((?:\d+|\.|-){5,7})', d)
                    if ec_matches is not None:
                        for ecm in ec_matches:
                            model.addXref(orthology_class_id, 'EC:'+ecm)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info("Done with ortholog classes")
        return
Beispiel #51
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """
    def __init__(self, graph, ref_id=None, ref_type=None):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("%s is not a graph", graph)

        # assert ref_id is not None

        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of:  %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

        return

    def setTitle(self, title):
        self.title = title
        return

    def setYear(self, year):

        self.year = year

        return

    def setType(self, reference_type):

        self.ref_type = reference_type

        return

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """

        self.author_list = author_list
        return

    def addAuthor(self, author):

        self.author_list += [author]

        return

    def setShortCitation(self, citation):
        self.short_citation = citation
        return

    def addPage(self, subject_id, page_url):
        self.graph.addTriple(
            subject_id,
            self.globaltt['page'],  # foaf:page  not  <sio:web page>
            page_url,
            object_is_literal=True)
        return

    def addTitle(self, subject_id, title):
        if title is not None and title != '':
            self.graph.addTriple(subject_id,
                                 self.globaltt['title (dce)'],
                                 title,
                                 object_is_literal=True)
        return

    def addRefToGraph(self):

        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for auth in self.author_list:
        #        gu.addTriple(
        #           graph, self.ref_id, self.props['has_author'], auth, True)
        return
Beispiel #52
0
class Dataset:
    """
     This class produces metadata about a dataset that is compliant with the
     HCLS dataset specification:
     https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/#s4_4

     Summary level: The summary level provides a description of a dataset that is
     independent of a specific version or format. (e.g. the Monarch ingest of CTD)
     CURIE for this is something like MonarchData:[SOURCE IDENTIFIER]

     Version level: The version level captures version-specific characteristics of a
     dataset. (e.g. the 01-02-2018 ingest of CTD)
     CURIE for this is something like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP]

     Distribution level: The distribution level captures metadata about a specific form
     and version of a dataset (e.g. turtle file for 01-02-2018 ingest of CTD). There is
     a [distribution level resource] for each different downloadable file we emit,
     i.e. one for the TTL file, one for the ntriples file, etc.
     CURIE for this is like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].ttl
     or
     MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].nt
     or
     MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].[whatever file format]

     We write out at least the following triples:

     SUMMARY LEVEL TRIPLES:
     [summary level resource] - rdf:type -> dctypes:Dataset
     [summary level resource] - dct:title -> title (literal)
     [summary level resource] - dct:description -> description (literal)
                                                (use docstring from Source class)
     [summary level resource] - dcterms:source -> [source web page, e.g. omim.org]
     [summary level resource] - schemaorg:logo -> [source logo IRI]
     [summary level resource] - dct:publisher -> monarchinitiative.org
        n.b: about summary level resource triples:
        -- HCLS spec says we "should" link to our logo and web page, but I'm not,
        because it would confuse the issue of whether we are pointing to our logo/page
        or the logo/page of the data source for this ingest. Same below for
        [version level resource] and [distibution level resource] - I'm not linking to
        our page/logo down there either.
        - spec says we "should" include summary level triples describing Update
        frequency and SPARQL endpoint but I'm omitting this for now, because these are
        not clearly defined at the moment

     VERSION LEVEL TRIPLES:
     [version level resource] - rdf:type -> dctypes:Dataset
     [version level resource] - dct:title -> version title (literal)
     [version level resource] - dct:description -> version description (literal)
     [version level resource] - dct:created -> ingest timestamp [ISO 8601 compliant]
     [version level resource] - pav:version -> ingest timestamp (same one above)
     [version level resource] - dct:creator	-> monarchinitiative.org
     [version level resource] - dct:publisher -> monarchinitiative.org
     [version level resource] - dct:isVersionOf -> [summary level resource]
     [version level resource] - dcterms:source -> [source file 1 IRI]
     [version level resource] - dcterms:source -> [source file 2 IRI]
     ...

     [source file 1 IRI] - pav:retrievedOn -> [download date timestamp]
     [source file 2 IRI] - pav:version -> [source version (if set, optional)]
     [source file 2 IRI] - pav:retrievedOn -> [download date timestamp]
     [source file 2 IRI] - pav:version -> [source version (if set, optional)]
     ...

     [version level resource] - pav:createdWith -> [Dipper github URI]
     [version level resource] - void:dataset -> [distribution level resource]

     [version level resource] - cito:citesAsAuthoriy -> [citation id 1]
     [version level resource] - cito:citesAsAuthoriy -> [citation id 2]
     [version level resource] - cito:citesAsAuthoriy -> [citation id 3]

        n.b: about version level resource triples:
        - spec says we "should" include Date of issue/dct:issued triple, but I'm not
        because it is redundant with this triple above:
        [version level resource] - dct:created -> time stamp
        and would introduce ambiguity and confusion if the two disagree. Same below
        for [distribution level resource] - dct:created -> tgiime stamp below
        Also omitting:
          - triples linking to our logo and page, see above.
          - License/dct:license triple, because we will make this triple via the
            [distribution level resource] below
          - Language/dct:language triple b/c it seems superfluous. Same below for
            [distribution level resource] - no language triple.
        - [version level resource] - pav:version triple is also a bit redundant
        with the pav:version triple below, but the spec requires both these triples
        - I'm omitting the [version level resource] -> pav:previousVersion because
        Dipper doesn't know this info for certain at run time. Same below for
        [distribution level resource] - pav:previousVersion.


     DISTRIBUTION LEVEL TRIPLES:
     [distribution level resource] - rdf:type -> dctypes:Dataset
     [distribution level resource] - rdf:type -> dcat:Distribution
     [distribution level resource] - dct:title -> distribution title (literal)
     [distribution level resource] - dct:description -> distribution description (lit.)
     [distribution level resource] - dct:created -> ingest timestamp[ISO 8601 compliant]
     [distribution level resource] - pav:version -> ingest timestamp (same as above)
     [distribution level resource] - dct:creator -> monarchinitiative.org
     [distribution level resource] - dct:publisher -> monarchinitiative.org
     [distribution level resource] - dct:license -> [license info, if available
                    otherwise indicate unknown]
     [distribution level resource] - dcterms:rights -> [data rights IRI]
     [distribution level resource] - pav:createdWith -> [Dipper github URI]
     [distribution level resource] - dct:format -> [IRI of ttl|nt|whatever spec]
     [distribution level resource] - dct:downloadURL -> [ttl|nt URI]
     [distribution level resource] - void:triples -> [triples count (literal)]
     [distribution level resource] - void:entities -> [entities count (literal)]
     [distribution level resource] - void:distinctSubjects -> [subject count (literal)]
     [distribution level resource] - void:distinctObjects -> [object count (literal)]
     [distribution level resource] - void:properties -> [properties count (literal)]
     ...

        n.b: about distribution level resource triples:
        - omitting Vocabularies used/void:vocabulary and Standards
        used/dct:conformTo triples, because they are described in the ttl file
        - also omitting Example identifier/idot:exampleIdentifier and
        Example resource/void:exampleResource, because we don't really have one
        canonical example of either - they're all very different.
        - [distribution level resource] - dct:created should have the exact same
        time stamp as this triple above:
        [version level resource] - dct:created -> time stamp
        - this [distribution level resource] - pav:version triple should have the
        same object as [version level resource] - pav:version triple above
        - Data source provenance/dct:source triples are above in the
        [version level resource]
        - omitting Byte size/dct:byteSize, RDF File URL/void:dataDump, and
        Linkset/void:subset triples because they probably aren't necessary for MI right
        now
        - these triples "should" be emitted, but we will do this in a later iteration:
        # of classes	void:classPartition	IRI
        # of literals	void:classPartition	IRI
        # of RDF graphs	void:classPartition	IRI

     Note: Do not use blank nodes in the dataset graph. This dataset graph is added to
     the main Dipper graph in Source.write() like so

        $ mainGraph = mainGraph + datasetGraph

     which apparently in theory could lead to blank node ID collisions between the two
     graphs.

     Note also that this implementation currently does not support producing metadata
     for StreamedGraph graphs (see dipper/graph/StreamedGraph.py). StreamedGraph is
     currently not being used for any ingests, so this isn't a problem. There was
     talk of using StreamedGraph for a rewrite/refactor of the Clinvar ingest, which
     would probably require adding support here for StreamedGraph's.
    """
    def __init__(
            self,
            identifier,
            data_release_version,
            ingest_name,
            ingest_title,
            ingest_url,
            ingest_logo=None,
            ingest_description=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None,
            distribution_type='ttl',
            dataset_curie_prefix='MonarchArchive'):

        if graph_type is None:
            self.graph = RDFGraph(None,
                                  ":".join([dataset_curie_prefix, identifier]))
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True,
                                       ":".join(
                                           [dataset_curie_prefix, identifier]),
                                       file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True,
                                  ':'.join([dataset_curie_prefix, identifier]))

        if data_release_version is not None:
            self.data_release_version = data_release_version
        else:
            self.data_release_version = datetime.today().strftime("%Y%m%d")

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.identifier = ':'.join([dataset_curie_prefix, identifier])
        self.citation = set()

        self.ingest_name = ingest_name
        self.ingest_title = ingest_title
        if self.ingest_title is None:
            self.ingest_title = ":".join([dataset_curie_prefix, identifier])

        self.ingest_url = ingest_url
        self.ingest_logo = self.curie_map.get('MonarchLogoRepo') + ingest_logo
        self.ingest_description = ingest_description

        self.date_issued = None

        self.license_url = license_url
        self.data_rights = data_rights
        self.distribution_type = distribution_type

        # set HCLS resource CURIEs
        self.summary_level_curie = ':'.join(
            [dataset_curie_prefix, '#' + identifier])
        self.version_level_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/#' + identifier
        self.distribution_level_turtle_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/rdf/' + \
            identifier + "." + self.distribution_type

        # The following might seem a little odd, but we need to set downloadURLs this
        # way in order for them to point to where they will end up in archive.MI.org as
        # of Sept 2019. URL is:
        #  https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
        self.download_url = \
            self.curie_map.get("MonarchArchive") + self.data_release_version + \
            "/rdf/" + self.ingest_name + "." + self.distribution_type

        self._set_summary_level_triples()
        self._set_version_level_triples()
        self._set_distribution_level_triples()

    def _set_summary_level_triples(self):
        self.model.addType(self.summary_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(self.summary_level_curie, self.globaltt['title'],
                             self.ingest_title, True)
        self.model.addTriple(self.summary_level_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))
        self.model.addTriple(self.summary_level_curie, "schemaorg:logo",
                             self.ingest_logo)
        self.graph.addTriple(self.summary_level_curie,
                             self.globaltt['identifier'],
                             self.summary_level_curie)
        if self.ingest_url is not None:
            self.graph.addTriple(self.summary_level_curie,
                                 self.globaltt["Source (dct)"],
                                 self.ingest_url)
        if self.ingest_description is not None:
            self.model.addDescription(self.summary_level_curie,
                                      self.ingest_description)

    def _set_version_level_triples(self):
        self.model.addType(self.version_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['title'],
            self.ingest_title + " Monarch version " +
            self.data_release_version, True)
        if self.ingest_description is not None:
            self.model.addDescription(self.version_level_curie,
                                      self.ingest_description)
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['created'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['isVersionOf'],
                             self.summary_level_curie,
                             object_is_literal=False)
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['distribution'],
                             self.distribution_level_turtle_curie,
                             object_is_literal=False)

    def _set_distribution_level_triples(self):
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['Dataset'])
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['distribution'])
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['title'],
            self.ingest_title + " distribution " + self.distribution_type,
            True)
        if self.ingest_description is not None:
            self.model.addDescription(self.distribution_level_turtle_curie,
                                      self.ingest_description)
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['created'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['created_with'],
                             "https://github.com/monarch-initiative/dipper")
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['format'],
                             "https://www.w3.org/TR/turtle/")
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['downloadURL'], self.download_url)
        if self.license_url is None:
            self.graph.addTriple(
                self.distribution_level_turtle_curie, self.globaltt['license'],
                'https://project-open-data.cio.gov/unknown-license/')
        else:
            self.graph.addTriple(self.distribution_level_turtle_curie,
                                 self.globaltt['license'], self.license_url)

        if self.data_rights is not None:
            self.graph.addTriple(self.distribution_level_turtle_curie,
                                 self.globaltt['rights'], self.data_rights)

        self._declare_as_ontology()

    def set_ingest_source_file_version_num(self, file_iri, version):
        """
        This method sets the version of a remote file or resource that is used in the
        ingest. It writes this triple:

        file_iri - 'pav:version' -> version

        Version is an untyped literal

        Note: if your version is a date or timestamp, use
        set_ingest_source_file_version_date()
        instead

        :param file_iri: a remote file or resource used in ingest
        :param version: a number or string (e.g. v1.2.3) that the source (OMIM, CTD)
        uses to refer to this version of the file/resource used during the ingest
        :return: None
        """
        self.graph.addTriple(file_iri,
                             self.globaltt['version'],
                             version,
                             object_is_literal=True)

    def set_ingest_source_file_version_date(self,
                                            file_iri,
                                            date,
                                            datatype=XSD.date):
        """
        This method sets the version that the source (OMIM, CTD, whatever) uses to
        refer to this version of the remote file/resource that was used in the ingest

        It writes this triple:

        file_iri - 'pav:version' -> date or timestamp

        Version is added as a literal of datatype XSD date

        Note: if file_iri was retrieved using get_files(), then the following triple
        was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: a date in YYYYMMDD format that the source (OMIM, CTD). You can
        add timestamp as a version by using a different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        uses to refer to this version of the file/resource used during the ingest
        :return: None
        """
        self.graph.addTriple(file_iri,
                             self.globaltt['version'],
                             date,
                             object_is_literal=True,
                             literal_type=datatype)

    def set_ingest_source_file_version_retrieved_on(self,
                                                    file_iri,
                                                    date,
                                                    datatype=XSD.date):
        """
        This method sets the date on which a remote file/resource (from OMIM, CTD, etc)
        was retrieved.

        It writes this triple:

        file_iri - 'pav:retrievedOn' -> date or timestamp

        Version is added as a literal of datatype XSD date by default

        Note: if file_iri was retrieved using get_files(), then the following triple
        was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: a date in YYYYMMDD format that the source (OMIM, CTD). You can
        add timestamp as a version by using a different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        uses to refer to this version of the file/resource used during the ingest
        :return: None
        """
        self.graph.addTriple(file_iri,
                             self.globaltt['retrieved_on'],
                             date,
                             object_is_literal=True,
                             literal_type=datatype)

    def set_ingest_source(self, url, predicate=None, is_object_literal=False):
        """
        This method writes a triple to the dataset graph indicating that the ingest
        used a file or resource at [url] during the ingest.

        Triple emitted is version_level_curie dcterms:source [url]

        This triple is likely to be redundant if Source.get_files() is used to retrieve
        the remote files/resources, since this triple should also be emitted
        as files/resources are being retrieved. This method is provided as a convenience
        method for sources that do their own downloading of files.

        :param url: a remote resource used as a source during ingest
        :param predicate: the predicate to use for the triple ["dcterms:source"]
                from spec (https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/)
                "Use dct:source when the source dataset was used in whole or in part.
                Use pav:retrievedFrom when the source dataset was used in whole and was
                not modified from its original distribution. Use prov:wasDerivedFrom
                when the source dataset was in whole or in part and was modified from
                its original distribution."
        :return: None
        """
        if predicate is None:
            predicate = self.globaltt["Source (dct)"]
        self.graph.addTriple(self.version_level_curie,
                             predicate,
                             url,
                             object_is_literal=is_object_literal)

    def get_graph(self):
        """
        This method returns the dataset graph
        :param
        :return: dataset graph
        """
        return self.graph

    def get_license(self):
        """
        This method returns the license info
        :param
        :return: license info
        """
        return self.license_url

    def set_citation(self, citation_id):
        """
        This method adds [citaton_id] argument to the set of citations, and also
        adds a triple indicating that version level cito:citesAsAuthority [citation_id]
        :param: citation_id
        :return: none
        """
        self.citation.add(citation_id)
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['citesAsAuthority'], citation_id)

    def _declare_as_ontology(self, version_info=None):
        """
        Declare the distribution level IRI as an ontology, and also make triple
        distribution level IRI - version_iri -> version level IRI

        TEC: I am not convinced dipper reformatting external data as RDF triples
        makes an OWL ontology (nor that it should be considered a goal).

        Proper ontologies are built by ontologists. Dipper reformats data
        and annotates/decorates it with a minimal set of carefully arranged
        terms drawn from from multiple proper ontologies.
        Which allows the whole (dipper's RDF triples and parent ontologies)
        to function as a single ontology we can reason over when combined
        in a store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        :param version_info: a string describing version info for the ontology
        :return:

        """
        model = Model(self.graph)
        model.addOntologyDeclaration(self.summary_level_curie)
        model.addOWLVersionIRI(self.summary_level_curie,
                               self.version_level_curie)
        if version_info is not None:
            model.addOWLVersionInfo(self.distribution_level_turtle_curie,
                                    version_info)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        A method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1
        Duplicated from Source.py to avoid circular imports.
        :param long_string: string to use to generate identifier
        :param prefix: prefix to prepend to identifier [Monarch]
        :return: a Monarch identifier
        """
        return ':'.join((prefix, Dataset.hash_id(long_string)))

    @staticmethod
    def hash_id(word):  # same as graph/GraphUtils.digest_id(wordage)
        """
        Given a string, make a hash
        Duplicated from Source.py.

        :param word: str string to be hashed
        :return: hash of id
        """
        return 'b' + hashlib.sha1(word.encode('utf-8')).hexdigest()[1:20]
Beispiel #53
0
    def _process_kegg_disease2gene(self, limit=None):
        """
        This method creates an association between diseases and
        their associated genes. We are being conservative here, and only
        processing those diseases for which there is no mapping to OMIM.

        Triples created:
        <alternate_locus> is an Individual
        <alternate_locus> has type <variant_locus>
        <alternate_locus> is an allele of  <gene_id>

        <assoc_id> has subject <disease_id>
        <assoc_id> has object <gene_id>
        :param limit:
        :return:

        """

        logger.info("Processing KEGG disease to gene")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        geno = Genotype(g)
        rel = model.object_properties['is_marker_for']
        noomimset = set()
        raw = '/'.join((self.rawdir, self.files['disease_gene']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (gene_id, disease_id) = row

                if self.testMode and gene_id not in self.test_ids['genes']:
                    continue

                gene_id = 'KEGG-'+gene_id.strip()
                disease_id = 'KEGG-'+disease_id.strip()

                # only add diseases for which
                # there is no omim id and not a grouping class
                if disease_id not in self.kegg_disease_hash:
                    # add as a class
                    disease_label = None
                    if disease_id in self.label_hash:
                        disease_label = self.label_hash[disease_id]
                    if re.search(r'includ', str(disease_label)):
                        # they use 'including' when it's a grouping class
                        logger.info(
                            "Skipping this association because " +
                            "it's a grouping class: %s",
                            disease_label)
                        continue
                    # type this disease_id as a disease
                    model.addClassToGraph(disease_id, disease_label, 'DOID:4')
                    noomimset.add(disease_id)
                    alt_locus_id = self._make_variant_locus_id(gene_id,
                                                               disease_id)
                    alt_label = self.label_hash[alt_locus_id]
                    model.addIndividualToGraph(alt_locus_id, alt_label,
                                               geno.genoparts['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)
                    # Add the disease to gene relationship.
                    assoc = G2PAssoc(g, self.name, alt_locus_id, disease_id, rel)
                    assoc.add_association_to_graph()

                if (not self.testMode) and (
                        limit is not None and line_counter > limit):
                    break

        logger.info("Done with KEGG disease to gene")
        logger.info("Found %d diseases with no omim id", len(noomimset))

        return
Beispiel #54
0
    def _parse_g2p_file(self, limit=None):
        """
        Parse gene to XPO file, currently custom for Monarch
        :param limit:
        :return:
        """
        src_key = 'g2p_assertions'
        geno = Genotype(self.graph)
        model = Model(self.graph)

        columns = self.files[src_key]['columns']
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))

        LOG.info("Processing Gene to XPO associations")

        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')

            # File has headers
            row = next(reader)
            if not self.check_fileheader(columns, row):
                pass

            for row in reader:

                gene = row[columns.index('SUBJECT')]
                gene_label = row[columns.index('SUBJECT_LABEL')]
                gene_taxon = row[columns.index('SUBJECT_TAXON')]
                #gene_taxon_label = row[columns.index('SUBJECT_TAXON_LABEL')]
                phenotype_curie = row[columns.index('OBJECT')]
                #phenotype_label = row[columns.index('OBJECT_LABEL')]
                relation = row[columns.index('RELATION')]
                #relation_label = row[columns.index('RELATION_LABEL')]
                evidence = row[columns.index('EVIDENCE')]
                #evidence_label = row[columns.index('EVIDENCE_LABEL')]
                source = row[columns.index('SOURCE')]
                #is_defined_by = row[columns.index('IS_DEFINED_BY')]
                #qualifier = row[columns.index('QUALIFIER')]

                relation_curie = relation.replace('_', ':')

                geno.addGene(gene, gene_label)
                geno.addTaxon(gene_taxon, gene)

                assoc = G2PAssoc(
                    self.graph,
                    self.name,
                    entity_id=gene,
                    phenotype_id=phenotype_curie,
                    rel=relation_curie
                )

                if evidence:
                    assoc.add_evidence(evidence)

                if source:
                    model.addType(source, self.globaltt['journal article'])
                    assoc.add_source(source)

                assoc.add_association_to_graph()

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Beispiel #55
0
    def _process_genes(self, limit=None):
        """
        This method processes the KEGG gene IDs.
        The label for the gene is pulled as
        the first symbol in the list of gene symbols;
        the rest are added as synonyms.
        The long-form of the gene name is added as a definition.
        This is hardcoded to just processes human genes.

        Triples created:
        <gene_id> is a SO:gene
        <gene_id> rdfs:label <gene_name>

        :param limit:
        :return:

        """

        logger.info("Processing genes")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        family = Family(g)
        geno = Genotype(g)
        raw = '/'.join((self.rawdir, self.files['hsa_genes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (gene_id, gene_name) = row

                gene_id = 'KEGG-'+gene_id.strip()

                # the gene listing has a bunch of labels
                # that are delimited, as:
                # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT,
                # EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin
                # it looks like the list is semicolon delimited
                # (symbol, name, gene_class)
                # where the symbol is a comma-delimited list

                # here, we split them up.
                # we will take the first abbreviation and make it the symbol
                # then take the rest as synonyms

                gene_stuff = re.split('r;', gene_name)
                symbollist = re.split(r',', gene_stuff[0])
                first_symbol = symbollist[0].strip()

                if gene_id not in self.label_hash:
                    self.label_hash[gene_id] = first_symbol

                if self.testMode and gene_id not in self.test_ids['genes']:
                    continue

                # Add the gene as a class.
                geno.addGene(gene_id, first_symbol)

                # add the long name as the description
                if len(gene_stuff) > 1:
                    description = gene_stuff[1].strip()
                    model.addDefinition(gene_id, description)

                # add the rest of the symbols as synonyms
                for i in enumerate(symbollist, start=1):
                    model.addSynonym(gene_id, i[1].strip())

                if len(gene_stuff) > 2:
                    ko_part = gene_stuff[2]
                    ko_match = re.search(r'K\d+', ko_part)
                    if ko_match is not None and len(ko_match.groups()) == 1:
                        ko = 'KEGG-ko:'+ko_match.group(1)
                        family.addMemberOf(gene_id, ko)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info("Done with genes")
        return
Beispiel #56
0
    def _process_trait_mappings(self, raw, src_key, limit=None):
        """
        This method mapps traits from/to ...

        Triples created:

        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        model = Model(graph)

        col = self.files[src_key]['columns']

        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            header = next(filereader, None)
            self.check_fileheader(col, header)
            for row in filereader:
                line_counter += 1
                # need to skip the last line
                if len(row) != len(col):
                    LOG.info("skipping line %d: %s", line_counter, '\t'.join(row))
                    continue
                vto_id = row[col.index('VT')].strip()
                pto_id = row[col.index('LPT')].strip()
                cmo_id = row[col.index('CMO')].strip()
                ato_column = row[col.index('ATO')].strip()
                # species = row[col.index('Species')].strip()
                # trait_class = row[col.index('Class')].strip()
                # trait_type = row[col.index('Type')].strip()
                # qtl_count = row[col.index('QTL_Count')].strip()

                ato_id = re.sub(
                    r'ATO #', 'AQTLTrait:', re.sub(
                        r'\].*', '', re.sub(r'\[', '', ato_column)))
                ato_id = ato_id.strip()

                ato_label = re.sub(r'.*\]\s*', '', ato_column)

                model.addClassToGraph(ato_id, ato_label.strip())

                if re.match(r'VT:.*', vto_id):
                    model.addClassToGraph(vto_id, None)
                    model.addEquivalentClass(ato_id, vto_id)
                if re.match(r'LPT:.*', pto_id):
                    model.addClassToGraph(pto_id, None)
                    model.addXref(ato_id, pto_id)
                if re.match(r'CMO:.*', cmo_id):
                    model.addClassToGraph(cmo_id, None)
                    model.addXref(ato_id, cmo_id)

        LOG.info("Done with trait mappings")
        return
Beispiel #57
0
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    """

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def _is_valid(self):
        # check if sub/obj/rel are none...raise error
        if self.sub is None:
            raise ValueError(
                'No subject set for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        if self.obj is None:
            raise ValueError(
                'No object set for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        if self.rel is None:
            raise ValueError(
                'No predicate set for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        # Are subject & predicate, either a curie or IRI
        pfx = self.sub.split(':')[0]
        if pfx not in self.curie_map.keys() and \
                pfx not in ['_', 'http', 'https', 'ftp']:
            raise ValueError(
                'Invalid Subject for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        pfx = self.rel.split(':')[0]
        if pfx not in self.curie_map.keys() and \
                pfx not in ['_', 'http', 'https', 'ftp']:
            raise ValueError(
                'Invalid Predicate for this association <%s> <%s> <%s>',
                self.sub, self.rel, self.obj
            )
        return True

    def add_association_to_graph(self):

        if not self._is_valid():
            return

        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        assert self.assoc_id is not None

        self.model.addType(self.assoc_id, self.model.globaltt['association'])

        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has subject'], self.sub)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has object'], self.obj)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has predicate'], self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence is not None and len(self.evidence) > 0:
            for evi in self.evidence:
                self.graph.addTriple(self.assoc_id, self.globaltt['has evidence'], evi)

        if self.source is not None and len(self.source) > 0:
            for src in self.source:
                # TODO assume that the source is a publication? use Reference class
                self.graph.addTriple(self.assoc_id, self.globaltt['source'], src)

        if self.provenance is not None and len(self.provenance) > 0:
            for prov in self.provenance:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has_provenance'], prov)

        if self.date is not None and len(self.date) > 0:
            for dat in self.date:
                self.graph.addTriple(
                    self.assoc_id,self.globaltt['created_on'], dat,
                    object_is_literal=True)

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has measurement value'], self.score,
                True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):

        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype)
            else:
                self.graph.addTriple(self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(self.assoc_id, predicate, object_node, False)

        return

    # This isn't java, but predecessors favored the use of property decorators
    # and CamelCase and ...
    def set_subject(self, identifier):
        self.sub = identifier
        return

    def set_object(self, identifier):
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
        of the association.
        To be used in cases where an external association identifier
        should be used.

        :param assoc_id:

        :return:

        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return self.assoc_id

    def get_association_id(self):
        if self.assoc_id is None:
            self.set_association_id()

        return self.assoc_id

    def set_description(self, description):
        self.description = description

        return

    def set_score(self, score, unit=None, score_type=None):

        self.score = score
        self.score_unit = unit
        self.score_type = score_type

        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        :param identifier:

        :return:

        """

        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        TODO we need to greatly expand this function!

        :param identifier:

        :return:

        """

        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_date(self, date):
        if date is not None and date.strip() != '':
            self.date += [date]

        return

    def add_provenance(self, identifier):

        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    @staticmethod
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is empty or None, it will convert it to blank.
        It effectively digests the  string of concatonated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appeded to the ID.

        Note this is equivalent to a RDF blank node

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes:

        :return:

        """

        items_to_hash = [definedby, sub, pred, obj]
        if attributes is not None and len(attributes) > 0:
            items_to_hash += attributes

        items_to_hash = [x for x in items_to_hash if x is not None]

        assoc_id = ':'.join(('MONARCH', GraphUtils.digest_id('+'.join(items_to_hash))))
        assert assoc_id is not None
        return assoc_id
Beispiel #58
0
    def _process_qtls_genomic_location(
            self, raw, src_key, txid, build_id, build_label, common_name, limit=None):
        """
        This method

        Triples created:

        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        line_counter = 0
        geno = Genotype(graph)
        # assume that chrs get added to the genome elsewhere

        taxon_curie = 'NCBITaxon:' + txid
        eco_id = self.globaltt['quantitative trait analysis evidence']
        LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
        with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            # no header in GFF, so no header checking
            col = self.files[src_key]['columns']
            for row in reader:
                line_counter += 1
                if re.match(r'^#', ' '.join(row)):
                    continue

                if len(row) != len(col):
                    LOG.warning("Problem parsing in %s row %s\n"
                                "got %s cols but expected %s",
                                raw, row, len(row), len(col))
                    continue
                else:
                    # Doing this non-positional mapping for consistency, but I'm not
                    # sure we need to do this for GFF, since columns in GFF are probably
                    # not going to change anytime soon.
                    chromosome = row[col.index('SEQNAME')].strip()
                    # qtl_source = row[col.index('SOURCE')].strip()
                    # qtl_type = row[col.index('FEATURE')].strip()
                    start_bp = row[col.index('START')].strip()
                    stop_bp = row[col.index('END')].strip()
                    # score = row[col.index('SCORE')].strip()
                    strand = row[col.index('STRAND')].strip()
                    # frame = row[col.index('FRAME')].strip()
                    attr = row[col.index('ATTRIBUTE')].strip()

                example = '''
Chr.Z   Animal QTLdb    Production_QTL  33954873      34023581...
QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
MO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01
                '''
                str(example)
                # make dictionary of attributes
                # keys are:
                # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
                # VTO_name,Map_Type,Significance,P-value,Model,
                # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
                # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
                # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
                # trait (duplicate with Name),Variance,Bayes-value,
                # F-Stat,LOD-score,Additive_Effect,Dominance_Effect,
                # Likelihood_Ratio,LS-means

                # deal with poorly formed attributes
                if re.search(r'"FlankMarkers";', attr):
                    attr = re.sub(r'FlankMarkers;', '', attr)
                attr_items = re.sub(r'"', '', attr).split(";")
                bad_attrs = set()
                for attributes in attr_items:
                    if not re.search(r'=', attributes):
                        # remove this attribute from the list
                        bad_attrs.add(attributes)

                attr_set = set(attr_items) - bad_attrs
                attribute_dict = dict(item.split("=") for item in attr_set)

                qtl_num = attribute_dict.get('QTL_ID')
                if self.test_mode and int(qtl_num) not in self.test_ids:
                    continue
                # make association between QTL and trait based on taxon

                qtl_id = common_name + 'QTL:' + str(qtl_num)
                model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
                geno.addTaxon(taxon_curie, qtl_id)

                #
                trait_id = 'AQTLTrait:' + attribute_dict.get('trait_ID')

                # if pub is in attributes, add it to the association
                pub_id = None
                if 'PUBMED_ID' in attribute_dict.keys():
                    pub_id = attribute_dict.get('PUBMED_ID')
                    if re.match(r'ISU.*', pub_id):
                        pub_id = 'AQTLPub:' + pub_id.strip()
                        reference = Reference(graph, pub_id)
                    else:
                        pub_id = 'PMID:' + pub_id.strip()
                        reference = Reference(
                            graph, pub_id, self.globaltt['journal article'])
                    reference.addRefToGraph()

                # Add QTL to graph
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)
                if 'P-value' in attribute_dict.keys():
                    scr = re.sub(r'<', '', attribute_dict.get('P-value'))
                    if ',' in scr:
                        scr = re.sub(r',', '.', scr)
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)

                assoc.add_association_to_graph()
                # TODO make association to breed
                # (which means making QTL feature in Breed background)

                # get location of QTL
                chromosome = re.sub(r'Chr\.', '', chromosome)
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
                if start_bp == '':
                    start_bp = None
                qtl_feature.addFeatureStartLocation(
                    start_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                if stop_bp == '':
                    stop_bp = None
                qtl_feature.addFeatureEndLocation(
                    stop_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                qtl_feature.addTaxonToFeature(taxon_curie)
                qtl_feature.addFeatureToGraph()

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        # LOG.warning("Bad attribute flags in this file")  # what does this even mean??
        LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
        return
Beispiel #59
0
class Genotype():
    """
    These methods provide convenient methods to
    add items related to a genotype and it's parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.

    """
    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        return

    def addGenotype(self,
                    genotype_id,
                    genotype_label,
                    genotype_type=None,
                    genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:

        """
        if genotype_type is None:
            genotype_type = self.globaltt['intrinsic_genotype']

        self.model.addIndividualToGraph(genotype_id, genotype_label,
                                        genotype_type, genotype_description)
        return

    def addAllele(self,
                  allele_id,
                  allele_label,
                  allele_type=None,
                  allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional,
        recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:

        """

        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.globaltt['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(allele_id, allele_label, allele_type,
                                        allele_description)

        return

    def addGene(self,
                gene_id,
                gene_label=None,
                gene_type=None,
                gene_description=None):
        ''' genes are classes '''
        if gene_type is None:
            gene_type = self.globaltt['gene']
        self.model.addClassToGraph(gene_id, gene_label, gene_type,
                                   gene_description)

        return

    def addConstruct(self,
                     construct_id,
                     construct_label,
                     construct_type=None,
                     construct_description=None):
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    constrcut_type=self.construct_base_type
        self.model.addIndividualToGraph(construct_id, construct_label,
                                        construct_type, construct_description)

        return

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and it's parent genotype.  Adding the parent and child to
        the graph should happen outside of this function call to ensure graph
        integrity.
        :param child_id:
        :param parent_id:
        :return:

        """

        self.graph.addTriple(child_id, self.globaltt['derives_from'],
                             parent_id)

        return

    def addSequenceDerivesFrom(self, child_id, parent_id):
        self.graph.addTriple(child_id, self.globaltt['sequence_derives_from'],
                             parent_id)

        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:is_allele_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt["is_allele_of"]
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:has_affected_feature.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt['has_affected_feature']
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addGeneProduct(self,
                       sequence_id,
                       product_id,
                       product_label=None,
                       product_type=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein
        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :return:

        """
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(product_id, product_label,
                                            product_type)
        self.graph.addTriple(sequence_id, self.globaltt['has gene product'],
                             product_id)

        return

    def addPolypeptide(self,
                       polypeptide_id,
                       polypeptide_label=None,
                       transcript_id=None,
                       polypeptide_type=None):
        """
        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:

        """
        if polypeptide_type is None:
            polypeptide_type = self.globaltt['polypeptide']
        self.model.addIndividualToGraph(polypeptide_id, polypeptide_label,
                                        polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(transcript_id, self.globaltt['translates_to'],
                                 polypeptide_id)

        return

    def addPartsToVSLC(self,
                       vslc_id,
                       allele1_id,
                       allele2_id,
                       zygosity_id=None,
                       allele1_rel=None,
                       allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are traditionally added, you can add any
        node (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:

        """

        # vslc has parts allele1/allele2

        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.globaltt['homozygous']
            else:
                zygosity_id = self.globaltt['heterozygous']

        if zygosity_id is not None:
            self.graph.addTriple(vslc_id, self.globaltt['has_zygosity'],
                                 zygosity_id)

        return

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:
        """

        self.addParts(vslc_id, parent_id, self.globaltt['has_variant_part'])

        return

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:

        """
        if part_relationship is None:
            part_relationship = self.globaltt['has_part']
        # Fail loudly if parent or child identifiers are None
        if parent_id is None:
            raise TypeError('Attempt to pass None as parent')
        elif part_id is None:
            raise TypeError('Attempt to pass None as child')
        elif part_relationship is None:
            part_relationship = self.globaltt['has_part']

        self.graph.addTriple(parent_id, part_relationship, part_id)

        return

    def addSequenceAlteration(self,
                              sa_id,
                              sa_label,
                              sa_type=None,
                              sa_description=None):

        if sa_type is None:
            sa_type = self.globaltt['sequence_alteration']

        self.model.addIndividualToGraph(sa_id, sa_label, sa_type,
                                        sa_description)

        return

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        self.addParts(sa_id, vl_id, self.globaltt['has_variant_part'])
        return

    def addGenomicBackground(self,
                             background_id,
                             background_label,
                             background_type=None,
                             background_description=None):
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addIndividualToGraph(background_id, background_label,
                                        background_type,
                                        background_description)

        return

    def addGenomicBackgroundToGenotype(self,
                                       background_id,
                                       genotype_id,
                                       background_type=None):
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addType(background_id, background_type)
        self.addParts(background_id, genotype_id,
                      self.globaltt['has_reference_part'])

        return

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:

        :return:

        """
        self.graph.addTriple(genopart_id, self.globaltt['in taxon'], taxon_id)

        return

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        # for example, add a morphant reagent thingy to the genotype,
        # assuming it's a extrinsic_genotype
        self.graph.addTriple(genotype_id, self.globaltt['has_variant_part'],
                             reagent_id)

        return

    def addGeneTargetingReagent(self,
                                reagent_id,
                                reagent_label,
                                reagent_type,
                                gene_id,
                                description=None):
        """
        Here, a gene-targeting reagent is added.
        The actual targets of this reagent should be added separately.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:

        :return:

        """

        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(reagent_id, reagent_label,
                                        reagent_type, description)

        self.graph.addTriple(reagent_id, self.globaltt['targets_gene'],
                             gene_id)

        return

    def addReagentTargetedGene(self,
                               reagent_id,
                               gene_id,
                               targeted_gene_id=None,
                               targeted_gene_label=None,
                               description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied,
        we will create it as an anonymous individual which is of the type
        GENO:reagent_targeted_gene.
        We will also add the targets relationship between the reagent and
        gene class.

        <targeted_gene_id> a GENO:reagent_targeted_gene
        rdf:label targeted_gene_label
        dc:description description
        <reagent_id> GENO:targets_gene <gene_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :return:

        """

        # akin to a variant locus
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
            targeted_gene_id = targeted_gene_id.replace(":", "")
        self.model.addIndividualToGraph(targeted_gene_id, targeted_gene_label,
                                        self.globaltt['reagent_targeted_gene'],
                                        description)

        if gene_id is not None:
            self.graph.addTriple(targeted_gene_id,
                                 self.globaltt['is_expression_variant_of'],
                                 gene_id)

        self.graph.addTriple(targeted_gene_id, self.globaltt['is_targeted_by'],
                             reagent_id)

        return

    def addTargetedGeneSubregion(self,
                                 tgs_id,
                                 tgs_label,
                                 tgs_type=None,
                                 tgs_description=None):
        if tgs_type is None:
            tgs_type = self.globaltt['targeted_gene_subregion']

        self.model.addIndividualToGraph(tgs_id, tgs_label, tgs_type,
                                        tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        self.graph.addTriple(population_id,
                             self.globaltt['has_member_with_allelotype'],
                             member_id)
        return

    def addTargetedGeneComplement(self,
                                  tgc_id,
                                  tgc_label,
                                  tgc_type=None,
                                  tgc_description=None):
        if tgc_type is None:
            tgc_type = self.globaltt['targeted_gene_complement']
        self.model.addIndividualToGraph(tgc_id, tgc_label, tgc_type,
                                        tgc_description)

        return

    def addGenome(self, taxon_num, taxon_label=None, genome_id=None):
        ncbitaxon = 'NCBITaxon:' + taxon_num
        if taxon_label is None:
            if ncbitaxon in self.globaltcid:
                taxon_label = self.globaltcid[ncbitaxon]
            else:
                logging.warning('Add ' + ncbitaxon +
                                ' to global translation table')
                taxon_label = taxon_id
        elif ncbitaxon in self.globaltcid and taxon_label != self.globaltcid[
                ncbitaxon]:
            logging.warning('"' + self.globaltcid[ncbitaxon] +
                            '" may need updating from "' + taxon_label +
                            '" in global translation table')
            logging.warning(
                '"' + taxon_label + '": " ' + self.globaltcid[ncbitaxon] +
                '"' + ' may need to be added to a local translation table')

        genome_label = taxon_label + ' genome'
        if genome_id is None:
            genome_id = self.makeGenomeID(taxon_num)
        self.model.addClassToGraph(genome_id, genome_label,
                                   self.globaltt['genome'])

        return

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addIndividualToGraph(build_id, build_label,
                                        self.globaltt['reference_genome'])
        self.model.addType(build_id, genome_id)
        if re.match(r'[0-9]+', taxon_id):
            taxon_id = 'NCBITaxon:' + taxon_id
        self.addTaxon(taxon_id, build_id)

        return

    @staticmethod
    def makeGenomeID(taxon_id):
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as yet another BNODE?
        # should never be called if a real genome iri exists
        # should create the opaque bode and label together
        # genome_id = re.sub(r'.*\:', '_:', taxon_id) + 'genome'
        genome_id = '_:' + taxon_id + 'genome'
        return genome_id

    def addChromosome(self,
                      chrom,
                      tax_id,
                      tax_label=None,
                      build_id=None,
                      build_label=None):
        """
        if it's just the chromosome, add it as an instance of a SO:chromosome,
        and add it to the genome. If a build is included,
        punn the chromosome as a subclass of SO:chromsome, and make the
        build-specific chromosome an instance of the supplied chr.
        The chr then becomes part of the build or genome.
        """
        family = Family(self.graph)
        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chrom), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chrom, tax_label)
        else:
            chr_label = makeChromLabel(chrom)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(chr_id, chr_label,
                                   self.globaltt['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chrom, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chrom, build_label)
            # add the build-specific chromosome as an instance of the chr class

            self.model.addIndividualToGraph(chrinbuild_id, chrinbuild_label,
                                            chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            family.addMember(build_id, chrinbuild_id)
            family.addMemberOf(chrinbuild_id, build_id)

        return

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(chrom_class_id, chrom_class_label,
                                   self.globaltt['chromosome'])

        return

    def addChromosomeInstance(self,
                              chr_num,
                              reference_id,
                              reference_label,
                              chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
        typically a genome-specific chr

        :return:

        """
        family = Family(self.graph)
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.model.addIndividualToGraph(chr_id, chr_label,
                                        self.globaltt['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        family.addMember(reference_id, chr_id)
        family.addMemberOf(chr_id,
                           reference_id)  # usage dependent, todo: ommit

        return

    @staticmethod
    def make_variant_locus_label(gene_label, allele_label):
        if gene_label is None:
            gene_label = ''
        label = gene_label.strip() + '<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return:
        """

        vslc_label = ''

        if gene_label is None and allele1_label is None and allele2_label is None:
            LOG.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        vslc_label = '/'.join((top, bottom))

        return vslc_label

    def make_experimental_model_with_genotype(self, genotype_id,
                                              genotype_label, taxon_id,
                                              taxon_label):

        animal_id = '-'.join((taxon_id, 'with', genotype_id))
        animal_id = re.sub(r':', '', animal_id)
        animal_id = '_:' + animal_id

        animal_label = ' '.join((genotype_label, taxon_label))
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)
        self.graph.addTriple(animal_id, self.globaltt['has_genotype'],
                             genotype_id)
        return animal_id
Beispiel #60
0
    def _add_evidence(self, assoc_id, eco_id, p_value, percentage_change,
                      effect_size, study_bnode):
        """
        :param assoc_id: assoc curie used to reify a
        genotype to phenotype association, generated in _process_data()
        :param eco_id: eco_id as curie, hardcoded in _process_data()
        :param p_value: str, from self.files['all']
        :param percentage_change: str, from self.files['all']
        :param effect_size: str, from self.files['all']
        :param study_bnode: str, from self.files['all']
        :param phenotyping_center: str, from self.files['all']
        :return: str, evidence_line_bnode as curie
        """

        evidence_model = Evidence(self.graph, assoc_id)
        provenance_model = Provenance(self.graph)
        model = Model(self.graph)

        # Add line of evidence
        evidence_line_bnode = self.make_id(
            "{0}{1}".format(assoc_id, study_bnode), '_')
        evidence_model.add_supporting_evidence(evidence_line_bnode)
        model.addIndividualToGraph(evidence_line_bnode, None, eco_id)

        # Add supporting measurements to line of evidence
        measurements = {}
        if p_value is not None or p_value != "":
            p_value_bnode = self.make_id(
                "{0}{1}{2}".format(evidence_line_bnode, 'p_value', p_value),
                '_')
            model.addIndividualToGraph(p_value_bnode, None,
                                       self.globaltt['p-value'])
            try:
                measurements[p_value_bnode] = float(p_value)
            except ValueError:
                measurements[p_value_bnode] = p_value
        if percentage_change is not None and percentage_change != '':

            fold_change_bnode = self.make_id(
                "{0}{1}{2}".format(evidence_line_bnode, 'percentage_change',
                                   percentage_change), '_')
            model.addIndividualToGraph(fold_change_bnode, None,
                                       self.resolve('percentage_change'))
            measurements[fold_change_bnode] = percentage_change
        if effect_size is not None or effect_size != "":
            fold_change_bnode = self.make_id(
                "{0}{1}{2}".format(evidence_line_bnode, 'effect_size',
                                   effect_size), '_')
            model.addIndividualToGraph(fold_change_bnode, None,
                                       self.globaltt['effect size estimate'])
            measurements[fold_change_bnode] = effect_size
        if measurements != {}:
            evidence_model.add_supporting_data(evidence_line_bnode,
                                               measurements)

        # Link evidence to provenance by connecting to study node
        provenance_model.add_study_to_measurements(study_bnode,
                                                   measurements.keys())
        self.graph.addTriple(evidence_line_bnode,
                             self.globaltt['has_supporting_activity'],
                             study_bnode)

        return evidence_line_bnode