Example #1
    def _process_group_mpo_row(self, row):
        """
        Make OMIA to MP associations
        :param row:
        :return:
        """
        omia_id = 'OMIA:' + row['omia_id']
        mpo_num = row['MPO_no']
        mpo_id = 'MP:' + str(mpo_num).zfill(7)

        assoc = D2PAssoc(self.graph, self.name, omia_id, mpo_id)
        assoc.add_association_to_graph()
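Every example below follows the same pattern: build curies for the subject and object, construct a D2PAssoc, and add it to the graph. A minimal standalone sketch of the id-building step in Example #1 (the function names are illustrative, not from the source):

def make_omia_curie(omia_num):
    """Prefix an OMIA record number with 'OMIA:'."""
    return 'OMIA:' + str(omia_num)

def make_mp_curie(mpo_num):
    """Zero-pad an MPO number to seven digits and return an 'MP:' curie."""
    return 'MP:' + str(mpo_num).zfill(7)

assert make_omia_curie('000214') == 'OMIA:000214'
assert make_mp_curie(8762) == 'MP:0008762'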
Example #2
    def _process_omia_group_row(self, row):
        model = Model(self.graph)
        omia_id = 'OMIA:' + row['omia_id']

        if self.test_mode and omia_id not in self.test_ids['disease']:
            return

        group_name = row['group_name']
        group_summary = row['group_summary']
        # default to general disease seems the only reasonable choice
        disease_id = self.globaltt['disease or disorder']
        group_category = 'group_category:' + str(row['group_category'])
        disease_id = self.resolve(group_category, False)

        if disease_id == 'group_category:None':
            disease_id = self.globaltt['disease']
        elif disease_id == group_category:
            LOG.info(
                "No disease superclass defined for %s:  %s  with parent %s",
                omia_id, group_name, group_category)
            disease_id = self.globaltt['disease']
        else:
            if disease_id == self.globaltt['embryonic lethality']:
                # add this as a phenotype association
                # add embryonic onset
                assoc = D2PAssoc(self.graph, self.name, omia_id, disease_id)
                assoc.add_association_to_graph()
                # disease_id = None
        model.addClassToGraph(disease_id,
                              None,
                              class_category=blv.terms['Disease'])

        if group_summary == '':
            group_summary = None
        if group_name == '':
            group_name = None

        model.addClassToGraph(omia_id,
                              group_name,
                              description=group_summary,
                              class_type=disease_id)

        self.label_hash[omia_id] = group_name
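The superclass fallback above can be condensed into a small helper; this is a sketch only, with a plain dict standing in for self.resolve and the globaltt labels:

def resolve_group_superclass(group_category, resolver, default='disease'):
    """Resolve an OMIA group_category to an ontology class, falling back to a
    generic disease class when the category is None or has no mapping."""
    key = 'group_category:' + str(group_category)
    resolved = resolver.get(key)
    if resolved is None or resolved == key:
        return default
    return resolved

# hypothetical mapping, for illustration only
assert resolve_group_superclass(None, {}) == 'disease'
assert resolve_group_superclass(1, {'group_category:1': 'MP:0008762'}) == 'MP:0008762'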
Example #3
    def _process_omia_group_row(self, row):
        model = Model(self.g)
        omia_id = 'OMIA:'+row['omia_id']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        group_name = row['group_name']
        group_summary = row['group_summary']

        disease_id = None
        group_category = row.get('group_category')
        disease_id = \
            self.map_omia_group_category_to_ontology_id(group_category)
        if disease_id is not None:
            model.addClassToGraph(disease_id, None)
            if disease_id == 'MP:0008762':  # embryonic lethal
                # add this as a phenotype association
                # add embryonic onset
                assoc = D2PAssoc(self.g, self.name, omia_id, disease_id)
                assoc.add_association_to_graph()
                disease_id = None
        else:
            logger.info(
                "No disease superclass defined for %s:  %s",
                omia_id, group_name)
            # default to general disease  FIXME this may not be desired
            disease_id = 'DOID:4'

        if group_summary == '':
            group_summary = None
        if group_name == '':
            group_name = None

        model.addClassToGraph(omia_id, group_name, disease_id, group_summary)

        self.label_hash[omia_id] = group_name

        return
Example #4
    def _process_phene_row(self, row):
        model = Model(self.graph)
        phenotype_id = None
        sp_phene_label = row['phene_name']
        if sp_phene_label == '':
            sp_phene_label = None
        if 'omia_id' not in row:
            LOG.info("omia_id not present for %s", row['phene_id'])
            omia_id = self._make_internal_id('phene', phenotype_id)
        else:
            omia_id = 'OMIA:' + str(row['omia_id'])

        if self.test_mode and not (  # demorgan this
                row['gb_species_id'] in self.test_ids['taxon']
                and omia_id in self.test_ids['disease']):
            return
        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = omia_id

        descr = row['summary']
        if descr == '':
            descr = None

        # omia label
        omia_label = self.label_hash.get(omia_id)

        # add the species-specific subclass (TODO please review this choice)
        gb_species_id = row['gb_species_id']

        if gb_species_id != '':
            sp_phene_id = '-'.join((omia_id, gb_species_id))
        else:
            LOG.error(
                "No species supplied in species-specific phene table for %s",
                omia_id)
            return

        species_id = 'NCBITaxon:' + str(gb_species_id)
        # use this instead
        species_label = self.label_hash.get('NCBITaxon:' + gb_species_id)
        if sp_phene_label is None and omia_label is not None \
                and species_label is not None:
            sp_phene_label = ' '.join((omia_label, 'in', species_label))
        model.addClassToGraph(sp_phene_id, sp_phene_label, omia_id, descr)
        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = sp_phene_id
        self.label_hash[sp_phene_id] = sp_phene_label
        # add each of the following descriptions,
        # if they are populated, with a tag at the end.
        for item in [
                'clin_feat', 'history', 'pathology', 'mol_gen', 'control'
        ]:
            if row[item] is not None and row[item] != '':
                model.addDescription(sp_phene_id,
                                     row[item] + ' [' + item + ']')
        # if row['symbol'] is not None:  # species-specific
        # CHECK ME - sometimes spaces or gene labels
        #     gu.addSynonym(g, sp_phene, row['symbol'])

        model.addOWLPropertyClassRestriction(sp_phene_id,
                                             self.globaltt['in taxon'],
                                             species_id)

        # add inheritance as an association
        inheritance_id = None
        if row['inherit'] is not None and row['inherit'] in self.localtt:
            inheritance_id = self.resolve(row['inherit'])
        elif row['inherit'] is not None and row['inherit'] != '':
            LOG.info('Unhandled inheritance type:\t%s', row['inherit'])

        if inheritance_id is not None:  # observable related to genetic disposition
            assoc = D2PAssoc(self.graph,
                             self.name,
                             sp_phene_id,
                             inheritance_id,
                             rel=self.globaltt['has disposition'])
            assoc.add_association_to_graph()

        if row['characterised'] == 'Yes':
            self.stored_omia_mol_gen[omia_id] = {
                'mol_gen': row['mol_gen'],
                'map_info': row['map_info'],
                'species': row['gb_species_id']
            }
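A small sketch of the species-specific phene id and label construction used above (the input values are illustrative):

def species_specific_phene(omia_id, gb_species_id, omia_label=None, species_label=None):
    """Join the OMIA id and NCBI taxon number into a species-specific phene id,
    and compose a 'disease in species' label when both labels are known."""
    sp_phene_id = '-'.join((omia_id, gb_species_id))
    sp_phene_label = None
    if omia_label is not None and species_label is not None:
        sp_phene_label = ' '.join((omia_label, 'in', species_label))
    return sp_phene_id, sp_phene_label

assert species_specific_phene(
    'OMIA:001234', '9615', 'example disease', 'Canis lupus familiaris'
) == ('OMIA:001234-9615', 'example disease in Canis lupus familiaris')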
Example #5
    def process_omia_phenotypes(self, limit):

        # process the whole directory
        # TODO get the file listing
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        logger.info(
            "Processing Monarch OMIA Animal disease-phenotype associations")

        # get file listing
        mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
        file_list = [
            f for f in listdir(mypath)
            if isfile(join(mypath, f)) and re.search(r'.txt$', f)]

        for f in file_list:
            logger.info("Processing %s", f)
            print(f)
            line_counter = 0
            count_missing = 0
            bad_rows = list()
            fname = '/'.join((mypath, f))
            with open(fname, 'r') as csvfile:
                filereader = csv.reader(
                    csvfile, delimiter='\t', quotechar='\"')
                for row in filereader:
                    line_counter += 1
                    if line_counter <= 1:
                        continue  # skip header
                    if len(row) != 22:
                        logger.info("Not enough cols (%d) in %s - please fix",
                                    len(row), f)
                        continue
                    (disease_num, species_id, breed_name, variant, inheritance,
                     phenotype_id, phenotype_name, entity_id, entity_name,
                     quality_id, quality_name, related_entity_id,
                     related_entity_name, abnormal_id, abnormal_name,
                     phenotype_description, assay, frequency, pubmed_id,
                     pub_description, curator_notes, date_created) = row

                    if phenotype_id == '':
                        # logger.warning('Missing phenotype in row:\n%s', row)
                        count_missing += 1
                        bad_rows.append(row)
                        continue
                    if len(str(disease_num)) < 6:
                        disease_num = str(disease_num).zfill(6)
                    disease_id = 'OMIA:'+disease_num.strip()
                    species_id = species_id.strip()
                    if species_id != '':
                        disease_id = '-'.join((disease_id, species_id))
                    assoc = D2PAssoc(g, self.name, disease_id, phenotype_id)
                    if pubmed_id != '':
                        for p in re.split(r'[,;]', pubmed_id):
                            pmid = 'PMID:'+p.strip()
                            assoc.add_source(pmid)
                    else:
                        assoc.add_source(
                            '/'.join(('http://omia.angis.org.au/OMIA' +
                                      disease_num.strip(),
                                      species_id.strip())))
                    assoc.add_association_to_graph()
                    aid = assoc.get_association_id()
                    if phenotype_description != '':
                        model.addDescription(aid, phenotype_description)
                    if breed_name != '':
                        model.addDescription(
                            aid, breed_name.strip()+' [observed in]')
                    if assay != '':
                        model.addDescription(aid, assay.strip()+' [assay]')
                    if curator_notes != '':
                        model.addComment(aid, curator_notes.strip())

                    if entity_id != '' or quality_id != '':
                        logger.info("EQ not empty for %s: %s + %s", disease_id,
                                    entity_name, quality_name)
            if count_missing > 0:
                logger.warning(
                    "You are missing %d/%d D2P annotations from id %s",
                    count_missing, line_counter-1, f)
                # TODO PYLINT Used builtin function 'map'.
                # Using a list comprehension can be clearer.
                logger.warning("Bad rows:\n"+"\n".join(map(str, bad_rows)))
            # finish loop through all files

        return
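The PubMed handling above (tightened further in Example #6, which also strips non-digit characters) amounts to splitting a free-text field on ',' or ';' and emitting 'PMID:' curies. A standalone sketch of that normalization, as an assumption about the intended behaviour:

import re

def split_pmids(pubmed_field):
    """Split a free-text PubMed field on ',' or ';', drop non-digit
    characters, and return a list of 'PMID:' curies."""
    pmids = []
    for pnum in re.split(r'[,;]', pubmed_field):
        pnum = re.sub(r'[^0-9]', '', pnum)
        if pnum:
            pmids.append('PMID:' + pnum)
    return pmids

assert split_pmids('12345; 67890') == ['PMID:12345', 'PMID:67890']
assert split_pmids('') == []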
Example #6
    def process_omia_phenotypes(self, limit):

        # process the whole directory
        # TODO get the file listing
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)

        LOG.info(
            "Processing Monarch OMIA Animal disease-phenotype associations")

        src_key = 'omia_d2p'

        # get file listing
        mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
        file_list = [
            f for f in listdir(mypath)
            if isfile(join(mypath, f)) and re.search(r'.txt$', f)
        ]

        col = self.files[src_key]['columns']
        # reusable initial code generator
        # for c in col:
        #   print(
        #    '# '+str.lower(c.replace(" ",""))+" = row[col.index('"+c+"')].strip()")

        for filename in file_list:
            LOG.info("Processing %s", filename)
            count_missing = 0
            bad_rows = list()
            fname = '/'.join((mypath, filename))
            with open(fname, 'r') as csvfile:
                filereader = csv.reader(csvfile,
                                        delimiter='\t',
                                        quotechar='\"')
                row = next(filereader)
                if self.check_fileheader(col, row):
                    pass

                for row in filereader:
                    if len(row) != len(col):
                        LOG.info("Not enough cols %d in %s - please fix",
                                 len(row), filename)
                        continue

                    disease_num = row[col.index('Disease ID')].strip()
                    species_id = row[col.index('Species ID')].strip()
                    breed_name = row[col.index('Breed Name')].strip()
                    # variant = row[col.index('Variant')]
                    # inheritance = row[col.index('Inheritance')]
                    phenotype_id = row[col.index('Phenotype ID')].strip()
                    # phenotype_name = row[col.index('Phenotype Name')]
                    entity_id = row[col.index('Entity ID')].strip()
                    entity_name = row[col.index('Entity Name')]
                    quality_id = row[col.index('Quality ID')].strip()
                    quality_name = row[col.index('Quality Name')]
                    # related_entity_id = row[col.index('Related Entity ID')]
                    # related_entity_name = row[col.index('Related Entity Name')]
                    # abnormal_id = row[col.index('Abnormal ID')]
                    # abnormal_name = row[col.index('Abnormal Name')]
                    # phenotype_desc = row[col.index('Phenotype Desc')]
                    assay = row[col.index('Assay')].strip()
                    # frequency = row[col.index('Frequency')]
                    pubmed_id = row[col.index('Pubmed ID')].strip()
                    phenotype_description = row[col.index('Pub Desc')].strip()
                    curator_notes = row[col.index('Curator Notes')].strip()
                    # date_created = row[col.index('Date Created')]

                    if phenotype_id == '':
                        # LOG.warning('Missing phenotype in row:\n%s', row)
                        count_missing += 1
                        bad_rows.append(row)
                        continue
                    if len(str(disease_num)) < 6:
                        disease_num = str(disease_num).zfill(6)
                    disease_id = 'OMIA:' + disease_num
                    if species_id != '':
                        disease_id = '-'.join((disease_id, species_id))
                    assoc = D2PAssoc(graph, self.name, disease_id,
                                     phenotype_id)
                    if pubmed_id != '':
                        for pnum in re.split(r'[,;]', pubmed_id):
                            pnum = re.sub(r'[^0-9]', '', pnum)
                            pmid = 'PMID:' + pnum
                            assoc.add_source(pmid)
                    else:
                        assoc.add_source('/'.join(
                            (self.curie_map['OMIA'] + disease_num,
                             species_id)))
                    assoc.add_association_to_graph()
                    aid = assoc.get_association_id()
                    if phenotype_description != '':
                        model.addDescription(aid, phenotype_description)
                    if breed_name != '':
                        model.addDescription(aid,
                                             breed_name + ' [observed in]')
                    if assay != '':
                        model.addDescription(aid, assay + ' [assay]')
                    if curator_notes != '':
                        model.addComment(aid, curator_notes)

                    if entity_id != '' or quality_id != '':
                        LOG.info("EQ not empty for %s: %s + %s", disease_id,
                                 entity_name, quality_name)
            if count_missing > 0:
                LOG.warning(
                    "We are missing %d of %d D2P annotations from id %s",
                    count_missing, filereader.line_num - 1, filename)
                LOG.warning("Bad rows:\n%s",
                            '\n'.join([str(x) for x in bad_rows]))
            # finish loop through all files

        return
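The repeated row[col.index(...)] lookups above rescan the column list for every field. A possible alternative (not taken from the source) is to zip the header into a dict once per row:

# hypothetical column list and row, for illustration
col = ['Disease ID', 'Species ID', 'Phenotype ID']
row = ['001234 ', '9615', ' HP:0000118']

record = {name: value.strip() for name, value in zip(col, row)}
assert record['Disease ID'] == '001234'
assert record['Phenotype ID'] == 'HP:0000118'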
Example #7
    def process_common_disease_file(self, raw, unpadded_doids, limit=None):
        """
        Make disease-phenotype associations.
        Some identifiers need clean-up:
        * DOIDs are listed as DOID-DOID: --> DOID:
        * DOIDs may be unnecessarily zero-padded;
          these are remapped to their non-padded equivalents.

        :param raw:
        :param unpadded_doids:
        :param limit:
        :return:

        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        assoc_count = 0
        replace_id_flag = False
        col = self.small_files['columns']

        with open(raw, 'r', encoding="utf8") as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
            header = tsvfile.readline().strip().split('\t')
            if header != col:
                LOG.error("HEADER: has changed in %s.", raw)
                raise ValueError(
                    'expected columns %s but got %s' % (col, header))

            disease_id = None
            for row in reader:
                row = [str(x).strip() for x in row]

                did = row[col.index('Disease ID')]
                # genotype = row[col.index('Genotype')]
                phenotype_id = row[col.index('Phenotype ID')]
                age_of_onset_id = row[col.index('Age of Onset ID')]
                eid = row[col.index('Evidence ID')]
                frequency = row[col.index('Frequency')]
                negation_id = row[col.index('Negation ID')]
                description = row[col.index('Description')]
                pub_ids = row[col.index('Pub')]

                disease_id = re.sub(r'DO(ID)?[-\:](DOID:)?', 'DOID:', did)
                disease_id = re.sub(r'MESH-', 'MESH:', disease_id)

                if not re.search(r'(DOID\:|MESH\:\w)\d+', disease_id):
                    LOG.warning("Invalid id format: %s", disease_id)

                # figure out if the doid should be unpadded,
                # then use the unpadded version instead
                if re.match(r'DOID', disease_id):
                    unpadded_num = re.sub(r'DOID:', '', disease_id)
                    unpadded_num = unpadded_num.lstrip('0')
                    if unpadded_num in unpadded_doids:
                        fixed_id = 'DOID:' + unpadded_num
                        replace_id_flag = True
                        disease_id = fixed_id.strip()

                if self.test_mode and disease_id not in self.test_ids:
                    # since these are broken up into disease-by-disease,
                    # just skip the whole file
                    return 0

                if negation_id != '':
                    continue  # TODO add negative associations

                if disease_id != '' and phenotype_id != '':
                    assoc = D2PAssoc(
                        graph, self.name, disease_id, phenotype_id.strip())
                    if age_of_onset_id != '':
                        assoc.onset = age_of_onset_id
                    if frequency != '':
                        assoc.frequency = frequency
                    eco_id = self.localtt[eid]
                    if eco_id is None:
                        eco_id = self.localtt['ITM']

                    assoc.add_evidence(eco_id)
                    # TODO add sex? - not in dataset yet
                    if description != '':
                        assoc.set_description(description)
                    if pub_ids != '':
                        for pub in pub_ids.split(';'):
                            pub = re.sub(r'  *', '', pub)  # fixed now but just in case

                            # there have been several malformed PMIDs curies
                            if pub[:4] != 'http' and \
                                    graph.curie_regexp.fullmatch(pub) is None:
                                LOG.warning(
                                    'Record %s has a malformed Pub %s', did, pub)
                                continue

                            if re.search(
                                    r'(DOID|MESH)', pub) or re.search(
                                        r'Disease name contained', description):
                                # skip "pubs" that are derived from
                                # the classes themselves
                                continue
                            assoc.add_source(pub.strip())
                    # TODO assigned by?

                    assoc.add_association_to_graph()
                    assoc_count += 1

                if not self.test_mode and limit is not None\
                        and reader.line_num > limit:
                    break

            if replace_id_flag:
                LOG.info("replaced DOID with unpadded version")
                self.replaced_id_count += 1
            LOG.info(
                "Added %d associations for %s.", assoc_count, disease_id)
        return assoc_count
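A hypothetical standalone version of the identifier clean-up described in the docstring (the regexes are copied from the method; the helper name and the unpadded-DOID set are assumptions):

import re

def normalize_disease_id(did, unpadded_doids=frozenset()):
    """Normalize 'DOID-DOID:' and 'MESH-' prefixes, and drop unnecessary
    zero padding when the unpadded DOID is the canonical one."""
    disease_id = re.sub(r'DO(ID)?[-\:](DOID:)?', 'DOID:', did)
    disease_id = re.sub(r'MESH-', 'MESH:', disease_id)
    if disease_id.startswith('DOID:'):
        unpadded_num = disease_id[len('DOID:'):].lstrip('0')
        if unpadded_num in unpadded_doids:
            disease_id = 'DOID:' + unpadded_num
    return disease_id

assert normalize_disease_id('DOID-DOID:0001234', {'1234'}) == 'DOID:1234'
assert normalize_disease_id('MESH-D012345') == 'MESH:D012345'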
Example #8
    def _process_phenotype_hpoa(self, file_info, limit=None):
        """
        see info on format here:
        http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

        :param raw:
        :param limit:
        :return:

        """
        src_key = 'hpoa'

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        raw = '/'.join((self.rawdir, file_info['file']))

        # this will cause two dates to be attached to the dataset
        # (one from the filedate, and the other from here)
        # TODO when #112 is implemented,
        # this will result in only the whole dataset being versioned

        col = self.files[src_key]['columns']
        with open(raw, 'r', encoding="utf8") as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
            row = next(reader)  # drop Description
            row = str(next(reader))[9:19]
            LOG.info("Ingest from %s", row)
            date = datetime.strptime(
                row.strip(), '%Y-%m-%d').strftime("%Y-%m-%d-%H-%M")

            if file_info.get("url") is not None:
                self.dataset.set_ingest_source_file_version_date(
                    file_info.get("url"), date)

            row = next(reader)  # drop tracker url
            row = next(reader)  # drop release url
            row = next(reader)  # headers
            # row[0] = row[0][1:]  # uncomment; but not always needed?!
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                row = [str(col).strip() for col in row]

                disease_id = row[col.index('#DatabaseID')]
                # 98246 OMIM
                # 68646 ORPHA
                # 297 DECIPHER

                if self.test_mode:
                    try:
                        id_list = self.test_ids
                        if id_list is None or disease_id not in id_list:
                            continue
                    except AttributeError:
                        continue

                # row[col.index('DiseaseName')]  unused

                if row[col.index('Qualifier')] == 'NOT':
                    continue

                hpo_id = row[col.index('HPO_ID')]
                publist = row[col.index('Reference')]
                eco_id = self.resolve(row[col.index('Evidence')])
                onset = row[col.index('Onset')]
                freq = row[col.index('Frequency')]
                sex = row[col.index('Sex')].lower()
                # row[col.index('Modifier')]   unused
                asp = row[col.index('Aspect')]
                # row[col.index('Biocuration')]  unused

                # LOG.info(
                #    'adding <%s>-to-<%s> because <%s>', disease_id, hpo_id, eco_id)

                model.addClassToGraph(disease_id)
                model.addClassToGraph(eco_id)
                if onset is not None and onset != '':
                    model.addClassToGraph(onset)

                if asp in ('P', 'M'):  # phenotype? abnormality or mortality
                    model.addClassToGraph(hpo_id)
                    assoc = D2PAssoc(  # default rel=self.globaltt['has phenotype']
                        graph, self.name, disease_id, hpo_id, onset, freq
                    )
                elif asp in ('I', 'C'):  # inheritance pattern or clinical course/onset
                    model.addClassToGraph(hpo_id)
                    assoc = D2PAssoc(
                        graph,
                        self.name,
                        disease_id,
                        hpo_id,
                        rel=self.globaltt['has disposition']
                    )
                else:
                    LOG.error("Unknown aspect : %s at line %i", asp, reader.line_num)
                    continue  # an unknown aspect leaves assoc undefined below

                assoc.add_evidence(eco_id)
                if sex is not None and sex != '':
                    self.graph.addTriple(
                        assoc.get_association_id(),
                        self.globaltt['has_sex_specificty'],
                        self.globaltt[sex],
                        object_category=blv.terms['BiologicalSex']
                    )

                # Publication
                # cut -f 5 phenotype.hpoa | grep ";" | tr ';' '\n' | cut -f1 -d ':' |\
                # sort | uniq -c | sort -nr
                # 629 PMID
                # 63 OMIM
                # 42 ISBN-13
                # 36 http

                for pub in publist.split(';'):
                    pub = pub.strip()

                    # there have been several malformed PMIDs
                    if pub[:4] != 'http' and \
                            graph.curie_regexp.fullmatch(pub) is None:
                        LOG.warning(
                            'Record %s has a malformed Reference %s', disease_id, pub)
                        continue

                    pubtype = None

                    if pub[:5] == 'PMID:':
                        pubtype = self.globaltt['journal article']

                    elif pub[:4] == 'ISBN':
                        pubtype = self.globaltt['publication']

                    elif pub[:5] == 'OMIM:':
                        pub = 'http://omim.org/entry/' + pub[5:]
                        pubtype = self.globaltt['web page']

                    elif pub[:9] == 'DECIPHER:':
                        pubtype = self.globaltt['web page']

                    elif pub[:6] == 'ORPHA:':
                        pubtype = self.globaltt['web page']

                    elif pub[:4] == 'http':
                        pubtype = self.globaltt['web page']

                    else:
                        LOG.error(
                            'Unknown pub type for disease %s from "%s"',
                            disease_id, pub)
                        continue

                    if pub is not None:
                        assoc.add_source(pub)
                        if pubtype is not None:
                            ref = Reference(graph, pub, pubtype)
                            # ref.setTitle('');  ref.setYear()

                            ref.addRefToGraph()
                    # TODO add curator

                    # pprint.pprint(assoc)

                    assoc.add_association_to_graph()

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
        return
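The aspect dispatch above reduces to a small relation map; a sketch of that mapping, with the relation labels echoing the globaltt keys used in the method:

ASPECT_RELATION = {
    'P': 'has phenotype',    # phenotypic abnormality
    'M': 'has phenotype',    # mortality
    'I': 'has disposition',  # inheritance pattern
    'C': 'has disposition',  # clinical course / onset
}

def relation_for_aspect(asp):
    """Return the association relation for an HPOA aspect code."""
    if asp not in ASPECT_RELATION:
        raise ValueError('Unknown aspect: %s' % asp)
    return ASPECT_RELATION[asp]

assert relation_for_aspect('P') == 'has phenotype'
assert relation_for_aspect('I') == 'has disposition'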
Example #9
    def process_common_disease_file(self, raw, unpadded_doids, limit=None):
        """
        Make disease-phenotype associations.
        Some identifiers need clean-up:
        * DOIDs are listed as DOID-DOID: --> DOID:
        * DOIDs may be unnecessarily zero-padded;
          these are remapped to their non-padded equivalents.

        :param raw:
        :param unpadded_doids:
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        assoc_count = 0
        replace_id_flag = False

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = csvfile.readline()  # skip the header row
            logger.info("HEADER: %s", header)
            disease_id = None
            for row in filereader:

                if 21 == len(row):
                    (did, dname, gid, gene_name, genotype, gene_symbols,
                     phenotype_id, phenotype_name, age_of_onset_id,
                     age_of_onset_name, eid, evidence_name, frequency, sex_id,
                     sex_name, negation_id, negation_name, description,
                     pub_ids, assigned_by,
                     date_created) = [str(col).strip() for col in row]
                else:
                    logger.warning(
                        "Wrong number of columns! expected 21, got: %s in: %s",
                        len(row), raw)
                    logger.warning("%s", row)
                    continue
                # b/c "PMID:    17223397"
                pub_ids = re.sub(r'  *', '', pub_ids)

                disease_id = re.sub(r'DO(ID)?[-\:](DOID:)?', 'DOID:', did)
                disease_id = re.sub(r'MESH-', 'MESH:', disease_id)
                if not re.search(r'(DOID\:|MESH\:\w)\d+', disease_id):
                    logger.warning("Invalid id format: %s", disease_id)

                # figure out if the doid should be unpadded,
                # then use the unpadded version instead
                if re.match(r'DOID', disease_id):
                    unpadded_num = re.sub(r'DOID:', '', disease_id)
                    unpadded_num = unpadded_num.lstrip('0')
                    if unpadded_num in unpadded_doids:
                        fixed_id = 'DOID:' + unpadded_num
                        replace_id_flag = True
                        disease_id = fixed_id.strip()

                if self.testMode and disease_id not in self.test_ids:
                    # since these are broken up into disease-by-disease,
                    # just skip the whole file
                    return 0
                else:
                    line_counter += 1

                if negation_id != '':
                    continue  # TODO add negative associations

                if disease_id != '' and phenotype_id != '':
                    assoc = D2PAssoc(g, self.name, disease_id,
                                     phenotype_id.strip())
                    if age_of_onset_id != '':
                        assoc.onset = age_of_onset_id
                    if frequency != '':
                        assoc.frequency = frequency
                    eco_id = self._map_evidence_to_codes(eid)
                    if eco_id is None:
                        eco_id = self._map_evidence_to_codes('ITM')
                    assoc.add_evidence(eco_id)
                    # TODO add sex? - not in dataset yet
                    if description != '':
                        assoc.set_description(description)
                    if pub_ids != '':
                        for p in pub_ids.split(';'):
                            p = re.sub(r'  *', '', p)
                            if re.search(r'(DOID|MESH)', p) \
                                    or re.search(r'Disease name contained',
                                                 description):
                                # skip "pubs" that are derived from
                                # the classes themselves
                                continue
                            assoc.add_source(p.strip())
                    # TODO assigned by?

                    assoc.add_association_to_graph()
                    assoc_count += 1

                if not self.testMode and limit is not None\
                        and line_counter > limit:
                    break

            if replace_id_flag:
                logger.info("replaced DOID with unpadded version")
                self.replaced_id_count += 1
            logger.info("Added %d associations for %s.", assoc_count,
                        disease_id)
        return assoc_count
Example #10
    def _process_phenotype_tab(self, raw, limit):
        """
        see info on format here:
        http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

        :param raw:
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                row = [str(col).strip() for col in row]
                (db, num, name, qual, pheno_id, publist, eco, onset, freq, w,
                 asp, syn, date, curator) = row
                disease_id = db + ":" + num

                if self.testMode:
                    try:
                        id_list = self.test_ids
                        if id_list is None \
                                or disease_id not in id_list:
                            continue
                    except AttributeError:
                        continue

                # logger.info('adding %s', disease_id)

                model.addClassToGraph(disease_id, None)
                model.addClassToGraph(pheno_id, None)
                eco_id = self._map_evidence_to_codes(eco)
                model.addClassToGraph(eco_id, None)
                if onset is not None and onset != '':
                    model.addClassToGraph(onset, None)

                # we want to do things differently depending on
                # the aspect of the annotation
                # TODO PYLINT Redefinition of assoc type from
                #   dipper.models.assoc.D2PAssoc.D2PAssoc to
                #   dipper.models.assoc.DispositionAssoc.DispositionAssoc
                if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                    assoc = D2PAssoc(g, self.name, disease_id, pheno_id, onset,
                                     freq)
                elif asp == 'I':  # inheritance patterns for the whole disease
                    assoc = DispositionAssoc(g, self.name, disease_id,
                                             pheno_id)
                elif asp == 'C':  # clinical course / onset
                    assoc = DispositionAssoc(g, self.name, disease_id,
                                             pheno_id)
                else:
                    logger.error("I don't know what this aspect is: %s", asp)
                    continue  # an unknown aspect leaves assoc undefined below

                assoc.add_evidence(eco_id)

                publist = re.split(r'[,;]', publist)
                # blow these apart if there is a list of pubs
                for pub in publist:
                    pub = pub.strip()
                    pubtype = None
                    if pub != '':
                        # if re.match(
                        #       r'http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene',
                        #        pub):
                        #     #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                        #     m = re.search(r'part\=(\w+)', pub)
                        #     pub_id = 'GeneReviews:'+m.group(1)
                        # elif re.search(
                        #        r'http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=',
                        #        pub):
                        #     m = re.search(r'Expert=(\d+)', pub)
                        #     pub_id = 'Orphanet:'+m.group(1)

                        if re.match(r'(PMID|ISBN-13|ISBN-10|ISBN|HPO)', pub):
                            if re.match(r'PMID', pub):
                                pubtype = \
                                    Reference.ref_types['journal_article']
                            elif re.match(r'HPO', pub):
                                pubtype = Reference.ref_types['person']
                            else:
                                pubtype = Reference.ref_types['publication']
                            r = Reference(g, pub, pubtype)
                            r.addRefToGraph()
                        elif re.match(r'(OMIM|Orphanet|DECIPHER)', pub):
                            # make the pubs a reference to the website,
                            # instead of the curie
                            if re.match(r'OMIM', pub):
                                omimnum = re.sub(r'OMIM:', '', pub)
                                omimurl = '/'.join(('http://omim.org/entry',
                                                    str(omimnum).strip()))
                                pub = omimurl
                            elif re.match(r'Orphanet:', pub):
                                orphanetnum = re.sub(r'Orphanet:', '', pub)
                                orphaneturl = \
                                    ''.join((
                                        'http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=',
                                        str(orphanetnum)))
                                pub = orphaneturl
                            elif re.match(r'DECIPHER:', pub):
                                deciphernum = re.sub(r'DECIPHER:', '', pub)
                                decipherurl = '/'.join(
                                    ('https://decipher.sanger.ac.uk/syndrome',
                                     deciphernum))
                                pub = decipherurl
                            pubtype = Reference.ref_types['webpage']
                        elif re.match(r'http', pub):
                            pass
                        else:
                            logger.error('Unknown pub type for %s: %s',
                                         disease_id, pub)
                            print(disease_id, 'pubs:', str(publist))
                            continue

                        if pub is not None:
                            assoc.add_source(pub)

                        # TODO add curator

                assoc.add_association_to_graph()

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
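A hypothetical helper equivalent to the publication rewriting above: OMIM, Orphanet and DECIPHER curies become the web page URLs they resolve to before being attached as sources (URL templates copied from the method):

def pub_to_url(pub):
    """Rewrite OMIM/Orphanet/DECIPHER curies as web page URLs;
    other values are returned unchanged."""
    if pub.startswith('OMIM:'):
        return 'http://omim.org/entry/' + pub[len('OMIM:'):].strip()
    if pub.startswith('Orphanet:'):
        return ('http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert='
                + pub[len('Orphanet:'):])
    if pub.startswith('DECIPHER:'):
        return 'https://decipher.sanger.ac.uk/syndrome/' + pub[len('DECIPHER:'):]
    return pub

assert pub_to_url('OMIM:154700') == 'http://omim.org/entry/154700'
assert pub_to_url('PMID:12345') == 'PMID:12345'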
Example #11
    def _process_phenotype_hpoa(self, raw, limit):
        """
        see info on format here:
        http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

        :param raw:
        :param limit:
        :return:

        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        filedate = datetime.utcfromtimestamp(
            os.stat(raw)[ST_CTIME]).strftime("%Y-%m-%d")

        # this will cause two dates to be attached to the dataset
        # (one from the filedate, and the other from here)
        # TODO when #112 is implemented,
        # this will result in only the whole dataset being versioned

        col = self.files['hpoa']['columns']
        with open(raw, 'r', encoding="utf8") as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
            vers = next(reader)  # drop
            vers = str(next(reader))[9:19]
            print(vers)
            date = datetime.strptime(vers.strip(),
                                     '%Y-%m-%d').strftime("%Y-%m-%d-%H-%M")

            self.dataset.setVersion(filedate, date)
            for row in reader:
                if row[0][0] == '#' or row[0] == 'DatabaseID':  # headers
                    continue
                row = [str(col).strip() for col in row]

                disease_id = row[col.index('DatabaseID')]
                # 98246 OMIM
                # 68646 ORPHA
                # 297 DECIPHER

                if self.test_mode:
                    try:
                        id_list = self.test_ids
                        if id_list is None or disease_id not in id_list:
                            continue
                    except AttributeError:
                        continue

                pheno_id = row[col.index('HPO_ID')]
                eco_id = self.resolve(row[col.index('Evidence')])
                onset = row[col.index('Onset')]
                asp = row[col.index('Aspect')]
                freq = row[col.index('Frequency')]
                publist = row[col.index('Reference')]
                sex = row[col.index('Sex')].lower()

                # LOG.info(
                #    'adding <%s>-to-<%s> because <%s>', disease_id, pheno_id, eco_id)

                model.addClassToGraph(disease_id)
                model.addClassToGraph(pheno_id)
                model.addClassToGraph(eco_id)
                if onset is not None and onset != '':
                    model.addClassToGraph(onset)

                if asp in ('P', 'M'):  # phenotype? abnormality or mortality
                    assoc = D2PAssoc(  # default rel=self.globaltt['has phenotype']
                        graph, self.name, disease_id, pheno_id, onset, freq)
                elif asp in (
                        'I',
                        'C'):  # inheritance pattern or clinical course/onset
                    assoc = D2PAssoc(graph,
                                     self.name,
                                     disease_id,
                                     pheno_id,
                                     rel=self.globaltt['has disposition'])
                else:
                    LOG.error("Unknown aspect : %s at line %i", asp,
                              reader.line_num)
                    continue  # an unknown aspect leaves assoc undefined below

                assoc.add_evidence(eco_id)
                if sex is not None and sex != '':
                    self.graph.addTriple(assoc.get_association_id(),
                                         self.globaltt['has_sex_specificty'],
                                         self.globaltt[sex])

                # Publication
                # cut -f 5 phenotype.hpoa | grep ";" | tr ';' '\n' | cut -f1 -d ':' |\
                # sort | uniq -c | sort -nr
                # 629 PMID
                # 63 OMIM
                # 42 ISBN-13
                # 36 http

                for pub in publist.split(';'):
                    pub = pub.strip()
                    pubtype = None

                    if pub[:5] == 'PMID:':
                        pubtype = self.globaltt['journal article']

                    elif pub[:4] == 'ISBN':
                        pubtype = self.globaltt['publication']

                    elif pub[:5] == 'OMIM:':
                        pub = 'http://omim.org/entry/' + pub[5:]
                        pubtype = self.globaltt['web page']

                    elif pub[:9] == 'DECIPHER:':
                        pubtype = self.globaltt['web page']

                    elif pub[:6] == 'ORPHA:':
                        pubtype = self.globaltt['web page']

                    elif pub[:4] == 'http':
                        pubtype = self.globaltt['web page']

                    else:
                        LOG.error('Unknown pub type for disease %s from "%s"',
                                  disease_id, pub)
                        continue

                    if pub is not None:
                        assoc.add_source(pub)
                        if pubtype is not None:
                            ref = Reference(graph, pub, pubtype)
                            # ref.setTitle('');  ref.setYear()

                            ref.addRefToGraph()
                    # TODO add curator

                    # pprint.pprint(assoc)

                    assoc.add_association_to_graph()

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
        return
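The version-date extraction at the top of Examples #8 and #11 relies on the second header line of phenotype.hpoa looking like '#date: YYYY-MM-DD'; a sketch of that parsing, under the same assumption:

from datetime import datetime

def parse_hpoa_version_date(date_row):
    """Take the csv row for the date header line, slice the ISO date out of
    its str() form (characters 9:19), and re-serialize it with an
    hour-minute suffix, as the ingest methods do."""
    raw = str(date_row)[9:19]
    return datetime.strptime(raw.strip(), '%Y-%m-%d').strftime('%Y-%m-%d-%H-%M')

assert parse_hpoa_version_date(['#date: 2019-11-08']) == '2019-11-08-00-00'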