def _process_phene_row(self, row):
    """
    Add one row of the species-specific phene table to the graph.

    The row is modelled as a species-specific subclass of its OMIA phene,
    restricted to the row's taxon.  Populated free-text columns are attached
    as descriptions tagged with their column name, and a mapped inheritance
    term is attached through a DispositionAssoc.

    In test mode a row is processed only when both its species and its OMIA
    id are listed in self.test_ids.

    :param row: mapping of column name to value for one phene record.
        Columns read here: phene_id, phene_name, omia_id (optional),
        gb_species_id, summary, clin_feat, history, pathology, mol_gen,
        control, inherit, characterised, map_info.
    :return: None
    """
    model = Model(self.g)
    phenotype_id = None

    sp_phene_label = row['phene_name']
    if sp_phene_label == '':
        sp_phene_label = None

    if 'omia_id' not in row:
        logger.info("omia_id not present for %s", row['phene_id'])
        # NOTE(review): phenotype_id is always None at this point, so every
        # row lacking an omia_id is handed the same key -- confirm against
        # _make_internal_id whether row['phene_id'] was intended.
        omia_id = self._make_internal_id('phene', phenotype_id)
    else:
        omia_id = 'OMIA:' + str(row['omia_id'])

    if self.testMode:
        # A blank or non-numeric species id can never be one of the test
        # taxa; treat it as "not selected" instead of letting int() raise.
        try:
            taxon_num = int(row['gb_species_id'])
        except (TypeError, ValueError):
            taxon_num = None
        if taxon_num not in self.test_ids['taxon'] \
                or omia_id not in self.test_ids['disease']:
            return

    # add to internal hash store for later lookup
    # (overwritten below once the species-specific id is known)
    self.id_hash['phene'][row['phene_id']] = omia_id

    descr = row['summary']
    if descr == '':
        descr = None

    # omia label
    omia_label = self.label_hash.get(omia_id)

    # add the species-specific subclass (TODO please review this choice)
    gb_species_id = row['gb_species_id']
    if gb_species_id == '':
        logger.error(
            "No species supplied in species-specific phene table for %s",
            omia_id)
        return
    sp_phene_id = '-'.join((omia_id, gb_species_id))

    species_id = 'NCBITaxon:' + str(gb_species_id)
    species_label = self.label_hash.get(species_id)
    if sp_phene_label is None \
            and omia_label is not None and species_label is not None:
        # no explicit name given: fall back to "<phene> in <species>"
        sp_phene_label = ' '.join((omia_label, 'in', species_label))

    model.addClassToGraph(sp_phene_id, sp_phene_label, omia_id, descr)

    # add to internal hash store for later lookup
    self.id_hash['phene'][row['phene_id']] = sp_phene_id
    self.label_hash[sp_phene_id] = sp_phene_label

    # add each of the following descriptions,
    # if they are populated, with a tag at the end.
    for item in ('clin_feat', 'history', 'pathology', 'mol_gen', 'control'):
        text = row[item]
        if text is None or text == '':
            continue
        model.addDescription(sp_phene_id, text + ' [' + item + ']')

    # if row['symbol'] is not None:  # species-specific
    # CHECK ME - sometimes spaces or gene labels
    # gu.addSynonym(g, sp_phene, row['symbol'])

    model.addOWLPropertyClassRestriction(
        sp_phene_id, model.object_properties['in_taxon'], species_id)

    # add inheritance as an association
    inheritance_id = self._map_inheritance_term_id(row['inherit'])
    if inheritance_id is not None:
        assoc = DispositionAssoc(
            self.g, self.name, sp_phene_id, inheritance_id)
        assoc.add_association_to_graph()

    if row['characterised'] == 'Yes':
        # keep the molecular-genetics text, keyed by OMIA id
        self.stored_omia_mol_gen[omia_id] = {
            'mol_gen': row['mol_gen'],
            'map_info': row['map_info'],
            'species': row['gb_species_id'],
        }
def _get_chrbands(self, limit, taxon, genome_id=None):
    """
    Build the chromosome band partonomy for one taxon from its chr band file.

    The coordinate columns of the file are not used by this parser; only the
    containment structure is modelled: each chromosome is a member of the
    genome, and each band is a subsequence of its parent bands, the
    outermost of which is a subsequence of the chromosome.

    Only rows whose chromosome name fully matches the placed-scaffold
    pattern are processed; unlocalized and unplaced scaffolds are skipped,
    as are comment lines and blank lines.

    :param limit: when not None, stop once more than this many lines
        (comments included) have been read.  The check runs only after a
        fully processed row, so skipped lines never trigger the stop.
    :param taxon: taxon number as a string; used as the key into self.files
        and appended to 'NCBITaxon:'
    :param genome_id: identifier to use for the genome; when None one is
        made with Genotype.makeGenomeID
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    if genome_id is None:
        genome_id = geno.makeGenomeID(taxon_id)  # makes a blank node always
    geno.addGenome(taxon_id, genome_label, genome_id)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    # compiled once instead of once per row
    placed_scaffold_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)' + r'$')
    # currently unused patterns
    # unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
    # unplaced_scaffold_pattern = r'chrUn_(\w+)'
    stain_re = re.compile(r'g(neg|pos|var)')

    col = ['chrom', 'start', 'stop', 'band', 'rtype']
    chrom_col = col.index('chrom')
    band_col = col.index('band')
    rtype_col = col.index('rtype')

    with gzip.open(myfile, 'rb') as reader:
        for raw_line in reader:
            line_counter += 1
            line = raw_line.decode().strip()
            # skip comments; a blank line used to raise IndexError on
            # line[0], so it is skipped here as well
            if not line or line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            chrom = row[chrom_col]
            band = row[band_col]
            rtype = row[rtype_col]

            # NOTE
            # some less-finished genomes have placed and unplaced scaffolds
            # * Placed scaffolds:
            #    Scaffold has an oriented location within a chromosome.
            # * Unlocalized scaffolds:
            #    scaffold's chromosome is known,
            #    scaffold's position, orientation or both is not known.
            # * Unplaced scaffolds:
            #    it is not known which chromosome the scaffold belongs to.
            #
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            if placed_scaffold_re.match(chrom) is None:
                # let's skip over anything that isn't a placed_scaffold
                # LOG.info("Skipping non-placed chromosome %s", chrom)  # chatty
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band (region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                region_type_id = self.globaltt['chromosome']

            # add the staining intensity of the band
            if stain_re.match(rtype):
                if region_type_id in (
                        self.globaltt['chromosome_band'],
                        self.globaltt['chromosome_subband']):
                    stain_type = self.resolve(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            self.globaltt['has_sequence_attribute'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome (SO:000340)
                    # because they don't actually have banding info
                    LOG.info(
                        "feature type '%s' is not chr band",
                        self.globaltcid[region_type_id])
            else:
                LOG.info('staining type not found for: %s', rtype)

            # get the parent bands, and make them unique;
            # reverse alphabetical order puts the most specific band first
            parents = sorted(
                self.make_parent_bands(band, set()), reverse=True)
            last = len(parents) - 1

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            for idx, prnt in enumerate(parents):
                parent = prnt.strip()
                if parent == '':
                    continue
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes,
                # then the subbands are subsequences of the full band
                # add the subsequence stuff as restrictions
                if idx < last:
                    # the next entry in the list is this band's container
                    pid = cclassid + parents[idx + 1]
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'], pid)
                    model.addOWLPropertyClassRestriction(
                        pid, self.globaltt['has subsequence'], pclassid)
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'],
                        cclassid)
                    model.addOWLPropertyClassRestriction(
                        cclassid, self.globaltt['has subsequence'],
                        pclassid)

            # connect the band here to the first one in the parent list
            if parents:
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    cclassid + parents[0])
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0], self.globaltt['has subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break
def _get_chrbands(self, limit, taxon):
    """
    Build the chromosome band partonomy for one taxon from its chr band file.

    The coordinate columns of the file are not used by this parser; only the
    containment structure is modelled: each chromosome is a member of the
    genome, and each band is a subsequence of its parent bands, the
    outermost of which is a subsequence of the chromosome.

    Only rows whose chromosome name fully matches the placed-scaffold
    pattern are processed; other scaffolds are logged and skipped.  Comment
    lines and blank lines are skipped silently.

    :param limit: when not None, stop once more than this many data rows
        have been read.  The check runs only after a fully processed row.
    :param taxon: taxon number as a string; used as the key into self.files
        and appended to 'NCBITaxon:'
    :return: None
    :raises ValueError: when a data row does not have exactly five
        tab-separated columns
    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    genome_id = geno.makeGenomeID(taxon_id)
    geno.addGenome(taxon_id, genome_label)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    # compiled once instead of being rebuilt for every row
    placed_scaffold_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)' + r'$')
    # TODO unused
    # unlocalized_scaffold_pattern = \
    #     placed_scaffold_pattern + r'_(\w+)_random'
    # unplaced_scaffold_pattern = r'chrUn_(\w+)'
    stain_re = re.compile(r'g(neg|pos|var)')

    with gzip.open(myfile, 'rb') as f:
        for raw_line in f:
            line = raw_line.decode().strip()
            # skip comments; blank lines are skipped too, since they would
            # otherwise fail the five-way unpack below with ValueError
            if not line or line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            chrom, _start, _stop, band, rtype = line.split('\t')
            line_counter += 1

            # NOTE
            # some less-finished genomes have placed and unplaced scaffolds
            # * Placed scaffolds:
            #    Scaffold has an oriented location within a chromosome.
            # * Unlocalized scaffolds:
            #    scaffold's chromosome is known,
            #    scaffold's position, orientation or both is not known.
            # * Unplaced scaffolds:
            #    it is not known which chromosome the scaffold belongs to.
            #
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            if placed_scaffold_re.match(chrom) is None:
                # let's skip over anything that isn't a placed_scaffold
                # at the class level
                logger.info("Skipping non-placed chromosome %s", chrom)
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band (region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                region_type_id = self.globaltt['chromosome']

            # add the staining intensity of the band
            if stain_re.match(rtype):
                if region_type_id in (
                        self.globaltt['chromosome_band'],
                        self.globaltt['chromosome_subband']):
                    stain_type = self.resolve(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            self.globaltt['has_sequence_attribute'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    logger.info(
                        "feature type %s != chr band", region_type_id)
            else:
                logger.warning('staining type not found: %s', rtype)

            # get the parent bands, and make them unique;
            # reverse alphabetical order puts the most specific band first
            parents = sorted(
                self.make_parent_bands(band, set()), reverse=True)
            last = len(parents) - 1

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            for idx, prnt in enumerate(parents):
                parent = prnt.strip()
                if parent == '':
                    continue
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes,
                # then the subbands are subsequences of the full band
                # add the subsequence stuff as restrictions
                if idx < last:
                    # the next entry in the list is this band's container
                    pid = cclassid + parents[idx + 1]
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'], pid)
                    model.addOWLPropertyClassRestriction(
                        pid, self.globaltt['has subsequence'], pclassid)
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'],
                        cclassid)
                    model.addOWLPropertyClassRestriction(
                        cclassid, self.globaltt['has subsequence'],
                        pclassid)

            # connect the band here to the first one in the parent list
            if parents:
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    cclassid + parents[0])
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0], self.globaltt['has subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break

    # TODO figure out the staining intensities for the encompassing bands
def _get_chrbands(self, limit, taxon):
    """
    Build the chromosome band partonomy for one taxon from its chr band file.

    The coordinate columns of the file are not used by this parser; only the
    containment structure is modelled: each chromosome is a member of the
    genome, and each band is a subsequence of its parent bands, the
    outermost of which is a subsequence of the chromosome.

    Only rows whose chromosome name fully matches the placed-scaffold
    pattern are processed; other scaffolds are logged and skipped.  Comment
    lines and blank lines are skipped silently.

    :param limit: when not None, stop once more than this many data rows
        have been read.  The check runs only after a fully processed row.
    :param taxon: taxon number as a string; used as the key into self.files
        and appended to 'NCBITaxon:'
    :return: None
    :raises ValueError: when a data row does not have exactly five
        tab-separated columns
    """
    model = Model(self.graph)
    family = Family(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    genome_id = geno.makeGenomeID(taxon_id)
    geno.addGenome(taxon_id, genome_label)
    model.addOWLPropertyClassRestriction(
        genome_id, Genotype.object_properties['in_taxon'], taxon_id)

    # compiled once instead of being rebuilt for every row
    placed_scaffold_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)' + r'$')
    # TODO unused
    # unlocalized_scaffold_pattern = \
    #     placed_scaffold_pattern + r'_(\w+)_random'
    # unplaced_scaffold_pattern = r'chrUn_(\w+)'
    stain_re = re.compile(r'g(neg|pos|var)')

    with gzip.open(myfile, 'rb') as f:
        for raw_line in f:
            line = raw_line.decode().strip()
            # skip comments; blank lines are skipped too, since they would
            # otherwise fail the five-way unpack below with ValueError
            if not line or line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            chrom, _start, _stop, band, rtype = line.split('\t')
            line_counter += 1

            # NOTE
            # some less-finished genomes have placed and unplaced scaffolds
            # * Placed scaffolds:
            #    Scaffold has an oriented location within a chromosome.
            # * Unlocalized scaffolds:
            #    scaffold's chromosome is known,
            #    scaffold's position, orientation or both is not known.
            # * Unplaced scaffolds:
            #    it is not known which chromosome the scaffold belongs to.
            #
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            if placed_scaffold_re.match(chrom) is None:
                # let's skip over anything that isn't a placed_scaffold
                # at the class level
                logger.info("Skipping non-placed chromosome %s", chrom)
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, family.object_properties['member_of'], genome_id)

            # add the band (region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                region_type_id = Feature.types['chromosome']

            # add the staining intensity of the band
            if stain_re.match(rtype):
                if region_type_id in (
                        Feature.types['chromosome_band'],
                        Feature.types['chromosome_subband']):
                    stain_type = Feature.types.get(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            Feature.properties['has_staining_intensity'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    logger.info(
                        "feature type %s != chr band", region_type_id)
            else:
                logger.warning('staining type not found: %s', rtype)

            # get the parent bands, and make them unique;
            # reverse alphabetical order puts the most specific band first
            parents = sorted(
                self.make_parent_bands(band, set()), reverse=True)
            last = len(parents) - 1

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            for idx, parent in enumerate(parents):
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes,
                # then the subbands are subsequences of the full band
                # add the subsequence stuff as restrictions
                if idx < last:
                    # the next entry in the list is this band's container
                    pid = cclassid + parents[idx + 1]
                    model.addOWLPropertyClassRestriction(
                        pclassid,
                        Feature.object_properties['is_subsequence_of'],
                        pid)
                    model.addOWLPropertyClassRestriction(
                        pid,
                        Feature.object_properties['has_subsequence'],
                        pclassid)
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    model.addOWLPropertyClassRestriction(
                        pclassid,
                        Feature.object_properties['is_subsequence_of'],
                        cclassid)
                    model.addOWLPropertyClassRestriction(
                        cclassid,
                        Feature.object_properties['has_subsequence'],
                        pclassid)

            # connect the band here to the first one in the parent list
            if parents:
                model.addOWLPropertyClassRestriction(
                    maplocclass_id,
                    Feature.object_properties['is_subsequence_of'],
                    cclassid + parents[0])
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0],
                    Feature.object_properties['has_subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break

    # TODO figure out the staining intensities for the encompassing bands
def _process_phene_row(self, row):
    """
    Add one row of the species-specific phene table to the graph.

    The row is modelled as a species-specific subclass of its OMIA phene,
    restricted to the row's taxon.  Populated free-text columns are attached
    as descriptions tagged with their column name, and a recognised
    inheritance term is attached through a D2PAssoc using the
    'has disposition' relation.

    In test mode a row is processed only when both its species and its OMIA
    id are listed in self.test_ids.

    :param row: mapping of column name to value for one phene record.
        Columns read here: phene_id, phene_name, omia_id (optional),
        gb_species_id, summary, clin_feat, history, pathology, mol_gen,
        control, inherit, characterised, map_info.
    :return: None
    """
    model = Model(self.graph)
    phenotype_id = None

    label = row['phene_name']
    if label == '':
        label = None

    if 'omia_id' in row:
        omia_id = 'OMIA:' + str(row['omia_id'])
    else:
        LOG.info("omia_id not present for %s", row['phene_id'])
        # NOTE(review): phenotype_id is always None at this point -- confirm
        # against _make_internal_id that this is the intended key.
        omia_id = self._make_internal_id('phene', phenotype_id)

    # in test mode keep only rows whose taxon AND disease are both test ids
    if self.test_mode and (
            row['gb_species_id'] not in self.test_ids['taxon']
            or omia_id not in self.test_ids['disease']):
        return

    # remember the generic id for later lookup
    # (replaced below once the species-specific id is known)
    self.id_hash['phene'][row['phene_id']] = omia_id

    summary = row['summary']
    if summary == '':
        summary = None

    generic_label = self.label_hash.get(omia_id)

    # the species-specific subclass (TODO please review this choice)
    gb_species_id = row['gb_species_id']
    if gb_species_id == '':
        LOG.error(
            "No species supplied in species-specific phene table for %s",
            omia_id)
        return
    sp_phene_id = '-'.join((omia_id, gb_species_id))

    species_id = 'NCBITaxon:' + str(gb_species_id)
    species_label = self.label_hash.get(species_id)
    if label is None and None not in (generic_label, species_label):
        # no explicit name given: fall back to "<phene> in <species>"
        label = ' '.join((generic_label, 'in', species_label))

    model.addClassToGraph(
        sp_phene_id, label, omia_id, summary,
        class_category=blv.terms['PhenotypicFeature'])

    # from here on the phene id resolves to the species-specific class
    self.id_hash['phene'][row['phene_id']] = sp_phene_id
    self.label_hash[sp_phene_id] = label

    # each populated free-text column becomes a description
    # with the column name as a trailing tag
    for column in ('clin_feat', 'history', 'pathology', 'mol_gen', 'control'):
        text = row[column]
        if text is None or text == '':
            continue
        model.addDescription(
            sp_phene_id, text + ' [' + column + ']',
            subject_category=blv.terms['PhenotypicFeature'])

    # if row['symbol'] is not None:  # species-specific
    # CHECK ME - sometimes spaces or gene labels
    # gu.addSynonym(g, sp_phene, row['symbol'])

    model.addOWLPropertyClassRestriction(
        sp_phene_id, self.globaltt['in taxon'], species_id,
        class_category=blv.terms['PhenotypicFeature'])

    # add inheritance as an association
    inherit = row['inherit']
    inheritance_id = None
    if inherit is not None:
        if inherit in self.localtt:
            inheritance_id = self.resolve(inherit)
        elif inherit != '':
            LOG.info('Unhandled inheritance type:\t%s', inherit)

    if inheritance_id is not None:
        # observable related to genetic disposition
        # JR: not sure we should be using D2PAssoc for this
        assoc = D2PAssoc(
            self.graph, self.name, sp_phene_id, inheritance_id,
            rel=self.globaltt['has disposition'],
            disease_category=blv.terms['PhenotypicFeature'])
        assoc.add_association_to_graph()

    if row['characterised'] == 'Yes':
        # keep the molecular-genetics text, keyed by OMIA id
        self.stored_omia_mol_gen[omia_id] = {
            'mol_gen': row['mol_gen'],
            'map_info': row['map_info'],
            'species': row['gb_species_id'],
        }