def addChromosome(self, chr, tax_id, tax_label=None, build_id=None, build_label=None):
    """
    Add a chromosome class for a taxon and, optionally, its build-specific instance.

    The taxon-level chromosome is always added as a class typed with
    ``Feature.types['chromosome']`` and the taxon is attached to the genome
    id derived from ``tax_id``.  When ``build_id`` is supplied, a
    build-specific chromosome is additionally created as an individual of
    that class and linked to the build as a member (in both directions).

    :param chr: chromosome name or number
    :param tax_id: identifier of the taxon the chromosome belongs to
    :param tax_label: optional taxon label used in the chromosome label
    :param build_id: optional build identifier
    :param build_label: optional build label; falls back to ``build_id``
    :return: None
    """
    # taxon-level chromosome class
    taxon_chr_id = makeChromID(str(chr), tax_id)
    taxon_chr_label = (
        makeChromLabel(chr) if tax_label is None
        else makeChromLabel(chr, tax_label))
    genome_id = self.makeGenomeID(tax_id)
    self.gu.addClassToGraph(
        self.graph, taxon_chr_id, taxon_chr_label,
        Feature.types['chromosome'])
    # attach the taxon to the genome
    self.addTaxon(tax_id, genome_id)

    if build_id is None:
        return

    # build-specific chromosome, an instance of the taxon-level class
    build_chr_id = makeChromID(chr, build_id)
    build_chr_label = makeChromLabel(
        chr, build_id if build_label is None else build_label)
    self.gu.addIndividualToGraph(
        self.graph, build_chr_id, build_chr_label, taxon_chr_id)

    # membership is recorded in both directions
    self.gu.addMember(self.graph, build_id, build_chr_id)
    self.gu.addMemberOf(self.graph, build_chr_id, build_id)
def addChromosomeInstance(
        self, chr_num, reference_id, reference_label, chr_type=None):
    """
    Add a chromosome as an individual belonging to a reference.

    The individual is typed with ``Feature.types['chromosome']`` and, when
    ``chr_type`` is given, with that class as well.  It is then linked to
    the reference as a member, in both directions.

    :param chr_num: chromosome name or number (stringified before use)
    :param reference_id: id of the reference, e.g. a build id like UCSC:hg19
    :param reference_label: label of the reference, used in the chr label
    :param chr_type: optional additional class for the individual,
        typically a genome-specific chromosome class
    :return: None
    """
    family = Family(self.graph)
    chrom_name = str(chr_num)
    instance_id = makeChromID(chrom_name, reference_id, 'MONARCH')
    instance_label = makeChromLabel(chrom_name, reference_label)

    self.model.addIndividualToGraph(
        instance_id, instance_label, Feature.types['chromosome'])
    if chr_type is not None:
        self.model.addType(instance_id, chr_type)

    # reference <-> chromosome membership, both ways
    family.addMember(reference_id, instance_id)
    family.addMemberOf(instance_id, reference_id)
def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
    """
    Add the generic (taxon-level) chromosome class to the graph.

    The ``NCBITaxon:`` prefix is removed from ``taxon_id`` before it is used
    as the reference for the chromosome class id.

    :param chrom_num: chromosome name or number
    :param taxon_id: taxon curie, e.g. ``NCBITaxon:9606``
    :param taxon_label: label used to build the chromosome class label
    :return: None
    """
    bare_taxon = re.sub('NCBITaxon:', '', taxon_id)
    self.model.addClassToGraph(
        makeChromID(chrom_num, bare_taxon, 'CHR'),  # generic class id
        makeChromLabel(chrom_num, taxon_label),
        self.globaltt['chromosome'])
def addChromosomeInstance(self, chr_num, reference_id, reference_label, chr_type=None):
    """
    Add a chromosome as an individual belonging to a reference.

    The individual is typed with ``self.globaltt['chromosome']`` and, when
    ``chr_type`` is given, with that class as well.  It is then linked to
    the reference as a member, in both directions.

    :param chr_num: chromosome name or number (stringified before use)
    :param reference_id: id of the reference, e.g. a build id like UCSC:hg19
    :param reference_label: label of the reference, used in the chr label
    :param chr_type: optional additional class for the individual,
        typically a genome-specific chromosome class
    :return: None
    """
    family = Family(self.graph)
    chrom_name = str(chr_num)
    instance_id = makeChromID(chrom_name, reference_id, 'MONARCH')
    instance_label = makeChromLabel(chrom_name, reference_label)

    self.model.addIndividualToGraph(
        instance_id, instance_label, self.globaltt['chromosome'])
    if chr_type is not None:
        self.model.addType(instance_id, chr_type)

    # reference <-> chromosome membership, both ways
    family.addMember(reference_id, instance_id)
    # usage dependent, todo: omit
    family.addMemberOf(instance_id, reference_id)
def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
    """
    Add the generic (taxon-level) chromosome class to the graph.

    The ``NCBITaxon:`` prefix is removed from ``taxon_id`` before it is used
    as the reference for the chromosome class id.

    :param chrom_num: chromosome name or number
    :param taxon_id: taxon curie, e.g. ``NCBITaxon:9606``
    :param taxon_label: label used to build the chromosome class label
    :return: None
    """
    bare_taxon = re.sub('NCBITaxon:', '', taxon_id)
    self.gu.addClassToGraph(
        self.graph,
        makeChromID(chrom_num, bare_taxon, 'CHR'),  # generic class id
        makeChromLabel(chrom_num, taxon_label),
        Feature.types['chromosome'])
def addChromosome(self, chrom, tax_id, tax_label=None, build_id=None, build_label=None):
    """
    Add a chromosome class for a taxon and, optionally, its build-specific instance.

    The taxon-level chromosome is always added as a class typed with
    ``self.globaltt['chromosome']`` and the taxon is attached to the genome
    id derived from ``tax_id``.  When ``build_id`` is supplied, a
    build-specific chromosome is additionally created as an individual of
    that class and linked to the build as a member (in both directions),
    with the group category ``blv.terms['GenomeBuild']``.

    :param chrom: chromosome name or number
    :param tax_id: identifier of the taxon the chromosome belongs to
    :param tax_label: optional taxon label used in the chromosome label
    :param build_id: optional build identifier
    :param build_label: optional build label; falls back to ``build_id``
    :return: None
    """
    family = Family(self.graph)

    # taxon-level chromosome class
    taxon_chr_id = makeChromID(str(chrom), tax_id)
    taxon_chr_label = (
        makeChromLabel(chrom) if tax_label is None
        else makeChromLabel(chrom, tax_label))
    genome_id = self.makeGenomeID(tax_id)
    self.model.addClassToGraph(
        taxon_chr_id, taxon_chr_label, self.globaltt['chromosome'])
    # attach the taxon to the genome
    self.addTaxon(tax_id, genome_id)

    if build_id is None:
        return

    # build-specific chromosome, an instance of the taxon-level class
    build_chr_id = makeChromID(chrom, build_id)
    build_chr_label = makeChromLabel(
        chrom, build_id if build_label is None else build_label)
    self.model.addIndividualToGraph(
        build_chr_id, build_chr_label, taxon_chr_id)

    # membership is recorded in both directions
    build_category = blv.terms['GenomeBuild']
    family.addMember(
        build_id, build_chr_id, group_category=build_category)
    family.addMemberOf(
        build_chr_id, build_id, group_category=build_category)
def _get_chrbands(self, limit, taxon):
    """
    Build the chromosome-band partonomy for one taxon from its band file.

    Reads the gzipped, tab-separated band file registered under
    ``self.files[taxon]``.  For every row on a placed chromosome it adds
    the chromosome class, the band class and the band's ancestor bands to
    the graph, and links them with "is subsequence of" / "has subsequence"
    restrictions.  The coordinate columns are not used here; only the
    partonomy is built.

    Blank lines and lines starting with ``#`` are skipped.  Rows whose
    chromosome name is not a placed chromosome are logged and skipped.

    :param limit: if not None, reading stops after a processed row once
        the running line count exceeds this value
    :param taxon: taxon number as a string; also the key into self.files
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    genome_id = geno.makeGenomeID(taxon_id)
    geno.addGenome(taxon_id, genome_label)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    # compiled once; the original rebuilt the pattern for every row
    placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
    placed_scaffold_regex = re.compile(placed_scaffold_pattern + r'$')
    # currently unused patterns
    # unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
    # unplaced_scaffold_pattern = r'chrUn_(\w+)'

    col = ['chrom', 'start', 'stop', 'band', 'rtype']
    chrom_idx = col.index('chrom')
    band_idx = col.index('band')
    rtype_idx = col.index('rtype')

    banded_types = [
        self.globaltt['chromosome_band'],
        self.globaltt['chromosome_subband']]

    with gzip.open(myfile, 'rb') as reader:
        for line in reader:
            line_counter += 1
            line = line.decode().strip()
            # skip blank lines and comments
            # (an empty line used to raise IndexError on line[0])
            if not line or line[0] == '#':
                continue

            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            chrom = row[chrom_idx]
            band = row[band_idx]
            rtype = row[rtype_idx]

            # NOTE
            # some less-finished genomes have placed and unplaced scaffolds
            # * Placed scaffolds:
            #    Scaffold has an oriented location within a chromosome.
            # * Unlocalized scaffolds:
            #     scaffold's chromosome is known,
            #     scaffold's position, orientation or both is not known.
            # * Unplaced scaffolds:
            #   it is not known which chromosome the scaffold belongs to.
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            if placed_scaffold_regex.match(chrom) is None:
                # let's skip over anything that isn't a placed_scaffold
                LOG.info("Skipping non-placed chromosome %s", chrom)
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band (region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band is not None and band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                region_type_id = self.globaltt['chromosome']

            # add the staining intensity of the band
            if re.match(r'g(neg|pos|var)', rtype):
                if region_type_id in banded_types:
                    stain_type = self.resolve(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            self.globaltt['has_sequence_attribute'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    LOG.info("feature type %s != chr band", region_type_id)
            else:
                LOG.warning('staining type not found: %s', rtype)

            # get the parent bands, and make them unique
            parents = list(self.make_parent_bands(band, set()))
            # alphabetical sort will put them in smallest to biggest
            parents.sort(reverse=True)

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            last_idx = len(parents) - 1
            for idx, prnt in enumerate(parents):
                parent = prnt.strip()
                if parent == "":
                    continue
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes,
                # then the subbands are subsequences of the full band
                # add the subsequence stuff as restrictions
                if idx != last_idx:
                    # the next entry in the sorted list is the enclosing band
                    pid = cclassid + parents[idx + 1]
                else:
                    # the last one (p or q usually)
                    # is attached to the chromosome itself
                    pid = cclassid
                model.addOWLPropertyClassRestriction(
                    pclassid, self.globaltt['is subsequence of'], pid)
                model.addOWLPropertyClassRestriction(
                    pid, self.globaltt['has subsequence'], pclassid)

            # connect the band here to the first one in the parent list
            if parents:
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    cclassid + parents[0])
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0], self.globaltt['has subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break

    # TODO figure out the staining intensities for the encompassing bands
    return
def _get_gene_info(self, limit):
    """
    Load genes from the gzipped ``gene_info`` file into the graph.

    Each accepted row becomes a class, or an individual when its mapped
    type is ``SO:0000110`` (sequence feature), with its symbol as label.
    The name, synonyms and other designations are added as synonyms, and
    cross-references are handed to ``self._add_gene_equivalencies``.
    When the row maps to a single chromosome (or to the X|Y pair), the
    chromosome class is added and the gene is made a subsequence of either
    the cytogenetic band or the chromosome.

    In test mode only genes in ``self.gene_ids`` are processed and the
    limit is ignored; otherwise only taxa in ``self.tax_ids`` are
    processed.

    :param limit: outside test mode, once more than this many accepted
        rows have been counted, later rows only get their class/individual
        status recorded in ``self.class_or_indiv``; None means no limit
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)
    model = Model(g)

    # not unzipping the file
    logger.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", gene_info)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        model.addClassToGraph(tax_id, None)

    with gzip.open(gene_info, 'rb') as f:
        # the first line is consumed as the header and only reported
        header = f.readline().decode().strip().split('\t')
        logger.info("Header has %i columns", len(header))
        for line in f:
            # skip comments
            line = line.decode().strip()
            if re.match(r'^#', line):
                continue
            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations,
             modification_date, feature_type) = line.split('\t')

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self.map_type_of_gene(gtype.strip())

            label = None if symbol == 'NEWENTRY' else symbol

            # sequence feature, not a gene
            if gene_type_id == 'SO:0000110':
                self.class_or_indiv[gene_id] = 'I'
            else:
                self.class_or_indiv[gene_id] = 'C'

            # deliberately `continue` (not `break`): class_or_indiv above
            # keeps being filled for every remaining row
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                continue

            if self.class_or_indiv[gene_id] == 'C':
                model.addClassToGraph(gene_id, label, gene_type_id, desc)
                # NCBI will be the default leader,
                # so we will not add the leader designation here.
            else:
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)
                # in this case, they aren't genes.
                # so we want someone else to be the leader.

            if name != '-':
                model.addSynonym(gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])

            if xrefs.strip() != '-':
                self._add_gene_equivalencies(xrefs, gene_id, tax_num)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # no idea why there's two bands listed - possibly 2 assemblies
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # this is of "unknown" type == susceptibility
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            # unlocated scaffold
            # 101928066       LOC101928066    1|Un    -
            # mouse --> 2C3
            # 11435   Chrna1  2       2 C3|2 43.76 cM
            # mouse --> 11B1.1
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # we will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if str(chrom) != '-' and str(chrom) != '':
                if re.search(r'\|', str(chrom)) and \
                        str(chrom) not in ['X|Y', 'X; Y']:
                    # means that there's uncertainty in the mapping.
                    # so skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    # (fixed: this used to log the builtin `chr`
                    # instead of the chromosome column)
                    logger.info(
                        '%s is non-uniquely mapped to %s.' +
                        ' Skipping for now.',
                        gene_id, str(chrom))
                    continue
                    # X|Y	Xp22.33;Yp11.3

                if str(chrom) == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing
                # do this in a loop to allow PAR regions like X|Y
                for c in re.split(r'\|', str(chrom)):
                    # assume that the chromosome label is added elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    model.addSynonym(mychrom, mychrom_syn)
                    band_match = re.match(
                        r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)
                    if band_match is not None and \
                            len(band_match.groups()) > 0:
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # TODO we probably need a different regex
                        # per organism
                        # the map_loc already has the chromosome in it,
                        # strip it first; the name is escaped so it is
                        # matched literally rather than as a regex
                        bid = re.sub(r'^' + re.escape(c), '', map_loc)
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # Assume its type will be added elsewhere
                        band = Feature(g, maploc_id, None, None)
                        band.addFeatureToGraph()
                        # add the band as the containing feature
                        g.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases: examples are:
                        # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                        # 12cen-q21,22q13.3|22q13.3
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        g.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)

    return
def _get_gene_info(self, limit):
    """
    Load genes from the gzipped ``gene_info`` file into the graph.

    Each accepted row becomes a class with its symbol as label.  The name,
    synonyms and other designations are added as synonyms.  Cleaned-up
    cross-references are added as equivalent classes, except that HPRD ids
    are attached as gene products and Vega / IMGT/GENE-DB ids are skipped.
    When the row maps to a single chromosome (or to the X|Y pair), the
    chromosome class is added and the gene is made a subsequence of either
    the cytogenetic band or the chromosome.

    Only taxa in ``self.tax_ids`` are processed; in test mode the gene
    must additionally be in ``self.gene_ids``.

    :param limit: outside test mode, reading stops once more than this
        many accepted rows have been counted; None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)

    # not unzipping the file
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", myfile)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        gu.addClassToGraph(g, tax_id, None)

    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if re.match(r'^#', line):
                continue
            # `chrom` rather than `chr`, so the builtin is not shadowed
            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations,
             modification_date) = line.split('\t')

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self._map_type_of_gene(gtype)

            label = None if symbol == 'NEWENTRY' else symbol

            # TODO might have to figure out if things aren't genes,
            # and make them individuals
            gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

            if name != '-':
                gu.addSynonym(g, gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    gu.addSynonym(
                        g, gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    gu.addSynonym(
                        g, gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])

            # deal with the xrefs
            # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
            if xrefs.strip() != '-':
                for r in xrefs.strip().split('|'):
                    fixedr = self._cleanup_id(r)
                    if fixedr is None or fixedr.strip() == '':
                        continue
                    if re.match(r'HPRD', fixedr):
                        # proteins are not == genes.
                        gu.addTriple(
                            g, gene_id,
                            self.properties['has_gene_product'], fixedr)
                    elif fixedr.split(':')[0] not in [
                            'Vega', 'IMGT/GENE-DB']:
                        # skip some of these for now
                        gu.addEquivalentClass(g, gene_id, fixedr)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            #   no idea why there's two bands listed - possibly 2 assemblies
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            #   this is of "unknown" type == susceptibility
            # 101928066       LOC101928066    1|Un    -  # unlocated scaffold
            # 11435   Chrna1  2       2 C3|2 43.76 cM    # mouse --> 2C3
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM  # mouse --> 11B1.1
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # only those that align to one chr are taken

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if str(chrom) != '-' and str(chrom) != '':
                # raw strings throughout: '\|', '\d' and '\.' in plain
                # literals are invalid escape sequences
                if re.search(r'\|', str(chrom)) and \
                        str(chrom) not in ['X|Y', 'X; Y']:
                    # this means that there's uncertainty in the mapping.
                    # skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    logger.info(
                        '%s is non-uniquely mapped to %s.  Skipping for now.',
                        gene_id, str(chrom))
                    continue
                    # X|Y	Xp22.33;Yp11.3

                if str(chrom) == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing
                # do this in a loop to allow PAR regions like X|Y
                for c in re.split(r'\|', str(chrom)):
                    # assume that the chromosome label will get added
                    # elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use the taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    gu.addSynonym(g, mychrom, mychrom_syn)
                    band_match = re.match(
                        r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)
                    if band_match is not None and \
                            len(band_match.groups()) > 0:
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # TODO we probably need a different regex
                        # per organism
                        # the map_loc already has the chromosome in it,
                        # strip it first; the name is escaped so it is
                        # matched literally rather than as a regex
                        bid = re.sub(r'^' + re.escape(c), '', map_loc)
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # Assume its type will be added elsewhere
                        band = Feature(maploc_id, None, None)
                        band.addFeatureToGraph(g)
                        # add the band as the containing feature
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases
                        # examples are: 15q11-q22, Xp21.2-p11.23,
                        # 15q22-qter, 10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1,
                        # 12cen-q21, 22q13.3|22q13.3
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
def _get_chrbands(self, limit, src_key, genome_id):
    """
    Load chromosomes, scaffolds and bands, with coordinates, for one build.

    Reads the gzipped, tab-separated band file registered under
    ``self.files[src_key]`` in two phases:

    1. Every row is folded into the ``mybands`` dict, keyed by chromosome,
       scaffold or chromosome+band name.  Enclosing (parent) bands are
       derived from the band name, and their min/max coordinates are
       widened as more child bands are seen.
    2. Every entry of ``mybands`` is written to the graph as a
       build-specific feature with start and end locations on its
       build-specific chromosome.

    Blank lines and lines starting with ``#`` are skipped.

    NOTE(review): ``unlocalized_scaffold_regex`` is applied with
    ``.match()``, which anchors at the start of the string, yet its
    pattern begins with ``_``.  It also defines one group while the branch
    using it requires two.  As written that branch looks unreachable, so
    unlocalized scaffolds would end up in the "DROPPED" log message.
    Left unchanged here; confirm the intended pattern.

    :param limit: maximum number of lines to read; None means no limit
    :param src_key: key into ``self.files``; also used as the taxon number
    :param genome_id: genome identifier passed through to ``addGenome``
    :return: None
    """
    tax_num = src_key
    if limit is None:
        limit = sys.maxsize   # practical limit anyway
    model = Model(self.graph)
    line_num = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompassing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[src_key]['genome_label']
    taxon_curie = 'NCBITaxon:' + tax_num
    species_name = self.globaltcid[taxon_curie]  # for logging

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_curie, None)
    model.addSynonym(taxon_curie, genome_label)

    geno.addGenome(taxon_curie, genome_label, genome_id)

    # add the build and the taxon it's in
    build_num = self.files[src_key]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_curie)

    # cat (at least) also has chr[BDAECF]...
    if tax_num == self.localtt['Felis catus']:
        placed_scaffold_regex = re.compile(
            r'(chr(?:[BDAECF]\d+|X|Y|Z|W|M|))$')
    else:
        placed_scaffold_regex = re.compile(r'(chr(?:\d+|X|Y|Z|W|M))$')

    unlocalized_scaffold_regex = re.compile(r'_(\w+)_random')
    unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')

    known_stains = [
        'gneg', 'gpos25', 'gpos33', 'gpos50', 'gpos66', 'gpos75',
        'gpos100', 'acen', 'gvar', 'stalk'
    ]

    # process the bands
    col = self.files[src_key]['columns']

    # the context manager closes the file; no explicit close() is needed
    with gzip.open(myfile, 'rb') as binreader:
        for line in binreader:
            line_num += 1
            # every later line would be skipped too, so stop reading
            if line_num > limit:
                break
            line = line.decode().strip()
            # skip blank lines and comments
            # (an empty line used to raise IndexError on line[0])
            if not line or line[0] == '#':
                continue

            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            scaffold = row[col.index('chrom')].strip()
            start = row[col.index('chromStart')]
            stop = row[col.index('chromEnd')]
            band_num = row[col.index('name')].strip()
            rtype = row[col.index('gieStain')]

            # NOTE some less-finished genomes have
            # placed and unplaced scaffolds
            # * Placed scaffolds:
            #       the scaffolds have been placed within a chromosome.
            # * Unlocalized scaffolds:
            #   although the chromosome within which the scaffold occurs
            #   is known, the scaffold's position or orientation
            #   is not known.
            # * Unplaced scaffolds:
            #   it is not known which chromosome the scaffold belongs to
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            mch = placed_scaffold_regex.match(scaffold)
            if mch is not None and len(mch.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # anything that isn't a placed_scaffold
                # gets no chromosome at the class level
                chrom_num = None

            m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
            m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)

            scaffold_num = None
            if mch:
                pass
            elif m_chr_unloc is not None and len(
                    m_chr_unloc.groups()) == 2:
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
            elif m_chr_unplaced is not None and len(
                    m_chr_unplaced.groups()) == 1:
                scaffold_num = m_chr_unplaced.group(1)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, tax_num, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_curie,
                    self.files[src_key]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome']
                    }
            elif scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold
                }
            else:
                LOG.info('%s line %i DROPPED chromosome/scaffold %s',
                         species_name, line_num, scaffold)

            parents = list()

            # see if new types have showed up
            if rtype is not None and rtype not in known_stains:
                LOG.info('Unknown gieStain type "%s" in %s at %i',
                         rtype, src_key, line_num)
                self.globaltt[rtype]  # blow up

            if rtype == 'acen':  # hacky, revisit if ontology improves
                rtype = self.localtt[rtype]

            if band_num is not None and band_num != '' and \
                    rtype is not None and rtype != '':
                # add the specific band
                mybands[chrom_num + band_num] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': self.globaltt[rtype],
                }

                # get the parent bands, and make them unique
                parents = list(monochrom.make_parent_bands(
                    band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)

                if parents:
                    mybands[chrom_num + band_num]['parent'] = \
                        chrom_num + parents[0]
                # else: band has no parents

            # loop through the parents and add them to the dict,
            # in hierarchical order
            last_idx = len(parents) - 1
            for i, parent_band in enumerate(parents):
                rti = getChrPartTypeByNotation(parent_band, self.graph)

                pnum = chrom_num + parent_band
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    bnd = mybands.get(pnum)
                    bnd['min'] = min(sta, sto, bnd['min'])
                    bnd['max'] = max(sta, sto, bnd['max'])
                    mybands[pnum] = bnd

                    # also, set the max for the chrom
                    chrom = mybands.get(chrom_num)
                    chrom['max'] = max(sta, sto, chrom['max'])
                    mybands[chrom_num] = chrom

                # add the parent relationships to each
                if i < last_idx:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # end looping through file

    # loop through the hash and add the bands to the graph
    for bnd, myband in mybands.items():
        band_class_id = makeChromID(bnd, tax_num, 'CHR')
        band_class_label = makeChromLabel(bnd, genome_label)
        band_build_id = makeChromID(bnd, build_num, 'MONARCH')
        band_build_label = makeChromLabel(bnd, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')

        # if it's != part, then add the class
        if myband['type'] != self.globaltt['assembly_component']:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == self.globaltt['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == self.globaltt['assembly_component']:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)
def _get_chrbands(self, limit, taxon):
    """
    Build the class-level chromosome band partonomy for one taxon.

    Reads the gzipped, tab-delimited band file registered for ``taxon``
    in ``self.files`` (rows look like
    ``chr13 4500000 10000000 p12 stalk``).  The coordinate columns are
    ignored; only the hierarchy band -> parent bands -> chromosome ->
    genome is written, as OWL property restrictions between classes.
    Rows whose first column is not a plain chromosome name
    (``chr`` + number/X/Y/Z/W/MT/M) are logged and skipped.

    :param limit: when not None, stop reading once more than ``limit``
        data rows have been counted.  Skipped (non-placed) rows are
        counted but never trigger the stop themselves.
    :param taxon: key into ``self.files``; also appended to
        ``NCBITaxon:`` to form the taxon curie
    :return: None
    """
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    self.gu.addClassToGraph(self.graph, taxon_id, None)
    self.gu.addSynonym(self.graph, taxon_id, genome_label)

    self.gu.loadObjectProperties(self.graph, Feature.object_properties)

    genome_id = geno.makeGenomeID(taxon_id)
    geno.addGenome(taxon_id, genome_label)
    self.gu.addOWLPropertyClassRestriction(
        self.graph, genome_id,
        Genotype.object_properties['in_taxon'], taxon_id)

    # Compiled once here rather than being rebuilt for every row.
    # Only fully placed chromosomes match; unlocalized scaffolds
    # (chr10_KL568008v1_random) and unplaced scaffolds
    # (chrUn_AABR07022428v1) do not.
    placed_scaffold_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)$')
    stain_re = re.compile(r'g(neg|pos|var)')

    with gzip.open(myfile, 'rb') as band_file:
        for raw_line in band_file:
            line = raw_line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            (chrom, _start, _stop, band, rtype) = line.split('\t')
            line_counter += 1

            # skip over anything that isn't a placed scaffold
            # at the class level
            if placed_scaffold_re.match(chrom) is None:
                logger.info("Skipping non-placed chromosome %s", chrom)
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            self.gu.addOWLPropertyClassRestriction(
                self.graph, cclassid,
                self.gu.object_properties['member_of'], genome_id)

            # add the band(region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                self.gu.addClassToGraph(
                    self.graph, maplocclass_id, maplocclass_label,
                    region_type_id)
            else:
                # a row without a band name describes the chromosome itself
                region_type_id = Feature.types['chromosome']

            # add the staining intensity of the band
            if stain_re.match(rtype):
                if region_type_id in (
                        Feature.types['chromosome_band'],
                        Feature.types['chromosome_subband']):
                    stain_type = Feature.types.get(rtype)
                    if stain_type is not None:
                        self.gu.addOWLPropertyClassRestriction(
                            self.graph, maplocclass_id,
                            Feature.properties['has_staining_intensity'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    logger.info(
                        "feature type %s != chr band", region_type_id)
            else:
                logger.warning('staining type not found: %s', rtype)

            # get the parent bands, and make them unique;
            # a reverse alphabetical sort orders them from the closest
            # (longest) parent to the most distant (p or q usually)
            parents = sorted(
                self.make_parent_bands(band, set()), reverse=True)

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            last_index = len(parents) - 1
            for i, parent in enumerate(parents):
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent)

                self.gu.addClassToGraph(
                    self.graph, pclassid, pclass_label, rti)

                # each parent is a subsequence of the next one in the
                # list; the last one (p or q usually) is attached to
                # the chromosome itself
                if i < last_index:
                    container_id = cclassid + parents[i + 1]
                else:
                    container_id = cclassid

                self.gu.addOWLPropertyClassRestriction(
                    self.graph, pclassid,
                    Feature.object_properties['is_subsequence_of'],
                    container_id)
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, container_id,
                    Feature.object_properties['has_subsequence'],
                    pclassid)

            # connect the band here to the first one in the parent list
            if parents:
                nearest_parent_id = cclassid + parents[0]
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, maplocclass_id,
                    Feature.object_properties['is_subsequence_of'],
                    nearest_parent_id)
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, nearest_parent_id,
                    Feature.object_properties['has_subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break

    self.gu.loadAllProperties(self.graph)

    # TODO figure out the staining intensities for the encompassing bands

    return
def _get_chrbands(self, limit, taxon):
    """
    Add build-specific chromosome, scaffold and band features for a taxon.

    Reads the gzipped, tab-delimited band file registered for ``taxon``
    in ``self.files`` (rows look like
    ``chr13 4500000 10000000 p12 stalk``) in two passes:

    1. every row is folded into the ``mybands`` dict, keyed by
       chromosome / scaffold / band name, so that the extent (min/max)
       of each encompassing parent band can be accumulated;
    2. each entry of ``mybands`` is written to the graph as a
       ``Feature`` with start and end locations.

    :param limit: accepted for interface compatibility; not used,
        every row of the file is processed
    :param taxon: key into ``self.files``; also appended to
        ``NCBITaxon:`` to form the taxon curie
    :return: None
    """
    model = Model(self.graph)
    # TODO PYLINT figure out what limit was for and why it is unused
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # NOTE some less-finished genomes have
    # placed and unplaced scaffolds
    # * Placed scaffolds:
    #       the scaffolds have been placed within a chromosome.
    # * Unlocalized scaffolds:
    #       although the chromosome within which the scaffold occurs
    #       is known, the scaffold's position or orientation
    #       is not known.
    # * Unplaced scaffolds:
    #       it is not known which chromosome the scaffold belongs to
    #
    # ex: unlocalized scaffold: chr10_KL568008v1_random
    # ex: unplaced scaffold: chrUn_AABR07022428v1
    # The patterns are compiled once instead of once per row.
    placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    placed_re = re.compile(placed_scaffold_pattern + r'$')
    unlocalized_re = re.compile(placed_scaffold_pattern + r'_(\w+)_random')
    unplaced_re = re.compile(r'chr(Un(?:_\w+)?)')
    stain_re = re.compile(r'g(neg|pos|var)')

    # process the bands; the context manager closes the file
    with gzip.open(myfile, 'rb') as band_file:
        for raw_line in band_file:
            line = raw_line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            (scaffold, start, stop, band_num, rtype) = line.split('\t')

            # find out if the thing is a full on chromosome,
            # or a scaffold
            chrom_num = None
            scaffold_num = None
            placed = placed_re.match(scaffold)
            if placed is not None:
                # the chromosome is the first match of the pattern
                chrom_num = placed.group(1)
            else:
                logger.info("Found non-placed chromosome %s", scaffold)
                unlocalized = unlocalized_re.match(scaffold)
                unplaced = unplaced_re.match(scaffold)
                if unlocalized is not None:
                    chrom_num = unlocalized.group(1)
                    scaffold_num = chrom_num + '_' + unlocalized.group(2)
                elif unplaced is not None:
                    scaffold_num = unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id,
                    self.files[taxon]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': Feature.types['chromosome']}

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': Feature.types['assembly_component'],
                    'synonym': scaffold}

            # parents stays an (empty) list when the row names no band,
            # so the indexing loop below is always well-typed
            parents = []
            if band_num is not None and band_num.strip() != '':
                band_key = chrom_num + band_num
                # add the specific band
                mybands[band_key] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': None}

                # add the staining intensity of the band
                if stain_re.match(rtype):
                    mybands[band_key]['stain'] = Feature.types.get(rtype)

                # get the parent bands, and make them unique;
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents = sorted(
                    monochrom.make_parent_bands(band_num, set()),
                    reverse=True)

                if parents:
                    mybands[band_key]['parent'] = chrom_num + parents[0]

            # loop through the parents and add them to the hash,
            # in hierarchical order
            last_index = len(parents) - 1
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent)

                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti}
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    grouping = mybands[pnum]
                    grouping['min'] = min(sta, sto, grouping['min'])
                    grouping['max'] = max(sta, sto, grouping['max'])

                    # also, set the max for the chrom
                    chrom_entry = mybands[chrom_num]
                    chrom_entry['max'] = max(sta, sto, chrom_entry['max'])

                # add the parent relationships to each
                if i < last_index:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # end looping through file

    # loop through the hash and add the bands to the graph
    for band_key, myband in mybands.items():
        band_class_id = makeChromID(band_key, taxon, 'CHR')
        band_class_label = makeChromLabel(band_key, genome_label)
        band_build_id = makeChromID(band_key, build_num, 'MONARCH')
        band_build_label = makeChromLabel(band_key, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')
        is_component = \
            myband['type'] == Feature.types['assembly_component']

        # if it's != part, then add the class
        if not is_component:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        # only assembly components get an explicit containment triple
        if is_component:
            if myband['parent'] is None:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
            else:
                parent_chrom_in_build = makeChromID(
                    myband['parent'], build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if myband.get('stain') is not None:
            # TODO 'has_staining_intensity' being dropped by MB
            bfeature.addFeatureProperty(
                Feature.properties['has_staining_intensity'],
                myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)

    return
def _get_chrbands(self, limit, taxon):
    """
    Add build-specific chromosome, scaffold and band features for a taxon.

    Reads the gzipped, tab-delimited band file registered for ``taxon``
    in ``self.files`` (rows look like
    ``chr13 4500000 10000000 p12 stalk``) in two passes:

    1. every row is folded into the ``mybands`` dict, keyed by
       chromosome / scaffold / band name, so that the extent (min/max)
       of each encompassing parent band can be accumulated;
    2. each entry of ``mybands`` is written to the graph as a
       ``Feature`` with start and end locations.

    Term identifiers come from ``self.globaltt`` and ``self.resolve``.

    :param limit: accepted for interface compatibility; not used,
        every row of the file is processed
    :param taxon: key into ``self.files``; also appended to
        ``NCBITaxon:`` to form the taxon curie
    :return: None
    """
    model = Model(self.graph)
    # TODO PYLINT figure out what limit was for and why it is unused
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # NOTE some less-finished genomes have
    # placed and unplaced scaffolds
    # * Placed scaffolds:
    #       the scaffolds have been placed within a chromosome.
    # * Unlocalized scaffolds:
    #       although the chromosome within which the scaffold occurs
    #       is known, the scaffold's position or orientation
    #       is not known.
    # * Unplaced scaffolds:
    #       it is not known which chromosome the scaffold belongs to
    #
    # ex: unlocalized scaffold: chr10_KL568008v1_random
    # ex: unplaced scaffold: chrUn_AABR07022428v1
    # The patterns are compiled once instead of once per row.
    placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    placed_re = re.compile(placed_scaffold_pattern + r'$')
    unlocalized_re = re.compile(placed_scaffold_pattern + r'_(\w+)_random')
    unplaced_re = re.compile(r'chr(Un(?:_\w+)?)')
    stain_re = re.compile(r'g(neg|pos|var)')

    # process the bands; the context manager closes the file
    with gzip.open(myfile, 'rb') as band_file:
        for raw_line in band_file:
            line = raw_line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            (scaffold, start, stop, band_num, rtype) = line.split('\t')

            # find out if the thing is a full on chromosome,
            # or a scaffold
            chrom_num = None
            scaffold_num = None
            placed = placed_re.match(scaffold)
            if placed is not None:
                # the chromosome is the first match of the pattern
                chrom_num = placed.group(1)
            else:
                logger.info("Found non-placed chromosome %s", scaffold)
                unlocalized = unlocalized_re.match(scaffold)
                unplaced = unplaced_re.match(scaffold)
                if unlocalized is not None:
                    chrom_num = unlocalized.group(1)
                    scaffold_num = chrom_num + '_' + unlocalized.group(2)
                elif unplaced is not None:
                    scaffold_num = unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id,
                    self.files[taxon]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome']
                    }

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold
                }

            # parents stays an (empty) list when the row names no band,
            # so the indexing loop below is always well-typed
            parents = []
            if band_num is not None and band_num.strip() != '':
                band_key = chrom_num + band_num
                # add the specific band
                mybands[band_key] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': None
                }

                # add the staining intensity of the band
                if stain_re.match(rtype):
                    mybands[band_key]['stain'] = self.resolve(rtype)

                # get the parent bands, and make them unique;
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents = sorted(
                    monochrom.make_parent_bands(band_num, set()),
                    reverse=True)

                if parents:
                    mybands[band_key]['parent'] = chrom_num + parents[0]

            # loop through the parents and add them to the hash,
            # in hierarchical order
            last_index = len(parents) - 1
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent)

                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    grouping = mybands[pnum]
                    grouping['min'] = min(sta, sto, grouping['min'])
                    grouping['max'] = max(sta, sto, grouping['max'])

                    # also, set the max for the chrom
                    chrom_entry = mybands[chrom_num]
                    chrom_entry['max'] = max(sta, sto, chrom_entry['max'])

                # add the parent relationships to each
                if i < last_index:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # end looping through file

    # loop through the hash and add the bands to the graph
    for band_key, myband in mybands.items():
        band_class_id = makeChromID(band_key, taxon, 'CHR')
        band_class_label = makeChromLabel(band_key, genome_label)
        band_build_id = makeChromID(band_key, build_num, 'MONARCH')
        band_build_label = makeChromLabel(band_key, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')
        is_component = \
            myband['type'] == self.globaltt['assembly_component']

        # if it's != part, then add the class
        if not is_component:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        # only assembly components get an explicit containment triple
        if is_component:
            if myband['parent'] is None:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
            else:
                parent_chrom_in_build = makeChromID(
                    myband['parent'], build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if myband.get('stain') is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)

    return
def _get_gene_info(self, limit):
    """
    Loop through the gzipped gene_info file and add each gene to the graph.

    For every accepted row the gene is added as a class (or as an
    individual when its resolved type is ``sequence_feature``) with its
    symbol as label.  The full name, synonyms and other designations
    are added as synonyms, dbXrefs are handed to
    ``self._add_gene_equivalencies``, and the gene is attached as a
    subsequence of its chromosome band (when ``map_location`` matches
    the regular band pattern) or of its chromosome otherwise.

    Rows are accepted by gene id (``self.gene_ids``) in test mode and
    by taxon (``self.tax_ids``) otherwise.  ``self.class_or_indiv`` is
    filled in for every accepted row, even past ``limit``.

    :param limit: outside test mode, when not None, rows after the
        ``limit``-th line only update ``self.class_or_indiv`` and add
        nothing to the graph
    :return: None
    """
    src_key = 'gene_info'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    geno = Genotype(graph)
    model = Model(graph)

    # not unzipping the file
    LOG.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", gene_info)
    LOG.info('Add taxa and genome classes for those in our filter')

    band_regex = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

    for tax_num in self.tax_ids:
        tax_curie = ':'.join(('NCBITaxon', tax_num))
        # tax label can get added elsewhere
        geno.addGenome(tax_curie, tax_num)
        # label added elsewhere
        model.addClassToGraph(tax_curie, None)

    col = self.files[src_key]['columns']
    # column positions are fixed for the whole file, so look them up
    # once rather than for every row
    tax_ix = col.index('tax_id')
    gene_ix = col.index('GeneID')
    symbol_ix = col.index('Symbol')
    synonyms_ix = col.index('Synonyms')
    dbxrefs_ix = col.index('dbXrefs')
    chrom_ix = col.index('chromosome')
    map_loc_ix = col.index('map_location')
    desc_ix = col.index('description')
    gtype_ix = col.index('type_of_gene')
    name_ix = col.index('Full_name_from_nomenclature_authority')
    other_ix = col.index('Other_designations')

    LOG.info('Begin reading & parsing')
    with gzip.open(gene_info, 'rb') as tsv:
        header = tsv.readline().decode().strip().split('\t')
        header[0] = header[0][1:]  # strip comment char
        # the outcome of the header check is deliberately not acted on;
        # parsing proceeds regardless
        self.check_fileheader(col, header)

        for raw_line in tsv:
            line_counter += 1
            # decode BEFORE testing for a comment: indexing bytes yields
            # an int, which never compares equal to '#'
            line = raw_line.decode().strip()
            if not line or line.startswith('#'):
                # skip blank lines and comments
                continue
            row = line.split('\t')

            # ##set filter=None in init if you don't want to have a filter
            # if self.id_filter is not None:
            #     if ((self.id_filter == 'taxids' and \
            #          (tax_num not in self.tax_ids))
            #           or (self.id_filter == 'geneids' and \
            #               (int(gene_num) not in self.gene_ids))):
            #         continue
            # #### end filter

            tax_num = row[tax_ix]
            gene_num = row[gene_ix]
            symbol = row[symbol_ix]
            synonyms = row[synonyms_ix].strip()
            dbxrefs = row[dbxrefs_ix].strip()
            chrom = row[chrom_ix].strip()
            map_loc = row[map_loc_ix].strip()
            desc = row[desc_ix]
            gtype = row[gtype_ix].strip()
            name = row[name_ix]
            other_designations = row[other_ix].strip()

            if self.test_mode and int(gene_num) not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            tax_curie = ':'.join(('NCBITaxon', tax_num))
            gene_id = ':'.join(('NCBIGene', gene_num))

            gene_type_id = self.resolve(gtype)

            label = None if symbol == 'NEWENTRY' else symbol

            # sequence feature, not a gene
            if gene_type_id == self.globaltt['sequence_feature']:
                self.class_or_indiv[gene_id] = 'I'
            else:
                self.class_or_indiv[gene_id] = 'C'

            # past the limit we keep scanning (continue, not break) so
            # that class_or_indiv is still filled in for every gene
            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                continue

            if self.class_or_indiv[gene_id] == 'C':
                model.addClassToGraph(gene_id, label, gene_type_id, desc)
                # NCBI will be the default leader (for non mods),
                # so we will not add the leader designation here.
            else:
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)
                # in this case, they aren't genes.
                # so we want someone else to be the leader

            if name != '-':
                model.addSynonym(gene_id, name)

            if synonyms != '-':
                for syn in synonyms.split('|'):
                    syn = syn.strip()
                    # unknown curies may occur here
                    if syn[:12] == 'AnimalQTLdb:' and \
                            tax_curie in self.informal_species:
                        syn = self.informal_species[
                            tax_curie] + 'QTL:' + syn[12:]
                        LOG.info('AnimalQTLdb: CHANGED to: %s', syn)
                    model.addSynonym(
                        gene_id, syn, model.globaltt['has_related_synonym'])

            if other_designations != '-':
                for syn in other_designations.split('|'):
                    model.addSynonym(
                        gene_id, syn.strip(),
                        model.globaltt['has_related_synonym'])

            if dbxrefs != '-':
                self._add_gene_equivalencies(dbxrefs, gene_id, tax_curie)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # no idea why there's two bands listed - possibly 2 assemblies
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # this is of "unknown" type == susceptibility
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            # unlocated scaffold
            # 101928066       LOC101928066    1|Un    -\
            # mouse --> 2C3
            # 11435   Chrna1  2       2 C3|2 43.76 cM
            # mouse --> 11B1.1
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # we will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # means that there's uncertainty in the mapping.
                    # so skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    LOG.info(
                        '%s is non-uniquely mapped to %s. Skipping for now.',
                        gene_id, chrom)
                    continue
                    # X|Y	Xp22.33;Yp11.3

                if chrom == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing

                # map_loc is the same for every chromosome of the row
                is_regular_band = band_regex.match(map_loc) is not None

                # do this in a loop to allow PAR regions like X|Y
                for chromosome in chrom.split('|'):
                    # assume that the chromosome label is added elsewhere
                    geno.addChromosomeClass(chromosome, tax_curie, None)
                    mychrom = makeChromID(chromosome, tax_num, 'CHR')
                    # temporarily use taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(chromosome, tax_num)
                    model.addSynonym(mychrom, mychrom_syn)

                    if is_regular_band:
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # TODO we probably need a different regex
                        # per organism
                        # the map_loc already has the numeric chromosome
                        # in it, strip it first (plain prefix test, so a
                        # chromosome name is never read as a regex)
                        if map_loc.startswith(chromosome):
                            bid = map_loc[len(chromosome):]
                        else:
                            bid = map_loc
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(
                            chromosome + bid, tax_num, 'CHR')
                        # Assume it's type will be added elsewhere
                        band = Feature(graph, maploc_id, None, None)
                        band.addFeatureToGraph()
                        # add the band as the containing feature
                        graph.addTriple(
                            gene_id,
                            self.globaltt['is subsequence of'],
                            maploc_id)
                    else:
                        # TODO handle these cases: examples are:
                        # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                        # 12cen-q21,22q13.3|22q13.3
                        LOG.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        graph.addTriple(
                            gene_id,
                            self.globaltt['is subsequence of'],
                            mychrom)

            geno.addTaxon(tax_curie, gene_id)
def _get_chrbands(self, limit, taxon):
    """
    Build the class-level chromosome band partonomy for one taxon.

    For the given taxon, it will fetch the chr band file (gzipped,
    tab-delimited; rows look like
    ``chr13 4500000 10000000 p12 stalk``).  We will not deal with the
    coordinate information with this parser.  Here, we only are
    concerned with building the partonomy band -> parent bands ->
    chromosome -> genome, written as OWL property restrictions between
    classes.  Rows whose first column is not a plain chromosome name
    (``chr`` + number/X/Y/Z/W/MT/M) are logged and skipped.

    :param limit: when not None, stop reading once more than ``limit``
        data rows have been counted.  Skipped (non-placed) rows are
        counted but never trigger the stop themselves.
    :param taxon: key into ``self.files``; also appended to
        ``NCBITaxon:`` to form the taxon curie
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    genome_id = geno.makeGenomeID(taxon_id)
    geno.addGenome(taxon_id, genome_label)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    # Compiled once here rather than being rebuilt for every row.
    # Only fully placed chromosomes match; unlocalized scaffolds
    # (chr10_KL568008v1_random) and unplaced scaffolds
    # (chrUn_AABR07022428v1) do not.
    placed_scaffold_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)$')
    stain_re = re.compile(r'g(neg|pos|var)')

    with gzip.open(myfile, 'rb') as band_file:
        for raw_line in band_file:
            line = raw_line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            (chrom, _start, _stop, band, rtype) = line.split('\t')
            line_counter += 1

            # let's skip over anything that isn't a placed_scaffold
            # at the class level
            if placed_scaffold_re.match(chrom) is None:
                logger.info("Skipping non-placed chromosome %s", chrom)
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band(region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                # a row without a band name describes the chromosome itself
                region_type_id = self.globaltt['chromosome']

            # add the staining intensity of the band
            if stain_re.match(rtype):
                if region_type_id in (
                        self.globaltt['chromosome_band'],
                        self.globaltt['chromosome_subband']):
                    stain_type = self.resolve(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            self.globaltt['has_sequence_attribute'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    logger.info(
                        "feature type %s != chr band", region_type_id)
            else:
                logger.warning('staining type not found: %s', rtype)

            # get the parent bands, and make them unique;
            # a reverse alphabetical sort orders them from the closest
            # (longest) parent to the most distant (p or q usually)
            parents = sorted(
                self.make_parent_bands(band, set()), reverse=True)

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            last_index = len(parents) - 1
            for i, raw_parent in enumerate(parents):
                parent_i = raw_parent.strip()
                if parent_i == "":
                    # a blank parent label gets neither a class nor
                    # any restrictions
                    continue

                pclassid = cclassid + parent_i  # class chr parts
                pclass_label = makeChromLabel(
                    chrom + parent_i, genome_label)
                rti = getChrPartTypeByNotation(parent_i, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # each parent is a subsequence of the next one in the
                # list; the last one (p or q usually) is attached to
                # the chromosome itself
                if i < last_index:
                    container_id = cclassid + parents[i + 1]
                else:
                    container_id = cclassid

                model.addOWLPropertyClassRestriction(
                    pclassid, self.globaltt['is subsequence of'],
                    container_id)
                model.addOWLPropertyClassRestriction(
                    container_id, self.globaltt['has subsequence'],
                    pclassid)

            # connect the band here to the first one in the parent list
            if parents:
                nearest_parent_id = cclassid + parents[0]
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    nearest_parent_id)
                model.addOWLPropertyClassRestriction(
                    nearest_parent_id, self.globaltt['has subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break

    # TODO figure out the staining intensities for the encompassing bands

    return
def _get_gene_info(self, limit):
    """
    Loop through the gzipped gene_info file and add each gene to the graph.

    For every row that passes the filter (gene id in ``self.gene_ids`` in
    test mode, otherwise taxon in ``self.tax_ids``) this:

    * records in ``self.class_or_indiv`` whether the gene is modelled as a
      class ('C') or an individual ('I', for type ``SO:0000110``);
    * adds the gene with its symbol as label (no label for 'NEWENTRY'),
      its mapped type and its description;
    * adds the name, synonyms and other designations as synonyms;
    * hands the xrefs to ``_add_gene_equivalencies``;
    * when the gene maps to a single chromosome (or to the X|Y pair),
      adds the chromosome class and makes the gene a subsequence of
      the band, or of the whole chromosome when ``map_loc`` is not a
      simple band.

    :param limit: outside test mode, rows past this count are only
        registered in ``self.class_or_indiv``; None means no limit
    :return: None

    """
    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph

    geno = Genotype(graph)
    model = Model(graph)

    # not unzipping the file
    logger.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", gene_info)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        model.addClassToGraph(tax_id, None)

    # a "regular" band location such as 4q21.1 or Xp22
    band_pattern = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')
    related_synonym = Assoc.annotation_properties['hasRelatedSynonym']

    with gzip.open(gene_info, 'rb') as f:
        row = f.readline().decode().strip().split('\t')
        logger.info("Header has %i columns", len(row))
        for line in f:
            line = line.decode().strip()
            # skip comments; also skip blank lines, which would
            # otherwise fail the 16-column unpacking below
            if not line or line.startswith('#'):
                continue
            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations,
             modification_date, feature_type) = line.split('\t')

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue

            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self.map_type_of_gene(gtype.strip())

            label = None if symbol == 'NEWENTRY' else symbol

            # sequence feature, not a gene
            if gene_type_id == 'SO:0000110':
                self.class_or_indiv[gene_id] = 'I'
            else:
                self.class_or_indiv[gene_id] = 'C'

            # past the limit we keep reading (rather than break) so that
            # class_or_indiv is still filled in for every remaining gene
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                continue

            if self.class_or_indiv[gene_id] == 'C':
                model.addClassToGraph(gene_id, label, gene_type_id, desc)
                # NCBI will be the default leader,
                # so we will not add the leader designation here.
            else:
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)
                # in this case, they aren't genes.
                # so we want someone else to be the leader.

            if name != '-':
                model.addSynonym(gene_id, name)
            if synonyms.strip() != '-':
                for syn in synonyms.split('|'):
                    model.addSynonym(gene_id, syn.strip(), related_synonym)
            if other_designations.strip() != '-':
                for syn in other_designations.split('|'):
                    model.addSynonym(gene_id, syn.strip(), related_synonym)
            if xrefs.strip() != '-':
                self._add_gene_equivalencies(xrefs, gene_id, tax_num)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # no idea why there's two bands listed - possibly 2 assemblies
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # this is of "unknown" type == susceptibility
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            # unlocated scaffold
            # 101928066       LOC101928066    1|Un    -\
            # mouse --> 2C3
            # 11435   Chrna1  2       2 C3|2 43.76 cM
            # mouse --> 11B1.1
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # we will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ('X|Y', 'X; Y'):
                    # means that there's uncertainty in the mapping.
                    # so skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    # (log the chromosome column itself; the builtin
                    # ``chr`` function was being formatted here before)
                    logger.info(
                        '%s is non-uniquely mapped to %s.'
                        ' Skipping for now.',
                        gene_id, chrom)
                    continue

                if chrom == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing

                # do this in a loop to allow PAR regions like X|Y
                for c in chrom.split('|'):
                    # assume that the chromosome label is added elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    model.addSynonym(mychrom, mychrom_syn)
                    if band_pattern.match(map_loc) is not None:
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # TODO we probably need a different regex
                        # per organism
                        # the map_loc already has the chromosome in it;
                        # strip it first (escaped, so the chromosome
                        # name is always treated as literal text)
                        bid = re.sub(r'^' + re.escape(c), '', map_loc)
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # Assume its type will be added elsewhere
                        band = Feature(graph, maploc_id, None, None)
                        band.addFeatureToGraph()
                        # add the band as the containing feature
                        graph.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases: examples are:
                        # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                        # 12cen-q21,22q13.3|22q13.3
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        graph.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)

    return
def _get_chrbands(self, limit, taxon, genome_id=None):
    """
    Build the chromosome/band partonomy for one taxon from its band file.

    Coordinates in the file are ignored; only the chrom, band and rtype
    columns are read.  For each row on a placed chromosome
    (chr<number>, chrX, chrY, chrZ, chrW, chrMT, chrM) this adds the
    chromosome class as a member of the genome, adds the band as a class,
    attaches the staining attribute for gneg/gpos/gvar bands, and links
    the band and its parent bands with 'is subsequence of' /
    'has subsequence' restrictions up to the chromosome.
    Rows for any other chrom value (e.g. scaffolds) are skipped.

    :param limit: stop once more than this many lines have been read;
        None means no limit
    :param taxon: taxon number as a string; key into ``self.files``
    :param genome_id: identifier of the genome; made from the taxon
        when None
    :return: None

    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    if genome_id is None:
        genome_id = geno.makeGenomeID(taxon_id)  # makes a blank node always
    geno.addGenome(taxon_id, genome_label, genome_id)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    # NOTE
    # some less-finished genomes have placed and unplaced scaffolds
    # * Placed scaffolds:
    #       Scaffold has an oriented location within a chromosome.
    # * Unlocalized scaffolds:
    #       scaffold's chromosome is known,
    #       scaffold's position, orientation or both is not known.
    # * Unplaced scaffolds:
    #       it is not known which chromosome the scaffold belongs to.
    #   ex: unlocalized scaffold: chr10_KL568008v1_random
    #   ex: unplaced scaffold: chrUn_AABR07022428v1
    # only whole placed chromosomes are handled here
    placed_chrom_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)$')

    band_types = [
        self.globaltt['chromosome_band'],
        self.globaltt['chromosome_subband']]

    col = ['chrom', 'start', 'stop', 'band', 'rtype']
    chrom_col = col.index('chrom')
    band_col = col.index('band')
    rtype_col = col.index('rtype')

    with gzip.open(myfile, 'rb') as reader:
        for line in reader:
            line_counter += 1
            line = line.decode().strip()
            # skip comments; blank lines are skipped too, since
            # indexing an empty string would raise IndexError
            if not line or line.startswith('#'):
                continue
            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            chrom = row[chrom_col]
            band = row[band_col]
            rtype = row[rtype_col]

            # skip over anything that isn't a placed chromosome
            if placed_chrom_re.match(chrom) is None:
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band(region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band is not None and band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                region_type_id = self.globaltt['chromosome']

            # add the staining intensity of the band
            if re.match(r'g(neg|pos|var)', rtype):
                if region_type_id in band_types:
                    stain_type = self.resolve(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            self.globaltt['has_sequence_attribute'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    LOG.info(
                        "feature type '%s' is not chr band",
                        self.globaltcid[region_type_id])
            else:
                LOG.info('staining type not found for: %s', rtype)

            # get the parent bands, and make them unique
            parents = list(self.make_parent_bands(band, set()))
            # reverse alphabetical sort puts the most specific band first
            parents.sort(reverse=True)

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            last_idx = len(parents) - 1
            for idx, prnt in enumerate(parents):
                parent = prnt.strip()
                if parent == "":
                    continue
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes,
                # then the subbands are subsequences of the full band
                # add the subsequence stuff as restrictions
                if idx < last_idx:
                    # the next entry in the sorted list is the
                    # enclosing band of this one
                    pid = cclassid + parents[idx + 1]
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'], pid)
                    model.addOWLPropertyClassRestriction(
                        pid, self.globaltt['has subsequence'], pclassid)
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'],
                        cclassid)
                    model.addOWLPropertyClassRestriction(
                        cclassid, self.globaltt['has subsequence'],
                        pclassid)

            # connect the band here to the first one in the parent list
            if parents:
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    cclassid + parents[0])
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0], self.globaltt['has subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break