def parse(self, limit=None):
    """
    Parse the trait mappings and the per-organism QTL files.

    The trait-mapping file is processed first.  Then, for every organism in
    the hard-coded list, the genome is registered and:

    * if ``self.files`` has an ``<organism>_bp`` entry, the build abbreviation
      is extracted from the file name (``QTL_<build>.gff.txt.gz``), mapped to
      a build id and registered as a reference genome; when a build id was
      obtained, the genomic QTL locations in that file are processed;
    * if ``self.files`` has an ``<organism>_cm`` entry, the genetic QTL
      locations in that file are processed.

    When ``self.testOnly`` is set, ``self.testMode`` is switched on and the
    Genotype helper is bound to ``self.testgraph`` instead of ``self.graph``.
    Finally the bindings are loaded and the size of ``self.graph`` is logged.

    :param limit: forwarded unchanged to the per-file processors
        (None means no limit was requested)
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows of each file", str(limit))

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
        g = self.testgraph
    else:
        g = self.graph

    tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
    self._process_trait_mappings(tmap, limit)

    geno = Genotype(g)
    # organisms = ['chicken']
    organisms = [
        'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

    # the dots of the extension are escaped so that they only match a
    # literal '.', not an arbitrary character
    build_pattern = re.compile(r'QTL_([\w\.]+)\.gff\.txt\.gz')

    for o in organisms:
        tax_id = self._get_tax_by_common_name(o)
        geno.addGenome(tax_id, o)

        k = o + '_bp'
        if k in self.files:
            fname = self.files[k]['file']
            m = build_pattern.search(fname)
            if m is None:
                logger.error("Can't match a gff build")
            else:
                build = m.group(1)
                build_id = self._map_build_by_abbrev(build)
                logger.info("Build = %s", build_id)
                geno.addReferenceGenome(build_id, build, tax_id)
                # without a mapped build id there is nothing to anchor
                # the genomic coordinates to, so the file is skipped
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, fname)),
                        tax_id, build_id, build, limit)

        k = o + '_cm'
        if k in self.files:
            fname = self.files[k]['file']
            self._process_QTLs_genetic_location(
                '/'.join((self.rawdir, fname)), tax_id, o, limit)

    logger.info("Finished parsing")

    self.load_bindings()
    # NOTE(review): this always reports self.graph, even when the test graph
    # was the one handed to Genotype above -- confirm that is intended
    logger.info("Found %d nodes", len(self.graph))
def parse(self, limit=None):
    """
    Parse the trait mappings and the per-organism QTL files.

    The trait-mapping file is processed first.  Then, for every organism in
    the hard-coded list, the genome is registered and:

    * if ``self.files`` has an ``<organism>_bp`` entry, the build abbreviation
      is extracted from the file name (``QTL_<build>.gff.txt.gz``), mapped to
      a build id and registered as a reference genome; when a build id was
      obtained, the genomic QTL locations in that file are processed;
    * if ``self.files`` has an ``<organism>_cm`` entry, the genetic QTL
      locations in that file are processed.

    When ``self.testOnly`` is set, ``self.testMode`` is switched on and the
    Genotype helper is bound to ``self.testgraph`` instead of ``self.graph``.

    :param limit: forwarded unchanged to the per-file processors
        (None means no limit was requested)
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows of each file", str(limit))

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
        g = self.testgraph
    else:
        g = self.graph

    tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
    self._process_trait_mappings(tmap, limit)

    geno = Genotype(g)
    # organisms = ['chicken']
    organisms = [
        'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

    # the dots of the extension are escaped so that they only match a
    # literal '.', not an arbitrary character
    build_pattern = re.compile(r'QTL_([\w\.]+)\.gff\.txt\.gz')

    for o in organisms:
        tax_id = self._get_tax_by_common_name(o)
        geno.addGenome(tax_id, o)

        bp_key = o + '_bp'
        if bp_key in self.files:
            bp_file = self.files[bp_key]['file']
            m = build_pattern.search(bp_file)
            if m is None:
                logger.error("Can't match a gff build")
            else:
                build = m.group(1)
                build_id = self._map_build_by_abbrev(build)
                logger.info("Build = %s", build_id)
                geno.addReferenceGenome(build_id, build, tax_id)
                # without a mapped build id there is nothing to anchor
                # the genomic coordinates to, so the file is skipped
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, bp_file)),
                        tax_id, build_id, build, limit)

        cm_key = o + '_cm'
        if cm_key in self.files:
            cm_file = self.files[cm_key]['file']
            self._process_QTLs_genetic_location(
                '/'.join((self.rawdir, cm_file)), tax_id, o, limit)

    logger.info("Finished parsing")
def _process_all(self, limit):
    """
    Fetch and transform every non-replaced OMIM entry, then deprecate the
    replaced ones.

    The identifiers to fetch are the keys of ``self.omim_type`` minus the
    keys of ``self.omim_replaced``.  Fetching and per-entry transformation
    are delegated to ``self.process_entries`` with ``self._transform_entry``
    as the callback and ``{'all'}`` as the include set.

    According to the original notes for this method (the behaviour lives in
    the helpers, not here): the ids come from the omimTitles file, the OMIM
    API is queried in batches of 20 for json-formatted data, OMIM classes are
    created with label & definition, "removed" entries become deprecated
    classes, "moved" entries are deprecated with consider annotations, and
    additionally these are extracted:
        * phenotypicSeries ids as superclasses
        * equivalent ids for Orphanet and UMLS

    In test mode everything is written to ``self.testgraph``; the original
    notes say only the items in the test_ids are written in that case.

    :param limit: forwarded unchanged to ``self.process_entries``
    """
    omimids = list(self.omim_type.keys() - self.omim_replaced.keys())

    LOG.info(
        'Have %i omim numbers to fetch records from their API', len(omimids))
    LOG.info('Have %i omim types ', len(self.omim_type))

    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # the label is also the lookup key into the global translation table,
    # so it has to be spelled exactly
    tax_label = 'Homo sapiens'
    tax_id = self.globaltt[tax_label]

    # add genome and taxon
    geno.addGenome(tax_id, tax_label)
    model.addClassToGraph(tax_id, tax_label)

    includes = {'all'}

    self.process_entries(
        omimids, self._transform_entry, includes, graph, limit)

    # since we are not fetching obsolete records any more
    # add them all in here
    for omim_id, replacements in self.omim_replaced.items():
        model.addDeprecatedClass(
            'OMIM:' + omim_id, ['OMIM:' + o for o in replacements])
def _process_all(self, limit):
    """
    Fetch and transform every OMIM entry returned by ``_get_omim_ids``.

    The human genome and taxon are registered on the target graph, then the
    work is handed to ``self.process_entries`` with ``self._transform_entry``
    as the callback and ``{'all'}`` as the include set.

    According to the original notes for this method (the behaviour lives in
    the helpers, not here): the ids come from the omim.txt.Z file, the OMIM
    API is queried iteratively for json-formatted data, OMIM classes are
    created with label, definition and some synonyms, "removed" entries
    become deprecated classes, "moved" entries are deprecated with consider
    annotations, and additionally these are extracted:
        * phenotypicSeries ids as superclasses
        * equivalent ids for Orphanet and UMLS

    In test mode everything is written to ``self.testgraph``; the original
    notes say only the items in the test_ids are written in that case.

    :param limit: forwarded unchanged to ``self.process_entries``
    :return: None
    """
    # the set of omim identifiers to work through
    omim_numbers = self._get_omim_ids()

    target_graph = self.testgraph if self.testMode else self.graph
    genotype_helper = Genotype(target_graph)
    graph_model = Model(target_graph)

    human_taxon = 'NCBITaxon:9606'
    human_label = 'Human'

    # register genome and taxon; the taxon class gets no label here
    # because the label is added elsewhere
    genotype_helper.addGenome(human_taxon, human_label)
    graph_model.addClassToGraph(human_taxon, None)

    wanted_sections = {'all'}
    self.process_entries(
        omim_numbers, self._transform_entry, wanted_sections,
        target_graph, limit)

    return
def _get_chrbands(self, limit, taxon):
    """
    Build the chromosome/band partonomy for one taxon from its band file.

    The gzipped, tab-separated file named in ``self.files[taxon]['file']``
    is read row by row (chrom, start, stop, band, type).  The start/stop
    coordinates are read but not used: only the part-of hierarchy is built
    (genome -> chromosome -> parent bands -> band), together with the
    staining intensity of bands and sub-bands.  Rows whose first column is
    not a plain ``chr<N|X|Y|Z|W|MT|M>`` name are skipped.

    :param limit: stop after more than this many data rows were counted;
        None reads the whole file
    :param taxon: taxon number, used as key into ``self.files`` and as the
        local part of the ``NCBITaxon:`` id
    :return: None
    """
    rows_seen = 0
    band_path = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", band_path)
    geno = Genotype(self.graph)

    # the organism's genome is derived from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:'+taxon

    # the taxon becomes a class; its label is added elsewhere
    self.gu.addClassToGraph(self.graph, taxon_id, None)
    self.gu.addSynonym(self.graph, taxon_id, genome_label)

    self.gu.loadObjectProperties(self.graph, Feature.object_properties)

    genome_id = geno.makeGenomeID(taxon_id)
    geno.addGenome(taxon_id, genome_label)
    self.gu.addOWLPropertyClassRestriction(
        self.graph, genome_id, Genotype.object_properties['in_taxon'],
        taxon_id)

    with gzip.open(band_path, 'rb') as handle:
        for raw in handle:
            # skip comments
            line = raw.decode().strip()
            if re.match(r'^#', line):
                continue

            # chr13	4500000	10000000	p12	stalk
            (chrom, start, stop, band, rtype) = line.split('\t')
            rows_seen += 1

            # NOTE
            # some less-finished genomes have placed and unplaced scaffolds
            # * Placed scaffolds:
            #       Scaffold has an oriented location within a chromosome.
            # * Unlocalized scaffolds:
            #       scaffold's chromosome is known,
            #       scaffold's position, orientation or both is not known.
            # * Unplaced scaffolds:
            #       it is not known which chromosome the scaffold belongs to.
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
            # (patterns for unlocalized / unplaced scaffolds are not used)

            placed = re.match(placed_scaffold_pattern+r'$', chrom)
            if placed is None or len(placed.groups()) != 1:
                # anything that isn't a placed scaffold is skipped
                # at the class level
                logger.info("Skipping non-placed chromosome %s", chrom)
                continue

            # the chromosome class, with the taxon as the reference
            chrom_class = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class, as a member of the genome
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            self.gu.addOWLPropertyClassRestriction(
                self.graph, chrom_class,
                self.gu.object_properties['member_of'], genome_id)

            # add the band (region) as a class
            band_class = chrom_class+band
            band_label = makeChromLabel(chrom+band, genome_label)
            if band is None or band.strip() == '':
                # no band name: the row describes the chromosome itself
                region_type_id = Feature.types['chromosome']
            else:
                region_type_id = self.map_type_of_region(rtype)
                self.gu.addClassToGraph(
                    self.graph, band_class, band_label, region_type_id)

            # add the staining intensity of the band
            if not re.match(r'g(neg|pos|var)', rtype):
                logger.warning('staining type not found: %s', rtype)
            elif region_type_id not in (
                    Feature.types['chromosome_band'],
                    Feature.types['chromosome_subband']):
                # usually happens if it's a chromosome because
                # they don't actually have banding info
                logger.info("feature type %s != chr band", region_type_id)
            else:
                stain_type = Feature.types.get(rtype)
                if stain_type is not None:
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, band_class,
                        Feature.properties['has_staining_intensity'],
                        stain_type)

            # the unique parent bands; a reverse alphabetical sort orders
            # them from the smallest enclosing region to the biggest
            parents = sorted(
                self.make_parent_bands(band, set()), reverse=True)

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            last_index = len(parents) - 1
            for idx, notation in enumerate(parents):
                part_class = chrom_class+notation   # class chr parts
                part_label = makeChromLabel(chrom+notation, genome_label)
                part_type = getChrPartTypeByNotation(notation)

                self.gu.addClassToGraph(
                    self.graph, part_class, part_label, part_type)

                # each parent is a subsequence of the next bigger one;
                # the last one (p or q usually) hangs off the chromosome
                if idx < last_index:
                    container = chrom_class+parents[idx+1]
                else:
                    container = chrom_class
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, part_class,
                    Feature.object_properties['is_subsequence_of'],
                    container)
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, container,
                    Feature.object_properties['has_subsequence'],
                    part_class)

            # connect the band itself to the first one in the parent list
            if parents:
                nearest_parent = chrom_class+parents[0]
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, band_class,
                    Feature.object_properties['is_subsequence_of'],
                    nearest_parent)
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, nearest_parent,
                    Feature.object_properties['has_subsequence'],
                    band_class)

            if limit is not None and rows_seen > limit:
                break

    self.gu.loadAllProperties(self.graph)

    # TODO figure out the staining intensities for the encompassing bands

    return
def _get_variants(self, limit):
    """
    Loop through the gzipped, tab-separated variant_summary file and add
    each variant to the graph.

    For every data row this adds: the reference genome build, the chromosome
    (or, when no chromosome is given, the cytogenetic band) in that build,
    the variant itself as a feature typed by its allele type, its HGVS names
    as synonyms, dbSNP/dbVar ids as same-individuals, RCV accessions as
    xrefs, a variant locus linking the variant to its gene (when a gene id
    is given), one genotype-to-phenotype association per listed phenotype
    id, and OMIM allelic variant / HGMD ids as same-individuals.

    In test mode rows are written to ``self.testgraph`` and only kept when
    the gene id, the variant id or one of the phenotype ids is among the
    configured test ids; ``limit`` is ignored in that case.

    :param limit: stop after more than this many data rows were counted
        (only outside test mode); None reads the whole file
    :return: None
    :raises ValueError: when a row does not have the expected 29 columns
        (an error is logged first)
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    model = Model(g)
    geno = Genotype(g)
    # NOTE(review): the result is unused; the call is kept in case the
    # constructor has side effects on the graph -- confirm and remove
    Feature(g, None, None, None)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:' + tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # the test disease ids do not change per row, so stringify them once
    test_disease_ids = set()
    if self.testMode:
        test_disease_ids = {str(i) for i in self.disease_ids}

    expected_numcols = 29

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as tsv:
        for raw in tsv:
            # skip comments
            line = raw.decode().strip()
            if re.match(r'^#', line):
                continue

            # AlleleID               integer value as stored in the AlleleID field in ClinVar (//Measure/@ID in the XML)
            # Type                   character, the type of variation
            # Name                   character, the preferred name for the variation
            # GeneID                 integer, GeneID in NCBI's Gene database
            # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
            # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
            #                          for the mapping between the terms listed here and the integers in the .VCF files, see
            #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
            # RS# (dbSNP)            integer, rs# in dbSNP
            # nsv (dbVar)            character, the NSV identifier for the region in dbVar
            # RCVaccession           character, list of RCV accessions that report this variant
            # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
            # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
            # Origin                 character, list of all allelic origins for this variation
            # Assembly               character, name of the assembly on which locations are based
            # Chromosome             character, chromosomal location
            # Start                  integer, starting location, in pter->qter orientation
            # Stop                   integer, end location, in pter->qter orientation
            # Cytogenetic            character, ISCN band
            # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
            #                            and their relationship to the star graphics ClinVar displays on its web pages,
            #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
            # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
            # HGVS(p.)               character, RefSeq protein-based HGVS expression
            # NumberSubmitters       integer, number of submissions with this variant
            # LastEvaluated          datetime, the latest time any submitter reported clinical significance
            # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
            #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
            # OtherIDs               character, list of other identifiers or sources of information about this variant
            # VariantID              integer, the value used to build the URL for the current default report,
            #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
            #
            # a crude check that there's an expected number of cols.
            # if not, error out because something changed
            # (the unpacking below raises ValueError in that case).
            cols = line.split('\t')
            if len(cols) != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file " +
                    "(%d actual vs %d expected)",
                    len(cols), expected_numcols)

            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom, start,
             stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele, categories,
             ChromosomeAccession) = cols

            # ###set filter=None in init if you don't want to have a filter
            # if self.filter is not None:
            #    if ((self.filter == 'taxids' and\
            #            (int(tax_num) not in self.tax_ids)) or\
            #            (self.filter == 'geneids' and\
            #             (int(gene_num) not in self.gene_ids))):
            #        continue
            # #### end filter

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # keep the row only if its gene, its variant or one of its
                # phenotype ids is among the test ids
                if int(gene_num) not in self.gene_ids and \
                        int(variant_num) not in self.variant_ids and \
                        not test_disease_ids & set(pheno_list):
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)

            # both are reset for every row, so that a row without a
            # chromosome can never pick up the ids of a previous row
            bandinbuild_id = None
            chrinbuild_id = None
            if chrom == '':
                # check cytogenic location
                if cytogenetic_loc.strip() != '':
                    # use cytogenic location to get the apx location
                    # oddly, they still put an assembly number even when
                    # there's no numeric location
                    if not re.search(r'-', cytogenetic_loc):
                        # the band string itself is passed on, like the
                        # chromosome string in the branch below
                        band_id = makeChromID(
                            cytogenetic_loc, tax_num, 'CHR')
                        geno.addChromosomeInstance(
                            cytogenetic_loc, build_id, assembly, band_id)
                        bandinbuild_id = makeChromID(
                            cytogenetic_loc, assembly, 'MONARCH')
                    # else: can't deal with ranges yet
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(chrom, tax_num, 'CHR')
                geno.addChromosomeClass(chrom, tax_id, tax_label)
                geno.addChromosomeInstance(
                    chrom, build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(chrom, assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None
            # they use -1 to indicate unknown gene
            if gene_num not in ('-1', 'more than 10'):
                if re.match(r'^Gene:', gene_num):
                    # NOTE(review): gene_id stays None here, so such a
                    # row is treated as having no gene -- confirm intent
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', gene_num))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms
            # first, make the variant:
            feature = Feature(g, seqalt_id, allele_name, allele_type_id)

            # coordinates can only be placed when the chromosome is known
            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feature.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feature.addFeatureEndLocation(stop, chrinbuild_id)

            feature.addFeatureToGraph()
            feature.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                feature.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:' + dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(r';', rcv_nums):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_' + gene_num + '-' + variant_num
                if self.nobnodes:
                    vl_id = ':' + vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting; the parenthesised word is captured
                # so that it can actually be reported
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info("No gene listed for variant %d",
                                int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            # (pheno_list is empty when the column was '-')
            for phenotype in pheno_list:
                m = re.match(
                    r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                if m is not None and len(m.groups()) > 0:
                    phenotype = re.sub(
                        m.group(1), 'Orphanet:', phenotype.strip())
                elif re.match(r'ORPHA:\d+', phenotype):
                    phenotype = re.sub(
                        r'^ORPHA', 'Orphanet', phenotype.strip())
                elif re.match(r'Human Phenotype Ontology', phenotype):
                    phenotype = re.sub(
                        r'^Human Phenotype Ontology', '',
                        phenotype.strip())
                elif re.match(r'SNOMED CT:\s?', phenotype):
                    phenotype = re.sub(
                        r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                elif re.match(r'^Gene:', phenotype):
                    continue

                assoc = G2PAssoc(
                    g, self.name, seqalt_id, phenotype.strip())
                assoc.add_association_to_graph()

            if other_ids != '-':
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                # only OMIM allelic variants and HGMD ids are added; every
                # other prefix (dbVar duplicates, prefixes containing
                # whitespace, and otherwise clean prefixes) is skipped
                for xrefid in other_ids.split(','):
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:' + xrefid.split(':')[1]
                    elif prefix != 'HGMD':
                        # note that HGMD variants are in here as Xrefs
                        # because we can't resolve URIs for them
                        continue
                    model.addIndividualToGraph(xrefid, None)
                    model.addSameIndividual(seqalt_id, xrefid)

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")
def _get_chrbands(self, limit, taxon):
    """
    Read the chromosome band file of one taxon and add its chromosomes,
    scaffolds and bands, with their coordinates, to ``self.graph``.

    The gzipped, tab-separated file named in ``self.files[taxon]['file']``
    is read row by row (scaffold, start, stop, band, type).  All regions are
    first collected in a dictionary so that the extent of the encompassing
    (parent) bands and of the chromosome can be computed from their parts;
    afterwards every collected region is written to the graph as a feature
    located on its build-specific chromosome.

    :param limit: stop reading after more than this many data rows were
        counted; None reads the whole file.  With a limit, the computed
        extents only cover the rows that were read.
    :param taxon: taxon number, used as key into ``self.files`` and as the
        local part of the ``NCBITaxon:`` id
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:'+taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:'+build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # NOTE some less-finished genomes have
    # placed and unplaced scaffolds
    # * Placed scaffolds:
    #       the scaffolds have been placed within a chromosome.
    # * Unlocalized scaffolds:
    #       although the chromosome within which the scaffold occurs
    #       is known, the scaffold's position or orientation
    #       is not known.
    # * Unplaced scaffolds:
    #       it is not known which chromosome the scaffold belongs to
    #
    # find out if the thing is a full on chromosome, or a scaffold:
    # ex: unlocalized scaffold: chr10_KL568008v1_random
    # ex: unplaced scaffold: chrUn_AABR07022428v1
    # (the patterns do not depend on the row, so they are built once)
    placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    unlocalized_scaffold_pattern = placed_scaffold_pattern+r'_(\w+)_random'
    unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'

    # process the bands
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if re.match('^#', line):
                continue

            # chr13	4500000	10000000	p12	stalk
            (scaffold, start, stop, band_num, rtype) = line.split('\t')
            line_counter += 1

            chrom_num = None
            scaffold_num = None
            m = re.match(placed_scaffold_pattern+r'$', scaffold)
            if m is not None and len(m.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = m.group(1)
            else:
                # anything that isn't a placed_scaffold
                # gets no chromosome class of its own
                logger.info("Found non-placed chromosome %s", scaffold)

            if not m:
                m_chr_unloc = re.match(
                    unlocalized_scaffold_pattern, scaffold)
                m_chr_unplaced = re.match(
                    unplaced_scaffold_pattern, scaffold)
                if m_chr_unloc is not None and \
                        len(m_chr_unloc.groups()) == 2:
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
                elif m_chr_unplaced is not None and \
                        len(m_chr_unplaced.groups()) == 1:
                    scaffold_num = m_chr_unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id, self.files[taxon]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': Feature.types['chromosome']}

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': Feature.types['assembly_component'],
                    'synonym': scaffold}

            # rows without a band name have no parent bands
            parents = []
            if band_num is not None and band_num.strip() != '':
                # add the specific band
                band_key = chrom_num+band_num
                mybands[band_key] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': None}

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    mybands[band_key]['stain'] = Feature.types.get(rtype)

                # get the parent bands, and make them unique;
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents = sorted(
                    monochrom.make_parent_bands(band_num, set()),
                    reverse=True)

                if parents:
                    mybands[band_key]['parent'] = chrom_num+parents[0]

            # loop through the parents and add them to the hash,
            # in hierarchical order
            for i, parent_notation in enumerate(parents):
                rti = getChrPartTypeByNotation(parent_notation)

                pnum = chrom_num+parent_notation
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti}
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    b = mybands[pnum]
                    b['min'] = min(sta, sto, b['min'])
                    b['max'] = max(sta, sto, b['max'])

                    # also, set the max for the chrom
                    c = mybands[chrom_num]
                    c['max'] = max(sta, sto, c['max'])

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num+parents[i+1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num

            # same row cap as the other readers in this file use
            if limit is not None and line_counter > limit:
                break
    # end looping through file (the with-block closes the handle)

    # loop through the hash and add the bands to the graph
    for b, myband in mybands.items():
        band_class_id = makeChromID(b, taxon, 'CHR')
        band_class_label = makeChromLabel(b, genome_label)
        band_build_id = makeChromID(b, build_num, 'MONARCH')
        band_build_label = makeChromLabel(b, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')
        is_assembly_component = \
            myband['type'] == Feature.types['assembly_component']

        # if it's != part, then add the class
        if not is_assembly_component:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if is_assembly_component:
            if myband['parent'] is None:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
            else:
                # geno.addParts(band_build_id, chrom_in_build_id)
                parent_chrom_in_build = makeChromID(
                    myband['parent'], build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            # TODO 'has_staining_intensity' being dropped by MB
            bfeature.addFeatureProperty(
                Feature.properties['has_staining_intensity'],
                myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)
def _get_gene_info(self, limit):
    """
    Read the gzipped ``gene_info`` file and add its genes to the graph.

    Rows are kept when their GeneID is in ``self.gene_ids`` (test mode)
    or their tax_id is in ``self.tax_ids`` (normal mode).  For each kept
    row the gene is recorded in ``self.class_or_indiv`` as a class ('C')
    or, when its type resolves to a sequence feature, as an individual
    ('I').  It is then added to the graph with its label, description,
    synonyms and other designations; dbXrefs are handed to
    ``_add_gene_equivalencies``.  When the chromosome placement is
    unambiguous the gene is attached as a subsequence of the matching
    chromosome band (or of the chromosome when the map location is not
    a simple band), and finally tied to its taxon.

    :param limit: when not in test mode, rows read after this many lines
        are still recorded in ``self.class_or_indiv`` but are not added
        to the graph; ``None`` disables the cut-off.
    :return: None
    """
    src_key = 'gene_info'
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # not unzipping the file
    LOG.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", gene_info)

    LOG.info('Add taxa and genome classes for those in our filter')
    band_regex = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')
    for tax_num in self.tax_ids:
        tax_curie = ':'.join(('NCBITaxon', tax_num))
        # tax label can get added elsewhere
        geno.addGenome(tax_curie, tax_num)
        # label added elsewhere
        model.addClassToGraph(tax_curie, None)

    col = self.files[src_key]['columns']
    # Resolve the column positions once rather than for every field of
    # every row.
    idx = {
        name: col.index(name) for name in (
            'tax_id', 'GeneID', 'Symbol', 'Synonyms', 'dbXrefs',
            'chromosome', 'map_location', 'description', 'type_of_gene',
            'Full_name_from_nomenclature_authority', 'Other_designations')
    }

    LOG.info('Begin reading & parsing')
    with gzip.open(gene_info, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment char
        # The result of the header check is deliberately not acted upon.
        self.check_fileheader(col, row)

        for raw_line in tsv:
            line_counter += 1
            # Decode before testing for a comment: indexing the raw bytes
            # yields an int, which never equals '#'.
            line = raw_line.decode().strip()
            if not line or line.startswith('#'):  # skip blanks & comments
                continue
            row = line.split('\t')

            tax_num = row[idx['tax_id']]
            gene_num = row[idx['GeneID']]
            symbol = row[idx['Symbol']]
            synonyms = row[idx['Synonyms']].strip()
            dbxrefs = row[idx['dbXrefs']].strip()
            chrom = row[idx['chromosome']].strip()
            map_loc = row[idx['map_location']].strip()
            desc = row[idx['description']]
            gtype = row[idx['type_of_gene']].strip()
            name = row[idx['Full_name_from_nomenclature_authority']]
            other_designations = row[idx['Other_designations']].strip()

            if self.test_mode and int(gene_num) not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            tax_curie = ':'.join(('NCBITaxon', tax_num))
            gene_id = ':'.join(('NCBIGene', gene_num))
            gene_type_id = self.resolve(gtype)

            label = None if symbol == 'NEWENTRY' else symbol

            # sequence feature, not a gene
            if gene_type_id == self.globaltt['sequence_feature']:
                self.class_or_indiv[gene_id] = 'I'
            else:
                self.class_or_indiv[gene_id] = 'C'

            # Past the limit we keep reading (so class_or_indiv stays
            # complete) but stop adding to the graph.
            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                continue

            if self.class_or_indiv[gene_id] == 'C':
                model.addClassToGraph(gene_id, label, gene_type_id, desc)
                # NCBI will be the default leader (for non mods),
                # so we will not add the leader designation here.
            else:
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)
                # in this case, they aren't genes.
                # so we want someone else to be the leader

            if name != '-':
                model.addSynonym(gene_id, name)

            if synonyms != '-':
                for syn in synonyms.split('|'):
                    syn = syn.strip()
                    # unknown curies may occur here
                    if syn[:12] == 'AnimalQTLdb:' and \
                            tax_curie in self.informal_species:
                        syn = self.informal_species[tax_curie] + \
                            'QTL:' + syn[12:]
                        LOG.info('AnimalQTLdb: CHANGED to: %s', syn)
                    model.addSynonym(
                        gene_id, syn, model.globaltt['has_related_synonym'])

            if other_designations != '-':
                for syn in other_designations.split('|'):
                    model.addSynonym(
                        gene_id, syn.strip(),
                        model.globaltt['has_related_synonym'])

            if dbxrefs != '-':
                self._add_gene_equivalencies(dbxrefs, gene_id, tax_curie)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # no idea why there's two bands listed - possibly 2 assemblies
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # this is of "unknown" type == susceptibility
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            # unlocated scaffold
            # 101928066       LOC101928066    1|Un    -\
            # mouse --> 2C3
            # 11435   Chrna1  2       2 C3|2 43.76 cM
            # mouse --> 11B1.1
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # we will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # means that there's uncertainty in the mapping.
                    # so skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    LOG.info(
                        '%s is non-uniquely mapped to %s. Skipping for now.',
                        gene_id, chrom)
                    continue
                    # X|Y   Xp22.33;Yp11.3

                if chrom == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing

                # do this in a loop to allow PAR regions like X|Y
                for chromosome in chrom.split('|'):
                    # assume that the chromosome label is added elsewhere
                    geno.addChromosomeClass(chromosome, tax_curie, None)
                    mychrom = makeChromID(chromosome, tax_num, 'CHR')
                    # temporarily use taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(chromosome, tax_num)
                    model.addSynonym(mychrom, mychrom_syn)

                    if band_regex.match(map_loc) is not None:
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # TODO we probably need a different regex
                        # per organism
                        # the map location already starts with the
                        # chromosome, strip it first (escaped so the
                        # name is matched literally)
                        bid = re.sub(
                            r'^' + re.escape(chromosome), '', map_loc)
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(
                            chromosome + bid, tax_num, 'CHR')
                        # Assume it's type will be added elsewhere
                        band = Feature(graph, maploc_id, None, None)
                        band.addFeatureToGraph()
                        # add the band as the containing feature
                        graph.addTriple(
                            gene_id, self.globaltt['is subsequence of'],
                            maploc_id)
                    else:
                        # TODO handle these cases: examples are:
                        # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                        # 12cen-q21,22q13.3|22q13.3
                        LOG.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        graph.addTriple(
                            gene_id, self.globaltt['is subsequence of'],
                            mychrom)

            geno.addTaxon(tax_curie, gene_id)
def _get_gene_info(self, limit):
    """
    Read the gzipped ``gene_info`` file and add its genes to the graph.

    Rows are kept when their GeneID is in ``self.gene_ids`` (test mode)
    or their tax_id is in ``self.tax_ids`` (normal mode).  Each kept
    gene is recorded in ``self.class_or_indiv`` as a class ('C') or, for
    type ``SO:0000110`` (sequence feature), as an individual ('I'), and
    is added to the graph with its label, description, synonyms and
    other designations.  Cross references are handed to
    ``_add_gene_equivalencies``.  When the chromosome placement is
    unambiguous the gene is attached as a subsequence of the matching
    chromosome band (or of the chromosome when the map location is not
    a simple band), and finally tied to its taxon.

    :param limit: when not in test mode, kept rows beyond this count are
        still recorded in ``self.class_or_indiv`` but are not added to
        the graph; ``None`` disables the cut-off.
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)
    model = Model(g)

    # not unzipping the file
    logger.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", gene_info)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        model.addClassToGraph(tax_id, None)

    # compiled once; it was previously rebuilt for every chromosome
    band_regex = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

    with gzip.open(gene_info, 'rb') as f:
        row = f.readline().decode().strip().split('\t')
        logger.info("Header has %i columns", len(row))
        for line in f:
            # skip comments
            line = line.decode().strip()
            if line.startswith('#'):
                continue
            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations,
             modification_date, feature_type) = line.split('\t')

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self.map_type_of_gene(gtype.strip())

            label = None if symbol == 'NEWENTRY' else symbol

            # sequence feature, not a gene
            if gene_type_id == 'SO:0000110':
                self.class_or_indiv[gene_id] = 'I'
            else:
                self.class_or_indiv[gene_id] = 'C'

            # Past the limit we keep reading (so class_or_indiv stays
            # complete) but stop adding to the graph.
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                continue

            if self.class_or_indiv[gene_id] == 'C':
                model.addClassToGraph(gene_id, label, gene_type_id, desc)
                # NCBI will be the default leader,
                # so we will not add the leader designation here.
            else:
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)
                # in this case, they aren't genes.
                # so we want someone else to be the leader.

            if name != '-':
                model.addSynonym(gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])

            if xrefs.strip() != '-':
                self._add_gene_equivalencies(xrefs, gene_id, tax_num)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # no idea why there's two bands listed - possibly 2 assemblies
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # this is of "unknown" type == susceptibility
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            # unlocated scaffold
            # 101928066       LOC101928066    1|Un    -\
            # mouse --> 2C3
            # 11435   Chrna1  2       2 C3|2 43.76 cM
            # mouse --> 11B1.1
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # we will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # means that there's uncertainty in the mapping.
                    # so skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    # (log the chromosome value itself, not the
                    # builtin ``chr``)
                    logger.info(
                        '%s is non-uniquely mapped to %s.' +
                        ' Skipping for now.',
                        gene_id, chrom)
                    continue
                    # X|Y   Xp22.33;Yp11.3

                if chrom == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing

                # do this in a loop to allow PAR regions like X|Y
                for c in chrom.split('|'):
                    # assume that the chromosome label is added elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    model.addSynonym(mychrom, mychrom_syn)

                    if band_regex.match(map_loc) is not None:
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # TODO we probably need a different regex
                        # per organism
                        # the map location already starts with the
                        # chromosome, strip it first (escaped so the
                        # name is matched literally)
                        bid = re.sub(r'^' + re.escape(c), '', map_loc)
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # Assume it's type will be added elsewhere
                        band = Feature(g, maploc_id, None, None)
                        band.addFeatureToGraph()
                        # add the band as the containing feature
                        g.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases: examples are:
                        # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                        # 12cen-q21,22q13.3|22q13.3
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        g.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)
def _add_variant_cdna_variant_assoc_to_graph(self, row):
    """
    Add one variant row (structure 2) to the graph: its gene link, its
    genomic and cDNA coordinates, its reference/altered nucleotides and
    any COSMIC / dbSNP cross references.

    :param row: iterable of exactly 27 fields, see
        add_variant_info_to_graph() for the expected structure
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    genotype = Genotype(self.graph)
    as_literal = True

    # Only some of the 27 columns are used; the rest are discarded.
    (variant_key, _variant_label, _amino_acid_variant,
     _amino_acid_position, transcript_id, _transcript_priority,
     _protein_variant_type, _functional_impact, _stop_gain_loss,
     _transcript_gene, _protein_variant_source, variant_gene, bp_pos,
     _variant_cdna, cosmic_id, db_snp_id, genome_pos_start,
     genome_pos_end, ref_base, variant_base,
     _primary_transcript_exons, _primary_transcript_variant_sub_types,
     _variant_type, chromosome, genome_build, _build_version,
     _build_date) = row

    variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

    # Link the variant to its gene
    self._add_variant_gene_relationship(variant_id, variant_gene)

    # The transcript is the reference for the nucleotide position
    transcript_curie = self._make_transcript_curie(transcript_id)

    # Region identifiers for the cDNA and the chromosomal location
    cdna_region_id = ":_{0}Region".format(transcript_curie)
    chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(
        genome_build, chromosome, genome_pos_start, genome_pos_end)

    # Genome and reference build
    genome_label = "Human"
    build_id = "UCSC:{0}".format(genome_build)
    taxon_id = 'NCBITaxon:9606'
    genotype.addGenome(taxon_id, genome_label)
    genotype.addReferenceGenome(build_id, genome_build, taxon_id)

    # Chromosome: generic class id (in the taxon) and build instance id
    chrom_class_id = makeChromID(chromosome, '9606', 'CHR')
    chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH')
    genotype.addChromosomeClass(chromosome, taxon_id, 'Human')
    genotype.addChromosomeInstance(
        chromosome, build_id, genome_build, chrom_class_id)

    # Variant coordinates relative to the chromosome ...
    self._add_feature_with_coords(
        variant_id, genome_pos_start, genome_pos_end,
        chrom_instance_id, chrom_region_id)
    # ... and relative to the transcript
    self._add_feature_with_coords(
        variant_id, bp_pos, bp_pos, transcript_curie, cdna_region_id)

    # Reference and altered nucleotides, stored as literals
    for prop_key, base in (('reference_nucleotide', ref_base),
                           ('altered_nucleotide', variant_base)):
        graph_utils.addTriple(
            self.graph, variant_id, genotype.properties[prop_key],
            base, as_literal)

    # Here we update any internal cgd variant IDs with a cosmic ID or
    # dbSNP ID.  Alternatively we could do this using sql rather than a
    # sparql update, which may be safer.

    # SNP xrefs: COSMIC
    if cosmic_id is not None:
        cosmic_curie_list = []
        for c_id in cosmic_id.split(', '):
            cosmic_curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id)
            cosmic_curie_list.append(cosmic_curie)
            graph_utils.addIndividualToGraph(
                self.graph, cosmic_curie, c_id,
                genotype.genoparts['missense_variant'])

        # When there are several ids, declare each one the same
        # individual as the first
        first_curie = cosmic_curie_list[0]
        for other_curie in cosmic_curie_list[1:]:
            graph_utils.addSameIndividual(
                self.graph, first_curie, other_curie)

        self._replace_entity(
            self.graph, variant_id, first_curie, self.bindings)

    # SNP xrefs: dbSNP
    if db_snp_id is None:
        return

    db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id)
    graph_utils.addIndividualToGraph(
        self.graph, db_snp_curie, db_snp_id,
        genotype.genoparts['missense_variant'])

    if cosmic_id is None:
        self._replace_entity(
            self.graph, variant_id, db_snp_curie, self.bindings)
        return

    # Both kinds of id are present: equate every COSMIC id with dbSNP
    for c_id in cosmic_id.split(', '):
        graph_utils.addSameIndividual(
            self.graph,
            re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id),
            db_snp_curie)
    return
def _get_chrbands(self, limit, taxon):
    """
    Read the chr band file of the given taxon and build the class-level
    partonomy (genome -> chromosome -> band -> sub-band) in the graph.
    Coordinate information in the file is not used here.

    :param limit: stop after more than this many data lines have been
        processed; ``None`` reads the whole file
    :param taxon: taxon number, used as the key into ``self.files`` and
        to build the ``NCBITaxon:`` identifier
    :return: None
    """
    model = Model(self.graph)
    rows_seen = 0
    band_file = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", band_file)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # the taxon is added as a class; its label is added elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    genome_id = geno.makeGenomeID(taxon_id)
    geno.addGenome(taxon_id, genome_label)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    # NOTE
    # some less-finished genomes have placed and unplaced scaffolds
    # * Placed scaffolds:
    #    Scaffold has an oriented location within a chromosome.
    # * Unlocalized scaffolds:
    #     scaffold 's chromosome is known,
    #     scaffold's position, orientation or both is not known.
    # * Unplaced scaffolds:
    #   it is not known which chromosome the scaffold belongs to.
    # ex: unlocalized scaffold: chr10_KL568008v1_random
    # ex: unplaced scaffold: chrUn_AABR07022428v1
    placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
    placed_only = re.compile(placed_scaffold_pattern + r'$')

    with gzip.open(band_file, 'rb') as handle:
        for raw in handle:
            text = raw.decode().strip()
            # skip comments
            if re.match(r'^#', text):
                continue

            # chr13	4500000	10000000	p12	stalk
            (chrom, start, stop, band, rtype) = text.split('\t')
            rows_seen += 1

            # anything that is not a placed chromosome is skipped
            # at the class level
            placed = placed_only.match(chrom)
            if placed is None or len(placed.groups()) != 1:
                logger.info("Skipping non-placed chromosome %s", chrom)
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band(region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band is None or band.strip() == '':
                region_type_id = self.globaltt['chromosome']
            else:
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)

            # add the staining intensity of the band
            if not re.match(r'g(neg|pos|var)', rtype):
                logger.warning('staining type not found: %s', rtype)
            elif region_type_id in (
                    self.globaltt['chromosome_band'],
                    self.globaltt['chromosome_subband']):
                stain_type = self.resolve(rtype)
                if stain_type is not None:
                    model.addOWLPropertyClassRestriction(
                        maplocclass_id,
                        self.globaltt['has_sequence_attribute'],
                        self.resolve(rtype))
            else:
                # usually happens if it's a chromosome because
                # they don't actually have banding info
                logger.info("feature type %s != chr band", region_type_id)

            # get the parent bands, and make them unique;
            # the reversed alphabetical sort orders them from the
            # smallest to the biggest
            parents = sorted(
                self.make_parent_bands(band, set()), reverse=True)

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            last_index = len(parents) - 1
            for i, raw_parent in enumerate(parents):
                parent_i = raw_parent.strip()
                if parent_i is None or parent_i == "":
                    continue

                pclassid = cclassid + parent_i  # class chr parts
                pclass_label = makeChromLabel(
                    chrom + parent_i, genome_label)
                rti = getChrPartTypeByNotation(parent_i, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes the subbands are
                # subsequences of the full band; the last parent
                # (p or q usually) is attached to the chromosome
                if i < last_index:
                    container_id = cclassid + parents[i + 1]
                else:
                    container_id = cclassid

                # add the subsequence stuff as restrictions
                model.addOWLPropertyClassRestriction(
                    pclassid, self.globaltt['is subsequence of'],
                    container_id)
                model.addOWLPropertyClassRestriction(
                    container_id, self.globaltt['has subsequence'],
                    pclassid)

            # connect the band here to the first one in the parent list
            if len(parents) > 0:
                nearest_parent = cclassid + parents[0]
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    nearest_parent)
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0],
                    self.globaltt['has subsequence'], maplocclass_id)

            if limit is not None and rows_seen > limit:
                break

    # TODO figure out the staining intensities for the encompassing bands

    return
def _get_chrbands(self, limit, taxon):
    """
    Read the chr band file of the given taxon and add the build-specific
    chromosomes, scaffolds and bands to the graph as located features.

    The file is read in full first.  Every chromosome, scaffold, band
    and parent (grouping) band is collected in a dict keyed by its
    number, and grouping bands have their min/max extent widened as
    their children are seen.  The collected entries are then written to
    the graph.

    :param limit: currently unused (kept for interface compatibility)
    :param taxon: taxon number, used as the key into ``self.files`` and
        to build the ``NCBITaxon:`` identifier
    :return: None
    """
    model = Model(self.graph)
    # TODO PYLINT figure out what limit was for and why it is unused
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # NOTE some less-finished genomes have
    # placed and unplaced scaffolds
    # * Placed scaffolds:
    #       the scaffolds have been placed within a chromosome.
    # * Unlocalized scaffolds:
    #   although the chromosome within which the scaffold occurs
    #   is known, the scaffold's position or orientation
    #   is not known.
    # * Unplaced scaffolds:
    #   it is not known which chromosome the scaffold belongs to
    #
    # ex: unlocalized scaffold: chr10_KL568008v1_random
    # ex: unplaced scaffold: chrUn_AABR07022428v1
    # (compiled once here rather than rebuilt for every line)
    placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    placed_regex = re.compile(placed_scaffold_pattern + r'$')
    unlocalized_regex = re.compile(
        placed_scaffold_pattern + r'_(\w+)_random')
    unplaced_regex = re.compile(r'chr(Un(?:_\w+)?)')

    # process the bands
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            (scaffold, start, stop, band_num, rtype) = line.split('\t')

            # find out if the thing is a full on chromosome,
            # or a scaffold
            mch = placed_regex.match(scaffold)
            if mch is not None and len(mch.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # not a placed chromosome at the class level
                logger.info("Found non-placed chromosome %s", scaffold)
                chrom_num = None

            m_chr_unloc = unlocalized_regex.match(scaffold)
            m_chr_unplaced = unplaced_regex.match(scaffold)

            scaffold_num = None
            if mch:
                pass
            elif m_chr_unloc is not None and \
                    len(m_chr_unloc.groups()) == 2:
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
            elif m_chr_unplaced is not None and \
                    len(m_chr_unplaced.groups()) == 1:
                scaffold_num = m_chr_unplaced.group(1)
            else:
                logger.error(
                    "There's a chr pattern that we aren't matching: %s",
                    scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id,
                    self.files[taxon]['genome_label'])

                # then, add the chromosome instance
                # (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this
                # build; the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome']
                    }

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold
                }

            if band_num is not None and band_num.strip() != '':
                # add the specific band
                band_key = chrom_num + band_num
                mybands[band_key] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': None
                }

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    mybands[band_key]['stain'] = self.resolve(rtype)

                # get the parent bands, and make them unique;
                # alphabetical sort would put them in smallest to
                # biggest, so we reverse
                parents = sorted(
                    monochrom.make_parent_bands(band_num, set()),
                    reverse=True)

                if parents:
                    mybands[band_key]['parent'] = chrom_num + parents[0]
            else:
                # no band on this line, hence no parent bands either
                parents = []

            # loop through the parents and add them to the hash,
            # in hierarchical order
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent)

                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti
                    }
                else:
                    # band already in the hash means it's a grouping
                    # band; need to update the min/max coords
                    grouping = mybands[pnum]
                    grouping['min'] = min(sta, sto, grouping['min'])
                    grouping['max'] = max(sta, sto, grouping['max'])

                    # also, set the max for the chrom
                    whole_chrom = mybands[chrom_num]
                    whole_chrom['max'] = max(
                        sta, sto, whole_chrom['max'])

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # end looping through file (the ``with`` block closes the file)

    # loop through the hash and add the bands to the graph
    for band_key, myband in mybands.items():
        band_class_id = makeChromID(band_key, taxon, 'CHR')
        band_class_label = makeChromLabel(band_key, genome_label)
        band_build_id = makeChromID(band_key, build_num, 'MONARCH')
        band_build_label = makeChromLabel(band_key, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')

        # if it's != part, then add the class
        if myband['type'] != self.globaltt['assembly_component']:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label,
                band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label,
                myband['type'])
        # only scaffold entries carry a 'synonym' key
        if 'synonym' in myband:
            model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == self.globaltt['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == self.globaltt['assembly_component']:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)

    return
def _process_all(self, limit):
    """
    Fetch OMIM entries from the OMIM API in batches and add them to the graph.

    This takes the list of omim identifiers from the omim.txt.Z file,
    and iteratively queries the omim api for the json-formatted data.
    This will create OMIM classes, with the label, definition,
    and some synonyms.
    If an entry is "removed", it is added as a deprecated class.
    If an entry is "moved", it is deprecated and consider annotations
    are added.

    Additionally, we extract:
    *phenotypicSeries ids as superclasses
    *equivalent ids for Orphanet and UMLS

    If set to testMode, it will write only those items in the test_ids
    to the testgraph.

    :param limit: maximum number of OMIM ids to request when not in
        testMode; None means every id returned by _get_omim_ids()
    :return: None
    """
    omimids = self._get_omim_ids()  # store the set of omim identifiers

    # you will need to add the API key into the conf.json file, like:
    # keys : { 'omim' : '<your api key here>' }
    omimparams = {
        'format': 'json',
        'include': 'all',
        'apiKey': config.get_config()['keys']['omim'],
    }
    # http://api.omim.org/api/entry?mimNumber=100100&include=all

    g = self.testgraph if self.testMode else self.graph

    gu = GraphUtils(curie_map.get())

    it = 0  # index of the first id of the current batch

    # note that you can only do request batches of 20
    # see info about "Limits" at http://omim.org/help/api
    groupsize = 20
    # minimum spacing between requests: no more than 4 requests per second
    min_request_interval = 0.25

    if not self.testMode and limit is not None:
        # just in case the limit is larger than the number of records
        record_count = min(limit, len(omimids))
    else:
        record_count = len(omimids)

    entry_url = '/'.join((self.OMIM_API, 'entry'))

    # TODO write the json to local files - make the assumption that
    # downloads within 24 hrs are the same
    # now, loop through the omim numbers and pull the records as json docs
    while it < record_count:
        end = min(record_count, it + groupsize)
        batch = omimids[it:end]

        if self.testMode:
            # only request the ids of this batch that are also test ids
            intersect = list({str(i) for i in self.test_ids} & set(batch))
            if len(intersect) > 0:
                logger.info("found test ids: %s", intersect)
                omimparams.update({'mimNumber': ','.join(intersect)})
            else:
                it += groupsize
                continue
        else:
            omimparams.update({'mimNumber': ','.join(batch)})

        url = entry_url + '?%s' % urllib.parse.urlencode(omimparams)
        logger.info('fetching: %s', url)

        # close the HTTP response as soon as the payload has been read
        with urllib.request.urlopen(url) as d:
            resp = d.read().decode()
        request_time = datetime.now()
        it += groupsize

        myjson = json.loads(resp)
        entries = myjson['omim']['entryList']

        geno = Genotype(g)

        # add genome and taxon
        tax_num = '9606'
        tax_id = 'NCBITaxon:9606'
        tax_label = 'Human'

        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        gu.addClassToGraph(g, tax_id, None)  # label added elsewhere

        for e in entries:
            entry = e['entry']

            # get the numbers, labels, and descriptions
            omimnum = entry['mimNumber']
            titles = entry['titles']
            label = titles['preferredTitle']

            other_labels = []
            if 'alternativeTitles' in titles:
                other_labels += self._get_alt_labels(
                    titles['alternativeTitles'])
            if 'includedTitles' in titles:
                other_labels += self._get_alt_labels(
                    titles['includedTitles'])

            # add synonyms of alternate labels
            # preferredTitle": "PFEIFFER SYNDROME",
            # "alternativeTitles":
            #   "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
            # "includedTitles":
            #   "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

            # remove the abbreviation (comes after the ;) from the
            # preferredTitle, and add it as a synonym
            abbrev = None
            label_parts = label.split(';')
            if len(label_parts) > 1:
                abbrev = label_parts[1].strip()
            newlabel = self._cleanup_label(label)

            description = self._get_description(entry)
            omimid = 'OMIM:' + str(omimnum)

            if entry['status'] == 'removed':
                gu.addDeprecatedClass(g, omimid)
                continue

            omimtype = self._get_omimtype(entry)
            # this uses our cleaned-up label
            gu.addClassToGraph(g, omimid, newlabel, omimtype)

            # add the original OMIM label as a synonym
            gu.addSynonym(g, omimid, label)

            # add the alternate labels and includes as synonyms
            for other_label in other_labels:
                gu.addSynonym(g, omimid, other_label)

            # for OMIM, we're adding the description as a definition
            gu.addDefinition(g, omimid, description)
            if abbrev is not None:
                gu.addSynonym(g, omimid, abbrev)

            # if this is a genetic locus (but not sequenced)
            # then add the chrom loc info
            if omimtype == Genotype.genoparts['biological_region']:
                if 'geneMapExists' in entry and entry['geneMapExists']:
                    genemap = entry['geneMap']
                    if 'cytoLocation' in genemap:
                        # parse the cytoloc. add this omim thing as a
                        # subsequence of the cytofeature, e.g. 18p11.3-p11.2
                        # for now, just take the first one
                        # FIXME add the other end of the range,
                        # but not sure how to do that
                        # not sure if saying subsequence of feature
                        # is the right relationship
                        cytoloc = genemap['cytoLocation'].split('-')[0]
                        f = Feature(omimid, None, None)
                        if 'chromosome' in genemap:
                            geno.addChromosomeClass(
                                str(genemap['chromosome']), tax_id, tax_label)
                            loc = makeChromID(cytoloc, tax_num, 'CHR')
                            # this is the chr band
                            gu.addClassToGraph(g, loc, cytoloc)
                            f.addSubsequenceOfFeature(g, loc)
                            f.addFeatureToGraph(g)

            # check if moved, if so, make it deprecated and
            # replaced/consider class to the other thing(s)
            # some entries have been moved to multiple other entries and
            # use the joining raw word "and"
            # 612479 is movedto:  "603075 and 603029"  OR
            # others use a comma-delimited list, like:
            # 610402 is movedto: "609122,300870"
            if entry['status'] == 'moved':
                moved_to = str(entry['movedTo'])
                if 'and' in moved_to:
                    newids = moved_to.split('and')
                else:
                    # a value without commas yields a one-element list
                    newids = moved_to.split(',')

                # cleanup whitespace and add OMIM prefix to numeric portion
                fixedids = ['OMIM:' + i.strip() for i in newids]

                gu.addDeprecatedClass(g, omimid, fixedids)

            self._get_phenotypicseries_parents(entry, g)
            self._get_mappedids(entry, g)
            self._get_pubs(entry, g)

            self._get_process_allelic_variants(entry, g)

        # end iterating over batch of entries

        # can't have more than 4 req per sec,
        # so wait the remaining time, if necessary
        dt = datetime.now() - request_time
        rem = min_request_interval - dt.total_seconds()
        if rem > 0:
            # rem is already in seconds, which is the unit time.sleep expects
            logger.info("waiting %.3f sec", rem)
            time.sleep(rem)

    gu.loadAllProperties(g)

    return
def _get_chrbands(self, limit, src_key, genome_id):
    """
    Parse one gzipped chromosome-band file and add its chromosomes,
    scaffolds and bands (with coordinates in the build) to self.graph.

    The file is read once to collect every band, and the min/max extent of
    the parent bands that enclose it, into a dict; that dict is then walked
    to emit the features.

    :param limit: maximum number of lines to read; None means no limit
    :param src_key: key into self.files for the source; it is also used
        as the taxon number when building 'NCBITaxon:' curies
    :param genome_id: identifier handed to Genotype.addGenome
    :return: None
    """
    tax_num = src_key
    if limit is None:
        limit = sys.maxsize   # practical limit anyway
    model = Model(self.graph)
    line_num = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[src_key]['genome_label']
    taxon_curie = 'NCBITaxon:' + tax_num
    species_name = self.globaltcid[taxon_curie]  # for logging

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_curie, None)
    model.addSynonym(taxon_curie, genome_label)

    geno.addGenome(taxon_curie, genome_label, genome_id)

    # add the build and the taxon it's in
    build_num = self.files[src_key]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_curie)

    # cat (at least) also has chr[BDAECF]... hex? must be a back cat.
    if tax_num == self.localtt['Felis catus']:
        placed_scaffold_regex = re.compile(
            r'(chr(?:[BDAECF]\d+|X|Y|Z|W|M|))$')
    else:
        placed_scaffold_regex = re.compile(r'(chr(?:\d+|X|Y|Z|W|M))$')

    # NOTE(review): this pattern starts with '_' and has one group, but it
    # is applied below with .match() (anchored at the start of a name such
    # as chr10_KL568008v1_random) and tested for two groups, so the
    # unlocalized branch can never be taken as written -- confirm intent.
    unlocalized_scaffold_regex = re.compile(r'_(\w+)_random')
    unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')

    known_stains = [
        'gneg', 'gpos25', 'gpos33', 'gpos50', 'gpos66', 'gpos75',
        'gpos100', 'acen', 'gvar', 'stalk']

    # process the bands
    col = self.files[src_key]['columns']
    # resolve the column positions once instead of on every line
    chrom_idx = col.index('chrom')
    start_idx = col.index('chromStart')
    stop_idx = col.index('chromEnd')
    name_idx = col.index('name')
    stain_idx = col.index('gieStain')

    with gzip.open(myfile, 'rb') as binreader:
        for line in binreader:
            line_num += 1
            if line_num > limit:
                # nothing past the limit is used, so stop decompressing
                break
            line = line.decode().strip()
            # skip blank lines and comments
            if not line or line[0] == '#':
                continue

            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            scaffold = row[chrom_idx].strip()
            start = row[start_idx]
            stop = row[stop_idx]
            band_num = row[name_idx].strip()
            rtype = row[stain_idx]

            # NOTE some less-finished genomes have
            # placed and unplaced scaffolds
            # * Placed scaffolds:
            #       the scaffolds have been placed within a chromosome.
            # * Unlocalized scaffolds:
            #   although the chromosome within which the scaffold occurs
            #   is known, the scaffold's position or orientation
            #   is not known.
            # * Unplaced scaffolds:
            #   it is not known which chromosome the scaffold belongs to
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1

            mch = placed_scaffold_regex.match(scaffold)
            if mch is not None and len(mch.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # anything that isn't a placed_scaffold has no chromosome
                # at the class level
                chrom_num = None

            m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
            m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)

            scaffold_num = None
            if mch:
                pass
            elif m_chr_unloc is not None and len(
                    m_chr_unloc.groups()) == 2:
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
            elif m_chr_unplaced is not None and len(
                    m_chr_unplaced.groups()) == 1:
                scaffold_num = m_chr_unplaced.group(1)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, tax_num, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_curie,
                    self.files[src_key]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome']
                    }
            elif scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold
                }
            else:
                LOG.info(
                    '%s line %i DROPPED chromosome/scaffold %s',
                    species_name, line_num, scaffold)

            parents = list()

            # see it new types have showed up
            if rtype is not None and rtype not in known_stains:
                LOG.info(
                    'Unknown gieStain type "%s" in %s at %i',
                    rtype, src_key, line_num)
                self.globaltt[rtype]  # blow up

            if rtype == 'acen':  # hacky, revisit if ontology improves
                rtype = self.localtt[rtype]

            if band_num is not None and band_num != '' and \
                    rtype is not None and rtype != '':
                # add the specific band
                band_key = chrom_num + band_num
                mybands[band_key] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': self.globaltt[rtype],
                }

                # get the parent bands, and make them unique
                parents = list(monochrom.make_parent_bands(
                    band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)

                if len(parents) > 0:
                    mybands[band_key]['parent'] = chrom_num + parents[0]
                # else: band has no parents

            # loop through the parents and add them to the dict
            # add the parents to the graph, in hierarchical order
            sta = int(start)
            sto = int(stop)
            last_parent_idx = len(parents) - 1
            for i, parent_band in enumerate(parents):
                rti = getChrPartTypeByNotation(parent_band, self.graph)

                # string concatenation, so pnum is never None
                pnum = chrom_num + parent_band

                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    bnd = mybands[pnum]
                    bnd['min'] = min(sta, sto, bnd['min'])
                    bnd['max'] = max(sta, sto, bnd['max'])

                    # also, set the max for the chrom
                    chrom = mybands.get(chrom_num)
                    chrom['max'] = max(sta, sto, chrom['max'])

                # add the parent relationships to each
                if i < last_parent_idx:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # the with-statement has closed the file; end looping through file

    # loop through the hash and add the bands to the graph
    for bnd, myband in mybands.items():
        band_class_id = makeChromID(bnd, tax_num, 'CHR')
        band_class_label = makeChromLabel(bnd, genome_label)
        band_build_id = makeChromID(bnd, build_num, 'MONARCH')
        band_build_label = makeChromLabel(bnd, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')

        # if it's != part, then add the class
        if myband['type'] != self.globaltt['assembly_component']:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == self.globaltt['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == self.globaltt['assembly_component']:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)
def _get_chrbands(self, limit, taxon, genome_id=None):
    """
    For the given taxon, it will fetch the chr band file.
    We will not deal with the coordinate information with this parser.
    Here, we only are concerned with building the partonomy.

    Only rows whose chromosome name matches the placed-scaffold pattern
    are used; every other row is skipped.

    :param limit: stop once more than this many lines have been read;
        None means read the whole file
    :param taxon: taxon number; key into self.files and suffix of the
        'NCBITaxon:' curie
    :param genome_id: genome identifier; when None one is made with
        Genotype.makeGenomeID
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    if genome_id is None:
        # makes a blank node always
        genome_id = geno.makeGenomeID(taxon_id)
    geno.addGenome(taxon_id, genome_label, genome_id)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
    # currently unused patterns
    # unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
    # unplaced_scaffold_pattern = r'chrUn_(\w+)'

    # compile once; the pattern does not change between lines
    placed_scaffold_regex = re.compile(placed_scaffold_pattern + r'$')
    stain_regex = re.compile(r'g(neg|pos|var)')

    col = ['chrom', 'start', 'stop', 'band', 'rtype']
    chrom_idx = col.index('chrom')
    band_idx = col.index('band')
    rtype_idx = col.index('rtype')

    with gzip.open(myfile, 'rb') as reader:
        for line in reader:
            line_counter += 1
            line = line.decode().strip()
            # skip blank lines and comments
            if not line or line[0] == '#':
                continue

            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            chrom = row[chrom_idx]
            band = row[band_idx]
            rtype = row[rtype_idx]

            # NOTE
            # some less-finished genomes have placed and unplaced scaffolds
            # * Placed scaffolds:
            #    Scaffold has an oriented location within a chromosome.
            # * Unlocalized scaffolds:
            #     scaffold 's chromosome  is known,
            #     scaffold's position, orientation or both is not known.
            # *Unplaced scaffolds:
            #   it is not known which chromosome the scaffold belongs to.
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            mch = placed_scaffold_regex.match(chrom)
            if mch is None or len(mch.groups()) != 1:
                # let's skip over anything that isn't a placed_scaffold
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band(region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band is not None and band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                region_type_id = self.globaltt['chromosome']

            # add the staining intensity of the band
            if stain_regex.match(rtype):
                if region_type_id in [
                        self.globaltt['chromosome_band'],
                        self.globaltt['chromosome_subband']]:
                    stain_type = self.resolve(rtype)
                    if stain_type is not None:
                        # reuse the value resolved above
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            self.globaltt['has_sequence_attribute'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome (SO:000340)
                    # because they don't actually have banding info
                    LOG.info(
                        "feature type '%s' is not chr band",
                        self.globaltcid[region_type_id])
            else:
                LOG.info('staining type not found for: %s', rtype)

            # get the parent bands, and make them unique
            parents = list(self.make_parent_bands(band, set()))
            # alphabetical sort will put them in smallest to biggest
            parents.sort(reverse=True)

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            for prnt in parents:
                parent = prnt.strip()
                if parent == "":
                    continue
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes,
                # then the subbands are subsequences of the full band
                # add the subsequence stuff as restrictions
                if prnt != parents[-1]:
                    # the next entry in the sorted list is the enclosing band
                    grandparent = 1 + parents.index(prnt)
                    pid = cclassid + parents[grandparent]   # the instance
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'], pid)
                    model.addOWLPropertyClassRestriction(
                        pid, self.globaltt['has subsequence'], pclassid)
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'],
                        cclassid)
                    model.addOWLPropertyClassRestriction(
                        cclassid, self.globaltt['has subsequence'],
                        pclassid)

            # connect the band here to the first one in the parent list
            if len(parents) > 0:
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    cclassid + parents[0])
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0], self.globaltt['has subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break
def _get_gene_info(self, limit):
    """
    Currently loops through the gene_info file and creates the genes
    as classes, typed with SO.  It will add their label, any alternate
    labels as synonyms, alternate ids as equivalent classes.
    HPRDs get added as protein products.
    The chromosome and chr band get added as blank node regions,
    and the gene is faldo:located on the chr band.

    Rows are expected to have exactly 15 tab-separated columns; any other
    count raises ValueError from the tuple unpacking.

    :param limit: when not in testMode, stop after more than this many
        accepted rows; None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())

    g = self.testgraph if self.testMode else self.graph

    geno = Genotype(g)

    # not unzipping the file
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", myfile)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        gu.addClassToGraph(g, tax_id, None)  # label added elsewhere

    # a "regular" band: chromosome, arm, optional band and sub-band
    band_regex = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if line.startswith('#'):
                continue
            # `chrom` (not `chr`) so the builtin is not shadowed
            (tax_num, gene_num, symbol, locustag,
             synonyms, xrefs, chrom, map_loc, desc,
             gtype, authority_symbol, name,
             nomenclature_status, other_designations,
             modification_date) = line.split('\t')

            # ## set filter=None in init if you don't want to have a filter
            # if self.filter is not None:
            #     if ((self.filter == 'taxids' and
            #          (int(tax_num) not in self.tax_ids)) or
            #             (self.filter == 'geneids' and
            #              (int(gene_num) not in self.gene_ids))):
            #         continue
            # ## end filter

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue

            if int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self._map_type_of_gene(gtype)

            label = None if symbol == 'NEWENTRY' else symbol

            # TODO might have to figure out if things aren't genes,
            # and make them individuals
            gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

            # we have to do special things here for genes,
            # because they're classes not individuals
            # f = Feature(gene_id,label,gene_type_id,desc)

            if name != '-':
                gu.addSynonym(g, gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    gu.addSynonym(
                        g, gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    gu.addSynonym(
                        g, gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])

            # deal with the xrefs
            # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
            if xrefs.strip() != '-':
                for r in xrefs.strip().split('|'):
                    fixedr = self._cleanup_id(r)
                    if fixedr is None or fixedr.strip() == '':
                        continue
                    if fixedr.startswith('HPRD'):
                        # proteins are not == genes.
                        gu.addTriple(
                            g, gene_id,
                            self.properties['has_gene_product'], fixedr)
                    elif fixedr.split(':')[0] not in [
                            'Vega', 'IMGT/GENE-DB']:
                        # skip some of these for now
                        gu.addEquivalentClass(g, gene_id, fixedr)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            #     no idea why there's two bands listed - possibly 2 assemblies
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            #     this is of "unknown" type == susceptibility
            # 101928066       LOC101928066    1|Un    -  # unlocated scaffold
            # 11435   Chrna1  2       2 C3|2 43.76 cM    # mouse --> 2C3
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM  # mouse --> 11B1.1
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # i will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            chrom = str(chrom)
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # this means that there's uncertainty in the mapping.
                    # skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    logger.info(
                        '%s is non-uniquely mapped to %s. Skipping for now.',
                        gene_id, chrom)
                    continue
                    # X|Y	Xp22.33;Yp11.3

                if chrom == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing
                # do this in a loop to allow PAR regions like X|Y
                for c in chrom.split('|'):
                    # assume that the chromosome label
                    # will get added elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use the taxnum
                    # for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    gu.addSynonym(g, mychrom, mychrom_syn)

                    band_match = band_regex.match(map_loc)
                    if band_match is not None and \
                            len(band_match.groups()) > 0:
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # not sure why this matches?
                        #   chrX|Y or 10090chr12|Un"
                        # TODO we probably need a different regex
                        # per organism
                        # the maploc_id already has the numeric chromosome
                        # in it, strip it first
                        bid = re.sub(r'^' + c, '', map_loc)
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # Assume it's type will be added elsewhere
                        band = Feature(maploc_id, None, None)
                        band.addFeatureToGraph(g)
                        # add the band as the containing feature
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases
                        # examples are: 15q11-q22, Xp21.2-p11.23,
                        # 15q22-qter, 10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1,
                        # 12cen-q21, 22q13.3|22q13.3
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
def _get_gene_info(self, limit):
    """
    Currently loops through the gene_info file and creates the genes
    as classes, typed with SO.  It will add their label, any alternate
    labels as synonyms, alternate ids as equivalent classes.
    HPRDs get added as protein products.
    The chromosome and chr band get added as blank node regions,
    and the gene is faldo:located on the chr band.

    Every accepted row records in self.class_or_indiv whether the gene id
    is a class ('C') or an individual ('I'); this is done even for rows
    past the limit, which are otherwise not added to the graph.

    Rows are expected to have exactly 16 tab-separated columns; any other
    count raises ValueError from the tuple unpacking.

    :param limit: when not in testMode, rows counted beyond this number
        are not added to the graph; None means no limit
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph

    geno = Genotype(g)
    model = Model(g)

    # not unzipping the file
    logger.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", gene_info)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        model.addClassToGraph(tax_id, None)

    # a "regular" band: chromosome, arm, optional band and sub-band
    band_regex = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

    with gzip.open(gene_info, 'rb') as f:
        # the first line is consumed as the header and only reported
        row = f.readline().decode().strip().split('\t')
        logger.info("Header has %i columns", len(row))
        for line in f:
            # skip comments
            line = line.decode().strip()
            if line.startswith('#'):
                continue
            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations,
             modification_date, feature_type) = line.split('\t')

            # ##set filter=None in init if you don't want to have a filter
            # if self.filter is not None:
            #     if ((self.filter == 'taxids' and \
            #          (int(tax_num) not in self.tax_ids))
            #           or (self.filter == 'geneids' and \
            #               (int(gene_num) not in self.gene_ids))):
            #         continue
            # #### end filter

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue

            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self.map_type_of_gene(gtype.strip())

            label = None if symbol == 'NEWENTRY' else symbol

            # sequence feature, not a gene
            if gene_type_id == 'SO:0000110':
                self.class_or_indiv[gene_id] = 'I'
            else:
                self.class_or_indiv[gene_id] = 'C'

            # past the limit: keep reading so class_or_indiv is still
            # filled in above, but add nothing more to the graph
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                continue

            if self.class_or_indiv[gene_id] == 'C':
                model.addClassToGraph(gene_id, label, gene_type_id, desc)
                # NCBI will be the default leader,
                # so we will not add the leader designation here.
            else:
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)
                # in this case, they aren't genes.
                # so we want someone else to be the leader.

            if name != '-':
                model.addSynonym(gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])

            if xrefs.strip() != '-':
                self._add_gene_equivalencies(xrefs, gene_id, tax_num)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # no idea why there's two bands listed - possibly 2 assemblies
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # this is of "unknown" type == susceptibility
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            # unlocated scaffold
            # 101928066       LOC101928066    1|Un    -
            # mouse --> 2C3
            # 11435   Chrna1  2       2 C3|2 43.76 cM
            # mouse --> 11B1.1
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # we will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            chrom = str(chrom)
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # means that there's uncertainty in the mapping.
                    # so skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    # log the row's chromosome value, not the builtin chr()
                    logger.info(
                        '%s is non-uniquely mapped to %s.' +
                        ' Skipping for now.',
                        gene_id, chrom)
                    continue
                    # X|Y	Xp22.33;Yp11.3

                if chrom == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing
                # do this in a loop to allow PAR regions like X|Y
                for c in chrom.split('|'):
                    # assume that the chromosome label is added elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    model.addSynonym(mychrom, mychrom_syn)

                    band_match = band_regex.match(map_loc)
                    if band_match is not None and \
                            len(band_match.groups()) > 0:
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # not sure why this matches?
                        #   chrX|Y or 10090chr12|Un"
                        # TODO we probably need a different regex
                        # per organism
                        # the maploc_id already has the numeric chromosome
                        # in it, strip it first
                        bid = re.sub(r'^' + c, '', map_loc)
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # Assume it's type will be added elsewhere
                        band = Feature(g, maploc_id, None, None)
                        band.addFeatureToGraph()
                        # add the band as the containing feature
                        g.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases: examples are:
                        # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                        # 12cen-q21,22q13.3|22q13.3
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        g.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)

    return
def _get_variants(self, limit):
    """
    Loop through the gzipped variant_summary file and add each variant,
    together with its genome build, chromosome or cytogenetic band,
    gene, phenotype associations and cross references, to the graph.

    In test mode a row is kept only when its gene id, variant id or one
    of its phenotype ids is listed in self.gene_ids, self.variant_ids or
    self.disease_ids, and the row limit is not applied.

    :param limit: in non-test mode, stop once the count of data
        (non-comment) rows exceeds this value; None means no limit
    :return: None
    :raises ValueError: when a data row cannot be unpacked into the
        expected number of tab-separated columns
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    geno = Genotype(g)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:' + tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # Column descriptions of the raw file:
    # AlleleID      integer value as stored in the AlleleID field
    #               in ClinVar  (//Measure/@ID in the XML)
    # Type          character, the type of variation
    # Name          character, the preferred name for the variation
    # GeneID        integer, GeneID in NCBI's Gene database
    # GeneSymbol    character, comma-separated list of GeneIDs
    #               overlapping the variation
    # ClinicalSignificance  character, comma-separated list of values of
    #               clinical significance reported for this variation;
    #               for the mapping between the terms listed here and
    #               the integers in the .VCF files, see
    #               http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
    # RS# (dbSNP)   integer, rs# in dbSNP
    # nsv (dbVar)   character, the NSV identifier for the region in dbVar
    # RCVaccession  character, list of RCV accessions that report
    #               this variant
    # TestedInGTR   character, Y/N for Yes/No if there is a test
    #               registered as specific to this variation in the
    #               NIH Genetic Testing Registry (GTR)
    # PhenotypeIDs  character, list of db names and identifiers for
    #               phenotype(s) reported for this variant
    # Origin        character, list of all allelic origins for
    #               this variation
    # Assembly      character, name of the assembly on which
    #               locations are based
    # Chromosome    character, chromosomal location
    # Start         integer, starting location, in pter->qter orientation
    # Stop          integer, end location, in pter->qter orientation
    # Cytogenetic   character, ISCN band
    # ReviewStatus  character, highest review status for reporting this
    #               measure. For the key to the terms, and their
    #               relationship to the star graphics ClinVar displays
    #               on its web pages, see
    #               http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
    # HGVS(c.)      character, RefSeq cDNA-based HGVS expression
    # HGVS(p.)      character, RefSeq protein-based HGVS expression
    # NumberSubmitters  integer, number of submissions with this variant
    # LastEvaluated datetime, the latest time any submitter reported
    #               clinical significance
    # Guidelines    character, ACMG only right now, for the reporting of
    #               incidental variation in a Gene (NOTE: if ACMG,
    #               not a specific to the allele but to the Gene)
    # OtherIDs      character, list of other identifiers or sources of
    #               information about this variant
    # VariantID     integer, the value used to build the URL for the
    #               current default report, e.g.
    #               http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
    expected_numcols = 29

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as reader:
        for line in reader:
            # skip comments
            line = line.decode().strip()
            if re.match(r'^#', line):
                continue

            # a crude check that there's an expected number of cols.
            # if not, error out because something changed
            # (the unpacking below raises ValueError in that case).
            row = line.split('\t')
            num_cols = len(row)
            if num_cols != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file " +
                    "(%d actual vs %d expected)",
                    num_cols, expected_numcols)

            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom,
             start, stop, cytogenetic_loc, review_status, hgvs_c,
             hgvs_p, number_of_submitters, last_eval, guidelines,
             other_ids, variant_num, reference_allele,
             alternate_allele, categories, ChromosomeAccession) = row

            # ###set filter=None in init if you don't want to have a filter
            # if self.filter is not None:
            #    if ((self.filter == 'taxids' and\
            #        (int(tax_num) not in self.tax_ids)) or\
            #        (self.filter == 'geneids' and\
            #        (int(gene_num) not in self.gene_ids))):
            #        continue
            # #### end filter

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # get intersection of test disease ids
                # and these phenotype_ids
                intersect = \
                    {str(i) for i in self.disease_ids} & set(pheno_list)
                if int(gene_num) not in self.gene_ids and \
                        int(variant_num) not in self.variant_ids and \
                        len(intersect) < 1:
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)

            # reset per row so that a value from an earlier row can
            # never leak into a row that has no chromosome / band
            bandinbuild_id = None
            chrinbuild_id = None
            if str(chrom) == '':
                # check cytogenic location
                cyto = str(cytogenetic_loc).strip()
                if cyto != '':
                    # use cytogenic location to get the apx location
                    # oddly, they still put an assembly number even when
                    # there's no numeric location
                    if not re.search(r'-', cyto):
                        # no '-' means a single band, so the band string
                        # itself (not a split list) identifies it
                        band_id = makeChromID(cyto, tax_num, 'CHR')
                        geno.addChromosomeInstance(
                            cyto, build_id, assembly, band_id)
                        bandinbuild_id = makeChromID(
                            cyto, assembly, 'MONARCH')
                    else:
                        # can't deal with ranges yet
                        pass
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(str(chrom), tax_num, 'CHR')
                geno.addChromosomeClass(str(chrom), tax_id, tax_label)
                geno.addChromosomeInstance(
                    str(chrom), build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(str(chrom), assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None

            # they use -1 to indicate unknown gene
            if gene_num not in ('-1', 'more than 10'):
                if re.match(r'^Gene:', gene_num):
                    # NOTE(review): gene_id stays None on this path, so
                    # such rows are reported below as having no gene
                    # -- confirm whether that is intended
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms
            # first, make the variant:
            feature = Feature(g, seqalt_id, allele_name, allele_type_id)

            # coordinates are only meaningful relative to a chromosome
            # in a build, so skip them when no chromosome was given
            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feature.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feature.addFeatureEndLocation(stop, chrinbuild_id)

            feature.addFeatureToGraph()
            feature.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                feature.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:' + dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(r';', rcv_nums):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_' + gene_num + '-' + variant_num
                if self.nobnodes:
                    vl_id = ':' + vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting
                # the capture group holds the parenthesised symbol
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info(
                        "No gene listed for variant %d", int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            if phenotype_ids != '-':
                for phenotype in pheno_list:
                    m = re.match(
                        r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                    if m is not None:
                        phenotype = re.sub(
                            m.group(1), 'Orphanet:', phenotype.strip())
                    elif re.match(r'ORPHA:\d+', phenotype):
                        phenotype = re.sub(
                            r'^ORPHA', 'Orphanet', phenotype.strip())
                    elif re.match(r'Human Phenotype Ontology', phenotype):
                        phenotype = re.sub(
                            r'^Human Phenotype Ontology', '',
                            phenotype.strip())
                    elif re.match(r'SNOMED CT:\s?', phenotype):
                        phenotype = re.sub(
                            r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                    elif re.match(r'^Gene:', phenotype):
                        continue

                    assoc = G2PAssoc(
                        g, self.name, seqalt_id, phenotype.strip())
                    assoc.add_association_to_graph()

            if other_ids != '-':
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in other_ids.split(','):
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:' + xrefid.split(':')[1]
                    elif prefix != 'HGMD':
                        # Everything else is skipped for now:
                        # * a dbVar id equal to dbvar_num was already
                        #   added above
                        # * prefixes containing whitespace are not
                        #   usable as-is
                        # * remaining prefixes should be good and clean
                        #   and could become xrefs, but are not added
                        #   yet (note that HGMD variants are kept
                        #   because we can't resolve URIs for them)
                        continue
                    model.addIndividualToGraph(xrefid, None)
                    model.addSameIndividual(seqalt_id, xrefid)

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")

    return
def _get_chrbands(self, limit, taxon):
    """
    For the given taxon, read its gzipped chromosome band file and
    build the partonomy: genome -> chromosome -> parent bands -> band.
    We will not deal with the coordinate information with this parser;
    only the chrom, band and rtype columns are used.

    Blank lines, comment lines and anything that is not a placed
    chromosome (per the ``chr<N|X|Y|Z|W|MT|M>`` pattern) are skipped.

    :param limit: stop once the count of lines read (comments and
        skipped lines included) exceeds this value; None for no limit
    :param taxon: key into self.files for this organism; it is also
        appended to 'NCBITaxon:' to form the taxon id
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    genome_id = geno.makeGenomeID(taxon_id)
    geno.addGenome(taxon_id, genome_label)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
    # currently unused patterns
    # unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
    # unplaced_scaffold_pattern = r'chrUn_(\w+)'

    # compile once; both patterns are applied to every row
    placed_scaffold_re = re.compile(placed_scaffold_pattern + r'$')
    stain_re = re.compile(r'g(neg|pos|var)')

    col = ['chrom', 'start', 'stop', 'band', 'rtype']
    chrom_idx = col.index('chrom')
    band_idx = col.index('band')
    rtype_idx = col.index('rtype')

    with gzip.open(myfile, 'rb') as reader:
        for line in reader:
            line_counter += 1
            line = line.decode().strip()
            # skip blank lines and comments
            # (indexing line[0] on a blank line would raise IndexError)
            if not line or line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            chrom = row[chrom_idx]
            band = row[band_idx]
            rtype = row[rtype_idx]

            # NOTE
            # some less-finished genomes have placed and unplaced scaffolds
            # * Placed scaffolds:
            #       Scaffold has an oriented location within a chromosome.
            # * Unlocalized scaffolds:
            #       scaffold's chromosome is known,
            #       scaffold's position, orientation or both is not known.
            # * Unplaced scaffolds:
            #       it is not known which chromosome the scaffold
            #       belongs to.
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            if placed_scaffold_re.match(chrom) is None:
                # let's skip over anything that isn't a placed_scaffold
                LOG.info("Skipping non-placed chromosome %s", chrom)
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band(region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band is not None and band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                region_type_id = self.globaltt['chromosome']

            # add the staining intensity of the band
            if stain_re.match(rtype):
                if region_type_id in (
                        self.globaltt['chromosome_band'],
                        self.globaltt['chromosome_subband']):
                    stain_type = self.resolve(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            self.globaltt['has_sequence_attribute'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    LOG.info("feature type %s != chr band", region_type_id)
            else:
                LOG.warning('staining type not found: %s', rtype)

            # get the parent bands, and make them unique
            parents = list(self.make_parent_bands(band, set()))
            # alphabetical sort will put them in smallest to biggest
            parents.sort(reverse=True)
            last_idx = len(parents) - 1

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            for idx, prnt in enumerate(parents):
                parent = prnt.strip()
                if not parent:
                    continue
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes,
                # then the subbands are subsequences of the full band
                # add the subsequence stuff as restrictions
                if idx != last_idx:
                    # the next entry in the sorted list is the
                    # enclosing (larger) band
                    pid = cclassid + parents[idx + 1]  # the instance
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'], pid)
                    model.addOWLPropertyClassRestriction(
                        pid, self.globaltt['has subsequence'], pclassid)
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'],
                        cclassid)
                    model.addOWLPropertyClassRestriction(
                        cclassid, self.globaltt['has subsequence'],
                        pclassid)

            # connect the band here to the first one in the parent list
            if parents:
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    cclassid + parents[0])
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0], self.globaltt['has subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break

    # TODO figure out the staining intensities for the encompassing bands

    return