def download_all_structures(db_path, record_type=GlycanRecordWithTaxon):  # pragma: no cover
    """Download the GlycomeDB structure dump and load each parseable structure.

    Every ``structure`` element of the downloaded XML is parsed from its
    GlycoCT ``sequence`` text, re-serialized and re-parsed as a sanity check,
    and stored through ``db.load_data``. Structures that fail at any step are
    printed and remembered instead of aborting the whole import.

    Parameters
    ----------
    db_path:
        Forwarded as the first argument of `RecordDatabase`.
    record_type:
        Callable used to build each record; it is invoked as
        ``record_type(glycan, taxa=taxa, id=glycomedb_id)`` and is also passed
        to `RecordDatabase`.

    Returns
    -------
    RecordDatabase
        The database after all records were loaded and committed. Failures are
        stored under the "misses" metadata key as ``(glycomedb_id, exception)``
        pairs.

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises when the download fails.
    '''
    """
    response = requests.get(u'http://www.glycome-db.org/http-services/getStructureDump.action?user=eurocarbdb')
    response.raise_for_status()
    handle = gzip.GzipFile(fileobj=StringIO(response.content))
    xml = etree.parse(handle)
    db = RecordDatabase(db_path, record_type=record_type)
    misses = []
    i = 0
    for structure in xml.iterfind(".//structure"):
        # Reset per structure so a failure while reading the id is never
        # reported under the previous structure's id (or as a NameError).
        glycomedb_id = None
        try:
            glycomedb_id = int(structure.attrib['id'])
            i += 1
            glycoct_str = structure.find("sequence").text
            taxa = [Taxon(t.attrib['ncbi'], None, None) for t in structure.iterfind(".//taxon")]
            glycan = glycoct.loads(glycoct_str)
            # Compare magnitudes: a re-parsed mass that is *larger* than the
            # original is just as much a mismatch as one that is smaller.
            if abs(glycoct.loads(str(glycan)).mass() - glycan.mass()) > 0.00001:
                raise Exception("Mass did not match on reparse")
            record = record_type(glycan, taxa=taxa, id=glycomedb_id)
            db.load_data(record, commit=False, set_id=False)
            if i % 1000 == 0:
                print(i, "Records parsed.")
        except Exception as e:
            # Deliberate best-effort import: record the failure and move on.
            misses.append((glycomedb_id, e))
            print(glycomedb_id, e)
    db.set_metadata("misses", misses)
    db.commit()
    return db
def test_is_n_glycan(self):
    """Check `subtree_of` for the N-linked core against three reference trees."""
    n_core = glycans['N-Linked Core']

    # Default matching mode against the broad structure.
    broad = glycoct.loads(broad_n_glycan)
    self.assertTrue(subtree_search.subtree_of(n_core, broad) == 1)

    # Both the inexact and the exact mode must report the same position.
    complex_tree = glycoct.loads(complex_glycan)
    for exact in (False, True):
        outcome = subtree_search.subtree_of(n_core, complex_tree, exact=exact)
        self.assertTrue(outcome == 1)

    # No match is signalled by None rather than by a falsy number.
    branchy = glycoct.loads(branchy_glycan)
    outcome = subtree_search.subtree_of(n_core, branchy, exact=False)
    self.assertTrue(outcome is None)
def structure(self, *accessions):
    """Fetch and parse the GlycoCT structure for each accession.

    Parameters
    ----------
    *accessions:
        `URIRef` instances are used as-is; anything else is looked up through
        ``NSGlycoinfo[accession]`` to obtain the URI to filter on.

    Returns
    -------
    The parsed structure itself when exactly one result was accumulated,
    otherwise the list of parsed structures (empty when no accession is given).
    """
    # Adjacent literals keep the query text unchanged while staying readable.
    query_template = (
        r' SELECT DISTINCT ?saccharide ?glycoct WHERE {'
        r' ?saccharide a glycan:saccharide .'
        r' ?saccharide glycan:has_glycosequence ?sequence .'
        r' FILTER CONTAINS(str(?sequence), "glycoct") .'
        r' ?sequence glycan:has_sequence ?glycoct .'
        r' FILTER ("%s" = str(?saccharide))'
        r' } '
    )
    parsed = []
    for accession in accessions:
        target = accession if isinstance(accession, URIRef) else NSGlycoinfo[accession]
        results = self.query(query_template % str(target))
        # The second selected variable (?glycoct) carries the sequence text;
        # only the first binding row is used.
        glycoct_var = results.vars[1]
        parsed.append(glycoct.loads(results.bindings[0][glycoct_var]))
    return parsed[0] if len(parsed) == 1 else parsed
def glycan_record_from_xml(xml_tree, id):
    '''
    Build a `GlycanRecord` from an XML document and cache it.

    Parameters
    ----------
    xml_tree: lxml.etree
        XML document to consume. The structure text is read from the node
        found at the module-level `xpath`; ``taxon``, ``aglyca`` and ``motif``
        descendants supply the annotations.
    id:
        GlycomeDB id number to assign this record

    Returns
    -------
    GlycanRecord:
        Constructed record, after it was passed to `add_cache`
    '''
    structure = glycoct.loads(xml_tree.find(xpath).text)

    taxa = []
    for node in xml_tree.findall(".//taxon"):
        taxa.append(Taxon(node.attrib['ncbi'], node.attrib['name'], make_entries(node)))

    aglycon = []
    for node in xml_tree.findall(".//aglyca"):
        # Single quotes in the name are swapped for backticks.
        safe_name = node.attrib['name'].replace("'", "`")
        aglycon.append(Aglyca(safe_name, node.attrib['reducing'], make_entries(node)))

    motifs = []
    for node in xml_tree.findall(".//motif"):
        motifs.append(Motif(node.attrib['name'], node.attrib['id'], node.attrib['class']))

    # Cross references: every entry of every taxon, then of every aglycon,
    # followed by the GlycomeDB entry for the record itself.
    entry_groups = [annotated.entries for annotated in taxa + aglycon]
    dbxref = []
    for group in entry_groups:
        dbxref.extend(group)
    dbxref.append(DatabaseEntry("GlycomeDB", id))

    record = GlycanRecord(
        structure, motifs=motifs, dbxref=dbxref, aglycones=aglycon, taxa=taxa, id=id)
    record.id = id
    add_cache(record)
    return record
def reparse_database(database):
    """Round-trip every record's structure through its GlycoCT text form.

    For each record yielded by iterating `database`, the structure is
    stringified, parsed again, checked to have exactly the same mass as
    before, assigned back to the record, and persisted with
    ``record.update()``.

    Raises
    ------
    AssertionError
        If the re-parsed structure's mass differs from the stored one.
    """
    from glypy.io import glycoct

    for entry in database:
        reparsed = glycoct.loads(str(entry.structure))
        assert reparsed.mass() == entry.structure.mass()
        entry.structure = reparsed
        entry.update()
def test_translate(self):
    """Round-trip structures through linear code and compare with the GlycoCT parse."""
    original = glycoct.loads(broad_n_glycan)
    round_tripped = linear_code.loads(linear_code.dumps(original))
    self.assertEqual(original, round_tripped)

    # linear code doesn't know about modifications or ring shape, so the
    # reducing end is cleared and the root ring bounds are set explicitly
    # before the round trip is expected to compare equal.
    adjusted = glycoct.loads(sulfated_glycan)
    adjusted.reducing_end = None
    adjusted.root.ring_start = 1
    adjusted.root.ring_end = 5
    round_tripped = linear_code.loads(linear_code.dumps(adjusted))
    self.assertEqual(round_tripped, adjusted)

    # Without those adjustments the round trip must NOT compare equal.
    unadjusted = glycoct.loads(sulfated_glycan)
    round_tripped = linear_code.loads(linear_code.dumps(unadjusted))
    self.assertNotEqual(unadjusted, round_tripped)
def test_deep_similarity(self):
    """Pin `monosaccharide_similarity` scores for root pairs, with and without children."""
    branchy = glycoct.loads(branchy_glycan)
    broad = glycoct.loads(broad_n_glycan)
    snapshot = broad.clone()

    deep = {"include_children": True}
    # (node, target, keyword options, expected result), checked in this order.
    cases = [
        (branchy.root, branchy.root, {}, (5, 5)),
        (branchy.root, branchy.root, deep, (26, 26)),
        (branchy.root, broad.root, {}, (4, 5)),
        (branchy.root, broad.root, deep, (7, 10)),
        (broad.root, branchy.root, deep, (11, 14)),
        (broad.root, broad.root, deep, (54, 54)),
    ]
    for node, target, options, expected in cases:
        observed = similarity.monosaccharide_similarity(node, target, **options)
        self.assertEqual(observed, expected)

    # The comparisons must leave the compared structure equal to its clone.
    self.assertEqual(snapshot, broad)
def convert(self, structure_text):
    """Turn GlycoCT text into a simplified, hashable glycan composition.

    Results are memoized in ``self.cache`` keyed by the input text. On a
    miss, the text is parsed, converted to a composition, stripped of stems,
    configurations and positions, and wrapped again as a
    `HashableGlycanComposition` before being cached.
    """
    if structure_text in self.cache:
        return self.cache[structure_text]

    parsed = glycoct.loads(structure_text)
    # thaw() yields the object the drop_* simplifications are applied to.
    working = HashableGlycanComposition.from_glycan(parsed).thaw()
    working.drop_stems()
    working.drop_configurations()
    working.drop_positions()

    simplified = HashableGlycanComposition(working)
    self.cache[structure_text] = simplified
    return simplified
def get(id):
    '''
    Get the structure for `id` from :title-reference:`GlycomeDB`.

    A cached record is preferred; otherwise the record is requested over
    HTTP and the text found at `xpath` is parsed as GlycoCT.
    '''
    if check_cache(id):
        return cache[id].structure

    response = requests.get(get_url_template.format(id=id))
    response.raise_for_status()
    document = etree.fromstring(response.content)
    return glycoct.loads(document.find(xpath).text)
def has_glycosequence_processor(state, uri):
    """Parse a GlycoCT sequence reference into a |Glycan| and store it on `state`.

    Parameters
    ----------
    state : ReferenceEntity or dict
        The key-value store to add annotation to.
    uri : rdflib.term.URIRef
        The `URIRef` to load structure data from. It is called to resolve the
        referenced entity.

    Returns
    -------
    BoundURIRef
        The `uri` argument, returned unchanged.
    """
    reference = uri()
    is_glycoct = reference.in_carbohydrate_format == NSGlycan.carbohydrate_format_glycoct
    if not is_glycoct:
        return uri
    # trailing underscore in case a URI would claim "structure"
    state["structure_"] = [glycoct.loads(reference.has_sequence)]
    return uri
def translate_response(self, response):
    """Yield ``(structure, motif class)`` pairs for the rows of `response`.

    Each row is unpacked as ``(name, glycosequence, taxon, motif)``. The
    sequence is parsed as a `NamedGlycan`, given its name, and dropped as soon
    as any callable in ``self.filter_functions`` returns a truthy value.
    GlycoCT errors skip the row silently; any other exception is reported
    through ``self.error`` and the row is skipped.
    """
    for name, glycosequence, taxon, motif in response:
        taxon = parse_taxon(taxon)
        try:
            structure = glycoct.loads(glycosequence, structure_class=NamedGlycan)
            structure.name = name
            for rejects in self.filter_functions:
                if rejects(structure, name=name, taxon=taxon, motif=motif):
                    break
            else:
                # Reached only when no filter rejected the structure.
                yield structure, motif_to_class_map[motif]
        except glycoct.GlycoCTError:
            continue
        except Exception as e:
            self.error("Error in translate_response of %s" % name, e)
            continue
def get(id):
    '''
    Get the structure for `id` from :title-reference:`GlycomeDB`.

    GlycomeDB supplies a detailed schema link which allows `lxml` to pull out
    more than just the GlycoCT string; this function only extracts and parses
    that string. To download a more informative record, use
    :func:`get_record`.

    Parameters
    ----------
    id: str or int

    Returns
    -------
    Glycan
    '''
    if check_cache(id):
        return cache[id].structure

    reply = requests.get(get_url_template.format(id=id))
    reply.raise_for_status()
    root = etree.fromstring(reply.content)
    glycoct_text = root.find(xpath).text
    return glycoct.loads(glycoct_text)
def __init__(self, stream=None, key_transform=identity, value_transform=identity):
    """Load motif structures from an hjson stream into this mapping.

    Parameters
    ----------
    stream:
        File-like object holding the motif definitions. When omitted, the
        packaged resource ``data/motifs.hjson`` is opened instead.
    key_transform, value_transform:
        Accepted but not used by this initializer.

    Each definition's ``glycoct`` text is parsed and annotated with its name,
    class, category and core-motif flag, then stored under its name. The sets
    of classes and categories seen are kept on the instance.
    """
    if stream is None:
        stream = pkg_resources.resource_stream(__name__, "data/motifs.hjson")
    definitions = hjson.load(stream)

    seen_classes = set()
    seen_categories = set()
    for definition in definitions:
        name, klass, category = (
            definition['name'], definition['class'], definition['category'])
        parsed = glycoct.loads(definition['glycoct'])
        parsed.motif_name = name
        parsed.motif_class = klass
        parsed.motif_category = category
        parsed.is_core_motif = definition["core_motif"]
        self[name] = parsed
        seen_classes.add(klass)
        seen_categories.add(category)

    # Lookup maps start out empty here.
    self._category_map = {}
    self._class_map = {}
    self.motif_classes = seen_classes
    self.motif_categories = seen_categories
# Module-level fixture: a structure parsed once, at import time, from inline
# GlycoCT text. The RES section lists 32 numbered entries and the LIN section
# lists 31 numbered linkages between them.
# NOTE(review): the literal appears here with single spaces between entries;
# confirm whether the parser needs one entry per line before reformatting it.
glycan_structure = glycoct.loads(''' RES 1b:x-dglc-HEX-1:5 2s:n-acetyl 3b:b-dglc-HEX-1:5 4s:n-acetyl 5b:b-dman-HEX-1:5 6b:a-dman-HEX-1:5 7b:b-dglc-HEX-1:5 8s:n-acetyl 9b:a-lgal-HEX-1:5|6:d 10b:b-dgal-HEX-1:5 11b:a-dgro-dgal-NON-2:6|1:a|2:keto|3:d 12s:n-glycolyl 13b:b-dglc-HEX-1:5 14s:n-acetyl 15b:b-dgal-HEX-1:5 16s:n-acetyl 17b:b-dglc-HEX-1:5 18s:n-acetyl 19b:a-dman-HEX-1:5 20b:b-dglc-HEX-1:5 21s:n-acetyl 22b:a-lgal-HEX-1:5|6:d 23b:b-dgal-HEX-1:5 24b:a-dgro-dgal-NON-2:6|1:a|2:keto|3:d 25s:n-glycolyl 26b:b-dglc-HEX-1:5 27s:n-acetyl 28b:a-lgal-HEX-1:5|6:d 29b:b-dgal-HEX-1:5 30b:a-dgro-dgal-NON-2:6|1:a|2:keto|3:d 31s:n-acetyl 32b:a-lgal-HEX-1:5|6:d LIN 1:1d(2+1)2n 2:1o(4+1)3d 3:3d(2+1)4n 4:3o(4+1)5d 5:5o(3+1)6d 6:6o(2+1)7d 7:7d(2+1)8n 8:7o(3+1)9d 9:7o(4+1)10d 10:10o(3+2)11d 11:11d(5+1)12n 12:6o(4+1)13d 13:13d(2+1)14n 14:13o(4+1)15d 15:15d(2+1)16n 16:5o(4+1)17d 17:17d(2+1)18n 18:5o(6+1)19d 19:19o(2+1)20d 20:20d(2+1)21n 21:20o(3+1)22d 22:20o(4+1)23d 23:23o(3+2)24d 24:24d(5+1)25n 25:19o(6+1)26d 26:26d(2+1)27n 27:26o(3+1)28d 28:26o(4+1)29d 29:29o(3+2)30d 30:30d(5+1)31n 31:1o(6+1)32d ''')
FILTER CONTAINS(str(?sequence), "glycoct") . ?sequence glycan:has_sequence ?glycoct . ?saccharide glycan:has_motif ?motif . FILTER(?motif in (glycoinfo:G00026MO)) } """) structures = [] for i, bind in enumerate(result.bindings): # result.vars[1] contains the RDF key for the GlycoCT Condensed encoding of the # glycan structure text = bind[result.vars[1]] if i % 100 == 0: print("Parsed %d glycan structures" % (i,)) try: structure = glycoct.loads(text) structures.append(structure) except Exception as ex: print(i, bind[result.vars[0]], ex) continue def detatch_monosaccharide_substituents(composition, substituents=None): if substituents is None: substituents = [] if not substituents: return composition gc = GlycanComposition() for key, value in composition.items():
def run(self):
    """Build a reference hypothesis from the GlycomeDB structure dump.

    The gzipped dump is read from `cache_name`, downloading it first when the
    file does not exist. A new `MS2GlycanHypothesis` is created and its id is
    stored on ``self.hypothesis_id``. Every ``structure`` element is parsed
    from GlycoCT, checked by a serialize/re-parse mass comparison, reduced to
    a composition and stored as a `TheoreticalGlycanStructure`. Taxon and
    motif association rows are accumulated and bulk-inserted every 100
    structures, and once more after the last structure.

    Structures raising `glycoct.GlycoCTSectionUnsupported` or `IndexError`
    are skipped silently; other ordinary exceptions are logged and skipped.

    Raises
    ------
    KeyboardInterrupt, IntegrityError
        Re-raised from the per-structure handler rather than swallowed.
    """
    self.manager.initialize()
    logger.debug("Checking %s for downloaded data", cache_name)
    if not os.path.exists(cache_name):
        response = requests.get(u'http://www.glycome-db.org/http-services/getStructureDump.action?user=eurocarbdb')
        response.raise_for_status()
        # Close (and thereby flush) the cache file before it is read back.
        with open(cache_name, "wb") as cache_file:
            cache_file.write(response.content)
    # The document is parsed completely inside the block, so the underlying
    # file can be closed as soon as parsing is done.
    with open(cache_name, "rb") as data_source:
        xml = etree.parse(gzip.GzipFile(fileobj=data_source))

    session = self.manager.session()
    hypothesis = MS2GlycanHypothesis(name=reference_hypothesis_name_prefix + timestamp())
    session.add(hypothesis)
    glycomedb = ReferenceDatabase.get(session, name="Glycome-DB")
    session.add(glycomedb)
    session.commit()
    self.hypothesis_id = hypothesis.id

    i = 0
    motifs = session.query(StructureMotif).all()
    drop_stems = self.drop_stems
    drop_positions = self.drop_positions

    # Touch every taxon referenced anywhere in the dump before structures are
    # processed; the return values are not needed here.
    all_taxa = {int(t.attrib['ncbi']) for t in xml.iterfind(".//taxon")}
    for tid in all_taxa:
        Taxon.get(session, tid)
    session.flush()

    def insert_associations(taxon_rows, motif_rows):
        # Bulk-insert the accumulated association rows; empty batches are skipped.
        if taxon_rows:
            session.execute(TheoreticalGlycanStructure.TaxonomyAssociationTable.insert(), taxon_rows)
        if motif_rows:
            session.execute(TheoreticalGlycanStructureToMotifTable.insert(), motif_rows)

    logger.info("Parsing database structures")
    taxon_acc = []
    motif_acc = []
    for structure in xml.iterfind(".//structure"):
        # Reset per structure so the log line in the generic handler never
        # reports a previous structure's accession (or hits a NameError).
        accession = None
        try:
            accession = structure.attrib['id']
            glycoct_str = structure.find("sequence").text
            taxa = [int(t.attrib['ncbi']) for t in structure.iterfind(".//taxon")]
            glycan = glycoct.loads(glycoct_str)
            # Compare magnitudes so a re-parsed mass that is larger than the
            # original is rejected as well.
            if abs(glycoct.loads(str(glycan)).mass() - glycan.mass()) > 0.00001:
                # Parity Error
                continue
            composition = GlycanComposition.from_glycan(glycan)
            if drop_stems:
                composition.drop_stems()
            if drop_positions:
                composition.drop_positions()
            composition.drop_configurations()
            composition.collapse()
            reduction = "ReducedEnd" if glycan.reducing_end else None
            record = TheoreticalGlycanStructure(
                glycoct=glycoct_str,
                composition=composition.serialize(),
                reduction=reduction,
                calculated_mass=glycan.mass(),
                hypothesis_id=self.hypothesis_id)
            record.references = [ReferenceAccessionNumber.get(session, accession, glycomedb.id)]
            session.add(record)
            # Flush so record.id is available for the association rows below.
            session.flush()
            for motif in motifs:
                if motif.matches(record):
                    motif_acc.append({"motif_id": motif.id, "glycan_id": record.id})

            taxon_acc.extend({"taxon_id": tid, "entity_id": record.id} for tid in taxa)
            i += 1
            if (i % 100) == 0:
                session.commit()
                insert_associations(taxon_acc, motif_acc)
                taxon_acc = []
                motif_acc = []
                session.commit()
                self.inform("Commit %r", i)
        except (KeyboardInterrupt, IntegrityError):
            raise
        except (glycoct.GlycoCTSectionUnsupported, IndexError):
            pass
        except Exception as e:
            # `as` form works on Python 2.6+ and 3; KeyboardInterrupt is not
            # an Exception subclass, so it cannot arrive here.
            logger.exception("%s", accession, exc_info=e)

    # Persist whatever is left from the final, partial batch of structures.
    session.commit()
    insert_associations(taxon_acc, motif_acc)
    session.commit()
def has_glycosequence_processor(state, uri):
    """Store the parsed GlycoCT structure referenced by `uri` on `state`.

    `uri` is called to resolve the reference. When its carbohydrate format is
    GlycoCT, the sequence is parsed and stored as a one-element list under
    ``state["structure_"]``. `uri` is returned unchanged in every case.
    """
    resolved = uri()
    if resolved.in_carbohydrate_format != NSGlycan.carbohydrate_format_glycoct:
        return uri
    # trailing underscore in case a URI would claim "structure"
    parsed = glycoct.loads(resolved.has_sequence)
    state["structure_"] = [parsed]
    return uri
def load(name):
    """Parse the GlycoCT text stored under `name` in `structures`.

    ``structure_composition.do_warn`` is switched off for the duration of the
    parse and set back to ``True`` afterwards.

    Raises
    ------
    KeyError
        If `structures` is a mapping without an entry for `name`.
    Any exception raised by ``glycoct.loads`` is propagated unchanged.
    """
    structure_composition.do_warn = False
    try:
        return glycoct.loads(structures[name])
    finally:
        # Restore the flag even when the lookup or the parse raises, so one
        # failure does not leave it switched off for every later caller.
        structure_composition.do_warn = True
def convert(self):
    """Parse ``self.glycan_sequence`` as GlycoCT and tag the result with ``self.id``."""
    parsed = glycoct.loads(self.glycan_sequence)
    parsed.id = self.id
    return parsed
def __init__(self, stream, key_transform=identity, value_transform=identity):
    """Populate this mapping with structures parsed from an hjson stream.

    Parameters
    ----------
    stream:
        File-like object whose hjson content maps keys to GlycoCT text.
    key_transform:
        Applied to each loaded key to obtain the key the parsed value is
        stored under. Also kept on the instance as ``self.key_transform``.
    value_transform:
        Applied to each parsed structure before it is stored.

    Entries are first loaded verbatim and then re-stored under the transformed
    key. An original key that the transform maps elsewhere is left in place
    with its raw text value.
    """
    self.update(hjson.load(stream))
    # Iterate over a snapshot: storing under a transformed key can add entries,
    # and growing a dict while iterating its live items() view raises
    # RuntimeError on Python 3.
    for k, v in list(self.items()):
        self[key_transform(k)] = value_transform(glycoct.loads(v))
    self.key_transform = key_transform
def test_maximum_common_subtree(self):
    """The best common subgraph of the N-linked core and the branchy tree scores 6.0."""
    n_core = glycans['N-Linked Core']
    branchy = glycoct.loads(branchy_glycan)
    best_match = subtree_search.maximum_common_subgraph(n_core, branchy)
    self.assertEqual(best_match.score, 6.0)
def test_subtree_inclusion(self):
    """`subtree_of` finds the core inside the broad tree but not the reverse."""
    n_core = glycans['N-Linked Core']
    broad = glycoct.loads(broad_n_glycan)

    core_in_tree = subtree_search.subtree_of(n_core, broad)
    self.assertTrue(core_in_tree)

    # Searching for the larger tree inside the core must report None.
    tree_in_core = subtree_search.subtree_of(broad, n_core)
    self.assertTrue(tree_in_core is None)