def download_all_structures(db_path, record_type=GlycanRecordWithTaxon):  # pragma: no cover
    """Download the GlycomeDB structure dump and load every structure into a database.

    Parameters
    ----------
    db_path:
        Forwarded to :class:`RecordDatabase` as its first argument.
    record_type:
        Class used to build one record per structure; also forwarded to
        :class:`RecordDatabase`.

    Returns
    -------
    RecordDatabase
        The committed database. Structures that could not be processed are
        stored under the ``"misses"`` metadata key as
        ``(glycomedb_id, exception)`` pairs.

    Raises
    ------
    requests.HTTPError
        If the download request fails (via ``raise_for_status``).
    """
    response = requests.get(u'http://www.glycome-db.org/http-services/getStructureDump.action?user=eurocarbdb')
    response.raise_for_status()
    handle = gzip.GzipFile(fileobj=StringIO(response.content))
    xml = etree.parse(handle)
    db = RecordDatabase(db_path, record_type=record_type)
    misses = []
    i = 0
    for structure in xml.iterfind(".//structure"):
        # Reset for every structure: if reading the id itself fails, the
        # handler below must not report the previous structure's id (or hit
        # a NameError on the very first structure).
        glycomedb_id = None
        try:
            glycomedb_id = int(structure.attrib['id'])
            i += 1
            glycoct_str = structure.find("sequence").text
            taxa = [Taxon(t.attrib['ncbi'], None, None) for t in structure.iterfind(".//taxon")]
            glycan = glycoct.loads(glycoct_str)
            # Round-trip sanity check. Compare the absolute difference so a
            # reparsed mass that is *smaller* is also flagged.
            if abs(glycoct.loads(str(glycan)).mass() - glycan.mass()) > 0.00001:
                raise Exception("Mass did not match on reparse")
            record = record_type(glycan, taxa=taxa, id=glycomedb_id)
            db.load_data(record, commit=False, set_id=False)
            if i % 1000 == 0:
                print(i, "Records parsed.")
        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            misses.append((glycomedb_id, e))
            print(glycomedb_id, e)
    db.set_metadata("misses", misses)
    db.commit()
    return db
Exemple #2
0
 def test_is_n_glycan(self):
     # The N-linked core must be found in the broad and complex N-glycans
     # and must not be found in the branchy glycan.
     n_core = glycans['N-Linked Core']

     broad_tree = glycoct.loads(broad_n_glycan)
     self.assertTrue(subtree_search.subtree_of(n_core, broad_tree) == 1)

     complex_tree = glycoct.loads(complex_glycan)
     for exact in (False, True):
         found = subtree_search.subtree_of(n_core, complex_tree, exact=exact)
         self.assertTrue(found == 1)

     branchy_tree = glycoct.loads(branchy_glycan)
     found = subtree_search.subtree_of(n_core, branchy_tree, exact=False)
     self.assertTrue(found is None)
 def structure(self, *accessions):
     """Query and parse the GlycoCT structure for each accession.

     Each accession is either a `URIRef`, used as-is, or a key looked up in
     `NSGlycoinfo`. One query is issued per accession through `self.query`,
     and the value bound to the second result variable of the first row is
     parsed with `glycoct.loads`.

     Returns the single parsed structure when exactly one accession was
     given, otherwise a list of parsed structures in argument order.
     """
     query_template = r'''
     SELECT DISTINCT ?saccharide ?glycoct WHERE {
         ?saccharide a glycan:saccharide .
         ?saccharide glycan:has_glycosequence ?sequence .
         FILTER CONTAINS(str(?sequence), "glycoct") .
         ?sequence glycan:has_sequence ?glycoct .
         FILTER ("%s" = str(?saccharide))
     }
     '''
     parsed = []
     for accession in accessions:
         target = str(accession) if isinstance(accession, URIRef) else str(NSGlycoinfo[accession])
         results = self.query(query_template % target)
         glycoct_var = results.vars[1]
         first_row = results.bindings[0]
         parsed.append(glycoct.loads(first_row[glycoct_var]))
     return parsed[0] if len(parsed) == 1 else parsed
Exemple #4
0
def glycan_record_from_xml(xml_tree, id):
    '''
    Build a `GlycanRecord` from a GlycomeDB XML document.

    The structure is parsed from the text of the node located by `xpath`;
    taxon, aglycon and motif annotations are collected from the matching
    child elements, and the record is passed to `add_cache` before being
    returned.

    Parameters
    ----------
    xml_tree: lxml.etree
        XML document to consume
    id:
        GlycomeDB id number to assign this record

    Returns
    -------
    GlycanRecord:
        Constructed record
    '''
    structure = glycoct.loads(xml_tree.find(xpath).text)

    taxa = []
    for node in xml_tree.findall(".//taxon"):
        taxa.append(Taxon(node.attrib['ncbi'], node.attrib['name'], make_entries(node)))

    aglycon = []
    for node in xml_tree.findall(".//aglyca"):
        # Single quotes in the aglycon name are replaced by backticks.
        safe_name = node.attrib['name'].replace("'", "`")
        aglycon.append(Aglyca(safe_name, node.attrib['reducing'], make_entries(node)))

    motifs = []
    for node in xml_tree.findall(".//motif"):
        attrs = node.attrib
        motifs.append(Motif(attrs['name'], attrs['id'], attrs['class']))

    # Cross references: taxon entries first, then aglycon entries, then the
    # GlycomeDB id itself.
    dbxref = []
    for annotated in taxa + aglycon:
        dbxref.extend(annotated.entries)
    dbxref.append(DatabaseEntry("GlycomeDB", id))

    record = GlycanRecord(
        structure,
        motifs=motifs,
        dbxref=dbxref,
        aglycones=aglycon,
        taxa=taxa,
        id=id)
    record.id = id
    add_cache(record)
    return record
Exemple #5
0
def reparse_database(database):
    """Round-trip every record's structure through its string form.

    For each record yielded by `database`, the structure is serialized with
    ``str``, parsed again with ``glycoct.loads``, asserted to have the same
    mass as before, stored back on the record, and the record's ``update``
    method is called.
    """
    from glypy.io import glycoct
    for record in database:
        previous = record.structure
        reparsed = glycoct.loads(str(previous))
        assert reparsed.mass() == previous.mass()
        record.structure = reparsed
        record.update()
Exemple #6
0
    def test_translate(self):
        def round_trip(glycan):
            # Serialize to linear code and parse the result back.
            return linear_code.loads(linear_code.dumps(glycan))

        plain = glycoct.loads(broad_n_glycan)
        self.assertEqual(plain, round_trip(plain))

        # linear code doesn't know about modifications or
        # ring shape
        adjusted = glycoct.loads(sulfated_glycan)
        adjusted.reducing_end = None
        adjusted.root.ring_start = 1
        adjusted.root.ring_end = 5
        self.assertEqual(round_trip(adjusted), adjusted)

        # Without the adjustments above the round trip is expected to differ.
        untouched = glycoct.loads(sulfated_glycan)
        self.assertNotEqual(untouched, round_trip(untouched))
Exemple #7
0
 def test_deep_similarity(self):
     branchy = glycoct.loads(branchy_glycan)
     broad = glycoct.loads(broad_n_glycan)
     # Copy taken up front; compared against `broad` at the end.
     snapshot = broad.clone()
     compare = similarity.monosaccharide_similarity

     self.assertEqual(compare(branchy.root, branchy.root), (5, 5))
     self.assertEqual(compare(branchy.root, branchy.root, include_children=True), (26, 26))
     self.assertEqual(compare(branchy.root, broad.root), (4, 5))
     self.assertEqual(compare(branchy.root, broad.root, include_children=True), (7, 10))
     self.assertEqual(compare(broad.root, branchy.root, include_children=True), (11, 14))
     self.assertEqual(compare(broad.root, broad.root, include_children=True), (54, 54))

     self.assertEqual(snapshot, broad)
 def convert(self, structure_text):
     """Return the reduced composition for a GlycoCT string, using `self.cache`.

     On a cache miss the text is parsed with `glycoct.loads`, converted to a
     thawed `HashableGlycanComposition`, stripped of stems, configurations
     and positions, re-wrapped as a `HashableGlycanComposition`, and stored
     in the cache under `structure_text`.
     """
     cache = self.cache
     if structure_text in cache:
         return cache[structure_text]
     parsed = glycoct.loads(structure_text)
     thawed = HashableGlycanComposition.from_glycan(parsed).thaw()
     for strip in (thawed.drop_stems, thawed.drop_configurations, thawed.drop_positions):
         strip()
     frozen = HashableGlycanComposition(thawed)
     cache[structure_text] = frozen
     return frozen
Exemple #9
0
 def convert(self, structure_text):
     """Return the reduced composition for a GlycoCT string, using `self.cache`.

     On a cache miss the text is parsed with `glycoct.loads`, converted to a
     thawed `HashableGlycanComposition`, stripped of stems, configurations
     and positions, re-wrapped as a `HashableGlycanComposition`, and stored
     in the cache under `structure_text`.
     """
     cache = self.cache
     if structure_text in cache:
         return cache[structure_text]
     parsed = glycoct.loads(structure_text)
     thawed = HashableGlycanComposition.from_glycan(parsed).thaw()
     for strip in (thawed.drop_stems, thawed.drop_configurations, thawed.drop_positions):
         strip()
     frozen = HashableGlycanComposition(thawed)
     cache[structure_text] = frozen
     return frozen
Exemple #10
0
def get(id):
    '''
    Fetch the structure for `id` from :title-reference:`GlycomeDB`.

    When `check_cache` reports a hit, the cached record's structure is
    returned and no request is made.
    '''
    if not check_cache(id):
        response = requests.get(get_url_template.format(id=id))
        response.raise_for_status()
        document = etree.fromstring(response.content)
        return glycoct.loads(document.find(xpath).text)
    return cache[id].structure
def has_glycosequence_processor(state, uri):
    """Parse a GlycoCT sequence referenced by `uri` into a |Glycan| and
    record it on `state`.

    The entity obtained by calling `uri` is only parsed when its
    carbohydrate format equals `NSGlycan.carbohydrate_format_glycoct`;
    otherwise `state` is left untouched.

    Parameters
    ----------
    state : ReferenceEntity or dict
        The key-value store to add annotation to.
    uri : rdflib.term.URIRef
        The `URIRef` to load structure data from.

    Returns
    -------
    BoundURIRef
    """
    referenced = uri()
    is_glycoct = referenced.in_carbohydrate_format == NSGlycan.carbohydrate_format_glycoct
    if not is_glycoct:
        return uri
    # trailing underscore in case a URI would claim "structure"
    state["structure_"] = [glycoct.loads(referenced.has_sequence)]
    return uri
Exemple #12
0
    def translate_response(self, response):
        """Yield ``(structure, motif class)`` pairs for the rows of `response`.

        Each row is a ``(name, glycosequence, taxon, motif)`` tuple. A row is
        dropped when any callable in `self.filter_functions` returns a truthy
        value for it, when its sequence raises `glycoct.GlycoCTError`, or when
        any other exception occurs (which is reported through `self.error`).
        """
        for name, glycosequence, taxon, motif in response:
            taxon = parse_taxon(taxon)
            try:
                structure = glycoct.loads(glycosequence, structure_class=NamedGlycan)
                structure.name = name

                for func in self.filter_functions:
                    if func(structure, name=name, taxon=taxon, motif=motif):
                        # First filter that fires rejects the row.
                        break
                else:
                    yield structure, motif_to_class_map[motif]
            except glycoct.GlycoCTError:
                pass
            except Exception as e:
                self.error("Error in translate_response of %s" % name, e)
def get(id):
    '''
    Fetch the structure for `id` from :title-reference:`GlycomeDB`.

    When `check_cache` reports a hit, the cached record's structure is
    returned and no request is made. Otherwise the record is downloaded and
    only the GlycoCT text found at `xpath` is parsed. To download a more
    informative record, use :func:`get_record`

    Parameters
    ----------
    id: str or int

    Returns
    -------
    Glycan
    '''
    if not check_cache(id):
        response = requests.get(get_url_template.format(id=id))
        response.raise_for_status()
        document = etree.fromstring(response.content)
        return glycoct.loads(document.find(xpath).text)
    return cache[id].structure
Exemple #14
0
 def __init__(self, stream=None, key_transform=identity, value_transform=identity):
     """Load motif definitions from an hjson stream into this mapping.

     When `stream` is None the packaged ``data/motifs.hjson`` resource is
     used. Every entry is parsed with `glycoct.loads`, annotated with its
     name, class, category and core-motif flag, and stored under its name.
     The sets of classes and categories seen are kept on the instance.
     `key_transform` and `value_transform` are accepted but not used here.
     """
     if stream is None:
         stream = pkg_resources.resource_stream(__name__, "data/motifs.hjson")
     entries = hjson.load(stream)
     seen_classes = set()
     seen_categories = set()
     for entry in entries:
         name = entry['name']
         class_name = entry['class']
         category = entry['category']
         parsed = glycoct.loads(entry['glycoct'])
         parsed.motif_name = name
         parsed.motif_class = class_name
         parsed.motif_category = category
         parsed.is_core_motif = entry["core_motif"]
         self[name] = parsed
         seen_classes.add(class_name)
         seen_categories.add(category)
     self._category_map = {}
     self._class_map = {}
     self.motif_classes = seen_classes
     self.motif_categories = seen_categories
Exemple #15
0
# Module-level example structure: the literal below holds a RES section
# (32 numbered residue/substituent lines) and a LIN section (31 numbered
# linkage lines), and is parsed once, when this statement runs.
glycan_structure = glycoct.loads('''
RES
1b:x-dglc-HEX-1:5
2s:n-acetyl
3b:b-dglc-HEX-1:5
4s:n-acetyl
5b:b-dman-HEX-1:5
6b:a-dman-HEX-1:5
7b:b-dglc-HEX-1:5
8s:n-acetyl
9b:a-lgal-HEX-1:5|6:d
10b:b-dgal-HEX-1:5
11b:a-dgro-dgal-NON-2:6|1:a|2:keto|3:d
12s:n-glycolyl
13b:b-dglc-HEX-1:5
14s:n-acetyl
15b:b-dgal-HEX-1:5
16s:n-acetyl
17b:b-dglc-HEX-1:5
18s:n-acetyl
19b:a-dman-HEX-1:5
20b:b-dglc-HEX-1:5
21s:n-acetyl
22b:a-lgal-HEX-1:5|6:d
23b:b-dgal-HEX-1:5
24b:a-dgro-dgal-NON-2:6|1:a|2:keto|3:d
25s:n-glycolyl
26b:b-dglc-HEX-1:5
27s:n-acetyl
28b:a-lgal-HEX-1:5|6:d
29b:b-dgal-HEX-1:5
30b:a-dgro-dgal-NON-2:6|1:a|2:keto|3:d
31s:n-acetyl
32b:a-lgal-HEX-1:5|6:d
LIN
1:1d(2+1)2n
2:1o(4+1)3d
3:3d(2+1)4n
4:3o(4+1)5d
5:5o(3+1)6d
6:6o(2+1)7d
7:7d(2+1)8n
8:7o(3+1)9d
9:7o(4+1)10d
10:10o(3+2)11d
11:11d(5+1)12n
12:6o(4+1)13d
13:13d(2+1)14n
14:13o(4+1)15d
15:15d(2+1)16n
16:5o(4+1)17d
17:17d(2+1)18n
18:5o(6+1)19d
19:19o(2+1)20d
20:20d(2+1)21n
21:20o(3+1)22d
22:20o(4+1)23d
23:23o(3+2)24d
24:24d(5+1)25n
25:19o(6+1)26d
26:26d(2+1)27n
27:26o(3+1)28d
28:26o(4+1)29d
29:29o(3+2)30d
30:30d(5+1)31n
31:1o(6+1)32d
''')
        FILTER CONTAINS(str(?sequence), "glycoct") .
        ?sequence glycan:has_sequence ?glycoct .
        ?saccharide glycan:has_motif ?motif .
        FILTER(?motif in (glycoinfo:G00026MO))
    }
""")

# Parse every row of the query result into a glycan structure, keeping only
# the rows that parse successfully.
structures = []
for i, bind in enumerate(result.bindings):
    # result.vars[1] contains the RDF key for the GlycoCT Condensed encoding of the
    # glycan structure
    text = bind[result.vars[1]]
    # Progress report every 100 rows. The number printed is the row index,
    # i.e. rows visited so far, not the number of successful parses.
    if i % 100 == 0:
        print("Parsed %d glycan structures" % (i,))
    try:
        structure = glycoct.loads(text)
        structures.append(structure)
    except Exception as ex:
        # Best-effort: report the row index, the value bound to the first
        # result variable, and the error, then move on to the next row.
        print(i, bind[result.vars[0]], ex)
        continue


def detatch_monosaccharide_substituents(composition, substituents=None):
    if substituents is None:
        substituents = []
    if not substituents:
        return composition

    gc = GlycanComposition()

    for key, value in composition.items():
    def run(self):
        """Load the GlycomeDB structure dump into a new hypothesis.

        The gzip-compressed XML dump is read from `cache_name`, downloading it
        there first when the file does not exist. A new `MS2GlycanHypothesis`
        is created and its id stored on ``self.hypothesis_id``. Every
        ``structure`` element is parsed, reduced to a composition, and stored
        as a `TheoreticalGlycanStructure`; taxon and motif associations are
        accumulated and bulk-inserted every 100 stored structures.

        Structures whose mass changes on a serialize/parse round trip, or that
        raise `glycoct.GlycoCTSectionUnsupported` or `IndexError`, are skipped
        silently. Other exceptions are logged and the structure is skipped.

        Raises
        ------
        KeyboardInterrupt, IntegrityError
            Re-raised unchanged from the per-structure processing.
        """
        self.manager.initialize()
        logger.debug("Checking %s for downloaded data", cache_name)
        if not os.path.exists(cache_name):
            response = requests.get(u'http://www.glycome-db.org/http-services/getStructureDump.action?user=eurocarbdb')
            response.raise_for_status()
            # Context manager guarantees the download is flushed and closed
            # before the file is re-opened for reading below.
            with open(cache_name, "wb") as cache_file:
                cache_file.write(response.content)

        # The document is parsed while the file is open; the handle is
        # released as soon as parsing is done instead of being leaked.
        with open(cache_name, "rb") as data_source:
            handle = gzip.GzipFile(fileobj=data_source)
            xml = etree.parse(handle)

        session = self.manager.session()
        hypothesis = MS2GlycanHypothesis(name=reference_hypothesis_name_prefix + timestamp())
        session.add(hypothesis)
        glycomedb = ReferenceDatabase.get(session, name="Glycome-DB")
        session.add(glycomedb)
        session.commit()
        self.hypothesis_id = hypothesis.id
        i = 0

        motifs = session.query(StructureMotif).all()

        drop_stems = self.drop_stems
        drop_positions = self.drop_positions

        # Taxon.get is called only for its effect on the session; the
        # returned objects are not used here.
        all_taxa = {int(t.attrib['ncbi']) for t in xml.iterfind(".//taxon")}
        for tid in all_taxa:
            Taxon.get(session, tid)
        session.flush()

        logger.info("Parsing database structures")
        taxon_acc = []
        motif_acc = []
        for structure in xml.iterfind(".//structure"):
            # Reset per structure so the error log below never reports a
            # stale accession (or fails with NameError on the first one).
            accession = None
            try:
                accession = structure.attrib['id']
                glycoct_str = structure.find("sequence").text
                taxa = [int(t.attrib['ncbi']) for t in structure.iterfind(".//taxon")]
                glycan = glycoct.loads(glycoct_str)
                # Absolute difference: a reparsed mass that is smaller than
                # the original is a parity error too.
                if abs(glycoct.loads(str(glycan)).mass() - glycan.mass()) > 0.00001:
                    # Parity Error
                    continue

                composition = GlycanComposition.from_glycan(glycan)
                if drop_stems:
                    composition.drop_stems()
                if drop_positions:
                    composition.drop_positions()
                composition.drop_configurations()
                composition.collapse()

                reduction = "ReducedEnd" if glycan.reducing_end else None

                record = TheoreticalGlycanStructure(
                    glycoct=glycoct_str,
                    composition=composition.serialize(),
                    reduction=reduction,
                    calculated_mass=glycan.mass(),
                    hypothesis_id=self.hypothesis_id)
                record.references = [ReferenceAccessionNumber.get(session, accession, glycomedb.id)]
                session.add(record)
                # Flush so record.id is available for the association rows.
                session.flush()
                for motif in motifs:
                    if motif.matches(record):
                        motif_acc.append({"motif_id": motif.id, "glycan_id": record.id})

                taxon_acc.extend({"taxon_id": tid, "entity_id": record.id} for tid in taxa)
                i += 1
                if (i % 100) == 0:
                    session.commit()
                    if taxon_acc:
                        session.execute(TheoreticalGlycanStructure.TaxonomyAssociationTable.insert(),
                                        taxon_acc)
                    if motif_acc:
                        session.execute(TheoreticalGlycanStructureToMotifTable.insert(), motif_acc)
                    taxon_acc = []
                    motif_acc = []
                    session.commit()
                    self.inform("Commit %r", i)
            except (KeyboardInterrupt, IntegrityError):
                raise
            except (glycoct.GlycoCTSectionUnsupported, IndexError):
                pass
            except Exception as e:
                # KeyboardInterrupt is handled by the first clause above, so
                # no further re-raise check is needed here.
                logger.exception("%s", accession, exc_info=e)
        # NOTE(review): structures and association rows accumulated since the
        # last multiple-of-100 commit are not committed/inserted anywhere in
        # the code visible here -- confirm a final flush happens elsewhere.
Exemple #18
0
def has_glycosequence_processor(state, uri):
    """Store the parsed GlycoCT structure referenced by `uri` on `state`.

    `uri` is called to obtain the referenced entity. Only when that entity's
    carbohydrate format equals `NSGlycan.carbohydrate_format_glycoct` is its
    sequence parsed and stored, as a one-element list under ``"structure_"``.
    `uri` is returned in every case.
    """
    referenced = uri()
    is_glycoct = referenced.in_carbohydrate_format == NSGlycan.carbohydrate_format_glycoct
    if not is_glycoct:
        return uri
    # trailing underscore in case a URI would claim "structure"
    state["structure_"] = [glycoct.loads(referenced.has_sequence)]
    return uri
Exemple #19
0
def load(name):
    """Parse the GlycoCT text stored under `name` in `structures`.

    `structure_composition.do_warn` is switched off while parsing and is
    always switched back on afterwards, including when the lookup or the
    parse raises.

    Parameters
    ----------
    name:
        Key into the module-level `structures` mapping.

    Returns
    -------
    The object returned by ``glycoct.loads``.

    Raises
    ------
    KeyError
        If `name` is not present in `structures` (assuming it is a plain
        mapping -- TODO confirm).
    """
    structure_composition.do_warn = False
    try:
        return glycoct.loads(structures[name])
    finally:
        # Without this, a failed lookup or parse would leave warnings
        # disabled for the rest of the process.
        structure_composition.do_warn = True
Exemple #20
0
 def convert(self):
     """Parse ``self.glycan_sequence`` with `glycoct.loads` and copy ``self.id`` onto the result."""
     parsed = glycoct.loads(self.glycan_sequence)
     parsed.id = self.id
     return parsed
 def convert(self):
     """Parse ``self.glycan_sequence`` with `glycoct.loads` and copy ``self.id`` onto the result."""
     parsed = glycoct.loads(self.glycan_sequence)
     parsed.id = self.id
     return parsed
Exemple #22
0
 def __init__(self, stream, key_transform=identity, value_transform=identity):
     """Fill this mapping from an hjson stream of GlycoCT strings.

     The stream's key/value pairs are loaded first. Each value is then parsed
     with `glycoct.loads`, passed through `value_transform`, and stored under
     ``key_transform(key)``. When `key_transform` changes a key, the original
     key is left in place with its raw value.

     Parameters
     ----------
     stream:
         File-like object handed to ``hjson.load``.
     key_transform:
         Callable applied to every key; kept on ``self.key_transform``.
     value_transform:
         Callable applied to every parsed structure.
     """
     self.update(hjson.load(stream))
     # Iterate over a snapshot: inserting a renamed key while iterating the
     # live items view raises RuntimeError on Python 3.
     for k, v in list(self.items()):
         self[key_transform(k)] = value_transform(glycoct.loads(v))
     self.key_transform = key_transform
Exemple #23
0
 def test_maximum_common_subtree(self):
     # The N-linked core and the branchy glycan should share a common
     # subgraph scoring exactly 6.0.
     n_core = glycans['N-Linked Core']
     branchy = glycoct.loads(branchy_glycan)
     common = subtree_search.maximum_common_subgraph(n_core, branchy)
     self.assertEqual(common.score, 6.0)
Exemple #24
0
 def test_subtree_inclusion(self):
     # Inclusion is directional: the core is inside the N-glycan, but the
     # N-glycan is not inside the core.
     n_core = glycans['N-Linked Core']
     n_glycan = glycoct.loads(broad_n_glycan)
     forward = subtree_search.subtree_of(n_core, n_glycan)
     self.assertTrue(forward)
     backward = subtree_search.subtree_of(n_glycan, n_core)
     self.assertTrue(backward is None)