def anyglycan2wurcs(self, glycan):
    """Return a WURCS sequence for a glycan object or a sequence string.

    Args:
        glycan: either a ``Glycan.Glycan`` instance or a sequence string.

    Returns:
        For a ``Glycan.Glycan`` instance: the result of converting its
        GlycoCT serialisation with ``self.glycoct2wurcs``, passed through
        ``self.fixcompwurcs`` when the result contains ``'0+'``.
        For a string: runs of blank lines are collapsed first; if the text
        then starts with ``RES`` it is converted with
        ``self.glycoct2wurcs``, otherwise the collapsed text is returned
        unchanged.
    """
    if isinstance(glycan, Glycan.Glycan):
        # Lazily create the GlycoCT writer the first time it is needed.
        if not self.glycoct_format:
            self.glycoct_format = GlycoCTFormat()
        sequence = self.glycoct2wurcs(self.glycoct_format.toStr(glycan))
        if '0+' in sequence:
            sequence = self.fixcompwurcs(sequence)
        return sequence

    # Collapse runs of blank lines before inspecting or converting the text.
    sequence = re.sub(r'\n\n+', r'\n', glycan)
    if sequence.strip().startswith('RES'):
        # Convert the normalised text; the original passed the raw input,
        # which silently discarded the normalisation done just above.
        sequence = self.glycoct2wurcs(sequence)
    return sequence
Beispiel #2
0
#!/bin/env python27

import sys

from getwiki import GlycanData

w = GlycanData()

import findpygly
from pygly.CompositionTable import PermethylCompositionTable
from pygly.GlycanFormatter import GlycoCTFormat

pctable = PermethylCompositionTable()
glycoctformat = GlycoCTFormat()

for g in w.iterglycan():
    glycan = g.getGlycan()
    if not glycan:
        continue
    for m in glycan.all_nodes():
        try:
            eltcomp = m.composition(pctable)
        except KeyError:
            print g.get('accession'), glycoctformat.mtoStr(m)
class GlyTouCanUtil(object):
    """Utility methods built on top of a glycan sequence source.

    The methods call ``self.getseq(acc, format)`` and ``self.allseq(...)``,
    which are not defined here and must be provided by the class this one
    is combined with.
    """

    # Parser / formatter instances shared by all instances of the class.
    _wurcs_mono_format = WURCS20MonoFormat()
    _wurcs_format = WURCS20Format()
    _glycoct_format = GlycoCTFormat()
    # Two-way map between 1-based node index and node letter (a-z, A-Z),
    # built on first use by fixcompwurcs().
    _alphamap = None

    def getUnsupportedCodes(self, acc):
        """Collect the reasons the WURCS sequence of ``acc`` fails to parse.

        Returns:
            A 4-tuple of sets ``(codes, substs, invalid, other)``. The first
            three hold the last whitespace-separated token of the message of
            every ``UnsupportedSkeletonCodeError``,
            ``UnsupportedSubstituentError`` and ``InvalidMonoError`` raised
            while parsing the individual residues. ``other`` holds labels
            for link-count / circularity errors raised while parsing the
            whole sequence. All four are empty when there is no WURCS
            sequence for ``acc``.
        """
        codes = set()
        substs = set()
        invalid = set()
        other = set()
        sequence = self.getseq(acc, 'wurcs')
        if not sequence:
            return codes, substs, invalid, other

        # Residues are the "[...][...]" group that follows the first "/".
        monos = sequence.split('/[', 1)[1].split(']/')[0].split('][')
        for m in monos:
            try:
                self._wurcs_mono_format.parsing(m)
            except UnsupportedSkeletonCodeError as e:
                codes.add(e.message.rsplit(None, 1)[-1])
            except UnsupportedSubstituentError as e:
                substs.add(e.message.rsplit(None, 1)[-1])
            except InvalidMonoError as e:
                invalid.add(e.message.rsplit(None, 1)[-1])
            except GlycanParseError:
                pass

        # Handler order is kept from the original; the generic
        # GlycanParseError handler must stay last.
        try:
            self._wurcs_format.toGlycan(sequence)
        except ZeroPlusLinkCountError:
            other.add("0+ link count")
        except UndeterminedLinkCountError:
            other.add("undetermined link count")
        except CircularError:
            other.add("circular")
        except LinkCountError:
            other.add("bad link count")
        except GlycanParseError:
            pass
        return codes, substs, invalid, other

    def getGlycan(self, acc, format=None):
        """Return the parsed glycan for ``acc`` or ``None``.

        Args:
            acc: accession passed to ``self.getseq``.
            format: ``'wurcs'`` or ``'glycoct'`` to restrict the attempt to
                one format; any false value tries WURCS first and then
                GlycoCT.

        Returns:
            The first successfully parsed glycan, or ``None`` when no
            sequence is available or every parse raises
            ``GlycanParseError``.
        """
        parsers = (('wurcs', self._wurcs_format),
                   ('glycoct', self._glycoct_format))
        for fmt, parser in parsers:
            if format and format != fmt:
                continue
            sequence = self.getseq(acc, fmt)
            if not sequence:
                continue
            try:
                return parser.toGlycan(sequence)
            except GlycanParseError:
                pass  # fall through to the next format, if any
        return None

    def glycoct(self, acc, fetch=None):
        """Return ``getGlycan(acc, fetch).glycoct()``.

        Returns ``None`` when there is no glycan or when ``glycoct()``
        raises ``RuntimeError``.
        """
        g = self.getGlycan(acc, fetch)
        if not g:
            return None
        try:
            return g.glycoct()
        except RuntimeError:
            return None

    def umw(self, acc, fetch=None):
        """Return the underivatized molecular weight of ``acc``.

        Returns ``None`` when there is no glycan or when the computation
        raises ``LookupError`` or ``ValueError``.
        """
        g = self.getGlycan(acc, fetch)
        if not g:
            return None
        try:
            return g.underivitized_molecular_weight()
        except (LookupError, ValueError):
            return None

    def wurcs2glycoct(self, acc):
        """Convert the WURCS sequence of ``acc`` to GlycoCT via the web API.

        Returns:
            The ``'GlycoCT'`` entry of the JSON response, or ``None`` when
            there is no WURCS sequence, the entry is missing, or a
            ``ValueError`` is raised while fetching / decoding the response.
        """
        sequence = self.getseq(acc, 'wurcs')
        if sequence:
            sequence1 = urllib.parse.quote_plus(sequence)
            url = 'https://api.glycosmos.org/glycanformatconverter/2.3.2-snapshot/wurcs2glycoct/' + sequence1
            try:
                # The context manager closes the HTTP response promptly.
                with urllib.request.urlopen(url) as response:
                    data = json.loads(response.read())
                if 'GlycoCT' in data:
                    return data['GlycoCT']
            except ValueError:
                pass
        return None

    def subsumptionbyapi(self, acc):
        """Print the subsumption triples the web API reports for ``acc``.

        The JSON response is read as a list of mappings with keys ``S``,
        ``P`` and ``O``. Distinct triples are printed in sorted order,
        grouped under their ``S`` value with a blank line between groups;
        lines whose ``O`` value equals the queried WURCS sequence are
        marked with ``>>``. Nothing is printed when there is no WURCS
        sequence for ``acc``.
        """
        sequence = self.getseq(acc, 'wurcs')
        if not sequence:
            return
        sequence1 = urllib.parse.quote_plus(sequence)
        url = 'https://api.glycosmos.org/subsumption/0.2.0/' + sequence1
        with urllib.request.urlopen(url) as response:
            data = response.read()

        # A set removes duplicate triples before sorting.
        triples = set(
            tuple(str(t.get(k)).strip() for k in ("S", "P", "O"))
            for t in json.loads(data))
        lasts = None
        for triple in sorted(triples):
            if triple[0] != lasts:
                if lasts is not None:
                    print("")
                print(triple[0])
                lasts = triple[0]
            if triple[2] == sequence:
                print(">>  " + "\t".join(triple[1:]))
            else:
                print("    " + "\t".join(triple[1:]))

    def findskel(self, skel, maxcount=None):
        """Yield ``(accession, skeleton)`` for residues matching ``skel``.

        Args:
            skel: regular expression that must match the whole skeleton
                part of a residue (the text before the first ``-`` or
                ``_``).
            maxcount: if given, sequences with more than this many
                residues are skipped; converted with ``int()``.

        Only accessions that also have a GlycoCT sequence are considered.
        """
        if maxcount is not None:
            maxcount = int(maxcount)

        # Compile both patterns once instead of once per residue.
        skelre = re.compile(r"^%s$" % (skel, ))
        splitre = re.compile(r'^(.*?)([-_].*)?$')

        for acc, format, wurcs in self.allseq(format='wurcs'):
            glycoct = self.getseq(acc, format='glycoct')
            if not glycoct:
                continue
            monos = wurcs.split('/[', 1)[1].split(']/')[0].split('][')
            if maxcount is not None and len(monos) > maxcount:
                continue
            for mono in monos:
                msk = splitre.search(mono).group(1)
                assert msk
                m = skelre.search(msk)
                if m:
                    yield acc, m.group(0)

    def multiseq(self):
        """Yield each ``(accession, format)`` that has more than one
        distinct sequence in ``self.allseq()``."""
        counts = defaultdict(set)
        for acc, fmt, seq in self.allseq():
            counts[(acc, fmt)].add(seq)
        for k, v in counts.items():
            if len(v) > 1:
                yield k

    def fixcompwurcs(self, wurcsseq, subst=None):
        """Rewrite a WURCS string whose edge count contains ``0+``.

        The header ``prefix/unodes,nodes,edges/`` is rebuilt with
        ``nodes - 1`` fully ambiguous edges of the form
        ``a?|b?|...}-{a?|b?|...`` (one alternative per node), preceded by
        one ``a?|b?|...}<sub>`` entry per requested substituent.

        Args:
            wurcsseq: WURCS string; its edge-count field must contain
                ``0+``.
            subst: optional mapping or iterable of ``(substituent, count)``
                pairs. Defaults to no substituents.

        Returns:
            The rewritten WURCS string.

        Raises:
            AssertionError: the edge-count field does not contain ``0+``.
            KeyError: more than 52 nodes (only a-z and A-Z are mapped).
        """
        if subst is None:
            # Avoid a shared mutable default argument.
            subst = []
        if not self._alphamap:
            letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
            alphamap = dict()
            for i, ch in enumerate(letters):
                alphamap[i + 1] = ch
                alphamap[ch] = i + 1
            self._alphamap = alphamap
        prefix, counts, rest = wurcsseq.split('/', 2)
        unodes, nodes, edges = counts.split(',')
        nodes = int(nodes)
        assert '0+' in edges
        edges = (nodes - 1)
        ambignode = "|".join(
            ["%s?" % (self._alphamap[i], ) for i in range(1, nodes + 1)])
        ambigedge = "%s}-{%s" % (ambignode, ambignode)
        ambigedges = [ambigedge] * edges
        if hasattr(subst, 'items'):
            subst = list(subst.items())
        for sub, cnt in subst:
            for _ in range(cnt):
                # Substituent entries go in front of the edges.
                ambigedges.insert(0, "%s}%s" % (ambignode, sub))
        return "%s/%s,%d,%d/%s%s" % (prefix, unodes, nodes, len(ambigedges),
                                     rest, "_".join(ambigedges))

    def anyglycan2wurcs(self, glycan):
        """Return a WURCS sequence for a glycan object or sequence string.

        Args:
            glycan: a ``Glycan.Glycan`` instance or a sequence string.

        Returns:
            For a ``Glycan.Glycan`` instance: its GlycoCT serialisation
            converted with ``glycoct2wurcs``, passed through
            ``fixcompwurcs`` when the result contains ``'0+'``. For a
            string: runs of blank lines are collapsed; text starting with
            ``RES`` is converted with ``glycoct2wurcs``, anything else is
            returned as is.
        """
        if isinstance(glycan, Glycan.Glycan):
            # This class only defines ``_glycoct_format``; honour a
            # ``glycoct_format`` attribute if a subclass or instance
            # supplies one, otherwise use the shared formatter instead of
            # failing with AttributeError.
            formatter = getattr(self, 'glycoct_format', None)
            if not formatter:
                formatter = self._glycoct_format
            sequence = self.glycoct2wurcs(formatter.toStr(glycan))
            if '0+' in sequence:
                sequence = self.fixcompwurcs(sequence)
            return sequence

        sequence = re.sub(r'\n\n+', r'\n', glycan)
        if sequence.strip().startswith('RES'):
            # Send the normalised text; the original sent the raw input and
            # so discarded the blank-line clean-up done just above.
            sequence = self.glycoct2wurcs(sequence)
        return sequence

    def glycoct2wurcs(self, seq):
        """Convert a GlycoCT sequence to WURCS via the web API.

        Returns:
            The stripped ``"WURCS"`` entry of the JSON response.

        Raises:
            ValueError: the response has no ``"WURCS"`` entry.
        """
        requestURL = "https://api.glycosmos.org/glycanformatconverter/2.3.2-snapshot/glycoct2wurcs/"
        encodedseq = urllib.parse.quote(seq, safe='')
        requestURL += encodedseq
        req = urllib.request.Request(requestURL)
        with urllib.request.urlopen(req) as response:
            result = json.loads(response.read())

        try:
            wurcs = result["WURCS"]
        except (KeyError, TypeError):
            # Missing key, or a response that is not a JSON object.
            raise ValueError("GlycoCT 2 WURCS conversion failed")

        return wurcs.strip()
Beispiel #4
0
#!/bin/env python27

import os, sys
import json
import findpygly
from pygly.GlycanFormatter import GlycoCTFormat, WURCS20Format
from pygly.GlyTouCan import GlyTouCan

# GlyTouCan client instance.
g = GlyTouCan()

# Parsers for the two sequence formats.
wurcs_parser = WURCS20Format()
glycoct_parser = GlycoCTFormat()

# Directories that are listed for dumped sequence files, one per format.
wpath = "dumps/wurcs"
gpath = "dumps/glycoct"

# Union of the file names found in either directory, duplicates removed.
wlist = os.listdir(wpath)
glist = os.listdir(gpath)
alllist = list(set(wlist+glist))
print "Total glycan number %s" % len(alllist)

glycanobj = {}
for filename in alllist:
    # NOTE(review): rstrip(".txt") strips any trailing '.', 't' or 'x'
    # characters, not the literal ".txt" suffix; a name ending in one of
    # those characters before the extension would be truncated too.
    acc = filename.rstrip(".txt")

    try:
        # Try the GlycoCT file first.
        # NOTE(review): the file object is never explicitly closed.
        gseq = open(os.path.join(gpath, filename)).read().strip()
        obj = glycoct_parser.toGlycan(gseq)

    # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit.
    except:
        # The fallback handler continues beyond this excerpt.
        try:
Beispiel #5
0
#!/bin/env python27

import sys
from collections import defaultdict

from getwiki import GlycanData, Glycan
from pygly.GlycanFormatter import GlycoCTFormat

w = GlycanData()
glycoctformat = GlycoCTFormat()

# Load a two-column, whitespace-separated table from the file named by the
# first command-line argument: first column is the key, second the value.
# The ``with`` block guarantees the file is closed once it has been read.
monosdb = {}
with open(sys.argv[1], 'r') as f:
    for line in f:
        k, v = line.split()
        monosdb[k] = v

# For each glycan, gather the monosdb values of its monosaccharides.
for g in w.iterglycan():
    acc = g.get('accession')
    # Values from monosdb found for this glycan's monosaccharides.
    monodbids = set()
    glycan = g.getGlycan()
    if not glycan:
        # No glycan object available for this entry; skip it.
        continue
    for m in glycan.all_nodes():
        try:
            glycoctsym = glycoctformat.mtoStr(m)
        except KeyError:
            # mtoStr raised KeyError for this monosaccharide; skip it.
            continue
        try:
            monodbids.add(monosdb[glycoctsym])
        # The handler body for symbols missing from monosdb lies beyond
        # this excerpt.
        except KeyError:
Beispiel #6
0
import sys, re
from collections import defaultdict

# from getwiki import GlycanData
import findpygly
from pygly.Glycan import Glycan
from pygly.GlycanFormatter import GlycoCTFormat
from pygly.GlycanResource import GlyTouCan, GlyCosmos

# w = GlycanData()
glycoctformat = GlycoCTFormat()

# Lookup table from descriptor string to short composition symbol.
# NOTE(review): the keys look like GlycoCT-style residue / substituent
# descriptors with unspecified anomer and ring; confirm against the code
# that builds the lookup keys.
basecomp = {
    'x-HEX-x:x': 'Hex',
    'x-HEX-x:x||(2d:1)n-acetyl': 'HexNAc',
    'x-dgro-dgal-NON-x:x|1:a|2:keto|3:d||(5d:1)n-acetyl': 'NeuAc',
    'x-dgro-dgal-NON-x:x|1:a|2:keto|3:d||(5d:1)n-glycolyl': 'NeuGc',
    'x-HEX-x:x|6:d': 'dHex',
    'x-lgal-HEX-x:x|6:d': 'Fuc',
    'x-PEN-x:x': 'Pent',
    'x-dgro-dgal-NON-x:x|1:a|2:keto|3:d': 'KDN',
    'x-HEX-x:x|6:a': 'HexA',
    'phosphate': 'P',
    'sulfate': 'S',
}

# Whitespace-separated list of entries, one per line, turned into a set.
# NOTE(review): presumably skeleton codes to be rejected; confirm at the
# point of use.
badskel = set("""
axxxxh-1x
""".split())

# NeuAc, NeuGc, KDN, Fuc, Hex, HexNAc, dHex, HexA, Pent
expskel = set("""
  AUd21122h_5*NCC/3=O
Beispiel #7
0
from collections import defaultdict
from pygly.GlycanFormatter import WURCS20Format, GlycoCTFormat

# Has to come first to pick out the --smwenv PROD command-line argument.
from getwiki import GlycoMotifWiki, AllMotif
w = GlycoMotifWiki()

# Three file paths taken from the first three command-line arguments.
# NOTE(review): whether these are inputs or outputs is not visible in this
# excerpt; "non" / "red" presumably refer to non-reducing / reducing end.
topology_file_path = sys.argv[1]
non_file_path = sys.argv[2]
red_file_path = sys.argv[3]



# Format objects for WURCS and GlycoCT sequences.
wp = WURCS20Format()
gp = GlycoCTFormat()



class RootMonosaccharideTopoLeq(alignment.MonosaccharideImageEqual):
    """MonosaccharideImageEqual variant whose ``leq`` is plain equality."""

    def leq(self, a, b):
        # "Less than or equal" is defined as the inherited eq() test.
        return self.eq(a, b)

class MonosaccharideTopoLeq(alignment.MonosaccharideTopoEqual):
    """MonosaccharideTopoEqual variant whose ``leq`` is plain equality."""

    def leq(self, a, b):
        # "Less than or equal" is defined as the inherited eq() test.
        return self.eq(a, b)

class LinkageTopoLeq(alignment.LinkageTopoEqual):
class Glycan(SMW.SMWClass):
    """Glycan record with a keyed collection of annotations.

    Annotations are stored in ``self._annotations`` (created on first
    use), indexed by each annotation's ``key()``.
    """

    template = 'Glycan'

    # Parsers used by getGlycan(), shared by all instances.
    glycoct_format = GlycoCTFormat()
    wurcs_format = WURCS20Format()

    @staticmethod
    def pagename(**kwargs):
        """Return the ``accession`` keyword, which must be non-empty."""
        assert kwargs.get('accession')
        return kwargs.get('accession')

    def toPython(self, data):
        """Delegate to the base class conversion and return its result."""
        data = super(Glycan, self).toPython(data)

        # if '_subobjs' in data:
        #     del data['_subobjs']

        return data

    def toTemplate(self, data):
        """Delegate to the base class conversion and return its result."""
        data = super(Glycan, self).toTemplate(data)

        return data

    def set_annotation(self, **kwargs):
        """Store an annotation, replacing any existing one with its key.

        Pass a ready-made object as ``annotation=...``; otherwise the
        keyword arguments are used to construct an ``Annotation``.
        Annotations whose ``goodvalue()`` is false are ignored.
        """
        if 'annotation' in kwargs:
            ann = kwargs.get('annotation')
        else:
            ann = Annotation(**kwargs)
        if not ann.goodvalue():
            return
        if not hasattr(self, '_annotations'):
            self._annotations = dict()
        self._annotations[ann.key()] = ann

    def add_annotation(self, **kwargs):
        """Append ``value`` to the values of the matching annotation.

        The remaining keyword arguments select the annotation. If none
        matches, a new annotation holding ``[value]`` is stored.
        """
        assert kwargs.get('value')
        value = kwargs.pop('value')
        if self.has_annotations(**kwargs):
            # get_annotation_values() returns a real list, so append works.
            values = self.get_annotation_values(**kwargs)
            values.append(value)
        else:
            values = [value]
        self.set_annotation(value=values, **kwargs)

    def delete_annotations(self, **kwargs):
        """Remove every annotation matching the given filters."""
        if not hasattr(self, '_annotations'):
            return
        # Materialise first: the dict is modified while deleting.
        for an in list(self.annotations(**kwargs)):
            del self._annotations[an.key()]

    def count_annotations(self, **kwargs):
        """Return the number of annotations matching the given filters."""
        return sum(1 for _ in self.annotations(**kwargs))

    def get_annotation_values(self, property=None, **kwargs):
        """Return the matching annotation's values as a list of strings.

        A list is returned on every Python version; a bare ``map`` is an
        iterator on Python 3 and broke ``add_annotation``, which appends
        to the result.

        Raises:
            LookupError: zero or several annotations match.
        """
        values = self.get_annotation(property=property, **kwargs).get('value')
        return [str(v) for v in values]

    def get_annotation_value(self, property=None, **kwargs):
        """Return the matching annotation's value converted with ``str``.

        Raises:
            LookupError: zero or several annotations match.
        """
        return str(
            self.get_annotation(property=property, **kwargs).get('value'))

    def get_annotation(self, property=None, **kwargs):
        """Return the single annotation matching the filters.

        Raises:
            LookupError: zero or several annotations match.
        """
        if property is not None:
            kwargs['property'] = property
        anns = list(self.annotations(**kwargs))
        if len(anns) == 0:
            raise LookupError("No matching annotations")
        elif len(anns) > 1:
            raise LookupError("Too many annotations")
        return anns[0]

    def has_annotations(self, **kwargs):
        """Return True if at least one annotation matches the filters."""
        return any(True for _ in self.annotations(**kwargs))

    def annotations(self,
                    type=None,
                    property=None,
                    source=None,
                    sourceid=None):
        """Yield the stored annotations, in sorted key order, that match.

        A filter left as ``None`` matches everything; otherwise the
        annotation's field of the same name must equal the filter value.
        """
        if not hasattr(self, '_annotations'):
            return
        wanted = (('type', type), ('property', property),
                  ('source', source), ('sourceid', sourceid))
        for key in sorted(self._annotations):
            an = self._annotations[key]
            if all(val is None or an.get(field) == val
                   for field, val in wanted):
                yield an

    def __str__(self):
        """Base class text followed by one line per annotation."""
        sl = [super(Glycan, self).__str__()]
        for an in self.annotations():
            sl.append(str(an))
        return "\n".join(sl)

    def getGlycan(self):
        """Parse the glycan from its WURCS, else GlycoCT, annotation.

        Returns:
            The parsed glycan, or ``None`` when neither annotation can be
            looked up and parsed (``LookupError``, ``GlycanParseError`` and
            ``RuntimeError`` are treated as failure).
        """
        attempts = (('WURCS', self.wurcs_format),
                    ('GlycoCT', self.glycoct_format))
        for prop, parser in attempts:
            try:
                sequence = self.get_annotation_value(prop)
                return parser.toGlycan(sequence)
            except (LookupError, GlycanParseError, RuntimeError):
                pass
        return None
import time
import findpygly
import pygly.alignment
from pygly.GlycanFormatter import GlycoCTFormat, WURCS20Format
from pygly.GlycanResource.GlyTouCan import GlyTouCanNoCache
from pygly.GlycanResource.GlyCosmos import GlyCosmosNoCache
from getwiki import GlycoMotifWiki
w = GlycoMotifWiki()

# Optional first command-line argument, e.g. "../data/motif_alignments.tsv";
# None when no argument is given.
res_file_path = sys.argv[1] if len(sys.argv) > 1 else None

# Format objects and the GlyTouCan client (the "NoCache" variant).
wp = WURCS20Format()
gp = GlycoCTFormat()
gtc = GlyTouCanNoCache()

# One ConnectedNodesCache instance is passed to all four matchers below.
nodes_cache = pygly.alignment.ConnectedNodesCache()

# "Inclusive" matchers: general and non-reducing-end variants.
loose_matcher = pygly.alignment.MotifInclusive(
    connected_nodes_cache=nodes_cache)
loose_nred_matcher = pygly.alignment.NonReducingEndMotifInclusive(
    connected_nodes_cache=nodes_cache)

# "Strict" matchers: general and non-reducing-end variants.
strict_matcher = pygly.alignment.MotifStrict(connected_nodes_cache=nodes_cache)
strict_nred_matcher = pygly.alignment.NonReducingEndMotifStrict(
    connected_nodes_cache=nodes_cache)

# Filled by the loop that follows (its body is beyond this excerpt).
motif_gobjs = {}
for m in w.itermotif():
Beispiel #10
0
    linkCheck = GlycanLinkCompatibleEitherway()
    monoCheck = MonosaccharideCompatibleOneway()
    rootMonoCheck = MonosaccharideCompatibleOneway()


if __name__ == "__main__":
    seq1 = """RES
    1b:x-dgal-HEX-1:5
    2b:a-dgal-HEX-1:5
    LIN
    1:1o(3+1)2d"""

    seq2 = """RES
    1b:x-dglc-HEX-1:5
    2s:n-acetyl
    3b:b-dgal-HEX-1:5
    4b:a-dgal-HEX-1:5
    LIN
    1:1d(2+1)2n
    2:1o(4+1)3d
    3:3o(4+1)4d"""

    wurcsp = WURCS20Format()
    glycoctp = GlycoCTFormat()

    g1 = glycoctp.toGlycan(seq1)
    g2 = glycoctp.toGlycan(seq2)

    mstsa = MotifSearchTopologicalSameAs()
    print mstsa.get(g1, g2)
Beispiel #11
0
#!/bin/env python27

import sys, traceback
from getwiki import GlycoMotifWiki, UniCarbMotif
# Wiki handle, created before the remaining imports.
w = GlycoMotifWiki()

from pygly.GlyTouCan import GlyTouCan
gtc = GlyTouCan()

from gtccache import GlyTouCanCache
gtccache = GlyTouCanCache()

# Parsers for GlycoCT and (extended) IUPAC sequences.
from pygly.GlycanFormatter import GlycoCTFormat, IUPACParserExtended1
gparser = GlycoCTFormat()
imparser = IUPACParserExtended1()

# Rows come from the spreadsheet named by the first command-line argument.
from dataset import XLSXFileTable
rows = XLSXFileTable(sys.argv[1])

possibleaglycon = ["Cer", "R", "Ser/Thr"]
reaglycon = ["Ser/Thr", "Cer", "Other"]
current = set()
for r in rows:
    # NOTE(review): ``id`` shadows the builtin of the same name.
    id = r["ID"]
    name = r["Name"]
    iupacseq = r["IUPAC"]

    # Zero-padded six-digit accession built from the numeric ID.
    accession = "%06d" % id
    if not iupacseq:
        # Rows without an IUPAC sequence are skipped.
        continue
Beispiel #12
0
#!/usr/bin/env python27

import sys, os, os.path
import findpygly
from pygly.GlyTouCan import GlyTouCan
from pygly.GlycanFormatter import GlycoCTFormat

glycoct_format = GlycoCTFormat()
gtc = GlyTouCan()

for l in sys.stdin:
    acc = l.strip()
    g = gtc.getGlycan(acc)
    if g and g.fully_determined():
        print acc, True
        if not os.path.exists('%s.txt' % (acc, )):
            seq = gtc.getseq(acc, 'glycoct')
            if not seq:
                try:
                    seq = glycoct_format.toStr(g)
                except:
                    pass
            if seq:
                wh = open('%s.txt' % (acc, ), 'w')
                wh.write(seq.strip() + '\n')
                wh.close()
    else:
        print acc, (False if g else None)
Beispiel #13
0
    getargs = {}
    for i in range(1, len(sys.argv), 2):
        key = sys.argv[i]
        try:
            value = sys.argv[i + 1]
            value = float(sys.argv[i + 1])
            value = int(sys.argv[i + 1])
        except ValueError:
            pass
        getargs[key] = value
    if count:
        cnt = gdb.count(**getargs)
        print dbname, cnt
    elif glycoct:
        fmt1 = GlycoCTFormat()
        zf = ZipFile(dbname.rsplit('.', 1)[0] + '.gct', 'w', ZIP_DEFLATED)
        for r in gdb.get(**getargs):
            zf.writestr("%s.txt" % r.accession, fmt1.toStr(r.glycan))
        zf.close()
    else:
        fmt = LinearCodeFormat()
        for r in gdb.get(**getargs):
            if r['lincode']:
                lc = fmt.toStr(r.glycan)
                print r.accession, r['nlinked'], r.get(
                    'oxford', "-"), r['molecular_weight'], r['composition'], lc
                print r
            else:
                print r.accession, r['nlinked'], r.get(
                    'oxford', "-"), r['molecular_weight'], r['composition']
def substructure_search_init(shared_resources, structure_list_file_path, PPID):
    """Worker loop that answers motif (substructure) search tasks.

    Loads every glycan listed in ``structure_list_file_path`` once, then
    loops forever: takes a task from the task queue, matches its motif
    against all loaded glycans and puts a result dict on the result queue.

    Args:
        shared_resources: pair ``(task_queue, result_queue)``.
        structure_list_file_path: text file with one
            ``<accession> <WURCS sequence>`` pair per line.
        PPID: identifier of this worker, used only in log messages.

    Each task is a mapping with keys ``"id"``, ``"seq"`` (GlycoCT or WURCS
    text) and ``"motif_match_position"`` (``"reo"`` restricts matching to
    the root; any other value matches anywhere).

    The result dict has keys ``"id"``, ``"start time"``, ``"end time"``,
    ``"alignment calculation time"``, ``"matches"`` (list of accessions)
    and ``"error"`` (list of messages; matching is skipped when non-empty).

    This function never returns. It reads the module-level name
    ``max_motif_size``, which is defined outside this function.
    """
    print >> sys.stderr, "Computing Processor%s is starting" % PPID
    task_queue, result_queue = shared_resources

    gp = GlycoCTFormat()
    wp = WURCS20Format()

    motif_match_connected_nodes_cache = pygly.alignment.ConnectedNodesCache()
    mm1 = pygly.alignment.GlyTouCanMotif(
        connected_nodes_cache=motif_match_connected_nodes_cache)

    glycans = {}
    # ``with`` closes the structure list as soon as it has been loaded.
    with open(structure_list_file_path) as structure_list:
        for line in structure_list:
            acc, s = line.strip().split()
            glycans[acc] = wp.toGlycan(s)
    print >> sys.stderr, "Processor-%s: finishes loading %s glycans" % (
        PPID, len(glycans))

    while True:
        task_detail = task_queue.get(block=True)

        print >> sys.stderr, "Processor-%s: Job %s received." % (
            PPID, task_detail["id"])

        seq = task_detail["seq"]
        jobid = task_detail["id"]
        motif_match_position = task_detail["motif_match_position"]

        motif_matcher = mm1

        # Only "reo" changes the matching mode; "anywhere" and any
        # unrecognised value match anywhere in the glycan.
        rootOnly = (motif_match_position == "reo")
        anywhereExceptRoot = False

        matches = []
        error = []
        calculation_start_time = time.time()

        motif = None
        try:
            if "RES" in seq:
                motif = gp.toGlycan(seq)
            elif "WURCS" in seq:
                motif = wp.toGlycan(seq)
            else:
                raise RuntimeError
        except Exception:
            # Any parse failure is reported to the client, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            error.append("Unable to parse")

        if not error:
            motif_node_num = len(list(motif.all_nodes()))
            if motif_node_num > max_motif_size:
                error.append("Motif is too big")

        # TODO time out mechanism to avoid running for too long
        if error:
            # Report once, up front; previously this sat inside the glycan
            # loop and was skipped entirely when no glycans were loaded.
            for e in error:
                print >> sys.stderr, "Processor-%s: Issues (%s) is found with task %s" % (
                    PPID, e, task_detail["id"])
        else:
            for acc, glycan in glycans.items():
                if motif_matcher.leq(motif,
                                     glycan,
                                     rootOnly=rootOnly,
                                     anywhereExceptRoot=anywhereExceptRoot):
                    matches.append(acc)

        calculation_end_time = time.time()
        calculation_time_cost = calculation_end_time - calculation_start_time

        res = {
            "id": jobid,
            "start time": calculation_start_time,
            "end time": calculation_end_time,
            "alignment calculation time": calculation_time_cost,
            "matches": matches,
            "error": error
        }
        print >> sys.stderr, "Processor-%s: Job %s finished within %ss" % (
            PPID, task_detail["id"], calculation_time_cost)
        result_queue.put(res)