Example #1
0
def main():
    """ re-Index all the Proxteam corpus """
    parser = argparse.ArgumentParser()
    
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    parser.add_argument("--key" , action='store', help="key", default=None)
    parser.add_argument("--username" , action='store', help="user", default=None)
    parser.add_argument("--password" , action='store', help="pwd", default=None)
    parser.add_argument("--path", action='store', help="path", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)

    args = parser.parse_args()
    
    bot = Botagraph(args.host, args.key)

    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    (Initiales, GSR, Matrix) = load_OCR_file(args.path)
    if not bot.has_graph(GID):
        bot.create_graph(GID, {'description': 'a graph of Old Chinese phonology',
                                     'image': "",
                                     'tags': ['chinese','phonology']})
        bot.post_nodetype(GID, 'GSR', 'Karlgren', {}) 
        bot.post_nodetype(GID, 'Initial', '', {}) 
        bot.post_edgetype(GID, 'Sinograms', '',{})

    print "Get schema '%s'" % GID
    schema = bot.get_schema(GID)['schema']
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    edgetypes = { e['name']:e for e in schema['edgetypes'] }
    Inidict = {}
    for  i, (_, uuid) in enumerate(bot.post_nodes(GID,
        ({'nodetype':nodetypes['Initial']['uuid'], 'properties':{'label':i}} for i in Initiales))): 
        Inidict[Initiales[i]] = uuid
    
    GSRdict = {}
    for  i, (_, uuid) in enumerate(bot.post_nodes(GID,({'nodetype':nodetypes['GSR']['uuid'], 'properties': {'label':s}} for s in GSR))): 
        GSRdict[GSR[i]] = uuid
    edges = [{'edgetype':edgetypes['Sinograms']['uuid'],
              'properties':{'label': u",".join(sinos)},
              'source': GSRdict[gsr],
              'target': Inidict[i]} for (i, gsr), sinos in Matrix.iteritems()]
    #for e in edges:
    #    print e
    #    bot.post_edge(GID, e)
    for _ in bot.post_edges(GID, iter(edges)):
        pass
Example #2
0
def main():
    """  """
    parser = argparse.ArgumentParser()

    parser.add_argument("--host",
                        action='store',
                        help="host",
                        default="http://localhost:5000")
    parser.add_argument("--key", action='store', help="key", default=None)
    parser.add_argument("--username",
                        action='store',
                        help="user",
                        default=None)
    parser.add_argument("--password", action='store', help="pwd", default=None)
    parser.add_argument("--path", action='store', help="path", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)

    args = parser.parse_args()

    # Bot creation & login
    key = open(args.key, 'r').read().strip()
    print "\n * Connecting to %s \n  " % args.host
    bot = Botagraph(args.host, key)

    # read / parse graph
    print "\n * Reading %s" % args.path
    g = Graph(args.gid)
    gid = g.gid

    if bot.has_graph(gid):
        bot.delete_graph(gid)

    if not bot.has_graph(gid):
        print "\n * Create graph %s" % gid
        bot.create_graph(gid, g.attrs)

        print "\n * Creating node type %s" % "Country"
        props = {"code": Text(), "label": Text()}
        bot.post_nodetype(gid, "Country", "Country ", props)

        print "\n * Creating edge type %s" % "alliance"
        props = {
            'id': Numeric(),
            'label': Text(),
            'starts': Text(),
            'ends': Text(),
            'defense': Numeric(),
            'neutrality': Numeric(),
            'nonaggression': Numeric(),
            'entente': Numeric(),
        }
        bot.post_edgetype(gid, "alliance", "alliance terms", props)

    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    import csv
    with open(args.path, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for i, row in enumerate(reader):
            # undirected
            if i == 0: continue
            if i % 2 != 0: continue

            node = lambda x: {
                'nodetype': nodetypes["Country"]['uuid'],
                'properties': {
                    'code': x[0],
                    'label': x[1]
                }
            }

            g.vs[row[1]] = node(row[1:3])
            g.vs[row[3]] = node(row[3:5])

            left_censor, right_censor = row[11:13]

            alliances = "defense neutrality nonaggression entente".split()
            es = dict(zip(alliances, row[13:17]))
            es['starts'] = "/".join(row[5:8])
            es['ends'] = "/".join(row[8:11])
            es['id'] = row[0]
            es['label'] = " ".join([a for a in alliances if es[a] in (1, "1")])

            es.update()

            g.es[i] = {
                'edgetype': edgetypes["alliance"]['uuid'],
                'source': row[1],
                'target': row[3],
                'properties': es
            }

        print len(g.vs), len(g.es)

    idx = {}
    for node, uuid in bot.post_nodes(gid, g.vs.itervalues()):
        idx[node['properties']['code']] = uuid

    bot.star_nodes(gid, idx.values())

    print "%s nodes inserted . " % (len(idx))

    for edge in g.es.itervalues():
        edge['source'] = idx[edge['source']]
        edge['target'] = idx[edge['target']]

    fail = count = 0
    for edge, uuid in bot.post_edges(gid, g.es.itervalues()):
        if not uuid:
            fail += 1
        else:
            count += 1

    print "%s edges inserted, %s failed " % (count, fail)
Example #3
0
def main():
    """ re-Index all the Proxteam corpus """
    parser = argparse.ArgumentParser()

    parser.add_argument("--host",
                        action='store',
                        help="host",
                        default="http://localhost:5000")
    parser.add_argument("--key", action='store', help="key", default=None)
    parser.add_argument("--username",
                        action='store',
                        help="user",
                        default=None)
    parser.add_argument("--password", action='store', help="pwd", default=None)
    parser.add_argument("--path", action='store', help="path", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)

    args = parser.parse_args()

    # Bot creation & login
    bot = Botagraph(args.host, args.key)

    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    # read / parse graph
    graph = py2neo.Graph()

    # create empty graph
    gid = args.gid

    if not bot.has_graph(gid):
        print "create graph %s" % gid
        bot.create_graph(
            gid, {
                'description': SCHEMA["description"],
                'image': "",
                'tags': SCHEMA['tags']
            })
        for infos in SCHEMA["Nodes"]:
            print "create node type %s" % infos["type"]
            bot.post_nodetype(gid, *buildType(infos))
        for infos in SCHEMA["Links"]:
            print "create edge type %s" % infos["type"]
            bot.post_edgetype(gid, *buildType(infos))

    print "Get schema '%s'" % gid
    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    print nodetypes
    print edgetypes

    idx = {}

    print "posting nodes"
    count = 0
    fail = 0
    for infos in SCHEMA['Nodes']:
        for node, uuid in bot.post_nodes(
                gid, gen_nodes(graph, nodetypes[infos['type']]['uuid'],
                               infos)):
            if not uuid:
                fail += 1
            else:
                count += 1
                idx[node['properties'][SCHEMA['key']]] = uuid

    print "%s nodes inserted " % count

    # post edges
    print "posting edges"
    count = 0
    fail = 0

    inv_idx = {v: k for k, v in idx.iteritems()}
    for infos in SCHEMA['Links']:
        for obj, uuid in bot.post_edges(
                gid,
                gen_edges(graph, edgetypes[infos['type']]['uuid'], idx,
                          infos)):
            if not uuid:
                fail += 1
            else:
                count += 1
            #print "%s [ %s -- %s --> %s ] " % ( uuid, inv_idx.get(obj['source'], None) , "syn", inv_idx.get(obj['target'], None) )
    print "%s edges inserted, %s failed " % (count, fail)
Example #4
0
def main():
    """ re-Index all the Proxteam corpus """
    parser = argparse.ArgumentParser()
    
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    parser.add_argument("--key" , action='store', help="key", default=None)
    parser.add_argument("--username" , action='store', help="user", default=None)
    parser.add_argument("--password" , action='store', help="pwd", default=None)
    parser.add_argument("--path", action='store', help="path", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)

    args = parser.parse_args()


    # Bot creation & login 
    bot = Botagraph(args.host, args.key)

    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    # read / parse graph
    graph = py2neo.Graph()

    # create empty graph
    gid =  args.gid


    if not bot.has_graph(gid) :
        print "create graph %s" % gid
        bot.create_graph(gid, { 'description': SCHEMA["description"],
                                'image': "",
                                'tags': SCHEMA['tags']
                              }
                        )
        for infos in SCHEMA["Nodes"]:
            print "create node type %s" % infos["type"]
            bot.post_nodetype(gid, *buildType(infos))
        for infos in SCHEMA["Links"]:
            print "create edge type %s" % infos["type"]
            bot.post_edgetype(gid, *buildType(infos))

    print "Get schema '%s'" % gid
    schema = bot.get_schema(gid)['schema']
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    edgetypes = { e['name']:e for e in schema['edgetypes'] }

    print nodetypes
    print edgetypes

    idx = {}
    
    print "posting nodes"
    count = 0
    fail = 0
    for infos in SCHEMA['Nodes']:
        for node, uuid in bot.post_nodes( gid, gen_nodes(graph, nodetypes[infos['type']]['uuid'], infos) ):
            if not uuid:
                fail += 1
            else :
                count += 1
                idx[node['properties'][SCHEMA['key']]] = uuid
            
    print "%s nodes inserted " % count

    # post edges
    print "posting edges"
    count = 0
    fail = 0

    inv_idx = { v:k for k,v in idx.iteritems() }
    for infos in SCHEMA['Links']: 
        for obj, uuid in bot.post_edges( gid, gen_edges(graph, edgetypes[infos['type']]['uuid'], idx, infos) ):
            if not uuid:
                fail += 1
            else :
                count += 1
            #print "%s [ %s -- %s --> %s ] " % ( uuid, inv_idx.get(obj['source'], None) , "syn", inv_idx.get(obj['target'], None) )
    print "%s edges inserted, %s failed " % (count, fail)
Example #5
0
def main():
    """ re-Index all the Proxteam corpus """
    parser = argparse.ArgumentParser()

    parser.add_argument("--host",
                        action='store',
                        help="host",
                        default="http://localhost:5000")
    parser.add_argument("--key", action='store', help="key", default=None)
    parser.add_argument("--username",
                        action='store',
                        help="user",
                        default=None)
    parser.add_argument("--password", action='store', help="pwd", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)
    parser.add_argument("files", action='store', default=None, nargs='+')

    args = parser.parse_args()

    print " analysing {}".format("".join(args.files))

    # Bot creation & login
    bot = Botagraph(args.host, args.key)

    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    gid = args.gid
    if not bot.has_graph(gid):
        print "create graph %s" % gid
        bot.create_graph(gid, "no description")
        print "create node type %s" % "file"
        props = {"label": Text()}
        bot.post_nodetype(gid, "file", "no description", props)
        bot.post_nodetype(gid, "message", "no description", props)
        print "create edge type %s" % "is_syn"
        bot.post_edgetype(gid, "listen", "no desc", {})
        bot.post_edgetype(gid, "trigger", "no desc", {})

    print "Get schema '%s'" % gid
    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}
    nodetype_file = nodetypes['file']['uuid']
    nodetype_message = nodetypes['message']['uuid']
    edgetype_listen = edgetypes['listen']['uuid']
    edgetype_trigger = edgetypes['trigger']['uuid']

    vs = {}
    es = []

    for js_file in args.files:
        if js_file.endswith('min.js'):
            #ignore minified js
            continue
        with codecs.open(js_file, 'r', 'utf8') as FILE:
            js_file = re.sub('^[./]+', '', js_file)
            vs[js_file] = {'label': js_file, 'nodetype': nodetype_file}

            for l in FILE:
                #remove comments:
                l = l.strip().split('//', 1)[0]
                m = re.search(r'listenTo\([^"]*"([a-z-]+)"', l)
                if m:
                    message = m.group(1)
                    if message not in vs:
                        vs[message] = {
                            'label': message,
                            'nodetype': nodetype_message
                        }
                    es.append((js_file, edgetype_listen, message))
                m = re.search(r'trigger\([^"]*"([a-z-]+)"', l)
                if m:
                    message = m.group(1)
                    if message not in vs:
                        vs[message] = {
                            'label': message,
                            'nodetype': nodetype_message
                        }
                    es.append((js_file, edgetype_trigger, message))

    idx = {}
    print "posting nodes"
    count = 0
    fail = 0
    for node, uuid in bot.post_nodes(gid, gen_nodes(vs)):
        if not uuid:
            fail += 1
        else:
            count += 1
            idx[node['properties']['label']] = uuid

    print "%s nodes inserted " % count

    #print "iterate over nodes"
    #for node in bot.find_all_nodes(gid, "word", {}):
    #pass#print node

    # post edges
    print "posting edges"
    count = 0
    fail = 0

    inv_idx = {v: k for k, v in idx.iteritems()}

    for _, uuid in bot.post_edges(gid, gen_edges(es, idx)):
        if not uuid:
            fail += 1
        else:
            count += 1
        #print "%s [ %s -- %s --> %s ] " % ( uuid, inv_idx.get(obj['source'], None) , "syn", inv_idx.get(obj['target'], None) )
    print "%s edges inserted, %s failed " % (count, fail)
Example #6
0
def main():
    """ re-Index all the Proxteam corpus """
    parser = argparse.ArgumentParser()

    parser.add_argument("--host",
                        action='store',
                        help="host",
                        default="http://localhost:5000")
    parser.add_argument("--key", action='store', help="key", default=None)
    parser.add_argument("--username",
                        action='store',
                        help="user",
                        default=None)
    parser.add_argument("--password", action='store', help="pwd", default=None)
    parser.add_argument("--path", action='store', help="path", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)

    args = parser.parse_args()

    bot = Botagraph(args.host, args.key)

    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    (Initiales, GSR, Matrix) = load_OCR_file(args.path)
    if not bot.has_graph(GID):
        bot.create_graph(
            GID, {
                'description': 'a graph of Old Chinese phonology',
                'image': "",
                'tags': ['chinese', 'phonology']
            })
        bot.post_nodetype(GID, 'GSR', 'Karlgren', {})
        bot.post_nodetype(GID, 'Initial', '', {})
        bot.post_edgetype(GID, 'Sinograms', '', {})

    print "Get schema '%s'" % GID
    schema = bot.get_schema(GID)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}
    Inidict = {}
    for i, (_, uuid) in enumerate(
            bot.post_nodes(GID, ({
                'nodetype': nodetypes['Initial']['uuid'],
                'properties': {
                    'label': i
                }
            } for i in Initiales))):
        Inidict[Initiales[i]] = uuid

    GSRdict = {}
    for i, (_, uuid) in enumerate(
            bot.post_nodes(GID, ({
                'nodetype': nodetypes['GSR']['uuid'],
                'properties': {
                    'label': s
                }
            } for s in GSR))):
        GSRdict[GSR[i]] = uuid
    edges = [{
        'edgetype': edgetypes['Sinograms']['uuid'],
        'properties': {
            'label': u",".join(sinos)
        },
        'source': GSRdict[gsr],
        'target': Inidict[i]
    } for (i, gsr), sinos in Matrix.iteritems()]
    #for e in edges:
    #    print e
    #    bot.post_edge(GID, e)
    for _ in bot.post_edges(GID, iter(edges)):
        pass
Example #7
0
def main():
    """Scrape a saved wiki page of Game of Thrones characters into a graph.

    Command-line options: --host (server url), --key (authentication
    token) and --gid (graph id).  When any of the three is None the help
    is printed and the function returns.

    Steps:
      1. declare the property schemas for groups, characters and creatures
         and the list of edge types;
      2. parse the local HTML file ./pages/Personnages_de_Game_of_Thrones
         with pyquery;
      3. create the graph and its types when the graph does not exist;
      4. post the nodes, then the "is_member_of" / "belongs_to" edges,
         and finally star the nodes that took part in an edge.
    """
    parser = argparse.ArgumentParser()
    
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    parser.add_argument("--key" , action='store', help="authentification token", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)
    
    args = parser.parse_args()
    host, key, gid =  (args.host, args.key,args.gid)

    # all three values are required
    if None  in  (host, key, gid):
        parser.print_help()
        return

    # Setup schema
    
    from reliure.schema import Doc, Schema
    from reliure.types import Text, Numeric , Boolean, GenericType

    # description template; the 4-space runs are stripped from the literal
    desc = """
        Game of thrones 
        %s
        """.replace("    ", "")

    g_attrs = {
        'description': desc % gid,
        
        #'image': "https://commons.wikimedia.org/wiki/File:Game_of_Thrones_2011_logo.svg?uselang=fr",
        #'tags': ['social-network', 'game-of-thrones']
    }

    # used for houses, sauvageons ...
    group_type = Schema(**{
                'label' : Text(),
                'url'  : Text(),
                'tags' : Text(multi=True, uniq=True),
                'image' : Text(),
                'color' : Text(),
                'shape' : Text(default=u"square"),
                "name" : Text(), 
            })

    # human like characters
    character_type = Schema(**{
                'label' : Text(),
                'url'  : Text(multi=True, uniq=True),
                'tags' : Text(multi=True, uniq=True),
                'image' : Text(),
                'shape' : Text(default=u"circle"),
                'description' : Text(),
                "name":Text(),
                "actor": Text(multi=True,uniq=True),
                "dubbling_vf": Text(multi=True,uniq=True),
                
                "bio_fr": Text(), 
                "bio_en": Text(), 
                "seasons": Text(), 
                "dead": Boolean(default=False),
            })

    # creaturs dragons, wolf, white walkers ?
    creatur_type = Schema(**{
                'label' : Text(),
                'url'  : Text(),
                'tags' : Text(multi=True, uniq=True),
                'image' : Text(),
                'shape' : Text(default=u"triangle"),
                'description' : Text(),
                "name" : Text(), 
                "bio_fr": Text(), 
                "bio_en": Text(), 
                "seasons": Text(), 
                "dead":  Boolean(default=False),
            })

    # NOTE: this list is only used while creating the edge types; the name
    # `edgetypes` is rebound to a {name: edgetype} dict once the schema has
    # been fetched from the server.
    edgetypes = [
        # Characters or Creaturs -- rel --> Group
        # (name, desc , properties ),
        ("is_member_of", "Character is member of a Group", {"from_ep":Text(),} ),
        ("is_child_of", "character or creatur is child of another one", {} ),
        ("works_for", "character or creatur works for a character or a group", {"from_episode":Text(), "to_episode":Text()} ),
        ("is_friend_of", "character is friend of another one", {"from_ep":Text(),} ),
        
        ("married", "character meet another one", {"force": Numeric()} ),
        ("belongs_to", "character or creatur belongs to another one", {"from_episode":Text(), "to_episode":Text()} ),    
        ("kill", "character or creatur kill another one", { "episode":Text(), "method":Text() }),
        #("have_sex", "character or creatur have sex another one", { "episode":Text()} ),
        #("rape", "character or creatur rape another one", { "episode":Text()} ),
        #("meet", "character meet another one", { "episode":Text()}),
        #("loves", "character meet another one", {} ),
        
    ]


    # PARSING WK page

    from pyquery import PyQuery as pq
    import codecs

    root = "."
    path = "%s/pages/Personnages_de_Game_of_Thrones" % root
    # NOTE(review): graphmlz is not used anywhere in this function.
    graphmlz = "%s/got.graphml"  % root
                          
    def opengot():
        """Read the saved page at `path` and return its trimmed content.

        Keeps the ".mw-content-ltr" element, removes everything before the
        first h2 and that h2 itself, then everything following the h2 that
        is now first.  Edit-section links and <sup> elements are removed
        as well.
        """
        html = codecs.open(path, mode='r', encoding="utf8").read()
        html = pq(html)
        html = html(".mw-content-ltr")
        html(".mw-content-ltr h2:first").prevAll().remove()
        html(".mw-content-ltr h2:first").remove()
        html(".mw-content-ltr h2:first").nextAll().remove()

        html('.mw-editsection').remove()
        html('sup').remove()
        html = pq(".mw-content-ltr", html)
        return html


    def as_doc(ctype, cdata):
        """Build a reliure Doc of schema `ctype` from `cdata` and export it.

        Fields whose schema type is exactly Text are set with
        set_field(k, v, True); any other field is set by item assignment.
        """
        d = Doc(ctype) 

        for k,v  in cdata.iteritems():
            if type(ctype[k]) == Text:
                d.set_field(k,v,True)
            else:
                d[k]= v
        return d.export()


    def _parse_color(e):
        """Return the "background" value of the inline style of `e`.

        Returns None when the element has no style attribute or no
        background entry.  "black" and "#000" are normalised to "#000000".
        """
        color  = None    
        if "style" in e.attrib: 
            # "a: b; c: d" -> dict, split on ';' then ':'
            # NOTE(review): a declaration containing a second ':' would
            # yield more than two items and make dict() fail -- confirm
            # this cannot occur in the page.
            styles = dict ( pair.strip().split(':') for pair in  pq(e).attr("style").strip().lower().split(';') if len(pair))
            color = styles.get("background", None)
            
        if color and color in ( "black", "#000") : color = "#000000"
            
        return color
        
    def parse_belongs_legend(html):
        """Return the {color: text} map read from the legend lists.

        The legend is taken from the <li> items of the first four
        "table td ul" elements; the color comes from the first <span> of
        each item.  Everything before the first h3 is then removed from
        `html`.
        """
        houses_map = {}
        legende = pq( "li", pq("table td ul", html)[:4])
        for e in legende:
            color = _parse_color(pq("span",e)[0])
            text = pq(e).text()#.replace("Maison ", "")
            houses_map[color] = text

        # removes legendes
        html(".mw-content-ltr h3:first").prevAll().remove()
        return houses_map


    def parse_creaturs_and_characters(html, houses):
        """Parse the h3 sections of `html`, last section first.

        Each section is removed from `html` once read, so the loop ends
        when no h3 is left.  `houses` is the {color: name} map used to
        resolve the color of a header cell.

        Returns (characters, rel_member_of, creaturs, rel_belongs) where
        characters / creaturs are exported docs and the rel_* lists hold
        (name, group name) tuples.
        """
        
        rel_belongs = []
        rel_member_of = []
        characters = []
        creaturs = []
        
        while True:
            # reading from end
            if len(html("h3:last")):
         
                ths = pq('th', html("h3:last").nextAll())
                tds = pq('td', html("h3:last").nextAll())

                title = html("h3:last").text()
                color = None
                # flg tracks which cell of the current entry comes next
                flg = 0
                
                # character tables
                # NOTE(review): a section without any <th> also satisfies
                # this test (0 % 5 == 0).
                if len(ths)%5 == 0:
                    c = {}
                    member_of = []
                    for td in tds:
                        # colspan is the string attribute when present,
                        # the integer 0 otherwise
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6": # table headers
                           color = _parse_color(td)
                           if color : 
                               member_of.append( houses[color] )
                           flg = 1 

                        elif colspan == 0: # table cells
                            if flg == 1:
                                actor_img = pq("img", td).attr("src")
                                if actor_img:
                                    c['image'] = "http:%s" %actor_img 
                            elif flg == 2: 
                                name = pq(td).text()
                                c['name'] = name ;
                                for e in member_of : 
                                    rel_member_of.append( (name, e) )
                            elif flg == 3:
                                c['actor'] = [pq(e).text() for e in pq("a", td)]

                            elif flg == 4:
                                c['dubbling_vf'] = [pq(e).text() for e in pq("a", td)]
                            elif flg == 5 :
                                c['seasons'] = pq(td).text()
                                c['dead'] = u"✝" in pq(td).text()
                            flg +=1
                            
                        elif colspan == "5": # table bio cell
                            c['bio_fr'] = pq(td).text() 
                            
                            # the bio cell closes the current character
                            characters.append(as_doc(character_type, c))
                            # reset 
                            c = {}
                            member_of = [] 
                            flg = 1

                # creature tables
                if len(ths) == 2:
                    c = {}
                    belongs = [] 
                    for td in tds:
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6":
                           color = _parse_color(td)
                           if color : 
                               belongs.append(houses[color])
                           flg = 1 

                        elif colspan == 0:
                            if flg == 1: 
                                name = pq(td).text().strip()
                                c['name'] = name 
                                for e in belongs : rel_belongs.append( (name, e))
                                flg = 2
                            # NOTE(review): plain `if`, not `elif` -- this
                            # also runs for the name cell right after flg
                            # was set to 2 above, and "dead" is always
                            # True; confirm both are intended.
                            if flg == 2:
                                c["seasons"] = pq(td).text()
                                c["dead"] = True # u"✝" in pq(td).text()

     
                        elif colspan == "5":
                           c['bio_fr'] = pq(td).text()
                           creaturs.append(as_doc(creatur_type, c))
                           c = {}
                           belongs = []  
                           flg = 0

                #removes section once parsed
                html("h3:last").nextAll().remove()
                html("h3:last").remove()

            else : break
                
        return characters, rel_member_of, creaturs, rel_belongs


    # In[ ]:

    from reliure.schema import Doc

    locations = [] # TODO

    html = opengot()
    houses_map = parse_belongs_legend(html)
    characters, rel_member_of, creaturs, rel_belongs = parse_creaturs_and_characters(html, houses_map)

    print "Groups   ", len(houses_map)
    print "Creaturs   ", len(creaturs)
    print "Characters ", len(characters)

    print "member_of", len(rel_member_of)
    print "belongs", len(rel_belongs)



    from botapi import Botagraph, BotApiError

    bot = Botagraph(host, key)

    # create the graph with its node and edge types on first run
    if not bot.has_graph(gid) :
            
        print "\n * Creating graph %s" % gid
        bot.create_graph(gid, g_attrs)

        print "\n * Creating node type %s" % ""
        bot.post_nodetype(gid, "Character", "Character", character_type._fields)
        bot.post_nodetype(gid, "Creatur", "Creatur", creatur_type._fields)
        bot.post_nodetype(gid, "Group",  "Group", group_type._fields)
        
        for name, desc, props in edgetypes:
            bot.post_edgetype(gid, name, desc, props )


    schema = bot.get_schema(gid)['schema']
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    # from here on `edgetypes` maps an edge type name to its server record
    edgetypes = { e['name']:e for e in schema['edgetypes'] }

    idx = {} # (label, uuid)
    groups = []

    # one group node per legend entry: k is the color, v the group name
    for k,v in houses_map.iteritems():
         g = as_doc(group_type, {'label': v,'name': v,'color':k })   
         groups.append( g )
            
    for name, els in [ ("Character", characters), 
                       ("Creatur", creaturs ) ,
                       ("Group", groups)
                     ]:
        
        print "Posting %s nodes %s" % (len(els), name)
        for c in els:
            payload = {
                    'nodetype': nodetypes[name]['uuid'],
                    'properties': { k:v for k,v in c.iteritems() }
                  }
            # the label shown for a node is always its name
            payload['properties']['label'] = payload['properties']['name']
            node = bot.post_node(gid, payload)
            idx[node['label']] = node['uuid']


    vids = set()
    for name, rels in [( "is_member_of", rel_member_of), 
                       ( "belongs_to",   rel_belongs) ]:
                
        print "Posting %s rels %s" % ( len(rels), name )
        for src, tgt in rels:
            # only link nodes that were actually posted
            if src in idx and tgt in idx:
                # NOTE(review): "from_ep" is sent for both edge types,
                # whereas "belongs_to" was declared above with
                # "from_episode" / "to_episode".
                edge = {
                    'edgetype': edgetypes[name]['uuid'],
                    'source': idx[src],
                    'label' : name,
                    'target': idx[tgt],
                    'properties': {"from_ep":"",}
                }
                uuid = bot.post_edge(gid, edge)
                vids.add(src)
                vids.add(tgt)
            else:
                # show the pairs that could not be resolved
                print src, tgt
                
    # NOTE(review): vids holds node labels (src / tgt), not the uuids kept
    # in idx -- confirm star_nodes accepts labels.
    print "Starring %s nodes" % len(list(vids))
    bot.star_nodes(gid, list(vids))
Example #8
0
def main():
    """Import an igraph-readable file into a remote graph via Botagraph.

    Steps visible in this function:

    * optionally print the remote graph description and stop (``args.infos``);
    * read ``args.path`` with ``igraph.read`` and optionally shrink the graph
      (``args.cut`` keeps the highest-degree vertices, ``args.gl`` relies on
      the cello ``prox`` helpers);
    * create the remote graph ``args.gid`` and its "word" node type and
      "is_syn" edge type when they are missing;
    * post nodes and edges, either edge by edge along a random walk
      (``args.seed``) or in bulk.

    NOTE(review): the argument declarations and the ``parse_args()`` call
    seem to have been lost from this copy (see the truncated ``--host``
    line below); ``args`` is never assigned in the visible code.
    """
    from pprint import pprint

    parser = argparse.ArgumentParser()

    # NOTE(review): the call below is truncated -- its default string looks
    # redacted and fused with a later "%" formatting expression, and the
    # closing parenthesis is missing. Restore it from the original script.
    parser.add_argument("--host",
                        action='store',
                        help="host",
                        default="http://*****:*****@ %s \n  " % (args.gid, args.host)
    bot = Botagraph(args.host, args.key)
    gid = args.gid

    # authentication is only attempted when both credentials are given
    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    # info mode: dump the remote graph description and stop
    if args.infos:
        pprint(bot.get_graph(gid))
        return

    # read / parse graph
    print "\n * Reading %s" % args.path

    graph = igraph.read(args.path)

    # subgraph
    if args.cut > 0:
        print " ** cut %s based on degree()" % args.cut
        # cut method based on degree: keep the n vertices of highest degree
        n = int(args.cut)
        vs = list((v.index, v.degree()) for v in graph.vs)
        vs = sorted(vs, key=lambda x: x[1], reverse=True)
        vs = vs[:n]
        graph = graph.subgraph([v[0] for v in vs])

    elif args.gl > 0:
        # vertex selection delegated to the cello prox helpers; their exact
        # semantics are not visible from this file (ALL is imported but unused)
        from cello.graphs.prox import prox_markov_dict, sortcut, ALL
        n = int(args.gl)
        extract = prox_markov_dict(graph,
                                   range(graph.vcount()),
                                   80,
                                   add_loops=True)
        vs = [i for i, v in sortcut(extract, n)]
        print "vs", vs
        graph = graph.subgraph(vs)

    print graph.summary()
    # NOTE(review): the list is sized with vcount() (vertices) although it is
    # assigned to an edge attribute -- ecount() may have been intended.
    graph.es['a'] = [1 for i in xrange(graph.vcount())]

    if not bot.has_graph(gid):
        print "\n * Create graph %s" % gid
        bot.create_graph(
            gid, {
                'description': "Dicosyn experiment\n * ",
                'image': "",
                'tags': ['synonymes', 'dictionnaire']
            })

    print "\n * Get schema '%s'" % gid
    schema = bot.get_schema(gid)['schema']
    # index the declared types by name
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    print "\n nodetypes: ", nodetypes.keys()
    print "\n edgetypes: ", edgetypes.keys()

    # create the types this importer needs when the schema lacks them
    if not "word" in nodetypes:

        print "\n\n * Creating node type %s" % "word"
        props = {"label": Text(), "lang": Text()}
        bot.post_nodetype(gid, "word", "no description", props)

    if not "is_syn" in edgetypes:
        print "\n\n * Creating edge type %s" % "is_syn"
        bot.post_edgetype(gid, "is_syn", "no desc", {"a": Text()})

    # fetch the schema again so that freshly created types get their uuid
    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    print nodetypes
    print edgetypes

    # vertex label -> uuid of the node posted for it
    idx = {}

    if args.wait:
        raw_input("press <enter> key to start edges and nodes importation")

    if args.seed:
        # incremental mode: edges are posted one at a time and removed from
        # the local graph, until the local graph has no vertex left

        def set_node(v):
            # post the node for vertex v unless its label was already posted
            if v['label'] not in idx:
                node = bot.post_node(
                    gid, node_payload(v, nodetypes['word']['uuid']))
                idx[v['label']] = node['uuid']
                print "inserting %s %s" % (v['label'], node['uuid'])

        idx = {}
        v1 = None

        # seeds grow into beautiful flowers

        while graph.vcount() > 0:

            # (re)start from the first vertex when there is no current one
            v1 = graph.vs[0] if v1 is None else v1

            # NOTE(review): size is not read anywhere in the visible code
            size = graph.vcount()

            nei = v1.neighbors()
            if not len(nei):
                # isolated vertex: drop it and pick a new start
                graph.delete_vertices([v1.index])
                v1 = None
                continue

            # post at most 5 edges before pausing
            for i in range(min([5, len(nei)])):

                # neighbours are re-read because edges are deleted below
                nei = v1.neighbors()

                if i >= len(nei):
                    # not enough neighbours left: jump to a random vertex
                    if graph.vcount():
                        r = randint(0, graph.vcount() - 1)
                        v1 = graph.vs[r]
                    break

                # pick a random neighbour of the current vertex
                r = randint(0, len(nei) - 1)
                v2 = nei[r]

                print "inserting edge %s %s" % (v1['label'], v2['label'])

                set_node(v1)
                set_node(v2)

                eid = graph.get_eid(v1.index, v2.index)
                src, tgt = idx[v1['label']], idx[v2['label']]

                # NOTE(review): the returned uuid is not used afterwards
                uuid = bot.post_edge(
                    gid, edge_payload(edgetypes['is_syn']['uuid'], src, tgt,
                                      {}))

                # delete  from graph
                # * inserted edges
                # * nodes with no more edges

                graph.delete_edges([eid])

                delete_nodes = [
                    v.index for v in (v1, v2) if len(graph.neighbors(v)) == 0
                ]

                if len(delete_nodes):
                    graph.delete_vertices(delete_nodes)

                    if graph.vcount():
                        r = randint(0, graph.vcount() - 1)
                        # switch v1
                        v1 = graph.vs[r]

                    else:
                        break

            # wait sometimes (pause is a helper defined outside this function)
            pause(args.pause)

    else:
        # bulk mode: post every node, then every edge

        print "posting nodes"
        count = 0
        fail = 0
        for node, uuid in bot.post_nodes(
                gid, gen_nodes(graph, nodetypes['word']['uuid'])):
            # a falsy uuid is counted as a failed insertion
            if not uuid:
                fail += 1
            else:
                count += 1
                idx[node['properties']['label']] = uuid

        print "%s nodes inserted " % count

        #print "iterate over nodes"
        #for node in bot.find_all_nodes(gid, "word", {}):
        #pass

        # post edges
        print "posting edges"
        count = 0
        fail = 0

        # uuid -> label; NOTE(review): not read anywhere in the visible code
        inv_idx = {v: k for k, v in idx.iteritems()}

        for obj, uuid in bot.post_edges(
                gid, gen_edges(graph, edgetypes['is_syn']['uuid'], idx)):
            if not uuid:
                fail += 1
            else:
                count += 1

            # wait sometimes
            pause(args.pause)

        print "%s edges inserted, %s failed " % (count, fail)
Example #9
0
def to_padagraph(host, key, gid, path):
    from reliure.types import Text, Numeric 
    from botapi import Botagraph, BotApiError
    
    bot = Botagraph(host, key)

    nodes, edges = parse(path)
    
    if not bot.has_graph(gid) :
        
        print "\n * Create graph %s" % gid
        attrs = {
            'description':
            """
            http://utopies-concretes.org/#/fr
            
            Ils ont essayé de nous enterrer, ils ne savaient pas que nous étions des graines.

            Un graphe de près de 3000 sites internet de collectifs, structures, médias, blogs — positions relatives et interconnexions
            """.replace("    ", ""),
    
            'image': "",
            'tags': ['social-network', 'utopies-concretes']
        }

        print "\n * Creating graph %s" % gid
        
        bot.create_graph(gid, attrs )
                        
        print "\n * Creating node type %s" % ""
        props = {
                    'label' : Text(),
                    'url'  : Text(),
                    'tags' : Text(multi=True, uniq=True),
                    'image' : Text(),
                    'description' : Text()
                }
        bot.post_nodetype(gid, "Site",  "Site ", props)

        print "\n * Creating edge type %s" % "follows"
        props = {
                    'score' : Numeric(),
                }
        bot.post_edgetype(gid, "is_related", "is_related", props )
    

    schema = bot.get_schema(gid)['schema']
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    edgetypes = { e['name']:e for e in schema['edgetypes'] }

    def gen_nodes():
        for k,v in nodes.iteritems():     
            
            yield {
                'nodetype': nodetypes['Site']['uuid'],
                'properties': v
            }
    
    print "posting nodes"
    count = 0
    fail = 0
    idx = {}
    for node, uuid in bot.post_nodes( gid, gen_nodes() ):
        if not uuid:
            fail += 1
        else :
            count += 1
            idx[node['properties']['pid']] = uuid
        
    print "%s nodes inserted " % count

    
    def gen_edges():
        for e in edges: 

            src = idx.get(e["source"], None)
            tgt = idx.get(e["target"], None)
            if src and tgt:
                yield {
                    'edgetype': edgetypes['is_related']['uuid'],
                    'source': src,
                    'label' : "is_related",
                    'target': tgt,
                    'properties': {'score':1}
                }

    print "posting edges"
    count = fail = 0

    for obj, uuid in bot.post_edges( gid, gen_edges() ):
        if not uuid:
            fail += 1
        else :
            count += 1
    print "%s edges inserted " % count
Example #10
0
def main():
    """Import an igraph-readable file into a remote graph via Botagraph.

    Steps visible in this function:

    * optionally print the remote graph description and stop (``args.infos``);
    * read ``args.path`` with ``igraph.read`` and optionally shrink the graph
      (``args.cut`` keeps the highest-degree vertices, ``args.gl`` relies on
      the cello ``prox`` helpers);
    * create the remote graph ``args.gid`` and its "word" node type and
      "is_syn" edge type when they are missing;
    * post nodes and edges, either edge by edge along a random walk
      (``args.seed``) or in bulk.

    NOTE(review): the argument declarations and the ``parse_args()`` call
    seem to have been lost from this copy (see the truncated ``--host``
    line below); ``args`` is never assigned in the visible code.
    """
    from pprint import pprint
    
    parser = argparse.ArgumentParser()
    
    # NOTE(review): the call below is truncated -- its default string looks
    # redacted and fused with a later "%" formatting expression, and the
    # closing parenthesis is missing. Restore it from the original script.
    parser.add_argument("--host", action='store', help="host", default="http://*****:*****@ %s \n  " % (args.gid, args.host)
    bot = Botagraph(args.host, args.key)
    gid =  args.gid

    # authentication is only attempted when both credentials are given
    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    # info mode: dump the remote graph description and stop
    if args.infos:
        pprint( bot.get_graph(gid) )
        return 


    # read / parse graph
    print "\n * Reading %s" % args.path
    
    graph = igraph.read(args.path)

    # subgraph
    if args.cut > 0:
        print " ** cut %s based on degree()" % args.cut
        # cut method based on degree: keep the n vertices of highest degree
        n = int(args.cut)
        vs = list( (v.index, v.degree() ) for v in  graph.vs )
        vs = sorted( vs, key=lambda x: x[1], reverse = True )
        vs = vs[:n]
        graph = graph.subgraph( [  v[0] for v in vs ] )
        
    elif args.gl > 0:
        # vertex selection delegated to the cello prox helpers; their exact
        # semantics are not visible from this file (ALL is imported but unused)
        from cello.graphs.prox import prox_markov_dict, sortcut, ALL
        n = int(args.gl)
        extract = prox_markov_dict(graph, range(graph.vcount()), 80, add_loops=True)
        vs =  [ i for i,v in sortcut(extract,n)]
        print "vs", vs
        graph = graph.subgraph( vs )
        
         

    print graph.summary()
    # NOTE(review): the list is sized with vcount() (vertices) although it is
    # assigned to an edge attribute -- ecount() may have been intended.
    graph.es['a'] = [ 1 for i in xrange(graph.vcount() ) ]


    
    if not bot.has_graph(gid) :
        print "\n * Create graph %s" % gid
        bot.create_graph(gid, { 'description':"Dicosyn experiment\n * ",
                                'image': "",
                                'tags': ['synonymes', 'dictionnaire']
                              }
                        )
                        
    print "\n * Get schema '%s'" % gid
    schema = bot.get_schema(gid)['schema']
    # index the declared types by name
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    edgetypes = { e['name']:e for e in schema['edgetypes'] }

    print "\n nodetypes: ", nodetypes.keys()
    print "\n edgetypes: ", edgetypes.keys()

    # create the types this importer needs when the schema lacks them
    if not "word" in nodetypes:
         
        print "\n\n * Creating node type %s" % "word"
        props = { "label" : Text(),
                  "lang"  : Text()
                }
        bot.post_nodetype(gid, "word",  "no description", props)

    if not "is_syn" in edgetypes:
        print "\n\n * Creating edge type %s" % "is_syn"
        bot.post_edgetype(gid, "is_syn", "no desc", {"a":Text()})

    # fetch the schema again so that freshly created types get their uuid
    schema = bot.get_schema(gid)['schema']
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    edgetypes = { e['name']:e for e in schema['edgetypes'] }

    print nodetypes
    print edgetypes

    # vertex label -> uuid of the node posted for it
    idx = {}

    if args.wait :
        raw_input("press <enter> key to start edges and nodes importation") 

    if args.seed: 
        # incremental mode: edges are posted one at a time and removed from
        # the local graph, until the local graph has no vertex left

        def set_node(v):
            # post the node for vertex v unless its label was already posted
            if v['label'] not in idx:
               node = bot.post_node(gid, node_payload(v, nodetypes['word']['uuid']))
               idx[ v['label'] ] = node['uuid']
               print "inserting %s %s" % (v['label'] , node['uuid'])

        idx = {}
        v1 = None

        # seeds grow into beautiful flowers 
        
        while graph.vcount() > 0:
            
            # (re)start from the first vertex when there is no current one
            v1 = graph.vs[0] if v1 is None else v1
            
            # NOTE(review): size is not read anywhere in the visible code
            size = graph.vcount()

            nei = v1.neighbors()
            if not len(nei):
                # isolated vertex: drop it and pick a new start
                graph.delete_vertices([v1.index])
                v1 = None
                continue

            # post at most 5 edges before pausing
            for i in range( min([5,len(nei)]) ):

                # neighbours are re-read because edges are deleted below
                nei = v1.neighbors()
                
                if i >= len(nei):
                    # not enough neighbours left: jump to a random vertex
                    if graph.vcount():
                        r = randint(0,graph.vcount()-1)
                        v1 = graph.vs[r]
                    break
                
                # pick a random neighbour of the current vertex
                r = randint(0,len(nei)-1)
                v2 = nei[r]
                
                print "inserting edge %s %s" % (v1['label'] , v2['label'])

                set_node(v1)
                set_node(v2)

                eid = graph.get_eid(v1.index, v2.index)
                src, tgt = idx[v1['label']], idx[v2['label']]
                
                # NOTE(review): the returned uuid is not used afterwards
                uuid = bot.post_edge(gid, edge_payload(edgetypes['is_syn']['uuid'], src, tgt, {}))

                # delete  from graph
                # * inserted edges
                # * nodes with no more edges 
                
                graph.delete_edges([eid])

                delete_nodes =  [ v.index for v in (v1, v2) if len(graph.neighbors(v)) == 0 ]

                if len(delete_nodes):
                    graph.delete_vertices(delete_nodes)
                    
                    if graph.vcount():
                        r = randint(0,graph.vcount()-1)
                        # switch v1
                        v1 = graph.vs[r]

                    else: break

            # wait sometimes (pause is a helper defined outside this function)
            pause(args.pause)

            
            
            
    else :
        # bulk mode: post every node, then every edge

        print "posting nodes"
        count = 0
        fail = 0
        for node, uuid in bot.post_nodes( gid, gen_nodes(graph, nodetypes['word']['uuid']) ):
            # a falsy uuid is counted as a failed insertion
            if not uuid:
                fail += 1
            else :
                count += 1
                idx[node['properties']['label']] = uuid
            
        print "%s nodes inserted " % count
        
        #print "iterate over nodes"
        #for node in bot.find_all_nodes(gid, "word", {}):
            #pass

        # post edges
        print "posting edges"
        count = 0
        fail = 0

        # uuid -> label; NOTE(review): not read anywhere in the visible code
        inv_idx = { v:k for k,v in idx.iteritems() }
        
        for obj, uuid in bot.post_edges( gid, gen_edges(graph, edgetypes['is_syn']['uuid'], idx) ):
            if not uuid:
                fail += 1
            else :
                count += 1

            # wait sometimes    
            pause(args.pause)
            
        print "%s edges inserted, %s failed " % (count, fail)
Example #11
0
def main():
    """ re-Index all the Proxteam corpus """
    parser = argparse.ArgumentParser()
    
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    parser.add_argument("--key" , action='store', help="key", default=None)
    parser.add_argument("--username" , action='store', help="user", default=None)
    parser.add_argument("--password" , action='store', help="pwd", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)
    parser.add_argument("files", action='store', default=None, nargs='+')

    args = parser.parse_args()

    print " analysing {}".format("".join(args.files))

    # Bot creation & login 
    bot = Botagraph(args.host, args.key)

    if args.username and args.password:
        bot.authenticate(args.username, args.password)
    
    gid = args.gid
    if not bot.has_graph(gid) :
        print "create graph %s" % gid
        bot.create_graph(gid, "no description")
        print "create node type %s" % "file"
        props = { "label": Text()}
        bot.post_nodetype(gid, "file",  "no description", props)
        bot.post_nodetype(gid, "message",  "no description", props)
        print "create edge type %s" % "is_syn"
        bot.post_edgetype(gid, "listen", "no desc", {})
        bot.post_edgetype(gid, "trigger", "no desc", {})

    print "Get schema '%s'" % gid
    schema = bot.get_schema(gid)['schema']
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    edgetypes = { e['name']:e for e in schema['edgetypes'] }
    nodetype_file = nodetypes['file']['uuid']
    nodetype_message = nodetypes['message']['uuid']
    edgetype_listen = edgetypes['listen']['uuid']
    edgetype_trigger = edgetypes['trigger']['uuid']
   
    vs = {}
    es = []

    for js_file in args.files:
        if js_file.endswith('min.js'):
            #ignore minified js
            continue
        with codecs.open(js_file, 'r', 'utf8') as FILE:
            js_file = re.sub('^[./]+', '', js_file)
            vs[js_file] = {'label': js_file, 'nodetype': nodetype_file}

            for l in FILE:
                #remove comments:
                l = l.strip().split('//',1)[0]
                m = re.search(r'listenTo\([^"]*"([a-z-]+)"', l)
                if m:
                    message = m.group(1)
                    if message not in vs:
                        vs[message] = {'label': message, 'nodetype': nodetype_message}
                    es.append((js_file, edgetype_listen, message))
                m = re.search(r'trigger\([^"]*"([a-z-]+)"', l)
                if m:
                    message = m.group(1)
                    if message not in vs:
                        vs[message] = {'label': message, 'nodetype': nodetype_message}
                    es.append((js_file, edgetype_trigger, message))

    idx = {}
    print "posting nodes"
    count = 0
    fail = 0
    for node, uuid in bot.post_nodes( gid, gen_nodes(vs) ):
        if not uuid:
            fail += 1
        else :
            count += 1
            idx[node['properties']['label']] = uuid
        
    print "%s nodes inserted " % count

    
    #print "iterate over nodes"
    #for node in bot.find_all_nodes(gid, "word", {}):
        #pass#print node

    # post edges
    print "posting edges"
    count = 0
    fail = 0

    inv_idx = { v:k for k,v in idx.iteritems() }
    
    for _, uuid in bot.post_edges(gid, gen_edges(es, idx)):
        if not uuid:
            fail += 1
        else :
            count += 1
        #print "%s [ %s -- %s --> %s ] " % ( uuid, inv_idx.get(obj['source'], None) , "syn", inv_idx.get(obj['target'], None) )
    print "%s edges inserted, %s failed " % (count, fail)
Example #12
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--host",
                        action='store',
                        help="host",
                        default="http://localhost:5000")
    parser.add_argument("--key",
                        action='store',
                        help="authentification token",
                        default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)

    args = parser.parse_args()
    host, key, gid = (args.host, args.key, args.gid)

    if None in (host, key, gid):
        parser.print_help()
        return

    # Setup schema

    from reliure.schema import Doc, Schema
    from reliure.types import Text, Numeric, Boolean, GenericType

    desc = """
        Game of thrones 
        %s
        """.replace("    ", "")

    g_attrs = {
        'description': desc % gid,

        #'image': "https://commons.wikimedia.org/wiki/File:Game_of_Thrones_2011_logo.svg?uselang=fr",
        #'tags': ['social-network', 'game-of-thrones']
    }

    # used for houses, sauvageons ...
    group_type = Schema(
        **{
            'label': Text(),
            'url': Text(),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'color': Text(),
            'shape': Text(default=u"square"),
            "name": Text(),
        })

    # human like characters
    character_type = Schema(
        **{
            'label': Text(),
            'url': Text(multi=True, uniq=True),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'shape': Text(default=u"circle"),
            'description': Text(),
            "name": Text(),
            "actor": Text(multi=True, uniq=True),
            "dubbling_vf": Text(multi=True, uniq=True),
            "bio_fr": Text(),
            "bio_en": Text(),
            "seasons": Text(),
            "dead": Boolean(default=False),
        })

    # creaturs dragons, wolf, white walkers ?
    creatur_type = Schema(
        **{
            'label': Text(),
            'url': Text(),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'shape': Text(default=u"triangle"),
            'description': Text(),
            "name": Text(),
            "bio_fr": Text(),
            "bio_en": Text(),
            "seasons": Text(),
            "dead": Boolean(default=False),
        })

    edgetypes = [
        # Characters or Creaturs -- rel --> Group
        # (name, desc , properties ),
        ("is_member_of", "Character is member of a Group", {
            "from_ep": Text(),
        }),
        ("is_child_of", "character or creatur is child of another one", {}),
        ("works_for", "character or creatur works for a character or a group",
         {
             "from_episode": Text(),
             "to_episode": Text()
         }),
        ("is_friend_of", "character is friend of another one", {
            "from_ep": Text(),
        }),
        ("married", "character meet another one", {
            "force": Numeric()
        }),
        ("belongs_to", "character or creatur belongs to another one", {
            "from_episode": Text(),
            "to_episode": Text()
        }),
        ("kill", "character or creatur kill another one", {
            "episode": Text(),
            "method": Text()
        }),
        #("have_sex", "character or creatur have sex another one", { "episode":Text()} ),
        #("rape", "character or creatur rape another one", { "episode":Text()} ),
        #("meet", "character meet another one", { "episode":Text()}),
        #("loves", "character meet another one", {} ),
    ]

    # PARSING WK page

    from pyquery import PyQuery as pq
    import codecs

    root = "."
    path = "%s/pages/Personnages_de_Game_of_Thrones" % root
    graphmlz = "%s/got.graphml" % root

    def opengot():
        html = codecs.open(path, mode='r', encoding="utf8").read()
        html = pq(html)
        html = html(".mw-content-ltr")
        html(".mw-content-ltr h2:first").prevAll().remove()
        html(".mw-content-ltr h2:first").remove()
        html(".mw-content-ltr h2:first").nextAll().remove()

        html('.mw-editsection').remove()
        html('sup').remove()
        html = pq(".mw-content-ltr", html)
        return html

    def as_doc(ctype, cdata):
        d = Doc(ctype)

        for k, v in cdata.iteritems():
            if type(ctype[k]) == Text:
                d.set_field(k, v, True)
            else:
                d[k] = v
        return d.export()

    def _parse_color(e):
        color = None
        if "style" in e.attrib:
            styles = dict(
                pair.strip().split(':')
                for pair in pq(e).attr("style").strip().lower().split(';')
                if len(pair))
            color = styles.get("background", None)

        if color and color in ("black", "#000"): color = "#000000"

        return color

    def parse_belongs_legend(html):
        houses_map = {}
        legende = pq("li", pq("table td ul", html)[:4])
        for e in legende:
            color = _parse_color(pq("span", e)[0])
            text = pq(e).text()  #.replace("Maison ", "")
            houses_map[color] = text

        # removes legendes
        html(".mw-content-ltr h3:first").prevAll().remove()
        return houses_map

    def parse_creaturs_and_characters(html, houses):

        rel_belongs = []
        rel_member_of = []
        characters = []
        creaturs = []

        while True:
            # reading from end
            if len(html("h3:last")):

                ths = pq('th', html("h3:last").nextAll())
                tds = pq('td', html("h3:last").nextAll())

                title = html("h3:last").text()
                color = None
                flg = 0

                if len(ths) % 5 == 0:
                    c = {}
                    member_of = []
                    for td in tds:
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6":  # table headers
                            color = _parse_color(td)
                            if color:
                                member_of.append(houses[color])
                            flg = 1

                        elif colspan == 0:  # table cells
                            if flg == 1:
                                actor_img = pq("img", td).attr("src")
                                if actor_img:
                                    c['image'] = "http:%s" % actor_img
                            elif flg == 2:
                                name = pq(td).text()
                                c['name'] = name
                                for e in member_of:
                                    rel_member_of.append((name, e))
                            elif flg == 3:
                                c['actor'] = [
                                    pq(e).text() for e in pq("a", td)
                                ]

                            elif flg == 4:
                                c['dubbling_vf'] = [
                                    pq(e).text() for e in pq("a", td)
                                ]
                            elif flg == 5:
                                c['seasons'] = pq(td).text()
                                c['dead'] = u"✝" in pq(td).text()
                            flg += 1

                        elif colspan == "5":  # table bio cell
                            c['bio_fr'] = pq(td).text()

                            characters.append(as_doc(character_type, c))
                            # reset
                            c = {}
                            member_of = []
                            flg = 1

                if len(ths) == 2:
                    c = {}
                    belongs = []
                    for td in tds:
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6":
                            color = _parse_color(td)
                            if color:
                                belongs.append(houses[color])
                            flg = 1

                        elif colspan == 0:
                            if flg == 1:
                                name = pq(td).text().strip()
                                c['name'] = name
                                for e in belongs:
                                    rel_belongs.append((name, e))
                                flg = 2
                            if flg == 2:
                                c["seasons"] = pq(td).text()
                                c["dead"] = True  # u"✝" in pq(td).text()

                        elif colspan == "5":
                            c['bio_fr'] = pq(td).text()
                            creaturs.append(as_doc(creatur_type, c))
                            c = {}
                            belongs = []
                            flg = 0

                #removes section once parsed
                html("h3:last").nextAll().remove()
                html("h3:last").remove()

            else:
                break

        return characters, rel_member_of, creaturs, rel_belongs

    # In[ ]:

    from reliure.schema import Doc

    locations = []  # TODO

    html = opengot()
    houses_map = parse_belongs_legend(html)
    characters, rel_member_of, creaturs, rel_belongs = parse_creaturs_and_characters(
        html, houses_map)

    print "Groups   ", len(houses_map)
    print "Creaturs   ", len(creaturs)
    print "Characters ", len(characters)

    print "member_of", len(rel_member_of)
    print "belongs", len(rel_belongs)

    from botapi import Botagraph, BotApiError

    bot = Botagraph(host, key)

    if not bot.has_graph(gid):

        print "\n * Creating graph %s" % gid
        bot.create_graph(gid, g_attrs)

        print "\n * Creating node type %s" % ""
        bot.post_nodetype(gid, "Character", "Character",
                          character_type._fields)
        bot.post_nodetype(gid, "Creatur", "Creatur", creatur_type._fields)
        bot.post_nodetype(gid, "Group", "Group", group_type._fields)

        for name, desc, props in edgetypes:
            bot.post_edgetype(gid, name, desc, props)

    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    idx = {}  # (label, uuid)
    groups = []

    for k, v in houses_map.iteritems():
        g = as_doc(group_type, {'label': v, 'name': v, 'color': k})
        groups.append(g)

    for name, els in [("Character", characters), ("Creatur", creaturs),
                      ("Group", groups)]:

        print "Posting %s nodes %s" % (len(els), name)
        for c in els:
            payload = {
                'nodetype': nodetypes[name]['uuid'],
                'properties': {k: v
                               for k, v in c.iteritems()}
            }
            payload['properties']['label'] = payload['properties']['name']
            node = bot.post_node(gid, payload)
            idx[node['label']] = node['uuid']

    vids = set()
    for name, rels in [("is_member_of", rel_member_of),
                       ("belongs_to", rel_belongs)]:

        print "Posting %s rels %s" % (len(rels), name)
        for src, tgt in rels:
            if src in idx and tgt in idx:
                edge = {
                    'edgetype': edgetypes[name]['uuid'],
                    'source': idx[src],
                    'label': name,
                    'target': idx[tgt],
                    'properties': {
                        "from_ep": "",
                    }
                }
                uuid = bot.post_edge(gid, edge)
                vids.add(src)
                vids.add(tgt)
            else:
                print src, tgt

    print "Starring %s nodes" % len(list(vids))
    bot.star_nodes(gid, list(vids))