def main():
    """ Push the Old Chinese phonology data to a graph through Botagraph.

    The file named by ``--path`` is read with ``load_OCR_file``, which
    returns the initials, the GSR entries and a matrix of sinograms keyed
    by ``(initial, gsr)``.  One node is posted per initial, one per GSR
    entry, and one 'Sinograms' edge per matrix cell, going from the GSR
    node to the initial node.

    The graph id is the module-level ``GID``; ``--gid`` is parsed but not
    read here.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    # the remaining options all share the same shape
    for flag, text in (("--key", "key"),
                       ("--username", "user"),
                       ("--password", "pwd"),
                       ("--path", "path"),
                       ("--gid", "graph id")):
        parser.add_argument(flag, action='store', help=text, default=None)
    args = parser.parse_args()

    bot = Botagraph(args.host, args.key)
    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    (Initiales, GSR, Matrix) = load_OCR_file(args.path)

    # first run: create the graph together with its node and edge types
    if not bot.has_graph(GID):
        graph_attrs = {
            'description': 'a graph of Old Chinese phonology',
            'image': "",
            'tags': ['chinese', 'phonology'],
        }
        bot.create_graph(GID, graph_attrs)
        bot.post_nodetype(GID, 'GSR', 'Karlgren', {})
        bot.post_nodetype(GID, 'Initial', '', {})
        bot.post_edgetype(GID, 'Sinograms', '', {})

    print("Get schema '%s'" % GID)
    schema = bot.get_schema(GID)['schema']
    nodetypes = dict((entry['name'], entry) for entry in schema['nodetypes'])
    edgetypes = dict((entry['name'], entry) for entry in schema['edgetypes'])

    def post_labelled(kind, labels):
        # Post one node of type `kind` per label and return {label: uuid}.
        # Results are matched to labels by their position in the stream.
        payloads = ({'nodetype': nodetypes[kind]['uuid'],
                     'properties': {'label': text}}
                    for text in labels)
        uuids = {}
        for position, (_, uuid) in enumerate(bot.post_nodes(GID, payloads)):
            uuids[labels[position]] = uuid
        return uuids

    Inidict = post_labelled('Initial', Initiales)
    GSRdict = post_labelled('GSR', GSR)

    # build every edge payload before posting anything
    edges = []
    for (initial, gsr), sinos in Matrix.iteritems():
        edges.append({
            'edgetype': edgetypes['Sinograms']['uuid'],
            'properties': {'label': u",".join(sinos)},
            'source': GSRdict[gsr],
            'target': Inidict[initial],
        })

    # post_edges is consumed as an iterator: drain it so every edge is sent
    for _ in bot.post_edges(GID, iter(edges)):
        pass
def main():
    """ Import a CSV file of alliances between countries into a graph.

    Command-line options:
      --host      server URL (default http://localhost:5000)
      --key       path to a file whose stripped content is the API key
      --path      path to the CSV file to import
      --gid       graph id, handed to ``Graph``
      --username, --password   parsed but not used by this importer

    A graph that already exists under the same id is deleted first.  The
    graph, the "Country" node type and the "alliance" edge type are then
    created, the countries and alliances read from the CSV are posted, and
    every posted country node is starred.  Counts of inserted and failed
    edges are printed at the end.
    """
    import csv

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    parser.add_argument("--key", action='store', help="key", default=None)
    parser.add_argument("--username", action='store', help="user", default=None)
    parser.add_argument("--password", action='store', help="pwd", default=None)
    parser.add_argument("--path", action='store', help="path", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)
    args = parser.parse_args()

    # Bot creation & login: the key is read from a file, which is closed
    # as soon as its content has been read
    with open(args.key, 'r') as key_file:
        key = key_file.read().strip()
    print("\n * Connecting to %s \n " % args.host)
    bot = Botagraph(args.host, key)

    # read / parse graph
    print("\n * Reading %s" % args.path)
    g = Graph(args.gid)
    gid = g.gid

    # always start from a fresh graph
    if bot.has_graph(gid):
        bot.delete_graph(gid)

    if not bot.has_graph(gid):
        print("\n * Create graph %s" % gid)
        bot.create_graph(gid, g.attrs)

        print("\n * Creating node type %s" % "Country")
        props = {"code": Text(), "label": Text()}
        bot.post_nodetype(gid, "Country", "Country ", props)

        print("\n * Creating edge type %s" % "alliance")
        props = {
            'id': Numeric(),
            'label': Text(),
            'starts': Text(),
            'ends': Text(),
            'defense': Numeric(),
            'neutrality': Numeric(),
            'nonaggression': Numeric(),
            'entente': Numeric(),
        }
        bot.post_edgetype(gid, "alliance", "alliance terms", props)

    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    alliances = "defense neutrality nonaggression entente".split()

    def country_node(code, label):
        # payload of a "Country" node
        return {
            'nodetype': nodetypes["Country"]['uuid'],
            'properties': {'code': code, 'label': label},
        }

    with open(args.path, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for i, row in enumerate(reader):
            # Row 0 is skipped, then only even-numbered rows are kept.
            # The graph is undirected; presumably row 0 is a header and
            # each alliance is listed once per direction - confirm
            # against the data file.
            if i == 0 or i % 2 != 0:
                continue

            # columns 1-2 and 3-4: code and label of the two countries
            g.vs[row[1]] = country_node(row[1], row[2])
            g.vs[row[3]] = country_node(row[3], row[4])

            # columns 13-16: one flag per kind of alliance
            es = dict(zip(alliances, row[13:17]))
            # columns 5-7 and 8-10: the parts of the start and end dates
            es['starts'] = "/".join(row[5:8])
            es['ends'] = "/".join(row[8:11])
            es['id'] = row[0]
            # the label lists the kinds of alliance whose flag is set
            es['label'] = " ".join([a for a in alliances if es[a] in (1, "1")])

            # source/target hold country codes for now; they are replaced
            # by node uuids once the nodes have been posted
            g.es[i] = {
                'edgetype': edgetypes["alliance"]['uuid'],
                'source': row[1],
                'target': row[3],
                'properties': es,
            }

    print("%s %s" % (len(g.vs), len(g.es)))

    # country code -> uuid of the posted node
    idx = {}
    for node, uuid in bot.post_nodes(gid, g.vs.itervalues()):
        idx[node['properties']['code']] = uuid
    bot.star_nodes(gid, idx.values())
    print("%s nodes inserted . " % (len(idx)))

    for edge in g.es.itervalues():
        edge['source'] = idx[edge['source']]
        edge['target'] = idx[edge['target']]

    fail = count = 0
    for edge, uuid in bot.post_edges(gid, g.es.itervalues()):
        if not uuid:
            fail += 1
        else:
            count += 1
    print("%s edges inserted, %s failed " % (count, fail))
def main():
    """ Copy the nodes and links described by ``SCHEMA`` into a graph.

    A ``py2neo.Graph`` is opened with its default settings and handed to
    ``gen_nodes`` / ``gen_edges``, which produce the payloads posted
    through Botagraph.  When the target graph (``--gid``) does not exist
    yet, it is created together with one node type per ``SCHEMA["Nodes"]``
    entry and one edge type per ``SCHEMA["Links"]`` entry.

    ``--path`` is parsed but not read here.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    for flag, text in (("--key", "key"),
                       ("--username", "user"),
                       ("--password", "pwd"),
                       ("--path", "path"),
                       ("--gid", "graph id")):
        parser.add_argument(flag, action='store', help=text, default=None)
    args = parser.parse_args()

    # client for the target server, logged in when credentials are given
    bot = Botagraph(args.host, args.key)
    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    # source of the data
    graph = py2neo.Graph()

    gid = args.gid
    if not bot.has_graph(gid):
        print("create graph %s" % gid)
        graph_attrs = {
            'description': SCHEMA["description"],
            'image': "",
            'tags': SCHEMA['tags'],
        }
        bot.create_graph(gid, graph_attrs)

        for infos in SCHEMA["Nodes"]:
            print("create node type %s" % infos["type"])
            bot.post_nodetype(gid, *buildType(infos))

        for infos in SCHEMA["Links"]:
            print("create edge type %s" % infos["type"])
            bot.post_edgetype(gid, *buildType(infos))

    print("Get schema '%s'" % gid)
    schema = bot.get_schema(gid)['schema']
    nodetypes = dict((entry['name'], entry) for entry in schema['nodetypes'])
    edgetypes = dict((entry['name'], entry) for entry in schema['edgetypes'])

    print(nodetypes)
    print(edgetypes)

    # value of the SCHEMA['key'] property -> uuid of the posted node
    idx = {}

    print("posting nodes")
    count = fail = 0
    for infos in SCHEMA['Nodes']:
        payloads = gen_nodes(graph, nodetypes[infos['type']]['uuid'], infos)
        for node, uuid in bot.post_nodes(gid, payloads):
            if uuid:
                count += 1
                idx[node['properties'][SCHEMA['key']]] = uuid
            else:
                fail += 1
    print("%s nodes inserted " % count)

    print("posting edges")
    count = fail = 0
    # reverse lookup (uuid -> key); only useful when tracing edges by hand
    inv_idx = dict((uuid, key) for key, uuid in idx.iteritems())
    for infos in SCHEMA['Links']:
        payloads = gen_edges(graph, edgetypes[infos['type']]['uuid'], idx, infos)
        for obj, uuid in bot.post_edges(gid, payloads):
            if uuid:
                count += 1
            else:
                fail += 1
    print("%s edges inserted, %s failed " % (count, fail))
def main():
    """ Export the content selected by ``SCHEMA`` to a Botagraph graph.

    Steps:
      1. parse the command line and log in (key, or username/password);
      2. create the graph ``--gid`` and its node / edge types from
         ``SCHEMA`` when the graph does not exist yet;
      3. post the nodes produced by ``gen_nodes`` for every
         ``SCHEMA['Nodes']`` entry, remembering the uuid of each one under
         its ``SCHEMA['key']`` property;
      4. post the edges produced by ``gen_edges`` for every
         ``SCHEMA['Links']`` entry and print the success / failure counts.
    """
    def by_name(entries):
        # index schema entries by their 'name' field
        table = {}
        for entry in entries:
            table[entry['name']] = entry
        return table

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="http://localhost:5000", action='store', help="host")
    parser.add_argument("--key", default=None, action='store', help="key")
    parser.add_argument("--username", default=None, action='store', help="user")
    parser.add_argument("--password", default=None, action='store', help="pwd")
    parser.add_argument("--path", default=None, action='store', help="path")
    parser.add_argument("--gid", default=None, action='store', help="graph id")
    args = parser.parse_args()

    # Bot creation & login
    bot = Botagraph(args.host, args.key)
    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    # the py2neo graph the payload generators read from
    graph = py2neo.Graph()

    gid = args.gid
    graph_exists = bot.has_graph(gid)
    if not graph_exists:
        print("create graph %s" % gid)
        bot.create_graph(gid, {
            'description': SCHEMA["description"],
            'image': "",
            'tags': SCHEMA['tags'],
        })
        for node_infos in SCHEMA["Nodes"]:
            print("create node type %s" % node_infos["type"])
            bot.post_nodetype(gid, *buildType(node_infos))
        for link_infos in SCHEMA["Links"]:
            print("create edge type %s" % link_infos["type"])
            bot.post_edgetype(gid, *buildType(link_infos))

    print("Get schema '%s'" % gid)
    schema = bot.get_schema(gid)['schema']
    nodetypes = by_name(schema['nodetypes'])
    edgetypes = by_name(schema['edgetypes'])
    print(nodetypes)
    print(edgetypes)

    # --- nodes ---
    idx = {}
    print("posting nodes")
    count = 0
    fail = 0
    for node_infos in SCHEMA['Nodes']:
        type_uuid = nodetypes[node_infos['type']]['uuid']
        for node, uuid in bot.post_nodes(gid, gen_nodes(graph, type_uuid, node_infos)):
            if not uuid:
                fail += 1
                continue
            count += 1
            idx[node['properties'][SCHEMA['key']]] = uuid
    print("%s nodes inserted " % count)

    # --- edges ---
    print("posting edges")
    count = 0
    fail = 0
    # uuid -> key, the mirror of idx (not read below)
    inv_idx = {}
    for key, uuid in idx.iteritems():
        inv_idx[uuid] = key
    for link_infos in SCHEMA['Links']:
        type_uuid = edgetypes[link_infos['type']]['uuid']
        for obj, uuid in bot.post_edges(gid, gen_edges(graph, type_uuid, idx, link_infos)):
            if not uuid:
                fail += 1
                continue
            count += 1
    print("%s edges inserted, %s failed " % (count, fail))
def main():
    """ Build a graph of the messages exchanged between JavaScript files.

    Every file given on the command line (except those ending in
    ``min.js``) becomes a "file" node.  Each line is searched, after
    dropping anything following ``//``, for ``listenTo(... "name"`` and
    ``trigger(... "name"`` calls: every message name becomes a "message"
    node, linked to the file by a "listen" or a "trigger" edge.

    Command-line options:
      --host                    server URL (default http://localhost:5000)
      --key                     API key
      --username, --password    optional credentials
      --gid                     graph id
      files                     one or more JavaScript files

    When the graph does not exist yet it is created together with its node
    and edge types.  The numbers of inserted nodes and of inserted / failed
    edges are printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    parser.add_argument("--key", action='store', help="key", default=None)
    parser.add_argument("--username", action='store', help="user", default=None)
    parser.add_argument("--password", action='store', help="pwd", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)
    parser.add_argument("files", action='store', default=None, nargs='+')
    args = parser.parse_args()

    # separate the file names so that the banner stays readable
    print(" analysing {}".format(", ".join(args.files)))

    # Bot creation & login
    bot = Botagraph(args.host, args.key)
    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    gid = args.gid
    if not bot.has_graph(gid):
        print("create graph %s" % gid)
        bot.create_graph(gid, "no description")

        props = {"label": Text()}
        for name in ("file", "message"):
            print("create node type %s" % name)
            bot.post_nodetype(gid, name, "no description", props)

        for name in ("listen", "trigger"):
            print("create edge type %s" % name)
            bot.post_edgetype(gid, name, "no desc", {})

    print("Get schema '%s'" % gid)
    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    nodetype_file = nodetypes['file']['uuid']
    nodetype_message = nodetypes['message']['uuid']

    # (compiled pattern, edge type uuid), tried in this order on each line;
    # the single capture group is the message name
    patterns = (
        (re.compile(r'listenTo\([^"]*"([a-z-]+)"'), edgetypes['listen']['uuid']),
        (re.compile(r'trigger\([^"]*"([a-z-]+)"'), edgetypes['trigger']['uuid']),
    )

    vs = {}  # label -> node description (files and messages share it)
    es = []  # (file label, edge type uuid, message label)
    for js_file in args.files:
        if js_file.endswith('min.js'):
            # ignore minified js
            continue
        with codecs.open(js_file, 'r', 'utf8') as source:
            # the label is the path without its leading "./" or "../"
            label = re.sub('^[./]+', '', js_file)
            vs[label] = {'label': label, 'nodetype': nodetype_file}
            for line in source:
                # remove comments
                line = line.strip().split('//', 1)[0]
                for pattern, edgetype in patterns:
                    m = pattern.search(line)
                    if not m:
                        continue
                    message = m.group(1)
                    if message not in vs:
                        vs[message] = {
                            'label': message,
                            'nodetype': nodetype_message,
                        }
                    es.append((label, edgetype, message))

    # label -> uuid of the posted node
    idx = {}
    print("posting nodes")
    count = 0
    fail = 0
    for node, uuid in bot.post_nodes(gid, gen_nodes(vs)):
        if not uuid:
            fail += 1
        else:
            count += 1
            idx[node['properties']['label']] = uuid
    print("%s nodes inserted " % count)

    # post edges
    print("posting edges")
    count = 0
    fail = 0
    for _, uuid in bot.post_edges(gid, gen_edges(es, idx)):
        if not uuid:
            fail += 1
        else:
            count += 1
    print("%s edges inserted, %s failed " % (count, fail))
def main():
    """ Load an Old Chinese phonology table and post it as a graph.

    ``load_OCR_file(--path)`` provides three values: the list of initials,
    the list of GSR entries, and a mapping from ``(initial, gsr)`` pairs to
    the sinograms found for that pair.  Initials and GSR entries are posted
    as nodes; each pair of the mapping is posted as a 'Sinograms' edge from
    the GSR node to the initial node, labelled with the comma-joined
    sinograms.

    The target graph is the module-level ``GID`` (the ``--gid`` option is
    parsed but not read).
    """
    def node_payloads(type_name, labels):
        # lazily yield one node payload per label
        for label in labels:
            yield {
                'nodetype': nodetypes[type_name]['uuid'],
                'properties': {'label': label},
            }

    def sinogram_edge(initial, gsr, sinos):
        # payload of the edge linking a GSR node to an initial node
        return {
            'edgetype': edgetypes['Sinograms']['uuid'],
            'properties': {'label': u",".join(sinos)},
            'source': GSRdict[gsr],
            'target': Inidict[initial],
        }

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="http://localhost:5000", action='store', help="host")
    parser.add_argument("--key", default=None, action='store', help="key")
    parser.add_argument("--username", default=None, action='store', help="user")
    parser.add_argument("--password", default=None, action='store', help="pwd")
    parser.add_argument("--path", default=None, action='store', help="path")
    parser.add_argument("--gid", default=None, action='store', help="graph id")
    args = parser.parse_args()

    bot = Botagraph(args.host, args.key)
    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    (Initiales, GSR, Matrix) = load_OCR_file(args.path)

    graph_exists = bot.has_graph(GID)
    if not graph_exists:
        bot.create_graph(GID, {
            'description': 'a graph of Old Chinese phonology',
            'image': "",
            'tags': ['chinese', 'phonology'],
        })
        bot.post_nodetype(GID, 'GSR', 'Karlgren', {})
        bot.post_nodetype(GID, 'Initial', '', {})
        bot.post_edgetype(GID, 'Sinograms', '', {})

    print("Get schema '%s'" % GID)
    schema = bot.get_schema(GID)['schema']
    nodetypes = {}
    for entry in schema['nodetypes']:
        nodetypes[entry['name']] = entry
    edgetypes = {}
    for entry in schema['edgetypes']:
        edgetypes[entry['name']] = entry

    # initial -> uuid; results come back in posting order
    Inidict = {}
    posted = bot.post_nodes(GID, node_payloads('Initial', Initiales))
    for rank, (_, uuid) in enumerate(posted):
        Inidict[Initiales[rank]] = uuid

    # gsr -> uuid, same positional matching
    GSRdict = {}
    posted = bot.post_nodes(GID, node_payloads('GSR', GSR))
    for rank, (_, uuid) in enumerate(posted):
        GSRdict[GSR[rank]] = uuid

    edges = [sinogram_edge(initial, gsr, sinos)
             for (initial, gsr), sinos in Matrix.iteritems()]

    # exhaust the iterator returned by post_edges so that all edges are sent
    for _ in bot.post_edges(GID, iter(edges)):
        pass
def main():
    """ Scrape a saved wiki page listing the Game of Thrones characters and
    post the result as a graph.

    The page is read from ``./pages/Personnages_de_Game_of_Thrones``.  It
    yields three kinds of nodes ("Character", "Creatur", "Group") and two
    kinds of relations that are actually posted ("is_member_of" and
    "belongs_to").  Nodes that take part in a posted relation are starred.

    Command-line options: ``--host``, ``--key`` (authentication token) and
    ``--gid`` (graph id).  The help text is printed and nothing is done
    when any of the three is ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    parser.add_argument("--key" , action='store', help="authentification token", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)
    args = parser.parse_args()
    host, key, gid = (args.host, args.key,args.gid)
    if None in (host, key, gid):
        parser.print_help()
        return

    # Setup schema
    from reliure.schema import Doc, Schema
    from reliure.types import Text, Numeric , Boolean, GenericType

    # NOTE(review): as written, replace(" ", "") removes every space of the
    # description, not only indentation - confirm the intended argument.
    desc = """ Game of thrones %s """.replace(" ", "")

    g_attrs = {
        'description': desc % gid,
        #'image': "https://commons.wikimedia.org/wiki/File:Game_of_Thrones_2011_logo.svg?uselang=fr",
        #'tags': ['social-network', 'game-of-thrones']
    }

    # used for houses, sauvageons ...
    group_type = Schema(**{
        'label' : Text(),
        'url' : Text(),
        'tags' : Text(multi=True, uniq=True),
        'image' : Text(),
        'color' : Text(),
        'shape' : Text(default=u"square"),
        "name" : Text(),
    })

    # human like characters
    character_type = Schema(**{
        'label' : Text(),
        'url' : Text(multi=True, uniq=True),
        'tags' : Text(multi=True, uniq=True),
        'image' : Text(),
        'shape' : Text(default=u"circle"),
        'description' : Text(),
        "name":Text(),
        "actor": Text(multi=True,uniq=True),
        "dubbling_vf": Text(multi=True,uniq=True),
        "bio_fr": Text(),
        "bio_en": Text(),
        "seasons": Text(),
        "dead": Boolean(default=False),
    })

    # creaturs dragons, wolf, white walkers ?
    creatur_type = Schema(**{
        'label' : Text(),
        'url' : Text(),
        'tags' : Text(multi=True, uniq=True),
        'image' : Text(),
        'shape' : Text(default=u"triangle"),
        'description' : Text(),
        "name" : Text(),
        "bio_fr": Text(),
        "bio_en": Text(),
        "seasons": Text(),
        "dead": Boolean(default=False),
    })

    # Edge types as (name, description, properties).  The name `edgetypes`
    # is rebound further down to the edge types read back from the schema.
    edgetypes = [
        # Characters or Creaturs -- rel --> Group
        # (name, desc , properties ),
        ("is_member_of", "Character is member of a Group", {"from_ep":Text(),} ),
        ("is_child_of", "character or creatur is child of another one", {} ),
        ("works_for", "character or creatur works for a character or a group", {"from_episode":Text(), "to_episode":Text()} ),
        ("is_friend_of", "character is friend of another one", {"from_ep":Text(),} ),
        ("married", "character meet another one", {"force": Numeric()} ),
        ("belongs_to", "character or creatur belongs to another one", {"from_episode":Text(), "to_episode":Text()} ),
        ("kill", "character or creatur kill another one", { "episode":Text(), "method":Text() }),
        #("have_sex", "character or creatur have sex another one", { "episode":Text()} ),
        #("rape", "character or creatur rape another one", { "episode":Text()} ),
        #("meet", "character meet another one", { "episode":Text()}),
        #("loves", "character meet another one", {} ),
    ]

    # PARSING WK page
    from pyquery import PyQuery as pq
    import codecs

    root = "."
    path = "%s/pages/Personnages_de_Game_of_Thrones" % root
    graphmlz = "%s/got.graphml" % root  # not read anywhere in this function

    def opengot():
        """ Read the saved page and return its ``.mw-content-ltr`` element,
        trimmed around the first ``h2`` headings and stripped of edit links
        and ``sup`` elements. """
        html = codecs.open(path, mode='r', encoding="utf8").read()
        html = pq(html)
        html = html(".mw-content-ltr")
        # drop what precedes the first h2, then that h2 itself, then
        # everything that follows the h2 which has become the first one
        html(".mw-content-ltr h2:first").prevAll().remove()
        html(".mw-content-ltr h2:first").remove()
        html(".mw-content-ltr h2:first").nextAll().remove()
        html('.mw-editsection').remove()
        html('sup').remove()
        html = pq(".mw-content-ltr", html)
        return html

    def as_doc(ctype, cdata):
        """ Build a ``Doc`` of schema `ctype` from the dict `cdata` and
        return its exported form.  Fields whose schema type is exactly
        ``Text`` go through ``set_field``; the others are assigned
        directly. """
        d = Doc(ctype)
        for k,v in cdata.iteritems():
            if type(ctype[k]) == Text:
                d.set_field(k,v,True)
            else:
                d[k]= v
        return d.export()

    def _parse_color(e):
        """ Return the ``background`` entry of the element's ``style``
        attribute (lower-cased), with "black" and "#000" normalised to
        "#000000"; ``None`` when there is no such entry. """
        color = None
        if "style" in e.attrib:
            # "a: b; c: d" -> {"a": " b", "c": " d"}; only the pair as a
            # whole is stripped, not its two halves
            styles = dict ( pair.strip().split(':') for pair in pq(e).attr("style").strip().lower().split(';') if len(pair))
            color = styles.get("background", None)
        if color and color in ( "black", "#000") :
            color = "#000000"
        return color

    def parse_belongs_legend(html):
        """ Read the colour legend: return ``{colour: text}`` for the ``li``
        items of the first four ``table td ul`` lists, then remove
        everything that precedes the first ``h3`` from `html`. """
        houses_map = {}
        legende = pq( "li", pq("table td ul", html)[:4])
        for e in legende:
            color = _parse_color(pq("span",e)[0])
            text = pq(e).text()#.replace("Maison ", "")
            houses_map[color] = text
        # remove the legend
        html(".mw-content-ltr h3:first").prevAll().remove()
        return houses_map

    def parse_creaturs_and_characters(html, houses):
        """ Consume the ``h3`` sections of `html` from the last one to the
        first and return ``(characters, rel_member_of, creaturs,
        rel_belongs)``.

        `houses` maps a colour to a group name, as returned by
        ``parse_belongs_legend``.  The relation lists hold
        ``(name, group name)`` pairs.  `html` is modified: each section is
        removed once parsed.
        """
        rel_belongs = []
        rel_member_of = []
        characters = []
        creaturs = []
        while True:
            # reading from end
            if len(html("h3:last")):
                ths = pq('th', html("h3:last").nextAll())
                tds = pq('td', html("h3:last").nextAll())
                title = html("h3:last").text()
                color = None
                # flg tracks which column the next plain cell belongs to
                flg = 0
                # presumably character tables have five header cells per
                # table; note that zero th elements also satisfies this test
                if len(ths)%5 == 0:
                    c = {}
                    member_of = []
                    for td in tds:
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6": # table headers
                            color = _parse_color(td)
                            if color :
                                member_of.append( houses[color] )
                            flg = 1
                        elif colspan == 0: # table cells
                            if flg == 1:
                                actor_img = pq("img", td).attr("src")
                                if actor_img:
                                    c['image'] = "http:%s" %actor_img
                            elif flg == 2:
                                name = pq(td).text()
                                c['name'] = name ;
                                for e in member_of :
                                    rel_member_of.append( (name, e) )
                            elif flg == 3:
                                c['actor'] = [pq(e).text() for e in pq("a", td)]
                            elif flg == 4:
                                c['dubbling_vf'] = [pq(e).text() for e in pq("a", td)]
                            elif flg == 5 :
                                c['seasons'] = pq(td).text()
                                c['dead'] = u"✝" in pq(td).text()
                            flg +=1
                        elif colspan == "5": # table bio cell
                            c['bio_fr'] = pq(td).text()
                            characters.append(as_doc(character_type, c))
                            # reset
                            c = {}
                            member_of = []
                            flg = 1
                # presumably creature tables have exactly two header cells
                if len(ths) == 2:
                    c = {}
                    belongs = []
                    for td in tds:
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6":
                            color = _parse_color(td)
                            if color :
                                belongs.append(houses[color])
                            flg = 1
                        elif colspan == 0:
                            if flg == 1:
                                name = pq(td).text().strip()
                                c['name'] = name
                                for e in belongs :
                                    rel_belongs.append( (name, e))
                                flg = 2
                            # NOTE(review): this is `if`, not `elif`, and
                            # flg has just been set to 2, so the name cell
                            # is also stored as "seasons" - confirm.
                            if flg == 2:
                                c["seasons"] = pq(td).text()
                                c["dead"] = True # u"✝" in pq(td).text()
                        elif colspan == "5":
                            c['bio_fr'] = pq(td).text()
                            creaturs.append(as_doc(creatur_type, c))
                            c = {}
                            belongs = []
                            flg = 0
                # remove the section once parsed
                html("h3:last").nextAll().remove()
                html("h3:last").remove()
            else :
                break
        return characters, rel_member_of, creaturs, rel_belongs

    # In[ ]:
    from reliure.schema import Doc

    locations = [] # TODO

    html = opengot()
    houses_map = parse_belongs_legend(html)
    characters, rel_member_of, creaturs, rel_belongs = parse_creaturs_and_characters(html, houses_map)

    print "Groups ", len(houses_map)
    print "Creaturs ", len(creaturs)
    print "Characters ", len(characters)
    print "member_of", len(rel_member_of)
    print "belongs", len(rel_belongs)

    from botapi import Botagraph, BotApiError

    bot = Botagraph(host, key)

    # create the graph and its types only when the graph is missing
    if not bot.has_graph(gid) :
        print "\n * Creating graph %s" % gid
        bot.create_graph(gid, g_attrs)

        print "\n * Creating node type %s" % ""
        bot.post_nodetype(gid, "Character", "Character", character_type._fields)
        bot.post_nodetype(gid, "Creatur", "Creatur", creatur_type._fields)
        bot.post_nodetype(gid, "Group", "Group", group_type._fields)

        # this loop rebinds `desc` (the description template above)
        for name, desc, props in edgetypes:
            bot.post_edgetype(gid, name, desc, props )

    schema = bot.get_schema(gid)['schema']
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    # from here on `edgetypes` maps an edge type name to its schema entry
    edgetypes = { e['name']:e for e in schema['edgetypes'] }

    idx = {} # (label, uuid)

    # one Group node per legend entry: the colour is the key, the text the name
    groups = []
    for k,v in houses_map.iteritems():
        g = as_doc(group_type, {'label': v,'name': v,'color':k })
        groups.append( g )

    for name, els in [ ("Character", characters), ("Creatur", creaturs ) , ("Group", groups) ]:
        print "Posting %s nodes %s" % (len(els), name)
        for c in els:
            payload = {
                'nodetype': nodetypes[name]['uuid'],
                'properties': { k:v for k,v in c.iteritems() }
            }
            # the label is always the name
            payload['properties']['label'] = payload['properties']['name']
            node = bot.post_node(gid, payload)
            idx[node['label']] = node['uuid']

    vids = set()
    for name, rels in [( "is_member_of", rel_member_of), ( "belongs_to", rel_belongs) ]:
        print "Posting %s rels %s" % ( len(rels), name )
        for src, tgt in rels:
            if src in idx and tgt in idx:
                # NOTE(review): "from_ep" is sent for both relations, while
                # "belongs_to" is declared above with from_episode/to_episode.
                edge = {
                    'edgetype': edgetypes[name]['uuid'],
                    'source': idx[src],
                    'label' : name,
                    'target': idx[tgt],
                    'properties': {"from_ep":"",}
                }
                uuid = bot.post_edge(gid, edge)
                # NOTE(review): vids collects labels, not the uuids held in
                # idx - confirm what star_nodes expects.
                vids.add(src)
                vids.add(tgt)
            else:
                # at least one end of the relation was not posted as a node
                print src, tgt

    print "Starring %s nodes" % len(list(vids))
    bot.star_nodes(gid, list(vids))
def main():
    """ Upload a graph file, read with igraph, as a graph of "word" nodes
    linked by "is_syn" edges.

    The graph may first be reduced to a subgraph (``args.cut`` or
    ``args.gl``).  It is then uploaded either edge by edge along a random
    walk (``args.seed``) or in bulk, all nodes first and then all edges.
    With ``args.infos`` the function only pretty-prints the remote graph
    and returns.
    """
    from pprint import pprint

    parser = argparse.ArgumentParser()
    # NOTE(review): the text between the default value of --host and the
    # "%" below is missing from the source (masked as *****).  The other
    # options read below (key, username, password, path, gid, infos, cut,
    # gl, wait, seed, pause), the parse_args() call and the start of a
    # print statement were presumably there - restore them from the
    # original file.
    parser.add_argument("--host", action='store', help="host", default="http://*****:*****@ %s \n " % (args.gid, args.host)

    bot = Botagraph(args.host, args.key)
    gid = args.gid

    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    # --infos: show the remote graph and stop
    if args.infos:
        pprint(bot.get_graph(gid))
        return

    # read / parse graph
    print "\n * Reading %s" % args.path
    graph = igraph.read(args.path)

    # subgraph
    if args.cut > 0:
        print " ** cut %s based on degree()" % args.cut
        # cut method based on degree: keep the n vertices of highest degree
        n = int(args.cut)
        vs = list((v.index, v.degree()) for v in graph.vs)
        vs = sorted(vs, key=lambda x: x[1], reverse=True)
        vs = vs[:n]
        graph = graph.subgraph([v[0] for v in vs])
    elif args.gl > 0:
        from cello.graphs.prox import prox_markov_dict, sortcut, ALL
        # keep the n vertices selected by sortcut() on the result of
        # prox_markov_dict() computed from all vertices
        n = int(args.gl)
        extract = prox_markov_dict(graph, range(graph.vcount()), 80, add_loops=True)
        vs = [i for i, v in sortcut(extract, n)]
        print "vs", vs
        graph = graph.subgraph(vs)

    print graph.summary()

    # NOTE(review): this sets an edge attribute from a list whose length is
    # the vertex count - confirm that ecount() was not intended.
    graph.es['a'] = [1 for i in xrange(graph.vcount())]

    if not bot.has_graph(gid):
        print "\n * Create graph %s" % gid
        bot.create_graph(
            gid, {
                'description': "Dicosyn experiment\n * ",
                'image': "",
                'tags': ['synonymes', 'dictionnaire']
            })

    print "\n * Get schema '%s'" % gid
    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    print "\n nodetypes: ", nodetypes.keys()
    print "\n edgetypes: ", edgetypes.keys()

    # create the types that are missing from the schema
    if not "word" in nodetypes:
        print "\n\n * Creating node type %s" % "word"
        props = {"label": Text(), "lang": Text()}
        bot.post_nodetype(gid, "word", "no description", props)

    if not "is_syn" in edgetypes:
        print "\n\n * Creating edge type %s" % "is_syn"
        bot.post_edgetype(gid, "is_syn", "no desc", {"a": Text()})

    # read the schema again so that newly created types are included
    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    print nodetypes
    print edgetypes

    # label -> uuid of the posted node
    idx = {}

    if args.wait:
        raw_input("press <enter> key to start edges and nodes importation")

    if args.seed:

        def set_node(v):
            # post vertex v unless a node with the same label was already posted
            if v['label'] not in idx:
                node = bot.post_node(
                    gid, node_payload(v, nodetypes['word']['uuid']))
                idx[v['label']] = node['uuid']
                print "inserting %s %s" % (v['label'], node['uuid'])

        idx = {}
        v1 = None
        # seeds grow into beautiful flowers
        # The local graph is consumed while it is uploaded: posted edges
        # are deleted, and so are vertices left without neighbours.  The
        # loop ends when no vertex remains.
        # NOTE(review): the nesting below was rebuilt from a source that
        # lost its indentation - check it against the original file.
        while graph.vcount() > 0:
            v1 = graph.vs[0] if v1 is None else v1
            size = graph.vcount()
            nei = v1.neighbors()
            if not len(nei):
                # isolated vertex: drop it and start again from vs[0]
                graph.delete_vertices([v1.index])
                v1 = None
                continue
            # post up to five edges around the current vertex
            for i in range(min([5, len(nei)])):
                nei = v1.neighbors()
                if i >= len(nei):
                    # fewer neighbours left than expected: jump to a random vertex
                    if graph.vcount():
                        r = randint(0, graph.vcount() - 1)
                        v1 = graph.vs[r]
                    break
                r = randint(0, len(nei) - 1)
                v2 = nei[r]
                print "inserting edge %s %s" % (v1['label'], v2['label'])
                set_node(v1)
                set_node(v2)
                eid = graph.get_eid(v1.index, v2.index)
                src, tgt = idx[v1['label']], idx[v2['label']]
                uuid = bot.post_edge(
                    gid, edge_payload(edgetypes['is_syn']['uuid'], src, tgt, {}))
                # delete from graph
                # * inserted edges
                # * nodes with no more edges
                graph.delete_edges([eid])
                delete_nodes = [
                    v.index for v in (v1, v2) if len(graph.neighbors(v)) == 0
                ]
                if len(delete_nodes):
                    graph.delete_vertices(delete_nodes)
                    if graph.vcount():
                        r = randint(0, graph.vcount() - 1)
                        # switch v1
                        v1 = graph.vs[r]
                    else:
                        break
                # wait sometimes
                pause(args.pause)
    else:
        print "posting nodes"
        count = 0
        fail = 0
        for node, uuid in bot.post_nodes(
                gid, gen_nodes(graph, nodetypes['word']['uuid'])):
            if not uuid:
                fail += 1
            else:
                count += 1
                idx[node['properties']['label']] = uuid
        print "%s nodes inserted " % count

        #print "iterate over nodes"
        #for node in bot.find_all_nodes(gid, "word", {}):
        #pass

        # post edges
        print "posting edges"
        count = 0
        fail = 0
        # uuid -> label; not read below
        inv_idx = {v: k for k, v in idx.iteritems()}
        for obj, uuid in bot.post_edges(
                gid, gen_edges(graph, edgetypes['is_syn']['uuid'], idx)):
            if not uuid:
                fail += 1
            else:
                count += 1
            # wait sometimes
            pause(args.pause)
        print "%s edges inserted, %s failed " % (count, fail)
def to_padagraph(host, key, gid, path):
    """ Post the sites and links parsed from `path` to the graph `gid`.

    host -- URL of the server
    key  -- API key handed to ``Botagraph``
    gid  -- id of the target graph; it is created, together with the
            "Site" node type and the "is_related" edge type, when it does
            not exist yet
    path -- file handed to ``parse``, which returns the nodes (a dict
            whose values are property dicts carrying a 'pid') and the
            edges (dicts with a 'source' and a 'target' pid)

    An edge is posted only when both of its ends were posted successfully.
    The numbers of inserted nodes and of inserted / failed edges are
    printed.
    """
    from reliure.types import Text, Numeric
    from botapi import Botagraph

    bot = Botagraph(host, key)
    nodes, edges = parse(path)

    if not bot.has_graph(gid):
        # The description is spelled out line by line: the former
        # post-processing removed every space of the text.
        attrs = {
            'description': "\n".join([
                "http://utopies-concretes.org/#/fr",
                "Ils ont essayé de nous enterrer, ils ne savaient pas que nous étions des graines.",
                "Un graphe de près de 3000 sites internet de collectifs, structures, médias, blogs — positions relatives et interconnexions",
            ]),
            'image': "",
            'tags': ['social-network', 'utopies-concretes']
        }
        print("\n * Creating graph %s" % gid)
        bot.create_graph(gid, attrs)

        print("\n * Creating node type %s" % "Site")
        props = {
            'label': Text(),
            'url': Text(),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'description': Text()
        }
        bot.post_nodetype(gid, "Site", "Site ", props)

        print("\n * Creating edge type %s" % "is_related")
        props = {
            'score': Numeric(),
        }
        bot.post_edgetype(gid, "is_related", "is_related", props)

    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    def gen_nodes():
        # one "Site" payload per parsed node; the dict keys are not needed
        for properties in nodes.itervalues():
            yield {
                'nodetype': nodetypes['Site']['uuid'],
                'properties': properties
            }

    print("posting nodes")
    count = 0
    fail = 0
    # pid -> uuid of the posted node
    idx = {}
    for node, uuid in bot.post_nodes(gid, gen_nodes()):
        if not uuid:
            fail += 1
        else:
            count += 1
            idx[node['properties']['pid']] = uuid
    print("%s nodes inserted " % count)

    def gen_edges():
        # edges with an end that was not posted are skipped
        for e in edges:
            src = idx.get(e["source"], None)
            tgt = idx.get(e["target"], None)
            if src and tgt:
                yield {
                    'edgetype': edgetypes['is_related']['uuid'],
                    'source': src,
                    'label': "is_related",
                    'target': tgt,
                    'properties': {'score': 1}
                }

    print("posting edges")
    count = fail = 0
    for obj, uuid in bot.post_edges(gid, gen_edges()):
        if not uuid:
            fail += 1
        else:
            count += 1
    print("%s edges inserted, %s failed " % (count, fail))
def main():
    """ Upload a graph file, read with igraph, as a graph of "word" nodes
    linked by "is_syn" edges.

    The graph may first be reduced to a subgraph (``args.cut`` or
    ``args.gl``).  It is then uploaded either edge by edge along a random
    walk (``args.seed``) or in bulk, all nodes first and then all edges.
    With ``args.infos`` the function only pretty-prints the remote graph
    and returns.
    """
    from pprint import pprint

    parser = argparse.ArgumentParser()
    # NOTE(review): the text between the default value of --host and the
    # "%" below is missing from the source (masked as *****).  The other
    # options read below (key, username, password, path, gid, infos, cut,
    # gl, wait, seed, pause), the parse_args() call and the start of a
    # print statement were presumably there - restore them from the
    # original file.
    parser.add_argument("--host", action='store', help="host", default="http://*****:*****@ %s \n " % (args.gid, args.host)

    bot = Botagraph(args.host, args.key)
    gid = args.gid

    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    # --infos: show the remote graph and stop
    if args.infos:
        pprint( bot.get_graph(gid) )
        return

    # read / parse graph
    print "\n * Reading %s" % args.path
    graph = igraph.read(args.path)

    # subgraph
    if args.cut > 0:
        print " ** cut %s based on degree()" % args.cut
        # cut method based on degree: keep the n vertices of highest degree
        n = int(args.cut)
        vs = list( (v.index, v.degree() ) for v in graph.vs )
        vs = sorted( vs, key=lambda x: x[1], reverse = True )
        vs = vs[:n]
        graph = graph.subgraph( [ v[0] for v in vs ] )
    elif args.gl > 0:
        from cello.graphs.prox import prox_markov_dict, sortcut, ALL
        # keep the n vertices selected by sortcut() on the result of
        # prox_markov_dict() computed from all vertices
        n = int(args.gl)
        extract = prox_markov_dict(graph, range(graph.vcount()), 80, add_loops=True)
        vs = [ i for i,v in sortcut(extract,n)]
        print "vs", vs
        graph = graph.subgraph( vs )

    print graph.summary()

    # NOTE(review): this sets an edge attribute from a list whose length is
    # the vertex count - confirm that ecount() was not intended.
    graph.es['a'] = [ 1 for i in xrange(graph.vcount() ) ]

    if not bot.has_graph(gid) :
        print "\n * Create graph %s" % gid
        bot.create_graph(gid, { 'description':"Dicosyn experiment\n * ",
                                'image': "",
                                'tags': ['synonymes', 'dictionnaire']
                              }
                        )

    print "\n * Get schema '%s'" % gid
    schema = bot.get_schema(gid)['schema']
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    edgetypes = { e['name']:e for e in schema['edgetypes'] }

    print "\n nodetypes: ", nodetypes.keys()
    print "\n edgetypes: ", edgetypes.keys()

    # create the types that are missing from the schema
    if not "word" in nodetypes:
        print "\n\n * Creating node type %s" % "word"
        props = { "label" : Text(), "lang" : Text() }
        bot.post_nodetype(gid, "word", "no description", props)

    if not "is_syn" in edgetypes:
        print "\n\n * Creating edge type %s" % "is_syn"
        bot.post_edgetype(gid, "is_syn", "no desc", {"a":Text()})

    # read the schema again so that newly created types are included
    schema = bot.get_schema(gid)['schema']
    nodetypes = { n['name']:n for n in schema['nodetypes'] }
    edgetypes = { e['name']:e for e in schema['edgetypes'] }

    print nodetypes
    print edgetypes

    # label -> uuid of the posted node
    idx = {}

    if args.wait :
        raw_input("press <enter> key to start edges and nodes importation")

    if args.seed:

        def set_node(v):
            # post vertex v unless a node with the same label was already posted
            if v['label'] not in idx:
                node = bot.post_node(gid, node_payload(v, nodetypes['word']['uuid']))
                idx[ v['label'] ] = node['uuid']
                print "inserting %s %s" % (v['label'] , node['uuid'])

        idx = {}
        v1 = None
        # seeds grow into beautiful flowers
        # The local graph is consumed while it is uploaded: posted edges
        # are deleted, and so are vertices left without neighbours.  The
        # loop ends when no vertex remains.
        # NOTE(review): the nesting below was rebuilt from a source that
        # lost its indentation - check it against the original file.
        while graph.vcount() > 0:
            v1 = graph.vs[0] if v1 is None else v1
            size = graph.vcount()
            nei = v1.neighbors()
            if not len(nei):
                # isolated vertex: drop it and start again from vs[0]
                graph.delete_vertices([v1.index])
                v1 = None
                continue
            # post up to five edges around the current vertex
            for i in range( min([5,len(nei)]) ):
                nei = v1.neighbors()
                if i >= len(nei):
                    # fewer neighbours left than expected: jump to a random vertex
                    if graph.vcount():
                        r = randint(0,graph.vcount()-1)
                        v1 = graph.vs[r]
                    break
                r = randint(0,len(nei)-1)
                v2 = nei[r]
                print "inserting edge %s %s" % (v1['label'] , v2['label'])
                set_node(v1)
                set_node(v2)
                eid = graph.get_eid(v1.index, v2.index)
                src, tgt = idx[v1['label']], idx[v2['label']]
                uuid = bot.post_edge(gid, edge_payload(edgetypes['is_syn']['uuid'], src, tgt, {}))
                # delete from graph
                # * inserted edges
                # * nodes with no more edges
                graph.delete_edges([eid])
                delete_nodes = [ v.index for v in (v1, v2) if len(graph.neighbors(v)) == 0 ]
                if len(delete_nodes):
                    graph.delete_vertices(delete_nodes)
                    if graph.vcount():
                        r = randint(0,graph.vcount()-1)
                        # switch v1
                        v1 = graph.vs[r]
                    else:
                        break
                # wait sometimes
                pause(args.pause)
    else :
        print "posting nodes"
        count = 0
        fail = 0
        for node, uuid in bot.post_nodes( gid, gen_nodes(graph, nodetypes['word']['uuid']) ):
            if not uuid:
                fail += 1
            else :
                count += 1
                idx[node['properties']['label']] = uuid
        print "%s nodes inserted " % count

        #print "iterate over nodes"
        #for node in bot.find_all_nodes(gid, "word", {}):
        #pass

        # post edges
        print "posting edges"
        count = 0
        fail = 0
        # uuid -> label; not read below
        inv_idx = { v:k for k,v in idx.iteritems() }
        for obj, uuid in bot.post_edges( gid, gen_edges(graph, edgetypes['is_syn']['uuid'], idx) ):
            if not uuid:
                fail += 1
            else :
                count += 1
            # wait sometimes
            pause(args.pause)
        print "%s edges inserted, %s failed " % (count, fail)
def main():
    """Build a graph of the messages exchanged between JavaScript files.

    Every non-minified file given on the command line becomes a ``file``
    node, and every message name found in a ``listenTo(... "name"`` or
    ``trigger(... "name"`` call becomes a ``message`` node.  A ``listen`` or
    ``trigger`` edge links the file to the message.  The graph, its node
    types and its edge types are created on the server when the graph does
    not exist yet.

    Command line: ``--host``, ``--key``, ``--username``, ``--password``,
    ``--gid`` and one or more positional ``files``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    parser.add_argument("--key", action='store', help="key", default=None)
    parser.add_argument("--username", action='store', help="user", default=None)
    parser.add_argument("--password", action='store', help="pwd", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)
    parser.add_argument("files", action='store', default=None, nargs='+')
    args = parser.parse_args()

    # Separate the names; joining on "" glued all the paths together.
    print(" analysing {}".format(", ".join(args.files)))

    # Bot creation & login
    bot = Botagraph(args.host, args.key)
    if args.username and args.password:
        bot.authenticate(args.username, args.password)

    gid = args.gid
    if not bot.has_graph(gid):
        print("create graph %s" % gid)
        bot.create_graph(gid, "no description")

        # Log the name of each type that is actually created.
        props = {"label": Text()}
        for name in ("file", "message"):
            print("create node type %s" % name)
            bot.post_nodetype(gid, name, "no description", props)
        for name in ("listen", "trigger"):
            print("create edge type %s" % name)
            bot.post_edgetype(gid, name, "no desc", {})

    print("Get schema '%s'" % gid)
    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    nodetype_file = nodetypes['file']['uuid']
    nodetype_message = nodetypes['message']['uuid']

    # (compiled pattern, edge type uuid) pairs, compiled once instead of once
    # per line.  The order matters: for a line matching both, the listen edge
    # is recorded before the trigger edge.
    patterns = (
        (re.compile(r'listenTo\([^"]*"([a-z-]+)"'), edgetypes['listen']['uuid']),
        (re.compile(r'trigger\([^"]*"([a-z-]+)"'), edgetypes['trigger']['uuid']),
    )

    vs = {}  # label -> node description (files and messages share the keys)
    es = []  # (file label, edge type uuid, message label)
    for js_file in args.files:
        if js_file.endswith('min.js'):
            # ignore minified js
            continue
        with codecs.open(js_file, 'r', 'utf8') as lines:
            # Drop the leading "./" or "../" characters from the label.
            label = re.sub('^[./]+', '', js_file)
            vs[label] = {'label': label, 'nodetype': nodetype_file}
            for line in lines:
                # Remove a trailing "//" comment before matching.
                code = line.strip().split('//', 1)[0]
                for pattern, edgetype in patterns:
                    found = pattern.search(code)
                    if not found:
                        continue
                    message = found.group(1)
                    if message not in vs:
                        vs[message] = {'label': message, 'nodetype': nodetype_message}
                    es.append((label, edgetype, message))

    # Post nodes, remembering the uuid the server assigned to each label.
    idx = {}
    print("posting nodes")
    count = 0
    fail = 0
    for node, uuid in bot.post_nodes(gid, gen_nodes(vs)):
        if not uuid:
            fail += 1
        else:
            count += 1
            idx[node['properties']['label']] = uuid
    # The failure counter used to be computed but never shown.
    print("%s nodes inserted, %s failed " % (count, fail))

    # Post edges; gen_edges receives the label -> uuid index built above.
    print("posting edges")
    count = 0
    fail = 0
    for _, uuid in bot.post_edges(gid, gen_edges(es, idx)):
        if not uuid:
            fail += 1
        else:
            count += 1
    print("%s edges inserted, %s failed " % (count, fail))
def main():
    """Import the Game of Thrones character graph from a saved wiki page.

    Reads ``--host``, ``--key`` and ``--gid`` from the command line; when any
    of them is ``None`` the usage is printed and the function returns.  It
    then scrapes groups, characters and creatures from the HTML file
    ``./pages/Personnages_de_Game_of_Thrones``, creates the graph with its
    node and edge types when the graph does not exist, posts the nodes and
    the ``is_member_of`` / ``belongs_to`` edges through ``Botagraph``, and
    finally calls ``star_nodes`` with the endpoints of the posted edges.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", action='store', help="host", default="http://localhost:5000")
    parser.add_argument("--key", action='store', help="authentification token", default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)
    args = parser.parse_args()

    host, key, gid = (args.host, args.key, args.gid)
    # All three values are required; show the usage and stop otherwise.
    if None in (host, key, gid):
        parser.print_help()
        return

    # Setup schema
    from reliure.schema import Doc, Schema
    from reliure.types import Text, Numeric, Boolean, GenericType

    # NOTE(review): as written, replace(" ", "") removes every space from the
    # description, not only the surrounding padding -- confirm this is intended.
    desc = """ Game of thrones %s """.replace(" ", "")
    g_attrs = {
        'description': desc % gid,
        #'image': "https://commons.wikimedia.org/wiki/File:Game_of_Thrones_2011_logo.svg?uselang=fr",
        #'tags': ['social-network', 'game-of-thrones']
    }

    # Node schema used for groups: houses, wildlings ...
    group_type = Schema(
        **{
            'label': Text(),
            'url': Text(),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'color': Text(),
            'shape': Text(default=u"square"),
            "name": Text(),
        })

    # Node schema used for human-like characters
    character_type = Schema(
        **{
            'label': Text(),
            'url': Text(multi=True, uniq=True),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'shape': Text(default=u"circle"),
            'description': Text(),
            "name": Text(),
            "actor": Text(multi=True, uniq=True),
            "dubbling_vf": Text(multi=True, uniq=True),
            "bio_fr": Text(),
            "bio_en": Text(),
            "seasons": Text(),
            "dead": Boolean(default=False),
        })

    # Node schema used for creatures: dragons, wolves, white walkers?
    creatur_type = Schema(
        **{
            'label': Text(),
            'url': Text(),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'shape': Text(default=u"triangle"),
            'description': Text(),
            "name": Text(),
            "bio_fr": Text(),
            "bio_en": Text(),
            "seasons": Text(),
            "dead": Boolean(default=False),
        })

    # Edge type declarations.  This list is replaced further down by a
    # name -> schema-entry dict once the server schema has been fetched.
    edgetypes = [
        # Characters or Creatures -- rel --> Group
        # (name, desc , properties ),
        ("is_member_of", "Character is member of a Group", { "from_ep": Text(), }),
        ("is_child_of", "character or creatur is child of another one", {}),
        ("works_for", "character or creatur works for a character or a group", { "from_episode": Text(), "to_episode": Text() }),
        ("is_friend_of", "character is friend of another one", { "from_ep": Text(), }),
        ("married", "character meet another one", { "force": Numeric() }),
        ("belongs_to", "character or creatur belongs to another one", { "from_episode": Text(), "to_episode": Text() }),
        ("kill", "character or creatur kill another one", { "episode": Text(), "method": Text() }),
        #("have_sex", "character or creatur have sex another one", { "episode":Text()} ),
        #("rape", "character or creatur rape another one", { "episode":Text()} ),
        #("meet", "character meet another one", { "episode":Text()}),
        #("loves", "character meet another one", {} ),
    ]

    # Parsing of the saved wiki page
    from pyquery import PyQuery as pq
    import codecs

    root = "."
    path = "%s/pages/Personnages_de_Game_of_Thrones" % root
    graphmlz = "%s/got.graphml" % root  # not used anywhere in this function

    def opengot():
        """Read the page at ``path`` and return its pruned ``.mw-content-ltr`` selection."""
        # NOTE(review): the file object is never closed explicitly.
        html = codecs.open(path, mode='r', encoding="utf8").read()
        html = pq(html)
        html = html(".mw-content-ltr")
        # Drop what precedes the first h2, then that h2 itself; the selector is
        # evaluated again afterwards, so the last call presumably targets the
        # following h2 and drops everything after it -- confirm against pyquery.
        html(".mw-content-ltr h2:first").prevAll().remove()
        html(".mw-content-ltr h2:first").remove()
        html(".mw-content-ltr h2:first").nextAll().remove()
        # Strip edit links and <sup> elements.
        html('.mw-editsection').remove()
        html('sup').remove()
        html = pq(".mw-content-ltr", html)
        return html

    def as_doc(ctype, cdata):
        """Build a ``Doc`` of schema ``ctype`` from the dict ``cdata`` and return ``export()``.

        Fields whose schema type is exactly ``Text`` are stored with
        ``set_field(k, v, True)``; all other fields are assigned directly.
        """
        d = Doc(ctype)
        for k, v in cdata.iteritems():
            # Exact type comparison: subclasses of Text take the else branch.
            if type(ctype[k]) == Text:
                # presumably the third argument asks reliure to parse the value -- TODO confirm
                d.set_field(k, v, True)
            else:
                d[k] = v
        return d.export()

    def _parse_color(e):
        """Return the ``background`` value of the inline style of ``e``, or ``None``.

        ``"black"`` and ``"#000"`` are returned as ``"#000000"``.
        """
        color = None
        if "style" in e.attrib:
            # NOTE(review): split(':') has no maxsplit, so a declaration whose
            # value contains ':' gives a 3-item list and dict() raises
            # ValueError.  Keys and values are not stripped individually.
            styles = dict(
                pair.strip().split(':')
                for pair in pq(e).attr("style").strip().lower().split(';')
                if len(pair))
            color = styles.get("background", None)
        if color and color in ("black", "#000"):
            color = "#000000"
        return color

    def parse_belongs_legend(html):
        """Return a ``{colour: legend text}`` dict read from the legend lists.

        The ``li`` items are taken from the first four ``table td ul`` matches.
        As a side effect, everything before the first ``h3`` is removed from
        ``html``.
        """
        houses_map = {}
        legende = pq("li", pq("table td ul", html)[:4])
        for e in legende:
            # The colour comes from the first <span> of the legend item.
            color = _parse_color(pq("span", e)[0])
            text = pq(e).text()  #.replace("Maison ", "")
            houses_map[color] = text
        # remove the legend
        html(".mw-content-ltr h3:first").prevAll().remove()
        return houses_map

    def parse_creaturs_and_characters(html, houses):
        """Consume the ``h3`` sections of ``html`` from last to first.

        Returns ``(characters, rel_member_of, creaturs, rel_belongs)``: the
        two node lists hold ``as_doc`` exports, the two relation lists hold
        ``(name, group text)`` pairs where the group text is looked up in
        ``houses`` by header colour.  Each section is removed from ``html``
        once read, so the loop ends when no ``h3`` is left.
        """
        rel_belongs = []
        rel_member_of = []
        characters = []
        creaturs = []
        while True:
            # reading from end
            if len(html("h3:last")):
                ths = pq('th', html("h3:last").nextAll())
                tds = pq('td', html("h3:last").nextAll())
                title = html("h3:last").text()  # not used below
                color = None
                # flg is the position of the next plain cell within the
                # current table row.
                flg = 0
                # Character tables.
                # NOTE(review): a section without any <th> also satisfies
                # this test, since 0 % 5 == 0.
                if len(ths) % 5 == 0:
                    c = {}
                    member_of = []
                    for td in tds:
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6":
                            # table headers
                            color = _parse_color(td)
                            if color:
                                # raises KeyError when the colour is not in the legend
                                member_of.append(houses[color])
                            flg = 1
                        elif colspan == 0:
                            # table cells, in order: image, name, actor, dubbing, seasons
                            if flg == 1:
                                actor_img = pq("img", td).attr("src")
                                if actor_img:
                                    c['image'] = "http:%s" % actor_img
                            elif flg == 2:
                                name = pq(td).text()
                                c['name'] = name
                                for e in member_of:
                                    rel_member_of.append((name, e))
                            elif flg == 3:
                                c['actor'] = [
                                    pq(e).text() for e in pq("a", td)
                                ]
                            elif flg == 4:
                                c['dubbling_vf'] = [
                                    pq(e).text() for e in pq("a", td)
                                ]
                            elif flg == 5:
                                c['seasons'] = pq(td).text()
                                c['dead'] = u"✝" in pq(td).text()
                            flg += 1
                        elif colspan == "5":
                            # table bio cell: closes the current character
                            c['bio_fr'] = pq(td).text()
                            characters.append(as_doc(character_type, c))
                            # reset
                            c = {}
                            member_of = []
                            flg = 1
                # Creature tables: two header cells
                if len(ths) == 2:
                    c = {}
                    belongs = []
                    for td in tds:
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6":
                            color = _parse_color(td)
                            if color:
                                belongs.append(houses[color])
                            flg = 1
                        elif colspan == 0:
                            if flg == 1:
                                name = pq(td).text().strip()
                                c['name'] = name
                                for e in belongs:
                                    rel_belongs.append((name, e))
                                flg = 2
                            # NOTE(review): plain `if`, not `elif` -- the name
                            # cell appears to fall through here as well, and
                            # the next cell then overwrites "seasons".
                            if flg == 2:
                                c["seasons"] = pq(td).text()
                                # hard-coded; the dagger test is disabled
                                c["dead"] = True  # u"✝" in pq(td).text()
                        elif colspan == "5":
                            c['bio_fr'] = pq(td).text()
                            creaturs.append(as_doc(creatur_type, c))
                            c = {}
                            belongs = []
                            flg = 0
                # remove the section once parsed
                html("h3:last").nextAll().remove()
                html("h3:last").remove()
            else:
                break
        return characters, rel_member_of, creaturs, rel_belongs

    # In[ ]:
    from reliure.schema import Doc

    locations = []  # TODO

    html = opengot()
    houses_map = parse_belongs_legend(html)
    characters, rel_member_of, creaturs, rel_belongs = parse_creaturs_and_characters(
        html, houses_map)

    print "Groups ", len(houses_map)
    print "Creaturs ", len(creaturs)
    print "Characters ", len(characters)
    print "member_of", len(rel_member_of)
    print "belongs", len(rel_belongs)

    from botapi import Botagraph, BotApiError

    bot = Botagraph(host, key)
    # Graph and schema are created only when the graph does not exist yet.
    if not bot.has_graph(gid):
        print "\n * Creating graph %s" % gid
        bot.create_graph(gid, g_attrs)
        # NOTE(review): the formatted value is an empty string, so no type
        # name is printed.
        print "\n * Creating node type %s" % ""
        bot.post_nodetype(gid, "Character", "Character", character_type._fields)
        bot.post_nodetype(gid, "Creatur", "Creatur", creatur_type._fields)
        bot.post_nodetype(gid, "Group", "Group", group_type._fields)
        for name, desc, props in edgetypes:
            bot.post_edgetype(gid, name, desc, props)

    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    # From here on `edgetypes` is a name -> schema-entry dict.
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    idx = {}  # (label, uuid)
    # One Group node per legend entry; the legend colour becomes its 'color'.
    groups = []
    for k, v in houses_map.iteritems():
        g = as_doc(group_type, {'label': v, 'name': v, 'color': k})
        groups.append(g)

    for name, els in [("Character", characters), ("Creatur", creaturs),
                      ("Group", groups)]:
        print "Posting %s nodes %s" % (len(els), name)
        for c in els:
            payload = {
                'nodetype': nodetypes[name]['uuid'],
                'properties': {k: v for k, v in c.iteritems()}
            }
            # The label is always taken from the name.
            payload['properties']['label'] = payload['properties']['name']
            node = bot.post_node(gid, payload)
            # presumably the returned node exposes a top-level 'label' -- TODO confirm
            idx[node['label']] = node['uuid']

    vids = set()
    for name, rels in [("is_member_of", rel_member_of),
                       ("belongs_to", rel_belongs)]:
        print "Posting %s rels %s" % (len(rels), name)
        for src, tgt in rels:
            if src in idx and tgt in idx:
                # NOTE(review): "from_ep" is sent for both edge types, but
                # belongs_to was declared above with from_episode/to_episode.
                edge = {
                    'edgetype': edgetypes[name]['uuid'],
                    'source': idx[src],
                    'label': name,
                    'target': idx[tgt],
                    'properties': {
                        "from_ep": "",
                    }
                }
                uuid = bot.post_edge(gid, edge)  # return value is not used
                # NOTE(review): vids collects labels, not the uuids stored in
                # idx -- confirm that star_nodes accepts labels.
                vids.add(src)
                vids.add(tgt)
            else:
                # An endpoint was never posted: print the pair that is skipped.
                print src, tgt

    print "Starring %s nodes" % len(list(vids))
    bot.star_nodes(gid, list(vids))