Beispiel #1
0
def export_calc_engine(graphdb):
    """Build the "export" engine, which pushes a stored graph to a remote calc.

    :param graphdb: graph store handed to ``db_graph`` to resolve the request
    :returns: an ``Engine`` whose single ``export`` block reads ``request``
        and writes ``url``
    """

    def _export_calc(query, calc_id=None, **kwargs):
        """Serialise the requested graph as CSV and PUT it to the calc server.

        :param query: request passed to ``db_graph``
        :param calc_id: identifier of the target calc; nothing is sent when
            it is ``None``
        :returns: dict with the keys ``message``, ``gid`` and ``url``
        """
        # Identity test: ``None`` is a singleton, ``==`` may be overridden.
        if calc_id is None:
            return {'message': "No calc_id ", 'gid': calc_id, 'url': ""}

        query, graph = db_graph(graphdb, query)
        # Upload endpoint (note the extra "/_/" path segment).
        url = "http://calc.padagraph.io/_/cillex-%s" % calc_id
        # Single formatted argument so the line prints the same text whether
        # ``print`` is a statement or a function.
        print("_export_calc %s %s %s" % (query, calc_id, url))

        headers, rows = istex.graph_to_calc(graph)
        print("* PUT %s %s " % (url, len(rows)))

        # The HTTP response is not inspected: the success message below is
        # returned whatever the server answered.
        requests.put(url, data=istex.to_csv(headers, rows))
        # Address reported back to the caller (without the "/_/" segment).
        url = "http://calc.padagraph.io/cillex-%s" % calc_id

        return {'message': "Calc exported ", 'gid': calc_id, 'url': url}

    export = Optionable("export_calc")
    export._func = _export_calc
    export.add_option(
        "calc_id",
        Text(
            default=None,
            help=
            "identifiant du calc, le calc sera sauvegardé vers l’adresse http://calc.padagraph.io/cillex-{calc-id}"
        ))

    engine = Engine("export")
    engine.export.setup(in_name="request", out_name="url")
    engine.export.set(export)

    return engine
Beispiel #2
0
    def _build_result_set(self, v_extract):
        """Build the list of Doc objects from the retrieved vertices.

        :param v_extract: iterable of ``(vid, score)`` pairs, where ``vid``
            indexes ``self.graph.vs``
        :returns: list of ``Doc``, one per pair, in input order
        """
        global_graph = self.graph

        schema = Schema(docnum=Numeric(),
                        degree_out=Numeric(),
                        degree_in=Numeric(),
                        score=Numeric(vtype=float),
                        label=Text(),
                        neighbors=Numeric(multi=True, uniq=True))

        kdocs = []
        for vid, score in v_extract:
            kdoc = Doc(schema, docnum=vid)
            kdoc.score = score
            vtx = global_graph.vs[vid]
            # other attributes
            kdoc.degree_out = vtx.degree(ig.OUT)
            kdoc.degree_in = vtx.degree(ig.IN)
            kdoc.label = vtx['label']
            # neighbours are stored in a multi-valued field
            for nei in vtx.neighbors():  # TODO: distinguish IN/OUT?
                kdoc["neighbors"].add(nei.index)  # TODO: attach a weight

            kdocs.append(kdoc)
        return kdocs
Beispiel #3
0
 def __init__(self, name=None):
     """Set up the bipartite-to-unipartite projection component.

     Registers the ``proj_wgt`` option, which selects the projection
     weighting method (default ``'p'``).

     :param name: component name, forwarded to ``Optionable``
     """
     Optionable.__init__(self, name=name)
     weighting_methods = ['no', 'count', 'p', 'pmin', 'pmax', 'pavg', 'confl']
     proj_wgt = Text(
         default='p',
         help=u"projection weighting method",
         choices=weighting_methods,
     )
     self.add_option("proj_wgt", proj_wgt)
Beispiel #4
0
def import_calc_engine(graphdb):
    """Build the "import_calc" engine, which loads a graph from a remote calc.

    :param graphdb: graph store; the imported graph is saved in
        ``graphdb.graphs`` under the calc identifier
    :returns: an ``Engine`` whose ``import_calc`` block reads ``request`` and
        writes ``graph``
    """

    def _import_calc(query, calc_id=None, **kwargs):
        """Import the calc ``calc_id`` and return its article graph.

        :param query: request passed to ``db_graph``
        :param calc_id: identifier of the calc to import
        :returns: ``None`` when no ``calc_id`` is given, otherwise the result
            of ``graph_articles`` on the imported graph
        """
        # The resolved graph is discarded (it is replaced below); the call is
        # kept so any effect of ``db_graph`` still happens.
        query, graph = db_graph(graphdb, query)
        # Identity test: ``None`` is a singleton, ``==`` may be overridden.
        if calc_id is None:
            return None
        url = "http://calc.padagraph.io/cillex-%s" % calc_id
        graph = istex.pad_to_graph(calc_id, url)
        graph['meta']['pedigree'] = pedigree.compute(graph)
        graph['properties']['description'] = url
        graphdb.graphs[calc_id] = graph
        return graph_articles(calc_id, graph, cut=100)

    comp = Optionable("import_calc")
    comp._func = _import_calc
    comp.add_option(
        "calc_id",
        Text(
            default=None,
            help=
            "identifiant du calc,le calc sera importé depuis l'adresse http://calc.padagraph.io/cillex-{calc-id}"
        ))

    engine = Engine("import_calc")
    engine.import_calc.setup(in_name="request", out_name="graph")
    engine.import_calc.set(comp)

    return engine
Beispiel #5
0
def search_engine(graphdb):
    """Build the "search" engine that merges an RDF query result into a graph.

    :param graphdb: graph store handed to ``db_graph``
    :returns: an ``Engine`` whose ``search`` block reads ``request`` and
        writes ``graph``
    """
    engine = Engine("search")
    engine.search.setup(in_name="request", out_name="graph")

    def Search(query, **kwargs):
        """Query the RDF source for the ``URI`` option and merge the result.

        :param query: request passed to ``db_graph``; must provide the keys
            ``graph`` and ``nodes``
        :returns: the merged graph
        """
        query, graph = db_graph(graphdb, query)
        gid = query['graph']

        uri = kwargs.pop("URI")
        rdf_graph = query_rdf(gid, uri)
        merged = merge(gid, graph, rdf_graph)

        # The value is not used, but the lookup still fails with KeyError
        # when the request carries no "nodes" entry.
        query['nodes']
        return merged

    uri_option = Text(default=u"http://silene.magistry.fr/data/nan/sinogram/好")

    search = Optionable("RDFSearch")
    search._func = Search
    search.add_option("URI", uri_option)

    engine.search.set(search)
    return engine
Beispiel #6
0
    def __init__(self, index=None, field=None, size=3):
        """Set up the phrase-suggest component.

        :param index: stored as ``self.index``
        :param field: default value of the ``field`` option
        :param size: default value of the ``size`` option
        """
        super(ESPhraseSuggest, self).__init__()
        self.index = index

        field_option = Text(default=field, help="Suggestions field")
        self.add_option("field", field_option)

        size_option = Numeric(vtype=int, default=size, help="max suggestions")
        self.add_option("size", size_option)
Beispiel #7
0
 def __init__(self, name=None):
     """Set up the query-string builder and register its two options.

     ``operator`` chooses how terms are chained (``AND``/``OR``, default
     ``OR``); ``fields`` is a text option defaulting to ``_all``.

     :param name: component name, forwarded to the parent constructor
     """
     super(ESQueryStringBuilder, self).__init__(name=name)
     self.add_option(
         "operator",
         Text(choices=[
             u"AND",
             u"OR",
         ],
              default=u"OR",
              help=u"operator used for chaining terms"))
     # The help text below is a runtime string: its line breaks and leading
     # spaces are part of the value, so keep it untouched when reformatting.
     self.add_option(
         "fields",
         Text(default=u"_all",
              help=u"""List of fields 
         and the 'boosts' to associate with each of them. The format
         supported is "fieldOne^2.3 fieldTwo fieldThree^0.4", which indicates
         that fieldOne has a boost of 2.3, fieldTwo has the default boost, 
         and fieldThree has a boost of 0.4 ..."""))
Beispiel #8
0
 def __call__(self, docs):
     """Store a guessed language on every document.

     For each document, the values of ``self.text_fields`` are joined with
     newlines, passed to ``self.guess_language``, and the result is written
     to ``self.out_field``.

     :param docs: iterable of documents supporting ``in`` and item access
     :returns: the same ``docs`` object, modified in place
     """
     fields, target, guess = (self.text_fields, self.out_field,
                              self.guess_language)
     for document in docs:
         if target not in document:
             # Create the output field before it receives the result.
             document[target] = Text()
         joined = "\n".join(document[name] for name in fields)
         document[target] = guess(joined)
     return docs
Beispiel #9
0
def search_engine(graphdb):
    """Build the "search" engine that merges Istex query results into a graph.

    :param graphdb: graph store handed to ``db_graph``
    :returns: an ``Engine`` whose ``search`` block reads ``request`` and
        writes ``graph``
    """
    engine = Engine("search")
    engine.search.setup(in_name="request", out_name="graph")

    def Search(query, results_count=10, **kwargs):
        """Run the Istex query, merge it, and return the article graph.

        :param query: request passed to ``db_graph``; must provide the keys
            ``graph`` and ``nodes``
        :param results_count: forwarded to ``query_istex``
        :returns: the result of ``graph_articles`` on the merged graph
        """
        query, graph = db_graph(graphdb, query)
        gid = query['graph']

        terms = kwargs.pop("q", "*")
        field = kwargs.pop("field", None)

        found = query_istex(gid, terms, field, results_count)
        # NOTE(review): ``index`` and ``vid`` are not defined in this
        # function; presumably module-level names - confirm they exist.
        graph = merge(gid, graph, found, index=index, vid=vid)

        seeds = query['nodes']
        return graph_articles(gid,
                              graph,
                              weighting=["1"],
                              all_articles=True,
                              cut=100,
                              uuids=seeds,
                              **kwargs)

    field_choices = [u"*", u"istex", u"auteurs", u"refBibAuteurs", u"keywords"]

    search = Optionable("IstexSearch")
    search._func = Search
    search.add_option("q", Text(default=u"clle erss"))
    search.add_option("field", Text(choices=field_choices, default=u"*"))
    search.add_option(
        "results_count",
        Numeric(vtype=int, min=1, default=10, help="Istex results count"))

    engine.search.set(search)
    return engine
Beispiel #10
0
def expand_prox_engine(graphdb):
    """
    prox with weights and filters on UNodes and UEdges types

    input:  {
                nodes : [ uuid, .. ],  //more complex p0 distribution
                weights: [float, ..], //list of weight
            }
    output: {
                graph : gid,
                scores : [ (uuid_node, score ), .. ]
            }
    """
    engine = Engine("scores")
    engine.scores.setup(in_name="request", out_name="scores")

    def expand(query, step=3, limit=100, filter_nodes=None, filter_edges=None):
        """Delegate to ``graphdb.proxemie`` with the request's seeds.

        ``None`` filters are replaced by fresh empty lists.
        """
        filter_nodes = [] if filter_nodes is None else filter_nodes
        filter_edges = [] if filter_edges is None else filter_edges

        gid = query.get("graph")
        seeds = query.get("nodes")
        seed_weights = query.get("weights", [])

        return graphdb.proxemie(gid,
                                seeds,
                                seed_weights,
                                filter_edges=filter_edges,
                                filter_nodes=filter_nodes,
                                limit=limit,
                                n_step=step)

    scores = Optionable("scores")
    scores._func = Composable(expand)
    scores.add_option("step", Numeric(vtype=int, default=3))
    scores.add_option("limit", Numeric(vtype=int, default=50, max=100))
    for filter_name in ("filter_nodes", "filter_edges"):
        scores.add_option(filter_name,
                          Text(default=set(), multi=True, uniq=True))

    # NOTE(review): the raw ``expand`` function is registered, not the
    # ``scores`` component configured above - confirm this is intended.
    engine.scores.set(expand)

    return engine
Beispiel #11
0
    def __init__(self,
                 directed=False,
                 reflexive=True,
                 label_attr='form',
                 vtx_attr='docnum',
                 links_attr="out_links"):
        """Set up the graph builder, its options and its declared attributes.

        :param directed: forwarded to ``OptionableGraphBuilder``
        :param reflexive: stored as ``self.reflexive``
        :param label_attr: default value of the ``label_attr`` option
        :param vtx_attr: default value of the ``vtx_attr`` option
        :param links_attr: default value of the ``links_attr`` option
        """
        # Optionable init: forward the caller's ``directed`` choice instead of
        # a hard-coded False, so the argument is actually honoured.
        OptionableGraphBuilder.__init__(self,
                                        "GraphBuilder",
                                        directed=directed)
        self.reflexive = reflexive

        self.add_option("label_attr", Text(default=label_attr))
        self.add_option("vtx_attr", Text(default=vtx_attr))
        self.add_option("links_attr", Text(default=links_attr))

        # Graph builder init
        vattrs = ("_doc", "rank", "pzero", "docnum", "graph", "lang", "pos",
                  "form", "score", "neighbors")
        eattrs = ("weight", )

        # Explicit loops: ``map`` used only for its side effects declares
        # nothing when ``map`` is lazy (Python 3).
        for vattr in vattrs:
            self.declare_vattr(vattr)
        for eattr in eattrs:
            self.declare_eattr(eattr)
Beispiel #12
0
def graph_engine(graphdb):
    """Build the "graph" engine offering two components: Graph and ResetGraph.

    :param graphdb: graph store handed to ``db_graph``; ``ResetGraph`` also
        writes the new empty graph into ``graphdb.graphs``
    :returns: an ``Engine`` whose ``graph`` block reads ``request`` and
        writes ``graph``
    """
    engine = Engine("graph")
    engine.graph.setup(in_name="request", out_name="graph")

    def _global(query, reset=False, all_articles=False, cut=100, **kwargs):
        """Return the article graph for the request's nodes.

        With ``reset`` true the node list is emptied instead of being read
        from the request. Remaining options go to ``graph_articles``.
        """
        gid = query['graph']
        query, graph = db_graph(graphdb, query)
        if reset:
            seeds = []
        else:
            seeds = query['nodes']
        return graph_articles(gid,
                              graph,
                              all_articles=all_articles,
                              cut=cut,
                              uuids=seeds,
                              **kwargs)

    def _reset_global(query, **kwargs):
        """Replace the stored graph with an empty one and return its view."""
        gid = query['graph']
        fresh = empty_graph(gid, istex.get_schema(), **kwargs)
        graphdb.graphs[gid] = fresh
        return graph_articles(gid, fresh, all_articles=True, uuids=[], **kwargs)

    weighting_choices = [
        u"0", u"1", u"weight", u"auteurs", u"refBibAuteurs", u"keywords",
        u"categories"
    ]

    comp = Optionable("Graph")
    comp._func = _global
    comp.add_option("reset", Boolean(default=False, help="reset or add"))
    comp.add_option("all_articles",
                    Boolean(default=False, help="includes all articles"))
    comp.add_option(
        "weighting",
        Text(choices=weighting_choices,
             multi=True,
             default=u"1",
             help="ponderation"))
    comp.add_option("length", Numeric(vtype=int, min=1, default=3))
    comp.add_option("cut", Numeric(vtype=int, min=2, default=100))

    reset = Optionable('ResetGraph')
    reset._func = _reset_global
    reset.add_option("reset", Boolean(default=True, help=""), hidden=True)

    engine.graph.set(comp, reset)
    return engine
Beispiel #13
0
def expand_prox_engine(graphdb):
    """
    prox with weights and filters on UNodes and UEdges types

    input:  {
                nodes : [ uuid, .. ],  //more complex p0 distribution
                weights: [float, ..], //list of weight
            }
    output: {
                graph : gid,
                scores : [ (uuid_node, score ), .. ]
            }
    """
    engine = Engine("scores")
    engine.scores.setup(in_name="request", out_name="scores")

    def expand(query, length=3, cut=300, weightings=None):
        """Score the vertices reached from the request and key them by uuid.

        :returns: dict mapping vertex ``uuid`` to its score
        """
        # NOTE(review): the ``db_graph`` result is used directly as a graph
        # (``graph.vs``) - confirm it does not return a (query, graph) pair.
        graph = db_graph(graphdb, query)
        seeds = query.get("nodes", [])
        to_expand = query.get("expand", [])

        ranked = expand_subgraph(graph,
                                 to_expand,
                                 seeds,
                                 cut=cut,
                                 weightings=weightings)
        # Swap each vertex index for the vertex's uuid.
        return dict((graph.vs[pair[0]]['uuid'], pair[1]) for pair in ranked)

    scores = Optionable("scores")
    scores._func = Composable(expand)
    scores.add_option("length", Numeric(vtype=int, default=3))
    scores.add_option("cut", Numeric(vtype=int, default=50, max=300))
    # NOTE(review): the option is named "weighting" while ``expand`` takes
    # ``weightings`` - confirm which name is expected.
    weighting_option = Text(choices=[u"0", u"1", u"weight"],
                            multi=True,
                            default=u"1",
                            help="ponderation")
    scores.add_option("weighting", weighting_option)

    # NOTE(review): the raw ``expand`` function is registered, not the
    # ``scores`` component configured above - confirm this is intended.
    engine.scores.set(expand)

    return engine
Beispiel #14
0
 def __init__(self, vtx_attr, role=None, name=None):
     """Set up the labelling component.

     :attr vtx_attr: the vertex attribute to use as label string
     :type vtx_attr: str
     :attr role: the role of the created vertices
     :type role: str
     :attr name: the name of the component
     :type name: str
     """
     super(TypeFalseLabel, self).__init__(name=name)
     self.vtx_attr = vtx_attr
     self.role = role

     scoring_methods = [u"recall", u"precision"]
     score_option = Text(default=u"recall",
                         choices=scoring_methods,
                         help="Label scoring method")
     self.add_option("score", score_option)
Beispiel #15
0
 def __init__(self,
              index=None,
              doc_type=None,
              host="localhost:9200",
              name=None):
     """Connect to Elasticsearch and register the search options.

     :param index: index name
     :param doc_type: document type to search; a single string is stored
         as-is, a non-empty collection becomes a ``doc_type`` option, and
         ``None`` means the types are read from the index mapping
     :param host: ES hostname
     :param name: component name
     :raises RuntimeError: when the server does not answer a ping
     """
     super(ESSearch, self).__init__(name=name)
     size_option = Numeric(vtype=int,
                           default=10,
                           min=0,
                           help="number of document to returns")
     self.add_option("size", size_option)

     # Open the connection and fail early if the server is unreachable.
     self.host = host
     self._es_conn = elasticsearch.Elasticsearch(hosts=self.host)
     if not self._es_conn.ping():
         raise RuntimeError("Couldn't ping ES server at '%s'" % self.host)
     self.index = index

     # Stays None unless exactly one doc type was given as a string.
     self.doc_type = None
     if isinstance(doc_type, basestring):
         self.doc_type = doc_type
         return

     if doc_type is None:
         # No type given: take every type declared in the index mapping.
         mappings = self._es_conn.indices.get_mapping(index=self.index)
         doc_type = mappings[self.index]['mappings'].keys()

     # An empty collection adds no option and leaves ``doc_type`` unset.
     if len(doc_type):
         type_option = Text(multi=True,
                            choices=doc_type,
                            default=doc_type,
                            help="Documents type")
         self.add_option("doc_type", type_option)
Beispiel #16
0
def clusters_labels_engine(graphdb):
    """Build the "labels" engine that proposes labels for each cluster.

    :param graphdb: graph store handed to ``db_graph``
    :returns: an ``Engine`` whose ``labels`` block reads ``request`` and
        writes ``labels``
    """

    def _labels(query, weighting=None, count=2, **kwargs):
        """Return, for every cluster of the request, up to ``count`` labels.

        Article vertices of a cluster seed ``extract``; the non-article
        vertices it returns become the labels, in the order returned.

        :returns: list with one label list per cluster (empty when the
            cluster holds no article vertex)
        """
        query, graph = db_graph(graphdb, query)
        article_type = "_%s_article" % query['graph']

        def describe(index, score):
            vertex = graph.vs[index]
            return {
                'uuid': vertex['uuid'],
                'label': vertex['properties']['label'],
                'score': score
            }

        per_cluster = []
        for members in query['clusters']:
            found = []
            seeds = [
                vtx.index for vtx in graph.vs.select(uuid_in=members)
                if vtx['nodetype'] == article_type
            ]
            if len(seeds):
                ranked = extract(graph,
                                 seeds,
                                 cut=300,
                                 weighting=weighting,
                                 length=3)
                candidates = [
                    describe(i, v) for i, v in ranked
                    if graph.vs[i]['nodetype'] != article_type
                ]
                found = candidates[:count]
            per_cluster.append(found)
        return per_cluster

    weighting_choices = [
        u"0", u"1", u"weight", u"auteurs", u"refBibAuteurs", u"keywords",
        u"categories"
    ]

    comp = Optionable("labels")
    comp._func = _labels
    comp.add_option(
        "weighting",
        Text(choices=weighting_choices,
             multi=True,
             default=u"1",
             help="ponderation"))
    comp.add_option("count", Numeric(vtype=int, min=1, default=2))

    engine = Engine("labels")
    engine.labels.setup(in_name="request", out_name="labels")
    engine.labels.set(comp)

    return engine
Beispiel #17
0
    def __init__(self,
                 global_graph,
                 attr_list,
                 default_attr,
                 case_sensitive=True,
                 name=None):
        """
        :attr global_graph: the graph to search vertices in
        :attr attr_list: list of the vtx attributes used to identify vertices
        :attr default_attr: the one used by default (should be in `attr_list`)
        :attr case_sensitive: is the search case_sensitive
        """
        super(VtxMatch, self).__init__(name=name)
        default_option = Text(default=default_attr,
                              choices=attr_list,
                              help="default search attribute")
        self.add_option("default_attr", default_option)

        self.global_graph = global_graph
        self._vattr_list = attr_list
        self._index = {}
        self._case_sensitive = case_sensitive

        # One index per attribute: attribute value -> list of vertex indices.
        for attr in attr_list:
            by_value = self._index[attr] = {}
            for vtx in global_graph.vs:
                # Lower-case the key when the search ignores case.
                if self._case_sensitive:
                    key = vtx[attr]
                else:
                    key = vtx[attr].lower()
                by_value.setdefault(key, []).append(vtx.index)
from botapi import Botagraph, BotApiError
from reliure.types import Text

# Lightweight records describing a node type and an edge type:
# (name, description, properties).
NodeType = namedtuple("NodeType", "name description properties")
EdgeType = namedtuple("EdgeType", "name description properties")

# Graph Definition

PDG_HOST = "http://g0v-tw.padagraph.io"
# API key passed to Botagraph; left empty in this file.
PDG_KEY = ""
GRAPHNAME = "G0V Who Writes What"
DESCRIPTION = "a graph or pads and authors"
TAGS = ["pads", "g0v-tw"]


# Node types: both carry the text properties "id" and "label".
NodePad = NodeType("pad","", {"id": Text(),
                            "label": Text()})

NodeAuthor = NodeType("author", "", {"id": Text(), "label": Text()})

# Edge type linking the two node types; it has no properties.
EdgeLink = EdgeType("writes", "", {}) 



# NOTE: these two statements run when the module is loaded and call the
# remote service through the bot.
bot = Botagraph(PDG_HOST, PDG_KEY)
bot.create_graph(GRAPHNAME, {'description': DESCRIPTION, "tags": TAGS, "image":"https://avatars3.githubusercontent.com/u/2668086?v=3&s=200"})


# Posting Nodes and Edges Types

# Starts empty; presumably filled with the uuids of the posted node types -
# the code doing so is not visible here.
nodetypes_uuids = {}
from reliure.types import Text

# Lightweight records describing a node type and an edge type:
# (name, description, properties).
NodeType = namedtuple("NodeType", "name description properties")
EdgeType = namedtuple("EdgeType", "name description properties")

# Graph definition
PDG_HOST = "http://g0v-tw.padagraph.io"
# API key; left empty in this file.
PDG_KEY = ""
GRAPHNAME = "vTaiwan x AirBnB"
DESCRIPTION = "Opinion Graph about AirBnb in Taiwan"
TAGS = ["vTaiwan", "airBnB", "pol.is"]

# NOTE(review): presumably the number of comments in the dataset - its use
# is not visible here.
N_COMMENTS = 227

# The two comment node types share the same four text properties and differ
# only in name and description.
NodeAgreedComment = NodeType("AgreedComment",
                             "a comment participants agreed with", {
                                 "id": Text(),
                                 "body": Text(),
                                 "label": Text(),
                                 "shape": Text()
                             })

NodeDisagreedComment = NodeType("DisagreedComment",
                                "a comment participants disagreed with", {
                                    "id": Text(),
                                    "body": Text(),
                                    "label": Text(),
                                    "shape": Text()
                                })

NodeUser = NodeType("Participant", "", {
    "id": Text(),
Beispiel #20
0
    def parse(self, bot):
        
        # 1 2 3 4 5 6  9 10 11 12 13 14 15 16
        self.completions = []
        gid = self.gid
        path = self.path

        KEYS =  [ "id","num", "prefix", "subscript", "superscript", "vocable"]
        
        WEIGHT_LOC = 1
        WEIGHT_INC_DEF = 0
        WEIGHT_INC_FORM = 0

        idx = {} 
        
        nodes = {}
        edges = []
        nodetypes = {}
        edgetypes = {}
        

        bot.create_graph(gid, { 'name': gid,
                            'description': "",
                            'image': "",
                            'tags': [""]
                          }
                    );

        lexie_props = {
        
            'rlfid'   : Text(),
            'entry_id'   : Text(),
            'id'      : Text(),
            'label'   : Text(),
            'num'     : Text(),
            'vocable' : Text(),
            'prefix'  : Text(),
            'subscript' : Text(),
            'superscript' : Text(),

            'gc' : Text(), # json
            'df' : Text(), # json
            'examples' : Text(), # json
            'lfs' : Text(), # json
            
        }
        nodetypes["Lexie"] = bot.post_nodetype(gid, "Lexie", "", lexie_props) 

        # noeuds lexicaux du graphe
        nodes = { e['id']: e for e in readcsv(path, "01-lsnodes.csv") }
        entries =  { e['id']: e for e in readcsv( path, "02-lsentries.csv") } 

        for k,v in entries.items() : 
            i = v.pop('id')
            entries[k]['entry_id'] = i 


        def as_token(nid, form, actants ):
            dic = dict(zip(KEYS, [ "" for e in KEYS ]))
            
            if nid : 
                node = nodes.get(nid, None)
                if node:
                    values =  [ node[k] for k in KEYS ]
                    dic = dict(zip(KEYS, values))
                    
            if form and len(form):
                # conversion des variables d actants
                _form = form
                if len(actants):
                    for k,v,i in actants :
                        _form = _form.replace(k,v)
                    
                dic['vocable'] = _form
                        
            return dic
                
        
        for node in nodes.values():
            entry = entries[node['entry']]
            node.update(entry)
            node.update({
                'rlfid' : node['id'],
                'id' : node['id'],
                'label' : node['name'],
                
                'vocable' : node['name'],
                'prefix' : node['addtoname'],
                'num' : node['lexnum'],
                'subscript' : node['subscript'],
                'superscript' : node['superscript'],

                'label_form' : None,

                'gc' : {},

                'lfs': [],
                'df': {
                    'form' : node['name'],
                    'actants' : [],
                    'left_pf_form': '',
                    'right_pf_form': '',
                    'html' : '',
                  },
                'examples' : [],

                'definiens' : None,
                'formatted_definiens' : None,
            })

            to_delete = ('%', 'entry', 'lexnum', 'addtoname')
            for k in to_delete : del node[k]



        # DF

        # 09-lssemlabel-model.xml 
        # 10-lssemlabel-rel.csv
        # 11-lspropform-rel.csv
        # 17-lsdef.csv

        handler = SemLabelHandler()
        semlabels = handler.parse("%s/09-lssemlabel-model.xml" % path)
        rels = readcsv(path, "10-lssemlabel-rel.csv", type=list)

        for sense, label, percent in rels:
            df = nodes[sense]['df']
            df['label_form'] = semlabels[label]
            df['percent'] = percent
            
        rels = readcsv(path, "11-lspropform-rel.csv", type=list)
        for id, propform, tildevalue, percent, actantslist in rels:
            df = nodes[id]['df']
            df['propform'] = propform
            df['tildevalue'] = tildevalue
            df['percent'] = percent

            actants = actantslist if actantslist else "()";
            # [ "$1", "X" , 1 ]
            actants = [ "%s=%s"%(e,i+1) for i,e in enumerate(actants[1:-1].split(',')) if len(e)];
            actants = list(map(lambda e: e.split('=') , actants))
            
            df['actants']     = actants
            df['actantslist'] = actantslist

        # liens d inclusion definitionnelle
        l_inc_def = {}

        for r in readcsv(path, "17-lsdef.csv", type=list):
            id,	def_XML, def_HTML = r
            if id in nodes:
                df = nodes[id]['df']
                df['xml'] = def_XML

                soup = BeautifulSoup(def_XML, 'html.parser')
                rlfids = [ (st.attrs['sense'],st.attrs.get('sem',WEIGHT_INC_DEF)) for st in soup("st") if st.attrs.get('sense') ]
                
                soup = BeautifulSoup(def_HTML, 'html.parser')
                # ~ rlfids = [a.attrs['href'].split('/')[1] for a in  soup("a")]
                l_inc_def[id] = rlfids
                df['html'] = soup.body.prettify()

            else :
                self.error( " # 17-lsdef # no def for %s" % id )



        # GC + PH LOCUTIONS

        """
        gc : {
            usagenote : [],
            othergc : [],
            pos : {},
            
            locution : {
                locution_tokens	: [ {
                    id  : "41142",
                    num	: "I.1b",
                    prefix : "",
                    subscript : "",	
                    superscript	: "",	
                    vocable	monter }
                ],
                name :	locution verbale
                type :	2
            }

        }
        """
        
        handler = GramCharacHandler()
        pos = handler.parse("%s/05-lsgramcharac-model.xml" % path)
        
        rels = readcsv(path, "06-lsgramcharac-rel.csv", type=list)
        l_inc_form = {}

        print( "id, usagenote, usagenotevars, POS, phraseolstruc, embededlex, othercharac, othercharacvars" )
        for r in rels:
            id, usagenote, usagenotevars, POS, phraseolstruc, embededlex, othercharac, othercharacvars = r
            if POS == "":
                error( " # 06-lsgramcharac-rel : missing POS %s for id %s  : %s" % (POS, id,r) )
                continue
                
            # GC Caractéristiques grammaticales


            # variable gc
            split = lambda chaine : ([] if len(chaine) <= 2 else chaine[1:-1].split(',') )
            
            def splitvars(chaine):
                for e in '()':
                    chaine = chaine.replace(e, "")
                return [ e for e in chaine.split(',') if len(e)]
            
            node = nodes[id]   
            
            othercharac =  [ pos[e]['name']  for e in split(othercharac)  ]
            othercharacvars = splitvars(othercharacvars)
            for i,e in enumerate(othercharac):
                for j,v in enumerate(othercharacvars):
                    e = e.replace('%%%s'% (j+1), v )
                othercharac[i] = e
                
            
            usagenote =  [ pos[e]['name']  for e in split(usagenote )]
            
            gc = {}
            gc['usagenote'] = usagenote # fem ..
            gc['othergc']  = othercharac  
            gc['locution'] = None
            gc['pos'] = {
                            'name' : pos[POS]['name'],
                            'type' : pos[POS]['type']
                         }
            
            # LN Locutions nominales, prepositionnelles, phrases
            if len( embededlex ):
                #embededlex = re.findall( "[0-9]+", embededlex)
                #embededlex = embededlex[1:-1].split(',')
                _embededlex = embededlex.replace('),(', ');(')
                _embededlex = [ e  for e  in  _embededlex[1:-1].split(';') ]
                _embededlex = [ e[1:-1].split(',') for e in _embededlex ]
                actants =  node['df']['actants']
                tokens = [ as_token(_id,form, actants) for _id,form in _embededlex  ]
                gc['locution'] =  {
                    'tokens' : tokens,
                    'name' : pos[POS]['name'],
                    'type' : pos[POS]['type']
                }
                for t in tokens:
                    tid = t['id']
                    if tid and len( tid): 
                        l_inc_form[tid] = l_inc_form.get(tid, []) + [id]
                    
            if "$" in embededlex :
                actants =  node['df']['actants']
                z= [ e['vocable'] for e in  [ as_token(id,form, actants) for id,form in _embededlex ]]
                                
            node['gc'] = gc
        
        self.todo(  " 06-lsgramcharac-rel.csv : TODO POST LOCUTIONS" )        

        for r in rels:
            id, usagenote, usagenotevars, POS, phraseolstruc, embededlex, othercharac, othercharacvars = r
            """
            node = nodes[r['id']]

                TODO Locutions links !!
        
            """
            pass
        
                
        # Nodes Exemples
        
        handler = ExempleSourceHandler()
        sources = handler.parse("%s/14-lsexsource-model.xml" % path)
        exemples = { e['id']: e for e in readcsv(path, "15-lsex.csv") }
        
        for e in exemples.values() :
            d,m,y  = ("///%s" % e['date']).split('/')[-3:]
            e['source'] = sources[e['source']]
            e['date_day'] = d
            e['date_month'] = m
            e['date_year'] = y
            e['text'] = e['content']
            e['authors'] =  [  {'first_name':v.split(',')[0],
                                'last_name':v.split(',')[1] if len(v.split(',')) > 1 else '' }
                              for v in ("%s"%e['authors']).split(';')[:2] if len(e['authors'])]
            del e['content']
            
        rels = readcsv(path, "16-lsex-rel.csv")
        for e in rels:
            nid, exid, oc, po = ( e['id'],e['example'],e['occurrence'] ,e['position'] )
            
            node = nodes[nid]
            example = dict(exemples[exid])
            occurrences =  [  {'first':v.split(',')[0], 'last':v.split(',')[1]}
                              for v in oc.split(';') if len(v)]
                                      
            example.update({ 'occurrences': occurrences,
                             'position'   : po })
            node['examples'].append( example )


        # POST Nodes vertex
        self.info( "\n * POSTING Lexie nodes : %s" % (len(nodes.values())) )
        
        def gen(nodes):
            jsons = ( 'df', 'gc', 'examples', 'locutions' )
            for node in nodes :
                properties = {
                    k : node[k] if k not in jsons else json.dumps(node[k]) for k in lexie_props
                }
                yield {
                        'nodetype': nodetypes['Lexie']['uuid'],
                        'properties': properties
                      }
        
        for node, uuid in bot.post_nodes( gid, gen(nodes.values()), key='rlfid' ):
            idx[ node['properties']['rlfid'] ] = uuid
            r = list( node['properties'][k] for k in ['entry_id', 'rlfid', 'vocable','num','prefix','subscript','superscript'])
            self.completions.append( [uuid] + r )

        self.info( " * POST    Lexie nodes : %s" % (len(idx)) )
            
        
        
        # Relations / edges
        
        """
        ## Liens d inclusion définitionnelle 
        
        17-lsdef.csv 
        """

        
        self.info( "17-lsdef.csv [POST] Liens d inclusion définitionnelle " )

        name = "DefinitionalInclusion" 
        properties = { "weight": Text() }
        edgetypes[name] = bot.post_edgetype(gid, name, name, properties)
        
        info( " * POST edgetype : %s %s" % (name, edgetypes[name]['uuid']) )

        edges = []
        skipped_weight = 0
        
        
        _nodes = { e['rlfid'] : e for e in nodes.values() }
        
        for source, targets in l_inc_def.items():
            print (source, targets)

            for target, weight in targets:
                payload = {
                            'edgetype': edgetypes[name]['uuid'],
                            'source': idx[source],
                            'target': idx[target],
                            'properties': {
                                    'weight' : weight,
                            } 
                        }
                edges.append(payload)


        
        self.info( "17-lsdef.csv [POST] Liens d'inclusion formelle " )

        name = "FormalInclusion" 
        properties = { "weight": Text() }
        edgetypes[name] = bot.post_edgetype(gid, name, name, properties)

        for source, targets in l_inc_form.items():
            weight = WEIGHT_INC_FORM
            
            for target in targets:
                payload = {
                            'edgetype': edgetypes[name]['uuid'],
                            'source': idx[source],
                            'target': idx[target],
                            'properties': {
                                    'weight' : weight,
                            } 
                        }
                edges.append(payload)

        for e in bot.post_edges(gid, iter(edges), lambda e: e['edgetype'] ) : 
            pass
        


        """
        ## Liens de co-polysémie
        
        03-lscopolysemy-model.xml
        04-lscopolysemy-rel.csv
        """

        handler = CopolysemyHandler()
        copo = handler.parse("%s/03-lscopolysemy-model.xml" % path)
        copo = { e['id']: e for e in copo }
        _name =  lambda t,s : "Co-polysemy/%s%s%s" % (t['name'], "/" if s else "", s['name']if s else "" )

        def weight_copo(typ):
            return int(copo[typ]['semantics'])
        
        # edgetypes
        self.info( " * POSTING Co-polysemy edgetypes : %s" % (len(copo.values())) )
        for cop in copo.values():
            tp = cop['id'] # cop['name']
            name = _name(cop, None)
            desc = ""
            properties = { "weight": Text(), 'i':Text() }

            edgetypes[name] = bot.post_edgetype(gid, name, desc, properties)
            info( " * POST edgetype : %s %s" % (name, edgetypes[name]['uuid']) )

            for k,v in cop['subtypes'].items() :
                name =  _name(cop, v)
                edgetypes[name] = bot.post_edgetype(gid, name, desc, properties)
                info( " * POST edgetype : %s %s" % (name, edgetypes[name]['uuid']) )

        rels = readcsv(path, "04-lscopolysemy-rel.csv", type=list)

        edges = []; count = 0
        for i, r in enumerate(rels):
            src, tgt, typ, subtype = r
            t = copo[typ]
            s = copo[typ]['subtypes'].get(subtype, None) if len(subtype) else None
            
            if len(subtype) and (copo[typ]['subtypes'].get(subtype, None) is None):
                self.error ( " # 04-lscopolysemy-rel # no subtype %s in type %s (line %s ) %s" \
                        % ( subtype , typ, i+2, r ))
            
            count +=1
            payload = {
                        'edgetype': edgetypes[_name(t,s) ]['uuid'],
                        'source': idx[src],
                        'target': idx[tgt],
                        'properties': { 'weight' : weight_copo(typ),
                                        'i': count
                                      } 
                    }
            edges.append(payload)

        self.info("\n *  POSTING Co polysemy edges : %s" % len(edges) ) 
        for cop in copo.values():
            name =  _name(cop, None)
            self.debug( "    edges : %s %s" % (len( [ e for e in edges if e['edgetype'] == edgetypes[name]['uuid']]), name ) )
            for k,v in cop['subtypes'].items() :
                name =  _name(cop, v)
                self.debug( "    edges : %s %s" % (len( [ e for e in edges if e['edgetype'] == edgetypes[name]['uuid']]), name ) )
        
        for e in bot.post_edges(gid, iter(edges), extra= lambda e: e['properties']['i'] ) : 
            pass


        """
        ## Liens de fonctions lexicales (FL)

        12-lslf-model.xml contient le modèle hiérarchique des FL : chaque FL appartient à une
        « famille » et chaque famille est elle-même élément d’un « groupe » de familles ;
        13-lslf-rel.csv contient l’ensemble de liens de FL entre lexies individuelles.
        """

        handler = LexicalFunctionHandler()
        flex = handler.parse("%s/12-lslf-model.xml" % path)

        # POST edgetypes
        
        self.info( " * POSTING Lexical Function edgetypes : %s" % (len(flex.values())) )
        _name = lambda x: "LexicalFunction/%s" % x['name']
        for fl in flex.values():
            tp = fl['id'] # cop['name']
            name = _name(fl)
            desc = ""
            properties = {
                        "weight": Text(),
                        'form': Text(),
                        'separator': Text(),
                        'merged':Text(),
                        'syntacticframe':Text(),
                        'constraint': Text(),
                        'position': Text()
                        }
            attributes = { "order" : fl['order'],
                           "cdata" : fl['cdata'], }
            
            edgetypes[name] = bot.post_edgetype(gid, name, desc, properties, attributes)
            self.debug( " * POST edgetype : %s %s" % (name, edgetypes[name]['uuid']) )

        # POST Edges

        # LF
        rels = readcsv(path, "13-lslf-rel.csv", type=list)
        edges = []
        skipped_weight = 0
        for source, lf, target, form, separator, merged, syntacticframe, constraint, position in rels:

            weight = int(flex[lf]['semantics'])
            payload = {
                        'edgetype': edgetypes[_name(flex[lf])]['uuid'],
                        'source': idx[source],
                        'target': idx[target],
                        'properties': {
                                'weight' : weight,
                                'form': form,
                                'separator': separator,
                                'merged':merged,
                                'syntacticframe':syntacticframe,
                                'constraint': constraint,
                                'position': position
                            } 
                    }
            edges.append(payload)

        weights = list( len([ e for e in edges if e['properties']['weight'] == i  ]) for i in [0,1,2] )
        self.info(' !! weights  [ 0 : %s,  1 : %s, 2 : %s ] ' %  tuple(weights))
        
        self.info(" * POSTING Lexical Function edges : %s" % len(edges) ) 

        for fl in flex.values():
            name = _name(fl)
            self.debug( "    edges : %s %s" % (len( [ e for e in edges if e['edgetype'] == edgetypes[name]['uuid']  ] ), name, ) )
        count = 0; uuids = []

        for e, uuid in bot.post_edges(gid, iter(edges) , extra=lambda e : e['edgetype']) : 
            count +=1
            uuids.append(uuid)
            
        self.info(" * POST    Lexical Function edges : %s " % (count) ) 

        print( "\n\n == DEBUG == \n\n")
        print( len(nodes) )
Beispiel #21
0
    def _parse_csvrows(self, csv, rows, **kwargs):

        # ( name, type indexed, projection )
        def _w(e):
            isproj = "%" in e
            w = re.findall("\((-?[0-9]?\.?[0-9]+)\)", e)
            if isproj and len(w):
                w = float(w[0])
            elif isproj:
                w = 1.
            else:
                w = None

            return w

        def _v(e):

            isproj = "%" in e
            w = "".join(re.findall("\[(.*)\]", e))
            if not isproj:
                return w if len(w) else None
            elif isproj:
                return None

        for row in csv:
            cell = row[0]
            # ! comment
            if cell and cell[:1] == "!":
                continue

            # IMPORT external ressource
            if cell and cell[:1] == "&":

                url = cell[1:].strip()
                # circular references
                if url not in self.imports:
                    self.log("  === Import === '%s'" % url)
                    rows = self._parse(url, rows, **kwargs)
                else:
                    raise BotapadParseError(
                        self.path, "Same file is imported multiple times  ! ",
                        row)

            # @ Nodetypes, _ Edgetypes
            elif cell and cell[:1] in ("@", "_"):

                self.post(self.current, rows)
                rows = []

                # processing directiv
                line = ";".join(row)
                cols = re.sub(' ', '', line[1:])  # no space
                # @Politic: %Chamber; #First Name; #Last Name;%Party;%State;%Stance;Statement;
                cols = [
                    e for e in re.split("[:;,]", "%s" % cols, flags=re.UNICODE)
                    if len(e)
                ]
                label = cols[0]  # @Something

                start = 1
                if cell[:1] == "_" and cell[1] == "" and cell[1] == "":
                    start = 3

                props = [
                    Prop(name=norm_key(e),
                         type=Text(multi="+" in e, default=_v(e)),
                         isref="@" in e,
                         isindex="#" in e,
                         ismulti="+" in e,
                         isproj="%" in e,
                         iscliq="+" in e and "=" in e,
                         isignored="!" in e,
                         direction="OUT"
                         if ">" in e else "IN" if "<" in e else "ALL",
                         weight=_w(e),
                         value=_v(e)) for e in cols[start:]
                ]

                def get_prop(name):
                    for e in props:
                        if e.name == name:
                            return e
                    return None

                start = 0
                end = None
                props = props[0:end]
                self.log("\n * @%s : Props " % label)
                self.log("  (%s)" % ",".join(Prop()._fields))
                for e in props:
                    self.log("  %s" % str([v for v in e]))

                names = [k.name for k in props]
                projs = [k.name for k in props if k.isproj]
                indexes = [k.name for k in props if k.isindex]

                typeprops = lambda px: {p.name: p.type for p in px}

                if cell[:1] == "@":  # nodetype def
                    # raise error if no label & index
                    pl = get_prop('label')

                    if len(indexes) == 0 and pl is None:
                        message = 'No `index` nor `label` set for @%s ' % (
                            label)
                        raise BotapadParseError(self.path, message, row)

                    if len(indexes) == 0:
                        indexes = ['label']

                    for prop in props:
                        if len(prop.name) == 0:
                            message = "Property error %s " % prop
                            raise BotapadParseError(
                                self.path, 'Parse error : %s ' % message, row)

                    if len(projs) > 0 and len(indexes) == 0:
                        message = "no `index` properties to create edge %s " % self.current
                        raise BotapadParseError(
                            self.path, 'Parse error :  %s\n  ' % (message),
                            row)

                    self.current = (VERTEX, label, props)

                    if not label in self.nodetypes:
                        self.log("\n  >> posting @ %s [%s] [%s] [%s]" %
                                 (label, ", ".join(names), ", ".join(indexes),
                                  ", ".join(projs)))
                        self.nodetypes[label] = self.bot.post_nodetype(
                            self.gid, label, label, typeprops(props))
                        self.node_headers[label] = props

                elif cell[:1] == "_":  # edgetype def
                    rows = []
                    self.current = (EDGE2, label, props)

                    if not label in self.edgetypes:

                        if "label" not in names:
                            props = [
                                Prop(name="label", type=Text(), value="")
                            ] + props
                        if "weight" not in names:
                            props = [
                                Prop(name="weight", type=Numeric(), value=1.)
                            ] + props
                        names = [k.name for k in props]
                        self.log("  >> posting _ %s [%s]" %
                                 (label, ", ".join(names)))

                        self.edgetypes[label] = self.bot.post_edgetype(
                            self.gid, label, "", typeprops(props))
                        self.edge_headers[label] = props

            else:  # table data
                if self.current and self.current[2]:
                    props = self.current[2]
                    if self.current[0] in (EDGE, EDGE2):

                        start = 1  # if self.current[0] == EDGE:
                        if self.current[0] == EDGE2:
                            start = 3

                        for i, v in enumerate(row[start:]):
                            if i >= len(props): break
                            if props[i].ismulti:
                                row[i + start] = list(
                                    set([
                                        e.strip() for e in re.split(
                                            "[,;]",
                                            v.strip(),
                                        ) if e.strip() != ""
                                    ]))

                    elif self.current[0] == VERTEX:
                        for i, v in enumerate(row):
                            if i >= len(props): break
                            if props[i].ismulti:
                                row[i] = [
                                    e.strip() for e in re.split(
                                        "[,;]",
                                        v.strip(),
                                    ) if e.strip() != ""
                                ]

                rows.append(row)

        return rows
Beispiel #22
0
    def apply_projectors(self, rows, label):
        """Turn the projection and clique columns of a node table into nodes and edges.

        For every property of node type ``label`` flagged ``isproj`` or
        ``iscliq``:

        1. the distinct cell values of the column are collected;
        2. a node type named after the property is posted if no header is
           known for it, and one node is posted for every value missing from
           ``self.idx`` (the value becomes the node ``label``);
        3. the edge type ``"<label>/<property>"`` is posted if unknown;
        4. ``isproj``: one edge per (row, value) is built between the row
           node, looked up in ``self.idx`` under its concatenated index
           cells, and the value node;
        5. ``iscliq``: one ``"<property>_clique"`` edge is built per pair of
           values sharing a row, each unordered pair only once;
        6. clique edges, then projection edges, are posted through
           ``self.bot.post_edges``.

        :param rows: data rows of the table; cells of multi-valued columns
            are iterated item by item, so they are presumably lists already
            - TODO confirm against the caller
        :param label: node type label, used as key of ``self.node_headers``
        """

        def _defaults(headers):
            # header properties carrying a (truthy) default value
            return dict((p.name, p.value) for p in headers if p.value)

        src = label  # @ Label
        props = self.node_headers[src]
        # positions of the columns whose cells identify the node of a row
        indexes = [e for e, k in enumerate(props) if k.isindex]

        for iprop, prop in enumerate(props):

            if not (prop.isproj or prop.iscliq):
                continue

            #  @ Label: %prop0 , ...
            tgt = prop.name

            # Distinct column values; rows too short to hold the column are
            # skipped, as in the edge loop below.
            values = set()
            for r in rows:
                if iprop >= len(r):
                    continue
                if prop.ismulti:
                    values.update(k.strip() for k in r[iprop])
                else:
                    values.add(r[iprop])
            values = list(values)

            self.log("\n * [Projector] : %s(%s) -- %s(%s) (%s) " %
                     (src, len(rows), tgt, len(values), prop.name))

            # the projected property gets a minimal node type of its own
            # unless a table already declared one under that name
            if tgt not in self.node_headers:
                nodeprops = {
                    "label": Text(),
                }
                self.node_headers[tgt] = [
                    Prop('label', Text(), False, False, False, False, False,
                         False, 1., None)
                ]
                self.nodetypes[tgt] = self.bot.post_nodetype(
                    self.gid, tgt, tgt, nodeprops)

            # one node per value that is not indexed yet
            node_defaults = _defaults(self.node_headers[tgt])
            payload = []
            for v in values:
                key = "%s" % (v)
                if key not in self.idx:
                    properties = dict(node_defaults)
                    properties['label'] = v

                    payload.append({
                        'nodetype': self.nodetypes[tgt]['uuid'],
                        'properties': properties
                    })

            if len(payload):
                self.log(" * [Projector] posting @ %s %s " %
                         (len(payload), tgt))
                for node, uuid in self.bot.post_nodes(self.gid, iter(payload)):
                    tgtid = '%s' % (node['properties']['label'])
                    self.idx[tgtid] = uuid
                    self.debug(node)

            etname = "%s/%s" % (src, tgt)
            edgeprops = {
                "label": Text(),
                'weight': Numeric(vtype=float, default=1.)
            }
            if etname not in self.edgetypes:
                self.log(" * [Projector] POST edgetype %s %s " %
                         (etname, edgeprops))
                self.edgetypes[etname] = self.bot.post_edgetype(
                    self.gid, etname, etname, edgeprops)

            # label -- property edge
            edges = []
            cliqset = set()
            cliqedges = []
            cliqname = ""

            for r in rows:
                if iprop >= len(r):
                    continue
                targets = r[iprop] if prop.ismulti else [r[iprop]]

                if prop.iscliq:
                    cliqname = "%s_clique" % (prop.name)
                    if cliqname not in self.edgetypes:
                        self.log(" * [Projector] POST edgetype %s %s " %
                                 (cliqname, edgeprops))
                        self.edgetypes[cliqname] = self.bot.post_edgetype(
                            self.gid, cliqname, cliqname, edgeprops)

                    for e, t in enumerate(targets):
                        for t2 in targets[e + 1:]:
                            # order-independent key: the same pair met in
                            # another order must map to the same entry
                            cliqe = (t, t2) if t > t2 else (t2, t)
                            if cliqe in cliqset:
                                continue

                            properties = {
                                "label": cliqname,
                                'weight': prop.weight
                            }
                            if cliqname in self.edge_headers:
                                # NOTE(review): declared defaults replace
                                # label/weight here, whereas projection edges
                                # below put label/weight on top of the
                                # defaults - confirm which one is intended.
                                properties = _defaults(
                                    self.edge_headers[cliqname])

                            cliqedges.append({
                                'edgetype':
                                self.edgetypes[cliqname]['uuid'],
                                'source':
                                self.idx['%s' % (t)],
                                'target':
                                self.idx['%s' % (t2)],
                                'properties':
                                properties
                            })
                            cliqset.add(cliqe)

                if prop.isproj:

                    for t in targets:
                        srcid = "".join([r[i] for i in indexes])
                        tgtid = '%s' % (t)

                        properties = dict()
                        if etname in self.edge_headers:
                            properties = _defaults(self.edge_headers[etname])

                        properties['label'] = etname
                        properties['weight'] = prop.weight

                        # edge direction: "IN" goes row node -> value node,
                        # "OUT" and "ALL" go value node -> row node
                        essrc = self.idx[srcid] if prop.direction in (
                            "IN", ) else self.idx[tgtid]
                        estgt = self.idx[srcid] if prop.direction in (
                            "OUT", "ALL") else self.idx[tgtid]

                        edges.append({
                            'edgetype':
                            self.edgetypes[etname]['uuid'],
                            'source':
                            essrc,
                            'target':
                            estgt,
                            'weight':
                            prop.weight,
                            'properties':
                            properties
                        })

            direction = prop.direction
            self.log(" * [Projector] posting _ = %s %s %s " %
                     (len(cliqedges), direction, cliqname))
            for e in self.bot.post_edges(self.gid,
                                         iter(cliqedges),
                                         extra=lambda x: etname):
                self.debug(e)

            self.log(" * [Projector] posting _ %% %s %s %s " %
                     (len(edges), direction, etname))
            for e in self.bot.post_edges(self.gid,
                                         iter(edges),
                                         extra=lambda x: etname):
                self.debug(e)
Beispiel #23
0
        "vertices_color": {'fort': (255,150,0),
                          'bon': (200,255,0),
                          'faible': (50,50,255),
                          'mauvais': (255,50,50)},
    },
}

# Build one engine, one view and one API blueprint per entry of graph_config.
for gname, config in graph_config.iteritems():

    # "path" and "vertices_color" are popped, i.e. removed from the config
    # entry as they are consumed.
    graph = igraph.read(config.pop("path"))
    # the colour table is stored as a graph-level attribute
    graph['vertices_color'] = config.pop("vertices_color")
    graphs.add(gname)
    engine = lexical_graph_engine(graph)

    # The view takes a Text input and exposes four named outputs; the query
    # is returned UTF-8 encoded.
    view = EngineView(engine)
    view.set_input_type(Text())
    view.add_output("query", lambda x : x.encode('utf8'))
    view.add_output("graph", export_graph)
    view.add_output("layout", export_layout)
    view.add_output("clusters", export_clustering)

    api = ReliureAPI(name=gname )
    api.register_view(view,  url_prefix="api"  )

    # each graph API is registered on the app under /graph/<gname>
    app.register_blueprint(api,  url_prefix="/graph/%s" % (gname) )




# === Routes ===
Beispiel #24
0
                            }
                        }
                    }
                }
            }
        }
    }  # /q

    res = index.search(body=q, size=len(ids))
    return res


# Field declarations for Tmuse documents: every keyword is a field name bound
# to its reliure type. `out_links` is multi-valued with unique items; `score`
# is a float defaulting to 0.
TmuseDocSchema = Schema(
    docnum=Numeric(),
    # stored fields
    graph=Text(),
    lang=Text(),
    pos=Text(),
    pzero=Boolean(),
    form=Text(),
    neighbors=Numeric(),
    out_links=Numeric(multi=True, uniq=True),
    # computed fields
    rank=Numeric(),
    score=Numeric(vtype=float, default=0.))


def to_docs(es_res, pzeros):
    _pzeros = set(pzeros) or set([])
    docs = []
    if 'hits' in es_res and 'hits' in es_res['hits']:
Beispiel #25
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--host",
                        action='store',
                        help="host",
                        default="http://localhost:5000")
    parser.add_argument("--key",
                        action='store',
                        help="authentification token",
                        default=None)
    parser.add_argument("--gid", action='store', help="graph id", default=None)

    args = parser.parse_args()
    host, key, gid = (args.host, args.key, args.gid)

    if None in (host, key, gid):
        parser.print_help()
        return

    # Setup schema

    from reliure.schema import Doc, Schema
    from reliure.types import Text, Numeric, Boolean, GenericType

    desc = """
        Game of thrones 
        %s
        """.replace("    ", "")

    g_attrs = {
        'description': desc % gid,

        #'image': "https://commons.wikimedia.org/wiki/File:Game_of_Thrones_2011_logo.svg?uselang=fr",
        #'tags': ['social-network', 'game-of-thrones']
    }

    # used for houses, sauvageons ...
    group_type = Schema(
        **{
            'label': Text(),
            'url': Text(),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'color': Text(),
            'shape': Text(default=u"square"),
            "name": Text(),
        })

    # human like characters
    character_type = Schema(
        **{
            'label': Text(),
            'url': Text(multi=True, uniq=True),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'shape': Text(default=u"circle"),
            'description': Text(),
            "name": Text(),
            "actor": Text(multi=True, uniq=True),
            "dubbling_vf": Text(multi=True, uniq=True),
            "bio_fr": Text(),
            "bio_en": Text(),
            "seasons": Text(),
            "dead": Boolean(default=False),
        })

    # creaturs dragons, wolf, white walkers ?
    creatur_type = Schema(
        **{
            'label': Text(),
            'url': Text(),
            'tags': Text(multi=True, uniq=True),
            'image': Text(),
            'shape': Text(default=u"triangle"),
            'description': Text(),
            "name": Text(),
            "bio_fr": Text(),
            "bio_en": Text(),
            "seasons": Text(),
            "dead": Boolean(default=False),
        })

    edgetypes = [
        # Characters or Creaturs -- rel --> Group
        # (name, desc , properties ),
        ("is_member_of", "Character is member of a Group", {
            "from_ep": Text(),
        }),
        ("is_child_of", "character or creatur is child of another one", {}),
        ("works_for", "character or creatur works for a character or a group",
         {
             "from_episode": Text(),
             "to_episode": Text()
         }),
        ("is_friend_of", "character is friend of another one", {
            "from_ep": Text(),
        }),
        ("married", "character meet another one", {
            "force": Numeric()
        }),
        ("belongs_to", "character or creatur belongs to another one", {
            "from_episode": Text(),
            "to_episode": Text()
        }),
        ("kill", "character or creatur kill another one", {
            "episode": Text(),
            "method": Text()
        }),
        #("have_sex", "character or creatur have sex another one", { "episode":Text()} ),
        #("rape", "character or creatur rape another one", { "episode":Text()} ),
        #("meet", "character meet another one", { "episode":Text()}),
        #("loves", "character meet another one", {} ),
    ]

    # PARSING WK page

    from pyquery import PyQuery as pq
    import codecs

    root = "."
    path = "%s/pages/Personnages_de_Game_of_Thrones" % root
    graphmlz = "%s/got.graphml" % root

    def opengot():
        html = codecs.open(path, mode='r', encoding="utf8").read()
        html = pq(html)
        html = html(".mw-content-ltr")
        html(".mw-content-ltr h2:first").prevAll().remove()
        html(".mw-content-ltr h2:first").remove()
        html(".mw-content-ltr h2:first").nextAll().remove()

        html('.mw-editsection').remove()
        html('sup').remove()
        html = pq(".mw-content-ltr", html)
        return html

    def as_doc(ctype, cdata):
        d = Doc(ctype)

        for k, v in cdata.iteritems():
            if type(ctype[k]) == Text:
                d.set_field(k, v, True)
            else:
                d[k] = v
        return d.export()

    def _parse_color(e):
        color = None
        if "style" in e.attrib:
            styles = dict(
                pair.strip().split(':')
                for pair in pq(e).attr("style").strip().lower().split(';')
                if len(pair))
            color = styles.get("background", None)

        if color and color in ("black", "#000"): color = "#000000"

        return color

    def parse_belongs_legend(html):
        houses_map = {}
        legende = pq("li", pq("table td ul", html)[:4])
        for e in legende:
            color = _parse_color(pq("span", e)[0])
            text = pq(e).text()  #.replace("Maison ", "")
            houses_map[color] = text

        # removes legendes
        html(".mw-content-ltr h3:first").prevAll().remove()
        return houses_map

    def parse_creaturs_and_characters(html, houses):

        rel_belongs = []
        rel_member_of = []
        characters = []
        creaturs = []

        while True:
            # reading from end
            if len(html("h3:last")):

                ths = pq('th', html("h3:last").nextAll())
                tds = pq('td', html("h3:last").nextAll())

                title = html("h3:last").text()
                color = None
                flg = 0

                if len(ths) % 5 == 0:
                    c = {}
                    member_of = []
                    for td in tds:
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6":  # table headers
                            color = _parse_color(td)
                            if color:
                                member_of.append(houses[color])
                            flg = 1

                        elif colspan == 0:  # table cells
                            if flg == 1:
                                actor_img = pq("img", td).attr("src")
                                if actor_img:
                                    c['image'] = "http:%s" % actor_img
                            elif flg == 2:
                                name = pq(td).text()
                                c['name'] = name
                                for e in member_of:
                                    rel_member_of.append((name, e))
                            elif flg == 3:
                                c['actor'] = [
                                    pq(e).text() for e in pq("a", td)
                                ]

                            elif flg == 4:
                                c['dubbling_vf'] = [
                                    pq(e).text() for e in pq("a", td)
                                ]
                            elif flg == 5:
                                c['seasons'] = pq(td).text()
                                c['dead'] = u"✝" in pq(td).text()
                            flg += 1

                        elif colspan == "5":  # table bio cell
                            c['bio_fr'] = pq(td).text()

                            characters.append(as_doc(character_type, c))
                            # reset
                            c = {}
                            member_of = []
                            flg = 1

                if len(ths) == 2:
                    c = {}
                    belongs = []
                    for td in tds:
                        colspan = td.attrib.get('colspan', 0)
                        if colspan == "6":
                            color = _parse_color(td)
                            if color:
                                belongs.append(houses[color])
                            flg = 1

                        elif colspan == 0:
                            if flg == 1:
                                name = pq(td).text().strip()
                                c['name'] = name
                                for e in belongs:
                                    rel_belongs.append((name, e))
                                flg = 2
                            if flg == 2:
                                c["seasons"] = pq(td).text()
                                c["dead"] = True  # u"✝" in pq(td).text()

                        elif colspan == "5":
                            c['bio_fr'] = pq(td).text()
                            creaturs.append(as_doc(creatur_type, c))
                            c = {}
                            belongs = []
                            flg = 0

                #removes section once parsed
                html("h3:last").nextAll().remove()
                html("h3:last").remove()

            else:
                break

        return characters, rel_member_of, creaturs, rel_belongs

    # In[ ]:

    from reliure.schema import Doc

    locations = []  # TODO

    html = opengot()
    houses_map = parse_belongs_legend(html)
    characters, rel_member_of, creaturs, rel_belongs = parse_creaturs_and_characters(
        html, houses_map)

    print "Groups   ", len(houses_map)
    print "Creaturs   ", len(creaturs)
    print "Characters ", len(characters)

    print "member_of", len(rel_member_of)
    print "belongs", len(rel_belongs)

    from botapi import Botagraph, BotApiError

    bot = Botagraph(host, key)

    if not bot.has_graph(gid):

        print "\n * Creating graph %s" % gid
        bot.create_graph(gid, g_attrs)

        print "\n * Creating node type %s" % ""
        bot.post_nodetype(gid, "Character", "Character",
                          character_type._fields)
        bot.post_nodetype(gid, "Creatur", "Creatur", creatur_type._fields)
        bot.post_nodetype(gid, "Group", "Group", group_type._fields)

        for name, desc, props in edgetypes:
            bot.post_edgetype(gid, name, desc, props)

    schema = bot.get_schema(gid)['schema']
    nodetypes = {n['name']: n for n in schema['nodetypes']}
    edgetypes = {e['name']: e for e in schema['edgetypes']}

    idx = {}  # (label, uuid)
    groups = []

    for k, v in houses_map.iteritems():
        g = as_doc(group_type, {'label': v, 'name': v, 'color': k})
        groups.append(g)

    for name, els in [("Character", characters), ("Creatur", creaturs),
                      ("Group", groups)]:

        print "Posting %s nodes %s" % (len(els), name)
        for c in els:
            payload = {
                'nodetype': nodetypes[name]['uuid'],
                'properties': {k: v
                               for k, v in c.iteritems()}
            }
            payload['properties']['label'] = payload['properties']['name']
            node = bot.post_node(gid, payload)
            idx[node['label']] = node['uuid']

    vids = set()
    for name, rels in [("is_member_of", rel_member_of),
                       ("belongs_to", rel_belongs)]:

        print "Posting %s rels %s" % (len(rels), name)
        for src, tgt in rels:
            if src in idx and tgt in idx:
                edge = {
                    'edgetype': edgetypes[name]['uuid'],
                    'source': idx[src],
                    'label': name,
                    'target': idx[tgt],
                    'properties': {
                        "from_ep": "",
                    }
                }
                uuid = bot.post_edge(gid, edge)
                vids.add(src)
                vids.add(tgt)
            else:
                print src, tgt

    print "Starring %s nodes" % len(list(vids))
    bot.star_nodes(gid, list(vids))
Beispiel #26
0
    def __init__(self,
                 global_graph,
                 prox_func,
                 default_mode=OUT,
                 weight=None,
                 loops_weight=None,
                 name=None):
        """Register the extraction options and keep the graph, the prox
        function and the weighting configuration on the instance.

        :param global_graph: a subclass of :class:`.AbstractGraph`
        :param prox_func: curried proximity function. Only `graph`, `pzero`
            and `length` are passed to it as arguments; to change any other
            named argument, pass a lambda that already has all named
            arguments set.
        :param default_mode: default mode for the random walk (useful only if
            the graph is directed); it must be one of `IN`, `OUT`, `ALL`
            since it is looked up in the mode table below
        :param weight: if None the graph is not weighted, else it may be:
            a str naming an edge attribute to use as weight,
            or a list of weights (`|weight| == graph.ecount()`),
            or a callable `lambda graph, source, target: wgt`
        :param loops_weight: only if `add_loops`, weight for added loops; it
            may be:
            a str naming a vertex attribute,
            or a list of weights (`|loops_weight| == graph.vcount()`),
            or a callable `lambda graph, vid, mode, weight: wgt`
        :param name: forwarded to the parent constructor


        Here is an example of a usable prox function:

        >>> def prox_func(graph, pzero, length):
        ...     return prox.prox_markov_dict(graph, pzero, length, mode=OUT,
        ...         add_loops=False, weight=None)
        """
        super(ProxExtractGlobal, self).__init__(name=name)

        # Collaborators and weighting configuration.
        self.global_graph = global_graph
        self.prox_func = prox_func
        self._wgt = weight
        self._loops_weight = loops_weight

        # Options are registered in a fixed order: vcount, length, add_loops,
        # mode and, only for weighted graphs, is_wgt.
        vcount_option = Numeric(default=10, help="max vertex count")
        self.add_option("vcount", vcount_option)

        length_option = Numeric(default=3, help="random walk length")
        self.add_option("length", length_option)

        loops_option = Boolean(default=True,
                               help="virtualy add loops on each vertex")
        self.add_option("add_loops", loops_option)

        # Two-way table between the walk-mode constants and the labels used
        # as values of the "mode" option.
        label_to_mode = {"IN": IN, "OUT": OUT, "ALL": ALL}
        mode_to_label = {IN: u"IN", OUT: u"OUT", ALL: u"ALL"}
        self._modes = {
            "text_to_num": label_to_mode,
            "num_to_text": mode_to_label,
        }

        mode_option = Text(default=mode_to_label[default_mode],
                           choices=[u"IN", u"OUT", u"ALL"],
                           help="edges to walk on from a vertex")
        self.add_option("mode", mode_option)

        # The weight toggle only exists when a weight was supplied.
        if weight is not None:
            weight_option = Boolean(default=True,
                                    help="consider graph weight?")
            self.add_option("is_wgt", weight_option)
from botapi import Botagraph, BotApiError
from reliure.types import Text

# Lightweight records describing a node type and an edge type: each one has a
# name, a description and a dict of properties.
# NOTE(review): `namedtuple` is not imported in the visible lines of this
# script — confirm `from collections import namedtuple` exists above.
NodeType = namedtuple("NodeType", "name description properties")
EdgeType = namedtuple("EdgeType", "name description properties")

# Graph definition: server, credentials and metadata of the graph to create.

PDG_HOST = "http://g0v-tw.padagraph.io"
# Left empty here; presumably the API key has to be filled in before running.
PDG_KEY = ""
GRAPHNAME = "G0V Hackpads network"
# NOTE(review): "or" looks like a typo for "of"; left untouched because the
# string is sent to the server as the graph description.
DESCRIPTION = "a graph or inter-linked Hackpads"
TAGS = ["pads", "g0v-tw"]

# Single node type "pad" with three text properties: id, label and url.
NodePad = NodeType("pad", "", {"id": Text(), "label": Text(), "url": Text()})

# Single edge type "link to", without any property.
EdgeLink = EdgeType("link to", "", {})

# Connect to the server and create the graph with its description, tags and
# image.
bot = Botagraph(PDG_HOST, PDG_KEY)
bot.create_graph(
    GRAPHNAME, {
        'description': DESCRIPTION,
        "tags": TAGS,
        "image": "https://avatars3.githubusercontent.com/u/2668086?v=3&s=200"
    })

# Posting node and edge types.
# Both dicts start empty; presumably they are filled with the uuids of the
# posted types by code that is not visible here — TODO confirm.

nodetypes_uuids = {}
edgetypes_uuids = {}
Beispiel #28
0
 def __init__(self, **kwargs):
     """Declare the `type` and `title` text attributes, add a type
     validator and initialise validation.

     NOTE(review): `kwargs` is accepted but never used in this body, and
     the validator checks for `datetime.datetime`, which looks surprising
     for a Url type — confirm both are intended.
     """
     url_attrs = {'type': Text(), 'title': Text()}
     super(Url, self).__init__(attrs=url_attrs)

     validators = self.validators
     validators.append(TypeValidator(datetime.datetime))
     self._init_validation()
Beispiel #29
0
    def parse(self, path):
        """Read the rows found at `path` and store each table they describe.

        :param path: location handed to ``self.read``; the same value is
            passed on to ``self.store`` together with each table

        The first character of the first cell decides how a row is handled:

        * ``!`` comment: the row is ignored
        * ``&`` import: the rest of the cell is parsed recursively as another
          resource
        * ``@`` node type header: label followed by property declarations
        * ``_`` edge type header: label followed by property declarations
        * anything else: a data row appended to the current table

        Rows without any cell are skipped. In a header, spaces are removed and
        the declarations are split on ``:``, ``;`` and ``,``. The markers
        ``@``, ``#``, ``+``, ``%`` and ``=`` found in a declaration become
        boolean flags of the resulting ``Prop``; ``+`` also makes the property
        type multi-valued.

        A table is stored when the next header is met, and ``self.store`` is
        always called once more at the end with whatever is current.
        """
        csv = self.read(path)

        rows = []
        current = ()  # (VERTEX | EDGE, label, props) of the table being read

        for row in csv:
            # An empty row (e.g. from a blank line) has no first cell to
            # dispatch on; indexing it would raise IndexError.
            if not row:
                continue

            cell = row[0]
            marker = cell[:1]

            # ! comment
            if marker == "!":
                continue

            # & import of an external resource
            if marker == "&":
                self.parse(cell[1:].strip())

            # @ node types, _ edge types
            elif marker in ("@", "_"):
                # a new header closes the table read so far
                if current:
                    self.store(current, rows, path)

                # e.g. @Politic: %Chamber; #First Name; #Last Name;%Party;
                line = ";".join(row)
                declaration = line[1:].replace(' ', '')  # no space
                cols = [
                    e for e in re.split("[:;,]", declaration, flags=re.UNICODE)
                    if e
                ]
                label = cols[0]  # text right after the @ or _ marker

                props = [
                    Prop(norm_key(e), Text(multi="+" in e), "@" in e, "#" in e,
                         "+" in e, "%" in e, "+" in e and "=" in e)
                    for e in cols[1:]
                ]

                kind = VERTEX if marker == "@" else EDGE
                rows = []
                current = (kind, label, props)

            else:  # table data
                if current and current[2]:
                    # cells beyond the declared properties are left untouched
                    for i, (prop, value) in enumerate(zip(current[2], row)):
                        if prop.ismulti:
                            row[i] = [
                                e.strip()
                                for e in re.split("[_,;]", value.strip())
                            ]

                rows.append(row)

        self.store(current, rows, path)
Beispiel #30
0
def TmuseApi(name,
             host='localhost:9200',
             index_name='tmuse',
             doc_type='graph',
             retry=5):
    """Build the API over the tmuse Elasticsearch index.

    :param name: name given to the :class:`ReliureAPI`
    :param host: Elasticsearch host, passed to :class:`EsIndex`
    :param index_name: index name, passed to :class:`EsIndex`
    :param doc_type: document type, passed to :class:`EsIndex`
    :param retry: maximum number of reachability checks made while waiting
        for Elasticsearch; the n-th failed check sleeps n seconds, starting
        at 0
    :returns: the API with the `subgraph` and `complete` views and the
        `/random`, `/_extract` and `/_prox` routes registered
    :raises AssertionError: if Elasticsearch still does not answer after the
        retries
    """
    import random

    esindex = EsIndex(index_name, doc_type=doc_type, host=host)
    print("# TmuseApi %s %s %s" % (host, doc_type, index_name))

    # let es start: stop waiting as soon as it answers
    for attempt in range(retry):
        if esindex._es.ping():
            break
        print("waiting for es to start")
        time.sleep(attempt)
    # Explicit raise instead of `assert` so the check survives `python -O`;
    # the exception type is kept for callers.
    if not esindex._es.ping():
        raise AssertionError("impossible to reach ES server")

    # build the API from this engine
    print("api name %s" % (name,))
    api = ReliureAPI(name)

    # Main api entry point: tmuse engine (subgraph)
    view = EngineView(engine(esindex))
    view.set_input_type(ComplexQuery())
    view.add_output("query", ComplexQuery())
    view.add_output("graph", export_graph)
    view.add_output("layout", export_layout)
    view.add_output("clusters", export_clustering)
    # add a simple play route
    view.play_route("<query>")
    api.register_view(view, url_prefix="subgraph")

    # Add auto completion View
    completion = TmuseEsComplete(index=esindex, size=20)
    # TODO suggestion rerank
    # completion |= rerank
    completion_view = ComponentView(completion)
    completion_view.add_input("lang", Text(default=u"*"))
    completion_view.add_input("pos", Text(default=u"*"))
    completion_view.add_input("form")
    completion_view.add_output("response")
    completion_view.play_route("<lang>.<pos>.<form>")
    api.register_view(completion_view, url_prefix="complete")

    @api.route("/random")
    @api.route("/random/<string:pos>")
    def random_node(pos=None, retry=5, count=0):
        # `retry` and `count` are not used; they are kept so the handler's
        # signature does not change.
        # An unknown or missing pos falls back to a random one from ALL_POS.
        if pos not in ALL_POS:
            pos = random.sample(ALL_POS, 1)[0]

        graph = "jdm.%s.flat" % pos
        docs = tmuse.random_node(esindex, graph)

        # first document if any, else an empty mapping
        doc = docs[0] if len(docs) else {}

        return jsonify({'pos': pos, 'doc': doc})

    # Debug views
    @api.route("/_extract/<string:graph>/<string:text>")
    def _extract(graph, text):
        query = QueryUnit(graph=graph, form=text)
        es_res = tmuse.extract(esindex, query)
        return jsonify({'res': es_res})

    @api.route("/_prox/<string:graph>/<string:text>")
    def _prox(graph, text):
        es_res = proxlist(esindex, graph, text, 100)
        return jsonify({'res': es_res})

    return api