Ejemplo n.º 1
0
def test_mosdef_only(testresourcepath, expected_modout1):
    '''
    Run a fingerprint-only pipeline over INPUT_GRAPH_1 and check the output.

    The MusicAlbum rule is guarded by an
    if_(contains(follow(byArtist), DOC_NS('md'))) condition; the Person rule
    copies unconditionally. No transform or labelize rules are supplied.
    '''
    # The input only needs to be parsed once, into a single fresh model
    modin = newmodel()
    literate.parse(INPUT_GRAPH_1, modin)

    FINGERPRINT_RULES = {
        SCH_NS('MusicAlbum'):
        (if_(contains(follow(SCH_NS('byArtist')), DOC_NS('md')),
             materialize(COPY()))),
        SCH_NS('Person'): (materialize(COPY())),
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, {}, {})

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('=' * 10, 'test_mosdef_only', '=' * 10)
    literate.write(modout)
    # import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 17
    assert len(
        list(util.all_origins(modout, only_types={SCH_NS('MusicAlbum')}))) == 1
    assert len(list(util.all_origins(modout,
                                     only_types={SCH_NS('Person')}))) == 3
Ejemplo n.º 2
0
def write(model, out=sys.stdout):
    '''
    Write the links of a Versa model as a `graph TD` edge list, one
    `origin -->|relationship| target` line per link (the arrow syntax used
    by Mermaid flowcharts).

    model - input Versa model from which output is generated
    out - text stream receiving the output (defaults to sys.stdout)
    '''
    # Independent tag registries for resources, relationships and plain values
    res_tags, prop_tags, val_tags = {}, {}, {}

    out.write('graph TD\n')

    for origin in all_origins(model):
        origin_label = next(labels(model, origin), None)
        origin_tag = lookup_tag(origin, res_tags, origin_label)
        for _, rel, target, _attrs in model.match(origin):
            rel_tag = lookup_tag(rel, prop_tags, None, is_node=False)
            if not isinstance(target, I):
                # Plain (non-IRI) value
                target_tag = lookup_tag(target, val_tags, None)
            else:
                # IRI targets share the resource registry with origins
                target_label = next(labels(model, target), None)
                target_tag = lookup_tag(target, res_tags, target_label)

            out.write(f'    {origin_tag} -->|{rel_tag}| {target_tag}\n')

        # Blank line after each origin's group of edges
        out.write('\n')
Ejemplo n.º 3
0
 def labelize_helper(self, rules, label_rel=VLABEL_REL, origins=None,
                         handle_misses=None, root_context=DUMMY_CONTEXT):
     '''
     Implements a common label making strategy where output
     resources are put through pattern/action according to type in order
     to determine the output label

     rules - mapping from resource type to an action; the action is called
         with a context and returns an iterable of candidate labels
     label_rel - relationship used when adding each label to the output model
     origins - not used by this implementation
     handle_misses - optional callable invoked as handle_misses(rid, typ)
         whenever a rule yields an empty or whitespace-only label
     root_context - context copied to build the per-resource context

     Returns a dict mapping each labeled resource ID to the last label
     added for it
     '''
     new_labels = {}
     # Anything with a Versa type is an output resource
     # FIXME weid, redundant logic
     for out_rid in util.all_origins(self.output_model, of_types='*'):
         for typ in util.resourcetypes(self.output_model, out_rid):
             if typ not in rules:
                 continue
             rule = rules[typ]
             link = (out_rid, VTYPE_REL, typ, {})
             # Notice that it reads from the output model and also updates same
             ctx = root_context.copy(current_link=link, input_model=self.output_model,
                                     output_model=self.output_model)
             out_labels = rule(ctx)
             if not out_labels:
                 continue
             for label in out_labels:
                 if not label or not str(label).strip():
                     if handle_misses:
                         handle_misses(out_rid, typ)
                     # A miss must not be recorded as a label; without this
                     # a None label would be stored as the string 'None'
                     # and a blank one as ''
                     continue
                 # Stripped because labels are for human reading so conventional not to differentiate by whitespace
                 # FIXME: fully normalize
                 label = str(label).strip()
                 new_labels[out_rid] = label
                 self.output_model.add(out_rid, label_rel, label)
     return new_labels
Ejemplo n.º 4
0
def test_basics_1(testresourcepath, expected_modout1):
    '''
    Pipeline test: fingerprint a schema.org Book as a BF Instance keyed on
    ISBN, then transform its name and author links.
    '''
    modin = newmodel()
    modin_fpath = 'schemaorg/catcherintherye-ugly.md'
    # Context manager so the input file is closed once it has been read
    with open(os.path.join(testresourcepath, modin_fpath)) as fp:
        literate.parse(fp.read(), modin)

    FINGERPRINT_RULES = {
        SCH_NS('Book'): (
            materialize(BF_NS('Instance'),
                fprint=[
                    (BF_NS('isbn'), follow(SCH_NS('isbn'))),
                ],
            )
        )
    }

    TRANSFORM_RULES = {
        SCH_NS('name'): link(rel=BF_NS('name')),

        SCH_NS('author'): materialize(BF_NS('Person'),
                                    BF_NS('creator'),
                                    vars={
                                        'birthDate': follow(SCH_NS('authorBirthDate'),
                                            origin=var('input-resource'))
                                    },
                                    fprint=[
                                        (BF_NS('name'), target()),
                                        (BF_NS('birthDate'), var('birthDate')),
                                    ],
                                    links=[
                                        (BF_NS('name'), target()),
                                        (BF_NS('birthDate'), var('birthDate')),
                                    ]
        ),
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('='*10, 'test_basics_1', '='*10)
    literate.write(modout)

    assert len(modout) == 8
    assert len(list(util.all_origins(modout, only_types={BF_NS('Instance')}))) == 1
    assert len(list(util.all_origins(modout, only_types={BF_NS('Person')}))) == 1
    assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
Ejemplo n.º 5
0
def write(model, out=sys.stdout, base=None, schema=None, shorteners=None):
    '''
    Write a Versa model as Markdown-like text: an optional `# @docheader`
    section, then one `# origin [type]` heading per origin followed by
    `* relationship: value` bullets, with link attributes as nested bullets.

    model - input Versa model from which output is generated
    out - text stream receiving the output (defaults to sys.stdout)
    base - optional value emitted as the `@base` header entry
    schema - optional value emitted as the `@schema` header entry; also
        tried first when abbreviating relationship and type IRIs
    shorteners - currently only affects whether the doc header is emitted
    '''
    shorteners = shorteners or {}

    all_schema = [schema] if schema else []
    all_schema.append(VERSA_BASEIRI)

    if any((base, schema, shorteners)):
        out.write('# @docheader\n\n* @iri:\n')
    # Collect the header entries and join them so that @base and @schema
    # land on separate lines when both are given
    iri_entries = []
    if base:
        iri_entries.append('    * @base: {0}'.format(base))
    if schema:
        iri_entries.append('    * @schema: {0}'.format(schema))
    if iri_entries:
        out.write('\n'.join(iri_entries))
    #for k, v in shorteners:
    #    out.write('    * @base: {0}'.format(base))

    out.write('\n\n')

    origin_space = set(all_origins(model))

    for o in origin_space:
        # First type found
        # XXX: Maybe there could be a standard primary-type attribute
        # to flag the property with the type to highlight
        first_type = next(resourcetypes(model, o), None)
        if first_type:
            first_type_str = abbreviate(first_type, all_schema)
            out.write(f'# {o} [{first_type_str}]\n\n')
        else:
            out.write(f'# {o}\n\n')
        for o_, r, t, a in model.match(o):
            # The highlighted type is already shown in the heading
            if (r, t) == (VTYPE_REL, first_type): continue
            rendered_r = abbreviate(r, all_schema)
            if isinstance(rendered_r, I):
                rendered_r = f'<{rendered_r}>'
            out.write(f'* {rendered_r}: {value_format(t)}\n')
            for k, v in a.items():
                rendered_k = abbreviate(k, all_schema)
                if isinstance(rendered_k, I):
                    # Bracket the attribute key itself, not the relationship
                    rendered_k = f'<{rendered_k}>'
                # Attribute lines carry the attribute's own value
                out.write(f'    * {rendered_k}: {value_format(v)}\n')

        out.write('\n')
    return
Ejemplo n.º 6
0
def IGNORE():
    '''
    Holder for a disabled CSV row-export loop. The body sits behind
    `if False:`, so calling this does nothing and returns None.
    '''
    if False:
        for resource_id in all_origins(model):
            type_values = list(lookup(model, resource_id, RDF_TYPE_REL))
            # Resources without a type are skipped
            if not type_values:
                continue
            record = [resource_id, fromlist(type_values)]
            record.extend([None] * numprops)
            for idx, prop in enumerate(properties):
                prop_values = list(lookup(model, resource_id, prop))
                if prop_values:
                    # First two columns hold the ID and the type
                    record[idx + 2] = fromlist(prop_values)
                    # NOTE(review): the row is written once per populated
                    # property rather than once per resource - confirm
                    # intent before re-enabling
                    csvout.writerow(record)
Ejemplo n.º 7
0
Archivo: csv.py Proyecto: uogbuji/versa
def IGNORE():
    '''
    Parked CSV export code. Everything is guarded by `if False:`, so this
    function has no effect and returns None.
    '''
    if False:
        for origin in all_origins(m):
            origin_types = list(lookup(m, origin, RDF_TYPE_REL))
            if origin_types:
                # Columns: resource ID, type, then one slot per property
                cells = [origin, fromlist(origin_types)] + [None] * numprops
                for col, prop in enumerate(properties):
                    found = list(lookup(m, origin, prop))
                    if not found:
                        continue
                    cells[col + 2] = fromlist(found)
                    # NOTE(review): emits the row each time a property has
                    # a value - confirm intent before re-enabling
                    csvout.writerow(cells)
            # Untyped resources are ignored
    return None
Ejemplo n.º 8
0
    def fingerprint_helper(self, rules, root_context=DUMMY_CONTEXT):
        '''
        Common fingerprinting strategy: walk every origin in the input model
        and, for each of its types that has an entry in rules, run the
        associated action(s) to determine the new resource ID & type.

        rules - mapping from resource type to an action, or to a list/tuple
            of actions
        root_context - context copied to build the context for each action

        Records the results per input resource in self.fingerprints as a
        (mains, others) pair of sets and returns the set of IDs reported
        through the new-entity hook
        '''
        # All output resources, whether or not from a direct fingerprint of an input resource
        new_rids = set()

        # Materialized up front, before any action runs
        for rid in list(util.all_origins(self.input_model)):
            for typ in util.resourcetypes(self.input_model, rid):
                if typ not in rules:
                    continue
                actions = rules[typ]
                # A lone action is handled as a one-item sequence
                if not isinstance(actions, (list, tuple)):
                    actions = (actions,)
                for rule in actions:
                    out_rids = set()

                    def new_entity(eid):
                        '''
                        Called on Versa pipeline materialization of new entity
                        Ensures we capture additional entities created by
                        pipeline actions during this fingerprint phase
                        '''
                        out_rids.add(eid)

                    # None relationship here acts as a signal to actions
                    # such as materialize to not try to attach the newly created
                    # resource anywhere in the output, since this is just the
                    # fingerprinting stage
                    link = (rid, None, typ, {})
                    ctx = root_context.copy(
                        current_link=link,
                        input_model=self.input_model,
                        output_model=self.output_model,
                    )
                    existing_hooks = ctx.extras.setdefault('@new-entity-hook', [])
                    ctx.extras['@new-entity-hook'] = make_list(existing_hooks, new_entity)

                    main_ridouts = rule(ctx)
                    # A list result becomes a set; anything else is a single ID
                    if isinstance(main_ridouts, list):
                        main_ridouts = set(main_ridouts)
                    else:
                        main_ridouts = {main_ridouts}

                    mains, others = self.fingerprints.setdefault(rid, (set(), set()))
                    mains.update(main_ridouts)
                    others.update(out_rids)
                    # In place, so the set stored in self.fingerprints is the one updated
                    others.difference_update(mains)
                    new_rids.update(out_rids)
        return new_rids
Ejemplo n.º 9
0
def write(model,
          out=sys.stdout,
          base=None,
          propertybase=None,
          shorteners=None):
    '''
    Write a Versa model as Markdown-like text: an optional `# @docheader`
    section, then one `# origin` heading per origin followed by
    `* relationship: value` bullets, with link attributes as nested bullets.

    model - input Versa model from which output is generated
    out - text stream receiving the output (defaults to sys.stdout)
    base - optional value emitted as the `@base` header entry
    propertybase - optional base tried first when abbreviating relationship
        and attribute IRIs
    shorteners - currently only affects whether the doc header is emitted
    '''
    shorteners = shorteners or {}

    all_propertybase = [propertybase] if propertybase else []
    all_propertybase.append(VERSA_BASEIRI)

    if any((base, propertybase, shorteners)):
        out.write('# @docheader\n\n* @iri:\n')
    if base:
        out.write('    * @base: {0}'.format(base))
    #for k, v in shorteners:
    #    out.write('    * @base: {0}'.format(base))

    out.write('\n\n')

    origin_space = set(all_origins(model))

    for o in origin_space:
        out.write('# {0}\n\n'.format(o))
        for o_, r, t, a in model.match(o):
            rendered_r = abbreviate(r, all_propertybase)
            if isinstance(rendered_r, I):
                rendered_r = f'<{rendered_r}>'
            out.write(f'* {rendered_r}: {value_format(t)}\n')
            for k, v in a.items():
                rendered_k = abbreviate(k, all_propertybase)
                if isinstance(rendered_k, I):
                    # Bracket the attribute key itself, not the relationship
                    rendered_k = f'<{rendered_k}>'
                # Attribute lines carry the attribute's own value
                out.write(f'    * {rendered_k}: {value_format(v)}\n')

        out.write('\n')
    return
Ejemplo n.º 10
0
Archivo: md.py Proyecto: uogbuji/versa
def write(models, out=None, base=None, propertybase=None, shorteners=None, logger=logging):
    '''
    Write one or more Versa models as Markdown-like text: an optional
    `# @docheader` section, then one `# origin` heading per origin followed
    by `* relationship: value` bullets, with link attributes as nested bullets.

    models - input Versa models from which output is generated. Must be a sequence
                object, not an iterator. A non-list value is wrapped as a
                single model
    out - text stream receiving the output; required
    base - optional value emitted as the `@base` header entry
    propertybase - optional base tried first when abbreviating relationship
        and attribute IRIs
    shorteners - currently only affects whether the doc header is emitted
    logger - not used by this implementation
    '''
    assert out is not None #Output stream required
    if not isinstance(models, list): models = [models]
    shorteners = shorteners or {}

    all_propertybase = [propertybase] if propertybase else []
    all_propertybase.append(VERSA_BASEIRI)

    if any((base, propertybase, shorteners)):
        out.write('# @docheader\n\n* @iri:\n')
    if base:
        out.write('    * @base: {0}'.format(base))
    #for k, v in shorteners:
    #    out.write('    * @base: {0}'.format(base))

    out.write('\n\n')

    origin_space = set()
    #base_out = models[0].base
    for m in models:
        origin_space.update(all_origins(m))

    for o in origin_space:
        out.write('# {0}\n\n'.format(o))
        # Query every model for this origin; relying on the variable left
        # over from the loop above would consult only the last model
        for m in models:
            for o_, r, t, a in m.match(o):
                abbr_r = abbreviate(r, all_propertybase)
                out.write('* {0}: {1}\n'.format(abbr_r, value_format(t)))
                for k, v in a.items():
                    abbr_k = abbreviate(k, all_propertybase)
                    # Emit the abbreviated key, consistent with relationships
                    out.write('    * {0}: {1}\n'.format(abbr_k, value_format(v)))

        out.write('\n')
    return
Ejemplo n.º 11
0
def test_basics_2(testresourcepath):
    '''
    Pipeline test: fingerprint a schema.org Book as a BF Instance with a
    nested BF Work, using vars for the instance type and language, then
    apply transform rules keyed on output resource type.
    '''
    modin = newmodel()
    modin_fpath = 'schemaorg/catcherintherye-ugly.md'
    # Context manager so the input file is closed once it has been read
    with open(os.path.join(testresourcepath, modin_fpath)) as fp:
        literate.parse(fp.read(), modin)

    FINGERPRINT_RULES = {
        SCH_NS('Book'): (
            materialize(var('itype'),
                fprint=[
                    (BF_NS('isbn'), follow(SCH_NS('isbn'))),
                ],
                links=[
                    (BF_NS('instantiates'),
                        materialize(BF_NS('Work'),
                            fprint=[
                                (BF_NS('name'), follow(SCH_NS('title'))),
                                (BF_NS('creator'), follow(SCH_NS('author'))),
                                (BF_NS('language'), var('lang')),
                            ],
                            links=[('http://instantiated-by', var('@stem'))],
                            attach=False # Can remove when we have smart sessions to avoid duplicate instantiates links
                        ),
                    )
                ],
                # Not really necessary; just testing vars in this scenario
                vars={
                    'lang': follow(SCH_NS('inLanguage')),
                    'itype': BF_NS('Instance')
                    }
            )
        )
    }

    TRANSFORM_RULES = {
        # Rule for output resource type of Work or Instance
        (SCH_NS('name'), WT, IT): link(rel=BF_NS('name')),

        # Rule only for output resource type of Work
        (SCH_NS('author'), WT): materialize(BF_NS('Person'),
                                    BF_NS('creator'),
                                    vars={
                                        'birthDate': follow(SCH_NS('authorBirthDate'),
                                            origin=var('input-resource'))
                                    },
                                    fprint=[
                                        # Supplementary type
                                        (VTYPE_REL, SCH_NS('Novelist')),
                                        (BF_NS('name'), target()),
                                        (BF_NS('birthDate'), var('birthDate')),
                                    ],
                                    links=[
                                        # Supplementary type
                                        (VTYPE_REL, SCH_NS('Novelist')),
                                        (BF_NS('name'), target()),
                                        (BF_NS('birthDate'), var('birthDate')),
                                    ],
                                    preserve_fprint=True,
        ),
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('='*10, 'test_basics_2', '='*10)
    literate.write(modout)
    #import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 15
    assert len(list(util.all_origins(modout, only_types={BF_NS('Instance')}))) == 1
    assert len(list(util.all_origins(modout, only_types={BF_NS('Work')}))) == 1
    assert len(list(util.all_origins(modout, only_types={BF_NS('Person')}))) == 1
    assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
Ejemplo n.º 12
0
def test_basics_4(testresourcepath):
    '''
    Convert from schema.org to [MusicBrainz scheme](https://musicbrainz.org/doc/MusicBrainz_Database/Schema)
    '''
    import sys # Only needed when the debug= line below is enabled
    MB_NS = I('https://musicbrainz.org/doc/MusicBrainz_Database/Schema/')
    R_TYP = MB_NS('Release')
    RG_TYP = MB_NS('ReleaseGroup')
    A_TYP = MB_NS('Artist')
    DOC_NS = I('http://example.org/records/')

    modin = newmodel()
    modin_fpath = 'schemaorg/blackstar.md'
    # Context manager so the input file is closed once it has been read
    with open(os.path.join(testresourcepath, modin_fpath)) as fp:
        literate.parse(fp.read(), modin)
    # Hand-add a comment property to the Mos Def resource to test that this value doesn't bleed e.g. to Kweli's output
    modin.add(DOC_NS('md'), SCH_NS('comment'), 'test')

    FINGERPRINT_RULES = {
        SCH_NS('MusicAlbum'): (
            materialize(MB_NS('ReleaseGroup'),
                fprint=[
                    (MB_NS('title'), follow(SCH_NS('name'))),
                    (MB_NS('artist'), follow(SCH_NS('byArtist'), SCH_NS('name'))),
                ],
                links=[
                    (MB_NS('contains'), materialize(MB_NS('Release'),
                        fprint=[
                            (MB_NS('catalogue-number'), var('catnum')),
                        ],
                        links=[
                            (MB_NS('catalogue-number'), var('catnum')),
                        ]
                    ))
                ],
                vars={'catnum': follow(SCH_NS('catalogNumber'))},
                # debug=sys.stderr, # Uncomment to debug
            )
        ),

        SCH_NS('Person'): (
            materialize(MB_NS('Artist'),
                fprint=[
                    (MB_NS('name'), var('aname')),
                ],
                links=[
                    (MB_NS('name'), var('aname')),
                    (MB_NS('remark'), var('comment')),
                ],
                vars={'aname': follow(SCH_NS('name')), 'comment': follow(SCH_NS('comment'))},
            )
        )
    }

    TRANSFORM_RULES = {
        (SCH_NS('name'), R_TYP, RG_TYP): link(rel=MB_NS('title')),

        (SCH_NS('byArtist'), R_TYP): link(rel=MB_NS('by'), target=lookup('@resource')),
    }

    # Intentionally shadows the global LABELIZE_RULES
    LABELIZE_RULES = {
        MB_NS('ReleaseGroup'): follow(MB_NS('title')),
        MB_NS('Release'): follow(MB_NS('title')),
        MB_NS('Artist'): follow(MB_NS('name'))
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('='*10, 'test_basics_4', '='*10)
    literate.write(modout)
    # import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 16
    # NOTE(review): this ReleaseGroup check used to appear twice verbatim;
    # the second copy may have been meant to count MB_NS('Release') - confirm
    assert len(list(util.all_origins(modout, only_types={MB_NS('ReleaseGroup')}))) == 1
    assert len(list(util.all_origins(modout, only_types={MB_NS('Artist')}))) == 2
    # assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
    # DOC_NS('md') -> I('i5GvPVm7ClA') in the transform
    assert [stmt[0] for stmt in modout.match(None, MB_NS('remark'), 'test')] == [I('i5GvPVm7ClA')]
Ejemplo n.º 13
0
def bind(models, context=None, ignore_oftypes=None, logger=logging):
    '''
    Build nested dict objects (with '@id' / '@type' keys) from the links of
    one or more Versa models.

    models - a Versa model, or a list of them
    context - optional dict. Its '@vocab' entry, if any, is used to
        relativize type, relationship and target IRIs. Unless its '@output'
        entry is falsy, the result is wrapped together with the context
    ignore_oftypes - optional collection of type IRIs; objects of these
        types are dropped and removed from the objects that refer to them
    logger - not used by this implementation

    Returns {'@context': context, '@graph': top_objs} when a non-empty
    context is given and '@output' is not disabled, otherwise the bare list
    of top-level objects
    '''
    if not isinstance(models, list): models = [models]
    # Both optional arguments default to None, so guard before using them
    vocab = context.get('@vocab') if context else None
    ignore_oftypes = ignore_oftypes or ()
    non_top_ids = set()
    obj_pool = {} #Mapping from resource id to object and list of referring ids
    used_objects = set() #Track multiple instance of docs to prevent data structure recursion
    #typed_origins = set()
    for m in models:
        #Everything with a type
        for origin in all_origins(m):
            typ = next(lookup(m, origin, RDF_TYPE), None)
            #if p == VERSA_TYPE: p = RDF_TYPE
            obj, referents = obj_pool.setdefault(origin, ({}, []))
            if vocab and typ:
                typ_rel = iri.relativize(typ, vocab)
                if typ_rel: typ = typ_rel
            if typ: obj['@type'] = typ
            if not origin.startswith('__VERSABLANKNODE__'): obj['@id'] = origin
            for o, r, t, a in m.match(origin):
                if r == RDF_TYPE: continue
                if isinstance(t, I) and o != t:
                    # NOTE(review): the target is relativized before the pool
                    # lookup, while origins are pooled under their original
                    # IDs - confirm the two key forms are meant to differ
                    if vocab:
                        t_rel = iri.relativize(t, vocab)
                        if t_rel: t = t_rel
                    valobj, referents = obj_pool.setdefault(t, ({}, []))
                    if t in used_objects:
                        # Already embedded once; refer to it by ID from here on
                        val = t
                    else:
                        val = valobj
                        if not t.startswith('__VERSABLANKNODE__') and '@id' not in val: val['@id'] = t
                        used_objects.add(t)

                        non_top_ids.add(t) #If something has an object as a value it does not appear at the top
                    referents.append(o)
                else:
                    val = t
                if vocab:
                    r_rel = iri.relativize(r, vocab)
                    if r_rel: r = r_rel
                # Repeated relationships accumulate into a list
                if r in obj and isinstance(obj[r], list):
                    obj[r].append(val)
                elif r in obj:
                    obj[r] = [obj[r], val]
                else:
                    obj[r] = val

    #Eliminate objects of types to be ignored
    to_remove = []
    for (oid, (obj, referents)) in obj_pool.items():
        typ = obj.get('@type')
        if vocab and typ: typ = iri.absolutize(typ, vocab)
        if typ in ignore_oftypes:
            to_remove.append(oid)
            for ref in referents:
                refobj, _ = obj_pool[ref]
                for k in list(refobj.keys()):
                    v = refobj[k]
                    if isinstance(v, list) and obj in v:
                        v.remove(obj)
                        if len(v) == 1:
                            refobj[k] = v[0]
                    elif v == obj:
                        del refobj[k]

    for k in to_remove:
        del obj_pool[k]

    #Handle @id only
    for (oid, (obj, referents)) in obj_pool.items():
        for k, v in obj.items():
            # Only a dict can be an '@id'-only object; other values may lack
            # len() or not support ['@id'] indexing
            if isinstance(v, dict) and len(v) == 1 and '@id' in v:
                obj[k] = v['@id']

    #Top-level objects, minus stranded ones with no more than type
    top_objs = [
        obj for (k, (obj, refs)) in obj_pool.items()
        if k not in non_top_ids and not (len(obj) == 1 and '@type' in obj)
    ]
    #import pprint;pprint.pprint(top_objs)
    if context and context.get('@output', True):
        # Hand the wrapped structure back instead of building and dropping it
        return {'@context': context, '@graph': top_objs}
    return top_objs