def get_semantic_associations(fn=None, limit=None):
    if not fn:
        verified_mappings = get_verified_mappings()
        semantic_associations = get_dbpedia_pairs_from_mappings(
            verified_mappings)
        semantic_associations = [URIRefify(p) for p in semantic_associations]
    else:
        semantic_associations = []
        with gzip.open(fn, 'rt') if fn.endswith('.gz') else open(fn) as f:
            # expects a file with one space separated pair of n3 encoded IRIs
            # per line
            r = csv.DictReader(
                f,
                delimiter=' ',
                doublequote=False,
                escapechar=None,
                quoting=csv.QUOTE_NONE,
            )
            assert r.fieldnames == ['source', 'target']
            for i, row in enumerate(r):
                if limit and i >= limit:
                    break
                source = from_n3(row['source'])
                target = from_n3(row['target'])
                semantic_associations.append((source, target))
    return semantic_associations
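The file-based branch above expects a header line ("source target") followed by one space-separated pair of N3-encoded IRIs per line. A minimal sketch of calling it; the file name and IRIs are illustrative, not from the source project:

# Hypothetical associations.txt:
#   source target
#   <http://dbpedia.org/resource/Berlin> <http://dbpedia.org/resource/Germany>
#   <http://dbpedia.org/resource/Paris> <http://dbpedia.org/resource/France>
pairs = get_semantic_associations('associations.txt', limit=2)
# -> [(URIRef('http://dbpedia.org/resource/Berlin'),
#      URIRef('http://dbpedia.org/resource/Germany')), ...]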
Example #2
 def test_util_from_n3_expectliteralwithdatatypefrombool(self):
     s = 'true'
     res = util.from_n3(s)
     self.assertEqual(res, Literal(True))
     s = 'false'
     res = util.from_n3(s)
     self.assertEqual(res, Literal(False))
Example #4
 def test_util_from_n3_expectpartialidempotencewithn3(self):
     for n3 in ('<http://ex.com/foo>',
                '"foo"@de',
                #'"\\""', # exception as '\\"' --> '"' by orig parser as well
                '"""multi\n"line"\nstring"""@en'):
         self.assertEqual(util.from_n3(n3).n3(), n3,
                          'from_n3(%(n3e)r).n3() != %(n3e)r' % {'n3e': n3})
Example #5
    def test_util_from_n3_expectsameasn3parser(self):
        def parse_n3(term_n3):
            ''' Disclaimer: Quick and dirty hack using the n3 parser. '''
            prepstr = ("@prefix  xsd: <http://www.w3.org/2001/XMLSchema#> .\n"
                       "<urn:no_use> <urn:no_use> %s.\n" % term_n3)
            g = ConjunctiveGraph()
            g.parse(data=prepstr, format='n3')
            return [t for t in g.triples((None, None, None))][0][2]

        for n3 in (  # "michel", # won't parse in original parser
                # "_:michel", # BNodes won't be the same
                '"michel"',
                '<http://example.org/schema>',
                '"michel"@fr',
                # '"michel"@fr^^xsd:fr', # FIXME: invalid n3, orig parser will prefer datatype
                # '"true"^^xsd:boolean', # FIXME: orig parser will expand xsd prefix
                '42',
                'true',
                'false',
                '"""multi\nline\nstring"""@en',
                '<http://ex.com/foo>',
                '"foo"@de',
                '"\\""@en',
                '"""multi\n"line"\nstring"""@en'):
            res, exp = util.from_n3(n3), parse_n3(n3)
            self.assertEqual(
                res, exp,
                'from_n3(%(n3e)r): %(res)r != parser.notation3: %(exp)r' % {
                    'res': res,
                    'exp': exp,
                    'n3e': n3
                })
Example #8
def main():
    '''Parse args and handle options.'''

    parser = argparse.ArgumentParser(description='Object lister for Fedora 4.')

    # Path to the repo config (endpoint, relpath, credentials, and WebAC paths)
    parser.add_argument('-r',
                        '--repo',
                        help='Path to repository configuration file.',
                        action='store',
                        required=True)

    # long mode to print more than just the URIs (name modeled after ls -l)
    parser.add_argument('-l',
                        '--long',
                        help='Display additional information besides the URI',
                        action='store_true')

    parser.add_argument('-R',
                        '--recursive',
                        help='List additional objects found by traversing '
                             'the given predicate(s)',
                        action='store')

    args = parser.parse_args()

    # configure logging
    with open('config/logging.yml', 'r') as configfile:
        logging_config = yaml.safe_load(configfile)
        logfile = 'logs/list.py.{0}.log'.format(
            datetime.utcnow().strftime('%Y%m%d%H%M%S'))
        logging_config['handlers']['file']['filename'] = logfile
        logging_config['handlers']['console']['stream'] = 'ext://sys.stderr'
        logging.config.dictConfig(logging_config)

    # Load required repository config file and create repository object
    with open(args.repo, 'r') as repoconfig:
        fcrepo = Repository(yaml.safe_load(repoconfig))
        logger.info('Loaded repo configuration from {0}'.format(args.repo))

    if args.recursive is not None:
        manager = namespaces.get_manager()
        args.predicates = [
            from_n3(p, nsm=manager) for p in args.recursive.split(',')
        ]
        logger.info(
            'Listing will traverse the following predicates: {0}'.format(
                ', '.join([p.n3() for p in args.predicates])))
    else:
        args.predicates = []

    for item_uri in sys.stdin:
        for (uri, graph) in fcrepo.recursive_get(item_uri.rstrip('\n'),
                                                 traverse=args.predicates):
            if args.long:
                title = get_title_string(graph)
                print("{0} {1}".format(uri, title))
            else:
                print(uri)
Example #9
def term_to_rdflib(term: str) -> Term:
    """Convert an HDT term into its RDFlib representation."""
    if term.startswith('?'):
        return Variable(term[1:])
    elif term.startswith("\""):
        return from_n3(term)
    else:
        return URIRef(term)
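A quick usage sketch for term_to_rdflib above; HDT terms are dispatched on their leading character (the values here are illustrative):

term_to_rdflib('?s')                    # -> Variable('s')
term_to_rdflib('"foo"@en')              # -> Literal('foo', lang='en')
term_to_rdflib('http://example.org/x')  # -> URIRef('http://example.org/x')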
Example #10
def to_rdflib_term(value):
    """Convert a N3 term to a RDFLib Term"""
    if value.startswith('http'):
        return URIRef(value)
    elif '"^^http' in value:
        index = value.find('"^^http')
        value = "{}<{}>".format(value[0:index + 3], value[index + 3:])
    return from_n3(value)
Example #11
 def ask_LITERAL(self, g, sections, var, prompt):
     answer = self.input(prompt)
     if answer.startswith('"') or answer.startswith("'"):
         return util.from_n3(answer)
     else:
         return Literal(answer,
                        lang=var.langhint,
                        datatype=var.datatypehint)
Example #12
 def lookup_blanks(self, g, bn, conn):
     """Recursively find any relevant blank nodes for
     the current lookup
     @param g The graph
     @param bn The blank node ID (starting _:)
     @param conn The database connection
     """
     cursor = conn.cursor()
     cursor.execute("""select subject, property, object from triples where
     page="<BLANK>" and subject=?""", (bn[2:],))  # assumption: the original query was missing the ? placeholder for bn[2:]
     rows = cursor.fetchall()
     for s, p, o in rows:
         g.add((from_n3(s), from_n3(p), from_n3(o)))
         if o.startswith("_:"):
             self.lookup_blanks(g, o, conn)
     cursor.close()
Example #13
 def test_util_from_n3_not_escapes(self) -> None:
     strings = [
         "jörn",
         "j\\xf6rn",
     ]
     for string in strings:
         with self.subTest(f"{string}"):
             literal_str = str(util.from_n3(f'"{string}"'))
             self.assertEqual(literal_str, f"{string}")
Example #14
 def test_util_from_n3_not_escapes_xf(self) -> None:
     strings = [
         f"j\\366rn",
         f"\\",
         f"\\0",
         f"\\I",
     ]
     for string in strings:
         with self.subTest(f"{string}"):
             literal_str = str(util.from_n3(f'"{string}"'))
             self.assertEqual(literal_str, f"{string}")
Example #15
 def ask_NODE(self, g, sections, var, prompt):
     answer = self.input(prompt)
     if answer.startswith("c") and var.classhint and var.classhint in sections:
         s = sections[answer[1:].strip()]
         node = s.construct(g, sections, None)
         print("back to {}".format(self.name), file=self.out)
         return node
     elif answer:
         return util.from_n3(answer)
     else:
         return None
Example #16
def uri_or_curie(arg: str):
    if arg and (arg.startswith('http://') or arg.startswith('https://')):
        # looks like an absolute HTTP URI
        return URIRef(arg)
    try:
        term = from_n3(arg, nsm=namespaces.get_manager())
    except KeyError:
        raise ArgumentTypeError(
            f'"{arg[:arg.index(":") + 1]}" is not a known prefix')
    if not isinstance(term, URIRef):
        raise ArgumentTypeError('must be a URI or CURIE')
    return term
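uri_or_curie is shaped to act as an argparse type converter; a hedged sketch of that wiring (the parser and CLI values are illustrative assumptions):

import argparse

parser = argparse.ArgumentParser()
# Validate and expand the argument at parse time; argparse turns the
# ArgumentTypeError raised above into a usage error.
parser.add_argument('rdf_type', type=uri_or_curie)
# Assuming the namespace manager knows the dcterms prefix, both
#   prog.py dcterms:Agent
#   prog.py http://purl.org/dc/terms/Agent
# would yield URIRef('http://purl.org/dc/terms/Agent').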
Example #18
    def summarize(self, id):
        """Summarize an id
        @param id The id
        @return A RDFlib Graph or None if the ID is not found
        """
        g = ConjunctiveGraph()
        conn = sqlite3.connect(self.db)
        cursor = conn.cursor()

        cursor.execute(
            """select subject, property, object from triples where
            subject=?""", ("<%s%s>" % (BASE_NAME, unicode_escape(id)),))
        rows = cursor.fetchall()
        added = 0
        if rows:
            for s, p, o in rows:
                for f in FACETS:
                    if added < 20 and str(p)[1:-1] == f["uri"]:
                        g.add((from_n3(s), from_n3(p), from_n3(o)))
                        added += 1
        conn.close()
        return g
Example #19
def create_class_from_mapping(mapping, rdf_type=None):
    cls = type('csv', (pcdm.Item, ), {})
    for column, conf in mapping.items():
        if 'predicate' in conf:
            pred_uri = from_n3(conf['predicate'], nsm=nsm)
            if conf.get('uriref', False):
                add_property = rdf.object_property(column, pred_uri)
            else:
                if 'datatype' in conf:
                    datatype = from_n3(conf['datatype'], nsm=nsm)
                else:
                    datatype = None
                add_property = rdf.data_property(column,
                                                 pred_uri,
                                                 datatype=datatype)
            add_property(cls)

    if rdf_type is not None:
        add_type = rdf.rdf_class(rdf_type)
        add_type(cls)

    return cls
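A hypothetical mapping illustrating the shape create_class_from_mapping consumes; the column names and prefixes here are assumptions, not taken from the source project:

mapping = {
    'title': {'predicate': 'dcterms:title'},
    'created': {'predicate': 'dcterms:created', 'datatype': 'xsd:date'},
    'member_of': {'predicate': 'pcdm:memberOf', 'uriref': True},
}
# 'uriref' columns become object properties, the rest data properties
# (optionally typed); each is attached to the generated class.
cls = create_class_from_mapping(mapping)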
Example #20
def to_rdflib_term(value: str) -> Union[Literal, URIRef, Variable]:
    """Convert a N3 term to a RDFLib Term.
    
    Argument: A RDF Term in N3 format.

    Returns: The RDF Term in rdflib format.
    """
    if value.startswith('http'):
        return URIRef(value)
    elif '"^^http' in value:
        index = value.find('"^^http')
        value = f"{value[0:index+3]}<{value[index+3:]}>"
    return from_n3(value)
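What the '"^^http' branch above does, sketched with illustrative values: it wraps the bare datatype IRI in angle brackets so from_n3 can parse it:

to_rdflib_term('http://example.org/x')
# -> URIRef('http://example.org/x')
to_rdflib_term('"42"^^http://www.w3.org/2001/XMLSchema#integer')
# rewritten internally to '"42"^^<http://www.w3.org/2001/XMLSchema#integer>'
# -> Literal('42', datatype=XSD.integer)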
Example #21
 def test_util_from_n3_expectpartialidempotencewithn3(self):
     for n3 in (
             "<http://ex.com/foo>",
             '"foo"@de',
             u"<http://ex.com/漢字>",
             u"<http://ex.com/a#あ>",
             # '"\\""', # exception as '\\"' --> '"' by orig parser as well
             '"""multi\n"line"\nstring"""@en',
     ):
         self.assertEqual(
             util.from_n3(n3).n3(),
             n3,
             "from_n3(%(n3e)r).n3() != %(n3e)r" % {"n3e": n3},
         )
Example #22
    def __init__(self, repo, config):
        self.logger = logging.getLogger(__name__ + '.' +
                                        self.__class__.__name__)

        # Set configuration properties
        self.collection = pcdm.Collection.from_repository(
            repo, config.collection_uri)

        missing_fields = []
        try:
            self.file_path = os.path.join(config.data_dir,
                                          config.handler_options['FILE_PATH'])
        except KeyError:
            missing_fields.append('FILE_PATH')
        try:
            self.metadata_map = os.path.join(
                config.data_dir, config.handler_options['METADATA_MAP'])
        except KeyError:
            missing_fields.append('METADATA_MAP')

        if missing_fields:
            field_names = ', '.join(missing_fields)
            raise ConfigException(
                f'Missing required HANDLER_OPTIONS in batch configuration: {field_names}'
            )

        if 'RDF_TYPE' in config.handler_options:
            self.item_rdf_type = URIRef(
                from_n3(config.handler_options['RDF_TYPE'], nsm=nsm))
        else:
            self.item_rdf_type = None

        # load the metadata map and metadata file
        try:
            with open(self.metadata_map, 'r') as f:
                self.logger.info(
                    f'Parsing the metadata map in {self.metadata_map}')
                self.mapping = yaml.safe_load(f)
            with open(config.batch_file, 'r') as f:
                self.logger.info(f'Reading metadata file {config.batch_file}')
                self.rows = [r for r in csv.DictReader(f)]
        except FileNotFoundError as e:
            raise ConfigException(e)

        key_column = get_flagged_column(self.mapping, 'key')
        if key_column is not None:
            self.length = len(set([line[key_column] for line in self.rows]))
        else:
            self.length = len(self.rows)
Example #23
def get_semantic_associations(fn=None):
    if not fn:
        verified_mappings = get_verified_mappings()
        semantic_associations = get_dbpedia_pairs_from_mappings(
            verified_mappings)
        semantic_associations = [URIRefify(p) for p in semantic_associations]
    else:
        semantic_associations = []
        with open(fn) as f:
            # expects a file with one space separated pair of n3 encoded IRIs
            # per line
            r = csv.DictReader(
                f,
                delimiter=' ',
                doublequote=False,
                escapechar=None,
                quoting=csv.QUOTE_NONE,
            )
            assert r.fieldnames == ['source', 'target']
            for row in r:
                source = from_n3(row['source'])
                target = from_n3(row['target'])
                semantic_associations.append((source, target))
    return semantic_associations
Example #24
    def lookup(self, id):
        """Resolve a single id
        @param id The id
        @return A RDFlib Graph or None if the ID is not found
        """
        g = ConjunctiveGraph()
        g.bind("lemon", "http://lemon-model.net/lemon#")
        g.bind("owl", str(OWL))
        conn = sqlite3.connect(self.db)
        cursor = conn.cursor()

        cursor.execute(
            """select subject, property, object from triples where
            page=?""", (unicode_escape(id),))
        rows = cursor.fetchall()
        if rows:
            for s, p, o in rows:
                g.add((from_n3(s), from_n3(p), from_n3(o)))
                if o.startswith("_:"):
                    self.lookup_blanks(g, o, conn)
            conn.close()
            return g
        else:
            conn.close()
            return None
Example #25
 def test_util_from_n3_escapes(self) -> None:
     pairs = [
         ("\\t", "\t"),
         ("\\b", "\b"),
         ("\\n", "\n"),
         ("\\r", "\r"),
         ("\\f", "\f"),
         ('\\"', '"'),
         ("\\'", "'"),
         ("\\\\", "\\"),
         ("\\u00F6", "ö"),
         ("\\U000000F6", "ö"),
     ]
     for escaped, raw in pairs:
         with self.subTest(f"{escaped} => {raw}"):
             literal_str = str(util.from_n3(f'"{escaped}"'))
             self.assertEqual(literal_str, f"{raw}")
Example #26
    def list_values(self, offset, limit, prop):
        """
        Produce a list of all possible values for a particular property
        @param offset Where to start listing
        @param limit Number of values to list
        @param prop The property to list for
        @return A tuple consisting of a boolean indicating if there are more
        results and list of values that exist (as N3)
        """
        conn = sqlite3.connect(self.db)
        cursor = conn.cursor()
        if not offset:
            offset = 0
        cursor.execute("""SELECT DISTINCT object, obj_label, count(*)
                          FROM triples WHERE property=? AND head=0
                          GROUP BY oid ORDER BY count(*) DESC
                          LIMIT ? OFFSET ?""", (prop, limit + 1, offset))
        row = cursor.fetchone()
        n = 0
        results = []
        while n < limit and row:
            obj, label, count = row
            n3 = from_n3(obj)
            if isinstance(n3, Literal):
                results.append({'link': obj, 'label': n3.value,
                                'count': count})
            elif isinstance(n3, URIRef):
                if label:
                    results.append({'link': obj, 'label': label,
                                    'count': count})
                else:
                    results.append({'link': obj,
                                    'label': yuzu.displayer.DISPLAYER.apply(
                                        str(n3)),
                                    'count': count})
            n += 1
            row = cursor.fetchone()
        conn.close()
        return n == limit, results
Example #27
def srtsx_body2(r, vars):
    for v in vars:
        val = from_n3(r[vars.index(v)])
        if isinstance(val, URIRef):
            yield ("    <binding name=\"%s\"><uri>%s</uri></binding>"
                   % (v, str(val)))
        elif isinstance(val, BNode):
            yield ("    <binding name=\"%s\"><bnode>%s</bnode></binding>"
                   % (v, str(val)))
        elif val.language:
            yield ("    <binding name=\"%s\"><literal xml:lang=\"%s\">"
                   "%s</literal></binding>" % (v, val.language, str(val)))
        elif val.datatype:
            yield("     <binding name=\"%s\"><literal datatype=\"%s\">"
                  "%s</literal></binding>" % (v, val.datatype, str(val)))
        else:
            yield("     <binding name=\"%s\"><literal>%s</literal></binding>"
                  % (v, str(val)))
Example #28
 def get_column_value(self, row, column):
     conf = self.mapping[column]
     value = row.get(column, None)
     if value is None:
         # this is a "dummy" column that is not actually in the
         # source CSV file but should be generated, either from
         # a format-string pattern or a static value
         if 'pattern' in conf:
             value = conf['pattern'].format(**row)
         elif 'value' in conf:
             value = conf['value']
     if conf.get('uriref', False):
         try:
             return URIRef(from_n3(value, nsm=nsm))
         except KeyError:
             # prefix not found, assume it is not a prefixed form
             return URIRef(value)
     else:
         return value
Example #29
def srtsj_body2(r, vars):
    for v in vars:
        val = from_n3(r[vars.index(v)])
        if not val:
            yield ""
            continue
        if isinstance(val, URIRef):
            yield ("      \"%s\": { \"type\": \"uri\", \"value\": \"%s\" }"
                   % (v, str(val)))
        elif isinstance(val, BNode):
            yield ("      \"%s\": { \"type\": \"bnode\", \"value\": \"%s\" }"
                   % (v, str(val)))
        elif val.language:
            yield ("      \"%s\": { \"type\": \"literal\", \"xml:lang\": "
                   "\"%s\", \"value\": \"%s\" }" % (v, val.language, str(val)))
        elif val.datatype:
            yield ("      \"%s\": { \"type\": \"literal\", \"datatype\": "
                   "\"%s\", \"value\": \"%s\" }" % (v, val.datatype,
                                                    str(val)))
        else:
            yield ("      \"%s\": { \"type\": \"literal\", \"value\": \"%s\" }"
                   % (v, str(val)))
Example #30
def yield_triples(file):
    total = 0
    blocks = []
    block_size = 5000
    parsed = 0
    print('-> starting yielding...')
    to_read = ""
    for cnt, line in enumerate(file):
        try:
            # Do not touch the lines below. We read line by line; if a triple
            # spans multiple lines (i.e. the regex finds no match yet), we keep
            # reading, accumulating the string and re-testing the accumulated
            # string; on a match we continue.
            if to_read == "":
                triple = SAGE_NTRIPLES_REGEX.findall(line)
                to_read += line
                if len(triple) > 0:
                    triple = triple[0]
                    blocks.append((from_n3(triple[0]), from_n3(triple[1]), from_n3(triple[2])))
                    parsed += 1
                    to_read = ""
                else:
                    to_read = to_read.replace('\n', '')
            else:
                to_read += line
                triple = SAGE_NTRIPLES_REGEX.findall(to_read)
                if len(triple) > 0:
                    triple = triple[0]
                    blocks.append((from_n3(triple[0]), from_n3(triple[1]), from_n3(triple[2])))
                    parsed += 1
                    to_read = ""
                else:
                    to_read = to_read.replace('\n', '')
            if cnt % block_size == 0:
                parsed = 0
                for t in blocks:
                    total += 1
                    yield __n3_to_str(t)
                blocks = []
        except Exception as err:
            print(err)
            print(line)
            exit(1)

    if len(blocks) > 0:
        for t in blocks:
            total += 1
            yield __n3_to_str(t)
    print('-> yielded {} triples'.format(total))
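SAGE_NTRIPLES_REGEX is defined elsewhere in the project; a rough stand-in, purely an assumption to make the snippet self-contained, could be:

import re

# Hypothetical stand-in for SAGE_NTRIPLES_REGEX: three whitespace-separated
# N3 terms followed by the terminating dot of an N-Triples line. findall()
# then returns a list of (subject, predicate, object) tuples.
SAGE_NTRIPLES_REGEX = re.compile(r'^(\S+)\s+(\S+)\s+(.+?)\s*\.\s*$')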
Example #31
def parse_term(term: Union[str, List[str]],
               nsm: NamespaceManager = None) -> Term:
    """Parse a raw RDF term or a list of raw RDF Terms into the rdflib format.

    Args:
      * term: (List of) RDF Term(s) to parse (in n-triples format).
      * nsm: Namespace manager used to expand prefixed URIs.

    Returns:
      The parsed RDF term in rdflib format.
    """
    # case 1: a single RDF Term
    if isinstance(term, str):
        # the special keyword "none" is interpreted as "ottr:None"
        if term == "none":
            return OTTR_NONE
        # rdflib tends to see SPARQL variables as blank nodes, so we need to handle them separately
        if term.startswith('?'):
            return Variable(term[1:])
        return from_n3(term, nsm=nsm)
    else:  # Case 2: a list of RDF terms
        return [parse_term(value, nsm=nsm) for value in term]
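How parse_term dispatches, in a minimal sketch; OTTR_NONE and any prefixes are assumed to be defined as in the surrounding module:

parse_term('?x')                      # -> Variable('x')
parse_term('<http://example.org/a>')  # -> URIRef('http://example.org/a')
parse_term('none')                    # -> OTTR_NONE
parse_term(['"foo"@en', '?y'])        # -> [Literal('foo', lang='en'), Variable('y')]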
Example #32
def synset(context, offset, graph=None, extras=False, translate=True):
    """ 
    Return an RDF graph for a synset given an offset value
    @param context: A WNRDFContext object
    @param offset: The offset value in the database (Int)
    @param graph: If not None add to this graph
    @return The graph passed (or a new graph) containing the triples for this synset or None if the synset was not found
    """
    if graph is None:
        graph = make_graph()
    cursor = context.conn.cursor()

    not_translated = True
    if translate:
        c = context.mconn.cursor()
        c.execute("select internal from wn31r where release=?", (offset,))
        row = c.fetchone()
        if row:
            offset, = row
            not_translated = False
        else:
            not_translated = True

    # Read the synset information
    cursor.execute("select pos, lexdomainid, definition from synsets where synsetid=?", (offset,)) # no index
    row = cursor.fetchone()
    if row is None:
        return None
    pos, lexdomainid, definition = row
    if not_translated:
        synset_uri = synset_name(context, offset, pos.upper())
    else:
        synset_uri = synset_name(context, offset, pos)
    graph.add((synset_uri, RDF.type, wn_ontology.Synset))
    graph.add((synset_uri, wn_ontology.part_of_speech, wn_ontology.term(context.postypes[pos])))
    graph.add((synset_uri, wn_ontology.lexical_domain, wn_ontology.term(context.lexdomainid_to_name[lexdomainid])))
    graph.add((synset_uri, wn_ontology.gloss, Literal(definition, lang=context.lang)))

    cursor.execute("select lemma, casedwordid from senses inner join words on senses.synsetid=? and senses.wordid=words.wordid",
                   (offset,))
    for lemma, casedwordid in cursor.fetchall():
        if casedwordid:
            cursor.execute("select cased from casedwords where casedwordid=?", (casedwordid,))
            cased_lemma, = cursor.fetchone()
            graph.add((synset_uri, RDFS.label, Literal(cased_lemma, lang=context.lang)))
            graph.add((synset_uri, wn_ontology.synset_member, entry_name(cased_lemma, pos)))
        else:
            graph.add((synset_uri, RDFS.label, Literal(lemma, lang=context.lang)))
            graph.add((synset_uri, wn_ontology.synset_member, entry_name(lemma, pos)))

    # Read the phrase type (if it exists)
    cursor.execute("select phrasetype from phrasetypes where synsetid=?", (offset,)) # unindexed
    for phrasetype, in cursor.fetchall():
        graph.add((synset_uri, wn_ontology.phrase_type, wn_ontology.term(phrasetype)))

    # Read the samples
    cursor.execute("select sampleid, sample from samples where synsetid=?", (offset,))
    for sampleid, sample in cursor.fetchall():
        graph.add((synset_uri, wn_ontology.sample, Literal(sample, lang=context.lang)))

    # Read the synset links
    cursor.execute("select synset2id, linkid from semlinks where synset1id=?", (offset,))
    for synsetid2, linkid in cursor.fetchall():
        cursor.execute("select pos from synsets where synsetid=?", (synsetid2,))
        row = cursor.fetchone()
        if row is None:
            sys.stderr.write("Synset %s referred to but not found " % synsetid2)
        else:
            pos2, = row
            synset_uri2 = synset_name(context, synsetid2, pos2)
            graph.add((synset_uri, wn_ontology.term(context.linktypes[linkid]), synset_uri2))
            if extras:
                cursor.execute("select definition from synsets where synsetid=?", (synsetid2,))
                def2, = cursor.fetchone()
                graph.add((synset_uri2, wn_ontology.gloss, Literal(def2, lang=context.lang)))

    try:
        cursor.execute("select property, object from synsettriples where synsetid=?",(offset,))
        for p, o in cursor.fetchall():
            graph.add((synset_uri, URIRef(p), from_n3(o)))
    except Exception as e:
        print(e)

    return graph
Example #33
def entry(context, lemma, pos, graph=None):
    """ 
    Return an RDF graph for a lexical entry given a particular lemma string
    @param context: A WNRDF Context
    @param lemma: The lemma (case-sensitive!)
    @param pos: The part-of-speech (as a 1-letter code)
    @param graph: A graph to add the triples to (or None for a new graph)
    @return The graph containing the entry's triples or None if the entry was not found
    """
    # First map the lemma to the internal word id
    if graph is None:
        graph = make_graph()
    cursor = context.conn.cursor()

    if not lemma.islower():
        cased_lemma = lemma
        lemma = lemma.lower()
    else:
        cased_lemma = lemma
    cursor.execute("select * from words where lemma=?", (lemma,))
    row = cursor.fetchone()
    if row is None:
        return None
    word_id, _ = row

    # Add entry description
    entry_uri = entry_name(cased_lemma, pos)
    graph.add((entry_uri, RDF.type, lemon.LexicalEntry))
    graph.add((entry_uri, wn_ontology.part_of_speech, wn_ontology.term(context.postypes[pos])))
    canonical_form_uri = entry_name(cased_lemma, pos, "CanonicalForm")
    graph.add((entry_uri, lemon.canonicalForm, canonical_form_uri))
    graph.add((canonical_form_uri, lemon.writtenRep, Literal(cased_lemma, lang=context.lang)))
    graph.add((canonical_form_uri, RDF.type, lemon.Form))

    # Search for morphological forms
    cursor.execute("select pos, morphid from morphmaps where wordid=? and pos=?", (word_id, pos)) # partially unindexed
    other_forms = 1
    this_pos_found = False
    for pos, morphid in cursor.fetchall():
        cursor.execute("select morph from morphs where morphid=?", (morphid,)) # unindexed
        for morph, in cursor.fetchall():
            other_form_uri = entry_name(cased_lemma, pos, "Form-%d" % other_forms)
            graph.add((entry_uri, lemon.otherForm, other_form_uri))
            graph.add((other_form_uri, RDF.type, lemon.Form))
            graph.add((other_form_uri, lemon.writtenRep, Literal(morph, lang=context.lang)))
            other_forms += 1

    # Find senses
    if cased_lemma.islower():
        #cursor.execute("select * from senses where wordid=? and casedwordid is NULL", (word_id,))
        cursor.execute("select * from senses where wordid=?", (word_id,))
    else:
        cursor.execute("select casedwordid from casedwords where cased=?",(cased_lemma,))
        row = cursor.fetchone()
        if row is None:
            return None
        casedwordid, = row
        cursor.execute("select * from senses where casedwordid=?", (casedwordid,))
    for _, casedwordid, synsetid, senseid, sensenum, lexid, tagcount, old_sensekey, sensekey in cursor.fetchall():
        # NB. This could also be achieved by querying "casedwordid is NULL" however
        # this is significantly slower, so we filter in Python checking we return cased
        # forms only for cased lemmas
        if cased_lemma.islower() == bool(casedwordid):
            continue
        if sensekey[-1] == pos:
            this_pos_found = True
            _, sensekey2 = sensekey.split('#')
            sense_uri = entry_name(cased_lemma, pos, sensekey2)
            graph.add((entry_uri, lemon.sense, sense_uri))
            graph.add((sense_uri, RDF.type, lemon.LexicalSense))
            graph.add((sense_uri, lemon.reference, synset_name(context, synsetid, pos)))
            graph.add((sense_uri, wn_ontology.sense_number, Literal(sensenum)))
            graph.add((sense_uri, wn_ontology.tag_count, Literal(tagcount)))
            graph.add((sense_uri, wn_ontology.lex_id, Literal(lexid)))
            graph.add((sense_uri, wn_ontology.old_sense_key, Literal(old_sensekey)))

            # Now adjective positions
            cursor.execute("select position from adjpositions where synsetid=? and wordid=?", (synsetid, word_id))
            rows = cursor.fetchall()
            for position, in rows:
                graph.add((sense_uri, wn_ontology.adjposition,
                           URIRef(wn_ontology.term(quote_plus(context.adjposition_names[position])))))

            # Add definition also to sense
            cursor.execute("select definition from synsets where synsetid=?", (synsetid,))
            for definition, in cursor.fetchall():
                graph.add((sense_uri, wn_ontology.gloss, Literal(definition, lang=context.lang)))

            # Sense links
            cursor.execute("select senseid2, linkid from lexlinks where senseid1=?", (senseid,))
            for senseid2, linkid in cursor.fetchall():
                cursor.execute("select sensekey from senses where senseid=?", (senseid2,))
                sensekey3, = cursor.fetchone()
                sense2_lemma, sense2_key = sensekey3.split('#')
                pos2 = sensekey3[-1]
                sense_uri2 = entry_name(sense2_lemma, pos2, sense2_key)
                graph.add((sense_uri, wn_ontology.term(context.linktypes[linkid]), sense_uri2))

            # Verb frames (maybe only if pos=='v'?)
            cursor.execute("select sentenceid from vframesentencemaps where synsetid=? and wordid=?",
                           (synsetid, word_id))
            for sentenceid, in cursor.fetchall():
                graph.add((sense_uri, wn_ontology.verb_frame_sentence,
                           Literal(context.vframesentences[sentenceid], lang=context.lang)))

            # Sense tags
            cursor.execute("select position, senseid from sensetags inner join taggedtexts on sensetags.sensetagid=taggedtexts.sensetagid where new_sensekey=?",(sensekey,)) # unindexed
            for position, senseid in cursor.fetchall():
                cursor.execute("select sensekey from senses where senseid=?",(senseid,))
                for sensekey, in cursor.fetchall():
                    if position:
                        comp_uri = entry_name(sensekey[0:sensekey.index('#')].replace("_"," "),sensekey[-1],'Component-' + str(position+1))
                        graph.add((sense_uri, wn_ontology.sense_tag, comp_uri))
            
            # LexVo Link
            graph.add((sense_uri, OWL.sameAs, translate_to_lexvo(old_sensekey, pos)))

                 
    if not this_pos_found:
        return None

    if pos == "p":
        words = lemma.split(" ")
        node = BNode()
        comp1 = entry_name(lemma, pos, "Component-1")
        graph.add((entry_uri, lemon.decomposition, node))
        graph.add((node, RDF.first, comp1))
        graph.add((comp1, RDFS.label, Literal(words[0], lang=context.lang)))
        graph.add((comp1, RDF.type, lemon.Component))

        for idx in range(1,len(words)):
            node2 = BNode()
            graph.add((node, RDF.rest, node2))
            node = node2
            comp_uri = entry_name(lemma, pos, "Component-" + str(idx + 1))
            graph.add((node, RDF.first, comp_uri))
            graph.add((comp_uri, RDFS.label, Literal(words[idx], lang=context.lang)))
            graph.add((comp_uri, RDF.type, lemon.Component))
        graph.add((node, RDF.rest, RDF.nil))

    try:
        cursor.execute("select fragment, property, object from entrytriples where lemma=?",(quote_plus(lemma)+"-"+pos,))
        for f, p, o in cursor.fetchall():
            graph.add((entry_name(lemma,pos,f), from_n3(p), from_n3(o)))
    except Exception:
        pass


    return graph
Example #36
 def test_util_from_n3_expectliteralandlangdtype(self):
     s = '"michel"@fr^^xsd:fr'
     res = util.from_n3(s, default=None, backend=None)
     self.assertTrue(isinstance(res, Literal))
     self.assertEqual(res, Literal('michel', datatype=XSD['fr']))
Example #37
 def test_util_from_n3_expecturiref(self):
     s = '<http://example.org/schema>'
     res = util.from_n3(s, default=None, backend=None)
     self.assertTrue(isinstance(res, URIRef))
Example #38
 def ask_LITERAL(self, g, sections, var, prompt):
     answer = self.input(prompt)
     if answer.startswith('"') or answer.startswith("'"):
         return util.from_n3(answer)
     else:
         return Literal(answer, lang=var.langhint, datatype=var.datatypehint)
Example #41
 def test_util_from_n3_expectliteralwithdatatypefromint(self):
     s = '42'
     res = util.from_n3(s)
     self.assertEqual(res, Literal(42))
Example #43
 def test_util_from_n3_expectliteralanddtype(self):
     s = '"true"^^xsd:boolean'
     res = util.from_n3(s, default=None, backend=None)
     self.assertTrue(res.eq(Literal('true', datatype='xsd:boolean')))
Example #44
 def test_util_from_n3_expectliteralandlangdtype(self):
     s = '"michel"@fr^^xsd:fr'
     res = util.from_n3(s, default=None, backend=None)
     self.assertTrue(isinstance(res, Literal))
     self.assertEqual(res, Literal('michel',
                                   datatype=URIRef('xsd:fr')))
Example #46
 def test_util_from_n3_expectquotedgraph(self):
     s = '{<http://example.com/schema>}'
     res = util.from_n3(s, default=None, backend="IOMemory")
     self.assertTrue(isinstance(res, QuotedGraph))
Example #47
def main(
        resdir,
        sparql_endpoint,
        max_queries,
        clustering_variant,
        fusion_methods,
        timeout,
        max_results,
        max_target_candidates_per_gp,
        batch_predict,
        drop_bad_uris,
        **_  # gulp remaining kwargs
):
    from gp_query import calibrate_query_timeout
    from serialization import load_results
    from serialization import find_last_result
    from cluster import cluster_gps_to_reduce_queries
    from gp_learner import init_workers

    # init workers
    init_workers()

    sparql = SPARQLWrapper.SPARQLWrapper(sparql_endpoint)
    timeout = timeout if timeout > 0 else calibrate_query_timeout(sparql)

    # load model
    last_res = find_last_result()
    if not last_res:
        logger.error('cannot find fully trained model in %s', resdir)
        sys.exit(1)
    result_patterns, coverage_counts, gtp_scores = load_results(last_res)
    gps = [gp for gp, _ in result_patterns]
    gps = cluster_gps_to_reduce_queries(
        gps, max_queries, gtp_scores, clustering_variant)

    processed = 0
    start = time.time()
    batch_size = config.BATCH_SIZE if batch_predict else 1
    # main loop
    for lines in chunker(sys.stdin, batch_size):
        batch = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if drop_bad_uris:
                # noinspection PyBroadException
                try:
                    source = from_n3(line)
                    utils.curify(source)
                except Exception:
                    logger.warning(
                        'could not curify URI %s, skipping', line)
                    continue
            if line[0] not in '<"':
                logger.error(
                    'expected inputs to start with < or ", but got: %s', line)
                sys.exit(1)
            source = from_n3(line)
            batch.append(source)
        batch = list(OrderedDict.fromkeys(batch))

        if len(batch) == 0:
            pass
        elif len(batch) == 1:
            res = predict(
                sparql, timeout, gps, batch[0], fusion_methods,
                max_results, max_target_candidates_per_gp
            )
            print(json.dumps(res))
            logger.info(
                'Predicted %d target candidates for %s',
                res['orig_result_length'], res['source']
            )
        else:
            res = multi_predict(
                sparql, timeout, gps, batch, fusion_methods,
                max_results, max_target_candidates_per_gp
            )
            for r in res:
                print(json.dumps(r))
            logger.info('\n'.join([
                'Predicted %d target candidates for %s' % (
                    r['orig_result_length'], r['source']
                ) for r in res
            ]))

        processed += len(batch)
        logger.info(
            'Have processed %d URIs now. Took %s sec',
            processed, time.time()-start)
Example #48
 def fromUnicode(self, str):
     value = from_n3(str)
     self.validate(value)
     return value
Example #49
 def test_util_from_n3_expectbnode(self):
     s = "_:michel"
     res = util.from_n3(s, default=None, backend=None)
     self.assertTrue(isinstance(res, BNode))
Example #50
 def test_util_from_n3_expectgraph(self):
     s = '[http://example.com/schema]'
     res = util.from_n3(s, default=None, backend="IOMemory")
     self.assertTrue(isinstance(res, Graph))
Example #51
 def test_util_from_n3_expectliteralandlang(self):
     s = '"michel"@fr'
     res = util.from_n3(s, default=None, backend=None)
     self.assertTrue(isinstance(res, Literal))
Example #52
def from_n3(string):
    term = util.from_n3(string)
    if isinstance(term, Literal):
        if term.datatype is None and term.language is None:
            term = Literal(term, datatype=URIRef('xsd:string'))
    return term
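A quick sanity check of the wrapper above (hypothetical usage): plain literals come back with an explicit datatype, while tagged or typed literals pass through unchanged:

term = from_n3('"plain"')
assert term.datatype == URIRef('xsd:string')  # plain literal was normalized
term = from_n3('"tagged"@en')
assert term.language == 'en' and term.datatype is None  # passed through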
Example #53
 def test_util_from_n3_expectliteralanddtype(self):
     s = '"true"^^xsd:boolean'
     res = util.from_n3(s, default=None, backend=None)
     self.assertTrue(res.eq(Literal('true', datatype=XSD['boolean'])))
Example #54
 def test_util_from_n3_expectliteralwithescapedquote(self):
     s = '"\\""'
     res = util.from_n3(s, default=None, backend=None)
     self.assertEqual(res, Literal('"'))
Example #55
 def test_util_from_n3_expectliteralmultiline(self):
     s = '"""multi\nline\nstring"""@en'
     res = util.from_n3(s, default=None, backend=None)
     self.assertEqual(res, Literal('multi\nline\nstring', lang='en'))
Example #56
 def test_util_from_n3_sisnonenodefault(self):
     s = None
     default = None
     res = util.from_n3(s, default=default, backend=None)
     self.assertEqual(res, default)
Example #57
 def test_util_from_n3_expectliteralwithtrailingbackslash(self):
     s = '"trailing\\\\"^^<http://www.w3.org/2001/XMLSchema#string>'
     res = util.from_n3(s)
     self.assertEqual(res, Literal('trailing\\', datatype=XSD['string']))
     self.assertEqual(res.n3(), s)
Example #58
 def test_util_from_n3_sisnonewithdefault(self):
     s = None
     default = "TestofDefault"
     res = util.from_n3(s, default=default, backend=None)
     self.assertEqual(res, default)
Example #59
 def test_util_from_n3_expectgraph(self):
     s = '[<http://example.com/schema>]'
     res = util.from_n3(s, default=None, backend="IOMemory")
     self.assertTrue(isinstance(res, Graph))