Example #1
    def test_bnodes_sort(self):

        graph = Graph()
        graph.load("tests/data/bnodes.ttl", format="turtle")
        ots = OrderedTurtleSerializer(graph)

        out = BytesIO()
        ots.serialize(out)
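
The test snippets above omit their imports. Below is a minimal, self-contained sketch of the same flow; the import path (the otsrdflib package) and the inline Turtle data are assumptions for illustration, not taken from the examples.

# Minimal sketch of the flow shown above; import path and sample data are assumed.
from io import BytesIO

from rdflib import Graph
from otsrdflib import OrderedTurtleSerializer  # assumed import path

graph = Graph()
graph.parse(data="""
    @prefix ex: <http://example.org/> .
    ex:b ex:value 2 .
    ex:a ex:value 1 .
""", format="turtle")

ots = OrderedTurtleSerializer(graph)
out = BytesIO()
ots.serialize(out)
print(out.getvalue().decode("utf-8"))  # subjects appear in a stable, sorted order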
Example #2
 def run(self):
     if "glosis_cl" in self.input_csv:
         main_class = OWL.Class
     else:
         main_class = Namespace('http://www.w3.org/ns/sosa/').Procedure
     serializer = OrderedTurtleSerializer(self.graph)
     serializer.class_order = [
         OWL.Ontology, SKOS.ConceptScheme, main_class, SKOS.Concept
     ]
     with open(self.output, 'wb') as fp:
         serializer.serialize(fp)
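
Judging from the example above, class_order is a list of rdf:type values that controls how subjects are grouped in the output. A minimal sketch under that assumption (the SKOS types and sample data are chosen for illustration):

# Sketch: emit skos:ConceptScheme subjects before skos:Concept subjects.
# Assumes class_order is a list of rdf:type values in the desired output order.
from io import BytesIO

from rdflib import Graph
from rdflib.namespace import SKOS
from otsrdflib import OrderedTurtleSerializer  # assumed import path

graph = Graph()
graph.parse(data="""
    @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
    <http://example.org/c1> a skos:Concept .
    <http://example.org/scheme> a skos:ConceptScheme .
""", format="turtle")

serializer = OrderedTurtleSerializer(graph)
serializer.class_order = [SKOS.ConceptScheme, SKOS.Concept]

out = BytesIO()
serializer.serialize(out)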
Example #3
    def test_numeric(self):

        graph = Graph()
        graph.load("tests/data/numeric_unsorted.ttl", format="turtle")
        ots = OrderedTurtleSerializer(graph)
        out = BytesIO()
        ots.serialize(out)
        out = "\n".join([x for x in out.getvalue().decode("utf-8").split("\n") if x.startswith("<")])

        ref = open("tests/data/numeric_sorted.ttl").read()
        ref = "\n".join([x for x in ref.split("\n") if x.startswith("<")])

        assert ref.strip() == out.strip()
Example #4
    def test_custom_sorter(self):

        graph = Graph()
        graph.load("tests/data/group_unsorted.ttl", format="turtle")
        ots = OrderedTurtleSerializer(graph)

        ots.sorters = [("(.*?)([0-9]+)$", lambda x: self.xhash(x[0]) + int(x[1]))]

        out = BytesIO()
        ots.serialize(out)
        out = "\n".join([x for x in out.getvalue().decode("utf-8").split("\n") if x.startswith("<")])

        ref = open("tests/data/group_sorted.ttl").read()
        ref = "\n".join([x for x in ref.split("\n") if x.startswith("<")])

        assert ref.strip() == out.strip()
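
Reading the example above, each entry in sorters looks like a (regular expression, key function) pair: the expression is matched against the subject URI and the function turns the captured groups into a sort key. A minimal sketch under that assumption, sorting by a numeric suffix (the data and URIs are made up):

# Sketch: sort ex:item10 after ex:item2 by its numeric suffix rather than lexically.
# Assumes each sorters entry is a (regex, key-function-over-capture-groups) pair.
from io import BytesIO

from rdflib import Graph
from otsrdflib import OrderedTurtleSerializer  # assumed import path

graph = Graph()
graph.parse(data="""
    @prefix ex: <http://example.org/> .
    ex:item10 ex:label "ten" .
    ex:item2 ex:label "two" .
""", format="turtle")

ots = OrderedTurtleSerializer(graph)
# Zero-pad the captured digits so the keys also compare correctly as strings.
ots.sorters = [(r"([0-9]+)$", lambda x: "%010d" % int(x[0]))]

out = BytesIO()
ots.serialize(out)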
Example #5
    def test_dewey_sorter(self):

        graph = Graph()
        graph.load("tests/data/dewey_unsorted.ttl", format="turtle")
        ots = OrderedTurtleSerializer(graph)

        ots.sorters = [
            (r"/([0-9A-Z\-]+)--([0-9.\-;:]+)/e", lambda x: "T{0}--{1}".format(x[0], x[1])),  # table numbers
            (r"/([0-9.\-;:]+)/e", lambda x: "A" + x[0]),  # standard schedule numbers
        ]

        out = BytesIO()
        ots.serialize(out)
        out = "\n".join([x for x in out.getvalue().decode("utf-8").split("\n") if x.startswith("<")])

        ref = open("tests/data/dewey_sorted.ttl").read()
        ref = "\n".join([x for x in ref.split("\n") if x.startswith("<")])

        assert ref.strip() == out.strip()
Example #6
    def build(task):
        logger.info('Building mappings')

        g = load_mappings_from_file(task.file_dep, uri_filter)

        if target.endswith('.nt'):
            stream = BytesIO()
            g.serialize(stream, format='nt')
            with open(target, 'wb') as fp:
                stream.seek(0)
                fp.writelines(sorted(stream.readlines()))

        elif target.endswith('.ttl'):
            for pf in prefixes:
                g.namespace_manager.bind(pf[0], URIRef(pf[1]))

            serializer = OrderedTurtleSerializer(g)
            with open(task.targets[0], 'wb') as fp:
                serializer.serialize(fp)
        else:
            raise Error('Unknown file ext')

        logger.info('Wrote %s' % task.targets[0])
Example #7
def main():

    parser = argparse.ArgumentParser(description='Convert MARC21 Classification to SKOS/RDF')
    parser.add_argument('infile', nargs='?', help='Input XML file')
    parser.add_argument('outfile', nargs='?', help='Output RDF file')
    parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)

    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='More verbose output')
    parser.add_argument('-o', '--outformat', dest='outformat', metavar='FORMAT', nargs='?',
                        help='Output format: turtle (default), jskos, or ndjson')

    parser.add_argument('--include', dest='include', help='RDF file to be loaded into the graph ' +
                        '(e.g. to define a concept scheme). Must be the same format as {outformat}.')

    parser.add_argument('--uri', dest='base_uri', help='URI template')
    parser.add_argument('--scheme', dest='scheme_uri', help='SKOS scheme for all records, use {edition} to specify edition.')
#    parser.add_argument('--table_scheme', dest='table_scheme_uri', help='SKOS scheme for table records, use {edition} to specify edition.')
    parser.add_argument('--whitespace', dest='whitespace', metavar='STRING',
                        help='Replace whitespaces in URI templates with this.')

    parser.add_argument('--altlabels', '--indexterms', dest='altlabels', action='store_true',
                        help='Include altlabels (from 7XX or 4XX).')
    parser.add_argument('--notes', dest='notes', action='store_true',
                        help='Include note fields (DEPRECATED as including notes is now the default).')
    parser.add_argument('--exclude_notes', dest='exclude_notes', action='store_true',
                        help='Exclude note fields.')
    parser.add_argument('--components', dest='components', action='store_true',
                        help='Include component information from 765.')
    parser.add_argument('--webdewey', dest='webdewey', action='store_true',
                        help='Include non-standard WebDewey notes from 680.')
    parser.add_argument('--skip-classification', dest='skip_classification', action='store_true',
                        help='Skip classification records')
    parser.add_argument('--skip-authority', dest='skip_authority', action='store_true',
                        help='Skip authority records')
    parser.add_argument('--expand', dest='expand', action='store_true',
                        help='Use Skosify to infer skos:hasTopConcept, skos:narrower and skos:related')
    parser.add_argument('--skosify', dest='skosify',
                        help='Run Skosify with given configuration file')

    parser.add_argument('-l', '--list-schemes', dest='list_schemes', action='store_true',
                        help='List default concept schemes.')

    args = parser.parse_args()

    if args.notes:
        warnings.warn('--notes is deprecated as including notes is now the default. '
                      'The inverse option --exclude_notes has been added to exclude notes.',
                      DeprecationWarning)

    if args.list_schemes:
        print('Classification schemes:')
        for k in CONFIG['classification_schemes'].keys():
            scheme = ConceptScheme(k, ClassificationRecord)
            print('- %s' % scheme)
        print('Authority vocabularies:')
        for k in CONFIG['subject_schemes'].keys():
            scheme = ConceptScheme(k, AuthorityRecord)
            print('- %s' % scheme)
        return

    supported_formats = ['turtle', 'jskos', 'ndjson']
    if not args.outformat and args.outfile:
        ext = args.outfile.rpartition('.')[-1]
        if ext in supported_formats:
            args.outformat = ext
    if not args.outformat:
        args.outformat = 'turtle'
    elif args.outformat not in supported_formats:
        raise ValueError("Format not supported, must be one of '%s'." % "', '".join(supported_formats))

    graph = Graph()
    if args.include:
        if args.outformat == 'turtle':
            graph.load(args.include, format='turtle')
        else:
            graph.load(args.include, format='json-ld')

    nm = graph.namespace_manager
    nm.bind('dcterms', DCTERMS)
    nm.bind('skos', SKOS)
    nm.bind('wd', WD)
    nm.bind('mads', MADS)
    nm.bind('owl', OWL)

    if args.verbose:
        console_handler.setLevel(logging.DEBUG)
    else:
        console_handler.setLevel(logging.INFO)

    if args.infile is None:
        raise ValueError('Filename not specified')

    options = {
        'base_uri': args.base_uri,
        'scheme_uri': args.scheme_uri,
        'whitespace': args.whitespace,
        'include_altlabels': args.altlabels,
        'exclude_notes': args.exclude_notes,
        'include_components': args.components,
        'include_webdewey': args.webdewey,
        'skip_classification': args.skip_classification,
        'skip_authority': args.skip_authority,
        'expand': args.expand,
        'skosify': args.skosify,
    }

    marc = MarcFileReader(args.infile)
    graph = process_records(marc.records(), graph, **options)

    if not graph:
        logger.warning('RDF result is empty!')
        return

    if args.outfile and args.outfile != '-':
        out_file = open(args.outfile, 'wb')
    else:
        if (sys.version_info > (3, 0)):
            out_file = sys.stdout.buffer
        else:
            out_file = sys.stdout

    if args.outformat == 'turtle':
        # @TODO: Perhaps use OrderedTurtleSerializer if available, but fall back to the default Turtle serializer if not?
        serializer = OrderedTurtleSerializer(graph)

        serializer.class_order = [
            SKOS.ConceptScheme,
            SKOS.Concept,
        ]
        serializer.sorters = [
            (r'/([0-9A-Z\-]+)--([0-9.\-;:]+)/e', lambda x: 'C{}--{}'.format(x[0], x[1])),  # table numbers
            (r'/([0-9.\-;:]+)/e', lambda x: 'B' + x[0]),  # standard schedule numbers
            (r'^(.+)$', lambda x: 'A' + x[0]),  # fallback
        ]

        serializer.serialize(out_file)

    elif args.outformat in ['jskos', 'ndjson']:
        s = pkg_resources.resource_string(__name__, 'jskos-context.json').decode('utf-8')
        context = json.loads(s)
        jskos = json_ld.from_rdf(graph, context)
        if args.outformat == 'jskos':
            jskos['@context'] = u'https://gbv.github.io/jskos/context.json'
            out_file.write(json.dumps(jskos, sort_keys=True, indent=2).encode('utf-8'))
        else:
            for record in jskos['@graph'] if '@graph' in jskos else [jskos]:
                record['@context'] = u'https://gbv.github.io/jskos/context.json'
                out_file.write(json.dumps(record, sort_keys=True).encode('utf-8') + b'\n')

    if out_file != sys.stdout:
        logger.info('Wrote %s: %s' % (args.outformat, args.outfile))
Example #8
 def serialize(self):
     serializer = OrderedTurtleSerializer(self.g)
     with open(self.filepath, 'wb') as fp:
         serializer.serialize(fp)
Example #9
def main():

    parser = argparse.ArgumentParser(
        description='Convert MARC21 Classification to SKOS/RDF')
    parser.add_argument('infile', nargs='?', help='Input XML file')
    parser.add_argument('outfile', nargs='?', help='Output RDF file')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + __version__)

    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='More verbose output')
    parser.add_argument(
        '-o',
        '--outformat',
        dest='outformat',
        metavar='FORMAT',
        nargs='?',
        help='Output format: turtle (default), jskos, or ndjson')

    parser.add_argument(
        '--include',
        action='append',
        dest='include',
        default=[],
        help=
        'RDF file(s) to include in the output (e.g. to define a concept scheme). '
        'Must be the same format as {outformat}.')

    parser.add_argument(
        '--uri',
        dest='base_uri',
        help='Concept URI template. See vocabularies.yml for examples.')
    parser.add_argument(
        '--scheme',
        dest='scheme',
        help=
        'Concept scheme, either a URI or a key from vocabularies.yml (for a list: mc2skos --list).'
    )

    parser.add_argument('--whitespace',
                        dest='whitespace',
                        metavar='STRING',
                        help='Replace whitespaces in URI templates with this.')

    parser.add_argument('--altlabels',
                        '--indexterms',
                        dest='altlabels',
                        action='store_true',
                        help='Include altlabels (from 7XX or 4XX).')
    parser.add_argument(
        '--notes',
        dest='notes',
        action='store_true',
        help=
        'Include note fields (DEPRECATED as including notes is now the default).'
    )
    parser.add_argument('--exclude_notes',
                        dest='exclude_notes',
                        action='store_true',
                        help='Exclude note fields.')
    parser.add_argument('--components',
                        dest='components',
                        action='store_true',
                        help='Include component information from 765.')
    parser.add_argument('--webdewey',
                        dest='webdewey',
                        action='store_true',
                        help='Include non-standard WebDewey notes from 680.')
    parser.add_argument('--skip-classification',
                        dest='skip_classification',
                        action='store_true',
                        help='Skip classification records')
    parser.add_argument('--skip-authority',
                        dest='skip_authority',
                        action='store_true',
                        help='Skip authority records')
    parser.add_argument(
        '--expand',
        dest='expand',
        action='store_true',
        help=
        'Use Skosify to infer skos:hasTopConcept, skos:narrower and skos:related'
    )
    parser.add_argument('--skosify',
                        dest='skosify',
                        help='Run Skosify with given configuration file')

    parser.add_argument('-l',
                        '--list-schemes',
                        dest='list_schemes',
                        action='store_true',
                        help='List default concept schemes.')

    parser.add_argument(
        '--nll-lang',
        dest='nll_lang',
        action='store_true',
        help='Set language tags specific to the NLL authority file.')

    args = parser.parse_args()

    if args.notes:
        warnings.warn(
            '--notes is deprecated as including notes is now the default. '
            'The inverse option --exclude_notes has been added to exclude notes.',
            DeprecationWarning)

    with pkg_resources.resource_stream(__name__, 'vocabularies.yml') as fp:
        vocabularies = Vocabularies()
        vocabularies.load_yaml(fp)

    vocabularies.set_default_scheme(generic=args.base_uri,
                                    scheme=args.scheme,
                                    whitespace=args.whitespace)

    if args.list_schemes:
        print('Schemes:')
        for voc in vocabularies:
            print('- %s' % voc)
        return

    supported_formats = ['turtle', 'jskos', 'ndjson']
    if not args.outformat and args.outfile:
        ext = args.outfile.rpartition('.')[-1]
        if ext in supported_formats:
            args.outformat = ext
    if not args.outformat:
        args.outformat = 'turtle'
    elif args.outformat not in supported_formats:
        raise ValueError("Format not supported, must be one of '%s'." %
                         "', '".join(supported_formats))

    graph = Graph()
    for filename in args.include:
        if args.outformat == 'turtle':
            graph.load(filename, format='turtle')
        else:
            graph.load(filename, format='json-ld')

    nm = graph.namespace_manager
    nm.bind('dcterms', DCTERMS)
    nm.bind('skos', SKOS)
    nm.bind('wd', WD)
    nm.bind('mads', MADS)
    nm.bind('owl', OWL)

    if args.verbose:
        console_handler.setLevel(logging.DEBUG)
    else:
        console_handler.setLevel(logging.INFO)

    if args.infile is None:
        raise ValueError('Filename not specified')

    options = {
        'include_altlabels': args.altlabels,
        'exclude_notes': args.exclude_notes,
        'include_components': args.components,
        'include_webdewey': args.webdewey,
        'skip_classification': args.skip_classification,
        'skip_authority': args.skip_authority,
        'expand': args.expand,
        'skosify': args.skosify,
        'vocabularies': vocabularies,
        'nll_lang': args.nll_lang
    }

    marc = MarcFileReader(args.infile)
    graph = process_records(marc.records(), graph, **options)

    if not graph:
        logger.warning('RDF result is empty!')
        return

    if args.outfile and args.outfile != '-':
        out_file = open(args.outfile, 'wb')
    else:
        if (sys.version_info > (3, 0)):
            out_file = sys.stdout.buffer
        else:
            out_file = sys.stdout

    if args.outformat == 'turtle':
        # @TODO: Perhaps use OrderedTurtleSerializer if available, but fall back to the default Turtle serializer if not?
        serializer = OrderedTurtleSerializer(graph)

        serializer.class_order = [
            SKOS.ConceptScheme,
            SKOS.Concept,
        ]
        serializer.sorters = [
            (r'/([0-9A-Z\-]+)--([0-9.\-;:]+)/e',
             lambda x: 'C{}--{}'.format(x[0], x[1])),  # table numbers
            (r'/([0-9.\-;:]+)/e',
             lambda x: 'B' + x[0]),  # standard schedule numbers
            (r'^(.+)$', lambda x: 'A' + x[0]),  # fallback
        ]

        serializer.serialize(out_file)

    elif args.outformat in ['jskos', 'ndjson']:
        s = pkg_resources.resource_string(__name__,
                                          'jskos-context.json').decode('utf-8')
        context = json.loads(s)
        jskos = json_ld.from_rdf(graph, context)
        if args.outformat == 'jskos':
            jskos['@context'] = u'https://gbv.github.io/jskos/context.json'
            out_file.write(
                json.dumps(jskos, sort_keys=True, indent=2).encode('utf-8'))
        else:
            for record in jskos['@graph'] if '@graph' in jskos else [jskos]:
                record['@context'] = u'https://gbv.github.io/jskos/context.json'
                out_file.write(
                    json.dumps(record, sort_keys=True).encode('utf-8') + b'\n')

    if out_file != sys.stdout:
        logger.info('Wrote %s: %s' % (args.outformat, args.outfile))