Example #1
def fixRecord(record="",
              record_id=0,
              validation=False,
              replaceMethod='decimal'):
    replaceMethods = {
        'decimal': (('#29;', '#30;', '#31;'), ("\x1D", "\x1E", "\x1F")),
        'unicode': (('\u001d', '\u001e', '\u001f'), ("\x1D", "\x1E", "\x1F")),
        'hex': (('\x1D', '\x1E', '\x1F'), ("\x1D", "\x1E", "\x1F"))
    }
    marcFullRecordFixed = record
    for i in range(0, 3):
        marcFullRecordFixed = marcFullRecordFixed.replace(
            replaceMethods.get(replaceMethod)[0][i],
            replaceMethods.get(replaceMethod)[1][i])
    if validation:
        try:
            reader = pymarc.MARCReader(marcFullRecordFixed.encode('utf8'),
                                       utf8_handling='replace')
            marcrecord = next(reader)
        except (RecordLengthInvalid, RecordLeaderInvalid, BaseAddressNotFound,
                BaseAddressInvalid, RecordDirectoryInvalid, NoFieldsFound,
                UnicodeDecodeError) as e:
            eprint("record id {0}:".format(record_id) + str(e))
            with open('invalid_records.txt', 'a') as error:
                #file_out.pluserror()
                print(marcFullRecordFixed, file=error)
                return None
    return marcFullRecordFixed
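The eprint helper used throughout these examples is defined elsewhere in the project; a minimal sketch, assuming it simply forwards everything to print on stderr, together with an illustrative fixRecord call on a made-up record string:

import sys

def eprint(*args, **kwargs):
    # assumed helper: write diagnostics to stderr instead of stdout
    print(*args, file=sys.stderr, **kwargs)

# hypothetical usage: turn the escaped decimal entities back into the raw
# MARC21 delimiter bytes (group/record/unit separator), without validation
raw = "#29;#30;#31;"
fixed = fixRecord(record=raw, record_id=42, validation=False)
assert fixed == "\x1D\x1E\x1F"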
def get_gnid(rec):
    if not any("http://www.geonames.org" in s
               for s in rec.get("sameAs")) and rec["geo"].get(
                   "latitude") and rec["geo"].get("longitude"):
        changed = False
        r = requests.get("http://api.geonames.org/findNearbyJSON?lat=" +
                         rec["geo"].get("latitude") + "&lng=" +
                         rec["geo"].get("longitude") + "&username=slublod")
        if r.ok and isiter(r.json().get("geonames")):
            for geoNameRecord in r.json().get("geonames"):
                if rec.get("name") in geoNameRecord.get(
                        "name") or geoNameRecord.get("name") in rec.get(
                            "name"):  #match!
                    rec["sameAs"] = litter(
                        rec.get("sameAs"), "http://www.geonames.org/" +
                        str(geoNameRecord.get("geonameId")) + "/")
                    changed = True
        else:
            if r.json().get("status").get("message").startswith(
                    "the hourly limit") or r.json().get("status").get(
                        "message").startswith("the daily limit"):
                eprint("Limit exceeded!\n")
                exit(0)
        if changed:
            return rec
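litter() and isiter() are small helpers from the surrounding project that are not shown in these excerpts; minimal sketches of the assumed behaviour (litter merges a value into a possibly scalar, possibly missing field without duplicates, isiter checks for a usable list-like value):

def litter(lst, elm):
    # assumed semantics: merge elm into lst without creating duplicates
    if lst is None:
        return elm
    if not isinstance(lst, list):
        lst = [lst]
    new = elm if isinstance(elm, list) else [elm]
    for item in new:
        if item not in lst:
            lst.append(item)
    return lst

def isiter(obj):
    # assumed semantics: True for list-like objects, False for None/str/etc.
    try:
        iter(obj)
        return not isinstance(obj, (str, bytes, dict))
    except TypeError:
        return False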
def main():
    try:
        for record in MARCReader(sys.stdin.buffer.read(), to_unicode=True):
            sys.stdout.write(json.dumps(transpose_to_ldj(record)) + "\n")
            sys.stdout.flush()
    except UnicodeDecodeError as e:
        eprint("unicode decode error: {}".format(e))
        eprint(record)
Example #4
def get_wdid(_ids, rec):
    """
    Takes a list of sameAs links, e.g. ['https://d-nb.info/gnd/118827545', 'http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=035143010', 'http://catalogue.bnf.fr/ark:/12148/cb119027159', 'http://id.loc.gov/rwo/agents/n50002729', 'http://isni.org/isni/0000000120960218', 'http://viaf.org/viaf/44298691']
    """
    if not isinstance(_ids, list):
        return None
    changed = False
    url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
    # Define header according to wikidata's User-Agent policy
    # see: https://meta.wikimedia.org/wiki/User-Agent_policy
    headers = {
        'User-Agent':
        'efre-lod-enrich-wikidata-bot/0.1 '
        '(https://github.com/slub/esmarc) '
        'python-requests/2.22'
    }

    or_mapping = []
    for _id in _ids:
        for key, value in lookup_table_wdProperty.items():
            if _id.startswith(key):
                or_mapping.append("?item wdt:{Property} \"{value}\"".format(
                    Property=value["property"],
                    value=_id.split(value["delim"])[-1]))
                break

    if or_mapping:
        # Build a SPARQL OR query using a UNION operator.
        # Still builds a normal query without UNION when the or_mapping list contains only one element.
        query = '''SELECT DISTINCT ?item \nWHERE {{\n\t{{ {UNION} }}\n}}'''.format(
            UNION="} UNION\n\t\t {".join(or_mapping))
        data = requests.get(url,
                            headers=headers,
                            params={
                                'query': query,
                                'format': 'json'
                            })
        if data.ok and len(data.json().get("results").get("bindings")) > 0:
            for item in data.json().get("results").get("bindings"):
                rec["sameAs"] = litter(
                    rec["sameAs"], {
                        "@id": item.get("item").get("value"),
                        "publisher": {
                            "@id": "https://www.wikidata.org/wiki/Q2013",
                            "abbr": "WIKIDATA",
                            "preferredName": "Wikidata"
                        },
                        "isBasedOn": {
                            "@type": "Dataset",
                            "@id": item.get("item").get("value")
                        }
                    })
                changed = True
        elif not data.ok:
            eprint("wikidata: Connection Error {status}: \'{message}\'".format(
                status=data.status_code, message=data.content))
    if changed:
        return rec
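lookup_table_wdProperty is defined outside this excerpt; judging from how it is accessed, it maps sameAs URI prefixes to the Wikidata property and delimiter needed to extract the local identifier. An illustrative (assumed) subset:

# illustrative assumption of the table's shape, not the project's actual table
lookup_table_wdProperty = {
    "https://d-nb.info/gnd/": {"property": "P227", "delim": "/"},          # GND ID
    "http://viaf.org/viaf/": {"property": "P214", "delim": "/"},           # VIAF ID
    "http://id.loc.gov/rwo/agents/": {"property": "P244", "delim": "/"},   # LoC authority ID
    "http://isni.org/isni/": {"property": "P213", "delim": "/"},           # ISNI
}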
Example #5
def fix_mrc_id(jline):
    if "001" in jline and isinstance(jline["001"], list):
        _id = jline.pop("001")
        for elem in _id:
            jline["001"] = elem
            if elem == "0021114284" or len(elem) > 512:  # this particular FINC-MARC21 record is broken and would break the whole toolchain
                eprint(elem)
                return None
        return jline
def handleIdentifiers(obj):
    if "typeURI" in obj:
        return obj["typeURI"] + "/" + obj.get("_")
    elif obj.get("_") and obj.get("_").startswith("http"):
        return obj.get("_")
    elif not "typeURI" in obj and isinstance(
            obj.get("type"), str) and obj.get("type").lower() == 'gnd':
        return "http://d-nb.info/gnd/" + obj.get("_")
    elif not "typeURI" in obj and not "type" in obj and obj.get(
            "authority").lower() == "gnd":
        return "http://d-nb.info/gnd/" + obj.get("_")
    else:
        eprint(obj)
        return None
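handleIdentifiers expects MODS-style identifier objects whose value sits in the "_" key; a few made-up inputs illustrating which branch each one takes:

# hypothetical examples of how the branches resolve
handleIdentifiers({"typeURI": "http://viaf.org/viaf", "_": "44298691"})
# -> "http://viaf.org/viaf/44298691"
handleIdentifiers({"_": "http://www.wikidata.org/entity/Q2013"})
# -> "http://www.wikidata.org/entity/Q2013"
handleIdentifiers({"type": "gnd", "_": "118827545"})
# -> "http://d-nb.info/gnd/118827545"
handleIdentifiers({"authority": "gnd", "_": "118827545"})
# -> "http://d-nb.info/gnd/118827545"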
def get_gnid_by_es(rec, host, port, index, typ):
    if not any("http://www.geonames.org" in s
               for s in rec.get("sameAs")) and rec["geo"].get(
                   "latitude") and rec["geo"].get("longitude"):
        changed = False
        records = []
        searchbody = {
            "query": {
                "bool": {
                    "filter": {
                        "geo_distance": {
                            "distance": "0.1km",
                            "location": {
                                "lat": float(rec["geo"].get("latitude")),
                                "lon": float(rec["geo"].get("longitude"))
                            }
                        }
                    }
                }
            }
        }
        try:
            for record in esgenerator(headless=True,
                                      host=host,
                                      port=port,
                                      index=index,
                                      type=typ,
                                      body=searchbody):
                records.append(record)
        except elasticsearch.exceptions.RequestError as e:
            eprint(e, json.dumps(searchbody, indent=4),
                   json.dumps(rec, indent=4))
            return

        if records:
            for record in records:
                if record.get("name") in rec.get("name") or rec.get(
                        "name") in record.get("name") or len(
                            records) == 1 or rec.get("name") in record.get(
                                "alternateName"):
                    #eprint(rec.get("name"),record.get("name"),record.get("id"),record.get("location"))
                    rec["sameAs"] = litter(
                        rec.get("sameAs"), "http://www.geonames.org/" +
                        str(record.get("id")) + "/")
                    changed = True
        if changed:
            return rec
        else:
            return None
Example #8
def run(fd):
    try:
        for record in MARCReader(fd):
            try:
                yield transpose_to_ldj(record)
            except AttributeError as e:
                eprint("attribut error: {}".format(e))
                eprint(record)
                continue
    except UnicodeDecodeError as e:
        eprint("unicode decode error: {}".format(e))
        eprint(record)
def main():
    try:
        for record in MARCReader(sys.stdin.buffer.read(), to_unicode=True):
            sys.stdout.write(
                json.dumps(transpose_to_ldj(record), sort_keys=True) + "\n")
            sys.stdout.flush()
    except UnicodeDecodeError as e:
        eprint("unicode decode error: {}".format(e))
        eprint(record)
    except pymarc.exceptions.RecordLengthInvalid as e:
        eprint("Invalid Record Length error: {}".format(e))
        eprint(record)
Example #10
def get_gnid(rec):
    """
    Use the GeoNames API (slow, and free accounts have a quota limit)
    """
    if not any("http://www.geonames.org" in s
               for s in rec.get("sameAs")) and rec["geo"].get(
                   "latitude") and rec["geo"].get("longitude"):
        changed = False
        r = requests.get("http://api.geonames.org/findNearbyJSON?lat=" +
                         rec["geo"].get("latitude") + "&lng=" +
                         rec["geo"].get("longitude") + "&username=slublod")
        if r.ok and isiter(r.json().get("geonames")):
            for geoNameRecord in r.json().get("geonames"):
                if rec.get("name") in geoNameRecord.get(
                        "name") or geoNameRecord.get("name") in rec.get(
                            "name"):  # match!
                    newSameAs = {
                        '@id':
                        "https://sws.geonames.org/" +
                        str(geoNameRecord.get("geonameId")) + "/",
                        'publisher': {
                            'abbr': "geonames",
                            'preferredName': "GeoNames",
                            "isBasedOn": {
                                "@type":
                                "Dataset",
                                "@id":
                                "https://sws.geonames.org/" +
                                str(record.get("id")) + "/"
                            }
                        }
                    }
                    rec["sameAs"] = litter(rec.get("sameAs"), newSameAs)
                    changed = True
        else:
            if r.json().get("status").get("message").startswith(
                    "the hourly limit") or r.json().get("status").get(
                        "message").startswith("the daily limit"):
                eprint("Limit exceeded!\n")
                exit(0)
        if changed:
            return rec
def handleevent(schemaorg_attr, sourceRecord):
    ret = []
    schemaorg_name_mapping = {
        "startDate": "start",
        "endDate": "end",
        "location": "locaton",
        "name": "name",
        "alternateName": "acronym",
        "sponsor": "sponsor",
        "position": "number",
        "affiliation": "affiliation"
    }
    if "event" in sourceRecord:
        eprint(sourceRecord["event"])
        obj = {}
        for target, source in schemaorg_name_mapping.items():
            if source in sourceRecord["event"]:
                obj[target] = sourceRecord["event"][source]
        if obj:
            ret.append(obj)
    return ret if ret else None
def handlefile(attribut, record):
    retobj = []
    path = "metadata>mets:mets>mets:fileSec>mets:fileGrp"
    objects = getNestedJsonObject(record, path)
    if objects:
        for elem in objects:
            if elem.get("USE") == "DELETED":
                continue
            try:
                if elem.get("USE") == "DOWNLOAD" and elem.get(
                        "mets:file") and isinstance(elem["mets:file"], dict):
                    bnode = handlemetsfile(elem)
                    if bnode:
                        retobj.append(bnode)
                elif elem.get("mets:file") and isinstance(
                        elem["mets:file"], list):
                    for fd in elem["mets:file"]:
                        #eprint(fd)
                        if fd.get("USE") == "DOWNLOAD":
                            eprint(fd)
            except AttributeError:
                eprint(elem)
                exit(-1)
    return (attribut, retobj) if retobj else (None, None)
def get_context(con_dict, con_url):
    if con_url not in con_dict:
        if con_url in listcontexts:
            r = requests.get(listcontexts[con_url])
            if r.ok:
                con_dict[con_url] = r.json()
                eprint("got context from " + listcontexts[con_url])
            else:
                eprint("Error, could not get context from " + con_url)
                exit(-1)
        else:
            r = requests.get(con_url)
            if r.ok:
                con_dict[con_url] = r.json()
                eprint("got context from " + con_url)
                return
            eprint("Error, context unknown :( " + str(con_url), doc)
            exit(-1)
Example #14
def get_gnid_by_es(rec, host, port, index, typ):
    """
    Use a local GeoNames dump indexed in Elasticsearch
    """
    if not any("http://www.geonames.org" in s
               for s in rec.get("sameAs")) and rec["geo"].get(
                   "latitude") and rec["geo"].get("longitude"):
        changed = False
        records = []
        searchbody = {
            "query": {
                "bool": {
                    "filter": {
                        "geo_distance": {
                            "distance": "0.1km",
                            "location": {
                                "lat": float(rec["geo"].get("latitude")),
                                "lon": float(rec["geo"].get("longitude"))
                            }
                        }
                    }
                }
            }
        }
        try:
            for record in esgenerator(headless=True,
                                      host=host,
                                      port=port,
                                      index=index,
                                      type=typ,
                                      body=searchbody):
                if record.get("name") in rec.get("preferredName") or rec.get(
                        "preferredName"
                ) in record.get("name") or len(records) == 1 or rec.get(
                        "preferredName") in record.get("alternateName"):
                    newSameAs = {
                        '@id':
                        "https://sws.geonames.org/" + str(record.get("id")) +
                        "/",
                        'publisher': {
                            'abbr': "geonames",
                            'preferredName': "GeoNames",
                            "isBasedOn": {
                                "@type":
                                "Dataset",
                                "@id":
                                "https://sws.geonames.org/" +
                                str(record.get("id")) + "/"
                            }
                        }
                    }
                    rec["sameAs"] = litter(rec.get("sameAs"), newSameAs)
                    changed = True
        except elasticsearch.exceptions.RequestError as e:
            eprint(e, json.dumps(searchbody, indent=4),
                   json.dumps(rec, indent=4))
            return

        if changed:
            return rec
        else:
            return None
Example #15
def get_wpinfo(record):
    """
    * iterates through all sameAs Links to extract a wikidata-ID
    * requests wikipedia sites connected to the wd-Id
    * enriches wikipedia sites if they are within lookup_table_wpSites
      (i.e. currently german, english, polish, czech)
    * if we get an new wikipedia link from wikidata, but we
      already got an old entry from other as obsolete defined sources,
      we delete the obsolete entry and append the new entry
    * enriches multilingual names if they are within lookup_table_wpSites

    :returns None (if record has not been changed)
             enriched record (dict, if record has changed)
    :rtype dict
    """
    wd_uri = None
    wd_id = None

    for _id in [x["@id"] for x in record["sameAs"]]:
        if "wikidata" in _id:
            wd_uri = _id
            wd_id = wd_uri.split("/")[-1]
            break
    if not wd_id:
        return None

    headers = {
        'User-Agent':
        'efre-lod-enrich-wikipedia-bot/0.1 '
        '(https://github.com/slub/esmarc) '
        'python-requests/2.22'
    }
    site_filter_param = '|'.join(lookup_table_wpSites)
    wd_response = requests.get("https://www.wikidata.org/w/api.php",
                               headers=headers,
                               params={
                                   'action': 'wbgetentities',
                                   'ids': wd_id,
                                   'props': 'sitelinks/urls',
                                   'format': 'json',
                                   'sitefilter': site_filter_param
                               })

    if not wd_response.ok:
        eprint("wikipedia: Connection Error {status}: \'{message}\'".format(
            status=wd_response.status_code, message=wd_response.content))
        return None

    # related wikipedia links:
    try:
        sites = wd_response.json()["entities"][wd_id]["sitelinks"]
    except KeyError:
        eprint("wikipedia: Data Error for Record:\n"
               "\'{record}\'\n\'{wp_record}\'".format(
                   record=record, wp_record=wd_response.content))
        return None

    # list of all abbreviations for publisher in record's sameAs
    abbrevs = build_abbrevs(record["sameAs"])
    changed = False
    for wpAbbr, info in sites.items():
        if wpAbbr in lookup_table_wpSites:
            wikip_url = info["url"]
            newSameAs = {
                "@id": wikip_url,
                "publisher": lookup_table_wpSites[wpAbbr],
                "isBasedOn": {
                    "@type": "Dataset",
                    "@id": wd_uri
                }
            }
            # wikipedia sameAs link enrichment
            if wpAbbr not in abbrevs:
                record["sameAs"].append(newSameAs)
                changed = True

            # we already have a wikipedia link for that language, but the
            # originating data source is obsolete, so we update it
            elif abbrevs.get(
                    wpAbbr) and abbrevs[wpAbbr]["host"] in obsolete_isBasedOns:
                record["sameAs"][abbrevs[wpAbbr]["pos"]] = newSameAs
                changed = True

            # multilingual name object enrichment
            if not record.get("name"):
                record["name"] = {}
            cc = wpAbbr[:2]  # countrycode
            if cc not in record["name"]:
                record["name"][cc] = [info["title"]]
                changed = True
            if info["title"] not in record["name"][cc]:
                record["name"][cc] = litter(record["name"][cc], info["title"])
                changed = True
    if changed:
        return record
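lookup_table_wpSites and build_abbrevs are defined elsewhere in the project; from the way they are used above, the site table maps Wikipedia site codes to publisher objects, and build_abbrevs records, per site code, the host of the dataset an existing link is based on plus its position in sameAs. A rough sketch under those assumptions:

import urllib.parse

# illustrative assumption of the site table (site code -> publisher object)
lookup_table_wpSites = {
    "dewiki": {"abbr": "dewiki", "preferredName": "Wikipedia (Deutsch)"},
    "enwiki": {"abbr": "enwiki", "preferredName": "Wikipedia (English)"},
    "plwiki": {"abbr": "plwiki", "preferredName": "Wikipedia (polski)"},
    "cswiki": {"abbr": "cswiki", "preferredName": "Wikipedia (čeština)"},
}

def build_abbrevs(same_as):
    # assumed helper: for every Wikipedia link already in sameAs, remember the
    # host of its isBasedOn dataset and its position in the list
    abbrevs = {}
    for pos, entry in enumerate(same_as):
        host = urllib.parse.urlparse(entry.get("@id", "")).netloc
        if host.endswith("wikipedia.org"):
            site_code = host.split(".")[0] + "wiki"  # "de.wikipedia.org" -> "dewiki"
            based_on = entry.get("isBasedOn", {}).get("@id", "")
            abbrevs[site_code] = {
                "host": urllib.parse.urlparse(based_on).netloc,
                "pos": pos,
            }
    return abbrevs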
Example #16
def get_wpcategories(record):
    """
    * iterates through all sameAs Links to extract the
      link(s) to the wiki-site
    * requests wikpedia categories linked to those links
    :returns None (if record has not been changed)
             enriched record (dict, if record has changed)
    :rtype dict
    """
    wp_uri = None
    wp_title = None
    cc = None  # countrycode
    changed = False
    retobj = {}
    for _id in [x["@id"] for x in record["sameAs"]]:
        if "wikipedia" in _id:
            wp_uri = _id
            wp_title = urllib.parse.unquote(wp_uri.split("/")[-1])
            cc = wp_uri.split("/")[2].split(".")[0]

            headers = {
                'User-Agent':
                'lod-enrich-wikipedia-categories-bot/0.1 '
                '(https://github.com/slub/esmarc) '
                'python-requests/2.22'
            }
            url = "https://{}.wikipedia.org/w/api.php".format(cc)
            wd_response = requests.get(url,
                                       headers=headers,
                                       params={
                                           'action': 'query',
                                           'generator': 'categories',
                                           'titles': wp_title,
                                           'gcllimit': 500,
                                           'prop': 'info',
                                           'format': 'json'
                                       })
            if not wd_response.ok:
                eprint("wikipedia-categories: Connection Error "
                       "{status}: \'{message}\'".format(
                           status=wd_response.status_code,
                           message=wd_response.content))
                return None
            # related wikipedia links:
            _base = "https://{}.wikipedia.org/wiki/".format(cc)
            try:
                pages = wd_response.json()["query"]["pages"]
                for page_id, page_data in pages.items():
                    _sameAs = _base + page_data["title"].replace(' ', '_')
                    _id = _base + "?curid={}".format(page_id)
                    # cutting off the substring 'Category:' or 'Kategorie:' from
                    # the beginning of the title for the name field
                    _name = ":".join(page_data["title"].split(":")[1:])
                    obj = {"@id": _id, "sameAs": _sameAs, "name": _name}
                    retobj[cc] = litter(retobj.get(cc), obj)
                    changed = True
            except KeyError:
                eprint("wikipedia-categories: Data Error for Record:\n"
                       "{record}\'\n\'{wp_record}\'".format(
                           record=record, wp_record=wd_response.content))
                return None
    if changed:
        record["category"] = retobj
        return record
    return None
def main():
    #argstuff
    parser = ArgumentParser(
        description=
        'Merging of local and title marc records in MarcXchange Json format on ElasticSearch'
    )
    parser.add_argument(
        '-title_host',
        type=str,
        help=
        'hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.'
    )
    parser.add_argument(
        '-title_port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-title_type',
                        type=str,
                        help='ElasticSearch Type to use')
    parser.add_argument('-title_index',
                        type=str,
                        help='ElasticSearch Index to use')
    parser.add_argument(
        '-title_server',
        type=str,
        help=
        "use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty"
    )
    parser.add_argument(
        '-local_host',
        type=str,
        help=
        'hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.'
    )
    parser.add_argument(
        '-local_port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-local_type',
                        type=str,
                        help='ElasticSearch Type to use')
    parser.add_argument('-local_index',
                        type=str,
                        help='ElasticSearch Index to use')
    parser.add_argument(
        '-local_server',
        type=str,
        help=
        "use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty"
    )
    parser.add_argument(
        '-selectbody',
        type=loads,
        default={"query": {
            "match": {
                "852.__.a.keyword": "DE-14"
            }
        }})
    parser.add_argument('-help', action="store_true", help="print this help")
    args = parser.parse_args()
    if args.help:
        parser.print_help(stderr)
        exit()
    if args.title_server:
        slashsplit = args.title_server.split("/")
        args.title_host = slashsplit[2].rsplit(":")[0]
        if isint(args.title_server.split(":")[2].rsplit("/")[0]):
            args.title_port = args.title_server.split(":")[2].split("/")[0]
        args.title_index = args.title_server.split("/")[3]
        if len(slashsplit) > 4:
            args.local_type = slashsplit[4]
    if args.local_server:
        slashsplit = args.local_server.split("/")
        args.local_host = slashsplit[2].rsplit(":")[0]
        if isint(args.local_server.split(":")[2].rsplit("/")[0]):
            args.local_port = args.local_server.split(":")[2].split("/")[0]
        args.local_index = args.local_server.split("/")[3]
        if len(slashsplit) > 4:
            args.local_type = slashsplit[4]

    if args.title_server or (args.title_host and args.title_port):
        td = Elasticsearch([{"host": args.title_host}], port=args.title_port)
    else:
        eprint("no server for title data submitted. exiting.")
        exit(-1)
    if args.local_server or (args.local_host and args.local_port):
        for records in esfatgenerator(host=args.local_host,
                                      port=args.local_port,
                                      index=args.local_index,
                                      type=args.local_type,
                                      body=args.selectbody,
                                      source="852,004,938"):
            ids = dict()
            for record in records:
                ids[record["_source"]["004"][0]] = {
                    "852": record["_source"]["852"],
                    "938": record["_source"]["852"]
                }
            try:
                titlerecords = td.mget(index=args.title_index,
                                       doc_type=args.title_type,
                                       body={"ids": [_id for _id in ids]})
            except NotFoundError:
                continue
            except RequestError:
                continue
            for record in titlerecords["docs"]:
                if "_source" in record:
                    for field in ["852", "938"]:
                        record["_source"][field] = ids[record["_id"]][field]
                    print(dumps(record["_source"]))
                else:
                    eprint(dumps(record))

    else:
        eprint("no server for local data submitted. exiting.")
        exit(-1)
                        '@id': sameAs,
                        'publisher': {
                            'abbr': value["abbr"],
                            'preferredName': value["preferredName"]
                        },
                        "isBasedOn": {
                            "@type": "Dataset"
                        }
                    }
                    if "slubID" in value:
                        obj["publisher"]["@id"] = value["slubID"]
                    if "sourceRecord" in value["isBasedOn"]:
                        obj["isBasedOn"]["@id"] = rec["isBasedOn"]
                    elif "entityFacts" in value["isBasedOn"]:
                        for sameAs in sameAsses:
                            if "d-nb.info" in sameAs:
                                obj["isBasedOn"][
                                    "@id"] = "http://hub.culturegraph.org/entityfacts/{}".format(
                                        sameAs.split("/")[-1])
                                break
                    elif "sameAs" in value["isBasedOn"]:
                        obj["isBasedOn"]["@id"] = obj["@id"]
            if isinstance(obj, dict):
                rec["sameAs"].append(obj)
            elif obj:
                eprint(obj)
            else:
                eprint(sameAs)
    rec["preferredName"] = rec.pop("name")
    print(json.dumps(rec))
Example #19
            jline["001"]=elem
            if elem=="0021114284" or len(elem)>512:  # this particulary FINC-MARC21 Record is broken and will break the whole toolchain
                eprint(elem)
                return None
        return jline

def valid_mrc_fields(jline):
    if jline:
        for key in jline:
            if isint(key) and len(str(int(key))) > 1:
                for elem in jline[key]:
                    if isinstance(elem, str):
                        return None
    return jline
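isint, used above to check whether a field tag is numeric, is another shared helper that is not part of this excerpt; a minimal sketch of the assumed behaviour:

def isint(value):
    # assumed helper: True if value can be parsed as an integer
    try:
        int(value)
        return True
    except (TypeError, ValueError):
        return False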



if __name__ == "__main__":
    for line in sys.stdin:
        try:
            jline=json.loads(line)
        except:
            eprint("corrupt json: "+str(line))
            continue
        jline=fix_mrc_id(jline)
        jline=valid_mrc_fields(jline)
        if jline:
            sys.stdout.write(json.dumps(jline)+"\n")
            sys.stdout.flush()

Example #20
def main():
    #argstuff
    parser=argparse.ArgumentParser(description='Entitysplitting/Recognition of MARC-Records')
    parser.add_argument('-host',type=str,help='hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.')
    parser.add_argument('-port',type=int,default=9200,help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-type',type=str,help='ElasticSearch Type to use')
    parser.add_argument('-index',type=str,help='ElasticSearch Index to use')
    parser.add_argument('-id',type=str,help='map single document, given by id')
    parser.add_argument('-help',action="store_true",help="print this help")
    parser.add_argument('-z',action="store_true",help="use gzip compression on output data")
    parser.add_argument('-prefix',type=str,default="ldj/",help='Prefix to use for output data')
    parser.add_argument('-debug',action="store_true",help='Dump processed Records to stdout (mostly used for debug-purposes)')
    parser.add_argument('-server',type=str,help="use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty")
    parser.add_argument('-pretty',action="store_true",default=False,help="output tabbed json")
    parser.add_argument('-w',type=int,default=8,help="how many processes to use")
    parser.add_argument('-idfile',type=str,help="path to a file with IDs to process")
    parser.add_argument('-query',type=str,default={},help='prefilter the data based on an elasticsearch-query')
    parser.add_argument('-base_id_src',type=str,default="http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=",help="set up which base_id to use for sameAs. e.g. http://d-nb.info/gnd/xxx")
    parser.add_argument('-target_id',type=str,default="http://data.slub-dresden.de/",help="set up which target_id to use for @id. e.g. http://data.finc.info")
#    parser.add_argument('-lookup_host',type=str,help="Target or Lookup Elasticsearch-host, where the result data is going to be ingested to. Only used to lookup IDs (PPN) e.g. http://192.168.0.4:9200")
    args=parser.parse_args()
    if args.help:
        parser.print_help(sys.stderr)
        exit()        
    if args.server:
        slashsplit=args.server.split("/")
        args.host=slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port=args.server.split(":")[2].split("/")[0]
        args.index=args.server.split("/")[3]
        if len(slashsplit)>4:
            args.type=slashsplit[4]
        if len(slashsplit)>5:
            if "?pretty" in args.server:
                args.pretty=True
                args.id=slashsplit[5].rsplit("?")[0]
            else:
                args.id=slashsplit[5]
    if args.server or ( args.host and args.port ):
        es=elasticsearch.Elasticsearch([{"host":args.host}],port=args.port)
    global base_id
    global target_id
    base_id=args.base_id_src
    target_id=args.target_id
    if args.pretty:
        tabbing=4
    else:
        tabbing=None
        
    if args.host and args.index and args.type and args.id:
        json_record=None
        source=get_source_include_str()
        json_record=es.get_source(index=args.index,doc_type=args.type,id=args.id,_source=source)
        if json_record:
            print(json.dumps(process_line(json_record,args.host,args.port,args.index,args.type),indent=tabbing))
    elif args.host and args.index and args.type and args.idfile:
        setupoutput(args.prefix)
        pool = Pool(args.w,initializer=init_mp,initargs=(args.host,args.port,args.prefix,args.z))
        for ldj in esidfilegenerator(host=args.host,
                       port=args.port,
                       index=args.index,
                       type=args.type,
                       source=get_source_include_str(),
                       body=args.query,
                       idfile=args.idfile
                        ):
            pool.apply_async(worker,args=(ldj,))
        pool.close()
        pool.join()
    elif args.host and args.index and args.type and args.debug:
        init_mp(args.host,args.port,args.prefix,args.z)
        for ldj in esgenerator(host=args.host,
                       port=args.port,
                       index=args.index,
                       type=args.type,
                       source=get_source_include_str(),
                       headless=True,
                       body=args.query
                        ): 
            record = process_line(ldj,args.host,args.port,args.index,args.type)
            if record:
                for k in record:
                    print(json.dumps(record[k],indent=None))
    elif args.host and args.index and args.type:  # if no input file is set, then try elasticsearch
        setupoutput(args.prefix)
        pool = Pool(args.w,initializer=init_mp,initargs=(args.host,args.port,args.prefix,args.z))
        for ldj in esfatgenerator(host=args.host,
                       port=args.port,
                       index=args.index,
                       type=args.type,
                       source=get_source_include_str(),
                       body=args.query
                        ):
            pool.apply_async(worker,args=(ldj,))
        pool.close()
        pool.join()
    else: #oh noes, no elasticsearch input-setup. then we'll use stdin
        eprint("No host/port/index specified, trying stdin\n")
        init_mp("localhost","DEBUG","DEBUG","DEBUG")
        with io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') as input_stream:
            for line in input_stream:
                ret=process_line(json.loads(line),"localhost",9200,"data","mrc")
                if isinstance(ret,dict):
                    for k,v in ret.items():
                        print(json.dumps(v,indent=tabbing))
Example #21
def entityfacts(record, ef_instances):
    """ Function to harvest gnd entityfacts
    Look for connections to other entity providers in GND's
    entityfacts "sameAs" field


    :param record: json record probably containing GND entries
                   in their "sameAs" list field
    :type  record: json object

    :param ef_instances: entityfacts-URLs instances to query
    :type  ef_instances: list of strings

    :returns: the enriched record, or None if nothing was added
    :rtype:   dict or None
    """
    # abbreviations used by GND entityfacts and their
    # analog in the SLUB LOD context
    abbreviations = {
        "DNB": "https://data.slub-dresden.de/organizations/514366265",
        "VIAF": "https://data.slub-dresden.de/organizations/100092306",
        "LC": "https://data.slub-dresden.de/organizations/100822142",
        "DDB": "https://data.slub-dresden.de/organizations/824631854",
        "WIKIDATA": "https://www.wikidata.org/wiki/Q2013",
        "BNF": "https://data.slub-dresden.de/organizations/188898441",
        "KXP": "https://data.slub-dresden.de/organizations/103302212",
        "dewiki": None,
        "enwiki": None,
        "DE-611": "https://data.slub-dresden.de/organizations/103675612",
        "geonames": None,
        "ISNI": None,
        "filmportal.de": None,
        "ORCID": None,
        "Portraitindex": None,
        "ARCHIV-D": None,
        "DE-M512": None,
        "ADB": None,
        "NDB": None,
        "OEBL": "https://data.slub-dresden.de/organizations/102972389",
        "CH_HLS": None,
        "LAGIS": "https://data.slub-dresden.de/organizations/100482600",
        "WIKISOURCE": None,
        "DE-28": "https://data.slub-dresden.de/organizations/100874770",
        "OSTDEBIB": None,
        "PACELLI": None,
        "FFMPL": "https://data.slub-dresden.de/organizations/236770764",
        "epidat": "https://data.slub-dresden.de/organizations/103039031",
        "BIOKLASOZ": "https://data.slub-dresden.de/organizations/100832873",
        "HISTORICUMNET": "https://data.slub-dresden.de/organizations/102398704"
    }

    if not isinstance(record.get("sameAs"), list):
        return None

    gnd_id = None
    for item in record.get("sameAs"):
        if "d-nb.info" in item["@id"] and len(item["@id"].split("/")) > 4:
            gnd_id = item["@id"].split("/")[-1]

    if not gnd_id:
        # no GND-ID - nothing to enrich
        return None

    old_rec_sameAs_len = len(str(record["sameAs"]))
    for url in ef_instances:
        r = requests.get(url + str(gnd_id))
        if r.ok:
            data = r.json()
        else:
            # ID not found in the respective source
            # just continue
            continue

        sameAsses = []  # ba-dum-ts

        if data.get("_source"):
            # in Elasticsearch: data are in the "_source" field
            ef_sameAs = data.get("_source").get("sameAs")
        else:
            ef_sameAs = data.get("sameAs")

        if not ef_sameAs or not isinstance(ef_sameAs, list):
            continue

        for sameAs in ef_sameAs:
            id_ = sameAs.get("@id")

            # we can skip DNB-link as we already have it (and
            # used it to come here)
            if not id_ or id_.startswith("https://d-nb.info"):
                continue

            obj = {
                '@id': id_,
                'publisher': {
                    'abbr': sameAs["collection"]["abbr"],
                    'preferredName': sameAs["collection"]["name"]
                },
                'isBasedOn': {
                    '@type':
                    "Dataset",
                    '@id':
                    "http://hub.culturegraph.org/entityfacts/{}".format(gnd_id)
                }
            }
            # replace id with SLUB LOD id's listed in abbreviations
            if obj["publisher"]["abbr"] in abbreviations:
                slub_id = abbreviations[obj["publisher"]["abbr"]]
                if slub_id:
                    obj["publisher"]["@id"] = slub_id
            else:
                # unknown identifier, report into error log
                eprint("entityfacts: Abbr. {} not known [GND-ID: {}]".format(
                    sameAs["collection"]["abbr"], gnd_id))
            sameAsses.append(obj)

        if sameAsses:
            record["sameAs"] = litter(record.get("sameAs"), sameAsses)
        break

    # compare length of transformed record, if the new entry is larger
    # than the old one, it was updated
    new_rec_sameAs_len = len(str(record["sameAs"]))
    if new_rec_sameAs_len > old_rec_sameAs_len:
        return record
    elif new_rec_sameAs_len < old_rec_sameAs_len:
        eprint("entityfacts: new record shorter than old one… "
               "[GND-ID: {}]".format(gnd_id))
        return None
    else:
        return None
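A hypothetical invocation; the Culturegraph endpoint is the same one that entityfacts itself writes into the isBasedOn links above, and the record is made up:

import json

sample_rec = {
    "sameAs": [{
        "@id": "https://d-nb.info/gnd/118827545",
        "publisher": {"abbr": "DNB"}
    }]
}
enriched = entityfacts(sample_rec, ["http://hub.culturegraph.org/entityfacts/"])
if enriched:
    print(json.dumps(enriched, indent=2))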