Example #1
0
def get_gndbyISSN(record, search_host, search_port, issn):
    """
    Enrich ``record`` with every "dnb-titel" hit that matches one of the ISSNs.

    :param record: record to enrich; handed to ``enrichrecord`` for each hit
    :param search_host: host of the Elasticsearch node to query
    :param search_port: port of the Elasticsearch node to query
    :param issn: iterable of ISSNs; each one is searched separately

    :returns: the record as returned by the last ``enrichrecord`` call if at
              least one hit was found, otherwise None
    """
    # NOTE(review): the field name contains "[email protected]", which looks like
    # e-mail obfuscation residue rather than a real field path -- confirm
    # against the index mapping.
    issn_field = "http://purl.org/ontology/bibo/[email protected]"
    found_hit = False
    for single_issn in issn:
        hits = esgenerator(
            host=search_host,
            port=search_port,
            index="dnb-titel",
            type="resources",
            body={"query": {"match": {issn_field: str(single_issn)}}},
            source=["http://id.loc.gov/vocabulary/relators/isb"],
            source_exclude=None,
            source_include=None,
            headless=False,
            timeout=60,
        )
        for hit in hits:
            record = enrichrecord(record, hit, search_host, search_port)
            found_hit = True
    return record if found_hit else None
def get_gndbyISBN(record, search_host, search_port, isbn10, isbn13):
    """
    Enrich ``record`` with "dnb-titel" hits found via its ISBNs.

    The search is tried from most to least specific: first a query that must
    match both ISBNs (only when both are given); if that produced no hit, a
    query for the ISBN-13 alone, or -- only when no ISBN-13 is given -- for
    the ISBN-10 alone.  The document id is read from the module-level
    ``args.id``.

    :param record: record to enrich; handed to ``enrichrecord`` for each hit
    :param search_host: host of the Elasticsearch node to query
    :param search_port: port of the Elasticsearch node to query
    :param isbn10: ISBN-10 of the record, or a falsy value
    :param isbn13: ISBN-13 of the record, or a falsy value

    :returns: the record as returned by the last ``enrichrecord`` call if at
              least one hit was found, otherwise None
    """
    # NOTE(review): the same field name is used for ISBN-10 and ISBN-13 and it
    # contains "[email protected]", which looks like e-mail obfuscation residue --
    # confirm the real field paths against the index mapping.
    isbn_field = "http://purl.org/ontology/bibo/[email protected]"

    def match(isbn):
        """Build the match clause for a single ISBN."""
        return {"match": {isbn_field: str(isbn)}}

    def hits_for(query):
        """Run one search against the dnb-titel index."""
        return esgenerator(
            host=search_host,
            port=search_port,
            index="dnb-titel",
            id=args.id,
            type="resources",
            body={"query": query},
            source=[
                "http://purl.org/dc/terms/subject",
                "http://www.w3.org/2002/07/owl#sameAs",
            ],
            source_exclude=None,
            source_include=None,
            headless=False,
            timeout=60,
        )

    found_hit = False
    if isbn13 and isbn10:
        both = {"bool": {"must": [match(isbn13), match(isbn10)]}}
        for hit in hits_for(both):
            record = enrichrecord(record, hit)
            found_hit = True

    if not found_hit:
        # NOTE(review): when an ISBN-13 is given, the ISBN-10-only search is
        # never attempted, even if the ISBN-13 search finds nothing -- confirm
        # this is intended.
        single_isbn = isbn13 or isbn10
        if single_isbn:
            for hit in hits_for(match(single_isbn)):
                record = enrichrecord(record, hit)
                found_hit = True

    return record if found_hit else None
def _name_contains(container, needle):
    """Return ``needle in container``; False if either is missing or the types do not fit."""
    if container is None or needle is None:
        return False
    try:
        return needle in container
    except TypeError:
        return False


def get_gnid_by_es(rec, host, port, index, typ):
    """
    Add GeoNames links to ``rec`` by searching an index for nearby entries.

    A record is only processed if none of its ``sameAs`` entries already
    contains "http://www.geonames.org" and it has both ``geo.latitude`` and
    ``geo.longitude``.  All documents within 0.1km of those coordinates are
    fetched; a document is accepted when its ``name`` contains or is contained
    in the record's ``name``, when it is the only hit, or when the record's
    ``name`` appears in the document's ``alternateName``.  For each accepted
    document "http://www.geonames.org/<id>/" is merged into ``rec["sameAs"]``
    through ``litter``.

    Missing ``sameAs``, ``geo``, ``name`` or ``alternateName`` values are
    treated as "no match" instead of raising, and a ``sameAs`` given as a
    single string is checked as one entry.

    :param rec: record to enrich; modified in place when a link is added
    :param host: host of the Elasticsearch node holding the search index
    :param port: port of that node
    :param index: name of the search index (presumably a GeoNames dump)
    :param typ: document type inside that index

    :returns: ``rec`` if at least one link was added, otherwise None (also
              when the record is skipped or the search request is rejected)
    """
    same_as = rec.get("sameAs") or []
    if isinstance(same_as, str):
        # iterating a bare string would test single characters
        same_as = [same_as]
    if any("http://www.geonames.org" in s for s in same_as):
        return None

    geo = rec.get("geo")
    if not isinstance(geo, dict):
        return None
    latitude = geo.get("latitude")
    longitude = geo.get("longitude")
    if not (latitude and longitude):
        return None

    searchbody = {
        "query": {
            "bool": {
                "filter": {
                    "geo_distance": {
                        "distance": "0.1km",
                        "location": {
                            "lat": float(latitude),
                            "lon": float(longitude)
                        }
                    }
                }
            }
        }
    }
    try:
        # collect everything first: the "only hit" rule needs the total count
        candidates = list(
            esgenerator(headless=True,
                        host=host,
                        port=port,
                        index=index,
                        type=typ,
                        body=searchbody))
    except elasticsearch.exceptions.RequestError as e:
        eprint(e, json.dumps(searchbody, indent=4),
               json.dumps(rec, indent=4))
        return None

    changed = False
    own_name = rec.get("name")
    for candidate in candidates:
        candidate_name = candidate.get("name")
        if (_name_contains(own_name, candidate_name)
                or _name_contains(candidate_name, own_name)
                or len(candidates) == 1
                or _name_contains(candidate.get("alternateName"), own_name)):
            rec["sameAs"] = litter(
                rec.get("sameAs"),
                "http://www.geonames.org/" + str(candidate.get("id")) + "/")
            changed = True
    return rec if changed else None
    def run(self):
        """
        Bulk-load the generated data files and export one subset of resources.

        If ``<date>.mrc.bz2`` is not empty, every file below
        ``<date>-data/<index>/`` is loaded with ``esbulk`` into the index named
        after its directory.  Afterwards ``self.now`` is stored through
        ``put_dict`` under ``{host}/date/actual/4`` and all documents of the
        "resources" index with branch code "DE-14" and ``_sourceID`` "0" are
        written as line-delimited JSON to ``slub_resources_sourceid0.ldj``
        (gzip-compressed).
        """
        source_dump = "{date}.mrc.bz2".format(date=self.date)
        if os.stat(source_dump).st_size > 0:
            path = "{date}-data".format(date=self.date)
            for index in os.listdir(path):
                index_dir = path + "/" + index
                for f in os.listdir(index_dir):
                    cmd = "esbulk -z -verbose -server {host} -w {workers} -index {index} -type schemaorg -id identifier {fd}".format(
                        **self.config,
                        index=index,
                        fd=index_dir + "/" + f)
                    shellout(cmd)

        put_dict("{host}/date/actual/4".format(**self.config),
                 {"date": str(self.now)})

        export_query = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"offers.offeredBy.branchCode.keyword": "DE-14"}},
                        {"match": {"_sourceID.keyword": "0"}},
                    ]
                }
            }
        }
        with gzip.open("slub_resources_sourceid0.ldj", "wt") as outp:
            # the configured host is a URL ("scheme://host:port..."); take the
            # part after "//" and split it into host and port
            host_and_port = "{host}".format(
                **self.config).rsplit("/")[2].rsplit(":")
            matching = esgenerator(host=host_and_port[0],
                                   port=host_and_port[1],
                                   index="resources",
                                   type="schemaorg",
                                   body=export_query,
                                   headless=True)
            for record in matching:
                outp.write(json.dumps(record) + "\n")
Example #5
0
def run():
    """
    Command-line entry point: enrich records through ``get_wpinfo``.

    Arguments come from the module-level parser ``_p``.  With ``args.stdin``
    set, records are read from stdin (one JSON document per line); otherwise
    all documents whose ``sameAs.publisher.abbr.keyword`` is "WIKIDATA" are
    fetched from the Elasticsearch location given by ``args.server``
    (``http://host:port/index/type[/id]``).

    Each record that ``get_wpinfo`` enriches is printed as one JSON line; with
    ``args.pipeline`` set, records that were not enriched are printed
    unchanged.

    Exits with an error message when records should come from Elasticsearch
    but no usable server URL was given (previously this ended in an
    ``UnboundLocalError`` or ``IndexError``).
    """
    args = _p.parse_args()

    host = port = index = doc_type = doc_id = None
    if args.server:
        srv = urllib.parse.urlparse(args.server)
        host = srv.hostname
        port = srv.port
        # srv.path is "/index/type[/id]"; pad so that missing segments become
        # None instead of raising IndexError
        segments = srv.path.split("/")[1:4]
        index, doc_type, doc_id = segments + [None] * (3 - len(segments))

    if args.stdin:
        iterable = sys.stdin
    else:
        if not (host and index):
            sys.exit("either the stdin flag or a server URL of the form "
                     "http://host:port/index/type[/id] is required")
        es_query = {
            "query": {
                "match": {
                    "sameAs.publisher.abbr.keyword": "WIKIDATA"
                }
            }
        }
        iterable = esgenerator(host=host,
                               port=port,
                               index=index,
                               type=doc_type,
                               id=doc_id,
                               headless=True,
                               body=es_query)

    for rec_in in iterable:
        if args.stdin:
            # stdin delivers raw lines, Elasticsearch delivers parsed documents
            rec_in = json.loads(rec_in)

        rec_out = get_wpinfo(rec_in)

        if rec_out:
            print(json.dumps(rec_out, indent=None))
        elif args.pipeline:
            print(json.dumps(rec_in, indent=None))
Example #6
0
def run():
    """
    Command-line entry point: link records that carry coordinates to GeoNames.

    Records come either from stdin (``-stdin``, one JSON document per line) or
    from an Elasticsearch query for documents that have a ``geo`` field.  Each
    candidate is passed to ``get_gnid_by_es`` together with the location of
    the search index given by ``-searchserver``.  Enriched records are printed
    as JSON; with ``-pipeline`` records that were not enriched are printed as
    well.

    ``-server`` and ``-searchserver`` are URLs of the form
    ``http://host:port/index/type/id?pretty``.  They are parsed with
    ``urllib.parse``, so a missing port, type or id no longer raises
    ``IndexError``/``UnboundLocalError``, the port stays an ``int`` and
    ``?pretty`` is honoured whether or not an id is present.
    """
    # imported locally so the function does not rely on a module-level import
    import urllib.parse

    def split_es_url(url):
        """Return (host, port, index, type, id, query); missing parts are None."""
        parsed = urllib.parse.urlparse(url)
        try:
            port = parsed.port
        except ValueError:
            # non-numeric port: behave as if none was given
            port = None
        segments = [seg for seg in parsed.path.split("/") if seg]
        segments += [None] * (3 - len(segments))
        return (parsed.hostname, port, segments[0], segments[1], segments[2],
                parsed.query)

    parser = argparse.ArgumentParser(description='enrich ES by GN!')
    parser.add_argument(
        '-host',
        type=str,
        default="127.0.0.1",
        help=
        'hostname or IP-Address of the ElasticSearch-node to use, default is localhost.'
    )
    parser.add_argument(
        '-port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-index',
                        type=str,
                        help='ElasticSearch Search Index to use')
    parser.add_argument('-type',
                        type=str,
                        help='ElasticSearch Search Index Type to use')
    parser.add_argument("-id",
                        type=str,
                        help="retrieve single document (optional)")
    parser.add_argument('-stdin',
                        action="store_true",
                        help="get data from stdin")
    parser.add_argument(
        '-pipeline',
        action="store_true",
        help=
        "output every record (even if not enriched) to put this script into a pipeline"
    )
    # no, i don't steal the syntax from esbulk...
    parser.add_argument(
        '-server',
        type=str,
        help=
        "use http://host:port/index/type/id?pretty. overwrites host/port/index/id/pretty"
    )
    parser.add_argument(
        '-searchserver',
        type=str,
        default="http://127.0.0.1:9200/geonames/record",
        help=
        "search instance to use as http://host:port/index/type, default is http://127.0.0.1:9200/geonames/record"
    )  # index with geonames_data
    args = parser.parse_args()

    tabbing = None
    if args.server:
        host, port, index, doc_type, doc_id, query = split_es_url(args.server)
        # only overwrite what the URL actually provides
        if host:
            args.host = host
        if port is not None:
            args.port = port
        if index is not None:
            args.index = index
        if doc_type is not None:
            args.type = doc_type
        if doc_id is not None:
            # NOTE(review): args.id is parsed but not passed to esgenerator
            # below -- confirm whether single-document retrieval is wanted.
            args.id = doc_id
        if "pretty" in urllib.parse.parse_qs(query, keep_blank_values=True):
            tabbing = 4

    (search_host, search_port, search_index, search_type, _unused_id,
     _unused_query) = split_es_url(args.searchserver or "")
    if search_port is None:
        # same default as -port
        search_port = 9200

    if args.stdin:
        for line in sys.stdin:
            rec = json.loads(line)
            newrec = None
            if rec.get("geo") and "geonames" not in str(rec.get("sameAs")):
                newrec = get_gnid_by_es(rec, search_host, search_port,
                                        search_index, search_type)
                if newrec:
                    rec = newrec
            if args.pipeline or newrec:
                print(json.dumps(rec, indent=tabbing))
    else:
        # NOTE(review): the field name "*****@*****.**" looks like e-mail
        # obfuscation residue, not a real field path -- confirm against the
        # index mapping.
        for rec in esgenerator(
                host=args.host,
                port=args.port,
                index=args.index,
                type=args.type,
                headless=True,
                body={
                    "query": {
                        "bool": {
                            "filter": {
                                "bool": {
                                    "must_not": [{
                                        "prefix": {
                                            "*****@*****.**":
                                            "https://sws.geonames.org"
                                        }
                                    }, {
                                        "prefix": {
                                            "*****@*****.**":
                                            "http://sws.geonames.org"
                                        }
                                    }, {
                                        "prefix": {
                                            "*****@*****.**":
                                            "https://www.geonames.org"
                                        }
                                    }, {
                                        "prefix": {
                                            "*****@*****.**":
                                            "http://www.geonames.org"
                                        }
                                    }]
                                }
                            },
                            "must": {
                                "exists": {
                                    "field": "geo"
                                }
                            }
                        }
                    }
                }):
            newrec = get_gnid_by_es(rec, search_host, search_port,
                                    search_index, search_type)
            if newrec:
                rec = newrec
            if args.pipeline or newrec:
                print(json.dumps(rec, indent=tabbing))
                 rec = record
         if (record or args.pipeline) and rec:
             print(json.dumps(rec, indent=None))
 else:
     for rec in esgenerator(host=args.host,
                            port=args.port,
                            index=args.index,
                            type=args.type,
                            headless=True,
                            body={
                                "query": {
                                    "bool": {
                                        "must": {
                                            "prefix": {
                                                "sameAs.keyword":
                                                "http://d-nb.info"
                                            }
                                        },
                                        "must_not": {
                                            "prefix": {
                                                "sameAs.keyword":
                                                "http://www.wikidata.org/"
                                            }
                                        }
                                    }
                                }
                            }):
         gnd = None
         if rec.get("sameAs"):
             if isinstance(rec.get("sameAs"), list):
                 for item in rec.get("sameAs"):
                         isbn10=item
                     elif len(item)==13:
                         isbn13=item
             elif isinstance(rec.get("isbn"),str):
                 if len(rec.get("isbn"))==10:
                     isbn10=rec.get("isbn")
                 elif len(rec.get("isbn"))==13:
                     isbn13=rec.get("isbn")
         if isbn10 or isbn13:
             record=get_gndbyISBN(rec,search_host,search_port,isbn10,isbn13)
             if record:
                 rec=record
         if record or args.pipeline:
             print(json.dumps(rec,indent=None))
 else:                                                                                                   
     for rec in esgenerator(host=args.host,port=args.port,index=args.index,type=args.type,headless=True,body={"query": {"bool": {"filter": {"exists": {"field": "relatedEvent"}},"must":{"exists":{"field": "isbn"}}}}},timeout=60):
         isbn10=None
         isbn13=None
         if rec.get("isbn"):
             if isinstance(rec.get("isbn"),list):
                 for item in rec.get("isbn"):
                     if len(item)==10:
                         isbn10=item
                     elif len(item)==13:
                         isbn13=item
             elif isinstance(rec.get("isbn"),str):
                 if len(rec.get("isbn"))==10:
                     isbn10=rec.get("isbn")
                 elif len(rec.get("isbn"))==13:
                     isbn13=rec.get("isbn")
         if isbn10 or isbn13:
Example #9
0
def run():
    """
    Command-line entry point: enrich records with EntityFacts data.

    Arguments come from the module-level parser ``_p``.  With ``args.stdin``
    set, records are read from stdin (one JSON document per line); otherwise
    they are fetched from the Elasticsearch index given by ``args.server``
    (``http://host:port/index[/type]``).

    Every record is passed to ``entityfacts`` together with a list of base
    URLs: the public hub, preceded by the URL built from ``args.searchserver``
    when one is given.  With ``args.ignhub`` and a search server, the public
    hub is dropped from the list.  Enriched records are printed as one JSON
    line each; with ``args.pipeline`` records that were not enriched are
    printed unchanged.

    Exits with an error message when a URL is unusable or when records should
    come from Elasticsearch but no host/index is known (previously these cases
    ended in ``IndexError`` or ``UnboundLocalError``).

    :returns: None
    """
    # imported locally so the function does not rely on a module-level import
    import urllib.parse

    def split_es_url(url):
        """Return (hostname, port, path segments) of an Elasticsearch URL."""
        parsed = urllib.parse.urlparse(url)
        try:
            port = parsed.port
        except ValueError:
            # non-numeric port: behave as if none was given
            port = None
        segments = [seg for seg in parsed.path.split("/") if seg]
        return parsed.hostname, port, segments

    args = _p.parse_args()

    ef_instances = ["http://hub.culturegraph.org/entityfacts/"]

    # NOTE(review): falls back to args.host/port/index/type when the parser
    # defines them (the original comment speaks of overwriting them) --
    # confirm against the definition of _p.
    host = getattr(args, "host", None)
    port = getattr(args, "port", None)
    index = getattr(args, "index", None)
    doc_type = getattr(args, "type", None)
    if args.server:
        srv_host, srv_port, segments = split_es_url(args.server)
        if srv_host:
            host = srv_host
        if srv_port is not None:
            port = srv_port
        if segments:
            index = segments[0]
        if len(segments) > 1:
            doc_type = segments[1]

    if args.ignhub and args.searchserver:
        ef_instances = []

    if args.searchserver:
        search_host, search_port, segments = split_es_url(args.searchserver)
        if not search_host or not segments:
            sys.exit("the search server must be given as "
                     "http://host:port/index[/type]")
        if search_port is None:
            netloc = search_host
        else:
            netloc = "{h}:{p}".format(h=search_host, p=search_port)
        search_type = segments[1] + "/" if len(segments) > 1 else ""
        url = "http://{n}/{i}/{t}".format(n=netloc,
                                          i=segments[0],
                                          t=search_type)
        # prepend searchserver to entityfacts instances to use local
        # search first
        ef_instances = [url] + ef_instances

    if args.stdin:
        iterate = sys.stdin
    else:
        if not (host and index):
            sys.exit("either the stdin flag or a server URL of the form "
                     "http://host:port/index[/type] is required")
        # use Elasticsearch Server for iteration
        # NOTE(review): the field name "*****@*****.**" looks like e-mail
        # obfuscation residue, not a real field path -- confirm against the
        # index mapping.
        es_query = {
            "query": {
                "prefix": {
                    "*****@*****.**": "https://d-nb.info"
                }
            }
        }
        iterate = esgenerator(host=host,
                              port=port,
                              index=index,
                              type=doc_type,
                              headless=True,
                              body=es_query,
                              verbose=False)
    for rec_in in iterate:
        if args.stdin:
            # stdin delivers raw lines, Elasticsearch delivers parsed documents
            rec_in = json.loads(rec_in)

        rec_out = entityfacts(rec_in, ef_instances)

        if rec_out:
            print(json.dumps(rec_out, indent=None))

        elif args.pipeline:
            print(json.dumps(rec_in, indent=None))
Example #10
0
def run():
    """
    Command-line entry point: enrich records via their GND identifier.

    Records come either from stdin (``-stdin``, one JSON document per line) or
    from an Elasticsearch query.  For each record the GND identifier is taken
    from the last ``sameAs`` entry whose ``@id`` contains "d-nb.info"; the
    record is then passed to ``process`` together with that identifier and
    the ``-searchserver`` URL.  Enriched records are printed as one JSON line
    each; with ``-pipeline`` records that were not enriched are printed too.

    ``-host``/``-port``/``-index``/``-type``/``-id`` are the defaults;
    whatever ``-server`` (``http://host:port/index/type/id``) provides
    overrides them.  Without ``-searchserver`` the search server is
    ``http://<host>:<port>``.
    """

    def find_gnd_id(rec):
        """Return the last path segment of the last d-nb.info ``@id`` in sameAs, or None."""
        same_as = rec.get("sameAs")
        if not isinstance(same_as, list):
            return None
        gnd = None
        for item in same_as:
            uri = item.get("@id") if isinstance(item, dict) else None
            if uri and "d-nb.info" in uri and len(uri.split("/")) > 4:
                gnd = uri.rstrip().split("/")[-1]
        return gnd

    parser = argparse.ArgumentParser(
        description='enrich ES by GND Sachgruppen!!')
    parser.add_argument(
        '-host',
        type=str,
        default="127.0.0.1",
        help=
        'hostname or IP-Address of the ElasticSearch-node to use, default is localhost.'
    )
    parser.add_argument(
        '-port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-index',
                        type=str,
                        help='ElasticSearch Search Index to use')
    parser.add_argument('-type',
                        type=str,
                        help='ElasticSearch Search Index Type to use')
    parser.add_argument("-id",
                        type=str,
                        help="retrieve single document (optional)")
    parser.add_argument('-stdin',
                        action="store_true",
                        help="get data from stdin")
    parser.add_argument(
        '-pipeline',
        action="store_true",
        help=
        "output every record (even if not enriched) to put this script into a pipeline"
    )
    # no, i don't steal the syntax from esbulk...
    parser.add_argument(
        '-server',
        type=str,
        help=
        "use http://host:port/index/type/id?pretty. overwrites host/port/index/id/pretty"
    )
    # no, i don't steal the syntax from esbulk...
    parser.add_argument(
        '-searchserver',
        type=str,
        help="use http://host:port for your GND ElasticSearch Server")
    args = parser.parse_args()

    # start from the individual options so the names are always bound, then
    # let -server override whatever it provides
    host, port, index = args.host, args.port, args.index
    doc_type, doc_id = args.type, args.id
    if args.server:
        srv = urllib.parse.urlparse(args.server)
        host = srv.hostname or host
        port = srv.port or port
        # srv.path is "/index/type[/id]"; pad so missing segments become None
        segments = srv.path.split("/")[1:4]
        segments += [None] * (3 - len(segments))
        index = segments[0] or index
        doc_type = segments[1] or doc_type
        doc_id = segments[2] or doc_id
    if not args.searchserver:
        args.searchserver = "http://{}:{}".format(host, port)

    if args.stdin:
        for line in sys.stdin:
            rec = json.loads(line)
            # reset per record: a stale result from an earlier record must not
            # cause this one to be printed
            record = None
            gnd = find_gnd_id(rec)
            if gnd:
                record = process(rec, gnd, args.searchserver)
                if record:
                    rec = record
            if (record or args.pipeline) and rec:
                print(json.dumps(rec, indent=None))
    else:
        if not index:
            parser.error("-index or -server http://host:port/index/type "
                         "is required when not reading from stdin")
        # NOTE(review): the field name "*****@*****.**" looks like e-mail
        # obfuscation residue, not a real field path -- confirm against the
        # index mapping.
        for rec in esgenerator(host=host,
                               port=port,
                               index=index,
                               type=doc_type,
                               id=doc_id,
                               headless=True,
                               body={
                                   "query": {
                                       "prefix": {
                                           "*****@*****.**":
                                           "https://d-nb.info"
                                       }
                                   }
                               }):
            record = None
            gnd = find_gnd_id(rec)
            if gnd:
                record = process(rec, gnd, args.searchserver)
                if record:
                    rec = record
            if record or args.pipeline:
                print(json.dumps(rec, indent=None))
Example #11
0
def run():
    """
    Command-line entry point: add Wikidata links to records.

    Records come either from stdin (``-stdin``, one JSON document per line) or
    from an Elasticsearch query for documents that do not yet carry a
    "WIKIDATA" publisher in ``sameAs`` and whose identifier starts with one of
    the keys of ``lookup_table_wdProperty``.  For each record with a
    ``sameAs`` list, the ``@id`` values are passed to ``get_wdid``.  Enriched
    records are printed as one JSON line each; with ``-pipeline`` records
    that were not enriched are printed as well.

    ``-server`` (``http://host:port/index/type/id?pretty``) overrides
    ``-host``/``-port``/``-index``/``-type``/``-id``.  It is parsed with
    ``urllib.parse``, so a URL without port, type or id no longer raises
    ``IndexError`` and the port stays an ``int``.
    """
    # imported locally so the function does not rely on a module-level import
    import urllib.parse

    parser = argparse.ArgumentParser(description='enrich ES by WD!')
    parser.add_argument(
        '-host',
        type=str,
        default="127.0.0.1",
        help=
        'hostname or IP-Address of the ElasticSearch-node to use, default is localhost.'
    )
    parser.add_argument(
        '-port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-index',
                        type=str,
                        help='ElasticSearch Search Index to use')
    parser.add_argument('-type',
                        type=str,
                        help='ElasticSearch Search Index Type to use')
    parser.add_argument("-id",
                        type=str,
                        help="retrieve single document (optional)")
    parser.add_argument('-stdin',
                        action="store_true",
                        help="get data from stdin")
    parser.add_argument(
        '-pipeline',
        action="store_true",
        help=
        "output every record (even if not enriched) to put this script into a pipeline"
    )
    # no, i don't steal the syntax from esbulk...
    parser.add_argument(
        '-server',
        type=str,
        help=
        "use http://host:port/index/type/id?pretty. overwrites host/port/index/id/pretty"
    )
    args = parser.parse_args()

    if args.server:
        srv = urllib.parse.urlparse(args.server)
        try:
            srv_port = srv.port
        except ValueError:
            # non-numeric port: keep the -port value
            srv_port = None
        segments = [seg for seg in srv.path.split("/") if seg]
        # only overwrite what the URL actually provides
        if srv.hostname:
            args.host = srv.hostname
        if srv_port is not None:
            args.port = srv_port
        if segments:
            args.index = segments[0]
        if len(segments) > 1:
            args.type = segments[1]
        if len(segments) > 2:
            args.id = segments[2]
        if "pretty" in urllib.parse.parse_qs(srv.query,
                                             keep_blank_values=True):
            # NOTE(review): args.pretty is set but not read in this function;
            # the output below is always compact -- confirm intent.
            args.pretty = True

    if args.stdin:
        for line in sys.stdin:
            rec = json.loads(line)
            record = None
            if (rec and isinstance(rec.get("sameAs"), list)
                    and "wikidata.org" not in str(rec["sameAs"])):
                record = get_wdid([x["@id"] for x in rec["sameAs"]], rec)
                if record:
                    rec = record
            if (record or args.pipeline) and rec:
                print(json.dumps(rec, indent=None))
    else:
        # NOTE(review): the field name "*****@*****.**" looks like e-mail
        # obfuscation residue, not a real field path -- confirm against the
        # index mapping.
        prefixes = [{"prefix": {"*****@*****.**": key}}
                    for key in lookup_table_wdProperty]
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "should": prefixes,
                            "must_not": [{
                                "match": {
                                    "sameAs.publisher.abbr.keyword": "WIKIDATA"
                                }
                            }]
                        }
                    }
                }
            }
        }
        for rec in esgenerator(host=args.host,
                               port=args.port,
                               index=args.index,
                               type=args.type,
                               id=args.id,
                               headless=True,
                               body=body):
            record = None
            if rec.get("sameAs") and isinstance(rec.get("sameAs"), list):
                record = get_wdid([x["@id"] for x in rec["sameAs"]], rec)
                if record:
                    rec = record
            if record or args.pipeline:
                print(json.dumps(rec, indent=None))
          "should": [
            {
              "match": {
                "author.about.identifier.propertyID": "fieldOfStudy"
              }
            },
            {
              "match": {
                "author.about.identifier.propertyID": "fieldOfActivity"
              }
            },
            {
              "match": {
                "contributor.about.identifier.propertyID": "fieldOfStudy"
              }
            },
            {
              "match": {
                "contributor.about.identifier.propertyID": "fieldOfActivity"
              }
            }
          ]
        }
      }
    }
  }
}
    for rec in esgenerator(host=args.host,port=args.port,index=args.index,body=searchbody,id=args.id,type=args.type,headless=True,timeout=60):
            enrichrecord(rec,args.host,args.port)
            
             if newrec:
                 rec = newrec
         if args.pipeline or newrec:
             print(json.dumps(rec, indent=tabbing))
 else:
     for rec in esgenerator(host=args.host,
                            port=args.port,
                            index=args.index,
                            type=args.type,
                            headless=True,
                            body={
                                "query": {
                                    "bool": {
                                        "must": {
                                            "exists": {
                                                "field": "geo"
                                            }
                                        },
                                        "must_not": {
                                            "prefix": {
                                                "sameAs.keyword":
                                                "http://www.geonames.org/"
                                            }
                                        }
                                    }
                                }
                            }):
         #newrec=get_gnid(rec)
         newrec = get_gnid_by_es(rec, search_host, search_port,
                                 search_index, search_type)
         if newrec:
             rec = newrec
Example #14
0
def _geonames_name_matches(rec, record):
    """Return True when the GeoNames hit *record* names the same place as *rec*.

    A hit matches when its ``name`` and the record's ``preferredName``
    contain one another, or when ``preferredName`` occurs in the hit's
    ``alternateName``. Missing or empty values never match.
    """
    preferred = rec.get("preferredName")
    if not preferred:
        return False
    name = record.get("name")
    if name and (name in preferred or preferred in name):
        return True
    alternate = record.get("alternateName")
    return bool(alternate) and preferred in alternate


def get_gnid_by_es(rec, host, port, index, typ):
    """
    Use local dump in Elasticsearch

    Searches the given index for entries within 0.1km of the record's
    ``geo`` coordinates and, for every hit whose name matches the record's
    ``preferredName``, adds a GeoNames ``sameAs`` entry to *rec*.

    :param rec: record to enrich (modified in place when a hit matches)
    :param host: Elasticsearch host holding the GeoNames dump
    :param port: Elasticsearch port
    :param index: index to search
    :param typ: document type to search
    :return: the enriched *rec*, or None when nothing was added (already
             linked to GeoNames, no usable coordinates, no matching hit, or
             the search request was rejected)
    """
    same_as = rec.get("sameAs") or []
    if isinstance(same_as, str):
        # A bare string would otherwise be scanned character by character
        # and an existing GeoNames link would never be detected.
        same_as = [same_as]
    # NOTE(review): the entries added below are dicts using
    # "https://sws.geonames.org/", which this substring check does not
    # recognise -- confirm whether re-runs are expected to be idempotent.
    if any("http://www.geonames.org" in s for s in same_as):
        return None

    geo = rec.get("geo")
    if not isinstance(geo, dict):
        return None
    latitude = geo.get("latitude")
    longitude = geo.get("longitude")
    if not (latitude and longitude):
        return None

    searchbody = {
        "query": {
            "bool": {
                "filter": {
                    "geo_distance": {
                        "distance": "0.1km",
                        "location": {
                            "lat": float(latitude),
                            "lon": float(longitude)
                        }
                    }
                }
            }
        }
    }
    changed = False
    try:
        for record in esgenerator(headless=True,
                                  host=host,
                                  port=port,
                                  index=index,
                                  type=typ,
                                  body=searchbody):
            if not _geonames_name_matches(rec, record):
                continue
            geonames_uri = ("https://sws.geonames.org/" +
                            str(record.get("id")) + "/")
            newSameAs = {
                '@id': geonames_uri,
                'publisher': {
                    'abbr': "geonames",
                    'preferredName': "GeoNames",
                    "isBasedOn": {
                        "@type": "Dataset",
                        "@id": geonames_uri
                    }
                }
            }
            rec["sameAs"] = litter(rec.get("sameAs"), newSameAs)
            changed = True
    except elasticsearch.exceptions.RequestError as e:
        # Report the rejected query together with the record and give up
        # on this record.
        eprint(e, json.dumps(searchbody, indent=4),
               json.dumps(rec, indent=4))
        return None

    return rec if changed else None
Example #15
0
         # No entityfacts() result yet.
         record=None
         # Take the GND id from sameAs: the fifth "/"-separated segment of an
         # entry containing "http://d-nb.info". When sameAs is a list, the
         # last matching entry wins because the loop overwrites gnd.
         if rec.get("sameAs"):
             if isinstance(rec.get("sameAs"),list) and any("http://d-nb.info"in x for x in rec.get("sameAs")):
                 for item in rec.get("sameAs"):
                     if "http://d-nb.info" in item and len(item.split("/"))>4:
                         gnd=item.rstrip().split("/")[4]
             elif isinstance(rec.get("sameAs"),str) and "http://d-nb.info" in rec.get("sameAs"):
                 gnd=rec.get("sameAs").split("/")[4]
         # NOTE(review): gnd is not assigned in the visible lines of this
         # branch when no entry matches; presumably it is initialised above
         # -- confirm.
         if gnd:
             record=entityfacts(rec,gnd,ef_instances)
             if record:
                 rec=record
         # Print the record when entityfacts() returned something, or
         # unconditionally when args.pipeline is set.
         if record or args.pipeline:
             print(json.dumps(rec,indent=None))
 # Alternative branch (its condition is outside the visible lines): iterate
 # every record whose sameAs.keyword starts with "http://d-nb.info".
 else:
     for rec in esgenerator(host=args.host,port=args.port,index=args.index,type=args.type,headless=True,body={"query":{"prefix":{"sameAs.keyword":"http://d-nb.info"}}},verbose=True):
         gnd=None
         if rec.get("sameAs"):
             if isinstance(rec.get("sameAs"),list) and any("http://d-nb.info" in x for x in rec.get("sameAs")):
                 for item in rec.get("sameAs"):
                     if "http://d-nb.info" in item and len(item.split("/"))>4:
                         gnd=item.split("/")[4]
             elif isinstance(rec.get("sameAs"),str) and "http://d-nb.info" in rec.get("sameAs"):
                 gnd=rec.get("sameAs").split("/")[4]
         if gnd:
             record=entityfacts(rec,gnd,ef_instances)
             if record:
                 rec=record
         # NOTE(review): record is only assigned when gnd is set and is not
         # reset per iteration in the visible lines, so this test may see the
         # value from a previous record -- confirm.
         if record or args.pipeline:
             print(json.dumps(rec,indent=None))
             
Example #16
0
             args.id = slashsplit[5]
 # Host/port used for lookups: taken from the -searchserver URL when given,
 # otherwise the same host/port the records are read from.
 if args.searchserver:
     slashsplit = args.searchserver.split("/")
     search_host = slashsplit[2].rsplit(":")[0]
     # NOTE(review): search_port stays unassigned in this branch when the
     # port part of the URL is not an integer.
     if isint(args.searchserver.split(":")[2].rsplit("/")[0]):
         search_port = args.searchserver.split(":")[2].split("/")[0]
 else:
     search_host = args.host
     search_port = args.port
 # Iterate all records that have a partOfSeries.@id field.
 for rec in esgenerator(
         host=args.host,
         port=args.port,
         index=args.index,
         type=args.type,
         id=args.id,
         headless=True,
         body={"query": {
             "exists": {
                 "field": "partOfSeries.@id"
             }
         }},
         timeout=60):
     for seriesAttr in rec.get("partOfSeries"):
         if "@id" in seriesAttr:
             # Fetch the series document over HTTP from
             # /resources/schemaorg/<last path segment of the @id>.
             series = requests.get(
                 "http://{host}:{port}/resources/schemaorg/{_id}".format(
                     host=args.host,
                     port=args.port,
                     _id=seriesAttr.get("@id").split("/")[-1]))
             # Only continue when the response carries a _source with an issn.
             if series.json().get("_source") and series.json().get(
                     "_source").get("issn"):
Example #17
0
                # A "?pretty" suffix on the URL switches on pretty output and
                # is stripped from the id.
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.help:
        parser.print_help(sys.stderr)
        exit()

    # Host/port used for lookups: taken from the -searchserver URL when given,
    # otherwise the same host/port the records are read from.
    if args.searchserver:
        slashsplit = args.searchserver.split("/")
        search_host = slashsplit[2].rsplit(":")[0]
        # NOTE(review): search_port stays unassigned in this branch when the
        # port part of the URL is not an integer.
        if isint(args.searchserver.split(":")[2].rsplit("/")[0]):
            search_port = args.searchserver.split(":")[2].split("/")[0]
    else:
        search_host = args.host
        search_port = args.port
    # NOTE(review): es is not used in the visible lines below.
    es = Elasticsearch([{'host': search_host}], port=search_port)

    # Records come either from stdin (one JSON document per line) or from the
    # configured Elasticsearch index; each one is passed to enrich_sameAs().
    if args.stdin:
        for line in sys.stdin:
            hit = json.loads(line)
            enrich_sameAs(hit, search_host, search_port, args.pipeline)

    else:
        for hit in esgenerator(host=args.host,
                               port=args.port,
                               index=args.index,
                               type=args.type,
                               headless=True):
            enrich_sameAs(hit, search_host, search_port, args.pipeline)
Example #18
0
def main():
    """
    Command-line entry point for entity splitting/recognition of MARC records.

    Parses the command line, stores ``-base_id_src`` / ``-target_id`` in the
    module globals ``base_id`` / ``target_id`` and then runs one of these
    modes, checked in this order:

    * host, index, type and id given: fetch that single document and print
      the result of ``process_line``;
    * host, index, type and idfile given: process the documents yielded by
      ``esidfilegenerator`` in a worker pool;
    * host, index, type and ``-debug`` given: process every document in this
      process and print each resulting record;
    * host, index and type given: process the whole index in a worker pool;
    * otherwise: read line-delimited JSON from stdin and print the results.

    Exits via ``sys.exit`` after printing the help text when ``-help`` is
    given, and via ``parser.error`` when ``-server`` is not a usable URL.
    """
    global base_id
    global target_id

    #argstuff
    parser = argparse.ArgumentParser(description='Entitysplitting/Recognition of MARC-Records')
    parser.add_argument('-host', type=str, help='hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.')
    parser.add_argument('-port', type=int, default=9200, help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-type', type=str, help='ElasticSearch Type to use')
    parser.add_argument('-index', type=str, help='ElasticSearch Index to use')
    parser.add_argument('-id', type=str, help='map single document, given by id')
    parser.add_argument('-help', action="store_true", help="print this help")
    parser.add_argument('-z', action="store_true", help="use gzip compression on output data")
    parser.add_argument('-prefix', type=str, default="ldj/", help='Prefix to use for output data')
    parser.add_argument('-debug', action="store_true", help='Dump processed Records to stdout (mostly used for debug-purposes)')
    parser.add_argument('-server', type=str, help="use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty")
    parser.add_argument('-pretty', action="store_true", default=False, help="output tabbed json")
    parser.add_argument('-w', type=int, default=8, help="how many processes to use")
    parser.add_argument('-idfile', type=str, help="path to a file with IDs to process")
    parser.add_argument('-query', type=str, default={}, help='prefilter the data based on an elasticsearch-query')
    parser.add_argument('-base_id_src', type=str, default="http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=", help="set up which base_id to use for sameAs. e.g. http://d-nb.info/gnd/xxx")
    parser.add_argument('-target_id', type=str, default="http://data.slub-dresden.de/", help="set up which target_id to use for @id. e.g. http://data.finc.info")
    args = parser.parse_args()

    if args.help:
        parser.print_help(sys.stderr)
        sys.exit()

    if args.server:
        # Expected shape: http://host:port/index/type/id?pretty
        slashsplit = args.server.split("/")
        if len(slashsplit) < 3 or not slashsplit[2]:
            parser.error("-server expects http://host:port/index/type/id syntax")
        host, _, port = slashsplit[2].partition(":")
        args.host = host
        # Keep the -port value when the URL carries no numeric port, and
        # store an int so it matches what argparse produces for -port.
        if port and isint(port):
            args.port = int(port)
        if len(slashsplit) > 3:
            args.index = slashsplit[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]

    if args.server or (args.host and args.port):
        es = elasticsearch.Elasticsearch([{"host": args.host}], port=args.port)

    base_id = args.base_id_src
    target_id = args.target_id
    tabbing = 4 if args.pretty else None

    if args.host and args.index and args.type and args.id:
        # Single document: fetch it and print the processed result.
        source = get_source_include_str()
        json_record = es.get_source(index=args.index, doc_type=args.type, id=args.id, _source=source)
        if json_record:
            print(json.dumps(process_line(json_record, args.host, args.port, args.index, args.type), indent=tabbing))
    elif args.host and args.index and args.type and args.idfile:
        # Documents listed in an ID file, processed by a worker pool.
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp, initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esidfilegenerator(host=args.host,
                                     port=args.port,
                                     index=args.index,
                                     type=args.type,
                                     source=get_source_include_str(),
                                     body=args.query,
                                     idfile=args.idfile):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    elif args.host and args.index and args.type and args.debug:
        # Debug mode: process in this process and dump every record to stdout.
        init_mp(args.host, args.port, args.prefix, args.z)
        for ldj in esgenerator(host=args.host,
                               port=args.port,
                               index=args.index,
                               type=args.type,
                               source=get_source_include_str(),
                               headless=True,
                               body=args.query):
            record = process_line(ldj, args.host, args.port, args.index, args.type)
            if record:
                for k in record:
                    print(json.dumps(record[k], indent=None))
    elif args.host and args.index and args.type:  #if inf not set, than try elasticsearch
        # Whole index, processed by a worker pool.
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp, initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esfatgenerator(host=args.host,
                                  port=args.port,
                                  index=args.index,
                                  type=args.type,
                                  source=get_source_include_str(),
                                  body=args.query):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    else:  #oh noes, no elasticsearch input-setup. then we'll use stdin
        eprint("No host/port/index specified, trying stdin\n")
        init_mp("localhost", "DEBUG", "DEBUG", "DEBUG")
        with io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') as input_stream:
            for line in input_stream:
                ret = process_line(json.loads(line), "localhost", 9200, "data", "mrc")
                if isinstance(ret, dict):
                    for k, v in ret.items():
                        print(json.dumps(v, indent=tabbing))
        args.index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.searchserver:
        slashsplit = args.searchserver.split("/")
        search_host = slashsplit[2].rsplit(":")[0]
        if isint(args.searchserver.split(":")[2].rsplit("/")[0]):
            search_port = args.searchserver.split(":")[2].split("/")[0]
    else:
        search_host = args.host
        search_port = args.port
    for rec in esgenerator(host=args.host,
                           port=args.port,
                           index=args.index,
                           id=args.id,
                           type=args.type,
                           headless=True,
                           body={"query": {
                               "exists": {
                                   "field": "Thesis"
                               }
                           }},
                           timeout=60):
        enrichrecord(rec, args.host, args.port)