def get_gndbyISSN(record, search_host, search_port, issn):
    # expects an array of ISSNs
    changed = False
    for single_issn in issn:
        for hit in esgenerator(
                host=search_host,
                port=search_port,
                index="dnb-titel",
                type="resources",
                body={
                    "query": {
                        "match": {
                            "http://purl.org/ontology/bibo/[email protected]": str(single_issn)
                        }
                    }
                },
                source=["http://id.loc.gov/vocabulary/relators/isb"],
                source_exclude=None,
                source_include=None,
                headless=False,
                timeout=60):
            record = enrichrecord(record, hit, search_host, search_port)
            changed = True
    if changed:
        return record
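# Hedged usage sketch (not from the original source): get_gndbyISSN() returns the
# enriched record only when at least one ISSN matched in the "dnb-titel" index and
# returns None otherwise, so callers should keep the old record as a fallback.
# The ISSN value below is purely illustrative.
enriched = get_gndbyISSN(rec, search_host, search_port, ["0028-0836"])
if enriched:
    rec = enriched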
def get_gndbyISBN(record, search_host, search_port, isbn10, isbn13):
    changed = False
    if isbn13 and isbn10:
        for hit in esgenerator(
                host=search_host, port=search_port, index="dnb-titel", id=args.id, type="resources",
                body={"query": {"bool": {"must": [
                    {"match": {"http://purl.org/ontology/bibo/[email protected]": str(isbn13)}},
                    {"match": {"http://purl.org/ontology/bibo/[email protected]": str(isbn10)}}]}}},
                source=["http://purl.org/dc/terms/subject", "http://www.w3.org/2002/07/owl#sameAs"],
                source_exclude=None, source_include=None, headless=False, timeout=60):
            record = enrichrecord(record, hit)
            changed = True
    if isbn13 and not changed:
        for hit in esgenerator(
                host=search_host, port=search_port, index="dnb-titel", id=args.id, type="resources",
                body={"query": {"match": {"http://purl.org/ontology/bibo/[email protected]": str(isbn13)}}},
                source=["http://purl.org/dc/terms/subject", "http://www.w3.org/2002/07/owl#sameAs"],
                source_exclude=None, source_include=None, headless=False, timeout=60):
            record = enrichrecord(record, hit)
            changed = True
    elif isbn10 and not changed:
        for hit in esgenerator(
                host=search_host, port=search_port, index="dnb-titel", id=args.id, type="resources",
                body={"query": {"match": {"http://purl.org/ontology/bibo/[email protected]": str(isbn10)}}},
                source=["http://purl.org/dc/terms/subject", "http://www.w3.org/2002/07/owl#sameAs"],
                source_exclude=None, source_include=None, headless=False, timeout=60):
            record = enrichrecord(record, hit)
            changed = True
    if changed:
        return record
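# Hedged note (not in the original source): get_gndbyISBN() tries a combined
# ISBN-13 + ISBN-10 query first and only falls back to ISBN-13 alone (or ISBN-10
# alone when no ISBN-13 is present) if the combined query produced no hit. A
# caller pattern mirroring the rest of these scripts would be:
enriched = get_gndbyISBN(rec, search_host, search_port, isbn10, isbn13)
if enriched:
    rec = enriched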
def get_gnid_by_es(rec, host, port, index, typ):
    if not any("http://www.geonames.org" in s for s in rec.get("sameAs")) and rec["geo"].get(
            "latitude") and rec["geo"].get("longitude"):
        changed = False
        records = []
        searchbody = {
            "query": {
                "bool": {
                    "filter": {
                        "geo_distance": {
                            "distance": "0.1km",
                            "location": {
                                "lat": float(rec["geo"].get("latitude")),
                                "lon": float(rec["geo"].get("longitude"))
                            }
                        }
                    }
                }
            }
        }
        try:
            for record in esgenerator(headless=True, host=host, port=port, index=index, type=typ, body=searchbody):
                records.append(record)
        except elasticsearch.exceptions.RequestError as e:
            eprint(e, json.dumps(searchbody, indent=4), json.dumps(rec, indent=4))
            return
        if records:
            for record in records:
                if record.get("name") in rec.get("name") \
                        or rec.get("name") in record.get("name") \
                        or len(records) == 1 \
                        or rec.get("name") in record.get("alternateName"):
                    # eprint(rec.get("name"), record.get("name"), record.get("id"), record.get("location"))
                    rec["sameAs"] = litter(
                        rec.get("sameAs"),
                        "http://www.geonames.org/" + str(record.get("id")) + "/")
                    changed = True
        if changed:
            return rec
        else:
            return None
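# Hedged illustration (assumed values, not from the original source): the helper
# above only acts on records that carry coordinates and no geonames.org sameAs
# link yet, so a minimal input record looks roughly like this:
example_rec = {
    "name": "Dresden",
    "sameAs": ["http://d-nb.info/gnd/4012995-0"],
    "geo": {"latitude": "51.05", "longitude": "13.74"},
}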
def run(self):
    if os.stat("{date}.mrc.bz2".format(date=self.date)).st_size > 0:
        path = "{date}-data".format(date=self.date)
        for index in os.listdir(path):
            for f in os.listdir(path + "/" + index):
                cmd = "esbulk -z -verbose -server {host} -w {workers} -index {index} -type schemaorg -id identifier {fd}".format(
                    **self.config, index=index, fd=path + "/" + index + "/" + f)
                output = shellout(cmd)
        # for f in os.listdir(path+"/resources"):
        #     cmd = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && "
        #     cmd += "~/git/efre-lod-elasticsearch-tools/processing/merge2move.py -server {host} -stdin < {fd} | ".format(**self.config, fd=path+"/resources/"+f)
        #     cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/sameAs2id.py -searchserver {host} -stdin | ".format(**self.config, fd=path+"/resources/"+f)
        #     cmd += "esbulk -verbose -server {rawdata_host} -w {workers} -index {index} -type schemaorg -id identifier".format(**self.config, index="resources-fidmove")
        #     output = shellout(cmd)
        put_dict("{host}/date/actual/4".format(**self.config), {"date": str(self.now)})
        with gzip.open("slub_resources_sourceid0.ldj", "wt") as outp:
            for record in esgenerator(
                    host="{host}".format(**self.config).rsplit("/")[2].rsplit(":")[0],
                    port="{host}".format(**self.config).rsplit("/")[2].rsplit(":")[1],
                    index="resources",
                    type="schemaorg",
                    body={
                        "query": {
                            "bool": {
                                "must": [
                                    {"match": {"offers.offeredBy.branchCode.keyword": "DE-14"}},
                                    {"match": {"_sourceID.keyword": "0"}}
                                ]
                            }
                        }
                    },
                    headless=True):
                print(json.dumps(record), file=outp)
def run():
    args = _p.parse_args()
    if args.server:
        srv = urllib.parse.urlparse(args.server)
        host = srv.hostname
        port = srv.port
        splitpath = srv.path.split("/")
        index = splitpath[1]
        doc_type = splitpath[2]
        if len(splitpath) > 3:
            doc_id = splitpath[3]
        else:
            doc_id = None
    if args.stdin:
        iterable = sys.stdin
    else:
        es_query = {
            "query": {
                "match": {
                    "sameAs.publisher.abbr.keyword": "WIKIDATA"
                }
            }
        }
        iterable = esgenerator(host=host, port=port, index=index, type=doc_type,
                               id=doc_id, headless=True, body=es_query)
    for rec_in in iterable:
        if args.stdin:
            rec_in = json.loads(rec_in)
        rec_out = get_wpinfo(rec_in)
        if rec_out:
            print(json.dumps(rec_out, indent=None))
        elif args.pipeline:
            print(json.dumps(rec_in, indent=None))
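# Hedged usage sketch (script name is a placeholder and the _p parser is assumed
# to define the same -server/-stdin/-pipeline flags used by the sibling scripts):
#   python wikipedia_enrichment.py -server http://127.0.0.1:9200/persons/schemaorg
#   cat records.ldj | python wikipedia_enrichment.py -stdin -pipeline
# With -pipeline, records that could not be enriched are still printed unchanged.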
def run():
    parser = argparse.ArgumentParser(description='enrich ES by GN!')
    parser.add_argument('-host', type=str, default="127.0.0.1",
                        help='hostname or IP-Address of the ElasticSearch-node to use, default is localhost.')
    parser.add_argument('-port', type=int, default=9200,
                        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-index', type=str, help='ElasticSearch Search Index to use')
    parser.add_argument('-type', type=str, help='ElasticSearch Search Index Type to use')
    parser.add_argument("-id", type=str, help="retrieve single document (optional)")
    parser.add_argument('-stdin', action="store_true", help="get data from stdin")
    parser.add_argument('-pipeline', action="store_true",
                        help="output every record (even if not enriched) to put this script into a pipeline")
    # no, i don't steal the syntax from esbulk...
    parser.add_argument('-server', type=str,
                        help="use http://host:port/index/type/id?pretty. overwrites host/port/index/id/pretty")
    parser.add_argument('-searchserver', type=str, default="http://127.0.0.1:9200/geonames/record",
                        help="search instance to use. default is -server e.g. http://127.0.0.1:9200")  # index with geonames_data
    args = parser.parse_args()
    tabbing = None
    if args.server:
        slashsplit = args.server.split("/")
        args.host = slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port = args.server.split(":")[2].split("/")[0]
        args.index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                tabbing = 4
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.searchserver:
        slashsplit = args.searchserver.split("/")
        search_host = slashsplit[2].rsplit(":")[0]
        if isint(args.searchserver.split(":")[2].rsplit("/")[0]):
            search_port = args.searchserver.split(":")[2].split("/")[0]
        search_index = args.searchserver.split("/")[3]
        if len(slashsplit) > 4:
            search_type = slashsplit[4]
    if args.stdin:
        for line in sys.stdin:
            rec = json.loads(line)
            newrec = None
            if rec.get("geo") and not "geonames" in str(rec["sameAs"]):
                newrec = get_gnid_by_es(rec, search_host, search_port, search_index, search_type)
            if newrec:
                rec = newrec
            if args.pipeline or newrec:
                print(json.dumps(rec, indent=tabbing))
    else:
        for rec in esgenerator(
                host=args.host, port=args.port, index=args.index, type=args.type, headless=True,
                body={
                    "query": {
                        "bool": {
                            "filter": {
                                "bool": {
                                    "must_not": [
                                        {"prefix": {"*****@*****.**": "https://sws.geonames.org"}},
                                        {"prefix": {"*****@*****.**": "http://sws.geonames.org"}},
                                        {"prefix": {"*****@*****.**": "https://www.geonames.org"}},
                                        {"prefix": {"*****@*****.**": "http://www.geonames.org"}}
                                    ]
                                }
                            },
                            "must": {"exists": {"field": "geo"}}
                        }
                    }
                }):
            # newrec = get_gnid(rec)
            newrec = get_gnid_by_es(rec, search_host, search_port, search_index, search_type)
            if newrec:
                rec = newrec
            if args.pipeline or newrec:
                print(json.dumps(rec, indent=tabbing))
                rec = record
            if (record or args.pipeline) and rec:
                print(json.dumps(rec, indent=None))
    else:
        for rec in esgenerator(host=args.host, port=args.port, index=args.index, type=args.type, headless=True,
                               body={
                                   "query": {
                                       "bool": {
                                           "must": {
                                               "prefix": {
                                                   "sameAs.keyword": "http://d-nb.info"
                                               }
                                           },
                                           "must_not": {
                                               "prefix": {
                                                   "sameAs.keyword": "http://www.wikidata.org/"
                                               }
                                           }
                                       }
                                   }
                               }):
            gnd = None
            if rec.get("sameAs"):
                if isinstance(rec.get("sameAs"), list):
                    for item in rec.get("sameAs"):
                            isbn10 = item
                        elif len(item) == 13:
                            isbn13 = item
                elif isinstance(rec.get("isbn"), str):
                    if len(rec.get("isbn")) == 10:
                        isbn10 = rec.get("isbn")
                    elif len(rec.get("isbn")) == 13:
                        isbn13 = rec.get("isbn")
            if isbn10 or isbn13:
                record = get_gndbyISBN(rec, search_host, search_port, isbn10, isbn13)
                if record:
                    rec = record
            if record or args.pipeline:
                print(json.dumps(rec, indent=None))
    else:
        for rec in esgenerator(host=args.host, port=args.port, index=args.index, type=args.type, headless=True,
                               body={"query": {"bool": {"filter": {"exists": {"field": "relatedEvent"}},
                                                        "must": {"exists": {"field": "isbn"}}}}},
                               timeout=60):
            isbn10 = None
            isbn13 = None
            if rec.get("isbn"):
                if isinstance(rec.get("isbn"), list):
                    for item in rec.get("isbn"):
                        if len(item) == 10:
                            isbn10 = item
                        elif len(item) == 13:
                            isbn13 = item
                elif isinstance(rec.get("isbn"), str):
                    if len(rec.get("isbn")) == 10:
                        isbn10 = rec.get("isbn")
                    elif len(rec.get("isbn")) == 13:
                        isbn13 = rec.get("isbn")
            if isbn10 or isbn13:
def run(): """ :param args: argument object, which holds the configuration :type args: argparse.Namespace :returns None :rtype None """ args = _p.parse_args() ef_instances = ["http://hub.culturegraph.org/entityfacts/"] if args.server: # overwrite args.host, args.port, args.index, [args.type] slashsplit = args.server.split("/") host = slashsplit[2].rsplit(":")[0] if isint(args.server.split(":")[2].rsplit("/")[0]): port = args.server.split(":")[2].split("/")[0] index = args.server.split("/")[3] if len(slashsplit) > 4: type = slashsplit[4] if args.ignhub and args.searchserver: ef_instances = [] if args.searchserver: slashsplit = args.searchserver.split("/") search_host = slashsplit[2].rsplit(":")[0] search_port = int(args.searchserver.split(":")[2].split("/")[0]) search_index = args.searchserver.split("/")[3] if len(slashsplit) > 4: search_type = slashsplit[4] + "/" url = "http://{h}:{p}/{i}/{t}".format(h=search_host, p=search_port, i=search_index, t=search_type) # prepend searchserver to entityfacts instances to use local # search first ef_instances = [url] + ef_instances if args.stdin: iterate = sys.stdin else: # use Elasticsearch Server for iteration es_query = { "query": { "prefix": { "*****@*****.**": "https://d-nb.info" } } } iterate = esgenerator(host=host, port=port, index=index, type=type, headless=True, body=es_query, verbose=False) for rec_in in iterate: if args.stdin: rec_in = json.loads(rec_in) rec_out = entityfacts(rec_in, ef_instances) if rec_out: print(json.dumps(rec_out, indent=None)) elif args.pipeline: print(json.dumps(rec_in, indent=None))
def run():
    parser = argparse.ArgumentParser(description='enrich ES by GND Sachgruppen!!')
    parser.add_argument('-host', type=str, default="127.0.0.1",
                        help='hostname or IP-Address of the ElasticSearch-node to use, default is localhost.')
    parser.add_argument('-port', type=int, default=9200,
                        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-index', type=str, help='ElasticSearch Search Index to use')
    parser.add_argument('-type', type=str, help='ElasticSearch Search Index Type to use')
    parser.add_argument("-id", type=str, help="retrieve single document (optional)")
    parser.add_argument('-stdin', action="store_true", help="get data from stdin")
    parser.add_argument('-pipeline', action="store_true",
                        help="output every record (even if not enriched) to put this script into a pipeline")
    # no, i don't steal the syntax from esbulk...
    parser.add_argument('-server', type=str,
                        help="use http://host:port/index/type/id?pretty. overwrites host/port/index/id/pretty")
    # no, i don't steal the syntax from esbulk...
    parser.add_argument('-searchserver', type=str,
                        help="use http://host:port for your GND ElasticSearch Server")
    args = parser.parse_args()
    if args.server:
        srv = urllib.parse.urlparse(args.server)
        host = srv.hostname
        port = srv.port
        splitpath = srv.path.split("/")
        index = splitpath[1]
        doc_type = splitpath[2]
        if len(splitpath) > 3:
            doc_id = splitpath[3]
        else:
            doc_id = None
    if not args.searchserver:
        args.searchserver = "http://{}:{}".format(host, port)
    record = None
    if args.stdin:
        for line in sys.stdin:
            rec = json.loads(line)
            gnd = None
            if isinstance(rec.get("sameAs"), list) and "d-nb.info" in str(rec.get("sameAs")):
                for item in rec.get("sameAs"):
                    if "d-nb.info" in item["@id"] and len(item["@id"].split("/")) > 4:
                        gnd = item["@id"].rstrip().split("/")[-1]
            if gnd:
                record = process(rec, gnd, args.searchserver)
            if record:
                rec = record
            if (record or args.pipeline) and rec:
                print(json.dumps(rec, indent=None))
    else:
        for rec in esgenerator(host=host, port=port, index=index, type=doc_type, id=doc_id, headless=True,
                               body={
                                   "query": {
                                       "prefix": {
                                           "*****@*****.**": "https://d-nb.info"
                                       }
                                   }
                               }):
            gnd = None
            if isinstance(rec.get("sameAs"), list):
                for item in rec.get("sameAs"):
                    if "d-nb.info" in item["@id"] and len(item["@id"].split("/")) > 4:
                        gnd = item["@id"].split("/")[-1]
            if gnd:
                record = process(rec, gnd, args.searchserver)
            if record:
                rec = record
            if record or args.pipeline:
                print(json.dumps(rec, indent=None))
def run():
    parser = argparse.ArgumentParser(description='enrich ES by WD!')
    parser.add_argument('-host', type=str, default="127.0.0.1",
                        help='hostname or IP-Address of the ElasticSearch-node to use, default is localhost.')
    parser.add_argument('-port', type=int, default=9200,
                        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-index', type=str, help='ElasticSearch Search Index to use')
    parser.add_argument('-type', type=str, help='ElasticSearch Search Index Type to use')
    parser.add_argument("-id", type=str, help="retrieve single document (optional)")
    parser.add_argument('-stdin', action="store_true", help="get data from stdin")
    parser.add_argument('-pipeline', action="store_true",
                        help="output every record (even if not enriched) to put this script into a pipeline")
    # no, i don't steal the syntax from esbulk...
    parser.add_argument('-server', type=str,
                        help="use http://host:port/index/type/id?pretty. overwrites host/port/index/id/pretty")
    args = parser.parse_args()
    if args.server:
        slashsplit = args.server.split("/")
        args.host = slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port = args.server.split(":")[2].split("/")[0]
        args.index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.stdin:
        for line in sys.stdin:
            rec = json.loads(line)
            record = None
            if (rec and isinstance(rec.get("sameAs"), list)
                    and "wikidata.org" not in str(rec["sameAs"])):
                record = get_wdid([x["@id"] for x in rec["sameAs"]], rec)
            if record:
                rec = record
            if (record or args.pipeline) and rec:
                print(json.dumps(rec, indent=None))
    else:
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "should": [],
                            "must_not": [{
                                "match": {
                                    "sameAs.publisher.abbr.keyword": "WIKIDATA"
                                }
                            }]
                        }
                    }
                }
            }
        }
        for key in lookup_table_wdProperty:
            body["query"]["bool"]["filter"]["bool"]["should"].append(
                {"prefix": {"*****@*****.**": key}})
        for rec in esgenerator(host=args.host, port=args.port, index=args.index, type=args.type,
                               id=args.id, headless=True, body=body):
            record = None
            if rec.get("sameAs") and isinstance(rec.get("sameAs"), list):
                record = get_wdid([x["@id"] for x in rec["sameAs"]], rec)
            if record:
                rec = record
            if record or args.pipeline:
                print(json.dumps(rec, indent=None))
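# Hedged illustration (hypothetical key, not from the original source): for every
# key in lookup_table_wdProperty the query above gains one more prefix clause,
# e.g. a key "http://d-nb.info/gnd/" would append
#   {"prefix": {"*****@*****.**": "http://d-nb.info/gnd/"}}
# to the "should" list, while the must_not clause keeps records that are already
# marked with the WIKIDATA publisher out of the iteration.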
"should": [ { "match": { "author.about.identifier.propertyID": "fieldOfStudy" } }, { "match": { "author.about.identifier.propertyID": "fieldOfActivity" } }, { "match": { "contributor.about.identifier.propertyID": "fieldOfStudy" } }, { "match": { "contributor.about.identifier.propertyID": "fieldOfActivity" } } ] } } } } } for rec in esgenerator(host=args.host,port=args.port,index=args.index,body=searchbody,id=args.id,type=args.type,headless=True,timeout=60): enrichrecord(rec,args.host,args.port)
            if newrec:
                rec = newrec
            if args.pipeline or newrec:
                print(json.dumps(rec, indent=tabbing))
    else:
        for rec in esgenerator(host=args.host, port=args.port, index=args.index, type=args.type, headless=True,
                               body={
                                   "query": {
                                       "bool": {
                                           "must": {
                                               "exists": {
                                                   "field": "geo"
                                               }
                                           },
                                           "must_not": {
                                               "prefix": {
                                                   "sameAs.keyword": "http://www.geonames.org/"
                                               }
                                           }
                                       }
                                   }
                               }):
            # newrec = get_gnid(rec)
            newrec = get_gnid_by_es(rec, search_host, search_port, search_index, search_type)
            if newrec:
                rec = newrec
def get_gnid_by_es(rec, host, port, index, typ): """ Use local dump in Elasticsearch """ if not any("http://www.geonames.org" in s for s in rec.get("sameAs")) and rec["geo"].get( "latitude") and rec["geo"].get("longitude"): changed = False records = [] searchbody = { "query": { "bool": { "filter": { "geo_distance": { "distance": "0.1km", "location": { "lat": float(rec["geo"].get("latitude")), "lon": float(rec["geo"].get("longitude")) } } } } } } try: for record in esgenerator(headless=True, host=host, port=port, index=index, type=typ, body=searchbody): if record.get("name") in rec.get("preferredName") or rec.get( "preferredName" ) in record.get("name") or len(records) == 1 or rec.get( "preferredName") in record.get("alternateName"): newSameAs = { '@id': "https://sws.geonames.org/" + str(record.get("id")) + "/", 'publisher': { 'abbr': "geonames", 'preferredName': "GeoNames", "isBasedOn": { "@type": "Dataset", "@id": "https://sws.geonames.org/" + str(record.get("id")) + "/" } } } rec["sameAs"] = litter(rec.get("sameAs"), newSameAs) changed = True except elasticsearch.exceptions.RequestError as e: eprint(e, json.dumps(searchbody, indent=4), json.dumps(rec, indent=4)) return if changed: return rec else: return None
            record = None
            if rec.get("sameAs"):
                if isinstance(rec.get("sameAs"), list) and any("http://d-nb.info" in x for x in rec.get("sameAs")):
                    for item in rec.get("sameAs"):
                        if "http://d-nb.info" in item and len(item.split("/")) > 4:
                            gnd = item.rstrip().split("/")[4]
                elif isinstance(rec.get("sameAs"), str) and "http://d-nb.info" in rec.get("sameAs"):
                    gnd = rec.get("sameAs").split("/")[4]
            if gnd:
                record = entityfacts(rec, gnd, ef_instances)
            if record:
                rec = record
            if record or args.pipeline:
                print(json.dumps(rec, indent=None))
    else:
        for rec in esgenerator(host=args.host, port=args.port, index=args.index, type=args.type, headless=True,
                               body={"query": {"prefix": {"sameAs.keyword": "http://d-nb.info"}}},
                               verbose=True):
            gnd = None
            if rec.get("sameAs"):
                if isinstance(rec.get("sameAs"), list) and any("http://d-nb.info" in x for x in rec.get("sameAs")):
                    for item in rec.get("sameAs"):
                        if "http://d-nb.info" in item and len(item.split("/")) > 4:
                            gnd = item.split("/")[4]
                elif isinstance(rec.get("sameAs"), str) and "http://d-nb.info" in rec.get("sameAs"):
                    gnd = rec.get("sameAs").split("/")[4]
            if gnd:
                record = entityfacts(rec, gnd, ef_instances)
            if record:
                rec = record
            if record or args.pipeline:
                print(json.dumps(rec, indent=None))
                args.id = slashsplit[5]
    if args.searchserver:
        slashsplit = args.searchserver.split("/")
        search_host = slashsplit[2].rsplit(":")[0]
        if isint(args.searchserver.split(":")[2].rsplit("/")[0]):
            search_port = args.searchserver.split(":")[2].split("/")[0]
    else:
        search_host = args.host
        search_port = args.port
    for rec in esgenerator(
            host=args.host, port=args.port, index=args.index, type=args.type, id=args.id, headless=True,
            body={"query": {"exists": {"field": "partOfSeries.@id"}}},
            timeout=60):
        for seriesAttr in rec.get("partOfSeries"):
            if "@id" in seriesAttr:
                series = requests.get(
                    "http://{host}:{port}/resources/schemaorg/{_id}".format(
                        host=args.host, port=args.port,
                        _id=seriesAttr.get("@id").split("/")[-1]))
                if series.json().get("_source") and series.json().get("_source").get("issn"):
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.help:
        parser.print_help(sys.stderr)
        exit()
    if args.searchserver:
        slashsplit = args.searchserver.split("/")
        search_host = slashsplit[2].rsplit(":")[0]
        if isint(args.searchserver.split(":")[2].rsplit("/")[0]):
            search_port = args.searchserver.split(":")[2].split("/")[0]
    else:
        search_host = args.host
        search_port = args.port
    es = Elasticsearch([{'host': search_host}], port=search_port)
    if args.stdin:
        for line in sys.stdin:
            hit = json.loads(line)
            enrich_sameAs(hit, search_host, search_port, args.pipeline)
    else:
        for hit in esgenerator(host=args.host, port=args.port, index=args.index, type=args.type, headless=True):
            enrich_sameAs(hit, search_host, search_port, args.pipeline)
def main():
    # argstuff
    parser = argparse.ArgumentParser(description='Entitysplitting/Recognition of MARC-Records')
    parser.add_argument('-host', type=str, help='hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.')
    parser.add_argument('-port', type=int, default=9200, help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-type', type=str, help='ElasticSearch Type to use')
    parser.add_argument('-index', type=str, help='ElasticSearch Index to use')
    parser.add_argument('-id', type=str, help='map single document, given by id')
    parser.add_argument('-help', action="store_true", help="print this help")
    parser.add_argument('-z', action="store_true", help="use gzip compression on output data")
    parser.add_argument('-prefix', type=str, default="ldj/", help='Prefix to use for output data')
    parser.add_argument('-debug', action="store_true", help='Dump processed Records to stdout (mostly used for debug-purposes)')
    parser.add_argument('-server', type=str, help="use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty")
    parser.add_argument('-pretty', action="store_true", default=False, help="output tabbed json")
    parser.add_argument('-w', type=int, default=8, help="how many processes to use")
    parser.add_argument('-idfile', type=str, help="path to a file with IDs to process")
    parser.add_argument('-query', type=str, default={}, help='prefilter the data based on an elasticsearch-query')
    parser.add_argument('-base_id_src', type=str, default="http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=", help="set up which base_id to use for sameAs. e.g. http://d-nb.info/gnd/xxx")
    parser.add_argument('-target_id', type=str, default="http://data.slub-dresden.de/", help="set up which target_id to use for @id. e.g. http://data.finc.info")
    # parser.add_argument('-lookup_host', type=str, help="Target or Lookup Elasticsearch-host, where the result data is going to be ingested to. Only used to lookup IDs (PPN) e.g. http://192.168.0.4:9200")
    args = parser.parse_args()
    if args.help:
        parser.print_help(sys.stderr)
        exit()
    if args.server:
        slashsplit = args.server.split("/")
        args.host = slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port = args.server.split(":")[2].split("/")[0]
        args.index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.server or (args.host and args.port):
        es = elasticsearch.Elasticsearch([{"host": args.host}], port=args.port)
    global base_id
    global target_id
    base_id = args.base_id_src
    target_id = args.target_id
    if args.pretty:
        tabbing = 4
    else:
        tabbing = None
    if args.host and args.index and args.type and args.id:
        json_record = None
        source = get_source_include_str()
        json_record = es.get_source(index=args.index, doc_type=args.type, id=args.id, _source=source)
        if json_record:
            print(json.dumps(process_line(json_record, args.host, args.port, args.index, args.type), indent=tabbing))
    elif args.host and args.index and args.type and args.idfile:
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp, initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esidfilegenerator(host=args.host,
                                     port=args.port,
                                     index=args.index,
                                     type=args.type,
                                     source=get_source_include_str(),
                                     body=args.query,
                                     idfile=args.idfile):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    elif args.host and args.index and args.type and args.debug:
        init_mp(args.host, args.port, args.prefix, args.z)
        for ldj in esgenerator(host=args.host,
                               port=args.port,
                               index=args.index,
                               type=args.type,
                               source=get_source_include_str(),
                               headless=True,
                               body=args.query):
            record = process_line(ldj, args.host, args.port, args.index, args.type)
            if record:
                for k in record:
                    print(json.dumps(record[k], indent=None))
    elif args.host and args.index and args.type:  # if inf not set, then try elasticsearch
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp, initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esfatgenerator(host=args.host,
                                  port=args.port,
                                  index=args.index,
                                  type=args.type,
                                  source=get_source_include_str(),
                                  body=args.query):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    else:  # oh noes, no elasticsearch input-setup. then we'll use stdin
        eprint("No host/port/index specified, trying stdin\n")
        init_mp("localhost", "DEBUG", "DEBUG", "DEBUG")
        with io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') as input_stream:
            for line in input_stream:
                ret = process_line(json.loads(line), "localhost", 9200, "data", "mrc")
                if isinstance(ret, dict):
                    for k, v in ret.items():
                        print(json.dumps(v, indent=tabbing))
args.index = args.server.split("/")[3] if len(slashsplit) > 4: args.type = slashsplit[4] if len(slashsplit) > 5: if "?pretty" in args.server: args.pretty = True args.id = slashsplit[5].rsplit("?")[0] else: args.id = slashsplit[5] if args.searchserver: slashsplit = args.searchserver.split("/") search_host = slashsplit[2].rsplit(":")[0] if isint(args.searchserver.split(":")[2].rsplit("/")[0]): search_port = args.searchserver.split(":")[2].split("/")[0] else: search_host = args.host search_port = args.port for rec in esgenerator(host=args.host, port=args.port, index=args.index, id=args.id, type=args.type, headless=True, body={"query": { "exists": { "field": "Thesis" } }}, timeout=60): enrichrecord(rec, args.host, args.port)