def complete(self):
    """check whether the record count in Elasticsearch matches the dump file"""
    es_recordcount = 0
    file_recordcount = 0
    es_ids = set()
    # collect the _ids that Elasticsearch knows for the PPNs in the idfile
    for record in esidfilegenerator(
            host="{rawdata_host}".format(**self.config).rsplit("/")[-1].rsplit(":")[0],
            port="{rawdata_host}".format(**self.config).rsplit("/")[-1].rsplit(":")[1],
            index="kxp-lok",
            type="mrc",
            idfile="{date}-lok-ppns.txt".format(**self.config, date=self.yesterday.strftime("%y%m%d")),
            source=False):
        es_ids.add(record.get("_id"))
    es_recordcount = len(es_ids)
    try:
        with gzip.open("{date}-lok.ldj.gz".format(**self.config, date=self.yesterday.strftime("%y%m%d")), "rt") as fd:
            ids = set()
            for line in fd:
                jline = json.loads(line)
                ids.add(jline.get("001"))
            file_recordcount = len(ids)
    except FileNotFoundError:
        return False
    if es_recordcount == file_recordcount and es_recordcount > 0:
        return True
    return False
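# The host/port parsing above, "{rawdata_host}".format(**self.config).rsplit("/")[-1].rsplit(":"),
# recurs in nearly all of the complete() checks below and raises an IndexError when the
# configured URL carries no explicit port. A minimal sketch of a helper that could centralize
# it; the name split_host_port and the default port are assumptions, not part of this codebase:
def split_host_port(url, default_port="9200"):
    """split 'http://host:port' (or plain 'host:port') into (host, port) strings"""
    hostpart = url.rsplit("/")[-1]  # drop the 'http:' and '' parts of the URL, if present
    if ":" in hostpart:
        host, port = hostpart.rsplit(":", 1)
        return host, port
    return hostpart, default_port

# usage: host, port = split_host_port("{rawdata_host}".format(**self.config))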
def complete(self):
    path = "{date}-kxp".format(date=self.yesterday.strftime("%y%m%d"))
    ids = set()
    if not os.path.exists(path):
        return False
    for index in os.listdir(path):
        for f in os.listdir(path + "/" + index):
            fd = path + "/" + index + "/" + f
            # collect the identifiers from the gzipped line-delimited JSON dump
            with gzip.open(fd, "rt") as inp:
                for line in inp:
                    ids.add(json.loads(line).get("identifier"))
            # and append them to the idfile used for the Elasticsearch lookup below
            cmd = "zcat {fd} | jq -rc .identifier >> schemaorg-ids-{date}.txt".format(
                fd=fd, date=self.yesterday.strftime("%y%m%d"))
            shellout(cmd)
    es_ids = set()
    for record in esidfilegenerator(
            host="{host}".format(**self.config).rsplit("/")[-1].rsplit(":")[0],
            port="{host}".format(**self.config).rsplit("/")[-1].rsplit(":")[1],
            index="slub-resources",
            type="schemaorg",
            idfile="schemaorg-ids-{date}.txt".format(date=self.yesterday.strftime("%y%m%d")),
            source=False):
        es_ids.add(record.get("_id"))
    if len(es_ids) == len(ids) and len(es_ids) > 0:
        return True
    return False
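# The loop above extracts the identifiers twice: once in Python into `ids` and once via the
# jq shellout that appends to the idfile. A minimal sketch of building the idfile purely in
# Python under the same assumptions (gzipped line-delimited JSON with an "identifier" key);
# the function name write_idfile is invented for illustration:
import gzip
import json

def write_idfile(ldj_gz_paths, idfile_path):
    """append the .identifier of every record in the given dumps to the idfile"""
    with open(idfile_path, "a") as out:  # append, like the >> in the shellout
        for path in ldj_gz_paths:
            with gzip.open(path, "rt") as inp:
                for line in inp:
                    identifier = json.loads(line).get("identifier")
                    if identifier is not None:
                        print(identifier, file=out)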
def complete(self):
    es_recordcount = 0
    file_recordcount = 0
    es_ids = set()
    try:
        filesize = os.stat("{date}-finc-fixed.ldj.gz".format(date=self.date)).st_size
    except FileNotFoundError:
        return False
    if filesize > 0:
        try:
            for record in esidfilegenerator(
                    host="{host}".format(**self.config).rsplit("/")[2].rsplit(":")[0],
                    port="{host}".format(**self.config).rsplit("/")[2].rsplit(":")[1],
                    index="finc-resources",
                    type="schemaorg",
                    idfile="{date}-finc-ppns.txt".format(**self.config, date=self.date),
                    source=False):
                es_ids.add(record.get("_id"))
            es_recordcount = len(es_ids)
            with open("{date}-finc-ppns.txt".format(**self.config, date=self.date), "rt") as fd:
                ids = set()
                for line in fd:
                    ids.add(line.strip())
                file_recordcount = len(ids)
            if es_recordcount == file_recordcount and es_recordcount > 0:
                return True
        except FileNotFoundError:
            if os.path.exists("{date}".format(date=self.date)):
                try:
                    os.listdir("{date}".format(date=self.date))
                    return False
                except OSError:
                    return True
            return False
        return False
    else:
        # an empty dump file means there is nothing to ingest, so the task counts as complete
        return True
def complete(self):
    """
    just a check whether there are still records described by those PPNs
    """
    for _file in self.config["indices"]:
        for index in self.config["indices"][_file]:
            for response in esidfilegenerator(host=index["host"],
                                              port=index["port"],
                                              index=index["_index"],
                                              type=index["_doc_type"],
                                              idfile="{date}-delPPN/{fd}".format(date=self.today, fd=_file),
                                              headless=False):
                if response["found"]:
                    return False
    return True
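# For reference, the shape of self.config["indices"] that the loop above implies: a mapping
# from delete-list file name to a list of index descriptors. The concrete host/port values
# here are invented placeholders; the key names match the accesses in the code:
EXAMPLE_INDICES_CONFIG = {
    "kxp-lok": [
        {"host": "localhost", "port": 9200, "_index": "kxp-lok", "_doc_type": "mrc"},
    ],
    "kxp-tit": [
        {"host": "localhost", "port": 9200, "_index": "kxp-tit", "_doc_type": "mrc"},
    ],
}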
def complete(self):
    es_recordcount = 0
    file_recordcount = 0
    es_ids = set()
    try:
        filesize = os.stat("{date}.mrc.bz2".format(date=self.date)).st_size
    except FileNotFoundError:
        return False
    if filesize > 0:
        try:
            for record in esidfilegenerator(
                    host="{rawdata_host}".format(**self.config).rsplit("/")[2].rsplit(":")[0],
                    port="{rawdata_host}".format(**self.config).rsplit("/")[2].rsplit(":")[1],
                    index="finc-main-k10plus",
                    type="mrc",
                    idfile="{date}-ppns.txt".format(**self.config, date=self.date),
                    source=False):
                es_ids.add(record.get("_id"))
            es_recordcount = len(es_ids)
            with gzip.open("{date}.ldj.gz".format(**self.config, date=self.date), "rt") as fd:
                ids = set()
                for line in fd:
                    jline = json.loads(line)
                    ids.add(jline.get("001"))
                file_recordcount = len(ids)
            print(file_recordcount)
            if es_recordcount == file_recordcount and es_recordcount > 0:
                return True
        except FileNotFoundError:
            if os.path.exists("{date}".format(date=self.date)):
                try:
                    os.listdir("{date}".format(date=self.date))
                    return False
                except OSError:
                    return True
            return False
        return False
    else:
        # an empty dump file means there is nothing to ingest, so the task counts as complete
        return True
def complete(self):
    ids = set()
    es_ids = set()
    with open("ids.txt") as inp:
        for line in inp:
            ids.add(line.strip())
    for record in esidfilegenerator(
            host="{rawdata_host}".format(**self.config).rsplit("/")[-1].rsplit(":")[0],
            port="{rawdata_host}".format(**self.config).rsplit("/")[-1].rsplit(":")[1],
            index="kxp-de14",
            type="mrc",
            idfile="ids.txt",
            source=False):
        es_ids.add(record.get("_id"))
    if len(es_ids) == len(ids) and len(es_ids) > 0:
        return True
    return False
def complete(self):
    """
    takes all the IDs from the TXT file created in LODTransform2ldj and
    checks whether those are in the Elasticsearch node
    """
    es_recordcount = 0
    file_recordcount = 0
    es_ids = set()
    for record in esidfilegenerator(
            host="{host}".format(**self.config).rsplit("/")[2].rsplit(":")[0],
            port="{host}".format(**self.config).rsplit("/")[2].rsplit(":")[1],
            index="swb-aut",
            type="mrc",
            idfile="{date}-norm-aut-ppns.txt".format(**self.config, date=self.yesterday.strftime("%y%m%d")),
            source=False):
        es_ids.add(record.get("_id"))
    es_recordcount = len(es_ids)
    try:
        with gzip.open("{date}-norm-aut.ldj.gz".format(**self.config, date=self.yesterday.strftime("%y%m%d")), "rt") as fd:
            ids = set()
            for line in fd:
                jline = json.loads(line)
                ids.add(jline.get("001"))
            file_recordcount = len(ids)
            print(file_recordcount)
    except FileNotFoundError:
        return False
    if es_recordcount == file_recordcount and es_recordcount > 0:
        return True
    return False
def main():
    # argstuff
    parser = argparse.ArgumentParser(description='Entitysplitting/Recognition of MARC-Records')
    parser.add_argument('-host', type=str, help='hostname or IP address of the Elasticsearch node to use. If None, we try to read LDJ from stdin.')
    parser.add_argument('-port', type=int, default=9200, help='port of the Elasticsearch node to use, default is 9200.')
    parser.add_argument('-type', type=str, help='Elasticsearch type to use')
    parser.add_argument('-index', type=str, help='Elasticsearch index to use')
    parser.add_argument('-id', type=str, help='map a single document, given by its id')
    parser.add_argument('-help', action="store_true", help="print this help")
    parser.add_argument('-z', action="store_true", help="use gzip compression on output data")
    parser.add_argument('-prefix', type=str, default="ldj/", help='prefix to use for output data')
    parser.add_argument('-debug', action="store_true", help='dump processed records to stdout (mostly used for debugging)')
    parser.add_argument('-server', type=str, help="use http://host:port/index/type/id?pretty syntax; overwrites host/port/index/id/pretty")
    parser.add_argument('-pretty', action="store_true", default=False, help="output tabbed JSON")
    parser.add_argument('-w', type=int, default=8, help="how many processes to use")
    parser.add_argument('-idfile', type=str, help="path to a file with IDs to process")
    parser.add_argument('-query', type=str, default={}, help='prefilter the data based on an Elasticsearch query')
    parser.add_argument('-base_id_src', type=str, default="http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=", help="set up which base_id to use for sameAs, e.g. http://d-nb.info/gnd/xxx")
    parser.add_argument('-target_id', type=str, default="http://data.slub-dresden.de/", help="set up which target_id to use for @id, e.g. http://data.finc.info")
    # parser.add_argument('-lookup_host', type=str, help="Target or lookup Elasticsearch host where the result data is going to be ingested. Only used to look up IDs (PPN), e.g. http://192.168.0.4:9200")
    args = parser.parse_args()
    if args.help:
        parser.print_help(sys.stderr)
        exit()
    if args.server:
        slashsplit = args.server.split("/")
        args.host = slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port = args.server.split(":")[2].split("/")[0]
        args.index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.server or (args.host and args.port):
        es = elasticsearch.Elasticsearch([{"host": args.host}], port=args.port)
    global base_id
    global target_id
    base_id = args.base_id_src
    target_id = args.target_id
    if args.pretty:
        tabbing = 4
    else:
        tabbing = None
    if args.host and args.index and args.type and args.id:
        json_record = None
        source = get_source_include_str()
        json_record = es.get_source(index=args.index, doc_type=args.type, id=args.id, _source=source)
        if json_record:
            print(json.dumps(process_line(json_record, args.host, args.port, args.index, args.type), indent=tabbing))
    elif args.host and args.index and args.type and args.idfile:
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp, initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esidfilegenerator(host=args.host,
                                     port=args.port,
                                     index=args.index,
                                     type=args.type,
                                     source=get_source_include_str(),
                                     body=args.query,
                                     idfile=args.idfile):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    elif args.host and args.index and args.type and args.debug:
        init_mp(args.host, args.port, args.prefix, args.z)
        for ldj in esgenerator(host=args.host,
                               port=args.port,
                               index=args.index,
                               type=args.type,
                               source=get_source_include_str(),
                               headless=True,
                               body=args.query):
            record = process_line(ldj, args.host, args.port, args.index, args.type)
            if record:
                for k in record:
                    print(json.dumps(record[k], indent=None))
    elif args.host and args.index and args.type:  # if no idfile is set, then try Elasticsearch
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp, initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esfatgenerator(host=args.host,
                                  port=args.port,
                                  index=args.index,
                                  type=args.type,
                                  source=get_source_include_str(),
                                  body=args.query):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    else:  # oh noes, no Elasticsearch input set up, then we'll use stdin
        eprint("No host/port/index specified, trying stdin\n")
        init_mp("localhost", "DEBUG", "DEBUG", "DEBUG")
        with io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') as input_stream:
            for line in input_stream:
                ret = process_line(json.loads(line), "localhost", 9200, "data", "mrc")
                if isinstance(ret, dict):
                    for k, v in ret.items():
                        print(json.dumps(v, indent=tabbing))
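# How the -server URL dissection in main() maps onto the split()/rsplit() indices used
# above, demonstrated with an invented example URL:
#
#   "http://localhost:9200/myindex/mytype/123?pretty".split("/")
#   == ['http:', '', 'localhost:9200', 'myindex', 'mytype', '123?pretty']
#        [0]     [1]  [2]              [3]        [4]       [5]
#
# so slashsplit[2].rsplit(":")[0] is the host, split(":")[2].rsplit("/")[0] is the port,
# slashsplit[3] the index, slashsplit[4] the type and slashsplit[5] the id (with "?pretty"
# stripped off and turned into args.pretty).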
            elif isinstance(rec.get("isbn"), str):
                if len(rec.get("isbn")) == 10:
                    isbn10 = rec.get("isbn")
                elif len(rec.get("isbn")) == 13:
                    isbn13 = rec.get("isbn")
        if isbn10 or isbn13:
            record = get_gndbyISBN(rec, search_host, search_port, isbn10, isbn13)
            if record:
                rec = record
        if record or args.pipeline:
            print(json.dumps(rec, indent=None))
elif args.idfile:
    for rec in esidfilegenerator(host=args.host,
                                 port=args.port,
                                 index=args.index,
                                 type=args.type,
                                 idfile=args.idfile,
                                 headless=True,
                                 timeout=600):
        isbn10 = None
        isbn13 = None
        record = None
        if rec and rec.get("isbn"):
            if isinstance(rec.get("isbn"), list):
                for item in rec.get("isbn"):
                    if len(item) == 10:
                        isbn10 = item
                    elif len(item) == 13:
                        isbn13 = item
            elif isinstance(rec.get("isbn"), str):
                if len(rec.get("isbn")) == 10:
def run(self):
    """
    we iterate through all the delete lists, extract the correct PPNs
    and put them into the correct files, which are line-delimited PPNs
    """
    lok_epns = set()  # 3 sets for deduplication
    tit_ppns = set()
    norm_ppns = set()
    for f in os.listdir(self.today + "-delPPN/"):
        with open("{date}-delPPN/{file}".format(date=self.today, file=f)) as handle:
            for line in handle:
                # dissect line
                __date = line[0:5]    # YYDDD, WTF
                __time = line[5:11]   # HHMMSS
                d_type = line[11:12]  # epn = 9, titledata = A, normdata = B|C|D
                __xpn = line[12:22]
                __iln = line[22:26]   # only in epns
                # __xpn is an EPN and the trailing ILN is our configured ILN
                if d_type == "9" and __iln == self.config["ILN"]:
                    lok_epns.add(__xpn)
                # __xpn is a PPN for title data
                elif d_type == "A":
                    tit_ppns.add(__xpn)
                # __xpn is a PPN for authority data
                elif d_type in ("B", "C", "D"):
                    norm_ppns.add(__xpn)
                # everything else is not in our interest
                else:
                    continue
    with open("{date}-delPPN/kxp-lok".format(date=self.today), "w") as lok:
        for epn in lok_epns:
            print(epn, file=lok)
    with open("{date}-delPPN/kxp-tit".format(date=self.today), "w") as tit:
        for ppn in tit_ppns:
            print(ppn, file=tit)
    with open("{date}-delPPN/kxp-norm".format(date=self.today), "w") as norm:
        for ppn in norm_ppns:
            print(ppn, file=norm)
    associated_ppns = set()
    """
    we iterate through the EPNs and their corresponding local data records
    and save the associated PPNs, which are in field 004; if no local data
    record is referring to an associated PPN any more, we call it a day
    ("abgesigelt") and delete it in all our title and resources indices
    """
    for lok_record in esidfilegenerator(host=self.config["indices"]["kxp-lok"][0]["host"],
                                        port=self.config["indices"]["kxp-lok"][0]["port"],
                                        index=self.config["indices"]["kxp-lok"][0]["_index"],
                                        type=self.config["indices"]["kxp-lok"][0]["_doc_type"],
                                        idfile="{date}-delPPN/kxp-lok".format(date=self.today),
                                        headless=True):
        if lok_record and "004" in lok_record:
            associated_ppns.add(lok_record["004"][0])
    with open("{date}-delPPN/associated-tit".format(date=self.today), "w") as assoc_tit:
        for ppn in associated_ppns:
            print(ppn, file=assoc_tit)
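# The fixed-width dissection in run() as a standalone sketch; the function name and the
# sample line are invented, but the column offsets are the ones used above:
def dissect_delete_line(line):
    return {
        "date": line[0:5],    # YYDDD
        "time": line[5:11],   # HHMMSS
        "type": line[11:12],  # 9 = EPN (local data), A = title data, B/C/D = authority data
        "xpn": line[12:22],   # EPN or PPN, depending on the type
        "iln": line[22:26],   # ILN, only meaningful for EPN lines
    }

# e.g. dissect_delete_line("211231200009012345678X0013")
# == {'date': '21123', 'time': '120000', 'type': '9', 'xpn': '012345678X', 'iln': '0013'}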
def main():
    # argstuff
    parser = ArgumentParser(description='Merging of local and title MARC records in MarcXchange JSON format on Elasticsearch')
    parser.add_argument('-title_host', type=str, help='hostname or IP address of the Elasticsearch node to use. If None, we try to read LDJ from stdin.')
    parser.add_argument('-title_port', type=int, default=9200, help='port of the Elasticsearch node to use, default is 9200.')
    parser.add_argument('-title_type', type=str, help='Elasticsearch type to use')
    parser.add_argument('-title_index', type=str, help='Elasticsearch index to use')
    parser.add_argument('-title_server', type=str, help="use http://host:port/index/type/id?pretty syntax; overwrites host/port/index/id/pretty")
    parser.add_argument('-local_host', type=str, help='hostname or IP address of the Elasticsearch node to use. If None, we try to read LDJ from stdin.')
    parser.add_argument('-local_port', type=int, default=9200, help='port of the Elasticsearch node to use, default is 9200.')
    parser.add_argument('-local_type', type=str, help='Elasticsearch type to use')
    parser.add_argument('-local_index', type=str, help='Elasticsearch index to use')
    parser.add_argument('-local_server', type=str, help="use http://host:port/index/type/id?pretty syntax; overwrites host/port/index/id/pretty")
    parser.add_argument('-selectbody', type=loads, default={"query": {"match": {"852.__.a.keyword": "DE-14"}}})
    parser.add_argument('-idfile', type=str, help='idfile to use')
    parser.add_argument('-help', action="store_true", help="print this help")
    args = parser.parse_args()
    if args.help:
        parser.print_help(stderr)
        exit()
    if args.title_server:
        slashsplit = args.title_server.split("/")
        args.title_host = slashsplit[2].rsplit(":")[0]
        if isint(args.title_server.split(":")[2].rsplit("/")[0]):
            args.title_port = args.title_server.split(":")[2].split("/")[0]
        args.title_index = args.title_server.split("/")[3]
        if len(slashsplit) > 4:
            args.title_type = slashsplit[4]
    if args.local_server:
        slashsplit = args.local_server.split("/")
        args.local_host = slashsplit[2].rsplit(":")[0]
        if isint(args.local_server.split(":")[2].rsplit("/")[0]):
            args.local_port = args.local_server.split(":")[2].split("/")[0]
        args.local_index = args.local_server.split("/")[3]
        if len(slashsplit) > 4:
            args.local_type = slashsplit[4]
    if args.title_server or (args.title_host and args.title_port):
        td = Elasticsearch([{"host": args.title_host}], port=args.title_port)
    else:
        eprint("no server for title data submitted. exiting.")
        exit(-1)
    if (args.local_server or (args.local_host and args.local_port)) and args.idfile:
        ids = dict()
        # remember the holdings fields of every local record, keyed by its 004 (the PPN link)
        for record in esidfilegenerator(host=args.local_host,
                                        port=args.local_port,
                                        index=args.local_index,
                                        type=args.local_type,
                                        body=args.selectbody,
                                        source="852,004,938",
                                        idfile=args.idfile):
            ids[record["_source"]["004"][0]] = {
                "852": record["_source"]["852"],
                "938": record["_source"]["938"]
            }
        titlerecords = td.mget(index=args.title_index,
                               doc_type=args.title_type,
                               body={"ids": [_id for _id in ids]})
        for record in titlerecords["docs"]:
            if "_source" in record:
                for field in ["852", "938"]:
                    record["_source"][field] = ids[record["_id"]][field]
                print(dumps(record["_source"]))
            else:
                eprint(dumps(record))
    elif not args.idfile and (args.local_server or (args.local_host and args.local_port)):
        for records in esfatgenerator(host=args.local_host,
                                      port=args.local_port,
                                      index=args.local_index,
                                      type=args.local_type,
                                      body=args.selectbody,
                                      source="852,004,938"):
            ids = dict()
            for record in records:
                ids[record["_source"]["004"][0]] = {
                    "852": record["_source"]["852"],
                    "938": record["_source"]["938"]
                }
            try:
                titlerecords = td.mget(index=args.title_index,
                                       doc_type=args.title_type,
                                       body={"ids": [_id for _id in ids]})
            except NotFoundError:
                continue
            except RequestError:
                continue
            for record in titlerecords["docs"]:
                if "_source" in record:
                    for field in ["852", "938"]:
                        record["_source"][field] = ids[record["_id"]][field]
                    print(dumps(record["_source"]))
                else:
                    eprint(dumps(record))
    else:
        eprint("no server for local data submitted. exiting.")
        exit(-1)
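# The merge step above in miniature: the local holdings fields 852 and 938 are copied onto
# the title record whose _id equals the local record's field 004 (the PPN link). A
# self-contained sketch with invented records:
local = {"004": ["123"], "852": [{"a": "DE-14"}], "938": [{"a": "some holding"}]}
title = {"_id": "123", "_source": {"245": ["some title"]}}
ids = {local["004"][0]: {"852": local["852"], "938": local["938"]}}
for field in ["852", "938"]:
    title["_source"][field] = ids[title["_id"]][field]
assert title["_source"]["852"] == local["852"]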