Example #1
    def complete(self):
        es_recordcount = 0
        file_recordcount = 0
        es_ids = set()
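        # host/port are parsed from the configured rawdata_host URL
        # (e.g. "http://host:9200" -> "host", "9200")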
        for record in esidfilegenerator(
                host="{rawdata_host}".format(
                    **self.config).rsplit("/")[-1].rsplit(":")[0],
                port="{rawdata_host}".format(
                    **self.config).rsplit("/")[-1].rsplit(":")[1],
                index="kxp-lok".format(date=self.yesterday.strftime("%y%m%d")),
                type="mrc",
                idfile="{date}-lok-ppns.txt".format(
                    **self.config, date=self.yesterday.strftime("%y%m%d")),
                source="False"):
            es_ids.add(record.get("_id"))
        es_recordcount = len(es_ids)

        try:
            with gzip.open(
                    "{date}-lok.ldj.gz".format(**self.config,
                                               date=self.yesterday.strftime(
                                                   "%y%m%d")), "rt") as fd:
                ids = set()
                for line in fd:
                    jline = json.loads(line)
                    ids.add(jline.get("001"))
            file_recordcount = len(ids)
        except FileNotFoundError:
            return False

        if es_recordcount == file_recordcount and es_recordcount > 0:
            return True
        return False
Example #2
 def complete(self):
     path = "{date}-kxp".format(date=self.yesterday.strftime("%y%m%d"))
     ids = set()
     if not os.path.exists(path):
         return False
     for index in os.listdir(path):
         for f in os.listdir(path + "/" + index):
             with gzip.open("{fd}".format(fd=path + "/" + index + "/" + f),
                            "rt") as inp:
                 for line in inp:
                     ids.add(json.loads(line).get("identifier"))
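             # the jq shell-out appends this file's identifiers to the idfile
             # that esidfilegenerator reads below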
             cmd = "zcat {fd} | jq -rc .identifier >> schemaorg-ids-{date}.txt".format(
                 fd=path + "/" + index + "/" + f,
                 date=self.yesterday.strftime("%y%m%d"))
             shellout(cmd)
     es_ids = set()
     for record in esidfilegenerator(
             host="{host}".format(
                 **self.config).rsplit("/")[-1].rsplit(":")[0],
             port="{host}".format(
                 **self.config).rsplit("/")[-1].rsplit(":")[1],
             index="slub-resources",
             type="schemaorg",
             idfile="schemaorg-ids-{date}.txt".format(
                 date=self.yesterday.strftime("%y%m%d")),
             source=False):
         es_ids.add(record.get("_id"))
     if len(es_ids) == len(ids) and len(es_ids) > 0:
         return True
     return False
Example #3
 def complete(self):
     fail = 0
     es_recordcount = 0
     file_recordcount = 0
     es_ids = set()

     try:
         filesize = os.stat(
             "{date}-finc-fixed.ldj.gz".format(date=self.date)).st_size
     except FileNotFoundError:
         return False
     if filesize > 0:
         try:
             for record in esidfilegenerator(
                     host="{host}".format(
                         **self.config).rsplit("/")[2].rsplit(":")[0],
                     port="{host}".format(
                         **self.config).rsplit("/")[2].rsplit(":")[1],
                     index="finc-resources",
                     type="schemaorg",
                     idfile="{date}-finc-ppns.txt".format(**self.config,
                                                          date=self.date),
                     source=False):
                 es_ids.add(record.get("_id"))
             es_recordcount = len(es_ids)

             with open("{date}-finc-ppns.txt".format(**self.config,
                                                     date=self.date),
                       "rt") as fd:
                 ids = set()
                 for line in fd:
                     ids.add(line.strip())
             file_recordcount = len(ids)
             if es_recordcount == file_recordcount and es_recordcount > 0:
                 return True
         except FileNotFoundError:
             if os.path.exists("{date}".format(date=self.date)):
                 try:
                     os.listdir("{date}".format(date=self.date))
                     return False
                 except OSError:
                     return True
             return False
         return False
     else:
         return True
Example #4
 def complete(self):
     """
     Just a check whether there are still records described by those PPNs.
     """
     for _file in self.config["indices"]:
         for index in self.config["indices"][_file]:
             for response in esidfilegenerator(
                     host=index["host"],
                     port=index["port"],
                     index=index["_index"],
                     type=index["_doc_type"],
                     idfile="{date}-delPPN/{fd}".format(date=self.today,
                                                        fd=_file),
                     headless=False):
                 if response["found"]:
                     return False
     return True
Example #5
    def complete(self):
        fail = 0
        es_recordcount = 0
        file_recordcount = 0
        es_ids = set()

        try:
            filesize = os.stat("{date}.mrc.bz2".format(date=self.date)).st_size
        except FileNotFoundError:
            return False
        if filesize > 0:
            try:
                for record in esidfilegenerator(
                        host="{rawdata_host}".format(
                            **self.config).rsplit("/")[2].rsplit(":")[0],
                        port="{rawdata_host}".format(
                            **self.config).rsplit("/")[2].rsplit(":")[1],
                        index="finc-main-k10plus",
                        type="mrc",
                        idfile="{date}-ppns.txt".format(**self.config,
                                                        date=self.date),
                        source="False"):
                    es_ids.add(record.get("_id"))
                es_recordcount = len(es_ids)

                with gzip.open(
                        "{date}.ldj.gz".format(**self.config, date=self.date),
                        "rt") as fd:
                    ids = set()
                    for line in fd:
                        jline = json.loads(line)
                        ids.add(jline.get("001"))
                file_recordcount = len(ids)
                print(file_recordcount)
                if es_recordcount == file_recordcount and es_recordcount > 0:
                    return True
            except FileNotFoundError:
                if os.path.exists("{date}".format(date=self.date)):
                    try:
                        os.listdir("{date}".format(date=self.date))
                        return False
                    except OSError:
                        return True
                return False
            return False
        else:
            return True
Example #6
 def complete(self):
     ids = set()
     es_ids = set()
     with open("ids.txt") as inp:
         for line in inp:
             ids.add(line.strip())
     for record in esidfilegenerator(
             host="{rawdata_host}".format(
                 **self.config).rsplit("/")[-1].rsplit(":")[0],
             port="{rawdata_host}".format(
                 **self.config).rsplit("/")[-1].rsplit(":")[1],
             index="kxp-de14",
             type="mrc",
             idfile="ids.txt",
             source=False):
         es_ids.add(record.get("_id"))
     if len(es_ids) == len(ids) and len(es_ids) > 0:
         return True
     return False
Example #7
    def complete(self):
        """
        takes all the IDS from the in LODTransform2ldj created TXT file and checks whether those are in the elasticsearch node  
        """
        es_recordcount = 0
        file_recordcount = 0
        es_ids = set()
        for record in esidfilegenerator(
                host="{host}".format(
                    **self.config).rsplit("/")[2].rsplit(":")[0],
                port="{host}".format(
                    **self.config).rsplit("/")[2].rsplit(":")[1],
                index="swb-aut",
                type="mrc",
                idfile="{date}-norm-aut-ppns.txt".format(
                    **self.config, date=self.yesterday.strftime("%y%m%d")),
                source="False"):
            es_ids.add(record.get("_id"))
        es_recordcount = len(es_ids)

        try:
            with gzip.open(
                    "{date}-norm-aut.ldj.gz".format(
                        **self.config, date=self.yesterday.strftime("%y%m%d")),
                    "rt") as fd:
                ids = set()
                for line in fd:
                    jline = json.loads(line)
                    ids.add(jline.get("001"))
            file_recordcount = len(ids)
            print(file_recordcount)
        except FileNotFoundError:
            return False

        if es_recordcount == file_recordcount and es_recordcount > 0:
            return True
        return False
Example #8
def main():
    #argstuff
    parser=argparse.ArgumentParser(description='Entitysplitting/Recognition of MARC-Records')
    parser.add_argument('-host',type=str,help='hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.')
    parser.add_argument('-port',type=int,default=9200,help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-type',type=str,help='ElasticSearch Type to use')
    parser.add_argument('-index',type=str,help='ElasticSearch Index to use')
    parser.add_argument('-id',type=str,help='map single document, given by id')
    parser.add_argument('-help',action="store_true",help="print this help")
    parser.add_argument('-z',action="store_true",help="use gzip compression on output data")
    parser.add_argument('-prefix',type=str,default="ldj/",help='Prefix to use for output data')
    parser.add_argument('-debug',action="store_true",help='Dump processed Records to stdout (mostly used for debug-purposes)')
    parser.add_argument('-server',type=str,help="use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty")
    parser.add_argument('-pretty',action="store_true",default=False,help="output tabbed json")
    parser.add_argument('-w',type=int,default=8,help="how many processes to use")
    parser.add_argument('-idfile',type=str,help="path to a file with IDs to process")
    parser.add_argument('-query',type=str,default={},help='prefilter the data based on an elasticsearch-query')
    parser.add_argument('-base_id_src',type=str,default="http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=",help="set up which base_id to use for sameAs. e.g. http://d-nb.info/gnd/xxx")
    parser.add_argument('-target_id',type=str,default="http://data.slub-dresden.de/",help="set up which target_id to use for @id. e.g. http://data.finc.info")
#    parser.add_argument('-lookup_host',type=str,help="Target or Lookup Elasticsearch-host, where the result data is going to be ingested to. Only used to lookup IDs (PPN) e.g. http://192.168.0.4:9200")
    args=parser.parse_args()
    if args.help:
        parser.print_help(sys.stderr)
        exit()        
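    # dissect the -server URL (http://host:port/index/type/id?pretty) into its parts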
    if args.server:
        slashsplit=args.server.split("/")
        args.host=slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port=args.server.split(":")[2].split("/")[0]
        args.index=args.server.split("/")[3]
        if len(slashsplit)>4:
            args.type=slashsplit[4]
        if len(slashsplit)>5:
            if "?pretty" in args.server:
                args.pretty=True
                args.id=slashsplit[5].rsplit("?")[0]
            else:
                args.id=slashsplit[5]
    if args.server or ( args.host and args.port ):
        es=elasticsearch.Elasticsearch([{"host":args.host}],port=args.port)
    global base_id
    global target_id
    base_id=args.base_id_src
    target_id=args.target_id
    if args.pretty:
        tabbing=4
    else:
        tabbing=None
        
    if args.host and args.index and args.type and args.id:
        json_record=None
        source=get_source_include_str()
        json_record=es.get_source(index=args.index,doc_type=args.type,id=args.id,_source=source)
        if json_record:
            print(json.dumps(process_line(json_record,args.host,args.port,args.index,args.type),indent=tabbing))
    elif args.host and args.index and args.type and args.idfile:
        setupoutput(args.prefix)
        pool = Pool(args.w,initializer=init_mp,initargs=(args.host,args.port,args.prefix,args.z))
        for ldj in esidfilegenerator(host=args.host,
                       port=args.port,
                       index=args.index,
                       type=args.type,
                       source=get_source_include_str(),
                       body=args.query,
                       idfile=args.idfile
                        ):
            pool.apply_async(worker,args=(ldj,))
        pool.close()
        pool.join()
    elif args.host and args.index and args.type and args.debug:
        init_mp(args.host,args.port,args.prefix,args.z)
        for ldj in esgenerator(host=args.host,
                       port=args.port,
                       index=args.index,
                       type=args.type,
                       source=get_source_include_str(),
                       headless=True,
                       body=args.query
                        ): 
            record = process_line(ldj,args.host,args.port,args.index,args.type)
            if record:
                for k in record:
                    print(json.dumps(record[k],indent=None))
    elif args.host and args.index and args.type:  # if no idfile is set, then try elasticsearch
        setupoutput(args.prefix)
        pool = Pool(args.w,initializer=init_mp,initargs=(args.host,args.port,args.prefix,args.z))
        for ldj in esfatgenerator(host=args.host,
                       port=args.port,
                       index=args.index,
                       type=args.type,
                       source=get_source_include_str(),
                       body=args.query
                        ):
            pool.apply_async(worker,args=(ldj,))
        pool.close()
        pool.join()
    else: #oh noes, no elasticsearch input-setup. then we'll use stdin
        eprint("No host/port/index specified, trying stdin\n")
        init_mp("localhost","DEBUG","DEBUG","DEBUG")
        with io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') as input_stream:
            for line in input_stream:
                ret=process_line(json.loads(line),"localhost",9200,"data","mrc")
                if isinstance(ret,dict):
                    for k,v in ret.items():
                        print(json.dumps(v,indent=tabbing))
Example #9
 elif args.idfile:
     for rec in esidfilegenerator(host=args.host,
                    port=args.port,
                    index=args.index,
                    type=args.type,
                    idfile=args.idfile,
                    headless=True,
                    timeout=600
                     ):
         isbn10=None
         isbn13=None
         record=None
         if rec and rec.get("isbn"):
             if isinstance(rec.get("isbn"),list):
                 for item in rec.get("isbn"):
                     if len(item)==10:
                         isbn10=item
                     elif len(item)==13:
                         isbn13=item
             elif isinstance(rec.get("isbn"),str):
                 if len(rec.get("isbn"))==10:
                     isbn10=rec.get("isbn")
                 elif len(rec.get("isbn"))==13:
                     isbn13=rec.get("isbn")
         if isbn10 or isbn13:
             record=get_gndbyISBN(rec,search_host,search_port,isbn10,isbn13)
             if record:
                 rec=record
         if record or args.pipeline:
             print(json.dumps(rec,indent=None))
Example #10
    def run(self):
        """
        we iterate thru all the deleteLists and extract the correct PPNs and put them into the correct files, which are line-delimited PPNs
        """
        lok_epns = set()  # 3 sets for deduplication
        tit_ppns = set()
        norm_ppns = set()
        
        for f in os.listdir(self.today+"-delPPN/"):
            with open("{date}-delPPN/{file}".format(date=self.today, file=f)) as handle:
                for line in handle:
                    # dissect line
                    __date = line[0:5]  # YYDDD (two-digit year + day of year), WTF
                    __time = line[5:11]  # HHMMSS
                    d_type = line[11:12]  # epn = 9, titledata = A, normdata = B|C|D
                    __xpn = line[12:22]
                    __iln = line[22:26]  # only in epns
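                    # hypothetical sample line "21123123456901234567890001" would
                    # yield __date="21123", __time="123456", d_type="9",
                    # __xpn="0123456789", __iln="0001"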
                    # __xpn is an EPN and the trailing ILN is our configured ILN
                    if d_type == "9" and __iln == self.config["ILN"]:
                        lok_epns.add(__xpn)

                    # __xpn is a PPN for title data
                    elif d_type == "A":
                        tit_ppns.add(__xpn)

                    # __xpn is an authority data PPN
                    elif d_type in ("B", "C", "D"):
                        norm_ppns.add(__xpn)

                    # everything else is of no interest to us
                    else:
                        continue

        with open("{date}-delPPN/kxp-lok".format(date=self.today), "w") as lok:
            for epn in lok_epns:
                print(epn, file=lok)

        with open("{date}-delPPN/kxp-tit".format(date=self.today), "w") as tit:
            for ppn in tit_ppns:
                print(ppn, file=tit)

        with open("{date}-delPPN/kxp-norm".format(date=self.today), "w") as norm:
            for ppn in norm_ppns:
                print(ppn, file=norm)

        associated_ppns = set()
        """
        we iterate thru the epns and ther corresponding local data records, save the associated PPNs which are in field 004,
        if no local data record is refering to the associated ppn, then we call it a day or abgesigelt and delete it in all our title and resources indices
        """
        for lok_record in esidfilegenerator(host=self.config["indices"]["kxp-lok"][0]["host"],
                                            port=self.config["indices"]["kxp-lok"][0]["port"],
                                            index=self.config["indices"]["kxp-lok"][0]["_index"],
                                            type=self.config["indices"]["kxp-lok"][0]["_doc_type"],
                                            idfile="{date}-delPPN/kxp-lok".format(date=self.today),
                                            headless=True):
            if lok_record and "004" in lok_record:
                associated_ppns.add(lok_record["004"][0])

        with open("{date}-delPPN/associated-tit".format(date=self.today), "w") as assoc_tit:
            for ppn in associated_ppns:
                print(ppn, file=assoc_tit)
Example #11
def main():
    #argstuff
    parser = ArgumentParser(
        description=
        'Merging of local and title marc records in MarcXchange Json format on ElasticSearch'
    )
    parser.add_argument(
        '-title_host',
        type=str,
        help=
        'hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.'
    )
    parser.add_argument(
        '-title_port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-title_type',
                        type=str,
                        help='ElasticSearch Type to use')
    parser.add_argument('-title_index',
                        type=str,
                        help='ElasticSearch Index to use')
    parser.add_argument(
        '-title_server',
        type=str,
        help=
        "use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty"
    )
    parser.add_argument(
        '-local_host',
        type=str,
        help=
        'hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.'
    )
    parser.add_argument(
        '-local_port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-local_type',
                        type=str,
                        help='ElasticSearch Type to use')
    parser.add_argument('-local_index',
                        type=str,
                        help='ElasticSearch Index to use')
    parser.add_argument(
        '-local_server',
        type=str,
        help=
        "use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty"
    )
    parser.add_argument(
        '-selectbody',
        type=loads,
        default={"query": {
            "match": {
                "852.__.a.keyword": "DE-14"
            }
        }})
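    # the default selectbody restricts local records to holdings with 852 $a == "DE-14"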
    parser.add_argument('-idfile', type=str, help='idfile to use')
    parser.add_argument('-help', action="store_true", help="print this help")
    args = parser.parse_args()
    if args.help:
        parser.print_help(stderr)
        exit()
    if args.title_server:
        slashsplit = args.title_server.split("/")
        args.title_host = slashsplit[2].rsplit(":")[0]
        if isint(args.title_server.split(":")[2].rsplit("/")[0]):
            args.title_port = args.title_server.split(":")[2].split("/")[0]
        args.title_index = args.title_server.split("/")[3]
        if len(slashsplit) > 4:
            args.title_type = slashsplit[4]
    if args.local_server:
        slashsplit = args.local_server.split("/")
        args.local_host = slashsplit[2].rsplit(":")[0]
        if isint(args.local_server.split(":")[2].rsplit("/")[0]):
            args.local_port = args.local_server.split(":")[2].split("/")[0]
        args.local_index = args.local_server.split("/")[3]
        if len(slashsplit) > 4:
            args.local_type = slashsplit[4]

    if args.title_server or (args.title_host and args.title_port):
        td = Elasticsearch([{"host": args.title_host}], port=args.title_port)
    else:
        eprint("no server for title data submitted. exiting.")
        exit(-1)
    if (args.local_server or
        (args.local_host and args.local_port)) and args.idfile:
        ids = dict()
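        # map each local record's title PPN (field 004) to its holdings fields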
        for i, record in enumerate(
                esidfilegenerator(host=args.local_host,
                                  port=args.local_port,
                                  index=args.local_index,
                                  type=args.local_type,
                                  body=args.selectbody,
                                  source="852,004,938",
                                  idfile=args.idfile)):
            ids[record["_source"]["004"][0]] = {
                "852": record["_source"]["852"],
                "938": record["_source"]["852"]
            }
        titlerecords = td.mget(index=args.title_index,
                               doc_type=args.title_type,
                               body={"ids": [_id for _id in ids]})
        for record in titlerecords["docs"]:
            if "_source" in record:
                for field in ["852", "938"]:
                    record["_source"][field] = ids[record["_id"]][field]
                print(dumps(record["_source"]))
            else:
                eprint(dumps(record))
    elif not args.idfile and (args.local_server or
                              (args.local_host and args.local_port)):
        for records in esfatgenerator(host=args.local_host,
                                      port=args.local_port,
                                      index=args.local_index,
                                      type=args.local_type,
                                      body=args.selectbody,
                                      source="852,004,938"):
            ids = dict()
            for record in records:
                ids[record["_source"]["004"][0]] = {
                    "852": record["_source"]["852"],
                    "938": record["_source"]["852"]
                }
            try:
                titlerecords = td.mget(index=args.title_index,
                                       doc_type=args.title_type,
                                       body={"ids": [_id for _id in ids]})
            except NotFoundError:
                continue
            except RequestError:
                continue
            for record in titlerecords["docs"]:
                if "_source" in record:
                    for field in ["852", "938"]:
                        record["_source"][field] = ids[record["_id"]][field]
                    print(dumps(record["_source"]))
                else:
                    eprint(dumps(record))

    else:
        eprint("no server for local data submitted. exiting.")
        exit(-1)