Ejemplo n.º 1
0
def backup(conf):
    """Dump the ``_source`` of every document in an index to gzipped line-JSON.

    Scrolls the configured ElasticSearch index via ``esfatgenerator`` and
    appends one JSON line per document to a per-type
    ``<host>-<port>-<index>-<type>.ldj.gz`` file.  Any failure is logged
    (with traceback) to ``errors.txt`` instead of propagating.
    """
    try:
        batches = esfatgenerator(host=conf.get("host"),
                                 port=conf.get("port"),
                                 index=conf.get("index"),
                                 timeout=60)
        for batch in batches:
            if not batch:
                continue
            # one output file per document type, append mode so successive
            # batches of the same type land in the same archive
            outname = "{}-{}-{}-{}.ldj.gz".format(conf.get("host"),
                                                  conf.get("port"),
                                                  conf.get("index"),
                                                  batch[0].get("_type"))
            with gzip.open(outname, "at") as sink:
                for doc in batch:
                    if "_source" in doc:
                        print(json.dumps(doc["_source"]), file=sink)
    except Exception:
        # best-effort: record the traceback and swallow the error
        with open("errors.txt", 'a') as errfile:
            traceback.print_exc(file=errfile)
Ejemplo n.º 2
0
def test_esfatgenerator():
    """
    old test for deprecated esfatgenerator, which is still used in esmarc
    """
    # Build the records we expect the generator to hand back.
    expected = []
    for num in range(MAX):
        template = deepcopy(default_returnrecord)
        template["_id"] = str(num)
        template["_source"]["foo"] = num
        template["_source"]["bar"] = MAX - num
        template["_source"]["baz"] = "test{}".format(num)
        expected.append(dict(sorted(template.items())))

    # Drain the generator; drop the scroll "sort" key and normalise key order
    # so the comparison below is order-insensitive.
    received = []
    for batch in es2json.esfatgenerator(**default_kwargs):
        for item in batch:
            item.pop("sort")
            received.append(dict(sorted(item.items())))

    def by_id(rec):
        return rec["_id"]

    assert sorted(expected, key=by_id) == sorted(received, key=by_id)
Ejemplo n.º 3
0
def main():
    """Command-line entry point: entity splitting/recognition of MARC records.

    Reads records either from an ElasticSearch node (single id, id file,
    debug dump, or full scan) or from stdin, pushes each through
    ``process_line()``, and emits the resulting JSON entities either via a
    multiprocessing worker pool or directly to stdout.
    """
    #argstuff
    parser=argparse.ArgumentParser(description='Entitysplitting/Recognition of MARC-Records')
    parser.add_argument('-host',type=str,help='hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.')
    parser.add_argument('-port',type=int,default=9200,help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-type',type=str,help='ElasticSearch Type to use')
    parser.add_argument('-index',type=str,help='ElasticSearch Index to use')
    parser.add_argument('-id',type=str,help='map single document, given by id')
    parser.add_argument('-help',action="store_true",help="print this help")
    parser.add_argument('-z',action="store_true",help="use gzip compression on output data")
    parser.add_argument('-prefix',type=str,default="ldj/",help='Prefix to use for output data')
    parser.add_argument('-debug',action="store_true",help='Dump processed Records to stdout (mostly used for debug-purposes)')
    parser.add_argument('-server',type=str,help="use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty")
    parser.add_argument('-pretty',action="store_true",default=False,help="output tabbed json")
    parser.add_argument('-w',type=int,default=8,help="how many processes to use")
    parser.add_argument('-idfile',type=str,help="path to a file with IDs to process")
    parser.add_argument('-query',type=str,default={},help='prefilter the data based on an elasticsearch-query')
    parser.add_argument('-base_id_src',type=str,default="http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=",help="set up which base_id to use for sameAs. e.g. http://d-nb.info/gnd/xxx")
    parser.add_argument('-target_id',type=str,default="http://data.slub-dresden.de/",help="set up which target_id to use for @id. e.g. http://data.finc.info")
#    parser.add_argument('-lookup_host',type=str,help="Target or Lookup Elasticsearch-host, where the result data is going to be ingested to. Only used to lookup IDs (PPN) e.g. http://192.168.0.4:9200")
    args=parser.parse_args()
    if args.help:
        parser.print_help(sys.stderr)
        exit()        
    # -server is shorthand: http://host:port/index/type/id?pretty overrides
    # the individual host/port/index/type/id/pretty arguments by rewriting
    # the parsed args in place.
    if args.server:
        slashsplit=args.server.split("/")
        args.host=slashsplit[2].rsplit(":")[0]
        # NOTE(review): the port parsed out of the URL stays a string here
        # (unlike the int from argparse) — presumably accepted downstream by
        # the elasticsearch client; confirm.
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port=args.server.split(":")[2].split("/")[0]
        args.index=args.server.split("/")[3]
        if len(slashsplit)>4:
            args.type=slashsplit[4]
        if len(slashsplit)>5:
            # a trailing "?pretty" both enables pretty-printing and must be
            # stripped off the document id
            if "?pretty" in args.server:
                args.pretty=True
                args.id=slashsplit[5].rsplit("?")[0]
            else:
                args.id=slashsplit[5]
    # es is only bound when an ElasticSearch target was configured; the
    # single-id branch below is the only one using it directly.
    if args.server or ( args.host and args.port ):
        es=elasticsearch.Elasticsearch([{"host":args.host}],port=args.port)
    # base_id/target_id are module-level globals consumed elsewhere by the
    # record-processing code.
    global base_id
    global target_id
    base_id=args.base_id_src
    target_id=args.target_id
    if args.pretty:
        tabbing=4
    else:
        tabbing=None
        
    # Dispatch on the argument combination, most specific first:
    # 1) single document fetched by id, printed to stdout
    if args.host and args.index and args.type and args.id:
        json_record=None
        source=get_source_include_str()
        json_record=es.get_source(index=args.index,doc_type=args.type,id=args.id,_source=source)
        if json_record:
            print(json.dumps(process_line(json_record,args.host,args.port,args.index,args.type),indent=tabbing))
    # 2) list of ids from a file, processed by a worker pool
    elif args.host and args.index and args.type and args.idfile:
        setupoutput(args.prefix)
        pool = Pool(args.w,initializer=init_mp,initargs=(args.host,args.port,args.prefix,args.z))
        for ldj in esidfilegenerator(host=args.host,
                       port=args.port,
                       index=args.index,
                       type=args.type,
                       source=get_source_include_str(),
                       body=args.query,
                       idfile=args.idfile
                        ):
            pool.apply_async(worker,args=(ldj,))
        pool.close()
        pool.join()
    # 3) debug mode: iterate serially in-process and dump to stdout
    elif args.host and args.index and args.type and args.debug:
        init_mp(args.host,args.port,args.prefix,args.z)
        for ldj in esgenerator(host=args.host,
                       port=args.port,
                       index=args.index,
                       type=args.type,
                       source=get_source_include_str(),
                       headless=True,
                       body=args.query
                        ): 
            record = process_line(ldj,args.host,args.port,args.index,args.type)
            if record:
                for k in record:
                    print(json.dumps(record[k],indent=None))
    # 4) full index scan in fat batches, processed by a worker pool
    elif args.host and args.index and args.type : #if inf not set, than try elasticsearch
        setupoutput(args.prefix)
        pool = Pool(args.w,initializer=init_mp,initargs=(args.host,args.port,args.prefix,args.z))
        for ldj in esfatgenerator(host=args.host,
                       port=args.port,
                       index=args.index,
                       type=args.type,
                       source=get_source_include_str(),
                       body=args.query
                        ):
            pool.apply_async(worker,args=(ldj,))
        pool.close()
        pool.join()
    else: #oh noes, no elasticsearch input-setup. then we'll use stdin
        eprint("No host/port/index specified, trying stdin\n")
        # NOTE(review): init_mp is called with "DEBUG" placeholders for
        # port/prefix/z here, unlike the real values above — confirm intent.
        init_mp("localhost","DEBUG","DEBUG","DEBUG")
        with io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') as input_stream:
            for line in input_stream:
                ret=process_line(json.loads(line),"localhost",9200,"data","mrc")
                if isinstance(ret,dict):
                    for k,v in ret.items():
                        print(json.dumps(v,indent=tabbing))
 # NOTE(review): scrape artifact — this span is a fragment of a different
 # function: the "})" below closes a call opened outside the visible text,
 # and the final esfatgenerator(...) call is truncated. Kept byte-identical.
 })
 f = args.format
 # Unless this is a single-doc or debug invocation, fan the work out to a
 # multiprocessing pool initialised with the shared context.
 if not args.doc or not args.debug:
     pool = Pool(processes=args.w,
                 initializer=init,
                 initargs=(
                     l,
                     c,
                     True,
                     i,
                     f,
                 ))
 # scroll mode: hand each fat batch of records to the pool asynchronously
 if args.scroll and not args.debug:
     for fatload in esfatgenerator(
             host=args.host,
             port=args.port,
             type=args.type,
             index=args.index,
             source_exclude="_isil,_recorddate,identifier"):
         pool.apply_async(adjust_or_get_context_elasticsearchScroll,
                          args=(fatload, ))
 # scroll + debug: initialise in-process and iterate serially instead
 elif args.scroll and args.debug:
     init(
         l,
         c,
         True,
         i,
         f,
     )
     for fatload in esfatgenerator(
             host=args.host,
             port=args.port,
def main():
    """Merge local and title MARC records stored in ElasticSearch.

    Command-line entry point: scrolls the local-data index in fat batches,
    collects the holdings fields 852/938 keyed by the title-record id
    (field 004), fetches the matching title records via ``mget``, splices
    the holdings into each title record and prints it as one JSON line to
    stdout.  Title records without a match are echoed to stderr.
    """
    #argstuff
    parser = ArgumentParser(
        description=
        'Merging of local and title marc records in MarcXchange Json format on ElasticSearch'
    )
    parser.add_argument(
        '-title_host',
        type=str,
        help=
        'hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.'
    )
    parser.add_argument(
        '-title_port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-title_type',
                        type=str,
                        help='ElasticSearch Type to use')
    parser.add_argument('-title_index',
                        type=str,
                        help='ElasticSearch Index to use')
    parser.add_argument(
        '-title_server',
        type=str,
        help=
        "use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty"
    )
    parser.add_argument(
        '-local_host',
        type=str,
        help=
        'hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.'
    )
    parser.add_argument(
        '-local_port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-local_type',
                        type=str,
                        help='ElasticSearch Type to use')
    parser.add_argument('-local_index',
                        type=str,
                        help='ElasticSearch Index to use')
    parser.add_argument(
        '-local_server',
        type=str,
        help=
        "use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty"
    )
    parser.add_argument(
        '-selectbody',
        type=loads,
        default={"query": {
            "match": {
                "852.__.a.keyword": "DE-14"
            }
        }})
    parser.add_argument('-help', action="store_true", help="print this help")
    args = parser.parse_args()
    if args.help:
        parser.print_help(stderr)
        exit()
    # -title_server / -local_server are shorthands overriding the individual
    # host/port/index/type arguments.
    if args.title_server:
        slashsplit = args.title_server.split("/")
        args.title_host = slashsplit[2].rsplit(":")[0]
        if isint(args.title_server.split(":")[2].rsplit("/")[0]):
            args.title_port = args.title_server.split(":")[2].split("/")[0]
        args.title_index = args.title_server.split("/")[3]
        if len(slashsplit) > 4:
            # fix: this previously clobbered args.local_type — the type parsed
            # out of -title_server belongs to the title index
            args.title_type = slashsplit[4]
    if args.local_server:
        slashsplit = args.local_server.split("/")
        args.local_host = slashsplit[2].rsplit(":")[0]
        if isint(args.local_server.split(":")[2].rsplit("/")[0]):
            args.local_port = args.local_server.split(":")[2].split("/")[0]
        args.local_index = args.local_server.split("/")[3]
        if len(slashsplit) > 4:
            args.local_type = slashsplit[4]

    if args.title_server or (args.title_host and args.title_port):
        td = Elasticsearch([{"host": args.title_host}], port=args.title_port)
    else:
        eprint("no server for title data submitted. exiting.")
        exit(-1)
    if args.local_server or (args.local_host and args.local_port):
        # fix: body previously read args.searchbody, which argparse never
        # defines (the option is -selectbody) and would raise AttributeError
        for records in esfatgenerator(host=args.local_host,
                                      port=args.local_port,
                                      index=args.local_index,
                                      type=args.local_type,
                                      body=args.selectbody,
                                      source="852,004,938"):
            # map title-record id (field 004) -> holdings fields of the batch
            ids = dict()
            for record in records:
                ids[record["_source"]["004"][0]] = {
                    "852": record["_source"]["852"],
                    # fix: the 938 slot previously received a copy of 852
                    "938": record["_source"]["938"]
                }
            try:
                titlerecords = td.mget(index=args.title_index,
                                       doc_type=args.title_type,
                                       body={"ids": list(ids)})
            except NotFoundError:
                continue
            except RequestError:
                continue
            # splice the collected holdings into each found title record
            for record in titlerecords["docs"]:
                if "_source" in record:
                    for field in ["852", "938"]:
                        record["_source"][field] = ids[record["_id"]][field]
                    print(dumps(record["_source"]))
                else:
                    # mget returned no source for this id; report it
                    eprint(dumps(record))

    else:
        eprint("no server for local data submitted. exiting.")
        exit(-1)