def run(self):
    """
    Iterates over the JSON linked data generated in LODProcessFromRdi,
    enriches it with identifiers from entityfacts, subjects from the GND
    and identifiers from wikidata — plus geonames identifiers for
    geographic places — and ingests the result into an Elasticsearch node.
    Finally records the update date in the config index.
    """
    path = "{date}-aut-data".format(date=self.yesterday.strftime("%y%m%d"))
    for index in os.listdir(path):
        # run several enrichment steps before indexing the data
        for f in os.listdir(os.path.join(path, index)):
            cmd = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && zcat {fd} | ".format(
                fd=os.path.join(path, index, f))
            # with -pipeline, all the data gets through, not only enriched docs
            cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/entityfacts-bot.py -pipeline -stdin -searchserver {host} | ".format(
                **self.config)
            cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/gnd-sachgruppen.py -pipeline -stdin -searchserver {host} | ".format(
                **self.config)
            cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/wikidata.py -pipeline -stdin | "
            if index == "geo":
                # geographic places additionally get geonames identifiers
                cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/geonames.py -pipeline -stdin -searchserver {geonames_host} | ".format(
                    **self.config)
            cmd += "esbulk -verbose -server {host} -w 1 -size 20 -index {index} -type schemaorg -id identifier".format(
                **self.config, index=index)
            shellout(cmd)
    # persist the date of this successful run so downstream tasks can see it
    put_dict("{host}/date/actual/1".format(**self.config),
             {"date": str(self.yesterday.strftime("%Y-%m-%d"))})
def run(self):
    """
    Loads mapped finc data into the finc-resources Elasticsearch index
    (with help of esbulk) and records the update date on success.
    """
    # only ingest if the mapped dump actually contains data
    if os.stat("{date}-finc-fixed.ldj.gz".format(date=self.date)).st_size > 0:
        # note: was previously followed by a stray empty string literal ""
        cmd = ("esbulk -z -verbose -server {host} -w {workers} "
               "-index finc-resources -type schemaorg -id _id "
               "{date}-finc-fixed.ldj.gz").format(**self.config, date=self.date)
        shellout(cmd)
        # record the successful update date in the config index
        put_dict("{host}/date/actual/5".format(**self.config),
                 {"date": str(self.now)})
def run(self):
    """
    Enriches the generated authority JSON linked data (entityfacts, GND
    subject groups, wikidata, and geonames for geographic places) and
    ingests it into Elasticsearch, then records the update date.
    """
    path = "{date}-aut-data".format(date=self.yesterday.strftime("%y%m%d"))
    # removed unused local `enrichmentstr = []`
    for index in os.listdir(path):
        # run several enrichment steps before indexing the data
        for f in os.listdir(os.path.join(path, index)):
            # with -pipeline, all the data gets through, not only enriched docs
            cmd = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && zcat {fd} | ".format(
                fd=os.path.join(path, index, f))
            cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/entityfacts-bot.py -pipeline -stdin -searchserver {host} | ".format(
                **self.config)
            cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/gnd-sachgruppen.py -pipeline -stdin -searchserver {host} | ".format(
                **self.config)
            cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/wikidata.py -pipeline -stdin | "
            if index == "geo":
                # geographic places additionally get geonames identifiers
                cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/geonames.py -pipeline -stdin -searchserver {geonames_host} | ".format(
                    **self.config)
            cmd += "esbulk -verbose -server {host} -w 1 -size 20 -index {index} -type schemaorg -id identifier".format(
                **self.config, index=index)
            shellout(cmd)
    # persist the date of this successful run so downstream tasks can see it
    put_dict("{host}/date/actual/1".format(**self.config),
             {"date": str(self.yesterday.strftime("%Y-%m-%d"))})
def run(self):
    """
    Rebuilds the geo index: deletes any existing index, recreates it with
    an explicit geo_point mapping for the location field, and bulk-ingests
    the JSON data via esbulk.
    """
    # drop the old index; best-effort — it may not exist yet
    delete("{server}/{index}".format(**self.config))
    # recreate with a geo_point mapping so location becomes geo-searchable
    put_dict(
        "{server}/{index}".format(**self.config),
        {
            "mappings": {
                "{type}".format(**self.config): {
                    "properties": {
                        "location": {"type": "geo_point"}
                    }
                }
            }
        })
    cmd = ("esbulk -z -server {server} -index {index} -type {type} "
           "-w {workers} -id id -verbose {file}.ldj.gz").format(**self.config)
    shellout(cmd)
def run(self):
    """
    Ingests the processed title data into Elasticsearch, records the
    update date, and exports all resources offered by DE-14 with
    _sourceID 0 into slub_resources_sourceid0.ldj (gzip-compressed).
    """
    if os.stat("{date}.mrc.bz2".format(date=self.date)).st_size > 0:
        path = "{date}-data".format(date=self.date)
        for index in os.listdir(path):
            for f in os.listdir(os.path.join(path, index)):
                cmd = ("esbulk -z -verbose -server {host} -w {workers} "
                       "-index {index} -type schemaorg -id identifier {fd}").format(
                           **self.config, index=index,
                           fd=os.path.join(path, index, f))
                shellout(cmd)
        # record the successful update date in the config index
        put_dict("{host}/date/actual/4".format(**self.config),
                 {"date": str(self.now)})
        # parse host and port out of the configured URL
        # (assumes the form "http://host:port" — TODO confirm against config)
        netloc = "{host}".format(**self.config).rsplit("/")[2]
        es_host, es_port = netloc.rsplit(":")[0], netloc.rsplit(":")[1]
        # NOTE(review): the output file is gzip-compressed but carries no
        # .gz suffix — kept as-is since downstream consumers may rely on it
        with gzip.open("slub_resources_sourceid0.ldj", "wt") as outp:
            for record in esgenerator(
                    host=es_host,
                    port=es_port,
                    index="resources",
                    type="schemaorg",
                    body={
                        "query": {
                            "bool": {
                                "must": [{
                                    "match": {
                                        "offers.offeredBy.branchCode.keyword": "DE-14"
                                    }
                                }, {
                                    "match": {
                                        "_sourceID.keyword": "0"
                                    }
                                }]
                            }
                        }
                    },
                    headless=True):
                print(json.dumps(record), file=outp)
def run(self):
    """
    Loads processed GND data into Elasticsearch (with help of esbulk).

    For every configured (type, index) pair the old index is deleted,
    recreated with date detection disabled, and the matching
    {type}s.ldj.gz dump is bulk-ingested.
    """
    cmd = "esbulk -z -verbose -server http://{host}:{port} -w {workers}".format(
        **self.config)
    for doc_type, index_name in self.config.get("indices").items():
        shellout("curl -XDELETE http://{host}:{port}/{index}".format(
            **self.config, index=index_name))
        # BUG FIX: the mapping was previously PUT to http://{host}/{index}
        # without the port, so it never reached the server esbulk writes to
        put_dict(
            "http://{host}:{port}/{index}".format(**self.config,
                                                  index=index_name),
            # disable date detection so string fields are not coerced to dates
            {"mappings": {doc_type: {"date_detection": False}}})
        shellout(cmd + " -index {index} -type {type} -id id {type}s.ldj.gz".format(
            index=index_name, type=doc_type))
def run(self):
    """
    Ingests the data processed in LODTITProcessFromRdi into an
    Elasticsearch index and saves the date of the update into the config
    index if this was successful.
    """
    # only ingest if the source dump actually contains data
    if os.stat("{date}.mrc.bz2".format(date=self.date)).st_size > 0:
        path = "{date}-data".format(date=self.date)
        for index in os.listdir(path):
            for f in os.listdir(os.path.join(path, index)):
                cmd = ("esbulk -z -verbose -server {host} -w {workers} "
                       "-index {index} -type schemaorg -id identifier {fd}").format(
                           **self.config, index=index,
                           fd=os.path.join(path, index, f))
                shellout(cmd)
        # record the successful update date in the config index
        put_dict("{host}/date/actual/4".format(**self.config),
                 {"date": str(self.now)})
def run(self):
    """
    Rebuilds the geonames index from scratch: removes the old index,
    creates a fresh one whose location field is mapped as geo_point,
    and bulk-loads the JSON data via esbulk.
    """
    target = "{server}/{index}".format(**self.config)
    delete(target)
    mapping = {
        "mappings": {
            "{type}".format(**self.config): {
                "properties": {
                    "location": {"type": "geo_point"}
                }
            }
        }
    }
    put_dict(target, mapping)
    bulk_cmd = ("esbulk -z -server {server} -index {index} -type {type} "
                "-w {workers} -id id -verbose {file}.ldj.gz").format(**self.config)
    shellout(bulk_cmd)
def run(self):
    """
    Loads mapped finc data into the finc-resources Elasticsearch index
    (with help of esbulk) and records the update date on success.
    """
    # only ingest if the mapped dump actually contains data
    if os.stat("{date}-finc-fixed.ldj.gz".format(date=self.date)).st_size > 0:
        # note: original had a stray adjacent empty string literal here
        cmd = ("esbulk -z -verbose -server {host} -w {workers} "
               "-index finc-resources -type schemaorg -id _id "
               "{date}-finc-fixed.ldj.gz").format(**self.config, date=self.date)
        shellout(cmd)
        # record the successful update date in the config index
        put_dict("{host}/date/actual/5".format(**self.config),
                 {"date": str(self.now)})