Example no. 1
0
 def run(self):
     """
     Iterate over the JSON linked data generated in LODProcessFromRdi,
     enrich each record with identifiers from entityfacts, subjects from
     the GND, identifiers from wikidata and - for geographic places -
     identifiers from geonames, then bulk-ingest everything into an
     elasticsearch node and record yesterday's date under /date/actual/1.
     """
     path = "{date}-aut-data".format(date=self.yesterday.strftime("%y%m%d"))
     for index in os.listdir(path):
         for f in os.listdir(path + "/" + index):
             # build the enrichment pipeline step by step; with -pipeline,
             # every document passes through, not only the enriched ones
             steps = [
                 ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && zcat {fd} | ".format(
                     fd=path + "/" + index + "/" + f),
                 "~/git/efre-lod-elasticsearch-tools/enrichment/entityfacts-bot.py   -pipeline -stdin -searchserver {host} | ".format(
                     **self.config),
                 "~/git/efre-lod-elasticsearch-tools/enrichment/gnd-sachgruppen.py   -pipeline -stdin -searchserver {host} | ".format(
                     **self.config),
                 "~/git/efre-lod-elasticsearch-tools/enrichment/wikidata.py          -pipeline -stdin | ",
             ]
             # geographic places additionally get geonames identifiers
             if index == "geo":
                 steps.append(
                     "~/git/efre-lod-elasticsearch-tools/enrichment/geonames.py       -pipeline -stdin -searchserver {geonames_host} | ".format(
                         **self.config))
             steps.append(
                 "esbulk -verbose -server {host} -w 1 -size 20 -index {index} -type schemaorg -id identifier".format(
                     **self.config, index=index))
             shellout("".join(steps))
     put_dict("{host}/date/actual/1".format(**self.config),
              {"date": str(self.yesterday.strftime("%Y-%m-%d"))})
 def run(self):
     """
     Loads mapped data into a given ElasticSearch index (with help of
     esbulk), then records the current date under /date/actual/5.
     """
     fname = "{date}-finc-fixed.ldj.gz".format(date=self.date)
     # only ingest when the fixed dump actually contains data
     if os.stat(fname).st_size > 0:
         shellout(
             "esbulk -z -verbose -server {host} -w {workers} "
             "-index finc-resources -type schemaorg -id _id "
             "{date}-finc-fixed.ldj.gz".format(**self.config,
                                               date=self.date))
         put_dict("{host}/date/actual/5".format(**self.config),
                  {"date": str(self.now)})
 def run(self):
     """
     Enriches the JSON linked data generated in LODProcessFromRdi
     (entityfacts identifiers, GND subject groups, wikidata identifiers
     and - for the "geo" index - geonames identifiers), ingests it into
     an elasticsearch node via esbulk, then stores yesterday's date
     under /date/actual/1.
     """
     # fixes: removed the never-used `enrichmentstr` list and the unused
     # `output =` binding; normalized spacing per PEP 8
     path = "{date}-aut-data".format(date=self.yesterday.strftime("%y%m%d"))
     for index in os.listdir(path):
         # doing several enrichment things before indexing the data
         for f in os.listdir(path + "/" + index):
             # with -pipeline, all the data gets through, not only enriched docs
             cmd = ". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && zcat {fd} | ".format(fd=path + "/" + index + "/" + f)
             cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/entityfacts-bot.py   -pipeline -stdin -searchserver {host} | ".format(**self.config)
             cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/gnd-sachgruppen.py   -pipeline -stdin -searchserver {host} | ".format(**self.config)
             cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/wikidata.py          -pipeline -stdin | "
             if index == "geo":
                 cmd += "~/git/efre-lod-elasticsearch-tools/enrichment/geonames.py       -pipeline -stdin -searchserver {geonames_host} | ".format(**self.config)
             cmd += "esbulk -verbose -server {host} -w 1 -size 20 -index {index} -type schemaorg -id identifier".format(**self.config, index=index)
             shellout(cmd)
     put_dict("{host}/date/actual/1".format(**self.config),
              {"date": str(self.yesterday.strftime("%Y-%m-%d"))})
 def run(self):
     """
     Recreates the configured index with a type:geo_point mapping on the
     "location" property, then bulk-ingests {file}.ldj.gz via esbulk.
     """
     # fixes: removed the unused `r =` and `output =` bindings and added
     # a docstring (cf. the documented variant of this task in this file)
     # drop any pre-existing index; the HTTP response is intentionally ignored
     delete("{server}/{index}".format(**self.config))
     put_dict(
         "{server}/{index}".format(**self.config), {
             "mappings": {
                 "{type}".format(**self.config): {
                     "properties": {
                         "location": {
                             "type": "geo_point"
                         }
                     }
                 }
             }
         })
     cmd = "esbulk -z -server {server} -index {index} -type {type} -w {workers} -id id -verbose {file}.ldj.gz".format(
         **self.config)
     shellout(cmd)
    def run(self):
        """
        Bulk-ingests the per-index data files under {date}-data into
        elasticsearch via esbulk, records the update date under
        /date/actual/4, then dumps all "resources" documents with
        branchCode DE-14 and _sourceID 0 into a local file.
        """
        # only ingest when the source MARC dump actually contains data
        if os.stat("{date}.mrc.bz2".format(date=self.date)).st_size > 0:
            path = "{date}-data".format(date=self.date)
            for index in os.listdir(path):
                for f in os.listdir(path + "/" + index):
                    cmd = "esbulk -z -verbose -server {host} -w {workers} -index {index} -type schemaorg -id identifier {fd}".format(
                        **self.config,
                        index=index,
                        fd=path + "/" + index + "/" + f)
                    output = shellout(cmd)
        #for f in os.listdir(path+"/resources"):
        #    cmd=". ~/git/efre-lod-elasticsearch-tools/init_environment.sh && "
        #    cmd+="~/git/efre-lod-elasticsearch-tools/processing/merge2move.py -server {host} -stdin < {fd} | ".format(**self.config,fd=path+"/resources/"+f)
        #    cmd+="~/git/efre-lod-elasticsearch-tools/enrichment/sameAs2id.py  -searchserver {host} -stdin  | ".format(**self.config,fd=path+"/resources/"+f)
        #    cmd+="esbulk -verbose -server {rawdata_host} -w {workers} -index {index} -type schemaorg -id identifier".format(**self.config,index="resources-fidmove")
        #    output=shellout(cmd)
        # record the successful update date (written even when the dump was empty)
        put_dict("{host}/date/actual/4".format(**self.config),
                 {"date": str(self.now)})
        # NOTE(review): gzip.open writes gzip-compressed bytes into a file
        # without a .gz suffix — confirm downstream readers expect that
        with gzip.open("slub_resources_sourceid0.ldj", "wt") as outp:
            # host/port are sliced out of the config URL, e.g.
            # "http://example.org:9200" -> host "example.org", port "9200"
            for record in esgenerator(
                    host="{host}".format(
                        **self.config).rsplit("/")[2].rsplit(":")[0],
                    port="{host}".format(
                        **self.config).rsplit("/")[2].rsplit(":")[1],
                    index="resources",
                    type="schemaorg",
                    body={
                        "query": {
                            "bool": {
                                "must": [{
                                    "match": {
                                        "offers.offeredBy.branchCode.keyword":
                                        "DE-14"
                                    }
                                }, {
                                    "match": {
                                        "_sourceID.keyword": "0"
                                    }
                                }]
                            }
                        }
                    },
                    headless=True):
                # one JSON document per line (newline-delimited JSON)
                print(json.dumps(record), file=outp)
 def run(self):
     """
     Loads processed GND data into a given ElasticSearch index (with help of esbulk).

     For every (type, index) pair in config["indices"]: deletes the index,
     recreates it with date_detection disabled, then bulk-indexes
     {type}s.ldj.gz into it.
     """
     cmd = "esbulk -z -verbose -server http://{host}:{port} -w {workers}".format(
         **self.config)
     for k, v in self.config.get("indices").items():
         # drop the old index before re-ingesting
         shellout("curl -XDELETE http://{host}:{port}/{index}".format(
             **self.config, index=v))
         # BUGFIX: the mapping was PUT to http://{host}/{index} without the
         # port, so it did not target the same server address as the DELETE
         # and the esbulk call above/below; include :{port} here as well
         put_dict("http://{host}:{port}/{index}".format(**self.config,
                                                        index=v),
                  {"mappings": {
                      k: {
                          "date_detection": False
                      }
                  }})
         shellout(cmd +
                  " -index {index} -type {type} -id id {type}s.ldj.gz".format(
                      index=v, type=k))
    def run(self):
        """
        ingests the data processed in LODTITProcessFromRdi into an
        elasticsearch-index and saves the date of the update into the
        config file if this was successfull
        """
        # skip the ingest entirely when the source MARC dump is empty
        if os.stat("{date}.mrc.bz2".format(date=self.date)).st_size > 0:
            basedir = "{date}-data".format(date=self.date)
            for idx in os.listdir(basedir):
                subdir = basedir + "/" + idx
                for fname in os.listdir(subdir):
                    shellout(
                        "esbulk -z -verbose -server {host} -w {workers} -index {index} -type schemaorg -id identifier {fd}".format(
                            **self.config,
                            index=idx,
                            fd=subdir + "/" + fname))
        # record the update date for downstream consumers
        put_dict("{host}/date/actual/4".format(**self.config),
                 {"date": str(self.now)})
 def run(self):
     """
     deletes the geonames index, creates a geonames index with the
     proper type:geo_point mapping, then ingests the json data
     """
     target = "{server}/{index}".format(**self.config)
     delete(target)
     # mapping: the configured type gets a geo_point "location" property
     mapping = {
         "mappings": {
             "{type}".format(**self.config): {
                 "properties": {
                     "location": {
                         "type": "geo_point"
                     }
                 }
             }
         }
     }
     put_dict(target, mapping)
     shellout(
         "esbulk -z -server {server} -index {index} -type {type} -w {workers} -id id -verbose {file}.ldj.gz".format(
             **self.config))
Example no. 9
0
 def run(self):
     """
     Bulk-indexes {date}-finc-fixed.ldj.gz into the finc-resources index
     (when it is non-empty) and stores the current date under
     /date/actual/5.
     """
     dumpfile = "{date}-finc-fixed.ldj.gz".format(date=self.date)
     if os.stat(dumpfile).st_size > 0:
         cmd = ("esbulk -z -verbose -server {host} -w {workers} "
                "-index finc-resources -type schemaorg "
                "-id _id ").format(**self.config) + dumpfile
         shellout(cmd)
         put_dict("{host}/date/actual/5".format(**self.config),
                  {"date": str(self.now)})