def index_webentity(web_entity_pile,web_entity_done_pile,conf,mainlog):
    processlog=TimeElapsedLogging.create_log(str(os.getpid()),filename="logs/by_pid/%s.log"%os.getpid())
    processlog.info("starting infinite loop")
    corpus = conf['hyphe-core']['corpus_id']
    solr = sunburnt.SolrInterface("http://%s:%s/solr/%s" % (conf["solr"]['host'], conf["solr"]['port'], get_solr_instance_name(conf["solr"]['path'])))
    hyphe_core=jsonrpclib.Server('http://%s:%s'%(conf["hyphe-core"]["host"], conf["hyphe-core"]["port"]), version=1)
    db = pymongo.MongoClient(conf['mongo']['host'], conf['mongo']['port'])
    collname = "%s.pages" % conf['hyphe-core']['corpus_id']
    coll = db[conf["mongo"]["db"]][collname]
    while True :
        we=web_entity_pile.get()

        # logging in proc log
        processlog.info("%s: starting processing"%we["name"])

        #setting LOG
        web_entity_name_safe=re.sub(r"[\W]","",we['name'])
        web_entity_log_id="%s_%s"%(web_entity_name_safe,we["id"])
        logfilename="logs/by_web_entity/%s.log"%(web_entity_log_id[:80])
        errors_solr_document_filename="logs/errors_solr_document/%s.json"%(web_entity_log_id[:80])
        welog=TimeElapsedLogging.create_log(we["id"],filename=logfilename)

        #getting web pages URLS
        welog.log(logging.INFO,"retrieving pages of web entity %s"%(we["name"]))
        #mainlog.info("DEBUG %s"%(we["id"]))
        try:
            web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        except Exception:
            # retry once in case the JSON-RPC call failed transiently
            web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)

        if (web_pages['code'] == 'fail') :
            mainlog.info(web_pages['message'])
        welog.log(logging.INFO,"retrieved %s pages of web entity %s"%(len(web_pages["result"]),we["name"]))
        we["web_pages"]=web_pages["result"]

        processlog.info("%s: got %s webpages"%(we["name"],len(we["web_pages"])))

        #getting mongo html web page
        urls=[page["url"] for page in we["web_pages"]] #if page["http_status"]!=0]
        nb_urls=len(urls)
        last_id=""
        pages_mongo=[]
        nb_pages_mongo=0
        nb_pages_indexed=0
        i=0
        url_slice_len=1000
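        # walk the URL list in slices so each mongo $in query stays bounded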
        welog.info("retrieving + indexing HTML pages from mongo to solr of web entity %s"%(we["name"]))

        while i<len(urls) :
            urls_slice=urls[i:i+url_slice_len]
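            # fetch only pages crawled with status 200, of an accepted content
            # type, and with a stored body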
            pages_mongo_slice=list(coll.find({
                    "url": {"$in": urls_slice},
                    "status": 200,
                    "content_type": {"$in": accepted_content_types},
                    "body" : {"$exists":True}
                },
                projection=["_id","encoding","url","lru","depth","body"]))
            #mainlog.info(str(len(pages_mongo_slice)))
            #local counters
            nb_slice_mongo=len(pages_mongo_slice)
            nb_slice_indexed=0

            welog.info("%s %s: got %s pages in slice %s %s"%(we["name"],we["id"],nb_slice_mongo,i,len(urls_slice)))

            error_solr_doc=[]
            for page_mongo in pages_mongo_slice:
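                # stored page bodies are zlib-compressed; Python 2's "zip" codec inflates them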
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding",""))
                    encoding = page_mongo.get("encoding","")
                except Exception :
                    body = body.decode("UTF8","replace")
                    encoding = "UTF8-replace"
                solr_document={
                    "id":page_mongo["_id"],
                    "web_entity":we["name"],
                    "web_entity_id":we["id"],
                    "web_entity_status":we["status"],
                    "corpus":conf['hyphe-core']['corpus_id'],
                    "encoding":encoding,
                    "original_encoding":page_mongo.get("encoding",""),
                    "url":page_mongo["url"],
                    "lru":page_mongo["lru"],
                    "depth":page_mongo["depth"],
                    #"html":body,
                    "text":html2text.textify(body, extractor="raw", encoding=encoding)
                    #"textCanola":html2text.textify(body, extractor="CanolaExtractor", encoding=encoding)
                }

                try:
                     solr.add(solr_document)
                     nb_slice_indexed+=1
                except Exception as e:
                    # mainlog.info("ERROR %s: %s %s" %(type(e),e, solr_document))
                    #welog.debug("Exception with document :%s %s %s"%(solr_document["id"],solr_document["url"],solr_document["encoding"]))
                    error_solr_doc.append({"text": solr_document["text"],"body": body, "error": "%s: %s" % (type(e), e), "url":solr_document["url"],"encoding":solr_document["encoding"],"original_encoding":solr_document["original_encoding"]})
                    # import traceback
                    # traceback.print_exc()
            if len(error_solr_doc) >0 :
                with open(errors_solr_document_filename,"a") as errors_solr_document_json_file :
                    json.dump(error_solr_doc,errors_solr_document_json_file,indent=4)
            del error_solr_doc
            #log
            welog.info("%s %s: indexed %s pages"%(we["name"],we["id"],nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s"%(nb_slice_indexed,(we["name"])))
            # global counters
            nb_pages_mongo+=nb_slice_mongo
            nb_pages_indexed+=nb_slice_indexed
            i=i+url_slice_len


        del we["web_pages"]
        del web_pages
        del urls

        welog.log(logging.INFO,"'%s' indexed (%s web pages on %s)"%(we["name"],nb_pages_indexed,nb_pages_mongo))
        try:
            solr.commit()
        except Exception as e:
            mainlog.info("ERROR %s: %s" %(type(e), e))
            mainlog.info("Retrying...")
            try:
                solr.commit()
            except Exception as e:
                mainlog.info("STILL BROKEN, giving up on %s %s" % (we['id'], we['name']))
                # if both commits fail, rely on solr autocommit
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages"%(we["name"],nb_pages_indexed, nb_pages_mongo))
        #adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()

# extract the solr core name: the last path component of conf["solr"]["path"]
re_solrname = re.compile(r"^.*/([^/]+)$")
get_solr_instance_name = lambda solrpath: re_solrname.sub(r"\1", solrpath)
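
# Not shown in this excerpt: how the worker processes get spawned. A minimal
# sketch, assuming the piles are multiprocessing.JoinableQueue instances (which
# the get()/task_done() protocol above implies); nb_processes and web_entities
# are placeholders:
#
#     from multiprocessing import JoinableQueue, Process
#
#     web_entity_pile = JoinableQueue()
#     web_entity_done_pile = JoinableQueue()
#     for _ in range(nb_processes):
#         p = Process(target=index_webentity,
#                     args=(web_entity_pile, web_entity_done_pile, conf, mainlog))
#         p.daemon = True
#         p.start()
#     for we in web_entities:
#         web_entity_pile.put(we)
#     web_entity_pile.join()  # returns once task_done() was called for each put()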


if __name__=='__main__':

    # usage :
    # --delete_index
    parser = argparse.ArgumentParser()
    parser.add_argument("-d","--delete_index", action='store_true', help="delete solr index before (re)indexing.\n\rWARNING all previous indexing work will be lost.")
    args = parser.parse_args()


    mainlog=TimeElapsedLogging.create_log("main")
    #Load conf
    try:
        with open('config.json') as confile:
            conf = json.loads(confile.read())
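
        # a minimal config.json covering every key this script reads might look
        # like this (hosts, ports and paths are illustrative placeholders):
        # {
        #     "hyphe-core": {"host": "localhost", "port": 6978, "corpus_id": "mycorpus"},
        #     "solr": {"host": "localhost", "port": 8983, "path": "solr/hyphe"},
        #     "mongo": {"host": "localhost", "port": 27017, "db": "hyphe"}
        # }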
    except Exception as e:
        sys.stderr.write("%s: %s\n" % (type(e), e))
        sys.stderr.write('ERROR: Could not read configuration\n')
        sys.exit(1)

    try:
        # create each log directory (also completes a partially created tree)
        for logdir in ("logs", "logs/by_pid", "logs/by_web_entity", "logs/errors_solr_document"):
            if not os.path.exists(logdir):
                os.makedirs(logdir)
    except OSError as e:
        sys.stderr.write("%s: %s\n" % (type(e), e))
        sys.stderr.write('ERROR: Could not create log directories\n')
        sys.exit(1)
Example #3
def index_webentity(web_entity_pile, web_entity_done_pile, conf, mainlog):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" %
                                               os.getpid())
    processlog.info("starting infinite loop")
    corpus = conf['hyphe-core']['corpus_id']
    solr = sunburnt.SolrInterface(
        "http://%s:%s/solr/%s" %
        (conf["solr"]['host'], conf["solr"]['port'],
         get_solr_instance_name(conf["solr"]['path'])))
    hyphe_core = jsonrpclib.Server(
        'http://%s:%s' %
        (conf["hyphe-core"]["host"], conf["hyphe-core"]["port"]),
        version=1)
    db = pymongo.MongoClient(conf['mongo']['host'], conf['mongo']['port'])
    collname = "%s.pages" % conf['hyphe-core']['corpus_id']
    coll = db[conf["mongo"]["db"]][collname]
    while True:
        we = web_entity_pile.get()

        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])

        #setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)

        #getting web pages URLS
        welog.log(logging.INFO,
                  "retrieving pages of web entity %s" % (we["name"]))
        #mainlog.info("DEBUG %s"%(we["id"]))
        web_pages = hyphe_core.store.get_webentity_pages(
            we["id"], True, corpus)
        if (web_pages['code'] == 'fail'):
            mainlog.info(web_pages['message'])
        welog.log(
            logging.INFO, "retrieved %s pages of web entity %s" %
            (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]

        processlog.info("%s: got %s webpages" %
                        (we["name"], len(we["web_pages"])))

        #getting mongo html web page
        urls = [page["url"]
                for page in we["web_pages"]]  #if page["http_status"]!=0]
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info(
            "retrieving + indexing HTML pages from mongo to solr of web entity %s"
            % (we["name"]))

        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            pages_mongo_slice = list(
                coll.find(
                    {
                        "url": {
                            "$in": urls_slice
                        },
                        "status": 200,
                        "content_type": {
                            "$in": accepted_content_types
                        },
                        "body": {
                            "$exists": True
                        }
                    },
                    projection=[
                        "_id", "encoding", "url", "lru", "depth", "body"
                    ]))
            #mainlog.info(str(len(pages_mongo_slice)))
            #local counters
            nb_slice_mongo = len(pages_mongo_slice)
            nb_slice_indexed = 0

            welog.info(
                "%s %s: got %s pages in slice %s %s" %
                (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))

            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    "corpus": conf['hyphe-core']['corpus_id'],
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    #"html":body,
                    "text": html2text.textify(body, encoding)
                }

                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    # mainlog.info("ERROR %s: %s %s" %(type(e),e, solr_document))
                    #welog.debug("Exception with document :%s %s %s"%(solr_document["id"],solr_document["url"],solr_document["encoding"]))
                    error_solr_doc.append({
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
                    # import traceback
                    # traceback.print_exc()
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename,
                          "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc,
                              errors_solr_document_json_file,
                              indent=4)
            del error_solr_doc
            #log
            welog.info("%s %s: indexed %s pages" %
                       (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s"%(nb_slice_indexed,(we["name"])))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len

        del we["web_pages"]
        del web_pages
        del urls

        welog.log(
            logging.INFO, "'%s' indexed (%s web pages out of %s)" %
            (we["name"], nb_pages_indexed, nb_pages_mongo))
        try:
            solr.commit()
        except Exception as e:
            mainlog.info("ERROR %s: %s" % (type(e), e))
            mainlog.info("Retrying...")
            try:
                solr.commit()
            except Exception as e:
                mainlog.info("STILL BROKEN, giving up on %s %s" %
                             (we['id'], we['name']))

                # if both commits fail, rely on solr autocommit
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages" %
                        (we["name"], nb_pages_indexed, nb_pages_mongo))
        #adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()
Example #4
if __name__ == '__main__':

    # usage :
    # --delete_index
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d", "--delete_index", action='store_true',
        help="delete solr index before (re)indexing.\nWARNING: all previous indexing work will be lost.")
    args = parser.parse_args()

    mainlog = TimeElapsedLogging.create_log("main")
    #Load conf
    try:
        with open('config.json') as confile:
            conf = json.loads(confile.read())
    except Exception as e:
        sys.stderr.write("%s: %s\n" % (type(e), e))
        sys.stderr.write('ERROR: Could not read configuration\n')
        sys.exit(1)

    try:
        # create each log directory (also completes a partially created tree)
        for logdir in ("logs", "logs/by_pid", "logs/by_web_entity", "logs/errors_solr_document"):
            if not os.path.exists(logdir):
                os.makedirs(logdir)
    except OSError as e:
        sys.stderr.write("%s: %s\n" % (type(e), e))
        sys.stderr.write('ERROR: Could not create log directories\n')
        sys.exit(1)
def index_webentity(web_entity_pile,web_entity_done_pile,hyphe_core,coll,solr):
    processlog=TimeElapsedLogging.create_log(str(os.getpid()),filename="logs/by_pid/%s.log"%os.getpid())
    processlog.info("starting infinite loop")
    #hyphe_core=jsonrpclib.Server(hyphe_core_url)
    while True :
        we=web_entity_pile.get()
        # logging in proc log
        processlog.info("%s: starting processing"%we["name"])

        #setting LOG
        web_entity_name_safe=re.sub(r"[\W]","",we['name'])
        web_entity_log_id="%s_%s"%(web_entity_name_safe,we["id"])
        logfilename="logs/by_web_entity/%s.log"%web_entity_log_id
        errors_solr_document_filename="logs/errors_solr_document/%s.json"%web_entity_log_id
        welog=TimeElapsedLogging.create_log(we["id"],filename=logfilename)

        #getting web pages URLS
        welog.log(logging.INFO,"retrieving pages of web entity %s"%(we["name"]))
        web_pages = hyphe_core.store.get_webentity_pages(we["id"])
        welog.log(logging.INFO,"retrieved %s pages of web entity %s"%(len(web_pages["result"]),we["name"]))
        we["web_pages"]=web_pages["result"]

        processlog.info("%s: got %s webpages"%(we["name"],len(we["web_pages"])))

        #getting mongo html web page
        urls=[page["url"] for page in we["web_pages"]] #if page["http_status"]!=0]
        nb_urls=len(urls)
        last_id=""
        pages_mongo=[]
        nb_pages_mongo=0
        nb_pages_indexed=0
        i=0
        url_slice_len=1000
        welog.info("retrieving + indexing HTML pages from mongo to solr of web entity %s"%(we["name"]))
        while i<len(urls) :
            urls_slice=urls[i:i+url_slice_len]
            pages_mongo_slice=coll.find({
                    "url": {"$in": urls_slice},
                    "content_type": {"$in": accepted_content_types},
                    "body" : {"$exists":True}
                },
                fields=["_id","encoding","url","lru","depth","body"])

            #local counters
            nb_slice_mongo=pages_mongo_slice.count()
            nb_slice_indexed=0

            welog.info("%s %s: got %s pages in slice %s %s"%(we["name"],we["id"],nb_slice_mongo,i,len(urls_slice)))

            error_solr_doc=[]
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding",""))
                    encoding = page_mongo.get("encoding","")
                except Exception :
                    body = body.decode("UTF8","replace")
                    encoding = "UTF8-replace"
                solr_document={
                    "id":page_mongo["_id"],
                    "web_entity":we["name"],
                    "web_entity_id":we["id"],
                    "web_entity_status":we["status"],
                    "corpus":"hyphe",
                    "encoding":encoding,
                    "original_encoding":page_mongo.get("encoding",""),
                    "url":page_mongo["url"],
                    "lru":page_mongo["lru"],
                    "depth":page_mongo["depth"],
                    "html":body,
                    "text":html2text.textify(body)
                }

                try:
                     solr.add(solr_document)
                     nb_slice_indexed+=1
                except Exception as e:
                    #welog.debug("Exception with document :%s %s %s"%(solr_document["id"],solr_document["url"],solr_document["encoding"]))
                    error_solr_doc.append({"error": "%s: %s" % (type(e), e), "url":solr_document["url"],"encoding":solr_document["encoding"],"original_encoding":solr_document["original_encoding"]})
            if len(error_solr_doc) >0 :
                with open(errors_solr_document_filename,"a") as errors_solr_document_json_file :
                    json.dump(error_solr_doc,errors_solr_document_json_file,indent=4)
            del error_solr_doc
            #log
            welog.info("%s %s: indexed %s pages"%(we["name"],we["id"],nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s"%(nb_slice_indexed,(we["name"])))
            # global counters
            nb_pages_mongo+=nb_slice_mongo
            nb_pages_indexed+=nb_slice_indexed
            i=i+url_slice_len


        del we["web_pages"]
        del web_pages
        del urls

        welog.log(logging.INFO,"'%s' indexed (%s web pages on %s)"%(we["name"],nb_pages_indexed,nb_pages_mongo))
	    #solr.commit()
		#relying on autocommit
        #welog.info("inserts to solr comited")
        processlog.info("%s: indexed %s on %s Html pages"%(we["name"],nb_pages_indexed, nb_pages_mongo))
        #adding we if to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()
def index_webentity(web_entity_pile,web_entity_done_pile,hyphe_core,coll,solr, corpus, tags):
    processlog=TimeElapsedLogging.create_log(str(os.getpid()),filename="logs/by_pid/%s.log"%os.getpid())
    processlog.info("starting infinite loop")
    while True :
        we=web_entity_pile.get()

        # Get the tags of the web entity
        tagsWE = tags[we["id"]]
        # logging in proc log
        processlog.info("%s: starting processing"%we["name"])

        #setting LOG
        web_entity_name_safe=re.sub(r"[\W]","",we['name'])
        web_entity_log_id="%s_%s"%(web_entity_name_safe,we["id"])
        logfilename="logs/by_web_entity/%s.log"%web_entity_log_id
        errors_solr_document_filename="logs/errors_solr_document/%s.json"%web_entity_log_id
        welog=TimeElapsedLogging.create_log(we["id"],filename=logfilename)

        #getting web pages URLS
        welog.log(logging.INFO,"retrieving pages of web entity %s"%(we["name"]))
        mainlog.info("DEBUG %s"%(we["id"]))
        web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        if (web_pages['code'] == 'fail') :
            mainlog.info(we_pages['message'])
        welog.log(logging.INFO,"retrieved %s pages of web entity %s"%(len(web_pages["result"]),we["name"]))
        we["web_pages"]=web_pages["result"]

        processlog.info("%s: got %s webpages"%(we["name"],len(we["web_pages"])))

        #getting mongo html web page
        urls=[page["url"] for page in we["web_pages"]] #if page["http_status"]!=0]
        nb_urls=len(urls)
        last_id=""
        pages_mongo=[]
        nb_pages_mongo=0
        nb_pages_indexed=0
        i=0
        url_slice_len=1000
        welog.info("retrieving + indexing HTML pages from mongo to solr of web entity %s"%(we["name"]))

        while i<len(urls) :
            urls_slice=urls[i:i+url_slice_len]
            pages_mongo_slice=coll.find({
                    "url": {"$in": urls_slice},
                    "status": 200,
                    "content_type": {"$in": accepted_content_types},
                    "body" : {"$exists":True}
                },
                fields=["_id","encoding","url","lru","depth","body"])
            # mainlog.info(str(len(list(pages_mongo_slice))))
            #local counters
            nb_slice_mongo=pages_mongo_slice.count()
            nb_slice_indexed=0

            welog.info("%s %s: got %s pages in slice %s %s"%(we["name"],we["id"],nb_slice_mongo,i,len(urls_slice)))

            error_solr_doc=[]
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding",""))
                    encoding = page_mongo.get("encoding","")
                except Exception :
                    body = body.decode("UTF8","replace")
                    encoding = "UTF8-replace"
                solr_document={
                    "id":page_mongo["_id"],
                    "web_entity":we["name"],
                    "web_entity_id":we["id"],
                    "web_entity_status":we["status"],
                    "corpus":conf['hyphe-core']['corpus_id'],
                    "encoding":encoding,
                    "original_encoding":page_mongo.get("encoding",""),
                    "url":page_mongo["url"],
                    "lru":page_mongo["lru"],
                    "depth":page_mongo["depth"],
                    "html":body,
                    "text":html2text.textify(body),
                    "actors_type": tagsWE["ACTORS_TYPE"],
                    "country": tagsWE["COUNTRY"],
                    "anthropogenic": tagsWE["ANTHROPOGENIC_CLIMATE_CHANGE"],
                    "mitigation_adaptation": tagsWE["MITIGATION_ADAPTATION"],
                    "industrial_delegation": tagsWE["INDUSTRIAL_DELEGATION"],
                    "thematic_delegation": tagsWE["THEMATIC_DELEGATION"],
                    "language": tagsWE["LANGUAGE"]
                }

                try:
                     solr.add(solr_document)
                     nb_slice_indexed+=1
                except Exception as e:
                    # mainlog.info("ERROR %s: %s %s" %(type(e),e, solr_document))
                    #welog.debug("Exception with document :%s %s %s"%(solr_document["id"],solr_document["url"],solr_document["encoding"]))
                    error_solr_doc.append({"text": solr_document["text"],"body": solr_document["html"], "error": "%s: %s" % (type(e), e), "url":solr_document["url"],"encoding":solr_document["encoding"],"original_encoding":solr_document["original_encoding"]})
                    # import traceback
                    # traceback.print_exc()
            if len(error_solr_doc) >0 :
                with open(errors_solr_document_filename,"a") as errors_solr_document_json_file :
                    json.dump(error_solr_doc,errors_solr_document_json_file,indent=4)
            del error_solr_doc
            #log
            welog.info("%s %s: indexed %s pages"%(we["name"],we["id"],nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s"%(nb_slice_indexed,(we["name"])))
            # global counters
            nb_pages_mongo+=nb_slice_mongo
            nb_pages_indexed+=nb_slice_indexed
            i=i+url_slice_len


        del we["web_pages"]
        del web_pages
        del urls

        welog.log(logging.INFO,"'%s' indexed (%s web pages on %s)"%(we["name"],nb_pages_indexed,nb_pages_mongo))
	    #solr.commit()
		#relying on autocommit
        #welog.info("inserts to solr comited")
        processlog.info("%s: indexed %s on %s Html pages"%(we["name"],nb_pages_indexed, nb_pages_mongo))
        #adding we if to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()