Example #1

The basic version: bulk-add the combined directory and file docs, then index a per-worker stats doc recording counts and timings.
import time
from datetime import datetime

# `es`, `config` and `index_bulk_add` are module-level names defined
# elsewhere in the source project (Elasticsearch client, config dict,
# bulk-indexing helper).
def es_bulk_add(worker_name, dirlist, filelist, cliargs, totalcrawltime=None):
    starttime = time.time()

    # index directory and file docs in a single bulk request
    docs = dirlist + filelist
    index_bulk_add(es, docs, config, cliargs)

    # record per-worker indexing stats
    data = {"worker_name": worker_name, "dir_count": len(dirlist),
            "file_count": len(filelist), "bulk_time": round(time.time() - starttime, 6),
            # guard the None default before rounding
            "crawl_time": round(totalcrawltime, 6) if totalcrawltime is not None else None,
            "indexing_date": datetime.utcnow().isoformat()}
    es.index(index=cliargs['index'], doc_type='worker', body=data)
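A minimal usage sketch (the doc shapes, index name, and worker name below are illustrative assumptions, not the project's exact mapping):

# hypothetical invocation from a crawl worker; field names mirror the
# filename / path_parent fields queried in Example #2
dirs = [{"filename": "logs", "path_parent": "/data", "crawl_time": 0.42}]
files = [{"filename": "app.log", "path_parent": "/data/logs", "filesize": 1024}]
cliargs = {"index": "myindex"}
es_bulk_add("worker-1", dirs, files, cliargs, totalcrawltime=0.42)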
Example #2

This variant adds chunked-directory handling: when cliargs['chunkfiles'] is set, directory docs that belong to a chunked dir are turned into partial-update actions that add their crawl_time to the doc already in the index, rather than indexing a duplicate directory doc.
import os
import time
from datetime import datetime

# `es`, `config` and `index_bulk_add` are module-level names defined
# elsewhere in the source project (Elasticsearch client, config dict,
# bulk-indexing helper).
def es_bulk_add(worker_name, dirlist, filelist, cliargs, totalcrawltime=None):
    if cliargs['chunkfiles']:
        updated_dirlist = []
        # check for existing directory docs in index and update crawl time only (dirchunk)
        for d in dirlist:
            try:
                # 'chunkpath' is only present when the doc is part of a chunked dir
                path = d['chunkpath']
                crawltime = d['crawl_time']
                f = os.path.basename(path)
                # parent path
                p = os.path.abspath(os.path.join(path, os.pardir))

                # find the existing directory doc by filename + parent path
                data = {
                    "size": 1,
                    "_source": ['crawl_time'],
                    "query": {
                        "query_string": {
                            "query": 'filename: "' + f + '" AND path_parent: "' + p + '"'
                        }
                    }
                }

                es.indices.refresh(index=cliargs['index'])
                res = es.search(index=cliargs['index'],
                                doc_type='directory',
                                body=data,
                                request_timeout=config['es_timeout'])

                if len(res['hits']['hits']) == 0:
                    continue

                docid = res['hits']['hits'][0]['_id']
                current_crawltime = res['hits']['hits'][0]['_source']['crawl_time']
                # accumulate this chunk's crawl time onto the existing doc's total
                updated_crawltime = current_crawltime + crawltime

                # update crawltime in index
                d = {
                    '_op_type': 'update',
                    '_index': cliargs['index'],
                    '_type': 'directory',
                    '_id': docid,
                    'doc': {
                        'crawl_time': updated_crawltime
                    }
                }
            except KeyError:
                pass  # no 'chunkpath' key: not part of a chunked dir

            # append either the update action (chunked dir found in index)
            # or the original doc (non-chunked dirs fall through unchanged)
            updated_dirlist.append(d)

        dirlist = updated_dirlist

    starttime = time.time()

    docs = dirlist + filelist
    index_bulk_add(es, docs, config, cliargs)

    if not cliargs['noworkerdocs']:
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 6),
            "crawl_time": round(totalcrawltime, 6),
            "indexing_date": datetime.utcnow().isoformat()
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)
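index_bulk_add is defined elsewhere in the project; here is a minimal sketch of what it could look like on top of the standard elasticsearch-py bulk helper (the metadata handling is an assumption, not the project's actual implementation):

from elasticsearch import helpers

def index_bulk_add(es, docs, config, cliargs):
    # plain docs are indexed as-is; dicts carrying '_op_type': 'update'
    # (like the chunked-dir actions built above, which bring their own
    # '_index', '_type' and '_id') become partial updates instead
    helpers.bulk(es, docs,
                 index=cliargs['index'],
                 request_timeout=config['es_timeout'])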