Example #1
        def consumer(file):
            # resume from the line number recorded in this file's state document
            num_to_skip = es_client.get_source(index=STATE_INDEX_NAME,
                                               id=file,
                                               doc_type=STATE_TYPE)["line"]
            process_name = multiprocessing.current_process().name
            print("%s is skipping %d lines in file: %s " %
                  (process_name, num_to_skip, file))

            results_g = helpers.streaming_bulk(
                es_client,
                actions=(build_es_action(try_to_process(d),
                                         es_index_name,
                                         es_type,
                                         op_type="index")
                         for d in data_io.read_jsonl(
                             file, limit=limit, num_to_skip=num_to_skip)),
                chunk_size=chunk_size,
                yield_ok=True,
                raise_on_error=False,
                raise_on_exception=False,
            )
            # start counting from the resume point and checkpoint progress
            counter = num_to_skip
            for k, (ok, d) in enumerate(results_g):
                counter += 1
                if not ok and "index" in d:
                    print("failed to index a document: %s" % d["index"])
                if k % 1000 == 0:
                    update_state(file, {"line": counter})

            update_state(file, {"line": counter})
            # only mark the file as done if it was fully consumed (no limit cut it off)
            if limit is None or counter < limit:
                update_state(file, {"done": True})

            print("%s is done; inserted %d new docs!" %
                  (process_name, counter - num_to_skip))
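
Example #1 is a nested worker: es_client, STATE_INDEX_NAME, STATE_TYPE, limit, chunk_size, update_state and try_to_process all come from the enclosing function and are not shown. A minimal sketch of what the update_state checkpoint helper might look like, assuming the state documents simply carry a "line" counter and a "done" flag as in Example #5 (this helper is an assumption, not part of the original code):

def make_update_state(es_client, state_index_name, state_type):
    # hypothetical factory: returns the update_state(file, doc) closure used
    # above; it partially updates the per-file state document in Elasticsearch
    def update_state(file, doc):
        es_client.update(index=state_index_name,
                         doc_type=state_type,
                         id=file,
                         body={"doc": doc})
    return update_state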
Example #2
def populate_es_parallel_bulk(
    es, files, es_index_name, es_type, limit=None, num_processes=4, chunk_size=500
):
    dicts_g = (d for file in files for d in read_jsonl(file, limit=limit))

    actions_g = (build_es_action(d, es_index_name, es_type) for d in dicts_g)
    results_g = helpers.parallel_bulk(
        es,
        actions_g,
        thread_count=num_processes,
        queue_size=num_processes,
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
    )
    failed_g = (
        pop_exception(d)
        for ok, d in tqdm(results_g)
        # ignore 409 conflicts: a document with that _id already exists
        if not ok and d.get("create", {}).get("status", 200) != 409
    )
    data_io.write_jsonl("failed.jsonl", failed_g)
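
The populate functions pull their input from JSON-lines files via data_io.read_jsonl / data_io.write_jsonl, which are not part of the listing. A rough stand-in, assuming one JSON object per line; the exact semantics of limit and num_to_skip are guesses based on how the functions are called above:

import json


def read_jsonl(path, limit=None, num_to_skip=0):
    # yield one dict per line, skipping the first num_to_skip lines and
    # stopping after `limit` records (read everything if limit is None)
    with open(path, encoding="utf-8") as f:
        yielded = 0
        for k, line in enumerate(f):
            if k < num_to_skip:
                continue
            if limit is not None and yielded >= limit:
                break
            yield json.loads(line)
            yielded += 1


def write_jsonl(path, dicts):
    # write an iterable of dicts as one JSON object per line
    with open(path, "w", encoding="utf-8") as f:
        for d in dicts:
            f.write(json.dumps(d) + "\n")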
Example #3
def populate_es_streaming_bulk(
    es_client: Elasticsearch,
    dicts: Iterable[Dict],
    es_index_name: str,
    es_type: str,
    chunk_size: int = 500,
):
    def pop_exception(d):
        # the exception object is not JSON-serialisable; drop it before writing
        d["index"].pop("exception")
        return d

    es_actions_g = (build_es_action(d,
                                    index_name=es_index_name,
                                    es_type=es_type) for d in dicts)
    results_g = helpers.streaming_bulk(
        es_client,
        es_actions_g,
        chunk_size=chunk_size,
        yield_ok=True,
        # collect failures below instead of raising on the first bulk error
        raise_on_error=False,
    )
    failed_g = (pop_exception(d) for ok, d in tqdm(results_g) if not ok)
    data_io.write_jsonl("failed.jsonl", failed_g)
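
build_es_action is only shown as a nested helper in Example #5; Examples #1 to #4 call a module-level variant that is not included. A sketch of what it presumably looks like, mirroring the nested version (whether it also sets an explicit _id is an assumption):

def build_es_action(datum, index_name, es_type, op_type="index"):
    # wrap a plain dict in the action format expected by elasticsearch.helpers;
    # op_type "index" overwrites existing documents, "create" fails with a 409
    # conflict if a document with the same _id is already there
    return {
        "_op_type": op_type,
        "_index": index_name,
        "_type": es_type,
        "_source": datum,
    }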
Example #4
        def consumer(file):
            process_name = multiprocessing.current_process().name
            print("%s is doing %s; limit: %s" %
                  (process_name, file, limit))

            dicts_g = data_io.read_jsonl(file, limit=limit)

            actions_g = (build_es_action(d,
                                         es_index_name,
                                         es_type,
                                         op_type="index") for d in dicts_g)
            results_g = helpers.streaming_bulk(
                es_client,
                actions_g,
                chunk_size=chunk_size,
                yield_ok=True,
                raise_on_error=False,
                raise_on_exception=False,
            )

            failed_g = (pop_exception(d) for ok, d in results_g if not ok)
            # each worker writes its own failure file, named after the process
            data_io.write_jsonl("%s_failed.jsonl" % process_name, failed_g)
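
Like Example #1, this consumer is a closure over es_client, limit and friends, so it cannot be pickled and handed to a standard multiprocessing.Pool; the enclosing function presumably starts the workers itself. One way it could be driven, assuming a Unix "fork" start method so the children inherit the closure (this driver is not part of the original code):

        # hypothetical driver inside the same enclosing function: one worker
        # process per input file; with the fork start method the nested
        # consumer closure is inherited by the children and never pickled
        processes = [
            multiprocessing.Process(target=consumer, args=(f,))
            for f in files
        ]
        for p in processes:
            p.start()
        for p in processes:
            p.join()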
Example #5
def setup_index(es_client,
                files: List[str],
                INDEX_NAME,
                TYPE,
                from_scratch=False,
                mapping=None):
    STATE_INDEX_NAME = INDEX_NAME + "_state"
    STATE_TYPE = "file_state"

    if from_scratch:
        es_client.indices.delete(index=INDEX_NAME, ignore=[400, 404])
        es_client.indices.delete(index=STATE_INDEX_NAME, ignore=[400, 404])

    sleep(3)
    # ignore=400: "index already exists" is not treated as an error
    es_client.indices.create(index=INDEX_NAME, ignore=400, body=mapping)
    es_client.indices.create(index=STATE_INDEX_NAME, ignore=400)
    sleep(3)

    def build_es_action(datum, index_name, es_type, op_type="index"):
        # replace empty strings by None so they do not end up in the index
        _source = {
            k: None if isinstance(v, str) and len(v) == 0 else v
            for k, v in datum.items()
        }
        doc = {
            "_id": datum["file"],
            "_op_type": op_type,
            "_index": index_name,
            "_type": es_type,
            "_source": _source,
        }
        return doc

    # op_type="create" plus raise_on_error=False: state documents that already
    # exist are left untouched on re-runs (the 409 conflicts are simply ignored)
    helpers.bulk(
        es_client,
        (build_es_action(
            {
                "file": file,
                "line": 0,
                "done": False
            },
            STATE_INDEX_NAME,
            STATE_TYPE,
            op_type="create",
        ) for file in files),
        raise_on_error=False,
    )

    sum_in_state = sum(
        es_client.get_source(index=STATE_INDEX_NAME,
                             id=file,
                             doc_type=STATE_TYPE)["line"] for file in files
    )
    if sum_in_state > 0:
        # sanity check: the line counts recorded in the state index must match
        # the number of documents actually present in the data index
        count = es_client.count(index=INDEX_NAME, doc_type=TYPE)["count"]
        assert sum_in_state == count, "state records %d lines but index holds %d docs" % (
            sum_in_state, count)

    # fetch the state documents of all files that are already fully ingested
    body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"done": {"value": "true"}}}
                ]
            }
        }
    }
    r = es_client.search(index=STATE_INDEX_NAME, body=body, size=10_000)

    files_in_es = set(
        [os.path.split(s['_source']['file'])[1] for s in r['hits']['hits']])
    not_yet_in_index_files = [
        f for f in files if os.path.split(f)[1] not in files_in_es
    ]
    print('got %d files that are not yet in the ES index' %
          len(not_yet_in_index_files))
    return not_yet_in_index_files
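
Tying the pieces together: setup_index creates both indices, seeds one state document per file, and returns only the files whose state is not yet marked done, which can then be handed to one of the populate functions. A hedged end-to-end sketch (host, paths and index names are placeholders; populate_es_parallel_bulk is Example #2):

from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=["localhost:9200"])  # placeholder host

files = ["data/part-000.jsonl", "data/part-001.jsonl"]  # placeholder paths
todo = setup_index(es, files, "my_index", "my_type", from_scratch=False)

# only files not yet marked "done" in the state index get (re)ingested
populate_es_parallel_bulk(es, todo, "my_index", "my_type",
                          num_processes=4, chunk_size=500)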