Exemple #1
0
def test_store_get_result():
    """Store task results on a test document and read them back again."""
    from xtas.tasks.es import store_single, get_single_result, get_all_results
    index_name, type_name = ES_TEST_INDEX, ES_TEST_TYPE
    with clean_es() as es:
        created = es.index(index=index_name, doc_type=type_name,
                           body={"text": "test"})
        doc_id = created['_id']

        def flush():
            # flush the indices after each store, before reading back
            client.indices.IndicesClient(es).flush()

        def single(taskname):
            return get_single_result(taskname, index_name, type_name, doc_id)

        def everything():
            return get_all_results(index_name, type_name, doc_id)

        # a freshly indexed document has no task results at all
        assert_equal(single("task1"), None)
        assert_equal(everything(), {})

        # first result: a plain string
        store_single("task1_result", "task1", index_name, type_name, doc_id)
        flush()
        assert_equal(single("task1"), "task1_result")
        assert_equal(everything(), {"task1": "task1_result"})

        # second result under another task name, using nested (non-scalar) data
        task2_result = {"a": {"b": ["c", "d"]}}
        store_single(task2_result, "task2", index_name, type_name, doc_id)
        flush()
        assert_equal(single("task1"), "task1_result")
        assert_equal(single("task2"), task2_result)
        expected = {"task1": "task1_result", "task2": task2_result}
        assert_equal(everything(), expected)

        # storing under an existing task name must replace the old result
        store_single("task1_result2", "task1", index_name, type_name, doc_id)
        flush()
        assert_equal(single("task1"), "task1_result2")
        assert_equal(single("task2"), task2_result)
        expected = {"task1": "task1_result2", "task2": task2_result}
        assert_equal(everything(), expected)

        # the source of the original document must be untouched
        source = es.get_source(index=index_name, doc_type=type_name, id=doc_id)
        assert_equal(source['text'], "test")
Exemple #2
0
def pipeline(doc,
             pipeline,
             store_final=True,
             store_intermediate=False,
             block=True):
    """
    Get the result of running a pipeline of tasks on a given document.

    Pipeline should be a list of dicts describing the tasks to run, in order,
    e.g. [{"module" : "tokenize"},
          {"module" : "pos_tag", "arguments" : {"model" : "nltk"}}]
    Each dict is resolved to a task object with _get_task.

    If doc is an ES document (according to is_es_document), stored results
    are reused: the longest prefix of the pipeline that already has a stored
    result is looked up, and only the remaining tasks are run on it. Results
    are stored under the names of the tasks run so far, joined with "__".
    Any other doc is passed directly to the first task and nothing is
    cached or stored.

    @param doc: the document to process (ES document or raw input)
    @param pipeline: list of task dicts, see above
    @param block: if True, it will block and return the actual result.
                  If False, it will return an AsyncResult unless the result was
                  cached, in which case it returns the result immediately (!)
    @param store_final: if True, store the final result (ES documents only)
    @param store_intermediate: if True, store all intermediate results as well
                               (ES documents only)
    @return: the final result, or the pending celery result if block is False
             and the final result was not cached. For an ES document with an
             empty pipeline this is None.
    """
    # form basic pipeline by resolving task dictionaries to task objects
    tasks = [_get_task(t) for t in pipeline]

    if is_es_document(doc):
        idx, typ, doc_id, _ = es_address(doc)
        chain = []
        data = None
        # Check cache for existing documents
        # Iterate over tasks in reverse order, check cached result, and
        # otherwise prepend task (and cache store command) to chain
        for i in range(len(tasks), 0, -1):
            taskname = "__".join(t.task for t in tasks[:i])
            data = get_single_result(taskname, idx, typ, doc_id)
            # Compare against None rather than truthiness: a cached result
            # may legitimately be falsy ([], "", 0, {}). Treating it as a
            # miss would recompute it, and a falsy result cached for the
            # first task would be used as pipeline input instead of the
            # fetched document.
            if data is not None:
                break
            if (i == len(tasks) and store_final) or store_intermediate:
                chain.insert(0, store_single.s(taskname, idx, typ, doc_id))
            chain.insert(0, tasks[i - 1])
        if not chain:  # final result was cached, good!
            return data
        if data is None:  # nothing cached at all, start from the document
            data = fetch(doc)
    else:
        # doc is not an ES document (e.g. a string), so we can't use caching
        chain = tasks
        data = doc

    result = celery.chain(*chain).delay(data)
    if block:
        return result.get()
    return result
Exemple #3
0
def pipeline(doc, pipeline, store_final=True, store_intermediate=False,
             block=True):
    """
    Get the result of applying a list of tasks to a given document.

    Pipeline should be a list of dicts, one per task, in execution order,
    e.g. [{"module" : "tokenize"},
          {"module" : "pos_tag", "arguments" : {"model" : "nltk"}}]
    _get_task turns each dict into a task object.

    For an ES document (as decided by is_es_document) stored results are
    consulted first, from the full pipeline down to its first task; the first
    stored result found is used as input for the remaining tasks only. The
    name a result is stored under is the "__"-joined names of the tasks that
    produced it. For any other doc the tasks are simply run on doc itself,
    without caching or storing.

    @param doc: the document to process (ES document or raw input)
    @param pipeline: list of task dicts, see above
    @param block: if True, it will block and return the actual result.
                  If False, it will return an AsyncResult unless the result was
                  cached, in which case it returns the result immediately (!)
    @param store_final: if True, store the final result (ES documents only)
    @param store_intermediate: if True, store all intermediate results as well
                               (ES documents only)
    @return: the final result, or the pending celery result if block is False
             and the final result was not cached. None for an ES document
             with an empty pipeline.
    """
    # form basic pipeline by resolving task dictionaries to task objects
    tasks = [_get_task(t) for t in pipeline]

    if is_es_document(doc):
        idx, typ, doc_id, _ = es_address(doc)
        steps = []
        cached = None
        # Walk the pipeline back to front: stop at the first prefix that has
        # a stored result, otherwise prepend the task (and, if requested,
        # the command storing its result) to the steps still to be run
        for i in range(len(tasks), 0, -1):
            taskname = "__".join(t.task for t in tasks[:i])
            cached = get_single_result(taskname, idx, typ, doc_id)
            # A stored result can be falsy (empty list, "", 0, {}), so only
            # None counts as "not cached". A truthiness test here would
            # recompute such results and could hand a falsy cached value,
            # rather than the fetched document, to the first task.
            if cached is not None:
                break
            if (i == len(tasks) and store_final) or store_intermediate:
                steps.insert(0, store_single.s(taskname, idx, typ, doc_id))
            steps.insert(0, tasks[i-1])
        if not steps:  # final result was cached, good!
            return cached
        # no prefix was cached: the first task runs on the document itself
        start = fetch(doc) if cached is None else cached
    else:
        # doc is not an ES document (e.g. a string), so we can't use caching
        steps = tasks
        start = doc

    pending = celery.chain(*steps).delay(start)
    return pending.get() if block else pending
Exemple #4
0
def test_store_get_result():
    """Store task results, then retrieve them directly and through queries."""
    from xtas.tasks.es import (
        store_single,
        get_single_result,
        get_tasks_per_index,
        fetch_documents_by_task,
        fetch_results_by_document,
        fetch_query_details_batch
        )
    index_name, type_name = ES_TEST_INDEX, ES_TEST_TYPE
    with clean_es() as es:
        created = es.index(index=index_name, doc_type=type_name,
                           body={"text": "test"})
        doc_id = created['_id']

        def flush():
            # flush the indices after each store, before reading back
            client.indices.IndicesClient(es).flush()

        def single(taskname):
            return get_single_result(taskname, index_name, type_name, doc_id)

        # no result exists for a freshly indexed document
        assert_equal(single("task1"), None)

        # first result: a plain string, also visible in the per-index tasks
        store_single("task1_result", "task1", index_name, type_name, doc_id)
        flush()
        assert_equal(single("task1"), "task1_result")
        assert_in("task1", get_tasks_per_index(index_name, type_name))

        # second result under another task name, using nested (non-scalar) data
        task2_result = {"a": {"b": ["c", "d"]}}
        store_single(task2_result, "task2", index_name, type_name, doc_id)
        flush()
        assert_equal(single("task1"), "task1_result")
        assert_equal(single("task2"), task2_result)

        # query on the content of the task2 result
        result_query = {"match": {"b": {"query": "c"}}}
        by_task = fetch_documents_by_task(index_name, type_name, result_query,
                                          "task2")
        assert_equal(len(by_task), 1)

        # query on the text of the original document
        text_query = {"match": {"text": {"query": "test"}}}
        by_document = fetch_results_by_document(index_name, type_name,
                                                text_query, "task2")
        assert_equal(len(by_document), 1)

        details = fetch_query_details_batch(index_name, type_name, text_query,
                                            True)
        assert_in("task1", details[0][1])
        assert_in("task2", details[0][1])

        details = fetch_query_details_batch(index_name, type_name, text_query,
                                            tasknames=["task2"])
        assert_in("task2", details[0][1])

        # storing under an existing task name must replace the old result
        store_single("task1_result2", "task1", index_name, type_name, doc_id)
        flush()
        assert_equal(single("task1"), "task1_result2")
        assert_equal(single("task2"), task2_result)

        # the source of the original document must be untouched
        source = es.get_source(index=index_name, doc_type=type_name, id=doc_id)
        assert_equal(source['text'], "test")
Exemple #5
0
def test_store_get_result():
    """Verify storing, retrieving and replacing task results on a document."""
    from xtas.tasks.es import store_single, get_single_result, get_all_results
    index_name, type_name = ES_TEST_INDEX, ES_TEST_TYPE
    with clean_es() as es:
        response = es.index(index=index_name, doc_type=type_name,
                            body={"text": "test"})
        doc_id = response['_id']
        where = (index_name, type_name, doc_id)

        def flush():
            # flush the indices after each store, before reading back
            client.indices.IndicesClient(es).flush()

        # nothing has been stored for the new document yet
        assert_equal(get_single_result("task1", *where), None)
        assert_equal(get_all_results(*where), {})

        # with return_data=False, store_single is expected to return None
        stored = store_single("task1_result", "task1", index_name, type_name,
                              doc_id, return_data=False)
        assert_equal(stored, None)
        flush()
        assert_equal(get_single_result("task1", *where), "task1_result")
        assert_equal(get_all_results(*where), {"task1": "task1_result"})

        # a second task, this time with nested (non-scalar) data
        task2_result = {"a": {"b": ["c", "d"]}}
        store_single(task2_result, "task2", index_name, type_name, doc_id)
        flush()
        assert_equal(get_single_result("task1", *where), "task1_result")
        assert_equal(get_single_result("task2", *where), task2_result)
        both = {"task1": "task1_result", "task2": task2_result}
        assert_equal(get_all_results(*where), both)

        # storing again under "task1" must overwrite the earlier result
        store_single("task1_result2", "task1", index_name, type_name, doc_id)
        flush()
        assert_equal(get_single_result("task1", *where), "task1_result2")
        assert_equal(get_single_result("task2", *where), task2_result)
        both = {"task1": "task1_result2", "task2": task2_result}
        assert_equal(get_all_results(*where), both)

        # the source of the original document must be untouched
        source = es.get_source(index=index_name, doc_type=type_name, id=doc_id)
        assert_equal(source['text'], "test")
Exemple #6
0
def test_store_get_result():
    """Exercise storing task results and the query helpers that read them."""
    from xtas.tasks.es import (store_single, get_single_result,
                               get_tasks_per_index, fetch_documents_by_task,
                               fetch_results_by_document,
                               fetch_query_details_batch)
    es_index, es_type = ES_TEST_INDEX, ES_TEST_TYPE
    with clean_es() as es:
        doc_id = es.index(index=es_index, doc_type=es_type,
                          body={"text": "test"})['_id']

        def lookup(taskname):
            return get_single_result(taskname, es_index, es_type, doc_id)

        # a new document starts out without any task result
        assert_equal(lookup("task1"), None)

        # store a scalar result and check it shows up everywhere
        store_single("task1_result", "task1", es_index, es_type, doc_id)
        client.indices.IndicesClient(es).flush()
        assert_equal(lookup("task1"), "task1_result")
        known_tasks = get_tasks_per_index(es_index, es_type)
        assert_in("task1", known_tasks)

        # store a nested (non-scalar) result under a second task name
        task2_result = {"a": {"b": ["c", "d"]}}
        store_single(task2_result, "task2", es_index, es_type, doc_id)
        client.indices.IndicesClient(es).flush()
        assert_equal(lookup("task1"), "task1_result")
        assert_equal(lookup("task2"), task2_result)

        # match on a value inside the task2 result
        match_result = {"match": {"b": {"query": "c"}}}
        documents = fetch_documents_by_task(es_index, es_type, match_result,
                                            "task2")
        assert_equal(len(documents), 1)

        # match on the text of the document itself
        match_text = {"match": {"text": {"query": "test"}}}
        hits = fetch_results_by_document(es_index, es_type, match_text,
                                         "task2")
        assert_equal(len(hits), 1)

        hits = fetch_query_details_batch(es_index, es_type, match_text, True)
        first_details = hits[0][1]
        assert_in("task1", first_details)
        assert_in("task2", first_details)

        hits = fetch_query_details_batch(es_index, es_type, match_text,
                                         tasknames=["task2"])
        assert_in("task2", hits[0][1])

        # a second store under "task1" must replace the first result
        store_single("task1_result2", "task1", es_index, es_type, doc_id)
        client.indices.IndicesClient(es).flush()
        assert_equal(lookup("task1"), "task1_result2")
        assert_equal(lookup("task2"), task2_result)

        # the source of the original document must be untouched
        original = es.get_source(index=es_index, doc_type=es_type, id=doc_id)
        assert_equal(original['text'], "test")