def test_store_get_result():
    "test whether results can be stored and retrieved"
    from xtas.tasks.es import store_single, get_single_result, get_all_results

    idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
    with clean_es() as es:
        id = es.index(index=idx, doc_type=typ, body={"text": "test"})['_id']
        assert_equal(get_single_result("task1", idx, typ, id), None)
        assert_equal(get_all_results(idx, typ, id), {})

        r = store_single("task1_result", "task1", idx, typ, id,
                         return_data=False)
        assert_equal(r, None)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_equal(get_all_results(idx, typ, id), {"task1": "task1_result"})

        # test second result and test non-scalar data
        task2_result = {"a": {"b": ["c", "d"]}}
        store_single(task2_result, "task2", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)
        assert_equal(get_all_results(idx, typ, id),
                     {"task1": "task1_result", "task2": task2_result})

        # store a task result under an existing task, check that it is
        # replaced
        store_single("task1_result2", "task1", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result2")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)
        assert_equal(get_all_results(idx, typ, id),
                     {"task1": "task1_result2", "task2": task2_result})

        # check that the original document is intact
        src = es.get_source(index=idx, doc_type=typ, id=id)
        assert_equal(src['text'], "test")
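
# The test above depends on module-level scaffolding that is not shown in
# this excerpt. The sketch below is a guess at that scaffolding, assuming a
# local Elasticsearch and nose-style assertions; the names (clean_es,
# ES_TEST_INDEX, ES_TEST_TYPE) come from the test, but these fixture bodies
# are hypothetical, not xtas's actual test harness.
from contextlib import contextmanager

from elasticsearch import Elasticsearch, client
from nose.tools import assert_equal

ES_TEST_INDEX = "xtas__unittest"  # assumed throwaway index name
ES_TEST_TYPE = "document"         # assumed doc_type for test documents


@contextmanager
def clean_es():
    """Yield an ES client with a fresh test index, deleting it afterwards."""
    es = Elasticsearch()
    es.indices.create(index=ES_TEST_INDEX, ignore=400)  # 400: already exists
    try:
        yield es
    finally:
        es.indices.delete(index=ES_TEST_INDEX, ignore=404)

# Note the explicit IndicesClient(es).flush() calls in the test: Elasticsearch
# is only near-real-time, so a freshly stored result is not guaranteed to be
# visible to subsequent queries until the index has been flushed/refreshed.
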
def pipeline(doc, pipeline, store_final=True, store_intermediate=False,
             block=True):
    """
    Get the result of running a pipeline of tasks on a given document.

    Pipeline should be a list of dicts, each with a member module and
    optionally a member arguments, e.g.
    [{"module": "tokenize"},
     {"module": "pos_tag", "arguments": {"model": "nltk"}}]

    @param block: if True, it will block and return the actual result.
        If False, it will return an AsyncResult unless the result was
        cached, in which case it returns the result immediately (!)
    @param store_final: if True, store the final result
    @param store_intermediate: if True, store all intermediate results as well
    """
    # form the basic pipeline by resolving task dictionaries to task objects
    tasks = [_get_task(t) for t in pipeline]
    if isinstance(doc, dict) and set(doc.keys()) == set(_ES_DOC_FIELDS):
        idx, typ, id, field = [doc[k] for k in _ES_DOC_FIELDS]
        chain = []
        input = None
        cache = get_all_results(idx, typ, id)  # check cache for existing results
        # Iterate over tasks in reverse order: use a cached result if one
        # exists, otherwise prepend the task (and, if requested, a command
        # to store its result) to the chain.
        for i in range(len(tasks), 0, -1):
            taskname = "__".join(t.task for t in tasks[:i])
            if taskname in cache:
                input = cache[taskname]
                break
            if (i == len(tasks) and store_final) or store_intermediate:
                chain.insert(0, store_single.s(taskname, idx, typ, id))
            chain.insert(0, tasks[i - 1])
        if not chain:
            # the final result was cached, good!
            return input
        elif input is None:
            input = fetch(doc)
    else:
        # the doc is a string, so we can't use caching
        chain = tasks
        input = doc
    chain = celery.chain(*chain).delay(input)
    if block:
        return chain.get()
    else:
        return chain
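
# A usage sketch for pipeline(), assuming _ES_DOC_FIELDS is
# ('index', 'type', 'id', 'field') (which is what the doc-dict check above
# implies) and that 'tokenize' and 'pos_tag' are registered xtas modules.
# The index/type/id values are illustrative, not from a real deployment.

doc = {"index": "blog", "type": "post", "id": "42", "field": "text"}
spec = [{"module": "tokenize"},
        {"module": "pos_tag", "arguments": {"model": "nltk"}}]

# Blocking call: runs tokenize -> pos_tag (reusing any cached prefix) and
# returns the final result, storing it on the ES document (store_final=True).
result = pipeline(doc, spec, store_intermediate=True)

# Non-blocking call: returns either the cached result immediately or a
# celery AsyncResult, so the caller must be prepared for both.
maybe_async = pipeline(doc, spec, block=False)
if hasattr(maybe_async, "get"):   # AsyncResult: wait for the worker
    result = maybe_async.get()
else:                             # plain value: the result was cached
    result = maybe_async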