def run(self):
    task_family_name = str(self.task_family)
    if self.task_name == "ClarityNLPLuigiTask":
        self.task_name = task_family_name
    client = util.mongo_client()
    try:
        with self.output().open('w') as temp_file:
            temp_file.write("start writing custom task")
            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Running Batch %s" % self.batch)

            self.pipeline_config = config.get_pipeline_config(self.pipeline, util.conn_string)

            # Fetch this batch of documents from Solr using the pipeline's filters.
            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Running Solr query")
            self.docs = solr_data.query(self.solr_query,
                                        rows=util.row_count,
                                        start=self.start,
                                        solr_url=util.solr_url,
                                        tags=self.pipeline_config.report_tags,
                                        mapper_inst=util.report_mapper_inst,
                                        mapper_url=util.report_mapper_url,
                                        mapper_key=util.report_mapper_key,
                                        types=self.pipeline_config.report_types,
                                        sources=self.pipeline_config.sources,
                                        filter_query=self.pipeline_config.filter_query,
                                        cohort_ids=self.pipeline_config.cohort,
                                        job_results_filters=self.pipeline_config.job_results)

            # Cache each document by its report id so later lookups can avoid
            # another round trip to Solr.
            for d in self.docs:
                doc_id = d[util.solr_report_id_field]
                if util.use_memory_caching == "true":
                    k = keys.hashkey(doc_id)
                    document_cache[k] = d
                if util.use_redis_caching == "true":
                    util.write_to_redis_cache("doc:" + doc_id, json.dumps(d))

            jobs.update_job_status(str(self.job), util.conn_string, jobs.IN_PROGRESS,
                                   "Running %s main task" % self.task_name)
            self.run_custom_task(temp_file, client)
            temp_file.write("Done writing custom task!")

            self.docs = list()
    except Exception as ex:
        traceback.print_exc(file=sys.stderr)
        jobs.update_job_status(str(self.job), util.conn_string, jobs.WARNING,
                               ''.join(traceback.format_stack()))
        print(ex)
    finally:
        client.close()
def get_race_data(document_id):
    if util.use_redis_caching == "true":
        # Redis-backed cache: return the cached result if present, otherwise
        # compute it and store it for subsequent calls.
        util.add_cache_query_count()
        key = "raceFinder:" + str(document_id)
        res = util.get_from_redis_cache(key)
        if res:
            return json.loads(res)
        else:
            util.add_cache_compute_count()
            res2 = get_race_for_doc(document_id)
            util.write_to_redis_cache(key, json.dumps(res2))
            return res2
    elif util.use_memory_caching == "true":
        # In-memory cached variant.
        util.add_cache_query_count()
        return _get_race_data(document_id)
    else:
        # No caching configured; compute directly.
        return get_race_for_doc(document_id)
def get_document_by_id(document_id):
    doc = None
    if util.use_redis_caching == "true":
        # Try the Redis cache first; on a miss, query Solr and cache the result.
        util.add_cache_query_count()
        txt = util.get_from_redis_cache("doc:" + document_id)
        if not txt:
            util.add_cache_compute_count()
            doc = solr_data.query_doc_by_id(document_id, solr_url=util.solr_url)
            util.write_to_redis_cache("doc:" + document_id, json.dumps(doc))
        else:
            doc = json.loads(txt)
    elif util.use_memory_caching == "true":
        util.add_cache_query_count()
        doc = _get_document_by_id(document_id)

    # Fall back to a direct Solr lookup if no cache produced a document.
    if not doc:
        return solr_data.query_doc_by_id(document_id, solr_url=util.solr_url)
    else:
        return doc
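# Usage sketch (illustrative only, not part of the pipeline): look up a report by
# its Solr id through whichever cache layer is enabled. The id and the
# 'report_text' field name below are assumptions for the example.
#
#     doc = get_document_by_id('9876543')
#     if doc:
#         text = doc.get('report_text', '')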
def get_job_performance(job_ids: list, connection_string: str):
    if not job_ids:
        return dict()

    conn = psycopg2.connect(connection_string)
    cursor = conn.cursor()
    metrics = dict()

    try:
        # Build a parameterized IN clause with one placeholder per job id.
        in_clause = ', '.join(['%s'] * len(job_ids))

        cursor.execute(
            """
            SELECT status, nlp_job_id from nlp.nlp_job where nlp_job_id in ({})
            """.format(in_clause), job_ids)

        # Seed a metrics entry per job with its current status and zeroed counts.
        statuses = cursor.fetchall()
        for s in statuses:
            status = s[0]
            job_id = s[1]
            metrics[job_id] = {
                "status": status,
                "final_results": 0,
                "final_subjects": 0,
                "intermediate_results": 0,
                "intermediate_subjects": 0,
                "counts_found": 0
            }

        cursor.execute(
            """
            SELECT status, description, date_updated, nlp_job_id from nlp.nlp_job_status
            where nlp_job_id in ({}) order by date_updated
            """.format(in_clause), job_ids)

        # Fold the STATS_* status updates into each job's metrics.
        updates = cursor.fetchall()
        for row in updates:
            status_name = row[0]
            status_value = row[1]
            job_id = row[3]

            performance = metrics[job_id]
            counts_found = performance.get('counts_found', 0)
            if status_name == 'STATS_FINAL_SUBJECTS':
                performance['final_subjects'] = status_value
                counts_found += int(status_value)
            elif status_name == 'STATS_FINAL_RESULTS':
                performance['final_results'] = status_value
                counts_found += int(status_value)
            elif status_name == 'STATS_INTERMEDIATE_SUBJECTS':
                performance['intermediate_subjects'] = status_value
            elif status_name == 'STATS_INTERMEDIATE_RESULTS':
                performance['intermediate_results'] = status_value
            performance['counts_found'] = counts_found
            metrics[job_id] = performance

        # For completed jobs with no counts in nlp_job_status, fall back to the
        # Redis cache and, failing that, recompute the stats and cache them.
        for k in metrics.keys():
            performance = metrics[k]
            counts_found = performance.get('counts_found', 0)
            if counts_found == 0 and performance['status'] == COMPLETED:
                final_subjects = util.get_from_redis_cache('final_subjects_{}'.format(k))
                final_results = util.get_from_redis_cache('final_results_{}'.format(k))
                if final_results and final_subjects:
                    performance['final_subjects'] = final_subjects
                    performance['final_results'] = final_results
                else:
                    stats = phenotype_stats(str(k), True)
                    performance['final_subjects'] = stats["subjects"]
                    performance['final_results'] = stats["results"]
                    util.write_to_redis_cache('final_subjects_{}'.format(k), stats["subjects"])
                    util.write_to_redis_cache('final_results_{}'.format(k), stats["results"])

                int_subjects = util.get_from_redis_cache('intermediate_subjects_{}'.format(k))
                int_results = util.get_from_redis_cache('intermediate_results_{}'.format(k))
                if int_subjects and int_results:
                    performance['intermediate_subjects'] = int_subjects
                    performance['intermediate_results'] = int_results
                else:
                    intermediate_stats = phenotype_stats(str(k), False)
                    performance['intermediate_subjects'] = intermediate_stats["subjects"]
                    performance['intermediate_results'] = intermediate_stats["results"]
                    util.write_to_redis_cache('intermediate_subjects_{}'.format(k),
                                              intermediate_stats["subjects"])
                    util.write_to_redis_cache('intermediate_results_{}'.format(k),
                                              intermediate_stats["results"])

            # Drop the bookkeeping field before returning the metrics.
            if 'counts_found' in performance:
                del performance['counts_found']
            metrics[k] = performance
    except Exception as ex:
        traceback.print_exc(file=sys.stdout)
    finally:
        conn.close()

    return metrics
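# Usage sketch (illustrative only): the job ids are placeholders, and
# util.conn_string is assumed to be the same Postgres connection string used by
# the rest of this module.
#
#     perf = get_job_performance([42, 43], util.conn_string)
#     for job_id, stats in perf.items():
#         print(job_id, stats['status'], stats['final_results'], stats['final_subjects'])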