import json
import sys
import traceback

import luigi
from pymongo import MongoClient

import util
from algorithms import segmentation
from algorithms.sec_tag import *
from cachetools import LRUCache, cached, keys
from data_access import base_model
from data_access import jobs
from data_access import pipeline_config as config
from data_access import solr_data

sentences_key = "sentence_attrs"
section_names_key = "section_name_attrs"
section_text_key = "section_text_attrs"
doc_fields = [
    'report_id', 'subject', 'report_date', 'report_type', 'source', 'solr_id'
]

pipeline_cache = LRUCache(maxsize=5000)
document_cache = LRUCache(maxsize=5000)
init_cache = LRUCache(maxsize=1000)

segment = segmentation.Segmentation()


@cached(document_cache)
def _get_document_by_id(document_id):
    # In-process LRU-cached Solr lookup; only invoked on a cache miss.
    util.add_cache_compute_count()
    return solr_data.query_doc_by_id(document_id, solr_url=util.solr_url)


def get_document_by_id(document_id):
    doc = None
    if util.use_redis_caching == "true":
        util.add_cache_query_count()
        txt = util.get_from_redis_cache("doc:" + document_id)
        if not txt:
            # Cache miss: fetch from Solr and write through to Redis.
            util.add_cache_compute_count()
            doc = solr_data.query_doc_by_id(document_id,
                                            solr_url=util.solr_url)
            util.write_to_redis_cache("doc:" + document_id, json.dumps(doc))
        else:
            doc = json.loads(txt)
    elif util.use_memory_caching == "true":
        util.add_cache_query_count()
        doc = _get_document_by_id(document_id)
    else:
        # No caching configured: query Solr directly.
        doc = solr_data.query_doc_by_id(document_id, solr_url=util.solr_url)
    return doc
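# Illustrative sketch (not part of the original module): retrieving a single
# document by report id. Depending on configuration, get_document_by_id()
# checks Redis first, then the in-process LRU cache, and falls back to a
# direct Solr query. The report id below is a hypothetical placeholder;
# 'report_type' is one of the fields listed in doc_fields above.
#
#   doc = get_document_by_id('report_12345')
#   if doc:
#       print(doc.get('report_type'))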
class BaseTask(luigi.Task):
    """Base luigi task for ClarityNLP pipeline jobs: pulls a batch of
    documents from Solr, updates job status, and delegates the actual
    work to run_custom_task()."""

    pipeline = luigi.IntParameter()
    job = luigi.IntParameter()
    start = luigi.IntParameter()
    solr_query = luigi.Parameter()
    batch = luigi.IntParameter()
    task_name = "ClarityNLPLuigiTask"
    docs = list()
    pipeline_config = config.PipelineConfig('', '')
    segment = segmentation.Segmentation()

    def run(self):
        task_family_name = str(self.task_family)
        if self.task_name == "ClarityNLPLuigiTask":
            self.task_name = task_family_name
        client = util.mongo_client()
        try:
            with self.output().open('w') as temp_file:
                temp_file.write("start writing custom task")
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running Batch %s" % self.batch)

                self.pipeline_config = config.get_pipeline_config(
                    self.pipeline, util.conn_string)
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running Solr query")
                self.docs = solr_data.query(
                    self.solr_query,
                    rows=util.row_count,
                    start=self.start,
                    solr_url=util.solr_url,
                    tags=self.pipeline_config.report_tags,
                    mapper_inst=util.report_mapper_inst,
                    mapper_url=util.report_mapper_url,
                    mapper_key=util.report_mapper_key,
                    types=self.pipeline_config.report_types,
                    sources=self.pipeline_config.sources,
                    filter_query=self.pipeline_config.filter_query,
                    cohort_ids=self.pipeline_config.cohort,
                    job_results_filters=self.pipeline_config.job_results)

                # Populate the configured document caches so later lookups
                # by report id avoid extra Solr round trips.
                for d in self.docs:
                    doc_id = d[util.solr_report_id_field]
                    if util.use_memory_caching == "true":
                        k = keys.hashkey(doc_id)
                        document_cache[k] = d
                    if util.use_redis_caching == "true":
                        util.write_to_redis_cache("doc:" + doc_id,
                                                  json.dumps(d))

                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running %s main task" %
                                       self.task_name)
                self.run_custom_task(temp_file, client)
                temp_file.write("Done writing custom task!")
                self.docs = list()
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   ''.join(traceback.format_stack()))
            print(ex)
        finally:
            client.close()

    def output(self):
        # One temp file per job/task/batch; its presence marks the batch done.
        return luigi.LocalTarget("%s/pipeline_job%s_%s_batch%s.txt" %
                                 (util.tmp_dir, str(self.job),
                                  self.task_name, str(self.start)))

    def set_name(self, name):
        self.task_name = name

    def write_result_data(self, temp_file, mongo_client, doc, data: dict,
                          prefix: str = '', phenotype_final: bool = False):
        inserted = pipeline_mongo_writer(mongo_client, self.pipeline,
                                         self.task_name, self.job,
                                         self.batch, self.pipeline_config,
                                         doc, data, prefix=prefix)
        if temp_file is not None:
            temp_file.write(str(inserted))
            temp_file.write('\n')
        return inserted

    def write_multiple_result_data(self, temp_file, mongo_client, doc,
                                   data: list, prefix: str = ''):
        ids = list()
        for d in data:
            inserted = pipeline_mongo_writer(mongo_client, self.pipeline,
                                             self.task_name, self.job,
                                             self.batch,
                                             self.pipeline_config, doc, d,
                                             prefix=prefix)
            ids.append(inserted)
            if temp_file is not None:
                temp_file.write(str(inserted))
                temp_file.write('\n')
        return ids

    def write_log_data(self, job_status, status_message):
        jobs.update_job_status(str(self.job), util.conn_string, job_status,
                               status_message)

    def run_custom_task(self, temp_file, mongo_client: MongoClient):
        # Override this hook in subclasses.
        print("Implement your custom functionality here ")

    def get_document_text(self, doc, clean=True):
        if doc and util.solr_text_field in doc:
            txt = doc[util.solr_text_field]
            if type(txt) == str:
                txt_val = txt
            elif type(txt) == list:
                txt_val = ' '.join(txt)
            else:
                txt_val = str(txt)
            if clean:
                # Strip non-ASCII characters from the document text.
                return txt_val.encode("ascii", errors="ignore").decode()
            else:
                return txt_val
        else:
            return ''

    def get_boolean(self, key, default=False):
        return get_config_boolean(self.pipeline_config, key, default=default)

    def get_integer(self, key, default=-1):
        return get_config_integer(self.pipeline_config, key, default=default)

    def get_string(self, key, default=''):
        return get_config_string(self.pipeline_config, key, default=default)

    def get_document_sentences(self, doc):
        return document_sentences(doc)

    def get_document_sections(self, doc):
        names, section_texts = document_sections(doc)
        return names, section_texts
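# Illustrative sketch (not part of the original module): a minimal custom
# task built on BaseTask. BaseTask.run() fetches the document batch from
# Solr, updates job status, and then calls run_custom_task(), so a subclass
# only overrides that hook and writes results through write_result_data().
# The task name and the computed field below are hypothetical.
#
#   class TextLengthTask(BaseTask):
#       task_name = "TextLengthTask"
#
#       def run_custom_task(self, temp_file, mongo_client: MongoClient):
#           for doc in self.docs:
#               txt = self.get_document_text(doc)
#               self.write_result_data(temp_file, mongo_client, doc,
#                                      {'value': len(txt)})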