def requires(self):
    register_tasks()
    tasks = list()
    pipeline_ids = data_access.query_pipeline_ids(int(self.phenotype), util.conn_string)
    phenotype_config = data_access.query_phenotype(int(self.phenotype), util.conn_string)
    phenotype_config['phenotype_id'] = int(self.phenotype)

    log("getting ready to execute pipelines...")
    log(pipeline_ids)
    if len(pipeline_ids) > 0:
        configs = dict()
        for pipeline_id in pipeline_ids:
            pipeline_config = data_access.get_pipeline_config(pipeline_id, util.conn_string)
            pipeline_config['pipeline_id'] = pipeline_id
            configs[pipeline_config['name']] = pipeline_config

        update_phenotype_model(phenotype_config, util.conn_string)
        for pipeline_config in configs.values():
            pipeline_id = pipeline_config['pipeline_id']
            tasks.append(
                PipelineTask(pipeline=pipeline_id,
                             job=self.job,
                             owner=self.owner,
                             pipelinetype=pipeline_config.config_type))
    log(tasks)

    return tasks
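# --- Illustrative aside, not part of the task above ---
# A minimal, generic Luigi sketch (hypothetical class and file names) showing how
# requires() and run() interact: the scheduler completes every task returned by
# requires() before invoking run() on the parent, which is the contract the
# phenotype tasks in this module rely on.
import luigi


class ChildTask(luigi.Task):
    name = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget('child_{}.txt'.format(self.name))

    def run(self):
        with self.output().open('w') as f:
            f.write('done\n')


class ParentTask(luigi.Task):
    def requires(self):
        # Luigi schedules and waits on all of these before ParentTask.run() executes
        return [ChildTask(name=str(i)) for i in range(3)]

    def output(self):
        return luigi.LocalTarget('parent.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('all children finished\n')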
def run(self):
    print('dependencies done; run phenotype reconciliation')
    client = MongoClient(util.mongo_host, util.mongo_port)

    try:
        data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                      "Finished Pipelines")

        phenotype = data_access.query_phenotype(int(self.phenotype), util.conn_string)
        print(phenotype)

        db = client[util.mongo_db]
        data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                      "Filtering Results")

        stats = phenotype_stats(str(self.job), True)
        intermediate_stats = phenotype_stats(str(self.job), False)
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_INTERMEDIATE_RESULTS",
                                      str(intermediate_stats["results"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_INTERMEDIATE_SUBJECTS",
                                      str(intermediate_stats["subjects"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_FINAL_RESULTS",
                                      str(stats["results"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_FINAL_SUBJECTS",
                                      str(stats["subjects"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_CACHE_QUERY_COUNTS",
                                      str(util.get_cache_query_count()))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_CACHE_COMPUTE_COUNTS",
                                      str(util.get_cache_compute_count()))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_CACHE_HIT_RATIO",
                                      str(util.get_cache_hit_ratio()))

        for k in util.properties.keys():
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.PROPERTIES + "_" + k,
                                          util.properties[k])

        with self.output().open('w') as outfile:
            phenotype_helper.write_phenotype_results(db, self.job, phenotype, self.phenotype,
                                                     self.phenotype)
            data_access.update_job_status(str(self.job), util.conn_string, data_access.COMPLETED,
                                          "Job completed successfully")
            outfile.write("DONE!")
            outfile.write('\n')
    except BulkWriteError as bwe:
        print(bwe.details)
        data_access.update_job_status(str(self.job), util.conn_string, data_access.WARNING,
                                      str(bwe.details))
    except Exception as ex:
        traceback.print_exc(file=sys.stdout)
        data_access.update_job_status(str(self.job), util.conn_string, data_access.FAILURE,
                                      str(ex))
        print(ex)
    finally:
        client.close()
def run(self):
    print('dependencies done; run phenotype reconciliation')
    client = util.mongo_client()

    try:
        data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                      "Finished Pipelines")

        phenotype = data_access.query_phenotype(int(self.phenotype), util.conn_string)
        print(phenotype)

        db = client[util.mongo_db]
        data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                      "Filtering Results")

        stats = phenotype_stats(str(self.job), True)
        intermediate_stats = phenotype_stats(str(self.job), False)
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_INTERMEDIATE_RESULTS",
                                      str(intermediate_stats["results"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_INTERMEDIATE_SUBJECTS",
                                      str(intermediate_stats["subjects"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_FINAL_RESULTS",
                                      str(stats["results"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_FINAL_SUBJECTS",
                                      str(stats["subjects"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_CACHE_QUERY_COUNTS",
                                      str(util.get_cache_query_count()))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_CACHE_COMPUTE_COUNTS",
                                      str(util.get_cache_compute_count()))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_CACHE_HIT_RATIO",
                                      str(util.get_cache_hit_ratio()))

        for k in util.properties.keys():
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.PROPERTIES + "_" + k,
                                          util.properties[k])

        with self.output().open('w') as outfile:
            phenotype_helper.write_phenotype_results(db, self.job, phenotype, self.phenotype,
                                                     self.phenotype)
            data_access.update_job_status(str(self.job), util.conn_string, data_access.COMPLETED,
                                          "Job completed successfully")
            outfile.write("DONE!")
            outfile.write('\n')
    except BulkWriteError as bwe:
        print(bwe.details)
        data_access.update_job_status(str(self.job), util.conn_string, data_access.WARNING,
                                      str(bwe.details))
    except Exception as ex:
        traceback.print_exc(file=sys.stdout)
        data_access.update_job_status(str(self.job), util.conn_string, data_access.FAILURE,
                                      str(ex))
        print(ex)
def run(self):
    print('dependencies done; run phenotype reconciliation')
    client = MongoClient(util.mongo_host, util.mongo_port)

    try:
        data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                      "Finished Pipelines")

        phenotype = data_access.query_phenotype(int(self.phenotype), util.conn_string)
        print(phenotype)

        db = client[util.mongo_db]
        data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                      "Filtering Results")

        with self.output().open('w') as outfile:
            phenotype_helper.write_phenotype_results(db, self.job, phenotype, self.phenotype,
                                                     self.phenotype)
            data_access.update_job_status(str(self.job), util.conn_string, data_access.COMPLETED,
                                          "Job completed successfully")
            outfile.write("DONE!")
            outfile.write('\n')
    except BulkWriteError as bwe:
        print(bwe.details)
        data_access.update_job_status(str(self.job), util.conn_string, data_access.WARNING,
                                      str(bwe.details))
    except Exception as ex:
        traceback.print_exc(file=sys.stdout)
        data_access.update_job_status(str(self.job), util.conn_string, data_access.FAILURE,
                                      str(ex))
        print(ex)
    finally:
        client.close()
def run(self):
    log('dependencies done; run phenotype reconciliation')
    client = util.mongo_client()

    try:
        data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                      "Finished Pipelines")

        phenotype = data_access.query_phenotype(int(self.phenotype), util.conn_string)
        # log(phenotype)

        db = client[util.mongo_db]
        data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                      "Filtering Results")

        stats = phenotype_stats(str(self.job), True)
        intermediate_stats = phenotype_stats(str(self.job), False)
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_INTERMEDIATE_RESULTS",
                                      str(intermediate_stats["results"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_INTERMEDIATE_SUBJECTS",
                                      str(intermediate_stats["subjects"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_FINAL_RESULTS",
                                      str(stats["results"]))
        data_access.update_job_status(str(self.job), util.conn_string,
                                      data_access.STATS + "_FINAL_SUBJECTS",
                                      str(stats["subjects"]))

        log("writing job stats....")
        log(json.dumps(stats, indent=4))

        # data_access.update_job_status(str(self.job), util.conn_string,
        #                               data_access.STATS + "_CACHE_QUERY_COUNTS",
        #                               str(util.get_cache_query_count()))
        # data_access.update_job_status(str(self.job), util.conn_string,
        #                               data_access.STATS + "_CACHE_COMPUTE_COUNTS",
        #                               str(util.get_cache_compute_count()))
        # data_access.update_job_status(str(self.job), util.conn_string,
        #                               data_access.STATS + "_CACHE_HIT_RATIO",
        #                               str(util.get_cache_hit_ratio()))

        for k in util.properties.keys():
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.PROPERTIES + "_" + k,
                                          util.properties[k])

        with self.output().open('w') as outfile:
            phenotype_helper.write_phenotype_results(db, self.job, phenotype, self.phenotype,
                                                     self.phenotype)

            # do tuple processing now that all tasks have completed
            succeeded = tuple_processor.process_tuples(db['phenotype_results'], int(self.job))
            if not succeeded:
                log('*** ERROR: tuple processing failed ***')

            # force all mongo writes to complete by calling fsync on the admin db,
            # then releasing the lock
            wrote_docs = False
            for tries in range(1, _MAX_ATTEMPTS):
                try:
                    with ILock(_LOCK_NAME, timeout=_LOCK_WAIT_SECS):
                        # only a SINGLE ClarityNLP process can execute this code at any time;
                        # force writes to disk by locking the Mongo admin database
                        log('*** Job {0}: FORCING MONGO WRITES ***'.format(self.job))
                        admin_db = client['admin']
                        fsync_result = admin_db.command('fsync', lock=True)
                        assert 1 == fsync_result['lockCount']
                        unlock_result = admin_db.command('fsyncUnlock')
                        assert 0 == unlock_result['lockCount']
                        log('*** Job {0}: ALL MONGO WRITES COMPLETED ***'.format(self.job))
                        wrote_docs = True
                except ILockException:
                    # timed out before acquiring the lock, will try again
                    pass
                if wrote_docs:
                    break

            if not wrote_docs:
                log('Job {0} failed to lock the Mongo admin database.'.format(self.job))

            data_access.update_job_status(str(self.job), util.conn_string, data_access.COMPLETED,
                                          "Job completed successfully")
            outfile.write("DONE!")
            outfile.write('\n')
            log("job {} done!".format(self.job))
    except BulkWriteError as bwe:
        log(bwe.details)
        data_access.update_job_status(str(self.job), util.conn_string, data_access.WARNING,
                                      str(bwe.details))
    except Exception as ex:
        traceback.print_exc(file=sys.stdout)
        data_access.update_job_status(str(self.job), util.conn_string, data_access.FAILURE,
                                      str(ex))
        log(ex)
    finally:
        client.close()
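# --- Illustrative aside, not part of the task above ---
# A minimal, standalone sketch of the fsync/lock pattern used in run() above, assuming a
# reachable MongoDB instance and pymongo; MONGO_HOST and MONGO_PORT are placeholders.
# The 'fsync' command with lock=True flushes pending writes and blocks new ones until
# 'fsyncUnlock' drops the lock count back to zero.
from pymongo import MongoClient

MONGO_HOST = 'localhost'   # assumption: local test instance
MONGO_PORT = 27017

client = MongoClient(MONGO_HOST, MONGO_PORT)
try:
    admin_db = client['admin']
    # flush all pending writes to disk and take the global write lock
    fsync_result = admin_db.command('fsync', lock=True)
    assert 1 == fsync_result['lockCount']
    # release the write lock once the flush has completed
    unlock_result = admin_db.command('fsyncUnlock')
    assert 0 == unlock_result['lockCount']
finally:
    client.close()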
def requires(self):
    register_tasks()
    tasks = list()
    pipeline_ids = data_access.query_pipeline_ids(int(self.phenotype), util.conn_string)
    phenotype_config = data_access.query_phenotype(int(self.phenotype), util.conn_string)
    phenotype_config['phenotype_id'] = int(self.phenotype)

    print("getting ready to execute pipelines...")
    actually_use_chaining = False
    print(pipeline_ids)
    if len(pipeline_ids) > 0:
        configs = dict()
        for pipeline_id in pipeline_ids:
            pipeline_config = data_access.get_pipeline_config(pipeline_id, util.conn_string)
            pipeline_config['pipeline_id'] = pipeline_id
            configs[pipeline_config['name']] = pipeline_config

        n = 0
        first_de = None
        secondary_des = list()
        if util.use_chained_queries == 'true':
            for op in phenotype_config['operations']:
                if op['action'] == 'AND':
                    actually_use_chaining = True
                    first_de = op['data_entities'][0]
                    first_pipeline = configs[first_de]
                    secondary_des = op['data_entities'][1:]

                    # declare a cohort that downselects to the first data entity's results
                    name = "DownselectedCohort" + str(n)
                    cohort = dict()
                    cohort['name'] = name
                    cohort['named_arguments'] = dict()
                    cohort['named_arguments']['pipeline_id'] = first_pipeline['pipeline_id']
                    cohort['declaration'] = 'cohort'
                    cohort['funct'] = 'getJobResults'
                    cohort['library'] = 'Clarity'

                    found = False
                    for c in phenotype_config['cohorts']:
                        if name == c['name']:
                            found = True
                    if not found:
                        phenotype_config['cohorts'].append(cohort)

                    for de in secondary_des:
                        secondary_pipeline = configs[de]
                        job_res_config = dict()
                        job_res_config['context'] = 'document'
                        job_res_config['pipeline_id'] = secondary_pipeline['pipeline_id']
                        secondary_pipeline['job_results'][name] = job_res_config
                        secondary_pipeline['chained_query'] = name
                        configs[de] = secondary_pipeline
                        update_pipeline_config(secondary_pipeline, util.conn_string)

                        for o, de2 in enumerate(phenotype_config['data_entities']):
                            if de != de2['name']:
                                continue
                            named_args = phenotype_config['data_entities'][o]['named_arguments']
                            # guard against a missing 'cohort' key before reading it
                            if 'cohort' not in named_args:
                                named_args['cohort'] = [name]
                            elif name not in named_args['cohort']:
                                named_args['cohort'].append(name)
                    n += 1

        phenotype_config.chained_queries = actually_use_chaining
        update_phenotype_model(phenotype_config, util.conn_string)

        for pipeline_config in configs.values():
            pipeline_id = pipeline_config['pipeline_id']
            if actually_use_chaining and first_de:
                if first_de == pipeline_config['name']:
                    tasks.append(PipelineTask(pipeline=pipeline_id, job=self.job, owner=self.owner,
                                              pipelinetype=pipeline_config.config_type))
                    dependent_pipeline_ids = list()
                    for de in secondary_des:
                        secondary_pipeline = configs[de]
                        dependent_pipeline_ids.append(secondary_pipeline['pipeline_id'])
                        # tasks.append(ChainedPipelineTask(pipeline=pipeline_id, job=self.job,
                        #                                  owner=self.owner,
                        #                                  pipelinetype=pipeline_config.config_type,
                        #                                  first_de=first_de,
                        #                                  dependent_pipeline_ids=dependent_pipeline_ids))
                        tasks.append(PipelineTask(pipeline=secondary_pipeline.pipeline_id,
                                                  job=self.job, owner=self.owner,
                                                  pipelinetype=secondary_pipeline.config_type))
            else:
                tasks.append(PipelineTask(pipeline=pipeline_id, job=self.job, owner=self.owner,
                                          pipelinetype=pipeline_config.config_type))

    print(tasks)
    return tasks