Example #1
    def requires(self):
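        # Luigi's dependency hook: build one PipelineTask per pipeline
        # attached to this phenotype so every pipeline runs before run()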
        register_tasks()
        tasks = list()
        pipeline_ids = data_access.query_pipeline_ids(int(self.phenotype),
                                                      util.conn_string)
        phenotype_config = data_access.query_phenotype(int(self.phenotype),
                                                       util.conn_string)
        phenotype_config['phenotype_id'] = int(self.phenotype)

        log("getting ready to execute pipelines...")
        log(pipeline_ids)
        if len(pipeline_ids) > 0:
            configs = dict()
            for pipeline_id in pipeline_ids:
                pipeline_config = data_access.get_pipeline_config(
                    pipeline_id, util.conn_string)
                pipeline_config['pipeline_id'] = pipeline_id
                configs[pipeline_config['name']] = pipeline_config

            update_phenotype_model(phenotype_config, util.conn_string)
            for pipeline_config in configs.values():
                pipeline_id = pipeline_config['pipeline_id']
                tasks.append(
                    PipelineTask(pipeline=pipeline_id,
                                 job=self.job,
                                 owner=self.owner,
                                 pipelinetype=pipeline_config.config_type))
        log(tasks)

        return tasks

    def run(self):
        print('dependencies done; run phenotype reconciliation')
        client = MongoClient(util.mongo_host, util.mongo_port)
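        # job status and metrics are recorded through data_access using
        # util.conn_string; the pipeline result documents live in MongoDB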

        try:
            data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                          "Finished Pipelines")

            phenotype = data_access.query_phenotype(int(self.phenotype), util.conn_string)
            print(phenotype)

            db = client[util.mongo_db]

            data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                          "Filtering Results")

            stats = phenotype_stats(str(self.job), True)
            intermediate_stats = phenotype_stats(str(self.job), False)
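            # the boolean passed to phenotype_stats above selects final (True)
            # vs. intermediate (False) counts; the STATS_* keys below record both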
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_INTERMEDIATE_RESULTS",
                                          str(intermediate_stats["results"]))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_INTERMEDIATE_SUBJECTS",
                                          str(intermediate_stats["subjects"]))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_FINAL_RESULTS",
                                          str(stats["results"]))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_FINAL_SUBJECTS",
                                          str(stats["subjects"]))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_CACHE_QUERY_COUNTS",
                                          str(util.get_cache_query_count()))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_CACHE_COMPUTE_COUNTS",
                                          str(util.get_cache_compute_count()))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_CACHE_HIT_RATIO",
                                          str(util.get_cache_hit_ratio()))

            for k in util.properties.keys():
                data_access.update_job_status(str(self.job), util.conn_string, data_access.PROPERTIES + "_" + k,
                                              util.properties[k])
            with self.output().open('w') as outfile:
                phenotype_helper.write_phenotype_results(db, self.job, phenotype, self.phenotype, self.phenotype)
                data_access.update_job_status(str(self.job), util.conn_string, data_access.COMPLETED,
                                              "Job completed successfully")
                outfile.write("DONE!")
                outfile.write('\n')
        except BulkWriteError as bwe:
            print(bwe.details)
            data_access.update_job_status(str(self.job), util.conn_string, data_access.WARNING, str(bwe.details))
        except Exception as ex:
            traceback.print_exc(file=sys.stdout)
            data_access.update_job_status(str(self.job), util.conn_string, data_access.FAILURE, str(ex))
            print(ex)
        finally:
            client.close()
Example #3
    def run(self):
        print('dependencies done; run phenotype reconciliation')
        client = util.mongo_client()
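        # unlike Example #1, the client comes from a shared util.mongo_client()
        # factory rather than a direct MongoClient(host, port) call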

        try:
            data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                          "Finished Pipelines")

            phenotype = data_access.query_phenotype(int(self.phenotype), util.conn_string)
            print(phenotype)

            db = client[util.mongo_db]

            data_access.update_job_status(str(self.job), util.conn_string, data_access.IN_PROGRESS,
                                          "Filtering Results")

            stats = phenotype_stats(str(self.job), True)
            intermediate_stats = phenotype_stats(str(self.job), False)
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_INTERMEDIATE_RESULTS",
                                          str(intermediate_stats["results"]))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_INTERMEDIATE_SUBJECTS",
                                          str(intermediate_stats["subjects"]))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_FINAL_RESULTS",
                                          str(stats["results"]))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_FINAL_SUBJECTS",
                                          str(stats["subjects"]))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_CACHE_QUERY_COUNTS",
                                          str(util.get_cache_query_count()))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_CACHE_COMPUTE_COUNTS",
                                          str(util.get_cache_compute_count()))
            data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_CACHE_HIT_RATIO",
                                          str(util.get_cache_hit_ratio()))

            for k in util.properties.keys():
                data_access.update_job_status(str(self.job), util.conn_string, data_access.PROPERTIES + "_" + k,
                                              util.properties[k])
            with self.output().open('w') as outfile:
                phenotype_helper.write_phenotype_results(db, self.job, phenotype, self.phenotype, self.phenotype)
                data_access.update_job_status(str(self.job), util.conn_string, data_access.COMPLETED,
                                              "Job completed successfully")
                outfile.write("DONE!")
                outfile.write('\n')
        except BulkWriteError as bwe:
            print(bwe.details)
            data_access.update_job_status(str(self.job), util.conn_string, data_access.WARNING, str(bwe.details))
        except Exception as ex:
            traceback.print_exc(file=sys.stdout)
            data_access.update_job_status(str(self.job), util.conn_string, data_access.FAILURE, str(ex))
            print(ex)
        finally:
            client.close()
Example #4
    def run(self):
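        # condensed variant of the run() methods above: the same status flow
        # and result write, without the stats/properties bookkeeping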
        print('dependencies done; run phenotype reconciliation')
        client = MongoClient(util.mongo_host, util.mongo_port)

        try:
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.IN_PROGRESS,
                                          "Finished Pipelines")

            phenotype = data_access.query_phenotype(int(self.phenotype),
                                                    util.conn_string)
            print(phenotype)

            db = client[util.mongo_db]

            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.IN_PROGRESS,
                                          "Filtering Results")

            with self.output().open('w') as outfile:
                phenotype_helper.write_phenotype_results(
                    db, self.job, phenotype, self.phenotype, self.phenotype)
                data_access.update_job_status(str(self.job), util.conn_string,
                                              data_access.COMPLETED,
                                              "Job completed successfully")
                outfile.write("DONE!")
                outfile.write('\n')
        except BulkWriteError as bwe:
            print(bwe.details)
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.WARNING,
                                          str(bwe.details))
        except Exception as ex:
            traceback.print_exc(file=sys.stdout)
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.FAILURE, str(ex))
            print(ex)
        finally:
            client.close()
Example #5
    def run(self):
        log('dependencies done; run phenotype reconciliation')
        client = util.mongo_client()

        try:
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.IN_PROGRESS,
                                          "Finished Pipelines")

            phenotype = data_access.query_phenotype(int(self.phenotype),
                                                    util.conn_string)
            # log(phenotype)

            db = client[util.mongo_db]

            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.IN_PROGRESS,
                                          "Filtering Results")

            stats = phenotype_stats(str(self.job), True)
            intermediate_stats = phenotype_stats(str(self.job), False)
            data_access.update_job_status(
                str(self.job), util.conn_string,
                data_access.STATS + "_INTERMEDIATE_RESULTS",
                str(intermediate_stats["results"]))
            data_access.update_job_status(
                str(self.job), util.conn_string,
                data_access.STATS + "_INTERMEDIATE_SUBJECTS",
                str(intermediate_stats["subjects"]))
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.STATS + "_FINAL_RESULTS",
                                          str(stats["results"]))
            data_access.update_job_status(
                str(self.job), util.conn_string,
                data_access.STATS + "_FINAL_SUBJECTS", str(stats["subjects"]))
            log("writing job stats....")
            log(json.dumps(stats, indent=4))
            # data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_CACHE_QUERY_COUNTS",
            #                               str(util.get_cache_query_count()))
            # data_access.update_job_status(str(self.job), util.conn_string,data_access.STATS + "_CACHE_COMPUTE_COUNTS",
            #                               str(util.get_cache_compute_count()))
            # data_access.update_job_status(str(self.job), util.conn_string, data_access.STATS + "_CACHE_HIT_RATIO",
            #                               str(util.get_cache_hit_ratio()))

            for k in util.properties.keys():
                data_access.update_job_status(str(self.job), util.conn_string,
                                              data_access.PROPERTIES + "_" + k,
                                              util.properties[k])
            with self.output().open('w') as outfile:
                phenotype_helper.write_phenotype_results(
                    db, self.job, phenotype, self.phenotype, self.phenotype)

                # do tuple processing now that all tasks have completed
                succeeded = tuple_processor.process_tuples(
                    db['phenotype_results'], int(self.job))
                if not succeeded:
                    log('*** ERROR: tuple processing failed ***')

                # force all mongo writes to complete by calling fsync on the admin db, then releasing the lock

                wrote_docs = False
                for tries in range(_MAX_ATTEMPTS):

                    try:
                        with ILock(_LOCK_NAME, timeout=_LOCK_WAIT_SECS):

                            # only a SINGLE ClarityNLP process can execute this code at any time

                            # force writes to disk by locking the Mongo admin database
                            log('*** Job {0}: FORCING MONGO WRITES ***'.format(
                                self.job))

                            admin_db = client['admin']
                            fsync_result = admin_db.command('fsync', lock=True)
                            assert 1 == fsync_result['lockCount']
                            unlock_result = admin_db.command('fsyncUnlock')
                            assert 0 == unlock_result['lockCount']

                            log('*** Job {0}: ALL MONGO WRITES COMPLETED ***'.
                                format(self.job))

                            wrote_docs = True

                    except ILockException:
                        # timed out before acquiring the lock, will try again
                        pass

                    if wrote_docs:
                        break

                if not wrote_docs:
                    log('Job {0} failed to lock the Mongo admin database.'.
                        format(self.job))

                data_access.update_job_status(str(self.job), util.conn_string,
                                              data_access.COMPLETED,
                                              "Job completed successfully")
                outfile.write("DONE!")
                outfile.write('\n')

            log("job {} done!".format(self.job))
        except BulkWriteError as bwe:
            log(bwe.details)
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.WARNING,
                                          str(bwe.details))
        except Exception as ex:
            traceback.print_exc(file=sys.stdout)
            data_access.update_job_status(str(self.job), util.conn_string,
                                          data_access.FAILURE, str(ex))
            log(ex)
        finally:
            client.close()

    def requires(self):
        register_tasks()
        tasks = list()
        pipeline_ids = data_access.query_pipeline_ids(int(self.phenotype), util.conn_string)
        phenotype_config = data_access.query_phenotype(int(self.phenotype), util.conn_string)
        phenotype_config['phenotype_id'] = int(self.phenotype)

        print("getting ready to execute pipelines...")
        actually_use_chaining = False
        print(pipeline_ids)
        if len(pipeline_ids) > 0:
            configs = dict()
            for pipeline_id in pipeline_ids:
                pipeline_config = data_access.get_pipeline_config(pipeline_id, util.conn_string)
                pipeline_config['pipeline_id'] = pipeline_id
                configs[pipeline_config['name']] = pipeline_config

            n = 0
            first_de = None
            secondary_des = list()
            if util.use_chained_queries == 'true':
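                # chained queries: for each AND operation, run the first data
                # entity's pipeline alone, publish its results as a generated
                # "DownselectedCohort<n>" cohort, and point every remaining
                # entity's pipeline at that cohort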
                for op in phenotype_config['operations']:
                    if op['action'] == 'AND':
                        actually_use_chaining = True
                        first_de = op['data_entities'][0]
                        first_pipeline = configs[first_de]
                        secondary_des = op['data_entities'][1:]
                        name = "DownselectedCohort" + str(n)

                        cohort = dict()
                        cohort['name'] = name
                        cohort['named_arguments'] = dict()
                        cohort['named_arguments']['pipeline_id'] = first_pipeline['pipeline_id']
                        cohort['declaration'] = 'cohort'
                        cohort['funct'] = 'getJobResults'
                        cohort['library'] = 'Clarity'

                        found = False
                        for c in phenotype_config['cohorts']:
                            if name == c['name']:
                                found = True
                        if not found:
                            phenotype_config['cohorts'].append(cohort)
                        for de in secondary_des:
                            secondary_pipeline = configs[de]
                            job_res_config = dict()
                            job_res_config['context'] = 'document'
                            job_res_config['pipeline_id'] = secondary_pipeline['pipeline_id']
                            secondary_pipeline['job_results'][name] = job_res_config
                            secondary_pipeline['chained_query'] = name
                            configs[de] = secondary_pipeline
                            update_pipeline_config(secondary_pipeline, util.conn_string)
                            # attach the downselect cohort to the matching
                            # data entity; check for the 'cohort' key before
                            # reading it, and avoid duplicate entries
                            for de2 in phenotype_config['data_entities']:
                                if de == de2['name']:
                                    named_args = de2['named_arguments']
                                    if 'cohort' not in named_args:
                                        named_args['cohort'] = [name]
                                    elif name not in named_args['cohort']:
                                        named_args['cohort'].append(name)
                        n += 1

                phenotype_config.chained_queries = actually_use_chaining

            update_phenotype_model(phenotype_config, util.conn_string)
            for pipeline_config in configs.values():
                pipeline_id = pipeline_config['pipeline_id']
                if actually_use_chaining and first_de:
                    if first_de == pipeline_config['name']:
                        tasks.append(PipelineTask(pipeline=pipeline_id, job=self.job, owner=self.owner,
                                                  pipelinetype=pipeline_config.config_type))
                        dependent_pipeline_ids = list()
                        for de in secondary_des:
                            secondary_pipeline = configs[de]
                            dependent_pipeline_ids.append(secondary_pipeline['pipeline_id'])
                        # tasks.append(ChainedPipelineTask(pipeline=pipeline_id, job=self.job, owner=self.owner,
                        #                                  pipelinetype=pipeline_config.config_type, first_de=first_de,
                        #                                  dependent_pipeline_ids=
                        #                                  dependent_pipeline_ids))
                            tasks.append(PipelineTask(pipeline=secondary_pipeline.pipeline_id, job=self.job,
                                                      owner=self.owner,
                                                      pipelinetype=secondary_pipeline.config_type))

                else:
                    tasks.append(PipelineTask(pipeline=pipeline_id, job=self.job, owner=self.owner,
                                              pipelinetype=pipeline_config.config_type))
        print(tasks)

        return tasks
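
For context, each requires()/run() pair above belongs to a Luigi task: Luigi
runs every task returned by requires() before invoking run(), and run()
writes a sentinel through self.output() so the scheduler can tell the job is
complete. A minimal sketch of the enclosing class might look like this (the
class name, parameter types, and the target path are assumptions, not taken
from the examples above):

    import luigi

    class PhenotypeTask(luigi.Task):
        # hypothetical parameter declarations matching the attributes the
        # examples reference (self.phenotype, self.job, self.owner)
        phenotype = luigi.IntParameter()
        job = luigi.IntParameter()
        owner = luigi.Parameter()

        def output(self):
            # hypothetical target; run() only needs a writable target in
            # which to record the "DONE!" sentinel
            return luigi.LocalTarget('phenotype_job_{0}.done'.format(self.job))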