Example #1
    def run(self):
        task_family_name = str(self.task_family)
        if self.task_name == "ClarityNLPLuigiTask":
            self.task_name = task_family_name
        client = util.mongo_client()

        try:
            with self.output().open('w') as temp_file:
                temp_file.write("start writing custom task")
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running Batch %s" % self.batch)

                self.pipeline_config = config.get_pipeline_config(
                    self.pipeline, util.conn_string)
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS, "Running Solr query")
                self.docs = solr_data.query(
                    self.solr_query,
                    rows=util.row_count,
                    start=self.start,
                    solr_url=util.solr_url,
                    tags=self.pipeline_config.report_tags,
                    mapper_inst=util.report_mapper_inst,
                    mapper_url=util.report_mapper_url,
                    mapper_key=util.report_mapper_key,
                    types=self.pipeline_config.report_types,
                    sources=self.pipeline_config.sources,
                    filter_query=self.pipeline_config.filter_query,
                    cohort_ids=self.pipeline_config.cohort,
                    job_results_filters=self.pipeline_config.job_results)

                for d in self.docs:
                    doc_id = d[util.solr_report_id_field]
                    if util.use_memory_caching == "true":
                        k = keys.hashkey(doc_id)
                        document_cache[k] = d
                    if util.use_redis_caching == "true":
                        util.write_to_redis_cache("doc:" + doc_id,
                                                  json.dumps(d))
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running %s main task" % self.task_name)
                self.run_custom_task(temp_file, client)
                temp_file.write("Done writing custom task!")

            self.docs = list()
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            # Record the exception traceback itself; format_stack() would
            # capture only the current call stack, without the error.
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   traceback.format_exc())
            print(ex)
        finally:
            client.close()
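For context, subclasses of this Luigi task implement run_custom_task, which receives the open output file and the Mongo client (see the call above). A minimal hypothetical subclass; ClarityNLPLuigiTask and util come from the surrounding ClarityNLP code, everything else here is illustrative:

    # Hypothetical subclass sketch, not part of the source.
    class SampleCustomTask(ClarityNLPLuigiTask):
        task_name = "SampleCustomTask"

        def run_custom_task(self, temp_file, mongo_client):
            # self.docs was populated by the Solr query in run() above.
            for doc in self.docs:
                temp_file.write("processed %s\n" % doc[util.solr_report_id_field])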
Example #2
def get_race_data(document_id):
    if util.use_redis_caching == "true":
        util.add_cache_query_count()
        key = "raceFinder:" + str(document_id)
        res = util.get_from_redis_cache(key)
        if res:
            return json.loads(res)
        else:
            util.add_cache_compute_count()
            res2 = get_race_for_doc(document_id)
            util.write_to_redis_cache(key, json.dumps(res2))
            return res2
    elif util.use_memory_caching == "true":
        util.add_cache_query_count()
        return _get_race_data(document_id)
    else:
        return get_race_for_doc(document_id)
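Both get_race_data and the document-caching loop in Example #1 follow the same cache-aside pattern: count the lookup, try the cache, and on a miss compute the value and write it back. A generic sketch of that pattern, assuming the same util helpers shown above (the cached helper itself is hypothetical):

    import json

    def cached(key, compute):
        # Cache-aside lookup: return the cached JSON value for key,
        # or compute, store, and return it on a miss.
        util.add_cache_query_count()
        res = util.get_from_redis_cache(key)
        if res:
            return json.loads(res)
        util.add_cache_compute_count()
        value = compute()
        util.write_to_redis_cache(key, json.dumps(value))
        return value

With it, the Redis branch above collapses to:

    return cached("raceFinder:" + str(document_id),
                  lambda: get_race_for_doc(document_id))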
Example #4
def get_document_by_id(document_id):
    doc = None

    if util.use_redis_caching == "true":
        util.add_cache_query_count()
        txt = util.get_from_redis_cache("doc:" + document_id)
        if not txt:
            util.add_cache_compute_count()
            doc = solr_data.query_doc_by_id(document_id,
                                            solr_url=util.solr_url)
            util.write_to_redis_cache("doc:" + document_id, json.dumps(doc))
        else:
            doc = json.loads(txt)
    elif util.use_memory_caching == "true":
        util.add_cache_query_count()
        doc = _get_document_by_id(document_id)

    if not doc:
        return solr_data.query_doc_by_id(document_id, solr_url=util.solr_url)
    else:
        return doc
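A hypothetical caller, assuming the returned Solr document exposes a report_text field (the field name is illustrative):

    doc = get_document_by_id('9001234')
    report_text = doc.get('report_text', '') if doc else ''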
Example #5
def get_job_performance(job_ids: list, connection_string: str):
    if not job_ids:
        return dict()

    conn = psycopg2.connect(connection_string)
    cursor = conn.cursor()
    metrics = dict()

    try:
        # One placeholder per job id, e.g. '%s, %s, %s'.
        in_clause = ', '.join(['%s'] * len(job_ids))
        cursor.execute(
            """
            SELECT status, nlp_job_id from nlp.nlp_job 
            where nlp_job_id in ({})
            """.format(in_clause), job_ids)
        statuses = cursor.fetchall()
        for status, job_id in statuses:
            metrics[job_id] = {
                "status": status,
                "final_results": 0,
                "final_subjects": 0,
                "intermediate_results": 0,
                "intermediate_subjects": 0,
                "counts_found": 0
            }

        cursor.execute(
            """
            SELECT status, description, date_updated, nlp_job_id from nlp.nlp_job_status
            where nlp_job_id in ({})
            order by date_updated
            """.format(in_clause), job_ids)
        updates = cursor.fetchall()

        for row in updates:
            # Columns: status, description, date_updated, nlp_job_id.
            status_name, status_value, status_date, job_id = row

            performance = metrics[job_id]
            counts_found = performance.get('counts_found', 0)

            if status_name == 'STATS_FINAL_SUBJECTS':
                performance['final_subjects'] = status_value
                counts_found += int(status_value)
            elif status_name == 'STATS_FINAL_RESULTS':
                performance['final_results'] = status_value
                counts_found += int(status_value)
            elif status_name == 'STATS_INTERMEDIATE_SUBJECTS':
                performance['intermediate_subjects'] = status_value
            elif status_name == 'STATS_INTERMEDIATE_RESULTS':
                performance['intermediate_results'] = status_value

            performance['counts_found'] = counts_found
            metrics[job_id] = performance

        for k in metrics:
            performance = metrics[k]
            counts_found = performance.get('counts_found', 0)
            # Check this job's own status; the status variable from the
            # first loop holds only the last row's value.
            if counts_found == 0 and performance.get('status') == COMPLETED:
                final_subjects = util.get_from_redis_cache(
                    'final_subjects_{}'.format(k))
                final_results = util.get_from_redis_cache(
                    'final_results_{}'.format(k))
                if final_results and final_subjects:
                    performance['final_subjects'] = final_subjects
                    performance['final_results'] = final_results
                else:
                    stats = phenotype_stats(str(k), True)

                    performance['final_subjects'] = stats["subjects"]
                    performance['final_results'] = stats["results"]

                    util.write_to_redis_cache('final_subjects_{}'.format(k),
                                              stats["subjects"])
                    util.write_to_redis_cache('final_results_{}'.format(k),
                                              stats["results"])

                int_subjects = util.get_from_redis_cache(
                    'intermediate_subjects_{}'.format(k))
                int_results = util.get_from_redis_cache(
                    'intermediate_results_{}'.format(k))
                if int_subjects and int_results:
                    performance['intermediate_subjects'] = int_subjects
                    performance['intermediate_results'] = int_results
                else:
                    intermediate_stats = phenotype_stats(str(k), False)

                    performance['intermediate_subjects'] = intermediate_stats[
                        "subjects"]
                    performance['intermediate_results'] = intermediate_stats[
                        "results"]

                    util.write_to_redis_cache(
                        'intermediate_subjects_{}'.format(k),
                        intermediate_stats["subjects"])
                    util.write_to_redis_cache(
                        'intermediate_results_{}'.format(k),
                        intermediate_stats["results"])

            if 'counts_found' in performance:
                del performance['counts_found']
            metrics[k] = performance
    except Exception:
        traceback.print_exc(file=sys.stderr)
    finally:
        conn.close()

    return metrics
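A hypothetical call, assuming a reachable Postgres instance holding the nlp schema (the connection string is illustrative):

    metrics = get_job_performance(
        [101, 102],
        'host=localhost dbname=clarity user=clarity password=clarity')
    for job_id, perf in metrics.items():
        print(job_id, perf['status'], perf['final_results'])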