Example #1
from data_access import pipeline_config as config
from data_access import solr_data
from data_access import base_model
from algorithms.sec_tag import *
from cachetools import LRUCache, cached, keys

sentences_key = "sentence_attrs"
section_names_key = "section_name_attrs"
section_text_key = "section_text_attrs"
doc_fields = [
    'report_id', 'subject', 'report_date', 'report_type', 'source', 'solr_id'
]
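# In-process LRU caches; document_cache backs the @cached
# _get_document_by_id() lookup below.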
pipeline_cache = LRUCache(maxsize=5000)
document_cache = LRUCache(maxsize=5000)
init_cache = LRUCache(maxsize=1000)
segment = segmentation.Segmentation()


@cached(document_cache)
def _get_document_by_id(document_id):
    util.add_cache_compute_count()
    return solr_data.query_doc_by_id(document_id, solr_url=util.solr_url)


def get_document_by_id(document_id):
    doc = None

    if util.use_redis_caching == "true":
        util.add_cache_query_count()
        txt = util.get_from_redis_cache("doc:" + document_id)
        if not txt:
            # Cache miss (assumed continuation of this truncated excerpt):
            # fall back to the LRU-cached Solr lookup and populate Redis.
            doc = _get_document_by_id(document_id)
            util.write_to_redis_cache("doc:" + document_id, json.dumps(doc))
        else:
            doc = json.loads(txt)
    else:
        doc = _get_document_by_id(document_id)

    return doc
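The caching in Example #1 relies on cachetools' default key function: BaseTask.run() in Example #2 pre-seeds document_cache with keys.hashkey(doc_id), so the @cached lookup above returns the stored document without another Solr round trip. A minimal, self-contained sketch of that pattern (fetch_doc and its fake Solr body are illustrative stand-ins, not ClarityNLP code):

from cachetools import LRUCache, cached, keys

doc_cache = LRUCache(maxsize=5000)


@cached(doc_cache)
def fetch_doc(doc_id):
    # stand-in for solr_data.query_doc_by_id(); only runs on a cache miss
    print("querying Solr for", doc_id)
    return {"report_id": doc_id}


# Pre-seed the cache the way BaseTask.run() does for each Solr result.
doc_cache[keys.hashkey("1234")] = {"report_id": "1234", "cached": True}

fetch_doc("1234")   # served from the cache; no "querying Solr" output
fetch_doc("5678")   # miss: the decorated function executes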
Example #2
class BaseTask(luigi.Task):

    pipeline = luigi.IntParameter()
    job = luigi.IntParameter()
    start = luigi.IntParameter()
    solr_query = luigi.Parameter()
    batch = luigi.IntParameter()
    task_name = "ClarityNLPLuigiTask"
    docs = list()
    pipeline_config = config.PipelineConfig('', '')
    segment = segmentation.Segmentation()

    def run(self):
        task_family_name = str(self.task_family)
        if self.task_name == "ClarityNLPLuigiTask":
            self.task_name = task_family_name
        client = util.mongo_client()

        try:
            with self.output().open('w') as temp_file:
                temp_file.write("start writing custom task")
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running Batch %s" % self.batch)

                self.pipeline_config = config.get_pipeline_config(
                    self.pipeline, util.conn_string)
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS, "Running Solr query")
                self.docs = solr_data.query(
                    self.solr_query,
                    rows=util.row_count,
                    start=self.start,
                    solr_url=util.solr_url,
                    tags=self.pipeline_config.report_tags,
                    mapper_inst=util.report_mapper_inst,
                    mapper_url=util.report_mapper_url,
                    mapper_key=util.report_mapper_key,
                    types=self.pipeline_config.report_types,
                    sources=self.pipeline_config.sources,
                    filter_query=self.pipeline_config.filter_query,
                    cohort_ids=self.pipeline_config.cohort,
                    job_results_filters=self.pipeline_config.job_results)

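                # Pre-seed the in-process LRU cache and/or Redis with each
                # returned document so that later get_document_by_id() calls
                # can skip repeat Solr queries.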
                for d in self.docs:
                    doc_id = d[util.solr_report_id_field]
                    if util.use_memory_caching == "true":
                        k = keys.hashkey(doc_id)
                        document_cache[k] = d
                    if util.use_redis_caching == "true":
                        util.write_to_redis_cache("doc:" + doc_id,
                                                  json.dumps(d))
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running %s main task" % self.task_name)
                self.run_custom_task(temp_file, client)
                temp_file.write("Done writing custom task!")

            self.docs = list()
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   ''.join(traceback.format_stack()))
            print(ex)
        finally:
            client.close()

    def output(self):
        return luigi.LocalTarget(
            "%s/pipeline_job%s_%s_batch%s.txt" %
            (util.tmp_dir, str(self.job), self.task_name, str(self.start)))

    def set_name(self, name):
        self.task_name = name

    def write_result_data(self,
                          temp_file,
                          mongo_client,
                          doc,
                          data: dict,
                          prefix: str = '',
                          phenotype_final: bool = False):
        inserted = pipeline_mongo_writer(mongo_client,
                                         self.pipeline,
                                         self.task_name,
                                         self.job,
                                         self.batch,
                                         self.pipeline_config,
                                         doc,
                                         data,
                                         prefix=prefix)
        if temp_file is not None:
            temp_file.write(str(inserted))
            temp_file.write('\n')
        return inserted

    def write_multiple_result_data(self,
                                   temp_file,
                                   mongo_client,
                                   doc,
                                   data: list,
                                   prefix: str = ''):
        ids = list()
        for d in data:
            inserted = pipeline_mongo_writer(mongo_client,
                                             self.pipeline,
                                             self.task_name,
                                             self.job,
                                             self.batch,
                                             self.pipeline_config,
                                             doc,
                                             d,
                                             prefix=prefix)
            ids.append(inserted)
            if temp_file is not None:
                temp_file.write(str(inserted))
                temp_file.write('\n')

        return ids

    def write_log_data(self, job_status, status_message):
        jobs.update_job_status(str(self.job), util.conn_string, job_status,
                               status_message)

    def run_custom_task(self, temp_file, mongo_client: MongoClient):
        print("Implement your custom functionality here ")

    def get_document_text(self, doc, clean=True):
        if doc and util.solr_text_field in doc:
            txt = doc[util.solr_text_field]
            if type(txt) == str:
                txt_val = txt
            elif type(txt) == list:
                txt_val = ' '.join(txt)
            else:
                txt_val = str(txt)

            if clean:
                return txt_val.encode("ascii", errors="ignore").decode()
            else:
                return txt_val
        else:
            return ''

    def get_boolean(self, key, default=False):
        return get_config_boolean(self.pipeline_config, key, default=default)

    def get_integer(self, key, default=-1):
        return get_config_integer(self.pipeline_config, key, default=default)

    def get_string(self, key, default=''):
        return get_config_string(self.pipeline_config, key, default=default)

    def get_document_sentences(self, doc):
        return document_sentences(doc)

    def get_document_sections(self, doc):
        names, section_texts = document_sections(doc)
        return names, section_texts
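To use BaseTask, a pipeline task only overrides run_custom_task() and emits rows through the write_* helpers; Solr paging, caching, and job-status updates are inherited. A minimal sketch, assuming a hypothetical TermCountTask and a 'term' key in the pipeline configuration:

class TermCountTask(BaseTask):
    # Illustrative subclass; the class name and the 'term' config key are
    # not part of ClarityNLP.
    task_name = "TermCountTask"

    def run_custom_task(self, temp_file, mongo_client):
        term = self.get_string('term', default='pneumonia')

        for doc in self.docs:
            sentences = self.get_document_sentences(doc)
            matches = [s for s in sentences if term in s.lower()]
            if matches:
                # one result row per matching document
                self.write_result_data(temp_file, mongo_client, doc, {
                    'term': term,
                    'match_count': len(matches),
                    'first_match': matches[0],
                })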
Example #3
class BaseTask(luigi.Task):

    pipeline = luigi.IntParameter()
    job = luigi.IntParameter()
    start = luigi.IntParameter()
    solr_query = luigi.Parameter()
    batch = luigi.IntParameter()
    task_name = "ClarityNLPLuigiTask"
    docs = list()
    pipeline_config = config.PipelineConfig('', '')
    segment = segmentation.Segmentation()

    def run(self):
        task_family_name = str(self.task_family)
        if self.task_name == "ClarityNLPLuigiTask":
            self.task_name = task_family_name
        client = MongoClient(util.mongo_host, util.mongo_port)

        try:
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.IN_PROGRESS,
                                   "Running Batch %s" % self.batch)

            self.pipeline_config = config.get_pipeline_config(
                self.pipeline, util.conn_string)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.IN_PROGRESS, "Running Solr query")
            self.docs = solr_data.query(
                self.solr_query,
                rows=util.row_count,
                start=self.start,
                solr_url=util.solr_url,
                tags=self.pipeline_config.report_tags,
                mapper_inst=util.report_mapper_inst,
                mapper_url=util.report_mapper_url,
                mapper_key=util.report_mapper_key,
                types=self.pipeline_config.report_types,
                filter_query=self.pipeline_config.filter_query)

            with self.output().open('w') as temp_file:
                jobs.update_job_status(str(self.job), util.conn_string,
                                       jobs.IN_PROGRESS,
                                       "Running %s main task" % self.task_name)
                self.run_custom_task(temp_file, client)
                temp_file.write("Done writing custom task!")

            self.docs = list()
        except Exception as ex:
            traceback.print_exc(file=sys.stderr)
            jobs.update_job_status(str(self.job), util.conn_string,
                                   jobs.WARNING,
                                   ''.join(traceback.format_stack()))
            print(ex)
        finally:
            client.close()

    def output(self):
        return luigi.LocalTarget(
            "%s/pipeline_job%s_%s_batch%s.txt" %
            (util.tmp_dir, str(self.job), self.task_name, str(self.start)))

    def set_name(self, name):
        self.task_name = name

    def write_result_data(self, temp_file, mongo_client, doc, data: dict):
        inserted = pipeline_mongo_writer(mongo_client, self.pipeline,
                                         self.task_name, self.job, self.batch,
                                         self.pipeline_config, doc, data)
        if temp_file is not None:
            temp_file.write(str(inserted))
            temp_file.write('\n')
        return inserted

    def write_multiple_result_data(self, temp_file, mongo_client, doc,
                                   data: list):
        ids = list()
        for d in data:
            inserted = pipeline_mongo_writer(mongo_client, self.pipeline,
                                             self.task_name, self.job,
                                             self.batch, self.pipeline_config,
                                             doc, d)
            ids.append(inserted)
            if temp_file is not None:
                temp_file.write(str(inserted))
                temp_file.write('\n')

        return ids

    def write_log_data(self, job_status, status_message):
        jobs.update_job_status(str(self.job), util.conn_string, job_status,
                               status_message)

    def run_custom_task(self, temp_file, mongo_client: MongoClient):
        print("Implement your custom functionality here ")

    def get_document_text(self, doc):
        if doc and util.solr_text_field in doc:
            return doc[util.solr_text_field]
        else:
            return ''

    def get_document_sentences(self, doc):
        txt = self.get_document_text(doc)
        sentence_list = self.segment.parse_sentences(txt)
        return sentence_list
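Both variants are ordinary Luigi tasks, so a concrete subclass (for instance the TermCountTask sketched after Example #2) can be driven with Luigi's local scheduler; the parameter values below are placeholders:

import luigi

luigi.build(
    [TermCountTask(pipeline=1,
                   job=42,
                   start=0,
                   solr_query='report_type:"Discharge summary"',
                   batch=0)],
    local_scheduler=True,
    workers=1)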