def copy_docs(index_setting_file,
              src_index,
              src_doc_type,
              entity_id_field_name,
              dest_index,
              dest_doc_type,
              patient_list_file,
              thread_num=30):
    """
    Run do_copy_doc over every entry read from patient_list_file.

    :param index_setting_file: settings file passed to
        EntityCentricES.get_instance
    :param src_index: forwarded unchanged to do_copy_doc
    :param src_doc_type: forwarded unchanged to do_copy_doc
    :param entity_id_field_name: forwarded unchanged to do_copy_doc
    :param dest_index: forwarded unchanged to do_copy_doc
    :param dest_doc_type: forwarded unchanged to do_copy_doc
    :param patient_list_file: text file read with utils.read_text_file; its
        entries are the work items (presumably one patient id per entry -
        TODO confirm)
    :param thread_num: thread count handed to utils.multi_thread_tasking
    :return: None
    """
    # everything every worker call needs besides its own work item
    worker_args = [
        EntityCentricES.get_instance(index_setting_file),
        src_index,
        src_doc_type,
        entity_id_field_name,
        dest_index,
        dest_doc_type,
    ]
    work_items = utils.read_text_file(patient_list_file)
    utils.multi_thread_tasking(work_items,
                               thread_num,
                               do_copy_doc,
                               args=worker_args)
    print('all done')
Ejemplo n.º 2
0
def run_parallel_prediction(settings):
    """
    Run do_one_doc over every row returned by settings['sql_docs4process'].

    :param settings: mapping of job settings. Keys read here:
        concept_mapping_file, pattern_folder, dbconf, phenotypes, model_dir,
        sql_docs4process, num_threads, sql_text_ptn, sql_ann_ptn,
        save_result_sql_ptn, update_doc_sql_ptn
    :return: None
    """
    concept_mapping = Concept2Mapping(settings['concept_mapping_file'])
    # NOTE(review): the MentionPattern instance is built and then replaced by
    # None straight away, so the workers always receive None - confirm whether
    # this is a deliberate switch-off or a debugging leftover.
    mp_inst = mp.MentionPattern(settings['pattern_folder'],
                                concept_mapping.cui2label,
                                in_action=True)
    mp_inst = None

    pool = du.get_mysql_pooling(settings['dbconf'], num=30)
    docs = []
    factory = ModelFactory(settings['phenotypes'], settings['model_dir'])
    du.query_data(settings['sql_docs4process'], pool=pool, container=docs)

    thread_count = settings['num_threads']
    worker_args = [
        factory,
        concept_mapping,
        mp_inst,
        pool,
        settings['sql_text_ptn'],
        settings['sql_ann_ptn'],
        settings['save_result_sql_ptn'],
        settings['update_doc_sql_ptn'],
    ]
    utils.multi_thread_tasking(docs,
                               num_threads=thread_count,
                               process_func=do_one_doc,
                               args=worker_args)
    logging.info('#docs: %s all done' % len(docs))
Ejemplo n.º 3
0
def update_mimic_doc_dates(doc_dates):
    """
    Run do_doc_update_date over every item of doc_dates with 20 threads.

    :param doc_dates: work items handed to utils.multi_thread_tasking
    :return: None
    """
    mimic_es = EntityCentricES.get_instance(
        './index_settings/es_mimic_setting.json')
    # list shared with the workers; nothing in this function reads it back
    shared = []
    utils.multi_thread_tasking(doc_dates,
                               20,
                               do_doc_update_date,
                               args=[mimic_es, shared])
Ejemplo n.º 4
0
 def collect_result(self, output_file, graph_file_path):
     """
     Collect annotations of the files under self._doc_pth and save per-patient
     concept counts plus a graph derived from them.

     :param output_file: destination for the {'concepts': ..., 'p2c': ...}
         structure (written with utils.save_json_array)
     :param graph_file_path: destination for DocCohort.result_to_graph output
     :return: None
     """
     # (file name, doc id) pairs for every regular file matching the pattern
     f_did = []
     for fname in listdir(self._doc_pth):
         if not isfile(join(self._doc_pth, fname)):
             continue
         matched = re.search(self._did_pattern, fname, re.IGNORECASE)
         if matched:
             f_did.append((fname, matched.group(1)))

     results = []
     logging.info('collecting results ...')
     utils.multi_thread_tasking(
         lst=f_did,
         num_threads=10,
         process_func=DocCohort.collect_doc_anns_by_types,
         args=[self._doc_pth, self.collect_semantic_types, results])
     logging.info('total anns collected %s' % len(results))

     concepts = {}
     p2c = {}
     ret = {'concepts': concepts, 'p2c': p2c}
     for ann in results:
         doc_id = ann['d']
         if doc_id not in self._d2p:
             logging.error('doc %s not in cohort map' % doc_id)
             continue
         patient = self._d2p[doc_id]
         if patient not in p2c:
             p2c[patient] = {}
         counts = p2c[patient]
         cui = ann['cui']
         # remember the preferred label the first time a concept shows up
         if cui not in concepts:
             concepts[cui] = ann['pref']
         counts[cui] = counts[cui] + 1 if cui in counts else 1

     utils.save_json_array(ret, output_file)
     utils.save_json_array(DocCohort.result_to_graph(ret), graph_file_path)
     logging.info('result collected')
Ejemplo n.º 5
0
def query_doc_anns(es,
                   concepts,
                   skip_terms,
                   retained_patients_filter=None,
                   filter_obj=None,
                   doc_filter_function=None):
    """
    Search patients matching the concepts and merge the per-patient
    annotation dictionaries produced by collect_patient_docs.

    :param es: object providing search_by_scroll and patient_type
    :param concepts: strings joined with spaces to form the search query
    :param skip_terms: forwarded to collect_patient_docs
    :param retained_patients_filter: optional container of patient ids; when
        given, only hits whose '_id' is in it are kept
    :param filter_obj: forwarded to collect_patient_docs
    :param doc_filter_function: forwarded to collect_patient_docs
    :return: dict obtained by update()-ing every collected dict in turn
    """
    patients = es.search_by_scroll(" ".join(concepts),
                                   es.patient_type,
                                   collection_func=lambda d, c: c.append(d))
    print('%s patients matched' % len(patients))

    if retained_patients_filter is not None:
        patients = [
            po for po in patients if po['_id'] in retained_patients_filter
        ]
        print('patients filtered to size %s' % len(patients))

    collected = []
    worker_args = [
        es, concepts, skip_terms, collected, filter_obj, doc_filter_function
    ]
    utils.multi_thread_tasking(patients,
                               40,
                               collect_patient_docs,
                               args=worker_args)

    print('data collected, merging...')
    doc_anns = {}
    for part in collected:
        doc_anns.update(part)
    print('merged dic size %s' % len(doc_anns))
    return doc_anns
Ejemplo n.º 6
0
def predict_exp(corpus_trans_file, ann_file, cache_file, output_file):
    """
    Predict every pattern instance with a loaded corpus model and save the
    resulting rows.

    Pattern instances come from cache_file when it exists; otherwise they are
    built from ann_file and then dumped to cache_file.

    :param corpus_trans_file: passed to sp.CorpusPredictor.load_corpus_model
    :param ann_file: tab-separated annotation lines; columns read are
        0 (stored as the doc_to_pt value), 1 doc id, 2 start, 3 end,
        4 signed label, 5 ground truth label
    :param cache_file: joblib-style cache ({'insts', 'doc_to_pt'})
    :param output_file: destination for the result rows
    :return: list of tuples (doc_to_pt value, doc id, start, end,
        signed label, gt label, prediction as str)
    """
    if isfile(cache_file):
        cached = jl.load(cache_file)
        ptn_insts = cached['insts']
        doc_to_pt = cached['doc_to_pt']
    else:
        ptn_insts = []
        doc_to_pt = {}
        labelled_docs = []

        def keep_if_text_exists(doc, doc_ann_list):
            # only docs whose text file is present are processed
            if exists(join(working_folder, 'docs', '%s.txt' % doc)):
                labelled_docs.append((doc, doc_ann_list))

        # consecutive lines sharing a doc id form one group
        current_doc = None
        current_anns = []
        for raw in utils.read_text_file(ann_file):
            cols = raw.split('\t')
            doc_id = cols[1]
            doc_to_pt[doc_id] = cols[0]
            if doc_id != current_doc:
                if current_doc is not None:
                    keep_if_text_exists(current_doc, current_anns)
                current_anns = []
                current_doc = doc_id
            current_anns.append({
                's': int(cols[2]),
                'e': int(cols[3]),
                'signed_label': cols[4],
                'gt_label': cols[5]
            })
        # flush the trailing group
        if current_doc is not None:
            keep_if_text_exists(current_doc, current_anns)

        print('processing docs...')
        utils.multi_thread_tasking(labelled_docs,
                                   30,
                                   do_process_labelled_doc,
                                   args=[ptn_insts])
        jl.dump({'insts': ptn_insts, 'doc_to_pt': doc_to_pt}, cache_file)

    cp = sp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    ret = []
    for inst in ptn_insts:
        print('predicting [%s]...' % inst.sentence)
        acc = cp.predcit(inst)
        print('accuracy: %s' % acc)
        first_ann = inst.annotations[0]
        row = (doc_to_pt[inst.doc_id], inst.doc_id, str(first_ann['s']),
               str(first_ann['e']), first_ann['signed_label'],
               first_ann['gt_label'], str(acc))
        ret.append(row)

    rendered = [u'\t'.join(row) for row in ret]
    print(u'\n'.join(rendered))
    utils.save_json_array(ret, output_file)
    return ret
Ejemplo n.º 7
0
def update_doc_dates(ser_file):
    """
    Load serialised data and run update_doc_date over it in chunks of 100.

    :param ser_file: file loaded with jl.load; the result must support len()
        and slicing
    :return: None
    """
    records = jl.load(ser_file)
    chunk_size = 100
    # slicing clamps at the end, so the last chunk may be shorter
    chunks = [
        records[start:start + chunk_size]
        for start in xrange(0, len(records), chunk_size)
    ]
    utils.multi_thread_tasking(chunks,
                               20,
                               update_doc_date,
                               thread_init_func=get_mysql_conn,
                               thread_end_func=dutil.release_db_connection)
Ejemplo n.º 8
0
def compute_all_subconcepts(concepts, file_path):
    """
    Run do_compute_subconcept over all concepts and save the collected
    pairs as a dictionary.

    :param concepts: work items handed to utils.multi_thread_tasking
    :param file_path: destination written with utils.save_json_array
    :return: None
    """
    umls = UMLSAPI(_umls_api_key)
    pairs = []
    utils.multi_thread_tasking(concepts,
                               10,
                               do_compute_subconcept,
                               args=[umls, pairs])
    # each collected item is indexable: [0] becomes the key, [1] the value
    c_to_subs = {pair[0]: pair[1] for pair in pairs}
    utils.save_json_array(c_to_subs, file_path)
def do_semehr_index(settings, patients, doc_to_patient):
    """
    Run the SemEHR indexing steps that are switched on in the job settings.

    The concept step runs when ['job', 'semehr-concept'] equals 'yes', the
    patient step when ['job', 'semehr-patients'] equals 'yes'.

    :param settings: configuration object read through
        settings.get_attr([section, key])
    :param patients: work items of the patient step (handed to
        do_index_100k_patients)
    :param doc_to_patient: forwarded to do_index_100k_anns in the concept step
    :return: None
    """
    cfg = settings.get_attr

    es = EntityCentricES(cfg(['semehr', 'es_host']))
    es.index_name = cfg(['semehr', 'index'])
    es.concept_doc_type = cfg(['semehr', 'concept_doc_type'])
    es.entity_doc_type = cfg(['semehr', 'entity_doc_type'])
    es.doc_level_index = cfg(['semehr', 'doc_level_index'])

    ann_dir = cfg(['yodie', 'output_file_path'])
    ann_files = [
        name for name in listdir(ann_dir) if isfile(join(ann_dir, name))
    ]

    if cfg(['job', 'semehr-concept']) == 'yes':
        logging.info('[SemEHR-step] starting semehr-concept process')
        logging.debug('working on files : %s' % ann_files)
        concept_index = cfg(['semehr', 'concept_index'])
        # one threaded pass per annotation file
        for ann_name in ann_files:
            utils.multi_thread_large_file_tasking(
                join(ann_dir, ann_name),
                10,
                do_index_100k_anns,
                args=[es, doc_to_patient, concept_index])
        logging.info('[SemEHR-step-end]concept/document level indexing done')

    if cfg(['job', 'semehr-patients']) == 'yes':
        logging.info('[SemEHR-step] indexing annotations at patient level')
        es_doc_url = cfg(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2(),
                                     verify_certs=False)
        ft_index_name = cfg(['semehr', 'full_text_index'])
        ft_doc_type = cfg(['semehr', 'full_text_doc_type'])
        ft_entity_field = cfg(['semehr', 'full_text_patient_field'])
        ft_fulltext_field = cfg(['semehr', 'full_text_text_field'])
        patient_args = [
            es, es_full_text, ft_index_name, ft_doc_type, ft_entity_field,
            ft_fulltext_field
        ]
        utils.multi_thread_tasking(patients,
                                   10,
                                   do_index_100k_patients,
                                   args=patient_args)
        logging.info('[SemEHR-step-end]patient level indexing done')
Ejemplo n.º 10
0
def index_cris_patients():
    """
    Index every distinct patient listed in './hepc_pos_doc_brcid.txt'.

    The first tab-separated column of each line is taken as the patient id.
    Ids are de-duplicated keeping first-seen order. When the file yields no
    ids the function reports that and returns without touching the index
    (previously this case raised IndexError while printing the first id).

    :return: None
    """
    f_patient_doc = './hepc_pos_doc_brcid.txt'
    lines = utils.read_text_file(f_patient_doc, encoding='utf-8-sig')

    # a set gives O(1) membership tests; the list keeps the original order
    seen = set()
    patients = []
    for line in lines:
        patient_id = line.split('\t')[0]
        if patient_id not in seen:
            seen.add(patient_id)
            patients.append(patient_id)

    if not patients:
        print('total patients 0')
        return

    print('total patients %s %s' % (len(patients), patients[0]))
    es = EntityCentricES.get_instance('./index_settings/es_cris_setting.json')
    utils.multi_thread_tasking(patients, 10, do_index_patient, args=[es])
    print('done')
Ejemplo n.º 11
0
 def compute_all_concept_closure(all_concepts, umls_instance):
     """
     Run StudyConcept.do_compute_concept_closure over all concepts and map
     each concept to its closure.

     :param all_concepts: work items; len() is printed before processing
     :param umls_instance: forwarded to every worker call
     :return: dict built from the collected results
         ({result['concept']: result['closure']})
     """
     print('all concepts number %s' % len(all_concepts))
     # both lists are shared with the workers; only results is read back here
     computed = []
     results = []
     utils.multi_thread_tasking(all_concepts,
                                40,
                                StudyConcept.do_compute_concept_closure,
                                args=[umls_instance, computed, results])
     return {item['concept']: item['closure'] for item in results}
def working_on_docs(index_setting_file, job_file, src_index, src_doc_type, dest_index, dest_doc_type, num_threads=20):
    """
    Copy the docs selected by the job status and record success or failure.

    :param index_setting_file: passed to ees.EntityCentricES.get_instance
    :param job_file: file backing the JobStatus instance
    :param src_index: forwarded to do_copy_doc
    :param src_doc_type: forwarded to do_copy_doc
    :param dest_index: forwarded to do_copy_doc
    :param dest_doc_type: forwarded to do_copy_doc
    :param num_threads: thread count handed to utils.multi_thread_tasking
    :return: None
    """
    import logging  # local import: this block only sees part of the file

    job_status = JobStatus(job_file)
    docs = get_docs_for_processing(job_status)
    print('copy docs: [%s]' % docs)
    es = ees.EntityCentricES.get_instance(index_setting_file)
    try:
        utils.multi_thread_tasking(docs, num_threads, do_copy_doc,
                                   args=[es, src_index, src_doc_type, dest_index, dest_doc_type])
        job_status.set_status(True)
    except Exception:
        # keep the best-effort behaviour but leave a trace of what went wrong
        logging.exception('copying docs failed')
        job_status.set_status(JobStatus.STATUS_FAILURE)
    except BaseException:
        # KeyboardInterrupt / SystemExit: persist the failure, then let the
        # interruption propagate instead of swallowing it
        job_status.set_status(JobStatus.STATUS_FAILURE)
        job_status.save()
        raise
    job_status.save()
Ejemplo n.º 13
0
def process_labelled_docs(labelled_file, corpus_model_file, mini_comp_file):
    """
    Build (or load) a corpus analyzer and make sure its mini component
    dictionary is available.

    :param labelled_file: tab-separated annotation lines; columns read are
        0 doc id, 1 start, 2 end, 3 signed label, 4 ground truth label
    :param corpus_model_file: serialised analyzer; loaded when the file
        exists, otherwise created from labelled_file and written here
    :param mini_comp_file: mini component dictionary; loaded when the file
        exists, otherwise produced and saved here
    :return: the sp.CorpusAnalyzer instance
    """
    if isfile(corpus_model_file):
        corpus_analyzer = sp.CorpusAnalyzer.load_seralisation(
            corpus_model_file)
    else:
        labelled_docs = []

        def keep_if_text_exists(doc, doc_ann_list):
            # only docs whose text file is present are processed
            if exists(join(working_folder, 'docs', '%s.txt' % doc)):
                labelled_docs.append((doc, doc_ann_list))

        # consecutive lines sharing a doc id form one group
        current_doc = None
        current_anns = []
        for raw in utils.read_text_file(labelled_file):
            cols = raw.split('\t')
            doc_id = cols[0]
            if doc_id != current_doc:
                if current_doc is not None:
                    keep_if_text_exists(current_doc, current_anns)
                current_anns = []
                current_doc = doc_id
            current_anns.append({
                's': int(cols[1]),
                'e': int(cols[2]),
                'signed_label': cols[3],
                'gt_label': cols[4]
            })
        # flush the trailing group
        if current_doc is not None:
            keep_if_text_exists(current_doc, current_anns)

        print('processing docs...')
        ptn_insts = []
        utils.multi_thread_tasking(labelled_docs,
                                   30,
                                   do_process_labelled_doc,
                                   args=[ptn_insts])
        print('merging patterns..')
        corpus_analyzer = sp.CorpusAnalyzer()
        for inst in ptn_insts:
            corpus_analyzer.add_pattern(inst)
        corpus_analyzer.serialise(corpus_model_file)

    if isfile(mini_comp_file):
        corpus_analyzer.load_mini_comp_dict(mini_comp_file)
    else:
        corpus_analyzer.produce_save_comp_dict(mini_comp_file)
    corpus_analyzer.show_mini_comp_patterns()
    return corpus_analyzer
Ejemplo n.º 14
0
def index_pubmed():
    """
    Run do_index_pubmed_docs over the documents described in
    './index_settings/pmc_docs.json'.

    :return: None
    """
    es = EntityCentricES.get_instance('./index_settings/es_setting.json')
    doc_details = utils.load_json_data('./index_settings/pmc_docs.json')
    # NOTE(review): this mapping is not read anywhere in this function; it
    # looks like a leftover of an earlier annotation-file based indexing path.
    pmcid_to_journal = {
        d['pmcid']: d['journalTitle']
        for d in doc_details
        if 'pmcid' in d and 'journalTitle' in d
    }
    utils.multi_thread_tasking(doc_details,
                               10,
                               do_index_pubmed_docs,
                               args=[es, './index_settings/fulltext'])
    print('done')
Ejemplo n.º 15
0
def get_concepts_names(umls, concepts):
    """
    Resolve concepts in batches of 200 and return a concept-to-label dict.

    :param umls: forwarded to do_get_concepts_names
    :param concepts: sliceable sequence of concepts
    :return: dict built from the collected results ({item[0]: item[1]})
    """
    batch_size = 200

    batches = []
    for k, start in enumerate(range(0, len(concepts), batch_size)):
        batches.append(concepts[start:start + batch_size])
        print('batch %s, len %s' % (k, len(batches[-1])))

    found = []
    utils.multi_thread_tasking(batches,
                               20,
                               do_get_concepts_names,
                               args=[umls, found])
    print(len(found))
    return {item[0]: item[1] for item in found}
Ejemplo n.º 16
0
def do_semehr_index(settings, patients, doc_to_patient):
    """
    Run the SemEHR indexing steps that are switched on in the job settings.

    The concept step runs when ['job', 'semehr-concept'] equals 'yes', the
    patient step when ['job', 'semehr-patients'] equals 'yes'.

    :param settings: configuration object read through
        settings.get_attr([section, key])
    :param patients: work items of the patient step (handed to
        do_index_100k_patients)
    :param doc_to_patient: forwarded to do_index_100k_anns in the concept step
    :return: None
    """
    cfg = settings.get_attr

    es = EntityCentricES(cfg(['semehr', 'es_host']))
    es.index_name = cfg(['semehr', 'index'])
    es.concept_doc_type = cfg(['semehr', 'concept_doc_type'])
    es.entity_doc_type = cfg(['semehr', 'entity_doc_type'])

    ann_dir = cfg(['yodie', 'output_file_path'])
    ann_files = [
        name for name in listdir(ann_dir) if isfile(join(ann_dir, name))
    ]

    if cfg(['job', 'semehr-concept']) == 'yes':
        print('working on files : %s' % ann_files)
        # one threaded pass per annotation file
        for ann_name in ann_files:
            utils.multi_thread_large_file_tasking(join(ann_dir, ann_name),
                                                  10,
                                                  do_index_100k_anns,
                                                  args=[es, doc_to_patient])

    if cfg(['job', 'semehr-patients']) == 'yes':
        es_doc_url = cfg(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2())
        ft_index_name = cfg(['semehr', 'full_text_index'])
        ft_doc_type = cfg(['semehr', 'full_text_doc_type'])
        ft_entity_field = cfg(['semehr', 'full_text_patient_field'])
        ft_fulltext_field = cfg(['semehr', 'full_text_text_field'])
        patient_args = [
            es, es_full_text, ft_index_name, ft_doc_type, ft_entity_field,
            ft_fulltext_field
        ]
        utils.multi_thread_tasking(patients,
                                   10,
                                   do_index_100k_patients,
                                   args=patient_args)
def complete_samples(sample_file, complete_sql, db_conn_file, out_file):
    """
    Complete every sample annotation entry and write the result back in the
    same 'var sample_docs=<json>' form.

    :param sample_file: text file holding a JSON object, optionally prefixed
        with 'var sample_docs='
    :param complete_sql: forwarded to complete_sample_ann_data
    :param db_conn_file: forwarded to complete_sample_ann_data
    :param out_file: destination written with utils.save_string
    :return: None
    """
    ann_prefix = 'var sample_docs='
    raw = utils.read_text_file_as_string(sample_file)
    # strip the javascript assignment so that plain JSON is left
    payload = raw[len(ann_prefix):] if raw.startswith(ann_prefix) else raw
    anns = json.loads(payload)

    key_anns = [(key, anns[key]) for key in anns]
    completed = []
    utils.multi_thread_tasking(key_anns,
                               40,
                               complete_sample_ann_data,
                               args=[complete_sql, db_conn_file, completed])

    results = {item[0]: item[1] for item in completed}
    utils.save_string(ann_prefix + json.dumps(results), out_file)
    print('done')
Ejemplo n.º 18
0
def index_100k(index_setting_file, patient_index_only=None):
    """
    Index annotation files (unless skipped) and then all patients.

    :param index_setting_file: passed to EntityCentricES.get_instance; the
        instance's customise_settings supplies the paths and full-text
        index parameters
    :param patient_index_only: when None the annotation files are indexed
        first; any other value skips that step
    :return: None
    """
    es = EntityCentricES.get_instance(index_setting_file)
    custom = es.customise_settings
    f_patient_doc = custom['patient_doc_mapping_file']
    f_yodie_anns = custom['yodie_output_folder']
    es_epr_full_text = custom['es_ft']
    ft_index_name = custom['ft_index_name']
    ft_doc_type = custom['ft_doc_type']
    ft_entity_field = custom['ft_entity_field']
    ft_fulltext_field = custom['ft_fulltext_field']

    # each mapping line: column 0 is the patient, column 1 the document
    doc_to_patient = {}
    patient_set = set()
    for line in utils.read_text_file(f_patient_doc):
        cols = line.split('\t')
        doc_to_patient[cols[1]] = cols[0]
        patient_set.add(cols[0])
    patients = list(patient_set)

    # epr full text index api
    es_full_text = Elasticsearch([es_epr_full_text],
                                 serializer=JSONSerializerPython2())

    if patient_index_only is not None:
        print('skipping concept indexing')
    else:
        ann_files = [
            name for name in listdir(f_yodie_anns)
            if isfile(join(f_yodie_anns, name))
        ]
        for ann_name in ann_files:
            utils.multi_thread_large_file_tasking(join(f_yodie_anns, ann_name),
                                                  10,
                                                  do_index_100k_anns,
                                                  args=[es, doc_to_patient])
        print('anns done, indexing patients...')

    patient_args = [
        es, es_full_text, ft_index_name, ft_doc_type, ft_entity_field,
        ft_fulltext_field
    ]
    utils.multi_thread_tasking(patients,
                               10,
                               do_index_100k_patients,
                               args=patient_args)
    print('all done')
Ejemplo n.º 19
0
def query_doc_by_search(es,
                        doc_es,
                        es_search,
                        patiet_id_field,
                        retained_patients_filter=None,
                        filter_obj=None,
                        doc_filter_function=None):
    """
    get number of mentions by elasticsearch queries instead of NLP results

    :param es: object providing search_by_scroll and patient_type
    :param doc_es: forwarded to query_collect_patient_docs
    :param es_search: strings joined with spaces to form the patient query;
        also forwarded to the workers
    :param patiet_id_field: forwarded to query_collect_patient_docs
    :param retained_patients_filter: optional container of patient ids; when
        given, only hits whose '_id' is in it are kept
    :param filter_obj: forwarded to query_collect_patient_docs
    :param doc_filter_function: forwarded to query_collect_patient_docs
    :return: the list filled by the workers
    """
    patients = es.search_by_scroll(" ".join(es_search),
                                   es.patient_type,
                                   collection_func=lambda d, c: c.append(d))
    print('%s patients matched' % len(patients))

    if retained_patients_filter is not None:
        patients = [
            po for po in patients if po['_id'] in retained_patients_filter
        ]
        print('patients filtered to size %s' % len(patients))

    collected = []
    worker_args = [
        doc_es, es_search, patiet_id_field, collected, filter_obj,
        doc_filter_function
    ]
    utils.multi_thread_tasking(patients,
                               40,
                               query_collect_patient_docs,
                               args=worker_args)
    return collected
def mimic_anonymisation(single_file, rule_file):
    """
    Split a multi-record file into texts, run wrap_anonymise_doc on each and
    print the distinct collected sentences grouped by type.

    :param single_file: file read as one string; records are separated by
        'START_OF_RECORD=<n>||||<n>||||' header lines
    :param rule_file: passed to AnonymiseRule
    :return: None
    """
    doc = utils.read_text_file_as_string(single_file)
    arr = re.split(r'START\_OF\_RECORD=\d+\|\|\|\|\d+\|\|\|\|\r{0,1}\n', doc)
    # The header pattern above tolerates CRLF line ends, so the end marker
    # must too; a plain '...\n' replace left the marker in CRLF files.
    end_marker = re.compile(r'\|\|\|\|END_OF_RECORD\r?\n')
    texts = [end_marker.sub('', t) for t in arr]

    anonymis_inst = AnonymiseRule(rule_file)
    # NOTE(review): failed_docs is filled by the workers but never reported
    failed_docs = []
    sent_data = []
    utils.multi_thread_tasking(texts,
                               1,
                               wrap_anonymise_doc,
                               args=[failed_docs, anonymis_inst, sent_data])

    # group sentences by their type
    t2sent = {}
    for s in sent_data:
        t2sent.setdefault(s['type'], []).append(s['sent'])
    for t in t2sent:
        t2sent[t] = list(set(t2sent[t]))
        print('%s\n======\n%s\n\n' % (t, '\n'.join(t2sent[t])))
def generate_result_in_one_iteration(cohort_name, study_analyzer, out_file,
                                     sample_size, sample_out_file,
                                     doc_to_brc_sql, brc_sql, anns_iter_sql, skip_term_sql, doc_content_sql,
                                     db_conn_file):
    """
    generate result in one iteration over all annotations. this is supposed to be much faster when working on
    large study concepts. But post-processing using rules not supported now

    :param cohort_name: cohort identifier substituted into the SQL templates
    :param study_analyzer: provides study_concepts (each with name and
        concept_closure) and study_options
    :param out_file: destination of the tab-separated patient/concept table
    :param sample_size: maximum number of annotations sampled per concept
    :param sample_out_file: destination of the sampled annotations (JSON)
    :param doc_to_brc_sql: template formatted with cohort_name; rows must
        carry 'doc_id' and 'brcid'
    :param brc_sql: template formatted with cohort_name; rows must carry
        'brcid'
    :param anns_iter_sql: template with the named fields cohort_id and
        extra_constrains; rows must carry 'inst_uri', 'doc_id', 'ann_id',
        'start_offset' and 'end_offset'
    :param skip_term_sql: passed to generate_skip_term_constrain
    :param doc_content_sql: template formatted with a comma separated list of
        quoted doc ids; rows must carry 'doc_id' and 'TextContent'
    :param db_conn_file: passed to dutil.get_db_connection_by_setting for
        every query
    :return: None
    """
    # populate concept to anns maps
    sc2anns = {}
    for sc in study_analyzer.study_concepts:
        sc2anns[sc.name] = []

    # populate patient list
    print('populating patient list...')
    patients = {}
    rows_container = []
    dutil.query_data(brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        patients[r['brcid']] = {'brcid': r['brcid']}

    # populate document id to patient id dictionary
    print('populating doc to patient map...')
    rows_container = []
    dutil.query_data(doc_to_brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    doc2brc = {}
    for dp in rows_container:
        doc2brc[dp['doc_id']] = dp['brcid']

    # Build the extra SQL constraints. The previous inline expression
    # `[skip] + [] if cond else extra` parsed as `([skip] + []) if cond else
    # extra`, which dropped the skip-term constraint whenever extra
    # constraints were configured; both are now always combined.
    constrains = [generate_skip_term_constrain(study_analyzer, skip_term_sql)]
    study_options = study_analyzer.study_options
    if study_options is not None and study_options['extra_constrains'] is not None:
        constrains.extend(study_options['extra_constrains'])

    # query annotations
    print('iterating annotations...')
    rows_container = []
    dutil.query_data(anns_iter_sql.format(**{'cohort_id': cohort_name,
                                             'extra_constrains': ' \n '.join(constrains)}),
                     rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        concept_id = r['inst_uri']
        brcid = doc2brc.get(r['doc_id'])
        if brcid is None:
            print('doc %s not matched to a patient!!!' % r['doc_id'])
            continue
        patient = patients.get(brcid)
        if patient is None:
            print('brc id %s not matched a patient!!!' % brcid)
            continue
        # get matched study concepts
        for sc in study_analyzer.study_concepts:
            if concept_id in sc.concept_closure:
                patient[sc.name] = patient.get(sc.name, 0) + 1
                sc2anns[sc.name].append({'ann_id': r['ann_id'], 'doc_id': r['doc_id'], 'concept_id': concept_id,
                                         'start': r['start_offset'], 'end': r['end_offset']})

    # generate result table
    print('generate result table...')
    concept_labels = sorted(sc2anns)
    header = '\t'.join(['brcid'] + concept_labels) + '\n'
    lines = []
    utils.multi_thread_tasking([patients[pid] for pid in patients], 40, do_put_line,
                               args=[concept_labels, lines])
    utils.save_string(header + '\n'.join(lines), out_file)

    # generate sample annotations
    term_to_docs = {}
    for concept in sc2anns:
        ann_ids = sc2anns[concept]
        if len(ann_ids) <= sample_size:
            sample_ids = ann_ids
        else:
            # draw without replacement; picked items are removed from the
            # concept's annotation list as they are sampled
            sample_ids = []
            for _ in xrange(sample_size):
                index = random.randrange(len(ann_ids))
                sample_ids.append(ann_ids[index])
                del ann_ids[index]
        term_to_docs[concept] = sample_ids

    # query doc contents
    print('populating term to sampled anns...')
    term_to_sampled = {}
    for term in term_to_docs:
        sample_ids = term_to_docs[term]
        if not sample_ids:
            continue
        sample_doc_ids = ['\'' + sample['doc_id'] + '\'' for sample in sample_ids]
        rows_container = []
        dutil.query_data(doc_content_sql.format(','.join(sample_doc_ids)), rows_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_to_content = {}
        for r in rows_container:
            doc_to_content[r['doc_id']] = r['TextContent']
        term_sampled = []
        for sample in sample_ids:
            term_sampled.append({'id': sample['doc_id'],
                                 'content': doc_to_content[sample['doc_id']],
                                 'annotations': [{'start': sample['start'],
                                                  'end': sample['end'],
                                                  'concept': sample['concept_id']}]})
        term_to_sampled[term] = term_sampled
    utils.save_json_array(convert_encoding(term_to_sampled, 'cp1252', 'utf-8'), sample_out_file)
def dump_doc_as_files(folder):
    """
    Run do_save_file over everything load_all_docs() returns, using 10
    threads.

    :param folder: forwarded to do_save_file (presumably the output folder -
        TODO confirm)
    :return: None
    """
    utils.multi_thread_tasking(load_all_docs(),
                               10,
                               do_save_file,
                               args=[folder])
Ejemplo n.º 23
0
def process_semehr(config_file):
    """
    a pipeline to process all SemEHR related processes:
    0. ES doc copy from one index to another;
    1. bio-yodie NLP pipeline annotation on docs;
    2. entity centric SemEHR ES indexing;
    3. actionable transparency;
    4. document annotation analysis and patient level indexing;
    5. cohort result population;
    6. cohort doc based result collection.
    Each step only runs when its flag in the 'job' section of the
    configuration equals 'yes'.

    The job status is saved as successful once all enabled steps finish.
    It is saved as failed when the NLP process returns a non-zero code
    (SystemExit is then raised with that code) or when a step raises an
    Exception (which is logged with its traceback and not re-raised).
    :param config_file: configuration file handed to ProcessSetting
    :return:
    """
    # read the configuration
    ps = ProcessSetting(config_file)

    # setting log configuration
    _setup_semehr_logging(ps)

    # initialise the jobstatus class instance
    job_file = join(
        ps.get_attr(['job', 'job_status_file_path']),
        'semehr_job_status_%s.json' % ps.get_attr(['job', 'job_id']))
    logging.info('[SemEHR-step] using job status file %s' % job_file)
    job_status = JobStatus(job_file)
    job_status.job_start()

    # preload: load documents to es
    if ps.get_attr(['job', 'epr_index']) == 'yes':
        logging.info('[SemEHR-step]load documents to elasticsearch...')
        load_document_to_es(settings=ps)
        logging.info('[SemEHR-step-end] epr_index step done')

    # collect the docs to work on: new docs selected by SQL take precedence
    # over the docs of a cohort
    data_rows = []
    doc2pid = {}
    pids = []
    if ps.get_attr(['job', 'load_docs']) == 'yes':
        sql_template = ps.get_attr(['new_docs', 'sql_query'])
        logging.info(
            '[SemEHR-step] retrieving docs by using the template [%s]' %
            sql_template)
        data_rows = get_docs_for_processing(
            job_status, sql_template,
            ps.get_attr(['new_docs', 'dbconn_setting_file']))
        logging.info('total docs num is %s' % len(data_rows))
    elif ps.get_attr(['job', 'cohort_docs']) == 'yes':
        logging.info('[SemEHR-step] retrieving docs by cohort [%s]' %
                     ps.get_attr(['cohort_docs', 'es_cohort_file']))
        data_rows, doc2pid, pids = es_get_cohort_docs(ps)
        logging.info('total docs num is %s' % len(data_rows))

    try:
        # 0. copy docs
        if ps.get_attr(['job', 'copy_docs']) == 'yes':
            logging.info('[SemEHR-step] copy docs')
            docs = [str(r['docid']) for r in data_rows]
            utils.multi_thread_tasking(
                docs,
                ps.get_attr(['doc_copy', 'thread_num']),
                do_copy_doc,
                args=[
                    EntityCentricES(ps.get_attr(['doc_copy', 'es_host'])),
                    ps.get_attr(['doc_copy', 'src_index']),
                    ps.get_attr(['doc_copy', 'src_doc_type']),
                    ps.get_attr(['doc_copy', 'dest_index']),
                    ps.get_attr(['doc_copy', 'dest_doc_type'])
                ])
            logging.info('[SemEHR-step-end]copying docs done')

        # 1. do bio-yodie pipeline
        if ps.get_attr(['job', 'yodie']) == 'yes':
            _run_yodie_step(ps, data_rows, job_status)

        # 2. do SemEHR concept/entity indexing
        if ps.get_attr(['job', 'semehr-concept']) == 'yes' or ps.get_attr(
                ['job', 'semehr-patients']) == 'yes':
            patients = []
            doc_to_patient = {}
            for r in data_rows:
                patients.append(str(r['patientid']))
                doc_to_patient[str(r['docid'])] = str(r['patientid'])
            # de-duplicate the patient ids
            patients = list(set(patients))
            do_semehr_index(ps, patients, doc_to_patient)

        # 3. do SemEHR actionable transparency
        if ps.get_attr(['job', 'action_trans']) == 'yes':
            logging.info('[SemEHR-step]doing transparency...')
            actionable_transparise(settings=ps)

        # 4. do SemEHR document annotation analysis (post processing)
        if ps.get_attr(['job', 'doc_analysis']) == 'yes':
            logging.info('[SemEHR-step]doing SemEHR annotation analysis...')
            do_semehr_doc_anns_analysis(settings=ps)
            logging.info('[SemEHR-step-end] doc_analysis step done')

        # 4.5 do SemEHR patient level index
        if ps.get_attr(['job', 'patient_index']) == 'yes':
            logging.info('[SemEHR-step]doing patient level indexing...')
            patient_level_indexing(settings=ps, pids=pids)
            logging.info('[SemEHR-step-end] patient level indexing done')

        # 5. do populate results for a research study
        if ps.get_attr(['job', 'populate_cohort_result']) == 'yes':
            logging.info(
                '[SemEHR-step]doing SemEHR cohort result extraction...')
            populate_cohort_results(settings=ps)
            logging.info('[SemEHR-step-end] populate_cohort_result step done')

        # 6. do collect cohort doc based results for a research study
        if ps.get_attr(['job', 'cohort_doc_collection']) == 'yes':
            logging.info(
                '[SemEHR-step]doing SemEHR cohort doc based collection...')
            collect_cohort_doc_results(settings=ps, doc2pid=doc2pid)
            logging.info(
                '[SemEHR-step-end] collect_cohort_doc_results step done')

        job_status.set_status(True)
        job_status.save()
        logging.info('[SemEHR-process-end] all done')
    except Exception as e:
        # logging.exception keeps the traceback, which str(e) alone loses;
        # SystemExit raised by the NLP step is not an Exception and passes through
        logging.exception(
            '[SemEHR-process-ERROR] Failed to do SemEHR process %s' % str(e))
        job_status.set_status(False)
        job_status.save()


def _setup_semehr_logging(ps):
    """
    configure the root logger from the 'logging' section of the settings
    :param ps: settings object; 'level' defaults to 'INFO', 'format' to a
     name/time/level/message layout, and a file handler is only added when
     'file' is set
    :return:
    """
    log_level = ps.get_attr(['logging', 'level'])
    if log_level is None:
        log_level = 'INFO'
    log_format = ps.get_attr(['logging', 'format'])
    if log_format is None:
        log_format = '%(name)s %(asctime)s %(levelname)s %(message)s'
    log_file = ps.get_attr(['logging', 'file'])

    logging.basicConfig(level=log_level, format=log_format)
    if log_file is not None:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(log_level)
        file_handler.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(file_handler)
        logging.info('logging to %s' % log_file)


def _build_yodie_cmd(ps):
    """
    compose the shell command line that starts the bio-yodie batch run
    :param ps: settings object providing the 'env' and 'yodie' sections
    :return: the command as a single string
    """
    config_file_opt = (
        '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
        % ps.get_attr(['env', 'yodie_path']))
    if ps.get_attr(['yodie', 'os']) == 'win':
        # on windows java is called directly with a ';' separated classpath
        class_path = (
            "-cp .;{SCRIPTDIR}/conf;{SCRIPTDIR}/gcp.jar;{SCRIPTDIR}/lib/*;"
            "{GATE_HOME}/bin/gate.jar;{GATE_HOME}/lib/*").format(
                SCRIPTDIR=ps.get_attr(['env', 'gcp_home']),
                GATE_HOME=ps.get_attr(['env', 'gate_home']))
        cmd_parts = [
            'java',
            "-Dgate.home=%s" % ps.get_attr(['env', 'gate_home']),
            "-Dgcp.home=%s" % ps.get_attr(['env', 'gcp_home']),
            "-Djava.protocol.handler.pkgs=gate.cloud.util.protocols",
            class_path,
            config_file_opt,
            "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
            "gate.cloud.batch.BatchRunner",
            "-t %s" % ps.get_attr(['yodie', 'thread_num']),
            "-b %s" % ps.get_attr(['yodie', 'config_xml_path'])
        ]
    else:
        cmd_parts = [
            'gcp-direct.sh',
            "-t %s" % ps.get_attr(['yodie', 'thread_num']),
            "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
            "-b %s" % ps.get_attr(['yodie', 'config_xml_path']),
            config_file_opt,
        ]
    return ' '.join(cmd_parts)


def _run_yodie_step(ps, data_rows, job_status):
    """
    run the bio-yodie NLP pipeline as a child process and wait for it
    :param ps: settings object
    :param data_rows: the docs to annotate, handed to produce_yodie_config
    :param job_status: saved as failed before exiting when the NLP process
     returns a non-zero code
    :return:
    """
    # local import so the block does not rely on the file-level imports
    import sys

    docid_path = '%s/%s_docids.txt' % (
        ps.get_attr(['yodie', 'input_doc_file_path']),
        ps.get_attr(['job', 'job_id']))
    logging.info('[SemEHR-step] doing yodie')
    # 1.1 prepare the configuration file
    num_docs = produce_yodie_config(ps, data_rows, docid_path)
    if num_docs == 0:
        logging.info('[SemEHR-step-end] nothing to process, NLP step done')
        return

    logging.info('total number of docs %s' % num_docs)
    # 1.2 set the env variables
    set_sys_env(ps)
    # 1.3 clear ann output folder
    output_path = ps.get_attr(['yodie', 'output_file_path'])
    logging.info('clearing %s ...' % output_path)
    clear_folder(output_path)
    # 1.4 run bio-yodie from the gcp run path
    os.chdir(ps.get_attr(['yodie', 'gcp_run_path']))
    cmd = _build_yodie_cmd(ps)
    logging.debug('executing the following command to start NLP...')
    logging.info(cmd)
    p = Popen(cmd, shell=True, stderr=STDOUT)
    p.wait()

    if 0 != p.returncode:
        job_status.set_status(False)
        job_status.save()
        logging.error('ERROR doing the NLP, stopped with a code [%s]' %
                      p.returncode)
        # sys.exit rather than exit(): the latter is only injected by the
        # site module and is meant for interactive sessions
        sys.exit(p.returncode)

    logging.info('[SemEHR-step-end] NLP step done')
    if 'semehr_path' in os.environ:
        logging.info('changing back to semehr_path: %s' %
                     os.environ['semehr_path'])
        os.chdir(os.environ['semehr_path'])
Ejemplo n.º 24
0
 def save_docs_to_db(self, docs):
     """
     hand every doc to CohortHelper.do_save_doc_to_db via utils.multi_thread_tasking
     :param docs: the docs to work on
     :return:
     """
     save_one = CohortHelper.do_save_doc_to_db
     # thread number comes from the 'threads' setting when present, otherwise 10
     if 'threads' in self._conf:
         thread_count = self._conf['threads']
     else:
         thread_count = 10
     worker_args = [self._conf['sql_template'], self._conf['db_conf_file']]
     utils.multi_thread_tasking(docs,
                                process_func=save_one,
                                num_threads=thread_count,
                                args=worker_args)
Ejemplo n.º 25
0
def index_patients(patients, es):
    """
    hand every patient to do_index_patient via utils.multi_thread_tasking
    (thread number argument: 10), printing a message before and after
    :param patients: the patients to index
    :param es: passed on to do_index_patient as its extra argument
    :return:
    """
    patient_count = len(patients)
    print('indexing %s patients...' % patient_count)
    worker_args = [es]
    utils.multi_thread_tasking(patients, 10, do_index_patient, args=worker_args)
    print('patient indexing done')
Ejemplo n.º 26
0
def process_semehr(config_file):
    """
    a pipeline to process all SemEHR related processes:
    0. ES doc copy from one index to another;
    1. bio-yodie NLP pipeline annotation on docs;
    2. entity centric SemEHR ES indexing;
    3. actionable transparency.
    Each step only runs when its flag in the 'job' section of the
    configuration equals 'yes'.

    The job status is saved as successful once all enabled steps finish.
    When the NLP process returns a non-zero code the status is saved as
    failed and SystemExit is raised with that code. Exceptions raised by a
    step are not caught here.
    :param config_file: configuration file handed to ProcessSetting
    :return:
    """
    # local import so the block does not rely on the file-level imports
    import sys

    # read the configuration
    ps = ProcessSetting(config_file)

    # initialise the jobstatus class instance
    job_file = join(
        ps.get_attr(['job', 'job_status_file_path']),
        'semehr_job_status_%s.json' % ps.get_attr(['job', 'job_id']))
    print('using job status file %s' % job_file)
    job_status = JobStatus(job_file)
    job_status.job_start()

    data_rows = []
    if ps.get_attr(['job', 'load_docs']) == 'yes':
        sql_template = ps.get_attr(['new_docs', 'sql_query'])
        print('retrieving docs by using the template [%s]' % sql_template)
        data_rows = get_docs_for_processing(
            job_status, sql_template,
            ps.get_attr(['new_docs', 'dbconn_setting_file']))
        print('total docs num is %s' % len(data_rows))

    # 0. copy docs
    if ps.get_attr(['job', 'copy_docs']) == 'yes':
        docs = [str(r['docid']) for r in data_rows]
        utils.multi_thread_tasking(
            docs,
            ps.get_attr(['doc_copy', 'thread_num']),
            do_copy_doc,
            args=[
                EntityCentricES(ps.get_attr(['doc_copy', 'es_host'])),
                ps.get_attr(['doc_copy', 'src_index']),
                ps.get_attr(['doc_copy', 'src_doc_type']),
                ps.get_attr(['doc_copy', 'dest_index']),
                ps.get_attr(['doc_copy', 'dest_doc_type'])
            ])

    # 1. do bio-yodie pipeline
    if ps.get_attr(['job', 'yodie']) == 'yes':
        docid_path = '%s/%s_docids.txt' % (
            ps.get_attr(['yodie', 'input_doc_file_path']),
            ps.get_attr(['job', 'job_id']))
        print('working on yodie with %s documents, saved to %s...' %
              (str(len(data_rows)), docid_path))
        # save doc ids to text file for input to bioyodie
        print('saving doc ids to [%s]' % docid_path)
        utils.save_string('\n'.join([str(r['docid']) for r in data_rows]),
                          docid_path)
        # 1.1 prepare the configuration file
        produce_yodie_config(ps)
        # 1.2 set the env variables
        set_sys_env(ps)
        # 1.3 clear ann output folder
        output_path = ps.get_attr(['yodie', 'output_file_path'])
        print('clearing %s ...' % output_path)
        clear_folder(output_path)
        # 1.4 run bio-yodie from the gcp run path
        os.chdir(ps.get_attr(['yodie', 'gcp_run_path']))
        config_file_opt = (
            '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
            % ps.get_attr(['env', 'yodie_path']))
        if ps.get_attr(['yodie', 'os']) == 'win':
            # on windows java is called directly with a ';' separated classpath
            class_path = (
                "-cp .;{SCRIPTDIR}/conf;{SCRIPTDIR}/gcp.jar;{SCRIPTDIR}/lib/*;"
                "{GATE_HOME}/bin/gate.jar;{GATE_HOME}/lib/*").format(
                    SCRIPTDIR=ps.get_attr(['env', 'gcp_home']),
                    GATE_HOME=ps.get_attr(['env', 'gate_home']))
            cmd = ' '.join([
                'java',
                "-Dgate.home=%s" % ps.get_attr(['env', 'gate_home']),
                "-Dgcp.home=%s" % ps.get_attr(['env', 'gcp_home']),
                "-Djava.protocol.handler.pkgs=gate.cloud.util.protocols",
                class_path,
                config_file_opt,
                "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                "gate.cloud.batch.BatchRunner",
                "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                "-b %s" % ps.get_attr(['yodie', 'config_xml_path'])
            ])
        else:
            cmd = ' '.join([
                'gcp-direct.sh',
                "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                "-b %s" % ps.get_attr(['yodie', 'config_xml_path']),
                config_file_opt,
            ])
        print(cmd)
        p = Popen(cmd, shell=True, stderr=STDOUT)
        p.wait()

        if 0 != p.returncode:
            job_status.set_status(False)
            job_status.save()
            # sys.exit rather than exit(): the latter is only injected by the
            # site module and is meant for interactive sessions
            sys.exit(p.returncode)

    # 2. do SemEHR concept/entity indexing
    if ps.get_attr(['job', 'semehr-concept']) == 'yes' or ps.get_attr(
            ['job', 'semehr-patients']) == 'yes':
        patients = []
        doc_to_patient = {}
        for r in data_rows:
            patients.append(str(r['patientid']))
            doc_to_patient[str(r['docid'])] = str(r['patientid'])
        # de-duplicate the patient ids
        patients = list(set(patients))
        do_semehr_index(ps, patients, doc_to_patient)

    # 3. do SemEHR actionable transparency
    if ps.get_attr(['job', 'action_trans']) == 'yes':
        print('doing transparency...')
        actionable_transparise(settings=ps)

    job_status.set_status(True)
    job_status.save()
def index_mimic_af_cohort_smp():
    """
    read ids from '../resources/af_pids.txt', print them, then hand each one to
    smp_index via utils.multi_thread_tasking (thread number argument: 5)
    NOTE: es is not defined in this function - it has to exist at module level
    :return:
    """
    pids = utils.read_text_file('../resources/af_pids.txt')
    print(pids)
    # extra arguments handed to smp_index: the es instance and the doc type
    worker_args = [es, 'medprofile']
    utils.multi_thread_tasking(pids, 5, smp_index, args=worker_args)
Ejemplo n.º 28
0
def dump_pmc_data(term, page_size, data_path):
    """
    save the result of search_pmc(term, page_size) as 'pmc_docs.json' under data_path,
    then hand the pmcid of every returned doc that has one to do_download_pmc_full_text
    via utils.multi_thread_tasking (thread number argument: 10)
    :param term: passed on to search_pmc
    :param page_size: passed on to search_pmc
    :param data_path: folder for 'pmc_docs.json'; its 'fulltext' sub path is handed
     to do_download_pmc_full_text
    :return:
    """
    docs = search_pmc(term, page_size)
    utils.save_json_array(docs, join(data_path, 'pmc_docs.json'))

    # docs without a pmcid are left out
    pmcids = []
    for doc in docs:
        if 'pmcid' in doc:
            pmcids.append(doc['pmcid'])
    fulltext_path = join(data_path, 'fulltext')
    utils.multi_thread_tasking(pmcids, 10, do_download_pmc_full_text,
                               args=[fulltext_path])
    print('done')
Ejemplo n.º 29
0
def db_populate_study_results(cohort_sql,
                              doc_ann_sql_temp,
                              doc_ann_pks,
                              dbcnn_file,
                              study_folder,
                              output_folder,
                              sample_sql_temp,
                              thread_num=10,
                              study_config='study.json',
                              sampling=True,
                              sample_size=20):
    """
    populate results for a research study

    Writes 'result.tsv' to output_folder: a header row ('pid' followed by the
    sorted study concept names), then one row per collected patient result,
    every row terminated by a newline. When sampling is on it also writes
    'sampled_docs.js', a 'var sample_docs= ...;' assignment holding the
    samples per study concept.
    :param cohort_sql: cohort selection query
    :param doc_ann_sql_temp: query template for getting a doc_anns item
    :param doc_ann_pks: primary key columns of doc ann table
    :param dbcnn_file: database connection config file
    :param study_folder: study folder
    :param output_folder: where to save the results
    :param sample_sql_temp: query template for getting a sample item (including full text and doc_anns)
    :param thread_num: number of processes for the patient level querying; also
     the thread number handed to the sample extraction
    :param study_config: study configuration handed to load_study_ruler
    :param sampling: whether sampling is needed
    :param sample_size: how many samples per study concept
    :return:
    """
    ret = load_study_ruler(study_folder, None, study_config)
    sa = ret['sa']
    concept_list = sorted([sc.name for sc in sa.study_concepts])
    # map every concept in a study concept's closure back to the study concept name
    cui2concept = {}
    for sc in sa.study_concepts:
        for c in sc.concept_closure:
            cui2concept[c] = sc.name
    results = []
    rows = []
    db.query_data(cohort_sql, rows,
                  db.get_db_connection_by_setting(dbcnn_file))
    logging.info('querying results (cohort size:%s)...' % len(rows))
    utils.multi_process_tasking([r['pid'] for r in rows],
                                db_populate_patient_result,
                                num_procs=thread_num,
                                args=[
                                    doc_ann_sql_temp, doc_ann_pks, dbcnn_file,
                                    concept_list, cui2concept,
                                    positive_patient_filter
                                ],
                                thread_init_func=proc_init_container,
                                thread_end_func=proc_final_collect,
                                thread_end_args=[results])

    # populate result table
    c2pks = {}
    for c in concept_list:
        c2pks[c] = []
    table_lines = ['\t'.join(['pid'] + concept_list)]
    for r in results:
        pr = [r['p']]
        for c in concept_list:
            concept_result = r['c2f'][c]
            if concept_result['f'] > 0:
                # the first doc of a positive concept is a sampling candidate
                c2pks[c].append(concept_result['docs'][0])
            pr.append(str(concept_result['f']))
        table_lines.append('\t'.join(pr))
    # every line, including the header, ends with a newline so that the first
    # patient row does not get glued onto the header row
    s = '\n'.join(table_lines) + '\n'
    f = join(output_folder, 'result.tsv')
    utils.save_string(s, f)
    logging.info('result table saved to [%s]' % f)

    if sampling:
        logging.info('doing sampling...')
        sampled_result = {}
        for c in c2pks:
            pks = c2pks[c]
            logging.info('doc cache size: %s' % len(pks))
            if len(pks) <= sample_size:
                sample_pks = pks
            else:
                # sample_size distinct candidates, chosen without replacement
                sample_pks = random.sample(pks, sample_size)
            samples = []
            utils.multi_thread_tasking(
                sample_pks,
                thread_num,
                extract_sample,
                args=[c, cui2concept, sample_sql_temp, dbcnn_file, samples])
            sampled_result[c] = samples
            logging.info('%s sampled (%s) results' % (c, len(samples)))

        f = join(output_folder, 'sampled_docs.js')
        utils.save_string('var sample_docs= %s;' % json.dumps(sampled_result),
                          f)
        logging.info('samples saved to %s' % f)
    logging.info('all results populated')