Code example #1
def parsing_tsv_to_manual_mapped(tsv_file, icd2umls_file):
    icd2umls = {}
    for l in utils.read_text_file(icd2umls_file):
        cols = l.split('\t')
        icd2umls[cols[0]] = cols[1]

    lines = utils.read_text_file(tsv_file)
    condition2code = {}
    for l in lines:
        cols = l.split('\t')
        c = cols[0]
        icds = cols[len(cols) - 1].split(',')
        concepts = []
        condition2code[c] = concepts
        for icd in icds:
            icd = icd.strip().upper()
            icd_codes = []
            m = re.match(r'([A-Z])(\d+)\-[A-Z]{0,1}(\d+)', icd)
            if m is not None:
                logging.info('range mappings: %s' % m.group(0))
                for num in range(int(m.group(2)), int(m.group(3)), 1):
                    icd = '%s%02d' % (m.group(1), num)
                    icd_codes.append(icd)
            else:
                icd_codes.append(icd)
            concepts += process_icd_to_umls(icd_codes, icd2umls=icd2umls)
    logging.info(json.dumps(condition2code))
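
The range branch above expands a span such as I20-I25 into individual ICD codes before the UMLS lookup. A minimal standalone sketch of that expansion (the sample codes below are made up for illustration; note the upper bound stays exclusive, exactly as in the loop above):

import re

def expand_icd_range(icd):
    # Mirrors the regex used in parsing_tsv_to_manual_mapped; returns the code
    # unchanged when it is not a range expression.
    m = re.match(r'([A-Z])(\d+)\-[A-Z]{0,1}(\d+)', icd)
    if m is None:
        return [icd]
    return ['%s%02d' % (m.group(1), num)
            for num in range(int(m.group(2)), int(m.group(3)))]

print(expand_icd_range('I20-I25'))  # ['I20', 'I21', 'I22', 'I23', 'I24']
print(expand_icd_range('E11'))      # ['E11']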
Code example #2
def patient_level_analysis(complete_anns_file, output_file):
    lines = utils.read_text_file(complete_anns_file)
    pos_condition2patients = {}
    patient2conditions = {}
    positive_labels = ['posM', 'hisM']
    indexable_labels = ['posM', 'hisM', 'negM']
    for l in lines:
        arr = l.split('\t')
        label = arr[2]
        condition = arr[3]
        pid = arr[8]
        if label in positive_labels:
            pos_condition2patients[condition] = [pid] if condition not in pos_condition2patients else \
                pos_condition2patients[condition] + [pid]
        if label in indexable_labels:
            pd = patient2conditions[pid] if pid in patient2conditions else {}
            patient2conditions[pid] = pd
            if label in pd:
                pd[label].append(condition)
                pd[label] = list(set(pd[label]))
            else:
                pd[label] = [condition]
    utils.save_json_array(
        {
            'p2c': patient2conditions,
            'c2p': pos_condition2patients
        }, output_file)
Code example #3
 def load_gaz_dir(self, gaz_dir):
     files = [f for f in listdir(gaz_dir) if isfile(join(gaz_dir, f))]
     for f in files:
         if f.endswith('.lst'):
             t = f.split('.')[0]
             self._type2gaz[t] = utils.read_text_file(join(gaz_dir, f))
             self._all_entities += [e.lower() for e in self._type2gaz[t]]
Code example #4
def create_instances():
    results = {"instances":[]}
    HOST_FILE_PATH = (
        pathlib
        .Path(__file__)
        .absolute()
        .parent
        .parent
        .parent/"hosts.txt"
    )
    addresses = read_text_file(HOST_FILE_PATH)
    names = list(random_name.generate(len(addresses)))

    #creating the public instance
    public = {
        "instance_name": names.pop(),
        "ip": addresses.pop(),
        "is_public": True
    }
    public_instance = Instance(config,public)
    results["instances"].append(str(public_instance))

    #creating the rest of the instances
    for name, ip in zip(names, addresses):
        private = {
            "instance_name": name,
            "ip": ip,
            "is_public": False
        }
        results["instances"].append(str(Instance(config,private)))
    return results
Code example #5
def run_learning(
        train_ann_dir, train_gold_dir, train_text_dir,
        test_ann_dir, test_gold_dir, test_text_dir,
        settings):
    log_level = 'DEBUG'
    log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s'
    log_file = './settings/processing.log'  # defined here but not wired into logging
    logging.basicConfig(level=log_level, format=log_format)
    global _min_sample_size, _ann_dir, _gold_dir, _test_ann_dir, _test_gold_dir, _gold_text_dir, _test_text_dir, _concept_mapping, _learning_model_dir
    global _labels, _gold_file_pattern, _ignore_mappings, _eHostGD, _cm_obj
    global _annotated_anns
    _annotated_anns = {}
    _min_sample_size = settings['min_sample_size']
    _ann_dir = train_ann_dir
    _gold_dir = train_gold_dir
    _test_ann_dir = test_ann_dir
    _test_gold_dir = test_gold_dir
    _gold_text_dir = train_text_dir
    _test_text_dir = test_text_dir
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    _gold_file_pattern = "%s_ann.xml" if 'gold_file_pattern' not in settings else settings['gold_file_pattern']
    _ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _eHostGD = settings['eHostGD'] if 'eHostGD' in settings else False
    _cm_obj = Concept2Mapping(_concept_mapping)

    # not using mention patterns for prediction as this is only an in-development feature
    mp_inst = None
    return do_learn_exp(settings['viz_file'],
                        num_dimensions=[50],
                        ignore_context=settings['ignore_context'] if 'ignore_context' in settings else False,
                        separate_by_label=True,
                        conll_output_file=settings['conll_output_file'], eHostGD=_eHostGD, mention_pattern=mp_inst)
Code example #6
def direct_nlp_prediction(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)
    file_keys = [
        f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))
    ]
    doc2predicted = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk),
                                   _concept_mapping)
        d = cr.full_text_file_pattern % fk
        for ann in cr.annotations:
            if ann.cui in _cm_obj.concept2label:
                lbl = _cm_obj.concept2label[ann.cui][0]
                pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end,
                                      ann.negation, ann.temporality,
                                      ann.experiencer, 'StudyName', lbl)
                put_ann_label(lbl, pheAnn, doc2predicted, d)
        for ann in cr.phenotypes:
            put_ann_label(ann.minor_type, ann, doc2predicted, d)
    return doc2predicted
Code example #7
def phenotype_counting(phenotype_def, concept_level_results, output_file):
    pd = utils.load_json_data(phenotype_def)
    npd = {}
    cd = utils.read_text_file(concept_level_results)
    c_headers = cd[0].split('\t')
    headers = c_headers[2:-1]
    for r in cd[1:]:
        arr = r.split('\t')
        c = arr[0]
        num_mention = arr[12]
        for p in pd:
            if c in pd[p]['concepts']:
                po = npd[p] if p in npd else {'freq':0, 'p': p,
                                              'num_concepts': len(pd[p]['concepts'])}
                npd[p] = po
                po['freq'] += int(num_mention)
                for idx in range(2, len(arr) - 1):
                    h = headers[idx-2]
                    po[h] = int(arr[idx]) if h not in po else (int(arr[idx]) + int(po[h]))

    rows = ['\t'.join(['phenotype', 'num concepts'] + headers + ['prevalence'])]
    for p in npd:
        po = npd[p]
        rows.append('\t'.join([p, str(po['num_concepts'])] + [str(po[h]) for h in headers] + [str(po['freq'])]))
    utils.save_string('\n'.join(rows), output_file)
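
For reference, the phenotype definition JSON consumed above is assumed to map each phenotype name to an object holding a 'concepts' list of UMLS CUIs; the CUIs below are placeholders, not real values:

phenotype_def_example = {
    "phenotype A": {"concepts": ["C0000001", "C0000002"]},
    "phenotype B": {"concepts": ["C0000003"]}
}
# concept_level_results is a tab-separated file whose first column is the CUI
# and whose 13th column (arr[12]) holds the mention count added to 'freq'.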
Code example #8
def generate_hpo_umls_mapping(hpo_dump):
    lines = utils.read_text_file(hpo_dump)
    # lines = [u'id: HP:3000076', u'def: "An abnormality', u'xref: UMLS:C4073283']
    maps = []
    cur_map = None
    for l in lines:
        m = re.match(r'^id\: (HP\:\d+)', l)
        if m is not None:
            print('start with %s' % m.group(1))
            cur_map = {'hp': m.group(1), 'cuis': []}
            maps.append(cur_map)
        m = re.match(r'^xref: (UMLS:C\d+)', l)
        if m is not None:
            cur_map['cuis'].append(m.group(1))
        if l == 'is_obsolete: true':
            cur_map['is_obsolete'] = True
        m = re.match(r'^replaced_by: (HP:\d+)', l)
        if m is not None:
            cur_map['replaced_by'] = m.group(1)

    hpo2umls = {}
    obsolete2replace = {}
    for cur_map in maps:
        hpo2umls[cur_map['hp']] = cur_map['cuis'] if cur_map['hp'] not in hpo2umls \
            else cur_map['cuis'] + hpo2umls[cur_map['hp']]
        if 'is_obsolete' in cur_map and 'replaced_by' in cur_map:
            obsolete2replace[cur_map['hp']] = cur_map['replaced_by']
    for obs in obsolete2replace:
        if obsolete2replace[obs] in hpo2umls:
            hpo2umls[obs] = hpo2umls[obsolete2replace[obs]]
    print(json.dumps(hpo2umls))
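
A self-contained run of the same two regular expressions over the sample lines quoted in the comment above, showing the intermediate maps structure:

import re
import json

sample_lines = ['id: HP:3000076', 'def: "An abnormality', 'xref: UMLS:C4073283']
maps = []
cur_map = None
for l in sample_lines:
    m = re.match(r'^id\: (HP\:\d+)', l)
    if m is not None:
        cur_map = {'hp': m.group(1), 'cuis': []}
        maps.append(cur_map)
    m = re.match(r'^xref: (UMLS:C\d+)', l)
    if m is not None and cur_map is not None:
        cur_map['cuis'].append(m.group(1))
print(json.dumps(maps))  # [{"hp": "HP:3000076", "cuis": ["UMLS:C4073283"]}]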
Code example #9
def run_study(folder_path, no_sql_filter=None):
    study_config = 'study.json' if no_sql_filter is None else 'study_no_filter.json'
    if isfile(join(folder_path, study_config)):
        r = utils.load_json_data(join(folder_path, study_config))
        retained_patients = None
        if 'query_patients_file' in r:
            retained_patients = []
            lines = utils.read_text_file(r['query_patients_file'])
            for l in lines:
                arr = l.split('\t')
                retained_patients.append(arr[0])

        study(folder_path,
              r['cohort'],
              r['sql_config'],
              r['db_conn'],
              concept_mapping.get_umls_client_inst(r['umls_key']),
              do_preprocessing=r['do_preprocessing'],
              rule_setting_file=r['rule_setting_file'],
              do_one_iter=r['do_one_iter'],
              sem_idx_setting_file=None if 'sem_idx_setting_file' not in r else
              r['sem_idx_setting_file'],
              concept_filter_file=None
              if 'concept_filter_file' not in r else r['concept_filter_file'],
              retained_patients_filter=retained_patients,
              filter_obj_setting=None
              if 'filter_obj_setting' not in r else r['filter_obj_setting'],
              do_disjoint_computing=True
              if 'do_disjoint' not in r else r['do_disjoint'],
              export_study_concept_only=False if 'export_study_concept'
              not in r else r['export_study_concept'])
    else:
        print('study.json not found in the folder')
Code example #10
def copy_docs(index_setting_file,
              src_index,
              src_doc_type,
              entity_id_field_name,
              dest_index,
              dest_doc_type,
              patient_list_file,
              thread_num=30):
    """
    copy a list of docs (doc ids read from doc_list_file) from one index to another
    :param index_setting_file:
    :param src_index:
    :param src_doc_type:
    :param entity_id_field_name:
    :param dest_index:
    :param dest_doc_type:
    :param patient_list_file:
    :param thread_num:
    :return:
    """
    es = EntityCentricES.get_instance(index_setting_file)
    patients = utils.read_text_file(patient_list_file)
    utils.multi_thread_tasking(patients,
                               thread_num,
                               do_copy_doc,
                               args=[
                                   es, src_index, src_doc_type,
                                   entity_id_field_name, dest_index,
                                   dest_doc_type
                               ])
    print('all done')
Code example #11
def load_document_to_es(settings):
    """
    load document to elastic search
    :param settings:
    :return:
    """
    doc_folder = settings.get_attr(['epr_index', 'doc_folder'])
    d2p_tsv = settings.get_attr(['epr_index', 'doc2patient_tsv'])
    es = SemEHRES.get_instance_by_setting(
        settings.get_attr(['epr_index', 'es_host']),
        settings.get_attr(['epr_index', 'es_index_name']),
        settings.get_attr(['epr_index', 'doc_type']), '', '')
    tsv_lines = utils.read_text_file(d2p_tsv)
    d2p = {}
    for l in tsv_lines:
        arr = l.split('\t')
        if len(arr) > 1:
            d2p[arr[0]] = arr[1]
    for f in [f for f in listdir(doc_folder) if isfile(join(doc_folder, f))]:
        if f in d2p:
            p = d2p[f]
            t = utils.read_text_file_as_string(join(doc_folder, f))
            es.index_new_doc(
                index=settings.get_attr(['epr_index', 'es_index_name']),
                doc_type=settings.get_attr(['epr_index', 'doc_type']),
                data={
                    settings.get_attr(['epr_index', 'text_field']): t,
                    settings.get_attr(['epr_index', 'patient_id_field']): p,
                    "id": f
                },
                doc_id=f)
Code example #12
def index_cris_cohort():
    f_patient_doc = './hepc_pos_doc_brcid.txt'
    f_yodie_anns = 'U:/kconnect/hepc_output/'
    print('loading all docs at once...')
    docs = load_all_docs()
    print('docs read')
    doc_dict = {}
    for d in docs:
        doc_dict[d['CN_Doc_ID']] = d

    es = EntityCentricES.get_instance('./index_settings/es_cris_setting.json')
    lines = utils.read_text_file(f_patient_doc, encoding='utf-8-sig')
    doc_to_patient = {}
    for l in lines:
        arr = l.split('\t')
        doc_to_patient[arr[1]] = arr[0]
    container = []
    ann_files = [
        f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))
    ]
    for ann in ann_files:
        utils.multi_thread_large_file_tasking(
            join(f_yodie_anns, ann),
            20,
            do_index_cris,
            args=[es, doc_to_patient, doc_dict, container],
            file_encoding='iso-8859-1')
        print('file %s [%s] done' % (ann, len(container)))
    print('num done %s' % len(container))
    print('done')
Code example #13
def regenerate_manual_mapped_concepts(tsv, closure_file):
    selected_concepts = set()
    c2l = {}
    for l in utils.read_text_file(tsv):
        arr = l.split('\t')
        selected_concepts.add(arr[1])
        c2l[arr[1]] = arr[0]
    t2closure = utils.load_json_data(closure_file)
    mapped_concepts = []
    map = {}
    v_map = {}
    for t in t2closure:
        disjoint_list = list(set(t2closure[t]) & selected_concepts)
        if len(disjoint_list) > 0:
            mapped_concepts += disjoint_list
            map[t] = {
                "tc": {
                    "closure": len(disjoint_list),
                    "mapped": disjoint_list[0]
                },
                "concepts": disjoint_list
            }
            v_map[t] = [('%s [%s]' % (c2l[c], c)) for c in disjoint_list]
    print(json.dumps(map))
    print(selected_concepts - set(mapped_concepts))
    print(json.dumps(v_map))
Code example #14
def direct_nlp_prediction(settings):
    ann_dir = settings['test_ann_dir']
    test_text_dir = settings['test_fulltext_dir']
    _concept_mapping = settings['concept_mapping_file']
    _learning_model_dir = settings['learning_model_dir']
    _labels = utils.read_text_file(settings['entity_types_file'])
    ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
    _cm_obj = Concept2Mapping(_concept_mapping)
    file_keys = [
        f[:f.rfind('.')].replace('se_ann_', '') for f in listdir(ann_dir)
        if isfile(join(ann_dir, f))
    ]
    doc2predicted = {}
    for fk in file_keys:
        cr = CustomisedRecoginiser(join(ann_dir, 'se_ann_%s.json' % fk),
                                   _concept_mapping)
        d = fk
        for ann in cr.annotations:
            if ann.cui in _cm_obj.concept2label:
                if ann.negation != 'Affirmed' or len(ann.ruled_by) > 0:
                    continue
                lbl = _cm_obj.concept2label[ann.cui][0]
                pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end,
                                      ann.negation, ann.temporality,
                                      ann.experiencer, 'StudyName', lbl)
                put_ann_label(lbl, pheAnn, doc2predicted, d)
        for ann in cr.phenotypes:
            put_ann_label(ann.minor_type, ann, doc2predicted, d)
    return doc2predicted
Code example #15
File: nlp_to_phenome.py  Project: knowlab/nlp2phenome
def merge_mappings_dictionary(map_files, dict_dirs, new_map_file,
                              new_dict_folder):
    maps = [utils.load_json_data(mf) for mf in map_files]
    new_m = {}
    for m in maps:
        new_m.update(m)
    t2list = {}
    for dd in dict_dirs:
        lst_files = [
            f for f in listdir(dd)
            if isfile(join(dd, f)) and f.endswith('.lst')
        ]
        for f in lst_files:
            t = f[:f.index('.')]
            labels = utils.read_text_file(join(dd, f))
            if t not in t2list:
                t2list[t] = set()
            for l in labels:
                if len(l) > 0:
                    t2list[t].add(l)
    utils.save_json_array(new_m, new_map_file)
    logging.info('mapping saved to %s' % new_map_file)
    for t in t2list:
        utils.save_string('\n'.join(list(t2list[t])) + '\n',
                          join(new_dict_folder, t + '.lst'))
        logging.info('%s.lst saved' % t)
    logging.info('all done')
Code example #16
def load_corpus_to_FHIR_mapping(tsv_map_file):
    lines = utils.read_text_file(tsv_map_file)
    sec_to_fhir = {}
    for l in lines:
        arr = l.split('\t')
        for i in range(1, len(arr)):
            sec_to_fhir[arr[i]] = arr[0]
    return sec_to_fhir
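
The loop implies a TSV layout in which the first column is the FHIR target and every remaining column is a corpus section name mapped onto it. A quick sketch with made-up lines:

sample_lines = ['Condition\tDiagnosis\tProblem List',
                'MedicationStatement\tCurrent Medications']
sec_to_fhir = {}
for l in sample_lines:
    arr = l.split('\t')
    for i in range(1, len(arr)):
        sec_to_fhir[arr[i]] = arr[0]
print(sec_to_fhir)
# {'Diagnosis': 'Condition', 'Problem List': 'Condition',
#  'Current Medications': 'MedicationStatement'}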
Code example #17
def load_patient_date(patient_date_file):
    lines = utils.read_text_file(patient_date_file)
    print('patient file read. parsing...')
    p2time = {}
    for l in lines[1:]:
        arr = l.split(',')
        p2time[arr[0]] = datetime.strptime(arr[1], '%d/%m/%Y')
    return p2time
Code example #18
def load_doc_date(doc_date_file):
    lines = utils.read_text_file(doc_date_file)
    print('doc file read. parsing...')
    d2time = {}
    for l in lines[1:]:
        arr = l.split('\t')
        d2time[arr[0]] = datetime.strptime(arr[2], '%Y-%m-%d %H:%M:%S.%f')
    return d2time
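
The two loaders above rely on different strptime formats: day-first dates in the patient CSV and full timestamps in the document TSV. A quick standalone check of both format strings:

from datetime import datetime

print(datetime.strptime('25/12/2015', '%d/%m/%Y'))
# 2015-12-25 00:00:00
print(datetime.strptime('2015-12-25 13:45:02.123000', '%Y-%m-%d %H:%M:%S.%f'))
# 2015-12-25 13:45:02.123000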
Code example #19
def complete_tsv_concept_label(umls, tsv_file):
    lines = []
    for l in utils.read_text_file(tsv_file):
        arr = l.split('\t')
        print(arr)
        arr.insert(1, get_umls_concept_detail(umls, arr[1])['result']['name'])
        lines.append(arr)
    print('\n'.join(['\t'.join(l) for l in lines]))
Code example #20
def predict_exp(corpus_trans_file, ann_file, cache_file, output_file):
    # initialise pattern instances from documents
    if not isfile(cache_file):
        # load labelled data
        ann_lines = utils.read_text_file(ann_file)
        prev_doc = None
        anns = []
        doc_anns = []
        ptn_insts = []
        doc_to_pt = {}
        for ls in ann_lines:
            l = ls.split('\t')
            doc_id = l[1]
            doc_to_pt[doc_id] = l[0]
            if prev_doc != doc_id:
                if prev_doc is not None:
                    if exists(join(working_folder, 'docs',
                                   '%s.txt' % prev_doc)):
                        doc_anns.append((prev_doc, anns))
                anns = []
                prev_doc = doc_id
            anns.append({
                's': int(l[2]),
                'e': int(l[3]),
                'signed_label': l[4],
                'gt_label': l[5]
            })
        if prev_doc is not None:
            if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                doc_anns.append((prev_doc, anns))
        # multithreading: process the labelled docs
        print('processing docs...')
        utils.multi_thread_tasking(doc_anns,
                                   30,
                                   do_process_labelled_doc,
                                   args=[ptn_insts])
        jl.dump({'insts': ptn_insts, 'doc_to_pt': doc_to_pt}, cache_file)
    else:
        cached = jl.load(cache_file)
        ptn_insts = cached['insts']
        doc_to_pt = cached['doc_to_pt']

    cp = sp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    ret = []
    for inst in ptn_insts:
        print('predicting [%s]...' % inst.sentence)
        acc = cp.predcit(inst)
        print('accuracy: %s' % acc)
        ann = inst.annotations[0]
        ret.append(
            (doc_to_pt[inst.doc_id], inst.doc_id, str(ann['s']), str(ann['e']),
             ann['signed_label'], ann['gt_label'], str(acc)))
    s = []
    for r in ret:
        s.append(u'\t'.join(r))
    print(u'\n'.join(s))
    utils.save_json_array(ret, output_file)
    return ret
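
The block that groups consecutive annotation rows by document id can be expressed more compactly with itertools.groupby; a minimal sketch, assuming the same tab-separated column order (patient id, doc id, start, end, signed label, gold label) and leaving out the check that the document text file exists:

from itertools import groupby

def group_annotations(ann_lines):
    # Group consecutive tab-separated annotation rows by their document id (column 1).
    rows = [l.split('\t') for l in ann_lines]
    doc_anns = []
    for doc_id, grp in groupby(rows, key=lambda r: r[1]):
        anns = [{'s': int(r[2]), 'e': int(r[3]),
                 'signed_label': r[4], 'gt_label': r[5]} for r in grp]
        doc_anns.append((doc_id, anns))
    return doc_anns

# hypothetical rows, for illustration only
print(group_annotations(['p1\td1\t0\t5\tposM\tposM', 'p1\td1\t10\t15\tnegM\tnegM']))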
Code example #21
def load_tree_node_file(file_path, node_name=None):
    if node_name is None:
        p, fn = split(file_path)
        node_name = fn
    folder_node = create_folder_node('\\', node_name)
    for l in utils.read_text_file(file_path):
        leaf = create_leaf_node(folder_node['path'], l.split('\t')[0])
        folder_node['children'].append(leaf)
    return folder_node
Code example #22
def align_mapped_concepts(map_file, disorder_file):
    concept_map = utils.load_json_data(map_file)
    disorders = [d.strip() for d in utils.read_text_file(disorder_file)]
    exact_mapped = {}
    for d in disorders:
        if d in concept_map:
            exact_mapped[d] = concept_map[d]
        else:
            exact_mapped[d] = ""
    print(json.dumps(exact_mapped))
Code example #23
def produce_weka_output(predict_output_file,
                        orig_features_file,
                        merged_output_file,
                        arrf_file,
                        threshold=.70,
                        mode='threshold'):
    orig_data_lines = utils.read_text_file(orig_features_file)
    ret = utils.load_json_data(predict_output_file)
    ptn2anns = {}
    for r in ret:
        ptn = r[0]
        if ptn not in ptn2anns:
            ptn2anns[ptn] = {'posM': 0, 'negM': 0, 'hisM': 0, 'otherM': 0}
        if mode == 'threshold':
            if float(r[6]) >= threshold:
                ptn2anns[ptn][r[4]] += 1
        elif mode == 'weighted_sum':
            ptn2anns[ptn][r[4]] += float(r[6])

    rows = []
    arrf_header = """@RELATION	hepc

@ATTRIBUTE	Total_Mentions	NUMERIC
@ATTRIBUTE	Positive_Mentions	NUMERIC
@ATTRIBUTE	History_hypothetical_Mentions	NUMERIC
@ATTRIBUTE	Negative_Mentions	NUMERIC
@ATTRIBUTE	Other_Experiencers	NUMERIC
@ATTRIBUTE	AT_Total_Mentions	NUMERIC
@ATTRIBUTE	AT_Positive_Mentions	NUMERIC
@ATTRIBUTE	AT_History_hypothetical_Mentions	NUMERIC
@ATTRIBUTE	AT_Negative_Mentions	NUMERIC
@ATTRIBUTE	AT_Other_Experiencers	NUMERIC
@ATTRIBUTE	class	{positive,negative,unknown}


@DATA
"""
    arrf_rows = []
    for l in orig_data_lines:
        arr = l.split('\t')
        ptn = arr[0]
        new_line = arr[:6] + \
                   ([str(ptn2anns[ptn]['posM'] + ptn2anns[ptn]['negM'] + ptn2anns[ptn]['hisM'] + ptn2anns[ptn]['otherM']),
                                str(ptn2anns[ptn]['posM']),
                                str(ptn2anns[ptn]['hisM']),
                                str(ptn2anns[ptn]['negM']),
                                str(ptn2anns[ptn]['otherM'])] if ptn in ptn2anns else ['0','0','0','0','0']) + \
                   [arr[6]]
        rows.append(new_line)
        arrf_rows.append(','.join(new_line[1:]))

    utils.save_string(arrf_header + '\n'.join(arrf_rows), arrf_file)
    utils.save_string('\n'.join(['\t'.join(r) for r in rows]),
                      merged_output_file)
Code example #24
def load_episode_data(file_path, date_format='%d/%m/%Y %H:%M'):
    lines = utils.read_text_file(file_path)
    eps = []
    for l in lines:
        arr = l.split('\t')
        eps.append({'brcid': arr[0],
                    'win1': {'s': datetime.strptime(arr[1], date_format), 'e': datetime.strptime(arr[2], date_format)},
                    'win2': {'s': datetime.strptime(arr[3], date_format), 'e': datetime.strptime(arr[4], date_format)},
                    'win3': {'s': datetime.strptime(arr[5], date_format), 'e': datetime.strptime(arr[6], date_format)}
                    })
    return eps
Code example #25
def load_patient_truth(truth_file):
    all_pids = []
    lines = utils.read_text_file(truth_file)
    type2ids = {}
    for l in lines:
        arr = l.split('\t')
        if arr[2] not in type2ids:
            type2ids[arr[2]] = []
        type2ids[arr[2]].append(arr[0])
        all_pids.append(arr[0])
    return type2ids, all_pids
Code example #26
def extract_doc_level_ann(ann_dump, output_folder):
    """

    extract doc level annotations and save to separate files
    :param ann_dump:
    :param output_folder:
    :return:
    """
    lines = utils.read_text_file(ann_dump)
    for l in lines:
        doc_ann = json.loads(l)
        utils.save_string(l, join(output_folder, doc_ann['docId'].split('.')[0] + '.json'))
Code example #27
def index_cris_patients():
    f_patient_doc = './hepc_pos_doc_brcid.txt'
    lines = utils.read_text_file(f_patient_doc, encoding='utf-8-sig')
    patients = []
    for l in lines:
        arr = l.split('\t')
        if arr[0] not in patients:
            patients.append(arr[0])
    print('total patients %s %s' % (len(patients), patients[0]))
    es = EntityCentricES.get_instance('./index_settings/es_cris_setting.json')
    utils.multi_thread_tasking(patients, 10, do_index_patient, args=[es])
    print('done')
Code example #28
def process_labelled_docs(labelled_file, corpus_model_file, mini_comp_file):
    corpus_analyzer = None
    if not isfile(corpus_model_file):
        # load labelled data
        ann_lines = utils.read_text_file(labelled_file)
        prev_doc = None
        anns = []
        doc_anns = []
        ptn_insts = []
        for ls in ann_lines:
            l = ls.split('\t')
            doc_id = l[0]
            if prev_doc != doc_id:
                if prev_doc is not None:
                    if exists(join(working_folder, 'docs',
                                   '%s.txt' % prev_doc)):
                        doc_anns.append((prev_doc, anns))
                anns = []
                prev_doc = doc_id
            anns.append({
                's': int(l[1]),
                'e': int(l[2]),
                'signed_label': l[3],
                'gt_label': l[4]
            })
        if prev_doc is not None:
            if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                doc_anns.append((prev_doc, anns))
        # multithreading: process the labelled docs
        print('processing docs...')
        utils.multi_thread_tasking(doc_anns,
                                   30,
                                   do_process_labelled_doc,
                                   args=[ptn_insts])
        print('merging patterns...')
        corpus_analyzer = sp.CorpusAnalyzer()
        for pi in ptn_insts:
            corpus_analyzer.add_pattern(pi)
        corpus_analyzer.serialise(corpus_model_file)
    else:
        corpus_analyzer = sp.CorpusAnalyzer.load_seralisation(
            corpus_model_file)
        # corpus_analyzer.show()
        # pt_insts = corpus_analyzer.pattern_to_insts

    if isfile(mini_comp_file):
        corpus_analyzer.load_mini_comp_dict(mini_comp_file)
    else:
        corpus_analyzer.produce_save_comp_dict(mini_comp_file)
    corpus_analyzer.show_mini_comp_patterns()
    # generate_corpus_model(corpus_analyzer)
    return corpus_analyzer
Code example #29
 def run(self, quota, **query_kwargs):
     
     for t in range(quota):
         
         if t % 10 == 0 and t != 0:
             print("Query# {}".format(t))
         
         #train model with current labeled train data
         X_train, y_train, _ = self.dataset.get_labeled_data('train')
         self.model.fit(X_train, y_train)
         
         #make active query
         ask_id = self.qs.make_query(self.model, self.dataset, **query_kwargs)
         
         # check current prediction for this id and add user given
         if self.dataset.mode == 'active':
             # prompt user here to label by showing document text
             if not self.docid2path:
                 raise ValueError('document paths are required for interactive learning')
             doctext = utils.read_text_file(self.docid2path[ask_id])
             print("================================================")
             print("                 Document Text                  ")
             print("================================================")
             print(doctext)
             print()
             true_class_ask_id = int(input('DOCUMENT LABEL (Hint: {}): '.format(self.y_ideal[ask_id])))
             _ = os.system('cls')
         else:
             true_class_ask_id = self.dataset.ask_label(ask_id)
         
         # accumulate user labeled example for validation set (if set)
         if self.progressive_validation and t % self.pr_rate == 0 and t > 1:
             self.dataset.update(ask_id, true_class_ask_id, 'valid')
         else:
             # update model and dataset
             self.dataset.update(ask_id, true_class_ask_id, 'query', t=t)
         
         self.relevant_found = sum([1 for props in self.dataset.index2props.values() if props.y_true == 1])
         
         if t > 1 and t % self.eval_at == 0:
             self.supervised_eval('train')
             self.eval_xs['train'].append(t)
             if self.dataset.mode:
                 self.active_simulation_eval()
                 self.eval_xs['simulate'].append(t)
             if self.progressive_validation:
                 # if we have at least 5 positive classes in validation, start evaluation
                 pos_val_count = sum([1 for props in self.dataset.index2props.values() if props.is_valid and props.y_true == 1])
                 if pos_val_count >= 5:
                     self.supervised_eval('valid')
                     self.eval_xs['valid'].append(t)
Code example #30
 def __iter__(self):
     for path in super().__iter__():
         if self.match_ids:
             doc_id = os.path.split(path)[1][:-4]
             if doc_id not in self.match_ids:
                 continue
         if self.read:
             if self.as_lines:
                 lines_iter = read_doc_lines(path, encoding=self.encoding)
                 yield lines_iter, path
             else:
                 raw_text = read_text_file(path, encoding=self.encoding)
                 yield raw_text, path
         else:
             yield path
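
For context, a stripped-down iterator with the same yield behaviour (path only, raw text, or a line iterator plus the path), written directly against pathlib instead of the project's base class; all names here are illustrative, not the project's API:

from pathlib import Path

class DocIterator:
    def __init__(self, folder, match_ids=None, read=True, as_lines=False, encoding='utf-8'):
        self.folder = Path(folder)
        self.match_ids = set(match_ids) if match_ids else None
        self.read = read
        self.as_lines = as_lines
        self.encoding = encoding

    def __iter__(self):
        for path in sorted(self.folder.glob('*.txt')):
            # skip documents whose id (file stem) is not in the requested set
            if self.match_ids and path.stem not in self.match_ids:
                continue
            if not self.read:
                yield path
            elif self.as_lines:
                with path.open(encoding=self.encoding) as f:
                    yield iter(f.readlines()), path
            else:
                yield path.read_text(encoding=self.encoding), path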