def load_document_to_es(settings):
    """
    Load documents from a folder into Elasticsearch, attaching the patient
    id looked up from a doc-to-patient TSV file.
    :param settings: configuration object exposing get_attr(path_list)
    :return: None
    """
    doc_folder = settings.get_attr(['epr_index', 'doc_folder'])
    d2p_tsv = settings.get_attr(['epr_index', 'doc2patient_tsv'])
    es = SemEHRES.get_instance_by_setting(
        settings.get_attr(['epr_index', 'es_host']),
        settings.get_attr(['epr_index', 'es_index_name']),
        settings.get_attr(['epr_index', 'doc_type']), '', '')
    # build the doc-id -> patient-id lookup from the TSV (col 0 -> col 1)
    doc2patient = {}
    for line in utils.read_text_file(d2p_tsv):
        cols = line.split('\t')
        if len(cols) > 1:
            doc2patient[cols[0]] = cols[1]
    # index every plain file in the folder that has a patient mapping
    for fname in listdir(doc_folder):
        if not isfile(join(doc_folder, fname)):
            continue
        if fname not in doc2patient:
            continue
        pid = doc2patient[fname]
        content = utils.read_text_file_as_string(join(doc_folder, fname))
        es.index_new_doc(
            index=settings.get_attr(['epr_index', 'es_index_name']),
            doc_type=settings.get_attr(['epr_index', 'doc_type']),
            data={
                settings.get_attr(['epr_index', 'text_field']): content,
                settings.get_attr(['epr_index', 'patient_id_field']): pid,
                "id": fname
            },
            doc_id=fname)
def icd10_queries():
    """
    Enumerate candidate ICD-10 codes A00..Z99, query BioPortal's SPARQL
    endpoint for each code's UMLS CUI and preferred label, and print the
    resulting code->CUI mapping as JSON.
    NOTE(review): Python 2 only (xrange, print statements); requires an
    NCBO API key stored in ./resources/HW_NCBO_KEY.txt.
    """
    endpoint = 'http://sparql.bioontology.org/sparql/'
    query_template = """
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT distinct ?umls, ?label
FROM <http://bioportal.bioontology.org/ontologies/ICD10>
WHERE {{
  <http://purl.bioontology.org/ontology/ICD10/{}> <http://bioportal.bioontology.org/ontologies/umls/cui> ?umls;
  <http://www.w3.org/2004/02/skos/core#prefLabel> ?label.
  ?s <http://bioportal.bioontology.org/ontologies/umls/isRoot> true.
}}
    """
    icd2umls = {}
    # iterate code letters A..Z with two-digit numeric suffixes 00..99
    for c in range(ord('A'), ord('Z') + 1):
        for i in xrange(0, 100):
            # zero-pad single digits so codes look like 'A05', not 'A5'
            icd = '%s%s' % (chr(c), '0' + str(i) if i <= 9 else str(i))
            q = query_template.format(icd)
            # key file is re-read on every iteration - presumably cheap enough
            ret = json.loads(
                query(
                    q,
                    utils.read_text_file_as_string(
                        './resources/HW_NCBO_KEY.txt'), endpoint))
            ret = ret['results']['bindings']
            # keep only the first binding when the code resolves to a CUI
            if len(ret) > 0:
                icd2umls[icd] = ret[0]['umls']['value']
                print '%s\t%s\t%s' % (icd, ret[0]['umls']['value'],
                                      ret[0]['label']['value'])
    print json.dumps(icd2umls)
 def convert_text_ann_from_files(full_text_folder,
                                 ann_folder,
                                 output_folder,
                                 full_text_file_pattern='(%s).txt',
                                 ann_file_pattern='se_ann_%s.json',
                                 output_file_pattern='%s.txt.knowtator.xml',
                                 ann_to_convert=None):
     """
     Convert per-document JSON annotations into eHOST knowtator XML files.
     Also rewrites each full-text file with '\r' mapped to a space
     (same-length replacement, so annotation offsets are preserved).
     :param full_text_folder: folder holding the full-text files
     :param ann_folder: folder holding the JSON annotation files
     :param output_folder: destination for the generated XML files
     :param full_text_file_pattern: regex whose group(1) is the doc key
     :param ann_file_pattern: filename template for annotation files
     :param output_file_pattern: filename template for output XML files
     :param ann_to_convert: passed through to AnnConverter.to_eHOST
     """
     pattern = re.compile(full_text_file_pattern)
     for fname in listdir(full_text_folder):
         if not isfile(join(full_text_folder, fname)):
             continue
         logging.info('working on [%s]' % fname)
         matched = pattern.match(fname)
         if matched is None:
             continue
         file_key = matched.group(1)
         text = utils.read_text_file_as_string(
             join(full_text_folder, fname))
         anns = utils.load_json_data(
             join(ann_folder, ann_file_pattern % file_key))
         xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, file_key),
                                     full_text=text,
                                     ann_to_convert=ann_to_convert)
         utils.save_string(
             xml, join(output_folder, output_file_pattern % file_key))
         # '\r' -> ' ' keeps string length identical, so offsets still match
         utils.save_string(text.replace('\r', ' '),
                           join(full_text_folder, fname))
         logging.info('doc [%s] done' % file_key)
def do_index_pubmed_docs(doc_obj, es, full_text_path):
    if 'pmcid' in doc_obj:
        pmcid = doc_obj['pmcid']
        doc_obj['fulltext'] = utils.read_text_file_as_string(
            join(full_text_path, pmcid))
        es.index_document(doc_obj, pmcid)
        print 'doc %s indexed' % pmcid
def do_process_labelled_doc(doc_anns, container):
    """
    Process one labelled document and extend container with the results.
    :param doc_anns: (doc_id, annotations) pair
    :param container: list extended in place with doc_processing output
    """
    # NOTE(review): relies on module-level working_folder, nlp and
    # doc_processing being defined elsewhere in this module
    doc_id = doc_anns[0]
    anns = doc_anns[1]
    doc_text = utils.read_text_file_as_string(
        join(working_folder, 'docs', '%s.txt' % doc_id),
        encoding='utf-8')
    container += doc_processing(nlp, doc_text, anns, doc_id)
def dir_anonymisation(folder, rule_file):
    anonymis_inst = AnonymiseRule(rule_file)
    onlyfiles = [f for f in listdir(folder) if isfile(join(folder, f))]
    container = []
    sent_data = []
    for f in onlyfiles:
        text = utils.read_text_file_as_string(join(folder, f))
        print anonymise_doc(f, text, container, anonymis_inst, sent_data)
def get_umls_client_inst(umls_key_file):
    """
    create a umls client instance using the key stored in give file
    :param umls_key_file: the text file containing UMLS API key
    :return:
    """
    key = utils.read_text_file_as_string(umls_key_file)
    print key
    return UMLSAPI(key)
# Example #8 (scrape-artifact marker; original score: 0)
 def get_doc_by_id(self, doc_id):
     doc_path = self._path_template.format(**{'doc_id': doc_id})
     print 'working on %s' % doc_path
     text = utils.read_text_file_as_string(doc_path)
     if doc_id in self._doc_to_anns:
         self.add_annotations(self._doc_to_anns[doc_id])
     if self.need_preprocess:
         text = FileIterDocs.preprocess(text)
     return self.markup_annotations(text)
# Example #9 (scrape-artifact marker; original score: 0)
def do_index_pubmed(line, es, pmcid_to_journal, full_text_path):
    """
    Index one pubmed annotation line into the entity store.
    :param line: JSON string with 'docId' and 'annotations'
    :param es: client exposing index_entity_data(entity_id, doc_id, ann, data)
    :param pmcid_to_journal: dict mapping pmcid -> journal name
    :param full_text_path: folder holding full-text files named by pmcid
    :return: None; docs whose pmcid has no journal mapping are skipped
    """
    ann_data = json.loads(line)
    pmcid = ann_data['docId']
    if pmcid not in pmcid_to_journal:
        return
    journal_name = pmcid_to_journal[pmcid]
    full_text = utils.read_text_file_as_string(join(full_text_path, pmcid))
    # entity id is the upper-cased md5 hex digest of the journal name
    es.index_entity_data(hashlib.md5(journal_name).hexdigest().upper(),
                         pmcid, ann_data['annotations'][0],
                         {"pmcid": pmcid,  # fixed key: was "pmcid:" (stray colon)
                          "fulltext": full_text})
# Example #10 (scrape-artifact marker; original score: 0)
def query_result(q, endpoint_url, key_file):
    """
    POST a SPARQL query to the given endpoint and return the result bindings.
    :param q: the SPARQL query string
    :param endpoint_url: the SPARQL endpoint URL
    :param key_file: text file holding the API key
    :return: the 'bindings' list from the SPARQL JSON results
    """
    headers = {
        "Accept": "application/sparql-results+json",
        "Content-Type": "application/x-www-form-urlencoded"
    }
    # NOTE(review): the form body concatenates the key and query without
    # URL-encoding; a query containing '&' or '=' would corrupt the body,
    # and 'apikey:' (colon) is an unusual form-field separator - confirm
    # against the endpoint's expected request format
    response = utils.http_post_result(endpoint_url,
                                      "apikey:" + utils.read_text_file_as_string(key_file) + "&query=" + q,
                                      headers=headers)
    print response
    ret = json.loads(response)
    return ret['results']['bindings']
def complete_samples(sample_file, complete_sql, db_conn_file, out_file):
    ann_prefix = 'var sample_docs='
    anns_str = utils.read_text_file_as_string(sample_file)
    if anns_str.startswith(ann_prefix):
        anns_str = anns_str[len(ann_prefix):]
    anns = json.loads(anns_str)
    # anns = utils.load_json_data(sample_file)
    key_anns = []
    for k in anns:
        key_anns.append((k, anns[k]))
    container = []
    utils.multi_thread_tasking(key_anns, 40, complete_sample_ann_data,
                               args=[complete_sql, db_conn_file, container])
    results = {}
    for r in container:
        results[r[0]] = r[1]
    utils.save_string(ann_prefix + json.dumps(results), out_file)
    print 'done'
# Example #12 (scrape-artifact marker; original score: 0)
def get_what_is_changing(ann_folder,
                         text_folder,
                         output_file,
                         eHostAnnFile=True):
    """
    Collect abstracted sentence contexts ("what is getting better/worse")
    for every annotation, grouped by annotation label, and save them as a
    JSON array.
    :param ann_folder: folder of annotation files
    :param text_folder: folder of the matching text files
    :param output_file: destination JSON file
    :param eHostAnnFile: True to read eHOST ann files, False for generated ones
    """
    nlp = rr.get_nlp_instance()
    type2abstractions = {}
    for fname in listdir(ann_folder):
        if not isfile(join(ann_folder, fname)):
            continue
        # the annotation file name carries a 14-char suffix over the text name
        if eHostAnnFile:
            ann_doc = eHostAnnDoc(join(ann_folder, fname))
            anns = ann_doc.get_ess_entities(no_context=True)
        else:
            ann_doc = eHostGenedDoc(join(ann_folder, fname))
            anns = ann_doc.get_ess_entities()
        if len(anns) == 0:
            logging.info('anns is empty for [{:s}]'.format(fname))
        text = utils.read_text_file_as_string(join(text_folder, fname[0:-14]),
                                              encoding='cp1252')
        sents = rr.get_sentences_as_anns(nlp, text)
        for ann in anns:
            for sent in sents:
                if not ann.overlap(sent):
                    continue
                abss = rr.AbstractedSentence(1)
                abss.text = sent.str
                result = abss.get_abstaction_by_pos(
                    abss.locate_pos(ann.str), nlp)
                if result is None:
                    logging.info('%s not found in %s' % (ann.str, fname))
                    continue
                label = ann.label
                if label not in type2abstractions:
                    type2abstractions[label] = []
                type2abstractions[label].append(result.to_dict())
    logging.debug(type2abstractions)
    utils.save_json_array(type2abstractions, output_file)
# Example #13 (scrape-artifact marker; original score: 0)
def process_batched_docs(folder_path, out_folder):
    """
    Split batched CSV-like export files into one file per document.
    Each batch file contains record headers of the form '<id>,"' at line
    starts; the text between two headers is saved under the preceding id.
    :param folder_path: folder holding the batch files
    :param out_folder: folder that receives the per-document files
    """
    if isdir(folder_path):
        for f in listdir(folder_path):
            if isfile(join(folder_path, f)):
                t = utils.read_text_file_as_string(join(folder_path, f))
                print 'processing %s' % join(folder_path, f)
                print t
                # record header: a line starting with digits followed by ,"
                mit = re.finditer(r'^(\d+)\,\"', t, re.MULTILINE)
                prev_pos = 0
                prev_id = None
                for m in mit:
                    if prev_pos > 0:
                        # m.start()-2 trims the two chars before the next
                        # header - presumably a closing quote + newline;
                        # verify against the actual export format
                        utils.save_string(t[prev_pos:m.start()-2], join(out_folder, prev_id))
                    prev_pos = m.end()
                    prev_id = m.string[m.start(1):m.end(1)]
                if prev_id is not None:
                    # last record: drop the final char (trailing quote/newline)
                    utils.save_string(t[prev_pos:len(t) - 1], join(out_folder, prev_id))
                else:
                    print 'ERROR!! pattern not found in %s' % join(folder_path, f)
# Example #14 (scrape-artifact marker; original score: 0)
 def populate_linux_odbc_setting(self, template_file='./docker/linux_odbc_init_temp.sh'):
     """
     to access ms sql from linux, odbc settings need to be configured before dsn based
     access. this function automates such configuration
     :param template_file: the template bash file
     :return: dsn settings for python odbc access
     """
     s = utils.read_text_file_as_string(template_file)
     ret = s.format(**{'host': self._conf['server'], 'port': self._conf['port'],
                       'database': self._conf['database']})
     # NOTE(review): this overwrites the template in place - the placeholders
     # are gone after the first run, so re-running formats an already-formatted file
     utils.save_string(ret, template_file)
     cmd = 'sh %s' % template_file
     p = Popen(cmd, shell=True, stderr=STDOUT)
     p.wait()
     if 0 != p.returncode:
         # fixed log-message typo: 'coide' -> 'code'
         logging.error('ERROR doing the ODBC setting, stopped with a code [%s]' % p.returncode)
         exit(p.returncode)
     return {'dsn': 'semehrdns', 'user': self._conf['user'],
             'password': self._conf['password'],
             'database': self._conf['database']}
def mimic_anonymisation(single_file, rule_file):
    """
    Run anonymisation over a MIMIC-style multi-record file and print, per
    annotation type, the de-duplicated sentences that were found.
    :param single_file: file containing records delimited by
        'START_OF_RECORD=...||||...||||' header lines
    :param rule_file: anonymisation rule definition file
    :return: None (results are printed)
    """
    doc = utils.read_text_file_as_string(single_file)
    # split into per-record texts and strip the end-of-record markers
    # (removed an unused counter variable from the original loop)
    arr = re.split(r'START\_OF\_RECORD=\d+\|\|\|\|\d+\|\|\|\|\r{0,1}\n', doc)
    texts = [t.replace('||||END_OF_RECORD\n', '') for t in arr]

    anonymis_inst = AnonymiseRule(rule_file)
    failed_docs = []
    sent_data = []
    # worker count of 1 kept from the original (effectively sequential)
    utils.multi_thread_tasking(texts,
                               1,
                               wrap_anonymise_doc,
                               args=[failed_docs, anonymis_inst, sent_data])
    # group sentences by annotation type
    t2sent = {}
    for s in sent_data:
        if s['type'] not in t2sent:
            t2sent[s['type']] = []
        t2sent[s['type']].append(s['sent'])
    # de-duplicate and print each type's sentences
    for t in t2sent:
        t2sent[t] = list(set(t2sent[t]))
        print('%s\n======\n%s\n\n' % (t, '\n'.join(t2sent[t])))
# Example #16 (scrape-artifact marker; original score: 0)
 def populate_linux_odbc_setting(
         self, template_file='./docker/linux_odbc_init_temp.sh'):
     """
     Configure Linux ODBC settings (needed before DSN-based MS SQL access)
     by formatting and running the given template shell script.
     :param template_file: the template bash file
     :return: dict of DSN settings for python odbc access
     """
     s = utils.read_text_file_as_string(template_file)
     ret = s.format(
         **{
             'host': self._conf['server'],
             'port': self._conf['port'],
             'database': self._conf['database']
         })
     # NOTE(review): overwrites the template in place - placeholders are
     # consumed on the first run, so a second run formats formatted text
     utils.save_string(ret, template_file)
     cmd = 'sh %s' % template_file
     p = Popen(cmd, shell=True, stderr=STDOUT)
     p.wait()
     if 0 != p.returncode:
         # fixed log-message typo: 'coide' -> 'code'
         logging.error(
             'ERROR doing the ODBC setting, stopped with a code [%s]' %
             p.returncode)
         exit(p.returncode)
     return {
         'dsn': 'semehrdns',
         'user': self._conf['user'],
         'password': self._conf['password'],
         'database': self._conf['database']
     }
def load_doc_from_dir(folder, doc_id):
    """
    Load '<doc_id>.json' from the folder and attach the matching
    '<doc_id>.txt' content under the 'TextContent' key.
    :param folder: folder holding the .json/.txt file pair
    :param doc_id: document identifier (file stem)
    :return: the document dict with 'TextContent' populated
    """
    json_path = join(folder, doc_id + '.json')
    txt_path = join(folder, doc_id + '.txt')
    doc_obj = utils.load_json_data(json_path)
    doc_obj['TextContent'] = utils.read_text_file_as_string(txt_path)
    return doc_obj
# Example #18 (scrape-artifact marker; original score: 0)
 def read_full_text(self, fk):
     """
     Read the full text for the given key, or return None when the file
     does not exist.
     :param fk: key substituted into the filename pattern
     :return: file content string, or None if missing
     """
     path = join(self._folder, self._pattern % fk)
     if not isfile(path):
         return None
     return utils.read_text_file_as_string(path)
def nlp_process_doc(doc_file, container):
    """
    Read a document file, run the module-level nlp pipeline over it and
    append the result to the container list.
    :param doc_file: path of the text file to process
    :param container: list receiving the nlp result (mutated in place)
    """
    text = utils.read_text_file_as_string(doc_file)
    container.append(nlp(text))
# Example #20 (scrape-artifact marker; original score: 0)
 def get_full_text(self, fk):
     """
     Lazily load and cache the full text for the given key; subsequent
     calls return the cached value regardless of fk.
     :param fk: key substituted into the full-text filename pattern
     :return: the cached full text, or None if it cannot be loaded
     """
     if self._full_text is None \
             and self._full_text_folder is not None \
             and self._full_text_file_pattern is not None:
         path = join(self._full_text_folder,
                     self._full_text_file_pattern % fk)
         self._full_text = utils.read_text_file_as_string(path,
                                                          encoding='utf-8')
     return self._full_text