def load_document_to_es(settings):
    """
    load documents into Elasticsearch
    :param settings: configuration object exposing get_attr(['epr_index', <key>]) for the index settings
    :return:
    """
    doc_folder = settings.get_attr(['epr_index', 'doc_folder'])
    d2p_tsv = settings.get_attr(['epr_index', 'doc2patient_tsv'])
    es = SemEHRES.get_instance_by_setting(
        settings.get_attr(['epr_index', 'es_host']),
        settings.get_attr(['epr_index', 'es_index_name']),
        settings.get_attr(['epr_index', 'doc_type']),
        '', '')
    # build the doc-id -> patient-id map from the TSV file
    tsv_lines = utils.read_text_file(d2p_tsv)
    d2p = {}
    for l in tsv_lines:
        arr = l.split('\t')
        if len(arr) > 1:
            d2p[arr[0]] = arr[1]
    for f in [f for f in listdir(doc_folder) if isfile(join(doc_folder, f))]:
        if f in d2p:
            p = d2p[f]
            t = utils.read_text_file_as_string(join(doc_folder, f))
            es.index_new_doc(
                index=settings.get_attr(['epr_index', 'es_index_name']),
                doc_type=settings.get_attr(['epr_index', 'doc_type']),
                data={
                    settings.get_attr(['epr_index', 'text_field']): t,
                    settings.get_attr(['epr_index', 'patient_id_field']): p,
                    "id": f
                },
                doc_id=f)

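# A minimal usage sketch for load_document_to_es. The _DictSettings wrapper and the
# literal paths below are illustrative assumptions, not part of this module; the only
# contract assumed is that the settings object returns nested config values via
# get_attr(['epr_index', <key>]) for the keys read in the function above.
class _DictSettings(object):
    def __init__(self, conf):
        self._conf = conf

    def get_attr(self, path):
        # walk the nested dict following the list of keys
        v = self._conf
        for k in path:
            v = v[k]
        return v


def _example_index_docs():
    conf = {'epr_index': {'doc_folder': './docs',
                          'doc2patient_tsv': './doc2patient.tsv',
                          'es_host': 'localhost:9200',
                          'es_index_name': 'epr_docs',
                          'doc_type': 'doc',
                          'text_field': 'fulltext',
                          'patient_id_field': 'patient_id'}}
    load_document_to_es(_DictSettings(conf))
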
def icd10_queries():
    endpoint = 'http://sparql.bioontology.org/sparql/'
    query_template = """
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT distinct ?umls, ?label
    FROM <http://bioportal.bioontology.org/ontologies/ICD10>
    WHERE {{
        <http://purl.bioontology.org/ontology/ICD10/{}>
            <http://bioportal.bioontology.org/ontologies/umls/cui> ?umls;
            <http://www.w3.org/2004/02/skos/core#prefLabel> ?label.
        ?s <http://bioportal.bioontology.org/ontologies/umls/isRoot> true.
    }}
    """
    icd2umls = {}
    for c in range(ord('A'), ord('Z') + 1):
        for i in xrange(0, 100):
            icd = '%s%s' % (chr(c), '0' + str(i) if i <= 9 else str(i))
            q = query_template.format(icd)
            ret = json.loads(
                query(q,
                      utils.read_text_file_as_string('./resources/HW_NCBO_KEY.txt'),
                      endpoint))
            ret = ret['results']['bindings']
            if len(ret) > 0:
                icd2umls[icd] = ret[0]['umls']['value']
                print '%s\t%s\t%s' % (icd, ret[0]['umls']['value'], ret[0]['label']['value'])
    print json.dumps(icd2umls)

def convert_text_ann_from_files(full_text_folder, ann_folder, output_folder,
                                full_text_file_pattern='(%s).txt',
                                ann_file_pattern='se_ann_%s.json',
                                output_file_pattern='%s.txt.knowtator.xml',
                                ann_to_convert=None):
    text_files = [f for f in listdir(full_text_folder) if isfile(join(full_text_folder, f))]
    p = re.compile(full_text_file_pattern)
    for f in text_files:
        logging.info('working on [%s]' % f)
        m = p.match(f)
        if m is not None:
            fk = m.group(1)
            text = utils.read_text_file_as_string(join(full_text_folder, f))
            anns = utils.load_json_data(join(ann_folder, ann_file_pattern % fk))
            xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, fk),
                                        full_text=text,
                                        ann_to_convert=ann_to_convert)
            utils.save_string(xml, join(output_folder, output_file_pattern % fk))
            utils.save_string(text.replace('\r', ' '), join(full_text_folder, f))
            logging.info('doc [%s] done' % fk)

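# Example call for convert_text_ann_from_files; the folder names and the explicit
# file pattern below are illustrative assumptions. A text file whose name is captured
# by the pattern's first group is paired with its SemEHR annotation file
# (ann_file_pattern % key) and written out as an eHOST knowtator XML file.
def _example_convert_anns():
    convert_text_ann_from_files('./corpus/full_texts',
                                './corpus/semehr_results',
                                './corpus/ehost_output',
                                full_text_file_pattern=r'(.*)\.txt',
                                ann_file_pattern='se_ann_%s.json',
                                output_file_pattern='%s.txt.knowtator.xml')
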
def do_index_pubmed_docs(doc_obj, es, full_text_path):
    if 'pmcid' in doc_obj:
        pmcid = doc_obj['pmcid']
        doc_obj['fulltext'] = utils.read_text_file_as_string(join(full_text_path, pmcid))
        es.index_document(doc_obj, pmcid)
        print 'doc %s indexed' % pmcid

def do_process_labelled_doc(doc_anns, container):
    doc_id = doc_anns[0]
    anns = doc_anns[1]
    doc = utils.read_text_file_as_string(join(working_folder, 'docs', '%s.txt' % doc_id),
                                         encoding='utf-8')
    # print doc
    container += doc_processing(nlp, doc, anns, doc_id)

def dir_anonymisation(folder, rule_file):
    anonymis_inst = AnonymiseRule(rule_file)
    onlyfiles = [f for f in listdir(folder) if isfile(join(folder, f))]
    container = []
    sent_data = []
    for f in onlyfiles:
        text = utils.read_text_file_as_string(join(folder, f))
        print anonymise_doc(f, text, container, anonymis_inst, sent_data)

def get_umls_client_inst(umls_key_file):
    """
    create a UMLS client instance using the key stored in the given file
    :param umls_key_file: the text file containing the UMLS API key
    :return:
    """
    key = utils.read_text_file_as_string(umls_key_file)
    print key
    return UMLSAPI(key)

def get_doc_by_id(self, doc_id):
    doc_path = self._path_template.format(**{'doc_id': doc_id})
    print 'working on %s' % doc_path
    text = utils.read_text_file_as_string(doc_path)
    if doc_id in self._doc_to_anns:
        self.add_annotations(self._doc_to_anns[doc_id])
    if self.need_preprocess:
        text = FileIterDocs.preprocess(text)
    return self.markup_annotations(text)

def do_index_pubmed(line, es, pmcid_to_journal, full_text_path):
    ann_data = json.loads(line)
    pmcid = ann_data['docId']
    if pmcid in pmcid_to_journal:
        journal_name = pmcid_to_journal[pmcid]
        es.index_entity_data(hashlib.md5(journal_name).hexdigest().upper(),
                             pmcid,
                             ann_data['annotations'][0],
                             {"pmcid": pmcid,
                              "fulltext": utils.read_text_file_as_string(join(full_text_path, pmcid))})

def query_result(q, endpoint_url, key_file):
    headers = {
        "Accept": "application/sparql-results+json",
        "Content-Type": "application/x-www-form-urlencoded"
    }
    response = utils.http_post_result(endpoint_url,
                                      "apikey:" + utils.read_text_file_as_string(key_file) + "&query=" + q,
                                      headers=headers)
    print response
    ret = json.loads(response)
    return ret['results']['bindings']

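# Usage sketch for query_result against the BioPortal SPARQL endpoint (the endpoint
# URL and key file path are the ones used by icd10_queries above; the query itself is
# only an illustrative example). The function returns the 'bindings' list from the
# SPARQL JSON results.
def _example_sparql_query():
    q = ('SELECT DISTINCT ?s WHERE '
         '{ ?s a <http://www.w3.org/2002/07/owl#Class> } LIMIT 5')
    bindings = query_result(q,
                            'http://sparql.bioontology.org/sparql/',
                            './resources/HW_NCBO_KEY.txt')
    for b in bindings:
        print b['s']['value']
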
def complete_samples(sample_file, complete_sql, db_conn_file, out_file):
    ann_prefix = 'var sample_docs='
    anns_str = utils.read_text_file_as_string(sample_file)
    if anns_str.startswith(ann_prefix):
        anns_str = anns_str[len(ann_prefix):]
    anns = json.loads(anns_str)
    # anns = utils.load_json_data(sample_file)
    key_anns = []
    for k in anns:
        key_anns.append((k, anns[k]))
    container = []
    utils.multi_thread_tasking(key_anns, 40, complete_sample_ann_data,
                               args=[complete_sql, db_conn_file, container])
    results = {}
    for r in container:
        results[r[0]] = r[1]
    utils.save_string(ann_prefix + json.dumps(results), out_file)
    print 'done'

def get_what_is_changing(ann_folder, text_folder, output_file, eHostAnnFile=True):
    """
    get what is getting better/worse
    :param ann_folder:
    :param text_folder:
    :param output_file:
    :param eHostAnnFile: True to parse files as eHOST annotation docs, otherwise as eHOST generated docs
    :return:
    """
    nlp = rr.get_nlp_instance()
    files = [f for f in listdir(ann_folder) if isfile(join(ann_folder, f))]
    type2abstractions = {}
    for f in files:
        anns = []
        text_file = join(text_folder, f[0:-14])
        if eHostAnnFile:
            d = eHostAnnDoc(join(ann_folder, f))
            anns = d.get_ess_entities(no_context=True)
        else:
            d = eHostGenedDoc(join(ann_folder, f))
            anns = d.get_ess_entities()
        if len(anns) == 0:
            logging.info('anns is empty for [{:s}]'.format(f))
        text = utils.read_text_file_as_string(join(text_folder, f[0:-14]), encoding='cp1252')
        sents = rr.get_sentences_as_anns(nlp, text)
        for ann in anns:
            for s in sents:
                if ann.overlap(s):
                    abss = rr.AbstractedSentence(1)
                    abss.text = s.str
                    result = abss.get_abstaction_by_pos(abss.locate_pos(ann.str), nlp)
                    if result is None:
                        logging.info('%s not found in %s' % (ann.str, f))
                        continue
                    type = ann.label
                    if type not in type2abstractions:
                        type2abstractions[type] = []
                    type2abstractions[type].append(result.to_dict())
    logging.debug(type2abstractions)
    utils.save_json_array(type2abstractions, output_file)

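# Example invocation of get_what_is_changing; the folder and output paths are
# placeholders. Annotation file names are assumed to end with a 14-character suffix
# (e.g. '.knowtator.xml'), which is stripped above to recover the matching text file
# name in text_folder.
def _example_what_is_changing():
    get_what_is_changing('./corpus/ehost_anns',
                         './corpus/full_texts',
                         './output/type2abstractions.json',
                         eHostAnnFile=True)
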
def process_batched_docs(folder_path, out_folder):
    if isdir(folder_path):
        for f in listdir(folder_path):
            if isfile(join(folder_path, f)):
                t = utils.read_text_file_as_string(join(folder_path, f))
                print 'processing %s' % join(folder_path, f)
                print t
                mit = re.finditer(r'^(\d+)\,\"', t, re.MULTILINE)
                prev_pos = 0
                prev_id = None
                for m in mit:
                    if prev_pos > 0:
                        utils.save_string(t[prev_pos:m.start() - 2], join(out_folder, prev_id))
                    prev_pos = m.end()
                    prev_id = m.string[m.start(1):m.end(1)]
                if prev_id is not None:
                    utils.save_string(t[prev_pos:len(t) - 1], join(out_folder, prev_id))
                else:
                    print 'ERROR!! pattern not found in %s' % join(folder_path, f)

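# process_batched_docs expects each file in folder_path to be a batched dump in which
# every record starts on a new line with '<numeric id>,"'; the text between two such
# headers is written out as a separate document named after the id. A hypothetical
# input fragment (for illustration only):
#
#   1001,"first note text ...
#   ... more lines of the first note ...
#   1002,"second note text ...
#
# Example call (paths are illustrative):
def _example_split_batched_docs():
    process_batched_docs('./batched_exports', './split_docs')
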
def populate_linux_odbc_setting(self, template_file='./docker/linux_odbc_init_temp.sh'):
    """
    To access MS SQL Server from Linux, ODBC settings need to be configured before
    DSN-based access is possible; this function automates that configuration.
    :param template_file: the template bash file
    :return: DSN settings for Python ODBC access
    """
    s = utils.read_text_file_as_string(template_file)
    ret = s.format(**{'host': self._conf['server'],
                      'port': self._conf['port'],
                      'database': self._conf['database']})
    utils.save_string(ret, template_file)
    cmd = 'sh %s' % template_file
    p = Popen(cmd, shell=True, stderr=STDOUT)
    p.wait()
    if 0 != p.returncode:
        logging.error('ERROR doing the ODBC setting, stopped with code [%s]' % p.returncode)
        exit(p.returncode)
    return {'dsn': 'semehrdns',
            'user': self._conf['user'],
            'password': self._conf['password'],
            'database': self._conf['database']}

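# Sketch of how the returned DSN settings might be consumed. Assumptions: pyodbc is
# the driver used downstream (not confirmed by this module), the `dao` argument stands
# for whatever object this method belongs to, and self._conf carries 'server', 'port',
# 'database', 'user' and 'password'.
def _example_odbc_connect(dao):
    import pyodbc
    dsn = dao.populate_linux_odbc_setting()
    conn = pyodbc.connect('DSN=%s;UID=%s;PWD=%s;DATABASE=%s' % (
        dsn['dsn'], dsn['user'], dsn['password'], dsn['database']))
    return conn
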
def mimic_anonymisation(single_file, rule_file):
    doc = utils.read_text_file_as_string(single_file)
    arr = re.split(r'START\_OF\_RECORD=\d+\|\|\|\|\d+\|\|\|\|\r{0,1}\n', doc)
    i = 0
    texts = []
    for t in arr:
        texts.append(t.replace('||||END_OF_RECORD\n', ''))
    anonymis_inst = AnonymiseRule(rule_file)
    failed_docs = []
    sent_data = []
    utils.multi_thread_tasking(texts, 1, wrap_anonymise_doc,
                               args=[failed_docs, anonymis_inst, sent_data])
    t2sent = {}
    for s in sent_data:
        if s['type'] not in t2sent:
            t2sent[s['type']] = []
        t2sent[s['type']].append(s['sent'])
    for t in t2sent:
        t2sent[t] = list(set(t2sent[t]))
        print('%s\n======\n%s\n\n' % (t, '\n'.join(t2sent[t])))

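# mimic_anonymisation expects a single MIMIC-style dump in which each record is
# delimited as below (the numeric fields are illustrative); records are split on the
# START_OF_RECORD header and the END_OF_RECORD marker is stripped.
#
#   START_OF_RECORD=1||||2151||||
#   ... note text ...
#   ||||END_OF_RECORD
#
# Example call (both paths are placeholders for the dump file and the anonymisation
# rule file consumed by AnonymiseRule):
def _example_mimic_anonymisation():
    mimic_anonymisation('./mimic/notes_dump.txt', './conf/anonymisation_rules.json')
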
def load_doc_from_dir(folder, doc_id):
    doc_obj = utils.load_json_data(join(folder, doc_id + '.json'))
    doc_obj['TextContent'] = utils.read_text_file_as_string(join(folder, doc_id + '.txt'))
    return doc_obj

def read_full_text(self, fk):
    p = join(self._folder, self._pattern % fk)
    if isfile(p):
        return utils.read_text_file_as_string(p)
    else:
        return None

def nlp_process_doc(doc_file, container):
    container.append(nlp(utils.read_text_file_as_string(doc_file)))

def get_full_text(self, fk):
    if self._full_text is None and self._full_text_folder is not None \
            and self._full_text_file_pattern is not None:
        self._full_text = utils.read_text_file_as_string(
            join(self._full_text_folder, self._full_text_file_pattern % fk),
            encoding='utf-8')
    return self._full_text