import codecs
import math
import os
import os.path
import re
import shutil
import sys
import time

import pandas as pd

import EMRdef

# Script: for every record file, collect the terms from ICD-10.txt that occur
# in each of its lines.

# Record files to scan.
# NOTE(review): EMRdef.txttq presumably returns the .txt paths found under the
# given directory -- confirm against EMRdef.
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd2')

# Term list, one term per line; errors='ignore' skips undecodable bytes.
with open(r'C:\Users\Administrator\Desktop\ICD-10.txt', errors='ignore') as dis:
    ds = dis.readlines()

# Same terms with the line breaks removed.
ds_cs = [re.sub('\n', '', term) for term in ds]
ryzd = []

for emrtxt in emrtxts:
    out = []
    # File name without directory and extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    # errors="ignore" was added by the author for Chinese text.
    with open(emrtxt, 'r', errors="ignore") as f:
        lines = f.readlines()
    for line in lines:
        line = re.sub('\n', '', line)
        # The original read `while ds_c in ds_cs:` with `ds_c` never bound,
        # which raises NameError; iterating over the terms is what the body
        # needs.
        for ds_c in ds_cs:
            # A term is kept when it uses exactly the same set of characters
            # as the line, or when it occurs in the line as a substring.
            if set(line) == set(ds_c) or line.find(ds_c) > -1:
                out.append(ds_c)
#-*- coding: UTF-8 -*-
# This file is used for data cleaning.
import codecs
import math
import os
import os.path
import re
import shutil
import sys
import time

import EMRdef

# Record files to clean.
# NOTE(review): EMRdef.txttq presumably returns the .txt paths found under the
# given directory -- confirm against EMRdef.
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRryzd')

# Respiratory disease catalogue, one term per line.
with open(r'D:\python\EMR\hxjbml.txt', errors="ignore") as hxjb:
    hxjbdic = hxjb.readlines()

# Strip the line breaks once instead of once per record line.
hxjb_terms = [re.sub('\n', '', hxjbc) for hxjbc in hxjbdic]

ryzd = []
for emrtxt in emrtxts:
    # File name without directory and extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    line_out = []
    # errors="ignore" was added by the author for Chinese text.
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            line = re.sub('\n', '', line)
            # Collapse whatever precedes "肺炎" so every variant reads "肺炎".
            line = re.sub(r'(.+?)肺炎', '肺炎', line)
            # A line is appended once for every catalogue term it contains.
            for hxjbc in hxjb_terms:
                if line.find(hxjbc) > -1:
                    line_out.append(line)
    # NOTE(review): the nesting of the next two statements is inferred from a
    # whitespace-mangled original; they are taken to run once per record.
    # `line_output` is only used by the commented-out lines below, and `ryzd`
    # receives `line_out` as in the original -- confirm that is intended.
    line_output = EMRdef.delre(line_out)
    ryzd.append(line_out)
    #line = '\n'.join(line_output)
    #EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2','.txt' ,emrpath,line)
import codecs
import math
import os
import os.path
import re
import shutil
import sys
import time

import pandas as pd

import EMRdef

# Script: append the content of the same-named file from the EHRsex directory
# to the lines of each record in EHRzhzd5.

# NOTE(review): EMRdef.txttq presumably returns the .txt paths found under the
# given directory -- confirm against EMRdef.
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd5')
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRsex')
ryzd = []

# Index the second directory by file stem once. The original re-opened and
# re-read every one of these files for every record and never closed them.
emrtxt2_by_stem = {}
for emrtxt2 in emrtxt2s:
    emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]
    emrtxt2_by_stem.setdefault(emrpath2, []).append(emrtxt2)

for emrtxt in emrtxts:
    # File name without directory and extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    # errors="ignore" was added by the author for Chinese text.
    with open(emrtxt, 'r', errors="ignore") as f:
        lines = f.read()
    # Drop all spaces, then break the text back into lines.
    lines = re.sub(' ', '', lines)
    lines = re.split('\n', lines)
    for emrtxt2 in emrtxt2_by_stem.get(emrpath, []):
        with open(emrtxt2, 'r', errors="ignore") as f2:
            lines2 = f2.read()
        lines.append(lines2)
        # NOTE(review): the original layout was lost; this append is taken to
        # belong to the matching branch. Confirm it was not meant to run once
        # per record regardless of a match.
        ryzd.append(lines)

# NOTE(review): the original ends with a bare, truncated `print`; its argument
# is not visible in this chunk.
#-*- coding: UTF-8 -*-
# This file collects every txt file in the target directory, extracts the
# lines that contain the keyword into a specified directory, and creates new
# files from the extracted keyword text.
import codecs
import math
import os
import os.path
import re
import shutil
import sys
import time

import EMRdef

# Raw string: in a normal literal `\U` and `\x` start escape sequences, so the
# original path was a SyntaxError on Python 3.
# NOTE(review): EMRdef.txttq presumably returns the .txt paths found under the
# given directory -- confirm against EMRdef.
emrtxts = EMRdef.txttq(r'C:\Users\Administrator\Desktop\xiaochuan')

# Punctuation to strip. The '.' is escaped so it matches a literal full stop
# rather than any character. Not used in the visible part of the script.
pattern = r',|\.|,|。|;|;'

for emrtxt in emrtxts:
    # errors="ignore" was added by the author for Chinese text.
    with open(emrtxt, 'r', errors="ignore") as f:
        record_lines = f.readlines()
    emrtxt = os.path.basename(emrtxt)
    # ID = the part of the file name before the first underscore
    # (empty string when the name has no underscore).
    emrtxt_str = re.findall(r'(^.+?)\_', emrtxt)
    emrtxt = "".join(emrtxt_str)
    #txtp=txtp.decode('utf-8')
    for line in record_lines:
        line = re.sub(' ', '', line)  # remove spaces
        # Only lines where the admission-diagnosis label starts at index 0 or 1.
        if line.find(u'入院诊断:', 0, 6) > -1:
            line = re.sub(r'h|H', '小时', line)  # spell the hour abbreviation in Chinese
            line = re.sub(r'入院诊断:', '', line)  # drop the label itself
            # One item per line: break at punctuation. The '?' is escaped; as a
            # bare trailing alternative it made the pattern fail to compile.
            line = re.sub(r';|。|,|;|\?', '\n', line)
            # Replace list numbering such as "1、" with a line break.
            line = re.sub(r'\d+、', '\n', line)
            # NOTE(review): this call is cut off after `emrtxt,` in the
            # original; `line` is assumed to be the missing last argument,
            # mirroring the other four-argument text_create calls -- confirm.
            EMRdef.text_create(r'F:\zljh', '.txt', emrtxt, line)
# Extraction driven by a dictionary.
#-*- coding: UTF-8 -*-
# This file extracts all indicators according to the indicator parameters.
import os
import re

import EMRdef

# Step 1: split every record at punctuation and save one fragment per line.
# NOTE(review): EMRdef.txttq presumably returns the .txt paths found under the
# given directory -- confirm against EMRdef.
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc')
pattern2 = r'、|;|:|、|:|,'  # punctuation used as split points
for emrtxt2 in emrtxt2s:
    # errors="ignore" was added by the author for Chinese text.
    with open(emrtxt2, 'r', errors="ignore") as f2:
        f2_end = re.split(pattern2, f2.read())
    f2_out = "\n".join(f2_end)  # back to a single string
    # File name without directory and extension.
    emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]
    EMRdef.text_create(r'D:\DeepLearning ER\EHRzlgc3', '.txt', emrpath2, f2_out)
    #EMRdef.text_save(emrtxt,f_end)

# ----------------------------------------------------------------------------
# Step 2: extract paragraphs according to the lab-test indicators.
with open(r'D:\python\EMR\hyzb.txt', 'r', errors="ignore") as b:
    brl = b.readlines()
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc3')
# Punctuation to strip. The parentheses are escaped: left bare, `(|)` is a
# capture group that matches the empty string instead of the two characters.
# NOTE(review): full-width parentheses may also need listing -- confirm.
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|,|。|:|;|‘|’|【|】|\(|\)|·|!|、|…'
for emrtxt in emrtxts:
    with open(emrtxt, 'r', errors="ignore") as f:
        # File name without directory and extension.
        emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
        test_out = []
        # NOTE(review): the rest of this loop body lies outside the visible
        # chunk; anything that reads `f` must stay inside this `with` block.
import codecs
import math
import os
import os.path
import re
import shutil
import sys
import time

import EMRdef

# Script: pair each file in EHRryzd with the same-named file in EHRzlgc4 and
# concatenate their lines (second file first).

# NOTE(review): EMRdef.txttq presumably returns the .txt paths found under the
# given directory -- confirm against EMRdef.
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRryzd')
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc4')
out = []

# Index the second directory by file stem once, instead of re-deriving every
# stem for every file of the first directory.
emrtxt2_by_stem = {}
for emrtxt2 in emrtxt2s:
    emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]
    emrtxt2_by_stem.setdefault(emrpath2, []).append(emrtxt2)

for emrtxt in emrtxts:
    # File name without directory and extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    for emrtxt2 in emrtxt2_by_stem.get(emrpath, []):
        # errors="ignore" was added by the author for Chinese text; the
        # `with` block closes both handles, which the original left open.
        with open(emrtxt, 'r', errors="ignore") as f, \
                open(emrtxt2, 'r', errors="ignore") as f2:
            a = f.readlines()
            b = f2.readlines()
        c = b + a
        # NOTE(review): the original is cut off here; what is done with `c`
        # (and with `out`) is not visible in this chunk.
#-*- coding: UTF-8 -*-
# Segment the text by administration route and by dosage / dosage form.
import os
import re
import string

import EMRdef

# NOTE(review): EMRdef.txttq presumably returns the .txt paths found under the
# given directory -- confirm against EMRdef.
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc4')

# Earlier punctuation pattern, left disabled by the author:
#pattern = r',|;|\*|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|,|。|:|;|‘|’|\+|\-|【|】| \)|\( |(|)|·|!|、|…'
# Punctuation to strip. The parentheses are escaped: left bare, `(|)` is a
# capture group matching the empty string, which made re.split() return None
# items and the following "".join() raise TypeError.
# NOTE(review): full-width parentheses may also need listing -- confirm.
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|>|,|。|:|<|;|‘|’|【|】|\(|\)|·|!|\*|\/|…'
punctuation_re = re.compile(pattern)

hyjg = []
for emrtxt in emrtxts:
    # File name without directory and extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    f_end = []
    # errors="ignore" was added by the author for Chinese text.
    with open(emrtxt, 'r', errors="ignore") as f:
        for line in f:
            c = line  # untouched copy of the line, handed to EMRdef.rre below
            # Remove spaces, full stops and multiplication signs.
            line = line.replace(' ', '').replace('.', '').replace('×', '')
            # NOTE(review): tq_bnum is opaque here; its result is joined as an
            # iterable of strings, as in the original.
            a = EMRdef.tq_bnum(line)
            a_end = "".join(a)
            # Strip punctuation (equivalent to split-then-join) and spaces.
            a_end = punctuation_re.sub('', a_end)
            a_end = a_end.replace(' ', '')
            if a_end == '':
                # Placeholder value used by the original when nothing is left.
                a_end = 1
            else:
                acb = EMRdef.rre(c, a_end, a_end + ':', 1)
                # NOTE(review): the original is cut off here; the use of
                # `acb`, `f_end` and `hyjg` is not visible in this chunk.