import time
import math
import os
import sys
import os, os.path, shutil
import codecs
import EMRdef
import re
import pandas as pd
# For every record file, collect the ICD-10 terms that occur in its lines.

# Paths of the .txt records to scan (raw string: the path contains backslashes).
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd2')

# Load the ICD-10 term list; undecodable bytes are dropped via errors='ignore'.
with open(r'C:\Users\Administrator\Desktop\ICD-10.txt', errors='ignore') as dis:
    ds = dis.readlines()

# One term per entry, with the newline characters removed.
ds_cs = [re.sub('\n', '', line) for line in ds]
ryzd = []

for emrtxt in emrtxts:
    out = []  # ICD-10 terms matched in this record
    # Read the whole record up front so the handle is closed immediately.
    with open(emrtxt, 'r', errors="ignore") as f:
        lines = f.readlines()
    # File name without directory or extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    for line in lines:
        line = re.sub('\n', '', line)
        # The original read `while ds_c in ds_cs`, which references an
        # undefined name; iterating over the term list is what the body needs.
        for ds_c in ds_cs:
            # Match when the line uses exactly the same set of characters as
            # the term, or when the term occurs verbatim inside the line.
            if set(line) == set(ds_c) or line.find(ds_c) > -1:
                out.append(ds_c)
#-*- coding: UTF-8 -*-

#本文件用于数据清洗
import time
import math
import os
import sys
import os, os.path, shutil
import codecs
import EMRdef
import re
# Keep, per record, the lines that mention a term from the respiratory-disease
# dictionary.

# Paths of the .txt records to scan (raw string: the path contains backslashes).
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRryzd')

# Respiratory-disease dictionary, one term per line.
with open(r'D:\python\EMR\hxjbml.txt', errors="ignore") as hxjb:
    hxjbdic = hxjb.readlines()

# Strip newlines once instead of once per (line, term) pair.
hxjb_terms = [re.sub('\n', '', term) for term in hxjbdic]

ryzd = []
for emrtxt in emrtxts:
    # Read the record up front so the handle is closed immediately.
    with open(emrtxt, 'r', errors="ignore") as f:
        lines = f.readlines()
    # File name without directory or extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    line_out = []
    for line in lines:
        line = re.sub('\n', '', line)
        # Drop whatever text precedes each '肺炎' so only '肺炎' itself remains.
        line = re.sub(r'(.+?)肺炎', '肺炎', line)
        # Append the line once for every dictionary term it contains.
        for hxjbc in hxjb_terms:
            if line.find(hxjbc) > -1:
                line_out.append(line)
        # NOTE(review): presumably de-duplicates line_out; the result is only
        # used by the disabled output code below - confirm against EMRdef.
        line_output = EMRdef.delre(line_out)
        # NOTE(review): this runs once per input line and appends the same
        # list object each time, as in the original - confirm it is intended.
        ryzd.append(line_out)
        #line = '\n'.join(line_output)
        #EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2','.txt' ,emrpath,line)
import time
import math
import os
import sys
import os, os.path, shutil
import codecs
import EMRdef
import re
import pandas as pd
# Pair each record in the first directory with the same-named file in the
# second directory and collect the combined content in `ryzd`.

# Raw strings: the paths contain backslashes.
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzhzd5')
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRsex')
ryzd = []

for emrtxt in emrtxts:
    # Read the record up front so the handle is closed immediately.
    with open(emrtxt, 'r', errors="ignore") as f:
        lines = f.read()
    # File name without directory or extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    # Remove all spaces, then break the text into its lines.
    lines = re.sub(' ', '', lines)
    lines = re.split('\n', lines)
    for emrtxt2 in emrtxt2s:
        emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]
        # Only open the second file when the base names match; the original
        # opened and read every candidate file regardless.
        if emrpath != emrpath2:
            continue
        with open(emrtxt2, 'r', errors="ignore") as f2:
            lines2 = f2.read()
        # The matching file's full text becomes one extra trailing entry.
        lines.append(lines2)
        ryzd.append(lines)
# NOTE(review): a bare `print` has no effect in Python 3; presumably a
# truncated print(...) call - confirm what was meant to be printed.
print
Exemple #4
0
#-*- coding: UTF-8 -*-

#本文件用于提取目标目录中的所有txt,并提取关键词所在行到指定目录,并提取关键词新建文件
import time
import math
import os
import sys
import os, os.path, shutil
import codecs
import EMRdef
import re

# Extract the admission-diagnosis line of every record and write it, one
# diagnosis per line, to a file named after the record ID.

# Raw string is required here: in a normal literal `\U` starts a unicode
# escape and `\x` a hex escape, so the original literal did not even compile.
emrtxts = EMRdef.txttq(r'C:\Users\Administrator\Desktop\xiaochuan')
for emrtxt in emrtxts:
    # Read the record up front so the handle is closed immediately.
    with open(emrtxt, 'r', errors="ignore") as f:
        lines = f.readlines()
    emrtxt = os.path.basename(emrtxt)
    # Record ID: everything before the first underscore in the file name.
    emrtxt_str = re.findall(r'(^.+?)\_', emrtxt)
    emrtxt = "".join(emrtxt_str)
    # Punctuation to strip; the dot is escaped so it no longer matches any
    # character. Not used in the visible part of this script.
    pattern = r',|\.|,|。|;|;'
    for line in lines:
        line = re.sub(' ', '', line)  # remove spaces
        # Only lines that carry the '入院诊断:' label near their start.
        if line.find(u'入院诊断:', 0, 6) > -1:
            line = re.sub(r'h|H', '小时', line)  # spell out the hour abbreviation
            line = re.sub(r'入院诊断:', '', line)  # drop the label itself
            # Break on punctuation; the trailing '?' is escaped because a bare
            # '?' after '|' raises re.error ("nothing to repeat").
            line_deldl = re.split(r';|。|,|;|\?', line)
            line_deld = '\n'.join(line_deldl)
            # Replace list numbering such as '1、' with a line break.
            line_out = re.sub(r'\d+、', '\n', line_deld)
            # The original split line_out on '\n' and re-joined it with '\n',
            # which yields the same string.
            line = line_out
            # NOTE(review): the original call was cut off after the third
            # argument; `line` is assumed as the content argument, matching
            # the other text_create calls in this file - confirm.
            EMRdef.text_create(r'F:\zljh', '.txt', emrtxt, line)
Exemple #5
0
#根据词典提取
#-*- coding: UTF-8 -*-
#本文件用于根据指标参数提取所有指标
import os
import EMRdef
import re
#根据句号分词

# Split every record on punctuation and write the pieces, one per line, to a
# same-named file in the output directory.

# Raw strings: the paths contain backslashes.
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc')
pattern2 = r'、|;|:|、|:|,'  # punctuation used as token separators
for emrtxt2 in emrtxt2s:
    # Read and split inside the context manager so the handle is closed.
    with open(emrtxt2, 'r', errors="ignore") as f2:
        f2_end = re.split(pattern2, f2.read())
    f2_out = "\n".join(f2_end)  # one token per line
    # File name without directory or extension.
    emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]
    EMRdef.text_create(r'D:\DeepLearning ER\EHRzlgc3', '.txt', emrpath2,
                       f2_out)
#EMRdef.text_save(emrtxt,f_end)
'''----------------------------------------------------------------------------------------------------------------------------------------------'''

# Extract passages according to the lab-test indicator list.

# Indicator dictionary, one entry per line; the handle is closed after reading.
with open(r'D:\python\EMR\hyzb.txt', 'r', errors="ignore") as b:
    brl = b.readlines()
# Raw string: the path contains backslashes.
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc3')
# Punctuation to strip. The parentheses are escaped: unescaped, `(|)` is a
# capture group that matches the empty string instead of literal brackets.
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|,|。|:|;|‘|’|【|】|\(|\)|·|!|、|…'
for emrtxt in emrtxts:
    # NOTE(review): the rest of this loop body is not visible here; `f` is
    # left open for that code, which must close it once it has been read.
    f = open(emrtxt, 'r', errors="ignore")
    # File name without directory or extension.
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    test_out = []
Exemple #6
0
import time
import math
import os
import sys
import os, os.path, shutil
import codecs
import EMRdef
import re
# For every pair of same-named files in the two directories, read both and
# concatenate their lines (second directory first).

# Raw strings: the paths contain backslashes.
emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRryzd')
emrtxt2s = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc4')
out = []
for emrtxt in emrtxts:
    # File name without directory or extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    for emrtxt2 in emrtxt2s:
        emrpath2 = os.path.splitext(os.path.basename(emrtxt2))[0]
        if emrpath != emrpath2:
            continue
        # Both handles are closed as soon as their lines have been read.
        with open(emrtxt, 'r', errors="ignore") as f, \
                open(emrtxt2, 'r', errors="ignore") as f2:
            a = f.readlines()
            b = f2.readlines()
        # Lines of the second-directory file come first.
        c = b + a
Exemple #7
0
#-*- coding: UTF-8 -*-
#根据给药方式和剂量剂型分词
import os
import EMRdef
import string
import re

# Tokenise each record line by administration route / dose / dosage form.

emrtxts = EMRdef.txttq(r'D:\DeepLearning ER\EHRzlgc4')
# Previous variant of the punctuation pattern, kept for reference:
#pattern = r',|;|\*|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|,|。|:|;|‘|’|\+|\-|【|】| \)|\( |(|)|·|!|、|…'
# Punctuation to strip. The parentheses are escaped: unescaped, `(|)` is a
# capture group, which makes re.split return None entries that "".join
# cannot handle, and it also shadows the alternatives listed after it.
pattern = r',|;|\'|`|\[|\]|<|>|\?|"|\{|\}|!|@|#|\$|%|\^|&|=|>|,|。|:|<|;|‘|’|【|】|\(|\)|·|!|\*|\/|…'
hyjg = []
for emrtxt in emrtxts:
    # Read the record up front so the handle is closed immediately.
    with open(emrtxt, 'r', errors="ignore") as f:
        lines = f.readlines()
    # File name without directory or extension.
    emrpath = os.path.splitext(os.path.basename(emrtxt))[0]
    f_end = []
    for line in lines:
        c = line  # untouched copy of the line, passed to EMRdef.rre below
        line = re.sub(' ', '', line)  # remove spaces
        line = re.sub(r'\.', '', line)  # remove dots
        line = re.sub('×', '', line)  # remove multiplication signs
        # NOTE(review): tq_bnum presumably returns the text pieces that
        # precede numbers - confirm against EMRdef.
        a = EMRdef.tq_bnum(line)
        a_end = "".join(a)
        # Strip punctuation by splitting on it and gluing the pieces together.
        a_end = "".join(re.split(pattern, a_end))
        a_end = re.sub(' ', '', a_end)  # remove spaces
        if a_end == '':
            # NOTE(review): the int 1 replaces an empty result, as in the
            # original - confirm how the later code uses this value.
            a_end = 1
        else:
            acb = EMRdef.rre(c, a_end, a_end + ':', 1)