def readmydata():
    """Load the feature and label tables from CSV as nested lists of floats.

    Reads ``prescriptionFeature4.csv`` and ``labelFeature.csv`` through
    ``excelprocess.read_csv`` and converts every cell of every row with
    ``float``.

    Returns:
        A tuple ``(X, Y)``; each is a list with one list of floats per CSV
        row (X from the feature file, Y from the label file).

    Raises:
        ValueError: if a cell cannot be parsed by ``float``.
    """
    print('readmydata')
    # Previously used inputs, kept for reference:
    # readcsvnamex = 'prescriptionFeature.csv'
    # readcsvnamey = 'labelFeature.csv'
    feature_file = 'prescriptionFeature4.csv'
    # NOTE(review): prescription2Feature writes 'labelFeature4.csv' next to
    # 'prescriptionFeature4.csv'; confirm 'labelFeature.csv' is the intended
    # companion of the feature file read here.
    label_file = 'labelFeature.csv'
    feature_rows = excelprocess.read_csv(feature_file)
    label_rows = excelprocess.read_csv(label_file)

    # NOTE(review): replace('', '') is a no-op as written; possibly an
    # invisible character (e.g. a BOM) was lost from the first argument.
    X = [[float(cell.replace('', '')) for cell in row] for row in feature_rows]
    Y = [[float(cell.replace('', '')) for cell in row] for row in label_rows]

    print('X.lenth:', len(X))
    print('Y.lenth:', len(Y))
    return X, Y
def composition_pracess():
    """Split the first cell of each row on spaces and write the fragments out.

    Reads ``composition_6.csv`` through ``excelprocess.read_csv``, splits the
    first cell of every row on single spaces, drops the empty fragments and
    writes the resulting rows to ``composition_6_1.csv`` through
    ``excelprocess.write_in_csv``.

    Raises:
        IndexError: if a row has no first cell.
    """
    print('composition_pracess')
    readcsvname = 'composition_6.csv'
    csvdata = excelprocess.read_csv(readcsvname)

    # Build a filtered list instead of popping from the list being iterated:
    # popping in place skips the element that follows each removal, which
    # left empty fragments behind whenever two or more spaces were adjacent.
    datas = [
        [fragment for fragment in item[0].split(' ') if fragment != '']
        for item in csvdata
    ]

    writecsvname = 'composition_6_1.csv'
    excelprocess.write_in_csv(writecsvname, datas)
def seperateNone():
    """Separate rows containing a 'None' cell and print per-class row counts.

    Reads ``csvtest_3.csv`` through ``excelprocess.read_csv``. A row with any
    cell equal to the string ``'None'`` goes to ``noneLise``; every other row
    goes to ``normalList``. The class of a row is the second
    ``'*'``-separated field of its first cell; for each run of consecutive
    rows sharing a class, ``count : <n>`` is printed once the run ends.

    Raises:
        IndexError: if a row is empty or its first cell contains no ``'*'``.
    """
    print('seperateNone')
    readcsvname = 'csvtest_3.csv'
    # readcsvname ='allData_none.csv'
    csvdata = excelprocess.read_csv(readcsvname)

    # NOTE(review): both lists are filled but never read in the visible
    # code; confirm whether they were meant to be written out or returned.
    normalList = []
    noneLise = []

    # Counts how many rows each class contributes. None is used as the
    # "no class seen yet" sentinel: the previous integer sentinel could
    # never equal a string class id, which printed a spurious 'count : 1'
    # before the first class.
    current_class = None
    count = 0
    for item in csvdata:
        if 'None' in item:
            noneLise.append(item)
        else:
            normalList.append(item)

        row_class = item[0].split('*')[1]
        if row_class == current_class:
            count += 1
            continue
        # The class changed: report the finished run, then start a new one.
        if current_class is not None:
            print('count :', count)
        current_class = row_class
        count = 1

    # Report the final run, which was previously never printed.
    if current_class is not None:
        print('count :', count)
def createAllLableList():
    """Tag every row of the six per-class composition files with an id.

    For class numbers 1 through 6, reads ``composition_<n>_class.csv``
    through ``excelprocess.read_csv`` and inserts ``'*<n>*<k>'`` at the
    front of each row, where ``k`` is the 1-based position of the row in
    its file. The tagged rows are collected in ``addlist``.
    """
    print('createAllLableLis')
    # NOTE(review): addlist is filled but never read in the visible code;
    # confirm whether it was meant to be written out or returned.
    addlist = []
    for class_no in range(1, 7):
        print('inum', class_no)
        source_file = 'composition_' + str(class_no) + '_class.csv'
        rows = excelprocess.read_csv(source_file)
        class_prefix = '*' + str(class_no)
        for row_no, row in enumerate(rows, 1):
            # The row is modified in place: its id becomes the first cell.
            row.insert(0, class_prefix + '*' + str(row_no))
            addlist.append(row)
def noneStandard():
    """Remove a fixed set of quantity phrases from every cell of a CSV file.

    Reads ``allData_none.csv`` through ``excelprocess.read_csv``, deletes
    each phrase listed below from every cell, and writes the cleaned rows to
    ``allData_none1.csv`` through ``excelprocess.write_in_csv``.
    """
    print('noneStandard')
    source_file = 'allData_none.csv'
    rows = excelprocess.read_csv(source_file)

    # Applied in this order to every cell.
    # NOTE(review): the leading '' makes the first replace a no-op as
    # written; possibly an invisible character was lost from it.
    removable = ('', '等分', '少许', '3倍于上药', '减半', '倍加')

    noneList = []
    for row in rows:
        cleaned_row = []
        for cell in row:
            for phrase in removable:
                cell = cell.replace(phrase, '')
            cleaned_row.append(cell)
        noneList.append(cleaned_row)

    target_file = 'allData_none1.csv'
    excelprocess.write_in_csv(target_file, noneList)
def _remove_bracketed(text, opener, closer):
    """Return *text* with every ``opener ... closer`` span removed."""
    while True:
        start = text.find(opener)
        if start < 0:
            return text
        end = text.find(closer, start + 1)
        if end < 0:
            # Unbalanced bracket: leave the remainder untouched.
            return text
        text = text[:start] + text[end + 1:]


def splitnumandstr():
    """Clean prescription cells: mark the '各' character and drop bracketed text.

    Reads ``csvtest.csv`` through ``excelprocess.read_csv``. In every cell,
    '各' is replaced by '#' so it can be handled separately, then
    parenthesised content is removed for both ASCII ``()`` and full-width
    (U+FF08 / U+FF09) brackets. Cleaned rows are collected in ``datalist``
    and progress is printed along the way.
    """
    print('splitnumandstr')
    readcsvname = 'csvtest.csv'
    # readcsvname='composition_1_1.csv'
    csvdata = excelprocess.read_csv(readcsvname)

    # NOTE(review): datalist is filled but never read in the visible code;
    # confirm whether it was meant to be written out or returned.
    datalist = []
    for i, item in enumerate(csvdata, 1):
        print('%%%%%%%%%%%%%num', i)
        data_after = []
        for itemdata in item:
            # NOTE(review): no-op as written; possibly an invisible
            # character was lost from the first argument.
            itemdata = itemdata.replace('', '')
            # Mark the character '各' so it can be handled separately.
            itemdata = itemdata.replace('各', '#')
            print('itemdata', itemdata)

            # Data cleaning: the content inside brackets is discarded.
            # Each bracket style is handled independently. Previously both
            # passes shared one bare try block, so a cell without ASCII
            # parentheses skipped the second pass entirely. The second pass
            # is labelled as the Chinese-bracket pass in its log message but
            # searched for ASCII parentheses as written; the full-width
            # code points are spelled out explicitly here.
            without_ascii = _remove_bracketed(itemdata, '(', ')')
            if without_ascii != itemdata:
                print('*****************清除英文括号内容******************',
                      without_ascii)
            without_fullwidth = _remove_bracketed(
                without_ascii, u'\uff08', u'\uff09')
            if without_fullwidth != without_ascii:
                print('*****************清除中文括号内容******************',
                      without_fullwidth)

            data_after.append(without_fullwidth)
        datalist.append(data_after)
def transLabelvalue():
    """Translate textual efficacy descriptions into label values.

    Reads ``allLabelData.csv`` through ``excelprocess.read_csv``. The first
    cell of each row is copied unchanged; every other cell is split on
    ``';'`` and each fragment is mapped to one of the module-level values
    ``label_1`` .. ``label_6`` according to the keyword it contains. A
    fragment matching no keyword prints ``error:`` with the row's first
    cell and contributes nothing. Encoded rows are collected in
    ``labelList``.
    """
    print('transLabelvalue')
    source_file = 'allLabelData.csv'
    rows = excelprocess.read_csv(source_file)

    # NOTE(review): labelList is filled but never read in the visible code;
    # confirm whether it was meant to be written out or returned.
    labelList = []
    for row in rows:
        encoded = []
        for position, cell in enumerate(row):
            if position == 0:
                # The first cell is kept verbatim.
                encoded.append(cell)
                continue

            # NOTE(review): both replacements are no-ops as written;
            # possibly an invisible character and a full-width semicolon
            # were lost from the first arguments.
            cell = cell.replace('', '')
            cell = cell.replace(';', ';')
            for fragment in cell.split(';'):
                if '祛风解表' in fragment:
                    encoded.append(label_1)
                elif '胜湿止痛' in fragment:
                    encoded.append(label_2)
                elif '止痒' in fragment or '透疹' in fragment:
                    encoded.append(label_3)
                elif '止痉' in fragment or '中风' in fragment:
                    encoded.append(label_4)
                elif '散肝舒脾' in fragment:
                    encoded.append(label_5)
                elif '目' in fragment or '明目' in fragment:
                    encoded.append(label_6)
                else:
                    # data_after.append(label_other)
                    print('error:', row[0])
        labelList.append(encoded)
def extractnumwithstr(readcsvname, writecsvname):
    """Split each prescription cell into a medicine name and its quantity.

    Reads *readcsvname* through ``excelprocess.read_csv``. For every cell of
    every row (one row per prescription) the first "number + unit" match is
    taken as the quantity and the text around it as the medicine name,
    giving ``[name, quantity]`` pairs with ``'None'`` when no quantity was
    found. Where a name carries the ``'#'`` marker, that entry's quantity is
    copied onto entries at or before it whose quantity is ``'None'``. The
    pairs of each prescription are then flattened into one list and
    collected in ``finalmedicallist``.

    Args:
        readcsvname: name of the CSV file to read.
        writecsvname: NOTE(review): never used in the visible code; confirm
            whether ``finalmedicallist`` was meant to be written to it.
    """
    print('extractnumwithstr')
    # readcsvname='csvtest_1.csv'
    # readcsvname='composition_5_2.csv'
    csvdata = excelprocess.read_csv(readcsvname)
    # i: 1-based index of the prescription being processed
    i = 1
    # Regex literals need the ur'' prefix to match Chinese correctly.
    # (?:...) is the non-capturing form of (...), used here for the
    # | alternation of the unit that follows the number.
    # NOTE(review): the '.' in pattern1 is unescaped, so it matches any
    # character, not only a decimal point.
    # NOTE(review): 'g||kg' contains an empty alternative; because
    # alternatives are tried in order, every unit listed after it can never
    # be matched. Possibly a character was lost from the literal -- confirm.
    pattern1 = re.compile(
        u'\d+.\d+(?:g||kg|ml|l|个|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握)'
    )
    pattern2 = re.compile(
        u'\d+(?:g||kg|ml|l|个|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握)')
    # NOTE(review): pattern3 is never used in the visible code.
    pattern3 = re.compile(u'kg')
    finalmedicallist = []
    for item in csvdata:
        print(
            '****************************************************************** 处方: ',
            i)
        medicallist = []
        point = []
        # Despite its name this is a list of [name, quantity] lists.
        medicaldict = []
        for itemdata in item:
            weight = ''
            yaowulist = []
            # NOTE(review): the next two replacements are no-ops as written;
            # possibly non-ASCII characters were lost from the first
            # arguments.
            itemdata = itemdata.replace('', '')
            itemdata = itemdata.replace('.', '.')
            # Treats a lower-case letter 'o' as a mistyped zero.
            itemdata = itemdata.replace('o', '0')
            # NOTE(review): str has no .decode() on Python 3; this line only
            # works when the cell is a byte string.
            itemdata = itemdata.decode('utf8')
            # print 'itemdata', itemdata
            # --- find the quantity and unit in the cell via regex: start ---
            weight1 = pattern1.findall(itemdata)
            weight2 = pattern2.findall(itemdata)
            # Keep the first match in weight, preferring pattern1.
            if (weight1):
                weight = weight1[0]
                yaowulist = pattern1.split(itemdata)
            elif (weight2):
                weight = weight2[0]
                yaowulist = pattern2.split(itemdata)
            # print '$$$$$$$$$$$$findal',weight1,weight2,weight
            # --- find the quantity and unit in the cell via regex: end ---
            # Put the medicine name, with its quantity already stripped,
            # into element [0] of medicallist.
            if (yaowulist):
                # try:
                # NOTE(review): raises ValueError when the split produced no
                # empty fragment; the guarding try/except is commented out.
                yaowulist.remove('')
                for zz in yaowulist:
                    medicallist.append(zz)
                # except:
                # pass
            else:
                medicallist.append(itemdata)
            # Put the matching quantity into element [1] of medicallist.
            if (weight):
                medicallist.append(weight)
            else:
                medicallist.append('None')
            #medicallist eg.[u'\u9632\u98ce', u'3l']
            medicaldict.append(medicallist)
            medicallist = []
        # print "medicallist", medicallist
        print('medicaldict', medicaldict)
        # j: index of the medicine within the current prescription
        j = 0
        # NOTE(review): the unpacking requires every entry to have exactly
        # two elements; an entry with more than one name fragment fails.
        for k, v in medicaldict:
            # point records which entries carry the '#' marker.
            # print k,v
            # '> 0' means a '#' in the very first position is not counted.
            if (k.find('#') > 0):
                point.append(j)
                medicaldict[j][0] = medicaldict[j][0].replace('#', '')
                print(medicaldict[j][0])
                print('检测到“各”字,该味药在处方中所处位置:', j, k)
            j += 1
        print('该方剂一共配药数量为:', j)
        print('该方剂中出现“各”字的位置有:', point)
        # f: index of the entry visited by the loop below
        f = 0
        print('##################### 开始处理所有药的数量单位 ##################')
        print('medicaldict', medicaldict)
        for m, n in medicaldict:
            if (point != []):
                for pointnum in point:
                    # print 'test', pointnum
                    # Marked entries located before this one are skipped.
                    if (f > pointnum):
                        continue
                    elif (n == 'None'):
                        # print 'test4',medicaldict[f][1],pointnum
                        # Copy the quantity of the first marked entry at or
                        # after this position.
                        medicaldict[f][1] = medicaldict[pointnum][1]
                        break
            f += 1
        print('@@@@@@~~最后处理结果(列表):', medicaldict)
        # Flatten medicaldict into name, quantity, name, quantity, ... for
        # storing in a CSV file.
        onepiece = []
        for x, y in medicaldict:
            print(x, y)
            onepiece.append(x)
            onepiece.append(y)
        # print 'onepiece',onepiece
        # excelprocess.createListCSV('csvtest_2.csv',medicaldict)
        finalmedicallist.append(onepiece)
        # Move on to the next prescription.
        i += 1
def _convert_to_grams(text):
    """Return *text* rewritten as grams when it carries a known unit."""
    # (unit, multiplier applied to the number in front of the unit), checked
    # in this order; the first unit found after position 0 decides.
    unit_factors = (
        ('两', 50),
        ('钱', 3.125),
        ('kg', 1000),
        ('斤', 500),
        ('分', 0.3),
    )
    for unit, factor in unit_factors:
        if text.find(unit) > 0:
            try:
                amount = float(text.split(unit)[0])
            except ValueError:
                # The text before the unit is not a plain number: keep the
                # cell unchanged.
                return text
            return str(amount * factor) + 'g'
    return text


def unitTransformation():
    """Convert quantity cells expressed in other units into grams.

    Reads ``csvtest_normal.csv`` through ``excelprocess.read_csv``. A cell
    that contains one of the units 两, 钱, kg, 斤 or 分 after its first
    character, with a parseable number in front of it, is replaced by
    ``'<value>g'``; every other cell is kept unchanged. Converted rows are
    collected in ``normalList``.

    Fixes relative to the previous version: the 分 branch split the cell on
    斤 and therefore never converted anything; cells are decoded only when
    they are byte strings, since ``str`` has no ``decode`` on Python 3; and
    only ``ValueError`` from number parsing is treated as "leave the cell
    alone".
    """
    print('unitTransformation()')
    readcsvname = 'csvtest_normal.csv'
    # readcsvname = 'allData_normal.csv'
    # readcsvname ='allData_none.csv'
    csvdata = excelprocess.read_csv(readcsvname)

    normalList = []
    for item in csvdata:
        midList = []
        for itemdata in item:
            if isinstance(itemdata, bytes):
                itemdata = itemdata.decode('utf8')
            midList.append(_convert_to_grams(itemdata))
        normalList.append(midList)

    # writecsvname = 'csvtest_normal1.csv'
    # NOTE(review): the output name is assigned but normalList is never
    # written in the visible code; confirm whether a call such as
    # excelprocess.write_in_csv(writecsvname, normalList) is missing.
    writecsvname = 'allData_normal1.csv'
def countallmedical():
    """Count how often each medicine name occurs and write the counts out.

    Reads ``allData1.csv`` through ``excelprocess.read_csv``. In every row
    the cells at odd indices (1, 3, 5, ...) are treated as medicine names:
    each is stripped, has the letter ``'l'`` removed, and is kept when it
    contains at least one character in the range U+4E00..U+9FA5. For a kept
    name, the following cell is passed to ``findnum`` and the pair is
    recorded. The distinct names and the recorded pairs are handed to
    ``maxValueandminValue``; the ``[name, occurrences]`` rows, sorted by
    descending occurrences, are written to ``allData1Count.csv`` through
    ``excelprocess.write_in_csv``.

    Changes relative to the previous version: ``print`` statements and the
    ``ur''`` literal, which are syntax errors on Python 3, now use the forms
    the rest of the file uses; cells are decoded only when they are byte
    strings; occurrences are counted in one pass instead of one
    ``list.count`` scan per distinct name.

    Raises:
        IndexError: if a kept name is the last cell of its row.
    """
    from collections import Counter

    print('countallmedical')
    # readcsvname = 'allmedical.csv'
    # readcsvname = 'allData_normal1.csv'
    # readcsvname = 'allData_none1.csv'
    readcsvname = 'allData1.csv'
    csvdata = excelprocess.read_csv(readcsvname)

    medicaList = []
    medical_value = []
    pattern = re.compile(u'[\u4e00-\u9fa5]')
    for item in csvdata:
        for mark, itemdata in enumerate(item):
            # Index 0 and every other even index are skipped; only the odd
            # indices are examined as names.
            if mark % 2 == 0:
                continue
            data_value = []
            itemdata = itemdata.strip()
            itemdata = itemdata.replace('l', '')
            if isinstance(itemdata, bytes):
                itemdata = itemdata.decode('utf8')
            if pattern.search(itemdata):
                # Remember the medicine name ...
                medicaList.append(itemdata)
                # ... together with the value of the cell that follows it.
                data_value.append(itemdata)
                data_value.append(findnum(item[mark + 1]))
            # An empty entry is recorded when the cell was not kept.
            # NOTE(review): nesting inferred from the collapsed source;
            # confirm maxValueandminValue expects these empty entries.
            medical_value.append(data_value)

    allcount = len(medicaList)
    print('所有处方中共有药物(medicaList): ', allcount)
    medicaListSet = list(set(medicaList))
    medicalcount = len(medicaListSet)
    print('medicaList去重后得到处方中不同药物数量: ', medicalcount)
    # print 'medicaListSet:',medicaListSet
    # print 'medical_value:',medical_value
    medicalminmax = maxValueandminValue(medicaListSet, medical_value)
    print('medicalminmax去重后得到处方中不同药物数量: ', len(medicalminmax))

    # Number of occurrences of every medicine, most frequent first.
    occurrences = Counter(medicaList)
    numarray = sorted(
        ([name, occurrences[name]] for name in medicaListSet),
        key=lambda x: x[1],
        reverse=True,
    )

    # writecsvname = 'allNormalMedicalCount.csv'
    # excelprocess.write_in_csv(writecsvname , numarray)
    # writecsvname = 'allNormalMedicalandValue.csv'
    # excelprocess.write_in_csv(writecsvname , medical_value)
    # writecsvname = 'allNormalMedicalMinMaxValue.csv'
    # excelprocess.write_in_csv(writecsvname , medicalminmax)
    writecsvname = 'allData1Count.csv'
    excelprocess.write_in_csv(writecsvname, numarray)
def prescription2Feature():
    """Build dose-share feature vectors and label vectors per prescription.

    Reads three files through ``excelprocess.read_csv``:

    * ``allNormalMedicalMinMaxValue.csv`` -- the first cell of each row is a
      medicine name; the row position is that medicine's feature index.
    * ``allData_normal1.csv`` -- one prescription per row: an id cell, then
      alternating name and dose cells.
    * ``allLabelDataValue.csv`` -- an id cell followed by label values.

    For every prescription a vector of ``dim`` zeros is created and, for
    each known medicine, its slot is set to ``findnum(dose)`` divided by the
    sum of ``findnum`` over all dose cells of that prescription. The label
    values of the row with the same id are collected alongside. Features go
    to ``prescriptionFeature4.csv`` and labels to ``labelFeature4.csv``
    through ``excelprocess.write_in_csv``.

    Changes relative to the previous version: ``print`` statements, which
    are syntax errors on Python 3, use the call form the rest of the file
    uses; the bare ``except`` is narrowed to ``Exception``; empty label rows
    no longer shift the id/value pairing; medicine lookup uses a dict (first
    occurrence wins, as with ``list.index``).

    Raises:
        ValueError: if a prescription id is not present in the label file.
    """
    print('prescription2Feature')
    # Distinct medicines found in allData_normal1.csv.
    readcsvname = 'allNormalMedicalMinMaxValue.csv'
    # Distinct medicines found in allData1.csv:
    # readcsvname = 'allData1Count.csv'
    medicaldata = excelprocess.read_csv(readcsvname)
    readcsvname = 'allData_normal1.csv'
    # readcsvname = 'allData1.csv'
    prescriptiondata = excelprocess.read_csv(readcsvname)
    readcsvname = 'allLabelDataValue.csv'
    labeldata = excelprocess.read_csv(readcsvname)

    # NOTE(review): replace('', '') is a no-op as written (here and below);
    # possibly an invisible character was lost from the first argument.
    medicine_index = {}
    for position, item in enumerate(medicaldata):
        medicine_index.setdefault(item[0].replace('', ''), position)

    labelmark = []
    labelvalue = []
    for item in labeldata:
        cells = [cell.replace('', '') for cell in item]
        if not cells:
            # Previously an empty row appended to labelvalue only, which
            # misaligned every later id with its values.
            continue
        labelmark.append(cells[0])
        labelvalue.append(cells[1:])

    # Feature vector length: allData_normal1.csv 1298, allData1.csv 1379.
    dim = 1298
    presFeatrue = []
    presLabelFeatrue = []
    j = 1
    wrongnum = 1
    for item in prescriptiondata:
        # Total dose of the prescription: dose cells sit at the even
        # indices 2, 4, 6, ...
        prevalue = sum(findnum(dose_cell) for dose_cell in item[2::2])

        featrue = [0] * dim
        if item:
            # The first cell is the id used to look up the label values.
            loc = labelmark.index(item[0].replace('', ''))
            presLabelFeatrue.append(labelvalue[loc])

        # Medicine names sit at the odd indices 1, 3, 5, ...
        for mark in range(1, len(item), 2):
            itemdata = item[mark]
            try:
                location = medicine_index[itemdata]
                itemvalue = findnum(item[mark + 1]) / prevalue
                # featrue[location] = 1
                # featrue[location+dim-1] = itemvalue
                featrue[location] = itemvalue
            except Exception:
                # Best effort by design: an unknown medicine, a missing
                # dose cell, a zero total or an index beyond dim is
                # reported and skipped.
                print('wrong', wrongnum, item[0], mark, itemdata)
                wrongnum += 1
        j += 1
        presFeatrue.append(featrue)

    print(len(presFeatrue), j, len(presLabelFeatrue))
    writecsvname = 'prescriptionFeature4.csv'
    excelprocess.write_in_csv(writecsvname, presFeatrue)
    writecsvname = 'labelFeature4.csv'
    excelprocess.write_in_csv(writecsvname, presLabelFeatrue)
def prescriptionFeature():
    """Build presence (0/1) feature vectors and label vectors per prescription.

    Reads three files through ``excelprocess.read_csv``:

    * ``allNormalMedicalMinMaxValue.csv`` -- the first cell of each row is a
      medicine name; the row position is that medicine's feature index.
    * ``allData1.csv`` -- one prescription per row: an id cell, then
      alternating name and dose cells.
    * ``allLabelDataValue.csv`` -- an id cell followed by label values.

    For every prescription a vector of 1379 zeros is created and the slot
    of each known medicine is set to 1. The label values of the row with
    the same id are collected alongside. Only the sizes of the results are
    printed in the visible code.

    Changes relative to the previous version: the ``print`` statement, a
    syntax error on Python 3, uses the call form the rest of the file uses;
    the bare ``except`` is narrowed to ``Exception``; empty label rows no
    longer shift the id/value pairing; medicine lookup uses a dict (first
    occurrence wins, as with ``list.index``).

    Raises:
        ValueError: if a prescription id is not present in the label file.
    """
    print('prescriptionFeature')
    readcsvname = 'allNormalMedicalMinMaxValue.csv'
    medicaldata = excelprocess.read_csv(readcsvname)
    # readcsvname = 'allData_normal1.csv'
    readcsvname = 'allData1.csv'
    prescriptiondata = excelprocess.read_csv(readcsvname)
    readcsvname = 'allLabelDataValue.csv'
    labeldata = excelprocess.read_csv(readcsvname)

    # NOTE(review): replace('', '') is a no-op as written (here and below);
    # possibly an invisible character was lost from the first argument.
    medicine_index = {}
    mediaclvalueList = []
    for position, item in enumerate(medicaldata):
        medicine_index.setdefault(item[0].replace('', ''), position)
        mediaclvalueList.append(item)

    labelmark = []
    labelvalue = []
    for item in labeldata:
        cells = [cell.replace('', '') for cell in item]
        if not cells:
            # Previously an empty row appended to labelvalue only, which
            # misaligned every later id with its values.
            continue
        labelmark.append(cells[0])
        labelvalue.append(cells[1:])

    # Feature vector length: allData_normal1.csv 1298, allData1.csv 1379.
    # NOTE(review): the medicine list is read from the "Normal" statistics
    # file while the prescriptions come from allData1.csv; confirm the two
    # are meant to be combined.
    dim = 1379
    presFeatrue = []
    presLabelFeatrue = []
    j = 1
    wrongnum = 1
    for item in prescriptiondata:
        featrue = [0] * dim
        if item:
            # The first cell is the id used to look up the label values.
            loc = labelmark.index(item[0].replace('', ''))
            presLabelFeatrue.append(labelvalue[loc])

        # Medicine names sit at the odd indices 1, 3, 5, ...
        for mark in range(1, len(item), 2):
            itemdata = item[mark]
            try:
                location = medicine_index[itemdata]
                itemvalue = findnum(item[mark + 1])
                stats = mediaclvalueList[location]
                # Scaled dose. It is not stored, but it is still evaluated
                # so that a row whose dose or statistics cannot be parsed
                # is skipped exactly as before.
                finalValue = (itemvalue - float(stats[2]) + 1) / (
                    float(stats[3]) + 1)
                # Variant that encodes both composition and dose:
                # featrue[location]=finalValue
                # Variant in use: only composition matters, not the dose.
                featrue[location] = 1
            except Exception:
                # Best effort by design: problematic entries are counted
                # and skipped.
                # print 'wrong',wrongnum,item[0],mark,itemdata
                wrongnum += 1
        j += 1
        presFeatrue.append(featrue)

    print(len(presFeatrue), j, len(presLabelFeatrue))