def func2feature(csvname1, csvname2): print 'func2feature' funcdata = web_data_process.read_csv(csvname1) countdata = web_data_process.read_csv(csvname2) countlist = [] for item in countdata: countlist.append(item[0]) featurelist = [] #这里可以修改需要判别的功效,放一个时会检索不到(‘.-’) locmark = countlist.index('清热解毒'.decode('utf-8')) print 'locmark', locmark for item in funcdata: check = 0 for itemdata in item: itemdata = itemdata.replace('', '') itemdata = itemdata.replace('疏风', '祛风') itemdata = itemdata.replace('散风', '祛风') itemdata = itemdata.replace('驱风', '祛风') try: loc = countlist.index(itemdata.decode('utf-8')) if loc == locmark: check = 1 except: pass if check == 1: featurelist.append(1) else: featurelist.append(0) print 'len(featurelist):', len(featurelist) print '有多少方剂属于该功效(祛风清热):', featurelist.count(1) return featurelist
def loadData():
    """Load the training matrix and label vector from fixed CSV files.

    Features come from 'presFeature_onehot.csv'; every row is converted to
    floats and prefixed with a constant 1.0 column. Labels come from the
    first column of 'funcFeature.csv'.

    Returns (mat(train_x), mat(train_y).transpose()).
    Raises ValueError when a cell cannot be parsed as a float.
    """
    # Alternative feature file: 'presFeature_realValue.csv'.
    presCsvname = 'presFeature_onehot.csv'
    funcCsvname = 'funcFeature.csv'
    data = web_data_process.read_csv(presCsvname)
    labeldata = web_data_process.read_csv(funcCsvname)

    # NOTE(review): the original literal renders as an empty string (a
    # no-op); it is written out as the UTF-8 byte-order mark, which is what
    # a first-cell-of-file cleanup is presumably for.
    bom = '\xef\xbb\xbf'

    train_x = []
    for row_no, row in enumerate(data):
        if row_no == 0:
            row[0] = row[0].replace(bom, '')
        train_x.append([1.0] + [float(cell) for cell in row])

    train_y = []
    for row_no, row in enumerate(labeldata):
        if row_no == 0:
            row[0] = row[0].replace(bom, '')
        train_y.append(float(row[0]))

    # `mat` is not defined in this block; presumably numpy's, imported
    # elsewhere in the file.
    return mat(train_x), mat(train_y).transpose()
def loadDataSet(funcCsvname, presCsvname):
    """Read the prescription and function CSV files into two row lists.

    Note the argument order: the function file name comes first, but the
    prescription rows are returned first.

    Returns (dataMat, labelMat): the rows of presCsvname and the rows of
    funcCsvname, each in file order.
    """
    pres_rows = web_data_process.read_csv(presCsvname)
    func_rows = web_data_process.read_csv(funcCsvname)
    dataMat = [row for row in pres_rows]
    labelMat = [row for row in func_rows]
    return dataMat, labelMat
def function_count(csvname): print 'function_count 计算有多少种功效,每种功效出现的次数和比例*' csv_data = web_data_process.read_csv(csvname) flist = [] for item in csv_data: # j=0 for itemdata in item: # if j!=0 and itemdata!='': print 'itemdata', itemdata itemdata = itemdata.replace('疏风', '祛风') itemdata = itemdata.replace('散风', '祛风') itemdata = itemdata.replace('驱风', '祛风') flist.append(itemdata) # j+=1 print '所有方剂中的功效有(没有去重):', len(flist) #去重 计算有多少不同的功效 flistset = list(set(flist)) # 统计每种药物出现的次数 numarray = [] n = [] for item in flistset: n.append(item) n.append(flist.count(item)) numarray.append(n) n = [] # 以次数排序 numarray = sorted(numarray, key=lambda x: x[1], reverse=True) print '所有方剂中的功效有(去重):', len(numarray) return numarray
def deletebianhao(readcsvname1, readcsvname2): print 'deletebianhao' csvdata1 = web_data_process.read_csv(readcsvname1) csvdata2 = web_data_process.read_csv(readcsvname2) formulaList = [] functionList = [] for item in csvdata1: item.pop(0) formulaList.append(item) for item in csvdata2: item.pop(0) functionList.append(item) writecsvname1 = 'webFormula_final_2.csv' writecsvname2 = 'webFunction_3.csv' web_data_process.write_in_csv(writecsvname1, formulaList) web_data_process.write_in_csv(writecsvname2, functionList)
def deleteblank(readcsvname, writecsvname): print 'deleteblank' csvdata = web_data_process.read_csv(readcsvname) finalList = [] for item in csvdata: if len(item) != 1: finalList.append(item) print 'lenth', len(finalList) web_data_process.write_in_csv(writecsvname, finalList)
def checkContent(readcsvname1, readcsvname2): print 'checkContent' csvdata1 = web_data_process.read_csv(readcsvname1) csvdata2 = web_data_process.read_csv(readcsvname2) formulaList = [] functionList = [] for item in csvdata1: formulaList.append(item) for item in csvdata2: functionList.append(item) print '方剂个数:%d 功效个数:%d ' % (len(formulaList), len(functionList)) num = 0 wrong = 1 for i in range(0, len(formulaList)): if formulaList[i][0] != functionList[i][0]: print '编号不对应!', num, formulaList[i][0], functionList[i][0] wrong = 0 num += 1 if wrong: print '检测完毕!编号对应!'
def webSix(readcsvname, writecsvname): print 'webFive' csvdata = web_data_process.read_csv(readcsvname) data = [] num = 0 for content in csvdata: j = 0 for item in content: item = item.decode('utf-8') str = match1(item) content[j] = str # print 'item - x',num,j,item,x j += 1 data.append(content) num += 1 web_data_process.write_in_csv(writecsvname, data)
def composition_process(readcsvname):
    """Split the first cell of every CSV row into its space-separated parts.

    Returns one list of non-empty tokens per row, in file order.
    Raises IndexError when a row has no cells.
    """
    print ('composition_process')
    csvdata = web_data_process.read_csv(readcsvname)
    datas = []
    for row in csvdata:
        # Build a filtered list instead of popping from the list being
        # iterated: popping skipped the element after each removal, so runs
        # of consecutive spaces left empty tokens behind.
        datas.append([token for token in row[0].split(' ') if token != ''])
    return datas
def webOne(readcsvname): print 'webOne' csvdata = web_data_process.read_csv(readcsvname) formulalist = [] functionlist = [] zhuzhilist = [] num = 0 for item in csvdata: # print item for itemdata in item: print num, itemdata.strip() try: formulalist.append(str(num) + '#' + item[1].strip()) functionlist.append(str(num) + '#' + item[2].strip()) zhuzhilist.append(str(num) + '#' + item[3].strip()) except: pass num += 1 print '........得到配伍数据共%d条 \n' % len(formulalist)
def webFive(readcsvname, writecsvname): print 'webFive' csvdata = web_data_process.read_csv(readcsvname) data = [] num = 0 for content in csvdata: j = 0 for item in content: item = item.decode('utf-8') pos = item.find('去') if pos > -1: item = item[0:pos] x = wordmatch(item) # x = item.replace('炙', '') # x = x.replace('不', '') # x = x.replace('蒸', '') # x = x.replace('炒', '') # x = x.replace('熬', '') # x = x.replace('锉', '') # x = x.replace('炒香', '') # x = x.replace('炮', '') # x = x.replace('切', '') # x = x.replace('轧细', '') # x = x.replace('捣碎', '') # x = x.replace('裹煨', '') # x = x.replace('研粉', '') # x = x.replace('调下', '') # x = x.replace('另研', '') # x = x.replace('碎绵裹', '') # 通过正则表达去除多余的单位,只保留数值+g的单位。 str = match(x) content[j] = str # print 'item - x',num,j,item,x j += 1 data.append(content) num += 1 web_data_process.write_in_csv(writecsvname, data)
def webTwo(readcsvname, writecsvname): print 'webTwo' csvdata = web_data_process.read_csv(readcsvname) formulalist = [] num = 0 for item in csvdata: # print num, item for itemdata in item: if itemdata != '': itemdata = itemdata.decode('utf8') itemdata = itemdata.replace('\r', '') itemdata = itemdata.replace('\n', '') itemdata = itemdata.replace('\t', '') itemdata = itemdata.replace('"', '') itemdata = itemdata.replace('\xc2\xa0', '') itemdata = itemdata.replace('\xe3\x80\x80\xe3\x80\x80', ' ') itemdata = itemdata.replace('\xe3\x80\x80', ' ') itemdata = itemdata.replace('】 ', '】') #webThree itemdata = itemdata.replace('', '') itemdata = itemdata.replace('【组成】', ' ') itemdata = itemdata.replace(',', ' ') itemdata = itemdata.replace('。', ' ') itemdata = itemdata.replace('、', '') itemdata = itemdata.replace('(原书未注用量)', '') itemdata = itemdata.replace('(原书未著用量)', '') itemdata = itemdata.replace('酒洗', '') itemdata = itemdata.replace('洗', '') itemdata = itemdata.replace('汤洗七次', '') # webThree itemdata = itemdata.strip() print 'zz', num, itemdata formulalist.append(itemdata.decode('utf-8')) # formulalist.append(itemdata) # print num,item num += 1 web_data_process.write_list_in_csv(writecsvname, formulalist)
# coding=utf-8 import re import sys import web_dataFeatureValue import web_data_process reload(sys) sys.setdefaultencoding('utf-8') if __name__ == '__main__': print '从训练好的logistic模型参数中找出占主导作用的药物...' readcsvname = 'weights_0.2.csv' weightdata = web_data_process.read_csv(readcsvname) csvname = 'allMedicalCount.csv' medicaldata = web_data_process.read_csv(csvname) medicallist = [] importantMedical = [] weightlist = [] for item in medicaldata: medicallist.append(item[0]) num = 0 for item in weightdata: zz = [] item[0] = item[0].replace('[[', '') item[0] = item[0].replace(']]', '') item[0] = item[0].replace('', '') # print 'zz',item[0] if num == 0: pass else: if float(item[0]) > 0.001: zz.append(medicallist[num - 1])
def prescriptionFeature(): print 'prescriptionFeature' readcsvname = 'allNormalMedicalMinMaxValue.csv' medicaldata = web_data_process.read_csv(readcsvname) # readcsvname = 'allData_normal1.csv' readcsvname = 'allData1.csv' prescriptiondata = web_data_process.read_csv(readcsvname) readcsvname = 'allLabelDataValue.csv' labeldata = web_data_process.read_csv(readcsvname) medicaList = [] mediaclvalueList = [] for item in medicaldata: medicaList.append(item[0].replace('', '')) mediaclvalueList.append(item) labelmark = [] labelvalue = [] for item in labeldata: mark = 0 nn = [] for itemdata in item: itemdata = itemdata.replace('', '') if (mark == 0): labelmark.append(itemdata) else: nn.append(itemdata) mark += 1 labelvalue.append(nn) # print (labelvalue) # print 'mediacl', medicaList # for item in medicaList: # print (item) presFeatrue = [] presLabelFeatrue = [] j = 1 wrongnum = 1 #allData_normal1.csv一共有药物1298种 # print (featrue) for item in prescriptiondata: # print 'item:',item mark = 0 #allData_normal1.csv # featrue = [0] * 1298 #allData1.csv featrue = [0] * 1379 for itemdata in item: if (mark == 0): # print 'itemdata',itemdata itemdata = itemdata.replace('', '') # print 'itemdata', itemdata loc = labelmark.index(itemdata) # print 'loc',loc # print (labelvalue[loc]) presLabelFeatrue.append(labelvalue[loc]) else: if ((mark % 2) != 0): try: location = medicaList.index(itemdata) # print 'location',location itemvalue = findnum(item[mark + 1]) finalValue = ( itemvalue - float(mediaclvalueList[location][2]) + 1) / (float(mediaclvalueList[location][3]) + 1) #特征既有配伍成分,有考虑了单位数量 # featrue[location]=finalValue #只关心配伍成分,不关心单位数量 featrue[location] = 1 except: # print 'wrong',wrongnum,item[0],mark,itemdata wrongnum += 1 else: mark += 1 continue mark += 1 j += 1 # print (featrue) presFeatrue.append(featrue) print len(presFeatrue), j, len(presLabelFeatrue)
def prescription2Feature(): print 'prescription2Feature' #allData_normal1.csv里的不同药味数统计 readcsvname = 'allNormalMedicalMinMaxValue.csv' # allData1.csv里的不同药味数统计 # readcsvname = 'allData1Count.csv' medicaldata = web_data_process.read_csv(readcsvname) readcsvname = 'allData_normal1.csv' # readcsvname = 'allData1.csv' prescriptiondata = web_data_process.read_csv(readcsvname) readcsvname = 'allLabelDataValue.csv' labeldata = web_data_process.read_csv(readcsvname) medicaList = [] mediaclvalueList = [] for item in medicaldata: medicaList.append(item[0].replace('', '')) # mediaclvalueList.append(item) labelmark = [] labelvalue = [] for item in labeldata: mark = 0 nn = [] for itemdata in item: itemdata = itemdata.replace('', '') if (mark == 0): labelmark.append(itemdata) else: nn.append(itemdata) mark += 1 labelvalue.append(nn) presFeatrue = [] presLabelFeatrue = [] j = 1 wrongnum = 1 # allData_normal1.csv一共有药物1298种 for item in prescriptiondata: # print 'item:',item mark_v = 0 prevalue = 0 for itemdata in item: #计算每个处方里药物剂量总值 if (mark_v == 0): mark_v = +1 continue else: if ((mark_v % 2) != 0): mark_v += 1 else: value = findnum(itemdata) prevalue = prevalue + value mark_v += 1 mark = 0 # allData1.csv # dim=1379 # allData_normal1.csv dim = 1298 featrue = [0] * dim for itemdata in item: if (mark == 0): #处理对应的标签 itemdata = itemdata.replace('', '') # print 'itemdata', itemdata loc = labelmark.index(itemdata) # print 'loc',loc # print (labelvalue[loc]) presLabelFeatrue.append(labelvalue[loc]) else: if ((mark % 2) != 0): try: location = medicaList.index(itemdata) itemvalue = findnum(item[mark + 1]) / prevalue # featrue[location] = 1 # featrue[location+dim-1] = itemvalue featrue[location] = itemvalue except: print 'wrong', wrongnum, item[0], mark, itemdata wrongnum += 1 else: mark += 1 continue mark += 1 j += 1 # print (featrue) presFeatrue.append(featrue) print len(presFeatrue), j, len(presLabelFeatrue)
def countallmedical(readcsvname): print 'countallmedical' # readcsvname = 'allmedical.csv' # readcsvname = 'allData_normal1.csv' # csvdata = data_process.read_csv(readcsvname) # readcsvname = 'allData_none1.csv' # readcsvname = 'allData1.csv' csvdata = web_data_process.read_csv(readcsvname) medicaList = [] medical_value = [] pattern = re.compile(ur'[\u4e00-\u9fa5]') j = 1 for item in csvdata: # print 'item:',item mark = 0 for itemdata in item: data_value = [] # if (mark == 0): # mark += 1 # continue # else: if ((mark % 2) == 0): itemdata = itemdata.strip() itemdata = itemdata.replace('', '') itemdata = itemdata.replace('l', '') itemdata = itemdata.decode('utf8') # print 'itemdata zzz', itemdata if (pattern.search(itemdata)): # print 'j', j, mark # 存取出的药物 medicaList.append(itemdata) # 存药物对应的数值 data_value.append(itemdata) data_value.append(findnum(item[mark + 1])) mark += 1 medical_value.append(data_value) else: mark += 1 continue j += 1 allcount = len(medicaList) print '所有处方中共有药物(medicaList): ', allcount medicaListSet = list(set(medicaList)) medicalcount = len(medicaListSet) print 'medicaList去重后得到处方中不同药物数量: ', medicalcount # print 'medicaListSet:',medicaListSet # print 'medical_value:',medical_value # medicalminmax=maxValueandminValue(medicaListSet, medical_value) # print 'medicalminmax去重后得到处方中不同药物数量: ', len(medicalminmax) #统计每种药物出现的次数 numarray = [] n = [] for item in medicaListSet: n.append(item) n.append(medicaList.count(item)) numarray.append(n) n = [] #以次数排序 numarray = sorted(numarray, key=lambda x: x[1], reverse=True) # writecsvname = 'allNormalMedicalandValue.csv' # data_process.write_in_csv(writecsvname , medical_value) # writecsvname = 'allNormalMedicalMinMaxValue.csv' # data_process.write_in_csv(writecsvname , medicalminmax) # writecsvname = 'allMedicalCount.csv' # data_process.write_in_csv(writecsvname , numarray) return numarray
def UnifiedDose(readcsvname, writecsvname): print 'UnifiedDose' csvdata = web_data_process.read_csv(readcsvname) normalList = [] for item in csvdata: # print 'item',item midList = [] for itemdata in item: # print 'itemdata', itemdata itemdata = itemdata.decode('utf8') if (itemdata.find('两') > 0): try: zz = itemdata.split('两') # print 'split itemdata', itemdata unit = float(zz[0]) * 50 # print 'unit', unit changeunit = str(unit) + 'g' # print 'changeunit', changeunit midList.append(changeunit) except: midList.append(itemdata) elif (itemdata.find('钱') > 0): try: zz = itemdata.split('钱') unit = float(zz[0]) * 3.125 # print 'unit',unit changeunit = str(unit) + 'g' midList.append(changeunit) except: midList.append(itemdata) elif (itemdata.find('kg') > 0): try: zz = itemdata.split('kg') unit = float(zz[0]) * 1000 changeunit = str(unit) + 'g' midList.append(changeunit) except: midList.append(itemdata) elif (itemdata.find('Kg') > 0): try: zz = itemdata.split('Kg') unit = float(zz[0]) * 1000 changeunit = str(unit) + 'g' midList.append(changeunit) except: midList.append(itemdata) elif (itemdata.find('斤') > 0): try: zz = itemdata.split('斤') unit = float(zz[0]) * 500 changeunit = str(unit) + 'g' midList.append(changeunit) except: midList.append(itemdata) elif (itemdata.find('分') > 0): try: zz = itemdata.split('斤') unit = float(zz[0]) * 0.3 changeunit = str(unit) + 'g' midList.append(changeunit) except: midList.append(itemdata) else: midList.append(itemdata) normalList.append(midList) web_data_process.write_in_csv(writecsvname, normalList)
def webProcessNum(readcsvname, writecsvname): print 'webProcessNum' csvdata = web_data_process.read_csv(readcsvname) finaldata = [] num = 0 pattern3 = re.compile( ur'\d+(?:g|kg|ml|l|个|Kg|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握|株)' ) pattern4 = re.compile( ur'\d+.\d+(?:g|kg|ml|Kg|l|个|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握|株)' ) pattern1 = re.compile(ur'[\u4e00-\u9fa5]+(\(|()') pattern2 = re.compile(ur'(?:\(|(|)|\))') pattern5 = re.compile( ur'(?<![\u4e00-\u9fa5])(?:\(|()\d+(?:g|kg|ml|l|个|Kg|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握|株)(?:\)|))(?![\u4e00-\u9fa5])' ) pattern6 = re.compile(ur'(?:\)|))(?=[\u4e00-\u9fa5]+)') pattern7 = re.compile(ur'(?:\(|()') for content in csvdata: j = 0 medicallist = [] yaowulist = [] try: content.remove('') except: pass for item in content: item = item.decode('utf-8') if j == 0: medicallist.append(content[0]) else: #情况一:升麻 =》只有字 if not pattern3.search(item): # print '1', item word = item wordnumber = 'None' medicallist.append(word) medicallist.append(wordnumber) #情况三:(6g)=>只有数量 if pattern5.search(item): # print '3',item wordnumber = re.sub(pattern2, '', item) k = 0 for item in medicallist: if item == 'None': medicallist[k] = wordnumber k += 1 #情况五:处理‘各’:各(30g) or 焦栀各(各9g) if item.find('各') > -1: # print '各',item item = item.replace('各', '') item = re.sub(pattern2, '', item) #找到整数 或者 小数 weight1 = pattern4.findall(item) #小数 weight2 = pattern3.findall(item) #整数 # 把正确的值放在变量weight中 if (weight1): wordnumber = weight1[0] yaowulist = pattern4.split(item) elif (weight2): wordnumber = weight2[0] yaowulist = pattern3.split(item) # 把处方的每味药提出来重新放在medicallist列表元素[0]里,同时已经去除了药的数量单位 if (yaowulist): try: for zz in yaowulist: if zz != u'': word = zz # print 'word',word medicallist.append(word) medicallist.append(wordnumber) except: pass k = 0 for item in medicallist: if item == 'None': medicallist[k] = wordnumber k += 1 #情况二:麻黄(6g) if pattern1.search(item): # print '麻黄(6g)',item word = item[0:pattern7.search(item).start()] # 
print 'bb',word wordnumber = item[pattern7.search(item).end():] wordnumber = re.sub(pattern2, '', wordnumber) # print 'nn',wordnumber medicallist.append(word) medicallist.append(wordnumber) #情况四:(6g)麻黄 if pattern6.search(item): wordnumber = item[:pattern6.search(item).start()] wordnumber = re.sub(pattern2, '', wordnumber) word = item[pattern6.search(item).end():] medicallist.append(wordnumber) medicallist.append(word) j += 1 finaldata.append(medicallist) num += 1 web_data_process.write_in_csv(writecsvname, finaldata)