def deleteblank(readcsvname, writecsvname):
    """Copy a CSV, leaving out every row that has exactly one cell.

    Rows are read with ``web_data_process.read_csv`` and the surviving rows
    are written with ``web_data_process.write_in_csv``.  The number of rows
    kept is printed as a progress message.

    Args:
        readcsvname: Path of the CSV file to read.
        writecsvname: Path of the CSV file to write.
    """
    print('deleteblank')
    rows = web_data_process.read_csv(readcsvname)
    # A row is dropped only when its length is exactly 1; empty rows and
    # rows with two or more cells pass through untouched.
    kept = [row for row in rows if len(row) != 1]
    print('lenth %d' % len(kept))
    web_data_process.write_in_csv(writecsvname, kept)
def webSix(readcsvname, writecsvname):
    """Run every cell of a CSV through ``match1`` and write the result.

    Each cell is decoded from UTF-8, passed to ``match1`` (defined elsewhere
    in this module) and replaced in its row by the returned value.  The rows
    are then written to *writecsvname*.

    Args:
        readcsvname: Path of the CSV file to read.
        writecsvname: Path of the CSV file to write.
    """
    # The original printed 'webFive' here, a copy-paste slip from the
    # sibling function that made the progress log misleading.
    print('webSix')
    rows = web_data_process.read_csv(readcsvname)
    data = []
    for content in rows:
        # Cells are overwritten by index; the row length never changes, so
        # iterating and assigning at the same time is safe.
        for j, raw in enumerate(content):
            content[j] = match1(raw.decode('utf-8'))
        data.append(content)
    web_data_process.write_in_csv(writecsvname, data)
def deletebianhao(readcsvname1, readcsvname2,
                  writecsvname1='webFormula_final_2.csv',
                  writecsvname2='webFunction_3.csv'):
    """Strip the first column from two CSV files and write the results.

    The first cell of every row is discarded (presumably a serial-number
    column, going by the function name -- confirm against the data).

    Args:
        readcsvname1: Path of the first CSV file to read.
        readcsvname2: Path of the second CSV file to read.
        writecsvname1: Output path for the rows of *readcsvname1*.  Defaults
            to the file name that used to be hard-coded.
        writecsvname2: Output path for the rows of *readcsvname2*.  Defaults
            to the file name that used to be hard-coded.
    """
    print('deletebianhao')
    csvdata1 = web_data_process.read_csv(readcsvname1)
    csvdata2 = web_data_process.read_csv(readcsvname2)
    # Slicing instead of pop(0): an empty row stays empty rather than
    # raising IndexError, and the rows that were read are not mutated.
    formulaList = [row[1:] for row in csvdata1]
    functionList = [row[1:] for row in csvdata2]
    web_data_process.write_in_csv(writecsvname1, formulaList)
    web_data_process.write_in_csv(writecsvname2, functionList)
def webFive(readcsvname, writecsvname):
    """Clean every cell of a CSV with ``wordmatch`` and ``match``.

    For each cell: decode it from UTF-8, cut it off at the first occurrence
    of the character 去 (if present), pass the remainder through
    ``wordmatch`` and then ``match`` (both defined elsewhere in this module)
    and store the result back in the row.  According to the original inline
    comment, ``match`` uses a regular expression to drop surplus units and
    keep only "number + g" -- verify against its definition.

    Args:
        readcsvname: Path of the CSV file to read.
        writecsvname: Path of the CSV file to write.
    """
    print('webFive')
    rows = web_data_process.read_csv(readcsvname)
    data = []
    for content in rows:
        # Cells are overwritten by index; the row length never changes, so
        # iterating and assigning at the same time is safe.
        for j, raw in enumerate(content):
            item = raw.decode('utf-8')
            # Unicode literal: searching decoded text for a byte string
            # relies on implicit ASCII coercion, which raises
            # UnicodeDecodeError unless the default encoding was overridden.
            pos = item.find(u'去')
            if pos > -1:
                item = item[0:pos]
            content[j] = match(wordmatch(item))
        data.append(content)
    web_data_process.write_in_csv(writecsvname, data)
def _dose_to_grams(text, conversions):
    """Return *text* rewritten as '<number>g', or *text* itself if not convertible.

    *conversions* is an ordered sequence of ``(marker, factor)`` pairs.  The
    first marker found after position 0 decides the conversion: the text in
    front of it is parsed as a float and multiplied by the factor.  If that
    text is not a number the cell is returned unchanged and no further
    marker is tried.
    """
    for marker, factor in conversions:
        if text.find(marker) > 0:
            try:
                grams = float(text.split(marker)[0]) * factor
            except ValueError:
                return text
            return str(grams) + 'g'
    return text


def UnifiedDose(readcsvname, writecsvname):
    """Rewrite dose cells of a CSV so that they are expressed in grams.

    Every cell is decoded from UTF-8.  If it contains one of the unit
    markers below (not at position 0) and the text before the marker parses
    as a number, the cell becomes ``str(number * factor) + 'g'``; otherwise
    the decoded cell is kept as it is.

    Args:
        readcsvname: Path of the CSV file to read.
        writecsvname: Path of the CSV file to write.
    """
    print('UnifiedDose')
    # Checked in this order; the first marker present wins.  The factors are
    # carried over unchanged from the original branches.
    conversions = (
        (u'两', 50),
        (u'钱', 3.125),
        (u'kg', 1000),
        (u'Kg', 1000),
        (u'斤', 500),
        # The original split this case on 斤 instead of 分, so the float
        # conversion always failed and 分 doses were never converted.
        (u'分', 0.3),
    )
    rows = web_data_process.read_csv(readcsvname)
    normalList = []
    for row in rows:
        normalList.append(
            [_dose_to_grams(cell.decode('utf8'), conversions) for cell in row])
    web_data_process.write_in_csv(writecsvname, normalList)
def _fill_missing_doses(entries, dose):
    """Replace every 'None' dose placeholder in *entries* with *dose*, in place."""
    for idx, entry in enumerate(entries):
        if entry == 'None':
            entries[idx] = dose


def webProcessNum(readcsvname, writecsvname):
    """Split the ingredient cells of a CSV into name cells and dose cells.

    For every row the first empty cell (if any) is removed, the first cell is
    copied through undecoded, and every later cell is decoded from UTF-8 and
    run through the checks below.  The checks are independent ``if`` tests,
    applied in this order, so one cell may trigger more than one of them:

    * Case 1 -- no "integer + unit" dose in the cell: append the cell
      followed by the placeholder ``'None'``.
    * Case 3 -- a parenthesised integer dose with no Chinese character
      directly before or after it: strip the parentheses and use the result
      to fill every ``'None'`` placeholder collected so far in the row.
    * Case 5 -- the cell contains 各: remove 各 and all parentheses, take
      the first dose found, append every non-empty piece of text around the
      doses together with that dose, then fill the ``'None'`` placeholders.
    * Case 2 -- Chinese text followed by an opening parenthesis: append the
      text before the first opening parenthesis, then the rest with all
      parentheses removed.
    * Case 4 -- a closing parenthesis followed by Chinese text: append the
      text before that parenthesis (parentheses removed), then the text
      after it -- dose first, name second, as in the original.

    Both ASCII and full-width parentheses are recognised.

    Args:
        readcsvname: Path of the CSV file to read.
        writecsvname: Path of the CSV file to write.
    """
    print('webProcessNum')

    # u'' literals with doubled backslashes instead of ur'' so the \uXXXX
    # escapes are processed while the block stays parseable on Python 3.
    units = (u'(?:g|kg|ml|l|个|Kg|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头'
             u'|L|ML|分|节|cm|握|株)')
    cjk = u'[\u4e00-\u9fa5]'
    # The alternations as written, "\(|(", left the group unbalanced and
    # could not compile; the second alternative is the full-width form.
    open_paren = u'[(\uff08]'
    close_paren = u'[)\uff09]'

    integer_dose = re.compile(u'\\d+' + units)
    # NOTE(review): the '.' is unescaped, so besides decimals this also
    # matches e.g. "10-15g" or any 3+ digit integer dose.  Left as it was
    # because escaping it would change how ranges are split.
    decimal_dose = re.compile(u'\\d+.\\d+' + units)
    name_then_paren = re.compile(cjk + u'+' + open_paren)
    any_paren = re.compile(u'[()\uff08\uff09]')
    bare_dose = re.compile(u'(?<!' + cjk + u')' + open_paren + u'\\d+' + units
                           + close_paren + u'(?!' + cjk + u')')
    paren_then_name = re.compile(close_paren + u'(?=' + cjk + u'+)')
    first_open_paren = re.compile(open_paren)

    rows = web_data_process.read_csv(readcsvname)
    finaldata = []
    for content in rows:
        try:
            content.remove('')
        except ValueError:
            # No empty cell in this row; nothing to drop.
            pass

        medicallist = []
        for j, raw in enumerate(content):
            item = raw.decode('utf-8')
            if j == 0:
                # The leading cell is passed through as read (undecoded).
                medicallist.append(raw)
                continue

            # Case 1: name only, e.g. "升麻" -- remember that a dose is owed.
            if not integer_dose.search(item):
                medicallist.append(item)
                medicallist.append('None')

            # Case 3: dose only, e.g. "(6g)" -- applies to the names that
            # are still waiting for one.  The helper uses its own loop
            # variable; the original reused ``item`` and so clobbered the
            # cell being examined by the remaining cases.
            if bare_dose.search(item):
                _fill_missing_doses(medicallist, any_paren.sub(u'', item))

            # Case 5: "各" (each), e.g. "各(30g)" or "焦栀各(各9g)".
            if item.find(u'各') > -1:
                item = any_paren.sub(u'', item.replace(u'各', u''))
                decimal_hits = decimal_dose.findall(item)
                integer_hits = integer_dose.findall(item)
                # Reset per cell: the original kept the split result of an
                # earlier cell alive and could re-append its names.
                dose = None
                names = []
                if decimal_hits:
                    dose = decimal_hits[0]
                    names = decimal_dose.split(item)
                elif integer_hits:
                    dose = integer_hits[0]
                    names = integer_dose.split(item)
                if dose is not None:
                    for name in names:
                        if name != u'':
                            medicallist.append(name)
                            medicallist.append(dose)
                    _fill_missing_doses(medicallist, dose)

            # Case 2: name followed by a parenthesised dose, e.g. "麻黄(6g)".
            if name_then_paren.search(item):
                opening = first_open_paren.search(item)
                medicallist.append(item[:opening.start()])
                medicallist.append(any_paren.sub(u'', item[opening.end():]))

            # Case 4: parenthesised dose followed by a name, e.g. "(6g)麻黄".
            closing = paren_then_name.search(item)
            if closing:
                medicallist.append(any_paren.sub(u'', item[:closing.start()]))
                medicallist.append(item[closing.end():])

        finaldata.append(medicallist)
    web_data_process.write_in_csv(writecsvname, finaldata)
def webFour(readcsvname, writecsvname):
    """Write the output of ``composition_process`` for one CSV to another.

    The input path is handed to
    ``web_dataDetailProcess.composition_process`` and whatever it returns is
    written with ``web_data_process.write_in_csv``.

    Args:
        readcsvname: Path passed to ``composition_process``.
        writecsvname: Path of the CSV file to write.
    """
    print('webFour')
    processed = web_dataDetailProcess.composition_process(readcsvname)
    web_data_process.write_in_csv(writecsvname, processed)