def pickData(readcsvname1, readcsvname2, function): print 'pickData' preslist = [] funclist = [] presdata = data_process.read_csv(readcsvname1) funcdata = data_process.read_csv(readcsvname2) presdatalist = [] for item in presdata: presdatalist.append(item) num = 0 for item in funcdata: # print '功效',item if num == 0: item[0] = item[0].replace('', '') for itemdata in item: if itemdata.decode('utf8').find(function) > -1: funclist.append(item) preslist.append(presdatalist[num]) num += 1 print '功效 %s 的样本集大小为:%d' % (function, len(funclist)) #换功效时 需要修改最后的保存文件!!! writecsvname = 'Apriori_QFCS_Prescription.csv' data_process.write_in_csv(writecsvname, preslist) writecsvname = 'Apriori_QFCS_Function.csv' data_process.write_in_csv(writecsvname, funclist)
def func2feature(csvname1, csvname2): print 'func2feature' funcdata = data_process.read_csv(csvname1) countdata = data_process.read_csv(csvname2) countlist = [] for item in countdata: countlist.append(item[0]) featurelist = [] #这里可以修改需要判别的功效,放一个时会检索不到(‘.-’) locmark = countlist.index('祛风除湿'.decode('utf-8')) print 'locmark', locmark for item in funcdata: check = 0 for itemdata in item: itemdata = itemdata.replace('疏风', '祛风') itemdata = itemdata.replace('散风', '祛风') itemdata = itemdata.replace('驱风', '祛风') try: loc = countlist.index(itemdata.decode('utf-8')) if loc == locmark: check = 1 except: pass if check == 1: featurelist.append(1) else: featurelist.append(0) print 'len(featurelist):', len(featurelist) print '有多少方剂属于该功效(祛风除湿):', featurelist.count(1) return featurelist
def pickFunction(readcsvname1, readcsvname2, writecsvname1, writecsvname2, function): print 'pickFunction' funcdata = data_process.read_csv(readcsvname1) presdata = data_process.read_csv(readcsvname2) funclist = [] preslist = [] for item in presdata: preslist.append(item) for item in funcdata: funclist.append(item) finalpreslist = [] finalfunclist = [] print "要选择的方剂功效为 %s:" % function print "正在进行中....." num = 0 for item in funclist: for itemdata in item: itemdata = itemdata.decode('utf8') if itemdata.find(function) > -1 and len(preslist[num]) >= 3: finalfunclist.append(item) finalpreslist.append(preslist[num]) break num += 1 print "功效%s 在5W数据集的方剂中找到含有该功效方剂 %d 条。" % (function, len(finalfunclist)) print "收集负例中.....(设置负例为正例个数的1.3倍)" num = 0 count = 0 neglength = len(finalfunclist) * 1.3 for item in funclist: if (num % 50 == 0): check = True for itemdata in item: itemdata = itemdata.decode('utf8') if itemdata.find(function) > -1: check = False break #近义词也要考虑过滤 if itemdata.find('清热') > -1: check = False break if itemdata.find('解毒') > -1: check = False break if check and count < neglength: if len(preslist[num]) > 3: finalfunclist.append(item) finalpreslist.append(preslist[num]) count += 1 if count > neglength: break num += 1 print "功效%s 在5W数据集的方剂中正负样例共有 功效:%d 配伍:%d 条。" % ( function, len(finalfunclist), len(finalpreslist)) data_process.write_in_csv(writecsvname1, finalfunclist) data_process.write_in_csv(writecsvname2, finalpreslist)
def pickData(readcsvname1, readcsvname2, readcsvname3, readcsvname4, function): print 'pickData' preslist = [] funclist = [] presdata = data_process.read_csv(readcsvname1) funcdata = data_process.read_csv(readcsvname2) webPresdata = data_process.read_csv(readcsvname3) webFuncdata = data_process.read_csv(readcsvname4) presdatalist = [] for item in presdata: presdatalist.append(item) webPresdatalist = [] for item in webPresdata: webPresdatalist.append(item) num = 0 for item in funcdata: # print '功效',item if num == 0: item[0] = item[0].replace('', '') for itemdata in item: if itemdata.decode('utf8').find(function) > -1: funclist.append(item) preslist.append(presdatalist[num]) num += 1 positiveCount = len(funclist) print '功效 %s 的个数为:%d' % (function, positiveCount) negativeCount = positiveCount * 1.2 num = 0 count = 0 for item in webFuncdata: # print '功效', item if num == 0: item[0] = item[0].replace('', '') if count < negativeCount: for itemdata in item: # print 'itemdata.decode(utf8).find(function)',num,itemdata.decode('utf8').find(function) if itemdata.decode('utf8').find(function) == -1: funclist.append(item) # print 'test',webPresdatalist[num] preslist.append(webPresdatalist[num]) count += 1 break num += 1 print '功效 %s 的测试样本集大小为:%d' % (function, len(funclist)) #换功效时 需要修改最后的保存文件!!! writecsvname = 'combinePrescription.csv' data_process.write_in_csv(writecsvname, preslist) writecsvname = 'combineFunction_QFCS.csv' data_process.write_in_csv(writecsvname, funclist)
def presFeature_2(csvname1,csvname2): print 'presFeature_1' prescriptiondata = data_process.read_csv(csvname1) medicaldata = data_process.read_csv(csvname2) medicaList=[] for item in medicaldata: medicaList.append(item[0].replace('', '')) pFeatrue= [] presLabelFeatrue=[] j=1 wrongnum=1 for item in prescriptiondata: # print 'item:',item item[0] = item[0].replace('', '') mark = 0 # featrue = [0] * 1563 featrue = [0] * 223 lenth=len(item) thisall=0 for i in range(0,lenth): if ((i % 2)!=0): if item[i] != 'None': thisall = thisall+dataFeatureValue.findnum(item[i]) else: thisall=thisall+28 for itemdata in item: if(( mark % 2) == 0): try: location=medicaList.index(itemdata) # print 'location',location if item[mark+1]!='None': itemvalue=dataFeatureValue.findnum(item[mark+1]) # finalValue=(itemvalue-float(mediaclvalueList[location][2])+1)/(float(mediaclvalueList[location][3])+1) #特征既有配伍成分,又考虑了单位数量和归一化 featrue[location]=itemvalue/thisall else: featrue[location] = 28/thisall except: # print 'wrong',wrongnum,item[0],mark,itemdata wrongnum += 1 mark += 1 else: mark+=1 j+=1 pFeatrue.append(featrue) print len(pFeatrue),j,wrongnum return pFeatrue
def presFeature(csvname1,csvname2): print 'presFeature' prescriptiondata = data_process.read_csv(csvname1) medicaldata = data_process.read_csv(csvname2) medicaList=[] for item in medicaldata: medicaList.append(item[0].replace('', '')) # print 'mediacl', medicaList # for item in medicaList: # print (item) pFeatrue= [] presLabelFeatrue=[] wrongnum=1 #allData_normal1.csv一共有药物1487种 for item in prescriptiondata: # print 'item:',item item[0] = item[0].replace('', '') mark = 0 #多少种药就是多少维 668种药,668维 # featrue = [0] * 1563 featrue = [0] * 584 for itemdata in item: if mark==0: mark+=1 continue if(( mark % 2) == 1): try: location=medicaList.index(itemdata) # print 'location',location # itemvalue=dataFeatureValue.findnum(item[mark+1]) # finalValue=(itemvalue-float(mediaclvalueList[location][2])+1)/(float(mediaclvalueList[location][3])+1) #特征既有配伍成分,有考虑了单位数量 # featrue[location]=finalValue #只关心配伍成分,不关心单位数量 featrue[location]=1 except: # print 'wrong',wrongnum,item[0],mark,itemdata wrongnum += 1 mark += 1 else: mark+=1 pFeatrue.append(featrue) print len(pFeatrue),wrongnum return pFeatrue
def tongji(readcsvname): print 'tongji' data = data_process.read_csv(readcsvname) medicallist=[] for item in data: medicallist.append(item) num=0 count = 0 countp=0 all=len(medicallist) print all for item in medicallist: if num ==0: aa=item[1] count += 1 else: # print 'zz', item[1] if item[1]==aa: count+=1 else: # print count p=float(count)/all countp=countp+p print '频次:%s , 占比:%f'%(aa,p) count=0 count += 1 aa = item[1] num+=1 p = float(count) / all countp = countp + p print '频次:%s , 占比:%f' % (aa, p) print 'countp',countp
def composition_process(readcsvname, writecsvname):
    """Split the first cell of every row on spaces and save the pieces.

    Empty pieces (produced by repeated spaces) are dropped.

    :param readcsvname: path of the input CSV.
    :param writecsvname: path of the output CSV, one row per input row.
    """
    print('composition_process')
    csvdata = data_process.read_csv(readcsvname)
    datas = []
    for item in csvdata:
        # Filter instead of popping while iterating: popping skips the
        # element after each removed one, so runs of blanks were left behind.
        datas.append([part for part in item[0].split(' ') if part != ''])
    data_process.write_in_csv(writecsvname, datas)
def seperateNone():
    """Split the rows of ``csvtest_3.csv`` by whether they hold a 'None' cell.

    Rows with a 'None' cell go to one local list, the others to a second
    one; neither list is returned or written.  While iterating, a counter is
    printed and reset whenever the second '*'-separated field of the first
    cell differs from the previous row's.
    """
    print('seperateNone')
    readcsvname = 'csvtest_3.csv'
    # readcsvname ='allData_none.csv'
    csvdata = data_process.read_csv(readcsvname)
    normalList = []
    noneLise = []
    nn = 1
    # Original author's note: count, per category, the prescriptions
    # containing None.
    count = 1
    for item in csvdata:
        check = 1
        for itemdata in item:
            if (itemdata == 'None'):
                check = 0
                noneLise.append(item)
                break
        if (check == 1):
            normalList.append(item)
        zz = item[0].split('*')
        # NOTE(review): nn starts as the int 1 while zz[1] is a string, so
        # the first comparison cannot succeed -- confirm this is intended.
        # The counter of the final group is never printed.
        if (zz[1] == nn):
            count += 1
        else:
            print('count :', count)
            count = 1
        # nn remembers the previous category so that a change is detected
        nn = zz[1]
def process_blank(readcsvname, writecsvname):
    """Drop empty cells and split cells on the full-width (ideographic) space.

    :param readcsvname: path of the input CSV (cells are UTF-8 bytes).
    :param writecsvname: path of the output CSV, one row per input row.
    """
    print('process_blank')
    csvdata = data_process.read_csv(readcsvname)
    datas = []
    for item in csvdata:
        replace = []
        for itemdata in item:
            if itemdata == '':
                # Skip blanks without mutating the row: popping during
                # iteration made the loop jump over the following cell.
                continue
            # '\xe3\x80\x80' is U+3000 in UTF-8; an unsplit cell comes back
            # as a single-element list, so extend covers both cases.
            replace.extend(itemdata.split('\xe3\x80\x80'))
        datas.append(replace)
    data_process.write_in_csv(writecsvname, datas)
def splitnumandstr(readcsvname): print('splitnumandstr') csvdata = data_process.read_csv(readcsvname) pattern1 = re.compile(ur'(?:\(|()') pattern2 = re.compile(ur'(?:\)|))') data_after = [] datalist = [] for item in csvdata: # print ('%%%%%%%%%%%%%num',i) check = True for itemdata in item: itemdata = itemdata.replace('', '') itemdata = itemdata.decode('utf-8') #清洗数据:处理描述中的里括号里的内容***不要括号里的内容********** try: num1, num2 = kuohaoClear(itemdata) itemdata = itemdata[0:num1] + itemdata[num2:] except: # print 'kuohao error',item[0] if pattern1.search(itemdata): itemdata = itemdata[:pattern1.search(itemdata).start()] data_after.append(itemdata) check = False if pattern2.search(itemdata): itemdata = itemdata[pattern2.search(itemdata).end():] check = True if check: data_after.append(itemdata) datalist.append(data_after) data_after = [] return datalist
def function_count(csvname): print 'function_count 计算有多少种功效,每种功效出现的次数和比例*' csv_data = data_process.read_csv(csvname) flist = [] for item in csv_data: checknum = 0 for itemdata in item: if checknum != 0: itemdata = itemdata.replace('疏风', '祛风') itemdata = itemdata.replace('散风', '祛风') itemdata = itemdata.replace('驱风', '祛风') flist.append(itemdata) checknum += 1 print '所有方剂中的功效有(没有去重):', len(flist) #去重 计算有多少不同的功效 flistset = list(set(flist)) # 统计每种药物出现的次数 numarray = [] n = [] for item in flistset: n.append(item) n.append(flist.count(item)) numarray.append(n) n = [] # 以次数排序 numarray = sorted(numarray, key=lambda x: x[1], reverse=True) print '所有方剂中的功效有(去重):', len(numarray) return numarray
def countallmedical(readcsvname): print 'countallmedical' csvdata = data_process.read_csv(readcsvname) medicaList = [] medical_value = [] pattern = re.compile(ur'[\u4e00-\u9fa5]') j = 1 for item in csvdata: print 'item:', item[0] mark = 0 for itemdata in item: if mark == 0: mark += 1 continue data_value = [] if ((mark % 2) == 1): itemdata = itemdata.strip() itemdata = itemdata.replace('l', '') itemdata = itemdata.decode('utf8') # print 'itemdata zzz', itemdata if (pattern.search(itemdata)): # print 'j', j, mark # 存取出的药物 medicaList.append(itemdata) # 存药物对应的数值 # data_value.append(itemdata) # data_value.append(findnum(item[mark + 1])) mark += 1 medical_value.append(data_value) else: mark += 1 continue j += 1 allcount = len(medicaList) print '所有处方中共有药物(medicaList): ', allcount medicaListSet = list(set(medicaList)) medicalcount = len(medicaListSet) print 'medicaList去重后得到处方中不同药物数量: ', medicalcount # medicalminmax=maxValueandminValue(medicaListSet, medical_value) # print 'medicalminmax去重后得到处方中不同药物数量: ', len(medicalminmax) #统计每种药物出现的次数 numarray = [] n = [] for item in medicaListSet: n.append(item) n.append(medicaList.count(item)) numarray.append(n) n = [] print 'numarray1' #以次数排序 numarray = sorted(numarray, key=lambda x: x[1], reverse=True) print 'numarray2' return numarray
def main():
    """Compare original index weights with weights derived from market value.

    Loads the weight, trade and index tables, then, for a hard-coded trade
    date, prints the difference between the weight ratio of neighbouring
    stocks under both weightings.  The process exits after the first pass
    through the date loop.
    """
    wgt_path = r'./data/HS300_idx_wt.csv'
    trade_data_path = r'./data/tmp/fixed_daily_data.csv'
    idx_df = r'./data/HS300_idx_wt.csv'
    name2code = get_name2code(wgt_path)
    code2name = dict([(v, k) for k, v in name2code.items()])
    wgt_df = read_csv(wgt_path)
    trade_df = read_csv(trade_data_path)
    # idx_df is rebound from the path string to the loaded table.
    idx_df = read_csv(idx_df)
    date_list = wgt_df['trade_date'].drop_duplicates().to_list()
    date_list.sort()
    for date in date_list[::-1]:
        # NOTE(review): the loop variable is overwritten with a fixed date,
        # so every iteration would look at the same day -- looks like a
        # debugging leftover; confirm.
        date = 20190531
        idx_df_day = idx_df[idx_df.trade_date == date]
        wgt_df_day = wgt_df[wgt_df.trade_date == date]
        trade_df_day = trade_df[trade_df.trade_date == date][['ts_code', 'trade_date', 'total_mv', 'circ_mv', 'total_share', 'close', 'float_share', 'free_share']]
        wgt_code = idx_df_day['ts_code'].drop_duplicates().to_list()
        trade_code = trade_df_day['ts_code'].drop_duplicates().to_list()
        suspended_stock = select_suspend_stock(trade_code, wgt_code)
        if len(suspended_stock) == 0:
            print(date)
            original_weight = get_ts_code_original_weight(idx_df_day)
            mkt_val = get_market_value(trade_df_day)
            wgt = caculate_weight_by_market_value(wgt_code, mkt_val)
            for i in range(len(trade_code)-1):
                a, b = trade_code[i], trade_code[i+1]
                try:
                    # Ratio of the two stocks' weights under each scheme;
                    # equal ratios print a difference of 0.
                    r_a = original_weight[a]
                    r_b = original_weight[b]
                    b1 = r_a/r_b
                    w_a = wgt[a]
                    w_b = wgt[b]
                    b2 = w_a/w_b
                    print(b1-b2)
                except:
                    # Any failure for a pair (e.g. a missing code) is skipped.
                    continue
        exit()
def checkitem(readcsvname1, readcsvname2, readcsvname3): print '检查配伍-功效-主治对应与否中....' data1 = data_process.read_csv(readcsvname1) data2 = data_process.read_csv(readcsvname2) data3 = data_process.read_csv(readcsvname3) i = 0 for item in data1: if i == 1620: print 'excel 第1622行 配伍(蓝实,决明子。。。) ', item for itemdata in item: print itemdata i += 1 j = 0 for item in data2: if j == 1620: print '第1622行 功效(疏风散热,清肝明目。) ', item[0] j += 1 k = 0 for item in data3: if k == 1620: print '第1622行 主治(肝胆风热上攻,两目??(目旁加流字右边),视物不明。) ', item[0] k += 1
def splitnumandstr(readcsvname):
    """Clean every cell: mark the "each" character and drop bracketed remarks.

    For each cell the byte-order mark is removed, the character meaning
    "each" is replaced by ``#`` so it can be handled separately later, and
    the first ASCII-bracketed span as well as the first full-width-bracketed
    span are removed, brackets included.

    :param readcsvname: path of the input CSV (cells are UTF-8 bytes).
    :returns: list of rows, each a list of cleaned byte-string cells.
    """
    def _drop_bracketed(text, opening, closing):
        """Return ``text`` without its first ``opening ... closing`` span."""
        start = text.find(opening)
        if start < 0:
            return text
        end = text.find(closing, start + len(opening))
        if end < 0:
            return text
        # len(closing) matters: a full-width bracket is three bytes long.
        return text[:start] + text[end + len(closing):]

    print('splitnumandstr')
    csvdata = data_process.read_csv(readcsvname)
    datalist = []
    for item in csvdata:
        data_after = []
        for itemdata in item:
            itemdata = itemdata.replace('\xef\xbb\xbf', '')
            itemdata = itemdata.replace('各', '#')
            itemdata = _drop_bracketed(itemdata, '(', ')')
            # U+FF08 / U+FF09 as UTF-8 bytes.  Handled independently of the
            # ASCII pair so cells with only full-width brackets are cleaned.
            itemdata = _drop_bracketed(itemdata, '\xef\xbc\x88', '\xef\xbc\x89')
            data_after.append(itemdata)
        datalist.append(data_after)
    return datalist
def createAllList():
    """Merge ``composition_1_3.csv`` .. ``composition_6_3.csv`` into one list.

    Every row is prefixed with an id of the form ``*<file number>*<row
    number>`` (row numbers start at 1 in each file).

    :returns: the combined list of rows.  (Earlier the list was built and
        then discarded; returning it does not affect callers that ignore the
        result.)
    """
    print('createAllList')
    addlist = []
    for inum in range(1, 7):
        print('inum', inum)
        readcsvname = 'composition_' + str(inum) + '_3.csv'
        csvdata = data_process.read_csv(readcsvname)
        for i, item in enumerate(csvdata, 1):
            item.insert(0, '*' + str(inum) + '*' + str(i))
            addlist.append(item)
    return addlist
def computeAverage(csvname): print 'computeAverage' prescriptiondata = data_process.read_csv(csvname) num=0 itemvalue=0 for item in prescriptiondata: mark = 0 for itemdata in item: if ((mark % 2) == 0): mark += 1 else: if itemdata != 'None': value=dataFeatureValue.findnum(itemdata) if value<1000: itemvalue = itemvalue+value num += 1 else: continue mark += 1 ave=itemvalue/num print 'itemvalue,num,ave=itemvalue/num:',itemvalue,num,ave
def noneStandard(readcsvname):
    """Strip non-numeric dose phrases (and a byte-order mark) from every cell.

    :param readcsvname: path of the input CSV (cells are UTF-8 bytes).
    :returns: list of rows, each a list of cleaned cells.
    """
    print('noneStandard')
    csvdata = data_process.read_csv(readcsvname)
    # Removed in this order.  The first entry is the UTF-8 byte-order mark,
    # spelled out as escapes because an invisible literal is easily lost.
    removable = ('\xef\xbb\xbf', '等分', '少许', '3倍于上药', '减半', '倍加')
    noneList = []
    for item in csvdata:
        midList = []
        for itemdata in item:
            for phrase in removable:
                itemdata = itemdata.replace(phrase, '')
            midList.append(itemdata)
        noneList.append(midList)
    return noneList
def func2feature(csvname1, function): print 'func2feature' funcdata = data_process.read_csv(csvname1) featurelist = [] for item in funcdata: check = False for itemdata in item: itemdata = itemdata.decode('utf-8') itemdata = itemdata.replace('疏风', '祛风') itemdata = itemdata.replace('散风', '祛风') itemdata = itemdata.replace('驱风', '祛风') if itemdata.find(function) > -1: check = True if check: featurelist.append(1) else: featurelist.append(0) print 'len(featurelist):', len(featurelist) print '有多少方剂属于该功效(祛风除湿):', featurelist.count(1) return featurelist
def composition_process(readcsvname, writecsvname): print('composition_process') # readcsvname='composition_6.csv' csvdata = data_process.read_csv(readcsvname) datas = [] pattern = re.compile(ur'(?:、|,|;|。|,)') pattern1 = re.compile(ur'(?:\(|()') pattern2 = re.compile(ur'(?:\)|))') for item in csvdata: item[0] = item[0].decode('utf-8') item[0] = re.sub(pattern, ' ', item[0]) item[0] = re.sub(pattern1, ' (', item[0]) item[0] = re.sub(pattern2, ') ', item[0]) item = item[0].split(' ') num = 0 for itemdata in item: if itemdata == '': item.pop(num) num += 1 datas.append(item) data_process.write_in_csv(writecsvname, datas)
def onlyWord(readcsvname): print 'onlyWord' preslist = [] presdata = data_process.read_csv(readcsvname) for item in presdata: preslist.append(item) finallist = [] for item in preslist: count = 0 # print item zz = [] for itemdata in item: # print count # print itemdata if (count % 2) == 0: zz.append(itemdata) count += 1 else: count += 1 # print zz finallist.append(zz) return finallist
# Test-time set-up.  This fragment relies on names defined outside it
# (load_csv, string_parser, label_dict, word_pad_length, preName, preNum,
# finalNum, FLAGS, batch_size, sess, logits, model, labels).
lenList=[]
evalCount = 0
a=0.01
print('start testing')
allList = []
words, tags = load_csv('../data/testData/HXHYvision.txt', target_columns=[0], columns_to_ignore=None, target_dict=label_dict)
words_with_index = string_parser(words, fit=True)
word_input = tflearn.data_utils.pad_sequences(words_with_index, maxlen=word_pad_length)
total = len(word_input)
evalNum = total-1
rs = 0.
# load evalData start
evalData=[]
evalCav='../data/evalData/%s_evaluate.csv'%preName
evalList=data_process.read_csv(evalCav)
for item in evalList:
    evalData.append(item)
# load evalData end
if FLAGS.visualize == True and preNum < finalNum:
    k_count = 0
    # NOTE(review): the file is opened without a matching close in this
    # fragment -- confirm it is closed further down.
    f = open('../myMedicalModel/atentionVision/html/%s_visualizeTCM_%s_noLSTM_HWH_epoches%s_r1_num%s.html' % (
        preName, preName, FLAGS.num_epochs, preNum), 'w')
    f.write(
        '<html style="margin:0;padding:0;"><meta http-equiv="Content-Type" content="text/html; charset=GBK"><body style="margin:0;padding:0;">\n')
    # Only whole batches are processed; a trailing partial batch is skipped.
    for i in range(int(total / batch_size)):
        batch_input, batch_tags = (
            word_input[i * batch_size:(i + 1) * batch_size], tags[i * batch_size:(i + 1) * batch_size])
        result = sess.run([logits, model.B, model.Q], feed_dict={model.input_pl: batch_input, labels: batch_tags})
        # arr holds the predicted probabilities (first fetched tensor, as a list)
        arr = result[0].tolist()
def extractnumfromstr(readcsvname,writecsvname):
    """Split every ingredient cell into a medicine name and a dose string.

    The first cell of a row is copied unchanged.  Each later cell is
    cleaned, decoded to unicode and matched against four dose patterns, in
    priority order: two consecutive quantities, decimal quantity, integer
    quantity, Chinese-numeral quantity.  The name (cell minus the matched
    dose) and the dose are appended as two consecutive entries; a cell with
    no dose gets the placeholder 'None'.  Finally, in every entry that
    contains '~' or '-', only the text after that character is kept.

    :param readcsvname: path of the input CSV (cells are UTF-8 bytes).
    :param writecsvname: not used by this function.
    :returns: list of rows of the form ``[first cell, name, dose, ...]``.
    """
    print ('extractnumwithstr')
    csvdata = data_process.read_csv(readcsvname)
    # The patterns must be ur'' literals to match Chinese text correctly.
    # (?:...) is the non-capturing form of (...), used for the alternation
    # of unit words that follows the number.
    # NOTE(review): the '.' in pattern1 is unescaped, so it matches any
    # character, not only a decimal point.
    pattern1 = re.compile(ur'\d+.\d+(?:g|kg|ml|l|千克|克|钱半|斤半|分半|升半|升|个|钱|片|根|条|份|张|枚|寸|具|朵|只|粒|茎|两半|斤|文|挺|对|头|L|ML|分|节|cm|握|株|两|铢)')
    pattern2 = re.compile(ur'\d+(?:g|kg|ml|l|千克|克|钱半|分半|斤半|升半|升|个|钱|片|根|条|份|张|枚|寸|具|朵|只|合|粒|茎|两半|斤|文|挺|对|头|L|ML|分|节|cm|握|株|两|铢)')
    pattern3 = re.compile(
        ur'\d+(?:g|kg|ml|l|千克|克|钱半|分半|斤半|升半|升|个|钱|片|根|条|份|张|枚|寸|具|朵|只|粒|茎|两半|斤|文|挺|合|对|头|L|ML|分|节|cm|握|株|两|铢)\d+(?:g|kg|ml|l|克|钱半|分半|斤半|升半|升|个|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两半|斤|文|挺|对|头|L|ML|分|节|cm|握|株|两|铢)')
    pattern4 = re.compile(
        ur'(?:一|二|三|四|五|六|七|八|九|十|两|半)(?:g|kg|ml|l|千克|克|钱半|斤半|分半|升半|升|个|钱|片|根|条|份|合|张|枚|寸|具|朵|只|粒|茎|两半|斤|文|挺|对|头|L|ML|分|节|cm|握|株|两|铢)')
    # Vague quantity phrases that are removed before matching.
    pattern_other=re.compile(ur'(?:等分|适量|少许)')
    medicallist = []
    for item in csvdata:
        print '****************************************************************** 处方: ', item[0]
        # checkBH is 0 only for the first cell of the row.
        checkBH = 0
        medical = []
        for itemdata in item:
            # check_ge: the cell contains the character meaning "each".
            check_ge=False
            weight = 'None'
            if checkBH:
                # NOTE(review): the next two replace() calls have identical
                # search and replacement text as written, so they change
                # nothing -- presumably special characters were lost; confirm.
                itemdata = itemdata.replace('', '')
                itemdata = itemdata.replace('.', '.')
                itemdata = itemdata.replace('o', '0')
                itemdata = itemdata.decode('utf8')
                if itemdata.find('各')>-1:
                    check_ge=True
                    itemdata = itemdata.replace('各', '')
                itemdata = re.sub(pattern_other, '', itemdata)
                match1 = pattern3.search(itemdata)  # two quantities in a row
                match2 = pattern1.search(itemdata)  # decimal quantity
                match3 = pattern2.search(itemdata)  # integer quantity
                match4 = pattern4.search(itemdata)  # Chinese-numeral quantity
                if match1:
                    yaowu = re.sub(pattern3, '', itemdata)
                    weight = match1.group()
                    if yaowu:
                        medical.append(yaowu)
                    medical.append(weight)
                    # If the cell held only a dose, the entry before it may
                    # be the 'None' placeholder of the previous medicine;
                    # drop it so this dose takes its place.
                    if medical[-2] == 'None':
                        medical.pop(-2)
                elif match2:
                    yaowu = re.sub(pattern1, '', itemdata)
                    weight = match2.group()
                    if yaowu:
                        medical.append(yaowu)
                    medical.append(weight)
                    if medical[-2] == 'None':
                        medical.pop(-2)
                elif match3:
                    yaowu = re.sub(pattern2, '', itemdata)
                    weight = match3.group()
                    if yaowu:
                        medical.append(yaowu)
                    medical.append(weight)
                    if medical[-2] == 'None':
                        medical.pop(-2)
                elif match4:
                    yaowu = re.sub(pattern4, '', itemdata)
                    weight = match4.group()
                    if yaowu:
                        medical.append(yaowu)
                    medical.append(weight)
                    if medical[-2] == 'None':
                        medical.pop(-2)
                else:
                    if itemdata:
                        medical.append(itemdata)
                        medical.append('None')
                if check_ge:
                    # "Each": give this cell's dose to every entry collected
                    # so far that still holds the 'None' placeholder.
                    num=0
                    for i in medical:
                        if i=='None':
                            medical[num]=weight
                        num+=1
            else:
                medical.append(itemdata)
            checkBH+=1
        medicallist.append(medical)
        medical=[]
    finalmedicallist=[]
    for content in medicallist:
        num=0
        for icontent in content:
            # Keep only what follows '~' / '-' (the text after the first
            # occurrence of each).
            if icontent.find('~')>-1:
                content[num]=icontent[icontent.find('~')+1:]
            if icontent.find('-') > -1:
                content[num] = icontent[icontent.find('-') + 1:]
            num+=1
        finalmedicallist.append(content)
    return finalmedicallist
def unitTransformation(readcsvname):
    """Convert doses given in traditional units (or kg) to grams.

    Every cell is decoded to unicode.  If it contains one of the known unit
    markers (not at position 0) and the text before the marker is a plain
    number, the cell is replaced by ``'<number * factor>g'``; otherwise the
    decoded cell is kept unchanged.

    :param readcsvname: path of the input CSV (cells are UTF-8 bytes).
    :returns: list of rows, each a list of converted cells.
    """
    print('unitTransformation()')
    # (unit marker, grams per unit).  Checked in this order; the first
    # marker found decides the conversion.  Unicode literals, because the
    # cells are decoded before searching.
    conversions = (
        (u'两', 50),
        (u'钱', 3.125),
        (u'kg', 1000),
        (u'斤', 500),
        (u'分', 0.3),
    )
    csvdata = data_process.read_csv(readcsvname)
    normalList = []
    for item in csvdata:
        midList = []
        for itemdata in item:
            itemdata = itemdata.decode('utf8')
            converted = itemdata
            for marker, grams in conversions:
                if itemdata.find(marker) > 0:
                    try:
                        # Split on the marker that was actually found; the
                        # last branch used to split on another unit and so
                        # never converted anything.
                        amount = float(itemdata.split(marker)[0])
                        converted = str(amount * grams) + 'g'
                    except ValueError:
                        # Text before the marker is not a plain number.
                        pass
                    break
            midList.append(converted)
        normalList.append(midList)
    return normalList
# coding=utf-8 import data_process readcsvname = '../formulaData_Experiment/ExResult_onehot_QRJD.csv' readdata = data_process.read_csv(readcsvname) datalist = [] data = [] for item in readdata: print 'item', item[0] num = 0 for i in item[0].split(' '): print '1', i if (num == 3): print '2', i print i.split(':')[-1] maxiter = i.split(':')[-1] data.append(float(maxiter)) num += 1 acc = item[1].split('acc:')[-1] acc = acc.replace('"', '') print acc data.append(float(acc)) datalist.append(data) data = [] writecsvname = '../formulaData_1/L1_draw.csv' medicaldata = data_process.write_in_csv(writecsvname, datalist)
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.externals import joblib
import random
# Version 2: adds sampling so that positive and negative examples are
# balanced -- start
train_x = []
train_y = []
# rem collects the row numbers of the positive (label 1) samples.
rem = []
# presCsvname='presFeature_realValue.csv'
presCsvname = 'presFeature_onehot_668.csv'
funcCsvname = 'funcFeature.csv'
data = data_process.read_csv(presCsvname)
labeldata = data_process.read_csv(funcCsvname)
# Keep the CSV rows in a list so they can be read again later.
labellist = []
num = 0
for j in labeldata:
    labellist.append(j)
    if num == 0:
        # NOTE(review): replace('', '') changes nothing as written --
        # presumably a byte-order mark was meant to be stripped; confirm.
        j[0] = j[0].replace('', '')
    if int(j[0]) == 1:
        rem.append(num)
    train_y.append(float(j[0]))
    num += 1
print 'len(rem)1', len(rem)
print 'len(train_y)1', len(train_y)
# words ='*********** 依次输出找到的频繁项集' # print words data_process.write_str_in_csv_a(writeCsv, words) for item in aprioriEvaList: medicalstr = '' for itemdata in item: medicalstr = medicalstr + itemdata + ',' words = '支持度 最高的 %d - 频繁项集 为 : %s' % (len(item), medicalstr) # print words data_process.write_str_in_csv_a(writeCsv, words) words = '########################## medical evaluating.... ##########################' printTowrite(words) evalueatecsv = '../%sFile/%s_evaluate.csv' % (preName, preName) evalueateData = data_process.read_csv(evalueatecsv) evalueateDataList = [] for item in evalueateData: item[0] = item[0].replace('', '') for i in range(len(item)): # item[i] = item[i].decode('utf8',errors='ignore') item[i] = item[i].decode('utf8') evalueateDataList.append(item) # print evalueateDataList for item in evalueateDataList: zstr = '' for itemset in item: zstr = zstr + itemset + ',' # print zstr finalaprioriEvaList = []
# Fragment of a larger loop: i and qq come from code outside this excerpt.
maxiter = i * 10
for j in range(0, 4):
    # NOL1_weight_onehot_QRJD_s0.1_50.csv
    # readcsvname = '../formulaData_Experiment/NOL1_weight_onehot_QRJD_s0.1_'+str(maxiter)+'.csv'
    # qq selects which lamda value (and therefore which weight file) is read.
    if (qq == 1):
        lamda = 0.7
    if (qq == 2):
        lamda = 1.3
    if (qq == 3):
        lamda = 2.7
    if (qq == 4):
        lamda = 4.3
    print 'lamda', lamda
    readcsvname = '../formulaData_Experiment/weight_onehot_QRJD_s0.1_500_' + str(
        lamda) + '.csv'
    weightdata = data_process.read_csv(readcsvname)
    qq += 1
    csvname = '../formulaData_1/QRJD_medical_count.csv'
    medicaldata = data_process.read_csv(csvname)
    medicallist = []
    importantMedical = []
    weightlist = []
    # The first column of the count file gives the medicine names.
    for item in medicaldata:
        medicallist.append(item[0])
    weightlist = []
    num = 0
    for item in weightdata:
        # The first row of the weight file is skipped.
        if num != 0:
            zz = []
def extractnumwithstr(readcsvname):
    """Split every cell into medicine name and dose, sharing "each" doses.

    For each row, every cell is cleaned, decoded to unicode and searched
    for a dose (number followed by a unit word).  The cell becomes a
    ``[name, dose]`` pair, with ``'None'`` when no dose was found.  A pair
    whose name contains the character meaning "each" (not at position 0)
    lends its dose to pairs at or before it that have no dose.  The pairs of
    a row are then flattened into one list.

    :param readcsvname: path of the input CSV (cells are UTF-8 bytes).
    :returns: list of rows of the form ``[name, dose, name, dose, ...]``.
    """
    print('extractnumwithstr')
    # readcsvname='csvtest_1.csv'
    # readcsvname='composition_5_2.csv'
    csvdata = data_process.read_csv(readcsvname)
    # i: number of the prescription being processed
    i = 1
    # The patterns must be ur'' literals to match Chinese text correctly.
    # (?:...) is the non-capturing form of (...), used for the alternation
    # of unit words that follows the number.
    # NOTE(review): the '.' in pattern1 is unescaped (matches any
    # character), and the final alternative can never win because its
    # one-character prefix appears earlier in the alternation.
    pattern1 = re.compile(
        ur'\d+.\d+(?:g|kg|ml|l|个|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握|株|两半)'
    )
    pattern2 = re.compile(
        ur'\d+(?:g|kg|ml|l|个|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握|株|两半)'
    )
    finalmedicallist = []
    for item in csvdata:
        print '****************************************************************** 处方: ', i
        medicallist = []
        point = []
        # Despite its name this is a list of [name, dose] lists.
        medicaldict = []
        for itemdata in item:
            weight = ''
            yaowulist = []
            # NOTE(review): the next two replace() calls have identical
            # search and replacement text as written, so they change
            # nothing -- presumably special characters were lost; confirm.
            itemdata = itemdata.replace('', '')
            itemdata = itemdata.replace('.', '.')
            itemdata = itemdata.replace('o', '0')
            itemdata = itemdata.decode('utf8')
            # Find the dose in the cell with the two patterns -- start
            weight1 = pattern1.findall(itemdata)
            weight2 = pattern2.findall(itemdata)
            # Keep the first match; the decimal pattern has priority.
            if (weight1):
                weight = weight1[0]
                yaowulist = pattern1.split(itemdata)
            elif (weight2):
                weight = weight2[0]
                yaowulist = pattern2.split(itemdata)
            # Find the dose in the cell with the two patterns -- end
            # Store the name part(s), i.e. the cell without the dose.
            if (yaowulist):
                try:
                    # NOTE(review): when the split result holds no empty
                    # string, remove() raises and nothing is appended, so
                    # the pair ends up with the dose only; several name
                    # parts give more than two entries.  Either case breaks
                    # the two-value unpacking below -- confirm inputs.
                    yaowulist.remove('')
                    for zz in yaowulist:
                        medicallist.append(zz)
                except:
                    pass
            else:
                medicallist.append(itemdata)
            # Store the dose as the second element of the pair.
            if (weight):
                medicallist.append(weight)
            else:
                medicallist.append('None')
            # medicallist e.g. [u'\u9632\u98ce', u'3l']
            medicaldict.append(medicallist)
            medicallist = []
        print 'medicaldict', medicaldict
        # j counts the medicines of this prescription
        j = 0
        for k, v in medicaldict:
            # point records which medicines carry the "each" character
            if (k.find('各') > 0):
                point.append(j)
                medicaldict[j][0] = medicaldict[j][0].replace('各', '')
                print '检测到“各”字,该味药在处方中所处位置:', j, k
            j += 1
        print '该方剂一共配药数量为:', j
        print '该方剂中出现“各”字的位置有:', point
        f = 0
        print '##################### 开始处理所有药的数量单位 ##################'
        for m, n in medicaldict:
            if (point != []):
                for pointnum in point:
                    # Only an "each" entry at or after position f can lend
                    # its dose; the first such entry is used.
                    if (f > pointnum):
                        continue
                    elif (n == 'None'):
                        medicaldict[f][1] = medicaldict[pointnum][1]
                        break
            f += 1
        # Flatten the pairs into one row.
        onepiece = []
        for x, y in medicaldict:
            print x, y
            onepiece.append(x)
            onepiece.append(y)
        # data_process.createListCSV('csvtest_2.csv',medicaldict)
        finalmedicallist.append(onepiece)
        # next prescription
        i += 1
    return finalmedicallist