def func2feature(csvname1, csvname2):
    print 'func2feature'
    funcdata = web_data_process.read_csv(csvname1)
    countdata = web_data_process.read_csv(csvname2)

    countlist = []
    for item in countdata:
        countlist.append(item[0])

    featurelist = []
    #这里可以修改需要判别的功效,放一个时会检索不到(‘.-’)
    locmark = countlist.index('清热解毒'.decode('utf-8'))
    print 'locmark', locmark
    for item in funcdata:
        check = 0
        for itemdata in item:
            itemdata = itemdata.replace('', '')
            itemdata = itemdata.replace('疏风', '祛风')
            itemdata = itemdata.replace('散风', '祛风')
            itemdata = itemdata.replace('驱风', '祛风')
            try:
                loc = countlist.index(itemdata.decode('utf-8'))
                if loc == locmark:
                    check = 1
            except:
                pass
        if check == 1:
            featurelist.append(1)
        else:
            featurelist.append(0)
    print 'len(featurelist):', len(featurelist)
    print '有多少方剂属于该功效(祛风清热):', featurelist.count(1)
    return featurelist
def loadData():
    """Load the training features and labels from two fixed CSV files.

    Features come from 'presFeature_onehot.csv' (every cell converted to
    float, with a constant 1.0 prepended to each row); labels are the first
    cell of each row of 'funcFeature.csv'.

    Returns:
        tuple: ``(mat(train_x), mat(train_y).transpose())``.  ``mat`` is not
        defined in this block; presumably numpy's ``mat`` -- TODO confirm.

    Raises:
        ValueError: if a cell cannot be converted to float.
    """
    # A UTF-8 byte-order mark glued to the first cell of a file would make
    # float() fail.  The original search string was (apparently) empty, which
    # made the replace a no-op, so the BOM is spelled out here.
    bom = '\xef\xbb\xbf'

    # presCsvname='presFeature_realValue.csv'
    presCsvname = 'presFeature_onehot.csv'
    funcCsvname = 'funcFeature.csv'
    data = web_data_process.read_csv(presCsvname)
    labeldata = web_data_process.read_csv(funcCsvname)

    train_x = []
    for num, row in enumerate(data):
        if num == 0:
            row[0] = row[0].replace(bom, '')
        # Leading 1.0 is the constant (bias) column.
        train_x.append([1.0] + [float(cell) for cell in row])

    train_y = []
    for num, row in enumerate(labeldata):
        if num == 0:
            row[0] = row[0].replace(bom, '')
        train_y.append(float(row[0]))

    # Someone else's example, kept for reference:
    # fileIn = open('../Ch05/testSet.txt')
    # for line in fileIn.readlines():
    #     lineArr = line.strip().split()
    #     train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
    #     train_y.append(float(lineArr[2]))
    return mat(train_x), mat(train_y).transpose()
Ejemplo n.º 3
0
def loadDataSet(funcCsvname, presCsvname):
    """Read the data rows and the label rows from two CSV files.

    Args:
        funcCsvname: CSV whose rows become the label list.
        presCsvname: CSV whose rows become the data list.

    Returns:
        tuple: ``(dataMat, labelMat)``, each a list of the rows yielded by
        ``web_data_process.read_csv`` for the corresponding file.
    """
    feature_rows = web_data_process.read_csv(presCsvname)
    label_rows = web_data_process.read_csv(funcCsvname)

    dataMat = [row for row in feature_rows]
    labelMat = [row for row in label_rows]
    return dataMat, labelMat
def function_count(csvname):
    print 'function_count 计算有多少种功效,每种功效出现的次数和比例*'
    csv_data = web_data_process.read_csv(csvname)
    flist = []
    for item in csv_data:
        # j=0
        for itemdata in item:
            # if j!=0 and itemdata!='':
            print 'itemdata', itemdata
            itemdata = itemdata.replace('疏风', '祛风')
            itemdata = itemdata.replace('散风', '祛风')
            itemdata = itemdata.replace('驱风', '祛风')
            flist.append(itemdata)
        # j+=1
    print '所有方剂中的功效有(没有去重):', len(flist)
    #去重 计算有多少不同的功效
    flistset = list(set(flist))

    # 统计每种药物出现的次数
    numarray = []
    n = []
    for item in flistset:
        n.append(item)
        n.append(flist.count(item))
        numarray.append(n)
        n = []
    # 以次数排序
    numarray = sorted(numarray, key=lambda x: x[1], reverse=True)
    print '所有方剂中的功效有(去重):', len(numarray)

    return numarray
Ejemplo n.º 5
0
def deletebianhao(readcsvname1, readcsvname2):
    print 'deletebianhao'
    csvdata1 = web_data_process.read_csv(readcsvname1)
    csvdata2 = web_data_process.read_csv(readcsvname2)
    formulaList = []
    functionList = []
    for item in csvdata1:
        item.pop(0)
        formulaList.append(item)
    for item in csvdata2:
        item.pop(0)
        functionList.append(item)

    writecsvname1 = 'webFormula_final_2.csv'
    writecsvname2 = 'webFunction_3.csv'
    web_data_process.write_in_csv(writecsvname1, formulaList)
    web_data_process.write_in_csv(writecsvname2, functionList)
Ejemplo n.º 6
0
def deleteblank(readcsvname, writecsvname):
    print 'deleteblank'
    csvdata = web_data_process.read_csv(readcsvname)
    finalList = []
    for item in csvdata:
        if len(item) != 1:
            finalList.append(item)
    print 'lenth', len(finalList)
    web_data_process.write_in_csv(writecsvname, finalList)
Ejemplo n.º 7
0
def checkContent(readcsvname1, readcsvname2):
    print 'checkContent'
    csvdata1 = web_data_process.read_csv(readcsvname1)
    csvdata2 = web_data_process.read_csv(readcsvname2)
    formulaList = []
    functionList = []
    for item in csvdata1:
        formulaList.append(item)
    for item in csvdata2:
        functionList.append(item)
    print '方剂个数:%d 功效个数:%d ' % (len(formulaList), len(functionList))
    num = 0
    wrong = 1
    for i in range(0, len(formulaList)):
        if formulaList[i][0] != functionList[i][0]:
            print '编号不对应!', num, formulaList[i][0], functionList[i][0]
            wrong = 0
        num += 1
    if wrong:
        print '检测完毕!编号对应!'
Ejemplo n.º 8
0
def webSix(readcsvname, writecsvname):
    print 'webFive'
    csvdata = web_data_process.read_csv(readcsvname)
    data = []
    num = 0
    for content in csvdata:
        j = 0
        for item in content:
            item = item.decode('utf-8')
            str = match1(item)
            content[j] = str
            # print 'item - x',num,j,item,x
            j += 1
        data.append(content)
        num += 1
    web_data_process.write_in_csv(writecsvname, data)
Ejemplo n.º 9
0
def composition_process(readcsvname):
    """Split the first cell of every CSV row on spaces.

    Args:
        readcsvname: CSV file to read, e.g. 'composition_6.csv'.

    Returns:
        list: one list of non-empty tokens per input row.

    Raises:
        IndexError: if a row has no cells.
    """
    print ('composition_process')
    csvdata = web_data_process.read_csv(readcsvname)
    datas = []
    for row in csvdata:
        # Filter into a new list.  The earlier version popped from the list
        # it was iterating over, which skips the element after each removal
        # and so left empty strings behind whenever two or more spaces were
        # adjacent.
        datas.append([token for token in row[0].split(' ') if token != ''])

    return datas
Ejemplo n.º 10
0
def webOne(readcsvname):
    print 'webOne'
    csvdata = web_data_process.read_csv(readcsvname)
    formulalist = []
    functionlist = []
    zhuzhilist = []
    num = 0
    for item in csvdata:
        # print item
        for itemdata in item:
            print num, itemdata.strip()
        try:
            formulalist.append(str(num) + '#' + item[1].strip())
            functionlist.append(str(num) + '#' + item[2].strip())
            zhuzhilist.append(str(num) + '#' + item[3].strip())
        except:
            pass
        num += 1

    print '........得到配伍数据共%d条 \n' % len(formulalist)
Ejemplo n.º 11
0
def webFive(readcsvname, writecsvname):
    print 'webFive'
    csvdata = web_data_process.read_csv(readcsvname)
    data = []
    num = 0
    for content in csvdata:
        j = 0
        for item in content:
            item = item.decode('utf-8')
            pos = item.find('去')
            if pos > -1:
                item = item[0:pos]

            x = wordmatch(item)
            # x = item.replace('炙', '')
            # x = x.replace('不', '')
            # x = x.replace('蒸', '')
            # x = x.replace('炒', '')
            # x = x.replace('熬', '')
            # x = x.replace('锉', '')
            # x = x.replace('炒香', '')
            # x = x.replace('炮', '')
            # x = x.replace('切', '')
            # x = x.replace('轧细', '')
            # x = x.replace('捣碎', '')
            # x = x.replace('裹煨', '')
            # x = x.replace('研粉', '')
            # x = x.replace('调下', '')
            # x = x.replace('另研', '')
            # x = x.replace('碎绵裹', '')

            # 通过正则表达去除多余的单位,只保留数值+g的单位。
            str = match(x)
            content[j] = str
            # print 'item - x',num,j,item,x
            j += 1
        data.append(content)
        num += 1
    web_data_process.write_in_csv(writecsvname, data)
Ejemplo n.º 12
0
def webTwo(readcsvname, writecsvname):
    print 'webTwo'
    csvdata = web_data_process.read_csv(readcsvname)
    formulalist = []
    num = 0
    for item in csvdata:
        # print num, item
        for itemdata in item:
            if itemdata != '':
                itemdata = itemdata.decode('utf8')
                itemdata = itemdata.replace('\r', '')
                itemdata = itemdata.replace('\n', '')
                itemdata = itemdata.replace('\t', '')
                itemdata = itemdata.replace('"', '')
                itemdata = itemdata.replace('\xc2\xa0', '')
                itemdata = itemdata.replace('\xe3\x80\x80\xe3\x80\x80', ' ')
                itemdata = itemdata.replace('\xe3\x80\x80', ' ')
                itemdata = itemdata.replace('】 ', '】')
                #webThree
                itemdata = itemdata.replace('', '')
                itemdata = itemdata.replace('【组成】', ' ')
                itemdata = itemdata.replace(',', ' ')
                itemdata = itemdata.replace('。', ' ')
                itemdata = itemdata.replace('、', '')
                itemdata = itemdata.replace('(原书未注用量)', '')
                itemdata = itemdata.replace('(原书未著用量)', '')
                itemdata = itemdata.replace('酒洗', '')
                itemdata = itemdata.replace('洗', '')
                itemdata = itemdata.replace('汤洗七次', '')
                # webThree
                itemdata = itemdata.strip()
                print 'zz', num, itemdata
                formulalist.append(itemdata.decode('utf-8'))
                # formulalist.append(itemdata)
        # print num,item
        num += 1
    web_data_process.write_list_in_csv(writecsvname, formulalist)
Ejemplo n.º 13
0
# coding=utf-8
import re
import sys
import web_dataFeatureValue
import web_data_process
reload(sys)
sys.setdefaultencoding('utf-8')

if __name__ == '__main__':
    # Script entry point.  Per the banner below, the goal is to find the
    # herbs that dominate the parameters of a trained logistic model.
    # NOTE(review): this snippet looks truncated -- `num` is never advanced
    # and `zz` is never used in the part visible here.
    print '从训练好的logistic模型参数中找出占主导作用的药物...'
    # Model weights, one value in the first cell of each row.
    readcsvname = 'weights_0.2.csv'
    weightdata = web_data_process.read_csv(readcsvname)
    # Herb table; only its first column is used.
    csvname = 'allMedicalCount.csv'
    medicaldata = web_data_process.read_csv(csvname)
    medicallist = []
    importantMedical = []
    weightlist = []
    for item in medicaldata:
        medicallist.append(item[0])
    num = 0
    for item in weightdata:
        zz = []
        # Strip the matrix-style brackets around the stored weight.
        item[0] = item[0].replace('[[', '')
        item[0] = item[0].replace(']]', '')
        # NOTE(review): the search string is empty as written, so this call
        # leaves the text unchanged; presumably it once held an invisible
        # character such as a UTF-8 BOM -- confirm.
        item[0] = item[0].replace('', '')
        # print 'zz',item[0]
        if num == 0:
            # Row 0 is skipped (presumably the bias term, since herb indices
            # below are offset by one -- confirm).
            pass
        else:
            # Keep herbs whose weight exceeds 0.001; weight row `num` maps
            # to herb `num - 1`.
            if float(item[0]) > 0.001:
                zz.append(medicallist[num - 1])
Ejemplo n.º 14
0
def prescriptionFeature():
    print 'prescriptionFeature'
    readcsvname = 'allNormalMedicalMinMaxValue.csv'
    medicaldata = web_data_process.read_csv(readcsvname)

    # readcsvname = 'allData_normal1.csv'
    readcsvname = 'allData1.csv'
    prescriptiondata = web_data_process.read_csv(readcsvname)

    readcsvname = 'allLabelDataValue.csv'
    labeldata = web_data_process.read_csv(readcsvname)

    medicaList = []
    mediaclvalueList = []
    for item in medicaldata:
        medicaList.append(item[0].replace('', ''))
        mediaclvalueList.append(item)

    labelmark = []
    labelvalue = []
    for item in labeldata:
        mark = 0
        nn = []
        for itemdata in item:
            itemdata = itemdata.replace('', '')
            if (mark == 0):
                labelmark.append(itemdata)
            else:
                nn.append(itemdata)
            mark += 1
        labelvalue.append(nn)

    # print (labelvalue)

    # print 'mediacl', medicaList
    # for item in medicaList:
    #     print (item)
    presFeatrue = []
    presLabelFeatrue = []
    j = 1
    wrongnum = 1
    #allData_normal1.csv一共有药物1298种
    # print (featrue)
    for item in prescriptiondata:
        # print 'item:',item
        mark = 0
        #allData_normal1.csv
        # featrue = [0] * 1298
        #allData1.csv
        featrue = [0] * 1379
        for itemdata in item:
            if (mark == 0):
                # print 'itemdata',itemdata
                itemdata = itemdata.replace('', '')
                # print 'itemdata', itemdata
                loc = labelmark.index(itemdata)
                # print 'loc',loc
                # print (labelvalue[loc])
                presLabelFeatrue.append(labelvalue[loc])
            else:
                if ((mark % 2) != 0):
                    try:
                        location = medicaList.index(itemdata)
                        # print 'location',location
                        itemvalue = findnum(item[mark + 1])
                        finalValue = (
                            itemvalue - float(mediaclvalueList[location][2]) +
                            1) / (float(mediaclvalueList[location][3]) + 1)
                        #特征既有配伍成分,有考虑了单位数量
                        # featrue[location]=finalValue
                        #只关心配伍成分,不关心单位数量
                        featrue[location] = 1
                    except:
                        # print 'wrong',wrongnum,item[0],mark,itemdata
                        wrongnum += 1
                else:
                    mark += 1
                    continue
            mark += 1

        j += 1
        # print (featrue)
        presFeatrue.append(featrue)

    print len(presFeatrue), j, len(presLabelFeatrue)
Ejemplo n.º 15
0
def prescription2Feature():
    print 'prescription2Feature'
    #allData_normal1.csv里的不同药味数统计
    readcsvname = 'allNormalMedicalMinMaxValue.csv'
    # allData1.csv里的不同药味数统计
    # readcsvname = 'allData1Count.csv'
    medicaldata = web_data_process.read_csv(readcsvname)

    readcsvname = 'allData_normal1.csv'
    # readcsvname = 'allData1.csv'
    prescriptiondata = web_data_process.read_csv(readcsvname)

    readcsvname = 'allLabelDataValue.csv'
    labeldata = web_data_process.read_csv(readcsvname)

    medicaList = []
    mediaclvalueList = []
    for item in medicaldata:
        medicaList.append(item[0].replace('', ''))
        # mediaclvalueList.append(item)

    labelmark = []
    labelvalue = []
    for item in labeldata:
        mark = 0
        nn = []
        for itemdata in item:
            itemdata = itemdata.replace('', '')
            if (mark == 0):
                labelmark.append(itemdata)
            else:
                nn.append(itemdata)
            mark += 1
        labelvalue.append(nn)

    presFeatrue = []
    presLabelFeatrue = []
    j = 1
    wrongnum = 1
    # allData_normal1.csv一共有药物1298种
    for item in prescriptiondata:
        # print 'item:',item
        mark_v = 0
        prevalue = 0
        for itemdata in item:
            #计算每个处方里药物剂量总值
            if (mark_v == 0):
                mark_v = +1
                continue
            else:
                if ((mark_v % 2) != 0):
                    mark_v += 1
                else:
                    value = findnum(itemdata)
                    prevalue = prevalue + value
                    mark_v += 1
        mark = 0
        # allData1.csv
        # dim=1379
        # allData_normal1.csv
        dim = 1298
        featrue = [0] * dim
        for itemdata in item:
            if (mark == 0):
                #处理对应的标签
                itemdata = itemdata.replace('', '')
                # print 'itemdata', itemdata
                loc = labelmark.index(itemdata)
                # print 'loc',loc
                # print (labelvalue[loc])
                presLabelFeatrue.append(labelvalue[loc])
            else:
                if ((mark % 2) != 0):
                    try:
                        location = medicaList.index(itemdata)
                        itemvalue = findnum(item[mark + 1]) / prevalue

                        # featrue[location] = 1
                        # featrue[location+dim-1] = itemvalue
                        featrue[location] = itemvalue
                    except:
                        print 'wrong', wrongnum, item[0], mark, itemdata
                        wrongnum += 1
                else:
                    mark += 1
                    continue
            mark += 1

        j += 1
        # print (featrue)
        presFeatrue.append(featrue)

    print len(presFeatrue), j, len(presLabelFeatrue)
Ejemplo n.º 16
0
def countallmedical(readcsvname):
    print 'countallmedical'
    # readcsvname = 'allmedical.csv'
    # readcsvname = 'allData_normal1.csv'
    # csvdata = data_process.read_csv(readcsvname)
    # readcsvname = 'allData_none1.csv'
    # readcsvname = 'allData1.csv'
    csvdata = web_data_process.read_csv(readcsvname)
    medicaList = []
    medical_value = []
    pattern = re.compile(ur'[\u4e00-\u9fa5]')
    j = 1
    for item in csvdata:
        # print 'item:',item
        mark = 0
        for itemdata in item:
            data_value = []
            # if (mark == 0):
            #     mark += 1
            #     continue
            # else:
            if ((mark % 2) == 0):
                itemdata = itemdata.strip()
                itemdata = itemdata.replace('', '')
                itemdata = itemdata.replace('l', '')
                itemdata = itemdata.decode('utf8')
                # print 'itemdata zzz', itemdata
                if (pattern.search(itemdata)):
                    # print 'j', j, mark
                    # 存取出的药物
                    medicaList.append(itemdata)
                    # 存药物对应的数值
                    data_value.append(itemdata)
                    data_value.append(findnum(item[mark + 1]))
                mark += 1
                medical_value.append(data_value)
            else:
                mark += 1
                continue
        j += 1
    allcount = len(medicaList)
    print '所有处方中共有药物(medicaList): ', allcount
    medicaListSet = list(set(medicaList))

    medicalcount = len(medicaListSet)
    print 'medicaList去重后得到处方中不同药物数量: ', medicalcount

    # print 'medicaListSet:',medicaListSet
    # print 'medical_value:',medical_value

    # medicalminmax=maxValueandminValue(medicaListSet,  medical_value)
    # print 'medicalminmax去重后得到处方中不同药物数量: ', len(medicalminmax)

    #统计每种药物出现的次数
    numarray = []
    n = []
    for item in medicaListSet:
        n.append(item)
        n.append(medicaList.count(item))
        numarray.append(n)
        n = []

    #以次数排序
    numarray = sorted(numarray, key=lambda x: x[1], reverse=True)

    # writecsvname = 'allNormalMedicalandValue.csv'
    # data_process.write_in_csv(writecsvname ,  medical_value)

    # writecsvname = 'allNormalMedicalMinMaxValue.csv'
    # data_process.write_in_csv(writecsvname , medicalminmax)

    # writecsvname = 'allMedicalCount.csv'
    # data_process.write_in_csv(writecsvname , numarray)

    return numarray
Ejemplo n.º 17
0
def UnifiedDose(readcsvname, writecsvname):
    print 'UnifiedDose'
    csvdata = web_data_process.read_csv(readcsvname)
    normalList = []
    for item in csvdata:
        # print 'item',item
        midList = []
        for itemdata in item:
            # print 'itemdata', itemdata
            itemdata = itemdata.decode('utf8')
            if (itemdata.find('两') > 0):
                try:
                    zz = itemdata.split('两')
                    # print 'split itemdata', itemdata
                    unit = float(zz[0]) * 50
                    # print 'unit', unit
                    changeunit = str(unit) + 'g'
                    # print 'changeunit', changeunit
                    midList.append(changeunit)
                except:
                    midList.append(itemdata)
            elif (itemdata.find('钱') > 0):
                try:
                    zz = itemdata.split('钱')
                    unit = float(zz[0]) * 3.125
                    # print 'unit',unit
                    changeunit = str(unit) + 'g'
                    midList.append(changeunit)
                except:
                    midList.append(itemdata)
            elif (itemdata.find('kg') > 0):
                try:
                    zz = itemdata.split('kg')
                    unit = float(zz[0]) * 1000
                    changeunit = str(unit) + 'g'
                    midList.append(changeunit)
                except:
                    midList.append(itemdata)
            elif (itemdata.find('Kg') > 0):
                try:
                    zz = itemdata.split('Kg')
                    unit = float(zz[0]) * 1000
                    changeunit = str(unit) + 'g'
                    midList.append(changeunit)
                except:
                    midList.append(itemdata)
            elif (itemdata.find('斤') > 0):
                try:
                    zz = itemdata.split('斤')
                    unit = float(zz[0]) * 500
                    changeunit = str(unit) + 'g'
                    midList.append(changeunit)
                except:
                    midList.append(itemdata)
            elif (itemdata.find('分') > 0):
                try:
                    zz = itemdata.split('斤')
                    unit = float(zz[0]) * 0.3
                    changeunit = str(unit) + 'g'
                    midList.append(changeunit)
                except:
                    midList.append(itemdata)
            else:
                midList.append(itemdata)
        normalList.append(midList)

    web_data_process.write_in_csv(writecsvname, normalList)
Ejemplo n.º 18
0
def webProcessNum(readcsvname, writecsvname):
    print 'webProcessNum'
    csvdata = web_data_process.read_csv(readcsvname)
    finaldata = []
    num = 0

    pattern3 = re.compile(
        ur'\d+(?:g|kg|ml|l|个|Kg|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握|株)'
    )
    pattern4 = re.compile(
        ur'\d+.\d+(?:g|kg|ml|Kg|l|个|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握|株)'
    )

    pattern1 = re.compile(ur'[\u4e00-\u9fa5]+(\(|()')
    pattern2 = re.compile(ur'(?:\(|(|)|\))')

    pattern5 = re.compile(
        ur'(?<![\u4e00-\u9fa5])(?:\(|()\d+(?:g|kg|ml|l|个|Kg|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握|株)(?:\)|))(?![\u4e00-\u9fa5])'
    )
    pattern6 = re.compile(ur'(?:\)|))(?=[\u4e00-\u9fa5]+)')

    pattern7 = re.compile(ur'(?:\(|()')
    for content in csvdata:
        j = 0
        medicallist = []
        yaowulist = []
        try:
            content.remove('')
        except:
            pass
        for item in content:
            item = item.decode('utf-8')
            if j == 0:
                medicallist.append(content[0])
            else:
                #情况一:升麻 =》只有字
                if not pattern3.search(item):
                    # print '1', item
                    word = item
                    wordnumber = 'None'
                    medicallist.append(word)
                    medicallist.append(wordnumber)

                #情况三:(6g)=>只有数量
                if pattern5.search(item):
                    # print '3',item
                    wordnumber = re.sub(pattern2, '', item)
                    k = 0
                    for item in medicallist:
                        if item == 'None':
                            medicallist[k] = wordnumber
                        k += 1

                #情况五:处理‘各’:各(30g) or 焦栀各(各9g)
                if item.find('各') > -1:
                    # print '各',item
                    item = item.replace('各', '')
                    item = re.sub(pattern2, '', item)
                    #找到整数 或者 小数
                    weight1 = pattern4.findall(item)  #小数
                    weight2 = pattern3.findall(item)  #整数
                    # 把正确的值放在变量weight中
                    if (weight1):
                        wordnumber = weight1[0]
                        yaowulist = pattern4.split(item)
                    elif (weight2):
                        wordnumber = weight2[0]
                        yaowulist = pattern3.split(item)
                        # 把处方的每味药提出来重新放在medicallist列表元素[0]里,同时已经去除了药的数量单位
                    if (yaowulist):
                        try:
                            for zz in yaowulist:
                                if zz != u'':
                                    word = zz
                                    # print 'word',word
                                    medicallist.append(word)
                                    medicallist.append(wordnumber)
                        except:
                            pass
                    k = 0
                    for item in medicallist:
                        if item == 'None':
                            medicallist[k] = wordnumber
                        k += 1

                #情况二:麻黄(6g)
                if pattern1.search(item):
                    # print '麻黄(6g)',item
                    word = item[0:pattern7.search(item).start()]
                    # print 'bb',word
                    wordnumber = item[pattern7.search(item).end():]
                    wordnumber = re.sub(pattern2, '', wordnumber)
                    # print 'nn',wordnumber

                    medicallist.append(word)
                    medicallist.append(wordnumber)

                #情况四:(6g)麻黄
                if pattern6.search(item):
                    wordnumber = item[:pattern6.search(item).start()]
                    wordnumber = re.sub(pattern2, '', wordnumber)
                    word = item[pattern6.search(item).end():]

                    medicallist.append(wordnumber)
                    medicallist.append(word)

            j += 1
        finaldata.append(medicallist)
        num += 1
    web_data_process.write_in_csv(writecsvname, finaldata)