Ejemplo n.º 1
0
def readmydata():
    """Load the feature and label matrices from their CSV files.

    Reads 'prescriptionFeature4.csv' (features) and 'labelFeature.csv'
    (labels) through ``excelprocess.read_csv`` and converts every cell to
    ``float``. The number of rows of each matrix is echoed to stdout.

    Returns:
        tuple: ``(X, Y)`` - two lists of rows, each row a list of floats.

    Raises:
        ValueError: if a cell cannot be parsed by ``float``.
    """
    print('readmydata')

    # NOTE(review): prescription2Feature in this file writes
    # 'labelFeature4.csv'; confirm 'labelFeature.csv' is the intended
    # label file here.
    feature_rows = excelprocess.read_csv('prescriptionFeature4.csv')
    label_rows = excelprocess.read_csv('labelFeature.csv')

    # NOTE(review): as written, replace('', '') leaves the text unchanged;
    # presumably the literal once held an invisible character (e.g. a BOM)
    # that should be stripped - confirm against the original file.
    X = [[float(cell.replace('', '')) for cell in row]
         for row in feature_rows]
    Y = [[float(cell.replace('', '')) for cell in row]
         for row in label_rows]

    print('X.lenth:', len(X))
    print('Y.lenth:', len(Y))

    return X, Y
Ejemplo n.º 2
0
def composition_pracess():
    """Split the first column of 'composition_6.csv' into non-empty tokens.

    The first cell of every row is split on single spaces, the empty tokens
    produced by runs of consecutive spaces are discarded, and the resulting
    token rows are written to 'composition_6_1.csv' through
    ``excelprocess.write_in_csv``.
    """
    print('composition_pracess')
    csvdata = excelprocess.read_csv('composition_6.csv')

    # Build a filtered list instead of popping from the list being iterated:
    # removing an element mid-iteration makes the loop skip the element that
    # follows it, so two or more consecutive spaces used to leave empty
    # tokens in the output.
    datas = [
        [token for token in item[0].split(' ') if token != '']
        for item in csvdata
    ]

    excelprocess.write_in_csv('composition_6_1.csv', datas)
Ejemplo n.º 3
0
def seperateNone():
    """Partition the rows of 'csvtest_3.csv' and report the rows per class.

    Rows containing a cell equal to 'None' are collected separately from the
    remaining rows. The class of a row is the second '*'-separated field of
    its first cell; whenever the class changes (and once more after the last
    row) the number of consecutive rows seen for the finished class is
    printed as ``count : <n>``.

    Nothing is returned or written; the two partitions are local to this
    function.

    Raises:
        IndexError: if the first cell of a row contains no '*'.
    """
    print('seperateNone')
    csvdata = excelprocess.read_csv('csvtest_3.csv')
    normalList = []
    noneLise = []

    # NOTE(review): the original comment says the count is of prescriptions
    # containing 'None' per class, but the code counts every row of the
    # class - confirm which one is wanted.
    #
    # A None sentinel marks "no class seen yet". The previous int sentinel
    # (1) never compared equal to the string class field, so the first row
    # always printed a bogus "count : 1", and the final class was never
    # reported.
    current_class = None
    count = 0
    for item in csvdata:
        if 'None' in item:
            noneLise.append(item)
        else:
            normalList.append(item)

        row_class = item[0].split('*')[1]
        if row_class == current_class:
            count += 1
            continue
        if current_class is not None:
            print('count :', count)
        # The row that triggers the change is the first row of the new class.
        current_class = row_class
        count = 1

    if current_class is not None:
        print('count :', count)
Ejemplo n.º 4
0
def createAllLableList():
    """Prefix every row of the six class CSV files with a '*<file>*<row>' tag.

    For ``n`` in 1..6 the file 'composition_<n>_class.csv' is read through
    ``excelprocess.read_csv``; each of its rows gets the string
    ``'*<n>*<row number>'`` (row numbers start at 1 per file) inserted as
    its first element and is collected in a local list. Nothing is returned
    or written by this function.
    """
    print('createAllLableLis')
    addlist = []
    for inum in range(1, 7):
        print('inum', inum)
        csvdata = excelprocess.read_csv(
            'composition_' + str(inum) + '_class.csv')
        file_tag = '*' + str(inum)
        for row_number, row in enumerate(csvdata, start=1):
            # The row is modified in place, then collected.
            row.insert(0, file_tag + '*' + str(row_number))
            addlist.append(row)
Ejemplo n.º 5
0
def noneStandard():
    """Strip non-numeric dosage phrases from every cell of 'allData_none.csv'.

    Each cell has a fixed list of phrases removed, in order, and the cleaned
    rows are written to 'allData_none1.csv' through
    ``excelprocess.write_in_csv``.
    """
    print('noneStandard')
    csvdata = excelprocess.read_csv('allData_none.csv')

    # Phrases are removed one after another in exactly this order.
    # NOTE(review): the first entry is an empty string as written, which
    # makes that replace a no-op; presumably it once held an invisible
    # character (e.g. a BOM) - confirm.
    removable = ('', '等分', '少许', '3倍于上药', '减半', '倍加')

    def strip_phrases(cell):
        for phrase in removable:
            cell = cell.replace(phrase, '')
        return cell

    noneList = [[strip_phrases(cell) for cell in row] for row in csvdata]

    excelprocess.write_in_csv('allData_none1.csv', noneList)
Ejemplo n.º 6
0
def splitnumandstr():
    """Clean every cell of 'csvtest.csv': mark '各' and drop bracketed text.

    For each cell, the character '各' is replaced by a '#' marker and then
    up to two '(...)' groups are removed, one per pass. Progress is echoed
    to stdout. The cleaned rows are collected in a local list; nothing is
    returned or written by this function.
    """
    print('splitnumandstr')
    csvdata = excelprocess.read_csv('csvtest.csv')
    datalist = []

    # One "cleared" message per removal pass, in the order the passes run.
    # NOTE(review): the second message mentions Chinese (full-width)
    # brackets, but both passes search for the same '(' / ')' characters -
    # confirm whether the second pass was meant to use full-width brackets.
    cleared_messages = (
        '*****************清除英文括号内容******************',
        '*****************清除中文括号内容******************',
    )

    for i, item in enumerate(csvdata, start=1):
        print('%%%%%%%%%%%%%num', i)
        data_after = []
        for itemdata in item:
            # NOTE(review): replace('', '') is a no-op as written; it
            # presumably once stripped an invisible character - confirm.
            itemdata = itemdata.replace('', '')
            # Handle the character '各' separately: turn it into a marker.
            itemdata = itemdata.replace('各', '#')
            print('itemdata', itemdata)

            # Data cleaning: discard whatever is written inside brackets.
            try:
                for cleared_message in cleared_messages:
                    start = itemdata.index('(')
                    end = itemdata.index(')')
                    print('*****************num1,num2******************',
                          start, end)
                    itemdata = itemdata[:start] + itemdata[end + 1:]
                    print(cleared_message, itemdata)
            except ValueError:
                # str.index found no (further) bracket pair: keep the cell
                # as cleaned so far. Only ValueError is expected here, so
                # unrelated errors are no longer silently swallowed.
                pass
            data_after.append(itemdata)
        datalist.append(data_after)
Ejemplo n.º 7
0
def transLabelvalue():
    """Translate the textual efficacy labels of 'allLabelData.csv' to values.

    The first cell of each row is copied unchanged. Every further cell is
    split on ';' and each fragment is mapped, by keyword, to one of the
    names ``label_1`` .. ``label_6`` (resolved from the enclosing module,
    not defined in this function). Fragments matching no keyword are
    reported on stdout with the row's first cell. The encoded rows are
    collected in a local list; nothing is returned or written.
    """
    print('transLabelvalue')
    csvdata = excelprocess.read_csv('allLabelData.csv')
    labelList = []
    for row in csvdata:
        encoded = []
        for position, cell in enumerate(row):
            if position == 0:
                encoded.append(cell)
                continue

            # NOTE(review): both replaces are no-ops as written; presumably
            # they once removed an invisible character and normalised a
            # full-width semicolon - confirm.
            fragments = cell.replace('', '').replace(';', ';').split(';')
            for fragment in fragments:
                if '祛风解表' in fragment:
                    encoded.append(label_1)
                elif '胜湿止痛' in fragment:
                    encoded.append(label_2)
                elif '止痒' in fragment or '透疹' in fragment:
                    encoded.append(label_3)
                elif '止痉' in fragment or '中风' in fragment:
                    encoded.append(label_4)
                elif '散肝舒脾' in fragment:
                    encoded.append(label_5)
                elif '目' in fragment or '明目' in fragment:
                    encoded.append(label_6)
                else:
                    print('error:', row[0])

            # NOTE(review): this runs once per non-first cell, so a row with
            # more than two cells adds the same list several times and a
            # one-cell row adds nothing - confirm this is intended.
            labelList.append(encoded)
Ejemplo n.º 8
0
def extractnumwithstr(readcsvname, writecsvname):
    """Split each prescription cell into a drug name and its quantity+unit.

    Rows are read with ``excelprocess.read_csv(readcsvname)``; each row is
    treated as one prescription and each cell as one drug entry. For every
    cell the first quantity match (decimal pattern preferred over the
    integer pattern) is taken as the weight, the text around the matches is
    taken as the drug name, and 'None' is stored when no quantity is found.
    Drugs whose name contains a '#' marker (standing for the character '各',
    per the messages printed below) lend their weight to preceding drugs
    that have none. The per-prescription result is flattened to
    ``[name, weight, name, weight, ...]`` and collected in a local list.

    Args:
        readcsvname: name of the CSV file to read.
        writecsvname: not referenced anywhere in this function body.

    NOTE(review): the collected list is neither returned nor written in the
    code visible here.
    """
    print('extractnumwithstr')
    # readcsvname='csvtest_1.csv'
    # readcsvname='composition_5_2.csv'
    csvdata = excelprocess.read_csv(readcsvname)

    # i: number of the prescription currently being processed (1-based)
    i = 1
    # Original author's note: the regex must use a ur'' literal to match
    # Chinese text correctly.
    # (?:...) is the non-capturing version of (...), used here so that the
    # `|` alternation of units can follow the digits.
    # NOTE(review): as written, the '.' in pattern1 is unescaped (matches any
    # character) and the `g||kg` alternation contains an empty branch, which
    # lets the unit group match nothing - confirm both against the original
    # file; a character may have been lost from the literal.
    pattern1 = re.compile(
        u'\d+.\d+(?:g||kg|ml|l|个|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握)'
    )
    pattern2 = re.compile(
        u'\d+(?:g||kg|ml|l|个|钱|片|根|条|份|张|枚|具|朵|只|粒|茎|两|斤|挺|对|头|L|ML|分|节|cm|握)')
    # pattern3 is compiled but not used in this function.
    pattern3 = re.compile(u'kg')
    finalmedicallist = []
    for item in csvdata:
        print(
            '****************************************************************** 处方: ',
            i)
        medicallist = []
        point = []
        # Despite its name this is a list of [name, weight] lists.
        medicaldict = []
        for itemdata in item:
            weight = ''
            yaowulist = []
            # NOTE(review): the first two replaces are no-ops as written;
            # presumably they once removed an invisible character and
            # normalised a full-width period - confirm.
            itemdata = itemdata.replace('', '')
            itemdata = itemdata.replace('.', '.')
            # Every letter 'o' becomes the digit '0'.
            itemdata = itemdata.replace('o', '0')
            # Requires an object with .decode (a byte string); a Python 3
            # str has no such method.
            itemdata = itemdata.decode('utf8')
            # print 'itemdata', itemdata

            # --- find the quantity+unit in the cell via regex: start ---
            weight1 = pattern1.findall(itemdata)
            weight2 = pattern2.findall(itemdata)
            # Store the chosen value in `weight`; the decimal pattern wins.
            if (weight1):
                weight = weight1[0]
                yaowulist = pattern1.split(itemdata)
            elif (weight2):
                weight = weight2[0]
                yaowulist = pattern2.split(itemdata)
            # print '$$$$$$$$$$$$findal',weight1,weight2,weight
            # --- find the quantity+unit in the cell via regex: end ---

            # Put the drug name(s), now without the quantity+unit, at the
            # front of medicallist.
            if (yaowulist):
                # try:
                # list.remove raises ValueError when '' is absent; the
                # guarding try/except is commented out.
                yaowulist.remove('')
                for zz in yaowulist:
                    medicallist.append(zz)
                # except:
                #     pass
            else:
                medicallist.append(itemdata)

            # Append the drug's quantity+unit after its name ('None' when no
            # quantity was found).
            if (weight):
                medicallist.append(weight)
            else:
                medicallist.append('None')

            # medicallist e.g. [u'\u9632\u98ce', u'3l']
            medicaldict.append(medicallist)
            medicallist = []

        # print "medicallist", medicallist

        print('medicaldict', medicaldict)
        # j counts the drugs of this prescription (index of the j-th drug)
        j = 0
        # The two-name unpacking requires every entry to hold exactly two
        # elements.
        for k, v in medicaldict:
            # point records which drugs carry the '#' marker
            # print k,v
            # `> 0` means a '#' at index 0 is not detected.
            if (k.find('#') > 0):
                point.append(j)
                medicaldict[j][0] = medicaldict[j][0].replace('#', '')
                print(medicaldict[j][0])
                print('检测到“各”字,该味药在处方中所处位置:', j, k)
            j += 1
        print('该方剂一共配药数量为:', j)
        print('该方剂中出现“各”字的位置有:', point)

        f = 0
        print('##################### 开始处理所有药的数量单位 ##################')
        print('medicaldict', medicaldict)

        # For the drug at position f whose weight is 'None', copy the weight
        # of the first marked position that is not before f.
        for m, n in medicaldict:
            if (point != []):
                for pointnum in point:
                    # print 'test', pointnum
                    if (f > pointnum):
                        continue
                    elif (n == 'None'):
                        # print 'test4',medicaldict[f][1],pointnum
                        medicaldict[f][1] = medicaldict[pointnum][1]
                        break
            f += 1
        print('@@@@@@~~最后处理结果(列表):', medicaldict)

        # Flatten medicaldict into one row (name, weight, name, weight, ...)
        # in preparation for CSV output.
        onepiece = []
        for x, y in medicaldict:
            print(x, y)
            onepiece.append(x)
            onepiece.append(y)
            # print 'onepiece',onepiece

        # excelprocess.createListCSV('csvtest_2.csv',medicaldict)
        finalmedicallist.append(onepiece)
        # advance the prescription counter
        i += 1
Ejemplo n.º 9
0
def _convert_to_grams(text):
    """Return *text* as a '<number>g' string if it carries a known unit.

    The units are checked in a fixed order and the first one found (at any
    index other than 0) decides the conversion. If the text before that unit
    is not a number, *text* is returned unchanged; text without a known unit
    is returned unchanged as well.
    """
    # (unit marker, multiplier applied to the leading number)
    conversions = (
        ('两', 50),
        ('钱', 3.125),
        ('kg', 1000),
        ('斤', 500),
        ('分', 0.3),
    )
    for marker, grams_per_unit in conversions:
        # `> 0`, not `>= 0`: a marker at index 0 has no leading number.
        if text.find(marker) > 0:
            try:
                amount = float(text.split(marker)[0]) * grams_per_unit
            except ValueError:
                # Leading text is not numeric: keep the cell as it is.
                return text
            return str(amount) + 'g'
    return text


def unitTransformation():
    """Convert the quantities in 'csvtest_normal.csv' to grams.

    Every cell read through ``excelprocess.read_csv`` is decoded from UTF-8
    when it is a byte string and then converted by ``_convert_to_grams``.
    The converted rows are collected in a local list.

    Fixes over the previous version: the '分' branch split the text on '斤',
    so its conversion always failed and such cells were left untouched; the
    bare ``except`` clauses are narrowed to ``ValueError``; and text that is
    already ``str`` is no longer passed through ``.decode``.

    NOTE(review): the output file name is assigned but the rows are neither
    written nor returned in the code visible here - confirm whether a write
    call is missing.
    """
    print('unitTransformation()')
    csvdata = excelprocess.read_csv('csvtest_normal.csv')
    normalList = []
    for item in csvdata:
        midList = []
        for itemdata in item:
            if isinstance(itemdata, bytes):
                itemdata = itemdata.decode('utf8')
            midList.append(_convert_to_grams(itemdata))
        normalList.append(midList)

    writecsvname = 'allData_normal1.csv'
Ejemplo n.º 10
0
def countallmedical():
    """Count how often each drug name occurs in 'allData1.csv'.

    In every row, the odd-numbered columns are examined: a cell (stripped,
    with every 'l' removed) that contains at least one character in the
    range U+4E00..U+9FA5 is taken as a drug name, and the following column
    is parsed with ``findnum`` as its value. The distinct names and the
    (name, value) pairs are handed to ``maxValueandminValue``; the list of
    ``[name, occurrences]`` sorted by descending occurrences is written to
    'allData1Count.csv' through ``excelprocess.write_in_csv``.

    ``findnum`` and ``maxValueandminValue`` are resolved from the enclosing
    module. The code runs unchanged on Python 2 and Python 3: the previous
    version used the ``print`` statement and a ``ur''`` literal, both of
    which are syntax errors on Python 3.
    """
    from collections import Counter

    print('countallmedical')
    csvdata = excelprocess.read_csv('allData1.csv')
    medicaList = []
    medical_value = []
    pattern = re.compile(u'[\u4e00-\u9fa5]')
    for item in csvdata:
        for mark, itemdata in enumerate(item):
            # Column 0 and every even column are skipped.
            if mark % 2 == 0:
                continue

            data_value = []
            itemdata = itemdata.strip()
            itemdata = itemdata.replace('l', '')
            if isinstance(itemdata, bytes):
                itemdata = itemdata.decode('utf8')

            if pattern.search(itemdata):
                # the drug that was found
                medicaList.append(itemdata)
                # the drug together with its value
                data_value.append(itemdata)
                data_value.append(findnum(item[mark + 1]))

            # Appended for every examined column, even when it stays empty,
            # exactly as before.
            medical_value.append(data_value)

    allcount = len(medicaList)
    # Single-argument print produces the same text on Python 2 and 3.
    print('所有处方中共有药物(medicaList):  %s' % allcount)
    medicaListSet = list(set(medicaList))

    medicalcount = len(medicaListSet)
    print('medicaList去重后得到处方中不同药物数量:  %s' % medicalcount)

    medicalminmax = maxValueandminValue(medicaListSet, medical_value)
    print('medicalminmax去重后得到处方中不同药物数量:  %s' % len(medicalminmax))

    # Occurrences per drug: one Counter pass instead of list.count per name.
    occurrences = Counter(medicaList)
    numarray = [[name, occurrences[name]] for name in medicaListSet]

    # Sort by occurrences, most frequent first.
    numarray.sort(key=lambda entry: entry[1], reverse=True)

    excelprocess.write_in_csv('allData1Count.csv', numarray)
Ejemplo n.º 11
0
def prescription2Feature():
    """Build dose-share feature vectors and label vectors for prescriptions.

    Inputs (read through ``excelprocess.read_csv``):
      * 'allNormalMedicalMinMaxValue.csv' - first column is the drug name;
        the row position of a name is its feature index.
      * 'allData_normal1.csv' - one prescription per row: column 0 is the
        label key, then alternating drug-name / dose columns.
      * 'allLabelDataValue.csv' - column 0 is the label key, the remaining
        columns are the label values.

    For each prescription a 1298-element vector is built in which the
    position of every known drug holds its dose (parsed by ``findnum``)
    divided by the prescription's total dose. Drugs that cannot be placed
    are reported on stdout as ``wrong ...`` and skipped. The feature rows
    are written to 'prescriptionFeature4.csv' and the matching label rows
    to 'labelFeature4.csv'.

    Changes over the previous version: Python 2-only ``print`` statements
    replaced by single-argument ``print`` calls with identical output, the
    bare ``except`` narrowed to ``Exception``, the division forced to float
    so that integer doses are not truncated on Python 2, and the drug
    lookup done through a dictionary instead of ``list.index``.

    Raises:
        ValueError: if a prescription's label key is missing from the label
            file.
    """
    print('prescription2Feature')
    # Distinct-drug statistics of allData_normal1.csv.
    medicaldata = excelprocess.read_csv('allNormalMedicalMinMaxValue.csv')
    prescriptiondata = excelprocess.read_csv('allData_normal1.csv')
    labeldata = excelprocess.read_csv('allLabelDataValue.csv')

    # NOTE(review): replace('', '') is a no-op as written (here and below);
    # it presumably once stripped an invisible character - confirm.
    medicaList = [item[0].replace('', '') for item in medicaldata]
    # Name -> first position in medicaList (what list.index would return).
    drug_index = {}
    for position, name in enumerate(medicaList):
        drug_index.setdefault(name, position)

    labelmark = []
    labelvalue = []
    for item in labeldata:
        cells = [cell.replace('', '') for cell in item]
        if cells:
            labelmark.append(cells[0])
        labelvalue.append(cells[1:])

    # allData_normal1.csv contains 1298 distinct drugs (1379 for
    # allData1.csv).
    dim = 1298
    presFeatrue = []
    presLabelFeatrue = []
    j = 1
    wrongnum = 1
    for item in prescriptiondata:
        # Total dose of the prescription: doses sit in columns 2, 4, 6, ...
        prevalue = 0
        for dose_cell in item[2::2]:
            prevalue = prevalue + findnum(dose_cell)

        featrue = [0] * dim
        for mark, itemdata in enumerate(item):
            if mark == 0:
                # Column 0 selects the label row for this prescription.
                loc = labelmark.index(itemdata.replace('', ''))
                presLabelFeatrue.append(labelvalue[loc])
            elif mark % 2 != 0:
                # Odd columns are drug names; the dose is the next column.
                try:
                    location = drug_index[itemdata]
                    itemvalue = findnum(item[mark + 1]) / float(prevalue)
                    featrue[location] = itemvalue
                except Exception:
                    # Unknown drug, missing or unparsable dose, zero total,
                    # or index beyond dim: report and leave the slot at 0.
                    print('wrong %s %s %s %s'
                          % (wrongnum, item[0], mark, itemdata))
                    wrongnum += 1

        j += 1
        presFeatrue.append(featrue)

    print('%s %s %s' % (len(presFeatrue), j, len(presLabelFeatrue)))

    excelprocess.write_in_csv('prescriptionFeature4.csv', presFeatrue)
    excelprocess.write_in_csv('labelFeature4.csv', presLabelFeatrue)
Ejemplo n.º 12
0
def prescriptionFeature():
    """Build presence (0/1) feature vectors and label vectors.

    Inputs (read through ``excelprocess.read_csv``):
      * 'allNormalMedicalMinMaxValue.csv' - first column is the drug name;
        columns 2 and 3 are parsed as floats when normalising a dose.
        NOTE(review): presumably these are the drug's minimum and range (or
        maximum) - confirm against the producer of that file.
      * 'allData1.csv' - one prescription per row: column 0 is the label
        key, then alternating drug-name / dose columns.
      * 'allLabelDataValue.csv' - column 0 is the label key, the remaining
        columns are the label values.

    For each prescription a 1379-element vector is built with a 1 at the
    position of every drug that is known and whose dose and statistics can
    be parsed. The final counts are printed; the vectors themselves are
    neither returned nor written in the code visible here.

    Changes over the previous version: the Python 2-only ``print`` statement
    is replaced by a single-argument ``print`` call with identical output,
    the bare ``except`` is narrowed to ``Exception``, and the drug lookup is
    done through a dictionary instead of ``list.index``.

    Raises:
        ValueError: if a prescription's label key is missing from the label
            file.
    """
    print('prescriptionFeature')
    medicaldata = excelprocess.read_csv('allNormalMedicalMinMaxValue.csv')
    prescriptiondata = excelprocess.read_csv('allData1.csv')
    labeldata = excelprocess.read_csv('allLabelDataValue.csv')

    medicaList = []
    mediaclvalueList = []
    # Name -> first position in medicaList (what list.index would return).
    drug_index = {}
    for position, item in enumerate(medicaldata):
        # NOTE(review): replace('', '') is a no-op as written (here and
        # below); it presumably once stripped an invisible character.
        name = item[0].replace('', '')
        medicaList.append(name)
        mediaclvalueList.append(item)
        drug_index.setdefault(name, position)

    labelmark = []
    labelvalue = []
    for item in labeldata:
        cells = [cell.replace('', '') for cell in item]
        if cells:
            labelmark.append(cells[0])
        labelvalue.append(cells[1:])

    presFeatrue = []
    presLabelFeatrue = []
    j = 1
    wrongnum = 1
    for item in prescriptiondata:
        # 1379 distinct drugs for allData1.csv (1298 for
        # allData_normal1.csv).
        featrue = [0] * 1379
        for mark, itemdata in enumerate(item):
            if mark == 0:
                # Column 0 selects the label row for this prescription.
                loc = labelmark.index(itemdata.replace('', ''))
                presLabelFeatrue.append(labelvalue[loc])
            elif mark % 2 != 0:
                # Odd columns are drug names; the dose is the next column.
                try:
                    location = drug_index[itemdata]
                    itemvalue = findnum(item[mark + 1])
                    stats = mediaclvalueList[location]
                    # Still evaluated although only the presence flag is
                    # stored: a drug whose dose or statistics cannot be
                    # parsed must be counted as wrong and left at 0, as
                    # before. Store finalValue instead of 1 to make the
                    # feature reflect the dose as well.
                    finalValue = (itemvalue - float(stats[2]) + 1) / (
                        float(stats[3]) + 1)
                    featrue[location] = 1
                except Exception:
                    wrongnum += 1

        j += 1
        presFeatrue.append(featrue)

    print('%s %s %s' % (len(presFeatrue), j, len(presLabelFeatrue)))