def ResultDFToSave(rules):  # Build and return a DataFrame from the rules produced by Orange3 association-rule analysis
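    # Each element of `rules` is assumed to be a stats tuple produced by
    # oaf.rules_stats(): (antecedent, consequent, support, confidence,
    # coverage, strength, lift, leverage); i[2]..i[7] below map onto the
    # DataFrame columns in that order.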
    returnRules = []
    for i in rules:
        temList = []
        temStr = ''
        for j in i[0]:  # format the first frozenset (the antecedent)
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temList.append(temStr)
        temList.append(i[2])
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '覆盖度', '力度', '提升度',
                                 '利用度'))

    supportRate = 0.02
    confidenceRate = 0.5
    itemsets = dict(oaf.frequent_itemsets(listToAnalysis, supportRate))
    rules = oaf.association_rules(itemsets, confidenceRate)
    rules = list(rules)
    regularNum = len(rules)
    printRules = dealRules(rules)
    result = list(oaf.rules_stats(
        rules, itemsets, len(listToAnalysis)))  # note: rules_stats consumes the rules iterator, so rules is used up after this call!
    printResult = dealResult(result)

    ################################################# Save the results below as an Excel file
    dfToSave = ResultDFToSave(result)
    saveRegularName = str(supportRate) + '支持度_' + str(
        confidenceRate) + '置信度_产生了' + str(regularNum) + '条规则' + '.xlsx'
    dfToSave.to_excel(saveRegularName)

    ####################################################### Below: number of association rules obtained for different support and confidence thresholds
    listTable = []
    supportRate = 0.01
    confidenceRate = 0.1
    for i in range(9):
        support = supportRate * (i + 1)
        listS = []
        for j in range(9):
            confidence = confidenceRate * (j + 1)
            itemsets = dict(oaf.frequent_itemsets(listToAnalysis, support))
            rules = list(oaf.association_rules(itemsets, confidence))
            listS.append(len(rules))
        listTable.append(listS)
    dfList = pd.DataFrame(listTable,
                          index=[supportRate * (i + 1) for i in range(9)],
                          columns=[confidenceRate * (i + 1) for i in range(9)])
    dfList.to_excel('regularNum.xlsx')
Example 2
def rules_extractor(X, profundidades=range(4), metric=0.3):
    # fp is assumed to be orangecontrib.associate.fpgrowth imported elsewhere;
    # `metric` serves as both the minimum support and the minimum confidence.
    res = {}

    for i in profundidades:
        T = transacciones_profundidad(X, i)

        itemsets = dict(fp.frequent_itemsets(T, metric))
        rules = [
            (P, Q, supp, conf)
            for P, Q, supp, conf in fp.association_rules(itemsets, metric)
        ]

        res[i] = (itemsets, rules)

    return res
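
# Minimal usage sketch (illustrative only): it assumes `fp` is
# orangecontrib.associate.fpgrowth and supplies a toy transacciones_profundidad
# that ignores the depth argument; neither detail comes from the original snippet.
import orangecontrib.associate.fpgrowth as fp

def transacciones_profundidad(X, depth):
    # Toy stand-in: return the raw transactions unchanged at every depth.
    return X

sample_T = [['a', 'b', 'c'], ['a', 'b'], ['b', 'c'], ['a', 'c']]
res = rules_extractor(sample_T, profundidades=range(2), metric=0.5)
for depth, (itemsets, rules) in res.items():
    print(depth, len(itemsets), 'itemsets,', len(rules), 'rules')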
    def find_rules(self):
        if self.data is None or not len(self.data):
            return
        if self._is_running:
            return
        self._is_running = True
        data = self.data
        self.table.model().clear()

        n_examples = len(data)
        NumericItem = self.NumericItem
        StandardItem = self.StandardItem
        filterSearch = self.filterSearch
        itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
        itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
        isSizeMatch = self.isSizeMatch
        isRegexMatch = self.isRegexMatch

        X, mapping = OneHot.encode(data, self.classify)
        self.error(911)
        if X is None:
            self.error(911, 'Need some discrete data to work with.')

        self.onehot_mapping = mapping
        ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
        names = {
            item:
            ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(
                var.name, val)
            for item, var, val in OneHot.decode(mapping, data, mapping)
        }
        # Items that consequent must include if classifying
        class_items = {
            item
            for item, var, val in OneHot.decode(mapping, data, mapping)
            if var is data.domain.class_var
        } if self.classify else set()
        assert bool(class_items) == bool(self.classify)

        model = QStandardItemModel(self.table)
        for col, (label, tooltip) in enumerate([
            ("Supp", "Support"),
            ("Conf", "Confidence (support / antecedent support)"),
            ("Covr", "Coverage (antecedent support / number of examples)"),
            ("Strg", "Strength (consequent support / antecedent support)"),
            ("Lift",
             "Lift (number of examples * confidence / consequent support)"),
            ("Levr",
             "Leverage ((support * number of examples - antecedent support * consequent support) / (number of examples)²)"
             ), ("Antecedent", None), ("", None), ("Consequent", None)
        ]):
            item = QStandardItem(label)
            item.setToolTip(tooltip)
            model.setHorizontalHeaderItem(col, item)

        #~ # Aggregate rules by common (support,confidence) for scatterplot
        #~ scatter_agg = defaultdict(list)

        # Find itemsets
        nRules = 0
        itemsets = {}
        with self.progressBar(self.maxRules + 1) as progress:
            for itemset, support in frequent_itemsets(X,
                                                      self.minSupport / 100):
                itemsets[itemset] = support

                if class_items and not class_items & itemset:
                    continue

                # Filter itemset by joined filters before descending into it
                itemset_str = ' '.join(names[i] for i in itemset)
                if (filterSearch and
                    (len(itemset) < itemsetMin or itemsetMax < len(itemset)
                     or not isRegexMatch(itemset_str, itemset_str))):
                    continue

                for rule in association_rules(itemsets,
                                              self.minConfidence / 100,
                                              itemset):
                    left, right, support, confidence = rule

                    if class_items and right - class_items:
                        continue
                    if filterSearch and not isSizeMatch(len(left), len(right)):
                        continue
                    left_str = ', '.join(names[i] for i in sorted(left))
                    right_str = ', '.join(names[i] for i in sorted(right))
                    if filterSearch and not isRegexMatch(left_str, right_str):
                        continue

                    # All filters matched, calculate stats and add table row
                    _, _, _, _, coverage, strength, lift, leverage = next(
                        rules_stats((rule, ), itemsets, n_examples))

                    support_item = NumericItem(support / n_examples)
                    # Set row data on first column
                    support_item.setData(
                        (itemset - class_items, class_items and
                         (class_items & itemset).pop()), self.ROW_DATA_ROLE)
                    left_item = StandardItem(left_str, len(left))
                    left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                    model.appendRow([
                        support_item,
                        NumericItem(confidence),
                        NumericItem(coverage),
                        NumericItem(strength),
                        NumericItem(lift),
                        NumericItem(leverage), left_item,
                        StandardItem('→'),
                        StandardItem(right_str, len(right))
                    ])
                    #~ scatter_agg[(round(support / n_examples, 2), round(confidence, 2))].append((left, right))
                    nRules += 1
                    progress.advance()
                    if nRules >= self.maxRules:
                        break
                if nRules >= self.maxRules:
                    break

        # Populate the TableView
        table = self.table
        table.setHidden(True)
        table.setSortingEnabled(False)
        proxy_model = self.proxy_model
        proxy_model.setSourceModel(model)
        table.setModel(proxy_model)
        for i in range(model.columnCount()):
            table.resizeColumnToContents(i)
        table.setSortingEnabled(True)
        table.setHidden(False)

        self.nRules = nRules
        self.nFilteredRules = proxy_model.rowCount(
        )  # TODO: continue; also add in owitemsets
        self.nSelectedRules = 0
        self.nSelectedExamples = 0
        self._is_running = False
import os, os.path, shutil
import codecs
import EMRdef
import re
emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRryzd')  # extract the .txt files from the directory
hxjb = open(r'D:\python\EMR\hxjbml.txt', errors="ignore")  # list of respiratory diseases
hxjbdic = hxjb.readlines()  # read the lines
ryzd = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" needed for Chinese text
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    line_out = []
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalise every '...肺炎' (pneumonia) mention to just '肺炎'
        for hxjbc in hxjbdic:  # search for each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
        line_output = EMRdef.delre(line_out)
        ryzd.append(line_out)
        #line = '\n'.join(line_output)
        #EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2','.txt' ,emrpath,line)

import orangecontrib.associate.fpgrowth as oaf
often = dict(oaf.frequent_itemsets(ryzd, .01))  # mine frequent itemsets
rules = oaf.association_rules(often, .5)  # set the confidence threshold here
rules = list(rules)

print(rules)
Example 5
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalise every '...肺炎' (pneumonia) mention to just '肺炎'
        for hxjbc in hxjbdic:  # search for each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
        line_output = EMRdef.delre(line_out)
        ryzd.append(line_out)
        #line = '\n'.join(line_output)
        #EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2','.txt' ,emrpath,line)

import orangecontrib.associate.fpgrowth as oaf
often = dict(oaf.frequent_itemsets(ryzd, .02))  # mine frequent itemsets

rules = oaf.association_rules(often, .25)  # set the confidence threshold here; a frozenset({'肺炎'}) itemset could be passed as an optional third argument to restrict the rules
rules = list(rules)

def dealResult(rules):  # expects the 8-tuples produced by oaf.rules_stats()
    returnRules = []
    for i in rules:
        temStr = ''
        for j in i[0]:  # format the first frozenset (the antecedent)
            temStr = temStr + j + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:
            temStr = temStr + j + '&'
        temStr = temStr[:-1]
        temStr = temStr + ';' + '\t' + str(i[2]) + ';' + '\t' + str(i[3]) + ';' + '\t' + str(i[4]) + ';' + '\t' + str(i[5]) + ';' + '\t' + str(i[6]) + ';' + '\t' + str(i[7])
#        print(temStr)
Example 6
    def find_rules(self):
        if self.data is None:
            return
        if self._is_running:
            return
        self._is_running = True
        data = self.data
        self.table.model().clear()

        n_examples = len(data)
        NumericItem = self.NumericItem
        StandardItem = self.StandardItem
        filterSearch = self.filterSearch
        itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
        itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
        isSizeMatch = self.isSizeMatch
        isRegexMatch = self.isRegexMatch

        X, mapping = OneHot.encode(data, self.classify)
        self.onehot_mapping = mapping
        ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
        names = {item: ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(var.name, val)
                 for item, var, val in OneHot.decode(mapping, data, mapping)}
        # Items that consequent must include if classifying
        class_items = {item
                       for item, var, val in OneHot.decode(mapping, data, mapping)
                       if var is data.domain.class_var} if self.classify else set()
        assert bool(class_items) == bool(self.classify)

        model = QStandardItemModel(self.table)
        for col, (label, tooltip) in enumerate([("Supp", "Support"),
                                                ("Conf", "Confidence (support / antecedent support)"),
                                                ("Covr", "Coverage (antecedent support / number of examples)"),
                                                ("Strg", "Strength (consequent support / antecedent support)"),
                                                ("Lift", "Lift (number of examples * confidence / consequent support)"),
                                                ("Levr", "Leverage ((support * number of examples - antecedent support * consequent support) / (number of examples)²)"),
                                                ("Antecedent", None),
                                                ("", None),
                                                ("Consequent", None)]):
            item = QStandardItem(label)
            item.setToolTip(tooltip)
            model.setHorizontalHeaderItem(col, item)

        #~ # Aggregate rules by common (support,confidence) for scatterplot
        #~ scatter_agg = defaultdict(list)

        # Find itemsets
        nRules = 0
        itemsets = {}
        with self.progressBar(self.maxRules + 1) as progress:
            for itemset, support in frequent_itemsets(X, self.minSupport / 100):
                itemsets[itemset] = support

                if class_items and not class_items & itemset:
                    continue

                # Filter itemset by joined filters before descending into it
                itemset_str = ' '.join(names[i] for i in itemset)
                if (filterSearch and
                    (len(itemset) < itemsetMin or
                     itemsetMax < len(itemset) or
                     not isRegexMatch(itemset_str, itemset_str))):
                    continue

                for rule in association_rules(itemsets,
                                              self.minConfidence / 100,
                                              itemset):
                    left, right, support, confidence = rule

                    if class_items and right - class_items:
                        continue
                    if filterSearch and not isSizeMatch(len(left), len(right)):
                        continue
                    left_str =  ', '.join(names[i] for i in sorted(left))
                    right_str = ', '.join(names[i] for i in sorted(right))
                    if filterSearch and not isRegexMatch(left_str, right_str):
                        continue

                    # All filters matched, calculate stats and add table row
                    _, _, _, _, coverage, strength, lift, leverage = next(
                        rules_stats((rule,), itemsets, n_examples))

                    support_item = NumericItem(support / n_examples)
                    # Set row data on first column
                    support_item.setData((itemset - class_items,
                                          class_items and (class_items & itemset).pop()),
                                         self.ROW_DATA_ROLE)
                    left_item = StandardItem(left_str, len(left))
                    left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                    model.appendRow([support_item,
                                     NumericItem(confidence),
                                     NumericItem(coverage),
                                     NumericItem(strength),
                                     NumericItem(lift),
                                     NumericItem(leverage),
                                     left_item,
                                     StandardItem('→'),
                                     StandardItem(right_str, len(right))])
                    #~ scatter_agg[(round(support / n_examples, 2), round(confidence, 2))].append((left, right))
                    nRules += 1
                    progress.advance()
                    if nRules >= self.maxRules:
                        break
                if nRules >= self.maxRules:
                    break

        # Populate the TableView
        table = self.table
        table.setHidden(True)
        table.setSortingEnabled(False)
        proxy_model = self.proxy_model
        proxy_model.setSourceModel(model)
        table.setModel(proxy_model)
        for i in range(model.columnCount()):
            table.resizeColumnToContents(i)
        table.setSortingEnabled(True)
        table.setHidden(False)

        self.nRules = nRules
        self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
        self.nSelectedRules = 0
        self.nSelectedExamples = 0
        self._is_running = False
    def doAnalysize(self,
                    pd_data,
                    category,
                    supportRate=0.02,
                    confidenceRate=0.5,
                    savepath=r'C:\Users\Administrator\Desktop'):
        # initialise the save path and the lexicon (user dictionary) path for this category
        savepath = savepath + "\\" + category
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        initpath = "tmall\\spiders\\DataAnalysize\\jiebaInit\\" + category + ".txt"
        jieba.load_userdict(initpath)
        pd_data['ratecontent_list'] = pd_data.apply(
            lambda r: list(jieba.cut(r['rateContent'])), axis=1)

        aim_list = []
        with open(initpath, 'r', encoding="utf-8") as f:
            for line in f.readlines():
                aim_list.append(line.strip('\n'))
        pd_data['aim_list'] = pd_data.apply(lambda r: list(
            set(r['ratecontent_list']).intersection(set(aim_list))),
                                            axis=1)
        simple_aimdata = []
        pd_data.apply(lambda r: simple_aimdata.append(r['aim_list'])
                      if not r['aim_list'] == [] else 1,
                      axis=1)
        wordcloudlist = []
        for item in simple_aimdata:
            for i in item:
                wordcloudlist.append(i)
        # generate a word-cloud image for each analysis
        self.everyWordCloud(wordcloudlist, savepath)

        # After the steps above, simple_aimdata holds the target lists of terms
        strSet = set(functools.reduce(lambda a, b: a + b, simple_aimdata))
        strEncode = dict(zip(strSet, range(
            len(strSet))))  # encoding dict, e.g. {'甜腻': 6, '鱼腥味': 53, ...}
        strDecode = dict(
            zip(strEncode.values(),
                strEncode.keys()))  # decoding dict, e.g. {6: '甜腻', 53: '鱼腥味', ...}
        listToAnalysis_int = [
            list(map(lambda item: strEncode[item], row))
            for row in simple_aimdata
        ]
        # run the association analysis
        itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, supportRate))
        # print("itemsets : ")
        # print(itemsets)
        rules = oaf.association_rules(itemsets, confidenceRate)
        rules = list(rules)
        regularNum = len(rules)
        printRules = self.dealRules(rules, strDecode)  # can be printed to inspect the generated rules
        # print(printRules)
        result = list(oaf.rules_stats(
            rules, itemsets,
            len(listToAnalysis_int)))  # note: rules_stats consumes the rules iterator, so rules is used up afterwards!
        # print(result)
        printResult = self.dealResult(result, strDecode)  # can be printed to inspect the results
        # print(printResult)

        ################################################# Save the results below as an Excel file
        # save rules to excel
        dfToSave = self.ResultDFToSave(result, strDecode)
        saveRegularName = savepath + "\\" + str(supportRate) + '支持度_' + str(
            confidenceRate) + '置信度_产生了' + str(regularNum) + '条规则' + '.xlsx'
        dfToSave.to_excel(saveRegularName)
        # save itemsets to excel
        self.saveItemSets(itemsets, strDecode, savepath)

        ####################################################### Below: number of association rules obtained for different support and confidence thresholds
        listTable = []
        supportRate = 0.01
        confidenceRate = 0.1
        for i in range(9):
            support = supportRate * (i + 1)
            listS = []
            for j in range(9):
                confidence = confidenceRate * (j + 1)
                itemsets = dict(
                    oaf.frequent_itemsets(listToAnalysis_int, support))
                rules = list(oaf.association_rules(itemsets, confidence))
                listS.append(len(rules))
            listTable.append(listS)
        dfList = pd.DataFrame(
            listTable,
            index=[supportRate * (i + 1) for i in range(9)],
            columns=[confidenceRate * (i + 1) for i in range(9)])
        dfList.to_excel(savepath + "\\" + 'regularNum.xlsx')
Example 8
    def find_rules(self):
        if self.data is None or not len(self.data):
            return
        if self._is_running:
            self._is_running = False
            return

        self.button.button.setText('Cancel')

        self._is_running = True
        data = self.data
        self.table.model().clear()

        n_examples = len(data)
        NumericItem = self.NumericItem
        StandardItem = self.StandardItem
        filterSearch = self.filterSearch
        itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
        itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
        isSizeMatch = self.isSizeMatch
        isRegexMatch = self.isRegexMatch

        X, mapping = OneHot.encode(data, self.classify)
        self.Error.need_discrete_data.clear()
        if X is None:
            self.Error.need_discrete_data()

        self.onehot_mapping = mapping
        ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
        names = {item: ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(var.name, val)
                 for item, var, val in OneHot.decode(mapping, data, mapping)}
        # Items that consequent must include if classifying
        class_items = {item
                       for item, var, val in OneHot.decode(mapping, data, mapping)
                       if var is data.domain.class_var} if self.classify else set()
        assert bool(class_items) == bool(self.classify)

        model = QStandardItemModel(self.table)
        for col, (label, _, tooltip) in enumerate(self.header):
            item = QStandardItem(label)
            item.setToolTip(tooltip)
            model.setHorizontalHeaderItem(col, item)

        # Find itemsets
        nRules = 0
        itemsets = {}
        ARROW_ITEM = StandardItem('→')
        ARROW_ITEM.setTextAlignment(Qt.AlignCenter)
        with self.progressBar(self.maxRules + 1) as progress:
            for itemset, support in frequent_itemsets(X, self.minSupport / 100):
                itemsets[itemset] = support

                if class_items and not class_items & itemset:
                    continue

                # Filter itemset by joined filters before descending into it
                itemset_str = ' '.join(names[i] for i in itemset)
                if (filterSearch and
                    (len(itemset) < itemsetMin or
                     itemsetMax < len(itemset) or
                     not isRegexMatch(itemset_str, itemset_str))):
                    continue

                for rule in association_rules(itemsets,
                                              self.minConfidence / 100,
                                              itemset):
                    left, right, support, confidence = rule

                    if class_items and right - class_items:
                        continue
                    if filterSearch and not isSizeMatch(len(left), len(right)):
                        continue
                    left_str =  ', '.join(names[i] for i in sorted(left))
                    right_str = ', '.join(names[i] for i in sorted(right))
                    if filterSearch and not isRegexMatch(left_str, right_str):
                        continue

                    # All filters matched, calculate stats and add table row
                    _, _, _, _, coverage, strength, lift, leverage = next(
                        rules_stats((rule,), itemsets, n_examples))

                    support_item = NumericItem(support / n_examples)
                    # Set row data on first column
                    support_item.setData((itemset - class_items,
                                          class_items and (class_items & itemset).pop()),
                                         self.ROW_DATA_ROLE)
                    left_item = StandardItem(left_str, len(left))
                    left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                    model.appendRow([support_item,
                                     NumericItem(confidence),
                                     NumericItem(coverage),
                                     NumericItem(strength),
                                     NumericItem(lift),
                                     NumericItem(leverage),
                                     left_item,
                                     ARROW_ITEM.clone(),
                                     StandardItem(right_str, len(right))])
                    nRules += 1
                    progress.advance()

                    if not self._is_running or nRules >= self.maxRules:
                        break

                qApp.processEvents()

                if not self._is_running or nRules >= self.maxRules:
                    break

        # Populate the TableView
        table = self.table
        table.setHidden(True)
        table.setSortingEnabled(False)
        proxy_model = self.proxy_model
        proxy_model.setSourceModel(model)
        table.setModel(proxy_model)
        for i in range(model.columnCount()):
            table.resizeColumnToContents(i)
        table.setSortingEnabled(True)
        table.setHidden(False)
        self.table_rules = proxy_model.get_data()
        if self.table_rules is not None:
            self.Outputs.rules.send(self.table_rules)

        self.button.button.setText('Find Rules')

        self.nRules = nRules
        self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
        self.nSelectedRules = 0
        self.nSelectedExamples = 0
        self._is_running = False
import os, os.path
import codecs
import EMRdef
import re
emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRryzd')  # extract the .txt files from the directory
hxjb = open(r'D:\python\EMR\hxjbml.txt', errors="ignore")  # list of respiratory diseases
hxjbdic = hxjb.readlines()  # read the lines
ryzd = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" needed for Chinese text
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    line_out = []
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalise every '...肺炎' (pneumonia) mention to just '肺炎'
        for hxjbc in hxjbdic:  # search for each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
        line_output = EMRdef.delre(line_out)
        ryzd.append(line_out)
        #line = '\n'.join(line_output)
        #EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2','.txt' ,emrpath,line)

import orangecontrib.associate.fpgrowth as oaf
often = dict(oaf.frequent_itemsets(ryzd, .01))  # mine frequent itemsets

rules = oaf.association_rules(often, .5)  # set the confidence threshold here (the optional third argument of association_rules must be a frozenset itemset, not a list of lines)
rules = list(rules)

print(rules)
import os, os.path
import EMRdef
import re

emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRryzd')  # extract the .txt files from the directory
hxjb = open(r'D:\python\EMR\hxjbml.txt', errors="ignore")  # list of respiratory diseases
hxjbdic = hxjb.readlines()  # read the lines
ryzd = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" needed for Chinese text
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    line_out = []
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalise every '...肺炎' (pneumonia) mention to just '肺炎'
        for hxjbc in hxjbdic:  # search for each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
        line_output = EMRdef.delre(line_out)
        ryzd.append(line_out)
        #line = '\n'.join(line_output)
        #EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2','.txt' ,emrpath,line)

import orangecontrib.associate.fpgrowth as oaf

often = dict(oaf.frequent_itemsets(ryzd, .01))  # mine frequent itemsets

rules = oaf.association_rules(often, .5)  # set the confidence threshold here (association_rules takes the frequent-itemsets dict, not the raw disease list)
rules = list(rules)

print(rules)
Example 11
def associateRules(support=0.02, confidence=0.5):
    support = 0.15
    confidence = 0.15
    try:
        with open('filelocation.json') as f_obj:
            fileInput = json.load(f_obj)
    except:
        with open('errorFlag.json', 'w') as e_obj:
            json.dump("File open process failed", e_obj)
        return
    filename = fileInput

    dfar = pd.read_csv(filename)
    tag = list(dfar.columns.values)
    listToAnalysis = []  # final result

    for item in range(1, len(tag) - 1):  # iterate over the columns
        imax = max(list(dfar[tag[item]]))  # upper bound
        imin = min(list(dfar[tag[item]]))  # lower bound

        ijc = imax - imin  # range (max - min)
        l = ijc / 4

        i1 = imin + l
        i2 = i1 + l
        i3 = i2 + l

        listToStore = []

        for i in range(dfar.shape[0]):
            s = dfar.iloc[i][tag[item]]

            if s >= i3 and s <= imax:
                ss = tag[item] + str(i3) + '-' + str(imax)
            elif s >= i2:
                ss = tag[item] + str(i2) + '-' + str(i3)
            elif s >= i1:
                ss = tag[item] + str(i1) + '-' + str(i2)
            elif s >= imin:
                ss = tag[item] + str(imin) + '-' + str(i1)
            listToStore.append(ss)

        listToAnalysis.append(listToStore.copy())

    listToAnalysis2 = []
    ll = len(listToAnalysis[0])

    for ii in range(ll):
        ltmp = []
        for it in listToAnalysis:
            ltmp.append(it[ii])
        listToAnalysis2.append(ltmp.copy())

    # build the encoding and decoding dictionaries
    what = functools.reduce(lambda a, b: a + b, listToAnalysis2)
    strSet = set(what)

    zz = zip(strSet, range(len(strSet)))
    strEncode = dict(zz)  # encoding dict

    strDecode = dict(zip(strEncode.values(), strEncode.keys()))  # decoding dict

    listToAnalysis_int = [
        list(map(lambda item: strEncode[item], row)) for row in listToAnalysis2
    ]

    with open('Information.json') as obj:
        infostring = json.load(obj)
    inforlist = infostring.split(' ')
    confidence = float(inforlist[0]) / float(100)
    support = float(inforlist[1]) / float(100)
    itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, support))
    # frequent itemsets

    rules = oaf.association_rules(itemsets, confidence)
    rules = list(rules)
    # association rules

    regularNum = len(rules)

    #printRules=dealResult(result,strDecode)
    #######
    #print("You will get ")
    #print(regularNum)
    #print("association rules when\n"+"SupportRate = ",end='')
    #print(support,end='')
    #print("ConfidenceRate = "+str(confidence))
    informationBack = "You will get " + str(regularNum) + " association rules when\n" \
                      + "SupportRate = " + str(support) + " ConfidenceRate = " + str(confidence)
    with open('InformationBack.json', 'w') as inf:
        json.dump(informationBack, inf)
    result = list(oaf.rules_stats(rules, itemsets, len(listToAnalysis_int)))

    dfToSave = ResultDFToSave(result, strDecode)
    with open('arInteractiveText.json', 'w') as ij:
        json.dump(str(dfToSave), ij)
    saveRegularName = "Processed.xlsx"
    dfToSave.to_excel(saveRegularName)
    return regularNum
Example 12
print("FREQUENT PATTERN WITH MORE THAN 1 ITEM")
counter = 1
frequentItemSet.sort(key=lambda x: -x[1])
for itemSet in frequentItemSet:
    itemSet_list = list(itemSet[0])
    if len(itemSet_list) > 1:
        print("[" + str(counter) + "]")
        for item in itemSet_list:
            print(dictItemToDescription[dictKeyToItem[item]])
        print("Minimal Support = " + str(itemSet[1]))
        counter += 1
        if counter > 15:
            break

# search for association rules
associationRuleItemList = or3.association_rules(
    dict(or3.frequent_itemsets(ItemList, 0.02)), 0.001)
rules = list(associationRuleItemList)
rules.sort(key=lambda x: -x[3])

print("10 ASSOCIATION RULE WITH GREATER SUPPORT")
counter = 1

for rule in rules:
    rule_list = list(rule)
    print("[" + str(counter) + "]")
    print(
        str(dictItemToDescription[dictKeyToItem[list(rule[0])[0]]]) + " => " +
        str(dictItemToDescription[dictKeyToItem[list(rule[1])[0]]]))
    print("Minimal Support = " + str(rule[2]))
    print("Confidence      = " + str(rule[3]))
    counter += 1
    if counter > 10:
        break
def asso_analysis(path, file_path):
    if not os.path.exists(root + '/asso_analysis/'):
        os.mkdir(root + '/asso_analysis/')

    if not os.path.exists(root + '/asso_analysis/err_label_clean.csv'):
        data = pd.read_csv(path + file_path, encoding='utf-8')
        room_list = data['PAR_ROOM'].unique().tolist()
        room_type_map = {}
        # build a mapping from machine-room type to room IDs
        for r in room_list:
            r_info = data[data['PAR_ROOM'] == r].dropna(axis=1)
            if str(r_info.columns.tolist()) not in room_type_map.keys():
                room_type_map[str(r_info.columns.tolist())] = [r]
            else:
                room_type_map[str(r_info.columns.tolist())].append(r)

        # build the labelled data
        label_df = pd.DataFrame()
        for k in room_type_map.keys():
            same_type_room = room_type_map.get(k)
            df = pd.DataFrame()
            for r in same_type_room:
                df = df.append(data[data['PAR_ROOM'] == r], ignore_index=True)
            df.dropna(axis=1, inplace=True)

            col = [c for c in df.columns.tolist() if c not in ['TIME','PAR_ROOM','ALARM_CAUSE']]
            cur_label = pd.DataFrame()
            cur_label['before_err'] = list(range(24, 0, -1)) * int(df.shape[0] / 24) + list(range(24, 24 - int(df.shape[0] % 24), -1))
            cur_label['err_feature'] = to_label(col, df, cur_label['before_err'].values.tolist())
            cur_label['ALARM_CAUSE'] = df['ALARM_CAUSE']
            label_df = label_df.append(cur_label)
        label_df.dropna(inplace=True)
        label_df.to_csv(root + '/asso_analysis/err_label_clean.csv', index=False, encoding='utf-8')
    cur_cate = pd.read_csv(root + '/asso_analysis/err_label_clean.csv', encoding='utf-8', low_memory=False)

    cur_cate.dropna(inplace=True)
    cate_dict = {'R_LOS': 161, 'NE_NOT_LOGIN': 161, 'High Temperature': 161, 'NE_COMMU_BREAK': 161, 'lossOfSignal': 161, 'R_LOF': 161, 'IN_PWR_HIGH': 161, 'POWERALM': 161, 'HARD_BAD': 161,
                 'NE_Backup_Failed': 161, 'Comms fail alarm': 161, 'FCS_ERR': 161, 'LSR_NO_FITED': 161, 'PKG_FAIL': 161, 'IN_PWR_FAIL': 161, 'BUS_ERR': 161, 'PLUGGABLE_TRANSCEIVER_DISMOUNT': 161,
                 'R_OOF': 161, 'PWR_MAJ_ALM': 161, 'Client Service Mismatch': 161, 'UNKNOWN_CARD': 161, 'OS-Optical_Power_High': 161, 'GNE_CONNECT_FAIL': 161,
                 'Replaceable Unit Problem': 162, 'Loss Of Signal': 162, 'LOS': 162, 'LOF': 162, 'IN_PWR_ABN': 162, 'OUT_PWR_ABN': 162, 'Underlying Resource Unavailable': 162, 'Loss Of Frame': 162,
                 'ME loss of communication': 162, 'COMMUN_FAIL': 162, 'TEMP_OVER': 162, 'BD_STATUS': 162, 'SUBCARD_ABN': 162, 'POWER_FAIL': 162, 'Duplicate Shelf Detected': 162,
                 'NE_DATA_INCONSISTENCY': 162, 'SYSBUS_FAIL': 162, 'SHELF_ABSENCE': 162, 'ABSENCE_WARNING': 162, 'POWER_ABNORMAL': 162, 'Bipolar Violations': 162, 'Transmitter Failure': 162, 'CHIP_FAIL': 162,
                 'BUS_ERROR': 162, 'LAPS_FAIL': 162, 'Degraded Signal': 163, 'Signal Degrade': 163, 'Internal Communication Problem': 163, 'RDI': 163,
                 'cntrlBusFail': 163, 'BD_NOT_INSTALLED': 163, 'FAN_FAIL': 163, 'SYN_BAD': 163, 'Circuit Pack Mismatch': 163, 'Fan Failed': 163, 'Replaceable Unit Missing': 163,
                 'Fuse Failure': 163, 'Battery Failure': 163, 'Temperature Out Of Range': 163, 'Power Failure - B': 163, 'Database Save and Restore Failed': 163, 'Cooling Fan Failure': 163,
                 'MIB backup misaligned': 164, 'Inside Failure': 164, 'Sfwr Environment Problem': 164, 'HouseKeeping': 164}

    err_type = ['161', '162', '163', '164']
    # err_type = cur_cate['ALARM_CAUSE'].unique().tolist()
    cur_cate['ALARM_CAUSE'] = cur_cate['ALARM_CAUSE'].apply(lambda x: str(cate_dict[x]) if x in cate_dict.keys() else "-1")
    cur_cate['err_feature'] = cur_cate['err_feature'].apply(lambda x: x.split("|"))

    err_feature = []
    last_before_err = 24
    items_dict = {'161': [], '162': [], '163': [], '164': []}
    for index, row in cur_cate.iterrows():
        err_feature.append(row['err_feature'])
        if last_before_err < row['before_err'] or index == cur_cate.shape[0]-1:
            cause = cur_cate.loc[index - 1, 'ALARM_CAUSE']
            items_dict[cause] += err_feature
            err_feature.clear()
        last_before_err = row['before_err']

    d_itemsets = {}
    for c in err_type:
        # frequent itemsets
        each_itemsets = dict(oaf.frequent_itemsets(items_dict[c], 0.0125))
        total = 0
        # association rules
        for k in each_itemsets.keys():
            s = set(k)
            s.add(c)
            d_itemsets[frozenset(s)] = each_itemsets[k]
            if k not in d_itemsets:
                d_itemsets[k] = each_itemsets[k]
            else:
                d_itemsets[k] += each_itemsets[k]
            total += each_itemsets[k]
        d_itemsets[frozenset([c])] = total
    rules = list(oaf.association_rules(d_itemsets, 0.7))
    cur_result = pd.DataFrame(rule_process(rules, err_type), columns=('规则', '置信度'))
    cur_result.to_csv(root + '/asso_analysis/associate_analysis.csv', encoding='utf-8', header=True, index=False)
Example 14
import pandas
import pyodbc

suppParam = 0.1
confParam = 0.7

_conn = pyodbc.connect(
    r"DRIVER={SQL Server};SERVER=(local)\sql2017;Database=PythonDemo;Trusted_Connection=yes;"
)
_sql = "SELECT [Departments] as [Values] FROM [dbo].[CombinedSets] WHERE StoreCode=20"
InputDataSet = pandas.read_sql_query(sql=_sql, con=_conn)

mlb = MultiLabelBinarizer(sparse_output=True)
X = mlb.fit_transform(InputDataSet["Values"].str.split(r",\s*")) > 0
classes = mlb.classes_

itemsets = dict(frequent_itemsets(X, suppParam))

rules = [[
    ", ".join(classes[i] for i in P), classes[next(iter(Q))], supp, conf
] for P, Q, supp, conf in association_rules(itemsets, confParam)]

OutputDataSet = pandas.DataFrame(rules,
                                 columns=["ante", "cons", "supp", "conf"])
rows = len(InputDataSet)
OutputDataSet["suppPCT"] = pandas.Series([(i / rows)
                                          for i in OutputDataSet["supp"]],
                                         dtype="float")
OutputDataSet = OutputDataSet.sort_values(["conf"], ascending=False)

print(OutputDataSet)
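
# A minimal in-memory variant of the same pipeline (illustrative sketch, not from the
# original snippet, no SQL Server): one-hot encode comma-separated "baskets" with
# scikit-learn's MultiLabelBinarizer and feed the boolean matrix to
# frequent_itemsets / association_rules.
import pandas
from sklearn.preprocessing import MultiLabelBinarizer
from orangecontrib.associate.fpgrowth import frequent_itemsets, association_rules

baskets = pandas.Series(["milk, bread", "milk, butter", "bread, butter, milk", "bread"])
mlb = MultiLabelBinarizer(sparse_output=True)
X = mlb.fit_transform(baskets.str.split(r",\s*")) > 0
classes = mlb.classes_

itemsets = dict(frequent_itemsets(X, 0.5))
for P, Q, supp, conf in association_rules(itemsets, 0.7):
    print(", ".join(classes[i] for i in P), "->",
          ", ".join(classes[i] for i in Q), supp, conf)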
    transaction.append(tag_numbers)

#print(transaction)

#df2 = pd.read_csv("/Users/ahmaddorri/Desktop/tag recomendation/data/mixed/youtube.words",header=None ,sep=" ")
#print(df2.head())
sampleTransaction = np.random.choice(transaction, size=2000,
                                     replace=False).tolist()
#print(sampleTransaction)

import orangecontrib.associate.fpgrowth as org

T = [["unicef", "child", "united", "nation"],
     ["education", "child", "game", "math"],
     ["unicef", "education", "child", "job"]]

#freq_item = org.frequent_itemsets(T,2)

itemsets = dict(org.frequent_itemsets(T, 1))

#print(list(freq_item))
print(itemsets)
print(len(itemsets))

rules = org.association_rules(itemsets, min_confidence=0.49)
rules = list(rules)
for r in rules:
    print(r)
    if ("unicef" in r[0]):
        print(r[0])
import os, os.path
import codecs
import EMRdef
import re
emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRryzd')  # extract the .txt files from the directory
hxjb = open(r'D:\python\EMR\hxjbml.txt', errors="ignore")  # list of respiratory diseases
hxjbdic = hxjb.readlines()  # read the lines
ryzd = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" needed for Chinese text
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    line_out = []
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalise every '...肺炎' (pneumonia) mention to just '肺炎'
        for hxjbc in hxjbdic:  # search for each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
        line_output = EMRdef.delre(line_out)
        ryzd.append(line_out)
        #line = '\n'.join(line_output)
        #EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2','.txt' ,emrpath,line)

import orangecontrib.associate.fpgrowth as oaf
often = dict(oaf.frequent_itemsets(ryzd, .01))  # mine frequent itemsets

rules = oaf.association_rules(often, .01, frozenset({'肺炎'}))  # set the confidence threshold here and restrict rules to the frozenset({'肺炎'}) itemset
rules = list(rules)

print(rules)
import sys
import os, os.path,shutil
import codecs 
import EMRdef
import re

#keyword extraction; the keyword is '诊疗计划' (the treatment-plan section)
emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHR')  # extract the .txt files from the directory
pattern2 = r'。|:|“|”|;|,'  # split on these punctuation marks
tgjc = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" needed for Chinese text
    emrtxt = os.path.basename(emrtxt)
    emrtxt_str = re.findall(r'(^.+?)\_', emrtxt)  # extract the ID
    emrtxt = "".join(emrtxt_str)  # convert to str
    #txtp=txtp.decode('utf-8')
    for line in f.readlines():
        line = re.sub(' ', '', line)  # remove spaces
        if line.find(u'体格检查') > -1:
            line = re.sub('体格检查:', '', line)
            f2_end = re.split(pattern2, line)
            tgjc.append(f2_end)
            f2_out = "\n".join(f2_end)  # convert to str
            #EMRdef.text_create(r'D:\DeepLearning ER\EHRtigejiancha','.txt' ,emrtxt,f2_out)  # export
            #zljhs.append(emrtxt+':'+line)
#EMRdef.text_save('D:\python\EMR\zljh.txt',zljhs)'''
'''------------------------------------------------------------------------------------------------------------'''
# import the association-rule library
import orangecontrib.associate.fpgrowth as oaf
itemsets = dict(oaf.frequent_itemsets(tgjc, 0.02))  # frequent itemsets must be mined first (the 0.02 support threshold is an assumed value)
rules = oaf.association_rules(itemsets, 0.2)  # set the confidence threshold here
rules = list(rules)
Example 18
def model(data, support=0.05, confidence=0.2):
    fre_ite = dict(oaf.frequent_itemsets(data, support))  # mine frequent itemsets at the given support
    rules = oaf.association_rules(fre_ite, confidence)  # generate rules at the given confidence
    result = list(rules)
    return result
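
# A minimal usage sketch for model() above (illustrative only; the import alias `oaf`
# and the toy transactions are assumptions, not part of the original snippet).
import orangecontrib.associate.fpgrowth as oaf

toy_data = [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'butter'], ['bread']]
for antecedent, consequent, supp, conf in model(toy_data, support=0.5, confidence=0.7):
    print(set(antecedent), '->', set(consequent), 'support =', supp, 'confidence =', conf)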
Example 19
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '支持度', '力度', '提升度',
                                 '利用度'))


if __name__ == '__main__':
    supportRate = 0.004
    confidenceRate = 0.6
    itemsets = dict(oaf.frequent_itemsets(ryzd, supportRate))
    rules = oaf.association_rules(itemsets, confidenceRate)
    rules = list(rules)
    regularNum = len(rules)
    printRules = dealRules(rules)
    result = list(oaf.rules_stats(rules, itemsets,
                                  len(ryzd)))  # note: rules_stats consumes the rules iterator, so rules is used up afterwards!
    printResult = dealResult(result)

    #################################################
    # Save the results below as an Excel file
    dfToSave = ResultDFToSave(result)
    dfToSave.to_excel(r'C:\Users\Administrator\Desktop\2.xlsx')

    #######################################################
    # Below: number of association rules obtained for different support and confidence thresholds
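    # The grid scan below is a sketch that mirrors the earlier examples in this collection;
    # it is not part of this snippet, and the 0.01/0.1 step sizes are assumed values.
    listTable = []
    supportStep = 0.01
    confidenceStep = 0.1
    for i in range(9):
        support = supportStep * (i + 1)
        listS = []
        for j in range(9):
            confidence = confidenceStep * (j + 1)
            itemsets = dict(oaf.frequent_itemsets(ryzd, support))
            rules = list(oaf.association_rules(itemsets, confidence))
            listS.append(len(rules))
        listTable.append(listS)
    dfList = pd.DataFrame(listTable,
                          index=[supportStep * (i + 1) for i in range(9)],
                          columns=[confidenceStep * (j + 1) for j in range(9)])
    dfList.to_excel('regularNum.xlsx')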
Example 20
# So the items ‘4’ and ‘25’ (fifth and twenty-sixth columns of X) are the only items (and itemsets) that appear 10 or more times. Let’s check this:
# (this fragment assumes: import numpy as np; import orangecontrib.associate.fpgrowth as ofpg)
print((X.sum(axis=0) >= 10).nonzero()[1])

# Conclusion: Given databases of uniformly distributed random data, there’s not much to work with.




# Examples with rules
np.random.seed(0)
N = 100
X = np.random.random((N, 100)) > .9

# Find all itemsets with at least 5% support:
itemsets = dict(ofpg.frequent_itemsets(X, .05))

# Generate all association rules from these itemsets with minimum 50% confidence:
rules = ofpg.association_rules(itemsets, .5)
rules = list(rules)

# Or only the rules for a particular itemset:
print(list(ofpg.association_rules(itemsets, .3, frozenset({75, 98}))))


# Examples of additional stats for rules generated by association_rules()
N = 30
X = np.random.random((N, 50)) > .9
itemsets = dict(ofpg.frequent_itemsets(X, .1))
rules = ofpg.association_rules(itemsets, .6)
print(list(ofpg.rules_stats(rules, itemsets, N)))
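
# A small follow-up sketch (not from the original): collect the rules_stats() output into a
# pandas DataFrame, assuming each tuple is (antecedent, consequent, support, confidence,
# coverage, strength, lift, leverage).
import pandas as pd

itemsets = dict(ofpg.frequent_itemsets(X, .1))
stats = ofpg.rules_stats(ofpg.association_rules(itemsets, .6), itemsets, N)
df = pd.DataFrame(
    [(set(P), set(Q), supp, conf, covr, strg, lift, levr)
     for P, Q, supp, conf, covr, strg, lift, levr in stats],
    columns=['antecedent', 'consequent', 'support', 'confidence',
             'coverage', 'strength', 'lift', 'leverage'])
print(df.sort_values('lift', ascending=False).head())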