def ResultDFToSave(rules):  #build and return a DataFrame from the rules produced by Orange3 association analysis
    returnRules = []
    for i in rules:
        temList = []
        temStr = ''
        for j in i[0]:  #process the first frozenset (the antecedent)
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temList.append(temStr)
        temList.append(i[2])
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '覆盖度', '力度', '提升度',
                                 '利用度'))  # columns: rule, itemset count, confidence, coverage, strength, lift, leverage

    supportRate = 0.02
    confidenceRate = 0.5
    itemsets = dict(oaf.frequent_itemsets(listToAnalysis, supportRate))
    rules = oaf.association_rules(itemsets, confidenceRate)
    rules = list(rules)
    regularNum = len(rules)
    printRules = dealRules(rules)
    result = list(oaf.rules_stats(
        rules, itemsets, len(listToAnalysis)))  #note: rules_stats consumes rules -- it cannot be iterated again afterwards!
    printResult = dealResult(result)

    #################################################  Save the results below to an Excel file
    dfToSave = ResultDFToSave(result)
    saveRegularName = str(supportRate) + '支持度_' + str(
        confidenceRate) + '置信度_产生了' + str(regularNum) + '条规则' + '.xlsx'  # filename encodes support (支持度), confidence (置信度) and rule count (条规则)
    dfToSave.to_excel(saveRegularName)

    #######################################################  Below: rule counts obtained for different support and confidence thresholds
    listTable = []
    supportRate = 0.01
    confidenceRate = 0.1
    for i in range(9):
        support = supportRate * (i + 1)
        listS = []
        for j in range(9):
            confidence = confidenceRate * (j + 1)
            itemsets = dict(oaf.frequent_itemsets(listToAnalysis, support))
            rules = list(oaf.association_rules(itemsets, confidence))
            listS.append(len(rules))
        listTable.append(listS)
    dfList = pd.DataFrame(listTable,
                          index=[supportRate * (i + 1) for i in range(9)],
                          columns=[confidenceRate * (i + 1) for i in range(9)])
    dfList.to_excel('regularNum.xlsx')
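# For reference, ResultDFToSave above assumes the tuple layout yielded by
# oaf.rules_stats: (antecedent, consequent, support, confidence, coverage,
# strength, lift, leverage) -- the same eight fields the Orange widget code
# further below unpacks. A minimal, self-contained sketch on toy transactions
# (names and data here are illustrative only):
import orangecontrib.associate.fpgrowth as oaf

toy_transactions = [[1, 2, 3], [1, 2], [2, 3], [1, 3], [1, 2, 3]]
toy_itemsets = dict(oaf.frequent_itemsets(toy_transactions, 0.4))
toy_rules = list(oaf.association_rules(toy_itemsets, 0.6))
for stat in oaf.rules_stats(toy_rules, toy_itemsets, len(toy_transactions)):
    print(stat)  # (P, Q, supp, conf, coverage, strength, lift, leverage)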
Example #2
# imports needed by this snippet (fpgrowth is assumed to be orangecontrib.associate.fpgrowth)
import time

import numpy as np
import orangecontrib.associate.fpgrowth as fpgrowth


def fpgrowth2(item_dataset, min_support=0.1, repetitions=10):

    #Transform data: turn each 0/1 row into a list of item indices
    itemsets = []
    for row in item_dataset:
        vals = []
        for i, val in enumerate(row):
            if val == 1:
                vals.append(i)
        itemsets.append(vals)

    times = []
    for _ in range(repetitions):
        t0 = time.time()
        patterns = list(fpgrowth.frequent_itemsets(itemsets, min_support))  # materialize: frequent_itemsets is lazy, so force the mining inside the timed block
        times.append(time.time() - t0)

    fp_item_sets = []
    for (items, count) in patterns:
        sup = count / len(item_dataset)
        conds = [[x, 1] for x in items]
        fp_item_sets.append([sup, conds])
    exc_time = np.mean(times)

    return fp_item_sets, exc_time
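# Hedged usage sketch for fpgrowth2 above: item_dataset is a 0/1 matrix whose
# columns are item indices; the matrix and names below are made up for
# illustration only.
demo_matrix = [
    [1, 1, 0, 1],
    [1, 0, 1, 0],
    [1, 1, 1, 0],
]
demo_itemsets, demo_mean_time = fpgrowth2(demo_matrix, min_support=0.5, repetitions=3)
print(demo_itemsets)   # [[support, [[item, 1], ...]], ...]
print(demo_mean_time)  # mean wall-clock time over the repetitions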
Example #3
import orangecontrib.associate.fpgrowth as fp  # import assumed for the fp alias used below


def rules_extractor(X, profundidades=range(4), metric=0.3):
    res = {}

    for i in profundidades:
        T = transacciones_profundidad(X, i)

        itemsets = dict(fp.frequent_itemsets(T, metric))
        rules = [
            (P, Q, supp, conf)
            for P, Q, supp, conf in fp.association_rules(itemsets, metric)
        ]

        res[i] = (itemsets, rules)

    return res
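# Hedged usage sketch for rules_extractor above. It reuses the same `metric`
# value both as the minimum support for fp.frequent_itemsets and as the
# minimum confidence for fp.association_rules. transacciones_profundidad is
# not defined in this snippet, so the stub below is a purely hypothetical
# stand-in that ignores the depth argument.
def transacciones_profundidad(X, depth):  # hypothetical stub for illustration
    return X

demo_res = rules_extractor([["a", "b"], ["a", "c"], ["a", "b", "c"]],
                           profundidades=range(2), metric=0.3)
print(demo_res[0])  # (itemsets, rules) for depth 0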
Example #4
    def doAnalysize(self,
                    pd_data,
                    category,
                    supportRate=0.02,
                    confidenceRate=0.5,
                    savepath=r'C:\Users\Administrator\Desktop'):
        # initialize the user-dictionary path for this category
        savepath = savepath + "\\" + category
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        initpath = "tmall\\spiders\\DataAnalysize\\jiebaInit\\" + category + ".txt"
        jieba.load_userdict(initpath)
        pd_data['ratecontent_list'] = pd_data.apply(
            lambda r: list(jieba.cut(r['rateContent'])), axis=1)

        aim_list = []
        with open(initpath, 'r', encoding="utf-8") as f:
            for line in f.readlines():
                aim_list.append(line.strip('\n'))
        pd_data['aim_list'] = pd_data.apply(lambda r: list(
            set(r['ratecontent_list']).intersection(set(aim_list))),
                                            axis=1)
        simple_aimdata = []
        pd_data.apply(lambda r: simple_aimdata.append(r['aim_list'])
                      if not r['aim_list'] == [] else 1,
                      axis=1)
        wordcloudlist = []
        for item in simple_aimdata:
            for i in item:
                wordcloudlist.append(i)
        # generate a word cloud for each analysis category
        self.everyWordCloud(wordcloudlist, savepath)

        #the two operations above yield the target list: simple_aimdata
        strSet = set(functools.reduce(lambda a, b: a + b, simple_aimdata))
        strEncode = dict(zip(strSet, range(
            len(strSet))))  # encoding dict, e.g. {'甜腻': 6, '鱼腥味': 53, ...}
        strDecode = dict(
            zip(strEncode.values(),
                strEncode.keys()))  # decoding dict, e.g. {6: '甜腻', 53: '鱼腥味', ...}
        listToAnalysis_int = [
            list(map(lambda item: strEncode[item], row))
            for row in simple_aimdata
        ]
        # run the association analysis
        itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, supportRate))
        # print("itemsets : ")
        # print(itemsets)
        rules = oaf.association_rules(itemsets, confidenceRate)
        rules = list(rules)
        regularNum = len(rules)
        printRules = self.dealRules(rules, strDecode)  # print this variable to inspect the generated rules
        # print(printRules)
        result = list(oaf.rules_stats(
            rules, itemsets,
            len(listToAnalysis_int)))  # note: rules_stats consumes rules -- it cannot be reused afterwards!
        # print(result)
        printResult = self.dealResult(result, strDecode)  # print this variable to inspect the results
        # print(printResult)

        #################################################  Save the results below to an Excel file
        # save rules to excel
        dfToSave = self.ResultDFToSave(result, strDecode)
        saveRegularName = savepath + "\\" + str(supportRate) + '支持度_' + str(
            confidenceRate) + '置信度_产生了' + str(regularNum) + '条规则' + '.xlsx'
        dfToSave.to_excel(saveRegularName)
        # save itemsets to excel
        self.saveItemSets(itemsets, strDecode, savepath)

        #######################################################  Below: rule counts obtained for different support and confidence thresholds
        listTable = []
        supportRate = 0.01
        confidenceRate = 0.1
        for i in range(9):
            support = supportRate * (i + 1)
            listS = []
            for j in range(9):
                confidence = confidenceRate * (j + 1)
                itemsets = dict(
                    oaf.frequent_itemsets(listToAnalysis_int, support))
                rules = list(oaf.association_rules(itemsets, confidence))
                listS.append(len(rules))
            listTable.append(listS)
        dfList = pd.DataFrame(
            listTable,
            index=[supportRate * (i + 1) for i in range(9)],
            columns=[confidenceRate * (i + 1) for i in range(9)])
        dfList.to_excel(savepath + "\\" + 'regularNum.xlsx')
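# The encode/decode pattern used in doAnalysize above (map every token to an
# integer code before mining, then map the codes back afterwards), as a small
# standalone sketch with made-up tokens:
import functools

demo_rows = [['甜腻', '好吃'], ['鱼腥味'], ['好吃', '鱼腥味']]
demo_vocab = set(functools.reduce(lambda a, b: a + b, demo_rows))
demo_encode = dict(zip(demo_vocab, range(len(demo_vocab))))
demo_decode = dict(zip(demo_encode.values(), demo_encode.keys()))
demo_int = [[demo_encode[t] for t in row] for row in demo_rows]
print(demo_int)
print([[demo_decode[c] for c in row] for row in demo_int])  # round-trips back to the tokens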
Example #5
import os, os.path,shutil
import codecs 
import EMRdef
import re

#keyword extraction; the keyword is the diagnosis and treatment plan
emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHR')#extract the txt directory
pattern2 = r'。|:|“|”|;|,'#split on (Chinese) punctuation
tgjc = []
for emrtxt in emrtxts:
    f = open(emrtxt,'r',errors="ignore")#errors ignored for Chinese text
    emrtxt = os.path.basename(emrtxt)
    emrtxt_str = re.findall(r'(^.+?)\_',emrtxt)#extract the ID
    emrtxt = "".join(emrtxt_str)#convert to str
    #txtp=txtp.decode('utf-8')
    for line in f.readlines():
        line = re.sub(' ','',line)#remove spaces
        if line.find (u'体格检查') >-1:
            line = re.sub('体格检查:','',line)
            f2_end = re.split(pattern2,line)
            tgjc.append(f2_end)
            f2_out = "\n".join(f2_end)#convert to str
            #EMRdef.text_create(r'D:\DeepLearning ER\EHRtigejiancha','.txt' ,emrtxt,f2_out)#export
            #zljhs.append(emrtxt+':'+line)
#EMRdef.text_save('D:\python\EMR\zljh.txt',zljhs)'''
'''------------------------------------------------------------------------------------------------------------'''
#import the association-rule mining module
import orangecontrib.associate.fpgrowth as oaf
often=dict(oaf.frequent_itemsets(tgjc, .02))
rules = oaf.association_rules(often, .5)   #confidence threshold set here
rules = list(rules)
    for i in rules:
        temList = []
        temStr = ''
        for j in i[0]:  #process the first frozenset (the antecedent)
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temList.append(temStr)
        temList.append(i[2])
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '覆盖度', '力度', '提升度',
                                 '利用度'))  # columns: rule, itemset count, confidence, coverage, strength, lift, leverage


often = dict(oaf.frequent_itemsets(ryzd, .02))
rules = oaf.association_rules(often, .5)  #confidence threshold set here
rules = list(rules)
printRules = dealRules(rules)
result = list(oaf.rules_stats(rules, often,
                              len(ryzd)))  #note: rules_stats consumes rules -- it cannot be reused afterwards!
printResult = dealResult(result)
print(printResult)
Example #7
import orangecontrib.associate.fpgrowth as oaf  # import assumed for the oaf alias used below


def model(data, support=0.05, confidence=0.2):
    fre_ite = dict(oaf.frequent_itemsets(data, support))  #minimum support set here
    rules = oaf.association_rules(fre_ite, confidence)
    result = list(rules)
    return result
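# Hedged usage sketch for model() above: data is a list of transactions
# (lists of hashable items); support and confidence are relative thresholds.
# The transactions below are made up for illustration.
demo_rules = model([["a", "b"], ["a", "c"], ["a", "b", "c"]],
                   support=0.3, confidence=0.6)
for antecedent, consequent, supp, conf in demo_rules:
    print(antecedent, "->", consequent, supp, conf)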
Example #8
    transaction.append(tag_numbers)

#print(transaction)

#df2 = pd.read_csv("/Users/ahmaddorri/Desktop/tag recomendation/data/mixed/youtube.words",header=None ,sep=" ")
#print(df2.head())
sampleTransaction = np.random.choice(transaction, size=2000,
                                     replace=False).tolist()
#print(sampleTransaction)

import orangecontrib.associate.fpgrowth as org

T = [["unicef", "child", "united", "nation"],
     ["education", "child", "game", "math"],
     ["unicef", "education", "child", "job"]]

#freq_item = org.frequent_itemsets(T,2)

itemsets = dict(org.frequent_itemsets(T, 1))

#print(list(freq_item))
print(itemsets)
print(len(itemsets))

rules = org.association_rules(itemsets, min_confidence=0.49)
rules = list(rules)
for r in rules:
    print(r)
    if ("unicef" in r[0]):
        print(r[0])
Example #9
    def find_rules(self):
        if self.data is None or not len(self.data):
            return
        if self._is_running:
            self._is_running = False
            return

        self.button.button.setText('Cancel')

        self._is_running = True
        data = self.data
        self.table.model().clear()

        n_examples = len(data)
        NumericItem = self.NumericItem
        StandardItem = self.StandardItem
        filterSearch = self.filterSearch
        itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
        itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
        isSizeMatch = self.isSizeMatch
        isRegexMatch = self.isRegexMatch

        X, mapping = OneHot.encode(data, self.classify)
        self.Error.need_discrete_data.clear()
        if X is None:
            self.Error.need_discrete_data()

        self.onehot_mapping = mapping
        ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
        names = {item: ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(var.name, val)
                 for item, var, val in OneHot.decode(mapping, data, mapping)}
        # Items that consequent must include if classifying
        class_items = {item
                       for item, var, val in OneHot.decode(mapping, data, mapping)
                       if var is data.domain.class_var} if self.classify else set()
        assert bool(class_items) == bool(self.classify)

        model = QStandardItemModel(self.table)
        for col, (label, _, tooltip) in enumerate(self.header):
            item = QStandardItem(label)
            item.setToolTip(tooltip)
            model.setHorizontalHeaderItem(col, item)

        # Find itemsets
        nRules = 0
        itemsets = {}
        ARROW_ITEM = StandardItem('→')
        ARROW_ITEM.setTextAlignment(Qt.AlignCenter)
        with self.progressBar(self.maxRules + 1) as progress:
            for itemset, support in frequent_itemsets(X, self.minSupport / 100):
                itemsets[itemset] = support

                if class_items and not class_items & itemset:
                    continue

                # Filter itemset by joined filters before descending into it
                itemset_str = ' '.join(names[i] for i in itemset)
                if (filterSearch and
                    (len(itemset) < itemsetMin or
                     itemsetMax < len(itemset) or
                     not isRegexMatch(itemset_str, itemset_str))):
                    continue

                for rule in association_rules(itemsets,
                                              self.minConfidence / 100,
                                              itemset):
                    left, right, support, confidence = rule

                    if class_items and right - class_items:
                        continue
                    if filterSearch and not isSizeMatch(len(left), len(right)):
                        continue
                    left_str =  ', '.join(names[i] for i in sorted(left))
                    right_str = ', '.join(names[i] for i in sorted(right))
                    if filterSearch and not isRegexMatch(left_str, right_str):
                        continue

                    # All filters matched, calculate stats and add table row
                    _, _, _, _, coverage, strength, lift, leverage = next(
                        rules_stats((rule,), itemsets, n_examples))

                    support_item = NumericItem(support / n_examples)
                    # Set row data on first column
                    support_item.setData((itemset - class_items,
                                          class_items and (class_items & itemset).pop()),
                                         self.ROW_DATA_ROLE)
                    left_item = StandardItem(left_str, len(left))
                    left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                    model.appendRow([support_item,
                                     NumericItem(confidence),
                                     NumericItem(coverage),
                                     NumericItem(strength),
                                     NumericItem(lift),
                                     NumericItem(leverage),
                                     left_item,
                                     ARROW_ITEM.clone(),
                                     StandardItem(right_str, len(right))])
                    nRules += 1
                    progress.advance()

                    if not self._is_running or nRules >= self.maxRules:
                        break

                qApp.processEvents()

                if not self._is_running or nRules >= self.maxRules:
                    break

        # Populate the TableView
        table = self.table
        table.setHidden(True)
        table.setSortingEnabled(False)
        proxy_model = self.proxy_model
        proxy_model.setSourceModel(model)
        table.setModel(proxy_model)
        for i in range(model.columnCount()):
            table.resizeColumnToContents(i)
        table.setSortingEnabled(True)
        table.setHidden(False)
        self.table_rules = proxy_model.get_data()
        if self.table_rules is not None:
            self.Outputs.rules.send(self.table_rules)

        self.button.button.setText('Find Rules')

        self.nRules = nRules
        self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
        self.nSelectedRules = 0
        self.nSelectedExamples = 0
        self._is_running = False
Example #10
    def find_itemsets(self):
        if self.data is None: return
        data = self.data
        self.tree.clear()
        self.tree.setUpdatesEnabled(False)
        self.tree.blockSignals(True)

        class ItemDict(dict):
            def __init__(self, item):
                self.item = item

        top = ItemDict(self.tree.invisibleRootItem())
        X, mapping = OneHot.encode(data)
        self.onehot_mapping = mapping
        names = {item: '{}={}'.format(var.name, val)
                 for item, var, val in OneHot.decode(mapping.keys(), data, mapping)}
        nItemsets = 0

        filterSearch = self.filterSearch
        filterMinItems, filterMaxItems = self.filterMinItems, self.filterMaxItems
        isRegexMatch = self.isRegexMatch

        # Find itemsets and populate the TreeView
        progress = gui.ProgressBar(self, self.maxItemsets + 1)
        for itemset, support in frequent_itemsets(X, self.minSupport / 100):

            if filterSearch and not filterMinItems <= len(itemset) <= filterMaxItems:
                continue

            parent = top
            first_new_item = None
            itemset_matches_filter = False

            for item in sorted(itemset):
                name = names[item]

                if filterSearch and not itemset_matches_filter:
                    itemset_matches_filter = isRegexMatch(name)

                child = parent.get(name)
                if child is None:
                    wi = self.TreeWidgetItem(parent.item, [name, str(support), '{:.1f}'.format(100 * support / len(data))])
                    wi.setData(0, self.ITEM_DATA_ROLE, item)
                    child = parent[name] = ItemDict(wi)

                    if first_new_item is None:
                        first_new_item = (parent, name)
                parent = child

            if filterSearch and not itemset_matches_filter:
                parent, name = first_new_item
                parent.item.removeChild(parent[name].item)
                del parent[name].item
                del parent[name]
            else:
                nItemsets += 1
                progress.advance()
            if nItemsets >= self.maxItemsets:
                break

        if not filterSearch:
            self.filter_change()
        self.nItemsets = nItemsets
        self.nSelectedItemsets = 0
        self.nSelectedExamples = 0
        self.tree.expandAll()
        for i in range(self.tree.columnCount()):
            self.tree.resizeColumnToContents(i)
        self.tree.setUpdatesEnabled(True)
        self.tree.blockSignals(False)
        progress.finish()
Example #11
    def find_rules(self):
        if self.data is None or not len(self.data):
            return
        if self._is_running:
            return
        self._is_running = True
        data = self.data
        self.table.model().clear()

        n_examples = len(data)
        NumericItem = self.NumericItem
        StandardItem = self.StandardItem
        filterSearch = self.filterSearch
        itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
        itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
        isSizeMatch = self.isSizeMatch
        isRegexMatch = self.isRegexMatch

        X, mapping = OneHot.encode(data, self.classify)
        self.error(911)
        if X is None:
            self.error(911, 'Need some discrete data to work with.')

        self.onehot_mapping = mapping
        ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
        names = {
            item:
            ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(
                var.name, val)
            for item, var, val in OneHot.decode(mapping, data, mapping)
        }
        # Items that consequent must include if classifying
        class_items = {
            item
            for item, var, val in OneHot.decode(mapping, data, mapping)
            if var is data.domain.class_var
        } if self.classify else set()
        assert bool(class_items) == bool(self.classify)

        model = QStandardItemModel(self.table)
        for col, (label, tooltip) in enumerate([
            ("Supp", "Support"),
            ("Conf", "Confidence (support / antecedent support)"),
            ("Covr", "Coverage (antecedent support / number of examples)"),
            ("Strg", "Strength (consequent support / antecedent support)"),
            ("Lift",
             "Lift (number of examples * confidence / consequent support)"),
            ("Levr",
             "Leverage ((support * number of examples - antecedent support * consequent support) / (number of examples)²)"
             ), ("Antecedent", None), ("", None), ("Consequent", None)
        ]):
            item = QStandardItem(label)
            item.setToolTip(tooltip)
            model.setHorizontalHeaderItem(col, item)

        #~ # Aggregate rules by common (support,confidence) for scatterplot
        #~ scatter_agg = defaultdict(list)

        # Find itemsets
        nRules = 0
        itemsets = {}
        with self.progressBar(self.maxRules + 1) as progress:
            for itemset, support in frequent_itemsets(X,
                                                      self.minSupport / 100):
                itemsets[itemset] = support

                if class_items and not class_items & itemset:
                    continue

                # Filter itemset by joined filters before descending into it
                itemset_str = ' '.join(names[i] for i in itemset)
                if (filterSearch and
                    (len(itemset) < itemsetMin or itemsetMax < len(itemset)
                     or not isRegexMatch(itemset_str, itemset_str))):
                    continue

                for rule in association_rules(itemsets,
                                              self.minConfidence / 100,
                                              itemset):
                    left, right, support, confidence = rule

                    if class_items and right - class_items:
                        continue
                    if filterSearch and not isSizeMatch(len(left), len(right)):
                        continue
                    left_str = ', '.join(names[i] for i in sorted(left))
                    right_str = ', '.join(names[i] for i in sorted(right))
                    if filterSearch and not isRegexMatch(left_str, right_str):
                        continue

                    # All filters matched, calculate stats and add table row
                    _, _, _, _, coverage, strength, lift, leverage = next(
                        rules_stats((rule, ), itemsets, n_examples))

                    support_item = NumericItem(support / n_examples)
                    # Set row data on first column
                    support_item.setData(
                        (itemset - class_items, class_items and
                         (class_items & itemset).pop()), self.ROW_DATA_ROLE)
                    left_item = StandardItem(left_str, len(left))
                    left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                    model.appendRow([
                        support_item,
                        NumericItem(confidence),
                        NumericItem(coverage),
                        NumericItem(strength),
                        NumericItem(lift),
                        NumericItem(leverage), left_item,
                        StandardItem('→'),
                        StandardItem(right_str, len(right))
                    ])
                    #~ scatter_agg[(round(support / n_examples, 2), round(confidence, 2))].append((left, right))
                    nRules += 1
                    progress.advance()
                    if nRules >= self.maxRules:
                        break
                if nRules >= self.maxRules:
                    break

        # Populate the TableView
        table = self.table
        table.setHidden(True)
        table.setSortingEnabled(False)
        proxy_model = self.proxy_model
        proxy_model.setSourceModel(model)
        table.setModel(proxy_model)
        for i in range(model.columnCount()):
            table.resizeColumnToContents(i)
        table.setSortingEnabled(True)
        table.setHidden(False)

        self.nRules = nRules
        self.nFilteredRules = proxy_model.rowCount(
        )  # TODO: continue; also add in owitemsets
        self.nSelectedRules = 0
        self.nSelectedExamples = 0
        self._is_running = False
Example #12
import codecs
import EMRdef
import re
emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRryzd')  #extract the txt directory
hxjb = open(r'D:\python\EMR\hxjbml.txt', errors="ignore")  #list of respiratory diseases
hxjbdic = hxjb.readlines()  #read the lines
ryzd = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  #errors ignored for Chinese text
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    line_out = []
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  #normalize every "...肺炎" (pneumonia) mention
        for hxjbc in hxjbdic:  #look up each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
        line_output = EMRdef.delre(line_out)
        ryzd.append(line_out)
        #line = '\n'.join(line_output)
        #EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2','.txt' ,emrpath,line)

import orangecontrib.associate.fpgrowth as oaf
often = dict(oaf.frequent_itemsets(ryzd, .01))  #build the frequent itemsets

rules = oaf.association_rules(often, .5, hxjbdic)  #confidence threshold set here
rules = list(rules)

print(rules)
Example #13
import sys
import os, os.path,shutil
import codecs 
import EMRdef
import re

#keyword extraction; the keyword is the diagnosis and treatment plan
emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHR')#extract the txt directory
pattern2 = r'。|:|“|”|;|,'#split on (Chinese) punctuation
tgjc = []
for emrtxt in emrtxts:
    f = open(emrtxt,'r',errors="ignore")#errors ignored for Chinese text
    emrtxt = os.path.basename(emrtxt)
    emrtxt_str = re.findall(r'(^.+?)\_',emrtxt)#extract the ID
    emrtxt = "".join(emrtxt_str)#convert to str
    #txtp=txtp.decode('utf-8')
    for line in f.readlines():
        line = re.sub(' ','',line)#remove spaces
        if line.find (u'体格检查') >-1:
            line = re.sub('体格检查:','',line)
            f2_end = re.split(pattern2,line)
            tgjc.append(f2_end)
            f2_out = "\n".join(f2_end)#convert to str
            #EMRdef.text_create(r'D:\DeepLearning ER\EHRtigejiancha','.txt' ,emrtxt,f2_out)#export
            #zljhs.append(emrtxt+':'+line)
#EMRdef.text_save('D:\python\EMR\zljh.txt',zljhs)'''
'''------------------------------------------------------------------------------------------------------------'''
#import the association-rule mining module
import orangecontrib.associate.fpgrowth as oaf
oaf.frequent_itemsets(tgjc, .02)
Example #14
    def find_itemsets(self):
        if self.data is None:
            return
        if self._is_running:
            return
        self._is_running = True

        data = self.data
        self.tree.clear()
        self.tree.setUpdatesEnabled(False)
        self.tree.blockSignals(True)

        class ItemDict(dict):
            def __init__(self, item):
                self.item = item

        top = ItemDict(self.tree.invisibleRootItem())
        X, mapping = OneHot.encode(data)
        self.onehot_mapping = mapping
        ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
        names = {
            item: ITEM_FMT.format(var.name, val)
            for item, var, val in OneHot.decode(mapping.keys(), data, mapping)
        }
        nItemsets = 0

        filterSearch = self.filterSearch
        filterMinItems, filterMaxItems = self.filterMinItems, self.filterMaxItems
        isRegexMatch = self.isRegexMatch

        # Find itemsets and populate the TreeView
        with self.progressBar(self.maxItemsets + 1) as progress:
            for itemset, support in frequent_itemsets(X,
                                                      self.minSupport / 100):

                if filterSearch and not filterMinItems <= len(
                        itemset) <= filterMaxItems:
                    continue

                parent = top
                first_new_item = None
                itemset_matches_filter = False

                for item in sorted(itemset):
                    name = names[item]

                    if filterSearch and not itemset_matches_filter:
                        itemset_matches_filter = isRegexMatch(name)

                    child = parent.get(name)
                    if child is None:
                        try:
                            wi = self.TreeWidgetItem(parent.item, [
                                name,
                                str(support), '{:.4g}'.format(
                                    100 * support / len(data))
                            ])
                        except RuntimeError:
                            # FIXME: When autoFind was in effect and the support
                            # slider was moved, this line excepted with:
                            #     RuntimeError: wrapped C/C++ object of type
                            #                   TreeWidgetItem has been deleted
                            return
                        wi.setData(0, self.ITEM_DATA_ROLE, item)
                        child = parent[name] = ItemDict(wi)

                        if first_new_item is None:
                            first_new_item = (parent, name)
                    parent = child

                if filterSearch and not itemset_matches_filter:
                    parent, name = first_new_item
                    parent.item.removeChild(parent[name].item)
                    del parent[name].item
                    del parent[name]
                else:
                    nItemsets += 1
                    progress.advance()
                if nItemsets >= self.maxItemsets:
                    break

        if not filterSearch:
            self.filter_change()
        self.nItemsets = nItemsets
        self.nSelectedItemsets = 0
        self.nSelectedExamples = 0
        self.tree.expandAll()
        for i in range(self.tree.columnCount()):
            self.tree.resizeColumnToContents(i)
        self.tree.setUpdatesEnabled(True)
        self.tree.blockSignals(False)
        self._is_running = False
Example #15
        dictKeyToItem[IdCounter] = row[1]
        IdCounter += 1
    dictCounterItem[row[1]] += 1
    Basket.append(dictItemToKey[row[1]])
    if row[0] != firstIndex:
        ItemList.append(Basket)
        Basket = emptyList()
        firstIndex = row[0]
    counter += 1
    if counter % 50000 == 0:
        print(str(counter) + "..")

# -- end of make ItemList

# search for frequent itemset
frequentItemSet = list(or3.frequent_itemsets(ItemList, 0.02))

print("FREQUENT PATTERN WITH MORE THAN 1 ITEM")
counter = 1
frequentItemSet.sort(key=lambda x: -x[1])
for itemSet in frequentItemSet:
    itemSet_list = list(itemSet[0])
    if len(itemSet_list) > 1:
        print("[" + str(counter) + "]")
        for item in itemSet_list:
            print(dictItemToDescription[dictKeyToItem[item]])
        print("Minimal Support = " + str(itemSet[1]))
        counter += 1
        if counter > 15:
            break
Example #16
def asso_analysis(path, file_path):
    if not os.path.exists(root + '/asso_analysis/'):
        os.mkdir(root + '/asso_analysis/')

    if not os.path.exists(root + '/asso_analysis/err_label_clean.csv'):
        data = pd.read_csv(path + file_path, encoding='utf-8')
        room_list = data['PAR_ROOM'].unique().tolist()
        room_type_map = {}
        # build a mapping from machine-room type to room IDs
        for r in room_list:
            r_info = data[data['PAR_ROOM'] == r].dropna(axis=1)
            if str(r_info.columns.tolist()) not in room_type_map.keys():
                room_type_map[str(r_info.columns.tolist())] = [r]
            else:
                room_type_map[str(r_info.columns.tolist())].append(r)

        # build the labelled data
        label_df = pd.DataFrame()
        for k in room_type_map.keys():
            same_type_room = room_type_map.get(k)
            df = pd.DataFrame()
            for r in same_type_room:
                df = df.append(data[data['PAR_ROOM'] == r], ignore_index=True)
            df.dropna(axis=1, inplace=True)

            col = [c for c in df.columns.tolist() if c not in ['TIME','PAR_ROOM','ALARM_CAUSE']]
            cur_label = pd.DataFrame()
            cur_label['before_err'] = list(range(24, 0, -1)) * int(df.shape[0] / 24) + list(range(24, 24 - int(df.shape[0] % 24), -1))
            cur_label['err_feature'] = to_label(col, df, cur_label['before_err'].values.tolist())
            cur_label['ALARM_CAUSE'] = df['ALARM_CAUSE']
            label_df = label_df.append(cur_label)
        label_df.dropna(inplace=True)
        label_df.to_csv(root + '/asso_analysis/err_label_clean.csv', index=False, encoding='utf-8')
    cur_cate = pd.read_csv(root + '/asso_analysis/err_label_clean.csv', encoding='utf-8', low_memory=False)

    cur_cate.dropna(inplace=True)
    cate_dict = {'R_LOS': 161, 'NE_NOT_LOGIN': 161, 'High Temperature': 161, 'NE_COMMU_BREAK': 161, 'lossOfSignal': 161, 'R_LOF': 161, 'IN_PWR_HIGH': 161, 'POWERALM': 161, 'HARD_BAD': 161,
                 'NE_Backup_Failed': 161, 'Comms fail alarm': 161, 'FCS_ERR': 161, 'LSR_NO_FITED': 161, 'PKG_FAIL': 161, 'IN_PWR_FAIL': 161, 'BUS_ERR': 161, 'PLUGGABLE_TRANSCEIVER_DISMOUNT': 161,
                 'R_OOF': 161, 'PWR_MAJ_ALM': 161, 'Client Service Mismatch': 161, 'UNKNOWN_CARD': 161, 'OS-Optical_Power_High': 161, 'GNE_CONNECT_FAIL': 161,
                 'Replaceable Unit Problem': 162, 'Loss Of Signal': 162, 'LOS': 162, 'LOF': 162, 'IN_PWR_ABN': 162, 'OUT_PWR_ABN': 162, 'Underlying Resource Unavailable': 162, 'Loss Of Frame': 162,
                 'ME loss of communication': 162, 'COMMUN_FAIL': 162, 'TEMP_OVER': 162, 'BD_STATUS': 162, 'SUBCARD_ABN': 162, 'POWER_FAIL': 162, 'Duplicate Shelf Detected': 162,
                 'NE_DATA_INCONSISTENCY': 162, 'SYSBUS_FAIL': 162, 'SHELF_ABSENCE': 162, 'ABSENCE_WARNING': 162, 'POWER_ABNORMAL': 162, 'Bipolar Violations': 162, 'Transmitter Failure': 162, 'CHIP_FAIL': 162,
                 'BUS_ERROR': 162, 'LAPS_FAIL': 162, 'Degraded Signal': 163, 'Signal Degrade': 163, 'Internal Communication Problem': 163, 'RDI': 163,
                 'cntrlBusFail': 163, 'BD_NOT_INSTALLED': 163, 'FAN_FAIL': 163, 'SYN_BAD': 163, 'Circuit Pack Mismatch': 163, 'Fan Failed': 163, 'Replaceable Unit Missing': 163,
                 'Fuse Failure': 163, 'Battery Failure': 163, 'Temperature Out Of Range': 163, 'Power Failure - B': 163, 'Database Save and Restore Failed': 163, 'Cooling Fan Failure': 163,
                 'MIB backup misaligned': 164, 'Inside Failure': 164, 'Sfwr Environment Problem': 164, 'HouseKeeping': 164}

    err_type = ['161', '162', '163', '164']
    # err_type = cur_cate['ALARM_CAUSE'].unique().tolist()
    cur_cate['ALARM_CAUSE'] = cur_cate['ALARM_CAUSE'].apply(lambda x: str(cate_dict[x]) if x in cate_dict.keys() else "-1")
    cur_cate['err_feature'] = cur_cate['err_feature'].apply(lambda x: x.split("|"))

    err_feature = []
    last_before_err = 24
    items_dict = {'161': [], '162': [], '163': [], '164': []}
    for index, row in cur_cate.iterrows():
        err_feature.append(row['err_feature'])
        if last_before_err < row['before_err'] or index == cur_cate.shape[0]-1:
            cause = cur_cate.loc[index - 1, 'ALARM_CAUSE']
            items_dict[cause] += err_feature
            err_feature.clear()
        last_before_err = row['before_err']

    d_itemsets = {}
    for c in err_type:
        # frequent itemsets
        each_itemsets = dict(oaf.frequent_itemsets(items_dict[c], 0.0125))
        total = 0
        # association rules
        for k in each_itemsets.keys():
            s = set(k)
            s.add(c)
            d_itemsets[frozenset(s)] = each_itemsets[k]
            if k not in d_itemsets:
                d_itemsets[k] = each_itemsets[k]
            else:
                d_itemsets[k] += each_itemsets[k]
            total += each_itemsets[k]
        d_itemsets[frozenset([c])] = total
    rules = list(oaf.association_rules(d_itemsets, 0.7))
    cur_result = pd.DataFrame(rule_process(rules, err_type), columns=('规则', '置信度'))  # columns: rule, confidence
    cur_result.to_csv(root + '/asso_analysis/associate_analysis.csv', encoding='utf-8', header=True, index=False)
Example #17
        #handle the Country field
        s = df.iloc[i]['Country']
        s = 'Country_'+s.strip()
        listToStore.append(s)
        #print(listToStore)
        listToAnalysis.append(listToStore.copy())
        listToStore.clear()
    #encode: convert the strings in listToAnalysis to integers
    strSet = set(functools.reduce(lambda a,b:a+b, listToAnalysis))
    strEncode = dict(zip(strSet,range(len(strSet)))) #encoding dict, e.g. {'ArticleTag_BS': 6, 'Country_Argentina': 53, ...}
    strDecode = dict(zip(strEncode.values(), strEncode.keys()))  #decoding dict, e.g. {6: 'ArticleTag_BS', 53: 'Country_Argentina', ...}
    listToAnalysis_int = [list(map(lambda item:strEncode[item],row)) for row in listToAnalysis]
    #run the association analysis
    supportRate = 0.02
    confidenceRate = 0.5     
    itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, supportRate))        
    rules = oaf.association_rules(itemsets, confidenceRate)
    rules = list(rules)
    regularNum = len(rules)
    printRules = dealRules(rules,strDecode)  #print this variable to inspect the generated rules
    result = list(oaf.rules_stats(rules, itemsets, len(listToAnalysis_int)))   #note: rules_stats consumes rules -- it cannot be reused afterwards!
    printResult = dealResult(result,strDecode)  #print this variable to inspect the results
    
#################################################  Save the results below to an Excel file
    dfToSave = ResultDFToSave(result,strDecode)
    saveRegularName = str(supportRate)+'支持度_'+str(confidenceRate)+'置信度_产生了'+str(regularNum)+'条规则'+'.xlsx'
    dfToSave.to_excel(saveRegularName)

#######################################################  Below: rule counts obtained for different support and confidence thresholds
    listTable = []
    supportRate = 0.01
Example #18
import pandas
import pyodbc
# imports assumed for the names used below in this snippet
from sklearn.preprocessing import MultiLabelBinarizer
from orangecontrib.associate.fpgrowth import frequent_itemsets, association_rules

suppParam = 0.1
confParam = 0.7

_conn = pyodbc.connect(
    "DRIVER={SQL Server};SERVER=(local)\sql2017;Database=PythonDemo;Trusted_Connection=yes;"
)
_sql = "SELECT [Departments] as [Values] FROM [dbo].[CombinedSets] WHERE StoreCode=20"
InputDataSet = pandas.read_sql_query(sql=_sql, con=_conn)

mlb = MultiLabelBinarizer(sparse_output=True)
X = mlb.fit_transform(InputDataSet["Values"].str.split(",\s*")) > 0
classes = mlb.classes_

itemsets = dict(frequent_itemsets(X, suppParam))

rules = [[
    ", ".join(classes[i] for i in P), classes[next(iter(Q))], supp, conf
] for P, Q, supp, conf in association_rules(itemsets, confParam)]

OutputDataSet = pandas.DataFrame(rules,
                                 columns=["ante", "cons", "supp", "conf"])
rows = len(InputDataSet)
OutputDataSet["suppPCT"] = pandas.Series([(i / rows)
                                          for i in OutputDataSet["supp"]],
                                         dtype="float")
OutputDataSet.sort_values(["conf"], ascending=False)

print(OutputDataSet)
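# For context, a hedged sketch of what the MultiLabelBinarizer step above
# produces: a 0/1 matrix whose columns follow classes_. Department names
# below are made up for illustration.
from sklearn.preprocessing import MultiLabelBinarizer

demo_mlb = MultiLabelBinarizer(sparse_output=True)
demo_X = demo_mlb.fit_transform([["Dairy", "Bakery"], ["Bakery"], ["Dairy", "Produce"]])
print(demo_mlb.classes_)   # ['Bakery' 'Dairy' 'Produce']
print(demo_X.toarray())    # one row per basket, one column per department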
Example #19
    def processFrequentItemLists(self, inputDirs: list):
        """Process frequent item lists found in a list of input folders."""
        from orangecontrib.associate.fpgrowth import frequent_itemsets
        from os.path import isfile, exists
        from os import replace, makedirs

        inputPaths = [
            d + '/typesPerInstance.list' for d in inputDirs.split(",")
        ]

        # Check for missing files.
        for p in inputPaths:
            if not isfile(p):
                raise ValueError("File '%s' could not be found, please verify "
                                 "you have invoked the analysis software with "
                                 "the --related-files flag for this user." % p)

        # Read every file and aggregate transactions.
        tprnt("Aggregating transactions from input files...")
        transactions = []
        for p in inputPaths:
            participantFolder = p.split("/")[-2]
            tprnt("%s: %s" % (participantFolder, p))
            with open(p, 'r') as f:
                for line in f:
                    transaction = line.rstrip("\n").split("\t")
                    transaction[0] = participantFolder + "/" + transaction[0]
                    transactions.append(transaction)
        tprnt("Done.")

        # Compute itemsets from transactions.
        tprnt("\nComputing frequent itemsets.")
        itemsets = frequent_itemsets(transactions, frequency())
        tprnt("Done.")

        # Functions to sort itemsets.
        def _isPath(elem):
            return elem[0] in ['/', '~', '@']

        def _hasPath(item):
            typeCnt = 0

            for t in item[0]:
                if _isPath(t):
                    return True

            return False

        def _uniqueType(item):
            typeCnt = 0

            for t in item[0]:
                if not _isPath(t):
                    typeCnt += 1

                    # Save time.
                    if typeCnt > 1:
                        return False

            return typeCnt == 1

        def _uniqueTypeWithAccessVariations(item):
            uniqueType = None

            for t in item[0]:
                if not _isPath(t):
                    if t.endswith(":r") or t.endswith(":w"):
                        t = t[:-2]

                    if not uniqueType:
                        uniqueType = t
                    elif uniqueType != t:
                        return False

            return uniqueType != None

        def _multipleTypes(item):
            uniqueType = None

            for t in item[0]:
                if not _isPath(t):
                    if t.endswith(":r") or t.endswith(":w"):
                        t = t[:-2]

                    if not uniqueType:
                        uniqueType = t
                    elif uniqueType != t:
                        return True

            return False

        # Sort itemsets
        tprnt("\nSorting frequent itemsets to isolate mime type co-access "
              "patterns.")
        uniques = []
        patterns = dict()
        for item in itemsets:
            if _hasPath(item):
                pass
            elif _uniqueType(item):
                uniques.append(item)
            elif _uniqueTypeWithAccessVariations(item):
                pass
            elif _multipleTypes(item):
                patterns[item[0]] = item[1]
        tprnt("Done.")

        # Make output directory.
        if exists(self.outputDir):
            backup = self.outputDir.rstrip("/") + ".backup"
            if exists(backup):
                shutil.rmtree(backup)
            replace(self.outputDir, backup)
        makedirs(self.outputDir, exist_ok=False)

        # displayPatterns = dict()
        # for p in patterns:
        #     disp = set()
        #     for elem in p:
        #         if elem.endswith(":r") or elem.endswith(":w"):
        #             disp.add(elem)
        #         elif elem+":w" not in p and elem+":r" not in p:
        #             disp.add(elem)
        #     displayPatterns[p] = disp

        # Print to files.
        with open(self.outputDir + '/' + 'patterns.out', 'w') as f:
            tprnt("\nMost commonly found types:")
            print("Most commonly found types:", file=f)
            for item in sorted(uniques, key=lambda x: x[1], reverse=True):
                print("\t", item)
                print("mcft\t", item, file=f)

            tprnt("\nMost commonly found patterns:")
            print("\nMost commonly found patterns:", file=f)
            for item in sorted(patterns.items(),
                               key=lambda x: x[1],
                               reverse=True):
                print("\t", item)
                print("mcfp\t", item, file=f)
            print("", file=f)

        del itemsets

        # Match items in patterns to transactions, and print out app and file
        # names.
        tprnt("\nMatching frequent patterns to transactions...")
        transactionsPerPattern = dict()
        for t in transactions:
            for p in patterns.keys():
                if p.issubset(t):
                    matches = transactionsPerPattern.get(p) or []
                    matches.append(t)
                    transactionsPerPattern[p] = matches
        tprnt("Done.")

        def _printPattern(p, matches, counter, exclusiveCounter):
            msg = ""
            listing = ""
            summary = ""

            # Base pattern identity.
            msg += ("\n\nPATTERN: %d\t%s" % (patterns[p], p.__str__()))

            # Transaction listing.
            for matchedTransaction in matches:
                listing += ("App: %s\n" % matchedTransaction[0])
                for transactionElem in sorted(matchedTransaction[1:]):
                    listing += ("\t* %s\n" % transactionElem)
                listing += ("\n")

            # Counters of file extension co-occurrences.
            for (k, v) in sorted(counter.items()):
                summary += ("\t{%s} occurs %d times, in %d patterns\n" %
                            (','.join(k), v, counterI[k]))
            summary += "\n"
            for (k, v) in sorted(exclusiveCounter.items()):
                summary += ("\t{%s} is exclusive %d times, in %d patterns\n" %
                            (','.join(k), v, exclusiveCounterI[k]))

            # Print to files.
            with open(self.outputDir + '/' + 'patterns.out', 'a') as f:
                print(msg, file=f)
                print(summary, file=f)

            with open(self.outputDir + '/' + 'patternsListing.out', 'a') as f:
                print(msg, file=f)
                print(listing, file=f)

        # Pre-analyse the relationships between file endings in patterns.
        tprnt("\nPre-analysing the relationships between files in patterns...")
        for (p, matches) in sorted(transactionsPerPattern.items()):
            # Counter used to count combos of files with the same name and
            # different extensions.
            counter = dict()
            exclusiveCounter = dict()
            counterI = dict()
            exclusiveCounterI = dict()

            # Go through file accesses that match the pattern.
            for matchedTransaction in matches:
                # We collect sets of names for each encountered file extension.
                nameDict = dict()
                extensions = set()
                for transactionElem in sorted(matchedTransaction[1:]):
                    if not (transactionElem.startswith("/")
                            or transactionElem.startswith("~")):
                        continue

                    # Get the base name and file extension.
                    ftype = mimetypes.guess_type(transactionElem)[0]
                    fname = File.getFileNameFromPath(transactionElem)
                    fnoext = File.getNameWithoutExtensionFromPath(fname)
                    fext = File.getExtensionFromPath(fname, filterInvalid=True)

                    # Remember which exts were found for a name and overall.
                    if fext:
                        extensions.add(fext)
                        extSet = nameDict.get(fnoext) or set()
                        extSet.add(fext)
                        nameDict[fnoext] = extSet

                # Now check which extension combos exist, and how many times
                # they occur.
                extPairOccs = dict()
                for (fname, extSet) in nameDict.items():
                    fs = frozenset(extSet)
                    extPairOccs[fs] = (extPairOccs.get(fs) or 0) + 1

                # Compile list of all valid extension combos, and browse them
                # in reverse order of length as we first want to validate the
                # largest combinations.
                combos = list(extPairOccs.keys())
                combos.sort(key=len, reverse=True)

                # Count patterns which exclusively have one extension tied to
                # another (i.e. extension never appears on its own).
                exclusives = dict()
                nonExclusiveKeys = set()
                for k in combos:
                    # All the subsets of the current combo of filetypes are not
                    # exclusive since they're included in this set.
                    subcombos = list()
                    for i in range(1, len(k)):
                        subcombos.extend([
                            frozenset(x) for x in itertools.combinations(k, i)
                        ])
                    nonExclusiveKeys.update(subcombos)

                    # Also check if any of these subsets is itself in the list,
                    # if so the current set is not exclusive.
                    for sub in subcombos:
                        if sub in extPairOccs:
                            break
                    else:
                        # Remember: subsets of a previous set aren't exclusive.
                        if k not in nonExclusiveKeys:
                            exclusives[k] = extPairOccs[k]

                # Now add the match's groups of filenames to counters for the
                # whole pattern. Count both number of cases where the pattern
                # is found / exclusively found, and the number of times it is
                # found.
                for (k, v) in extPairOccs.items():
                    counter[k] = (counter.get(k) or 0) + v
                    counterI[k] = (counterI.get(k) or 0) + 1
                for (k, v) in exclusives.items():
                    exclusiveCounter[k] = (exclusiveCounter.get(k) or 0) + v
                    exclusiveCounterI[k] = (exclusiveCounterI.get(k) or 0) + 1

            # Finally, print information on the pattern.
            _printPattern(p, matches, counter, exclusiveCounter)
Example #20
from orangecontrib.associate.fpgrowth import frequent_itemsets  # import assumed by this snippet


def compute_fqis_orangefpgrowth_list(super_ilists, min_sup=0.6):
    itemsets = frequent_itemsets(super_ilists, min_sup)
    return list(itemsets)
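# Hedged usage sketch for compute_fqis_orangefpgrowth_list above: each inner
# list is a transaction of hashable items; min_sup may be a fraction of the
# transactions or an absolute count. The data below is made up.
demo_sets = compute_fqis_orangefpgrowth_list([[1, 2], [1, 3], [1, 2, 3]],
                                             min_sup=0.6)
print(demo_sets)  # [(frozenset({...}), support_count), ...]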
Example #21
def associateRules(support=0.02, confidence=0.5):
    support = 0.15
    confidence = 0.15
    try:
        with open('filelocation.json') as f_obj:
            fileInput = json.load(f_obj)
    except:
        with open('errorFlag.json', 'w') as e_obj:
            json.dump("File open process failed", e_obj)
        return
    filename = fileInput

    dfar = pd.read_csv(filename)
    tag = list(dfar.columns.values)
    listToAnalysis = []  #final result

    for item in range(1, len(tag) - 1):  #iterate over the columns
        imax = max(list(dfar[tag[item]]))  #upper bound
        imin = min(list(dfar[tag[item]]))  #lower bound

        ijc = imax - imin  #range (max - min)
        l = ijc / 4

        i1 = imin + l
        i2 = i1 + l
        i3 = i2 + l

        listToStore = []

        for i in range(dfar.shape[0]):
            s = dfar.iloc[i][tag[item]]

            if s >= i3 and s <= imax:
                ss = tag[item] + str(i3) + '-' + str(imax)
            elif s >= i2:
                ss = tag[item] + str(i2) + '-' + str(i3)
            elif s >= i1:
                ss = tag[item] + str(i1) + '-' + str(i2)
            elif s >= imin:
                ss = tag[item] + str(imin) + '-' + str(i1)
            listToStore.append(ss)

        listToAnalysis.append(listToStore.copy())

    listToAnalysis2 = []
    ll = len(listToAnalysis[0])

    for ii in range(ll):
        ltmp = []
        for it in listToAnalysis:
            ltmp.append(it[ii])
        listToAnalysis2.append(ltmp.copy())

    #build the encoding and decoding dictionaries
    what = functools.reduce(lambda a, b: a + b, listToAnalysis2)
    strSet = set(what)

    zz = zip(strSet, range(len(strSet)))
    strEncode = dict(zz)  #encoding dict

    strDecode = dict(zip(strEncode.values(), strEncode.keys()))  #decoding dict

    listToAnalysis_int = [
        list(map(lambda item: strEncode[item], row)) for row in listToAnalysis2
    ]

    with open('Information.json') as obj:
        infostring = json.load(obj)
    inforlist = infostring.split(' ')
    confidence = float(inforlist[0]) / float(100)
    support = float(inforlist[1]) / float(100)
    itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, support))
    #frequent itemsets

    rules = oaf.association_rules(itemsets, confidence)
    rules = list(rules)
    #association rules

    regularNum = len(rules)

    #printRules=dealResult(result,strDecode)
    #######
    #print("You will get ")
    #print(regularNum)
    #print("association rules when\n"+"SupportRate = ",end='')
    #print(support,end='')
    #print("ConfidenceRate = "+str(confidence))
    informationBack="You will get "+str(regularNum)+" association rules when\n"\
                                                    +"SupportRate = "+str(support)+" ConfidenceRate = "+str(confidence)
    with open('InformationBack.json', 'w') as inf:
        json.dump(informationBack, inf)
    result = list(oaf.rules_stats(rules, itemsets, len(listToAnalysis_int)))

    dfToSave = ResultDFToSave(result, strDecode)
    with open('arInteractiveText.json', 'w') as ij:
        json.dump(str(dfToSave), ij)
    saveRegularName = "Processed.xlsx"
    dfToSave.to_excel(saveRegularName)
    return regularNum
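# The loop in associateRules above discretizes every numeric column into four
# equal-width bins labelled "<column><lower>-<upper>". A standalone sketch of
# that binning with made-up values:
demo_col = [1.0, 2.5, 4.0, 9.0]
demo_min, demo_max = min(demo_col), max(demo_col)
demo_width = (demo_max - demo_min) / 4
demo_edges = [demo_min + demo_width * k for k in range(5)]
print(demo_edges)  # [1.0, 3.0, 5.0, 7.0, 9.0]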
Example #22
    def find_rules(self):
        if self.data is None: return
        data = self.data
        self.table.model().clear()

        n_examples = len(data)
        NumericItem = self.NumericItem
        StandardItem = self.StandardItem
        filterSearch = self.filterSearch
        itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
        itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
        isSizeMatch = self.isSizeMatch
        isRegexMatch = self.isRegexMatch

        X, mapping = OneHot.encode(data, self.classify)
        self.onehot_mapping = mapping
        names = {item: '{}={}'.format(var.name, val)
                 for item, var, val in OneHot.decode(mapping, data, mapping)}

        # Items that consequent must include if classifying
        class_items = {item
                       for item, var, val in OneHot.decode(mapping, data, mapping)
                       if var is data.domain.class_var} if self.classify else set()
        assert bool(class_items) == bool(self.classify)

        model = QStandardItemModel(self.table)
        for col, (label, tooltip) in enumerate([("Supp", "Support"),
                                                ("Conf", "Confidence (support / antecedent support)"),
                                                ("Covr", "Coverage (antecedent support / number of examples)"),
                                                ("Strg", "Strength (consequent support / antecedent support)"),
                                                ("Lift", "Lift (number of examples * confidence / consequent support)"),
                                                ("Levr", "Leverage ((support * number of examples - antecedent support * consequent support) / (number of examples)²)"),
                                                ("Antecedent", None),
                                                ("", None),
                                                ("Consequent", None)]):
            item = QStandardItem(label)
            item.setToolTip(tooltip)
            model.setHorizontalHeaderItem(col, item)

        #~ # Aggregate rules by common (support,confidence) for scatterplot
        #~ scatter_agg = defaultdict(list)

        # Find itemsets
        nRules = 0
        itemsets = {}
        progress = gui.ProgressBar(self, self.maxRules + 1)
        for itemset, support in frequent_itemsets(X, self.minSupport / 100):
            itemsets[itemset] = support

            if class_items and not class_items & itemset:
                continue

            # Filter itemset by joined filters before descending into it
            itemset_str = ' '.join(names[i] for i in itemset)
            if (filterSearch and
                (len(itemset) < itemsetMin or
                 itemsetMax < len(itemset) or
                 not isRegexMatch(itemset_str, itemset_str))):
                continue

            for rule in gen_assoc_rules(itemsets,
                                        self.minConfidence / 100,
                                        itemset):
                (left, right), support, confidence = rule

                if class_items and right - class_items:
                    continue
                if filterSearch and not isSizeMatch(len(left), len(right)):
                    continue
                left_str = ' '.join(names[i] for i in sorted(left))
                right_str = ' '.join(names[i] for i in sorted(right))
                if filterSearch and not isRegexMatch(left_str, right_str):
                    continue

                # All filters matched, calculate stats and add table row
                _, _, _, coverage, strength, lift, leverage = next(
                    rules_stats((rule,), itemsets, n_examples))

                support_item = NumericItem(support / n_examples)
                # Set row data on first column
                support_item.setData((itemset - class_items,
                                      class_items and (class_items & itemset).pop()),
                                     self.ROW_DATA_ROLE)
                left_item = StandardItem(left_str, len(left))
                left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                model.appendRow([support_item,
                                 NumericItem(confidence),
                                 NumericItem(coverage),
                                 NumericItem(strength),
                                 NumericItem(lift),
                                 NumericItem(leverage),
                                 left_item,
                                 StandardItem('→'),
                                 StandardItem(right_str, len(right))])
                #~ scatter_agg[(round(support / n_examples, 2), round(confidence, 2))].append((left, right))
                nRules += 1
                progress.advance()
                if nRules >= self.maxRules:
                    break
            if nRules >= self.maxRules:
                break

        # Populate the TableView
        table = self.table
        table.setHidden(True)
        table.setSortingEnabled(False)
        proxy_model = self.proxy_model
        proxy_model.setSourceModel(model)
        table.setModel(proxy_model)
        for i in range(model.columnCount()):
            table.resizeColumnToContents(i)
        table.setSortingEnabled(True)
        table.setHidden(False)
        progress.finish()

        self.nRules = nRules
        self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
        self.nSelectedRules = 0
        self.nSelectedExamples = 0
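
# The column tooltips above spell out how each rule metric is derived. Purely as
# an illustration (toy counts, not tied to any dataset or to the widget's code),
# the same formulas can be computed directly from raw counts:

def rule_metrics(n_examples, support, antecedent_support, consequent_support):
    """Return (confidence, coverage, strength, lift, leverage) from raw counts."""
    confidence = support / antecedent_support
    coverage = antecedent_support / n_examples
    strength = consequent_support / antecedent_support
    lift = n_examples * confidence / consequent_support
    leverage = (support * n_examples
                - antecedent_support * consequent_support) / n_examples ** 2
    return confidence, coverage, strength, lift, leverage

# e.g. 1000 examples, antecedent in 100 of them, consequent in 200, both in 60:
print(rule_metrics(1000, 60, 100, 200))   # (0.6, 0.1, 2.0, 3.0, 0.04)
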
Example #23
    def find_itemsets(self):
        if self.data is None:
            return
        if self._is_running:
            return
        self._is_running = True

        data = self.data
        self.tree.clear()
        self.tree.setUpdatesEnabled(False)
        self.tree.blockSignals(True)

        class ItemDict(dict):
            def __init__(self, item):
                self.item = item

        top = ItemDict(self.tree.invisibleRootItem())
        X, mapping = OneHot.encode(data)
        self.onehot_mapping = mapping
        ITEM_FMT = "{}" if issparse(data.X) else "{}={}"
        names = {
            item: ITEM_FMT.format(var.name, val) for item, var, val in OneHot.decode(mapping.keys(), data, mapping)
        }
        nItemsets = 0

        filterSearch = self.filterSearch
        filterMinItems, filterMaxItems = self.filterMinItems, self.filterMaxItems
        isRegexMatch = self.isRegexMatch

        # Find itemsets and populate the TreeView
        with self.progressBar(self.maxItemsets + 1) as progress:
            for itemset, support in frequent_itemsets(X, self.minSupport / 100):

                if filterSearch and not filterMinItems <= len(itemset) <= filterMaxItems:
                    continue

                parent = top
                first_new_item = None
                itemset_matches_filter = False

                for item in sorted(itemset):
                    name = names[item]

                    if filterSearch and not itemset_matches_filter:
                        itemset_matches_filter = isRegexMatch(name)

                    child = parent.get(name)
                    if child is None:
                        try:
                            wi = self.TreeWidgetItem(
                                parent.item, [name, str(support), "{:.4g}".format(100 * support / len(data))]
                            )
                        except RuntimeError:
                            # FIXME: When autoFind was in effect and the support
                            # slider was moved, this line excepted with:
                            #     RuntimeError: wrapped C/C++ object of type
                            #                   TreeWidgetItem has been deleted
                            return
                        wi.setData(0, self.ITEM_DATA_ROLE, item)
                        child = parent[name] = ItemDict(wi)

                        if first_new_item is None:
                            first_new_item = (parent, name)
                    parent = child

                if filterSearch and not itemset_matches_filter:
                    parent, name = first_new_item
                    parent.item.removeChild(parent[name].item)
                    del parent[name].item
                    del parent[name]
                else:
                    nItemsets += 1
                    progress.advance()
                if nItemsets >= self.maxItemsets:
                    break

        if not filterSearch:
            self.filter_change()
        self.nItemsets = nItemsets
        self.nSelectedItemsets = 0
        self.nSelectedExamples = 0
        self.tree.expandAll()
        for i in range(self.tree.columnCount()):
            self.tree.resizeColumnToContents(i)
        self.tree.setUpdatesEnabled(True)
        self.tree.blockSignals(False)
        self._is_running = False
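
# Outside the GUI, the same OneHot.encode()/OneHot.decode() helpers can feed
# frequent_itemsets() directly. A minimal stand-alone sketch (it assumes
# Orange3 with its bundled 'zoo' dataset; the 60% support threshold is an
# arbitrary choice for illustration):

from Orange.data import Table
from orangecontrib.associate.fpgrowth import OneHot, frequent_itemsets

data = Table('zoo')
X, mapping = OneHot.encode(data, True)   # True: also encode the class variable
names = {item: '{}={}'.format(var.name, val)
         for item, var, val in OneHot.decode(mapping, data, mapping)}

for itemset, support in frequent_itemsets(X, 0.6):
    print(support, sorted(names[i] for i in itemset))
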
Example #24
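        # (tail of the ResultDFToSave() helper: each i is a rules_stats() tuple,
        #  i.e. (antecedent, consequent, support, confidence, coverage, strength, lift, leverage))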
        temList.append(i[2])
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('Rule', 'Support count', 'Confidence', 'Coverage',
                                 'Strength', 'Lift', 'Leverage'))


if __name__ == '__main__':
    supportRate = 0.004
    confidenceRate = 0.6
    itemsets = dict(oaf.frequent_itemsets(ryzd, supportRate))
    rules = oaf.association_rules(itemsets, confidenceRate)
    rules = list(rules)
    regularNum = len(rules)
    printRules = dealRules(rules)
    result = list(oaf.rules_stats(rules, itemsets,
                                  len(ryzd)))  # NB: rules_stats() consumes rules here; rules is used up after this call!
    printResult = dealResult(result)

    #################################################
    # Below: save the results to an Excel-format file
    dfToSave = ResultDFToSave(result)
    dfToSave.to_excel(r'C:\Users\Administrator\Desktop\2.xlsx')

    #######################################################
    # Below: count the association rules obtained under different confidence and support thresholds
Example #25

# We have a database of 50 transactions over 100 possible items:

import numpy as np
import orangecontrib.associate.fpgrowth as ofpg
from scipy.sparse import lil_matrix  # other types would convert to LIL anyway

np.random.seed(0)
X = np.random.random((50, 100)) > .9

# Convert it to sparse so we show this type is supported:
X = lil_matrix(X)

# Count the itemsets of at least two items with at least 5% support:
num_items = sum(1 for itemset, support in ofpg.frequent_itemsets(X, .05)
                if len(itemset) >= 2)
print('number of itemsets of at least two items with at least 5% support:', num_items)

# Let’s get all the itemsets with at least 20% support:
gen = ofpg.frequent_itemsets(X, .2)
itemsets = list(gen)

# We get the same result by specifying the support as absolute number:
print(list(ofpg.frequent_itemsets(X, 10)) == itemsets)
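# (i.e. a float support threshold is interpreted as a fraction of transactions, an integer as an absolute count)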

# So the items ‘4’ and ‘25’ (the fifth and twenty-sixth columns of X) are the only items (and itemsets) that appear 10 or more times. Let’s check this:
print((X.sum(axis=0) >= 10).nonzero()[1])

# Conclusion: Given databases of uniformly distributed random data, there’s not much to work with.
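
# To go from frequent itemsets to rules on this toy data, the same module's
# association_rules() and rules_stats() can be applied just as in the earlier
# examples (the 5% support and 50% confidence thresholds are arbitrary picks
# for illustration):

itemsets_dict = dict(ofpg.frequent_itemsets(X, .05))
rules = list(ofpg.association_rules(itemsets_dict, .5))
print('number of rules with at least 50% confidence:', len(rules))

# rules_stats() augments each rule with coverage, strength, lift and leverage:
for rule_stat in ofpg.rules_stats(rules, itemsets_dict, X.shape[0]):
    antecedent, consequent, support, confidence, coverage, strength, lift, leverage = rule_stat
    print(sorted(antecedent), '->', sorted(consequent),
          'supp=%d conf=%.2f lift=%.2f' % (support, confidence, lift))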