Example 1
def gen_rules_handler(event, context):
    # Get list of all possible request types
    types = requests.get(REQUEST_TYPES_ENDPOINT)
    # Convert the response text into a list of type names
    types_text = types.text.replace("[", "").replace("]", "").replace("\"", "")
    types = list(types_text.split(","))

    client = boto3.client('s3',
                          aws_access_key_id=AWS_ACCESS_KEY_ID,
                          aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # Build Association rules for each type and save in S3
    for type in types:
        print("Generating association rules for type = " + type)

        # Get all records for the given type
        url = BACKED_ENDPOINT + '/' + type
        content = requests.get(url)
        content_json = json.loads(content.text)
        transactions = [tuple((i.split(','))) for i in content_json["fields"]]

        # Build rules with given min_support and min_confidence
        itemsets, rules = efficient_apriori.apriori(
            transactions,
            min_support=MIN_SUPPORT,
            min_confidence=MIN_CONFIDENCE)

        # Save rules in a local file
        filename = type + '.pkl'
        file = open('/tmp/' + filename, 'wb')
        pickle.dump(rules, file)
        file.close()

        # Upload rule file to S3
        client.upload_file('/tmp/' + filename, S3_BUCKET_NAME, filename)

    # Generate rules for all types
    print("Generating association rules for all types")
    url = BACKED_ENDPOINT
    content = requests.get(url)
    content_json = json.loads(content.text)
    transactions = [tuple((i.split(','))) for i in content_json["fields"]]

    # Build rules with given min_support and min_confidence
    itemsets, rules = efficient_apriori.apriori(transactions,
                                                min_support=MIN_SUPPORT,
                                                min_confidence=MIN_CONFIDENCE)

    # Save rules in a local file
    filename = 'rules.pkl'
    file = open('/tmp/' + filename, 'wb')
    pickle.dump(rules, file)
    file.close()

    # Upload rule file to s3
    client.upload_file('/tmp/' + filename, S3_BUCKET_NAME, filename)

    # Return success
    return {'statusCode': 200, 'body': json.dumps('Rules created/updated')}
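
The handler above assumes module-level imports and configuration that the snippet does not show; a minimal sketch of that preamble, with placeholder values (the real endpoints, credentials, bucket name, and thresholds are not part of the snippet):

import json
import pickle

import boto3
import efficient_apriori
import requests

# Placeholder configuration; the actual values are not part of the snippet
REQUEST_TYPES_ENDPOINT = "https://example.com/api/types"
BACKED_ENDPOINT = "https://example.com/api/records"
AWS_ACCESS_KEY_ID = "..."
AWS_SECRET_ACCESS_KEY = "..."
S3_BUCKET_NAME = "my-rules-bucket"
MIN_SUPPORT = 0.02
MIN_CONFIDENCE = 0.4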
Example 2
def apriori_one(transaction, **kwargs):
    if len(kwargs) == 0:
        itemsets, rules = apriori(transaction,
                                  min_support=0.1,
                                  min_confidence=0.5)
    else:
        itemsets, rules = apriori(transaction,
                                  min_support=float(kwargs['support']),
                                  min_confidence=float(kwargs['confidence']))
    print("Frequent itemsets:", itemsets)
    print("-" * 30)
    print("Association rules:", rules)
Example 3
def rule2():
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules

    data = dataset.stack()
    # Reshape: stack the columns into rows
    # print(data)

    a = data.reset_index()
    # Reset the index
    print(a)

    b = a.groupby(['level_0', 0])[0].count().unstack().fillna(0)
    # Pivot on the original order number and the product name
    # print(b)

    itemsets = b.applymap(encode_units)
    # Convert the table values to 0/1

    print(itemsets)

    frequent_itemsets = apriori(itemsets, min_support=0.02, use_colnames=True)

    rules = association_rules(frequent_itemsets,
                              metric="lift",
                              min_threshold=0.5)

    print(frequent_itemsets.sort_values(by="support", ascending=False))
    print(rules.sort_values(by="lift", ascending=False))
def rule1():
    from efficient_apriori import apriori
    start = time.time()
    # Build the 1-D Series orders_series, indexed by '客户ID' with '产品名称' as the values
    orders_series = data.set_index('客户ID')['产品名称']
    # Convert the dataset into transaction format
    transactions = []
    temp_index = 0
    for i, v in orders_series.items():
        if i != temp_index:
            temp_set = set()
            temp_index = i
            temp_set.add(v)
            transactions.append(temp_set)
        else:
            temp_set.add(v)
    # Mine frequent itemsets and association rules
    itemsets, rules = apriori(transactions,
                              min_support=0.03,
                              min_confidence=0.5)
    print('Frequent itemsets:', itemsets)
    print('Association rules:', rules)
    end = time.time()
    print("Elapsed time:", end - start)
Example 5
def rule1():
    from efficient_apriori import apriori
    start = time.time()
    # Build the 1-D Series orders_series, with 'Transaction' as the index and 'Item' as the values
    orders_series = data.set_index('Transaction')['Item']
    print("1-D Series orders_series:", orders_series)
    # Convert the dataset into transaction format
    transactions = []
    temp_index = 0
    for i, v in orders_series.items():
        if i != temp_index:
            temp_set = set()
            temp_index = i
            temp_set.add(v)
            transactions.append(temp_set)
        else:
            temp_set.add(v)

    # Mine frequent itemsets and association rules
    itemsets, rules = apriori(transactions,
                              min_support=0.01,
                              min_confidence=0.5)
    print('Frequent itemsets:', itemsets)
    print('Association rules:', rules)
    end = time.time()
    print("Elapsed time:", end - start)
def rule2():
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    pd.options.display.max_columns = 100
    start = time.time()
    # Format conversion: join each row's non-empty items into one '|'-separated string
    # (DataFrame.append was removed in pandas 2.0, so collect the rows in a list)
    rows = []
    for i in range(len(data)):
        items = [data.iloc[i, t] for t in range(len(data.iloc[i]))
                 if data.iloc[i, t] != '']
        rows.append('|'.join(items))
    data_test = pd.DataFrame({'items': rows})
    hot_encoded_data = data_test['items'].str.get_dummies('|')
    hot_encoded_data = hot_encoded_data.applymap(encode_units)

    # Mine frequent itemsets and association rules
    frequent_itemsets = apriori(hot_encoded_data,
                                min_support=0.01,
                                use_colnames=True)
    rules = association_rules(frequent_itemsets,
                              metric="lift",
                              min_threshold=0.5)
    print("Frequent itemsets:", frequent_itemsets)
    print("Association rules:", rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.5)])
    end = time.time()
    print("Elapsed time:", end - start)
Example 7
def rule2(s2):
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    # Time the run
    start = time.time()

    # Data format conversion
    # 1) Join each row's values into a single column, separated by '-'
    data['total'] = data[data.columns[:]].apply(lambda x: '-'.join(x.dropna()), axis=1)
    # 2) Build a one-hot encoding with get_dummies
    data_ = data.drop(data.columns[:21], axis=1).join(data.total.str.get_dummies(sep='-'))

    # Mine frequent itemsets
    frequent_itemsets = apriori(data_, min_support=s2, use_colnames=True)
    # Sort by support, descending
    frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
    print("Frequent itemsets:", frequent_itemsets)
    # Derive association rules, using lift as the metric
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    # Sort by lift, descending
    rules = rules.sort_values(by='lift', ascending=False)
    print("Association rules:", rules)

    end = time.time()
    print("Elapsed time 2:", end - start)
Example 8
def simpleTest():
    transactions = [(1, 3, 4), (2, 3, 5), (1, 2, 3, 5), (2, 5)]
    itemsets, rules = apriori(transactions,
                              min_support=0.5,
                              min_confidence=0.5)
    print(itemsets)
    print(rules)
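
For reference, efficient_apriori returns the itemsets as a dict keyed by itemset size, and the rules as Rule objects whose lhs, rhs, support, confidence, and lift attributes the later examples rely on; a short sketch of inspecting both:

# itemsets[k] maps each frequent k-itemset (a tuple) to its count
for size, counts in itemsets.items():
    for itemset, count in counts.items():
        print(size, itemset, count)

# Rule objects expose lhs, rhs, support, confidence, and lift
for rule in sorted(rules, key=lambda r: r.lift, reverse=True):
    print(rule.lhs, "->", rule.rhs, rule.confidence)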
Example 9
def rule2():
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    pd.options.display.max_columns = 100
    start = time.time()
    # One-hot encode the data
    temp_df = train.groupby(
        ['客户ID', '产品名称'])['产品名称'].count().unstack().reset_index().fillna(0)
    temp_df['ID'] = range(len(temp_df))
    hot_encoded_df = temp_df.drop(['客户ID'], axis=1).set_index('ID')
    # print(hot_encoded_df.head())
    # Binarize the counts to 0/1
    hot_encoded_df = hot_encoded_df.applymap(encode_units)
    # Mine frequent itemsets and association rules
    frequent_itemsets = apriori(hot_encoded_df,
                                min_support=0.02,
                                use_colnames=True)
    rules = association_rules(frequent_itemsets,
                              metric="lift",
                              min_threshold=0.5)
    print("Frequent itemsets:", frequent_itemsets)
    print("Association rules:", rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.5)])
    end = time.time()
    print("Elapsed time:", end - start)
Example 10
    def findFeature(self):
        reviewHeadline = self.stringDataProcess(self.data['review_body'])
        itemsets, rules = apriori(reviewHeadline,
                                  min_support=0.02,
                                  min_confidence=0.5,
                                  max_length=10)
        # Write out every rule that mentions " microwave" on either side
        with open(self.filename, 'w') as f:
            for rule in rules:
                l = list(rule.lhs)
                r = list(rule.rhs)
                if (" microwave" in l) or (" microwave" in r):
                    f.write(','.join(l) + " -> " + ','.join(r) + '\n')
                    print(rule)
Example 11
def rule1():
    from efficient_apriori import apriori
    start = time.time()
    # Build the 1-D Series orders_series, with 'FaultNo.' as the index and 'Alarm Msg.' as the values
    orders_series = data.set_index('FaultNo.')['Alarm Msg.']
    # Convert the dataset into transaction format
    transactions = []
    temp_index = 0
    for i, v in orders_series.items():
        if i != temp_index:
            temp_set = set()
            temp_index = i
            temp_set.add(v)
            transactions.append(temp_set)
        else:
            # Append to the current transaction's item set
            temp_set.add(v)

    # Mine frequent itemsets and association rules
    itemsets, rules = apriori(transactions,
                              min_support=0.01,
                              min_confidence=0.2)
    itemsets = pd.DataFrame(itemsets)
    rules = pd.DataFrame(rules)
    print('Frequent itemsets:', itemsets)
    print('Association rules:', rules)
    itemsets.to_excel('E:\\DATA_ENGIN\\apiority\\' + F + 'frequent_itemsets.xlsx')
    rules.to_excel('E:\\DATA_ENGIN\\apiority\\' + F + 'association_rules.xlsx')
    end = time.time()
    print("Elapsed time:", end - start)
Example 12
    def alg_apriori(self):

        print('starting analysis apriori')

        itemsets, rules = apriori(self.__prep_list_of_tuples(),
                                  min_support=self.min_supp,
                                  min_confidence=self.min_conf)

        if self.mode == 0:
            return rules
        elif self.mode == 1:
            return itemsets
        elif self.mode == 2:
            rules_rhs = filter(
                lambda rule: len(rule.lhs) == self.rule_lhs
                and len(rule.rhs) == self.rule_rhs, rules)

            cols = ['id1', 'id2', 'supp', 'conf', 'lift']
            lst_data = list()

            # self.sort_by is assumed to name a Rule attribute such as 'lift'
            for rule in sorted(rules_rhs, key=lambda rule: getattr(rule, self.sort_by)):
                lst_data.append([
                    rule.lhs, rule.rhs, rule.support, rule.confidence,
                    rule.lift
                ])
            return pd.DataFrame(lst_data, columns=cols)

        else:
            print('Set a valid mode (0, 1, or 2).')
Example 13
def perform_apriori(final_df):
    records = []
    for i in range(final_df.shape[0]):
        records.append([str(final_df.values[i, j]) for j in range(final_df.shape[1])])
    association_rules = apriori(records, min_support=0.005, min_confidence=0.2, min_length=2)
    association_results = list(association_rules)
    return association_results
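
Note that this example matches the apyori package's API (a single generator return value and the min_length argument) rather than efficient_apriori's. A sketch of consuming the results, assuming apyori's RelationRecord objects:

# Each RelationRecord carries the itemset, its support, and the derived rules
for record in perform_apriori(final_df):
    print(list(record.items), record.support)
    for stat in record.ordered_statistics:
        print(list(stat.items_base), "->", list(stat.items_add),
              stat.confidence, stat.lift)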
Example 14
def rule2():
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    pd.options.display.max_columns = 100
    start = time.time()
    transactions = []
    for i in range(0, dataset.shape[0]):
        temp = ""
        for j in range(0, 20):
            if str(dataset.values[i, j]) != 'nan':
                temp += str(dataset.values[i, j]) + "|"
        # Strip the trailing separator so get_dummies does not create an empty column
        transactions.append(temp.rstrip("|"))
    dataset_new = pd.DataFrame(data=transactions)
    dataset_new.columns = ["Market_Basket"]
    # One-hot encode the data
    dataset_new_hot_encoded = dataset_new.drop("Market_Basket", axis=1).join(
        dataset_new.Market_Basket.str.get_dummies("|"))
    dataset_new_hot_encoded = dataset_new_hot_encoded.dropna(axis=1)
    frequent_itemsets = apriori(dataset_new_hot_encoded,
                                min_support=0.05,
                                use_colnames=True)
    frequent_itemsets = frequent_itemsets.sort_values(by="support",
                                                      ascending=False)
    rules = association_rules(frequent_itemsets,
                              metric="lift",
                              min_threshold=1)
    rules = rules.sort_values(by="lift", ascending=False)
    print("Frequent itemsets:", frequent_itemsets)
    print("Association rules:", rules)
    end = time.time()
    print("Elapsed time:", end - start)
def efficient_apr(transactions):
    # Mine frequent itemsets and association rules with efficient_apriori
    itemsets, rules = apriori(transactions,
                              min_support=0.02,
                              min_confidence=0.4)
    print("Frequent itemsets:", itemsets)
    print("Association rules:", rules)
Example 16
def run(item_set, min_sup, min_conf):
    start = time.time()
    freqItemSet, rules = apriori(item_set,
                                 min_support=min_sup,
                                 min_confidence=min_conf)
    end = time.time()
    execution_time = end - start
    return freqItemSet, rules, execution_time
def rule1():
    start = time.time()
    from efficient_apriori import apriori
    itemsets, rules = apriori(data1, min_support=0.05, min_confidence=0.4)
    print("频繁项集:", itemsets)
    print("关联规则:", rules)
    end = time.time()
    print("用时:", end - start)
Example 18
def extract_association_with_apriori(transactions, min_sup, min_conf):
    transaction_list = []
    for items in transactions.values():
        transaction_list.append(tuple(items))
    itemsets, rules = apriori(transaction_list,
                              min_support=min_sup,
                              min_confidence=min_conf)
    return itemsets, rules
def eff_apriori():
    '''Mine frequent itemsets and association rules with efficient_apriori.'''

    from efficient_apriori import apriori

    itemsets, rules = apriori(transactions, min_support=0.02, min_confidence=0.4)
    print("Frequent itemsets:", itemsets)
    print("Association rules:", rules)
Example 20
def apriorize(transactions): # finds rules based on transaction list
    itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=1)
    print('Rules:')
    for r in rules:
        print(r)
    print('Itemsets:')
    for i in itemsets:
        print(i)
Example 21
def main():
    dataset = pd.read_csv('./Market_Basket_Optimisation.csv', header=None)
    transactions = []
    for myrow in dataset.values:
        # pd.notna() is safer than an identity check against np.nan
        transactions.append(list(filter(pd.notna, myrow)))
    itemsets, rules = apriori(transactions, min_support=0.05, min_confidence=0.25)
    print('Frequent itemsets:', itemsets)
    print('Association rules:', rules)
Example 22
def efficient_apriori(transaction, min_support, min_confidence):
    itemsets, rules = apriori(transaction,
                              min_support=min_support,
                              min_confidence=min_confidence)
    print("-" * 60)
    print('efficient_apriori:')
    print("Frequent itemsets:", itemsets)
    print("Association rules:", rules)
    print("-" * 60)
Example 23
    def calculate(self):
        transactions = self.get_events()[:8]
        itemsets, rules = apriori(transactions,
                                  min_support=0.35,
                                  min_confidence=1)
        bestFits = {}
        for rule in rules:
            bestFits[self.name_to_id(
                rule.lhs[0])] = [self.name_to_id(r) for r in rule.rhs]
        return bestFits
Example 24
def show_apriori_table(min_support, sentence):
    table = PrettyTable()
    # Column order must match the values passed to add_row below
    table.field_names = ['word', 'support', 'confidence', 'lift']

    item, rules = apriori(sentence, min_support=min_support)
    rules_rhs = filter(lambda rule: len(rule.lhs) == 2 and len(rule.rhs) == 1,
                       rules)
    for rule in sorted(rules_rhs, key=lambda rule: rule.lift):
        table.add_row([rule.lhs, rule.support, rule.confidence, rule.lift])
    print(table)
Example 25
def use_mlextend2(hot_encode):
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    frequent_itemsets = apriori(hot_encode, min_support=0.05, use_colnames=True)
    frequent_itemsets = frequent_itemsets.sort_values(by="support", ascending=False)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
    rules = rules.sort_values(by="lift", ascending=False)
    print('-' * 20, 'Association rules', '-' * 20)
    print(rules)
    rules.to_csv('association_rules2.csv')
Example 26
def get_itemsets_for_chunk(chunk, min_support=MIN_SUPPORT):
    """ Reads a chunk and creates a list of frozensets. """
    chunk_data = [line.strip(" \n") for line in chunk]
    transactions = [frozenset(ls.split(' ')) for ls in chunk_data]

    itemsets, rules = apriori(transactions,
                              min_support=min_support,
                              min_confidence=1,
                              max_length=15,
                              verbosity=0)
    return itemsets
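
A minimal driver for this helper, assuming a space-delimited transactions file and a hypothetical chunks_of splitter:

from itertools import islice

def chunks_of(path, size=10000):
    # Yield successive lists of `size` lines from the transactions file
    with open(path) as f:
        while True:
            chunk = list(islice(f, size))
            if not chunk:
                break
            yield chunk

all_itemsets = [get_itemsets_for_chunk(c) for c in chunks_of('transactions.txt')]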
Example 27
def efficient_apriori(transaction, min_support, min_confidence):
    itemsets, rules = apriori(transaction,
                              min_support=min_support,
                              min_confidence=min_confidence)
    return {'itemsets': itemsets, 'rules': rules}
Example 28
def countItemSets(support, delay, loop):
    for _ in range(loop):
        time.sleep(delay)
        print("Counting Itemsets...")
        transactions = [x['items'] for x in dataList]
        itemsets, rules = apriori(transactions,
                                  min_support=support,
                                  min_confidence=1)
        print("Count Itemsets (Min Support", support, "):")
        print(itemsets)
        print()
Example 29
def save_part_2(dataset_map):
    itemsets, rules = apriori(dataset_map, min_support=0.01)
    output = ""
    for number in itemsets:
        if number != 3:
            for item, count in itemsets[number].items():
                line = f"{count}:{';'.join(item)}\n"
                output += line

    file = "patterns-2.txt"
    with open(file, "w") as filetowrite:
        filetowrite.write(output)
def rule1():
    from efficient_apriori import apriori
    start = time.time()
    # Mine frequent itemsets and association rules
    itemsets, rules = apriori(transactions,
                              min_support=0.03,
                              min_confidence=0.2)
    print('Frequent itemsets:', itemsets)
    df_results1 = pd.DataFrame(rules)
    print('Association rules:', df_results1)
    end = time.time()
    print("Elapsed time:", end - start)
Example 31
def rule1():
    from efficient_apriori import apriori
    start = time.time()

    itemsets, rules = apriori(transactions, min_support=0.02, min_confidence=0.5)
    print('Frequent itemsets:', itemsets)
    print('-' * 50)
    print('Association rules:', rules)
    end = time.time()
    print("Elapsed time:", end - start)