Example No. 1
def freqAlgo():
    print(' ------------------------------------- ')
    print('|Please select your desired algorithm |')
    print('|1. Apriori                           |')
    print('|2. FP-Growth                         |')
    print(' ------------------------------------- ')

    freqChoice = input('Enter the number of your choice: ')
    if not freqChoice.isdigit():  # reject anything that is not a plain integer (letters, special characters, empty input)
        print('\nPlease enter an integer.\n')
        freqFlag = 1
    else:
        freqChoice = int(freqChoice)
        if freqChoice == 1:
            print('*** You selected Apriori algorithm. ***\n')
            freqFlag = 0
            supportCount()
            apriori.apriori(transRecord, minSupp)
        elif freqChoice == 2:
            print('*** You selected FP-Growth algorithm. ***\n')
            freqFlag = 0
            supportCount()
            fpgrowth.process(transRecord, minSupp)
        else:
            print('\n*** Please choose between 1 or 2 and try again. ***\n')
            freqFlag = 1
Example No. 2
	def aprithread(self):
		self.result_data_Text.insert(INSERT, 'Frequent itemsets:\n')
		start = time.time()	
		n=0
		if (self.suanfa=='Aprioi'):
			if(self.minsupport!=0):
				self.L,self.support=apriori.apriori(self.dataset,self.minsupport)
			else:
				self.L,self.support=apriori.apriori(self.dataset)
			for x in self.L:
				for i in x:
					self.result_data_Text.insert(INSERT,i)
					self.result_data_Text.insert(INSERT,'\n')
					n+=1

		else:
			self.frozenDataSet = fpgrowth.transfer2FrozenDataSet(self.dataset)
			self.L = {}
			self.prefix = set([])
			if(self.minconfig!=0):
				self.fptree,self.headPointTable = fpgrowth.createFPTree(self.frozenDataSet, self.minsupport)				
				fpgrowth.mineFPTree(self.headPointTable, self.prefix, self.L, self.minsupport)
			else:
				self.fptree,self.headPointTable = fpgrowth.createFPTree(self.frozenDataSet)
				fpgrowth.mineFPTree(self.headPointTable, self.prefix,self.L)
			for i in self.L:
				# print(i)
				self.result_data_Text.insert(INSERT,i)
				self.result_data_Text.insert(INSERT,'\n')
				n+=1
		self.result_data_Text.insert(INSERT,str(n)+'\n')	
		end = time.time()
		self.log_data_Text.insert(INSERT, 'Frequent itemsets generated!\tTotal: ' + str(n) + ' itemsets\tElapsed: ' + str(round(end - start, 2)) + 's\n')
Example No. 3
def run(file, s, categorical=False):
    a_t = 0
    e_t = 0
    f_t = 0
    t1 = time.time()
    apriori(file, s, categorical)
    a_t = time.time() - t1

    t1 = time.time()
    eclat(file, s, categorical)
    e_t = time.time() - t1

    t1 = time.time()
    fp(file, s, categorical)
    f_t = time.time() - t1
    return a_t, e_t, f_t
Example No. 4
def product_frequency(filename):
	# Import Data
	df = read_cheerio(filename)
	df = df[['Business/Ent Support','CloudFront', 'CloudSearch', 'DynamoDB', 'EC2', 'EMR',\
			'ElastiCache', 'Glacier', 'RDS Service', 'Route 53', 'SES',\
			'SNS',	'SQS', 'S3', 'SWS', 'SimpleDB', 'VPC', 'Red Shift',\
			'OpsWorks', 'Transcode', 'EC2 P-IOPS', 'EC2 EBS Optimized',\
			'EC2 Load Balancer', 'EC2 Spot Usage', ' EBS:Snapshot',\
			'Invalidations', 'Multi-AZ', ' RDS-PIOPS', 'LBR Queries',\
			'TimedStorage Glacier', 'TimedStorage RRS', 'Data Transfer Region']]
	df = df.fillna(0)
	df = df.applymap(f)
	#print df
	counts = pd.DataFrame()
	for column in df.columns.values:
		counts[column] = df[column].value_counts()
	counts = counts.T
	counts=counts.sort(1, ascending=False)

	minsupport = 80
	valid = set(k for k,v in counts[1].iteritems()
             if (v >= minsupport))
	itemsets = [frozenset([v]) for v in valid]
	freqsets, support = apriori(itemsets, minsupport, 100)
	pprint(freqsets)
	print support
Example No. 5
def run_apriori_and_generate_rules(transactions, items, min_support, min_confidence, output_filename, output_rules=True):
    """ Take the necessary parameters after the main function parses the CLI arguments to start
        the apriori algorithm and generate all association rules.
        
        @Input:
        transactions: list of frozensets of transactions
        items: list of 1-itemsets
        min_support: minimum support to use when calculating candidate itemsets and validating itemsets
        min_confidence: minimum confidence to use when generating rules
        output_filename: filename to which the program will write association rules
        output_rules: If set to True, the program will serialize the rules to a file, as specified on the command line.
                      If set to False, the function will simply return the array of rules for use in stress testing.

        @Return: None or association_rules (depends on output_rules)
    """
    N = len(transactions)

    global_itemset_dict, frequency_set = apriori.apriori(transactions, items, min_support)
    association_rules, output_header = apriori.derive_association_rules(global_itemset_dict, frequency_set, integer_to_data, min_support, min_confidence, N)

    if output_rules:
        if len(association_rules) == 0:
            print("No association rules to serialize")
        else:
            serialize_rules(global_itemset_dict, association_rules, output_header, output_filename)
    else:
        return association_rules
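A hypothetical invocation of the function above, with toy data only (the frozenset/list shapes follow the docstring; the threshold values are placeholders, and apriori, integer_to_data and serialize_rules are assumed to be available in the surrounding module):

# Toy data, not from the original project; shapes follow the docstring above.
toy_transactions = [frozenset({"milk", "bread"}),
                    frozenset({"milk", "eggs"}),
                    frozenset({"milk", "bread", "eggs"})]
toy_items = [frozenset({item}) for item in {"milk", "bread", "eggs"}]  # candidate 1-itemsets
# With output_rules=False the rules are returned instead of being written to output_filename.
toy_rules = run_apriori_and_generate_rules(toy_transactions, toy_items,
                                           min_support=0.5, min_confidence=0.7,
                                           output_filename="rules.out", output_rules=False)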
Example No. 6
def main():
    min_sup = 0.3
    min_conf = 0.8
    items_frecuentes = apriori(carrito, min_sup)
    print("Database used")
    for i, t in enumerate(carrito):
        print("t", i, ">>", t)
    print('*' * 10, "GENERATED RULES", '*' * 10)
    generador_reglas(carrito, items_frecuentes, min_conf, min_sup)
Example No. 7
def test_apriori():
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")

    expectedItemSets = {
        ItemSet("i"): 2 / 6,
        ItemSet("z"): 4 / 6,
        ItemSet("x"): 4 / 6,
        ItemSet("y"): 2 / 6,
        ItemSet("xz"): 4 / 6,
        ItemSet("yz"): 2 / 6,
        ItemSet("xy"): 2 / 6,
        ItemSet("xyz"): 2 / 6
    }
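    # Support here is (transactions containing the itemset) / (total transactions):
    # e.g. "z" appears in 4 of the 6 rows above, hence 4/6, and "xyz" in 2 rows, hence 2/6.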

    index = InvertedIndex()
    index.load(data)
    itemsets = apriori(index, 2 / 6)
    assert (set(expectedItemSets.keys()) == set(itemsets))
    for itemset in itemsets:
        assert (expectedItemSets[itemset] == index.support(itemset))

    print("Itemsets={}".format([i for i in itemsets if len(i) > 1]))

    # (antecedent, consequent, confidence, lift, support)
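    # confidence = support(antecedent | consequent) / support(antecedent) and
    # lift = confidence / support(consequent); e.g. for {x,y} -> {z}: support 2/6,
    # confidence (2/6)/(2/6) = 1, lift 1 / (4/6) = 1.5.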
    expectedRules = {
        (frozenset({Item("x"),
                    Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z"),
                                            Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z")}), 1, 1.5, 2 / 3),
        (frozenset({Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z"),
                                            Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z"),
                    Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z"),
                    Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x"),
                                            Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x")}), 1, 1.5, 2 / 3),
        (frozenset({Item("z")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
    }

    rules = set(generate_rules(itemsets, 0, 0, index))

    for (antecedent, consequent, confidence, lift, support) in rules:
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".format(
            antecedent, consequent, confidence, lift, support))

    assert (rules == expectedRules)
Example No. 8
def test_rules(filename, minconf):
    transactions, attr_info = pp.read_transaction_data(filename)
    large_itemsets = ap.apriori(filename, 3)
    #lhs = large_itemsets[-1][:-1]
    #rhs = [large_itemsets[-1][-1]]

    lhs = ['A', 'B', 'C']
    rhs = ['Iris-setosa']
    print(lhs, rhs)
    print(gr.process_one_rule(lhs, rhs, transactions, minconf))
Example No. 9
def test_apriori():
    expected_result = {
        ('johnson81;4081;Craig;Johnson', ): 0.2,
        ('Username; Identifier;First name;Last name', ): 0.2,
        ('grey07;2070;Laura;Grey', ): 0.2,
        ('booker12;9012;Rachel;Booker', ): 0.2,
        ('jenkins46;9346;Mary;Jenkins', ): 0.2
    }
    actual_result = apriori.apriori(data_path, 0.2)
    assert expected_result.items() <= actual_result.items()
Example No. 10
 def test_generate_associations(self):
     L, supp_data = apriori.apriori(self.dataset, min_support=0.5)
     print 'L:', L
     print '-'*20
     print 'supp_data: ', supp_data
     print '-'*20
     rules = apriori.generateRules(L, supp_data, min_confidence=0.95)
     print '-'*20
     print 'rules: ', rules
     print '-'*20
     assert False
Example No. 11
def run_algorithm(data, mode, support, iterative, use_CUDA, block, thread):
	if mode == 'apriori':
		print('Running Apriori algorithm with %f support and data shape: ' % (support), np.shape(data))
		result = apriori(data, support)
		return result
	elif mode == 'eclat':
		print('Running eclat algorithm with %f support and data shape: ' % (support), np.shape(data))
		result = eclat(data, support, iterative, use_CUDA, block, thread)
		return result
	else:
		raise NotImplementedError('Invalid algorithm mode.')
Example No. 12
def run_main():
	# preprocess the data
	#changeData()
	#handleData()
	# test frequent itemset mining
	dataSet = apriori.loadDataSet()
	print(dataSet)
	print(len(dataSet))
	#C1 = apriori.createC1(dataSet)
	#D = list(map(set,dataSet))
	L,suppData = apriori.apriori(dataSet,0.2)
	print(L)
	print("========")
	print(L[0])
Example No. 13
def createLs1(dataSet, min_support):# 'Ls' for Large Sequence
    n = len(dataSet)
    flattenSet = list(itertools.chain(*dataSet))
    flatten_n = len(flattenSet)
    
    # Transform the min_support to litemset_support
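    # (apriori() measures support over the flatten_n flattened transactions, while sequence support
    # is defined over the n customer sequences; scaling by n/flatten_n keeps the same absolute count
    # threshold, and the inverse factor below converts the returned supports back.)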
    min_support_new = min_support * n /flatten_n
    litemsets = apriori(flattenSet, min_support=min_support_new)
        
    mapping = {v: k for k, v in enumerate(litemsets)}
    # Transform the litemset_support to sequence_support
    supportLs1 = {(mapping[k],):v * flatten_n / n for k, v in litemsets.items()}
    
    return mapping, supportLs1
Example No. 14
    def explain(self):
        print('start')
        retData = []
        with progressbar.ProgressBar(max_value=self.num) as bar:
            for m in range(self.num):
                tmpData = {}
                code = self.tokenedCodes[m]
                # print(self.sbts[m])
                com, c = self.translateStrs(
                    code, self.checkMode(self.mode, 'withSbt'), self.sbts[m])
                tmpData['code'] = code
                tmpData['comment'] = com

                self.r.extract_keywords_from_text(com)
                comKeys = self.r.get_ranked_phrases()
                tmpData['commentKeywords'] = comKeys

                codeWordList = self.tokenizer.toDoubleList(code)
                codeKeys, codeKeyIndex = self.extractCodeKeys(code)
                # codeKeys, codeKeyIndex = self.extractCodeKeysn(code)
                tmpData['codeKeywords'] = codeKeys
                tmpData['codeKeyIndex'] = codeKeyIndex
                tmpList = []
                for key in comKeys:
                    tmpResults = {
                        'commentKeyword': key,
                    }
                    keyNums = np.zeros(len(codeKeyIndex))
                    i, ni, p = self.explainKey(codeWordList, self.sbts[m],
                                               codeKeyIndex, self.numSamples,
                                               key, 0.6)
                    tmpResults['numberHaveKey'] = len(i)
                    tmpResults['numberNoKey'] = len(ni)
                    tmpResults['probability'] = p
                    for keyIds in ni:
                        tmp = list(set(keyIds))
                        for id in tmp:
                            keyNums[id] += 1
                    L, support = apriori(ni, 0.3)
                    L = [[[int(j) for j in i] for i in l] for l in L]
                    support = [[[int(i) for i in s[0]], s[1]]
                               for s in support.items()]
                    tmpResults['anchors'] = L
                    tmpResults['supports'] = support
                    tmpList.append(tmpResults)
                tmpData['explanations'] = tmpList
                retData.append(tmpData)
                bar.update(m)
        return retData
Example No. 15
def test_stress():
    datasets = [
        ("datasets/UCI-zoo.csv", 0.3),
        ("datasets/mushroom.csv", 0.4),
        # ("datasets/BMS-POS.csv", 0.05),
        # ("datasets/kosarak.csv", 0.05),
    ]

    for (csvFilePath, min_support) in datasets:
        # Run Apriori and FP-Growth and assert both have the same results.
        print("Running Apriori for {}".format(csvFilePath))
        start = time.time()
        index = InvertedIndex()
        index.load_csv(csvFilePath)
        apriori_itemsets = apriori(index, min_support)
        apriori_duration = time.time() - start
        print(
            "Apriori complete. Generated {} itemsets in {:.2f} seconds".format(
                len(apriori_itemsets),
                apriori_duration))

        print("Running FPTree for {}".format(csvFilePath))
        start = time.time()
        with open(csvFilePath, newline='') as csvfile:
            test_transactions = list(csv.reader(csvfile))
            fptree_itemsets = mine_fp_tree(test_transactions, min_support)
        fptree_duration = time.time() - start
        print(
            "fp_growth complete. Generated {} itemsets in {:.2f} seconds".format(
                len(fptree_itemsets),
                fptree_duration))

        if set(fptree_itemsets) == set(apriori_itemsets):
            print("SUCCESS({}): Apriori and fptree results match".format(csvFilePath))
        else:
            print("FAIL({}): Apriori and fptree results differ!".format(csvFilePath))
        assert(set(fptree_itemsets) == set(apriori_itemsets))

        if apriori_duration > fptree_duration:
            print(
                "FPTree was faster by {:.2f} seconds".format(
                    apriori_duration -
                    fptree_duration))
        else:
            print(
                "Apriori was faster by {:.2f} seconds".format(
                    fptree_duration -
                    apriori_duration))
        print("")
Example No. 16
def frequentPattern():
	# Count language frequencies
	langfreq = dict()
	for val in columns["progLangs"]:
		for lang in val.split(','):
			langl = lang.lower().lstrip()
			if(langl in langfreq.keys()):
				langfreq[langl] = langfreq[langl] + 1
			else:
				langfreq[langl] = 1

	# List language frequencies of languages appearing more than once
	langfreqc = dict()
	langfreqkey = []
	langfreqindex = dict()
	index = 0
	for lang in langfreq.keys():
		freq = langfreq[lang]
		if(freq > 1 and lang != ""):
			langfreqc[lang] = freq
			langfreqkey.append(lang)
			langfreqindex[lang] = index
			index = index + 1

	# Create list of sets for languages
	setLang = []
	for i, val in enumerate(columns["progLangs"]):
		row = []
		# Add languages
		for lang in val.split(','):
			langl = lang.lower().lstrip()
			if(langl in langfreqkey):
				row.append(langl)
		# Add OS
		#if (not oss[i] == "-"):
		#	row.append(oss[i])
		# Add if all data good
		if(len(row) > 0):
			setLang.append(row)

	result = apriori.apriori(setLang, 0.5)
	print("\nLanguage frequencies: {0}".format(langfreqc))
	print("Frequent Patterns: {0}".format(result))

	# Verification (compute lift)
	for (pattern, freq) in result.iteritems():
		lift = apriori.lift(setLang, pattern)
		print("Lift{0}: {1}".format(pattern, lift))
Example No. 17
def main(argv):
    # load the data
    dataset = load_large_data()
    print dataset

    # find frequent itemsets
    timestart = time.clock()
    L, support = ap.apriori(dataset, 0.2)
    time_elapsed = (time.clock() - timestart)
    print 'Frequent itemsets:'
    print L

    # generate association rules
    print 'Association rules:'
    ap.printRules(L, support, 0.6)

    print 'Elapsed time:', time_elapsed, 's'

    print 'Memory:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, 'bytes'
Example No. 19
  def execute(self) :
    if self.somethingWrong : #We do not execute the program
      print("An error was found, the data-set have missing values")
      print("Please remove those values before the execution")
      print("Aborting the program")
      #We should not use the statement: System.exit(-1);
    else :
      #We do here the algorithm's operations
      print("No errors, Execute in FARCHD execute :")
      self.dataBase = DataBase(self.nLabels,self.train_myDataSet)
      self.ruleBase = RuleBase(self.dataBase,self.train_myDataSet,self.k,self.typeInference)
      print("dataBase, ruleBase initialized , Execute in FARCHD execute :")
      self.apriori_instance = apriori()
      self.apriori_instance.init_with_more_parameters(self.ruleBase,self.dataBase,self.train_myDataSet,self.minsup,self.maxconf,self.depth)
      self.apriori_instance.generate_RB()
      print("dataBase, ruleBase initialized , Execute in FARCHD execute :")

      self.rules_stage1 = self.apriori_instance.get_rules_stage1()
      print("FARC_HD,rules_stage1,is :" + str(self.rules_stage1))
      self.rules_stage2 = self.ruleBase.size()
      print("FARC_HD,rules_stage2,is :" + str(self.rules_stage2))

      print("self.ruleBase in FARC_HD execute, pass into population :" + str(self.ruleBase))
      pop = population(self.train_myDataSet,self.dataBase,self.ruleBase,self.population_size,self.BITS_GEN,self.maxTrials,self.alpha)
      pop.generation()

      print("Building classifier ......")
      self.ruleBase = pop.rulebase_get_bestRB()
      print("FARC_HD,rule stage3, FARC_HD ruleBase.size() is :" + str(self.ruleBase.size()))
      self.rules_stage3 = self.ruleBase.size()

      self.dataBase.save_file(self.fileDB)
      self.ruleBase.save_file(self.fileRB)

      self.doOutput(self.val_myDataSet,self.outputTr)
      self.doOutput(self.test_myDataSet,self.outputTst)

      self.total_time= time.time() -self.startTime
      self.write_time()
      self.write_rules()

      print(" FARC_HD algorithm is finished . ")
Example No. 20
def test1():
    dataSet = apriori.loadDataSet()
    print(dataSet)  #[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

    #C1=apriori.createC1(dataSet)
    #print(set(C1)) #{frozenset({4}), frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})}
    #print(list(C1)) #[frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]

    #D=map(set,dataSet)
    #print(list(D)) #[{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}] Note: after list(map1) the map object is exhausted... set(.) seems to consume it as well

    #L1,suppData0 = apriori.scanD(D, C1, 0.5)  # can no longer be used directly; D and C1 must first be converted to lists
    #print(L1)   #[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})]
    #print(suppData0) #{frozenset({4}): 0.25, frozenset({5}): 0.75, frozenset({2}): 0.75, frozenset({3}): 0.75, frozenset({1}): 0.5}

    L, suppData = apriori.apriori(dataSet, 0.5)
    print(L)
    print(suppData)
    rules = apriori.generateRules(L, suppData, minConf=0.5)
    print(rules)
Example No. 21
    def explain_n(self):
        print('start')
        retData = []
        with progressbar.ProgressBar(max_value=self.num) as bar:
            for m in range(self.num):
                tmpData = {}
                code = self.tokenedCodes[m]
                # print(self.sbts[m])
                com, c = self.translateStrs(
                    code, self.checkMode(self.mode, 'withSbt'), self.sbts[m])
                tmpData['code'] = code
                tmpData['comment'] = com

                self.r.extract_keywords_from_text(com)
                comKeys = self.r.get_ranked_phrases()
                tmpData['commentKeywords'] = comKeys

                codeWordList = self.tokenizer.toDoubleList(code)
                codeKeys, codeKeyIndex = self.extractCodeKeys(code)
                tmpData['codeKeywords'] = codeKeys
                tmpData['codeKeyIndex'] = codeKeyIndex
                tmpList = []
                retSamples = self.explainMultiKey(codeWordList, self.sbts[m],
                                                  codeKeyIndex,
                                                  self.numSamples, comKeys,
                                                  0.6)
                for index, sample in retSamples.items():
                    tmpResults = {
                        'commentKeyword': comKeys[index],
                    }
                    L, support = apriori(sample, 0.3)
                    L = [[[int(j) for j in i] for i in l] for l in L]
                    support = [[[int(i) for i in s[0]], s[1]]
                               for s in support.items()]
                    tmpResults['anchors'] = L
                    tmpResults['supports'] = support
                    tmpList.append(tmpResults)
                tmpData['explanations'] = tmpList
                retData.append(tmpData)
                bar.update(m)
        return retData
Example No. 22
def run(dataset=None, filename=None, path="./Data", sep=",", minsupport=0.1, min_factor=0.5):

    # read the binarized file (convert it into transactions)
    filepath = path + "/" + dataset + "/" + filename
    load.load(dataset=dataset, filename=filename)

    # reading binarized transactions
    transpath = path + "/" + dataset + "/" + "trans.json"
    with open(transpath, "r") as fp:
        d = json.load(fp)

    # run apriori with minsupport to get the frequent itemsets
    l, support_data, c, f = apriori.apriori(d, minsupport=minsupport)
    # print("l is",l)
    # print("support is",support_data)

    # printing frequency set with support
    """
	filepath=path + "/" + dataset + "/support_" + str(minsupport) + ".csv"
	writer = csv.writer(open(filepath, 'wb'))
	for key, value in support_data.items():
	   	writer.writerow([list(key)[:], value])
	"""

    # generating maximal and closed itemset
    print("# of candidate itemset is", c)
    print("# of frequent itemset is", f)
    s, sc = maximal_itemset.maximal(l)
    print("# of maximal frequent itemset", sc)
    c, cc = closed_itemset.closed(l, support_data)
    print("# of closed frequent itemset is", cc)
    # print("support data is",support_data)
    # mining.generateRules(l,support_data)

    # generating rules
    # min_lift=min_factor
    # rules,noofrules=mining_lift.generateRules(l,support_data,min_factor=0.85)
    """
Example No. 23
def main(args: Namespace):
    print(f"Dataset: {args.dataset}")
    print(f"Support: {args.support}")
    print(f"Confidence: {args.confidence}")
    print("-" * 20, "\n")
    transactions = get_transactions(args.dataset)
    f_item_sets = apriori(transactions, args.support, args.k)
    lengths = {}
    for k, v in f_item_sets.items():
        if len(k) in lengths:
            lengths[len(k)] += 1
        else:
            lengths[len(k)] = 1

    if len(f_item_sets) <= 20:
        print(f"The frequent itemsets are:\n")
    else:
        print(f"The first 20 frequent itemsets are:\n")

    for i, (f_item_set, support) in enumerate(f_item_sets.items()):
        if i < 20:
            print(f_item_set, support)

    print(f"\nA total of {len(f_item_sets)} frequent itemsets was found.")
    print(f"The distribution of frequent itemsets is: {lengths}.\n")

    a_rules = association_rules(f_item_sets, args.confidence)

    if len(a_rules) <= 20:
        print(f"The association rules are:\n")
    else:
        print(f"The first 20 association rules are:\n")

    for i, rule in enumerate(a_rules.items()):
        if i < 20:
            print(rule)

    print(f"\nA total of {len(a_rules)} association rules was found.")
Example No. 24
def guestlike(guestid):
    guestid = int(guestid)
    idlist, goodslist, actionlist = reader.search()
    dataSet = reader.data_handle(idlist, goodslist)
    L, supportData = apriori.apriori(dataSet, minSupport=0.2)
    rule = apriori.gen_rule(L, supportData, minConf=0.7)
    glike = search.search(search.PowerSetsBinary(search.get(guestid, idlist, goodslist, actionlist)), rule)
    print(glike)
    guestlike = []
    conn = pymysql.connect(host='wxs.chinaeast.cloudapp.chinacloudapi.cn',
                           user='******',
                           password='******',
                           port=3306,
                           db='demo')
    cur = conn.cursor()
    for i in glike:
        j = str(i) + '%'
        sql3 = " SELECT `goods_id`,`goods_name`,`goods_price` FROM goods_information " \
               "WHERE  `goods_id` like '%s'" % (j)
        cur.execute(sql3)
        u3 = cur.fetchone()
        guestlike.append(u3)
    conn.close()
    return guestlike
Example No. 25
#dataSet=[line.split() for line in f.readlines()]
#f.close()

dataSet=[line.split() for line in open('C:\\Users\\GYN\\Desktop\\lxx\yibin_data\\SourceDrugNum20160125_asc.txt').readlines()]

# code by Adu
#dataSet = reduce(lambda x,y:x|y,[set(i.strip().split(',')) for i in open('C:\\Users\\GYN\\Desktop\\lxx\yibin_data\\drug_allClass.txt')])

print dataSet

#//  test print utf-8 data into chinese character
#f2 = codecs.open('C:\\Users\\GYN\\Desktop\\lxx\yibin_data\\apriori_result\\testwrite.txt','a','utf-8')
#for line in dataSet:
    #f2.writelines(str(line).encode('gbk')+'\n')

L,suppData=apriori.apriori(dataSet,minSupport=0.0005) #1/2124=0.00047

#file_object=open('F:\\test_result.txt', mode='w')

#strresult = str(suppData)
#file_object.write(strresult)
#file_object.close()
print "ok"
#print 'suppData=',suppData
#print 'L=',L

rules = apriori.generateRules(L,suppData,minConf=0.0005)

#decodedRules = rules.decode("unicode-escape")   
        
print 'rules='
Example No. 26
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from apriori import apriori, association_rules
from gzip import GzipFile
dataset = [[int(tok) for tok in line.strip().split()]
           for line in GzipFile('retail.dat.gz')]
freqsets, baskets = apriori(dataset, 80, maxsize=5)
nr_transactions = float(len(dataset))
for ant, con, base, pyx, lift in association_rules(dataset, freqsets, baskets, 30):
    print('{} | {} | {} ({:%}) | {} | {} | {}'
          .format(ant, con, len(baskets[con]), len(baskets[con]) / nr_transactions, len(baskets[ant]), len(baskets[con | ant]), int(lift)))
Example No. 27
import preprocess

DATA_FILE = '../data/diagnosis.data'
OUTPUT_FILE = '../data/diagnosis.csv'
MIN_SUPPORT = 0.2
MIN_CONFIDENT = 0.6
MIN_LIFT = 3.0

# prepocess the original data file
preprocess.preprocess(DATA_FILE, OUTPUT_FILE)

# get the data_set from .csv file
data_set = apriori.load_dataset(OUTPUT_FILE)

# get frequent items and their support value
f, f_support = apriori.apriori(data_set, MIN_SUPPORT)

# generate the rules
rules = apriori.gen_rules(f, f_support, MIN_CONFIDENT, MIN_LIFT)

# discard duplicate rules
# if rule A's lhs and rhs is the subset of rule B's
# and rule A's lift is less than B's
discard_rules = []
for i in range(len(rules)):
    rule_a = rules[i]
    for j in range(len(rules)):
        if i == j or i in discard_rules or j in discard_rules:
            continue
        rule_b = rules[j]
        if(rule_a['lhs'].issubset(rule_b['lhs']) and rule_a['rhs'].issubset(rule_b['rhs']) and rule_a['lift'] <= rule_b['lift']):
            discard_rules.append(i)  # rule A is subsumed by rule B and has no higher lift, so mark it for removal
Example No. 28
data = []
for i in range(len(values)):
    temp = []
    for j in range(len(values[0])):
        if values[i][j] == 1:
            temp.append(j)
    data.append(temp)
counts = []
for index in columns:
    line = df[index]
    count = 0
    for i in range(len(line)):
        if line[i] == 1:
            count += 1
    counts.append((float)(count) / 10000)
counts.sort()
minSupport = counts[len(counts) * 1 / 5]

#use apriori
L, supportData = ap.apriori(data, minSupport)
rules = ap.generateRules(L, supportData, minConf=0.4)

#use fpGrowth

minSup = minSupport * 10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
print myFreqList
Example No. 29
    def fp_growth(self, transactions, support):
        return pyfpgrowth.find_frequent_patterns(transactions, support)


if __name__ == '__main__':
    sys.setrecursionlimit(80000)
    train_set = pd.read_csv('GSM/new2gtrain.csv')

    a_time = []
    f_time = []
    for i in range(1, 10):
        train = train_set.head(i * 100).groupby('IMSI')['GridID'].apply(list)
        train = map(lambda a: list(set(a)), train)

        a_start = time.time()
        apriori(train, minSupport=1.0)
        a_end = time.time()
        print a_end - a_start
        a_time.append((a_end - a_start) * 1000)

        f = FPGrowthProcessor()
        f_start = time.time()
        f.fp_growth(train, 1)
        f_end = time.time()
        print f_end - f_start
        f_time.append((f_end - f_start) * 1000)

    x = [100, 200, 300, 400, 500, 600, 700, 800, 900]
    plt.figure(figsize=(8, 4))
    plt.plot(x, a_time, label="apriori", color="red", linewidth=2)
    plt.plot(x, f_time, color='blue', label="fpgrowth")
Example No. 30
reload(apriori)
dataSet = apriori.loadDataSet()  # load the data
dataSet
C1 = apriori.createC1(dataSet)  # build C1, the candidate 1-itemsets of the dataset
C1
D = list(map(set, dataSet))  # convert each transaction into a set, stored in a list
D
L1, supportData0 = apriori.scanD(
    D, C1, 0.5)  # with a 0.5 support threshold, compute the support of every candidate and return the qualifying set L1
L1
supportData0

# generate the frequent itemsets from the support threshold
reload(apriori)
L, supportData = apriori.apriori(dataSet)
L  # frequent itemsets with support above 0.5
L[0]  # itemsets containing one element
L[1]  # itemsets containing two elements
L[2]  # itemsets containing three elements
L[3]
apriori.aprioriGen(L[0], 2)  # see how the candidate 'L[1]' is generated before it is compared against the support threshold
L, supportData = apriori.apriori(dataSet, minSupport=0.7)  # a higher support threshold yields fewer results

# generate association rules from the confidence threshold
reload(apriori)
L, supportData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, supportData, minConf=0.7)  # rules generated at confidence 0.7
rules = apriori.generateRules(L, supportData, minConf=0.5)  # rules generated at confidence 0.5

# test how it performs on the mushroom dataset
Example No. 31
from apriori import generate_one_item_set
from apriori import apriori

def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

if __name__ == '__main__':

    dataSet = loadDataSet()
    one = generate_one_item_set(dataSet)

    freq = apriori(dataSet)

    print freq
Example No. 32
print L1
print suppData1
'''
'''
L,suppData=apriori.apriori(dataSet,minSupport=0.5)
rules=apriori.generateRules(L,suppData,minConf=0.7)
print rules
rules=apriori.generateRules(L,suppData,minConf=0.5)
print rules
'''
'''
#-------------- Congressional voting --------------#
actionIdList,billTitles=apriori.getActionIds()
'''
mushDatSet=[line.split() for line in open("C:\Users\YAN\Desktop\Apriori/mushroom.dat").readlines()]
L,suppData=apriori.apriori(mushDatSet,minSupport=0.3)
for item in L[1]:
    # intersection returns the elements common to both sets
    if item.intersection('2'):
        print item

Example No. 33
import apriori

dataMat = apriori.loadDataSet()
print(dataMat)

C1 = apriori.createC1(dataMat)  # candidate 1-itemsets, not the raw dataset
print(C1)

L, supportData = apriori.apriori(dataMat)
print(L)
print(supportData)

apriori.generateRules(L, supportData, 0.5)
Example No. 34
import apriori

apriori.apriori("75000-out1.csv", 0.01)
Example No. 35

import apriori as ap

dataSet = ap.loadDataSet()
#print dataSet
C1 = ap.createC1(dataSet)
#print C1
D = map(set, dataSet)
#print D
L1, suppData0 = ap.scanD(D, C1, 0.5)
#print suppData0
L, S = ap.apriori(D, 0.5)
#print L

print L

List = ap.generateRules(L, S, minConf=0.4)
print List


Example No. 36
'''
Analyse species itemsets
'''
import argparse
import joblib

import pandas as pd
import apriori

import apriori_sequential as asq
import helpers

parser = argparse.ArgumentParser(description='Convert Halias RDF dataset for data mining')
parser.add_argument('minsup', help='Minimum support', nargs='?', type=float, default=0.8)
#parser.add_argument('minconf', help='Minimum confidence', nargs='?', type=float, default=0.8)
args = parser.parse_args()

itemsets = helpers.get_species_itemsets()
all_items = list(set([item for itemset in itemsets for item in itemset]))
print(len(itemsets))
print(len(all_items))

freq_items = apriori.apriori(itemsets, all_items, args.minsup, verbose=True)

print('\nSupport {:.3f} frequent itemsets:\n'.format(args.minsup))
print(len(freq_items))
print(freq_items[-1])

joblib.dump(freq_items, helpers.DATA_DIR + 'freq_species_itemsets_{:.3f}_NEW.pkl'.format(args.minsup))

Example No. 37
def map_meaning (raw_meaning):
    strip = re.compile("\'.*\'")
    parsed = []
    for i in raw_meaning:
        parsed.append(strip.search(i).group())
    return dict(zip(range(0,len(raw_meaning)), parsed))

def get_meaning (i, meaning):
    print meaning[i]

# Extract transactions and meanings
transactions = map_transactions(RAW_DATA[0])
meaning = map_meaning(RAW_MEANING[0])

for threshold in np.arange(0.5, 0.25, -0.05):
    itemsets, support = apriori.apriori(transactions.values(), minSupport=threshold)
    print "THRESHOLD: ", threshold
    print len(itemsets), "itemsets of length:"
    print [len(i) for i in itemsets]
    print "\n"

itemset, support = apriori.apriori(transactions.values(), minSupport=0.3)
for threshold in np.arange(0.7, 0.99, 0.05):
    print "THRESHOLD: ", threshold
    rules = apriori.generateRules(itemset, support, minConf=threshold)
    print "\n"

def get_meaning (rule, meaning):
    condition, result = [], []
    for c in rule[0]:
        condition.append(meaning[c])
Example No. 38
def test():
    dataSet = apriori.loadDataSet()
    print "DataSet:", dataSet
    L,suppData = apriori.apriori(dataSet)
    rules = apriori.generateRules(L, suppData, minConf=0.5)
    print rules
Example No. 39
# coding:utf-8

import apriori

# discover frequent itemsets and association rules

dataSet = apriori.loadDataSet()
print(dataSet)

C1 = apriori.createC1(dataSet)
print(C1)

D = map(set, dataSet)
print(D)

L1, suppData0 = apriori.scanD(D, C1, 0.5)
print(L1)

L, suppData = apriori.apriori(dataSet)
print(L)

L, suppData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, suppData, minConf=0.7)
print rules

rules = apriori.generateRules(L, suppData, minConf=0.5)
print rules
Example No. 40
def test2():
    mushDataSet = [line.split() for line in open('mushroom.dat').readlines()]
    L,suppData = apriori.apriori(mushDataSet, minSupport=0.3)
    for item in L[1]:
        if item.intersection('2'):
            print item
Example No. 41
# -*- coding: utf-8 -*-
"""
Module implementing Dialog.
"""
import sys
from PyQt4 import QtGui
from PyQt4 import QtCore
from PyQt4.Qt import *
from Ui_mainWindow import Ui_MainWindow
import jobDB
import apriori
import re
from Login import login

apr = apriori.apriori()
db = jobDB.jobDB()


# centered display in the list content area
class CenterDelegate(QtGui.QItemDelegate):
    def __init__(self, parent=None):
        QtGui.QItemDelegate.__init__(self, parent)

    def paint(self, painter, option, index):
        painter.save()
        #painter.drawText(option.rect, Qt.AlignCenter, index.data(Qt.DisplayRole).toString())
        painter.drawText(option.rect, Qt.TextWordWrap | Qt.AlignHCenter,
                         index.data(Qt.DisplayRole).toString())
        painter.restore()

Example No. 42
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from apriori import apriori, association_rules
from gzip import GzipFile

# Load the data; the format is a two-dimensional list, e.g. [[1,3],[22,1]], where the first dimension is the basket and the second the items
dataset = [[int(tok) for tok in line.strip().split()]
           for line in GzipFile('retail.dat.gz')]
freqsets, support = apriori(dataset, 80, maxsize=16)
rules = list(association_rules(dataset, freqsets, support, minlift=30.0))
rules.sort(key=(lambda ar: ar.lift),reverse=True)
for ar in rules:
    print('{} -> {} (lift = {:.4})'
          .format(set(ar.antecendent),
                    set(ar.consequent),
                    ar.lift))
Example No. 43
    #rules = generateRules(L, suppData, minConf=0.7)
    #print 'rules:\n', rules
    with open("xss-train.txt") as f:
        for line in f:
            #/discuz?q1=0&q3=0&q2=0%3Ciframe%20src=http://xxooxxoo.js%3E
            index=line.find("?")
            if index>0:
                line=line[index+1:len(line)]
                #print line
                tokens=re.split('\=|&|\?|\%3e|\%3c|\%3E|\%3C|\%20|\%22|<|>|\\n|\(|\)|\'|\"|;|:|,|\%28|\%29',line)
                #print "token:"
                #print tokens
                myDat.append(tokens)
        f.close()

    L, suppData = apriori(myDat, 0.15)
    rules = generateRules(L, suppData, minConf=0.6)
    #print 'rules:\n', rules
# -*- coding:utf-8 -*-

import sys
import urllib
import urlparse
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import HTMLParser
import nltk


# minimum length of a parameter value to process
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from apriori import apriori, association_rules
from gzip import GzipFile

# Load dataset
dataset = [[int(tok) for tok in line.strip().split()]
           for line in GzipFile('retail.dat.gz')]

freqsets, support = apriori(dataset, 80, maxsize=16)
rules = list(association_rules(dataset, freqsets, support, minlift=30.0))

rules.sort(key=(lambda ar: -ar.lift))
for ar in rules:
    print('{} -> {} (lift = {:.4})'
          .format(set(ar.antecendent),
                    set(ar.consequent),
                    ar.lift))
Example No. 45
# import libraries
import sys

sys.path.append('../chapter4')
import pandas as pd
from graphviz import Digraph
import apriori

# define the data file
fileName = 'association.txt'

# run the association analysis with the custom apriori module
minS = 0.1  # minimum support threshold
minC = 0.38  # minimum confidence threshold
dataSet = apriori.createData(fileName)  # load the formatted dataset
L, suppData = apriori.apriori(dataSet, minSupport=minS)  # itemsets that satisfy the minimum support
rules = apriori.generateRules(fileName, L, suppData,
                              minConf=minC)  # rules that satisfy the minimum confidence

# evaluate the association results
model_summary = 'data record: {1} \nassociation rules count: {0}'  # number of records and of rules satisfying the thresholds
print(model_summary.format(len(rules), len(dataSet)))  # formatted output via str.format
df = pd.DataFrame(
    rules,
    columns=['item1', 'item2', 'instance', 'support', 'confidence',
             'lift'])  # build a DataFrame of the frequent rules
df_lift = df[df['lift'] > 1.0]  # keep only rules with lift > 1
print(df_lift.sort_values('instance', ascending=False))  # print the sorted DataFrame

# visualize the association results
dot = Digraph()  # create a directed graph
Example No. 46
import apriori
transactions = [('elma', 'muz', 'dondurma', 'simit'),
                ('elma', 'muz', 'simit'),
                ('yumurta', 'simit'),
                ('yumurta', 'erik'),
                ('elma', 'muz'),
                ('elma', 'muz', 'yumurta')]
print(apriori.apriori(transactions))
Example No. 47
import os
import django
import sys
pro_dir = os.getcwd()
sys.path.append(pro_dir)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BioDesigner.settings")

import fpTree
import igemRecomdData
import apriori

from design.models import team_parts

def getPartData():
    result = list()
    tList = team_parts.objects.all().distinct().values_list('team_id', flat=True)[:100]
    for t in tList:
        pList = team_parts.objects.filter(team_id=t).values_list('part_id', flat=True)
        result.append(pList)
    #for i in result:
        #print i
    return result

if __name__ == '__main__':
    django.setup()
    l,m = apriori.apriori(getPartData())
    print m
    print l
Example No. 48
print dataSet
C1 = apriori.createC1(dataSet)
print "C1"
print C1

D=map(set, dataSet)
print "D"
print D

L1, suppData0 = apriori.scanD(D, C1, 0.5)
print "L1"
print L1
print "suppData0"
print suppData0


L,suppData = apriori.apriori(dataSet, minSupport=0.5)
print "L"
print L
print "suppData"
print suppData

rules = apriori.generateRules(L, suppData, minConf=0.7)
print "rules"
print rules

rules = apriori.generateRules(L, suppData, minConf=0.5)
print "rules"
print rules

Example No. 49
apriori.NUM_CORES = 1


MINSUP = args.minsup

itemsets = helpers.read_observation_basket(helpers.DATA_DIR + 'observation.basket')

all_items = list(set([item for itemset in itemsets for item in itemset]))

print(len(itemsets))
print(len(all_items))
#print(itemsets[:1])

print('\nSupport {:.3f} frequent itemsets:\n'.format(MINSUP))

freq_items = apriori.apriori(itemsets, all_items, MINSUP, verbose=True)

print(freq_items[-1])
print(len(freq_items))

joblib.dump(freq_items, helpers.DATA_DIR + 'freq_items_{:.3f}.pkl'.format(MINSUP))

ruler = RuleGenerator(itemsets, freq_items)

rules = ruler.rule_generation(0.5) #, fixed_consequents=[('varis',)])

print(len(rules))

joblib.dump(rules, helpers.DATA_DIR + 'freq_rules_{:.3f}.pkl'.format(MINSUP))

#for (rule, conf) in rules:
Example No. 50
# The Apriori principle reduces the number of candidate itemsets worth considering: if an itemset is frequent,
# then all of its subsets are frequent. Conversely, if an itemset is infrequent, all of its supersets are infrequent.


# Question 1: in the association-rule step, when an itemset has three elements, why are only rules of the form
# one item -> two items evaluated, and not two items -> one item?

import apriori
from votesmart import votesmart

dataSet = apriori.loadDataSet()
#C1 = apriori.createC1(dataSet)
#print ("C1 is %s"  % C1)
#D = map(set,dataSet)
#print ( "%r"  % D)
#L1,suppData0 = apriori.scanD(list(D), list(C1), 0.5)
#print (L1)
#print (suppData0)

L,suppData = apriori.apriori(dataSet, 0.5)
print ("L is" , L)
print ("suppData is" , suppData)
#L is [[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})], [frozenset({3, 5}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 3})], [frozenset({2, 3, 5})], []]
#suppData is {frozenset({5}): 0.75, frozenset({3}): 0.75, frozenset({2, 3, 5}): 0.5, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({1}): 0.5, frozenset({1, 3}): 0.5, frozenset({2}): 0.75}

# association rule mining
rules = apriori.generateRules(L, suppData, 0.7)

print ("rules is " ,rules)
Example No. 51
#!/usr/bin/env python
# encoding: utf-8

import apriori
import codecs

f=codecs.open('/home/will/data/search_result_cn_phone.txt','r','utf-8')

id_word={}
for x in f:
    temp=x.split('\t')
    if len(temp)==4:
        temp1=temp[1].split(',')
        for y in temp1:
            if y not in id_word:
                id_word[y]=[temp[0]]
            else:
                id_word[y].append(temp[0])
'''
with codecs.open('/home/will/data/temp.txt','w','utf-8') as wf:
    for key,value in id_word.iteritems():
        wf.write(str(key)+'\t'+'\t'.join(s for s in value)+'\n')
'''
l,support_data=apriori.apriori(id_word.values(),minSupport=0.001)
print '#########################################################'
print l

Example No. 52
	def test_apriori(self):
		result_dict = apriori.apriori(small_trans, min_sup)
		self.assert_small_trans_result(result_dict)
Example No. 53
support = 0.4
loadText.importFromFile('spanish_db.txt')
dataset = loadText.rawPriori
#print dataset
C1 = apriori.createC1(dataset)
#print 'C1', C1
D = map(set,dataset)
#print 'D', D
L1, support_data = apriori.scanD(D,C1,support)
#print 'L1', L1
#print 'support_data', support_data
k_length = 2
transactions = apriori.aprioriGen(L1, k_length)
#print 'transactions', transactions
#print '\n*** *** ***'
L,support_data = apriori.apriori(dataset, support)
#print 'L', L
#print 'support_data', support_data
rules = apriori.generateRules(L, support_data, min_confidence=0.7)
#print 'rules', rules

ruleDict = apriori.generateRuleDict(rules)

'''
print 'ruleDict', ruleDict
print '*** *** ***'
'''
print 'keys', ruleDict.keys()
print '*** *** ***'

Example No. 54
data = []
for i in range(len(values)):
	temp = []
	for j in range(len(values[0])):
		if values[i][j] == 1:
			temp.append(j)
	data.append(temp)
counts = []
for index in columns:
	line = df[index]
	count = 0
	for i in range(len(line)):
		if line[i]==1:
			count += 1
	counts.append((float)(count)/10000)
counts.sort()
minSupport = counts[len(counts)*1/5]

#use apriori 
L,supportData = ap.apriori(data,minSupport)
rules = ap.generateRules(L,supportData,minConf=0.4)

#use fpGrowth

minSup = minSupport*10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
print myFreqList
Example No. 55
def leer_documento(id):
	"""
	Reads a document given its identifier.
	"""
	s = ""
	with codecs.open("documentos_keywords/" + str(id) + ".txt", "r", "utf-8-sig") as f:
		s=f.read()
	return s.split()

def cargar_datos(N):
	"""
	Loads N documents, to run the experiments.
	"""
	data = []
	for i in range(1, N + 1):
		data.append(leer_documento(i))
	return data


if __name__ == "__main__":
    # Read the input arguments
    N = int(sys.argv[1])
    support = float(sys.argv[2])

    # Load the data
    dataset = cargar_datos(N)
    # Apply the apriori algorithm
    L, support_data = apriori.apriori(dataset, minsupport = support)
    # Generate the rules
    apriori.generateRules(L, support_data, min_confidence = 0.0)


Example No. 56
import apriori

dataSet = apriori.loadDataSet()
L, supportData = apriori.apriori(dataSet, minSupport=0.1)

print "[result]-----------------------------------------"
rules = apriori.generateRules(L, supportData, minConf=1.0)
# the current data-set isn't in transactional format. To convert it into a transactional data-set, we use the following snippet of code:
basket_str=""

for rowNum, row in accident_data.iterrows():
    
    #Break lines
    if (rowNum != 0):
        basket_str = basket_str + "\n"
    #Add the rowid as the first column
    basket_str = basket_str + str(rowNum) 
    #Add columns
    for colName, col in row.iteritems():
           if ( colName != 'Accident_Index'):
               basket_str = basket_str + "," + colName + "=" + str(col)
#print basket_str
basket_file=open("accidents_basket.csv","w")
basket_file.write(basket_str)
basket_file.close()

import csv
with open("accidents_basket.csv","rb") as f:
    reader=csv.reader(f)
    my_list=list(reader)

#my_list
L,supportData=apriori.apriori(my_list,0.6)
f_rules= apriori.generateRules(L,supportData,0.6)

for row in f_rules:
    print list(row[0]), " => ", list(row[1]), row[2]