Example #1
def process(data_path="../data/DBLP.pkl.gz",
            dir_path="../result/group_of_year/"):
    '''
        Function:寻找关联项集。
        Principle: 运行Apirori算法,寻找频繁项集,然后进行关联项集的挖掘
        Use: 用于寻找关联项集
    '''
    _data, stat = load_data()
    s_data = split_by_item(_data, stat, "year")
    # load the data and split it by author, year, etc.
    for s in stat["year"]:
        print("year = %d" % s)
        data = s_data[s]
        _input = []
        for d in data:
            _input.append([])
            for author in d['author']:
                _input[-1].append(author)
        # initialize the Apriori miner
        miner = apriori()
        # mine size-1 frequent itemsets with a minimum support of 3
        miner.L_1(_input, 3)
        # iterate until no more frequent itemsets are found
        while not miner.L[miner.k] == []:
            print("iter %d, size of C = %d, size of L = %d" %
                  (miner.k, len(miner.C[miner.k]), len(miner.L[miner.k])))
            miner.C_k_p_1()
            miner.L_k_p_1(_input, 3)
        # save the result
        path = dir_path + str(s)
        f = gzip.open(path, 'wb')
        pickle.dump(miner.L, f)
        f.close()
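
This example relies on a project-specific apriori class with an L_1 / C_k_p_1 / L_k_p_1 interface that is not shown here. As a rough illustration only, a level-wise miner matching that interface might look like the hypothetical sketch below (the implementation details are assumptions, not the project's actual code; L is kept as a list of levels so that the empty last level ends the mining loop):

from itertools import combinations

class apriori:
    """Minimal level-wise frequent-itemset miner matching the interface used above (sketch)."""
    def __init__(self):
        self.k = 1
        self.C = [[]]   # C[i] = candidate itemsets of size i (index 0 unused)
        self.L = [[]]   # L[i] = frequent itemsets of size i (index 0 unused)

    def _frequent(self, candidates, transactions, min_support):
        # keep the candidates contained in at least min_support transactions
        kept = []
        for cand in candidates:
            support = sum(1 for t in transactions if set(cand) <= set(t))
            if support >= min_support:
                kept.append(cand)
        return kept

    def L_1(self, transactions, min_support):
        # level 1: every distinct item is a candidate
        items = sorted({item for t in transactions for item in t})
        self.C.append([[item] for item in items])
        self.L.append(self._frequent(self.C[1], transactions, min_support))
        self.k = 1

    def C_k_p_1(self):
        # join step: build (k+1)-item candidates from pairs of frequent k-itemsets
        prev = [frozenset(itemset) for itemset in self.L[self.k]]
        unions = {a | b for a, b in combinations(prev, 2) if len(a | b) == self.k + 1}
        self.C.append([sorted(u) for u in unions])

    def L_k_p_1(self, transactions, min_support):
        # prune step: keep the (k+1)-candidates that meet the support threshold
        self.L.append(self._frequent(self.C[self.k + 1], transactions, min_support))
        self.k += 1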
Example #2
import os
# makeFileName, extractSent, makeBagOfWords, apriori, checkSpan, checkOrder,
# getTopLists and familyReport are assumed to be imported from the project's modules

def main():
    
    """
    Step 0:
    makeFileName() will take the directory where medical records are stored and ...
    returns path to text file that stores file paths to Medical records.
    
    """
    filenames = makeFileName()
    
    print "\n\nstep 0 done... 6 more to go "
    print "From the directory of data files : a file containing their path is generated."
    print "=========================================================================\n\n"
    
    """
    Step 1:
    extractSent(filename) will take the text file containing file paths to Medical records..
    writes 3 files and returns their paths :
    -- file1 = Candidate sentences (stop words included)
    -- file2 = Candidate sentences (stop words removed and Record Id added)
    -- file3 = Run record file -that writes summary of the run
    """   

    fullSentFile, sentFile, runFile = extractSent(filenames)
    print "step 1 done... 5 more to go "
    print " This step writes three files: "
    print "1) candidate sentences (full i.e including stop words)"
    print "2) candidate sentences (no stop words and contains Id -indicating medical record)"
    print "3) Run file that summarizes the run process"
    print "================================================================================\n\n"
    """
    Step 2:
    makeBagOfWords() takes the sentence file with ids and makes bag of words file 
    This is a csv file.
    Returns the path to the file.
    """
    bagFile = makeBagOfWords(sentFile)
    print "step 2 done... 4 more to go "
    print "This step creates bag of words-csv file for Apriori alg."
    print "=================================================================================\n\n"
    
    """
    Step 3:
    apriori - takes the bag of the word calculates the sets that satisfy the support .
    apriori(dataset, idList, min_support=5, verbose=True)
    Returns : file path as string
    """
 
    mydirectory = os.path.dirname(bagFile) # get the directory of bagFile
    apOut = apriori(bagFile, mydirectory, min_support=5, verbose=False)
    print apOut
    print "step 3 done... 3 more to go "
    print "This step takes bag of words with ids and runs apriori algorithm."
    print "All association sets that satisfy support> 5 are generated ."
    print "ID's are used to count the support at the Report level"
    print "=================================================================================\n\n"
    
    """
    Step 4:
    checkspan():
    
    """
    filespan3 = checkSpan(fullSentFile, apOut, mydirectory, 3)
    print "3-span satisfying sets stored in " + filespan3
    filespan5 = checkSpan(fullSentFile, apOut, mydirectory, 5)
    print "5-span satisfying sets stored in " + filespan5
    filespan10 = checkSpan(fullSentFile, apOut, mydirectory, 10)
    print "10-span satisfying sets stored in " + filespan10
    
    print "step 4 done... 2 more to go "
    print "Checks if association words lie in k span. "
    print "K values 3, 5 and 10 are checked - generates 3 files"
    print "=================================================================================\n\n"
    
    """
    Step 5:
    OrderCheck():
    
    """
    freq_3 = checkOrder(fullSentFile, filespan3, mydirectory, 3)
    freq_5 = checkOrder(fullSentFile, filespan5, mydirectory, 5)
    freq_10 = checkOrder(fullSentFile, filespan10, mydirectory, 10)
    
    print "step 5 done... 1 more to go "
    print " The association sets permutations for e.g. {mother, diabetes} and {diabetes, mother} are checked for frequency in candidate sentences"
    print " Two files for each run : "
    print "1) one with all permutations"
    print "2) ordered Wordlists "
    print "for each span (3,5,10) are generated."
    print "=================================================================================\n\n"
    
    
    
    getTopLists(freq_3, 3, mydirectory)
    getTopLists(freq_5, 5, mydirectory)
    getTopLists(freq_10, 10, mydirectory)
    
   
    print "Step 6 part 1 done : "
    print "5 files are generated , each answering the respective questions: "
    print " a.txt : contain at least one family member?"
    print " b.txt : contain at least one of the diseases? (disease file in package)"
    print " c.txt : one family member but no disease? "
    print " d.txt : both a family member and a disease? "
    print " e.txt : neither a family nor a disease?"
    
    print "check out Put files in : " + mydirectory 
    print "=================================================================================\n\n"
    
      
    familyReport(freq_3, 3, mydirectory)
    familyReport(freq_5, 5, mydirectory)
    familyReport(freq_10, 10, mydirectory)
    
    print "Step 6 part 2 done : PROCESS COMPLETE !! "
    print "=================================================================================\n\n"
Example #3
__author__ = 'Mentu'

from Apriori import loadDataset
from Apriori import apriori
from Apriori import generateRules

dataset = loadDataset()
L, supportdata = apriori(dataset)

rules = generateRules(L, supportdata)
print(rules)
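
Assuming generateRules returns (antecedent, consequent, confidence) triples, as in the widely used "Machine Learning in Action" style Apriori implementation this snippet resembles, the rules can also be filtered and sorted before printing; the threshold and tuple layout below are assumptions, not part of the original example:

# hypothetical post-processing; the (antecedent, consequent, confidence) layout is an assumption
strong = [r for r in rules if r[2] >= 0.8]
for antecedent, consequent, confidence in sorted(strong, key=lambda r: r[2], reverse=True):
    print("%s => %s (confidence %.2f)" % (set(antecedent), set(consequent), confidence))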
Example #4
__author__ = 'simon.hughes'

import GwData
import WordTokenizer
from Apriori import apriori, print_rules

from sklearn.ensemble import GradientBoostingClassifier

code = "50"

data = GwData.GwData()
xs = WordTokenizer.tokenize(data.documents, spelling_correct=False)
ys = ["CODE_50" if y == 1 else "NOT_50" for y in  data.labels_for(code)]

inputs = [ x + [y] for x,y in zip(xs, ys) ]

rules, support_data= apriori(inputs, min_support=0.025, max_k=5)
print_rules(rules, support_data, inputs, min_size=2)

#print len(inputs)
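
In this example the class label ("CODE_50" / "NOT_50") is appended to each tokenized document as an extra item, so frequent itemsets containing the label act like class-association rules. Assuming rules is an iterable of set-like itemsets (the custom Apriori module's exact return type is not shown), one could pull out only the itemsets tied to the positive code, for example:

# hypothetical filter; assumes each element of rules is a set-like collection of items
code_50_itemsets = [itemset for itemset in rules
                    if "CODE_50" in itemset and len(itemset) >= 2]
print(len(code_50_itemsets))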
Example #5
def RunningExpr02():
	# mine frequent itemsets from the transactions returned by MainFunc4(), with minimum support 0.02
	OurputRslt = apriori(MainFunc4(), 0.02)
	from operator import itemgetter
	# print itemsets with more than one item, sorted by support and then itemset, descending
	for item in sorted(OurputRslt, key=itemgetter(1, 0), reverse=True):
		if len(item[0]) > 1:
			print str(item[0]).replace('[', '').replace(']', '').replace('\'', '') + '\t' + str(item[1])
Example #6
# -*- coding: utf-8 -*-
"""
Created on Wed May 20 09:26:13 2020

@author: bululu
"""


from Apriori import apriori, generateRules
import pandas as pd
import numpy as np

datafile = r'visit-patterns-by-census-block-group\cbg_patterns.csv'
# load the data
data = pd.read_csv(datafile, encoding='utf-8')
data.info()
brand = data['top_brands']
print(brand)
dataSet = brand
L, suppData = apriori(dataSet)
i = 0
for one in L:
    print("Frequent itemsets of size %s:" % (i+1), one, "\n")
    i += 1
 print "minConf=0.7时:"
    rules = generateRules(L,suppData, minConf=0.7)

    print "\nminConf=0.5时:"
    rules = generateRules(L,suppData, minConf=0.5)
    plot(rules,method='graph')
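
In the SafeGraph census block group patterns data, the top_brands column is typically stored as a JSON-like string of brand names rather than a Python list, so passing the raw column to apriori may not yield per-row transactions; a minimal, hypothetical preprocessing sketch (the exact column format is an assumption):

import json

def to_transactions(series):
    # parse each 'top_brands' cell (assumed to look like '["BrandA", "BrandB"]')
    # into a list of brand names, skipping empty or malformed rows
    transactions = []
    for raw in series.dropna():
        try:
            brands = json.loads(raw)
        except (TypeError, ValueError):
            continue
        if brands:
            transactions.append(brands)
    return transactions

# dataSet = to_transactions(brand)
# L, suppData = apriori(dataSet)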
Example #8
def process(data_path = "../data/DBLP.pkl.gz",
            dir_path = "../result/topic_of_year/"):
    '''
        Function:寻找关联项集。
        Principle: 运行Apirori算法,寻找频繁项集,然后进行关联项集的挖掘
        Use: 用于寻找关联项集
    '''
    _data, stat = load_data()
    s_data = split_by_item(_data, stat, "year")
    for s in s_data.keys():
        # load the data and split it by author, year, etc.
        s_data[s] = split_by_item(s_data[s], stat, "Conference")
        for s_ in s_data[s].keys():
            print ("year = %d, Conference = %s" % (s, s_))
            data = s_data[s][s_]
            _input = []
            # remove some tokens using a stop-word list
            for d in data:
                _input.append([])
                str_ = re.sub("[\:\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+",
                           "", d["title"])
                del_list = ["a", "an", "the", "in", "of", "with", "on", "for",
                            "and", "to", "under", "beyond", "as", "from", "-",
                            "by", "using", "when", "is", "are", "about", "at",
                            "via", "aaai", "papers", "usa", "arizona"]
                words = str_.split()
                for word in words:
                    if word.lower() not in del_list:
                        _input[-1].append(word.lower())
            # initialize the Apriori miner
            miner = apriori()
            # use a minimum support of 5 papers
            miner.L_1(_input, 5)
            # iterate until no more frequent itemsets are found
            while not miner.L[miner.k] == []:
                print("iter %d, size of C = %d, size of L = %d"
                      % (miner.k, len(miner.C[miner.k]), len(miner.L[miner.k])))
                miner.C_k_p_1()
                miner.L_k_p_1(_input, 5)
            # save the result
            path = dir_path + str(s) + "_" + str(s_)
            f = gzip.open(path, 'wb')
            pickle.dump(miner.L, f)
            print ("result saved at %s" % path)
            f.close()
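
# get_L is not defined in this snippet; a minimal sketch, assuming it simply reloads the
# pickled miner.L that process() wrote with gzip + pickle:
def get_L(path):
    # load and return the list of frequent-itemset levels saved by process()
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)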

def get_result(dir_path="../result/topic_of_year/"):
    '''
        Function: read the results from file.
        Principle: after the mining has finished, read the results from file for display.
    '''
    files = os.listdir(dir_path)
    ret = {}

    for file in files:
        #print(dir_path + file)
        year, conf = file.split("_")
        if conf not in ret.keys():
            ret[conf] = {}
        ret[conf][int(year)] = get_L(dir_path + file)
        
    return ret
    
def get_topic(res):
    '''
        Function: after the mining has finished, extract the topic information.
    '''
    ret = {}
    for k in res.keys():
        ret[k] = {}
        for k_ in res[k].keys():
            # the last level of miner.L is empty, so take the largest non-empty frequent itemsets
            ret[k][k_] = res[k][k_][-2]
    return ret
    
def save_topic_txt(ts, path="../result/topic.txt"):
    '''
        Function: save the grouping information to a txt file.
    '''
    f = open(path, "w", encoding="utf8")
    for key in ts.keys():
        f.write("\n\nConference %s:\n" % key)
        for key_ in ts[key].keys():
            f.write("year %d:\n" % key_)
            for g in range(len(ts[key][key_])):
                f.write(("%d, " % (g+1)))
                for words in ts[key][key_][g]:
                    f.write(words + " ")
                f.write("\n")
    f.close()
    
    
if __name__ == "__main__":
    # run the association-rule mining
    process()
    Ls = get_result()
    ts = get_topic(Ls)
    save_topic_txt(ts)
    # display the results
    for key in ts.keys():
        print ("\nConference %s:\n" % key)
        for key_ in ts[key].keys():
            print ("year %d:" % key_)
            for g in range(len(ts[key][key_])):
                print (("%d" % (g+1)), end=". ")
                for words in ts[key][key_][g]:
                    print (words, end=', ')
                print ("")