def test_fpgrowth(self):
    data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
    rdd = self.sc.parallelize(data, 2)
    model1 = FPGrowth.train(rdd, 0.6, 2)
    # use default data partition number when numPartitions is not specified
    model2 = FPGrowth.train(rdd, 0.6)
    self.assertEqual(sorted(model1.freqItemsets().collect()),
                     sorted(model2.freqItemsets().collect()))
def prepare_fpgrowth_data(self):
    tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').filter("BILL_AMTFLAG = '+'") \
        .select('ACCTNBR', 'MER_CAT_CD') \
        .filter("MER_CAT_CD != 0").filter("MER_CAT_CD != 6013")

    result = tran_df.map(lambda x: (str(int(x['ACCTNBR'])), [str(int(x['MER_CAT_CD'])), ])).groupByKey()

    def m(x):
        k = x[0]
        l = list(x[1])
        v = set()
        for i in l:
            v.add(i[0])
        return set(v)

    result = result.map(m)

    for i in result.take(10):
        print(i)

    model = FPGrowth.train(result, minSupport=0.05, numPartitions=10)
    result = model.freqItemsets().collect()
    for r in result:
        print(r)
def FPGrowthRDD(transactionsRDD, minSupport=0.2, numPartitions=10):
    '''
    perform the FPGrowth algorithm
    '''
    model = FPGrowth.train(transactionsRDD, minSupport=minSupport, numPartitions=numPartitions)
    return model.freqItemsets()
def get_most():
    print("get most")
    my_spark = pyspark.sql.SparkSession \
        .builder \
        .appName("RESTAPI_most_frequent") \
        .master("local[2]") \
        .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/conception.factures") \
        .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/conception.factures") \
        .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1') \
        .config("spark.executor.memory", "1G") \
        .config("spark.driver.memory", "5G") \
        .getOrCreate()

    df = my_spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
    df.show()

    transactions = df.groupBy("_id") \
        .agg(functions.collect_list("articles.product_name").alias("name")) \
        .rdd \
        .flatMap(lambda x: x.name)
    transactions.collect()

    model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
    result = model.freqItemsets().collect()
    return json.dumps(result)
def _train_fp_growth_model(cls, data_store, eco_to_package_topic_dict, min_support_count,
                           additional_path, fp_num_partition):
    sc = SparkContext()
    manifest_file_list = data_store.list_files(prefix=additional_path + MANIFEST_FILEPATH)
    list_of_topic_list = list()
    for manifest_file in manifest_file_list:
        eco_to_package_list_json_array = data_store.read_json_file(manifest_file)
        for eco_to_package_list_json in eco_to_package_list_json_array:
            ecosystem = eco_to_package_list_json.get(MANIFEST_ECOSYSTEM)
            list_of_package_list = eco_to_package_list_json.get(MANIFEST_PACKAGE_LIST)
            for package_list in list_of_package_list:
                package_list_lowercase = [x.lower() for x in package_list]
                topic_list = cls.get_topic_list_for_package_list(
                    package_list_lowercase, ecosystem, eco_to_package_topic_dict)
                list_of_topic_list.append(topic_list)

    transactions = sc.parallelize(list_of_topic_list)
    transactions.cache()
    min_support = float(min_support_count / float(transactions.count()))
    model = FPGrowth.train(transactions, minSupport=min_support, numPartitions=fp_num_partition)
    return model
def chercher_produits():
    products = df.rdd.map(lambda x: x.produits)
    model = FPGrowth.train(products, minSupport=0.4, numPartitions=5)
    result = list(
        set(model.freqItemsets().flatMap(lambda itemset: itemset.items).collect()))
    print("Frequent items: %s" % str(result))
    return jsonify(result)
def getConfident(self):
    f = udf(lambda x: float(len(x)), FloatType())
    rdd = self.df.rdd.flatMap(lambda x: x[0])
    model = FPGrowth.train(rdd, self.support, 2)
    rules = model._java_model.generateAssociationRules(self.confidence).collect()
    ls = [[i.javaAntecedent()[0], i.javaConsequent()[0], i.confidence()]
          for i in rules if len(i.javaAntecedent()) == 1]
    return spark.createDataFrame(ls, ['l', 'r', 'confidencePositive'])
def writeToFile(rdd):
    with open("count.txt", "w") as f:
        f.write(str(rdd.count()))
    rdd_words = rdd.map(lambda line: list(
        filter(lambda a: a != "" and a not in stop_words,
               list(set(line.strip().split(' ')))))).filter(lambda x: x != [])
    model = FPGrowth.train(rdd_words, minSupport=0.02, numPartitions=20)
    result = model.freqItemsets().collect()
    with open("frequent_items.txt", "w") as g:
        for i in range(5):
            g.write(json.dumps(result[i].items) + "\n")
def run_FPM(tweets, collection):
    model = FPGrowth.train(tweets.select("filtered").rdd.map(lambda x: x[0]), minSupport=0.02)
    result = sorted(model.freqItemsets().collect(), reverse=True)
    # sort the result in reverse order
    sorted_result = sorted(result, key=lambda item: int(item.freq), reverse=True)
    # save output to file
    with codecs.open(globals.FP_dir + "/" + time.strftime("%Y%m%d-%H%M%S") + '_' +
                     collection["Id"] + '_' + collection["name"] + '.txt',
                     'w', encoding='utf-8') as file:
        for item in sorted_result:
            file.write("%s %s\n" % (item.freq, ' '.join(item.items)))
def similar_items_for_type(rdd, index, type):
    print("Calculating similar items for type: ", type)
    new_rdd = rdd.map(lambda row: [row[0]] + (row[index] if row[index] else []))
    new_rdd = new_rdd.map(lambda row: list(set(row)))
    model = FPGrowth.train(new_rdd, minSupport=0.001, numPartitions=4)
    freq_items_sets = model.freqItemsets().collect()
    item_to_sim = {}
    for freq_item_set in freq_items_sets:
        items = set(freq_item_set[0])
        for item in items:
            item_to_sim.setdefault(item, set()).update(items.difference(set([item])))
    return item_to_sim
def main(sc, argv):
    inputfile = ''
    outputfile = ''
    sigma = 0
    """
    Parse command line options
    * inputfile: csv file containing the transaction data (delimiter is ' ')
    * outputfile: csv file containing frequent item sets of size 3 or more with support greater than sigma
    * sigma: minimum support required (should be a positive integer)
    """
    try:
        opts, args = getopt.getopt(argv[1:], "hi:o:s:", ["ifile=", "ofile=", "sigma="])
    except getopt.GetoptError:
        usage(argv[0])
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            usage(argv[0])
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-s", "--sigma"):
            sigma = int(arg)
    print 'Input file is ', inputfile
    print 'Output file is ', outputfile
    print 'Support level is ', str(sigma)
    if inputfile == '' or outputfile == '' or sigma <= 0:
        usage(argv[0])
        sys.exit(2)
    data = sc.textFile(inputfile)
    transactions = data.map(lambda line: line.strip().split(' '))
    """Compute minSupport from sigma"""
    minSupport = float((float(sigma)) / transactions.count())
    print "MinSupport = " + str(minSupport)
    model = FPGrowth.train(transactions, minSupport, numPartitions=10)
    result = model.freqItemsets().collect()
    with open(outputfile, "w") as opf:
        for items, freq in result:
            if len(items) >= 3:
                """Write only frequent item sets with 3 or more items"""
                opf.write("%d, %d, %s\n" % (len(items), freq, printItemSet(items)))
def FpGrowthWithFilterByOverall(dataframe, date, product):
    df = dataframe.select([product, date])
    df = df.na.drop()
    transactions_data = df.groupBy(date).agg(
        F.collect_list(product).alias("transactions")).rdd.map(
        lambda x: x.transactions)
    unique_transactions = transactions_data.map(lambda x: list(set(x))).cache()
    model = FPGrowth.train(unique_transactions, 0.2, 10)
    result = model.freqItemsets().collect()
    return result
def generate_freq_items(self):
    """Run Spark FP Growth to get the frequent item sets.

    :return: Only those frequent item sets where len(item_set) is either 4 or 5
    """
    sc = SparkContext.getOrCreate()
    rdd = sc.parallelize(self.all_list_of_package_list, NUM_PARTITIONS)
    model = FPGrowth.train(rdd, MIN_SUPPORT_COUNT, NUM_PARTITIONS)
    freq_item_sets = model.freqItemsets().collect()
    for item_set in freq_item_sets:
        item_set_len = len(item_set.items)
        if item_set_len == 4:
            self.freq_items_4.append(item_set.items)
        elif item_set_len == 5:
            self.freq_items_5.append(item_set.items)
    sc.stop()
def frequent_word_comb(parsed):
    # Create the transaction-like dataset for FP-Growth
    transactions = parsed.map(lambda line: split_tweet(line, True))
    # Run the FPGrowth algorithm with the specified minimum support and number of partitions
    model = FPGrowth.train(transactions, minSupport=0.01, numPartitions=4)
    result = model.freqItemsets().collect()
    # Select only frequent itemsets that have 2 or more elements.
    pair_results = [itemset for itemset in result if len(itemset[0]) >= 2]
    # Sort in descending order of frequency
    pair_results.sort(reverse=True, key=custom_sort)
    f = open('outputfile.txt', "a+")
    for pair in pair_results[:10]:
        f.write("Frequent Itemset: " + str(pair[0]) + "\tFrequency: " + str(pair[1]) + "\n")
    f.close()
def gp_growth_demo(self):
    '''
    r z h k p
    z y x w v u t s
    s x o n r
    x z y m t s q e
    z
    x z y r q t p
    :return:
    '''
    data = self.sc.textFile(self.base + 'sample_fpgrowth.txt')
    transactions = data.map(lambda line: line.strip().split(' '))
    model = FPGrowth.train(transactions, minSupport=0.3, numPartitions=10)
    result = model.freqItemsets().collect()
    for fi in result:
        print(fi)
def FpGrowthWithFilterByCity(dataframe, date, product, city):
    df = dataframe.select([product, date, city])
    df = df.withColumn("_Products_", F.concat(F.col(product), F.lit(","), F.col(city)))
    df = df.na.drop()
    transactions_data = df.groupBy(date).agg(
        F.collect_list("_Products_").alias("transactions")).rdd.map(
        lambda x: x.transactions)
    unique_transactions = transactions_data.map(lambda x: list(set(x))).cache()
    model = FPGrowth.train(unique_transactions, 0.2, 10)
    result = model.freqItemsets().collect()
    return result
def create_model_text(self, data, params):
    minSupport = float(params.get('minSupport', 0.2))
    numPartitions = int(params.get('numPartitions', 10))
    limits = int(params.get('limits', 10))
    transactions = data.map(lambda line: line.strip().split(' '))
    model = FPGrowth.train(transactions, minSupport=minSupport, numPartitions=numPartitions)
    result = model.freqItemsets().collect()
    for index, fi in enumerate(result):
        if index == limits:
            break
        print(str(fi.items) + ':' + str(fi.freq))
def main():
    itemsets_path1 = "./itemsets.csv"
    min_supp = 0.0002
    itemsets = sc.textFile(itemsets_path1).map(
        lambda line: line.strip().split('\t')).map(
        lambda x: list(set(x)))  # items must be unique
    model = FPGrowth.train(itemsets, minSupport=min_supp)  # freq=50
    result = model.freqItemsets().collect()
    with open('./freq.csv', 'w') as fout:
        with open('./frqitems.csv', 'w') as fout2:
            for freqitemset in result:
                it = freqitemset.items  # list
                freq = freqitemset.freq
                for CUI in it:
                    fout.write(CUI + '\t')
                fout.write("\n")
                fout2.write(str(freq))
                fout2.write('\n')
def fit(self, rdd, min_support):
    '''
    Mine frequent itemsets, using `pyspark.mllib.fpm.FPGrowth`

    param:
        `rdd`: PythonRDD, transactions
        `min_support`: float in [0, 1) or int in [1, inf]. If the former, a
            percentage of records; if the latter, an absolute number of records.
            The number of partitions is taken from the RDD itself.
    '''
    self.rdd = rdd
    self.n = rdd.count()
    # Allow for passing "number of records" or percentage
    if min_support >= 1:
        min_support /= self.n
    model = FPGrowth.train(rdd, min_support, rdd.getNumPartitions())
    self.itemsets_df = model.freqItemsets().toDF()
    self._addl_itemset_setup()
def fpgrowth(self):
    '''
    Frequent itemsets of the merchant category types that users spend in
    :return:
    '''
    tran_df = self.spark.load_from_mysql('t_CMMS_CREDIT_TRAN').filter("BILL_AMTFLAG = '+'") \
        .select('ACCTNBR', 'MER_CAT_CD') \
        .filter("MER_CAT_CD != 0").filter("MER_CAT_CD != 6013")

    result = tran_df.map(lambda x: (str(int(x['ACCTNBR'])), [str(int(x['MER_CAT_CD'])), ])).groupByKey()

    def m(x):
        k = x[0]
        l = list(x[1])
        v = set()
        for i in l:
            v.add(i[0])
        return set(v)

    result = result.map(m)

    for i in result.take(10):
        print(i)

    model = FPGrowth.train(result, minSupport=0.05, numPartitions=10)
    result = model.freqItemsets().collect()

    single = []
    many = []
    for r in result:
        if len(r[0]) == 1:
            single.append(r)
        else:
            many.append(r)

    for i in single:
        print(i[0])
    for i in many:
        print(i[0])
def alg_fp_growth(data_set_rdd, threshold, num_of_partitions):
    start = time.time()
    model = FPGrowth.train(data_set_rdd, threshold, num_of_partitions)
    end = time.time()
    itemsets_calculation_time = end - start
    print 'Training took %s seconds' % itemsets_calculation_time

    start = time.time()
    result = model.freqItemsets().collect()
    result = {
        str(sorted(list(set(n.items)))): (set(n.items), n.freq)
        for n in result
    }
    result_copy = {k: v for k, v in result.iteritems()}
    res = {}
    for k, v in result.iteritems():
        if isCis(result_copy, v):
            res[k] = v
    end = time.time()
    collect_and_filter_time = end - start
    print 'Frequent itemsets collection and cis filter took %s seconds' % collect_and_filter_time
    return res, itemsets_calculation_time, collect_and_filter_time
def process_batch(df, epoch_id, topic_name):
    """
    Counts tweets in a batch, runs FPGrowth, and saves output to text file
    """
    # Open file in outputs folder in append mode
    file = open(f'outputs/{topic_name}', 'a')
    now = datetime.now()
    current_time = now.strftime("%d/%m/%y %H:%M:%S")
    file.write(f"Time: {current_time}\n")
    log.info(f"Custom Batch process for {topic_name}")
    log.debug(df.collect())
    log.info(f"Current Time: {current_time}, Epoch ID: {epoch_id}")
    tweet_count = df.count()
    log.info(f"Total tweets in batch: {tweet_count}")
    file.write(f"Total tweets in batch: {tweet_count}\n")
    if tweet_count > 3:
        log.info("Running FPGrowth")
        file.write("Frequent Itemsets:\n")
        # Remove duplicate entries in a row
        transactions = df.rdd.map(lambda line: line.value.split(" "))
        unique = transactions.map(lambda x: list(set(x)))
        model = FPGrowth.train(unique, minSupport=0.3)
        # Sort items based on frequency
        result = sorted(model.freqItemsets().collect(), reverse=True, key=lambda x: x[1])
        for fi in result:
            log.debug(fi)
            file.write(f'{fi}\n')
    else:
        file.write('Not running FPGrowth due to low no. of tweets\n')
    file.write('\n\n')
    file.close()
def fpgrowth(self):
    '''
    frequent itemset mining
    1. get the group of similar customers
    2. list the products these customers are using
    3. run FP-Growth on those product lists
    :return:
    '''
    data = [
        ['1', '1', '2'],
        ['2', '1', '1', '2'],
        ['P1', 'P3'],
        ['P3', 'P5', 'P4', 'P6'],
        ['P4', 'P5']
    ]
    rdd = self.sc.parallelize(data, 2).cache()
    model = FPGrowth.train(rdd, minSupport=0.3, numPartitions=10)
    result = model.freqItemsets().collect()
    for r in result:
        print(r)
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter(
        "description is not NULL").cache()
    # df_jobs = spark.read.json("newjobs4rdd/newjobs.jsonl").filter("description is not NULL").cache()

    tokenizer = Tokenizer(inputCol="description", outputCol="words")
    tokenized = tokenizer.transform(df_jobs)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    words = removed.select("filtered").rdd.map(lambda x: list(
        set(blacklist(lemmatize(strip_punctuation(x.filtered))))))

    model = FPGrowth.train(words, minSupport=0.1, numPartitions=1)
    finalDF = model.freqItemsets().map(
        lambda row: (' '.join(row.items), row.freq)).toDF(
        ["items", "freq"]).orderBy(desc("freq")).coalesce(1)
    finalDF.write.csv('fpjobs-newjobs-with-blacklist')
def getRulesByFPGrowth(FILENAME, iter1, iter2, classes, min_sup=0.1, min_conf=0.0,
                       numPartitions=32, ratio=True):
    # read data
    filepath = DIR_UCI + '/' + FILENAME + '/alpha/' + FILENAME + '-train' + str(iter1) + '-' + str(iter2) + '.txt'
    data = sc.textFile(filepath)
    print(filepath)
    transactions = data.map(lambda line: line.strip().split(' '))

    # determine the minimum support
    nrow = sum(1 for line in open(filepath))
    minSupport = float(min_sup) if ratio == True else float(min_sup) / float(nrow)

    # define the model
    model = FPGrowth.train(transactions, minSupport=minSupport, numPartitions=numPartitions)

    # keep only the frequent itemsets that contain no class label
    nocls_freq_item_sets = model.freqItemsets().filter(
        lambda fis: all(not x in fis.items for x in classes))
    # keep the frequent itemsets that contain a class label and have two or more items
    cls_freq_item_sets = model.freqItemsets().filter(
        lambda fis: any(x in fis.items for x in classes)).filter(
        lambda fis: len(fis.items) > 1).collect()

    rules = []
    # def getRule(cls_freq_item):
    #     # pick the itemset whose non-class items are identical and whose length differs by one
    #     cls_freq_item = cls_freq_item.first()
    #     nocls_freq_item = nocls_freq_item_sets.filter(lambda ifs: all(x in cls_freq_item.items for x in ifs.items)).filter(lambda fis: len(fis.items) == len(cls_freq_item.items) - 1).first()
    #     # print(cls_freq_item)
    #     # print(nocls_freq_item)
    #     conf = float(cls_freq_item.freq) / float(nocls_freq_item.freq)
    #     if conf >= min_conf:
    #         rule = Rule()
    #         rule.setValue(nocls_freq_item.items)
    #         cls = list(set(cls_freq_item.items) & set(nocls_freq_item.items))[0]
    #         rule.setConsequent(cls)
    #         rule.setSupport(cls_freq_item.freq)
    #         rule.setConf(conf)
    #         return(rule)
    #     else:
    #         return(None)
    #
    # rules = cls_freq_item_sets.foreach(getRule)

    rules = []
    print("item count :" + str(len(cls_freq_item_sets)))
    for cls_freq_item in cls_freq_item_sets:
        # pick the itemset whose non-class items are identical and whose length differs by one
        # nocls_freq_item = nocls_freq_item_sets.filter(lambda ifs: all(x in cls_freq_item.items for x in ifs.items)).filter(lambda fis: len(fis.items) == len(cls_freq_item.items) - 1).first()
        # print(cls_freq_item)
        # print(nocls_freq_item)
        # for nocls_freq_item in nocls_freq_item_sets:
        #     # pick the itemset whose non-class part is identical and whose length differs by one
        #     cls_freq_item = cls_freq_item_sets.filter(lambda fis: (all(x in fis.items for x in nocls_freq_item.items))).filter(lambda fis: len(fis.items) == len(nocls_freq_item.items) + 1).collect()
        #     if cls_freq_item:
        #         conf = float(cls_freq_item.freq) / float(nocls_freq_item.freq)
        #         if conf >= min_conf:
        values = [x for x in cls_freq_item.items if not x in classes]
        cls = [x for x in cls_freq_item.items if x in classes][0]
        conf = 0.0
        rule = Rule()
        rule.setValue(values)
        # cls = list(set(cls_freq_item.items) & set(nocls_freq_item.items))[0]
        rule.setConsequent(cls)
        rule.setStrength(cls_freq_item.freq)
        rule.setConf(conf)
        rules.append(rule)
    return(rules)
import time
from pyspark.sql.functions import *
from pyspark.mllib.fpm import FPGrowth, PrefixSpan

# sc is an existing SparkContext.
sqlContext = HiveContext(sc)

# load i2b2 data
data = sc.textFile("/Users/jayurbain/Dropbox/machine-learning/machine-learning/data/sample_fpgrowth.txt")
print data.take(10)

# fpgrowth example
transactions = data.map(lambda line: line.strip().split(' '))
print transactions.take(5)

model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
result = model.freqItemsets().collect()
for fi in result:
    print(fi)

for i in result:
    print '(', ', '.join(i.items), ')', 'freq=', str(i.freq)

#############################################
data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
rdd = sc.parallelize(data, 2)
model = FPGrowth.train(rdd, 0.6, 2)
sorted(model.freqItemsets().collect())
####################################################
sc = SparkContext()

from pyspark.mllib.fpm import FPGrowth

nums = set()
reader = open('/home/edu/songsong/mif_2019/fraud_detection/output/dataOfYZJPTC.csv')
for num in reader:
    num = num.strip("\n").split(',')
    nums.add(num[0])

data = sc.textFile("/mif/data_new/worker_hospital_detail.txt")
data = data.map(lambda line: line.split(','))
# num 0, medical_name 2, count 4
data_ngs = data.filter(lambda line: line[0] in nums and len(line) > 4)

# basket
data_bkt_withNum = data_ngs.map(lambda line: ((line[0], line[2]), 1)) \
    .reduceByKey(lambda a, b: a) \
    .map(lambda (k, v): (k[0], [k[1]])) \
    .reduceByKey(lambda a, b: a + b)
data_bkt = data_bkt_withNum.map(lambda (k, v): v)
data_bkt.cache()

model = FPGrowth.train(data_bkt, 0.01)
fitems = model.freqItemsets().collect()

out = open('output/fpm_yzjptc.txt', 'w')
for itemset in fitems:
    line = reduce(lambda a, b: "%s\t%s" % (a, b), itemset.items).encode("utf-8")
    out.write("%d\t%s\n" % (itemset.freq, line))
out.close()
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# $example on$
from pyspark.mllib.fpm import FPGrowth
# $example off$
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="FPGrowth")

    # $example on$
    data = sc.textFile("data/mllib/sample_fpgrowth.txt")
    transactions = data.map(lambda line: line.strip().split(' '))
    model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
    result = model.freqItemsets().collect()
    for fi in result:
        print(fi)
    # $example off$

    minConfidence = 0.8
    associations = model.generateAssociationRules(minConfidence).collect()
    for ar in associations:
        print(ar)
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.fpm import FPGrowth
import sys, operator
import re, string

inputs = sys.argv[1]
output = sys.argv[2]

conf = SparkConf().setAppName('frequent itemsets')
sc = SparkContext(conf=conf)

text = sc.textFile(inputs)
transactions = text.map(lambda line: map(int, line.split()))

model = FPGrowth.train(transactions, 0.0002).freqItemsets().map(lambda (w, z): (sorted(w), z))
modelsort = model.sortBy(lambda (w, c): (-c, w)).map(lambda (w, c): u"%s %i" % (w, c)).take(10000)
modelsort1 = sc.parallelize(modelsort, 1)
modelsort1.saveAsTextFile(output)
# -*- coding:utf-8 -*-
"""
Program: FPGrowth
Description: Example of calling Spark's built-in FP-Growth algorithm
Author: zhenglei - [email protected]
Date: 2016-01-14 13:36:09
Last modified: 2016-01-14 13:37:01
Python release: 2.7
"""
# Call Spark's built-in FP-Growth algorithm to reproduce the Chapter 12 example
# from "Machine Learning in Action"
from pyspark import SparkContext
from pyspark.mllib.fpm import FPGrowth

if __name__ == '__main__':
    sc = SparkContext()
    tmpdatas = sc.textFile('kosarak.dat')
    datas = tmpdatas.map(lambda line: line.strip().split(' '))
    # tmpdatas = sc.textFile('/opt/spark-1.6.0/data/mllib/sample_fpgrowth.txt')
    # datas = tmpdatas.map(lambda line: line.strip().split(' '))

    model = FPGrowth.train(datas, minSupport=0.1)
    results = model.freqItemsets().collect()
    for item in results:
        print item
    sc.stop()
.master("local")\ .appName("RDD_and_DataFrame") \ .config("spark.some.config.option", "some-value") \ .getOrCreate() spark.conf.set("spark.sql.execution.arrow.enabled", "true") #执行sql语句,将订单数据整理成一个个事务 sentenceData = spark.sql("读取数据") #将Dataframe类型转换成rdd进行map rdd=sentenceData.rdd #将数据格式转换成[[itemid,itemid,...],[itemid,itemid,...],[itemid,itemid,...]],[itemid,itemid,...]表示一组事务,也就是同以订单中的物品组合 rdd=rdd.map(lambda x: x[1].split(",")).collect() #转换成模型需要的数据格式rdd类型,新版本可能模型有变化 rdd=spark.sparkContext.parallelize(rdd) #后面的参数分别表示最低支持度,2 表示输入的rdd 分成几个partition来做处理 model = FPGrowth.train(rdd, 0.00001, 2) skuresult=model.freqItemsets().collect() #skuresult为从订单中挖掘出来的频繁项集 2.关联规挖掘(频繁项集) 根据频繁项集找到对应的强关联规则: 找到k项频繁项集,将k项频繁项集进行关联规则提取,1对1,1对多,多对多 k项频繁项集假设为{1,2,3} 具体步骤: step1:求出所有非空子集{1},{2},{3},{1,2},{2,3},{1,3} step2:任意取两个不想交的非空子集生成一个规则:如{1}--》{2,3} step3:判断规则是否为强关联规则,如果是则进行存储规则表中可用dict存储以便于后续用来进行匹配推荐。 step4:循环2,3步
def stopwords_remover(text_list):
    res = []
    for word in text_list:
        if len(word) > 2:
            if word not in stopwords:
                res.append(word)
    return res

# keeps only one occurrence of each word per basket
def duplicate_remover(text_list):
    return dict.fromkeys(text_list).keys()

# find the text inside the tweet's data and clean it
def parse_text(tweet):
    # search text
    res = re.search('\"text\" : "(.*)" , \"in_reply_to_status_id\"', tweet).group(1)
    # remove bad characters
    res = text_cleaner(res)
    # remove words that appear more than once and next remove stopwords
    return stopwords_remover(duplicate_remover(res.split()))

sc = SparkContext(appName="testFPGrowth")
stopwords = open('/home/e01/stopwords.txt', 'r').read().splitlines()
rdd_tweets = sc.textFile(large_file).sample(False, sample_size, 42).map(lambda t: parse_text(t))
model = FPGrowth.train(rdd_tweets, minSupport=0.02, numPartitions=1000)
result = model.freqItemsets().collect()
for fi in result:
    print(fi)
except ImportError as e:
    print("Can not import Spark Modules", e)

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.mllib.fpm import FPGrowth
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark import SparkContext
    from pyspark.conf import SparkConf

    conf = SparkConf()
    conf.setMaster("local").setAppName('FPgrowth-notebook').set(
        "spark.executor.memory", "50g")
    sc = SparkContext(conf=conf)

    data = sc.textFile("/home/mehrdad/Downloads/Text.csv", 10)
    transactions = data.map(lambda line: line.strip().split(' '))
    model = FPGrowth.train(transactions, minSupport=0.0001, numPartitions=10)
    result = model.freqItemsets().collect()

    f2 = open("/home/mehrdad/Downloads/RulesOfBank.txt", 'w')
    for fi in result:
        print(fi)
        f2.write(str(fi) + '\n')
    sc.stop()
except ImportError as e:
    print("Can not import Spark Modules", e)
    from_addr='*****@*****.**',
    to_addr='*****@*****.**',
    subject='subject',
    text_body='sdfasf')
envelope.send('172.16.8.28')
# envelope.send('172.16.8.28', login='******', password='******', tls=True)

from clockwork import clockwork
api = clockwork.API('2bba3f5cb100cb0b3e1085c6b546a1ffe2f2cec8')
message = clockwork.SMS(to='9910055945', message='This is a test message.')
response = api.send(message)

----------------------------

from pyspark.mllib.fpm import FPGrowth
from pyspark.mllib.evaluation import RankingMetrics
from pyspark import SparkContext

# sc = SparkContext(appName='aslkjsdf')
data = sc.textFile('/spark-data/input/eval_100000')
header = data.first()
data1 = data.filter(lambda x: x != header)
data = data1.map(lambda x: x.split(",")).map(lambda x: tuple([x[2], x[0]])).distinct()
data = data.groupByKey().mapValues(list).map(lambda x: x[1])
data.saveAsTextFile('/spark-data/testing/formatted_data')

train, test = data.randomSplit([7, 3], 0)
model = FPGrowth.train(data, minSupport=0.01)
result = model.freqItemsets()
result.saveAsTextFile('/spark-data/testing/ouput/complete')
from pyspark import SparkConf, SparkContext
from pyspark.mllib.fpm import FPGrowth
import sys, operator

inputs = sys.argv[1]   # input
output = sys.argv[2]   # output

conf = SparkConf().setAppName('frequent itemsets')
sc = SparkContext(conf=conf)

text = sc.textFile(inputs)

"""
sbaronia - taking itemsets in int form and splitting them
on spaces, else ' ' becomes an itemset
"""
items = text.map(lambda line: map(int, line.strip().split(' ')))

"""
sbaronia - calling FPGrowth function with support 0.0022 and
partition 1, will give more than 10k frequent itemsets
"""
model = FPGrowth.train(items, 0.0022, 1)
fitems = model.freqItemsets()

"""
sbaronia - here we sort every transaction in ascending order and then
the entire 10k by descending order of frequencies and make an rdd
from the list of 10k items
"""
sort_transactions = sc.parallelize(fitems.map(lambda (i, c): (sorted(i), c)).sortBy(lambda (i, c): (-c, i)).take(10000))
sort_transactions.saveAsTextFile(output)
# This program trains and fits a FPGrowth model using RDD for finding frequent
# patterns from the data

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.fpm import FPGrowth

sc = SparkContext.getOrCreate(SparkConf())

# data = sc.textFile("hdfs://worker2.hdp-internal:8020/user/ketkid1/calls.txt")
data = sc.textFile("hdfs://worker2.hdp-internal:8020/user/ketkid1/calldata.txt")

# remove the empty lines present in RDD
data = data.filter(lambda line: line not in '')

# split each line on comma
calls = data.map(lambda line: line.strip().split(','))

# remove duplicates if any and cache the input data
unique = calls.map(lambda x: list(set(x))).cache()

# train the FP Growth model on the de-duplicated transactions and collect the result
model = FPGrowth.train(unique, minSupport=0.02, numPartitions=2)
result = model.freqItemsets().collect()

# print the result
for i in result:
    print(i)
def FPGrowthRDD(transactionsRDD, minSupport=0.2, numPartitions=10):
    '''
    perform the FPGrowth algorithm
    '''
    model = FPGrowth.train(transactionsRDD, minSupport, numPartitions)
    return model.freqItemsets()
# print(data.take(num_obs), '\n')
data = data.map(lambda x: x[1])
# print(data.take(num_obs), '\n')
data = data.map(lambda line: re.sub('[^A-Za-z0-9]+', ',', line))
print(data.take(num_obs), '\n')
data = data.map(lambda line: re.sub(num_rep, 'NUM', line))
print(data.take(num_obs), '\n')
data = data.map(lambda line: line.lower().strip().split(',')[:3])
print(data.take(num_obs), '\n')
data = data.map(lambda line: [elem + '_P' + str(idx + 1) for (elem, idx) in zip(line, range(len(line)))])
print(data.take(num_obs), '\n')

# In[53]:

datamodel = FPGrowth.train(data, minSupport=0.001, numPartitions=10)

# In[9]:

# data.saveAsTextFile(processed_data_out)

# In[10]:

# df = data.toDF()
# data.take(100)
# data.getNumPartitions()

# In[15]:
sc = SparkContext(conf=conf)
text = sc.textFile(inputs1)

def split_items(ts):
    items_list = []
    for transaction in ts:
        items = transaction.split()
        int_items = [int(i) for i in items]
        items_list.append(int_items)
    return items_list

transactions = text.map(lambda line: line).collect()
transaction_list = split_items(transactions)
rdd = sc.parallelize(transaction_list, 6)

model = FPGrowth.train(rdd, minSupport=0.002, numPartitions=10)
frequent_sets = model.freqItemsets().collect()

frequent_tuples = sc.parallelize(frequent_sets).map(lambda (items, freq): (sorted(items), freq)).coalesce(1).collect()
frequent_tuples.sort(key=lambda r: r[0])
frequent_tuples.sort(key=lambda r: r[1], reverse=True)

top10k = sc.parallelize(frequent_tuples).take(10000)
output_data = sc.parallelize(top10k).coalesce(1)
output_data.saveAsTextFile(output)
conf = SparkConf().setAppName("itemsets") sc = SparkContext(conf=conf) text = sc.textFile(inputs1) def split_items(ts): items_list = [] for transaction in ts: items = transaction.split() int_items = [int(i) for i in items] items_list.append(int_items) return items_list transactions = text.map(lambda line: line).collect() transaction_list = split_items(transactions) rdd = sc.parallelize(transaction_list, 6) model = FPGrowth.train(rdd, minSupport=0.002, numPartitions=10) frequent_sets = model.freqItemsets().collect() frequent_tuples = sc.parallelize(frequent_sets).map( lambda (items, freq): (sorted(items), freq)).coalesce(1).collect() frequent_tuples.sort(key=lambda r: r[0]) frequent_tuples.sort(key=lambda r: r[1], reverse=True) top10k = sc.parallelize(frequent_tuples).take(10000) output_data = sc.parallelize(top10k).coalesce(1) output_data.saveAsTextFile(output)
f.write(output)
cmd_put = "hadoop fs -put " + outputdir + " /test"
cmd_rm = "hadoop fs -rm /test/" + outputdir.split("/")[-1]
(iRet, RetInfo) = commands.getstatusoutput(cmd_put)
if iRet != 0:
    commands.getstatusoutput(cmd_rm)
    commands.getstatusoutput(cmd_put)


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: fpmining_spark <infile> miniSupport(0,1) <outfile>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonFpMining")

    # input pre-process
    data = sc.textFile(sys.argv[1], 1)
    fSupport = float(sys.argv[2])
    outDir = sys.argv[3]

    # data pre-process
    transactions = data.map(lambda line: list(set(line.strip().split(","))))
    new_trans = transactions.filter(lambda t: len(t) > 1)

    # FPGrowth
    model = FPGrowth.train(new_trans, minSupport=fSupport, numPartitions=10)
    result = model.freqItemsets().collect()
    # for fi in result:
    #     print(fi.items, fi.freq)

    sc.stop()
    GetFreqItems(result, outDir)
trans = transactions.select("items")
trans = trans.collect()
a = [(item) for sublist in trans for item in sublist]
a = sc.parallelize(a)

# model = FPGrowth.train(a, minSupport=0.2, numPartitions=10)
# result = model.freqItemsets().collect()
# for fi in result:
#     print(fi)

from pyspark.mllib.fpm import FPGrowth

# model = FPGrowth.train(transactions, minSupport=0.3, numPartitions=5)
# result = model.freqItemsets().collect()

fpGrowth = FPGrowth.train(a[:20], minSupport=0.1, minConfidence=0.6)
model = fpGrowth.fit(a[:20])
model.associationRules.show(1)
model.freqItemsets.show(1)

# transform examines the input items against all the association rules and summarize the
# consequents as prediction
# model.transform(transactions).show(10)

# import pyspark.ml.stat
# icecream = orders.join(order_products, orders.order_id == order_products.order_id)
# icecream = icecream.select("order_hour_of_day", "product_id")
# icecream = icecream.join(products, icecream.product_id == products.product_id)
# icecream = icecream.select("order_hour_of_day", "product_name").show()
# # icecream = icecream.filter(icecream.product_name == "Ice cream").show()
# print(DescriptionGrp.rdd.take(2))
minSupport = 0.05 * DescriptionGrp.rdd.count()
apr_tem = DescriptionGrp.rdd.map(lambda x: (x[0], list([x[1]]))).reduceByKey(
    lambda x, y: x + y)
schema = StructType([
    StructField("id", StringType(), True),
    StructField("items", ArrayType(StringType()), True)
])
transactions = spark.createDataFrame(apr_tem, schema)
print(transactions.show(2))

## transactions_fp = apr_tem.map(lambda x: (x[1]))
# print(transactions_fp.take(2))
# schema = StructType([StructField("test_123", ArrayType(StringType(), True), True)])
# fields = [StructField(field_name, StringType(), True) for field_name in schema.split(',')]
# schema = StructType(fields)
## final_transactions_rdd = sc.parallelize(transactions_fp.collect())
## final_transactions = final_transactions_rdd.map(lambda x: ','.join(x))
## print(final_transactions.take(2))
# transactions = spark.createDataFrame([final_transactions])
## transactions = final_transactions.map(lambda line: line.strip().split(','))
## print(transactions.take(2))

fpgrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
## fpgrowth = FPGrowth(minSupport=0.5, minConfidence=0.6)
model = fpgrowth.fit(transactions)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model.transform(df).show()
print splitted_explanation.take(5)

# In[12]:

distincted_set = splitted_explanation.map(lambda line: distinct_set(line))
distincted_set.take(5)

# In[13]:

distinctedlist = distincted_set.map(lambda line: distinct_list(line))
distinctedlist.take(5)

# In[14]:

from pyspark.mllib.fpm import FPGrowth

model = FPGrowth.train(distinctedlist, minSupport=0.001, numPartitions=1000)
result = model.freqItemsets().collect()
for fi in result:
    print(fi)

# In[ ]: