def keep_task_run_time(start_time=None):
    """
    Record how long a spark-task has been running as a one-row parquet file.

    :param start_time: Start time of the task, a naive ``datetime``. When it
        is omitted (``None``) the run time is recorded as 0 seconds instead
        of failing on ``datetime - None``.
    :return: None
    """
    # todo
    # Imported inside the function so the Spark shell session is only pulled
    # in when a record is actually written.
    from pyspark.shell import spark

    date_ = datetime.now()
    if start_time is None:
        start_time = date_

    # total_seconds() keeps the days component; timedelta.seconds alone wraps
    # around every 24 hours and would under-report long runs.
    run_time = int((date_ - start_time).total_seconds())

    origin_data = spark.sparkContext.parallelize([
        (date_, 'test', run_time),
    ])

    # Only column names are given; Spark infers the column types.
    spark_df = spark.createDataFrame(
        origin_data, schema=['generateTime', 'name', 'runTime'])

    # Save the data, overwriting whatever was written before.
    file_ = r"/srv/BigData/dbdata_service/ffk/test_run_record/test.parquet"
    spark_df.write.parquet(path=file_, mode='overwrite')
def mP(a, tab, Aligner, sc):
    """
    Align the input sequences in several worker processes.

    The number of processes is read interactively from stdin and clamped to
    the range ``1 .. multiprocessing.cpu_count()``.

    :param a: forwarded unchanged to ``Alignment.mPalignment``
    :param tab: forwarded unchanged to ``Alignment.mPalignment``
    :param Aligner: forwarded unchanged to ``Alignment.mPalignment``
    :param sc: Spark context handed to ``ReadFile.SPARKreadFile``
    :return: the alignments DataFrame inner-joined with the input data on ``seq``
    :raises ValueError: if the typed processor count is not an integer
    """
    manager = multiprocessing.Manager()
    # Shared list handed to every worker; presumably each worker appends its
    # results to it - verify against Alignment.mPalignment.
    alignments = manager.list()

    max_cores = multiprocessing.cpu_count()
    cores = int(input('Inserisci il numero di processori: '))
    if cores > max_cores:
        cores = max_cores
        print("Superato il numero massimo di processori,", str(cores), "in uso")
    else:
        # Zero or a negative count would make the chunk size below divide by
        # zero or go negative, so use at least one process.
        cores = max(cores, 1)
        print(str(cores), "processori in uso")

    data = ReadFile.SPARKreadFile(sc)
    sequences = [x["SEQ"] for x in data.rdd.collect()]
    # sequences = ReadFile.HengLireadFile()  # Heng Li
    chunk_size = len(sequences) / cores
    slices = Chunks(sequences, math.ceil(chunk_size))

    processes = []
    for i, s in enumerate(slices):
        procname = 'processor' + str(i)
        p = multiprocessing.Process(
            target=Alignment.mPalignment,
            args=(a, tab, Aligner, s, alignments, procname))
        p.start()
        processes.append(p)

    # Wait for every worker before reading the shared result list.
    for p in processes:
        p.join()

    DF = spark.createDataFrame(alignments)
    DataFrame = DF.join(data, on=['seq'], how='inner')
    return DataFrame
def createInterestVector(self):
    """Load the predicted interest rates and add a ``features`` vector column.

    Reads ``fourth_predict_interest.csv`` with pandas, converts it to a Spark
    DataFrame and assembles the four interest-rate columns into ``features``.
    The result is stored in ``self.predict_interest``.
    """
    csv_path = r'../analyed_data/fourth_predict_interest.csv'
    self.predict_interest = pd.read_csv(csv_path, parse_dates=['report_date'])
    self.predict_interest = spark.createDataFrame(self.predict_interest)

    # Only the longer-term interest rates are used as features.
    feature_columns = [
        'Interest_3_M',
        'Interest_6_M',
        'Interest_9_M',
        'Interest_1_Y',
    ]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
    self.predict_interest = assembler.transform(self.predict_interest)
def write_log(level_log, program_name, procedure_name, message):
    """Append one log record to the ``log_table`` table over JDBC.

    :param level_log: value stored in the LEVEL_LOG column
    :param program_name: value stored in the PROGRAM_NAME column
    :param procedure_name: value stored in the PROCEDURE_NAME column
    :param message: value stored in the MESSAGE column
    """
    field_names = 'TIME_LOG LEVEL_LOG PROGRAM_NAME PROCEDURE_NAME MESSAGE'.split()
    log_row = namedtuple('log_row', field_names)
    record = log_row(
        TIME_LOG=datetime.datetime.today(),
        LEVEL_LOG=level_log,
        PROGRAM_NAME=program_name,
        PROCEDURE_NAME=procedure_name,
        MESSAGE=message,
    )

    writer = spark.createDataFrame([record]).write.format('jdbc').mode('append')
    # Options are applied in the same order as a chained .option() sequence.
    jdbc_options = (
        ('driver', 'oracle.jdbc.OracleDriver'),
        ('url', DATABASE_SOURCE['url']),
        ('dbtable', 'log_table'),
        ('user', DATABASE_SOURCE['user']),
        ('password', DATABASE_SOURCE['password']),
    )
    for key, value in jdbc_options:
        writer = writer.option(key, value)
    writer.save()
def write_log(level_log, program_name, procedure_name, message):
    """Append one log record to the table named by ``LOG_TABLE_NAME`` over JDBC.

    :param level_log: value stored in the LEVEL_LOG column
    :param program_name: value stored in the PROGRAM_NAME column
    :param procedure_name: value stored in the PROCEDURE_NAME column
    :param message: value stored in the MESSAGE column
    """
    columns = 'TIME_LOG LEVEL_LOG PROGRAM_NAME PROCEDURE_NAME MESSAGE'.split()
    log_row = namedtuple('log_row', columns)
    timestamp = datetime.datetime.today()
    entry = log_row(timestamp, level_log, program_name, procedure_name, message)

    # A single-row DataFrame is the unit written to the database.
    frame = spark.createDataFrame([entry])
    (
        frame.write
        .format('jdbc')
        .mode('append')
        .option('driver', DRIVER)
        .option('url', URL_SOURCE_DB)
        .option('dbtable', LOG_TABLE_NAME)
        .option('user', SOURCE_DB_USER_NAME)
        .option('password', SOURCE_DB_USER_PASSWORD)
        .save()
    )
def load_classifier_model():
    """Train a Bernoulli naive Bayes model, save it and return it.

    The training vectors come from ``ModelProcessUtil.create_train_vectors()``.
    The fitted model is written to ``./movie-robot-model``, replacing any
    model already stored there.

    :return: the fitted naive Bayes model
    """
    training_df = spark.createDataFrame(ModelProcessUtil.create_train_vectors())
    training_df.show()

    fitted = NaiveBayes(modelType="bernoulli").fit(training_df)
    fitted.setFeaturesCol("features")

    # overwrite() lets repeated runs replace the previously saved model.
    fitted.write().overwrite().save("./movie-robot-model")
    return fitted
def createBalanceVector(self):
    """Load the user-balance training set and add a ``features`` vector column.

    Stores the pandas frame in ``self.rawdata``, its Spark counterpart in
    ``self.data`` and the assembled result in ``self.balanceData``.
    """
    # Read the training set, indexed by report date.
    self.rawdata = pd.read_csv(
        r'../analyed_data/data_user_balance.csv',
        index_col='report_date',
        parse_dates=['report_date'],
    )
    self.data = spark.createDataFrame(self.rawdata)

    # VectorAssembler merges several columns into a single vector column.
    feature_columns = ['Interest_3_M', 'Interest_6_M', 'Interest_9_M', 'Interest_1_Y']
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
    self.balanceData = assembler.transform(self.data)
def setsCreation(multiplier, dataset):
    """Load one credit-card CSV and return ``multiplier`` train/test splits.

    :param multiplier: number of 70/30 splits to produce
    :param dataset: dataset selector; 1 and 2 pick the undersampled files,
        any other value picks ``creditcard_normalized<dataset - 2>``
    :return: list with ``multiplier`` entries, each the result of
        ``randomSplit([0.7, 0.3], seed=1234)`` on the dataset's RDD
    """
    if dataset in (1, 2):
        source_file = "creditcard_undersampled" + str(dataset)
    else:
        source_file = "creditcard_normalized" + str(dataset - 2)

    # Read the dataset with pandas.
    csv_path = str(Path(__file__).parent) + "/CSV_Sources/" + source_file + ".csv"
    p_df = pd.read_csv(csv_path)

    # Convert to a Spark DataFrame with as many partitions as the square
    # root of the number of rows.
    partition_count = int(math.sqrt(len(p_df)))
    s_df = spark.createDataFrame(p_df).repartition(partition_count)

    # Caching is essential when this runs on a cluster of two or more machines.
    s_df.cache()

    return [
        s_df.rdd.randomSplit([0.7, 0.3], seed=1234)
        for _ in range(0, multiplier)
    ]
def write_log(level_log, program_name, procedure_name, message):
    """
    Write one log record to the log table over JDBC.

    :param level_log: level of logging, can be one of ["INFO", "WARN", "ERROR"];
    :param program_name: script's name;
    :param procedure_name: function's name;
    :param message: description of the recording.
    """
    columns = 'TIME_LOG LEVEL_LOG PROGRAM_NAME PROCEDURE_NAME MESSAGE'.split()
    log_row = namedtuple('log_row', columns)
    entry = log_row(
        datetime.datetime.today(),
        level_log,
        program_name,
        procedure_name,
        message,
    )

    writer = spark.createDataFrame([entry]).write
    writer = writer.format("jdbc").mode("append")
    writer = writer.option("driver", 'oracle.jdbc.OracleDriver')
    writer = writer.option("url", DATABASE_SOURCE['url'])
    writer = writer.option("dbtable", LOG_TABLE_NAME)
    writer = writer.option("user", DATABASE_SOURCE['user'])
    writer = writer.option("password", DATABASE_SOURCE['password'])
    writer.save()
# -*- coding:utf-8 -*-
"""TF-IDF example: tokenize sentences, hash term frequencies, rescale by IDF."""
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.shell import spark

# Three labelled example sentences.
sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    ["label", "sentence"],
)

# Split each sentence into words.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Term frequencies hashed into 20 buckets.
# Alternatively, CountVectorizer can also be used to get term frequency vectors.
hashingTF = HashingTF(
    inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

# Fit the IDF weights on the term frequencies, then rescale them.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("label", "features").show()
"""Standardize the first numeric features of the OnlineNewsPopularity data set."""
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import DenseVector
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("OnlineNewsPopularity").getOrCreate()

    # Read through the session created above; `sqlContext` only exists inside
    # the interactive pyspark shell and is undefined in a standalone script.
    data = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load("OnlineNewsPopularity/OnlineNewsPopularity.csv")
    )

    # Standardization
    # Define the `input_data`: column 0 becomes the label and columns 1-4 the
    # feature vector.
    input_data = data.rdd.map(lambda x: (x[0], DenseVector(x[1:5])))

    # Replace `data` with the new DataFrame
    data = spark.createDataFrame(input_data, ["label", "features"])
    # data.show()

    # Initialize the `standardScaler`
    standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

    # Fit the DataFrame to the scaler
    scaler = standardScaler.fit(data)

    # Transform the data in `data` with the scaler
    scaled_data = scaler.transform(data)
"""StringIndexer example plus a small reusable wrapper."""
from pyspark.ml.feature import StringIndexer
from pyspark.shell import spark

# Demo data: an id and a categorical string column.
df = spark.createDataFrame(
    [
        (0, "a"),
        (1, "b"),
        (2, "c"),
        (3, "a"),
        (4, "a"),
        (5, "c"),
    ],
    ["id", "category"],
)
df.show()

# Map every category string to a numeric index.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed.show()


def StringIndexerModel(input_col, output_col, input_data):
    """Fit a StringIndexer on ``input_data`` and return the transformed frame.

    :param input_col: name of the string column to index
    :param output_col: name of the index column to add
    :param input_data: DataFrame used both for fitting and transforming
    :return: ``input_data`` with ``output_col`` added
    """
    fitted = StringIndexer(inputCol=input_col, outputCol=output_col).fit(input_data)
    return fitted.transform(input_data)
@author: xieweiwei ''' from pyspark.ml.feature import VectorIndexer from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression, SparkSession from pyspark.ml.feature import HashingTF, Tokenizer # Prepare training documents from a list of (id, text, label) tuples. # spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate() from pyspark.shell import spark training = spark.createDataFrame([ (0, "a b c d e spark", 1.0), (1, "b d", 0.0), (2, "spark f g h", 1.0), (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"]) tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10, regParam=0.001) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) model = pipeline.fit(training) test = spark.createDataFrame([ (4, "spark i j k"), (5, "l m n"), (6, "spark hadoop spark"),
# -*- coding:utf-8 -*-
"""Word2Vec example: learn word vectors and print one vector per document."""
from pyspark.ml.feature import Word2Vec
from pyspark.shell import spark

# Input data: each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame(
    [
        ("Hi I heard about Spark".split(" "),),
        ("I wish Java could use case classes".split(" "),),
        ("Logistic regression models are neat".split(" "),),
    ],
    ["text"],
)

# Learn a mapping from words to vectors.
word2Vec = Word2Vec(
    vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)

# Each collected row holds the word list and its vector.
for text, vector in result.collect():
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
"""Print the Pearson and Spearman correlation matrices of a small vector column."""
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.shell import spark

# Four 4-dimensional rows, mixing sparse and dense vectors.
data = [
    (Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
    (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
    (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
    (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),),
]
df = spark.createDataFrame(data, ["features"])

# No method argument is passed for the first matrix.
r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))
"""
:author: young
:DATE: 2019/5/14 14:20
:copyright: © 2019 young <*****@*****.**>
:license: None, see LICENSE for more details.
"""
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.shell import spark

# Prepare training data from a list of (label, features) tuples.
training = spark.createDataFrame(
    [
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5])),
    ],
    ["label", "features"],
)

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
"""Show that DataFrame column names may contain dots."""
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, lit
from pyspark.sql.dataframe import DataFrame
# The schema below needs these types; without the import the module fails
# with NameError before any Spark work starts.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

from dependencies.spark import start_spark

# A top-level column literally named "person.name" next to a struct column
# "person" that has its own "name" field.
schema = StructType([
    StructField("person.name", StringType(), True),
    StructField("person", StructType([
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True)]))
])

data = [
    ("charles", Row("chuck", 42)),
    ("lawrence", Row("larry", 73))
]

# `spark` was referenced without ever being created. getOrCreate() reuses an
# active session when one exists.
# NOTE(review): `start_spark` is imported but its signature is not visible
# here; switch to it if the project expects sessions to be built that way.
spark = SparkSession.builder.getOrCreate()

# s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
# df = s1.createDataFrame(data, schema)
# df.show()
# df = sqlContext.createDataFrame(data, ["features"])

df = spark.createDataFrame(
    [("china", "asia"), ("colombia", "south america")],
    ["country.name", "continent"]
)
df.show()
"""Compare whitespace tokenization with regex tokenization."""
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.shell import spark

sentenceDataFrame = spark.createDataFrame(
    [
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
    ["id", "sentence"],
)

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Alternatively, pattern="\\w+", gaps(False)
regexTokenizer = RegexTokenizer(
    inputCol="sentence", outputCol="words", pattern="\\W")

# UDF returning the number of tokens in a word list.
countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
(
    tokenized.select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
(
    regexTokenized.select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)
cols = ('age', 'sex', 'chest pain', 'resting blood pressure', 'serum cholesterol', 'fasting blood sugar', 'resting electrocardiographic results', 'maximum heart rate achieved', 'exercise induced angina', 'ST depression induced by exercise relative to rest', 'the slope of the peak exercise ST segment', 'number of major vessels ', 'thal', 'last') data = pd.read_csv('heart.dat', delimiter=' ', names=cols) data = data.iloc[:, 0:13] data['isSick'] = data['thal'].apply(isSick) df = spark.createDataFrame(data) features = ('age', 'sex', 'chest pain', 'resting blood pressure', 'serum cholestoral', 'fasting blood sugar', 'resting electrocardiographic results', 'maximum heart rate achieved', 'exercise induced angina', 'ST depression induced by exercise relative to rest', 'the slope of the peak exercise ST segment', 'number of major vessels ') assembler = VectorAssembler(inputCols=features, outputCol="features") raw_data = assembler.transform(df) raw_data.select("features").show(truncate=False) standardscaler = StandardScaler().setInputCol("features").setOutputCol(
"""Build the text-classification pipeline that is later tuned by cross-validation."""
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.shell import spark

# Prepare training documents, which are labeled.
training = spark.createDataFrame(
    [
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
        (8, "e spark program", 1.0),
        (9, "a e c l", 0.0),
        (10, "spark compile", 1.0),
        (11, "hadoop software", 0.0),
    ],
    ["id", "text", "label"],
)

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# Result containers; they start empty and are not filled in this section.
recipes_recom = []
recom_results = []
rated_results = []
user_index = 1

# Load the recipe array and the ratings table from the working directory.
recipes = np.load('recipes.npy')
ratings = pd.read_csv('ratings.csv')

# user x recipe matrix of ratings; missing combinations are filled with 0.
# This must run before `ratings` is rebound to a Spark DataFrame below.
ratings_pivot = ratings.pivot_table(values='rating', index=['user_id'], columns=['recipe_id'], fill_value=0, dropna=False)
ratings_values = ratings_pivot.values

# From here on `ratings` is a Spark DataFrame, no longer a pandas one.
ratings = spark.createDataFrame(ratings)

# Index the user and recipe ids into new *_index columns for ALS.
string_indexer1 = StringIndexer(inputCol="user_id", outputCol="user_id_index")
string_indexer2 = StringIndexer(inputCol="recipe_id", outputCol="recipe_id_index")
indexers = [string_indexer1, string_indexer2]
pipeline = Pipeline(stages=indexers)
ratings_final = pipeline.fit(ratings).transform(ratings)

# Fit ALS on the indexed ids; coldStartStrategy="drop" is passed explicitly.
als = ALS(rank=20, maxIter=20, regParam=0.1, userCol="user_id_index", itemCol="recipe_id_index", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(ratings_final)

# Ten recommendations per user.
users_recs = model.recommendForAllUsers(10)
# featureIndexer=VectorIndexer(inputCol='features',outputCol='indexedFeatures',maxCategories=4).fit(data) # (trainingData,testData)=data.randomSplit([0.7,0.3]) # gbt=GBTClassifier(labelCol='indexedLabel',featuresCol='indexedFeatures',maxIter=10) # pipeline=Pipeline(labelIndexer,featureIndexer,gbt) # model=pipeline.fit(trainingData) # predictions=model.transform(testData) # print(predictions) # # data=spark.read.format('libsvm').load('/media/four/four/spark-2.2.0-bin-hadoop2.7/data/mllib/sample_multiclass_classification_data.txt') # print(data.show()) # splits=data.randomSplit([0.6,0.4],1234) # train=splits[0] # test=splits[1] # layers=[4,5,4,3] # trainer=MultilayerPerceptronClassifier(maxIter=100,layers=layers,blockSize=128,seed=1234) # model=trainer.fit(train) # result=model.transform(test) # print (result) # lsvc=LinearSVC(maxIter=10,regParam=0.1) # lsvcModel=lsvc.fit(data) from pyspark.ml.linalg import Vectors from pyspark.ml.stat import Correlation data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ), (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ), (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ), (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )] df = spark.createDataFrame(data, ['features']) r1 = Correlation.corr(df, 'features', 'spearman') print(r1.head())
import pandas as pd import numpy as np from pyspark.shell import spark length = 100 names = np.random.choice(['Bob', 'James', 'Marek', 'Johannes', None], length) amounts = np.random.randint(0, 1000000, length) country = np.random.choice( ['United Kingdom', 'Poland', 'USA', 'Germany', None], length) df = pd.DataFrame({'name': names, 'amount': amounts, 'country': country}) print(df.head()) # Default RDD partition creation transactions = spark.createDataFrame(df) print('Number of partitions: {}'.format(transactions.rdd.getNumPartitions())) print('Partitioner: {}'.format(transactions.rdd.partitioner)) print('Partitions structure: {}'.format(transactions.rdd.glom().collect())) print('Total transactions RDD instances: {}'.format(transactions.rdd.count())) # Re-partitioned increased to 8 repartitioned = transactions.repartition(8) print('Number of partitions: {}'.format(repartitioned.rdd.getNumPartitions())) print('Partitions structure: {}'.format(repartitioned.rdd.glom().collect())) print('Total repartitioned RDD instances: {}'.format( repartitioned.rdd.count())) # Re-partitioning by specifying the column repartitioned = transactions.repartition('country') print('Number of partitions: {}'.format(repartitioned.rdd.getNumPartitions())) print('Partitions structure: {}'.format(repartitioned.rdd.glom().collect())) print('Total repartitioned RDD instances based on country column: {}'.format(
"""Infer a DataFrame schema from an RDD of Rows and query it with SQL."""
from pyspark.shell import spark
from pyspark.sql import Row

sc = spark.sparkContext

# Load a text file and convert each line to a Row.
lines = sc.textFile("examples/src/main/resources/people.txt")
print('*****************')
print(type(lines))

parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
print(type(people))
print(people)

# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql(
    "SELECT name FROM people WHERE age >= 13 AND age <= 19")

# The results of SQL queries are DataFrame objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
for name in teenNames:
    print(name)
# sc = SparkContext(master, "Twitter_Sentiment_Analysis")
# One element per file under `base`; each element is indexed below as
# [0] (printed) and [1] (the text that is split into lines).
rdd = spark.sparkContext.wholeTextFiles(base)


def extract_time_and_tweet(line):
    """Return ``(label, text, timestamp_ms)`` for the first tweet of one file.

    :param line: pair whose second item is the file content, one JSON
        document per line
    :return: ``(0.0, text, timestamp_ms)`` built from the first line longer
        than 3 characters, or ``None`` when the file has no such line
    """
    # print(line[len(base):])
    print(line[0])
    a_flume = line[1].split('\n')
    for tweet in a_flume:
        if len(tweet) > 3:
            data = json.loads(tweet)
            # NOTE(review): only the first usable tweet of each file is
            # kept; confirm this is intended.
            return 0.0, data['text'], data["timestamp_ms"]
    return None


# Files without any usable tweet map to None; drop them so that building the
# DataFrame below does not fail on a None row.
text2 = rdd.map(extract_time_and_tweet).filter(lambda row: row is not None)
text2.count()

rawLabelTweetDataFrame = spark.createDataFrame(
    text2, ["label", "tweets", "time_stamp_ms"])

# Split tweets on non-word characters.
regexTokenizer = RegexTokenizer(inputCol="tweets", outputCol="words", pattern="\\W")
tokenized = regexTokenizer.transform(rawLabelTweetDataFrame)

# Remove stop words and keep only the columns needed downstream.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredDataFrame = remover.transform(tokenized).select(
    "label", "filtered", "time_stamp_ms")

ngram = NGram(n=1, inputCol="filtered", outputCol="ngrams")
ngramDataFrame = ngram.transform(filteredDataFrame)
ngramDataFrame.show()
ngramData = ngramDataFrame.select("label", "ngrams", "time_stamp_ms")
"""FP-growth example: frequent itemsets, association rules and predictions."""
from pyspark.ml.fpm import FPGrowth
from pyspark.shell import spark

# Three transactions, each an id with its list of items.
df = spark.createDataFrame(
    [
        (0, [1, 2, 5]),
        (1, [1, 2, 3, 5]),
        (2, [1, 2]),
    ],
    ["id", "items"],
)

fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules and
# summarizes the consequents as the prediction.
model.transform(df).show()