def get_output(in_data):
    text = sc.textFile(in_data)
    nltk_data_path = "[your nltk data path]"  # may be changed to the SFU server path
    nltk.data.path.append(nltk_data_path)
    stop_words = set(stopwords.words("english"))
    cleaned_review = text.map(lambda review_line: clean_review(review_line, stop_words))
    data_set = cleaned_review.map(lambda cleaned_line: (
        cleaned_line['reviewText'],
        cleaned_line['overall'],
        time.strptime(cleaned_line['reviewTime'], '%m %d, %Y')))
    nor = Normalizer(1)

    # reviews from before 2014 form the training set
    training_data = data_set.filter(lambda (review_text, rating, review_date): review_date.tm_year < 2014).cache()
    training_ratings = training_data.map(lambda (review_text, rating, review_date): rating)
    training_reviews = training_data.map(lambda (review_text, rating, review_date): review_text)
    training_tfidf_features = get_tfidf_features(training_reviews)
    nor_training = nor.transform(training_tfidf_features)
    training_output = training_ratings.zip(nor_training).coalesce(1)

    # reviews from 2014 form the testing set
    testing_data = data_set.filter(lambda (review_text, rating, review_date): review_date.tm_year == 2014).cache()
    testing_ratings = testing_data.map(lambda (review_text, rating, review_date): rating)
    testing_reviews = testing_data.map(lambda (review_text, rating, review_date): review_text)
    testing_tfidf_features = get_tfidf_features(testing_reviews)
    nor_testing = nor.transform(testing_tfidf_features)
    testing_output = testing_ratings.zip(nor_testing).coalesce(1)

    return training_output, testing_output
def column_search(words, row_filter):
    if row_filter == 'n' or row_filter == 'N':
        min_row = 0
    else:
        min_row = row_filter
    rawData = table_cols.join(master_index, master_index["Table_Name"] == table_cols["Name"]).rdd
    data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)
    titles = data.map(lambda x: x[0])
    documents = data.map(lambda x: x[1])
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    normalizer = Normalizer()
    tfidf = normalizer.transform(idf.transform(tf))
    tfidfData = titles.zip(tfidf).toDF(["label", "features"])
    query = parse((0, words))[1]
    queryTF = hashingTF.transform(query)
    queryTFIDF = normalizer.transform(idf.transform(queryTF))
    queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(queryTFIDF)))) \
        .sortBy(lambda x: -x[1]) \
        .filter(lambda x: x[1] > 0)
    queryRelevance = queryRelevance.toDF(["Doc_ID", "scores"])
    queryRelevance = queryRelevance.join(table_desc, queryRelevance.Doc_ID == table_desc.Doc_ID) \
        .select(table_desc.Doc_ID, queryRelevance.scores, table_desc.Columns)
    queryRelevance = queryRelevance.join(master_index, master_index.Doc_ID == queryRelevance.Doc_ID) \
        .select(master_index.Table_Name, master_index.Table_Length, queryRelevance.Columns, queryRelevance.scores)
    queryRelevance = queryRelevance.rdd.filter(lambda x: int(x['Table_Length']) >= int(min_row))
    if queryRelevance.isEmpty():
        print("Sorry, nothing matched in column search, please try a different keyword")
    else:
        print("Here is your column search result")
        queryRelevance.toDF().show()
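# Example invocation of column_search (hypothetical keywords; 'n' means no minimum
# row-count filter):
# column_search("latitude longitude", 'n')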
def features_normal(self, featuresRDD):
    from pyspark.mllib.feature import Normalizer
    # featuresRDD = featuresRDD.map(lambda point: Row(point.label, point.features))
    # fields = [StructField('label', FloatType(), nullable=True),
    #           StructField('features', ArrayType(elementType=FloatType()), nullable=True)]
    # schema = StructType(fields)
    fl_df = self.spark.createDataFrame(featuresRDD).toDF('label', 'features')
    # fl_df.foreach(print)
    # scaler = MinMaxScaler(inputCol='features', outputCol='SCfeatures')
    # scalerModel = scaler.fit(fl_df)
    # SCdata = scalerModel.transform(fl_df).select('label', 'SCfeatures')
    # SCdata.foreach(print)
    normalizer = Normalizer()
    featuresRDD = featuresRDD.map(lambda p: [p[0], normalizer.transform(p[1])])
    # featuresRDD = featuresRDD.map(lambda p: LabeledPoint(int(p[0]), p[1]))
    return featuresRDD
def normTFIDF(fn_tokens_RDD, vecDim, caching=True):
    keysRDD = fn_tokens_RDD.keys()
    tokensRDD = fn_tokens_RDD.values()
    # pass the vecDim value through via a lambda
    tfVecRDD = tokensRDD.map(lambda tokens: hashing_vectorize(tokens, vecDim))
    if caching:
        # we will read this RDD more than once, so caching it in memory makes things quicker
        tfVecRDD.persist(StorageLevel.MEMORY_ONLY)
    idf = IDF()  # create the IDF object
    idfModel = idf.fit(tfVecRDD)  # calculate IDF values (first pass over the data)
    tfIdfRDD = idfModel.transform(tfVecRDD)  # second pass transforms the RDD (see lecture slides)
    norm = Normalizer()  # create a Normalizer object as in the example linked above
    normTfIdfRDD = norm.transform(tfIdfRDD)  # and apply it to the tfIdfRDD
    zippedRDD = keysRDD.zip(normTfIdfRDD)  # zip the keys and normalized vectors back together
    return zippedRDD
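# `hashing_vectorize` is used above but not defined in this snippet. A minimal sketch of
# what such a helper might look like (hypothetical implementation, assuming plain feature
# hashing of term counts into a fixed-size vector):
def hashing_vectorize(tokens, vecDim):
    counts = [0.0] * vecDim
    for token in tokens:
        counts[hash(token) % vecDim] += 1.0  # bucket each token by its hash
    return counts  # Spark converts the list to a DenseVector when it is used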
def get_features(line, categorical_length, mappage):
    categorical_vector = np.zeros(categorical_length)
    i = 0
    offset = 0
    # access every categorical feature
    for field in line[0:3]:
        # access every dictionary in the overall mapping
        map_dict = mappage[i]
        # get the index in the dict whose value is different than 0
        index = map_dict[field]
        # assign the value of 1 to the corresponding global index in categorical_vector
        categorical_vector[index + offset] = 1
        # go to the next dict and increase the offset
        i = i + 1
        offset = offset + len(map_dict)
    # build the vector of numerical features, normalizing it to fix scaling problems later
    normalizer = Normalizer()
    numerical_vector = normalizer.transform(np.array([float(val) for val in line[3:8]]))
    # transform() returns a DenseVector; convert it back to a NumPy array for concatenation
    return np.concatenate((categorical_vector, numerical_vector.toArray()))
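# Example call (hypothetical record layout matching the line[0:3] / line[3:8] slices above:
# three categorical fields followed by five numeric fields):
# row = ['US', 'web', 'chrome', '1.0', '2.5', '0.3', '7.1', '9.9']
# feature_vector = get_features(row, categorical_length, mapping_dicts)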
def ml_features_normal(self, features):
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import Normalizer
    from pyspark.ml.classification import NaiveBayes
    from tools import f

    features.foreach(print)
    fea_df = features.map(lambda i: Row(**f(i))).toDF()
    # fea_df.show()
    normalizer = Normalizer().setInputCol('features').setOutputCol('norfeatures').setP(1.0)
    norfea_df = normalizer.transform(fea_df)
    # norfea_df.show()
    train_dt, test_dt = norfea_df.randomSplit([0.8, 0.2])
    # train on the normalized column; the default featuresCol would ignore the normalization
    nvby = NaiveBayes(modelType="multinomial", smoothing=0.1, featuresCol="norfeatures")
    nvby_mod = nvby.fit(dataset=train_dt)
    predictRDD = nvby_mod.transform(test_dt).rdd
    count = predictRDD.count()
    print(predictRDD.map(lambda i: (i.label, i.prediction))
          .filter(lambda i: i[0] == i[1]).count() / count)
def learn_model(sc, file_path, normalize):
    feature_file = sc.textFile(file_path).map(lambda l: l.split("\t"))
    points = feature_file.map(lambda f: LabeledPoint(f[1], f[2:]))
    # normalizing
    if normalize:
        nor = Normalizer()
        labels = points.map(lambda x: x.label)
        features = points.map(lambda x: x.features)
        points = labels.zip(nor.transform(features))
        points = points.map(lambda i: LabeledPoint(i[0], i[1]))
    training, testing = points.randomSplit([0.7, 0.3], 11)
    index = 0
    iterations = 100
    p_mse = -1
    converge = False
    result = {}
    while not converge:
        x = time.clock()
        model = LinearRegressionWithSGD.train(training, iterations=iterations,
                                              step=0.00001, intercept=True, regType="l1")
        y = time.clock()
        print("========== time = " + str(y - x))
        preds = testing.map(lambda p: (p.label, model.predict(p.features)))
        MSE = preds.map(lambda r: (r[1] - r[0]) ** 2).reduce(lambda x, y: x + y) / preds.count()
        print("========== MSE = " + str(MSE))
        if p_mse == MSE:
            converge = True
        iterations = iterations + 100
        result[iterations] = MSE
        p_mse = MSE
    print(result)
    return model
def normalizeData(sc,
                  fileToNormalize="subalg/item_item/output/item_item_results_unnormalized.txt",
                  fileToCreate="subalg/item_item/output/item_item_results.txt"):
    '''Normalizes the rating values in a subalg output file'''
    def parseLine(line):
        '''Inner helper for getting just the rating value'''
        return float(line.split(' ')[2])  # just the ratings

    n2 = Normalizer()
    ratings = sc.textFile(fileToNormalize).map(parseLine)
    # rdd = sc.parallelize(ratings, 2)
    results = n2.transform(ratings.collect())
    # open to read and write simultaneously, updating the weights
    i = 0
    with open(fileToNormalize) as f:
        with open(fileToCreate, "a+") as fToCreate:
            # for each line in the file to normalize
            for line in f:
                line = line.split()
                fToCreate.write(line[0] + " " + line[1] + " " + str(results[i]) + "\n")
                i += 1
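# Note: ratings.collect() hands Normalizer one plain Python list, so the whole ratings
# column is treated as a single vector scaled by its L2 norm. A quick check with toy values:
# Normalizer().transform([3.0, 4.0]) returns DenseVector([0.6, 0.8]), since ||(3, 4)||_2 = 5.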
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.mllib.feature import Normalizer

def get_tfidf_features(txt_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt_rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf

nor = Normalizer(1)
words_bag1 = get_tfidf_features(all_words1)
nor_words_bag1 = nor.transform(words_bag1)
words_bag2 = get_tfidf_features(all_words2)
nor_words_bag2 = nor.transform(words_bag2)

# cell 6
# LDA Modeling
## REFERENCE: http://spark.apache.org/docs/latest/mllib-clustering.html#latent-dirichlet-allocation-lda
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

corpus = nor_words_bag1.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
ldaModel = LDA.train(corpus, k=5)
def parseLabeledPoint(line):
    columnValues = [float(x) for x in line.split(',')]
    label, features = columnValues[0], columnValues[1:]
    return LabeledPoint(label, features)

labels = msDataRDD.map(lambda x: x.split(',')[0]).collect()
minYear = float(min(labels))
rawLabeledPoints = msDataRDD.map(parseLabeledPoint)
labeledPoints = rawLabeledPoints.map(lambda lp: LabeledPoint(lp.label - minYear, lp.features))
labels = labeledPoints.map(lambda x: x.label)
features = labeledPoints.map(lambda x: x.features)

from pyspark.mllib.feature import Normalizer
normalizer = Normalizer()
data = labels.zip(normalizer.transform(features))
parsedData = data.map(lambda lp: LabeledPoint(lp[0], lp[1]))

# Part 1
def lossFunction(weights, lp):
    """
    Compute the value (w^T x - y) x; this function is tested on two examples below.
    """
    return np.dot((weights.dot(lp.features) - lp.label), lp.features)

from pyspark.mllib.linalg import DenseVector
# test example one
weightOne = DenseVector([4, 5, 6])
# Converting data to LabeledPoint
def getDataAndLabel(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

labelled_data = data.map(getDataAndLabel)
labels = labelled_data.map(lambda x: x.label)
features = labelled_data.map(lambda x: x.features)
minYear = labels.min()
scaledLabeleData = labelled_data.map(lambda m: LabeledPoint(m.label - minYear, m.features))
scaledLabels = scaledLabeleData.map(lambda x: x.label)

normalizer = Normalizer()
normalized_data = scaledLabels.zip(normalizer.transform(features))
normalized_lp_data = normalized_data.map(lambda x: LabeledPoint(x[0], x[1]))

# Splitting the dataset into training, validation, test sets
trainSplit, valSplit, testSplit = normalized_lp_data.randomSplit([.6, .2, .2], 50)
iteration = 50
trainWeights, trainError = gradientDescent(trainSplit, iteration)
trainingResult = trainSplit.map(lambda x: prediction(trainWeights, x))
trainRMSE = calculate_rmse(trainingResult)
valResult = valSplit.map(lambda y: prediction(trainWeights, y))
valRMSE = calculate_rmse(valResult)
print 'Error in Training data: ', trainRMSE
print 'Error in Validation data: ', valRMSE
def moiveDataTest():
    # 3.2.2 Exploring the movie data
    # conf = SparkConf().setAppName("moiveTest")
    sc = SparkContext(conf=conf)
    moiveFileName = "/home/zhb/Desktop/work/SparkData/ml-100k/u.item"
    moive_data = sc.textFile(moiveFileName)
    print moive_data.first()
    num_moives = moive_data.count()
    print "Movies: %d" % num_moives

    # Data transformation
    def convert_year(x):
        try:
            return int(x[-4:])
        except:
            # if the year is missing, set it to 1900; such records are filtered out later
            return 1900

    moive_fields = moive_data.map(lambda lines: lines.split("|"))
    years = moive_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))
    years_filtered = years.filter(lambda x: x != 1900)
    moive_ages = years_filtered.map(lambda yr: 1998 - yr).countByValue()
    values = moive_ages.values()
    bins = moive_ages.keys()
    pylab.hist(values, bins=bins, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

    # 3.3 Filling in non-conforming and missing data
    years_pre_processed = moive_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x)).collect()
    years_pre_processed_array = np.array(years_pre_processed)
    # mean release year, excluding the non-conforming records
    mean_year = np.mean(years_pre_processed_array[years_pre_processed_array != 1900])
    # median release year, excluding the non-conforming records
    median_year = np.median(years_pre_processed_array[years_pre_processed_array != 1900])
    # find the index of the non-conforming data point
    index_bad_data = np.where(years_pre_processed_array == 1900)[0][0]
    # use the median as that record's release year
    years_pre_processed_array[index_bad_data] = median_year
    print "Mean year of release : %d" % mean_year
    print "Median year of release : %d" % median_year
    print "Index of '1900' after assigning median : %s" % np.where(years_pre_processed_array == 1900)[0]

    # 3.4.4 Text features
    def extrat_title(raw):
        import re
        grps = re.search("\((\w+)\)", raw)
        if grps:
            return raw[:grps.start()].strip()
        else:
            return raw

    raw_titles = moive_fields.map(lambda fields: fields[1])
    b = [extrat_title(raw_title) for raw_title in raw_titles.take(5)]
    print b
    moive_titles = raw_titles.map(lambda m: extrat_title(m))
    # tokenize the titles into terms using simple whitespace tokenization
    title_terms = moive_titles.map(lambda t: t.split(" "))
    print title_terms.take(5)
    # use flatMap to expand the list of strings in each record of the title_terms RDD
    # into a new RDD of strings; collect all possible terms to build a term-to-index dictionary
    all_terms = title_terms.flatMap(lambda x: x).distinct().collect()
    # create a new dictionary to hold the terms, assigning each a 1-of-k index
    idx = 0
    all_terms_dict = {}
    for term in all_terms:
        all_terms_dict[term] = idx
        idx += 1
    print "Total number of terms:%d" % len(all_terms_dict)
    print "Index of term 'Dead':%d" % all_terms_dict['Dead']
    print "Index of term 'Rooms': %d" % all_terms_dict['Rooms']
    # Spark's zipWithIndex function gives the same result more efficiently
    all_terms_dict2 = title_terms.flatMap(lambda x: x).distinct().zipWithIndex().collectAsMap()
    print "Index of term 'Dead':%d" % all_terms_dict['Dead']
    print "Index of term 'Rooms': %d" % all_terms_dict['Rooms']

    # convert a set of terms into a sparse-vector representation
    def create_vector(terms, term_dict):
        from scipy import sparse as sp
        num_terms = len(term_dict)
        x = sp.csc_matrix((1, num_terms))
        for t in terms:
            if t in term_dict:
                idx = term_dict[t]
                x[0, idx] = 1
        return x

    all_terms_bcast = sc.broadcast(all_terms_dict)
    term_vector = title_terms.map(lambda terms: create_vector(terms, all_terms_bcast.value))
    print term_vector.take(5)

    # Normalizing features
    np.random.seed(42)
    x = np.random.randn(10)
    norm_x_2 = np.linalg.norm(x)
    normalized_x = x / norm_x_2
    print "x:\n%s" % x
    print "2-Norm of x: %2.4f" % norm_x_2
    print "Normalized x:\n%s" % normalized_x
    print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x)

    from pyspark.mllib.feature import Normalizer
    normalizer = Normalizer()
    vector = sc.parallelize([x])
    normalized_x_mllib = normalizer.transform(vector).first().toArray()
    print "x:\n%s" % x
    print "2-Norm of x: %2.4f" % norm_x_2
    print "Normalized x MLlib:\n%s" % normalized_x_mllib
    print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib)
def parseDataF(x):
    label = 0
    features = x[1]
    if x[0][1] != "None" and x[0][1] is not None:
        label = int(x[0][1])
    return LabeledPoint(int(label), features)

conf = SparkConf().setAppName('Chicago Linear Reg')
sc = SparkContext(conf=conf)
normalizer1 = Normalizer(1)
v = Vectors.dense(range(3))
nor = Normalizer(1)
normalizer1.transform(v)
print normalizer1.transform(v)

inputs = sys.argv[1]  # "/Volumes/personal/uzmaa-irmacs/Chicago/data/FeatureSetByCrimePickleC"
output = sys.argv[2]  # "/users/uzmaa/Desktop/output"
inputrdd = sc.pickleFile(inputs)
inputrddidlabel = inputrdd.map(lambda (id, (features, label)): (id, label))
inputrddidfeatures = inputrdd.map(lambda (id, (features, label)): features)
num_ratings_per_user = rating_data.map(lambda line: line.split("\t")[0]).countByValue()
rating_freq = num_ratings_per_user.values()
# plt.hist(rating_freq, bins=100)
# plt.show()

''' 1-of-k encoding '''
distinct_occupations = user_occupations.distinct()
occupationLUT = {}
for occupation in distinct_occupations.collect():
    occupationLUT[occupation] = len(occupationLUT)

def getOccupationEncoding(occupation, LUT):
    binary_x = np.zeros(len(LUT))
    binary_x[LUT[occupation]] = 1
    return binary_x

# print getOccupationEncoding("administrator", occupationLUT)
rating_times = rating_data.map(lambda line: datetime.datetime.fromtimestamp(int(line.split("\t")[-1])))
rating_hours = rating_times.map(lambda time: time.hour)
# print rating_hours.take(10)

x = np.random.rand(10)
from pyspark.mllib.feature import Normalizer
normalizer = Normalizer()  # scales every row by its own norm
vector = sc.parallelize([x, x + 1, x + 2])
# print normalizer.transform(vector).collect()
# Vectors of length 1500. The documents variable already holds the documents split into
# words so the hashing function can be applied.
hashingTF = HashingTF(1500)
tf = hashingTF.transform(documents)

# Create an IDF object, which computes the inverse document frequency from the vectors
# obtained through the term-frequency hashing step.
idf = IDF(minDocFreq=2).fit(tf)

# With the tf and idf vectors, the TF-IDF can now be computed, leaving the documents
# vectorized and ready for the similarity calculation.
tfidf = idf.transform(tf)

# Create a Normalizer object to scale the TF-IDF vectors by their Euclidean norm. This is
# done so that the later dot product between any two vectors reduces to the cosine formula.
normalizer = Normalizer()

# Attach each document name to its corresponding normalized vector.
data = names.zip(normalizer.transform(tfidf))

# The cartesian operation takes the dot product of the normalized vectors (cosine) across
# all pairs, i.e. the result is the similarity of each element with every other element.
result = data.cartesian(data)\
    .map(lambda l: (l[0][0], {'doc_name': l[1][0], 'similarity': float(l[0][1].dot(l[1][1]))}))\
    .groupByKey()\
    .collect()

# Finally, iterate over the results and store them in MongoDB for querying. For each
# document, the name is saved as the _id along with a list of its relations to the other
# documents.
for value in result:
    simil_docs = spark.createDataFrame([(value[0], list(value[1]))], ["_id", "simil_docs"])
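# Since the vectors above are L2-normalized, a plain dot product equals cosine similarity.
# A minimal standalone check (toy vectors, not part of the pipeline above):
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors

a = Vectors.dense([1.0, 2.0, 0.0])
b = Vectors.dense([2.0, 1.0, 1.0])
norm2 = Normalizer()  # p=2 by default
a_n, b_n = norm2.transform(a), norm2.transform(b)
print(a_n.dot(b_n))  # equals a.dot(b) / (||a||_2 * ||b||_2), the cosine between a and b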
def createSparseVector(histogram):
    # histogram is an iterable of (clusterIndex, count) pairs; sparse vectors need sorted indices
    pairs = sorted(histogram)
    indexList = [idx for idx, _ in pairs]
    countList = [cnt for _, cnt in pairs]
    return Vectors.sparse(2000, indexList, countList)

reviewFeaturesRDD = reviewTextWithIndex\
    .flatMapValues(lambda word: word)\
    .map(lambda (rowNum, word): (word, rowNum))\
    .join(wordsClustersRDD)\
    .map(lambda (word, (rowNum, clusterIndex)): ((rowNum, clusterIndex), 1))\
    .reduceByKey(lambda x, y: x + y)\
    .map(lambda ((rowNum, clusterIndex), count): (rowNum, (clusterIndex, count)))\
    .groupByKey()\
    .mapValues(lambda histogram: createSparseVector(histogram))\
    .cache()

norm = Normalizer(1)
normalisedReviewFeaturesRDD = reviewFeaturesRDD.map(lambda (rowNum, features): rowNum)\
    .zip(norm.transform(reviewFeaturesRDD.map(lambda (rowNum, features): features)))

formatter_string = "%m %d %Y"
allDataRDD = normalisedReviewFeaturesRDD.join(reviewScoreTimeWithIndex)\
    .map(lambda (rowNum, (features, (score, time))): (score, features, datetime.datetime.strptime(time[-4:], "%Y").year))\
    .cache()

train_featureScoreTimeRDD = allDataRDD.filter(lambda (score, feature, time): time < 2014)\
    .map(lambda (score, feature, time): LabeledPoint(float(score), feature))\
    .repartition(10).cache()

val_featureScoreTimeRDD = allDataRDD.filter(lambda (score, feature, time): time >= 2014)\
    .map(lambda (score, feature, time): LabeledPoint(float(score), feature))\
    .repartition(10).cache()
def create_vector(terms, term_dict):
    from scipy import sparse as sp
    num_terms = len(term_dict)
    _x = sp.csc_matrix((1, num_terms))
    for t in terms:
        if t in term_dict:
            _idx = term_dict[t]
            _x[0, _idx] = 1
    return _x

all_terms_bcast = sc.broadcast(all_terms_dict)
terms_vectors = title_terms.map(lambda terms: create_vector(terms, all_terms_bcast.value))

np.random.seed(42)
x = np.random.randn(10)
norm_x_2 = np.linalg.norm(x)
normalized_x = x / norm_x_2
print("x:\n%s" % x)
print("2-Norm of x: %2.4f" % norm_x_2)
print("Normalized x:\n%s" % normalized_x)
print("2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x))

normalizer = Normalizer()
vector = sc.parallelize([x])
normalized_x_mllib = normalizer.transform(vector).first().toArray()
print("x:\n%s" % x)
print("2-Norm of x: %2.4f" % norm_x_2)
print("Normalized x MLlib: \n%s" % normalized_x_mllib)
print("2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib))
from pyspark.mllib.feature import Normalizer

conf = SparkConf()
conf.setMaster('yarn-client')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
path = "/Users/sradhakr/Desktop/Assignment3/Assignment3"

norm = Normalizer(2)
train_featureScoreTimeRDD = sc.pickleFile(path + 'trainDataRDD', 10)
val_featureScoreTimeRDD = sc.pickleFile(path + 'valDataRDD', 10)

train_featuresRDD = train_featureScoreTimeRDD.map(lambda (feature, score): feature)
# zip the normalized features with the scores taken from the original (feature, score) RDD
trainfeatureScoreNormRDD = norm.transform(train_featuresRDD).zip(
    train_featureScoreTimeRDD.map(lambda (feature, score): score))

val_featuresRDD = val_featureScoreTimeRDD.map(lambda (feature, score): feature)
valfeatureScoreNormRDD = norm.transform(val_featuresRDD).zip(
    val_featureScoreTimeRDD.map(lambda (feature, score): score))
def main():
    k_input_model = sys.argv[1]  # read kmeans model from this location
    w_input_model = sys.argv[2]  # read word2vec model from this location
    input_file = sys.argv[3]     # read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)

    '''sbaronia - load both kmeans and Word2Vec models'''
    kmean_model = KMeansModel.load(sc, k_input_model)
    word2vec_model = Word2VecModel.load(sc, w_input_model)

    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a list of all words in our model'''
    keys = sqlContext.read.parquet(w_input_model + "/data")
    keys_list = keys.rdd.map(lambda line: line.word).collect()

    '''sbaronia - here we create one vector per review, where the vector contains
    the number of times a cluster is assigned to a word in that review.
    We build a SparseVector-compatible format'''
    features = []
    for i in range(len(clean_list)):
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                histogram[clust] = histogram[clust] + 1
        features.append((2000, range(2000), histogram))

    '''sbaronia - create a normalized SparseVector rdd'''
    nor = Normalizer(1)
    # build the SparseVector directly from the (size, indices, values) tuple
    features_rdd = rdd_zip(sc.parallelize(features) \
                    .map(lambda line: nor.transform(SparseVector(*line))) \
                    .cache()).cache()

    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                      .drop(features_df.index).cache()

    '''sbaronia - create training and testing data based on year'''
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                .select('rating', 'feature') \
                .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                .coalesce(1) \
                .cache()

    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
               .select('rating', 'feature') \
               .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
               .coalesce(1) \
               .cache()

    '''sbaronia - find the best step using validation, run LinearRegressionWithSGD
    with that step, and report the final RMSE'''
    step_best_norm = validation(train_rdd)
    RMSE_norm = regression_and_error(train_rdd, test_rdd, step_best_norm)
    print("Final RMSE(Normalization) = " + str(RMSE_norm) + " Best Step size = " + str(step_best_norm))
def parse(doc):
    docID, docData = doc
    docData = re.sub(r'[^a-z0-9 ]', ' ', docData)
    docData = docData.split()
    docData = [x for x in docData if x not in stopWordList]
    docData = [porter.stem(word) for word in docData]
    return (docID, docData)

data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)
titles = data.map(lambda x: x[0])
documents = data.map(lambda x: x[1])
hashingTF = HashingTF()
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF().fit(tf)
normalizer = Normalizer()
tfidf = normalizer.transform(idf.transform(tf))
tfidfData = titles.zip(tfidf).toDF(["label", "features"])
# idf.rdd.saveAsTextFile("idf_model")
# sc.parallelize(idf.idf()).coalesce(1).saveAsTextFile("idf")
# MLUtils.saveAsLibSVMFile(tfidfData, "tfidf_column.out")
query = parse((0, "location_id organization_id name latitude longitude bbl bin cd council nta tract"))[1]
queryTF = hashingTF.transform(query)
queryTFIDF = normalizer.transform(idf.transform(queryTF))
queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(queryTFIDF)))) \
    .sortBy(lambda x: -x[1]) \
    .filter(lambda x: x[1] > 0)
if queryRelevance.isEmpty():
    print("Sorry, nothing matched in column search, please try a different keyword")
def features_normal(self, featuresRDD):
    from pyspark.mllib.feature import Normalizer
    normalizer = Normalizer()
    featuresRDD = featuresRDD.map(lambda p: [p[0], normalizer.transform(p[1])])
    return featuresRDD
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="NormalizerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    labels = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    normalizer1 = Normalizer()
    normalizer2 = Normalizer(p=float("inf"))

    # Each sample in data1 will be normalized using $L^2$ norm.
    data1 = labels.zip(normalizer1.transform(features))

    # Each sample in data2 will be normalized using $L^\infty$ norm.
    data2 = labels.zip(normalizer2.transform(features))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
        print(each)
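# For a vector x and norm p, Normalizer(p) rescales x to x / ||x||_p. A quick sanity check
# with a single dense vector (toy values):
# Normalizer().transform(Vectors.dense([3.0, 4.0]))               -> [0.6, 0.8],  since ||x||_2 = 5
# Normalizer(p=float("inf")).transform(Vectors.dense([3.0, 4.0])) -> [0.75, 1.0], since max|x_i| = 4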
# %% [markdown]
# Date: 2020-12-20 14:51:00  Rank: none
# score: 0.5000562

# %% [markdown]
# ## SVM with Normalized data

# %%
from pyspark.mllib.classification import SVMModel
svm_model2 = SVMModel.load(
    sc, "hdfs://node1:9000/user/root/exp4/models/NormalizedSVMWithSGDModel")

# %%
# normalize the data
from pyspark.mllib.feature import Normalizer
features = test.map(lambda x: x[3])
normalizer = Normalizer()
normalized_test = test.map(lambda lp: (lp[0], lp[1])).zip(
    normalizer.transform(features)).map(lambda lp: (lp[0][0], lp[0][1], lp[1]))

# %%
print(normalized_test.take(10))

# %%
svm_predictions2 = normalized_test.map(
    lambda line: (line[0], line[1], float(svm_model2.predict(line[2]))))
svm_predictions2.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/svm_predictions2.csv")

# %%
spark.stop()
# coding=utf-8
from pyspark import SparkContext, SparkConf
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Normalize').setMaster('local[2]')
sc = SparkContext(conf=conf)

data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

normalizer1 = Normalizer()                # L2 norm by default
normalizer2 = Normalizer(p=float('inf'))  # L-infinity norm

data1 = labels.zip(normalizer1.transform(features))
data2 = labels.zip(normalizer2.transform(features))

print(data.first())
print(data1.first())
print(data2.first())

sc.stop()
# Converting data to LabeledPoint
def transform_to_labeled_point(line):
    values = [float(x) for x in line.split(',')]
    # return values
    return LabeledPoint(values[0], values[1:])

msd_labeled = msd.map(transform_to_labeled_point)
labels = msd_labeled.map(lambda x: x.label)
features = msd_labeled.map(lambda x: x.features)

min_label = labels.min()
scaled_label_msd = msd_labeled.map(lambda lp: LabeledPoint(lp.label - min_label, lp.features))
scaled_labels = scaled_label_msd.map(lambda x: x.label)
scaled_min_label = scaled_labels.min()

norm = Normalizer()
normalized_msd = scaled_labels.zip(norm.transform(features))
normalized_lp_msd = normalized_msd.map(lambda x: LabeledPoint(x[0], x[1]))

print 'Min label : ', min_label
print 'Scaled Min label : ', scaled_min_label
print 'Data with scaled label : ', scaled_label_msd.take(2)
print 'Data with scaled features : ', normalized_msd.take(2)
print 'Normalized data : ', normalized_lp_msd.take(2)
# Output
# Min label :  1930.0

# Split the dataset
train_data, validation_data, test_data = normalized_lp_msd.randomSplit([.7, .2, .1], 50)
# ## Normalizing Features

# ### Scaling the Norm of Vectors

# In[99]:

np.random.seed(42)
x = np.random.randn(10)
norm_x_2 = np.linalg.norm(x)
normalized_x = x / norm_x_2
print "x:\n%s" % x
print "2-Norm of x: %2.4f" % norm_x_2
print "Normalized x:\n%s" % normalized_x
print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x)

# ### Scaling the Norm of Vectors with MLlib's Normalizer

# In[101]:

from pyspark.mllib.feature import Normalizer
normalizer = Normalizer()
vector = sc.parallelize([x])
normalized_x_mllib = normalizer.transform(vector).first().toArray()
print "x:\n%s" % x
print "2-Norm of x: %2.4f" % norm_x_2
print "Normalized x MLlib:\n%s" % normalized_x_mllib
print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib)
# 11.5 Algorithms: MLlib's main algorithms and their input and output types

# 11.5.1 Feature extraction: the mllib.feature package
# TF-IDF: term frequency - inverse document frequency

# Scale to mean 0, standard deviation 1
from pyspark.mllib.feature import StandardScaler
vectors = [Vectors.dense([-2.0, 5.0, 1.0]), Vectors.dense([2.0, 0.0, 1.0])]
dataset = sc.parallelize(vectors)
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(dataset)
result = model.transform(dataset)

# Normalize to length 1
from pyspark.mllib.feature import Normalizer
normalizer = Normalizer(p=3.0)
result2 = normalizer.transform(dataset)

# Word2Vec
"""
Word2Vec is a neural-network-based text featurization algorithm whose output can be fed
to many downstream algorithms. The mllib.feature.Word2Vec class provides it.
"""

# 11.5.2 Statistics: the mllib.stat.Statistics class provides several statistical
# functions that can be used directly on RDDs
"""
Statistics.colStats(rdd) computes summary statistics of an RDD of vectors, including the
max, min, mean and variance of each column.
Statistics.corr(rdd, method) computes the correlation matrix between the columns of an
RDD of vectors; method must be 'pearson' or 'spearman'.
Statistics.corr(rdd1, rdd2, method) computes the correlation matrix of two RDDs; method
as above.
Statistics.chiSqTest(rdd) computes Pearson's independence test between each feature and
the label for an RDD of LabeledPoint objects. It returns a ChiSqTestResult object holding
the p-value, test statistic and degrees of freedom for each feature. Features and labels
must be categorical, i.e. discrete values.
"""
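# A minimal sketch of the Statistics helpers described above, applied to the vector RDD
# (dataset) built earlier in this snippet:
from pyspark.mllib.stat import Statistics

summary = Statistics.colStats(dataset)
print(summary.mean())      # per-column means
print(summary.variance())  # per-column variances
print(Statistics.corr(dataset, method="pearson"))  # column correlation matrix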
conf = SparkConf().setAppName('cosSim')
sc = SparkContext(conf=conf)

file = open('retweet_sim_analysis.json')
results = json.load(file)
rdd = sc.parallelize(results)

def mapDocs(user):
    doc = []
    for retweet in user['retweets']:
        doc.extend([retweet[0]] * retweet[1])
    return doc

normalizer1 = Normalizer()

def cosineSimilarity(tupl):
    x, y = tupl
    return (x[0], y[0], x[1].dot(y[1]))

def setNodeEdge(user):
    return {'source': user[0], 'target': user[1], 'weight': user[2]}

rddDocs = rdd.map(lambda x: mapDocs(x))
rddLabels = rdd.map(lambda x: x['_id'])
hashingTF = HashingTF()
def parseDataF(x):
    label = 0
    features = x[1][0]
    if x[1][1] != "None" and x[1][1] is not None:
        label = int(x[1][1])
    return LabeledPoint(int(label), features)

conf = SparkConf().setAppName('Chicago Linear Reg')
sc = SparkContext(conf=conf)
normalizer1 = Normalizer()

inputs = sys.argv[1]  # "/Volumes/personal/uzmaa-irmacs/Chicago/data/FeatureSetSocialCrimePickle"
inputrdd = sc.pickleFile(inputs)
inputrddtrain = inputrdd.filter(lambda ((Community, Year, Month), (features, label)): Year < 2010)
inputrddtest = inputrdd.filter(lambda ((Community, Year, Month), (features, label)): Year < 2015 and Year >= 2010)
DataTrain = inputrddtrain.map(parseDataF)
DataTest = inputrddtest.map(parseDataF)
print DataTrain.take(5)
print DataTest.take(5)