def get_output(in_data):
    text = sc.textFile(in_data)

    nltk_data_path = "[your nltk data path]"  # may need to be changed to the SFU server path
    nltk.data.path.append(nltk_data_path)
    stop_words = set(stopwords.words("english"))

    cleaned_review = text.map(lambda review_line: clean_review(review_line, stop_words))

    data_set = cleaned_review.map(lambda cleaned_line:
                                (cleaned_line['reviewText'], cleaned_line['overall'],
                                time.strptime(cleaned_line['reviewTime'], '%m %d, %Y')))

    nor = Normalizer(1)
    # each record of data_set is (review_text, rating, review_date)
    training_data = data_set.filter(lambda r: r[2].tm_year < 2014).cache()
    training_ratings = training_data.map(lambda r: r[1])
    training_reviews = training_data.map(lambda r: r[0])
    training_tfidf_features = get_tfidf_features(training_reviews)

    nor_training = nor.transform(training_tfidf_features)
    training_output = training_ratings.zip(nor_training).coalesce(1)


    testing_data = data_set.filter(lambda r: r[2].tm_year == 2014).cache()
    testing_ratings = testing_data.map(lambda r: r[1])
    testing_reviews = testing_data.map(lambda r: r[0])
    testing_tfidf_features = get_tfidf_features(testing_reviews)

    nor_testing = nor.transform(testing_tfidf_features)
    testing_output = testing_ratings.zip(nor_testing).coalesce(1)

    return training_output, testing_output
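# Quick standalone check (illustrative only, not part of the original snippet) that the
# '%m %d, %Y' format above parses Amazon-style review dates such as "07 24, 2014":
import time
assert time.strptime("07 24, 2014", "%m %d, %Y").tm_year == 2014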
def column_search(words,row_filter):
    
    if row_filter == 'n' or row_filter == 'N':
        min_row = 0
    else:
        min_row = row_filter

    rawData = table_cols.join(master_index, master_index["Table_Name"]==table_cols["Name"]).rdd
    data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)

    titles = data.map(lambda x: x[0])
    documents = data.map(lambda x: x[1])
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    normalizer = Normalizer()
    tfidf = normalizer.transform(idf.transform(tf))
    tfidfData = titles.zip(tfidf).toDF(["label", "features"])
    
    query = parse((0, words))[1]
    queryTF = hashingTF.transform(query)
    queryTFIDF = normalizer.transform(idf.transform(queryTF))
    queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(queryTFIDF)))).sortBy(lambda x: -x[1]).filter(lambda x: x[1] > 0)
    queryRelevance = queryRelevance.toDF(["Doc_ID", "scores"])
    queryRelevance = queryRelevance.join(table_desc,queryRelevance.Doc_ID == table_desc.Doc_ID).select(table_desc.Doc_ID, queryRelevance.scores, table_desc.Columns)
    queryRelevance = queryRelevance.join(master_index, master_index.Doc_ID==queryRelevance.Doc_ID).select(master_index.Table_Name,master_index.Table_Length, queryRelevance.Columns, queryRelevance.scores)
    queryRelevance = queryRelevance.rdd.filter(lambda x: int(x['Table_Length']) >= int(min_row))
    if (queryRelevance.isEmpty()):
        print("Sorry, nothing matched in column search, please try a different keyword")
    else:
        print("Here is your column search result")
        queryRelevance.toDF().show()
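# Hypothetical invocations of column_search (the keywords and row filters are made up, and the
# table_cols, master_index and table_desc DataFrames it uses are assumed to be loaded already):
column_search("location latitude longitude", 'n')   # 'n'/'N' means no minimum row count
column_search("customer name address", 100)         # only rank tables with at least 100 rows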
Example #3
 def features_normal(self, featuresRDD):
     from pyspark.mllib.feature import Normalizer
     # featuresRDD=featuresRDD.map(lambda point:Row(point.label,point.features))
     # fields=[StructField('label',FloatType(),nullable=True),StructField('features',ArrayType(elementType=FloatType()),nullable=True)]
     # schema=StructType(fields)
     fl_df = self.spark.createDataFrame(featuresRDD).toDF(
         'label', 'features')
     # fl_df.foreach(print)
     # scaler=MinMaxScaler(inputCol='features',outputCol='SCfeatures')
     # scalerModel=scaler.fit(fl_df)
     # SCdata=scalerModel.transform(fl_df).select('label','SCfeatures')
     # SCdata.foreach(print)
     normalizer = Normalizer()
     featuresRDD = featuresRDD.map(
         lambda p: [p[0], normalizer.transform(p[1])])
     # featuresRDD=featuresRDD.map(lambda p:LabeledPoint(int(p[0]),p[1]))
     return featuresRDD
def normTFIDF(fn_tokens_RDD, vecDim, caching=True):
    keysRDD = fn_tokens_RDD.keys()
    tokensRDD = fn_tokens_RDD.values()
    tfVecRDD = tokensRDD.map(lambda tokens: hashing_vectorize(
        tokens, vecDim))  # passing the vecDim value. TIP: you need a lambda.
    if caching:
        tfVecRDD.persist(
            StorageLevel.MEMORY_ONLY
        )  # since we will read more than once, caching in Memory will make things quicker.
    idf = IDF()  # create IDF object
    idfModel = idf.fit(tfVecRDD)  # calculate IDF values
    tfIdfRDD = idfModel.transform(
        tfVecRDD)  # 2nd pass needed (see lecture slides), transforms RDD
    norm = Normalizer(
    )  # create a Normalizer object like in the example linked above
    normTfIdfRDD = norm.transform(tfIdfRDD)  # and apply it to the tfIdfRDD
    zippedRDD = keysRDD.zip(normTfIdfRDD)  # zip the keys and values together
    return zippedRDD
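# hashing_vectorize is not defined in this snippet; a minimal sketch of what it plausibly does
# (an assumption, not necessarily the author's implementation) is to hash a token list into a
# fixed-size term-frequency vector:
from pyspark.mllib.feature import HashingTF

def hashing_vectorize(tokens, vec_dim):
    # hash each token into one of vec_dim buckets and count occurrences
    return HashingTF(vec_dim).transform(tokens)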
Example #5
def get_features(line, categorical_length,mappage):
    categorical_vector = np.zeros(categorical_length)
    i = 0 
    offset = 0
    # access every categorical feature
    for field in line[0:3]:
        # access each dictionary in the overall mapping
        map_dict = mappage[i]
        # look up the one-hot index for this field's value
        index = map_dict[field]
        # set the corresponding global index in the categorical vector to 1
        categorical_vector[index + offset] = 1
        # move on to the next dict and increase the offset
        i = i + 1
        offset = offset + len(map_dict)

    # build the vector of numerical features, normalizing them to avoid scaling problems later
    normalizer = Normalizer()
    numerical_vector = normalizer.transform(np.array([float(val) for val in line[3:8]]))
    return np.concatenate((categorical_vector,numerical_vector))
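# Hypothetical usage of get_features (the mapping dicts and the sample line are made up; an
# active SparkContext is also assumed, since mllib's Normalizer delegates to the JVM):
mapping = [{"a": 0, "b": 1}, {"x": 0, "y": 1, "z": 2}, {"low": 0, "high": 1}]
categorical_length = sum(len(d) for d in mapping)    # 7 one-hot slots in total
sample_line = ["b", "z", "low", "1.0", "2.5", "0.3", "4.1", "9.9"]
feature_vector = get_features(sample_line, categorical_length, mapping)
print(len(feature_vector))                           # 7 one-hot entries + 5 normalized numeric entries = 12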
Example #6
    def ml_features_normal(self, features):
        from pyspark.ml.linalg import Vectors
        from pyspark.ml.feature import Normalizer
        from pyspark.ml.classification import NaiveBayes
        from tools import f
        features.foreach(print)
        fea_df = features.map(lambda i: Row(**f(i))).toDF()
        # fea_df.show()
        normalizer = Normalizer().setInputCol('features').setOutputCol(
            'norfeatures').setP(1.0)
        norfea_df = normalizer.transform(fea_df)
        # norfea_df.show()
        train_dt, test_dt = norfea_df.randomSplit([0.8, 0.2])
        nvby = NaiveBayes(modelType="multinomial", smoothing=0.1)
        nvby_mod = nvby.fit(dataset=train_dt)

        predictRDD = nvby_mod.transform(test_dt).rdd
        count = predictRDD.count()
        print(
            predictRDD.map(lambda i: (i.label, i.prediction)).filter(
                lambda i: i[0] == i[1]).count() / count)
Example #7
def learn_model(sc, file_path, normalize):
	feature_file = sc.textFile(file_path).map(lambda l:l.split("\t"))

	points = feature_file.map(lambda f: LabeledPoint(f[1], f[2:]))
	
	#normalizing
	if normalize:
		nor      = Normalizer()
		labels   = points.map(lambda x: x.label)
		features = points.map(lambda x: x.features)
		points = labels.zip(nor.transform(features))
		points = points.map(lambda i: LabeledPoint(i[0], i[1]))

	training, testing = points.randomSplit([0.7,0.3],11)
	index = 0
	iterations = 100
	p_mse = -1
	converge = False
	result = {}
	while(not converge):
		x = time.perf_counter()
		model = LinearRegressionWithSGD.train(training, iterations=iterations, step=0.00001,intercept=True,regType="l1")
		y = time.perf_counter()
		print("========== time = " + str(y - x))
		preds = testing.map(lambda p: (p.label, model.predict(p.features)))
		MSE = preds.map(lambda r: (r[1] - r[0])**2).reduce(lambda x, y: x + y) / preds.count()
		print("========== MSE = " + str(MSE))
		if p_mse == MSE:
			converge = True

		iterations = iterations +100
		result[iterations] = MSE
		p_mse = MSE
	
	print(result)
	return model
Example #9
def normalizeData(sc, fileToNormalize="subalg/item_item/output/item_item_results_unnormalized.txt", 
	fileToCreate="subalg/item_item/output/item_item_results.txt"):
	'''Normalizes the rating values in a subalg output file and writes them to a new file'''


	def parseLine(line):
		'''Inner helper for getting just the rating value'''
		return float(line.split(' ')[2]) 	#just the ratings

	n2 = Normalizer()
	ratings = sc.textFile(fileToNormalize).map(parseLine)
	#rdd = sc.parallelize(ratings,2)
	results = n2.transform(ratings.collect())

	# open the files to read and write simultaneously, updating the weights
	i = 0
	with open(fileToNormalize) as f:
		with open(fileToCreate, "a+") as fToCreate:

			#for each line in file to norm
			for line in f:
				line = line.split()
				fToCreate.write(line[0] + " " + line[1] + " " + str(results[i])+"\n")
				i+=1
Example #10
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.mllib.feature import Normalizer


def get_tfidf_features(txt_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt_rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    return tfidf


nor = Normalizer(1)

words_bag1 = get_tfidf_features(all_words1)
nor_words_bag1 = nor.transform(words_bag1)

words_bag2 = get_tfidf_features(all_words2)
nor_words_bag2 = nor.transform(words_bag2)

# cell 6
# LDA Modeling
## REFERENCE: http://spark.apache.org/docs/latest/mllib-clustering.html#latent-dirichlet-allocation-lda
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

corpus = nor_words_bag1.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
ldaModel = LDA.train(corpus, k=5)
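# A brief follow-up sketch (not part of the original cell) for inspecting the fitted model;
# topicsMatrix() returns a vocabSize x k matrix of topic-term weights:
topics = ldaModel.topicsMatrix()
print("topics matrix shape:", topics.shape)        # (vocabulary size, 5 topics)
for k in range(5):
    print("topic", k, "largest term weight:", topics[:, k].max())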
Example #11
        label, features = columnValues[0], columnValues[1:]
        return LabeledPoint(label, features)

    labels = msDataRDD.map(lambda x: x.split(',')[0]).collect()
    minYear = float(min(labels))

    rawLabeledPoints = msDataRDD.map(parseLabeledPoint)
    labeledPoints = rawLabeledPoints.map(
        lambda lp: LabeledPoint(lp.label - minYear, lp.features))

    labels = labeledPoints.map(lambda x: x.label)
    features = labeledPoints.map(lambda x: x.features)

    from pyspark.mllib.feature import Normalizer

    normalizer = Normalizer()
    data = labels.zip(normalizer.transform(features))
    parsedData = data.map(lambda lp: LabeledPoint(lp[0], lp[1]))

    #Part 1
    def lossFunction(weights, lp):
        """
        Computes the gradient summand (w^T x - y) x for a single LabeledPoint;
        this function is tested on two examples below.
        """
        return np.dot((weights.dot(lp.features) - lp.label), lp.features)

    from pyspark.mllib.linalg import DenseVector

    #test example one
    weightOne = DenseVector([4, 5, 6])
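    # A hedged illustration of exercising lossFunction (lpOne and its values are made up here,
    # not the notebook's actual test case):
    lpOne = LabeledPoint(1.0, [3.0, 1.0, 4.0])
    print(lossFunction(weightOne, lpOne))   # (4*3 + 5*1 + 6*4 - 1) * [3, 1, 4] = [120, 40, 160]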
Example #12

#Converting data to LabeledPoint
def getDataAndLabel(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])


labelled_data = data.map(getDataAndLabel)
labels = labelled_data.map(lambda x: x.label)
features = labelled_data.map(lambda x: x.features)
minYear = labels.min()
scaledLabeleData = labelled_data.map(
    lambda m: LabeledPoint(m.label - minYear, m.features))
scaledLabels = scaledLabeleData.map(lambda x: x.label)
normalizer = Normalizer()
normalized_data = scaledLabels.zip(normalizer.transform(features))
normalized_lp_data = normalized_data.map(lambda x: LabeledPoint(x[0], x[1]))

#Splitting the dataset into training, validation, test sets
trainSplit, valSplit, testSplit = normalized_lp_data.randomSplit([.6, .2, .2],
                                                                 50)
iteration = 50
trainWeights, trainError = gradientDescent(trainSplit, iteration)
trainingResult = trainSplit.map(lambda x: prediction(trainWeights, x))
trainRMSE = calculate_rmse(trainingResult)
valResult = valSplit.map(lambda y: prediction(trainWeights, y))
valRMSE = calculate_rmse(valResult)

print('Error in Training data: ', trainRMSE)
print('Error in Validation data: ', valRMSE)
Example #13
def moiveDataTest():
    # 3.2.2 Exploring the movie data #
    conf = SparkConf().setAppName("moiveTest")
    sc = SparkContext(conf=conf)
    moiveFileName = "/home/zhb/Desktop/work/SparkData/ml-100k/u.item"
    moive_data = sc.textFile(moiveFileName)
    print(moive_data.first())
    num_moives = moive_data.count()
    print("Movies: %d" % num_moives)

    # data conversion
    def convert_year(x):
        try:
            return int(x[-4:])
        except:
            return 1900  # if the year is missing, set it to 1900; these records are filtered out later

    moive_fields = moive_data.map(lambda lines:lines.split("|"))
    years = moive_fields.map(lambda fields:fields[2]).map(lambda x:convert_year(x))
    years_filtered = years.filter(lambda x:x != 1900)

    moive_ages = years_filtered.map(lambda yr:1998-yr).countByValue()
    values = moive_ages.values()
    bins = moive_ages.keys()
    pylab.hist(values, bins = bins, color = 'lightblue', normed = True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16,10)

    # 3.3 Filling in bad and missing data
    years_pre_processed = moive_fields.map(lambda fields:fields[2]).map(lambda x:convert_year(x)).collect()
    years_pre_processed_array = np.array(years_pre_processed)

    # mean release year, excluding the bad records
    mean_year = np.mean(years_pre_processed_array[years_pre_processed_array != 1900])
    # median release year, excluding the bad records
    median_year = np.median(years_pre_processed_array[years_pre_processed_array != 1900])
    # find the index of the bad data point
    index_bad_data = np.where(years_pre_processed_array == 1900)[0][0]
    # use that index to assign the median as the bad record's release year
    years_pre_processed_array[index_bad_data] = median_year
    print("Mean year of release : %d" % mean_year)
    print("Median year of release : %d" % median_year)
    print("Index of '1900' after assigning median : %s" % np.where(years_pre_processed_array == 1900)[0])


    # 3.4.4 Text features
    def extrat_title(raw):
        import re
        grps = re.search(r"\((\w+)\)", raw)
        if grps:
            return raw[:grps.start()].strip()
        else:
            return raw

    raw_titles = moive_fields.map(lambda fields:fields[1])
    b = [extrat_title(raw_title) for raw_title in raw_titles.take(5)]
    print(b)

    moive_titles = raw_titles.map(lambda m:extrat_title(m))
    # tokenize the titles into terms with simple whitespace splitting
    title_terms = moive_titles.map(lambda t:t.split(" "))
    print(title_terms.take(5))

    # use flatMap to expand the list of strings in each record of the title_terms RDD into a new RDD of strings
    # collect all distinct terms below so a term-to-index dictionary can be built
    all_terms = title_terms.flatMap(lambda x:x).distinct().collect()
    # create a new dictionary holding the terms, assigning each a 1-of-k index
    idx = 0
    all_terms_dict = {}
    for term in all_terms:
        all_terms_dict[term] = idx
        idx += 1
    print "Total number of terms:%d" % len(all_terms_dict)
    print "Index of term 'Dead':%d" % all_terms_dict['Dead']
    print "Index of term 'Rooms': %d" % all_terms_dict['Rooms']

    # 通过Spark的zipWithIndex函数可以更高效的得到相同结果
    all_terms_dict2 = title_terms.flatMap(lambda x:x).distinct().zipWithIndex().collectAsMap()
    print "Index of term 'Dead':%d" % all_terms_dict['Dead']
    print "Index of term 'Rooms': %d" % all_terms_dict['Rooms']

    # 将一个词集合转换为一个稀疏向量的表示
    def create_vector(terms,term_dict):
        from scipy import sparse as sp
        num_terms = len(term_dict)
        x = sp.csc_matrix((1,num_terms))
        for t in terms:
            if t in term_dict:
                idx = term_dict[t]
                x[0,idx] = 1
        return x

    all_terms_bcast = sc.broadcast(all_terms_dict)
    term_vector = title_terms.map(lambda terms:create_vector(terms,all_terms_bcast.value))
    print(term_vector.take(5))


    # Normalizing the features
    np.random.seed(42)
    x = np.random.randn(10)
    norm_x_2 = np.linalg.norm(x)
    normalized_x = x / norm_x_2
    print "x:\n%s" % x
    print "2-Norm of x: %2.4f" %norm_x_2
    print "Normalized x:\n%s" %normalized_x
    print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x)

    from pyspark.mllib.feature import Normalizer
    normalizer = Normalizer()
    vector = sc.parallelize([x])
    normalized_x_mllib = normalizer.transform(vector).first().toArray()
    print "x:\n%s" % x
    print "2-Norm of x: %2.4f" %norm_x_2
    print "Normalized x MlLib:\n%s" %normalized_x_mllib
    print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib)
Example #14

def parseDataF(x):

    label = 0

    features = x[1]
    if x[0][1] != "None" and x[0][1] is not None:
        label = int(x[0][1])
    return LabeledPoint(int(label), features)


conf = SparkConf().setAppName('Chicago Linear Reg')
sc = SparkContext(conf=conf)

normalizer1 = Normalizer(1)

v = Vectors.dense(range(3))
nor = Normalizer(1)
print(normalizer1.transform(v))

inputs = sys.argv[
    1]  #"/Volumes/personal/uzmaa-irmacs/Chicago/data/FeatureSetByCrimePickleC"
output = sys.argv[2]  #"/users/uzmaa/Desktop/output"

inputrdd = sc.pickleFile(inputs)

inputrddidlabel = inputrdd.map(lambda kv: (kv[0], kv[1][1]))      # records are (id, (features, label))

inputrddidfeatures = inputrdd.map(lambda kv: kv[1][0])            # just the features
Example #15
num_ratings_per_user = rating_data.map(
    lambda line: line.split("\t")[0]).countByValue()
rating_freq = num_ratings_per_user.values()
# plt.hist(rating_freq, bins=100)
# plt.show()
''' 1 of k encoding '''
distinct_occupations = user_occupations.distinct()
occupationLUT = {}
for occupation in distinct_occupations.collect():
    occupationLUT[occupation] = len(occupationLUT)


def getOccupationEncoding(occupation, LUT):
    binary_x = np.zeros(len(LUT))
    binary_x[LUT[occupation]] = 1
    return binary_x


# print getOccupationEncoding("administrator", occupationLUT)

rating_times = rating_data.map(
    lambda line: datetime.datetime.fromtimestamp(int(line.split("\t")[-1])))
rating_hours = rating_times.map(lambda time: time.hour)
# print rating_hours.take(10)

x = np.random.rand(10)
from pyspark.mllib.feature import Normalizer
normalizer = Normalizer()  # scales each vector to unit L2 norm
vector = sc.parallelize([x, x + 1, x + 2])
# print normalizer.transform(vector).collect()
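# A quick check (illustrative, not in the original script) that each transformed row has unit L2 norm:
for row in normalizer.transform(vector).collect():
    print(np.linalg.norm(row.toArray()))   # ~1.0 for each of the three rows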
Example #16
# vectors of length 1500. The documents variable already holds the documents split into words so the
# hashing function can be applied.
hashingTF = HashingTF(1500)
tf = hashingTF.transform(documents)

# An IDF object is created, which computes the inverse document frequency from the vectors obtained by
# hashing the term frequencies
idf = IDF(minDocFreq=2).fit(tf)

# With the tf and idf vectors, the tf-idf can be computed so that the documents are vectorized and
# ready for the similarity calculation.
tfidf = idf.transform(tf)

# A Normalizer object is created to compute the Euclidean norm of the tf-idf vectors. This is done so
# that the dot product between all the values can then be taken, which amounts to the cosine formula.
normalizer = Normalizer()

# The document names are attached to their corresponding normalized vectors
data = names.zip(normalizer.transform(tfidf))

# The cartesian operation takes the dot product of the normalized vectors (cosine) between every pair,
# so the result is the similarity of each element with all the others
result = data.cartesian(data)\
    .map(lambda l: (l[0][0], {'doc_name':l[1][0], 'similarity':float(l[0][1].dot(l[1][1]))}))\
    .groupByKey()\
    .collect()
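# A quick standalone check (an illustration, not part of the original job) that the dot product of
# L2-normalized vectors equals cosine similarity:
import numpy as np
a, b = np.array([1.0, 2.0, 3.0]), np.array([2.0, 0.0, 1.0])
cosine = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))
a_n, b_n = a / np.linalg.norm(a), b / np.linalg.norm(b)
assert abs(cosine - a_n.dot(b_n)) < 1e-12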

# Finally the results are iterated over and stored in MongoDB so they can be queried. For each document,
# its name is saved as the _id together with a list of its relations to the other documents.
for value in result:
	simil_docs = spark.createDataFrame([(value[0],  list(value[1]))], ["_id", "simil_docs"])
Example #17
	return Vectors.sparse(2000, indexList,countList)


# records are (rowNum, [words]); build ((rowNum, clusterIndex), count) histograms per review
reviewFeaturesRDD = reviewTextWithIndex\
	.flatMapValues(lambda word: word)\
	.map(lambda kv: (kv[1], kv[0]))\
	.join(wordsClustersRDD)\
	.map(lambda kv: ((kv[1][0], kv[1][1]), 1))\
	.reduceByKey(lambda x, y: x + y)\
	.map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))\
	.groupByKey()\
	.mapValues(lambda histogram: createSparseVector(histogram))\
	.cache()


norm = Normalizer(1)

normalisedReviewFeaturesRDD = reviewFeaturesRDD.map(lambda kv: kv[0])\
	.zip(norm.transform(reviewFeaturesRDD.map(lambda kv: kv[1])))

formatter_string = "%m %d %Y"

# joined records are (rowNum, (features, (score, time)))
allDataRDD = normalisedReviewFeaturesRDD.join(reviewScoreTimeWithIndex)\
	.map(lambda kv: (kv[1][1][0], kv[1][0], datetime.datetime.strptime(kv[1][1][1][-4:], "%Y").year))\
	.cache()

# allDataRDD records are (score, feature, year)
train_featureScoreTimeRDD = allDataRDD.filter(lambda sft: sft[2] < 2014)\
	.map(lambda sft: LabeledPoint(float(sft[0]), sft[1]))\
	.repartition(10).cache()
val_featureScoreTimeRDD = allDataRDD.filter(lambda sft: sft[2] >= 2014)\
	.map(lambda sft: LabeledPoint(float(sft[0]), sft[1]))\
Example #18
    for t in terms:
        if t in term_dict:
            _idx = term_dict[t]
            _x[0, _idx] = 1
    return _x


all_terms_bcast = sc.broadcast(all_terms_dict)
terms_vectors = title_terms.map(
    lambda terms: create_vector(terms, all_terms_bcast.value))

np.random.seed(42)
x = np.random.randn(10)
norm_x_2 = np.linalg.norm(x)
normalized_x = x / norm_x_2
print("x:\n%s" % x)
print("2-Norm of x: %2.4f" % norm_x_2)
print("Normalized x:\n%s" % normalized_x)
print("2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x))

normalizer = Normalizer()
vector = sc.parallelize([x])

normalized_x_mllib = normalizer.transform(vector).first().toArray()

print("x:\n%s" % x)
print("2-Norm of x: %2.4f" % norm_x_2)
print("Normalized x MLlib: \n%s" % normalized_x_mllib)
print("2-Norm of normalized_x_mllib: %2.4f" %
      np.linalg.norm(normalized_x_mllib))
Example #19
from pyspark.mllib.feature import Normalizer


conf = SparkConf()

conf.setMaster('yarn-client')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


path = "/Users/sradhakr/Desktop/Assignment3/Assignment3"

train_featureScoreTimeRDD=sc.pickleFile(path+'trainDataRDD',10)
val_featureScoreTimeRDD=sc.pickleFile(path+'valDataRDD',10)

norm = Normalizer(2)



# records are (feature, score) pairs
train_featuresRDD = train_featureScoreTimeRDD.map(lambda fs: fs[0])

trainfeatureScoreNormRDD = norm.transform(train_featuresRDD).zip(train_featureScoreTimeRDD.map(lambda fs: fs[1]))


val_featuresRDD = val_featureScoreTimeRDD.map(lambda fs: fs[0])

valfeatureScoreNormRDD = norm.transform(val_featuresRDD).zip(val_featureScoreTimeRDD.map(lambda fs: fs[1]))
def main():
    k_input_model = sys.argv[1] #read kmean model from this location
    w_input_model = sys.argv[2] #read word2vec model from this location
    input_file = sys.argv[3] #read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    '''sbaronia - load both kmean and Word2Vec model'''
    kmean_model = KMeansModel.load(sc,k_input_model)
    word2vec_model = Word2VecModel.load(sc,w_input_model)

    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText','overall','reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
       
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a list of all words in our model'''
    keys = sqlContext.read.parquet(w_input_model+"/data")
    keys_list = keys.rdd.map(lambda line: line.word).collect()

    '''sbaronia - here we create one vector per review, where the vector
    contains the number of times a cluster is assigned to a word in
    the review. We make a SparseVector-compatible format'''
    features = []

    for i in range(len(clean_list)):
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                if histogram[clust] > 0:
                    histogram[clust] = histogram[clust] + 1
                else:
                    histogram[clust] = 1
        features.append((2000,range(2000),histogram))

    '''sbaronia - create a normalized SparseVector rdd'''
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features) \
                             .map(lambda line: nor.transform(SparseVector.parse(line))) \
                             .cache()).cache()

    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()
    
    '''sbaronia - create training and testing data based on year'''
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                            .select('rating','feature') \
                            .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                            .coalesce(1) \
                            .cache()
    
    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                           .select('rating','feature') \
                           .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                           .coalesce(1) \
                           .cache()

    '''sbaronia - find best step using validation and run LinearRegressionWithSGD 
    with that step and report final RMSE'''
    step_best_norm = validation(train_rdd)

    RMSE_norm = regression_and_error(train_rdd,test_rdd,step_best_norm)

    print("Final RMSE(Normalization) = " + str(RMSE_norm) + "  Best Step size = " + str(step_best_norm))
    docData = re.sub(r'[^a-z0-9 ]', ' ', docData)
    docData = docData.split()
    docData = [x for x in docData if x not in stopWordList]
    docData = [porter.stem(word) for word in docData]
    return (docID, docData)


data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)

titles = data.map(lambda x: x[0])
documents = data.map(lambda x: x[1])
hashingTF = HashingTF()
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF().fit(tf)
normalizer = Normalizer()
tfidf = normalizer.transform(idf.transform(tf))
tfidfData = titles.zip(tfidf).toDF(["label", "features"])
#idf.rdd.saveAsTextFile("idf_model")
#sc.parallelize(idf.idf()).coalesce(1).saveAsTextFile("idf")
#MLUtils.saveAsLibSVMFile(tfidfData, "tfidf_column.out")

query = parse((
    0,
    "location_id organization_id name latitude longitude bbl bin cd council nta tract"
))[1]
queryTF = hashingTF.transform(query)
queryTFIDF = normalizer.transform(idf.transform(queryTF))
queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(
    queryTFIDF)))).sortBy(lambda x: -x[1]).filter(lambda x: x[1] > 0)
if (queryRelevance.isEmpty()):
Example #22
 def features_normal(self, featuresRDD):
     from pyspark.mllib.feature import Normalizer
     normalizer = Normalizer()
     featuresRDD = featuresRDD.map(
         lambda p: [p[0], normalizer.transform(p[1])])
     return featuresRDD
Example #23
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="NormalizerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    labels = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    normalizer1 = Normalizer()
    normalizer2 = Normalizer(p=float("inf"))

    # Each sample in data1 will be normalized using $L^2$ norm.
    data1 = labels.zip(normalizer1.transform(features))

    # Each sample in data2 will be normalized using $L^\infty$ norm.
    data2 = labels.zip(normalizer2.transform(features))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
Example #24
# %% [markdown]
# Date: 2020-12-20 14:51:00  Rank: none
# score:0.5000562
# %% [markdown]
# ## SVM with Normalized data

# %%
from pyspark.mllib.classification import SVMModel
svm_model2 = SVMModel.load(
    sc, "hdfs://node1:9000/user/root/exp4/models/NormalizedSVMWithSGDModel")

# %%
# normalize the data
from pyspark.mllib.feature import Normalizer
features = test.map(lambda x: x[3])
normalizer = Normalizer()
normalized_test = test.map(lambda lp: (lp[0], lp[1])).zip(
    normalizer.transform(features)).map(lambda lp: (lp[0][0], lp[0][1], lp[1]))

# %%
print(normalized_test.take(10))

# %%
svm_predictions2 = normalized_test.map(
    lambda line: (line[0], line[1], float(svm_model2.predict(line[2]))))
svm_predictions2.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/svm_predictions2.csv")

# %%

spark.stop()
Example #25
# coding=utf-8

from pyspark import SparkContext, SparkConf
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Normalize').setMaster('local[2]')
sc = SparkContext(conf=conf)

data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
labels = data.map(lambda x : x.label)
features = data.map(lambda x : x.features)

normalizer1 = Normalizer()
normalizer2 = Normalizer(p=float('inf'))

data1 = labels.zip(normalizer1.transform(features))
data2 = labels.zip(normalizer2.transform(features))

print(data.first())
print(data1.first())
print(data2.first())
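# A tiny standalone illustration (not in the original script) of the difference between the two norms:
from pyspark.mllib.linalg import Vectors
v = Vectors.dense([2.0, -4.0, 4.0])
print(normalizer1.transform(v))   # each component divided by ||v||_2 = 6.0
print(normalizer2.transform(v))   # each component divided by max |v_i| = 4.0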


sc.stop()
Example #26
#Converting data to LabeledPoint
def transform_to_labeled_point(line):
    values = [float(x) for x in line.split(',')]
    #return values
    return LabeledPoint(values[0], values[1:])


msd_labeled = msd.map(transform_to_labeled_point)
labels = msd_labeled.map(lambda x: x.label)
features = msd_labeled.map(lambda x: x.features)
min_label = labels.min()
scaled_label_msd = msd_labeled.map(
    lambda lp: LabeledPoint(lp.label - min_label, lp.features))
scaled_labels = scaled_label_msd.map(lambda x: x.label)
scaled_min_label = scaled_labels.min()
norm = Normalizer()
normalized_msd = scaled_labels.zip(norm.transform(features))
normalized_lp_msd = normalized_msd.map(lambda x: LabeledPoint(x[0], x[1]))

print('Min label                 : ', min_label)
print('Scaled Min label          : ', scaled_min_label)
print('Data with scaled label    : ', scaled_label_msd.take(2))
print('Data with scaled features : ', normalized_msd.take(2))
print('Normalized data           : ', normalized_lp_msd.take(2))

#Output
#Min label                 :  1930.0

#Split the dataset
train_data, validation_data, test_data = normalized_lp_msd.randomSplit(
    [.7, .2, .1], 50)
# ## Normalizing Features

# ### Scaling the Norm of Vectors

# In[99]:

np.random.seed(42)
x = np.random.randn(10)
norm_x_2 = np.linalg.norm(x)
normalized_x = x / norm_x_2
print "x:\n%s" % x
print "2-Norm of x: %2.4f" % norm_x_2
print "Normalized x:\n%s" % normalized_x
print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x)


#  ### Scaling the Norm of Vectors with MLlib's Normalizer

# In[101]:

from pyspark.mllib.feature import Normalizer
normalizer = Normalizer()
vector = sc.parallelize([x])
normalized_x_mllib = normalizer.transform(vector).first().toArray()

print "x:\n%s" % x
print "2-Norm of x: %2.4f" % norm_x_2
print "Normalized x MLlib:\n%s" % normalized_x_mllib
print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib)

# 11.5 Algorithms: MLlib's main algorithms and their input and output types
# 11.5.1 Feature extraction: the mllib.feature package
# TF-IDF: term frequency - inverse document frequency
# scale to zero mean and unit standard deviation
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

vectors = [Vectors.dense([-2.0, 5.0, 1.0]), Vectors.dense([2.0, 0.0, 1.0])]
dataset = sc.parallelize(vectors)
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(dataset)
result = model.transform(dataset)
# normalize to length 1
from pyspark.mllib.feature import Normalizer

normalizer = Normalizer(p=3.0)
result2 = normalizer.transform(dataset)
# Word2Vec
"""
Word2Vec is a neural-network-based text featurization algorithm whose output can be passed to many
downstream algorithms; the mllib.feature.Word2Vec class provides it
"""

# 11.5.2 Statistics -- the mllib.stat.Statistics class offers several statistical functions that work directly on RDDs
"""
Statistics.colStats(rdd) computes a statistical summary of an RDD of vectors, including each column's max, min, mean and variance
Statistics.corr(rdd, method) computes the correlation matrix between the columns of an RDD of vectors; method must be 'pearson' or 'spearman'
Statistics.corr(rdd1, rdd2, method) computes the correlation between two RDDs, with method as above
Statistics.chiSqTest(rdd) computes Pearson's independence test between each feature and the label for an RDD of LabeledPoint objects,
                         returning a ChiSqTestResult object with the p-value, test statistic, and degrees of freedom for each feature. Features and labels must be categorical, i.e. discrete values
"""
conf = SparkConf().setAppName('cosSim')
sc = SparkContext(conf=conf)

file = open('retweet_sim_analysis.json')
results = json.load(file)
rdd = sc.parallelize(results)


def mapDocs(user):
    doc = []
    for retweet in user['retweets']:
        doc.extend([retweet[0]] * retweet[1])
    return doc


normalizer1 = Normalizer()


def cosineSimilarity(tupl):
    x, y = tupl
    return (x[0], y[0], x[1].dot(y[1]))


def setNodeEdge(user):
    return {'source': user[0], 'target': user[1], 'weight': user[2]}


rddDocs = rdd.map(lambda x: mapDocs(x))
rddLabels = rdd.map(lambda x: x['_id'])

hashingTF = HashingTF()

def parseDataF(x):

    label = 0
    features = x[1][0]
    if x[1][1] != "None" and x[1][1] is not None:
        label = int(x[1][1])
    return LabeledPoint(int(label), features)



conf = SparkConf().setAppName('Chicago Linear Reg')
sc = SparkContext(conf=conf)

normalizer1 = Normalizer()
inputs =  sys.argv[1]#"/Volumes/personal/uzmaa-irmacs/Chicago/data/FeatureSetSocialCrimePickle"

inputrdd=sc.pickleFile(inputs)

# records are ((Community, Year, Month), (features, label)); split on Year
inputrddtrain = inputrdd.filter(lambda kv: kv[0][1] < 2010)

inputrddtest = inputrdd.filter(lambda kv: 2010 <= kv[0][1] < 2015)

DataTrain = inputrddtrain.map(parseDataF)
DataTest = inputrddtest.map(parseDataF)

print(DataTrain.take(5))
print(DataTest.take(5))