def generate_user_emb(self, spark_session: SparkSession, raw_sample_data_path: str,
                      word2vec_model: Word2VecModel, emb_length: int,
                      emb_output_file_name: str, save_to_redis: bool,
                      redis_key_prefix: str):
    """
    Generate user embeddings by summing the embeddings of the movies each user rated.

    :param spark_session: active SparkSession
    :param raw_sample_data_path: rating sample CSV, relative to the resources dir
    :param word2vec_model: trained Word2VecModel holding the movie embeddings
    :param emb_length: embedding dimensionality (used for unknown movies)
    :param emb_output_file_name: output file name under webroot/modeldata/
    :param save_to_redis: whether to also write the embeddings to Redis
    :param redis_key_prefix: key prefix for Redis entries
    """
    root_dir = dirname(dirname(dirname(abspath(__file__))))
    rating_resource_path = join(root_dir, "resources", raw_sample_data_path)
    rating_samples = spark_session.read.format("csv").option("header", "true").load(rating_resource_path)
    rating_samples.show(10, truncate=False)

    user_embeddings_dict = dict()
    movie_keys = word2vec_model.getVectors().keys()
    for row in rating_samples.collect():
        user_id = row["userId"]
        movie_id = row["movieId"]
        # Movies missing from the vocabulary contribute a zero vector
        if movie_id not in movie_keys:
            movie_emb = np.zeros(emb_length)
        else:
            movie_emb = word2vec_model.transform(movie_id).toArray()
        if user_id in user_embeddings_dict:
            user_embeddings_dict[user_id] += movie_emb
        else:
            user_embeddings_dict[user_id] = np.copy(movie_emb)

    output_dir = join(root_dir, "resources", "webroot/modeldata/")
    # Context manager ensures the file handle is closed even on error
    with open(join(output_dir, emb_output_file_name), "w") as output_file:
        for user_id, user_emb in user_embeddings_dict.items():
            output_file.write(user_id + ":" + " ".join([str(num) for num in user_emb]) + "\n")

    if save_to_redis:
        redis_client = redis.Redis(host=self.redisEndpoint, port=self.redisPort)
        for user_id, user_emb in user_embeddings_dict.items():
            # Expire entries after 24 hours
            redis_client.set(redis_key_prefix + ":" + user_id,
                             " ".join([str(num) for num in user_emb]),
                             ex=60 * 60 * 24)
        redis_client.close()
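The method above sums movie vectors per user, so heavy raters end up with larger-norm embeddings. A minimal sketch of a common alternative, averaging instead of summing; `user_rating_counts` is a hypothetical dict of ratings per user, not part of the original code:

import numpy as np

def average_user_embeddings(user_embeddings_dict, user_rating_counts):
    """Hypothetical helper: normalize summed user vectors by rating count."""
    return {
        user_id: emb_sum / max(user_rating_counts.get(user_id, 1), 1)
        for user_id, emb_sum in user_embeddings_dict.items()
    }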
def main(argv):
    dir_path = '/user/rmusters/'
    word2vec = Word2Vec()
    sc = SparkContext(appName='Word2Vec')

    filename = "12.txt"
    inp = sc.textFile(dir_path + filename).map(lambda row: row.split(" "))
    model = word2vec.fit(inp)
    model.save(sc, dir_path + "pymodelF.bin")
    model = Word2VecModel.load(sc, dir_path + "pymodelF.bin")
    print(model.getVectors())
def load_model(self, key_text):
    model_path = self.get_model_path(key_text)
    model = None
    if os.path.exists(model_path):
        logger.info("Load existing word2vec model: {0}".format(model_path))
        model = Word2VecModel.load(self.sc, model_path)
    return model
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    model_path = HDFS_PATH + str(g_cache.user) + '/model/' + params['path']
    if not fs.exists(sc._jvm.org.apache.hadoop.fs.Path(model_path)):
        raise Exception("Invalid file path, path not exists!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(sc, model_path)
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(sc, model_path)
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(sc, model_path)
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(sc, model_path)
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(sc, model_path)
    else:
        raise Exception("Invalid model type!")
    return True, model
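The type dispatch above could also be table-driven, which keeps the loaders in one place; a sketch assuming the same model classes are already imported:

# Hypothetical table-driven variant of the dispatch above
MODEL_LOADERS = {
    'kmeans': KMeansModel.load,
    'fpgrowth': FPGrowthModel.load,
    'logistic-regression': LogisticRegressionModel.load,
    'word2vec': Word2VecModel.load,
    'decision-tree': DecisionTreeModel.load,
}

def load_by_type(sc, model_type, model_path):
    loader = MODEL_LOADERS.get(model_type)
    if loader is None:
        raise Exception("Invalid model type!")
    return loader(sc, model_path)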
def write_data(path):
    import filter
    from pyspark.mllib.feature import Word2Vec, Word2VecModel

    # load data
    loc = '/user/rmusters/text/2015/01/*'
    text_file = sc.textFile(loc)
    data = text_file.map(lambda line: filter.filter(line).split(" "))

    # load model
    model = Word2VecModel.load(sc, '/user/rmusters/2015model99')

    # get (tweet, vector) pairs; the word table is read from the saved model's
    # parquet data and broadcast so executors can look vectors up locally
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    lookup = sqlContext.read.parquet('/user/rmusters/2015model99/data').alias("lookup")
    lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())
    vectors = data.map(lambda ws: [lookup_bd.value.get(w) for w in ws])
    logger.info(vectors.count())

    # Python 2 tuple-unpacking lambdas are invalid in Python 3, so index explicitly
    data = text_file.map(lambda line: (line, filter.filter(line).split(" "))) \
        .map(lambda pair: (pair[0], pair[1], [lookup_bd.value.get(w) for w in pair[1]][0]))

    from pyspark.sql.functions import monotonically_increasing_id
    df = data.toDF(["text", "filtered_text", "vectors"])
    # This will return a new DF with all the columns + id
    res = df.withColumn("id", monotonically_increasing_id())
    res.write.parquet(path, mode="overwrite")
def __init__(self, sc, conf, articles):
    self.sc = sc
    self.conf = conf
    if os.path.exists(conf.w2v_model):
        logger.info("Load existing word2vec model: {0}".format(self.conf.w2v_model))
        self.model = Word2VecModel.load(self.sc, self.conf.w2v_model)
    else:
        logger.info("Compute word2vec word embedding model...")
        text = articles. \
            flatMap(lambda a: a.paragraphs). \
            flatMap(lambda p: p.sentences). \
            map(lambda s: s.replace(".", " ").split(" "))
        self.model = Word2Vec(). \
            setNumPartitions(100). \
            fit(text)
        self.model.save(self.sc, self.conf.w2v_model)
def get_word2vec_model(sc, app_home):
    logger.info("Getting word2vec model of corpus")
    w2v_path = os.path.join(app_home, "cache", word2vec_model_path)
    corpus_path = os.path.join(app_home, "cache", corpus_file_name)
    model = None
    logger.info("w2v model path: {0}".format(w2v_path))
    logger.info("corpus path: {0}".format(corpus_path))
    if os.path.exists(w2v_path):
        logger.info("Loading existing word2vec model")
        model = Word2VecModel.load(sc, w2v_path)
    elif os.path.exists(corpus_path):
        logger.info("Computing new word2vec model from corpus")
        inp = sc.textFile(corpus_path).map(lambda row: row.split(" "))
        word2vec = Word2Vec()
        # word2vec.setNumPartitions(200)
        model = word2vec.fit(inp)
        model.save(sc, w2v_path)
    else:
        logger.error("No existing word2vec model found and no pubmed corpus file found at {0}.".format(corpus_path))
    return model
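Once loaded or trained, the returned model can be queried directly with the mllib API; a small usage sketch (the query word is an arbitrary example, not from the original code):

model = get_word2vec_model(sc, app_home)
if model is not None:
    # Nearest neighbors in embedding space, as (word, cosine similarity) pairs
    for word, similarity in model.findSynonyms("protein", 10):
        print("{}: {}".format(word, similarity))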
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.feature import Word2Vec, Word2VecModel
from text_proc import clean_text

conf = SparkConf().setAppName("Jack").setMaster("local").set(
    'spark.driver.memory', '6G').set('spark.driver.maxResultSize', '10G')
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.appName("Python Spark SQL Example").getOrCreate()

model = Word2VecModel.load(sc, "text8_long.mdl")
# Example queries kept for reference:
# synonyms = model.findSynonyms('team', 10)
# for word, cosine_distance in synonyms:
#     print("{}: {}".format(word, cosine_distance))
# vec = model.transform('good')

if __name__ == "__main__":
    filename = 'all_tweets_text.txt'
    with open(filename, 'r') as file:
        lines = file.readlines()
def loading_word2vec_Model(sc, path):
    logging.log(logging.CRITICAL, " [Loading] Model")
    model = Word2VecModel.load(sc, path)
    logging.log(logging.CRITICAL, " [Loading] Model finish")
    return model
def main(argv):
    sc = SparkContext("local", "Simple App")
    sc.setLogLevel("ERROR")
    args = parser.parse_args(argv[1:])
    vector_size = int(args.vector_size)
    min_count = int(args.min_count)
    test = int(args.mode)
    resume = int(args.resume)
    MAX_LEN = 500
    K = 10

    # Check the existence of the word2vec_model folder
    model_name = "word2vec_model_playlist"
    model_folder = glob.glob(model_name + "*")
    model_num = len(model_folder)

    path = "data/df_data/df_playlistSong/"
    if test == 1:
        print("Mode test: ON")
        path = "data/df_data/df_small/df_playlistSong/"
        MAX_LEN = 100
    print(path)

    print("Load Song-Playlist matrix")
    df_ps_train = pd.read_hdf(path + 'df_ps_train.hdf')
    df_ps_test = pd.read_hdf(path + 'df_ps_test.hdf')
    df_ps_test_truth = pd.read_hdf(path + 'df_ps_test_truth.hdf')
    df_sp_train = pd.read_hdf(path + 'df_sp_train.hdf')

    data_str = [list(map(str, item)) for item in df_sp_train.pid.values]
    pid_list_pred = list(df_ps_test.index)
    current_list = list(df_ps_test.loc[pid_list_pred].tid)
    current_len = [len(i) for i in current_list]
    current_list_str = [list(map(str, item)) for item in current_list]
    record = []
    index = 0

    # Resume or not
    if resume == 0:
        print("Serialize data")
        doc = sc.parallelize(data_str).persist(StorageLevel.DISK_ONLY)
        print("Train Word2Vec model")
        model = Word2Vec().setVectorSize(vector_size).setSeed(3).setMinCount(min_count).fit(doc)
        print("Get vocabulary")
        vocab = model.getVectors().keySet()
        print("Save model")
        model_name = model_name + str(model_num)
        model.save(sc, model_name)
    elif resume == 1:
        print("Load most recent model")
        model_name = model_name + str(model_num - 1)
        model = Word2VecModel.load(sc, model_name)
        print("Get vocabulary")
        vocab = model.getVectors().keySet()
        first_key = list(vocab)[0]
        vector_size = len(model.getVectors()[first_key])
        print("Check resume file: ", end='')
        if os.path.exists("resumefile"):
            print("Exist")
            with open('resumefile', 'rb') as fp:
                resumefile = pickle.load(fp)
            pid, record = resumefile.get('pid'), resumefile.get('data')
            index = current_list_str.index(pid)
            print("Resume at point pid: {} \t index: {}".format(pid, index))
        else:
            print("Not exist")

    print("Find K Relevant Songs")
    try:
        i = 0
        for data_list in current_list_str[index:]:
            pid = pid_list_pred[i]
            print("Iter: {} \t pid: {} ".format(str(i + 1), pid))
            start = time.time()
            # Playlists most similar to this one, as (pid, similarity) pairs;
            # the returned ids are strings and need converting back to int
            syms = model.findSynonyms(str(pid), K)
            topK_pid = [int(s[0]) for s in syms]
            n = 0
            while True:
                # Get the next most similar pid
                top_pid = topK_pid[n]
                # Retrieve its tracks from the training data
                add_tid_list = df_ps_train.loc[top_pid].tid
                # Form new list, keeping only tracks not already in the playlist
                new_tid_list = data_list + add_tid_list
                new_tid_list = [tid for tid in new_tid_list if tid not in data_list]
                # Check number of songs and add to data for prediction
                total_song = len(new_tid_list)
                if total_song > MAX_LEN:
                    new_list = new_tid_list[:MAX_LEN]
                    break
                else:
                    new_list = new_tid_list
                n += 1
                if n >= K:
                    break
            record.append(new_list)
            i += 1
            print("Time taken = {0:.5f}".format(time.time() - start))

        print("Create new dataframe")
        df_ps_test['new_tid'] = record
        df_ps_test['tid'] = df_ps_test.apply(lambda x: x[1] + x[2], axis=1)
        df_ps_test = df_ps_test.drop(columns='new_tid')

        print("Save test data")
        df_ps_test.to_hdf(path + 'df_ps_test_complete.hdf', key='abc')

        print("Evaluation")
        result = my_evaluation(df_ps_test, df_ps_test_truth)
        print(result.aggregate_metric())
    except Exception as e:
        print(e)
        print("Create a resume point")
        resume_dict = {'pid': pid, 'data': record}
        with open('resumefile', 'wb') as fp:
            pickle.dump(resume_dict, fp)
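For reference, `findSynonyms` on the mllib model returns (word, cosine similarity) pairs sorted by similarity, which is why the playlist ids above come back as strings; a tiny sketch of that lookup in isolation (helper name is ours, model and pid assumed as above):

def nearest_playlists(model, pid, k=10):
    """Hypothetical standalone lookup: the k playlists most similar to pid."""
    # Each entry from findSynonyms is (other_pid_as_string, cosine_similarity)
    return [int(other_pid) for other_pid, _sim in model.findSynonyms(str(pid), k)]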
    # the rest are considered a string
    text = " ".join(elements[4:])
    return id_and_labels + [text]

def tokenize_line(line):
    text = line[-1]  # the last element of line is the text
    from re import split
    # list() so each record is a serializable list, not a lazy filter object
    return list(filter(lambda w: len(w) > 1, split(DELIMS, text.lower())))

sc = SparkContext(appName="Train Logistic Regression (IMR)")
parsed_data = sc.textFile(in_file).map(parse_raw_line)
vectors_of_words = parsed_data.map(tokenize_line)

word2vec = Word2Vec().setVectorSize(VEC_SIZE)
model = word2vec.fit(vectors_of_words)

try:
    rmtree(model_path)
except OSError:
    pass

model.save(sc, model_path)
sameModel = Word2VecModel.load(sc, model_path)
print(len(sameModel.getVectors()))
data_location = config.get('DataSection', 'data_location')
model_location = config.get('DataSection', 'model_location')
spark_master = config.get('SparkSection', 'spark_master')
spark_executor_memory = config.get('SparkSection', 'spark_executor_memory')
min_word_count = config.get('ModelSection', 'min_word_count')
num_iterations = config.get('ModelSection', 'num_iterations')
vector_size = config.get('ModelSection', 'vector_size')
debug_flag = config.get('Debug', 'debug')
relations_test_file = config.get('DataSection', 'relations_test_file')
relations_result_file = config.get('DataSection', 'relations_result_file')

conf = (SparkConf().setMaster(spark_master).setAppName("WikiFindSynonyms").set(
    "spark.executor.memory", spark_executor_memory))
sc = SparkContext(conf=conf)

model = Word2VecModel.load(sc, model_location)

with open(relations_test_file, 'r') as f:
    reader = csv.reader(f)
    records = list(reader)

with open(relations_result_file, 'w') as rf:
    writer = csv.writer(rf)
    for record in records:
        s = (record[0], record[1], record[2])
        s1 = model_utils.getAnalogy(s, model)
        result_row = []
        result_row.append(record[0])
        result_row.append(record[1])
        result_row.append(record[2])
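`model_utils.getAnalogy` is external to this snippet; a plausible sketch of such an analogy query, written as a hypothetical helper using only the mllib API (vector arithmetic plus `findSynonyms`, which also accepts a vector):

def get_analogy(s, model):
    """Hypothetical analogy helper: a is to b as c is to ? (via b - a + c)."""
    a, b, c = s
    va = model.transform(a)
    vb = model.transform(b)
    vc = model.transform(c)
    # Compose the query vector and search near it
    query = [vb[i] - va[i] + vc[i] for i in range(len(va))]
    for word, _sim in model.findSynonyms(query, 5):
        # Skip the input words themselves
        if word not in (a, b, c):
            return word
    return None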
def load_model(self, context, path):
    return Word2VecModel.load(context, path)
from pyspark import SparkContext, SparkConf
from pyspark.mllib.feature import Word2Vec, Word2VecModel
from pyspark.mllib.common import _java2py
from pyspark.mllib.linalg import Vectors

# local spark
print('LOADING MODEL')
conf = SparkConf().setAppName('ReadModel').setMaster('spark://cep16001s1:7077')
sc = SparkContext(conf=conf)

n = 1000
model = Word2VecModel.load(sc, 'ag_word2vec_n_' + str(n) + '.model')

print('LOADING DATASET')
new_lines = []
with open('dataset-after-clean-with-label-10000-each.txt', 'r', encoding='utf8') as f:
    lines = f.readlines()
    total = len(lines)
    count = 0
    for line in lines:
        count += 1
        if count % 1000 == 0:
            print(count, total)
        label = line.split('\\C')[0]
        text = line.split('\\C')[1]
        words = text.split()
        vecs = []
        for word in words:
            try:
                vec = model.transform(word)
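The truncated loop above guards `model.transform` with a try block because the mllib model raises for out-of-vocabulary words; a minimal standalone sketch of that guard (the helper name is ours):

def safe_transform(model, word):
    """Return the word's vector, or None if the word is out of vocabulary."""
    try:
        return model.transform(word)
    except Exception:  # OOV words surface as a Py4J error from the JVM side
        return None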
text = " ".join(elements[4:]) return id_and_labels + [text] def tokenize_line(line): text = line[-1] # the last element of line is the text from re import split return filter(lambda w: len(w)>1, split(DELIMS, text.lower())) sc = SparkContext(appName="Train Logistic Regression (IMR)") parsed_data = sc.textFile(in_file).map(parse_raw_line) vectors_of_words = parsed_data.map(tokenize_line) word2vec = Word2Vec().setVectorSize(VEC_SIZE) model = word2vec.fit(vectors_of_words) try: rmtree(model_path) except: pass model.save(sc, model_path) sameModel = Word2VecModel.load(sc, model_path) print len(sameModel.getVectors())
import sys

from pyspark import SparkContext
from pyspark.mllib.feature import Word2Vec, Word2VecModel

USAGE = ("bin/spark-submit --driver-memory 4g "
         "examples/src/main/python/mllib/word2vec.py text8_lines")

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(USAGE)
        sys.exit("Argument for file not provided")
    file_path = sys.argv[1]
    sc = SparkContext(appName='Word2Vec')
    inp = sc.textFile(file_path).map(lambda row: row.split(" "))

    word2vec = Word2Vec()
    model = word2vec.fit(inp)

    synonyms = model.findSynonyms('한국', 40)
    for word, cosine_distance in synonyms:
        print("{}: {}".format(word, cosine_distance))

    model.save(sc, "word_model")
    sameModel = Word2VecModel.load(sc, "./word_model")
    sc.stop()
sys.path.append("/home/robert/spark-1.6.1-bin-hadoop2.6/bin")

try:
    from pyspark import SparkConf, SparkContext
    from pyspark.mllib.feature import Word2Vec, Word2VecModel
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

dir_path = '/user/rmusters/'
# utils.removeCharFromFilename(dir_path)
# utils.concatFiles(dir_path)

word2vec = Word2Vec()
sc = SparkContext(appName='Word2Vec')

filename = "12.txt"
inp = sc.textFile(dir_path + filename).map(lambda row: row.split(" "))
model = word2vec.fit(inp)
model.save(sc, dir_path + "pymodel.bin")
model = Word2VecModel.load(sc, dir_path + "pymodel.bin")
print(model.getVectors())
def main():
    k_input_model = sys.argv[1]  # read kmeans model from this location
    w_input_model = sys.argv[2]  # read word2vec model from this location
    input_file = sys.argv[3]     # read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    '''sbaronia - load both kmeans and Word2Vec models'''
    kmean_model = KMeansModel.load(sc, k_input_model)
    word2vec_model = Word2VecModel.load(sc, w_input_model)

    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a set of all words in our model (a set gives O(1) membership tests)'''
    keys = sqlContext.read.parquet(w_input_model + "/data")
    keys_list = set(keys.rdd.map(lambda line: line.word).collect())

    '''sbaronia - here we create one vector per review, where vector contains
    the number of times a cluster is assigned to a word in a review.
    We make a SparseVector compatible format'''
    features = []
    for i in range(len(clean_list)):
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                histogram[clust] += 1
        features.append((2000, range(2000), histogram))

    '''sbaronia - create a normalized SparseVector rdd'''
    nor = Normalizer(1)
    # Build the SparseVector directly from the (size, indices, values) tuple
    features_rdd = rdd_zip(sc.parallelize(features) \
                           .map(lambda line: nor.transform(SparseVector(*line))) \
                           .cache()).cache()

    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()
    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()

    '''sbaronia - create training and testing data based on year'''
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                               .select('rating', 'feature') \
                               .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                               .coalesce(1) \
                               .cache()

    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                              .select('rating', 'feature') \
                              .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                              .coalesce(1) \
                              .cache()

    '''sbaronia - find best step using validation and run LinearRegressionWithSGD
    with that step and report final RMSE'''
    step_best_norm = validation(train_rdd)
    RMSE_norm = regression_and_error(train_rdd, test_rdd, step_best_norm)
    print("Final RMSE(Normalization) = " + str(RMSE_norm) + " Best Step size = " + str(step_best_norm))
from pyspark import SparkContext, SparkConf
from pyspark.mllib.feature import Word2Vec, Word2VecModel
from pyspark.mllib.common import _java2py

# local spark
conf = SparkConf().setAppName('ReadModel').setMaster('local')
sc = SparkContext(conf=conf)

model = Word2VecModel.load(sc, 'ag_word2vec_n_300.model')
result = model.transform("computer")
vector = model.getVectors()

def printNumberFeatures():
    print('# of features')
    print(len(result))

def printSimilarWords(input):
    synonyms = list(model.findSynonyms(input, 10))
    print('<==RESULT')
    for word, sim in synonyms:
        print(word, sim)
    print('RESULT==>')

def getVector(word):
    v = model.transform(word)
    print(v)
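A small usage sketch for the helpers above; since the model is JVM-backed, the SparkContext must stay alive while it is queried, so it is stopped only at the end (query words are arbitrary examples):

printNumberFeatures()
printSimilarWords('computer')
getVector('science')

# Stop the context only after all model queries are done
sc.stop()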
from pyspark import SparkContext, SparkConf
import sys

# spark-submit --master yarn --executor-memory 32g --driver-memory 32g \
#   --deploy-mode cluster --num-executors 400 master/hadoop/loadw2v.py

conf = (SparkConf()
        .set("spark.driver.maxResultSize", "0"))

sc = SparkContext(appName='loadw2v', conf=conf)

from pyspark.mllib.feature import Word2VecModel

model = Word2VecModel.load(sc, '/user/rmusters/pymodel_filtered3.bin')
vec = model.getVectors()
vec = str(vec)
# Wrap in a list: parallelizing a bare string would distribute it character by character
rdd = sc.parallelize([vec])
path = 'hdfs:///user/rmusters/vectors4.txt'
rdd.saveAsTextFile(path)

print(sys.getsizeof(model.getVectors))
def main():
    input_model = sys.argv[1]  # model to be read
    input_file = sys.argv[2]   # review file to be read

    conf = SparkConf().setAppName('Word2Vec')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    '''sbaronia - load word2vec model last saved'''
    word2vec_model = Word2VecModel.load(sc, input_model)

    '''sbaronia - get three fields from json and make data frame with index'''
    review = sqlContext.read.json(input_file).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a set of all words in our model (a set gives O(1) membership tests)'''
    keys = sqlContext.read.parquet(input_model + "/data")
    keys_list = set(keys.rdd.map(lambda line: line.word).collect())

    '''sbaronia - using loaded model find vector for every word in review,
    sum them and find average vector for a review'''
    avg_vec = []
    for i in range(len(clean_list)):
        sum_init = 0
        count = 0
        for word in clean_list[i]:
            if word in keys_list:
                count = count + 1
                vec = word2vec_model.transform(word)
                sum_init = sum_init + vec
        if count > 0:
            avg_vec.append(sum_init / count)

    '''sbaronia - create an rdd of this avg vector for all reviews'''
    avg_vec_rdd = rdd_zip(sc.parallelize(avg_vec).cache()).cache()
    avg_vec_df = sqlContext.createDataFrame(avg_vec_rdd, ['vector', 'index']).cache()

    '''sbaronia - make a dataframe with overall rating and avg vector'''
    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    vecyearrate_df = avg_vec_df.join(year_rating_df, avg_vec_df.index == year_rating_df.index, 'inner') \
                               .drop(avg_vec_df.index).cache()

    '''sbaronia - extract training and testing rdd based on year'''
    train_rdd = vecyearrate_df.filter(vecyearrate_df.year < 2014) \
                              .select('rating', 'vector') \
                              .map(lambda line: (LabeledPoint(line.rating, line.vector))) \
                              .coalesce(1) \
                              .cache()

    test_rdd = vecyearrate_df.filter(vecyearrate_df.year == 2014) \
                             .select('rating', 'vector') \
                             .map(lambda line: (LabeledPoint(line.rating, line.vector))) \
                             .coalesce(1) \
                             .cache()

    '''sbaronia - find best step using validation and run regression to get final RMSE'''
    step_best = validation(train_rdd)
    RMSE = regression_and_error(train_rdd, test_rdd, step_best)
    print("Final RMSE = " + str(RMSE) + " Best Step size = " + str(step_best))
def main(argv):
    sc = SparkContext("local", "Simple App")
    sc.setLogLevel("ERROR")
    args = parser.parse_args(argv[1:])
    vector_size = int(args.vector_size)
    min_count = int(args.min_count)
    test = int(args.mode)
    resume = int(args.resume)
    MAX_LEN = 500

    # Check the existence of the word2vec_model folder
    model_name = "word2vec_model"
    model_folder = glob.glob(model_name + "*")
    model_num = len(model_folder)

    path = "data/df_data/df_playlistSong/"
    if test == 1:
        print("Mode test: ON")
        path = "data/df_data/df_small/df_playlistSong/"
        MAX_LEN = 100
    print(path)

    print("Load Song-Playlist matrix")
    df_ps_train = pd.read_hdf(path + 'df_ps_train.hdf')
    df_ps_test = pd.read_hdf(path + 'df_ps_test.hdf')
    df_ps_test_truth = pd.read_hdf(path + 'df_ps_test_truth.hdf')

    data_str = [list(map(str, item)) for item in df_ps_train.tid.values]
    pid_list_pred = list(df_ps_test.index)
    current_list = list(df_ps_test.loc[pid_list_pred].tid)
    current_len = [len(i) for i in current_list]
    K_list = [MAX_LEN - current_len[i] for i in range(len(current_len))]
    current_list_str = [list(map(str, item)) for item in current_list]
    record = []
    index = 0

    # Resume or not
    if resume == 0:
        print("Serialize data")
        doc = sc.parallelize(data_str).persist(StorageLevel.DISK_ONLY)
        print("Train Word2Vec model")
        model = Word2Vec().setVectorSize(vector_size).setSeed(3).setMinCount(min_count).fit(doc)
        print("Get vocabulary")
        vocab = model.getVectors().keySet()
        print("Save model")
        model_name = model_name + str(model_num)
        model.save(sc, model_name)
    elif resume == 1:
        print("Load most recent model")
        model_name = model_name + str(model_num - 1)
        model = Word2VecModel.load(sc, model_name)
        print("Get vocabulary")
        vocab = model.getVectors().keySet()
        first_key = list(vocab)[0]
        vector_size = len(model.getVectors()[first_key])
        print("Check resume file: ", end='')
        if os.path.exists("resumefile"):
            print("Exist")
            with open('resumefile', 'rb') as fp:
                resumefile = pickle.load(fp)
            pid, record = resumefile.get('pid'), resumefile.get('data')
            index = current_list_str.index(pid)
            print("Resume at point pid: {} \t index: {}".format(pid, index))
        else:
            print("Not exist")

    print("Find K Relevant Songs")
    try:
        i = 0
        for data_list in current_list_str[index:]:
            pid = pid_list_pred[i]
            print("Iter: {} \t pid: {} ".format(str(i + 1), pid))
            start = time.time()
            # Keep only tracks that are in the model vocabulary
            data_list_filter = [value for value in data_list if value in vocab]
            # Find the centroid of data_list and its K_list[i] nearest tracks
            record.append(findK_relevant(model, K_list[i], data_list_filter, sc, vector_size))
            i += 1
            print("Time taken = {0:.5f}".format(time.time() - start))

        print("Create new dataframe")
        df_ps_test['new_tid'] = record
        df_ps_test['tid'] = df_ps_test.apply(lambda x: x[1] + x[2], axis=1)
        df_ps_test = df_ps_test.drop(columns='new_tid')

        print("Save test data")
        df_ps_test.to_hdf(path + 'df_ps_test_complete.hdf', key='abc')

        print("Evaluation")
        result = my_evaluation(df_ps_test, df_ps_test_truth)
        print(result.aggregate_metric())
    except Exception:
        print("Create a resume point")
        resume_dict = {'pid': pid, 'data': record}
        with open('resumefile', 'wb') as fp:
            pickle.dump(resume_dict, fp)