def generate_user_emb(self, spark_session: SparkSession,
                          raw_sample_data_path: str,
                          word2vec_model: Word2VecModel, emb_length: int,
                          emb_output_file_name: str, save_to_redis: bool,
                          redis_key_prefix: str):
        """
        generate user embedding which is constructed by related movie embeddings
        :param spark_session:
        :param raw_sample_data_path:
        :param word2vec_model:
        :param emb_length:
        :param emb_output_file_name:
        :param save_to_redis:
        :param redis_key_prefix:
        :return:
        """
        root_dir = dirname(dirname(dirname(abspath(__file__))))
        rating_resource_path = join(root_dir, "resources",
                                    raw_sample_data_path)

        rating_samples = spark_session.read.format("csv").option(
            "header", "true").load(rating_resource_path)

        rating_samples.show(10, truncate=False)

        user_embeddings_dict = dict()
        movie_keys = set(word2vec_model.getVectors().keys())

        for row in rating_samples.collect():
            user_id = row["userId"]
            movie_id = row["movieId"]
            if movie_id not in movie_keys:
                movie_emb = np.zeros(emb_length)
            else:
                movie_emb = word2vec_model.transform(movie_id).toArray()
            if user_id in user_embeddings_dict:
                user_embeddings_dict[user_id] += np.copy(movie_emb)
            else:
                user_embeddings_dict[user_id] = np.copy(movie_emb)

        output_dir = join(root_dir, "resources", "webroot/modeldata/")

        with open(join(output_dir, emb_output_file_name), "w") as emb_file:
            for user_id, user_emb in user_embeddings_dict.items():
                emb_file.write(user_id + ":" +
                               " ".join([str(num) for num in user_emb]) + "\n")

        if save_to_redis:
            redis_client = redis.Redis(host=self.redisEndpoint,
                                       port=self.redisPort)
            for user_id, user_emb in user_embeddings_dict.items():
                redis_client.set(redis_key_prefix + ":" + user_id,
                                 " ".join([str(num) for num in user_emb]),
                                 ex=60 * 60 * 24)
            redis_client.close()
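A toy, self-contained sketch of the aggregation idea used above (not part of the original example; the names and vectors below are made up): a user's embedding is the element-wise sum of the embeddings of the movies that user has rated, with unknown movies contributing a zero vector.

import numpy as np

emb_length = 4
movie_emb = {                                   # stand-in for word2vec_model.getVectors()
    "m1": np.array([0.1, 0.2, 0.0, 0.3]),
    "m2": np.array([0.0, 0.1, 0.4, 0.1]),
}
ratings = [("u1", "m1"), ("u1", "m2"), ("u2", "m9")]   # (userId, movieId) pairs

user_emb = {}
for user_id, movie_id in ratings:
    vec = movie_emb.get(movie_id, np.zeros(emb_length))     # unknown movie -> zeros
    user_emb[user_id] = user_emb.get(user_id, np.zeros(emb_length)) + vec

for user_id, vec in user_emb.items():
    print(user_id + ":" + " ".join(str(x) for x in vec))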
Example #2
def main(argv):
    from pyspark import SparkContext
    from pyspark.mllib.feature import Word2Vec, Word2VecModel

    dir = '/user/rmusters/'

    word2vec = Word2Vec()
    sc = SparkContext(appName='Word2Vec')
    # (getopt-based argument parsing was left commented out in the original example)

    filename = "12.txt"
    inp = sc.textFile(dir + filename).map(lambda row: row.split(" "))

    model = word2vec.fit(inp)

    model.save(sc, dir + "pymodelF.bin")

    model = Word2VecModel.load(sc, dir + "pymodelF.bin")

    print(model.getVectors())
Example #3
 def load_model (self, key_text):
     model_path = self.get_model_path (key_text)
     model = None
     if os.path.exists (model_path):
         logger.info ("Load existing word2vec model: {0}".format (model_path))
         model = Word2VecModel.load (self.sc, model_path)
     return model
Example #4
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        sc._jsc.hadoopConfiguration())
    model_path = HDFS_PATH + str(g_cache.user) + '/model/' + params['path']
    if not fs.exists(sc._jvm.org.apache.hadoop.fs.Path(model_path)):
        raise Exception("Invalid file path, path does not exist!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(sc, model_path)
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(sc, model_path)
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(sc, model_path)
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(sc, model_path)
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(sc, model_path)
    else:
        raise Exception("Invalid model type!")
    return True, model
Example #5
def write_data(path):
	import filter
	from pyspark.mllib.feature import Word2Vec, Word2VecModel

	# load data
	loc = '/user/rmusters/text/2015/01/*'
	text_file = sc.textFile(loc)
	data = text_file.map(lambda line: filter.filter(line).split(" "))

	# load model
	word2vec = Word2Vec()
	model = Word2VecModel.load(sc, '/user/rmusters/2015model99')

	# get a tweet vector pair.
	from pyspark.sql import SQLContext
	sqlContext = SQLContext(sc)
	lookup = sqlContext.read.parquet('/user/rmusters/2015model99/data').alias("lookup")
	lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())

	vectors = data.map(lambda ws: [lookup_bd.value.get(w) for w in ws])
	logger.info(vectors.count())

	data = text_file.map(lambda line: (line, filter.filter(line).split(" ")))\
		.map(lambda pair: (pair[0], pair[1], [lookup_bd.value.get(w) for w in pair[1]][0]))

	from pyspark.sql.functions import monotonicallyIncreasingId
	df = data.toDF(["text", "filtered_text", "vectors"])
	# This will return a new DF with all the columns + id
	res = df.withColumn("id", monotonicallyIncreasingId())
	res.write.parquet(path, mode="overwrite")
Example #6
 def __init__(self, sc, conf, articles):
     self.sc = sc
     self.conf = conf
     if os.path.exists (conf.w2v_model):
         logger.info ("Load existing word2vec model: {0}".format (self.conf.w2v_model))
         self.model = Word2VecModel.load (self.sc, self.conf.w2v_model)
     else:
         logger.info ("Compute word2vec word embedding model...")
         text = articles.                                    \
                flatMap (lambda a : a.paragraphs ).          \
                flatMap (lambda p : p.sentences ).           \
                map (lambda s : s.replace(".", " ").split (" ") )
         self.model = Word2Vec ().            \
                      setNumPartitions (100). \
                      fit (text)
         self.model.save (self.sc, self.conf.w2v_model)
Example #7
def get_word2vec_model (sc, app_home):
    logger.info ("Getting word2vec model of corpus")
    w2v_path = os.path.join (app_home, "cache", word2vec_model_path)
    corpus_path = os.path.join (app_home, "cache", corpus_file_name)
    model = None
    logger.info ("w2v model path: {0}".format (w2v_path))
    logger.info ("corpus path: {0}".format (corpus_path))

    if os.path.exists (w2v_path):
        logger.info ("Loading existing word2vec model")
        model = Word2VecModel.load(sc, w2v_path)
    elif os.path.exists (corpus_path):
        logger.info ("Computing new word2vec model from corpus")
        inp = sc.textFile (corpus_path).map(lambda row: row.split(" "))
        word2vec = Word2Vec()
        #word2vec.setNumPartitions (200)
        model = word2vec.fit (inp)
        model.save (sc, w2v_path)
    else:
        logger.error ("No existing word2vecd model found and no pubmed corpus file found at {0}.".format (corpus_path))
    return model
Example #8
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.feature import Word2Vec, Word2VecModel

#import text_proc
from text_proc import clean_text

#conf = SparkConf().setAppName("Jack").setMaster("local")
conf = SparkConf().setAppName("Jack").setMaster("local").set(
    'spark.driver.memory', '6G').set('spark.driver.maxResultSize', '10G')
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.appName("Python Spark SQL Example").getOrCreate()

#word2vec = Word2Vec()
#word2vec_model = Word2VecModel()
#word2vec.load(sc, "text8_long.mdl")
model = Word2VecModel.load(sc, "text8_long.mdl")
#synonyms = model.findSynonyms('team', 10)

#for word, cosine_distance in synonyms:
#    print ("{}: {}".format(word, cosine_distance))
#vec = model.transform('good')
#print(len(vec))
#vec = model.transform('bad')
#print(len(vec))
#vec = model.transform('smart')
#print(len(vec))

if __name__ == "__main__":
    filename = 'all_tweets_text.txt'
    with open(filename, 'r') as f:
        lines = f.readlines()
Example #9
def loading_word2vec_Model(sc, path):
    logging.log(logging.CRITICAL, " [Loading] Model")
    Model = Word2VecModel.load(sc, path)
    logging.log(logging.CRITICAL, " [Loading] Model finish")
    return Model
Example #10
def main(argv):
    sc = SparkContext("local", "Simple App")
    sc.setLogLevel("ERROR")
    
    args = parser.parse_args(argv[1:])
    vector_size = int(args.vector_size)
    min_count = int(args.min_count)    
    test = int(args.mode)
    resume = int(args.resume)
#    proc = int(args.proc)
    
    MAX_LEN = 500
    K = 10

#    vector_size = 5
#    min_count = 5
    
    # Check the existence of word2vec_model folder
    model_name = "word2vec_model_playlist"
    model_folder = glob.glob(model_name+"*")
    model_num = len(model_folder)
    
 
    
    path = "data/df_data/df_playlistSong/"
    if test == 1:
        print("Mode test: ON")
        path = "data/df_data/df_small/df_playlistSong/"
        MAX_LEN = 100 
    print(path)
    print("Load Song-Playlist matrix")    
    
#    path = "data/df_data/df_small/df_playlistSong/"
    
    df_ps_train = pd.read_hdf(path+'df_ps_train.hdf')
    
    df_ps_test = pd.read_hdf(path+'df_ps_test.hdf')
    df_ps_test_truth = pd.read_hdf(path+'df_ps_test_truth.hdf')
       
    df_sp_train = pd.read_hdf(path+'df_sp_train.hdf')
    
    data_str = [list(map(str,item)) for item in df_sp_train.pid.values]

    pid_list_pred = list(df_ps_test.index)    
    current_list = list(df_ps_test.loc[pid_list_pred].tid)
    current_len = [len(i) for i in current_list]    
    
#    K_list = [MAX_LEN - current_len[i] for i in range(len(current_len))]
    
    current_list_str = [list(map(str,item)) for item in current_list]
    
    record = []
    index = 0
    
    # Resume or not
    if resume == 0:
        print("Serialize data")
        doc = sc.parallelize(data_str).persist(StorageLevel.DISK_ONLY)
        
        print("Train Word2Vec model")
        model = Word2Vec().setVectorSize(vector_size).setSeed(3).setMinCount(min_count).fit(doc)
    
        print("Get vocabulary")
        vocab = model.getVectors().keySet()
    
        print("Save model")
        model_name = model_name + str(model_num)
        model.save(sc, model_name)
        
    elif resume == 1:
        print("load recent model")
        model_name = model_name + str(model_num-1)
        model = Word2VecModel.load(sc, model_name)
        
        print("Get vocabulary")
        vocab = model.getVectors().keySet()
        
        first_key = list(vocab)[0]
        vector_size = len(model.getVectors()[first_key])
        
        print("Check resume file: ",end='')
        
        if(os.path.exists("resumefile")):
            print("Exist")
            with open ('resumefile', 'rb') as fp:
                resumefile = pickle.load(fp)
    
            pid,record = resumefile.get('pid'), resumefile.get('data')
            index = current_list_str.index(pid)
           
            print("Resume at point pid: {} \t index: {}".format(pid,index))
        else:
            print("Not exist")
    

        
    print("Find K Relevant Songs")
    try:
        i = 0
        for data_list in current_list_str[index:]:
            pid = pid_list_pred[i]
            print("Iter: {} \t pid: {} ".format(str(i+1),pid))
            start = time.time()

            ######################## START CHANGING HERE ################################
            
            syms = model.findSynonyms(str(pid),K)
            topK_pid = [s[0] for s in syms]
    
            # Need to convert str to Int here
            topK_pid = [int(p) for p in topK_pid]
               
            n = 0
            while True:
                # Get the top 1 pid 
                top_pid = topK_pid[n]
                
                # Retrieve tid from the top 1 pid
                add_tid_list = df_ps_train.loc[top_pid].tid
                        
                # Form new list
                new_tid_list = data_list + add_tid_list
                
                # Check duplicate lists
                new_tid_list = [tid for tid in new_tid_list if tid not in data_list]
                     
                # Check number of songs and Add to data for prediction
                total_song = len(new_tid_list)
        
    
                if (total_song > MAX_LEN):
                    new_list = new_tid_list[:MAX_LEN]            
                    break
                else:
                    new_list = new_tid_list
                    
                n += 1
                if (n>=K):
                    break
                
            record.append(new_list)
            i += 1
            print("Time taken = {0:.5f}".format(time.time() - start))
            
        

        print("Create new dataframe")
        df_ps_test['new_tid'] = record
        
        df_ps_test['tid'] = df_ps_test.apply(lambda x: x[1] + x[2], axis=1)
        df_ps_test = df_ps_test.drop(columns='new_tid')
        
        
#        df_ps_pred = pd.DataFrame.from_records(new_list,columns=['pid','tid'])
#        df_ps_pred = df_ps_pred.set_index('pid')
        
        print("Save test data")
        df_ps_test.to_hdf(path+'df_ps_test_complete.hdf', key='abc')
        
        print("Evaluation")
        result = my_evaluation(df_ps_test,df_ps_test_truth)
        print(result.aggregate_metric())
    except Exception as e:
        print(e)
        print("Create a resume point")
        resume_dict = {'pid':pid,'data':record}
        with open('resumefile', 'wb') as fp:
            pickle.dump(resume_dict, fp)
Example #12
data_location = config.get('DataSection', 'data_location')
model_location = config.get('DataSection', 'model_location')
spark_master = config.get('SparkSection', 'spark_master')
spark_executor_memory = config.get('SparkSection', 'spark_executor_memory')
min_word_count = config.get('ModelSection', 'min_word_count')
num_iterations = config.get('ModelSection', 'num_iterations')
vector_size = config.get('ModelSection', 'vector_size')
debug_flag = config.get('Debug', 'debug')
relations_test_file = config.get('DataSection', 'relations_test_file')
relations_result_file = config.get('DataSection', 'relations_result_file')

conf = (SparkConf().setMaster(spark_master).setAppName("WikiFindSynonyms").set(
    "spark.executor.memory", spark_executor_memory))
sc = SparkContext(conf=conf)
#word2vec = Word2VecModel()
model = Word2VecModel.load(sc, model_location)

with open(relations_test_file, 'r') as f:
    reader = csv.reader(f)
    records = list(reader)

with open(relations_result_file, 'w') as rf:
    writer = csv.writer(rf)

    for record in records:
        s = (record[0], record[1], record[2])
        s1 = model_utils.getAnalogy(s, model)
        result_row = []
        result_row.append(record[0])
        result_row.append(record[1])
        result_row.append(record[2])
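The snippet above depends on a model_utils.getAnalogy helper whose implementation is not shown. Below is a minimal hypothetical sketch of such a helper, assuming the usual "a is to b as c is to ?" vector arithmetic over the mllib model; the function body and the candidate count of 5 are assumptions, not the original code.

def getAnalogy(s, model):
    a, b, c = s
    try:
        vec = (model.transform(b).toArray()
               - model.transform(a).toArray()
               + model.transform(c).toArray())
    except ValueError:
        return None  # one of the words is not in the model vocabulary
    # findSynonyms also accepts a raw vector; skip the query words themselves.
    for word, _sim in model.findSynonyms(vec, 5):
        if word not in (a, b, c):
            return word
    return None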
Example #13
 def load_model(self, context, path):
     return Word2VecModel.load(context, path)
Example #14
from pyspark import SparkContext, SparkConf
from pyspark.mllib.feature import Word2Vec, Word2VecModel
from pyspark.mllib.common import _java2py
from pyspark.mllib.linalg import Vectors
# local spark
print('LOADING MODEL')
conf = SparkConf().setAppName('ReadModel').setMaster('spark://cep16001s1:7077')
sc = SparkContext(conf=conf)

n = 1000
model = Word2VecModel.load(sc, 'ag_word2vec_n_' + str(n) + '.model')
print('LOADING DATASET')
# result = model.transform("computer")
new_lines = []
with open('dataset-after-clean-with-label-10000-each.txt',
          'r',
          encoding='utf8') as f:
    lines = f.readlines()
    total = len(lines)
    count = 0
    for line in lines:
        count += 1
        if count % 1000 == 0:
            print(count, total)
        label = line.split('\\C')[0]
        text = line.split('\\C')[1]
        words = text.split()
        vecs = []
        for word in words:
            try:
                vec = model.transform(word)
Example #15
    text = " ".join(elements[4:])
    return id_and_labels + [text]


def tokenize_line(line):
    text = line[-1]  # the last element of line is the text
    from re import split
    return filter(lambda w: len(w)>1, split(DELIMS, text.lower()))


sc = SparkContext(appName="Train Logistic Regression (IMR)")

parsed_data = sc.textFile(in_file).map(parse_raw_line)

vectors_of_words = parsed_data.map(tokenize_line)

word2vec = Word2Vec().setVectorSize(VEC_SIZE)
model = word2vec.fit(vectors_of_words)

try:
    rmtree(model_path)
except:
    pass

model.save(sc, model_path)

sameModel = Word2VecModel.load(sc, model_path)

print(len(sameModel.getVectors()))

Example #16
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from pyspark import SparkContext
from pyspark.mllib.feature import Word2Vec, Word2VecModel

USAGE = ("bin/spark-submit --driver-memory 4g "
         "examples/src/main/python/mllib/word2vec.py text8_lines")

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(USAGE)
        sys.exit("Argument for file not provided")
    file_path = sys.argv[1]
    sc = SparkContext(appName='Word2Vec')
    inp = sc.textFile(file_path).map(lambda row: row.split(" "))

    word2vec = Word2Vec()
    model = word2vec.fit(inp)

    synonyms = model.findSynonyms('한국', 40)

    for word, cosine_distance in synonyms:
        print("{}: {}".format(word, cosine_distance))

    model.save(sc, "word_model")
    sameModel = Word2VecModel.load(sc, "./word_model")

    sc.stop()
Example #17
import sys

sys.path.append("/home/robert/spark-1.6.1-bin-hadoop2.6/bin")

try:
    from pyspark import SparkConf
    from pyspark import SparkContext
    from pyspark.mllib.feature import Word2Vec, Word2VecModel
    print("Successfully imported Spark Modules")

except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

dir = '/user/rmusters/'
# utils.removeCharFromFilename(dir)
# utils.concatFiles(dir)
word2vec = Word2Vec()
sc = SparkContext(appName='Word2Vec')

filename = "text8.zip"
filename = "12.txt"
inp = sc.textFile(dir + filename).map(lambda row: row.split(" "))

model = word2vec.fit(inp)

model.save(sc, dir + "pymodel.bin")

model = Word2VecModel.load(sc, dir + "pymodel.bin")

print(model.getVectors())
#model.train(inp)
Example #18
def main():
    k_input_model = sys.argv[1] #read kmean model from this location
    w_input_model = sys.argv[2] #read word2vec model from this location
    input_file = sys.argv[3] #read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    '''sbaronia - load both kmean and Word2Vec model'''
    kmean_model = KMeansModel.load(sc,k_input_model)
    word2vec_model = Word2VecModel.load(sc,w_input_model)

    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText','overall','reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
       
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a list of all words in our model'''
    keys = sqlContext.read.parquet(w_input_model+"/data")
    keys_list = keys.rdd.map(lambda line: line.word).collect()

    '''sbaronia - here we create one vector per review, where vector
    contains the number of times a cluster is assinged to a word in
    a review. We make a SparseVector compatible format'''
    features = []

    for i in range(len(clean_list)):
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                histogram[clust] += 1
        features.append((2000,range(2000),histogram))

    '''sbaronia - create a normalized SparseVector rdd'''
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features) \
                             .map(lambda line: nor.transform(SparseVector.parse(line))) \
                             .cache()).cache()

    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()
    
    '''sbaronia - create training and testing data based on year'''
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                            .select('rating','feature') \
                            .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                            .coalesce(1) \
                            .cache()
    
    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                           .select('rating','feature') \
                           .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                           .coalesce(1) \
                           .cache()

    '''sbaronia - find best step using validation and run LinearRegressionWithSGD 
    with that step and report final RMSE'''
    step_best_norm = validation(train_rdd)

    RMSE_norm = regression_and_error(train_rdd,test_rdd,step_best_norm)

    print("Final RMSE(Normalization) = " + str(RMSE_norm) + "  Best Step size = " + str(step_best_norm))
Example #19
from pyspark import SparkContext, SparkConf
from pyspark.mllib.feature import Word2Vec, Word2VecModel
from pyspark.mllib.common import _java2py
# local spark
conf = SparkConf().setAppName('ReadModel').setMaster('local')
sc = SparkContext(conf=conf)

model = Word2VecModel.load(sc, 'ag_word2vec_n_300.model')

# sysn_lst = list(synonyms)
result = model.transform("computer")
vector = model.getVectors()
sc.stop()


def printNumberFeatures():
    print('# of features')
    print(len(result))


def printSimilarWords(input):
    synonyms = list(model.findSynonyms(input, 10))
    print('<==RESULT')
    for word, sim in synonyms:
        print(word, sim)
    print('RESULT==>')


def getVector(word):
    v = model.transform(word)
    print(v)
Example #20
from pyspark import SparkContext, SparkConf
import sys

#spark-submit --master yarn --executor-memory 32g --driver-memory 32g --deploy-mode cluster --num-executors 400 master/hadoop/loadw2v.py

conf = (SparkConf()
		.set("spark.driver.maxResultSize", "0")
		#.set('spark.executor.memory','32g')
		#.set('spark.driver.memory','32g')
		)


sc = SparkContext(appName='loadw2v', conf=conf)

from pyspark.mllib.feature import Word2VecModel
model = Word2VecModel.load(sc, '/user/rmusters/pymodel_filtered3.bin')

vec = model.getVectors()
vec = str(vec)

rdd = sc.parallelize(vec)
path =  'hdfs:///user/rmusters/vectors4.txt'

rdd.saveAsTextFile(path)

	# with open(path, 'w+') as textfile:
	# 	textfile.write(vec)


print(sys.getsizeof(model.getVectors))
Example #21
def main():
  input_model = sys.argv[1] #model to be read
  input_file = sys.argv[2] #review file to be read

  conf = SparkConf().setAppName('Word2Vec')
  sc = SparkContext(conf=conf)
  assert sc.version >= '1.5.1'

  sqlContext = SQLContext(sc)

  '''sbaronia - load word2vec model last saved'''
  word2vec_model = Word2VecModel.load(sc,input_model)

  '''sbaronia - get three fields from json and make data frame with index'''
  review = sqlContext.read.json(input_file).select('reviewText','overall','reviewTime').cache()
  review_df = review.filter(review.reviewText != "").cache()

  rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
  rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

  year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
  year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

  clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()

  clean_list = clean_words_rdd.collect()

  '''sbaronia - make a list of all words in our model'''
  keys = sqlContext.read.parquet(input_model+"/data")
  keys_list = keys.rdd.map(lambda line: line.word).collect()

  '''sbaronia - using loaded model find vector for every word in review
  sum them and find average vector for a review'''
  avg_vec = []
  for i in range(len(clean_list)):
    sum_init = 0
    count = 0
    for word in clean_list[i]:
      if word in keys_list:
        count = count + 1
        vec = word2vec_model.transform(word)
        sum_init = sum_init + vec
    if count > 0:
      avg_vec.append(sum_init/count)  

  '''sbaronia - create an rdd of this avg vector for all reviews'''
  avg_vec_rdd = rdd_zip(sc.parallelize(avg_vec).cache()).cache()
  avg_vec_df = sqlContext.createDataFrame(avg_vec_rdd, ['vector', 'index']).cache()

  '''sbaronia - make a dataframe with overall rating and avg vector'''
  year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
  vecyearrate_df = avg_vec_df.join(year_rating_df, avg_vec_df.index == year_rating_df.index, 'inner') \
                             .drop(avg_vec_df.index).cache()

  '''sbaronia - extract training and testing rdd based on year'''
  train_rdd = vecyearrate_df.filter(vecyearrate_df.year < 2014) \
                          .select('rating','vector') \
                          .map(lambda line: (LabeledPoint(line.rating, line.vector))) \
                          .coalesce(1) \
                          .cache()
  
  test_rdd = vecyearrate_df.filter(vecyearrate_df.year == 2014) \
                         .select('rating','vector') \
                         .map(lambda line: (LabeledPoint(line.rating, line.vector))) \
                         .coalesce(1) \
                         .cache()

  '''sbaronia - find best step using validation and run regression to get final RMSE'''
  step_best = validation(train_rdd)

  RMSE = regression_and_error(train_rdd,test_rdd,step_best)

  print("Final RMSE = " + str(RMSE) + "  Best Step size = " + str(step_best))
Example #22
def main(argv):
    sc = SparkContext("local", "Simple App")
    sc.setLogLevel("ERROR")

    args = parser.parse_args(argv[1:])
    vector_size = int(args.vector_size)
    min_count = int(args.min_count)
    test = int(args.mode)
    resume = int(args.resume)
    MAX_LEN = 500
    #    vector_size = 5
    #    min_count = 5

    # Check the existence of word2vec_model folder
    model_name = "word2vec_model"
    model_folder = glob.glob(model_name + "*")
    model_num = len(model_folder)

    path = "data/df_data/df_playlistSong/"
    if test == 1:
        print("Mode test: ON")
        path = "data/df_data/df_small/df_playlistSong/"
        MAX_LEN = 100
    print(path)
    print("Load Song-Playlist matrix")

    #    path = "data/df_data/df_small/df_playlistSong/"

    df_ps_train = pd.read_hdf(path + 'df_ps_train.hdf')
    df_ps_test = pd.read_hdf(path + 'df_ps_test.hdf')
    df_ps_test_truth = pd.read_hdf(path + 'df_ps_test_truth.hdf')

    data_str = [list(map(str, item)) for item in df_ps_train.tid.values]

    pid_list_pred = list(df_ps_test.index)
    current_list = list(df_ps_test.loc[pid_list_pred].tid)
    current_len = [len(i) for i in current_list]
    K_list = [MAX_LEN - current_len[i] for i in range(len(current_len))]

    current_list_str = [list(map(str, item)) for item in current_list]

    record = []
    index = 0

    # Resume or not
    if resume == 0:
        print("Serialize data")
        doc = sc.parallelize(data_str).persist(StorageLevel.DISK_ONLY)

        print("Train Word2Vec model")
        model = Word2Vec().setVectorSize(vector_size).setSeed(3).setMinCount(
            min_count).fit(doc)

        print("Get vocabulary")
        vocab = model.getVectors().keySet()

        print("Save model")
        model_name = model_name + str(model_num)
        model.save(sc, model_name)

    elif resume == 1:
        print("load recent model")
        model_name = model_name + str(model_num - 1)
        model = Word2VecModel.load(sc, model_name)

        print("Get vocabulary")
        vocab = model.getVectors().keySet()

        first_key = list(vocab)[0]
        vector_size = len(model.getVectors()[first_key])

        print("Check resume file: ", end='')

        if (os.path.exists("resumefile")):
            print("Exist")
            with open('resumefile', 'rb') as fp:
                resumefile = pickle.load(fp)

            pid, record = resumefile.get('pid'), resumefile.get('data')
            index = current_list_str.index(pid)

            print("Resume at point pid: {} \t index: {}".format(pid, index))
        else:
            print("Not exist")

    print("Find K Relevant Songs")
    try:
        i = 0
        for data_list in current_list_str[index:]:
            #        print("pid: {} {}".format(pid_list_pred[i],data_list))
            pid = pid_list_pred[i]
            print("Iter: {} \t pid: {} ".format(str(i + 1), pid))
            start = time.time()

            # Filter data not in vocabulary
            data_list_filter = [value for value in data_list if value in vocab]

            #        topK = [value for value in topK if value not in data_list]

            # Find the centroid of data_list
            record.append(
                findK_relevant(model, K_list[i], data_list_filter, sc,
                               vector_size))
            i += 1
            print("Time taken = {0:.5f}".format(time.time() - start))

        print("Create new dataframe")
        df_ps_test['new_tid'] = record

        df_ps_test['tid'] = df_ps_test.apply(lambda x: x[1] + x[2], axis=1)
        df_ps_test = df_ps_test.drop(columns='new_tid')

        print("Save test data")
        df_ps_test.to_hdf(path + 'df_ps_test_complete.hdf', key='abc')

        print("Evaluation")
        result = my_evaluation(df_ps_test, df_ps_test_truth)
        print(result.aggregate_metric())
    except Exception as e:
        print(e)
        print("Create a resume point")
        resume_dict = {'pid': pid, 'data': record}
        with open('resumefile', 'wb') as fp:
            pickle.dump(resume_dict, fp)
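Example #22 calls a findK_relevant helper that is not included in the listing. A hypothetical sketch of what such a helper might look like, assuming a centroid-based lookup (average the vectors of the tracks already in the playlist, then ask the model for the K nearest track ids); the body below is an assumption, not the original implementation.

import numpy as np

def findK_relevant(model, K, data_list_filter, sc, vector_size):
    # sc is unused here; it is kept only to match the call signature above.
    # Average the embeddings of the known tracks to get a playlist centroid.
    centroid = np.zeros(vector_size)
    for tid in data_list_filter:
        centroid += model.transform(tid).toArray()
    if data_list_filter:
        centroid /= len(data_list_filter)
    # findSynonyms accepts a raw vector as the query point.
    syms = model.findSynonyms(centroid, K)
    return [int(word) for word, _sim in syms]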