def wordtovec(wordrdd):
    word2vec = Word2Vec()
    model = word2vec.fit(wordrdd)
    print(model.getVectors())
    synonyms = model.findSynonyms('1', 5)
    for word, cosine_distance in synonyms:
        print("{}: {}".format(word, cosine_distance))
Code example #2
File: word2vec.py  Project: herospYL/ads-data-process
def word2vec(file_dir):
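    # Train Word2Vec on the space-separated training corpus, write each word's top-5 synonyms as JSON lines, and save the model.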
    word2vec_training_file = file_dir + WORD2VEC_TRAINING_FILE
    synonym_data_file = file_dir + SYNONYM_DATA_FILE
    word2vec_trace_data = file_dir + WORD2VEC_TRACE

    sc = SparkContext(appName="word2vec")
    inp = sc.textFile(word2vec_training_file).map(lambda line: line.split(" "))

    word2vec = Word2Vec()
    model = word2vec.setLearningRate(0.02).setMinCount(5).setVectorSize(10).setSeed(2017).fit(inp)

    vec = model.getVectors()
    synonyms_data = open(synonym_data_file, "w")

    logger = logging.getLogger()
    logger.debug("len of vec:{0}".format(len(vec)))
    for word in vec.keys():
        synonyms = model.findSynonyms(word, 5)
        entry = {"word": word}
        synon_list = []
        for synonym, cosine_distance in synonyms:
            synon_list.append(synonym)
        entry["synonyms"] = synon_list
        synonyms_data.write(json.dumps(entry))
        synonyms_data.write('\n')

    synonyms_data.close()
    model.save(sc, word2vec_trace_data)
    sc.stop()

    logger.info("Word2Vec training finished")
Code example #3
def train_w2v():
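    # Repeatedly fit Word2Vec on 1% samples of the January tweet data, saving one model per iteration.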
    threshold = 20

    data = sqlContext.read.parquet('hdfs:///user/rmusters/data_jan').select(
        "filtered_text")

    counts = data.flatMap(lambda line: line) \
     .map(lambda word: (word, 1)) \
     .reduceByKey(lambda a, b: a + b) \
     .filter(lambda pair: pair[1] >= threshold)

    vocab_size = 67585  #counts.count()
    print "Vocabulary size is: ", vocab_size

    data = data.map(lambda line: line.filtered_text.split())

    max_int_size = 268435455
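    # 268435455 is Int.MaxValue / 8, which mllib's Word2Vec appears to use as the upper bound on vocab_size * vector_size; size the vectors to stay just under it.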
    vector_size = max_int_size / vocab_size
    print "Vector size is: ", vector_size
    word2vec = Word2Vec()
    word2vec.setMinCount(threshold)
    word2vec.setVectorSize(vector_size)

    for idx in range(1, 100, 1):
        print idx
        model = word2vec.fit(data.sample(False, 0.01))
        model.save(sc, '/user/rmusters/jan_threshold20_2015model' + str(idx))
Code example #4
def word2vec(rdd,**kw):
    """
    生成向量
    vec_len:生成向量的长度
    min_count:出现最少次数
    window_size:窗口长度
    learning_rate:学习率
    """
    seed = int(time.time())
    vec_len = kw.get('vec_len',300)
    min_count = kw.get('min_count',3)
    window_size = kw.get('window_size',5)
    partitions = kw.get('partitions',5)
    lr = kw.get('learning_rate',0.025)
    
    step_1 = time.time()
    model = Word2Vec().setVectorSize(vec_len).setLearningRate(lr).setMinCount(min_count).\
        setNumPartitions(partitions).setSeed(seed).setWindowSize(window_size).fit(rdd)
        
    vectors = model.getVectors()
    step_2 = time.time()
    print 'Build Word2vec Model Using:%s s!' % (step_2-step_1)
    result = dict(vectors)
    keys = result.keys()
    for key in keys:
        result[key] = list(result[key])
    return result
Code example #5
File: test01.py  Project: jack-ZG/MovieRecOnline
def word2vec(df, inputcol, outputcol, vecsize):
    from pyspark.ml.feature import Word2Vec
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover
    # Apply the custom segmentation UDF
    df = df.drop('seg')  # DataFrame.drop returns a new DataFrame, so reassign it
    df_seg = df.withColumn("seg", segUDF(inputcol))
    df_w = df_seg.drop('words')
    tokenizer = Tokenizer(inputCol=inputcol, outputCol='words')
    t_words = tokenizer.transform(df_w)
    t_words.select('words').head()
    # 4. Convert the tokenized text into sparse numeric vectors (term-frequency vectors)
    cv = CountVectorizer(inputCol="words",
                         outputCol="features",
                         vocabSize=5,
                         minDF=2.0)
    df_f = t_words.drop("features")
    cv_model = cv.fit(df_f)
    cv_result = cv_model.transform(df_f)
    # 5. Convert the tokenizer output into dense numeric vectors with Word2Vec
    word2Vec = Word2Vec(vectorSize=vecsize,
                        minCount=0,
                        inputCol="words",
                        outputCol=outputcol)
    w2v_model = word2Vec.fit(cv_result)
    result = w2v_model.transform(cv_result)
    for feature in result.select(outputcol).take(3):
        print(feature)
    return t_words
Code example #6
def write_data(path):
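	# Load tweets, look up each word's vector from the saved model's parquet data via a broadcast map, and write the result to parquet with an id column.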
	import filter
	from pyspark.mllib.feature import Word2Vec, Word2VecModel

	# load data
	loc = '/user/rmusters/text/2015/01/*'
	text_file = sc.textFile(loc)
	data = text_file.map(lambda line: filter.filter(line).split(" "))

	# load model
	word2vec = Word2Vec()
	model = Word2VecModel.load(sc, '/user/rmusters/2015model99')

	# get a tweet vector pair.
	from pyspark.sql import SQLContext
	sqlContext = SQLContext(sc)
	lookup = sqlContext.read.parquet('/user/rmusters/2015model99/data').alias("lookup")
	lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())

	vectors = data.map(lambda ws: [lookup_bd.value.get(w) for w in ws])
	logger.info(vectors.count())

	data = text_file.map(lambda line: (line, filter.filter(line).split(" ")))\
							.map(lambda (text, filtered): (text, filtered, [lookup_bd.value.get(w) for w in filtered][0]))

	from pyspark.sql.functions import monotonicallyIncreasingId
	df = data.toDF(["text", "filtered_text", "vectors"])
	# This will return a new DF with all the columns + id
	res = df.withColumn("id", monotonicallyIncreasingId())
	res.write.parquet(path, mode="overwrite")
Code example #7
def main(argv):
	import getopt

	dir = '/user/rmusters/'

	word2vec = Word2Vec()
	sc = SparkContext(appName='Word2Vec')
	#
	# try:
	# 	opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
	# except getopt.GetoptError:
	# 	print 'test.py -i <inputfile> -o <outputfile>'
	# 	sys.exit(2)
	# 	for opt, arg in opts:
	# if opt == '-h':
	# 	print 'test.py -i <inputfile> -o <outputfile>'
	# 	sys.exit()
	# elif opt in ("-l"):
	# 	inputfile = arg
	# elif opt in ("-s"):
	# 	outputfile = arg
	# 	print 'Input file is "', inputfile
	# 	print 'Output file is "', outputfile

	filename = "12.txt"
	inp = sc.textFile(dir + filename).map(lambda row: row.split(" "))

	model = word2vec.fit(inp)

	model.save(sc, dir + "pymodelF.bin")

	model =  Word2VecModel.load(sc, dir + "pymodelF.bin")

	print model.getVectors()
Code example #8
def main(sc):
    spark = SparkSession.builder.appName("Charembedding").config(
        "spark.some.config.option", "Charembedding").getOrCreate()
    # readjson and preprocess
    file_path = '/user/ichongxiang/data/positions/20180518/dedup_json/part-'
    df = spark.read.json(
        '/user/ichongxiang/data/positions/20180518/dedup_json/part-00000'
    ).select("requirement", "description")
    text_00000 = df.rdd.map(list)
    text_00000 = text_00000.map(lambda r: [r[0] + r[1]])
    inp_all = text_00000.map(split)
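    # Union the remaining part files (part-00001 .. part-00299) into a single training RDD.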
    for i in range(1, 300):
        print('Processing input files:%s/%s' % (i, 300))
        i = "%05d" % i
        df = spark.read.json(file_path + str(i)).select(
            "requirement", "description")
        text = df.rdd.map(list)
        text = text.map(lambda r: [r[0] + r[1]])
        inp = text.map(split)
        inp_all = inp_all.union(inp)

    print('Start Training Word2vec')
    word2vec = Word2Vec()
    model = word2vec.setVectorSize(100).setMinCount(0).setSeed(
        100000000000000).fit(inp_all)
    w2v_dict = model.getVectors()
    print('Saving Word2vec model vectors')
    w2v_save = open("char_embedding_w2v.csv", 'w')
    for i, v in w2v_dict.items():
        w2v_save.write(str(i))
        w2v_save.write('\t')
        w2v_save.write(str(v))
        w2v_save.write('\n')
    w2v_save.close()
    print("succeed")
Code example #9
def trainItem2vec(spark, samples, embLength, embOutputPath, saveToRedis,
                  redisKeyPrefix):
    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(
        5).setNumIterations(10)
    model = word2vec.fit(samples)
    synonyms = model.findSynonyms("158", 20)
    print("similarMovieId, cosineSimilarity")
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)
    embOutputDir = '/'.join(embOutputPath.split('/')[:-1])
    if not os.path.exists(embOutputDir):
        os.makedirs(embOutputDir)
    with open(embOutputPath, 'w') as f:
        # model.getVectors() -> {movie_id: List[movie_embedding]}
        for movie_id in model.getVectors():
            vectors = " ".join(
                [str(emb) for emb in model.getVectors()[movie_id]])
            f.write(movie_id + ":" + vectors + "\n")

    # save to Redis
    if saveToRedis:
        r = redis.Redis(host=redisHost,
                        port=redisPort,
                        db=0,
                        decode_responses=True)
        for movieId in model.getVectors():
            r.set(redisKeyPrefix + ":" + movieId,
                  " ".join([str(emb) for emb in model.getVectors()[movieId]]),
                  ex=TTL)

    embeddingLSH(spark, model.getVectors())
    return model
Code example #10
File: test_feature.py  Project: zoelin7/spark
 def test_word2vec_setters(self):
     model = (Word2Vec().setVectorSize(2).setLearningRate(
         0.01).setNumPartitions(2).setNumIterations(10).setSeed(
             1024).setMinCount(3).setWindowSize(6))
     self.assertEqual(model.vectorSize, 2)
     self.assertTrue(model.learningRate < 0.02)
     self.assertEqual(model.numPartitions, 2)
     self.assertEqual(model.numIterations, 10)
     self.assertEqual(model.seed, 1024)
     self.assertEqual(model.minCount, 3)
     self.assertEqual(model.windowSize, 6)
Code example #11
 def __get_word2vec(self, word2vec_setting):
     min_count, seed, learning_rate, vector_size = word2vec_setting
     word2vec = Word2Vec()
     # Word2Vec's default min count is 5; our default min count is 20.
     word2vec.setMinCount(min_count)
     word2vec.setSeed(seed)
     # Word2Vec's default learning rate is 0.025; our default learning rate is also 0.025.
     word2vec.setLearningRate(learning_rate)
     # Word2Vec's default vector size is 100; our default vector size is 50.
     word2vec.setVectorSize(vector_size)
     return word2vec
Code example #12
File: tests.py  Project: yunchat/spark
 def test_word2vec_get_vectors(self):
     data = [
         ["a", "b", "c", "d", "e", "f", "g"],
         ["a", "b", "c", "d", "e", "f"],
         ["a", "b", "c", "d", "e"],
         ["a", "b", "c", "d"],
         ["a", "b", "c"],
         ["a", "b"],
         ["a"]
     ]
     model = Word2Vec().fit(self.sc.parallelize(data))
     self.assertEquals(len(model.getVectors()), 3)
Code example #13
    def create_model_text(self, data, params):

        learningRate = float(params.get('learningRate', 0.025))
        numIterations = int(params.get('numIterations', 10))
        minCount = int(params.get('minCount', 5))

        word2vec = Word2Vec()
        word2vec.setLearningRate(learningRate)
        word2vec.setNumIterations(numIterations)
        word2vec.setMinCount(minCount)

        inp = data.map(lambda row: row.split(" "))
        return word2vec.fit(inp)
Code example #14
 def _skip_gram(self, walks_rdd):
     vector_size = self.getOrDefault("vector_size")
     min_count = self.getOrDefault("min_count")
     num_partitions = self.getOrDefault("num_partitions")
     learning_rate = self.getOrDefault("learning_rate")
     num_iter = self.getOrDefault("num_iter")
     model = Word2Vec() \
         .setVectorSize(vector_size) \
         .setMinCount(min_count) \
         .setNumPartitions(num_partitions) \
         .setLearningRate(learning_rate) \
         .setNumIterations(num_iter) \
         .fit(walks_rdd)
     return model.getVectors()
Code example #15
File: spark.py  Project: colinsongf/pythia
def word2vecModel(text):
    """Computes distributed vector representation of words using a skip-gram model. The training objective of skip-gram
  is to learn word vector representations that are good at predicting its context in the same sentence.

  :parameter text: (REQUIRED) - the input data of text words/strings you'd like to use
  :return: word2vec model

  Use it as:
  .. code-block:: python

      model = word2vecModel(text)
      synonyms = model.findSynonyms('random_word', 40)
  """
    word2vec = Word2Vec()
    return word2vec.fit(text)
Code example #16
File: tests.py  Project: bopopescu/SparkNew
 def test_word2vec_setters(self):
     data = [["I", "have", "a", "pen"],
             ["I", "like", "soccer", "very", "much"],
             ["I", "live", "in", "Tokyo"]]
     model = Word2Vec() \
         .setVectorSize(2) \
         .setLearningRate(0.01) \
         .setNumPartitions(2) \
         .setNumIterations(10) \
         .setSeed(1024) \
         .setMinCount(3)
     self.assertEquals(model.vectorSize, 2)
     self.assertTrue(model.learningRate < 0.02)
     self.assertEquals(model.numPartitions, 2)
     self.assertEquals(model.numIterations, 10)
     self.assertEquals(model.seed, 1024)
     self.assertEquals(model.minCount, 3)
Code example #17
def trainItem2vec(spark, samples, embLength, embOutputPath, saveToRedis,
                  redisKeyPrefix):
    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(
        5).setNumIterations(10)
    model = word2vec.fit(samples)
    synonyms = model.findSynonyms("158", 20)
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)
    embOutputDir = '/'.join(embOutputPath.split('/')[:-1])
    if not os.path.exists(embOutputDir):
        os.makedirs(embOutputDir)
    with open(embOutputPath, 'w') as f:
        for movie_id in model.getVectors():
            vectors = " ".join(
                [str(emb) for emb in model.getVectors()[movie_id]])
            f.write(movie_id + ":" + vectors + "\n")
    embeddingLSH(spark, model.getVectors())
    return model
Code example #18
def main():
    # Threshold to limit words which occur less than the threshold
    threshold = 10  #10
    text_file = sc.textFile(loc)

    data = text_file.map(lambda line: filter.filter(line))

    counts = data.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b) \
       .filter(lambda pair: pair[1] >= threshold)
    #.sortBy(lambda x:x[1], ascending=True) #only use for inspection
    counts.cache()
    vocab_size = counts.count()
    print "Vocabulary size is: ", vocab_size

    inp = data.map(lambda line: line.split(" "))
    inp.cache()

    max_int_size = 268435455
    vector_size = max_int_size / vocab_size
    print "Vector size is: ", vector_size
    word2vec = Word2Vec()
    word2vec.setMinCount(threshold)  #40
    word2vec.setVectorSize(vector_size)  #/100

    for idx in range(1, 100, 1):
        print idx
        model = word2vec.fit(inp.sample(False, 0.01))
        # if idx == 1 or idx == 2:
        # 	print "Vector size of current model:  ", word2vec.getVectorSize()
        # 	inputcol = word2vec.getInputCol()
        # 	outputcol =  word2vec.getOutputCol()
        # 	print "input column: ", inputcol
        # 	try:
        # 		print len(inputcol)
        # 		print len(outputcol)
        # 	except:
        # 		pass
        # 	print "output column", outputcol

        model.save(sc, '/user/rmusters/threshold20_2015model' + str(idx))
Code example #19
def trainItem2vecAndSave(spark, samples, embLength, embOutputPath):
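    # embOutputPath is assumed to be an S3-style URI ("s3://<bucket>/<key>"); split out the bucket and key for the upload below.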
    bucket = embOutputPath.split('//')[1].split('/', 1)[0]
    key = embOutputPath.split('//')[1].split('/', 1)[1]

    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(
        5).setNumIterations(10)
    model = word2vec.fit(samples)
    synonyms = model.findSynonyms("158", 20)
    #print(synonyms)
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)

    buffer = StringIO()
    for movie_id in model.getVectors():
        vectors = " ".join([str(emb) for emb in model.getVectors()[movie_id]])
        buffer.write(movie_id + ":" + vectors + "\n")
    buffer.seek(0)
    s3.put_object(Bucket=bucket, Key=key, Body=buffer.read())

    embeddingLSH(spark, model.getVectors())
    return model
Code example #20
    def train_item_to_vec(self, spark_session: SparkSession, samples,
                          emb_length: int, emb_output_file_name: str,
                          save_to_redis: bool, redis_key_prefix: str):
        """
        train a word2vec model based on movie samples
        :param spark_session:
        :param samples:
        :param emb_length:
        :param emb_output_file_name:
        :param save_to_redis:
        :param redis_key_prefix:
        :return:
        """
        word2vec = Word2Vec().setVectorSize(emb_length).setWindowSize(
            5).setNumIterations(10)
        model = word2vec.fit(samples)
        synonyms = model.findSynonyms("158", 20)
        for synonym, cosine_sim in synonyms:
            print(synonym, cosine_sim)

        root_dir = dirname(dirname(dirname(abspath(__file__))))
        rating_resource_path = join(root_dir, "resources",
                                    "webroot/modeldata/")

        file = open(join(rating_resource_path, emb_output_file_name), "w")

        for movieId, vector in model.getVectors().items():
            file.write(movieId + ":" + " ".join([str(num)
                                                 for num in vector]) + "\n")
        file.close()

        if save_to_redis:
            redis_client = redis.Redis(host=self.redisEndpoint,
                                       port=self.redisPort)
            for movieId, vector in model.getVectors().items():
                redis_client.set(redis_key_prefix + ":" + movieId,
                                 " ".join([str(num) for num in vector]),
                                 ex=60 * 60 * 24)
            redis_client.close()
        self.embedding_lsh(spark_session, model.getVectors())
        return model
Code example #21
def main():
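    # Train Word2Vec on the corpus, then store each word's five nearest synonyms through db_manager.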
    inp = sc.textFile("hdfs://hadoop2/input/result.txt").map(lambda row: row.split(" "))

    word2vec = Word2Vec()
    model = word2vec.fit(inp)

    ket = model.getVectors().keys()

    for noun in ket:
        num = 0
        synonym = ["" for _ in range(5)]
        synonyms = model.findSynonyms(noun, 5)

        for word, cosine_distance in synonyms:
            synonym[num] = word
            num = num + 1
        try:
            print(noun.encode('utf-8'))
            db_manager(noun, synonym[0], synonym[1], synonym[2], synonym[3], synonym[4])
        
        except Exception as err:
            print(err)
            pass
Code example #22
def trainItem2vec(spark, samples, embLength, embOutputPath, redisKeyPrefix, saveToRedis=False):
    # Configure the Word2vec model structure:
    # setVectorSize sets the embedding dimension, i.e. the number of neurons in Word2vec's hidden layer
    # setWindowSize sets the size of the window slid over each sequence (windowSize = 2c + 1)
    # setNumIterations sets the number of training iterations, similar to epochs
    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(5).setNumIterations(10)
    model = word2vec.fit(samples)
    # Use the built-in helper to find the N items most similar to a given item
    synonyms = model.findSynonyms("592", 20)  # id "592" is Batman
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)

    # Pull the embedding vectors out of the trained Word2vec model and write them to the target directory or to Redis
    if not saveToRedis:
        embOutputDir = '/'.join(embOutputPath.split('/')[:-1])
        if not os.path.exists(embOutputDir):
            os.mkdir(embOutputDir)
        # getVectors() returns a map<movie_id: String, Embedding: Vector> holding each word and its vector representation (an embedding vector, i.e. a row of W_vxn)
        with open(embOutputPath, 'w') as file:
            for movie_id in model.getVectors():
                vectors = " ".join([str(emb) for emb in model.getVectors()[movie_id]])
                file.write(movie_id + ":" + vectors + "\n")
    else:
        # Write the item embeddings to Redis
        redis_client = redis.StrictRedis(host='66.42.66.135', port='6379', db=0, password='******')
        expire_time = 60*60*24  # cache TTL of 24 hours
        # Use a pipeline; otherwise every request to Redis costs one round trip, which is far too slow
        pipe = redis_client.pipeline(transaction=True)
        for movie_id in model.getVectors():
            vectors = " ".join([str(emb) for emb in model.getVectors()[movie_id]])
            pipe.set(redisKeyPrefix + ":" + movie_id, vectors)
            # pipe.expire(redisKeyPrefix + ":" + movie_id, expire_time)  # not deployed to production yet, so no cache TTL for now
        # Execute the queued pipeline commands
        pipe.execute()
        redis_client.close()

    return model
Code example #23
File: eventGen.py  Project: hemavakade/magichour
def event_gen_word2vec(sc, log_lines, window_size=60):
    import hdbscan
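    # Treat each time window's sequence of template ids as a sentence, learn a Word2Vec vector per template id, then cluster those vectors with HDBSCAN to form events.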
    D = (log_lines
         .map(lambda logline: (int(logline.ts / window_size),
                               (logline.ts, logline.templateId)))
         .groupByKey()
         .map(lambda window_loglines: [str(templateId)
                                       for (ts, templateId) in sorted(window_loglines[1])]))

    # Run Word2Vec
    model = Word2Vec().setVectorSize(16).setSeed(42).fit(D)
    model_vectors = model.getVectors()

    # Collect template-id labels and their embedding vectors
    labels = []
    vectors = []
    for label, vector in model_vectors.items():
        labels.append(label)
        vectors.append(list(vector))

    # Clustering
    output_events = defaultdict(list)
    for i, val in enumerate(hdbscan.HDBSCAN(
            min_cluster_size=2).fit_predict(vectors)):
        output_events[val].append(labels[i])

    # Create event objects
    events = []
    for item in output_events:
        event = Event(id=item, template_ids=map(int, output_events[item]))
        if len(event.template_ids) > 0:
            events.append(event)
    return events
Code example #24
from pyspark import SparkConf, SparkContext
import urllib
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
import os
from pyspark.mllib.feature import Word2Vec
from time import time
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from classes.PosNeg import PosNegCount
from classes.WordVector import WordVectorAnalyzer
import nltk
import json
from nltk.tree import Tree
from Levenstein import Lev
stop_words = nltk.corpus.stopwords.words('english')
stop_words += ['?', '.', '!', ',']
sparkConf = SparkConf().setMaster("local").setAppName(
    "PredictKafkaTweetStreaming").set("spark.app.id", "Predict")
sc = SparkContext(appName="WordVectorTrainer")
sc.setLogLevel("WARN")
inp = sc.textFile("WordTraining.txt").map(lambda row: row.split(" "))
word2vec = Word2Vec()
model = word2vec.fit(inp)
WordVectors = {}
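# WordVectors maps each vocabulary word to its 7 nearest synonyms as (word, similarity) pairs.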

for i in model.getVectors().keys():
    WordVectors[i] = model.findSynonyms(i, 7)

with open('WordVectors.json', 'w') as fp:
    json.dump(WordVectors, fp)
Code example #25
def trainOne(sc, url):
    w2v = Word2Vec()
    return w2v.fit(url2rdd(sc, url))
Code example #26
def train(sc, urls):
    w2v = Word2Vec()
    rdds = reduce(lambda a, b: a.union(b),
                  [url2rdd(sc, url) for url in urls.split("\n")])
    return w2v.fit(rdds)
Code example #27
def generate_word2vec_model(doc):
    return Word2Vec().setVectorSize(10).setSeed(42).fit(doc)
Code example #28
def run_word_embedding_word2vec(sc, sentences, wepath):
    word2vec = Word2Vec()
    model = word2vec.fit(sentences)
    # model.transform(sentences).saveAsTextFile(wepath)
    print 'saving model to output: {}'.format(wepath)
    model.save(sc, wepath)
Code example #29
import sys
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.feature import Word2Vec

def filtering(features_l):
	# Keep only ASCII tokens that are not hashtags, mentions or URLs; strip punctuation and lowercase them.
	new_feature_l = []
	for feature in features_l:
		if all(ord(char) < 128 for char in feature):
			if feature.rfind("#") == -1 and feature.rfind("@") == -1 and feature.rfind("https") == -1:
				feature = feature.replace(",", "").replace(".", "").replace(":", "").replace(";", "").replace("\"", "").lower()
				new_feature_l.append(feature)

	return new_feature_l

first_n_rows = int(sys.argv[1])

vectorSize = int(sys.argv[2])

sc = SparkContext(appName = "Prova")

sqlContext = SQLContext(sc)
test = sc.textFile("txt/dataset_" + str(first_n_rows) + ".txt")

#test = test.map(lambda tweet: tweet[1].replace("\"", ""))
test = test.filter(lambda tweet: "," in tweet).map(lambda tweet: tweet.split(","))

test = test.map(lambda tweet: filtering(tweet[0].split(" ")))


word2vec = Word2Vec().setVectorSize(vectorSize)

model = word2vec.fit(test)

print(model.getVectors())

model.save(sc, "word2vec_models/" + str(first_n_rows) + "_" + str(vectorSize))
Code example #30

embedding_size = 2
start = time.time()
load_edgelist(file_name, graph)
end = time.time()
print "Loading edgelist\t", (end - start)
data = generate_walks(graph)
data_matrix = []
for row in data:
    data_matrix.append(row)
print "Data Matrix Created"
s = sc.parallelize(data_matrix)
print "Building Word Vectors"
start = time.time()
model = Word2Vec().setVectorSize(embedding_size).setSeed(22).setMinCount(
    1).fit(s)
end = time.time()
print "Word2vec\t", (end - start)
embeddings = model.getVectors()
d = defaultdict(list)
for key in embeddings:
    for x in embeddings[key]:
        d[key].append(x)

l = sc.textFile(file_name)
X = l.map(lambda row: row.strip().split(','))
t = sc.textFile("combined_lab.csv")
y = t.map(lambda row: map(int, row.strip()))
temp = y.zip(X)
data = temp.map(lambda row: getEmbeddings(row, d))
for x in [.01, .02, .03, .04, .05, .06, .07, .08, .09]: