from typing import Dict, Tuple

import numpy as np
from pyspark import RDD, SparkContext


def train_glove(spark: SparkContext,
                word_cooc: RDD,
                num_iterations: int = 100,
                vector_size: int = 10,
                learning_rate: float = 0.001,
                max_value: float = 100,
                alpha: float = 3. / 4
                ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
    """Train a GloVe model.

    TODO: add option to initialize from existing parameters for continued
    training.

    Parameters
    ----------
    spark : The Spark context of the session
    word_cooc : The co-occurrence RDD of words, ([word, word], count)
    num_iterations : The number of training iterations to run
    vector_size : The dimensionality of the word vectors
    learning_rate : The learning rate of the parameter updates
    max_value : The maximum value of the loss weighting. Counts higher than
        this are given full (capped) weight
    alpha : The exponent of the loss weighting

    Returns
    -------
    The trained word vectors and word biases, each keyed by word.
    """
    if num_iterations <= 0:
        raise ValueError(
            'The number of training iterations must be greater than 0')
    if (alpha > 1) or (alpha < 0):
        raise ValueError('Alpha should be between 0 and 1')

    # Model hyper-parameters
    max_value_bc = spark.broadcast(max_value)
    learning_rate_bc = spark.broadcast(learning_rate)
    alpha_bc = spark.broadcast(alpha)

    # Get the unique words to initialize the parameter dicts
    unique_words = word_cooc.keys().flatMap(lambda x: x).distinct().collect()

    # Initialize the model parameters
    init_vectors, init_biases, init_vectors_grads, init_biases_grads = \
        _initialize_parameters(unique_words, vector_size)

    # Broadcast the new model params
    word_vectors = spark.broadcast(init_vectors)
    word_biases = spark.broadcast(init_biases)
    word_vector_grads = spark.broadcast(init_vectors_grads)
    word_bias_grads = spark.broadcast(init_biases_grads)

    # Start training
    for i in range(1, num_iterations + 1):
        print('Iteration Number:', i)

        print('\tComputing Gradients...')
        # Compute the gradient contributions of every word co-occurrence
        updates = word_cooc.flatMap(lambda x: _gradient_update(
            x, word_vectors.value, word_vector_grads.value,
            word_biases.value, word_bias_grads.value, max_value_bc.value,
            learning_rate_bc.value, alpha_bc.value))

        # Collect gradients and sum over words
        aggregated_grads = updates.reduceByKey(
            lambda x, y: [x[i] + y[i] for i in range(4)]).collect()

        print('\tUpdating Params')
        # Separate the update components: apply the vector and bias steps,
        # and accumulate the squared-gradient history
        updated_vectors = {}
        for word, grad in [(word, grads[0]) for word, grads in aggregated_grads]:
            updated_vectors[word] = word_vectors.value[word] - grad

        updated_biases = {}
        for word, grad in [(word, grads[1]) for word, grads in aggregated_grads]:
            updated_biases[word] = word_biases.value[word] - grad

        updated_vector_grads = {}
        for word, grad in [(word, grads[2]) for word, grads in aggregated_grads]:
            updated_vector_grads[word] = word_vector_grads.value[word] + grad

        updated_bias_grads = {}
        for word, grad in [(word, grads[3]) for word, grads in aggregated_grads]:
            updated_bias_grads[word] = word_bias_grads.value[word] + grad

        # Un-persist old values
        for bc_var in [word_vectors, word_vector_grads, word_biases,
                       word_bias_grads]:
            bc_var.unpersist()

        # Broadcast updates
        word_vectors = spark.broadcast(updated_vectors)
        word_biases = spark.broadcast(updated_biases)
        word_vector_grads = spark.broadcast(updated_vector_grads)
        word_bias_grads = spark.broadcast(updated_bias_grads)

    # noinspection PyUnboundLocalVariable
    return updated_vectors, updated_biases
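
# The two helpers referenced above are not shown in this file. The sketches
# below are assumptions about what they might look like, inferred from how
# train_glove consumes them: an AdaGrad-style update and the standard GloVe
# weighted least-squares loss. The names match the calls above, but the
# bodies (initialization scheme, update rule, factor-of-2 handling) are
# hypothetical, not the original implementations.


def _initialize_parameters(unique_words, vector_size):
    """Sketch: random vectors and zero biases, plus AdaGrad accumulators."""
    vectors = {w: (np.random.rand(vector_size) - 0.5) / vector_size
               for w in unique_words}
    biases = {w: 0.0 for w in unique_words}
    # Squared-gradient accumulators start at one so the first AdaGrad step
    # divides by sqrt(1)
    vector_grads = {w: np.ones(vector_size) for w in unique_words}
    bias_grads = {w: 1.0 for w in unique_words}
    return vectors, biases, vector_grads, bias_grads


def _gradient_update(cooc, vectors, vector_grads, biases, bias_grads,
                     max_value, learning_rate, alpha):
    """Sketch: per-co-occurrence AdaGrad steps for both words (assumed).

    Emits (word, [vector_step, bias_step, squared_vector_grad,
    squared_bias_grad]) pairs so they can be summed with reduceByKey.
    """
    (main, context), count = cooc
    # GloVe loss weighting: f(x) = (x / x_max) ** alpha, capped at 1
    weight = min((count / max_value) ** alpha, 1.0)
    # Difference between the model's prediction and the log co-occurrence
    diff = (np.dot(vectors[main], vectors[context])
            + biases[main] + biases[context] - np.log(count))
    # Gradients of the weighted squared error; the constant factor of 2 is
    # folded into the learning rate
    main_vec_grad = weight * diff * vectors[context]
    context_vec_grad = weight * diff * vectors[main]
    bias_grad = weight * diff
    updates = []
    for word, vec_grad in ((main, main_vec_grad), (context, context_vec_grad)):
        # AdaGrad: scale each step by the accumulated squared gradients
        vec_step = learning_rate * vec_grad / np.sqrt(vector_grads[word])
        bias_step = learning_rate * bias_grad / np.sqrt(bias_grads[word])
        updates.append((word, [vec_step, bias_step,
                               vec_grad ** 2, bias_grad ** 2]))
    return updates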