Example #1
from typing import Dict, Tuple

import numpy as np
from pyspark import RDD, SparkContext

# Type alias for the word-vector values; assumed here to be NumPy arrays
Array = np.ndarray


def train_glove(spark: SparkContext,
                word_cooc: RDD,
                num_iterations=100,
                vector_size=10,
                learning_rate=0.001,
                max_value=100,
                alpha=3. / 4) -> Tuple[Dict[str, Array], Dict[str, float]]:
    """Train a glove model

  TODO: add option to initialize form existing parameters for continued training

  Parameters
  ----------
  spark : The Spark context of the session
  word_cooc :  The co-occurrence RDD of words, ([word, word], count)
  max_value :  The max value of the loss weighting. Counts higher then this do
    not have the loss applied to them
  num_iterations : the number of training iterations to run
  max_value : The maximum value where loss weighting is applied
  learning_rate : The learning rate of the vector
  alpha : Part of the loss weighting

  Returns
  -------
’
  """
    if num_iterations <= 0:
        raise ValueError(
            'The number of training iterations must be greater than 0')

    if (alpha > 1) or (alpha < 0):
        raise ValueError('Alpha should be between 0 and 1')

    # Model Hyper-parameters
    max_value_bc = spark.broadcast(max_value)
    learning_rate_bc = spark.broadcast(learning_rate)
    alpha_bc = spark.broadcast(alpha)

    # Get the unique words to initialize the parameter dicts
    unique_words = word_cooc.keys().flatMap(lambda x: x).distinct().collect()

    # Initialize the model parameters
    init_vectors, init_biases, init_vectors_grads, init_biases_grads = _initialize_parameters(
        unique_words, vector_size)

    # Broadcast the new model params
    word_vectors = spark.broadcast(init_vectors)
    word_biases = spark.broadcast(init_biases)
    word_vector_grads = spark.broadcast(init_vectors_grads)
    word_bias_grads = spark.broadcast(init_biases_grads)

    # Start training
    for i in range(1, num_iterations + 1):
        print('Iteration Number:', i)
        print('\tComputing Gradients...')
        # Compute the gradient updates for every word co-occurrence
        updates = word_cooc.flatMap(lambda x: _gradient_update(
            x, word_vectors.value, word_vector_grads.value, word_biases.value,
            word_bias_grads.value, max_value_bc.value, learning_rate_bc.value,
            alpha_bc.value))

        # Collect gradients and sum over words
        aggregated_grads = updates.reduceByKey(
            lambda x, y: [x[j] + y[j] for j in range(4)]).collect()
        print('\tUpdating Params')

        # Apply the aggregated updates: entries 0 and 1 adjust the vectors and
        # biases, entries 2 and 3 accumulate the gradient history
        updated_vectors = {}
        updated_biases = {}
        updated_vector_grads = {}
        updated_bias_grads = {}
        for word, grads in aggregated_grads:
            updated_vectors[word] = word_vectors.value[word] - grads[0]
            updated_biases[word] = word_biases.value[word] - grads[1]
            updated_vector_grads[word] = word_vector_grads.value[word] + grads[2]
            updated_bias_grads[word] = word_bias_grads.value[word] + grads[3]

        # Un-persist old values
        for bc_var in [
                word_vectors, word_biases, word_vector_grads, word_bias_grads
        ]:
            bc_var.unpersist()

        # Broadcast updates
        word_vectors = spark.broadcast(updated_vectors)
        word_biases = spark.broadcast(updated_biases)
        word_vector_grads = spark.broadcast(updated_vector_grads)
        word_bias_grads = spark.broadcast(updated_bias_grads)

    # noinspection PyUnboundLocalVariable
    return updated_vectors, updated_biases
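
A minimal usage sketch, not part of the original example: it assumes an existing SparkContext, a toy co-occurrence RDD in the ([word, word], count) shape the docstring describes, and that the helpers _initialize_parameters and _gradient_update are defined in the same module. The corpus, counts, and hyper-parameter values are invented for illustration; the defaults max_value=100 and alpha=3/4 match the weighting function from the original GloVe paper, f(x) = (x / x_max)^alpha for x < x_max and 1 otherwise.

from pyspark import SparkContext

sc = SparkContext(appName='glove-example')

# Toy co-occurrence RDD in the ([word, word], count) shape expected by train_glove
toy_cooc = sc.parallelize([
    (['cat', 'dog'], 10.0),
    (['cat', 'mouse'], 3.0),
    (['dog', 'bone'], 7.0),
])

word_vectors, word_biases = train_glove(
    sc,
    toy_cooc,
    num_iterations=5,
    vector_size=10,
    learning_rate=0.001,
    max_value=100,
    alpha=3. / 4,
)

print(word_vectors['cat'])   # learned embedding for 'cat' (length vector_size)
print(word_biases['cat'])    # learned scalar bias for 'cat'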