Example #1
from typing import Dict, Tuple

import numpy as np
from pyspark import RDD, SparkContext

# Type alias for the word-vector values; assumed here to be NumPy arrays
Array = np.ndarray


def train_glove(spark: SparkContext,
                word_cooc: RDD,
                num_iterations=100,
                vector_size=10,
                learning_rate=0.001,
                max_value=100,
                alpha=3. / 4) -> Tuple[Dict[str, Array], Dict[str, float]]:
    """Train a glove model

  TODO: add option to initialize form existing parameters for continued training

  Parameters
  ----------
  spark : The Spark context of the session
  word_cooc :  The co-occurrence RDD of words, ([word, word], count)
  max_value :  The max value of the loss weighting. Counts higher then this do
    not have the loss applied to them
  num_iterations : the number of training iterations to run
  max_value : The maximum value where loss weighting is applied
  learning_rate : The learning rate of the vector
  alpha : Part of the loss weighting

  Returns
  -------
’
  """
    if num_iterations <= 0:
        raise ValueError(
            'The number of training iterations must be greater than 0')

    if (alpha > 1) or (alpha < 0):
        raise ValueError('Alpha should be between 0 and 1')

    # Model Hyper-parameters
    max_value_bc = spark.broadcast(max_value)
    learning_rate_bc = spark.broadcast(learning_rate)
    alpha_bc = spark.broadcast(alpha)

    # Get the unique words to initialize the parameter dicts
    unique_words = word_cooc.keys().flatMap(lambda x: x).distinct().collect()

    # Initialize the model parameters
    init_vectors, init_biases, init_vectors_grads, init_biases_grads = _initialize_parameters(
        unique_words, vector_size)

    # Broadcast the new model params
    word_vectors = spark.broadcast(init_vectors)
    word_biases = spark.broadcast(init_biases)
    word_vector_grads = spark.broadcast(init_vectors_grads)
    word_bias_grads = spark.broadcast(init_biases_grads)

    # Start training
    for i in range(1, num_iterations + 1):
        print('Iteration Number:', i)
        print('\tComputing Gradients...')
        # Compute the gradient updates for every word co-occurrence
        updates = word_cooc.flatMap(lambda x: _gradient_update(
            x, word_vectors.value, word_vector_grads.value, word_biases.value,
            word_bias_grads.value, max_value_bc.value, learning_rate_bc.value,
            alpha_bc.value))

        # Collect gradients and sum over words
        aggregated_grads = updates.reduceByKey(
            lambda x, y: [x[j] + y[j] for j in range(4)]).collect()
        print('\tUpdating Params')

        # Apply the aggregated updates: entries 0 and 1 adjust the vectors and
        # biases, entries 2 and 3 accumulate the gradient history
        updated_vectors = {}
        updated_biases = {}
        updated_vector_grads = {}
        updated_bias_grads = {}
        for word, grads in aggregated_grads:
            updated_vectors[word] = word_vectors.value[word] - grads[0]
            updated_biases[word] = word_biases.value[word] - grads[1]
            updated_vector_grads[word] = word_vector_grads.value[word] + grads[2]
            updated_bias_grads[word] = word_bias_grads.value[word] + grads[3]

        # Un-persist old values
        for bc_var in [
                word_vectors, word_biases, word_vector_grads, word_bias_grads
        ]:
            bc_var.unpersist()

        # Broadcast updates
        word_vectors = spark.broadcast(updated_vectors)
        word_biases = spark.broadcast(updated_biases)
        word_vector_grads = spark.broadcast(updated_vector_grads)
        word_bias_grads = spark.broadcast(updated_bias_grads)

    # noinspection PyUnboundLocalVariable
    return updated_vectors, updated_biases
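
A minimal usage sketch, not part of the original example: it assumes an existing SparkContext, a toy co-occurrence RDD in the ([word, word], count) shape the docstring describes, and that the helpers _initialize_parameters and _gradient_update are defined in the same module. The corpus, counts, and hyper-parameter values are invented for illustration; the defaults max_value=100 and alpha=3/4 match the weighting function from the original GloVe paper, f(x) = (x / x_max)^alpha for x < x_max and 1 otherwise.

from pyspark import SparkContext

sc = SparkContext(appName='glove-example')

# Toy co-occurrence RDD in the ([word, word], count) shape expected by train_glove
toy_cooc = sc.parallelize([
    (['cat', 'dog'], 10.0),
    (['cat', 'mouse'], 3.0),
    (['dog', 'bone'], 7.0),
])

word_vectors, word_biases = train_glove(
    sc,
    toy_cooc,
    num_iterations=5,
    vector_size=10,
    learning_rate=0.001,
    max_value=100,
    alpha=3. / 4,
)

print(word_vectors['cat'])   # learned embedding for 'cat' (length vector_size)
print(word_biases['cat'])    # learned scalar bias for 'cat'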