Code example #1
def naive_multiplication_rdd(mat_a: pyspark.RDD, mat_b: pyspark.RDD, is_triangle=False):
    """
    mat_a is the left matrix
    mat_b is the right matix
    :param mat_a:
    :param mat_b:
    :param is_triangle:
    :return:
    """
    if is_triangle:
        left_rdd = (
            mat_a.flatMap(lambda x: [((x.j, x.i), x.value), ((x.i, x.j), x.value)])
                .aggregateByKey(zeroValue=(0.0, 0.0),
                                seqFunc=lambda x, y: (x[0]+y, x[1]+1),
                                combFunc=lambda x, y: (x[0] + y[0], x[1]+y[1]))
                .mapValues(lambda x: x[0] / x[1])
                .map(lambda x: (x[0][0], (x[0][1], x[1])))
        )
    else:
        # Key left entries by their column index.
        left_rdd = mat_a.map(lambda x: (x.j, (x.i, x.value)))

    # Key right entries by their row index, join on the shared inner dimension,
    # multiply matching values and sum them per output cell.
    right_rdd = mat_b.map(lambda x: (x.i, (x.j, x.value)))
    combined_rdd = (left_rdd.join(right_rdd).map(lambda x: x[1])
        .map(lambda x: ((x[0][0], x[1][0]), x[0][1]*x[1][1]))
        .reduceByKey(lambda x, y: x+y)
        .map(lambda x: distributed.MatrixEntry(i=x[0][0], j=x[0][1], value=x[1]))
    )
    return combined_rdd
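
As a quick, hypothetical usage sketch (not from the original project): it assumes mat_a and mat_b are RDDs of pyspark.mllib.linalg.distributed.MatrixEntry records, which matches the i/j/value attributes the function reads.

from pyspark import SparkContext
from pyspark.mllib.linalg import distributed

sc = SparkContext.getOrCreate()
# 1x2 row matrix times 2x1 column matrix -> a single (0, 0) entry.
mat_a = sc.parallelize([distributed.MatrixEntry(0, 0, 2.0),
                        distributed.MatrixEntry(0, 1, 1.0)])
mat_b = sc.parallelize([distributed.MatrixEntry(0, 0, 3.0),
                        distributed.MatrixEntry(1, 0, 4.0)])
product = naive_multiplication_rdd(mat_a, mat_b)
print(product.collect())  # expected: one entry (0, 0) with value 2*3 + 1*4 = 10.0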
Code example #2
File: wikidata.py Project: Pinafore/qb
def extract_claims(wikidata_items: RDD, b_property_map: Broadcast, b_item_map: Broadcast):
    def parse_item_claims(item):
        item_id = item['id']
        item_map = b_item_map.value
        if item_id not in item_map:
            return []
        item_label = item_map[item_id]
        property_map = b_property_map.value
        if 'enwiki' in item['sitelinks']:
            title = item['sitelinks']['enwiki']['title']
        else:
            title = None
        item_claims = []
        for property_id, property_claims in item['claims'].items():
            if property_id in property_map:
                property_name = property_map[property_id]
                for claim in property_claims:
                    mainsnak = claim['mainsnak']
                    if 'datatype' in mainsnak and 'datavalue' in mainsnak:
                        datatype = mainsnak['datatype']
                        datavalue = mainsnak['datavalue']
                        if datatype in datatype_parsers:
                            wiki_object = datatype_parsers[datatype](datavalue)
                            if wiki_object is not None:
                                item_claims.append(
                                    Claim(item_label, property_name, wiki_object, datatype, title, property_id, item_id)
                                )
        return item_claims

    return wikidata_items.flatMap(parse_item_claims)
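
A hypothetical usage sketch for the function above. The broadcast arguments are assumed to map Wikidata property/item IDs to human-readable names, datatype_parsers and Claim are assumed to come from the surrounding module (they are not shown here), and the dump path is illustrative.

import json
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
b_property_map = sc.broadcast({'P31': 'instance of'})
b_item_map = sc.broadcast({'Q42': 'Douglas Adams'})
# Each line of the dump is assumed to be one JSON-encoded Wikidata item.
wikidata_items = sc.textFile('wikidata-items.jsonl').map(json.loads)
claims = extract_claims(wikidata_items, b_property_map, b_item_map)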
Code example #3
def extract_claims(wikidata_items: RDD, b_property_map: Broadcast,
                   b_item_map: Broadcast):
    def parse_item_claims(item):
        item_id = item['id']
        item_map = b_item_map.value
        if item_id not in item_map:
            return []
        item_label = item_map[item_id]
        property_map = b_property_map.value
        if 'enwiki' in item['sitelinks']:
            title = item['sitelinks']['enwiki']['title']
        else:
            title = None
        item_claims = []
        for property_id, property_claims in item['claims'].items():
            if property_id in property_map:
                property_name = property_map[property_id]
                for claim in property_claims:
                    mainsnak = claim['mainsnak']
                    if 'datatype' in mainsnak and 'datavalue' in mainsnak:
                        datatype = mainsnak['datatype']
                        datavalue = mainsnak['datavalue']
                        if datatype in datatype_parsers:
                            wiki_object = datatype_parsers[datatype](datavalue)
                            if wiki_object is not None:
                                item_claims.append(
                                    Claim(item_label, property_name,
                                          wiki_object, datatype, title,
                                          property_id, item_id))
        return item_claims

    return wikidata_items.flatMap(parse_item_claims)
Code example #4
File: train_model.py Project: AlexFridman/code_temp
def train_model(data: RDD, l=1.0) -> MLNaiveBayesModel:
    aggregated = data.flatMap(lambda x:
                              [(label, x['features']) for label in x['labels']]) \
        .combineByKey(lambda v: (1, v),
                      lambda c, v: (c[0] + 1, c[1] + v),
                      lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])) \
        .sortBy(lambda x: x[0]) \
        .collect()
    num_labels = len(aggregated)
    num_documents = data.count()
    num_features = aggregated[0][1][1].size
    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels)  # log-priors are floats; an integer dtype would truncate them
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    i = 0
    for (label, (n, sum_term_freq)) in aggregated:
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom

        sum_term_freq_dense = sum_term_freq.toarray()
        theta_log_denom = math.log(sum_term_freq.sum() + num_features * l)
        theta[i, :] = np.log(sum_term_freq_dense + l) - theta_log_denom
        i += 1
    return MLNaiveBayesModel(labels, pi, theta)
Code example #5
File: wikidata.py Project: Pinafore/qb
def extract_item_page_map(wikidata_items: RDD):
    def parse_item_page(item):
        item_id = item['id']
        if 'enwiki' in item['sitelinks']:
            return [(item_id, item['sitelinks']['enwiki']['title'])]
        else:
            return []
    return wikidata_items.flatMap(parse_item_page).collectAsMap()
Code example #6
def extract_item_page_map(wikidata_items: RDD):
    def parse_item_page(item):
        item_id = item['id']
        if 'enwiki' in item['sitelinks']:
            return [(item_id, item['sitelinks']['enwiki']['title'])]
        else:
            return []

    return wikidata_items.flatMap(parse_item_page).collectAsMap()
Code example #7
def extract_item_page_map(wikidata_items: RDD):
    def parse_item_page(item):
        item_id = item["id"]
        if "enwiki" in item["sitelinks"]:
            return [(item_id, item["sitelinks"]["enwiki"]["title"])]
        else:
            return []

    return wikidata_items.flatMap(parse_item_page).collectAsMap()
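
A small hypothetical usage sketch with made-up inline records; it only assumes that each item is a parsed Wikidata JSON dict with "id" and "sitelinks" keys, which is exactly what the function reads.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
wikidata_items = sc.parallelize([
    {'id': 'Q42', 'sitelinks': {'enwiki': {'title': 'Douglas Adams'}}},
    {'id': 'Q99999999', 'sitelinks': {}},  # no enwiki page -> dropped
])
print(extract_item_page_map(wikidata_items))  # {'Q42': 'Douglas Adams'}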
Code example #8
def extract_claim_types(wikidata_items: RDD):
    def parse_types(item):
        value_types = []
        for property_claims in item["claims"].values():
            for c in property_claims:
                mainsnak = c["mainsnak"]
                if "datatype" in mainsnak:
                    value_types.append(mainsnak["datatype"])
        return value_types

    return set(wikidata_items.flatMap(parse_types).distinct().collect())
Code example #9
File: wikidata.py Project: Pinafore/qb
def extract_claim_types(wikidata_items: RDD):
    def parse_types(item):
        value_types = []
        for property_claims in item['claims'].values():
            for c in property_claims:
                mainsnak = c['mainsnak']
                if 'datatype' in mainsnak:
                    value_types.append(mainsnak['datatype'])
        return value_types

    return set(wikidata_items.flatMap(parse_types).distinct().collect())
Code example #10
def extract_claim_types(wikidata_items: RDD):
    def parse_types(item):
        value_types = []
        for property_claims in item['claims'].values():
            for c in property_claims:
                mainsnak = c['mainsnak']
                if 'datatype' in mainsnak:
                    value_types.append(mainsnak['datatype'])
        return value_types

    return set(wikidata_items.flatMap(parse_types).distinct().collect())
Code example #11
 def __group_by_blocks(train: RDD, test: RDD, similarity: RDD,
                       n_bucket_block: int, n_cross_block: int,
                       n_item_block: int) -> (RDD, RDD, RDD):
     """
     :param train:
         RDD<(cross_block, item_block), (cross_bucket, item, rating)>
     :param test:
         RDD<(bucket_block, item_block), (bucket, item)>
     :param similarity:
         RDD<(bucket_block, cross_block), (bucket, cross_bucket, similarity)>
     """
     """
     train -> RDD<(b, bucket_block, item_block), ((bucket, item, rating), 0)>, b=1, ..., n_bucket_block
     test -> RDD<(bucket_block, b, item_block), ((bucket, item), 1)>, b=1, ..., n_cross_block
     similarity-> RDD<(bucket_block, bucket_block, i), ((bucket, bucket, similarity), 2)>, i=1, ..., n_item_block
     """
     train = train.flatMap(lambda u: (
         ((b, u[0][0], u[0][1]), (u[1], 0)) for b in range(n_bucket_block)))
     test = test.flatMap(lambda u: (
         ((u[0][0], c, u[0][1]), (u[1], 1)) for c in range(n_cross_block)))
     similarity = similarity.flatMap(lambda u: (
         ((u[0][0], u[0][1], i), (u[1], 2)) for i in range(n_item_block)))
     return train, test, similarity
Code example #12
    def __find_candidates(self, signature: RDD) -> RDD:
        """
        Generate candidates from signatures.
        :param signature: RDD<(Hashable, tuple<int>)>
        :return: RDD<(Hashable, Hashable)>
            Item pairs which are candidates for computing similarity later.
        """
        divide = self.__divide_signature
        generate = self.__generate_candidates
        n_bands, n_rows = self.__n_bands, self.__n_rows

        return signature.flatMap(lambda key_values: divide(
            *key_values, n_bands=n_bands, n_rows=n_rows)).aggregateByKey(
                tuple(), lambda u, v: u + (v, ),
                lambda u1, u2: u1 + u2).flatMap(
                    lambda key_values: generate(key_values[1])).distinct()
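
The two private helpers are not shown in this listing. Purely as an assumption-laden sketch of what they plausibly do, given how they are called above (split each signature into bands, bucket items by band contents, then emit candidate pairs within each bucket):

from itertools import combinations

def divide_signature(key, signature, n_bands, n_rows):
    # Split the signature into n_bands bands of n_rows values and key the
    # item by (band index, band contents).  (Hypothetical reconstruction.)
    for band in range(n_bands):
        chunk = tuple(signature[band * n_rows:(band + 1) * n_rows])
        yield (band, chunk), key

def generate_candidates(keys):
    # Every pair of items sharing a band bucket becomes a candidate pair.
    return combinations(keys, 2)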
Code example #13
def extract_claims(wikidata_items: RDD, b_property_map: Broadcast,
                   b_item_map: Broadcast):
    def parse_item_claims(item):
        item_id = item["id"]
        item_map = b_item_map.value
        if item_id not in item_map:
            return []
        item_label = item_map[item_id]
        property_map = b_property_map.value
        if "enwiki" in item["sitelinks"]:
            title = item["sitelinks"]["enwiki"]["title"]
        else:
            title = None
        item_claims = []
        for property_id, property_claims in item["claims"].items():
            if property_id in property_map:
                if property_id not in property_map:
                    continue
                property_name = property_map[property_id]
                for claim in property_claims:
                    mainsnak = claim["mainsnak"]
                    if "datatype" in mainsnak and "datavalue" in mainsnak:
                        datatype = mainsnak["datatype"]
                        datavalue = mainsnak["datavalue"]
                        if datatype in datatype_parsers:
                            wiki_object = datatype_parsers[datatype](datavalue)
                            if wiki_object is not None:
                                item_claims.append(
                                    Claim(
                                        item_label,
                                        property_name,
                                        wiki_object,
                                        datatype,
                                        title,
                                        property_id,
                                        item_id,
                                    ))
        return item_claims

    return wikidata_items.flatMap(parse_item_claims)
Code example #14
File: utils.py Project: rdguerrerom/drudge
def _nest_bind_no_balance(rdd: RDD, func):
    """Nest the flat map of the given function without load balancing.
    """
    def wrapped(obj):
        """Wrapped function for nest bind."""
        curr = [obj]
        res = []
        while len(curr) > 0:
            new_curr = []
            for i in curr:
                step_res = func(i)
                if step_res is None:
                    res.append(i)
                else:
                    new_curr.extend(step_res)
                continue
            curr = new_curr
            continue

        return res

    return rdd.flatMap(wrapped)
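
A hypothetical usage example, assuming the contract the wrapper relies on: func returns None when an element is terminal, and an iterable of new elements to keep expanding otherwise.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

def halve_if_even(n):
    # Terminal when odd; otherwise expand into two halves.
    return None if n % 2 else [n // 2, n // 2]

rdd = sc.parallelize([3, 4])
print(_nest_bind_no_balance(rdd, halve_if_even).collect())  # [3, 1, 1, 1, 1]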
Code example #15
    def __call__(self, rdd: RDD, **kwargs: Any) -> RDD:
        """
        Performs a single step of an algorithm, running all operations in sequence
        and ensuring data is partitioned correctly.

        Any additional keyword arguments passed to this function will be available
        in all life-cycle functions of the step:
        - `group`
        - `emit_by_group`
        - `broadcast`
        - `step`

        **DO NOT OVERRIDE WHEN DEFINING CUSTOM STEPS.**
        """
        if rdd.getNumPartitions() != self._n_partitions:
            rdd = rdd.repartition(self._n_partitions)

        step_cls: Type[Step] = self.__class__
        rdd = step_cls.group(
            rdd, **kwargs
        ).cache()  # cache because we use it twice (emit and step)

        def unwrap_emit(kv: Tuple[Any, Iterable[Any]]) -> Optional[Tuple[Any, Any]]:
            k, v = kv
            new_v = step_cls.emit_by_group(k, v, **kwargs)
            return new_v

        emitted = list(rdd.map(unwrap_emit, preservesPartitioning=True).collect())
        to_broadcast = step_cls.broadcast(emitted, **kwargs)
        broadcast: Broadcast = self._sc.broadcast(to_broadcast)

        def unwrap_step(kv: Tuple[Any, Iterable[Any]]) -> Iterable[Any]:
            k, v = kv
            for new_v in step_cls.step(k, v, broadcast, **kwargs):
                yield new_v

        rdd = rdd.flatMap(unwrap_step, preservesPartitioning=True)
        return rdd
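
For illustration only, a minimal sketch of what a concrete Step subclass could look like, inferred solely from how __call__ invokes the hooks (group, emit_by_group, broadcast, step); the real base class may define these differently, e.g. as classmethods.

class NormalizeByGlobalTotal(Step):
    """Hypothetical step: divide each group's sum by the global total."""

    @staticmethod
    def group(rdd, **kwargs):
        # Group raw (key, value) pairs by key.
        return rdd.groupByKey()

    @staticmethod
    def emit_by_group(k, v, **kwargs):
        # Emit each group's local sum so the driver can collect it.
        return k, sum(v)

    @staticmethod
    def broadcast(emitted, **kwargs):
        # Combine the collected sums into one global total to share.
        return sum(total for _, total in emitted)

    @staticmethod
    def step(k, v, broadcast, **kwargs):
        # Yield each group's share of the broadcast global total.
        yield k, sum(v) / broadcast.value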
Code example #16
 def __call__(self, rows: RDD) -> RDD:
     return rows.flatMap(self.deserialize_uast)
Code example #17
def train_glove(spark: SparkContext,
                word_cooc: RDD,
                num_iterations=100,
                vector_size=10,
                learning_rate=0.001,
                max_value=100,
                alpha=3. / 4) -> (Dict[str, Array], Dict[str, float]):
    """Train a glove model

  TODO: add option to initialize form existing parameters for continued training

  Parameters
  ----------
  spark : The Spark context of the session
  word_cooc :  The co-occurrence RDD of words, ([word, word], count)
  max_value :  The max value of the loss weighting. Counts higher then this do
    not have the loss applied to them
  num_iterations : the number of training iterations to run
  max_value : The maximum value where loss weighting is applied
  learning_rate : The learning rate of the vector
  alpha : Part of the loss weighting

  Returns
  -------
’
  """
    if num_iterations <= 0:
        raise ValueError(
            'The number of training iterations must be greater than 0')

    if (alpha > 1) or (alpha < 0):
        raise ValueError('Alpha should be between 0 and 1')

    # Model Hyper-parameters
    max_value_bc = spark.broadcast(max_value)
    learning_rate_bc = spark.broadcast(learning_rate)
    alpha_bc = spark.broadcast(alpha)

    # Get the unique words to initialize the parameter dicts
    unique_words = word_cooc.keys().flatMap(lambda x: x).distinct().collect()

    # Initialize the model parameters
    init_vectors, init_biases, init_vectors_grads, init_biases_grads = _initialize_parameters(
        unique_words, vector_size)

    # Broadcast the new model params
    word_vectors = spark.broadcast(init_vectors)
    word_biases = spark.broadcast(init_biases)
    word_vector_grads = spark.broadcast(init_vectors_grads)
    word_bias_grads = spark.broadcast(init_biases_grads)

    # Start training
    for i in range(1, num_iterations + 1):
        print('Iteration Number:', i)
        print('\tComputing Gradients...')
        # Compute the loss for every word co-occurrence
        updates = word_cooc.flatMap(lambda x: _gradient_update(
            x, word_vectors.value, word_vector_grads.value, word_biases.value,
            word_bias_grads.value, max_value_bc.value, learning_rate_bc.value,
            alpha_bc.value))

        # Collect gradients and sum over words
        aggregated_grads = updates.reduceByKey(
            lambda x, y: [x[i] + y[i] for i in range(4)]).collect()
        print('\tUpdating Params')

        # Separate update components
        updated_vectors = {}
        for word, grad in [(word, grad[0]) for word, grad in aggregated_grads]:
            updated_vectors[word] = word_vectors.value[word] - grad

        updated_biases = {}
        for word, grad in [(word, grad[1]) for word, grad in aggregated_grads]:
            updated_biases[word] = word_biases.value[word] - grad

        updated_vector_grads = {}
        for word, grad in [(word, grads[2])
                           for word, grads in aggregated_grads]:
            updated_vector_grads[word] = word_vector_grads.value[word] + grad

        updated_bias_grads = {}
        for word, grad in [(word, grads[3])
                           for word, grads in aggregated_grads]:
            updated_bias_grads[word] = word_bias_grads.value[word] + grad

        # Un-persist old values
        for bc_var in [
                word_vectors, word_vector_grads, word_biases, word_bias_grads
        ]:
            bc_var.unpersist()

        # Broadcast updates
        word_vectors = spark.broadcast(updated_vectors)
        word_biases = spark.broadcast(updated_biases)
        word_vector_grads = spark.broadcast(updated_vector_grads)
        word_bias_grads = spark.broadcast(updated_bias_grads)

    # noinspection PyUnboundLocalVariable
    return updated_vectors, updated_biases
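
A hypothetical usage sketch; the co-occurrence counts are made up, and _initialize_parameters and _gradient_update are assumed to be defined elsewhere in the same module. Keys are (word, word) tuples, matching the docstring and the keys().flatMap(lambda x: x) call above.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
word_cooc = sc.parallelize([
    (('the', 'cat'), 3.0),
    (('the', 'sat'), 2.0),
    (('cat', 'sat'), 1.0),
])
vectors, biases = train_glove(sc, word_cooc, num_iterations=5, vector_size=10)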
Code example #18
File: uast2bag_features.py Project: pareion/ml
 def __call__(self, rows: RDD):
     return rows.flatMap(self.process_row)
Code example #19
File: moder.py Project: shobrook/ml
 def call_func(self, rows: RDD):
     return rows.flatMap(self.extract_functions_from_row)
Code example #20
def flatten_trips(
    rdd: RDD,
    airport: AirportFinder,
    air_info,
    sticky: float = float("inf"),
    time_threshold: float = float("inf")
) -> RDD:
    def flatmap_split_trips(record,
                            airport: AirportFinder = airport,
                            air_info=air_info,
                            sticky=sticky,
                            time_threshold=time_threshold):
        """ Split the record for each plane in multiple records:
        One for each trip the plane took.
        """
        alt, talt, lat, long, speed, time = record.Alt, record.TargetAlt, record.Lat, record.Long, record.Speed, record.Time
        # Ideas: - Apply low frequency filter on the altitude to
        #          remove noise from data ?
        #        - Do the same on Speed values ?

        # Get local minima of the altitude -> these might be take-offs and landings

        if len(time) <= 2:
            return []

        # local_alt_bool = np.array([a < local_alt for a, local_alt in zip(alt,
        #                                                                  [airport.local_max_alt(pos, air_info) for pos in zip(lat, long)])])

        # split_indices = [i for i, (a, b) in enumerate(zip(np.gradient(np.sign(np.gradient(alt))) > 0,
        #                                                   local_alt_bool))
        #                  if a and b]
        ta_max = 0.5 * max(talt)

        local_alt_bool = np.array([
            a < local_alt and ta < ta_max
            for a, ta, local_alt in zip(alt, talt, [
                airport.local_max_alt(pos, air_info) for pos in zip(lat, long)
            ])
        ])

        split_indices = [
            i for i, (a, b, c) in enumerate(
                zip(
                    np.gradient(np.sign(np.gradient(alt))) > 0,
                    np.gradient(np.sign(np.gradient(talt))) >= 0,
                    local_alt_bool)) if a and b and c
        ]

        # TODO: Here compute distance airport - position and validate the landing.
        # TODO: if too big time shifts appears -> filter out ?
        #                                      -> land and cut ?

        corresponding_airports = [
            airport.closest_airport((lat[i], long[i])) for i in split_indices
        ]

        couples = [(i, j)
                   for i, j in zip(split_indices[:-1], split_indices[1:])]

        trip_l = []
        for k, (i, j) in enumerate(couples):
            air_from = air_info[corresponding_airports[k]]
            air_to = air_info[corresponding_airports[k + 1]]

            # Get real trips only
            if air_from == air_to:
                continue

            # A trip must last a bit
            if time[j] - time[i] < time_threshold:
                continue

            # TODO: control here for monotonicity of lat, long and not too long trip
            #       -> thoughts for monotonicity:
            #               -> real monotonicity (bad)
            #               -> having a direction (between airports), make sure at
            #                  least 80% steps go into that direction
            #       -> cut into multiple trips in the middle where monotonicity fails!

            row = Row(
                Icao=record.Icao,
                Op=record.Op,
                Engines=record.Engines,
                Mil=record.Mil,
                Cou=record.Cou,
                Mdl=record.Mdl,
                From=air_from,
                To=air_to,
                Lat=lat[i:j + 1],
                Long=long[i:j + 1],
                Alt=alt[i:j + 1],
                TargetAlt=talt[i:j + 1],
                Speed=speed[i:j + 1],
                Time=time[i:j + 1],
            )
            trip_l.append(row)
        return trip_l

    rdd = rdd.flatMap(flatmap_split_trips)
    # print(f"Applied trips recognition, remains {rdd.count()} records")
    return rdd