def test_do_cartesian(self):
    """Every diagonal entry (i == j) of the cartesian similarity result is 1.0."""
    session = sql.SparkSession(self.sc)  # created for its side effect, as in setup
    row_rdd = self.sc.parallelize(self.test_data).map(
        lambda rec: pyspark.Row(
            id=rec[0], label=rec[1], vector=ml_linalg.DenseVector(rec[2])))
    features_df = row_rdd.toDF()
    cartesian = do_cartesian(sc=self.sc, df=features_df, id_col='id', feature_col='vector')
    diagonal_values = cartesian.filter(lambda e: e.i == e.j).map(lambda e: e.value).collect()
    for value in diagonal_values:
        self.assertEqual(1.0, value)
# Esempio n. 2 (snippet-site separator; the stray "0" was a scraped vote count)
    def asML(self):
        """
        Return this vector in the new mllib-local representation.

        No data is copied: the returned vector shares the underlying
        array with this one (only references are copied).

        :return: :py:class:`pyspark.ml.linalg.DenseVector`

        .. versionadded:: 2.0.0
        """
        shared_array = self.array
        return newlinalg.DenseVector(shared_array)
    def _extract_descriptors_from_data(data):
        """
        Extracts descriptors from a given data instance.

        :param data: A single data instance (label, image)
        :return: Tuple ``(label, ml.DenseVector)`` — the instance label and
                 its descriptors wrapped in a dense vector.
        """
        # NOTE(review): no `self` parameter and no visible @staticmethod
        # decorator — presumably a static helper; confirm outside this chunk.

        label = data[0]

        # NOTE(review): extract_descriptors_from_image (defined below in this
        # file) returns a *list* of (DenseVector, label) tuples, so wrapping
        # its result in ml.DenseVector looks suspect — verify against callers.
        return label, ml.DenseVector(
            FeatureExtractor.extract_descriptors_from_image(data))
    def extract_descriptors_from_image(data):
        """
        Extracts descriptors related to the key-points detected by the used
        algorithm (SIFT, SURF...).

        :param data: Tuple that contains (label, image data)
        :return: List of ``(ml.DenseVector, label)`` tuples, one per detected
                 key point (descriptor length depends on the algorithm used).
                 When no key points are detected, a single-element list
                 containing an empty DenseVector is returned (not ``None``,
                 contrary to what the original docstring claimed).
        """
        label = data[0]
        image = data[1]

        # detectAndCompute returns (key_points, descriptors); descriptors is
        # None when the detector finds no key points in the image.
        key_points, descriptors = FeatureExtractor._get_extraction_algorithm(
        ).detectAndCompute(image, None)

        if descriptors is None:
            # Preserve the (vector, label) shape for the empty case so the
            # caller can treat both outcomes uniformly.
            return [(ml.DenseVector([]), label)]

        # Comprehension instead of list(map(lambda ...)) — same result,
        # idiomatic and clearer (ruff C417).
        return [(ml.DenseVector(descriptor.tolist()), label)
                for descriptor in descriptors]
# Esempio n. 5 (snippet-site separator; the stray "0" was a scraped vote count)
    def get_features_from_descriptors(self, instance):
        """
        Assign each descriptor of *instance* to its cluster and build a
        feature vector counting how many descriptors land in each cluster.

        :param instance: An image instance with label and descriptors
        :return: Tuple of (dense feature vector, label)
        """
        label = instance[0][1]

        descriptors_df = self.spark.createDataFrame(instance, self.schema)
        clustered = self.cluster_model.transform(descriptors_df)

        prediction_rows = clustered.select("prediction").collect()
        cluster_ids = [int(row.prediction) for row in prediction_rows]

        counts = self._get_cluster_frequency(cluster_ids)

        return ml.DenseVector(counts), label
def _to_dense(x):
    """
    Best-effort conversion of *x* to a ``pyspark.ml.linalg.DenseVector``.

    :param x: A vector-like object exposing ``toArray()`` (e.g. a sparse
              MLlib vector); anything else is returned unchanged.
    :return: A ``DenseVector`` built from ``x.toArray()``, or *x* itself
             when no conversion is possible.
    """
    try:
        return ml_linalg.DenseVector(x.toArray())
    except AttributeError as exc:
        # Narrowed from a bare `except Exception`: only the expected
        # "x has no toArray()" case falls back to returning x unchanged;
        # unrelated failures now surface instead of being swallowed.
        print(exc)
        return x