def test_do_cartesian(self):
    # A SparkSession is created so that RDD.toDF() has an active session to use.
    spark_session = sql.SparkSession(self.sc)
    string_rdd = self.sc.parallelize(self.test_data).map(
        lambda x: pyspark.Row(id=x[0], label=x[1],
                              vector=ml_linalg.DenseVector(x[2])))
    string_df = string_rdd.toDF()
    test_demon = do_cartesian(sc=self.sc, df=string_df, id_col='id',
                              feature_col='vector')
    # Every vector has similarity 1.0 with itself, so all diagonal entries
    # (i == j) of the result must be exactly 1.0.
    check_diagonal = (test_demon
                      .filter(lambda x: x.i == x.j)
                      .map(lambda x: x.value)
                      .collect())
    for diag in check_diagonal:
        self.assertEqual(1.0, diag)
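# A minimal sketch of calling `do_cartesian` outside the test, inferred from the
# assertions above: the helper is assumed to take a DataFrame with an id column
# and an ml.linalg vector column, and to return an RDD of entries with `i`, `j`
# and `value` fields holding pairwise similarities.  The sample rows below are
# hypothetical.
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.ml import linalg as ml_linalg

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
sample_df = spark.createDataFrame([
    Row(id=0, label='a', vector=ml_linalg.DenseVector([1.0, 0.0])),
    Row(id=1, label='b', vector=ml_linalg.DenseVector([0.0, 1.0])),
])
similarities = do_cartesian(sc=sc, df=sample_df, id_col='id',
                            feature_col='vector')
# Off-diagonal entries (i != j) hold the similarity between distinct vectors.
print(similarities.filter(lambda e: e.i != e.j).collect())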
def asML(self):
    """
    Convert this vector to the new mllib-local representation.
    This does NOT copy the data; it copies references.

    :return: :py:class:`pyspark.ml.linalg.DenseVector`

    .. versionadded:: 2.0.0
    """
    return newlinalg.DenseVector(self.array)
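# Example of the mllib -> ml conversion described in the docstring above; per
# that docstring, only the reference to the underlying numpy array is handed
# over, the data itself is not copied.
from pyspark.mllib.linalg import DenseVector as MLlibDenseVector

old_vec = MLlibDenseVector([1.0, 2.0, 3.0])
new_vec = old_vec.asML()                 # pyspark.ml.linalg.DenseVector
print(new_vec.array is old_vec.array)    # True: both wrap the same buffer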
def _extract_descriptors_from_data(data):
    """
    Extracts descriptors from a given data instance.

    :param data: A single data instance (label, image)
    :return: Tuple of (label, dense vector of extracted descriptors)
    """
    label = data[0]
    return label, ml.DenseVector(
        FeatureExtractor.extract_descriptors_from_image(data))
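# A hedged sketch of wiring the helper above into a Spark job: the input RDD
# elements are assumed to be (label, image) pairs, with images loaded as numpy
# arrays via OpenCV.  The file paths, the labelling scheme and the helper being
# reachable at module level are all assumptions for illustration.
import cv2
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
image_paths = ['cats/001.jpg', 'dogs/002.jpg']   # hypothetical files
labelled_images = sc.parallelize(
    [(path.split('/')[0], cv2.imread(path, cv2.IMREAD_GRAYSCALE))
     for path in image_paths])
labelled_descriptors = labelled_images.map(_extract_descriptors_from_data)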
def extract_descriptors_from_image(data):
    """
    Extracts descriptors related to the key points detected by the
    configured algorithm (SIFT, SURF...).

    :param data: Tuple that contains (label, image data)
    :return: List of (dense descriptor, label) tuples (the descriptor length
        varies depending on the algorithm used), or a single tuple with an
        empty descriptor if no key points have been detected
    """
    label = data[0]
    image = data[1]
    key_points, descriptors = (FeatureExtractor._get_extraction_algorithm()
                               .detectAndCompute(image, None))
    if descriptors is None:
        # No key points detected: return an empty descriptor for this label.
        return [(ml.DenseVector([]), label)]
    dense_descriptors = [(ml.DenseVector(descriptor.tolist()), label)
                         for descriptor in descriptors]
    return dense_descriptors
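# A standalone sketch of the detect-and-compute step the method wraps above,
# assuming `_get_extraction_algorithm()` returns an OpenCV detector such as ORB
# (SIFT/SURF require an opencv-contrib build).  The image file name is
# hypothetical.
import cv2

image = cv2.imread('sample.jpg', cv2.IMREAD_GRAYSCALE)
detector = cv2.ORB_create()
key_points, descriptors = detector.detectAndCompute(image, None)
# For ORB, `descriptors` is either None (no key points found) or an
# (n_keypoints x 32) numpy array; each row becomes one DenseVector above.
print(None if descriptors is None else descriptors.shape)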
def get_features_from_descriptors(self, instance):
    """
    For a given instance, assigns each descriptor to its cluster and counts
    how many descriptors fall into each cluster (the features).

    :param instance: An image instance with label and descriptors
    :return: Tuple of (dense feature vector, label)
    """
    label = instance[0][1]
    instance_df = self.spark.createDataFrame(instance, self.schema)
    cluster_predictions = self.cluster_model.transform(instance_df)
    predictions_column = cluster_predictions.select("prediction")
    predictions = [int(row.prediction)
                   for row in predictions_column.collect()]
    frequencies = self._get_cluster_frequency(predictions)
    return ml.DenseVector(frequencies), label
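# A hedged sketch of the clustering setup this method relies on: descriptors
# pooled from all training images are clustered with KMeans, and the fitted
# model plays the role of `self.cluster_model` while the schema plays the role
# of `self.schema`.  The column names, the value of k and the pooled
# descriptor list are illustrative assumptions, not the project's actual
# configuration.
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType
from pyspark.ml import linalg as ml
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.clustering import KMeans

spark = SparkSession.builder.getOrCreate()
schema = StructType([StructField('features', VectorUDT(), False),
                     StructField('label', StringType(), False)])
# Illustrative stand-in for the descriptors pooled from all training images.
pooled_descriptors = [(ml.DenseVector([0.1, 0.2]), 'cats'),
                      (ml.DenseVector([0.8, 0.9]), 'dogs')]
descriptor_df = spark.createDataFrame(pooled_descriptors, schema)
# k is illustrative; a real bag-of-visual-words model would use many more.
cluster_model = KMeans(k=2, featuresCol='features').fit(descriptor_df)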
def _to_dense(x):
    # Convert anything exposing toArray() (e.g. an mllib or sparse vector)
    # into a pyspark.ml DenseVector; return the input unchanged on failure.
    try:
        return ml_linalg.DenseVector(x.toArray())
    except Exception as e:
        print(e)
        return x
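# Quick illustration of `_to_dense`: anything exposing toArray() (e.g. an old
# mllib SparseVector) is converted to a pyspark.ml DenseVector; anything else
# trips the except branch and is returned unchanged.
from pyspark.mllib.linalg import SparseVector

print(_to_dense(SparseVector(4, {1: 3.0, 3: 5.0})))  # dense vector [0.0, 3.0, 0.0, 5.0]
print(_to_dense('not a vector'))                     # prints the error, returns the input as-is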