def findSynonyms(self, word, num):
    """
    Find "num" number of words closest in similarity to "word".
    word can be a string or vector representation.
    Returns a dataframe with two fields word and similarity (which
    gives the cosine similarity).
    """
    if not isinstance(word, basestring):
        word = _convert_to_vector(word)
    return self._call_java("findSynonyms", word, num)
def findSynonymsArray(self, word, num):
    """
    Find "num" number of words closest in similarity to "word".
    word can be a string or vector representation.
    Returns an array with two fields word and similarity (which
    gives the cosine similarity).
    """
    if not isinstance(word, basestring):
        word = _convert_to_vector(word)
    tuples = self._java_obj.findSynonymsArray(word, num)
    return list(map(lambda st: (st._1(), st._2()), list(tuples)))
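# The two methods above come from PySpark's Word2VecModel (the `basestring`
# check is a leftover of the Python 2 code path; on Python 3 it is effectively
# `str`). A quick, illustrative way to exercise both is to fit a tiny model and
# query it. This sketch assumes an active `sqlContext` (as in the benchmark
# snippet further below); the toy corpus and vector size are made up.
from pyspark.ml.feature import Word2Vec

doc = sqlContext.createDataFrame(
    [("the quick brown fox".split(),),
     ("the lazy brown dog".split(),)],
    ["text"])

word2vec = Word2Vec(vectorSize=5, minCount=1, inputCol="text", outputCol="vecs")
model = word2vec.fit(doc)

# DataFrame with columns "word" and "similarity" (cosine similarity).
model.findSynonyms("brown", 2).show()

# Same lookup, returned as a Python list of (word, similarity) tuples.
print(model.findSynonymsArray("brown", 2))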
import numpy as np
from scipy.sparse import csc_matrix
from pyspark.ml.linalg import _convert_to_vector


def unpack_bus_attributes(row):
    """Unpacks business attributes and assigns each one an index value."""
    # List to store the encoded business attributes.
    unpacked = list()
    # Unpack all attributes except PriceRange and Parking.
    # `bus_attributes` is assumed to be defined elsewhere as the list of
    # boolean attribute column names.
    temp = [row[s] for s in bus_attributes]
    # Process PriceRange.
    try:
        priceRange = int(row["RestaurantsPriceRange2"])
    except (TypeError, ValueError):
        # If no price range is specified, default to 2.
        priceRange = 2
    # Process Parking.
    try:
        parking = 1 if row["BusinessParking"].find("True") != -1 else -1
    except AttributeError:
        parking = 0
    # Process WiFi.
    if row["WiFi"] == "no" or row["WiFi"] == "u'no'":
        wifi = -1
    elif row["WiFi"] is None:
        wifi = 0
    else:
        wifi = 1
    # Tokenize all boolean attributes.
    for i in temp:
        if i == "True":
            unpacked.append(1)
        elif i == "False":
            unpacked.append(-1)
        else:
            unpacked.append(0)
    # Append the WiFi, Parking and PriceRange attributes.
    unpacked.append(wifi)
    unpacked.append(parking)
    unpacked.append(priceRange)
    # Print any arrays that are not of the desired length (30).
    if len(unpacked) != 30:
        print(unpacked)
    return _convert_to_vector(
        csc_matrix(np.asarray(unpacked).astype(float)).T)
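# Minimal, illustrative exercise of unpack_bus_attributes. The attribute names
# and the sample row below are invented for this sketch; the real
# `bus_attributes` list presumably holds 27 boolean Yelp attribute columns, so
# that together with WiFi, Parking and PriceRange the vector has 30 entries.
bus_attributes = ["GoodForKids", "OutdoorSeating", "HasTV"]  # illustrative subset

sample_row = {
    "GoodForKids": "True",
    "OutdoorSeating": "False",
    "HasTV": None,
    "RestaurantsPriceRange2": "3",
    "BusinessParking": "{'garage': False, 'street': True}",
    "WiFi": "u'free'",
}

# With only 3 boolean attributes the function also prints the raw list,
# because its length (6) differs from the expected 30.
features = unpack_bus_attributes(sample_row)
print(features)  # SparseVector built from [1, -1, 0, 1, 1, 3]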
import scipy.sparse
from pyspark.ml.linalg import _convert_to_vector


def dense_to_sparse(vector):
    """Converts a dense MLlib vector into its sparse representation."""
    return _convert_to_vector(scipy.sparse.csc_matrix(vector.toArray()).T)
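# Quick illustrative check of dense_to_sparse: the dense vector is routed
# through a SciPy column matrix (which drops the zeros) and comes back as a
# SparseVector. Assumes the pyspark.ml.linalg vector types, as imported above.
from pyspark.ml.linalg import Vectors

dv = Vectors.dense([0.0, 1.0, 0.0, 3.0])
sv = dense_to_sparse(dv)
print(sv)  # (4,[1,3],[1.0,3.0])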
import time

import numpy as np
import tensorframes as tfs
from pyspark.ml.linalg import VectorUDT, _convert_to_vector
from pyspark.sql.types import Row, StructField, StructType

# `sqlContext` is assumed to be provided by the surrounding Spark shell or notebook.

# Small vectors
num_features = 100
# The number of clusters
k = 10
num_points = 100000
num_iters = 10
FEATURES_COL = "features"

np.random.seed(2)
np_data = [x.tolist()
           for x in np.random.uniform(0.0, 1.0, size=(num_points, num_features))]

schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)])
mllib_rows = [Row(_convert_to_vector(x)) for x in np_data]
mllib_df = sqlContext.createDataFrame(mllib_rows, schema).coalesce(1).cache()

df = sqlContext.createDataFrame([[r] for r in np_data]).toDF(FEATURES_COL).coalesce(1)
# For now, analysis is still required. We cache the output because we are going
# to perform multiple runs on the dataset.
df0 = tfs.analyze(df).cache()

mllib_df.count()
df0.count()

np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0
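# The setup above prepares two copies of the same data: `mllib_df` with MLlib
# vectors and the analyzed TensorFrames DataFrame `df0` (aliased as `dataframe`),
# plus `start_centers` for a custom k-means loop. As a point of comparison, one
# way to time a baseline is to run the built-in pyspark.ml KMeans on the MLlib
# copy. This is only an illustrative sketch; the ML estimator uses its own
# initialization rather than `start_centers`.
from pyspark.ml.clustering import KMeans

start = time.time()
kmeans_model = KMeans(k=k, maxIter=num_iters,
                      featuresCol=FEATURES_COL, seed=2).fit(mllib_df)
print("pyspark.ml KMeans finished in %.2f s" % (time.time() - start))
print("first center (truncated):", kmeans_model.clusterCenters()[0][:5])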