def obtem_dados_parquet():
    pass


if __name__ == "__main__":
    configuracao = (SparkConf().set("spark.driver.maxResultSize", "2g"))
    spark = (SparkSession.builder.config(conf=configuracao)
             .appName(NOME_JOB).enableHiveSupport().getOrCreate())
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    to_value = lambda v: float(v.replace(",", "."))
    udf_to_value = F.udf(to_value, FloatType())

    caminho_csv = "C:/Users/francoise.moreira/OneDrive - JM Confitec Sistemas de Computação LTDA/ESTUDO/SPARK_JUPITER_PANDA_PYTHON/lerParquet e tratar dados/dadosCSV/2021_Pagamento.csv"
    caminho_parquet = "C:/Users/francoise.moreira/OneDrive - JM Confitec Sistemas de Computação LTDA/ESTUDO/SPARK_JUPITER_PANDA_PYTHON/lerParquet e tratar dados/dadosParquet/"

    df = obtem_dados_csv(caminho_csv, ";", "true", "true")

    # NOTE: cast the received columns to string
    DadosGoverno = (df.transform(lambda df: df.withColumn(
        "id_processo",
        F.col("Identificador do processo de viagem").cast(StringType())
    )).transform(lambda df: df.withColumn(
        "proposta",
        F.col("Número da Proposta (PCDP)").cast(StringType())
    )).transform(lambda df: df.withColumn(
        "cod_orgao_sup",
class ProtoDash(): def create_spark_session(): global sc sc = SparkContext.getOrCreate() sc.setLogLevel("ERROR") return SparkSession(sc) def create_vec_rdd(X, part=4): """ Function returning a DenseVector RDD from a dataset X. Args: -X: a dataset with rows corresponding to observations and columns corresponding to features. -part: n of partitions. Returns: the RDD for X """ # creating a Spark session spark_session = ProtoDash.create_spark_session() X_rdd = (sc.parallelize( X, part).map(lambda x: DenseVector(x)).zipWithIndex()) return X_rdd def mean_inner_product(inp, sigma, n): """ Function computing the gaussian kernel inner product of a vector in Y vs. a vector in X, divided by n the total number of observations in X. """ index_1 = inp[0][1] inner_product = float( np.exp(inp[0][0].squared_distance(inp[1][0]) / (-2 * sigma**2)) / n) return (index_1, inner_product) def inner_product(inp, sigma): """ Function computing the gaussian kernel inner product of a vector vs. another. """ index_1 = inp[0][1] index_2 = inp[1][1] inner_product = float( np.exp(inp[0][0].squared_distance(inp[1][0]) / (-2 * sigma**2))) return (index_1, [(index_2, inner_product)]) def weighted_sum(inp, w_arr): """ compute the weighted sum of matrix values for a set of indices and weights. Note it is fine using a list comprehension here since the number of prototypes m << |X^{(1)}|. """ return float(np.sum(np.array([x[1] for x in inp]) * w_arr)) def udf_weighted_sum(w_arr): """ UDF instance of the weighted_sum function. """ return F.udf(lambda l: ProtoDash.weighted_sum(l, w_arr)) def merge_lists(x, y): """ merge lists. """ return sorted(x + y, key=lambda tup: tup[0]) # Create UDF corresponding to merge_lists function. DType = ArrayType( StructType( [StructField("_1", LongType()), StructField("_2", FloatType())])) udf_merge_lists = F.udf(merge_lists, DType) def optimize(K, u, opt_w0, init_val, max_w=10000): """ Function solving quadratic optimization problem. Args: -K: inner product matrix -u: mean inner product of each prototype -opt_w0: initial weights vector -init_val: starting run -max_w: an upper bound on weight value Returns: -weights and the objective value """ dim = u.shape[0] low_b = np.zeros((dim, 1)) upper_b = max_w * np.ones((dim, 1)) x_init = np.append(opt_w0, init_val / K[dim - 1, dim - 1]) G = np.vstack((np.identity(dim), -1 * np.identity(dim))) h = np.vstack((upper_b, -1 * low_b)) # solve constrained quadratic problem soltn = solve_qp(K, -u, G, h, A=None, b=None, solver='cvxopt', initvals=x_init) # calculate the objective function value for optimal solution x_sol = soltn.reshape(soltn.shape[0], 1) q = -u.reshape(u.shape[0], 1) obj_value = 1 / 2 * np.matmul(np.matmul(x_sol.T, K), x_sol) + np.matmul(q.T, x_sol) return (soltn, obj_value[0, 0]) def ProtoDashAlgoritm(X, Y, m, sigma, partitions=20, verbose=True): """ Implementation of the ProtoDash algorithm Args: -X (RDD of indexed DenseVector rows): Target dataset/ the dataset to be represented. -Y (RDD of indexed DenseVector rows): Source dataset/ the dataset to select prototypes from. -m (integer): total number of prototypes to select. -sigma (strictly positive float): gaussian kernel parameter. -partitions (integer): number of RDD partitions to compute inner product RDDs with. -verbose (boolean): whether or not to print the cumulative number of prototypes selected at each iteration. Returns: -L (integer list): the set of indices corresponding to selected prototypes. -w (float list): the optimal set of weights corresponding to each selected prototype. 
""" # get count of observations in X n_X = X.count() # build mu DataFrame mu_df = (Y.cartesian(X).map(lambda x: ProtoDash.mean_inner_product( x, sigma, n_X)).reduceByKey(lambda x, y: x + y).toDF(["obs", "mu"])) # initialise key variables L = np.zeros(m, dtype=int) # set of prototype indices L w = np.zeros(m, dtype=float) # set of optimal prototype weights f_eval = np.zeros( m, dtype=float) # set of the f(w) eval. at prototype selection n_L = 0 # count of prototypes selected so far # find the index corresponding to the maximum mu value max_grad_0 = mu_df.orderBy(F.desc("mu")).limit(1).collect()[0] # collect values L[n_L] = max_grad_0.obs w[n_L] = max_grad_0.mu f_eval[n_L] = 1 / 2 * max_grad_0.mu**2 n_L += 1 # select the row of Y corresponding to the first chosen index Y_row_j0 = Y.filter(lambda x: x[1] == L[:n_L]).collect()[0] # take its inner product with all rows of Y to build the starting K dataframe K_init_df = (Y.map(lambda x: ProtoDash.inner_product( (x, Y_row_j0), sigma)).toDF(["obs", "K"])) # join mu and K dataframes join_df = (mu_df.join(K_init_df, "obs").repartition(partitions)) # cache join_df as it is reused often join_df.cache() # compute the new gradient vector grad_df = (join_df.withColumn( "K_weighted", ProtoDash.udf_weighted_sum(w[:n_L])(F.col("K"))).withColumn( "grad", F.col("mu") - F.col("K_weighted")).select("obs", "grad")) # begin while loop while n_L < m: # remove the rows that have an index already included in L grad_df = grad_df.filter( ~grad_df.obs.isin([int(x) for x in L[:n_L]])) # find the row that has the maximum value in the filtered gradient vector argmax_grad = grad_df.orderBy(F.desc("grad")).limit(1).collect()[0] # update L L[n_L] = argmax_grad.obs # select the row of Y corresponding to the chosen index Y_row_j = Y.filter(lambda x: x[1] == L[n_L]).collect()[0] # take its inner product with all rows of Y to build new K K_int_df = (Y.map(lambda x: ProtoDash.inner_product( (x, Y_row_j), sigma)).toDF(["obs", "new_K_col"])) # add new K col to previous K col join_df = (join_df.join(K_int_df, "obs").withColumn( "K_merged", ProtoDash.udf_merge_lists(F.col("K"), F.col("new_K_col"))).select( "obs", "mu", "K_merged").withColumnRenamed( "K_merged", "K")) # cache new joined_df join_df.cache() # increment n_L n_L += 1 # sort L L[:n_L] = sorted(L[:n_L]) if verbose is True and n_L % 5 == 0: print("Prototypes selected - " + str(n_L)) # take max gradient val. max_grad = argmax_grad.grad # filter join dataframe for given indices in L filt_df = (join_df.filter( join_df.obs.isin([int(x) for x in L[:n_L] ])).orderBy(F.col("obs").asc())) # take mu vector mu_arr = np.array(filt_df.select("mu").collect(), dtype=float) # take K matrix K_mat = np.array( filt_df.rdd.map(lambda x: [y[1] for y in x[2]]).collect(), dtype=float) # find optimal weights for the index set L opt_res = ProtoDash.optimize(K_mat, mu_arr, w[:n_L - 1], max_grad) (w[:n_L], f_eval[n_L - 1]) = opt_res[0], -opt_res[1] # compute gradient vector with new optimal weights grad_df = (join_df.withColumn( "K_weighted", ProtoDash.udf_weighted_sum(w[:n_L])(F.col("K"))).withColumn( "grad", F.col("mu") - F.col("K_weighted")).select("obs", "grad")) # tuple of indices and their corresponding weight, sorted by weight in descending order. 
res = sorted([(w[i], L[i]) for i in range(m)], key=lambda tup: -tup[0]) # return tuple of index set L and optimal weight set w, set of f_eval return res, f_eval ####################################################### ######### RDF IMPLEMENTATION ########### ####################################################### rdf_dataset = None numeric_dataset = None subject_index = {} predicate_index = {} object_index = {} def infer_index(token, token_indices): """ Enumerate the distinct tokens. If the token is found in the token_indices, then return it, else assign the next integer number (after the last assigned index) which is also the size of the token_indices. """ if token in token_indices: return token_indices[token] else: token_index = len(token_indices) token_indices[token] = token_index return token_index def convert_rdf_to_ntriples(dataset): """ Loads rdf data and converts into n-triples, treating each triple as a datapoint in the dataset. """ g = Graph() g.load(dataset) rem_object = URIRef( "http://www.w3.org/2002/07/owl#NamedIndividual" ) # deleting the triples that have object value as 'owl#NamedIndividual' for s, p, o in g: g.remove((s, p, rem_object)) global rdf_dataset global numeric_dataset # create n-triples of strings rdf_dataset = [(str(s), str(p), str(o)) for s, p, o in g] # preprocess and create a numeric dataset in order to input to ProtoDash numeric_dataset = list( map( lambda e: (ProtoDash.infer_index(e[0], ProtoDash.subject_index), ProtoDash.infer_index(e[1], ProtoDash.predicate_index), ProtoDash.infer_index(e[2], ProtoDash.object_index)), rdf_dataset)) #print(rdf_dataset) #print('************************************') #print('Size of dataset:', len(rdf_dataset)) #print('Subjects cardinality:', len(subject_index)) #print('Predicates cardinality:', len(predicate_index)) #print('Objects cardinality:', len(object_index)) print('************************************') return numeric_dataset def strip_rdf_prefix(triple): """ Strips the common URL-like prefixes from the RDF data and takes the suffix after '#'. Example: Input triple: ('http://www.semanticweb.org/vinu/ontologies/2014/6/untitled-ontology-91#naomi_watts', 'http://www.semanticweb.org/vinu/ontologies/2014/6/untitled-ontology-91#acted_in', 'http://www.semanticweb.org/vinu/ontologies/2014/6/untitled-ontology-91#rabbits') Output: naomi_watts acted_in rabbits """ return ' '.join(tuple(map(lambda e: e[e.find('#') + 1:], triple))) def get_sample_index(dataset, sample): # Function returns the index of the triple in the dataset global dataset_rdd dataset_rdd = ProtoDash.convert_rdf_to_ntriples(dataset) index_list = [x for x, y in enumerate(rdf_dataset)] for i in range(len(rdf_dataset)): if rdf_dataset[i] == sample: return index_list[i] def get_rdf_prototypes(dataset, sample_triple, num_proto): # Index of the sample from the dataset to be given to the ProtoDash as dataset to be explained # These prototypes that come out of ProtoDash can be thought as the cluster that this sample belongs to. # Or vice versa, the sampled datapoint can be thought as cluster centroid, and the explaining prototypes # as the data that belong to that cluster. 
sample_index = ProtoDash.get_sample_index(dataset, sample_triple) if sample_index is not None: # Create a target dataset comprising of the selected sample target = [numeric_dataset[sample_index]] # Create a source dataset comprising of all triples but the selected sample source = numeric_dataset[:sample_index] + numeric_dataset[ sample_index + 1:] # Convert the datasets to PySpark RDDs target_rdd = ProtoDash.create_vec_rdd(target) source_rdd = ProtoDash.create_vec_rdd(source) print('Starting ProtoDash on RDF') res, f = ProtoDash.ProtoDashAlgoritm(target_rdd, source_rdd, num_proto, 50, partitions=4, verbose=True)[:2] print('Finished ProtoDash on RDF') print('The chosen sample_index:', sample_index) # Raw RDF triples has a long common prefixes, for the sake presentation (to keep it short), # I strip the common long URL-like prefixes and take the suffix after '#' - the data that matters. stripped_target = ProtoDash.strip_rdf_prefix( rdf_dataset[sample_index]) # Print the target datapoint print('Target (sampled) datapoint: ', stripped_target) # create the Y and X axis of the plot # The result (res) that comes from the ProtoDash is a list of pairs of weight and index # I use the index find the triples from the raw dataset to be used X-axis # and the weights are used as Y-coordinates values = list(map(lambda e: e[0], res)) # e[0] is weight names = list(map(lambda e: rdf_dataset[e[1]], res)) # e[1] is index # strip the names to fit into the plot names = list(map(ProtoDash.strip_rdf_prefix, names)) plt.barh(names, values) plt.title(stripped_target) plt.show() else: print("Please enter a valid triple") def ProtoDashOnRDF(dataset, num_proto, sample_triple): # dataset: path to the file # num_proto: number of prototypes for ProtoDash to select # sample_triple: the sample_triple is string which refer to the triple if os.path.isfile(dataset): if num_proto.isdigit(): sample_triple = tuple(sample_triple.split(',')) ProtoDash.get_rdf_prototypes(dataset, sample_triple, int(num_proto)) else: print("Number of prototypes can be only integer") else: print("File do not exists") ####################################################### ######### Image IMPLEMENTATION ########### ####################################################### # collect MNIST train/test sets train_images = np.array(mnist.train_images(), dtype='float') train_labels = mnist.train_labels() test_images = np.array(mnist.test_images(), dtype='float') test_labels = mnist.test_labels() def create_target_set(labels, images, digit, target_n, percentage): """ This function creates a MNIST image dataset in which a specified percentage of the total observations correspond to a specific digit, while the remaining observations correspond to other randomly chosen digits. Args: -labels: the digit label for each MNIST image. -images: the MNIST image. -digit: a digit between 0 and 9. -target_n: the number of total observations required in the target dataset. -percentage: the percentage of images in the target dataset that correspond to the specified digit. Returns: -the target images. """ # take integer number of obs. 
corresponding to digit n_dig = int(np.floor(percentage * target_n)) # get indices corresponding to digit idx = np.where(labels == digit)[0] # reduce indices to specific % idx_red = idx[:n_dig] # slice images with index and reshape target_set_dig = images[idx_red, :] target_set_dig = np.reshape(target_set_dig, (target_set_dig.shape[0], 28 * 28)) # get remaining indices rem = target_n - n_dig rem_ind = np.setdiff1d(np.arange(len(labels)), idx_red)[:rem] # fill the remaining observations with images corresponding to other digits target_set_non_dig = images[rem_ind] target_set_non_dig = np.reshape(target_set_non_dig, (target_set_non_dig.shape[0], 28 * 28)) # create the dataset target_set = np.vstack((target_set_non_dig, target_set_dig)) # shuffle it arr = np.arange(target_n) np.random.shuffle(arr) return target_set def get_image_prototypes(num_proto, digit): part = 6 # number of Pyspark RDD partitions to use sigma = 50 # gaussian kernel parameter n_1 = 5420 # the number of observations in X_1 n_2 = 1500 # the number of observations in X_2 #percentages = [.3, .5, .7, .9, 1.] percentages = [ 1. ] # the percentage of X_1 that will correspond to the chosen digit # list of experiment results exp_1_res_list = [] # list of f_eval sequences exp_1_f_eval_list = [] # set source dataset and labels source_set = np.reshape(ProtoDash.test_images[:n_2], (n_2, 28 * 28)) # select the target datasets target_set = ProtoDash.create_target_set(ProtoDash.train_labels, ProtoDash.train_images, digit, n_1, 1) # convert target and source datasets to RDDs target_rdd = ProtoDash.create_vec_rdd(target_set, part) source_rdd = ProtoDash.create_vec_rdd(source_set, part) # collect the indices of m prototypes along with their ascribed weight res, f = ProtoDash.ProtoDashAlgoritm(target_rdd, source_rdd, num_proto, sigma, partitions=part, verbose=True)[:2] # collect the results exp_1_res_list.append(res) exp_1_f_eval_list.append(f) fig, axes = plt.subplots(num_proto, 1, figsize=(12, 10), squeeze=False) for i in range(num_proto): for j in range(len(percentages)): axes[i][j].imshow( np.reshape(source_set[exp_1_res_list[j][i][1], :], (28, 28))) axes[i][j].get_xaxis().set_ticks([]) axes[i][j].get_yaxis().set_ticks([]) fig.suptitle("\n".join( wrap( "Top %d prototypes selected by ProtoDash corresponding to the digit %d" % (num_proto, digit), 60)), fontsize=20) plt.show() spark.stop() def ProtoDashOnImage(digit, num_proto): # digit: the digit to be represented in the target dataset X_1 # num_proto: number of prototypes for ProtoDash to select if digit.isdigit() and 0 <= int(digit) <= 9: if num_proto.isdigit(): ProtoDash.get_image_prototypes(int(num_proto), int(digit)) else: print("Please enter an integer value for number of prototypes") else: print("Please enter a digit between 0-9")
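# Hypothetical driver sketch (not part of the original source): the two entry
# points above take their arguments as strings and validate them with isdigit(),
# so a minimal command-line style invocation could look like this.
if __name__ == "__main__":
    # Select 5 prototypes from the MNIST source set that best represent the digit 3.
    ProtoDash.ProtoDashOnImage("3", "5")

    # Alternatively, explain a single RDF triple. The file path and triple below are
    # placeholders; the triple is passed as one comma-separated string.
    # ProtoDash.ProtoDashOnRDF("data/example.owl", "5",
    #                          "http://example.org#subject,http://example.org#predicate,http://example.org#object")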
def programaPrincipal():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "ciclistas",
        help="Name of the csv file containing the list of cyclists")
    parser.add_argument(
        "rutas",
        help="Name of the csv file containing the list of routes")
    parser.add_argument(
        "actividades",
        help="Name of the csv file containing the list of activities")
    parser.add_argument("N",
                        help="Top N of cyclists per province to query",
                        type=int)
    args = parser.parse_args()

    ciclista_schema = StructType([
        StructField('cedula', IntegerType()),
        StructField('nombre_Completo', StringType()),
        StructField('provincia', StringType()),
    ])
    ciclista_df = spark.read.csv(args.ciclistas,
                                 schema=ciclista_schema,
                                 header=False)
    #ciclista_df.show()

    ruta_schema = StructType([
        StructField('codigo', IntegerType()),
        StructField('nombre_Ruta', StringType()),
        StructField('kilometros', FloatType()),
    ])
    ruta_df = spark.read.csv(args.rutas, schema=ruta_schema, header=False)
    #ruta_df.show()

    actividad_schema = StructType([
        StructField('codigo_Ruta', IntegerType()),
        StructField('cedula_Ciclista', IntegerType()),
        StructField('fecha', DateType()),
    ])
    actividad_df = spark.read.csv(args.actividades,
                                  schema=actividad_schema,
                                  header=False)
    #actividad_df.show()

    ciclista_actividad_ruta_df = tarea1_funciones.join_dataframes(
        ciclista_df, ruta_df, actividad_df)
    print("DataFrame containing the join of the 3 files: ciclista.csv, actividad.csv and ruta.csv:")
    ciclista_actividad_ruta_df.show()

    ciclistas_kilometros_df = tarea1_funciones.obtener_kilometros_por_ciclista(
        ciclista_actividad_ruta_df)
    print("Kilometers ridden per cyclist, per route, per province and per day:")
    ciclistas_kilometros_df.show()

    # Top N cyclists per province to retrieve
    N = args.N
    provincia_ciclistas_kilometros_total_df = tarea1_funciones.obtener_topN_ciclistas_por_provincia_en_total_de_kilometros(
        ciclistas_kilometros_df, N)
    print("Top", N, "cyclists per province, by total kilometers:")
    provincia_ciclistas_kilometros_total_df.show()

    provincia_ciclistas_kilometros_promedio_df = tarea1_funciones.obtener_topN_ciclistas_por_provincia_en_promedio_de_kilometros_por_dia(
        ciclistas_kilometros_df, N)
    print("Top", N, "cyclists per province, by average kilometers per day:")
    provincia_ciclistas_kilometros_promedio_df.show()

    top_N_ciclistas_por_provincia = tarea1_funciones.unir_dataframes_Top_N_ciclistas_por_provincia(
        provincia_ciclistas_kilometros_total_df,
        provincia_ciclistas_kilometros_promedio_df)
    print("Top", N, "cyclists per province, both by total kilometers and by average kilometers per day:")
    top_N_ciclistas_por_provincia.show()
remove_quotes = lambda airport_string: airport_string.replace('"', '') def is_filtered_airport(airport): return len(airport) == 3 and airport[0] == airportarg def split_airport_and_airport_string_into_origin_aiport_and_dest_airport_tuple( airport_and_airport_string, average_delay): [airport, carrier] = airport_and_airport_string.split(" ") return [airport, carrier, average_delay] top10AirportsPerAirportSchema = StructType([ StructField('Airport', StringType(), True), StructField('Carrier', StringType(), True), StructField('Average Departure Delay', FloatType(), True) ]) lines = file.rdd \ .cache() \ .keys() \ .map(lambda l: l.split("\t")) \ .map(lambda t: [remove_quotes(t[0]), float(t[1])]) \ .map(lambda t: split_airport_and_airport_string_into_origin_aiport_and_dest_airport_tuple(t[0], t[1])) \ .filter(lambda t: is_filtered_airport(t)) df = spark \ .createDataFrame(lines, schema=top10AirportsPerAirportSchema) \ .orderBy(["Airport", "Average Departure Delay"]) df.coalesce(1) \
) # In[23]: observation_with_bundle.registerTempTable("ob_with_bundle") observation_with_bundle_rdd = observation_with_bundle.rdd.coalesce( 16) #decrease num of partitions to make collection faster observation_with_bundle_rdd.persist() observation_with_bundle_rdd.getNumPartitions() # In[3]: from pyspark.sql.types import IntegerType, FloatType from pyspark.sql.functions import udf changetofloat = udf(lambda s: float(s), FloatType()) #change normalized_price to float. It was string type before. ob_with_bundle_float = observation_with_bundle.withColumn( "n_price_float", changetofloat(observation_with_bundle.normalized_price)) ob_with_bundle_float.registerTempTable('ob_with_bundle_float') # In[29]: df_zc_income_SD.printSchema() # get average of normalized price for each bundle based on normalized_size_units ob_with_bundle_float_group = ob_with_bundle_float.groupBy( 'bundle', 'normalized_size_units') ob_average = ob_with_bundle_float_group.avg('n_price_float').withColumnRenamed( "AVG(n_price_float)", "avg_nprice")
def test_select_subset_of_columns_as_entity_primary_keys( spark: SparkSession, composite_entity_schema: StructType, customer_feature_schema: StructType, ): entity_data = [ (1001, 8001, datetime(year=2020, month=9, day=2)), (2001, 8002, datetime(year=2020, month=9, day=2)), ] entity_df = spark.createDataFrame( spark.sparkContext.parallelize(entity_data), composite_entity_schema) feature_table_data = [ ( 1001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=2), 100.0, ), ( 2001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 400.0, ), ] feature_table_df = spark.createDataFrame( spark.sparkContext.parallelize(feature_table_data), customer_feature_schema) feature_table = FeatureTable( name="transactions", features=[Field("daily_transactions", "double")], entities=[Field("customer_id", "int32")], ) joined_df = as_of_join( entity_df, "event_timestamp", feature_table_df, feature_table, ) expected_joined_schema = StructType([ StructField("customer_id", IntegerType()), StructField("driver_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("transactions__daily_transactions", FloatType()), ]) expected_joined_data = [ ( 1001, 8001, datetime(year=2020, month=9, day=2), 100.0, ), ( 2001, 8002, datetime(year=2020, month=9, day=2), 400.0, ), ] expected_joined_df = spark.createDataFrame( spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema) assert_dataframe_equal(joined_df, expected_joined_df)
def test_historical_feature_retrieval(spark: SparkSession): test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data") entity_source = { "file": { "format": { "json_class": "CSVFormat" }, "path": f"file://{path.join(test_data_dir, 'customer_driver_pairs.csv')}", "event_timestamp_column": "event_timestamp", "options": { "inferSchema": "true", "header": "true" }, } } booking_source = { "file": { "format": { "json_class": "CSVFormat" }, "path": f"file://{path.join(test_data_dir, 'bookings.csv')}", "event_timestamp_column": "event_timestamp", "created_timestamp_column": "created_timestamp", "options": { "inferSchema": "true", "header": "true" }, } } transaction_source = { "file": { "format": { "json_class": "CSVFormat" }, "path": f"file://{path.join(test_data_dir, 'transactions.csv')}", "event_timestamp_column": "event_timestamp", "created_timestamp_column": "created_timestamp", "options": { "inferSchema": "true", "header": "true" }, } } booking_table = { "name": "bookings", "entities": [{ "name": "driver_id", "type": "int32" }], "features": [{ "name": "completed_bookings", "type": "int32" }], } transaction_table = { "name": "transactions", "entities": [{ "name": "customer_id", "type": "int32" }], "features": [{ "name": "daily_transactions", "type": "double" }], "max_age": 86400, } joined_df = retrieve_historical_features( spark, entity_source, [transaction_source, booking_source], [transaction_table, booking_table], ) expected_joined_schema = StructType([ StructField("customer_id", IntegerType()), StructField("driver_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("transactions__daily_transactions", FloatType()), StructField("bookings__completed_bookings", IntegerType()), ]) expected_joined_data = [ ( 1001, 8001, datetime(year=2020, month=9, day=2), 100.0, 300, ), ( 1001, 8002, datetime(year=2020, month=9, day=2), 100.0, 500, ), ( 1001, 8002, datetime(year=2020, month=9, day=3), None, 500, ), ( 2001, 8002, datetime(year=2020, month=9, day=3), None, 500, ), ( 2001, 8002, datetime(year=2020, month=9, day=4), None, 500, ), ] expected_joined_df = spark.createDataFrame( spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema) assert_dataframe_equal(joined_df, expected_joined_df)
StructField("RelatedImages",StringType(),True), StructField("SocialImageEmbeds",StringType(),True), StructField("SocialVideoEmbeds",StringType(),True), StructField("Quotations",StringType(),True), StructField("AllNames",StringType(),True), StructField("Amounts",StringType(),True), StructField("TranslationInfo",StringType(),True), StructField("Extras",StringType(),True) ]) EVENTS_SCHEMA = StructType([ StructField("GLOBALEVENTID",LongType(),True), StructField("Day_DATE",LongType(),True), StructField("MonthYear_Date",StringType(),True), StructField("Year_Date",StringType(),True), StructField("FractionDate",FloatType(),True), StructField("Actor1Code",StringType(),True), StructField("Actor1Name",StringType(),True), StructField("Actor1CountryCode",StringType(),True), StructField("Actor1KnownGroupCode",StringType(),True), StructField("Actor1EthnicCode",StringType(),True), StructField("Actor1Religion1Code",StringType(),True), StructField("Actor1Religion2Code",StringType(),True), StructField("Actor1Type1Code",StringType(),True), StructField("Actor1Type2Code",StringType(),True), StructField("Actor1Type3Code",StringType(),True), StructField("Actor2Code",StringType(),True), StructField("Actor2Name",StringType(),True), StructField("Actor2CountryCode",StringType(),True), StructField("Actor2KnownGroupCode",StringType(),True), StructField("Actor2EthnicCode",StringType(),True),
def get_structfield(colname): if colname in ['ARR_DELAY', 'DEP_DELAY', 'DISTANCE', 'TAXI_OUT', 'DEP_AIRPORT_TZOFFSET']: return StructField(colname, FloatType(), True) else: return StructField(colname, StringType(), True)
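# A small usage sketch (the header string below is an assumption, not from the
# original source): get_structfield can be mapped over a CSV header line to build
# the full schema.
from pyspark.sql.types import StructType

def get_schema(header_line):
    """Build a StructType from a comma-separated header line."""
    return StructType([get_structfield(colname) for colname in header_line.split(',')])

# e.g. schema = get_schema("FL_DATE,ORIGIN,DEST,DEP_DELAY,TAXI_OUT,ARR_DELAY,DISTANCE")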
bankProspectsDF.show()

"""## Remove the records with unknown value in the country column"""

bankProspectsDF1 = bankProspectsDF.filter(bankProspectsDF['country'] != "unknown")

bankProspectsDF1.show()

"""## Cast the String datatype to Integer/Float"""

bankProspectsDF1.printSchema()

from pyspark.sql.types import IntegerType, FloatType

bankProspectsDF2 = bankProspectsDF1.withColumn("age", bankProspectsDF1["age"].cast(IntegerType())).withColumn("salary", bankProspectsDF1["salary"].cast(FloatType()))

bankProspectsDF2.printSchema()

"""## Replace Age and Salary with average values of their respective columns

import mean from pyspark.sql.functions
"""

from pyspark.sql.functions import mean

"""### Calculate the "mean" value of the age"""

mean_age_val = bankProspectsDF2.select(mean(bankProspectsDF2['age'])).collect()

type(mean_age_val)
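# A hedged sketch of the imputation step the heading above describes (assumed
# continuation, not shown in the source): pull the scalar means out of the collected
# rows and fill the missing values with them.
mean_age = int(mean_age_val[0][0])

mean_salary_val = bankProspectsDF2.select(mean(bankProspectsDF2['salary'])).collect()
mean_salary = float(mean_salary_val[0][0])

bankProspectsDF3 = bankProspectsDF2.na.fill({"age": mean_age, "salary": mean_salary})
bankProspectsDF3.show()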
def run(self, input_dict=None, block_params=None, program_arguments=None): try: t1 = time.time() output_dict = dict() configs = input_dict["Config"] # test_args = {'spark.app.name': 'spark_app_test', 'spark.shuffle.service.enabled': 'true', 'spark.dynamicAllocation.minExecutors': '1', 'spark.dynamicAllocation.enabled': 'true'} queue_dict = {} queue_dict['left_df'] = input_dict['leftData']['queueTopicName'] queue_dict['right_df'] = input_dict['rightData']['queueTopicName'] kafka_handler = sdk.kafka_handler(None) kafka_api_instance = kafka_handler.get_api_instance() channels = {} for key, topic in queue_dict.items(): consumer_pool = { "count": 1, "groupId": str(uuid.uuid4()), "registerId": "", "topicsListToSubscribe": [topic] } try: consumer_pool_res = kafka_api_instance.create_consumer_list_using_post( consumer_pool) channels[key] = consumer_pool_res.result except Exception as e: self.logger.error( "Error Trying To Create a Consumer Of Topic:" + str(topic)) self.block_status = "FAILED" raise e optional_param = {} optional_param['queue_dict'] = queue_dict optional_param["api_instance"] = kafka_api_instance optional_param["channels"] = channels self.spark = SparkConfCustom().get_spark_session() self.spark.sparkContext.setLogLevel('ERROR') self.spark_schema = {} self.field_list = {} print('waiting') # time.sleep(200) arrary_of_threads = [] for key, topic in queue_dict.items(): req = {"topicName": topic} try: schema = kafka_api_instance.get_topic_meta_using_post(req) schema = json.loads(json.loads(schema.result)["schema"]) optional_param['schema'] = schema self.logger.debug("Schema Received") except Exception as e: self.logger.error("Error Fetching Schema") self.logger.error(str(e)) self.logger.error(traceback.format_exc()) self.block_status = "FAILED" raise e col_names = schema.keys() parsed_schema_dict = {} for name in col_names: values = schema.get(name) parsed_schema_dict[name] = values['type'] self.logger.info("schemaaaa hereeee") self.logger.info(schema) self.logger.info(parsed_schema_dict) self.list_of_struct_fields = [] self.field_list[key] = [] for name in parsed_schema_dict.keys(): if parsed_schema_dict[name] == 'FloatType()': self.field_list[key].append(('float')) self.list_of_struct_fields.append( StructField(name, FloatType(), True)) elif parsed_schema_dict[name] == 'IntegerType()': self.field_list[key].append('int') self.list_of_struct_fields.append( StructField(name, IntegerType(), True)) elif parsed_schema_dict[name] == 'DoubleType()': self.field_list[key].append('float') self.list_of_struct_fields.append( StructField(name, DoubleType(), True)) else: self.field_list[key].append('string') self.list_of_struct_fields.append( StructField(name, StringType(), True)) self.spark_schema[key] = StructType(self.list_of_struct_fields) fpath = '/bigbrain/' + str(key) if os.path.exists(fpath): rmtree(fpath) os.makedirs(fpath, exist_ok=True) t = self.ReadRecords(self.spark, topic, key, self.spark_schema[key], input_dict, block_params, optional_param, self.field_list) t.start() arrary_of_threads.append(t) for t in arrary_of_threads: t.join() print('Both topics read done') # self.stream_block(input_dict=input_dict, block_params=block_params, optional_arg=optional_param) self.left_df = self.spark.read.parquet('/bigbrain/left_df') print(self.left_df.count()) self.right_df = self.spark.read.parquet('/bigbrain/right_df') print(self.right_df.count()) exec("self.resultant_join_df" + "=" + "self.left_df.join(self.right_df,self.left_df['" + configs['unique_key_left'] + "']== self.right_df['" + 
configs['unique_key_right'] + "'] ,how='" + configs['join_type'] + "')") print(self.left_df.rdd.getNumPartitions()) print(self.right_df.rdd.getNumPartitions()) new_column_name_list = self.resultant_join_df.columns renamed_cols = {} for col in new_column_name_list: count = new_column_name_list.count(col) if count > 1: idx = new_column_name_list.index(col) new_column_name_list[idx] = col + '_1' print(self.resultant_join_df.columns) self.resultant_join_df = self.resultant_join_df.toDF( *new_column_name_list) print(self.resultant_join_df.columns) temp_fp = '/bigbrain/' + str(t1) + '.csv' print(temp_fp) # os.makedirs(temp_fp, exist_ok=True) temp_join_time_st = time.time() self.resultant_join_df.write.mode("overwrite").option( "header", "true").csv(temp_fp) temp_join_time_end = time.time() print('Time for Join: ' + str(temp_join_time_end - temp_join_time_st) + ', File Partitions' + str(self.resultant_join_df.rdd.getNumPartitions())) self.logger.info("*****************************") self.logger.info("Join Completed") # self.logger.info("Count: " + str(self.resultant_join_df.count())) self.data_target_params = input_dict["DataTarget"] try: self.validate_target_params() except Exception as e: self.logger.error(str(e)) raise e try: self.client = self.validate_hdfs_connection( input_dict=input_dict, block_params=block_params) self.data_target_params[ 'fileWithFullPath'] = self.data_target_params['filePath'] exists = self.file_exits(hdfs_connection=self.client) if exists: if self.data_target_params['overwrite']: # remove file self.delete_file(hdfs_connection=self.client) self.append = False else: raise FileExistsError( "File Already Exists: " + str(self.data_target_params['filePath'])) except Exception as e: self.logger.error(str(e)) raise e write_start_time = time.time() self.logger.info("Writing to HDFS:") self.block_folder_write(self.client, temp_fp) # if os.path.isdir(temp_fp): # self.logger.info("dir") # for filename in os.listdir(temp_fp): # print(filename) # if filename.endswith(".csv"): # csv_path = temp_fp + '/' + filename # print(csv_path) # self.block_line_write(self.client, csv_path) # else: # self.block_line_write(self.client, temp_fp) print('Time for Join: ' + str(temp_join_time_end - temp_join_time_st)) self.logger.info("Time taken to write to HDFS: " + str(time.time() - write_start_time)) self.logger.info("Output:") self.logger.info(json.dumps(output_dict, indent=2)) output_dict["queueTopicName"] = '' output_dict['readerInfo'] = None output_dict['readerInfoError'] = None output_dict["infoKeys"] = None self.logger.info("Output:") self.logger.info(json.dumps(output_dict, indent=2)) return output_dict except Exception as e: self.logger.error(traceback.format_exc()) self.block_status = "FAILED" raise e
from pyspark.sql import DataFrame from pyspark.sql.functions import udf from pyspark.sql.types import FloatType, ArrayType, LongType @udf(ArrayType(FloatType())) def assemble_features(*cols): return [x for x in cols] @udf(LongType()) def transform_label(cl): class_to_label = { 'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2, } return class_to_label[cl] def transform_iris_data(data: DataFrame): data = (data.withColumn( 'features', assemble_features('sepal-length', 'sepal-width', 'petal-length', 'petal-width')).withColumn( 'label', transform_label('class')).select( 'features', 'label')) return data
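# Usage sketch (file name and header layout are assumptions, not from the original
# source): read the classic iris CSV and apply the transformation defined above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

iris_raw = spark.read.csv("data/iris.csv", header=True, inferSchema=True)
# expected columns: sepal-length, sepal-width, petal-length, petal-width, class
iris_data = transform_iris_data(iris_raw)
iris_data.show(5, truncate=False)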
tweets = tweets \ .withColumn('soup_text', decode_html_udf('text')) \ .withColumn('han_rem', regexp_replace(col('soup_text'), handles_pat, '')) \ .withColumn('tag_rem', regexp_replace(col('han_rem'), hashtag_pat, '')) \ .withColumn('http_rem', regexp_replace(col('tag_rem'), http_pat, '')) \ .withColumn('www_rem', regexp_replace(col('http_rem'), www_pat, '')) \ .withColumn('utf_text', decode_utf_udf('www_rem')) \ .withColumn('neg_handel', regexp_replace(col('utf_text'), r"won't", 'will not')) \ .withColumn('neg_handel', regexp_replace(col('neg_handel'), r"can't", 'can not')) \ .withColumn('neg_handel', regexp_replace(col('neg_handel'), r"n't", ' not')) \ .withColumn('sp_rem', regexp_replace(col('neg_handel'), sp_pat, ' ')) \ .withColumn('low_text', lower(col('sp_rem'))) \ .withColumn('cleaned', rem_space_udf('low_text')) \ .selectExpr('cleaned as text', 'tags') neg_tweets_udf = udf(lambda x: 0.0 if x == 1.0 else 1.0, FloatType()) model = PipelineModel.load('./tweets_analyzer.model') predictions = model.transform(tweets) \ .select('tags', 'prediction') \ .withColumn('hashtag', explode(col('tags'))) \ .withColumn('pos_tweet', col('prediction')) \ .withColumn('neg_tweet', neg_tweets_udf('prediction')) \ .groupby('hashtag') \ .agg(psf.sum('pos_tweet').alias('pos_tweets'), psf.sum('neg_tweet').alias('neg_tweets'), psf.count('pos_tweet').alias('total_tweets')) predictions.writeStream \ .outputMode('complete') \ .format('console') \ .option('truncate', False) \
def convert(self, ma_field: ma_fields.Field) -> DataType: return FloatType()
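# Context sketch (hypothetical, not from the original source): a converter like the
# method above is typically part of a dispatcher that maps marshmallow field types
# onto Spark SQL types when deriving a StructType from a marshmallow Schema. A
# minimal standalone version of that idea, limited to a few field types:
from marshmallow import Schema, fields as ma_fields
from pyspark.sql.types import (FloatType, IntegerType, StringType,
                               StructField, StructType)

_TYPE_MAP = {
    ma_fields.Float: FloatType(),
    ma_fields.Integer: IntegerType(),
    ma_fields.String: StringType(),
}

def schema_to_struct_type(schema: Schema) -> StructType:
    """Build a Spark StructType from a marshmallow Schema (simplified mapping)."""
    return StructType([
        StructField(name, _TYPE_MAP.get(type(field), StringType()), True)
        for name, field in schema.fields.items()
    ])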
def test_join_with_max_age( spark: SparkSession, single_entity_schema: StructType, customer_feature_schema: StructType, ): entity_data = [ (1001, datetime(year=2020, month=9, day=1)), (1001, datetime(year=2020, month=9, day=3)), (2001, datetime(year=2020, month=9, day=2)), ] entity_df = spark.createDataFrame( spark.sparkContext.parallelize(entity_data), single_entity_schema) feature_table_data = [ ( 1001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 100.0, ), ( 2001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 200.0, ), ] feature_table_df = spark.createDataFrame( spark.sparkContext.parallelize(feature_table_data), customer_feature_schema) feature_table = FeatureTable( name="transactions", features=[Field("daily_transactions", "double")], entities=[Field("customer_id", "int32")], max_age=86400, ) joined_df = as_of_join( entity_df, "event_timestamp", feature_table_df, feature_table, ) expected_joined_schema = StructType([ StructField("customer_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("transactions__daily_transactions", FloatType()), ]) expected_joined_data = [ ( 1001, datetime(year=2020, month=9, day=1), 100.0, ), (1001, datetime(year=2020, month=9, day=3), None), ( 2001, datetime(year=2020, month=9, day=2), 200.0, ), ] expected_joined_df = spark.createDataFrame( spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema) assert_dataframe_equal(joined_df, expected_joined_df)
delimiter = "," # The applied options are for CSV files. For other file types, these will be ignored. df = spark.read.format(file_type) \ .option("inferSchema", infer_schema) \ .option("header", first_row_is_header) \ .option("sep", delimiter) \ .load(file_location) # We drop the time stamp df = df.drop("timestamp") # We convert to ints df = df.withColumn("userId", df["userId"].cast(IntegerType())) df = df.withColumn("movieId", df["movieId"].cast(IntegerType())) df = df.withColumn("rating", df["rating"].cast(FloatType())) # COMMAND ---------- # First we compute the unique users unique_usr = df.select('userId').distinct().collect() unique_usr = [row.asDict()["userId"] for row in unique_usr] usr_to_emb = {usr : i for i,usr in enumerate(unique_usr)} emb_to_usr = {i : usr for i,usr in enumerate(unique_usr)} unique_movie = df.select('movieId').distinct().collect() unique_movie = [int(row.asDict()["movieId"]) for row in unique_movie]
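# A hedged continuation sketch (assumed, mirroring the user mapping above): build the
# movie index dictionaries and remap both id columns to contiguous indices, which is
# the layout an embedding-based model usually expects.
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

movie_to_emb = {movie: i for i, movie in enumerate(unique_movie)}
emb_to_movie = {i: movie for i, movie in enumerate(unique_movie)}

usr_idx_udf = udf(lambda u: usr_to_emb[u], IntegerType())
movie_idx_udf = udf(lambda m: movie_to_emb[m], IntegerType())

df_indexed = (df
              .withColumn("userIdx", usr_idx_udf("userId"))
              .withColumn("movieIdx", movie_idx_udf("movieId")))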
def test_join_with_composite_entity( spark: SparkSession, composite_entity_schema: StructType, rating_feature_schema: StructType, ): entity_data = [ (1001, 8001, datetime(year=2020, month=9, day=1)), (1001, 8002, datetime(year=2020, month=9, day=3)), (1001, 8003, datetime(year=2020, month=9, day=1)), (2001, 8001, datetime(year=2020, month=9, day=2)), ] entity_df = spark.createDataFrame( spark.sparkContext.parallelize(entity_data), composite_entity_schema) feature_table_data = [ ( 1001, 8001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 3.0, 5.0, ), ( 1001, 8002, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 4.0, 3.0, ), ( 2001, 8001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 4.0, 4.5, ), ] feature_table_df = spark.createDataFrame( spark.sparkContext.parallelize(feature_table_data), rating_feature_schema, ) feature_table = FeatureTable( name="ratings", features=[ Field("customer_rating", "double"), Field("driver_rating", "double") ], entities=[Field("customer_id", "int32"), Field("driver_id", "int32")], max_age=86400, ) joined_df = as_of_join( entity_df, "event_timestamp", feature_table_df, feature_table, ) expected_joined_schema = StructType([ StructField("customer_id", IntegerType()), StructField("driver_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("ratings__customer_rating", FloatType()), StructField("ratings__driver_rating", FloatType()), ]) expected_joined_data = [ ( 1001, 8001, datetime(year=2020, month=9, day=1), 3.0, 5.0, ), (1001, 8002, datetime(year=2020, month=9, day=3), None, None), (1001, 8003, datetime(year=2020, month=9, day=1), None, None), ( 2001, 8001, datetime(year=2020, month=9, day=2), 4.0, 4.5, ), ] expected_joined_df = spark.createDataFrame( spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema) assert_dataframe_equal(joined_df, expected_joined_df)
def set_col(df, columns, func, data_type, summary): dict_types = { 'string': StringType(), 'str': StringType(), 'integer': IntegerType(), 'int': IntegerType(), 'float': FloatType(), 'double': DoubleType(), 'Double': DoubleType() } types = { 'string': 'string', 'str': 'string', 'String': 'string', 'integer': 'int', 'int': 'int', 'float': 'float', 'double': 'double', 'Double': 'double' } try: function = udf(func, dict_types[data_type]) except KeyError: assert False, "Error, data_type not recognized" assert_type_str_or_list(df, columns, "columns") # Filters all string columns in dataFrame valid_cols = [ c for (c, t) in filter(lambda t: t[1] == types[data_type], df.dtypes) ] if columns == "*": columns = valid_cols[:] if isinstance(columns, str): columns = [columns] assert_cols_in_df(df, columns_provided=columns, columns_df=df.columns) col_not_valids = (set([column for column in columns ]).difference(set([column for column in valid_cols]))) assert ( col_not_valids == set() ), 'Error: The following columns do not have same datatype argument provided: %s' % col_not_valids oldUnique = [find_unique(df, column=c) for c in columns] exprs = [ function(col(c)).alias(c) if c in columns else c for (c, t) in df.dtypes ] newDF = df.select(*exprs) if summary: newUnique = [find_unique(newDF, column=c) for c in columns] count = int(totChanges(oldUnique, newUnique)) summary = sqlContext.createDataFrame([(count, )], [ 'Total Cells Modified', ]) return (newDF, summary) return newDF
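# Usage sketch (the DataFrame `df` is hypothetical, not from the original source):
# trim surrounding whitespace on every string column. With summary=False only the
# transformed DataFrame is returned; summary=True also returns a one-row DataFrame
# with the count of modified cells.
trimmed_df = set_col(df, columns="*",
                     func=lambda s: s.strip() if s is not None else s,
                     data_type="string", summary=False)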
def test_multiple_join( spark: SparkSession, composite_entity_schema: StructType, customer_feature_schema: StructType, driver_feature_schema: StructType, ): entity_data = [ (1001, 8001, datetime(year=2020, month=9, day=2)), (1001, 8002, datetime(year=2020, month=9, day=2)), (2001, 8002, datetime(year=2020, month=9, day=3)), ] entity_df = spark.createDataFrame( spark.sparkContext.parallelize(entity_data), composite_entity_schema) customer_table_data = [ ( 1001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 100.0, ), ( 2001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 200.0, ), ] customer_table_df = spark.createDataFrame( spark.sparkContext.parallelize(customer_table_data), customer_feature_schema) customer_table = FeatureTable( name="transactions", features=[Field("daily_transactions", "double")], entities=[Field("customer_id", "int32")], max_age=86400, ) driver_table_data = [ ( 8001, datetime(year=2020, month=8, day=31), datetime(year=2020, month=8, day=31), 200, ), ( 8001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 300, ), ( 8002, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 600, ), ( 8002, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=2), 500, ), ] driver_table_df = spark.createDataFrame( spark.sparkContext.parallelize(driver_table_data), driver_feature_schema) driver_table = FeatureTable( name="bookings", features=[Field("completed_bookings", "int32")], entities=[Field("driver_id", "int32")], ) joined_df = join_entity_to_feature_tables( entity_df, "event_timestamp", [customer_table_df, driver_table_df], [customer_table, driver_table], ) expected_joined_schema = StructType([ StructField("customer_id", IntegerType()), StructField("driver_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("transactions__daily_transactions", FloatType()), StructField("bookings__completed_bookings", IntegerType()), ]) expected_joined_data = [ ( 1001, 8001, datetime(year=2020, month=9, day=2), 100.0, 300, ), ( 1001, 8002, datetime(year=2020, month=9, day=2), 100.0, 500, ), ( 2001, 8002, datetime(year=2020, month=9, day=3), None, 500, ), ] expected_joined_df = spark.createDataFrame( spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema) assert_dataframe_equal(joined_df, expected_joined_df)
df.show(n=5)

print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("                 Checking data types")
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

var1 = df.schema
for i in var1:
    print(i)

print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("                 Changing data type to float")
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

dfnew = df.withColumn("Largest Property Use Type - Gross Floor Area (ft)", df["Largest Property Use Type - Gross Floor Area (ft)"].cast(FloatType())) \
    .withColumn("2nd Largest Property Use - Gross Floor Area (ft)", df["2nd Largest Property Use - Gross Floor Area (ft)"].cast(FloatType())) \
    .withColumn("3rd Largest Property Use Type - Gross Floor Area (ft)", df["3rd Largest Property Use Type - Gross Floor Area (ft)"].cast(FloatType())) \
    .withColumn("Site EUI (kBtu/ft)", df["Site EUI (kBtu/ft)"].cast(FloatType())) \
    .withColumn("Weather Normalized Site EUI (kBtu/ft)", df["Weather Normalized Site EUI (kBtu/ft)"].cast(FloatType())) \
    .withColumn("Weather Normalized Site Electricity Intensity (kWh/ft)", df["Weather Normalized Site Electricity Intensity (kWh/ft)"].cast(FloatType())) \
    .withColumn("Weather Normalized Site Natural Gas Intensity (therms/ft)", df["Weather Normalized Site Natural Gas Intensity (therms/ft)"].cast(FloatType())) \
    .withColumn("Weather Normalized Source EUI (kBtu/ft)", df["Weather Normalized Source EUI (kBtu/ft)"].cast(FloatType())) \
    .withColumn("Water Intensity (All Water Sources) (gal/ft)", df["Water Intensity (All Water Sources) (gal/ft)"].cast(FloatType())) \
    .withColumn("Source EUI (kBtu/ft)", df["Source EUI (kBtu/ft)"].cast(FloatType())) \
return iso_region.strip().split('-')[-1] udf_state = f.udf(lambda x: to_state(x), StringType()) def to_lat(coordinates): ''' Split latitude from the airport coordinates Parameters: coordinates (str): Coordinates like '{latitude}, {longitude}' ''' return float(coordinates.strip().split(',')[0]) udf_lat = f.udf(lambda x: to_lat(x), FloatType()) def to_long(coordinates): ''' Split longitude from the airport coordinates Parameters: coordinates (str): Coordinates like '{latitude}, {longitude}' ''' return float(coordinates.strip().split(',')[1]) udf_long = f.udf(lambda x: to_long(x), FloatType()) # Strip quotes from the country name udf_strip_quotes = f.udf(lambda x: x.strip('\''), StringType())
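# A hedged usage sketch (the airports DataFrame and its column names are assumptions,
# not from the original source): derive state, latitude and longitude columns from a
# DataFrame that has 'iso_region' and 'coordinates' string columns.
airports_clean = (airports_df
                  .withColumn("state", udf_state("iso_region"))
                  .withColumn("latitude", udf_lat("coordinates"))
                  .withColumn("longitude", udf_long("coordinates")))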
def transform_demographics_data(self): # read file demographics_df = self._spark_session.read.load( f"{self._processing_bucket}/us-cities-demographics.csv", format="csv", header="true", sep=";") # change column name and data type demographics_df = ( demographics_df.withColumn( "city", demographics_df["City"].cast(StringType())).withColumn( "median_age", demographics_df["Median Age"].cast(FloatType())). withColumn("male_pop", demographics_df["Male Population"].cast( IntegerType())).withColumn( "female_pop", demographics_df["Female Population"].cast( IntegerType())).withColumn( "total_pop", demographics_df["Total Population"].cast( IntegerType())).withColumn( "num_vets", demographics_df["Number of Veterans"].cast( IntegerType())). withColumn("foreign_born", demographics_df["Foreign-born"].cast( IntegerType())).withColumn( "avg_household_size", demographics_df["Average Household Size"].cast( FloatType())).withColumn( "state_code", demographics_df["State Code"].cast( StringType())).withColumn( "race", demographics_df["Race"].cast( StringType())).withColumn( "count", demographics_df["Count"].cast( IntegerType()))) # choose columns to keep demographics_summary_df = demographics_df[[ "state_code", "city", "median_age", "male_pop", "female_pop", "total_pop", "num_vets", "foreign_born", "avg_household_size", ]] # choose columns to keep demographics_race_df = demographics_df[[ "state_code", "city", "race", "count" ]] # remove duplicates demographics_summary_df = demographics_summary_df.distinct() # write to csv demographics_summary_df.write.csv( path=f"{self._processed_bucket}/dim_cities_demographics_summary", mode="overwrite", header=True) demographics_race_df.write.csv( path=f"{self._processed_bucket}/dim_cities_demographics_race", mode="overwrite", header=True)
def align_diff_frames(resolve_func, this, that, fillna=True, how="full", preserve_order_column=False): """ This method aligns two different DataFrames with a given `func`. Columns are resolved and handled within the given `func`. To use this, `compute.ops_on_diff_frames` should be True, for now. :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and the column of another DataFrame. It returns an iterable that produces Series. >>> from databricks.koalas.config import set_option, reset_option >>> >>> set_option("compute.ops_on_diff_frames", True) >>> >>> kdf1 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]}) >>> kdf2 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]}) >>> >>> def func(kdf, this_column_labels, that_column_labels): ... kdf # conceptually this is A + B. ... ... # Within this function, Series from A or B can be performed against `kdf`. ... this_label = this_column_labels[0] # this is ('a',) from kdf1. ... that_label = that_column_labels[0] # this is ('a',) from kdf2. ... new_series = (kdf[this_label] - kdf[that_label]).rename(str(this_label)) ... ... # This new series will be placed in new DataFrame. ... yield (new_series, this_label) >>> >>> >>> align_diff_frames(func, kdf1, kdf2).sort_index() a 0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 >>> reset_option("compute.ops_on_diff_frames") :param this: a DataFrame to align :param that: another DataFrame to align :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`. Otherwise, it returns as are. :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict. - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames. For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and 'that_columns' in this function are B, C and B, C. - left: `resolve_func` should resolve columns including that columns. For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is B, C but `that_columns` are B, C, D. - inner: Same as 'full' mode; however, internally performs inner join instead. :return: Aligned DataFrame """ assert how == "full" or how == "left" or how == "inner" this_column_labels = this._internal.column_labels that_column_labels = that._internal.column_labels common_column_labels = set(this_column_labels).intersection( that_column_labels) # 1. Perform the join given two dataframes. combined = combine_frames(this, that, how=how, preserve_order_column=preserve_order_column) # 2. Apply the given function to transform the columns in a batch and keep the new columns. combined_column_labels = combined._internal.column_labels that_columns_to_apply = [] this_columns_to_apply = [] additional_that_columns = [] columns_to_keep = [] column_labels_to_keep = [] for combined_label in combined_column_labels: for common_label in common_column_labels: if combined_label == tuple(["this", *common_label]): this_columns_to_apply.append(combined_label) break elif combined_label == tuple(["that", *common_label]): that_columns_to_apply.append(combined_label) break else: if how == "left" and combined_label in [ tuple(["that", *label]) for label in that_column_labels ]: # In this case, we will drop `that_columns` in `columns_to_keep` but passes # it later to `func`. `func` should resolve it. # Note that adding this into a separate list (`additional_that_columns`) # is intentional so that `this_columns` and `that_columns` can be paired. 
additional_that_columns.append(combined_label) elif fillna: columns_to_keep.append( F.lit(None).cast(FloatType()).alias(str(combined_label))) column_labels_to_keep.append(combined_label) else: columns_to_keep.append( combined._internal.spark_column_for(combined_label)) column_labels_to_keep.append(combined_label) that_columns_to_apply += additional_that_columns # Should extract columns to apply and do it in a batch in case # it adds new columns for example. if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0: kser_set, column_labels_applied = zip(*resolve_func( combined, this_columns_to_apply, that_columns_to_apply)) columns_applied = [c.spark.column for c in kser_set] column_labels_applied = list(column_labels_applied) else: columns_applied = [] column_labels_applied = [] applied = combined[columns_applied + columns_to_keep] applied.columns = pd.MultiIndex.from_tuples(column_labels_applied + column_labels_to_keep) # 3. Restore the names back and deduplicate columns. this_labels = OrderedDict() # Add columns in an order of its original frame. for this_label in this_column_labels: for new_label in applied._internal.column_labels: if new_label[1:] not in this_labels and this_label == new_label[1:]: this_labels[new_label[1:]] = new_label # After that, we will add the rest columns. other_labels = OrderedDict() for new_label in applied._internal.column_labels: if new_label[1:] not in this_labels: other_labels[new_label[1:]] = new_label kdf = applied[list(this_labels.values()) + list(other_labels.values())] kdf.columns = kdf.columns.droplevel() return kdf
def convert( spark: SparkSession, dataset_root: str, limit: int = 0, asset_dir: Optional[str] = None, ) -> DataFrame: """Convert a Coco Dataset into Rikai dataset. This function expects the COCO datasets are stored in directory with the following structure: - dataset - annotations - captions_train2017.json - instances_train2017.json - ... - train2017 - val2017 - test2017 Parameters ---------- spark : SparkSession A live spark session dataset_root : str The directory of dataset limit : int, optional The number of images of each split to be converted. asset_dir : str, optional The asset directory to store images, can be a s3 directory. Return ------ DataFrame Returns a Spark DataFrame """ train_json = os.path.join(dataset_root, "annotations", "instances_train2017.json") val_json = os.path.join(dataset_root, "annotations", "instances_val2017.json") categories = load_categories(train_json) examples = [] for split, anno_file in zip(["train", "val"], [train_json, val_json]): coco = COCO(annotation_file=anno_file) # Coco has native dependencies, so we do not distributed them # to the workers. image_ids = coco.imgs if limit > 0: image_ids = islice(image_ids, limit) for image_id in image_ids: ann_id = coco.getAnnIds(imgIds=image_id) annotations = coco.loadAnns(ann_id) annos = [] for ann in annotations: bbox = Box2d(*ann["bbox"]) annos.append({ "category_id": ann["category_id"], "category_text": categories[ann["category_id"]]["name"], "bbox": bbox, "area": float(ann["area"]), }) image_payload = coco.loadImgs(ids=image_id)[0] example = { "image_id": image_id, "annotations": annos, "image": Image( os.path.abspath( os.path.join( os.curdir, "dataset", "{}2017".format(split), image_payload["file_name"], ))), "split": split, } examples.append(example) schema = StructType([ StructField("image_id", LongType(), False), StructField( "annotations", ArrayType( StructType([ StructField("category_id", IntegerType()), StructField("category_text", StringType()), StructField("area", FloatType()), StructField("bbox", Box2dType()), ])), False, ), StructField("image", ImageType(), False), StructField("split", StringType(), False), ]) df = spark.createDataFrame(examples, schema=schema) if asset_dir: asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/" print("ASSET DIR: ", asset_dir) df = df.withColumn("image", image_copy(col("image"), lit(asset_dir))) return df
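# Usage sketch (paths are placeholders, not from the original source): convert a
# local COCO download into a DataFrame and persist it, e.g. as parquet.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("coco-convert").getOrCreate()
coco_df = convert(spark, dataset_root="dataset", limit=100)
coco_df.write.mode("overwrite").parquet("/tmp/coco.parquet")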
def main(): raw_training_data = sc.textFile("dataset/training.data") # TODO: Convert text file into an RDD which can be converted to a DataFrame # Hint: For types and format look at what the format required by the # `train` method for the random forest classifier # Hint 2: Look at the imports above rdd_train = raw_training_data.map(lambda row: row.split(",")) parsed_rdd_train = rdd_train.map(lambda r: Row(count1=float(r[0]), count2=float(r[1]), count3=float(r[2]), count4=float(r[3]), count5=float(r[4]), count6=float(r[5]), count7=float(r[6]), count8=float(r[7]), id=int(r[8]))) trainschema = StructType([ StructField("count1", FloatType(), True), StructField("count2", FloatType(), True), StructField("count3", FloatType(), True), StructField("count4", FloatType(), True), StructField("count5", FloatType(), True), StructField("count6", FloatType(), True), StructField("count7", FloatType(), True), StructField("count8", FloatType(), True), StructField("id", IntegerType(), True) ]) # TODO: Create dataframe from the RDD df_train = sqlContext.createDataFrame(parsed_rdd_train, schema=trainschema) #df_train.show() raw_test_data = sc.textFile("dataset/test-features.data") # TODO: Convert text file lines into an RDD we can use later rdd_test = raw_test_data.map(lambda row: row.split(",")) parsed_rdd_test = rdd_test.map(lambda r: Row(count1=float(r[0]), count2=float(r[1]), count3=float(r[2]), count4=float(r[3]), count5=float(r[4]), count6=float(r[5]), count7=float(r[6]), count8=float(r[7]))) testschema = StructType([ StructField("count1", FloatType(), True), StructField("count2", FloatType(), True), StructField("count3", FloatType(), True), StructField("count4", FloatType(), True), StructField("count5", FloatType(), True), StructField("count6", FloatType(), True), StructField("count7", FloatType(), True), StructField("count8", FloatType(), True) ]) # TODO:Create dataframe from RDD df_test = sqlContext.createDataFrame(parsed_rdd_test, schema=testschema) #df_test.show() predictions = predict(df_train, df_test) # You can take a look at dataset/test-labels.data to see if your # predictions were right for pred in predictions: print(int(pred))
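# The predict() helper called above is not shown. A minimal sketch of one possible
# implementation, assuming a Spark ML random forest over the eight count columns
# (hypothetical; the original hints at the MLlib random forest train method instead):
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

def predict(df_train, df_test):
    feature_cols = ["count{}".format(i) for i in range(1, 9)]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    train_vec = assembler.transform(df_train).withColumn("label", col("id").cast("double"))
    model = RandomForestClassifier(labelCol="label", featuresCol="features").fit(train_vec)
    predictions = model.transform(assembler.transform(df_test))
    return [row.prediction for row in predictions.select("prediction").collect()]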
def perform_preprocessing(
    df,
    states: List[int],
    actions: List[str],
    metrics: List[str],
    multi_steps: Optional[int] = None,
):
    """Perform (1) sparse-to-dense, (2) preprocessing for actions, and
    (3) preprocessing of other miscellaneous columns.

    (1) For each column of type Map, with name X, output two columns.
        Map values are assumed to be scalar. This process is called
        sparse-to-dense. X = {"state_features", "next_state_features",
        "metrics"}.
        (a) Replace column X with a dense representation of the inputted
            (sparse) map. The dense representation concatenates the map
            values into a list.
        (b) Create new column X_presence, a list of the same length as (a),
            whose ith entry is 1 iff the key was present in the original map.
    (2) Inputted actions and possible_actions are strings, which aren't
        supported by PyTorch Tensors. Here, we represent them with LongType.
        (a) action and next_action are strings, so simply return their
            position in the action space (as given by the `actions` argument).
        (b) possible_actions and possible_next_actions are lists of strings,
            so return an existence bitvector of length len(actions), where
            the ith index is true iff actions[i] was in the list.
    (3) Miscellaneous columns are step, time_diff, sequence_number,
        not_terminal.
    """
    # step refers to n in n-step RL; special case when approaching terminal
    df = df.withColumn("step",
                       make_get_step_udf(multi_steps)("next_state_features"))

    # take the next time_diff
    next_long_udf = make_next_udf(multi_steps, LongType())
    df = df.withColumn("time_diff", next_long_udf("time_diff"))

    # sparse-to-dense of states and metrics
    next_map_udf = make_next_udf(multi_steps, MapType(LongType(), FloatType()))
    df = df.withColumn("next_state_features",
                       next_map_udf("next_state_features"))
    df = df.withColumn("metrics", next_map_udf("metrics"))
    df = make_sparse2dense(df, "state_features", states)
    df = make_sparse2dense(df, "next_state_features", states)
    df = make_sparse2dense(df, "metrics", metrics)

    # turn string actions into indices
    where_udf = make_where_udf(actions)
    df = df.withColumn("action", where_udf("action"))
    df = df.withColumn("next_action", where_udf(next_long_udf("next_action")))

    # turn List[str] possible_actions into existence bitvectors
    next_long_arr_udf = make_next_udf(multi_steps, ArrayType(LongType()))
    existence_bitvector_udf = make_existence_bitvector_udf(actions)
    df = df.withColumn(
        "possible_actions_mask",
        existence_bitvector_udf("possible_actions"),
    )
    df = df.withColumn(
        "possible_next_actions_mask",
        existence_bitvector_udf(next_long_arr_udf("possible_next_actions")),
    )

    # calculate not_terminal
    not_terminal_udf = make_not_terminal_udf(actions)
    df = df.withColumn("not_terminal", not_terminal_udf("next_action"))

    # assuming use_seq_num_diff_as_time_diff = False for now
    df = df.withColumn("sequence_number", col("sequence_number_ordinal"))

    return df
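# A minimal, self-contained sketch of the sparse-to-dense idea described in
# step (1) of the docstring above. It is not the library's make_sparse2dense;
# the feature ids and the example map are assumptions for illustration.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import ArrayType, FloatType

_spark = SparkSession.builder.getOrCreate()
_feature_ids = [10, 20, 30]  # hypothetical state-feature keys

_demo = _spark.createDataFrame([({10: 1.0, 30: 3.0},)], ["state_features"])

# (a) dense representation: concatenate map values (0.0 for missing keys)
_dense_udf = F.udf(lambda m: [float(m.get(k, 0.0)) for k in _feature_ids],
                   ArrayType(FloatType()))
# (b) presence vector: 1.0 iff the key was present in the original map
_presence_udf = F.udf(lambda m: [1.0 if k in m else 0.0 for k in _feature_ids],
                      ArrayType(FloatType()))

_demo = (_demo
         .withColumn("state_features_presence", _presence_udf("state_features"))
         .withColumn("state_features", _dense_udf("state_features")))
_demo.show(truncate=False)
# state_features -> [1.0, 0.0, 3.0], state_features_presence -> [1.0, 0.0, 1.0]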
# MAGIC <a target="_blank" href="https://fast.wistia.net/embed/iframe/qk2is6llgl?seo=false">
# MAGIC <img alt="Opens in new tab" src="https://files.training.databricks.com/static/images/external-link-icon-16x16.png"/> Watch full-screen.</a>
# MAGIC </div>

# COMMAND ----------

# MAGIC %md
# MAGIC The ZIP Code dataset contains an array with the latitude and longitude of the cities. Use an `ArrayType`, which takes the primitive type of its elements as an argument.

# COMMAND ----------

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, FloatType

zipsSchema3 = StructType([
  StructField("city", StringType(), True),
  StructField("loc", ArrayType(FloatType(), True), True),
  StructField("pop", IntegerType(), True)
])

# COMMAND ----------

# MAGIC %md
# MAGIC Apply the schema using the `.schema()` method and observe the results. Expand the array values in the column `loc` to explore further.

# COMMAND ----------

zipsDF3 = (spark.read
  .schema(zipsSchema3)
  .json("/mnt/training/zips.json"))
display(zipsDF3)

# COMMAND ----------
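# MAGIC %md
# MAGIC As a hedged illustration (the order of the coordinates inside `loc` is an assumption about this dataset), the individual array elements can also be pulled out with `getItem` for easier exploration:

# COMMAND ----------

from pyspark.sql import functions as F

display(
  zipsDF3.select(
    "city",
    F.col("loc").getItem(0).alias("loc_0"),
    F.col("loc").getItem(1).alias("loc_1"),
    "pop"
  )
)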
def test_implicit_type_conversion(spark: SparkSession):
    test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    entity_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(test_data_dir, 'single_customer.csv')}",
            "event_timestamp_column": "event_timestamp",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    transaction_source = {
        "file": {
            "format": {"json_class": "CSVFormat"},
            "path": f"file://{path.join(test_data_dir, 'transactions.csv')}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {"inferSchema": "true", "header": "true"},
        }
    }
    transaction_table = {
        "name": "transactions",
        "entities": [{"name": "customer_id", "type": "int32"}],
        "features": [{"name": "daily_transactions", "type": "float"}],
    }
    joined_df = retrieve_historical_features(
        spark,
        entity_source,
        [transaction_source],
        [transaction_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])
    expected_joined_data = [
        (
            1001,
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
def change_to_float(data, col):
    """Cast each column named in `col` to FloatType and return the DataFrame."""
    for conv_col in col:
        data = data.withColumn(conv_col, data[conv_col].cast(FloatType()))
    return data
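# Hypothetical usage of change_to_float; the column names and values below
# are made up for illustration.
from pyspark.sql import SparkSession

_spark = SparkSession.builder.getOrCreate()
_prices = _spark.createDataFrame([("1.5", "2"), ("3.0", "4")], ["price", "qty"])
_prices = change_to_float(_prices, ["price", "qty"])
_prices.printSchema()  # price and qty are now float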
__short_type: ShortType = ShortType()
_SMALLINT_TYPE: str = __short_type.simpleString()

__int_type: IntegerType = IntegerType()
_INT_TYPE: str = __int_type.simpleString()
assert _INT_TYPE == int.__name__
assert __int_type.typeName().startswith(_INT_TYPE)

__long_type: LongType = LongType()
_BIGINT_TYPE: str = __long_type.simpleString()
assert __long_type.typeName() == 'long'

_INT_TYPES: Tuple[str, ...] = _TINYINT_TYPE, _SMALLINT_TYPE, _INT_TYPE, _BIGINT_TYPE

__float_type: FloatType = FloatType()
_FLOAT_TYPE: str = __float_type.simpleString()
assert _FLOAT_TYPE == __float_type.typeName()

__double_type: DoubleType = DoubleType()
_DOUBLE_TYPE: str = __double_type.simpleString()
assert _DOUBLE_TYPE == __double_type.typeName()

_FLOAT_TYPES: Tuple[str, ...] = _FLOAT_TYPE, _DOUBLE_TYPE

_NUM_TYPES: Tuple[str, ...] = _INT_TYPES + _FLOAT_TYPES

_POSSIBLE_CAT_TYPES: Tuple[str, ...] = (_BOOL_TYPE, _STR_TYPE) + _NUM_TYPES
_POSSIBLE_FEATURE_TYPES: Tuple[str, ...] = _POSSIBLE_CAT_TYPES + _NUM_TYPES
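# Hedged usage sketch: the simpleString names collected above can be used to
# pick out numeric columns from an arbitrary DataFrame schema. The example
# DataFrame below is an assumption for illustration.
from pyspark.sql import SparkSession

_spark = SparkSession.builder.getOrCreate()
_df = _spark.createDataFrame([(1, 2.5, "a")], ["x", "y", "label"])
_numeric_cols = [f.name for f in _df.schema.fields
                 if f.dataType.simpleString() in _NUM_TYPES]
# _numeric_cols == ['x', 'y']  (bigint and double are both in _NUM_TYPES)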