Example #1
def obtem_dados_parquet():
    pass


if __name__ == "__main__":

    configuracao = (SparkConf().set("spark.driver.maxResultSize", "2g"))

    spark = (SparkSession.builder.config(
        conf=configuracao).appName(NOME_JOB).enableHiveSupport().getOrCreate())

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    to_value = lambda v: float(v.replace(",", "."))
    udf_to_value = F.udf(to_value, FloatType())

    caminho_csv = "C:/Users/francoise.moreira/OneDrive - JM Confitec Sistemas de Computação LTDA/ESTUDO/SPARK_JUPITER_PANDA_PYTHON/lerParquet e tratar dados/dadosCSV/2021_Pagamento.csv"
    caminho_parquet = "C:/Users/francoise.moreira/OneDrive - JM Confitec Sistemas de Computação LTDA/ESTUDO/SPARK_JUPITER_PANDA_PYTHON/lerParquet e tratar dados/dadosParquet/"

    df = obtem_dados_csv(caminho_csv, ";", "true", "true")

    # NOTE: cast the received columns to string
    DadosGoverno = (df.transform(lambda df: df.withColumn(
        "id_processo",
        F.col("Identificador do processo de viagem").cast(StringType())
    )).transform(lambda df: df.withColumn(
        "proposta",
        F.col("N�mero da Proposta (PCDP)").cast(StringType())
    )).transform(lambda df: df.withColumn(
        "cod_orgao_sup",
Example #2
class ProtoDash():
    # NOTE: the methods below take no `self` argument and are always invoked as
    # ProtoDash.<method>(...), i.e. the class is used as a plain namespace of helpers.
    def create_spark_session():
        global sc
        sc = SparkContext.getOrCreate()
        sc.setLogLevel("ERROR")
        return SparkSession(sc)

    def create_vec_rdd(X, part=4):
        """
        Function returning a DenseVector RDD from a dataset X.
        Args:
        -X: a dataset with rows corresponding to observations and columns corresponding to features.
        -part: number of partitions.
        Returns: the RDD for X
        """
        # creating a Spark session
        spark_session = ProtoDash.create_spark_session()

        X_rdd = (sc.parallelize(
            X, part).map(lambda x: DenseVector(x)).zipWithIndex())
        return X_rdd

    def mean_inner_product(inp, sigma, n):
        """
        Function computing the gaussian kernel inner product of a vector in Y vs.
        a vector in X, divided by n the total number of observations in X.
        """
        index_1 = inp[0][1]
        inner_product = float(
            np.exp(inp[0][0].squared_distance(inp[1][0]) / (-2 * sigma**2)) /
            n)
        return (index_1, inner_product)

    def inner_product(inp, sigma):
        """
        Function computing the gaussian kernel inner product of a vector vs.
        another.
        """
        index_1 = inp[0][1]
        index_2 = inp[1][1]
        inner_product = float(
            np.exp(inp[0][0].squared_distance(inp[1][0]) / (-2 * sigma**2)))
        return (index_1, [(index_2, inner_product)])

    def weighted_sum(inp, w_arr):
        """
        compute the weighted sum of matrix values for a set of indices and weights.
        Note it is fine using a list comprehension here since the number of prototypes m << |X^{(1)}|.
        """
        return float(np.sum(np.array([x[1] for x in inp]) * w_arr))

    def udf_weighted_sum(w_arr):
        """
        UDF instance of the weighted_sum function.
        """
        return F.udf(lambda l: ProtoDash.weighted_sum(l, w_arr))

    def merge_lists(x, y):
        """
        merge lists.
        """
        return sorted(x + y, key=lambda tup: tup[0])

    # Create UDF corresponding to merge_lists function.
    DType = ArrayType(
        StructType(
            [StructField("_1", LongType()),
             StructField("_2", FloatType())]))
    udf_merge_lists = F.udf(merge_lists, DType)

    def optimize(K, u, opt_w0, init_val, max_w=10000):
        """
        Function solving quadratic optimization problem.        
        Args:
        -K: inner product matrix
        -u: mean inner product of each prototype
        -opt_w0: initial weights vector
        -init_val: starting run
        -max_w: an upper bound on weight value    
        Returns:
        -weights and the objective value
        """
        dim = u.shape[0]
        low_b = np.zeros((dim, 1))
        upper_b = max_w * np.ones((dim, 1))
        x_init = np.append(opt_w0, init_val / K[dim - 1, dim - 1])
        G = np.vstack((np.identity(dim), -1 * np.identity(dim)))
        h = np.vstack((upper_b, -1 * low_b))

        # solve constrained quadratic problem
        soltn = solve_qp(K,
                         -u,
                         G,
                         h,
                         A=None,
                         b=None,
                         solver='cvxopt',
                         initvals=x_init)

        # calculate the objective function value for optimal solution
        x_sol = soltn.reshape(soltn.shape[0], 1)
        q = -u.reshape(u.shape[0], 1)
        obj_value = 1 / 2 * np.matmul(np.matmul(x_sol.T, K),
                                      x_sol) + np.matmul(q.T, x_sol)

        return (soltn, obj_value[0, 0])

    def ProtoDashAlgoritm(X, Y, m, sigma, partitions=20, verbose=True):
        """
        Implementation of the ProtoDash algorithm
        
        Args:
        -X (RDD of indexed DenseVector rows): Target dataset/ the dataset to be represented.
        -Y (RDD of indexed DenseVector rows): Source dataset/ the dataset to select prototypes from.
        -m (integer): total number of prototypes to select.
        -sigma (strictly positive float): gaussian kernel parameter.
        -partitions (integer): number of RDD partitions to compute inner product RDDs with.
        -verbose (boolean): whether or not to print the cumulative number of prototypes selected at each iteration.
        
        Returns:
        -L (integer list): the set of indices corresponding to selected prototypes.
        -w (float list): the optimal set of weights corresponding to each selected prototype.
        """

        # get count of observations in X
        n_X = X.count()

        # build mu DataFrame
        mu_df = (Y.cartesian(X).map(lambda x: ProtoDash.mean_inner_product(
            x, sigma, n_X)).reduceByKey(lambda x, y: x + y).toDF(["obs",
                                                                  "mu"]))

        # initialise key variables
        L = np.zeros(m, dtype=int)  # set of prototype indices L
        w = np.zeros(m, dtype=float)  # set of optimal prototype weights
        f_eval = np.zeros(
            m, dtype=float)  # set of the f(w) eval. at prototype selection
        n_L = 0  # count of prototypes selected so far

        # find the index corresponding to the maximum mu value
        max_grad_0 = mu_df.orderBy(F.desc("mu")).limit(1).collect()[0]

        # collect values
        L[n_L] = max_grad_0.obs
        w[n_L] = max_grad_0.mu
        f_eval[n_L] = 1 / 2 * max_grad_0.mu**2
        n_L += 1

        # select the row of Y corresponding to the first chosen index
        Y_row_j0 = Y.filter(lambda x: x[1] == L[n_L - 1]).collect()[0]

        # take its inner product with all rows of Y to build the starting K dataframe
        K_init_df = (Y.map(lambda x: ProtoDash.inner_product(
            (x, Y_row_j0), sigma)).toDF(["obs", "K"]))

        # join mu and K dataframes
        join_df = (mu_df.join(K_init_df, "obs").repartition(partitions))

        # cache join_df as it is reused often
        join_df.cache()

        # compute the new gradient vector
        grad_df = (join_df.withColumn(
            "K_weighted",
            ProtoDash.udf_weighted_sum(w[:n_L])(F.col("K"))).withColumn(
                "grad",
                F.col("mu") - F.col("K_weighted")).select("obs", "grad"))

        # begin while loop
        while n_L < m:

            # remove the rows that have an index already included in L
            grad_df = grad_df.filter(
                ~grad_df.obs.isin([int(x) for x in L[:n_L]]))

            # find the row that has the maximum value in the filtered gradient vector
            argmax_grad = grad_df.orderBy(F.desc("grad")).limit(1).collect()[0]

            # update L
            L[n_L] = argmax_grad.obs

            # select the row of Y corresponding to the chosen index
            Y_row_j = Y.filter(lambda x: x[1] == L[n_L]).collect()[0]

            # take its inner product with all rows of Y to build new K
            K_int_df = (Y.map(lambda x: ProtoDash.inner_product(
                (x, Y_row_j), sigma)).toDF(["obs", "new_K_col"]))

            # add new K col to previous K col
            join_df = (join_df.join(K_int_df, "obs").withColumn(
                "K_merged",
                ProtoDash.udf_merge_lists(F.col("K"),
                                          F.col("new_K_col"))).select(
                                              "obs", "mu",
                                              "K_merged").withColumnRenamed(
                                                  "K_merged", "K"))

            # cache new joined_df
            join_df.cache()

            # increment n_L
            n_L += 1

            # sort L
            L[:n_L] = sorted(L[:n_L])

            if verbose is True and n_L % 5 == 0:
                print("Prototypes selected - " + str(n_L))

            # take max gradient val.
            max_grad = argmax_grad.grad

            # filter join dataframe for given indices in L
            filt_df = (join_df.filter(
                join_df.obs.isin([int(x) for x in L[:n_L]
                                  ])).orderBy(F.col("obs").asc()))

            # take mu vector
            mu_arr = np.array(filt_df.select("mu").collect(), dtype=float)

            # take K matrix
            K_mat = np.array(
                filt_df.rdd.map(lambda x: [y[1] for y in x[2]]).collect(),
                dtype=float)

            # find optimal weights for the index set L
            opt_res = ProtoDash.optimize(K_mat, mu_arr, w[:n_L - 1], max_grad)
            (w[:n_L], f_eval[n_L - 1]) = opt_res[0], -opt_res[1]

            # compute gradient vector with new optimal weights
            grad_df = (join_df.withColumn(
                "K_weighted",
                ProtoDash.udf_weighted_sum(w[:n_L])(F.col("K"))).withColumn(
                    "grad",
                    F.col("mu") - F.col("K_weighted")).select("obs", "grad"))

        # tuple of indices and their corresponding weight, sorted by weight in descending order.
        res = sorted([(w[i], L[i]) for i in range(m)], key=lambda tup: -tup[0])

        # return tuple of index set L and optimal weight set w, set of f_eval
        return res, f_eval

    #######################################################
    #########          RDF IMPLEMENTATION        ##########
    #######################################################

    rdf_dataset = None
    numeric_dataset = None
    subject_index = {}
    predicate_index = {}
    object_index = {}

    def infer_index(token, token_indices):
        """
        Enumerate the distinct tokens. If the token is found in the token_indices, then return it,
        else assign the next integer number (after the last assigned index) which is also the size of the token_indices.
        """
        if token in token_indices:
            return token_indices[token]
        else:
            token_index = len(token_indices)
            token_indices[token] = token_index
            return token_index

    def convert_rdf_to_ntriples(dataset):
        """
        Loads rdf data and converts into n-triples, treating each triple as a datapoint in the dataset.
        """
        g = Graph()
        g.load(dataset)

        rem_object = URIRef(
            "http://www.w3.org/2002/07/owl#NamedIndividual"
        )  # deleting the triples that have object value as 'owl#NamedIndividual'
        for s, p, o in g:
            g.remove((s, p, rem_object))

        global rdf_dataset
        global numeric_dataset

        # create n-triples of strings
        rdf_dataset = [(str(s), str(p), str(o)) for s, p, o in g]

        # preprocess and create a numeric dataset in order to input to ProtoDash
        numeric_dataset = list(
            map(
                lambda e:
                (ProtoDash.infer_index(e[0], ProtoDash.subject_index),
                 ProtoDash.infer_index(e[1], ProtoDash.predicate_index),
                 ProtoDash.infer_index(e[2], ProtoDash.object_index)),
                rdf_dataset))
        #print(rdf_dataset)
        #print('************************************')
        #print('Size of dataset:', len(rdf_dataset))
        #print('Subjects cardinality:', len(subject_index))
        #print('Predicates cardinality:', len(predicate_index))
        #print('Objects cardinality:', len(object_index))
        print('************************************')

        return numeric_dataset

    def strip_rdf_prefix(triple):
        """
        Strips the common URL-like prefixes from the RDF data and takes the suffix after '#'.
    
        Example:
    
        Input triple:  ('http://www.semanticweb.org/vinu/ontologies/2014/6/untitled-ontology-91#naomi_watts',
                        'http://www.semanticweb.org/vinu/ontologies/2014/6/untitled-ontology-91#acted_in',
                        'http://www.semanticweb.org/vinu/ontologies/2014/6/untitled-ontology-91#rabbits')
    
        Output: naomi_watts acted_in rabbits
        """
        return ' '.join(tuple(map(lambda e: e[e.find('#') + 1:], triple)))

    def get_sample_index(dataset, sample):

        # Function returns the index of the triple in the dataset
        global dataset_rdd
        dataset_rdd = ProtoDash.convert_rdf_to_ntriples(dataset)
        for i, triple in enumerate(rdf_dataset):
            if triple == sample:
                return i

    def get_rdf_prototypes(dataset, sample_triple, num_proto):

        # Index of the sample from the dataset that is handed to ProtoDash as the dataset to be explained.
        # The prototypes that come out of ProtoDash can be thought of as the cluster this sample belongs to.
        # Or, vice versa, the sampled datapoint can be thought of as a cluster centroid, and the explaining
        # prototypes as the data that belong to that cluster.
        sample_index = ProtoDash.get_sample_index(dataset, sample_triple)

        if sample_index is not None:
            # Create a target dataset comprising of the selected sample
            target = [numeric_dataset[sample_index]]
            # Create a source dataset comprising of all triples but the selected sample
            source = numeric_dataset[:sample_index] + numeric_dataset[
                sample_index + 1:]

            # Convert the datasets to PySpark RDDs
            target_rdd = ProtoDash.create_vec_rdd(target)
            source_rdd = ProtoDash.create_vec_rdd(source)

            print('Starting ProtoDash on RDF')
            res, f = ProtoDash.ProtoDashAlgoritm(target_rdd,
                                                 source_rdd,
                                                 num_proto,
                                                 50,
                                                 partitions=4,
                                                 verbose=True)[:2]
            print('Finished ProtoDash on RDF')

            print('The chosen sample_index:', sample_index)

            # Raw RDF triples have long common prefixes; for the sake of presentation (to keep it short),
            # I strip the common URL-like prefixes and take the suffix after '#' - the data that matters.
            stripped_target = ProtoDash.strip_rdf_prefix(
                rdf_dataset[sample_index])

            # Print the target datapoint
            print('Target (sampled) datapoint: ', stripped_target)

            # create the axes of the plot.
            # The result (res) that comes from ProtoDash is a list of (weight, index) pairs;
            # the index is used to look up the triple from the raw dataset (the bar label)
            # and the weight gives the bar length.
            values = list(map(lambda e: e[0], res))  # e[0] is weight
            names = list(map(lambda e: rdf_dataset[e[1]],
                             res))  # e[1] is index
            # strip the names to fit into the plot
            names = list(map(ProtoDash.strip_rdf_prefix, names))

            plt.barh(names, values)
            plt.title(stripped_target)
            plt.show()
        else:
            print("Please enter a valid triple")

    def ProtoDashOnRDF(dataset, num_proto, sample_triple):

        # dataset: path to the file
        # num_proto: number of prototypes for ProtoDash to select
        # sample_triple: a comma-separated string referring to the triple to be explained

        if os.path.isfile(dataset):
            if num_proto.isdigit():
                sample_triple = tuple(sample_triple.split(','))
                ProtoDash.get_rdf_prototypes(dataset, sample_triple,
                                             int(num_proto))
            else:
                print("Number of prototypes can be only integer")
        else:
            print("File do not exists")

    #######################################################
    #########         Image IMPLEMENTATION       ##########
    #######################################################

    # collect MNIST train/test sets
    train_images = np.array(mnist.train_images(), dtype='float')
    train_labels = mnist.train_labels()

    test_images = np.array(mnist.test_images(), dtype='float')
    test_labels = mnist.test_labels()

    def create_target_set(labels, images, digit, target_n, percentage):
        """
        This function creates a MNIST image dataset in which a specified percentage of the total observations
        correspond to a specific digit, while the remaining observations correspond to other randomly
        chosen digits.
        
        Args:
        -labels: the digit label for each MNIST image.
        -images: the MNIST image.
        -digit: a digit between 0 and 9.
        -target_n: the number of total observations required in the target dataset.
        -percentage: the percentage of images in the target dataset that correspond to the specified digit.
        
        Returns:
        -the target images.
        
        """

        # take integer number of obs. corresponding to digit
        n_dig = int(np.floor(percentage * target_n))

        # get indices corresponding to digit
        idx = np.where(labels == digit)[0]

        # reduce indices to specific %
        idx_red = idx[:n_dig]

        # slice images with index and reshape
        target_set_dig = images[idx_red, :]
        target_set_dig = np.reshape(target_set_dig,
                                    (target_set_dig.shape[0], 28 * 28))

        # get remaining indices
        rem = target_n - n_dig
        rem_ind = np.setdiff1d(np.arange(len(labels)), idx_red)[:rem]

        # fill the remaining observations with images corresponding to other digits
        target_set_non_dig = images[rem_ind]
        target_set_non_dig = np.reshape(target_set_non_dig,
                                        (target_set_non_dig.shape[0], 28 * 28))

        # create the dataset
        target_set = np.vstack((target_set_non_dig, target_set_dig))

        # shuffle the rows so digit and non-digit images are interleaved
        arr = np.arange(target_n)
        np.random.shuffle(arr)

        return target_set[arr]

    def get_image_prototypes(num_proto, digit):

        part = 6  # number of Pyspark RDD partitions to use
        sigma = 50  # gaussian kernel parameter
        n_1 = 5420  # the number of observations in X_1
        n_2 = 1500  # the number of observations in X_2
        #percentages = [.3, .5, .7, .9, 1.]
        percentages = [1.]  # the percentage of X_1 that will correspond to the chosen digit

        # list of experiment results
        exp_1_res_list = []

        # list of f_eval sequences
        exp_1_f_eval_list = []

        # set source dataset and labels
        source_set = np.reshape(ProtoDash.test_images[:n_2], (n_2, 28 * 28))

        # select the target datasets
        target_set = ProtoDash.create_target_set(ProtoDash.train_labels,
                                                 ProtoDash.train_images, digit,
                                                 n_1, 1)

        # convert target and source datasets to RDDs
        target_rdd = ProtoDash.create_vec_rdd(target_set, part)
        source_rdd = ProtoDash.create_vec_rdd(source_set, part)

        # collect the indices of m prototypes along with their ascribed weight
        res, f = ProtoDash.ProtoDashAlgoritm(target_rdd,
                                             source_rdd,
                                             num_proto,
                                             sigma,
                                             partitions=part,
                                             verbose=True)[:2]

        # collect the results
        exp_1_res_list.append(res)
        exp_1_f_eval_list.append(f)

        fig, axes = plt.subplots(num_proto, 1, figsize=(12, 10), squeeze=False)

        for i in range(num_proto):
            for j in range(len(percentages)):
                axes[i][j].imshow(
                    np.reshape(source_set[exp_1_res_list[j][i][1], :],
                               (28, 28)))
                axes[i][j].get_xaxis().set_ticks([])
                axes[i][j].get_yaxis().set_ticks([])

        fig.suptitle("\n".join(
            wrap(
                "Top %d prototypes selected by ProtoDash corresponding to the digit %d"
                % (num_proto, digit), 60)),
                     fontsize=20)

        plt.show()
        sc.stop()  # stop the SparkContext created in create_spark_session

    def ProtoDashOnImage(digit, num_proto):

        # digit: the digit to be represented in the target dataset X_1
        # num_proto: number of prototypes for ProtoDash to select

        if digit.isdigit() and 0 <= int(digit) <= 9:
            if num_proto.isdigit():
                ProtoDash.get_image_prototypes(int(num_proto), int(digit))
            else:
                print("Please enter an integer value for number of prototypes")
        else:
            print("Please enter a digit between 0-9")
Example #3
def programaPrincipal():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "ciclistas",
        help="Nombre del archivo csv que contiene la lista de ciclistas")
    parser.add_argument(
        "rutas", help="Nombre del archivo csv que contiene la lista de rutas")
    parser.add_argument(
        "actividades",
        help="Nombre del archivo csv que contiene la lista de actividades")
    parser.add_argument("N",
                        help="Top N a consultar de ciclistas por provincia",
                        type=int)

    args = parser.parse_args()

    ciclista_schema = StructType([
        StructField('cedula', IntegerType()),
        StructField('nombre_Completo', StringType()),
        StructField('provincia', StringType()),
    ])

    ciclista_df = spark.read.csv(args.ciclistas,
                                 schema=ciclista_schema,
                                 header=False)

    #ciclista_df.show()

    ruta_schema = StructType([
        StructField('codigo', IntegerType()),
        StructField('nombre_Ruta', StringType()),
        StructField('kilometros', FloatType()),
    ])

    ruta_df = spark.read.csv(args.rutas, schema=ruta_schema, header=False)

    #ruta_df.show()

    actividad_schema = StructType([
        StructField('codigo_Ruta', IntegerType()),
        StructField('cedula_Ciclista', IntegerType()),
        StructField('fecha', DateType()),
    ])

    actividad_df = spark.read.csv(args.actividades,
                                  schema=actividad_schema,
                                  header=False)

    #actividad_df.show()

    ciclista_actividad_ruta_df = tarea1_funciones.join_dataframes(
        ciclista_df, ruta_df, actividad_df)

    print(
        "Dataframe que contiene el join de los 3 archivos: ciclista.csv, actividad.csv y ruta.csv:"
    )
    ciclista_actividad_ruta_df.show()

    ciclistas_kilometros_df = tarea1_funciones.obtener_kilometros_por_ciclista(
        ciclista_actividad_ruta_df)

    print(
        "Kilómetros recorridos por ciclista, por ruta, por provincia y por día:"
    )
    ciclistas_kilometros_df.show()

    # indicates the top N cyclists per province to retrieve
    N = args.N

    provincia_ciclistas_kilometros_total_df = tarea1_funciones.obtener_topN_ciclistas_por_provincia_en_total_de_kilometros(
        ciclistas_kilometros_df, N)
    print("Top", N, "de ciclistas por provincia, en total de kilómetros:")
    provincia_ciclistas_kilometros_total_df.show()

    provincia_ciclistas_kilometros_promedio_df = tarea1_funciones.obtener_topN_ciclistas_por_provincia_en_promedio_de_kilometros_por_dia(
        ciclistas_kilometros_df, N)
    print("Top", N,
          "de ciclistas por provincia, en promedio de kilómetros por día:")
    provincia_ciclistas_kilometros_promedio_df.show()

    top_N_ciclistas_por_provincia = tarea1_funciones.unir_dataframes_Top_N_ciclistas_por_provincia(
        provincia_ciclistas_kilometros_total_df,
        provincia_ciclistas_kilometros_promedio_df)
    print(
        "Top", N,
        "de ciclistas por provincia, tanto en total de kilómetros como en promedio de kilómetros por día:"
    )
    top_N_ciclistas_por_provincia.show()
    remove_quotes = lambda airport_string: airport_string.replace('"', '')

    def is_filtered_airport(airport):
        return len(airport) == 3 and airport[0] == airportarg

    def split_airport_and_airport_string_into_origin_aiport_and_dest_airport_tuple(
            airport_and_airport_string, average_delay):

        [airport, carrier] = airport_and_airport_string.split(" ")
        return [airport, carrier, average_delay]

    top10AirportsPerAirportSchema = StructType([
        StructField('Airport', StringType(), True),
        StructField('Carrier', StringType(), True),
        StructField('Average Departure Delay', FloatType(), True)
    ])

    lines = file.rdd \
        .cache() \
        .keys() \
        .map(lambda l: l.split("\t")) \
        .map(lambda t: [remove_quotes(t[0]), float(t[1])]) \
        .map(lambda t: split_airport_and_airport_string_into_origin_aiport_and_dest_airport_tuple(t[0], t[1])) \
        .filter(lambda t: is_filtered_airport(t))

    df = spark \
        .createDataFrame(lines, schema=top10AirportsPerAirportSchema) \
        .orderBy(["Airport", "Average Departure Delay"])

    df.coalesce(1) \
Example #5
)

# In[23]:

observation_with_bundle.registerTempTable("ob_with_bundle")
observation_with_bundle_rdd = observation_with_bundle.rdd.coalesce(
    16)  #decrease num of partitions to make collection faster
observation_with_bundle_rdd.persist()
observation_with_bundle_rdd.getNumPartitions()

# In[3]:

from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.functions import udf

changetofloat = udf(lambda s: float(s), FloatType())

#change normalized_price to float. It was string type before.
ob_with_bundle_float = observation_with_bundle.withColumn(
    "n_price_float", changetofloat(observation_with_bundle.normalized_price))
ob_with_bundle_float.registerTempTable('ob_with_bundle_float')

# In[29]:

df_zc_income_SD.printSchema()

# get average of normalized price for each bundle based on normalized_size_units
ob_with_bundle_float_group = ob_with_bundle_float.groupBy(
    'bundle', 'normalized_size_units')
ob_average = ob_with_bundle_float_group.avg('n_price_float').withColumnRenamed(
    "AVG(n_price_float)", "avg_nprice")
Example #6
def test_select_subset_of_columns_as_entity_primary_keys(
    spark: SparkSession,
    composite_entity_schema: StructType,
    customer_feature_schema: StructType,
):
    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2)),
        (2001, 8002, datetime(year=2020, month=9, day=2)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    feature_table_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            400.0,
        ),
    ]
    feature_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(feature_table_data),
        customer_feature_schema)
    feature_table = FeatureTable(
        name="transactions",
        features=[Field("daily_transactions", "double")],
        entities=[Field("customer_id", "int32")],
    )

    joined_df = as_of_join(
        entity_df,
        "event_timestamp",
        feature_table_df,
        feature_table,
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])
    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
        (
            2001,
            8002,
            datetime(year=2020, month=9, day=2),
            400.0,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #7
def test_historical_feature_retrieval(spark: SparkSession):
    test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    entity_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path":
            f"file://{path.join(test_data_dir,  'customer_driver_pairs.csv')}",
            "event_timestamp_column": "event_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    booking_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path": f"file://{path.join(test_data_dir,  'bookings.csv')}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    transaction_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path": f"file://{path.join(test_data_dir,  'transactions.csv')}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    booking_table = {
        "name": "bookings",
        "entities": [{
            "name": "driver_id",
            "type": "int32"
        }],
        "features": [{
            "name": "completed_bookings",
            "type": "int32"
        }],
    }
    transaction_table = {
        "name": "transactions",
        "entities": [{
            "name": "customer_id",
            "type": "int32"
        }],
        "features": [{
            "name": "daily_transactions",
            "type": "double"
        }],
        "max_age": 86400,
    }

    joined_df = retrieve_historical_features(
        spark,
        entity_source,
        [transaction_source, booking_source],
        [transaction_table, booking_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
        StructField("bookings__completed_bookings", IntegerType()),
    ])

    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=2),
            100.0,
            300,
        ),
        (
            1001,
            8002,
            datetime(year=2020, month=9, day=2),
            100.0,
            500,
        ),
        (
            1001,
            8002,
            datetime(year=2020, month=9, day=3),
            None,
            500,
        ),
        (
            2001,
            8002,
            datetime(year=2020, month=9, day=3),
            None,
            500,
        ),
        (
            2001,
            8002,
            datetime(year=2020, month=9, day=4),
            None,
            500,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #8
        StructField("RelatedImages",StringType(),True),
        StructField("SocialImageEmbeds",StringType(),True),
        StructField("SocialVideoEmbeds",StringType(),True),
        StructField("Quotations",StringType(),True),
        StructField("AllNames",StringType(),True),
        StructField("Amounts",StringType(),True),
        StructField("TranslationInfo",StringType(),True),
        StructField("Extras",StringType(),True)
        ])

EVENTS_SCHEMA = StructType([
    StructField("GLOBALEVENTID",LongType(),True),
    StructField("Day_DATE",LongType(),True),
    StructField("MonthYear_Date",StringType(),True),
    StructField("Year_Date",StringType(),True),
    StructField("FractionDate",FloatType(),True),
    StructField("Actor1Code",StringType(),True),
    StructField("Actor1Name",StringType(),True),
    StructField("Actor1CountryCode",StringType(),True),
    StructField("Actor1KnownGroupCode",StringType(),True),
    StructField("Actor1EthnicCode",StringType(),True),
    StructField("Actor1Religion1Code",StringType(),True),
    StructField("Actor1Religion2Code",StringType(),True),
    StructField("Actor1Type1Code",StringType(),True),
    StructField("Actor1Type2Code",StringType(),True),
    StructField("Actor1Type3Code",StringType(),True),
    StructField("Actor2Code",StringType(),True),
    StructField("Actor2Name",StringType(),True),
    StructField("Actor2CountryCode",StringType(),True),
    StructField("Actor2KnownGroupCode",StringType(),True),
    StructField("Actor2EthnicCode",StringType(),True),
def get_structfield(colname):
    if colname in ['ARR_DELAY', 'DEP_DELAY', 'DISTANCE', 'TAXI_OUT', 'DEP_AIRPORT_TZOFFSET']:
        return StructField(colname, FloatType(), True)
    else:
        return StructField(colname, StringType(), True)
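
A short usage sketch for get_structfield, assuming the pyspark.sql.types imports above and a SparkSession named spark; the header list and CSV path are hypothetical.

# Hypothetical header of a flights CSV: the columns named inside get_structfield
# become FloatType, everything else stays StringType.
header = 'FL_DATE,ORIGIN,DEST,DEP_DELAY,TAXI_OUT,ARR_DELAY,DISTANCE'.split(',')
schema = StructType([get_structfield(colname) for colname in header])

flights = spark.read.csv('flights.csv', header=True, schema=schema)  # hypothetical path
flights.printSchema()
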
bankProspectsDF.show()

"""## Remove the record with unknow value in country column"""

bankProspectsDF1 = bankProspectsDF.filter(bankProspectsDF['country'] != "unknown")

bankProspectsDF1.show()

"""##  Cast the String datatype to Integer/Float"""

bankProspectsDF1.printSchema()

from pyspark.sql.types import IntegerType,FloatType

bankProspectsDF2 = bankProspectsDF1.withColumn("age", bankProspectsDF1["age"].cast(IntegerType())).withColumn("salary", bankProspectsDF1["salary"].cast(FloatType()))

bankProspectsDF2.printSchema()

"""## Replace Age and Salary with average values of their respective column

import mean from sql.fuctions
"""

from pyspark.sql.functions import mean

"""### Calculate "mean" value of the age"""

mean_age_val = bankProspectsDF2.select(mean(bankProspectsDF2['age'])).collect()

type(mean_age_val)
Example #11
    def run(self, input_dict=None, block_params=None, program_arguments=None):
        try:
            t1 = time.time()
            output_dict = dict()

            configs = input_dict["Config"]
            # test_args = {'spark.app.name': 'spark_app_test', 'spark.shuffle.service.enabled': 'true', 'spark.dynamicAllocation.minExecutors': '1', 'spark.dynamicAllocation.enabled': 'true'}

            queue_dict = {}

            queue_dict['left_df'] = input_dict['leftData']['queueTopicName']
            queue_dict['right_df'] = input_dict['rightData']['queueTopicName']

            kafka_handler = sdk.kafka_handler(None)
            kafka_api_instance = kafka_handler.get_api_instance()
            channels = {}

            for key, topic in queue_dict.items():
                consumer_pool = {
                    "count": 1,
                    "groupId": str(uuid.uuid4()),
                    "registerId": "",
                    "topicsListToSubscribe": [topic]
                }

                try:
                    consumer_pool_res = kafka_api_instance.create_consumer_list_using_post(
                        consumer_pool)
                    channels[key] = consumer_pool_res.result
                except Exception as e:
                    self.logger.error(
                        "Error Trying To Create a Consumer Of Topic:" +
                        str(topic))
                    self.block_status = "FAILED"
                    raise e

            optional_param = {}
            optional_param['queue_dict'] = queue_dict
            optional_param["api_instance"] = kafka_api_instance
            optional_param["channels"] = channels

            self.spark = SparkConfCustom().get_spark_session()
            self.spark.sparkContext.setLogLevel('ERROR')

            self.spark_schema = {}
            self.field_list = {}
            print('waiting')
            # time.sleep(200)

            arrary_of_threads = []
            for key, topic in queue_dict.items():
                req = {"topicName": topic}
                try:
                    schema = kafka_api_instance.get_topic_meta_using_post(req)
                    schema = json.loads(json.loads(schema.result)["schema"])
                    optional_param['schema'] = schema
                    self.logger.debug("Schema Received")
                except Exception as e:
                    self.logger.error("Error Fetching Schema")
                    self.logger.error(str(e))
                    self.logger.error(traceback.format_exc())
                    self.block_status = "FAILED"
                    raise e

                col_names = schema.keys()
                parsed_schema_dict = {}

                for name in col_names:
                    values = schema.get(name)
                    parsed_schema_dict[name] = values['type']

                self.logger.info("schemaaaa hereeee")
                self.logger.info(schema)
                self.logger.info(parsed_schema_dict)

                self.list_of_struct_fields = []
                self.field_list[key] = []

                for name in parsed_schema_dict.keys():
                    if parsed_schema_dict[name] == 'FloatType()':
                        self.field_list[key].append('float')
                        self.list_of_struct_fields.append(
                            StructField(name, FloatType(), True))
                    elif parsed_schema_dict[name] == 'IntegerType()':
                        self.field_list[key].append('int')
                        self.list_of_struct_fields.append(
                            StructField(name, IntegerType(), True))
                    elif parsed_schema_dict[name] == 'DoubleType()':
                        self.field_list[key].append('float')
                        self.list_of_struct_fields.append(
                            StructField(name, DoubleType(), True))
                    else:
                        self.field_list[key].append('string')
                        self.list_of_struct_fields.append(
                            StructField(name, StringType(), True))

                self.spark_schema[key] = StructType(self.list_of_struct_fields)
                fpath = '/bigbrain/' + str(key)
                if os.path.exists(fpath):
                    rmtree(fpath)
                os.makedirs(fpath, exist_ok=True)
                t = self.ReadRecords(self.spark, topic, key,
                                     self.spark_schema[key], input_dict,
                                     block_params, optional_param,
                                     self.field_list)
                t.start()
                arrary_of_threads.append(t)

            for t in arrary_of_threads:
                t.join()

            print('Both topics read done')

            # self.stream_block(input_dict=input_dict, block_params=block_params, optional_arg=optional_param)

            self.left_df = self.spark.read.parquet('/bigbrain/left_df')
            print(self.left_df.count())
            self.right_df = self.spark.read.parquet('/bigbrain/right_df')
            print(self.right_df.count())
            exec("self.resultant_join_df" + "=" +
                 "self.left_df.join(self.right_df,self.left_df['" +
                 configs['unique_key_left'] + "']== self.right_df['" +
                 configs['unique_key_right'] + "'] ,how='" +
                 configs['join_type'] + "')")

            print(self.left_df.rdd.getNumPartitions())
            print(self.right_df.rdd.getNumPartitions())

            # de-duplicate column names produced by the join: rename the first
            # occurrence of any duplicated column with a '_1' suffix
            new_column_name_list = self.resultant_join_df.columns
            for col in new_column_name_list:
                count = new_column_name_list.count(col)
                if count > 1:
                    idx = new_column_name_list.index(col)
                    new_column_name_list[idx] = col + '_1'

            print(self.resultant_join_df.columns)
            self.resultant_join_df = self.resultant_join_df.toDF(
                *new_column_name_list)
            print(self.resultant_join_df.columns)

            temp_fp = '/bigbrain/' + str(t1) + '.csv'
            print(temp_fp)
            # os.makedirs(temp_fp, exist_ok=True)
            temp_join_time_st = time.time()
            self.resultant_join_df.write.mode("overwrite").option(
                "header", "true").csv(temp_fp)
            temp_join_time_end = time.time()
            print('Time for Join: ' +
                  str(temp_join_time_end - temp_join_time_st) +
                  ', File Partitions' +
                  str(self.resultant_join_df.rdd.getNumPartitions()))

            self.logger.info("*****************************")
            self.logger.info("Join Completed")
            # self.logger.info("Count: " + str(self.resultant_join_df.count()))

            self.data_target_params = input_dict["DataTarget"]
            try:
                self.validate_target_params()
            except Exception as e:
                self.logger.error(str(e))
                raise e

            try:
                self.client = self.validate_hdfs_connection(
                    input_dict=input_dict, block_params=block_params)
                self.data_target_params[
                    'fileWithFullPath'] = self.data_target_params['filePath']
                exists = self.file_exits(hdfs_connection=self.client)
                if exists:
                    if self.data_target_params['overwrite']:
                        # remove file
                        self.delete_file(hdfs_connection=self.client)
                        self.append = False
                    else:
                        raise FileExistsError(
                            "File Already Exists: " +
                            str(self.data_target_params['filePath']))
            except Exception as e:
                self.logger.error(str(e))
                raise e

            write_start_time = time.time()
            self.logger.info("Writing to HDFS:")
            self.block_folder_write(self.client, temp_fp)
            # if os.path.isdir(temp_fp):
            #     self.logger.info("dir")
            #     for filename in os.listdir(temp_fp):
            #         print(filename)
            #         if filename.endswith(".csv"):
            #             csv_path = temp_fp + '/' + filename
            #             print(csv_path)
            #             self.block_line_write(self.client, csv_path)
            # else:
            #     self.block_line_write(self.client, temp_fp)

            print('Time for Join: ' +
                  str(temp_join_time_end - temp_join_time_st))

            self.logger.info("Time taken to write to HDFS: " +
                             str(time.time() - write_start_time))

            self.logger.info("Output:")
            self.logger.info(json.dumps(output_dict, indent=2))

            output_dict["queueTopicName"] = ''
            output_dict['readerInfo'] = None
            output_dict['readerInfoError'] = None
            output_dict["infoKeys"] = None

            self.logger.info("Output:")
            self.logger.info(json.dumps(output_dict, indent=2))

            return output_dict

        except Exception as e:
            self.logger.error(traceback.format_exc())
            self.block_status = "FAILED"
            raise e
from pyspark.sql import DataFrame
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType, ArrayType, LongType


@udf(ArrayType(FloatType()))
def assemble_features(*cols):
    return [x for x in cols]


@udf(LongType())
def transform_label(cl):
    class_to_label = {
        'Iris-setosa': 0,
        'Iris-versicolor': 1,
        'Iris-virginica': 2,
    }

    return class_to_label[cl]


def transform_iris_data(data: DataFrame):
    data = (data.withColumn(
        'features',
        assemble_features('sepal-length', 'sepal-width',
                          'petal-length', 'petal-width')).withColumn(
                              'label', transform_label('class')).select(
                                  'features', 'label'))

    return data
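
A brief usage sketch for the UDFs and transform_iris_data above, assuming an active SparkSession; the sample rows and column names below are illustrative and simply mirror the Iris layout the UDFs expect.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("iris_transform_demo").getOrCreate()

# Two illustrative Iris rows with the column names expected by the UDFs above.
iris_df = spark.createDataFrame(
    [
        (5.1, 3.5, 1.4, 0.2, 'Iris-setosa'),
        (6.7, 3.0, 5.2, 2.3, 'Iris-virginica'),
    ],
    ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'],
)

transformed = transform_iris_data(iris_df)
transformed.show(truncate=False)  # columns: features (array<float>), label (bigint)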
Example #13
tweets = tweets \
   .withColumn('soup_text', decode_html_udf('text')) \
   .withColumn('han_rem', regexp_replace(col('soup_text'), handles_pat, '')) \
   .withColumn('tag_rem', regexp_replace(col('han_rem'), hashtag_pat, '')) \
   .withColumn('http_rem', regexp_replace(col('tag_rem'), http_pat, '')) \
   .withColumn('www_rem', regexp_replace(col('http_rem'), www_pat, '')) \
   .withColumn('utf_text', decode_utf_udf('www_rem')) \
   .withColumn('neg_handle', regexp_replace(col('utf_text'), r"won't", 'will not')) \
   .withColumn('neg_handle', regexp_replace(col('neg_handle'), r"can't", 'can not')) \
   .withColumn('neg_handle', regexp_replace(col('neg_handle'), r"n't", ' not')) \
   .withColumn('sp_rem', regexp_replace(col('neg_handle'), sp_pat, ' ')) \
   .withColumn('low_text', lower(col('sp_rem'))) \
   .withColumn('cleaned', rem_space_udf('low_text')) \
   .selectExpr('cleaned as text', 'tags')

neg_tweets_udf = udf(lambda x: 0.0 if x == 1.0 else 1.0, FloatType())

model = PipelineModel.load('./tweets_analyzer.model')
predictions = model.transform(tweets) \
       .select('tags', 'prediction') \
       .withColumn('hashtag', explode(col('tags'))) \
       .withColumn('pos_tweet', col('prediction')) \
       .withColumn('neg_tweet', neg_tweets_udf('prediction')) \
       .groupby('hashtag') \
       .agg(psf.sum('pos_tweet').alias('pos_tweets'), psf.sum('neg_tweet').alias('neg_tweets'), psf.count('pos_tweet').alias('total_tweets'))


predictions.writeStream \
  .outputMode('complete') \
  .format('console') \
  .option('truncate', False) \
Example #14
    def convert(self, ma_field: ma_fields.Field) -> DataType:
        return FloatType()
Example #15
def test_join_with_max_age(
    spark: SparkSession,
    single_entity_schema: StructType,
    customer_feature_schema: StructType,
):
    entity_data = [
        (1001, datetime(year=2020, month=9, day=1)),
        (1001, datetime(year=2020, month=9, day=3)),
        (2001, datetime(year=2020, month=9, day=2)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), single_entity_schema)

    feature_table_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            100.0,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            200.0,
        ),
    ]
    feature_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(feature_table_data),
        customer_feature_schema)
    feature_table = FeatureTable(
        name="transactions",
        features=[Field("daily_transactions", "double")],
        entities=[Field("customer_id", "int32")],
        max_age=86400,
    )

    joined_df = as_of_join(
        entity_df,
        "event_timestamp",
        feature_table_df,
        feature_table,
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])
    expected_joined_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            100.0,
        ),
        (1001, datetime(year=2020, month=9, day=3), None),
        (
            2001,
            datetime(year=2020, month=9, day=2),
            200.0,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #16
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

# We drop the time stamp
df = df.drop("timestamp")

# We convert to ints
df = df.withColumn("userId", df["userId"].cast(IntegerType()))
df = df.withColumn("movieId", df["movieId"].cast(IntegerType()))
df = df.withColumn("rating", df["rating"].cast(FloatType()))


# COMMAND ----------

# First we compute the unique users

unique_usr = df.select('userId').distinct().collect()
unique_usr = [row.asDict()["userId"] for row in unique_usr]

usr_to_emb = {usr : i for i,usr in enumerate(unique_usr)}
emb_to_usr = {i : usr for i,usr in enumerate(unique_usr)}

unique_movie = df.select('movieId').distinct().collect()
unique_movie = [int(row.asDict()["movieId"]) for row in unique_movie]
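
One way the lookup tables above might be applied back to the ratings DataFrame, e.g. to feed an embedding- or ALS-style model that wants contiguous 0-based ids. This is a sketch and an assumption, not part of the original notebook.

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

movie_to_emb = {movie: i for i, movie in enumerate(unique_movie)}

usr_to_emb_udf = udf(lambda u: usr_to_emb[u], IntegerType())
movie_to_emb_udf = udf(lambda m: movie_to_emb[m], IntegerType())

# Replace raw ids with their contiguous indices.
df_indexed = df \
    .withColumn("userIdx", usr_to_emb_udf(df["userId"])) \
    .withColumn("movieIdx", movie_to_emb_udf(df["movieId"]))

df_indexed.show(5)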
Example #17
def test_join_with_composite_entity(
    spark: SparkSession,
    composite_entity_schema: StructType,
    rating_feature_schema: StructType,
):
    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=1)),
        (1001, 8002, datetime(year=2020, month=9, day=3)),
        (1001, 8003, datetime(year=2020, month=9, day=1)),
        (2001, 8001, datetime(year=2020, month=9, day=2)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    feature_table_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            3.0,
            5.0,
        ),
        (
            1001,
            8002,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            4.0,
            3.0,
        ),
        (
            2001,
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            4.0,
            4.5,
        ),
    ]
    feature_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(feature_table_data),
        rating_feature_schema,
    )
    feature_table = FeatureTable(
        name="ratings",
        features=[
            Field("customer_rating", "double"),
            Field("driver_rating", "double")
        ],
        entities=[Field("customer_id", "int32"),
                  Field("driver_id", "int32")],
        max_age=86400,
    )

    joined_df = as_of_join(
        entity_df,
        "event_timestamp",
        feature_table_df,
        feature_table,
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("ratings__customer_rating", FloatType()),
        StructField("ratings__driver_rating", FloatType()),
    ])
    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=1),
            3.0,
            5.0,
        ),
        (1001, 8002, datetime(year=2020, month=9, day=3), None, None),
        (1001, 8003, datetime(year=2020, month=9, day=1), None, None),
        (
            2001,
            8001,
            datetime(year=2020, month=9, day=2),
            4.0,
            4.5,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #18
def set_col(df, columns, func, data_type, summary):
    dict_types = {
        'string': StringType(),
        'str': StringType(),
        'integer': IntegerType(),
        'int': IntegerType(),
        'float': FloatType(),
        'double': DoubleType(),
        'Double': DoubleType()
    }
    types = {
        'string': 'string',
        'str': 'string',
        'String': 'string',
        'integer': 'int',
        'int': 'int',
        'float': 'float',
        'double': 'double',
        'Double': 'double'
    }

    try:
        function = udf(func, dict_types[data_type])
    except KeyError:
        assert False, "Error, data_type not recognized"

    assert_type_str_or_list(df, columns, "columns")

    # Keep only the columns whose current dtype matches the requested data_type
    valid_cols = [
        c for (c, t) in filter(lambda t: t[1] == types[data_type], df.dtypes)
    ]

    if columns == "*":
        columns = valid_cols[:]

    if isinstance(columns, str):
        columns = [columns]

    assert_cols_in_df(df, columns_provided=columns, columns_df=df.columns)
    col_not_valids = set(columns).difference(set(valid_cols))
    assert (
        col_not_valids == set()
    ), 'Error: The following columns do not have same datatype argument provided: %s' % col_not_valids

    oldUnique = [find_unique(df, column=c) for c in columns]
    exprs = [
        function(col(c)).alias(c) if c in columns else c
        for (c, t) in df.dtypes
    ]
    newDF = df.select(*exprs)

    if summary:
        newUnique = [find_unique(newDF, column=c) for c in columns]
        count = int(totChanges(oldUnique, newUnique))
        summary = sqlContext.createDataFrame([(count, )], [
            'Total Cells Modified',
        ])
        return (newDF, summary)
    return newDF
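
A brief usage sketch for set_col, assuming a SQLContext named sqlContext and the helper functions it relies on (assert_type_str_or_list, assert_cols_in_df, find_unique, totChanges) are defined elsewhere in the module; the DataFrame and the trimming function are hypothetical.

# Hypothetical DataFrame with a string column that needs cleaning.
people_df = sqlContext.createDataFrame(
    [('  Alice ', 30), ('Bob  ', 25)], ['name', 'age'])

# Strip surrounding whitespace from every string column ("*" selects all
# columns whose dtype matches data_type) and request a change summary.
trimmed_df, changes = set_col(people_df,
                              columns="*",
                              func=lambda s: s.strip() if s is not None else s,
                              data_type="string",
                              summary=True)

trimmed_df.show()
changes.show()  # single-row DataFrame: 'Total Cells Modified'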
Example #19
def test_multiple_join(
    spark: SparkSession,
    composite_entity_schema: StructType,
    customer_feature_schema: StructType,
    driver_feature_schema: StructType,
):

    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2)),
        (1001, 8002, datetime(year=2020, month=9, day=2)),
        (2001, 8002, datetime(year=2020, month=9, day=3)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    customer_table_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            100.0,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            200.0,
        ),
    ]
    customer_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(customer_table_data),
        customer_feature_schema)
    customer_table = FeatureTable(
        name="transactions",
        features=[Field("daily_transactions", "double")],
        entities=[Field("customer_id", "int32")],
        max_age=86400,
    )

    driver_table_data = [
        (
            8001,
            datetime(year=2020, month=8, day=31),
            datetime(year=2020, month=8, day=31),
            200,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            300,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            600,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            500,
        ),
    ]
    driver_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(driver_table_data),
        driver_feature_schema)

    driver_table = FeatureTable(
        name="bookings",
        features=[Field("completed_bookings", "int32")],
        entities=[Field("driver_id", "int32")],
    )

    joined_df = join_entity_to_feature_tables(
        entity_df,
        "event_timestamp",
        [customer_table_df, driver_table_df],
        [customer_table, driver_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
        StructField("bookings__completed_bookings", IntegerType()),
    ])

    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=2),
            100.0,
            300,
        ),
        (
            1001,
            8002,
            datetime(year=2020, month=9, day=2),
            100.0,
            500,
        ),
        (
            2001,
            8002,
            datetime(year=2020, month=9, day=3),
            None,
            500,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #20
0
df.show(n=5)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("                             Chacking data types")
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
var1 = df.schema
for i in var1:
    print(i)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("                             Changing data type to float")
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
dfnew = df.withColumn("Largest Property Use Type - Gross Floor Area (ft)",
                      df["Largest Property Use Type - Gross Floor Area (ft)"].cast(FloatType())) \
    .withColumn("2nd Largest Property Use - Gross Floor Area (ft)",
                df["2nd Largest Property Use - Gross Floor Area (ft)"].cast(FloatType())) \
    .withColumn("3rd Largest Property Use Type - Gross Floor Area (ft)",
                df["3rd Largest Property Use Type - Gross Floor Area (ft)"].cast(FloatType())) \
    .withColumn("Site EUI (kBtu/ft)", df["Site EUI (kBtu/ft)"].cast(FloatType())) \
    .withColumn("Weather Normalized Site EUI (kBtu/ft)", df["Weather Normalized Site EUI (kBtu/ft)"].cast(FloatType())) \
    .withColumn("Weather Normalized Site Electricity Intensity (kWh/ft)",
                df["Weather Normalized Site Electricity Intensity (kWh/ft)"].cast(FloatType())) \
    .withColumn("Weather Normalized Site Natural Gas Intensity (therms/ft)",
                df["Weather Normalized Site Natural Gas Intensity (therms/ft)"].cast(FloatType())) \
    .withColumn("Weather Normalized Source EUI (kBtu/ft)",
                df["Weather Normalized Source EUI (kBtu/ft)"].cast(FloatType())) \
    .withColumn("Water Intensity (All Water Sources) (gal/ft)",
                df["Water Intensity (All Water Sources) (gal/ft)"].cast(FloatType())) \
    .withColumn("Source EUI (kBtu/ft)", df["Source EUI (kBtu/ft)"].cast(FloatType())) \
Example #21
0
def to_state(iso_region):
    '''
    Split the state code from the airport ISO region
    Parameters:
        iso_region (str): Region like '{country}-{state}'
    '''
    return iso_region.strip().split('-')[-1]


udf_state = f.udf(lambda x: to_state(x), StringType())


def to_lat(coordinates):
    '''
    Split latitude from the airport coordinates
    Parameters: 
        coordinates (str): Coordinates like '{latitude}, {longitude}'
    '''
    return float(coordinates.strip().split(',')[0])


udf_lat = f.udf(lambda x: to_lat(x), FloatType())


def to_long(coordinates):
    '''
    Split longitude from the airport coordinates
    Parameters: 
        coordinates (str): Coordinates like '{latitude}, {longitude}'
    '''
    return float(coordinates.strip().split(',')[1])


udf_long = f.udf(lambda x: to_long(x), FloatType())

# Strip quotes from the country name
udf_strip_quotes = f.udf(lambda x: x.strip('\''), StringType())
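A hedged usage sketch for the UDFs above, assuming an airports DataFrame with iso_region, coordinates, and country columns (the DataFrame and column names are illustrative):

# Derive state, latitude, and longitude columns and clean the country name.
airports_df = (airports_df
               .withColumn("state", udf_state(f.col("iso_region")))
               .withColumn("latitude", udf_lat(f.col("coordinates")))
               .withColumn("longitude", udf_long(f.col("coordinates")))
               .withColumn("country", udf_strip_quotes(f.col("country"))))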
Example #22
0
    def transform_demographics_data(self):

        # read file
        demographics_df = self._spark_session.read.load(
            f"{self._processing_bucket}/us-cities-demographics.csv",
            format="csv",
            header="true",
            sep=";")

        # change column name and data type
        demographics_df = (
            demographics_df
            .withColumn("city", demographics_df["City"].cast(StringType()))
            .withColumn("median_age", demographics_df["Median Age"].cast(FloatType()))
            .withColumn("male_pop", demographics_df["Male Population"].cast(IntegerType()))
            .withColumn("female_pop", demographics_df["Female Population"].cast(IntegerType()))
            .withColumn("total_pop", demographics_df["Total Population"].cast(IntegerType()))
            .withColumn("num_vets", demographics_df["Number of Veterans"].cast(IntegerType()))
            .withColumn("foreign_born", demographics_df["Foreign-born"].cast(IntegerType()))
            .withColumn("avg_household_size", demographics_df["Average Household Size"].cast(FloatType()))
            .withColumn("state_code", demographics_df["State Code"].cast(StringType()))
            .withColumn("race", demographics_df["Race"].cast(StringType()))
            .withColumn("count", demographics_df["Count"].cast(IntegerType())))

        # choose columns to keep
        demographics_summary_df = demographics_df[[
            "state_code",
            "city",
            "median_age",
            "male_pop",
            "female_pop",
            "total_pop",
            "num_vets",
            "foreign_born",
            "avg_household_size",
        ]]
        # choose columns to keep
        demographics_race_df = demographics_df[[
            "state_code", "city", "race", "count"
        ]]

        # remove duplicates
        demographics_summary_df = demographics_summary_df.distinct()

        # write to csv
        demographics_summary_df.write.csv(
            path=f"{self._processed_bucket}/dim_cities_demographics_summary",
            mode="overwrite",
            header=True)
        demographics_race_df.write.csv(
            path=f"{self._processed_bucket}/dim_cities_demographics_race",
            mode="overwrite",
            header=True)
Example #23
0
def align_diff_frames(resolve_func,
                      this,
                      that,
                      fillna=True,
                      how="full",
                      preserve_order_column=False):
    """
    This method aligns two different DataFrames with a given `func`. Columns are resolved and
    handled within the given `func`.
    To use this, `compute.ops_on_diff_frames` should be True, for now.

    :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and
        the column of another DataFrame. It returns an iterable that produces Series.

        >>> from databricks.koalas.config import set_option, reset_option
        >>>
        >>> set_option("compute.ops_on_diff_frames", True)
        >>>
        >>> kdf1 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>> kdf2 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>>
        >>> def func(kdf, this_column_labels, that_column_labels):
        ...    kdf  # conceptually this is A + B.
        ...
        ...    # Within this function, Series from A or B can be performed against `kdf`.
        ...    this_label = this_column_labels[0]  # this is ('a',) from kdf1.
        ...    that_label = that_column_labels[0]  # this is ('a',) from kdf2.
        ...    new_series = (kdf[this_label] - kdf[that_label]).rename(str(this_label))
        ...
        ...    # This new series will be placed in new DataFrame.
        ...    yield (new_series, this_label)
        >>>
        >>>
        >>> align_diff_frames(func, kdf1, kdf2).sort_index()
           a
        0  0
        1  0
        2  0
        3  0
        4  0
        5  0
        6  0
        7  0
        8  0
        >>> reset_option("compute.ops_on_diff_frames")

    :param this: a DataFrame to align
    :param that: another DataFrame to align
    :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`.
        Otherwise, it returns them as they are.
    :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict.
        - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and
            'that_columns' in this function are B, C and B, C.
        - left: `resolve_func` should resolve columns including `that`'s columns.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is
            B, C but `that_columns` are B, C, D.
        - inner: Same as 'full' mode; however, internally performs inner join instead.
    :return: Aligned DataFrame
    """
    assert how == "full" or how == "left" or how == "inner"

    this_column_labels = this._internal.column_labels
    that_column_labels = that._internal.column_labels
    common_column_labels = set(this_column_labels).intersection(
        that_column_labels)

    # 1. Perform the join given two dataframes.
    combined = combine_frames(this,
                              that,
                              how=how,
                              preserve_order_column=preserve_order_column)

    # 2. Apply the given function to transform the columns in a batch and keep the new columns.
    combined_column_labels = combined._internal.column_labels

    that_columns_to_apply = []
    this_columns_to_apply = []
    additional_that_columns = []
    columns_to_keep = []
    column_labels_to_keep = []

    for combined_label in combined_column_labels:
        for common_label in common_column_labels:
            if combined_label == tuple(["this", *common_label]):
                this_columns_to_apply.append(combined_label)
                break
            elif combined_label == tuple(["that", *common_label]):
                that_columns_to_apply.append(combined_label)
                break
        else:
            if how == "left" and combined_label in [
                    tuple(["that", *label]) for label in that_column_labels
            ]:
                # In this case, we will drop `that_columns` in `columns_to_keep` but passes
                # it later to `func`. `func` should resolve it.
                # Note that adding this into a separate list (`additional_that_columns`)
                # is intentional so that `this_columns` and `that_columns` can be paired.
                additional_that_columns.append(combined_label)
            elif fillna:
                columns_to_keep.append(
                    F.lit(None).cast(FloatType()).alias(str(combined_label)))
                column_labels_to_keep.append(combined_label)
            else:
                columns_to_keep.append(
                    combined._internal.spark_column_for(combined_label))
                column_labels_to_keep.append(combined_label)

    that_columns_to_apply += additional_that_columns

    # Should extract columns to apply and do it in a batch in case
    # it adds new columns for example.
    if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0:
        kser_set, column_labels_applied = zip(*resolve_func(
            combined, this_columns_to_apply, that_columns_to_apply))
        columns_applied = [c.spark.column for c in kser_set]
        column_labels_applied = list(column_labels_applied)
    else:
        columns_applied = []
        column_labels_applied = []

    applied = combined[columns_applied + columns_to_keep]
    applied.columns = pd.MultiIndex.from_tuples(column_labels_applied +
                                                column_labels_to_keep)

    # 3. Restore the names back and deduplicate columns.
    this_labels = OrderedDict()
    # Add columns in an order of its original frame.
    for this_label in this_column_labels:
        for new_label in applied._internal.column_labels:
            if new_label[1:] not in this_labels and this_label == new_label[1:]:
                this_labels[new_label[1:]] = new_label

    # After that, we will add the rest columns.
    other_labels = OrderedDict()
    for new_label in applied._internal.column_labels:
        if new_label[1:] not in this_labels:
            other_labels[new_label[1:]] = new_label

    kdf = applied[list(this_labels.values()) + list(other_labels.values())]
    kdf.columns = kdf.columns.droplevel()
    return kdf
Example #24
0
def convert(
    spark: SparkSession,
    dataset_root: str,
    limit: int = 0,
    asset_dir: Optional[str] = None,
) -> DataFrame:
    """Convert a Coco Dataset into Rikai dataset.

    This function expects the COCO datasets are stored in directory with the
    following structure:

    - dataset
        - annotations
          - captions_train2017.json
          - instances_train2017.json
          - ...
        - train2017
        - val2017
        - test2017

    Parameters
    ----------
    spark : SparkSession
        A live spark session
    dataset_root : str
        The directory of dataset
    limit : int, optional
        The number of images of each split to be converted.
    asset_dir : str, optional
        The asset directory to store images, can be a s3 directory.

    Returns
    -------
    DataFrame
        Returns a Spark DataFrame
    """
    train_json = os.path.join(dataset_root, "annotations",
                              "instances_train2017.json")
    val_json = os.path.join(dataset_root, "annotations",
                            "instances_val2017.json")

    categories = load_categories(train_json)

    examples = []
    for split, anno_file in zip(["train", "val"], [train_json, val_json]):
        coco = COCO(annotation_file=anno_file)
        # Coco has native dependencies, so we do not distribute them
        # to the workers.
        image_ids = coco.imgs
        if limit > 0:
            image_ids = islice(image_ids, limit)
        for image_id in image_ids:
            ann_id = coco.getAnnIds(imgIds=image_id)
            annotations = coco.loadAnns(ann_id)
            annos = []
            for ann in annotations:
                bbox = Box2d(*ann["bbox"])
                annos.append({
                    "category_id": ann["category_id"],
                    "category_text": categories[ann["category_id"]]["name"],
                    "bbox": bbox,
                    "area": float(ann["area"]),
                })
            image_payload = coco.loadImgs(ids=image_id)[0]
            example = {
                "image_id": image_id,
                "annotations": annos,
                "image": Image(
                    os.path.abspath(
                        os.path.join(
                            os.curdir,
                            "dataset",
                            "{}2017".format(split),
                            image_payload["file_name"],
                        ))),
                "split": split,
            }
            examples.append(example)

    schema = StructType([
        StructField("image_id", LongType(), False),
        StructField(
            "annotations",
            ArrayType(
                StructType([
                    StructField("category_id", IntegerType()),
                    StructField("category_text", StringType()),
                    StructField("area", FloatType()),
                    StructField("bbox", Box2dType()),
                ])),
            False,
        ),
        StructField("image", ImageType(), False),
        StructField("split", StringType(), False),
    ])
    df = spark.createDataFrame(examples, schema=schema)
    if asset_dir:
        asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/"
        print("ASSET DIR: ", asset_dir)
        df = df.withColumn("image", image_copy(col("image"), lit(asset_dir)))
    return df
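A hypothetical invocation of convert, assuming the COCO files are unpacked under ./dataset as described in the docstring (the bucket name is illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("coco-to-rikai").getOrCreate()
coco_df = convert(
    spark,
    dataset_root="dataset",
    limit=100,
    asset_dir="s3://my-bucket/coco-assets/",
)
coco_df.printSchema()
coco_df.show(3)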
Example #25
0
def main():
    raw_training_data = sc.textFile("dataset/training.data")

    # TODO: Convert text file into an RDD which can be converted to a DataFrame
    # Hint: For types and format look at what the format required by the
    # `train` method for the random forest classifier
    # Hint 2: Look at the imports above
    rdd_train = raw_training_data.map(lambda row: row.split(","))
    parsed_rdd_train = rdd_train.map(lambda r: Row(count1=float(r[0]),
                                                   count2=float(r[1]),
                                                   count3=float(r[2]),
                                                   count4=float(r[3]),
                                                   count5=float(r[4]),
                                                   count6=float(r[5]),
                                                   count7=float(r[6]),
                                                   count8=float(r[7]),
                                                   id=int(r[8])))

    trainschema = StructType([
        StructField("count1", FloatType(), True),
        StructField("count2", FloatType(), True),
        StructField("count3", FloatType(), True),
        StructField("count4", FloatType(), True),
        StructField("count5", FloatType(), True),
        StructField("count6", FloatType(), True),
        StructField("count7", FloatType(), True),
        StructField("count8", FloatType(), True),
        StructField("id", IntegerType(), True)
    ])

    # TODO: Create dataframe from the RDD
    df_train = sqlContext.createDataFrame(parsed_rdd_train, schema=trainschema)

    #df_train.show()

    raw_test_data = sc.textFile("dataset/test-features.data")

    # TODO: Convert text file lines into an RDD we can use later
    rdd_test = raw_test_data.map(lambda row: row.split(","))

    parsed_rdd_test = rdd_test.map(lambda r: Row(count1=float(r[0]),
                                                 count2=float(r[1]),
                                                 count3=float(r[2]),
                                                 count4=float(r[3]),
                                                 count5=float(r[4]),
                                                 count6=float(r[5]),
                                                 count7=float(r[6]),
                                                 count8=float(r[7])))

    testschema = StructType([
        StructField("count1", FloatType(), True),
        StructField("count2", FloatType(), True),
        StructField("count3", FloatType(), True),
        StructField("count4", FloatType(), True),
        StructField("count5", FloatType(), True),
        StructField("count6", FloatType(), True),
        StructField("count7", FloatType(), True),
        StructField("count8", FloatType(), True)
    ])

    # TODO:Create dataframe from RDD
    df_test = sqlContext.createDataFrame(parsed_rdd_test, schema=testschema)
    #df_test.show()

    predictions = predict(df_train, df_test)

    # You can take a look at dataset/test-labels.data to see if your
    # predictions were right
    for pred in predictions:
        print(int(pred))
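The exercise leaves predict() to the reader. A hedged sketch of one possible implementation using pyspark.ml's VectorAssembler and RandomForestClassifier (the numTrees value and the helper's internals are choices made here, not part of the original assignment):

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier


def predict(df_train, df_test):
    # Assemble the eight count columns into a single feature vector.
    feature_cols = ["count%d" % i for i in range(1, 9)]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    train_vec = assembler.transform(df_train)
    test_vec = assembler.transform(df_test)

    # Fit a random forest on the labelled training data and score the test set.
    rf = RandomForestClassifier(labelCol="id", featuresCol="features", numTrees=20)
    model = rf.fit(train_vec)
    predicted = model.transform(test_vec).select("prediction").collect()
    return [row["prediction"] for row in predicted]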
Example #26
0
def perform_preprocessing(
    df,
    states: List[int],
    actions: List[str],
    metrics: List[str],
    multi_steps: Optional[int] = None,
):
    """ Perform (1) sparse-to-dense, (2) preprocessing for actions,
    and (3) other miscellaneous columns.

    (1) For each column of type Map, w/ name X, output two columns.
        Map values are assumed to be scalar. This process is called sparse-to-dense.
        X = {"state_features", "next_state_features", "metrics"}.
        (a) Replace column X with a dense representation of the inputted (sparse) map.
            Dense representation is to concatenate map values into a list.
        (b) Create new column X_presence, which is a list of same length as (a) and
            the ith entry is 1 iff the key was present in the original map.

    (2) Inputted actions and possible_actions are strings, which isn't supported
        for PyTorch Tensors. Here, we represent them with LongType.
        (a) action and next_action are strings, so simply return their position
            in the action_space (as given by argument actions).
        (b) possible_actions and possible_next_actions are list of strs, so
            return an existence bitvector of length len(actions), where ith
            index is true iff actions[i] was in the list.

    (3) Miscellaneous columns are step, time_diff, sequence_number, not_terminal
    """

    # step refers to n in n-step RL; special case when approaching terminal
    df = df.withColumn("step", make_get_step_udf(multi_steps)("next_state_features"))

    # take the next time_diff
    next_long_udf = make_next_udf(multi_steps, LongType())
    df = df.withColumn("time_diff", next_long_udf("time_diff"))

    # sparse-to-dense of states and metrics
    next_map_udf = make_next_udf(multi_steps, MapType(LongType(), FloatType()))
    df = df.withColumn("next_state_features", next_map_udf("next_state_features"))
    df = df.withColumn("metrics", next_map_udf("metrics"))
    df = make_sparse2dense(df, "state_features", states)
    df = make_sparse2dense(df, "next_state_features", states)
    df = make_sparse2dense(df, "metrics", metrics)

    # turn string actions into indices
    where_udf = make_where_udf(actions)
    df = df.withColumn("action", where_udf("action"))
    df = df.withColumn("next_action", where_udf(next_long_udf("next_action")))

    # turn List[str] possible_actions into existence bitvectors
    next_long_arr_udf = make_next_udf(multi_steps, ArrayType(LongType()))
    existence_bitvector_udf = make_existence_bitvector_udf(actions)
    df = df.withColumn(
        "possible_actions_mask", existence_bitvector_udf("possible_actions")
    )
    df = df.withColumn(
        "possible_next_actions_mask",
        existence_bitvector_udf(next_long_arr_udf("possible_next_actions")),
    )

    # calculate not_terminal
    not_terminal_udf = make_not_terminal_udf(actions)
    df = df.withColumn("not_terminal", not_terminal_udf("next_action"))

    # assuming use_seq_num_diff_as_time_diff = False for now
    df = df.withColumn("sequence_number", col("sequence_number_ordinal"))
    return df
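The sparse-to-dense step in (1) is delegated to make_sparse2dense, which is defined elsewhere in the codebase. A hedged sketch of what such a helper could look like, assuming scalar float map values (the function name and arguments mirror the call sites above but are illustrative, not the project's actual implementation):

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType


def make_sparse2dense_sketch(df, col_name, keys):
    # Dense values: one float per known key, defaulting to 0.0 when a key is absent.
    dense_udf = F.udf(
        lambda m: [float(m[k]) if m is not None and k in m else 0.0 for k in keys],
        ArrayType(FloatType()),
    )
    # Presence mask: 1.0 exactly where the key existed in the original sparse map.
    presence_udf = F.udf(
        lambda m: [1.0 if m is not None and k in m else 0.0 for k in keys],
        ArrayType(FloatType()),
    )
    return (df
            .withColumn(col_name + "_presence", presence_udf(F.col(col_name)))
            .withColumn(col_name, dense_udf(F.col(col_name))))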
Example #27
0
# MAGIC %md
# MAGIC The ZIP Code dataset contains an array with the latitude and longitude of the cities.  Use an `ArrayType`, which takes the primitive type of its elements as an argument.

# COMMAND ----------

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, FloatType

zipsSchema3 = StructType([
    StructField("city", StringType(), True),
    StructField("loc", ArrayType(FloatType(), True), True),
    StructField("pop", IntegerType(), True)
])

# COMMAND ----------

# MAGIC %md
# MAGIC Apply the schema using the `.schema()` method and observe the results.  Expand the array values in the column `loc` to explore further.

# COMMAND ----------

zipsDF3 = (spark.read.schema(zipsSchema3).json("/mnt/training/zips.json"))
display(zipsDF3)

# COMMAND ----------
Example #28
0
def test_implicit_type_conversion(spark: SparkSession):
    test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    entity_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path":
            f"file://{path.join(test_data_dir,  'single_customer.csv')}",
            "event_timestamp_column": "event_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    transaction_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path": f"file://{path.join(test_data_dir,  'transactions.csv')}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    transaction_table = {
        "name": "transactions",
        "entities": [{
            "name": "customer_id",
            "type": "int32"
        }],
        "features": [{
            "name": "daily_transactions",
            "type": "float"
        }],
    }

    joined_df = retrieve_historical_features(
        spark,
        entity_source,
        [transaction_source],
        [transaction_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])

    expected_joined_data = [
        (
            1001,
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #29
0
def change_to_float(data, col):
    for conv_col in col:
        data = data.withColumn(conv_col, data[conv_col].cast(FloatType()))
    return data
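A hypothetical usage, with illustrative column names:

# Cast the listed columns (read in as strings) to FloatType in one pass.
df = change_to_float(df, ["price", "weight", "rating"])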
Example #30
0
__short_type: ShortType = ShortType()
_SMALLINT_TYPE: str = __short_type.simpleString()

__int_type: IntegerType = IntegerType()
_INT_TYPE: str = __int_type.simpleString()
assert _INT_TYPE == int.__name__
assert __int_type.typeName().startswith(_INT_TYPE)

__long_type: LongType = LongType()
_BIGINT_TYPE: str = __long_type.simpleString()
assert __long_type.typeName() == 'long'

_INT_TYPES: Tuple[str, ...] = (_TINYINT_TYPE, _SMALLINT_TYPE, _INT_TYPE, _BIGINT_TYPE)


__float_type: FloatType = FloatType()
_FLOAT_TYPE: str = __float_type.simpleString()
assert _FLOAT_TYPE == __float_type.typeName()

__double_type: DoubleType = DoubleType()
_DOUBLE_TYPE: str = __double_type.simpleString()
assert _DOUBLE_TYPE == __double_type.typeName()

_FLOAT_TYPES: Tuple[str, ...] = (_FLOAT_TYPE, _DOUBLE_TYPE)


_NUM_TYPES: Tuple[str, ...] = _INT_TYPES + _FLOAT_TYPES


_POSSIBLE_CAT_TYPES: Tuple[str, ...] = (_BOOL_TYPE, _STR_TYPE) + _NUM_TYPES
_POSSIBLE_FEATURE_TYPES: Tuple[str, ...] = _POSSIBLE_CAT_TYPES + _NUM_TYPES
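Because simpleString() names are exactly what DataFrame.dtypes reports, these constants can be used to partition a DataFrame's columns by kind. A small hedged sketch (the helper name and df are illustrative):

def split_columns_by_kind(df):
    # Numeric columns vs. columns that could be treated as categorical,
    # keyed off the simpleString type names collected above.
    num_cols = [c for c, t in df.dtypes if t in _NUM_TYPES]
    cat_cols = [c for c, t in df.dtypes if t in _POSSIBLE_CAT_TYPES]
    return num_cols, cat_cols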