# Shared imports for the examples below. class_utils (read_cities,
# read_countries, City, CityWithDensity) and the show_all /
# show_separately helpers are local course utilities assumed to be
# available alongside these snippets.
from typing import Any, Dict, List

import pandas as pd
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import (DoubleType, IntegerType, StringType,
                               StructField, StructType)

import class_utils


def capitol_to_country_density_ratio(sc: SparkContext):
    sql_sc = SQLContext(sc)

    countries = sql_sc.createDataFrame(
        pd.DataFrame(class_utils.read_countries()))
    cities = sql_sc.createDataFrame(pd.DataFrame(class_utils.read_cities()))

    # registerTempTable is deprecated since Spark 2.0;
    # createOrReplaceTempView is the modern equivalent
    countries.registerTempTable('countries')
    cities.registerTempTable('cities')

    sql_sc.sql(
        'SELECT '
        'countries.name AS country_name, '
        'cities.name AS capitol_name, '
        'round(countries.population / countries.area, 2) AS country_density, '
        'round(cities.population / cities.area, 2) AS capitol_density '
        'FROM countries JOIN cities ON countries.capitol = cities.name'
    ).registerTempTable('country_capitol')

    result = sql_sc.sql(
        'SELECT *, '
        'round(capitol_density / country_density, 2) AS density_ratio '
        'FROM country_capitol')

    show_all(result)
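registerTempTable and SQLContext were superseded by SparkSession in Spark 2.0. A minimal sketch of the same query on the modern API, assuming the same class_utils helpers and an already-built SparkSession (the function name is hypothetical):

def capitol_to_country_density_ratio_modern(spark):
    countries = spark.createDataFrame(pd.DataFrame(class_utils.read_countries()))
    cities = spark.createDataFrame(pd.DataFrame(class_utils.read_cities()))

    countries.createOrReplaceTempView('countries')
    cities.createOrReplaceTempView('cities')

    spark.sql(
        'SELECT '
        'countries.name AS country_name, '
        'cities.name AS capitol_name, '
        'round(countries.population / countries.area, 2) AS country_density, '
        'round(cities.population / cities.area, 2) AS capitol_density '
        'FROM countries JOIN cities ON countries.capitol = cities.name'
    ).createOrReplaceTempView('country_capitol')

    result = spark.sql(
        'SELECT *, '
        'round(capitol_density / country_density, 2) AS density_ratio '
        'FROM country_capitol')

    result.show(truncate=False)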
def convert_04(sc: SparkContext):
    sql_sc = SQLContext(sc)

    cities = class_utils.read_cities()
    pd_df = pd.DataFrame(cities)

    df = sql_sc.createDataFrame(pd_df)
    show_all(df)
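The pandas bridge works in the other direction too: toPandas() collects the distributed frame back onto the driver, which is fine for these small teaching datasets but memory-bound in general. A sketch (the function name is hypothetical):

def convert_04_roundtrip(sc: SparkContext):
    sql_sc = SQLContext(sc)
    df = sql_sc.createDataFrame(pd.DataFrame(class_utils.read_cities()))

    # Spark -> pandas pulls every row to the driver process
    pd_back = df.toPandas()
    print(pd_back.head())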
def convert_02(sc: SparkContext):
    sql_sc = SQLContext(sc)

    cities = class_utils.read_cities()
    cities_row = [Row(**x) for x in cities]

    df = sql_sc.createDataFrame(cities_row)
    show_all(df)
def convert_03(sc: SparkContext):
    sql_sc = SQLContext(sc)

    cities = class_utils.read_cities()
    cities = [class_utils.City.from_dict(x) for x in cities]

    df = sql_sc.createDataFrame(cities)
    show_all(df)
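convert_03 relies on class_utils.City being a record type Spark can infer a schema from. A hypothetical stand-in consistent with how it is used here (the real class may differ):

from typing import NamedTuple


class City(NamedTuple):
    # Hypothetical stand-in for class_utils.City: Spark infers column
    # names and types from namedtuple-style records such as this.
    name: str
    country: str
    population: int
    area: float

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> 'City':
        return cls(**{k: d[k] for k in cls._fields})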
def convert_01(sc: SparkContext):
    cities = class_utils.read_cities()
    rdd = sc.parallelize(cities)

    def add_density(x):
        x['density'] = round(x['population'] / x['area'], 2)
        return x
    rdd = rdd.map(add_density)
    show_all(rdd)
    show_separately(rdd)
def convert_01(sc: SparkContext):
    sql_sc = SQLContext(sc)

    cities = class_utils.read_cities()
    rdd = sc.parallelize(cities)
    rdd_row = rdd.map(
        lambda x: Row(**x))  # Using RDD of dict to inferSchema is deprecated

    df = sql_sc.createDataFrame(rdd_row)
    show_all(df)
def convert_05(sc: SparkContext):
    sql_sc = SQLContext(sc)

    cities = class_utils.read_cities()
    cities_row = [Row(**x) for x in cities]

    # With an explicit schema, Row values are matched by position, not by
    # name, so the field order here must mirror the Row field order. Note
    # that leaving 'population' commented out makes the schema shorter than
    # the four-field Rows, which Spark rejects when it verifies the data.
    schema = StructType([
        StructField('name', StringType(), nullable=False),
        StructField('country', StringType(), nullable=False),
        StructField('area', DoubleType(), nullable=False),
        # StructField('population', IntegerType(), nullable=False)
    ])

    df = sql_sc.createDataFrame(cities_row, schema)
    show_all(df)
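An explicit schema pins down names, types and nullability up front instead of letting Spark sample the data. A small usage sketch (the helper name is hypothetical) to confirm what was applied:

def convert_05_inspect(df):
    df.printSchema()         # declared names, types and nullability
    print(df.schema.fields)  # the same StructFields, programmatically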
def convert_02(sc: SparkContext):
    cities = class_utils.read_cities()
    cities = [class_utils.City.from_dict(x) for x in cities]
    rdd = sc.parallelize(cities)

    def add_density(x: class_utils.City):
        return class_utils.CityWithDensity(
            name=x.name,
            country=x.country,
            population=x.population,
            area=x.area,
            density=round(x.population / x.area, 2)
        )
    rdd = rdd.map(add_density)  # named function instead of a lambda so the parameter can carry a type hint
    show_all(rdd)
    show_separately(rdd)
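class_utils.CityWithDensity is used the same way; a hypothetical stand-in matching the keyword arguments above would be City's four fields plus the derived one:

from typing import NamedTuple


class CityWithDensity(NamedTuple):
    # Hypothetical stand-in for class_utils.CityWithDensity.
    name: str
    country: str
    population: int
    area: float
    density: float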
def multiple_country_mean_city_density(sc: SparkContext):
    sql_sc = SQLContext(sc)

    countries = class_utils.read_countries()
    cities = sql_sc.createDataFrame(pd.DataFrame(class_utils.read_cities()))

    iso_translator = {x['iso']: x['name'] for x in countries}

    translator_udf = F.udf(lambda x: iso_translator.get(x), StringType())

    result = cities\
        .filter(F.col('country').isin(list(iso_translator.keys())))\
        .withColumn('country_name', translator_udf(F.col('country')))\
        .withColumn('density', F.col('population') / F.col('area'))\
        .groupBy('country_name')\
        .agg({'density': 'mean'})\
        .withColumn('mean_density', F.round(F.col('avg(density)'), 2))  # dict-style agg names its column 'avg(density)'

    show_all(result)
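The udf above closes over iso_translator, so the whole dict is serialized into every task. For anything bigger than this toy lookup table, a broadcast variable ships it to each executor once; a sketch of that variant (the function name is hypothetical):

def multiple_country_mean_city_density_broadcast(sc: SparkContext):
    sql_sc = SQLContext(sc)
    cities = sql_sc.createDataFrame(pd.DataFrame(class_utils.read_cities()))
    countries = class_utils.read_countries()

    # send the lookup table to the executors once, not per task
    iso_bc = sc.broadcast({x['iso']: x['name'] for x in countries})
    translator_udf = F.udf(lambda x: iso_bc.value.get(x), StringType())

    result = cities \
        .filter(F.col('country').isin(list(iso_bc.value.keys()))) \
        .withColumn('country_name', translator_udf(F.col('country'))) \
        .withColumn('density', F.col('population') / F.col('area')) \
        .groupBy('country_name') \
        .agg(F.round(F.mean('density'), 2).alias('mean_density'))

    show_all(result)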
def multiple_country_mean_city_density_02(sc: SparkContext):
    countries = sc.parallelize(class_utils.read_countries())
    cities = sc.parallelize(class_utils.read_cities())

    iso_translator = countries.map(lambda x: (x['iso'], x['name'])).collectAsMap()

    def calculate_mean_density(cities_list: List[Dict[str, Any]]) -> float:
        # groupBy hands over one iterable of city dicts per key
        return round(sum(c['density'] for c in cities_list) / len(cities_list), 2)

    result = cities \
        .filter(lambda x: x['country'] in iso_translator) \
        .map(lambda x: {'name': x['name'],
                        'density': round(x['population'] / x['area'], 2),
                        'country_name': iso_translator.get(x['country'])}) \
        .groupBy(lambda x: x['country_name'])\
        .map(lambda x: {'country': x[0],
                        'mean_density': calculate_mean_density(x[1])})

    show_separately(result)
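groupBy materializes every city of a country into one in-memory list before calculate_mean_density runs. The same mean falls out of (sum, count) pairs with reduceByKey, which combines map-side and never builds the groups; a sketch (the function name is hypothetical):

def multiple_country_mean_city_density_03(sc: SparkContext):
    countries = sc.parallelize(class_utils.read_countries())
    cities = sc.parallelize(class_utils.read_cities())

    iso_translator = countries.map(lambda x: (x['iso'], x['name'])).collectAsMap()

    result = cities \
        .filter(lambda x: x['country'] in iso_translator) \
        .map(lambda x: (iso_translator[x['country']],
                        (round(x['population'] / x['area'], 2), 1))) \
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
        .map(lambda x: {'country': x[0],
                        'mean_density': round(x[1][0] / x[1][1], 2)})

    show_separately(result)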
def capitol_to_country_density_ratio(sc: SparkContext):
    countries = sc.parallelize(class_utils.read_countries())
    cities = sc.parallelize(class_utils.read_cities())

    def with_density(x: Dict[str, Any]) -> Dict[str, Any]:
        return {**x, 'density': round(x['population'] / x['area'], 2)}

    # an RDD cannot be referenced from inside another RDD's transformation,
    # so the lookup is done with a join, which yields
    # (capitol_name, (country_record, city_record)) pairs
    result = countries\
        .map(lambda x: (x['capitol'], x))\
        .join(cities.map(lambda x: (x['name'], x)))\
        .map(lambda x: {'country': with_density(x[1][0]), 'capitol': with_density(x[1][1])})\
        .map(lambda x: {'country_name': x['country']['name'],
                        'capitol_name': x['capitol']['name'],
                        'country_density': x['country']['density'],
                        'capitol_density': x['capitol']['density'],
                        'density_ratio': round(x['capitol']['density'] / x['country']['density'], 2),
                        })

    show_separately(result)
def multiple_country_mean_city_density_01(sc: SparkContext):
    countries = sc.parallelize(class_utils.read_countries())
    cities = sc.parallelize(class_utils.read_cities())

    iso_translator = countries.map(lambda x: (x['iso'], x['name'])).collectAsMap()

    result = cities \
        .filter(lambda x: x['country'] in iso_translator) \
        .map(lambda x: (iso_translator.get(x['country']),
                        {'name': x['name'], 'density': round(x['population'] / x['area'], 2)})) \
        .aggregateByKey({'sum': 0, 'count': 0},   # zero value for every key
                        lambda agg, x: {          # seqFunc: fold one city into a partition-local aggregate
                            'sum': agg['sum'] + x['density'],
                            'count': agg['count'] + 1
                        },
                        lambda agg1, agg2: {      # combFunc: merge aggregates across partitions
                            'sum': agg1['sum'] + agg2['sum'],
                            'count': agg1['count'] + agg2['count']
                        })\
        .map(lambda x: {'country': x[0],
                        'mean_density': round(x[1]['sum'] / x[1]['count'], 2)})

    show_separately(result)
def single_country_mean_city_density(sc: SparkContext, country_name="Poland"):
    countries = sc.parallelize(class_utils.read_countries())
    cities = sc.parallelize(class_utils.read_cities())

    country = countries.filter(lambda x: x['name'] == country_name).first()
    iso_code = country['iso']

    cities_density = cities\
        .filter(lambda x: x['country'] == iso_code)\
        .map(lambda x: {'name': x['name'], 'density': round(x['population'] / x['area'], 2)})
    show_all(cities_density)
    # aggregate folds every element into a single (sum, count) accumulator:
    # the first lambda merges one element, the second merges partial
    # results coming from different partitions
    aggregation = cities_density.aggregate({'sum': 0, 'count': 0},
                                           lambda agg, x: {
                                               'sum': agg['sum'] + x['density'],
                                               'count': agg['count'] + 1
                                           },
                                           lambda agg1, agg2: {
                                               'sum': agg1['sum'] + agg2['sum'],
                                               'count': agg1['count'] + agg2['count']
                                           })
    print(aggregation['sum'])  # running total of the rounded densities
    mean_density = round(aggregation['sum'] / aggregation['count'], 2)
    print(mean_density)
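A minimal local driver for trying any of the examples above; the app name is arbitrary and the wiring is assumed to match the course project:

if __name__ == '__main__':
    # local[*] runs Spark inside this process on all available cores
    sc = SparkContext('local[*]', 'density-examples')
    try:
        single_country_mean_city_density(sc, country_name='Poland')
    finally:
        sc.stop()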