Example 1
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"hdfs://xdata/qcr/gnip": 66}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm",
                        help="Application name, default = 'Geqe Data Filter'",
                        default='Geqe data filter.')
    parser.add_argument(
        "-cNum",
        type=int,
        help=
Example 2
dropoff_centroid_longitude
dropoff_centroid_location"""

# partition the non-string columns by type
bool_fields = set(["shared_trip_authorized"])

float_fields = set([
    "trip_seconds", "trip_miles", "fare", "tip", "additional_charges",
    "trip_total"
])

int_fields = set(["trips_pooled"])

# for each column name, assign it a specific type
fields = [
    StructField(field_name, BooleanType())
    if field_name in bool_fields else StructField(field_name, FloatType())
    if field_name in float_fields else StructField(field_name, IntegerType())
    if field_name in int_fields else StructField(field_name, StringType())
    for field_name in schemaString.split("\n")
]

# store schema
schema = StructType(fields)

# start spark session ----
spark = (SparkSession.builder.master("local[1]").appName(
    "Python Spark SQL example").getOrCreate())

# load necessary data ----
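# A minimal sketch of the (truncated) load step, assuming the schema above is
# applied to a local CSV; the file name is a placeholder, not from the original:
trips_df = spark.read.csv("trips.csv", header=True, schema=schema)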
Example 3
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType, FloatType

# create a StructType for the Kafka redis-server topic which has all changes made
# to Redis - before Spark 3.0.0, schema inference is not automatic

KAFKA_HOST = "kafka:19092"
SPARK_HOST = "spark://spark:7077"

redisRawMessageSchema = StructType([
    StructField("key", StringType()),
    StructField("existType", StringType()),
    StructField("Ch", BooleanType()),
    StructField("Incr", BooleanType()),
    StructField(
        "zSetEntries",
        ArrayType(
            StructType([
                StructField("element", StringType()),
                StructField("Score", StringType())
            ])))
])
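
# A hedged sketch of how this schema would typically be applied to the Kafka
# stream (the SparkSession `spark` and the readStream options below are
# assumptions, not part of the original snippet):
#
# redisRawStreamingDF = (spark.readStream.format("kafka")
#                        .option("kafka.bootstrap.servers", KAFKA_HOST)
#                        .option("subscribe", "redis-server")
#                        .load())
# redisRawDF = redisRawStreamingDF.select(
#     F.from_json(F.col("value").cast("string"), redisRawMessageSchema).alias("message"))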

# create a StructType for the Customer JSON that comes
# from Redis- before Spark 3.0.0, schema inference is not automatic

redisCustomerSchema = StructType([
    StructField("customerName", StringType()),
    StructField("email", StringType()),
    StructField("phone", StringType()),
Example 4
    def make_not_terminal_udf():
        """ Return true iff next_action is an empty map """
        def get_not_terminal(next_action):
            return len(next_action) > 0

        return udf(get_not_terminal, BooleanType())
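    # Hedged usage sketch (the DataFrame `df` and its `next_action` column are
    # assumptions, not from the original):
    #
    # not_terminal = make_not_terminal_udf()
    # df = df.withColumn("not_terminal", not_terminal("next_action"))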
Example 5
    with open(app_secrets_path) as secret:
        app_secret = yaml.load(secret, Loader=yaml.FullLoader)

    # Setup spark to use s3
    hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3a.access.key", app_secret["s3_conf"]["access_key"])
    hadoop_conf.set("fs.s3a.secret.key",
                    app_secret["s3_conf"]["secret_access_key"])

    print(
        "\nCreating dataframe ingestion CSV file using 'SparkSession.read.format()'"
    )

    fin_schema = StructType() \
        .add("id", IntegerType(), True) \
        .add("has_debt", BooleanType(), True) \
        .add("has_financial_dependents", BooleanType(), True) \
        .add("has_student_loans", BooleanType(), True) \
        .add("income", DoubleType(), True)

    fin_df = spark.read \
        .option("header", "false") \
        .option("delimiter", ",") \
        .format("csv") \
        .schema(fin_schema) \
        .load("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/finances.csv")

    fin_df.printSchema()
    fin_df.show()

    print(
Example 6

# spark-submit --master yarn --num-executors 10 --executor-cores 5 --executor-memory 16G --driver-memory 32G --conf spark.driver.maxResultSize=10G trainready_data_cmp.py
sc = SparkContext.getOrCreate()
sc.setLogLevel('WARN')
hive_context = HiveContext(sc)

df = hive_context.read.format('com.databricks.spark.csv').options(
    header='true').load('train_ready_bad_uckey_removal_10percent_denoise.csv')
c = df.columns
del c[0]
df = df.withColumn(
    'ts',
    udf(lambda x: [x[_] for _ in c], ArrayType(StringType()))(fn.struct(c)))
df = df.withColumn('sparse',
                   udf(lambda x: 'vir' in x, BooleanType())(col('_c0')))
df = df.filter('sparse==False')
df = df.withColumn(
    'imp',
    udf(lambda x: int(sum([float(_) for _ in x if _])), IntegerType())(df.ts))
df = df.select('_c0', 'ts', 'sparse', 'imp')
# df.show(1, False)
jimmy = df.collect()
jimmy_dict = {}
for _ in jimmy:
    jimmy_dict[_['_c0']] = _

df = hive_context.sql('select * from dlpm_06242021_1635_trainready')
df = df.withColumn('sparse',
                   udf(lambda x: ',' not in x, BooleanType())(col('uckey')))
df = df.filter('sparse==False')
Example 7
    return df

# BOOLEAN: whether the match is played at home
def resultat_a_domicile_oui_non(dframe):
    if dframe[0:6] == 'France':
        return True
    else:
        return False


# Add the a_Domicile column with the boolean result
def a_domicile(dframe):
    df = dframe.withColumn('a_Domicile', resultat_a_domicile_oui_non(dframe.match))
    return df

resultat_a_domicile_oui_non = F.udf(resultat_a_domicile_oui_non, BooleanType())


# Whether the match was played in the World Cup ("Coupe du monde")
def en_coupe_du_monde(competition_colonne):
    if competition_colonne[:5] == 'Coupe':
        return 1
    else:
        return 0
    
jouer_en_coupe_du_monde = F.udf(en_coupe_du_monde, IntegerType())    
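
# A hedged usage sketch of the two UDFs above (the DataFrame `matchs` and its
# `match` / `competition` columns are assumptions, not from the original):
#
# matchs = a_domicile(matchs)
# matchs = matchs.withColumn('en_Coupe_du_Monde', jouer_en_coupe_du_monde(matchs.competition))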
    
def statistiques(dframe):
    df = (dframe
        .groupBy("adversaire")
        .agg(
Example 8
def getCrosswalkDF(spark=None, columns=None, strong_mcd_states=STRONG_MCD_STATES, aian_areas=AIAN_AREAS, aian_ranges_path=AIAN_RANGES_PATH, fed_airs=FED_AIRS):
    """
    Loads the 2010 crosswalk files that Simson generated from the 2010 GRFC into a Spark DF

    Parameters
    ==========
    spark : SparkSession

    columns : str or list of str (default is None, which will return all columns in the file)
        - This determines which columns survive from the original crosswalk data file, as the function will
          only return a Spark DF with the columns listed here

    Returns
    =======
    a Spark DF containing crosswalk columns

    Notes
    =====
    - This function also generates a number of additional columns to expand the ease-of-use when aggregating
      blocks to form geographic units in different geographic levels.
        - e.g. Rather than COUNTY being the 3-digit FIPS code, the COUNTY column will concatenate both the
               2-digit STATE FIPS code and the 3-digit COUNTY FIPS code to create a 5-digit COUNTY code that
               is unique from all other 5-digit COUNTY codes.
    """
    crosswalk = f"{DAS_S3ROOT}/2010/geounit_crosswalks/24vars/"

    crossdf = spark.read.option("header", "true").csv(crosswalk)
    # add "geocode" column based on GEOID (which is the 16 digit block id)
    crossdf = crossdf.withColumn("geocode", crossdf['GEOID'])

    # generate unique counties
    crossdf = crossdf.withColumn("COUNTY", sf.concat(sf.col("STATE"), sf.col("COUNTY")))

    # generate unique tract groups
    crossdf = crossdf.withColumn("TRACT_GROUP", sf.concat(sf.col("County"), crossdf.TRACT[0:4]))

    # generate unique tracts
    crossdf = crossdf.withColumn("TRACT", sf.concat(sf.col("COUNTY"), sf.col("TRACT")))

    # generate block group column
    crossdf = crossdf.withColumn("BLOCK_GROUP", crossdf.BLOCK[0:1])

    # generate unique block groups
    crossdf = crossdf.withColumn("BLOCK_GROUP", sf.concat(sf.col("TRACT"), sf.col("BLOCK_GROUP")))

    # generate unique blocks
    crossdf = crossdf.withColumn("BLOCK", sf.concat(sf.col("BLOCK_GROUP"), sf.col("BLOCK")))

    # generate unique SLDLs (only unique if state fips has been prepended to the SLDL identifier)
    crossdf = crossdf.withColumn("SLDL", sf.concat(sf.col("STATE"), sf.col("SLDL")))

    # generate unique SLDUs (only unique if state fips has been prepended to the SLDU identifier)
    crossdf = crossdf.withColumn("SLDU", sf.concat(sf.col("STATE"), sf.col("SLDU")))

    # generate unique Congressional Districts (111th Congress) - only unique if state fips has been prepended to the CD identifier
    crossdf = crossdf.withColumn("CD", sf.concat(sf.col("STATE"), sf.col("CD")))

    # generate unique school districts (only unique if state fips has been prepended to the identifiers)
    crossdf = crossdf.withColumn("SDELM", sf.concat(sf.col("STATE"), sf.col("SDELM")))
    crossdf = crossdf.withColumn("SDSEC", sf.concat(sf.col("STATE"), sf.col("SDSEC")))
    crossdf = crossdf.withColumn("SDUNI", sf.concat(sf.col("STATE"), sf.col("SDUNI")))

    # generate unique urban areas and urban growth areas (only unique if state prepended)
    crossdf = crossdf.withColumn("UA", sf.concat(sf.col("STATE"), sf.col("UA")))
    crossdf = crossdf.withColumn("UGA", sf.concat(sf.col("STATE"), sf.col("UGA")))

    # generate unique puma and place ids (only unique if state prepended)
    crossdf = crossdf.withColumn("PUMA", sf.concat(sf.col("STATE"), sf.col("PUMA")))
    crossdf = crossdf.withColumn("PLACE", sf.concat(sf.col("STATE"), sf.col("PLACE")))

    # generate unique county subdivisions (only unique if state and county prepended)
    crossdf = crossdf.withColumn("COUSUB", sf.concat(sf.col("COUNTY"), sf.col("COUSUB")))

    # generate unique subminor civil divisions (only unique if state, county, and county subdivisions prepended)
    crossdf = crossdf.withColumn("SUBMCD", sf.concat(sf.col("COUSUB"), sf.col("SUBMCD")))

    # voting districts appear to have a floating space (" ") character in every VTD code, so we'll remove them as they
    # don't appear in the BlockAssign files for VTD
    ### Update - 2019-06-25 - The floating space is a valid character in the 6-character VTD codes; the first character
    #                         isn't always a " ", so " " is just another part of the code.
    #crossdf = crossdf.withColumn("VTD1st", crossdf.VTD[0:1])

    # generate unique voting districts (only unique if state and county prepended)
    crossdf = crossdf.withColumn("VTD", sf.concat(sf.col("COUNTY"), sf.col("VTD")))

    # create a column for the nation
    crossdf = crossdf.withColumn("US", sf.lit("Nation"))

    # Note: When using any of the columns from the next block, filter out IDs composed only of "9"'s
    aian_ranges_dict = make_aian_ranges_dict(aian_ranges_path, aian_areas)

    is_fed_air_udf = udf(lambda aiannhce: in_aian_class(aiannhce, fed_airs, aian_ranges_dict), BooleanType())
    is_aian_udf = udf(lambda aiannhce: in_aian_class(aiannhce, aian_areas, aian_ranges_dict), BooleanType())
    crossdf = add_aiannhce_col(spark, crossdf)
    # aian_areas:
    crossdf = crossdf.withColumn("AIAN_AREAS", sf.when(is_aian_udf("AIANNHCE"), sf.col("AIANNHCE")).otherwise(CC.NOT_AN_AIAN_AREA))
    crossdf = crossdf.withColumn("FED_AIRS", sf.when(is_fed_air_udf("AIANNHCE"), sf.col("AIANNHCE")).otherwise(CC.NOT_AN_AIAN_AREA))
    # portions of Blocks/Tracts/States within aian_areas:
    crossdf = crossdf.withColumn("AIANBlock", sf.when(sf.col("AIAN_AREAS") != CC.NOT_AN_AIAN_AREA, sf.col("BLOCK")).otherwise(CC.NOT_AN_AIAN_BLOCK))
    crossdf = crossdf.withColumn("AIANTract", sf.col("AIANBlock")[0:11])
    crossdf = crossdf.withColumn("AIANState", sf.col("AIANTract")[0:2])
    # Define an off-spine entity (OSE) as Place in AIAN areas/ non-strong-MCD states and MCD otherwise:
    crossdf = crossdf.withColumn("OSE", sf.when((sf.col("AIAN_AREAS") == CC.NOT_AN_AIAN_AREA) & (sf.col("STATE").isin(strong_mcd_states)), sf.col("COUSUB")).otherwise(sf.col("PLACE")))
    crossdf = crossdf.withColumn("COUNTY_NSMCD", sf.when(sf.col("STATE").isin(strong_mcd_states), CC.STRONG_MCD_COUNTY).otherwise(sf.col("COUNTY")))
    crossdf = crossdf.withColumn("MCD", sf.when(sf.col("STATE").isin(strong_mcd_states), sf.col("COUSUB")).otherwise(sf.lit(CC.NOT_A_MCD)))

    if columns is None:
        columns = crossdf.columns
    else:
        # always want 'geocode' (aka Block ID, GEOID) in the crosswalk dataframe
        columns = np.unique(du.aslist(columns) + ['geocode']).tolist()

    crossdf = crossdf.select(columns)
    return crossdf
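
# A hedged usage sketch (the SparkSession and the column selection are
# assumptions, not from the original):
#
# crosswalk_df = getCrosswalkDF(spark, columns=["STATE", "COUNTY", "TRACT", "BLOCK"])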
Example 9
def main(sc):
    spark = SparkSession(sc)

    #b= b.select('L_LOW_HN', 'L_HIGH_HN','FULL_STREE','ST_LABEL','BOROCODE','PHYSICALID')
    b = spark.read.load('centerline (1).csv',
                        format='csv',
                        header=True,
                        inferSchema=True)
    b.head(10)
    b = b.withColumn('New L_LOW_HN',
                     funct.concat_ws('.', funct.split(b['L_LOW_HN'], '-')))
    b = b.withColumn('New R_LOW_HN',
                     funct.concat_ws('.', funct.split(b['R_LOW_HN'], '-')))
    b = b.withColumn('New L_HIGH_HN',
                     funct.concat_ws('.', funct.split(b['L_HIGH_HN'], '-')))
    b = b.withColumn('New R_HIGH_HN',
                     funct.concat_ws('.', funct.split(b['R_HIGH_HN'], '-')))
    #b.select('BOROCODE').distinct().show()
    b = b.select('NEW L_LOW_HN', 'NEW L_HIGH_HN', 'NEW R_LOW_HN',
                 'NEW R_HIGH_HN', 'FULL_STREE', 'ST_LABEL', 'BOROCODE',
                 'PHYSICALID')
    b.head(100)

    # In[7]:

    a = spark.read.load('small_data1.csv',
                        format='csv',
                        header=True,
                        inferSchema=True)
    #a.head()
    a = a.withColumn('Year', funct.split(a['Issue Date'], '/').getItem(2))

    a = a.filter((a['Year'] >= '2015') & (a['Year'] <= '2019'))
    a = a.filter(a['House Number'].rlike(r'^[0-9]*\-*[0-9]*$'))
    a = a.withColumn('New House Number',
                     funct.concat_ws('.', funct.split(a['House Number'], '-')))
    a = a.select('Violation County', 'New House Number', 'Street Name', 'Year')
    a.head(10)

    def process(c, d, e, f, g, h, i, j, k, l, m, n):
        a = [c, d, e, f]
        b = [g, h, i, j, k, l, m, n]
        #     if a[0] == 'K':
        #         return True
        boro = {
            3: ['K', 'KINGS', 'KING', 'BK'],
            2: ['BX', 'BRONX'],
            1: ['NY', 'MAN', 'MH', 'NEW Y', 'NEWY', 'MN'],
            5: ['R', 'RICHMOND'],
            4: ['Q', 'QU', 'QUEEN', 'QN', 'QNS']
        }
        if c in boro[m]:
            #         return True
            if ((b[4] == a[2]) | (b[5] == a[2])):
                return True

    #             if float(a[1])%2 == 0:
    #                 if ((float(b[3]) <= float(a[1]) ) & (float(b[2] )>= float(row[1]))):
    #                     return True
    #             else:
    #                 if ((float(b[1]) <= float(a[1]) ) & (float(b[0] )>= float(row[1]))):
    #                     return True
        return False

    acol = a.columns
    bcol = b.columns
    p = funct.udf(process, BooleanType())

    v = a.crossJoin(b).where(
        p(a['Violation County'], a['New House Number'], a['Street Name'],
          a['Year'], b['NEW L_LOW_HN'], b['NEW L_HIGH_HN'], b['NEW R_LOW_HN'],
          b['NEW R_HIGH_HN'], b['FULL_STREE'], b['ST_LABEL'], b['BOROCODE'],
          b['PHYSICALID']))

    v = v.groupBy("Year", "PHYSICALID").count()

    v.show(10)
Example 10
    'tolls_amt': 'tolls_amount',
    'total_amt': 'total_amount',
}

schema = StructType([
    StructField('vendor_name', StringType(), False),
    StructField('pickup_datetime', TimestampType(), False),
    StructField('dropoff_datetime', TimestampType(), False),
    StructField('passenger_count', IntegerType(), False),
    StructField('trip_distance', FloatType(), False),
    StructField('pickup_latitude', DoubleType(), False),
    StructField('pickup_longitude', DoubleType(), False),
    StructField('ratecode_id', IntegerType(), False),
    StructField('pickup_location_id', IntegerType(), False),
    StructField('dropoff_location_id', IntegerType(), False),
    StructField('store_and_forward_flag', BooleanType(), False),
    StructField('dropoff_latitude', DoubleType(), False),
    StructField('dropoff_longitude', DoubleType(), False),
    StructField('payment_type', IntegerType(), False),
    StructField('fare_amount', FloatType(), False),
    StructField('surcharge', FloatType(), False),
    StructField('improvement_surcharge', FloatType(), False),
    StructField('congestion_surcharge', FloatType(), False),
    StructField('mta_tax', FloatType(), False),
    StructField('tip_amount', FloatType(), False),
    StructField('tolls_amount', FloatType(), False),
    StructField('total_amount', FloatType(), False),
])


def payment_type_f(v):
Example 11
    if len(sys.argv) < 2:
        print('Usage: ' + sys.argv[0] + ' <database>')
        sys.exit(1)

    # Grab the parameters
    database = sys.argv[1]

    # Create a spark context for the job. The context is used to manage the job at a high level.
    appName = "ETL-%s" % database
    spark = SparkSession \
        .builder \
        .appName(appName) \
        .getOrCreate()

    # Register UDFs
    udfIsDurationCorrect = udf(isDurationCorrect, BooleanType())
    udfRemoveBraces = udf(removeBraces, StringType())

    # Read in the dataset
    logs = spark.read.csv("/incoming/logs/upload",
                          sep="\t",
                          inferSchema=True,
                          header="True")

    # Process the dataset
    streams_raw = logs.filter(logs['eventType'] == 'SongPlayed')
    streams_projected = streams_raw.drop('eventType')
    streams = streams_projected.withColumnRenamed('itemId', 'trackId')
    streams_correct = streams.filter(udfIsDurationCorrect('duration'))
    streams_cleaned = streams_correct.select(
        udfRemoveBraces('ts').alias('ts'), 'host', 'userId', 'trackId',
Example 12
    def test_as_spark_type_pandas_on_spark_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
            List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
            List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
            List[decimal.Decimal]:
            (np.dtype("object"), ArrayType(DecimalType(38, 18))),
            List[float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
            List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
            List[int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
            List[str]: (np.dtype("object"), ArrayType(StringType())),
            List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
            List[datetime.datetime]:
            (np.dtype("object"), ArrayType(TimestampType())),
            List[np.datetime64]:
            (np.dtype("object"), ArrayType(TimestampType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(pandas_on_spark_type(numpy_or_python_type),
                             (dtype, spark_type))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            pandas_on_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            pandas_on_spark_type(np.dtype("object"))
Example 13
    StructField('ResponseDtTm', StringType(), True),
    StructField('OnSceneDtTm', StringType(), True),
    StructField('TransportDtTm', StringType(), True),
    StructField('HospitalDtTm', StringType(), True),
    StructField('CallFinalDisposition', StringType(), True),
    StructField('AvailableDtTm', StringType(), True),
    StructField('Address', StringType(), True),
    StructField('City', StringType(), True),
    StructField('ZipcodeofIncident', IntegerType(), True),
    StructField('Battalion', StringType(), True),
    StructField('StationArea', StringType(), True),
    StructField('Box', StringType(), True),
    StructField('OriginalPriority', StringType(), True),
    StructField('Priority', StringType(), True),
    StructField('FinalPriority', IntegerType(), True),
    StructField('ALSUnit', BooleanType(), True),
    StructField('CallTypeGroup', StringType(), True),
    StructField('NumberofAlarms', IntegerType(), True),
    StructField('UnitType', StringType(), True),
    StructField('Unitsequenceincalldispatch', IntegerType(), True),
    StructField('FirePreventionDistrict', StringType(), True),
    StructField('SupervisorDistrict', StringType(), True),
    StructField('NeighborhoodDistrict', StringType(), True),
    StructField('Location', StringType(), True),
    StructField('RowID', StringType(), True)
])

# read the file using DataFrameReader using format CSV
fire_df = spark.read.csv(sf_fire_file, header=True, schema=fire_schema)
fire_df.show(10)
Example 14
    .load("s3a://living-insight-data/DOB_NOW__Build___Approved_Permits.csv")
    
mental_health = spark.read.format("csv") \
    .option("header","true") \
    .option("inferSchema","true") \
    .load("s3a://living-insight-data/Mental_Health_Service_Finder_Data.csv")

buildings_rdd = buildings.limit(1000).rdd.map(processhouse)
buildings = buildings_rdd.toDF()

buildings.write.jdbc("jdbc:postgresql://localhost:5432/living_insight", table = 'buildings', properties = { "user": "******", "password" : "postgres" })


mental_health_rdd = mental_health.rdd.zipWithIndex().map(processmentalhealth)
mental_health = mental_health_rdd.toDF()
mental_health = mental_health.filter(mental_health.longitude.isNotNull())

mental_udf = udf(handle_building,BooleanType())



house_id_with_mental_health = buildings.crossJoin(mental_health).where(mental_udf(struct([buildings[x] for x in buildings.columns]), struct([mental_health[x] for x in mental_health.columns]))).select(buildings.house_id,mental_health.query_id)


mental_health.write.jdbc("jdbc:postgresql://localhost:5432/living_insight", table = 'mental_health', properties = { "user": "******", "password" : "postgres" })

house_id_with_mental_health.write.jdbc("jdbc:postgresql://localhost:5432/living_insight", table = 'house_id_mental_health', properties = { "user": "******", "password" : "postgres" })
print("--- %s seconds ---" % (time.time() - start_time))

spark.stop()
Example 15
    def test_train_val_split_col_boolean(self):
        with spark_session('test_train_val_split_col_boolean') as spark:
            data = [
                [1.0, False], [1.0, False], [1.0, False], [1.0, False], [1.0, True]
            ]
            schema = StructType([StructField('data', FloatType()), StructField('val', BooleanType())])
            df = create_test_data_from_schema(spark, data, schema)

            validation = 'val'
            train_df, val_df, validation_ratio = util._train_val_split(df, validation)

            # Only check counts as validation ratio cannot be guaranteed due to approx calculation
            assert train_df.count() == 4
            assert val_df.count() == 1
Example 16
    StructField('description', StringType(), True),
    StructField('amount', StringType(), True),
    StructField('day_number', StringType(), True),
    StructField('weekday', StringType(), True),
]

WRONG_DEFAULT_STRUCT = StructType(fields=wrong_default_types)

correct_fortnightly_regularity_types = [
    StructField('date', DateType(), True),
    StructField('description', StringType(), True),
    StructField('amount', DoubleType(), True),
    StructField('day_number', StringType(), True),
    StructField('weekday', StringType(), True),
    StructField('days_passed', IntegerType(), True),
    StructField('fortnightly', BooleanType(), True)
]

FINAL_FORTNIGHTLY_STRUCT = StructType(
    fields=correct_fortnightly_regularity_types)

wrong_fortnightly_regularity_types = [
    StructField('date', StringType(), True),
    StructField('description', StringType(), True),
    StructField('amount', DoubleType(), True),
    StructField('day_number', StringType(), True),
    StructField('weekday', StringType(), True),
    StructField('days_passed', StringType(), True),
    StructField('fortnightly', StringType(), True)
]
Example 17
def process_bus_data(bus_df):
    """ Method to process raw business data from Yelp."""
    def select_elibigble_bus(row):
        """ Select businesses which fall into selected categores."""

        global categories
        try:
            # Return true if business falls into category list, else false.
            row_cats = row.split(',')
            for cat in row_cats:
                if cat.strip() in categories:
                    return True
            return False
        except (TypeError, AttributeError):
            # Returns false if business has no defined categories.
            return False

    def unpack_bus_attributes(row):
        """ Unpacks Business attributes and assigns them an index value."""

        # List to store business attributes.
        unpacked = list()
        # Unpack all attributes except PriceRange and Parking
        temp = [row[s] for s in bus_attributes]

        # Process PriceRange
        try:
            priceRange = int(row["RestaurantsPriceRange2"])
        except (TypeError, ValueError):
            # If no price range specified - default=2
            priceRange = 2

        #Process Parking
        try:
            parking = 1 if (row["BusinessParking"].find("True")) != -1 else -1
        except AttributeError:
            parking = 0

        # Process WiFi
        if row["WiFi"] == 'no' or row["WiFi"] == "u'no'":
            wifi = -1
        elif row["WiFi"] == None:
            wifi = 0
        else:
            wifi = 1

        # Tokenize all Boolean attributes.
        for i in temp:
            if i == "True":
                unpacked.append(1)
            elif i == "False":
                unpacked.append(-1)
            else:
                unpacked.append(0)
        # Append the Parking and PriceRange attributes
        unpacked.append(wifi)
        unpacked.append(parking)
        unpacked.append(priceRange)

        # Print any arrays that are not of desired length (=30).
        if len(unpacked) != 30:
            print(unpacked)
        return _convert_to_vector(
            csc_matrix(np.asarray(unpacked).astype(float)).T)

    def unpack_bus_categories(row):
        """Unpacks all business cattegories."""

        # List to store business categories.
        unpacked = list()
        # Split the comma-separated category string
        for cat in row.split(','):
            unpacked.append(cat.strip())
        return unpacked

    def unpack_price_range(row):
        """ Returns price range."""
        return int(row[-1])

    # Package the functions above into Spark SQL user-defined functions
    udf_select_eligible_bus = udf(select_elibigble_bus, BooleanType())
    udf_unpack_bus_attributes = udf(unpack_bus_attributes, VectorUDT())
    udf_unpack_bus_categories = udf(unpack_bus_categories,
                                    ArrayType(StringType()))
    udf_unpack_price_range = udf(unpack_price_range, IntegerType())

    # Find businesses to include.
    eligible_bus = bus_df.withColumn("include", udf_select_eligible_bus(col("categories"))) \
        .filter(col("include") == True)

    # Process business attributes feature.
    all_bus_attributes = set(
        bus_df.select("attributes").take(1)[0].attributes.asDict().keys())
    bus_attributes_to_exclude = {
        'AcceptsInsurance', 'AgesAllowed', 'ByAppointmentOnly', 'Caters',
        'Corkage', 'DietaryRestrictions', 'HairSpecializesIn', 'Open24Hours',
        'RestaurantsAttire', 'RestaurantsPriceRange2', 'BusinessParking',
        'WiFi'
    }
    bus_attributes = list(all_bus_attributes - bus_attributes_to_exclude)
    bus_attributes.sort()
    eligible_attr = eligible_bus.withColumn(
        "unpackedAttr", udf_unpack_bus_attributes(col("attributes")))

    # Process business categories feature.
    eligible_cats = eligible_attr.withColumn(
        "unpackedCats", udf_unpack_bus_categories(col("categories")))
    cv = CountVectorizer(inputCol="unpackedCats", outputCol="vectorizedCats")
    vectorized_cats = cv.fit(eligible_cats).transform(eligible_cats)

    # Un-bundle price range from all other attributes.
    unpacked_pr = vectorized_cats.withColumn(
        "priceRange", udf_unpack_price_range(col("unpackedAttr")))
    unpacked_pr.take(1)

    # Reduce dimensions of attributes and categories features, respectively.
    pca_attr = PCA(k=3, inputCol="unpackedAttr",
                   outputCol="pcaAttr").fit(unpacked_pr)
    temp = pca_attr.transform(unpacked_pr)
    temp.show()
    pca_cats = PCA(k=1, inputCol="vectorizedCats",
                   outputCol="pcaCats").fit(temp)
    temp2 = pca_cats.transform(temp)
    temp2.show()

    # Assemble into final feature vector.
    va = VectorAssembler(
        inputCols=["stars", "priceRange", "pcaAttr", "pcaCats"],
        outputCol="featureVec")
    features = va.transform(temp2).select("business_id", "stars", "categories",
                                          "featureVec")
    features.take(1)

    # Unpack
    n_features = len(features.select("featureVec").take(1)[0].featureVec)
    final = features.withColumn("f", vector_to_array(col("featureVec"))) \
        .select(["business_id", "stars", "categories"] + [col("f")[i] for i in range(n_features)])

    return final, n_features
Example 18
from json import loads
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import StructType, StructField, ArrayType, IntegerType, LongType, BooleanType
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, col

sc = SparkSession.builder.appName("dotingestion").getOrCreate()

schema = StructType([StructField("dire_lineup", ArrayType(IntegerType(), False), False),
                    StructField("radiant_lineup", ArrayType(IntegerType(), False), False),
                    StructField("radiant_win", BooleanType(), False),
                    StructField("match_id", LongType(), False)])

path = "data.json"
df = sc.read.json(path, schema=schema).na.drop("all").distinct()

with open("heroes.json", 'r', encoding="utf-8") as f:
    heroes_dict = {hero['id']: i for i, hero in enumerate(loads(f.read()))}

def convert_heroes_to_lineup(df: DataFrame) -> DataFrame:

    def onehot(heroes: ArrayType):
        lineup = tuple(heroes_dict[hero] for hero in heroes)
        return Vectors.dense([1 if hero_slot in lineup else 0 for hero_slot in range(len(heroes_dict))])

    heros_to_lineup_udf = udf(onehot, VectorUDT())
    return df.withColumn("dire_lineup_vec", heros_to_lineup_udf(df.dire_lineup))\
             .withColumn("radiant_lineup_vec", heros_to_lineup_udf(df.radiant_lineup))

df = convert_heroes_to_lineup(df)
Example 19
def repeat(times, func, *args, **kwargs):
    for _ in range(times):
        yield func(*args, **kwargs)


# --------- Create dataframe (from fake data) -------------
data = list(repeat(10000, fake_entry))
dataDF = spark.createDataFrame(
    data, ('last_name', 'first_name', 'ssn', 'occupation', 'age'))

subDF = dataDF.select('last_name', 'first_name', 'ssn', 'occupation',
                      (dataDF.age - 1).alias('age'))
filteredDF = subDF.filter(subDF.age < 10)

from pyspark.sql.types import BooleanType
less_ten = udf(lambda s: s < 10, BooleanType())
lambdaDF = subDF.filter(less_ten(subDF.age))
lambdaDF.show()
lambdaDF.count()

# Let's collect the even values less than 10
even = udf(lambda s: s % 2 == 0, BooleanType())
evenDF = lambdaDF.filter(even(lambdaDF.age))
evenDF.show()
evenDF.count()

print("first: {0}\n".format(filteredDF.first()))
print("Four of them: {0}\n".format(filteredDF.take(4)))

tempDF = spark.createDataFrame([("Joe", 1), ("Joe", 1), ("Anna", 15),
                                ("Anna", 12), ("Ravi", 5)], ('name', 'score'))
Example 20
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pyspark.sql.functions import pandas_udf, PandasUDFType, col
from pyspark.sql.types import BooleanType

import pandas as pd

from typing import Any

from fink_broker.tester import spark_unit_tests


@pandas_udf(BooleanType(), PandasUDFType.SCALAR)
def keep_alert_based_on(nbad: Any, rb: Any, magdiff: Any) -> pd.Series:
    """ Experimental filtering service. For testing purposes only.

    Create a column whose entry is false if the alert has to be discarded,
    and true otherwise.

    Parameters
    ----------
    nbad: Spark DataFrame Column
        Column containing the nbad values
    rb: Spark DataFrame Column
        Column containing the rb values
    magdiff: Spark DataFrame Column
        Column containing the magdiff values
Example 21
    def make_not_terminal_udf(actions: List[str]):
        """ Return true iff next_action is terminal (i.e. idx = len(actions)). """
        def get_not_terminal(next_action):
            return next_action < len(actions)

        return udf(get_not_terminal, BooleanType())
Example 22
def main():
    get_unstructured_file()
    logger.success('Downloaded Files')

    schema = StructType([
        StructField('Last Accessed Url', StringType(), True),
        StructField('Page Category', StringType(), True),
        StructField('Page Category 1', StringType(), True),
        StructField('Page Category 2', StringType(), True),
        StructField('Page Category 3', StringType(), True),
        StructField('Page Name', StringType(), True),
        StructField('at', StringType(), True),
        StructField('browser', StringType(), True),
        StructField('carrier', StringType(), True),
        StructField('city_name', StringType(), True),
        StructField('clv_total', LongType(), True),
        StructField('country', StringType(), True),
        StructField('custom_1', StringType(), True),
        StructField('custom_2', StringType(), True),
        StructField('custom_3', StringType(), True),
        StructField('custom_4', StringType(), True),
        StructField('device_new', BooleanType(), True),
        StructField('first-accessed-page', StringType(), True),
        StructField('install_uuid', StringType(), True),
        StructField('language', StringType(), True),
        StructField('library_ver', StringType(), True),
        StructField('marketing_campaign', StringType(), True),
        StructField('marketing_medium', StringType(), True),
        StructField('marketing_source', StringType(), True),
        StructField('model', StringType(), True),
        StructField('name', StringType(), True),
        StructField('nth', LongType(), True),
        StructField('os_ver', StringType(), True),
        StructField('platform', StringType(), True),
        StructField('region', StringType(), True),
        StructField('session_uuid', StringType(), True),
        StructField('studentId_clientType', StringType(), True),
        StructField('type', StringType(), True),
        StructField('user_type', StringType(), True),
        StructField('uuid', StringType(), True)
    ])

    logger.debug('Creating DataFrame...')
    df = spark.read.schema(schema).json('*.json')
    logger.success('DataFrame created with {} rows'.format(df.count()))

    df = df.select('at',
                'browser',
                'country',
                'custom_4',
                'studentId_clientType',
                'Page Name',
                'Last Accessed Url') \
            .filter(df['studentId_clientType'].isNotNull())

    df_country = df.select(df.country) \
        .filter(df.country != 'br') \
        .groupBy('country').count()
    #df_country.repartition(1).write.format('csv').mode('overwrite').option('header', 'true').save('country')



    df_users = df.filter(df.custom_4.isNotNull()) \
                .select(df.custom_4) \
                .groupBy(df.custom_4).count()
    #df_users.repartition(1).write.format('csv').mode('overwrite').option('header', 'true').save('user.csv')

    df_result = df.withColumn('id',
                              clean_studentId(df['studentId_clientType']))
    df_result = df_result.drop('studentId_clientType')

    query = """SELECT fat.id, state, city, cou.name course
                                FROM "DM_PASSEI_DIRETO".fat_students fat
                                INNER JOIN "DM_PASSEI_DIRETO".dim_courses cou
                                ON fat.course_id = cou.id
                                INNER JOIN "DM_PASSEI_DIRETO".dim_sessions ds 
                                ON fat.id = ds.student_id 
                                WHERE CAST(ds.start_time as VARCHAR) LIKE '2017-11-16%'"""
    students = dw_get_data(query)

    students_schema = StructType([
        StructField('id', StringType(), True),
        StructField('state', StringType(), True),
        StructField('city', StringType(), True),
        StructField('course', StringType(), True)
    ])

    df_dim = spark.createDataFrame(students, students_schema)

    df_result = df_result.join(df_dim, 'id', how='inner').distinct()
    #df_result.repartition(1).write.format('csv').mode('overwrite').option('header', 'true').save('full.csv')

    df_country.toPandas().to_csv('country.csv')
    df_users.toPandas().to_csv('users.csv')
    df_result.toPandas().to_csv('full.csv')
    send_files('*.csv')
Example 23
def select_relevant_columns(df,
                            discrete_action: bool = True,
                            include_possible_actions: bool = True):
    """ Select all the relevant columns and perform type conversions. """
    if not discrete_action and include_possible_actions:
        raise NotImplementedError(
            "currently we don't support include_possible_actions")

    select_col_list = [
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("reward").cast(FloatType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("state_features").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("state_features_presence").cast(ArrayType(BooleanType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("next_state_features").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("next_state_features_presence").cast(ArrayType(BooleanType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("not_terminal").cast(BooleanType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("action_probability").cast(FloatType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("mdp_id").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("sequence_number").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("step").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("time_diff").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("metrics").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("metrics_presence").cast(ArrayType(BooleanType())),
    ]

    if discrete_action:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action").cast(LongType()),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action").cast(LongType()),
        ]
    else:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action").cast(ArrayType(FloatType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action").cast(ArrayType(FloatType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action_presence").cast(ArrayType(BooleanType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action_presence").cast(ArrayType(BooleanType())),
        ]

    if include_possible_actions:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("possible_actions_mask").cast(ArrayType(LongType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("possible_next_actions_mask").cast(ArrayType(LongType())),
        ]

    return df.select(*select_col_list)
Example 24
    def test_as_spark_type_pandas_on_spark_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(pandas_on_spark_type(numpy_or_python_type),
                             (dtype, spark_type))

            if isinstance(numpy_or_python_type, CategoricalDtype):
                # Nested CategoricalDtype is not yet supported.
                continue

            self.assertEqual(as_spark_type(List[numpy_or_python_type]),
                             ArrayType(spark_type))
            self.assertEqual(
                pandas_on_spark_type(List[numpy_or_python_type]),
                (np.dtype("object"), ArrayType(spark_type)),
            )

            # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
            if sys.version_info >= (3, 8) and LooseVersion(
                    np.__version__) >= LooseVersion("1.21"):
                import numpy.typing as ntp

                self.assertEqual(
                    as_spark_type(ntp.NDArray[numpy_or_python_type]),
                    ArrayType(spark_type))
                self.assertEqual(
                    pandas_on_spark_type(ntp.NDArray[numpy_or_python_type]),
                    (np.dtype("object"), ArrayType(spark_type)),
                )

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            pandas_on_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            pandas_on_spark_type(np.dtype("object"))
Example 25
# STRING COMPARISON


@udf(returnType=IntegerType())
def damerau_levenshtein_distance(s1, s2):
    return None if s1 is None or s2 is None else J.damerau_levenshtein_distance(s1, s2)


@udf(returnType=IntegerType())
def hamming_distance(s1, s2):
    return None if s1 is None or s2 is None else J.hamming_distance(s1, s2)


@udf(returnType=FloatType())
def jaro_similarity(s1, s2):
    return None if s1 is None or s2 is None else J.jaro_similarity(s1, s2)


@udf(returnType=FloatType())
def jaro_winkler_similarity(s1, s2):
    return None if s1 is None or s2 is None else J.jaro_winkler_similarity(s1, s2)


@udf(returnType=BooleanType())
def match_rating_comparison(s1, s2):
    return None if s1 is None or s2 is None else J.match_rating_comparison(s1, s2)
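
# Hedged usage sketch (the DataFrame `pairs`, its columns, and the `col` import
# are assumptions, not from the original):
#
# pairs = pairs.withColumn("jw_sim", jaro_winkler_similarity(col("name_a"), col("name_b")))
# pairs = pairs.withColumn("mra_match", match_rating_comparison(col("name_a"), col("name_b")))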
Example 26
from pyspark.sql.types import (
    DoubleType,
    LongType,
    StringType,
    StructField,
    StructType,
    BooleanType,
)

retention_schema = StructType([
    StructField("client_id", StringType(), True),
    StructField("subsession_start", StringType(), True),
    StructField("profile_creation", StringType(), True),
    StructField("days_since_creation", LongType(), True),
    StructField("channel", StringType(), True),
    StructField("app_version", StringType(), True),
    StructField("geo", StringType(), True),
    StructField("distribution_id", StringType(), True),
    StructField("is_funnelcake", BooleanType(), True),
    StructField("source", StringType(), True),
    StructField("medium", StringType(), True),
    StructField("campaign", StringType(), True),
    StructField("content", StringType(), True),
    StructField("sync_usage", StringType(), True),
    StructField("is_active", BooleanType(), True),
    StructField("usage_hours", DoubleType(), True),
    StructField("sum_squared_usage_hours", DoubleType(), True),
    StructField("total_uri_count", LongType(), True),
    StructField("unique_domains_count", LongType(), True),
])
Example 27
cultures = ['japanese', 'american', 'african (general)']

# Uncomment the above for more robust and large scale searches!

classes = cultures + mediums

medium_set = set(mediums)
culture_set = set(cultures)
selected_ids = {"AK-RBK-17525-2", "AK-MAK-1204", "AK-RAK-2015-2-9"}

small_df = df.where(
    udf(
        lambda medium, culture, id_val: (medium in medium_set) or
        (culture in culture_set) or (id_val in selected_ids),
        BooleanType())("Classification", "Culture", "id"))

small_df.count()

# COMMAND ----------

# MAGIC %md ### Define and fit ConditionalKNN models
# MAGIC Below, we create ConditionalKNN models for both the medium and culture columns; each model takes in an output column, features column (feature vector), values column (cell values under the output column), and label column (the quality that the respective KNN is conditioned on).

# COMMAND ----------

medium_cknn = (
    ConditionalKNN().setOutputCol("Matches").setFeaturesCol("Norm_Features").
    setValuesCol("Thumbnail_Url").setLabelCol("Classification").fit(small_df))

# COMMAND ----------
Example 28
def transactions_feature_table(spark, client):
    schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_transactions", DoubleType()),
        StructField("is_vip", BooleanType()),
    ])
    df_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            50.0,
            True,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            100.0,
            True,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            400.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            200.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=4, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            300.0,
            False,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "transactions",
                                                  schema, df_data)
    file_source = FileSource(
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created_timestamp",
        file_format=ParquetFormat(),
        file_url=file_uri,
    )
    features = [
        Feature("total_transactions", ValueType.DOUBLE),
        Feature("is_vip", ValueType.BOOL),
    ]
    feature_table = FeatureTable("transactions", ["customer_id"],
                                 features,
                                 batch_source=file_source)
    yield client.apply(feature_table)
    shutil.rmtree(temp_dir)
Example 29
    StructField(
        "build",
        ArrayType(
            StructType([StructField("application_name", StringType(), True)]),
            True), True),
    StructField(
        "settings",
        ArrayType(StructType([StructField("locale", StringType(), True)]),
                  True), True),
    StructField(
        "active_addons",
        ArrayType(
            MapType(
                StringType(),
                StructType([
                    StructField("blocklisted", BooleanType(), True),
                    StructField("type", StringType(), True),
                    StructField("signed_state", LongType(), True),
                    StructField("user_disabled", BooleanType(), True),
                    StructField("app_disabled", BooleanType(), True),
                    StructField("is_system", BooleanType(), True)
                ]), True), True))
])

default_sample = {
    "client_id":
    "client-id",
    "normalized_channel":
    "release",
    "build": [{
        "application_name": "Firefox"
Example 30
def helper_method_process_data(total_data, spark_context, sql_context):
    """ Process the raw input data

    This function parallelizes and filters the raw input data
    so that it is ready for model building.

    Keyword arguments:
    total_data -- raw input data
    spark_context -- Spark context created in caller module
    sql_context -- SQL/HIVE Context created in caller module

    """
    def table_creator(row):
        '''
        This method will give minimal structure to the data, as per requirement.
        '''
        temp_file_name = str(row['ci_job']['artifacts'].get('name', ''))
        return (Row(
            original_url=str(row['ci_job']['artifacts'].get(
                'original_url', '')),
            result=str(row['ci_job'].get('result', '')),
            message=row.get('message', ''),
            file_name='XML'
            if temp_file_name.endswith('.xml') else temp_file_name,
        ))

    def datetime_substitutor(a):
        return re.sub(r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}', '', a)

    def space_remover(a):
        return re.sub(r'\s+', ' ', a)

    def caller_method(a):
        return space_remover(datetime_substitutor(a))

    print "===============One data point=============:"
    print "inside  helper_method_process_data KEYS:", total_data[0].keys()

    print "After filtering out the data without _source, length of data:", len(
        total_data)

    total_df = spark_context.parallelize(total_data).map(table_creator)

    total_df = total_df.filter(
        lambda a: a.original_url.endswith('xunit.xml') == False)

    total_df = sql_context.createDataFrame(data=total_df, samplingRatio=0.3)

    udf_caller_method = udf(caller_method)

    #msg_df_failed = total_df.rdd.filter(lambda a:a.result != 'SUCCESS').toDF(sampleRatio=0.3).select('message','original_url')
    msg_df_failed = total_df.select('message', 'original_url')

    msg_df_failed = msg_df_failed.withColumn(
        "cleaned_message", udf_caller_method(msg_df_failed.message))

    msg_df_failed.first()

    #regex = re.compile('[%s]' % re.escape(string.punctuation))

    color_code_words = ['', '']

    stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
    stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
    stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
    stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
    stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
    stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
    stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
    stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
    stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
    stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
    stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
    stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
    stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
    stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
    stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
    stopwords += [
        'false', 'few', 'fifteen', 'fifty', 'file', 'fill', 'find', 'fire',
        'first'
    ]
    stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
    stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
    stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
    stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
    stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
    stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
    stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
    stopwords += [
        'last', 'latter', 'latterly', 'least', 'less', 'line', 'ltd', 'made'
    ]
    stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
    stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
    stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
    stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
    stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
    stopwords += ['off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or']
    stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
    stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
    stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
    stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
    stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
    stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
    stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
    stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
    stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
    stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
    stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
    stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
    stopwords += [
        'together', 'too', 'top', 'toward', 'towards', 'true', 'twelve'
    ]
    stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
    stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
    stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
    stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
    stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
    stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
    stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
    stopwords += ['yours', 'yourself', 'yourselves']
    stopwords += color_code_words
    stopwords = set(stopwords)

    broadcast_var = spark_context.broadcast(stopwords)

    total_df.unpersist()
    del total_df

    ## Notes: Map Reduce was replaced by Aggregation of Spark Framework.
    temp_df = msg_df_failed.filter(
        udf(lambda x: x.strip() != '', BooleanType())(
            msg_df_failed.cleaned_message)).groupby('original_url').agg(
                func.concat_ws(
                    " ", func.collect_list(
                        msg_df_failed.cleaned_message))).withColumnRenamed(
                            "original_url", "artifact_url").withColumnRenamed(
                                "concat_ws( , collect_list(cleaned_message))",
                                "concat_msg")

    print "=================== NLTK version Print====================="
    print nltk.__version__

    def word_tokenize(x):
        return nltk.word_tokenize(x)

    df = (temp_df.rdd.map(lambda x: (
        x.artifact_url, word_tokenize(x.concat_msg))).toDF().withColumnRenamed(
            "_1", "artifact_url")).withColumnRenamed("_2", "features")

    temp_df.unpersist()
    msg_df_failed.unpersist()

    del temp_df
    del msg_df_failed

    df = df.rdd.map(lambda a: Row(
        artifact_url=a.artifact_url,
        features=filter(lambda y: len(y) > 3, [x.lower() for x in a.features])
    )).map(
        lambda a: Row(artifact_url=a.artifact_url,
                      features=filter(lambda x: (x not in broadcast_var.value),
                                      a.features))).toDF()

    return df
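
# Hedged usage sketch (the raw `total_data` list and both contexts come from the
# caller module, per the docstring; the names here are assumptions):
#
# processed_df = helper_method_process_data(total_data, sc, sqlContext)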