def get_telemetry_crashes(sc, versions, days, product='Firefox'):
    """Load Socorro crash records for one product and a set of versions.

    One parquet location is read per day, under
    ``s3://telemetry-parquet/socorro_crash/v2/crash_date=YYYYMMDD``.

    Args:
        sc: Context object handed to ``SQLContext`` (presumably a
            SparkContext -- confirm against the caller).
        versions: Values the ``version`` column is allowed to take.
        days: Passed to ``utils.get_days``; the result must be an iterable
            of objects supporting ``strftime`` (presumably dates -- confirm).
        product: Value the ``product`` column must equal.

    Returns:
        The loaded dataset restricted to rows matching ``product`` and
        ``versions``. Unless ``product`` is ``'FennecAndroid'``, the
        ``android_*`` columns listed below are left out.
    """
    wanted_days = utils.get_days(days)
    reader = SQLContext(sc).read
    partition_paths = [
        's3://telemetry-parquet/socorro_crash/v2/crash_date=' + day.strftime('%Y%m%d')
        for day in wanted_days
    ]
    crashes = reader.load(partition_paths, 'parquet')

    if product != 'FennecAndroid':
        # These columns are only kept for the FennecAndroid product.
        android_only = (
            'android_board',
            'android_brand',
            'android_cpu_abi',
            'android_cpu_abi2',
            'android_device',
            'android_hardware',
            'android_manufacturer',
            'android_model',
            'android_version',
        )
        kept_columns = [name for name in crashes.columns if name not in android_only]
        crashes = crashes.select(kept_columns)

    matches_product = crashes['product'] == product
    matches_version = crashes['version'].isin(versions)
    return crashes.filter(matches_product & matches_version)
Beispiel #2
0
	def load_dataFrame_from_csv(self, csvFilePath):
		"""Read a CSV file with a fixed schema and drop rows whose ``no`` is "None".

		The file is read with ``header=True`` through ``SQLContext(self.spark)``
		using the explicit schema below; every field is declared nullable.

		Args:
			csvFilePath: Location of the CSV file, forwarded to the reader.

		Returns:
			The loaded rows filtered by the condition ``no != "None"``.
		"""
		# (column name, type constructor) in file order.
		column_types = [
			("X2", StringType),
			("X4", StringType),
			("X5", StringType),
			("X6", StringType),
			("adversaire", StringType),
			("score_france", IntegerType),
			("score_adversaire", IntegerType),
			("penalty_france", StringType),
			("penalty_adversaire", StringType),
			("date", DateType),
			("year", IntegerType),
			("outcome", StringType),
			("no", StringType),
		]
		schema = StructType([
			StructField(column_name, make_type(), True)
			for column_name, make_type in column_types
		])

		reader = SQLContext(self.spark).read
		all_rows = reader.csv(csvFilePath, header=True, schema=schema)
		# NOTE(review): the comparison is against the literal string "None";
		# how rows with a null `no` are treated depends on Spark's null
		# comparison semantics -- confirm this is the intended filter.
		return all_rows.filter(all_rows.no != "None")