def _flatten_encounter(self,
                       base_encounter_url: str,
                       force_location_type_columns: bool = True):
    """Returns a custom flat view of encounters."""
    # When merging flattened encounters and observations, we need to be careful
    # with flattened columns for encounter type and location and only include
    # them if there are constraints on them. Otherwise we may end up with a
    # single observation repeated multiple times in the view.
    flat_df = self._enc_df.select(
        "subject", "id", "location", "type", "period").withColumn(
            "encounterId", F.regexp_replace("id", base_encounter_url, ""))
    column_list = [
        F.col("encounterId"),
        F.col("subject.patientId").alias("encPatientId"),
        F.col("period.start").alias("first"),
        F.col("period.end").alias("last"),
    ]
    if self._enc_constraint.has_location() or force_location_type_columns:
        flat_df = flat_df.withColumn("locationFlat",
                                     F.explode_outer("location"))
        column_list += [
            F.col("locationFlat.location.LocationId").alias("locationId"),
            F.col("locationFlat.location.display").alias("locationDisplay"),
        ]
    if self._enc_constraint.has_type() or force_location_type_columns:
        flat_df = flat_df.withColumn("typeFlat", F.explode_outer("type"))
        column_list += [
            F.col("typeFlat.coding.system").alias("encTypeSystem"),
            F.col("typeFlat.coding.code").alias("encTypeCode"),
        ]
    return flat_df.select(column_list).where(
        self._construct_encounter_constraint(self._enc_constraint))
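# A minimal, hypothetical sketch of the row-multiplication concern described in
# the comment above; the toy schema stands in for the real FHIR encounter shape.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
enc = spark.createDataFrame(
    [("enc-1", ["ward-a", "ward-b"], ["outpatient", "emergency"])],
    ["id", "location", "type"],
)
# Exploding both arrays unconditionally turns this single encounter into
# 2 x 2 = 4 rows, which would then duplicate every observation joined to it.
exploded = (enc.withColumn("locationFlat", F.explode_outer("location"))
               .withColumn("typeFlat", F.explode_outer("type")))
print(exploded.count())  # 4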
def test_explode(self):
    from pyspark.sql.functions import explode, explode_outer, posexplode_outer

    d = [
        Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"}),
        Row(a=1, intlist=[], mapfield={}),
        Row(a=1, intlist=None, mapfield=None),
    ]
    rdd = self.sc.parallelize(d)
    data = self.spark.createDataFrame(rdd)

    result = data.select(explode(data.intlist).alias("a")).select("a").collect()
    self.assertEqual(result[0][0], 1)
    self.assertEqual(result[1][0], 2)
    self.assertEqual(result[2][0], 3)

    result = data.select(explode(data.mapfield).alias("a", "b")).select("a", "b").collect()
    self.assertEqual(result[0][0], "a")
    self.assertEqual(result[0][1], "b")

    result = [tuple(x) for x in data.select(posexplode_outer("intlist")).collect()]
    self.assertEqual(result, [(0, 1), (1, 2), (2, 3), (None, None), (None, None)])

    result = [tuple(x) for x in data.select(posexplode_outer("mapfield")).collect()]
    self.assertEqual(result, [(0, 'a', 'b'), (None, None, None), (None, None, None)])

    result = [x[0] for x in data.select(explode_outer("intlist")).collect()]
    self.assertEqual(result, [1, 2, 3, None, None])

    result = [tuple(x) for x in data.select(explode_outer("mapfield")).collect()]
    self.assertEqual(result, [('a', 'b'), (None, None), (None, None)])
def _flatten_encounter(self,
                       base_encounter_url: str,
                       force_location_type_columns: bool = True):
    """Returns a custom flat view of encounters."""
    # When merging flattened encounters and observations, we need to be careful
    # with flattened columns for encounter type and location and only include
    # them if there are constraints on them. Otherwise we may end up with a
    # single observation repeated multiple times in the view.
    flat_df = self._enc_df.select(
        'subject', 'id', 'location', 'type', 'period').withColumn(
            'encounterId', F.regexp_replace('id', base_encounter_url, ''))
    column_list = [
        F.col('encounterId'),
        F.col('subject.patientId').alias('encPatientId'),
        F.col('period.start').alias('first'),
        F.col('period.end').alias('last')
    ]
    if self._enc_constraint.has_location() or force_location_type_columns:
        flat_df = flat_df.withColumn('locationFlat',
                                     F.explode_outer('location'))
        column_list += [
            F.col('locationFlat.location.LocationId').alias('locationId'),
            F.col('locationFlat.location.display').alias('locationDisplay')
        ]
    if self._enc_constraint.has_type() or force_location_type_columns:
        flat_df = flat_df.withColumn('typeFlat', F.explode_outer('type'))
        column_list += [
            F.col('typeFlat.coding.system').alias('encTypeSystem'),
            F.col('typeFlat.coding.code').alias('encTypeCode')
        ]
    return flat_df.select(column_list).where(self._enc_constraint.sql())
def process_study_table(study_index: str) -> DataFrame:
    """Loads and processes disease information from the study table."""
    return (
        spark.read.json(study_index).select(
            'study_id',
            'pmid',
            'pub_date',
            'pub_author',
            'trait_reported',
            'trait_efos',
            col('n_initial').alias('sample_size'),
        )
        # Assign project based on the study author information
        .withColumn(
            'projectId',
            when(col('study_id').contains('FINNGEN'), 'FINNGEN')
            .when(col('study_id').contains('NEALE'), 'NEALE')
            .when(col('study_id').contains('SAIGE'), 'SAIGE')
            .when(col('study_id').contains('GCST'), 'GCST'),
        )
        # Warning! Not all studies have an EFO annotated (trait_efos is an empty array)
        # Also, some have multiple EFOs!
        # Studies with no EFO are kept, the array is exploded to capture each mapped trait
        .withColumn('efo', explode_outer(col('trait_efos'))).drop('trait_efos')
        # Drop records with HANCESTRO IDs as mapped trait
        .filter((~col('efo').contains('HANCESTRO')) | (col('efo').isNull())))
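# A toy illustration (hypothetical minimal schema) of the EFO handling above:
# explode_outer keeps studies whose trait_efos array is empty as a row with a
# NULL efo, and the final filter keeps those NULL rows while dropping HANCESTRO.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode_outer

spark = SparkSession.builder.master("local[1]").getOrCreate()
studies = spark.createDataFrame(
    [("GCST001", ["EFO_0000270", "HANCESTRO_0005"]), ("GCST002", [])],
    ["study_id", "trait_efos"],
)
kept = (studies.withColumn("efo", explode_outer(col("trait_efos")))
               .drop("trait_efos")
               .filter((~col("efo").contains("HANCESTRO")) | (col("efo").isNull())))
kept.show()  # GCST001 keeps only EFO_0000270; GCST002 survives with efo = NULL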
def flatten(df, exc_flat=list()):
    complex_fields = dict([
        (field.name, field.dataType) for field in df.schema.fields
        if (type(field.dataType) == ArrayType or type(field.dataType) == StructType)
        and (field.name not in exc_flat)
    ])
    while len(complex_fields) != 0:
        col_name = list(complex_fields.keys())[0]

        if (type(complex_fields[col_name]) == StructType):
            expanded = [
                col(col_name + '.' + k).alias(col_name + '_' + k)
                for k in [n.name for n in complex_fields[col_name]]
            ]
            df = df.select("*", *expanded).drop(col_name)

        elif (type(complex_fields[col_name]) == ArrayType):
            df = df.withColumn(col_name, explode_outer(col_name))

        # recompute remaining complex fields in the schema
        complex_fields = dict([
            (field.name, field.dataType) for field in df.schema.fields
            if (type(field.dataType) == ArrayType or type(field.dataType) == StructType)
            and (field.name not in exc_flat)
        ])
    return df
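# A brief usage sketch for the flatten() helper above, with invented data and
# assuming the helper's own imports (col, explode_outer, ArrayType, StructType)
# are in scope.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
orders = spark.createDataFrame(
    [({"id": 1, "city": "Oslo"}, [{"sku": "A", "qty": 2}, {"sku": "B", "qty": 1}])],
    "customer struct<id:int, city:string>, items array<struct<sku:string, qty:int>>",
)
# The struct becomes customer_id / customer_city columns; the items array is
# exploded into one row per element and its inner struct is flattened too.
flatten(orders).show()
# Passing exc_flat=["items"] instead would leave the items column untouched.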
def transfLineitems(ds, config):
    invoiceDS: DataFrame = ds
    invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
        .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))

    if invoiceDS.schema["TypeOfService"].dataType == StringType():
        invoiceDS = invoiceDS.withColumn("TypeOfService", array(col("_LineItems._Description"))) \
            .withColumn("ServiceAmount", array(col("_LineItems._TotalPriceNetto")))

    invoiceDS = invoiceDS.withColumn("ServiceNRate", arrays_zip(col("TypeOfService"), col("ServiceAmount"))) \
        .withColumn("ServiceNRate", explode_outer(col("ServiceNRate")))

    # udf_service = udf(get_genesis_TOS, StringType())
    # same code of line 56, remove costTypeList
    billPath = config["Master"]["billPath"]
    CostTypeList = createDatasetFromCSVFile("", billPath) \
        .filter(col("VENDOR_NAME").rlike("GENESIS MARINE, LLC")) \
        .rdd.map(lambda x: x.COST_LINE_ITEM_TYPE).collect()

    def udf_service(costType):
        return udf(lambda l: get_genesis_TOS_search(l, costType), StringType())

    invoiceDS = invoiceDS.withColumn(
        "TypeOfService",
        udf_service(CostTypeList)(regexp_replace(col("ServiceNRate.TypeOfService"), "\n", " "))) \
        .withColumn("ServiceAmount", col("ServiceNRate.ServiceAmount"))

    return invoiceDS
def process_collection(spark_session, impress_api_url, current_schema,
                       current_type, entity_df):
    """
    :param spark_session:
    :param impress_api_url:
    :param current_schema:
    :param current_type:
    :param entity_df:
    :return:
    """
    impress_subtype = ''
    for column_name in current_schema.names:
        if 'Collection' in column_name:
            impress_subtype = column_name.replace('Collection', '')
            if current_type != '':
                column_name = current_type + '.' + column_name
            entity_id_column_name = impress_subtype + 'Id'
            entity_df = entity_df.withColumn(
                entity_id_column_name, explode_outer(entity_df[column_name]))
            sub_entity_schema = get_impress_entity_schema(
                spark_session, impress_api_url, impress_subtype)
            get_entity_udf = udf(
                lambda x: get_impress_entity_by_id(impress_api_url,
                                                   impress_subtype, x),
                StructType(sub_entity_schema))
            entity_column_name = impress_subtype
            entity_df = entity_df.withColumn(
                entity_column_name,
                get_entity_udf(entity_df[entity_id_column_name]))
            entity_df = process_collection(spark_session, impress_api_url,
                                           sub_entity_schema, impress_subtype,
                                           entity_df)
    return entity_df
def json_to_flat(self):
    # method to flatten the dataframe
    df = self.df
    nested_columns = dict([(x.name, x.dataType) for x in df.schema.fields
                           if isinstance(x.dataType, T.StructType)
                           or isinstance(x.dataType, T.ArrayType)])
    if len(nested_columns) > 0:
        col_name = list(nested_columns.keys())[0]
        if isinstance(nested_columns[col_name], T.ArrayType):
            df = df.withColumn(col_name, explode_outer(col_name))
            return Ops(df).json_to_flat()
        elif isinstance(nested_columns[col_name], T.StructType):
            df = df.select(
                "*", *[
                    col(col_name + "." + x.name).alias(col_name + "_" + x.name)
                    for x in nested_columns[col_name]
                ]).drop(col_name)
            return Ops(df).json_to_flat()
    else:
        return df
def flatten_array(df):
    for column, column_type in df.dtypes:
        if column_type.startswith("array<"):
            df = df.select("*", f.explode_outer(df[column]).alias(f"{column}_2"))
            df = df.drop(column)
            df = df.toDF(*(c.replace("_2", "") for c in df.columns))
    return df.select("*")
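# Quick usage sketch for flatten_array() above, with invented data and assuming
# pyspark.sql.functions is imported as `f` in that module. Each top-level array
# column is exploded in place and keeps its original name; note the final
# rename-by-replace would also touch any pre-existing column containing "_2".
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, ["a", "b"]), (2, [])], ["id", "tags"])
flatten_array(df).show()
# id=1 yields rows with tags "a" and "b"; id=2 survives with tags = NULL.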
def consume_transactions(input_df, checkpoint_path):
    # The below section is for Flight Transactions transformation and loading
    new_df = input_df.withColumn(
        'new_val', F.regexp_replace(input_df['value'], '\\\\', '')).drop('value')
    new_df = new_df.withColumn(
        'value', F.regexp_replace(new_df['new_val'], '""', "'")).drop('new_val')
    new_df = new_df.withColumn(
        'new_val', F.regexp_replace(new_df['value'], '}n', "}")).drop('value')
    new_df = new_df.withColumn('new_val', F.regexp_replace(new_df['new_val'], "'", ""))
    # new_df = new_df.withColumn('struct_val', F.struct(new_df['new_val'])).drop('new_val')

    transaction_schema = StructType(
        (StructField("DestinationAirportCode", StringType()),
         StructField("Itinerary", StringType()),
         StructField("OneWayOrReturn", StringType()),
         StructField("OriginAirportCode", StringType()),
         StructField(
             "Segment",
             ArrayType(
                 StructType([
                     StructField("ArrivalAirportCode", StringType()),
                     StructField("DepartureAirportCode", StringType()),
                     StructField("LegNumber", StringType()),
                     StructField("NumberOfPassengers", StringType()),
                     StructField("SegmentNumber", StringType())
                 ]))),
         StructField("TransactionDateUTC", StringType()),
         StructField("UniqueId", StringType())))

    json_df = new_df.select(
        F.from_json(F.col("new_val"), transaction_schema).alias("value")).selectExpr('value.*')
    json_df = json_df.withColumn('Segment', F.explode_outer(json_df['Segment']))
    json_df = json_df.withColumn('DepartureAirportCode', F.col('Segment')['DepartureAirportCode']) \
        .withColumn('ArrivalAirportCode', F.col('Segment')['ArrivalAirportCode']) \
        .withColumn('LegNumber', F.col('Segment')['LegNumber']) \
        .withColumn('NumberOfPassengers', F.col('Segment')['NumberOfPassengers']) \
        .withColumn('SegmentNumber', F.col('Segment')['SegmentNumber']) \
        .drop('Segment')

    # Stream the data, from a Kafka topic to a Spark in-memory table
    query = json_df \
        .writeStream \
        .format("memory") \
        .queryName("TransactionTable") \
        .outputMode("append") \
        .option("checkpoint", checkpoint_path) \
        .start()
    query.awaitTermination(5)
    # Let it fill up the table
    sleep(10)
def columnsToRowsDf(cls, df, columnsdict):
    columnset = set()
    for key, values in columnsdict.items():
        values = tuple(values)
        df = df.withColumn(key, explode_outer(array(*values)))
        columnset.update(values)
    df_columns = tuple(df.columns)
    dfcoltupl = filter(lambda x: x not in columnset, df_columns)
    return df.select(*dfcoltupl)
def firetv_events(spark, experiment):
    te = spark.table('telemetry_mobile_event_parquet')
    return te.filter(
        te.app_name == 'FirefoxForFireTV'
    ).select(
        te.submission_date_s3,
        te.client_id,
        F.explode_outer(te.events).alias('event')
    )
def process(df: DataFrame) -> pd.DataFrame:
    """Delete stop words and compute the most frequent words over the text of all patents"""
    counts = df.select(
        sf.explode_outer(f"{OUTPUT_COL_ENGLISH_TEXT}_stopwords").alias("word"))
    # TODO check num partitions
    result = counts.groupBy("word").count()
    result = result.sort(sf.col("count").desc()).limit(NUM_MOST_FREQUENT_WORDS)
    result_p = result.toPandas()
    return result_p
def _explode_all(df, df_array_fields):
    """
    :param df:
    :param df_array_fields:
    :return:
    """
    for coluna in df_array_fields.keys():
        df = df.withColumn(coluna, explode_outer(col(coluna)))
    return df
def log_language_distribution(df: DataFrame, field_name: str):
    """Generates a log with the distribution of languages"""
    logger.info(f"Getting language distribution for: {field_name}")
    if isinstance(df.select(field_name).schema.fields[0].dataType, ArrayType):
        languages = df.select(
            sf.explode_outer(field_name).alias("target_field"))
    else:
        languages = df.select(sf.col(field_name).alias("target_field"))
    languages_p = languages.groupby("target_field._lang").count().toPandas()
    logger.info(
        f"Distribution of languages in {field_name}:\n{languages_p.to_string()}"
    )
def _explode_columns(df, df_array_fields, columns):
    """
    :param df:
    :param df_array_fields:
    :param columns:
    :return:
    """
    for column in columns:
        if column in df_array_fields.keys():
            df = df.withColumn(column, explode_outer(col(column)))
    return df
def _flatten_obs(obs: pyspark_sql.DataFrame,
                 code_system: str = None) -> pyspark_sql.DataFrame:
    """Creates a flat version of Observation FHIR resources.

    Note `code_system` is only applied on `code.coding` which is a required
    field, i.e., it is not applied on `value.codeableConcept.coding`.

    Args:
        obs: A collection of Observation FHIR resources.
        code_system: The code system to be used for filtering `code.coding`.

    Returns:
        A DataFrame with the following columns (note one input observation
        might be repeated, once for each of its codes):
        - `coding` from the input observation's `code.coding`
        - `valueCoding` from the input's `value.codeableConcept.coding`
        - `value` from the input's `value`
        - `patientId` from the input's `subject.patientId`
        - `dateTime` from the input's `effective.dateTime`
    """
    sys_str = ('coding.system="{}"'.format(code_system)
               if code_system else "coding.system IS NULL")
    value_sys_str_base = ('valueCoding.system="{}"'.format(code_system)
                          if code_system else "valueCoding.system IS NULL")
    value_sys_str = "(valueCoding IS NULL OR {})".format(value_sys_str_base)
    merge_udf = F.UserDefinedFunction(_merge_date_and_value, T.StringType())
    return (obs.withColumn(
        "coding", F.explode("code.coding")).where(sys_str).withColumn(
            "valueCoding",
            # Note valueCoding can be null.
            F.explode_outer("value.codeableConcept.coding"),
        ).where(value_sys_str).withColumn(
            "dateAndValue",
            merge_udf(F.col("effective.dateTime"),
                      F.col("value.quantity.value")),
        ).withColumn(
            "dateAndValueCode",
            merge_udf(F.col("effective.dateTime"), F.col("valueCoding.code")),
        ).select(
            F.col("coding"),
            F.col("valueCoding"),
            F.col("value"),
            F.col("subject.patientId").alias("patientId"),
            F.col("effective.dateTime").alias("dateTime"),
            F.col("dateAndValue"),
            F.col("dateAndValueCode"),
            F.col("context.EncounterId").alias("encounterId"),
        ))
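# A toy illustration (deliberately simplified, hypothetical schema) of the
# filtering pattern used above: explode code.coding and keep only rows whose
# coding.system matches the requested code system.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
obs = spark.createDataFrame(
    [("obs-1", {"coding": [{"system": "http://loinc.org", "code": "718-7"},
                           {"system": "http://example.org", "code": "X"}]})],
    "id string, code struct<coding: array<struct<system:string, code:string>>>",
)
flat = (obs.withColumn("coding", F.explode("code.coding"))
           .where('coding.system="http://loinc.org"'))
flat.select("id", "coding.code").show()  # only the LOINC-coded row survives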
def do(self, workflow, etl_process):
    from pyspark.sql.functions import explode_outer, explode, col

    self.target = self.action_details.pop("target")
    self.keep_nulls = self.action_details.pop("keep_nulls", True)
    other_cols = [col(i) for i in workflow.df.columns if i != self.target]
    if self.keep_nulls:
        workflow.df = workflow.df \
            .select(*other_cols, explode_outer(col(self.target)).alias(self.target))
    else:
        workflow.df = workflow.df \
            .select(*other_cols, explode(col(self.target)).alias(self.target))
def format_dataframe(df: DataFrame, verbose=True) -> DataFrame:
    if verbose:
        print('Formatting dataframe:')

    # Save start time for timing
    start_time = time.time()

    # Explode the columns: "devices" and "tracks"
    if verbose:
        print('\tExploding columns containing lists.')
    df = (df.withColumn('devices', f.explode_outer('devices'))
            .withColumn('tracks', f.explode_outer('tracks')))

    # Flatten the schema.
    if verbose:
        print('\tFlattening the dataframe schema.')
    df = t.spark.flatten_schema(df)

    if verbose:
        print(f'\tExecution time: {time.time() - start_time:.5f}')
    return df
def transformed_df(books_info):
    transformed_emp = books_info.withColumn("languages", explode_outer("languages")) \
        .withColumn("subjects", explode_outer("subjects")) \
        .withColumn("other_titles", explode_outer("other_titles")) \
        .withColumn("publishers", explode_outer("publishers")) \
        .withColumn("authors", explode_outer("authors")) \
        .withColumn("genres", explode_outer("genres")) \
        .withColumn("publish_places", explode_outer("publish_places")) \
        .withColumn("download_url", explode_outer("download_url")) \
        .withColumn("batch_date", lit(date_format(current_timestamp(), "yyyy-MM-dd HH:mm:ss"))) \
        .select("Title",
                regexp_replace(expr("languages.key"), "/languages/", "").alias("languages"),
                "subjects",
                regexp_replace(expr("location"), "[\[\]\"]", "").alias("location"),
                "other_titles",
                "publishers",
                "publish_places",
                col("last_modified.value").alias("last_modified"),
                regexp_replace(expr("authors.key"), "/authors/", "").alias("authors"),
                col("created.value").alias("created"),
                "genres",
                "contributions",
                "number_of_pages",
                "publish_country",
                "publish_date",
                "download_url",
                "batch_date")
    return transformed_emp
def prepare_version_multiplicity_data(sbom_dataframe: DataFrame) -> DataFrame:
    vm_with_score_df = sbom_dataframe.select(
        'firmware_hash',
        explode_outer('components').alias('component')).select(
            'firmware_hash',
            col('component.name').alias('name'),
            col('component.version').alias('version'),
        ).na.drop(subset=['version']).groupBy('firmware_hash', 'name').agg(
            countDistinct('version').alias('version_count')).select(
                'firmware_hash',
                when(col('version_count') > 1,
                     col('version_count') - 1).otherwise(0).alias(
                         'multiplicity')).groupBy('firmware_hash').agg(
                             _sum('multiplicity').cast('int').alias(
                                 'software_version_multiplicity'))
    return vm_with_score_df
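# A small worked example of the metric above, with invented SBOM data and
# assuming the snippet's own imports (explode_outer, col, when, countDistinct,
# _sum) are in scope: openssl appears in two distinct versions (2 - 1 = 1) and
# zlib in one (0), so the firmware's software_version_multiplicity is 1.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
sbom = spark.createDataFrame(
    [("fw-1", [{"name": "openssl", "version": "1.0.2"},
               {"name": "openssl", "version": "1.1.1"},
               {"name": "zlib", "version": "1.2.11"}])],
    "firmware_hash string, components array<struct<name:string, version:string>>",
)
prepare_version_multiplicity_data(sbom).show()  # fw-1 -> 1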
def _flatten_obs(obs: DataFrame, code_system: str = None) -> DataFrame:
    """Creates a flat version of Observation FHIR resources.

    Note `code_system` is only applied on `code.coding` which is a required
    field, i.e., it is not applied on `value.codeableConcept.coding`.

    Args:
        obs: A collection of Observation FHIR resources.
        code_system: The code system to be used for filtering `code.coding`.

    Returns:
        A DataFrame with the following columns (note one input observation
        might be repeated, once for each of its codes):
        - `coding` from the input observation's `code.coding`
        - `valueCoding` from the input's `value.codeableConcept.coding`
        - `value` from the input's `value`
        - `patientId` from the input's `subject.patientId`
        - `dateTime` from the input's `effective.dateTime`
    """
    sys_str = 'coding.system="{}"'.format(
        code_system) if code_system else 'coding.system IS NULL'
    value_sys_str_base = 'valueCoding.system="{}"'.format(
        code_system) if code_system else 'valueCoding.system IS NULL'
    value_sys_str = '(valueCoding IS NULL OR {})'.format(value_sys_str_base)
    merge_udf = F.UserDefinedFunction(
        lambda d, v: merge_date_and_value(d, v), T.StringType())
    return obs.withColumn(
        'coding', F.explode('code.coding')).where(sys_str).withColumn(
            'valueCoding',
            # Note valueCoding can be null.
            F.explode_outer('value.codeableConcept.coding')
        ).where(value_sys_str).withColumn(
            'dateAndValue',
            merge_udf(F.col('effective.dateTime'),
                      F.col('value.quantity.value'))
        ).withColumn(
            'dateAndValueCode',
            merge_udf(F.col('effective.dateTime'), F.col('valueCoding.code'))
        ).select(F.col('coding'),
                 F.col('valueCoding'),
                 F.col('value'),
                 F.col('subject.patientId').alias('patientId'),
                 F.col('effective.dateTime').alias('dateTime'),
                 F.col('dateAndValue'),
                 F.col('dateAndValueCode'),
                 F.col('context.EncounterId').alias('encounterId'))
def get_drug_citation_df(df, col_name, df_join):
    """
    Returns a Dataframe with drug description for each publication or trial
    (each row) that mentioned the drug
    :param df:
    :param col_name:
    :param df_join:
    :return: a Dataframe for each citation
    """
    # explode dataframe by remaining tokens (drug name)
    df_article_drug = df.withColumn("drug_lower",
                                    explode_outer(col(col_name))).drop(col_name)
    # join on drugs dataframe
    df_drug_citation = df_join.join(df_article_drug, how='full', on='drug_lower')
    return df_drug_citation
def process_collection(spark_session, impress_api_url, current_schema,
                       current_type, entity_df, proxies):
    """
    :param spark_session:
    :param impress_api_url:
    :param current_schema:
    :param current_type:
    :param entity_df:
    :param proxies:
    :return:
    """
    impress_subtype = ""
    collection_types = []
    for column_name in current_schema.names:
        if "Collection" in column_name:
            impress_subtype = column_name.replace("Collection", "")
            if current_type != "":
                column_name = current_type + "." + column_name
            sub_entity_schema = get_impress_entity_schema(
                spark_session, impress_api_url, impress_subtype, proxies)
            get_entities_udf = udf(
                lambda x: get_impress_entity_by_ids(
                    impress_api_url, impress_subtype, x, proxies),
                ArrayType(StructType(sub_entity_schema)),
            )
            entity_df = entity_df.withColumn(
                impress_subtype, get_entities_udf(entity_df[column_name]))
            collection_types.append(
                dict(type=impress_subtype, schema=sub_entity_schema))
            entity_df = entity_df.withColumn(
                impress_subtype, explode_outer(entity_df[impress_subtype]))
    for collection_type in collection_types:
        logger.info("Calling to process:" + collection_type["type"])
        entity_df = process_collection(
            spark_session,
            impress_api_url,
            collection_type["schema"],
            collection_type["type"],
            entity_df,
            proxies,
        )
    return entity_df
def _get_entity_by_type(dcc_df: DataFrame, entity_type: str,
                        centre_columns: List[str]) -> DataFrame:
    """
    Takes a DCC DataFrame, extracts the rows for a given entity type and adds
    a '_type' column to the resulting DataFrame.
    :param DataFrame dcc_df: a DataFrame generated by extract_dcc_xml_files
    :param str entity_type: 'line', 'experiment', 'mouse', 'embryo'
    :param List[str] centre_columns: the Centre columns that the output DataFrame should maintain
    :return: A DataFrame containing only the specified entity type
    :rtype: DataFrame
    """
    centre_columns.append(dcc_df[entity_type])
    entity_df = dcc_df.where(
        dcc_df[entity_type].isNotNull()).select(centre_columns + [entity_type])
    entity_df = (entity_df.withColumn(
        "tmp", explode_outer(entity_df[entity_type])).select(
            ["tmp.*"] + centre_columns).withColumn(
                "_type", lit(entity_type)).drop(entity_type))
    return entity_df
def flatten(df_input, drop_column_list):
    """
    This function is a generic function which can flatten any complex nested
    json structure into a single flat dataframe. The function recursively
    traverses each element in the dataframe and, if the element is a nested
    (StructType/ArrayType) structure, explodes or flattens it accordingly.
    N levels of nesting can also be flattened.

    Args:
        df_input (DataFrame): the dataframe to be flattened
        drop_column_list (List): While flattening, any column that need not be
            flattened should have its name provided here; it will be dropped
            while flattening.

    Raises:
        excep_msg: Any exception that occurs during the flattening is re-raised.

    Returns:
        DataFrame: The resultant flattened DataFrame is returned.
    """
    try:
        complex_fields = {
            field.name: field.dataType
            for field in df_input.schema.fields
            if isinstance(field.dataType, (types.ArrayType, types.StructType))
        }
        while len(complex_fields) != 0:
            col_name = list(complex_fields.keys())[0]
            if col_name in drop_column_list:
                df_input = df_input.drop(col_name)
            elif isinstance(complex_fields[col_name], types.StructType):
                expanded = [func.col(col_name + '.' + k).alias(col_name + '_' + k)
                            for k in [n.name for n in complex_fields[col_name]]]
                df_input = df_input.select("*", *expanded).drop(col_name)
            elif isinstance(complex_fields[col_name], types.ArrayType):
                df_input = df_input.withColumn(col_name,
                                               func.explode_outer(col_name))
            complex_fields = {
                field.name: field.dataType
                for field in df_input.schema.fields
                if isinstance(field.dataType, (types.ArrayType, types.StructType))
            }
        return df_input
    except Exception as excep_msg:
        raise excep_msg
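# Brief usage sketch for flatten() above, with an invented schema and assuming
# the snippet's own imports (pyspark.sql.types as types, functions as func) are
# in scope; columns listed in drop_column_list are removed instead of flattened.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
events = spark.createDataFrame(
    [({"ts": "2021-01-01", "source": "api"}, [1, 2], ["a", "b"])],
    "meta struct<ts:string, source:string>, scores array<int>, raw_tags array<string>",
)
# meta becomes meta_ts / meta_source, scores explodes to one row per value,
# and raw_tags is dropped entirely rather than exploded.
flatten(events, drop_column_list=["raw_tags"]).show()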
def transfLineitems_old(ds):
    invoiceDS = ds
    # always an array, if format changes... no amount due in line items..
    # then check for an array type(_Description)
    # removing few special characters for TOS and TOSPrice
    invoiceDS = invoiceDS.withColumn("LineItemsDescription",
                                     concat_ws(",", col("_LineItems._Description"))) \
        .withColumn("TOS", split(regexp_replace(col("LineItemsDescription"), "[:;\\[\\]]", ""), ",")) \
        .withColumn("Price", col("_LineItems._TotalPriceNetto"))

    udf_serviceprice = udf(get_custom_service_price, StringType())
    invoiceDS = invoiceDS.withColumn("servicePrice",
                                     explode_outer(split(udf_serviceprice(col("TOS"), col("Price")), ","))) \
        .withColumn("servicePriceFinal", split(col("servicePrice"), "<>")) \
        .withColumn("TypeOfService", col("servicePriceFinal").getItem(0)) \
        .withColumn("ServiceAmount", col("servicePriceFinal").getItem(1).cast(DoubleType())) \
        .where(~(col('TypeOfService').like("Total Amount%")))

    return invoiceDS
def get_entity_by_type(dcc_df: DataFrame, entity_type: str,
                       centre_columns: List[str]) -> DataFrame:
    """
    Takes a DCC DataFrame, extracts the rows for a given entity type and adds
    a '_type' column to the resulting DataFrame.
    Takes in a DataFrame generated by
    `impc_etl.jobs.extract.dcc_extractor_helper.extract_dcc_xml_files`,
    an entity_type (can be 'line', 'experiment', 'mouse' or 'embryo'), and the
    list of centre_columns that the output DataFrame should maintain, and
    returns a DataFrame containing only the data for the specified entity type.
    """
    if entity_type not in dcc_df.columns:
        print("No entries for " + entity_type +
              " in the provided DCC XML files")
        raise NoDataFoundError
    centre_columns.append(dcc_df[entity_type])
    entity_df = dcc_df.where(
        dcc_df[entity_type].isNotNull()).select(centre_columns + [entity_type])
    entity_df = (entity_df.withColumn(
        "tmp", explode_outer(entity_df[entity_type])).select(
            ["tmp.*"] + centre_columns).withColumn(
                "_type", lit(entity_type)).drop(entity_type))
    return entity_df
def upsertPhones(mongoUserDF):
    phonesToBeUpsertedDF = mongoUserDF.select(
        sf.col('phone').alias('primaryPhone'),
        'createdAt',
        'updatedAt',
        'phones',
        'mongoid',
    )
    mongoIDs = [row.mongoid for row in phonesToBeUpsertedDF.collect()]
    mongoIDsString = ("' , '").join(mongoIDs)
    pgQuery = "(SELECT id, mongoid FROM customers WHERE mongoid IN ('" + mongoIDsString + "')) foo"
    pgCustomerDF = getPGDataframe(pgQuery)
    phonesAssociatedCustomerDF = phonesToBeUpsertedDF.join(pgCustomerDF,
                                                           on=['mongoid'],
                                                           how='left')
    # The id in the join is the customer's id, not the Postgres phone id
    phonesAssociatedCustomerDF = phonesAssociatedCustomerDF.select(
        sf.col('id').alias('customer'), 'updatedAt', 'createdAt',
        'primaryPhone', 'phones')
    phones = phonesAssociatedCustomerDF.select(
        'customer', 'createdAt', 'updatedAt', 'primaryPhone',
        sf.explode_outer('phones').alias('otherPhone'))
    phones = phones.withColumn(
        'otherPhone',
        sf.when(sf.col('otherPhone').isNull(),
                sf.col('primaryPhone')).otherwise(sf.col('otherPhone')))
    phones = phones.withColumn(
        'isPrimary',
        sf.when(sf.col('otherPhone') == sf.col('primaryPhone'),
                True).otherwise(False))
    phones = phones.select('customer', 'createdat', 'updatedat',
                           sf.col('otherPhone').alias('phone'), 'isPrimary')
    pgPhonesDF = getPGDataframe("customerphones")
    phones = phones.filter(
        ~(sf.concat(sf.col('customer'), sf.lit('_'), sf.col('phone'))).isin([
            str(row.customer) + '_' + str(row.phone)
            for row in pgPhonesDF.collect()
        ]))
    writeDataframetoPG(phones, "customerphones")
def transfLineitems(ds):
    invoiceDS: DataFrame = ds
    invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
        .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))

    if invoiceDS.schema["TypeOfService"].dataType == StringType():
        invoiceDS = invoiceDS.withColumn("TypeOfService", split(col("TypeOfService"), ",")) \
            .withColumn("ServiceAmount", split(col("ServiceAmount"), ","))

    # invoiceDS.printSchema()
    # import sys
    # sys.exit(3)

    invoiceDS = invoiceDS.withColumn("ServiceNPrice", arrays_zip(col("TypeOfService"), col("ServiceAmount"))) \
        .withColumn("ServiceNPrice", explode_outer(col("ServiceNPrice")))

    invoiceDS = invoiceDS.withColumn(
        "TypeOfService",
        when(col("ServiceNPrice.TypeOfService").isNotNull(),
             regexp_replace(col("ServiceNPrice.TypeOfService"), "[:;]", "")).otherwise("")) \
        .withColumn("ServiceAmount", col("ServiceNPrice.ServiceAmount")) \
        .filter(~(col('TypeOfService').like("Total Amount%")))

    return invoiceDS