def _flatten_encounter(self,
                        base_encounter_url: str,
                        force_location_type_columns: bool = True):
     """Returns a custom flat view of encoutners."""
     # When merging flattened encounters and observations, we need to be careful
     # with flattened columns for encounter type and location and only include
     # them if there is a constraint on them. Otherwise we may end up with a
     # single observation repeated multiple times in the view.
     flat_df = self._enc_df.select("subject", "id", "location", "type",
                                   "period").withColumn(
                                       "encounterId",
                                       F.regexp_replace(
                                           "id", base_encounter_url, ""))
     column_list = [
         F.col("encounterId"),
         F.col("subject.patientId").alias("encPatientId"),
         F.col("period.start").alias("first"),
         F.col("period.end").alias("last"),
     ]
     if self._enc_constraint.has_location() or force_location_type_columns:
         flat_df = flat_df.withColumn("locationFlat",
                                      F.explode_outer("location"))
         column_list += [
             F.col("locationFlat.location.LocationId").alias("locationId"),
             F.col("locationFlat.location.display").alias(
                 "locationDisplay"),
         ]
     if self._enc_constraint.has_type() or force_location_type_columns:
         flat_df = flat_df.withColumn("typeFlat", F.explode_outer("type"))
         column_list += [
             F.col("typeFlat.coding.system").alias("encTypeSystem"),
             F.col("typeFlat.coding.code").alias("encTypeCode"),
         ]
     return flat_df.select(column_list).where(
         self._construct_encounter_constraint(self._enc_constraint))
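The comment above is the key point: explode_outer emits one output row per array element, so a flattened encounter joined back to observations repeats each observation once per location or type unless those exploded columns are actually constrained. A minimal, self-contained sketch of that duplication effect (the toy schema below is illustrative, not the FHIR one used above):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
enc = spark.createDataFrame([("e1", ["ward-A", "ward-B"])],
                            ["encounterId", "location"])
obs = spark.createDataFrame([("e1", "obs-1")], ["encounterId", "obsId"])
# One encounter with two locations: exploding produces two rows for "e1".
flat_enc = enc.withColumn("locationFlat", F.explode_outer("location"))
# Without a filter on locationFlat, the single observation appears twice after the join.
obs.join(flat_enc, "encounterId").show()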
Example 2
    def test_explode(self):
        from pyspark.sql.functions import explode, explode_outer, posexplode_outer
        d = [
            Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"}),
            Row(a=1, intlist=[], mapfield={}),
            Row(a=1, intlist=None, mapfield=None),
        ]
        rdd = self.sc.parallelize(d)
        data = self.spark.createDataFrame(rdd)

        result = data.select(explode(data.intlist).alias("a")).select("a").collect()
        self.assertEqual(result[0][0], 1)
        self.assertEqual(result[1][0], 2)
        self.assertEqual(result[2][0], 3)

        result = data.select(explode(data.mapfield).alias("a", "b")).select("a", "b").collect()
        self.assertEqual(result[0][0], "a")
        self.assertEqual(result[0][1], "b")

        result = [tuple(x) for x in data.select(posexplode_outer("intlist")).collect()]
        self.assertEqual(result, [(0, 1), (1, 2), (2, 3), (None, None), (None, None)])

        result = [tuple(x) for x in data.select(posexplode_outer("mapfield")).collect()]
        self.assertEqual(result, [(0, 'a', 'b'), (None, None, None), (None, None, None)])

        result = [x[0] for x in data.select(explode_outer("intlist")).collect()]
        self.assertEqual(result, [1, 2, 3, None, None])

        result = [tuple(x) for x in data.select(explode_outer("mapfield")).collect()]
        self.assertEqual(result, [('a', 'b'), (None, None), (None, None)])
Example 3
 def _flatten_encounter(self,
                        base_encounter_url: str,
                        force_location_type_columns: bool = True):
     """Returns a custom flat view of encoutners."""
     # When merging flattened encounters and observations, we need to be careful
     # with flattened columns for encounter type and location and only include
     # them if there is a constraint on them. Otherwise we may end up with a
     # single observation repeated multiple times in the view.
     flat_df = self._enc_df.select('subject', 'id', 'location', 'type',
                                   'period').withColumn(
                                       'encounterId',
                                       F.regexp_replace(
                                           'id', base_encounter_url, ''))
     column_list = [
         F.col('encounterId'),
         F.col('subject.patientId').alias('encPatientId'),
         F.col('period.start').alias('first'),
         F.col('period.end').alias('last')
     ]
     if self._enc_constraint.has_location() or force_location_type_columns:
         flat_df = flat_df.withColumn('locationFlat',
                                      F.explode_outer('location'))
         column_list += [
             F.col('locationFlat.location.LocationId').alias('locationId'),
             F.col('locationFlat.location.display').alias('locationDisplay')
         ]
     if self._enc_constraint.has_type() or force_location_type_columns:
         flat_df = flat_df.withColumn('typeFlat', F.explode_outer('type'))
         column_list += [
             F.col('typeFlat.coding.system').alias('encTypeSystem'),
             F.col('typeFlat.coding.code').alias('encTypeCode')
         ]
     return flat_df.select(column_list).where(self._enc_constraint.sql())
Example 5
def process_study_table(study_index: str) -> DataFrame:
    'Loads and processes disease information from the study table.'

    return (
        spark.read.json(study_index).select(
            'study_id',
            'pmid',
            'pub_date',
            'pub_author',
            'trait_reported',
            'trait_efos',
            col('n_initial').alias('sample_size'),
        )
        # Assign project based on the study author information
        .withColumn(
            'projectId',
            when(col('study_id').contains('FINNGEN'), 'FINNGEN').when(
                col('study_id').contains('NEALE'),
                'NEALE').when(col('study_id').contains('SAIGE'), 'SAIGE').when(
                    col('study_id').contains('GCST'), 'GCST'),
        )
        # Warning! Not all studies have an EFO annotated (trait_efos is an empty array)
        # Also, some have multiple EFOs!
        # Studies with no EFO are kept, the array is exploded to capture each mapped trait
        .withColumn('efo', explode_outer(col('trait_efos'))).drop('trait_efos')
        # Drop records with HANCESTRO IDs as mapped trait
        .filter((~col('efo').contains('HANCESTRO')) | (col('efo').isNull())))
Example 6
def flatten(df, exc_flat=list()):
    complex_fields = dict([
        (field.name, field.dataType) for field in df.schema.fields
        if (type(field.dataType) == ArrayType or type(field.dataType) ==
            StructType) and (field.name not in exc_flat)
    ])

    while len(complex_fields) != 0:
        col_name = list(complex_fields.keys())[0]

        if (type(complex_fields[col_name]) == StructType):
            expanded = [
                col(col_name + '.' + k).alias(col_name + '_' + k)
                for k in [n.name for n in complex_fields[col_name]]
            ]
            df = df.select("*", *expanded).drop(col_name)

        elif (type(complex_fields[col_name]) == ArrayType):
            df = df.withColumn(col_name, explode_outer(col_name))

        # recompute remaining Complex Fields in Schema
        complex_fields = dict([
            (field.name, field.dataType) for field in df.schema.fields
            if (type(field.dataType) == ArrayType or type(field.dataType) ==
                StructType) and (field.name not in exc_flat)
        ])

    return df
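A quick usage sketch for the flatten helper above, assuming its imports (col, explode_outer, ArrayType, StructType) are already in scope; the toy schema is invented for illustration:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()
nested = spark.createDataFrame([
    Row(id=1, info=Row(city="Rome", zip="00100"), tags=["a", "b"]),
    Row(id=2, info=Row(city="Milan", zip="20100"), tags=[]),
])
# Struct fields become info_city / info_zip columns; the tags array is exploded with
# explode_outer, so id=1 yields two rows and id=2 keeps one row with tags = null.
flatten(nested).show()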
Example 7
    def transfLineitems(ds, config):
        invoiceDS: DataFrame = ds

        invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
            .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))

        if invoiceDS.schema["TypeOfService"].dataType == StringType():
            invoiceDS = invoiceDS.withColumn("TypeOfService", array(col("_LineItems._Description"))) \
                .withColumn("ServiceAmount", array(col("_LineItems._TotalPriceNetto")))

        invoiceDS = invoiceDS.withColumn("ServiceNRate", arrays_zip(col("TypeOfService"), col("ServiceAmount")))\
                    .withColumn("ServiceNRate", explode_outer(col("ServiceNRate")))

        # udf_service = udf(get_genesis_TOS, StringType()) # same code of line 56, remove costTypeList

        billPath = config["Master"]["billPath"]
        CostTypeList = createDatasetFromCSVFile("", billPath).filter(col("VENDOR_NAME").rlike("GENESIS MARINE, LLC"))\
            .rdd.map(lambda x: x.COST_LINE_ITEM_TYPE).collect()

        def udf_service(costType):
            return udf(lambda l: get_genesis_TOS_search(l, costType),
                       StringType())

        invoiceDS = invoiceDS.withColumn("TypeOfService", udf_service(CostTypeList)(regexp_replace(
                                                        col("ServiceNRate.TypeOfService"), "\n", " ")))\
            .withColumn("ServiceAmount", col("ServiceNRate.ServiceAmount")) \

        return invoiceDS
Example 8
def process_collection(spark_session, impress_api_url, current_schema,
                       current_type, entity_df):
    """

    :param spark_session:
    :param impress_api_url:
    :param current_schema:
    :param current_type:
    :param entity_df:
    :return:
    """
    impress_subtype = ''
    for column_name in current_schema.names:
        if 'Collection' in column_name:
            impress_subtype = column_name.replace('Collection', '')
            if current_type != '':
                column_name = current_type + '.' + column_name
            entity_id_column_name = impress_subtype + 'Id'
            entity_df = entity_df.withColumn(
                entity_id_column_name, explode_outer(entity_df[column_name]))
            sub_entity_schema = get_impress_entity_schema(
                spark_session, impress_api_url, impress_subtype)
            get_entity_udf = udf(
                lambda x: get_impress_entity_by_id(impress_api_url,
                                                   impress_subtype, x),
                StructType(sub_entity_schema))
            entity_column_name = impress_subtype
            entity_df = entity_df.withColumn(
                entity_column_name,
                get_entity_udf(entity_df[entity_id_column_name]))
            entity_df = process_collection(spark_session, impress_api_url,
                                           sub_entity_schema, impress_subtype,
                                           entity_df)
    return entity_df
Example 9
    def json_to_flat(self):  # method to flatten the dataframe
        df = self.df
        nested_columns = dict([(x.name, x.dataType) for x in df.schema.fields
                               if isinstance(x.dataType, T.StructType)
                               or isinstance(x.dataType, T.ArrayType)])

        if len(nested_columns) > 0:
            col_name = list(nested_columns.keys())[0]
            if isinstance(nested_columns[col_name], T.ArrayType):

                df = df.withColumn(col_name, explode_outer(col_name))

                return Ops(df).json_to_flat()

            elif isinstance(nested_columns[col_name], T.StructType):

                df = df.select(
                    "*", *[
                        col(col_name + "." + x.name).alias(col_name + "_" +
                                                           x.name)
                        for x in nested_columns[col_name]
                    ]).drop(col_name)
                return Ops(df).json_to_flat()
        else:
            return df
Example 10
def flatten_array(df):
    for column, column_type in df.dtypes:
        if column_type.startswith("array<"):
            df = df.select("*",
                           f.explode_outer(df[column]).alias(f"{column}_2"))
            df = df.drop(column)
    df = df.toDF(*(c.replace("_2", "") for c in df.columns))
    return df.select("*")
Example 11
def consume_transactions(input_df, checkpoint_path):
    # The Below Section is for Flight Transactions transformation and loading
    new_df = input_df.withColumn(
        'new_val', F.regexp_replace(input_df['value'], '\\\\',
                                    '')).drop('value')
    new_df = new_df.withColumn('value',
                               F.regexp_replace(new_df['new_val'], '""',
                                                "'")).drop('new_val')
    new_df = new_df.withColumn('new_val',
                               F.regexp_replace(new_df['value'], '}n',
                                                "}")).drop('value')
    new_df = new_df.withColumn('new_val',
                               F.regexp_replace(new_df['new_val'], "'", ""))
    #new_df = new_df.withColumn('struct_val',F.struct(new_df['new_val'])).drop('new_val')

    transaction_schema = StructType(
        (StructField("DestinationAirportCode",
                     StringType()), StructField("Itinerary", StringType()),
         StructField("OneWayOrReturn", StringType()),
         StructField("OriginAirportCode", StringType()),
         StructField(
             "Segment",
             ArrayType(
                 StructType([
                     StructField("ArrivalAirportCode", StringType()),
                     StructField("DepartureAirportCode", StringType()),
                     StructField("LegNumber", StringType()),
                     StructField("NumberOfPassengers", StringType()),
                     StructField("SegmentNumber", StringType())
                 ]))), StructField("TransactionDateUTC", StringType()),
         StructField("UniqueId", StringType())))

    json_df = new_df.select(
        F.from_json(F.col("new_val"),
                    transaction_schema).alias("value")).selectExpr('value.*')
    json_df = json_df.withColumn('Segment',
                                 F.explode_outer(json_df['Segment']))

    json_df = json_df.withColumn('DepartureAirportCode', F.col('Segment')['DepartureAirportCode']) \
        .withColumn('ArrivalAirportCode', F.col('Segment')['ArrivalAirportCode']) \
        .withColumn('LegNumber', F.col('Segment')['LegNumber']) \
        .withColumn('NumberOfPassengers', F.col('Segment')['NumberOfPassengers']) \
        .withColumn('SegmentNumber', F.col('Segment')['SegmentNumber']) \
        .drop('Segment')

    # Stream the data, from a Kafka topic to a Spark in-memory table
    query = json_df\
        .writeStream \
        .format("memory") \
        .queryName("TransactionTable") \
        .outputMode("append") \
        .option("checkpoint",checkpoint_path)\
        .start()

    query.awaitTermination(5)

    # Let it Fill up the table
    sleep(10)
Example 12
 def columnsToRowsDf(cls, df, columnsdict):
     columnset = set()
     for key, values in columnsdict.items():
         values = tuple(values)
         df = df.withColumn(key, explode_outer(array(*values)))
         columnset.update(values)
     df_columns = tuple(df.columns)
     dfcoltupl = filter(lambda x: x not in columnset, df_columns)
     return df.select(*dfcoltupl)
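The pattern behind this classmethod is explode_outer over array(...), i.e. stacking several wide columns into one long column; a self-contained sketch of the same idea with made-up column names:

from pyspark.sql import SparkSession
from pyspark.sql.functions import array, explode_outer

spark = SparkSession.builder.getOrCreate()
wide = spark.createDataFrame([(1, 10, 20)], ["id", "q1_score", "q2_score"])
# Stack q1_score and q2_score into a single "score" column, one row per value.
long = (wide.withColumn("score", explode_outer(array("q1_score", "q2_score")))
            .drop("q1_score", "q2_score"))
long.show()  # two rows for id=1: score=10 and score=20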
Example 13
def firetv_events(spark, experiment):
    te = spark.table('telemetry_mobile_event_parquet')
    return te.filter(
        te.app_name == 'FirefoxForFireTV'
    ).select(
      te.submission_date_s3,
      te.client_id,
      F.explode_outer(te.events).alias('event')
    )
Example 14
def process(df: DataFrame) -> pd.DataFrame:
    """Delete stop words and compute the most frequent words over the text of all patents"""
    counts = df.select(
        sf.explode_outer(f"{OUTPUT_COL_ENGLISH_TEXT}_stopwords").alias("word"))
    # TODO check num partitions
    result = counts.groupBy("word").count()
    result = result.sort(sf.col("count").desc()).limit(NUM_MOST_FREQUENT_WORDS)
    result_p = result.toPandas()
    return result_p
Example 15
    def _explode_all(df, df_array_fields):
        """

        :param df:
        :param df_array_fields:
        :return:
        """
        for coluna in df_array_fields.keys():
            df = df.withColumn(coluna, explode_outer(col(coluna)))
        return df
Example 16
def log_language_distribution(df: DataFrame, field_name: str):
    """Generates a log with the distribution of languages"""
    logger.info(f"Getting language distribution for: {field_name}")
    if isinstance(df.select(field_name).schema.fields[0].dataType, ArrayType):
        languages = df.select(
            sf.explode_outer(field_name).alias("target_field"))
    else:
        languages = df.select(sf.col(field_name).alias("target_field"))
    languages_p = languages.groupby("target_field._lang").count().toPandas()
    logger.info(
        f"Distribution of languages in {field_name}:\n{languages_p.to_string()}"
    )
Example 17
    def _explode_columns(df, df_array_fields, columns):
        """

        :param df:
        :param df_array_fields:
        :param columns:
        :return:
        """

        for column in columns:
            if column in df_array_fields.keys():
                df = df.withColumn(column, explode_outer(col(column)))
        return df
Example 18
    def _flatten_obs(obs: pyspark_sql.DataFrame,
                     code_system: str = None) -> pyspark_sql.DataFrame:
        """Creates a flat version of Observation FHIR resources.

        Note `code_system` is only applied on `code.coding` which is a required
        field, i.e., it is not applied on `value.codeableConcept.coding`.

        Args:
          obs: A collection of Observation FHIR resources.
          code_system: The code system to be used for filtering `code.coding`.
        Returns:
          A DataFrame with the following columns (note one input observation might
          be repeated, once for each of its codes):
          - `coding` from the input observation's `code.coding`
          - `valueCoding` from the input's `value.codeableConcept.coding`
          - `value` from the input's `value`
          - `patientId` from the input's `subject.patientId`
          - `dateTime` from the input's `effective.dateTime`
        """
        sys_str = ('coding.system="{}"'.format(code_system)
                   if code_system else "coding.system IS NULL")
        value_sys_str_base = ('valueCoding.system="{}"'.format(code_system)
                              if code_system else "valueCoding.system IS NULL")
        value_sys_str = "(valueCoding IS NULL OR {})".format(
            value_sys_str_base)
        merge_udf = F.UserDefinedFunction(_merge_date_and_value,
                                          T.StringType())
        return (obs.withColumn(
            "coding", F.explode("code.coding")).where(sys_str).withColumn(
                "valueCoding",  # Note valueCoding can be null.
                F.explode_outer("value.codeableConcept.coding"),
            ).where(value_sys_str).withColumn(
                "dateAndValue",
                merge_udf(F.col("effective.dateTime"),
                          F.col("value.quantity.value")),
            ).withColumn(
                "dateAndValueCode",
                merge_udf(F.col("effective.dateTime"),
                          F.col("valueCoding.code")),
            ).select(
                F.col("coding"),
                F.col("valueCoding"),
                F.col("value"),
                F.col("subject.patientId").alias("patientId"),
                F.col("effective.dateTime").alias("dateTime"),
                F.col("dateAndValue"),
                F.col("dateAndValueCode"),
                F.col("context.EncounterId").alias("encounterId"),
            ))
Example 19
    def do(self, workflow, etl_process):

        from pyspark.sql.functions import explode_outer, explode, col

        self.target = self.action_details.pop("target")
        self.keep_nulls = self.action_details.pop("keep_nulls", True)

        other_cols = [col(i) for i in workflow.df.columns if i != self.target]

        if self.keep_nulls:
            workflow.df = workflow.df \
                .select(*other_cols, explode_outer(col(self.target)).alias(self.target))
        else:
            workflow.df = workflow.df \
                .select(*other_cols, explode(col(self.target)).alias(self.target))
Example 20
def format_dataframe(df: DataFrame, verbose=True) -> DataFrame:
    if verbose:
        print(f'Formatting dataframe:')

    # Save start time for timing
    start_time = time.time()

    # Explode the columns: "devices" and "tracks"
    if verbose:
        print(f'\tExploding columns containing lists.')

    df = (df.withColumn('devices', f.explode_outer('devices')).withColumn(
        'tracks', f.explode_outer('tracks')))

    # Flatten the schema.
    if verbose:
        print(f'\tFlattening the dataframe schema.')

    df = t.spark.flatten_schema(df)

    if verbose:
        print(f'\tExecution time: {time.time() - start_time:.5f}')

    return df
Example 21
def transformed_df(books_info):
    transformed_emp = books_info.withColumn("languages", explode_outer("languages")) \
        .withColumn("subjects", explode_outer("subjects")) \
        .withColumn("other_titles", explode_outer("other_titles")) \
        .withColumn("publishers", explode_outer("publishers")) \
        .withColumn("authors", explode_outer("authors")) \
        .withColumn("genres", explode_outer("genres")) \
        .withColumn("publish_places", explode_outer("publish_places")) \
        .withColumn("download_url", explode_outer("download_url")) \
        .withColumn("batch_date", lit(date_format(current_timestamp(), "yyyy-MM-dd HH:mm:ss"))) \
        .select("Title", regexp_replace(expr("languages.key"), "/languages/", "").alias("languages"),
                "subjects", regexp_replace(expr("location"), "[\[\]\"]", "").alias("location"),
                "other_titles", "publishers", "publish_places", col("last_modified.value").alias("last_modified"),
                regexp_replace(expr("authors.key"), "/authors/", "").alias("authors")
                , col("created.value").alias("created"), "genres", "contributions", "number_of_pages",
                "publish_country", "publish_date", "download_url", "batch_date")

    return transformed_emp
Example 22
def prepare_version_multiplicity_data(sbom_dataframe: DataFrame) -> DataFrame:
    vm_with_score_df = sbom_dataframe.select(
        'firmware_hash',
        explode_outer('components').alias('component')).select(
            'firmware_hash',
            col('component.name').alias('name'),
            col('component.version').alias('version'),
        ).na.drop(subset=['version']).groupBy('firmware_hash', 'name').agg(
            countDistinct('version').alias('version_count')).select(
                'firmware_hash',
                when(col('version_count') > 1,
                     col('version_count') - 1).otherwise(0).alias(
                         'multiplicity')).groupBy('firmware_hash').agg(
                             _sum('multiplicity').cast('int').alias(
                                 'software_version_multiplicity'))

    return vm_with_score_df
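To make the aggregation concrete, a hedged usage sketch (it assumes the function above and its imports, i.e. explode_outer, col, countDistinct, when and sum aliased as _sum, are in scope; the SBOM row is invented):

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()
sbom = spark.createDataFrame([
    Row(firmware_hash="f1", components=[
        Row(name="libfoo", version="1.0"),
        Row(name="libfoo", version="1.1"),
        Row(name="libbar", version="2.0"),
    ]),
])
# libfoo has 2 distinct versions (excess 1), libbar has 1 (excess 0),
# so firmware f1 gets software_version_multiplicity = 1.
prepare_version_multiplicity_data(sbom).show()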
Example 23
    def _flatten_obs(obs: DataFrame, code_system: str = None) -> DataFrame:
        """Creates a flat version of Observation FHIR resources.

    Note `code_system` is only applied on `code.coding` which is a required
    field, i.e., it is not applied on `value.codeableConcept.coding`.

    Args:
      obs: A collection of Observation FHIR resources.
      code_system: The code system to be used for filtering `code.coding`.
    Returns:
      A DataFrame with the following columns (note one input observation might
      be repeated, once for each of its codes):
      - `coding` from the input observation's `code.coding`
      - `valueCoding` from the input's `value.codeableConcept.coding`
      - `value` from the input's `value`
      - `patientId` from the input's `subject.patientId`
      - `dateTime` from the input's `effective.dateTime`
    """
        sys_str = 'coding.system="{}"'.format(
            code_system) if code_system else 'coding.system IS NULL'
        value_sys_str_base = 'valueCoding.system="{}"'.format(
            code_system) if code_system else 'valueCoding.system IS NULL'
        value_sys_str = '(valueCoding IS NULL OR {})'.format(
            value_sys_str_base)
        merge_udf = F.UserDefinedFunction(
            lambda d, v: merge_date_and_value(d, v), T.StringType())
        return obs.withColumn(
            'coding', F.explode('code.coding')).where(sys_str).withColumn(
                'valueCoding',  # Note valueCoding can be null.
                F.explode_outer('value.codeableConcept.coding')
            ).where(value_sys_str).withColumn(
                'dateAndValue',
                merge_udf(
                    F.col('effective.dateTime'),
                    F.col('value.quantity.value'))).withColumn(
                        'dateAndValueCode',
                        merge_udf(F.col('effective.dateTime'),
                                  F.col('valueCoding.code'))
                    ).select(F.col('coding'), F.col('valueCoding'),
                             F.col('value'),
                             F.col('subject.patientId').alias('patientId'),
                             F.col('effective.dateTime').alias('dateTime'),
                             F.col('dateAndValue'), F.col('dateAndValueCode'),
                             F.col('context.EncounterId').alias('encounterId'))
Example 24
def get_drug_citation_df(df, col_name, df_join):
    """
    Returns a Dataframe with drug description
    for each publication or trial (each row)
    that mentioned the drug
    :param df:
    :param col_name:
    :param df_join:
    :return: a Dataframe for each citation
    """
    # explode dataframe by remaining tokens (drug name)
    df_article_drug = df.withColumn("drug_lower", explode_outer(
        col(col_name))).drop(col_name)

    # join on drugs dataframe
    df_drug_citation = df_join.join(df_article_drug,
                                    how='full',
                                    on='drug_lower')
    return df_drug_citation
Example 25
def process_collection(spark_session, impress_api_url, current_schema,
                       current_type, entity_df, proxies):
    """

    :param spark_session:
    :param impress_api_url:
    :param current_schema:
    :param current_type:
    :param entity_df:
    :return:
    """
    impress_subtype = ""
    collection_types = []
    for column_name in current_schema.names:
        if "Collection" in column_name:
            impress_subtype = column_name.replace("Collection", "")
            if current_type != "":
                column_name = current_type + "." + column_name
            sub_entity_schema = get_impress_entity_schema(
                spark_session, impress_api_url, impress_subtype, proxies)
            get_entities_udf = udf(
                lambda x: get_impress_entity_by_ids(
                    impress_api_url, impress_subtype, x, proxies),
                ArrayType(StructType(sub_entity_schema)),
            )
            entity_df = entity_df.withColumn(
                impress_subtype, get_entities_udf(entity_df[column_name]))
            collection_types.append(
                dict(type=impress_subtype, schema=sub_entity_schema))
            entity_df = entity_df.withColumn(
                impress_subtype, explode_outer(entity_df[impress_subtype]))

    for collection_type in collection_types:
        logger.info("Calling to process:" + collection_type["type"])
        entity_df = process_collection(
            spark_session,
            impress_api_url,
            collection_type["schema"],
            collection_type["type"],
            entity_df,
            proxies,
        )
    return entity_df
Example 26
def _get_entity_by_type(dcc_df: DataFrame, entity_type: str,
                        centre_columns: List[str]) -> DataFrame:
    """
    Takes a DCC DataFrame and extracts the given entity type,
    adding a '_type' column to the DataFrame
    :param DataFrame dcc_df: a DataFrame generated by extract_dcc_xml_files
    :param str entity_type: 'line', 'experiment', 'mouse', 'embryo'
    :param List[str] centre_columns: the Centre columns that the output DataFrame should maintain
    :return: A DataFrame containing only the specified entity type
    :rtype: DataFrame
    """
    centre_columns.append(dcc_df[entity_type])
    entity_df = dcc_df.where(
        dcc_df[entity_type].isNotNull()).select(centre_columns + [entity_type])
    entity_df = (entity_df.withColumn(
        "tmp", explode_outer(entity_df[entity_type])).select(
            ["tmp.*"] + centre_columns).withColumn(
                "_type", lit(entity_type)).drop(entity_type))
    return entity_df
Example 27
File: dfs.py Project: 5-k/test
def flatten(df_input, drop_column_list):
    """
    This function is a generic helper that can flatten any complex nested JSON
    structure into a single flat dataframe.
    The function recursively traverses each element in the dataframe and if the element is a
    nested (StructType/Arraytype) structure, explodes or flattens it accordingly.
    N levels of nestings can also be flattened.
    Args:
        df_input (DataFrame): the dataframe to be flattened
        drop_column_list (List): While flattening if any column that need not be flattened in
                                 the dataframe, the column name should be provided and will be
                                 dropped while flattening.
    Raises:
        excep_msg: Any exception that occurs during flattening is re-raised.

    Returns:
        DataFrame: The resultant flattened DataFrame is returned.
    """
    try:
        complex_fields = {
            field.name: field.dataType
            for field in df_input.schema.fields
            if isinstance(field.dataType, (types.ArrayType, types.StructType))
        }
        while len(complex_fields) != 0:
            col_name = list(complex_fields.keys())[0]
            if col_name in drop_column_list:
                df_input = df_input.drop(col_name)
            elif isinstance(complex_fields[col_name], types.StructType):
                expanded = [func.col(col_name + '.' + k).alias(col_name + '_' + k)
                            for k in [n.name for n in complex_fields[col_name]]]
                df_input = df_input.select("*", *expanded).drop(col_name)
            elif isinstance(complex_fields[col_name], types.ArrayType):
                df_input = df_input.withColumn(col_name, func.explode_outer(col_name))
            complex_fields = {
                field.name: field.dataType
                for field in df_input.schema.fields
                if isinstance(field.dataType, (types.ArrayType, types.StructType))
            }
        return df_input
    except Exception as excep_msg:
        raise excep_msg
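Usage is the same as the simpler flatten helpers earlier on this page, except that any column named in drop_column_list is dropped rather than expanded; for instance (the DataFrame and column name here are invented):

clean_df = flatten(raw_events_df, drop_column_list=["debug_payload"])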
Example 28
    def transfLineitems_old(ds):
        invoiceDS = ds

        # always an array, if format changes... no amount due in line items.. then check for an array type(_Description)

        # removing few special characters for TOS and TOSPrice
        invoiceDS = invoiceDS.withColumn("LineItemsDescription", concat_ws(",", col("_LineItems._Description"))) \
            .withColumn("TOS", split(regexp_replace(col("LineItemsDescription"), "[:;\\[\\]]", ""), ",")) \
            .withColumn("Price", col("_LineItems._TotalPriceNetto"))

        udf_serviceprice = udf(get_custom_service_price, StringType())

        invoiceDS = invoiceDS.withColumn("servicePrice",
                                         explode_outer(split(udf_serviceprice(col("TOS"), col("Price")), ","))) \
            .withColumn("servicePriceFinal", split(col("servicePrice"), "<>")) \
            .withColumn("TypeOfService", col("servicePriceFinal").getItem(0)) \
            .withColumn("ServiceAmount", col("servicePriceFinal").getItem(1).cast(DoubleType())) \
            .where(~(col('TypeOfService').like("Total Amount%")))

        return invoiceDS
Example 29
def get_entity_by_type(dcc_df: DataFrame, entity_type: str,
                       centre_columns: List[str]) -> DataFrame:
    """
    Takes a DCC DataFrame and extracts the given entity type,
    adding a '_type' column to the DataFrame. Takes in a DataFrame generated by
    `impc_etl.jobs.extract.dcc_extractor_helper.extract_dcc_xml_files`, an entity_type
    (can be 'line', 'experiment', 'mouse' or 'embryo'), the list of centre_columns
    that the output DataFrame should maintain and returns A DataFrame containing only the data for the
    specified entity type.
    """
    if entity_type not in dcc_df.columns:
        print("No entries for " + entity_type +
              " in the provided DCC XML files")
        raise NoDataFoundError
    centre_columns.append(dcc_df[entity_type])
    entity_df = dcc_df.where(
        dcc_df[entity_type].isNotNull()).select(centre_columns + [entity_type])
    entity_df = (entity_df.withColumn(
        "tmp", explode_outer(entity_df[entity_type])).select(
            ["tmp.*"] + centre_columns).withColumn(
                "_type", lit(entity_type)).drop(entity_type))
    return entity_df
Example 30
def upsertPhones(mongoUserDF):
    phonesToBeUpsertedDF = mongoUserDF.select(
        sf.col('phone').alias('primaryPhone'),
        'createdAt',
        'updatedAt',
        'phones',
        'mongoid',
    )
    mongoIDs = [row.mongoid for row in phonesToBeUpsertedDF.collect()]
    mongoIDsString = ("' , '").join(mongoIDs)
    pgQuery = "(SELECT id, mongoid FROM customers WHERE mongoid IN ('" + mongoIDsString + "')) foo"
    pgCustomerDF = getPGDataframe(pgQuery)
    phonesAssociatedCustomerDF = phonesToBeUpsertedDF.join(pgCustomerDF,
                                                           on=['mongoid'],
                                                           how='left')
    # The id in the join is customer's id not postGres Phone id
    phonesAssociatedCustomerDF = phonesAssociatedCustomerDF.select(
        sf.col('id').alias('customer'), 'updatedAt', 'createdAt',
        'primaryPhone', 'phones')
    phones = phonesAssociatedCustomerDF.select(
        'customer', 'createdAt', 'updatedAt', 'primaryPhone',
        sf.explode_outer('phones').alias('otherPhone'))
    phones = phones.withColumn(
        'otherPhone',
        sf.when(sf.col('otherPhone').isNull(),
                sf.col('primaryPhone')).otherwise(sf.col('otherPhone')))
    phones = phones.withColumn(
        'isPrimary',
        sf.when(sf.col('otherPhone') == sf.col('primaryPhone'),
                True).otherwise(False))
    phones = phones.select('customer', 'createdat', 'updatedat',
                           sf.col('otherPhone').alias('phone'), 'isPrimary')
    pgPhonesDF = getPGDataframe("customerphones")
    phones = phones.filter(
        ~(sf.concat(sf.col('customer'), sf.lit('_'), sf.col('phone'))).isin([
            str(row.customer) + '_' + str(row.phone)
            for row in pgPhonesDF.collect()
        ]))
    writeDataframetoPG(phones, "customerphones")
Example 31
    def transfLineitems(ds):
        invoiceDS: DataFrame = ds

        invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
            .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))

        if invoiceDS.schema["TypeOfService"].dataType == StringType():
            invoiceDS = invoiceDS.withColumn("TypeOfService", split(col("TypeOfService"), ",")) \
                .withColumn("ServiceAmount", split(col("ServiceAmount"), ","))

        # invoiceDS.printSchema()
        # import sys
        # sys.exit(3)

        invoiceDS = invoiceDS.withColumn("ServiceNPrice", arrays_zip(col("TypeOfService"), col("ServiceAmount"))) \
            .withColumn("ServiceNPrice", explode_outer(col("ServiceNPrice")))

        invoiceDS = invoiceDS.withColumn("TypeOfService", when(col("ServiceNPrice.TypeOfService").isNotNull(),
                                         regexp_replace(col("ServiceNPrice.TypeOfService"), "[:;]", "")).otherwise(""))\
            .withColumn("ServiceAmount", col("ServiceNPrice.ServiceAmount")) \
            .filter(~(col('TypeOfService').like("Total Amount%")))

        return invoiceDS
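The arrays_zip plus explode_outer combination used in the last two snippets is a general trick for pairing parallel arrays element by element before exploding; a minimal, self-contained sketch with toy data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import arrays_zip, col, explode_outer

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(["Towage", "Pilotage"], ["1200.0", "300.0"])],
                           ["TypeOfService", "ServiceAmount"])
# Zip the two arrays into an array of structs, then explode to one row per pair.
paired = (df.withColumn("ServiceNPrice",
                        arrays_zip(col("TypeOfService"), col("ServiceAmount")))
            .withColumn("ServiceNPrice", explode_outer("ServiceNPrice"))
            .select(col("ServiceNPrice.TypeOfService").alias("TypeOfService"),
                    col("ServiceNPrice.ServiceAmount").alias("ServiceAmount")))
# With explode_outer a null or empty array would still yield a single row of nulls.
paired.show()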