def generateCanonicalName(self, dataFrame, filename):
        logger.info('in generateCanonicalName')
        mapDict = self.createDictfromMapfile(filename)

        def gen(text):
            Manufacturer = mapDict.get(text, -1)
            if Manufacturer != -1:
                return Manufacturer
            else:
                return 'UNKNOWN'

        udf = Functions.UserDefinedFunction(lambda x: gen(x))

        localDf = dataFrame.groupBy(
            Functions.col("Cleaned Manufacturer").alias(
                'Cleaned Manufacturer2')).agg(Functions.lit('1'))

        localDf = localDf.withColumn(
            "Canonical_Manufacturer",
            udf(Functions.col("Cleaned Manufacturer2"))).drop('1')
        dataFrame = dataFrame.join(localDf,
                                   (dataFrame['Cleaned Manufacturer']
                                    == localDf['Cleaned Manufacturer2'])).drop(
                                        localDf['Cleaned Manufacturer2'])

        return dataFrame
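

# A minimal sketch of the createDictfromMapfile helper referenced above,
# assuming the map file is a two-column CSV of
# (cleaned manufacturer, canonical manufacturer); the actual file format is
# not shown in this listing.
import csv

def createDictfromMapfile(self, filename):
    mapDict = {}
    with open(filename, newline='') as f:
        for row in csv.reader(f):
            # first column: cleaned name, second column: canonical name
            mapDict[row[0].strip()] = row[1].strip()
    return mapDict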
def calc_TX_PVLS(patient_agg_obs: DataFrame,
                 VL_code: str,
                 end_date_str: str = None) -> pandas.DataFrame:
    """Calculates TX_PVLS indicator with its corresponding disaggregations.

  Args:
    patient_agg_obs: A DataFrame generated by `join_patients_agg_obs()`.
    VL_code: The code for viral load values.
    end_date: The string representation of the last date as 'YYYY-MM-DD'.
  Returns:
  """
    end_date = datetime.today()
    if end_date_str:
        end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    agg_buckets_udf = F.UserDefinedFunction(
        lambda a, g: agg_buckets(a, g, end_date), T.ArrayType(T.StringType()))
    VL_df = patient_agg_obs.withColumn(
        'sup_VL', patient_agg_obs[VL_code + '_max_value'] < 150).withColumn(
            'agg_buckets',
            agg_buckets_udf(patient_agg_obs['birthDate'],
                            patient_agg_obs['gender']))
    num_patients = VL_df.count()
    VL_agg_P = VL_df.select(
        VL_df.sup_VL,
        F.explode(VL_df.agg_buckets).alias('agg_bucket')).groupBy(
            'sup_VL', 'agg_bucket').agg(
                F.count('*').alias('count')).toPandas().sort_values(
                    ['agg_bucket', 'sup_VL'])
    VL_agg_P['ratio'] = VL_agg_P['count'] / num_patients
    return VL_agg_P
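
# A rough sketch of the agg_buckets helper assumed above: it derives an age
# from birthDate relative to end_date and returns the list of disaggregation
# bucket labels the patient falls into. The bucket names below are
# illustrative assumptions, not taken from this listing.
from datetime import datetime

def agg_buckets(birth_date: str, gender: str, end_date: datetime) -> list:
    buckets = ['ALL']
    if birth_date:
        age = (end_date - datetime.strptime(birth_date[:10], '%Y-%m-%d')).days // 365
        buckets.append('age_0-24' if age < 25 else 'age_25+')
    if gender:
        buckets.append('gender_' + gender)
    return buckets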
def process_income_csv(income_csv_arg):
    income_df = spark.read.csv(income_csv_arg)
    income_df = income_df.filter((income_df['_c2'] != 'GEO.display-label') & (income_df['_c2'] != 'Geography')) \
        .select(income_df['_c2'].alias('county_state'), income_df['_c7'].alias('income'))

    county_state_column = pyspark.sql.functions.split(
        income_df['county_state'], ',')
    county_name = pyspark.sql.functions.split(county_state_column.getItem(0),
                                              ' ').getItem(0)
    state_name = functions.trim(county_state_column.getItem(1))

    income_df = income_df.withColumn('county', pyspark.sql.functions.lower(county_name)) \
        .withColumn('state', functions.lower(state_name)) \
        .withColumn('income', income_df['income'].cast('int')) \
        .drop('county_state')

    state_abbr_udf = functions.UserDefinedFunction(
        lambda state: states_abbr_mapping.get(state), types.StringType())
    income_df = income_df.withColumn("state_abbr",
                                     state_abbr_udf(
                                         income_df["state"])).drop('state')
    income_df = income_df.withColumn('combine', functions.concat(income_df['county'], income_df['state_abbr'])) \
        .filter((income_df['state_abbr'].isNotNull()) & (income_df['state_abbr'].isin(required_states_abbr)))

    return income_df
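
# The snippet above relies on two module-level lookups that are not shown; a
# plausible shape (the entries here are illustrative only):
states_abbr_mapping = {'washington': 'WA', 'oregon': 'OR', 'california': 'CA'}
required_states_abbr = ['WA', 'OR']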
    def generateRxInfo(self, dataFrame):

        logger.info('in generateRxInfo')
        base_uri = 'http://{}:{}/REST/rxcui/'.format(
            Constants.POSTGRESQL_HOST_IP, Constants.RXNAV_PORT)

        def genRxInfo(RxNormId):
            d = {'BN': [], 'IN': [], 'tag': set()}
            if RxNormId != '':
                try:
                    resp = requests.get(url=base_uri + RxNormId +
                                        "/allrelatedextension")
                    root = ET.fromstring(resp.content)
                    for conceptGroup in root.findall(
                            './allRelatedGroup/conceptGroup'):
                        for tty in conceptGroup.findall('./tty'):
                            if tty.text == 'BN':
                                for brandname in conceptGroup.findall(
                                        './conceptProperties/name'):
                                    d['BN'].append(brandname.text)
                            if tty.text == 'IN':
                                for Ingredientname in conceptGroup.findall(
                                        './conceptProperties/name'):
                                    d['IN'].append(Ingredientname.text)
                                for humandrug in conceptGroup.findall(
                                        './conceptProperties/inferedhuman'):
                                    if humandrug.text == 'US':
                                        d['tag'].add('Human_Drug')
                                for animaldrug in conceptGroup.findall(
                                        './conceptProperties/inferedvet'):
                                    if animaldrug.text == 'US':
                                        d['tag'].add('Animal_Drug')
                except ET.ParseError as e:
                    print("Invalid XML received from uri {}".format(e))
            return str(d['BN']) + '|' + str(d['IN']) + '|' + ','.join(d['tag'])

        udf = Functions.UserDefinedFunction(genRxInfo, StringType())

        localDf = dataFrame.groupBy(
            Functions.col("RxNormId").alias("RxNormId2")).agg(
                Functions.lit('1'))

        localDf = localDf.withColumn("RxInfo",
                                     udf(Functions.col("RxNormId2"))).drop('1')

        localDf = dataFrame.join(
            localDf, (dataFrame['RxNormId'] == localDf['RxNormId2'])).drop(
                localDf['RxNormId2'])

        split_col = Functions.split(localDf['RxInfo'], r'\|')
        localDf = localDf.withColumn("Brand_Names", split_col.getItem(0)) \
            .withColumn("Ingredients", split_col.getItem(1)) \
            .withColumn(
                "tag",
                Functions.when(localDf.tag == '', split_col.getItem(2))
                .otherwise(
                    Functions.when(Functions.length(split_col.getItem(2)) == 0, localDf.tag)
                    .otherwise(Functions.concat(localDf.tag, Functions.lit(','), split_col.getItem(2))))
            ).drop('RxInfo')

        #self.container1.stop()
        #self.container2.stop()
        return localDf
def main():
    df = spark.read.json(katkam_in_directory)
    schema_file = open('schema')
    schema_lines = [i.strip() for i in schema_file.readlines()]
    schema = types.StructType([types.StructField(i, types.StringType(), False) for i in schema_lines])
    schema_file.close()
    weather = spark.read.csv(weather_in_directory, schema=schema)#.withColumn('filename', functions.input_file_name())

    df = df.join(weather, 'Date/Time')

    # https://stackoverflow.com/questions/39025707/how-to-convert-arraytype-to-densevector-in-pyspark-dataframe
    to_vec = functions.UserDefinedFunction(lambda vs: Vectors.dense(vs), VectorUDT())
    get_rid_of_rain = functions.UserDefinedFunction(lambda vs: rain_gone(vs), types.LongType())

    df = df.select(get_rid_of_rain(df['Weather']).alias('label'), to_vec(df['image']).alias('features'))

    # Do machine learning
    splits = df.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # Naive Bayes Model
    #nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # Logistic Regression Model
    lr = LogisticRegression()

    model = lr.fit(train)
    predictions = model.transform(test)

    # Compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Write the final predictions dataframe to a JSON directory
    predictions.write.json(out_directory, mode='overwrite')

    # Write the final accuracy score to a text file; tide analysis will write to the same file
    with open(out_directory + '/final-results.txt', 'w+') as fp:
        fp.write('Test set accuracy for weather analysis: ' + str(accuracy))
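
# rain_gone is referenced above but not shown; a minimal sketch, assuming it
# maps the raw Weather string to a binary label (1 when rain is mentioned,
# 0 otherwise) to match the LongType UDF it is wrapped in.
def rain_gone(weather):
    return 1 if weather and 'rain' in weather.lower() else 0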
def convert_bigint_to_int(df):
    for col, t in df.dtypes:
        if t == 'bigint':
            df = df.withColumn(col, df[col].cast(IntegerType()))

    f_udf = F.UserDefinedFunction(lambda x: 1 if x is not None else 0,
                                  IntegerType())
    df = df.withColumn("blog", f_udf(df.blog))
    df = df.withColumn("company", f_udf(df.company))
    df = df.withColumn("hireable", f_udf(df.hireable))

    return df
def main(business_id_arg):
    concat_list = udf(lambda lst: ", ".join(lst), types.StringType())

    reviews_df = spark.read.format("org.apache.spark.sql.cassandra") \
        .options(table=TABLE_REVIEW, keyspace=KEY_SPACE) \
        .load()

    review_filter = reviews_df.filter(reviews_df.business_id == business_id_arg)
    review_concatenate = review_filter.groupby('business_id').agg(collect_list('review').alias("review"))
    review_concatenate.show()
    train_fin = review_concatenate.withColumn("review", concat_list("review"))
    train_fin = train_fin.withColumn("review", functions.regexp_replace(train_fin.review, "[^0-9A-Za-z ,]", ""))

    # Create a new pipeline to create Tokenizer and Lemmatizer
    documentAssembler = DocumentAssembler().setInputCol("review").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    lemmatizer = Lemmatizer().setInputCols(["token"]).setOutputCol("lemma") \
        .setDictionary("lemmas001.txt", key_delimiter=" ", value_delimiter="\t")

    pipeline = Pipeline(stages=[documentAssembler, tokenizer, lemmatizer])
    pipelineFit = pipeline.fit(train_fin)

    train_df = pipelineFit.transform(train_fin)
    train_df.select('lemma').show(truncate=False)
    price_range_udf = functions.UserDefinedFunction(lambda attributes: get_attributes(attributes), types.StringType())
    train_df = train_df.withColumn('lemma', price_range_udf(train_df['lemma']))
    train_df = train_df.withColumn('lemma', functions.split(train_df['lemma'], ",").cast('array<string>'))

    # Create a new pipeline to remove the stop words
    test_review = train_df.select("lemma")
    stop_words_remover = StopWordsRemover(inputCol="lemma", outputCol="filtered")
    hash_tf = HashingTF(numFeatures=2 ** 16, inputCol="lemma", outputCol='tf')
    pipeline_too_remove_stop_words = Pipeline(stages=[hash_tf, stop_words_remover])
    pipeline_fit = pipeline_too_remove_stop_words.fit(train_df)
    test_df = pipeline_fit.transform(test_review)
    test_df.show()

    token_array = test_df.select('filtered').rdd.flatMap(lambda row: row).collect()

    counts = Counter(token_array[0])
    word_cloud = WordCloud(
        background_color='white',
        max_words=100,
        max_font_size=50,
        min_font_size=10,
        random_state=40
    ).fit_words(counts)

    plt.imshow(word_cloud)
    plt.axis('off')  # remove axis
    plt.show()
def validate_tffm(spark, sc, model, test_df, s3_metrics_path, s3_endpoint_path):
    # get predictions
    validation_df = model.transform(test_df)
    
    metricsSchema = StructType() \
        .add("metric", StringType()) \
        .add("value", DoubleType())
    metrics_names = []

    # apply threshold
    def thresholdScore(x):
        retval = 0.0
        if x > 0.5:
            retval = 1.0
        return retval
    
    thresholdScoreUdf = F.UserDefinedFunction(thresholdScore, T.FloatType())
    
    validation_df_round = validation_df.withColumn('rscore', thresholdScoreUdf(validation_df.score)) 
    predTffm = validation_df_round.select(['label','rscore'])

    predictionAndLabelsTffm = predTffm.rdd.map(lambda lp: (lp.rscore, lp.label))
    metricsTffm = BinaryClassificationMetrics(predictionAndLabelsTffm)

    metrics_names.append(("Area_under_PR",metricsTffm.areaUnderPR))
    metrics_names.append(("Area_under_ROC",metricsTffm.areaUnderROC))

    mmetricsTffm = MulticlassMetrics(predictionAndLabelsTffm)
    metrics_names.append(("Precision",mmetricsTffm.precision()))
    metrics_names.append(("Recall",mmetricsTffm.recall()))
    metrics_names.append(("F1",mmetricsTffm.fMeasure()))
    metrics_names.append(("Weighted_recall",mmetricsTffm.weightedRecall))
    metrics_names.append(("Weighted_precision",mmetricsTffm.weightedPrecision))
    metrics_names.append(("Weighted_F1",mmetricsTffm.weightedFMeasure()))
    metrics_names.append(("Weighted_F05",mmetricsTffm.weightedFMeasure(beta=0.5)))
    metrics_names.append(("Weighted_FP_rate",mmetricsTffm.weightedFalsePositiveRate))

    mRdd = sc.parallelize(metrics_names).coalesce(1)
    dfMetrics = spark.createDataFrame(mRdd, metricsSchema)
    dfMetrics.write.csv("{0}/{1}".format(s3_metrics_path, model.endpointName), mode="overwrite")

    endpointSchema = StructType() \
        .add("time", StringType()) \
        .add("endpoint", StringType())
    endpoint_name = []
    endpoint_name.append((str(time.time()),str(model.endpointName)))
    eRdd = sc.parallelize(endpoint_name).coalesce(1)
    dfEndpoint = spark.createDataFrame(eRdd, endpointSchema)
    dfEndpoint.write.csv("{0}/endpoint.txt".format(s3_endpoint_path), mode="overwrite")
    def _flatten_obs(obs: pyspark_sql.DataFrame,
                     code_system: str = None) -> pyspark_sql.DataFrame:
        """Creates a flat version of Observation FHIR resources.

        Note `code_system` is only applied on `code.coding` which is a required
        field, i.e., it is not applied on `value.codeableConcept.coding`.

        Args:
          obs: A collection of Observation FHIR resources.
          code_system: The code system to be used for filtering `code.coding`.
        Returns:
          A DataFrame with the following columns (note one input observation might
          be repeated, once for each of its codes):
          - `coding` from the input observation's `code.coding`
          - `valueCoding` from the input's `value.codeableConcept.coding`
          - `value` from the input's `value`
          - `patientId` from the input's `subject.patientId`
          - `dateTime` from the input's `effective.dateTime`
        """
        sys_str = ('coding.system="{}"'.format(code_system)
                   if code_system else "coding.system IS NULL")
        value_sys_str_base = ('valueCoding.system="{}"'.format(code_system)
                              if code_system else "valueCoding.system IS NULL")
        value_sys_str = "(valueCoding IS NULL OR {})".format(
            value_sys_str_base)
        merge_udf = F.UserDefinedFunction(_merge_date_and_value,
                                          T.StringType())
        return (obs.withColumn(
            "coding", F.explode("code.coding")).where(sys_str).withColumn(
                "valueCoding",  # Note valueCoding can be null.
                F.explode_outer("value.codeableConcept.coding"),
            ).where(value_sys_str).withColumn(
                "dateAndValue",
                merge_udf(F.col("effective.dateTime"),
                          F.col("value.quantity.value")),
            ).withColumn(
                "dateAndValueCode",
                merge_udf(F.col("effective.dateTime"),
                          F.col("valueCoding.code")),
            ).select(
                F.col("coding"),
                F.col("valueCoding"),
                F.col("value"),
                F.col("subject.patientId").alias("patientId"),
                F.col("effective.dateTime").alias("dateTime"),
                F.col("dateAndValue"),
                F.col("dateAndValueCode"),
                F.col("context.EncounterId").alias("encounterId"),
            ))
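
# _merge_date_and_value is referenced above but not defined in this listing; a
# minimal sketch, assuming it packs the observation date and value into one
# delimited string (the separator and None handling are assumptions):
def _merge_date_and_value(date, value):
    if value is None:
        return None
    return '{},{}'.format(date, value)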
def main():
    # do things...
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', '199.60.17.210:9092,199.60.17.193:9092') \
        .option('subscribe', topic).load()
    line=messages.selectExpr('CAST (value as string)')
    my_udf_x=F.UserDefinedFunction(get_x, returnType=types.FloatType())
    my_udf_y=F.UserDefinedFunction(get_y, returnType=types.FloatType())
    line=line.withColumn('X',my_udf_x('value'))
    line=line.withColumn('Y',my_udf_y('value'))
    line=line.drop('value')
    line.createOrReplaceTempView('line')
    df=spark.sql('''select (sum(X*Y)-((1/count(*))*sum(X)*sum(Y)))
    /(sum(power(X,2))-((1/count(*))*power(sum(X),2)))
    as beta,(sum(Y)/count(*)) as lhs,((sum(X))/count(*)) as rhs from line
    ''')
    df.createOrReplaceTempView('final')
    df_final=spark.sql('''select beta,(lhs-(beta*rhs)) as alpha from final''')
    stream= df_final \
    .writeStream \
    .outputMode("complete")\
    .format("console") \
    .start()
    stream.awaitTermination(600)
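
# get_x and get_y are not shown in this listing; a minimal sketch, assuming
# each Kafka message value is a space-separated "x y" pair of numbers:
def get_x(value):
    return float(value.split()[0])


def get_y(value):
    return float(value.split()[1])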
    def generateDrugNames(self,dataFrame,filename):
        logger.info('in generateDrugNames')
        keywordsDict = self.createDictFromKeywords(filename)  
        split_col = Functions.split(dataFrame['Cleaned Item Description'],'(?<=^[^\\s]*)\\s')
        dataFrame = dataFrame.withColumn("Drug_Name", Functions.regexp_replace(split_col.getItem(0),'[\\,,\\.]',''))
        dataFrame = dataFrame.withColumn("Drug Strength", split_col.getItem(1))

        def genDosageDict(text):
            tokens = text.split()
            dosage_fullname = ''
            dosage = ''
            tag = ''
            keywords = []

            for token in tokens:
                token_fullname = keywordsDict.get(token,-1)
                if token_fullname != -1:
                    keywords.append(token_fullname)
                    if len(keywords) == 1:
                        dosage_fullname = token_fullname
                        dosage = token
                        tag = 'Drug'

            keywords = set(keywords)
            if len(keywords) > 0:
                return '{}!{}!{}!{}'.format(dosage,dosage_fullname,tag,keywords)
            else:
                return '{}!{}!{}!{}'.format(dosage,dosage_fullname,tag,'')


        udf = Functions.UserDefinedFunction(lambda x: genDosageDict(x))   
        localDf = dataFrame.groupBy(Functions.col("Cleaned Item Description").alias('Cleaned Item Description2')  ).agg(Functions.lit('1'))

        localDf = localDf.withColumn("dosage-fields", udf(Functions.col("Cleaned Item Description2"))).drop('1')

        split_col = Functions.split(localDf['dosage-fields'],'!')
        localDf = localDf.withColumn("dosage", split_col.getItem(0))\
                    .withColumn("dosage_fullname", split_col.getItem(1))  \
                    .withColumn("tag", split_col.getItem(2))\
                    .withColumn("keywords", split_col.getItem(3)).drop('dosage-fields')

        dataFrame = dataFrame.join(localDf,(dataFrame['Cleaned Item Description'] == localDf['Cleaned Item Description2'])).drop(localDf['Cleaned Item Description2'])

        dataFrame = dataFrame.select("Item_Description","Item Number ", "Manufacturer","Canonical_Manufacturer"\
                    ,"Drug_Name","tag","Key",Functions.col("IsDistinctKey").alias('Excluded'))
        

        return dataFrame
    def generateSpendInfoBasedOnManufacturer(self, sparkContext, sparkSession,
                                             DataFrame,
                                             shouldConsiderOnlyTopScore,
                                             matchingThreshold):
        def getPriceINfo(Canonical_Manufacturer, Item_Description,
                         shouldConsiderOnlyTopScore, matchingThreshold):
            db = Database()
            connection = db.getDbConn()

            cursor = connection.cursor()

            postgreSQL_select_Query = "select proddesc as prodDesc from uniqueProducts where lower(mfrcanonicalname)=\'{}\'".format(
                Canonical_Manufacturer)

            cursor.execute(postgreSQL_select_Query)

            records = cursor.fetchall()

            proddesc_list = [t[0] for t in records]

            result = self.common_process(proddesc_list, Item_Description,
                                         shouldConsiderOnlyTopScore,
                                         matchingThreshold, cursor, 'prodDesc')

            cursor.close()
            connection.close()
            return result

        localDf = DataFrame.groupBy(
            Functions.col("Canonical_Manufacturer").alias(
                'Canonical_Manufacturer2'),
            Functions.col("Item_Description").alias('Item_Description2')).agg(
                Functions.lit('1'))

        udf = Functions.UserDefinedFunction(getPriceINfo, StringType())


        localDf = localDf.withColumn('priceinfo',udf(Functions.col('Canonical_Manufacturer2'),Functions.col('Item_Description2'),\
            Functions.lit(shouldConsiderOnlyTopScore),Functions.lit(matchingThreshold))).drop('1')

        DataFrame = DataFrame.join(localDf,(localDf.Canonical_Manufacturer2 == DataFrame.Canonical_Manufacturer) & \
            (localDf.Item_Description2 == DataFrame.Item_Description)).drop(localDf.Item_Description2).drop(localDf.Canonical_Manufacturer2)

        DataFrame = self.common_process2(DataFrame)

        return DataFrame
    def _flatten_obs(obs: DataFrame, code_system: str = None) -> DataFrame:
        """Creates a flat version of Observation FHIR resources.

    Note `code_system` is only applied on `code.coding` which is a required
    field, i.e., it is not applied on `value.codeableConcept.coding`.

    Args:
      obs: A collection of Observation FHIR resources.
      code_system: The code system to be used for filtering `code.coding`.
    Returns:
      A DataFrame with the following columns (note one input observation might
      be repeated, once for each of its codes):
      - `coding` from the input observation's `code.coding`
      - `valueCoding` from the input's `value.codeableConcept.coding`
      - `value` from the input's `value`
      - `patientId` from the input's `subject.patientId`
      - `dateTime` from the input's `effective.dateTime`
    """
        sys_str = 'coding.system="{}"'.format(
            code_system) if code_system else 'coding.system IS NULL'
        value_sys_str_base = 'valueCoding.system="{}"'.format(
            code_system) if code_system else 'valueCoding.system IS NULL'
        value_sys_str = '(valueCoding IS NULL OR {})'.format(
            value_sys_str_base)
        merge_udf = F.UserDefinedFunction(
            lambda d, v: merge_date_and_value(d, v), T.StringType())
        return obs.withColumn(
            'coding', F.explode('code.coding')).where(sys_str).withColumn(
                'valueCoding',  # Note valueCoding can be null.
                F.explode_outer('value.codeableConcept.coding')
            ).where(value_sys_str).withColumn(
                'dateAndValue',
                merge_udf(
                    F.col('effective.dateTime'),
                    F.col('value.quantity.value'))).withColumn(
                        'dateAndValueCode',
                        merge_udf(F.col('effective.dateTime'),
                                  F.col('valueCoding.code'))
                    ).select(F.col('coding'), F.col('valueCoding'),
                             F.col('value'),
                             F.col('subject.patientId').alias('patientId'),
                             F.col('effective.dateTime').alias('dateTime'),
                             F.col('dateAndValue'), F.col('dateAndValueCode'),
                             F.col('context.EncounterId').alias('encounterId'))
def main():
    df = spark.read.json(katkam_in_directory)
    schema = types.StructType([
        types.StructField('Date/Time', types.StringType(), False),
        types.StructField('label', types.LongType(), False)
    ])

    tides = spark.read.json(tide_in_directory, schema=schema)
    tides.show()
    df = df.join(tides, 'Date/Time')
    df.show()
    # https://stackoverflow.com/questions/39025707/how-to-convert-arraytype-to-densevector-in-pyspark-dataframe
    to_vec = functions.UserDefinedFunction(lambda vs: Vectors.dense(vs),
                                           VectorUDT())
    df = df.select(df['label'], to_vec(df['features']).alias('features'))

    df.show()

    # Do machine learning
    splits = df.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # Logistic Regression Model
    lr = LogisticRegression()

    models = [lr]
    model = [i.fit(train) for i in models]
    predictions = [i.transform(test) for i in model]
    [i.show() for i in predictions]

    # compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = [evaluator.evaluate(i) for i in predictions]

    # Write the final predictions dataframe to a JSON directory (only one
    # model is fitted above, so take the first element of the list)
    predictions[0].write.json(out_directory, mode='overwrite')

    # Write the final accuracy score to a text file; the weather analysis writes to the same file
    with open(out_directory + '/final-results.txt', 'w+') as fp:
        fp.write('Test set accuracy for tides analysis: ' + str(accuracy))
def add_date_info_spark(df, convert=True):
    '''Take a spark df and add date info
    '''
    if convert:
        f_datestring = F.UserDefinedFunction(lambda x: x[:-4] + '+00:00',
                                             StringType())
        df = df.withColumn("first_event", f_datestring(df.first_event))
        df = df.withColumn("last_event", f_datestring(df.last_event))

    df = df.withColumn("first_event", F.to_timestamp(df.first_event))
    df = df.withColumn("last_event", F.to_timestamp(df.last_event))
    df = df.withColumn("created_at", F.to_timestamp(df.created_at))
    df = df.withColumn("updated_at", F.to_timestamp(df.updated_at))

    df = df.withColumn("recency", F.datediff(df.last_event, df.created_at))
    df = df.withColumn("time_between_first_last_event",
                       F.datediff(df.last_event, df.first_event))

    return df
    def generateNewSpendInfoBasedOnDescription(self, sparkContext,
                                               sparkSession, DataFrame,
                                               shouldConsiderOnlyTopScore,
                                               matchingThreshold):
        def getPriceINfo(Item_Description, shouldConsiderOnlyTopScore,
                         matchingThreshold):
            db = Database()
            connection = db.getDbConn()

            cursor = connection.cursor()

            postgreSQL_select_Query = "select itemdesc from priceinfo9july2019 where lower(itemdesc) like \'{}\'".format(
                Item_Description)

            cursor.execute(postgreSQL_select_Query)

            records = cursor.fetchall()

            proddesc_list = [t[0] for t in records]

            result = self.common_process(proddesc_list, Item_Description,
                                         shouldConsiderOnlyTopScore,
                                         matchingThreshold, cursor, 'itemDesc')

            cursor.close()
            connection.close()
            return result

        udf = Functions.UserDefinedFunction(getPriceINfo, StringType())

        localDf = DataFrame.groupBy(
            Functions.col("Item_Description").alias('Item_Description2')).agg(
                Functions.lit('1'))

        localDf = localDf.withColumn('priceinfo',udf(Functions.col('Item_Description2'),\
            Functions.lit(shouldConsiderOnlyTopScore),Functions.lit(matchingThreshold))).drop('1')

        DataFrame = DataFrame.join(localDf,\
            localDf.Item_Description2 == DataFrame.Item_Description).drop(localDf.Item_Description2)

        DataFrame = self.common_process3(DataFrame)

        return DataFrame
def main():
    # do things...
    input_file = inputs
    schemas = StructType([
        StructField('Language', types.StringType(), False),
        StructField('Page', types.StringType(), False),
        StructField('Requests', types.IntegerType(), False),
        StructField('Size', types.IntegerType(), False)
    ])
    comment = spark.read.csv(input_file, sep=" ", schema=schemas)
    comment = comment.withColumn('Filename', functions.input_file_name())
    my_udf = F.UserDefinedFunction(path_to_hour, returnType=types.StringType())
    comment = comment.withColumn('Split', my_udf('Filename'))
    comment = comment.drop('Filename').withColumnRenamed('Split', 'Filename')
    comment = comment.filter((comment.Language == 'en')
                             & (comment.Page != 'Main_page')
                             & (~(comment.Page.like('Special:%'))))
    max_df = comment.groupBy('Filename').agg({'Requests': 'max'})
    max_df = max_df.withColumnRenamed('max(Requests)', 'Requests')
    final_df = max_df.join(comment, ['Filename', 'Requests'])
    final_df = final_df.select('Filename', 'Page',
                               'Requests').orderBy('Filename')
    final_df.write.csv(output, sep=',')
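
# path_to_hour is referenced above but not shown; a sketch under the
# assumption that input files are named like ".../pagecounts-YYYYMMDD-HHMMSS*",
# returning the "YYYYMMDD-HH" portion used as the grouping key:
import re

def path_to_hour(path):
    match = re.search(r'pagecounts-(\d{8})-(\d{2})', path)
    return '{}-{}'.format(match.group(1), match.group(2)) if match else ''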
    def generateSpendInfoBasedOnUnspsc(self, sparkContext, sparkSession,
                                       DataFrame, shouldConsiderOnlyTopScore,
                                       matchingThreshold):
        def getPriceINfo(Unspsc_Code, Item_Description,
                         shouldConsiderOnlyTopScore, matchingThreshold):
            db = Database()
            connection = db.getDbConn()

            cursor = connection.cursor()

            postgreSQL_select_Query = "select prodcleandesc,id from uniqueProducts_UNSPSC where unspscclasscode={}".format(
                Unspsc_Code)

            cursor.execute(postgreSQL_select_Query)

            records = cursor.fetchall()

            result = self.common_process(records, Item_Description,
                                         shouldConsiderOnlyTopScore,
                                         matchingThreshold, cursor, 'prodDesc')
            cursor.close()
            connection.close()
            return result

        localDf = DataFrame.groupBy(
            Functions.col("Unspsc_Code").alias('Unspsc_Code2'),
            Functions.col("Item_Description").alias('Item_Description2')).agg(
                Functions.lit('1'))

        udf = Functions.UserDefinedFunction(getPriceINfo, StringType())

        localDf = localDf.withColumn('priceinfo',udf(Functions.col('Unspsc_Code2'),Functions.col('Item_Description2'),\
            Functions.lit(shouldConsiderOnlyTopScore),Functions.lit(matchingThreshold))).drop('1')

        DataFrame = DataFrame.join(localDf,(localDf.Unspsc_Code2 == DataFrame.Unspsc_Code) & \
            (localDf.Item_Description2 == DataFrame.Item_Description)).drop(localDf.Item_Description2).drop(localDf.Unspsc_Code2)

        DataFrame = self.common_process2(DataFrame)

        return DataFrame
    def createIMMClassifierInput(self, dataFrame):
        dataFrame = self.generatePIM(dataFrame)

        udf = Functions.UserDefinedFunction(lambda x: str(uuid.uuid1()))

        localDf = dataFrame.filter(dataFrame.Excluded == '0')
        localDf = localDf.withColumn('Part Number', udf(Functions.lit('')))
        localDf = localDf.select(Functions.col('PIM').alias('PIM_KEY')\
                    ,Functions.col('Canonical_Manufacturer').alias('COMPANY_NAME')\
                    ,Functions.col('Part Number').alias('PART_NUMBER')\
                    ,Functions.col('Item_Description').alias('PRODUCT_DESCRIPTION')\
                    ,Functions.lit('').alias('UNSPSC'))

        data_abs_path = os.path.abspath(
            Constants.IMM_CLASSIFIER_INPUT_FILE_LOCATION)
        localDf.toPandas().to_csv('{}/input.csv'.format(data_abs_path),
                                  index=False)

        logger.info("IMM classifier input.csv file placed in {} ".format(
            data_abs_path))

        return dataFrame
def process_business_json(input_json_business):
    business_schema = types.StructType([
        types.StructField('business_id', types.StringType(), True),
        types.StructField('city', types.StringType(), True),
        types.StructField('state', types.StringType(), True),
        types.StructField('stars', types.DoubleType(), True),
        types.StructField('review_count', types.LongType(), True),
        types.StructField('categories', types.StringType(), True),
        types.StructField('postal_code', types.StringType(), True),
        types.StructField('latitude', types.DoubleType(), True),
        types.StructField('longitude', types.DoubleType(), True),
        types.StructField(
            'attributes',
            types.StructType([
                types.StructField("RestaurantsPriceRange2", types.StringType(),
                                  True)
            ]), False),
        types.StructField('is_open', types.IntegerType(), True),
        types.StructField('name', types.StringType(), True)
    ])
    # Read business JSON file using the schema
    business_df = spark.read.json(input_json_business, schema=business_schema)

    price_range_udf = functions.UserDefinedFunction(
        lambda attributes: get_price_range(attributes), types.IntegerType())
    # Filter all the businesses which are still open in the business_states
    business_df = business_df.filter((business_df['is_open'] == 1) &
                                     business_df['state'].isin(business_states) &
                                     business_df['categories'].like(filter_categories)) \
        .withColumn('pricerange', price_range_udf(business_df['attributes'])) \
        .withColumnRenamed('business_id', 'b_id') \
        .withColumnRenamed('stars', 'b_stars') \
        .select('b_id', 'name', 'city', 'state', 'review_count', 'categories',
                'postal_code', 'latitude', 'longitude', 'pricerange', 'b_stars') \
        .repartition(100)

    write_to_cassandra(business_df, TABLE_BUSINESS)
    return business_df
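
# A plausible sketch of get_price_range, based on the attributes struct in the
# business schema above (a Row holding a RestaurantsPriceRange2 string); the
# fallback of 0 for missing values is an assumption.
def get_price_range(attributes):
    if attributes and attributes['RestaurantsPriceRange2'] \
            and attributes['RestaurantsPriceRange2'].isdigit():
        return int(attributes['RestaurantsPriceRange2'])
    return 0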
    def generateRxNormId(self,dataFrame):
        logger.info('in generateRxNormId')
        base_uri = 'http://{}:{}/process/rxnorm?q='.format(Constants.POSTGRESQL_HOST_IP,Constants.CTAKES_PORT)
        
        def genRxNormId(drugName):
            resp = requests.get(url=base_uri+drugName)
            data = resp.json()
            if len(data)>0:
                return data[0]['code']                
            else:
                return ''    

        udf = Functions.UserDefinedFunction(genRxNormId, StringType())

        localDf = dataFrame.groupBy(Functions.col("Drug_Name").alias('Drug_Name2')).agg(Functions.lit('1'))

        localDf = localDf.withColumn("RxNormId",udf(Functions.col("Drug_Name2"))).drop('1')
        
        localDf = dataFrame.join(localDf,(dataFrame['Drug_Name'] == localDf['Drug_Name2'])).drop(localDf['Drug_Name2'])

        localDf = localDf.withColumn("tag",  Functions.when((localDf.tag == "") & (localDf.RxNormId != "") , "Drug").otherwise(localDf.tag))

        return localDf
    def generatePriorityDrugsInfo(self, sparkSession, dataFrame, fileName,
                                  matchingThreshold, matching_type):
        logger.info('in generatePriorityDrugsInfo ')

        priorityDF = self.getdata(sparkSession, fileName)

        if matching_type == 'FUZZY':
            cleandesc_list = priorityDF['Clean Description'].unique().tolist()

            def genpriorityfields(Item_description):
                bestMatch = process.extractOne(Item_description,
                                               cleandesc_list,
                                               scorer=fuzz.token_sort_ratio,
                                               score_cutoff=matchingThreshold)

                if bestMatch is not None:
                    priority_desc = priorityDF.loc[
                        priorityDF['Clean Description'] == bestMatch[0],
                        'Item Description'].iloc[0]
                    return '{}!{}!{}'.format('true', priority_desc,
                                             bestMatch[1])
                else:
                    return '{}!{}!{}'.format('false', '', '')
        else:
            cleandesc_list = priorityDF['processed clean desc'].unique(
            ).tolist()
            lav = MatchString()

            def genpriorityfields(Item_description):
                bestMatch = lav.extractOneWithcutoff(
                    Item_description,
                    cleandesc_list,
                    score_cutoff=matchingThreshold)

                if bestMatch is not None:
                    priority_desc = priorityDF.loc[
                        priorityDF['processed clean desc'] == bestMatch[0],
                        'Item Description'].iloc[0]
                    return '{}!{}!{}'.format('true', priority_desc,
                                             bestMatch[1])
                else:
                    return '{}!{}!{}'.format('false', '', '')

        localDf = dataFrame.groupBy(
            Functions.col("Item_Description").alias('Item_Description2')).agg(
                Functions.lit('1'))

        udf = Functions.UserDefinedFunction(lambda x: genpriorityfields(x))

        localDf = localDf.withColumn(
            "priority-fields",
            udf(Functions.col("Item_Description2"))).drop('1')

        localDf = dataFrame.join(localDf,
                                 (dataFrame['Item_Description']
                                  == localDf['Item_Description2'])).drop(
                                      localDf['Item_Description2'])

        split_col = Functions.split(localDf['priority-fields'], '!')
        localDf = localDf.withColumn("Priority Drug Match Flag", split_col.getItem(0))\
                    .withColumn("Priority Drug Accuracy", split_col.getItem(2))  \
                    .withColumn("Priority Drug Desc", split_col.getItem(1)).drop('priority-fields')

        localDf = localDf.withColumn(
            "tag",
            Functions.when(
                (localDf.tag == "") &
                (Functions.col('Priority Drug Match Flag') == 'true'),
                "Drug").otherwise(localDf.tag))

        return localDf
def fix_gender(txt):
    # NOTE: the start of this snippet was cut off in the listing; the signature
    # is inferred from the gender_fix_udf registration below.
    if 'infantil' in txt or \
            'niños' in txt or \
            'niñas' in txt or \
            'menino' in txt or \
            'menina' in txt:
        return 'infantil'
    if 'unisex' in txt or 'unissex' in txt:
        return 'unisex'
    if 'masculino' in txt:
        return 'masculino'
    if 'femenino' in txt or 'feminino' in txt:
        return 'feminino'
    return txt


gender_fix_udf = F.UserDefinedFunction(fix_gender, T.StringType())
flat = flat.withColumn('gender_fixed', gender_fix_udf('gender'))

# convert Order_date from int to datetime
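
# A possible conversion for the note above, assuming Order_date holds integers
# in YYYYMMDD form (the exact storage format is an assumption):
flat = flat.withColumn(
    'order_date',
    F.to_date(flat['Order_date'].cast('string'), 'yyyyMMdd'))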

# verify with flat.groupBy('gender_fixed').count().sort('count').show(100)

flat = flat.withColumn('average_cost_per_item',
                       flat.product_net_cost / flat.item_sold)
flat = flat.withColumn('absolute_margin',
                       flat.product_net_cost - flat.product_net_revenue)
flat = flat.withColumn('percentage_margin',
                       flat.absolute_margin / flat.product_net_cost)
flat = flat.withColumn('average_price_per_item',
                       flat.product_net_revenue / flat.item_sold)
def main(argv):
    inputfile = False
    outputfile = False
    jumbledwordsfile = False
    try:
        opts, args = getopt.getopt(argv, "i:o:j:",
                                   ["ifile=", "ofile=", "jumbletestfile="])
    except getopt.GetoptError:
        print(
            'spark-submit resolvejumbledword.py -i <dictionary inputfile> -o <dictionary outputfile> -j <jumbled words list file>'
        )
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-j", "--jumbletestfile"):
            jumbledwordsfile = arg

    if not inputfile or not outputfile or not jumbledwordsfile:
        print(
            'spark-submit resolvejumbledword.py -i <dictionary inputfile> -o <dictionary outputfile> -j <jumbled words list file>'
        )
        sys.exit(2)

    print(inputfile)
    print(outputfile)
    print(jumbledwordsfile)

    reformatjson(inputfile, outputfile)
    conf = SparkConf().setAppName("Spark jumble resolution")
    sc = SparkContext(conf=conf)
    #sc = pyspark.SparkContext('Reyane')
    sqlContext = SQLContext(sc)
    df = sqlContext.read.option('multiLine', True).json(outputfile)
    # Displays the content of the DataFrame to stdout

    print('printing the dataframe as it is from the json file loaded ')
    df.show()

    word_udf = F.UserDefinedFunction(sort_string_chars, T.StringType())

    df = df.withColumn("sortedChartWORD", word_udf('word'))

    rank_udf = F.UserDefinedFunction(modify_rank_of_zero, T.IntegerType())

    df = df.withColumn("modifiedRank", rank_udf('rank'))

    print('printing the dataframe with a newly add column')

    df.show()

    sqlContext.registerDataFrameAsTable(df, "table1")

    lines = readjumbledwordfile(jumbledwordsfile)
    sqlstatment = "SELECT * FROM table1 WHERE sortedChartWORD='" + sort_string_chars(
        lines[0]) + "'"
    firstline = lines[0]
    lines.remove(lines[0])
    for x in lines:
        sqlstatment = sqlstatment + " OR sortedChartWORD='" + sort_string_chars(
            x) + "'"

    print(sqlstatment)

    #lines2=lines.append(firstline)
    #length=len(lines)
    lines = lines + [firstline]
    df2 = sqlContext.sql(sqlstatment)
    #df2=sqlContext.sql("SELECT * FROM table1 WHERE sortedChartWORD='adgln' OR sortedChartWORD='ajmor' OR sortedChartWORD='abcelm' OR sortedChartWORD='aelrwy'")

    df2 = df2.toPandas()
    print("Printing the pandas dataframe")
    print(df2)
    print('')
    print("********** Printing the results **********")

    for x in lines:
        df3 = df2[df2['sortedChartWORD'] == sort_string_chars(x)]
        print('')
        print('Printing the best matched word for ' + x + ':')
        if df3['word'].count() == 0:
            print("No match")
        else:
            print(
                df3.sort_values(by=['modifiedRank'],
                                ascending=True).iloc[0]['word'])
    print("********** End of the results **********")
df_0 = spark.read.text("Data/NASA_access_log_Jul95.gz").cache()

# Question 1A

# exploratory analysis showed that there is one record not from Jul/1995
# so we can filter data at first
df = df_0.filter(df_0.value.contains("/Jul/1995")).cache()

#Extract date and time
regExpDate = r'\d{2}/Jul/1995:\d{2}:\d{2}:\d{2}'
df = df.withColumn('date_time', F.regexp_extract('value', regExpDate, 0))
# df.show(5)

# create new columns - transform into Timestamp
udf_new_date = F.UserDefinedFunction(
    lambda x: datetime.strptime(x, '%d/%b/%Y:%H:%M:%S'), TimestampType())
df = df.withColumn('new_date', udf_new_date('date_time'))
df.printSchema()

df = df.withColumn('day', F.dayofmonth('new_date'))
df = df.withColumn('hour', F.hour('new_date'))

# categorise hour intervals

udf_categor = F.UserDefinedFunction(lambda x: '1' if x < 4 else (
    "2" if x < 8 else ("3" if x < 12 else ("4" if x < 16 else
                                           ("5" if x < 20 else "6")))))
df = df.withColumn('hour_categor', udf_categor('hour'))

grouped_categor_hour = df.groupby('hour_categor').agg(
    F.count("value").alias("count")).orderBy("hour_categor").cache()

	spark = SparkSession.builder \
		.appName("Train Naive Bayes Model ").getOrCreate()

	# Create sql Context so that we can query data files in sql like syntax
	sqlContext = SQLContext(spark)

	spark.sparkContext.setLogLevel("ERROR")
	
	# Read in training data (from Yelp)  and pre-process it
	df = sqlContext.read.json(sys.argv[1])
	print ("Loaded " + str(df.count()) + " records with the following schema " + str(df.schema.names))  # print header names 
	df = df.select("business_id", "text", "stars") #select the fields we need
	df = df.where(col("text").isNotNull()) #Remove reviews that have no text
	df.show(2)

	# Since its not labeled with sentiment, we will assume that a 4 or above is positive
	my_udf = F.UserDefinedFunction(generateLabel, IntegerType())
	data = df.withColumn("label", my_udf("stars")) # generate a new column called label based on function 
	data.show(2)

	# Remove punctuation and convert to lowercase
	punc = udf(lambda x: re.sub(r'[^\w\s\d-]', '', x))
	newline = udf(lambda y: re.sub('[\n\t]',' ',y))
	data.createOrReplaceTempView("data") 
	data = spark.table('data').select("business_id", lower(col("text")).alias("text"), "label", "stars")
	data = data.withColumn("text", punc("text"))
	data = data.withColumn("text", newline("text"))
	data.show(2)

	# Generate tokens 
	tokenizer = Tokenizer(inputCol="text", outputCol="words")
	wordsData = tokenizer.transform(data)
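
# generateLabel is referenced above; per the comment in that snippet, a rating
# of 4 stars or above is treated as positive sentiment. A minimal sketch:
def generateLabel(stars):
    return 1 if stars >= 4 else 0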
    return date


def create_cohort(date):

    rv = -1
    if date is not None:
        year = date.strftime("%y")
        week = "%02d" % int(date.strftime("%W"))
        rv = int("{0}{1}".format(year, week))

    return rv


clean_string_for_int_udf = Functions.UserDefinedFunction(
    clean_string_for_int, IntegerType())
clean_string_for_date_udf = Functions.UserDefinedFunction(
    clean_string_for_date, DateType())
create_cohort_udf = Functions.UserDefinedFunction(create_cohort, IntegerType())
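
# A hedged usage sketch: given a DataFrame read with the schema below, the
# UDFs above would typically be applied like this (the output column names
# are illustrative, not from the source).
def add_cohort_columns(df):
    return df.withColumn('lifespan', clean_string_for_int_udf(df['lifespan'])) \
             .withColumn('created_date', clean_string_for_date_udf(df['created'])) \
             .withColumn('cohort', create_cohort_udf(clean_string_for_date_udf(df['created'])))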


schema = StructType([ \
    StructField("path", StringType()), \
    StructField("creator", StringType()), \
    StructField("remover", StringType()), \
    StructField("created", StringType()), \
    StructField("removed", StringType()), \
    StructField("lifespan", StringType()), \
    ])

# The read below was truncated in the source listing; the path variable is a
# hypothetical placeholder, and the schema defined above is assumed to apply.
line_decay_input_df = spark.read.csv(line_decay_csv_path, schema=schema)


# The following lines were the tail of a truncated helper; it is reconstructed
# here (as an assumption) to mirror df_for below, which issues the same
# cassandraTable call.
def rdd_for(keyspace, table, split_size=None):
    rdd = sc.cassandraTable(keyspace, table,
                            split_size=split_size,
                            row_format=pyspark_cassandra.RowFormat.DICT).setName(table)
    return rdd


def df_for(keyspace, table, split_size=None):
    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(
        sc.cassandraTable(keyspace, table,
                          split_size=split_size).setName(table))
    df.createOrReplaceTempView(table)
    return df


toStringList = functions.UserDefinedFunction(
    lambda names: names.split(' '), types.ArrayType(types.StringType()))


def row_to_dict(row):
    return row.asDict()


def save_rdd_to_cassandra(rdd):
    rdd.saveToCassandra(output_keyspace,
                        'orders_parts',
                        consistency_level=ConsistencyLevel.ONE,
                        batch_size=100)


def agg_orders(order_rdd):
    order_rdd.cache()
if __name__ == "__main__":
    # Create Spark Instance
    spark = SparkSession \
        .builder \
        .appName("SimpleCrimeCSVParser") \
        .getOrCreate()

    # Location of CSV File
    filePath = "/Users/terrencebunkley/dev/git/data-eng-bootcamp/data/crimes/Crimes_-_One_year_prior_to_present.csv"


    def get_DD_MM_YYYY(col):
        return col.split(' ')[0]


    getDate = F.UserDefinedFunction(get_DD_MM_YYYY, T.StringType())

    # Load  DataFrame From the CSV
    df = spark.read.load(filePath, format="csv", sep=",", inferSchema="true", header="true").cache()

    print("The column types are %s" % df.dtypes)
    print("The columns we see in this doc are %s" % df.columns)
    print("The rows count %i" % df.count())
    print("Describe function")
    df.describe().show()

    print("Summary function")
    df.summary().show()

    # Show Arrests
    print("The number of arrests we see in this report are %i" % df.filter(df["ARREST"] == "Y").count())
BRP_model = BucketedRandomProjectionLSHModel.load("s3a://lsh-model/BRP_model.model")

lsh = BRP_model.transform(df)
lsh = lsh.drop('features')


# Transform the hash value into string format for storage
def to_str(x):
    y = ''
    for i in x:
        y += str(i[0])
        y += ';'
    y = y[:-1]
    return y

my_udf = F.UserDefinedFunction(to_str, T.StringType())
lsh = lsh.withColumn('hash', my_udf('hashes'))
lsh = lsh.drop('hashes').withColumnRenamed('hash', 'hashes')

# Combine distributed Spark DataFrame to a Pandas DataFrame
def _map_to_pandas(rdds):
    return [pd.DataFrame(list(rdds))]

def topas(df, n_partitions=None):
    if n_partitions is not None: df = df.repartition(n_partitions)
    df_pand = df.rdd.mapPartitions(_map_to_pandas).collect()
    df_pand = pd.concat(df_pand)
    df_pand.columns = df.columns
    return df_pand

lsh_pandas = topas(lsh)