Example #1
def nzpolice_link(env, inputs, settings):
    def set_progress(percentage):
        if env['ui'] is not None:
            widget: OWWidget = env['ui']
            widget.progressBarSet(percentage)

    df = inputs['DataFrame']

    print('Start group offender...')
    # offender -> crimes
    offender_count_list = df.groupby('offender').count().collect()
    offender_count_dict = {
        row['offender']: row['count']
        for row in offender_count_list
    }
    set_progress(5)

    @udf(returnType=IntegerType())
    def offender_count(offender):
        return offender_count_dict[offender]

    print('associate offence count...')
    df = df.withColumn('offender_offence_count', offender_count('offender'))

    set_progress(10)

    #####################################

    print('Start group reports by narrative...')
    groups_by_narrative = df.groupby('narrative_hash').agg(
        F.collect_list(struct(*df.columns)).alias('reports')).collect()

    narrative_primaryoffender = dict()
    for row in groups_by_narrative:
        primary_offender = max(
            row['reports'],
            key=lambda row: row['offender_offence_count'])['offender']
        narrative_primaryoffender[row['narrative_hash']] = primary_offender

    print('%d independent offence reports' %
          len(narrative_primaryoffender))

    @udf(returnType=StringType())
    def replace_offender_for_group_offence(narrative_hash):
        return narrative_primaryoffender[narrative_hash]

    df = df.withColumnRenamed('offender', 'origin_offender')
    df = df.withColumn('offender',
                       replace_offender_for_group_offence('narrative_hash'))
    df = df.drop('origin_offender')

    reports = df.collect()

    # reports = [max(row['reports'], key=lambda row: row['offender_offence_count']) for row in groups_by_narrative]
    groups = {}  # group_by_offender
    for report in reports:
        groups.setdefault(report['offender'], []).append(report)

    set_progress(15)

    print('Start statistics for selection...')

    NUM_GROUPS = len(groups)
    NUM_LINKED = 0
    for group in groups:
        length = len(groups[group])
        if length == 1:
            continue
        NUM_LINKED += scipy.special.comb(
            length, 2, exact=True)  # number of pairs: length*(length-1)/2

    NUM_TO_SELECT = int(math.ceil(
        NUM_LINKED / NUM_GROUPS)) * settings['select_ratio']

    print(
        '%d groups, %d linked pairs, %d unlinked pairs with %d selected per report on average' %
        (NUM_GROUPS, NUM_LINKED, NUM_TO_SELECT * len(reports), NUM_TO_SELECT))
    set_progress(20)

    balancing_ratio = NUM_TO_SELECT * len(reports) / (
        NUM_LINKED + NUM_TO_SELECT * len(reports))

    print('Start links combination...')
    links = []
    for group in groups:
        group_weight = 1 / len(groups[group])
        internal_group_links = [
            t + (group_weight, 1) for t in combinations(groups[group], 2)
        ]
        external_group_links = []

        for report in groups[group]:
            random_groups = random.sample([g for g in groups if g != group],
                                          NUM_TO_SELECT)
            external_group_links += [(report, random.choice(groups[g]), 1.0, 0)
                                     for g in random_groups]

        links.extend(internal_group_links)
        links.extend(external_group_links)

    print('Links combination finished: %d' % len(links))
    set_progress(30)

    print('Start links with distance transformation...')
    linked_rows = []

    progress = 0
    for link in links:
        row1 = link[0]
        row2 = link[1]
        row = {
            feature: FEATURES_TO_USE[feature][2](row1[feature], row2[feature])
            for feature in FEATURES_TO_USE
            if FEATURES_TO_USE[feature][2] is not None
        }
        row['weight'] = link[2]
        row['class'] = link[3]
        linked_rows.append(Row(**row))
        progress += 1
        set_progress(30 + progress * 60 / len(links))

    fields = [
        StructField(feature, FEATURES_TO_USE[feature][3], True)
        for feature in FEATURES_TO_USE
        if FEATURES_TO_USE[feature][2] is not None
    ]
    fields.append(StructField('weight', DoubleType(), False))
    fields.append(StructField('class', IntegerType(), False))

    df = env['sqlContext'].createDataFrame(linked_rows,
                                           schema=StructType(fields))
    attributes = df.columns

    raw_df = _handle_missing(df)
    df = _vector_assembly(raw_df)

    df = _normalize(df)

    _write_arff(attributes, df)

    return {'DataFrame': df, 'RawDataFrame': raw_df}
Example #2
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf


#
def match_func(predict, fraud_label):
    if predict == 0 and fraud_label == 1:
        return "no_match_predict_no_fraud"
    elif predict == 1 and fraud_label == 0:
        return "no_match_predict_fraud"
    else:
        return "match"


#
match_udf = udf(match_func, StringType())
#match_udf = udf(lambda (prediction,fraud_label): "no_match" if prediction!=fraud_label else "match", StringType())
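
# A hedged usage sketch (the DataFrame `scored_df` with integer `prediction` and
# `fraud_label` columns is an assumption, not part of this snippet):
# scored_df = scored_df.withColumn(
#     "match_status", match_udf(scored_df["prediction"], scored_df["fraud_label"]))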

#
# Arguments
#
import argparse
## Parse date of execution
parser = argparse.ArgumentParser()
parser.add_argument("--datev1", help="Execution Date")
args = parser.parse_args()
if args.datev1:
    processdate = args.datev1
# GENERAL PREPARATION SCRIPT
#
#  Date in format YYYYMMDD
Example #3
# train the model
model = ALS.train(
    dfRates.rdd, 20,
    20)  # you could tune these numbers, but these are reasonable choices
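# Note: in ALS.train(ratings, rank, iterations) the first 20 is the rank (the
# number of latent factors) and the second 20 is the number of ALS iterations.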
print("trained ...")

# use this model to predict what the user would rate accommodations that she has not rated
allPredictions = None
for USER_ID in range(0, 100):
    dfUserRatings = dfRates.filter(
        dfRates.userId == USER_ID).rdd.map(lambda r: r.accoId).collect()
    rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
    pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
    predictions = model.predictAll(pairsPotential).map(
        lambda p: (str(p[0]), str(p[1]), float(p[2])))
    predictions = predictions.takeOrdered(5, key=lambda x: -x[2])  # top 5
    print("predicted for user={0}".format(USER_ID))
    if allPredictions is None:
        allPredictions = predictions
    else:
        allPredictions.extend(predictions)

# write them
schema = StructType([
    StructField("userId", StringType(), True),
    StructField("accoId", StringType(), True),
    StructField("prediction", FloatType(), True)
])
dfToSave = sqlContext.createDataFrame(allPredictions, schema)
dfToSave.write.jdbc(url=jdbcUrl, table='Recommendation', mode='overwrite')
Example #4

# parse the datasets into row tuples
yellow_rows = yellow.mapPartitions(parse_yellow)
citi_rows = citi.mapPartitions(parse_citi)

# define dataframe schemas
yellow_schema = StructType([
    StructField('dropoff_time', TimestampType(), True),
    StructField('dropoff_lat', FloatType(), True),
    StructField('dropoff_lng', FloatType(), True)
])

citi_schema = StructType([
    StructField('station_id', IntegerType(), True),
    StructField('ride_id', StringType(), True),
    StructField('start_time', TimestampType(), True)
])

# instantiate the dataframes
yellow_df = sqlContext.createDataFrame(yellow_rows, yellow_schema)
citi_df = sqlContext.createDataFrame(citi_rows, citi_schema)


# filtering function to check if the taxi dropoff location is within 0.25 miles of citibike station
def is_dropoff_close(lat, lng):
    # greenwich and 8th ave station
    station = (40.73901691, -74.00263761)
    # taxi dropoff location
    dropoff = (lat, lng)
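    # The example is truncated here; a hedged completion using the haversine
    # formula (the math import and the 0.25-mile threshold wiring are assumptions
    # based on the comment above).
    import math
    lat1, lng1 = math.radians(station[0]), math.radians(station[1])
    lat2, lng2 = math.radians(dropoff[0]), math.radians(dropoff[1])
    a = math.sin((lat2 - lat1) / 2) ** 2 + \
        math.cos(lat1) * math.cos(lat2) * math.sin((lng2 - lng1) / 2) ** 2
    distance_miles = 2 * 3959.0 * math.asin(math.sqrt(a))  # Earth radius in miles
    return distance_miles <= 0.25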
Example #5
def test_parse_schema():
    struct = parse_schema("struct<foo:int, bar:string>")
    assert struct == StructType(
        [StructField("foo", IntegerType()),
         StructField("bar", StringType())])
Example #6
def classify_spark(training, testing, target_domains, target_domains_dict):

    # Adjust
    target_domains_dict["_other"] = len(target_domains)
    target_domains.append("_other")

    feature_list = [c for c in training.columns if c.startswith("_")]

    assembler = VectorAssembler(inputCols=feature_list,
                                outputCol="features",
                                handleInvalid="skip")

    str2idx = udf(lambda s: float(target_domains_dict[s]), FloatType())
    idx2str = udf(lambda f: target_domains[int(f)], StringType())

    training = assembler.transform(training)
    testing = assembler.transform(testing)
    training = training.withColumn("label_idx", str2idx("label"))
    testing = testing.withColumn("label_idx", str2idx("label"))

    bins = np.zeros(len(target_domains))
    freqs = { row["label_idx"]: row["count"] for row in training.select("label_idx")\
                                                           .groupBy("label_idx").count().collect() }
    for i in freqs:
        bins[int(i)] = freqs[i]
    class_weights = np.sum(bins) / (len(bins) * bins)
    idx2cw = udf(lambda f: float(class_weights[int(f)]), FloatType())
    training = training.withColumn("weight", idx2cw("label_idx"))

    #model = pyspark.ml.classification.DecisionTreeClassifier(labelCol="label_idx",
    #                                    featuresCol="features", predictionCol="prediction_idx")
    model = pyspark.ml.classification.LogisticRegression(
        labelCol="label_idx",
        weightCol="weight",
        featuresCol="features",
        predictionCol="prediction_idx")

    model_fit = model.fit(training)

    training_predictions = model_fit.transform(training)
    testing_predictions = model_fit.transform(testing)

    training_predictions = training_predictions.withColumn(
        "prediction", idx2str("prediction_idx"))
    testing_predictions = testing_predictions.withColumn(
        "prediction", idx2str("prediction_idx"))

    labels_training = training_predictions.select("label").toPandas().values
    labels_test = testing_predictions.select("label").toPandas().values

    pred_training = training_predictions.select("prediction").toPandas().values
    pred_test = testing_predictions.select("prediction").toPandas().values

    training_report = classification_report(labels_training,
                                            pred_training,
                                            output_dict=True)
    testing_report = classification_report(labels_test,
                                           pred_test,
                                           output_dict=True)

    return model_fit, training_report, testing_report
"""
读取CSV文件信息
"""
# Create the SparkSession
spark = SparkSession.builder.appName("chronic").master("local[*]").enableHiveSupport().getOrCreate()
# Read the CSV file with Spark
manbing = spark.read.csv("manbin.csv", header=True, mode="DROPMALFORMED")

"""
对所有主治功能的列进行合并,
把所有String类型的字段合成
一个总String字段
"""
# 使用udf来注册方法,并且指定输入和输出类型
toUnionUDF = udf(union_col, StringType())
# 指定需要合并的列并使用withColumn方法和udf自定义定义方法来进行转换,并生成一个新的列
manbing = manbing.withColumn('d_func', toUnionUDF(
              manbing.d_func_1, manbing.d_func_2, manbing.d_func_3,
              manbing.d_func_4, manbing.d_func_5, manbing.d_func_6,
              manbing.d_func_7, manbing.d_func_8, manbing.d_func_9,
              manbing.d_func_10, manbing.d_func_11, manbing.d_func_12,
              manbing.d_func_13, manbing.d_func_14, manbing.d_func_15,
              manbing.d_func_16))

"""
对所有主治功能总字段进行分词
使用jieba分词的方法进行
"""
start=time.time()
# 使用udf来注册方法,并且指定输入和输出类型
Example #8
def main(context):
    # TASK 1
    try:
        commentsDF = context.read.load('comments.parquet')
        submissionsDF = context.read.load('submissions.parquet')
        labeled_dataDF = context.read.load('label.parquet')
    except:
        commentsDF = sqlContext.read.json('comments-minimal.json.bz2')
        submissionsDF = sqlContext.read.json('submissions.json.bz2')
        labeled_dataDF = sqlContext.read.load('labeled_data.csv',
                                              format='csv',
                                              sep=',',
                                              header="true")
        commentsDF.write.parquet('comments.parquet')
        submissionsDF.write.parquet('submissions.parquet')
        labeled_dataDF.write.parquet('label.parquet')

    # TASK 2
    joined_data = commentsDF.join(labeled_dataDF,
                                  commentsDF.id == labeled_dataDF.Input_id,
                                  'inner').select(col('id'), col('body'),
                                                  col('labeldjt'))

    # TASK 4,5
    ngrams_udf = udf(get_ngrams, ArrayType(StringType()))
    joined_col = joined_data.withColumn('ngrams',
                                        ngrams_udf(joined_data['body']))

    try:
        model = CountVectorizerModel.load('cv.model')

    except:
        # task 6A
        cv = CountVectorizer(inputCol='ngrams',
                             outputCol="features",
                             binary=True)
        model = cv.fit(joined_col)
        vectors = model.transform(joined_col)

        # task 6B
        positive_udf = udf(lambda x: 1 if x == '1' else 0, IntegerType())
        negative_udf = udf(lambda x: 1 if x == '-1' else 0, IntegerType())
        vectors = vectors.withColumn('positive', positive_udf(col('labeldjt')))
        vectors = vectors.withColumn('negative', negative_udf(col('labeldjt')))

        pos = vectors.select(col('positive').alias('label'), col('features'))
        neg = vectors.select(col('negative').alias('label'), col('features'))
        pos.write.parquet('positive_ROC.parquet')
        neg.write.parquet('negative_ROC.parquet')
        model.save('cv.model')
    try:
        posModel = CrossValidatorModel.load('pos.model')
        negModel = CrossValidatorModel.load('neg.model')
    except:
        # Task 7
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        neglr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam,
                                                  [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam,
                                                  [1.0]).build()
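        # A hypothetical wider sweep (these values are illustrative, not from
        # the original):
        # posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [0.01, 0.1, 1.0]).build()
        # negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [0.01, 0.1, 1.0]).build()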
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(estimator=poslr,
                                     evaluator=posEvaluator,
                                     estimatorParamMaps=posParamGrid,
                                     numFolds=5)
        negCrossval = CrossValidator(estimator=neglr,
                                     evaluator=negEvaluator,
                                     estimatorParamMaps=negParamGrid,
                                     numFolds=5)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])

        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.save("pos.model")
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        negModel.save("neg.model")

    # Task 8,9
    try:
        finalDF = context.read.load('final.parquet')
    except:
        extract_id_udf = udf(lambda x: x[3:], StringType())
        comments = commentsDF.select(
            col('id').alias('comment_id'),
            extract_id_udf(col('link_id')).alias('link_id'),
            col('created_utc'), col('body'), col('author_flair_text'),
            col('score').alias('comment_score'))
        submissions = submissionsDF.select(
            col('id').alias('submission_id'), col('title'),
            col('score').alias('submission_score'))
        finalDF = comments.join(submissions,
                                comments.link_id == submissions.submission_id,
                                'inner')
        # sample 2% of the joined data
        finalDF = finalDF.sample(False, 0.02, None)
        pos_threshold_udf = udf(lambda x: 1
                                if x[1] > 0.2 else 0, IntegerType())
        neg_threshold_udf = udf(lambda x: 1
                                if x[1] > 0.25 else 0, IntegerType())
        finalDF = finalDF.filter(
            "body NOT LIKE '%/s%' and body NOT LIKE '&gt;%'")
        finalDF = finalDF.withColumn('ngrams', ngrams_udf(finalDF['body']))
        finalDF = model.transform(finalDF)
        posResult = posModel.transform(finalDF)
        temp = posResult.withColumn(
            'pos', pos_threshold_udf(posResult['probability']))
        temp = temp.select(col('comment_id'), col('link_id'),
                           col('created_utc'), col('body'),
                           col('author_flair_text'), col('comment_score'),
                           col('submission_id'), col('title'),
                           col('submission_score'), col('ngrams'), col('pos'))
        temp = model.transform(temp)
        negResult = negModel.transform(temp)
        temp = negResult.withColumn(
            'neg', neg_threshold_udf(negResult['probability']))
        finalDF = temp.select(col('comment_id'), col('link_id'),
                              col('created_utc'), col('body'),
                              col('author_flair_text'), col('comment_score'),
                              col('submission_id'), col('title'),
                              col('submission_score'), col('ngrams'),
                              col('pos'), col('neg'))
        finalDF.write.parquet('final.parquet')
    # Task 10
    # percentage of positive and negative comments
    try:
        task1 = context.read.load('percentage_value.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        total_rows = finalDF.count()
        total_pos_comments = finalDF.filter(col('pos') == '1').count()
        total_neg_comments = finalDF.filter(col('neg') == '1').count()

        pos_percentage = total_pos_comments / total_rows
        neg_percentage = total_neg_comments / total_rows

        values = [{
            'Total Rows': total_rows,
            'Percentage of Positive Comments': pos_percentage,
            'Percentage of Negative Comments': neg_percentage
        }]
        task1 = sqlContext.createDataFrame(values)
        task1.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("percentage_value.csv")
    #percent over date
    try:
        task2 = context.read.load('time_data.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        task2 = finalDF.withColumn(
            'date',
            F.from_unixtime(col('created_utc')).cast(DateType()))
        task2 = task2.groupBy('date').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task2.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("time_data.csv")
    #percent over states
    try:
        task3 = context.read.load('state_data.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        state = sqlContext.createDataFrame(states, StringType())
        task3 = finalDF.groupBy('author_flair_text').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task3 = task3.join(state, task3.author_flair_text == state.value,
                           'inner').na.drop(subset=['value']).select(
                               col('author_flair_text').alias('state'),
                               col('Positive'), col('Negative'))
        task3.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("state_data.csv")
    #percent over submission score
    try:
        task4 = context.read.load('submission_score.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        task4 = finalDF.groupBy('submission_score').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task4.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("submission_score.csv")
    #percent over comment score
    try:
        task5 = context.read.load('comment_score.csv/*.csv',
                                  format='csv',
                                  sep=',',
                                  header="true")
    except:
        task5 = finalDF.groupBy('comment_score').agg(
            (F.sum('pos') / F.count('pos')).alias('Positive'),
            (F.sum('neg') / F.count('neg')).alias('Negative'))
        task5.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save("comment_score.csv")
    #list top 10 stories of each sentiment
    try:
        top_positive = context.read.load('top_positive.csv/*.csv',
                                         format='csv',
                                         sep=',',
                                         header="true")
        top_negative = context.read.load('top_negative.csv/*.csv',
                                         format='csv',
                                         sep=',',
                                         header="true")
    except:
        top_positive = finalDF.groupBy('title').agg(
            (F.sum('pos') / F.count('pos')).alias('Percentage')).orderBy(
                F.desc('Percentage')).limit(10)
        top_negative = finalDF.groupBy('title').agg(
            (F.sum('neg') / F.count('neg')).alias('Percentage')).orderBy(
                F.desc('Percentage')).limit(10)
        top_positive.repartition(1).write.format(
            "com.databricks.spark.csv").option("header",
                                               "true").save("top_positive.csv")
        top_negative.repartition(1).write.format(
            "com.databricks.spark.csv").option("header",
                                               "true").save("top_negative.csv")
Example #9
def run(gz_paths_cols: List[Tuple[str, str]], ref_set_cols, ref_set_vals):
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    if not os.path.isdir('results_similarities'):
        os.makedirs('results_similarities')

    rand = get_rand_arg()
    if rand:
        random.shuffle(gz_paths_cols)
    
    # unzip
    gz_paths: List[str] = [gz_paths for gz_paths, _ in gz_paths_cols]

    to_sort = get_sort_arg()
    if to_sort:
        files_n_sizes = get_ds_file_sizes(gz_paths)
        sizes = [size for f, size in files_n_sizes]
        gz_paths_cols = list(zip(
            *sorted(list(zip(gz_paths_cols, sizes)),
                    key=lambda x: x[1])))[0]  # sort on sizes
        gz_paths: List[str] = [gz_paths for gz_paths, _ in gz_paths_cols]

    # unzip
    cols: List[str] = [cols for _, cols in gz_paths_cols]

    cache_col_name = {}  # {col_name: semantic_type} | may help if cols are repeated across dfs
    cache_col_val = {}  # there are many values repeated in a column so this helps

    # def _match_semantic_cols(col_name):
    #     if not col_name in cache_col_name:
    #         cache_col_name[col_name] = str(match_preprocess(col_name, ref_set_cols)[S_TYPE])
    #     return cache_col_name[col_name] 

    def _match_semantic_vals(col_val, s_type_col):
        """
        stage 1:
        run value matcher ('match_preprocess') on only the matched s_type_col

        if the cutoff not passed (avg distance from column is too high):
            stage 2:
            use heuristics (from manually examining frequent data for each col (ref_set)) to limit the amount of s_type_vals in ref_set_vals to compare to.
            I.e. null is automatically assigned the matched s_type_col
            I.e. check for substrings, like if 'com' is in the val, then check 'website' s_type_vals for similarity. 'co' is implicitly in 'com' so check business_name as well, etc.
            this is to minimize misclassifications
            place them in 'check' to later build another s_type_vals using only those s_types

            stage 3:
            run 'match_preprocess' again on all s_types except the match s_type_col, or only on the heuristic matches in stage 2 (if they exist (if the heuristic check yielded results))
        
            stage 4:
            check whether the stage 3 result is significantly better than the stage 1 result--by checking whether the avg_dist is some percentage ('IMPROVE_RATIO') better than what it was. If not, assign the val to the matched s_type_col as would happen if the value was null

            stage 5 (doesn't work in spark):
            if the min_dist is less than some similarity cutoff: 'MIN_DIST' (meaning it is sufficiently small) and is larger than some similarity cutoff: 'IDENTICAL_CUTOFF' (meaning it isn't nearly identical to something already in the ref_set) add it to the ref_set. if initial matches are correct, later matches should be more accurate. the ref_set tops out at some sufficient size as to prevent slow down and redundant matching

        all {col_val: s_type} combinations are cached so that identical column values aren't recomputed, and so that spark can assign each to the dataframe by using a udf after they are computed outside of Spark. the cache is cleared after each dataset
        """
        col_val = str(col_val)
        s_type_col = str(s_type_col)

        # print(col_val, s_type_col, {s_type_col: [ref_set_vals[s_type_col]]})
        if not col_val in cache_col_val:
            AVG_CUTOFF = 0.9  # similarity measure worse than this triggers second more general run
            MIN_CUTOFF = 0.65
            IDENTICAL_CUTOFF = 0.10
            IMPROVE_RATIO = 0.5  # second run improved by some percent

            str_col_val = str(col_val).lower()
            # print(str_col_val)
            if str_col_val == 'null' or str_col_val == '-' or str_col_val == '0' or str_col_val == 'none' or str_col_val == '' or col_val is None:
                res_final = (s_type_col, col_val, 0.0, 0.0)  # default to s_type_col
            else:
                res0 = match_preprocess(col_val, {s_type_col: ref_set_vals[s_type_col]}, match_jacc_avg)  # compare to values of matched (based on col_name) semantic type
                # print('res0:', res0)
                # res0[MIN_DIST] != 0.0
                if AVG_CUTOFF < res0[AVG_DIST] or res0 is None:  # was the cutoff passed, i.e. was the value present for this semantic type based on the col_name match?
                    # check only these semantic types based on the content of the col_val (more explicit rules after examining data)
                    check = []
                    if len(str_col_val) == 1 and str_col_val.isalpha():
                        possibles = ['person_name (middle_initial)', 'borough']
                        for pos_s_type in possibles:
                            if s_type_col == pos_s_type:  # which of these is the s_type of the col?
                                check.extend([pos_s_type])
                                break
                    if len(str_col_val) == 2 and str_col_val.isalpha():
                        check.extend(['color'])
                    if len(str_col_val) >= 3:  # can have numbers and other chars
                        if 'llc' in str_col_val or 'inc' in str_col_val or 'co' in str_col_val:
                            check.extend(['business_name'])
                        if 'http' in str_col_val or 'www' in str_col_val or 'org' in str_col_val or 'com' in str_col_val:
                            check.extend(['website'])
                    if len(str_col_val) == 5 and str_col_val.isdigit():
                        check.extend(['zip_code'])
                    if len(str_col_val) >= 6 and 'school' in str_col_val:
                        check.extend(['city_agency', 'street_number',
                                    'phone_number', 'building_classification'])
                    if len(str_col_val) >= 3 and str_col_val.isdigit():
                        check.extend(['city_agency', 'street_number',
                                    'phone_number', 'building_classification'])
                    if len(str_col_val) >= 1 and str_col_val.isdigit():
                        check.extend(['street_number'])

                    # if len(check) > 0:
                    #     print('check:', check)

                    check = list(set(check))
                    
                    if len(check) == 0:
                        # compare to every semantic type but already checked
                        ref_set_diff = copy.deepcopy(ref_set_vals)  # clone
                        for key, val in ref_set_cols.items():  # compare to column names as well (for ms_core)
                            ref_set_diff[key].extend(copy.deepcopy(val))
                    else:
                        # compare to only those in check
                        ref_set_diff = {}
                        for s_type in check:
                            ref_set_diff[s_type] = copy.deepcopy(ref_set_vals[s_type])
                    ref_set_diff[s_type_col] = []  # prevent key error and delete all values for already matched

                    res1 = match_preprocess(col_val, ref_set_diff, match_jacc_avg)  # find similarity with other semantic value types
                    
                    if res0 is None and res1 is None:
                        res_final = (s_type_col, col_val, 0.0, 0.0)
                    elif res0 is None:
                        res_final = res1
                    elif res1 is None:
                        res_final = res0
                    else: # neither are None
                        res_final = min([res0, res1], key=lambda x: x[AVG_DIST])

                    # if AVG_CUTOFF < res_final[AVG_DIST]:  # still greater than cutoff and therefore unknown
                    if not (res_final[AVG_DIST] < res0[AVG_DIST] * (1 - IMPROVE_RATIO)):  # dist has not improved
                        res_final = _default(s_type_col, col_val)  # default to s_type_col
                        # ^ should the distance be non-0 to add to ref_set?
                else:
                    # print('FALSE')
                    res_final = res0  # cutoff passed, return initial result
                
            # not an exact match and up to n different values stored
            if res_final[MIN_DIST] <= MIN_CUTOFF and res_final[MIN_DIST] >= IDENTICAL_CUTOFF and len(ref_set_vals[res_final[S_TYPE]]) < 100:
                ref_set_vals[res_final[S_TYPE]].append(col_val)  # append to ref_set
            cache_col_val[col_val] = str(res_final[S_TYPE])
            # print('res_final:', res_final)

        return cache_col_val[col_val]
        

    # match_semantic_cols = udf(_match_semantic_cols, StringType())
    match_semantic_vals = udf(_match_semantic_vals, StringType())

    master_dct = {}
    def _run(df, i):
        print("col_name:", cols[i])

        col = None

        match_col = match_preprocess(cols[i], {'foo': df.columns})  # match the col from ta name to ds cols name
        if match_col is not None:
            col = match_col[COL]
        else:  # shouldn't exec
            raise Exception(f'{cols[i]} not matched in {str(df.columns)}')

        df_cols = map_cols(df.select(col))  # filter single col
        # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST

        if col not in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
            cache_col_name[col] = match_preprocess(col, ref_set_cols)[S_TYPE]  # match col to s_type
        s_type_col = cache_col_name[col]

        print('s_type_col:', s_type_col)
        print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])

        df_cols = df_cols.withColumn('s_type_col', lit(s_type_col))  # populate df with col s_type
        
        # if i < 35: # run on small datasets (before it gets slow)
        s_types_all = []
        ### Python method: no spark to add to ref_set_vals
        for row in df_cols.select('value', 's_type_col').collect():
            s_type_i = _match_semantic_vals(row['value'], row['s_type_col'])
            s_types_all.append(s_type_i)
        # get (s_type, count)
        s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
        ###

        # the below udf call just pulls out the s_types from the cache
        df_cols = df_cols.withColumn('s_type_val', match_semantic_vals('value', 's_type_col'))  # match unknown column value to semantic type
        df_test = df_cols.groupby('s_type_col', 'value', 's_type_val').count()
        df_test = df_test.sort('count', ascending=False)
        df_test.filter('s_type_val != s_type_col').show(25)
        df_test.show(25)
        # results = [str(list(row.asDict().values())) + '\n' for row in df_test.collect()]
        # print(results[:10])
        # with open('results_similarities/test.txt', '+a') as f:
        #     for s in results:
        #         f.write(s)

        ds_dict = {
            'column_name': col,
            'semantic_types': []
        }
        for s_type, count in s_types_distinct:
            if s_type in LABEL_LIST_TA:
                ds_dict['semantic_types'].append({
                    'semantic_type': s_type,
                    'count': count
                })
            else:
                ds_dict['semantic_types'].append({
                    'semantic_type': 'other',
                    'label': s_type,
                    'count': count
                })
        if gz_paths[i] not in master_dct:
            master_dct[gz_paths[i]] = {}
        master_dct[gz_paths[i]].update({col: ds_dict})

        print('gz_paths[i]:', {gz_paths[i]: master_dct[gz_paths[i]]})

        with open("results_similarities/master_dct.json", "w") as json_file:
            json.dump(master_dct, json_file, indent=4)

        cache_col_name.clear()
        cache_col_val.clear()


    timed(_run, gz_paths)
Example #10
spark.sparkContext.setLogLevel("WARN")
spark

# In[3]:

# path to files
artistdata_path = 'AdvancedML_MusicRecommenderData2/artist_data.csv'
userartist_path = 'AdvancedML_MusicRecommenderData2/user_artist_data_train.csv'
test_path = 'AdvancedML_MusicRecommenderData2/LastFM_Test_Sample.csv'

# In[4]:

# Schemas for both files
artistdata_struct = StructType(
    [StructField('artistId', LongType()),
     StructField('name', StringType())])
userartist_struct = StructType([
    StructField('userId', LongType()),
    StructField('artistId', LongType()),
    StructField('song_count', LongType())
])

# In[5]:

# read artist_data file
artistdata_df = spark.read.csv(artistdata_path,
                               sep='\t',
                               schema=artistdata_struct)
artistdata_df.cache()

# read user_artist_data file
Example #11
class CCSparkJob:

    name = 'CCSparkJob'

    output_schema = StructType([
        StructField("key", StringType(), True),
        StructField("val", LongType(), True)
    ])

    warc_parse_http_header = True

    args = None
    records_processed = None
    warc_input_processed = None
    warc_input_failed = None
    log_level = 'INFO'
    logging.basicConfig(level=log_level, format=LOGGING_FORMAT)

    num_input_partitions = 400
    num_output_partitions = 10

    def parse_arguments(self):
        """ Returns the parsed arguments from the command line """

        description = self.name
        if self.__doc__ is not None:
            description += " - "
            description += self.__doc__
        arg_parser = argparse.ArgumentParser(description=description)

        arg_parser.add_argument("input",
                                help="Path to file listing input paths")
        arg_parser.add_argument("output",
                                help="Name of output table"
                                " (saved in spark.sql.warehouse.dir)")

        arg_parser.add_argument("--num_input_partitions", type=int,
                                default=self.num_input_partitions,
                                help="Number of input splits/partitions")
        arg_parser.add_argument("--num_output_partitions", type=int,
                                default=self.num_output_partitions,
                                help="Number of output partitions")
        arg_parser.add_argument("--local_temp_dir", default=None,
                                help="Local temporary directory, used to"
                                "buffer content from S3")

        arg_parser.add_argument("--log_level", default=self.log_level,
                                help="Logging level")

        self.add_arguments(arg_parser)
        args = arg_parser.parse_args()
        self.validate_arguments(args)
        self.init_logging(args.log_level)

        return args

    def add_arguments(self, parser):
        pass

    def validate_arguments(self, args):
        return True

    def init_logging(self, level=None):
        if level is None:
            level = self.log_level
        else:
            self.log_level = level
        logging.basicConfig(level=level, format=LOGGING_FORMAT)

    def get_logger(self, spark_context=None):
        """Get logger from SparkContext or (if None) from logging module"""
        if spark_context is None:
            return logging.getLogger(self.name)
        return spark_context._jvm.org.apache.log4j.LogManager \
            .getLogger(self.name)

    def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf().setAll((
            ("spark.task.maxFailures", "10"),
            ("spark.locality.wait", "20s"),
            ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ))
        sc = SparkContext(
            appName=self.name,
            conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.records_processed = sc.accumulator(0)
        self.warc_input_processed = sc.accumulator(0)
        self.warc_input_failed = sc.accumulator(0)

        self.run_job(sc, sqlc)

        sc.stop()

    def log_aggregator(self, sc, agg, descr):
        self.get_logger(sc).info(descr.format(agg.value))

    def log_aggregators(self, sc):
        self.log_aggregator(sc, self.warc_input_processed,
                            'WARC input files processed = {}')
        self.log_aggregator(sc, self.warc_input_failed,
                            'WARC input files failed = {}')
        self.log_aggregator(sc, self.records_processed,
                            'records processed = {}')

    @staticmethod
    def reduce_by_key_func(a, b):
        return a + b

    def run_job(self, sc, sqlc):
        input_data = sc.textFile(self.args.input,
                                 minPartitions=self.args.num_input_partitions)

        output = input_data.mapPartitionsWithIndex(self.process_warcs) \
            .reduceByKey(self.reduce_by_key_func)

        sqlc.createDataFrame(output, schema=self.output_schema) \
            .coalesce(self.args.num_output_partitions) \
            .write \
            .format("parquet") \
            .saveAsTable(self.args.output)

        self.get_logger(sc).info('records processed = {}'.format(
            self.records_processed.value))

    def process_warcs(self, id_, iterator):
        s3pattern = re.compile('^s3://([^/]+)/(.+)')
        base_dir = os.path.abspath(os.path.dirname(__file__))

        # S3 client (not thread-safe, initialize outside parallelized loop)
        no_sign_request = botocore.client.Config(
            signature_version=botocore.UNSIGNED)
        s3client = boto3.client('s3', config=no_sign_request)

        for uri in iterator:
            self.warc_input_processed.add(1)
            if uri.startswith('s3://'):
                self.get_logger().info('Reading from S3 {}'.format(uri))
                s3match = s3pattern.match(uri)
                if s3match is None:
                    self.get_logger().error("Invalid S3 URI: " + uri)
                    continue
                bucketname = s3match.group(1)
                path = s3match.group(2)
                warctemp = TemporaryFile(mode='w+b',
                                         dir=self.args.local_temp_dir)
                try:
                    s3client.download_fileobj(bucketname, path, warctemp)
                except botocore.client.ClientError as exception:
                    self.get_logger().error(
                        'Failed to download {}: {}'.format(uri, exception))
                    self.warc_input_failed.add(1)
                    continue
                warctemp.seek(0)
                stream = warctemp
            elif uri.startswith('hdfs://'):
                self.get_logger().error("HDFS input not implemented: " + uri)
                continue
            else:
                self.get_logger().info('Reading local stream {}'.format(uri))
                if uri.startswith('file:'):
                    uri = uri[5:]
                uri = os.path.join(base_dir, uri)
                try:
                    stream = open(uri, 'rb')
                except IOError as exception:
                    self.get_logger().error(
                        'Failed to open {}: {}'.format(uri, exception))
                    self.warc_input_failed.add(1)
                    continue

            no_parse = (not self.warc_parse_http_header)
            try:
                for record in ArchiveIterator(stream,
                                              no_record_parse=no_parse):
                    for res in self.process_record(record):
                        yield res
                    self.records_processed.add(1)
            except ArchiveLoadFailed as exception:
                self.warc_input_failed.add(1)
                self.get_logger().error(
                    'Invalid WARC: {} - {}'.format(uri, exception))
            finally:
                if uri.startswith('file:'):
                    stream.close()

    def process_record(self, record):
        raise NotImplementedError('Processing record needs to be customized')

    @staticmethod
    def is_wet_text_record(record):
        """Return true if WARC record is a WET text/plain record"""
        return (record.rec_type == 'conversion' and
                record.content_type == 'text/plain')

    @staticmethod
    def is_wat_json_record(record):
        """Return true if WARC record is a WAT record"""
        return (record.rec_type == 'metadata' and
                record.content_type == 'application/json')
Example #12

donnees06 = donnees05.map( lambda ligne :
                          transformLignePoste(ligne)).\
                          persist()

donnees07 = donnees06.join(donnees03).persist()

donnees08 = donnees07.sortByKey().persist()

donnees09 = donnees08.map(lambda ligne : tuple([ligne[0]] +
                           [x for x in ligne[1][0]] +
                           [x for x in ligne[1][1]]) ).persist()

schema = StructType([
            StructField('Id'           , StringType() , True),
            StructField('ville'        , StringType() , True),
            StructField('latitude'     , StringType() , True),
            StructField('longitude'    , StringType() , True),
            StructField('altitude'     , StringType() , True),
            StructField('annee'        , IntegerType(), True),
            StructField('mois'         , IntegerType(), True),
            StructField('jour'         , IntegerType(), True),
            StructField('temperature'  , FloatType()  , True),
            StructField('humidite'     , FloatType()  , True),
            StructField('visibilite'   , FloatType()  , True),
            StructField('pression'     , FloatType()  , True)])

donnees10 = spark.createDataFrame(donnees09, schema).cache()
donnees11 = donnees10.filter(donnees10.Id < '08000')
donnees12 = donnees11.groupBy('ville').\
Example #13
class PhoneNumbers:
    filt = r"[\(\)\- ]*"
    mid_zero = r"(?:{0}\( *?0 *?\))".format(filt)
    phone_regex = r"(?:(?<=\D)00{0}3{0}1|\+{0}3{0}1){1}?(?:{0}[0-9]){{9}}".format(
        filt, mid_zero)
    replace_regex = "{0}|{1}".format(mid_zero, filt)
    zeroplus_regex = "^00"
    phone_nl_filter = re.compile(phone_regex)
    replace_filter = re.compile(replace_regex)
    zeroplus_filter = re.compile(zeroplus_regex)
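    # For illustration: phone_regex matches Dutch numbers written as, e.g.,
    # "+31 (0)20 123 4567" or "0031-20-1234567" (the 00-prefix form only when
    # preceded by a non-digit), with optional spaces, dashes and parentheses.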

    output_schema = StructType([
        StructField("num", StringType(), True),
        StructField("urls", ArrayType(StringType()), True)
    ])

    def __init__(self, input_file, output_dir, name, partitions=None):
        self.name = name
        self.input_file = input_file
        self.output_dir = output_dir
        self.partitions = partitions

    def run(self):
        sc = SparkContext(appName=self.name)
        sqlc = SQLContext(sparkContext=sc)

        self.failed_record_parse = sc.accumulator(0)
        self.failed_segment = sc.accumulator(0)

        if self.partitions is None:
            self.partitions = sc.defaultParallelism

        input_data = sc.textFile(self.input_file,
                                 minPartitions=self.partitions)
        phone_numbers = input_data.flatMap(self.process_warcs)

        phone_numb_agg_web = phone_numbers.groupByKey().mapValues(list)

        sqlc.createDataFrame(phone_numb_agg_web, schema=self.output_schema) \
                .write \
                .mode("overwrite") \
                .format("parquet") \
                .save(self.output_dir)

        self.log(sc, "Failed segments: {}".format(self.failed_segment.value))
        self.log(sc,
                 "Failed parses: {}".format(self.failed_record_parse.value))

    def log(self, sc, message, level="warn"):
        log = sc._jvm.org.apache.log4j.LogManager.getLogger(self.name)
        if level == "info":
            log.info(message)
        elif level == "warn":
            log.warn(message)
        else:
            log.warn("Level unknown for logging: {}".format(level))

    def process_warcs(self, input_uri):
        stream = None
        if input_uri.startswith('file:'):
            stream = self.process_file_warc(input_uri)
        elif input_uri.startswith('s3:/'):
            stream = self.process_s3_warc(input_uri)
        if stream is None:
            return []
        return self.process_records(stream)

    def process_s3_warc(self, uri):
        try:
            no_sign_request = botocore.client.Config(
                signature_version=botocore.UNSIGNED)
            s3client = boto3.client('s3', config=no_sign_request)
            s3pattern = re.compile('^s3://([^/]+)/(.+)')
            s3match = s3pattern.match(uri)
            if s3match is None:
                print("Invalid URI: {}".format(uri))
                self.failed_segment.add(1)
                return None
            bucketname = s3match.group(1)
            path = s3match.group(2)
            warctemp = TemporaryFile(mode='w+b')
            s3client.download_fileobj(bucketname, path, warctemp)
            warctemp.seek(0)
            return warctemp
        except BaseException as e:
            print("Failed fetching {}\nError: {}".format(uri, e))
            self.failed_segment.add(1)
            return None

    def process_file_warc(self, input_file):
        try:
            return open(input_file[5:], 'rb')
        except BaseException as e:
            print("Error ocurred loading file: {}".format(input_file))
            self.failed_segment.add(1)
            return None

    def process_records(self, stream):
        try:
            for rec in ArchiveIterator(stream):
                uri = rec.rec_headers.get_header("WARC-Target-URI")
                if uri is None:
                    continue
                try:
                    for num in self.find_phone_numbers(rec.content_stream()):
                        yield (num, uri)
                except UnicodeDecodeError as e:
                    print("Error: {}".format(e))
                    self.failed_record_parse.add(1)
                    continue

        except BaseException as e:
            print("Failed parsing.\nError: {}".format(e))
            self.failed_segment.add(1)

    def find_phone_numbers(self, content):
        content = content.read().decode('utf-8')
        numbers = self.phone_nl_filter.findall(content)
        nums_filt = {
            re.sub(self.zeroplus_filter, "+",
                   re.sub(self.replace_filter, "", num))
            for num in numbers
        }
        for num in nums_filt:
            yield num
Example #14
nargs = int(sys.argv[argctr])
for x in range(0, nargs):
    argctr = argctr + 1
    vecSizes.append(int(sys.argv[argctr]))


# get sample sizes
sampleSizes = []
argctr = argctr + 1
nargs = int(sys.argv[argctr])
for x in range(0, nargs):
    argctr = argctr + 1
    sampleSizes.append(int(sys.argv[argctr]))

argctr = argctr + 1
testHistory = sys.argv[argctr]
argctr = argctr + 1
confidence = float(sys.argv[argctr])

switch = 10
catSizeLog = 10
seqs = spark.read.csv(
    seqFile, sep=',',
    schema=StructType([StructField('word', StringType(), False)]))
seqs.show(10, False)

main(f, seqs, outDir)


sys.exit()


Example #15
def register(self, name, f, returnType=StringType()):
    return self.sqlContext.registerFunction(name, f, returnType)
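
# A hedged usage sketch (the wrapper instance `registry` and the registered
# "people" temp view are assumptions, not part of this snippet):
# registry.register("to_upper", lambda s: s.upper() if s else None)
# registry.sqlContext.sql("SELECT to_upper(name) AS upper_name FROM people").show()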
Example #16
def text_clustering(dataFrame,
                    k_value,
                    w2v=False,
                    w2v_value=None,
                    seed=2137,
                    normalize=True,
                    plot=True):
    """
    args:
        -dataFrame: spark Data Frame
        -k_value: number of clusters in k-means algorithm
        -w2v: if True word2Vec is used and w2v_value must be specified, otherwise tf-idf is used
        -w2v_value: number of parameters to be returned with Word2Vec
        -seed: seed
        -normalize: should normalization after Word2Vec be performed?
        -plot: if True, clusters are visualized with the use of PCA
        
    """

    #Data preprocessing
    tokenizer = Tokenizer(inputCol="text", outputCol="words_raw")
    dataFrame = tokenizer.transform(dataFrame)
    remover = StopWordsRemover(inputCol="words_raw", outputCol="words")
    dataFrame = remover.transform(dataFrame)

    if w2v and w2v_value is None:
        raise ValueError('You have to give w2v_values parameter')

    if not w2v:  #tf-idf
        hashingTF = HashingTF(inputCol="words_raw",
                              outputCol="rawFeatures",
                              numFeatures=20)
        featurizedData = hashingTF.transform(dataFrame)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        memes_df = idfModel.transform(featurizedData)

    else:  #word2vec
        word2Vec = Word2Vec(vectorSize=w2v_value,
                            seed=seed,
                            inputCol="words",
                            outputCol="features_unnormalized")
        model_w2v = word2Vec.fit(dataFrame)
        memes_df = model_w2v.transform(dataFrame)
        model_w2v.write().overwrite().save("hdfs:///models/model_w2v")

        if normalize:
            scaler = StandardScaler(inputCol="features_unnormalized",
                                    outputCol="features",
                                    withStd=True,
                                    withMean=True)
            scalerModel = scaler.fit(memes_df)
            memes_df = scalerModel.transform(memes_df)

    #kmeans
    kmeans = KMeans(k=k_value, seed=seed)
    model_kmeans = kmeans.fit(memes_df)
    memes_df = model_kmeans.transform(memes_df)
    model_kmeans.write().overwrite().save("hdfs:///models/model_kmeans")

    #clustering evaluation
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(memes_df)

    centers = model_kmeans.clusterCenters()

    if plot:

        import matplotlib.pyplot as plt  #virtual environment might have problems if imported "the classical" way

        #pca
        pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
        model_pca = pca.fit(memes_df)
        memes_df = model_pca.transform(memes_df)
        #memes_df.show()

        centers_pca = [None] * len(centers)
        for i in range(len(centers)):
            centers_pca[i] = np.multiply(model_pca.pc.toArray().T,
                                         centers[i]).sum(axis=1)
        centers_pca = np.array(centers_pca)

        #plot section
        split_col = functions.split(memes_df["pcaFeatures"].cast(StringType()),
                                    ',')
        memes_df = memes_df.withColumn(
            'x',
            translate(split_col.getItem(0), "[", "").cast(DoubleType()))
        memes_df = memes_df.withColumn(
            'y',
            translate(split_col.getItem(1), "]", "").cast(DoubleType()))
        #memes_df.show(truncate = False)

        df = memes_df.toPandas()
        groups = df.groupby('prediction')
        fig, ax = plt.subplots()
        ax.margins(0.05)
        for name, group in groups:
            ax.plot(group.x,
                    group.y,
                    marker='o',
                    linestyle='',
                    ms=5,
                    label=name)
            ax.text(centers_pca[name, 0],
                    centers_pca[name, 1],
                    s=name,
                    fontsize=10)
        ax.legend()
        ax.title.set_text("k={0}, wn={1}, Silhouette={2}".format(
            k_value, w2v_value, silhouette))
        plt.show()
        print("PCA, explained variance= {0}".format(
            model_pca.explainedVariance))

    return memes_df
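
# A hedged usage sketch (the DataFrame name `memes_raw_df` and the parameter
# values are assumptions; the input only needs a "text" column):
# clustered_df = text_clustering(memes_raw_df, k_value=5, w2v=True, w2v_value=100)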
Example #17
        return '50-54'
    elif (age >= 55 and age < 60):
        return '55-59'
    elif (age >= 60 and age < 65):
        return '60-64'
    elif (age >= 65 and age < 70):
        return '65-69'
    elif (age >= 70 and age < 75):
        return '70-74'
    elif (age >= 75):
        return '75+'
    else:
        return ''


age_banding_udf = udf(bandingFunction, StringType())
trainingSDF = trainingSDF.withColumn('age_banding',
                                     age_banding_udf(trainingSDF.age))
trainingSDF = trainingSDF.drop('age')

temp_table_name = 'trainingSDF'

trainingSDF.createOrReplaceTempView(temp_table_name)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's now visualize the distribution.

# COMMAND ----------
Example #18
conf = SparkConf().setAppName("Max Temperature").setMaster("local[3]")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()


def myFields(data_passed):
    field = data_passed.split(",")
    zone_id = field[0]
    date = field[1]
    temp_type = field[2]
    temperature = round(float(field[3]) * 0.1 * (9.0 / 5.0) + 32.0, 2)
    return zone_id, date, temp_type, temperature


my_schema = StructType([
    StructField("zone_id", StringType()),
    StructField("date", StringType()),
    StructField("temp_type", StringType()),
    StructField("temperature", FloatType())
])


#By rdd+df
# rdd1 = sc.textFile(r"D:\pythonProject\tammingBigDataSparkPython\1800.csv")
# rdd2 = rdd1.map(lambda x: myFields(x))
# df2 = spark.createDataFrame(rdd2, my_schema)
# df2.show(5)
# rdd3 = df2.filter("temp_type = 'TMAX'").select("zone_id","temperature").rdd\
#         .reduceByKey(lambda x,y: max(x,y))
# df3 = spark.createDataFrame(rdd3)
# df3.show(5)
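
# --- Sketch (not part of the original snippet): a DataFrame-only way to get the maximum
# TMAX reading per zone, which is what the commented block above aims for. Note that
# block's reduceByKey would first need a (key, value) pair RDD; the groupBy/agg below
# avoids that. The CSV path is the one reused from the commented code.
from pyspark.sql import functions as F

df_temps = spark.createDataFrame(
    sc.textFile(r"D:\pythonProject\tammingBigDataSparkPython\1800.csv").map(myFields),
    my_schema)
max_temps = df_temps.filter("temp_type = 'TMAX'") \
    .groupBy("zone_id") \
    .agg(F.max("temperature").alias("max_temperature"))
max_temps.show(5)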
Beispiel #19
0
        .appName('first_app') \
        .getOrCreate()

# Create an RDD from a Python collection
rdd = spark.sparkContext.parallelize([
    (1001, "张飞", 8341, "坦克"),
    (1002, "关羽", 7107, "战士"),
    (1003, "刘备", 6900, "战士")
])

# Specify the schema: StructField(name, dataType, nullable)
# name: the field's name; dataType: the field's data type; nullable: whether the field may contain nulls
from pyspark.sql.types import StructType, StructField, LongType, StringType  # import the type classes
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("hp", LongType(), True), #生命值
    StructField("role_main", StringType(), True)
])

# Apply the schema to the RDD and create a DataFrame
heros = spark.createDataFrame(rdd, schema)
heros.show()

# Create a temporary view from the DataFrame
heros.createOrReplaceTempView("HeroGames")
# Print the number of rows in the DataFrame
print(heros.count())

# Create a DataFrame using automatic type inference
data = [(1001, "张飞", 8341, "坦克"),
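
# --- Sketch (not part of the original snippet, which is cut off above): the
# type-inference route that line starts. With plain Python tuples Spark infers the
# column types, so only the column names need to be supplied.
heros_inferred = spark.createDataFrame(
    [(1001, "张飞", 8341, "坦克"), (1002, "关羽", 7107, "战士"), (1003, "刘备", 6900, "战士")],
    ["id", "name", "hp", "role_main"])
heros_inferred.printSchema()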
Beispiel #20
0
os.environ["PYSPARK_DRIVER_PYTHON"] = "python3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "pyspark-shell"
# -

sp = SparkSession.builder.master("local").appName("data").getOrCreate()

# +
data = [
    ('PNT', 'point', 1, 1, "точка"),
    ('LNS', 'line segment', 2, 2, "отрезок"),
    ('TRI', 'triangle', 3, 2, "треугольник"),
    ('QDR', 'quadrangle', 4, 2, "четырехугольник"),
    ('QDT', 'quadrant', 4, 2, "квадрат"),
]
schema = StructType([
    StructField('id', StringType(), True),
    StructField('name', StringType(), True),
    StructField('points', IntegerType(), True),
    StructField('dimensions', IntegerType(), True),
    StructField('description', StringType(), True)
])

# Convert list to RDD
rdd = sp.sparkContext.parallelize(data)

# Create data frame
df = sp.createDataFrame(rdd, schema)
df.show()

sp.stop()
Beispiel #21
0
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import from_json, col, to_timestamp, window, expr, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("Tumbling Window Steram") \
        .master("local[3]") \
        .config("spark.streaming.stopGracefullyOnShutdown", "true") \
        .config("spark.sql.shuffle.partitions", 2) \
        .getOrCreate()

    stock_schema = StructType([
        StructField("CreatedTime", StringType()),
        StructField("Type", StringType()),
        StructField("Amount", IntegerType()),
        StructField("BrokerCode", StringType())
    ])

    kafka_df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "trades") \
        .option("startingOffsets", "earliest") \
        .load()

    value_df = kafka_df.select(
        from_json(col("value").cast("string"), stock_schema).alias("value"))

    #value_df.printSchema()
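
    # --- Sketch (not part of the original snippet): one way the tumbling-window
    # aggregation could continue, using only the functions imported above. The
    # timestamp format and the 15-minute window size are assumptions.
    trade_df = value_df.select("value.*") \
        .withColumn("CreatedTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss")) \
        .withColumn("Buy", expr("case when Type = 'BUY' then Amount else 0 end")) \
        .withColumn("Sell", expr("case when Type = 'SELL' then Amount else 0 end"))

    window_agg_df = trade_df \
        .groupBy(window(col("CreatedTime"), "15 minute")) \
        .agg(sum("Buy").alias("TotalBuy"), sum("Sell").alias("TotalSell"))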
Beispiel #22
0
hc = HiveContext(ss)

#---------------------------------------------------------------------------------------------------------------------------------------------------
first_month = "'2017-01-%'"
second_month = "'2017-02-%'"
non_dropouts_table = "temp_user_store.rt_nondropouts_jan2017"
dropouts_table = "temp_user_store.rt_dropouts_jan2017"

def rename_device(device):
  if device in ('games console', 'tv', 'set top box', 'desktop', ''):
    return 'desktop'
  elif device in ('mobile phone', 'media player', 'tablet', 'ereader'):
    return 'mobile'
  else:
    return 'other'
rename_device = F.udf(rename_device, StringType())

#---------------------------------------------------------------------------------------------------------------------------------------------------
# Extract individual events
events = hc.sql("SELECT  \
                        visitor_id, \
                        hit_time_gmt, \
                        file_date, \
                        month(file_date) as month, \
                        session_id, \
                        pageview_event_cnt, \
                        video_start_cnt, \
                        video_time_spent_secs, \
                        browser_typ_dsc, \
                        device_type_dsc as device \
                FROM user_business_defined_dataset.cnn_adobe_bdd_web \
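
# --- Sketch (not part of the original snippet): the SQL query above is cut off here.
# Once the `events` DataFrame exists, the rename_device UDF defined earlier would
# typically be applied like this to collapse device descriptions into desktop/mobile/other.
events = events.withColumn('device', rename_device(events['device']))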
Beispiel #23
0
    def register(self, name, f, returnType=None):
        """Register a Python function (including lambda function) or a user-defined function
        as a SQL function.

        .. versionadded:: 1.3.1

        Parameters
        ----------
        name : str,
            name of the user-defined function in SQL statements.
        f : function, :meth:`pyspark.sql.functions.udf` or :meth:`pyspark.sql.functions.pandas_udf`
            a Python function, or a user-defined function. The user-defined function can
            be either row-at-a-time or vectorized. See :meth:`pyspark.sql.functions.udf` and
            :meth:`pyspark.sql.functions.pandas_udf`.
        returnType : :class:`pyspark.sql.types.DataType` or str, optional
            the return type of the registered user-defined function. The value can
            be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
            `returnType` can be optionally specified when `f` is a Python function but not
            when `f` is a user-defined function. Please see the examples below.

        Returns
        -------
        function
            a user-defined function

        Notes
        -----
        To register a nondeterministic Python function, users need to first build
        a nondeterministic user-defined function for the Python function and then register it
        as a SQL function.

        Examples
        --------
        1. When `f` is a Python function:

            `returnType` defaults to string type and can be optionally specified. The produced
            object must match the specified type. In this case, this API works as if
            `register(name, f, returnType=StringType())`.

            >>> strlen = spark.udf.register("stringLengthString", lambda x: len(x))
            >>> spark.sql("SELECT stringLengthString('test')").collect()
            [Row(stringLengthString(test)='4')]

            >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect()
            [Row(stringLengthString(text)='3')]

            >>> from pyspark.sql.types import IntegerType
            >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
            >>> spark.sql("SELECT stringLengthInt('test')").collect()
            [Row(stringLengthInt(test)=4)]

        2. When `f` is a user-defined function (from Spark 2.3.0):

            Spark uses the return type of the given user-defined function as the return type of
            the registered user-defined function. `returnType` should not be specified.
            In this case, this API works as if `register(name, f)`.

            >>> from pyspark.sql.types import IntegerType
            >>> from pyspark.sql.functions import udf
            >>> slen = udf(lambda s: len(s), IntegerType())
            >>> _ = spark.udf.register("slen", slen)
            >>> spark.sql("SELECT slen('test')").collect()
            [Row(slen(test)=4)]

            >>> import random
            >>> from pyspark.sql.functions import udf
            >>> from pyspark.sql.types import IntegerType
            >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic()
            >>> new_random_udf = spark.udf.register("random_udf", random_udf)
            >>> spark.sql("SELECT random_udf()").collect()  # doctest: +SKIP
            [Row(random_udf()=82)]

            >>> import pandas as pd  # doctest: +SKIP
            >>> from pyspark.sql.functions import pandas_udf
            >>> @pandas_udf("integer")  # doctest: +SKIP
            ... def add_one(s: pd.Series) -> pd.Series:
            ...     return s + 1
            ...
            >>> _ = spark.udf.register("add_one", add_one)  # doctest: +SKIP
            >>> spark.sql("SELECT add_one(id) FROM range(3)").collect()  # doctest: +SKIP
            [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)]

            >>> @pandas_udf("integer")  # doctest: +SKIP
            ... def sum_udf(v: pd.Series) -> int:
            ...     return v.sum()
            ...
            >>> _ = spark.udf.register("sum_udf", sum_udf)  # doctest: +SKIP
            >>> q = "SELECT sum_udf(v1) FROM VALUES (3, 0), (2, 0), (1, 1) tbl(v1, v2) GROUP BY v2"
            >>> spark.sql(q).collect()  # doctest: +SKIP
            [Row(sum_udf(v1)=1), Row(sum_udf(v1)=5)]

        """

        # This is to check whether the input function is from a user-defined function or
        # Python function.
        if hasattr(f, 'asNondeterministic'):
            if returnType is not None:
                raise TypeError(
                    "Invalid return type: data type can not be specified when f is"
                    "a user-defined function, but got %s." % returnType)
            if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF,
                                  PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                                  PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                                  PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                                  PythonEvalType.SQL_MAP_PANDAS_ITER_UDF]:
                raise ValueError(
                    "Invalid f: f must be SQL_BATCHED_UDF, SQL_SCALAR_PANDAS_UDF, "
                    "SQL_SCALAR_PANDAS_ITER_UDF, SQL_GROUPED_AGG_PANDAS_UDF or "
                    "SQL_MAP_PANDAS_ITER_UDF.")
            register_udf = UserDefinedFunction(f.func, returnType=f.returnType, name=name,
                                               evalType=f.evalType,
                                               deterministic=f.deterministic)
            return_udf = f
        else:
            if returnType is None:
                returnType = StringType()
            register_udf = UserDefinedFunction(f, returnType=returnType, name=name,
                                               evalType=PythonEvalType.SQL_BATCHED_UDF)
            return_udf = register_udf._wrapped()
        self.sparkSession._jsparkSession.udf().registerPython(name, register_udf._judf)
        return return_udf
Beispiel #24
0
    header=True,
    inferSchema=True)
display(dfFire)

# COMMAND ----------

dfFire.printSchema()

# COMMAND ----------

from pyspark.sql.types import StructField, StructType, IntegerType, StringType, BooleanType

# COMMAND ----------

fire_schema = StructType([
    StructField('CallNumber', StringType(), True),
    StructField('UnitID', StringType(), True),
    StructField('IncidentNumber', IntegerType(), True),
    StructField('CallType', StringType(), True),
    StructField('CallDate', StringType(), True),
    StructField('WatchDate', StringType(), True),
    StructField('ReceivedDtTm', StringType(), True),
    StructField('EntryDtTm', StringType(), True),
    StructField('DispatchDtTm', StringType(), True),
    StructField('ResponseDtTm', StringType(), True),
    StructField('OnSceneDtTm', StringType(), True),
    StructField('TransportDtTm', StringType(), True),
    StructField('HospitalDtTm', StringType(), True),
    StructField('CallFinalDisposition', StringType(), True),
    StructField('AvailableDtTm', StringType(), True),
    StructField('Address', StringType(), True),
Beispiel #25
0
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, IntegerType, StructType


base_path = '/home/edoardo/Udemy/PySpark/Python-and-Spark-for-Big-Data-master/Spark_DataFrames/'
file_name = 'people.json'

spark = SparkSession.builder.appName('Basics').getOrCreate()


# The 'age' field is read as IntegerType; the third argument (True) marks the field as nullable
data_schema = [StructField('age', IntegerType(), True),
                StructField('name', StringType(), True)]

final_struct = StructType(fields = data_schema)


df = spark.read.json(base_path + file_name, schema = final_struct)

# select('age') returns a single-column DataFrame, whereas df['age'] returns a pyspark.sql.Column object
# (show() prints the DataFrame and returns None, so there is no need to wrap it in print())
df.select('age').show()

# withColumn() adds a new column to the DataFrame
df.withColumn('newage', df['age'] * 2).show()

# withColumnRenamed() simply renames a column
df.withColumnRenamed('age', 'my_new_age').show()

df.createOrReplaceTempView('people')
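
# --- Sketch (not part of the original snippet): the select()/Column distinction noted
# above. df['age'] is a Column expression, which only produces data when used inside a
# DataFrame operation such as select() or filter().
age_col = df['age']  # pyspark.sql.Column, no data by itself
df.select((age_col + 1).alias('age_plus_one')).show()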
Beispiel #26
0
    return result

# Get the movie name for a given movie ID
def getMovieName(movieNames, movieId):
    result = movieNames.filter(func.col("movieID") == movieId) \
        .select("movieTitle").collect()[0]

    return result[0]


spark = SparkSession.builder.appName("MovieSimilarities").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

movieNamesSchema = StructType([ \
                               StructField("movieID", IntegerType(), True), \
                               StructField("movieTitle", StringType(), True) \
                               ])
    
moviesSchema = StructType([ \
                     StructField("userID", IntegerType(), True), \
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])
    
    
# Create a broadcast dataset of movieID and movieTitle.
# Apply the ISO-8859-1 charset
movieNames = spark.read \
      .option("sep", "|") \
      .option("charset", "ISO-8859-1") \
      .schema(movieNamesSchema) \
Beispiel #27
0
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('rdd-to-dataframe').getOrCreate()

dept = [("Finance", 10), ("Marketing", 20), ("Sales", 30), ("IT", 40)]
rdd = spark.sparkContext.parallelize(dept)

df = rdd.toDF()
df.printSchema()
df.show(truncate=False)

deptColumns = ["dept_name", "dept_id"]
df2 = rdd.toDF(deptColumns)
df2.printSchema()
df2.show(truncate=False)

deptDF = spark.createDataFrame(data=dept, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

from pyspark.sql.types import StructType, StructField, StringType
deptSchema = StructType([
    StructField('dept_name', StringType(), True),
    StructField('dept_id', StringType(), True)
])

deptDF1 = spark.createDataFrame(data=dept, schema=deptSchema)
deptDF1.printSchema()
deptDF1.show(truncate=False)
Beispiel #28
0
    app_conf = yaml.load(conf, Loader=yaml.FullLoader)
    secret = open(app_secrets_path)
    app_secret = yaml.load(secret, Loader=yaml.FullLoader)

    # Setup spark to use s3
    hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3a.access.key", app_secret["s3_conf"]["access_key"])
    hadoop_conf.set("fs.s3a.secret.key",
                    app_secret["s3_conf"]["secret_access_key"])

    print(
        "\nCreating dataframe ingestion CSV file using 'SparkSession.read.format()'"
    )

    crime_schema = StructType() \
        .add("Id", StringType(), True) \
        .add("City_Name", StringType(), True) \
        .add("Crime_Name", StringType(), True) \
        .add("Damages", StringType(), True) \
        .add("No_Of_Case", IntegerType(), True) \
        .add("Year", IntegerType(), True) \
        .add("Total_Case", IntegerType(), True)

    crime_df = spark.read \
        .option("header", "false") \
        .option("delimiter", ",") \
        .format("csv") \
        .schema(crime_schema) \
        .load("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/crime_dataset/xac.csv")

    crime_df.printSchema()
Beispiel #29
0
    "float": FloatType,
    "double": DoubleType,
    "boolean": BooleanType,
    "struct": StructType,
    "array": ArrayType,
    "bigint": LongType,
    "date": DateType,
    "byte": ByteType,
    "short": ShortType,
    "datetime": TimestampType,
    "binary": BinaryType,
    "null": NullType
}

SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }
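
# --- Sketch (not part of the original snippet): how such a lookup table is typically
# used, e.g. resolving a type name coming from a config or DDL string into a Spark type
# instance. Assumes StructField is imported along with the type classes referenced above.
price_field = StructField("price", SPARK_DTYPES_DICT_OBJECTS["double"], True)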

# Profiler
PROFILER_TYPES = {
    "int", "float", "string", "bool", "date", "null", "array", "double"
}
PROFILER_LEGEND_TYPES = {
    "string": "ABC",
    "int": "#",
    "integer": "#",
    "float": "##.#",
    "double": "##.#",
Beispiel #30
0
import datetime

import numpy
# import scipy.misc
# from scipy import spatial
import scipy
from Orange.widgets.widget import OWWidget
from nltk.corpus import wordnet
from nltk.wsd import lesk
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import IntegerType, StringType, ArrayType, Row, DoubleType, StructField, StructType

from scipy.optimize import linear_sum_assignment


@udf(returnType=StringType())
def p_locationType(string):
    # Map a free-text location description to a coarse category code; the UDF is
    # declared as StringType, so the codes are returned as strings.
    if "Farm" in string:
        return "1"
    if "Residenti" in string:
        return "2"
    if "Commerci" in string:
        return "3"
    if "Publi" in string:
        return "4"
    return "UNKNOWN"


@udf(returnType=IntegerType())
def p_ordinalDate(string):
    start = datetime.datetime.strptime(string.strip(), '%d/%m/%Y')