Example #1
    columns_to_return = [
        'trip_id', 'stop_id', 'prev_stop', 'stop_sequence', 'route_id',
        'filled_ts', 'day', 'hour', 'week', 'diff_schedule_real'
    ]
    df = df.filter(df.filled_ts.isNotNull()) \
           .filter(df.filled_ts > 0) \
           .filter(df.filled_ts != 6666.6666) \
           .select(*columns_to_return)

    return df


if __name__ == '__main__':
    scSpark = SparkSession.builder.appName('Stop to Stop').getOrCreate()
    scSpark.sparkContext.setLogLevel("ERROR")
    sqlCon = SQLContext(scSpark.sparkContext)

    stops_file = 'clean_stops.csv'
    trips_file = 'trips_agglom.csv'
    data_file = 'miniminiset.csv'

    if len(sys.argv) == 4:
        stops_file = sys.argv[1]
        data_file = sys.argv[2]
        trips_file = sys.argv[3]

    # Begin with a straightforward read from csv.
    trips_df = scSpark.read.csv(trips_file, header=True, sep=',')

    # Stop times require casting. Convert to DataFrame
    stops_df = stop_times_to_df(stops_file, scSpark, trips_df)
Example #2
from pyspark.sql import SQLContext
from pyspark.sql.functions import avg
from pyspark import SparkContext

if __name__ == "__main__":
    # create Spark context with necessary configuration
    spark = SparkContext("local", "Stock Returns")

    # read csv data from the stock_prices file
    df = SQLContext(spark).read.option("header", True).csv("stock_prices.csv")

    # calculate daily percentage returns
    df = df.withColumn("return", ((df["close"]-df["open"])/df["open"])*100)

    # average the daily return per date
    df = df.groupBy("date").agg(avg("return").alias("avg_return"))

    # save the average returns to output
    df.write.csv('./stockreturns/')
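The example above targets the legacy SQLContext API; on Spark 2.x and later the same job can be expressed against SparkSession alone. A minimal sketch, not part of the original example:

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col

spark = SparkSession.builder.appName("Stock Returns").getOrCreate()

# read the prices, compute the daily percentage return, then average per date
df = spark.read.option("header", True).csv("stock_prices.csv")
df = df.withColumn("return", (col("close") - col("open")) / col("open") * 100)
df.groupBy("date").agg(avg("return").alias("avg_return")) \
  .write.csv("./stockreturns/", header=True)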
Example #3
def get_sql_context_instance(spark_context):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row, HiveContext
from pyspark.sql.window import Window
from pyspark.sql import functions as psf

HadoopLink = sys.argv[1]
HadoopLink2 = sys.argv[2]

#conf = SparkConf()
#conf.setMaster("local[*]")
#conf.setAppName("BuildChurnChannelVariables")
#conf.set("spark.executor.memory", "4g")
#conf.set("spark.executor.cores", 2)
#conf.set("spark.jars.packages", "com.databricks:spark-csv_2.11:1.4.0")

sc = SparkContext()
sq = SQLContext(sc)
hq = HiveContext(sc)

###HadoopLink = "hdfs://10.82.187.10:8020/hadoop/hdfs/INPUTPARQUET/"

CreditHistoryLog = hq.read.parquet(HadoopLink + "contr/CreditHistory_parquet")
CreditHistoryLog.registerTempTable("CreditHistory")

Client = hq.read.parquet(HadoopLink + "cli/Client_parquet")
Client.registerTempTable("Client")
ClientBlackList = hq.read.parquet(HadoopLink + "cli/ClientBlackList_parquet")
ClientBlackList.registerTempTable("ClientBlackList")

seip = hq.sql("\
SELECT distinct	ClientID,ReportingDate FROM CreditHistory")
seip.registerTempTable("seip")
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext
import numpy as np
import imutils
import cv2
from imageai.Detection import ObjectDetection
import tensorflow as tf
import time

conf = SparkConf().setAppName("object detection streaming").setMaster("yarn")
conf.set("spark.scheduler.mode", "FAIR")
conf.set("spark.scheduler.allocation.file",
         "/opt/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml")
sc = SparkContext(conf=conf)
sc.setLocalProperty("spark.scheduler.pool", "pool3")
ssc = StreamingContext(sc, 0.5)
sql_sc = SQLContext(sc)
input_topic = 'input'
output_topic = 'output3'
brokers = "G01-01:2181,G01-02:2181,G01-03:2181,G01-04:2181,G01-05:2181,G01-06:2181,G01-07:2181,G01-08:2181," \
          "G01-09:2181,G01-10:2181,G01-11:2181,G01-12:2181,G01-13:2181,G01-14:2181,G01-15:2181,G01-16:2181"


def my_decoder(s):
    return s


kafkaStream = KafkaUtils.createStream(ssc,
                                      brokers,
                                      'test-consumer-group-3',
                                      {input_topic: 15},
                                      valueDecoder=my_decoder)
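The original script is cut off after the stream is created; a hedged sketch of one way it could continue, pulling the message payloads out of the (key, value) pairs and exposing each micro-batch through sql_sc (the single "frame" column name is an assumption):

frames = kafkaStream.map(lambda kv: kv[1])

def register_batch(rdd):
    # Register each non-empty micro-batch as a temporary view for SQL access.
    if not rdd.isEmpty():
        sql_sc.createDataFrame(rdd.map(lambda v: (v,)), ["frame"]) \
              .createOrReplaceTempView("frames")

frames.foreachRDD(register_batch)

ssc.start()
ssc.awaitTermination()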
#import path
from pyspark.sql import *

# create spark sql session
myspark = SparkSession\
    .builder\
    .config("spark.executor.instances", 3 ) \
    .config("spark.executor.memory", "5g") \
    .config("spark.executor.cores", 2) \
    .config("spark.dynamicAllocation.maxExecutors", 10) \
    .config("spark.scheduler.listenerbus.eventqueue.size", 10000) \
    .config("spark.sql.parquet.compression.codec", "snappy") \
    .appName("Sample_07_kmeans") \
    .getOrCreate()



sc = myspark.sparkContext



from pyspark.sql import SQLContext
import pandas as pd

print(sc)
df = pd.read_csv("test.csv")
print(type(df))
print(df)
sqlCtx = SQLContext(sc)
sqlCtx.createDataFrame(df).show()


"""
This is a Spark script for curating the historical options
"""

sc.install_pypi_package('scipy')

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import numpy as np

spark = SparkSession.builder.appName('MapRedBook').getOrCreate()
ctx = SQLContext(spark)

options = ctx.read.load('s3n://nbachmei.finsurf.data-us-west-2/data/csv/L2_options_*.csv', 
  header="true",
  inferSchema="true",
  format="csv")

import pyspark.sql.functions as F
import datetime
import numpy as np
import scipy.stats as si

@F.udf
def fix_date(dt):    
    if dt is None:
        return dt
    parts = dt.split('/')
    return "{}-{}-{}".format(parts[2],parts[0],parts[1])
    
@F.udf
Example #8
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import FloatType
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.stat import Correlation, Summarizer
import sys

import config

DATA_CSV = config.BUILDDIR / 'bd_lab_small_sample.csv'
DATA_PARQUET = DATA_CSV.with_suffix('.parquet')

spark = SparkContext.getOrCreate()
sql = SQLContext(spark)


def convert():
    """Convert CSV to Parquet."""
    df = sql.read.csv(str(DATA_CSV), header=True, inferSchema='true')
    for field in [
            'cost', 'call_duration_minutes', 'data_volume_mb', 'LAT', 'LON'
    ]:
        df = df.withColumn(field, df[field].cast(FloatType()))
    df.write.parquet(str(DATA_PARQUET))


def explore():

    df = sql.read.parquet(str(DATA_PARQUET))
def test(csv_report,
         all,
         dummy_test,
         separate_test,
         all_but_test,
         primary_pairs_test,
         custom_combos_test,
         non_zero_users_from_file):

    logging.info('Testing started')

    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)
    logging.info('Spark context initialization')

    sc = SparkContext(cfg.spark.master, 'map_test: train')
    sqlContext = SQLContext(sc)

    logging.info('Train data reading')

    test_df = sqlContext.read.json(cfg.splitting.test_file).cache()
    test_data = test_df.filter("event = '%s'" % (cfg.testing.primary_event)).collect()

    #non_zero_users = set([r[0] for r in test_data][500:650]) # Because actually all our users have 0.0 scores -- too few data

    if all or dummy_test:
        logging.info('Train data reading')

        train_df = sqlContext.read.json(cfg.splitting.train_file).cache()
        counts = train_df.filter("event = '%s'" % (cfg.testing.primary_event)).groupBy("targetEntityId").count().collect()
        sorted_rating = sorted([(row.asDict()['count'], row.asDict()['targetEntityId']) for row in counts], reverse=True)
        elements = np.array([item for cnt, item in sorted_rating])
        probs = np.array([cnt for cnt, item in sorted_rating])
        probs = 1.0 * probs / probs.sum()

        logging.info('Process dummy test')
        # case 1. Random sampling from items (uniform)
        dummy_uniform_res = run_map_test_dummy(test_data, items=elements, probs=probs,
                                               uniform=True, top=False, K=cfg.testing.map_k)

        # case 2. Random sampling from items (according to their distribution in training data)
        dummy_res = run_map_test_dummy(test_data, items=elements, probs=probs,
                                       uniform=False, top=False, K=cfg.testing.map_k)

        # case 3. Top-N items from training data
        dummy_top_res = run_map_test_dummy(test_data, items=elements, probs=probs,
                                           uniform=True, top=True, K=cfg.testing.map_k)

        reporter.start_new_sheet('Dummy MAP benchmark')
        reporter.report(
            ['', 'Random uniform', 'Random sampled from train', 'Top - N'],
            [[('MAP @ %d' % i) for i in range(1, len(dummy_res)+1)]] + [dummy_uniform_res, dummy_res, dummy_top_res],
            cfg=cfg
        )
        reporter.finish_sheet()
        logging.info('Process top 20 dummy test')

        scores = []

        for i in range(20):
            scores.append(run_map_test_dummy(test_data, items=elements[i:], uniform=True,
                                             top=True, K=1, no_progress=True)[0])

        reporter.start_new_sheet('Top-20 performance')
        reporter.report(
            ['Rank', 'MAP@1'],
            [list(range(1, 21)), scores],
            bold_first_column=False,
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or separate_test or all_but_test or primary_pairs_test or custom_combos_test:
        logging.info('Non zero users')

        if non_zero_users_from_file:
            with open(cfg.testing.non_zero_users_file) as input:
                non_zero_users = set(input.read().split(','))
        else:
            _, r_data, _ = run_map_test(test_data, [cfg.testing.primary_event], test=False)
            non_zero_users = get_nonzero(r_data)
            with open(cfg.testing.non_zero_users_file, 'w') as output:
                output.write(','.join(non_zero_users))

    if all or separate_test:
        logging.info('Process "map separate events" test')
        columns = []

        for ev in cfg.testing.events:
            (r_scores, r_data, ipu) = run_map_test(test_data, [ev], users=non_zero_users, test=False)
            columns.append(r_scores + [len(non_zero_users)])

        first_column = [('MAP @ %d' % i) for i in range(1, len(columns[0]))] + ['non-zero users']

        reporter.start_new_sheet('MAP separate events')
        reporter.report(
            ['event'] + cfg.testing.events,
            [first_column] + columns,
            selected_columns=[cfg.testing.events.index(cfg.testing.primary_event) + 1],
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or all_but_test:
        logging.info('Process "map all but..." test')
        events_scores = []
        for ev in cfg.testing.events:
            evs = list(cfg.testing.events)
            evs.remove(ev)
            (r_scores, r_data, ipu) = run_map_test(test_data, evs, users=non_zero_users, test=False)
            events_scores.append(r_scores + [len(non_zero_users)])

        evl = cfg.testing.events
        all_scores, r_data, ipu = run_map_test(test_data, evl, users=non_zero_users, test=False)
        all_scores.append(len(non_zero_users))

        first_column = [('MAP @ %d' % i) for i in range(1, len(all_scores))] + ['non-zero users']
        reporter.start_new_sheet('MAP all but...')
        reporter.report(
            ['event'] + cfg.testing.events + ['All'],
            [first_column] + events_scores + [all_scores],
            selected_columns=[cfg.testing.events.index(cfg.testing.primary_event) + 1],
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or primary_pairs_test:
        logging.info('Process "map pairs with primary" test')
        columns = []
        events_without_primary = [event for event in cfg.testing.events if event != cfg.testing.primary_event]

        for event in events_without_primary:
            (r_scores, r_data, ipu) = run_map_test(test_data, [cfg.testing.primary_event, event],
                                                   users=non_zero_users, test=False)
            columns.append(r_scores + [len(non_zero_users)])

        first_column = [('MAP @ %d' % i) for i in range(1, len(columns[0]))] + ['non-zero users']

        reporter.start_new_sheet('MAP pairs with primary')
        reporter.report(
            ['event'] + events_without_primary,
            [first_column] + columns,
            cfg=cfg
        )
        reporter.finish_sheet()

    if all or custom_combos_test:
        logging.info('Process "custom combos" test')
        columns = []

        for event_group in cfg.testing.custom_combos.event_groups:
            if len(event_group) == 2 and cfg.testing.primary_event in event_group and primary_pairs_test:
                logging.warn("Report for group %s already generated in 'MAP pairs with primary'" % str(event_group))
                continue

            if len(event_group) == 1 and separate_test:
                logging.warn("Report for group %s already generated in 'MAP separate events'" % str(event_group))
                continue

            if len(event_group) >= len(cfg.testing.events) - 1 and all_but_test:
                logging.warn("Report for group %s already generated in 'All but...'" % str(event_group))
                continue

            if not (set(cfg.testing.events) & set(event_group)):
                logging.warn("Event group is not corect!")
                continue

            (r_scores, r_data, ipu) = run_map_test(test_data, event_group,
                                                   users = non_zero_users,
                                                   test=False)
            columns.append(r_scores + [len(non_zero_users)])

        if columns:
            first_column = [('MAP @ %d' % i) for i in range(1, len(columns[0]))] + ['non-zero users']

            reporter.start_new_sheet('Custom combos')
            reporter.report(
                ['event'] + [str([s.encode('utf-8') for s in group]) for group in cfg.testing.custom_combos.event_groups],
                [first_column] + columns,
                cfg=cfg
            )
            reporter.finish_sheet()

    reporter.finish_document()
    logging.info('Testing finished successfully')
def run():
    # Creating the Spark Context
    sc = SparkContext(master="local[2]", appName="WindowWordCount")
    sc.setLogLevel("ERROR")

    # creating the streaming context
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    # creating the SQL context
    sqlContext = SQLContext(sc)

    host = "localhost"
    port = 5599

    lines = ssc.socketTextStream(host, port)

    hashtags = lines.filter(lambda text: len(text) > 0) \
        .flatMap(lambda text: text.split(" ")) \
        .filter(lambda text: text.lower().startswith('#'))

    Word = namedtuple('Word', ("word", "count"))
    Hashtag = namedtuple('Hashtag', ("tag", "count"))
    Tweet = namedtuple('Tweet', ('text', 'sentiment'))

    stop_words = set(stopwords.words('english'))
    list_punct = list(string.punctuation)
    lemmatizer = WordNetLemmatizer()

    # processing to obtain data about tweets text and sentiment
    lines.window(40) \
        .map(lambda p: clean_tweet(p)) \
        .filter(lambda text: len(text) > 0) \
        .map(lambda p: Tweet(p, analyze_sentiment_polarity(p))) \
        .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("tweets"))

    # processing to obtain data about single words in text and their count. NLP tools applied.
    lines.window(40) \
        .map(lambda p: clean_tweet(p)) \
        .filter(lambda text: len(text) > 0) \
        .flatMap(lambda text: text.split(" ")) \
        .map(lambda word: word.lower()) \
        .filter(lambda word: word not in stop_words) \
        .map(lambda word: ''.join(char for char in word if char not in list_punct)) \
        .map(lambda word: lemmatizer.lemmatize(word)) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda p: Word(p[0], p[1])) \
        .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("words"))

    # processing to obtain data about hashtags in text and their count.
    hashtags.window(40) \
        .map(lambda word: ''.join(char for char in word if char not in list_punct)) \
        .map(lambda word: (word.lower(), 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda p: Hashtag(p[0], p[1])) \
        .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("hashtags"))

    time_to_wait = 80
    ssc.start()
    print("Session Started.....")
    print("Collecting tweets...waiting for " + str(time_to_wait) +
          " seconds..")
    time.sleep(
        time_to_wait)  # wait to ensure that some data have been collected.
    print("Tweets Collected....")

    all_hashtags_df = None
    all_tweets_df = None
    all_words_df = None

    count = 1
    count_max = 4
    while count <= count_max:
        print('Count: ' + str(count) + "/" + str(count_max))
        print("Waiting for 30 Seconds.....")
        time.sleep(40)

        words = sqlContext.sql('Select word, count from words')
        words_df = words.toPandas()
        print(words_df)
        if all_words_df is None:
            all_words_df = words_df
        else:
            all_words_df = pd.concat([all_words_df, words_df],
                                     join='inner',
                                     ignore_index=True)

        tags = sqlContext.sql('Select tag, count from hashtags')
        tags_df = tags.toPandas()
        print(tags_df)
        if all_hashtags_df is None:
            all_hashtags_df = tags_df
        else:
            all_hashtags_df = pd.concat([all_hashtags_df, tags_df],
                                        join='inner',
                                        ignore_index=True)

        tweets = sqlContext.sql('Select text, sentiment from tweets')
        tweets_df = tweets.toPandas()
        if all_tweets_df is None:
            all_tweets_df = tweets_df
        else:
            all_tweets_df = pd.concat([all_tweets_df, tweets_df],
                                      join='inner',
                                      ignore_index=True)

        count += 1

    ssc.stop()

    # Saving all dataframes as csv.
    if all_hashtags_df is not None:
        all_hashtags_df.to_csv('hashtags.csv')
    if all_words_df is not None:
        all_words_df.to_csv('words.csv')
    if all_tweets_df is not None:
        all_tweets_df.to_csv('tweets.csv')
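run() assumes clean_tweet and analyze_sentiment_polarity helpers that are not part of this snippet; a minimal sketch under that assumption, using re for cleaning and TextBlob for polarity (the original implementation may differ):

import re
from textblob import TextBlob

def clean_tweet(tweet):
    # Hypothetical helper: strip mentions, links and non-alphanumeric characters.
    return ' '.join(re.sub(r"(@[A-Za-z0-9_]+)|(\w+://\S+)|([^0-9A-Za-z \t])", " ", tweet).split())

def analyze_sentiment_polarity(text):
    # Hypothetical helper: polarity in [-1, 1] via TextBlob.
    return TextBlob(text).sentiment.polarity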
Example #11
    def Predict(self):
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary":
                "Initialized The Generalized Linear Regression Scripts",
                "weight": 2
            },
            "predictionStart": {
                "summary":
                "Generalized Linear Regression Model Prediction Started",
                "weight": 2
            },
            "predictionFinished": {
                "summary":
                "Generalized Linear Regression Model Prediction Finished",
                "weight": 6
            }
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        dataSanity = True
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "predictionStart",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        test_data_path = self._dataframe_context.get_input_file()
        score_data_path = self._dataframe_context.get_score_path(
        ) + "/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path(
        )
        trained_model_path += "/model"
        pipeline_path = "/".join(
            trained_model_path.split("/")[:-1]) + "/pipeline"
        print("trained_model_path", trained_model_path)
        print("pipeline_path", pipeline_path)
        print("score_data_path", score_data_path)
        pipelineModel = MLUtils.load_pipeline(pipeline_path)
        trained_model = MLUtils.load_generalized_linear_regresssion_pyspark_model(
            trained_model_path)
        df = self._data_frame
        indexed = pipelineModel.transform(df)
        transformed = trained_model.transform(indexed)
        if result_column in transformed.columns:
            transformed = transformed.withColumnRenamed(
                result_column, "originalLabel")
        transformed = transformed.withColumnRenamed("prediction",
                                                    result_column)
        pandas_scored_df = transformed.select(
            list(set(self._data_frame.columns + [result_column]))).toPandas()
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        pandas_scored_df.to_csv(score_data_path, header=True, index=False)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "predictionFinished",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        print("STARTING Measure ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [
            x for x in columns_to_drop
            if x in df.columns and x != result_column
        ]
        print("columns_to_drop", columns_to_drop)
        spark_scored_df = transformed.select(
            list(set(columns_to_keep + [result_column])))

        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        df = df_helper.get_data_frame()
        # self._dataframe_context.set_dont_send_message(True)
        try:
            fs = time.time()
            descr_stats_obj = DescriptiveStatsScript(
                df,
                df_helper,
                self._dataframe_context,
                self._result_setter,
                self._spark,
                self._prediction_narrative,
                scriptWeight=self._scriptWeightDict,
                analysisName="Descriptive analysis")
            descr_stats_obj.Run()
            print("DescriptiveStats Analysis Done in ",
                  time.time() - fs, " seconds.")
        except:
            print("Frequency Analysis Failed ")

        try:
            fs = time.time()
            df_helper.fill_na_dimension_nulls()
            df = df_helper.get_data_frame()
            dt_reg = DecisionTreeRegressionScript(
                df,
                df_helper,
                self._dataframe_context,
                self._result_setter,
                self._spark,
                self._prediction_narrative,
                self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName="Predictive modeling")
            dt_reg.Run()
            print("DecisionTrees Analysis Done in ",
                  time.time() - fs, " seconds.")
        except:
            print("DTREE FAILED")

        try:
            fs = time.time()
            two_way_obj = TwoWayAnovaScript(
                df,
                df_helper,
                self._dataframe_context,
                self._result_setter,
                self._spark,
                self._prediction_narrative,
                self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName="Measure vs. Dimension")
            two_way_obj.Run()
            print("OneWayAnova Analysis Done in ",
                  time.time() - fs, " seconds.")
        except:
            print("Anova Analysis Failed")
Example #12
import re
import nltk
from pyspark import SparkConf, SparkContext
from pyspark import sql
from pyspark.sql import SQLContext
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
from nltk.stem import WordNetLemmatizer
import time
start_time = time.time()
nltk.download('vader_lexicon')
# Set of all stopwords
conf = SparkConf().setAppName("Yelp")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)
"""
Schema
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
"""
loc = 'c'  # 'c'
# Full
if loc == 'f':
Example #13
def main():
    """Executes Batch pipeline to store dataset into Cloud Spanner table."""

    parser = argparse.ArgumentParser(
        description='Perform Batch processing to send session data to Spanner')

    parser.add_argument('--input',
                        help='''Path to data set in cloud storage
            Example: --input gs://project/path/to/GCS/file''',
                        required=True)

    parser.add_argument('--instance_id',
                        help='''Cloud Spanner instance ID
            Example: --instance_id spanner_instance_id''',
                        required=True)

    parser.add_argument('--database_id',
                        help='''Cloud Spanner database ID
            Example: --database-id spanner_database_id''',
                        required=True)

    args = parser.parse_args()

    logging.info('Reading Dataset')
    user_sessions_chunks_df = pd.read_csv(args.input,
                                          encoding='utf-8',
                                          chunksize=int(10**2))

    conf = SparkConf().setAppName("Batch Processing with Spark").setMaster(
        "local")

    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    instance = get_instance(args.instance_id)

    logging.info('Creating user_sessions Spanner Database')
    create_database(instance, args.database_id)

    product_attributes = [
        'category', 'sub_category', 'product', 'product_details'
    ]

    schema = StructType([
        StructField("event_time", StringType(), True),
        StructField("event_type", StringType(), True),
        StructField("product_id", StringType(), True),
        StructField("category_id", StringType(), True),
        StructField("category_code", StringType(), True),
        StructField("brand", StringType(), True),
        StructField("price", StringType(), True),
        StructField("user_id", StringType(), True),
        StructField("user_session", StringType(), True)
    ])

    for user_sessions_chunk_df in user_sessions_chunks_df:

        logging.info('Transforming data from the Batch')
        # print(user_sessions_chunk_df.count())
        user_sessions_df = transform_data(sqlContext, user_sessions_chunk_df,
                                          product_attributes, schema)

        logging.info(
            'Loading DF Data from the Batch into events_batch Spanner Table')

        user_sessions_rows = user_sessions_df.to_records(index=True,
                                                         index_dtypes=int)
        user_sessions_values = list(user_sessions_rows)
        write_to_spanner(instance, args.database_id, user_sessions_values)

    spanner_success_message = ('Finished Loading DF Data from all' +
                               ' Batches into events_batch Spanner Table')
    logging.info(spanner_success_message)
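main() relies on get_instance, create_database and write_to_spanner helpers that are not shown. A hedged sketch of write_to_spanner using the google-cloud-spanner client; the events_batch table name comes from the log messages above, while the leading 'id' column (the pandas index added by to_records) and the exact column order are assumptions:

def write_to_spanner(instance, database_id, values):
    # Hypothetical helper: batch-insert the prepared rows into the events_batch table.
    database = instance.database(database_id)
    with database.batch() as batch:
        batch.insert(
            table='events_batch',
            columns=('id', 'event_time', 'event_type', 'product_id', 'category_id',
                     'category_code', 'brand', 'price', 'user_id', 'user_session'),
            values=values)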
Example #14
def main(sc, src_s3_bucket, target_gremlin_server):
    gremlin_method_insert_pkg_version =  """
        def insert_package_version(g, ecosystem, name, version) {
        def pred_pkg = g.V().has('vertex_label', 'Package').has('name', name).has('ecosystem', ecosystem);
        def pkg_vertex = (pred_pkg.hasNext()) ? pred_pkg.next() : g.addV('vertex_label', 'Package', 'name', name, 'ecosystem', ecosystem).next()

        def pred_version = g.V().has('vertex_label', 'Version').has('pecosystem', ecosystem).has('pname', name).has('version', version);
        if (!pred_version.hasNext()) {
            def version_vertex = g.addV('vertex_label', 'Version', 'pecosystem', ecosystem, 'pname', name, 'version', version).next();
            pkg_vertex.addEdge('has_version', version_vertex);
        }
    }
    """

    gremlin_method_insert_ref_stack = """
    def insert_ref_stack(g, sid, sname, secosystem, usage, source, is_ref_stack, dependencies) {
        def pred_stack = g.V().has('vertex_label', 'Stack').has('sname', sname).has('secosystem', secosystem)
        if (!pred_stack.hasNext()) {
            def stack_vertex = g.addV('vertex_label','Stack','sname', sname, 'secosystem', secosystem, 'usage', usage, 'source', source, 'is_ref_stack', is_ref_stack, 'sid', sid).next();

            for (k in dependencies.keySet()) {
                def version_vertex = g.V().has('vertex_label', 'Version').has('pecosystem', secosystem).has('pname', k).has('version', dependencies.get(k)).next();
                stack_vertex.addEdge('has_dependency', version_vertex);
            }
        }
    }
    """

    sqlContext = SQLContext(sc)
    input_data = sc.wholeTextFiles("s3n://" + src_s3_bucket + "/")

    not_null_data = input_data.filter(lambda x: x[1].strip() not in ['null', ''])
    json_formatted = not_null_data.map(lambda x: (x[0], json.loads(x[1])))

    only_npm = json_formatted.filter(lambda x: 'NPM' in extract_ecosystem(x[1]))
    package_versions = only_npm.map(lambda x: (x[0], map_package_versions(x[1])))
    non_fail_package_versions = package_versions.map(
        lambda x: (x[0], [pv for pv in x[1] if pv[0] != 'fail' and pv[1] != 'fail']))
    non_empty_package_versions = non_fail_package_versions.filter(lambda x: len(x[1]) > 0)

    transactions = non_empty_package_versions.map(lambda x: ["%s@@%s" % (pv[0], pv[1]) for pv in x[1]])
    unique_transactions = transactions.map(lambda x: list(set(x)))
    truncated_transactions = unique_transactions.map(lambda x: x[:MAX_WIDTH]).cache()
    count_transactions = truncated_transactions.count()
    model = FPGrowth.train(truncated_transactions,
                           minSupport=0.5, numPartitions=truncated_transactions.getNumPartitions())
    rddJsons = model.freqItemsets().map(
        lambda x: freqItemsetToRefStack(x.items, float(x.freq) / float(count_transactions)))
    # rddJsons = rddRefStacks.filter(lambda x: len(x.get('dependencies').items()) > 4 and len(x.get('dependencies').items()) <= 10 )

    # Save packages and versions
    rddVersions = rddJsons.flatMap(lambda x: x.get('dependencies').items())
    dfVersions = rddVersions.toDF().distinct()
    rddGremlinVersions = dfVersions.rdd.map(lambda x: gremlin_str_pkg_version('trial', x[0], x[1]))
    str_gremlin = gremlin_method_insert_pkg_version + ' '.join(rddGremlinVersions.collect())
    fire_gremlin(target_gremlin_server, str_gremlin)

    # Save stacks
    rdd_gremlin_stacks = rddJsons.map(lambda x: gremlin_str_ref_stack(x))
    str_gremlin = gremlin_method_insert_ref_stack + ' '.join(rdd_gremlin_stacks.collect())
    fire_gremlin(target_gremlin_server, str_gremlin)
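fire_gremlin is not included in this example; a minimal sketch, assuming the target is a Gremlin Server HTTP endpoint that accepts a JSON body with a 'gremlin' field:

import requests

def fire_gremlin(target_gremlin_server, query):
    # Hypothetical helper: submit the generated Gremlin script to the server's HTTP endpoint.
    response = requests.post(target_gremlin_server, json={'gremlin': query})
    response.raise_for_status()
    return response.json()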
Example #15
def process_log_data(spark, input_data, output_data):
    """
        Load the log data file and create the songplays, users, and time tables.
        Input: SparkSession,
               input_data: file path to the log data
               output_data: file path for the output parquet files

        Output: Parquet files for the songplays, users, and time tables.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(col("page") == "NextSong")

    # extract columns for users table
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level']
    #drop duplicates
    users_table = users_table.drop_duplicates(subset=['userId'])

    # write users table to parquet files
    users_table = users_table.write.partitionBy('userId').parquet(
        os.path.join(output_data, 'users.parquet'), 'overwrite')
    print("users_table partitioned!")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: tstodatetime(x))
    df = df.withColumn('daytime', get_timestamp(col("ts")))

    # extract columns to create time table
    time_table = df.select(
        col("ts").alias('start_time'),
        year('daytime').alias('year'),
        month('daytime').alias('month'),
        dayofmonth('daytime').alias('day'),
        hour('daytime').alias('hour'),
        weekofyear('daytime').alias('weekofyear'))
    #We are going to partition later in the code!

    # read in song data to use for songplays table
    sqlContext = SQLContext(spark)
    songs_table = sqlContext.read.parquet(
        'data/outputs/song_data/songs.parquet')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df['ts', 'userId', 'level', 'sessionId', 'location',
                         'userAgent', 'song']
    #add artists id and song id by joining with songs_table
    songplays_table = songplays_table.alias('s').join(songs_table.alias('e'),col('e.title') == col('s.song'))\
    .select(col('s.ts').alias('start_time'),
        col('s.userId'),
        col('s.level'),
        col('s.sessionId'),
        col('s.location'),
        col('s.userAgent'),
        col('s.song'),
        col('e.artist_id').alias('artist_id'),
        col('e.song_id').alias('song_id'))
    #add month and year for partitioning later based on those
    time_table_short = time_table['start_time', 'month', 'year']
    songplays_table = songplays_table.alias('s').join(time_table_short.alias('t'),col('t.start_time') == col('s.start_time'))\
    .select(col('s.start_time'),
        col('s.userId'),
        col('s.level'),
        col('s.sessionId'),
        col('s.location'),
        col('s.userAgent'),
        col('s.song'),
        col('s.artist_id'),
        col('s.song_id'),
        col('t.year'),
        col('t.month'),
       )
    # write time table to parquet files partitioned by year and month
    time_table = time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'times.parquet'), 'overwrite')
    print("time_table partitioned!")

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays_table.write.partitionBy(
        'year',
        'month').parquet(os.path.join(output_data, 'songplays.parquet'),
                         'overwrite')
    print("songplays_table partitioned!")
def split(intersections, csv_report):
    logging.info('Splitting started')

    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark initialization')

    sc = SparkContext(cfg.spark.master, 'map_test: split')
    sqlContext = SQLContext(sc)

    logging.info('Source file reading')

    df = sqlContext.read.json(cfg.splitting.source_file)
    df = df.withColumn("Date", F.from_utc_timestamp("eventTime", "UTC"))
    df = df[(df.event != '$set') & (df.event != '$unset')]

    users_with_event_count = df.groupBy(F.col("entityId").alias("user")).count()

    logging.info('Filter users with small number of events')

    min_events = 10
    users_with_few_events = (users_with_event_count
                             .filter("count < %d" % (min_events))
                             .select(F.col("user").alias("user_with_few_events")))
    ndf = df.join(users_with_few_events,
                  F.col("entityId")==F.col("user_with_few_events"),
                  how="left_outer")
    df1 = ndf.filter("user_with_few_events is NULL").drop("user_with_few_events")


    logging.info('Split data into train and test')
    train_df, test_df = split_data(df)
    train_df.coalesce(1).write.format('json').save(cfg.splitting.train_file)
    test_df.coalesce(1).write.format('json').save(cfg.splitting.test_file)

    train_df = train_df.select("entityId", "event", "targetEntityId").cache()
    test_df = test_df.select("entityId", "event", "targetEntityId").cache()

    logging.info('Calculation of different stat metrics of datasets')

    events_by_type = (df
                      .groupBy("event")
                      .count()
                      .select(F.col("event"), F.col("count").alias("count_total"))
                      .toPandas())
    events_by_type_test = (test_df
                           .groupBy("event")
                           .count()
                           .select(F.col("event"), F.col("count").alias("count_test"))
                           .toPandas()
                           .set_index("event"))
    events_by_type_train = (train_df
                            .groupBy("event")
                            .count()
                            .select(F.col("event"), F.col("count").alias("count_train"))
                            .toPandas()
                            .set_index("event"))
    unique_users_by_event = (df
                             .select(F.col("entityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_users_total"))
                             .toPandas()
                             .set_index("event"))
    unique_users_by_event_train = (train_df
                                   .select(F.col("entityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_users_train"))
                                   .toPandas()
                                   .set_index("event"))
    unique_users_by_event_test = (test_df
                                  .select(F.col("entityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_users_test"))
                                  .toPandas()
                                  .set_index("event"))
    unique_items_by_event = (df
                             .select(F.col("targetEntityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_items_total"))
                             .toPandas()
                             .set_index("event"))
    unique_items_by_event_train = (train_df
                                   .select(F.col("targetEntityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_items_train"))
                                   .toPandas()
                                   .set_index("event"))
    unique_items_by_event_test = (test_df
                                  .select(F.col("targetEntityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_items_test"))
                                  .toPandas()
                                  .set_index("event"))

    logging.info('Calculate total counts')

    events = df.count()
    events_train = train_df.count()
    events_test = test_df.count()

    unique_users = df.select("entityId").distinct().count()
    unique_users_train = train_df.select("entityId").distinct().count()
    unique_users_test = test_df.select("entityId").distinct().count()

    unique_items = df.select(F.col("targetEntityId")).distinct().count()
    unique_items_train = train_df.select(F.col("targetEntityId")).distinct().count()
    unique_items_test = test_df.select(F.col("targetEntityId")).distinct().count()

    info_df = events_by_type
    dfs = [unique_users_by_event, unique_items_by_event,
            events_by_type_train, events_by_type_test,
            unique_users_by_event_train, unique_users_by_event_test,
            unique_items_by_event_train, unique_items_by_event_test]

    for data_frame in dfs:
        info_df = info_df.join(data_frame, on="event")

    n_rows, n_cols = info_df.shape

    # totals
    info_df.loc[n_rows] = ['ANY EVENT', events, unique_users, unique_items,
                        events_train, events_test,
                        unique_users_train, unique_users_test,
                        unique_items_train, unique_items_test]

    info_df.insert(4, 'events per user', info_df.iloc[:, 1] / info_df.iloc[:, 2])
    info_df.insert(5, 'events per item', info_df.iloc[:, 1] / info_df.iloc[:, 3])

    info_df = info_df.fillna(0)

    logging.info('Create event stat worksheet')

    reporter.start_new_sheet('Events stat')
    reporter.report(
        ['event', 'event count', 'unique users', 'unique items',
         'events per user', 'events per item',
         'event count train', 'event count test',
         'unique users train', 'unique users test',
         'unique items train', 'unique items test'],
        [column.tolist() for _, column in info_df.iteritems()],
        selected_rows=[next(info_df.iteritems())[1].tolist().index(cfg.testing.primary_event)],
        cfg=cfg)
    reporter.finish_sheet()

    if intersections:
        logging.info('Start intersections calculation')

        reporter.start_new_sheet('Intersections')

        columns_for_matrix = cfg.testing.events

        logging.info('Process train / train user intersection')

        train_train_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("entityId").alias("user"), F.col("event").alias("event_right")).distinct(),
               on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())
        trtru = mk_intersection_matrix(train_train_users, columns_for_matrix)

        reporter.report(
            [''] + list(trtru.columns.values),
            [trtru.index.tolist()] + [column for _, column in trtru.iteritems()],
            title='Train / train user intersection')
        logging.info('Process train / test user intersection')

        train_test_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("entityId").alias("user"), F.col("event").alias("event_right")).distinct(),
               on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtsu = mk_intersection_matrix(train_test_users, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsu.columns.values),
            [trtsu.index.tolist()] + [column for _, column in trtsu.iteritems()],
            title='Train / test user intersection')
        logging.info('Process train / train item intersection')

        train_train_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_right")).distinct(),
               on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtri = mk_intersection_matrix(train_train_items, columns_for_matrix)
        reporter.report(
            [''] + list(trtri.columns.values),
            [trtri.index.tolist()] + [column for _, column in trtri.iteritems()],
            title='Train / train item intersection'
        )
        logging.info('Process train / test item intersection')

        train_test_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_right")).distinct(),
               on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())

        trtsi = mk_intersection_matrix(train_test_items, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsi.columns.values),
            [trtsi.index.tolist()] + [column for _, column in trtsi.iteritems()],
            title='Train / test item intersection'
        )
        reporter.report_config(cfg)

    reporter.finish_document()
    logging.info('Splitting finished successfully')
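split() delegates to a split_data helper that is not part of this snippet; a hedged sketch using a random split (the original may split by time instead):

def split_data(df, test_fraction=0.2, seed=42):
    # Hypothetical helper: randomly split the events into train and test DataFrames.
    train_df, test_df = df.randomSplit([1.0 - test_fraction, test_fraction], seed=seed)
    return train_df, test_df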
from pyspark.sql.session import SparkSession as spark
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from matplotlib import pyplot
import os
sc = SparkContext(appName="Task_usage")
sql_context = SQLContext(sc)

# folder_path ='/mnt/volume/ggcluster/clusterdata-2011-2/task_usage/'
folder_path = '/mnt/volume/ggcluster/spark-2.1.1-bin-hadoop2.7/thangbk2209/Jobid-NumberOfTask/'

dataSchema = StructType([
    StructField('JobId', LongType(), True),
    StructField('taskIndex', LongType(), True)
])
file_name = 'JobID-numberOfTask.csv'
df = (sql_context.read.format('com.databricks.spark.csv').schema(
    dataSchema).load("%s%s" % (folder_path, file_name)))
df.createOrReplaceTempView("dataFrame")
sumCPUUsage = sql_context.sql(
    "SELECT JobId, sum(taskIndex) as numberOfTask from dataFrame group by Jobid order by numberOfTask DESC"
)
# sumCPUUsage.show(5000)
schema_df = ["Jobid", "numberOfTaskIndex"]
sumCPUUsage.toPandas().to_csv('thangbk2209/Predictive_Scaling/results/%s' %
                              (file_name),
                              index=False,
                              header=None)
# sumCPUUsage.write.save("results/test.csv", format="csv", columns=schema_df)
Example #18
import configparser

from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql import SparkSession, SQLContext


# SparkSession singleton generator needed to operate on Dataframes within stream
def getSparkSessionInstance(sparkConf):
    if ('sparkSessionSingletonInstance' not in globals()):
        globals()['sparkSessionSingletonInstance'] = SparkSession\
            .builder\
            .config(conf=sparkConf)\
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']


spark = SparkSession.builder.appName("Realtime Sensor Analytics").getOrCreate()
sc = spark.sparkContext
sqc = SQLContext(sc)

# Read in Kudu information
config = configparser.ConfigParser()
config.read('config.ini')
kuduMaster = config.get('hadoop', 'kudu_masters')
kuduPort = config.get('hadoop', 'kudu_port')
kafkaTopic = config.get('hadoop', 'kafka_topic')
kafkaBroker = config.get('hadoop', 'kafka_brokers') + ':' + '9092'

# Read in Tag ID/Entity mappings from Kudu to join with sensor data
tag_mappings = sqc.read.format('org.apache.kudu.spark.kudu')\
    .option('kudu.master',kuduMaster)\
    .option('kudu.table','tag_mappings')\
    .load()
Example #19
from pyspark.sql.types import DataType, IntegerType
# mllib for clustering
from pyspark.mllib.linalg import Vectors, DenseMatrix
from pyspark.mllib.clustering import GaussianMixture
# JSON
import json
import collections
# numpy
from numpy.testing import assert_equal
import numpy as np
from shutil import rmtree
from numpy import array
from datetime import timedelta, date

if __name__ == "__main__":
    sqlsc = SQLContext(sc)
    MYSQL_USERNAME = ""
    MYSQL_PWD = ""
    MYSQL_CONNECTION_URL = "jdbc:mysql://localhost:3306/telegramdb?autoReconnect=true&useSSL=false" + \
                           "&user=" + MYSQL_USERNAME + "&password=" + MYSQL_PWD
    tag_df = sqlsc.read.format("jdbc").options(
        url=MYSQL_CONNECTION_URL,
        dbtable="information",
        driver="com.mysql.jdbc.Driver").load()
    tags = sqlsc.read.format("jdbc").options(
        url=MYSQL_CONNECTION_URL,
        dbtable="tags",
        driver="com.mysql.jdbc.Driver").load()
    # columns
    tags = tag_df.filter(
        tag_df.high == 'IT').map(lambda line: line.low).collect()
Example #20
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
path = "eventLogging/local-1588781816130.inprogress"
sqlConf = SQLContext(sc)

df = sqlConf.read.json(path)

df.createOrReplaceTempView("Events")

# The Number of events so far
print(f'The number of events So far = {df.count()}')
dist_events=df.groupBy('Event').count()
print(f'The number of distinct events So far = {dist_events.count()}')

event_list=[]
event_timestamp=dict()
for event in dist_events.collect():
    
    event_dict=event.asDict()
    
    event_list.append(event_dict['Event'])

    event_timestamp[event_dict['Event']] = event_dict['Event']
     

open_event=[]

for event in event_list:
    if event[-5:]=='Start':
Example #21
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']
Example #22
    with open(output, 'w') as outfile:  #write out into output file
        for lst in final:
            for x in lst:
                if len(x) == 1:
                    outfile.write("'")
                    outfile.write(x[0])
                    outfile.write("'")
                    outfile.write('\n')
                else:
                    for i in x:
                        outfile.write("'")
                        outfile.write(i)
                        outfile.write("'")
                        if i != x[-1]:
                            outfile.write(', ')
                    outfile.write('\n')
    outfile.close()
    end_time = datetime.datetime.now()
    difference1 = end_time - start_time
    difference_duration1 = round(difference1.total_seconds(), 2)
    print('Duration: ' + str(difference_duration1) + ' seconds')


if __name__ == "__main__":
    start_time = datetime.datetime.now()
    input_file = sys.argv[1]
    output = sys.argv[2]
    gf = SparkContext.getOrCreate()
    sqlContext = SQLContext(gf)
    Graph()

import os
from os.path import abspath

from pyspark.sql import SparkSession, SQLContext

warehouse_location = abspath('spark-warehouse')
os.environ[
    "PYSPARK_SUBMIT_ARGS"] = '--jars /data/jupyter/kudu-spark2_2.11-1.8.0.jar pyspark-shell'

sc = SparkSession.builder \
    .appName("Special Day Weekly") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("spark.num.executors", '10') \
    .config("spark.executor.memory", '15G') \
    .config("spark.executor.cores", '20') \
    .enableHiveSupport() \
    .getOrCreate()

sqlc = SQLContext(sc)

sql_query = f"""
select 
    trxn.item_id,
    trxn.sub_id, 
    trxn.store_code,
    trxn.date_key,
    trxn.daily_sales_sum,
    cal.week_key
from {database_name}.forecast_sprint4_add_dm_to_daily trxn
join ods.dim_calendar cal on
 trxn.date_key >= '{start_date}' and trxn.date_key <'{end_date}'
 and trxn.date_key = cal.date_key
""".replace("\n", " ")
Example #24
def getSqlContextInstance(sparkContext):
    """Lazily instantiated global instance of SQLContext
    Below from https://spark.apache.org/docs/1.5.2/streaming-programming-guide.html#dataframe-and-sql-operations."""
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']
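A hedged usage sketch in the spirit of the linked streaming guide: inside foreachRDD, fetch the lazily created SQLContext from the RDD's SparkContext and turn the micro-batch into a DataFrame (assumes an RDD of Rows; the "events" view and the query are illustrative only):

def process(time, rdd):
    if not rdd.isEmpty():
        sqlContext = getSqlContextInstance(rdd.context)
        df = sqlContext.createDataFrame(rdd)
        df.registerTempTable("events")
        sqlContext.sql("SELECT COUNT(*) AS n FROM events").show()

# stream.foreachRDD(process)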
Example #25
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vectors
import numpy as np
import random
from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.set("spark.executor.cores", '3')
conf.set("spark.executor.instances", '1')
conf.set("spark.executor.memory", "1g")
conf.set("spark.locality.wait", "0")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)


def shuffle_csv(csv_file):
    lines = open(csv_file).readlines()
    random.shuffle(lines)
    open(csv_file, 'w').writelines(lines)


def load_data_frame(csv_file, shuffle=True, train=True):
    if shuffle:
        shuffle_csv(csv_file)
    data = sc.textFile(
        '/home/minglu/dist_spark/data/' + csv_file
    )  # This is an RDD, which will later be transformed to a data frame
    data = data.filter(lambda x: x.split(',')[0] != 'label').map(
        lambda line: line.split(','))
Example #26
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, max as max_

import load_data
rdd_join = load_data.rdd_join
sqlContext = SQLContext(load_data.sc)
schema = StructType([
    StructField("trade_dt", StringType(), True),
    StructField("rec_type", StringType(), True),
    StructField("symbol", StringType(), True),
    StructField("exchange", StringType(), True),
    StructField("event_tm", StringType(), True),
    StructField("event_seq_nb", StringType(), True),
    StructField("arrival_tm", StringType(), True),
    StructField("trade_pr", StringType(), True),
    StructField("bid_pr", StringType(), True),
    StructField("bid_size", StringType(), True),
    StructField("ask_pr", StringType(), True),
    StructField("ask_size", StringType(), True),
    StructField("execution_id", StringType(), True),
    StructField("trade_size", StringType(), True)
])

# Creating a dataframe for all data
common_df = sqlContext.createDataFrame(rdd_join, schema)

# Working with TRADES data:
trade = common_df.filter(common_df.rec_type == "T")

# Selecting most important columns to save space
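The snippet breaks off after the comment; a hedged sketch of the column selection it describes, keeping only the trade-relevant fields from the schema declared above (the exact subset is an assumption):

trade = trade.select('trade_dt', 'symbol', 'exchange', 'event_tm',
                     'event_seq_nb', 'arrival_tm', 'trade_pr', 'trade_size')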
Example #27
        enumerator = args[7]
    else:
        k = 10
        w = 0.5
        alpha = 6
        b_update = True
        debug = True
        loss_type = 0
        enumerator = "union"

    conf = SparkConf().setAppName("salary_test").setMaster('local[2]')
    num_partitions = 2
    model_type = "regression"
    label = 'salary'
    sparkContext = SparkContext(conf=conf)
    sqlContext = SQLContext(sparkContext)
    fileRDD = sparkContext.textFile('salaries.csv', num_partitions)
    header = fileRDD.first()
    head_split = header.split(",")
    head_split[0] = '_c0'
    fileRDD = fileRDD.filter(lambda line: line != header)
    data = fileRDD.map(lambda row: row.split(","))
    dataset_df = sqlContext.createDataFrame(data, head_split)

    cat_features = ["rank", "discipline", "sincephd_bin", "service_bin", "sex"]
    # initializing stages of main transformation pipeline
    stages = []
    dataset_df = dataset_df.drop('_c0')
    dataset_df = dataset_df.withColumn("id", sf.monotonically_increasing_id())
    # binning numeric features by a local binner udf function (specified for the current dataset if needed)
    dataset_df = dataset_df.withColumn('sincephd_bin',
    print('Trying to get spark connection...')
    warehouse_location = os.path.abspath('spark-warehouse')

    spark = (SparkSession \
        .builder \
        .appName("Qiang (Charles)") \
        .config("spark.sql.warehouse.dir", warehouse_location) \
        .config("spark.num.executors", '15') \
        .config("spark.executor.memory", '20G') \
        .config("spark.executor.cores", '25') \
        .enableHiveSupport() \
        .getOrCreate()
    )
    print('Spark connection created!')

sqlc = SQLContext(spark)

# /* ===================== getopt added ==========================
# example: python3.6 ./sqls/9.7grouped_to_be_shipment_groupped_0729.py -d {config['database']}
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--database_name", help="database name")
args = parser.parse_args()

print(args.database_name)

config = {}
config['database'] = args.database_name

# config = {}
import pandas as pd
import numpy as np
import re
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql import functions as func
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import when
import ds_config
import logging
if __name__ == "__main__":
	logging.getLogger("py4j").setLevel(logging.ERROR)
	conf = SparkConf().setAppName("preprocess_04")
	sc = SparkContext(conf=conf)
	sqlContext = SQLContext(sc)
	df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").option("delimiter", '|').load(ds_config.preprocess_02_output_01)
	gsdf = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").option("delimiter", ds_config.gs_customer_prof_after_delim).load(ds_config.gs_customer_prof_after)
	gsdf.registerTempTable("GS_SUMMARY")
	selected = sqlContext.sql("SELECT analytic_id, foreigner_flag, mobile_region, service_month, urbanflag, billing_region from GS_SUMMARY")
	present_df = df.join(selected, ["analytic_id"], "left_outer")
	present_df.registerTempTable("present_df")
	service_month = sqlContext.sql("SELECT analytic_id, service_month from present_df")
	means = service_month.agg( *[func.mean(c).alias(c) for c in service_month.columns if c != 'analytic_id']).toPandas().to_dict('records')[0]
	means['foreigner_flag'] = 'N'
	means['mobile_region'] = 'NA'
	means['urbanflag'] = 'Y'
	means['billing_region'] = 'NA'
	means['service_month'] = 12
	present_df_eliminateGSna = present_df.fillna(means)
	present_df_eliminateGSna.registerTempTable("present_df_eliminateGSna")
    df = spark.createDataFrame(mapped, schema=schema).distinct()

    # Extract child and parent domains so we can easily use asin filtering after loading the parquet file
    df = df.select(
        '*', url_to_domain('childTLD').alias('childDomain'), url_to_domain('parentTLD').alias('parentDomain'))

    df.write.format("parquet").saveAsTable(output_file, path="../data/ETLout/{}".format(output_file))


if __name__ == '__main__':
    conf = SparkConf().setAll((
        ("spark.task.maxFailures", "10"),
        ("spark.locality.wait", "20s"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ))

    rec = re.compile(r"(https?://)?(www\.)?")  # Regex to clean parent/child links
    sc = SparkContext(appName='etl-common-crawl', conf=conf)
    spark = SQLContext(sparkContext=sc)

    parser = argparse.ArgumentParser(description='Perform ETL on CommonCrawl')
    parser.add_argument('input', type=str,  help='Input path')
    parser.add_argument('output', type=str, help='Output path')
    parser.add_argument('file_type', type=str, help='file or s3')
    parser.add_argument('crawl_path', type=str, help='file path or bucket name in case of s3')

    args = parser.parse_args()

    main(args.input, args.output, args.file_type, args.crawl_path)