def gen_phone_battery_data() -> object:
    """
    Create a pyspark dataframe with sample phone battery data

    Returns:
        DataFrame: pyspark dataframe object with columns: ["timestamp", "localtime", "battery_level", "version", "user"]

    """
    column_name = [
        "timestamp", "localtime", "battery_level", "version", "user"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    tmp = 1
    sample = 100
    sqlContext = get_or_create_sc("sqlContext")
    for row in range(1000, 1, -1):
        tmp += 1
        if tmp == 100:
            sample = sample - 1
            tmp = 1
        timestamp = timestamp + timedelta(0, 1)
        localtime = timestamp + timedelta(hours=5)
        sample_data.append((timestamp, localtime, sample, 1,
                            "bfb2ca0c-e19c-3956-9db2-5459ccadd40c"))
    df = sqlContext.createDataFrame(sample_data, column_name)
    return df
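A minimal usage sketch for the helper above (assuming the function is importable and Spark is configured as in the other examples in this listing):

df = gen_phone_battery_data()
df.show(5, truncate=False)   # peek at the first few one-second battery samples
print(df.count())            # 999 rows generated by the loop above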
Example #2
def gen_phone_battery_data(CC, user_id, stream_name) -> object:
    """
    Create pyspark dataframe with some sample phone battery data
    Returns:
        DataFrame: pyspark dataframe object with columns: ["timestamp", "battery_level", "version", "user"]

    """
    column_name = [
        "timestamp", "localtime", "user", "version", "battery_level"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    tmp = 1
    sample = 100
    sqlContext = get_or_create_sc("sqlContext")
    for row in range(1000, 1, -1):
        tmp += 1
        if tmp == 100:
            sample = sample - 1
            tmp = 1
        timestamp = timestamp + timedelta(0, 1)
        localtime = timestamp - timedelta(hours=5)
        sample_data.append((timestamp, localtime, user_id, 1, sample))
    df = sqlContext.createDataFrame(sample_data, column_name)
    metadata = gen_phone_battery_metadata(stream_name=stream_name)
    ds = DataStream(df, metadata)
    CC.save_stream(ds)
def gen_phone_battery_data2() -> object:
    """
    Create a pyspark dataframe with sample phone battery data

    Returns:
        DataFrame: pyspark dataframe object with columns: ["timestamp", "battery_level", "bat2", "version", "user"]

    """
    column_name = ["timestamp", "battery_level", "bat2", "version", "user"]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    tmp = 1
    sample = 100
    sample2 = 70
    sqlContext = get_or_create_sc("sqlContext")
    for row in range(1000, 1, -1):
        tmp += 1
        if tmp == 100:
            sample = sample - 1
            sample2 = sample2 - 2
            tmp = 1
        timestamp = timestamp + timedelta(0, 1)
        sample_data.append((timestamp, sample, sample2, 1,
                            "dfce1e65-2882-395b-a641-93f31748591b"))
    df = sqlContext.createDataFrame(sample_data, column_name)
    return df
def run():
    """
    This example:
     - Makes calls to the CerebralCortex-APIServer to:
        - Authenticate a user
        - Register a new stream (`accelerometer--org.md2k.phonesensor--phone`)
        - Upload sample data
     - Creates a PySpark-Kafka direct stream
     - Reads parquet data and converts it into a pandas dataframe
     - Adds gaussian noise to the sample data (a minimal sketch of this step follows the function)
     - Stores the noisy data as a new stream
     - Retrieves and prints the noisy/clean data streams
    """

    # upload sample data and publish messages on Kafka
    #rest_api_client("http://0.0.0.0:8089/")

    # create cerebralcortex object
    cc_config_path = "../../conf/"
    CC = Kernel(cc_config_path, enable_spark_ui=True)
    sample_stream_name = "accelerometer--org.md2k.phonesensor--phone"

    upload_stream_data(
        "http://localhost/", "demo", "demo", sample_stream_name,
        "../../resources/sample_data/msgpack_files/phone_accel.msgpack.gz")

    # raise Exception
    if CC.config["messaging_service"] == "none":
        raise Exception(
            "Messaging service is disabled (none) in cerebralcortex.yml. Please update configs."
        )

    # Kafka Consumer Configs
    spark_context = get_or_create_sc(type="sparkContext")

    ssc = StreamingContext(spark_context,
                           int(CC.config["kafka"]["ping_kafka"]))
    kafka_files_stream = CC.MessagingQueue.create_direct_kafka_stream(
        "filequeue", ssc)
    if kafka_files_stream is not None:
        kafka_files_stream.foreachRDD(
            lambda rdd: iterate_on_rdd(rdd, cc_config_path))

    ssc.start()
    ssc.awaitTermination(timeout=15)
    ssc.stop()

    CC = Kernel(cc_config_path, enable_spark_ui=True)
    print("*" * 15, "CLEAN DATA", "*" * 15)
    ds_clean = CC.get_stream(stream_name=sample_stream_name)
    ds_clean.show(5, truncate=False)

    print("*" * 15, "NOISY DATA", "*" * 15)
    ds_noise = CC.get_stream(stream_name=sample_stream_name +
                             "_gaussian_noise")
    ds_noise.show(5, truncate=False)
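The gaussian-noise step referenced in the docstring happens inside iterate_on_rdd, which is not shown in this listing. A minimal sketch of the idea on a pandas dataframe, assuming numeric sensor columns named x, y and z, could look like this:

import numpy as np
import pandas as pd

def add_gaussian_noise(pdf: pd.DataFrame, columns=("x", "y", "z"), sigma=0.01) -> pd.DataFrame:
    """Return a copy of pdf with zero-mean gaussian noise added to the given columns."""
    noisy = pdf.copy()
    for col in columns:
        noisy[col] = noisy[col] + np.random.normal(loc=0.0, scale=sigma, size=len(noisy))
    return noisy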
Example #5
def main():
    date_format = '%Y%m%d'

    start_date = '20171001'
    #start_date = '20180401'
    start_date = datetime.strptime(start_date, date_format)
    end_date = '20180530'
    end_date = datetime.strptime(end_date, date_format)
    CC_CONFIG_FILEPATH = "/cerebralcortex/code/config/cc_starwars_configuration.yml"

    all_days = []
    while True:
        all_days.append(start_date.strftime(date_format))
        start_date += timedelta(days=1)
        if start_date > end_date: break

    with open('users.txt', 'r') as f:
        usrs = f.read()
    userids = [x.strip() for x in usrs.split(',')]

    #userids = ['20940a76-976b-446e-b173-89237835ae6b']

    #  20180401 20940a76-976b-446e-b173-89237835ae6b

    print("Number of users ", len(userids))
    num_cores = 24

    useSpark = True
    #useSpark = False
    if useSpark:
        spark_context = get_or_create_sc(type="sparkContext")
        parallelize_per_day = []
        for usr in userids:
            for day in all_days:
                parallelize_per_day.append((usr, [day]))

        shuffle(parallelize_per_day)
        print(len(parallelize_per_day))
        rdd = spark_context.parallelize(parallelize_per_day,
                                        len(parallelize_per_day))
        try:
            results = rdd.map(lambda user_day: analyze_user_day(
                user_day[0], user_day[1], CC_CONFIG_FILEPATH))
            results.count()

            spark_context.stop()
        except Exception as e:
            print(e)
    else:
        for usr in userids:
            analyze_user_day(usr, all_days, CC_CONFIG_FILEPATH)
Example #6
def gen_battery_data(CC, study_name, user_id, stream_name, version=1, hours=1):
    """
    Create a pyspark dataframe with sample phone battery data and save it as a DataStream.

    Columns: ["timestamp", "localtime", "user", "version", "level", "voltage", "temperature"]

    """
    column_name = [
        "timestamp", "localtime", "user", "version", "level", "voltage",
        "temperature"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    sample = 100
    voltage = 3700
    temperature = 70
    sqlContext = get_or_create_sc("sqlContext")
    total_data = hours * 60 * 60
    for row in range(total_data, 1, -1):
        sample = float(sample - 0.01)
        timestamp = timestamp + timedelta(0, 1)
        localtime = timestamp - timedelta(hours=5)
        sample_data.append((timestamp, localtime, user_id, version, sample,
                            voltage, temperature))
    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_study_name(study_name).set_name(stream_name).set_description("battery sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "version of the data")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("level").set_type("float").set_attribute("description", "current battery charge")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("voltage").set_type("float").set_attribute("description", "current battery voltage level")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("temperature").set_type("float").set_attribute("description", "current battery temperature")) \
        .add_module(
        ModuleMetadata().set_name("battery").set_version("1.2.4").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    ds = DataStream(df, stream_metadata)
    CC.save_stream(ds)
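A minimal roundtrip sketch for the helper above (the Kernel construction mirrors the docstring example later in this listing; the stream name is illustrative):

CC = Kernel(cc_configs="default", study_name="default", new_study=True)
gen_battery_data(CC, study_name="default",
                 user_id="00000000-afb8-476e-9872-6472b4e66b68",
                 stream_name="battery--org.md2k.phonesensor--phone")
CC.get_stream(stream_name="battery--org.md2k.phonesensor--phone").show(5, truncate=False)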
def process_features(feature_list, all_users, all_days, num_cores=1):
    '''
    This method runs the processing pipeline for each of
    the features in the list.
    '''
    for module in feature_list:
        if num_cores > 1:
            #num_cores *= 4

            print('Driver: Spark job', module)
            spark_context = get_or_create_sc(type="sparkContext")
            if 'core.feature.gps.gps' == str(module) \
                or 'sleep_duration_analysis' in str(module) \
                or 'office_time' in str(module) \
                or 'phone_screen_touch_features' in str(module) \
                or 'socialjetlag' in str(module) \
                or 'gps_location_daywise' in str(module):
                '''
                # FIXME # TODO Currently only the GPS feature computes features over a
                range of days. Need to find a better approach if other modules
                also work over a range of days.
                '''
                print('-' * 120)
                print('MODULE parallelized on only users', module)
                rdd = spark_context.parallelize(all_users, num_cores)
                results = rdd.map(lambda user: process_feature_on_user(
                    user, module, all_days, cc_config_path))
                results.count()
            else:
                print('MODULE', module)
                parallelize_per_day = []
                for usr in all_users:
                    for day in all_days:
                        parallelize_per_day.append((usr, [day]))

                shuffle(parallelize_per_day)
                rdd = spark_context.parallelize(parallelize_per_day,
                                                len(parallelize_per_day))
                results = rdd.map(lambda user_day: process_feature_on_user(
                    user_day[0], module, user_day[1], cc_config_path))
                results.count()

            spark_context.stop()
        else:
            print('Driver: single threaded')
            for user in all_users:
                process_feature_on_user(user, module, all_days, cc_config_path)
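process_feature_on_user is not defined in this listing; a hypothetical stub, shown only to make the signature expected by process_features explicit, might look like:

def process_feature_on_user(user, module, days, cc_config_path):
    # Hypothetical placeholder: a real implementation would load the feature
    # module and compute its features for this user over the given days,
    # using the CerebralCortex configuration found at cc_config_path.
    print("processing", module, "for user", user, "on days", days)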
def gen_accel_gyro_data(CC, study_name, user_id, stream_name, version=1, hours=1, frequency=32):
    """
    Create a pyspark dataframe with sample accelerometer/gyroscope data and save it as a DataStream.

    Columns: ["timestamp", "localtime", "user", "version", "x", "y", "z"]

    """
    column_name = ["timestamp", "localtime", "user" ,"version", "x", "y", "z"]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)

    sqlContext = get_or_create_sc("sqlContext")
    total_samples = (hours * 60 * 60) * frequency
    for row in range(total_samples):
        x = round(random.uniform(-2, 2), 8)
        y = round(random.uniform(-2, 2), 8)
        z = round(random.uniform(-2, 2), 8)
        # advance by one sampling interval so the data actually spans `hours` at `frequency` Hz
        timestamp = timestamp + timedelta(milliseconds=1000.0 / frequency)
        localtime = timestamp - timedelta(hours=5)
        sample_data.append((timestamp, localtime, user_id, version, x, y, z))
    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_study_name(study_name).set_name(stream_name).set_description("wrist watch sensor sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "version of the data")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("x").set_type("float").set_attribute("description", "x-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("y").set_type("float").set_attribute("description", "y-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("z").set_type("float").set_attribute("description", "z-axis")) \
        .add_module(
        ModuleMetadata().set_name("phone.sensors").set_version("1.2.4").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    ds = DataStream(df, stream_metadata)
    CC.save_stream(ds)
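A small downstream sketch using standard PySpark column expressions, assuming df is a Spark dataframe with x, y and z columns like the one built inside gen_accel_gyro_data (the magnitude column name is illustrative):

from pyspark.sql import functions as F

# vector magnitude of the three axes for each sample
df_mag = df.withColumn(
    "magnitude",
    F.sqrt(F.col("x") ** 2 + F.col("y") ** 2 + F.col("z") ** 2))
df_mag.show(5, truncate=False)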
def gen_phone_battery_data(user_id) -> object:
    """
    Create a pyspark dataframe with sample phone battery data

    Returns:
        DataFrame: pyspark dataframe object with columns: ["timestamp", "battery_level", "version", "user"]

    """
    column_name = ["timestamp", "battery_level", "version", "user"]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    tmp = 1
    sample = 100
    sqlContext = get_or_create_sc("sqlContext")
    for row in range(1000, 1, -1):
        tmp += 1
        if tmp == 100:
            sample = sample - 1
            tmp = 1
        timestamp = timestamp + timedelta(0, 1)
        sample_data.append((timestamp, sample, 1, user_id))
    df = sqlContext.createDataFrame(sample_data, column_name)
    return df
Example #10
def run():
    selected_participants = [
        "622bf725-2471-4392-8f82-fcc9115a3745",
        "d3d33d63-101d-44fd-b6b9-4616a803225d",
        "c1f31960-dee7-45ea-ac13-a4fea1c9235c",
        "7b8358f3-c96a-4a17-87ab-9414866e18db",
        "8a3533aa-d6d4-450c-8232-79e4851b6e11",
        "e118d556-2088-4cc2-b49a-82aad5974167",
        "260f551d-e3c1-475e-b242-f17aad20ba2c",
        "dd13f25f-77a0-4a2c-83af-bb187b79a389",
        "17b07883-4959-4037-9b80-dde9a06b80ae",
        "5af23884-b630-496c-b04e-b9db94250307",
        "61519ad0-2aea-4250-9a82-4dcdb93a569c",
        "326a6c55-c963-42c2-bb8a-2591993aaaa2",
        "a54d9ef4-a46a-418b-b6cc-f10b49a946ac",
        "2fb5e890-afaf-428a-8e28-a7c70bf8bdf1",
        "c93a811e-1f47-43b6-aef9-c09338e43947",
        "9e4aeae9-8729-4b0f-9e84-5c1f4eeacc74",
        "479eea59-8ad8-46aa-9456-29ab1b8f2cb2",
        "b4ff7130-3055-4ed1-a878-8dfaca7191ac",
        "fbd7bc95-9f42-4c2c-94f4-27fd78a7273c",
        "bbc41a1e-4bbe-4417-a40c-64635cc552e6",
        "82a921b9-361a-4fd5-8db7-98961fdbf25a",
        "66a5cdf8-3b0d-4d85-bdcc-68ae69205206",
        "d4691f19-57be-44c4-afc2-5b5f82ec27b5",
        "136f8891-af6f-49c1-a69a-b4acd7116a3c"
    ]
    parser = argparse.ArgumentParser(
        description='CerebralCortex Kafka Message Handler.')
    parser.add_argument("-c",
                        "--config_filepath",
                        help="Configuration file path",
                        required=True)
    # parser.add_argument("-d", "--data_dir", help="Directory path where all the gz files are stored by API-Server",
    #                     required=True)

    parser.add_argument(
        "-bd",
        "--batch_duration",
        help="How frequently Kafka messages should be checked (duration in seconds)",
        default="5",
        required=False)

    parser.add_argument(
        "-mbs",
        "--mydb_batch_size",
        help="Total number of messages to fetch from MySQL for processing.",
        default="5000",
        required=False)

    parser.add_argument(
        "-participants",
        "--participants",
        help="Whether run data replay on all participants or select one.",
        default="all",
        required=False)

    args = vars(parser.parse_args())

    participants = args["participants"]
    mydb_batch_size = int(args["mydb_batch_size"])
    config_filepath = str(args["config_filepath"]).strip()
    batch_duration = int(args["batch_duration"])
    # data_path = str(args["data_dir"]).strip()
    # if (data_path[-1] != '/'):
    #     data_path += '/'

    # Kafka Consumer Configs
    spark_context = get_or_create_sc(type="sparkContext")
    spark_context.setLogLevel("WARN")
    consumer_group_id = "md2k-test"

    CC = CerebralCortex(config_filepath)
    broker = str(CC.config["kafkaserver"]["host"]) + ":" + str(
        CC.config["kafkaserver"]["port"])
    data_replay_using = str(CC.config["data_replay"]["replay_type"])

    data_path = CC.config["data_replay"]["data_dir"]
    if data_replay_using == "mydb":
        for replay_batch in CC.SqlData.get_replay_batch(
                record_limit=mydb_batch_size):
            new_replay_batch = []
            #get records from mysql and process (skip kafka)
            if participants == "all":
                new_replay_batch = replay_batch
            else:
                for rb in replay_batch:
                    if rb["owner_id"] in selected_participants:
                        new_replay_batch.append(rb)
            mysql_batch_to_db(spark_context, new_replay_batch, data_path,
                              config_filepath)

    else:
        ssc = StreamingContext(spark_context, batch_duration)
        kafka_files_stream = spark_kafka_consumer(["filequeue"], ssc, broker,
                                                  consumer_group_id, CC)
        if kafka_files_stream is not None:
            kafka_files_stream.foreachRDD(
                lambda rdd: kafka_file_to_json_producer(
                    rdd, data_path, config_filepath, CC))

        ssc.start()
        ssc.awaitTermination()
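A usage sketch for the command-line entry point above (the script filename is assumed; the flags match the argparse definitions):

# python kafka_data_replay.py \
#     -c /path/to/cc_configuration.yml \
#     -mbs 5000 \
#     -bd 5 \
#     -participants all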
Example #11
                CC, config)


if __name__ == '__main__':
    # create and load CerebralCortex object and configs
    parser = argparse.ArgumentParser(
        description='CerebralCortex Kafka Message Handler.')
    parser.add_argument("-cc",
                        "--cc_config_filepath",
                        help="Configuration file path",
                        required=True)
    parser.add_argument("-mdc",
                        "--mdebugger_config_filepath",
                        help="mDebugger configuration file path",
                        required=True)
    args = vars(parser.parse_args())

    CC = CerebralCortex(args["cc_config_filepath"])

    # load data diagnostic configs
    md_config = Configuration(args["mdebugger_config_filepath"]).config

    # get/create spark context
    spark_context = get_or_create_sc(type="sparkContext")

    # run for one participant
    # DiagnoseData().one_user_data(["cd7c2cd6-d0a3-4680-9ba2-0c59d0d0c684"], md_config, CC, spark_context)

    # run for all the participants in a study
    all_users_data("mperf", md_config, CC, spark_context)
Example #12
    def __init__(self,
                 configs_dir_path: str = "",
                 cc_configs: dict = None,
                 study_name: str = "default",
                 new_study: bool = False,
                 enable_spark: bool = True,
                 enable_spark_ui=False):
        """
        CerebralCortex constructor

        Args:
            configs_dir_path (str): Directory path of cerebralcortex configurations.
            cc_configs (dict or str): if set to "default", all default configs are loaded. Alternatively, provide a dict of the configs you want to set as a param
            study_name (str): name of the study. If there is no specific study, pass study_name="default"
            new_study (bool): create a new study with study_name if it does not exist
            enable_spark (bool): enable spark
            enable_spark_ui (bool): enable spark ui
        Raises:
            ValueError: If neither (or both) of configs_dir_path and cc_configs is provided.
        Examples:
            >>> CC = Kernel(cc_configs="default", study_name="default")
            >>> # if you want to change any of the configs, pass cc_configs as dict with new configurations
            >>> updated_cc_configs = {"nosql_storage": "filesystem", "filesystem_path": "/path/to/store/data/"}
            >>> CC = Kernel(cc_configs=updated_cc_configs, study_name="default")
            >>> # for complete configs, have a look at default configs at: https://github.com/MD2Korg/CerebralCortex-Kernel/blob/3.3/cerebralcortex/core/config_manager/default.yml
        """
        try:

            if not os.getenv("PYSPARK_PYTHON"):
                os.environ["PYSPARK_PYTHON"] = os.popen(
                    'which python3').read().replace("\n", "")
            if not os.getenv("PYSPARK_DRIVER_PYTHON"):
                os.environ["PYSPARK_DRIVER_PYTHON"] = os.popen(
                    'which python3').read().replace("\n", "")
        except:
            raise Exception(
                "Please set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON environment variable. For example, export PYSPARK_DRIVER_PYTHON=/path/to/python/dir"
            )

        try:
            if not os.getenv("SPARK_HOME"):
                import pyspark
                spark_installation_path = os.path.dirname(pyspark.__file__)
                import findspark
                findspark.init(spark_installation_path)
        except:
            raise Exception("Set SPARK_HOME environment variable.")

        if not configs_dir_path and not cc_configs:
            raise ValueError("Please provide configs_dir_path or cc_configs.")
        elif configs_dir_path and cc_configs:
            raise ValueError("Provide only configs_dir_path OR cc_configs.")

        self.version = __version__
        self.config_filepath = configs_dir_path
        self.study_name = study_name
        os.environ["STUDY_NAME"] = study_name
        self.config = Configuration(configs_dir_path, cc_configs).config

        if enable_spark:
            self.sparkContext = get_or_create_sc(
                enable_spark_ui=enable_spark_ui)
            self.sqlContext = get_or_create_sc(type="sqlContext",
                                               enable_spark_ui=enable_spark_ui)
            self.sparkSession = get_or_create_sc(
                type="sparkSession", enable_spark_ui=enable_spark_ui)
        else:
            self.sparkContext = None
            self.sqlContext = None
            self.sparkSession = None

        if self.config["mprov"] == "pennprov":
            os.environ["MPROV_HOST"] = self.config["pennprov"]["host"]
            os.environ["MPROV_USER"] = self.config["pennprov"]["user"]
            os.environ["MPROV_PASSWORD"] = self.config["pennprov"]["password"]
            os.environ["ENABLE_MPROV"] = "True"
        elif self.config["mprov"] == "none":
            os.environ["ENABLE_MPROV"] = "False"
        else:
            raise ValueError(
                "Please check cerebralcortex.yml file. mprov is not properly configured."
            )

        self.new_study = new_study

        if not study_name:
            raise Exception("Study name cannot be None.")

        self.debug = self.config["cc"]["debug"]
        self.logging = CCLogging(self)
        self.logtypes = LogTypes()
        self.SqlData = SqlData(self)
        self.RawData = RawData(self)
        self.TimeSeriesData = None

        warnings.simplefilter('always', DeprecationWarning)

        if not new_study and not self.RawData.is_study():
            raise Exception(
                "Study name does not exist. If this is a new study set new_study param to True"
            )

        if self.config["visualization_storage"] != "none":
            self.TimeSeriesData = TimeSeriesData(self)
                        help="mDebugger configuration file path",
                        required=True)
    parser.add_argument("-sn",
                        "--study_name",
                        help="mDebugger configuration file path",
                        required=True)
    parser.add_argument("-spm",
                        "--spark_master",
                        help="mDebugger configuration file path",
                        required=False)
    args = vars(parser.parse_args())

    CC = CerebralCortex(args["cc_config_filepath"])

    # load data reporting configs
    cr_config = Configuration(args["cc_reporting_config_filepath"]).config
    cc_config_file = args["cc_config_filepath"]
    # get/create spark context
    if args["spark_master"]:
        spark_context = get_or_create_sc(type="sparkContext",
                                         master=args["spark_master"])
    else:
        spark_context = get_or_create_sc(type="sparkContext")

    # run for all the participants in a study
    #all_users_data("mperf", md_config, CC, spark_context)

    #TESTING
    all_users_data(args["study_name"], cc_config_file, cr_config, CC,
                   spark_context)
Example #14
def gen_location_datastream(user_id, stream_name) -> object:
    """
    Create pyspark dataframe with some sample gps data (Memphis, TN, lat, long, alt coordinates)

    Args:
        user_id (str): id of a user
        stream_name (str): sample gps stream name

    Returns:
        DataStream: datastream object of gps location stream with its metadata

    """
    column_name = [
        "timestamp", "localtime", "user", "version", "latitude", "longitude",
        "altitude", "speed", "bearing", "accuracy"
    ]
    sample_data = []
    timestamp = datetime(2019, 9, 1, 11, 34, 59)
    sqlContext = get_or_create_sc("sqlContext")

    lower_left = [35.079678, -90.074136]
    upper_right = [35.194771, -89.868766]
    alt = [i for i in range(83, 100)]

    for location in range(5):
        lat = random.uniform(lower_left[0], upper_right[0])
        long = random.uniform(lower_left[1], upper_right[1])
        for dp in range(150):
            lat_val = random.gauss(lat, 0.001)
            long_val = random.gauss(long, 0.001)
            alt_val = random.choice(alt)

            speed_val = round(random.uniform(0.0, 5.0), 6)
            bearing_val = round(random.uniform(0.0, 350), 6)
            accuracy_val = round(random.uniform(10.0, 30.4), 6)

            timestamp = timestamp + timedelta(minutes=1)
            localtime = timestamp + timedelta(hours=5)
            sample_data.append(
                (timestamp, localtime, user_id, 1, lat_val, long_val, alt_val,
                 speed_val, bearing_val, accuracy_val))

    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_study_name("default").set_name(stream_name).set_description("GPS sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "version of the data")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("float").set_attribute("description", "gps latitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("float").set_attribute("description", "gps longitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("altitude").set_type("float").set_attribute("description", "gps altitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("speed").set_type("float").set_attribute("description", "speed info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("bearing").set_type("float").set_attribute("description", "bearing info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accuracy").set_type("float").set_attribute("description", "accuracy of gps location")) \
        .add_module(
        ModuleMetadata().set_name("examples.util.data_helper.gen_location_data").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    ds = DataStream(data=df, metadata=stream_metadata)
    return ds
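A minimal usage sketch for the helper above (stream name and user id are illustrative; CC is a Kernel configured as in the other examples):

location_ds = gen_location_datastream(
    user_id="00000000-afb8-476e-9872-6472b4e66b68",
    stream_name="gps--org.md2k.phonesensor--phone")
CC.save_stream(location_ds)
CC.get_stream(stream_name="gps--org.md2k.phonesensor--phone").show(5, truncate=False)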
Example #15
def gen_stress_data(stream_name, spark_df=False):
    data = [
        [0.7, "road", "Driving", "Was Tailgated", "IN_VEHICLE"],
        [0.3, "work", "Job", "Bored / Not enough to do", "STILL"],
        [0.5, "home", "Health", "Physical inability", "STILL"],
        [0.6, "road", "Driving", "Saw a police car", "IN_VEHICLE"],
        [0.38, "work", "Job", "Technology barriers", "STILL"],
        [0.2, "home", "Finance", "Missed payment", "UNKNOWN"],
        [0.9, "work", "Finance", "Unexpected losses", "WALKING"],
        [0.54, "road", "Driving", "Difficulty in navigating", "IN_VEHICLE"],
        [0.79, "work", "Job", "Unpleasant conversation", "ON_FOOT"],
        [0.28, "road", "Health", "My eating habits", "IN_VEHICLE"],
        [
            0.47, "road", "Driving", "Indecision at a traffic intersection",
            "IN_VEHICLE"
        ],
        [0.67, "work", "Job", "Late arrival", "WALKING"],
    ]

    column_name = [
        'user', 'timestamp', 'localtime', 'version', 'start_time', 'end_time',
        'density', 'location', 'stresser_main', 'stresser_sub', 'activity'
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)

    for row in range(20, 1, -1):
        if row > 10:
            user_id = "00000000-afb8-476e-9872-6472b4e66b68"
        else:
            user_id = "b1117354-ce48-4325-b2e3-78b0cc932819"
        timestamp = timestamp + timedelta(
            hours=random.choice([1, 3, 7, 2, 4, 5]))
        localtime = timestamp - timedelta(hours=5)
        start_time = timestamp
        end_time = timestamp + timedelta(
            minutes=random.choice([12, 6, 8, 16, 29, 45, 2, 3, 8]))
        data_vals = random.choice(data)
        sample_data.append([
            user_id, timestamp, localtime, 1, start_time, end_time,
            data_vals[0], data_vals[1], data_vals[2], data_vals[3],
            data_vals[4]
        ])

    stream_metadata = Metadata()
    stream_metadata.set_study_name("default").set_name(stream_name).set_description("Stress episode sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("start_time").set_type("datetime").set_attribute("description", "start time of a stress episode.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("end_time").set_type("datetime").set_attribute("description", "end time of a stress episode.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("density").set_type("float").set_attribute("description", "density of stress")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("location").set_type("string").set_attribute("description", "location where stress episode was captured.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("stresser_main").set_type("string").set_attribute("description", "stressers' main category.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("stresser_sub").set_type("string").set_attribute("description", "stressers' sub category.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("activity").set_type("string").set_attribute("description", "physical activity name")) \
        .add_module(
        ModuleMetadata().set_name("examples.util.data_helper.gen_stress_data").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    if spark_df:
        sqlContext = get_or_create_sc("sqlContext")
        df = sqlContext.createDataFrame(sample_data, column_name)
    else:
        df = pd.DataFrame(sample_data, columns=column_name)

    ds = DataStream(df, stream_metadata)
    return ds
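A minimal usage sketch for the helper above (the stream name is illustrative); with spark_df=True the underlying dataframe is a Spark dataframe, otherwise pandas:

stress_ds = gen_stress_data(stream_name="stress--org.md2k.autosense--autosense", spark_df=True)
stress_ds.show(5, truncate=False)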
def gen_location_datastream(user_id, stream_name) -> object:
    """
    Create pyspark dataframe with some sample gps data (Memphis, TN, lat, long, alt coordinates)

    Args:
        user_id (str): id of a user
        stream_name (str): sample gps stream name

    Returns:
        DataStream: datastream object of gps location stream with its metadata

    """
    column_name = [
        "timestamp", "localtime", "user", "version", "latitude", "longitude",
        "altitude", "speed", "bearing", "accuracy"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    sqlContext = get_or_create_sc("sqlContext")
    lat = [
        35.1247391, 35.1257391, 35.1217391, 35.1117391, 35.1317391, 35.1287391,
        35.5217391
    ]
    long = [
        -89.9750021, -89.9710021, -89.9800021, -89.9670021, -89.9790021,
        -89.9710021, -89.8700021
    ]
    alt = [83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0]
    for dp in range(500):
        lat_val = random.choice(lat)
        long_val = random.choice(long)
        alt_val = random.choice(alt)
        #ts_val = 15094)+(16272882+(dp*1000000))
        speed_val = round(random.uniform(0.0, 5.0), 6)
        bearing_val = round(random.uniform(0.0, 350), 6)
        accuracy_val = round(random.uniform(10.0, 30.4), 6)
        #all_dps = ",".join([ts_val, lat_val, long_val, alt_val, speed_val, bearing_val, accuracy_val])
        timestamp = timestamp + timedelta(minutes=1)
        localtime = timestamp + timedelta(hours=5)
        sample_data.append(
            (timestamp, localtime, user_id, 1, lat_val, long_val, alt_val,
             speed_val, bearing_val, accuracy_val))

    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_version(1).set_description("GPS sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("float").set_attribute("description", "gps latitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("float").set_attribute("description", "gps longitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("altitude").set_type("float").set_attribute("description", "gps altitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("speed").set_type("float").set_attribute("description", "speed info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("bearing").set_type("float").set_attribute("description", "bearing info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accuracy").set_type("float").set_attribute("description", "accuracy of gps location")) \
        .add_module(
        ModuleMetadata().set_name("examples.util.data_helper.gen_location_data").set_version("0.0.1").set_attribute("attribute_key", "attribute_value").set_author(
            "test_user", "test_user@test_email.com"))
    stream_metadata.is_valid()

    return DataStream(data=df, metadata=stream_metadata)