Example #1
def get_cc(hostname):
    """
    Builds an instance of CerebralCortex to suit a known environment.

    Args:
        hostname (str): The hostname of the machine calling for the CerebralCortex instance.

    Returns:
        cc (CerebralCortex): An instance of CerebralCortex configured for the host machine.
    """

    from cerebralcortex.cerebralcortex import CerebralCortex

    if hostname == "cerebralcortex":
        cc = CerebralCortex(
            '/home/vagrant/CerebralCortex-DockerCompose/cc_config_file/cc_vagrant_configuration.yml'
        )

    elif '10dot' in hostname or 'memphis' in hostname:
        cc = CerebralCortex(
            '/cerebralcortex/code/config/cc_starwars_configuration.yml')

    else:
        print("unknownn environment!")
        return None

    return cc
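A possible call site for get_cc (a sketch; using socket.gethostname() to detect the environment is an assumption, not part of the original):

import socket

# hypothetical usage: pick the configuration that matches this machine
cc = get_cc(socket.gethostname())
if cc is None:
    raise RuntimeError('unknown environment, no CerebralCortex configuration selected')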
Example #2
def audit_user_streams(user_id, all_days, cc_config):
    print('X' * 100, cc_config)
    CC = CerebralCortex(cc_config)
    all_user_streams = CC.get_user_streams(user_id)
    userbuf = ''
    for user_stream_key in all_user_streams:
        user_stream = all_user_streams[user_stream_key]

        if 'analysis' not in user_stream['name']:
            continue

        for day in all_days:
            data_points = 0
            for stream_id in user_stream['stream_ids']:
                ds = CC.get_stream(stream_id, user_id, day)
                data_points += len(ds.data)

            buf = '%s\t%s\t%s\t%d\n' % (user_id, user_stream['name'], str(day),
                                        data_points)
            userbuf += buf

    out_dir = '/tmp/data_audit'
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    file_path = os.path.join(out_dir, user_id)
    with open(file_path, 'w') as f:
        f.write(userbuf)
Example #3
    def __init__(self, config):

        self.CC = CerebralCortex(config)
        self.config = self.CC.config
        self.sqlData = SqlData(self.config,
                               dbName="environmental_data_collection")
        self.process()
Example #4
def generate_report(user_id: uuid, username: str, cc_config_file,
                    config: dict):
    """
    Contains pipeline execution of all the reports
    :param user_id:
    :param CC:
    :param config:
    """
    CC = CerebralCortex(cc_config_file)
    # get all the streams belonging to a participant
    streams = CC.get_user_streams(user_id)
    if streams and len(streams) > 0:

        # Data Yield
        if config["input_stream"][
                "motionsense_hrv_led_quality_left"] in streams:
            compute_data_yield(
                streams[config["input_stream"]
                        ["motionsense_hrv_led_quality_left"]]["identifier"],
                username, "motionsense_left_led", CC, config)

        if config["input_stream"][
                "motionsense_hrv_led_quality_right"] in streams:
            compute_data_yield(
                streams[config["input_stream"]
                        ["motionsense_hrv_led_quality_right"]]["identifier"],
                username, "motionsense_right_led", CC, config)

        if config["input_stream"][
                "motionsense_hrv_accel_quality_left"] in streams:
            compute_data_yield(
                streams[config["input_stream"]
                        ["motionsense_hrv_accel_quality_left"]]["identifier"],
                username, "motionsense_left_accel", CC, config)

        if config["input_stream"][
                "motionsense_hrv_accel_quality_right"] in streams:
            compute_data_yield(
                streams[config["input_stream"]
                        ["motionsense_hrv_accel_quality_right"]]["identifier"],
                username, "motionsense_right_accel", CC, config)

        if config["input_stream"]["autosense_ble_accel_quality"] in streams:
            compute_data_yield(
                streams[config["input_stream"]["autosense_ble_accel_quality"]]
                ["identifier"], username, "autosense_ble_accel", CC, config)

        if config["input_stream"][
                "autosense_ble_respiration_quality"] in streams:
            compute_data_yield(
                streams[config["input_stream"]
                        ["autosense_ble_respiration_quality"]]["identifier"],
                username, "autosense_ble_respiration", CC, config)
Example #5
def process_feature_on_user(user, module_name, all_days, cc_config_path):
    try:
        cc = CerebralCortex(cc_config_path)
        module = importlib.import_module(module_name)
        feature_class_name = getattr(module, 'feature_class_name')
        feature_class = getattr(module, feature_class_name)
        feature_class_instance = feature_class(cc)

        if gps_key is not None:
            feature_class_instance.gps_api_key = gps_key

        f = feature_class_instance.process
        f(user, all_days)
    except Exception as e:
        err = str(e) + "\n" + str(traceback.format_exc())
        print(err)
        syslog.openlog(ident="CerebralCortex-Driver")
        syslog.syslog(LOG_ERR, err)
        syslog.closelog()
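process_feature_on_user relies on a convention: each feature module exposes a module-level feature_class_name string naming a class whose constructor takes the CerebralCortex instance and which provides a process(user, all_days) method. A minimal sketch of such a module (the module and class names here are hypothetical; only the interface is taken from the loader above):

# my_feature.py (hypothetical feature module)
feature_class_name = 'MyFeature'

class MyFeature:
    def __init__(self, CC):
        self.CC = CC
        self.gps_api_key = None  # the driver may overwrite this when a GPS key is supplied

    def process(self, user, all_days):
        # compute the feature for each day and save it via self.CC
        for day in all_days:
            pass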
Example #6
                CC, config)


if __name__ == '__main__':
    # create and load CerebralCortex object and configs
    parser = argparse.ArgumentParser(
        description='CerebralCortex Kafka Message Handler.')
    parser.add_argument("-cc",
                        "--cc_config_filepath",
                        help="Configuration file path",
                        required=True)
    parser.add_argument("-mdc",
                        "--mdebugger_config_filepath",
                        help="mDebugger configuration file path",
                        required=True)
    args = vars(parser.parse_args())

    CC = CerebralCortex(args["cc_config_filepath"])

    # load data diagnostic configs
    md_config = Configuration(args["mdebugger_config_filepath"]).config

    # get/create spark context
    spark_context = get_or_create_sc(type="sparkContext")

    # run for one participant
    # DiagnoseData().one_user_data(["cd7c2cd6-d0a3-4680-9ba2-0c59d0d0c684"], md_config, CC, spark_context)

    # run for all the participants in a study
    all_users_data("mperf", md_config, CC, spark_context)
def write_data_file(file, streams, user, s):
    cc = CerebralCortex(
        "/cerebralcortex/code/config/cc_starwars_configuration.yml")

    if os.path.isfile(file + '.gz'):
        print("Already Processed %s" % file + '.gz')
        return True

    with open(file + '_temp', 'wt') as output_file:
        for stream_id in streams[s]['stream_ids']:
            logger.info('Processing %s' % streams[s]['name'])
            print('Processing %s' % streams[s]['name'])
            days = get_stream_days(cc, stream_id, streams[s])
            for day in days:
                st = datetime.datetime.now()

                print("XXXXXXXXXX", streams[s]['name'], user['identifier'],
                      stream_id, day)

                datastream = cc.get_stream(stream_id,
                                           user['identifier'],
                                           day,
                                           localtime=False)
                et = datetime.datetime.now()
                if len(datastream.data) > 0:
                    if len(datastream.data) > 100000:
                        logger.info('%s %s %d %s' %
                                    (streams[s]['name'], day,
                                     len(datastream.data), str(et - st)))
                        print('%s %s %d %s' %
                              (streams[s]['name'], day, len(
                                  datastream.data), str(et - st)))
                    try:
                        for d in datastream.data:
                            output_string = str(
                                int(d.start_time.timestamp() * 1e6))

                            if isinstance(d.end_time, datetime.datetime):
                                output_string += ',' + str(
                                    int(d.end_time.timestamp() * 1e6))
                            else:
                                output_string += ',-1'

                            output_string += ',' + str(int(d.offset))

                            if type(d.sample) is list:
                                output_string += ',' + ','.join(
                                    map(str, d.sample))
                            else:
                                output_string += ',' + str(d.sample)

                            output_file.write(output_string + '\n')
                    except Exception as e:
                        logger.error("Stream %s has had a parsing error" %
                                     streams[s]['name'])
                        print("Stream %s has had a parsing error" %
                              streams[s]['name'])
                        logger.error(str(e))
                        print(str(e))

    os.system('sort ' + file + '_temp | gzip > ' + file + '.gz')
    os.system('rm ' + file + '_temp')

    return True
parser.add_argument('--study',
                    help="study name as appears in MySQL user metadata",
                    required=True)
parser.add_argument('--output',
                    help="Output directory for the exported files",
                    required=True)
parser.add_argument('--participant', help="Participant username")
parser.add_argument('-n',
                    '--num_jobs',
                    help="Number of concurrent export to run",
                    type=int,
                    default=1)
args = parser.parse_args()

CC = CerebralCortex(
    "/cerebralcortex/code/config/cc_starwars_configuration.yml")
output_dir = args.output
study_name = args.study

# In[3]:

users = CC.get_all_users(study_name=study_name)

# In[4]:


def get_stream_days(cc, identifier, stream):
    duration = cc.get_stream_duration(stream['identifier'])
    day = duration['start_time']
    result = []
    while day < (duration['end_time'] + datetime.timedelta(days=1)):
        # loop body reconstructed from the usage above, which expects 'YYYYMMDD' day strings
        result.append(day.strftime('%Y%m%d'))
        day += datetime.timedelta(days=1)
    return result
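A possible sequential driver for the export, tying the pieces above together (a sketch; the output file naming is an assumption, and the --num_jobs option suggests the original ran these exports in parallel):

for user in users:
    user_streams = CC.get_user_streams(user['identifier'])
    for s in user_streams:
        out_file = os.path.join(output_dir, user['identifier'] + '_' + s)
        write_data_file(out_file, user_streams, user, s)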
Example #9
def main():
    global cc_config_path
    global metadata_dir
    global gps_key
    # Get the list of the features to process
    parser = argparse.ArgumentParser(description='CerebralCortex '
                                     'Feature Processing Driver')
    parser.add_argument("-f",
                        "--feature-list",
                        help="List of feature names "
                        "seperated by commas",
                        nargs='?',
                        default=None,
                        required=False)
    parser.add_argument("-c",
                        "--cc-config",
                        help="Path to file containing the "
                        "CerebralCortex configuration",
                        required=True)
    parser.add_argument("-s",
                        "--study-name",
                        help="Study name.",
                        required=True)
    parser.add_argument("-u",
                        "--users",
                        help="Comma separated user uuids",
                        nargs='?',
                        default=None,
                        required=False)
    parser.add_argument("-sd",
                        "--start-date",
                        help="Start date in "
                        "YYYYMMDD Format",
                        required=True)
    parser.add_argument("-ed",
                        "--end-date",
                        help="End date in "
                        "YYYYMMDD Format",
                        required=True)
    parser.add_argument("-p",
                        "--num-cores",
                        type=int,
                        help="Set a number "
                        "greater than 1 to enable spark "
                        "parallel execution ",
                        required=False)
    parser.add_argument("-k",
                        "--gps-key",
                        help="GPS API "
                        "key",
                        required=False)

    args = vars(parser.parse_args())
    feature_list = None
    study_name = None
    users = None
    start_date = None
    end_date = None
    date_format = '%Y%m%d'
    num_cores = 1  # default single threaded

    if args['feature_list']:
        feature_list = args['feature_list'].split(',')
    if args['cc_config']:
        cc_config_path = args['cc_config']
    if args['study_name']:
        study_name = args['study_name']
    if args['users']:
        users = args['users'].split(',')
        print('X' * 100)
        print(len(users))
    if args['start_date']:
        start_date = datetime.strptime(args['start_date'], date_format)
    if args['end_date']:
        end_date = datetime.strptime(args['end_date'], date_format)
    if args['num_cores']:
        num_cores = args['num_cores']
    if args['gps_key']:
        gps_key = args['gps_key']

    all_days = []
    while True:
        all_days.append(start_date.strftime(date_format))
        start_date += timedelta(days=1)
        if start_date > end_date: break

    CC = None
    all_users = None
    try:
        CC = CerebralCortex(cc_config_path)
        if not users:
            users = CC.get_all_users(study_name)
            if not users:
                print('No users found')
                return
            if not len(users):
                print('No users found')
                return  # no point continuing
            all_users = [usr['identifier'] for usr in users]
        else:
            all_users = users
    except Exception as e:
        print(str(e))
        print(str(traceback.format_exc()))
    if not all_users:
        print('No users found for the study', study_name)
        return

    found_features = discover_features(feature_list)
    feature_to_process = generate_feature_processing_order(found_features)
    process_features(feature_to_process, all_users, all_days, num_cores)
Example #10
parser.add_argument('--conf',
                    dest='configuration_file',
                    required=True,
                    help='Cerebral Cortex configuration file')
parser.add_argument('--output',
                    dest='root_dir',
                    required=True,
                    help='Base output directory')
parser.add_argument('--study',
                    dest='study_name',
                    default='mperf',
                    help='Study name')
args = parser.parse_args()

root_dir = os.path.join(args.root_dir)
CC = CerebralCortex(args.configuration_file)


def append_csv(ds):
    with gzip.open(os.path.join(root_dir, ds.owner) + '/' + ds.name + '___' +
                   ds.identifier + '.csv.gz',
                   'at',
                   compresslevel=1,
                   encoding='utf-8') as f:
        for dp in ds.data:
            if type(dp.sample) is list:
                dp.sample = ','.join(map(str, dp.sample))

            if type(dp.sample) is str and dp.sample[-1] == '\n':
                dp.sample = dp.sample[:-1]
Example #11
    def analyze_user(self, userid, alldays, config_path):
        print(userid, alldays)
        self.CC = CerebralCortex(config_path)
        self.window_size = 3600
        metadata = """
        {
          "annotations":[],
          "data_descriptor":[
            {
              "name":"total_datapoints",
              "type":"int",
              "description":"Total number of data points that are present in the input stream followed by an array of the corrupt datapoints",
              "stream_type": "sparse"
            }
          ],
          "execution_context":{
            "processing_module":{
              "name":"core.admission_control_marker.phone_stream_analyzer",
              "input_streams":[
                {
                  "name":"name",
                  "identifier" : "id"
                }
              ]
            },
            "algorithm":{
              "method":"core.admission_control_marker",
              "authors":[
                {
                  "name":"Anand",
                  "email":"*****@*****.**"
                }
              ],
              "version":"0.0.4",
              "description":"Analyzer for the phone input streams"
            }
          },
          "name":"NAME_dynamically_generated"
        }
        """

        date_format = '%Y%m%d'
        for day in alldays:
            for phone_stream in phone_input_streams:
                current_date = datetime.strptime(day, date_format)
                day_data = self.get_day_data(userid, day, phone_stream)
                data_quality_analysis = []

                if len(day_data):
                    corrupt_data = self.get_corrupt_data(
                        day_data, phone_input_streams[phone_stream])

                    utc_offset = day_data[0].start_time.utcoffset(
                    ).total_seconds() * 1000
                    dp = DataPoint(start_time=current_date,
                                   end_time=current_date + timedelta(days=1),
                                   offset=utc_offset,
                                   sample=[len(day_data), corrupt_data])
                    data_quality_analysis.append(dp)

                else:
                    next_day = current_date + timedelta(days=1)
                    utc_offset = 0
                    dp = DataPoint(start_time=current_date,
                                   end_time=next_day,
                                   offset=utc_offset,
                                   sample=[0, []])
                    data_quality_analysis.append(dp)

                metadata_json = json.loads(metadata)
                metadata_name = phone_stream + '_corrupt_data'
                output_stream_id = str(
                    uuid.uuid3(uuid.NAMESPACE_DNS,
                               str(metadata_name + userid + str(metadata))))
                input_streams = []
                input_stream_ids = self.CC.get_stream_id(userid, phone_stream)
                for inpstrm in input_stream_ids:
                    stream_info = {}
                    stream_info['name'] = phone_stream
                    stream_info['identifier'] = inpstrm['identifier']
                    input_streams.append(stream_info)

                metadata_json["execution_context"]["processing_module"][
                    "input_streams"] = input_streams

                quality_ds = DataStream(
                    identifier=output_stream_id,
                    owner=userid,
                    name=metadata_name,
                    data_descriptor=metadata_json['data_descriptor'],
                    execution_context=metadata_json['execution_context'],
                    annotations=metadata_json['annotations'],
                    stream_type=1,
                    data=data_quality_analysis)
                try:
                    self.CC.save_stream(quality_ds)
                except Exception as e:
                    print(e)
Example #12
def get_corrupt_data_count(userid, all_days, cc_config_path):
    stream_names = []

    sms_stream_name = 'CU_SMS_LENGTH--edu.dartmouth.eureka_corrupt_data'
    stream_names.append(sms_stream_name)

    call_stream_name = 'CU_CALL_DURATION--edu.dartmouth.eureka_corrupt_data'
    stream_names.append(call_stream_name)

    proximity_stream_name = \
    'PROXIMITY--org.md2k.phonesensor--PHONE_corrupt_data'
    stream_names.append(proximity_stream_name)

    cu_appusage_stream_name = 'CU_APPUSAGE--edu.dartmouth.eureka_corrupt_data'
    stream_names.append(cu_appusage_stream_name)

    light_stream_name = \
    'AMBIENT_LIGHT--org.md2k.phonesensor--PHONE_corrupt_data'
    stream_names.append(light_stream_name)

    call_number_stream_name = \
    "CU_CALL_NUMBER--edu.dartmouth.eureka_corrupt_data"
    stream_names.append(call_number_stream_name)

    sms_number_stream_name = "CU_SMS_NUMBER--edu.dartmouth.eureka_corrupt_data"
    stream_names.append(sms_number_stream_name)

    activity_stream_name = \
    "ACTIVITY_TYPE--org.md2k.phonesensor--PHONE_corrupt_data"
    stream_names.append(activity_stream_name)

    call_type_stream_name = "CU_CALL_TYPE--edu.dartmouth.eureka_corrupt_data"
    stream_names.append(call_type_stream_name)

    sms_type_stream_name = "CU_SMS_TYPE--edu.dartmouth.eureka_corrupt_data" 
    stream_names.append(sms_type_stream_name)

    location_stream = 'LOCATION--org.md2k.phonesensor--PHONE_corrupt_data'
    stream_names.append(location_stream)

    geofence_list_stream = \
    'GEOFENCE--LIST--org.md2k.phonesensor--PHONE_corrupt_data'
    stream_names.append(geofence_list_stream)

    CC = CerebralCortex(cc_config_path)

    all_stream_quality = {}        

    count = 0
    started_time = datetime.now()
    userids = [userid]
    for usr in userids[:1]:
        print('processing %d of %d' % (count,len(userids)))
        count += 1

        output_per_day_dir = '/tmp/corruption_per_day/'
        if not os.path.exists(output_per_day_dir):
            os.mkdir(output_per_day_dir)
        buf_day = ''
        for strm in stream_names:
            if strm not in all_stream_quality:
                all_stream_quality[strm] = [0, 0, 0]
            
            stream_ids = get_latest_stream_id(usr, strm, CC)
                
            strm_id = stream_ids[0]['identifier']
            stream_dps_count = 0
            stream_corrupt_dps_count = 0
            stream_possible_accl_gyro_dps = 0

            for day in all_days:
                ds = CC.get_stream(strm_id, usr, day)
                if len(ds.data):
                    dp = ds.data[0]
                    num_day_dps = dp.sample[0]
                    num_day_corrupt_dps = len(dp.sample[1])
                    num_possible_accl_sample = 0
                    # check if the corrupted datapoints could be accl or gyro
                    # samples
                    if num_day_corrupt_dps:
                        for corrupt_dp in dp.sample[1]:
                            if type(corrupt_dp.sample) is list and len(corrupt_dp.sample) == 3:
                                try:
                                    if all(MIN_ACCL_VAL <= axis <= MAX_ACCL_VAL
                                           for axis in corrupt_dp.sample):
                                        num_possible_accl_sample += 1
                                except Exception as e:
                                    print(corrupt_dp)
                                    print(str(e))

                    buf_day += str(usr) + '\t' + str(strm) + '\t' + str(day) +'\t' +\
                                str(num_day_dps) + '\t' + str(num_day_corrupt_dps) + '\t' +\
                                str(num_possible_accl_sample) + '\n'

                    stream_dps_count += num_day_dps
                    stream_corrupt_dps_count += num_day_corrupt_dps
                    stream_possible_accl_gyro_dps += num_possible_accl_sample
                    
            #print('X'*50)
            #print(usr, strm, stream_dps_count, stream_corrupt_dps_count)
            all_stream_quality[strm][0] += stream_dps_count
            all_stream_quality[strm][1] += stream_corrupt_dps_count
            all_stream_quality[strm][2] += stream_possible_accl_gyro_dps
        print(all_stream_quality)
    
        output_dir = '/tmp/corruption_count/'
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        file_name = usr + '.pickle'
        f = open(os.path.join(output_dir,file_name),'wb')
        pickle.dump(all_stream_quality, f)
        f.close()

        f = open(os.path.join(output_per_day_dir,file_name),'w')
        f.write(buf_day)
        f.close()

    return all_stream_quality
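A possible call to the audit above (a sketch; the user id is copied from the commented-out example earlier in this listing, and the day list is a placeholder):

all_days = ['20180101', '20180102']
quality = get_corrupt_data_count('cd7c2cd6-d0a3-4680-9ba2-0c59d0d0c684',
                                 all_days,
                                 '/cerebralcortex/code/config/cc_starwars_configuration.yml')
print(quality)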
def save_data(msg, data_path, config_filepath):
    CC = CerebralCortex(config_filepath)
    file_to_db = FileToDB(CC)
    file_to_db.file_processor(msg, data_path,
                              CC.config['data_ingestion']['influxdb_in'],
                              CC.config['data_ingestion']['nosql_in'])
                        corrupt_files_buf += gzfilepath
                        corrupt_files_buf += '\n'

                    break
            except Exception as e:
                corrupt_files_buf += stream_name
                corrupt_files_buf += '\t'
                corrupt_files_buf += gzfilepath
                corrupt_files_buf += '\n'


if __name__ == '__main__':
    CC_CONFIG_FILEPATH = "/cerebralcortex/code/config/cc_starwars_configuration.yml"
    base_path = '/md2k/apiserver/data/'
    base_path2 = '/md2k2/apiserver/data/'
    CC = CerebralCortex(CC_CONFIG_FILEPATH)
    stream_id_map = {}
    '''
    participants = []
    for f in scandir(directory):
        if f.is_dir and UUID_re.match(f.name):
            participants.append(f.name)
    '''

    f = open('all_users_corruption.txt', 'r')
    skip_line = True

    for line in f:
        if skip_line:
            skip_line = False
            continue
        if data is not None and len(data) > 0:
            if data[0].offset is None or data[0].offset == "":
                return filename
        # else:
        #     return "Data is empty."


if __name__ == '__main__':
    # create and load CerebralCortex object and configs
    parser = argparse.ArgumentParser(
        description='CerebralCortex-Script to verify stream-format.')
    parser.add_argument("-conf",
                        "--conf",
                        help="Configuration file path",
                        required=True)
    parser.add_argument("-study_name",
                        "--study_name",
                        help="Configuration file path",
                        required=True)
    parser.add_argument(
        "-uid",
        "--uid",
        help=
        "User ID only if verification needs to be performed on a single participant",
        required=False)
    args = vars(parser.parse_args())

    CC = CerebralCortex(args["conf"])

    VerifyStreamFormat(args["study_name"], CC, args["uid"])
Example #16
def run():
    selected_participants = [
        "622bf725-2471-4392-8f82-fcc9115a3745",
        "d3d33d63-101d-44fd-b6b9-4616a803225d",
        "c1f31960-dee7-45ea-ac13-a4fea1c9235c",
        "7b8358f3-c96a-4a17-87ab-9414866e18db",
        "8a3533aa-d6d4-450c-8232-79e4851b6e11",
        "e118d556-2088-4cc2-b49a-82aad5974167",
        "260f551d-e3c1-475e-b242-f17aad20ba2c",
        "dd13f25f-77a0-4a2c-83af-bb187b79a389",
        "17b07883-4959-4037-9b80-dde9a06b80ae",
        "5af23884-b630-496c-b04e-b9db94250307",
        "61519ad0-2aea-4250-9a82-4dcdb93a569c",
        "326a6c55-c963-42c2-bb8a-2591993aaaa2",
        "a54d9ef4-a46a-418b-b6cc-f10b49a946ac",
        "2fb5e890-afaf-428a-8e28-a7c70bf8bdf1",
        "c93a811e-1f47-43b6-aef9-c09338e43947",
        "9e4aeae9-8729-4b0f-9e84-5c1f4eeacc74",
        "479eea59-8ad8-46aa-9456-29ab1b8f2cb2",
        "b4ff7130-3055-4ed1-a878-8dfaca7191ac",
        "fbd7bc95-9f42-4c2c-94f4-27fd78a7273c",
        "bbc41a1e-4bbe-4417-a40c-64635cc552e6",
        "82a921b9-361a-4fd5-8db7-98961fdbf25a",
        "66a5cdf8-3b0d-4d85-bdcc-68ae69205206",
        "d4691f19-57be-44c4-afc2-5b5f82ec27b5",
        "136f8891-af6f-49c1-a69a-b4acd7116a3c"
    ]
    parser = argparse.ArgumentParser(
        description='CerebralCortex Kafka Message Handler.')
    parser.add_argument("-c",
                        "--config_filepath",
                        help="Configuration file path",
                        required=True)
    # parser.add_argument("-d", "--data_dir", help="Directory path where all the gz files are stored by API-Server",
    #                     required=True)

    parser.add_argument(
        "-bd",
        "--batch_duration",
        help=
        "How frequent kafka messages shall be checked (duration in seconds)",
        default="5",
        required=False)

    parser.add_argument(
        "-mbs",
        "--mydb_batch_size",
        help="Total number of messages to fetch from MySQL for processing.",
        default="5000",
        required=True)

    parser.add_argument(
        "-participants",
        "--participants",
        help="Whether run data replay on all participants or select one.",
        default="all",
        required=False)

    args = vars(parser.parse_args())

    participants = args["participants"]
    mydb_batch_size = args["mydb_batch_size"]
    config_filepath = str(args["config_filepath"]).strip()
    batch_duration = int(args["batch_duration"])
    # data_path = str(args["data_dir"]).strip()
    # if (data_path[-1] != '/'):
    #     data_path += '/'

    # Kafka Consumer Configs
    spark_context = get_or_create_sc(type="sparkContext")
    spark_context.setLogLevel("WARN")
    consumer_group_id = "md2k-test"

    CC = CerebralCortex(config_filepath)
    broker = str(CC.config["kafkaserver"]["host"]) + ":" + str(
        CC.config["kafkaserver"]["port"])
    data_replay_using = str(CC.config["data_replay"]["replay_type"])

    data_path = CC.config["data_replay"]["data_dir"]
    if data_replay_using == "mydb":
        for replay_batch in CC.SqlData.get_replay_batch(
                record_limit=mydb_batch_size):
            new_replay_batch = []
            #get records from mysql and process (skip kafka)
            if participants == "all":
                new_replay_batch = replay_batch
            else:
                for rb in replay_batch:
                    if rb["owner_id"] in selected_participants:
                        new_replay_batch.append(rb)
            mysql_batch_to_db(spark_context, new_replay_batch, data_path,
                              config_filepath)

    else:
        ssc = StreamingContext(spark_context, batch_duration)
        kafka_files_stream = spark_kafka_consumer(["filequeue"], ssc, broker,
                                                  consumer_group_id, CC)
        if kafka_files_stream is not None:
            kafka_files_stream.foreachRDD(
                lambda rdd: kafka_file_to_json_producer(
                    rdd, data_path, config_filepath, CC))

        ssc.start()
        ssc.awaitTermination()
                                      "feature data" , required=True)
parser.add_argument("-m", "--metadata_file", help="Path to the file containing "
                    " the metadata information",  required=True)

args = vars(parser.parse_args())
metadata_map = {}
stream_names = {}

if args['cc_config']:
    CC_CONFIG_PATH = args['cc_config']
if args['data_dir']:
    DATA_DIR = args['data_dir']
if args['metadata_file']:
    METADATA = args['metadata_file']

CC = CerebralCortex(CC_CONFIG_PATH)

def load_metadata(metadata_dir):
    '''
    Reads all the metadata files in the given directory and loads them into
    the metadata_map dict, keyed by stream name.
    '''
    metadata_files = [os.path.join(metadata_dir,f) for f in os.listdir(metadata_dir)
                      if os.path.isfile(os.path.join(metadata_dir,f))]
    for mf in metadata_files:
        with open(mf, 'r') as mfp:
            metadata_json = json.loads(mfp.read())
        metadata_map[metadata_json['name']] = metadata_json


def load_streamnames():