def should_create_visit_with_inconsistent_data():
    """An INCONSISTENT_DATA visit should expose exactly 2 inconsistent fields.

    An inconsistent technical field shows up as a raw dict instead of a flat
    value, and an inconsistent source starts with a 'www.' prefix.
    """
    visit = Visit(30, 'v1', DataAnomaly.INCONSISTENT_DATA, timer=Timer(-900), keep_private=False)

    def is_dict(field):
        # isinstance is the idiomatic type check (was `type(field) is dict`)
        return isinstance(field, dict)

    def starts_with_www(field):
        return field.startswith('www.')

    # fixed typo: `inconsistenf_fields` -> `inconsistent_fields`;
    # keep only the checks that flagged their field as inconsistent
    inconsistent_fields = [flag for flag in (is_dict(visit.device), is_dict(visit.network),
                                             is_dict(visit.browser), starts_with_www(visit.source))
                           if flag]
    assert_that(inconsistent_fields).is_length(2)
def should_create_visit_with_incomplete_data():
    """An INCOMPLETE_DATA visit should have exactly 2 empty fields."""
    visit = Visit(30, 'v1', DataAnomaly.INCOMPLETE_DATA, timer=Timer(-900), keep_private=False)
    candidates = (visit.device, visit.network, visit.browser, visit.source)
    # a field is "incomplete" when it is falsy (empty / None)
    empty_fields = [candidate for candidate in candidates if not candidate]
    assert_that(empty_fields).is_length(2)
def should_generate_the_next_action_closing_the_visit():
    """An action generated past the visit duration flags the visit for closing."""
    pages = generators_for_tests.generate_pages_map()
    visit = Visit(visit_duration_seconds=120, app_version='v1',
                  data_anomaly=DataAnomaly.INCONSISTENT_DATA, timer=Timer(-900),
                  keep_private=False)
    # 130 elapsed seconds exceed the 120-second duration -> the visit must close
    json.loads(visit.generate_new_action(pages, 130))
    assert_that(visit.is_to_close).is_true()
def should_generate_invalid_log():
    """When the action is generated as invalid, the log carries no visit_id."""
    pages = generators_for_tests.generate_pages_map()
    visit = Visit(visit_duration_seconds=120, app_version='v1',
                  data_anomaly=DataAnomaly.MISSING, timer=Timer(-900),
                  keep_private=False)
    # the trailing False asks for an invalid action
    raw_action = visit.generate_new_action(pages, 30, False)
    invalid_visit_log = json.loads(raw_action)
    assert_that(invalid_visit_log['visit_id']).is_none()
def should_generate_2_different_events_for_the_same_visit_without_anomaly():
    """Two successive actions of one anomaly-free visit stay mutually consistent."""
    pages = generators_for_tests.generate_pages_map()
    visit = Visit(visit_duration_seconds=120, app_version='v1',
                  data_anomaly=DataAnomaly.MISSING, timer=Timer(-900),
                  keep_private=False)
    first_action = json.loads(visit.generate_new_action(pages, 30))
    second_action = json.loads(visit.generate_new_action(pages, 10))
    assertions_for_test.assert_visits_consistency(first_action, second_action)
def from_yaml(configuration):
    """Build a Dataset from the 'dataset' section of a parsed YAML configuration.

    Expects the keys demonstrated below under `configuration['dataset']`:
    versions_percentage, session_duration_seconds, composition_percentage,
    all_users, real_time_delta_seconds and users_no_data_consent_percentage.
    """
    dataset_section = configuration['dataset']
    durations = dataset_section['session_duration_seconds']
    composition = dataset_section['composition_percentage']
    versions = dataset_section['versions_percentage']
    return Dataset(
        duration_min_seconds=durations['min'],
        duration_max_seconds=durations['max'],
        percentage_incomplete_data=composition['incomplete'],
        percentage_inconsistent_data=composition['inconsistent'],
        percentage_app_v1=versions['v1'],
        percentage_app_v2=versions['v2'],
        users_number=dataset_section['all_users'],
        timer=Timer(latency_seconds=dataset_section['real_time_delta_seconds']),
        no_data_consent_percentage=dataset_section['users_no_data_consent_percentage']
    )
def should_reinitialize_a_visit_with_random_duration():
    """Reinitializing a visit keeps version and anomaly but regenerates ids and duration."""
    dataset = Dataset(10, 30, percentage_incomplete_data=1, percentage_inconsistent_data=1,
                      percentage_app_v1=10, percentage_app_v2=15, users_number=100,
                      timer=Timer(-900), no_data_consent_percentage=2)
    visit = dataset.visits[0]
    # snapshot every attribute before the reinitialization
    snapshot = dict(visit.__dict__)

    dataset.reinitialize_visit(visit)

    # stable attributes survive the reinitialization
    assert_that(visit.app_version).is_equal_to(snapshot['app_version'])
    assert_that(visit.data_anomaly).is_equal_to(snapshot['data_anomaly'])
    # assert only on the fields that are certainly different every time
    assert_that(visit.visit_id).is_not_equal_to(snapshot['visit_id'])
    assert_that(visit.user_id).is_not_equal_to(snapshot['user_id'])
    assert_that(visit.duration_seconds).is_not_equal_to(snapshot['duration_seconds'])
from assertpy import assert_that
from data_generator.model.entities import DataAnomaly
from data_generator.model.generators import generate_visit_id, generate_user_id, generate_source, \
    generate_user_context, generate_technical_context, generate_event_time, generate_visited_page, \
    generate_keep_private_flag
from data_generator.model.timer import Timer
from data_generator.model.visit import Visit

# Shared fixture: a visit without any data anomaly, reused by the tests below.
complete_visit = Visit(visit_duration_seconds=120, app_version='v1', data_anomaly=DataAnomaly.MISSING, timer=Timer(-900), keep_private=False)


def should_generate_consistent_id_between_2_calls():
    # generate_visit_id must be deterministic for the same visit instance
    visit_id_1 = generate_visit_id(complete_visit)
    visit_id_2 = generate_visit_id(complete_visit)
    assert_that(visit_id_1).is_equal_to(visit_id_2)


def should_generate_user_id_between_2_calls():
    # generate_user_id must be deterministic for the same visit instance
    user_id_1 = generate_user_id(complete_visit)
    user_id_2 = generate_user_id(complete_visit)
    assert_that(user_id_1).is_equal_to(user_id_2)


# NOTE(review): this definition is truncated in the visible chunk — its body
# continues outside this view.
def should_generate_valid_source():
def should_generate_event_for_a_complete_visit():
    """A generated event dict exposes every expected top-level key."""
    visit = Visit(30, 'v1', entities.DataAnomaly.INCOMPLETE_DATA, timer=Timer(-120), keep_private=False)
    event = entities.generate_event(visit)
    expected_keys = ('source', 'page', 'user', 'visit_id', 'technical', 'event_time', 'user_id')
    assert_that(event).contains_key(*expected_keys)
def should_create_a_correct_number_of_visits():
    """A Dataset pre-creates exactly one visit per configured user."""
    users_count = 100
    dataset = Dataset(10, 30, percentage_incomplete_data=1, percentage_inconsistent_data=1,
                      percentage_app_v1=10, percentage_app_v2=15, users_number=users_count,
                      timer=Timer(-900), no_data_consent_percentage=2)
    assert_that(dataset.visits).is_length(users_count)
sys.path.append(os.path.abspath(os.path.join('..', 'data-generator')))

from data_generator.model.unordered_data import UnorderedDataContainer
from data_generator.model.timer import Timer
from data_generator.model.dataset import Dataset

if __name__ == '__main__':
    dataset = Dataset(duration_min_seconds=10, duration_max_seconds=300,
                      percentage_incomplete_data=2, percentage_inconsistent_data=2,
                      percentage_app_v1=20, percentage_app_v2=20, users_number=3000,
                      timer=Timer(latency_seconds=-900))

    def should_send_unordered_actions():
        # 10% of the actions are flagged as unordered (1); the other 90% stay ordered (0)
        # NOTE(review): `choice` is imported above this chunk — presumably random.choice
        flags = [0] * 90 + [1] * 10
        return choice(flags)

    # fixed duplication: the inline lambda repeated the same 90/10 weighting that
    # should_send_unordered_actions implements — reuse the named predicate instead
    unordered_data_container = UnorderedDataContainer(should_send_unordered_actions)

    output_stream_name = 'data-generator-test'
    configuration = KinesisWriterConfiguration({'topics': {
        output_stream_name: {
            'shards': 1
        }
    }})
import json
import logging
import os
import random
import sys

sys.path.append(os.path.abspath(os.path.join('..', 'data-generator')))

from data_generator.model.dataset import Dataset
from data_generator.model.timer import Timer
from data_generator.sink.local_filesystem_writer import LocalFileSystemConfiguration

logging.basicConfig(filename='/tmp/logs_writer.txt', level=logging.DEBUG, format='%(asctime)s %(message)s')

if __name__ == '__main__':
    timer = Timer(-3*24*60*60)  # starting from 3 days ago
    dataset = Dataset(duration_min_seconds=120, duration_max_seconds=600,
                      percentage_incomplete_data=0, percentage_inconsistent_data=0,
                      percentage_app_v1=20, percentage_app_v2=20, users_number=2,
                      timer=timer, no_data_consent_percentage=0)

    def get_random_duration_in_seconds():
        # short random step between generated actions
        return random.randint(1, 10)

    def _extract_event_time(json_data):
        """Turn the event's timestamp into a 'year/month/day/hour' partition path."""
        json_object = json.loads(json_data)
        # TODO: try to format it with a datetime formatter
        event_time = json_object['event_time']
        # fixed misleading keyword names: [0:4] is the 4-digit year and [8:10] the
        # day, but they were previously bound to `day` and `year` respectively;
        # the emitted string is byte-identical to before
        return '{year}/{month}/{day}/{hour}'.format(year=event_time[0:4],
                                                    month=event_time[5:7],
                                                    day=event_time[8:10],
                                                    hour=event_time[11:13])