def should_create_visit_with_inconsistent_data():
    """Check that an INCONSISTENT_DATA visit trips exactly two of the four field checks.

    The checks are: ``device``, ``network`` and ``browser`` being plain dicts,
    and ``source`` starting with ``'www.'``.
    """
    visit = Visit(30, 'v1', DataAnomaly.INCONSISTENT_DATA, timer=Timer(-900), keep_private=False)

    # Exact type comparison (not isinstance) is kept on purpose: only plain dicts count here.
    check_outcomes = [
        type(visit.device) is dict,
        type(visit.network) is dict,
        type(visit.browser) is dict,
        visit.source.startswith('www.'),
    ]
    inconsistent_fields = [outcome for outcome in check_outcomes if outcome]

    assert_that(inconsistent_fields).is_length(2)
def should_create_visit_with_incomplete_data():
    """Check that an INCOMPLETE_DATA visit has exactly two falsy fields.

    The inspected fields are ``device``, ``network``, ``browser`` and ``source``.
    """
    visit = Visit(30, 'v1', DataAnomaly.INCOMPLETE_DATA, timer=Timer(-900), keep_private=False)

    candidates = (visit.device, visit.network, visit.browser, visit.source)
    # A field counts as incomplete when it is falsy (None, empty string, empty container...).
    empty_fields = [field for field in candidates if not field]

    assert_that(empty_fields).is_length(2)
Example #3
0
def should_generate_the_next_action_closing_the_visit():
    """Check that generating an action with 130 on a 120-second visit flags it as to-close."""
    pages = generators_for_tests.generate_pages_map()
    visit_settings = {
        'visit_duration_seconds': 120,
        'app_version': 'v1',
        'data_anomaly': DataAnomaly.INCONSISTENT_DATA,
        'timer': Timer(-900),
        'keep_private': False,
    }
    visit = Visit(**visit_settings)

    # The generated action is parsed only to make sure it is valid JSON; its content is not inspected.
    raw_action = visit.generate_new_action(pages, 130)
    json.loads(raw_action)

    assert_that(visit.is_to_close).is_true()
Example #4
0
def should_generate_invalid_log():
    """Check that an action generated with the third argument set to False has a null ``visit_id``."""
    pages = generators_for_tests.generate_pages_map()
    visit_settings = {
        'visit_duration_seconds': 120,
        'app_version': 'v1',
        'data_anomaly': DataAnomaly.MISSING,
        'timer': Timer(-900),
        'keep_private': False,
    }
    visit = Visit(**visit_settings)

    raw_log = visit.generate_new_action(pages, 30, False)
    parsed_log = json.loads(raw_log)

    assert_that(parsed_log['visit_id']).is_none()
Example #5
0
def should_generate_2_different_events_for_the_same_visit_without_anomaly():
    """Generate two consecutive actions for one visit and check them with the shared consistency assertion."""
    pages = generators_for_tests.generate_pages_map()
    visit = Visit(
        visit_duration_seconds=120, app_version='v1', data_anomaly=DataAnomaly.MISSING,
        timer=Timer(-900), keep_private=False,
    )

    # Actions are generated in order: first with 30, then with 10.
    first_action, second_action = [
        json.loads(visit.generate_new_action(pages, duration)) for duration in (30, 10)
    ]

    assertions_for_test.assert_visits_consistency(first_action, second_action)
Example #6
0
 def from_yaml(configuration):
     """Build a Dataset from a configuration mapping.

     All values are read from ``configuration['dataset']``: the
     ``versions_percentage``, ``session_duration_seconds`` and
     ``composition_percentage`` sub-sections plus the ``all_users``,
     ``real_time_delta_seconds`` and ``users_no_data_consent_percentage``
     entries. A missing key propagates the lookup error of the mapping.

     :param configuration: mapping holding a ``dataset`` section
     :return: the Dataset created from that section
     """
     dataset_section = configuration['dataset']
     version_shares = dataset_section['versions_percentage']
     duration_bounds = dataset_section['session_duration_seconds']
     composition_shares = dataset_section['composition_percentage']
     # Entries are evaluated top to bottom, i.e. in the same order as the keyword arguments.
     dataset_arguments = {
         'duration_min_seconds': duration_bounds['min'],
         'duration_max_seconds': duration_bounds['max'],
         'percentage_incomplete_data': composition_shares['incomplete'],
         'percentage_inconsistent_data': composition_shares['inconsistent'],
         'percentage_app_v1': version_shares['v1'],
         'percentage_app_v2': version_shares['v2'],
         'users_number': dataset_section['all_users'],
         'timer': Timer(latency_seconds=dataset_section['real_time_delta_seconds']),
         'no_data_consent_percentage': dataset_section['users_no_data_consent_percentage'],
     }
     return Dataset(**dataset_arguments)
Example #7
0
def should_reinitialize_a_visit_with_random_duration():
    """Check that reinitializing a visit keeps its app version and anomaly but renews ids and duration."""
    dataset = Dataset(
        10, 30,
        percentage_incomplete_data=1, percentage_inconsistent_data=1,
        percentage_app_v1=10, percentage_app_v2=15,
        users_number=100, timer=Timer(-900), no_data_consent_percentage=2,
    )
    visit = dataset.visits[0]

    # Snapshot the state before the visit is reinitialized.
    app_version_before = visit.app_version
    anomaly_before = visit.data_anomaly
    attributes_before = dict(visit.__dict__)

    dataset.reinitialize_visit(visit)

    # Attributes expected to stay the same.
    assert_that(visit.app_version).is_equal_to(app_version_before)
    assert_that(visit.data_anomaly).is_equal_to(anomaly_before)
    # assert only on the fields that are certainly different every time
    for changed_attribute in ('visit_id', 'user_id', 'duration_seconds'):
        assert_that(getattr(visit, changed_attribute)).is_not_equal_to(attributes_before[changed_attribute])
from assertpy import assert_that

from data_generator.model.entities import DataAnomaly
from data_generator.model.generators import generate_visit_id, generate_user_id, generate_source, \
    generate_user_context, generate_technical_context, generate_event_time, generate_visited_page, \
    generate_keep_private_flag
from data_generator.model.timer import Timer
from data_generator.model.visit import Visit

# Module-level Visit shared by the generator tests below; it is built once at import time
# with DataAnomaly.MISSING and a 120-second duration.
complete_visit = Visit(visit_duration_seconds=120,
                       app_version='v1',
                       data_anomaly=DataAnomaly.MISSING,
                       timer=Timer(-900),
                       keep_private=False)


def should_generate_consistent_id_between_2_calls():
    """Check that generate_visit_id returns the same value on two calls for the same visit."""
    first_id, second_id = [generate_visit_id(complete_visit) for _ in range(2)]

    assert_that(first_id).is_equal_to(second_id)


def should_generate_user_id_between_2_calls():
    """Check that generate_user_id returns the same value on two calls for the same visit."""
    first_id, second_id = [generate_user_id(complete_visit) for _ in range(2)]

    assert_that(first_id).is_equal_to(second_id)


def should_generate_valid_source():
Example #9
0
def should_generate_event_for_a_complete_visit():
    """Check that the event generated for a visit exposes all the expected top-level keys."""
    expected_keys = ('source', 'page', 'user', 'visit_id', 'technical', 'event_time', 'user_id')
    visit = Visit(30, 'v1', entities.DataAnomaly.INCOMPLETE_DATA, timer=Timer(-120), keep_private=False)

    generated_event = entities.generate_event(visit)

    assert_that(generated_event).contains_key(*expected_keys)
Example #10
0
def should_create_a_correct_number_of_visits():
    """Check that a Dataset created for 100 users holds exactly 100 visits."""
    expected_visits = 100
    dataset = Dataset(
        10, 30,
        percentage_incomplete_data=1, percentage_inconsistent_data=1,
        percentage_app_v1=10, percentage_app_v2=15,
        users_number=expected_visits, timer=Timer(-900), no_data_consent_percentage=2,
    )

    assert_that(dataset.visits).is_length(expected_visits)
sys.path.append(os.path.abspath(os.path.join('..', 'data-generator')))

from data_generator.model.unordered_data import UnorderedDataContainer
from data_generator.model.timer import Timer
from data_generator.model.dataset import Dataset

# Script entry point: builds the dataset, the unordered-data container and the Kinesis writer
# configuration used by the generator run.
if __name__ == '__main__':
    # 3000 users, durations between 10 and 300 seconds, 2% incomplete / 2% inconsistent data.
    # NOTE(review): no_data_consent_percentage is not passed here — confirm Dataset gives it a default.
    dataset = Dataset(duration_min_seconds=10,
                      duration_max_seconds=300,
                      percentage_incomplete_data=2,
                      percentage_inconsistent_data=2,
                      percentage_app_v1=20,
                      percentage_app_v2=20,
                      users_number=3000,
                      timer=Timer(latency_seconds=-900))

    # The callback picks one flag out of 90 zeros and 10 ones.
    # NOTE(review): `choice` is not imported in the visible part of the file — presumably random.choice; confirm.
    unordered_data_container = UnorderedDataContainer(
        lambda: choice([0] * 90 + [1] * 10))

    # NOTE(review): this helper repeats the lambda passed to UnorderedDataContainer above
    # (same 90/10 flag list, same `choice` call) — consider passing it instead of the lambda.
    def should_send_unordered_actions():
        flags = [0] * 90 + [1] * 10
        return choice(flags)

    # Single output stream configured with one shard.
    output_stream_name = 'data-generator-test'
    configuration = KinesisWriterConfiguration(
        {'topics': {
            output_stream_name: {
                'shards': 1
            }
        }})
import json
import logging
import os
import random

import sys

sys.path.append(os.path.abspath(os.path.join('..', 'data-generator')))
from data_generator.model.dataset import Dataset
from data_generator.model.timer import Timer
from data_generator.sink.local_filesystem_writer import LocalFileSystemConfiguration

logging.basicConfig(filename='/tmp/logs_writer.txt', level=logging.DEBUG, format='%(asctime)s %(message)s')

# Script entry point: sets up the timer and the dataset used by the logs writer.
if __name__ == '__main__':
    timer = Timer(-3*24*60*60)  # starting from 3 days ago
    # Small dataset: 2 users, durations between 120 and 600 seconds, no incomplete or
    # inconsistent data and no users without data consent.
    dataset = Dataset(duration_min_seconds=120, duration_max_seconds=600,
                      percentage_incomplete_data=0, percentage_inconsistent_data=0,
                      percentage_app_v1=20, percentage_app_v2=20,
                      users_number=2, timer=timer, no_data_consent_percentage=0
                      )

    def get_random_duration_in_seconds():
        """Return a random whole number of seconds between 1 and 10, both bounds included."""
        shortest_seconds, longest_seconds = 1, 10
        return random.randint(shortest_seconds, longest_seconds)

    def _extract_event_time(json_data):
        """Build a ``'/'``-separated key from the ``event_time`` field of a JSON record.

        The key joins four fixed slices of ``event_time``: characters 0-3,
        5-6, 8-9 and 11-12. For a timestamp shaped like
        ``YYYY-MM-DDTHH:...`` this gives ``YYYY/MM/DD/HH``.
        NOTE(review): the timestamp layout is assumed from the slice widths —
        confirm against the event generator.

        :param json_data: JSON document (string) containing an ``event_time`` entry
        :return: the ``year/month/day/hour`` key as a string
        :raises KeyError: when the document has no ``event_time`` entry
        """
        event_time = json.loads(json_data)['event_time']
        # TODO: try to format it with a datetime formatter
        # The placeholders are named after what each slice holds in a YYYY-MM-DD layout:
        # the 4-character slice is the year and [8:10] the day. The produced string is unchanged.
        return '{year}/{month}/{day}/{hour}'.format(year=event_time[0:4], month=event_time[5:7],
                                                    day=event_time[8:10], hour=event_time[11:13])