def month_visits():
    '''Build four sample visits to the home page dated in January 2020.

    The first visit uses ``visitor_id``; the other three use hard-coded
    visitor IDs. All of them share the same user, title, slug, session
    start, and ``scroll_events``.
    '''
    # Each row holds the visitor ID and visit date followed by
    # timeOnPage, prevTitle, prevSlug, nextTitle, and nextSlug.
    rows = [
        (
            visitor_id, '2020-01-01T00:00:00.000Z',
            '60', None, None, 'Blog', '/blog'
        ),
        (
            '171a0329-f8b2-499c-867d-1942384ddd5a',
            '2020-01-03T00:00:01.000Z',
            None, 'Tyler Norlund', '/', None, None
        ),
        (
            '171a0329-f8b2-499c-867d-1942384ddd5a',
            '2020-01-25T00:00:00.000Z',
            '120', None, None, 'Resume', '/resume'
        ),
        (
            '171a0329-f8b2-499c-867d-1942384ddd5s',
            '2020-01-30T00:00:00.000Z',
            '120', None, None, 'Resume', '/resume'
        ),
    ]
    return [
        Visit(
            who, when, '0', 'Tyler Norlund', '/',
            '2020-01-01T00:00:00.000Z', scroll_events, *navigation
        )
        for who, when, *navigation in rows
    ]
def visits():
    '''Build two sample visits sharing a session start of 2020-01-01.

    The first is the home page leading to the blog; the second is the blog
    page reached from the home page one minute later.
    '''
    start = '2020-01-01T00:00:00.000Z'
    ip_address, user = '0.0.0.0', '0'
    home_visit = Visit(
        start, ip_address, user, 'Tyler Norlund', '/', start,
        '60', None, None, 'Blog', '/blog'
    )
    blog_visit = Visit(
        '2020-01-01T00:01:00.000Z', ip_address, user, 'Blog', '/blog', start,
        None, 'Tyler Norlund', '/', None, None
    )
    return [home_visit, blog_visit]
def visits():
    '''Build two sample visits sharing a session start of 2020-01-03.

    The first is the home page leading to the blog; the second is the blog
    page reached from the home page one minute later.
    '''
    start = '2020-01-03T00:00:00.000Z'
    ip_address, user = '0.0.0.0', '0'
    home_visit = Visit(
        start, ip_address, user, 'Tyler Norlund', '/', start,
        '60', None, None, 'Blog', '/blog'
    )
    blog_visit = Visit(
        '2020-01-03T00:01:00.000Z', ip_address, user, 'Blog', '/blog', start,
        None, 'Tyler Norlund', '/', None, None
    )
    return [home_visit, blog_visit]
def day_visits():
    '''Build three sample home-page visits that all fall on 2020-01-03.'''
    # Each row holds the visit date and IP followed by timeOnPage,
    # prevTitle, prevSlug, nextTitle, and nextSlug.
    rows = [
        (
            '2020-01-03T00:00:00.000Z', '0.0.0.0',
            '60', None, None, 'Blog', '/blog'
        ),
        (
            '2020-01-03T00:00:01.000Z', '0.0.0.1',
            None, 'Tyler Norlund', '/', None, None
        ),
        (
            '2020-01-03T00:00:00.000Z', '0.0.0.1',
            '120', None, None, 'Resume', '/resume'
        ),
    ]
    return [
        Visit(
            when, ip_address, '0', 'Tyler Norlund', '/',
            '2020-01-03T00:00:00.000Z', *navigation
        )
        for when, ip_address, *navigation in rows
    ]
def test_key():
    '''key() holds the visitor ID in PK and the date and slug in SK.'''
    subject = Visit(
        visitor_id, visit_date, user_number, page_title, page_slug,
        session_start, scroll_events
    )
    actual = subject.key()
    expected = {
        'PK': {'S': f'VISITOR#{ visitor_id }'},
        'SK': {'S': f'VISIT#{ visit_date }#{ page_slug }'},
    }
    assert actual == expected
def test_gsi1():
    '''gsi1() holds the page slug in GSI1PK and the visit date in GSI1SK.'''
    stamp = '2020-12-23T20:32:26.000Z'
    subject = Visit(stamp, '0.0.0.0', '0', 'Tyler Norlund', '/', stamp)
    actual = subject.gsi1()
    expected = {
        'GSI1PK': {'S': 'PAGE#/'},
        'GSI1SK': {'S': 'VISIT#2020-12-23T20:32:26.000Z'},
    }
    assert actual == expected
def test_gsi1():
    '''gsi1() holds the page slug in GSI1PK and the visit date in GSI1SK.'''
    subject = Visit(
        visitor_id, visit_date, user_number, page_title, page_slug,
        session_start, scroll_events
    )
    actual = subject.gsi1()
    expected = {
        'GSI1PK': {'S': f'PAGE#{ page_slug }'},
        'GSI1SK': {'S': f'VISIT#{ visit_date }'},
    }
    assert actual == expected
def test_gsi2():
    '''gsi2() holds visitor ID and session start in GSI2PK, date in GSI2SK.'''
    subject = Visit(
        visitor_id, visit_date, user_number, page_title, page_slug,
        session_start, scroll_events
    )
    actual = subject.gsi2()
    expected = {
        'GSI2PK': {'S': f'SESSION#{ visitor_id }#{ session_start }'},
        'GSI2SK': {'S': f'VISIT#{ visit_date }'},
    }
    assert actual == expected
def test_gsi2():
    '''gsi2() holds the IP and session start in GSI2PK, the date in GSI2SK.'''
    stamp = '2020-12-23T20:32:26.000Z'
    subject = Visit(stamp, '0.0.0.0', '0', 'Tyler Norlund', '/', stamp)
    actual = subject.gsi2()
    expected = {
        'GSI2PK': {'S': 'SESSION#0.0.0.0#2020-12-23T20:32:26.000Z'},
        'GSI2SK': {'S': 'VISIT#2020-12-23T20:32:26.000Z'},
    }
    assert actual == expected
def test_key():
    '''key() holds the IP in PK and the visit date and slug in SK.'''
    stamp = '2020-12-23T20:32:26.000Z'
    subject = Visit(stamp, '0.0.0.0', '0', 'Tyler Norlund', '/', stamp)
    actual = subject.key()
    expected = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': 'VISIT#2020-12-23T20:32:26.000Z#/'},
    }
    assert actual == expected
def day_visits():
    '''Build three sample home-page visits that all fall on 2020-01-03.'''
    day_start = '2020-01-03T00:00:00.000Z'
    user, title, slug = '0', 'Tyler Norlund', '/'
    to_blog = Visit(
        day_start, '0.0.0.0', user, title, slug, day_start,
        '60', None, None, 'Blog', '/blog'
    )
    from_home = Visit(
        '2020-01-03T00:00:01.000Z', '0.0.0.1', user, title, slug, day_start,
        None, 'Tyler Norlund', '/', None, None
    )
    to_resume = Visit(
        day_start, '0.0.0.1', user, title, slug, day_start,
        '120', None, None, 'Resume', '/resume'
    )
    return [to_blog, from_home, to_resume]
def test_toItem():
    '''toItem() emits the keys, both GSIs, and the visit's attributes.

    A visit built without navigation or timing data is expected to carry
    NULL entries for those attributes.
    '''
    subject = Visit(
        visitor_id, visit_date, user_number, page_title, page_slug,
        session_start, scroll_events
    )
    actual = subject.toItem()

    expected = {
        'PK': {'S': f'VISITOR#{ visitor_id }'},
        'SK': {'S': f'VISIT#{ visit_date }#{ page_slug }'},
        'GSI1PK': {'S': f'PAGE#{ page_slug }'},
        'GSI1SK': {'S': f'VISIT#{ visit_date }'},
        'GSI2PK': {'S': f'SESSION#{ visitor_id }#{ session_start }'},
        'GSI2SK': {'S': f'VISIT#{ visit_date }'},
        'Type': {'S': 'visit'},
        'User': {'N': '0'},
        'ScrollEvents': objectToItemAtr(scroll_events),
        'Title': {'S': page_title},
        'Slug': {'S': page_slug},
    }
    # Attributes that were never supplied are stored as NULL.
    unset_attributes = (
        'PreviousTitle', 'PreviousSlug', 'NextTitle', 'NextSlug',
        'TimeOnPage'
    )
    for attribute in unset_attributes:
        expected[attribute] = {'NULL': True}

    assert actual == expected
def test_toItem():
    '''toItem() emits the keys, both GSIs, and the visit's attributes.

    A visit built without navigation or timing data is expected to carry
    NULL entries for those attributes.
    '''
    stamp = '2020-12-23T20:32:26.000Z'
    subject = Visit(stamp, '0.0.0.0', '0', 'Tyler Norlund', '/', stamp)
    actual = subject.toItem()

    expected = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': 'VISIT#2020-12-23T20:32:26.000Z#/'},
        'GSI1PK': {'S': 'PAGE#/'},
        'GSI1SK': {'S': 'VISIT#2020-12-23T20:32:26.000Z'},
        'GSI2PK': {'S': 'SESSION#0.0.0.0#2020-12-23T20:32:26.000Z'},
        'GSI2SK': {'S': 'VISIT#2020-12-23T20:32:26.000Z'},
        'Type': {'S': 'visit'},
        'User': {'N': '0'},
        'Title': {'S': 'Tyler Norlund'},
        'Slug': {'S': '/'},
    }
    # Attributes that were never supplied are stored as NULL.
    unset_attributes = (
        'PreviousTitle', 'PreviousSlug', 'NextTitle', 'NextSlug',
        'TimeOnPage'
    )
    for attribute in unset_attributes:
        expected[attribute] = {'NULL': True}

    assert actual == expected
def test_itemToVisit():
    '''A visit converted with toItem() and back with itemToVisit() is equal.

    Each listed attribute of the rebuilt visit is compared against the same
    attribute of the original visit.
    '''
    visit = Visit(
        visitor_id, visit_date, user_number, page_title, page_slug,
        session_start, scroll_events
    )
    newVisit = itemToVisit(visit.toItem())
    # Comparing by attribute name guarantees each field is checked against
    # itself; previously prevSlug was compared with prevTitle.
    round_tripped_attributes = (
        'id', 'date', 'user', 'title', 'slug', 'sessionStart',
        'prevTitle', 'prevSlug', 'nextTitle', 'nextSlug', 'timeOnPage',
    )
    for attribute in round_tripped_attributes:
        assert getattr(newVisit, attribute) == getattr(visit, attribute), \
            attribute
def test_itemToVisit():
    '''A visit converted with toItem() and back with itemToVisit() is equal.

    Each listed attribute of the rebuilt visit is compared against the same
    attribute of the original visit.
    '''
    visit = Visit(
        '2020-12-23T20:32:26.000Z', '0.0.0.0', '0', 'Tyler Norlund', '/',
        '2020-12-23T20:32:26.000Z'
    )
    newVisit = itemToVisit(visit.toItem())
    # Comparing by attribute name guarantees each field is checked against
    # itself; previously prevSlug was compared with prevTitle.
    round_tripped_attributes = (
        'date', 'ip', 'user', 'title', 'slug', 'sessionStart',
        'prevTitle', 'prevSlug', 'nextTitle', 'nextSlug', 'timeOnPage',
    )
    for attribute in round_tripped_attributes:
        assert getattr(newVisit, attribute) == getattr(visit, attribute), \
            attribute
def test_parameter_title_addPage(
    dynamo_client, table_init, table_name, year_visits
):
    '''addPage raises ValueError when the visits do not share one title.'''
    mismatched_visit = Visit(
        '2020-12-23T20:32:26.000Z', '0.0.0.0', '0', 'Resume', '/',
        '2020-12-23T20:32:26.000Z'
    )
    client = DynamoClient(table_name)
    # Only the call expected to raise lives inside the context manager, so
    # a ValueError from the set-up above cannot satisfy the test.
    with pytest.raises(ValueError) as e:
        client.addPage(year_visits + [mismatched_visit])
    # Checked after the block: a statement following the raising call
    # inside the block would never execute.
    assert str(e.value) == 'List of visits must have the same title'
def test_parameter_year_addWeek(
    dynamo_client, table_init, table_name, week_visits
):
    '''addWeek raises ValueError when a visit is from another year/week.'''
    mismatched_visit = Visit(
        '2021-12-23T20:32:26.000Z', '0.0.0.0', '0', 'Tyler Norlund', '/',
        '2020-12-23T20:32:26.000Z'
    )
    client = DynamoClient(table_name)
    # Only the call expected to raise lives inside the context manager, so
    # a ValueError from the set-up above cannot satisfy the test.
    with pytest.raises(ValueError) as e:
        client.addWeek(week_visits + [mismatched_visit])
    # Checked after the block: a statement following the raising call
    # inside the block would never execute.
    assert str(e.value) == \
        'List of visits must be from the same year and week'
def test_parameter_title_addDay(
    dynamo_client, table_init, table_name, day_visits
):
    '''addDay raises ValueError when the visits do not share one title.'''
    mismatched_visit = Visit(
        visitor_id, '2020-12-23T20:32:26.000Z', '0', 'Resume', '/',
        '2020-12-23T20:32:26.000Z', scroll_events
    )
    client = DynamoClient(table_name)
    # Only the call expected to raise lives inside the context manager, so
    # a ValueError from the set-up above cannot satisfy the test.
    with pytest.raises(ValueError) as e:
        client.addDay(day_visits + [mismatched_visit])
    # Checked after the block: a statement following the raising call
    # inside the block would never execute.
    assert str(e.value) == 'List of visits must have the same title'
def test_parameter_year_addMonth(
    dynamo_client, table_init, table_name, month_visits
):
    '''addMonth raises ValueError when a visit is from another year/month.'''
    mismatched_visit = Visit(
        visitor_id, '2021-12-23T20:32:26.000Z', '0', 'Tyler Norlund', '/',
        '2020-12-23T20:32:26.000Z', scroll_events
    )
    client = DynamoClient(table_name)
    # Only the call expected to raise lives inside the context manager, so
    # a ValueError from the set-up above cannot satisfy the test.
    with pytest.raises(ValueError) as e:
        client.addMonth(month_visits + [mismatched_visit])
    # Checked after the block: a statement following the raising call
    # inside the block would never execute.
    assert str(e.value) == \
        'List of visits must be from the same year and month'
def test_dict():
    '''dict(Visit) exposes the visit's fields under their attribute names.

    The date is expected as a datetime, the user as an int, and every
    navigation/timing field that was not supplied as None.
    '''
    stamp = '2020-12-23T20:32:26.000Z'
    as_dict = dict(
        Visit(stamp, '0.0.0.0', '0', 'Tyler Norlund', '/', stamp)
    )
    assert as_dict['date'] == datetime.datetime(2020, 12, 23, 20, 32, 26)
    assert as_dict['ip'] == '0.0.0.0'
    assert as_dict['user'] == 0
    assert as_dict['title'] == 'Tyler Norlund'
    assert as_dict['slug'] == '/'
    unset_fields = (
        'prevTitle', 'prevSlug', 'nextTitle', 'nextSlug', 'timeOnPage'
    )
    for field in unset_fields:
        assert as_dict[field] is None
def test_no_user_init():
    '''A Visit built with a user of None reports its user as 0.

    The remaining attributes are checked too: both timestamps are expected
    as datetimes and the unsupplied navigation/timing fields as None.
    '''
    stamp = '2020-12-23T20:32:26.000Z'
    parsed_stamp = datetime.datetime(2020, 12, 23, 20, 32, 26)
    subject = Visit(stamp, '0.0.0.0', None, 'Tyler Norlund', '/', stamp)
    assert subject.date == parsed_stamp
    assert subject.ip == '0.0.0.0'
    assert subject.user == 0
    assert subject.title == 'Tyler Norlund'
    assert subject.slug == '/'
    assert subject.sessionStart == parsed_stamp
    unset_fields = (
        'prevTitle', 'prevSlug', 'nextTitle', 'nextSlug', 'timeOnPage'
    )
    for field in unset_fields:
        assert getattr(subject, field) is None
def visits():
    '''Build three sample visits from one session.

    The visits go from the home page to the resume and then to the CI/CD
    blog post. All of them share ``visitor_id``, ``session_start``, and
    ``scroll_events``.
    '''
    cicd_title = 'Continuous Integration and Continuous Delivery'
    cicd_slug = '/blog/cicd'

    home_visit = Visit(
        visitor_id, session_start, '0', 'Tyler Norlund', '/',
        session_start, scroll_events, '1.647',
        nextTitle='Resume', nextSlug='/resume'
    )
    resume_visit = Visit(
        visitor_id, '2021-02-10T11:27:51.216Z', '0', 'Resume', '/resume',
        session_start, scroll_events, '3.084',
        prevTitle='Tyler Norlund', prevSlug='/',
        nextTitle=cicd_title, nextSlug=cicd_slug
    )
    # NOTE(review): the previous page recorded here is the CI/CD post
    # itself rather than the resume; confirm whether that is intended.
    cicd_visit = Visit(
        visitor_id, '2021-02-10T11:27:57.886Z', '0', cicd_title, cicd_slug,
        session_start, scroll_events, timeOnPage='3.747',
        prevTitle=cicd_title, prevSlug=cicd_slug
    )
    return [home_visit, resume_visit, cicd_visit]
def test_dict():
    '''dict(Visit) exposes the visit's fields under their attribute names.

    The date is expected as a datetime, the user as an int, and every
    navigation/timing field that was not supplied as None.
    '''
    subject = Visit(
        visitor_id, visit_date, user_number, page_title, page_slug,
        session_start, scroll_events
    )
    as_dict = dict(subject)
    expected_date = datetime.datetime.strptime(
        visit_date, '%Y-%m-%dT%H:%M:%S.%fZ'
    )
    assert as_dict['date'] == expected_date
    assert as_dict['id'] == visitor_id
    assert as_dict['user'] == 0
    assert as_dict['title'] == page_title
    assert as_dict['slug'] == page_slug
    unset_fields = (
        'prevTitle', 'prevSlug', 'nextTitle', 'nextSlug', 'timeOnPage'
    )
    for field in unset_fields:
        assert as_dict[field] is None
def test_no_user_init():
    '''A Visit built with a user of None reports its user as 0.

    The remaining attributes are checked too: both timestamps are expected
    as datetimes and the unsupplied navigation/timing fields as None.
    '''
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    subject = Visit(
        visitor_id, visit_date, None, page_title, page_slug,
        session_start, scroll_events
    )
    assert subject.id == visitor_id
    assert subject.date == datetime.datetime.strptime(
        visit_date, timestamp_format
    )
    assert subject.user == 0
    assert subject.title == page_title
    assert subject.slug == page_slug
    assert subject.sessionStart == datetime.datetime.strptime(
        session_start, timestamp_format
    )
    unset_fields = (
        'prevTitle', 'prevSlug', 'nextTitle', 'nextSlug', 'timeOnPage'
    )
    for field in unset_fields:
        assert getattr(subject, field) is None
def processVisits(visits):
    '''Formats a list of visits to have the proper attributes.

    The visits are de-duplicated, ordered by date, and rebuilt so that each
    one carries the session start (the earliest visit date), the number of
    seconds until the following visit, and the titles and slugs of the
    visits immediately before and after it.

    Parameters
    ----------
    visits : list[ Visit ]
        The list of visits to be modified to fit the session's attributes.

    Returns
    -------
    visits : list[ Visit ]
        The list of visits that have the corrected attributes. An empty
        input produces an empty list.
    '''
    # Nothing to order or link; also avoids the ``.dt`` accessor failing on
    # an empty, non-datetime column.
    if not visits:
        return []
    # NOTE: the 'id' column holds each visit's date, which is what
    # identifies and orders the visits here.
    v_df = pd.DataFrame({
        'id': [visit.date for visit in visits],
        'title': [visit.title for visit in visits],
        'slug': [visit.slug for visit in visits],
        'ip': [visit.ip for visit in visits],
        'user': [visit.user for visit in visits],
    })
    v_df = v_df.drop_duplicates().sort_values(by='id').reset_index(drop=True)
    # Seconds between each visit and the one that follows it. ``diff``
    # measures against the previous row, so shifting up by one aligns each
    # gap with the earlier visit and leaves the last visit without a value.
    v_df['seconds'] = v_df['id'].diff().dt.total_seconds().shift(-1)
    # Shift the slugs and titles up and down in order to associate the
    # previous and next slugs and titles with each visit.
    v_df['prevSlug'] = v_df['slug'].shift(1)
    v_df['prevTitle'] = v_df['title'].shift(1)
    v_df['nextSlug'] = v_df['slug'].shift(-1)
    v_df['nextTitle'] = v_df['title'].shift(-1)
    # Replace the NaN's with the None type for the entities
    v_df = v_df.replace({np.nan: None})
    # The earliest visit's date marks the start of the session.
    session_start = v_df.iloc[0]['id']
    return [
        Visit(
            row['id'], row['ip'], row['user'], row['title'], row['slug'],
            session_start, row['seconds'], row['prevTitle'],
            row['prevSlug'], row['nextTitle'], row['nextSlug']
        )
        for _, row in v_df.iterrows()
    ]
def visit():
    '''Build a single Visit from the module-level sample values.'''
    sample_arguments = (
        visitor_id,
        visit_date,
        user_number,
        page_title,
        page_slug,
        session_start,
        scroll_events,
    )
    return Visit(*sample_arguments)
def test_repr():
    '''repr(Visit) is the visitor ID and the visit date joined by " - ".'''
    subject = Visit(
        visitor_id, visit_date, user_number, page_title, page_slug,
        session_start, scroll_events
    )
    actual = repr(subject)
    expected = f'{ visitor_id } - { visit_date}'
    assert actual == expected
def s3_processor(event, context):
    """Store the visitor data found in a newly uploaded S3 object.

    The object named by the first record of ``event`` is parsed with
    ``processDF`` into a session, its visits, and its browsers. One of
    three things then happens:

    * The visitor is not in the table: the visitor, session, visits, and
      browsers are all added ("new").
    * The visitor exists and the record starts within 30 minutes of the
      last visit of the visitor's last session: the record is merged into
      that session ("updated").
    * Otherwise the record is added as another session ("additional").

    Args:
        event (dict): The S3 event. Only ``event['Records'][0]`` is read,
            for the object key, bucket name, and AWS region.
        context: Unused by this function.

    Returns:
        dict: ``statusCode`` 200 and a JSON-encoded ``body`` reporting the
            ``updated``, ``new``, and ``additional`` counters.
    """
    new = 0
    updated = 0
    additional = 0
    # Get the necessary data from the S3 event.
    s3_record = event['Records'][0]
    key = urllib.parse.unquote_plus(
        s3_record['s3']['object']['key'], encoding='utf-8'
    )
    aws_region = s3_record['awsRegion']
    bucket_name = s3_record['s3']['bucket']['name']
    # Create the necessary clients
    dynamo_client = DynamoClient(os.environ['TABLE_NAME'], aws_region)
    s3_client = S3Client(bucket_name, aws_region)
    # Parse the record to get the browsers, visits, and session.
    record = processDF(key, s3_client)
    # Get the visitor from the table
    visitor_details = dynamo_client.getVisitorDetails(
        Visitor(record['session'].id)
    )
    # Add the visitor, visits, session, and browsers if the visitor is not
    # in the table.
    if 'visitor' not in visitor_details:
        dynamo_client.addVisitor(Visitor(record['session'].id))
        dynamo_client.addSession(record['session'])
        dynamo_client.addVisits(record['visits'])
        dynamo_client.addBrowsers(record['browsers'])
        new += 1
    # Check to see if the last session can be combined with the one in this
    # record.
    else:
        # NOTE(review): assumes the sessions and visits returned for the
        # visitor are ordered oldest to newest; confirm against
        # getVisitorDetails.
        last_session = visitor_details['sessions'][-1]
        last_sessions_visits = [
            visit for visit in visitor_details['visits']
            if visit.sessionStart == last_session.sessionStart
        ]
        last_visit = last_sessions_visits[-1]
        first_new_visit = record['visits'][0]
        # Use the absolute gap. Subtracting the newer visit from the older
        # one is negative for every later record, which made the
        # 30-minute check pass unconditionally.
        seconds_apart = abs(
            (first_new_visit.date - last_visit.date).total_seconds()
        )
        # Combine the visits and update the session when the last session
        # was less than 30 minutes from this record.
        if seconds_apart < 60 * 30:
            # Update all of the record's visits with the previous session
            # start
            for visit in record['visits']:
                visit.sessionStart = last_session.sessionStart
            # Update the last visit of the last session when the first
            # visit of the record is the last page visited in the previous
            # session.
            if last_visit.title == first_new_visit.title:
                # The total time on the updated page is the last scroll
                # event on the record's first visit minus the first scroll
                # event of the last visit of the session to update.
                last_scroll = datetime.datetime.strptime(
                    list(first_new_visit.scrollEvents.keys())[-1],
                    '%Y-%m-%dT%H:%M:%S.%fZ'
                )
                first_scroll = datetime.datetime.strptime(
                    list(last_visit.scrollEvents.keys())[0],
                    '%Y-%m-%dT%H:%M:%S.%fZ'
                )
                updated_visit = Visit(
                    last_visit.id,  # visitor_id
                    last_visit.date,  # date
                    last_visit.user,  # user
                    last_visit.title,  # title
                    last_visit.slug,  # slug
                    last_visit.sessionStart,  # sessionStart
                    {
                        **last_visit.scrollEvents,
                        **first_new_visit.scrollEvents
                    },  # scrollEvents
                    (last_scroll - first_scroll).total_seconds(),  # timeOnPage
                    last_visit.prevTitle,  # prevTitle
                    last_visit.prevSlug,  # prevSlug
                    first_new_visit.nextTitle,  # nextTitle
                    first_new_visit.nextSlug  # nextSlug
                )
                visits_to_update = [updated_visit] + record['visits'][1:] + \
                    last_sessions_visits[:-1]
            else:
                visits_to_update = record['visits'] + last_sessions_visits
            # Update all of the visits in the record to have the session
            dynamo_client.updateVisits(visits_to_update)
            dynamo_client.addBrowsers(record['browsers'])
            times_on_page = [visit.timeOnPage for visit in visits_to_update]
            dynamo_client.updateSession(
                Session(
                    last_session.sessionStart,  # Start date-time
                    last_session.id,  # Visitor ID
                    np.mean(times_on_page),  # avgTime
                    np.sum(times_on_page)  # totalTime
                ),
                []
            )
            updated += 1
        # Add the new session, visits, and browsers when the last session
        # was more than 30 minutes from this record.
        else:
            dynamo_client.addSession(record['session'])
            dynamo_client.addVisits(record['visits'])
            dynamo_client.addBrowsers(record['browsers'])
            additional += 1
    return {
        'statusCode': 200,
        'body': json.dumps(
            f'updated { updated }\nnew { new }\nadditional {additional}'
        )
    }
def test_gsi1pk():
    '''gsi1pk() is the string attribute "PAGE#" followed by the slug.'''
    subject = Visit(
        visitor_id, visit_date, user_number, page_title, page_slug,
        session_start, scroll_events
    )
    actual = subject.gsi1pk()
    expected = {'S': f'PAGE#{ page_slug }'}
    assert actual == expected
def test_pk():
    '''pk() is the string attribute "VISITOR#" followed by the visitor ID.'''
    subject = Visit(
        visitor_id, visit_date, user_number, page_title, page_slug,
        session_start, scroll_events
    )
    actual = subject.pk()
    expected = {'S': f'VISITOR#{ visitor_id }'}
    assert actual == expected