def test_toItem():
    '''Checks the item produced by Visitor.toItem() for one session.'''
    item = Visitor(visitor_id, 1).toItem()
    expected_item = {
        'PK': {'S': f'VISITOR#{ visitor_id }'},
        'SK': {'S': '#VISITOR'},
        'Type': {'S': 'visitor'},
        'NumberSessions': {'N': '1'},
    }
    assert item == expected_item
def test_key():
    '''Checks the primary key produced by Visitor.key().'''
    primary_key = Visitor('0.0.0.0', 1).key()
    expected_key = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': '#VISITOR'},
    }
    assert primary_key == expected_key
def test_toItem():
    '''Checks the item produced by Visitor.toItem() for one session.'''
    item = Visitor('0.0.0.0', 1).toItem()
    expected_item = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': '#VISITOR'},
        'Type': {'S': 'visitor'},
        'NumberSessions': {'N': '1'},
    }
    assert item == expected_item
def test_processPages(table_name):
    '''Checks the summary message returned by processPages().

    The visitor, visits, browsers, and sessions found in a generated event
    are added to the table first, and then the event is processed.

    Parameters
    ----------
    table_name : str
        The name of the table handed to event() and DynamoClient().
    '''
    ip = randomIP()
    this_event = event(ip, table_name)
    # Every record's new image, in the order the event lists them.
    new_images = [
        record['dynamodb']['NewImage'] for record in this_event['Records']
    ]

    def images_of(item_type):
        '''Selects the new images whose 'Type' matches item_type.'''
        return [
            image for image in new_images if image['Type']['S'] == item_type
        ]

    visits = [itemToVisit(image) for image in images_of('visit')]
    client = DynamoClient(table_name)
    client.addVisitor(Visitor(ip))
    client.addVisits(visits)
    client.addBrowsers(
        [itemToBrowser(image) for image in images_of('browser')]
    )
    sessions = [itemToSession(image) for image in images_of('session')]
    for session in sessions:
        client.addSession(session)
    # Process the event before building the expected message.
    message = processPages(client, this_event)
    unique_slugs = {visit.slug for visit in visits}
    expected_message = 'Successfully added ' + \
        f'{ len( unique_slugs ) } pages and updated 0 ' + \
        f'from { len( visits ) } records.'
    assert message == expected_message
def _createNewVisitor(ip, browsers, visits, dynamo_client):
    '''Adds a new visitor and their location, browsers, and visits to the table.

    The visitor's location is requested from the geo.ipify.org API using the
    'IPIFY_KEY' environment variable as the API key.

    Parameters
    ----------
    ip : str
        The IP address of the visitor.
    browsers : list
        The visitor's browsers (presumably list[ Browser ]; confirm against
        the caller).
    visits : list
        The visitor's visits (presumably list[ Visit ]; confirm against the
        caller).
    dynamo_client : DynamoClient
        The DynamoDB client used to access the table.

    Returns
    -------
    result : dict
        The result of adding the new visitor and their data to the table.
        This could be new visitor, location, browser, session, and visits
        added or the error that occurred.
    '''
    visitor = Visitor(ip, 1)
    # os.environ.get() yields None when the key is unset; the request is still
    # attempted, as before.
    api_key = os.environ.get('IPIFY_KEY')
    geo_response = http.request(
        'GET',
        f'https://geo.ipify.org/api/v1?apiKey={ api_key }&ipAddress={ ip }'
    )
    location = requestToLocation(
        json.loads(geo_response.data.decode('utf8'))
    )
    result = dynamo_client.addNewVisitor(visitor, location, browsers, visits)
    if 'error' in result:
        # Label the log with this function's name so the failure is traceable.
        print('ERROR _createNewVisitor ' + result['error'])
    return result
def _addSessionToVisitor(ip, visits, browsers, dynamo_client):
    '''Adds a new session, with its browsers and visits, to a visitor.

    Parameters
    ----------
    ip : str
        The IP address of the visitor.
    visits : list[ Visit ]
        The visits that belong to the new session.
    browsers : list
        The browsers that belong to the new session.
    dynamo_client : DynamoClient
        The DynamoDB client used to access the table.

    Returns
    -------
    outcome : dict
        Whatever the client's addNewSession() returns. When it holds an
        'error' key, the error is printed before returning.
    '''
    outcome = dynamo_client.addNewSession(Visitor(ip), browsers, visits)
    if 'error' not in outcome.keys():
        return outcome
    print('ERROR _addSessionToVisitor ' + outcome['error'])
    return outcome
def _updateSessions(oldSessions, visits, dynamo_client):
    '''Updates multiple sessions and visits to be a single session.

    The first session in ``oldSessions`` is kept and updated; every other
    session is removed from the table and the visitor's session count is
    decremented once per removed session.

    Parameters
    ----------
    oldSessions : list[ Session ]
        The old sessions that have been found to be close enough to be
        combined into a single session. Must not be empty.
    visits : list[ Visit ]
        The visits found in the '.parquet' file. These are combined with the
        visits in the other sessions.
    dynamo_client : DynamoClient
        The DynamoDB client used to access the table.

    Returns
    -------
    result : dict
        The result of combining the sessions and updating the visits in the
        table. These could be the updated session and visits or the error
        that occurred while accessing the table.
    '''
    # Gather the visits of every old session. Nothing has been modified yet,
    # so a lookup failure can simply be returned.
    old_visits = []
    for session in oldSessions:
        session_details = dynamo_client.getSessionDetails(session)
        if 'error' in session_details:
            return {'error': session_details['error']}
        old_visits += session_details['visits']
    # Only the first session survives; remove the rest from the table.
    for session in oldSessions[1:]:
        dynamo_client.removeSession(session)
        dynamo_client.decrementVisitorSessions(Visitor(session.ip))
    # The visits must be combined and assigned the correct attributes before
    # adding them to the table.
    combined_visits = processVisits(visits + old_visits)
    kept_session = oldSessions[0]
    # Only float times are averaged. np.mean() of an empty list gives nan.
    kept_session.avgTime = np.mean([
        visit.timeOnPage for visit in combined_visits
        if isinstance(visit.timeOnPage, float)
    ])
    # NOTE(review): assumes processVisits() returns the visits in
    # chronological order - confirm.
    kept_session.totalTime = (
        combined_visits[-1].date - combined_visits[0].date
    ).total_seconds()
    # Add the updated session and visits to the table.
    result = dynamo_client.updateSession(kept_session, combined_visits)
    if 'error' in result:
        # Label the log with this function's name, not the single-session one.
        print('ERROR _updateSessions ' + result['error'])
    return result
def test_key():
    '''Checks the primary key produced by Visitor.key().'''
    primary_key = Visitor(visitor_id, 1).key()
    expected_key = {
        'PK': {'S': f'VISITOR#{ visitor_id }'},
        'SK': {'S': '#VISITOR'},
    }
    assert primary_key == expected_key
def test_numberSessions_init():
    '''Checks that both constructor arguments are stored on the Visitor.'''
    session_count = 1
    subject = Visitor(visitor_id, session_count)
    assert subject.id == visitor_id
    assert subject.numberSessions == session_count
def processParquet(key, dynamo_client, s3_client):
    '''Adds the data from a '.parquet' file to the DynamoDB table.

    For every unique IP address in the file, the visitor is either created,
    skipped (the session is already in the table), merged into one or more
    existing sessions, or given a new session.

    Parameters
    ----------
    key : str
        The key of the '.parquet' file in the S3 bucket.
    dynamo_client : DynamoClient
        The DynamoDB client used to store the transformed data.
    s3_client : S3Client
        The S3 client used to get the '.parquet' file from.

    Raises
    ------
    Exception
        Any exception raised while reading or processing the file is printed
        and then re-raised.
    '''
    try:
        request = s3_client.getObject(key)
        # Read the parquet file as a pandas DF
        df = pd.read_parquet(io.BytesIO(request['Body'].read()))
        # Iterate over the unique IP addresses to organize the DF's per
        # visitor.
        for ip in df['ip'].unique():
            # Get the visitor details from the table.
            visitor_details = dynamo_client.getVisitorDetails(Visitor(ip))
            # Get the browsers and visits of the specific IP address.
            visitor_dict = processDF(df, ip)
            visits = visitor_dict['visits']
            # When the visitor is not found in the database, the visitor,
            # location, browser, session, and visits must be added to the
            # database.
            if visitor_details.get('error') == 'Visitor not in table':
                _createNewVisitor(
                    ip, visitor_dict['browsers'], visits, dynamo_client
                )
                continue
            # Otherwise, determine whether to add a new session, update a
            # visitor's session, or combine multiple sessions.
            sessions = visitor_details['sessions']
            first_visit_date = visits[0].date
            # Skip the session when the session is already in the table.
            if Session(first_visit_date, ip, 0, 0).key() in [
                session.key() for session in sessions
            ]:
                continue
            # Calculate the time deltas of the different sessions and the
            # visitor's first visit.
            # NOTE(review): this evaluates as
            # (first visit - session start) + total time. If the intent is
            # the gap since the session ended, the total time should be
            # subtracted instead - confirm before changing.
            time_deltas = [
                (
                    first_visit_date - session.sessionStart
                    + datetime.timedelta(seconds=session.totalTime)
                    if session.totalTime is not None
                    else first_visit_date - session.sessionStart
                )
                for session in sessions
            ]
            # Find all sessions that have the timedelta of less than 30
            # minutes on the same day.
            sessions_to_update = [
                session
                for session, delta in zip(sessions, time_deltas)
                if 0 <= delta.days < 1
                and delta.seconds / (60 * 60) < 0.5
                and delta.seconds > 0
            ]
            # Update the visitor's session when only 1 session is found to be
            # within the timedelta.
            if len(sessions_to_update) == 1:
                _updateSession(sessions_to_update[0], visits, dynamo_client)
            elif len(sessions_to_update) > 1:
                _updateSessions(sessions_to_update, visits, dynamo_client)
            # Create a new session when the time between the last session and
            # the first of these visits is greater than 30 minutes.
            else:
                _addSessionToVisitor(
                    ip, visits, visitor_dict['browsers'], dynamo_client
                )
    except Exception as e:
        print(f'ERROR processParquet { e }')
        print(
            f'Error getting object { key } from bucket { s3_client.bucketname }.' + \
            ' Make sure they exist and your bucket is in the same region as ' + \
            'this function.'
        )
        # Bare raise re-raises the active exception with its traceback intact.
        raise
def test_itemToVisitor():
    '''Checks that itemToVisitor() reverses Visitor.toItem().'''
    original = Visitor(visitor_id, 1)
    item = original.toItem()
    rebuilt = itemToVisitor(item)
    assert rebuilt.id == original.id
    assert rebuilt.numberSessions == original.numberSessions
def test_repr():
    '''Checks the string produced by repr() on a Visitor.'''
    representation = repr(Visitor('0.0.0.0', 1))
    assert representation == '0.0.0.0 - 1'
def visitor():
    '''Builds a Visitor from the IP address '0.0.0.0' alone.'''
    default_visitor = Visitor('0.0.0.0')
    return default_visitor
def test_repr():
    '''Checks the string produced by repr() on a Visitor.'''
    representation = repr(Visitor(visitor_id, 1))
    assert representation == f'{ visitor_id } - 1'
def visitor():
    '''Builds a Visitor from the IP address '0.0.0.0' alone.'''
    default_visitor = Visitor('0.0.0.0')
    return default_visitor
def test_itemToVisitor():
    '''Checks that itemToVisitor() reverses Visitor.toItem().'''
    original = Visitor('0.0.0.0', 1)
    item = original.toItem()
    rebuilt = itemToVisitor(item)
    assert rebuilt.ip == original.ip
    assert rebuilt.numberSessions == original.numberSessions
def test_dict():
    '''Checks the mapping produced by dict() on a Visitor.'''
    as_mapping = dict(Visitor('0.0.0.0', 1))
    expected_mapping = {'ip': '0.0.0.0', 'numberSessions': 1}
    assert as_mapping == expected_mapping
def test_pk():
    '''Checks the partition key produced by Visitor.pk().'''
    partition_key = Visitor(visitor_id, 1).pk()
    expected_key = {'S': f'VISITOR#{ visitor_id }'}
    assert partition_key == expected_key
def test_pk():
    '''Checks the partition key produced by Visitor.pk().'''
    partition_key = Visitor('0.0.0.0', 1).pk()
    expected_key = {'S': 'VISITOR#0.0.0.0'}
    assert partition_key == expected_key
def visitor():
    '''Builds a Visitor from visitor_id alone.'''
    default_visitor = Visitor(visitor_id)
    return default_visitor
def test_default_init():
    '''Checks that a Visitor built without a session count starts at 0.'''
    address = '0.0.0.0'
    subject = Visitor(address)
    assert subject.ip == address
    assert subject.numberSessions == 0
def test_dict():
    '''Checks the mapping produced by dict() on a Visitor.'''
    as_mapping = dict(Visitor(visitor_id, 1))
    expected_mapping = {'id': visitor_id, 'numberSessions': 1}
    assert as_mapping == expected_mapping
def s3_processor(event, context):
    """Adds the data of the object named by an S3 event to the DynamoDB table.

    The object is parsed into a session with its visits and browsers. A
    visitor that is not in the table is added with all of the parsed data.
    Otherwise the record is merged into the visitor's last session when the
    two are within 30 minutes of each other, or added as an additional
    session when they are not.

    Args:
        event (dict): The S3 event. Only ``event['Records'][0]`` is read.
        context: Not used by this function.

    Returns:
        dict: A response with 'statusCode' 200 and a JSON-encoded 'body'
            reporting the number of updated, new, and additional sessions.
    """
    new = 0
    updated = 0
    additional = 0
    # Get the necessary data from the S3 event.
    s3_record = event['Records'][0]
    key = urllib.parse.unquote_plus(
        s3_record['s3']['object']['key'], encoding='utf-8'
    )
    aws_region = s3_record['awsRegion']
    bucket_name = s3_record['s3']['bucket']['name']
    # Create the necessary clients
    dynamo_client = DynamoClient(os.environ['TABLE_NAME'], aws_region)
    s3_client = S3Client(bucket_name, aws_region)
    # Parse the record to get the browsers, visits, and session.
    record = processDF(key, s3_client)
    # Get the visitor from the table
    visitor_details = dynamo_client.getVisitorDetails(
        Visitor(record['session'].id)
    )
    # Add the visitor, visits, session, and browsers if the visitor is not in
    # the table.
    if 'visitor' not in visitor_details:
        dynamo_client.addVisitor(Visitor(record['session'].id))
        dynamo_client.addSession(record['session'])
        dynamo_client.addVisits(record['visits'])
        dynamo_client.addBrowsers(record['browsers'])
        new += 1
    # Check to see if the last session can be combined with the one in this
    # record.
    else:
        # NOTE(review): assumes the sessions and visits come back in
        # chronological order - confirm against getVisitorDetails().
        last_session = visitor_details['sessions'][-1]
        last_sessions_visits = [
            visit for visit in visitor_details['visits']
            if visit.sessionStart == last_session.sessionStart
        ]
        last_visit = last_sessions_visits[-1]
        first_visit = record['visits'][0]
        # Combine the visits and update the session when the last session was
        # less than 30 minutes from this record. The gap is taken as an
        # absolute value: the previous "last - first" difference was negative
        # for every record later than the last session, so it always merged.
        gap_seconds = abs(
            (first_visit.date - last_visit.date).total_seconds()
        )
        if gap_seconds < 60 * 30:
            # Update all of the record's visits with the previous session
            # start.
            for visit in record['visits']:
                visit.sessionStart = last_session.sessionStart
            # Update the last visit of the last session when the first visit
            # of the record is the last page visited in the previous session.
            if last_visit.title == first_visit.title:
                # The total time on the updated page is the last scroll event
                # on the record's first visit minus the first scroll event of
                # the last visit of the session to update.
                last_scroll = datetime.datetime.strptime(
                    list(first_visit.scrollEvents.keys())[-1],
                    '%Y-%m-%dT%H:%M:%S.%fZ'
                )
                first_scroll = datetime.datetime.strptime(
                    list(last_visit.scrollEvents.keys())[0],
                    '%Y-%m-%dT%H:%M:%S.%fZ'
                )
                updated_visit = Visit(
                    last_visit.id,  # visitor_id
                    last_visit.date,  # date
                    last_visit.user,  # user
                    last_visit.title,  # title
                    last_visit.slug,  # slug
                    last_visit.sessionStart,  # sessionStart
                    {
                        **last_visit.scrollEvents,
                        **first_visit.scrollEvents
                    },  # scrollEvents
                    (last_scroll - first_scroll).total_seconds(),  # timeOnPage
                    last_visit.prevTitle,  # prevTitle
                    last_visit.prevSlug,  # prevSlug
                    first_visit.nextTitle,  # nextTitle
                    first_visit.nextSlug  # nextSlug
                )
                visits_to_update = [updated_visit] + record['visits'][1:] + \
                    last_sessions_visits[:-1]
            else:
                visits_to_update = record['visits'] + last_sessions_visits
            # Update all of the visits in the record to have the session
            dynamo_client.updateVisits(visits_to_update)
            dynamo_client.addBrowsers(record['browsers'])
            times_on_page = [visit.timeOnPage for visit in visits_to_update]
            dynamo_client.updateSession(
                Session(
                    last_session.sessionStart,  # Start date-time
                    last_session.id,  # Visitor ID
                    np.mean(times_on_page),  # avgTime
                    np.sum(times_on_page)  # totalTime
                ),
                []
            )
            updated += 1
        # Add the new session, visits, and browsers when the last session was
        # more than 30 minutes from this record.
        else:
            dynamo_client.addSession(record['session'])
            dynamo_client.addVisits(record['visits'])
            dynamo_client.addBrowsers(record['browsers'])
            additional += 1
    return {
        'statusCode': 200,
        'body': json.dumps(
            f'updated { updated }\nnew { new }\nadditional {additional}'
        )
    }
def test_default_init():
    '''Checks that a Visitor built without a session count starts at 0.'''
    subject = Visitor(visitor_id)
    expected_sessions = 0
    assert subject.id == visitor_id
    assert subject.numberSessions == expected_sessions
def test_numberSessions_init():
    '''Checks that both constructor arguments are stored on the Visitor.'''
    address = '0.0.0.0'
    session_count = 1
    subject = Visitor(address, session_count)
    assert subject.ip == address
    assert subject.numberSessions == session_count