def test_itemToSession():
    '''Round-trips a Session through toItem() and itemToSession().'''
    original = Session(
        datetime.datetime(2020, 1, 1, 0, 0), '0.0.0.0', 0.1, 0.1
    )
    restored = itemToSession(original.toItem())
    # Every attribute must come back from the item unchanged.
    for attribute in ('sessionStart', 'ip', 'avgTime', 'totalTime'):
        assert getattr(restored, attribute) == getattr(original, attribute)
def test_toItem():
    '''Checks the full item dictionary produced by Session.toItem().'''
    expected_item = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': 'SESSION#2020-01-01T00:00:00.000Z'},
        'GSI2PK': {'S': 'SESSION#0.0.0.0#2020-01-01T00:00:00.000Z'},
        'GSI2SK': {'S': '#SESSION'},
        'Type': {'S': 'session'},
        'AverageTime': {'N': '0.1'},
        'TotalTime': {'N': '0.1'},
    }
    subject = Session(
        datetime.datetime(2020, 1, 1, 0, 0), '0.0.0.0', 0.1, 0.1
    )
    assert subject.toItem() == expected_item
def test_itemToSession():
    '''Round-trips a Session through toItem() and itemToSession().'''
    original = Session(session_start, visitor_id, avg_time, total_time)
    restored = itemToSession(original.toItem())
    # Every attribute must come back from the item unchanged.
    for attribute in ('sessionStart', 'id', 'avgTime', 'totalTime'):
        assert getattr(restored, attribute) == getattr(original, attribute)
def test_toItem():
    '''Checks the full item dictionary produced by Session.toItem().'''
    subject = Session(session_start, visitor_id, avg_time, total_time)
    expected_item = {
        'PK': {'S': f'VISITOR#{ visitor_id }'},
        'SK': {'S': f'SESSION#{ session_start }'},
        'GSI2PK': {'S': f'SESSION#{ visitor_id }#{ session_start }'},
        'GSI2SK': {'S': '#SESSION'},
        'Type': {'S': 'session'},
        # Numeric attributes are stored as their string representation.
        'AverageTime': {'N': str(avg_time)},
        'TotalTime': {'N': str(total_time)},
    }
    assert subject.toItem() == expected_item
def test_key():
    '''Checks that key() returns the PK and SK attributes of the session.'''
    subject = Session(session_start, visitor_id, avg_time, total_time)
    expected_key = {
        'PK': {'S': f'VISITOR#{ visitor_id }'},
        'SK': {'S': f'SESSION#{ session_start }'},
    }
    assert subject.key() == expected_key
def test_gsi2():
    '''Checks that gsi2() returns the GSI2PK and GSI2SK attributes.'''
    expected_index = {
        'GSI2PK': {'S': 'SESSION#0.0.0.0#2020-01-01T00:00:00.000Z'},
        'GSI2SK': {'S': '#SESSION'},
    }
    subject = Session(
        datetime.datetime(2020, 1, 1, 0, 0), '0.0.0.0', 0.1, 0.1
    )
    assert subject.gsi2() == expected_index
def test_gsi2():
    '''Checks that gsi2() returns the GSI2PK and GSI2SK attributes.'''
    subject = Session(session_start, visitor_id, avg_time, total_time)
    expected_index = {
        'GSI2PK': {'S': f'SESSION#{ visitor_id }#{ session_start }'},
        'GSI2SK': {'S': '#SESSION'},
    }
    assert subject.gsi2() == expected_index
def test_key():
    '''Checks that key() returns the PK and SK attributes of the session.'''
    expected_key = {
        'PK': {'S': 'VISITOR#0.0.0.0'},
        'SK': {'S': 'SESSION#2020-01-01T00:00:00.000Z'},
    }
    subject = Session(
        datetime.datetime(2020, 1, 1, 0, 0), '0.0.0.0', 0.1, 0.1
    )
    assert subject.key() == expected_key
def updateSession(self, session, visits, print_error=True):
    '''Rebuilds a session from its visits and overwrites it in the table.

    Parameters
    ----------
    session : Session
        The session whose average time on page and total time on the website
        change. It is only type-checked here; the item written to the table
        is rebuilt from the date and ip of the first visit.
    visits : list[ Visit ]
        All of the visits that belong to the session.
    print_error : bool
        Whether a ClientError is printed before the error result is returned.

    Returns
    -------
    result : dict
        The rebuilt session and the visits on success, otherwise a dictionary
        with an 'error' message.

    Raises
    ------
    ValueError
        When session is not a Session, visits is not a list, or an element of
        visits is not a Visit.
    '''
    if not isinstance(session, Session):
        raise ValueError('Must pass a Session object')
    if not isinstance(visits, list):
        raise ValueError('Must pass a list of Visit objects')
    for entry in visits:
        if not isinstance(entry, Visit):
            raise ValueError('List of visits must be of Visit type')
    # Collect the seconds per page only from the visits that recorded them.
    recorded_times = []
    for entry in visits:
        if isinstance(entry.timeOnPage, float):
            recorded_times.append(entry.timeOnPage)
    # Without any page times there is no average; a single page time is used
    # as-is rather than being passed through np.mean.
    if not recorded_times:
        averageTime = None
    elif len(recorded_times) == 1:
        averageTime = recorded_times[0]
    else:
        averageTime = np.mean(recorded_times)
    # A single visit has no measurable span, so there is no total time.
    totalTime = (
        None if len(visits) == 1
        else (visits[-1].date - visits[0].date).total_seconds()
    )
    first_visit = visits[0]
    session = Session(
        first_visit.date, first_visit.ip, averageTime, totalTime
    )
    try:
        # The condition makes the write fail when the item does not exist yet.
        self.client.put_item(
            TableName=self.tableName,
            Item=session.toItem(),
            ConditionExpression='attribute_exists(PK)'
        )
        self.addVisits(visits)
    except ClientError as e:
        if print_error:
            print(f'ERROR updateSession: { e }')
        error_code = e.response['Error']['Code']
        if error_code == 'ConditionalCheckFailedException':
            return {'error': f'Session not in table { session }'}
        return {'error': 'Could not update session in table'}
    else:
        return {'session': session, 'visits': visits}
def test_default_init():
    '''Checks that the constructor stores its arguments.

    The start time is passed as a string and compared against the datetime
    parsed from that same string.
    '''
    subject = Session(session_start, visitor_id, avg_time, total_time)
    parsed_start = datetime.datetime.strptime(
        session_start, '%Y-%m-%dT%H:%M:%S.%fZ'
    )
    assert subject.sessionStart == parsed_start
    assert subject.id == visitor_id
    assert subject.avgTime == avg_time
    assert subject.totalTime == total_time
def addNewSession(self, visitor, browsers, visits):
    '''Stores a new session, with its browsers and visits, for a visitor.

    Parameters
    ----------
    visitor : Visitor
        The returning visitor. Their number of sessions is incremented.
    browsers : list[ Browser ]
        The visitor's browsers to be added to the table.
    visits : list[ Visit ]
        The visits to be added to the table. The session is built from them.

    Returns
    -------
    result : dict
        Either the first error reported by one of the table operations, or
        the updated visitor, the browsers, the visits, and the new session.
    '''
    increment = self.incrementVisitorSessions(visitor)
    if 'error' in increment.keys():
        return {'error': increment['error']}
    visitor = increment['visitor']
    browser_result = self.addBrowsers(browsers)
    if 'error' in browser_result.keys():
        return {'error': browser_result['error']}
    # Collect the seconds per page only from the visits that recorded them.
    recorded_times = []
    for entry in visits:
        if isinstance(entry.timeOnPage, float):
            recorded_times.append(entry.timeOnPage)
    # Without any page times there is no average; a single page time is used
    # as-is rather than being passed through np.mean.
    if not recorded_times:
        averageTime = None
    elif len(recorded_times) == 1:
        averageTime = recorded_times[0]
    else:
        averageTime = np.mean(recorded_times)
    # A single visit has no measurable span, so there is no total time.
    totalTime = (
        None if len(visits) == 1
        else (visits[-1].date - visits[0].date).total_seconds()
    )
    first_visit = visits[0]
    session = Session(
        first_visit.date, first_visit.ip, averageTime, totalTime
    )
    # Write the session first, then its visits; stop at the first failure.
    session_result = self.addSession(session)
    if 'error' in session_result.keys():
        return {'error': session_result['error']}
    visits_result = self.addVisits(visits)
    if 'error' in visits_result.keys():
        return {'error': visits_result['error']}
    return {
        'visitor': visitor,
        'browsers': browsers,
        'visits': visits,
        'session': session
    }
def processDF( key, s3_client ):
    '''Reads a delimited text object from S3 and parses its contents.

    The object is read with pandas.read_csv (not as parquet), using a comma
    followed by a tab as the separator. Consecutive rows that share a title
    are grouped into one visit.

    Parameters
    ----------
    key : str
        The key of the object in the S3 bucket.
    s3_client : S3Client
        The S3 client used to get the object.

    Returns
    -------
    result : dict
        The visits, session, and browsers parsed from the file, under the
        keys 'visits', 'session', and 'browsers'.
    '''
    request = s3_client.getObject( key )
    # Read the object body as a pandas DF. The 'process' column is named but
    # not kept (it is absent from usecols); 'time' becomes the index.
    df = pd.read_csv(
        io.BytesIO( request['Body'].read() ),
        sep = ',\t',
        engine = 'python',
        names = [
            'process', 'id', 'time', 'title', 'slug', 'userAgent', 'width',
            'height', 'x', 'y'
        ],
        usecols = [
            'id', 'time', 'title', 'slug', 'userAgent', 'width', 'height',
            'x', 'y'
        ],
        index_col = 'time'
    )
    df = df.drop_duplicates().sort_index()
    # The index labels at which the title differs from the previous row, i.e.
    # where a new page visit starts.
    index_change = df.ne( df.shift() ).apply(
        lambda x: x.index[x].tolist()
    ).title
    # Pair each start label with a stop label: one less than the next start,
    # or the last index label of the DF for the final visit.
    # NOTE(review): the "- 1" assumes a numeric time index -- confirm.
    indexes = [
        ( index_change[index], index_change[index + 1] - 1 )
        if index != len( index_change ) - 1
        else (index_change[index], df.tail(1).index[0])
        for index in range( len( index_change ) )
    ]
    visits = []
    for ( start, stop ) in indexes:
        # Label-based slice holding the rows of this visit.
        temp = df.loc[ start: stop ]
        # NOTE(review): the argument roles below follow the order used when a
        # Visit is built elsewhere (visitor id, date, user, title, slug,
        # session start, scroll events, time on page) -- confirm against Visit.
        visits.append( Visit(
            temp.id.unique()[0],
            formatEpoch( temp.iloc[[0]].index[0] ),
            '0',
            temp.title.unique()[0],
            temp.slug.unique()[0],
            formatEpoch( temp.iloc[[0]].index[0] ),
            # One entry per row, keyed by the formatted index label.
            {
                formatEpoch( index ): { 'x': row.x, 'y': row.y }
                for index, row in temp.iterrows()
            },
            # Last index label minus the first, divided by 1000
            # (presumably milliseconds to seconds -- TODO confirm).
            ( temp.iloc[[-1]].index[0] - temp.iloc[[0]].index[0] ) / 1000
        ) )
    # Every visit in the file shares the date of the first visit as its
    # session start.
    for visit in visits:
        visit.sessionStart=visits[0].date
    # Link each visit to the title and slug of the visit that follows it.
    for index in range( 1, len( visits ) ):
        visits[index - 1].nextTitle = visits[index].title
        visits[index - 1].nextSlug = visits[index].slug
    # Link each visit to the title and slug of the visit that precedes it.
    for index in range( len( visits ) - 1 ):
        visits[index + 1].prevTitle = visits[index].title
        visits[index + 1].prevSlug = visits[index].slug
    # The session uses the mean and the sum of the visits' time on page.
    session = Session(
        visits[0].sessionStart,
        df.id.unique()[0],
        np.mean( [ visit.timeOnPage for visit in visits ] ),
        np.sum( [ visit.timeOnPage for visit in visits ] )
    )
    # One Browser per unique (userAgent, height, width) combination. The last
    # argument is the formatted index label of the first row matching the
    # height and width.
    # NOTE(review): that lookup does not filter on userAgent -- confirm this
    # is intended.
    browsers = [
        Browser(
            df.id.unique()[0],
            row.userAgent,
            row.width,
            row.height,
            formatEpoch(
                df.loc[
                    ( df['height'] == row.height ) &
                    ( df['width'] == row.width )
                ].head(1).index[0]
            )
        )
        for index, row in df.groupby(
            ['userAgent','height','width']
        ).size().reset_index().rename( columns={0:'count'} ).iterrows()
    ]
    return{ 'visits': visits, 'session': session, 'browsers': browsers }
def test_pk():
    '''Checks the value returned by pk() for a session keyed by IP address.'''
    expected = {'S': 'VISITOR#0.0.0.0'}
    subject = Session(
        datetime.datetime(2020, 1, 1, 0, 0), '0.0.0.0', 0.1, 0.1
    )
    assert subject.pk() == expected
def test_gsi2pk():
    '''Checks that gsi2pk() combines the visitor ID and the session start.'''
    subject = Session(session_start, visitor_id, avg_time, total_time)
    expected = {'S': f'SESSION#{ visitor_id }#{ session_start }'}
    assert subject.gsi2pk() == expected
def test_default_init():
    '''Checks that a string start time is stored as the matching datetime.'''
    subject = Session('2020-01-01T00:00:00.000Z', '0.0.0.0', 0.1, 0.1)
    expected_start = datetime.datetime(2020, 1, 1, 0, 0)
    assert subject.sessionStart == expected_start
    assert subject.ip == '0.0.0.0'
    # The average and total time were both given as 0.1.
    assert subject.avgTime == 0.1
    assert subject.totalTime == 0.1
def test_pk():
    '''Checks the value returned by pk() for a session keyed by visitor ID.'''
    subject = Session(session_start, visitor_id, avg_time, total_time)
    expected = {'S': f'VISITOR#{ visitor_id }'}
    assert subject.pk() == expected
def test_repr():
    '''Checks that repr() shows the IP address and the total time.'''
    subject = Session(
        datetime.datetime(2020, 1, 1, 0, 0), '0.0.0.0', 0.1, 0.1
    )
    rendered = repr(subject)
    assert rendered == '0.0.0.0 - 0.1'
def test_gsi2pk():
    '''Checks that gsi2pk() combines the IP address and the session start.'''
    expected = {'S': 'SESSION#0.0.0.0#2020-01-01T00:00:00.000Z'}
    subject = Session(
        datetime.datetime(2020, 1, 1, 0, 0), '0.0.0.0', 0.1, 0.1
    )
    assert subject.gsi2pk() == expected
def session():
    '''Builds a Session from the shared sample values.

    The start time, visitor ID, average time, and total time are read from
    names defined outside this function.
    '''
    arguments = (session_start, visitor_id, avg_time, total_time)
    return Session(*arguments)
def s3_processor(event, context):
    """Parses the S3 object named in an event and writes its data to the table.

    The first record of the event identifies the object, which ``processDF``
    parses into browsers, visits, and a session. A visitor that is not in the
    table is added together with all of the parsed data. For a known visitor,
    the record is merged into their last session when its first visit is
    within 30 minutes of that session's last visit; otherwise it is stored as
    an additional session.

    Args:
        event (dict): The S3 event. Only ``event['Records'][0]`` is read: its
            ``awsRegion``, its bucket name, and its URL-encoded object key.
        context (object): Not used by this function.

    Returns:
        dict: ``statusCode`` 200 and a JSON-encoded ``body`` that reports how
            many sessions were updated, new, or additional.
    """
    new = 0
    updated = 0
    additional = 0
    # Get the necessary data from the S3 event.
    s3_record = event['Records'][0]
    key = urllib.parse.unquote_plus(
        s3_record['s3']['object']['key'], encoding='utf-8'
    )
    aws_region = s3_record['awsRegion']
    bucket_name = s3_record['s3']['bucket']['name']
    # Create the necessary clients
    dynamo_client = DynamoClient(os.environ['TABLE_NAME'], aws_region)
    s3_client = S3Client(bucket_name, aws_region)
    # Parse the record to get the browsers, visits, and session.
    record = processDF(key, s3_client)
    # Get the visitor from the table
    visitor_details = dynamo_client.getVisitorDetails(
        Visitor(record['session'].id)
    )
    # Add the visitor, visits, session, and browsers if the visitor is not in
    # the table.
    if 'visitor' not in visitor_details:
        dynamo_client.addVisitor(Visitor(record['session'].id))
        dynamo_client.addSession(record['session'])
        dynamo_client.addVisits(record['visits'])
        dynamo_client.addBrowsers(record['browsers'])
        new += 1
    # Check to see if the last session can be combined with the one in this
    # record.
    else:
        last_session = visitor_details['sessions'][-1]
        last_sessions_visits = [
            visit for visit in visitor_details['visits']
            if visit.sessionStart == last_session.sessionStart
        ]
        previous_visit = last_sessions_visits[-1]
        first_visit = record['visits'][0]
        # The gap is taken as an absolute value. Subtracting the later first
        # visit from the earlier last visit is negative for records that
        # arrive in order, which made every record look closer than 30
        # minutes and left the "additional session" branch unreachable.
        gap_seconds = abs(
            (first_visit.date - previous_visit.date).total_seconds()
        )
        # Combine the visits and update the session when the last session was
        # less than 30 minutes from this record.
        if gap_seconds < 60 * 30:
            # Update all of the record's visits with the previous session
            # start.
            for visit in record['visits']:
                visit.sessionStart = last_session.sessionStart
            # Update the last visit of the last session when the first visit
            # of the record is the last page visited in the previous session.
            if previous_visit.title == first_visit.title:
                timestamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
                updated_visit = Visit(
                    previous_visit.id,  # visitor_id
                    previous_visit.date,  # date
                    previous_visit.user,  # user
                    previous_visit.title,  # title
                    previous_visit.slug,  # slug
                    previous_visit.sessionStart,  # sessionStart
                    {
                        **previous_visit.scrollEvents,
                        **first_visit.scrollEvents
                    },  # scrollEvents
                    (
                        # The total time on the updated page is the last
                        # scroll event on the record's first visit minus the
                        # first scroll event of the last visit of the session
                        # to update.
                        datetime.datetime.strptime(
                            list(first_visit.scrollEvents.keys())[-1],
                            timestamp_format
                        ) - datetime.datetime.strptime(
                            list(previous_visit.scrollEvents.keys())[0],
                            timestamp_format
                        )
                    ).total_seconds(),  # timeOnPage
                    previous_visit.prevTitle,  # prevTitle
                    previous_visit.prevSlug,  # prevSlug
                    first_visit.nextTitle,  # nextTitle
                    first_visit.nextSlug  # nextSlug
                )
                visits_to_update = [updated_visit] + record['visits'][1:] + \
                    last_sessions_visits[:-1]
            else:
                visits_to_update = record['visits'] + last_sessions_visits
            # Update all of the visits in the record to have the session
            dynamo_client.updateVisits(visits_to_update)
            dynamo_client.addBrowsers(record['browsers'])
            # The visits were already written above, so an empty list is
            # passed alongside the recalculated session.
            dynamo_client.updateSession(
                Session(
                    last_session.sessionStart,  # Start date-time
                    last_session.id,  # Visitor ID
                    np.mean(
                        [visit.timeOnPage for visit in visits_to_update]
                    ),  # avgTime
                    np.sum(
                        [visit.timeOnPage for visit in visits_to_update]
                    )  # totalTime
                ),
                []
            )
            updated += 1
        # Add the new session, visits, and browsers when the last session was
        # more than 30 minutes from this record.
        else:
            dynamo_client.addSession(record['session'])
            dynamo_client.addVisits(record['visits'])
            dynamo_client.addBrowsers(record['browsers'])
            additional += 1
    return {
        'statusCode': 200,
        'body': json.dumps(f'updated { updated }\nnew { new }\nadditional {additional}')
    }
def year_session():
    '''Builds a Session that starts at 2020-01-01T00:00:00.000Z.

    It belongs to the shared visitor ID and uses 60.0 for both of its time
    values.
    '''
    arguments = ('2020-01-01T00:00:00.000Z', visitor_id, 60.0, 60.0)
    return Session(*arguments)
def test_datetime_init():
    '''Checks that a datetime start time is stored unchanged.'''
    start = datetime.datetime(2020, 1, 1, 0, 0)
    subject = Session(start, '0.0.0.0', 0.1, 0.1)
    assert subject.sessionStart == datetime.datetime(2020, 1, 1, 0, 0)
    assert subject.ip == '0.0.0.0'
    # The average and total time were both given as 0.1.
    assert subject.avgTime == 0.1
    assert subject.totalTime == 0.1
def session():
    '''Builds a Session for 0.0.0.0 that starts at 2020-01-03T00:00:00.000Z.

    Both of its time values are 60.0.
    '''
    arguments = ('2020-01-03T00:00:00.000Z', '0.0.0.0', 60.0, 60.0)
    return Session(*arguments)
def test_repr():
    '''Checks that repr() shows the visitor ID and the total time.'''
    subject = Session(session_start, visitor_id, avg_time, total_time)
    rendered = repr(subject)
    assert rendered == f'{ visitor_id } - { total_time }'
def year_session():
    '''Builds a Session for 0.0.0.1 that starts at 2020-01-01T00:00:00.000Z.

    Both of its time values are 60.0.
    '''
    arguments = ('2020-01-01T00:00:00.000Z', '0.0.0.1', 60.0, 60.0)
    return Session(*arguments)
def processParquet(key, dynamo_client, s3_client):
    '''Adds the data from a '.parquet' file to the DynamoDB table.

    The file is read into a DataFrame and handled one IP address at a time.
    A visitor that is not in the table is created. For a known visitor, the
    data is skipped when its session already exists, used to update one or
    several existing sessions when those are close enough in time, or added
    as a new session otherwise.

    Parameters
    ----------
    key : str
        The key of the '.parquet' file in the S3 bucket.
    dynamo_client : DynamoClient
        The DynamoDB client used to store the transformed data.
    s3_client : S3Client
        The S3 client used to get the '.parquet' file from.

    Raises
    ------
    Exception
        Any exception raised while processing is printed and re-raised.
    '''
    try:
        request = s3_client.getObject(key)
        # Read the parquet file as a pandas DF
        df = pd.read_parquet(io.BytesIO(request['Body'].read()))
        # Get the unique IP addresses
        ips = df['ip'].unique()
        # Iterate over the IP addresses to organize the DF's per visitor
        for ip in ips:
            # Get the visitor details from the table.
            visitor_details = dynamo_client.getVisitorDetails(Visitor(ip))
            # Get the browsers and visits of the specific IP address.
            visitor_dict = processDF(df, ip)
            # When the visitor is not found in the database, the visitor,
            # location, browser, session, and visits must be added to the
            # database.
            if 'error' in visitor_details.keys() \
                and visitor_details['error'] == 'Visitor not in table':
                # Add the new visitor and their data to the table
                _createNewVisitor(ip, visitor_dict['browsers'],
                                  visitor_dict['visits'], dynamo_client)
            # Otherwise, determine whether to add a new session, update a
            # visitor's session, or combine multiple sessions.
            else:
                # Skip the session when the session is already in the table.
                # A throwaway Session is built only to compare keys; its
                # times (0, 0) are not part of the comparison.
                if Session(visitor_dict['visits'][0].date, ip, 0, 0).key() in [
                    session.key() for session in visitor_details['sessions']
                ]:
                    continue
                # Calculate the time deltas of the different sessions and the
                # visitor's first visit.
                # NOTE(review): this evaluates as (first visit - sessionStart)
                # + totalTime. If the intent is the gap since the session
                # ended, the duration would need to be subtracted -- confirm.
                time_deltas = [
                    (
                        visitor_dict['visits'][0].date - \
                        session.sessionStart + \
                        datetime.timedelta( seconds=session.totalTime ) \
                        if session.totalTime is not None \
                        else visitor_dict['visits'][0].date - session.sessionStart
                    ) for session in visitor_details['sessions']
                ]
                # Find all sessions that have the timedelta of less than 30
                # minutes on the same day. Only deltas with zero days and a
                # seconds part strictly between 0 and half an hour qualify.
                sessions_to_update = [
                    visitor_details['sessions'][index]
                    for index in range(len(time_deltas))
                    if time_deltas[index].days < 1 and
                    time_deltas[index].days >= 0 and
                    time_deltas[index].seconds / (60 * 60) < 0.5 and
                    time_deltas[index].seconds > 0
                ]
                # Update the visitor's session when only 1 session is found
                # to be within the timedelta.
                if len(sessions_to_update) == 1:
                    _updateSession(sessions_to_update[0],
                                   visitor_dict['visits'], dynamo_client)
                # Several sessions within the timedelta are handled together.
                elif len(sessions_to_update) > 1:
                    _updateSessions(sessions_to_update,
                                    visitor_dict['visits'], dynamo_client)
                # Create a new session when the time between the last session
                # and the first of these visits is greater than 30 minutes.
                else:
                    _addSessionToVisitor(ip, visitor_dict['visits'],
                                         visitor_dict['browsers'],
                                         dynamo_client)
    except Exception as e:
        # Report the failure, then re-raise it for the caller.
        print(f'ERROR processParquet { e }')
        print(
            f'Error getting object { key } from bucket { s3_client.bucketname }.' + \
            ' Make sure they exist and your bucket is in the same region as ' + \
            'this function.'
        )
        raise e
def addNewVisitor(self, visitor, location, browsers, visits):
    '''Stores a new visitor together with everything known about them.

    Parameters
    ----------
    visitor : Visitor
        The visitor to be added to the table.
    location : Location
        The visitor's location to be added to the table.
    browsers : list[ Browser ]
        The visitor's browsers to be added to the table.
    visits : list[ Visit ]
        The visits to be added to the table. The session is built from them.

    Returns
    -------
    result : dict
        Either the first error reported by one of the table operations, or
        the visitor, location, browsers, visits, and the new session.
    '''
    # Write the visitor, location, and browsers in that order, stopping at
    # the first operation that reports an error.
    initial_steps = (
        (self.addVisitor, visitor),
        (self.addLocation, location),
        (self.addBrowsers, browsers),
    )
    for operation, payload in initial_steps:
        outcome = operation(payload)
        if 'error' in outcome.keys():
            return {'error': outcome['error']}
    # Collect the seconds per page only from the visits that recorded them.
    recorded_times = []
    for entry in visits:
        if isinstance(entry.timeOnPage, float):
            recorded_times.append(entry.timeOnPage)
    # Without any page times there is no average; a single page time is used
    # as-is rather than being passed through np.mean.
    if not recorded_times:
        averageTime = None
    elif len(recorded_times) == 1:
        averageTime = recorded_times[0]
    else:
        averageTime = np.mean(recorded_times)
    # A single visit has no measurable span, so there is no total time.
    totalTime = (
        None if len(visits) == 1
        else (visits[-1].date - visits[0].date).total_seconds()
    )
    first_visit = visits[0]
    session = Session(
        first_visit.date, first_visit.ip, averageTime, totalTime
    )
    # Write the session first, then its visits; stop at the first failure.
    session_result = self.addSession(session)
    if 'error' in session_result.keys():
        return {'error': session_result['error']}
    visits_result = self.addVisits(visits)
    if 'error' in visits_result.keys():
        return {'error': visits_result['error']}
    return {
        'visitor': visitor,
        'location': location,
        'browsers': browsers,
        'visits': visits,
        'session': session
    }
def session():
    '''Builds a Session from the shared sample values.

    The start time, visitor ID, average time, and total time are read from
    names defined outside this function.
    '''
    arguments = (session_start, visitor_id, avg_time, total_time)
    return Session(*arguments)
def session():
    '''Builds a Session for 0.0.0.0 that starts at 2020-01-01T00:00:00.000Z.

    Both of its time values are 60.0.
    '''
    arguments = ('2020-01-01T00:00:00.000Z', '0.0.0.0', 60.0, 60.0)
    return Session(*arguments)