Ejemplo n.º 1
def test_itemToSession():
    '''Round-trips a Session through toItem() and itemToSession().'''
    original = Session(datetime.datetime(2020, 1, 1, 0, 0), '', 0.1, 0.1)
    item = original.toItem()
    rebuilt = itemToSession(item)
    # Every attribute has to survive the round trip unchanged.
    assert rebuilt.sessionStart == original.sessionStart
    assert rebuilt.ip == original.ip
    assert rebuilt.avgTime == original.avgTime
    assert rebuilt.totalTime == original.totalTime
Ejemplo n.º 2
def test_toItem():
    '''Checks the item produced by toItem() for a session with an empty key.

    The session starts at 2020-01-01 00:00 and uses 0.1 for both the average
    and the total time.
    '''
    session = Session(datetime.datetime(2020, 1, 1, 0, 0), '', 0.1, 0.1)
    assert session.toItem() == {
        'PK': {
            'S': 'VISITOR#'
        },
        'SK': {
            'S': 'SESSION#2020-01-01T00:00:00.000Z'
        },
        'GSI2PK': {
            'S': 'SESSION#'
        },
        'GSI2SK': {
            'S': '#SESSION'
        },
        'Type': {
            'S': 'session'
        },
        'AverageTime': {
            'N': '0.1'
        },
        'TotalTime': {
            'N': '0.1'
        }
    }
Ejemplo n.º 3
def test_itemToSession():
    '''Round-trips the sample Session through toItem() and itemToSession().'''
    original = Session(session_start, visitor_id, avg_time, total_time)
    item = original.toItem()
    rebuilt = itemToSession(item)
    # Every attribute has to survive the round trip unchanged.
    assert rebuilt.sessionStart == original.sessionStart
    assert rebuilt.id == original.id
    assert rebuilt.avgTime == original.avgTime
    assert rebuilt.totalTime == original.totalTime
Ejemplo n.º 4
def test_toItem():
    '''Checks the item produced by toItem() for the sample session.

    The expected item is built from the module-level sample values
    (session_start, visitor_id, avg_time, total_time).
    '''
    session = Session(session_start, visitor_id, avg_time, total_time)
    assert session.toItem() == {
        'PK': {
            'S': f'VISITOR#{ visitor_id }'
        },
        'SK': {
            'S': f'SESSION#{ session_start }'
        },
        'GSI2PK': {
            'S': f'SESSION#{ visitor_id }#{ session_start }'
        },
        'GSI2SK': {
            'S': '#SESSION'
        },
        'Type': {
            'S': 'session'
        },
        'AverageTime': {
            'N': str(avg_time)
        },
        'TotalTime': {
            'N': str(total_time)
        }
    }
Ejemplo n.º 5
def test_key():
    '''Checks that key() returns the PK/SK pair for the sample session.'''
    session = Session(session_start, visitor_id, avg_time, total_time)
    assert session.key() == {
        'PK': {
            'S': f'VISITOR#{ visitor_id }'
        },
        'SK': {
            'S': f'SESSION#{ session_start }'
        }
    }
Ejemplo n.º 6
def test_gsi2():
    '''Checks that gsi2() returns the GSI2PK/GSI2SK pair for an empty key.'''
    session = Session(datetime.datetime(2020, 1, 1, 0, 0), '', 0.1, 0.1)
    assert session.gsi2() == {
        'GSI2PK': {
            'S': 'SESSION#'
        },
        'GSI2SK': {
            'S': '#SESSION'
        }
    }
Ejemplo n.º 7
def test_gsi2():
    '''Checks that gsi2() returns the GSI2PK/GSI2SK pair for the sample session.'''
    session = Session(session_start, visitor_id, avg_time, total_time)
    assert session.gsi2() == {
        'GSI2PK': {
            'S': f'SESSION#{ visitor_id }#{ session_start }'
        },
        'GSI2SK': {
            'S': '#SESSION'
        }
    }
Ejemplo n.º 8
def test_key():
    '''Checks that key() returns the PK/SK pair for a session with an empty key.'''
    session = Session(datetime.datetime(2020, 1, 1, 0, 0), '', 0.1, 0.1)
    assert session.key() == {
        'PK': {
            'S': 'VISITOR#'
        },
        'SK': {
            'S': 'SESSION#2020-01-01T00:00:00.000Z'
        }
    }
Ejemplo n.º 9
    # NOTE(review): this listing looks truncated -- the docstring is never
    # closed, the list comprehension has no closing bracket, the `else:` and
    # `try:` lines are absent, and no table write is visible before the
    # `return`. Restore the block from the original module before relying on it.
    def updateSession(self, session, visits, print_error=True):
        '''Updates a session with new visits and attributes.

    session : Session
      The session to change the average time on page and the total time on the
    visits : list[ Visit ]
      All of the visits that belong to the session.
        # Reject anything that is not a Session plus a list of Visit objects.
        if not isinstance(session, Session):
            raise ValueError('Must pass a Session object')
        if not isinstance(visits, list):
            raise ValueError('Must pass a list of Visit objects')
        if not all([isinstance(visit, Visit) for visit in visits]):
            raise ValueError('List of visits must be of Visit type')
        # Get all of the seconds per page visit that exist.
        pageTimes = [
            visit.timeOnPage for visit in visits
            if isinstance(visit.timeOnPage, float)
        # Calculate the average time the visitor spent on the pages. When there are
        # no page times, there is no average time.
        if len(pageTimes) == 1:
            averageTime = pageTimes[0]
        elif len(pageTimes) > 1:
            averageTime = np.mean(pageTimes)
            averageTime = None
        # Calculate the total time spent in this session. When there is only one
        # visit, there is no total time.
        if len(visits) == 1:
            totalTime = None
            totalTime = (visits[-1].date - visits[0].date).total_seconds()
        # The `session` argument is replaced by a new Session built from the
        # first visit's date and ip plus the recomputed times.
        session = Session(visits[0].date, visits[0].ip, averageTime, totalTime)
            return {'session': session, 'visits': visits}
        except ClientError as e:
            # Printing the error is optional; an error dict is returned either way.
            if print_error:
                print(f'ERROR updateSession: { e }')
            # A failed conditional check is reported as the session not being
            # in the table; every other client error gets the generic message.
            if e.response['Error'][
                    'Code'] == 'ConditionalCheckFailedException':
                return {'error': f'Session not in table { session }'}
            return {'error': 'Could not update session in table'}
Ejemplo n.º 10
def test_default_init():
    '''Checks the attributes of a Session built from the sample values.'''
    built = Session(session_start, visitor_id, avg_time, total_time)
    # The start string is expected to be parsed into a datetime.
    expected_start = datetime.datetime.strptime(
        session_start, '%Y-%m-%dT%H:%M:%S.%fZ')
    assert built.sessionStart == expected_start
    assert built.id == visitor_id
    assert built.avgTime == avg_time
    assert built.totalTime == total_time
Ejemplo n.º 11
    def addNewSession(self, visitor, browsers, visits):
        '''Adds a new session to the table for the given visitor.

        The steps run in order and stop at the first one that reports an
        error: increment the visitor's session count, add the browsers, add
        the session computed from the visits, add the visits.

        Parameters
        ----------
        visitor : Visitor
          The returning visitor, passed to ``incrementVisitorSessions``.
        browsers : list[ Browser ]
          The visitor's browsers to be added to the table.
        visits : list[ Visit ]
          The visits to be added to the table. The first and last entries
          supply the session start and the total time.

        Returns
        -------
        result : dict
          Either ``{'error': ...}`` from the first failing step, or the
          updated visitor, the browsers, the visits, and the new session.
        '''
        result = self.incrementVisitorSessions(visitor)
        if 'error' in result:
            return {'error': result['error']}
        visitor = result['visitor']
        result = self.addBrowsers(browsers)
        if 'error' in result:
            return {'error': result['error']}
        # Get all of the seconds per page visit that exist.
        pageTimes = [
            visit.timeOnPage for visit in visits
            if isinstance(visit.timeOnPage, float)
        ]
        # Calculate the average time the visitor spent on the pages. When there are
        # no page times, there is no average time.
        if len(pageTimes) == 1:
            averageTime = pageTimes[0]
        elif len(pageTimes) > 1:
            averageTime = np.mean(pageTimes)
        else:
            averageTime = None
        # Calculate the total time spent in this session. When there is only one
        # visit, there is no total time.
        if len(visits) == 1:
            totalTime = None
        else:
            totalTime = (visits[-1].date - visits[0].date).total_seconds()
        session = Session(visits[0].date, visits[0].ip, averageTime, totalTime)
        result = self.addSession(session)
        if 'error' in result:
            return {'error': result['error']}
        result = self.addVisits(visits)
        if 'error' in result:
            return {'error': result['error']}
        return {
            'visitor': visitor,
            'browsers': browsers,
            'visits': visits,
            'session': session
        }
Ejemplo n.º 12
# NOTE(review): this listing looks truncated -- the docstring is never closed,
# several brackets are left open, and the statements that build the Visit,
# Session and Browser objects are only partly present. The docstring mentions
# both a csv and a '.parquet' file while the code calls pd.read_csv -- confirm
# the real input format against the original module.
def processDF( key, s3_client ):
  '''Reads a raw csv file from S3 and parses the browsers, visits, and sessions.

  key : str
    The key of the '.parquet' file in the S3 bucket.
  s3_client : S3Client
    The S3 client used to get the '.parquet' file from.

  result : dict
    The browsers, visits, and sessions parsed from the file.
  request = s3_client.getObject( key )
  # Parse the downloaded body with pd.read_csv, using the ',\t' separator and
  # the 'time' column as the index.
  df = pd.read_csv(
    io.BytesIO( request['Body'].read() ),
    sep = ',\t', engine = 'python',
    names = [
      'process', 'id', 'time', 'title', 'slug', 'userAgent', 'width',
      'height', 'x', 'y'
    usecols = [
      'id', 'time', 'title', 'slug', 'userAgent', 'width', 'height', 'x', 'y'
    index_col = 'time'
  # Remove duplicate rows and order the rows by the 'time' index.
  df = df.drop_duplicates().sort_index()
  # NOTE(review): the argument to df.ne( is missing here; presumably this finds
  # the index values where the title changes -- confirm.
  index_change = df.ne(
  ).apply( lambda x: x.index[x].tolist() ).title
  # Pair each change index with the value just before the next change; the
  # last pair ends at the final index of the DataFrame.
  indexes = [
    ( index_change[index], index_change[index + 1] - 1 )
      if index != len( index_change ) - 1
    else (index_change[index], df.tail(1).index[0])
    for index in  range( len( index_change ) )
  visits = []
  for ( start, stop ) in indexes:
    temp = df.loc[ start: stop ]
        formatEpoch( temp.iloc[[0]].index[0] ),
        formatEpoch( temp.iloc[[0]].index[0] ),
          formatEpoch( index ): { 'x': row.x, 'y': row.y }
          for index, row in temp.iterrows()
        # Last index minus first index of the slice, divided by 1000.
        ( temp.iloc[[-1]].index[0] - temp.iloc[[0]].index[0] ) / 1000
  # NOTE(review): the body of this loop is missing from the listing.
  for visit in visits:
  # Give every visit the title and slug of the visit that follows it.
  for index in range( 1, len( visits ) ):
    visits[index - 1].nextTitle = visits[index].title
    visits[index - 1].nextSlug = visits[index].slug
  # Give every visit the title and slug of the visit that precedes it.
  for index in range( len( visits ) - 1 ):
    visits[index + 1].prevTitle = visits[index].title
    visits[index + 1].prevSlug = visits[index].slug
  # The visible Session arguments are the mean and the sum of timeOnPage over
  # all visits; the leading arguments are missing from the listing.
  session = Session(
    np.mean( [ visit.timeOnPage for visit in visits ] ),
    np.sum( [ visit.timeOnPage for visit in visits ] )
  browsers = [
          ( df['height'] == row.height ) & ( df['width'] == row.width )
    for index, row in df.groupby(
  return{ 'visits': visits, 'session': session, 'browsers': browsers }
Ejemplo n.º 13
def test_pk():
    '''Checks the partition key of a session with an empty visitor key.'''
    expected = {'S': 'VISITOR#'}
    subject = Session(datetime.datetime(2020, 1, 1, 0, 0), '', 0.1, 0.1)
    assert subject.pk() == expected
Ejemplo n.º 14
def test_gsi2pk():
    '''Checks that gsi2pk() combines the visitor id and the session start.'''
    session = Session(session_start, visitor_id, avg_time, total_time)
    assert session.gsi2pk() == {
        'S': f'SESSION#{ visitor_id }#{ session_start }'
    }
Ejemplo n.º 15
def test_default_init():
    '''Checks the attributes of a Session built from a start string.'''
    built = Session('2020-01-01T00:00:00.000Z', '', 0.1, 0.1)
    # The start string is expected to be parsed into a datetime.
    expected_start = datetime.datetime(2020, 1, 1, 0, 0)
    assert built.sessionStart == expected_start
    assert built.ip == ''
    assert built.avgTime == 0.1
    assert built.totalTime == 0.1
Ejemplo n.º 16
def test_pk():
    '''Checks the partition key of the sample session.'''
    expected = {'S': f'VISITOR#{ visitor_id }'}
    subject = Session(session_start, visitor_id, avg_time, total_time)
    assert subject.pk() == expected
Ejemplo n.º 17
def test_repr():
    '''Checks repr() of a session with an empty key and a total time of 0.1.'''
    subject = Session(datetime.datetime(2020, 1, 1, 0, 0), '', 0.1, 0.1)
    rendered = repr(subject)
    assert rendered == ' - 0.1'
Ejemplo n.º 18
def test_gsi2pk():
    '''Checks gsi2pk() for a session with an empty key.'''
    session = Session(datetime.datetime(2020, 1, 1, 0, 0), '', 0.1, 0.1)
    assert session.gsi2pk() == {
        'S': 'SESSION#'
    }
Ejemplo n.º 19
def session():
  '''Builds a Session from the module-level sample values.'''
  sample = Session( session_start, visitor_id, avg_time, total_time )
  return sample
Ejemplo n.º 20
# NOTE(review): this listing looks truncated -- the docstring delimiters, the
# `else:` lines, several closing parentheses and parts of the Visit / Session
# construction are missing. Restore the block from the original module before
# relying on it; the comments below describe only what is visible.
def s3_processor(event, context):

      event ([type]): [description]
      context ([type]): [description]

      [type]: [description]
  # Counters reported in the response body.
  new = 0
  updated = 0
  additional = 0
  # Get the necessary data from the S3 event. Only the first record of the
  # event is read.
  key = urllib.parse.unquote_plus(
    event['Records'][0]['s3']['object']['key'], encoding='utf-8'
  aws_region = event['Records'][0]['awsRegion']
  bucket_name = event['Records'][0]['s3']['bucket']['name']
  # Create the necessary clients. The table name comes from the TABLE_NAME
  # environment variable.
  dynamo_client = DynamoClient( os.environ['TABLE_NAME'], aws_region )
  s3_client = S3Client( bucket_name, aws_region )
  # Parse the record to get the browsers, visits, and session.
  record = processDF( key, s3_client )
  # Get the visitor from the table
  visitor_details = dynamo_client.getVisitorDetails( 
    Visitor( record['session'].id ) 
  # Add the visitor, visits, session, and browsers if the visitor is not in
  # the table.
  if not 'visitor' in visitor_details:
    dynamo_client.addVisitor( Visitor( record['session'].id ) )
    dynamo_client.addSession( record['session'] )
    dynamo_client.addVisits( record['visits'] )
    dynamo_client.addBrowsers( record['browsers'] ) 
    new += 1
  # Check to see if the last session can be combined with the one in this
  # record.
    last_session = visitor_details['sessions'][-1]
    last_sessions_visits = [ 
      visit for visit in visitor_details['visits'] 
      if visit.sessionStart == last_session.sessionStart
    # Combine the visits and update the session when the last session was
    # less than 30 minutes from this record,
    # NOTE(review): the stored visit's date is the minuend here. If the record
    # is newer than the stored visit the difference is negative and therefore
    # always below the threshold -- confirm the operand order.
    if (
        last_sessions_visits[-1].date - record['visits'][0].date
      ).total_seconds() < 60 * 30
      # Update all of the record's with the previous session start
      for visit in record['visits']:
        visit.sessionStart = last_session.sessionStart
      # Update the last visit of the last session when the first visit of
      # the record is the last page visited in the previous session.
      if ( last_sessions_visits[-1].title == record['visits'][0].title ):
        updated_visit = Visit(
          last_sessions_visits[-1].id, # visitor_id
          last_sessions_visits[-1].date, # date
          last_sessions_visits[-1].user, # user
          last_sessions_visits[-1].title, # title
          last_sessions_visits[-1].slug, # slug
          last_sessions_visits[-1].sessionStart, # sessionStart
          }, # scrollEvents
            # The total time on the updated page is the last scroll
            # event on the record's first visit minus the first
            # scroll event of the last visit of the session to
            # update.
            ) - datetime.datetime.strptime(
          ).total_seconds(), #timeOnPage
          last_sessions_visits[-1].prevTitle, # prevTitle
          last_sessions_visits[-1].prevSlug, # prevSlug
          record['visits'][0].nextTitle, # nextTitle
          record['visits'][0].nextSlug # nextSlug
        # NOTE(review): the continuation of the first assignment and the
        # `else:` that presumably guards the second one are missing.
        visits_to_update = [ updated_visit ] + record['visits'][1:] + \
        visits_to_update = record['visits'] + last_sessions_visits
      # Update all of the visits in the record to have the session
      dynamo_client.updateVisits( visits_to_update )
      dynamo_client.addBrowsers( record['browsers'] ) 
          # The arguments below keep the previous session's start and visitor
          # id and recompute the times over the combined visits.
          last_session.sessionStart, # Start date-time
          last_session.id, # Visitor ID
          np.mean( [
            visit.timeOnPage for visit in visits_to_update
          ] ), # avgTime
          np.sum( [
            visit.timeOnPage for visit in visits_to_update
          ] ) # totalTime
      updated += 1
    # Add the new session, visits, and browsers when the last session was
    # more than 30 minutes from this record.
      dynamo_client.addSession( record['session'] )
      dynamo_client.addVisits( record['visits'] )
      dynamo_client.addBrowsers( record['browsers'] ) 
      additional += 1
  # Report the three counters as a JSON-encoded string with status 200.
  return {
    'statusCode': 200,
    'body': json.dumps(f'updated { updated }\nnew { new }\nadditional {additional}')
Ejemplo n.º 21
def year_session():
  '''Builds a Session starting 2020-01-01 with 60.0 for both time values.'''
  start = '2020-01-01T00:00:00.000Z'
  return Session( start, visitor_id, 60.0, 60.0 )
Ejemplo n.º 22
def test_datetime_init():
    '''Checks the attributes of a Session built from a datetime start.'''
    start = datetime.datetime(2020, 1, 1, 0, 0)
    built = Session(start, '', 0.1, 0.1)
    # The expected start is constructed separately, as an equal datetime.
    assert built.sessionStart == datetime.datetime(2020, 1, 1, 0, 0)
    assert built.ip == ''
    assert built.avgTime == 0.1
    assert built.totalTime == 0.1
Ejemplo n.º 23
def session():
    '''Builds a Session starting 2020-01-03 with an empty key and 60.0 times.'''
    start = '2020-01-03T00:00:00.000Z'
    return Session(start, '', 60.0, 60.0)
Ejemplo n.º 24
def test_repr():
    '''Checks that repr() is the visitor id and total time joined by " - ".'''
    subject = Session(session_start, visitor_id, avg_time, total_time)
    rendered = repr(subject)
    assert rendered == f'{ visitor_id } - { total_time }'
Ejemplo n.º 25
def year_session():
    '''Builds a Session starting 2020-01-01 with an empty key and 60.0 times.'''
    start = '2020-01-01T00:00:00.000Z'
    return Session(start, '', 60.0, 60.0)
Ejemplo n.º 26
# NOTE(review): this listing looks truncated -- the docstring is never closed,
# the `try:` / `else:` lines are missing, and several comprehensions and calls
# have lost their first or last lines. Restore the block from the original
# module before relying on it; the comments below describe only what is visible.
def processParquet(key, dynamo_client, s3_client):
    '''Adds the data from a '.parquet' file to the DynamoDB table.

  key : str
    The key of the '.parquet' file in the S3 bucket.
  dynamo_client : DynamoClient
    The DynamoDB client used to store the transformed data.
  s3_client : S3Client
    The S3 client used to get the '.parquet' file from.
        request = s3_client.getObject(key)
        # Read the parquet file as a pandas DF
        df = pd.read_parquet(io.BytesIO(request['Body'].read()))
        # Get the unique IP addresses
        ips = df['ip'].unique()
        # Iterate over the IP addresses to organize the DF's per visitor
        for ip in ips:
            # Get the visitor details from the table.
            visitor_details = dynamo_client.getVisitorDetails(Visitor(ip))
            # Get the browsers and visits of the specific IP address.
            visitor_dict = processDF(df, ip)
            # When the visitor is not found in the database, the visitor, location,
            # browser, session, and visits must be added to the database.
            if 'error' in visitor_details.keys() \
              and visitor_details['error'] == 'Visitor not in table':
                # Add the new visitor and their data to the table
                _createNewVisitor(ip, visitor_dict['browsers'],
                                  visitor_dict['visits'], dynamo_client)
            # Otherwise, determine whether to add a new session, update a visitor's
            # session, or combine multiple sessions.
                # Skip the session when the session is already in the table.
                # The probe Session uses the first visit's date and the ip;
                # only its key() is compared.
                if Session(visitor_dict['visits'][0].date, ip, 0, 0).key() in [
                        for session in visitor_details['sessions']
                # Calculate the time deltas of the different sessions and the visitor's
                # first visit.
                # A session without a totalTime contributes only the distance
                # to its start.
                time_deltas = [
                    visitor_dict['visits'][0].date - \
                    session.sessionStart + \
                    datetime.timedelta( seconds=session.totalTime ) \
                      if session.totalTime is not None \
                      else visitor_dict['visits'][0].date - session.sessionStart
                  for session in visitor_details['sessions']
                # Find all sessions that have the timedelta of less than 30 minutes on
                # the same day.
                sessions_to_update = [
                    for index in range(len(time_deltas))
                    if time_deltas[index].days < 1 and time_deltas[index].days
                    >= 0 and time_deltas[index].seconds /
                    (60 * 60) < 0.5 and time_deltas[index].seconds > 0
                # Update the visitor's session when only 1 session is found to be
                # within the timedelta.
                if len(sessions_to_update) == 1:
                                   visitor_dict['visits'], dynamo_client)
                # More than one matching session is handed to _updateSessions.
                elif len(sessions_to_update) > 1:
                    _updateSessions(sessions_to_update, visitor_dict['visits'],
                # Create a new session when the time between the last session and the
                # first of these visits is greater than 30 minutes.
                    _addSessionToVisitor(ip, visitor_dict['visits'],
    # Any exception is printed and then re-raised to the caller.
    except Exception as e:
        print(f'ERROR processParquet { e }')
          f'Error getting object { key } from bucket { s3_client.bucketname }.' + \
            ' Make sure they exist and your bucket is in the same region as ' + \
            'this function.'
        raise e
Ejemplo n.º 27
    def addNewVisitor(self, visitor, location, browsers, visits):
        '''Adds a new visitor and their details to the table.

        The steps run in order and stop at the first one that reports an
        error: add the visitor, the location, the browsers, the session
        computed from the visits, and finally the visits.

        Parameters
        ----------
        visitor : Visitor
          The visitor to be added to the table.
        location : Location
          The visitor's location to be added to the table.
        browsers : list[ Browser ]
          The visitor's browsers to be added to the table.
        visits : list[ Visit ]
          The visits to be added to the table. The first and last entries
          supply the session start and the total time.

        Returns
        -------
        result : dict
          Either ``{'error': ...}`` from the first failing step, or the
          visitor, location, browsers, visits, and the new session.
        '''
        result = self.addVisitor(visitor)
        if 'error' in result:
            return {'error': result['error']}
        result = self.addLocation(location)
        if 'error' in result:
            return {'error': result['error']}
        result = self.addBrowsers(browsers)
        if 'error' in result:
            return {'error': result['error']}
        # Get all of the seconds per page visit that exist.
        pageTimes = [
            visit.timeOnPage for visit in visits
            if isinstance(visit.timeOnPage, float)
        ]
        # Calculate the average time the visitor spent on the pages. When there are
        # no page times, there is no average time.
        if len(pageTimes) == 1:
            averageTime = pageTimes[0]
        elif len(pageTimes) > 1:
            averageTime = np.mean(pageTimes)
        else:
            averageTime = None
        # Calculate the total time spent in this session. When there is only one
        # visit, there is no total time.
        if len(visits) == 1:
            totalTime = None
        else:
            totalTime = (visits[-1].date - visits[0].date).total_seconds()
        session = Session(visits[0].date, visits[0].ip, averageTime, totalTime)
        result = self.addSession(session)
        if 'error' in result:
            return {'error': result['error']}
        result = self.addVisits(visits)
        if 'error' in result:
            return {'error': result['error']}
        return {
            'visitor': visitor,
            'location': location,
            'browsers': browsers,
            'visits': visits,
            'session': session
        }
Ejemplo n.º 28
def session():
  '''Builds a Session from the module-level sample values.'''
  sample = Session( session_start, visitor_id, avg_time, total_time )
  return sample
Ejemplo n.º 29
def session():
    '''Builds a Session starting 2020-01-01 with an empty key and 60.0 times.'''
    start = '2020-01-01T00:00:00.000Z'
    return Session(start, '', 60.0, 60.0)