def create_location_visit( compute_index, task_period, user_id, activating_event, deactivating_event): ''' Create a record of the start and end of a visit to a URL within a tab. Note that while an `activating_event` will necessarily be associated with the URL and page title for the visited page, the deactivating event may be associated with a different URL and page. ''' LocationVisit.create( compute_index=compute_index, user_id=user_id, task_index=task_period.task_index, concern_index=task_period.concern_index, # While the location events are currently selected based on their "log date" # (when they're detected by the server), we store their start and end time # based on the time reported from the browser (the "visit_date"). We think that # these will best preserve the actual order that each visit occurred, as well # as the time spent on each of the pages, which will be invariant to the time # that it took to upload each event to the server. start=activating_event.visit_date, end=deactivating_event.visit_date, url=activating_event.url, title=activating_event.title, tab_id=activating_event.tab_id, )
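# A minimal usage sketch (not part of the pipeline) of how `create_location_visit`
# pairs two events into one visit record. `FakeEvent` is a hypothetical stand-in
# for a LocationEvent row; only the attributes used above (`visit_date`, `url`,
# `title`, `tab_id`) are assumed.
#
#   FakeEvent = collections.namedtuple(
#       'FakeEvent', ['visit_date', 'url', 'title', 'tab_id'])
#   activating = FakeEvent(
#       datetime.datetime(2000, 1, 1, 12, 0, 1), "http://url1.com", "Page 1", '1')
#   deactivating = FakeEvent(
#       datetime.datetime(2000, 1, 1, 12, 0, 2), "http://url2.com", "Page 2", '1')
#   create_location_visit(
#       compute_index=1, task_period=task_period, user_id=0,
#       activating_event=activating, deactivating_event=deactivating)
#
# The stored visit spans 12:00:01 to 12:00:02 but keeps the URL and title of
# the *activating* event only ("http://url1.com", "Page 1").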
def test_chain_multiple_location_visits_by_activations(self): create_task_period( start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), event_type="Tab activated", url="http://url1.com", tab_id='1', ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), event_type="Tab activated", url="http://url2.com", tab_id='2', ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 3, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 3, 0), event_type="Tab activated", url="http://url3.com", tab_id='3', ) compute_location_visits() visits = LocationVisit.select() self.assertEqual(visits.count(), 2) urls = [visit.url for visit in LocationVisit.select()] self.assertIn("http://url1.com", urls) self.assertIn("http://url2.com", urls)
def test_ignore_consecutive_page_loads_of_same_url(self): create_task_period( start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), event_type="Tab activated", url="http://url1.com", tab_id='1', ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), event_type="Tab content loaded (pageshow)", url="http://url2.com", tab_id='1', ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 3, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 3, 0), event_type="Tab content loaded (ready)", url="http://url2.com", tab_id='1', ) compute_location_visits() visits = LocationVisit.select() self.assertEqual(visits.count(), 1) urls = [visit.url for visit in LocationVisit.select()] self.assertIn("http://url1.com", urls)
def test_window_deactivated_flushes_old_location(self):
    time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
    create_task_period(
        start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
        end=datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
    )
    create_location_event(
        log_date=time,
        visit_date=time,
        event_type="Tab activated",
    )
    create_location_event(
        log_date=time + datetime.timedelta(seconds=1),
        visit_date=time + datetime.timedelta(seconds=1),
        event_type="Window deactivated",
    )
    create_location_event(
        log_date=time + datetime.timedelta(seconds=2),
        visit_date=time + datetime.timedelta(seconds=2),
        event_type="Tab activated",
    )
    compute_location_visits()
    # Make sure that only one visit was created---for the tab that was active
    # when the window was deactivated
    self.assertEqual(LocationVisit.select().count(), 1)
    # Make sure that the visit that was created ended when the window was
    # deactivated, and not when the next tab was activated.
    visits = LocationVisit.select()
    visit = visits[0]
    self.assertEqual(visit.end, datetime.datetime(2000, 1, 1, 12, 0, 2, 0))
def compute_navigation_ngrams(length, page_type_lookup):
    '''
    Compute n-grams of sequences of pages visited, of a certain length.
    A `page_type_lookup` dictionary must be provided that maps URLs to their page types.
    '''
    # Create a new index for this computation
    last_compute_index = NavigationNgram.select(fn.Max(NavigationNgram.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the set of visits from the most recent computation of visits
    visit_compute_index = LocationVisit.select(fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(LocationVisit.compute_index == visit_compute_index)

    # Get the distinct participant IDs and concern indexes
    participant_ids = set([visit.user_id for visit in visits])
    concern_indexes = set([visit.concern_index for visit in visits])

    # Go through every concern for every participant. Assemble the ordered
    # sequence of page types they visited, and compute n-grams over it.
    for participant_id in participant_ids:
        for concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == concern_index,
            ).order_by(LocationVisit.start.asc())

            # Create the ordered list of URLs that the participant visited
            urls = [visit.url for visit in participant_concern_visits]
            standardized_urls = [standardize_url(url) for url in urls]

            # Create a list of all page types visited.
            # If a URL is a redirect, skip it: for all intents and purposes,
            # someone is traveling between the page types before and after it.
            page_types = []
            for url in standardized_urls:
                if url in page_type_lookup:
                    url_info = page_type_lookup[url]
                    if not url_info['redirect']:
                        page_types.append(url_info['main_type'])
                else:
                    logger.warn("URL %s not in page type lookup. Giving it 'Unknown' type", url)
                    page_types.append("Unknown")

            # Compute n-grams using NLTK
            ngrams = nltk_compute_ngrams(page_types, length)

            # Save each n-gram to the database
            for ngram_tuple in ngrams:
                NavigationNgram.create(
                    compute_index=compute_index,
                    user_id=participant_id,
                    concern_index=concern_index,
                    length=length,
                    ngram=", ".join(ngram_tuple),
                )
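# For reference, a quick illustration of the n-grams computed above, assuming
# `nltk_compute_ngrams` wraps NLTK's `nltk.util.ngrams`:
#
#   >>> from nltk.util import ngrams
#   >>> list(ngrams(["Search engine", "Documentation", "Forum"], 2))
#   [('Search engine', 'Documentation'), ('Documentation', 'Forum')]
#
# Each tuple is then joined with ", " before being stored as a NavigationNgram.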
def test_acceptable_activating_location_events(self): time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0) create_task_period( start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), ) for activating_event_type in [ "Tab activated", "Window activated", ]: create_location_event( log_date=time, event_type=activating_event_type, tab_id='1', ) time += datetime.timedelta(seconds=1) create_location_event( log_date=time, event_type="Window deactivated", tab_id='1', ) time += datetime.timedelta(seconds=1) compute_location_visits() self.assertEqual(LocationVisit.select().count(), 2)
def test_make_location_visit_associated_with_tasks_of_same_user(self): create_location_event( user_id=0, log_date=datetime.datetime(2000, 1, 1, 11, 0, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 11, 0, 1, 0), event_type="Tab activated", tab_id='1', url="http://url1.com" ) create_location_event( user_id=0, log_date=datetime.datetime(2000, 1, 1, 11, 0, 2, 0), visit_date=datetime.datetime(2000, 1, 1, 11, 0, 2, 0), event_type="Tab activated", tab_id='2', url="http://url1.com" ) create_task_period( user_id=1, start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), ) self.assertEqual(LocationVisit.select().count(), 0)
def test_order_events_by_visit_date(self): # We want to handle the case where the server may receive browser events # in a different order than the browser encounters them. So, when we order # the events that we've found for a task, they get ordered in "browser order." # In this test case, we create jumbled server log order, but the browser # order needs to come through for the events created. # Logged first, visited second create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 10, 0, 2, 0), event_type="Tab activated", tab_id='1', url="http://url1.com") # Logged second, visited first create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), visit_date=datetime.datetime(2000, 1, 1, 10, 0, 1, 0), event_type="Tab activated", tab_id='2', url="http://url2.com") create_task_period( start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), task_index=3, concern_index=5, ) compute_location_visits() visits = LocationVisit.select() visit = visits[0] self.assertEqual(visit.url, "http://url2.com")
def test_associate_location_visit_with_task_period_it_occurred_within(self):
    create_location_event(
        log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
        visit_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
        event_type="Tab activated",
        tab_id='1',
        url="http://url1.com"
    )
    create_location_event(
        log_date=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        visit_date=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        event_type="Tab activated",
        tab_id='2',
        url="http://url2.com"
    )
    create_task_period(
        start=datetime.datetime(2000, 1, 1, 11, 0, 0, 0),
        end=datetime.datetime(2000, 1, 1, 11, 59, 0, 0),
        task_index=1,
    )
    create_task_period(
        start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
        end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        task_index=2,
    )
    create_task_period(
        start=datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
        end=datetime.datetime(2000, 1, 1, 12, 4, 0, 0),
        task_index=3,
    )
    compute_location_visits()
    visit = LocationVisit.select()[0]
    self.assertEqual(visit.task_index, 2)
def test_visit_times_based_on_visit_dates_not_log_dates(self): # We've found browsers are rarely synced with the server time. # To preserve the timing as it appeared to the user, we save the # times that they visited each location in the browser. # While we associate visits with tasks based on the logging date (as that # is most likely to match well on the server side), we store all visits # with times seen by the browser. create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 10, 0, 1, 0), event_type="Tab activated", tab_id='1', url="http://url1.com" ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), visit_date=datetime.datetime(2000, 1, 1, 10, 0, 2, 0), event_type="Tab activated", tab_id='2', url="http://url2.com" ) create_task_period( start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), task_index=3, concern_index=5, ) compute_location_visits() visits = LocationVisit.select() visit = visits[0] self.assertEqual(visit.start, datetime.datetime(2000, 1, 1, 10, 0, 1, 0)) self.assertEqual(visit.end, datetime.datetime(2000, 1, 1, 10, 0, 2, 0))
def test_make_no_location_visit_if_it_doesnt_start_after_or_end_before_end_of_task( self): create_location_event( log_date=datetime.datetime(2000, 1, 1, 11, 59, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 11, 59, 1, 0), event_type="Tab activated", tab_id='1', url="http://url1.com") create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 1, 0, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 1, 0, 0), event_type="Tab activated", tab_id='2', url="http://url1.com") create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 3, 0, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 3, 0, 0), event_type="Tab activated", tab_id='1', url="http://url2.com") create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 5, 0, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 5, 0, 0), event_type="Tab activated", tab_id='2', url="http://url2.com") create_task_period( start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), ) self.assertEqual(LocationVisit.select().count(), 0)
def create_location_visit(**kwargs): arguments = { 'compute_index': 0, 'user_id': 0, 'task_index': 1, 'concern_index': 1, 'url': "http://url.com", 'tab_id': 1, 'title': "Title", 'start': datetime.datetime(2000, 1, 1, 12, 0, 1, 0), 'end': datetime.datetime(2000, 1, 1, 12, 0, 2, 0), } arguments.update(kwargs) return LocationVisit.create(**arguments)
def test_ignore_content_loaded_in_other_tabs(self): create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), event_type="Tab activated", url="http://url1.com", tab_id='1', ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), event_type="Tab content loaded (pageshow)", url="http://url2.com", tab_id='2', ) compute_location_visits() visits = LocationVisit.select() self.assertEqual(visits.count(), 0)
def test_by_default_associate_visit_with_latest_computed_task_periods(self): create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), event_type="Tab activated", tab_id='1', url="http://url1.com" ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 6, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 6, 0), event_type="Tab activated", tab_id='2', url="http://url2.com" ) # All three of these tasks have the same (matching) periods. # But the second one was the latest one to be computed (compute_index=2) create_task_period( compute_index=0, start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), task_index=1, ) create_task_period( compute_index=2, start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), task_index=2, ) create_task_period( compute_index=1, start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), task_index=3, ) compute_location_visits() visit = LocationVisit.select()[0] self.assertEqual(visit.task_index, 2)
def test_if_task_compute_index_specified_only_match_tasks_with_that_index(self): create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), event_type="Tab activated", tab_id='1', url="http://url1.com" ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 6, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 6, 0), event_type="Tab activated", tab_id='2', url="http://url2.com" ) create_task_period( compute_index=0, start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), task_index=1, ) create_task_period( compute_index=2, start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), task_index=2, ) create_task_period( compute_index=1, start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), task_index=3, ) # By specifying the task compute index here, we should restrict the # location to match only the task with this compute index. compute_location_visits(task_compute_index=0) visit = LocationVisit.select()[0] self.assertEqual(visit.task_index, 1)
def test_create_location_visit(self): # Setup: create two location events bounding a single visit create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0), event_type="Tab activated", tab_id='1', url="http://url1.com" ) create_location_event( log_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), visit_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0), event_type="Tab activated", tab_id='2', url="http://url2.com" ) create_task_period( start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0), end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0), task_index=3, concern_index=5, ) # Test: make sure a 'visit' is created for a URL that is visited and then left, # that inherits the time bounds defined by entering and exiting the URL, and that includes # the index of the task and concern of the task period that was taking place at that time. compute_location_visits() visits = LocationVisit.select() self.assertEqual(visits.count(), 1) visit = visits[0] self.assertEqual(visit.user_id, 0) self.assertEqual(visit.task_index, 3) self.assertEqual(visit.concern_index, 5) self.assertEqual(visit.start, datetime.datetime(2000, 1, 1, 12, 0, 1, 0)) self.assertEqual(visit.end, datetime.datetime(2000, 1, 1, 12, 0, 2, 0)) self.assertEqual(visit.url, "http://url1.com")
def compute_unique_urls(page_type_lookup, exclude_users=None):

    exclude_users = [] if exclude_users is None else exclude_users

    # Create a new index for this computation
    last_compute_index = UniqueUrl.select(fn.Max(UniqueUrl.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the set of visits from the most recent computation of visits
    visit_compute_index = LocationVisit.select(fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index,
        LocationVisit.user_id.not_in(exclude_users),
    )

    # Get the distinct participant IDs
    participant_ids = set([visit.user_id for visit in visits])

    # Go through every participant. Find the URLs they visited
    # that no one else visited.
    for participant_id in participant_ids:

        participant_visits = visits.where(LocationVisit.user_id == participant_id)
        others_visits = visits.where(LocationVisit.user_id != participant_id)

        # Create a list of standardized URLs that this participant visited
        participant_urls = [visit.url for visit in participant_visits]
        participant_standardized_urls = [standardize_url(url) for url in participant_urls]

        # Create a list of standardized URLs that all other participants visited
        others_urls = [visit.url for visit in others_visits]
        others_standardized_urls = [standardize_url(url) for url in others_urls]

        # Compute the URLs that this participant visited uniquely,
        # and those they share with others
        unique_participant_urls = set(participant_standardized_urls) - set(others_standardized_urls)
        shared_participant_urls = set(participant_standardized_urls) - set(unique_participant_urls)

        # Save all URLs that the participant visited to the database, including
        # whether they visited them uniquely.
        for url in unique_participant_urls:
            UniqueUrl.create(
                compute_index=compute_index,
                user_id=participant_id,
                url=url,
                unique=True,
            )
        for url in shared_participant_urls:
            UniqueUrl.create(
                compute_index=compute_index,
                user_id=participant_id,
                url=url,
                unique=False,
            )
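# The unique/shared split above is plain set arithmetic. An illustrative
# example with made-up URLs:
#
#   >>> participant_urls = {"http://a.com", "http://b.com", "http://c.com"}
#   >>> others_urls = {"http://b.com", "http://c.com", "http://d.com"}
#   >>> sorted(participant_urls - others_urls)   # visited uniquely
#   ['http://a.com']
#   >>> sorted(participant_urls & others_urls)   # shared with others
#   ['http://b.com', 'http://c.com']
#
# Note that `shared = participant - unique`, as computed above, is equivalent
# to the intersection shown here.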
def compute_navigation_graph(page_type_lookup, exclude_users=None,
                             show_progress=False, concern_index=None):

    exclude_users = [] if exclude_users is None else exclude_users

    # Create a new index for this computation
    last_compute_index = NavigationVertex.select(
        fn.Max(NavigationVertex.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the set of visits from the most recent computation of visits
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index)

    # If the caller has provided a concern index to compute the graph for,
    # then restrict navigation data to only that concern
    if concern_index is not None:
        visits = visits.where(LocationVisit.concern_index == concern_index)

    # Get the distinct participant IDs and concern indexes.
    # Exclude any users that were not requested as part of the analysis.
    participant_ids = set([
        visit.user_id for visit in visits
        if visit.user_id not in exclude_users
    ])
    concern_indexes = set([visit.concern_index for visit in visits])

    # Set up the progress bar.
    total_iterations_count = len(participant_ids) * len(concern_indexes)
    if show_progress:
        progress_bar = ProgressBar(
            maxval=total_iterations_count,
            widgets=[
                'Progress: ', Percentage(),
                ' ', Bar(marker=RotatingMarker()),
                ' ', ETA(),
                ' Read ', Counter(),
                ' / ' + str(total_iterations_count) + ' sessions.'
            ])
        progress_bar.start()

    # The list of vertices needs to be populated with a start and an end node.
    # All navigation behavior starts at the "Start" vertex and ends at the "End" vertex.
    vertices = {
        "Start": Vertex("Start", occurrences=1),
        "End": Vertex("End", occurrences=1),
    }
    edges = {}
    last_vertex = vertices["Start"]
    iterations_count = 0

    # Go through every concern for every participant. For each page they visit,
    # increment the visits to the corresponding vertex. For each transition from one
    # page to the next, increment the occurrence of a transition between two page types.
    for participant_id in participant_ids:
        for concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == concern_index,
            ).order_by(LocationVisit.start.asc())

            for visit in participant_concern_visits:

                # Get the type of the page visited
                standardized_url = standardize_url(visit.url)
                if standardized_url in page_type_lookup:
                    url_info = page_type_lookup[standardized_url]
                    page_type = url_info['main_type']
                    # If this is a redirect, then just skip it. It's more important
                    # to link the URL before it to the URL the redirect points to.
                    if url_info['redirect']:
                        continue
                else:
                    logger.warn(
                        "URL %s not in page type lookup. Giving it 'Unknown' type",
                        standardized_url)
                    page_type = "Unknown"

                # Add a new vertex for this page type if it doesn't exist
                if page_type not in vertices:
                    vertices[page_type] = Vertex(page_type)

                # Save that we have seen this page type one more time
                vertex = vertices[page_type]
                vertex.occurrences += 1

                # Add the time spent to the total time spent for this page type
                time_passed = visit.end - visit.start
                seconds = time_passed.seconds + (time_passed.microseconds / float(1000000))
                vertex.total_time += seconds

                # Connect an edge between the last page visited and this one
                if (last_vertex.page_type, vertex.page_type) not in edges:
                    edges[(last_vertex.page_type, vertex.page_type)] = Edge(last_vertex, vertex)
                edge = edges[(last_vertex.page_type, vertex.page_type)]
                edge.occurrences += 1

                # Redefine the last vertex so we know in the next iteration what was just visited
                last_vertex = vertex

            # After each participant's concern, connect from the last vertex to the end vertex
            end_vertex = vertices['End']
            if (last_vertex.page_type, end_vertex.page_type) not in edges:
                edges[(last_vertex.page_type, end_vertex.page_type)] = Edge(last_vertex, end_vertex)
            edge = edges[(last_vertex.page_type, end_vertex.page_type)]
            edge.occurrences += 1

            # After each participant's concern, reset the last vertex to "Start"
            last_vertex = vertices['Start']

            if show_progress:
                iterations_count += 1
                progress_bar.update(iterations_count)

    if show_progress:
        progress_bar.finish()

    # Compute the mean time spent on each vertex
    for vertex in vertices.values():
        vertex.mean_time = vertex.total_time / float(vertex.occurrences)

    # Compute the transition probability for each edge leaving a vertex.
    # First, group all edges by their source vertex.
    get_source_page_type = lambda key: key[0]
    sorted_edge_keys = sorted(edges.keys(), key=get_source_page_type)
    edge_groups = itertools.groupby(sorted_edge_keys, get_source_page_type)

    for _, edge_group in edge_groups:

        # Fetch the edges in the current group
        # (those in the current group share the same source vertex)
        edge_keys = [key for key in edge_group]
        group_edges = {key: edge for key, edge in edges.items() if key in edge_keys}

        # Compute the probability of each edge being taken
        total_occurrences = sum([e.occurrences for e in group_edges.values()])
        for edge in group_edges.values():
            edge.probability = float(edge.occurrences) / total_occurrences

    # Save all vertices to the database
    vertex_models = {}
    for vertex in vertices.values():
        vertex_model = NavigationVertex.create(
            compute_index=compute_index,
            page_type=vertex.page_type,
            occurrences=vertex.occurrences,
            total_time=vertex.total_time,
            mean_time=vertex.mean_time,
        )
        # We store a dictionary from page type to vertex model so
        # we can look up these models when saving the edges.
        vertex_models[vertex.page_type] = vertex_model

    # Save all edges to the database.
    # We use a progress bar for this as there might be a lot of edges and
    # we upload each of them separately to the database.
    if show_progress:
        progress_bar = ProgressBar(maxval=len(edges), widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Updated graph with ', Counter(),
            ' / ' + str(len(edges)) + ' edges.'
        ])
        progress_bar.start()

    for edge_index, edge in enumerate(edges.values(), start=1):
        NavigationEdge.create(
            compute_index=compute_index,
            source_vertex=vertex_models[edge.source_vertex.page_type],
            target_vertex=vertex_models[edge.target_vertex.page_type],
            occurrences=edge.occurrences,
            probability=edge.probability,
        )
        if show_progress:
            progress_bar.update(edge_index)

    if show_progress:
        progress_bar.finish()
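# The edge probabilities above are occurrence counts normalized per source
# vertex. A self-contained sketch of the same idea with plain dicts (the names
# here are illustrative, not the pipeline's own):

def _normalize_outgoing_counts(edge_counts):
    # `edge_counts` maps (source, target) page-type pairs to occurrence
    # counts; the result maps the same pairs to P(target | source).
    totals = {}
    for (source, _), count in edge_counts.items():
        totals[source] = totals.get(source, 0) + count
    return dict(
        (key, float(count) / totals[key[0]])
        for key, count in edge_counts.items()
    )

# Example: of the 4 transitions out of "Start", 3 went to "Search engine":
#   _normalize_outgoing_counts({("Start", "Search engine"): 3,
#                               ("Start", "Documentation"): 1})
#   => {("Start", "Search engine"): 0.75, ("Start", "Documentation"): 0.25}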
def main(page_types_json_filename, *args, **kwargs):

    with open(page_types_json_filename) as page_types_file:
        page_types = json.load(page_types_file)

    # Only dump the most recently computed location visits (ignore all others).
    latest_compute_index = LocationVisit.select(fn.Max(LocationVisit.compute_index)).scalar()
    visits = (
        LocationVisit
        .select()
        .where(
            LocationVisit.compute_index == latest_compute_index,
        )
    )

    # Store a set of URLs for which labels are missing
    urls_without_labels = set()

    for visit in visits:

        # Split the URL into the constituent parts that can be used
        # to uniquely identify this URL in relation to others.
        # Note that while the same URL with different query strings may refer to the same
        # page, this isn't always true. Take the forum PHP script for Panda3D as an example.
        # The same is true of fragments, specifically for Google Groups, where fragments
        # are used to select different groups and topics.
        url_parsed = urlparse(visit.url)
        path = url_parsed.path
        fragment = url_parsed.fragment
        query = url_parsed.query
        # Drop a leading "www." prefix from the domain. (`lstrip("www.")` would
        # be wrong here: it strips any leading 'w' and '.' characters, not the prefix.)
        domain = url_parsed.netloc
        if domain.startswith("www."):
            domain = domain[len("www."):]

        # Fetch semantic labels for this URL.
        # Store missing URLs for non-pilot study participants.
        # Currently, it's not important for us to be able to classify URLs for pilot participants.
        unique_url = standardize_url(visit.url)
        if unique_url not in page_types:
            if visit.user_id > PILOT_MAX_USER_ID:
                urls_without_labels.add(unique_url)
        else:
            page_type = page_types[unique_url]['main_type']
            time_passed = visit.end - visit.start
            seconds = time_passed.seconds + (time_passed.microseconds / float(1000000))
            yield [[
                visit.compute_index,
                visit.user_id,
                visit.task_index,
                visit.concern_index,
                visit.tab_id,
                visit.url,
                unique_url,
                domain,
                path,
                fragment,
                query,
                page_type,
                visit.title,
                visit.start,
                visit.end,
                seconds,
            ]]

    # Print out a list of URLs for which labels were not found
    for url in sorted(urls_without_labels):
        logger.debug("No label found for URL: %s", url)
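# For reference, how `urlparse` decomposes a visit URL in `main` above
# (standard-library behavior, shown with an illustrative URL):
#
#   >>> from urlparse import urlparse  # `from urllib.parse import urlparse` on Python 3
#   >>> parts = urlparse("http://www.panda3d.org/forums/viewtopic.php?t=1#reply")
#   >>> parts.netloc, parts.path, parts.query, parts.fragment
#   ('www.panda3d.org', '/forums/viewtopic.php', 't=1', 'reply')
#
# The "www." prefix is then dropped with a `startswith` check rather than
# `lstrip("www.")`, since `lstrip` removes any leading 'w' and '.' characters
# (e.g. "web.example.com".lstrip("www.") == "eb.example.com").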
def compute_location_visits(task_compute_index=None):

    # Create a new index for this computation
    last_compute_index = LocationVisit.select(fn.Max(LocationVisit.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Determine the compute index of the task periods that these visits are matched to.
    # This defaults to the latest compute index if it hasn't been specified.
    if task_compute_index is None:
        task_compute_index = TaskPeriod.select(fn.Max(TaskPeriod.compute_index)).scalar()

    # Compute the ID of the last user to complete the study
    max_user_id = LocationEvent.select(fn.Max(LocationEvent.user_id)).scalar()

    # Compute the visits that each user made during each task
    for user_id in range(0, max_user_id + 1):

        # Visit all tasks for each user
        for task_index in TASK_RANGE:

            # Fetch the period of time for this task
            task_periods = (
                TaskPeriod.select()
                .where(
                    TaskPeriod.compute_index == task_compute_index,
                    TaskPeriod.task_index == task_index,
                    TaskPeriod.user_id == user_id,
                )
            )
            if task_periods.count() < 1:
                continue
            task_period = task_periods[0]

            # Fetch the events for all locations the user has visited during this task
            location_events = (
                LocationEvent
                .select()
                .where(
                    LocationEvent.user_id == user_id,
                    LocationEvent.log_date >= task_period.start,
                    LocationEvent.log_date <= task_period.end,
                )
                # While we inspect the "log date" when the server received notice of
                # the event, we use the "visit date" when the browser experienced the
                # events to sort them, as we think this will preserve the original
                # ordering much better. See the notes in the `create_location_visit`
                # method for more details.
                .order_by(LocationEvent.visit_date.asc())
            )

            # In the space below, we assemble "visits" from sequences of events.
            # These two variables track which tab is currently active and the
            # event that most recently set that tab's URL.
            active_tab_id = None
            active_tab_latest_url_event = None

            for event in location_events:

                # When a new page is loaded in the current tab, this is the end of the
                # last visit and the start of a new one (that will be in the same tab).
                if event.event_type in NEW_PAGE_EVENTS:
                    if active_tab_id is not None and event.tab_id == active_tab_id:
                        if event.url != active_tab_latest_url_event.url:
                            create_location_visit(
                                compute_index=compute_index,
                                task_period=task_period,
                                user_id=user_id,
                                activating_event=active_tab_latest_url_event,
                                deactivating_event=event,
                            )
                        active_tab_latest_url_event = event

                # If the window has been deactivated, then end the visit in the current tab
                if event.event_type in DEACTIVATING_EVENTS:
                    if active_tab_id is not None:
                        create_location_visit(
                            compute_index=compute_index,
                            task_period=task_period,
                            user_id=user_id,
                            activating_event=active_tab_latest_url_event,
                            deactivating_event=event,
                        )
                        active_tab_id = None
                        active_tab_latest_url_event = None

                # If a tab or window has been activated, that tab is now active.
                if event.event_type in ACTIVATING_EVENTS:

                    # End any visit in progress for another tab
                    if active_tab_id is not None:
                        create_location_visit(
                            compute_index=compute_index,
                            task_period=task_period,
                            user_id=user_id,
                            activating_event=active_tab_latest_url_event,
                            deactivating_event=event,
                        )

                    # Set the new active tab
                    active_tab_id = event.tab_id
                    active_tab_latest_url_event = event
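# For orientation, a sketch of the event-type constants the loop above keys on.
# Their exact membership is defined elsewhere in this module; the values below
# are inferred from the tests in this file and may be incomplete:
#
#   ACTIVATING_EVENTS = ["Tab activated", "Window activated"]
#   DEACTIVATING_EVENTS = ["Window deactivated"]
#   NEW_PAGE_EVENTS = ["Tab content loaded (pageshow)",
#                      "Tab content loaded (ready)"]
#
# Worked example (a single tab): "Tab activated" on url1 starts a visit;
# "Tab content loaded (pageshow)" for url2 in the same tab ends the url1 visit
# and starts one for url2; "Window deactivated" then ends the url2 visit.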