Exemple #1
0
def create_location_visit(
        compute_index, task_period, user_id,
        activating_event, deactivating_event):
    '''
    Save a `LocationVisit` record describing one stay at a URL within a tab.

    The URL, page title and tab ID are all read from `activating_event`.  The
    `deactivating_event` only supplies the time at which the visit ended, as it
    may be associated with a different URL and page.  The task and concern
    indexes are copied from `task_period`.
    '''
    # Location events are currently selected by their "log date" (when the server
    # detected them), but the bounds saved here are the browser-reported
    # "visit_date"s.  The thinking is that browser times best preserve the real
    # order of visits and the time spent on each page, independent of how long
    # each event took to upload to the server.
    visit_fields = {
        'compute_index': compute_index,
        'user_id': user_id,
        'task_index': task_period.task_index,
        'concern_index': task_period.concern_index,
        'start': activating_event.visit_date,
        'end': deactivating_event.visit_date,
        'url': activating_event.url,
        'title': activating_event.title,
        'tab_id': activating_event.tab_id,
    }
    LocationVisit.create(**visit_fields)
    def test_chain_multiple_location_visits_by_activations(self):

        # A single two-minute task period that contains every event below.
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        )

        # Three different tabs are activated at one-second intervals.
        activations = (
            (1, "http://url1.com", '1'),
            (2, "http://url2.com", '2'),
            (3, "http://url3.com", '3'),
        )
        for second, url, tab_id in activations:
            activated_at = datetime.datetime(2000, 1, 1, 12, 0, second, 0)
            create_location_event(
                log_date=activated_at,
                visit_date=activated_at,
                event_type="Tab activated",
                url=url,
                tab_id=tab_id,
            )

        compute_location_visits()

        # Two visits are expected: one for each of the first two URLs.
        visit_query = LocationVisit.select()
        self.assertEqual(visit_query.count(), 2)
        visited_urls = [visit.url for visit in LocationVisit.select()]
        for expected_url in ("http://url1.com", "http://url2.com"):
            self.assertIn(expected_url, visited_urls)
    def test_ignore_consecutive_page_loads_of_same_url(self):

        # Task period covering all three events.
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        )

        # Tab 1 is activated on one URL, and then reports two content-loaded
        # events (pageshow, then ready) for a second URL.
        tab_events = (
            (1, "Tab activated", "http://url1.com"),
            (2, "Tab content loaded (pageshow)", "http://url2.com"),
            (3, "Tab content loaded (ready)", "http://url2.com"),
        )
        for second, event_type, url in tab_events:
            event_time = datetime.datetime(2000, 1, 1, 12, 0, second, 0)
            create_location_event(
                log_date=event_time,
                visit_date=event_time,
                event_type=event_type,
                url=url,
                tab_id='1',
            )

        compute_location_visits()

        # Only one visit should result, and it should be for the first URL.
        visit_query = LocationVisit.select()
        self.assertEqual(visit_query.count(), 1)
        visited_urls = [visit.url for visit in LocationVisit.select()]
        self.assertIn("http://url1.com", visited_urls)
Exemple #4
0
    def test_chain_multiple_location_visits_by_activations(self):

        # The task period runs from 12:00:00 to 12:02:00 and spans all events.
        period_start = datetime.datetime(2000, 1, 1, 12, 0, 0, 0)
        period_end = datetime.datetime(2000, 1, 1, 12, 2, 0, 0)
        create_task_period(start=period_start, end=period_end)

        # Activate tabs '1', '2' and '3' in turn, one second apart, each on
        # its own URL.
        tab_activations = [
            (datetime.datetime(2000, 1, 1, 12, 0, 1, 0), "http://url1.com", '1'),
            (datetime.datetime(2000, 1, 1, 12, 0, 2, 0), "http://url2.com", '2'),
            (datetime.datetime(2000, 1, 1, 12, 0, 3, 0), "http://url3.com", '3'),
        ]
        for when, url, tab_id in tab_activations:
            create_location_event(
                log_date=when,
                visit_date=when,
                event_type="Tab activated",
                url=url,
                tab_id=tab_id,
            )

        compute_location_visits()

        # Expect two visits, covering the first and second URLs.
        all_visits = LocationVisit.select()
        self.assertEqual(all_visits.count(), 2)
        seen_urls = [visit.url for visit in LocationVisit.select()]
        self.assertIn("http://url1.com", seen_urls)
        self.assertIn("http://url2.com", seen_urls)
Exemple #5
0
    def test_window_deactivated_flushes_old_location(self):

        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
        )

        # A tab is activated, the window is deactivated one second later, and
        # a tab is activated again one second after that.
        first_event_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        event_sequence = ("Tab activated", "Window deactivated", "Tab activated")
        for seconds_elapsed, event_type in enumerate(event_sequence):
            event_time = first_event_time + datetime.timedelta(seconds=seconds_elapsed)
            create_location_event(
                log_date=event_time,
                visit_date=event_time,
                event_type=event_type,
            )

        compute_location_visits()

        # Exactly one visit should have been created.
        self.assertEqual(LocationVisit.select().count(), 1)

        # That visit must end when the window was deactivated (12:00:02),
        # not when the next tab was activated (12:00:03).
        only_visit = LocationVisit.select()[0]
        self.assertEqual(only_visit.end, datetime.datetime(2000, 1, 1, 12, 0, 2, 0))
Exemple #6
0
    def test_ignore_consecutive_page_loads_of_same_url(self):

        # The task period (12:00:00 to 12:02:00) spans every event created here.
        period_start = datetime.datetime(2000, 1, 1, 12, 0, 0, 0)
        period_end = datetime.datetime(2000, 1, 1, 12, 2, 0, 0)
        create_task_period(start=period_start, end=period_end)

        # Everything happens in tab '1': it is activated on the first URL, and
        # then two load events (pageshow, ready) arrive for the second URL.
        events_in_tab = [
            (datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
             "Tab activated", "http://url1.com"),
            (datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
             "Tab content loaded (pageshow)", "http://url2.com"),
            (datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
             "Tab content loaded (ready)", "http://url2.com"),
        ]
        for when, event_type, url in events_in_tab:
            create_location_event(
                log_date=when,
                visit_date=when,
                event_type=event_type,
                url=url,
                tab_id='1',
            )

        compute_location_visits()

        # A single visit is expected, for the first URL.
        all_visits = LocationVisit.select()
        self.assertEqual(all_visits.count(), 1)
        seen_urls = [visit.url for visit in LocationVisit.select()]
        self.assertIn("http://url1.com", seen_urls)
    def test_window_deactivated_flushes_old_location(self):

        tab_activated_at = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        window_deactivated_at = tab_activated_at + datetime.timedelta(seconds=1)
        tab_reactivated_at = tab_activated_at + datetime.timedelta(seconds=2)

        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
        )

        # Activate a tab, deactivate the window, then activate a tab again,
        # with one second between consecutive events.
        timeline = [
            (tab_activated_at, "Tab activated"),
            (window_deactivated_at, "Window deactivated"),
            (tab_reactivated_at, "Tab activated"),
        ]
        for when, event_type in timeline:
            create_location_event(
                log_date=when,
                visit_date=when,
                event_type=event_type,
            )

        compute_location_visits()

        # Only one visit should exist after the computation.
        self.assertEqual(LocationVisit.select().count(), 1)

        # The visit's end must be the window deactivation time (12:00:02),
        # and not the time of the tab activation that followed it.
        first_visit = LocationVisit.select()[0]
        self.assertEqual(first_visit.end, datetime.datetime(2000, 1, 1, 12, 0, 2, 0))
def compute_navigation_ngrams(length, page_type_lookup):
    '''
    Compute n-grams of sequences of pages visited, of a certain length.

    Visits are read from the most recently computed set of `LocationVisit` records.
    For every combination of participant and concern found in those visits, the
    visited URLs are ordered by visit start time, mapped to page types, and split
    into n-grams.  Each n-gram is saved as a `NavigationNgram` record under a new
    compute index, with its page types joined by ", ".

    `length` is the number of page types in each n-gram.

    A `page_type_lookup` dictionary must be provided, that maps standardized URLs
    to a dictionary with a 'redirect' flag and a 'main_type' page type.  URLs
    flagged as redirects are left out of the sequence.  URLs missing from the lookup
    are logged with a warning and given the page type "Unknown".

    Nothing is returned; the results are written to the database.
    '''
    # Create a new index for this computation
    last_compute_index = NavigationNgram.select(fn.Max(NavigationNgram.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the set of visits for the most recently computed visits
    visit_compute_index = LocationVisit.select(fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(LocationVisit.compute_index == visit_compute_index)

    # Get the distinct participant IDs and concern indexes
    participant_ids = {visit.user_id for visit in visits}
    concern_indexes = {visit.concern_index for visit in visits}

    # Go through every concern for every participant, and build the sequence of
    # page types that the participant visited while working on that concern.
    for participant_id in participant_ids:
        for concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == concern_index,
            ).order_by(LocationVisit.start.asc())

            # Create a list of the URLs visited, in order of visit start time.
            # Repeat visits to the same URL are kept.
            standardized_urls = [
                standardize_url(visit.url) for visit in participant_concern_visits
            ]

            # Create a list of all page types visited.
            # If this is a redirect, then skip it.  For all intents and purposes,
            # someone is traveling between the page types before and after it.
            page_types = []
            for url in standardized_urls:
                if url in page_type_lookup:
                    url_info = page_type_lookup[url]
                    if not url_info['redirect']:
                        page_types.append(url_info['main_type'])
                else:
                    # `warning` replaces the deprecated `warn` alias.
                    logger.warning("URL %s not in page type lookup.  Giving it 'Unknown' type", url)
                    page_types.append("Unknown")

            # Compute n-grams using NLTK command
            ngrams = nltk_compute_ngrams(page_types, length)

            # Save each n-gram to the database
            for ngram_tuple in ngrams:
                NavigationNgram.create(
                    compute_index=compute_index,
                    user_id=participant_id,
                    concern_index=concern_index,
                    length=length,
                    ngram=", ".join(ngram_tuple),
                )
Exemple #9
0
    def test_acceptable_activating_location_events(self):

        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        )

        # For each activating event type, log the activation and then a window
        # deactivation in tab '1'.  Consecutive events are one second apart.
        one_second = datetime.timedelta(seconds=1)
        event_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        for activating_event_type in ("Tab activated", "Window activated"):
            for event_type in (activating_event_type, "Window deactivated"):
                create_location_event(
                    log_date=event_time,
                    event_type=event_type,
                    tab_id='1',
                )
                event_time += one_second

        # Each activation / deactivation pair should have produced a visit.
        compute_location_visits()
        self.assertEqual(LocationVisit.select().count(), 2)
    def test_acceptable_activating_location_events(self):

        first_event_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        )

        # Both of these event types are followed by a window deactivation
        # one second later.  All events are in tab '1', spaced one second apart.
        activating_event_types = ["Tab activated", "Window activated"]
        for pair_index, activating_event_type in enumerate(activating_event_types):
            activated_at = first_event_time + datetime.timedelta(seconds=2 * pair_index)
            deactivated_at = activated_at + datetime.timedelta(seconds=1)
            create_location_event(
                log_date=activated_at,
                event_type=activating_event_type,
                tab_id='1',
            )
            create_location_event(
                log_date=deactivated_at,
                event_type="Window deactivated",
                tab_id='1',
            )

        # One visit is expected per activating event.
        compute_location_visits()
        self.assertEqual(LocationVisit.select().count(), 2)
    def test_make_location_visit_associated_with_tasks_of_same_user(self):

        # Both location events belong to user 0...
        create_location_event(
            user_id=0,
            log_date=datetime.datetime(2000, 1, 1, 11, 0, 1, 0),
            visit_date=datetime.datetime(2000, 1, 1, 11, 0, 1, 0),
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com"
        )
        create_location_event(
            user_id=0,
            log_date=datetime.datetime(2000, 1, 1, 11, 0, 2, 0),
            visit_date=datetime.datetime(2000, 1, 1, 11, 0, 2, 0),
            event_type="Tab activated",
            tab_id='2',
            url="http://url1.com"
        )
        # ...while the only task period belongs to user 1.
        create_task_period(
            user_id=1,
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        )

        # Run the computation under test.  Without this call no visits could
        # exist, and the assertion below would pass without checking anything.
        # NOTE(review): the events (11:00) also fall outside the task period
        # (12:00-12:02), so this does not isolate the user check on its own.
        # Consider moving the events inside the period.
        compute_location_visits()
        self.assertEqual(LocationVisit.select().count(), 0)
Exemple #12
0
    def test_order_events_by_visit_date(self):

        # The server may receive browser events in a different order than the
        # browser encountered them.  Events found for a task are therefore
        # expected to be ordered in "browser order".  Here the server log order
        # is jumbled on purpose, and the browser order has to win out.
        jumbled_events = (
            # Logged first, visited second
            (datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
             datetime.datetime(2000, 1, 1, 10, 0, 2, 0),
             '1', "http://url1.com"),
            # Logged second, visited first
            (datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
             datetime.datetime(2000, 1, 1, 10, 0, 1, 0),
             '2', "http://url2.com"),
        )
        for logged_at, visited_at, tab_id, url in jumbled_events:
            create_location_event(
                log_date=logged_at,
                visit_date=visited_at,
                event_type="Tab activated",
                tab_id=tab_id,
                url=url)

        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=3,
            concern_index=5,
        )

        compute_location_visits()

        # The first visit should be for the URL the browser visited first.
        first_visit = LocationVisit.select()[0]
        self.assertEqual(first_visit.url, "http://url2.com")
    def test_associate_location_visit_with_task_period_it_occured_within(self):

        # Two tab activations, at 12:00:01 and 12:00:06.
        tab_activations = (
            (datetime.datetime(2000, 1, 1, 12, 0, 1, 0), '1', "http://url1.com"),
            (datetime.datetime(2000, 1, 1, 12, 0, 6, 0), '2', "http://url2.com"),
        )
        for activated_at, tab_id, url in tab_activations:
            create_location_event(
                log_date=activated_at,
                visit_date=activated_at,
                event_type="Tab activated",
                tab_id=tab_id,
                url=url
            )

        # Three task periods; only the one with task index 2 contains the
        # times of the events above.
        task_periods = (
            (datetime.datetime(2000, 1, 1, 11, 0, 0, 0),
             datetime.datetime(2000, 1, 1, 11, 59, 0, 0), 1),
            (datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
             datetime.datetime(2000, 1, 1, 12, 2, 0, 0), 2),
            (datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
             datetime.datetime(2000, 1, 1, 12, 4, 0, 0), 3),
        )
        for period_start, period_end, task_index in task_periods:
            create_task_period(
                start=period_start,
                end=period_end,
                task_index=task_index,
            )

        compute_location_visits()
        first_visit = LocationVisit.select()[0]
        self.assertEqual(first_visit.task_index, 2)
    def test_visit_times_based_on_visit_dates_not_log_dates(self):

        # Browsers have been found to be rarely synced with the server time.
        # To keep the timing as the user experienced it, visits are saved with
        # the times at which the browser saw each location.  Visits are still
        # matched to tasks using the logging date (as that is most likely to
        # match well on the server side), but the stored times are the
        # browser's.
        logged_and_visited = (
            (datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
             datetime.datetime(2000, 1, 1, 10, 0, 1, 0),
             '1', "http://url1.com"),
            (datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
             datetime.datetime(2000, 1, 1, 10, 0, 2, 0),
             '2', "http://url2.com"),
        )
        for logged_at, visited_at, tab_id, url in logged_and_visited:
            create_location_event(
                log_date=logged_at,
                visit_date=visited_at,
                event_type="Tab activated",
                tab_id=tab_id,
                url=url
            )

        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=3,
            concern_index=5,
        )

        compute_location_visits()

        # The visit bounds must be the visit dates (10:00), not the log dates (12:00).
        first_visit = LocationVisit.select()[0]
        self.assertEqual(first_visit.start, datetime.datetime(2000, 1, 1, 10, 0, 1, 0))
        self.assertEqual(first_visit.end, datetime.datetime(2000, 1, 1, 10, 0, 2, 0))
Exemple #15
0
    def test_associate_location_visit_with_task_period_it_occured_within(self):

        # Tab '1' is activated at 12:00:01, and tab '2' five seconds later.
        first_activation = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        second_activation = datetime.datetime(2000, 1, 1, 12, 0, 6, 0)
        create_location_event(
            log_date=first_activation,
            visit_date=first_activation,
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com")
        create_location_event(
            log_date=second_activation,
            visit_date=second_activation,
            event_type="Tab activated",
            tab_id='2',
            url="http://url2.com")

        # Task periods 1 and 3 come before and after the events; task period 2
        # is the one that contains them.
        periods_by_task_index = [
            (1, datetime.datetime(2000, 1, 1, 11, 0, 0, 0),
             datetime.datetime(2000, 1, 1, 11, 59, 0, 0)),
            (2, datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
             datetime.datetime(2000, 1, 1, 12, 2, 0, 0)),
            (3, datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
             datetime.datetime(2000, 1, 1, 12, 4, 0, 0)),
        ]
        for task_index, period_start, period_end in periods_by_task_index:
            create_task_period(
                start=period_start,
                end=period_end,
                task_index=task_index,
            )

        compute_location_visits()
        computed_visit = LocationVisit.select()[0]
        self.assertEqual(computed_visit.task_index, 2)
Exemple #16
0
    def test_visit_times_based_on_visit_dates_not_log_dates(self):

        # Browser clocks have been found to be rarely in sync with the server.
        # Visits are associated with tasks using the logging date (as that is
        # most likely to match well on the server side), but they are stored
        # with the times seen by the browser, to preserve the timing as it
        # appeared to the user.
        expected_start = datetime.datetime(2000, 1, 1, 10, 0, 1, 0)
        expected_end = datetime.datetime(2000, 1, 1, 10, 0, 2, 0)

        # Both events are logged around 12:00 but were visited around 10:00.
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            visit_date=expected_start,
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com")
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
            visit_date=expected_end,
            event_type="Tab activated",
            tab_id='2',
            url="http://url2.com")
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=3,
            concern_index=5,
        )

        compute_location_visits()

        # The stored bounds should be the browser's visit dates.
        computed_visit = LocationVisit.select()[0]
        self.assertEqual(computed_visit.start, expected_start)
        self.assertEqual(computed_visit.end, expected_end)
Exemple #17
0
    def test_make_no_location_visit_if_it_doesnt_start_after_or_end_before_end_of_task(
            self):

        # Activated before the task period starts...
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 11, 59, 1, 0),
            visit_date=datetime.datetime(2000, 1, 1, 11, 59, 1, 0),
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com")
        # ...and left in the middle of the task period.
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 1, 0, 0),
            visit_date=datetime.datetime(2000, 1, 1, 12, 1, 0, 0),
            event_type="Tab activated",
            tab_id='2',
            url="http://url1.com")
        # These last two activations both happen after the task period ends.
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
            visit_date=datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
            event_type="Tab activated",
            tab_id='1',
            url="http://url2.com")
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 5, 0, 0),
            visit_date=datetime.datetime(2000, 1, 1, 12, 5, 0, 0),
            event_type="Tab activated",
            tab_id='2',
            url="http://url2.com")
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        )

        # Run the computation under test.  Without this call no visits could
        # exist, and the assertion below would pass without checking anything.
        compute_location_visits()
        self.assertEqual(LocationVisit.select().count(), 0)
def create_location_visit(**kwargs):
    '''
    Create and return a `LocationVisit` record for use in tests.

    Every field has a default value.  Any keyword argument passed in replaces
    the default for the field of the same name, or is passed through as an
    additional field.
    '''
    visit_fields = dict(
        compute_index=0,
        user_id=0,
        task_index=1,
        concern_index=1,
        url="http://url.com",
        tab_id=1,
        title="Title",
        start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
        end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
    )
    # Caller-supplied values take priority over the defaults.
    for field_name, field_value in kwargs.items():
        visit_fields[field_name] = field_value
    return LocationVisit.create(**visit_fields)
Exemple #19
0
    def test_ignore_content_loaded_in_other_tabs(self):

        # Tab '1' is activated, and then content is loaded in tab '2'.
        events_across_tabs = (
            (1, "Tab activated", "http://url1.com", '1'),
            (2, "Tab content loaded (pageshow)", "http://url2.com", '2'),
        )
        for second, event_type, url, tab_id in events_across_tabs:
            event_time = datetime.datetime(2000, 1, 1, 12, 0, second, 0)
            create_location_event(
                log_date=event_time,
                visit_date=event_time,
                event_type=event_type,
                url=url,
                tab_id=tab_id,
            )

        # NOTE(review): no task period is created in this test.  If
        # compute_location_visits() only produces visits within task periods,
        # this assertion passes trivially -- confirm.
        compute_location_visits()
        visit_query = LocationVisit.select()
        self.assertEqual(visit_query.count(), 0)
    def test_ignore_content_loaded_in_other_tabs(self):

        activated_at = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        loaded_at = datetime.datetime(2000, 1, 1, 12, 0, 2, 0)

        # The active tab is tab '1'...
        create_location_event(
            log_date=activated_at,
            visit_date=activated_at,
            event_type="Tab activated",
            url="http://url1.com",
            tab_id='1',
        )
        # ...but the content-loaded event that follows is for tab '2'.
        create_location_event(
            log_date=loaded_at,
            visit_date=loaded_at,
            event_type="Tab content loaded (pageshow)",
            url="http://url2.com",
            tab_id='2',
        )

        # NOTE(review): no task period is created in this test.  If
        # compute_location_visits() only produces visits within task periods,
        # this assertion passes trivially -- confirm.
        compute_location_visits()
        all_visits = LocationVisit.select()
        self.assertEqual(all_visits.count(), 0)
    def test_by_default_associate_visit_with_latest_computed_task_periods(self):

        # Two tab activations, at 12:00:01 and 12:00:06.
        tab_activations = (
            (datetime.datetime(2000, 1, 1, 12, 0, 1, 0), '1', "http://url1.com"),
            (datetime.datetime(2000, 1, 1, 12, 0, 6, 0), '2', "http://url2.com"),
        )
        for activated_at, tab_id, url in tab_activations:
            create_location_event(
                log_date=activated_at,
                visit_date=activated_at,
                event_type="Tab activated",
                tab_id=tab_id,
                url=url
            )

        # All three task periods cover the same (matching) time span.  The one
        # with task index 2 has the highest compute index (2), meaning it was
        # the latest one to be computed.
        compute_and_task_indexes = ((0, 1), (2, 2), (1, 3))
        for period_compute_index, task_index in compute_and_task_indexes:
            create_task_period(
                compute_index=period_compute_index,
                start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
                end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
                task_index=task_index,
            )

        compute_location_visits()
        first_visit = LocationVisit.select()[0]
        self.assertEqual(first_visit.task_index, 2)
    def test_if_task_compute_index_specified_only_match_tasks_with_that_index(self):

        # Two tab activations, at 12:00:01 and 12:00:06.
        tab_activations = (
            (datetime.datetime(2000, 1, 1, 12, 0, 1, 0), '1', "http://url1.com"),
            (datetime.datetime(2000, 1, 1, 12, 0, 6, 0), '2', "http://url2.com"),
        )
        for activated_at, tab_id, url in tab_activations:
            create_location_event(
                log_date=activated_at,
                visit_date=activated_at,
                event_type="Tab activated",
                tab_id=tab_id,
                url=url
            )

        # Three task periods over the same time span, each computed under a
        # different compute index.
        compute_and_task_indexes = ((0, 1), (2, 2), (1, 3))
        for period_compute_index, task_index in compute_and_task_indexes:
            create_task_period(
                compute_index=period_compute_index,
                start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
                end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
                task_index=task_index,
            )

        # Passing a task compute index should restrict matching to the task
        # period computed under that index (here, the one with task index 1).
        compute_location_visits(task_compute_index=0)
        first_visit = LocationVisit.select()[0]
        self.assertEqual(first_visit.task_index, 1)
Exemple #23
0
    def test_by_default_associate_visit_with_latest_computed_task_periods(
            self):

        # Tab '1' is activated at 12:00:01, and tab '2' five seconds later.
        first_activation = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        second_activation = datetime.datetime(2000, 1, 1, 12, 0, 6, 0)
        create_location_event(
            log_date=first_activation,
            visit_date=first_activation,
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com")
        create_location_event(
            log_date=second_activation,
            visit_date=second_activation,
            event_type="Tab activated",
            tab_id='2',
            url="http://url2.com")

        # The same (matching) period is saved three times.  The second one
        # created has the largest compute index (2), so it is the latest one
        # to have been computed.
        task_period_specs = [
            {'compute_index': 0, 'task_index': 1},
            {'compute_index': 2, 'task_index': 2},
            {'compute_index': 1, 'task_index': 3},
        ]
        for spec in task_period_specs:
            create_task_period(
                compute_index=spec['compute_index'],
                start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
                end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
                task_index=spec['task_index'],
            )

        compute_location_visits()
        computed_visit = LocationVisit.select()[0]
        self.assertEqual(computed_visit.task_index, 2)
Exemple #24
0
    def test_if_task_compute_index_specified_only_match_tasks_with_that_index(
            self):

        # Tab '1' is activated at 12:00:01, and tab '2' five seconds later.
        first_activation = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        second_activation = datetime.datetime(2000, 1, 1, 12, 0, 6, 0)
        create_location_event(
            log_date=first_activation,
            visit_date=first_activation,
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com")
        create_location_event(
            log_date=second_activation,
            visit_date=second_activation,
            event_type="Tab activated",
            tab_id='2',
            url="http://url2.com")

        # The same period is saved under compute indexes 0, 2 and 1, with a
        # different task index each time.
        task_period_specs = [
            {'compute_index': 0, 'task_index': 1},
            {'compute_index': 2, 'task_index': 2},
            {'compute_index': 1, 'task_index': 3},
        ]
        for spec in task_period_specs:
            create_task_period(
                compute_index=spec['compute_index'],
                start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
                end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
                task_index=spec['task_index'],
            )

        # With the task compute index given explicitly, only the task period
        # saved under that compute index should be matched.
        compute_location_visits(task_compute_index=0)
        computed_visit = LocationVisit.select()[0]
        self.assertEqual(computed_visit.task_index, 1)
Exemple #25
0
    def test_make_location_visit_associated_with_tasks_of_same_user(self):

        # Both location events belong to user 0...
        create_location_event(
            user_id=0,
            log_date=datetime.datetime(2000, 1, 1, 11, 0, 1, 0),
            visit_date=datetime.datetime(2000, 1, 1, 11, 0, 1, 0),
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com")
        create_location_event(
            user_id=0,
            log_date=datetime.datetime(2000, 1, 1, 11, 0, 2, 0),
            visit_date=datetime.datetime(2000, 1, 1, 11, 0, 2, 0),
            event_type="Tab activated",
            tab_id='2',
            url="http://url1.com")
        # ...while the only task period belongs to user 1.
        create_task_period(
            user_id=1,
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        )

        # Run the computation under test.  Without this call no visits could
        # exist, and the assertion below would pass without checking anything.
        # NOTE(review): the events (11:00) also fall outside the task period
        # (12:00-12:02), so this does not isolate the user check on its own.
        # Consider moving the events inside the period.
        compute_location_visits()
        self.assertEqual(LocationVisit.select().count(), 0)
    def test_create_location_visit(self):
        '''
        Check that one visit is created for a URL that is visited and then left.
        The visit should inherit the time bounds defined by entering and exiting the URL,
        and the task and concern indexes of the task period taking place at that time.
        '''
        entered_at = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        left_at = datetime.datetime(2000, 1, 1, 12, 0, 2, 0)

        # Setup: two location events that bound a single visit, within one task period
        create_location_event(
            log_date=entered_at,
            visit_date=entered_at,
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com",
        )
        create_location_event(
            log_date=left_at,
            visit_date=left_at,
            event_type="Tab activated",
            tab_id='2',
            url="http://url2.com",
        )
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=3,
            concern_index=5,
        )

        # Test: exactly one visit comes out, for the first URL
        compute_location_visits()
        all_visits = LocationVisit.select()
        self.assertEqual(all_visits.count(), 1)

        only_visit = all_visits[0]
        self.assertEqual(only_visit.user_id, 0)
        self.assertEqual(only_visit.task_index, 3)
        self.assertEqual(only_visit.concern_index, 5)
        self.assertEqual(only_visit.start, entered_at)
        self.assertEqual(only_visit.end, left_at)
        self.assertEqual(only_visit.url, "http://url1.com")
    def test_order_events_by_visit_date(self):
        '''
        Check that events are ordered by when the browser saw them, not by when
        the server logged them.

        The server may receive browser events in a different order than the browser
        encountered them.  Here the server log order is jumbled on purpose, and the
        browser order is the one that has to come through in the visit that gets created.
        '''
        logged_first = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        logged_second = datetime.datetime(2000, 1, 1, 12, 0, 2, 0)
        visited_first = datetime.datetime(2000, 1, 1, 10, 0, 1, 0)
        visited_second = datetime.datetime(2000, 1, 1, 10, 0, 2, 0)

        # The event for URL 1 reaches the server first but happened second...
        create_location_event(
            log_date=logged_first,
            visit_date=visited_second,
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com",
        )

        # ...while the event for URL 2 reaches the server second but happened first.
        create_location_event(
            log_date=logged_second,
            visit_date=visited_first,
            event_type="Tab activated",
            tab_id='2',
            url="http://url2.com",
        )

        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=3,
            concern_index=5,
        )

        # The first visit should be to the URL that the browser saw first.
        compute_location_visits()
        first_visit = LocationVisit.select()[0]
        self.assertEqual(first_visit.url, "http://url2.com")
    def test_make_no_location_visit_if_it_doesnt_start_after_or_end_before_end_of_task(self):
        '''
        Check that no visit is created unless it both starts and ends within a task period.
        '''
        # This event comes before the task period starts, so the visit that it
        # opens doesn't start within the task.
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 11, 59, 1, 0),
            visit_date=datetime.datetime(2000, 1, 1, 11, 59, 1, 0),
            event_type="Tab activated",
            tab_id='1',
            url="http://url1.com"
        )
        # This is the only event within the task period.  The visit that it opens
        # isn't closed until after the task period is over.
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 1, 0, 0),
            visit_date=datetime.datetime(2000, 1, 1, 12, 1, 0, 0),
            event_type="Tab activated",
            tab_id='2',
            url="http://url1.com"
        )
        # Both of the remaining events come after the task period has ended.
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
            visit_date=datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
            event_type="Tab activated",
            tab_id='1',
            url="http://url2.com"
        )
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 5, 0, 0),
            visit_date=datetime.datetime(2000, 1, 1, 12, 5, 0, 0),
            event_type="Tab activated",
            tab_id='2',
            url="http://url2.com"
        )
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        )

        # The computation has to run before we check its output: nothing else in
        # this test creates visits, so the count would otherwise be zero trivially.
        compute_location_visits()
        self.assertEqual(LocationVisit.select().count(), 0)
Exemple #29
0
    def test_create_location_visit(self):
        '''
        Check that a 'visit' is created for a URL that is visited and then left.
        It should inherit the time bounds defined by entering and exiting the URL, and
        include the task and concern indexes of the task period taking place at that time.
        '''
        visit_start = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        visit_end = datetime.datetime(2000, 1, 1, 12, 0, 2, 0)

        # Setup: create two location events bounding a single visit.
        # Each event is logged at the same time that it happened in the browser.
        for event_date, tab_id, url in [
                (visit_start, '1', "http://url1.com"),
                (visit_end, '2', "http://url2.com"),
        ]:
            create_location_event(
                log_date=event_date,
                visit_date=event_date,
                event_type="Tab activated",
                tab_id=tab_id,
                url=url)
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=3,
            concern_index=5,
        )

        # Test: there should be one visit, with all of the expected fields
        compute_location_visits()
        visits = LocationVisit.select()
        self.assertEqual(visits.count(), 1)

        visit = visits[0]
        expected_fields = [
            ('user_id', 0),
            ('task_index', 3),
            ('concern_index', 5),
            ('start', visit_start),
            ('end', visit_end),
            ('url', "http://url1.com"),
        ]
        for field_name, expected_value in expected_fields:
            self.assertEqual(getattr(visit, field_name), expected_value)
def compute_unique_urls(page_type_lookup, exclude_users=None):
    '''
    Save, for every participant, each URL they visited and whether only they visited it.

    URLs are read from the most recently computed location visits and are standardized
    before they are compared.  One `UniqueUrl` record is created per participant and
    standardized URL, under a new compute index.

    Arguments:
    * `page_type_lookup`: accepted but never read by this function.
    * `exclude_users`: IDs of users whose visits should be left out entirely, both as
      participants and as the "others" that a participant is compared against.
    '''
    if exclude_users is None:
        exclude_users = []

    # This computation gets the index that follows the last one that was used
    previous_index = UniqueUrl.select(fn.Max(UniqueUrl.compute_index)).scalar()
    compute_index = (previous_index or 0) + 1

    # Only look at visits from the latest computation of visits
    latest_visit_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == latest_visit_index,
        LocationVisit.user_id.not_in(exclude_users),
    )

    # Split the URLs that each participant visited into those that
    # no one else visited, and those that somebody else visited too.
    for participant_id in set([visit.user_id for visit in visits]):

        own_visits = visits.where(LocationVisit.user_id == participant_id)
        other_visits = visits.where(LocationVisit.user_id != participant_id)

        own_urls = set([standardize_url(visit.url) for visit in own_visits])
        other_urls = set([standardize_url(visit.url) for visit in other_visits])

        only_own_urls = own_urls - other_urls
        shared_urls = own_urls - only_own_urls

        # Every URL the participant visited is saved, flagged with its uniqueness
        for url_group, is_unique in ((only_own_urls, True), (shared_urls, False)):
            for url in url_group:
                UniqueUrl.create(
                    compute_index=compute_index,
                    user_id=participant_id,
                    url=url,
                    unique=is_unique,
                )
def compute_navigation_graph(page_type_lookup,
                             exclude_users=None,
                             show_progress=False,
                             concern_index=None):
    '''
    Build a graph of transitions between page types and save it to the database.

    Each vertex is a page type, and each edge is a transition from one page type to the
    next, counted over the most recently computed location visits.  Every sequence of
    visits (one for each participant and concern) starts at a "Start" vertex and
    finishes at an "End" vertex.  All vertices and edges are saved as `NavigationVertex`
    and `NavigationEdge` records under a new compute index.

    Arguments:
    * `page_type_lookup`: dictionary from a standardized URL to a dictionary with the
      keys 'main_type' (the page type) and 'redirect'.  Visits to redirects are skipped.
      URLs that are missing from the lookup are given the page type "Unknown".
    * `exclude_users`: IDs of users whose visits should be left out of the graph.
    * `show_progress`: whether to show progress bars while computing and saving.
    * `concern_index`: if provided, only visits for this concern are included.
    '''
    exclude_users = [] if exclude_users is None else exclude_users

    # Create a new index for this computation
    last_compute_index = NavigationVertex.select(
        fn.Max(NavigationVertex.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the set of visits for the most recently computed visits
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index)

    # If the user has provided a concern index that they want to compute the graph for,
    # then restrict navigation data to only that concern
    if concern_index is not None:
        visits = visits.where(LocationVisit.concern_index == concern_index)

    # Get the distinct participant IDs and concern indexes
    # Exclude any users that were not requested as part of the analysis
    participant_ids = set([
        visit.user_id for visit in visits if visit.user_id not in exclude_users
    ])
    concern_indexes = set([visit.concern_index for visit in visits])

    # Set up progress bar.
    total_iterations_count = len(participant_ids) * len(concern_indexes)
    if show_progress:
        progress_bar = ProgressBar(
            maxval=total_iterations_count,
            widgets=[
                'Progress: ',
                Percentage(), ' ',
                Bar(marker=RotatingMarker()), ' ',
                ETA(), ' Read ',
                Counter(), ' / ' + str(total_iterations_count) + ' sessions.'
            ])
        progress_bar.start()

    # The list of vertices needs to be populated with a start and end node.
    # All navigation behavior starts at the "Start" node, and ends at the "End" node
    vertices = {
        "Start": Vertex("Start", occurrences=1),
        "End": Vertex("End", occurrences=1),
    }
    edges = {}
    last_vertex = vertices["Start"]
    iterations_count = 0

    # Go through every concern for every participant.  For each page they visit,
    # increment the visits to the corresponding vertex.  For each transition from one
    # page to the next, increment the occurrence of a transition between two page types.
    # The loop variable has its own name so that it doesn't overwrite the
    # `concern_index` argument.
    for participant_id in participant_ids:
        for visit_concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == visit_concern_index,
            ).order_by(LocationVisit.start.asc())

            for visit in participant_concern_visits:

                # Get the type of the page visited
                standardized_url = standardize_url(visit.url)
                if standardized_url in page_type_lookup:
                    url_info = page_type_lookup[standardized_url]
                    page_type = url_info['main_type']
                    # If this is a redirect, then just skip it.  It's more important
                    # to link the URL before it to the link the redirect points to.
                    if url_info['redirect']:
                        continue
                else:
                    logger.warning(
                        "URL %s not in page type lookup.  Giving it 'Unknown' type",
                        standardized_url)
                    page_type = "Unknown"

                # Add a new vertex for this page type if it doesn't exist
                if page_type not in vertices:
                    vertices[page_type] = Vertex(page_type)

                # Save that we have seen this page type one more time
                vertex = vertices[page_type]
                vertex.occurrences += 1

                # Add the time spent to the total time spent for this page type.
                # `total_seconds` also counts the whole days of the duration, which
                # get lost when only `seconds` and `microseconds` are added up.
                vertex.total_time += (visit.end - visit.start).total_seconds()

                # Connect an edge between the last page visited and this one
                edge_key = (last_vertex.page_type, vertex.page_type)
                if edge_key not in edges:
                    edges[edge_key] = Edge(last_vertex, vertex)
                edges[edge_key].occurrences += 1

                # Redefine the last page so we know in the next iteration what was just visited.
                last_vertex = vertex

            # After each participant or each concern, connect from the last URL to the end vertex.
            # If there were no visits for this participant and concern, this connects
            # the start vertex straight to the end vertex.
            end_vertex = vertices['End']
            end_edge_key = (last_vertex.page_type, end_vertex.page_type)
            if end_edge_key not in edges:
                edges[end_edge_key] = Edge(last_vertex, end_vertex)
            edges[end_edge_key].occurrences += 1

            # After each participant or each concern, we reset the last vertex to "Start"
            last_vertex = vertices['Start']

            if show_progress:
                iterations_count += 1
                progress_bar.update(iterations_count)

    # Close out the progress bar for reading sessions.  Another progress bar gets
    # created below for saving the edges.
    if show_progress:
        progress_bar.finish()

    # Compute the mean time spent on each vertex
    for vertex in vertices.values():
        vertex.mean_time = vertex.total_time / float(vertex.occurrences)

    # Compute the transition probability for each edge leaving a vertex.
    # First, add up the occurrences of all edges that share the same source vertex.
    # (Edges are keyed by the page types of their source and target.)
    source_occurrences = {}
    for (source_type, _), edge in edges.items():
        source_occurrences[source_type] = (
            source_occurrences.get(source_type, 0) + edge.occurrences)

    # Then, the probability of each edge being taken is its share of all of
    # the transitions that leave its source vertex.
    for (source_type, _), edge in edges.items():
        edge.probability = float(edge.occurrences) / source_occurrences[source_type]

    # Save all vertices to the database
    vertex_models = {}
    for vertex in vertices.values():
        vertex_model = NavigationVertex.create(
            compute_index=compute_index,
            page_type=vertex.page_type,
            occurrences=vertex.occurrences,
            total_time=vertex.total_time,
            mean_time=vertex.mean_time,
        )
        # We store a dictionary from page type to vertex model so
        # we can look up these models when saving the edges.
        vertex_models[vertex.page_type] = vertex_model

    # Save all edges to the database
    # We use a progress bar for this as there might be a lot of edges and
    # we upload each of them separately to the database.
    if show_progress:
        progress_bar = ProgressBar(maxval=len(edges),
                                   widgets=[
                                       'Progress: ',
                                       Percentage(), ' ',
                                       Bar(marker=RotatingMarker()), ' ',
                                       ETA(), ' Updated graph with ',
                                       Counter(),
                                       ' / ' + str(len(edges)) + ' edges.'
                                   ])
        progress_bar.start()

    for edge_index, edge in enumerate(edges.values(), start=1):
        NavigationEdge.create(
            compute_index=compute_index,
            source_vertex=vertex_models[edge.source_vertex.page_type],
            target_vertex=vertex_models[edge.target_vertex.page_type],
            occurrences=edge.occurrences,
            probability=edge.probability,
        )
        if show_progress:
            progress_bar.update(edge_index)

    if show_progress:
        progress_bar.finish()
def main(page_types_json_filename, *args, **kwargs):
    '''
    Yield a row of data for each of the most recently computed location visits.

    Each item yielded is a list that holds a single row: the fields of the visit, the
    parts of its URL, its page type, and the time spent on it in seconds.

    Arguments:
    * `page_types_json_filename`: name of a JSON file that maps standardized URLs to
      dictionaries with a 'main_type' key (the page type).

    Any additional arguments are accepted and ignored.
    '''
    with open(page_types_json_filename) as page_types_file:
        page_types = json.load(page_types_file)

    # Only dump the most recently computed location visits (ignore all others).
    latest_compute_index = LocationVisit.select(fn.Max(LocationVisit.compute_index)).scalar()
    visits = (
        LocationVisit
        .select()
        .where(
            LocationVisit.compute_index == latest_compute_index,
        )
    )

    # Store a list of URLs for which labels are missing
    urls_without_labels = set()

    for visit in visits:

        # Split URL into the constituent parts that can be used
        # to uniquely identify this URL in relation to others.
        # Note that while the same URL with different query strings may refer to the same
        # page, this isn't always true.  Take the forum PHP script for Panda3D as an example.
        # The same is true with fragments, specifically for Google Groups, where fragments
        # are used to select different groups and topics.
        url_parsed = urlparse(visit.url)
        path = url_parsed.path
        fragment = url_parsed.fragment
        query = url_parsed.query

        # Drop a leading "www." from the domain.  We can't use `lstrip("www.")` for this:
        # it strips any run of 'w' and '.' characters, so it also eats into domains
        # like "wikipedia.org".
        domain = url_parsed.netloc
        if domain.startswith("www."):
            domain = domain[len("www."):]

        # Fetch semantic labels for this URL
        # Store missing URLs for non-pilot study participants.
        # Currently, it's not important for us to be able to classify URLs for pilot participants.
        unique_url = standardize_url(visit.url)
        if unique_url in page_types:
            page_type = page_types[unique_url]['main_type']
        else:
            # Without a label, the page type has to be set explicitly.  Otherwise, the row
            # would carry over the page type from the last visit (or fail, if this is the
            # first visit).  "Unknown" is the type that the navigation computations
            # in this file give to URLs that are missing from the page type lookup.
            page_type = "Unknown"
            if visit.user_id > PILOT_MAX_USER_ID:
                urls_without_labels.add(unique_url)

        # `total_seconds` also counts the whole days of the duration, which
        # get lost when only `seconds` and `microseconds` are added up.
        seconds = (visit.end - visit.start).total_seconds()

        yield [[
            visit.compute_index,
            visit.user_id,
            visit.task_index,
            visit.concern_index,
            visit.tab_id,
            visit.url,
            unique_url,
            domain,
            path,
            fragment,
            query,
            page_type,
            visit.title,
            visit.start,
            visit.end,
            seconds,
        ]]

    # Print out a list of URLs for which labels were not found
    for url in sorted(urls_without_labels):
        logger.debug("No label found for URL: %s", url)

    # The generator ends when the function returns.  We don't raise `StopIteration`
    # ourselves, as that turns into a `RuntimeError` on Python 3.7 and later.
def compute_navigation_ngrams(length, page_type_lookup):
    '''
    Compute n-grams of sequences of pages visited, of a certain length.
    A `page_type_lookup` dictionary must be provided, that maps URLs to their page types.

    N-grams are computed over the types of the pages that each participant visited for
    each concern, in the order they were visited, using the most recently computed
    location visits.  Each n-gram is saved as a `NavigationNgram` record, with its
    page types joined by ", ", under a new compute index.

    Arguments:
    * `length`: the number of page types in each n-gram.
    * `page_type_lookup`: dictionary from a standardized URL to a dictionary with the
      keys 'main_type' (the page type) and 'redirect'.  Visits to redirects are skipped.
      URLs that are missing from the lookup are given the page type "Unknown".
    '''
    # Create a new index for this computation
    last_compute_index = NavigationNgram.select(
        fn.Max(NavigationNgram.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the set of visits for the most recently computed visits
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index)

    # Get the distinct participant IDs and concern indexes
    participant_ids = set([visit.user_id for visit in visits])
    concern_indexes = set([visit.concern_index for visit in visits])

    # Go through every concern for every participant.  Collect the types of the
    # pages they visited, in order, and then split this sequence into n-grams.
    for participant_id in participant_ids:
        for concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == concern_index,
            ).order_by(LocationVisit.start.asc())

            # Create a list of all page types visited.
            # If this is a redirect, then skip it.  For all intents and purposes,
            # someone is traveling between the page types before and after it.
            page_types = []
            for visit in participant_concern_visits:
                url = standardize_url(visit.url)
                if url in page_type_lookup:
                    url_info = page_type_lookup[url]
                    if not url_info['redirect']:
                        page_types.append(url_info['main_type'])
                else:
                    logger.warning(
                        "URL %s not in page type lookup.  Giving it 'Unknown' type",
                        url)
                    page_types.append("Unknown")

            # Compute n-grams using NLTK command
            ngrams = nltk_compute_ngrams(page_types, length)

            # Save each n-gram to the database
            for ngram_tuple in ngrams:
                NavigationNgram.create(
                    compute_index=compute_index,
                    user_id=participant_id,
                    concern_index=concern_index,
                    length=length,
                    ngram=", ".join(ngram_tuple),
                )
Exemple #34
0
def compute_location_visits(task_compute_index=None):
    '''
    Assemble location events into visits to URLs, and save the visits to the database.

    For each user and each task, the location events logged during the task period are
    read in the order that the browser reported them.  A visit is saved each time that
    the URL in the active tab is left: when a different URL loads in the same tab, when
    the window is deactivated, or when a tab or window is activated.  All visits are
    saved under a new compute index.

    Arguments:
    * `task_compute_index`: the compute index of the task periods that events are
      matched to.  If it isn't provided, the latest compute index of task periods is used.
    '''
    # Create a new index for this computation
    last_compute_index = LocationVisit.select(fn.Max(LocationVisit.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Determine what will be the compute index of the task periods that these visits are matched to.
    # This will become the latest compute index if it hasn't been specified.
    if task_compute_index is None:
        task_compute_index = TaskPeriod.select(fn.Max(TaskPeriod.compute_index)).scalar()

    # Compute the ID of the last user to complete the study.
    # If no location events have been logged, there is no maximum (the query gives `None`)
    # and there are no visits to assemble.  We return here instead of failing below.
    max_user_id = LocationEvent.select(fn.Max(LocationEvent.user_id)).scalar()
    if max_user_id is None:
        return

    # Compute the time that each user spends in each question
    for user_id in range(0, max_user_id + 1):

        # Visit all tasks for each user
        for task_index in TASK_RANGE:

            # Fetch the period of time for this task
            task_periods = (
                TaskPeriod.select()
                .where(
                    TaskPeriod.compute_index == task_compute_index,
                    TaskPeriod.task_index == task_index,
                    TaskPeriod.user_id == user_id,
                )
            )
            if task_periods.count() < 1:
                continue
            task_period = task_periods[0]

            # Fetch the events for all locations the user has visited during this task
            location_events = (
                LocationEvent
                .select()
                .where(
                    LocationEvent.user_id == user_id,
                    LocationEvent.log_date >= task_period.start,
                    LocationEvent.log_date <= task_period.end,
                )
                # While we inspect the "log date" when the server received notice of
                # the event, we use the "visit date" when the browser experienced the
                # events to sort them, as we think this will preserve the original
                # ordering much better.  See the notes in the `create_location_visit`
                # method for more details.
                .order_by(LocationEvent.visit_date.asc())
            )

            # In the space below, we assemble "visits" from sequences of events.
            # We keep track of the tab that is currently active (if any), and of the
            # event that started the visit to the URL that is currently showing in it.
            active_tab_id = None
            active_tab_latest_url_event = None

            for event in location_events:

                # When a new page is loaded in the current tab, this is the end of the
                # last event and the start of a new one (that will be in the same tab).
                # Events for the same URL or for other tabs are ignored.
                if event.event_type in NEW_PAGE_EVENTS:
                    if active_tab_id is not None and event.tab_id == active_tab_id:
                        if event.url != active_tab_latest_url_event.url:
                            create_location_visit(
                                compute_index=compute_index,
                                task_period=task_period,
                                user_id=user_id,
                                activating_event=active_tab_latest_url_event,
                                deactivating_event=event,
                            )
                            active_tab_latest_url_event = event

                # If the window has been deactivated, then end the visit in the current tab
                if event.event_type in DEACTIVATING_EVENTS:
                    if active_tab_id is not None:
                        create_location_visit(
                            compute_index=compute_index,
                            task_period=task_period,
                            user_id=user_id,
                            activating_event=active_tab_latest_url_event,
                            deactivating_event=event,
                        )
                        active_tab_id = None
                        active_tab_latest_url_event = None

                # If a tab or window has been activated, that tab is now active.
                if event.event_type in ACTIVATING_EVENTS:

                    # End any visits in progress for other tabs
                    if active_tab_id is not None:
                        create_location_visit(
                            compute_index=compute_index,
                            task_period=task_period,
                            user_id=user_id,
                            activating_event=active_tab_latest_url_event,
                            deactivating_event=event,
                        )

                    # Set the new active tab
                    active_tab_id = event.tab_id
                    active_tab_latest_url_event = event
def main(page_types_json_filename, *args, **kwargs):
    '''
    Yield a row of data for each of the most recently computed location visits.

    Each item yielded is a list that holds a single row: the fields of the visit, the
    parts of its URL, its page type, and the time spent on it in seconds.

    Arguments:
    * `page_types_json_filename`: name of a JSON file that maps standardized URLs to
      dictionaries with a 'main_type' key (the page type).

    Any additional arguments are accepted and ignored.
    '''
    with open(page_types_json_filename) as page_types_file:
        page_types = json.load(page_types_file)

    # Only dump the most recently computed location visits (ignore all others).
    latest_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = (LocationVisit.select().where(
        LocationVisit.compute_index == latest_compute_index, ))

    # Store a list of URLs for which labels are missing
    urls_without_labels = set()

    for visit in visits:

        # Split URL into the constituent parts that can be used
        # to uniquely identify this URL in relation to others.
        # Note that while the same URL with different query strings may refer to the same
        # page, this isn't always true.  Take the forum PHP script for Panda3D as an example.
        # The same is true with fragments, specifically for Google Groups, where fragments
        # are used to select different groups and topics.
        url_parsed = urlparse(visit.url)
        path = url_parsed.path
        fragment = url_parsed.fragment
        query = url_parsed.query

        # Drop a leading "www." from the domain.  We can't use `lstrip("www.")` for this:
        # it strips any run of 'w' and '.' characters, so it also eats into domains
        # like "wikipedia.org".
        domain = url_parsed.netloc
        if domain.startswith("www."):
            domain = domain[len("www."):]

        # Fetch semantic labels for this URL
        # Store missing URLs for non-pilot study participants.
        # Currently, it's not important for us to be able to classify URLs for pilot participants.
        unique_url = standardize_url(visit.url)
        if unique_url in page_types:
            page_type = page_types[unique_url]['main_type']
        else:
            # Without a label, the page type has to be set explicitly.  Otherwise, the row
            # would carry over the page type from the last visit (or fail, if this is the
            # first visit).  "Unknown" is the type that the navigation computations
            # in this file give to URLs that are missing from the page type lookup.
            page_type = "Unknown"
            if visit.user_id > PILOT_MAX_USER_ID:
                urls_without_labels.add(unique_url)

        # `total_seconds` also counts the whole days of the duration, which
        # get lost when only `seconds` and `microseconds` are added up.
        seconds = (visit.end - visit.start).total_seconds()

        yield [[
            visit.compute_index,
            visit.user_id,
            visit.task_index,
            visit.concern_index,
            visit.tab_id,
            visit.url,
            unique_url,
            domain,
            path,
            fragment,
            query,
            page_type,
            visit.title,
            visit.start,
            visit.end,
            seconds,
        ]]

    # Print out a list of URLs for which labels were not found
    for url in sorted(urls_without_labels):
        logger.debug("No label found for URL: %s", url)

    # The generator ends when the function returns.  We don't raise `StopIteration`
    # ourselves, as that turns into a `RuntimeError` on Python 3.7 and later.