def test_hand_label_rating_events(self):

    user = 2
    task = 4

    # The rating is logged one minute after the task period below has ended,
    # so matching on time alone could not pair the two.
    late_event = create_location_event(
        log_date=datetime.datetime(2000, 1, 1, 12, 3, 0, 0),
        event_type="Rating: 1",
        url="http://url2.com",
        user_id=user,
    )
    create_task_period(
        start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
        end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        user_id=user,
        task_index=task,
    )

    # The hand-written label is what ties the event to the task.
    hand_labels = [
        {'user_id': user, 'task_index': task, 'event_id': late_event.id},
    ]
    compute_location_ratings(labels=hand_labels)

    self.assertEqual(LocationRating.select().count(), 1)
    stored = LocationRating.select().first()
    self.assertEqual(stored.task_index, task)
    self.assertEqual(stored.hand_aligned, True)
Beispiel #2
0
    def test_hand_label_rating_events(self):

        period_start = datetime.datetime(2000, 1, 1, 12, 0, 0, 0)
        period_end = datetime.datetime(2000, 1, 1, 12, 2, 0, 0)
        # Logged after the period has already ended, so the event and the task
        # do not line up in time.
        logged_after_period = datetime.datetime(2000, 1, 1, 12, 3, 0, 0)

        event = create_location_event(
            log_date=logged_after_period,
            event_type="Rating: 1",
            url="http://url2.com",
            user_id=2,
        )
        create_task_period(
            start=period_start,
            end=period_end,
            user_id=2,
            task_index=4,
        )

        # A hand label naming this event should still attach it to task 4.
        label = {
            'user_id': 2,
            'task_index': 4,
            'event_id': event.id
        }
        compute_location_ratings(labels=[label])

        self.assertEqual(LocationRating.select().count(), 1)
        saved_rating = LocationRating.select().first()
        self.assertEqual(saved_rating.task_index, 4)
        self.assertEqual(saved_rating.hand_aligned, True)
def add_location_rating(location, rating, comment):
    """
    @summary: Create and save a LocationRating for the given location
    @param location : Query location
    @type location: An object of Location class
    @param rating: rating of the location as float
    @type rating: float
    @param comment: comment added for the location
    @type comment: str
    """
    # The parameter is named `location`; the previous code passed an undefined
    # name `loc`, which raised NameError on every call.
    loc_rating = LocationRating(location, rating=rating, comment=comment)
    loc_rating.save()
def main(page_types_json_filename, *args, **kwargs):
    """
    Yield one row per location rating from the most recent computation.

    page_types_json_filename names a JSON file that maps standardized URLs to
    objects carrying a 'main_type' key.  Extra positional and keyword
    arguments are accepted and ignored.

    Each yielded value is a list holding a single row:
    [compute_index, user_id, task_index, concern_index, url, domain,
     page_type, rating, title, visit_date].
    page_type is None when the URL has no entry in the page-types file.
    """
    with open(page_types_json_filename) as page_types_file:
        page_types = json.load(page_types_file)

    # Only dump the most recently computed location ratings (ignore all others).
    latest_compute_index = LocationRating.select(
        fn.Max(LocationRating.compute_index)).scalar()
    ratings = (LocationRating.select().where(
        LocationRating.compute_index == latest_compute_index))

    # URLs for which no label was found, reported once all rows are yielded.
    urls_without_labels = set()

    for rating in ratings:

        # Get the domain name of where this rating happened.
        # Only a literal "www." prefix is removed: str.lstrip("www.") treats its
        # argument as a set of characters and would mangle hosts such as
        # "wikipedia.org" into "ikipedia.org".
        domain = urlparse(rating.url).netloc
        if domain.startswith("www."):
            domain = domain[len("www."):]

        # Fetch the semantic label for this URL.  Reset it on every iteration so
        # that an unlabeled URL never inherits the previous row's label (or hits
        # an unbound variable on the first row).
        unique_url = standardize_url(rating.url)
        page_type = None
        if unique_url in page_types:
            page_type = page_types[unique_url]['main_type']
        elif rating.user_id > PILOT_MAX_USER_ID:
            # Missing labels only matter for non-pilot study participants.
            urls_without_labels.add(unique_url)

        yield [[
            rating.compute_index,
            rating.user_id,
            rating.task_index,
            rating.concern_index,
            rating.url,
            domain,
            page_type,
            rating.rating,
            rating.title,
            rating.visit_date,
        ]]

    # Print out a list of URLs for which labels were not found
    for url in sorted(urls_without_labels):
        logger.debug("No label found for URL: %s", url)

    # The generator ends by falling off the end of the function.  An explicit
    # `raise StopIteration` here is turned into RuntimeError under PEP 479.
    def test_by_default_associate_rating_with_latest_computed_task_periods(self):

        window_start = datetime.datetime(2000, 1, 1, 12, 0, 0, 0)
        window_end = datetime.datetime(2000, 1, 1, 12, 2, 0, 0)

        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            event_type="Rating: 0",
            url="http://url1.com",
            user_id=0,
        )

        # Every period covers the same window, so all of them match the event in time.
        # The pair (compute_index=2, task_index=2) is the most recently computed one.
        for computed_at, task in ((0, 1), (2, 2), (1, 3)):
            create_task_period(
                compute_index=computed_at,
                start=window_start,
                end=window_end,
                task_index=task,
            )

        compute_location_ratings()
        first_rating = LocationRating.select()[0]
        self.assertEqual(first_rating.task_index, 2)
Beispiel #6
0
    def test_if_task_compute_index_specified_only_match_tasks_with_that_index(
            self):

        window_start = datetime.datetime(2000, 1, 1, 12, 0, 0, 0)
        window_end = datetime.datetime(2000, 1, 1, 12, 2, 0, 0)

        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            event_type="Rating: 0",
            url="http://url1.com",
            user_id=0,
        )

        # Identical windows; each (compute_index, task_index) pair is distinct so the
        # task index on the saved rating reveals which computation was used.
        for computed_at, task in ((0, 1), (2, 2), (1, 3)):
            create_task_period(
                compute_index=computed_at,
                start=window_start,
                end=window_end,
                task_index=task,
            )

        # Asking for compute index 1 must select the period with task index 3.
        compute_location_ratings(task_compute_index=1)
        first_rating = LocationRating.select()[0]
        self.assertEqual(first_rating.task_index, 3)
Beispiel #7
0
    def test_skip_ratings_that_couldnt_be_classified(self):

        # (log time, event type, URL) for three rating events.  Only the middle one
        # is logged inside the task period created below.
        events = (
            (datetime.datetime(2000, 1, 1, 11, 0, 0, 0), "Rating: 0", "http://url1.com"),
            (datetime.datetime(2000, 1, 1, 12, 0, 1, 0), "Rating: 1", "http://url2.com"),
            (datetime.datetime(2000, 1, 1, 13, 0, 0, 0), "Rating: 2", "http://url2.com"),
        )
        for logged_at, kind, address in events:
            create_location_event(
                log_date=logged_at,
                event_type=kind,
                url=address,
                user_id=0,
            )

        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            user_id=0,
        )

        compute_location_ratings()
        saved = LocationRating.select()
        self.assertEqual(saved.count(), 1)
        self.assertEqual(saved.first().rating, 1)
    def test_match_with_log_date_not_visit_date(self):

        during_task = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        far_future = datetime.datetime(9000, 1, 1, 1, 0, 0, 0)

        # Setup: two rating events with their dates swapped.  Only the first one
        # has a log date that falls inside the task period.
        create_location_event(
            log_date=during_task,
            visit_date=far_future,
            event_type="Rating: 1",
            url="http://url1.com",
            user_id=0,
        )
        create_location_event(
            log_date=far_future,
            visit_date=during_task,
            event_type="Rating: 1",
            url="http://url2.com",
            user_id=0,
        )
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=3,
            concern_index=5,
            user_id=0,
        )

        # Test: exactly one rating is saved, and it is the one for the event whose
        # log date (not visit date) lies within the task period.
        compute_location_ratings()
        saved = LocationRating.select()
        self.assertEqual(saved.count(), 1)
        self.assertEqual(saved.first().url, "http://url1.com")
    def test_skip_ratings_that_couldnt_be_classified(self):

        before_period = datetime.datetime(2000, 1, 1, 11, 0, 0, 0)
        inside_period = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
        after_period = datetime.datetime(2000, 1, 1, 13, 0, 0, 0)

        # Logged an hour before the task period starts.
        create_location_event(
            log_date=before_period,
            event_type="Rating: 0",
            url="http://url1.com",
            user_id=0,
        )
        # Logged one second into the task period: the only event that can match.
        create_location_event(
            log_date=inside_period,
            event_type="Rating: 1",
            url="http://url2.com",
            user_id=0,
        )
        # Logged well after the task period has ended.
        create_location_event(
            log_date=after_period,
            event_type="Rating: 2",
            url="http://url2.com",
            user_id=0,
        )

        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            user_id=0,
        )

        compute_location_ratings()
        matched = LocationRating.select()
        self.assertEqual(matched.count(), 1)
        self.assertEqual(matched.first().rating, 1)
    def test_if_task_compute_index_specified_only_match_tasks_with_that_index(self):

        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            event_type="Rating: 0",
            url="http://url1.com",
            user_id=0,
        )

        # All periods share one time window; only the indexes differ.
        shared_window = {
            'start': datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            'end': datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        }
        create_task_period(compute_index=0, task_index=1, **shared_window)
        create_task_period(compute_index=2, task_index=2, **shared_window)
        create_task_period(compute_index=1, task_index=3, **shared_window)

        # With compute index 1 requested, the rating must land on task index 3.
        compute_location_ratings(task_compute_index=1)
        picked = LocationRating.select()[0]
        self.assertEqual(picked.task_index, 3)
Beispiel #11
0
    def test_by_default_associate_rating_with_latest_computed_task_periods(
            self):

        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            event_type="Rating: 0",
            url="http://url1.com",
            user_id=0,
        )

        # All periods share one time window, so each of them matches the event.
        # The one created with compute_index=2 is the latest computation.
        shared_window = {
            'start': datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            'end': datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
        }
        create_task_period(compute_index=0, task_index=1, **shared_window)
        create_task_period(compute_index=2, task_index=2, **shared_window)
        create_task_period(compute_index=1, task_index=3, **shared_window)

        compute_location_ratings()
        picked = LocationRating.select()[0]
        self.assertEqual(picked.task_index, 2)
Beispiel #12
0
    def test_match_with_log_date_not_visit_date(self):

        # Setup: the two events are identical apart from their URL and the fact
        # that their log and visit dates are swapped.
        # (log date, visit date, URL)
        swapped_dates = (
            (datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
             datetime.datetime(9000, 1, 1, 1, 0, 0, 0),
             "http://url1.com"),
            (datetime.datetime(9000, 1, 1, 1, 0, 0, 0),
             datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
             "http://url2.com"),
        )
        for logged_at, visited_at, address in swapped_dates:
            create_location_event(
                log_date=logged_at,
                visit_date=visited_at,
                event_type="Rating: 1",
                url=address,
                user_id=0,
            )

        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=3,
            concern_index=5,
            user_id=0,
        )

        # Test: only the event whose log date is inside the task period yields a rating.
        compute_location_ratings()
        matched = LocationRating.select()
        self.assertEqual(matched.count(), 1)
        self.assertEqual(matched.first().url, "http://url1.com")
def create_location_rating(compute_index, task_compute_index, event, rating, labels):
    '''
    Save a LocationRating for a rating event if it can be tied to a task period.

    compute_index is stored on the new rating record.
    task_compute_index restricts which TaskPeriod records are considered.
    event supplies the id, user_id, log_date, url, title and visit_date.
    rating is the rating value to store.
    labels is an iterable of dicts with 'event_id' and 'task_index' keys; a label
    whose 'event_id' equals the event's id overrides time-based matching.

    Returns True if this rating could be matched to an existing task, False otherwise.
    '''

    # Check for hand-written task index labels for this event.
    # A list is built explicitly: on Python 3 `filter` returns an iterator, which
    # supports neither len() nor indexing.
    matching_labels = [label for label in labels if label['event_id'] == event.id]
    if matching_labels:
        task_index = matching_labels[0]['task_index']
        task_periods = (
            TaskPeriod.select()
            .where(
                TaskPeriod.compute_index == task_compute_index,
                TaskPeriod.task_index == task_index,
                TaskPeriod.user_id == event.user_id,
            )
        )
        hand_aligned = True
    # If a hand-written label wasn't found, search for a task period that strictly
    # contains the time at which this rating was logged.
    else:
        task_periods = (
            TaskPeriod.select()
            .where(
                TaskPeriod.compute_index == task_compute_index,
                TaskPeriod.user_id == event.user_id,
                TaskPeriod.start < event.log_date,
                TaskPeriod.end > event.log_date,
            )
        )
        hand_aligned = False

    # Fetch the first matching period once, instead of counting twice and indexing.
    task_period = task_periods.first()
    if task_period is None:
        return False

    # A matching task has been found: save the rating alongside that task.
    LocationRating.create(
        compute_index=compute_index,
        user_id=event.user_id,
        task_index=task_period.task_index,
        concern_index=task_period.concern_index,
        url=event.url,
        rating=rating,
        title=event.title,
        visit_date=event.visit_date,
        hand_aligned=hand_aligned,
    )
    return True
def create_location_rating(compute_index, task_compute_index, event, rating,
                           labels):
    '''
    Save a LocationRating for a rating event if it can be tied to a task period.

    compute_index is stored on the new rating record.
    task_compute_index restricts which TaskPeriod records are considered.
    event supplies the id, user_id, log_date, url, title and visit_date.
    rating is the rating value to store.
    labels is an iterable of dicts with 'event_id' and 'task_index' keys; a label
    whose 'event_id' equals the event's id overrides time-based matching.

    Returns True if this rating could be matched to an existing task, False otherwise.
    '''

    # Check for hand-written task index labels for this event.
    # A list is built explicitly: on Python 3 `filter` returns an iterator, which
    # supports neither len() nor indexing.
    matching_labels = [
        label for label in labels if label['event_id'] == event.id
    ]
    if matching_labels:
        task_index = matching_labels[0]['task_index']
        task_periods = (TaskPeriod.select().where(
            TaskPeriod.compute_index == task_compute_index,
            TaskPeriod.task_index == task_index,
            TaskPeriod.user_id == event.user_id,
        ))
        hand_aligned = True
    # If a hand-written label wasn't found, search for a task period that strictly
    # contains the time at which this rating was logged.
    else:
        task_periods = (TaskPeriod.select().where(
            TaskPeriod.compute_index == task_compute_index,
            TaskPeriod.user_id == event.user_id,
            TaskPeriod.start < event.log_date,
            TaskPeriod.end > event.log_date,
        ))
        hand_aligned = False

    # Fetch the first matching period once, instead of counting twice and indexing.
    task_period = task_periods.first()
    if task_period is None:
        return False

    # A matching task has been found: save the rating alongside that task.
    LocationRating.create(
        compute_index=compute_index,
        user_id=event.user_id,
        task_index=task_period.task_index,
        concern_index=task_period.concern_index,
        url=event.url,
        rating=rating,
        title=event.title,
        visit_date=event.visit_date,
        hand_aligned=hand_aligned,
    )
    return True
def main(page_types_json_filename, *args, **kwargs):
    """
    Yield one row per location rating from the most recent computation.

    page_types_json_filename names a JSON file that maps standardized URLs to
    objects carrying a 'main_type' key.  Extra positional and keyword
    arguments are accepted and ignored.

    Each yielded value is a list holding a single row:
    [compute_index, user_id, task_index, concern_index, url, domain,
     page_type, rating, title, visit_date].
    page_type is None when the URL has no entry in the page-types file.
    """
    with open(page_types_json_filename) as page_types_file:
        page_types = json.load(page_types_file)

    # Only dump the most recently computed location ratings (ignore all others).
    latest_compute_index = LocationRating.select(fn.Max(LocationRating.compute_index)).scalar()
    ratings = (
        LocationRating
        .select()
        .where(
            LocationRating.compute_index == latest_compute_index
        )
    )

    # URLs for which no label was found, reported once all rows are yielded.
    urls_without_labels = set()

    for rating in ratings:

        # Get the domain name of where this rating happened.
        # Only a literal "www." prefix is removed: str.lstrip("www.") treats its
        # argument as a set of characters and would mangle hosts such as
        # "wikipedia.org" into "ikipedia.org".
        domain = urlparse(rating.url).netloc
        if domain.startswith("www."):
            domain = domain[len("www."):]

        # Fetch the semantic label for this URL.  Reset it on every iteration so
        # that an unlabeled URL never inherits the previous row's label (or hits
        # an unbound variable on the first row).
        unique_url = standardize_url(rating.url)
        page_type = None
        if unique_url in page_types:
            page_type = page_types[unique_url]['main_type']
        elif rating.user_id > PILOT_MAX_USER_ID:
            # Missing labels only matter for non-pilot study participants.
            urls_without_labels.add(unique_url)

        yield [[
            rating.compute_index,
            rating.user_id,
            rating.task_index,
            rating.concern_index,
            rating.url,
            domain,
            page_type,
            rating.rating,
            rating.title,
            rating.visit_date,
        ]]

    # Print out a list of URLs for which labels were not found
    for url in sorted(urls_without_labels):
        logger.debug("No label found for URL: %s", url)

    # The generator ends by falling off the end of the function.  An explicit
    # `raise StopIteration` here is turned into RuntimeError under PEP 479.
    def test_dont_make_rating_for_non_rating_event(self):

        # "Tab opened" is not a rating, so it must not be read as one even though
        # it is logged inside the task period.
        non_rating_type = "Tab opened"
        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            event_type=non_rating_type,
            url="http://url1.com",
        )

        period_start = datetime.datetime(2000, 1, 1, 12, 0, 0, 0)
        period_end = datetime.datetime(2000, 1, 1, 12, 2, 0, 0)
        create_task_period(
            start=period_start,
            end=period_end,
            task_index=3,
            concern_index=5,
        )

        compute_location_ratings()
        rating_total = LocationRating.select().count()
        self.assertEqual(rating_total, 0)
Beispiel #17
0
    def test_dont_make_rating_for_non_rating_event(self):

        # An event logged during the task period, but of a type that is not a rating.
        event_fields = {
            'log_date': datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            'event_type': "Tab opened",
            'url': "http://url1.com",
        }
        create_location_event(**event_fields)

        period_fields = {
            'start': datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            'end': datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            'task_index': 3,
            'concern_index': 5,
        }
        create_task_period(**period_fields)

        # No rating record may be produced for the non-rating event.
        compute_location_ratings()
        self.assertEqual(LocationRating.select().count(), 0)
def compute_location_ratings(labels=HAND_LABELED_EVENTS,
                             task_compute_index=None):
    """
    Create LocationRating records for every rating event in LocationEvent.

    labels is passed through to create_location_rating as the hand-written
    event-to-task labels.
    task_compute_index selects which computation of task periods ratings are
    matched against; when None, the highest TaskPeriod compute index is used.

    Returns a list of {'user_id': ..., 'event_id': ...} dicts, one for each
    rating event for which no rating record was created.
    """
    # Create a new index for this computation
    last_compute_index = LocationRating.select(
        fn.Max(LocationRating.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Determine what will be the compute index of the task periods that ratings are matched to.
    # This will become the latest compute index if it hasn't been specified.
    if task_compute_index is None:
        task_compute_index = TaskPeriod.select(fn.Max(
            TaskPeriod.compute_index)).scalar()

    # The whole digit run is captured.  The earlier pattern "(\d)+" repeated a
    # one-digit group, so only the last digit survived ("Rating: 10" became 0).
    rating_pattern = re.compile(r"^Rating: (\d+)$")

    # Ratings that couldn't be matched to a task period are collected and returned,
    # in case it's important for the caller to know which events were skipped.
    unmatched_ratings = []

    for event in LocationEvent.select():

        # Events that aren't ratings are ignored entirely.
        rating_match = rating_pattern.match(event.event_type)
        if not rating_match:
            continue

        rating = int(rating_match.group(1))
        rating_created = create_location_rating(
            compute_index=compute_index,
            task_compute_index=task_compute_index,
            event=event,
            rating=rating,
            labels=labels,
        )

        # If a rating wasn't created, this probably couldn't be matched to a task.
        # Save a record of which event failed to be matched to a task and which user
        # this event happened for.
        if not rating_created:
            unmatched_ratings.append({
                'user_id': event.user_id,
                'event_id': event.id,
            })

    return unmatched_ratings
def compute_location_ratings(labels=HAND_LABELED_EVENTS, task_compute_index=None):
    """
    Create LocationRating records for every rating event in LocationEvent.

    labels is passed through to create_location_rating as the hand-written
    event-to-task labels.
    task_compute_index selects which computation of task periods ratings are
    matched against; when None, the highest TaskPeriod compute index is used.

    Returns a list of {'user_id': ..., 'event_id': ...} dicts, one for each
    rating event for which no rating record was created.
    """
    # Create a new index for this computation
    last_compute_index = LocationRating.select(fn.Max(LocationRating.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Determine what will be the compute index of the task periods that ratings are matched to.
    # This will become the latest compute index if it hasn't been specified.
    if task_compute_index is None:
        task_compute_index = TaskPeriod.select(fn.Max(TaskPeriod.compute_index)).scalar()

    # The whole digit run is captured.  The earlier pattern "(\d)+" repeated a
    # one-digit group, so only the last digit survived ("Rating: 10" became 0).
    rating_pattern = re.compile(r"^Rating: (\d+)$")

    # Ratings that couldn't be matched to a task period are collected and returned,
    # in case it's important for the caller to know which events were skipped.
    unmatched_ratings = []

    for event in LocationEvent.select():

        # Events that aren't ratings are ignored entirely.
        rating_match = rating_pattern.match(event.event_type)
        if not rating_match:
            continue

        rating = int(rating_match.group(1))
        rating_created = create_location_rating(
            compute_index=compute_index,
            task_compute_index=task_compute_index,
            event=event,
            rating=rating,
            labels=labels,
        )

        # If a rating wasn't created, this probably couldn't be matched to a task.
        # Save a record of which event failed to be matched to a task and which user
        # this event happened for.
        if not rating_created:
            unmatched_ratings.append({
                'user_id': event.user_id,
                'event_id': event.id,
            })

    return unmatched_ratings
    def test_location_event_must_match_task_period_and_user_id(self):

        noon = datetime.datetime(2000, 1, 1, 12, 0, 0, 0)
        two_past = datetime.datetime(2000, 1, 1, 12, 2, 0, 0)
        four_past = datetime.datetime(2000, 1, 1, 12, 4, 0, 0)

        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            event_type="Rating: 0",
            url="http://url1.com",
            user_id=0,
        )

        # The task index of each period is arbitrary; it only differs between
        # periods so the saved rating shows which period it was matched to.
        # (start, end, task index, user id)
        periods = (
            (noon, two_past, 2, 1),       # right timing, wrong user
            (noon, two_past, 3, 0),       # right timing, right user: the match
            (two_past, four_past, 4, 0),  # right user, wrong timing
        )
        for begins, finishes, task, user in periods:
            create_task_period(
                start=begins,
                end=finishes,
                task_index=task,
                user_id=user,
            )

        compute_location_ratings()
        saved = LocationRating.select()
        self.assertEqual(saved.count(), 1)
        only_rating = saved[0]
        self.assertEqual(only_rating.user_id, 0)
        self.assertEqual(only_rating.task_index, 3)
Beispiel #21
0
    def test_location_event_must_match_task_period_and_user_id(self):

        rated_by = 0
        someone_else = 1

        create_location_event(
            log_date=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            event_type="Rating: 0",
            url="http://url1.com",
            user_id=rated_by,
        )

        # Task indexes below carry no meaning of their own; they are distinct so
        # that the saved rating identifies the period it was attached to.

        # Covers the event's log time but belongs to another user: no match.
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=2,
            user_id=someone_else,
        )
        # Covers the event's log time and belongs to the rating user: the match.
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 0, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            task_index=3,
            user_id=rated_by,
        )
        # Belongs to the rating user but starts after the event was logged: no match.
        create_task_period(
            start=datetime.datetime(2000, 1, 1, 12, 2, 0, 0),
            end=datetime.datetime(2000, 1, 1, 12, 4, 0, 0),
            task_index=4,
            user_id=rated_by,
        )

        compute_location_ratings()
        matched = LocationRating.select()
        self.assertEqual(matched.count(), 1)
        winner = matched[0]
        self.assertEqual(winner.user_id, 0)
        self.assertEqual(winner.task_index, 3)