def test_deduplicate_by_key(self):
     """
     deduplicate_by_key should return a single record for each distinct key.

     Two of the three records below share the key 'two'; the test expects the
     result to contain only the first record listed for each key.
     """
     collection_of_dicts = [
         {'index': 'one', 'other': '1'},
         {'index': 'two', 'other': '2'},
         {'index': 'two', 'other': '3'},
     ]

     def index_of(record):
         return record['index']

     no_duplicates = deduplicate_by_key(collection_of_dicts, index_of)
     expected = collection_of_dicts[0:2]
     # order the result by the deduplication key instead of comparing the dicts
     # themselves: dicts are not orderable on Python 3, so a bare sorted() would
     # raise TypeError there.  The result is the same ordering on Python 2.
     assert_equal(sorted(no_duplicates, key=index_of), expected)
 def test_deduplicate_by_key(self):
     """
     Check that deduplicate_by_key collapses records that share a key.

     The last two records have the same 'index'; the expected result is the
     first two records of the input, i.e. one record per distinct 'index'.
     """
     collection_of_dicts = [
         {
             'index': 'one',
             'other': '1'
         },
         {
             'index': 'two',
             'other': '2'
         },
         {
             'index': 'two',
             'other': '3'
         },
     ]
     no_duplicates = deduplicate_by_key(collection_of_dicts,
                                        lambda r: r['index'])
     expected = collection_of_dicts[0:2]
     # sort on the 'index' value rather than on the dicts themselves; comparing
     # dicts with < is a TypeError on Python 3, while sorting by key behaves the
     # same on Python 2 and 3
     assert_equal(
         sorted(no_duplicates, key=lambda r: r['index']),
         expected
     )
 def validate_records(self, session, cohort):
     """
     Validates the wiki_user records attached to a cohort and links them to it.

     Fetches the wiki_user(s) whose validating_cohort is cohort.id and validates
     their mediawiki_username against their stated project as either a user_id
     or user_name (controlled by self.validate_as_user_ids).  Once done, deletes
     any duplicates.  Then, it finishes filling in the data model by inserting
     corresponding records into the cohort_wiki_users table and marks the
     cohort as validated.

     This is meant to execute asynchronously on celery

     Parameters
         session : an active wikimetrics db session to use
         cohort  : the cohort to validate; must belong to session
     """
     # function-scope import so this block does not depend on file-level imports
     import logging

     # reset the cohort validation status so it can't be used for reports
     cohort.validated = False
     session.execute(
         WikiUser.__table__.update().values(valid=None).where(
             WikiUser.validating_cohort == cohort.id
         )
     )
     session.execute(CohortWikiUser.__table__.delete().where(
         CohortWikiUser.cohort_id == cohort.id
     ))
     session.commit()

     wikiusers = session.query(WikiUser) \
         .filter(WikiUser.validating_cohort == cohort.id) \
         .all()

     def name_and_project(record):
         return (record.mediawiki_username, record.project)

     deduplicated = deduplicate_by_key(wikiusers, name_and_project)

     wikiusers_by_project = {}
     for wu in deduplicated:
         try:
             normalized_project = normalize_project(wu.project)
             if normalized_project is None:
                 wu.reason_invalid = 'invalid project: {0}'.format(wu.project)
                 wu.valid = False
                 continue

             wu.project = normalized_project
             pending = wikiusers_by_project.setdefault(wu.project, [])
             pending.append(wu)

             # validate bunches of records to update the UI but not kill performance
             if len(pending) > 999:
                 validate_users(pending, wu.project, self.validate_as_user_ids)
                 session.commit()
                 wikiusers_by_project[wu.project] = []
         except Exception:
             # best effort: a failure on one record must not abort the whole run.
             # Only Exception is caught so KeyboardInterrupt / SystemExit still
             # propagate.  A batch that failed is not reset, so it is attempted
             # again on the next append for its project or in the pass below.
             # The message deliberately avoids touching ORM attributes.
             logging.getLogger(__name__).exception(
                 'validate_records: skipping a wiki_user that failed validation'
             )

     # validate anything that wasn't big enough for a batch
     for project, pending in wikiusers_by_project.items():
         if pending:
             validate_users(pending, project, self.validate_as_user_ids)
     session.commit()

     # deduplicate again: projects were normalized above, so records that used
     # to differ only by project spelling now share the same key
     unique_and_validated = list(
         deduplicate_by_key(deduplicated, name_and_project)
     )
     kept_ids = [wu.id for wu in unique_and_validated]

     # skip the bulk insert when there is nothing to link: an empty parameter
     # list is not treated as "insert nothing" by SQLAlchemy
     # NOTE(review): confirm against the SQLAlchemy version in use
     if kept_ids:
         session.execute(
             CohortWikiUser.__table__.insert(), [
                 {
                     'cohort_id'     : cohort.id,
                     'wiki_user_id'  : wiki_user_id,
                 } for wiki_user_id in kept_ids
             ]
         )

     # clean up any duplicate wiki_user records
     session.execute(WikiUser.__table__.delete().where(and_(
         WikiUser.validating_cohort == cohort.id,
         WikiUser.id.notin_(kept_ids)
     )))
     cohort.validated = True
     session.commit()
Example #4
0
    def validate_records(self, session, cohort):
        """
        Validates the wiki_user records attached to a cohort and links them to it.

        Fetches the wiki_user(s) whose validating_cohort is cohort.id and validates
        their raw_id_or_name field against their stated project as either a user_id
        or user_name (controlled by self.validate_as_user_ids).  Once done, deletes
        any duplicates.  Then, it finishes filling in the data model by inserting
        corresponding records into the cohort_wiki_users table and marks the
        cohort as validated.

        This is meant to execute asynchronously on celery

        Parameters
            session : an active wikimetrics db session to use
            cohort  : the cohort to validate; must belong to session
        """
        # reset the cohort validation status so it can't be used for reports
        cohort.validated = False
        session.execute(WikiUserStore.__table__.update().values(
            valid=None).where(WikiUserStore.validating_cohort == cohort.id))
        session.execute(CohortWikiUserStore.__table__.delete().where(
            CohortWikiUserStore.cohort_id == cohort.id))
        session.commit()

        wikiusers = session.query(WikiUserStore) \
            .filter(WikiUserStore.validating_cohort == cohort.id) \
            .all()

        # deduplicate on the normalized project when there is one, so different
        # spellings of the same project count as the same record; fall back to
        # the raw project when it cannot be normalized
        deduplicated = list(deduplicate_by_key(
            wikiusers, lambda r:
            (r.raw_id_or_name, normalize_project(r.project) or r.project)))

        wikiusers_by_project = {}
        for wu in deduplicated:
            normalized_project = normalize_project(wu.project)
            if normalized_project is None:
                wu.reason_invalid = 'invalid project: {0}'.format(wu.project)
                wu.valid = False
                continue

            wu.project = normalized_project
            pending = wikiusers_by_project.setdefault(wu.project, [])
            pending.append(wu)

            # validate bunches of records to update the UI but not kill performance
            if len(pending) > 999:
                validate_users(pending, wu.project, self.validate_as_user_ids)
                session.commit()
                wikiusers_by_project[wu.project] = []

        # validate anything that wasn't big enough for a batch
        for project, pending in wikiusers_by_project.items():
            if pending:
                validate_users(pending, project, self.validate_as_user_ids)
        session.commit()

        # every deduplicated record is linked to the cohort, including the ones
        # marked invalid above
        kept_ids = [wu.id for wu in deduplicated]

        # skip the bulk insert when there is nothing to link: an empty parameter
        # list is not treated as "insert nothing" by SQLAlchemy
        # NOTE(review): confirm against the SQLAlchemy version in use
        if kept_ids:
            session.execute(CohortWikiUserStore.__table__.insert(),
                            [{
                                'cohort_id': cohort.id,
                                'wiki_user_id': wiki_user_id,
                            } for wiki_user_id in kept_ids])

        # clean up any duplicate wiki_user records
        session.execute(WikiUserStore.__table__.delete().where(
            and_(WikiUserStore.validating_cohort == cohort.id,
                 WikiUserStore.id.notin_(kept_ids))))
        cohort.validated = True
        session.commit()