def test_simple_names(self):
    """Check name correction on a small synthetic data set.

    Each input row is ``[lastname, firstname, number]``; the third element
    is presumably an occurrence count (verify against find_correct_names).
    The expected output maps "aab" to "aaa" and "dde" to "ddd" while
    leaving last names untouched.
    """
    name_rows = [
        ["xxx", "aaa", 2],
        ["xxx", "aab", 1],
        ["yyy", "ddd", 2],
        ["yyy", "dde", 1],
        ["zzz", "fff", 1],
    ]
    corrected = find_correct_names(name_rows)

    # Dump the whole frame on failure so the mismatch is easy to spot.
    failure_dump = "\n{0}".format(corrected)

    wanted_first = Series(["aaa", "aaa", "ddd", "ddd", "fff"])
    first_ok = corrected["correct_firstname"] == wanted_first
    self.assertTrue(first_ok.all(), failure_dump)

    wanted_last = Series(["xxx", "xxx", "yyy", "yyy", "zzz"])
    last_ok = corrected["correct_lastname"] == wanted_last
    self.assertTrue(last_ok.all(), failure_dump)
def fix_mispelled_dpoh_names():
    """Find and fix misspellings in DPOH names in the db.

    Fetches the name data from ``util.get_dpoh_name_freq()``, asks
    ``correct_names.find_correct_names`` for the corrected spelling of each
    (firstname, lastname) pair, and rewrites the matching
    ``CommunicationDPOH`` rows. Work is committed in batches of 250
    successful updates, with a final commit for the remainder.

    Updates that raise ``IntegrityError`` are logged and skipped. Any other
    exception propagates to the caller after the open transaction has been
    closed.
    """
    logbook.debug("computing correct names")
    names = util.get_dpoh_name_freq()
    df = correct_names.find_correct_names(names)

    conn = db.get_sqlalchemy_connection()
    trans = conn.begin()
    counter = 0
    logbook.debug("updating db")
    try:
        for _, row in df.iterrows():
            firstname = row["firstname"]
            lastname = row["lastname"]
            correct_firstname = row["correct_firstname"]
            correct_lastname = row["correct_lastname"]

            # Already spelled correctly: the UPDATE would rewrite the row
            # with identical values, so skip it and keep the count honest.
            if firstname == correct_firstname and lastname == correct_lastname:
                continue

            stmt = (
                update(db.CommunicationDPOH.__table__)
                .where(db.CommunicationDPOH.dpoh_first_name == firstname)
                .where(db.CommunicationDPOH.dpoh_last_name == lastname)
                .values(
                    dpoh_first_name=correct_firstname,
                    dpoh_last_name=correct_lastname,
                )
            )
            try:
                # misspellings can cause collisions
                # NOTE(review): on some backends (e.g. PostgreSQL) a failed
                # statement aborts the surrounding transaction; if that
                # applies here, each update needs its own savepoint
                # (conn.begin_nested()) -- confirm against the target db.
                trans.connection.execute(stmt)
            except sqlalchemy_exceptions.IntegrityError as e:
                logbook.error(e)
                continue

            counter += 1
            if counter % 250 == 0:
                # Commit in batches so one transaction does not hold
                # every update.
                trans.commit()
                trans.close()
                trans = conn.begin()
                logbook.debug("committed {0} changes".format(counter))
        trans.commit()
    finally:
        # Runs on the error path as well, so an unexpected exception does
        # not leave the transaction open; it is the same close() the
        # success path has always issued after commit().
        trans.close()
    logbook.debug("committed {0} changes".format(counter))
def test_real_names(self):
    """Check name correction against a sample of realistic names.

    Input rows are ``(lastname, firstname, number)`` tuples. The expected
    frame lists, row for row, the corrected last and first name for each
    input. Both frames are sorted by the corrected names before comparing,
    so the check does not depend on the row order find_correct_names
    returns.
    """
    names = [
        (u"Abbot", u"Jim", 2),
        (u"Abbott", u"James", 1),
        (u"Abbott", u"Jim", 25),
        (u"Abbott", u"Connie", 1),
        (u"Hoffman", u"Abby", 1),
        (u"Abernethy-Gillis", u"Robyn", 1),
        (u"Ablonczy", u"Diane", 110),
        (u"Ablonczy", u"Dianne", 1),
        (u"Ablonsky", u"Diane", 1),
        (u"Ablonczy", u"Honourable Diane", 2),
    ]
    expected = DataFrame()
    expected["correct_lastname"] = Series(
        [
            "Abbott",
            "Abbott",
            "Abbott",
            "Abbott",
            "Hoffman",
            "Abernethy-Gillis",
            "Ablonczy",
            "Ablonczy",
            "Ablonczy",
            "Ablonczy",
        ]
    )
    expected["correct_firstname"] = Series(
        [
            "Jim",
            "Jim",
            "Jim",
            "Connie",
            "Abby",
            "Robyn",
            "Diane",
            "Diane",
            "Diane",
            "Honourable Diane",
        ]
    )

    sort_columns = ["correct_lastname", "correct_firstname"]

    def sorted_by_name(frame):
        # DataFrame.sort was removed in pandas 0.20 in favour of
        # sort_values; fall back to the old name on older pandas.
        sort = getattr(frame, "sort_values", None) or frame.sort
        return sort(sort_columns)

    result = sorted_by_name(find_correct_names(names))
    expected = sorted_by_name(expected)

    # Compare plain value lists: Series == Series is index-aligned and
    # raises on differing labels, which would hide the real mismatch
    # behind an error instead of a readable assertion failure.
    self.assertEqual(
        list(result["correct_firstname"]), list(expected["correct_firstname"])
    )
    self.assertEqual(
        list(result["correct_lastname"]), list(expected["correct_lastname"])
    )