def test_simple_names(self):
        """Check name correction on a small synthetic data set.

        Each input row is ``[lastname, firstname, n]`` (``n`` is presumably a
        frequency count -- confirm against ``find_correct_names``). The test
        expects the spelling with the lower ``n`` in each lastname group to be
        replaced by the one with the higher ``n``, and lastnames to be left
        unchanged.
        """
        names = [
            ["xxx", "aaa", 2],
            ["xxx", "aab", 1],
            ["yyy", "ddd", 2],
            ["yyy", "dde", 1],
            ["zzz", "fff", 1],
        ]

        result = find_correct_names(names)

        # Checked in this order; the first mismatch fails the test and dumps
        # the whole result frame into the failure message.
        expected_columns = (
            ("correct_firstname", ["aaa", "aaa", "ddd", "ddd", "fff"]),
            ("correct_lastname", ["xxx", "xxx", "yyy", "yyy", "zzz"]),
        )
        for column, expected_values in expected_columns:
            matches = result[column] == Series(expected_values)
            self.assertTrue(matches.all(), "\n{0}".format(result))
Esempio n. 2
0
def fix_mispelled_dpoh_names():
    """Find and fix misspellings in DPOH names in the db.

    Fetches the name frequencies with ``util.get_dpoh_name_freq()``, passes
    them to ``correct_names.find_correct_names`` and, for every row of the
    returned frame, issues one UPDATE on ``db.CommunicationDPOH`` that
    rewrites the (first name, last name) pair to its corrected spelling.

    Updates are committed in batches of 250 executed statements, followed by
    a final commit for the remainder. Statements that raise
    ``IntegrityError`` are logged and skipped.

    Raises:
        Any other exception raised while updating or committing is re-raised
        after the currently open transaction has been rolled back.
    """
    logbook.debug("computing correct names")
    names = util.get_dpoh_name_freq()
    df = correct_names.find_correct_names(names)

    conn = db.get_sqlalchemy_connection()
    trans = conn.begin()

    counter = 0
    logbook.debug("updating db")
    try:
        for _, row in df.iterrows():
            firstname = row["firstname"]
            lastname = row["lastname"]
            correct_firstname = row["correct_firstname"]
            correct_lastname = row["correct_lastname"]

            stmt = update(db.CommunicationDPOH.__table__).where(
                db.CommunicationDPOH.dpoh_first_name == firstname
            ).where(
                db.CommunicationDPOH.dpoh_last_name == lastname
            ).values(
                dpoh_first_name=correct_firstname,
                dpoh_last_name=correct_lastname
            )

            try:
                # misspellings can cause collisions
                trans.connection.execute(stmt)
            except sqlalchemy_exceptions.IntegrityError as e:
                # NOTE(review): on backends that abort the whole transaction
                # after a failed statement (e.g. PostgreSQL), carrying on
                # without rolling back to a savepoint would make the
                # following statements fail -- confirm the target database.
                logbook.error(e)
                continue

            counter += 1
            if counter % 250 == 0:
                # Commit in batches so a late failure does not lose
                # everything done so far.
                trans.commit()
                trans.close()
                trans = conn.begin()
                logbook.debug("committed {0} changes".format(counter))

        trans.commit()
        trans.close()
    except Exception:
        # Do not leave a half-finished transaction open on the connection.
        trans.rollback()
        raise
    logbook.debug("committed {0} changes".format(counter))
    def test_real_names(self):
        """Check name correction on a sample of realistic names.

        Each input tuple is ``(lastname, firstname, n)`` (``n`` is presumably
        a frequency count -- confirm against ``find_correct_names``). Result
        and expectation are both sorted by corrected last and first name
        before the two columns are compared.
        """
        names = [
            (u"Abbot", u"Jim", 2),
            (u"Abbott", u"James", 1),
            (u"Abbott", u"Jim", 25),
            (u"Abbott", u"Connie", 1),
            (u"Hoffman", u"Abby", 1),
            (u"Abernethy-Gillis", u"Robyn", 1),
            (u"Ablonczy", u"Diane", 110),
            (u"Ablonczy", u"Dianne", 1),
            (u"Ablonsky", u"Diane", 1),
            (u"Ablonczy", u"Honourable Diane", 2),
        ]

        # Expected corrections, listed in the same order as ``names``.
        expected_lastnames = [
            "Abbott",
            "Abbott",
            "Abbott",
            "Abbott",
            "Hoffman",
            "Abernethy-Gillis",
            "Ablonczy",
            "Ablonczy",
            "Ablonczy",
            "Ablonczy",
        ]
        expected_firstnames = [
            "Jim",
            "Jim",
            "Jim",
            "Connie",
            "Abby",
            "Robyn",
            "Diane",
            "Diane",
            "Diane",
            "Honourable Diane",
        ]
        expected = DataFrame()
        expected["correct_lastname"] = Series(expected_lastnames)
        expected["correct_firstname"] = Series(expected_firstnames)

        # NOTE(review): DataFrame.sort no longer exists in recent pandas
        # releases (sort_values replaces it); revisit when upgrading pandas.
        sort_keys = ("correct_lastname", "correct_firstname")
        result = find_correct_names(names).sort(list(sort_keys))
        expected = expected.sort(list(sort_keys))

        for column in ("correct_firstname", "correct_lastname"):
            self.assertTrue((result[column] == expected[column]).all())