def testSNameStandardiser(self):
    """Test name standardiser routines (surname first).

    Creates a ``NameStandardiser`` for the ``in_sname`` input field, wraps
    it in a ``RecordStandardiser``, then cleans and standardises every raw
    name string in ``self.names_snames``.  The comparison against the
    expected results is currently switched off (see the note in the loop),
    so at present this only checks that the calls complete without
    raising.  Finishes by printing the standardiser's ``count_dict``.
    """
    ns = standardisation.NameStandardiser(
        descript='Test name standardiser',
        input_fields=['in_sname'],
        output_fiel=['title', 'gender_guess', 'given_name',
                     'alt_given_name', 'surname', 'alt_surname'],
        female_t=self.name_female_titles,
        male_t=self.name_male_titles,
        tag_t=self.name_tag_table,
        corr_l=self.name_corr_list,
        first_name_c='sname',
        hmm_train_fi='test-hmm-train.txt')

    # The record standardiser is only constructed here; this test never
    # calls its standardise() method.
    rs = standardisation.RecordStandardiser(
        descr='Test record standardiser',
        input_dataset=self.in_ds,
        output_dataset=self.out_ds,
        comp_stand_list=[ns])

    for (name_str, name_res) in self.names_snames:
        clean_name_str = ns.clean_component(name_str)
        test_name_res = ns.standardise(name_str, clean_name_str)

        # NOTE(review): the result check below is currently disabled; it is
        # kept for reference so it can be re-enabled once the expected
        # values are confirmed.
        # assert name_res == test_name_res, \
        #     'Wrong surname first standardisation: %s, should be: %s' % \
        #     (str(test_name_res), str(name_res))

    # A single pre-formatted argument produces the same output whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print('Count dict: %s' % (ns.count_dict,))
def testDateStandardiser( self): # - - - - - - - - - - - - - - - - - - - - - """Test date standardiser routines""" return ds = standardisation.DateStandardiser( descript="Test date standardiser", parse_form=self.date_parse_formats, input_fields=["in_date"], output_fiel=["day", "month", "year"], ) rs = standardisation.RecordStandardiser( descr="Test record standardiser", input_dataset=self.in_ds, output_dataset=self.out_ds, comp_stand_list=[ds], pass_fiel=[("pass1", "out_pass1"), ("pass2", "out_pass2")], ) for (date_str, date_res) in self.dates: clean_date_str = ds.clean_component(date_str) test_date_res = ds.standardise(date_str, clean_date_str) assert date_res == test_date_res, ( "Wrong date standardisation: %s, should be: %s" % (str(test_date_res), str(date_res))) rs.standardise() # Use record standardiser and write output file # Test the content of the output data set # test_ds = dataset.DataSetCSV( description="Test standardised data set", access_mode="read", rec_ident="rec_id", field_list=[], header_line=True, write_header=True, file_name="test-standardised-dataset.csv", ) i = 0 for (rec_id, rec_list) in test_ds.readall(): test_day = rec_list[0] test_month = rec_list[1] test_year = rec_list[2] true_day = self.dates[i][1][0] true_month = self.dates[i][1][1] true_year = self.dates[i][1][2] assert test_day == true_day, (i, rec_list[0:3], self.dates[i][1]) assert test_month == true_month, (i, rec_list[0:3], self.dates[i][1]) assert test_year == true_year, (i, rec_list[0:3], self.dates[i][1]) i += 1
def testDateStandardiser( self): # - - - - - - - - - - - - - - - - - - - - - """Test date standardiser routines""" return ds = standardisation.DateStandardiser( descript='Test date standardiser', parse_form=self.date_parse_formats, input_fields=['in_date'], output_fiel=['day', 'month', 'year']) rs = standardisation.RecordStandardiser( descr='Test record standardiser', input_dataset=self.in_ds, output_dataset=self.out_ds, comp_stand_list=[ds], pass_fiel=[('pass1', 'out_pass1'), ('pass2', 'out_pass2')]) for (date_str, date_res) in self.dates: clean_date_str = ds.clean_component(date_str) test_date_res = ds.standardise(date_str, clean_date_str) assert date_res == test_date_res, \ 'Wrong date standardisation: %s, should be: %s' % \ (str(test_date_res), str(date_res)) rs.standardise() # Use record standardiser and write output file # Test the content of the output data set # test_ds = dataset.DataSetCSV(description='Test standardised data set', access_mode='read', rec_ident='rec_id', field_list=[], header_line=True, write_header=True, file_name='test-standardised-dataset.csv') i = 0 for (rec_id, rec_list) in test_ds.readall(): test_day = rec_list[0] test_month = rec_list[1] test_year = rec_list[2] true_day = self.dates[i][1][0] true_month = self.dates[i][1][1] true_year = self.dates[i][1][2] assert test_day == true_day, (i, rec_list[0:3], self.dates[i][1]) assert test_month == true_month, (i, rec_list[0:3], self.dates[i][1]) assert test_year == true_year, (i, rec_list[0:3], self.dates[i][1]) i += 1
def testGNameStandardiser(self):
    """Test name standardiser routines (given name first).

    Sets up a ``NameStandardiser`` on the ``in_gname`` field and a
    ``RecordStandardiser`` around it, then cleans and standardises each raw
    name in ``self.names_gnames``.  Neither the per-name result comparison
    nor the record standardiser run is active, so the visible outcome is
    the printed ``count_dict`` of the name standardiser.
    """
    out_fields = [
        "title",
        "gender_guess",
        "given_name",
        "alt_given_name",
        "surname",
        "alt_surname",
    ]
    name_std = standardisation.NameStandardiser(
        descript="Test name standardiser",
        input_fields=["in_gname"],
        output_fiel=out_fields,
        female_t=self.name_female_titles,
        male_t=self.name_male_titles,
        tag_t=self.name_tag_table,
        corr_l=self.name_corr_list,
        hmm_train_fil="test-hmm-train.txt",
    )

    # Constructed but not run: the call to its standardise() method is
    # disabled further down.
    rec_std = standardisation.RecordStandardiser(
        descr="Test record standardiser",
        input_dataset=self.in_ds,
        output_dataset=self.out_ds,
        comp_stand_list=[name_std],
    )

    for raw_name, expected in self.names_gnames:
        cleaned = name_std.clean_component(raw_name)
        actual = name_std.standardise(raw_name, cleaned)

        # Disabled check of the standardised result:
        # assert expected == actual, \
        #     'Wrong given name first standardisation: %s, should be: %s' % \
        #     (str(actual), str(expected))

    # Disabled: use record standardiser and write output file
    # rec_std.standardise()

    print("Count dict:", name_std.count_dict)
def testPhoneNumStandardiserNone(self): """Test phone number standardiser routines""" return ps = standardisation.PhoneNumStandardiser(descript = \ 'Test phone number standardiser', input_fields = ['in_phonenum'], output_fiel = ['country_code', None, 'area_code', 'number', None]) rs = standardisation.RecordStandardiser(descr = 'Test record standardiser', input_dataset = self.in_ds, output_dataset = self.out_ds, comp_stand_list = [ps]) for (phonenum_str, phonenum_res) in self.phonenums: clean_phonenum_str = ps.clean_component(phonenum_str) test_phonenum_res = ps.standardise(phonenum_str, clean_phonenum_str) assert phonenum_res == test_phonenum_res, \ 'Wrong phone number standardisation: %s, should be: %s' % \ (str(test_phonenum_res), str(phonenum_res)) rs.standardise() # Use record standardiser and write output file # Test the content of the output data set # test_ds = dataset.DataSetCSV(description='Test standardised data set', access_mode='read', rec_ident = 'rec_id', field_list = [], header_line=True, write_header=True, file_name='test-standardised-dataset.csv') i = 0 for (rec_id, rec_list) in test_ds.readall(): test_country_code = rec_list[3] test_country_name = rec_list[4] test_area_code = rec_list[5] test_number = rec_list[6] test_extension = rec_list[7] true_country_code = self.phonenums[i][1][0] true_area_code = self.phonenums[i][1][2] true_number = self.phonenums[i][1][3] assert test_country_code == true_country_code, \ (i, rec_list[3:8], self.phonenums[i][1]) assert test_country_name == '', \ (i, rec_list[3:8], self.phonenums[i][1]) assert test_area_code == true_area_code, \ (i, rec_list[3:8], self.phonenums[i][1]) assert test_number == true_number, \ (i, rec_list[3:8], self.phonenums[i][1]) assert test_extension == '', \ (i, rec_list[3:8], self.phonenums[i][1]) i += 1
def testPhoneNumStandardiserNone(self): """Test phone number standardiser routines""" return ps = standardisation.PhoneNumStandardiser( descript="Test phone number standardiser", input_fields=["in_phonenum"], output_fiel=["country_code", None, "area_code", "number", None], ) rs = standardisation.RecordStandardiser( descr="Test record standardiser", input_dataset=self.in_ds, output_dataset=self.out_ds, comp_stand_list=[ps], ) for (phonenum_str, phonenum_res) in self.phonenums: clean_phonenum_str = ps.clean_component(phonenum_str) test_phonenum_res = ps.standardise(phonenum_str, clean_phonenum_str) assert phonenum_res == test_phonenum_res, ( "Wrong phone number standardisation: %s, should be: %s" % (str(test_phonenum_res), str(phonenum_res))) rs.standardise() # Use record standardiser and write output file # Test the content of the output data set # test_ds = dataset.DataSetCSV( description="Test standardised data set", access_mode="read", rec_ident="rec_id", field_list=[], header_line=True, write_header=True, file_name="test-standardised-dataset.csv", ) i = 0 for (rec_id, rec_list) in test_ds.readall(): test_country_code = rec_list[3] test_country_name = rec_list[4] test_area_code = rec_list[5] test_number = rec_list[6] test_extension = rec_list[7] true_country_code = self.phonenums[i][1][0] true_area_code = self.phonenums[i][1][2] true_number = self.phonenums[i][1][3] assert test_country_code == true_country_code, ( i, rec_list[3:8], self.phonenums[i][1], ) assert test_country_name == "", (i, rec_list[3:8], self.phonenums[i][1]) assert test_area_code == true_area_code, ( i, rec_list[3:8], self.phonenums[i][1], ) assert test_number == true_number, (i, rec_list[3:8], self.phonenums[i][1]) assert test_extension == "", (i, rec_list[3:8], self.phonenums[i][1]) i += 1