class JapanesePreparator(AbstractGlobalPhonePreparator):
    """Japanese specific preprocessing for the GlobalPhone corpus

    This class only declares class-level configuration: the language
    name, the preparator name derived from it, the transcription key,
    a list of wav identifiers to exclude and the phone inventory
    (transcription symbol -> IPA symbol). No method is overridden
    here; presumably all processing logic lives in
    AbstractGlobalPhonePreparator (not visible from this class).

    """
    # human readable language name, also used to build `name` below
    language = 'Japanese'

    # preparator name: the base class name and the language joined by
    # '-' and lowercased
    name = '-'.join([AbstractGlobalPhonePreparator.name, language]).lower()

    # presumably selects the romanized ('rmn') transcription files of
    # the corpus -- TODO confirm against the base class
    transcription_key = 'rmn'

    # Identifiers of the recordings left out of the prepared corpus.
    # Each entry has the form 'JA<3 digits>_<number>' (presumably
    # speaker id and utterance index). The reason for excluding these
    # recordings is not documented here.
    exclude_wavs = [
        'JA001_13', 'JA001_15', 'JA001_16', 'JA001_20', 'JA001_26', 'JA001_29',
        'JA001_32', 'JA001_33', 'JA001_34', 'JA001_41', 'JA001_50', 'JA001_54',
        'JA001_56', 'JA001_6', 'JA001_62', 'JA001_67', 'JA001_68', 'JA001_72',
        'JA001_74', 'JA001_78', 'JA001_8', 'JA001_80', 'JA001_85', 'JA001_91',
        'JA002_101', 'JA002_103', 'JA002_104', 'JA002_107', 'JA002_110',
        'JA002_18', 'JA002_19', 'JA002_23', 'JA002_27', 'JA002_28', 'JA002_30',
        'JA002_34', 'JA002_38', 'JA002_47', 'JA002_54', 'JA002_56', 'JA002_58',
        'JA002_59', 'JA002_6', 'JA002_7', 'JA002_70', 'JA002_71', 'JA002_72',
        'JA002_73', 'JA002_74', 'JA002_83', 'JA002_84', 'JA002_86', 'JA002_87',
        'JA002_88', 'JA002_89', 'JA002_90', 'JA002_93', 'JA002_96', 'JA002_97',
        'JA002_99', 'JA003_15', 'JA003_17', 'JA003_21', 'JA003_24', 'JA003_31',
        'JA003_34', 'JA003_5', 'JA003_53', 'JA003_55', 'JA003_61', 'JA004_13',
        'JA004_14', 'JA004_19', 'JA004_47', 'JA004_51', 'JA004_53', 'JA004_54',
        'JA004_55', 'JA004_64', 'JA004_67', 'JA004_72', 'JA004_85', 'JA004_86',
        'JA004_9', 'JA005_101', 'JA005_102', 'JA005_105', 'JA005_109',
        'JA005_11', 'JA005_110', 'JA005_113', 'JA005_114', 'JA005_115',
        'JA005_129', 'JA005_132', 'JA005_138', 'JA005_142', 'JA005_143',
        'JA005_15', 'JA005_16', 'JA005_2', 'JA005_23', 'JA005_24', 'JA005_25',
        'JA005_26', 'JA005_27', 'JA005_28', 'JA005_29', 'JA005_3', 'JA005_30',
        'JA005_35', 'JA005_40', 'JA005_42', 'JA005_45', 'JA005_46', 'JA005_5',
        'JA005_53', 'JA005_55', 'JA005_62', 'JA005_63', 'JA005_64', 'JA005_65',
        'JA005_69', 'JA005_7', 'JA005_70', 'JA005_74', 'JA005_76', 'JA005_79',
        'JA005_99', 'JA006_10', 'JA006_23', 'JA006_37', 'JA006_44', 'JA006_45',
        'JA006_46', 'JA006_5', 'JA006_55', 'JA006_9', 'JA007_15', 'JA007_19',
        'JA007_21', 'JA007_27', 'JA007_29', 'JA007_33', 'JA007_48', 'JA007_56',
        'JA007_61', 'JA007_64', 'JA007_65', 'JA007_68', 'JA008_14', 'JA008_25',
        'JA008_29', 'JA009_10', 'JA009_12', 'JA009_21', 'JA009_23', 'JA009_24',
        'JA009_26', 'JA009_44', 'JA009_48', 'JA009_50', 'JA009_51', 'JA009_52',
        'JA009_53', 'JA009_54', 'JA009_62', 'JA009_67', 'JA009_68', 'JA009_69',
        'JA009_71', 'JA009_72', 'JA009_97', 'JA009_98', 'JA009_99', 'JA010_15',
        'JA010_16', 'JA010_23', 'JA010_25', 'JA010_28', 'JA010_32', 'JA010_35',
        'JA010_40', 'JA010_48', 'JA010_56', 'JA010_57', 'JA010_58', 'JA010_62',
        'JA010_64', 'JA010_71', 'JA010_74', 'JA010_76', 'JA010_77', 'JA011_20',
        'JA011_38', 'JA011_54', 'JA011_55', 'JA012_29', 'JA012_3', 'JA012_32',
        'JA012_42', 'JA012_45', 'JA012_48', 'JA012_49', 'JA012_51', 'JA012_6',
        'JA012_62', 'JA012_63', 'JA012_7', 'JA013_15', 'JA013_16', 'JA013_2',
        'JA013_3', 'JA013_36', 'JA013_37', 'JA013_4', 'JA013_40', 'JA013_41',
        'JA013_48', 'JA013_53', 'JA013_56', 'JA013_60', 'JA013_67', 'JA013_79',
        'JA013_83', 'JA013_92', 'JA015_19', 'JA015_23', 'JA015_26', 'JA015_33',
        'JA015_42', 'JA015_45', 'JA015_58', 'JA015_61', 'JA015_66', 'JA015_69',
        'JA015_73', 'JA015_81', 'JA016_28', 'JA016_29', 'JA016_32', 'JA016_4',
        'JA016_42', 'JA016_43', 'JA016_53', 'JA016_54', 'JA016_8', 'JA017_3',
        'JA017_53', 'JA017_57', 'JA017_62', 'JA017_64', 'JA018_22', 'JA018_40',
        'JA018_49', 'JA018_50', 'JA018_52', 'JA018_7', 'JA018_8', 'JA019_4',
        'JA019_59', 'JA019_69', 'JA019_71', 'JA019_76', 'JA019_77', 'JA020_26',
        'JA020_31', 'JA020_38', 'JA020_47', 'JA020_50', 'JA020_56', 'JA020_58',
        'JA020_66', 'JA020_77', 'JA020_8', 'JA020_81', 'JA021_13', 'JA022_20',
        'JA023_70', 'JA023_8', 'JA024_1', 'JA024_3', 'JA024_30', 'JA024_34',
        'JA024_37', 'JA024_39', 'JA024_46', 'JA024_49', 'JA024_56', 'JA025_21',
        'JA026_105', 'JA026_17', 'JA026_21', 'JA026_24', 'JA026_27',
        'JA026_30', 'JA026_41', 'JA026_45', 'JA026_53', 'JA026_56', 'JA026_58',
        'JA026_68', 'JA026_78', 'JA026_8', 'JA026_84', 'JA026_86', 'JA027_10',
        'JA027_24', 'JA027_25', 'JA027_31', 'JA027_33', 'JA027_58', 'JA027_8',
        'JA028_10', 'JA028_11', 'JA028_20', 'JA028_22', 'JA028_26', 'JA028_31',
        'JA028_33', 'JA028_34', 'JA028_37', 'JA028_4', 'JA028_45', 'JA028_47',
        'JA028_48', 'JA028_51', 'JA028_52', 'JA028_56', 'JA028_61', 'JA029_14',
        'JA029_20', 'JA029_24', 'JA029_25', 'JA029_27', 'JA029_35', 'JA029_36',
        'JA029_49', 'JA029_6', 'JA029_61', 'JA029_7', 'JA030_103', 'JA030_105',
        'JA030_11', 'JA030_111', 'JA030_112', 'JA030_116', 'JA030_19',
        'JA030_20', 'JA030_24', 'JA030_25', 'JA030_28', 'JA030_3', 'JA030_31',
        'JA030_32', 'JA030_36', 'JA030_46', 'JA030_58', 'JA030_66', 'JA030_67',
        'JA030_68', 'JA030_77', 'JA030_80', 'JA030_86', 'JA030_87', 'JA030_90',
        'JA030_92', 'JA030_94', 'JA030_95', 'JA030_98', 'JA031_13', 'JA031_2',
        'JA031_32', 'JA031_37', 'JA031_42', 'JA031_44', 'JA032_10', 'JA032_25',
        'JA032_52', 'JA032_57', 'JA032_65', 'JA032_67', 'JA032_68', 'JA033_1',
        'JA033_15', 'JA033_19', 'JA033_24', 'JA033_28', 'JA033_30', 'JA033_31',
        'JA033_4', 'JA033_40', 'JA033_43', 'JA033_51', 'JA033_56', 'JA033_58',
        'JA033_6', 'JA033_61', 'JA033_66', 'JA034_18', 'JA034_3', 'JA034_47',
        'JA034_5', 'JA034_55', 'JA034_57', 'JA034_80', 'JA035_36', 'JA035_54',
        'JA035_59', 'JA035_83', 'JA036_101', 'JA036_19', 'JA036_3', 'JA036_31',
        'JA036_34', 'JA036_4', 'JA036_41', 'JA036_44', 'JA036_45', 'JA036_5',
        'JA036_51', 'JA036_52', 'JA036_54', 'JA036_60', 'JA036_62', 'JA036_67',
        'JA036_72', 'JA036_73', 'JA036_77', 'JA036_78', 'JA036_80', 'JA036_83',
        'JA036_85', 'JA036_88', 'JA036_92', 'JA036_95', 'JA037_11', 'JA037_12',
        'JA037_13', 'JA037_15', 'JA037_17', 'JA037_25', 'JA037_4', 'JA037_44',
        'JA037_45', 'JA037_46', 'JA037_47', 'JA037_49', 'JA037_55', 'JA037_56',
        'JA037_59', 'JA037_6', 'JA037_60', 'JA037_61', 'JA037_72', 'JA037_75',
        'JA037_76', 'JA037_77', 'JA038_22', 'JA038_25', 'JA038_5', 'JA039_1',
        'JA039_11', 'JA039_12', 'JA039_15', 'JA039_18', 'JA039_25', 'JA039_47',
        'JA039_49', 'JA039_51', 'JA039_62', 'JA039_70', 'JA039_71', 'JA039_73',
        'JA039_74', 'JA039_80', 'JA039_83', 'JA040_4', 'JA040_42', 'JA040_43',
        'JA040_48', 'JA040_5', 'JA040_53', 'JA040_55', 'JA040_56', 'JA040_58',
        'JA040_68', 'JA040_73', 'JA040_77', 'JA040_78', 'JA041_100',
        'JA041_103', 'JA041_104', 'JA041_105', 'JA041_12', 'JA041_14',
        'JA041_19', 'JA041_24', 'JA041_30', 'JA041_34', 'JA041_37', 'JA041_40',
        'JA041_45', 'JA041_51', 'JA041_57', 'JA041_67', 'JA041_69', 'JA041_71',
        'JA041_77', 'JA041_78', 'JA041_81', 'JA041_86', 'JA041_87', 'JA041_88',
        'JA041_90', 'JA041_95', 'JA042_17', 'JA042_21', 'JA042_24', 'JA042_35',
        'JA042_39', 'JA042_40', 'JA042_59', 'JA042_65', 'JA043_31', 'JA043_69',
        'JA043_82', 'JA044_113', 'JA044_114', 'JA044_120', 'JA044_24',
        'JA044_25', 'JA044_26', 'JA044_27', 'JA044_28', 'JA044_29', 'JA044_30',
        'JA044_38', 'JA044_39', 'JA044_42', 'JA044_43', 'JA044_51', 'JA044_52',
        'JA044_68', 'JA044_69', 'JA044_70', 'JA044_71', 'JA044_82', 'JA044_83',
        'JA044_88', 'JA044_89', 'JA044_90', 'JA044_93', 'JA044_94', 'JA045_3',
        'JA045_4', 'JA046_1', 'JA046_3', 'JA046_6', 'JA046_7', 'JA048_14',
        'JA048_35', 'JA048_5', 'JA049_103', 'JA049_113', 'JA049_2', 'JA049_26',
        'JA049_27', 'JA049_30', 'JA049_33', 'JA049_35', 'JA049_38', 'JA049_48',
        'JA049_51', 'JA049_57', 'JA049_62', 'JA049_7', 'JA049_73', 'JA049_75',
        'JA049_8', 'JA049_95', 'JA049_96', 'JA049_98', 'JA049_99', 'JA050_19',
        'JA050_40', 'JA050_74', 'JA050_80', 'JA051_1', 'JA051_15', 'JA051_17',
        'JA051_21', 'JA051_22', 'JA051_24', 'JA051_26', 'JA051_33', 'JA051_35',
        'JA051_40', 'JA051_42', 'JA051_43', 'JA051_45', 'JA051_48', 'JA051_50',
        'JA051_53', 'JA051_54', 'JA051_59', 'JA051_6', 'JA051_61', 'JA051_65',
        'JA051_66', 'JA051_68', 'JA051_74', 'JA051_78', 'JA051_80', 'JA051_87',
        'JA051_9', 'JA052_50', 'JA052_69', 'JA053_10', 'JA053_15', 'JA053_23',
        'JA053_26', 'JA053_29', 'JA053_3', 'JA053_30', 'JA053_31', 'JA053_33',
        'JA053_34', 'JA053_36', 'JA053_39', 'JA053_40', 'JA053_45', 'JA053_46',
        'JA053_47', 'JA053_49', 'JA053_57', 'JA053_62', 'JA053_68', 'JA053_71',
        'JA053_75', 'JA053_8', 'JA053_80', 'JA053_82', 'JA053_83', 'JA054_22',
        'JA054_25', 'JA054_30', 'JA054_31', 'JA054_37', 'JA054_45', 'JA054_53',
        'JA054_6', 'JA054_65', 'JA054_67', 'JA055_17', 'JA055_2', 'JA055_21',
        'JA055_25', 'JA055_26', 'JA055_40', 'JA055_42', 'JA055_44', 'JA055_45',
        'JA055_47', 'JA055_52', 'JA055_53', 'JA055_58', 'JA055_63', 'JA055_64',
        'JA055_66', 'JA055_7', 'JA055_70', 'JA055_71', 'JA055_72', 'JA055_73',
        'JA055_76', 'JA055_77', 'JA055_8', 'JA055_80', 'JA055_81', 'JA055_82',
        'JA055_85', 'JA055_89', 'JA055_9', 'JA055_92', 'JA056_11', 'JA056_21',
        'JA056_26', 'JA056_29', 'JA056_33', 'JA056_4', 'JA056_45', 'JA056_53',
        'JA056_57', 'JA056_58', 'JA056_61', 'JA056_62', 'JA056_74', 'JA056_76',
        'JA057_10', 'JA057_13', 'JA057_16', 'JA057_2', 'JA057_21', 'JA057_24',
        'JA057_25', 'JA057_28', 'JA057_3', 'JA057_30', 'JA057_33', 'JA057_34',
        'JA057_39', 'JA057_4', 'JA057_41', 'JA057_43', 'JA057_46', 'JA057_48',
        'JA057_51', 'JA057_52', 'JA057_53', 'JA057_57', 'JA057_6', 'JA057_61',
        'JA057_63', 'JA057_68', 'JA057_8', 'JA057_9', 'JA059_14', 'JA059_15',
        'JA059_16', 'JA059_23', 'JA059_26', 'JA059_3', 'JA059_39', 'JA059_41',
        'JA059_43', 'JA059_45', 'JA059_49', 'JA059_56', 'JA059_60', 'JA059_63',
        'JA059_7', 'JA059_72', 'JA060_107', 'JA060_43', 'JA060_52', 'JA060_75',
        'JA060_76', 'JA060_81', 'JA060_82', 'JA060_85', 'JA060_94', 'JA061_11',
        'JA061_17', 'JA061_2', 'JA061_20', 'JA061_4', 'JA061_57', 'JA061_61',
        'JA061_7', 'JA061_84', 'JA061_89', 'JA062_11', 'JA062_20', 'JA062_49',
        'JA062_50', 'JA062_7', 'JA062_9', 'JA063_24', 'JA063_84', 'JA063_87',
        'JA066_17', 'JA066_44', 'JA066_59', 'JA066_6', 'JA067_13', 'JA067_15',
        'JA067_16', 'JA067_20', 'JA067_22', 'JA067_29', 'JA067_33', 'JA067_35',
        'JA067_36', 'JA067_38', 'JA067_43', 'JA067_44', 'JA067_45', 'JA067_47',
        'JA067_5', 'JA067_50', 'JA067_52', 'JA067_54', 'JA067_55', 'JA067_56',
        'JA067_59', 'JA067_61', 'JA067_62', 'JA067_63', 'JA067_64', 'JA067_67',
        'JA067_68', 'JA067_69', 'JA067_70', 'JA067_8', 'JA068_49', 'JA069_21',
        'JA069_45', 'JA069_7', 'JA069_8', 'JA070_14', 'JA070_15', 'JA070_21',
        'JA070_22', 'JA070_3', 'JA070_33', 'JA070_38', 'JA070_39', 'JA070_4',
        'JA070_47', 'JA070_53', 'JA070_57', 'JA070_58', 'JA070_59', 'JA070_64',
        'JA070_67', 'JA070_68', 'JA070_7', 'JA070_71', 'JA070_77', 'JA070_8',
        'JA070_81', 'JA070_88', 'JA070_90', 'JA071_31', 'JA071_55', 'JA071_57',
        'JA071_62', 'JA071_70', 'JA073_13', 'JA073_15', 'JA073_16', 'JA073_24',
        'JA073_27', 'JA073_3', 'JA073_38', 'JA073_40', 'JA073_51', 'JA073_52',
        'JA073_53', 'JA073_7', 'JA073_9', 'JA074_16', 'JA074_20', 'JA074_24',
        'JA074_25', 'JA074_28', 'JA074_37', 'JA074_42', 'JA074_43', 'JA074_45',
        'JA074_47', 'JA074_5', 'JA074_50', 'JA074_51', 'JA074_52', 'JA074_56',
        'JA074_57', 'JA074_59', 'JA074_6', 'JA074_7', 'JA075_1', 'JA075_2',
        'JA075_3', 'JA076_103', 'JA076_13', 'JA076_14', 'JA076_25', 'JA076_27',
        'JA076_3', 'JA076_55', 'JA076_83', 'JA076_87', 'JA077_10', 'JA077_16',
        'JA077_28', 'JA077_31', 'JA077_51', 'JA077_59', 'JA077_64', 'JA077_65',
        'JA077_8', 'JA078_1', 'JA078_103', 'JA078_107', 'JA078_110',
        'JA078_112', 'JA078_116', 'JA078_120', 'JA078_121', 'JA078_123',
        'JA078_16', 'JA078_17', 'JA078_19', 'JA078_2', 'JA078_20', 'JA078_22',
        'JA078_23', 'JA078_24', 'JA078_27', 'JA078_28', 'JA078_3', 'JA078_39',
        'JA078_4', 'JA078_43', 'JA078_44', 'JA078_6', 'JA078_60', 'JA078_62',
        'JA078_66', 'JA078_7', 'JA078_73', 'JA078_83', 'JA078_90', 'JA078_93',
        'JA078_95', 'JA078_98', 'JA079_13', 'JA079_22', 'JA079_33', 'JA079_36',
        'JA079_74', 'JA080_101', 'JA080_12', 'JA080_16', 'JA080_18',
        'JA080_19', 'JA080_20', 'JA080_26', 'JA080_27', 'JA080_3', 'JA080_30',
        'JA080_32', 'JA080_33', 'JA080_37', 'JA080_38', 'JA080_39', 'JA080_4',
        'JA080_40', 'JA080_41', 'JA080_47', 'JA080_48', 'JA080_49', 'JA080_50',
        'JA080_51', 'JA080_59', 'JA080_6', 'JA080_60', 'JA080_61', 'JA080_62',
        'JA080_63', 'JA080_64', 'JA080_65', 'JA080_66', 'JA080_67', 'JA080_69',
        'JA080_7', 'JA080_71', 'JA080_72', 'JA080_74', 'JA080_79', 'JA080_8',
        'JA080_82', 'JA080_84', 'JA080_85', 'JA080_89', 'JA080_90', 'JA080_94',
        'JA080_95', 'JA080_98', 'JA080_99', 'JA081_17', 'JA081_21', 'JA081_26',
        'JA081_3', 'JA081_30', 'JA081_32', 'JA081_34', 'JA081_37', 'JA081_38',
        'JA081_58', 'JA081_66', 'JA081_7', 'JA081_74', 'JA081_76', 'JA081_80',
        'JA083_16', 'JA083_17', 'JA083_66', 'JA084_1', 'JA085_15', 'JA085_16',
        'JA085_4', 'JA085_53', 'JA085_80', 'JA086_17', 'JA086_18', 'JA086_58',
        'JA086_59', 'JA086_6', 'JA087_101', 'JA087_12', 'JA087_4', 'JA087_40',
        'JA087_53', 'JA087_63', 'JA088_13', 'JA088_15', 'JA088_62', 'JA089_1',
        'JA089_2', 'JA089_3', 'JA089_38', 'JA089_39', 'JA089_44', 'JA089_45',
        'JA089_46', 'JA089_47', 'JA089_67', 'JA089_7', 'JA089_77', 'JA089_78',
        'JA089_88', 'JA089_89', 'JA089_90', 'JA089_91', 'JA091_20', 'JA091_53',
        'JA091_62', 'JA091_63', 'JA091_64', 'JA091_69', 'JA092_22', 'JA092_62',
        'JA094_1', 'JA094_2', 'JA094_3', 'JA094_58', 'JA094_72', 'JA095_8',
        'JA096_12', 'JA096_62', 'JA097_19', 'JA097_59', 'JA098_25', 'JA099_11',
        'JA099_33', 'JA099_40', 'JA099_6', 'JA100_103', 'JA100_113',
        'JA100_119', 'JA100_14', 'JA100_18', 'JA100_69', 'JA100_72',
        'JA100_79', 'JA100_83', 'JA102_102', 'JA102_110', 'JA102_17',
        'JA102_20', 'JA102_21', 'JA102_24', 'JA102_60', 'JA102_63', 'JA102_69',
        'JA102_77', 'JA103_19', 'JA103_4', 'JA104_19', 'JA104_26', 'JA104_33',
        'JA104_6', 'JA106_43', 'JA107_10', 'JA107_22', 'JA107_25', 'JA108_38',
        'JA108_4', 'JA108_41', 'JA108_48', 'JA108_58', 'JA109_14', 'JA109_22',
        'JA109_35', 'JA109_36', 'JA109_40', 'JA109_43', 'JA109_51', 'JA109_54',
        'JA109_68', 'JA109_69', 'JA109_73', 'JA109_76', 'JA109_82',
        'JA110_100', 'JA110_12', 'JA110_30', 'JA110_31', 'JA110_45',
        'JA110_52', 'JA110_68', 'JA110_74', 'JA110_76', 'JA110_82', 'JA110_88',
        'JA110_90', 'JA111_16', 'JA111_23', 'JA111_25', 'JA111_38', 'JA111_39',
        'JA111_49', 'JA111_50', 'JA111_53', 'JA111_56', 'JA111_57', 'JA111_8',
        'JA111_89', 'JA112_11', 'JA112_14', 'JA112_22', 'JA112_25', 'JA112_36',
        'JA112_40', 'JA112_57', 'JA112_60', 'JA112_69', 'JA112_73', 'JA113_18',
        'JA113_19', 'JA113_22', 'JA113_25', 'JA113_26', 'JA113_39', 'JA113_4',
        'JA113_42', 'JA113_44', 'JA113_46', 'JA113_48', 'JA113_52', 'JA113_54',
        'JA113_56', 'JA113_62', 'JA113_65', 'JA113_67', 'JA113_68', 'JA113_71',
        'JA113_9', 'JA114_3', 'JA114_43', 'JA114_45', 'JA114_51', 'JA114_52',
        'JA114_54', 'JA114_55', 'JA115_14', 'JA115_15', 'JA115_17', 'JA115_20',
        'JA115_21', 'JA115_24', 'JA115_26', 'JA115_27', 'JA115_29', 'JA115_3',
        'JA115_31', 'JA115_32', 'JA115_34', 'JA115_35', 'JA115_37', 'JA115_38',
        'JA115_39', 'JA115_51', 'JA115_52', 'JA115_55', 'JA115_59', 'JA115_6',
        'JA115_60', 'JA115_62', 'JA115_63', 'JA115_68', 'JA115_70', 'JA115_73',
        'JA115_79', 'JA115_8', 'JA116_20', 'JA116_3', 'JA116_52', 'JA116_7',
        'JA117_11', 'JA117_16', 'JA117_32', 'JA117_40', 'JA117_52', 'JA117_56',
        'JA117_57', 'JA117_61', 'JA117_62', 'JA118_18', 'JA118_23', 'JA118_24',
        'JA118_28', 'JA118_29', 'JA118_38', 'JA118_4', 'JA118_8', 'JA201_10',
        'JA201_100', 'JA201_103', 'JA201_105', 'JA201_111', 'JA201_112',
        'JA201_113', 'JA201_114', 'JA201_115', 'JA201_121', 'JA201_122',
        'JA201_124', 'JA201_126', 'JA201_127', 'JA201_130', 'JA201_134',
        'JA201_135', 'JA201_136', 'JA201_137', 'JA201_138', 'JA201_14',
        'JA201_140', 'JA201_141', 'JA201_142', 'JA201_144', 'JA201_146',
        'JA201_149', 'JA201_15', 'JA201_150', 'JA201_152', 'JA201_153',
        'JA201_155', 'JA201_156', 'JA201_158', 'JA201_159', 'JA201_16',
        'JA201_160', 'JA201_19', 'JA201_2', 'JA201_23', 'JA201_27', 'JA201_29',
        'JA201_34', 'JA201_36', 'JA201_39', 'JA201_41', 'JA201_42', 'JA201_44',
        'JA201_45', 'JA201_47', 'JA201_49', 'JA201_5', 'JA201_52', 'JA201_53',
        'JA201_55', 'JA201_56', 'JA201_58', 'JA201_59', 'JA201_6', 'JA201_63',
        'JA201_65', 'JA201_66', 'JA201_67', 'JA201_68', 'JA201_71', 'JA201_72',
        'JA201_73', 'JA201_74', 'JA201_76', 'JA201_8', 'JA201_80', 'JA201_82',
        'JA201_83', 'JA201_88', 'JA201_89', 'JA201_9', 'JA201_92', 'JA201_94',
        'JA201_97', 'JA202_106', 'JA202_107', 'JA202_110', 'JA202_111',
        'JA202_113', 'JA202_120', 'JA202_126', 'JA202_127', 'JA202_132',
        'JA202_133', 'JA202_135', 'JA202_136', 'JA202_138', 'JA202_14',
        'JA202_140', 'JA202_141', 'JA202_142', 'JA202_145', 'JA202_17',
        'JA202_18', 'JA202_19', 'JA202_2', 'JA202_20', 'JA202_21', 'JA202_26',
        'JA202_28', 'JA202_29', 'JA202_35', 'JA202_45', 'JA202_46', 'JA202_51',
        'JA202_52', 'JA202_53', 'JA202_54', 'JA202_6', 'JA202_62', 'JA202_63',
        'JA202_64', 'JA202_65', 'JA202_66', 'JA202_67', 'JA202_7', 'JA202_70',
        'JA202_72', 'JA202_73', 'JA202_76', 'JA202_79', 'JA202_80', 'JA202_83',
        'JA202_85', 'JA202_86', 'JA202_88', 'JA202_89', 'JA202_9', 'JA202_91',
        'JA202_94', 'JA202_95', 'JA202_96', 'JA202_98', 'JA202_99', 'JA203_10',
        'JA203_103', 'JA203_104', 'JA203_11', 'JA203_114', 'JA203_117',
        'JA203_12', 'JA203_121', 'JA203_124', 'JA203_126', 'JA203_13',
        'JA203_14', 'JA203_15', 'JA203_16', 'JA203_2', 'JA203_54', 'JA203_55',
        'JA203_57', 'JA203_58', 'JA203_62', 'JA203_66', 'JA203_67', 'JA203_69',
        'JA203_70', 'JA203_71', 'JA203_73', 'JA203_77', 'JA203_78', 'JA203_80',
        'JA203_82', 'JA203_83', 'JA203_90', 'JA203_97', 'JA203_98', 'JA203_99',
        'JA204_102', 'JA204_103', 'JA204_104', 'JA204_105', 'JA204_11',
        'JA204_114', 'JA204_115', 'JA204_14', 'JA204_2', 'JA204_25',
        'JA204_47', 'JA204_8', 'JA204_88', 'JA204_9', 'JA204_90', 'JA204_97',
        'JA205_1', 'JA205_100', 'JA205_103', 'JA205_108', 'JA205_111',
        'JA205_113', 'JA205_114', 'JA205_119', 'JA205_120', 'JA205_121',
        'JA205_130', 'JA205_131', 'JA205_133', 'JA205_136', 'JA205_139',
        'JA205_150', 'JA205_151', 'JA205_152', 'JA205_155', 'JA205_162',
        'JA205_166', 'JA205_17', 'JA205_19', 'JA205_2', 'JA205_24', 'JA205_29',
        'JA205_30', 'JA205_4', 'JA205_41', 'JA205_42', 'JA205_45', 'JA205_47',
        'JA205_50', 'JA205_51', 'JA205_55', 'JA205_58', 'JA205_60', 'JA205_63',
        'JA205_67', 'JA205_70', 'JA205_71', 'JA205_75', 'JA205_76', 'JA205_79',
        'JA205_8', 'JA205_80', 'JA205_96', 'JA205_97', 'JA205_99', 'JA206_1',
        'JA206_2', 'JA206_3', 'JA206_4', 'JA207_1', 'JA207_100', 'JA207_105',
        'JA207_106', 'JA207_116', 'JA207_125', 'JA207_138', 'JA207_139',
        'JA207_14', 'JA207_141', 'JA207_143', 'JA207_148', 'JA207_153',
        'JA207_160', 'JA207_20', 'JA207_22', 'JA207_23', 'JA207_27',
        'JA207_31', 'JA207_37', 'JA207_38', 'JA207_4', 'JA207_43', 'JA207_49',
        'JA207_5', 'JA207_56', 'JA207_59', 'JA207_60', 'JA207_9', 'JA207_90',
        'JA207_92', 'JA207_93', 'JA207_96', 'JA209_1', 'JA209_100',
        'JA209_103', 'JA209_108', 'JA209_111', 'JA209_113', 'JA209_115',
        'JA209_116', 'JA209_124', 'JA209_133', 'JA209_15', 'JA209_20',
        'JA209_27', 'JA209_30', 'JA209_33', 'JA209_36', 'JA209_38', 'JA209_39',
        'JA209_41', 'JA209_43', 'JA209_49', 'JA209_51', 'JA209_57', 'JA209_60',
        'JA209_64', 'JA209_75', 'JA209_80', 'JA209_88', 'JA209_91', 'JA209_92',
        'JA209_94', 'JA209_98', 'JA210_100', 'JA210_109', 'JA210_112',
        'JA210_114', 'JA210_118', 'JA210_12', 'JA210_14', 'JA210_20',
        'JA210_22', 'JA210_26', 'JA210_27', 'JA210_29', 'JA210_3', 'JA210_31',
        'JA210_34', 'JA210_36', 'JA210_40', 'JA210_45', 'JA210_56', 'JA210_59',
        'JA210_71', 'JA210_73', 'JA210_79', 'JA210_81', 'JA210_83', 'JA210_84',
        'JA210_86', 'JA210_87', 'JA210_89', 'JA210_92', 'JA210_93', 'JA210_98',
        'JA210_99', 'JA211_104', 'JA211_106', 'JA211_109', 'JA211_118',
        'JA211_125', 'JA211_128', 'JA211_16', 'JA211_17', 'JA211_18',
        'JA211_31', 'JA211_32', 'JA211_33', 'JA211_35', 'JA211_37', 'JA211_45',
        'JA211_46', 'JA211_48', 'JA211_5', 'JA211_57', 'JA211_8', 'JA211_82',
        'JA211_89', 'JA211_9', 'JA211_91', 'JA211_99', 'JA212_15', 'JA212_30',
        'JA212_41', 'JA212_44', 'JA212_51', 'JA212_52', 'JA212_55', 'JA212_59',
        'JA212_6', 'JA212_63', 'JA212_64', 'JA212_72', 'JA212_74', 'JA212_8',
        'JA212_80', 'JA212_82', 'JA212_86', 'JA212_94', 'JA213_1', 'JA213_10',
        'JA213_101', 'JA213_102', 'JA213_104', 'JA213_105', 'JA213_113',
        'JA213_118', 'JA213_13', 'JA213_17', 'JA213_18', 'JA213_2', 'JA213_21',
        'JA213_23', 'JA213_25', 'JA213_27', 'JA213_35', 'JA213_36', 'JA213_37',
        'JA213_39', 'JA213_4', 'JA213_40', 'JA213_42', 'JA213_44', 'JA213_53',
        'JA213_56', 'JA213_57', 'JA213_60', 'JA213_61', 'JA213_64', 'JA213_67',
        'JA213_69', 'JA213_71', 'JA213_76', 'JA213_77', 'JA213_8', 'JA213_81',
        'JA213_82', 'JA213_84', 'JA213_88', 'JA213_9', 'JA213_93', 'JA213_95',
        'JA213_99', 'JA214_1', 'JA214_10', 'JA214_100', 'JA214_102',
        'JA214_104', 'JA214_120', 'JA214_121', 'JA214_124', 'JA214_127',
        'JA214_13', 'JA214_130', 'JA214_134', 'JA214_136', 'JA214_138',
        'JA214_139', 'JA214_140', 'JA214_22', 'JA214_25', 'JA214_26',
        'JA214_29', 'JA214_30', 'JA214_31', 'JA214_36', 'JA214_39', 'JA214_42',
        'JA214_44', 'JA214_45', 'JA214_54', 'JA214_57', 'JA214_60', 'JA214_63',
        'JA214_67', 'JA214_75', 'JA214_76', 'JA214_87', 'JA214_94', 'JA214_95',
        'JA214_97', 'JA215_101', 'JA215_102', 'JA215_105', 'JA215_106',
        'JA215_111', 'JA215_113', 'JA215_120', 'JA215_125', 'JA215_128',
        'JA215_131', 'JA215_133', 'JA215_134', 'JA215_135', 'JA215_138',
        'JA215_17', 'JA215_34', 'JA215_39', 'JA215_4', 'JA215_40', 'JA215_45',
        'JA215_46', 'JA215_55', 'JA215_57', 'JA215_58', 'JA215_64', 'JA215_66',
        'JA215_7', 'JA215_72', 'JA215_73', 'JA215_74', 'JA215_79', 'JA215_81',
        'JA215_83', 'JA215_84', 'JA215_86', 'JA215_98', 'JA216_104',
        'JA216_106', 'JA216_112', 'JA216_114', 'JA216_116', 'JA216_120',
        'JA216_123', 'JA216_127', 'JA216_133', 'JA216_136', 'JA216_138',
        'JA216_144', 'JA216_145', 'JA216_22', 'JA216_26', 'JA216_28',
        'JA216_30', 'JA216_33', 'JA216_37', 'JA216_39', 'JA216_47', 'JA216_5',
        'JA216_50', 'JA216_56', 'JA216_61', 'JA216_65', 'JA216_69', 'JA216_72',
        'JA216_85', 'JA216_91', 'JA216_92', 'JA216_99', 'JA217_10',
        'JA217_100', 'JA217_101', 'JA217_102', 'JA217_104', 'JA217_105',
        'JA217_11', 'JA217_117', 'JA217_118', 'JA217_119', 'JA217_120',
        'JA217_123', 'JA217_125', 'JA217_132', 'JA217_134', 'JA217_137',
        'JA217_138', 'JA217_14', 'JA217_141', 'JA217_143', 'JA217_144',
        'JA217_146', 'JA217_157', 'JA217_159', 'JA217_160', 'JA217_18',
        'JA217_19', 'JA217_20', 'JA217_21', 'JA217_23', 'JA217_24', 'JA217_25',
        'JA217_27', 'JA217_28', 'JA217_31', 'JA217_32', 'JA217_36', 'JA217_46',
        'JA217_5', 'JA217_51', 'JA217_59', 'JA217_6', 'JA217_61', 'JA217_62',
        'JA217_63', 'JA217_65', 'JA217_72', 'JA217_8', 'JA217_82', 'JA217_84',
        'JA217_86', 'JA217_88', 'JA217_89', 'JA217_90', 'JA217_91', 'JA217_92',
        'JA217_93', 'JA217_99', 'JA218_107', 'JA218_11', 'JA218_110',
        'JA218_117', 'JA218_125', 'JA218_132', 'JA218_141', 'JA218_22',
        'JA218_23', 'JA218_26', 'JA218_27', 'JA218_29', 'JA218_31', 'JA218_34',
        'JA218_38', 'JA218_42', 'JA218_43', 'JA218_51', 'JA218_56', 'JA218_60',
        'JA218_65', 'JA218_67', 'JA218_73', 'JA218_77', 'JA218_8', 'JA218_90',
        'JA219_105', 'JA219_109', 'JA219_11', 'JA219_112', 'JA219_113',
        'JA219_114', 'JA219_115', 'JA219_119', 'JA219_121', 'JA219_122',
        'JA219_127', 'JA219_133', 'JA219_142', 'JA219_143', 'JA219_148',
        'JA219_149', 'JA219_151', 'JA219_154', 'JA219_17', 'JA219_19',
        'JA219_20', 'JA219_21', 'JA219_24', 'JA219_28', 'JA219_32', 'JA219_34',
        'JA219_35', 'JA219_36', 'JA219_39', 'JA219_4', 'JA219_47', 'JA219_48',
        'JA219_49', 'JA219_50', 'JA219_56', 'JA219_6', 'JA219_69', 'JA219_73',
        'JA219_79', 'JA219_8', 'JA219_89', 'JA219_93', 'JA219_96', 'JA219_98',
        'JA220_115', 'JA220_119', 'JA220_39', 'JA220_45', 'JA220_47',
        'JA220_55', 'JA221_101', 'JA221_108', 'JA221_113', 'JA221_27',
        'JA221_3', 'JA221_52', 'JA221_54', 'JA221_58', 'JA221_63', 'JA221_66',
        'JA221_67', 'JA221_80', 'JA221_9', 'JA222_100', 'JA222_103',
        'JA222_105', 'JA222_106', 'JA222_112', 'JA222_33', 'JA222_54',
        'JA222_55', 'JA222_57', 'JA222_68', 'JA222_69', 'JA222_82', 'JA222_89',
        'JA222_91', 'JA222_96', 'JA222_99', 'JA223_100', 'JA223_101',
        'JA223_102', 'JA223_103', 'JA223_32', 'JA223_92', 'JA223_93',
        'JA223_94', 'JA223_95', 'JA223_96', 'JA223_97', 'JA223_98', 'JA223_99',
        'JA224_101', 'JA224_103', 'JA224_19', 'JA224_21', 'JA224_22',
        'JA224_23', 'JA224_26', 'JA224_29', 'JA224_35', 'JA224_37', 'JA224_41',
        'JA224_42', 'JA224_46', 'JA224_48', 'JA224_67', 'JA224_7', 'JA224_70',
        'JA224_80', 'JA224_82', 'JA224_85', 'JA224_87', 'JA224_9', 'JA224_92',
        'JA224_94', 'JA224_99', 'JA225_100', 'JA225_107', 'JA225_108',
        'JA225_114', 'JA225_27', 'JA225_3', 'JA225_30', 'JA225_33', 'JA225_43',
        'JA225_45', 'JA225_60', 'JA225_68', 'JA225_70', 'JA225_82', 'JA225_87',
        'JA226_114', 'JA226_119', 'JA226_21', 'JA226_29', 'JA226_54',
        'JA226_61', 'JA226_76', 'JA226_83', 'JA226_84', 'JA226_87', 'JA226_89',
        'JA226_97', 'JA228_11', 'JA228_12', 'JA228_13', 'JA228_17', 'JA228_18',
        'JA228_2', 'JA228_20', 'JA228_25', 'JA228_26', 'JA228_27', 'JA228_28',
        'JA228_3', 'JA228_30', 'JA228_32', 'JA228_33', 'JA228_34', 'JA228_35',
        'JA228_36', 'JA228_37', 'JA228_40', 'JA228_43', 'JA228_6', 'JA228_7',
        'JA228_9'
    ]

    # Segment inventory reference:
    # https://docs.google.com/spreadsheets/d/1a4ZWvuKfe2wMd_sVOid3KLY7PqKQkPe1uYNa_7zC5Gw/edit?pli=1#gid=0
    # with C+glide considered as two separate phonemes

    # Vowels: transcription symbol -> IPA symbol. The '+H' suffix marks
    # the long counterpart of a vowel, rendered as the short vowel's
    # IPA symbol followed by ':'.
    vowels = {
        'a': u'ä',
        'e': u'e',
        'i': u'i',
        'o': u'o',
        'u': u'ɯ',
        'a+H': u'ä:',
        'e+H': u'e:',
        'i+H': u'i:',
        'o+H': u'o:',
        'u+H': u'ɯ:'
    }

    # Consonants: transcription symbol -> IPA symbol. The 'Q+' prefix
    # marks the lengthened (geminate) counterpart of a consonant,
    # rendered with a trailing ':'. The '+y' suffix on c, s and z maps
    # to a distinct (alveolo-palatal) IPA symbol.
    consonants = {
        'w': u'w',
        'y': u'j',
        'm': u'm',
        'n': u'n',
        'N': u'ɴ',
        'd': u'd',
        't': u't',
        'Q+t': u't:',
        'c': u't͡s',
        'c+y': u't͡ɕ',
        'Q+c+y': u't͡ɕ:',
        's': u's',
        'Q+s': u's:',
        's+y': u'ɕ',
        'Q+s+y': u'ɕ:',
        'z': u'z',  # fricative or affricate
        'z+y': u'ʑ',  # fricative or affricate
        'F': u'ɸ',
        'h': u'h',
        'g': u'g',
        'k': u'k',
        'Q+k': u'k:',
        'p': u'p',
        'Q+p': u'p:',
        'r': u'r',
        'b': u'b',
        # infrequent phones (the lowest counts in the frequency table
        # below); they are still part of the inventory declared here
        'Q+c': u't͡s:',
        'Q+g': u'g:',
        'Q+h': u'h:',
        'Q+d': u'd:',
        'Q+z': u'z:',  # fricative or affricate
        'Q+z+y': u'ʑ:',  # fricative or affricate
        'Q+F': u'ɸ:',
        'Q+b': u'b:'
    }
    # The string literal below is not a docstring: it is a bare
    # expression statement used as a block comment and has no effect
    # at runtime.
    #
    # NOTE(review): the first entries of this table (('a', 1244643),
    # ('o', 930707)) are identical to those of the frequency table in
    # SPSCSJPreparator later in this file -- confirm the figures were
    # really measured on GlobalPhone Japanese and not copied from the
    # CSJ recipe. The removal step mentioned at the end of the table is
    # not visible in this class.
    """
    Frequencies of the various phones in the corpus
    produced by this recipe, before dropping infrequent
    phones:

        [('a', 1244643),
         ('o', 930707),
         ('i', 809173),
         ('e', 624466),
         ('k', 535058),
         ('n', 518087),
         ('u', 499639),
         ('t', 445306),
         ('m', 337715),
         ('r', 326103),
         ('s', 297367),
         ('d', 285754),
         ('N', 276768),
         ('o+H', 189680),
         ('s+y', 186292),
         ('y', 164743),
         ('g', 162616),
         ('Q+t', 108847),
         ('w', 108092),
         ('h', 92601),
         ('u+H', 82833),
         ('e+H', 81238),
         ('z+y', 76611),
         ('b', 72008),
         ('c+y', 60997),
         ('c', 54208),
         ('z', 36106),
         ('a+H', 31660),
         ('i+H', 23952),
         ('F', 20939),
         ('Q+k', 18819),
         ('p', 13239),
         ('Q+p', 10359),
         ('Q+s+y', 4791),
         ('Q+s', 4062),
         ('Q+c+y', 2851),
         ('Q+c', 1155),
         ('Q+d', 276),
         ('Q+g', 125),
         ('Q+F', 116),
         ('Q+h', 84),
         ('Q+z+y', 56),
         ('Q+z', 24),
         ('Q+b', 16)]

    Infrequent phones are removed from the corpus in a subsequent step.
    """

    # phones are vowels and consonants (presumably merge_dicts returns
    # the union of the two mappings -- utils is defined elsewhere)
    phones = utils.merge_dicts(vowels, consonants)

    # no silence symbols are declared for this corpus
    silences = []

    # no phone variants are declared for this corpus
    variants = []
class SPSCSJPreparator(AbstractPreparator):
    """convert the CSJ corpus to the abkhazia format"""
    name = 'sps_csj'
    description = 'Corpus of Spontaneous Japanese'

    long_description = '''
    The Corpus of Spontaneous Japanese (CSJ) is a database of spoken
    Japanese. It contains 658 hours of speech consisting of
    approximately 7.5 million words from more than 1,400 speakers.
    It is publicly available at
    /corpus_center/csj/misc/preliminary/index_e.html
    '''

    url = 'http://www.ninjal.ac.jp/english/products/csj'
    audio_format = 'wav'

    # segment inventory based on
    # https://docs.google.com/spreadsheets/d/1a4ZWvuKfe2wMd_sVOid3KLY7PqKQkPe1uYNa_7zC5Gw/edit?pli=1#gid=0
    # with C+glide considered as two separate phonemes

    vowels = {
        'a': u'ä',
        'e': u'e',
        'i': u'i',
        'o': u'o',
        'u': u'ɯ',
        'a+H': u'ä:',
        'e+H': u'e:',
        'i+H': u'i:',
        'o+H': u'o:',
        'u+H': u'ɯ:'
    }

    consonants = {
        'w': u'w',
        'y': u'j',
        'm': u'm',
        'n': u'n',
        'N': u'ɴ',
        'd': u'd',
        #'Q+d': u'd:',
        't': u't',
        'Q+t': u't:',
        'c': u't͡s',
        'Q+c': u't͡s:',
        'c+y': u't͡ɕ',
        'Q+c+y': u't͡ɕ:',
        's': u's',
        'Q+s': u's:',
        's+y': u'ɕ',
        'Q+s+y': u'ɕ:',
        'z': u'z',  # fricative or affricate
        #'Q+z': u'z:',  # fricative or affricate
        'z+y': u'ʑ',  # fricative or affricate
        #'Q+z+y': u'ʑ:',  # fricative or affricate
        'F': u'ɸ',
        #'Q+F': u'ɸ:',
        'h': u'h',
        #'Q+h': u'h:',
        'g': u'g',
        #'Q+g': u'g:',
        'k': u'k',
        'Q+k': u'k:',
        'p': u'p',
        'Q+p': u'p:',
        'r': u'r',
        'b': u'b'
        #'Q+b': u'b:'
    }
    """
    Frequencies of the various phones in the corpus
    produced by this recipe, before dropping infrequent
    phones:

        [('a', 1244643),
         ('o', 930707),
         ('i', 809173),
         ('e', 624466),
         ('k', 535058),
         ('n', 518087),
         ('u', 499639),
         ('t', 445306),
         ('m', 337715),
         ('r', 326103),
         ('s', 297367),
         ('d', 285754),
         ('N', 276768),
         ('o+H', 189680),
         ('s+y', 186292),
         ('y', 164743),
         ('g', 162616),
         ('Q+t', 108847),
         ('w', 108092),
         ('h', 92601),
         ('u+H', 82833),
         ('e+H', 81238),
         ('z+y', 76611),
         ('b', 72008),
         ('c+y', 60997),
         ('c', 54208),
         ('z', 36106),
         ('a+H', 31660),
         ('i+H', 23952),
         ('F', 20939),
         ('Q+k', 18819),
         ('p', 13239),
         ('Q+p', 10359),
         ('Q+s+y', 4791),
         ('Q+s', 4062),
         ('Q+c+y', 2851),
         ('Q+c', 1155),
         ('Q+d', 276),
         ('Q+g', 125),
         ('Q+F', 116),
         ('Q+h', 84),
         ('Q+z+y', 56),
         ('Q+z', 24),
         ('Q+b', 16)]

    To be able to train reliable phone models, we decided to
    drop from the corpus all utterances involving phones with
    less than 1000 occurrences (see 'remove_infrequent_phones' in
    __init__).
    In terms of frequency of occurrence,
    the most frequent of these phones has frequency 0.0000316
    (around 3 occurrences per 100 000 phones).

    The removed phones are: Q+b, Q+g, Q+d, Q+F, Q+h, Q+z, Q+z+y
    """

    # phones are vowels and consonants
    phones = utils.merge_dicts(vowels, consonants)

    silences = []

    variants = []

    def __init__(self,
                 input_dir,
                 log=utils.logger.null_logger(),
                 copy_wavs=False):
        """Parse the selected talks into utterances and a lexicon.

        Parameters:
            input_dir: corpus root; its 'XML' sub-directory is listed
                and every entry whose name starts with 'S' is parsed.
            log: logger forwarded to the parent class and used for
                progress messages.
            copy_wavs: stored unchanged on the instance.

        On return, `self.all_utts` maps each utterance id to a
        {'words', 'start', 'end'} dict and `self.lexicon` maps each
        word to its list of phones.

        Raises:
            AssertionError: if the same utterance id is produced twice.
        """
        super(SPSCSJPreparator, self).__init__(input_dir, log)
        self.copy_wavs = copy_wavs

        # select laymen talks only; startswith() also tolerates an
        # entry whose name is reduced to '' once '.xml' is removed
        xml_dir = os.path.join(self.input_dir, 'XML')
        stems = [f.replace('.xml', '') for f in os.listdir(xml_dir)]
        self.data_files = [f for f in stems if f.startswith('S')]

        # gather label data TODO parallelize
        self.log.info('parsing {} xml files'.format(len(self.data_files)))
        self.all_utts = {}
        self.lexicon = {}
        total_parsed = 0
        total_utts = 0
        for data in progressbar.ProgressBar()(self.data_files):
            print("xml : {}".format(data))
            utts, nb_parsed_utt, nb_utts = self.parse_xml(
                os.path.join(xml_dir, data + '.xml'))
            total_parsed += nb_parsed_utt
            total_utts += nb_utts

            # we do not use directly the bootphon Japanese phoneset,
            # in particular we remove the + for the following phones:
            # k+y g+y n+y h+y b+y p+y m+y r+y t+y d+y
            # (i.e we consider the glide y as a separate phoneme)
            utts = break_glides_clusters(utts)

            # removing very infrequent phones
            utts, nb_removed = remove_infrequent_phones(utts)
            self.log.info(
                'Removed {} utts with infrequent phones'.format(nb_removed))
            total_parsed -= nb_removed

            utts, utt_lexicon = self.lexicalize(utts)
            for utt_id in utts:
                assert not (utt_id in self.all_utts), utt_id
                self.all_utts[utt_id] = utts[utt_id]
            # the first pronunciation seen for a word is the one kept
            for word in utt_lexicon:
                self.lexicon.setdefault(word, utt_lexicon[word])

        # with no utterance at all (e.g. no 'S' file in the XML
        # directory) the ratio is undefined: report 0 instead of
        # failing on a division by zero
        if total_utts:
            proportion = 100. * total_parsed / float(total_utts)
        else:
            proportion = 0.
        summary = '{:.2f}% of {} utts successfully parsed'.format(
            proportion, total_utts)
        self.log.info(summary)
        print(summary)

    def parse_xml(self, xml_file):
        """Read one talk file and collect the utterances that parse.

        Returns a triple (utts, nb_parsed_utts, nb_utts) where `utts`
        maps '<spk_id>_<TalkID>_<IPUID>' to an Utt made of the words
        returned by parse_ipu, the IPU start and end times and a None
        channel. `nb_utts` counts every IPU of the talk and
        `nb_parsed_utts` the ones for which parse_ipu succeeded.
        """
        talk = ET.ElementTree(file=xml_file).getroot()
        talk_id = talk.attrib["TalkID"]
        speaker = talk.attrib["SpeakerID"]

        # left-pad with zeros so that all speaker ids are 4 characters
        # long; longer ids are rejected
        speaker = speaker.rjust(4, "0")
        assert len(speaker) == 4, talk_id

        # SpeakerSex holds the kanji for 'male' for male speakers
        is_male = talk.attrib["SpeakerSex"] == u"男"
        spk_id = ('M' if is_male else 'F') + speaker

        # Utterance level
        utts = {}
        seen = 0
        kept = 0
        for ipu in talk.iter("IPU"):
            seen += 1
            ipu_id = ipu.attrib["IPUID"]
            utt_id = u"_".join([spk_id, talk_id, ipu_id])
            begin = float(ipu.attrib["IPUStartTime"])
            finish = float(ipu.attrib["IPUEndTime"])
            words, parsed = self.parse_ipu(ipu, ipu_id)
            if not parsed:
                continue
            utts[utt_id] = Utt(words, begin, finish, None)
            kept += 1
        return utts, kept, seen

    def parse_ipu(self, ipu, ipu_id):
        """Convert one IPU element into a list of words.

        Long Word Units (LUW) are taken as 'words': the
        'PhoneticTranscription' attributes of the SUW found in each
        LUW are concatenated, the LUW are joined with the '#WB#' word
        boundary tag, the result goes through
        untagCSJphoneticTranscript and is finally converted by
        kana2phones.

        Parameters:
            ipu: the IPU element to parse.
            ipu_id: id of the IPU, not used by the parsing itself
                (kept for debugging and for the callers).

        Returns:
            A pair (words, parse_successful). The parse fails when
            - a LUW contains a Noise element: words is [],
            - untagCSJphoneticTranscript returns None: words is [],
            - kana2phones returns None: words is None,
            - one of the words is empty: words is the kana2phones
              result.
        """
        ipu_kanas = []
        for luw in ipu.iter("LUW"):
            # we drop any ipu where some noise occurs
            if any(True for _ in luw.iter("Noise")):
                return [], False
            ipu_kanas.append(u"".join(
                suw.attrib["PhoneticTranscription"]
                for suw in luw.iter("SUW")))

        # join the LUW with the word boundary tag, then appropriately
        # deal with potential CSJ tags
        transcript = untagCSJphoneticTranscript(u"#WB#".join(ipu_kanas))
        if transcript is None:
            return [], False

        # parse into phones
        words = kana2phones(transcript)
        if words is None:
            return None, False
        # an empty LUW invalidates the whole IPU
        if not all(len(w) for w in words):
            return words, False
        return words, True

    def lexicalize(self, utts):
        """Replace phone lists by word labels and build the lexicon.

        Parameters:
            utts: dict mapping utterance ids to objects exposing
                `words` (a sequence of phone lists), `start` and `end`.

        Returns:
            A pair (new_utts, lexicon). `new_utts` maps each utterance
            id to {'words': [...], 'start': ..., 'end': ...} where
            each word is the '-'-joined phones of the original word.
            `lexicon` maps each such word to the phone list it was
            first built from. Utterances without any word are left
            out of `new_utts`.

        Raises:
            AssertionError: if a word is empty or is exactly ['H'].
        """
        lexicon = {}
        new_utts = {}
        for utt_id, utt in utts.items():
            words = []
            for phones in utt.words:
                assert len(phones) > 0
                assert phones != ['H']
                word = u"-".join(phones)
                lexicon.setdefault(word, phones)
                words.append(word)
            # store the utterance once, after all its words were
            # collected; utterances with no word are skipped
            if words:
                new_utts[utt_id] = {
                    'words': words,
                    'start': utt.start,
                    'end': utt.end
                }
        return new_utts, lexicon

    def list_audio_files(self):
        """Return the wav path of every selected talk.

        Each path is '<input_dir>/Waveforms/<talk>.wav', in the order
        of `self.data_files`.
        """
        wavs = []
        for talk in self.data_files:
            wav_name = talk + '.wav'
            wavs.append(os.path.join(self.input_dir, 'Waveforms', wav_name))
        return wavs

    def make_segment(self):
        """Map each utterance id to a (wavefile, start, stop) triple.

        The wavefile is the second '_'-separated field of the
        utterance id; start and stop are converted to float.
        """
        return {
            utt_id: (utt_id.split("_")[1],
                     float(utt['start']),
                     float(utt['end']))
            for utt_id, utt in self.all_utts.items()
        }

    def make_speaker(self):
        """Map each utterance id to its speaker id.

        The speaker id is the first '_'-separated field of the
        utterance id.
        """
        return {utt_id: utt_id.split("_")[0] for utt_id in self.all_utts}

    def make_transcription(self):
        """Map each utterance id to its words joined by single spaces."""
        return {
            utt_id: u" ".join(utt['words'])
            for utt_id, utt in self.all_utts.items()
        }

    def make_lexicon(self):
        """Map each word of the lexicon to its space-separated phones."""
        pronunciations = {}
        for word, phones in self.lexicon.items():
            pronunciations[word] = ' '.join(phones)
        return pronunciations
# Example #3
# 0
class CSJPreparator(AbstractPreparator):
    """convert the CSJ corpus to the abkhazia format"""
    name = 'csj'
    description = 'Corpus of Spontaneous Japanese'

    long_description = '''
    The Corpus of Spontaneous Japanese (CSJ) is a database of spoken
    Japanese. It contains 658 hours of speech consisting of
    approximately 7.5 million words from more than 1,400 speakers.
    It is publicly available at
    /corpus_center/csj/misc/preliminary/index_e.html
    '''

    url = 'http://www.ninjal.ac.jp/english/products/csj'
    audio_format = 'wav'
    vowels = {
        'a': u'ä',
        'e': u'e',
        'i': u'i',
        'o': u'o',
        'u': u'ɯ',  # this one has lip-compression
        'a+H': u'ä+H',
        'e+H': u'e+H',
        'i+H': u'i+H',
        'o+H': u'o+H',
        'u+H': u'ɯ+H',
        'yu+H': u'yu+H'
    }

    # geminates: look at the effectives
    consonants = {
        'F': u'ɸ',  # not sure about this one
        'F:': u'ɸ:',  # not sure about this one
        'Q+F': u'Q+F',
        'N': u'ɴ',  # maybe we should transcribe the N like the Q
        # based on following consonant?
        'Q': u'ʔ',
        'b': u'b',
        'b:': u'b:',  # is this really a geminate with a voiced stop ?
        'd': u'd',
        'd:': u'd:',  # is this really a geminate with a voiced stop ?
        'Q+d': u'Q+d',
        'g': u'g',
        'g:': u'g:',  # is this really a geminate with a voiced stop ?
        'Q+g': u'Q+g',
        # look at difference between aspiration and gemination: gemination
        # is supposed to affect the duration of closure and aspiration the
        # VOT. This explains that gemination cannot occur at the beginning
        # of an utterance no way to determine the duration of closure
        'h': u'h',
        'h:': u'h:',  # TODO ASK THOMAS IF I SHOULD PUT IT ?
        'Q+h': u'Q+h',
        'k': u'k',
        'k:': u'k:',
        'Q+k': u'Q+k',
        'm': u'm',
        'n': u'n',
        'p': u'p',
        'p:': u'p:',
        'Q+p': u'Q+p',
        # TODO ASK THOMAS AND XUAN-NGA !!
        'Q+b': u'Q+b',
        'Q+p+y': u'Q+p+y',
        'Q+c+y': u'Q+c+y',
        'd+y': u'd+y',
        'g+y': u'g+y',
        'c+y': u'c+y',
        'h+y': u'h+y',
        'p+y': u'p+y',
        'r+y': u'r+y',
        'm+y': u'm+y',
        'n+y': u'n+y',
        'b+y': u'b+y',
        'c': u'c',
        'c:': u'c:',
        'Q+c': u'Q+c',
        'k+y': u'k+y',
        'Q+k+y': u'Q+k+y',
        'my': u'my',
        # TODO ASK THOMAS AND XUAN-NGA
        'r': u'r',
        's': u's',
        's:': u's:',
        'Q+s': u'Q+s',
        's+y': u'ɕ',
        'Q+s+y': u'Q+s+y',
        't': u't',
        't:': u't:',
        'Q+t': u'Q+t',
        'w': u'w',  # lip-compression here too...
        'y': u'j',
        'z': u'z',
        'z:': u'z:',  # TODO ASK THOMAS IF IS SHOULD PUT IT ?
        'Q+z': u'Q+z',
        'z+y': u'ʑ',  # very commonly an affricate...
        'Q+z+y': u'Q+z+y'
    }

    # XML with bad transcription:
    xml_pb = 'S05M1406.xml'

    # phones are vowels and consonants
    phones = utils.merge_dicts(vowels, consonants)

    silences = ['SPN', 'NSN']

    variants = []

    def __init__(self,
                 input_dir,
                 log=utils.logger.null_logger(),
                 copy_wavs=False,
                 clusters=False,
                 treat_core=False):
        """Parse the selected talks into utterances and a lexicon.

        Parameters:
            input_dir: corpus root; its 'XML' sub-directory is listed
                and only entries whose name starts with 'S' are kept.
            log: logger forwarded to the parent class and used for
                progress messages.
            copy_wavs: stored unchanged on the instance.
            clusters: forwarded to parse_non_core_xml and to
                extract_basic_transcript.
            treat_core: when true, only the talks listed in
                CSJ_core.txt are parsed, with parse_core_xml;
                otherwise every selected talk is parsed with
                parse_non_core_xml.

        On return, `self.all_utts` maps utterance ids to transcript
        dicts and `self.lexicon` maps words to phoneme lists (the
        word '-' is never stored).

        Raises:
            OSError: if CSJ_core.txt is not found in the abkhazia
                installation.
            AssertionError: if the same utterance id is produced twice.
        """
        super(CSJPreparator, self).__init__(input_dir, log)
        self.copy_wavs = copy_wavs

        # load the core_CSJ.txt from the abkhazia installation path
        core = resource_filename(Requirement.parse('abkhazia'),
                                 'abkhazia/share/CSJ_core.txt')
        if not os.path.exists(core):
            raise OSError('core_CSJ not found in {}'.format(core))
        # close the file once read, and only strip the line terminator
        # so that a last line without newline is not truncated
        with open(core, 'r') as fin:
            core_files = set(line.rstrip('\n') for line in fin)

        # select laymen talks only, split between the core part of the
        # corpus and the rest
        xml_dir = os.path.join(self.input_dir, 'XML')
        stems = [f.replace('.xml', '') for f in os.listdir(xml_dir)]
        talks = [f for f in stems if f.startswith('S')]
        self.non_core_files = [f for f in talks if f not in core_files]
        self.data_core_files = [f for f in talks if f in core_files]
        self.data_files = self.data_core_files if treat_core else talks

        self.kana_to_phone = self.parse_kana_to_phone(
            resource_filename(Requirement.parse('abkhazia'),
                              'abkhazia/share/kana-to-phone_bootphon_CSJ.txt'))

        # gather label data TODO parallelize
        self.log.info('parsing {} xml files'.format(len(self.data_files)))
        self.all_utts = {}
        self.lexicon = {}

        for data in progressbar.ProgressBar()(self.data_files):
            # single parenthesised argument: prints the same text with
            # the print statement and with the print function
            print("xml : " + data)
            xml_file = os.path.join(xml_dir, data + '.xml')
            if treat_core:
                utts = self.parse_core_xml(xml_file)
            else:
                utts = self.parse_non_core_xml(xml_file, clusters)

            # NOTE(review): `clusters` is deliberately kept positional
            # here, exactly as before; check the parameter order of
            # extract_basic_transcript before switching to a keyword.
            utts, utt_lexicon = self.extract_basic_transcript(utts, clusters)
            for utt_id in utts:
                assert not (utt_id in self.all_utts), utt_id
                self.all_utts[utt_id] = utts[utt_id]

            # the first pronunciation seen for a word is the one kept
            for word in utt_lexicon:
                if word == '-':
                    continue
                self.lexicon.setdefault(word, utt_lexicon[word])

    def parse_kana_to_phone(self, kana_csv):
        """Parse the katakana to phone table and put it in a dict.

        The file is read as tab-separated lines. The first line is
        skipped (presumably a header -- confirm against the shipped
        table) as well as empty lines. For every other line, the
        first column (katakana, decoded from utf8) is mapped to the
        fourth column (bootphon transcription); an empty fourth
        column is stored as "H".

        Parameters:
            kana_csv: path to the table.

        Returns:
            dict mapping katakana to bootphon transcription.
        """
        kana_to_phon = dict()
        with open_utf8(kana_csv, 'r') as fin:
            lines = fin.read().split('\n')

        for line in lines[1:]:
            if line == '':
                continue
            columns = line.split('\t')
            katakana = columns[0].decode('utf8')
            bootphon = columns[3]
            if bootphon == '':
                # this used to be the comparison `bootphon == "H"`,
                # which left the empty transcription in place
                bootphon = "H"
            kana_to_phon[katakana] = bootphon
        return kana_to_phon

    def parse_core_xml(self, xml_file):
        """Parse the time-aligned transcript of one talk.

        Every IPU element becomes one Utt, keyed by
        '<spk_id>_<TalkID>_<IPUID>'. Short Word Units (SUW) are taken
        as 'words': each SUW with at least one non-empty Phoneme
        becomes a Word made of Phoneme objects, themselves made of
        the Phone elements they contain. Word and Phoneme boundaries
        are the start of their first item and the end of their last.

        Returns the dict of Utt(words, start, stop, channel); the
        channel is only read for talks whose id starts with "D".
        """
        talk = ET.ElementTree(file=xml_file).getroot()
        talk_id = talk.attrib["TalkID"]
        speaker = talk.attrib["SpeakerID"]

        # left-pad with zeros so that all speaker ids are 4 characters
        # long; longer ids are rejected
        speaker = speaker.rjust(4, "0")
        assert len(speaker) == 4, talk_id

        # SpeakerSex holds the kanji for 'male' for male speakers
        is_male = talk.attrib["SpeakerSex"] == u"男"
        spk_id = ('M' if is_male else 'F') + speaker

        is_dialog = talk_id[0] == "D"

        # Utterance level
        utts = {}
        for ipu in talk.iter("IPU"):
            utt_id = u"_".join([spk_id, talk_id, ipu.attrib["IPUID"]])
            channel = ipu.attrib["Channel"] if is_dialog else None
            utt_start = float(ipu.attrib["IPUStartTime"])
            utt_stop = float(ipu.attrib["IPUEndTime"])

            # Word level
            words = []
            for suw in ipu.iter("SUW"):
                # Phoneme level
                phonemes = []
                for phoneme in suw.iter("Phoneme"):
                    phoneme_id = phoneme.attrib["PhonemeEntity"]
                    # Phone level (detailed phonetic)
                    phones = []
                    for phone in phoneme.iter("Phone"):
                        attrs = phone.attrib
                        begin = float(attrs["PhoneStartTime"])
                        finish = float(attrs["PhoneEndTime"])
                        phones.append(
                            Phone(attrs["PhoneEntity"], attrs["PhoneClass"],
                                  begin, finish))
                    if phones:
                        phonemes.append(
                            Phoneme(phoneme_id, phones, phones[0].start,
                                    phones[-1].end))

                if phonemes:
                    words.append(
                        Word(phonemes, phonemes[0].start, phonemes[-1].end))
                    continue

                # SUW without any phoneme: the moras are collected but
                # nothing is done with them, and any error is ignored
                try:
                    moras = [
                        mora.attrib["MoraEntity"]
                        for mora in suw.iter("Mora")
                    ]
                except:
                    pass
                # FIXME understand this
                # assert u"φ" in moras, utt_id
            utts[utt_id] = Utt(words, utt_start, utt_stop, channel)
        return utts

    def parse_non_core_xml(self, xml_file, keep_clusters):
        """Parse the raw transcript of one talk from its kana transcription.

        Every IPU element becomes one Utt, keyed by
        '<spk_id>_<TalkID>_<IPUID>'. Each LUW element yields one word:
        the 'PhoneticTranscription' attribute of each of its SUW is
        stripped from tags and unwanted characters, then mapped symbol
        by symbol (trying pairs of symbols first) through
        self.kana_to_phone. The Phone objects created here carry no
        timing (start and end are None).

        Parameters:
            xml_file: path to the XML file of the talk.
            keep_clusters: tells how a mapped id containing '+' and
                not ending with 'H' is split in two phonemes. When
                false, the first one is what precedes the '+' followed
                by the character right after it, and the second one is
                the character two positions after the '+'. When true,
                the first one is the first three characters and the
                second one is the fourth character; without a fourth
                character the id is truncated to three characters.

        Returns:
            dict mapping utterance ids to Utt(words, start, stop,
            channel); the channel is only read for talks whose id
            starts with "D".

        NOTE: a symbol without mapping (other than "?") is printed and
        the method then waits on raw_input() before going on.
        """
        tree = ET.ElementTree(file=xml_file)
        talk = tree.getroot()
        talk_id = talk.attrib["TalkID"]
        speaker = talk.attrib["SpeakerID"]
        # make sure all speaker-ids have same length (left-padded with
        # zeros up to 4 characters; longer ids are rejected)
        if len(speaker) < 4:
            speaker = "0" * (4 - len(speaker)) + speaker
        else:
            assert len(speaker) == 4, talk_id

        # using kanji for 'male'
        gender = 'M' if talk.attrib["SpeakerSex"] == u"男" else 'F'
        spk_id = gender + speaker

        if talk_id[0] == "D":
            is_dialog = True
        else:
            is_dialog = False

        # Utterance level
        utts = {}
        for ipu in talk.iter("IPU"):
            utt_id = spk_id + u"_" + talk_id + u"_" + ipu.attrib["IPUID"]
            channel = ipu.attrib["Channel"] if is_dialog else None
            utt_start = float(ipu.attrib["IPUStartTime"])
            utt_stop = float(ipu.attrib["IPUEndTime"])
            # Word level - Long Words Units (LUW) are taken as 'words'
            words = []
            for luw in ipu.iter("LUW"):
                # phonemes of all the SUW of this LUW are accumulated
                # in the same list
                phonemes = []
                for suw in luw.iter("SUW"):
                    # Phoneme level
                    #phonemes = []
                    phones = suw.attrib["PhoneticTranscription"]
                    # NOTE: the result of encode() is discarded
                    phones.encode('utf8')
                    # untouched copy, only used in the message printed
                    # for unmapped symbols
                    whole_phone = phones

                    # in the file named by self.xml_pb, a transcription
                    # starts a word with H : replace by "?"
                    xml_name = xml_file.split('/')[-1]
                    if xml_name == self.xml_pb and phones == 'ーノ':
                        phones = '?'
                    # TODO check why causes problem ? supposed to be N H
                    if 'ンー' in phones:
                        phones = '?'

                    if "W" in phones or "?" in phones or "B" in phones or "O" in phones:
                        # if W, ?, B or O appears, we replace the whole SUW
                        # by a single SPN (spoken noise) phoneme, since we
                        # can't be sure what is the good transcription.
                        phone = []
                        phone.append(Phone("SPN", '', None, None))
                        phonemes.append(
                            Phoneme("SPN", phone, phone[0].start,
                                    phone[-1].end))
                        continue

                    # P indicates a pause: the P and the 20 characters
                    # following it are removed from the transcription
                    # TODO : replace P by SIL ?
                    while "P" in phones:
                        ind = phones.index('P')
                        phones = phones[0:ind] + phones[ind + 21:]

                    # Remove the transcription tags and other
                    # unwanted characters (e.g. ',' '-' etc..)
                    word_tags = [
                        'A', 'B', 'M', 'I', 'S', 'J', 'C', 'L', 'R', 'G', 'F',
                        'D', 'H', 'Q', 'R', 'O', 'V', 'W', '息', '笑', '咳', '泣'
                    ]
                    symbols = [
                        ',', '-', '>', '<', '(', ')', ' ', '×', '.', ':'
                    ]
                    additional_tags = [
                        '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'
                    ]
                    unwanted = word_tags + symbols + additional_tags

                    for tag in unwanted:
                        if tag in phones:
                            phones = phones.replace(tag, '')

                    # use mapping of every symbol in transcription: each
                    # iteration consumes one or two symbols from the
                    # beginning of `phones`
                    while len(phones) > 0:
                        phone = []
                        phoneme_id1 = None
                        phoneme_id2 = None
                        # First check if first two symbols are together
                        if len(phones[0:2]
                               ) > 1 and phones[0:2] in self.kana_to_phone:
                            phoneme_id = self.kana_to_phone[phones[0:2]]
                            phones = phones[2:]

                        elif phones[0] in self.kana_to_phone:
                            # Else check if the symbol is in kana to phon
                            phoneme_id = self.kana_to_phone[phones[0]]
                            phones = phones[1:]

                        else:
                            # If not, let it pass, it will be treated later on
                            phoneme_id = phones[0]
                            if phoneme_id == "?":
                                # a "?" is silently skipped
                                phones = phones[1:]
                                continue
                            # any other unmapped symbol is reported and
                            # the program waits for the user
                            print "Phone seems to have no mapping, check :", phoneme_id, ' in context :', whole_phone
                            raw_input()
                            phones = phones[1:]

                        if (('+' in phoneme_id) and (not keep_clusters)
                                and not (phoneme_id[-1] == 'H')):
                            # handle the x+x x case: the '+' is dropped
                            # from the first phoneme and the character
                            # two positions after the '+' is the second
                            plus_ind = phoneme_id.index('+')
                            phoneme_id2 = phoneme_id[plus_ind + 2]
                            phoneme_id1 = phoneme_id[0:plus_ind] + phoneme_id[
                                plus_ind + 1]

                        elif (('+' in phoneme_id) and (keep_clusters)
                              and not (phoneme_id[-1] == 'H')):
                            # if keep_clusters is enabled, keep the x+x
                            # (e.g. c+y) as is; when there is no fourth
                            # character the id is cut to 3 characters
                            try:
                                phoneme_id2 = phoneme_id[3]
                                phoneme_id1 = phoneme_id[0:3]
                            except:
                                phoneme_id = phoneme_id[0:3]
                                pass

                        if phoneme_id == "Q":
                            # If Q is at the end, remove it !
                            if len(phones) == 0:
                                phoneme_id = None
                                continue

                        if phoneme_id == 'Nfiller':
                            # TODO CHECK IF Nfiller is N
                            phoneme_id = 'N'

                        if phoneme_id2:
                            # two phonemes for this symbol; note that
                            # both share the same `phone` list, which
                            # holds the two Phone objects in the end
                            phone.append(Phone(phoneme_id1, '', None, None))
                            phonemes.append(
                                Phoneme(phoneme_id1, phone, phone[0].start,
                                        phone[-1].end))

                            phone.append(Phone(phoneme_id2, '', None, None))
                            phonemes.append(
                                Phoneme(phoneme_id2, phone, phone[0].start,
                                        phone[-1].end))
                            continue
                        else:
                            if '+' not in phoneme_id:
                                # one phoneme per character of the id
                                # (the `phone` list is shared here too)
                                for char in phoneme_id:
                                    phone.append(Phone(char, '', None, None))
                                    phonemes.append(
                                        Phoneme(char, phone, phone[0].start,
                                                phone[-1].end))
                            else:
                                # ids still containing a '+' are kept
                                # as a single phoneme
                                phone.append(Phone(phoneme_id, '', None, None))
                                phonemes.append(
                                    Phoneme(phoneme_id, phone, phone[0].start,
                                            phone[-1].end))

                if phonemes:
                    words.append(
                        Word(phonemes, phonemes[0].start, phonemes[-1].end))
            else:
                # NOTE: this `else` belongs to the `for luw` loop, which
                # contains no `break`, so it runs once the loop is over.
                # The moras are collected but not used, and any error
                # (including `suw` being undefined) is ignored.
                try:
                    moras = [
                        mora.attrib["MoraEntity"] for mora in suw.iter("Mora")
                    ]
                    # self.log.debug(moras)
                except:
                    pass
                # self.log.debug(utt_id)
                # FIXME understand this
                # assert u"φ" in moras, utt_id
            utts[utt_id] = Utt(words, utt_start, utt_stop, channel)
        return utts

    def check_transcript_consistency(self, utts):
        """Placeholder: no consistency check is implemented yet."""
        return None

    # TODO check consistency of starts, stops, subsequent starts at all levels
    # and the across level consistency

    def extract_basic_transcript(self, utts, encoding=None, clusters=False):
        """Turn parsed utterances into word labels and build a lexicon.

        For every utterance with at least one word, the phoneme ids
        of each word go through self.reencode; a word reencoded as
        exactly ['H'] is dropped, any other is replaced by its
        '-'-joined phonemes. The utterance boundaries are the ones of
        the utterance itself, not the ones of its words.

        Returns a pair (new_utts, lexicon): `new_utts` maps utterance
        ids to {'words', 'start', 'end'} dicts (utterances without
        words are left out) and `lexicon` maps each word label to the
        first phoneme list it was built from.
        """
        lexicon = {}
        new_utts = {}
        for utt_id, utt in utts.items():
            if not utt.words:
                continue

            labels = []
            for unit in utt.words:
                # use phonemic level
                # NOTE(review): `clusters` then `encoding` are passed
                # positionally, in this order, as before -- check the
                # parameter order of reencode before changing it.
                ids = [phoneme.id for phoneme in unit.phonemes]
                phonemes = self.reencode(ids, clusters, encoding)

                # just drop these for now  TODO log this
                if phonemes == ['H']:
                    continue

                label = u"-".join(phonemes)
                if label not in lexicon:
                    lexicon[label] = phonemes
                labels.append(label)

            new_utts[utt_id] = {
                'words': labels,
                'start': utt.start,
                'end': utt.end
            }
        return new_utts, lexicon

    def reencode(self, phonemes, encoding=None, clusters=False):
        """Rewrite the phoneme sequence of a word into the target phone set.

        The sequence goes through five successive passes:

        1. the rare phones 'kw', 'v' and 'Fy' and the labels 'FV' and
           'VN' are all rewritten as 'SPN' (spoken noise); '?' is
           rewritten as 'NSN' (generic noise),
        2. unless `clusters` is true, each cluster phone is split into
           two segments (e.g. 'ky' -> 'k', 'y' and 'c' -> 't', 's'),
        3. allophonic variants are merged (e.g. 'sj' -> 'sy', 'kj' -> 'k'),
        4. a 'Q' directly followed by an obstruent is fused with it into
           a geminate 'Q+<obstruent>'; any other 'Q' is kept as is,
        5. an 'H' directly following a vowel is fused with it into a
           long vowel '<vowel>+H'; an 'H' following 'N' or 'SPN' is
           dropped; an 'H' opening a sequence of at least two phones is
           dropped as well.

        Args:
            phonemes: iterable of phoneme strings.
            encoding: unused, kept so that the signature is unchanged.
            clusters: when true, cluster phones are kept as single
                symbols instead of being split (pass 2 is skipped).

        Returns:
            The list of rewritten phonemes (empty for an empty input).

        Raises:
            AssertionError: on two successive 'Q', on two successive
                'H', or on an 'H' that follows neither a vowel, 'N'
                nor 'SPN'.
        """
        vowels = ('a', 'e', 'i', 'o', 'u')
        stops = [
            't', 'ty', 'b', 'by', 'g', 'gj', 'gy', 'k', 'ky', 'kj', 'p', 'py',
            'd', 'dy'
        ]
        affricates = ['z', 'zy', 'zj', 'c', 'cy', 'cj']
        fricatives = ['s', 'sj', 'sy', 'z', 'zy', 'zj', 'h', 'F', 'hy', 'hj']
        # only used for membership tests, hence a set
        obstruents = set(affricates + fricatives + stops)

        # first and second segment of each cluster phone
        seg_1 = {
            'ky': 'k',
            'ty': 't',
            'ry': 'r',
            'cy': 't',
            'cj': 't',
            'c': 't',
            'py': 'p',
            'ny': 'n',
            'by': 'b',
            'my': 'm',
            'hy': 'h',
            'gy': 'g',
            'dy': 'd'
        }
        seg_2 = {
            'ky': 'y',
            'ty': 'y',
            'ry': 'y',
            'cy': 'sy',
            'cj': 'sj',
            'c': 's',
            'py': 'y',
            'ny': 'y',
            'by': 'y',
            'my': 'y',
            'hy': 'y',
            'gy': 'y',
            'dy': 'y'
        }
        # allophonic variants and the phone they are merged into
        mapping = {
            'zj': 'zy',
            'cj': 'cy',
            'sj': 'sy',
            'nj': 'n',
            'kj': 'k',
            'hj': 'h',
            'gj': 'g'
        }

        phonemes_1 = []
        for phoneme in phonemes:
            # 1 - Noise and rare phonemes
            out_phn = phoneme
            # getting rid of very rare phones as vocal noise
            if out_phn in ('kw', 'v', 'Fy'):
                out_phn = 'VN'
            # rewriting FV and VN (fricative voicing and vocal noise) as
            # SPN (spoken noise)
            if out_phn in ('FV', 'VN'):
                out_phn = 'SPN'
            # rewriting ? as NSN (generic noise)
            if out_phn == '?':
                out_phn = 'NSN'

            # 2 - breaking clusters
            if out_phn in seg_1 and not clusters:
                out_phns = [seg_1[out_phn], seg_2[out_phn]]
            else:
                out_phns = [out_phn]

            # 3 - group allophonic variants according to phonetics
            phonemes_1.extend(mapping.get(phn, phn) for phn in out_phns)

        # Passes 4 and 5 need the complete sequence, so they run once,
        # after the loop. This also makes an empty input return an empty
        # list instead of failing on an unbound result.

        # 4 - Q before obstruent as geminate (long obstruent)
        if len(phonemes_1) <= 1:
            phonemes_2 = phonemes_1
        else:
            phonemes_2 = []
            previous = phonemes_1[0]
            for out_phn in phonemes_1[1:]:
                if previous == 'Q':
                    assert out_phn != 'Q', "Two successive 'Q' in phoneme sequence"
                    if out_phn in obstruents:
                        previous = 'Q+' + out_phn
                    else:
                        # Q considered a glottal stop in other contexts
                        phonemes_2.append('Q')
                        previous = out_phn
                else:
                    phonemes_2.append(previous)
                    previous = out_phn
            phonemes_2.append(previous)  # don't forget last item

        # 5 - H after vowel as long vowel
        if len(phonemes_2) > 1 and phonemes_2[0] == 'H':
            # Only the leading H is erased: the scan below restarts from
            # the phone that follows it, so that phone is not lost.
            print("Word starts with H : erasing H")
            phonemes_2 = phonemes_2[1:]
        if len(phonemes_2) <= 1:
            return phonemes_2

        phonemes_3 = []
        previous = phonemes_2[0]
        for out_phn in phonemes_2[1:]:
            if out_phn == 'H':
                assert previous != 'H', "Two successive 'H' in phoneme sequence"
                if previous in vowels:
                    phonemes_3.append(previous + '+H')
                else:
                    assert previous == 'N' or previous == 'SPN', "H found after neither N nor vowel"
                    phonemes_3.append(previous)  # drop H after N
                # 'H' is a marker meaning "previous phone already emitted"
                previous = 'H'
            else:
                if previous != 'H':
                    phonemes_3.append(previous)
                previous = out_phn
        if previous != 'H':
            phonemes_3.append(previous)  # don't forget last item
        return phonemes_3

    def list_audio_files(self):
        """Return the path of the wav file of each entry in `self.data_files`.

        Each path is `<self.input_dir>/Waveforms/<entry>.wav`; the order of
        `self.data_files` is preserved.
        """
        wav_paths = []
        for basename in self.data_files:
            wav_paths.append(
                os.path.join(self.input_dir, 'Waveforms', basename + '.wav'))
        return wav_paths

    def make_segment(self):
        """Map each utterance id to a `(wavefile, start, end)` tuple.

        The wavefile is the second '_'-separated field of the utterance
        id; start and end are the 'start' and 'end' entries of the
        utterance in `self.all_utts`, converted to float.
        """
        utts = self.all_utts
        segments = {}
        for utt_id in utts:
            wav_name = utt_id.split("_")[1]
            entry = utts[utt_id]
            begin, finish = entry['start'], entry['end']
            segments[utt_id] = (wav_name, float(begin), float(finish))
        return segments

    def make_speaker(self):
        """Map each utterance id to its speaker id.

        The speaker id is the part of the utterance id that precedes the
        first '_'.
        """
        return {utt_id: utt_id.split("_")[0] for utt_id in self.all_utts}

    def make_transcription(self):
        """Map each utterance id to its words joined by single spaces."""
        utts = self.all_utts
        return {
            utt_id: u" ".join(utts[utt_id]['words'])
            for utt_id in utts
        }

    def make_lexicon(self):
        """Return the lexicon with each pronunciation joined into a string.

        `self.lexicon` maps a word to its sequence of phoneme strings; the
        result maps the same word to those phonemes separated by single
        spaces.
        """
        # items() rather than iteritems(): the latter only exists on
        # Python 2 dicts, whereas items() is available on Python 2 and 3.
        return {
            word: ' '.join(phones)
            for word, phones in self.lexicon.items()
        }