class JapaneseMiiProPOSMapper:
    """Map Japanese MiiPro POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Look up `pos` in the requested tag set.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')
class QaqetGlossMapper:
    """Map Qaqet glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/qaqet/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Strip morpheme delimiters, then infer the standard gloss."""
        cleaned = ToolboxMorphemeCleaner.remove_morpheme_delimiters(gloss)
        return cls.infer_gloss(cleaned)

    @classmethod
    def infer_gloss(cls, gloss):
        """Map every dot-separated atomic gloss.

        Atoms missing from the mapping become '???'. The result is ''
        when the input is empty or when no atom at all could be mapped
        (i.e. every atom came out as '???').

        Returns:
            str: The dot-joined mapped gloss or ''.
        """
        if not gloss:
            return ''
        mapped = [cls.gloss_dict.get(atom, '???')
                  for atom in gloss.split('.')]
        if all(atom == '???' for atom in mapped):
            return ''
        return '.'.join(mapped)
def __init__(self, path):
    """Parse an IMDI metadata file into `self.metadata`.

    Falls back on a bundled dummy IMDI file when `path` does not exist,
    so parsing can proceed (with empty metadata) instead of crashing.

    Args:
        path (str): Path to the IMDI file.
    """
    # check if IMDI file exists
    if os.path.isfile(path):
        self.path = path
    else:
        # load dummy file
        self.path = get_full_path('parsers/metadata/resources/dummy.imdi')
        print(f'ERROR: IMDI {path} missing!!!')
        print('Loading dummy file...')
    self.tree = objectify.parse(self.path)
    self.root = self.tree.getroot()
    self.metadata = {
        '__attrs__': self.parse_attrs(self.root),
    }
    # Cname = file basename with the .xml/.imdi suffix (and anything
    # following it) stripped; presumably used as the session name.
    self.metadata['__attrs__']['Cname'] = re.sub(
        r'\.xml.*|\.imdi.*', "", os.path.basename(str(self.path)))
    # Special case for Indonesian
    # Explanation: this converts the session ID to the same format as in the body files
    # Unless that issue was fixed in another way we will probably still want it
    # TODO: figure out what's going wrong with Indonesian
    """
class InuktitutGlossMapper:
    """Map Inuktitut glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/inuktitut/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Clean `gloss` and look it up; return '' when unknown."""
        return cls.gloss_dict.get(cls.clean_gloss(gloss), '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Replace the stem and grammatical gloss connector."""
        return cls.replace_stem_gram_gloss_connector(gloss)

    @staticmethod
    def replace_stem_gram_gloss_connector(gloss):
        """Replace the stem and grammatical gloss connector.

        A stem gloss is connected with a grammatical gloss by an
        ampersand; the connector is replaced by a dot.

        Args:
            gloss (str): The gloss.

        Returns:
            str: The gloss with '&' replaced by '.'.
        """
        return gloss.replace('&', '.')
def _parse():
    """Return durations from session_durations.csv.

    Returns:
        Dict[str, Dict[str, str]]: The duration indexed by corpus and
            source_id.
    """
    full_path = get_full_path('util/resources/session_durations.csv')
    durations = {}
    with open(full_path, 'r', encoding='utf8') as f:
        for row in csv.DictReader(f):
            # setdefault replaces the explicit per-row membership test
            corpus_durations = durations.setdefault(row['corpus'], {})
            corpus_durations[row['source_id']] = row['duration']
    return durations
class RussianGlossMapper:
    """Map Russian glosses onto standard gloss labels."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/russian/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the standard label for `gloss`, '' when unmapped."""
        try:
            return cls.gloss_dict[gloss]
        except KeyError:
            return ''
class TurkishGloss2SegmentMapper:
    """Map Turkish glosses onto their segments."""

    gloss2seg = parse_csv(get_full_path(
        'parsers/corpora/main/turkish/resources/gloss2segment.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the segment for `gloss`, '' when unmapped."""
        segment = cls.gloss2seg.get(gloss, '')
        return segment
class JapaneseMiyataGlossMapper:
    """Map Japanese Miyata glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path(
            'parsers/corpora/main/japanese_miyata/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the standard label for `gloss`, '' when unmapped."""
        try:
            return cls.gloss_dict[gloss]
        except KeyError:
            return ''
class ChintangGlossMapper:
    """Map Chintang glosses onto standard gloss labels."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/chintang/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Strip morpheme delimiters, then look the gloss up.

        Returns:
            str: The standard gloss label, '' when unmapped.
        """
        delimiter_free = ToolboxMorphemeCleaner.remove_morpheme_delimiters(
            gloss)
        return cls.gloss_dict.get(delimiter_free, '')
class RussianSessionParser(ToolboxParser):
    """Session parser for the Russian corpus."""

    role_mapper = RoleMapper(get_full_path(
        'parsers/corpora/main/russian/resources/speaker_label2macro_role.csv'
    ))

    def get_record_reader(self):
        """Return the Russian-specific Toolbox record reader."""
        return RussianReader()

    def get_metadata_reader(self):
        """Return the IMDI metadata parser for this session."""
        return IMDIParser(self.metadata_path)

    def get_cleaner(self):
        """Return the Russian-specific cleaner."""
        return RussianCleaner()

    def parse(self):
        """Parse the session, then post-clean the target children."""
        session = super().parse()
        tc_cleaner.clean(session)
        return session

    def add_speakers(self):
        """Build `Speaker` objects from the IMDI participant metadata."""
        for speaker_dict in self.metadata_reader.metadata['participants']:
            speaker = Speaker()
            speaker.birth_date = ICl.clean_date(
                speaker_dict.get('birthdate', ''))
            speaker.code = ICl.clean_label(speaker_dict.get('code', ''))
            speaker.name = ICl.clean_name(speaker_dict.get('name', ''))
            speaker.languages_spoken = speaker_dict.get('language', '')
            speaker.age_raw = speaker_dict.get('age', '')
            ToolboxAgeUpdater.update(speaker, self.session.date)
            # Fix: role_raw was previously assigned twice with the same
            # value; the redundant first assignment has been removed.
            speaker.role_raw = speaker_dict.get('familysocialrole', '')
            speaker.role = self.role_mapper.role_raw2role(speaker.role_raw)
            speaker.macro_role = self.role_mapper.infer_macro_role(
                speaker.role_raw, speaker.age_in_days, speaker.code)
            speaker.gender_raw = speaker_dict.get('sex', '')
            speaker.gender = ICl.clean_gender(speaker.gender_raw)
            if not speaker.gender:
                # fall back on the role to infer the gender
                speaker.gender = self.role_mapper.role_raw2gender(
                    speaker.role_raw)
            self.session.speakers.append(speaker)

    def add_record(self, rec):
        """Add the record, then blank out its morphology tiers."""
        super().add_record(rec)
        self.delete_morphemes()

    def delete_morphemes(self):
        """Clear all morpheme data on the most recent utterance."""
        utt = self.session.utterances[-1]
        utt.morpheme_raw = ''
        utt.gloss_raw = ''
        utt.pos_raw = ''
        utt.morphemes = []
class JapaneseMiiProGloss2SegmentMapper:
    """Map Japanese MiiPro glosses onto their segments."""

    gloss2seg = parse_csv(
        get_full_path(
            'parsers/corpora/main/japanese_miipro/resources/gloss2segment.csv')
    )

    @classmethod
    def map(cls, gloss):
        """Return the segment for `gloss`, '' when unmapped."""
        segment = cls.gloss2seg.get(gloss, '')
        return segment
class ChintangPOSMapper:
    """Map Chintang POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/chintang/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/chintang/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Map a Chintang POS tag.

        Affixes are classified directly by their delimiters: a leading
        dash marks a suffix, a trailing dash a prefix. Everything else
        is cleaned and looked up in the CSV-based mapping.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        if pos.startswith('-'):
            return 'sfx'
        if pos.endswith('-'):
            return 'pfx'
        cleaned = ToolboxMorphemeCleaner.clean(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')
class TuatschinPOSMapper:
    """Map Tuatschin POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/tuatschin/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/tuatschin/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Clean `pos` and look it up.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Run the Tuatschin-specific POS cleaning steps."""
        return cls.remove_specifications(pos)

    @staticmethod
    def remove_specifications(pos):
        """Remove specifications of POS tags.

        Specifications start with `_`.

        Examples:
        - words erroneously written apart: _cont
        - child forms: _Chld
        - discourse particles: _Discpart
        ...
        """
        return re.sub(r'_[^_]+', '', pos)
class CreeSessionParser(CHATParser):
    """Session parser for the Cree corpus."""

    role_mapper = RoleMapper(get_full_path(
        'parsers/corpora/main/cree/resources/speaker_label2macro_role.csv'
    ))

    @staticmethod
    def get_reader(session_file):
        """Return the Cree-specific CHAT reader."""
        return CreeReader(session_file)

    @staticmethod
    def get_cleaner():
        """Return the Cree-specific cleaner."""
        return CreeCleaner()
class JapaneseMiyataSessionParser(CHATParser):
    """Session parser for the Japanese Miyata corpus."""

    role_mapper = RoleMapper(
        get_full_path('parsers/corpora/main/japanese_miyata/resources/'
                      'speaker_label2macro_role.csv'))

    @staticmethod
    def get_reader(session_file):
        """Return the Miyata-specific CHAT reader."""
        return JapaneseMiyataReader(session_file)

    @staticmethod
    def get_cleaner():
        """Return the Miyata-specific cleaner."""
        return JapaneseMiyataCleaner()
class TurkishSessionParser(CHATParser):
    """Session parser for the Turkish corpus."""

    role_mapper = RoleMapper(get_full_path(
        'parsers/corpora/main/turkish/resources/'
        'speaker_label2macro_role.csv'
    ))

    @staticmethod
    def get_reader(session_file):
        """Return the Turkish-specific CHAT reader."""
        return TurkishReader(session_file)

    @staticmethod
    def get_cleaner():
        """Return the Turkish-specific cleaner."""
        return TurkishCleaner()
class InuktitutPOSMapper:
    """Map Inuktitut POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/inuktitut/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/inuktitut/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Clean `pos` and look it up.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Replace the POS tag separator."""
        return cls.replace_pos_separator(pos)

    @staticmethod
    def replace_pos_separator(pos):
        """Replace the POS tag separator.

        A morpheme may have several POS tags separated by a pipe.
        POS tags to the right are subcategories of the POS tags to the
        left. The separator is replaced by a dot.

        Args:
            pos (str): The POS tag.

        Returns:
            str: POS tag separator replaced by a dot.
        """
        return pos.replace('|', '.')
class CreePOSMapper:
    """Map Cree POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Clean `pos` and look it up.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        pos = cls.clean_pos(pos)
        if ud:
            return cls.pos_ud_dict.get(pos, '')
        else:
            return cls.pos_dict.get(pos, '')

    @staticmethod
    def uppercase_pos_in_parentheses(pos):
        """Uppercase POS tags in parentheses.

        Parentheses indicate covert grammatical categories.

        Fix: the previous implementation substituted the uppercased
        FIRST match into every parenthesized group; a replacement
        function now uppercases each group individually.
        """
        pos_in_parentheses_regex = re.compile(r'(\()(\S+)(\))')
        return pos_in_parentheses_regex.sub(
            lambda m: m.group(1) + m.group(2).upper() + m.group(3), pos)

    @classmethod
    def clean_pos(cls, pos):
        """Run the Cree-specific POS cleaning steps."""
        return cls.uppercase_pos_in_parentheses(pos)
class YucatecSessionParser(CHATParser):
    """Session parser for the Yucatec corpus."""

    role_mapper = RoleMapper(
        get_full_path('parsers/corpora/main/yucatec/resources/'
                      'speaker_label2macro_role.csv'))

    @staticmethod
    def get_reader(session_file):
        """Return the Yucatec-specific CHAT reader."""
        return YucatecReader(session_file)

    @staticmethod
    def get_cleaner():
        """Return the Yucatec-specific cleaner."""
        return YucatecCleaner()

    def parse(self):
        """Parse the session, then post-clean the target children."""
        session = super().parse()
        tc_cleaner.clean(session)
        return session
class ChintangSessionParser(ToolboxParser):
    """Session parser for the Chintang corpus."""

    role_mapper = RoleMapper(
        get_full_path(
            'parsers/corpora/main/chintang/resources/speaker_label2macro_role.csv'
        ))

    def get_record_reader(self):
        """Return the Chintang-specific Toolbox record reader."""
        return ChintangReader()

    def get_metadata_reader(self):
        """Return the Chintang IMDI metadata parser."""
        return ChintangIMDIParser(self.metadata_path)

    def get_cleaner(self):
        """Return the Chintang-specific cleaner."""
        return ChintangCleaner()

    def parse(self):
        """Parse the session, then post-clean the target children."""
        session = super().parse()
        tc_cleaner.clean(session)
        return session
class NungonGlossMapper:
    """Map Nungon glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/nungon/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Clean `gloss` and look it up; return '' when unknown."""
        return cls.gloss_dict.get(cls.clean_gloss(gloss), '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Run the Nungon-specific cleaning steps in order."""
        gloss = cls.remove_question_mark(gloss)
        gloss = cls.replace_slash(gloss)
        return cls.replace_plus(gloss)

    @staticmethod
    def remove_question_mark(morpheme):
        """Remove the question mark in the morpheme.

        Question marks might code insecure annotations. They are
        prefixed to the morpheme.
        """
        return morpheme.lstrip('?')

    @staticmethod
    def replace_slash(gloss):
        """Replace the slash by a dot between numbers."""
        return re.sub(r'(\d)/(\d)', r'\1.\2', gloss)

    @staticmethod
    def replace_plus(gloss):
        """Replace the plus by a dot."""
        return gloss.replace('+', '.')
class CreeGlossMapper:
    """Map Cree glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/cree/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Clean `gloss` and look it up; return '' when unknown."""
        return cls.gloss_dict.get(cls.clean_gloss(gloss), '')

    @staticmethod
    def replace_gloss_connector(gloss):
        """Replace the gloss connectors.

        There are three different gloss connectors: '.', '+', ','
        ',' adds an additional specification to a gloss, e.g.
        'p,quest” (question particle)'. '+' and ',' are replaced by a
        dot. Currently unused — see `clean_gloss`.
        """
        return gloss.replace(',', '.').replace('+', '.')

    @classmethod
    def clean_gloss(cls, gloss):
        # NOTE(review): connector replacement is deliberately left
        # disabled here, as in the original — confirm intent:
        # gloss = cls.replace_gloss_connector(gloss)
        return gloss
class KuWaruGlossMapper:
    """Map Ku Waru glosses onto standard gloss labels."""

    # CSV-based mapping of Ku Waru glosses onto standard labels.
    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/ku_waru/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Map a Ku Waru gloss onto a standard gloss label."""
        return cls.infer_gloss(gloss)

    @classmethod
    def infer_gloss(cls, gloss):
        """Infer the standard gloss label.

        The raw gloss is first normalized (morpheme delimiters removed,
        colons replaced by dots, BT/TP markers dropped, multi-word
        glosses collapsed), then classified case by case below.
        Unmappable glosses yield ''.

        Args:
            gloss (str): The raw gloss.

        Returns:
            str: The standard gloss label, or '' when unmappable.
        """
        gloss = ToolboxMorphemeCleaner.clean(gloss)
        gloss = cls.replace_colons(gloss)
        gloss = cls.remove_bt_tp(gloss)
        gloss = cls.replace_many_to_one(gloss)

        # matches person/number combinations such as 1SG, 2/3DU, 3PL
        number_person_rgx = re.compile(r'[1-3/]+(SG|DU|PL)')

        # case 1: direct mapping
        if gloss in cls.gloss_dict:
            return cls.gloss_dict[gloss]
        # case 2: number-person combinations are kept as-is
        if number_person_rgx.fullmatch(gloss):
            return gloss
        # case 3: lexical gloss (all lowercase) — not a grammatical label
        if gloss.islower():
            return ''
        # case 4: NER (named-entity) labels carry no grammatical category
        if gloss in ['PERSON', 'PLACE', 'TRIBE']:
            return ''
        # case 5: multi-category morpheme, e.g. 'IMP.2/3DU'
        if '.' in gloss:
            categories = gloss.split('.')
            mapped_categories = []
            for category in categories:
                if number_person_rgx.fullmatch(category):
                    mapped_category = category
                elif category.islower():
                    # any lexical part makes the whole gloss unmappable
                    return ''
                else:
                    # unknown grammatical parts become '???'
                    mapped_category = cls.gloss_dict.get(category, '???')
                mapped_categories.append(mapped_category)

            return '.'.join(mapped_categories)

        # other
        return ''

    @staticmethod
    def replace_colons(gloss):
        """Replace colons by dots.

        Args:
            gloss (str): The gloss.

        Example:
            IMP:2/3DU => IMP.2/3DU
        """
        return gloss.replace(':', '.')

    @staticmethod
    def remove_bt_tp(gloss):
        """Remove TP and BT categories.

        `BT` denotes baby talk
        `TP` denotes Tok Pisin

        Args:
            gloss (str): The gloss.

        Example:
            banana.BT => banana
        """
        return re.sub(r'\.(BT|TP)', '', gloss)

    @staticmethod
    def replace_many_to_one(gloss):
        """Replace multi-word gloss by one-word gloss.

        Args:
            gloss (str): The gloss.

        Current multi-word glosses mapping to one-word glosses are:
        TAG.Q that.ABK that.ABU that.ANA that.END this.DEF this.IP

        Example:
            this.DEF => PROX
        """
        gloss = gloss.replace('TAG.Q', 'Q')
        gloss = re.sub(r'that\.AB[KU]', 'DEM', gloss)
        gloss = re.sub(r'that\.(ANA|END)', 'DIST', gloss)
        gloss = re.sub(r'this\.(DEF|IP)', 'PROX', gloss)
        return gloss
class SesothoGlossMapper:
    """Map Sesotho glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/sesotho/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Clean `gloss` and look it up; return '' when unknown."""
        gloss = cls.clean_gloss(gloss)
        return cls.gloss_dict.get(gloss, '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Clean a Sesotho gloss."""
        # cleaning steps run in this fixed order
        for method in [
                cls.remove_markers,
                cls.clean_proper_names_gloss_words,
                cls.remove_nominal_concord_markers,
                cls.unify_untranscribed_glosses]:
            gloss = method(gloss)

        return gloss

    @classmethod
    def remove_markers(cls, gloss):
        """Remove noun and verb markers."""
        gloss = cls.remove_noun_markers(gloss)
        gloss = cls.remove_verb_markers(gloss)
        return gloss

    @staticmethod
    def remove_noun_markers(gloss):
        """Remove noun markers ('n^'/'N^' followed by a digit)."""
        return re.sub(r'[nN]\^(?=\d)', '', gloss)

    @staticmethod
    def remove_verb_markers(gloss):
        """Remove verb markers ('v^' and 's^')."""
        return re.sub(r'[vs]\^', '', gloss)

    @staticmethod
    def clean_proper_names_gloss_words(gloss):
        """Clean glosses of proper names.

        In proper names substitute 'n^' marker with 'a_'.
        Lowercase the labels of propernames.
        """
        gloss = re.sub(r'[nN]\^([gG]ame|[nN]ame|[pP]lace|[sS]ong)',
                       r'a_\1', gloss)
        if re.search(r'a_(Game|Name|Place|Song)', gloss):
            gloss = gloss.lower()
        return gloss

    @staticmethod
    def remove_nominal_concord_markers(gloss):
        """Remove markers for nominal concord.

        NOTE(review): the matched tag is removed via `re.sub`, so every
        occurrence of those letters in the gloss is stripped, not just
        the leading marker — confirm this is intended.
        """
        match = re.search(r'^(d|lr|obr|or|pn|ps)\d+', gloss)
        if match:
            pos = match.group(1)
            return re.sub(pos, '', gloss)

        return gloss

    @staticmethod
    def unify_untranscribed_glosses(gloss):
        """Unify untranscribed glosses.

        In Sesotho glossing for words which are not understood or
        couldn't be analyzed are marked by 'word' or by 'xxx'. Turn
        both into the standard '???'.
        """
        if gloss == 'word' or gloss == 'xxx':
            return '???'

        return gloss
class IndonesianSessionParser(ToolboxParser):
    """Session parser for the Indonesian corpus."""

    role_mapper = RoleMapper(
        get_full_path('parsers/corpora/main/indonesian/resources/'
                      'speaker_label2macro_role.csv'))

    def get_metadata_reader(self):
        """Return the CHAT metadata parser for this session."""
        return CHATParser(self.metadata_path)

    def add_session_metadata(self):
        """Set the source id (toolbox file stem) and the session date."""
        self.session.source_id = os.path.splitext(
            os.path.basename(self.toolbox_path))[0]
        metadata = self.metadata_reader.metadata['__attrs__']
        self.session.date = metadata.get('Date', None)

        return self.session

    def add_speakers(self):
        """Build `Speaker` objects from the participant metadata.

        `AUX` participants are skipped (see `is_speaker`).
        """
        for speaker_dict in self.metadata_reader.metadata['participants']:
            speaker = Speaker()
            speaker.birth_date = speaker_dict.get('birthday', '')
            speaker.code = speaker_dict.get('id', '')
            speaker.name = speaker_dict.get('name', '')
            speaker.code = Lc.correct_speaker_label(speaker.code,
                                                    speaker.name)
            speaker.languages_spoken = speaker_dict.get('language', '')
            speaker.age_raw = speaker_dict.get('age', '')
            IndonesianAgeUpdater.update(speaker, self.session.date)
            speaker.role_raw = speaker_dict.get('role', '')
            speaker.role = self.role_mapper.role_raw2role(speaker.role_raw)
            speaker.macro_role = self.role_mapper.infer_macro_role(
                speaker.role_raw, speaker.age_in_days, speaker.code)
            speaker.gender_raw = speaker_dict.get('sex', '')
            speaker.gender = speaker.gender_raw.title()
            if not speaker.gender:
                # fall back on the role to infer the gender
                speaker.gender = self.role_mapper.role_raw2gender(
                    speaker.role_raw)

            if self.is_speaker(speaker):
                self.session.speakers.append(speaker)

    def add_utterance(self, rec):
        """Add the utterance and attach its (label-corrected) speaker."""
        utt = super().add_utterance(rec)
        speaker_label = self.record_reader.get_speaker_label(rec)
        speaker_label = Lc.correct_rec_label(speaker_label)
        utt.speaker = self._get_speaker(speaker_label,
                                        self.session.speakers)
        return utt

    @staticmethod
    def is_speaker(speaker):
        """Check whether the speaker is a real speaker.

        Skip `AUX` participants.

        Args:
            speaker (Speaker): The `Speaker` instance.

        Returns:
            bool: Whether the participant is a real speaker.
        """
        return speaker.code != 'AUX'

    def get_record_reader(self):
        """Return the Indonesian-specific Toolbox record reader."""
        return IndonesianReader()

    def get_cleaner(self):
        """Return the Indonesian-specific cleaner."""
        return IndonesianCleaner()

    def add_words(self, actual_utterance, target_utterance):
        """Build `Word` objects for the actual utterance.

        Fix: the regex patterns are now raw strings — '\\(' in a plain
        string is an invalid escape sequence (SyntaxWarning in modern
        Python).
        """
        utt = self.session.utterances[-1]

        for word in self.record_reader.get_words(actual_utterance):
            w = Word()
            utt.words.append(w)
            w.word_language = ''

            # Distinguish between word and word_target;
            # otherwise the target word is identical to the actual word
            if re.search(r'\(', word):
                w.word_target = re.sub(r'[()]', '', word)
                w.word = re.sub(r'\([^)]+\)', '', word)
                w.word_actual = w.word
            else:
                w.word_target = re.sub(r'xxx?|www', '???', word)
                w.word = re.sub(r'xxx?', '???', word)
                w.word_actual = w.word
def create_views(db_path):
    """Create the database views defined in database/views.sql.

    Runs the SQL script through the `sqlite3` CLI. The command is
    passed as an argument list with the script on stdin, so `db_path`
    is never interpreted by a shell (the previous
    f'sqlite3 {db_path} < {view_path}' with shell=True was vulnerable
    to shell injection via the path).

    Args:
        db_path: Path to the SQLite database file.
    """
    view_path = get_full_path('database/views.sql')
    # binary mode: the bytes are piped straight to sqlite3, no decoding
    with open(view_path, 'rb') as sql_script:
        subprocess.run(['sqlite3', str(db_path)], stdin=sql_script)
def get_roles():
    """Return the role-mapping configuration.

    Reads util/resources/role_mapping.ini with '=' as the only
    delimiter and case-sensitive option names.

    Returns:
        ConfigParser: The parsed role mapping.
    """
    config = ConfigParser(delimiters='=')
    # keep option names case-sensitive (the default transform lowercases)
    config.optionxform = str
    config.read(get_full_path('util/resources/role_mapping.ini'))
    return config
class TuatschinGlossMapper:
    """Map Tuatschin glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/tuatschin/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Map a Tuatschin gloss onto a standard gloss label.

        After stripping the leading POS tag, person/number combinations
        (e.g. '3.Sing' -> '3SG') are rewritten, then every remaining
        dot-separated part is looked up in the CSV mapping. If any part
        is unknown or maps to '???', the whole gloss becomes ''.

        Args:
            gloss (str): The raw gloss.

        Returns:
            str: The dot-joined mapped gloss, or '' when unmappable.
        """
        gloss = cls.clean_gloss(gloss)
        if gloss:
            # replace person/number combinations first
            pnum_regex = re.compile(r'([0123])\.(Sing)')
            gloss = pnum_regex.sub(r'\1SG', gloss)
            pnum_regex = re.compile(r'([0123])\.(Plur)')
            gloss = pnum_regex.sub(r'\1PL', gloss)

            parts = []
            is_null = False
            for part in gloss.split('.'):
                # person/number combinations are kept verbatim
                if re.search(r'[0123](SG|PL)', part):
                    parts.append(part)
                else:
                    if part in cls.gloss_dict:
                        part = cls.gloss_dict[part]
                        if part != '???':
                            parts.append(part)
                        else:
                            # a part mapped to '???' voids the gloss
                            is_null = True
                            break
                    else:
                        # an unknown part voids the gloss
                        is_null = True
                        break

            if is_null:
                gloss = ''
            else:
                gloss = '.'.join(parts)
        else:
            gloss = ''

        return gloss

    @classmethod
    def clean_gloss(cls, gloss):
        """Run the Tuatschin-specific cleaning steps."""
        for cleaning_method in [cls.remove_pos]:
            gloss = cleaning_method(gloss)

        return gloss

    @staticmethod
    def remove_pos(gloss):
        """Remove the POS tag.

        Morpho-syntactic annotations start with the POS tag:
        [POS].[SUB-GlOSS1].[SUB-GLOSS2]

        Example:
            ADJ.Fem.Sing => Fem.Sing
        """
        regex = re.compile(r'^[^.]+\.')
        gloss = regex.sub('', gloss)
        return gloss