class EnglishPOSMapper:
    """Look up English POS tags in the tables loaded from ``pos.csv``.

    Both tables are built once, when the class is defined, from the same
    resource file.
    """

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/english/resources/pos.csv'))

    # Table consulted when ``ud`` is true (presumably Universal
    # Dependencies tags -- confirm against ``parse_pos_ud``).
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/english/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' when it is unknown.

        Args:
            pos (str): The raw POS tag; only its first ':'-separated
                part is looked up.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: The table entry for the cleaned tag, '' if absent.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` before the lookup (keeps the first tag only)."""
        return cls.extract_first_pos(pos)

    @staticmethod
    def extract_first_pos(pos):
        """Return the leading POS tag of a ':'-separated tag sequence.

        A tag without any ':' is returned unchanged.
        """
        first, _, _ = pos.partition(':')
        return first
class KuWaruPOSMapper:
    """Look up Ku Waru POS tags in the tables loaded from ``pos.csv``.

    Tags carrying a morpheme delimiter ('-' or '=') at either edge are
    classified as affixes without consulting the tables.
    """

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/ku_waru/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/ku_waru/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`.

        Args:
            pos (str): The raw POS tag.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: 'sfx', 'pfx', the table entry, or '' if unknown.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return cls.infer_pos(pos, table)

    @classmethod
    def infer_pos(cls, pos, pos_dict):
        """Classify `pos` as an affix or look it up in `pos_dict`.

        The suffix test runs first, so a tag delimited on both sides is
        reported as 'sfx'. Non-affix tags are passed through
        `ToolboxMorphemeCleaner.clean` before the lookup.
        """
        if cls.is_suffix(pos):
            return 'sfx'
        if cls.is_prefix(pos):
            return 'pfx'
        cleaned = ToolboxMorphemeCleaner.clean(pos)
        return pos_dict.get(cleaned, '')

    @staticmethod
    def is_suffix(pos):
        """Tell whether `pos` begins with '-' or '='."""
        return pos.startswith(('-', '='))

    @staticmethod
    def is_prefix(pos):
        """Tell whether `pos` ends with '-' or '='."""
        return pos.endswith(('-', '='))
class NungonPOSMapper:
    """Look up Nungon POS tags in the tables loaded from ``pos.csv``."""

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/nungon/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/nungon/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' when it is unknown.

        Args:
            pos (str): The raw POS tag; leading '?' are ignored.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: The table entry for the cleaned tag, '' if absent.
        """
        cleaned = cls.remove_question_mark(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @staticmethod
    def remove_question_mark(morpheme):
        """Strip every leading question mark from `morpheme`.

        A question mark prefixed to the morpheme might flag an insecure
        annotation; it is not part of the tag itself.
        """
        stripped = morpheme.lstrip('?')
        return stripped
class JapaneseMiyataPOSMapper:
    """Look up Japanese Miyata POS tags in the tables from ``pos.csv``."""

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path(
            'parsers/corpora/main/japanese_miyata/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path(
            'parsers/corpora/main/japanese_miyata/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' when it is unknown.

        Args:
            pos (str): The raw POS tag; ':' is converted to '.' first.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: The table entry for the cleaned tag, '' if absent.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def replace_colon_by_dot_pos(cls, pos):
        """Return `pos` with every ':' turned into '.'."""
        return '.'.join(pos.split(':'))

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` before the lookup (colons become dots)."""
        return cls.replace_colon_by_dot_pos(pos)
class QaqetPOSMapper:
    """Look up Qaqet POS tags in the tables loaded from ``pos.csv``.

    Tags carrying a morpheme delimiter ('-' or '=') at either edge are
    classified as affixes without consulting the tables.
    """

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/qaqet/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/qaqet/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`.

        Args:
            pos (str): The raw POS tag.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: 'sfx' for a tag starting with '-' or '=', 'pfx' for a
            tag ending with one of them, otherwise the table entry for
            the cleaned tag ('' if absent).
        """
        if pos.startswith(('-', '=')):
            return 'sfx'
        if pos.endswith(('-', '=')):
            # A trailing delimiter marks a prefix. This branch used to
            # return 'sfx' as well, which made it indistinguishable from
            # the suffix branch above.
            return 'pfx'

        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` before the lookup.

        Unknown-markers are unified to '???' first, then the morpheme
        delimiters are removed by `ToolboxMorphemeCleaner`.
        """
        pos = cls.unify_unknowns_morpheme(pos)
        return ToolboxMorphemeCleaner.remove_morpheme_delimiters(pos)

    @classmethod
    def unify_unknowns_morpheme(cls, morpheme):
        """Replace the different unknown-markers in `morpheme` by '???'.

        Matched are: a run of 'x' at a word boundary, '??' and '***'.
        """
        return re.sub(r'\bx+|\?{2}|\*{3}', '???', morpheme)
class RussianPOSMapper:
    """Look up Russian POS tags in the tables loaded from ``pos.csv``.

    The tag is used as the lookup key as-is, without any cleaning.
    """

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/russian/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/russian/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' when it is unknown.

        Args:
            pos (str): The raw POS tag.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: The table entry for `pos`, '' if absent.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')
class JapaneseMiiProPOSMapper:
    """Look up Japanese MiiPro POS tags in the tables from ``pos.csv``.

    The tag is used as the lookup key as-is, without any cleaning.
    """

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path(
            'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path(
            'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' when it is unknown.

        Args:
            pos (str): The raw POS tag.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: The table entry for `pos`, '' if absent.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')
class ChintangPOSMapper:
    """Look up Chintang POS tags in the tables loaded from ``pos.csv``.

    Tags with a '-' at either edge are classified as affixes without
    consulting the tables.
    """

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/chintang/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/chintang/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`.

        Args:
            pos (str): The raw POS tag.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: 'sfx' for a tag starting with '-', 'pfx' for a tag
            ending with '-', otherwise the table entry for the tag as
            cleaned by `ToolboxMorphemeCleaner.clean` ('' if absent).
        """
        # The leading hyphen is tested first, so a tag hyphenated on
        # both sides counts as a suffix.
        if pos.startswith('-'):
            return 'sfx'
        if pos.endswith('-'):
            return 'pfx'

        cleaned = ToolboxMorphemeCleaner.clean(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')
class TuatschinPOSMapper:
    """Look up Tuatschin POS tags in the tables loaded from ``pos.csv``."""

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/tuatschin/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/tuatschin/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' when it is unknown.

        Args:
            pos (str): The raw POS tag; '_'-specifications are dropped.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: The table entry for the cleaned tag, '' if absent.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` before the lookup (drops specifications)."""
        return cls.remove_specifications(pos)

    @staticmethod
    def remove_specifications(pos):
        """Strip all specifications from the POS tag.

        A specification is an underscore followed by one or more
        non-underscore characters, for instance:
            - `_cont`: words erroneously written apart
            - `_Chld`: child forms
            - `_Discpart`: discourse particles
        """
        return re.sub(r'_[^_]+', '', pos)
class InuktitutPOSMapper:
    """Look up Inuktitut POS tags in the tables loaded from ``pos.csv``."""

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/inuktitut/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/inuktitut/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' when it is unknown.

        Args:
            pos (str): The raw POS tag; '|' is converted to '.' first.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: The table entry for the cleaned tag, '' if absent.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` before the lookup (pipes become dots)."""
        return cls.replace_pos_separator(pos)

    @staticmethod
    def replace_pos_separator(pos):
        """Turn the pipe separator between POS tags into a dot.

        One morpheme can carry several pipe-separated POS tags, where a
        tag further to the right is a subcategory of the tag to its
        left.

        Args:
            pos (str): The POS tag.

        Returns:
            str: The POS tag with every '|' replaced by '.'.
        """
        return '.'.join(pos.split('|'))
class CreePOSMapper:
    """Look up Cree POS tags in the tables loaded from ``pos.csv``."""

    # Table consulted when ``ud`` is false.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    # Table consulted when ``ud`` is true.
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' when it is unknown.

        Args:
            pos (str): The raw POS tag; parenthesized parts are
                uppercased before the lookup.
            ud (bool): Use `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: The table entry for the cleaned tag, '' if absent.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @staticmethod
    def uppercase_pos_in_parentheses(pos):
        """Uppercase POS tags in parentheses.

        Parentheses indicate covert grammatical categories.

        Args:
            pos (str): The POS tag.

        Returns:
            str: `pos` with the content of every parenthesized,
            whitespace-free group uppercased; `pos` unchanged when there
            is no such group.
        """
        def uppercase_match(match):
            return match.group(1) + match.group(2).upper() + match.group(3)

        # A replacement function is used instead of a formatted template
        # string: content starting with a digit (e.g. '(3)') would fuse
        # with the '\1' backreference into an invalid group number, a
        # backslash would be read as an escape, and all groups would
        # receive the content of the first one.
        return re.sub(r'(\()(\S+)(\))', uppercase_match, pos)

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` before the lookup."""
        return cls.uppercase_pos_in_parentheses(pos)