class JapaneseMiiProPOSMapper:
    """Map Japanese MiiPro POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Look up `pos` in the requested tag set.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')
class QaqetGlossMapper:
    """Map Qaqet glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/qaqet/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Strip morpheme delimiters, then infer the standard gloss."""
        cleaned = ToolboxMorphemeCleaner.remove_morpheme_delimiters(gloss)
        return cls.infer_gloss(cleaned)

    @classmethod
    def infer_gloss(cls, gloss):
        """Map every dot-separated atomic gloss.

        Atoms missing from the mapping become '???'. The result is ''
        when the input is empty or when no atom at all could be mapped
        (i.e. every atom came out as '???').

        Returns:
            str: The dot-joined mapped gloss or ''.
        """
        if not gloss:
            return ''
        mapped = [cls.gloss_dict.get(atom, '???')
                  for atom in gloss.split('.')]
        if all(atom == '???' for atom in mapped):
            return ''
        return '.'.join(mapped)
def __init__(self, path):
    """Parse an IMDI metadata file into `self.metadata`.

    Falls back on a bundled dummy IMDI file when `path` does not exist,
    so parsing can proceed (with empty metadata) instead of crashing.

    Args:
        path (str): Path to the IMDI file.
    """
    # check if IMDI file exists
    if os.path.isfile(path):
        self.path = path
    else:
        # load dummy file
        self.path = get_full_path('parsers/metadata/resources/dummy.imdi')
        print(f'ERROR: IMDI {path} missing!!!')
        print('Loading dummy file...')
    self.tree = objectify.parse(self.path)
    self.root = self.tree.getroot()
    self.metadata = {
        '__attrs__': self.parse_attrs(self.root),
    }
    # Cname = file basename with the .xml/.imdi suffix (and anything
    # following it) stripped; presumably used as the session name.
    self.metadata['__attrs__']['Cname'] = re.sub(
        r'\.xml.*|\.imdi.*', "", os.path.basename(str(self.path)))
    # Special case for Indonesian
    # Explanation: this converts the session ID to the same format as in the body files
    # Unless that issue was fixed in another way we will probably still want it
    # TODO: figure out what's going wrong with Indonesian
    """
class InuktitutGlossMapper:
    """Map Inuktitut glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/inuktitut/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Clean `gloss` and look it up; return '' when unknown."""
        return cls.gloss_dict.get(cls.clean_gloss(gloss), '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Replace the stem and grammatical gloss connector."""
        return cls.replace_stem_gram_gloss_connector(gloss)

    @staticmethod
    def replace_stem_gram_gloss_connector(gloss):
        """Replace the stem and grammatical gloss connector.

        A stem gloss is connected with a grammatical gloss by an
        ampersand; the connector is replaced by a dot.

        Args:
            gloss (str): The gloss.

        Returns:
            str: The gloss with '&' replaced by '.'.
        """
        return gloss.replace('&', '.')
def _parse():
    """Return durations from session_durations.csv.

    Returns:
        Dict[str, Dict[str, str]]: The duration indexed by corpus and
            source_id.
    """
    full_path = get_full_path('util/resources/session_durations.csv')
    durations = {}
    with open(full_path, 'r', encoding='utf8') as f:
        for row in csv.DictReader(f):
            # setdefault replaces the explicit per-row membership test
            corpus_durations = durations.setdefault(row['corpus'], {})
            corpus_durations[row['source_id']] = row['duration']
    return durations
class RussianGlossMapper:
    """Map Russian glosses onto standard gloss labels."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/russian/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the standard label for `gloss`, '' when unmapped."""
        try:
            return cls.gloss_dict[gloss]
        except KeyError:
            return ''
class TurkishGloss2SegmentMapper:
    """Map Turkish glosses onto their segments."""

    gloss2seg = parse_csv(get_full_path(
        'parsers/corpora/main/turkish/resources/gloss2segment.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the segment for `gloss`, '' when unmapped."""
        segment = cls.gloss2seg.get(gloss, '')
        return segment
class JapaneseMiyataGlossMapper:
    """Map Japanese Miyata glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path(
            'parsers/corpora/main/japanese_miyata/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the standard label for `gloss`, '' when unmapped."""
        try:
            return cls.gloss_dict[gloss]
        except KeyError:
            return ''
class ChintangGlossMapper:
    """Map Chintang glosses onto standard gloss labels."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/chintang/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Strip morpheme delimiters, then look the gloss up.

        Returns:
            str: The standard gloss label, '' when unmapped.
        """
        delimiter_free = ToolboxMorphemeCleaner.remove_morpheme_delimiters(
            gloss)
        return cls.gloss_dict.get(delimiter_free, '')
class RussianSessionParser(ToolboxParser):
    """Session parser for the Russian corpus."""

    role_mapper = RoleMapper(get_full_path(
        'parsers/corpora/main/russian/resources/speaker_label2macro_role.csv'
    ))

    def get_record_reader(self):
        """Return the Russian-specific Toolbox record reader."""
        return RussianReader()

    def get_metadata_reader(self):
        """Return the IMDI metadata parser for this session."""
        return IMDIParser(self.metadata_path)

    def get_cleaner(self):
        """Return the Russian-specific cleaner."""
        return RussianCleaner()

    def parse(self):
        """Parse the session, then post-clean the target children."""
        session = super().parse()
        tc_cleaner.clean(session)
        return session

    def add_speakers(self):
        """Build `Speaker` objects from the IMDI participant metadata."""
        for speaker_dict in self.metadata_reader.metadata['participants']:
            speaker = Speaker()
            speaker.birth_date = ICl.clean_date(
                speaker_dict.get('birthdate', ''))
            speaker.code = ICl.clean_label(speaker_dict.get('code', ''))
            speaker.name = ICl.clean_name(speaker_dict.get('name', ''))
            speaker.languages_spoken = speaker_dict.get('language', '')
            speaker.age_raw = speaker_dict.get('age', '')
            ToolboxAgeUpdater.update(speaker, self.session.date)
            # Fix: role_raw was previously assigned twice with the same
            # value; the redundant first assignment has been removed.
            speaker.role_raw = speaker_dict.get('familysocialrole', '')
            speaker.role = self.role_mapper.role_raw2role(speaker.role_raw)
            speaker.macro_role = self.role_mapper.infer_macro_role(
                speaker.role_raw, speaker.age_in_days, speaker.code)
            speaker.gender_raw = speaker_dict.get('sex', '')
            speaker.gender = ICl.clean_gender(speaker.gender_raw)
            if not speaker.gender:
                # fall back on the role to infer the gender
                speaker.gender = self.role_mapper.role_raw2gender(
                    speaker.role_raw)
            self.session.speakers.append(speaker)

    def add_record(self, rec):
        """Add the record, then blank out its morphology tiers."""
        super().add_record(rec)
        self.delete_morphemes()

    def delete_morphemes(self):
        """Clear all morpheme data on the most recent utterance."""
        utt = self.session.utterances[-1]
        utt.morpheme_raw = ''
        utt.gloss_raw = ''
        utt.pos_raw = ''
        utt.morphemes = []
class JapaneseMiiProGloss2SegmentMapper:
    """Map Japanese MiiPro glosses onto their segments."""

    gloss2seg = parse_csv(
        get_full_path(
            'parsers/corpora/main/japanese_miipro/resources/gloss2segment.csv')
    )

    @classmethod
    def map(cls, gloss):
        """Return the segment for `gloss`, '' when unmapped."""
        segment = cls.gloss2seg.get(gloss, '')
        return segment
class ChintangPOSMapper:
    """Map Chintang POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/chintang/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/chintang/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Map a Chintang POS tag.

        Affixes are classified directly by their delimiters: a leading
        dash marks a suffix, a trailing dash a prefix. Everything else
        is cleaned and looked up in the CSV-based mapping.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        if pos.startswith('-'):
            return 'sfx'
        if pos.endswith('-'):
            return 'pfx'
        cleaned = ToolboxMorphemeCleaner.clean(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')
class TuatschinPOSMapper:
    """Map Tuatschin POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/tuatschin/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/tuatschin/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Clean `pos` and look it up.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Run the Tuatschin-specific POS cleaning steps."""
        return cls.remove_specifications(pos)

    @staticmethod
    def remove_specifications(pos):
        """Remove specifications of POS tags.

        Specifications start with `_`.

        Examples:
        - words erroneously written apart: _cont
        - child forms: _Chld
        - discourse particles: _Discpart
        ...
        """
        return re.sub(r'_[^_]+', '', pos)
class CreeSessionParser(CHATParser):
    """Session parser for the Cree corpus."""

    role_mapper = RoleMapper(get_full_path(
        'parsers/corpora/main/cree/resources/speaker_label2macro_role.csv'
    ))

    @staticmethod
    def get_reader(session_file):
        """Return the Cree-specific CHAT reader."""
        return CreeReader(session_file)

    @staticmethod
    def get_cleaner():
        """Return the Cree-specific cleaner."""
        return CreeCleaner()
class JapaneseMiyataSessionParser(CHATParser):
    """Session parser for the Japanese Miyata corpus."""

    role_mapper = RoleMapper(
        get_full_path('parsers/corpora/main/japanese_miyata/resources/'
                      'speaker_label2macro_role.csv'))

    @staticmethod
    def get_reader(session_file):
        """Return the Miyata-specific CHAT reader."""
        return JapaneseMiyataReader(session_file)

    @staticmethod
    def get_cleaner():
        """Return the Miyata-specific cleaner."""
        return JapaneseMiyataCleaner()
class TurkishSessionParser(CHATParser):
    """Session parser for the Turkish corpus."""

    role_mapper = RoleMapper(get_full_path(
        'parsers/corpora/main/turkish/resources/'
        'speaker_label2macro_role.csv'
    ))

    @staticmethod
    def get_reader(session_file):
        """Return the Turkish-specific CHAT reader."""
        return TurkishReader(session_file)

    @staticmethod
    def get_cleaner():
        """Return the Turkish-specific cleaner."""
        return TurkishCleaner()
class InuktitutPOSMapper:
    """Map Inuktitut POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/inuktitut/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/inuktitut/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Clean `pos` and look it up.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Replace the POS tag separator."""
        return cls.replace_pos_separator(pos)

    @staticmethod
    def replace_pos_separator(pos):
        """Replace the POS tag separator.

        A morpheme may have several POS tags separated by a pipe.
        POS tags to the right are subcategories of the POS tags to the
        left. The separator is replaced by a dot.

        Args:
            pos (str): The POS tag.

        Returns:
            str: POS tag separator replaced by a dot.
        """
        return pos.replace('|', '.')
class CreePOSMapper:
    """Map Cree POS tags onto the standard and UD tag sets."""

    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Clean `pos` and look it up.

        Args:
            pos (str): The original POS tag.
            ud (bool): If True, map onto the Universal Dependencies set.

        Returns:
            str: The mapped POS tag, '' when unknown.
        """
        pos = cls.clean_pos(pos)
        if ud:
            return cls.pos_ud_dict.get(pos, '')
        else:
            return cls.pos_dict.get(pos, '')

    @staticmethod
    def uppercase_pos_in_parentheses(pos):
        """Uppercase POS tags in parentheses.

        Parentheses indicate covert grammatical categories.

        Fix: the previous implementation substituted the uppercased
        FIRST match into every parenthesized group; a replacement
        function now uppercases each group individually.
        """
        pos_in_parentheses_regex = re.compile(r'(\()(\S+)(\))')
        return pos_in_parentheses_regex.sub(
            lambda m: m.group(1) + m.group(2).upper() + m.group(3), pos)

    @classmethod
    def clean_pos(cls, pos):
        """Run the Cree-specific POS cleaning steps."""
        return cls.uppercase_pos_in_parentheses(pos)
class YucatecSessionParser(CHATParser):
    """Session parser for the Yucatec corpus."""

    role_mapper = RoleMapper(
        get_full_path('parsers/corpora/main/yucatec/resources/'
                      'speaker_label2macro_role.csv'))

    @staticmethod
    def get_reader(session_file):
        """Return the Yucatec-specific CHAT reader."""
        return YucatecReader(session_file)

    @staticmethod
    def get_cleaner():
        """Return the Yucatec-specific cleaner."""
        return YucatecCleaner()

    def parse(self):
        """Parse the session, then post-clean the target children."""
        session = super().parse()
        tc_cleaner.clean(session)
        return session
class ChintangSessionParser(ToolboxParser):
    """Session parser for the Chintang corpus."""

    role_mapper = RoleMapper(
        get_full_path(
            'parsers/corpora/main/chintang/resources/speaker_label2macro_role.csv'
        ))

    def get_record_reader(self):
        """Return the Chintang-specific Toolbox record reader."""
        return ChintangReader()

    def get_metadata_reader(self):
        """Return the Chintang IMDI metadata parser."""
        return ChintangIMDIParser(self.metadata_path)

    def get_cleaner(self):
        """Return the Chintang-specific cleaner."""
        return ChintangCleaner()

    def parse(self):
        """Parse the session, then post-clean the target children."""
        session = super().parse()
        tc_cleaner.clean(session)
        return session
class NungonGlossMapper:
    """Map Nungon glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/nungon/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Clean `gloss` and look it up; return '' when unknown."""
        return cls.gloss_dict.get(cls.clean_gloss(gloss), '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Run the Nungon-specific cleaning steps in order."""
        gloss = cls.remove_question_mark(gloss)
        gloss = cls.replace_slash(gloss)
        return cls.replace_plus(gloss)

    @staticmethod
    def remove_question_mark(morpheme):
        """Remove the question mark in the morpheme.

        Question marks might code insecure annotations. They are
        prefixed to the morpheme.
        """
        return morpheme.lstrip('?')

    @staticmethod
    def replace_slash(gloss):
        """Replace the slash by a dot between numbers."""
        return re.sub(r'(\d)/(\d)', r'\1.\2', gloss)

    @staticmethod
    def replace_plus(gloss):
        """Replace the plus by a dot."""
        return gloss.replace('+', '.')
class CreeGlossMapper:
    """Map Cree glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/cree/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Clean `gloss` and look it up; return '' when unknown."""
        return cls.gloss_dict.get(cls.clean_gloss(gloss), '')

    @staticmethod
    def replace_gloss_connector(gloss):
        """Replace the gloss connectors.

        There are three different gloss connectors: '.', '+', ','
        ',' adds an additional specification to a gloss, e.g.
        'p,quest” (question particle)'. '+' and ',' are replaced by a
        dot. Currently unused — see `clean_gloss`.
        """
        return gloss.replace(',', '.').replace('+', '.')

    @classmethod
    def clean_gloss(cls, gloss):
        # NOTE(review): connector replacement is deliberately left
        # disabled here, as in the original — confirm intent:
        # gloss = cls.replace_gloss_connector(gloss)
        return gloss
class KuWaruGlossMapper:
    """Map Ku Waru glosses onto standard gloss labels."""

    # CSV-based mapping of Ku Waru glosses onto standard labels.
    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/ku_waru/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Map a Ku Waru gloss onto a standard gloss label."""
        return cls.infer_gloss(gloss)

    @classmethod
    def infer_gloss(cls, gloss):
        """Infer the standard gloss label.

        The raw gloss is first normalized (morpheme delimiters removed,
        colons replaced by dots, BT/TP markers dropped, multi-word
        glosses collapsed), then classified case by case below.
        Unmappable glosses yield ''.

        Args:
            gloss (str): The raw gloss.

        Returns:
            str: The standard gloss label, or '' when unmappable.
        """
        gloss = ToolboxMorphemeCleaner.clean(gloss)
        gloss = cls.replace_colons(gloss)
        gloss = cls.remove_bt_tp(gloss)
        gloss = cls.replace_many_to_one(gloss)

        # matches person/number combinations such as 1SG, 2/3DU, 3PL
        number_person_rgx = re.compile(r'[1-3/]+(SG|DU|PL)')

        # case 1: direct mapping
        if gloss in cls.gloss_dict:
            return cls.gloss_dict[gloss]
        # case 2: number-person combinations are kept as-is
        if number_person_rgx.fullmatch(gloss):
            return gloss
        # case 3: lexical gloss (all lowercase) — not a grammatical label
        if gloss.islower():
            return ''
        # case 4: NER (named-entity) labels carry no grammatical category
        if gloss in ['PERSON', 'PLACE', 'TRIBE']:
            return ''
        # case 5: multi-category morpheme, e.g. 'IMP.2/3DU'
        if '.' in gloss:
            categories = gloss.split('.')
            mapped_categories = []
            for category in categories:
                if number_person_rgx.fullmatch(category):
                    mapped_category = category
                elif category.islower():
                    # any lexical part makes the whole gloss unmappable
                    return ''
                else:
                    # unknown grammatical parts become '???'
                    mapped_category = cls.gloss_dict.get(category, '???')
                mapped_categories.append(mapped_category)

            return '.'.join(mapped_categories)

        # other
        return ''

    @staticmethod
    def replace_colons(gloss):
        """Replace colons by dots.

        Args:
            gloss (str): The gloss.

        Example:
            IMP:2/3DU => IMP.2/3DU
        """
        return gloss.replace(':', '.')

    @staticmethod
    def remove_bt_tp(gloss):
        """Remove TP and BT categories.

        `BT` denotes baby talk
        `TP` denotes Tok Pisin

        Args:
            gloss (str): The gloss.

        Example:
            banana.BT => banana
        """
        return re.sub(r'\.(BT|TP)', '', gloss)

    @staticmethod
    def replace_many_to_one(gloss):
        """Replace multi-word gloss by one-word gloss.

        Args:
            gloss (str): The gloss.

        Current multi-word glosses mapping to one-word glosses are:
        TAG.Q that.ABK that.ABU that.ANA that.END this.DEF this.IP

        Example:
            this.DEF => PROX
        """
        gloss = gloss.replace('TAG.Q', 'Q')
        gloss = re.sub(r'that\.AB[KU]', 'DEM', gloss)
        gloss = re.sub(r'that\.(ANA|END)', 'DIST', gloss)
        gloss = re.sub(r'this\.(DEF|IP)', 'PROX', gloss)
        return gloss
class SesothoGlossMapper:
    """Map Sesotho glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/sesotho/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Clean `gloss` and look it up; return '' when unknown."""
        gloss = cls.clean_gloss(gloss)
        return cls.gloss_dict.get(gloss, '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Clean a Sesotho gloss."""
        # cleaning steps run in this fixed order
        for method in [
                cls.remove_markers,
                cls.clean_proper_names_gloss_words,
                cls.remove_nominal_concord_markers,
                cls.unify_untranscribed_glosses]:
            gloss = method(gloss)

        return gloss

    @classmethod
    def remove_markers(cls, gloss):
        """Remove noun and verb markers."""
        gloss = cls.remove_noun_markers(gloss)
        gloss = cls.remove_verb_markers(gloss)
        return gloss

    @staticmethod
    def remove_noun_markers(gloss):
        """Remove noun markers ('n^'/'N^' followed by a digit)."""
        return re.sub(r'[nN]\^(?=\d)', '', gloss)

    @staticmethod
    def remove_verb_markers(gloss):
        """Remove verb markers ('v^' and 's^')."""
        return re.sub(r'[vs]\^', '', gloss)

    @staticmethod
    def clean_proper_names_gloss_words(gloss):
        """Clean glosses of proper names.

        In proper names substitute 'n^' marker with 'a_'.
        Lowercase the labels of propernames.
        """
        gloss = re.sub(r'[nN]\^([gG]ame|[nN]ame|[pP]lace|[sS]ong)',
                       r'a_\1', gloss)
        if re.search(r'a_(Game|Name|Place|Song)', gloss):
            gloss = gloss.lower()
        return gloss

    @staticmethod
    def remove_nominal_concord_markers(gloss):
        """Remove markers for nominal concord.

        NOTE(review): the matched tag is removed via `re.sub`, so every
        occurrence of those letters in the gloss is stripped, not just
        the leading marker — confirm this is intended.
        """
        match = re.search(r'^(d|lr|obr|or|pn|ps)\d+', gloss)
        if match:
            pos = match.group(1)
            return re.sub(pos, '', gloss)

        return gloss

    @staticmethod
    def unify_untranscribed_glosses(gloss):
        """Unify untranscribed glosses.

        In Sesotho glossing for words which are not understood or
        couldn't be analyzed are marked by 'word' or by 'xxx'. Turn
        both into the standard '???'.
        """
        if gloss == 'word' or gloss == 'xxx':
            return '???'

        return gloss
class IndonesianSessionParser(ToolboxParser):
    """Session parser for the Indonesian corpus."""

    role_mapper = RoleMapper(
        get_full_path('parsers/corpora/main/indonesian/resources/'
                      'speaker_label2macro_role.csv'))

    def get_metadata_reader(self):
        """Return the CHAT metadata parser for this session."""
        return CHATParser(self.metadata_path)

    def add_session_metadata(self):
        """Set the source id (toolbox file stem) and the session date."""
        self.session.source_id = os.path.splitext(
            os.path.basename(self.toolbox_path))[0]
        metadata = self.metadata_reader.metadata['__attrs__']
        self.session.date = metadata.get('Date', None)

        return self.session

    def add_speakers(self):
        """Build `Speaker` objects from the participant metadata.

        `AUX` participants are skipped (see `is_speaker`).
        """
        for speaker_dict in self.metadata_reader.metadata['participants']:
            speaker = Speaker()
            speaker.birth_date = speaker_dict.get('birthday', '')
            speaker.code = speaker_dict.get('id', '')
            speaker.name = speaker_dict.get('name', '')
            speaker.code = Lc.correct_speaker_label(speaker.code,
                                                    speaker.name)
            speaker.languages_spoken = speaker_dict.get('language', '')
            speaker.age_raw = speaker_dict.get('age', '')
            IndonesianAgeUpdater.update(speaker, self.session.date)
            speaker.role_raw = speaker_dict.get('role', '')
            speaker.role = self.role_mapper.role_raw2role(speaker.role_raw)
            speaker.macro_role = self.role_mapper.infer_macro_role(
                speaker.role_raw, speaker.age_in_days, speaker.code)
            speaker.gender_raw = speaker_dict.get('sex', '')
            speaker.gender = speaker.gender_raw.title()
            if not speaker.gender:
                # fall back on the role to infer the gender
                speaker.gender = self.role_mapper.role_raw2gender(
                    speaker.role_raw)

            if self.is_speaker(speaker):
                self.session.speakers.append(speaker)

    def add_utterance(self, rec):
        """Add the utterance and attach its (label-corrected) speaker."""
        utt = super().add_utterance(rec)
        speaker_label = self.record_reader.get_speaker_label(rec)
        speaker_label = Lc.correct_rec_label(speaker_label)
        utt.speaker = self._get_speaker(speaker_label,
                                        self.session.speakers)
        return utt

    @staticmethod
    def is_speaker(speaker):
        """Check whether the speaker is a real speaker.

        Skip `AUX` participants.

        Args:
            speaker (Speaker): The `Speaker` instance.

        Returns:
            bool: Whether the participant is a real speaker.
        """
        return speaker.code != 'AUX'

    def get_record_reader(self):
        """Return the Indonesian-specific Toolbox record reader."""
        return IndonesianReader()

    def get_cleaner(self):
        """Return the Indonesian-specific cleaner."""
        return IndonesianCleaner()

    def add_words(self, actual_utterance, target_utterance):
        """Build `Word` objects for the actual utterance.

        Fix: the regex patterns are now raw strings — '\\(' in a plain
        string is an invalid escape sequence (SyntaxWarning in modern
        Python).
        """
        utt = self.session.utterances[-1]

        for word in self.record_reader.get_words(actual_utterance):
            w = Word()
            utt.words.append(w)
            w.word_language = ''

            # Distinguish between word and word_target;
            # otherwise the target word is identical to the actual word
            if re.search(r'\(', word):
                w.word_target = re.sub(r'[()]', '', word)
                w.word = re.sub(r'\([^)]+\)', '', word)
                w.word_actual = w.word
            else:
                w.word_target = re.sub(r'xxx?|www', '???', word)
                w.word = re.sub(r'xxx?', '???', word)
                w.word_actual = w.word
def create_views(db_path):
    """Create the database views defined in database/views.sql.

    Runs the SQL script through the `sqlite3` CLI. The command is
    passed as an argument list with the script on stdin, so `db_path`
    is never interpreted by a shell (the previous
    f'sqlite3 {db_path} < {view_path}' with shell=True was vulnerable
    to shell injection via the path).

    Args:
        db_path: Path to the SQLite database file.
    """
    view_path = get_full_path('database/views.sql')
    # binary mode: the bytes are piped straight to sqlite3, no decoding
    with open(view_path, 'rb') as sql_script:
        subprocess.run(['sqlite3', str(db_path)], stdin=sql_script)
def get_roles():
    """Return the role-mapping configuration.

    Reads util/resources/role_mapping.ini with '=' as the only
    delimiter and case-sensitive option names.

    Returns:
        ConfigParser: The parsed role mapping.
    """
    config = ConfigParser(delimiters='=')
    # keep option names case-sensitive (the default transform lowercases)
    config.optionxform = str
    config.read(get_full_path('util/resources/role_mapping.ini'))
    return config
class TuatschinGlossMapper:
    """Map Tuatschin glosses onto standard gloss labels."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/tuatschin/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Map a Tuatschin gloss onto a standard gloss label.

        After stripping the leading POS tag, person/number combinations
        (e.g. '3.Sing' -> '3SG') are rewritten, then every remaining
        dot-separated part is looked up in the CSV mapping. If any part
        is unknown or maps to '???', the whole gloss becomes ''.

        Args:
            gloss (str): The raw gloss.

        Returns:
            str: The dot-joined mapped gloss, or '' when unmappable.
        """
        gloss = cls.clean_gloss(gloss)
        if gloss:
            # replace person/number combinations first
            pnum_regex = re.compile(r'([0123])\.(Sing)')
            gloss = pnum_regex.sub(r'\1SG', gloss)
            pnum_regex = re.compile(r'([0123])\.(Plur)')
            gloss = pnum_regex.sub(r'\1PL', gloss)

            parts = []
            is_null = False
            for part in gloss.split('.'):
                # person/number combinations are kept verbatim
                if re.search(r'[0123](SG|PL)', part):
                    parts.append(part)
                else:
                    if part in cls.gloss_dict:
                        part = cls.gloss_dict[part]
                        if part != '???':
                            parts.append(part)
                        else:
                            # a part mapped to '???' voids the gloss
                            is_null = True
                            break
                    else:
                        # an unknown part voids the gloss
                        is_null = True
                        break

            if is_null:
                gloss = ''
            else:
                gloss = '.'.join(parts)
        else:
            gloss = ''

        return gloss

    @classmethod
    def clean_gloss(cls, gloss):
        """Run the Tuatschin-specific cleaning steps."""
        for cleaning_method in [cls.remove_pos]:
            gloss = cleaning_method(gloss)

        return gloss

    @staticmethod
    def remove_pos(gloss):
        """Remove the POS tag.

        Morpho-syntactic annotations start with the POS tag:
        [POS].[SUB-GlOSS1].[SUB-GLOSS2]

        Example:
            ADJ.Fem.Sing => Fem.Sing
        """
        regex = re.compile(r'^[^.]+\.')
        gloss = regex.sub('', gloss)
        return gloss