Esempio n. 1
0
который-нибудь
любой
многие
мой
наш
некий
некоторый
немногие
никакой
ничей
оный
прочий
проч.
пр.
свой
сей
таков
такой
твой
тот
чей
чей-либо
чей-нибудь
чей-то
этакий
этот
""".strip().replace('ё', 'е').split())


# Register from_opencorpora in the converters registry for the
# 'opencorpora-int' / 'dialog2017' pair (presumably source tagset, then
# target tagset -- confirm against converters.add).
converters.add('opencorpora-int', 'dialog2017', from_opencorpora)
Esempio n. 2
0
который
который-нибудь
любой
многие
мой
наш
некий
некоторый
немногие
никакой
ничей
оный
прочий
проч.
пр.
свой
сей
таков
такой
твой
тот
чей
чей-либо
чей-нибудь
чей-то
этакий
этот
""".strip().replace('ё', 'е').split())

# Register from_opencorpora in the converters registry for the
# 'opencorpora-int' / 'dialog2017' pair (presumably source tagset, then
# target tagset -- confirm against converters.add).
converters.add('opencorpora-int', 'dialog2017', from_opencorpora)
Esempio n. 3
0
    else:
        info.discard('дст')
        info.discard('стр')

    if pos == 'ИНФИНИТИВ':
        extra_info.add('inf')
    elif pos == 'ДЕЕПРИЧАСТИЕ':
        extra_info.add('ger')

    new_form = (GRAMINFO_MAP[attr] for attr in info if attr in GRAMINFO_MAP)
    return ",".join(itertools.chain([POS[pos]], extra_info, new_form))


def to_aot(dialog_tag):
    """Build an aot-style tag string from ``dialog_tag``.

    The tag is split with ``aot.split_tag``.  The part of speech is looked
    up in ``POS_INV`` (an unguarded ``[]`` lookup), and every grammeme that
    has an entry in ``GRAMINFO_MAP_INV`` is translated; grammemes without
    an entry are dropped.  For the ``V`` part of speech the result is
    refined to ИНФИНИТИВ, ПРИЧАСТИЕ or ДЕЕПРИЧАСТИЕ when the tag carries
    ``inf``, ``partcp`` or ``ger`` respectively (checked in that order).

    Returns the part of speech followed by the translated grammemes,
    joined with commas.
    """
    pos, info = aot.split_tag(dialog_tag)
    translated = (GRAMINFO_MAP_INV[gram] for gram in info
                  if gram in GRAMINFO_MAP_INV)
    aot_pos = POS_INV[pos]
    if pos == 'V':
        # Ordered pairs: the first marker present in the tag wins.
        verb_forms = (
            ('inf', 'ИНФИНИТИВ'),
            ('partcp', 'ПРИЧАСТИЕ'),
            ('ger', 'ДЕЕПРИЧАСТИЕ'),
        )
        for marker, verb_pos in verb_forms:
            if marker in info:
                aot_pos = verb_pos
                break

    return ",".join(itertools.chain((aot_pos,), translated))


# Register both directions of the dialog2010 <-> aot conversion; the
# argument order appears to be (source tagset, target tagset, function).
converters.add('dialog2010', 'aot', to_aot)
converters.add('aot', 'dialog2010', from_aot)
Esempio n. 4
0
def internal_to_external(internal_tag):
    """Translate ``internal_tag`` through the ``INTERNAL_TO_EXTERNAL`` table.

    Thin wrapper: the work is delegated to ``_translate_tag`` and its
    result is returned unchanged.
    """
    external_tag = _translate_tag(internal_tag, INTERNAL_TO_EXTERNAL)
    return external_tag


def to_aot(open_tag):
    """Build an aot-style tag string from ``open_tag``.

    Spaces in ``open_tag`` are treated like commas.  The first item is the
    part of speech and the rest are grammemes.  Both are translated through
    ``EXTERNAL_TO_AOT``; grammemes without a translation are dropped.

    Returns the translated part of speech followed by the translated
    grammemes, joined with commas, or ``''`` when no part of speech
    could be determined.
    """
    parts = open_tag.replace(" ", ',').split(',')
    open_pos = parts[0]
    open_info = parts[1:]

    pos = EXTERNAL_TO_AOT.get(open_pos)
    info = []
    for gram in open_info:
        mapped = EXTERNAL_TO_AOT.get(gram)
        if mapped is not None:
            info.append(mapped)

    # ПРИЛ with one of these source grammemes gets a dedicated
    # part of speech; 'мест-п' takes precedence over 'числ-п'.
    if open_pos == 'ПРИЛ':
        if 'мест-п' in open_info:
            pos = 'МС-П'
        elif 'числ-п' in open_info:
            pos = 'ЧИСЛ-П'

    # A translated 'пвл' grammeme always gets '2л' appended after it.
    if 'пвл' in info:
        info.append('2л')

    return ','.join([pos] + info) if pos else ''


# Register the converters defined in this module; the argument order
# appears to be (source tagset, target tagset, function).
converters.add('opencorpora-int', 'opencorpora-ext', internal_to_external)
converters.add('opencorpora-ext', 'opencorpora-int', external_to_internal)
converters.add('opencorpora-ext', 'aot', to_aot)
Esempio n. 5
0
def internal_to_external(internal_tag):
    """Translate ``internal_tag`` through the ``INTERNAL_TO_EXTERNAL`` table.

    Thin wrapper: the work is delegated to ``_translate_tag`` and its
    result is returned unchanged.
    """
    external_tag = _translate_tag(internal_tag, INTERNAL_TO_EXTERNAL)
    return external_tag


def to_aot(open_tag):
    """Build an aot-style tag string from ``open_tag``.

    Spaces in ``open_tag`` are treated like commas.  The first item is the
    part of speech and the rest are grammemes.  Both are translated through
    ``EXTERNAL_TO_AOT``; grammemes without a translation are dropped.

    Returns the translated part of speech followed by the translated
    grammemes, joined with commas, or ``''`` when no part of speech
    could be determined.
    """
    parts = open_tag.replace(" ", ',').split(',')
    open_pos = parts[0]
    open_info = parts[1:]

    pos = EXTERNAL_TO_AOT.get(open_pos)
    info = []
    for gram in open_info:
        mapped = EXTERNAL_TO_AOT.get(gram)
        if mapped is not None:
            info.append(mapped)

    # ПРИЛ with one of these source grammemes gets a dedicated
    # part of speech; 'мест-п' takes precedence over 'числ-п'.
    if open_pos == 'ПРИЛ':
        if 'мест-п' in open_info:
            pos = 'МС-П'
        elif 'числ-п' in open_info:
            pos = 'ЧИСЛ-П'

    # A translated 'пвл' grammeme always gets '2л' appended after it.
    if 'пвл' in info:
        info.append('2л')

    return ','.join([pos] + info) if pos else ''


# Register the converters defined in this module; the argument order
# appears to be (source tagset, target tagset, function).
converters.add('opencorpora-int', 'opencorpora-ext', internal_to_external)
converters.add('opencorpora-ext', 'opencorpora-int', external_to_internal)
converters.add('opencorpora-ext', 'aot', to_aot)
Esempio n. 6
0
        for categ, gmap in sorted(self.GRAM_MAP.items()):
            if gram in gmap:
                if categ == '_POS':
                    self.pos = gmap[gram]
                    match = True
                else:
                    self.grammemes[categ] = gmap[gram]
                    match = True

        if not match:
            self.unmatched.add(gram)

    def _fill_from_oc(self, oc_tag):
        """Feed every grammeme of ``oc_tag`` to ``_fill_one_gram_oc``.

        Spaces are treated like commas, so the tag becomes one flat list
        of grammemes.  ``_postprocess`` runs once after all of them have
        been handled.
        """
        for gram in oc_tag.replace(' ', ',').split(','):
            self._fill_one_gram_oc(gram)
        self._postprocess()

    def __str__(self):
        """Render the tag as ``"<pos> <features>"``.

        Features are ``category=value`` pairs taken from ``self.grammemes``
        in sorted order and joined with ``|``.  A single ``_`` stands in
        when there are no features.
        """
        pairs = ["{}={}".format(*item)
                 for item in sorted(self.grammemes.items())]
        features = '|'.join(pairs) or '_'
        return "{} {}".format(self.pos, features)


def to_ud14(oc_tag, word=None):
    """Return the string form of ``Tag(oc_tag, word)``.

    Both arguments are passed to ``Tag`` as they are; the conversion
    itself happens in that class.
    """
    return str(Tag(oc_tag, word))


# Register to_ud14 in the converters registry for the 'opencorpora-int' /
# 'ud14' pair (presumably source tagset, then target tagset).
converters.add('opencorpora-int', 'ud14', to_ud14)
Esempio n. 7
0
    else:
        info.discard("дст")
        info.discard("стр")

    if pos == "ИНФИНИТИВ":
        extra_info.add("inf")
    elif pos == "ДЕЕПРИЧАСТИЕ":
        extra_info.add("ger")

    new_form = (GRAMINFO_MAP[attr] for attr in info if attr in GRAMINFO_MAP)
    return ",".join(itertools.chain([POS[pos]], extra_info, new_form))


def to_aot(dialog_tag):
    """Build an aot-style tag string from ``dialog_tag``.

    The tag is split with ``aot.split_tag``.  The part of speech is looked
    up in ``POS_INV`` (an unguarded ``[]`` lookup), and every grammeme that
    has an entry in ``GRAMINFO_MAP_INV`` is translated; grammemes without
    an entry are dropped.  For the ``V`` part of speech the result is
    refined to ИНФИНИТИВ, ПРИЧАСТИЕ or ДЕЕПРИЧАСТИЕ when the tag carries
    ``inf``, ``partcp`` or ``ger`` respectively (checked in that order).

    Returns the part of speech followed by the translated grammemes,
    joined with commas.
    """
    pos, info = aot.split_tag(dialog_tag)
    translated = (GRAMINFO_MAP_INV[gram] for gram in info if gram in GRAMINFO_MAP_INV)
    aot_pos = POS_INV[pos]
    if pos == "V":
        # Ordered pairs: the first marker present in the tag wins.
        verb_forms = (
            ("inf", "ИНФИНИТИВ"),
            ("partcp", "ПРИЧАСТИЕ"),
            ("ger", "ДЕЕПРИЧАСТИЕ"),
        )
        for marker, verb_pos in verb_forms:
            if marker in info:
                aot_pos = verb_pos
                break

    return ",".join(itertools.chain((aot_pos,), translated))


# Register both directions of the dialog2010 <-> aot conversion; the
# argument order appears to be (source tagset, target tagset, function).
converters.add("dialog2010", "aot", to_aot)
converters.add("aot", "dialog2010", from_aot)
Esempio n. 8
0
    # ====== 13. degree of comparison ======
    if tag.degree_of_comparison == '2':
        info.add('сравн')
    elif tag.degree_of_comparison == '3':
        info.add('прев')
    if tag.POS == 'Dg' and tag.degree_of_comparison == '2': # hack?
        pos = 'П'

    # 14. negation

    # ====== 15. voice ==========
    if tag.voice == 'A':
        info.add('дст')
    elif tag.voice == 'P':
        info.add('стр')
    elif tag.mainPOS == 'V' and tag.tense != 'F': # hack?
        info.add('дст')

    # ====== 16. variant ========
    if tag.variant in ['2', '3']:
        info.add('арх')
    elif tag.variant in ['5', '6', '7']:
        info.add('разг')
    elif tag.variant == '8':
        info.add('аббр')

    return ",".join([pos] + list(info))

# Register both directions of the positional <-> aot conversion; the
# argument order appears to be (source tagset, target tagset, function).
converters.add('positional', 'aot', from_positional)
converters.add('aot', 'positional', to_positional)
Esempio n. 9
0
    return False


def from_opencorpora_int(open_tag):
    """
    Convert an OpenCorpora tag to a ruscorpora tag::

        >>> print(from_opencorpora_int('NOUN,inan,masc sing,nomn'))
        S,inan,m=sg,nom

    Returns ``'INIT=abbr'`` when ``_is_initials`` accepts the grammeme
    list, and ``'NONLEX'`` when the conversion yields an empty string.
    """

    # A space in the input becomes a standalone "|" token.  The rules are
    # expected to turn that token into "=", and the commas around "=" are
    # stripped from the joined result, so a space ends up as "=".
    tokens = open_tag.replace(' ', ',|,').split(',')

    if _is_initials(tokens):
        return 'INIT=abbr'

    converted = rule_engine.apply_rules(FROM_OPENCORPORA, tokens)
    joined = ','.join(converted)
    joined = joined.replace(',=,', '=')
    # Any ",=" still left after the previous step is removed entirely.
    joined = joined.replace(',=', '')
    return joined or 'NONLEX'


# Register from_opencorpora_int in the converters registry for the
# 'opencorpora-int' / 'ruscorpora' pair (presumably source, then target).
converters.add('opencorpora-int', 'ruscorpora', from_opencorpora_int)
Esempio n. 10
0
    else:
        info.discard('дст')
        info.discard('стр')

    if pos == 'ИНФИНИТИВ':
        extra_info.add('inf')
    elif pos == 'ДЕЕПРИЧАСТИЕ':
        extra_info.add('ger')

    new_form = (GRAMINFO_MAP[attr] for attr in info if attr in GRAMINFO_MAP)
    return ",".join(itertools.chain([POS[pos]], extra_info, new_form))


def to_aot(dialog_tag, word=None):
    """Build an aot-style tag string from ``dialog_tag``.

    The tag is split with ``aot.split_tag``.  The part of speech is looked
    up in ``POS_INV`` (an unguarded ``[]`` lookup), and every grammeme that
    has an entry in ``GRAMINFO_MAP_INV`` is translated; grammemes without
    an entry are dropped.  For the ``V`` part of speech the result is
    refined to ИНФИНИТИВ, ПРИЧАСТИЕ or ДЕЕПРИЧАСТИЕ when the tag carries
    ``inf``, ``partcp`` or ``ger`` respectively (checked in that order).

    ``word`` is accepted but not used by this function.

    Returns the part of speech followed by the translated grammemes,
    joined with commas.
    """
    pos, info = aot.split_tag(dialog_tag)
    translated = (GRAMINFO_MAP_INV[gram] for gram in info if gram in GRAMINFO_MAP_INV)
    aot_pos = POS_INV[pos]
    if pos == 'V':
        # Ordered pairs: the first marker present in the tag wins.
        verb_forms = (
            ('inf', 'ИНФИНИТИВ'),
            ('partcp', 'ПРИЧАСТИЕ'),
            ('ger', 'ДЕЕПРИЧАСТИЕ'),
        )
        for marker, verb_pos in verb_forms:
            if marker in info:
                aot_pos = verb_pos
                break

    return ",".join(itertools.chain((aot_pos,), translated))


# Register both directions of the dialog2010 <-> aot conversion; the
# argument order appears to be (source tagset, target tagset, function).
converters.add('dialog2010', 'aot', to_aot)
converters.add('aot', 'dialog2010', from_aot)
Esempio n. 11
0
        for g in grams:
            self._fill_one_gram_oc(g)
        self._postprocess()

    def __str__(self):
        """Render the tag as ``"<pos> <features>"``.

        Features are ``category=value`` pairs taken from ``self.grammemes``
        in sorted order and joined with ``|``.  A single ``_`` stands in
        when there are no features.
        """
        pairs = ["{}={}".format(*item)
                 for item in sorted(self.grammemes.items())]
        features = '|'.join(pairs) or '_'
        return "{} {}".format(self.pos, features)


class Tag20(Tag14):
    """Variant of ``Tag14`` whose ``GRAM_MAP`` carries the overrides below."""

    # Work on a deep copy so none of the overrides leak into Tag14.GRAM_MAP.
    GRAM_MAP = deepcopy(Tag14.GRAM_MAP)
    # http://universaldependencies.org/v2/postags.html
    GRAM_MAP['_POS'].update({'CONJ': 'CCONJ'})
    # http://universaldependencies.org/v2/features.html
    GRAM_MAP['VerbForm'].update({'GRND': 'Conv'})
    GRAM_MAP.update({'Abbr': {'Abbr': 'Yes'}})


def to_ud14(oc_tag, word=None):
    """Return the string form of ``Tag14(oc_tag)``.

    ``word`` is accepted but not used by this function.
    """
    return str(Tag14(oc_tag))


def to_ud20(oc_tag, word=None):
    """Return the string form of ``Tag20(oc_tag)``.

    ``word`` is accepted but not used by this function.
    """
    return str(Tag20(oc_tag))


# Register the two converters defined above; the argument order appears
# to be (source tagset, target tagset, function).
converters.add('opencorpora-int', 'ud14', to_ud14)
converters.add('opencorpora-int', 'ud20', to_ud20)
Esempio n. 12
0
            self._fill_one_gram_mystem(g)
        self._postprocess()

    def __str__(self):
        """Render the tag as ``"<pos> <features>"``.

        Features are ``category=value`` pairs taken from ``self.grammemes``
        in sorted order and joined with ``|``.  A single ``_`` stands in
        when there are no features.
        """
        pairs = ["{}={}".format(*item)
                 for item in sorted(self.grammemes.items())]
        features = '|'.join(pairs) or '_'
        return "{} {}".format(self.pos, features)


class Tag20(Tag14):
    """Variant of ``Tag14`` whose ``GRAM_MAP`` carries the overrides below."""

    # Work on a deep copy so none of the overrides leak into Tag14.GRAM_MAP.
    GRAM_MAP = deepcopy(Tag14.GRAM_MAP)
    # http://universaldependencies.org/v2/postags.html
    GRAM_MAP['_POS'].update({'CONJ': 'CCONJ'})
    # http://universaldependencies.org/v2/features.html
    GRAM_MAP['VerbForm'].update({'деепр': 'Conv'})
    GRAM_MAP.update({'Abbr': {'сокр': 'Yes'}})


def to_ud14(tag, word=None):
    """Return the string form of ``Tag14(tag)``.

    ``word`` is accepted but not used by this function.
    """
    return str(Tag14(tag))


def to_ud20(tag, word=None):
    """Return the string form of ``Tag20(tag)``.

    ``word`` is accepted but not used by this function.
    """
    return str(Tag20(tag))


# Register the two converters defined above; the argument order appears
# to be (source tagset, target tagset, function).
converters.add('mystem', 'ud14', to_ud14)
converters.add('mystem', 'ud20', to_ud20)