который-нибудь любой многие мой наш некий некоторый немногие никакой ничей оный прочий проч. пр. свой сей таков такой твой тот чей чей-либо чей-нибудь чей-то этакий этот """.strip().replace('ё', 'е').split()) converters.add('opencorpora-int', 'dialog2017', from_opencorpora)
который который-нибудь любой многие мой наш некий некоторый немногие никакой ничей оный прочий проч. пр. свой сей таков такой твой тот чей чей-либо чей-нибудь чей-то этакий этот """.strip().replace('ё', 'е').split()) converters.add('opencorpora-int', 'dialog2017', from_opencorpora)
else: info.discard('дст') info.discard('стр') if pos == 'ИНФИНИТИВ': extra_info.add('inf') elif pos == 'ДЕЕПРИЧАСТИЕ': extra_info.add('ger') new_form = (GRAMINFO_MAP[attr] for attr in info if attr in GRAMINFO_MAP) return ",".join(itertools.chain([POS[pos]], extra_info, new_form)) def to_aot(dialog_tag): pos, info = aot.split_tag(dialog_tag) new_form = (GRAMINFO_MAP_INV[tag] for tag in info if tag in GRAMINFO_MAP_INV) new_pos = POS_INV[pos] if pos == 'V': if 'inf' in info: new_pos = 'ИНФИНИТИВ' elif 'partcp' in info: new_pos = 'ПРИЧАСТИЕ' elif 'ger' in info: new_pos = 'ДЕЕПРИЧАСТИЕ' return ",".join(itertools.chain([new_pos], new_form)) converters.add('dialog2010', 'aot', to_aot) converters.add('aot', 'dialog2010', from_aot)
def internal_to_external(internal_tag):
    """Translate ``internal_tag`` with the ``INTERNAL_TO_EXTERNAL`` table.

    Thin wrapper around ``_translate_tag``; its result is returned as is.
    """
    return _translate_tag(internal_tag, INTERNAL_TO_EXTERNAL)


def to_aot(open_tag):
    """Convert an external-format OpenCorpora tag string to an AOT tag.

    Spaces in ``open_tag`` are treated like commas.  The first item is the
    part of speech, the remaining items are grammemes.  Everything is
    mapped through ``EXTERNAL_TO_AOT``; grammemes without a mapping are
    dropped.

    Returns the comma-joined AOT tag, or ``''`` when no part of speech
    could be determined.
    """
    pieces = open_tag.replace(" ", ',').split(',')
    open_pos = pieces[0]
    open_info = pieces[1:]

    aot_info = []
    for gram in open_info:
        mapped = EXTERNAL_TO_AOT.get(gram)
        if mapped is not None:
            aot_info.append(mapped)

    aot_pos = EXTERNAL_TO_AOT.get(open_pos)
    if open_pos == 'ПРИЛ':
        # Pronominal / numeral adjectives get their own AOT part of speech.
        if 'мест-п' in open_info:
            aot_pos = 'МС-П'
        elif 'числ-п' in open_info:
            aot_pos = 'ЧИСЛ-П'

    if not aot_pos:
        return ''

    # 'пвл' is always accompanied by '2л' in the output.
    if 'пвл' in aot_info:
        aot_info.append('2л')

    return ','.join([aot_pos] + aot_info)


# Hook the conversions into the shared ``converters`` object
# (arguments look like: source tagset, target tagset, function).
converters.add('opencorpora-int', 'opencorpora-ext', internal_to_external)
converters.add('opencorpora-ext', 'opencorpora-int', external_to_internal)
converters.add('opencorpora-ext', 'aot', to_aot)
def internal_to_external(internal_tag):
    """Run ``_translate_tag`` on ``internal_tag`` using ``INTERNAL_TO_EXTERNAL``."""
    return _translate_tag(internal_tag, INTERNAL_TO_EXTERNAL)


def to_aot(open_tag):
    """Map an external-format OpenCorpora tag string onto an AOT tag string.

    ``open_tag`` is split on commas (spaces count as commas).  The head
    item is the part of speech and the tail items are grammemes; both go
    through ``EXTERNAL_TO_AOT`` and unmapped grammemes are discarded.

    Returns ``''`` if the part of speech has no AOT counterpart, otherwise
    the part of speech and grammemes joined with commas.
    """
    tokens = open_tag.replace(" ", ',').split(',')
    open_pos, open_info = tokens[0], tokens[1:]

    looked_up = (EXTERNAL_TO_AOT.get(token) for token in open_info)
    result_info = [token for token in looked_up if token is not None]

    result_pos = EXTERNAL_TO_AOT.get(open_pos)
    if open_pos == 'ПРИЛ':
        # The adjective subtype decides the AOT part of speech.
        if 'мест-п' in open_info:
            result_pos = 'МС-П'
        elif 'числ-п' in open_info:
            result_pos = 'ЧИСЛ-П'

    # 'пвл' in the mapped grammemes additionally yields '2л'.
    if 'пвл' in result_info:
        result_info.append('2л')

    if result_pos:
        return ','.join([result_pos] + result_info)
    return ''


# Registration with the shared ``converters`` object
# (arguments look like: source tagset, target tagset, function).
converters.add('opencorpora-int', 'opencorpora-ext', internal_to_external)
converters.add('opencorpora-ext', 'opencorpora-int', external_to_internal)
converters.add('opencorpora-ext', 'aot', to_aot)
for categ, gmap in sorted(self.GRAM_MAP.items()): if gram in gmap: if categ == '_POS': self.pos = gmap[gram] match = True else: self.grammemes[categ] = gmap[gram] match = True if not match: self.unmatched.add(gram) def _fill_from_oc(self, oc_tag): grams = oc_tag.replace(' ', ',').split(',') for g in grams: self._fill_one_gram_oc(g) self._postprocess() def __str__(self): grams = '|'.join("{}={}".format(c, v) for c, v in sorted(self.grammemes.items())) return "{} {}".format(self.pos, grams if grams else '_') def to_ud14(oc_tag, word=None): tag = Tag(oc_tag, word) return str(tag) converters.add('opencorpora-int', 'ud14', to_ud14)
else: info.discard("дст") info.discard("стр") if pos == "ИНФИНИТИВ": extra_info.add("inf") elif pos == "ДЕЕПРИЧАСТИЕ": extra_info.add("ger") new_form = (GRAMINFO_MAP[attr] for attr in info if attr in GRAMINFO_MAP) return ",".join(itertools.chain([POS[pos]], extra_info, new_form)) def to_aot(dialog_tag): pos, info = aot.split_tag(dialog_tag) new_form = (GRAMINFO_MAP_INV[tag] for tag in info if tag in GRAMINFO_MAP_INV) new_pos = POS_INV[pos] if pos == "V": if "inf" in info: new_pos = "ИНФИНИТИВ" elif "partcp" in info: new_pos = "ПРИЧАСТИЕ" elif "ger" in info: new_pos = "ДЕЕПРИЧАСТИЕ" return ",".join(itertools.chain([new_pos], new_form)) converters.add("dialog2010", "aot", to_aot) converters.add("aot", "dialog2010", from_aot)
# ====== 13. degree of comparison ====== if tag.degree_of_comparison == '2': info.add('сравн') elif tag.degree_of_comparison == '3': info.add('прев') if tag.POS == 'Dg' and tag.degree_of_comparison == '2': # hack? pos = 'П' # 14. negation # ====== 15. voice ========== if tag.voice == 'A': info.add('дст') elif tag.voice == 'P': info.add('стр') elif tag.mainPOS == 'V' and tag.tense != 'F': # hack? info.add('дст') # ====== 16. variant ======== if tag.variant in ['2', '3']: info.add('арх') elif tag.variant in ['5', '6', '7']: info.add('разг') elif tag.variant == '8': info.add('аббр') return ",".join([pos] + list(info)) converters.add('positional', 'aot', from_positional) converters.add('aot', 'positional', to_positional)
return False def from_opencorpora_int(open_tag): """ Convert OpenCorpora tag to www.ruscorpora.com tag:: >>> print(from_opencorpora_int('NOUN,inan,masc sing,nomn')) S,inan,m=sg,nom """ # Whitespace is replaced with ",|," # then "|" is treated as token and replaced with "=", # then commas around "=" are removed in result. # This way space is converted to "=". grammeme_list = open_tag.replace(' ', ',|,').split(',') if _is_initials(grammeme_list): return 'INIT=abbr' result = rule_engine.apply_rules(FROM_OPENCORPORA, grammeme_list) result = ','.join(result).replace(',=,', '=').replace(',=', '') if result == '': return 'NONLEX' return result converters.add('opencorpora-int', 'ruscorpora', from_opencorpora_int)
else: info.discard('дст') info.discard('стр') if pos == 'ИНФИНИТИВ': extra_info.add('inf') elif pos == 'ДЕЕПРИЧАСТИЕ': extra_info.add('ger') new_form = (GRAMINFO_MAP[attr] for attr in info if attr in GRAMINFO_MAP) return ",".join(itertools.chain([POS[pos]], extra_info, new_form)) def to_aot(dialog_tag, word=None): pos, info = aot.split_tag(dialog_tag) new_form = (GRAMINFO_MAP_INV[tag] for tag in info if tag in GRAMINFO_MAP_INV) new_pos = POS_INV[pos] if pos == 'V': if 'inf' in info: new_pos = 'ИНФИНИТИВ' elif 'partcp' in info: new_pos = 'ПРИЧАСТИЕ' elif 'ger' in info: new_pos = 'ДЕЕПРИЧАСТИЕ' return ",".join(itertools.chain([new_pos], new_form)) converters.add('dialog2010', 'aot', to_aot) converters.add('aot', 'dialog2010', from_aot)
for g in grams: self._fill_one_gram_oc(g) self._postprocess() def __str__(self): grams = '|'.join("{}={}".format(c, v) for c, v in sorted(self.grammemes.items())) return "{} {}".format(self.pos, grams if grams else '_') class Tag20(Tag14): GRAM_MAP = deepcopy(Tag14.GRAM_MAP) # http://universaldependencies.org/v2/postags.html GRAM_MAP['_POS']['CONJ'] = 'CCONJ' # http://universaldependencies.org/v2/features.html GRAM_MAP['VerbForm']['GRND'] = 'Conv' GRAM_MAP['Abbr'] = {'Abbr': 'Yes'} def to_ud14(oc_tag, word=None): tag = Tag14(oc_tag) return str(tag) def to_ud20(oc_tag, word=None): tag = Tag20(oc_tag) return str(tag) converters.add('opencorpora-int', 'ud14', to_ud14) converters.add('opencorpora-int', 'ud20', to_ud20)
self._fill_one_gram_mystem(g) self._postprocess() def __str__(self): grams = '|'.join("{}={}".format(c, v) for c, v in sorted(self.grammemes.items())) return "{} {}".format(self.pos, grams if grams else '_') class Tag20(Tag14): GRAM_MAP = deepcopy(Tag14.GRAM_MAP) # http://universaldependencies.org/v2/postags.html GRAM_MAP['_POS']['CONJ'] = 'CCONJ' # http://universaldependencies.org/v2/features.html GRAM_MAP['VerbForm']['деепр'] = 'Conv' GRAM_MAP['Abbr'] = {'сокр': 'Yes'} def to_ud14(tag, word=None): tag = Tag14(tag) return str(tag) def to_ud20(tag, word=None): tag = Tag20(tag) return str(tag) converters.add('mystem', 'ud14', to_ud14) converters.add('mystem', 'ud20', to_ud20)