def get_parts_of_speech(word):
    """
    Return the distinct parts of speech the given Russian word may belong to.

    Every parse produced by the module-level ``morph`` analyzer contributes
    its ``tag.POS`` value; no score threshold is applied. Each value is
    reported once, in the order it is first encountered among the parses.
    The value is kept as-is, whatever the analyzer reports for it.

    >>> get_parts_of_speech('рогалик')
    ['NOUN']
    >>> get_parts_of_speech('постовой')
    ['ADJF', 'NOUN']
    >>> 'NOUN' in get_parts_of_speech('правил')
    True
    >>> 'ADJF' in get_parts_of_speech('правил')
    False

    :param word: a Russian word
    :return: list of pymorphy2 POS enums, without repetitions.
    """
    tags_in_parse_order = (parse.tag.POS for parse in morph.parse(word))
    # dict keys keep insertion order, which gives an order-preserving
    # de-duplication in a single pass.
    return list(dict.fromkeys(tags_in_parse_order))
def get_initial_forms(form: str, part_filter=None)->list:
    """
    Gets all possible initial forms (there are several of them sometimes)
    of a given word. Optional argument part_filter allows to prune
    unnecessary ambiguity with part of speech.

    >>> get_initial_forms('Дядя')
    ['дядя']
    >>> get_initial_forms('самых')
    ['самый']
    >>> get_initial_forms('честных')
    ['честной', 'честный']
    >>> get_initial_forms('правил')
    ['правило', 'править']
    >>> get_initial_forms('правил', 'NOUN')
    ['правило']
    >>> get_initial_forms('правил', ['VERB'])
    ['править']

    :param form: a russian word
    :param part_filter: something that supports the `in' operator: str, list,
        set etc. If it is a container, it should contain only Part-of-speech
        names according to pymorphy2 enumerations. A str is matched by
        substring, so a parse is kept when its POS name occurs inside the
        string. Parses that carry no part of speech (``tag.POS`` is ``None``)
        never match a str filter.
    :return: a list of possible initial forms of the given word in lowercase.
        It's guaranteed that there are no repetitions. Variants are listed in
        the order the analyzer yields the parses (pymorphy2 orders them by
        descending certainty).
    """
    filter_is_text = isinstance(part_filter, str)
    seen = set()
    initial_forms = []
    for parse in morph.parse(form):
        if part_filter is not None:
            pos = parse.tag.POS
            if pos is None and filter_is_text:
                # `None in "NOUN"` raises TypeError, so a parse without a part
                # of speech is treated as "does not match" for str filters.
                continue
            if pos not in part_filter:
                continue
        lemma = parse.normal_form
        if lemma not in seen:
            seen.add(lemma)
            initial_forms.append(lemma)
    return initial_forms
def get_valid_noun_initial_form(word: str) -> "str | None":
    """
    Return the initial form of the first parse of ``word`` that
    ``_is_valid_noun`` accepts.

    Parses are examined in the order the module-level ``morph`` analyzer
    yields them, and the search stops at the first accepted one.

    :param word: the word to analyse
    :return: ``normal_form`` of the first accepted parse, or ``None`` when no
        parse is accepted.
    """
    # Lazy generator: only the first accepted parse is ever materialised.
    accepted_forms = (
        parse.normal_form
        for parse in morph.parse(word)
        if _is_valid_noun(parse)
    )
    return next(accepted_forms, None)