def get_wordform(word : str, morph_info : 'MorphBaseInfo') -> str:
    """Return the spelling of ``word`` inflected according to ``morph_info``.

    Args:
        word(str): the word to inflect
        morph_info(MorphBaseInfo): the morphological information to apply

    Returns:
        str: the generated word form, or ``word`` unchanged when either
            argument is missing or the engine returns nothing
    """
    if morph_info is None or Utils.isNullOrEmpty(word):
        return word
    word_class = morph_info.class0_
    if word_class.is_undefined:
        # No class supplied: look the word itself up to obtain one.
        base_info = MorphologyService.get_word_base_info(word, None, False, False)
        if base_info is not None:
            word_class = base_info.class0_
    # The engine is queried with the upper-cased word as soon as the input
    # contains at least one lower-case character.
    query = word.upper() if any(str.islower(ch) for ch in word) else word
    form_info = Utils.asObjectOrNull(morph_info, MorphWordForm)
    form = MorphologyService.__m_morph.get_wordform(
        query, word_class, morph_info.gender, morph_info.case_,
        morph_info.number, morph_info.language, form_info)
    return word if Utils.isNullOrEmpty(form) else form
def can_be_equals(rus_: str, lat_: str) -> bool:
    """Check whether a Cyrillic string and a Latin string can correspond.

    Both strings are upper-cased and matched prefix by prefix with a
    backtracking search over the candidate lists returned by
    ``RusLatAccord.__get_vars_pref``.

    Args:
        rus_(str): the Cyrillic spelling
        lat_(str): the Latin spelling

    Returns:
        bool: True when some chain of candidates covers both strings
            completely, False otherwise (including for empty input)
    """
    if Utils.isNullOrEmpty(rus_) or Utils.isNullOrEmpty(lat_):
        return False
    rus_ = rus_.upper()
    lat_ = lat_.upper()
    first = RusLatAccord.__get_vars_pref(rus_, 0, lat_, 0)
    if first is None:
        return False
    # One entry per matched position; the head element of every entry is the
    # candidate currently selected at that position.
    levels = [first]
    while len(levels) > 0:
        rus_pos = 0
        lat_pos = 0
        for level in levels:
            chosen = level[0]
            rus_pos += len(chosen.rus)
            lat_pos += len(chosen.lat)
        if rus_pos >= len(rus_) and lat_pos >= len(lat_):
            return True
        deeper = RusLatAccord.__get_vars_pref(rus_, rus_pos, lat_, lat_pos)
        if deeper is not None:
            levels.append(deeper)
            continue
        # Dead end: discard the selected candidate of the newest level and
        # drop every level that has run out of candidates.
        while len(levels) > 0:
            del levels[-1][0]
            if len(levels[-1]) > 0:
                break
            levels.pop()
    return False
def check(self, prep: str, cas: 'MorphCase') -> bool:
    """Check whether a preposition with a case matches this question.

    Args:
        prep(str): the preposition
        cas(MorphCase): the case

    Returns:
        bool: True when they match, False otherwise
    """
    if self.is_abstract:
        # An abstract question is satisfied by any non-abstract item that
        # carries the same question and accepts the pair.
        for item in ControlModelQuestion.ITEMS:
            if item.is_abstract or not (item.question == self.question):
                continue
            if item.check(prep, cas):
                return True
        return False
    if (cas & self.case_).is_undefined:
        # The cases do not intersect: the only accepted combination is the
        # preposition "В" on an accusative question with an undefined or
        # nominative input case.
        if (self.preposition == "В" and prep == self.preposition
                and self.case_.is_accusative
                and (cas.is_undefined or cas.is_nominative)):
            return True
        return False
    if prep is not None and self.preposition is not None:
        if prep == self.preposition:
            return True
        if self.preposition == "ОТ" and prep == "ОТ ИМЕНИ":
            return True
    return Utils.isNullOrEmpty(prep) and Utils.isNullOrEmpty(self.preposition)
def check(self, prep: str, cas: 'MorphCase') -> bool:
    """Check whether a preposition with a case matches this question.

    Args:
        prep(str): the preposition
        cas(MorphCase): the case

    Returns:
        bool: False when ``cas`` shares nothing with ``self.case_``;
            otherwise whether the prepositions are equal, or both empty
            when either of them is None
    """
    if (cas & self.case_).is_undefined:
        return False
    if prep is None or self.preposition is None:
        return Utils.isNullOrEmpty(prep) and Utils.isNullOrEmpty(self.preposition)
    return prep == self.preposition
def initByNormalText(self, text: str, lang_: 'MorphLang' = None) -> None:
    """Fast initialisation without morphological variants.

    Only tokenisation is performed. Used to speed up dictionary work when the
    input strings are known in advance to be already normalised.

    Args:
        text(str): the already normalised source text
        lang_(MorphLang): the possible language
    """
    if Utils.isNullOrEmpty(text):
        return
    text = text.upper().replace("'", "")
    # Classify the text: letters only, letters and spaces, or anything else
    # (which requires the real tokenizer).
    has_space = False
    needs_tokenizer = False
    for ch in text:
        if str.isalpha(ch):
            continue
        if ch != ' ':
            needs_tokenizer = True
            break
        has_space = True
    if needs_tokenizer:
        tokens = Morphology.tokenize(text)
        if tokens is not None:
            for token in tokens:
                self.terms.append(Termin.Term(TextToken(token, None), False))
    elif has_space:
        for piece in Utils.splitString(text, ' ', False):
            if Utils.isNullOrEmpty(piece):
                continue
            word_token = TextToken(None, None)
            word_token.term = piece
            self.terms.append(Termin.Term(word_token, False))
    else:
        single = TextToken(None, None)
        single.term = text
        self.terms.append(Termin.Term(single, False))
    self.lang = MorphLang(lang_)
def getWordform(word: str, morph_info: 'MorphBaseInfo') -> str:
    """Return the spelling of ``word`` inflected according to ``morph_info``.

    Args:
        word(str): the word to inflect
        morph_info(MorphBaseInfo): the morphological information to apply

    Returns:
        str: the generated word form; when the engine returns None the
            fallback is the word as passed to the engine (upper-cased if it
            contained a lower-case character)
    """
    if morph_info is None or Utils.isNullOrEmpty(word):
        return word
    word_class = morph_info.class0_
    if word_class.is_undefined:
        # No class supplied: look the word itself up to obtain one.
        base_info = Morphology.getWordBaseInfo(word, None, False, False)
        if base_info is not None:
            word_class = base_info.class0_
    if any(str.islower(ch) for ch in word):
        word = word.upper()
    form = Morphology.__m_inner.getWordform(
        word, word_class, morph_info.gender, morph_info.case_,
        morph_info.number, morph_info.language,
        Utils.asObjectOrNull(morph_info, MorphWordForm))
    return Utils.ifNotNull(form, word)
def out_units(self, lang: 'MorphLang' = None) -> str:
    """Render only the units of measurement.

    The first unit is written as is. Every following unit is prefixed with
    "/" when its power starts with "-" (followed by "<power>" unless the
    power is exactly "-1"), and with "*" otherwise.

    Args:
        lang(MorphLang): the language

    Returns:
        str: the resulting string, empty when there are no units
    """
    units = self.units
    if len(units) == 0:
        return ""
    out = io.StringIO()
    out.write("{0}".format(units[0].to_string(True, lang, 0)))
    for index in range(1, len(units)):
        unit = units[index]
        power = unit.get_string_value(UnitReferent.ATTR_POW)
        if Utils.isNullOrEmpty(power) or power[0] != '-':
            out.write("*{0}".format(unit.to_string(True, lang, 0)))
            continue
        out.write("/{0}".format(unit.to_string(True, lang, 1)))
        if power != "-1":
            # Show the magnitude of the negative power without its sign.
            out.write("<{0}>".format(power[1:]))
    return Utils.toStringStringIO(out)
def initialize() -> None:
    """Load the sentiment dictionaries once and register the analyzer.

    Terms from "Positives.txt" get coefficient 1, terms from "Negatives.txt"
    get -1, and the intensifiers "ОЧЕНЬ" / "СИЛЬНО" get 0. Repeated calls are
    no-ops.
    """
    if SentimentAnalyzer.__m_inited:
        return
    SentimentAnalyzer.__m_inited = True
    MetaSentiment.initialize()
    Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = True
    try:
        try:
            for resource, coef in (("Positives.txt", 1), ("Negatives.txt", -1)):
                content = EpNerBusinessInternalResourceHelper.getString(resource)
                if content is None:
                    continue
                for raw_line in Utils.splitString(content, '\n', False):
                    line = raw_line.strip()
                    if Utils.isNullOrEmpty(line):
                        continue
                    SentimentAnalyzer.__m_termins.add(Termin._new117(line, coef))
        except Exception:
            # Dictionary loading is deliberately best-effort: a failure here
            # must not prevent the analyzer from being registered.
            pass
        for s in ["ОЧЕНЬ", "СИЛЬНО"]:
            SentimentAnalyzer.__m_termins.add(Termin._new117(s, 0))
    finally:
        # Always restore the global flag, even if adding the terms raised;
        # otherwise every Termin created later would be affected.
        Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = False
    ProcessorService.registerAnalyzer(SentimentAnalyzer())
def case_number(self, value_) -> str:
    """Store the case number, dropping one trailing "." or ",".

    An empty or None value is returned unchanged and nothing is stored.

    Args:
        value_: the case number text

    Returns:
        str: the value that was stored (or the untouched empty input)
    """
    if Utils.isNullOrEmpty(value_):
        return value_
    if value_[-1] in ".,":
        value_ = value_[:-1]
    self.add_slot(InstrumentReferent.ATTR_CASENUMBER, value_, True, 0)
    return value_
def try_parse(str0_: str, lang: 'MorphLang') -> bool:
    """Convert a string of language names into a language value.

    Args:
        str0_(str): the text to parse
        lang(MorphLang): output holder; the parsed result is written to
            ``lang.value``

    Returns:
        bool: False when the resulting ``lang.value`` reports
            ``is_undefined``, True otherwise
    """
    lang.value = MorphLang()
    while not Utils.isNullOrEmpty(str0_):
        # Find the first known name the remaining text starts with.
        index = 0
        while (index < len(MorphLang.__m_names)
               and not Utils.startsWithString(str0_, MorphLang.__m_names[index], True)):
            index += 1
        if index >= len(MorphLang.__m_names):
            break
        lang.value.value |= 1 << index
        # Skip two characters, then any non-letters, to reach the next name.
        pos = 2
        while pos < len(str0_) and not str.isalpha(str0_[pos]):
            pos += 1
        if pos >= len(str0_):
            break
        str0_ = str0_[pos:]
    if lang.value.is_undefined:
        return False
    return True
def get_language_for_text(text: str) -> str:
    """Detect the language of an unstructured text.

    Letters with code points in [0x400, 0x500) are counted as Russian and
    letters below 0x80 as English.

    Args:
        text(str): the text

    Returns:
        str: "ru", "en", or None when nothing could be determined
    """
    if Utils.isNullOrEmpty(text):
        return None
    ru_chars = 0
    en_chars = 0
    for ch in text:
        if not str.isalpha(ch):
            continue
        code = ord(ch)
        if 0x400 <= code < 0x500:
            ru_chars += 1
        elif code < 0x80:
            en_chars += 1
    if ru_chars > en_chars * 2 and ru_chars > 10:
        return "ru"
    if ru_chars > 0 and en_chars == 0:
        return "ru"
    return "en" if en_chars > 0 else None
def toString(self, short_variant: bool, lang: 'MorphLang' = None, lev: int = 0) -> str:
    """Build the display string of the unit: its name followed by its power.

    Args:
        short_variant(bool): take the name from ATTR_NAME slots when True,
            from ATTR_FULLNAME slots otherwise
        lang(MorphLang): preferred language; when given, the first pass only
            accepts names for which ``LanguageHelper.isCyrillic`` agrees
            with ``lang.is_ru``
        lev(int): when greater than zero the power is omitted

    Returns:
        str: the formatted unit; "?" stands in for a missing name
    """
    wanted = UnitReferent.ATTR_NAME if short_variant else UnitReferent.ATTR_FULLNAME
    nam = None
    # Pass 0 honours the requested language, pass 1 accepts any name.
    for attempt in range(2):
        for s in self.slots:
            if not (s.type_name == wanted):
                continue
            val = Utils.asObjectOrNull(s.value, str)
            if lang is not None and attempt == 0:
                if lang.is_ru != LanguageHelper.isCyrillic(val):
                    continue
            nam = val
            break
        if nam is not None:
            break
    if nam is None:
        nam = self.getStringValue(UnitReferent.ATTR_NAME)
    # A missing name used to be formatted as the literal text "None" whenever
    # a power was present; use the same "?" placeholder on every path.
    nam = Utils.ifNotNull(nam, "?")
    pow0_ = self.getStringValue(UnitReferent.ATTR_POW)
    if Utils.isNullOrEmpty(pow0_) or lev > 0:
        return nam
    if pow0_[0] != '-':
        res = "{0}{1}".format(nam, pow0_)
    else:
        res = "{0}<{1}>".format(nam, pow0_)
    if not short_variant and self.is_unknown:
        res = "(?)" + res
    return res
def parseDateTime(str0_ : str) -> datetime.datetime:
    """Parse a dot-separated "year[.month[.day]]" string into a datetime.

    A missing or non-positive month or day becomes 1, and a day past the end
    of the month is clamped to the last day of that month.

    Args:
        str0_(str): the text to parse

    Returns:
        datetime.datetime: midnight of the parsed date, or None when the
            input is empty, the year does not parse, or any error occurs
    """
    if Utils.isNullOrEmpty(str0_):
        return None
    try:
        parts = Utils.splitString(str0_, '.', False)
        year_ref = RefOutArgWrapper(0)
        year_ok = Utils.tryParseInt(parts[0], year_ref)
        year = year_ref.value
        if not year_ok:
            return None
        month = 0
        day = 0
        if len(parts) > 1:
            month_ref = RefOutArgWrapper(0)
            month_ok = Utils.tryParseInt(parts[1], month_ref)
            month = month_ref.value
            # The day is only read when the month parsed successfully.
            if month_ok and len(parts) > 2:
                day_ref = RefOutArgWrapper(0)
                Utils.tryParseInt(parts[2], day_ref)
                day = day_ref.value
        month = max(month, 1)
        day = min(max(day, 1), Utils.lastDayOfMonth(year, month))
        return datetime.datetime(year, month, day, 0, 0, 0)
    except Exception:
        # Any failure (bad index, out-of-range date, ...) means "no date".
        return None
def __getName(self, cyr : bool) -> str:
    """Pick the display name among the ATTR_NAME slots.

    The first pass only considers names whose first character matches the
    requested script; the second pass considers every name and is used only
    when the first pass found nothing.

    Args:
        cyr(bool): prefer Cyrillic names when True, non-Cyrillic otherwise

    Returns:
        str: the chosen name, or "?" when there is none
    """
    name = None
    for attempt in range(2):
        for s in self.slots:
            if not (s.type_name == GeoReferent.ATTR_NAME):
                continue
            v = str(s.value)
            if Utils.isNullOrEmpty(v):
                continue
            if attempt == 0 and bool(LanguageHelper.isCyrillicChar(v[0])) != bool(cyr):
                continue
            if name is None:
                name = v
            elif len(name) > len(v):
                # Take the shorter candidate unless it is very short while
                # the current name is short too, or the current name ends
                # with 'В'.
                keep_current = (len(v) < 4 and len(name) < 10) or name[-1] == 'В'
                if not keep_current:
                    name = v
            elif len(name) < 4 and 4 <= len(v) < 10:
                name = v
        if name is not None:
            break
    if name == "МОЛДОВА":
        name = "МОЛДАВИЯ"
    elif name == "БЕЛАРУСЬ":
        name = "БЕЛОРУССИЯ"
    return Utils.ifNotNull(name, "?")
def __calc_name(self, noplural: bool) -> None:
    """Score this link when both of its ends are noun phrases.

    Returns early, leaving the link untouched, when a preposition is set,
    when either end is not a noun phrase of type NOUN, when the source starts
    with an all-lower-case token, when the items are not adjacent (unless
    ``noplural`` is set), or when both cases are defined but do not
    intersect. Otherwise ``self.plural`` and ``self.coef`` are updated.

    Args:
        noplural(bool): relaxes the adjacency check and, for a plural source,
            adds the singular-target check below
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source line - confirm the extent of the final else branch.
    if (not Utils.isNullOrEmpty(self.from_prep)):
        return
    if (not (isinstance(self.from0_.source.source, NounPhraseToken)) or self.from0_.source.typ != SentItemType.NOUN):
        return
    if (self.from0_.source.begin_token.chars.is_all_lower):
        return
    if (not (isinstance(self.to.source.source, NounPhraseToken)) or self.to.source.typ != SentItemType.NOUN):
        return
    # The source must directly follow the target unless noplural is set.
    if (self.from0_.order != (self.to.order + 1) and not noplural):
        return
    fm = self.from0_.source.source.morph
    tm = self.to.source.source.morph
    if (not fm.case_.is_undefined and not tm.case_.is_undefined):
        if (((tm.case_) & fm.case_).is_undefined):
            return
    if (fm.number == MorphNumber.PLURAL):
        if (noplural):
            if (self.from_is_plural):
                pass
            elif (((tm.number) & (MorphNumber.SINGULAR)) != (MorphNumber.UNDEFINED)):
                # The target can be singular: reject the link.
                return
        self.plural = 1
        self.coef = SemanticService.PARAMS.verb_plural
    else:
        if (fm.number == MorphNumber.SINGULAR):
            self.plural = 0
        if (NGLink.__check_morph_accord(fm, False, tm)):
            self.coef = SemanticService.PARAMS.morph_accord
def __calc_actant(self) -> float:
    """Compute the coefficient of this link as a dependent of a verb.

    Returns:
        float: the value stored in ``self.coef``: -1 for a possible
            participle, 0 when there is no preposition value or the source
            case is undefined, a value derived from
            ``SemanticService.PARAMS.next_model`` when a derivate group
            allows the preposition with a compatible case, and 0.1
            otherwise. When the verb has no ``verb_morph``, -1 is returned
            without touching ``self.coef``.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source line - confirm that the undefined-case check belongs inside the
    # loop over derivate groups.
    if (self.can_be_participle):
        self.coef = -1
        return self.coef
    vf2 = self.to_verb.last_verb.verb_morph
    if (vf2 is None):
        return -1
    if (self.from_prep is None):
        self.coef = 0
        return self.coef
    fm = self.from0_.source.source.morph
    grs = DerivateService.find_derivates(Utils.ifNotNull(vf2.normal_full, vf2.normal_case), True, None)
    if (grs is not None):
        for gr in grs:
            # Only groups whose model lists this preposition are relevant.
            if (gr.cm.nexts is None or not self.from_prep in gr.cm.nexts):
                continue
            cas = gr.cm.nexts[self.from_prep]
            if (not ((cas) & fm.case_).is_undefined):
                self.coef = SemanticService.PARAMS.next_model
                if (Utils.isNullOrEmpty(self.from_prep)):
                    # Empty preposition: halve the score, and halve it again
                    # for a nominative source.
                    if (fm.case_.is_nominative):
                        self.coef /= (2)
                    self.coef /= (2)
                return self.coef
            if (self.from0_.source.source.morph.case_.is_undefined):
                self.coef = 0
                return self.coef
    self.coef = 0.1
    return self.coef
def tryAccordVar(self, v: 'MorphBaseInfo') -> bool:
    """Check whether the morphological variant ``v`` agrees with this item.

    Args:
        v(MorphBaseInfo): the variant to test

    Returns:
        bool: True when one of ``self.adj_morph`` accepts ``v`` or, for an
            item that can be a numeric adjective, when one of the
            number-specific rules below applies
    """
    if any(adj.checkAccord(v, False) for adj in self.adj_morph):
        return True
    if not self.can_be_numeric_adj:
        return False
    if v.number == MorphNumber.PLURAL:
        return True
    if isinstance(self.begin_token, NumberToken):
        number = self.begin_token
        val = number.int_value
        if val is None:
            return False
        num = number.value
        if Utils.isNullOrEmpty(num):
            return False
        last_digit = num[-1]
        # Values below 10 ending in 2, 3 or 4, and values above 20, agree
        # with the genitive case.
        small_paucal = last_digit in ('2', '3', '4') and val < 10
        if (small_paucal or val > 20) and v.case_.is_genitive:
            return True
    term = None
    if isinstance(v, MorphWordForm):
        term = v.normal_case
    if isinstance(v, NounPhraseItemTextVar):
        term = v.normal_value
    if term == "ЛЕТ" or term == "ЧЕЛОВЕК":
        return True
    return False
def get_language_for_text(text: str) -> str:
    """Detect the language of an unstructured text.

    Letters with code points in [0x400, 0x500) are counted as Russian and
    letters below 0x80 as English.

    Args:
        text(str): the text

    Returns:
        str: "ru", "en", or None when nothing could be determined
    """
    if Utils.isNullOrEmpty(text):
        return None
    letter_codes = [ord(ch) for ch in text if str.isalpha(ch)]
    ru_chars = sum(1 for code in letter_codes if 0x400 <= code < 0x500)
    en_chars = sum(1 for code in letter_codes if code < 0x80)
    mostly_russian = ru_chars > en_chars * 2 and ru_chars > 10
    only_russian = ru_chars > 0 and en_chars == 0
    if mostly_russian or only_russian:
        return "ru"
    if en_chars > 0:
        return "en"
    return None
def parse_date_time(str0_: str) -> datetime.datetime:
    """Parse a dot-separated "year[.month[.day]]" string into a datetime.

    A missing or non-positive month or day becomes 1, and a day past the end
    of the month is clamped to the last day of that month.

    Args:
        str0_(str): the text to parse

    Returns:
        datetime.datetime: midnight of the parsed date, or None when the
            input is empty, the year does not parse, or any error occurs
    """
    if Utils.isNullOrEmpty(str0_):
        return None
    try:
        pieces = Utils.splitString(str0_, '.', False)
        year_box = RefOutArgWrapper(0)
        if not Utils.tryParseInt(pieces[0], year_box):
            return None
        year = year_box.value
        month = 0
        day = 0
        if len(pieces) > 1:
            month_box = RefOutArgWrapper(0)
            month_parsed = Utils.tryParseInt(pieces[1], month_box)
            month = month_box.value
            if month_parsed:
                # The day is only read when the month parsed successfully.
                if len(pieces) > 2:
                    day_box = RefOutArgWrapper(0)
                    Utils.tryParseInt(pieces[2], day_box)
                    day = day_box.value
        if month <= 0:
            month = 1
        if day <= 0:
            day = 1
        last_day = Utils.lastDayOfMonth(year, month)
        if day > last_day:
            day = last_day
        return datetime.datetime(year, month, day, 0, 0, 0)
    except Exception:
        # Any failure (bad index, out-of-range date, ...) means "no date".
        pass
    return None
def house_type(self) -> 'AddressHouseType':
    """Return the house type stored in the ATTR_HOUSETYPE slot.

    Returns:
        AddressHouseType: the stored value converted to the enum, or
            ``AddressHouseType.HOUSE`` when the slot is empty or the
            conversion fails
    """
    raw = self.getStringValue(AddressReferent.ATTR_HOUSETYPE)
    if not Utils.isNullOrEmpty(raw):
        try:
            return Utils.valToEnum(raw, AddressHouseType)
        except Exception:
            # An unknown value falls back to the default below.
            pass
    return AddressHouseType.HOUSE
def __deserialize_morph_misc_info(str0_: 'ByteArrayWrapper', mi: 'MorphMiscInfo', pos: int) -> None:
    """Fill ``mi`` from the byte stream.

    Reads one short into ``mi._m_value``, then appends strings to
    ``mi.attrs`` until an empty (or None) string is read.

    Args:
        str0_(ByteArrayWrapper): the data source
        mi(MorphMiscInfo): the object to fill
        pos(int): the read position, passed through to the reader
    """
    mi._m_value = str0_.deserialize_short(pos)
    attr = str0_.deserialize_string(pos)
    while not Utils.isNullOrEmpty(attr):
        mi.attrs.append(attr)
        attr = str0_.deserialize_string(pos)
def __deserializeMorphMiscInfo(str0_: 'ByteArrayWrapper', mi: 'MorphMiscInfo') -> None:
    """Fill ``mi`` from the byte stream.

    Reads one short into ``mi._m_value``, then appends strings to
    ``mi.attrs`` until an empty (or None) string is read.

    Args:
        str0_(ByteArrayWrapper): the data source
        mi(MorphMiscInfo): the object to fill
    """
    mi._m_value = str0_.deserializeShort()
    attr = str0_.deserializeString()
    while not Utils.isNullOrEmpty(attr):
        mi.attrs.append(attr)
        attr = str0_.deserializeString()
def reg_number(self, value_) -> str:
    """Store the registration number, dropping one trailing "." or ",".

    An empty or None value stores None in the ATTR_REGNUMBER slot.

    Args:
        value_: the registration number text

    Returns:
        str: the value that was stored (or the untouched empty input)
    """
    if Utils.isNullOrEmpty(value_):
        stored = None
    else:
        if value_[-1] in ".,":
            value_ = value_[:-1]
        stored = value_
    self.addSlot(InstrumentReferent.ATTR_REGNUMBER, stored, True, 0)
    return value_
def _deserialize(self, str0_: 'ByteArrayWrapper', pos: int) -> None:
    """Restore this object from the byte stream.

    Reads one short into ``self.value``, then reads strings until an empty
    (or None) one is met, adding each to the attribute list once.

    Args:
        str0_(ByteArrayWrapper): the data source
        pos(int): the read position, passed through to the reader
    """
    self.value = str0_.deserialize_short(pos)
    attr = str0_.deserialize_string(pos)
    while not Utils.isNullOrEmpty(attr):
        if attr not in self.__m_attrs:
            self.__m_attrs.append(attr)
        attr = str0_.deserialize_string(pos)
def building_type(self) -> 'AddressBuildingType':
    """Return the building type stored in the ATTR_BUILDINGTYPE slot.

    Returns:
        AddressBuildingType: the stored value converted to the enum, or
            ``AddressBuildingType.BUILDING`` when the slot is empty or the
            conversion fails
    """
    raw = self.getStringValue(AddressReferent.ATTR_BUILDINGTYPE)
    if not Utils.isNullOrEmpty(raw):
        try:
            return Utils.valToEnum(raw, AddressBuildingType)
        except Exception:
            # An unknown value falls back to the default below.
            pass
    return AddressBuildingType.BUILDING
def check_geo_object_before(t: 'Token') -> bool:
    """Check whether the tokens before ``t`` indicate a geographic object.

    Args:
        t(Token): the token whose left context is inspected

    Returns:
        bool: True when a trigger word, a geo/address/street referent or a
            city-noun pattern is found immediately before ``t``
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source line - confirm against the formatted original.
    # Imported locally, as in the original (module-level import not used here).
    from pullenti.ner.geo.internal.CityItemToken import CityItemToken
    if (t is None):
        return False
    # Walk backwards from t.previous; the first_pass flag emulates a
    # for-loop whose step is "tt = tt.previous".
    tt = t.previous
    first_pass3156 = True
    while True:
        if first_pass3156:
            first_pass3156 = False
        else:
            tt = tt.previous
        if (not (tt is not None)):
            break
        # Punctuation, hyphens, "and", conjunctions and prepositions are
        # skipped.
        if ((tt.is_char_of(",.;:") or tt.is_hiphen or tt.is_and)
                or tt.morph.class0_.is_conjunction or tt.morph.class0_.is_preposition):
            continue
        if (tt.is_value("ТЕРРИТОРИЯ", "ТЕРИТОРІЯ")):
            continue
        # Trigger words return True immediately.
        if ((tt.is_value("ПРОЖИВАТЬ", "ПРОЖИВАТИ") or tt.is_value("РОДИТЬ", "НАРОДИТИ")
                or tt.is_value("ЗАРЕГИСТРИРОВАТЬ", "ЗАРЕЄСТРУВАТИ")) or tt.is_value("АДРЕС", None)):
            return True
        if (tt.is_value("УРОЖЕНЕЦ", "УРОДЖЕНЕЦЬ") or tt.is_value("УРОЖЕНКА", "УРОДЖЕНКА")):
            return True
        # A two-letter all-upper-case text token starting with 'Р' also counts.
        if (tt.length_char == 2 and (isinstance(tt, TextToken)) and tt.chars.is_all_upper):
            term = tt.term
            if (not Utils.isNullOrEmpty(term) and term[0] == 'Р'):
                return True
        # Any other token ends the walk; a referent token decides by the
        # type of its referent.
        rt = Utils.asObjectOrNull(tt, ReferentToken)
        if (rt is None):
            break
        if ((isinstance(rt.referent, GeoReferent)) or (isinstance(rt.referent, AddressReferent))
                or (isinstance(rt.referent, StreetReferent))):
            return True
        break
    # Second chance: a non-noun city item ending right before t, itself
    # preceded by a city noun (optionally separated by a '.').
    if (t.previous is not None and t.previous.previous is not None):
        cit2 = CityItemToken.try_parse(t.previous, None, False, None)
        if (cit2 is not None and cit2.typ != CityItemToken.ItemType.NOUN and cit2.end_token.next0_ == t):
            cit1 = CityItemToken.try_parse(t.previous.previous, None, False, None)
            if (cit1 is not None and cit1.typ == CityItemToken.ItemType.NOUN):
                return True
            if (cit1 is None and t.previous.previous.is_char('.') and t.previous.previous.previous is not None):
                tt = t.previous.previous.previous
                cit1 = CityItemToken.try_parse(tt, None, False, None)
                if (cit1 is not None and cit1.typ == CityItemToken.ItemType.NOUN):
                    return True
                if (tt.is_value("С", None) or tt.is_value("Д", None) or tt.is_value("ПОС", None)):
                    return True
    return False
def serialize_string(stream : Stream, val : str) -> None:
    """Write a length-prefixed UTF-8 string to the stream.

    The length prefix is -1 for None and 0 for an empty string; otherwise it
    is the number of encoded bytes, followed by the bytes themselves.

    Args:
        stream(Stream): the output stream
        val(str): the string to write
    """
    if val is None:
        SerializerHelper.serialize_int(stream, -1)
    elif Utils.isNullOrEmpty(val):
        SerializerHelper.serialize_int(stream, 0)
    else:
        payload = val.encode("UTF-8", 'ignore')
        size = len(payload)
        SerializerHelper.serialize_int(stream, size)
        stream.write(payload, 0, size)
def try_parse_double(val: str, f: float) -> bool:
    """Parse a floating-point number, accepting "," as the decimal mark.

    Args:
        val(str): the text to parse
        f: output holder; ``f.value`` is reset to 0 and then filled by
            ``Utils.tryParseFloat`` (despite the annotation it is used as a
            wrapper object, not as a plain float)

    Returns:
        bool: True when the text was parsed, False otherwise
    """
    f.value = 0
    if Utils.isNullOrEmpty(val):
        return False
    # First attempt: treat a comma as the decimal separator.
    normalized_ok = Utils.tryParseFloat(val.replace(',', '.'), f)
    if ',' in val and normalized_ok:
        return True
    # Second attempt: the text exactly as given.
    return True if Utils.tryParseFloat(val, f) else False
def find(self, key : str) -> 'Termin':
    """Look up the first termin stored under ``key``.

    A key starting with a Latin character is searched with MorphLang.EN;
    any other key with MorphLang.RU and, if that yields None, MorphLang.UA.

    Args:
        key(str): the key to search for

    Returns:
        Termin: the first match, or None when the key is empty or nothing
            was found
    """
    if Utils.isNullOrEmpty(key):
        return None
    if LanguageHelper.is_latin_char(key[0]):
        found = self.__find_in_tree(key, MorphLang.EN)
    else:
        found = self.__find_in_tree(key, MorphLang.RU)
        if found is None:
            found = self.__find_in_tree(key, MorphLang.UA)
    if found is None or len(found) == 0:
        return None
    return found[0]
def _addNumber(self, dt: 'DecreeToken') -> None:
    """Add the number (and, if present, the year) carried by a decree token.

    For a NUMBER token with a positive ``num_year`` the year is stored in
    ATTR_DATE. The token value, minus one trailing "." or ",", is stored in
    ATTR_NUMBER; an empty value stores nothing.

    Args:
        dt(DecreeToken): the token to take the number from
    """
    if dt.typ == DecreeToken.ItemType.NUMBER and dt.num_year > 0:
        self.addSlot(DecreeReferent.ATTR_DATE, str(dt.num_year), False, 0)
    if Utils.isNullOrEmpty(dt.value):
        return
    number = dt.value
    if number[-1] in ".,":
        number = number[:-1]
    self.addSlot(DecreeReferent.ATTR_NUMBER, number, False, 0)