Beispiel #1
0
 def __str__(self) -> str:
     """Return ``pref`` followed by every country, each preceded by a space.

     When ``countries`` is None the bare ``pref`` is returned unchanged.
     """
     if self.countries is None:
         return self.pref
     out = Utils.newStringIO(self.pref)
     # One write for the whole tail instead of one flushed print per item.
     out.write("".join(" {0}".format(country) for country in self.countries))
     return Utils.toStringStringIO(out)
Beispiel #2
0
 def __str__(self) -> str:
     """Return a debug representation: value, active flags, chars and source text.

     The string starts with ``value``; each flag that is set appends its
     label, an optional preposition follows, and the character info together
     with the source text closes the line.
     """
     out = Utils.newStringIO(self.value)
     # (flag, label) pairs in the order the labels must appear.
     labelled_flags = (
         (self.is_noun_phrase, " NounPrase"),
         (self.is_denomination, " Denom"),
         (self.is_in_dictionary, " InDictionary"),
         (self.is_after_conjunction, " IsAfterConjunction"),
         (self.is_std_tail, " IsStdTail"),
         (self.is_std_name, " IsStdName"),
         (self.is_ignored_part, " IsIgnoredPart"),
     )
     for is_set, label in labelled_flags:
         if is_set:
             out.write(label)
     if self.preposition is not None:
         out.write(" IsAfterPreposition '{0}'".format(self.preposition))
     out.write(" {0} ({1})".format(str(self.chars), self.getSourceText()))
     return Utils.toStringStringIO(out)
Beispiel #3
0
 def attach_url(t0: 'Token') -> 'UriItemToken':
     """Try to read a URL-like sequence of tokens starting at ``t0``.

     The text is assembled from a domain name, then either a ``:port`` number
     or (for "vk.com" followed by a hyphen) one content part, then any number
     of ``/part`` segments, then an optional ``?query`` and ``#fragment``.

     Args:
         t0: first token of the candidate URL.

     Returns:
         A new ``UriItemToken`` spanning ``t0`` .. the last consumed token,
         or None when no domain name is found or the assembled text contains
         no alphabetic character.
     """
     srv = UriItemToken.attach_domain_name(t0, True, False)
     if srv is None:
         return None
     txt = Utils.newStringIO(srv.value)
     last = srv.end_token
     follower = last.next0_
     if (follower is not None and follower.is_char(':')
             and isinstance(follower.next0_, NumberToken)):
         # Explicit port number after the host.
         last = follower.next0_
         txt.write(":{0}".format(last.value))
     elif (srv.value == "vk.com" and follower is not None
           and follower.is_hiphen and follower.next0_ is not None):
         # "vk.com-<something>": the part after the hyphen is stored as a path.
         last = follower.next0_
         part = UriItemToken.__attach_uri_content(last, ".-_+%", False)
         if part is not None:
             last = part.end_token
             txt.write("/{0}".format(part.value))
     # Path segments: each must be a '/' glued to the previous token.
     cur = last.next0_
     while (cur is not None and not cur.is_whitespace_before
            and cur.is_char('/')):
         if cur.is_whitespace_after:
             # Trailing slash: consume it and stop.
             last = cur
             break
         part = UriItemToken.__attach_uri_content(cur.next0_, ".-_+%", False)
         if part is None:
             last = cur
             break
         last = part.end_token
         txt.write("/{0}".format(part.value))
         cur = last.next0_
     # Query and fragment follow the same rule, differing only in the marker
     # character, the allowed punctuation and the output format.
     tail_rules = (('?', ".-_+%=&", "?{0}"), ('#', ".-_+%", "#{0}"))
     for marker, allowed, fmt in tail_rules:
         after = last.next0_
         if (after is None or not after.is_char(marker)
                 or after.is_whitespace_after or last.is_whitespace_after):
             continue
         part = UriItemToken.__attach_uri_content(after.next0_, allowed, False)
         if part is not None:
             last = part.end_token
             txt.write(fmt.format(part.value))
     # Reject results without a single letter.
     size = txt.tell()
     if not any(Utils.getCharAtStringIO(txt, k).isalpha() for k in range(size)):
         return None
     return UriItemToken._new2706(t0, last, Utils.toStringStringIO(txt))
Beispiel #4
0
 def __str__(self) -> str:
     """Return the term in its original letter case, followed by its word forms.

     "Null" is returned for a null or empty term. The case is restored from
     ``char_info``: all lower, first letter kept with the rest lowered, or
     only the last letter lowered. Word forms, when present, are appended
     as ", <form>".
     """
     if Utils.isNullOrEmpty(self.term):
         return "Null"
     term = self.term
     info = self.char_info
     if info.is_all_lower:
         text = term.lower()
     elif info.is_capital_upper and len(term) > 0:
         text = "{0}{1}".format(term[0], term[1:].lower())
     elif info.is_last_lower:
         text = "{0}{1}".format(term[:-1], term[-1:].lower())
     else:
         text = term
     if self.word_forms is None:
         return text
     out = Utils.newStringIO(text)
     out.write("".join(", {0}".format(str(form)) for form in self.word_forms))
     return Utils.toStringStringIO(out)
Beispiel #5
0
 def __str__(self) -> str:
     """Return the caption (or the name) followed by the multiplicity, if any.

     The multiplicity is written as ``[lo..*]`` when the upper bound is 0,
     ``[n]`` when both bounds are equal, and ``[lo..hi]`` otherwise. Nothing
     is appended when neither bound is positive.
     """
     out = Utils.newStringIO(Utils.ifNotNull(self.caption, self.name))
     hi = self.upper_bound
     lo = self.lower_bound
     if hi > 0 or lo > 0:
         if hi == 0:
             bounds = "[{0}..*]".format(lo)
         elif hi == lo:
             bounds = "[{0}]".format(hi)
         else:
             bounds = "[{0}..{1}]".format(lo, hi)
         out.write(bounds)
     return Utils.toStringStringIO(out)
Beispiel #6
0
 def to_string(self,
               short_variant: bool,
               lang: 'MorphLang' = None,
               lev: int = 0) -> str:
     """Render the measure by filling its template with the value slots.

     Every digit ``1``..``9`` in ``template`` that has a matching entry among
     the ATTR_VALUE slots is replaced by that value (a "NaN" string value is
     shown as "?"). The units text is appended afterwards.

     Args:
         short_variant: when False, the name, referenced measures and the
             measure kind are appended as well.
         lang: language passed through to nested ``to_string`` calls and to
             ``out_units``.
         lev: not used by this method.

     Returns:
         The rendered text.
     """
     out = Utils.newStringIO(self.template)
     values = []
     for slot in self.slots:
         if slot.type_name != MeasureReferent.ATTR_VALUE:
             continue
         if isinstance(slot.value, str):
             text = Utils.asObjectOrNull(slot.value, str)
             values.append("?" if text == "NaN" else text)
         elif isinstance(slot.value, Referent):
             values.append(slot.value.to_string(True, lang, 0))
     # Walk backwards so that inserted text never shifts unvisited positions.
     for pos in reversed(range(out.tell())):
         ch = Utils.getCharAtStringIO(out, pos)
         if not ch.isdigit():
             continue
         index = ord(ch) - ord('1')
         if 0 <= index < len(values):
             Utils.removeStringIO(out, pos, 1)
             Utils.insertStringIO(out, pos, values[index])
     print(self.out_units(lang), end="", file=out)
     if short_variant:
         return Utils.toStringStringIO(out)
     name = self.get_string_value(MeasureReferent.ATTR_NAME)
     if name is not None:
         out.write(" - {0}".format(name))
     for slot in self.slots:
         if (slot.type_name == MeasureReferent.ATTR_REF
                 and isinstance(slot.value, MeasureReferent)):
             out.write(" / {0}".format(slot.value.to_string(True, lang, 0)))
     kind = self.kind
     if kind != MeasureKind.UNDEFINED:
         out.write(" ({0})".format(Utils.enumToString(kind).upper()))
     return Utils.toStringStringIO(out)
Beispiel #7
0
 def toString(self, short_variant : bool, lang : 'MorphLang'=None, lev : int=0) -> str:
     """Render the measure by filling its template with the value slots and units.

     Every digit ``1``..``9`` in ``template`` that has a matching entry among
     the ATTR_VALUE slots is replaced by that value. The first unit is
     appended as is; each further unit is joined with ``/`` when its power
     starts with '-' (followed by ``<power>`` unless the power is "-1") and
     with ``*`` otherwise.

     Args:
         short_variant: when False, the name, referenced measures and the
             measure kind are appended as well.
         lang: language passed through to nested ``toString`` calls.
         lev: not used by this method.

     Returns:
         The rendered text.
     """
     out = Utils.newStringIO(self.template)
     values = []
     for slot in self.slots:
         if slot.type_name != MeasureReferent.ATTR_VALUE:
             continue
         if isinstance(slot.value, str):
             values.append(Utils.asObjectOrNull(slot.value, str))
         elif isinstance(slot.value, Referent):
             values.append(slot.value.toString(True, lang, 0))
     # Walk backwards so that inserted text never shifts unvisited positions.
     for pos in reversed(range(out.tell())):
         ch = Utils.getCharAtStringIO(out, pos)
         if not ch.isdigit():
             continue
         index = ord(ch) - ord('1')
         if 0 <= index < len(values):
             Utils.removeStringIO(out, pos, 1)
             Utils.insertStringIO(out, pos, values[index])
     units = self.units
     if len(units) > 0:
         print(units[0].toString(True, lang, 0), end="", file=out)
         for k in range(1, len(units)):
             unit = units[k]
             power = unit.getStringValue(UnitReferent.ATTR_POW)
             if not Utils.isNullOrEmpty(power) and power[0] == '-':
                 # Negative power: the unit goes into the denominator.
                 out.write("/{0}".format(unit.toString(True, lang, 1)))
                 if power != "-1":
                     out.write("<{0}>".format(power[1:]))
             else:
                 out.write("*{0}".format(unit.toString(True, lang, 0)))
     if short_variant:
         return Utils.toStringStringIO(out)
     name = self.getStringValue(MeasureReferent.ATTR_NAME)
     if name is not None:
         out.write(" - {0}".format(name))
     for slot in self.slots:
         if slot.type_name == MeasureReferent.ATTR_REF and isinstance(slot.value, MeasureReferent):
             out.write(" / {0}".format(slot.value.toString(True, lang, 0)))
     kind = self.kind
     if kind != MeasureKind.UNDEFINED:
         out.write(" ({0})".format(Utils.enumToString(kind).upper()))
     return Utils.toStringStringIO(out)
Beispiel #8
0
 def __str__(self) -> str:
     """Return the normal forms, the base-class text, misc info and the undef coefficient.

     Layout: ``normal_case`` (empty when None), then a backslash and
     ``normal_full`` when it differs from ``normal_case``; a space separator
     if anything was written; the parent ``__str__``; the ``misc`` text when
     non-empty; and " (? N)" when ``undef_coef`` is positive.
     """
     out = Utils.newStringIO(Utils.ifNotNull(self.normal_case, ""))
     full = self.normal_full
     if full is not None and full != self.normal_case:
         out.write("\\{0}".format(full))
     if out.tell() > 0:
         out.write(' ')
     print(super().__str__(), end="", file=out)
     misc_text = str(self.misc) if self.misc is not None else None
     if not Utils.isNullOrEmpty(misc_text):
         out.write(" {0}".format(misc_text))
     if self.undef_coef > 0:
         out.write(" (? {0})".format(self.undef_coef))
     return Utils.toStringStringIO(out)
Beispiel #9
0
 def correct_word_by_morph(self, word: str) -> str:
     """Try to repair a misspelled word with a single-character edit.

     Three kinds of edit are tried in turn, each stage only if the previous
     ones produced nothing: replacing one character with '*', inserting '*'
     before a character, and deleting one character. Position 0 is never
     edited. Every edited pattern is looked up with ``__check_corr_var``
     starting from ``m_root``.

     Args:
         word: the word to correct.

     Returns:
         The single distinct variant found, or None when there are no
         variants or more than one.
     """
     buf = Utils.newStringIO(len(word))
     found = list()

     def try_edits(positions, edit) -> None:
         # Reset the buffer to ``word``, apply one edit, look the pattern up.
         for pos in positions:
             Utils.setLengthStringIO(buf, 0)
             buf.write(word)
             edit(pos)
             hit = self.__check_corr_var(Utils.toStringStringIO(buf),
                                         self.m_root, 0)
             if hit is not None and hit not in found:
                 found.append(hit)

     try_edits(range(1, len(word)),
               lambda pos: Utils.setCharAtStringIO(buf, pos, '*'))
     if len(found) == 0:
         try_edits(range(1, len(word)),
                   lambda pos: Utils.insertStringIO(buf, pos, '*'))
     if len(found) == 0:
         try_edits(range(1, len(word) - 1),
                   lambda pos: Utils.removeStringIO(buf, pos, 1))
     return found[0] if len(found) == 1 else None
 def transliteral_correction(value: str,
                             prev_value: str,
                             always: bool = False) -> str:
     """Transliteral correction of a word that mixes Cyrillic and Latin letters.

     The letters of ``value`` are counted by script, a conversion direction
     (to Cyrillic or to Latin) is chosen, and the letters found in
     ``_m_lat_chars`` / ``_m_cyr_chars`` (two tables indexed in parallel) are
     swapped accordingly. Independently of the direction, the pair 'Ь' + 'I'
     is merged into 'Ы' and characters in the range U+0300..U+036F are
     removed.

     Args:
         value(str): the word to correct.
         prev_value(str): the preceding word; its first character breaks the
             tie when ``value`` contains only convertible letters of both
             scripts. May be null or empty.
         always(bool): when True, conversion is attempted even if ``value``
             has no letter that belongs unambiguously to one script.

     Returns:
         str: the corrected string, or ``value`` unchanged when it contains a
         character that is neither a letter nor handled specially, when it
         holds unambiguous letters of both scripts, or when nothing needs
         correcting.
     """
     # Counters: letters outside the conversion tables ("pure") and letters
     # inside them ("ques"), per script; udar_cyr counts chars from __m_udar_chars.
     pure_cyr = 0
     pure_lat = 0
     ques_cyr = 0
     ques_lat = 0
     udar_cyr = 0
     y = False  # set when a 'Ь' immediately followed by 'I' is seen
     udaren = False  # set when a non-letter flagged is_udaren is seen
     i = 0
     # Emulation of a C-style "for" loop so that "continue" still increments i.
     first_pass2897 = True
     while True:
         if first_pass2897: first_pass2897 = False
         else: i += 1
         if (not (i < len(value))): break
         ch = value[i]
         ui = UnicodeInfo.ALL_CHARS[ord(ch)]
         if (not ui.is_letter):
             if (ui.is_udaren):
                 udaren = True
                 continue
             if (ui.is_apos and len(value) > 2):
                 # Drop every occurrence of this apostrophe-like char and retry.
                 return LanguageHelper.transliteral_correction(
                     value.replace("{0}".format(ch), ""), prev_value, False)
             # Any other non-letter: leave the word untouched.
             return value
         if (ui.is_cyrillic):
             if (LanguageHelper._m_cyr_chars.find(ch) >= 0):
                 ques_cyr += 1
             else:
                 pure_cyr += 1
         elif (ui.is_latin):
             if (LanguageHelper._m_lat_chars.find(ch) >= 0):
                 ques_lat += 1
             else:
                 pure_lat += 1
         elif (LanguageHelper.__m_udar_chars.find(ch) >= 0):
             udar_cyr += 1
         else:
             return value
         if (ch == 'Ь' and ((i + 1) < len(value)) and value[i + 1] == 'I'):
             y = True
     # Choose the conversion direction.
     to_rus = False
     to_lat = False
     if (pure_lat > 0 and pure_cyr > 0):
         # Unambiguous letters of both scripts: nothing can be decided.
         return value
     if (((pure_lat > 0 or always)) and ques_cyr > 0):
         to_lat = True
     elif (((pure_cyr > 0 or always)) and ques_lat > 0):
         to_rus = True
     elif (pure_cyr == 0 and pure_lat == 0):
         if (ques_cyr > 0 and ques_lat > 0):
             # Only convertible letters of both scripts: first look at the
             # script of the previous word, then fall back to the majority.
             if (not Utils.isNullOrEmpty(prev_value)):
                 if (LanguageHelper.is_cyrillic_char(prev_value[0])):
                     to_rus = True
                 elif (LanguageHelper.is_latin_char(prev_value[0])):
                     to_lat = True
             if (not to_lat and not to_rus):
                 if (ques_cyr > ques_lat):
                     to_rus = True
                 elif (ques_cyr < ques_lat):
                     to_lat = True
     if (not to_rus and not to_lat):
         if (not y and not udaren and udar_cyr == 0):
             return value
     # Rewrite pass over a mutable copy of the word.
     tmp = Utils.newStringIO(value)
     i = 0
     # Same "for"-loop emulation as above.
     first_pass2898 = True
     while True:
         if first_pass2898: first_pass2898 = False
         else: i += 1
         if (not (i < tmp.tell())): break
         if (Utils.getCharAtStringIO(tmp, i) == 'Ь'
                 and ((i + 1) < tmp.tell())
                 and Utils.getCharAtStringIO(tmp, i + 1) == 'I'):
             # 'Ь' + 'I' becomes the single letter 'Ы'.
             Utils.setCharAtStringIO(tmp, i, 'Ы')
             Utils.removeStringIO(tmp, i + 1, 1)
             continue
         cod = ord(Utils.getCharAtStringIO(tmp, i))
         if (cod >= 0x300 and (cod < 0x370)):
             # NOTE(review): after removing the char at i the loop still
             # advances i, so the character that moves into position i appears
             # to be skipped (not converted, not removed) - confirm whether
             # this is intended.
             Utils.removeStringIO(tmp, i, 1)
             continue
         if (to_rus):
             ii = LanguageHelper._m_lat_chars.find(
                 Utils.getCharAtStringIO(tmp, i))
             if (ii >= 0):
                 Utils.setCharAtStringIO(tmp, i,
                                         LanguageHelper._m_cyr_chars[ii])
             else:
                 ii = LanguageHelper.__m_udar_chars.find(
                     Utils.getCharAtStringIO(tmp, i))
                 if (((ii)) >= 0):
                     Utils.setCharAtStringIO(
                         tmp, i, LanguageHelper.__m_udar_cyr_chars[ii])
         elif (to_lat):
             ii = LanguageHelper._m_cyr_chars.find(
                 Utils.getCharAtStringIO(tmp, i))
             if (ii >= 0):
                 Utils.setCharAtStringIO(tmp, i,
                                         LanguageHelper._m_lat_chars[ii])
         else:
             # No direction chosen: only map chars from __m_udar_chars to
             # their counterparts in __m_udar_cyr_chars.
             ii = LanguageHelper.__m_udar_chars.find(
                 Utils.getCharAtStringIO(tmp, i))
             if (ii >= 0):
                 Utils.setCharAtStringIO(
                     tmp, i, LanguageHelper.__m_udar_cyr_chars[ii])
     return Utils.toStringStringIO(tmp)
 def __str__(self) -> str:
     """Return the referent text ("Null" when absent) followed by the morph info."""
     if self.referent is None:
         head = "Null"
     else:
         head = str(self.referent)
     out = Utils.newStringIO(head)
     if self.morph is not None:
         out.write(" {0}".format(str(self.morph)))
     return Utils.toStringStringIO(out)
Beispiel #12
0
 def __str__(self) -> str:
     """Return the term followed by ", <item>" for every morph item."""
     out = Utils.newStringIO(self.term)
     out.write("".join(", {0}".format(str(item)) for item in self.morph.items))
     return Utils.toStringStringIO(out)
Beispiel #13
0
 def __doCrLfCorrection(self, txt: str) -> str:
     """Analyse and repair text that was force-formatted with hard line breaks.

     A first pass gathers statistics on lines that look like wrapped
     paragraph text. If there are enough of them and their average length is
     plausible, a second pass replaces the line breaks that end such lines
     with spaces and counts every replacement in
     ``self.crlf_corrected_count``.

     Args:
         txt(str): the source text.

     Returns:
         str: ``txt`` unchanged when fewer than 4 candidate lines are found
         or their average length is outside 50..100; otherwise the corrected
         text.
     """
     cou = 0  # number of candidate (wrapped-looking) lines
     total_len = 0  # summed length of the candidate lines
     i = 0
     # Emulation of a C-style "for" loop so that "continue" still increments i.
     first_pass3166 = True
     while True:
         if first_pass3166: first_pass3166 = False
         else: i += 1
         if (not (i < len(txt))): break
         ch = txt[i]
         # Only positions holding CR or LF start a measurement.
         if ((ord(ch)) != 0xD and (ord(ch)) != 0xA):
             continue
         len0_ = 0
         last_char = ch
         # Measure the text up to the next CR/LF; a tab counts as 5 characters.
         j = (i + 1)
         while j < len(txt):
             ch = txt[j]
             if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                 break
             elif ((ord(ch)) == 0x9):
                 len0_ += 5
             else:
                 last_char = ch
                 len0_ += 1
             j += 1
         if (j >= len(txt)):
             # The last line has no terminating break: stop collecting.
             break
         if (len0_ < 30):
             continue
         if (last_char != '.' and last_char != ':' and last_char != ';'):
             # Lines whose continuation starts with a digit are not counted.
             next_is_dig = False
             k = j + 1
             while k < len(txt):
                 if (not Utils.isWhitespace(txt[k])):
                     if (str.isdigit(txt[k])):
                         next_is_dig = True
                     break
                 k += 1
             if (not next_is_dig):
                 cou += 1
                 total_len += len0_
         i = j
     if (cou < 4):
         return txt
     # From here on total_len holds the average candidate line length.
     total_len = math.floor(total_len / cou)
     if ((total_len < 50) or total_len > 100):
         return txt
     # Second pass: edit the breaks in a mutable copy of the text.
     tmp = Utils.newStringIO(txt)
     i = 0
     while i < tmp.tell():
         ch = Utils.getCharAtStringIO(tmp, i)
         len0_ = 0
         last_char = ch
         # Find the next CR/LF (index j) and measure the text before it.
         j = (i + 1)
         while j < tmp.tell():
             ch = Utils.getCharAtStringIO(tmp, j)
             if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                 break
             elif ((ord(ch)) == 0x9):
                 len0_ += 5
             else:
                 last_char = ch
                 len0_ += 1
             j += 1
         if (j >= tmp.tell()):
             break
         # last_char becomes the nearest non-whitespace character before the
         # break (searching back to the start of the buffer).
         for jj in range(j - 1, -1, -1):
             last_char = Utils.getCharAtStringIO(tmp, jj)
             if (not Utils.isWhitespace(last_char)):
                 break
         else:
             jj = -1
         # not_single: another CR/LF follows before any non-whitespace
         # character. The LF of a CR+LF pair at j is skipped first.
         not_single = False
         jj = (j + 1)
         if ((jj < tmp.tell())
                 and (ord(Utils.getCharAtStringIO(tmp, j))) == 0xD
                 and (ord(Utils.getCharAtStringIO(tmp, jj))) == 0xA):
             jj += 1
         while jj < tmp.tell():
             ch = Utils.getCharAtStringIO(tmp, jj)
             if (not Utils.isWhitespace(ch)):
                 break
             if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                 not_single = True
                 break
             jj += 1
         # Join the line with the next one when the break is single, the line
         # length is close to the average and the line does not end a sentence.
         if (((not not_single and len0_ >
               (total_len - 20) and (len0_ < (total_len + 10)))
              and last_char != '.' and last_char != ':')
                 and last_char != ';'):
             Utils.setCharAtStringIO(tmp, j, ' ')
             self.crlf_corrected_count += 1
             if ((j + 1) < tmp.tell()):
                 ch = Utils.getCharAtStringIO(tmp, j + 1)
                 if ((ord(ch)) == 0xA):
                     # Also blank out an LF that directly follows the break.
                     Utils.setCharAtStringIO(tmp, j + 1, ' ')
                     j += 1
         # Net effect of the two statements below: continue from index j.
         i = (j - 1)
         i += 1
     return Utils.toStringStringIO(tmp)
Beispiel #14
0
 def __calcTransliteralStatistics(txt: str, info: io.StringIO) -> int:
     """Run the transliteral correction routine on a copy of ``txt``.

     Args:
         txt: the text to analyse; None yields 0 without any processing.
         info: buffer handed through to ``__doTransliteralCorrection``.

     Returns:
         The value returned by ``__doTransliteralCorrection``, or 0 for None input.
     """
     if txt is not None:
         return SourceOfAnalysis.__doTransliteralCorrection(
             Utils.newStringIO(txt), info)
     return 0