def deserialize(self, stream: io.IOBase, all0_: typing.List['Referent'], sofa: 'SourceOfAnalysis') -> None:
    """Restore this referent's slots and text occurrences from a binary stream.

    Record layout (mirrors the matching serialize): type-name string, slot
    count, then per slot (name, count, id) where a negative id is a 1-based
    back reference into all0_ and a positive id marks an inline string (the
    4 id bytes are re-read as the string itself); finally the occurrence list.

    Args:
        stream(io.IOBase): seekable binary stream positioned at this record
        all0_(typing.List['Referent']): previously restored referents used to
            resolve back references
        sofa('SourceOfAnalysis'): source text the restored annotations refer to
    """
    typ = SerializerHelper.deserializeString(stream)
    cou = SerializerHelper.deserializeInt(stream)
    i = 0
    while i < cou:
        typ = SerializerHelper.deserializeString(stream)
        c = SerializerHelper.deserializeInt(stream)
        id0_ = SerializerHelper.deserializeInt(stream)
        val = None
        if (id0_ < 0 and all0_ is not None):
            # Negative id: 1-based back reference into the already-restored
            # list. Guard None/bounds the same way the snake_case variant of
            # this method does, so corrupt data degrades to a None slot value
            # instead of raising IndexError/TypeError.
            id1 = (-id0_) - 1
            if (id1 < len(all0_)):
                val = all0_[id1]
        elif (id0_ > 0):
            # Positive id: the 4 bytes just consumed are really the head of an
            # inline string - step back over them and re-read as a string.
            stream.seek(stream.tell() - 4, io.SEEK_SET)
            val = SerializerHelper.deserializeString(stream)
        self.addSlot(typ, val, False, c)
        i += 1
    cou = SerializerHelper.deserializeInt(stream)
    self.__m_occurrence = list()
    i = 0
    while i < cou:
        a = TextAnnotation._new2691(sofa, self)
        self.__m_occurrence.append(a)
        a.begin_char = SerializerHelper.deserializeInt(stream)
        a.end_char = SerializerHelper.deserializeInt(stream)
        attr = SerializerHelper.deserializeInt(stream)
        if ((attr & 1) != 0):
            # bit 0 of the attribute word flags the essential occurrence
            a.essential_for_occurence = True
        i += 1
def embed_token(self, mt: 'MetaToken') -> None:
    """ Embed a metatoken into the main token chain, replacing the span of
    tokens it covers.

    Args:
        mt(MetaToken): the metatoken to embed (no-op when None)
    """
    if (mt is None): 
        return
    # A reversed span means begin/end were produced in the wrong order - swap.
    if (mt.begin_char > mt.end_char): 
        bg = mt.begin_token
        mt.begin_token = mt.end_token
        mt.end_token = bg
    # Still reversed after the swap: the span is unusable, give up.
    if (mt.begin_char > mt.end_char): 
        return
    if (mt.begin_token == self.first_token): 
        # The metatoken starts the chain - it simply becomes the new head.
        self.first_token = (mt)
    else: 
        # NOTE(review): only one direction of each link is assigned here;
        # presumably the previous/next0_ property setters fix up the opposite
        # direction - confirm against the Token implementation.
        tp = mt.begin_token.previous
        mt.previous = tp
    tn = mt.end_token.next0_
    mt.next0_ = tn
    # Record the embedded referent's span as a text annotation on the referent.
    if (isinstance(mt, ReferentToken)): 
        if (mt.referent is not None): 
            mt.referent.add_occurence(TextAnnotation._new474(self.sofa, mt.begin_char, mt.end_char))
def deserialize(self, stream: Stream, all0_: typing.List['Referent'], sofa: 'SourceOfAnalysis') -> None:
    """Restore this referent's slots and text occurrences from a binary stream.

    Reads the type-name string, then the slot records (a negative id is a
    1-based back reference into all0_, a positive id marks an inline string
    whose 4 id bytes are re-read as the string itself), and finally the list
    of text occurrences.

    Args:
        stream(Stream): positionable binary stream at this referent's record
        all0_(typing.List['Referent']): previously restored referents for
            back-reference resolution (may be None)
        sofa('SourceOfAnalysis'): source text for the restored annotations
    """
    typ = SerializerHelper.deserialize_string(stream)
    slot_count = SerializerHelper.deserialize_int(stream)
    for _ in range(slot_count):
        typ = SerializerHelper.deserialize_string(stream)
        c = SerializerHelper.deserialize_int(stream)
        ref_id = SerializerHelper.deserialize_int(stream)
        val = None
        if ref_id < 0 and all0_ is not None:
            # back reference into the already-restored referent list;
            # out-of-range indices degrade to a None slot value
            pos = (-ref_id) - 1
            if pos < len(all0_):
                val = all0_[pos]
        elif ref_id > 0:
            # the 4 id bytes are actually the head of an inline string:
            # rewind and re-read them as a string
            stream.position = stream.position - 4
            val = SerializerHelper.deserialize_string(stream)
        self.add_slot(typ, val, False, c)
    occ_count = SerializerHelper.deserialize_int(stream)
    self.__m_occurrence = list()
    for _ in range(occ_count):
        occ = TextAnnotation._new2863(sofa, self)
        self.__m_occurrence.append(occ)
        occ.begin_char = SerializerHelper.deserialize_int(stream)
        occ.end_char = SerializerHelper.deserialize_int(stream)
        flags = SerializerHelper.deserialize_int(stream)
        if (flags & 1) != 0:
            # bit 0 flags the essential occurrence
            occ.essential_for_occurence = True
def createAnnotation(kit_: 'AnalysisKit', max_sents: int) -> 'KeywordReferent':
    """Build an ANNOTATION keyword referent from the highest-ranked sentences.

    Parses sentences from the kit's token chain, keeps the positively ranked
    ones, damps each rank by its position (earlier sentences weigh more),
    trims the list down to at most max_sents entries and joins the survivors
    into a single annotation text.

    Args:
        kit_(AnalysisKit): source of the token chain and the sofa
        max_sents(int): upper bound on the number of sentences kept
    Returns:
        KeywordReferent of type ANNOTATION, or None when fewer than two
        ranked sentences were found.
    """
    sents = list()
    t = kit_.first_token
    while t is not None:
        sent = AutoannoSentToken.__tryParse(t)
        if sent is not None:
            if sent.rank > 0:
                sents.append(sent)
            # skip over the whole parsed sentence before stepping on
            t = sent.end_token
        t = t.next0_
    if len(sents) < 2:
        return None
    # positional damping: rank of sentence at index k is scaled by (n-k)/n
    total = len(sents)
    for pos in range(total):
        sents[pos].rank *= (total - pos) / total
    if (max_sents * 3) > len(sents):
        max_sents = math.floor(len(sents) / 3)
        if max_sents == 0:
            max_sents = 1
    # repeatedly drop the worst sentence; '<=' resolves ties to the latest one
    while len(sents) > max_sents:
        worst = 0
        worst_rank = sents[0].rank
        for pos in range(1, len(sents)):
            if sents[pos].rank <= worst_rank:
                worst_rank = sents[pos].rank
                worst = pos
        del sents[worst]
    ano = KeywordReferent()
    ano.typ = KeywordType.ANNOTATION
    tmp = io.StringIO()
    for s in sents:
        if tmp.tell() > 0:
            tmp.write(' ')
        tmp.write(s.value)
        ano.occurrence.append(TextAnnotation._new1488(s.begin_char, s.end_char, ano, kit_.sofa))
    ano.addSlot(KeywordReferent.ATTR_VALUE, Utils.toStringStringIO(tmp), True, 0)
    return ano
def save_to_local_ontology(self) -> None:
    """Register this token's referent in the pending local ontology data.

    If registration returns an already-known equivalent referent, this token
    is repointed at it. In every case the token's character span is then
    recorded on the (possibly replaced) referent as a text occurrence.
    No-op when there is no pending data.
    """
    if self.data is None:
        return
    registered = self.data.register_referent(self.referent)
    self.data = (None)
    # registration may hand back an existing equivalent referent - adopt it
    if registered is not None:
        self.referent = registered
    # always record this token's span as an occurrence of the final referent
    anno = TextAnnotation()
    anno.sofa = self.kit.sofa
    anno.occurence_of = self.referent
    anno.begin_char = self.begin_char
    anno.end_char = self.end_char
    self.referent.add_occurence(anno)
def addOccurence(self, anno: 'TextAnnotation') -> None:
    """ Add a text occurrence (annotation) to this referent.

    Overlapping or duplicate spans are merged into an existing occurrence
    instead of being added; otherwise the annotation is inserted keeping the
    occurrence list ordered by begin_char.

    Args:
        anno(TextAnnotation): the annotation to add
    """
    # First try to reconcile with an already known occurrence.
    for l_ in self.occurrence: 
        typ = l_._compareWith(anno)
        if (typ == TextsCompareType.NONCOMPARABLE): 
            continue
        if (typ == TextsCompareType.EQUIVALENT or typ == TextsCompareType.CONTAINS): 
            # span already covered - nothing to add
            return
        if (typ == TextsCompareType.IN or typ == TextsCompareType.INTERSECT): 
            # widen the existing occurrence instead of adding a new one
            l_._merge(anno)
            return
    # The annotation belongs to another referent - clone it before taking ownership.
    if (anno.occurence_of != self and anno.occurence_of is not None): 
        anno = TextAnnotation._new2689(anno.begin_char, anno.end_char, anno.sofa)
    if (self.__m_occurrence is None): 
        self.__m_occurrence = list()
    anno.occurence_of = self
    if (len(self.__m_occurrence) == 0): 
        # the very first recorded occurrence is flagged as the essential one
        anno.essential_for_occurence = True
        self.__m_occurrence.append(anno)
        return
    # Keep the list sorted by begin_char: head, tail, or in-between insert.
    if (anno.begin_char < self.__m_occurrence[0].begin_char): 
        self.__m_occurrence.insert(0, anno)
        return
    if (anno.begin_char >= self.__m_occurrence[len(self.__m_occurrence) - 1].begin_char): 
        self.__m_occurrence.append(anno)
        return
    i = 0
    while i < (len(self.__m_occurrence) - 1): 
        if (anno.begin_char >= self.__m_occurrence[i].begin_char and anno.begin_char <= self.__m_occurrence[i + 1].begin_char): 
            self.__m_occurrence.insert(i + 1, anno)
            return
        i += 1
    # Unreachable in practice (the guards above cover every position); kept as a safety net.
    self.__m_occurrence.append(anno)
def _process(begin : 'Token', max_char_pos : int, kit : 'AnalysisKit', end_token : 'Token') -> 'TitlePageReferent':
    """Parse a title-page fragment starting at `begin` into a TitlePageReferent.

    Extracts the work's name and type, related persons (authors, supervisors
    etc.), date, city and organization from the token stream.

    Args:
        begin('Token'): first token of the candidate title page
        max_char_pos(int): hard character limit, 0 meaning no limit
        kit('AnalysisKit'): when not None, recognized referents are embedded
            into its token chain
        end_token('Token'): out-parameter wrapper - its .value is advanced to
            the last consumed token (looks like a RefOutArgWrapper; the type
            annotation 'Token' appears inaccurate - TODO confirm)
    Returns:
        the populated referent, or None when nothing was recognized
    """
    end_token.value = begin
    res = TitlePageReferent()
    term = None
    lines = Line.parse(begin, 30, 1500, max_char_pos)
    if (len(lines) < 1): 
        return None
    cou = len(lines)
    min_newlines_count = 10
    lines_count_stat = dict()
    # Gather a histogram of newlines-before counts and stop at the line where
    # the body text / table of contents starts; only lines before it are used.
    i = 0
    while i < len(lines): 
        if (TitleNameToken.can_be_start_of_text_or_content(lines[i].begin_token, lines[i].end_token)): 
            cou = i
            break
        j = lines[i].newlines_before_count
        if (i > 0 and j > 0): 
            if (not j in lines_count_stat): 
                lines_count_stat[j] = 1
            else: 
                lines_count_stat[j] += 1
        i += 1
    # The most frequent inter-line gap becomes the "normal" line spacing.
    max0_ = 0
    for kp in lines_count_stat.items(): 
        if (kp[1] > max0_): 
            max0_ = kp[1]
            min_newlines_count = kp[0]
    end_char = (lines[cou - 1].end_char if cou > 0 else 0)
    if (max_char_pos > 0 and end_char > max_char_pos): 
        end_char = max_char_pos
    # Try every window of up to 5 consecutive lines as a candidate title name.
    names = list()
    i = 0
    while i < cou: 
        if (i == 6): 
            # leftover debugging anchor - has no effect
            pass
        j = i
        while (j < cou) and (j < (i + 5)): 
            if (i == 6 and j == 8): 
                # leftover debugging anchor - has no effect
                pass
            if (j > i): 
                # a window must not mix languages or jump a double-size gap
                if (lines[j - 1].is_pure_en and lines[j].is_pure_ru): 
                    break
                if (lines[j - 1].is_pure_ru and lines[j].is_pure_en): 
                    break
                if (lines[j].newlines_before_count >= (min_newlines_count * 2)): 
                    break
            ttt = TitleNameToken.try_parse(lines[i].begin_token, lines[j].end_token, min_newlines_count)
            if (ttt is not None): 
                if (lines[i].is_pure_en): 
                    ttt.morph.language = MorphLang.EN
                elif (lines[i].is_pure_ru): 
                    ttt.morph.language = MorphLang.RU
                names.append(ttt)
            j += 1
        i += 1
    TitleNameToken.sort(names)
    name_rt = None
    if (len(names) > 0): 
        # Prefer a ranked Russian candidate over an English best candidate.
        i0 = 0
        if (names[i0].morph.language.is_en): 
            ii = 1
            while ii < len(names): 
                if (names[ii].morph.language.is_ru and names[ii].rank > 0): 
                    i0 = ii
                    break
                ii += 1
        term = res._add_name(names[i0].begin_name_token, names[i0].end_name_token)
        if (names[i0].type_value is not None): 
            res._add_type(names[i0].type_value)
        if (names[i0].speciality is not None): 
            res.speciality = names[i0].speciality
        rt = ReferentToken(res, names[i0].begin_token, names[i0].end_token)
        if (kit is not None): 
            kit.embed_token(rt)
        else: 
            res.add_occurence(TextAnnotation(rt.begin_token, rt.end_token))
        end_token.value = rt.end_token
        name_rt = rt
        if (begin.begin_char == rt.begin_char): 
            begin = (rt)
    # Embed every repetition of the recognized name elsewhere in the text.
    if (term is not None and kit is not None): 
        t = kit.first_token
        first_pass3397 = True
        while True:
            if first_pass3397: first_pass3397 = False
            else: t = t.next0_
            if (not (t is not None)): break
            tok = term.try_parse(t, TerminParseAttr.NO)
            if (tok is None): 
                continue
            t0 = t
            t1 = tok.end_token
            if (t1.next0_ is not None and t1.next0_.is_char('.')): 
                t1 = t1.next0_
            # absorb surrounding quotes/brackets into the embedded span
            if (BracketHelper.can_be_start_of_sequence(t0.previous, False, False) and BracketHelper.can_be_end_of_sequence(t1.next0_, False, None, False)): 
                t0 = t0.previous
                t1 = t1.next0_
            rt = ReferentToken(res, t0, t1)
            kit.embed_token(rt)
            t = (rt)
    # Main pass: walk the tokens collecting types, persons, dates, cities, orgs.
    pr = PersonRelations()
    pers_typ = TitleItemToken.Types.UNDEFINED
    pers_types = pr.rel_types
    t = begin
    first_pass3398 = True
    while True:
        if first_pass3398: first_pass3398 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (max_char_pos > 0 and t.begin_char > max_char_pos): 
            break
        if (t == name_rt): 
            continue
        tpt = TitleItemToken.try_attach(t)
        if (tpt is not None): 
            # a keyword token resets the pending person role
            pers_typ = TitleItemToken.Types.UNDEFINED
            if (tpt.typ == TitleItemToken.Types.TYP): 
                if (len(res.types) == 0): 
                    res._add_type(tpt.value)
                elif (len(res.types) == 1): 
                    ty = res.types[0].upper()
                    if (ty == "РЕФЕРАТ"): 
                        res._add_type(tpt.value)
                    elif (ty == "АВТОРЕФЕРАТ"): 
                        # "autoreferat" + dissertation kind collapse into one
                        # combined type value
                        if (tpt.value == "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ"): 
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатской диссертации", True, 0)
                        elif (tpt.value == "ДОКТОРСКАЯ ДИССЕРТАЦИЯ"): 
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат докторской диссертации", True, 0)
                        elif (tpt.value == "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ"): 
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат магистерской диссертации", True, 0)
                        elif (tpt.value == "КАНДИДАТСЬКА ДИСЕРТАЦІЯ"): 
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатської дисертації", True, 0)
                        elif (tpt.value == "ДОКТОРСЬКА ДИСЕРТАЦІЯ"): 
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат докторської дисертації", True, 0)
                        elif (tpt.value == "МАГІСТЕРСЬКА ДИСЕРТАЦІЯ"): 
                            res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат магістерської дисертації", True, 0)
                        else: 
                            res._add_type(tpt.value)
                    elif (tpt.value == "РЕФЕРАТ" or tpt.value == "АВТОРЕФЕРАТ"): 
                        if (not tpt.value in ty): 
                            res._add_type(tpt.value)
            elif (tpt.typ == TitleItemToken.Types.SPECIALITY): 
                if (res.speciality is None): 
                    res.speciality = tpt.value
            elif (tpt.typ in pers_types): 
                # remember the role for persons that follow this keyword
                pers_typ = tpt.typ
            t = tpt.end_token
            if (t.end_char > end_token.value.end_char): 
                end_token.value = t
            if (t.next0_ is not None and t.next0_.is_char_of(":-")): 
                t = t.next0_
            continue
        if (t.end_char > end_char): 
            break
        rli = t.get_referents()
        if (rli is None): 
            continue
        # skip referents inside "named after ..." constructions
        if (not t.is_newline_before and (isinstance(t.previous, TextToken))): 
            s = t.previous.term
            if (s == "ИМЕНИ" or s == "ИМ"): 
                continue
            if (s == "." and t.previous.previous is not None and t.previous.previous.is_value("ИМ", None)): 
                continue
        for r in rli: 
            if (isinstance(r, PersonReferent)): 
                if (r != rli[0]): 
                    continue
                p = Utils.asObjectOrNull(r, PersonReferent)
                if (pers_typ != TitleItemToken.Types.UNDEFINED): 
                    # a sentence boundary cancels the pending role
                    if (t.previous is not None and t.previous.is_char('.')): 
                        pers_typ = TitleItemToken.Types.UNDEFINED
                # a role derivable from the person's own attributes wins
                typ = pr.calc_typ_from_attrs(p)
                if (typ != TitleItemToken.Types.UNDEFINED): 
                    pr.add(p, typ, 1)
                    pers_typ = typ
                elif (pers_typ != TitleItemToken.Types.UNDEFINED): 
                    pr.add(p, pers_typ, 1)
                elif (t.previous is not None and t.previous.is_char('©')): 
                    pers_typ = TitleItemToken.Types.WORKER
                    pr.add(p, pers_typ, 1)
                else: 
                    # no role known yet: scan forward for a hint
                    tt = t.next0_
                    first_pass3399 = True
                    while True:
                        if first_pass3399: first_pass3399 = False
                        else: tt = tt.next0_
                        if (not (tt is not None)): break
                        rr = tt.get_referent()
                        if (rr == res): 
                            pers_typ = TitleItemToken.Types.WORKER
                            break
                        if (isinstance(rr, PersonReferent)): 
                            # NOTE(review): this re-checks the outer `r`, not the
                            # just-found `rr` - possibly intentional, verify
                            # against the original C# source
                            if (pr.calc_typ_from_attrs(Utils.asObjectOrNull(r, PersonReferent)) != TitleItemToken.Types.UNDEFINED): 
                                break
                            else: 
                                continue
                        if (rr is not None): 
                            break
                        tpt = TitleItemToken.try_attach(tt)
                        if (tpt is not None): 
                            if (tpt.typ != TitleItemToken.Types.TYP and tpt.typ != TitleItemToken.Types.TYPANDTHEME): 
                                break
                            tt = tpt.end_token
                            if (tt.end_char > end_token.value.end_char): 
                                end_token.value = tt
                            continue
                    # still nothing: scan backward for a role hint
                    if (pers_typ == TitleItemToken.Types.UNDEFINED): 
                        tt = t.previous
                        while tt is not None: 
                            rr = tt.get_referent()
                            if (rr == res): 
                                pers_typ = TitleItemToken.Types.WORKER
                                break
                            if (rr is not None): 
                                break
                            if ((tt.is_value("СТУДЕНТ", None) or tt.is_value("СТУДЕНТКА", None) or tt.is_value("СЛУШАТЕЛЬ", None)) or tt.is_value("ДИПЛОМНИК", None) or tt.is_value("ИСПОЛНИТЕЛЬ", None)): 
                                pers_typ = TitleItemToken.Types.WORKER
                                break
                            tpt = TitleItemToken.try_attach(tt)
                            if (tpt is not None and tpt.typ != TitleItemToken.Types.TYP): 
                                break
                            tt = tt.previous
                    # confident role gets weight 1, guessed role only 0.5
                    if (pers_typ != TitleItemToken.Types.UNDEFINED): 
                        pr.add(p, pers_typ, 1)
                    else: 
                        pr.add(p, pers_typ, 0.5)
                if (t.end_char > end_token.value.end_char): 
                    end_token.value = t
                continue
            if (r == rli[0]): 
                pers_typ = TitleItemToken.Types.UNDEFINED
            if (isinstance(r, DateReferent)): 
                if (res.date is None): 
                    res.date = Utils.asObjectOrNull(r, DateReferent)
                    if (t.end_char > end_token.value.end_char): 
                        end_token.value = t
            elif (isinstance(r, GeoReferent)): 
                if (res.city is None and r.is_city): 
                    res.city = Utils.asObjectOrNull(r, GeoReferent)
                    if (t.end_char > end_token.value.end_char): 
                        end_token.value = t
            if (isinstance(r, OrganizationReferent)): 
                org0_ = Utils.asObjectOrNull(r, OrganizationReferent)
                # a "course N" pseudo-organization yields the student year
                if ("курс" in org0_.types and org0_.number is not None): 
                    i = 0
                    wrapi2673 = RefOutArgWrapper(0)
                    inoutres2674 = Utils.tryParseInt(org0_.number, wrapi2673)
                    i = wrapi2673.value
                    if (inoutres2674): 
                        if (i > 0 and (i < 8)): 
                            res.student_year = i
                # climb from a department up to its owning organization
                while org0_.higher is not None:
                    if (org0_.kind != OrganizationKind.DEPARTMENT): 
                        break
                    org0_ = org0_.higher
                if (org0_.kind != OrganizationKind.DEPARTMENT): 
                    if (res.org0_ is None): 
                        res.org0_ = org0_
                    elif (OrganizationReferent.can_be_higher(res.org0_, org0_)): 
                        res.org0_ = org0_
                if (t.end_char > end_token.value.end_char): 
                    end_token.value = t
            if ((isinstance(r, UriReferent)) or (isinstance(r, GeoReferent))): 
                if (t.end_char > end_token.value.end_char): 
                    end_token.value = t
    # Materialize collected persons as slots, grouped by role.
    for ty in pers_types: 
        for p in pr.get_persons(ty): 
            if (pr.get_attr_name_for_type(ty) is not None): 
                res.add_slot(pr.get_attr_name_for_type(ty), p, False, 0)
    # No explicit author found: promote the first role-less person.
    if (res.get_slot_value(TitlePageReferent.ATTR_AUTHOR) is None): 
        for p in pr.get_persons(TitleItemToken.Types.UNDEFINED): 
            res.add_slot(TitlePageReferent.ATTR_AUTHOR, p, False, 0)
            break
    # No city seen directly: borrow it from the organization's geo attribute.
    if (res.city is None and res.org0_ is not None): 
        s = res.org0_.find_slot(OrganizationReferent.ATTR_GEO, None, True)
        if (s is not None and (isinstance(s.value, GeoReferent))): 
            if (s.value.is_city): 
                res.city = Utils.asObjectOrNull(s.value, GeoReferent)
    # No date seen: look for a "<city>: <date>"-style pattern and parse it.
    if (res.date is None): 
        t = begin
        first_pass3400 = True
        while True:
            if first_pass3400: first_pass3400 = False
            else: t = t.next0_
            if (not (t is not None and t.end_char <= end_char)): break
            city = Utils.asObjectOrNull(t.get_referent(), GeoReferent)
            if (city is None): 
                continue
            if (isinstance(t.next0_, TextToken)): 
                if (t.next0_.is_char_of(":,") or t.next0_.is_hiphen): 
                    t = t.next0_
            rt = t.kit.process_referent(DateAnalyzer.ANALYZER_NAME, t.next0_)
            if (rt is not None): 
                rt.save_to_local_ontology()
                res.date = Utils.asObjectOrNull(rt.referent, DateReferent)
                if (kit is not None): 
                    kit.embed_token(rt)
                break
    if (len(res.slots) == 0): 
        return None
    else: 
        return res
def addOccurenceOfRefTok(self, rt: 'ReferentToken') -> None:
    """Record the span covered by a ReferentToken as a text occurrence of this referent."""
    anno = TextAnnotation._new700(rt.kit.sofa, rt.begin_char, rt.end_char, rt.referent)
    self.addOccurence(anno)
def add_occurence_of_ref_tok(self, rt: 'ReferentToken') -> None:
    """Record the span covered by a ReferentToken as a text occurrence of this referent."""
    anno = TextAnnotation._new714(rt.kit.sofa, rt.begin_char, rt.end_char, rt.referent)
    self.add_occurence(anno)