def compute_detection_performance(dataset: List[_LabelizedText], ontology: TopicOntology):
    """Print a report on how well topic detection matches the labelled dataset.

    Each dataset item is unpacked as ``((titles, outer_alineas), label)``.
    The alineas are wrapped in a ``StructuredText`` with an empty title so
    they can be handed to the detection helpers.

    Two things are printed:
    * every entry returned by ``_missing_topics`` (via ``_pretty_print``);
    * a ``Counter`` of ``expected == predicted`` booleans, restricted to the
      items whose expected label is truthy.
    """
    expected_labels = []
    titled_texts = []
    for (titles, outer_alineas), label in dataset:
        expected_labels.append(label)
        wrapped = StructuredText(EnrichedString(''), outer_alineas, [], None)
        titled_texts.append((titles, wrapped))

    missing = _missing_topics(expected_labels, titled_texts, ontology)
    for topic, missed_texts in missing.items():
        _pretty_print(topic, missed_texts)

    predictions = [_extract_topics(text, titles, ontology) for titles, text in titled_texts]
    # Items without any expected label are left out of the tally.
    outcome = Counter(
        expected == predicted
        for expected, predicted in zip(expected_labels, predictions)
        if expected
    )
    print(outcome)
def test_add_topics():
    """Check that ``add_topics`` annotates the sections targeted by each path.

    The mapping keys are index tuples: ``(0,)`` is the first section,
    ``(0, 0)`` its first sub-section and ``(1,)`` the second section.
    """
    nested = StructuredText(EnrichedString('Section 1.1'), [], [], None)
    first = StructuredText(EnrichedString('Section 1'), [], [nested], None)
    second = StructuredText(EnrichedString('Section 2'), [], [], None)
    am = ArreteMinisteriel(EnrichedString('arrete du 10/10/10'), [first, second], [], None, id='FAKE_ID')

    topic_by_path = {
        (0,): TopicName.INCENDIE,
        (0, 0): TopicName.INCENDIE,
        (1,): TopicName.BRUIT_VIBRATIONS,
    }
    annotated = add_topics(am, topic_by_path).sections

    assert annotated[0].annotations.topic == TopicName.INCENDIE  # type: ignore
    assert annotated[0].sections[0].annotations.topic == TopicName.INCENDIE  # type: ignore
    assert annotated[1].annotations.topic == TopicName.BRUIT_VIBRATIONS  # type: ignore
def _get_simple_text() -> StructuredText:
    """Build a small fixture text: two plain alineas, one table alinea, one sub-section.

    The table has three rows; the first one is built with ``True`` as its
    second ``Row`` argument, the other two with ``False``.
    """
    rows = [
        Row([_cell('AA'), _cell('BB'), _cell('CC')], True),
        Row([_cell('DD', rs=2), _cell('EE', cs=2)], False),
        Row([_cell('FF'), _cell('GG')], False),
    ]
    table_alinea_source = Table(rows)
    outer_alineas = [
        _text('Alinea 1'),
        _text('Alinea 2'),
        # The table lives in an alinea whose text is empty.
        EnrichedString('', table=table_alinea_source),
    ]
    sub_sections = [StructuredText(_text('Title 2'), [], [], None, None, None)]
    return StructuredText(_text('Title'), outer_alineas, sub_sections, None, None, None)
def test_extract_text_lines():
    """Pin the output of ``StructuredText.text_lines()`` on several fixtures."""
    # Expected: title, then alineas, then section titles prefixed with one
    # '#' per nesting level.
    expected_simple = [
        'AM',
        'alinea',
        'foo',
        '# Section 1',
        '## Section 1.1',
        '# Section 2',
        'bar',
    ]
    assert _get_simple_text().text_lines() == expected_simple

    # A leading space in the title is expected to be dropped.
    title_only = StructuredText(EnrichedString(' A'), [], [], None)
    assert title_only.text_lines() == ['A']

    # An empty alinea is expected to produce an empty line.
    with_empty_alinea = StructuredText(EnrichedString(' A'), [EnrichedString('')], [], None)
    assert with_empty_alinea.text_lines() == ['A', '']

    expected_a = [
        '6. Schématisation des différents types de joints mentionnés :',
        'Vous pouvez consulter les schémas dans le',
        'JO',
        'n° 265 du 16/11/2010 texte numéro 21',
    ]
    assert _TEXT_A.text_lines() == expected_a

    expected_b = [
        '6. Schématisation des différents types de joints mentionnés :',
        'Vous pouvez consulter les schémas dans le',
        'JO n° 265 du 16/11/2010 texte numéro 21',
    ]
    assert _TEXT_B.text_lines() == expected_b
def _build_am() -> ArreteMinisteriel:
    """Build a fixture ``ArreteMinisteriel`` with four top-level sections.

    'Art. 1' and 'Art. 2' each hold one sub-section titled 'A'; the
    'Conditions d'application' and 'Art. 3' sections are empty.
    """
    child_of_art_1 = StructuredText(EnrichedString('A'), [], [], None)
    child_of_art_2 = StructuredText(EnrichedString('A'), [], [], None)

    article_1 = StructuredText(EnrichedString('Art. 1'), [], [child_of_art_1], None)
    article_2 = StructuredText(EnrichedString('Art. 2'), [], [child_of_art_2], None)
    conditions = StructuredText(EnrichedString('Conditions d\'application'), [], [], None)
    article_3 = StructuredText(EnrichedString('Art. 3'), [], [], None)

    return ArreteMinisteriel(
        EnrichedString('arrete du 10/10/10'),
        [article_1, article_2, conditions, article_3],
        [],
        None,
        id='FAKE_ID',
    )
from envinorma.models.condition import AndCondition, Littler, Range, extract_leaf_conditions from envinorma.models.parameter import Parameter, ParameterType from envinorma.models.structured_text import StructuredText from envinorma.models.text_elements import EnrichedString from envinorma.parametrization.combinations import _generate_options_dicts, generate_exhaustive_combinations from envinorma.parametrization.models.parametrization import ( AlternativeSection, AMWarning, InapplicableSection, Parametrization, extract_conditions_from_parametrization, ) _DATE = Parameter(id='date-d-installation', type=ParameterType.DATE) _NEW_TEXT = StructuredText( title=EnrichedString(text='Article 2.1'), outer_alineas=[EnrichedString(text='Contenu nouveau')], sections=[], applicability=None, reference=None, annotations=None, id='d16d0fE7C7fc', ) _PARAMETRIZATION = Parametrization( inapplicable_sections=[ InapplicableSection( section_id='abcdef', alineas=None, condition=AndCondition( conditions=frozenset([Littler(parameter=_DATE, target=date(2021, 1, 1), strict=True)]) ),
def _extract_cell_data(cell: Tag) -> EnrichedString:
    """Convert the content of ``cell`` into an ``EnrichedString``.

    The cell goes through three helpers in sequence: extraction of text
    elements and line breaks, merging between line breaks, then joining
    into a single string.
    """
    elements = _extract_text_elements_with_linebreaks(cell)
    merged = merge_between_linebreaks(elements)
    joined = _ensure_strs_and_join(merged)
    return EnrichedString(joined)
def _enriched_string_links() -> EnrichedString:
    """Return a fixture ``EnrichedString`` with text 'abc' and a single ``Link``."""
    # NOTE(review): the Link arguments (0, 4) are presumably a position and a
    # size; 4 is more than len('abc') -- confirm this is intended.
    single_link = Link('abc', 0, 4)
    return EnrichedString('abc', [single_link], None)
def _enriched_string_table() -> EnrichedString:
    """Return a fixture ``EnrichedString`` with empty text, no links and the ``_table()`` table."""
    fixture_table = _table()
    return EnrichedString('', [], fixture_table)
def _table() -> Table:
    """Return a fixture ``Table`` made of one row holding one 'bonjour' cell."""
    only_cell = Cell(EnrichedString('bonjour'), 1, 1)
    only_row = Row([only_cell], True)
    return Table([only_row])
def _str(text: Optional[str] = None) -> EnrichedString:
    """Wrap ``text`` in an ``EnrichedString``.

    When ``text`` is falsy (``None`` or the empty string), the content
    comes from ``_random_string()`` instead.
    """
    content = text if text else _random_string()
    return EnrichedString(content)
assert _is_probably_cid('LEGITEXT34234') assert _is_probably_cid('FAKE_CID') assert _is_probably_cid('FAKETEXT0000324') assert not _is_probably_cid('') assert not _is_probably_cid('JORFTEX') def _get_simple_text() -> StructuredText: sub_section_1 = StructuredText(_str('Section 1.1'), [], [], None) section_1 = StructuredText(_str('Section 1'), [], [sub_section_1], None) section_2 = StructuredText(_str('Section 2'), [_str('bar')], [], None) return StructuredText(_str('AM '), [_str('alinea'), _str('foo')], [section_1, section_2], None) _TEXT_A = StructuredText( title=EnrichedString(text='6. Schématisation des différents types de joints mentionnés :'), outer_alineas=[ EnrichedString(text='Vous pouvez consulter les schémas dans le'), EnrichedString(text='JO\nn° 265 du 16/11/2010 texte numéro 21'), ], sections=[], applicability=None, reference=Reference('ref', 'name'), annotations=None, id='0bEB0b14A96f', ) _TEXT_B = StructuredText( title=EnrichedString(text='6. Schématisation des différents types de joints mentionnés :'), outer_alineas=[ EnrichedString(text='Vous pouvez consulter les schémas dans le'), EnrichedString(text='JO n° 265 du 16/11/2010 texte numéro 21'),
def _build_labelized_text(raw_text: Tuple[int, List[str], List[Dict]], labels: Set[TopicName]) -> _LabelizedText:
    """Pair a raw text with its labels.

    ``raw_text[1]`` is kept as is, ``raw_text[2]`` is a list of dicts that
    are each turned into an ``EnrichedString`` through ``from_dict``;
    ``raw_text[0]`` is not used. The result is
    ``((raw_text[1], enriched_strings), labels)``.
    """
    titles = raw_text[1]
    alineas = list(map(EnrichedString.from_dict, raw_text[2]))
    return (titles, alineas), labels
def _str(text: Optional[str] = None) -> EnrichedString:
    """Wrap ``text`` in an ``EnrichedString``.

    A falsy ``text`` (``None`` or the empty string) yields the result of
    ``_random_enriched_string()`` instead.
    """
    if not text:
        return _random_enriched_string()
    return EnrichedString(text)
def _random_enriched_string() -> EnrichedString:
    """Return an ``EnrichedString`` whose text comes from ``_random_string()``, with no links and no table."""
    generated_text = _random_string()
    return EnrichedString(generated_text, [], None)
def test_count_cells():
    """Check ``_count_cells`` on empty tables and on tables of one-cell rows."""
    cells = [Cell(EnrichedString(''), 1, 1)]
    cases = [
        (Table([]), 0),
        (Table([Row([], True)]), 0),
        (Table([Row(cells, True)]), 1),
        # The same row repeated three times counts its cell three times.
        (Table([Row(cells, True)] * 3), 3),
    ]
    for table, expected_count in cases:
        assert _count_cells(table) == expected_count
def _random_cell() -> Cell:
    """Return a placeholder ``Cell`` with empty content.

    Despite the name, nothing here is random: the cell always wraps an
    empty ``EnrichedString`` with both trailing arguments set to 1
    (presumably colspan and rowspan -- confirm against ``Cell``).
    """
    empty_content = EnrichedString('')
    return Cell(empty_content, 1, 1)
def _text(txt: str) -> EnrichedString:
    """Shorthand that wraps ``txt`` in an ``EnrichedString``."""
    wrapped = EnrichedString(txt)
    return wrapped