def setUp(self):
    """Prepare segmentations, expected crosstabs and actual outputs.

    Six scenarios are covered, each identified by a prefix:

    * ``window_woa`` / ``window_wa``: cooccurrence in a window of
      size 3, without / with the ``type`` annotation;
    * ``context_wos_woa`` / ``context_wos_wa``: cooccurrence in
      context without secondary units, without / with annotation;
    * ``context_ws_woa`` / ``context_ws_wa``: cooccurrence in context
      with secondary units, without / with annotation.

    For every prefix the following attributes are set on the test
    case: ``<prefix>_row_ids``, ``<prefix>_col_ids``,
    ``<prefix>_values``, ``<prefix>_header_row_id``,
    ``<prefix>_header_row_type``, ``<prefix>_header_col_id``,
    ``<prefix>_header_col_type``, ``<prefix>_col_type`` and
    ``<prefix>_ref`` (the expected ``IntPivotCrosstab``).  The actual
    results are stored as ``output_cooc_in_window_*`` and
    ``output_cooc_in_context_*``.
    """
    text = Input("un texte")

    # One segment per word.
    words = Segmenter.tokenize(
        text,
        [(re.compile(r'\w+'), 'tokenize')],
        import_annotations=False,
    )

    # One segment per letter.  Both regexes match the vowels, so those
    # positions are tokenized twice (type 'C', then type 'V') and the
    # duplicates are merged.
    letters = Segmenter.tokenize(
        text,
        [
            (re.compile(r'\w'), 'tokenize', {'type': 'C'}),
            (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )

    # Split the letters on their 'type' annotation; presumably the
    # first result holds the segments matching 'V' and the second the
    # remaining ones -- verify against Segmenter.select.
    vowels, consonants = Segmenter.select(
        letters,
        re.compile(r'V'),
        annotation_key='type',
    )

    def cell_values(row_ids, col_ids, matrix):
        """Map (row_id, col_id) to the matching cell, in row-major order."""
        return {
            (row_id, col_id): count
            for row_id, cells in zip(row_ids, matrix)
            for col_id, count in zip(col_ids, cells)
        }

    def register_reference(prefix, row_ids, col_ids, matrix):
        """Store the expected crosstab for *prefix* and all its parts."""
        values = cell_values(row_ids, col_ids, matrix)
        col_type = dict.fromkeys(col_ids, 'continuous')
        parts = (
            ('_row_ids', row_ids),
            ('_col_ids', col_ids),
            ('_values', values),
            ('_header_row_id', '__unit__'),
            ('_header_row_type', 'string'),
            ('_header_col_id', '__unit2__'),
            ('_header_col_type', 'string'),
            ('_col_type', col_type),
        )
        for suffix, part in parts:
            setattr(self, prefix + suffix, part)
        # The constructor receives the very objects stored above, in
        # the same positional order as `parts`.
        reference = IntPivotCrosstab(*(part for _, part in parts))
        setattr(self, prefix + '_ref', reference)

    # Expected matrix for cooccurrence in window with window_size=3
    # and without annotation (woa):
    register_reference(
        'window_woa',
        ['u', 'n', 't', 'e', 'x'],
        ['u', 'n', 't', 'e', 'x'],
        [
            [1, 1, 1, 0, 0],
            [1, 2, 2, 1, 0],
            [1, 2, 5, 4, 3],
            [0, 1, 4, 4, 3],
            [0, 0, 3, 3, 3],
        ],
    )

    # Expected matrix for cooccurrence in window with window_size=3
    # and with annotation (wa):
    register_reference(
        'window_wa',
        ['C', 'V'],
        ['C', 'V'],
        [
            [5, 5],
            [5, 5],
        ],
    )

    # Expected matrix for cooccurrence in context without the
    # secondary unit (wos) and without annotation (woa):
    register_reference(
        'context_wos_woa',
        ['u', 'n', 't', 'e', 'x'],
        ['u', 'n', 't', 'e', 'x'],
        [
            [1, 1, 0, 0, 0],
            [1, 1, 0, 0, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 1, 1, 1],
            [0, 0, 1, 1, 1],
        ],
    )

    # Expected matrix for cooccurrence in context without the
    # secondary unit (wos) and with annotation (wa):
    register_reference(
        'context_wos_wa',
        ['V', 'C'],
        ['V', 'C'],
        [
            [2, 2],
            [2, 2],
        ],
    )

    # Expected matrix for cooccurrence in context with the secondary
    # unit (ws) and without annotation (woa):
    register_reference(
        'context_ws_woa',
        ['n', 't', 'x'],
        ['u', 'e'],
        [
            [1, 0],
            [0, 1],
            [0, 1],
        ],
    )

    # Expected matrix for cooccurrence in context with the secondary
    # unit (ws) and with annotation (wa):
    register_reference(
        'context_ws_wa',
        ['C'],
        ['V'],
        [
            [2],
        ],
    )

    # Actual results, one per scenario above.
    self.output_cooc_in_window_woa = Processor.cooc_in_window(
        units={'segmentation': letters},
        window_size=3,
    )
    self.output_cooc_in_window_wa = Processor.cooc_in_window(
        units={'segmentation': letters, 'annotation_key': 'type'},
        window_size=3,
    )
    self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
        units={'segmentation': letters},
        contexts={'segmentation': words},
        units2=None,
    )
    self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
        units={'segmentation': letters, 'annotation_key': 'type'},
        contexts={'segmentation': words},
        units2=None,
    )
    # NOTE(review): the 'context_ws_*' references list consonants as
    # rows and vowels as columns, while the vowels are passed as
    # `units` here -- confirm this matches the orientation produced by
    # Processor.cooc_in_context.
    self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
        units={'segmentation': vowels},
        contexts={'segmentation': words},
        units2={'segmentation': consonants},
    )
    self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
        units={'segmentation': vowels, 'annotation_key': 'type'},
        contexts={'segmentation': words},
        units2={'segmentation': consonants, 'annotation_key': 'type'},
    )