Beispiel #1
0
    def setUp(self):
        """Prepare segmentations, reference crosstabs and processor outputs.

        For each scenario (cooccurrence in a window or in a context, with or
        without a secondary unit, with or without annotation), the test case
        receives the ingredients of a reference table, the reference
        ``IntPivotCrosstab`` built from them (``<scenario>_ref``) and the
        table returned by ``Processor`` (``output_cooc_in_<scenario>``).
        """
        text = Input("un texte")
        words = Segmenter.tokenize(
            text,
            [(re.compile(r'\w+'), 'tokenize')],
            import_annotations=False,
        )
        # Two rules over the same text: every word character gets type 'C',
        # vowels are matched a second time with type 'V'; tokenize is asked
        # to merge the resulting duplicates.
        letter_rules = [
            (re.compile(r'\w'), 'tokenize', {'type': 'C'}),
            (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'}),
        ]
        letters = Segmenter.tokenize(
            text,
            letter_rules,
            import_annotations=False,
            merge_duplicates=True,
        )
        vowels, consonants = Segmenter.select(
            letters,
            re.compile(r'V'),
            annotation_key='type',
        )

        def cell_counts(row_ids, col_ids, grid):
            # Flatten a row-major grid of counts into {(row, col): count}.
            return {
                (row_id, col_id): count
                for row_id, line in zip(row_ids, grid)
                for col_id, count in zip(col_ids, line)
            }

        def store_reference(prefix, row_ids, col_ids, values):
            # Sets self.<prefix>_row_ids, _col_ids, _values, the four
            # _header_* attributes and _col_type, then builds
            # self.<prefix>_ref from those same objects, in that order.
            fields = [
                ('row_ids', row_ids),
                ('col_ids', col_ids),
                ('values', values),
                ('header_row_id', '__unit__'),
                ('header_row_type', 'string'),
                ('header_col_id', '__unit2__'),
                ('header_col_type', 'string'),
                ('col_type', dict.fromkeys(col_ids, 'continuous')),
            ]
            for suffix, value in fields:
                setattr(self, prefix + '_' + suffix, value)
            setattr(
                self,
                prefix + '_ref',
                IntPivotCrosstab(*[value for _, value in fields]),
            )

        letter_ids = ['u', 'n', 't', 'e', 'x']

        # Cooccurrence in window, window_size=3, without annotation (woa).
        store_reference(
            'window_woa',
            list(letter_ids),
            list(letter_ids),
            cell_counts(letter_ids, letter_ids, [
                # u  n  t  e  x
                [1, 1, 1, 0, 0],  # u
                [1, 2, 2, 1, 0],  # n
                [1, 2, 5, 4, 3],  # t
                [0, 1, 4, 4, 3],  # e
                [0, 0, 3, 3, 3],  # x
            ]),
        )

        # Cooccurrence in window, window_size=3, with annotation (wa).
        store_reference(
            'window_wa',
            ['C', 'V'],
            ['C', 'V'],
            cell_counts(['C', 'V'], ['C', 'V'], [
                [5, 5],
                [5, 5],
            ]),
        )

        # Cooccurrence in context, without the secondary unit (wos) and
        # without annotation (woa).
        store_reference(
            'context_wos_woa',
            list(letter_ids),
            list(letter_ids),
            cell_counts(letter_ids, letter_ids, [
                # u  n  t  e  x
                [1, 1, 0, 0, 0],  # u
                [1, 1, 0, 0, 0],  # n
                [0, 0, 1, 1, 1],  # t
                [0, 0, 1, 1, 1],  # e
                [0, 0, 1, 1, 1],  # x
            ]),
        )

        # Cooccurrence in context, without the secondary unit (wos) and
        # with annotation (wa).
        store_reference(
            'context_wos_wa',
            ['V', 'C'],
            ['V', 'C'],
            cell_counts(['V', 'C'], ['V', 'C'], [
                [2, 2],
                [2, 2],
            ]),
        )

        # Cooccurrence in context, with the secondary unit (ws) and
        # without annotation (woa).
        store_reference(
            'context_ws_woa',
            ['n', 't', 'x'],
            ['u', 'e'],
            cell_counts(['n', 't', 'x'], ['u', 'e'], [
                # u  e
                [1, 0],  # n
                [0, 1],  # t
                [0, 1],  # x
            ]),
        )

        # Cooccurrence in context, with the secondary unit (ws) and
        # with annotation (wa).
        store_reference(
            'context_ws_wa',
            ['C'],
            ['V'],
            cell_counts(['C'], ['V'], [[2]]),
        )

        # Tables actually produced by the processor for the same scenarios.
        # Each call gets its own freshly built argument dicts.
        self.output_cooc_in_window_woa = Processor.cooc_in_window(
            units=dict(segmentation=letters),
            window_size=3,
        )
        self.output_cooc_in_window_wa = Processor.cooc_in_window(
            units=dict(segmentation=letters, annotation_key='type'),
            window_size=3,
        )
        self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
            units=dict(segmentation=letters),
            contexts=dict(segmentation=words),
            units2=None,
        )
        self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
            units=dict(segmentation=letters, annotation_key='type'),
            contexts=dict(segmentation=words),
            units2=None,
        )
        self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
            units=dict(segmentation=vowels),
            contexts=dict(segmentation=words),
            units2=dict(segmentation=consonants),
        )
        self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
            units=dict(segmentation=vowels, annotation_key='type'),
            contexts=dict(segmentation=words),
            units2=dict(segmentation=consonants, annotation_key='type'),
        )