예제 #1
0
    def test_select_progress(self):
        """Check that select reports progress through its callback.

        After the call, self.count must equal the number of segments in
        self.char_seg.
        """
        def bump_counter():
            """Record one progress notification."""
            self.count += 1

        any_char = re.compile(r'.')
        Segmenter.select(
            self.char_seg,
            any_char,
            progress_callback=bump_counter,
        )
        expected_count = len(self.char_seg)
        self.assertEqual(
            self.count,
            expected_count,
            msg="select doesn't track progress!",
        )
예제 #2
0
 def test_select_annotations(self):
     """Check select when called with annotation_key='a'.

     The first returned segmentation must contain only 'ab'.
     """
     any_char = re.compile(r'.')
     selected, _ = Segmenter.select(
         self.word_seg,
         any_char,
         annotation_key='a',
     )
     contents = [segment.get_content() for segment in selected]
     self.assertEqual(
         contents,
         ['ab'],
         msg="select doesn't work with annotations!",
     )
예제 #3
0
 def test_select_autonumber(self):
     """Check that select numbers segments under the 'num' annotation.

     The selected segments must carry the values 1 to 6, in order.
     """
     any_char = re.compile(r'.')
     selected, _ = Segmenter.select(
         self.char_seg,
         any_char,
         auto_number_as='num',
     )
     numbers = [segment.annotations['num'] for segment in selected]
     self.assertEqual(
         numbers,
         [1, 2, 3, 4, 5, 6],
         msg="select doesn't autonumber input segments!",
     )
예제 #4
0
    def test_select_progress(self):
        """Check that select reports progress through its callback.

        After the call, self.count must equal the number of segments in
        self.char_seg.
        """
        def bump_counter():
            """Record one progress notification."""
            self.count += 1

        any_char = re.compile(r'.')
        Segmenter.select(
            self.char_seg, any_char, progress_callback=bump_counter,
        )

        expected_count = len(self.char_seg)
        self.assertEqual(self.count, expected_count,
                         msg="select doesn't track progress!")
예제 #5
0
 def test_select_select(self):
     """Check that select keeps the segments matching the regex.

     With the pattern \\w{3,}, only 'cde' must be selected.
     """
     three_or_more = re.compile(r'\w{3,}')
     selected, _ = Segmenter.select(self.word_seg, three_or_more)
     contents = [segment.get_content() for segment in selected]
     self.assertEqual(
         contents,
         ['cde'],
         msg="select doesn't select segments!",
     )
예제 #6
0
 def test_select_import_annotations_false(self):
     """Check that copy_annotations=False leaves annotation 'a' out.

     The first selected segment must not have an 'a' annotation.
     """
     word_pattern = re.compile(r'\w+')
     selected, _ = Segmenter.select(
         self.word_seg, word_pattern, copy_annotations=False,
     )
     has_annotation = 'a' in selected[0].annotations
     self.assertFalse(
         has_annotation,
         msg="select doesn't skip importing annotations!",
     )
예제 #7
0
 def test_select_select_neg(self):
     """Check the second (complementary) segmentation returned by select.

     With the pattern \\w{3,}, it must contain only 'ab'.
     """
     three_or_more = re.compile(r'\w{3,}')
     _, complement = Segmenter.select(self.word_seg, three_or_more)
     contents = [segment.get_content() for segment in complement]
     self.assertEqual(
         contents,
         ['ab'],
         msg="select doesn't output complementary segmentation!",
     )
예제 #8
0
 def test_select_mode(self):
     """Check that mode="exclude" is honoured by select.

     With the pattern \\w{3,}, the first returned segmentation must
     contain only 'ab'.
     """
     three_or_more = re.compile(r'\w{3,}')
     selected, _ = Segmenter.select(
         self.word_seg, three_or_more, mode="exclude",
     )
     contents = [segment.get_content() for segment in selected]
     self.assertEqual(
         contents,
         ['ab'],
         msg="select doesn't respect mode setting!",
     )
예제 #9
0
 def test_select_import_annotations(self):
     """Check that copy_annotations=True carries annotation 'a' over.

     The first selected segment must have 'a' set to '1'.
     """
     word_pattern = re.compile(r'\w+')
     selected, _ = Segmenter.select(
         self.word_seg, word_pattern, copy_annotations=True,
     )
     first_annotations = selected[0].annotations
     self.assertEqual(
         first_annotations['a'],
         '1',
         msg="select doesn't import annotations!",
     )
예제 #10
0
 def test_select_import_annotations_false(self):
     """Check that copy_annotations=False leaves annotation 'a' out.

     The first selected segment must not have an 'a' annotation.
     """
     word_pattern = re.compile(r'\w+')
     selected, _ = Segmenter.select(
         self.word_seg, word_pattern, copy_annotations=False,
     )
     first_annotations = selected[0].annotations
     self.assertFalse('a' in first_annotations,
                      msg="select doesn't skip importing annotations!")
예제 #11
0
 def test_select_select_neg(self):
     """Check the second (complementary) segmentation returned by select.

     With the pattern \\w{3,}, it must contain only 'ab'.
     """
     three_or_more = re.compile(r'\w{3,}')
     _, complement = Segmenter.select(self.word_seg, three_or_more)
     contents = [segment.get_content() for segment in complement]
     self.assertEqual(contents, ['ab'],
                      msg="select doesn't output complementary segmentation!")
예제 #12
0
 def test_select_select(self):
     """Check that select keeps the segments matching the regex.

     With the pattern \\w{3,}, only 'cde' must be selected.
     """
     three_or_more = re.compile(r'\w{3,}')
     selected, _ = Segmenter.select(self.word_seg, three_or_more)
     contents = [segment.get_content() for segment in selected]
     self.assertEqual(contents, ['cde'],
                      msg="select doesn't select segments!")
예제 #13
0
 def test_select_mode(self):
     """Check that mode="exclude" is honoured by select.

     With the pattern \\w{3,}, the first returned segmentation must
     contain only 'ab'.
     """
     three_or_more = re.compile(r'\w{3,}')
     selected, _ = Segmenter.select(
         self.word_seg, three_or_more, mode="exclude",
     )
     contents = [segment.get_content() for segment in selected]
     self.assertEqual(contents, ['ab'],
                      msg="select doesn't respect mode setting!")
예제 #14
0
 def test_select_annotations(self):
     """Check select when called with annotation_key='a'.

     The first returned segmentation must contain only 'ab'.
     """
     any_char = re.compile(r'.')
     selected, _ = Segmenter.select(
         self.word_seg, any_char, annotation_key='a',
     )
     contents = [segment.get_content() for segment in selected]
     self.assertEqual(contents, ['ab'],
                      msg="select doesn't work with annotations!")
예제 #15
0
 def test_select_import_annotations(self):
     """Check that copy_annotations=True carries annotation 'a' over.

     The first selected segment must have 'a' set to '1'.
     """
     word_pattern = re.compile(r'\w+')
     selected, _ = Segmenter.select(
         self.word_seg, word_pattern, copy_annotations=True,
     )
     imported_value = selected[0].annotations['a']
     self.assertEqual(imported_value, '1',
                      msg="select doesn't import annotations!")
예제 #16
0
 def test_select_autonumber(self):
     """Check that select numbers segments under the 'num' annotation.

     The selected segments must carry the values 1 to 6, in order.
     """
     any_char = re.compile(r'.')
     selected, _ = Segmenter.select(
         self.char_seg, any_char, auto_number_as='num',
     )
     numbers = [segment.annotations['num'] for segment in selected]
     self.assertEqual(numbers, [1, 2, 3, 4, 5, 6],
                      msg="select doesn't autonumber input segments!")
    def huntTheLexic(self):
        """
            Main I/O function: for every lexical field chosen by the user,
            select the segments of inputSeg that match the field's word
            list, label each selection with the field name, and store in
            outputSeg the concatenation of a bypassed inputSeg with these
            selections (labels imported as an annotation).
        """

        # Names of the topics the user picked (indices into titleLabels).
        chosenNames = list()
        if self.titleLabels:
            allNames = list(self.titleLabels)
            chosenNames = [allNames[idx] for idx in self.selectedFields]

        # Keep only the word lists whose topic has been picked.
        chosenLists = dict()
        for topic, words in defaultDict.items():
            if topic in chosenNames:
                chosenLists[topic] = words

        # With an input, build one labelled selection per non-empty list.
        labelledSelections = list()
        if self.inputSeg is not None:
            for topic, words in chosenLists.items():
                usableWords = [word for word in words if word]
                if not usableWords:
                    continue
                selectionPair = Segmenter.select(
                    self.inputSeg,
                    self.listToRegex(usableWords),
                    label=topic,
                )
                labelledSelections.append(selectionPair[0])

        # Annotation key receiving the labels; "Topic" when none was given.
        labelNameVar = "Topic" if self.labelName == "" else self.labelName

        # The bypassed input (labelled "__None__") comes first, followed by
        # the labelled selections; duplicates are merged by concatenate.
        allSegmentations = [Segmenter.bypass(self.inputSeg, label="__None__")]
        allSegmentations = allSegmentations + labelledSelections
        self.outputSeg = Segmenter.concatenate(
            allSegmentations,
            merge_duplicates=True,
            label=self.captionTitle,
            import_labels_as=labelNameVar,
        )
    def updateTitleList(self):
        """Update the list of titles.

        Does nothing when titleSeg is None. Otherwise:

        - in advanced settings mode with a filter value other than "(all)",
          filteredTitleSeg is the subset of titleSeg whose filterCriterion
          annotation equals filterValue; else it is titleSeg itself;
        - titleLabels becomes the list of "title (spec; spec; ...)" strings
          sorted by title, where the specs are the author, year and genre
          annotations, minus the one currently used as active filter;
        - selectedTitles is emptied when importedURLs is not a subset of the
          filtered titles' "url" annotations;
        - sendButton is told that settings changed.
        """

        # If titleSeg has not been loaded for some reason, skip.
        if self.titleSeg is None:
            return

        # A filter only applies in Advanced settings mode, for a value other
        # than the "(all)" placeholder.
        filterIsActive = (
            self.displayAdvancedSettings and self.filterValue != "(all)"
        )

        # In Advanced settings mode, get list of selected titles...
        if filterIsActive:
            # The filter value is matched as literal text, so it is escaped:
            # regex metacharacters in it (parentheses, dots...) must not be
            # interpreted as regex syntax.
            self.filteredTitleSeg, _ = Segmenter.select(
                segmentation=self.titleSeg,
                regex=re.compile(r"^%s$" % re.escape(self.filterValue)),
                annotation_key=self.filterCriterion,
            )
        else:
            self.filteredTitleSeg = self.titleSeg

        # Sort the segments themselves by title (stable sort) so that each
        # label is built from the annotations of the very title it shows,
        # instead of pairing the idx-th sorted title with the idx-th
        # unsorted segment.
        # NOTE(review): filteredTitleSeg itself keeps its original order;
        # code indexing it with positions from titleLabels presumably
        # expects it to be sorted by title already -- confirm.
        sortedTitles = sorted(
            self.filteredTitleSeg,
            key=lambda segment: segment.annotations["title"],
        )

        # Add specification (author, year and genre, depending on criterion):
        # the criterion used as active filter is the same for every title,
        # hence left out.
        specKeys = [
            key for key in ("author", "year", "genre")
            if not (filterIsActive and self.filterCriterion == key)
        ]
        self.titleLabels = [
            segment.annotations["title"] + " (%s)" % "; ".join(
                segment.annotations[key] for key in specKeys
            )
            for segment in sortedTitles
        ]

        # Reset selectedTitles if needed...
        if not set(self.importedURLs).issubset(
                set(u.annotations["url"] for u in self.filteredTitleSeg)):
            self.selectedTitles = list()
        else:
            # NOTE(review): self-assignment kept from the original code;
            # presumably it triggers a refresh of the bound GUI control --
            # confirm before removing.
            self.selectedTitles = self.selectedTitles

        self.sendButton.settingsChanged()
예제 #19
0
    def updateTitleList(self):
        """Update the list of titles.

        Does nothing when titleSeg is None. Otherwise:

        - in advanced settings mode with a filter value other than "(all)",
          the titles are first restricted to those whose filterCriterion
          annotation equals filterValue;
        - titles sharing the same (author, title) pair are merged into a
          single segment whose "genre" annotation lists all their genres;
        - filteredTitleSeg becomes a new Segmentation of the merged
          segments, sorted by title, and titleLabels the matching list of
          "title (spec; spec)" strings, where the specs are the author and
          genre annotations, minus the one used as active filter;
        - selectedTitles is emptied when importedURLs is not a subset of the
          filtered titles' "url" annotations;
        - sendButton is told that settings changed.
        """
        # Local import, so that this method does not depend on the
        # module-level import block.
        import copy

        # If titleSeg has not been loaded for some reason, skip.
        if self.titleSeg is None:
            return

        # A filter only applies in Advanced settings mode, for a value other
        # than the "(all)" placeholder.
        filterIsActive = (
            self.displayAdvancedSettings and self.filterValue != "(all)"
        )

        # In Advanced settings mode, get list of selected titles...
        if filterIsActive:
            # The filter value is matched as literal text, so it is escaped:
            # regex metacharacters in it (parentheses, dots...) must not be
            # interpreted as regex syntax.
            self.filteredTitleSeg, _ = Segmenter.select(
                segmentation=self.titleSeg,
                regex=re.compile(r"^%s$" % re.escape(self.filterValue)),
                annotation_key=self.filterCriterion,
            )
        else:
            self.filteredTitleSeg = self.titleSeg

        # Group the titles that only differ by genre, using the
        # (author, title) pair as key...
        unique_titles = dict()
        for title in self.filteredTitleSeg:
            title_id = (
                title.annotations["author"],
                title.annotations["title"],
            )
            unique_titles.setdefault(title_id, []).append(title)

        # Create one segment per group, annotated with all the genres of the
        # group. The first segment of the group is copied before its
        # annotation is rewritten: when no filter is active it belongs to
        # titleSeg, and rewriting it in place would make the merged genre
        # string pile up on every call.
        new_title_segments = list()
        for equivalent_titles in unique_titles.values():
            title_genres = {
                title.annotations["genre"] for title in equivalent_titles
            }
            merged_title = copy.deepcopy(equivalent_titles[0])
            merged_title.annotations["genre"] = ", ".join(
                sorted(title_genres))
            new_title_segments.append(merged_title)

        # Sort the segments by title (stable sort) before storing them, so
        # that positions in titleLabels and in filteredTitleSeg coincide.
        new_title_segments.sort(
            key=lambda segment: segment.annotations["title"])

        self.filteredTitleSeg = Segmentation(None)
        self.filteredTitleSeg.extend(new_title_segments)

        # Add specification (author and genre, depending on criterion): the
        # criterion used as active filter is left out.
        specKeys = [
            key for key in ("author", "genre")
            if not (filterIsActive and self.filterCriterion == key)
        ]

        # Populate titleLabels list with the titles and their specs...
        self.titleLabels = [
            segment.annotations["title"] + " (%s)" % "; ".join(
                segment.annotations[key] for key in specKeys
            )
            for segment in self.filteredTitleSeg
        ]

        # Reset selectedTitles if needed...
        if not set(self.importedURLs).issubset(
                set(u.annotations["url"] for u in self.filteredTitleSeg)):
            self.selectedTitles = list()
        else:
            # NOTE(review): self-assignment kept from the original code;
            # presumably it triggers a refresh of the bound GUI control --
            # confirm before removing.
            self.selectedTitles = self.selectedTitles

        self.sendButton.settingsChanged()
예제 #20
0
    def sendData(self):
        """(Have LTTL.Segmenter) perform the actual selection.

        Validates the current settings, runs the selection that matches them
        (Segmenter.select, Segmenter.sample or Segmenter.threshold in
        advanced mode, Segmenter.select in basic mode) and sends the result
        on the 'Selected data' and 'Discarded data' outputs.

        Whenever a check fails (no input, empty or invalid regex, sample
        size <= 0, missing auto-numbering key), a message is shown in
        infoBox, None is sent on both outputs and the method returns.
        """

        # Check that there's something on input...
        if not self.segmentation:
            self.infoBox.setText(u'Widget needs input.', 'warning')
            self.send('Selected data', None, self)
            self.send('Discarded data', None, self)
            return

        # TODO: remove message 'No label was provided.' from docs

        # Advanced settings...
        if self.displayAdvancedSettings:

            # If mode is Regex...
            if self.method == u'Regex':

                # Check that regex is not empty...
                if not self.regex:
                    self.infoBox.setText(u'Please enter a regex.', 'warning')
                    self.send('Selected data', None, self)
                    self.send('Discarded data', None, self)
                    return

                # Prepare regex...
                # Flags are handed to re.compile() rather than appended to
                # the pattern as an inline "(?...)" group: global inline
                # flags that are not at the start of the pattern are
                # rejected by Python 3.11+.
                flags = 0
                if self.ignoreCase:
                    flags |= re.IGNORECASE
                if self.unicodeDependent:
                    flags |= re.UNICODE
                if self.multiline:
                    flags |= re.MULTILINE
                if self.dotAll:
                    flags |= re.DOTALL
                try:
                    regex = re.compile(self.regex, flags)
                except re.error as re_error:
                    # The error may lack a 'msg' attribute, hence the
                    # fallback to a generic message.
                    try:
                        message = u'Please enter a valid regex (error: %s).' % \
                                  re_error.msg
                    except AttributeError:
                        message = u'Please enter a valid regex.'
                    self.infoBox.setText(message, 'error')
                    self.send('Selected data', None, self)
                    self.send('Discarded data', None, self)
                    return

                # Get number of iterations...
                num_iterations = len(self.segmentation)

            # Else if mode is Sample...
            elif self.method == u'Sample':

                # Get sample size...
                if self.sampleSizeMode == u'Proportion':
                    sampleSize = iround(
                        len(self.segmentation) * (self.samplingRate / 100))
                else:
                    sampleSize = self.sampleSize
                if sampleSize <= 0:
                    self.infoBox.setText(
                        message='Please enter a larger sample size',
                        state="error",
                    )
                    self.send('Selected data', None, self)
                    self.send('Discarded data', None, self)
                    return

                # Get number of iterations...
                num_iterations = len(self.segmentation)

            # Else if mode is Threshold...
            elif self.method == u'Threshold':

                # Get min and max count...
                if self.thresholdMode == u'Proportion':
                    minCount = iround(
                        math.ceil(
                            len(self.segmentation) *
                            (self.minProportion / 100)))
                    maxCount = iround(
                        math.floor(
                            len(self.segmentation) *
                            (self.maxProportion / 100)))
                else:
                    minCount = self.minCount
                    maxCount = self.maxCount
                # A threshold that is not applied is replaced by the bound
                # used below to detect that no filtering is needed.
                if not self.applyMinThreshold:
                    minCount = 1
                if not self.applyMaxThreshold:
                    maxCount = len(self.segmentation)

                # Get number of iterations...
                num_iterations = len(self.segmentation)

            # Check that autoNumberKey is not empty (if necessary)...
            if self.autoNumber:
                if self.autoNumberKey:
                    autoNumberKey = self.autoNumberKey
                else:
                    self.infoBox.setText(
                        u'Please enter an annotation key for auto-numbering.',
                        'warning')
                    self.send('Selected data', None, self)
                    self.send('Discarded data', None, self)
                    return
            else:
                autoNumberKey = None

            # Perform selection...
            self.infoBox.setText(u"Processing, please wait...", "warning")
            self.controlArea.setDisabled(True)
            progressBar = ProgressBar(self, iterations=num_iterations)
            if self.method == u'Regex':
                # u'(none)' is the placeholder for "no annotation key".
                regexAnnotationKeyParam = self.regexAnnotationKey
                if regexAnnotationKeyParam == u'(none)':
                    regexAnnotationKeyParam = None
                (selected_data, discarded_data) = Segmenter.select(
                    segmentation=self.segmentation,
                    regex=regex,
                    mode=self.regexMode.lower(),
                    annotation_key=regexAnnotationKeyParam or None,
                    label=self.captionTitle,
                    copy_annotations=self.copyAnnotations,
                    auto_number_as=autoNumberKey,
                    progress_callback=progressBar.advance,
                )
            elif self.method == u'Sample':
                (selected_data, discarded_data) = Segmenter.sample(
                    segmentation=self.segmentation,
                    sample_size=sampleSize,
                    mode='random',
                    label=self.captionTitle,
                    copy_annotations=self.copyAnnotations,
                    auto_number_as=autoNumberKey,
                    progress_callback=progressBar.advance,
                )
            elif self.method == u'Threshold':
                # With bounds of 1 and len(segmentation), the input is
                # passed through Segmenter.bypass and nothing is discarded.
                if ((minCount == 1 or not self.applyMinThreshold)
                        and (maxCount == len(self.segmentation)
                             or not self.applyMaxThreshold)):
                    selected_data = Segmenter.bypass(
                        segmentation=self.segmentation,
                        label=self.captionTitle,
                    )
                    discarded_data = None
                else:
                    # u'(none)' is the placeholder for "no annotation key".
                    thresholdAnnotationKeyParam = self.thresholdAnnotationKey
                    if thresholdAnnotationKeyParam == u'(none)':
                        thresholdAnnotationKeyParam = None
                    (selected_data, discarded_data) = Segmenter.threshold(
                        segmentation=self.segmentation,
                        annotation_key=(thresholdAnnotationKeyParam or None),
                        min_count=minCount,
                        max_count=maxCount,
                        label=self.captionTitle,
                        copy_annotations=self.copyAnnotations,
                        auto_number_as=autoNumberKey,
                        progress_callback=progressBar.advance,
                    )

        # Basic settings:
        else:

            # Check that regex is not empty...
            if not self.regex:
                self.infoBox.setText(u'Please enter a regex.', 'warning')
                self.send('Selected data', None, self)
                self.send('Discarded data', None, self)
                return

            # Get number of iterations...
            num_iterations = len(self.segmentation)

            # Perform selection...
            self.infoBox.setText(u"Processing, please wait...", "warning")
            self.controlArea.setDisabled(True)
            progressBar = ProgressBar(self, iterations=num_iterations)
            # u'(none)' is the placeholder for "no annotation key".
            regexAnnotationKeyParam = self.regexAnnotationKey
            if regexAnnotationKeyParam == u'(none)':
                regexAnnotationKeyParam = None
            try:
                # The UNICODE flag is passed to re.compile() instead of
                # being appended to the pattern as "(?u)": trailing global
                # inline flags are rejected by Python 3.11+.
                (selected_data, discarded_data) = Segmenter.select(
                    segmentation=self.segmentation,
                    regex=re.compile(self.regex, re.UNICODE),
                    mode=self.regexMode.lower(),
                    annotation_key=regexAnnotationKeyParam or None,
                    label=self.captionTitle,
                    copy_annotations=True,
                    auto_number_as=None,
                    progress_callback=progressBar.advance,
                )
            except re.error as re_error:
                try:
                    message = u'Please enter a valid regex (error: %s).' % \
                              re_error.msg
                except AttributeError:
                    message = u'Please enter a valid regex.'
                self.infoBox.setText(message, 'error')
                self.send('Selected data', None, self)
                self.send('Discarded data', None, self)
                # The progress bar and the disabled control area were set up
                # before compiling the regex: undo both before returning.
                progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        progressBar.finish()
        self.controlArea.setDisabled(False)

        # '@p' is resolved by pluralize() according to the segment count.
        message = u'%i segment@p sent to output.' % len(selected_data)
        message = pluralize(message, len(selected_data))
        self.infoBox.setText(message)

        self.send('Selected data', selected_data, self)
        self.send('Discarded data', discarded_data, self)
        self.sendButton.resetSettingsChangedFlag()
예제 #21
0
파일: test_cooc.py 프로젝트: axanthos/LTTL
    def setUp(self):
        """Prepare reference crosstabs and Processor outputs for the tests.

        For each prefix among window_woa, window_wa, context_wos_woa,
        context_wos_wa, context_ws_woa and context_ws_wa, the following
        attributes are set on self: <prefix>_row_ids, <prefix>_col_ids,
        <prefix>_values, <prefix>_header_row_id, <prefix>_header_row_type,
        <prefix>_header_col_id, <prefix>_header_col_type, <prefix>_col_type
        and <prefix>_ref (an IntPivotCrosstab built from the others).

        The matching outputs of Processor.cooc_in_window and
        Processor.cooc_in_context are stored as output_cooc_in_<prefix>.
        """
        def add_reference(prefix, row_ids, col_ids, rows_of_counts):
            """Set the self.<prefix>_* attributes and self.<prefix>_ref.

            rows_of_counts holds one list of counts per row id, with one
            count per column id.
            """
            values = {
                (row_id, col_id): count
                for row_id, counts in zip(row_ids, rows_of_counts)
                for col_id, count in zip(col_ids, counts)
            }
            # Listed in the positional order expected by IntPivotCrosstab.
            fields = [
                ('row_ids', row_ids),
                ('col_ids', col_ids),
                ('values', values),
                ('header_row_id', '__unit__'),
                ('header_row_type', 'string'),
                ('header_col_id', '__unit2__'),
                ('header_col_type', 'string'),
                ('col_type', dict.fromkeys(col_ids, 'continuous')),
            ]
            for suffix, value in fields:
                setattr(self, '%s_%s' % (prefix, suffix), value)
            reference = IntPivotCrosstab(*[value for _, value in fields])
            setattr(self, prefix + '_ref', reference)

        text = Input("un texte")
        words = Segmenter.tokenize(
            text,
            [(re.compile(r'\w+'), 'tokenize')],
            import_annotations=False,
        )
        # Every word character is annotated with type 'C', vowels with type
        # 'V'; both rules are applied with merge_duplicates=True.
        rule_c = (re.compile(r'\w'), 'tokenize', {'type': 'C'})
        rule_v = (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'})
        letters = Segmenter.tokenize(
            text,
            [rule_c, rule_v],
            import_annotations=False,
            merge_duplicates=True,
        )
        vowels, consonants = Segmenter.select(
            letters,
            re.compile(r'V'),
            annotation_key='type',
        )

        #  Cooccurrence in window, window_size=3, without annotation (woa):
        add_reference(
            'window_woa',
            list('untex'),
            list('untex'),
            [
                [1, 1, 1, 0, 0],
                [1, 2, 2, 1, 0],
                [1, 2, 5, 4, 3],
                [0, 1, 4, 4, 3],
                [0, 0, 3, 3, 3],
            ],
        )

        #  Cooccurrence in window, window_size=3, with annotation (wa):
        add_reference(
            'window_wa',
            ['C', 'V'],
            ['C', 'V'],
            [
                [5, 5],
                [5, 5],
            ],
        )

        # Cooccurrence in context, without the secondary unit (wos) and
        # without annotation (woa):
        add_reference(
            'context_wos_woa',
            list('untex'),
            list('untex'),
            [
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
            ],
        )

        # Cooccurrence in context, without the secondary unit (wos) and
        # with annotation (wa):
        add_reference(
            'context_wos_wa',
            ['V', 'C'],
            ['V', 'C'],
            [
                [2, 2],
                [2, 2],
            ],
        )

        # Cooccurrence in context, with the secondary unit (ws) and
        # without annotation (woa):
        add_reference(
            'context_ws_woa',
            ['n', 't', 'x'],
            ['u', 'e'],
            [
                [1, 0],
                [0, 1],
                [0, 1],
            ],
        )

        # Cooccurrence in context, with the secondary unit (ws) and
        # with annotation (wa):
        add_reference('context_ws_wa', ['C'], ['V'], [[2]])

        # Outputs to be compared with the references above.
        letter_units = {'segmentation': letters}
        typed_letter_units = {
            'segmentation': letters,
            'annotation_key': 'type',
        }
        self.output_cooc_in_window_woa = Processor.cooc_in_window(
            units=letter_units,
            window_size=3,
        )
        self.output_cooc_in_window_wa = Processor.cooc_in_window(
            units=typed_letter_units,
            window_size=3,
        )
        self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
            units={'segmentation': letters},
            contexts={'segmentation': words},
            units2=None,
        )
        self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
            units={'segmentation': letters, 'annotation_key': 'type'},
            contexts={'segmentation': words},
            units2=None,
        )
        self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
            units={'segmentation': vowels},
            contexts={'segmentation': words},
            units2={'segmentation': consonants},
        )
        self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
            units={'segmentation': vowels, 'annotation_key': 'type'},
            contexts={'segmentation': words},
            units2={'segmentation': consonants, 'annotation_key': 'type'},
        )
예제 #22
0
    def setUp(self):
        """Prepare reference crosstabs and the matching Processor outputs.

        The string "un texte" is tokenized into words and into letters (the
        two letter rules carry the 'type' annotations 'C' and 'V'); the
        letters are then split in two with Segmenter.select on the 'type'
        annotation.  For each cooccurrence scenario a hand-written
        IntPivotCrosstab is stored as ``self.<scenario>_ref`` and the result
        of the corresponding Processor call as ``self.output_cooc_in_...``.

        Scenario name fragments: woa/wa = without/with annotation,
        wos/ws = without/with secondary units.
        """

        def cell_counts(row_ids, col_ids, rows_of_counts):
            """Map every (row_id, col_id) pair to its count, row by row."""
            return {
                (row_id, col_id): count
                for row_id, counts in zip(row_ids, rows_of_counts)
                for col_id, count in zip(col_ids, counts)
            }

        self.maxDiff = None

        # Input segmentations.
        input_seg = Input("un texte")
        word_pattern = re.compile(r'\w+')
        any_letter_pattern = re.compile(r'\w')
        vowel_pattern = re.compile(r'[aeiouy]')
        word_seg = Segmenter.tokenize(
            input_seg,
            [(word_pattern, 'tokenize')],
            import_annotations=False,
        )
        letter_rules = [
            (any_letter_pattern, 'tokenize', {'type': 'C'}),
            (vowel_pattern, 'tokenize', {'type': 'V'}),
        ]
        letter_seg = Segmenter.tokenize(
            input_seg,
            letter_rules,
            import_annotations=False,
            merge_duplicates=True,
        )
        vowel_seg, consonant_seg = Segmenter.select(
            letter_seg,
            re.compile(r'V'),
            annotation_key='type',
        )

        # Reference: cooccurrence in window (window_size=3),
        # without annotation (woa).
        self.window_woa_row_ids = ['u', 'n', 't', 'e', 'x']
        self.window_woa_col_ids = ['u', 'n', 't', 'e', 'x']
        self.window_woa_values = cell_counts(
            self.window_woa_row_ids,
            self.window_woa_col_ids,
            [
                # u  n  t  e  x
                [1, 1, 1, 0, 0],  # u
                [1, 2, 2, 1, 0],  # n
                [1, 2, 5, 4, 3],  # t
                [0, 1, 4, 4, 3],  # e
                [0, 0, 3, 3, 3],  # x
            ],
        )
        self.window_woa_header_row_id = '__unit__'
        self.window_woa_header_row_type = 'string'
        self.window_woa_header_col_id = '__unit__'
        self.window_woa_header_col_type = 'string'
        self.window_woa_col_type = dict.fromkeys(
            self.window_woa_col_ids, 'continuous'
        )
        self.window_woa_ref = IntPivotCrosstab(
            self.window_woa_row_ids,
            self.window_woa_col_ids,
            self.window_woa_values,
            self.window_woa_header_row_id,
            self.window_woa_header_row_type,
            self.window_woa_header_col_id,
            self.window_woa_header_col_type,
            self.window_woa_col_type,
        )

        # Reference: cooccurrence in window (window_size=3),
        # with annotation (wa).
        self.window_wa_row_ids = ['C', 'V']
        self.window_wa_col_ids = ['C', 'V']
        self.window_wa_values = cell_counts(
            self.window_wa_row_ids,
            self.window_wa_col_ids,
            [
                # C  V
                [5, 5],  # C
                [5, 5],  # V
            ],
        )
        self.window_wa_header_row_id = '__unit__'
        self.window_wa_header_row_type = 'string'
        self.window_wa_header_col_id = '__unit__'
        self.window_wa_header_col_type = 'string'
        self.window_wa_col_type = dict.fromkeys(
            self.window_wa_col_ids, 'continuous'
        )
        self.window_wa_ref = IntPivotCrosstab(
            self.window_wa_row_ids,
            self.window_wa_col_ids,
            self.window_wa_values,
            self.window_wa_header_row_id,
            self.window_wa_header_row_type,
            self.window_wa_header_col_id,
            self.window_wa_header_col_type,
            self.window_wa_col_type,
        )

        # Reference: cooccurrence in context, without secondary units (wos)
        # and without annotation (woa).
        self.context_wos_woa_row_ids = ['u', 'n', 't', 'e', 'x']
        self.context_wos_woa_col_ids = ['u', 'n', 't', 'e', 'x']
        self.context_wos_woa_values = cell_counts(
            self.context_wos_woa_row_ids,
            self.context_wos_woa_col_ids,
            [
                # u  n  t  e  x
                [1, 1, 0, 0, 0],  # u
                [1, 1, 0, 0, 0],  # n
                [0, 0, 1, 1, 1],  # t
                [0, 0, 1, 1, 1],  # e
                [0, 0, 1, 1, 1],  # x
            ],
        )
        self.context_wos_woa_header_row_id = '__context__'
        self.context_wos_woa_header_row_type = 'string'
        self.context_wos_woa_header_col_id = '__context__'
        self.context_wos_woa_header_col_type = 'string'
        self.context_wos_woa_col_type = dict.fromkeys(
            self.context_wos_woa_col_ids, 'continuous'
        )
        self.context_wos_woa_ref = IntPivotCrosstab(
            self.context_wos_woa_row_ids,
            self.context_wos_woa_col_ids,
            self.context_wos_woa_values,
            self.context_wos_woa_header_row_id,
            self.context_wos_woa_header_row_type,
            self.context_wos_woa_header_col_id,
            self.context_wos_woa_header_col_type,
            self.context_wos_woa_col_type,
        )

        # Reference: cooccurrence in context, without secondary units (wos)
        # and with annotation (wa).
        self.context_wos_wa_row_ids = ['V', 'C']
        self.context_wos_wa_col_ids = ['V', 'C']
        self.context_wos_wa_values = cell_counts(
            self.context_wos_wa_row_ids,
            self.context_wos_wa_col_ids,
            [
                # V  C
                [2, 2],  # V
                [2, 2],  # C
            ],
        )
        self.context_wos_wa_header_row_id = '__context__'
        self.context_wos_wa_header_row_type = 'string'
        self.context_wos_wa_header_col_id = '__context__'
        self.context_wos_wa_header_col_type = 'string'
        self.context_wos_wa_col_type = dict.fromkeys(
            self.context_wos_wa_col_ids, 'continuous'
        )
        self.context_wos_wa_ref = IntPivotCrosstab(
            self.context_wos_wa_row_ids,
            self.context_wos_wa_col_ids,
            self.context_wos_wa_values,
            self.context_wos_wa_header_row_id,
            self.context_wos_wa_header_row_type,
            self.context_wos_wa_header_col_id,
            self.context_wos_wa_header_col_type,
            self.context_wos_wa_col_type,
        )

        # Reference: cooccurrence in context, with secondary units (ws)
        # and without annotation (woa).
        self.context_ws_woa_row_ids = ['n', 't', 'x']
        self.context_ws_woa_col_ids = ['u', 'e']
        self.context_ws_woa_values = cell_counts(
            self.context_ws_woa_row_ids,
            self.context_ws_woa_col_ids,
            [
                # u  e
                [1, 0],  # n
                [0, 1],  # t
                [0, 1],  # x
            ],
        )
        self.context_ws_woa_header_row_id = '__context__'
        self.context_ws_woa_header_row_type = 'string'
        self.context_ws_woa_header_col_id = '__context__'
        self.context_ws_woa_header_col_type = 'string'
        self.context_ws_woa_col_type = dict.fromkeys(
            self.context_ws_woa_col_ids, 'continuous'
        )
        self.context_ws_woa_ref = IntPivotCrosstab(
            self.context_ws_woa_row_ids,
            self.context_ws_woa_col_ids,
            self.context_ws_woa_values,
            self.context_ws_woa_header_row_id,
            self.context_ws_woa_header_row_type,
            self.context_ws_woa_header_col_id,
            self.context_ws_woa_header_col_type,
            self.context_ws_woa_col_type,
        )

        # Reference: cooccurrence in context, with secondary units (ws)
        # and with annotation (wa).
        self.context_ws_wa_row_ids = ['C']
        self.context_ws_wa_col_ids = ['V']
        self.context_ws_wa_values = cell_counts(
            self.context_ws_wa_row_ids,
            self.context_ws_wa_col_ids,
            [[2]],
        )
        self.context_ws_wa_header_row_id = '__context__'
        self.context_ws_wa_header_row_type = 'string'
        self.context_ws_wa_header_col_id = '__context__'
        self.context_ws_wa_header_col_type = 'string'
        self.context_ws_wa_col_type = dict.fromkeys(
            self.context_ws_wa_col_ids, 'continuous'
        )
        self.context_ws_wa_ref = IntPivotCrosstab(
            self.context_ws_wa_row_ids,
            self.context_ws_wa_col_ids,
            self.context_ws_wa_values,
            self.context_ws_wa_header_row_id,
            self.context_ws_wa_header_row_type,
            self.context_ws_wa_header_col_id,
            self.context_ws_wa_header_col_type,
            self.context_ws_wa_col_type,
        )

        # Actual outputs, one per reference crosstab above.  Each call gets
        # freshly built argument dicts.
        self.output_cooc_in_window_woa = Processor.cooc_in_window(
            units={'segmentation': letter_seg},
            window_size=3,
        )
        self.output_cooc_in_window_wa = Processor.cooc_in_window(
            units={'segmentation': letter_seg, 'annotation_key': 'type'},
            window_size=3,
        )
        self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
            units={'segmentation': letter_seg},
            contexts={'segmentation': word_seg},
            units2=None,
        )
        self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
            units={'segmentation': letter_seg, 'annotation_key': 'type'},
            contexts={'segmentation': word_seg},
            units2=None,
        )
        self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
            units={'segmentation': vowel_seg},
            contexts={'segmentation': word_seg},
            units2={'segmentation': consonant_seg},
        )
        self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
            units={'segmentation': vowel_seg, 'annotation_key': 'type'},
            contexts={'segmentation': word_seg},
            units2={'segmentation': consonant_seg, 'annotation_key': 'type'},
        )