def test_recode_overlapping_segmentation(self):
     """Check that recode raises ValueError for self.overlapping_seg."""
     failure_note = (
         "recode doesn't raise exception for overlapping segmentation!"
     )
     with self.assertRaises(ValueError, msg=failure_note):
         Segmenter.recode(self.overlapping_seg)
 def test_import_xml_exception_missing_opening(self):
     """Check that import_xml raises ValueError for self.wrong_xml_seg2."""
     failure_note = "import_xml doesn't detect missing opening tag!"
     with self.assertRaises(ValueError, msg=failure_note):
         Segmenter.import_xml(self.wrong_xml_seg2, element='a')
 def test_tokenize_exception_mode(self):
     """Check that tokenize raises ValueError for the mode 'unknown_mode'."""
     failure_note = "tokenize doesn't raise exception for unknown mode!"
     bad_rules = [(re.compile(r'\W+'), 'unknown_mode')]
     with self.assertRaises(ValueError, msg=failure_note):
         Segmenter.tokenize(self.entire_text_seg, bad_rules)
 def test_auto_number_autonumber(self):
     """Check that _auto_number stores 1, 2, 3 under 'num' in place."""
     Segmenter._auto_number(self.third_letter_seg, annotation_key='num')
     numbers = [
         segment.annotations['num'] for segment in self.third_letter_seg
     ]
     self.assertEqual(
         numbers,
         [1, 2, 3],
         msg="_auto_number doesn't autonumber in place!",
     )
 def test_sample_exception_mode(self):
     """Check that sample raises ValueError for the mode 'unknown_mode'."""
     failure_note = "sample doesn't raise exception for unknown mode!"
     with self.assertRaises(ValueError, msg=failure_note):
         Segmenter.sample(
             self.entire_text_seg, sample_size=3, mode='unknown_mode'
         )
 def test_parse_xml_tag_is_opening(self):
     """Check the 'is_opening' flag _parse_xml_tag reports for three tags."""
     samples = ('<a>', '<a attr="1"/>', '</a>')
     # Parse every sample first, then read the flag from each result.
     parsed = [Segmenter._parse_xml_tag(sample) for sample in samples]
     flags = [entry['is_opening'] for entry in parsed]
     self.assertEqual(
         flags,
         [True, True, False],
         msg="_parse_xml_tag doesn't recognize opening tags!",
     )
 def test_parse_xml_tag_element_name(self):
     """Check that _parse_xml_tag reports element 'a' for four tag forms."""
     samples = ('<a>', '<a attr="1">', '</a>', '<a/>')
     # Parse every sample first, then read the element name from each.
     parsed = [Segmenter._parse_xml_tag(sample) for sample in samples]
     names = [entry['element'] for entry in parsed]
     self.assertEqual(
         names,
         ['a', 'a', 'a', 'a'],
         msg="_parse_xml_tag doesn't parse element name!",
     )
 def test_parse_xml_tag_is_empty(self):
     """Check the 'is_empty' flag _parse_xml_tag reports for four tags."""
     samples = ('<a>', '</a>', '<a/>', '<a attr="1"/>')
     # Parse every sample first, then read the flag from each result.
     parsed = [Segmenter._parse_xml_tag(sample) for sample in samples]
     flags = [entry['is_empty'] for entry in parsed]
     self.assertEqual(
         flags,
         [False, False, True, True],
         msg="_parse_xml_tag doesn't recognize empty elements!",
     )
    def test_import_xml_progress(self):
        """Check import_xml's callback count against the input length.

        self.count is incremented by the callback; it is expected to be
        initialised outside this method.
        """
        def bump():
            """Record one progress notification."""
            self.count += 1

        Segmenter.import_xml(
            self.broken_xml_seg, element='a', progress_callback=bump
        )
        observed = self.count
        expected = len(self.broken_xml_seg)
        self.assertEqual(
            observed, expected, msg="import_xml doesn't track progress!"
        )
    def test_tokenize_progress(self):
        """Check tokenize's callback count against the input length.

        self.count is incremented by the callback; it is expected to be
        initialised outside this method.
        """
        def bump():
            """Record one progress notification."""
            self.count += 1

        rules = [(re.compile(r'\w'), 'tokenize')]
        Segmenter.tokenize(self.word_seg, rules, progress_callback=bump)
        observed = self.count
        expected = len(self.word_seg)
        self.assertEqual(
            observed, expected, msg="tokenize doesn't track progress!"
        )
    def huntTheLexic(self):
        """Label the input segments by lexical field and build the output.

        For every lexical field the user selected, the input segmentation
        is passed to ``Segmenter.select`` with the pattern returned by
        ``self.listToRegex`` for that field's non-empty entries, and the
        selection is labelled with the field name. ``self.outputSeg`` is
        then set to the concatenation of a bypassed copy of the input
        (labelled ``"__None__"``) and those selections, with duplicates
        merged and labels imported under ``self.labelName`` (``"Topic"``
        when that is the empty string).
        """
        # Names of the selected fields. The label list is materialised once
        # here rather than once per selected index.
        selectedListsNames = []
        if self.titleLabels:
            titles = list(self.titleLabels)
            selectedListsNames = [titles[idx] for idx in self.selectedFields]

        # Entries of the selected fields, keyed by field name.
        selectedLists = {
            key: value
            for key, value in defaultDict.items() if key in selectedListsNames
        }

        # One labelled selection per field that has at least one usable entry.
        out = []
        if self.inputSeg is not None:
            for field_name, entries in selectedLists.items():
                work_list = [entry for entry in entries if entry]
                if work_list:
                    out.append(
                        Segmenter.select(
                            self.inputSeg,
                            self.listToRegex(work_list),
                            label=field_name,
                        )[0])

        # Annotation key under which the labels are imported.
        labelNameVar = "Topic" if self.labelName == "" else self.labelName

        self.outputSeg = Segmenter.concatenate(
            [Segmenter.bypass(self.inputSeg, label="__None__")] + out,
            merge_duplicates=True,
            label=self.captionTitle,
            import_labels_as=labelNameVar,
        )
Beispiel #12
0
    def test_threshold_progress(self):
        """Check threshold's callback count against the input length.

        self.count is incremented by the callback; it is expected to be
        initialised outside this method.
        """
        def bump():
            """Record one progress notification."""
            self.count += 1

        Segmenter.threshold(
            self.other_letter_seg,
            min_count=2,
            max_count=2,
            progress_callback=bump,
        )
        observed = self.count
        expected = len(self.other_letter_seg)
        self.assertEqual(
            observed, expected, msg="threshold doesn't track progress!"
        )
Beispiel #13
0
    def test_sample_progress(self):
        """Check sample's callback count against the input length.

        self.count is incremented by the callback; it is expected to be
        initialised outside this method.
        """
        def bump():
            """Record one progress notification."""
            self.count += 1

        Segmenter.sample(
            self.char_seg,
            sample_size=4,
            mode='random',
            progress_callback=bump,
        )
        observed = self.count
        expected = len(self.char_seg)
        self.assertEqual(
            observed, expected, msg="sample doesn't track progress!"
        )
    def sendData(self):
        """Emit the bypassed and displayed segmentations on the outputs.

        With no input segmentation, a warning is shown and None is sent on
        both channels. Otherwise a bypassed copy is always sent; the
        displayed segmentation is sent only when no 'format' message is
        pending and its first segment has non-empty content.
        """
        if not self.segmentation:
            self.infoBox.setText(u'Widget needs input.', 'warning')
            for channel in ('Bypassed segmentation',
                            'Displayed segmentation'):
                self.send(channel, None, self)
            return

        bypassed = Segmenter.bypass(self.segmentation, self.captionTitle)
        self.send('Bypassed segmentation', bypassed, self)

        # TODO: Check if this is correct replacement for textable v1.*, v2.*
        format_issue = (
            'format' in self._currentWarningMessage
            or 'format' in self._currentErrorMessage
        )
        if format_issue:
            self.send('Displayed segmentation', None, self)
            return

        # Only forward the displayed segmentation when it has visible content.
        first_content = self.displayedSegmentation[0].get_content()
        displayed = (
            self.displayedSegmentation if len(first_content) > 0 else None
        )
        self.send('Displayed segmentation', displayed, self)

        # TODO: Differs only in capitalization from the check above.
        #       Is this intentional?
        if "Format" not in self._currentErrorMessage:
            template = u'%i segment@p sent to output.'
            message = template % len(self.segmentation)
            self.infoBox.setText(pluralize(message, len(self.segmentation)))
        self.sendButton.resetSettingsChangedFlag()
Beispiel #15
0
 def test_select_annotations(self):
     """Check that select with annotation_key='a' keeps only 'ab'."""
     kept, _ = Segmenter.select(
         self.word_seg,
         re.compile(r'.'),
         annotation_key='a',
     )
     contents = [segment.get_content() for segment in kept]
     self.assertEqual(
         contents,
         ['ab'],
         msg="select doesn't work with annotations!",
     )
Beispiel #16
0
 def test_threshold_select_no_min_no_max(self):
     """Check threshold's selection when no min/max count is passed."""
     kept, _ = Segmenter.threshold(self.other_letter_seg)
     contents = [segment.get_content() for segment in kept]
     expected = ['a', 'b', 'b', 'c', 'c', 'c']
     self.assertEqual(
         contents,
         expected,
         msg="threshold doesn't select segments (no min, no max)!",
     )
    def test_concatenate_progress(self):
        """Check concatenate's callback count against the input length.

        self.count is incremented by the callback; it is expected to be
        initialised outside this method.
        """
        def bump():
            """Record one progress notification."""
            self.count += 1

        Segmenter.concatenate([self.letter_seg1], progress_callback=bump)
        observed = self.count
        expected = len(self.letter_seg1)
        self.assertEqual(
            observed, expected, msg="concatenate doesn't track progress!"
        )
Beispiel #18
0
 def test_intersect_autonumber(self):
     """Check that intersect numbers its output 1, 2 under 'num'."""
     kept, _ = Segmenter.intersect(
         source=self.letter_seg,
         filtering=self.third_letter_seg,
         auto_number_as='num',
     )
     numbers = [segment.annotations['num'] for segment in kept]
     self.assertEqual(
         numbers,
         [1, 2],
         msg="intersect doesn't autonumber input segments!",
     )
Beispiel #19
0
 def test_import_xml_autonumber(self):
     """Check that import_xml numbers its output 1, 2 under 'num'."""
     imported = Segmenter.import_xml(
         self.xml_seg,
         element='a',
         auto_number_as='num',
     )
     numbers = [segment.annotations['num'] for segment in imported]
     self.assertEqual(
         numbers,
         [1, 2],
         msg="import_xml doesn't autonumber input segments!",
     )
    def send_data(self):
        """Build one Input per fetched text, annotate it and send the result.

        Every text in ``self.queryList`` becomes an ``Input`` stored in
        ``self.createdInputs``. A single input is sent as is; otherwise the
        inputs are concatenated. Each segment is then updated with the
        annotation dict at the same position in the flattened
        ``self.annotList``. If the result is empty, a warning is shown and
        None is sent instead.

        The control area is disabled while this runs and is always
        re-enabled on exit, including when an exception is raised (for
        example IndexError when there are fewer annotation dicts than
        segments), so a failure no longer leaves the widget locked.
        """
        self.controlArea.setDisabled(True)
        try:
            self.clearCreatedInputs()

            # One Input per text of every query in the basket.
            for query in self.queryList:
                for text in query:
                    self.createdInputs.append(Input(text))

            # A lone input is its own segmentation; otherwise concatenate.
            if len(self.createdInputs) == 1:
                segmentation = self.createdInputs[0]
            else:
                segmentation = Segmenter.concatenate(
                    self.createdInputs,
                    import_labels_as=None,
                )

            # Flatten the per-query annotation lists, then annotate by index.
            annotations = [dic for elem in self.annotList for dic in elem]
            for idx, segment in enumerate(segmentation):
                segment.annotations.update(annotations[idx])
                # Write-back kept from the original code; presumably needed
                # because iteration may yield copies -- TODO confirm.
                segmentation[idx] = segment

            # Total number of characters across all segments.
            num_chars = sum(
                len(Segmentation.get_data(segment.str_index))
                for segment in segmentation
            )

            if len(segmentation) != 0:
                self.infoBox.setText(
                    "{} segments sent to output ({} characters)".format(
                        len(segmentation),
                        num_chars,
                    ))
                self.send("Segmentation", segmentation)
            else:
                # Nothing to send: tell the user and clear the output.
                self.infoBox.setText(
                    "There are {} segments to send to output. Please fill the query basket and click 'send' again"
                    .format(len(segmentation)), "warning")
                self.send("Segmentation", None)
            self.sendButton.resetSettingsChangedFlag()
        finally:
            # Never leave the widget locked, whatever happened above.
            self.controlArea.setDisabled(False)
Beispiel #21
0
 def test_tokenize_import_annotations_false_split(self):
     """Check 'a' is absent from the first token (split, no import)."""
     tokens = Segmenter.tokenize(
         self.word_seg,
         [(re.compile(r'a'), 'split')],
         import_annotations=False,
     )
     has_key = 'a' in tokens[0].annotations
     self.assertFalse(
         has_key,
         msg="tokenize doesn't skip importing annotations (mode split)!",
     )
 def test_parse_xml_tag_attributes(self):
     """Check the attribute dict _parse_xml_tag builds for a two-attribute tag."""
     parsed = Segmenter._parse_xml_tag('<a attr1="2" attr3="4">')
     expected = {'attr1': '2', 'attr3': '4'}
     self.assertEqual(
         parsed['attributes'],
         expected,
         msg="_parse_xml_tag doesn't parse attributes!",
     )
Beispiel #23
0
 def test_select_select(self):
     r"""Check that select with the pattern \w{3,} keeps only 'cde'."""
     kept, _ = Segmenter.select(self.word_seg, re.compile(r'\w{3,}'))
     contents = [segment.get_content() for segment in kept]
     self.assertEqual(
         contents,
         ['cde'],
         msg="select doesn't select segments!",
     )
    def test_select_progress(self):
        """Check select's callback count against the input length.

        self.count is incremented by the callback; it is expected to be
        initialised outside this method.
        """
        def bump():
            """Record one progress notification."""
            self.count += 1

        Segmenter.select(
            self.char_seg, re.compile(r'.'), progress_callback=bump
        )
        observed = self.count
        expected = len(self.char_seg)
        self.assertEqual(
            observed, expected, msg="select doesn't track progress!"
        )
    def test_intersect_progress(self):
        """Check intersect's callback count against the source length.

        self.count is incremented by the callback; it is expected to be
        initialised outside this method.
        """
        def bump():
            """Record one progress notification."""
            self.count += 1

        Segmenter.intersect(
            source=self.letter_seg,
            filtering=self.third_letter_seg,
            progress_callback=bump,
        )
        observed = self.count
        expected = len(self.letter_seg)
        self.assertEqual(
            observed, expected, msg="intersect doesn't track progress!"
        )
Beispiel #26
0
 def test_select_autonumber(self):
     """Check that select numbers its output 1 to 6 under 'num'."""
     kept, _ = Segmenter.select(
         self.char_seg,
         re.compile(r'.'),
         auto_number_as='num',
     )
     numbers = [segment.annotations['num'] for segment in kept]
     self.assertEqual(
         numbers,
         [1, 2, 3, 4, 5, 6],
         msg="select doesn't autonumber input segments!",
     )
 def test_parse_xml_tag_attributes(self):
     """Check that _parse_xml_tag returns both attributes of the tag."""
     parsed = Segmenter._parse_xml_tag('<a attr1="2" attr3="4">')
     found = parsed['attributes']
     self.assertEqual(
         found,
         {'attr1': '2', 'attr3': '4'},
         msg="_parse_xml_tag doesn't parse attributes!",
     )
 def test_bypass_copy_segments(self):
     """Check that bypass output has the same contents as its input."""
     copied = Segmenter.bypass(self.letter_seg)
     copied_contents = [segment.get_content() for segment in copied]
     source_contents = [
         segment.get_content() for segment in self.letter_seg
     ]
     self.assertEqual(
         copied_contents,
         source_contents,
         msg="bypass doesn't copy input segments!",
     )
    def test_tokenize_progress(self):
        """Check that tokenize reports one callback per input segment.

        The callback increments self.count, which is expected to be
        initialised outside this method.
        """
        def tick():
            """Count one callback invocation."""
            self.count += 1

        token_rules = [(re.compile(r'\w'), 'tokenize')]
        Segmenter.tokenize(
            self.word_seg, token_rules, progress_callback=tick
        )
        calls = self.count
        size = len(self.word_seg)
        self.assertEqual(calls, size, msg="tokenize doesn't track progress!")
    def test_import_xml_progress(self):
        """Check that import_xml reports one callback per input segment.

        The callback increments self.count, which is expected to be
        initialised outside this method.
        """
        def tick():
            """Count one callback invocation."""
            self.count += 1

        Segmenter.import_xml(
            self.broken_xml_seg, element='a', progress_callback=tick
        )
        calls = self.count
        size = len(self.broken_xml_seg)
        self.assertEqual(
            calls, size, msg="import_xml doesn't track progress!"
        )
 def test_bypass_deepcopy(self):
     """Check that bypass output does not compare equal to its input."""
     copied = Segmenter.bypass(self.letter_seg)
     self.assertNotEqual(
         copied,
         self.letter_seg,
         msg="bypass doesn't deep copy input segments!",
     )
 def test_bypass_copy_annotations(self):
     """Check that bypass output carries the input's 'a' annotations."""
     copied = Segmenter.bypass(self.other_letter_seg)
     copied_values = [segment.annotations['a'] for segment in copied]
     source_values = [
         segment.annotations['a'] for segment in self.other_letter_seg
     ]
     self.assertEqual(
         copied_values,
         source_values,
         msg="bypass doesn't copy annotations!",
     )
Beispiel #33
0
    def sendData(self):
        """Transcribe the selected audio file and send the text to output.

        Sends None on the 'Text data' channel, with a warning in the info
        box, when no file is selected, when the recogniser raises
        ``speech_recognition.UnknownValueError``, or when
        ``get_large_audio_transcription`` returns None. Otherwise one
        ``Input`` is created per transcription chunk (when
        ``self.selected_seg`` is set) or a single ``Input`` for the whole
        transcription, each labelled with the audio file's base name, and
        their concatenation is sent.
        """
        # Local import: only needed to derive the label below.
        import os

        if not self.file:
            self.infoBox.setText(u"Please select input file.", "warning")
            self.send('Text data', None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        # Get transcription
        try:
            transcription = self.get_large_audio_transcription(
                self.file,
                language=self.language,
                set_silence_len=self.selected_dur,
                set_silence_threshold=self.selected_vol)
        except speech_recognition.UnknownValueError:
            self.infoBox.setText(
                u"You seem to have overuseed the built-in API key, refer to the documentation for further informations.",
                "warning")
            self.send('Text data', None, self)
            return

        # Checks if there is a transcription
        if transcription is None:
            self.infoBox.setText(u"You must use mp3 or wav audio files.",
                                 "warning")
            self.send('Text data', None, self)
            return

        # Label every Input with the file name (no directories). The former
        # regex used "[mp3|wav]" as if it were an alternation, although it
        # is a one-character class, and passed the list returned by
        # re.findall as the label instead of a string.
        file_label = os.path.basename(self.file)

        if self.selected_seg:
            for chunk in transcription:
                self.createdInputs.append(Input(chunk, label=file_label))
        else:
            self.createdInputs.append(Input(transcription, label=file_label))

        # Concatenates the segmentations in the output segmentation
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=False,
            import_labels_as="")

        # Report the number of segments sent
        message = " Succesfully transcripted ! % i segment@p sent to output" % len(
            self.segmentation)
        message = pluralize(message, len(self.segmentation))
        # Send token...
        self.send("Text data", self.segmentation, self)
        self.infoBox.setText(message)
        self.sendButton.resetSettingsChangedFlag()
    def test_sample_progress(self):
        """Check that sample reports one callback per input segment.

        The callback increments self.count, which is expected to be
        initialised outside this method.
        """
        def tick():
            """Count one callback invocation."""
            self.count += 1

        Segmenter.sample(
            self.char_seg,
            sample_size=4,
            mode='random',
            progress_callback=tick,
        )
        calls = self.count
        size = len(self.char_seg)
        self.assertEqual(calls, size, msg="sample doesn't track progress!")
    def test_threshold_progress(self):
        """Check that threshold reports one callback per input segment.

        The callback increments self.count, which is expected to be
        initialised outside this method.
        """
        def tick():
            """Count one callback invocation."""
            self.count += 1

        Segmenter.threshold(
            self.other_letter_seg,
            min_count=2,
            max_count=2,
            progress_callback=tick,
        )
        calls = self.count
        size = len(self.other_letter_seg)
        self.assertEqual(
            calls, size, msg="threshold doesn't track progress!"
        )
 def test_recode_no_change(self):
     """Check that recode with no options returns a Segmentation."""
     result = Segmenter.recode(self.entire_text_seg)
     is_segmentation = isinstance(result, Segmentation)
     self.assertTrue(
         is_segmentation,
         msg="recode doesn't return a Segmentation when no change is made!",
     )
Beispiel #37
0
 def test_tokenize_import_annotations_tokenize(self):
     """Check the first token's 'a' annotation is '1' (mode tokenize)."""
     tokens = Segmenter.tokenize(
         self.word_seg,
         [(re.compile(r'\w{2}'), 'tokenize')],
         import_annotations=True,
     )
     first_value = tokens[0].annotations['a']
     self.assertEqual(
         first_value,
         '1',
         msg="tokenize doesn't import annotations (mode tokenize)!",
     )
Beispiel #38
0
 def test_sample_systematic_sample(self):
     """Check that a systematic sample of size 3 starts at 0, 2 and 4."""
     sampled, _ = Segmenter.sample(
         self.char_seg, sample_size=3, mode='systematic'
     )
     starts = [segment.start for segment in sampled]
     self.assertEqual(
         starts,
         [0, 2, 4],
         msg="sample doesn't systematically sample segments!",
     )
Beispiel #39
0
 def test_recode_remove_accents(self):
     """Check that recode with remove_accents=True yields 'AB' and 'cde'."""
     recoded, _ = Segmenter.recode(
         self.second_word_seg, remove_accents=True
     )
     contents = [segment.get_content() for segment in recoded]
     self.assertEqual(
         contents,
         ['AB', 'cde'],
         msg="recode doesn't remove accents!",
     )
 def test_import_xml_condition(self):
     """Check import_xml keeps only elements whose attr matches ^2$."""
     attr_filter = {'attr': re.compile(r'^2$')}
     imported = Segmenter.import_xml(
         self.xml_seg, element='a', conditions=attr_filter
     )
     values = [segment.annotations['attr'] for segment in imported]
     self.assertEqual(
         values,
         ['2'],
         msg="import_xml doesn't respect conditions!",
     )
Beispiel #41
0
 def test_select_mode(self):
     """Check that select in mode 'exclude' keeps only 'ab'."""
     kept, _ = Segmenter.select(
         self.word_seg, re.compile(r'\w{3,}'), mode="exclude"
     )
     contents = [segment.get_content() for segment in kept]
     self.assertEqual(
         contents,
         ['ab'],
         msg="select doesn't respect mode setting!",
     )
Beispiel #42
0
 def test_select_import_annotations_false(self):
     """Check 'a' is absent from the first selected segment (no copy)."""
     kept, _ = Segmenter.select(
         self.word_seg, re.compile(r'\w+'), copy_annotations=False
     )
     has_key = 'a' in kept[0].annotations
     self.assertFalse(
         has_key,
         msg="select doesn't skip importing annotations!",
     )
Beispiel #43
0
 def test_sample_autonumber(self):
     """Check that sample numbers its output 1 to 4 under 'num'."""
     sampled, _ = Segmenter.sample(
         self.char_seg,
         sample_size=4,
         mode='random',
         auto_number_as='num',
     )
     numbers = [segment.annotations['num'] for segment in sampled]
     self.assertEqual(
         numbers,
         [1, 2, 3, 4],
         msg="sample doesn't autonumber input segments!",
     )
Beispiel #44
0
 def test_select_select_neg(self):
     """Check that select's second return value contains only 'ab'."""
     _, rejected = Segmenter.select(self.word_seg, re.compile(r'\w{3,}'))
     contents = [segment.get_content() for segment in rejected]
     self.assertEqual(
         contents,
         ['ab'],
         msg="select doesn't output complementary segmentation!",
     )
Beispiel #45
0
 def test_import_xml_segment_elements(self):
     """Check the contents import_xml extracts for element 'a'."""
     imported = Segmenter.import_xml(self.xml_seg, element='a')
     contents = [segment.get_content() for segment in imported]
     expected = ['<a attr="2/3/">c<a/>d</a>', 'c<a/>d']
     self.assertEqual(
         contents,
         expected,
         msg="import_xml doesn't segment xml elements!",
     )
Beispiel #46
0
 def test_import_xml_convert_attributes(self):
     """Check the 'attr' annotations import_xml sets for element 'a'."""
     imported = Segmenter.import_xml(self.xml_seg, element='a')
     values = [segment.annotations['attr'] for segment in imported]
     self.assertEqual(
         values,
         ['1', '2/3/'],
         msg="import_xml doesn't convert attributes!",
     )
Beispiel #47
0
 def test_concatenate_merge_segments(self):
     """Check the contents of letter_seg2 and letter_seg1 concatenated."""
     inputs = [self.letter_seg2, self.letter_seg1]
     merged = Segmenter.concatenate(inputs)
     contents = [segment.get_content() for segment in merged]
     self.assertEqual(
         contents,
         ['a', 'b', 'c', 'd', 'e'],
         msg="concatenate doesn't merge input segments!",
     )
Beispiel #48
0
 def test_sample_import_annotations_false(self):
     """Check 'b' is absent from the sampled segment (copy_annotations=False).

     The failure message now says that sample does not *skip* importing
     annotations, which is what a failed assertFalse means here; the
     previous wording stated the opposite.
     """
     sampled, _ = Segmenter.sample(
         self.single_letter_seg,
         sample_size=1,
         copy_annotations=False,
     )
     self.assertFalse(
         'b' in sampled[0].annotations,
         msg="sample doesn't skip importing annotations!",
     )
Beispiel #49
0
 def test_import_xml_import_annotations_false(self):
     """Check 'a' is absent from the first imported segment (no import)."""
     imported = Segmenter.import_xml(
         self.broken_xml_seg, element='a', import_annotations=False
     )
     has_key = 'a' in imported[0].annotations
     self.assertFalse(
         has_key,
         msg="import_xml doesn't skip importing annotations!",
     )
Beispiel #50
0
 def test_import_xml_import_element_as_annotation(self):
     """Check import_xml stores the element name under key 'test'."""
     imported = Segmenter.import_xml(
         self.xml_seg, element='a', import_element_as='test'
     )
     values = [segment.annotations['test'] for segment in imported]
     self.assertEqual(
         values,
         ['a', 'a'],
         msg="import_xml doesn't import element as annotation!",
     )
 def test_recode_segmentation_as_input(self):
     """Check that recode(case='upper') on letter_seg returns a Segmentation."""
     result = Segmenter.recode(self.letter_seg, case='upper')
     is_segmentation = isinstance(result, Segmentation)
     self.assertTrue(
         is_segmentation,
         msg="recode doesn't return a Segmentation when input is one!",
     )
 def test_threshold_select_no_min_no_max(self):
     """Check what threshold keeps when called without count bounds."""
     selected, _ = Segmenter.threshold(self.other_letter_seg)
     found = [segment.get_content() for segment in selected]
     self.assertEqual(
         found,
         ['a', 'b', 'b', 'c', 'c', 'c'],
         msg="threshold doesn't select segments (no min, no max)!",
     )
 def test_recode_single_input(self):
     """Check that recode(case='upper') on entire_text_seg returns an Input."""
     result = Segmenter.recode(self.entire_text_seg, case='upper')
     is_input = isinstance(result, Input)
     self.assertTrue(
         is_input,
         msg="recode doesn't return a single Input object when needed!",
     )
 def test_merge_duplicate_segments(self):
     """Check that _merge_duplicate_segments reduces duplicate_seg to 'a'."""
     merged = Segmenter._merge_duplicate_segments(self.duplicate_seg)
     contents = [segment.get_content() for segment in merged]
     self.assertEqual(
         contents,
         ['a'],
         msg="_merge_duplicate_segments doesn't merge duplicates!",
     )
 def test_intersect_neg(self):
     """Check that intersect's second return value spells 'ace'."""
     _, rejected = Segmenter.intersect(
         source=self.letter_seg, filtering=self.third_letter_seg
     )
     joined = ''.join(segment.get_content() for segment in rejected)
     self.assertEqual(
         joined,
         'ace',
         msg="intersect doesn't output complementary segmentation!",
     )
 def test_intersect_content_content(self):
     """Check that intersect's first return value spells 'bd'."""
     kept, _ = Segmenter.intersect(
         source=self.letter_seg, filtering=self.third_letter_seg
     )
     joined = ''.join(segment.get_content() for segment in kept)
     self.assertEqual(
         joined,
         'bd',
         msg="intersect doesn't filter segments (content content)!",
     )
 def test_select_select_neg(self):
     r"""Check that the segments not matching \w{3,} are exactly 'ab'."""
     pattern = re.compile(r'\w{3,}')
     _, complement = Segmenter.select(self.word_seg, pattern)
     found = [segment.get_content() for segment in complement]
     self.assertEqual(
         found,
         ['ab'],
         msg="select doesn't output complementary segmentation!",
     )
 def test_sample_import_annotations_false(self):
     """Check 'b' is absent from the sampled segment (copy_annotations=False).

     The failure message now says that sample does not *skip* importing
     annotations, which is what a failed assertFalse means here; the
     previous wording stated the opposite.
     """
     segmentation, _ = Segmenter.sample(
         self.single_letter_seg,
         sample_size=1,
         copy_annotations=False,
     )
     self.assertFalse(
         'b' in segmentation[0].annotations,
         msg="sample doesn't skip importing annotations!"
     )
 def test_select_import_annotations_false(self):
     """Check that copy_annotations=False leaves 'a' off the first segment."""
     pattern = re.compile(r'\w+')
     selected, _ = Segmenter.select(
         self.word_seg, pattern, copy_annotations=False
     )
     key_present = 'a' in selected[0].annotations
     self.assertFalse(
         key_present,
         msg="select doesn't skip importing annotations!",
     )
Beispiel #60
0
 def test_get_real_str_index_recoded(self):
     """Check get_real_str_index() on the last segment of a recoded char_seg.

     The value is compared with the str_index of the first segment of the
     original self.char_seg.
     """
     replacements = [(re.compile(r'[bd]'), 'f')]
     recoded_seg, _ = Segmenter.recode(
         self.char_seg, substitutions=replacements
     )
     observed = recoded_seg[-1].get_real_str_index()
     expected = self.char_seg[0].str_index
     self.assertEqual(
         observed,
         expected,
         msg="get_real_str_index() doesn't work with redirected str index!",
     )