def test_recode_overlapping_segmentation(self):
    """Does recode raise exception for overlapping segmentation?"""
    failure_msg = (
        "recode doesn't raise exception for overlapping segmentation!"
    )
    # The test passes only if the call below raises ValueError.
    with self.assertRaises(ValueError, msg=failure_msg):
        Segmenter.recode(self.overlapping_seg)
def test_import_xml_exception_missing_opening(self):
    """Does import_xml detect missing opening tag?"""
    failure_msg = "import_xml doesn't detect missing opening tag!"
    # The test passes only if the call below raises ValueError.
    with self.assertRaises(ValueError, msg=failure_msg):
        Segmenter.import_xml(self.wrong_xml_seg2, element='a')
def test_tokenize_exception_mode(self):
    """Does tokenize raise exception for unknown mode?"""
    # A single (regex, mode) rule whose mode string is 'unknown_mode'.
    bad_rules = [(re.compile(r'\W+'), 'unknown_mode')]
    failure_msg = "tokenize doesn't raise exception for unknown mode!"
    with self.assertRaises(ValueError, msg=failure_msg):
        Segmenter.tokenize(self.entire_text_seg, bad_rules)
def test_auto_number_autonumber(self):
    """Does _auto_number autonumber in place?"""
    Segmenter._auto_number(self.third_letter_seg, annotation_key='num')
    # The numbers are read back from the fixture itself, not a return value.
    numbers = [seg.annotations['num'] for seg in self.third_letter_seg]
    self.assertEqual(
        numbers,
        [1, 2, 3],
        msg="_auto_number doesn't autonumber in place!",
    )
def test_sample_exception_mode(self):
    """Does sample raise exception for unknown mode?"""
    failure_msg = "sample doesn't raise exception for unknown mode!"
    # The test passes only if the call below raises ValueError.
    with self.assertRaises(ValueError, msg=failure_msg):
        Segmenter.sample(
            self.entire_text_seg,
            sample_size=3,
            mode='unknown_mode',
        )
def test_parse_xml_tag_is_opening(self):
    """Does _parse_xml_tag recognize opening tags?"""
    raw_tags = ('<a>', '<a attr="1"/>', '</a>')
    # Parse every sample first, then read the flag of interest.
    parsed = [Segmenter._parse_xml_tag(raw) for raw in raw_tags]
    opening_flags = [tag['is_opening'] for tag in parsed]
    self.assertEqual(
        opening_flags,
        [True, True, False],
        msg="_parse_xml_tag doesn't recognize opening tags!",
    )
def test_parse_xml_tag_element_name(self):
    """Does _parse_xml_tag parse element name?"""
    raw_tags = ('<a>', '<a attr="1">', '</a>', '<a/>')
    # Parse every sample first, then read the element name of each.
    parsed = [Segmenter._parse_xml_tag(raw) for raw in raw_tags]
    element_names = [tag['element'] for tag in parsed]
    self.assertEqual(
        element_names,
        ['a', 'a', 'a', 'a'],
        msg="_parse_xml_tag doesn't parse element name!",
    )
def test_parse_xml_tag_is_empty(self):
    """Does _parse_xml_tag recognize empty elements?"""
    raw_tags = ('<a>', '</a>', '<a/>', '<a attr="1"/>')
    # Parse every sample first, then read the flag of interest.
    parsed = [Segmenter._parse_xml_tag(raw) for raw in raw_tags]
    empty_flags = [tag['is_empty'] for tag in parsed]
    self.assertEqual(
        empty_flags,
        [False, False, True, True],
        msg="_parse_xml_tag doesn't recognize empty elements!",
    )
def test_import_xml_progress(self):
    """Does import_xml track progress?"""

    def bump_counter():
        """Mock progress callback"""
        self.count += 1

    Segmenter.import_xml(
        self.broken_xml_seg,
        element='a',
        progress_callback=bump_counter,
    )
    # The counter must equal the number of segments in the input fixture.
    expected_ticks = len(self.broken_xml_seg)
    self.assertEqual(
        self.count,
        expected_ticks,
        msg="import_xml doesn't track progress!",
    )
def test_tokenize_progress(self):
    """Does tokenize track progress?"""

    def bump_counter():
        """Mock progress callback"""
        self.count += 1

    rules = [(re.compile(r'\w'), 'tokenize')]
    Segmenter.tokenize(
        self.word_seg,
        rules,
        progress_callback=bump_counter,
    )
    # The counter must equal the number of segments in the input fixture.
    expected_ticks = len(self.word_seg)
    self.assertEqual(
        self.count,
        expected_ticks,
        msg="tokenize doesn't track progress!",
    )
def huntTheLexic(self):
    """Label the input segments by lexical field and build the output.

    For every selected lexical field found in ``defaultDict``, the non-empty
    entries of its list are turned into a regex with ``self.listToRegex``
    and used to select segments of ``self.inputSeg``; each selection is
    labelled with the field name.  ``self.outputSeg`` is then set to the
    concatenation of a bypassed copy of the input (labelled ``"__None__"``)
    and those selections, with ``merge_duplicates=True`` and the labels
    imported as ``self.labelName`` (``"Topic"`` when that name is empty).
    """
    # Names of the fields the user chose (stays empty without titles).
    chosen_names = []
    if self.titleLabels:
        chosen_names = [
            list(self.titleLabels)[idx] for idx in self.selectedFields
        ]

    # Keep only the lists whose field name was chosen.
    chosen_lists = {}
    for name, entries in defaultDict.items():
        if name in chosen_names:
            chosen_lists[name] = entries

    # With an input, select its segments once per non-empty list and label
    # each selection with the name of the list it came from.
    labelled_parts = []
    if self.inputSeg is not None:
        for name, entries in chosen_lists.items():
            non_empty = [entry for entry in entries if entry]
            if not non_empty:
                continue
            selection = Segmenter.select(
                self.inputSeg,
                self.listToRegex(non_empty),
                label=name,
            )
            labelled_parts.append(selection[0])

    # The output is a copy of the input plus the labelled selections.
    label_key = "Topic" if self.labelName == "" else self.labelName
    input_copy = Segmenter.bypass(self.inputSeg, label="__None__")
    self.outputSeg = Segmenter.concatenate(
        [input_copy] + labelled_parts,
        merge_duplicates=True,
        label=self.captionTitle,
        import_labels_as=label_key,
    )
def test_threshold_progress(self):
    """Does threshold track progress?"""

    def bump_counter():
        """Mock progress callback"""
        self.count += 1

    Segmenter.threshold(
        self.other_letter_seg,
        min_count=2,
        max_count=2,
        progress_callback=bump_counter,
    )
    # The counter must equal the number of segments in the input fixture.
    expected_ticks = len(self.other_letter_seg)
    self.assertEqual(
        self.count,
        expected_ticks,
        msg="threshold doesn't track progress!",
    )
def test_sample_progress(self):
    """Does sample track progress?"""

    def bump_counter():
        """Mock progress callback"""
        self.count += 1

    Segmenter.sample(
        self.char_seg,
        sample_size=4,
        mode='random',
        progress_callback=bump_counter,
    )
    # The counter must equal the number of segments in the input fixture.
    expected_ticks = len(self.char_seg)
    self.assertEqual(
        self.count,
        expected_ticks,
        msg="sample doesn't track progress!",
    )
def sendData(self):
    """Send the bypassed and displayed segmentations to the output channels.

    Without an input segmentation a warning is shown and ``None`` is sent on
    both channels.  Otherwise a bypassed copy of the input is always sent.
    The displayed segmentation is sent only when neither the current warning
    nor the current error message contains 'format' and the content of its
    first segment is non-empty; ``None`` is sent on that channel otherwise.
    """
    if not self.segmentation:
        self.infoBox.setText(u'Widget needs input.', 'warning')
        for channel in ('Bypassed segmentation', 'Displayed segmentation'):
            self.send(channel, None, self)
        return

    bypassed = Segmenter.bypass(self.segmentation, self.captionTitle)
    self.send('Bypassed segmentation', bypassed, self)

    # TODO: Check if this is correct replacement for textable v1.*, v2.*
    format_issue = (
        'format' in self._currentWarningMessage
        or 'format' in self._currentErrorMessage
    )
    if format_issue:
        self.send('Displayed segmentation', None, self)
        return

    first_content = self.displayedSegmentation[0].get_content()
    displayed = self.displayedSegmentation if len(first_content) > 0 else None
    self.send('Displayed segmentation', displayed, self)

    # TODO: Differs only in capitalization from the check above.
    #       Is this intentional?
    if "Format" not in self._currentErrorMessage:
        segment_count = len(self.segmentation)
        message = pluralize(
            u'%i segment@p sent to output.' % segment_count,
            segment_count,
        )
        self.infoBox.setText(message)
    self.sendButton.resetSettingsChangedFlag()
def test_select_annotations(self):
    """Does select work with annotations?"""
    # Only the first element of the returned pair is checked here.
    selected, _ = Segmenter.select(
        self.word_seg,
        re.compile(r'.'),
        annotation_key='a',
    )
    contents = [seg.get_content() for seg in selected]
    self.assertEqual(
        contents,
        ['ab'],
        msg="select doesn't work with annotations!",
    )
def test_threshold_select_no_min_no_max(self):
    """Does threshold select segments (no min, no max)?"""
    # No min_count / max_count is passed; only the first result is checked.
    selected, _ = Segmenter.threshold(self.other_letter_seg)
    contents = [seg.get_content() for seg in selected]
    self.assertEqual(
        contents,
        ['a', 'b', 'b', 'c', 'c', 'c'],
        msg="threshold doesn't select segments (no min, no max)!",
    )
def test_concatenate_progress(self):
    """Does concatenate track progress?"""

    def bump_counter():
        """Mock progress callback"""
        self.count += 1

    Segmenter.concatenate(
        [self.letter_seg1],
        progress_callback=bump_counter,
    )
    # The counter must equal the number of segments in the input fixture.
    expected_ticks = len(self.letter_seg1)
    self.assertEqual(
        self.count,
        expected_ticks,
        msg="concatenate doesn't track progress!",
    )
def test_intersect_autonumber(self):
    """Does intersect autonumber input segments?"""
    # Only the first element of the returned pair is checked here.
    kept, _ = Segmenter.intersect(
        source=self.letter_seg,
        filtering=self.third_letter_seg,
        auto_number_as='num',
    )
    numbers = [seg.annotations['num'] for seg in kept]
    self.assertEqual(
        numbers,
        [1, 2],
        msg="intersect doesn't autonumber input segments!",
    )
def test_import_xml_autonumber(self):
    """Does import_xml autonumber input segments?"""
    imported = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        auto_number_as='num',
    )
    # Read back the annotation requested through auto_number_as.
    numbers = [seg.annotations['num'] for seg in imported]
    self.assertEqual(
        numbers,
        [1, 2],
        msg="import_xml doesn't autonumber input segments!",
    )
def send_data(self):
    """Build one Input per fetched text and send the result to output.

    The control area is disabled while the method runs.  Each text of each
    query in ``self.queryList`` becomes an ``Input`` stored in
    ``self.createdInputs``.  A single input is used directly; otherwise the
    inputs are concatenated.  Segments are then annotated positionally with
    the dicts found in ``self.annotList``.  If the result contains at least
    one segment it is sent on the "Segmentation" channel together with an
    info message; otherwise a warning is shown and ``None`` is sent.
    """
    self.controlArea.setDisabled(True)
    self.clearCreatedInputs()

    # One Input per text, over every query of the list.
    for query in self.queryList:
        for text in query:
            self.createdInputs.append(Input(text))

    # A single input is the output as is; anything else is concatenated.
    if len(self.createdInputs) == 1:
        segmentation = self.createdInputs[0]
    else:
        segmentation = Segmenter.concatenate(
            self.createdInputs,
            import_labels_as=None,
        )

    # Flatten the annotation dicts, then apply them by segment position.
    annotations = [dic for elem in self.annotList for dic in elem]
    for idx, segment in enumerate(segmentation):
        segment.annotations.update(annotations[idx])
        segmentation[idx] = segment

    # Total number of characters over the data each segment points to.
    num_chars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in segmentation
    )

    if len(segmentation) == 0:
        # Nothing to send: warn the user and clear the output channel.
        self.infoBox.setText(
            "There are {} segments to send to output. Please fill the query basket and click 'send' again"
            .format(len(segmentation)),
            "warning",
        )
        self.sendButton.resetSettingsChangedFlag()
        self.controlArea.setDisabled(False)
        self.send("Segmentation", None)
        return

    # Report the segment and character counts, then send the segments.
    self.infoBox.setText(
        "{} segments sent to output ({} characters)".format(
            len(segmentation),
            num_chars,
        )
    )
    self.send("Segmentation", segmentation)
    self.controlArea.setDisabled(False)
    self.sendButton.resetSettingsChangedFlag()
def test_tokenize_import_annotations_false_split(self):
    """Does tokenize skip importing annotations (mode split)?"""
    rules = [(re.compile(r'a'), 'split')]
    tokens = Segmenter.tokenize(
        self.word_seg,
        rules,
        import_annotations=False,
    )
    # The first output segment must not carry the annotation key 'a'.
    has_key = 'a' in tokens[0].annotations
    self.assertFalse(
        has_key,
        msg="tokenize doesn't skip importing annotations (mode split)!",
    )
def test_parse_xml_tag_attributes(self):
    """Does _parse_xml_tag parse attributes?"""
    parsed = Segmenter._parse_xml_tag('<a attr1="2" attr3="4">')
    # Attribute values are expected as strings, keyed by attribute name.
    expected = {'attr1': '2', 'attr3': '4'}
    self.assertEqual(
        parsed['attributes'],
        expected,
        msg="_parse_xml_tag doesn't parse attributes!",
    )
def test_select_select(self):
    """Does select select segments?"""
    # Only the first element of the returned pair is checked here.
    selected, _ = Segmenter.select(self.word_seg, re.compile(r'\w{3,}'))
    contents = [seg.get_content() for seg in selected]
    self.assertEqual(
        contents,
        ['cde'],
        msg="select doesn't select segments!",
    )
def test_select_progress(self):
    """Does select track progress?"""

    def bump_counter():
        """Mock progress callback"""
        self.count += 1

    Segmenter.select(
        self.char_seg,
        re.compile(r'.'),
        progress_callback=bump_counter,
    )
    # The counter must equal the number of segments in the input fixture.
    expected_ticks = len(self.char_seg)
    self.assertEqual(
        self.count,
        expected_ticks,
        msg="select doesn't track progress!",
    )
def test_intersect_progress(self):
    """Does intersect track progress?"""

    def bump_counter():
        """Mock progress callback"""
        self.count += 1

    Segmenter.intersect(
        source=self.letter_seg,
        filtering=self.third_letter_seg,
        progress_callback=bump_counter,
    )
    # The counter must equal the number of segments in the source fixture.
    expected_ticks = len(self.letter_seg)
    self.assertEqual(
        self.count,
        expected_ticks,
        msg="intersect doesn't track progress!",
    )
def test_select_autonumber(self):
    """Does select autonumber input segments?"""
    # Only the first element of the returned pair is checked here.
    selected, _ = Segmenter.select(
        self.char_seg,
        re.compile(r'.'),
        auto_number_as='num',
    )
    numbers = [seg.annotations['num'] for seg in selected]
    self.assertEqual(
        numbers,
        list(range(1, 7)),
        msg="select doesn't autonumber input segments!",
    )
def test_parse_xml_tag_attributes(self):
    """Does _parse_xml_tag parse attributes?"""
    raw_tag = '<a attr1="2" attr3="4">'
    attributes = Segmenter._parse_xml_tag(raw_tag)['attributes']
    # Both attributes of the tag must be present with string values.
    self.assertEqual(
        attributes,
        {'attr1': '2', 'attr3': '4'},
        msg="_parse_xml_tag doesn't parse attributes!",
    )
def test_bypass_copy_segments(self):
    """Does bypass copy input segments?"""
    bypassed = Segmenter.bypass(self.letter_seg)
    # Compare segment contents of the output with those of the input.
    output_contents = [seg.get_content() for seg in bypassed]
    input_contents = [seg.get_content() for seg in self.letter_seg]
    self.assertEqual(
        output_contents,
        input_contents,
        msg="bypass doesn't copy input segments!",
    )
def test_tokenize_progress(self):
    """Does tokenize track progress?"""

    def count_call():
        """Mock progress callback"""
        self.count += 1

    Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'\w'), 'tokenize')],
        progress_callback=count_call,
    )
    # One callback invocation is expected per segment of the input fixture.
    segment_total = len(self.word_seg)
    self.assertEqual(
        self.count,
        segment_total,
        msg="tokenize doesn't track progress!",
    )
def test_import_xml_progress(self):
    """Does import_xml track progress?"""

    def count_call():
        """Mock progress callback"""
        self.count += 1

    Segmenter.import_xml(
        self.broken_xml_seg,
        element='a',
        progress_callback=count_call,
    )
    # One callback invocation is expected per segment of the input fixture.
    segment_total = len(self.broken_xml_seg)
    self.assertEqual(
        self.count,
        segment_total,
        msg="import_xml doesn't track progress!",
    )
def test_bypass_deepcopy(self):
    """Does bypass deep copy input segments?"""
    bypassed = Segmenter.bypass(self.letter_seg)
    # The returned object must not compare equal to the input fixture.
    self.assertNotEqual(
        bypassed,
        self.letter_seg,
        msg="bypass doesn't deep copy input segments!",
    )
def test_bypass_copy_annotations(self):
    """Does bypass copy annotations?"""
    bypassed = Segmenter.bypass(self.other_letter_seg)
    # Compare the 'a' annotation of every output and input segment.
    output_values = [seg.annotations['a'] for seg in bypassed]
    input_values = [seg.annotations['a'] for seg in self.other_letter_seg]
    self.assertEqual(
        output_values,
        input_values,
        msg="bypass doesn't copy annotations!",
    )
def sendData(self):
    """Transcribe the selected audio file and send the text to output.

    A warning is shown and ``None`` is sent on the 'Text data' channel when
    no file is selected, when the transcription raises
    ``speech_recognition.UnknownValueError``, or when no transcription is
    returned.  Otherwise one ``Input`` is created per chunk (if
    ``self.selected_seg`` is set) or a single ``Input`` for the whole
    transcription, each labelled with the base name of the input file; the
    inputs are concatenated into ``self.segmentation`` and sent.
    """
    if not self.file:
        self.infoBox.setText(u"Please select input file.", "warning")
        self.send('Text data', None, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    # Get transcription
    try:
        transcription = self.get_large_audio_transcription(
            self.file,
            language=self.language,
            set_silence_len=self.selected_dur,
            set_silence_threshold=self.selected_vol,
        )
    except speech_recognition.UnknownValueError:
        self.infoBox.setText(
            u"You seem to have overuseed the built-in API key, refer to the documentation for further informations.",
            "warning")
        self.send('Text data', None, self)
        return

    # Checks if there is a transcription
    if transcription is None:
        self.infoBox.setText(u"You must use mp3 or wav audio files.",
                             "warning")
        self.send('Text data', None, self)
        return

    # Name of the input file, used as label.  The previous pattern
    # "[^(/\\)]+[mp3|wav]$" used character classes where alternation was
    # meant, and its findall() result (a list) was passed as the label.
    # Splitting on either path separator yields the base name as a string.
    file_label = re.split(r"[/\\]", self.file)[-1]

    if self.selected_seg:
        for chunk in transcription:
            self.createdInputs.append(Input(chunk, label=file_label))
    else:
        self.createdInputs.append(Input(transcription, label=file_label))

    # Concatenates the segmentations in the output segmentation
    self.segmentation = Segmenter.concatenate(
        segmentations=self.createdInputs,
        label=self.captionTitle,
        copy_annotations=False,
        import_labels_as="")

    # Sending segments length
    message = " Succesfully transcripted ! % i segment@p sent to output" % len(
        self.segmentation)
    message = pluralize(message, len(self.segmentation))

    # Send token...
    self.send("Text data", self.segmentation, self)
    self.infoBox.setText(message)
    self.sendButton.resetSettingsChangedFlag()
def test_sample_progress(self):
    """Does sample track progress?"""

    def count_call():
        """Mock progress callback"""
        self.count += 1

    Segmenter.sample(
        self.char_seg,
        sample_size=4,
        mode='random',
        progress_callback=count_call,
    )
    # One callback invocation is expected per segment of the input fixture.
    segment_total = len(self.char_seg)
    self.assertEqual(
        self.count,
        segment_total,
        msg="sample doesn't track progress!",
    )
def test_threshold_progress(self):
    """Does threshold track progress?"""

    def count_call():
        """Mock progress callback"""
        self.count += 1

    Segmenter.threshold(
        self.other_letter_seg,
        min_count=2,
        max_count=2,
        progress_callback=count_call,
    )
    # One callback invocation is expected per segment of the input fixture.
    segment_total = len(self.other_letter_seg)
    self.assertEqual(
        self.count,
        segment_total,
        msg="threshold doesn't track progress!",
    )
def test_recode_no_change(self):
    """Does recode return a Segmentation when no change is made?"""
    # recode is called without any recoding option.
    result = Segmenter.recode(self.entire_text_seg)
    is_segmentation = isinstance(result, Segmentation)
    self.assertTrue(
        is_segmentation,
        msg="recode doesn't return a Segmentation when no change is made!",
    )
def test_tokenize_import_annotations_tokenize(self):
    """Does tokenize import annotations (mode tokenize)?"""
    rules = [(re.compile(r'\w{2}'), 'tokenize')]
    tokens = Segmenter.tokenize(
        self.word_seg,
        rules,
        import_annotations=True,
    )
    # The first output segment must carry annotation 'a' with value '1'.
    first_value = tokens[0].annotations['a']
    self.assertEqual(
        first_value,
        '1',
        msg="tokenize doesn't import annotations (mode tokenize)!",
    )
def test_sample_systematic_sample(self):
    """Does sample systematically sample segments?"""
    # Only the first element of the returned pair is checked here.
    sampled, _ = Segmenter.sample(
        self.char_seg,
        sample_size=3,
        mode='systematic',
    )
    start_positions = [seg.start for seg in sampled]
    self.assertEqual(
        start_positions,
        [0, 2, 4],
        msg="sample doesn't systematically sample segments!",
    )
def test_recode_remove_accents(self):
    """Does recode remove accents?"""
    # Only the first element of the returned pair is checked here.
    recoded, _ = Segmenter.recode(self.second_word_seg, remove_accents=True)
    contents = [seg.get_content() for seg in recoded]
    self.assertEqual(
        contents,
        ['AB', 'cde'],
        msg="recode doesn't remove accents!",
    )
def test_import_xml_condition(self):
    """Does import_xml respect conditions?"""
    # Condition on attribute 'attr': its value must match the regex ^2$.
    attr_conditions = {'attr': re.compile(r'^2$')}
    imported = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        conditions=attr_conditions,
    )
    attr_values = [seg.annotations['attr'] for seg in imported]
    self.assertEqual(
        attr_values,
        ['2'],
        msg="import_xml doesn't respect conditions!",
    )
def test_select_mode(self):
    """Does select respect mode setting?"""
    # Same regex as the plain selection test, but with mode="exclude".
    selected, _ = Segmenter.select(
        self.word_seg,
        re.compile(r'\w{3,}'),
        mode="exclude",
    )
    contents = [seg.get_content() for seg in selected]
    self.assertEqual(
        contents,
        ['ab'],
        msg="select doesn't respect mode setting!",
    )
def test_select_import_annotations_false(self):
    """Does select skip importing annotations?"""
    selected, _ = Segmenter.select(
        self.word_seg,
        re.compile(r'\w+'),
        copy_annotations=False,
    )
    # The first selected segment must not carry the annotation key 'a'.
    has_key = 'a' in selected[0].annotations
    self.assertFalse(
        has_key,
        msg="select doesn't skip importing annotations!",
    )
def test_sample_autonumber(self):
    """Does sample autonumber input segments?"""
    # Only the first element of the returned pair is checked here.
    sampled, _ = Segmenter.sample(
        self.char_seg,
        sample_size=4,
        mode='random',
        auto_number_as='num',
    )
    numbers = [seg.annotations['num'] for seg in sampled]
    self.assertEqual(
        numbers,
        [1, 2, 3, 4],
        msg="sample doesn't autonumber input segments!",
    )
def test_select_select_neg(self):
    """Does select output complementary segmentation?"""
    # The second element of the returned pair is the one under test.
    _, complement = Segmenter.select(self.word_seg, re.compile(r'\w{3,}'))
    contents = [seg.get_content() for seg in complement]
    self.assertEqual(
        contents,
        ['ab'],
        msg="select doesn't output complementary segmentation!",
    )
def test_import_xml_segment_elements(self):
    """Does import_xml segment xml elements?"""
    imported = Segmenter.import_xml(self.xml_seg, element='a')
    contents = [seg.get_content() for seg in imported]
    # Two segments are expected, the second being contained in the first.
    expected = ['<a attr="2/3/">c<a/>d</a>', 'c<a/>d']
    self.assertEqual(
        contents,
        expected,
        msg="import_xml doesn't segment xml elements!",
    )
def test_import_xml_convert_attributes(self):
    """Does import_xml convert attributes?"""
    imported = Segmenter.import_xml(self.xml_seg, element='a')
    # The 'attr' XML attribute is read back from each segment's annotations.
    attr_values = [seg.annotations['attr'] for seg in imported]
    self.assertEqual(
        attr_values,
        ['1', '2/3/'],
        msg="import_xml doesn't convert attributes!",
    )
def test_concatenate_merge_segments(self):
    """Does concatenate merge input segments?"""
    # letter_seg2 is deliberately passed before letter_seg1.
    inputs = [self.letter_seg2, self.letter_seg1]
    merged = Segmenter.concatenate(inputs)
    contents = [seg.get_content() for seg in merged]
    self.assertEqual(
        contents,
        ['a', 'b', 'c', 'd', 'e'],
        msg="concatenate doesn't merge input segments!",
    )
def test_sample_import_annotations_false(self):
    """Does sample skip importing annotations?"""
    segmentation, _ = Segmenter.sample(
        self.single_letter_seg,
        sample_size=1,
        copy_annotations=False,
    )
    # With copy_annotations=False the sampled segment must not carry the
    # annotation key 'b'.  The failure message now says so; it previously
    # read "sample doesn't import annotations!", the opposite of this check.
    self.assertFalse(
        'b' in segmentation[0].annotations,
        msg="sample doesn't skip importing annotations!",
    )
def test_import_xml_import_annotations_false(self):
    """Does import_xml skip importing annotations?"""
    imported = Segmenter.import_xml(
        self.broken_xml_seg,
        element='a',
        import_annotations=False,
    )
    # The first imported segment must not carry the annotation key 'a'.
    has_key = 'a' in imported[0].annotations
    self.assertFalse(
        has_key,
        msg="import_xml doesn't skip importing annotations!",
    )
def test_import_xml_import_element_as_annotation(self):
    """Does import_xml import element as annotation?"""
    imported = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        import_element_as='test',
    )
    # Each segment must carry the element name under the key 'test'.
    element_values = [seg.annotations['test'] for seg in imported]
    self.assertEqual(
        element_values,
        ['a', 'a'],
        msg="import_xml doesn't import element as annotation!",
    )
def test_recode_segmentation_as_input(self):
    """Does recode return a Segmentation when input is one?"""
    result = Segmenter.recode(self.letter_seg, case='upper')
    # Only the type of the returned object is checked.
    is_segmentation = isinstance(result, Segmentation)
    self.assertTrue(
        is_segmentation,
        msg="recode doesn't return a Segmentation when input is one!",
    )
def test_threshold_select_no_min_no_max(self):
    """Does threshold select segments (no min, no max)?"""
    kept, _ = Segmenter.threshold(self.other_letter_seg)
    # Without min_count / max_count every input segment is expected back.
    expected = ['a', 'b', 'b', 'c', 'c', 'c']
    self.assertEqual(
        [seg.get_content() for seg in kept],
        expected,
        msg="threshold doesn't select segments (no min, no max)!",
    )
def test_recode_single_input(self):
    """Does recode return a single Input object when needed?"""
    result = Segmenter.recode(self.entire_text_seg, case='upper')
    # Only the type of the returned object is checked.
    is_input = isinstance(result, Input)
    self.assertTrue(
        is_input,
        msg="recode doesn't return a single Input object when needed!",
    )
def test_merge_duplicate_segments(self):
    """Does _merge_duplicate_segments merge duplicates?"""
    merged = Segmenter._merge_duplicate_segments(self.duplicate_seg)
    # A single segment with content 'a' is expected after merging.
    contents = [seg.get_content() for seg in merged]
    self.assertEqual(
        contents,
        ['a'],
        msg="_merge_duplicate_segments doesn't merge duplicates!",
    )
def test_intersect_neg(self):
    """Does intersect output complementary segmentation?"""
    # The second element of the returned pair is the one under test.
    _, complement = Segmenter.intersect(
        source=self.letter_seg,
        filtering=self.third_letter_seg,
    )
    joined = ''.join(seg.get_content() for seg in complement)
    self.assertEqual(
        joined,
        'ace',
        msg="intersect doesn't output complementary segmentation!",
    )
def test_intersect_content_content(self):
    """Does intersect filter segments (content content)?"""
    # Only the first element of the returned pair is checked here.
    kept, _ = Segmenter.intersect(
        source=self.letter_seg,
        filtering=self.third_letter_seg,
    )
    joined = ''.join(seg.get_content() for seg in kept)
    self.assertEqual(
        joined,
        'bd',
        msg="intersect doesn't filter segments (content content)!",
    )
def test_select_select_neg(self):
    """Does select output complementary segmentation?"""
    pattern = re.compile(r'\w{3,}')
    # Index 1 of the returned pair is the one under test.
    _, rejected = Segmenter.select(self.word_seg, pattern)
    self.assertEqual(
        [seg.get_content() for seg in rejected],
        ['ab'],
        msg="select doesn't output complementary segmentation!",
    )
def test_sample_import_annotations_false(self):
    """Does sample skip importing annotations?"""
    segmentation, _ = Segmenter.sample(
        self.single_letter_seg,
        sample_size=1,
        copy_annotations=False,
    )
    # With copy_annotations=False the sampled segment must not carry the
    # annotation key 'b'.  The failure message now says so; it previously
    # read "sample doesn't import annotations!", the opposite of this check.
    self.assertFalse(
        'b' in segmentation[0].annotations,
        msg="sample doesn't skip importing annotations!",
    )
def test_select_import_annotations_false(self):
    """Does select skip importing annotations?"""
    pattern = re.compile(r'\w+')
    kept, _ = Segmenter.select(
        self.word_seg,
        pattern,
        copy_annotations=False,
    )
    # With copy_annotations=False, key 'a' must be absent from the first
    # selected segment.
    first_annotations = kept[0].annotations
    self.assertFalse(
        'a' in first_annotations,
        msg="select doesn't skip importing annotations!",
    )
def test_get_real_str_index_recoded(self):
    """Does get_real_str_index() work with redirected str index?"""
    substitutions = [(re.compile(r'[bd]'), 'f')]
    recoded, _ = Segmenter.recode(self.char_seg, substitutions=substitutions)
    # The last recoded segment must report the str index of the original
    # fixture's first segment.
    real_index = recoded[-1].get_real_str_index()
    original_index = self.char_seg[0].str_index
    self.assertEqual(
        real_index,
        original_index,
        msg="get_real_str_index() doesn't work with redirected str index!",
    )