def setUp(self):
    """Create the two text inputs and the word/character segmentations."""
    self.entire_text_seg = Input('ab cde')
    self.other_entire_text_seg = Input('d')
    text_id = self.entire_text_seg[0].str_index

    # First word, carrying a single annotation.
    first_word = Segment(
        str_index=text_id,
        start=0,
        end=2,
        annotations={'a': 1},
    )
    self.first_word_seg = Segmentation([first_word])

    # Last word, without annotation.
    last_word = Segment(str_index=text_id, start=3, end=6)
    self.last_word_seg = Segmentation([last_word])

    # One single-character segment for each of the positions 0 to 5.
    self.char_seg = Segmentation([
        Segment(str_index=text_id, start=pos, end=pos + 1)
        for pos in range(6)
    ])
def setUp(self):
    """Prepare the input, two segmentations and a reference output string."""
    self.entire_text_seg = Input('ab cde')
    self.str_index = self.entire_text_seg[0].str_index
    text_id = self.str_index

    annotated_word = Segment(
        str_index=text_id,
        start=0,
        end=2,
        annotations={'a': '1', 'bc': '20'},
    )
    plain_word = Segment(str_index=text_id, start=3, end=6)
    self.word_seg = Segmentation([annotated_word, plain_word])

    # Two segments sharing position 4.
    self.overlapping_seg = Segmentation([
        Segment(str_index=text_id, start=first, end=last)
        for first, last in ((3, 5), (4, 6))
    ])

    # Reference rendering of word_seg; both placeholders take the str_index.
    template = (
        'segment number 1\n'
        '\tcontent:\t"ab"\n'
        '\tstr_index:\t%i\n'
        '\tstart:\t0\n'
        '\tend:\t2\n'
        '\tannotations:\n'
        '\t\ta 1\n'
        '\t\tbc 20\n'
        'segment number 2\n'
        '\tcontent:\t"cde"\n'
        '\tstr_index:\t%i\n'
        '\tstart:\t3\n'
        '\tend:\t6'
    )
    self.base_output_string = template % (text_id, text_id)

    self.count = 0
def sendData(self):
    """Summarize the input and send the plain and HTML summaries.

    Each summary comes from ``self.summarize(self.cv, text)``, called either
    once per input segment or once on the space-joined content of all
    segments, depending on ``self.typeSeg``. The method returns early (after
    warning the user) when no model or no input is available.
    """
    # A model is required.
    if not self.model:
        self.noLanguageModelWarning()
        return

    # So is an input segmentation.
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input.", "warning")
        self.send('Summary', None, self)
        self.send('HTML_Summary', None, self)
        return

    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)

    # Decide which texts get summarized. The per-segment case stays lazy so
    # that content retrieval and summarization remain interleaved.
    mode = self.typeSeg
    if mode == "Summarize each segments individually":
        texts = (segment.get_content() for segment in self.inputSeg)
    elif mode == "Summarize all segments as one":
        texts = [
            " ".join([segment.get_content() for segment in self.inputSeg])
        ]
    else:
        texts = ()

    plain_segments = []
    markup_segments = []
    for text in texts:
        resume, html_resume = self.summarize(self.cv, text)
        plain_segments.append(Segment(str_index=resume[0].str_index))
        markup_segments.append(Segment(str_index=html_resume[0].str_index))

    # Wrap the collected segments and send them on both channels.
    self.outputSeg = Segmentation(plain_segments, self.captionTitle)
    self.html_outputSeg = Segmentation(markup_segments, self.captionTitle)
    self.send("Summary", self.outputSeg, self)
    self.send('HTML_Summary', self.html_outputSeg, self)

    # Report the number of segments sent.
    sent_count = len(self.outputSeg)
    status = pluralize("%i segment@p sent to output " % sent_count, sent_count)
    self.infoBox.setText(status)

    self.sendButton.resetSettingsChangedFlag()
    self.controlArea.setDisabled(False)
def test_clear_string(self):
    """Check that clear() leaves None as the most recently stored string."""
    cleared_input = Input('test3')
    cleared_input.clear()
    stored = Segmentation.get_data(-1)
    self.assertEqual(
        stored,
        None,
        msg="clear doesn't set stored string to None!",
    )
def test_update_string(self):
    """Check that update() replaces the most recently stored string."""
    updated_input = Input('test2')
    updated_input.update('modified')
    stored = Segmentation.get_data(-1)[:]
    self.assertEqual(
        stored,
        'modified',
        msg="update doesn't modify stored string!",
    )
def send_data(self):
    """Turn the fetched texts into annotated inputs and send them.

    One Input is created per text found in ``self.queryList``; the
    dictionaries nested in ``self.annotList`` are applied, in order, as
    annotations of the resulting segments.
    """
    self.controlArea.setDisabled(True)
    self.clearCreatedInputs()
    segmentation = None

    # One Input per text, over all queries.
    self.createdInputs.extend(
        Input(text) for query in self.queryList for text in query
    )

    # A single Input is sent as is; anything else is concatenated.
    if len(self.createdInputs) == 1:
        segmentation = self.createdInputs[0]
    else:
        segmentation = Segmenter.concatenate(
            self.createdInputs,
            import_labels_as=None,
        )

    # Flatten the nested annotation list, then annotate segment by segment.
    flat_annotations = [dic for elem in self.annotList for dic in elem]
    for position, segment in enumerate(segmentation):
        segment.annotations.update(flat_annotations[position])
        segmentation[position] = segment

    # Total length of the strings referenced by the segments.
    num_chars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in segmentation
    )

    # Nothing to send: warn the user and propagate None.
    if len(segmentation) == 0:
        self.infoBox.setText(
            "There are {} segments to send to output. Please fill the query basket and click 'send' again"
            .format(len(segmentation)),
            "warning")
        self.sendButton.resetSettingsChangedFlag()
        self.controlArea.setDisabled(False)
        self.send("Segmentation", None)
        return

    # Otherwise report the data size and send the segmentation.
    self.infoBox.setText(
        "{} segments sent to output ({} characters)".format(
            len(segmentation),
            num_chars,
        ))
    self.send("Segmentation", segmentation)
    self.controlArea.setDisabled(False)
    self.sendButton.resetSettingsChangedFlag()
def test_slice_string(self):
    """Check that slicing stored data matches slicing the plain string."""
    Input('Hello world!')
    stored_slice = Segmentation.get_data(-1)[3:7]
    expected_slice = u"Hello world!"[3:7]
    self.assertEqual(
        stored_slice,
        expected_slice,
        msg="slicing doesn't return the same as in strings",
    )
def test_creator_store_string(self):
    """Check that creating an Input stores its string."""
    Input(u'test')
    stored = Segmentation.get_data(-1)[:]
    self.assertEqual(
        stored,
        u'test',
        msg="creator doesn't store string in class variable!",
    )
def test_update_string(self):
    """Check that update() replaces the most recently stored string."""
    updated_input = Input(u'test2')
    updated_input.update(u'modified')
    stored = Segmentation.get_data(-1)[:]
    self.assertEqual(
        stored,
        u'modified',
        msg="update doesn't modify stored string!",
    )
def test_clear_string(self):
    """Check that clear() leaves None as the most recently stored string."""
    cleared_input = Input('test3')
    cleared_input.clear()
    stored = Segmentation.get_data(-1)
    self.assertEqual(
        stored,
        None,
        msg="clear doesn't set stored string to None!",
    )
def setUp(self):
    """Prepare the input, two segmentations and a reference output string."""
    self.entire_text_seg = Input('ab cde')
    self.str_index = self.entire_text_seg[0].str_index
    text_id = self.str_index

    # Two words; only the first one is annotated.
    words = [
        Segment(
            str_index=text_id,
            start=0,
            end=2,
            annotations={'a': '1', 'bc': '20'},
        ),
        Segment(str_index=text_id, start=3, end=6),
    ]
    self.word_seg = Segmentation(words)

    # Two segments sharing position 4.
    overlapping = [
        Segment(str_index=text_id, start=3, end=5),
        Segment(str_index=text_id, start=4, end=6),
    ]
    self.overlapping_seg = Segmentation(overlapping)

    # Reference rendering of word_seg; both placeholders take the str_index.
    template = (
        'segment number 1\n'
        '\tcontent:\t"ab"\n'
        '\tstr_index:\t%i\n'
        '\tstart:\t0\n'
        '\tend:\t2\n'
        '\tannotations:\n'
        '\t\ta 1\n'
        '\t\tbc 20\n'
        'segment number 2\n'
        '\tcontent:\t"cde"\n'
        '\tstr_index:\t%i\n'
        '\tstart:\t3\n'
        '\tend:\t6'
    )
    self.base_output_string = template % (text_id, text_id)

    self.count = 0
def sendData(self):
    """Compute result of widget processing and send to output.

    Without input, the header list is emptied and None is sent on the
    "CSV Segmentation" channel. Otherwise the segments in ``self.csvSeg``
    are wrapped in a Segmentation labelled with ``self.captionTitle`` and
    sent, and the info box reports how many segments were sent and, when
    applicable, how many entries of ``self.contentIsNone`` were ignored.
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        del self.headerList[:]
        # Self-assignment kept from the original; presumably it notifies the
        # GUI that the list changed -- TODO confirm.
        self.headerList = self.headerList
        self.send("CSV Segmentation", None, self)
        return

    # Initialize progress bar.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    # No per-segment work is done here; only the progress bar moves.
    for _ in self.csvSeg:
        progressBar.advance()

    # Set status to OK and report data size...
    outputSeg = Segmentation(self.csvSeg, label=self.captionTitle)
    sentCount = len(outputSeg)
    ignoredCount = len(self.contentIsNone)

    message = "%i segment@p sent to output." % sentCount
    # Mention segments that were ignored because they had no content. The
    # suffix is a single-line literal so that no stray whitespace from a
    # source-line continuation leaks into the displayed message.
    if ignoredCount == 1:
        message += " (ignored %i segment with no content)" % ignoredCount
    elif ignoredCount > 1:
        message += " (ignored %i segments with no content)" % ignoredCount
    message = pluralize(message, sentCount)
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Send data to output...
    self.send("CSV Segmentation", outputSeg, self)
    self.sendButton.resetSettingsChangedFlag()
def clearCreatedInputs(self):
    """Reset the stored data of every created Input, then empty the list."""
    for created_input in self.createdInputs:
        stored_index = created_input[0].str_index
        Segmentation.set_data(stored_index, None)
    # Empty the list in place so that existing references see the change.
    del self.createdInputs[:]
def sendData(self):
    """Query the selected service and send the retrieved texts to output.

    Returns False when the Twitter request fails or when nothing was
    retrieved; otherwise the method falls through and returns None.
    """
    # Start from a clean slate.
    self.clearCreatedInputs()

    selected_service = self.service
    if selected_service == u'Twitter':
        twitter_keys = (
            self.twitterLicenseKeysConsumerKey,
            self.twitterLicenseKeysConsumerSecret,
            (
                self.twitterLicenseKeysAccessToken,
                self.twitterLicenseKeysAccessTokenSecret,
            ),
        )
        try:
            self.createdInputs = self.get_tweets(
                self.word_to_search,
                self.nb_tweet,
                self.include_RT,
                self.useTwitterLicenseKey,
                twitter_keys,
            )
        except (HTTP401Authentication, HTTP400BadRequest):
            self.infoBox.setText(
                u'Please enter valid Twitter api keys.',
                u'error',
            )
            self.send(u'Text data', None, self)
            return False
        except SearchEngineLimitError:
            self.infoBox.setText(
                u'Twitter search limit has been exceeded.',
                u'error',
            )
            self.send(u'Text data', None, self)
            return False
    elif selected_service == u'Wikipedia':
        self.createdInputs = self.get_wiki_article(
            self.word_to_search,
            self.wiki_section,
            self.wiki_type_of_text,
        )
    elif selected_service == u'Bing':
        self.createdInputs = self.get_bing_entries(
            self.word_to_search,
            self.nb_bing_entry,
        )

    # Nothing came back: suggest changing the query.
    if len(self.createdInputs) == 0:
        self.infoBox.setText(
            u'Please try to change query or settings.',
            u'warning',
        )
        self.send(u'Text data', None, self)
        return False

    progressBar = OWGUI.ProgressBar(self, iterations=50)

    output_segmentation = Segmenter.concatenate(
        self.createdInputs,
        self.captionTitle,
        import_labels_as=None,
    )

    # Status message: number of segments, then number of characters.
    segment_count = len(output_segmentation)
    message = pluralize(
        u'%i segment@p sent to output ' % segment_count,
        segment_count,
    )
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in output_segmentation
    )
    message = pluralize(message + u'(%i character@p).' % numChars, numChars)
    self.infoBox.setText(message)

    # Run the progress bar through all of its 50 iterations, then clear it.
    for _ in xrange(50):
        progressBar.advance()
    progressBar.finish()

    self.send('Text data', output_segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Tag the input with TreeTagger and send the resulting segmentation.

    None is sent on the "Text data" channel when TreeTagger's link is
    missing or when there is no input. Otherwise the input segments are
    wrapped in ``<xb_tt>`` elements, tagged with ``self.tag``, recoded into
    ``<w>`` elements, and sent either as that XML (when ``self.activer_xml``
    is set) or as the segmentation imported from the ``<w>`` elements. The
    TreeTagger link is also written to ``treetagger_link.txt``.
    """
    # TreeTagger's link was not found.
    if self.NoLink:
        self.infoBox.setText(u"Sorry, TreeTagger's link not found.", "error")
        self.send("Text data", None)
    # Important: if input data is None, propagate this value to output...
    elif not self.inputData:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Text data", None)
    else:
        # Tell the user that something is happening...
        self.infoBox.setText(u"TreeTagger is running...", "warning")

        # Initialize progress bar.
        self.progressBar = gui.ProgressBar(self, iterations=5)

        # Copy the segmentation, adding to each segment an annotation that
        # serializes its existing annotations as XML attributes.
        # NOTE(review): this writes "tt_xb" into the annotations of the
        # input segments themselves -- confirm that this is acceptable.
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.inputData.label
        for segment in self.inputData:
            attr = " ".join(
                ["%s='%s'" % item for item in segment.annotations.items()])
            segment.annotations["tt_xb"] = attr
            copy_of_input_seg.append(segment)
        self.progressBar.advance()

        concatenated_text = copy_of_input_seg.to_string(
            formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
            display_all=True,
        )
        self.progressBar.advance()

        tagged_text = self.tag(concatenated_text)
        tagged_input = Input(tagged_text)
        tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")
        self.progressBar.advance()

        # The recoding is the same whether or not XML output is requested,
        # so it is done once.
        # NOTE(review): the last substitution replaces three double quotes
        # with an identical string as written -- confirm the replacement.
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 "<w lemma='&3' type='&2'>&1</w>"),
                (re.compile(r'"""'), '"""'),
            ],
        )

        # The equality test is kept as is so that non-boolean setting
        # values behave exactly as before.
        if self.activer_xml == True:
            # XML checkbox enabled: send the recoded XML itself.
            final_segmentation = xml_segmentation
        else:
            # XML checkbox disabled: send the <w> elements as segments.
            final_segmentation = Segmenter.import_xml(xml_segmentation, "w")

        self.infoBox.dataSent("")

        # Save TreeTagger's link. The context manager closes the file even
        # if the write fails.
        if self.system == "nt":
            link_path = "treetagger_link.txt"
        else:
            link_path = os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt")
        with open(link_path, "w") as link_file:
            link_file.write(self.treetagger_link)

        # Clear progress bar.
        self.progressBar.finish()

        # Send the segmentation.
        self.send("Text data", final_segmentation, self)
        self.compteur += 1
        self.sendButton.resetSettingsChangedFlag()
class TestSegmentation(unittest.TestCase):
    """Test suite for LTTL Segment module"""

    def setUp(self):
        """Prepare the input, two segmentations and the expected output."""
        self.entire_text_seg = Input('ab cde')
        self.str_index = self.entire_text_seg[0].str_index
        text_id = self.str_index

        # Two words; only the first one is annotated.
        words = [
            Segment(
                str_index=text_id,
                start=0,
                end=2,
                annotations={'a': '1', 'bc': '20'},
            ),
            Segment(str_index=text_id, start=3, end=6),
        ]
        self.word_seg = Segmentation(words)

        # Two segments sharing position 4.
        overlapping = [
            Segment(str_index=text_id, start=3, end=5),
            Segment(str_index=text_id, start=4, end=6),
        ]
        self.overlapping_seg = Segmentation(overlapping)

        # Expected default rendering of word_seg.
        template = (
            'segment number 1\n'
            '\tcontent:\t"ab"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t0\n'
            '\tend:\t2\n'
            '\tannotations:\n'
            '\t\ta 1\n'
            '\t\tbc 20\n'
            'segment number 2\n'
            '\tcontent:\t"cde"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t3\n'
            '\tend:\t6'
        )
        self.base_output_string = template % (text_id, text_id)

        self.count = 0

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_creator(self):
        """Is the object built by the creator a Segmentation?"""
        created = Segmentation()
        self.assertIsInstance(
            created,
            Segmentation,
            msg="creator doesn't return Segmentation object!",
        )

    def test_to_string_default_format(self):
        """Is the default to_string() output the expected one?"""
        rendered = self.word_seg.to_string()
        self.assertEqual(
            rendered,
            self.base_output_string,
            msg="to_string() doesn't format segmentation correctly by default!",
        )

    def test_to_string_header(self):
        """Is the header placed before the default output?"""
        rendered = self.word_seg.to_string(header='HEADER')
        expected = 'HEADER' + self.base_output_string
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format header correctly!",
        )

    def test_to_string_footer(self):
        """Is the footer placed after the default output?"""
        rendered = self.word_seg.to_string(footer='FOOTER')
        expected = self.base_output_string + 'FOOTER'
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format footer correctly!",
        )

    def test_to_string_humanize_addresses(self):
        """Are str_index and start shifted by one when humanizing?"""
        rendered = self.word_seg.to_string(humanize_addresses=True)

        # Build the expected string by shifting the start positions and the
        # string index of the default output by one.
        raw_index = 'x:\t%i' % self.str_index
        shifted_index = 'x:\t%i' % (self.str_index + 1)
        expected = (
            self.base_output_string
            .replace('t:\t3', 't:\t4')
            .replace('t:\t0', 't:\t1')
            .replace(raw_index, shifted_index)
        )

        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't humanize addresses!",
        )

    def test_to_string_interpolate_builtin_variables(self):
        """Are the builtin variables interpolated in the format?"""
        line_format = (
            '%(__num__)s,%(__content__)s,'
            '%(__str_index__)s,%(__start__)s,%(__end__)s,'
            '%(__str_index_raw__)s,%(__start_raw__)s,%(__end_raw__)s'
        )
        rendered = self.word_seg.to_string(formatting=line_format)

        # The string index appears four times in the expected output.
        expected = (
            '1,ab,%i,0,2,%i,0,2\n2,cde,%i,3,6,%i,3,6'
            % ((self.str_index,) * 4)
        )
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't interpolate builtin variables!",
        )

    def test_to_string_interpolate_annotations(self):
        """Are annotation values interpolated in the format?"""
        rendered = self.word_seg.to_string(formatting='%(a)s')
        self.assertEqual(
            rendered,
            '1\n__none__',
            msg="to_string() doesn't interpolate annotations!",
        )

    def test_to_string_progress(self):
        """Is the progress callback called once per segment?"""

        def count_call():
            """Count one more call."""
            self.count += 1

        self.word_seg.to_string(progress_callback=count_call)
        expected_calls = len(self.word_seg)
        self.assertEqual(
            self.count,
            expected_calls,
            msg="to_string doesn't track progress!",
        )

    def test_get_annotation_keys(self):
        """Are the existing annotation keys returned?"""
        found_keys = self.word_seg.get_annotation_keys()
        self.assertEqual(
            sorted(found_keys),
            sorted(['a', 'bc']),
            msg="get_annotation_keys() doesn't return existing annotations!",
        )

    def test_is_non_overlapping(self):
        """Is the absence of overlap recognized?"""
        verdict = self.word_seg.is_non_overlapping()
        self.assertTrue(
            verdict,
            msg="is_non_overlapping() doesn't recognize absence of overlap!",
        )

    def test_is_overlapping(self):
        """Is the presence of overlap recognized?"""
        verdict = self.overlapping_seg.is_non_overlapping()
        self.assertFalse(
            verdict,
            msg="is_non_overlapping() doesn't recognize presence of overlap!",
        )
def sendData(self):
    """Load files, create and send segmentation.

    Each entry of the file list is read according to the type guessed by
    ``filetype.guess``: raw text when the type is unknown, PDF extraction
    (textual or OCR) for PDF files, OCR for image files. One Input is
    created per file, annotated as requested in the advanced settings, and
    the result (a single Input or their concatenation) is sent on the
    "Text data" channel. When a file cannot be read, an error is displayed
    and None is sent instead.
    """
    # Check that there's something on input...
    if (
        (self.displayAdvancedSettings and not self.files)
        or not (self.file or self.displayAdvancedSettings)
    ):
        self.infoBox.setText(u'Please select input file.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    if (
        self.displayAdvancedSettings
        and self.autoNumber
        and not self.autoNumberKey
    ):
        self.infoBox.setText(
            u'Please enter an annotation key for auto-numbering.',
            'warning')
        self.send('Text data', None, self)
        return

    # Clear created Inputs...
    self.clearCreatedInputs()

    fileContents = list()
    annotations = list()
    counter = 1

    if self.displayAdvancedSettings:
        myFiles = self.files
    else:
        myFiles = [[self.file, self.encoding, "", "", "", "eng", False]]

    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(myFiles))

    def abort(message):
        """Report `message` as an error, send None and restore the GUI."""
        progressBar.finish()
        self.infoBox.setText(message, 'error')
        self.send('Text data', None, self)
        self.controlArea.setDisabled(False)

    # Open and process each file successively...
    for myFile in myFiles:
        filePath = myFile[0]
        encoding = re.sub(r"[ ]\(.+", "", myFile[1])
        annotation_key = myFile[2]
        annotation_value = myFile[3]
        ocr_languages = myFile[5]
        ocr_force = myFile[6]
        myFiletype = filetype.guess(filePath)

        # Try to open the file...
        self.error()

        # -1 is the value the code below treats as "could not open".
        # Starting from it means a recognized type that is neither PDF nor
        # image is reported as an error instead of raising NameError or
        # silently reusing the content of the previous file.
        fileContent = -1
        try:
            if myFiletype is None:
                fileContent = self.extract_raw_text(filePath, encoding)
            elif myFiletype.extension == "pdf":
                if ocr_force is True:
                    fileContent = self.get_pdf_content(
                        filePath,
                        ocr_languages,
                    )
                elif self.is_textual_pdf_file(filePath) is True:
                    fileContent = self.extract_text_from_pdf(filePath)
                else:
                    fileContent = self.get_pdf_content(
                        filePath,
                        ocr_languages,
                    )
            elif myFiletype.extension in IMG_FILETYPES:
                fileContent = self.ocrize(filePath, ocr_languages)
        except IOError as e:
            if "tesseract" in str(e):
                QMessageBox.warning(None, 'Textable', str(e), QMessageBox.Ok)
            if len(myFiles) > 1:
                abort(u"Couldn't open file '%s'." % filePath)
            else:
                abort(u"Couldn't open file.")
            return

        if fileContent == -1:
            abort(u"Couldn't open file.")
            return

        # Remove utf-8 BOM if necessary...
        if encoding == u'utf-8':
            fileContent = fileContent.lstrip(
                codecs.BOM_UTF8.decode('utf-8'))

        # Normalize text (canonical decomposition then composition)...
        fileContent = normalize('NFC', fileContent)
        fileContents.append(fileContent)

        # Annotations...
        annotation = dict()
        if self.displayAdvancedSettings:
            if annotation_key and annotation_value:
                annotation[annotation_key] = annotation_value
            if self.importFilenames and self.importFilenamesKey:
                annotation[self.importFilenamesKey] = os.path.basename(
                    filePath)
            if self.autoNumber and self.autoNumberKey:
                annotation[self.autoNumberKey] = counter
                counter += 1
        annotations.append(annotation)
        progressBar.advance()

    # Create an LTTL.Input for each file; a single Input gets the widget's
    # caption as label.
    label = self.captionTitle if len(fileContents) == 1 else None
    for fileContent, annotation in zip(fileContents, annotations):
        myInput = Input(fileContent, label)
        segment = myInput[0]
        segment.annotations.update(annotation)
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one file, the widget's output is the created Input.
    if len(fileContents) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    # Report the number of segments and characters sent.
    message = u'%i segment@p sent to output ' % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += u'(%i character@p).' % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    For each entry of ``self.myBasket``, the Gutenberg book id is looked up
    in the Gutenberg cache, the text is downloaded with its headers
    stripped, and an Input is created from it. The resulting segments are
    annotated with a "title" taken from the basket entry and sent on the
    "Gutenberg importation" channel.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some books first",
            "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket),
    )

    text_content = list()
    annotations = list()

    # Get the Gutenberg cache.
    cache = GutenbergCache.get_cache()
    try:
        for text in self.myBasket:
            # Get the Gutenberg id of the text.
            # NOTE(review): the query is built by string formatting;
            # text[2] presumably comes from the local cache rather than
            # from user input -- confirm.
            query_id = cache.native_query(
                sql_query=
                "select gutenbergbookid from books where id == {selected_id}"
                .format(selected_id=text[2]))
            gutenberg_id = list(query_id)

            # Get the text with Gutenbergpy.
            gutenberg_text = gutenbergpy.textget.strip_headers(
                gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))
            text_content.append(gutenberg_text)
            annotations.append(text[1])
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Finish the progress bar so that it is not left running, then set
        # Info box and widget to "error" state.
        progressBar.finish()
        self.infoBox.setText("Couldn't download data from Gutenberg",
                             "error")
        self.controlArea.setDisabled(False)
        return

    # Store downloaded text strings in input objects...
    for text in text_content:
        newInput = Input(text, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one text, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update({"title": annotations[idx]})
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    For each entry of ``self.myBasket``, the reviews and the movie record
    are fetched through ``self.ia``. One Input is created per review, each
    resulting segment is annotated with its movie's "title" and "year", and
    the segmentation is sent on the "Segmentation" channel.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some movies first",
            "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # Connect to imdb and collect reviews and movie records.
    list_review = list()
    list_annotation = list()
    try:
        for item in self.myBasket:
            movie = self.ia.get_movie_reviews(item['id'])
            movie_annotations = self.ia.get_movie(item['id'])
            list_review.append(movie)
            list_annotation.append(movie_annotations)
            # 1 tick on the progress bar of the widget
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Finish the progress bar so that it is not left running, then set
        # Info box and widget to "error" state.
        progressBar.finish()
        self.infoBox.setText("Couldn't download data from imdb", "error")
        self.controlArea.setDisabled(False)
        return

    # Store each review in an Input and record, in the same order, the
    # annotations of the movie it belongs to. Pairing them here keeps
    # reviews and annotations aligned whatever the number of reviews per
    # movie, instead of assuming exactly 25.
    annotations = list()
    for movie, movie_record in zip(list_review, list_annotation):
        # NOTE(review): "title" holds the whole object returned by
        # get_movie(), as in the original code -- confirm whether its title
        # string was intended.
        movie_annotation = {"title": movie_record, "year": movie_record["year"]}
        data = movie.get('data') or {}
        for review in data.get('reviews') or []:
            self.createdInputs.append(Input(review.get('content')))
            annotations.append(movie_annotation)

    # If there's only one item, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = f"{len(self.segmentation)} segment@p sent to output"
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += " (%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send('Segmentation', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    Every input segment is analyzed with ``self.nlp``; each resulting token
    becomes an output segment located in the same string as the input
    segment, carrying the input annotations plus the token attributes
    listed in ``RELEVANT_KEYS`` whose value is not None. The segmentation
    is sent on the "Linguistically analyzed data" channel.
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send("Linguistically analyzed data", None, self)
        return

    # Initialize progress bar.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    tokenizedSegments = list()

    # Process each input segment...
    for segment in self.inputSeg:

        # Input segment attributes...
        inputContent = segment.get_content()
        inputAnnotations = segment.annotations
        inputString = segment.str_index
        # A start of None (or 0) is treated as the beginning of the string.
        inputStart = segment.start or 0

        # NLP analysis...
        doc = self.nlp(inputContent)

        # Process each token in input segment...
        for token in doc:
            tokenAnnotations = inputAnnotations.copy()
            for key in RELEVANT_KEYS:
                value = getattr(token, key)
                if value is not None:
                    tokenAnnotations[key] = value
            # token.idx is added to the segment's own start to locate the
            # token in the underlying string.
            tokenStart = inputStart + token.idx
            tokenizedSegments.append(
                Segment(
                    str_index=inputString,
                    start=tokenStart,
                    end=tokenStart + len(token),
                    annotations=tokenAnnotations,
                ))

        progressBar.advance()

    outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output." % len(outputSeg)
    message = pluralize(message, len(outputSeg))
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Send data to output...
    self.send("Linguistically analyzed data", outputSeg, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    For each entry of ``self.myBasket``, the text whose Gutenberg id is the
    entry's third item is downloaded, stripped of its headers and decoded
    as UTF-8. One Input is created per text, the resulting segments are
    annotated with "title", "author" and "language" taken from the basket
    entry, and the segmentation is sent on the "Gutenberg importation"
    channel.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some books first",
            "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(
        self,
        iterations=len(self.myBasket),
    )

    text_content = list()
    annotations = list()

    try:
        # Retrieve selected texts from gutenberg
        for text in self.myBasket:
            gutenberg_id = text[2]

            # Get the text with Gutenbergpy
            gutenberg_text = gutenbergpy.textget.strip_headers(
                gutenbergpy.textget.get_text_by_id(gutenberg_id)).decode(
                    "utf-8")
            text_content.append(gutenberg_text)
            # populate the annotation list
            annotations.append([text[0], text[1], text[3]])
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception as exc:
        # Finish the progress bar so that it is not left running, then set
        # Info box and widget to "error" state.
        progressBar.finish()
        self.infoBox.setText("Couldn't download data from Gutenberg",
                             "error")
        self.controlArea.setDisabled(False)
        print(exc)
        return

    # Store downloaded text strings in input objects...
    for text in text_content:
        newInput = Input(text, self.captionTitle)
        self.createdInputs.append(newInput)

    # If there's only one text, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation.
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments with book metadata
    for idx, segment in enumerate(self.segmentation):
        title, author, language = annotations[idx]
        segment.annotations.update({
            "title": title,
            "author": author,
            "language": language,
        })
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def setUp(self):
    """Build the Input objects and segmentations shared by the tests."""
    # Fixtures defined on the text 'ab cde'...
    self.entire_text_seg = Input('ab cde')
    main_idx = self.entire_text_seg[0].str_index

    self.word_seg = Segmentation([
        Segment(
            str_index=main_idx, start=0, end=2, annotations={'a': '1'}
        ),
        Segment(str_index=main_idx, start=3, end=6),
    ])

    # One segment per character of 'ab cde' (including the space).
    self.char_seg = Segmentation([
        Segment(str_index=main_idx, start=pos, end=pos + 1)
        for pos in range(6)
    ])

    self.letter_seg1 = Segmentation([
        Segment(
            str_index=main_idx, start=0, end=1, annotations={'a': '1'}
        ),
        Segment(str_index=main_idx, start=1, end=2),
    ])
    self.letter_seg2 = Segmentation([
        Segment(str_index=main_idx, start=3, end=4),
        Segment(
            str_index=main_idx, start=4, end=5, annotations={'b': '2'}
        ),
        Segment(str_index=main_idx, start=5, end=6),
    ])
    self.letter_seg = Segmentation([
        Segment(
            str_index=main_idx, start=0, end=1, annotations={'a': '1'}
        ),
        Segment(str_index=main_idx, start=1, end=2),
        Segment(str_index=main_idx, start=3, end=4),
        Segment(
            str_index=main_idx, start=4, end=5, annotations={'b': '2'}
        ),
        Segment(str_index=main_idx, start=5, end=6),
    ])
    self.single_letter_seg = Segmentation([
        Segment(
            str_index=main_idx, start=4, end=5, annotations={'b': '1'}
        ),
    ])
    self.duplicate_seg = Segmentation([
        Segment(str_index=main_idx, start=0, end=1)
        for _ in range(2)
    ])
    self.overlapping_seg = Segmentation([
        Segment(str_index=main_idx, start=3, end=5),
        Segment(str_index=main_idx, start=4, end=6),
    ])

    # Fixtures defined on the text 'abbccc': one annotated segment per
    # character, with annotation values '1', '1', '1', '2', '2', '3'.
    self.other_entire_text_seg = Input('abbccc')
    other_idx = self.other_entire_text_seg[0].str_index
    self.other_letter_seg = Segmentation([
        Segment(
            str_index=other_idx,
            start=pos,
            end=pos + 1,
            annotations={'a': value},
        )
        for pos, value in enumerate('111223')
    ])

    # Fixtures defined on the text 'bd1'...
    self.third_entire_text_seg = Input('bd1')
    third_idx = self.third_entire_text_seg[0].str_index
    self.third_letter_seg = Segmentation([
        Segment(str_index=third_idx, start=0, end=1),
        Segment(
            str_index=third_idx, start=1, end=2, annotations={'a': '2'}
        ),
        Segment(
            str_index=third_idx, start=2, end=3, annotations={'a': 'b'}
        ),
    ])

    # Fixtures defined on a text containing a non-ASCII character...
    self.fourth_entire_text_seg = Input('AB cd\xe9')
    fourth_idx = self.fourth_entire_text_seg[0].str_index
    self.second_word_seg = Segmentation([
        Segment(str_index=fourth_idx, start=0, end=2),
        Segment(str_index=fourth_idx, start=3, end=6),
    ])

    # XML fixtures...
    self.xml_seg = Input('<a attr="1"><a attr="2/3/">c<a/>d</a></a>')
    self.wrong_xml_seg = Input('<a><a>test</a>')
    self.wrong_xml_seg2 = Input('<a>test</a></a>')

    # A document split over two separate Inputs...
    self.part_xml_seg = Input('<a>1<a>2<a>3</a>4')
    first_part_idx = self.part_xml_seg[0].str_index
    self.part_xml_seg2 = Input('</a>5</a>')
    second_part_idx = self.part_xml_seg2[0].str_index
    self.broken_xml_seg = Segmentation([
        Segment(str_index=first_part_idx, annotations={'a': '1'}),
        Segment(str_index=second_part_idx),
    ])

    self.count = 0
def sendData(self):
    """Tag the input segmentation with TreeTagger and send the result.

    The annotations of every input segment are serialized into a
    temporary "tt_ax" annotation, the whole segmentation is dumped into a
    single string wrapped in <ax_tt> elements, and that string is tagged
    in one TreeTagger call. The tagged text is then re-segmented on the
    <ax_tt> elements and every tagger output line is wrapped in a <w>
    element. Depending on ``self.outputFormat`` the output is either that
    XML-tagged segmentation or one segment per <w> element. The output
    goes to the "Tagged data" channel; None is sent when a precondition
    fails or when the XML cannot be imported.
    """
    # Clear created Inputs...
    self.clearCreatedInputs()

    # Preconditions: TreeTagger path, available languages, input data.
    if not self.TreetaggerPath:
        self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.getAvailableLanguages():
        self.infoBox.setText(self.noLanguageParameterWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.segmentation:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Tagged data", None)
        return

    # Initialize progress bar.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    self.progressBar = ProgressBar(self, iterations=5)

    # Collect the input segments in a new segmentation, storing their
    # annotations (as XML attributes) in a temporary annotation. Note that
    # the segments themselves are the input ones, not copies.
    annotated_seg = Segmentation()
    annotated_seg.label = self.segmentation.label
    for segment in self.segmentation:
        pairs = list()
        for key, value in segment.annotations.items():
            # Strip combining marks (category 'Mn') from the key after
            # canonical decomposition.
            decomposed = unicodedata.normalize('NFD', key)
            plain_key = ''.join(
                char for char in decomposed
                if unicodedata.category(char) != 'Mn'
            )
            pairs.append("%s=%s" % (plain_key, quoteattr(str(value))))
        segment.annotations["tt_ax"] = " ".join(pairs)
        annotated_seg.append(segment)
    # NOTE(review): ticked once per processing stage here, assuming the
    # tick sits outside the loop above (iterations is a fixed 5) - confirm.
    self.progressBar.advance()

    # Dump segmentation in unique string to avoid multiple calls to TT...
    concatenated_text = annotated_seg.to_string(
        formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
        display_all=True,
    )
    self.progressBar.advance()

    # Tag the segmentation contents...
    base_options = '-token -lemma -sgml -quiet'
    tagopt = (
        base_options + " -no-unknown" if self.replaceUnknown
        else base_options
    )
    language_code = pycountry.languages.get(name=self.language).alpha_2
    tagger = treetaggerwrapper.TreeTagger(
        TAGLANG=language_code,
        TAGOPT=tagopt,
        TAGDIR=self.TreetaggerPath,
    )
    tagged_lines = tagger.tag_text(
        concatenated_text,
        notagurl=True,
        notagemail=True,
        notagip=True,
        notagdns=True,
    )
    tagged_input = Input("\n".join(tagged_lines))
    self.createdInputs.append(tagged_input)

    # Recode the tagger output, then re-segment it on the <ax_tt> elements
    # to match the original segmentation structure.
    # NOTE(review): the second substitution replaces a sequence of three
    # double quotes with itself and looks like a no-op - confirm the
    # intended pattern and replacement.
    cleanup_substitutions = [
        (re.compile(r"<unknown>"), "[unknown]"),
        (re.compile(r'"""'), '"""'),
    ]
    tagged_segmentation, _ = Segmenter.recode(
        tagged_input,
        substitutions=cleanup_substitutions,
    )
    tagged_segmentation = Segmenter.import_xml(tagged_segmentation, "ax_tt")
    self.progressBar.advance()

    # Place each output line of Treetagger in an xml tag with annotations..
    wrapping_substitutions = [
        (
            re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
            '<w lemma="&3" pos-tag="&2">&1</w>',
        ),
        (re.compile(r'^\n|\n$'), ''),
    ]
    xml_segmentation, _ = Segmenter.recode(
        tagged_segmentation,
        substitutions=wrapping_substitutions,
    )

    # Segment into individual tokens if XML output option is disabled...
    if self.outputFormat != "add XML tags":
        try:
            output_segmentation = Segmenter.import_xml(
                xml_segmentation, "w")
        except ValueError:
            self.infoBox.setText(
                "Please check that either the input contains well-formed "
                "XML, or it doesn't contain instances of '<' and '\x3e'",
                "error")
            self.send("Tagged data", None)
            self.progressBar.finish()
            self.controlArea.setDisabled(False)
            return
    else:
        output_segmentation = xml_segmentation

    self.progressBar.finish()
    self.controlArea.setDisabled(False)

    # Label the output, report its size and send it...
    output_segmentation.label = self.captionTitle
    num_segments = len(output_segmentation)
    message = pluralize(
        u'%i segment@p sent to output.' % num_segments,
        num_segments,
    )
    self.infoBox.setText(message)
    self.send('Tagged data', output_segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Download the scripts of the movies in the basket and send them.

    Every entry of ``self.myBasket`` is split on its last '(' into a title
    and a year, which become the "Movie Title" and "Year of release"
    annotations of the matching output segment. The script itself is
    read from the "movie_script" <div> of the page whose address is built
    from ``self.path_storage[movie]``. The result is stored in
    ``self.segmentation`` and sent on the "Movie transcripts" channel.

    With an empty basket, None is sent. If anything goes wrong while
    downloading, an error is shown in the info box and nothing is sent.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some movies first",
            "warning"
        )
        self.segmentation = None
        self.send("Movie transcripts", self.segmentation, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    annotations = list()
    script_list = list()
    self.controlArea.setDisabled(True)

    # Initialize progress bar (one tick per movie).
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # This part of code is what fetches the actual script
    try:
        for movie in self.myBasket:
            # Each movie that is in the corpus is split into title and year
            # (rsplit makes sure to only split last occurence) which will
            # become annotations
            title_and_year = movie.rsplit('(', 1)
            movie_title = title_and_year[0]
            # Drop the last character (the closing parenthesis in entries
            # of the form "Title (year)").
            movie_year = title_and_year[-1][:-1]

            # A new dictionary is built for every movie, so that entries
            # of the annotations list never share state.
            annotations.append({
                "Movie Title": movie_title,
                "Year of release": movie_year,
            })

            # link_end and page_url are the two variables that will have to
            # be changed in case scripts need to be taken from elsewhere
            link_end = self.path_storage[movie]
            page_url = "https://www.springfieldspringfield.co.uk/" + \
                "movie_script.php?movie=" + link_end

            # The response is closed as soon as the page has been parsed.
            with urllib.request.urlopen(page_url) as page:
                soup = BeautifulSoup(page, 'html.parser')

            # This is what grabs the movie script
            script = soup.find("div", {"class": "movie_script"})
            script_list.append(script.text)

            # 1 tick on the progress bar of the widget
            progressBar.advance()

    # Exception (rather than a bare except) lets KeyboardInterrupt and
    # SystemExit propagate.
    except Exception:
        self.infoBox.setText(
            "Couldn't download data from SpringfieldSpringfield website.",
            "error"
        )
        # The progress bar must be finished on this path too, otherwise
        # it is left dangling after the early return.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store downloaded script strings in input objects...
    for script in script_list:
        self.createdInputs.append(Input(script, self.captionTitle))

    # If there's only one script, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Movie transcripts", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def clearCreatedInputIndices(self):
    """Set the data stored under each created input index to None."""
    for str_index in self.createdInputIndices:
        Segmentation.set_data(str_index, None)
def test_creator(self):
    """Check that calling Segmentation() yields a Segmentation instance."""
    created = Segmentation()
    self.assertIsInstance(
        created,
        Segmentation,
        msg="creator doesn't return Segmentation object!",
    )
def sendData(self):
    """Run the morphological analysis on the input and send the result.

    The contents of the input segments are counted, signatures, stems and
    suffixes are learned from these counts with lxa5crab, and a parser is
    built from them. Every input segment is copied; when the first parse
    of its content has a signature, the copy is annotated with "stem",
    "suffix" ("NULL" for an empty suffix) and "signature". The copies are
    sent on the "Morphologically analyzed data" channel and the
    intermediate results are kept in ``self.morphology``.

    None is sent when there is no input, or when signature extraction
    raises ValueError (its message is then shown as a warning).
    """
    # Clear morphology...
    self.morphology = dict()

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send("Morphologically analyzed data", None, self)
        self.updateGUI()
        return

    # Perform morphological analysis...

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait (word count)...",
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=100)

    # Word count...
    wordCounts = collections.Counter(
        [segment.get_content() for segment in self.inputSeg])
    self.morphology["wordCounts"] = wordCounts
    self.infoBox.setText(
        u"Processing, please wait (signature extraction)...",
        "warning",
    )
    progressBar.advance(5)  # 5 ticks on the progress bar...

    # Learn signatures...
    try:
        lxa5crab.crab_nebula.MIN_STEM_LEN = self.minStemLen
        signatures, stems, suffixes = lxa5crab.find_signatures(wordCounts)
        self.morphology["signatures"] = signatures
        self.morphology["stems"] = stems
        self.morphology["suffixes"] = suffixes
    except ValueError as e:
        self.infoBox.setText(str(e), "warning")
        self.send("Morphologically analyzed data", None, self)
        self.controlArea.setDisabled(False)
        progressBar.finish()  # Clear progress bar.
        self.morphology = dict()
        self.updateGUI()
        return
    self.infoBox.setText(
        u"Processing, please wait (word parsing)...",
        "warning",
    )
    progressBar.advance(80)

    # Parse words...
    parser = lxa5crab.build_parser(wordCounts, signatures, stems, suffixes)
    self.morphology["parser"] = parser
    newSegments = list()
    num_analyzed_words = 0
    for segment in self.inputSeg:
        # Only the first parse of each word is used.
        best_parse = parser[segment.get_content()][0]
        newSegment = segment.deepcopy()
        if best_parse.signature:
            num_analyzed_words += 1
            newSegment.annotations.update({
                "stem": best_parse.stem,
                "suffix": best_parse.suffix
                if len(best_parse.suffix) else "NULL",
                "signature": best_parse.signature,
            })
        newSegments.append(newSegment)
    self.send(
        "Morphologically analyzed data",
        Segmentation(newSegments, self.captionTitle),
        self,
    )
    self.updateGUI()
    progressBar.advance(15)

    # Set status to OK and report data size...
    num_segments = len(self.inputSeg)
    # An empty input segmentation would otherwise divide by zero here.
    if num_segments:
        percent_analyzed = num_analyzed_words / num_segments * 100
    else:
        percent_analyzed = 0.0
    message = "%i segment@p sent to output (%.2f%% analyzed)." % (
        num_segments, percent_analyzed)
    message = pluralize(message, num_segments)
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Convert input(s) and send output.

    Two independent conversions are performed, depending on which inputs
    are present:

    * ``self.corpus`` is converted to a segmentation with one segment per
      row whose text feature is not empty; the row's other attribute and
      meta values (except "?") become annotations. The result is sent on
      'Textable segmentation'.
    * ``self.segmentation`` is converted to a corpus with one row per
      segment; annotation keys become discrete attributes, or string
      metas when category limiting is on and a key has more than
      ``self.maxNumCategories`` distinct values. The segment content is
      stored in a "textable_text" meta. The result is sent on
      'Text Mining corpus'.

    None is sent on a channel whose source input is missing.
    """
    if not (self.segmentation or self.corpus):
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Textable segmentation', None, self)
        self.send('Text Mining corpus', None)
        return

    msg_seg = msg_corpus = ""

    # One progress tick per corpus row and per input segment.
    num_iterations = 0
    if self.corpus:
        num_iterations += len(self.corpus)
    if self.segmentation:
        num_iterations += len(self.segmentation)
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=num_iterations)

    # Convert corpus to segmentation...
    if self.corpus:
        self.clearCreatedInputs()
        new_segments = list()
        text_feature = self.corpus.text_features[self.segmentContent]
        for row in self.corpus:
            content = row[text_feature].value
            if content == "":
                # Skipped rows still count as processed, so that the
                # progress bar can reach its end.
                progressBar.advance()
                continue
            new_input = Input(content)
            new_segment_annotations = dict()
            for attr in self.corpus.domain:
                attr_str = str(row[attr])
                if attr_str != "?":
                    new_segment_annotations[str(attr)] = attr_str
            for meta_attr in self.corpus.domain.metas:
                meta_attr_str = str(row[meta_attr])
                if (meta_attr != text_feature and
                        meta_attr_str != "?"):
                    new_segment_annotations[str(meta_attr)] = meta_attr_str
            first_segment = new_input[0]
            new_segments.append(
                Segment(
                    first_segment.str_index,
                    first_segment.start,
                    first_segment.end,
                    new_segment_annotations,
                )
            )
            self.createdInputs.append(new_input)
            progressBar.advance()
        new_segmentation = Segmentation(new_segments, self.captionTitle)
        msg_seg = u'%i segment@p' % len(new_segmentation)
        msg_seg = pluralize(msg_seg, len(new_segmentation))
        self.send('Textable segmentation', new_segmentation, self)
    else:
        self.send('Textable segmentation', None, self)

    # Convert segmentation to corpus...
    if self.segmentation:
        metas = list()
        attributes = list()
        meta_keys = list()
        attribute_keys = list()
        for key in self.segmentation.get_annotation_keys():
            # Collect the distinct values taken by this annotation key.
            possible_values = set()
            for segment in self.segmentation:
                try:
                    possible_values.add(str(segment.annotations[key]))
                except KeyError:
                    pass
            if (self.limitNumCategories and
                    len(possible_values) > self.maxNumCategories):
                metas.append(StringVariable(key))
                meta_keys.append(key)
            else:
                attributes.append(
                    DiscreteVariable(key, values=list(possible_values)))
                attribute_keys.append(key)
        metas.append(StringVariable("textable_text"))
        domain = Domain(attributes, [], metas)
        rows = list()
        for segment in self.segmentation:
            # Row layout: attribute values, then meta values, then the
            # segment content (matching the trailing "textable_text" meta).
            row = [
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in attribute_keys
            ]
            row.extend([
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in meta_keys
            ])
            row.append(segment.get_content())
            rows.append(row)
            # The call parentheses matter: a bare attribute reference
            # would never tick the progress bar.
            progressBar.advance()
        table = Table(domain, rows)
        if textMiningIsInstalled:
            corpus = Corpus(domain,
                            X=table.X,
                            metas=table.metas,
                            text_features=[metas[-1]])
            msg_corpus = u'%i document@p' % len(self.segmentation)
            msg_corpus = pluralize(msg_corpus, len(self.segmentation))
            self.send('Text Mining corpus', corpus)
        else:
            # No Corpus object can be built in this case, so nothing is
            # reported as sent and the channel is reset.
            self.send('Text Mining corpus', None)
    else:
        self.send('Text Mining corpus', None)

    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Report what has been sent ("X segments and Y documents")...
    if msg_seg or msg_corpus:
        message = msg_seg
        if msg_seg and msg_corpus:
            message += " and "
        message += msg_corpus
        message += " sent to output."
        self.infoBox.setText(message)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Build a segmentation of character mentions and send it.

    The rows of ``self.char_df`` tagged "PER" are turned into segments of
    the input. Their "start_pos"/"end_pos" values are interpreted as
    offsets in the concatenation of the input segments' contents, with
    one separating character between consecutive segments, and converted
    to offsets relative to the segment that contains them. Each output
    segment is annotated with the row's "alias" under the key "id". The
    result is sent on the "Character segmentation" channel.

    When there is no model or no input, a message is displayed and
    ``self.sendNoneToOutputs()`` is called instead.
    """
    # Check that there's a model...
    if not self.model:
        self.noLanguageModelWarning()
        self.sendNoneToOutputs()
        return

    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input.", "warning")
        self.sendNoneToOutputs()
        return

    # Initialize progress bar.
    self.infoBox.setText(
        u"Processing, please wait...",
        "warning",
    )

    # Disable control area and initialize progress bar...
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.char_df))

    # Get start and end pos of concatenated input segments...
    startPositions = [0]
    endPositions = list()
    numSegments = len(self.inputSeg)
    for idx in range(1, numSegments):
        prevSegLen = len(self.inputSeg[idx-1].get_content())
        # The "+ 1" accounts for the separator between two segments.
        startPositions.append(startPositions[-1] + prevSegLen + 1)
        endPositions.append(startPositions[-1] - 1)
    endPositions.append(
        startPositions[-1] + len(self.inputSeg[-1].get_content()) + 1)

    # Get or update character aliases...
    find_pairs = sys.modules['charnetto.find_pairs']
    characters = [entry.split(", ") for entry in self.characters]
    find_pairs.map_names(self.char_df, characters)

    # Initializations...
    charSegments = list()
    currentSegmentIdx = 0

    # For each character token in Charnetto's output...
    for _, charToken in self.char_df.iterrows():

        # Skip non-PER named entities. They still count as processed: the
        # progress bar is sized on all rows of the data frame.
        if charToken["tag"] != "PER":
            progressBar.advance()
            continue

        # Get index of containing segment (relies on tokens being visited
        # in increasing order of position, since the index never goes
        # back)...
        while charToken["end_pos"] > endPositions[currentSegmentIdx]:
            currentSegmentIdx += 1

        # Create segment for char with its actual coordinates...
        strIndex = self.inputSeg[currentSegmentIdx].str_index
        offset = startPositions[currentSegmentIdx]
        start = charToken["start_pos"] - offset
        end = charToken["end_pos"] - offset
        annotations = {"id": charToken["alias"]}
        charSegments.append(Segment(strIndex, start, end, annotations))

        progressBar.advance()

    # Send output...
    outputSegmentation = Segmentation(charSegments,
                                      label=self.captionTitle)
    self.send("Character segmentation", outputSegmentation, self)
    print(outputSegmentation.to_string())

    # Set status to OK and report data size...
    message = "%i segment@p sent to output." % len(outputSegmentation)
    message = pluralize(message, len(outputSegmentation))
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Download the lyrics of the songs in the basket and send them.

    Every entry of ``self.myBasket`` is a dict; its 'path' value is
    appended to "http://genius.com" to get the page whose lyrics are
    extracted with ``self.html_to_text``, and a copy of the whole dict
    becomes the annotations of the matching output segment. The result is
    stored in ``self.segmentation`` and sent on the "Lyrics importation"
    channel.

    If the basket is empty, or if a download fails, a message is shown in
    the info box and nothing is sent.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some songs first",
            "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar (one tick per song).
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # Attempt to connect to Genius and retrieve lyrics...
    song_content = list()
    annotations = list()
    try:
        for song in self.myBasket:
            # song is a dict {'idx1':{'title':'song1'...},
            # 'idx2':{'title':'song2'...}}
            page_url = "http://genius.com" + song['path']
            lyrics = self.html_to_text(page_url)
            song_content.append(lyrics)
            annotations.append(song.copy())
            # 1 tick on the progress bar of the widget
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    # Exception (rather than a bare except) lets KeyboardInterrupt and
    # SystemExit propagate.
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from Genius website.",
                             "error")
        # The progress bar must be finished on this path too, otherwise
        # it is left dangling after the early return.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store downloaded lyrics strings in input objects...
    for song in song_content:
        self.createdInputs.append(Input(song, self.captionTitle))

    # If there's only one song, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Lyrics importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Download the selected XML-TEI documents and send them.

    For every index in ``self.selectedTitles``, the "url" annotation of
    the matching segment of ``self.filteredTitleSeg`` is appended to
    ``self.document_base_url``, and its final "/name.shtml" component is
    rewritten to "/name/name.xml" to get the address of the XML file. A
    copy of the title segment's annotations is attached to the matching
    output segment. The result is stored in ``self.segmentation`` and
    sent on the "XML-TEI data" channel.

    Nothing happens when the title list is empty. None is sent when no
    title is selected or when a download fails.
    """
    # Skip if title list is empty:
    if not self.titleLabels:
        return

    # Check that something has been selected...
    if not self.selectedTitles:
        self.infoBox.setText("Please select one or more titles.",
                             "warning")
        self.send("XML-TEI data", None, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    # Initialize progress bar (one tick per selected title).
    progressBar = gui.ProgressBar(self,
                                  iterations=len(self.selectedTitles))

    # Attempt to connect to ECP and retrieve the documents...
    xml_contents = list()
    annotations = list()
    try:
        for title in self.selectedTitles:
            title_annotations = self.filteredTitleSeg[title].annotations
            doc_url = self.document_base_url + title_annotations["url"]
            print(doc_url)
            url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url)
            print(url)
            # The response is closed as soon as it has been read.
            with urllib.request.urlopen(url) as response:
                xml_contents.append(response.read().decode('utf-8'))
            annotations.append(title_annotations.copy())
            progressBar.advance()  # 1 tick on the progress bar...

    # If an error occurs (e.g. http error, or memory error)...
    # Exception (rather than a bare except) lets KeyboardInterrupt and
    # SystemExit propagate.
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from ECP website.",
                             "error")
        # The progress bar must be finished on this path too, otherwise
        # it is left dangling after the early return.
        progressBar.finish()
        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return

    # Store downloaded XML in input objects...
    for xml_content in xml_contents:
        self.createdInputs.append(Input(xml_content, self.captionTitle))

    # If there's only one document, the widget's output is the created
    # Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Store imported URLs as setting.
    # NOTE(review): only the URL of the first selected title is stored -
    # confirm this is intended when several titles are selected.
    self.importedURLs = [
        self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
    ]

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()

    # Send token...
    self.send("XML-TEI data", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def clearCreatedInputs(self):
    """Reset the data of every created Input, then empty the list."""
    for created_input in self.createdInputs:
        first_segment = created_input[0]
        Segmentation.set_data(first_segment.str_index, None)
    # Emptied in place, so that the list object itself is kept.
    self.createdInputs[:] = []
class ECP(OWTextableBaseWidget): """Textable widget for importing XML-TEI data from the Eighteenth Century Poetry website (http://www.eighteenthcenturypoetry.org/) """ #---------------------------------------------------------------------- # Widget"s metadata... name = "18th Century Poetry" description = "Import XML-TEI data from ECP website" icon = "icons/18th_century_poetry.svg" priority = 10 #---------------------------------------------------------------------- # Channel definitions (NB: no input in this case)... inputs = [] outputs = [("XML-TEI data", Segmentation)] #---------------------------------------------------------------------- # Settings... settingsHandler = VersionedSettingsHandler( version=__version__.rsplit(".", 1)[0]) autoSend = settings.Setting(False) selectedTitles = settings.Setting([]) titleLabels = settings.Setting([]) filterCriterion = settings.Setting("author") filterValue = settings.Setting("(all)") importedURLs = settings.Setting([]) displayAdvancedSettings = settings.Setting(False) want_main_area = False def __init__(self): """Widget creator.""" super().__init__() # Other attributes... self.segmentation = None self.createdInputs = list() self.titleSeg = None self.filteredTitleSeg = None self.filterValues = dict() self.base_url = \ u"http://www.eighteenthcenturypoetry.org/works/#genres" self.document_base_url = \ u"http://www.eighteenthcenturypoetry.org" # Next two instructions are helpers from TextableUtils. Corresponding # interface elements are declared here and actually drawn below (at # their position in the UI)... self.infoBox = InfoBox(widget=self.controlArea) self.sendButton = SendButton( widget=self.controlArea, master=self, callback=self.sendData, infoBoxAttribute="infoBox", sendIfPreCallback=self.updateGUI, ) # The AdvancedSettings class, also from TextableUtils, facilitates # the management of basic vs. advanced interface. 
An object from this # class (here assigned to self.advancedSettings) contains two lists # (basicWidgets and advancedWidgets), to which the corresponding # widgetBoxes must be added. self.advancedSettings = AdvancedSettings( widget=self.controlArea, master=self, callback=self.updateFilterValueList, ) # User interface... # Advanced settings checkbox (basic/advanced interface will appear # immediately after it... self.advancedSettings.draw() # Filter box (advanced settings only) filterBox = gui.widgetBox( widget=self.controlArea, box="Filter", orientation="vertical", ) filterCriterionCombo = gui.comboBox( widget=filterBox, master=self, value="filterCriterion", items=["author", "genre"], sendSelectedValue=True, orientation="horizontal", label="Criterion:", labelWidth=120, callback=self.updateFilterValueList, tooltip=( "Please select a criterion for searching the title list\n"), ) filterCriterionCombo.setMinimumWidth(120) gui.separator(widget=filterBox, height=3) self.filterValueCombo = gui.comboBox( widget=filterBox, master=self, value="filterValue", sendSelectedValue=True, orientation="horizontal", label="Value:", labelWidth=120, callback=self.updateTitleList, tooltip=("Please select a value for the chosen criterion."), ) gui.separator(widget=filterBox, height=3) # The following lines add filterBox (and a vertical separator) to the # advanced interface... 
self.advancedSettings.advancedWidgets.append(filterBox) self.advancedSettings.advancedWidgetsAppendSeparator() # Title box titleBox = gui.widgetBox( widget=self.controlArea, box="Titles", orientation="vertical", ) self.titleListbox = gui.listBox( widget=titleBox, master=self, value="selectedTitles", # setting (list) labels="titleLabels", # setting (list) callback=self.sendButton.settingsChanged, tooltip="The list of titles whose content will be imported", ) self.titleListbox.setMinimumHeight(150) self.titleListbox.setSelectionMode(3) gui.separator(widget=titleBox, height=3) gui.button( widget=titleBox, master=self, label="Refresh", callback=self.refreshTitleSeg, tooltip="Connect to ECP website and refresh list.", ) gui.separator(widget=titleBox, height=3) gui.separator(widget=self.controlArea, height=3) gui.rubber(self.controlArea) # Now Info box and Send button must be drawn... self.sendButton.draw() self.infoBox.draw() # This initialization step needs to be done after infoBox has been # drawn (because getTitleSeg may need to display an error message). self.getTitleSeg() # Send data if autoSend. self.sendButton.sendIf() self.setMinimumWidth(350) self.adjustSizeWithTimer() def sendData(self): """Compute result of widget processing and send to output""" # Skip if title list is empty: if self.titleLabels == list(): return # Check that something has been selected... if len(self.selectedTitles) == 0: self.infoBox.setText("Please select one or more titles.", "warning") self.send("XML-TEI data", None, self) return # Clear created Inputs. self.clearCreatedInputs() # Initialize progress bar. progressBar = gui.ProgressBar(self, iterations=len(self.selectedTitles)) # Attempt to connect to ECP and retrieve plays... 
xml_contents = list() annotations = list() try: for title in self.selectedTitles: doc_url = self.document_base_url + \ self.filteredTitleSeg[title].annotations["url"] print(doc_url) url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url) print(url) response = urllib.request.urlopen(url) xml_contents.append(response.read().decode('utf-8')) source_annotations = \ self.filteredTitleSeg[title].annotations.copy() #source_annotations["url"] = source_annotations["href"] #del source_annotations["href"] annotations.append(source_annotations) progressBar.advance() # 1 tick on the progress bar... # If an error occurs (e.g. http error, or memory error)... except: #Set Info box and widget to "error" state. self.infoBox.setText("Couldn't download data from ECP website.", "error") # Reset output channel. self.send("XML-TEI data", None, self) return # Store downloaded XML in input objects... for xml_content_idx in range(len(xml_contents)): newInput = Input(xml_contents[xml_content_idx], self.captionTitle) self.createdInputs.append(newInput) # If there"s only one play, the widget"s output is the created Input. if len(self.createdInputs) == 1: self.segmentation = self.createdInputs[0] # Otherwise the widget"s output is a concatenation... else: self.segmentation = Segmenter.concatenate( self.createdInputs, self.captionTitle, import_labels_as=None, ) # Annotate segments... for idx, segment in enumerate(self.segmentation): segment.annotations.update(annotations[idx]) self.segmentation[idx] = segment # Store imported URLs as setting. self.importedURLs = [ self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"] ] # Set status to OK and report data size... message = "%i segment@p sent to output " % len(self.segmentation) message = pluralize(message, len(self.segmentation)) numChars = 0 for segment in self.segmentation: segmentLength = len(Segmentation.get_data(segment.str_index)) numChars += segmentLength message += "(%i character@p)." 
% numChars message = pluralize(message, numChars) self.infoBox.setText(message) progressBar.finish() # Clear progress bar. progressBar.finish() # Send token... self.send("XML-TEI data", self.segmentation, self) self.sendButton.resetSettingsChangedFlag() def getTitleSeg(self): """Get title segmentation, either saved locally or online""" # Try to open saved file in this module"s directory... path = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) try: file = open(os.path.join(path, "cached_title_list_ecp"), "rb") self.titleSeg = pickle.load(file) file.close() # Else try to load list from ECP and build new seg... except IOError: self.titleSeg = self.getTitleListFromECP() # Build author and genre lists... if self.titleSeg is not None: self.filterValues["author"] = Processor.count_in_context( units={ "segmentation": self.titleSeg, "annotation_key": "author" }).col_ids self.filterValues["author"].sort() self.filterValues["genre"] = Processor.count_in_context( units={ "segmentation": self.titleSeg, "annotation_key": "genre" }).col_ids self.filterValues["genre"].sort() # Sort the segmentation alphabetically based on titles (nasty hack!)... self.titleSeg.buffer.sort(key=lambda s: s.annotations["title"]) # Update title and filter value lists (only at init and on manual # refresh, therefore separate from self.updateGUI). self.updateFilterValueList() def refreshTitleSeg(self): """Refresh title segmentation from website""" self.titleSeg = self.getTitleListFromECP() # Update title and filter value lists (only at init and on manual # refresh, therefore separate from self.updateGUI). self.updateFilterValueList() def getTitleListFromECP(self): """Fetch titles from the ECP website""" self.infoBox.customMessage( "Fetching data from ECP website, please wait") # Attempt to connect to ECP... 
try: response = urllib.request.urlopen(self.base_url) base_html = response.read().decode('utf-8') self.infoBox.customMessage("Done fetching data from ECP website.") # If unable to connect (somehow)... except: # Set Info box and widget to "warning" state. self.infoBox.noDataSent(warning="Couldn't access ECP website.") # Empty title list box. self.titleLabels = list() # Reset output channel. self.send("XML-TEI data", None, self) return None # Otherwise store HTML content in LTTL Input object. base_html_seg = Input(base_html) # Remove accents from the data... recoded_seg, _ = Segmenter.recode(base_html_seg, remove_accents=True) # Extract table containing titles... genresListSeg = Segmenter.import_xml( segmentation=recoded_seg, element="ul", conditions={"id": re.compile(r"^genres-list")}, ) # Extract genre annotation... genreSeg = Segmenter.tokenize( segmentation=genresListSeg, regexes=[(re.compile(r'<a id[^>]+>(.+?)</a.+?(?=<a id|$)(?s)'), \ "tokenize", {"genre": "&1"})], import_annotations=False, ) # Extract works... titleSeg = Segmenter.tokenize( segmentation=genreSeg, regexes=[(re.compile(r'<li class="bibl".+?</span>(?s)'), \ "tokenize")], ) # Extract annotations... titleSeg = Segmenter.tokenize( segmentation=titleSeg, regexes=[ (re.compile(r"^.*>\n(.+?)</span>.*$(?s)"), "tokenize", { "author": "&1" }), (re.compile(r'^.*href="(/works/.+?\.shtml)">.*$(?s)'), "tokenize", { "url": "&1" }), (re.compile(r'^.*shtml">(.*)</a>.*$(?s)'), "tokenize", { "title": "&1" }), ], merge_duplicates=True, ) # Try to save list in this module"s directory for future reference... path = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) try: file = open(os.path.join(path, "cached_title_list_ecp"), "wb") pickle.dump(titleSeg, file, -1) file.close() except IOError: pass # Remove warning (if any)... 
self.error(0) self.warning(0) return titleSeg def updateFilterValueList(self): """Update the list of filter values""" # In Advanced settings mode, populate filter value list... if self.titleSeg is not None and self.displayAdvancedSettings: self.filterValueCombo.clear() self.filterValueCombo.addItem("(all)") for filterValue in self.filterValues[self.filterCriterion]: self.filterValueCombo.addItem(filterValue) # Reset filterValue if needed... if self.filterValue not in [ self.filterValueCombo.itemText(i) for i in range(self.filterValueCombo.count()) ]: self.filterValue = "(all)" else: self.filterValue = self.filterValue self.updateTitleList() def updateTitleList(self): """Update the list of titles""" # If titleSeg has not been loaded for some reason, skip. if self.titleSeg is None: return # In Advanced settings mode, get list of selected titles... if self.displayAdvancedSettings and self.filterValue != "(all)": self.filteredTitleSeg, _ = Segmenter.select( segmentation=self.titleSeg, regex=re.compile(r"^%s$" % self.filterValue), annotation_key=self.filterCriterion, ) else: self.filteredTitleSeg = self.titleSeg # If criterion is not "genre" and his filter value not "all", # group titles with different genres... # Create a dictionary with "author" and "title" as key... unique_titles = dict() for title in self.filteredTitleSeg: title_id = ( title.annotations["author"], title.annotations["title"], ) try: unique_titles[title_id].append(title) except KeyError: unique_titles[title_id] = [title] # Create a list with new annotation comporting all genres... 
new_title_segments = list() for unique_title in unique_titles.values(): title_genres = list() new_title_segments.append(unique_title[0]) title_genres.append(unique_title[0].annotations["genre"]) for equivalent_title in unique_title[1:]: title_genres.append(equivalent_title.annotations["genre"]) new_title_segments[-1].annotations["genre"] = ", ".join( sorted(list(set(title_genres)))) self.filteredTitleSeg = Segmentation(None) self.filteredTitleSeg.extend(new_title_segments) # Populate titleLabels list with the titles... self.titleLabels = sorted( [s.annotations["title"] for s in self.filteredTitleSeg]) # Add specification (author, year and genre, depending on criterion)... titleLabels = self.titleLabels[:] for idx, titleLabel in enumerate(titleLabels): specs = list() if (self.displayAdvancedSettings == False or self.filterCriterion != "author" or self.filterValue == "(all)"): specs.append(self.filteredTitleSeg[idx].annotations["author"]) if (self.displayAdvancedSettings == False or self.filterCriterion != "genre" or self.filterValue == "(all)"): specs.append(self.filteredTitleSeg[idx].annotations["genre"]) titleLabels[idx] = titleLabel + " (%s)" % "; ".join(specs) self.titleLabels = titleLabels # Reset selectedTitles if needed... 
if not set(self.importedURLs).issubset( set(u.annotations["url"] for u in self.filteredTitleSeg)): self.selectedTitles = list() else: self.selectedTitles = self.selectedTitles self.sendButton.settingsChanged() def updateGUI(self): """Update GUI state""" if self.displayAdvancedSettings: self.advancedSettings.setVisible(True) else: self.advancedSettings.setVisible(False) if len(self.titleLabels) > 0: self.selectedTitles = self.selectedTitles def clearCreatedInputs(self): """Delete all Input objects that have been created.""" for i in self.createdInputs: Segmentation.set_data(i[0].str_index, None) del self.createdInputs[:] def onDeleteWidget(self): """Free memory when widget is deleted (overriden method)""" self.clearCreatedInputs() # The following method need to be copied (without any change) in # every Textable widget... def setCaption(self, title): if 'captionTitle' in dir(self): changed = title != self.captionTitle super().setCaption(title) if changed: self.sendButton.settingsChanged() else: super().setCaption(title)
def clearCreatedInputs(self):
    """Reset the stored data of every created Input, then empty the list."""
    for created_input in self.createdInputs:
        # Each created Input is addressed through the string index of its
        # first segment.
        string_index = created_input[0].str_index
        Segmentation.set_data(string_index, None)
    # Empty the list in place so that it remains the same list object.
    del self.createdInputs[:]
def updateTitleList(self):
    """Rebuild the filtered title segmentation and the displayed labels.

    Reads ``self.titleSeg`` and the current filter settings, then sets:

    * ``self.filteredTitleSeg``: one segment per distinct (author, title)
      pair, ordered by title, whose "genre" annotation lists the genres
      of all segments sharing that pair;
    * ``self.titleLabels``: one label per segment of
      ``self.filteredTitleSeg``, in the same order.

    ``self.selectedTitles`` is emptied when ``self.importedURLs`` are not
    all present among the filtered titles, and the send button is told
    that settings changed. Does nothing when ``self.titleSeg`` is None.
    """
    # If titleSeg has not been loaded for some reason, skip.
    if self.titleSeg is None:
        return

    # In Advanced settings mode, keep only the titles matching the filter...
    if self.displayAdvancedSettings and self.filterValue != "(all)":
        # The filter value is matched as a literal annotation value, so it
        # is escaped: regex metacharacters in it must not be interpreted.
        self.filteredTitleSeg, _ = Segmenter.select(
            segmentation=self.titleSeg,
            regex=re.compile(r"^%s$" % re.escape(self.filterValue)),
            annotation_key=self.filterCriterion,
        )
    else:
        self.filteredTitleSeg = self.titleSeg

    # Group the segments that share the same author and title...
    unique_titles = dict()
    for title in self.filteredTitleSeg:
        title_id = (
            title.annotations["author"],
            title.annotations["title"],
        )
        unique_titles.setdefault(title_id, []).append(title)

    # Keep the first segment of each group, annotated with all genres...
    new_title_segments = list()
    for equivalent_titles in unique_titles.values():
        representative = equivalent_titles[0]
        title_genres = set(
            title.annotations["genre"] for title in equivalent_titles
        )
        # NOTE(review): if iterating a segmentation yields its stored
        # segment objects, this also rewrites the "genre" annotation held
        # by self.titleSeg -- confirm whether that is intended.
        representative.annotations["genre"] = ", ".join(sorted(title_genres))
        new_title_segments.append(representative)

    # Order the segments by title before storing them, so that the labels
    # built below (one per segment) are sorted by title while remaining
    # index-aligned with self.filteredTitleSeg. The sort is stable, hence
    # an already sorted list is left untouched.
    new_title_segments.sort(key=lambda s: s.annotations["title"])
    self.filteredTitleSeg = Segmentation(None)
    self.filteredTitleSeg.extend(new_title_segments)

    # Author and genre are appended to a label unless they are the active
    # filter criterion (in which case they are the same for every title).
    no_active_filter = (
        not self.displayAdvancedSettings or self.filterValue == "(all)"
    )
    show_author = no_active_filter or self.filterCriterion != "author"
    show_genre = no_active_filter or self.filterCriterion != "genre"

    # Populate titleLabels with the titles and their specifications...
    titleLabels = list()
    for segment in self.filteredTitleSeg:
        specs = list()
        if show_author:
            specs.append(segment.annotations["author"])
        if show_genre:
            specs.append(segment.annotations["genre"])
        titleLabels.append(
            segment.annotations["title"] + " (%s)" % "; ".join(specs)
        )
    self.titleLabels = titleLabels

    # Reset selectedTitles if needed...
    available_urls = set(
        u.annotations["url"] for u in self.filteredTitleSeg
    )
    if not set(self.importedURLs).issubset(available_urls):
        self.selectedTitles = list()
    else:
        # NOTE(review): the self-assignment is kept from the original;
        # presumably it re-triggers the widget's attribute binding so the
        # GUI selection is refreshed -- confirm.
        self.selectedTitles = self.selectedTitles

    self.sendButton.settingsChanged()
def sendData(self):
    """Tag the input with TreeTagger and send the result to the output.

    Sends ``None`` on the 'Text data' channel when ``self.NoLink`` is set
    or when there is no input data. Otherwise the input segments are
    wrapped in ``<xb_tt>`` elements, passed to ``self.tag``, re-imported
    and recoded into ``<w lemma=... type=...>`` elements. When
    ``self.activer_xml`` is set the recoded segmentation is sent as is;
    otherwise it is further segmented into its ``w`` elements. The
    TreeTagger link is also written to ``treetagger_link.txt``.
    """
    # TreeTagger's link was not found.
    if self.NoLink:
        self.infoBox.setText(
            u"Sorry, TreeTagger's link not found.",
            "error"
        )
        self.send('Text data', None)

    # Important: if input data is None, propagate this value to output...
    elif not self.inputData:
        self.infoBox.setText(
            u"Widget needs input",
            "warning"
        )
        self.send('Text data', None)

    else:
        # Show that something is happening...
        self.infoBox.setText(
            u'TreeTagger is running...',
            "warning"
        )

        # Initialize progress bar.
        self.progressBar = OWGUI.ProgressBar(
            self,
            iterations=5
        )

        # Copy the segmentation, adding to each segment an annotation
        # ("tt_xb") that holds its annotations as an attribute string...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.inputData.label
        for segment in self.inputData:
            attr = " ".join(
                ["%s='%s'" % item for item in segment.annotations.items()]
            )
            # NOTE(review): if iterating the input yields its stored
            # segment objects, "tt_xb" is also written to the input data
            # and will be part of attr on the next run -- confirm.
            segment.annotations["tt_xb"] = attr
            copy_of_input_seg.append(segment)

        # Advance the progress bar by one step.
        self.progressBar.advance()

        concatenated_text = copy_of_input_seg.to_string(
            formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
            display_all=True,
        )

        # Advance the progress bar by one step.
        self.progressBar.advance()

        tagged_text = self.tag(concatenated_text)
        tagged_input = Input(tagged_text)
        tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

        # Advance the progress bar by one step.
        self.progressBar.advance()

        # The recoding is the same whether or not the xml checkbox is
        # active, so it is done once.
        xml_segmentation = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), '[unknown]'),
                (re.compile(
                    r"(.+)\t(.+)\t(.+)"),
                    '<w lemma="&3" type="&2">&1</w>'
                ),
                (re.compile(r'"""'), '"""'),
            ],
        )

        # The explicit comparison is kept from the original, since the
        # type of the stored checkbox value is not visible here.
        if self.activer_xml == True:
            # xml checkbox active: send the recoded segmentation as is.
            final_segmentation = xml_segmentation
        else:
            # xml checkbox inactive: segment into "w" elements.
            final_segmentation = Segmenter.import_xml(
                xml_segmentation,
                "w"
            )

        self.infoBox.dataSent('')

        # Save TreeTagger's link...
        if self.system == "nt":
            link_path = "treetagger_link.txt"
        else:
            link_path = os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt"
            )
        # The context manager closes the file even if writing fails.
        with open(link_path, 'w') as link_file:
            link_file.write(self.treetagger_link)

        # Clear progress bar.
        self.progressBar.finish()

        # Send the segmentation.
        self.send('Text data', final_segmentation, self)
        self.compteur += 1
        self.sendButton.resetSettingsChangedFlag()
class TestSegmentation(unittest.TestCase):
    """Unit tests for the to_string(), get_annotation_keys() and
    is_non_overlapping() methods of Segmentation."""

    def setUp(self):
        """Create the segmentations and expected output shared by tests."""
        self.count = 0

        self.entire_text_seg = Input('ab cde')
        self.str_index = self.entire_text_seg[0].str_index

        # Two words of the input text; only the first one is annotated.
        annotated_word = Segment(
            str_index=self.str_index,
            start=0,
            end=2,
            annotations={'a': '1', 'bc': '20'},
        )
        plain_word = Segment(str_index=self.str_index, start=3, end=6)
        self.word_seg = Segmentation([annotated_word, plain_word])

        # Two segments sharing the character at position 4.
        self.overlapping_seg = Segmentation([
            Segment(str_index=self.str_index, start=3, end=5),
            Segment(str_index=self.str_index, start=4, end=6),
        ])

        # Expected default rendering of word_seg.
        template = (
            'segment number 1\n'
            '\tcontent:\t"ab"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t0\n'
            '\tend:\t2\n'
            '\tannotations:\n'
            '\t\ta 1\n'
            '\t\tbc 20\n'
            'segment number 2\n'
            '\tcontent:\t"cde"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t3\n'
            '\tend:\t6'
        )
        self.base_output_string = template % (self.str_index, self.str_index)

    def tearDown(self):
        """Nothing to clean up after a test."""
        pass

    def test_creator(self):
        """The constructor yields a Segmentation instance."""
        created = Segmentation()
        self.assertIsInstance(
            created,
            Segmentation,
            msg="creator doesn't return Segmentation object!",
        )

    def test_to_string_default_format(self):
        """to_string() without arguments yields the default rendering."""
        rendered = self.word_seg.to_string()
        self.assertEqual(
            rendered,
            self.base_output_string,
            msg="to_string() doesn't format segmentation correctly by default!",
        )

    def test_to_string_delimiter(self):
        """to_string() inserts the requested segment delimiter."""
        rendered = self.word_seg.to_string(segment_delimiter='DELIMITER')
        self.assertIn(
            'DELIMITER',
            rendered,
            msg="to_string() doesn't format segment delimiter correctly!",
        )

    def test_to_string_header(self):
        """to_string() puts the header before the default rendering."""
        rendered = self.word_seg.to_string(header='HEADER')
        expected = 'HEADER' + self.base_output_string
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format header correctly!",
        )

    def test_to_string_footer(self):
        """to_string() puts the footer after the default rendering."""
        rendered = self.word_seg.to_string(footer='FOOTER')
        expected = self.base_output_string + 'FOOTER'
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format footer correctly!",
        )

    def test_to_string_humanize_addresses(self):
        """to_string() shifts str_index and start by one when humanizing."""
        rendered = self.word_seg.to_string(humanize_addresses=True)

        # Apply the replacements to the default rendering, in this order.
        replacements = (
            ('t:\t3', 't:\t4'),
            ('t:\t0', 't:\t1'),
            ('x:\t%i' % self.str_index, 'x:\t%i' % (self.str_index + 1)),
        )
        expected = self.base_output_string
        for raw, humanized in replacements:
            expected = expected.replace(raw, humanized)

        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't humanize addresses!",
        )

    def test_to_string_interpolate_builtin_variables(self):
        """to_string() fills in the builtin formatting variables."""
        pattern = (
            '%(__num__)s,%(__content__)s,'
            '%(__str_index__)s,%(__start__)s,%(__end__)s,'
            '%(__str_index_raw__)s,%(__start_raw__)s,%(__end_raw__)s'
        )
        rendered = self.word_seg.to_string(formatting=pattern)
        expected = (
            '1,ab,%i,0,2,%i,0,2\n2,cde,%i,3,6,%i,3,6'
            % ((self.str_index,) * 4)
        )
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't interpolate builtin variables!",
        )

    def test_to_string_interpolate_annotations(self):
        """to_string() fills in annotation values in the format string."""
        rendered = self.word_seg.to_string(formatting='%(a)s')
        self.assertEqual(
            rendered,
            '1\n__none__',
            msg="to_string() doesn't interpolate annotations!",
        )

    def test_to_string_progress(self):
        """to_string() calls the progress callback once per segment."""

        def count_call():
            """Record one call of the progress callback."""
            self.count += 1

        self.word_seg.to_string(progress_callback=count_call)
        expected_calls = len(self.word_seg)
        self.assertEqual(
            self.count,
            expected_calls,
            msg="to_string doesn't track progress!",
        )

    def test_get_annotation_keys(self):
        """get_annotation_keys() lists the keys present in the segments."""
        found_keys = self.word_seg.get_annotation_keys()
        self.assertEqual(
            sorted(found_keys),
            sorted(['a', 'bc']),
            msg="get_annotation_keys() doesn't return existing annotations!",
        )

    def test_is_non_overlapping(self):
        """is_non_overlapping() is true for disjoint segments."""
        verdict = self.word_seg.is_non_overlapping()
        self.assertTrue(
            verdict,
            msg="is_non_overlapping() doesn't recognize absence of overlap!",
        )

    def test_is_overlapping(self):
        """is_non_overlapping() is false for overlapping segments."""
        verdict = self.overlapping_seg.is_non_overlapping()
        self.assertFalse(
            verdict,
            msg="is_non_overlapping() doesn't recognize presence of overlap!",
        )