def setUp(self):
    """Build the fixtures shared by the tests.

    Creates two Input objects ('ab cde' and 'd') and three segmentations
    over the string of the first one: its first word (annotated with
    {'a': 1}), its last word, and one segment per character.
    """
    self.entire_text_seg = Input('ab cde')
    self.other_entire_text_seg = Input('d')
    text_index = self.entire_text_seg[0].str_index

    first_word = Segment(
        str_index=text_index,
        start=0,
        end=2,
        annotations={'a': 1},
    )
    self.first_word_seg = Segmentation([first_word])

    last_word = Segment(str_index=text_index, start=3, end=6)
    self.last_word_seg = Segmentation([last_word])

    # One single-character segment for each of the six positions of 'ab cde'.
    self.char_seg = Segmentation([
        Segment(str_index=text_index, start=position, end=position + 1)
        for position in range(6)
    ])
def test_update_string(self):
    """Check that update() changes the string read by get_data(-1)."""
    text_input = Input('test2')
    text_input.update('modified')
    stored_string = Segmentation.get_data(-1)[:]
    self.assertEqual(
        stored_string,
        'modified',
        msg="update doesn't modify stored string!",
    )
def test_clear_string(self):
    """Check that clear() leaves None in the slot read by get_data(-1)."""
    text_input = Input('test3')
    text_input.clear()
    # assertIsNone tests identity, so a value that merely compares equal
    # to None cannot make the test pass by accident.
    self.assertIsNone(
        Segmentation.get_data(-1),
        msg="clear doesn't set stored string to None!",
    )
def sendData(self):
    """Transcribe the selected audio file and send the result to output.

    None is sent on the 'Text data' channel, with a warning in the info
    box, when no file is selected, when the recognizer raises
    UnknownValueError, or when no transcription is returned. Otherwise one
    Input is created per chunk (if self.selected_seg is set) or a single
    Input for the whole transcription; their concatenation is sent.
    """
    if not self.file:
        self.infoBox.setText(u"Please select input file.", "warning")
        self.send('Text data', None, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    # Get transcription
    try:
        transcription = self.get_large_audio_transcription(
            self.file,
            language=self.language,
            set_silence_len=self.selected_dur,
            set_silence_threshold=self.selected_vol)
    except speech_recognition.UnknownValueError:
        self.infoBox.setText(
            u"You seem to have overuseed the built-in API key, refer to the documentation for further informations.",
            "warning")
        self.send('Text data', None, self)
        return

    # Checks if there is a transcription
    if transcription is None:
        self.infoBox.setText(u"You must use mp3 or wav audio files.",
                             "warning")
        self.send('Text data', None, self)
        return

    # Use the file name (last path component) as label. The previous
    # pattern "[^(/\\)]+[mp3|wav]$" used character classes where groups
    # were intended, and the list returned by re.findall was passed as the
    # label instead of a string.
    file_label = re.split(r"[/\\]", self.file)[-1]

    if self.selected_seg:
        for chunk in transcription:
            self.createdInputs.append(Input(chunk, label=file_label))
    else:
        self.createdInputs.append(Input(transcription, label=file_label))

    # Concatenates the segmentations in the output segmentation
    self.segmentation = Segmenter.concatenate(
        segmentations=self.createdInputs,
        label=self.captionTitle,
        copy_annotations=False,
        import_labels_as="")

    # Sending segments length
    message = " Succesfully transcripted ! % i segment@p sent to output" % len(
        self.segmentation)
    message = pluralize(message, len(self.segmentation))

    # Send token...
    self.send("Text data", self.segmentation, self)
    self.infoBox.setText(message)
    self.sendButton.resetSettingsChangedFlag()
def test_update_string(self):
    """Check that update() changes the string read by get_data(-1)."""
    text_input = Input(u'test2')
    text_input.update(u'modified')
    stored_string = Segmentation.get_data(-1)[:]
    self.assertEqual(
        stored_string,
        u'modified',
        msg="update doesn't modify stored string!",
    )
def test_clear_string(self):
    """Check that clear() leaves None in the slot read by get_data(-1)."""
    text_input = Input('test3')
    text_input.clear()
    # assertIsNone tests identity, so a value that merely compares equal
    # to None cannot make the test pass by accident.
    self.assertIsNone(
        Segmentation.get_data(-1),
        msg="clear doesn't set stored string to None!",
    )
def setUp(self):
    """Build the fixtures shared by the tests.

    Creates an Input for 'ab cde', a two-word segmentation whose first
    segment is annotated, a segmentation of two overlapping segments, the
    expected textual rendering of the word segmentation, and a counter
    initialised to 0.
    """
    self.entire_text_seg = Input('ab cde')
    self.str_index = self.entire_text_seg[0].str_index
    idx = self.str_index

    annotated_word = Segment(
        str_index=idx,
        start=0,
        end=2,
        annotations={'a': '1', 'bc': '20'},
    )
    plain_word = Segment(str_index=idx, start=3, end=6)
    self.word_seg = Segmentation([annotated_word, plain_word])

    # Segments 3-5 and 4-6 share position 4.
    self.overlapping_seg = Segmentation([
        Segment(str_index=idx, start=first, end=last)
        for first, last in ((3, 5), (4, 6))
    ])

    # Expected rendering; the two %i placeholders receive the string index.
    expected_pieces = (
        'segment number 1\n',
        '\tcontent:\t"ab"\n',
        '\tstr_index:\t%i\n',
        '\tstart:\t0\n',
        '\tend:\t2\n',
        '\tannotations:\n',
        '\t\ta 1\n',
        '\t\tbc 20\n',
        'segment number 2\n',
        '\tcontent:\t"cde"\n',
        '\tstr_index:\t%i\n',
        '\tstart:\t3\n',
        '\tend:\t6',
    )
    self.base_output_string = ''.join(expected_pieces) % (idx, idx)

    self.count = 0
def test_creator(self):
    """Check that calling Input() yields an Input instance."""
    created = Input()
    self.assertIsInstance(
        created,
        Input,
        msg="creator doesn't return Input object!",
    )
def send_data(self):
    """Build Input objects from the fetched queries and send them.

    Every text of every query in self.queryList becomes an Input. The
    output is that Input itself when there is exactly one, otherwise the
    concatenation of all of them. Segments are annotated positionally with
    the dicts found in self.annotList. When the resulting segmentation is
    empty, a warning is shown and None is sent instead.
    """
    self.controlArea.setDisabled(True)
    self.clearCreatedInputs()

    # One Input per text, over all queries.
    for query in self.queryList:
        for text in query:
            self.createdInputs.append(Input(text))

    # A single input is sent as is; otherwise everything is concatenated.
    if len(self.createdInputs) == 1:
        segmentation = self.createdInputs[0]
    else:
        segmentation = Segmenter.concatenate(
            self.createdInputs,
            import_labels_as=None,
        )

    # Flatten the annotation dicts so that positions line up with segments.
    flat_annotations = [dic for elem in self.annotList for dic in elem]
    for position, segment in enumerate(segmentation):
        segment.annotations.update(flat_annotations[position])
        segmentation[position] = segment

    # Total length of the strings referenced by the segments.
    num_chars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in segmentation
    )

    # Nothing to send: warn the user and reset the output channel.
    if len(segmentation) == 0:
        self.infoBox.setText(
            "There are {} segments to send to output. Please fill the query basket and click 'send' again"
            .format(len(segmentation)), "warning")
        self.sendButton.resetSettingsChangedFlag()
        self.controlArea.setDisabled(False)
        self.send("Segmentation", None)
        return

    # Report the number of segments and characters, then send.
    self.infoBox.setText(
        "{} segments sent to output ({} characters)".format(
            len(segmentation),
            num_chars,
        ))
    self.send("Segmentation", segmentation)
    self.controlArea.setDisabled(False)
    self.sendButton.resetSettingsChangedFlag()
def test_creator_store_string(self):
    """Check that creating an Input makes its string readable via get_data(-1)."""
    Input(u'test')
    stored_string = Segmentation.get_data(-1)[:]
    self.assertEqual(
        stored_string,
        u'test',
        msg="creator doesn't store string in class variable!",
    )
def test_slice_string(self):
    """Check that slicing the stored data matches slicing a plain string."""
    Input('Hello world!')
    expected_slice = u"Hello world!"[3:7]
    stored_slice = Segmentation.get_data(-1)[3:7]
    self.assertEqual(
        stored_slice,
        expected_slice,
        msg="slicing doesn't return the same as in strings",
    )
def __init__(self):
    """Initialize a Text Field widget.

    Creates the info box and send button, an empty Input that will hold
    the outgoing text, and a plain-text editor pre-filled with the decoded
    textFieldContent setting; finally triggers sendButton.sendIf().
    """
    super().__init__()

    # Other attributes...
    self.infoBox = InfoBox(widget=self.controlArea)
    self.sendButton = SendButton(
        widget=self.controlArea,
        master=self,
        callback=self.sendData,
        infoBoxAttribute='infoBox',
    )

    # LTTL.Input object (token that will be sent).
    self.segmentation = Input(text=u'')

    # GUI: the text field sits between two 3-pixel separators.
    gui.separator(widget=self.controlArea, height=3)
    stored_text = self.textFieldContent.decode('utf-8')
    self.editor = QPlainTextEdit()
    self.editor.setPlainText(stored_text)
    self.controlArea.layout().addWidget(self.editor)
    # Any edit marks the settings as changed on the send button.
    self.editor.textChanged.connect(self.sendButton.settingsChanged)
    gui.separator(widget=self.controlArea, height=3)
    self.setMinimumWidth(250)

    # Send button, then info box.
    self.sendButton.draw()
    self.infoBox.draw()

    self.sendButton.sendIf()
def get_tweets(self, search, nb, include_RT, useKey, keys):
    """Search Twitter and return the results as annotated Input objects.

    When include_RT is false, three times nb results are requested, tweets
    whose text starts with 'RT' are skipped, and collection stops once nb
    tweets have been kept. Otherwise nb results are requested and all are
    kept. keys is only passed as license when useKey is true.

    Returns a list of Input objects whose single segment carries the
    annotations source, author, date, url and search.
    """
    if not useKey:
        keys = None
    twitter = Twitter(language=self.dico_lang[self.language], license=keys)

    skip_retweets = not include_RT
    # Over-fetch when filtering, since part of the results gets discarded.
    requested = nb * 3 if skip_retweets else nb

    tweets = list()
    for tweet in twitter.search(search, start=1, count=requested):
        if skip_retweets and tweet.text.startswith('RT'):
            continue

        tweet_input = Input(tweet.text)
        annotations = {
            'source': 'Twitter',
            'author': tweet.author,
            'date': tweet.date,
            'url': tweet.url,
            'search': search,
        }
        # Update the segment, then store it back into the Input.
        segment = tweet_input[0]
        segment.annotations.update(annotations)
        tweet_input[0] = segment
        tweets.append(tweet_input)

        if skip_retweets and len(tweets) == nb:
            break
    return tweets
def get_wiki_article(self,
                     search,
                     separate_in_section=False,
                     type_of_text=u'Plain text'):
    """Search Wikipedia and return the article as annotated Input objects.

    With separate_in_section, one Input is produced per section and
    annotated with the section title and level; otherwise a single Input
    holds the whole article. The text is taken from the .string attribute
    when type_of_text is u'Plain text', from .html otherwise.

    Returns a list of Input objects; the list is empty when the search
    yields no article.
    """

    def annotated(wiki_input, annotations):
        # Update the only segment, then store it back into the Input.
        segment = wiki_input[0]
        segment.annotations.update(annotations)
        wiki_input[0] = segment
        return wiki_input

    segments = list()
    article = Wikipedia(language=self.dico_lang[self.language]).search(
        search, cached=False)
    if not article:
        return segments

    want_plain = type_of_text == u'Plain text'

    if not separate_in_section:
        wiki_article = Input(article.string if want_plain else article.html)
        segments.append(annotated(wiki_article, {
            'source': 'Wikipedia',
            'search': search,
        }))
        return segments

    for section in article.sections:
        wiki_article = Input(section.string if want_plain else section.html)
        segments.append(annotated(wiki_article, {
            'source': 'Wikipedia',
            'section title': section.title,
            'section level': section.level,
            'search': search,
        }))
    return segments
def main():
    """Print whether contained-segment lookup works for four pairings.

    An Input ("un texte") and a one-segment tokenization of it are combined
    in all four container/searched combinations. For each, the first index
    returned by get_contained_segment_indices is looked up in the searched
    segmentation, and "ok" is printed if its content is 'un texte', "fail"
    otherwise (including when the lookup raises).
    """
    input_seg = Input("un texte")
    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )

    # (label, container segmentation, searched segmentation). The trailing
    # comments keep the outcome noted for each case in the original script.
    cases = [
        ("verbatim in input", input_seg, verbatim_seg),      # ok
        ("verbatim in verbatim", verbatim_seg, verbatim_seg),  # ok
        ("input in verbatim", verbatim_seg, input_seg),      # fail
        ("input in input", input_seg, input_seg),            # fail
    ]

    for label, container, searched in cases:
        print(label + ":", end=' ')
        contained_segment_idxs = container[0].get_contained_segment_indices(
            searched)
        try:
            content = searched[contained_segment_idxs[0]].get_content()
            print("ok" if content == 'un texte' else "fail")
        # Not a bare except: Ctrl-C and SystemExit must still propagate.
        except Exception:
            print("fail")
def get_bing_entries(self, search, nb):
    """Search Bing and return the results as annotated Input objects.

    Requests nb uncached results for search and returns a list with one
    Input per result, whose single segment is annotated with source,
    title, url and search.
    """
    engine = Bing(language=self.dico_lang[self.language])
    results = engine.search(search, start=1, count=nb, cached=False)

    entries = list()
    for result in results:
        entry_input = Input(result.text)
        # Update the segment, then store it back into the Input.
        segment = entry_input[0]
        segment.annotations.update({
            'source': 'Bing',
            'title': result.title,
            'url': result.url,
            'search': search,
        })
        entry_input[0] = segment
        entries.append(entry_input)
    return entries
def sendData(self):
    """Compute result of widget processing and send to output.

    Each input segment is passed to self.nlp and turned into an Input
    holding one "text<TAB>pos_" line per token; the concatenation of these
    Inputs is sent on the "Linguistically analyzed data" channel. Without
    input, a warning is shown and None is sent.
    """
    # Check that there's an input...
    if self.inputSeg is None:
        self.infoBox.setText("Widget needs input", "warning")
        self.send("Linguistically analyzed data", None, self)
        return

    # Initialize progress bar.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.inputSeg))

    # Basic NLP analysis for dev purposes...
    analyzedSegments = list()
    for segment in self.inputSeg:
        doc = self.nlp(segment.get_content())
        token_lines = "".join(
            "%s\t%s\n" % (token.text, token.pos_) for token in doc
        )
        analyzedSegments.append(Input(token_lines))
        progressBar.advance()

    outputSeg = LTTL.Segmenter.concatenate(
        analyzedSegments,
        import_labels_as=None,
        label=self.captionTitle,
    )

    # Set status to OK and report data size...
    segment_count = len(outputSeg)
    message = pluralize(
        "%i segment@p sent to output." % segment_count,
        segment_count,
    )
    self.infoBox.setText(message)

    # Clear progress bar.
    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Send data to output...
    self.send("Linguistically analyzed data", outputSeg, self)
    self.sendButton.resetSettingsChangedFlag()
def main():
    """Print whether get_contained_segments works for four pairings.

    An Input ("un texte") and a one-segment tokenization of it are combined
    in all four container/searched combinations. For each, "ok" is printed
    if the first contained segment has the content 'un texte', "fail"
    otherwise (including when the lookup raises).

    The output is written without the Python 2 print statement, so the
    function parses and runs under both Python 2 and Python 3.
    """
    import sys

    input_seg = Input("un texte")
    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )

    # (label, container segmentation, searched segmentation). The trailing
    # comments keep the outcome noted for each case in the original script.
    cases = [
        ("verbatim in input", input_seg, verbatim_seg),      # ok
        ("verbatim in verbatim", verbatim_seg, verbatim_seg),  # ok
        ("input in verbatim", verbatim_seg, input_seg),      # fail
        ("input in input", input_seg, input_seg),            # fail
    ]

    for label, container, searched in cases:
        # The label goes out first, as before, so it is visible even if the
        # lookup below raises.
        sys.stdout.write(label + ": ")
        contained_segments = container[0].get_contained_segments(searched)
        try:
            content = contained_segments[0].get_content()
            outcome = "ok" if content == 'un texte' else "fail"
        # Not a bare except: Ctrl-C and SystemExit must still propagate.
        except Exception:
            outcome = "fail"
        print(outcome)
class OWTextableTextField(OWTextableBaseWidget):
    """Orange widget for typing text data"""

    name = "Text Field"
    description = "Import text data from keyboard input"
    icon = "icons/TextField.png"
    priority = 1

    # Input and output channels...
    inputs = [('Text data', Segmentation, "inputTextData", widget.Single)]
    outputs = [('Text data', Segmentation)]

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    # Settings: the text is kept as UTF-8 encoded bytes.
    textFieldContent = settings.Setting(u''.encode('utf-8'))
    encoding = settings.Setting(u'utf-8')

    want_main_area = False

    def __init__(self):
        """Initialize a Text Field widget"""
        super().__init__()

        # Other attributes...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute='infoBox',
        )

        # LTTL.Input object (token that will be sent).
        self.segmentation = Input(text=u'')

        # GUI: the text field sits between two 3-pixel separators.
        gui.separator(widget=self.controlArea, height=3)
        stored_text = self.textFieldContent.decode('utf-8')
        self.editor = QPlainTextEdit()
        self.editor.setPlainText(stored_text)
        self.controlArea.layout().addWidget(self.editor)
        # Any edit marks the settings as changed on the send button.
        self.editor.textChanged.connect(self.sendButton.settingsChanged)
        gui.separator(widget=self.controlArea, height=3)
        self.setMinimumWidth(250)

        # Send button, then info box.
        self.sendButton.draw()
        self.infoBox.draw()

        self.sendButton.sendIf()

    def inputTextData(self, segmentation):
        """Handle text data on input connection

        The contents of all incoming segments are joined and placed in the
        editor; a falsy input is ignored.
        """
        if not segmentation:
            return
        joined_content = ''.join(s.get_content() for s in segmentation)
        self.editor.setPlainText(joined_content)
        self.sendButton.settingsChanged()

    def sendData(self):
        """Normalize content, then create and send segmentation"""
        raw_text = self.editor.toPlainText()
        # The setting stores the text exactly as typed, UTF-8 encoded.
        self.textFieldContent = raw_text.encode('utf-8')

        # Unify line endings to '\n', then apply NFC normalization.
        unix_text = raw_text.replace('\r\n', '\n').replace('\r', '\n')
        textFieldContent = normalize('NFC', unix_text)

        # Check that text field is not empty...
        if not self.textFieldContent:
            self.infoBox.setText(
                message=u'Please type or paste some text above.',
                state='warning',
            )
            self.send('Text data', None, self)
            return

        # TODO: remove message 'No label was provided.' from docs

        # Set status to OK...
        char_count = len(textFieldContent)
        message = pluralize(
            u'1 segment (%i character@p) sent to output.' % char_count,
            char_count,
        )
        self.infoBox.setText(message)

        # Update segmentation.
        self.segmentation.update(textFieldContent, label=self.captionTitle)

        # Send token...
        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def setCaption(self, title):
        """Set the caption; flag the settings as changed if the title differs."""
        if 'captionTitle' not in dir(self):
            super().setCaption(title)
            return
        changed = title != self.captionTitle
        super().setCaption(title)
        if changed:
            self.sendButton.settingsChanged()

    def onDeleteWidget(self):
        """Clear the widget's Input and call its __del__ when the widget is deleted."""
        self.segmentation.clear()
        self.segmentation.__del__()
self.contextAnnotationKey = self.contextAnnotationKey def handleNewSignals(self): """Overridden: called after multiple signals have been added""" self.openContext(self.uuid, self.segmentations) self.updateGUI() self.sendButton.sendIf() if __name__ == '__main__': import sys from PyQt4.QtGui import QApplication import LTTL.Segmenter as Segmenter from LTTL.Input import Input appl = QApplication(sys.argv) ow = OWTextableCount() seg1 = Input(u'hello world', label=u'text1') seg2 = Input(u'cruel world', label=u'text2') seg3 = Segmenter.concatenate([seg1, seg2], label=u'corpus') seg4 = Segmenter.tokenize(seg3, [(r'\w+(?u)', u'tokenize', { 'type': 'mot' })], label=u'words') ow.inputData(seg3, 1) ow.inputData(seg4, 2) ow.show() appl.exec_() ow.saveSettings()
def sendData(self):
    """Compute result of widget processing and send to output.

    For every movie in self.myBasket, the reviews and the movie record are
    fetched through self.ia. Each review becomes an Input annotated with
    the title and year of the movie it belongs to. The single Input, or
    the concatenation of all Inputs, is sent on the 'Segmentation' channel.

    Nothing is sent when the basket is empty or the download fails; None is
    sent when the selected movies have no reviews at all.
    """
    # Skip if title list is empty:
    if not self.myBasket:
        self.infoBox.setText(
            "Your corpus is empty, please add some movies first", "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    # Connect to imdb and add elements in lists
    list_review = list()
    list_annotation = list()
    try:
        for item in self.myBasket:
            list_review.append(self.ia.get_movie_reviews(item['id']))
            list_annotation.append(self.ia.get_movie(item['id']))
            # 1 tick on the progress bar of the widget
            progressBar.advance()
    # If an error occurs (e.g. http error, or memory error)...
    # Not a bare except: Ctrl-C and SystemExit must still propagate.
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from imdb", "error")
        progressBar.finish()
        self.controlArea.setDisabled(False)
        return

    # Store movie critics strings in input objects, recording one
    # annotation dict per review actually found. The previous version
    # appended a fixed 25 dicts per movie, which misaligned the annotations
    # as soon as a movie had a different number of reviews.
    annotations = list()
    for movie, movie_info in zip(list_review, list_annotation):
        # NOTE(review): "title" receives the whole movie record, as in the
        # previous version — confirm whether a title string is intended.
        movie_annotations = {"title": movie_info, "year": movie_info["year"]}
        reviews_data = (movie.get('data') or {}).get('reviews') or []
        for review in reviews_data:
            self.createdInputs.append(Input(review.get('content')))
            annotations.append(movie_annotations)

    # No review at all: there is nothing to concatenate or send.
    if not self.createdInputs:
        progressBar.finish()
        self.controlArea.setDisabled(False)
        self.infoBox.setText(
            "No reviews were found for the selected movies", "warning")
        self.send('Segmentation', None, self)
        return

    # If there's only one item, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    message = f"{len(self.segmentation)} segment@p sent to output"
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += " (%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send('Segmentation', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
message = "%i segment@p sent to output. (ignored %i segments with \ no content)" % (len(outputSeg), len(self.contentIsNone)) message = pluralize(message, len(outputSeg)) self.infoBox.setText(message) # Clear progress bar. progressBar.finish() self.controlArea.setDisabled(False) # Send data to output... self.send("CSV Segmentation", outputSeg, self) self.sendButton.resetSettingsChangedFlag() # The following method needs to be copied verbatim in # every Textable widget that sends a segmentation... def setCaption(self, title): if 'captionTitle' in dir(self): changed = title != self.captionTitle super().setCaption(title) if changed: self.sendButton.settingsChanged() else: super().setCaption(title) if __name__ == "__main__": from LTTL.Input import Input WidgetPreview(ExtractCSV).run(inputData=Input("a simple example"))
def getTitleListFromECP(self):
    """Fetch titles from the ECP website.

    Downloads self.base_url, strips accents, and extracts one segment per
    work, annotated with author, url and title (the genre annotation is
    set on the intermediate genre segmentation). The result is also pickled
    next to this module as "cached_title_list_ecp" on a best-effort basis.

    Returns the title segmentation, or None when the website cannot be
    read; in that case the title list is emptied and None is sent on the
    "XML-TEI data" channel.
    """
    self.infoBox.customMessage(
        "Fetching data from ECP website, please wait")

    # Attempt to connect to ECP...
    try:
        response = urllib.request.urlopen(self.base_url)
        base_html = response.read().decode('utf-8')
        self.infoBox.customMessage("Done fetching data from ECP website.")
    # If unable to connect (somehow)...
    # Not a bare except: Ctrl-C and SystemExit must still propagate.
    except Exception:
        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(warning="Couldn't access ECP website.")
        # Empty title list box.
        self.titleLabels = list()
        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None

    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)

    # Remove accents from the data...
    recoded_seg, _ = Segmenter.recode(base_html_seg, remove_accents=True)

    # Extract table containing titles...
    genresListSeg = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="ul",
        conditions={"id": re.compile(r"^genres-list")},
    )

    # The patterns below previously carried a trailing inline "(?s)".
    # Global inline flags are only accepted at the start of a pattern
    # (an error since Python 3.11), so re.DOTALL is passed instead; the
    # matching behaviour is the same.

    # Extract genre annotation...
    genreSeg = Segmenter.tokenize(
        segmentation=genresListSeg,
        regexes=[(
            re.compile(r'<a id[^>]+>(.+?)</a.+?(?=<a id|$)', re.DOTALL),
            "tokenize",
            {"genre": "&1"},
        )],
        import_annotations=False,
    )

    # Extract works...
    titleSeg = Segmenter.tokenize(
        segmentation=genreSeg,
        regexes=[(
            re.compile(r'<li class="bibl".+?</span>', re.DOTALL),
            "tokenize",
        )],
    )

    # Extract annotations...
    titleSeg = Segmenter.tokenize(
        segmentation=titleSeg,
        regexes=[
            (re.compile(r"^.*>\n(.+?)</span>.*$", re.DOTALL), "tokenize", {
                "author": "&1"
            }),
            (re.compile(r'^.*href="(/works/.+?\.shtml)">.*$', re.DOTALL),
             "tokenize", {
                 "url": "&1"
             }),
            (re.compile(r'^.*shtml">(.*)</a>.*$', re.DOTALL), "tokenize", {
                "title": "&1"
            }),
        ],
        merge_duplicates=True,
    )

    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    try:
        # The with-block closes the file even if pickling fails.
        with open(os.path.join(path, "cached_title_list_ecp"),
                  "wb") as cache_file:
            pickle.dump(titleSeg, cache_file, -1)
    except IOError:
        # The cache is optional: failing to write it must not prevent
        # the freshly downloaded list from being returned.
        pass

    # Remove warning (if any)...
    self.error(0)
    self.warning(0)

    return titleSeg
def sendData(self):
    """Compute result of widget processing and send to output.

    Downloads the XML file of every selected title from the ECP website,
    wraps each one in an Input annotated with a copy of the title's
    annotations, and sends the single Input or the concatenation of all
    Inputs on the "XML-TEI data" channel.

    Returns silently when the title list is empty. Sends None when nothing
    is selected or when a download fails.
    """
    # Skip if title list is empty:
    if not self.titleLabels:
        return

    # Check that something has been selected...
    if not self.selectedTitles:
        self.infoBox.setText("Please select one or more titles.",
                             "warning")
        self.send("XML-TEI data", None, self)
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    # Initialize progress bar.
    progressBar = gui.ProgressBar(self,
                                  iterations=len(self.selectedTitles))

    # Attempt to connect to ECP and retrieve plays...
    xml_contents = list()
    annotations = list()
    try:
        for title in self.selectedTitles:
            title_annotations = self.filteredTitleSeg[title].annotations
            doc_url = self.document_base_url + title_annotations["url"]
            # ".../name.shtml" becomes ".../name/name.xml".
            url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url)
            response = urllib.request.urlopen(url)
            xml_contents.append(response.read().decode('utf-8'))
            annotations.append(title_annotations.copy())
            progressBar.advance()  # 1 tick on the progress bar...
    # If an error occurs (e.g. http error, or memory error)...
    # Not a bare except: Ctrl-C and SystemExit must still propagate.
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from ECP website.",
                             "error")
        # Do not leave the progress bar hanging on a failed download.
        progressBar.finish()
        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return

    # Store downloaded XML in input objects...
    for xml_content in xml_contents:
        self.createdInputs.append(Input(xml_content, self.captionTitle))

    # If there's only one play, the widget's output is the created Input.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update(annotations[idx])
        self.segmentation[idx] = segment

    # Store imported URLs as setting (only the first selected title's URL).
    self.importedURLs = [
        self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
    ]

    # Set status to OK and report data size...
    message = "%i segment@p sent to output " % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    # Clear progress bar (once; it used to be finished twice).
    progressBar.finish()

    # Send token...
    self.send("XML-TEI data", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
import re

import LTTL.Segmenter as Segmenter
from LTTL.Input import Input

# Tokenize the same text twice: once into words, once into vowels.
input_seg = Input("un texte")

word_regexes = [(re.compile(r'\w+'), 'tokenize')]
word_seg = Segmenter.tokenize(input_seg, word_regexes)

vowel_regexes = [(re.compile(r'[aeiouy]'), 'tokenize')]
vowel_seg = Segmenter.tokenize(input_seg, vowel_regexes)

# Print every vowel segment contained in the second word segment.
second_word = word_seg[1]
for seg in second_word.get_contained_segments(vowel_seg):
    print(seg.get_content())
def handleNewSignals(self):
    """Overridden: called after multiple signals have been added

    Opens the settings context for the current segmentations, refreshes
    the GUI, and calls sendIf() on the send button.
    """
    self.openContext(self.uuid, self.segmentations)
    self.updateGUI()
    self.sendButton.sendIf()


if __name__ == '__main__':
    import sys
    import re
    from PyQt4.QtGui import QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input

    appl = QApplication(sys.argv)
    ow = OWTextableVariety()

    # Two labelled texts, concatenated into a corpus.
    seg1 = Input(u'aabccc', 'text1')
    seg2 = Input(u'abci', 'text2')
    seg3 = Segmenter.concatenate(
        [seg1, seg2],
        import_labels_as='string',
        label='corpus',
    )

    # Word tokens of the corpus, then the [ai] tokens within them.
    word_regex = re.compile(r'\w+')
    seg4 = Segmenter.tokenize(
        seg3,
        regexes=[(word_regex, u'tokenize')],
    )
    letter_regex = re.compile(r'[ai]')
    seg5 = Segmenter.tokenize(
        seg4,
        regexes=[(letter_regex, u'tokenize')],
        label='V',
    )
def clearCreatedInputs(self):
    """Set the stored data of every created Input to None and empty the list."""
    for created_input in self.createdInputs:
        string_index = created_input[0].str_index
        Segmentation.set_data(string_index, None)
    # Emptied in place, so the same list object stays bound to the attribute.
    del self.createdInputs[:]


def onDeleteWidget(self):
    """Free memory when widget is deleted (overridden method)"""
    self.clearCreatedInputs()


def setCaption(self, title):
    """Set the caption; flag the settings as changed if the title differs."""
    if 'captionTitle' not in dir(self):
        super().setCaption(title)
        return
    changed = title != self.captionTitle
    super().setCaption(title)
    if changed:
        self.sendButton.settingsChanged()


if __name__ == "__main__":
    import sys
    from PyQt5.QtWidgets import QApplication

    # Manual smoke test: tag one English sentence.
    myApplication = QApplication(sys.argv)
    myWidget = Treetagger()
    myWidget.show()
    myWidget.segmentation = Input("My tailor is rich.")
    myWidget.language = "English"
    myWidget.sendData()
    myApplication.exec_()
def sendData(self):
    """Compute result of widget processing and send to output.

    For every entry of self.myBasket, the Gutenberg book id is looked up in
    the Gutenberg cache, the text is fetched and stripped of its headers,
    and an Input annotated with the entry's title (text[1]) is created.
    The single Input, or the concatenation of all Inputs, is sent on the
    "Gutenberg importation" channel. Nothing is sent when the basket is
    empty or when retrieval fails.
    """
    # Skip if title list is empty:
    if self.myBasket == list():
        self.infoBox.setText(
            "Your corpus is empty, please add some books first", "warning")
        return

    # Clear created Inputs.
    self.clearCreatedInputs()

    self.controlArea.setDisabled(True)

    # Initialize progress bar.
    progressBar = ProgressBar(self, iterations=len(self.myBasket))

    text_content = list()
    annotations = list()

    # get the Gutenberg cache
    cache = GutenbergCache.get_cache()

    try:
        # TODO: Retrieve selected texts from gutenberg
        for text in self.myBasket:
            # Get the id of the text.
            # NOTE(review): the SQL text is built by string formatting from
            # text[2] — confirm this value can never come from user input.
            sql_query = (
                "select gutenbergbookid from books where id == {selected_id}"
                .format(selected_id=text[2])
            )
            gutenberg_id = list(cache.native_query(sql_query=sql_query))

            # Get the text with Gutenbergpy
            raw_text = gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0])
            text_content.append(gutenbergpy.textget.strip_headers(raw_text))

            annotations.append(text[1])
            progressBar.advance()

    # If an error occurs (e.g. http error, or memory error)...
    except Exception:
        # Set Info box and widget to "error" state.
        self.infoBox.setText("Couldn't download data from Gutenberg",
                             "error")
        self.controlArea.setDisabled(False)
        return

    # TODO: send gutenberg texts as output
    # Store downloaded text strings in input objects...
    for text in text_content:
        self.createdInputs.append(Input(text, self.captionTitle))

    # A single input is sent as is; otherwise everything is concatenated.
    if len(self.createdInputs) == 1:
        self.segmentation = self.createdInputs[0]
    else:
        self.segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None,
        )

    # TODO: annotate with book metadata
    # Annotate segments...
    for idx, segment in enumerate(self.segmentation):
        segment.annotations.update({"title": annotations[idx]})
        self.segmentation[idx] = segment

    # Clear progress bar.
    progressBar.finish()

    self.controlArea.setDisabled(False)

    # Set status to OK and report data size...
    segment_count = len(self.segmentation)
    message = pluralize(
        "%i segment@p sent to output " % segment_count,
        segment_count,
    )
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += "(%i character@p)." % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)

    self.send("Gutenberg importation", self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def setUp(self):
    """Build the reference crosstabs and the Processor outputs to compare.

    The text "un texte" is tokenized into words and into letters (letters
    carry a 'type' annotation, 'C' or 'V'). For each co-occurrence scenario
    the expected row ids, column ids, cell values and header descriptions
    are stored on the test case and wrapped in an IntPivotCrosstab; the
    matching Processor outputs are computed at the end.
    """
    self.maxDiff = None

    def cell_values(row_ids, col_ids, counts):
        """Map each (row, column) pair onto its count, in row-major order."""
        return {
            (row_id, col_id): count
            for row_id, row_counts in zip(row_ids, counts)
            for col_id, count in zip(col_ids, row_counts)
        }

    source = Input("un texte")
    words = Segmenter.tokenize(
        source,
        [(re.compile(r'\w+'), 'tokenize')],
        import_annotations=False,
    )
    letters = Segmenter.tokenize(
        source,
        [
            (re.compile(r'\w'), 'tokenize', {'type': 'C'}),
            (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )
    vowels, consonants = Segmenter.select(
        letters,
        re.compile(r'V'),
        annotation_key='type',
    )

    # Expected matrix for cooccurrence in window
    # with window_size=3 and without annotation (woa):
    self.window_woa_row_ids = ['u', 'n', 't', 'e', 'x']
    self.window_woa_col_ids = ['u', 'n', 't', 'e', 'x']
    self.window_woa_values = cell_values(
        self.window_woa_row_ids,
        self.window_woa_col_ids,
        [
            [1, 1, 1, 0, 0],
            [1, 2, 2, 1, 0],
            [1, 2, 5, 4, 3],
            [0, 1, 4, 4, 3],
            [0, 0, 3, 3, 3],
        ],
    )
    self.window_woa_header_row_id = '__unit__'
    self.window_woa_header_row_type = 'string'
    self.window_woa_header_col_id = '__unit__'
    self.window_woa_header_col_type = 'string'
    self.window_woa_col_type = dict.fromkeys(
        self.window_woa_col_ids, 'continuous'
    )
    self.window_woa_ref = IntPivotCrosstab(
        self.window_woa_row_ids,
        self.window_woa_col_ids,
        self.window_woa_values,
        self.window_woa_header_row_id,
        self.window_woa_header_row_type,
        self.window_woa_header_col_id,
        self.window_woa_header_col_type,
        self.window_woa_col_type,
    )

    # Expected matrix for cooccurrence in window
    # with window_size=3 and with annotation (wa):
    self.window_wa_row_ids = ['C', 'V']
    self.window_wa_col_ids = ['C', 'V']
    self.window_wa_values = cell_values(
        self.window_wa_row_ids,
        self.window_wa_col_ids,
        [
            [5, 5],
            [5, 5],
        ],
    )
    self.window_wa_header_row_id = '__unit__'
    self.window_wa_header_row_type = 'string'
    self.window_wa_header_col_id = '__unit__'
    self.window_wa_header_col_type = 'string'
    self.window_wa_col_type = dict.fromkeys(
        self.window_wa_col_ids, 'continuous'
    )
    self.window_wa_ref = IntPivotCrosstab(
        self.window_wa_row_ids,
        self.window_wa_col_ids,
        self.window_wa_values,
        self.window_wa_header_row_id,
        self.window_wa_header_row_type,
        self.window_wa_header_col_id,
        self.window_wa_header_col_type,
        self.window_wa_col_type,
    )

    # Expected matrix for cooccurrence in context
    # without the secondary unit (wos) and without annotation (woa):
    self.context_wos_woa_row_ids = ['u', 'n', 't', 'e', 'x']
    self.context_wos_woa_col_ids = ['u', 'n', 't', 'e', 'x']
    self.context_wos_woa_values = cell_values(
        self.context_wos_woa_row_ids,
        self.context_wos_woa_col_ids,
        [
            [1, 1, 0, 0, 0],
            [1, 1, 0, 0, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 1, 1, 1],
            [0, 0, 1, 1, 1],
        ],
    )
    self.context_wos_woa_header_row_id = '__context__'
    self.context_wos_woa_header_row_type = 'string'
    self.context_wos_woa_header_col_id = '__context__'
    self.context_wos_woa_header_col_type = 'string'
    self.context_wos_woa_col_type = dict.fromkeys(
        self.context_wos_woa_col_ids, 'continuous'
    )
    self.context_wos_woa_ref = IntPivotCrosstab(
        self.context_wos_woa_row_ids,
        self.context_wos_woa_col_ids,
        self.context_wos_woa_values,
        self.context_wos_woa_header_row_id,
        self.context_wos_woa_header_row_type,
        self.context_wos_woa_header_col_id,
        self.context_wos_woa_header_col_type,
        self.context_wos_woa_col_type,
    )

    # Expected matrix for cooccurrence in context
    # without the secondary unit (wos) and with annotation (wa):
    self.context_wos_wa_row_ids = ['V', 'C']
    self.context_wos_wa_col_ids = ['V', 'C']
    self.context_wos_wa_values = cell_values(
        self.context_wos_wa_row_ids,
        self.context_wos_wa_col_ids,
        [
            [2, 2],
            [2, 2],
        ],
    )
    self.context_wos_wa_header_row_id = '__context__'
    self.context_wos_wa_header_row_type = 'string'
    self.context_wos_wa_header_col_id = '__context__'
    self.context_wos_wa_header_col_type = 'string'
    self.context_wos_wa_col_type = dict.fromkeys(
        self.context_wos_wa_col_ids, 'continuous'
    )
    self.context_wos_wa_ref = IntPivotCrosstab(
        self.context_wos_wa_row_ids,
        self.context_wos_wa_col_ids,
        self.context_wos_wa_values,
        self.context_wos_wa_header_row_id,
        self.context_wos_wa_header_row_type,
        self.context_wos_wa_header_col_id,
        self.context_wos_wa_header_col_type,
        self.context_wos_wa_col_type,
    )

    # Expected matrix for cooccurrence in context
    # with the secondary unit (ws) and without annotation (woa):
    self.context_ws_woa_col_ids = ['u', 'e']
    self.context_ws_woa_row_ids = ['n', 't', 'x']
    self.context_ws_woa_values = cell_values(
        self.context_ws_woa_row_ids,
        self.context_ws_woa_col_ids,
        [
            [1, 0],
            [0, 1],
            [0, 1],
        ],
    )
    self.context_ws_woa_header_row_id = '__context__'
    self.context_ws_woa_header_row_type = 'string'
    self.context_ws_woa_header_col_id = '__context__'
    self.context_ws_woa_header_col_type = 'string'
    self.context_ws_woa_col_type = dict.fromkeys(
        self.context_ws_woa_col_ids, 'continuous'
    )
    self.context_ws_woa_ref = IntPivotCrosstab(
        self.context_ws_woa_row_ids,
        self.context_ws_woa_col_ids,
        self.context_ws_woa_values,
        self.context_ws_woa_header_row_id,
        self.context_ws_woa_header_row_type,
        self.context_ws_woa_header_col_id,
        self.context_ws_woa_header_col_type,
        self.context_ws_woa_col_type,
    )

    # Expected matrix for cooccurrence in context
    # with the secondary unit (ws) and with annotation (wa):
    self.context_ws_wa_row_ids = ['C']
    self.context_ws_wa_col_ids = ['V']
    self.context_ws_wa_values = cell_values(
        self.context_ws_wa_row_ids,
        self.context_ws_wa_col_ids,
        [
            [2],
        ],
    )
    self.context_ws_wa_header_row_id = '__context__'
    self.context_ws_wa_header_row_type = 'string'
    self.context_ws_wa_header_col_id = '__context__'
    self.context_ws_wa_header_col_type = 'string'
    self.context_ws_wa_col_type = dict.fromkeys(
        self.context_ws_wa_col_ids, 'continuous'
    )
    self.context_ws_wa_ref = IntPivotCrosstab(
        self.context_ws_wa_row_ids,
        self.context_ws_wa_col_ids,
        self.context_ws_wa_values,
        self.context_ws_wa_header_row_id,
        self.context_ws_wa_header_row_type,
        self.context_ws_wa_header_col_id,
        self.context_ws_wa_header_col_type,
        self.context_ws_wa_col_type,
    )

    # Actual outputs, one per scenario above.
    plain_letters = {'segmentation': letters}
    typed_letters = {'segmentation': letters, 'annotation_key': 'type'}
    word_contexts = {'segmentation': words}

    self.output_cooc_in_window_woa = Processor.cooc_in_window(
        units=plain_letters,
        window_size=3,
    )
    self.output_cooc_in_window_wa = Processor.cooc_in_window(
        units=typed_letters,
        window_size=3,
    )
    self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
        units={'segmentation': letters},
        contexts={'segmentation': words},
        units2=None,
    )
    self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
        units={'segmentation': letters, 'annotation_key': 'type'},
        contexts=word_contexts,
        units2=None,
    )
    self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
        units={'segmentation': vowels},
        contexts={'segmentation': words},
        units2={'segmentation': consonants},
    )
    self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
        units={'segmentation': vowels, 'annotation_key': 'type'},
        contexts={'segmentation': words},
        units2={'segmentation': consonants, 'annotation_key': 'type'},
    )
def sendData(self):
    """Tag the input segmentation with TreeTagger and send the result.

    Sends None on the "Tagged data" channel (with a warning in the info box)
    when the TreeTagger path, the available languages or the input
    segmentation is missing. Otherwise the input segments are wrapped in
    <ax_tt> elements carrying their annotations, tagged in a single
    TreeTagger call, and each tagged line is turned into a <w> element;
    depending on ``self.outputFormat`` the output is either that XML text or
    its segmentation into individual <w> tokens.
    """
    # Clear created Inputs...
    self.clearCreatedInputs()

    # Guard clauses: each missing prerequisite yields a warning and None.
    if not self.TreetaggerPath:
        self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.getAvailableLanguages():
        self.infoBox.setText(self.noLanguageParameterWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.segmentation:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Tagged data", None)
        return

    # Initialize progress bar.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    self.progressBar = ProgressBar(self, iterations=5)

    def without_combining_marks(text):
        """Return text NFD-decomposed, with 'Mn' category characters dropped."""
        return ''.join(
            char for char in unicodedata.normalize('NFD', text)
            if unicodedata.category(char) != 'Mn'
        )

    # Create a copy of input seg, storing annotations in temp attr...
    annotated_copy = Segmentation()
    annotated_copy.label = self.segmentation.label
    for segment in self.segmentation:
        attribute_pairs = list()
        for key, value in segment.annotations.items():
            attribute_pairs.append(
                "%s=%s" % (without_combining_marks(key), quoteattr(str(value)))
            )
        segment.annotations["tt_ax"] = " ".join(attribute_pairs)
        annotated_copy.append(segment)

    self.progressBar.advance()

    # Dump segmentation in unique string to avoid multiple calls to TT...
    text_to_tag = annotated_copy.to_string(
        formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
        display_all=True,
    )

    self.progressBar.advance()

    # Tag the segmentation contents...
    tagger_options = '-token -lemma -sgml -quiet'
    if self.replaceUnknown:
        tagger_options = tagger_options + " -no-unknown"
    language_code = pycountry.languages.get(name=self.language).alpha_2
    tagger = treetaggerwrapper.TreeTagger(
        TAGLANG=language_code,
        TAGOPT=tagger_options,
        TAGDIR=self.TreetaggerPath,
    )
    tagged_lines = tagger.tag_text(
        text_to_tag,
        notagurl=True,
        notagemail=True,
        notagip=True,
        notagdns=True,
    )
    tagged_input = Input("\n".join(tagged_lines))
    self.createdInputs.append(tagged_input)

    # Replace <unknown> with [unknown], then re-segment to match the
    # original segmentation structure.
    # NOTE(review): the second substitution replaces '"""' with the very
    # same string, i.e. it is a no-op as written -- the replacement was
    # presumably meant to escape the quote; confirm against upstream.
    cleanup_rules = [
        (re.compile(r"<unknown>"), "[unknown]"),
        (re.compile(r'"""'), '"""'),
    ]
    tagged_segmentation, _ = Segmenter.recode(
        tagged_input,
        substitutions=cleanup_rules,
    )
    tagged_segmentation = Segmenter.import_xml(tagged_segmentation, "ax_tt")

    self.progressBar.advance()

    # Place each output line of Treetagger in an xml tag with annotations..
    line_to_xml_rules = [
        (
            re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
            '<w lemma="&3" pos-tag="&2">&1</w>',
        ),
        (re.compile(r'^\n|\n$'), ''),
    ]
    xml_segmentation, _ = Segmenter.recode(
        tagged_segmentation,
        substitutions=line_to_xml_rules,
    )

    # Segment into individual tokens if XML output option is disabled...
    if self.outputFormat != "add XML tags":
        try:
            output_segmentation = Segmenter.import_xml(xml_segmentation, "w")
        except ValueError:
            self.infoBox.setText(
                "Please check that either the input contains well-formed "
                "XML, or it doesn't contain instances of '<' and '\x3e'",
                "error")
            self.send("Tagged data", None)
            self.progressBar.finish()
            self.controlArea.setDisabled(False)
            return
    else:
        output_segmentation = xml_segmentation

    self.progressBar.finish()
    self.controlArea.setDisabled(False)

    # Label the output and report its size.
    output_segmentation.label = self.captionTitle
    message = pluralize(
        u'%i segment@p sent to output.' % len(output_segmentation),
        len(output_segmentation),
    )
    self.infoBox.setText(message)
    self.send('Tagged data', output_segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Load files, create and send segmentation.

    Each entry of the file list is read as raw text, as a PDF (text
    extraction or OCR) or as an image (OCR), depending on the type guessed
    from its content. The text is NFC-normalized, wrapped in an Input and
    annotated; the Inputs are sent on the "Text data" channel (a single
    Input, or their concatenation when there are several).

    On any failure (no input file, missing auto-numbering key, unreadable or
    unsupported file) an explanatory message is shown, None is sent and the
    method returns early.
    """

    def abort(message):
        """Close the progress bar, report the error and re-enable the GUI."""
        progressBar.finish()
        self.infoBox.setText(message, 'error')
        self.send('Text data', None, self)
        self.controlArea.setDisabled(False)

    # Check that there's something on input...
    if ((self.displayAdvancedSettings and not self.files)
            or not (self.file or self.displayAdvancedSettings)):
        self.infoBox.setText(u'Please select input file.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    if (self.displayAdvancedSettings and self.autoNumber
            and not self.autoNumberKey):
        self.infoBox.setText(
            u'Please enter an annotation key for auto-numbering.',
            'warning')
        self.send('Text data', None, self)
        return

    # Clear created Inputs...
    self.clearCreatedInputs()

    fileContents = list()
    annotations = list()
    counter = 1

    if self.displayAdvancedSettings:
        myFiles = self.files
    else:
        myFiles = [[self.file, self.encoding, "", "", "", "eng", False]]

    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(myFiles))

    # Open and process each file successively...
    for myFile in myFiles:
        filePath = myFile[0]
        # Drop anything from " (" onwards in the encoding name.
        encoding = re.sub(r"[ ]\(.+", "", myFile[1])
        annotation_key = myFile[2]
        annotation_value = myFile[3]
        # myFile[4] holds the PDF password (SuperTextFiles); unused here.
        ocr_languages = myFile[5]  # SuperTextFiles
        ocr_force = myFile[6]  # SuperTextFiles

        # Try to open the file...
        self.error()

        # Start from the failure sentinel so that a file whose detected type
        # is neither PDF nor a supported image is reported as unreadable
        # instead of leaving fileContent unbound (or silently reusing the
        # previous file's content).
        # NOTE(review): -1 is presumably what the extraction helpers return
        # on failure -- confirm.
        fileContent = -1

        # Start SuperTextFiles
        try:
            # Guessing the type reads the file, so it is done inside the
            # try block: an unreadable path is handled like any IOError.
            myFiletype = filetype.guess(filePath)

            if myFiletype is None:
                fileContent = self.extract_raw_text(filePath, encoding)

            elif myFiletype.extension == "pdf":
                if (ocr_force is not True
                        and self.is_textual_pdf_file(filePath) is True):
                    fileContent = self.extract_text_from_pdf(filePath)
                else:
                    fileContent = self.get_pdf_content(
                        filePath,
                        ocr_languages,
                    )

            elif myFiletype.extension in IMG_FILETYPES:
                fileContent = self.ocrize(filePath, ocr_languages)
        # End SuperTextFiles

        except IOError as e:
            if "tesseract" in str(e):
                QMessageBox.warning(None, 'Textable', str(e),
                                    QMessageBox.Ok)
            if len(myFiles) > 1:
                abort(u"Couldn't open file '%s'." % filePath)
            else:
                abort(u"Couldn't open file.")
            return

        if fileContent == -1:
            abort(u"Couldn't open file.")
            return

        # Remove utf-8 BOM if necessary...
        if encoding == u'utf-8':
            fileContent = fileContent.lstrip(
                codecs.BOM_UTF8.decode('utf-8'))

        # Normalize text (canonical decomposition then composition)...
        fileContent = normalize('NFC', fileContent)

        fileContents.append(fileContent)

        # Annotations...
        annotation = dict()
        if self.displayAdvancedSettings:
            if annotation_key and annotation_value:
                annotation[annotation_key] = annotation_value
            if self.importFilenames and self.importFilenamesKey:
                filename = os.path.basename(filePath)
                annotation[self.importFilenamesKey] = filename
            if self.autoNumber and self.autoNumberKey:
                annotation[self.autoNumberKey] = counter
                counter += 1
        annotations.append(annotation)
        progressBar.advance()

    # Create an LTTL.Input for each file; only a lone Input carries the
    # widget's caption as its label.
    label = self.captionTitle if len(fileContents) == 1 else None
    for content, annotation in zip(fileContents, annotations):
        myInput = Input(content, label)
        segment = myInput[0]
        segment.annotations.update(annotation)
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one file, the widget's output is the created Input.
    if len(fileContents) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    # Report the number of segments and characters sent.
    message = u'%i segment@p sent to output ' % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = sum(
        len(Segmentation.get_data(segment.str_index))
        for segment in self.segmentation
    )
    message += u'(%i character@p).' % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def setUp(self):
    """Create the Input strings and Segmentation fixtures used by the tests."""

    def span(index, start, end, **extra):
        """Build a Segment over [start, end) of the string at `index`."""
        return Segment(str_index=index, start=start, end=end, **extra)

    # Fixtures over the string 'ab cde'.
    self.entire_text_seg = Input('ab cde')
    first_index = self.entire_text_seg[0].str_index

    self.word_seg = Segmentation([
        span(first_index, 0, 2, annotations={'a': '1'}),
        span(first_index, 3, 6),
    ])
    self.char_seg = Segmentation(
        [span(first_index, pos, pos + 1) for pos in range(6)]
    )
    self.letter_seg1 = Segmentation([
        span(first_index, 0, 1, annotations={'a': '1'}),
        span(first_index, 1, 2),
    ])
    self.letter_seg2 = Segmentation([
        span(first_index, 3, 4),
        span(first_index, 4, 5, annotations={'b': '2'}),
        span(first_index, 5, 6),
    ])
    self.letter_seg = Segmentation([
        span(first_index, 0, 1, annotations={'a': '1'}),
        span(first_index, 1, 2),
        span(first_index, 3, 4),
        span(first_index, 4, 5, annotations={'b': '2'}),
        span(first_index, 5, 6),
    ])
    self.single_letter_seg = Segmentation([
        span(first_index, 4, 5, annotations={'b': '1'}),
    ])
    self.duplicate_seg = Segmentation(
        [span(first_index, 0, 1) for _ in range(2)]
    )
    self.overlapping_seg = Segmentation([
        span(first_index, 3, 5),
        span(first_index, 4, 6),
    ])

    # Fixtures over the string 'abbccc': one annotated segment per letter.
    self.other_entire_text_seg = Input('abbccc')
    second_index = self.other_entire_text_seg[0].str_index
    self.other_letter_seg = Segmentation([
        span(second_index, pos, pos + 1, annotations={'a': value})
        for pos, value in enumerate(['1', '1', '1', '2', '2', '3'])
    ])

    # Fixtures over the string 'bd1'.
    self.third_entire_text_seg = Input('bd1')
    third_index = self.third_entire_text_seg[0].str_index
    self.third_letter_seg = Segmentation([
        span(third_index, 0, 1),
        span(third_index, 1, 2, annotations={'a': '2'}),
        span(third_index, 2, 3, annotations={'a': 'b'}),
    ])

    # Fixtures over a string with upper case and a non-ASCII character.
    self.fourth_entire_text_seg = Input('AB cd\xe9')
    fourth_index = self.fourth_entire_text_seg[0].str_index
    self.second_word_seg = Segmentation([
        span(fourth_index, 0, 2),
        span(fourth_index, 3, 6),
    ])

    # XML fixtures: one nested document, two unbalanced ones, and a document
    # split across two Inputs.
    self.xml_seg = Input('<a attr="1"><a attr="2/3/">c<a/>d</a></a>')
    self.wrong_xml_seg = Input('<a><a>test</a>')
    self.wrong_xml_seg2 = Input('<a>test</a></a>')

    self.part_xml_seg = Input('<a>1<a>2<a>3</a>4')
    first_part_index = self.part_xml_seg[0].str_index
    self.part_xml_seg2 = Input('</a>5</a>')
    second_part_index = self.part_xml_seg2[0].str_index
    self.broken_xml_seg = Segmentation([
        Segment(str_index=first_part_index, annotations={'a': '1'}),
        Segment(str_index=second_part_index),
    ])

    self.count = 0