def test_import_xml_exception_missing_opening(self):
    """Check that import_xml raises ValueError on a missing opening tag."""
    failure_msg = "import_xml doesn't detect missing opening tag!"
    with self.assertRaises(ValueError, msg=failure_msg):
        Segmenter.import_xml(self.wrong_xml_seg2, element='a')
def test_import_xml_exception_missing_opening(self):
    """import_xml must raise ValueError when an opening tag is missing."""
    expectation = self.assertRaises(
        ValueError,
        msg="import_xml doesn't detect missing opening tag!",
    )
    with expectation:
        Segmenter.import_xml(self.wrong_xml_seg2, element='a')
def test_import_xml_progress(self):
    """Check that import_xml calls the progress callback once per input."""

    def bump_counter():
        """Record one more callback invocation on the test case."""
        self.count += 1

    source = self.broken_xml_seg
    Segmenter.import_xml(
        source,
        element='a',
        progress_callback=bump_counter,
    )
    self.assertEqual(
        self.count,
        len(source),
        msg="import_xml doesn't track progress!",
    )
def test_import_xml_autonumber(self):
    """Check that import_xml numbers the segments it creates."""
    result = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        auto_number_as='num',
    )
    numbers = [seg.annotations['num'] for seg in result]
    self.assertEqual(
        numbers,
        [1, 2],
        msg="import_xml doesn't autonumber input segments!",
    )
def test_import_xml_progress(self):
    """The progress callback must fire as many times as there are inputs."""

    def tick():
        """Increment the invocation counter kept on the test case."""
        self.count += 1

    import_options = {'element': 'a', 'progress_callback': tick}
    Segmenter.import_xml(self.broken_xml_seg, **import_options)

    failure_msg = "import_xml doesn't track progress!"
    self.assertEqual(
        self.count, len(self.broken_xml_seg), msg=failure_msg,
    )
def test_import_xml_condition(self):
    """Check that import_xml keeps only elements matching the conditions."""
    attr_filter = {'attr': re.compile(r'^2$')}
    result = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        conditions=attr_filter,
    )
    observed = [seg.annotations['attr'] for seg in result]
    self.assertEqual(
        observed,
        ['2'],
        msg="import_xml doesn't respect conditions!",
    )
def test_import_xml_segment_elements(self):
    """Check that import_xml creates one segment per matching element."""
    result = Segmenter.import_xml(self.xml_seg, element='a')
    contents = [seg.get_content() for seg in result]
    expected = ['<a attr="2/3/">c<a/>d</a>', 'c<a/>d']
    self.assertEqual(
        contents,
        expected,
        msg="import_xml doesn't segment xml elements!",
    )
def test_import_xml_convert_attributes(self):
    """Check that element attributes end up as segment annotations."""
    result = Segmenter.import_xml(self.xml_seg, element='a')
    attr_values = [seg.annotations['attr'] for seg in result]
    self.assertEqual(
        attr_values,
        ['1', '2/3/'],
        msg="import_xml doesn't convert attributes!",
    )
def test_import_xml_import_annotations_false(self):
    """Check that import_annotations=False drops the input annotations."""
    result = Segmenter.import_xml(
        self.broken_xml_seg,
        element='a',
        import_annotations=False,
    )
    first_annotations = result[0].annotations
    self.assertFalse(
        'a' in first_annotations,
        msg="import_xml doesn't skip importing annotations!",
    )
def test_import_xml_merge_duplicates(self):
    """Check the contents produced when duplicate segments are merged."""
    options = {
        'element': 'a',
        'merge_duplicates': True,
        'remove_markup': True,
    }
    result = Segmenter.import_xml(self.xml_seg, **options)
    contents = [seg.get_content() for seg in result]
    self.assertEqual(
        contents,
        ['c', 'd'],
        msg="import_xml doesn't merge duplicates!",
    )
def test_import_xml_import_element_as_annotation(self):
    """Check that the element name is stored under the requested key."""
    result = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        import_element_as='test',
    )
    element_names = [seg.annotations['test'] for seg in result]
    self.assertEqual(
        element_names,
        ['a', 'a'],
        msg="import_xml doesn't import element as annotation!",
    )
def test_import_xml_segment_elements_broken(self):
    """Check element segmentation when markup spans distinct strings."""
    result = Segmenter.import_xml(self.broken_xml_seg, element='a')
    contents = [seg.get_content() for seg in result]
    expected = ['1<a>2<a>3</a>4', '2<a>3</a>4', '3', '</a>5']
    self.assertEqual(
        contents,
        expected,
        msg="import_xml doesn't segment elements from distinct strings!",
    )
def test_import_xml_remove_markup(self):
    """Check that remove_markup=True strips tags from the contents."""
    attr_filter = {'attr': re.compile(r'^2/3/$')}
    result = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        conditions=attr_filter,
        remove_markup=True,
    )
    contents = [seg.get_content() for seg in result]
    self.assertEqual(
        contents,
        ['c', 'd'],
        msg="import_xml doesn't remove markup!",
    )
def test_import_xml_import_annotations(self):
    """Check that import_annotations=True keeps the input annotations."""
    result = Segmenter.import_xml(
        self.broken_xml_seg,
        element='a',
        import_annotations=True,
    )
    imported_value = result[0].annotations['a']
    self.assertEqual(
        imported_value,
        '1',
        msg="import_xml doesn't import annotations!",
    )
def test_import_xml_segment_elements(self):
    """import_xml must yield one segment for each matching element."""
    found = Segmenter.import_xml(self.xml_seg, element='a')
    self.assertEqual(
        [item.get_content() for item in found],
        [
            '<a attr="2">c<a/>d</a>',
            'c<a/>d',
        ],
        msg="import_xml doesn't segment xml elements!",
    )
def test_import_xml_import_annotations_false(self):
    """Input annotations must be absent when import_annotations=False."""
    found = Segmenter.import_xml(
        self.broken_xml_seg, element='a', import_annotations=False,
    )
    was_imported = 'a' in found[0].annotations
    self.assertFalse(
        was_imported,
        msg="import_xml doesn't skip importing annotations!",
    )
def test_import_xml_segment_elements_broken(self):
    """Elements spread over distinct strings must still be segmented."""
    found = Segmenter.import_xml(self.broken_xml_seg, element='a')
    failure_msg = "import_xml doesn't segment elements from distinct strings!"
    self.assertEqual(
        [item.get_content() for item in found],
        ['1<a>2<a>3</a>4', '2<a>3</a>4', '3', '</a>5'],
        msg=failure_msg,
    )
def test_import_xml_convert_attributes(self):
    """Attributes of imported elements must become annotations."""
    found = Segmenter.import_xml(self.xml_seg, element='a')
    failure_msg = "import_xml doesn't convert attributes!"
    self.assertEqual(
        [item.annotations['attr'] for item in found],
        ['1', '2'],
        msg=failure_msg,
    )
def test_import_xml_solve_attribute_conflict(self):
    """Check which 'attr' value survives when duplicates are merged."""
    result = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        merge_duplicates=True,
        remove_markup=True,
    )
    surviving_value = result[0].annotations['attr']
    self.assertEqual(
        surviving_value,
        '1',
        msg="import_xml doesn't solve attribute conflicts!",
    )
def test_import_xml_remove_markup_broken(self):
    """Check markup removal when elements span distinct strings."""
    result = Segmenter.import_xml(
        self.broken_xml_seg,
        element='a',
        remove_markup=True,
    )
    contents = [seg.get_content() for seg in result]
    expected = ['1', '2', '2', '3', '3', '3', '4', '4', '5']
    self.assertEqual(
        contents,
        expected,
        msg="import_xml doesn't remove markup from distinct strings!",
    )
def test_import_xml_import_element_as_annotation(self):
    """The element name must be annotated under the import_element_as key."""
    found = Segmenter.import_xml(
        self.xml_seg, element='a', import_element_as='test',
    )
    failure_msg = "import_xml doesn't import element as annotation!"
    self.assertEqual(
        [item.annotations['test'] for item in found],
        ['a', 'a'],
        msg=failure_msg,
    )
def test_import_xml_import_annotations(self):
    """Input annotations must be kept when import_annotations=True."""
    found = Segmenter.import_xml(
        self.broken_xml_seg, element='a', import_annotations=True,
    )
    first_segment = found[0]
    self.assertEqual(
        first_segment.annotations['a'],
        '1',
        msg="import_xml doesn't import annotations!",
    )
def test_import_xml_remove_markup_broken(self):
    """Markup must be removed even when it spans distinct strings."""
    found = Segmenter.import_xml(
        self.broken_xml_seg, element='a', remove_markup=True,
    )
    failure_msg = "import_xml doesn't remove markup from distinct strings!"
    self.assertEqual(
        [item.get_content() for item in found],
        ['1', '2', '2', '3', '3', '3', '4', '4', '5'],
        msg=failure_msg,
    )
def test_import_xml_condition(self):
    """Only elements whose attributes match the conditions are imported."""
    found = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        conditions={'attr': re.compile(r'^2$')},
    )
    failure_msg = "import_xml doesn't respect conditions!"
    self.assertEqual(
        [item.annotations['attr'] for item in found],
        ['2'],
        msg=failure_msg,
    )
def test_import_xml_autonumber(self):
    """Imported segments must be numbered under the auto_number_as key."""
    found = Segmenter.import_xml(
        self.xml_seg, element='a', auto_number_as='num',
    )
    failure_msg = "import_xml doesn't autonumber input segments!"
    self.assertEqual(
        [item.annotations['num'] for item in found],
        [1, 2],
        msg=failure_msg,
    )
def test_import_xml_preserve_leaves(self):
    """Check which 'attr' value survives merging with preserve_leaves."""
    result = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        merge_duplicates=True,
        remove_markup=True,
        preserve_leaves=True,
    )
    surviving_value = result[0].annotations['attr']
    self.assertEqual(
        surviving_value,
        '2/3/',
        msg="import_xml doesn't preserve leaves!",
    )
def test_import_xml_solve_attribute_conflict(self):
    """Merged duplicates must expose the expected 'attr' annotation."""
    options = {
        'element': 'a',
        'merge_duplicates': True,
        'remove_markup': True,
    }
    found = Segmenter.import_xml(self.xml_seg, **options)
    failure_msg = "import_xml doesn't solve attribute conflicts!"
    self.assertEqual(found[0].annotations['attr'], '1', msg=failure_msg)
def test_import_xml_remove_markup(self):
    """Tags must be stripped from contents when remove_markup=True."""
    found = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        conditions={'attr': re.compile(r'^2$')},
        remove_markup=True,
    )
    failure_msg = "import_xml doesn't remove markup!"
    self.assertEqual(
        [item.get_content() for item in found],
        ['c', 'd'],
        msg=failure_msg,
    )
def test_import_xml_merge_duplicates(self):
    """Merging duplicates must leave the expected segment contents."""
    found = Segmenter.import_xml(
        self.xml_seg,
        element='a',
        merge_duplicates=True,
        remove_markup=True,
    )
    failure_msg = "import_xml doesn't merge duplicates!"
    self.assertEqual(
        [item.get_content() for item in found],
        ['c', 'd'],
        msg=failure_msg,
    )
def test_import_xml_preserve_leaves(self):
    """preserve_leaves=True must expose the expected 'attr' annotation."""
    options = {
        'element': 'a',
        'merge_duplicates': True,
        'remove_markup': True,
        'preserve_leaves': True,
    }
    found = Segmenter.import_xml(self.xml_seg, **options)
    failure_msg = "import_xml doesn't preserve leaves!"
    self.assertEqual(found[0].annotations['attr'], '2', msg=failure_msg)
def getTitleListFromTheatreClassique(self): """Fetch titles from the Theatre-classique website""" self.infoBox.customMessage( u'Fetching data from Theatre-classique website, please wait') # Attempt to connect to Theatre-classique... try: response = urllib2.urlopen(self.base_url) base_html = unicode(response.read(), 'iso-8859-1') self.infoBox.customMessage( u'Done fetching data from Theatre-classique website.') # If unable to connect (somehow)... except: # Set Info box and widget to 'warning' state. self.infoBox.noDataSent( warning=u"Couldn't access theatre-classique website.") # Empty title list box. self.titleLabels = list() # Reset output channel. self.send(u'Text data', None, self) return None # Otherwise store HTML content in LTTL Input object. base_html_seg = Input(base_html) # Remove accents from the data... recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True) # Extract table containing titles from HTML. table_seg = Segmenter.import_xml( segmentation=recoded_seg, element=u'table', conditions={u'id': re.compile(ur'^table_AA$')}, ) # Extract table lines. line_seg = Segmenter.import_xml( segmentation=table_seg, element=u'tr', ) # Compile the regex that will be used to parse each line. field_regex = re.compile( ur"^\s*<td>\s*<a.+?>(.+?)</a>\s*</td>\s*" ur"<td>(.+?)</td>\s*" ur"<td.+?>\s*<a.+?>\s*(\d+?)\s*</a>\s*</td>\s*" ur"<td.+?>\s*(.+?)\s*</td>\s*" ur"<td.+?>\s*<a\s+.+?t=\.{2}/(.+?)'>\s*HTML") # Parse each line and store the resulting segmentation in an attribute. titleSeg = Segmenter.tokenize( segmentation=line_seg, regexes=[ (field_regex, u'tokenize', { u'author': u'&1' }), (field_regex, u'tokenize', { u'title': u'&2' }), (field_regex, u'tokenize', { u'year': u'&3' }), (field_regex, u'tokenize', { u'genre': u'&4' }), (field_regex, u'tokenize', { u'url': u'&5' }), ], import_annotations=False, merge_duplicates=True, ) # Try to save list in this module's directory for future reference... 
path = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) try: file = open(os.path.join(path, u"cached_title_list"), u'wb') pickle.dump(titleSeg, file, -1) file.close() except IOError: pass # Remove warning (if any)... self.error(0) self.warning(0) return titleSeg
def sendData(self):
    """Tag the input segmentation with TreeTagger and send the result.

    Sends None on 'Text data' when the TreeTagger link is missing or
    when there is no input.  Otherwise the annotations of every input
    segment are serialized into a temporary ``tt_xb`` annotation, the
    whole input is dumped as ``<xb_tt>`` elements and tagged in a single
    call to ``self.tag``, and every tagger output line is wrapped in a
    ``<w>`` element.  When ``self.activer_xml`` is off, the ``<w>``
    elements are further segmented into individual tokens.
    """
    # If the link to TreeTagger was not found...
    if self.NoLink:
        self.infoBox.setText(u"Sorry, TreeTagger's link not found.", "error")
        self.send('Text data', None)

    # Important: if input data is None, propagate this value to output...
    elif not self.inputData:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send('Text data', None)

    else:
        # Show that something is going on...
        self.infoBox.setText(u'TreeTagger is running...', "warning")

        # Initialize progress bar.
        self.progressBar = OWGUI.ProgressBar(self, iterations=5)

        # Serialize each segment's annotations as XML attributes.  The
        # temporary "tt_xb" annotation left over from a previous run is
        # skipped, otherwise re-using the widget would nest it in itself.
        for seg_idx, segment in enumerate(self.inputData):
            attr = " ".join(
                "%s='%s'" % item
                for item in segment.annotations.items()
                if item[0] != "tt_xb"
            )
            segment.annotations["tt_xb"] = attr
            self.inputData[seg_idx] = segment

        # Advance the progress bar by one notch.
        self.progressBar.advance()

        # Dump everything in one string to call the tagger only once.
        concatenated_text = self.inputData.to_string(
            formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
            display_all=True,
        )
        self.progressBar.advance()

        tagged_text = self.tag(concatenated_text)
        tagged_input = Input(tagged_text)
        tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")
        self.progressBar.advance()

        # Both output modes start from the same recoding, so do it once.
        xml_segmentation = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), '[unknown]'),
                (re.compile(r"(.+)\t(.+)\t(.+)"),
                 '<w lemma="&3" type="&2">&1</w>'),
                # NOTE(review): this replaces '"""' with itself, i.e. it
                # is a no-op as written -- presumably the middle quote
                # was meant to be escaped; confirm.
                (re.compile(r'"""'), '"""'),
            ],
        )

        if self.activer_xml == True:
            # XML checkbox on: output the <w> markup as is.
            final_segmentation = xml_segmentation
        else:
            # XML checkbox off: one segment per <w> element.
            final_segmentation = Segmenter.import_xml(
                xml_segmentation, "w")

        self.infoBox.dataSent('')

        # Save the TreeTagger link...
        if self.system == "nt":
            link_path = "treetagger_link.txt"
        else:
            link_path = os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt")
        # The with-block closes the file even if the write fails.
        with open(link_path, 'w') as link_file:
            link_file.write(self.treetagger_link)

        # Clear progress bar.
        self.progressBar.finish()

        # Send the segmentation.
        self.send('Text data', final_segmentation, self)
        self.compteur += 1

        # NOTE(review): nesting rebuilt from whitespace-collapsed source;
        # confirm the flag reset belongs to this branch only.
        self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Tag the input segmentation with TreeTagger and send the result.

    Sends None on "Text data" when the TreeTagger link is missing or
    when there is no input.  Otherwise the input segments are gathered
    in a new segmentation with their annotations serialized into a
    temporary ``tt_xb`` annotation, dumped as ``<xb_tt>`` elements and
    tagged in a single call to ``self.tag``; every tagger output line is
    then wrapped in a ``<w>`` element.  When ``self.activer_xml`` is
    off, the ``<w>`` elements are segmented into individual tokens.
    """
    # If the link to TreeTagger was not found...
    if self.NoLink:
        self.infoBox.setText(u"Sorry, TreeTagger's link not found.", "error")
        self.send("Text data", None)

    # Important: if input data is None, propagate this value to output...
    elif not self.inputData:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Text data", None)

    else:
        # Show that something is going on...
        self.infoBox.setText(u"TreeTagger is running...", "warning")

        # Initialize progress bar.
        self.progressBar = gui.ProgressBar(self, iterations=5)

        # Copy of the segmentation with an extra annotation holding the
        # other annotations serialized as XML attributes...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.inputData.label
        for segment in self.inputData:
            attr = " ".join(
                "%s='%s'" % item for item in segment.annotations.items()
            )
            segment.annotations["tt_xb"] = attr
            copy_of_input_seg.append(segment)

        # Advance the progress bar by one notch.
        self.progressBar.advance()

        # Dump everything in one string to call the tagger only once.
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
            display_all=True,
        )
        self.progressBar.advance()

        tagged_text = self.tag(concatenated_text)
        tagged_input = Input(tagged_text)
        tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")
        self.progressBar.advance()

        # Both output modes start from the same recoding, so do it once.
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 "<w lemma='&3' type='&2'>&1</w>"),
                # NOTE(review): this replaces '"""' with itself, i.e. it
                # is a no-op as written; confirm the intended replacement.
                (re.compile(r'"""'), '"""'),
            ],
        )

        if self.activer_xml == True:
            # XML checkbox on: output the <w> markup as is.
            final_segmentation = xml_segmentation
        else:
            # XML checkbox off: one segment per <w> element.
            final_segmentation = Segmenter.import_xml(
                xml_segmentation, "w")

        self.infoBox.dataSent("")

        # Save the TreeTagger link...
        if self.system == "nt":
            link_path = "treetagger_link.txt"
        else:
            link_path = os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt")
        # The with-block closes the file even if the write fails.
        with open(link_path, "w") as link_file:
            link_file.write(self.treetagger_link)

        # Clear progress bar.
        self.progressBar.finish()

        # Send the segmentation.
        self.send("Text data", final_segmentation, self)
        self.compteur += 1

        # NOTE(review): nesting rebuilt from whitespace-collapsed source;
        # confirm the flag reset belongs to this branch only.
        self.sendButton.resetSettingsChangedFlag()
def getTitleListFromEighteenthCenturyPoetry(self):
    """Fetch titles from the Eighteenth-Century Poetry (ECP) website.

    Downloads the page at ``self.base_url``, extracts the ``<ul>`` whose
    class is ``genres-list``, narrows it down to the ``bibl`` list items
    and tokenizes each of them with one annotation per field (url,
    title, author).  The result is also pickled, on a best-effort basis,
    to ``cached_title_list`` in this module's directory.

    Returns:
        The title segmentation, or None when the website could not be
        read (None is then also sent on the "XML-TEI data" channel).
    """
    self.infoBox.customMessage(
        "Fetching data from ECP website, please wait"
    )

    # Attempt to connect to ECP...
    try:
        response = urllib.request.urlopen(self.base_url)
        base_html = response.read().decode('iso-8859-1')
        self.infoBox.customMessage(
            "Done fetching data from EighteenthCenturyPoetry website."
        )

    # If unable to connect (somehow)...  Exception rather than a bare
    # except, so that KeyboardInterrupt/SystemExit are not swallowed.
    except Exception:
        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(
            warning="Couldn't access EighteenthCenturyPoetry website."
        )

        # Empty title list box.
        self.titleLabels = list()

        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None

    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)

    # Remove accents from the data...
    recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)

    # Extract works.
    genre_corpus = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="ul",
        conditions={"class": re.compile(r"^genres-list$")},
    )

    # tokenize() is given a list of (regex, mode) tuples here, the same
    # shape as the titleSeg call below, instead of a bare regex.
    genre_list = Segmenter.tokenize(
        segmentation=genre_corpus,
        regexes=[(re.compile(r"<a.+$"), "tokenize")],
        import_annotations=False,
        merge_duplicates=True,
    )

    # Patterns containing double quotes are single-quoted raw strings.
    work_list = Segmenter.tokenize(
        segmentation=genre_list,
        regexes=[
            (re.compile(r'<li class="bibl">(.+?)</li>'), "tokenize"),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )

    # Compile the regex that will be used to parse each line.
    field_regex = re.compile(
        r'<a href="(.+?)">'
        r'<a href=".+?">(.+?)</a>'
        r'<span style="color:.+?666">(.+?)</span>'
    )

    # Parse each line and store the resulting segmentation in an attribute.
    titleSeg = Segmenter.tokenize(
        segmentation=work_list,
        regexes=[
            (field_regex, "tokenize", {"url": "&1"}),
            (field_regex, "tokenize", {"title": "&2"}),
            (field_regex, "tokenize", {"author": "&3"}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )

    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        # The with-block closes the file even if pickling fails midway.
        with open(os.path.join(path, "cached_title_list"), "wb") as cache:
            pickle.dump(titleSeg, cache, -1)
    except IOError:
        # Caching is optional: ignore an unwritable directory.
        pass

    # Remove warning (if any)...
    self.error(0)
    self.warning(0)

    return titleSeg
def getTitleListFromTheatreClassique(self):
    """Fetch titles from the Theatre-classique website.

    Downloads the page at ``self.base_url``, extracts the rows of the
    table whose id is ``table_AA`` and tokenizes every row with one
    annotation per field (author, title, year, genre, url).  The result
    is also pickled, on a best-effort basis, to ``cached_title_list`` in
    this module's directory.

    Returns:
        The title segmentation, or None when the website could not be
        read (None is then also sent on the "XML-TEI data" channel).
    """
    self.infoBox.customMessage(
        "Fetching data from Theatre-classique website, please wait"
    )

    # Attempt to connect to Theatre-classique...
    try:
        response = urllib.request.urlopen(self.base_url)
        base_html = response.read().decode('iso-8859-1')
        self.infoBox.customMessage(
            "Done fetching data from Theatre-classique website."
        )

    # If unable to connect (somehow)...  Exception rather than a bare
    # except, so that KeyboardInterrupt/SystemExit are not swallowed.
    except Exception:
        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(
            warning="Couldn't access theatre-classique website."
        )

        # Empty title list box.
        self.titleLabels = list()

        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None

    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)

    # Remove accents from the data...
    recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)

    # Extract table containing titles from HTML.
    table_seg = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="table",
        conditions={"id": re.compile(r"^table_AA$")},
    )

    # Extract table lines.
    line_seg = Segmenter.import_xml(
        segmentation=table_seg,
        element="tr",
    )

    # Compile the regex that will be used to parse each line.
    field_regex = re.compile(
        r"^\s*<td>\s*<a.+?>(.+?)</a>\s*</td>\s*"
        r"<td>(.+?)</td>\s*"
        r"<td.+?>\s*<a.+?>\s*(\d+?)\s*</a>\s*</td>\s*"
        r"<td.+?>\s*(.+?)\s*</td>\s*"
        r"<td.+?>\s*<a\s+.+?t=\.{2}/(.+?)'>\s*HTML"
    )

    # Parse each line and store the resulting segmentation in an attribute.
    titleSeg = Segmenter.tokenize(
        segmentation=line_seg,
        regexes=[
            (field_regex, "tokenize", {"author": "&1"}),
            (field_regex, "tokenize", {"title": "&2"}),
            (field_regex, "tokenize", {"year": "&3"}),
            (field_regex, "tokenize", {"genre": "&4"}),
            (field_regex, "tokenize", {"url": "&5"}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )

    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        # The with-block closes the file even if pickling fails midway.
        with open(os.path.join(path, "cached_title_list"), "wb") as cache:
            pickle.dump(titleSeg, cache, -1)
    except IOError:
        # Caching is optional: ignore an unwritable directory.
        pass

    # Remove warning (if any)...
    self.error(0)
    self.warning(0)

    return titleSeg
def sendData(self):
    """Tag the input segmentation with TreeTagger and send the result.

    Sends None on "Tagged data" when the TreeTagger path, the language
    parameters or the input segmentation are missing, or when the tagged
    output cannot be segmented into ``<w>`` elements.
    """
    # Clear created Inputs...
    self.clearCreatedInputs()

    # Guard clauses: each missing prerequisite aborts with a warning.
    if not self.TreetaggerPath:
        self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.getAvailableLanguages():
        self.infoBox.setText(self.noLanguageParameterWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.segmentation:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Tagged data", None)
        return

    # Initialize progress bar.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    self.progressBar = ProgressBar(self, iterations=5)

    # Gather the input segments in a new segmentation, keeping their
    # annotations serialized as XML attributes in a temporary annotation.
    seg_copy = Segmentation()
    seg_copy.label = self.segmentation.label
    for segment in self.segmentation:
        attr_parts = []
        for key, value in segment.annotations.items():
            # Drop combining marks ('Mn') from the attribute name.
            decomposed = unicodedata.normalize('NFD', key)
            plain_key = ''.join(
                c for c in decomposed if unicodedata.category(c) != 'Mn'
            )
            attr_parts.append(
                "%s=%s" % (plain_key, quoteattr(str(value)))
            )
        segment.annotations["tt_ax"] = " ".join(attr_parts)
        seg_copy.append(segment)
    self.progressBar.advance()

    # Dump segmentation in unique string to avoid multiple calls to TT...
    text_to_tag = seg_copy.to_string(
        formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
        display_all=True,
    )
    self.progressBar.advance()

    # Tag the segmentation contents...
    tagopt = '-token -lemma -sgml -quiet'
    if self.replaceUnknown:
        tagopt += " -no-unknown"
    language_code = pycountry.languages.get(name=self.language).alpha_2
    tagger = treetaggerwrapper.TreeTagger(
        TAGLANG=language_code,
        TAGOPT=tagopt,
        TAGDIR=self.TreetaggerPath,
    )
    tagged_lines = tagger.tag_text(
        text_to_tag,
        notagurl=True,
        notagemail=True,
        notagip=True,
        notagdns=True,
    )
    tagged_input = Input("\n".join(tagged_lines))
    self.createdInputs.append(tagged_input)

    # Replace <unknown> with [unknown], apply the triple-quote
    # substitution, then re-segment to match the original structure.
    # NOTE(review): the second substitution replaces '"""' with itself,
    # i.e. it is a no-op as written; confirm the intended replacement.
    cleanup_rules = [
        (re.compile(r"<unknown>"), "[unknown]"),
        (re.compile(r'"""'), '"""'),
    ]
    recoded, _ = Segmenter.recode(tagged_input, substitutions=cleanup_rules)
    tagged_segmentation = Segmenter.import_xml(recoded, "ax_tt")
    self.progressBar.advance()

    # Place each output line of Treetagger in an xml tag with annotations..
    wrapping_rules = [
        (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
         '<w lemma="&3" pos-tag="&2">&1</w>'),
        (re.compile(r'^\n|\n$'), ''),
    ]
    xml_segmentation, _ = Segmenter.recode(
        tagged_segmentation,
        substitutions=wrapping_rules,
    )

    # Segment into individual tokens if XML output option is disabled...
    if self.outputFormat == "add XML tags":
        result = xml_segmentation
    else:
        try:
            result = Segmenter.import_xml(xml_segmentation, "w")
        except ValueError:
            self.infoBox.setText(
                "Please check that either the input contains well-formed "
                "XML, or it doesn't contain instances of '<' and '\x3e'",
                "error")
            self.send("Tagged data", None)
            self.progressBar.finish()
            self.controlArea.setDisabled(False)
            return

    self.progressBar.finish()
    self.controlArea.setDisabled(False)

    # Label the output and report how many segments were sent.
    result.label = self.captionTitle
    segment_count = len(result)
    message = u'%i segment@p sent to output.' % segment_count
    message = pluralize(message, len(result))
    self.infoBox.setText(message)
    self.send('Tagged data', result, self)
    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Compute result of widget processing and send to output.

    For every selected corpus, the zip archive is read from the local
    cache or, failing that, downloaded (and cached on a best-effort
    basis).  Each zipped file becomes an Input annotated with its path,
    the Corpus/Lang/PID attributes of its CHAT element when present, and
    data about the target child when exactly one is listed.  The file
    segmentation is sent on "Files"; utterance and word segmentations
    are built and sent only when the corresponding options are set
    (None is sent otherwise).
    """
    if not self.importedCorpora:
        self.infoBox.setText("Please add a corpus to the selection.",
                             "warning")
        self.send("Files", None, self)
        self.send("Utterances", None, self)
        return

    # Clear created Inputs and initialize progress bar...
    self.clearCreatedInputs()
    numberOfSteps = 2 if self.outputUtterances else 1
    numberOfSteps += 2 if self.outputWords else 0
    self.infoBox.setText(
        "(1/%i) Retrieving data, please wait..." % numberOfSteps,
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.importedCorpora))

    annotations = list()

    # Iterate over corpora...
    for importedCorpus in self.importedCorpora:

        corpus = importedCorpus.split("/")[-1]

        # Try to retrieve corpus from cache...
        try:
            basepath = os.path.dirname(
                os.path.abspath(inspect.getfile(inspect.currentframe())))
            corpusFilepath = os.path.normpath(
                os.path.join(
                    basepath,
                    self.__class__.cachedFoldername,
                    importedCorpus[len(self.__class__.baseUrl):],
                ))
            myZip = zipfile.ZipFile(corpusFilepath)

        except IOError:

            # Else try to download (and cache) requested zip file...
            try:
                response = requests.get(importedCorpus)
                myZip = zipfile.ZipFile(io.BytesIO(response.content))
                corpusFolderpath = os.path.dirname(corpusFilepath)
                try:
                    os.makedirs(corpusFolderpath)
                except OSError:
                    # Folder already exists (or cannot be created).
                    pass
                try:
                    # The with-block closes the file even if the write
                    # fails midway.
                    with open(corpusFilepath, "wb") as outputFile:
                        outputFile.write(response.content)
                except IOError:
                    # Caching is optional.
                    pass

            # If an error occurs (e.g. connection error)...  Exception
            # rather than a bare except, so that KeyboardInterrupt and
            # SystemExit are not swallowed.
            except Exception:

                # Set Info box and widget to "error" state.
                self.infoBox.setText(
                    "Couldn't download corpus %s from CHILDES website."
                    % corpus, "error")

                # Reset output channel.
                self.send("Files", None, self)
                self.send("Utterances", None, self)
                progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        # Create Input for each zipped file and store annotations...
        for zippedFile in myZip.infolist():
            file_content = myZip.read(zippedFile).decode('utf-8')

            # If word segmentation is requested...
            if self.outputWords:
                # Implement replacements.
                file_content = re.sub(
                    r"<w.+?(<replacement.+</replacement>).*?</w>",
                    r"\1",
                    file_content,
                )
                # Prepend pre-clitics.
                file_content = re.sub(
                    r"(<mor .+?)(<mor-pre>.+</mor-pre>)",
                    r"\2\1",
                    file_content,
                )
                # Move <gra> into <mw>.
                file_content = re.sub(
                    r"(</mw>)(<gra.+?/>)",
                    r"\2\1",
                    file_content,
                )

            newInput = Input(file_content, self.captionTitle + "_files")
            self.createdInputs.append(newInput)

            chatSeg = Segmenter.import_xml(newInput, "CHAT")
            fileAnnotations = {"file_path": zippedFile.filename}
            annotations.append(fileAnnotations)
            for key in ["Corpus", "Lang", "PID"]:
                try:
                    fileAnnotations[key.lower()] = \
                        chatSeg[0].annotations[key]
                except KeyError:
                    pass

            # Turn self-closing <participant/> tags into regular elements
            # before segmenting them.
            participantListSeg = Segmenter.import_xml(
                newInput, "Participants")
            recodedInput, _ = Segmenter.recode(
                participantListSeg,
                [(re.compile("/>"), "> </participant>")])
            participantSeg = Segmenter.import_xml(recodedInput,
                                                  "participant")

            targetChildData = list()
            for participant in participantSeg:
                if participant.annotations["role"] != "Target_Child":
                    continue
                childData = dict()
                targetChildData.append(childData)
                if "age" in participant.annotations:
                    childData["target_child_age"] = \
                        participant.annotations["age"]
                    age_parse = re.search(
                        r"(\d+)Y(\d+)M(\d+)D",
                        participant.annotations["age"],
                    )
                    if age_parse:
                        childData["target_child_years"] = \
                            age_parse.group(1)
                        # Cumulated age in months, then in days counting
                        # 30 days per month.
                        months = int(age_parse.group(2)) \
                            + 12 * int(age_parse.group(1))
                        childData["target_child_months"] = \
                            '%02d' % months
                        days = int(age_parse.group(3)) \
                            + 30 * months
                        childData["target_child_days"] = \
                            '%02d' % days
                if "id" in participant.annotations:
                    childData["target_child_id"] = \
                        participant.annotations["id"]
                if "sex" in participant.annotations:
                    childData["target_child_sex"] = \
                        participant.annotations["sex"]

            # Child data is only attached when there is no ambiguity.
            if len(targetChildData) == 1:
                fileAnnotations.update(targetChildData[0])

        progressBar.advance()

    # If there's only one file, the widget's output is the created Input...
    if len(self.createdInputs) == 1:
        self.fileSegmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.fileSegmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle + "_files",
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.fileSegmentation):
        segment.annotations.update(annotations[idx])
        self.fileSegmentation[idx] = segment

    # Terminate progress bar...
    progressBar.finish()

    message = "%i file@p" % len(self.fileSegmentation)
    message = pluralize(message, len(self.fileSegmentation))
    self.send("Files", self.fileSegmentation, self)

    # Build utterance segmentation if needed...
    if self.outputUtterances:
        self.infoBox.setText(
            "(2/%i) Building utterance segmentation, please wait..."
            % numberOfSteps,
            "warning",
        )
        progressBar = ProgressBar(
            self, iterations=len(self.fileSegmentation))
        self.utteranceSegmentation = Segmenter.import_xml(
            self.fileSegmentation,
            "u",
            progress_callback=progressBar.advance,
            label=self.captionTitle + "_utterances",
        )
        progressBar.finish()
        message += " and " if not self.outputWords else ", "
        message += "%i utterance@p" % len(self.utteranceSegmentation)
        message = pluralize(message, len(self.utteranceSegmentation))
        self.send("Utterances", self.utteranceSegmentation, self)
    else:
        self.send("Utterances", None, self)

    # Build word segmentation if needed...
    if self.outputWords:
        self.infoBox.setText(
            "(%i/%i) Building word segmentation, please wait..."
            % (2 + (1 if self.outputUtterances else 0), numberOfSteps),
            "warning",
        )
        # Only rely on the utterance segmentation when it was built
        # during this call; an attribute left over from a previous run
        # would describe other data.
        if self.outputUtterances:
            baseSegmentation = self.utteranceSegmentation
        else:
            baseSegmentation = self.fileSegmentation
        progressBar = ProgressBar(self,
                                  iterations=2 * len(baseSegmentation))
        wordSegmentation = Segmenter.import_xml(
            baseSegmentation,
            "w",
            progress_callback=progressBar.advance,
        )
        mwSegmentation = Segmenter.import_xml(
            baseSegmentation,
            "mw",
            progress_callback=progressBar.advance,
        )

        # Analyze words to extract annotations...
        self.infoBox.setText(
            "(%i/%i) Extracting word annotations, please wait..."
            % (3 + (1 if self.outputUtterances else 0), numberOfSteps),
            "warning",
        )
        progressBar.finish()
        progressBar = ProgressBar(self, iterations=len(wordSegmentation))
        wordSegments = list()
        for word in wordSegmentation:
            mws = word.get_contained_segments(mwSegmentation)
            if mws:
                # One copy of the word per <mw> element it contains.
                for mw in mws:
                    wordSegment = word.deepcopy()
                    wordSegment.annotations.update(
                        self.extractWordAnnotations(mw))
                    wordSegments.append(wordSegment)
            else:
                wordSegments.append(word)
            progressBar.advance()

        self.wordSegmentation = Segmentation(
            wordSegments,
            label=self.captionTitle + "_words",
        )

        message += " and %i word@p" % len(self.wordSegmentation)
        message = pluralize(message, len(self.wordSegmentation))
        self.send("Words", self.wordSegmentation, self)
    else:
        self.send("Words", None, self)

    # Set status to OK and report data size...
    message += " sent to output."
    message = pluralize(message, len(self.fileSegmentation))
    self.infoBox.setText(message)
    progressBar.finish()

    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def sendData(self):
    """Tag the input segmentation with TreeTagger and send the result.

    Nothing is tagged (and None is sent on 'Text data') when TreeTagger's
    location is unknown (``self.NoLink``) or when there is no input data.

    Otherwise the input segments are wrapped in ``<xb_tt>`` elements that
    carry their annotations, the concatenated text is passed to
    ``self.tag``, and the tagger output is recoded into ``<w>`` elements.
    Depending on ``self.activer_xml`` the recoded segmentation is sent
    as is, or it is further segmented into ``w`` elements. The value of
    ``self.treetagger_link`` is also written to a 'treetagger_link.txt'
    file before the data are sent.
    """
    # TreeTagger's link was not found: report it and send nothing.
    if self.NoLink:
        self.infoBox.setText(
            u"Sorry, TreeTagger's link not found.",
            "error"
        )
        self.send('Text data', None)

    # Important: if input data is None, propagate this value to output...
    elif not self.inputData:
        self.infoBox.setText(
            u"Widget needs input",
            "warning"
        )
        self.send('Text data', None)

    # Otherwise show that something is happening and do the work...
    else:
        self.infoBox.setText(
            u'TreeTagger is running...',
            "warning"
        )

        # Initialize progress bar.
        self.progressBar = OWGUI.ProgressBar(
            self,
            iterations=5
        )

        # Copy the segmentation, adding an annotation ("tt_xb") that
        # serializes each segment's existing annotations as attributes.
        # Note that the input segments themselves receive the annotation.
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.inputData.label
        for segment in self.inputData:
            attr = " ".join(
                ["%s='%s'" % item for item in segment.annotations.items()]
            )
            segment.annotations["tt_xb"] = attr
            copy_of_input_seg.append(segment)

        # Advance the progress bar by one step.
        self.progressBar.advance()

        concatenated_text = copy_of_input_seg.to_string(
            formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
            display_all=True,
        )

        # Advance the progress bar by one step.
        self.progressBar.advance()

        tagged_text = self.tag(concatenated_text)
        tagged_input = Input(tagged_text)
        tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

        # Advance the progress bar by one step.
        self.progressBar.advance()

        # The recoding step is the same whether or not the XML checkbox
        # is active, so it is performed once here.
        # NOTE(review): the last substitution maps three double quotes
        # onto themselves as written; presumably an escaped form was
        # intended -- confirm against the widget's history.
        xml_segmentation = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), '[unknown]'),
                (re.compile(
                    r"(.+)\t(.+)\t(.+)"),
                    '<w lemma="&3" type="&2">&1</w>'
                ),
                (re.compile(r'"""'), '"""'),
            ],
        )

        # Explicit comparison kept on purpose: only a value equal to
        # True (True or 1) selects the raw XML output.
        if self.activer_xml == True:  # noqa: E712
            # XML checkbox active: output the recoded XML as is.
            final_segmentation = xml_segmentation
        else:
            # XML checkbox inactive: segment the XML into "w" elements.
            final_segmentation = Segmenter.import_xml(
                xml_segmentation,
                "w"
            )

        self.infoBox.dataSent('')

        # Save TreeTagger's link...
        if self.system == "nt":
            link_path = "treetagger_link.txt"
        else:
            link_path = os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt"
            )
        # The context manager closes the file even if writing fails.
        with open(link_path, 'w') as link_file:
            link_file.write(self.treetagger_link)

        # Clear progress bar.
        self.progressBar.finish()

        # Send the segmentation.
        self.send('Text data', final_segmentation, self)
        self.compteur += 1
        self.sendButton.resetSettingsChangedFlag()
def getTitleListFromECP(self):
    """Fetch titles from the ECP website.

    Downloads the page at ``self.base_url``, extracts the ``ul`` elements
    whose ``id`` starts with "genres-list", and tokenizes them into one
    segment per work, annotated with "author", "url" and "title" (the
    "genre" annotation is extracted in an intermediate step).

    The resulting segmentation is also pickled, on a best-effort basis,
    to a "cached_title_list_ecp" file in this module's directory.

    Returns:
        The title segmentation, or None if the website could not be
        accessed. In the latter case the info box is set to a warning
        state, ``self.titleLabels`` is emptied and None is sent on the
        "XML-TEI data" channel.
    """
    self.infoBox.customMessage(
        "Fetching data from ECP website, please wait")

    # Attempt to connect to ECP...
    try:
        # The context manager makes sure the connection is closed.
        with urllib.request.urlopen(self.base_url) as response:
            base_html = response.read().decode('utf-8')
        self.infoBox.customMessage("Done fetching data from ECP website.")

    # If unable to connect (somehow)...
    except Exception:

        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(warning="Couldn't access ECP website.")

        # Empty title list box.
        self.titleLabels = list()

        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None

    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)

    # Remove accents from the data...
    recoded_seg, _ = Segmenter.recode(base_html_seg, remove_accents=True)

    # Extract table containing titles...
    genresListSeg = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="ul",
        conditions={"id": re.compile(r"^genres-list")},
    )

    # NOTE: the patterns below used to end with an inline "(?s)" flag.
    # Global inline flags that are not at the start of the expression
    # raise re.error since Python 3.11, so DOTALL is passed explicitly.

    # Extract genre annotation...
    genreSeg = Segmenter.tokenize(
        segmentation=genresListSeg,
        regexes=[
            (
                re.compile(
                    r'<a id[^>]+>(.+?)</a.+?(?=<a id|$)',
                    re.DOTALL,
                ),
                "tokenize",
                {"genre": "&1"},
            ),
        ],
        import_annotations=False,
    )

    # Extract works...
    titleSeg = Segmenter.tokenize(
        segmentation=genreSeg,
        regexes=[
            (
                re.compile(r'<li class="bibl".+?</span>', re.DOTALL),
                "tokenize",
            ),
        ],
    )

    # Extract annotations...
    titleSeg = Segmenter.tokenize(
        segmentation=titleSeg,
        regexes=[
            (
                re.compile(r"^.*>\n(.+?)</span>.*$", re.DOTALL),
                "tokenize",
                {"author": "&1"},
            ),
            (
                re.compile(
                    r'^.*href="(/works/.+?\.shtml)">.*$',
                    re.DOTALL,
                ),
                "tokenize",
                {"url": "&1"},
            ),
            (
                re.compile(r'^.*shtml">(.*)</a>.*$', re.DOTALL),
                "tokenize",
                {"title": "&1"},
            ),
        ],
        merge_duplicates=True,
    )

    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        with open(os.path.join(path, "cached_title_list_ecp"), "wb") \
                as cache_file:
            pickle.dump(titleSeg, cache_file, -1)
    except IOError:
        # Caching is optional: ignore failures to write the file.
        pass

    # Remove warning (if any)...
    self.error(0)
    self.warning(0)

    return titleSeg
def sendData(self):
    """Have LTTL.Segmenter extract XML elements and send the result.

    The method validates the widget's settings first. Whenever a check
    fails, a message is displayed in the info box, None is sent on the
    'Extracted data' channel and the method returns early. Checks cover:
    missing input, empty element name, empty annotation keys (when the
    corresponding advanced option is active) and invalid condition
    regexes.

    If all checks pass, ``Segmenter.import_xml`` is called with the
    current settings and the resulting segmentation is sent on
    'Extracted data'. A ValueError raised during extraction is reported
    as ill-formed XML input and None is sent instead.
    """
    # Check that there's something on input...
    if not self.inputSegmentation:
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Extracted data', None, self)
        return

    # Check that element field is not empty...
    if not self.element:
        self.infoBox.setText(u'Please type an XML element', 'warning')
        self.send('Extracted data', None, self)
        return

    # TODO: update docs to indicate that angle brackets are optional
    # TODO: remove message 'No label was provided.' from docs

    # Check that importElementAs is not empty (if necessary)...
    if self.displayAdvancedSettings and self.importElement:
        if self.importElementAs:
            importElementAs = self.importElementAs
        else:
            self.infoBox.setText(
                u'Please enter an annotation key for element import.',
                'warning'
            )
            self.send('Extracted data', None, self)
            return
    else:
        importElementAs = None

    # Check that autoNumberKey is not empty (if necessary)...
    # Auto-numbering doubles the number of progress bar iterations.
    if self.displayAdvancedSettings and self.autoNumber:
        if self.autoNumberKey:
            autoNumberKey = self.autoNumberKey
            num_iterations = 2 * len(self.inputSegmentation)
        else:
            self.infoBox.setText(
                u'Please enter an annotation key for auto-numbering.',
                'warning'
            )
            self.send('Extracted data', None, self)
            return
    else:
        autoNumberKey = None
        num_iterations = len(self.inputSegmentation)

    # Prepare conditions...
    conditions = dict()
    if self.displayAdvancedSettings:
        for condition_idx, condition in enumerate(self.conditions):
            attribute = condition[0]
            regex_string = condition[1]

            # Items 2 to 5 of a condition are the i, u, m and s flags.
            # They are passed as compile flags rather than appended as
            # an inline "(?...)" group: global inline flags that are
            # not at the start of the expression raise re.error since
            # Python 3.11.
            flags = 0
            if condition[2]:
                flags |= re.IGNORECASE
            if condition[3]:
                flags |= re.UNICODE
            if condition[4]:
                flags |= re.MULTILINE
            if condition[5]:
                flags |= re.DOTALL

            try:
                conditions[attribute] = re.compile(regex_string, flags)
            except re.error as re_error:
                # Include the error detail when the exception offers
                # a "msg" attribute, and the condition number when
                # there are several conditions.
                try:
                    message = u'Please enter a valid regex (error: %s' % \
                              re_error.msg
                    if len(self.conditions) > 1:
                        message += u', condition #%i' % (condition_idx + 1)
                    message += u').'
                except AttributeError:
                    message = u'Please enter a valid regex'
                    if len(self.conditions) > 1:
                        message += u' (condition #%i)' % (condition_idx + 1)
                    message += u'.'
                self.infoBox.setText(message, 'error')
                self.send('Extracted data', None, self)
                return

    # Basic settings...
    if self.displayAdvancedSettings:
        importAnnotations = self.importAnnotations
        preserveLeaves = self.preserveLeaves
        mergeDuplicates = self.mergeDuplicates
        # Merging duplicates adds one more pass to the progress bar.
        if mergeDuplicates:
            num_iterations += len(self.inputSegmentation)
    else:
        importAnnotations = True
        mergeDuplicates = False
        preserveLeaves = False

    # Perform tokenization...
    self.controlArea.setDisabled(True)
    self.infoBox.setText(u"Processing, please wait...", "warning")
    progressBar = ProgressBar(self, iterations=num_iterations)
    try:
        xml_extracted_data = Segmenter.import_xml(
            segmentation=self.inputSegmentation,
            element=self.element,
            conditions=conditions,
            import_element_as=importElementAs,
            label=self.captionTitle,
            import_annotations=importAnnotations,
            auto_number_as=autoNumberKey,
            remove_markup=self.deleteMarkup,
            merge_duplicates=mergeDuplicates,
            preserve_leaves=preserveLeaves,
            progress_callback=progressBar.advance,
        )
        message = u'%i segment@p sent to output.' % len(xml_extracted_data)
        message = pluralize(message, len(xml_extracted_data))
        self.infoBox.setText(message)
        self.send('Extracted data', xml_extracted_data, self)
    except ValueError:
        self.infoBox.setText(
            message=u'Please make sure that input is well-formed XML.',
            state='error',
        )
        self.send('Extracted data', None, self)

    self.sendButton.resetSettingsChangedFlag()
    progressBar.finish()
    self.controlArea.setDisabled(False)