def test_recode_overlapping_segmentation(self):
    """Check that recode raises ValueError for an overlapping segmentation."""
    failure_msg = (
        "recode doesn't raise exception for overlapping segmentation!"
    )
    with self.assertRaises(ValueError, msg=failure_msg):
        Segmenter.recode(self.overlapping_seg)
def test_recode_overlapping_segmentation(self):
    """Verify that recoding an overlapping segmentation raises ValueError."""
    expect_value_error = self.assertRaises(
        ValueError,
        msg="recode doesn't raise exception for overlapping segmentation!",
    )
    with expect_value_error:
        Segmenter.recode(self.overlapping_seg)
def test_recode_remove_accents(self):
    """Check that recode(remove_accents=True) yields unaccented contents."""
    # recode returns a pair here; only the segmentation is inspected.
    recoded, _ = Segmenter.recode(self.second_word_seg, remove_accents=True)
    contents = [segment.get_content() for segment in recoded]
    self.assertEqual(
        contents,
        ['AB', 'cde'],
        msg="recode doesn't remove accents!",
    )
def test_recode_no_change(self):
    """Check that recode without any option returns a Segmentation."""
    result = Segmenter.recode(self.entire_text_seg)
    is_segmentation = isinstance(result, Segmentation)
    self.assertTrue(
        is_segmentation,
        msg="recode doesn't return a Segmentation when no change is made!",
    )
def test_recode_segmentation_as_input(self):
    """Check that recode(case='upper') on letter_seg returns a Segmentation."""
    result = Segmenter.recode(self.letter_seg, case='upper')
    is_segmentation = isinstance(result, Segmentation)
    self.assertTrue(
        is_segmentation,
        msg="recode doesn't return a Segmentation when input is one!",
    )
def test_recode_single_input(self):
    """Check that recode(case='upper') on entire_text_seg returns an Input."""
    result = Segmenter.recode(self.entire_text_seg, case='upper')
    is_input = isinstance(result, Input)
    self.assertTrue(
        is_input,
        msg="recode doesn't return a single Input object when needed!",
    )
def test_get_real_str_index_recoded(self):
    """Check get_real_str_index() on the last segment of a recoded char_seg.

    The value must equal the str_index of the first segment of char_seg.
    """
    replace_b_or_d = (re.compile(r'[bd]'), 'f')
    recoded = Segmenter.recode(self.char_seg, substitutions=[replace_b_or_d])
    actual_index = recoded[-1].get_real_str_index()
    expected_index = self.char_seg[0].str_index
    self.assertEqual(
        actual_index,
        expected_index,
        msg="get_real_str_index() doesn't work with redirected str index!",
    )
def test_get_real_str_index_recoded(self):
    """Check get_real_str_index() on the last segment of a recoded char_seg.

    The value must equal the str_index of the first segment of char_seg.
    """
    replace_b_or_d = (re.compile(r'[bd]'), 'f')
    # recode returns a pair here; only the segmentation is inspected.
    recoded, _ = Segmenter.recode(
        self.char_seg,
        substitutions=[replace_b_or_d],
    )
    actual_index = recoded[-1].get_real_str_index()
    expected_index = self.char_seg[0].str_index
    self.assertEqual(
        actual_index,
        expected_index,
        msg="get_real_str_index() doesn't work with redirected str index!",
    )
def test_recode_substitutions(self):
    """Check that recode applies the two substitutions to word_seg."""
    rules = [
        (re.compile(r'..'), 'x'),
        (re.compile(r'xe'), 'ex'),
    ]
    # recode returns a pair here; only the segmentation is inspected.
    recoded, _ = Segmenter.recode(self.word_seg, substitutions=rules)
    contents = [segment.get_content() for segment in recoded]
    self.assertEqual(
        contents,
        ['x', 'ex'],
        msg="recode doesn't apply substitutions!",
    )
def test_recode_variable_interpolation(self):
    """Check that '&1'/'&2' in a replacement are filled from regex groups."""
    swap_pairs = (re.compile(r'(.)(.)'), '&2&1')
    # recode returns a pair here; only the segmentation is inspected.
    recoded, _ = Segmenter.recode(self.word_seg, substitutions=[swap_pairs])
    contents = [segment.get_content() for segment in recoded]
    self.assertEqual(
        contents,
        ['ba', 'dce'],
        msg="recode doesn't interpolate variables for substitutions!",
    )
def test_recode_remove_accents(self):
    """Check that recode(remove_accents=True) yields unaccented contents."""
    recoded = Segmenter.recode(self.second_word_seg, remove_accents=True)
    contents = [segment.get_content() for segment in recoded]
    self.assertEqual(
        contents,
        ['AB', 'cde'],
        msg="recode doesn't remove accents!",
    )
def test_recode_lower_case(self):
    """Check that recode(case='lower') lower-cases segment contents."""
    recoded = Segmenter.recode(self.second_word_seg, case='lower')
    contents = [segment.get_content() for segment in recoded]
    self.assertEqual(
        contents,
        ['ab', 'cd\xe9'],
        msg="recode doesn't change case to lower!",
    )
def test_recode_upper_case(self):
    """Check that recode(case='upper') upper-cases segment contents."""
    recoded = Segmenter.recode(self.word_seg, case='upper')
    contents = [segment.get_content() for segment in recoded]
    self.assertEqual(
        contents,
        ['AB', 'CDE'],
        msg="recode doesn't change case to upper!",
    )
def test_recode_copy_annotations_false(self):
    """Check that copy_annotations=False leaves key 'a' out of the output."""
    rules = [(re.compile(r'.'), 'test')]
    # recode returns a pair here; only the segmentation is inspected.
    recoded, _ = Segmenter.recode(
        self.word_seg,
        substitutions=rules,
        copy_annotations=False,
    )
    has_annotation = 'a' in recoded[0].annotations
    self.assertFalse(
        has_annotation,
        msg="recode doesn't skip copying annotations!",
    )
def test_recode_progress(self):
    """Check that recode calls progress_callback once per word_seg segment."""

    def bump_count():
        """Count one progress notification on the test case."""
        self.count += 1

    rules = [
        (re.compile(r'..'), 'x'),
        (re.compile(r'xe'), 'ex'),
    ]
    Segmenter.recode(
        self.word_seg,
        case='upper',
        substitutions=rules,
        progress_callback=bump_count,
    )
    expected_calls = len(self.word_seg)
    self.assertEqual(
        self.count,
        expected_calls,
        msg="recode doesn't track progress!",
    )
def test_recode_copy_annotations(self):
    """Check that copy_annotations=True keeps annotation 'a' == '1'."""
    rules = [(re.compile(r'.'), 'test')]
    # recode returns a pair here; only the segmentation is inspected.
    recoded, _ = Segmenter.recode(
        self.word_seg,
        substitutions=rules,
        copy_annotations=True,
    )
    first_annotations = recoded[0].annotations
    self.assertEqual(
        first_annotations['a'],
        '1',
        msg="recode doesn't copy annotations!",
    )
def test_recode_copy_annotations_false(self):
    """Check that copy_annotations=False leaves key 'a' out of the output."""
    rules = [(re.compile(r'...'), 'test')]
    recoded = Segmenter.recode(
        self.word_seg,
        substitutions=rules,
        copy_annotations=False,
    )
    has_annotation = 'a' in recoded[0].annotations
    self.assertFalse(
        has_annotation,
        msg="recode doesn't skip copying annotations!",
    )
def test_recode_variable_interpolation(self):
    """Check that '&1'/'&2' in a replacement are filled from regex groups."""
    swap_pairs = (re.compile(r'(.)(.)'), '&2&1')
    recoded = Segmenter.recode(self.word_seg, substitutions=[swap_pairs])
    contents = [segment.get_content() for segment in recoded]
    self.assertEqual(
        contents,
        ['ba', 'dce'],
        msg="recode doesn't interpolate variables for substitutions!",
    )
def test_recode_substitutions(self):
    """Check that recode applies the two substitutions to word_seg."""
    rules = [
        (re.compile(r'..'), 'x'),
        (re.compile(r'xe'), 'ex'),
    ]
    recoded = Segmenter.recode(self.word_seg, substitutions=rules)
    contents = [segment.get_content() for segment in recoded]
    self.assertEqual(
        contents,
        ['x', 'ex'],
        msg="recode doesn't apply substitutions!",
    )
def test_recode_copy_annotations(self):
    """Check that copy_annotations=True keeps annotation 'a' == '1'."""
    rules = [(re.compile(r'...'), 'test')]
    recoded = Segmenter.recode(
        self.word_seg,
        substitutions=rules,
        copy_annotations=True,
    )
    first_annotations = recoded[0].annotations
    self.assertEqual(
        first_annotations['a'],
        '1',
        msg="recode doesn't copy annotations!",
    )
def test_recode_substitutions_after(self):
    """Check the result of combining case='upper' with substitutions.

    The expected contents ('x', 'xE') pin the order in which recode applies
    the case change and the substitutions.
    """
    rules = [
        (re.compile(r'..'), 'x'),
        (re.compile(r'xe'), 'ex'),
    ]
    recoded = Segmenter.recode(
        self.word_seg,
        case='upper',
        substitutions=rules,
    )
    contents = [segment.get_content() for segment in recoded]
    self.assertEqual(
        contents,
        ['x', 'xE'],
        msg="recode doesn't apply substitutions after preprocessing!",
    )
def sendData(self):
    """Compute result of widget processing and send to output.

    For every selected corpus, the zip archive is read from the local cache
    or, failing that, downloaded (and cached on a best-effort basis). Each
    zipped file becomes an Input annotated with file- and target-child-level
    metadata. The resulting file segmentation is sent on "Files"; utterance
    and word segmentations are built and sent on "Utterances" and "Words"
    when the corresponding options are enabled, otherwise None is sent.
    """
    if not self.importedCorpora:
        self.infoBox.setText("Please add a corpus to the selection.",
                             "warning")
        # Clear every output channel so that no stale data stays downstream.
        self.send("Files", None, self)
        self.send("Utterances", None, self)
        self.send("Words", None, self)
        return

    # Clear created Inputs and initialize progress bar...
    self.clearCreatedInputs()
    numberOfSteps = 2 if self.outputUtterances else 1
    numberOfSteps += 2 if self.outputWords else 0
    self.infoBox.setText(
        "(1/%i) Retrieving data, please wait..." % numberOfSteps,
        "warning",
    )
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.importedCorpora))

    # One annotation dict per created Input, in creation order.
    annotations = list()

    # Iterate over corpora...
    for importedCorpus in self.importedCorpora:

        corpus = importedCorpus.split("/")[-1]

        # Try to retrieve corpus from cache...
        try:
            basepath = os.path.dirname(
                os.path.abspath(inspect.getfile(inspect.currentframe())))
            corpusFilepath = os.path.normpath(
                os.path.join(
                    basepath,
                    self.__class__.cachedFoldername,
                    importedCorpus[len(self.__class__.baseUrl):],
                ))
            myZip = zipfile.ZipFile(corpusFilepath)

        except IOError:

            # Else try to download (and cache) requested zip file...
            try:
                response = requests.get(importedCorpus)
                myZip = zipfile.ZipFile(io.BytesIO(response.content))
                corpusFolderpath = os.path.dirname(corpusFilepath)
                # Caching is best-effort: failures to create the folder or
                # to write the file are deliberately ignored.
                try:
                    os.makedirs(corpusFolderpath)
                except OSError:
                    pass
                try:
                    with open(corpusFilepath, "wb") as outputFile:
                        outputFile.write(response.content)
                except IOError:
                    pass

            # If an error occurs (e.g. connection error)...
            except Exception:

                # Set Info box and widget to "error" state.
                self.infoBox.setText(
                    "Couldn't download corpus %s from CHILDES website."
                    % corpus,
                    "error")

                # Reset output channels.
                self.send("Files", None, self)
                self.send("Utterances", None, self)
                self.send("Words", None, self)
                progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        # Create Input for each zipped file and store annotations...
        for zippedFile in myZip.infolist():
            file_content = myZip.read(zippedFile).decode('utf-8')

            # If word segmentation is requested...
            if self.outputWords:
                # Implement replacements.
                file_content = re.sub(
                    r"<w.+?(<replacement.+</replacement>).*?</w>",
                    r"\1",
                    file_content,
                )
                # Prepend pre-clitics.
                file_content = re.sub(
                    r"(<mor .+?)(<mor-pre>.+</mor-pre>)",
                    r"\2\1",
                    file_content,
                )
                # Move <gra> into <mw>.
                file_content = re.sub(
                    r"(</mw>)(<gra.+?/>)",
                    r"\2\1",
                    file_content,
                )

            newInput = Input(file_content, self.captionTitle + "_files")
            self.createdInputs.append(newInput)
            chatSeg = Segmenter.import_xml(newInput, "CHAT")
            annotations.append(dict())
            annotations[-1]["file_path"] = zippedFile.filename
            for key in ["Corpus", "Lang", "PID"]:
                try:
                    annotations[-1][key.lower()] = \
                        chatSeg[0].annotations[key]
                except KeyError:
                    pass
            participantListSeg = Segmenter.import_xml(
                newInput, "Participants")
            # Turn self-closing tags into explicit open/close pairs before
            # importing the "participant" elements.
            recodedInput, _ = Segmenter.recode(
                participantListSeg,
                [(re.compile("/>"), "> </participant>")])
            participantSeg = Segmenter.import_xml(recodedInput,
                                                  "participant")
            targetChildData = list()
            for participant in participantSeg:
                if participant.annotations["role"] != "Target_Child":
                    continue
                targetChildData.append(dict())
                if "age" in participant.annotations:
                    targetChildData[-1]["target_child_age"] = \
                        participant.annotations["age"]
                    age_parse = re.search(
                        r"(\d+)Y(\d+)M(\d+)D",
                        participant.annotations["age"],
                    )
                    if age_parse:
                        targetChildData[-1]["target_child_years"] = \
                            age_parse.group(1)
                        # Cumulative months, then cumulative days counting
                        # 30 days per month.
                        months = int(age_parse.group(2)) \
                            + 12 * int(age_parse.group(1))
                        targetChildData[-1]["target_child_months"] = \
                            '%02d' % months
                        days = int(age_parse.group(3)) \
                            + 30 * months
                        targetChildData[-1]["target_child_days"] = \
                            '%02d' % days
                if "id" in participant.annotations:
                    targetChildData[-1]["target_child_id"] = \
                        participant.annotations["id"]
                if "sex" in participant.annotations:
                    targetChildData[-1]["target_child_sex"] = \
                        participant.annotations["sex"]
            # Target-child data is only attached when it is unambiguous.
            if len(targetChildData) == 1:
                annotations[-1].update(targetChildData[0])

        progressBar.advance()

    # If there's only one file, the widget's output is the created Input...
    if len(self.createdInputs) == 1:
        self.fileSegmentation = self.createdInputs[0]

    # Otherwise the widget's output is a concatenation...
    else:
        self.fileSegmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle + "_files",
            import_labels_as=None,
        )

    # Annotate segments...
    for idx, segment in enumerate(self.fileSegmentation):
        segment.annotations.update(annotations[idx])
        self.fileSegmentation[idx] = segment

    # Terminate progress bar...
    progressBar.finish()

    message = "%i file@p" % len(self.fileSegmentation)
    message = pluralize(message, len(self.fileSegmentation))
    self.send("Files", self.fileSegmentation, self)

    # Build utterance segmentation if needed...
    if self.outputUtterances:
        self.infoBox.setText(
            "(2/%i) Building utterance segmentation, please wait..."
            % numberOfSteps,
            "warning",
        )
        progressBar = ProgressBar(self,
                                  iterations=len(self.fileSegmentation))
        self.utteranceSegmentation = Segmenter.import_xml(
            self.fileSegmentation,
            "u",
            progress_callback=progressBar.advance,
            label=self.captionTitle + "_utterances",
        )
        progressBar.finish()
        message += " and " if not self.outputWords else ", "
        message += "%i utterance@p" % len(self.utteranceSegmentation)
        message = pluralize(message, len(self.utteranceSegmentation))
        self.send("Utterances", self.utteranceSegmentation, self)
    else:
        self.send("Utterances", None, self)

    # Build word segmentation if needed...
    if self.outputWords:
        self.infoBox.setText(
            "(%i/%i) Building word segmentation, please wait..."
            % (2 + (1 if self.outputUtterances else 0), numberOfSteps),
            "warning",
        )
        # Only use the utterance segmentation when it was built during this
        # call; one left over from a previous call would refer to Inputs
        # that have since been cleared.
        if self.outputUtterances:
            baseSegmentation = self.utteranceSegmentation
        else:
            baseSegmentation = self.fileSegmentation
        progressBar = ProgressBar(self,
                                  iterations=2 * len(baseSegmentation))
        wordSegmentation = Segmenter.import_xml(
            baseSegmentation,
            "w",
            progress_callback=progressBar.advance,
        )
        mwSegmentation = Segmenter.import_xml(
            baseSegmentation,
            "mw",
            progress_callback=progressBar.advance,
        )

        # Analyze words to extract annotations...
        self.infoBox.setText(
            "(%i/%i) Extracting word annotations, please wait..."
            % (3 + (1 if self.outputUtterances else 0), numberOfSteps),
            "warning",
        )
        progressBar.finish()
        progressBar = ProgressBar(self, iterations=len(wordSegmentation))
        wordSegments = list()
        for word in wordSegmentation:
            mws = word.get_contained_segments(mwSegmentation)
            if mws:
                # One output segment per contained <mw>, each carrying that
                # <mw>'s annotations.
                for mw in mws:
                    wordSegment = word.deepcopy()
                    wordSegment.annotations.update(
                        self.extractWordAnnotations(mw))
                    wordSegments.append(wordSegment)
            else:
                wordSegments.append(word)
            progressBar.advance()

        self.wordSegmentation = Segmentation(
            wordSegments,
            label=self.captionTitle + "_words",
        )

        message += " and %i word@p" % len(self.wordSegmentation)
        message = pluralize(message, len(self.wordSegmentation))
        self.send("Words", self.wordSegmentation, self)
    else:
        self.send("Words", None, self)

    # Set status to OK and report data size...
    message += " sent to output."
    message = pluralize(message, len(self.fileSegmentation))
    self.infoBox.setText(message)
    progressBar.finish()

    self.controlArea.setDisabled(False)

    self.sendButton.resetSettingsChangedFlag()
def test_recode_no_change(self):
    """Check that recode without any option returns a Segmentation."""
    # recode returns a pair here; only the first element is checked.
    result, _ = Segmenter.recode(self.entire_text_seg)
    is_segmentation = isinstance(result, Segmentation)
    self.assertTrue(
        is_segmentation,
        msg="recode doesn't return a Segmentation when no change is made!",
    )
def sendData(self):
    """Tag the input segmentation with TreeTagger and send the result.

    The input segments are serialized into one string (each wrapped in an
    <ax_tt> element carrying its annotations), tagged in a single TreeTagger
    call, then re-segmented. Depending on the output format, the result is
    either one XML-tagged segment per input segment or one segment per token.
    """
    # Clear created Inputs...
    self.clearCreatedInputs()

    # Preconditions: each failure sets a warning and clears the output.
    if not self.TreetaggerPath:
        self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.getAvailableLanguages():
        self.infoBox.setText(self.noLanguageParameterWarning, "warning")
        self.send("Tagged data", None)
        return
    if not self.segmentation:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Tagged data", None)
        return

    # Initialize progress bar.
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    self.progressBar = ProgressBar(self, iterations=5)

    # Create a copy of input seg, storing annotations in temp attr...
    seg_copy = Segmentation()
    seg_copy.label = self.segmentation.label
    for segment in self.segmentation:
        attr_pairs = []
        for key, value in segment.annotations.items():
            # Strip combining marks from the key; quote the value as an XML
            # attribute value.
            decomposed_key = unicodedata.normalize('NFD', key)
            plain_key = ''.join(
                char for char in decomposed_key
                if unicodedata.category(char) != 'Mn'
            )
            attr_pairs.append(
                "%s=%s" % (plain_key, quoteattr(str(value)))
            )
        segment.annotations["tt_ax"] = " ".join(attr_pairs)
        seg_copy.append(segment)
    self.progressBar.advance()

    # Dump segmentation in unique string to avoid multiple calls to TT...
    concatenated_text = seg_copy.to_string(
        formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
        display_all=True,
    )
    self.progressBar.advance()

    # Tag the segmentation contents...
    tagopt = '-token -lemma -sgml -quiet'
    tagopt += " -no-unknown" if self.replaceUnknown else ""
    language_code = pycountry.languages.get(name=self.language).alpha_2
    tagger = treetaggerwrapper.TreeTagger(
        TAGLANG=language_code,
        TAGOPT=tagopt,
        TAGDIR=self.TreetaggerPath,
    )
    tagged_lines = tagger.tag_text(
        concatenated_text,
        notagurl=True,
        notagemail=True,
        notagip=True,
        notagdns=True,
    )
    tagged_input = Input("\n".join(tagged_lines))
    self.createdInputs.append(tagged_input)

    # Replace <unknown> with [unknown], then re-segment to match the
    # original segmentation structure.
    # NOTE(review): as written, the second substitution replaces '"""' with
    # the identical string, i.e. it has no effect -- presumably the
    # replacement was meant to be an escaped form; confirm.
    cleanup_rules = [
        (re.compile(r"<unknown>"), "[unknown]"),
        (re.compile(r'"""'), '"""'),
    ]
    tagged_segmentation, _ = Segmenter.recode(
        tagged_input,
        substitutions=cleanup_rules,
    )
    tagged_segmentation = Segmenter.import_xml(tagged_segmentation, "ax_tt")
    self.progressBar.advance()

    # Place each output line of Treetagger in an xml tag with annotations..
    token_rules = [
        (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
         '<w lemma="&3" pos-tag="&2">&1</w>'),
        (re.compile(r'^\n|\n$'), ''),
    ]
    xml_segmentation, _ = Segmenter.recode(
        tagged_segmentation,
        substitutions=token_rules,
    )

    # Segment into individual tokens if XML output option is disabled...
    if self.outputFormat != "add XML tags":
        try:
            output_segmentation = Segmenter.import_xml(
                xml_segmentation, "w")
        except ValueError:
            self.infoBox.setText(
                "Please check that either the input contains well-formed "
                "XML, or it doesn't contain instances of '<' and '\x3e'",
                "error")
            self.send("Tagged data", None)
            self.progressBar.finish()
            self.controlArea.setDisabled(False)
            return
    else:
        output_segmentation = xml_segmentation

    self.progressBar.finish()
    self.controlArea.setDisabled(False)

    output_segmentation.label = self.captionTitle
    segment_count = len(output_segmentation)
    message = u'%i segment@p sent to output.' % segment_count
    message = pluralize(message, segment_count)
    self.infoBox.setText(message)
    self.send('Tagged data', output_segmentation, self)

    self.sendButton.resetSettingsChangedFlag()
def getTitleListFromTheatreClassique(self):
    """Fetch titles from the Theatre-classique website.

    Returns:
        The segmentation of titles (annotated with author, title, year,
        genre and url), or None when the website could not be reached. On
        success the segmentation is also pickled, on a best-effort basis,
        to "cached_title_list" next to this module.
    """
    self.infoBox.customMessage(
        "Fetching data from Theatre-classique website, please wait"
    )

    # Attempt to connect to Theatre-classique...
    try:
        response = urllib.request.urlopen(self.base_url)
        base_html = response.read().decode('iso-8859-1')
        self.infoBox.customMessage(
            "Done fetching data from Theatre-classique website."
        )

    # If unable to connect (somehow)...
    except Exception:

        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(
            warning="Couldn't access theatre-classique website."
        )

        # Empty title list box.
        self.titleLabels = list()

        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None

    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)

    # Remove accents from the data...
    # NOTE(review): the result is used directly as a segmentation; if the
    # installed LTTL returns a (segmentation, count) pair from recode, this
    # needs unpacking -- confirm against the LTTL version in use.
    recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)

    # Extract table containing titles from HTML.
    table_seg = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="table",
        conditions={"id": re.compile(r"^table_AA$")},
    )

    # Extract table lines.
    line_seg = Segmenter.import_xml(
        segmentation=table_seg,
        element="tr",
    )

    # Compile the regex that will be used to parse each line.
    field_regex = re.compile(
        r"^\s*<td>\s*<a.+?>(.+?)</a>\s*</td>\s*"
        r"<td>(.+?)</td>\s*"
        r"<td.+?>\s*<a.+?>\s*(\d+?)\s*</a>\s*</td>\s*"
        r"<td.+?>\s*(.+?)\s*</td>\s*"
        r"<td.+?>\s*<a\s+.+?t=\.{2}/(.+?)'>\s*HTML"
    )

    # Parse each line and store the resulting segmentation in an attribute.
    # The same regex is applied once per field; merge_duplicates combines
    # the resulting annotations on a single segment per line.
    titleSeg = Segmenter.tokenize(
        segmentation=line_seg,
        regexes=[
            (field_regex, "tokenize", {"author": "&1"}),
            (field_regex, "tokenize", {"title": "&2"}),
            (field_regex, "tokenize", {"year": "&3"}),
            (field_regex, "tokenize", {"genre": "&4"}),
            (field_regex, "tokenize", {"url": "&5"}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )

    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        # "with" guarantees the handle is closed even if pickling fails.
        with open(os.path.join(path, "cached_title_list"), "wb") as cache:
            pickle.dump(titleSeg, cache, -1)
    except IOError:
        pass

    # Remove warning (if any)...
    self.error(0)
    self.warning(0)

    return titleSeg
def sendData(self):
    """(Have LTTL.Segmenter) perform the actual recoding.

    Validates the input, compiles the substitutions defined in the basic or
    advanced settings, runs Segmenter.recode and sends the result on the
    'Recoded data' channel. On invalid input or invalid regexes, an error
    or warning is displayed and None is sent instead.
    """

    # Check that there's something on input...
    if not self.segmentation:
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Recoded data', None, self)
        return

    # Check that segmentation is non-overlapping...
    if not self.segmentation.is_non_overlapping():
        self.infoBox.setText(
            message=u'Please make sure that input segments are not ' +
                    u'overlapping.',
            state='error'
        )
        self.send('Recoded data', None, self)
        return

    # TODO: remove label message in doc

    # Get substitutions from basic or advanced settings...
    # Each substitution is [regex, replacement, i-flag, u-flag, m-flag,
    # s-flag]; the flags are booleans.
    if self.displayAdvancedSettings:
        mySubstitutions = self.substitutions
    else:
        mySubstitutions = [[
            self.regex,
            self.replString,
            False,
            True,
            False,
            False,
        ]]

    # Basic settings...
    if self.displayAdvancedSettings:
        copyAnnotations = self.copyAnnotations
    else:
        copyAnnotations = True

    # Prepare regexes...
    substitutions = list()
    for subst_idx, subst in enumerate(mySubstitutions):
        regex_string = subst[0]
        flags = ''.join(
            letter for letter, enabled in zip('iums', subst[2:6])
            if enabled
        )
        if flags:
            # Inline global flags must come FIRST: Python >= 3.11 rejects
            # them anywhere else in the pattern (deprecated since 3.6).
            regex_string = '(?%s)' % flags + regex_string
        try:
            substitutions.append((re.compile(regex_string), subst[1]))
        except re.error as re_error:
            several = self.displayAdvancedSettings \
                and len(mySubstitutions) > 1
            try:
                message = u'Please enter a valid regex (error: %s' % \
                          re_error.msg
                if several:
                    message += u', substitution #%i' % (subst_idx + 1)
                message += u').'
            except AttributeError:
                # Fallback for error objects without a "msg" attribute.
                message = u'Please enter a valid regex'
                if several:
                    message += u' (substitution #%i)' % (subst_idx + 1)
                message += u'.'
            self.infoBox.setText(message, 'error')
            self.send('Recoded data', None, self)
            return

    # Perform recoding...
    self.clearCreatedInputIndices()
    previousNumInputs = len(Segmentation.data)
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(self.segmentation))
    try:
        recoded_data, num_subs = Segmenter.recode(
            segmentation=self.segmentation,
            substitutions=substitutions,
            label=self.captionTitle,
            copy_annotations=copyAnnotations,
            progress_callback=progressBar.advance,
        )
        # Remember which entries of Segmentation.data this call added.
        newNumInputs = len(Segmentation.data)
        self.createdInputIndices = range(previousNumInputs, newNumInputs)
        message = u'%i segment@p sent to output' % len(recoded_data)
        message = pluralize(message, len(recoded_data))
        if num_subs:
            message += u' (%i replacement@p performed).' % num_subs
            message = pluralize(message, num_subs)
        else:
            message += u" (no replacements performed)."
        self.infoBox.setText(message)
        self.send('Recoded data', recoded_data, self)
    except re.error as re_error:
        try:
            if str(re_error) == 'invalid group reference':
                message = u'Reference to unmatched group in ' + \
                          u' annotation key and/or value.'
            else:
                message = u'Please enter a valid regex (error: %s).' % \
                          str(re_error)
        except AttributeError:
            message = u'Please enter a valid regex.'
        self.infoBox.setText(message, 'error')
        self.send('Recoded data', None, self)
    self.sendButton.resetSettingsChangedFlag()
    progressBar.finish()
    self.controlArea.setDisabled(False)
def getTitleListFromEighteenthCenturyPoetry(self):
    """Fetch titles from the ECP website.

    Returns:
        The segmentation of works (annotated with url, title and author),
        or None when the website could not be reached. On success the
        segmentation is also pickled, on a best-effort basis, to
        "cached_title_list" next to this module.
    """
    self.infoBox.customMessage(
        "Fetching data from ECP website, please wait"
    )

    # Attempt to connect to ECP...
    try:
        response = urllib.request.urlopen(self.base_url)
        base_html = response.read().decode('iso-8859-1')
        self.infoBox.customMessage(
            "Done fetching data from EighteenthCenturyPoetry website."
        )

    # If unable to connect (somehow)...
    except Exception:

        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(
            warning="Couldn't access EighteenthCenturyPoetry website."
        )

        # Empty title list box.
        self.titleLabels = list()

        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None

    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)

    # Remove accents from the data...
    # NOTE(review): the result is used directly as a segmentation; if the
    # installed LTTL returns a (segmentation, count) pair from recode, this
    # needs unpacking -- confirm against the LTTL version in use.
    recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)

    # Extract works.
    genre_corpus = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="ul",
        conditions={"class": re.compile(r"^genres-list$")},
    )
    # "regexes" is given as a list of (regex, mode) tuples, as in the final
    # tokenize call of this method.
    genre_list = Segmenter.tokenize(
        segmentation=genre_corpus,
        regexes=[(re.compile(r"<a.+$"), "tokenize")],
        import_annotations=False,
        merge_duplicates=True,
    )
    # Patterns that contain double quotes are written as single-quoted raw
    # strings so that they are valid Python.
    work_list = Segmenter.tokenize(
        segmentation=genre_list,
        regexes=[(re.compile(r'<li class="bibl">(.+?)</li>'), "tokenize")],
        import_annotations=False,
        merge_duplicates=True,
    )

    # Compile the regex that will be used to parse each line.
    field_regex = re.compile(
        r'<a href="(.+?)">'
        r'<a href=".+?">(.+?)</a>'
        r'<span style="color:.+?666">(.+?)</span>'
    )

    # Parse each line and store the resulting segmentation in an attribute.
    titleSeg = Segmenter.tokenize(
        segmentation=work_list,
        regexes=[
            (field_regex, "tokenize", {"url": "&1"}),
            (field_regex, "tokenize", {"title": "&2"}),
            (field_regex, "tokenize", {"author": "&3"}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )

    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        # "with" guarantees the handle is closed even if pickling fails.
        with open(os.path.join(path, "cached_title_list"), "wb") as cache:
            pickle.dump(titleSeg, cache, -1)
    except IOError:
        pass

    # Remove warning (if any)...
    self.error(0)
    self.warning(0)

    return titleSeg
def sendData(self):
    """Tag the input data with TreeTagger and send the result.

    Sends None on "Text data" when the TreeTagger link is missing or when
    there is no input. Otherwise the input segments are serialized into one
    string, tagged through self.tag(), re-segmented, and sent either as
    XML-tagged segments or as one segment per token, depending on
    self.activer_xml.
    """
    # If the link to TreeTagger was not found...
    if self.NoLink:
        self.infoBox.setText(u"Sorry, TreeTagger's link not found.",
                             "error")
        self.send("Text data", None)

    # Important: if input data is None, propagate this value to output...
    elif not self.inputData:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send("Text data", None)

    # Otherwise show that something is happening and process the data...
    else:
        self.infoBox.setText(u"TreeTagger is running...", "warning")

        # Initialize progress bar.
        self.progressBar = gui.ProgressBar(self, iterations=5)

        # Copy the segmentation, storing each segment's annotations as a
        # ready-made XML attribute string in a temporary annotation...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.inputData.label
        for segment in self.inputData:
            attr = " ".join(
                ["%s='%s'" % item for item in segment.annotations.items()])
            segment.annotations["tt_xb"] = attr
            copy_of_input_seg.append(segment)

        self.progressBar.advance()

        # Dump everything into a single string to be tagged in one go.
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
            display_all=True,
        )

        self.progressBar.advance()

        tagged_text = self.tag(concatenated_text)
        tagged_input = Input(tagged_text)
        tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

        self.progressBar.advance()

        # Wrap each tagger output line in a <w> element; this step is the
        # same whether or not XML output is requested.
        # NOTE(review): as written, the last substitution replaces '"""'
        # with the identical string, i.e. it has no effect -- confirm the
        # intended replacement.
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 "<w lemma='&3' type='&2'>&1</w>"),
                (re.compile(r'"""'), '"""'),
            ],
        )

        # If the XML checkbox is active, output the tagged XML as is...
        if self.activer_xml == True:
            final_segmentation = xml_segmentation
        # ...otherwise segment it into individual tokens.
        else:
            final_segmentation = Segmenter.import_xml(
                xml_segmentation, "w")

        self.infoBox.dataSent("")

        # Save the link to TreeTagger...
        if self.system == "nt":
            link_path = "treetagger_link.txt"
        else:
            link_path = os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt")
        # "with" guarantees the handle is closed even if writing fails.
        with open(link_path, "w") as link_file:
            link_file.write(self.treetagger_link)

        # Clear progress bar.
        self.progressBar.finish()

        # Send the segmentation.
        self.send("Text data", final_segmentation, self)
        self.compteur += 1
        # NOTE(review): indentation was lost in the reviewed source; the
        # flag reset is kept on the success path only -- confirm.
        self.sendButton.resetSettingsChangedFlag()
def getTitleListFromTheatreClassique(self): """Fetch titles from the Theatre-classique website""" self.infoBox.customMessage( u'Fetching data from Theatre-classique website, please wait') # Attempt to connect to Theatre-classique... try: response = urllib2.urlopen(self.base_url) base_html = unicode(response.read(), 'iso-8859-1') self.infoBox.customMessage( u'Done fetching data from Theatre-classique website.') # If unable to connect (somehow)... except: # Set Info box and widget to 'warning' state. self.infoBox.noDataSent( warning=u"Couldn't access theatre-classique website.") # Empty title list box. self.titleLabels = list() # Reset output channel. self.send(u'Text data', None, self) return None # Otherwise store HTML content in LTTL Input object. base_html_seg = Input(base_html) # Remove accents from the data... recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True) # Extract table containing titles from HTML. table_seg = Segmenter.import_xml( segmentation=recoded_seg, element=u'table', conditions={u'id': re.compile(ur'^table_AA$')}, ) # Extract table lines. line_seg = Segmenter.import_xml( segmentation=table_seg, element=u'tr', ) # Compile the regex that will be used to parse each line. field_regex = re.compile( ur"^\s*<td>\s*<a.+?>(.+?)</a>\s*</td>\s*" ur"<td>(.+?)</td>\s*" ur"<td.+?>\s*<a.+?>\s*(\d+?)\s*</a>\s*</td>\s*" ur"<td.+?>\s*(.+?)\s*</td>\s*" ur"<td.+?>\s*<a\s+.+?t=\.{2}/(.+?)'>\s*HTML") # Parse each line and store the resulting segmentation in an attribute. titleSeg = Segmenter.tokenize( segmentation=line_seg, regexes=[ (field_regex, u'tokenize', { u'author': u'&1' }), (field_regex, u'tokenize', { u'title': u'&2' }), (field_regex, u'tokenize', { u'year': u'&3' }), (field_regex, u'tokenize', { u'genre': u'&4' }), (field_regex, u'tokenize', { u'url': u'&5' }), ], import_annotations=False, merge_duplicates=True, ) # Try to save list in this module's directory for future reference... 
path = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) try: file = open(os.path.join(path, u"cached_title_list"), u'wb') pickle.dump(titleSeg, file, -1) file.close() except IOError: pass # Remove warning (if any)... self.error(0) self.warning(0) return titleSeg
def sendData(self):
    """Tag the input data with TreeTagger and send the result.

    Sends None on 'Text data' when the TreeTagger link is missing or when
    there is no input. Otherwise the input segments are serialized into one
    string, tagged through self.tag(), re-segmented, and sent either as
    XML-tagged segments or as one segment per token, depending on
    self.activer_xml.
    """
    # If the link to TreeTagger was not found...
    if self.NoLink:
        self.infoBox.setText(u"Sorry, TreeTagger's link not found.",
                             "error")
        self.send('Text data', None)

    # Important: if input data is None, propagate this value to output...
    elif not self.inputData:
        self.infoBox.setText(u"Widget needs input", "warning")
        self.send('Text data', None)

    # Otherwise show that something is happening and process the data...
    else:
        self.infoBox.setText(u'TreeTagger is running...', "warning")

        # Initialize progress bar.
        self.progressBar = OWGUI.ProgressBar(self, iterations=5)

        # Store each segment's annotations as a ready-made XML attribute
        # string in a temporary "tt_xb" annotation...
        for seg_idx, segment in enumerate(self.inputData):
            # The segments are modified in place, so "tt_xb" is still there
            # when the widget is run again on the same input; skip it so
            # that it is not serialized into itself.
            attr = " ".join(
                "%s='%s'" % item
                for item in segment.annotations.items()
                if item[0] != "tt_xb"
            )
            segment.annotations["tt_xb"] = attr
            self.inputData[seg_idx] = segment

        self.progressBar.advance()

        # Dump everything into a single string to be tagged in one go.
        concatenated_text = self.inputData.to_string(
            formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
            display_all=True,
        )

        self.progressBar.advance()

        tagged_text = self.tag(concatenated_text)
        tagged_input = Input(tagged_text)
        tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

        self.progressBar.advance()

        # Wrap each tagger output line in a <w> element; this step is the
        # same whether or not XML output is requested.
        # NOTE(review): as written, the last substitution replaces '"""'
        # with the identical string, i.e. it has no effect -- confirm the
        # intended replacement.
        xml_segmentation = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), '[unknown]'),
                (re.compile(r"(.+)\t(.+)\t(.+)"),
                 '<w lemma="&3" type="&2">&1</w>'),
                (re.compile(r'"""'), '"""'),
            ],
        )

        # If the XML checkbox is active, output the tagged XML as is...
        if self.activer_xml == True:
            final_segmentation = xml_segmentation
        # ...otherwise segment it into individual tokens.
        else:
            final_segmentation = Segmenter.import_xml(
                xml_segmentation, "w")

        self.infoBox.dataSent('')

        # Save the link to TreeTagger...
        if self.system == "nt":
            link_path = "treetagger_link.txt"
        else:
            link_path = os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt")
        # "with" guarantees the handle is closed even if writing fails.
        with open(link_path, 'w') as link_file:
            link_file.write(self.treetagger_link)

        # Clear progress bar.
        self.progressBar.finish()

        # Send the segmentation.
        self.send('Text data', final_segmentation, self)
        self.compteur += 1
        # NOTE(review): indentation was lost in the reviewed source; the
        # flag reset is kept on the success path only -- confirm.
        self.sendButton.resetSettingsChangedFlag()
def getTitleListFromECP(self):
    """Fetch titles from the ECP website.

    Downloads the page at ``self.base_url``, removes accents from it,
    extracts the ``<ul>`` element whose id starts with ``genres-list`` and
    tokenizes it into one segment per work, annotated with ``genre``,
    ``author``, ``url`` and ``title``. The resulting segmentation is also
    pickled to a ``cached_title_list_ecp`` file next to this module
    (best effort).

    Returns:
        The segmentation of titles, or ``None`` when the website could not
        be read (in which case ``self.titleLabels`` is emptied and ``None``
        is sent on the "XML-TEI data" channel).
    """
    self.infoBox.customMessage(
        "Fetching data from ECP website, please wait"
    )

    # Attempt to connect to ECP...
    try:
        # The context manager closes the connection once the page is read.
        with urllib.request.urlopen(self.base_url) as response:
            base_html = response.read().decode('utf-8')
        self.infoBox.customMessage("Done fetching data from ECP website.")

    # If unable to connect (somehow)...
    # Catch Exception rather than everything so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    except Exception:

        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(warning="Couldn't access ECP website.")

        # Empty title list box.
        self.titleLabels = list()

        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None

    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)

    # Remove accents from the data...
    recoded_seg, _ = Segmenter.recode(base_html_seg, remove_accents=True)

    # Extract table containing titles...
    genresListSeg = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="ul",
        conditions={"id": re.compile(r"^genres-list")},
    )

    # NOTE: the patterns below need "." to match newlines. The flag is
    # passed as re.DOTALL because a trailing inline "(?s)" is rejected by
    # Python 3.11+ (global flags must be at the start of the expression).

    # Extract genre annotation...
    genreSeg = Segmenter.tokenize(
        segmentation=genresListSeg,
        regexes=[
            (
                re.compile(
                    r'<a id[^>]+>(.+?)</a.+?(?=<a id|$)', re.DOTALL
                ),
                "tokenize",
                {"genre": "&1"},
            ),
        ],
        import_annotations=False,
    )

    # Extract works...
    titleSeg = Segmenter.tokenize(
        segmentation=genreSeg,
        regexes=[
            (
                re.compile(r'<li class="bibl".+?</span>', re.DOTALL),
                "tokenize",
            ),
        ],
    )

    # Extract annotations...
    titleSeg = Segmenter.tokenize(
        segmentation=titleSeg,
        regexes=[
            (
                re.compile(r"^.*>\n(.+?)</span>.*$", re.DOTALL),
                "tokenize",
                {"author": "&1"},
            ),
            (
                re.compile(
                    r'^.*href="(/works/.+?\.shtml)">.*$', re.DOTALL
                ),
                "tokenize",
                {"url": "&1"},
            ),
            (
                re.compile(r'^.*shtml">(.*)</a>.*$', re.DOTALL),
                "tokenize",
                {"title": "&1"},
            ),
        ],
        merge_duplicates=True,
    )

    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        cache_path = os.path.join(path, "cached_title_list_ecp")
        with open(cache_path, "wb") as cache_file:
            pickle.dump(titleSeg, cache_file, -1)
    except IOError:
        # Caching is best effort only: an unwritable directory must not
        # prevent the title list from being returned.
        pass

    # Remove warning (if any)...
    self.error(0)
    self.warning(0)

    return titleSeg
def sendData(self):
    """Tag the input segmentation with TreeTagger and send the result.

    ``None`` is sent on the 'Text data' channel when the TreeTagger link is
    missing (``self.NoLink``) or when there is no input data.

    Otherwise the input segments are gathered in a new ``Segmentation``
    carrying the input's label, each with a ``tt_xb`` annotation holding
    its other annotations formatted as ``key='value'`` pairs. The segments
    are concatenated as ``<xb_tt ...>content</xb_tt>`` elements and the
    resulting text is passed to ``self.tag()``. Lines of the tagger output
    made of three tab-separated fields are recoded into
    ``<w lemma="..." type="...">...</w>`` elements. When ``self.activer_xml``
    is set the recoded segmentation is sent as is; otherwise it is further
    segmented on the ``<w>`` elements.

    As a side effect, ``self.treetagger_link`` is written to a
    ``treetagger_link.txt`` file and ``self.compteur`` is incremented.
    """
    # TreeTagger's link was not found.
    if self.NoLink:
        self.infoBox.setText(
            u"Sorry, TreeTagger's link not found.",
            "error"
        )
        self.send('Text data', None)
        return

    # Important: if input data is None, propagate this value to output...
    if not self.inputData:
        self.infoBox.setText(
            u"Widget needs input",
            "warning"
        )
        self.send('Text data', None)
        return

    # Let the user know that something is happening...
    self.infoBox.setText(
        u'TreeTagger is running...',
        "warning"
    )

    # Initialize progress bar.
    self.progressBar = OWGUI.ProgressBar(
        self,
        iterations=5
    )

    # Copy the segmentation, adding an annotation to each segment...
    copy_of_input_seg = Segmentation()
    copy_of_input_seg.label = self.inputData.label
    for segment in self.inputData:
        # Skip any "tt_xb" annotation already present on the segment (e.g.
        # from a previous run): embedding it in the new attribute string
        # would nest quotes and yield malformed markup.
        attr = " ".join(
            "%s='%s'" % item
            for item in segment.annotations.items()
            if item[0] != "tt_xb"
        )
        segment.annotations["tt_xb"] = attr
        copy_of_input_seg.append(segment)

    # Advance the progress bar by one step.
    self.progressBar.advance()

    concatenated_text = copy_of_input_seg.to_string(
        formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
        display_all=True,
    )

    # Advance the progress bar by one step.
    self.progressBar.advance()

    tagged_text = self.tag(concatenated_text)
    tagged_input = Input(tagged_text)
    tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

    # Advance the progress bar by one step.
    self.progressBar.advance()

    # The recoding step is the same whether or not the XML option is on.
    # NOTE(review): the last substitution has identical pattern and
    # replacement, so it is a no-op as written -- presumably the
    # replacement was meant to escape the quote character; confirm.
    xml_segmentation = Segmenter.recode(
        tagged_segmentation,
        substitutions=[
            (re.compile(r"<unknown>"), '[unknown]'),
            (
                re.compile(r"(.+)\t(.+)\t(.+)"),
                '<w lemma="&3" type="&2">&1</w>',
            ),
            (re.compile(r'"""'), '"""'),
        ],
    )

    if self.activer_xml:
        # XML checkbox enabled: send the recoded XML as is.
        final_segmentation = xml_segmentation
    else:
        # XML checkbox disabled: segment on the <w> elements.
        final_segmentation = Segmenter.import_xml(
            xml_segmentation,
            "w"
        )

    self.infoBox.dataSent('')

    # Save TreeTagger's link...
    if self.system == "nt":
        link_path = "treetagger_link.txt"
    else:
        link_path = os.path.normpath(
            "/Users/" + self.user + "/treetagger_link.txt"
        )
    # The context manager closes the file even if the write fails.
    with open(link_path, 'w') as link_file:
        link_file.write(self.treetagger_link)

    # Clear progress bar.
    self.progressBar.finish()

    # Send the segmentation.
    self.send('Text data', final_segmentation, self)
    self.compteur += 1
    self.sendButton.resetSettingsChangedFlag()