def test_tokenize_exception_mode(self):
    """Does tokenize raise exception for unknown mode?"""
    with self.assertRaises(
        ValueError,
        msg="tokenize doesn't raise exception for unknown mode!"
    ):
        Segmenter.tokenize(
            self.entire_text_seg,
            [(re.compile(r'\W+'), 'unknown_mode')],
        )
def test_tokenize_progress(self):
    """Does tokenize track progress?"""

    def progress_callback():
        """Mock progress callback"""
        self.count += 1

    Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'\w'), 'tokenize')],
        progress_callback=progress_callback,
    )
    self.assertEqual(
        self.count,
        len(self.word_seg),
        msg="tokenize doesn't track progress!"
    )
def test_tokenize_import_annotations_false_split(self):
    """Does tokenize skip importing annotations (mode split)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'a'), 'split')],
        import_annotations=False,
    )
    self.assertFalse(
        'a' in segmentation[0].annotations,
        msg="tokenize doesn't skip importing annotations (mode split)!"
    )
def test_tokenize_import_annotations_tokenize(self):
    """Does tokenize import annotations (mode tokenize)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'\w{2}'), 'tokenize')],
        import_annotations=True,
    )
    self.assertEqual(
        segmentation[0].annotations['a'],
        '1',
        msg="tokenize doesn't import annotations (mode tokenize)!"
    )
def test_tokenize_import_annotations_split(self):
    """Does tokenize import annotations (mode split)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'a'), 'split')],
    )
    self.assertEqual(
        segmentation[0].annotations['a'],
        '1',
        msg="tokenize doesn't import annotations (mode split)!"
    )
def test_tokenize_autonumber(self):
    """Does tokenize autonumber input segments?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w+'), 'tokenize'),
            (re.compile(r'\W+'), 'split'),
        ],
        auto_number_as='num',
    )
    self.assertEqual(
        [s.annotations['num'] for s in segmentation],
        [1, 2, 3, 4],
        msg="tokenize doesn't autonumber input segments!"
    )
def test_tokenize_create_static_annotations_split(self):
    """Does tokenize create static annotations (mode split)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'\W'), 'split', {'c': '3'})],
    )
    self.assertEqual(
        [s.annotations['c'] for s in segmentation],
        ['3', '3'],
        msg="tokenize doesn't create static annotations (mode split)!"
    )
def test_tokenize_sort(self):
    """Does tokenize sort output segments?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w'), 'tokenize'),
            (re.compile(r'[ae]'), 'tokenize'),
        ],
    )
    self.assertEqual(
        [s.get_content() for s in segmentation],
        ['a', 'a', 'b', 'c', 'd', 'e', 'e'],
        msg="tokenize doesn't sort output segments!"
    )
def test_tokenize_segment_split(self):
    """Does tokenize split input?"""
    segmentation = Segmenter.tokenize(
        self.entire_text_seg,
        [
            (re.compile(r'\W+'), 'split'),
            (re.compile(r'd'), 'split'),
        ],
    )
    self.assertEqual(
        [s.get_content() for s in segmentation],
        ['ab', 'ab c', 'cde', 'e'],
        msg="tokenize doesn't split input!"
    )
def test_tokenize_segment_tokenize(self):
    """Does tokenize tokenize input?"""
    segmentation = Segmenter.tokenize(
        self.entire_text_seg,
        [
            (re.compile(r'\w+'), 'tokenize'),
            (re.compile(r'\w{3,}'), 'tokenize'),
        ],
    )
    self.assertEqual(
        [s.get_content() for s in segmentation],
        ['ab', 'cde', 'cde'],
        msg="tokenize doesn't tokenize input!"
    )
def test_tokenize_create_dynamic_annotations_tokenize(self):
    """Does tokenize create dynamic annotations (mode tokenize)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w(\w)(\w)'), 'tokenize', {'&1': '&2'}),
        ],
    )
    self.assertEqual(
        segmentation[0].annotations['d'],
        'e',
        msg="tokenize doesn't create dynamic annotations (mode tokenize)!"
    )
def test_tokenize_merge_duplicates(self):
    """Does tokenize merge duplicates?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w+'), 'tokenize'),
            (re.compile(r'\W+'), 'split'),
        ],
        merge_duplicates=True,
    )
    self.assertEqual(
        [s.get_content() for s in segmentation],
        ['ab', 'cde'],
        msg="tokenize doesn't merge duplicates!"
    )
def test_tokenize_solve_conflicts_merge_duplicates(self):
    """Does tokenize solve conflicts when merging duplicates?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w+'), 'tokenize', {'a': '10'}),
            (re.compile(r'\W+'), 'split', {'a': '20'}),
        ],
        merge_duplicates=True,
    )
    self.assertEqual(
        segmentation[1].annotations['a'],
        '20',
        msg="tokenize doesn't solve conflicts when merging duplicates!"
    )
def main():
    input_seg = Input("un texte")
    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )

    # verbatim in input = ok
    print("verbatim in input:", end=' ')
    contained_segment_idxs = input_seg[0].get_contained_segment_indices(
        verbatim_seg
    )
    try:
        print("ok" if verbatim_seg[contained_segment_idxs[0]].get_content()
              == 'un texte' else "fail")
    except:
        print("fail")

    # verbatim in verbatim = ok
    print("verbatim in verbatim:", end=' ')
    contained_segment_idxs = verbatim_seg[0].get_contained_segment_indices(
        verbatim_seg
    )
    try:
        print("ok" if verbatim_seg[contained_segment_idxs[0]].get_content()
              == 'un texte' else "fail")
    except:
        print("fail")

    # input in verbatim = fail
    print("input in verbatim:", end=' ')
    contained_segment_idxs = verbatim_seg[0].get_contained_segment_indices(
        input_seg
    )
    try:
        print("ok" if input_seg[contained_segment_idxs[0]].get_content()
              == 'un texte' else "fail")
    except:
        print("fail")

    # input in input = fail
    print("input in input:", end=' ')
    contained_segment_idxs = input_seg[0].get_contained_segment_indices(
        input_seg
    )
    try:
        print("ok" if input_seg[contained_segment_idxs[0]].get_content()
              == 'un texte' else "fail")
    except:
        print("fail")
def main():
    input_seg = Input("un texte")
    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )

    # verbatim in input = ok
    print("verbatim in input:", end=' ')
    contained_segments = input_seg[0].get_contained_segments(verbatim_seg)
    try:
        print("ok" if contained_segments[0].get_content() == 'un texte'
              else "fail")
    except:
        print("fail")

    # verbatim in verbatim = ok
    print("verbatim in verbatim:", end=' ')
    contained_segments = verbatim_seg[0].get_contained_segments(verbatim_seg)
    try:
        print("ok" if contained_segments[0].get_content() == 'un texte'
              else "fail")
    except:
        print("fail")

    # input in verbatim = fail
    print("input in verbatim:", end=' ')
    contained_segments = verbatim_seg[0].get_contained_segments(input_seg)
    try:
        print("ok" if contained_segments[0].get_content() == 'un texte'
              else "fail")
    except:
        print("fail")

    # input in input = fail
    print("input in input:", end=' ')
    contained_segments = input_seg[0].get_contained_segments(input_seg)
    try:
        print("ok" if contained_segments[0].get_content() == 'un texte'
              else "fail")
    except:
        print("fail")
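# Taken together, the two demos above record the same asymmetry: containment
# lookups (get_contained_segment_indices / get_contained_segments) return the
# expected segment when the candidate segments come from a tokenization
# ("verbatim"), but find nothing when the candidates are raw Input segments,
# even though the character spans are identical. The ok/fail labels reflect
# the demos' observed behavior, not a documented contract.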
from LTTL.Input import Input
import LTTL.Segmenter as Segmenter
import re

input_seg = Input("un texte")
word_seg = Segmenter.tokenize(
    input_seg,
    [(re.compile(r'\w+'), 'tokenize')],
)
vowel_seg = Segmenter.tokenize(
    input_seg,
    [(re.compile(r'[aeiouy]'), 'tokenize')],
)
for seg in word_seg[1].get_contained_segments(vowel_seg):
    print(seg.get_content())
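# Expected output, assuming get_contained_segments matches segments by their
# character addresses in the common Input (as the "verbatim in verbatim" demo
# above suggests): word_seg[1] is 'texte' (positions 3-8 of "un texte"), which
# contains the vowel segments 'e' (4-5) and 'e' (7-8), so 'e' should be
# printed twice.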
def getTitleListFromECP(self):
    """Fetch titles from the ECP website"""
    self.infoBox.customMessage(
        "Fetching data from ECP website, please wait"
    )
    # Attempt to connect to ECP...
    try:
        response = urllib.request.urlopen(self.base_url)
        base_html = response.read().decode('utf-8')
        self.infoBox.customMessage("Done fetching data from ECP website.")
    # If unable to connect (somehow)...
    except:
        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(warning="Couldn't access ECP website.")
        # Empty title list box.
        self.titleLabels = list()
        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None
    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)
    # Remove accents from the data...
    recoded_seg, _ = Segmenter.recode(base_html_seg, remove_accents=True)
    # Extract table containing titles...
    genresListSeg = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="ul",
        conditions={"id": re.compile(r"^genres-list")},
    )
    # Extract genre annotation... (inline flags such as (?s) must come at the
    # start of the pattern; Python 3.11+ rejects them elsewhere)
    genreSeg = Segmenter.tokenize(
        segmentation=genresListSeg,
        regexes=[(
            re.compile(r'(?s)<a id[^>]+>(.+?)</a.+?(?=<a id|$)'),
            "tokenize",
            {"genre": "&1"},
        )],
        import_annotations=False,
    )
    # Extract works...
    titleSeg = Segmenter.tokenize(
        segmentation=genreSeg,
        regexes=[(re.compile(r'(?s)<li class="bibl".+?</span>'), "tokenize")],
    )
    # Extract annotations...
    titleSeg = Segmenter.tokenize(
        segmentation=titleSeg,
        regexes=[
            (re.compile(r"(?s)^.*>\n(.+?)</span>.*$"), "tokenize",
             {"author": "&1"}),
            (re.compile(r'(?s)^.*href="(/works/.+?\.shtml)">.*$'), "tokenize",
             {"url": "&1"}),
            (re.compile(r'(?s)^.*shtml">(.*)</a>.*$'), "tokenize",
             {"title": "&1"}),
        ],
        merge_duplicates=True,
    )
    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        file = open(os.path.join(path, "cached_title_list_ecp"), "wb")
        pickle.dump(titleSeg, file, -1)
        file.close()
    except IOError:
        pass
    # Remove warning (if any)...
    self.error(0)
    self.warning(0)
    return titleSeg
from LTTL.Input import Input
import LTTL.Segmenter as Segmenter
import re

input_seg = Input("un texte")
word_seg = Segmenter.tokenize(
    input_seg,
    [(re.compile(r'\w+'), 'tokenize')],
)
consonant_seg = Segmenter.tokenize(
    input_seg,
    [(re.compile(r'[^aeiouy]'), 'tokenize')],
)
# Prints nothing (though 'n' is in 'un')
for seg in word_seg[0].get_contained_segments(consonant_seg):
    print(seg.get_content())
import sys, re

from PyQt4.QtGui import QApplication

import LTTL.Segmenter as Segmenter
from LTTL.Input import Input

appl = QApplication(sys.argv)
ow = OWTextableVariety()
seg1 = Input(u'aabccc', 'text1')
seg2 = Input(u'abci', 'text2')
seg3 = Segmenter.concatenate(
    [seg1, seg2],
    import_labels_as='string',
    label='corpus'
)
seg4 = Segmenter.tokenize(
    seg3,
    regexes=[(re.compile(r'\w+'), u'tokenize')],
)
seg5 = Segmenter.tokenize(
    seg4,
    regexes=[(re.compile(r'[ai]'), u'tokenize')],
    label='V'
)
seg6 = Segmenter.tokenize(
    seg4,
    regexes=[(re.compile(r'[bc]'), u'tokenize')],
    label='C'
)
seg7 = Segmenter.concatenate(
    [seg5, seg6],
    import_labels_as='category',
    label='letters',
if __name__ == '__main__':
    import sys
    import re
    from PyQt4.QtGui import QApplication
    from LTTL.Input import Input
    appl = QApplication(sys.argv)
    ow = OWTextableIntersect()
    seg1 = Input(u'hello world', 'text')
    seg2 = Segmenter.tokenize(
        seg1,
        [
            (re.compile(r'hello'), u'tokenize', {'tag': 'interj'}),
            (re.compile(r'world'), u'tokenize', {'tag': 'noun'}),
        ],
        label='words',
    )
    seg3 = Segmenter.tokenize(
        seg2,
        [(re.compile(r'[aeiou]'), u'tokenize')],
        label='V'
    )
    seg4 = Segmenter.tokenize(
        seg2,
        [(re.compile(r'[hlwrdc]'), u'tokenize')],
        label='C'
    )
    seg5 = Segmenter.tokenize(
        seg2,
def getTitleListFromTheatreClassique(self):
    """Fetch titles from the Theatre-classique website"""
    self.infoBox.customMessage(
        "Fetching data from Theatre-classique website, please wait"
    )
    # Attempt to connect to Theatre-classique...
    try:
        response = urllib.request.urlopen(self.base_url)
        base_html = response.read().decode('iso-8859-1')
        self.infoBox.customMessage(
            "Done fetching data from Theatre-classique website."
        )
    # If unable to connect (somehow)...
    except:
        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(
            warning="Couldn't access theatre-classique website."
        )
        # Empty title list box.
        self.titleLabels = list()
        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None
    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)
    # Remove accents from the data...
    recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)
    # Extract table containing titles from HTML.
    table_seg = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="table",
        conditions={"id": re.compile(r"^table_AA$")},
    )
    # Extract table lines.
    line_seg = Segmenter.import_xml(
        segmentation=table_seg,
        element="tr",
    )
    # Compile the regex that will be used to parse each line.
    field_regex = re.compile(
        r"^\s*<td>\s*<a.+?>(.+?)</a>\s*</td>\s*"
        r"<td>(.+?)</td>\s*"
        r"<td.+?>\s*<a.+?>\s*(\d+?)\s*</a>\s*</td>\s*"
        r"<td.+?>\s*(.+?)\s*</td>\s*"
        r"<td.+?>\s*<a\s+.+?t=\.{2}/(.+?)'>\s*HTML"
    )
    # Parse each line and store the resulting segmentation in an attribute.
    titleSeg = Segmenter.tokenize(
        segmentation=line_seg,
        regexes=[
            (field_regex, "tokenize", {"author": "&1"}),
            (field_regex, "tokenize", {"title": "&2"}),
            (field_regex, "tokenize", {"year": "&3"}),
            (field_regex, "tokenize", {"genre": "&4"}),
            (field_regex, "tokenize", {"url": "&5"}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )
    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        file = open(os.path.join(path, "cached_title_list"), "wb")
        pickle.dump(titleSeg, file, -1)
        file.close()
    except IOError:
        pass
    # Remove warning (if any)...
    self.error(0)
    self.warning(0)
    return titleSeg
def sendData(self):
    """(Have LTTL.Segmenter) perform the actual tokenization"""
    # Check that there's something on input...
    if not self.inputSegmentation:
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Segmented data', None, self)
        return
    # Check that there's at least one regex (if needed)...
    if (
        (self.displayAdvancedSettings and not self.regexes) or (
            self.segmentType == 'Use a regular expression' and
            not (self.regex or self.displayAdvancedSettings)
        )
    ):
        self.infoBox.setText(u'Please enter a regex.', 'warning')
        self.send('Segmented data', None, self)
        return
    # Get regexes from basic or advanced settings...
    regexForType = {
        u'Segment into letters': r'\w',
        u'Segment into words': r'\w+',
        u'Segment into lines': r'.+',
    }
    if self.displayAdvancedSettings:
        myRegexes = self.regexes
    elif self.segmentType == 'Use a regular expression':
        myRegexes = [[
            self.regex, None, None, False, True, False, False, u'tokenize',
        ]]
    else:
        myRegexes = [[
            regexForType[self.segmentType],
            None, None, False, True, False, False, u'tokenize',
        ]]
    # TODO: remove message 'No label was provided.' from docs
    if self.displayAdvancedSettings:
        importAnnotations = self.importAnnotations
        if self.autoNumber:
            autoNumberKey = self.autoNumberKey
            if autoNumberKey == '':
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning'
                )
                self.send('Segmented data', None, self)
                return
        else:
            autoNumberKey = None
        mergeDuplicates = self.mergeDuplicates
    else:
        importAnnotations = True
        autoNumberKey = None
        mergeDuplicates = False
    # Prepare regexes...
    regexes = list()
    for regex_idx in range(len(myRegexes)):
        regex = myRegexes[regex_idx]
        regex_string = regex[0]
        if regex[3] or regex[4] or regex[5] or regex[6]:
            flags = ''
            if regex[3]:
                flags += 'i'
            if regex[4]:
                flags += 'u'
            if regex[5]:
                flags += 'm'
            if regex[6]:
                flags += 's'
            # Prepend inline flags: Python 3.11+ rejects global flags that
            # do not appear at the start of the pattern.
            regex_string = '(?%s)' % flags + regex_string
        try:
            if regex[1] and regex[2]:
                regexes.append((
                    re.compile(regex_string),
                    regex[7].lower(),
                    {regex[1]: regex[2]},
                ))
            else:
                regexes.append((re.compile(regex_string), regex[7].lower()))
        except re.error as re_error:
            try:
                message = u'Please enter a valid regex (error: %s' % \
                          re_error.msg
                if self.displayAdvancedSettings and len(myRegexes) > 1:
                    message += u', regex #%i' % (regex_idx + 1)
                message += u').'
            except AttributeError:
                message = u'Please enter a valid regex'
                if self.displayAdvancedSettings and len(myRegexes) > 1:
                    message += u' (regex #%i)' % (regex_idx + 1)
                message += u'.'
            self.infoBox.setText(message, 'error')
            self.send('Segmented data', None, self)
            return
    # Perform tokenization...
    self.controlArea.setDisabled(True)
    self.infoBox.setText(u"Processing, please wait...", "warning")
    progressBar = ProgressBar(
        self,
        iterations=len(self.inputSegmentation) * len(myRegexes)
    )
    self.warning()
    self.error()
    try:
        segmented_data = Segmenter.tokenize(
            segmentation=self.inputSegmentation,
            regexes=regexes,
            label=self.captionTitle,
            import_annotations=importAnnotations,
            merge_duplicates=mergeDuplicates,
            auto_number_as=autoNumberKey,
            progress_callback=progressBar.advance,
        )
        # '@p' is expanded to a plural marker by pluralize() below.
        message = u'%i segment@p sent to output.' % len(segmented_data)
        message = pluralize(message, len(segmented_data))
        self.infoBox.setText(message)
        self.send('Segmented data', segmented_data, self)
    except IndexError:
        self.infoBox.setText(
            u'Reference to unmatched group in annotation key and/or value.',
            'error'
        )
        self.send('Segmented data', None, self)
    self.sendButton.resetSettingsChangedFlag()
    progressBar.finish()
    self.controlArea.setDisabled(False)
def setUp(self):
    self.maxDiff = None
    input_seg = Input("un texte")
    word_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'\w+'), 'tokenize')],
        import_annotations=False,
    )
    letter_seg = Segmenter.tokenize(
        input_seg,
        [
            (re.compile(r'\w'), 'tokenize', {'type': 'C'}),
            (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )
    vowel_seg, consonant_seg = Segmenter.select(
        letter_seg,
        re.compile(r'V'),
        annotation_key='type',
    )
    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and without annotation (woa):
    self.window_woa_row_ids = ['u', 'n', 't', 'e', 'x']
    self.window_woa_col_ids = ['u', 'n', 't', 'e', 'x']
    self.window_woa_values = {
        ('u', 'u'): 1, ('u', 'n'): 1, ('u', 't'): 1, ('u', 'e'): 0,
        ('u', 'x'): 0,
        ('n', 'u'): 1, ('n', 'n'): 2, ('n', 't'): 2, ('n', 'e'): 1,
        ('n', 'x'): 0,
        ('t', 'u'): 1, ('t', 'n'): 2, ('t', 't'): 5, ('t', 'e'): 4,
        ('t', 'x'): 3,
        ('e', 'u'): 0, ('e', 'n'): 1, ('e', 't'): 4, ('e', 'e'): 4,
        ('e', 'x'): 3,
        ('x', 'u'): 0, ('x', 'n'): 0, ('x', 't'): 3, ('x', 'e'): 3,
        ('x', 'x'): 3,
    }
    self.window_woa_header_row_id = '__unit__'
    self.window_woa_header_row_type = 'string'
    self.window_woa_header_col_id = '__unit__'
    self.window_woa_header_col_type = 'string'
    self.window_woa_col_type = {
        col_id: 'continuous' for col_id in self.window_woa_col_ids
    }
    self.window_woa_ref = IntPivotCrosstab(
        self.window_woa_row_ids,
        self.window_woa_col_ids,
        self.window_woa_values,
        self.window_woa_header_row_id,
        self.window_woa_header_row_type,
        self.window_woa_header_col_id,
        self.window_woa_header_col_type,
        self.window_woa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and with annotation (wa):
    self.window_wa_row_ids = ['C', 'V']
    self.window_wa_col_ids = ['C', 'V']
    self.window_wa_values = {
        ('C', 'C'): 5, ('C', 'V'): 5,
        ('V', 'C'): 5, ('V', 'V'): 5,
    }
    self.window_wa_header_row_id = '__unit__'
    self.window_wa_header_row_type = 'string'
    self.window_wa_header_col_id = '__unit__'
    self.window_wa_header_col_type = 'string'
    self.window_wa_col_type = {
        col_id: 'continuous' for col_id in self.window_wa_col_ids
    }
    self.window_wa_ref = IntPivotCrosstab(
        self.window_wa_row_ids,
        self.window_wa_col_ids,
        self.window_wa_values,
        self.window_wa_header_row_id,
        self.window_wa_header_row_type,
        self.window_wa_header_col_id,
        self.window_wa_header_col_type,
        self.window_wa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in context
    # without the secondary unit (wos) and without annotation (woa):
    self.context_wos_woa_row_ids = ['u', 'n', 't', 'e', 'x']
    self.context_wos_woa_col_ids = ['u', 'n', 't', 'e', 'x']
    self.context_wos_woa_values = {
        ('u', 'u'): 1, ('u', 'n'): 1, ('u', 't'): 0, ('u', 'e'): 0,
        ('u', 'x'): 0,
        ('n', 'u'): 1, ('n', 'n'): 1, ('n', 't'): 0, ('n', 'e'): 0,
        ('n', 'x'): 0,
        ('t', 'u'): 0, ('t', 'n'): 0, ('t', 't'): 1, ('t', 'e'): 1,
        ('t', 'x'): 1,
        ('e', 'u'): 0, ('e', 'n'): 0, ('e', 't'): 1, ('e', 'e'): 1,
        ('e', 'x'): 1,
        ('x', 'u'): 0, ('x', 'n'): 0, ('x', 't'): 1, ('x', 'e'): 1,
        ('x', 'x'): 1,
    }
    self.context_wos_woa_header_row_id = '__context__'
    self.context_wos_woa_header_row_type = 'string'
    self.context_wos_woa_header_col_id = '__context__'
    self.context_wos_woa_header_col_type = 'string'
    self.context_wos_woa_col_type = {
        col_id: 'continuous' for col_id in self.context_wos_woa_col_ids
    }
    self.context_wos_woa_ref = IntPivotCrosstab(
        self.context_wos_woa_row_ids,
        self.context_wos_woa_col_ids,
        self.context_wos_woa_values,
        self.context_wos_woa_header_row_id,
        self.context_wos_woa_header_row_type,
        self.context_wos_woa_header_col_id,
        self.context_wos_woa_header_col_type,
        self.context_wos_woa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in context
    # without the secondary unit (wos) and with annotation (wa):
    self.context_wos_wa_row_ids = ['V', 'C']
    self.context_wos_wa_col_ids = ['V', 'C']
    self.context_wos_wa_values = {
        ('V', 'V'): 2, ('V', 'C'): 2,
        ('C', 'V'): 2, ('C', 'C'): 2,
    }
    self.context_wos_wa_header_row_id = '__context__'
    self.context_wos_wa_header_row_type = 'string'
    self.context_wos_wa_header_col_id = '__context__'
    self.context_wos_wa_header_col_type = 'string'
    self.context_wos_wa_col_type = {
        col_id: 'continuous' for col_id in self.context_wos_wa_col_ids
    }
    self.context_wos_wa_ref = IntPivotCrosstab(
        self.context_wos_wa_row_ids,
        self.context_wos_wa_col_ids,
        self.context_wos_wa_values,
        self.context_wos_wa_header_row_id,
        self.context_wos_wa_header_row_type,
        self.context_wos_wa_header_col_id,
        self.context_wos_wa_header_col_type,
        self.context_wos_wa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in context
    # with the secondary unit (ws) and without annotation (woa):
    self.context_ws_woa_col_ids = ['u', 'e']
    self.context_ws_woa_row_ids = ['n', 't', 'x']
    self.context_ws_woa_values = {
        ('n', 'u'): 1, ('n', 'e'): 0,
        ('t', 'u'): 0, ('t', 'e'): 1,
        ('x', 'u'): 0, ('x', 'e'): 1,
    }
    self.context_ws_woa_header_row_id = '__context__'
    self.context_ws_woa_header_row_type = 'string'
    self.context_ws_woa_header_col_id = '__context__'
    self.context_ws_woa_header_col_type = 'string'
    self.context_ws_woa_col_type = {
        col_id: 'continuous' for col_id in self.context_ws_woa_col_ids
    }
    self.context_ws_woa_ref = IntPivotCrosstab(
        self.context_ws_woa_row_ids,
        self.context_ws_woa_col_ids,
        self.context_ws_woa_values,
        self.context_ws_woa_header_row_id,
        self.context_ws_woa_header_row_type,
        self.context_ws_woa_header_col_id,
        self.context_ws_woa_header_col_type,
        self.context_ws_woa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in context
    # with the secondary unit (ws) and with annotation (wa):
    self.context_ws_wa_row_ids = ['C']
    self.context_ws_wa_col_ids = ['V']
    self.context_ws_wa_values = {
        ('C', 'V'): 2,
    }
    self.context_ws_wa_header_row_id = '__context__'
    self.context_ws_wa_header_row_type = 'string'
    self.context_ws_wa_header_col_id = '__context__'
    self.context_ws_wa_header_col_type = 'string'
    self.context_ws_wa_col_type = {
        col_id: 'continuous' for col_id in self.context_ws_wa_col_ids
    }
    self.context_ws_wa_ref = IntPivotCrosstab(
        self.context_ws_wa_row_ids,
        self.context_ws_wa_col_ids,
        self.context_ws_wa_values,
        self.context_ws_wa_header_row_id,
        self.context_ws_wa_header_row_type,
        self.context_ws_wa_header_col_id,
        self.context_ws_wa_header_col_type,
        self.context_ws_wa_col_type,
    )
    self.output_cooc_in_window_woa = Processor.cooc_in_window(
        units={'segmentation': letter_seg},
        window_size=3,
    )
    self.output_cooc_in_window_wa = Processor.cooc_in_window(
        units={'segmentation': letter_seg, 'annotation_key': 'type'},
        window_size=3,
    )
    self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
        units={'segmentation': letter_seg},
        contexts={'segmentation': word_seg},
        units2=None,
    )
    self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
        units={'segmentation': letter_seg, 'annotation_key': 'type'},
        contexts={'segmentation': word_seg},
        units2=None,
    )
    self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
        units={'segmentation': vowel_seg},
        contexts={'segmentation': word_seg},
        units2={'segmentation': consonant_seg},
    )
    self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
        units={'segmentation': vowel_seg, 'annotation_key': 'type'},
        contexts={'segmentation': word_seg},
        units2={'segmentation': consonant_seg, 'annotation_key': 'type'},
    )
def getTitleListFromTheatreClassique(self):
    """Fetch titles from the Theatre-classique website"""
    self.infoBox.customMessage(
        u'Fetching data from Theatre-classique website, please wait'
    )
    # Attempt to connect to Theatre-classique...
    try:
        response = urllib.request.urlopen(self.base_url)
        base_html = response.read().decode('iso-8859-1')
        self.infoBox.customMessage(
            u'Done fetching data from Theatre-classique website.'
        )
    # If unable to connect (somehow)...
    except:
        # Set Info box and widget to 'warning' state.
        self.infoBox.noDataSent(
            warning=u"Couldn't access theatre-classique website."
        )
        # Empty title list box.
        self.titleLabels = list()
        # Reset output channel.
        self.send(u'Text data', None, self)
        return None
    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)
    # Remove accents from the data...
    recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)
    # Extract table containing titles from HTML.
    table_seg = Segmenter.import_xml(
        segmentation=recoded_seg,
        element=u'table',
        conditions={u'id': re.compile(r'^table_AA$')},
    )
    # Extract table lines.
    line_seg = Segmenter.import_xml(
        segmentation=table_seg,
        element=u'tr',
    )
    # Compile the regex that will be used to parse each line.
    field_regex = re.compile(
        r"^\s*<td>\s*<a.+?>(.+?)</a>\s*</td>\s*"
        r"<td>(.+?)</td>\s*"
        r"<td.+?>\s*<a.+?>\s*(\d+?)\s*</a>\s*</td>\s*"
        r"<td.+?>\s*(.+?)\s*</td>\s*"
        r"<td.+?>\s*<a\s+.+?t=\.{2}/(.+?)'>\s*HTML"
    )
    # Parse each line and store the resulting segmentation in an attribute.
    titleSeg = Segmenter.tokenize(
        segmentation=line_seg,
        regexes=[
            (field_regex, u'tokenize', {u'author': u'&1'}),
            (field_regex, u'tokenize', {u'title': u'&2'}),
            (field_regex, u'tokenize', {u'year': u'&3'}),
            (field_regex, u'tokenize', {u'genre': u'&4'}),
            (field_regex, u'tokenize', {u'url': u'&5'}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )
    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        file = open(os.path.join(path, u"cached_title_list"), 'wb')
        pickle.dump(titleSeg, file, -1)
        file.close()
    except IOError:
        pass
    # Remove warning (if any)...
    self.error(0)
    self.warning(0)
    return titleSeg
def setUp(self):
    input_seg = Input("un texte")
    word_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'\w+'), 'tokenize')],
        import_annotations=False,
    )
    letter_seg = Segmenter.tokenize(
        input_seg,
        [
            (re.compile(r'\w'), 'tokenize', {'type': 'C'}),
            (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )
    vowel_seg, consonant_seg = Segmenter.select(
        letter_seg,
        re.compile(r'V'),
        annotation_key='type',
    )
    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and without annotation (woa):
    self.window_woa_row_ids = ['u', 'n', 't', 'e', 'x']
    self.window_woa_col_ids = ['u', 'n', 't', 'e', 'x']
    self.window_woa_values = {
        ('u', 'u'): 1, ('u', 'n'): 1, ('u', 't'): 1, ('u', 'e'): 0,
        ('u', 'x'): 0,
        ('n', 'u'): 1, ('n', 'n'): 2, ('n', 't'): 2, ('n', 'e'): 1,
        ('n', 'x'): 0,
        ('t', 'u'): 1, ('t', 'n'): 2, ('t', 't'): 5, ('t', 'e'): 4,
        ('t', 'x'): 3,
        ('e', 'u'): 0, ('e', 'n'): 1, ('e', 't'): 4, ('e', 'e'): 4,
        ('e', 'x'): 3,
        ('x', 'u'): 0, ('x', 'n'): 0, ('x', 't'): 3, ('x', 'e'): 3,
        ('x', 'x'): 3,
    }
    self.window_woa_header_row_id = '__unit__'
    self.window_woa_header_row_type = 'string'
    self.window_woa_header_col_id = '__unit2__'
    self.window_woa_header_col_type = 'string'
    self.window_woa_col_type = {
        col_id: 'continuous' for col_id in self.window_woa_col_ids
    }
    self.window_woa_ref = IntPivotCrosstab(
        self.window_woa_row_ids,
        self.window_woa_col_ids,
        self.window_woa_values,
        self.window_woa_header_row_id,
        self.window_woa_header_row_type,
        self.window_woa_header_col_id,
        self.window_woa_header_col_type,
        self.window_woa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and with annotation (wa):
    self.window_wa_row_ids = ['C', 'V']
    self.window_wa_col_ids = ['C', 'V']
    self.window_wa_values = {
        ('C', 'C'): 5, ('C', 'V'): 5,
        ('V', 'C'): 5, ('V', 'V'): 5,
    }
    self.window_wa_header_row_id = '__unit__'
    self.window_wa_header_row_type = 'string'
    self.window_wa_header_col_id = '__unit2__'
    self.window_wa_header_col_type = 'string'
    self.window_wa_col_type = {
        col_id: 'continuous' for col_id in self.window_wa_col_ids
    }
    self.window_wa_ref = IntPivotCrosstab(
        self.window_wa_row_ids,
        self.window_wa_col_ids,
        self.window_wa_values,
        self.window_wa_header_row_id,
        self.window_wa_header_row_type,
        self.window_wa_header_col_id,
        self.window_wa_header_col_type,
        self.window_wa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in context
    # without the secondary unit (wos) and without annotation (woa):
    self.context_wos_woa_row_ids = ['u', 'n', 't', 'e', 'x']
    self.context_wos_woa_col_ids = ['u', 'n', 't', 'e', 'x']
    self.context_wos_woa_values = {
        ('u', 'u'): 1, ('u', 'n'): 1, ('u', 't'): 0, ('u', 'e'): 0,
        ('u', 'x'): 0,
        ('n', 'u'): 1, ('n', 'n'): 1, ('n', 't'): 0, ('n', 'e'): 0,
        ('n', 'x'): 0,
        ('t', 'u'): 0, ('t', 'n'): 0, ('t', 't'): 1, ('t', 'e'): 1,
        ('t', 'x'): 1,
        ('e', 'u'): 0, ('e', 'n'): 0, ('e', 't'): 1, ('e', 'e'): 1,
        ('e', 'x'): 1,
        ('x', 'u'): 0, ('x', 'n'): 0, ('x', 't'): 1, ('x', 'e'): 1,
        ('x', 'x'): 1,
    }
    self.context_wos_woa_header_row_id = '__unit__'
    self.context_wos_woa_header_row_type = 'string'
    self.context_wos_woa_header_col_id = '__unit2__'
    self.context_wos_woa_header_col_type = 'string'
    self.context_wos_woa_col_type = {
        col_id: 'continuous' for col_id in self.context_wos_woa_col_ids
    }
    self.context_wos_woa_ref = IntPivotCrosstab(
        self.context_wos_woa_row_ids,
        self.context_wos_woa_col_ids,
        self.context_wos_woa_values,
        self.context_wos_woa_header_row_id,
        self.context_wos_woa_header_row_type,
        self.context_wos_woa_header_col_id,
        self.context_wos_woa_header_col_type,
        self.context_wos_woa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in context
    # without the secondary unit (wos) and with annotation (wa):
    self.context_wos_wa_row_ids = ['V', 'C']
    self.context_wos_wa_col_ids = ['V', 'C']
    self.context_wos_wa_values = {
        ('V', 'V'): 2, ('V', 'C'): 2,
        ('C', 'V'): 2, ('C', 'C'): 2,
    }
    self.context_wos_wa_header_row_id = '__unit__'
    self.context_wos_wa_header_row_type = 'string'
    self.context_wos_wa_header_col_id = '__unit2__'
    self.context_wos_wa_header_col_type = 'string'
    self.context_wos_wa_col_type = {
        col_id: 'continuous' for col_id in self.context_wos_wa_col_ids
    }
    self.context_wos_wa_ref = IntPivotCrosstab(
        self.context_wos_wa_row_ids,
        self.context_wos_wa_col_ids,
        self.context_wos_wa_values,
        self.context_wos_wa_header_row_id,
        self.context_wos_wa_header_row_type,
        self.context_wos_wa_header_col_id,
        self.context_wos_wa_header_col_type,
        self.context_wos_wa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in context
    # with the secondary unit (ws) and without annotation (woa):
    self.context_ws_woa_col_ids = ['u', 'e']
    self.context_ws_woa_row_ids = ['n', 't', 'x']
    self.context_ws_woa_values = {
        ('n', 'u'): 1, ('n', 'e'): 0,
        ('t', 'u'): 0, ('t', 'e'): 1,
        ('x', 'u'): 0, ('x', 'e'): 1,
    }
    self.context_ws_woa_header_row_id = '__unit__'
    self.context_ws_woa_header_row_type = 'string'
    self.context_ws_woa_header_col_id = '__unit2__'
    self.context_ws_woa_header_col_type = 'string'
    self.context_ws_woa_col_type = {
        col_id: 'continuous' for col_id in self.context_ws_woa_col_ids
    }
    self.context_ws_woa_ref = IntPivotCrosstab(
        self.context_ws_woa_row_ids,
        self.context_ws_woa_col_ids,
        self.context_ws_woa_values,
        self.context_ws_woa_header_row_id,
        self.context_ws_woa_header_row_type,
        self.context_ws_woa_header_col_id,
        self.context_ws_woa_header_col_type,
        self.context_ws_woa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in context
    # with the secondary unit (ws) and with annotation (wa):
    self.context_ws_wa_row_ids = ['C']
    self.context_ws_wa_col_ids = ['V']
    self.context_ws_wa_values = {
        ('C', 'V'): 2,
    }
    self.context_ws_wa_header_row_id = '__unit__'
    self.context_ws_wa_header_row_type = 'string'
    self.context_ws_wa_header_col_id = '__unit2__'
    self.context_ws_wa_header_col_type = 'string'
    self.context_ws_wa_col_type = {
        col_id: 'continuous' for col_id in self.context_ws_wa_col_ids
    }
    self.context_ws_wa_ref = IntPivotCrosstab(
        self.context_ws_wa_row_ids,
        self.context_ws_wa_col_ids,
        self.context_ws_wa_values,
        self.context_ws_wa_header_row_id,
        self.context_ws_wa_header_row_type,
        self.context_ws_wa_header_col_id,
        self.context_ws_wa_header_col_type,
        self.context_ws_wa_col_type,
    )
    self.output_cooc_in_window_woa = Processor.cooc_in_window(
        units={'segmentation': letter_seg},
        window_size=3,
    )
    self.output_cooc_in_window_wa = Processor.cooc_in_window(
        units={'segmentation': letter_seg, 'annotation_key': 'type'},
        window_size=3,
    )
    self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
        units={'segmentation': letter_seg},
        contexts={'segmentation': word_seg},
        units2=None,
    )
    self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
        units={'segmentation': letter_seg, 'annotation_key': 'type'},
        contexts={'segmentation': word_seg},
        units2=None,
    )
    self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
        units={'segmentation': vowel_seg},
        contexts={'segmentation': word_seg},
        units2={'segmentation': consonant_seg},
    )
    self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
        units={'segmentation': vowel_seg, 'annotation_key': 'type'},
        contexts={'segmentation': word_seg},
        units2={'segmentation': consonant_seg, 'annotation_key': 'type'},
    )
        self.updateGUI()
        self.sendButton.sendIf()


if __name__ == '__main__':
    import sys
    import re
    from PyQt5.QtWidgets import QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input
    appl = QApplication(sys.argv)
    ow = OWTextableLength()
    seg1 = Input(u'hello world', label=u'text1')
    seg2 = Input(u'wonderful world', label=u'text2')
    seg3 = Segmenter.concatenate([seg1, seg2], label=u'corpus')
    # Compile the patterns (tokenize is given compiled regexes everywhere
    # else in this corpus); the (?u) inline flag is moved to the front of
    # the pattern, as Python 3.11+ requires.
    seg4 = Segmenter.tokenize(
        seg3,
        [(re.compile(r'(?u)\w+'), u'tokenize')],
        label=u'words'
    )
    seg5 = Segmenter.tokenize(
        seg3,
        [(re.compile(r'\w'), u'tokenize')],
        label=u'letters'
    )
    ow.inputData(seg3, 1)
    ow.inputData(seg4, 2)
    ow.inputData(seg5, 3)
    ow.show()
    appl.exec_()
    ow.saveSettings()
if __name__ == '__main__':
    import sys
    from PyQt5.QtWidgets import QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input
    appl = QApplication(sys.argv)
    ow = OWTextableCooccurrence()
    seg1 = Input(u'un texte', label=u'text')
    seg2 = Segmenter.tokenize(
        seg1,
        regexes=[(re.compile(r'\w+'), u'tokenize', {'type': 'W'})],
        label=u'words',
    )
    seg3 = Segmenter.tokenize(
        seg1,
        regexes=[(re.compile(r'[aeiouy]'), u'tokenize', {'type': 'V'})],
        label=u'vowel',
def getTitleListFromEighteenthCenturyPoetry(self):
    """Fetch titles from the ECP website"""
    self.infoBox.customMessage(
        "Fetching data from ECP website, please wait"
    )
    # Attempt to connect to ECP...
    try:
        response = urllib.request.urlopen(self.base_url)
        base_html = response.read().decode('iso-8859-1')
        self.infoBox.customMessage(
            "Done fetching data from EighteenthCenturyPoetry website."
        )
    # If unable to connect (somehow)...
    except:
        # Set Info box and widget to "warning" state.
        self.infoBox.noDataSent(
            warning="Couldn't access EighteenthCenturyPoetry website."
        )
        # Empty title list box.
        self.titleLabels = list()
        # Reset output channel.
        self.send("XML-TEI data", None, self)
        return None
    # Otherwise store HTML content in LTTL Input object.
    base_html_seg = Input(base_html)
    # Remove accents from the data...
    recoded_seg = Segmenter.recode(base_html_seg, remove_accents=True)
    # Extract works.
    genre_corpus = Segmenter.import_xml(
        segmentation=recoded_seg,
        element="ul",
        conditions={"class": re.compile(r"^genres-list$")},
    )
    # tokenize takes a list of (regex, mode) tuples, not a bare pattern.
    genre_list = Segmenter.tokenize(
        segmentation=genre_corpus,
        regexes=[(re.compile(r"<a.+$"), "tokenize")],
        import_annotations=False,
        merge_duplicates=True,
    )
    # Single-quote the raw strings so the embedded double quotes parse.
    work_list = Segmenter.tokenize(
        segmentation=genre_list,
        regexes=[(re.compile(r'<li class="bibl">(.+?)</li>'), "tokenize")],
        import_annotations=False,
        merge_duplicates=True,
    )
    # Compile the regex that will be used to parse each line.
    field_regex = re.compile(
        r'<a href="(.+?)">'
        r'<a href=".+?">(.+?)</a>'
        r'<span style="color:.+?666">(.+?)</span>'
    )
    # Parse each line and store the resulting segmentation in an attribute.
    titleSeg = Segmenter.tokenize(
        segmentation=work_list,
        regexes=[
            (field_regex, "tokenize", {"url": "&1"}),
            (field_regex, "tokenize", {"title": "&2"}),
            (field_regex, "tokenize", {"author": "&3"}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )
    # Try to save list in this module's directory for future reference...
    path = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    try:
        file = open(os.path.join(path, "cached_title_list"), "wb")
        pickle.dump(titleSeg, file, -1)
        file.close()
    except IOError:
        pass
    # Remove warning (if any)...
    self.error(0)
    self.warning(0)
    return titleSeg
        self.contextAnnotationKey = self.contextAnnotationKey

    def handleNewSignals(self):
        """Overridden: called after multiple signals have been added"""
        self.openContext(self.uuid, self.segmentations)
        self.updateGUI()
        self.sendButton.sendIf()


if __name__ == '__main__':
    import sys
    import re
    from PyQt4.QtGui import QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input
    appl = QApplication(sys.argv)
    ow = OWTextableCount()
    seg1 = Input(u'hello world', label=u'text1')
    seg2 = Input(u'cruel world', label=u'text2')
    seg3 = Segmenter.concatenate([seg1, seg2], label=u'corpus')
    # Compile the pattern (tokenize is given compiled regexes elsewhere);
    # the (?u) inline flag moves to the front, as Python 3.11+ requires.
    seg4 = Segmenter.tokenize(
        seg3,
        [(re.compile(r'(?u)\w+'), u'tokenize', {'type': 'mot'})],
        label=u'words'
    )
    ow.inputData(seg3, 1)
    ow.inputData(seg4, 2)
    ow.show()
    appl.exec_()
    ow.saveSettings()
        else:
            text_or_id = kwargs.get("text_or_id", None)
        if isinstance(text_or_id, str) or text_or_id is None:
            self._currentErrorMessage = text_or_id or ""
        return super().error(*args, **kwargs)

    def warning(self, *args, **kwargs):
        # Reimplemented to track the current active warning message
        if args:
            text_or_id = args[0]
        else:
            text_or_id = kwargs.get("text_or_id", None)
        if isinstance(text_or_id, str) or text_or_id is None:
            self._currentWarningMessage = text_or_id or ""
        return super().warning(*args, **kwargs)


if __name__ == '__main__':
    appl = QApplication(sys.argv)
    ow = OWTextableDisplay()
    ow.show()
    seg1 = Input(u'hello world', label=u'text1')
    seg2 = Input(u'cruel world', label=u'text2')
    seg3 = Segmenter.concatenate([seg1, seg2], label=u'corpus')
    # Compile the pattern (tokenize is given compiled regexes elsewhere,
    # and re is assumed to be imported at module level in this fragment);
    # the (?u) inline flag moves to the front, as Python 3.11+ requires.
    seg4 = Segmenter.tokenize(
        seg3,
        [(re.compile(r'(?u)\w+'), u'tokenize')],
        label=u'words'
    )
    ow.inputData(seg4)
    appl.exec_()