def test_select_progress(self):
    """Check that select reports progress for every input segment."""

    def bump_counter():
        """Stand-in progress callback that counts its invocations."""
        self.count += 1

    any_char = re.compile(r'.')
    Segmenter.select(
        self.char_seg,
        any_char,
        progress_callback=bump_counter,
    )

    # The callback must have fired as many times as there are segments.
    expected_calls = len(self.char_seg)
    self.assertEqual(
        self.count,
        expected_calls,
        msg="select doesn't track progress!",
    )
def test_select_annotations(self):
    """Check that select can match the regex against an annotation value."""
    any_char = re.compile(r'.')
    selected, _ = Segmenter.select(
        self.word_seg,
        any_char,
        annotation_key='a',
    )

    # Compare the textual content of the retained segments.
    contents = [segment.get_content() for segment in selected]
    self.assertEqual(
        contents,
        ['ab'],
        msg="select doesn't work with annotations!",
    )
def test_select_autonumber(self):
    """Check that select numbers the selected segments under a given key."""
    any_char = re.compile(r'.')
    selected, _ = Segmenter.select(
        self.char_seg,
        any_char,
        auto_number_as='num',
    )

    # Collect the numbers stored under the requested annotation key.
    numbers = [segment.annotations['num'] for segment in selected]
    self.assertEqual(
        numbers,
        [1, 2, 3, 4, 5, 6],
        msg="select doesn't autonumber input segments!",
    )
def test_select_progress(self):
    """Check that select reports progress for every input segment."""

    def bump_counter():
        """Stand-in progress callback that counts its invocations."""
        self.count += 1

    any_char = re.compile(r'.')
    Segmenter.select(
        self.char_seg,
        any_char,
        progress_callback=bump_counter,
    )

    # The callback must have fired as many times as there are segments.
    expected_calls = len(self.char_seg)
    self.assertEqual(
        self.count,
        expected_calls,
        msg="select doesn't track progress!",
    )
def test_select_select(self):
    """Check that select keeps the segments matching the regex."""
    long_word = re.compile(r'\w{3,}')
    selected, _ = Segmenter.select(self.word_seg, long_word)

    # Only the word with at least three characters should be retained.
    contents = [segment.get_content() for segment in selected]
    self.assertEqual(
        contents,
        ['cde'],
        msg="select doesn't select segments!",
    )
def test_select_import_annotations_false(self):
    """Check that select omits annotations when copy_annotations is False."""
    any_word = re.compile(r'\w+')
    selected, _ = Segmenter.select(
        self.word_seg,
        any_word,
        copy_annotations=False,
    )

    # The input annotation key 'a' must be absent from the first segment.
    has_annotation = 'a' in selected[0].annotations
    self.assertFalse(
        has_annotation,
        msg="select doesn't skip importing annotations!",
    )
def test_select_select_neg(self):
    """Check the second (complementary) segmentation returned by select."""
    long_word = re.compile(r'\w{3,}')
    _, discarded = Segmenter.select(self.word_seg, long_word)

    # The non-matching word should end up in the second output.
    contents = [segment.get_content() for segment in discarded]
    self.assertEqual(
        contents,
        ['ab'],
        msg="select doesn't output complementary segmentation!",
    )
def test_select_mode(self):
    """Check that select honours mode="exclude"."""
    long_word = re.compile(r'\w{3,}')
    selected, _ = Segmenter.select(
        self.word_seg,
        long_word,
        mode="exclude",
    )

    # In exclude mode the first output holds the non-matching word.
    contents = [segment.get_content() for segment in selected]
    self.assertEqual(
        contents,
        ['ab'],
        msg="select doesn't respect mode setting!",
    )
def test_select_import_annotations(self):
    """Check that select carries annotations over when asked to."""
    any_word = re.compile(r'\w+')
    selected, _ = Segmenter.select(
        self.word_seg,
        any_word,
        copy_annotations=True,
    )

    # The first selected segment must still carry annotation 'a'.
    first_annotations = selected[0].annotations
    self.assertEqual(
        first_annotations['a'],
        '1',
        msg="select doesn't import annotations!",
    )
def test_select_import_annotations_false(self):
    """Check that select omits annotations when copy_annotations is False."""
    any_word = re.compile(r'\w+')
    selected, _ = Segmenter.select(
        self.word_seg,
        any_word,
        copy_annotations=False,
    )

    # The input annotation key 'a' must be absent from the first segment.
    has_annotation = 'a' in selected[0].annotations
    self.assertFalse(
        has_annotation,
        msg="select doesn't skip importing annotations!",
    )
def test_select_select_neg(self):
    """Check the second (complementary) segmentation returned by select."""
    long_word = re.compile(r'\w{3,}')
    _, discarded = Segmenter.select(self.word_seg, long_word)

    # The non-matching word should end up in the second output.
    contents = [segment.get_content() for segment in discarded]
    self.assertEqual(
        contents,
        ['ab'],
        msg="select doesn't output complementary segmentation!",
    )
def test_select_select(self):
    """Check that select keeps the segments matching the regex."""
    long_word = re.compile(r'\w{3,}')
    selected, _ = Segmenter.select(self.word_seg, long_word)

    # Only the word with at least three characters should be retained.
    contents = [segment.get_content() for segment in selected]
    self.assertEqual(
        contents,
        ['cde'],
        msg="select doesn't select segments!",
    )
def test_select_mode(self):
    """Check that select honours mode="exclude"."""
    long_word = re.compile(r'\w{3,}')
    selected, _ = Segmenter.select(
        self.word_seg,
        long_word,
        mode="exclude",
    )

    # In exclude mode the first output holds the non-matching word.
    contents = [segment.get_content() for segment in selected]
    self.assertEqual(
        contents,
        ['ab'],
        msg="select doesn't respect mode setting!",
    )
def test_select_annotations(self):
    """Check that select can match the regex against an annotation value."""
    any_char = re.compile(r'.')
    selected, _ = Segmenter.select(
        self.word_seg,
        any_char,
        annotation_key='a',
    )

    # Compare the textual content of the retained segments.
    contents = [segment.get_content() for segment in selected]
    self.assertEqual(
        contents,
        ['ab'],
        msg="select doesn't work with annotations!",
    )
def test_select_import_annotations(self):
    """Check that select carries annotations over when asked to."""
    any_word = re.compile(r'\w+')
    selected, _ = Segmenter.select(
        self.word_seg,
        any_word,
        copy_annotations=True,
    )

    # The first selected segment must still carry annotation 'a'.
    first_annotations = selected[0].annotations
    self.assertEqual(
        first_annotations['a'],
        '1',
        msg="select doesn't import annotations!",
    )
def test_select_autonumber(self):
    """Check that select numbers the selected segments under a given key."""
    any_char = re.compile(r'.')
    selected, _ = Segmenter.select(
        self.char_seg,
        any_char,
        auto_number_as='num',
    )

    # Collect the numbers stored under the requested annotation key.
    numbers = [segment.annotations['num'] for segment in selected]
    self.assertEqual(
        numbers,
        [1, 2, 3, 4, 5, 6],
        msg="select doesn't autonumber input segments!",
    )
def huntTheLexic(self):
    """Label the input segments by lexical field and build the output.

    For every lexical field picked by the user, the input segments that
    match one of the field's words are selected and labelled with the
    field name. ``self.outputSeg`` is then set to the concatenation of a
    copy of the input (labelled "__None__") and those selections, with
    duplicates merged and the labels imported as an annotation.
    """
    # Names of the topics chosen by the user (indices into titleLabels).
    chosen_names = []
    if self.titleLabels:
        available_names = list(self.titleLabels)
        chosen_names = [available_names[idx] for idx in self.selectedFields]

    # Word lists of the chosen topics, in the order defaultDict stores them.
    chosen_lists = {}
    for topic, words in defaultDict.items():
        if topic in chosen_names:
            chosen_lists[topic] = words

    # One labelled selection per topic that has at least one non-empty word.
    labelled_selections = []
    if self.inputSeg is not None:
        for topic in chosen_lists:
            usable_words = [word for word in chosen_lists[topic] if word]
            if not usable_words:
                continue
            selection = Segmenter.select(
                self.inputSeg,
                self.listToRegex(usable_words),
                label=topic,
            )
            # Only the first element (the matching segments) is kept.
            labelled_selections.append(selection[0])

    # Annotation key under which the topic labels are imported.
    labelNameVar = "Topic" if self.labelName == "" else self.labelName

    # NOTE(review): bypass is called even when inputSeg is None --
    # presumably callers guarantee an input here; confirm.
    unlabelled_copy = Segmenter.bypass(self.inputSeg, label="__None__")
    self.outputSeg = Segmenter.concatenate(
        [unlabelled_copy] + labelled_selections,
        merge_duplicates=True,
        label=self.captionTitle,
        import_labels_as=labelNameVar,
    )
def updateTitleList(self):
    """Update the list of titles.

    Rebuilds ``self.filteredTitleSeg`` (the titles matching the current
    filter, or all titles) and ``self.titleLabels`` (one display label per
    title, followed by the author/year/genre specifications that are not
    already implied by the active filter), resets ``self.selectedTitles``
    when the imported URLs are no longer all listed, and notifies the send
    button. Does nothing if ``self.titleSeg`` is None.
    """
    # If titleSeg has not been loaded for some reason, skip.
    if self.titleSeg is None:
        return

    # In Advanced settings mode, get list of selected titles...
    if self.displayAdvancedSettings and self.filterValue != "(all)":
        # The filter value is a literal annotation value, so escape it:
        # regex metacharacters in it must not be interpreted.
        self.filteredTitleSeg, _ = Segmenter.select(
            segmentation=self.titleSeg,
            regex=re.compile(r"^%s$" % re.escape(self.filterValue)),
            annotation_key=self.filterCriterion,
        )
    else:
        self.filteredTitleSeg = self.titleSeg

    def is_shown(criterion):
        """Tell whether `criterion` must be spelled out in the labels."""
        return (
            self.displayAdvancedSettings == False
            or self.filterCriterion != criterion
            or self.filterValue == "(all)"
        )

    # Populate titleLabels list with the titles...
    self.titleLabels = sorted(
        s.annotations["title"] for s in self.filteredTitleSeg
    )

    # Add specification (author, year and genre, depending on criterion)...
    # NOTE(review): labels are sorted but specifications are read from
    # filteredTitleSeg[idx]; this assumes the segmentation is already
    # ordered by title -- confirm.
    titleLabels = self.titleLabels[:]
    for idx, titleLabel in enumerate(titleLabels):
        annotations = self.filteredTitleSeg[idx].annotations
        specs = [
            annotations[criterion]
            for criterion in ("author", "year", "genre")
            if is_shown(criterion)
        ]
        titleLabels[idx] = titleLabel + " (%s)" % "; ".join(specs)
    self.titleLabels = titleLabels

    # Reset selectedTitles if needed...
    listed_urls = set(u.annotations["url"] for u in self.filteredTitleSeg)
    if not set(self.importedURLs).issubset(listed_urls):
        self.selectedTitles = list()
    else:
        # Self-assignment kept on purpose: presumably it triggers the
        # widget's attribute-change machinery -- verify before removing.
        self.selectedTitles = self.selectedTitles

    self.sendButton.settingsChanged()
def updateTitleList(self):
    """Update the list of titles.

    Rebuilds ``self.filteredTitleSeg`` (titles matching the current
    filter, or all titles, with entries sharing the same author and title
    merged into one whose "genre" annotation lists all their genres) and
    ``self.titleLabels`` (one display label per title, followed by the
    author/genre specifications not already implied by the active filter),
    resets ``self.selectedTitles`` when the imported URLs are no longer
    all listed, and notifies the send button. Does nothing if
    ``self.titleSeg`` is None.
    """
    # If titleSeg has not been loaded for some reason, skip.
    if self.titleSeg is None:
        return

    # In Advanced settings mode, get list of selected titles...
    if self.displayAdvancedSettings and self.filterValue != "(all)":
        # The filter value is a literal annotation value, so escape it:
        # regex metacharacters in it must not be interpreted.
        self.filteredTitleSeg, _ = Segmenter.select(
            segmentation=self.titleSeg,
            regex=re.compile(r"^%s$" % re.escape(self.filterValue)),
            annotation_key=self.filterCriterion,
        )
    else:
        self.filteredTitleSeg = self.titleSeg

    # Group the entries that share the same (author, title) pair, so that
    # a title listed under several genres appears only once...
    unique_titles = dict()
    for title in self.filteredTitleSeg:
        title_id = (
            title.annotations["author"],
            title.annotations["title"],
        )
        unique_titles.setdefault(title_id, []).append(title)

    # Keep the first entry of each group and give it a "genre" annotation
    # listing all the genres of the group (sorted, without duplicates)...
    # NOTE(review): this overwrites the annotation of the kept segment in
    # place; if segments are shared with titleSeg, repeated calls may
    # accumulate genres -- confirm.
    new_title_segments = list()
    for equivalent_titles in unique_titles.values():
        representative = equivalent_titles[0]
        title_genres = [t.annotations["genre"] for t in equivalent_titles]
        new_title_segments.append(representative)
        representative.annotations["genre"] = ", ".join(
            sorted(set(title_genres))
        )
    self.filteredTitleSeg = Segmentation(None)
    self.filteredTitleSeg.extend(new_title_segments)

    def is_shown(criterion):
        """Tell whether `criterion` must be spelled out in the labels."""
        return (
            self.displayAdvancedSettings == False
            or self.filterCriterion != criterion
            or self.filterValue == "(all)"
        )

    # Populate titleLabels list with the titles...
    self.titleLabels = sorted(
        s.annotations["title"] for s in self.filteredTitleSeg
    )

    # Add specification (author and genre, depending on criterion)...
    # NOTE(review): labels are sorted but specifications are read from
    # filteredTitleSeg[idx]; this assumes the segmentation is already
    # ordered by title -- confirm.
    titleLabels = self.titleLabels[:]
    for idx, titleLabel in enumerate(titleLabels):
        annotations = self.filteredTitleSeg[idx].annotations
        specs = [
            annotations[criterion]
            for criterion in ("author", "genre")
            if is_shown(criterion)
        ]
        titleLabels[idx] = titleLabel + " (%s)" % "; ".join(specs)
    self.titleLabels = titleLabels

    # Reset selectedTitles if needed...
    listed_urls = set(u.annotations["url"] for u in self.filteredTitleSeg)
    if not set(self.importedURLs).issubset(listed_urls):
        self.selectedTitles = list()
    else:
        # Self-assignment kept on purpose: presumably it triggers the
        # widget's attribute-change machinery -- verify before removing.
        self.selectedTitles = self.selectedTitles

    self.sendButton.settingsChanged()
def sendData(self):
    """(Have LTTL.Segmenter) perform the actual selection.

    Validates the current settings, runs the selection method chosen in
    the interface on ``self.segmentation`` (regex, sample or threshold in
    advanced mode; regex only in basic mode) and sends the results on the
    'Selected data' and 'Discarded data' channels. Whenever a setting is
    missing or invalid, a message is shown in the info box, ``None`` is
    sent on both channels and the method returns early.
    """
    # Check that there's something on input...
    if not self.segmentation:
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Selected data', None, self)
        self.send('Discarded data', None, self)
        return

    # TODO: remove message 'No label was provided.' from docs

    # Advanced settings...
    if self.displayAdvancedSettings:

        # If mode is Regex...
        if self.method == u'Regex':

            # Check that regex is not empty...
            if not self.regex:
                self.infoBox.setText(u'Please enter a regex.', 'warning')
                self.send('Selected data', None, self)
                self.send('Discarded data', None, self)
                return

            # Prepare regex...
            flags = ''
            if self.ignoreCase:
                flags += 'i'
            if self.unicodeDependent:
                flags += 'u'
            if self.multiline:
                flags += 'm'
            if self.dotAll:
                flags += 's'
            regex_string = self.regex
            if flags:
                # Global inline flags must open the pattern: placing them
                # elsewhere is deprecated since Python 3.6 and rejected
                # with re.error from Python 3.11 on.
                regex_string = '(?%s)' % flags + regex_string
            try:
                regex = re.compile(regex_string)
            except re.error as re_error:
                try:
                    message = u'Please enter a valid regex (error: %s).' % \
                        re_error.msg
                except AttributeError:
                    # Error objects without a `msg` attribute.
                    message = u'Please enter a valid regex.'
                self.infoBox.setText(message, 'error')
                self.send('Selected data', None, self)
                self.send('Discarded data', None, self)
                return

            # Get number of iterations...
            num_iterations = len(self.segmentation)

        # Else if mode is Sample...
        elif self.method == u'Sample':

            # Get sample size...
            if self.sampleSizeMode == u'Proportion':
                sampleSize = iround(
                    len(self.segmentation) * (self.samplingRate / 100))
            else:
                sampleSize = self.sampleSize
            if sampleSize <= 0:
                self.infoBox.setText(
                    message='Please enter a larger sample size',
                    state="error",
                )
                self.send('Selected data', None, self)
                self.send('Discarded data', None, self)
                return

            # Get number of iterations...
            num_iterations = len(self.segmentation)

        # Else if mode is Threshold...
        elif self.method == u'Threshold':

            # Get min and max count...
            if self.thresholdMode == u'Proportion':
                minCount = iround(
                    math.ceil(
                        len(self.segmentation)
                        * (self.minProportion / 100)))
                maxCount = iround(
                    math.floor(
                        len(self.segmentation)
                        * (self.maxProportion / 100)))
            else:
                minCount = self.minCount
                maxCount = self.maxCount

            # A disabled threshold falls back to the widest possible bound.
            if not self.applyMinThreshold:
                minCount = 1
            if not self.applyMaxThreshold:
                maxCount = len(self.segmentation)

            # Get number of iterations...
            num_iterations = len(self.segmentation)

        # Check that autoNumberKey is not empty (if necessary)...
        if self.autoNumber:
            if self.autoNumberKey:
                autoNumberKey = self.autoNumberKey
            else:
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning')
                self.send('Selected data', None, self)
                self.send('Discarded data', None, self)
                return
        else:
            autoNumberKey = None

        # Perform selection...
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=num_iterations)
        if self.method == u'Regex':
            # The '(none)' entry means "match the segment content".
            regexAnnotationKeyParam = self.regexAnnotationKey
            if regexAnnotationKeyParam == u'(none)':
                regexAnnotationKeyParam = None
            (selected_data, discarded_data) = Segmenter.select(
                segmentation=self.segmentation,
                regex=regex,
                mode=self.regexMode.lower(),
                annotation_key=regexAnnotationKeyParam or None,
                label=self.captionTitle,
                copy_annotations=self.copyAnnotations,
                auto_number_as=autoNumberKey,
                progress_callback=progressBar.advance,
            )
        elif self.method == u'Sample':
            (selected_data, discarded_data) = Segmenter.sample(
                segmentation=self.segmentation,
                sample_size=sampleSize,
                mode='random',
                label=self.captionTitle,
                copy_annotations=self.copyAnnotations,
                auto_number_as=autoNumberKey,
                progress_callback=progressBar.advance,
            )
        elif self.method == u'Threshold':
            if ((minCount == 1 or not self.applyMinThreshold)
                    and (maxCount == len(self.segmentation)
                         or not self.applyMaxThreshold)):
                # With these bounds the input is passed through as is and
                # nothing is discarded.
                selected_data = Segmenter.bypass(
                    segmentation=self.segmentation,
                    label=self.captionTitle,
                )
                discarded_data = None
            else:
                thresholdAnnotationKeyParam = self.thresholdAnnotationKey
                if thresholdAnnotationKeyParam == u'(none)':
                    thresholdAnnotationKeyParam = None
                (selected_data, discarded_data) = Segmenter.threshold(
                    segmentation=self.segmentation,
                    annotation_key=(thresholdAnnotationKeyParam or None),
                    min_count=minCount,
                    max_count=maxCount,
                    label=self.captionTitle,
                    copy_annotations=self.copyAnnotations,
                    auto_number_as=autoNumberKey,
                    progress_callback=progressBar.advance,
                )

    # Basic settings:
    else:

        # Check that regex is not empty...
        if not self.regex:
            self.infoBox.setText(u'Please enter a regex.', 'warning')
            self.send('Selected data', None, self)
            self.send('Discarded data', None, self)
            return

        # Get number of iterations...
        num_iterations = len(self.segmentation)

        # Perform selection...
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=num_iterations)
        regexAnnotationKeyParam = self.regexAnnotationKey
        if regexAnnotationKeyParam == u'(none)':
            regexAnnotationKeyParam = None
        try:
            (selected_data, discarded_data) = Segmenter.select(
                segmentation=self.segmentation,
                # The inline flag is placed first (see the advanced-mode
                # comment): trailing global flags fail on Python 3.11+.
                regex=re.compile('(?u)' + self.regex),
                mode=self.regexMode.lower(),
                annotation_key=regexAnnotationKeyParam or None,
                label=self.captionTitle,
                copy_annotations=True,
                auto_number_as=None,
                progress_callback=progressBar.advance,
            )
        except re.error as re_error:
            try:
                message = u'Please enter a valid regex (error: %s).' % \
                    re_error.msg
            except AttributeError:
                message = u'Please enter a valid regex.'
            self.infoBox.setText(message, 'error')
            self.send('Selected data', None, self)
            self.send('Discarded data', None, self)
            # Restore the interface before bailing out.
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

    progressBar.finish()
    self.controlArea.setDisabled(False)

    # Report the number of selected segments ('@p' is resolved by
    # pluralize)...
    message = u'%i segment@p sent to output.' % len(selected_data)
    message = pluralize(message, len(selected_data))
    self.infoBox.setText(message)

    self.send('Selected data', selected_data, self)
    self.send('Discarded data', discarded_data, self)
    self.sendButton.resetSettingsChangedFlag()
def setUp(self):
    """Prepare reference crosstabs and Processor outputs for "un texte".

    For each of the six scenarios (window/context, with or without
    secondary units, with or without annotation) the following attributes
    are set, where ``<p>`` is the scenario prefix (``window_woa``,
    ``window_wa``, ``context_wos_woa``, ``context_wos_wa``,
    ``context_ws_woa``, ``context_ws_wa``): ``<p>_row_ids``,
    ``<p>_col_ids``, ``<p>_values``, ``<p>_header_row_id``,
    ``<p>_header_row_type``, ``<p>_header_col_id``,
    ``<p>_header_col_type``, ``<p>_col_type`` and ``<p>_ref``.
    The matching ``output_cooc_in_*`` attributes hold the tables computed
    by Processor.
    """
    text = Input("un texte")
    words = Segmenter.tokenize(
        text,
        [(re.compile(r'\w+'), 'tokenize')],
        import_annotations=False,
    )
    # Every letter is first typed 'C'; vowels are then re-typed 'V' and
    # duplicates are merged.
    letters = Segmenter.tokenize(
        text,
        [
            (re.compile(r'\w'), 'tokenize', {'type': 'C'}),
            (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )
    vowels, consonants = Segmenter.select(
        letters,
        re.compile(r'V'),
        annotation_key='type',
    )

    def square_values(ids, rows):
        """Map (row id, col id) pairs to the counts of a square matrix."""
        return {
            (row_id, col_id): count
            for row_id, counts in zip(ids, rows)
            for col_id, count in zip(ids, counts)
        }

    def store_reference(prefix, row_ids, col_ids, values):
        """Set the ``<prefix>_*`` fixture attributes and reference table."""
        fields = [
            ('row_ids', row_ids),
            ('col_ids', col_ids),
            ('values', values),
            ('header_row_id', '__unit__'),
            ('header_row_type', 'string'),
            ('header_col_id', '__unit2__'),
            ('header_col_type', 'string'),
            ('col_type', {col_id: 'continuous' for col_id in col_ids}),
        ]
        for suffix, value in fields:
            setattr(self, '%s_%s' % (prefix, suffix), value)
        reference = IntPivotCrosstab(*[value for _, value in fields])
        setattr(self, prefix + '_ref', reference)

    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and without annotation (woa):
    store_reference(
        'window_woa',
        ['u', 'n', 't', 'e', 'x'],
        ['u', 'n', 't', 'e', 'x'],
        square_values(
            ['u', 'n', 't', 'e', 'x'],
            [
                [1, 1, 1, 0, 0],
                [1, 2, 2, 1, 0],
                [1, 2, 5, 4, 3],
                [0, 1, 4, 4, 3],
                [0, 0, 3, 3, 3],
            ],
        ),
    )

    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and with annotation (wa):
    store_reference(
        'window_wa',
        ['C', 'V'],
        ['C', 'V'],
        square_values(['C', 'V'], [[5, 5], [5, 5]]),
    )

    # Create the cooccurrence matrix for cooccurrence in context
    # without the secondary unit (wos) and without annotation (woa):
    store_reference(
        'context_wos_woa',
        ['u', 'n', 't', 'e', 'x'],
        ['u', 'n', 't', 'e', 'x'],
        square_values(
            ['u', 'n', 't', 'e', 'x'],
            [
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
            ],
        ),
    )

    # Create the cooccurrence matrix for cooccurrence in context
    # without the secondary unit (wos) and with annotation (wa):
    store_reference(
        'context_wos_wa',
        ['V', 'C'],
        ['V', 'C'],
        square_values(['V', 'C'], [[2, 2], [2, 2]]),
    )

    # Create the cooccurrence matrix for cooccurrence in context
    # with the secondary unit (ws) and without annotation (woa):
    store_reference(
        'context_ws_woa',
        ['n', 't', 'x'],
        ['u', 'e'],
        {
            ('n', 'u'): 1,
            ('n', 'e'): 0,
            ('t', 'u'): 0,
            ('t', 'e'): 1,
            ('x', 'u'): 0,
            ('x', 'e'): 1,
        },
    )

    # Create the cooccurrence matrix for cooccurrence in context
    # with the secondary unit (ws) and with annotation (wa):
    store_reference(
        'context_ws_wa',
        ['C'],
        ['V'],
        {('C', 'V'): 2},
    )

    # Tables actually computed by Processor, to compare with the above.
    typed_letters = {'segmentation': letters, 'annotation_key': 'type'}
    self.output_cooc_in_window_woa = Processor.cooc_in_window(
        units={'segmentation': letters},
        window_size=3,
    )
    self.output_cooc_in_window_wa = Processor.cooc_in_window(
        units=typed_letters,
        window_size=3,
    )
    self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
        units={'segmentation': letters},
        contexts={'segmentation': words},
        units2=None,
    )
    self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
        units={'segmentation': letters, 'annotation_key': 'type'},
        contexts={'segmentation': words},
        units2=None,
    )
    self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
        units={'segmentation': vowels},
        contexts={'segmentation': words},
        units2={'segmentation': consonants},
    )
    self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
        units={'segmentation': vowels, 'annotation_key': 'type'},
        contexts={'segmentation': words},
        units2={'segmentation': consonants, 'annotation_key': 'type'},
    )
def setUp(self):
    """Prepare reference crosstabs and Processor outputs for "un texte".

    For each of the six scenarios (window/context, with or without
    secondary units, with or without annotation) the following attributes
    are set, where ``<p>`` is the scenario prefix (``window_woa``,
    ``window_wa``, ``context_wos_woa``, ``context_wos_wa``,
    ``context_ws_woa``, ``context_ws_wa``): ``<p>_row_ids``,
    ``<p>_col_ids``, ``<p>_values``, ``<p>_header_row_id``,
    ``<p>_header_row_type``, ``<p>_header_col_id``,
    ``<p>_header_col_type``, ``<p>_col_type`` and ``<p>_ref``.
    The matching ``output_cooc_in_*`` attributes hold the tables computed
    by Processor.
    """
    # Show complete diffs when a comparison fails.
    self.maxDiff = None

    text = Input("un texte")
    words = Segmenter.tokenize(
        text,
        [(re.compile(r'\w+'), 'tokenize')],
        import_annotations=False,
    )
    # Every letter is first typed 'C'; vowels are then re-typed 'V' and
    # duplicates are merged.
    letters = Segmenter.tokenize(
        text,
        [
            (re.compile(r'\w'), 'tokenize', {'type': 'C'}),
            (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )
    vowels, consonants = Segmenter.select(
        letters,
        re.compile(r'V'),
        annotation_key='type',
    )

    def square_values(ids, rows):
        """Map (row id, col id) pairs to the counts of a square matrix."""
        return {
            (row_id, col_id): count
            for row_id, counts in zip(ids, rows)
            for col_id, count in zip(ids, counts)
        }

    def store_reference(prefix, header_id, row_ids, col_ids, values):
        """Set the ``<prefix>_*`` fixture attributes and reference table.

        `header_id` is used for both the row and the column header id.
        """
        fields = [
            ('row_ids', row_ids),
            ('col_ids', col_ids),
            ('values', values),
            ('header_row_id', header_id),
            ('header_row_type', 'string'),
            ('header_col_id', header_id),
            ('header_col_type', 'string'),
            ('col_type', {col_id: 'continuous' for col_id in col_ids}),
        ]
        for suffix, value in fields:
            setattr(self, '%s_%s' % (prefix, suffix), value)
        reference = IntPivotCrosstab(*[value for _, value in fields])
        setattr(self, prefix + '_ref', reference)

    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and without annotation (woa):
    store_reference(
        'window_woa',
        '__unit__',
        ['u', 'n', 't', 'e', 'x'],
        ['u', 'n', 't', 'e', 'x'],
        square_values(
            ['u', 'n', 't', 'e', 'x'],
            [
                [1, 1, 1, 0, 0],
                [1, 2, 2, 1, 0],
                [1, 2, 5, 4, 3],
                [0, 1, 4, 4, 3],
                [0, 0, 3, 3, 3],
            ],
        ),
    )

    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and with annotation (wa):
    store_reference(
        'window_wa',
        '__unit__',
        ['C', 'V'],
        ['C', 'V'],
        square_values(['C', 'V'], [[5, 5], [5, 5]]),
    )

    # Create the cooccurrence matrix for cooccurrence in context
    # without the secondary unit (wos) and without annotation (woa):
    store_reference(
        'context_wos_woa',
        '__context__',
        ['u', 'n', 't', 'e', 'x'],
        ['u', 'n', 't', 'e', 'x'],
        square_values(
            ['u', 'n', 't', 'e', 'x'],
            [
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
            ],
        ),
    )

    # Create the cooccurrence matrix for cooccurrence in context
    # without the secondary unit (wos) and with annotation (wa):
    store_reference(
        'context_wos_wa',
        '__context__',
        ['V', 'C'],
        ['V', 'C'],
        square_values(['V', 'C'], [[2, 2], [2, 2]]),
    )

    # Create the cooccurrence matrix for cooccurrence in context
    # with the secondary unit (ws) and without annotation (woa):
    store_reference(
        'context_ws_woa',
        '__context__',
        ['n', 't', 'x'],
        ['u', 'e'],
        {
            ('n', 'u'): 1,
            ('n', 'e'): 0,
            ('t', 'u'): 0,
            ('t', 'e'): 1,
            ('x', 'u'): 0,
            ('x', 'e'): 1,
        },
    )

    # Create the cooccurrence matrix for cooccurrence in context
    # with the secondary unit (ws) and with annotation (wa):
    store_reference(
        'context_ws_wa',
        '__context__',
        ['C'],
        ['V'],
        {('C', 'V'): 2},
    )

    # Tables actually computed by Processor, to compare with the above.
    self.output_cooc_in_window_woa = Processor.cooc_in_window(
        units={'segmentation': letters},
        window_size=3,
    )
    self.output_cooc_in_window_wa = Processor.cooc_in_window(
        units={'segmentation': letters, 'annotation_key': 'type'},
        window_size=3,
    )
    self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
        units={'segmentation': letters},
        contexts={'segmentation': words},
        units2=None,
    )
    self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
        units={'segmentation': letters, 'annotation_key': 'type'},
        contexts={'segmentation': words},
        units2=None,
    )
    self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
        units={'segmentation': vowels},
        contexts={'segmentation': words},
        units2={'segmentation': consonants},
    )
    self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
        units={'segmentation': vowels, 'annotation_key': 'type'},
        contexts={'segmentation': words},
        units2={'segmentation': consonants, 'annotation_key': 'type'},
    )