Esempio n. 1
0
 def setUp(self):
     """Build the fixtures shared by the tests.

     Two Input objects are created ('ab cde' and 'd'), along with three
     segmentations that address the first one: its first word (carrying an
     annotation), its last word, and each of its six characters.
     """
     self.entire_text_seg = Input('ab cde')
     self.other_entire_text_seg = Input('d')
     text_index = self.entire_text_seg[0].str_index

     first_word = Segment(
         str_index=text_index,
         start=0,
         end=2,
         annotations={'a': 1},
     )
     self.first_word_seg = Segmentation([first_word])

     last_word = Segment(str_index=text_index, start=3, end=6)
     self.last_word_seg = Segmentation([last_word])

     # One single-character segment for each position of 'ab cde'.
     self.char_seg = Segmentation([
         Segment(str_index=text_index, start=pos, end=pos + 1)
         for pos in range(6)
     ])
Esempio n. 2
0
    def setUp(self):
        """Prepare the input, the segmentations and the expected dump.

        ``word_seg`` holds the two words of 'ab cde' (the first one is
        annotated), ``overlapping_seg`` holds two segments sharing a
        character, and ``base_output_string`` is the text that the tests
        compare against the default rendering of ``word_seg``.
        """
        self.entire_text_seg = Input('ab cde')
        index = self.entire_text_seg[0].str_index
        self.str_index = index

        annotated_word = Segment(
            str_index=index,
            start=0,
            end=2,
            annotations={'a': '1', 'bc': '20'},
        )
        plain_word = Segment(str_index=index, start=3, end=6)
        self.word_seg = Segmentation([annotated_word, plain_word])

        # Segments [3, 5) and [4, 6) overlap on position 4.
        self.overlapping_seg = Segmentation([
            Segment(str_index=index, start=begin, end=begin + 2)
            for begin in (3, 4)
        ])

        template = ('segment number 1\n'
                    '\tcontent:\t"ab"\n'
                    '\tstr_index:\t%i\n'
                    '\tstart:\t0\n'
                    '\tend:\t2\n'
                    '\tannotations:\n'
                    '\t\ta                    1\n'
                    '\t\tbc                   20\n'
                    'segment number 2\n'
                    '\tcontent:\t"cde"\n'
                    '\tstr_index:\t%i\n'
                    '\tstart:\t3\n'
                    '\tend:\t6')
        self.base_output_string = template % (index, index)

        # Counter incremented by the progress callback test.
        self.count = 0
Esempio n. 3
0
    def sendData(self):
        """Summarize the input and send plain and HTML results to output.

        Depending on ``self.typeSeg``, either every input segment is
        summarized on its own or all segment contents are joined with a
        space and summarized once. Each summary yields one segment in the
        'Summary' output and one in the 'HTML_Summary' output.
        """
        # Without a language model there is nothing to do except warn.
        if not self.model:
            self.noLanguageModelWarning()
            return

        # Propagate the absence of input to both output channels.
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            for channel in ('Summary', 'HTML_Summary'):
                self.send(channel, None, self)
            return

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)

        # Decide which texts have to be summarized. The per-segment case
        # stays lazy so that contents are read one at a time.
        mode = self.typeSeg
        if mode == "Summarize each segments individually":
            texts = (segment.get_content() for segment in self.inputSeg)
        elif mode == "Summarize all segments as one":
            texts = [
                " ".join([segment.get_content() for segment in self.inputSeg])
            ]
        else:
            texts = []

        plain_segments = []
        html_segments = []
        for text in texts:
            resume, html_resume = self.summarize(self.cv, text)
            plain_segments.append(Segment(str_index=resume[0].str_index))
            html_segments.append(Segment(str_index=html_resume[0].str_index))

        # Wrap the collected segments and publish them.
        self.outputSeg = Segmentation(plain_segments, self.captionTitle)
        self.html_outputSeg = Segmentation(html_segments, self.captionTitle)
        self.send("Summary", self.outputSeg, self)
        self.send('HTML_Summary', self.html_outputSeg, self)

        # Report how many segments were sent.
        count = len(self.outputSeg)
        message = pluralize("%i segment@p sent to output " % count, count)
        self.infoBox.setText(message)

        self.sendButton.resetSettingsChangedFlag()
        self.controlArea.setDisabled(False)
Esempio n. 4
0
 def test_clear_string(self):
     """Check that clear() resets the most recently stored string to None."""
     text_input = Input('test3')
     text_input.clear()
     stored = Segmentation.get_data(-1)
     self.assertEqual(
         stored,
         None,
         msg="clear doesn't set stored string to None!",
     )
Esempio n. 5
0
 def test_update_string(self):
     """Check that update() replaces the most recently stored string."""
     text_input = Input('test2')
     text_input.update('modified')
     stored = Segmentation.get_data(-1)[:]
     self.assertEqual(
         stored,
         'modified',
         msg="update doesn't modify stored string!",
     )
    def send_data(self):
        """Turn the fetched texts into Inputs, annotate them and send them.

        One Input is created per text of every query in ``self.queryList``.
        A single Input is sent as is; otherwise the Inputs are concatenated.
        The annotation dictionaries of ``self.annotList`` are applied to the
        segments in order. When no segment results, None is sent and a
        warning is displayed.
        """
        self.controlArea.setDisabled(True)
        self.clearCreatedInputs()

        # One Input per text, for every query of the basket.
        for query in self.queryList:
            for text in query:
                self.createdInputs.append(Input(text))

        inputs = self.createdInputs
        if len(inputs) == 1:
            # A lone Input is already a segmentation.
            segmentation = inputs[0]
        else:
            segmentation = Segmenter.concatenate(
                inputs,
                import_labels_as=None,
            )

        # Flatten the annotation dictionaries and apply them by position.
        annotations = [dic for elem in self.annotList for dic in elem]
        for position, segment in enumerate(segmentation):
            segment.annotations.update(annotations[position])
            segmentation[position] = segment

        # Total length of the strings the segments refer to.
        num_chars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in segmentation
        )

        num_segments = len(segmentation)
        if num_segments == 0:
            # Nothing to send: tell the user and propagate None.
            self.infoBox.setText(
                "There are {} segments to send to output. Please fill the query basket and click 'send' again"
                .format(num_segments), "warning")
            self.sendButton.resetSettingsChangedFlag()
            self.controlArea.setDisabled(False)
            self.send("Segmentation", None)
            return

        # Report the size of the data, then send it.
        self.infoBox.setText(
            "{} segments sent to output ({} characters)".format(
                num_segments,
                num_chars,
            ))
        self.send("Segmentation", segmentation)
        self.controlArea.setDisabled(False)
        self.sendButton.resetSettingsChangedFlag()
Esempio n. 7
0
 def test_slice_string(self):
     """Check that slicing stored data matches slicing a plain string."""
     Input('Hello world!')
     stored_slice = Segmentation.get_data(-1)[3:7]
     expected_slice = u"Hello world!"[3:7]
     self.assertEqual(
         stored_slice,
         expected_slice,
         msg="slicing doesn't return the same as in strings",
     )
Esempio n. 8
0
 def test_creator_store_string(self):
     """Check that creating an Input stores its string at class level."""
     Input(u'test')
     stored = Segmentation.get_data(-1)[:]
     self.assertEqual(
         stored,
         u'test',
         msg="creator doesn't store string in class variable!",
     )
Esempio n. 9
0
 def test_update_string(self):
     """Check that update() replaces the most recently stored string."""
     text_input = Input(u'test2')
     text_input.update(u'modified')
     stored = Segmentation.get_data(-1)[:]
     self.assertEqual(
         stored,
         u'modified',
         msg="update doesn't modify stored string!",
     )
Esempio n. 10
0
 def test_clear_string(self):
     """Check that clear() resets the most recently stored string to None."""
     text_input = Input('test3')
     text_input.clear()
     stored = Segmentation.get_data(-1)
     self.assertEqual(
         stored,
         None,
         msg="clear doesn't set stored string to None!",
     )
Esempio n. 11
0
    def setUp(self):
        """Create the fixtures used throughout the test case.

        Sets up an Input on 'ab cde', a two-word segmentation whose first
        word is annotated, a segmentation with two overlapping segments,
        the expected default rendering of the word segmentation, and a
        counter used by the progress callback test.
        """
        source = Input('ab cde')
        self.entire_text_seg = source
        self.str_index = source[0].str_index

        def make_segment(start, end, **extra):
            """Build a segment addressing the test string."""
            return Segment(
                str_index=self.str_index,
                start=start,
                end=end,
                **extra
            )

        self.word_seg = Segmentation([
            make_segment(0, 2, annotations={'a': '1', 'bc': '20'}),
            make_segment(3, 6),
        ])
        self.overlapping_seg = Segmentation([
            make_segment(3, 5),
            make_segment(4, 6),
        ])

        template = (
            'segment number 1\n'
            '\tcontent:\t"ab"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t0\n'
            '\tend:\t2\n'
            '\tannotations:\n'
            '\t\ta                    1\n'
            '\t\tbc                   20\n'
            'segment number 2\n'
            '\tcontent:\t"cde"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t3\n'
            '\tend:\t6'
        )
        self.base_output_string = template % (self.str_index, self.str_index)

        self.count = 0
Esempio n. 12
0
    def sendData(self):
        """Compute result of widget processing and send to output.

        When there is no input, the header list is emptied and None is sent
        on the 'CSV Segmentation' channel. Otherwise the segments stored in
        ``self.csvSeg`` are wrapped in a Segmentation labelled with
        ``self.captionTitle`` and sent, and the info box reports how many
        segments were sent and how many were ignored for lack of content
        (``self.contentIsNone``).
        """
        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            # Empty the list in place, then re-assign it to itself.
            # NOTE(review): the self-assignment presumably notifies the GUI
            # bindings of the change -- confirm.
            del self.headerList[:]
            self.headerList = self.headerList
            self.send("CSV Segmentation", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        # No per-segment processing happens here at the moment: the loop
        # only advances the progress bar once per stored segment.
        for _ in self.csvSeg:
            progressBar.advance()

        # Set status to OK and report data size...
        outputSeg = Segmentation(self.csvSeg, label=self.captionTitle)
        numSent = len(outputSeg)
        numIgnored = len(self.contentIsNone)
        message = "%i segment@p sent to output." % numSent
        # Mention segments that had no content and were ignored. The text is
        # built from single-line literals so that no indentation whitespace
        # ends up inside the displayed message.
        if numIgnored == 1:
            message += " (ignored %i segment with no content)" % numIgnored
        elif numIgnored > 1:
            message += " (ignored %i segments with no content)" % numIgnored
        message = pluralize(message, numSent)
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Send data to output...
        self.send("CSV Segmentation", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()
Esempio n. 13
0
 def clearCreatedInputs(self):
     """Release the strings of all created Inputs and empty the list.

     The stored data of each Input's first segment is set to None, then
     ``self.createdInputs`` is emptied in place.
     """
     for created in self.createdInputs:
         first_segment = created[0]
         Segmentation.set_data(first_segment.str_index, None)
     self.createdInputs[:] = []
Esempio n. 14
0
    def sendData(self):
        """Fetch data from the selected service and send it to output.

        Previously created Inputs are cleared, then new ones are requested
        from Twitter, Wikipedia or Bing depending on ``self.service``. The
        resulting Inputs are concatenated into one segmentation, which is
        sent on the 'Text data' channel.

        Returns:
            False when nothing could be sent (Twitter authentication or
            request error, search limit exceeded, or no result); None after
            a successful send.
        """
        # Clear created Inputs
        self.clearCreatedInputs()

        if self.service == u'Twitter':
            # Credentials are passed as (key, secret, (token, token secret)).
            licenseKeys = (
                self.twitterLicenseKeysConsumerKey,
                self.twitterLicenseKeysConsumerSecret,
                (
                    self.twitterLicenseKeysAccessToken,
                    self.twitterLicenseKeysAccessTokenSecret
                )
            )
            try:
                self.createdInputs = self.get_tweets(
                    self.word_to_search,
                    self.nb_tweet,
                    self.include_RT,
                    self.useTwitterLicenseKey,
                    licenseKeys
                )
            except (HTTP401Authentication, HTTP400BadRequest):
                self.infoBox.setText(
                    u'Please enter valid Twitter api keys.',
                    u'error',
                )
                self.send(u'Text data', None, self)
                return False
            except SearchEngineLimitError:
                self.infoBox.setText(
                    u'Twitter search limit has been exceeded.',
                    u'error',
                )
                self.send(u'Text data', None, self)
                return False

        elif self.service == u'Wikipedia':
            self.createdInputs = self.get_wiki_article(
                self.word_to_search,
                self.wiki_section,
                self.wiki_type_of_text
            )

        elif self.service == u'Bing':
            self.createdInputs = self.get_bing_entries(
                self.word_to_search,
                self.nb_bing_entry
            )

        # Nothing was retrieved (or the service is not one of the above).
        if len(self.createdInputs) == 0:
            self.infoBox.setText(
                u'Please try to change query or settings.',
                u'warning',
            )
            self.send(u'Text data', None, self)
            return False

        # Initialize progress bar
        progressBar = OWGUI.ProgressBar(
            self,
            iterations=50
        )

        output_segmentation = Segmenter.concatenate(
            self.createdInputs,
            self.captionTitle,
            import_labels_as=None
        )

        # Report the number of segments and of characters sent.
        numSegments = len(output_segmentation)
        message = u'%i segment@p sent to output ' % numSegments
        message = pluralize(message, numSegments)
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in output_segmentation
        )
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        # The bar is simply run through its 50 steps. range() is used
        # instead of xrange() so the loop works on Python 2 and Python 3.
        for _ in range(50):
            progressBar.advance()

        # Clear progress bar.
        progressBar.finish()

        self.send('Text data', output_segmentation, self)

        self.sendButton.resetSettingsChangedFlag()
Esempio n. 15
0
    def sendData(self):
        """Tag the input with TreeTagger and send the result to output.

        The annotations of each input segment are serialized into a
        ``tt_xb`` annotation, the segments are rendered as ``<xb_tt>``
        elements, and the resulting text is passed to ``self.tag``. The
        tagged text is re-segmented on ``xb_tt``, recoded so that each
        tagged line becomes a ``<w>`` element, and sent on the 'Text data'
        channel, either as is (XML option active) or re-segmented on ``w``.
        The TreeTagger link is also written to ``treetagger_link.txt``.
        None is sent when the TreeTagger link is missing or when there is
        no input.
        """
        # The link to TreeTagger was not found.
        if self.NoLink:
            self.infoBox.setText(u"Sorry, TreeTagger's link not found.",
                                 "error")
            self.send("Text data", None)
            return

        # Important: if input data is None, propagate this value to output...
        if not self.inputData:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Text data", None)
            return

        # Show that something is happening...
        self.infoBox.setText(u"TreeTagger is running...", "warning")

        # Initialize progress bar.
        # NOTE(review): the bar is sized for 5 steps but only advanced 3
        # times below -- confirm the intended number of iterations.
        self.progressBar = gui.ProgressBar(self, iterations=5)

        # Copy the segmentation, adding an annotation that serializes the
        # existing annotations as key='value' pairs.
        # NOTE(review): 'tt_xb' is stored on the segments yielded by the
        # input segmentation; if those are not copies, the upstream data is
        # modified -- confirm this is intended.
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.inputData.label
        for segment in self.inputData:
            attr = " ".join(
                ["%s='%s'" % item for item in segment.annotations.items()])
            segment.annotations["tt_xb"] = attr
            copy_of_input_seg.append(segment)

        self.progressBar.advance()

        concatenated_text = copy_of_input_seg.to_string(
            formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
            display_all=True,
        )

        self.progressBar.advance()

        tagged_text = self.tag(concatenated_text)
        tagged_input = Input(tagged_text)
        tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

        self.progressBar.advance()

        # The recoding is the same whether or not the XML option is active:
        # each "token<TAB>type<TAB>lemma" line becomes a <w> element.
        recoded_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 "<w lemma='&3' type='&2'>&1</w>"),
                (re.compile(r'"""'), '"&quot;"'),
            ],
        )
        # Explicit comparison kept on purpose: the setting is only treated
        # as active when it equals True.
        if self.activer_xml == True:
            # XML option active: output the recoded segmentation as is.
            final_segmentation = recoded_segmentation
        else:
            # XML option inactive: output one segment per <w> element.
            final_segmentation = Segmenter.import_xml(
                recoded_segmentation, "w")

        self.infoBox.dataSent("")

        # Save the TreeTagger link...
        if self.system == "nt":
            link_path = "treetagger_link.txt"
        else:
            link_path = os.path.normpath("/Users/" + self.user +
                                         "/treetagger_link.txt")
        # The context manager closes the file even if the write fails.
        with open(link_path, "w") as link_file:
            link_file.write(self.treetagger_link)

        # Clear progress bar.
        self.progressBar.finish()

        # Send the segmentation.
        self.send("Text data", final_segmentation, self)
        self.compteur += 1
        self.sendButton.resetSettingsChangedFlag()
Esempio n. 16
0
class TestSegmentation(unittest.TestCase):
    """Test suite for LTTL Segment module"""

    def setUp(self):
        """Create the input, segmentations and expected rendering."""
        self.entire_text_seg = Input('ab cde')
        index = self.entire_text_seg[0].str_index
        self.str_index = index

        first_word = Segment(
            str_index=index,
            start=0,
            end=2,
            annotations={'a': '1', 'bc': '20'},
        )
        second_word = Segment(str_index=index, start=3, end=6)
        self.word_seg = Segmentation([first_word, second_word])

        # Two segments sharing position 4.
        self.overlapping_seg = Segmentation([
            Segment(str_index=index, start=3, end=5),
            Segment(str_index=index, start=4, end=6),
        ])

        template = (
            'segment number 1\n'
            '\tcontent:\t"ab"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t0\n'
            '\tend:\t2\n'
            '\tannotations:\n'
            '\t\ta                    1\n'
            '\t\tbc                   20\n'
            'segment number 2\n'
            '\tcontent:\t"cde"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t3\n'
            '\tend:\t6'
        )
        self.base_output_string = template % (index, index)

        # Incremented by the progress callback test.
        self.count = 0

    def tearDown(self):
        """Nothing to clean up after a test."""

    def test_creator(self):
        """Check that the constructor yields a Segmentation instance."""
        created = Segmentation()
        self.assertIsInstance(
            created,
            Segmentation,
            msg="creator doesn't return Segmentation object!",
        )

    def test_to_string_default_format(self):
        """Check the default rendering produced by to_string()."""
        rendered = self.word_seg.to_string()
        self.assertEqual(
            rendered,
            self.base_output_string,
            msg="to_string() doesn't format segmentation correctly by default!",
        )

    def test_to_string_header(self):
        """Check that to_string() puts the header before the rendering."""
        rendered = self.word_seg.to_string(header='HEADER')
        expected = 'HEADER' + self.base_output_string
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format header correctly!",
        )

    def test_to_string_footer(self):
        """Check that to_string() puts the footer after the rendering."""
        rendered = self.word_seg.to_string(footer='FOOTER')
        expected = self.base_output_string + 'FOOTER'
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format footer correctly!",
        )

    def test_to_string_humanize_addresses(self):
        """Check that humanized addresses are shifted by one."""
        rendered = self.word_seg.to_string(humanize_addresses=True)

        # Shift the start positions and the string index in the expected
        # text; the replacements are applied in this exact order.
        replacements = (
            ('t:\t3', 't:\t4'),
            ('t:\t0', 't:\t1'),
            ('x:\t%i' % self.str_index, 'x:\t%i' % (self.str_index + 1)),
        )
        expected = self.base_output_string
        for old, new in replacements:
            expected = expected.replace(old, new)

        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't humanize addresses!",
        )

    def test_to_string_interpolate_builtin_variables(self):
        """Check that to_string() fills in the builtin variables."""
        pattern = (
            '%(__num__)s,%(__content__)s,'
            '%(__str_index__)s,%(__start__)s,%(__end__)s,'
            '%(__str_index_raw__)s,%(__start_raw__)s,%(__end_raw__)s'
        )
        rendered = self.word_seg.to_string(formatting=pattern)
        expected = '1,ab,%i,0,2,%i,0,2\n2,cde,%i,3,6,%i,3,6' % (
            (self.str_index,) * 4
        )
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't interpolate builtin variables!",
        )

    def test_to_string_interpolate_annotations(self):
        """Check that to_string() fills in annotation values."""
        rendered = self.word_seg.to_string(formatting='%(a)s')
        self.assertEqual(
            rendered,
            '1\n__none__',
            msg="to_string() doesn't interpolate annotations!",
        )

    def test_to_string_progress(self):
        """Check that to_string() calls the progress callback per segment."""

        def count_call():
            """Record one progress notification."""
            self.count += 1

        self.word_seg.to_string(progress_callback=count_call)
        expected_calls = len(self.word_seg)
        self.assertEqual(
            self.count,
            expected_calls,
            msg="to_string doesn't track progress!",
        )

    def test_get_annotation_keys(self):
        """Check that get_annotation_keys() lists the existing keys."""
        found_keys = self.word_seg.get_annotation_keys()
        self.assertEqual(
            sorted(found_keys),
            sorted(['a', 'bc']),
            msg="get_annotation_keys() doesn't return existing annotations!",
        )

    def test_is_non_overlapping(self):
        """Check that disjoint segments are reported as non-overlapping."""
        result = self.word_seg.is_non_overlapping()
        self.assertTrue(
            result,
            msg="is_non_overlapping() doesn't recognize absence of overlap!",
        )

    def test_is_overlapping(self):
        """Check that overlapping segments are not reported as disjoint."""
        result = self.overlapping_seg.is_non_overlapping()
        self.assertFalse(
            result,
            msg="is_non_overlapping() doesn't recognize presence of overlap!",
        )
    def sendData(self):
        """Load files, create and send segmentation.

        Each selected file is read as raw text, as a PDF (text extraction
        or OCR) or as an image (OCR), depending on the type guessed by
        ``filetype``. The content is normalized to NFC and stored in an
        Input annotated according to the advanced settings. A single Input
        is sent as is; several Inputs are concatenated. None is sent on
        the 'Text data' channel when no file is selected, when the
        auto-numbering key is missing, or when a file cannot be read.
        """
        # Check that there's something on input...
        if ((self.displayAdvancedSettings and not self.files)
                or not (self.file or self.displayAdvancedSettings)):
            self.infoBox.setText(u'Please select input file.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if (self.displayAdvancedSettings and self.autoNumber
                and not self.autoNumberKey):
            self.infoBox.setText(
                u'Please enter an annotation key for auto-numbering.',
                'warning')
            self.send('Text data', None, self)
            return

        # Clear created Inputs...
        self.clearCreatedInputs()

        fileContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFiles = self.files
        else:
            # Basic mode: a single file with default annotation, password
            # and OCR settings.
            myFiles = [[self.file, self.encoding, "", "", "", "eng", False]]

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(myFiles))

        # Open and process each file successively...
        for myFile in myFiles:
            filePath = myFile[0]
            # Drop the parenthesized part that follows the encoding name.
            encoding = re.sub(r"[ ]\(.+", "", myFile[1])
            annotation_key = myFile[2]
            annotation_value = myFile[3]
            # myFile[4] holds the PDF password; it is not used here.
            ocr_languages = myFile[5]
            ocr_force = myFile[6]

            myFiletype = filetype.guess(filePath)

            # NOTE(review): presumably clears a previously displayed error
            # -- confirm.
            self.error()

            # -1 is the value tested below to detect an unreadable file.
            # Starting from it makes a recognised but unsupported file type
            # (neither PDF nor image) follow that same path, instead of
            # leaving the variable unbound or holding the previous file's
            # content.
            fileContent = -1
            try:
                if myFiletype is None:
                    fileContent = self.extract_raw_text(filePath, encoding)

                elif myFiletype.extension == "pdf":
                    # Text is extracted directly unless OCR is forced or
                    # the PDF is not textual.
                    if (ocr_force is not True
                            and self.is_textual_pdf_file(filePath) is True):
                        fileContent = self.extract_text_from_pdf(filePath)
                    else:
                        fileContent = self.get_pdf_content(
                            filePath,
                            ocr_languages,
                        )

                elif myFiletype.extension in IMG_FILETYPES:
                    fileContent = self.ocrize(filePath, ocr_languages)

                if fileContent == -1:
                    # Finish the progress bar on this exit too, as done in
                    # the IOError handler below.
                    progressBar.finish()
                    message = u"Couldn't open file."
                    self.infoBox.setText(message, 'error')
                    self.send('Text data', None, self)
                    self.controlArea.setDisabled(False)
                    return

            except IOError as e:
                # Errors mentioning tesseract are also shown in a dialog.
                if "tesseract" in str(e):
                    QMessageBox.warning(None, 'Textable', str(e),
                                        QMessageBox.Ok)
                progressBar.finish()
                if len(myFiles) > 1:
                    message = u"Couldn't open file '%s'." % filePath
                else:
                    message = u"Couldn't open file."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # Remove utf-8 BOM if necessary...
            if encoding == u'utf-8':
                fileContent = fileContent.lstrip(
                    codecs.BOM_UTF8.decode('utf-8'))

            # Normalize text (canonical decomposition then composition)...
            fileContent = normalize('NFC', fileContent)

            fileContents.append(fileContent)

            # Annotations...
            annotation = dict()
            if self.displayAdvancedSettings:
                if annotation_key and annotation_value:
                    annotation[annotation_key] = annotation_value
                if self.importFilenames and self.importFilenamesKey:
                    filename = os.path.basename(filePath)
                    annotation[self.importFilenamesKey] = filename
                if self.autoNumber and self.autoNumberKey:
                    annotation[self.autoNumberKey] = counter
                    counter += 1
            annotations.append(annotation)
            progressBar.advance()

        # Create an LTTL.Input for each file. The caption is only used as
        # the Input label when there is a single file.
        if len(fileContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for fileContent, annotation in zip(fileContents, annotations):
            myInput = Input(fileContent, label)
            segment = myInput[0]
            segment.annotations.update(annotation)
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(fileContents) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        # Report the number of segments and of characters sent.
        numSegments = len(self.segmentation)
        message = u'%i segment@p sent to output ' % numSegments
        message = pluralize(message, numSegments)
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Esempio n. 18
0
    def sendData(self):
        """Download the selected Gutenberg texts and send them to output.

        Each entry of ``self.myBasket`` is read at index ``[2]`` (row id
        used to look up the Gutenberg book id in the cache) and index
        ``[1]`` (value stored under the "title" annotation). Every
        downloaded text becomes an ``Input``; the output is that single
        Input, or the concatenation of all Inputs when there are several.
        The result is sent on the "Gutenberg importation" channel.
        """
        # Skip if title list is empty:
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some books first", "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(
            self,
            iterations=len(self.myBasket),
        )

        text_content = list()
        annotations = list()
        # get the Gutenberg cache
        cache = GutenbergCache.get_cache()
        try:
            for text in self.myBasket:

                # Translate the cache row id into the Gutenberg book id.
                # NOTE(review): the query is assembled by string formatting;
                # confirm text[2] always comes from the cache itself.
                query_id = cache.native_query(
                    sql_query=
                    "select gutenbergbookid from books where id == {selected_id}"
                    .format(selected_id=text[2]))
                gutenberg_id = list(query_id)

                # Get the text with Gutenbergpy
                # NOTE(review): the strip_headers() result is passed on
                # undecoded - confirm it is text rather than bytes.
                gutenberg_text = gutenbergpy.textget.strip_headers(
                    gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))
                text_content.append(gutenberg_text)

                annotations.append(text[1])
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Gutenberg",
                                 "error")
            # Always close the progress bar before leaving, otherwise the
            # widget stays in its "processing" state.
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store downloaded text strings in input objects...
        for text in text_content:
            newInput = Input(text, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one text, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments (segment idx corresponds to basket entry idx)...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update({"title": annotations[idx]})
            # Write the segment back so the annotation is stored.
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Gutenberg importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Esempio n. 19
0
    def sendData(self):
        """Download reviews for the selected movies and send them to output.

        For every entry of ``self.myBasket`` (read through ``item['id']``)
        the reviews and the movie record are fetched through ``self.ia``.
        Each review's ``'content'`` becomes one ``Input`` annotated with the
        "title" and "year" of its movie. The output is that single Input,
        or the concatenation of all Inputs, sent on the "Segmentation"
        channel.
        """
        # Skip if title list is empty:
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Connect to imdb and add elements in lists
        list_review = list()
        list_annotation = list()
        try:
            for item in self.myBasket:
                list_review.append(self.ia.get_movie_reviews(item['id']))
                list_annotation.append(self.ia.get_movie(item['id']))
                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from imdb", "error")
            # Close the progress bar so the widget does not stay "busy".
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store movie reviews in input objects and build, in lockstep, one
        # annotation dict per review. Pairing them here keeps annotations
        # aligned with segments whatever the number of reviews per movie.
        annotations = list()
        for movie, movie_info in zip(list_review, list_annotation):
            # NOTE(review): "title" receives the whole movie record, not
            # movie_info["title"] - confirm this is intended.
            movie_annotations = {"title": movie_info, "year": movie_info["year"]}
            data = movie.get('data') or {}
            for review in data.get('reviews') or []:
                self.createdInputs.append(Input(review.get('content')))
                annotations.append(movie_annotations)

        # If there's only one item, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            # Write the segment back so the annotation is stored.
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = f"{len(self.segmentation)} segment@p sent to output"
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += " (%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send('Segmentation', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Esempio n. 20
0
    def sendData(self):
        """Tokenize the input with ``self.nlp`` and send the tokens to output.

        Every segment of ``self.inputSeg`` is analyzed with ``self.nlp``;
        each resulting token becomes a new ``Segment`` that points into the
        same string as its parent segment, inherits the parent's
        annotations and adds every non-None token attribute listed in
        ``RELEVANT_KEYS``. The result is sent on the
        "Linguistically analyzed data" channel (``None`` when there is no
        input).
        """

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Linguistically analyzed data", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        tokenizedSegments = list()

        # Process each input segment...
        for segment in self.inputSeg:

            # Input segment attributes...
            inputContent = segment.get_content()
            inputAnnotations = segment.annotations
            inputString = segment.str_index
            # A start of None is treated as offset 0 in the string.
            inputStart = segment.start or 0

            # NLP analysis...
            doc = self.nlp(inputContent)

            # Process each token in input segment...
            for token in doc:
                tokenAnnotations = inputAnnotations.copy()
                for key in RELEVANT_KEYS:
                    value = getattr(token, key)
                    if value is not None:
                        tokenAnnotations[key] = value
                # token.idx is relative to the analyzed content, so shift it
                # by the parent segment's start to address the full string.
                tokenStart = inputStart + token.idx
                tokenizedSegments.append(
                    Segment(
                        str_index=inputString,
                        start=tokenStart,
                        end=tokenStart + len(token),
                        annotations=tokenAnnotations,
                    ))

            progressBar.advance()

        outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSeg)
        message = pluralize(message, len(outputSeg))
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Send data to output...
        self.send("Linguistically analyzed data", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()
Esempio n. 21
0
    def sendData(self):
        """Download the selected Gutenberg texts and send them to output.

        Each entry of ``self.myBasket`` is read at index ``[2]`` (the id
        passed to gutenbergpy) and at indices ``[0]``, ``[1]`` and ``[3]``
        (stored under the "title", "author" and "language" annotations).
        Every downloaded text is decoded as UTF-8 and wrapped in an
        ``Input``; the output is that single Input, or the concatenation of
        all Inputs, sent on the "Gutenberg importation" channel.
        """
        # Skip if title list is empty:
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some books first", "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(
            self,
            iterations=len(self.myBasket),
        )

        text_content = list()
        annotations = list()

        try:
            # Retrieve selected texts from gutenberg
            for text in self.myBasket:

                gutenberg_id = text[2]

                # Get the text with Gutenbergpy
                gutenberg_text = gutenbergpy.textget.strip_headers(
                    gutenbergpy.textget.get_text_by_id(gutenberg_id)).decode(
                        "utf-8")

                text_content.append(gutenberg_text)
                # populate the annotation list
                annotations.append([text[0], text[1], text[3]])
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception as exc:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Gutenberg",
                                 "error")
            # Close the progress bar so the widget does not stay "busy".
            progressBar.finish()
            self.controlArea.setDisabled(False)
            print(exc)
            return

        # Store downloaded text strings in input objects...
        for text in text_content:
            newInput = Input(text, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one text, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation.
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments with book metadata
        for idx, segment in enumerate(self.segmentation):
            title, author, language = annotations[idx]
            segment.annotations.update({
                "title": title,
                "author": author,
                "language": language,
            })
            # Write the segment back so the annotations are stored.
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Gutenberg importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Esempio n. 22
0
    def setUp(self):
        """Build the inputs and segmentations shared by the test cases."""

        def make(idx, start, end, **extra):
            # Thin shortcut: a Segment over string *idx* from start to end,
            # forwarding any extra keyword (e.g. annotations) untouched.
            return Segment(str_index=idx, start=start, end=end, **extra)

        # --- First text: 'ab cde' -------------------------------------
        self.entire_text_seg = Input('ab cde')
        main_idx = self.entire_text_seg[0].str_index

        # Two words, the first one annotated.
        self.word_seg = Segmentation([
            make(main_idx, 0, 2, annotations={'a': '1'}),
            make(main_idx, 3, 6),
        ])
        # One segment per character, space included.
        self.char_seg = Segmentation(
            [make(main_idx, pos, pos + 1) for pos in range(6)]
        )
        # Letters of the first word.
        self.letter_seg1 = Segmentation([
            make(main_idx, 0, 1, annotations={'a': '1'}),
            make(main_idx, 1, 2),
        ])
        # Letters of the second word.
        self.letter_seg2 = Segmentation([
            make(main_idx, 3, 4),
            make(main_idx, 4, 5, annotations={'b': '2'}),
            make(main_idx, 5, 6),
        ])
        # Letters of both words (the space is left out).
        self.letter_seg = Segmentation([
            make(main_idx, 0, 1, annotations={'a': '1'}),
            make(main_idx, 1, 2),
            make(main_idx, 3, 4),
            make(main_idx, 4, 5, annotations={'b': '2'}),
            make(main_idx, 5, 6),
        ])
        self.single_letter_seg = Segmentation([
            make(main_idx, 4, 5, annotations={'b': '1'}),
        ])
        # Twice the same span.
        self.duplicate_seg = Segmentation(
            [make(main_idx, 0, 1) for _ in range(2)]
        )
        # Two spans sharing position 4.
        self.overlapping_seg = Segmentation([
            make(main_idx, 3, 5),
            make(main_idx, 4, 6),
        ])

        # --- Second text: 'abbccc' ------------------------------------
        self.other_entire_text_seg = Input('abbccc')
        other_idx = self.other_entire_text_seg[0].str_index
        # One segment per character, annotated '1', '1', '1', '2', '2', '3'.
        self.other_letter_seg = Segmentation([
            make(other_idx, pos, pos + 1, annotations={'a': value})
            for pos, value in enumerate('111223')
        ])

        # --- Third text: 'bd1' ----------------------------------------
        self.third_entire_text_seg = Input('bd1')
        third_idx = self.third_entire_text_seg[0].str_index
        self.third_letter_seg = Segmentation([
            make(third_idx, 0, 1),
            make(third_idx, 1, 2, annotations={'a': '2'}),
            make(third_idx, 2, 3, annotations={'a': 'b'}),
        ])

        # --- Fourth text: upper case and a non-ASCII character --------
        self.fourth_entire_text_seg = Input('AB cd\xe9')
        fourth_idx = self.fourth_entire_text_seg[0].str_index
        self.second_word_seg = Segmentation([
            make(fourth_idx, 0, 2),
            make(fourth_idx, 3, 6),
        ])

        # --- XML material ---------------------------------------------
        self.xml_seg = Input('<a attr="1"><a attr="2/3/">c<a/>d</a></a>')
        self.wrong_xml_seg = Input('<a><a>test</a>')
        self.wrong_xml_seg2 = Input('<a>test</a></a>')

        # XML spread over two separate strings.
        self.part_xml_seg = Input('<a>1<a>2<a>3</a>4')
        first_part_idx = self.part_xml_seg[0].str_index
        self.part_xml_seg2 = Input('</a>5</a>')
        second_part_idx = self.part_xml_seg2[0].str_index
        self.broken_xml_seg = Segmentation([
            Segment(str_index=first_part_idx, annotations={'a': '1'}),
            Segment(str_index=second_part_idx),
        ])

        self.count = 0
Esempio n. 23
0
    def sendData(self):
        """Tag the input segmentation with TreeTagger and send the result.

        Sends ``None`` on the "Tagged data" channel (with a warning in the
        info box) when the TreeTagger path, the available languages or the
        input segmentation is missing. Otherwise the input is dumped into a
        single string, tagged in one TreeTagger call, and re-segmented;
        depending on ``self.outputFormat`` the output is either the
        XML-tagged segmentation or one segment per ``<w>`` element.
        """

        # Clear created Inputs...
        self.clearCreatedInputs()

        # Guard clauses: each missing prerequisite sends None and returns.
        if not self.TreetaggerPath:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.getAvailableLanguages():
            self.infoBox.setText(self.noLanguageParameterWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.segmentation:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Tagged data", None)
            return

        # Initialize progress bar.
        # (Five iterations are declared; advance() is called three times
        # below before finish().)
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        self.progressBar = ProgressBar(self, iterations=5)

        # Create a copy of input seg, storing annotations in temp attr...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.segmentation.label
        for seg_idx, segment in enumerate(self.segmentation):
            # Serialize the annotations as XML attributes: keys are
            # stripped of combining marks (NFD + drop category 'Mn'),
            # values are quoted with quoteattr().
            attr = " ".join([
                "%s=%s" % (
                    ''.join(c for c in unicodedata.normalize('NFD', item[0])
                            if unicodedata.category(c) != 'Mn'),
                    quoteattr(str(item[1])),
                ) for item in segment.annotations.items()
            ])

            # NOTE(review): this writes "tt_ax" into the annotations of the
            # segment yielded by the input segmentation; whether that also
            # alters the input's stored annotations depends on how
            # Segmentation iterates - confirm.
            segment.annotations["tt_ax"] = attr
            copy_of_input_seg.append(segment)

        self.progressBar.advance()

        # Dump segmentation in unique string to avoid multiple calls to TT...
        # Each segment is wrapped in an <ax_tt ...> element carrying the
        # serialized annotations.
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
            display_all=True,
        )

        self.progressBar.advance()

        # Tag the segmentation contents...
        tagopt = '-token -lemma -sgml -quiet'
        if self.replaceUnknown:
            tagopt += " -no-unknown"
        tagger = treetaggerwrapper.TreeTagger(
            TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
            TAGOPT=tagopt,
            TAGDIR=self.TreetaggerPath,
        )
        tagged_lines = tagger.tag_text(
            concatenated_text,
            notagurl=True,
            notagemail=True,
            notagip=True,
            notagdns=True,
        )
        # Register the tagged text as a created Input of this widget.
        tagged_input = Input("\n".join(tagged_lines))
        self.createdInputs.append(tagged_input)

        # Replace <unknown> with [unknown] and " with &quot; then
        # re-segment to match the original segmentation structure.
        tagged_segmentation, _ = Segmenter.recode(
            tagged_input,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(r'"""'), '"&quot;"'),
            ],
        )
        tagged_segmentation = Segmenter.import_xml(tagged_segmentation,
                                                   "ax_tt")

        self.progressBar.advance()

        # Place each output line of Treetagger in an xml tag with annotations..
        # The first pattern captures three tab-separated fields per line;
        # "&1", "&2", "&3" in the replacement presumably refer to those
        # groups (recode syntax - confirm). The second pattern removes a
        # leading or trailing newline.
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 '<w lemma="&3" pos-tag="&2">&1</w>'),
                (re.compile(r'^\n|\n$'), ''),
            ],
        )
        # Segment into individual tokens if XML output option is disabled...
        if self.outputFormat == "add XML tags":
            output_segmentation = xml_segmentation
        else:
            try:
                output_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w")
            except ValueError:
                # import_xml failed: report the problem, send None and
                # restore the widget state before returning.
                self.infoBox.setText(
                    "Please check that either the input contains well-formed "
                    "XML, or it doesn't contain instances of '&#60;' and '\x3e'",
                    "error")
                self.send("Tagged data", None)
                self.progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        self.progressBar.finish()
        self.controlArea.setDisabled(False)

        # Label the output, report its size and send it.
        output_segmentation.label = self.captionTitle
        message = u'%i segment@p sent to output.' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        self.infoBox.setText(message)
        self.send('Tagged data', output_segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Send data from website springfieldspringfield.

        For every entry of ``self.myBasket`` the script page is downloaded
        (its URL suffix is looked up in ``self.path_storage``) and the text
        of the ``movie_script`` div becomes one ``Input``. Each entry is
        also split on its last "(" into the "Movie Title" and
        "Year of release" annotations. The output is that single Input, or
        the concatenation of all Inputs, sent on the "Movie transcripts"
        channel (``None`` when the basket is empty).
        """
        # Skip if title list is empty:
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning"
            )
            self.segmentation = None
            self.send("Movie transcripts", self.segmentation, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        annotations = list()
        script_list = list()
        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # This part of code is what fetches the actual script
        try:
            for movie in self.myBasket:
                # Each movie that is in the corpus is split into title and year
                # (rsplit makes sure to only split last occurence) which will
                # become annotations
                future_annotation = movie.rsplit('(', 1)
                movie_title = future_annotation[0]
                # Drop the last character (the closing parenthesis).
                movie_year = future_annotation[-1][:-1]
                # A fresh dict per movie, so entries never share state.
                annotations.append({
                    "Movie Title": movie_title,
                    "Year of release": movie_year,
                })
                # link_end and page_url are the two variables that will have to
                # be changed in case scripts need to be taken from elsewhere
                link_end = self.path_storage[movie]
                page_url = "https://www.springfieldspringfield.co.uk/" +   \
                    "movie_script.php?movie=" + link_end
                # Parse inside the with-block so the HTTP response is closed
                # as soon as it has been read.
                with urllib.request.urlopen(page_url) as page:
                    soup = BeautifulSoup(page, 'html.parser')

                # This is what grabs the movie script
                script = soup.find("div", {"class":"movie_script"})

                script_list.append(script.text)

                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # Network errors, unknown titles and pages without a script div all
        # end up here.
        except Exception:
            self.infoBox.setText(
                "Couldn't download data from SpringfieldSpringfield website.",
                "error"
            )
            # Close the progress bar so the widget does not stay "busy".
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store downloaded script strings in input objects...
        for script in script_list:
            newInput = Input(script, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one script, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            # Write the segment back so the annotations are stored.
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Movie transcripts", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
 def clearCreatedInputIndices(self):
     """Set to None the stored data of every index in createdInputIndices."""
     for createdIndex in self.createdInputIndices:
         Segmentation.set_data(createdIndex, None)
Esempio n. 26
0
 def test_creator(self):
     """Check that calling Segmentation() yields a Segmentation instance."""
     created = Segmentation()
     self.assertIsInstance(
         created,
         Segmentation,
         msg="creator doesn't return Segmentation object!",
     )
    def sendData(self):
        """Compute result of widget processing and send to output.

        Counts the contents of ``self.inputSeg``, learns signatures, stems
        and suffixes with ``lxa5crab``, parses every input segment and sends
        a copy of it annotated with "stem", "suffix" ("NULL" when empty)
        and "signature" on the "Morphologically analyzed data" channel.
        Intermediate results are stored in ``self.morphology``. ``None`` is
        sent when there is no input or when signature learning raises
        ``ValueError``.
        """

        # Clear morphology...
        self.morphology = dict()

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Morphologically analyzed data", None, self)
            self.updateGUI()
            return

        # Perform morphological analysis...

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait (word count)...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=100)

        # Word count...
        wordCounts = collections.Counter(
            [segment.get_content() for segment in self.inputSeg])
        self.morphology["wordCounts"] = wordCounts
        self.infoBox.setText(
            u"Processing, please wait (signature extraction)...",
            "warning",
        )
        progressBar.advance(5)  # 5 ticks on the progress bar...

        # Learn signatures...
        try:
            lxa5crab.crab_nebula.MIN_STEM_LEN = self.minStemLen
            signatures, stems, suffixes = lxa5crab.find_signatures(wordCounts)
            self.morphology["signatures"] = signatures
            self.morphology["stems"] = stems
            self.morphology["suffixes"] = suffixes
        except ValueError as e:
            # Report the failure, send None and restore the widget state.
            self.infoBox.setText(str(e), "warning")
            self.send("Morphologically analyzed data", None, self)
            self.controlArea.setDisabled(False)
            progressBar.finish()  # Clear progress bar.
            self.morphology = dict()
            self.updateGUI()
            return
        self.infoBox.setText(
            u"Processing, please wait (word parsing)...",
            "warning",
        )
        progressBar.advance(80)

        # Parse words...
        parser = lxa5crab.build_parser(wordCounts, signatures, stems, suffixes)
        self.morphology["parser"] = parser
        newSegments = list()
        num_analyzed_words = 0
        for segment in self.inputSeg:
            # Only the first parse of each word is used.
            bestParse = parser[segment.get_content()][0]
            newSegment = segment.deepcopy()
            if bestParse.signature:
                num_analyzed_words += 1
            newSegment.annotations.update(
                {
                    "stem": bestParse.stem,
                    "suffix": bestParse.suffix
                              if len(bestParse.suffix) else "NULL",
                    "signature": bestParse.signature,
                }
            )
            newSegments.append(newSegment)
        self.send(
            "Morphologically analyzed data",
            Segmentation(newSegments, self.captionTitle),
            self,
        )
        self.updateGUI()
        progressBar.advance(15)

        # Set status to OK and report data size...
        numSegments = len(self.inputSeg)
        # Guard against an empty input and force float arithmetic so the
        # percentage is never truncated by integer division.
        if numSegments:
            percentAnalyzed = 100.0 * num_analyzed_words / numSegments
        else:
            percentAnalyzed = 0.0
        message = "%i segment@p sent to output (%.2f%% analyzed)." % (
            numSegments, percentAnalyzed)
        message = pluralize(message, numSegments)
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()
Esempio n. 28
0
    def sendData(self):
        """Convert input(s) and send output"""
        if not (self.segmentation or self.corpus):
            self.infoBox.setText(u'Widget needs input.', 'warning')
            self.send('Textable segmentation', None, self)
            self.send('Text Mining corpus', None)
            return

        msg_seg = msg_corpus = ""

        num_iterations = 0
        if self.corpus:
            num_iterations += len(self.corpus)
        if self.segmentation:
            num_iterations += len(self.segmentation)
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=num_iterations)

        # Convert corpus to segmentation...
        if self.corpus:
            self.clearCreatedInputs()
            new_segments = list()
            text_feature = self.corpus.text_features[self.segmentContent]
            for row in self.corpus:
                content = row[text_feature].value
                if content == "":
                    continue
                new_input = Input(row[text_feature].value)
                new_segment_annotations = dict()
                for attr in self.corpus.domain:
                    attr_str = str(row[attr])
                    if attr_str != "?":
                        new_segment_annotations[str(attr)] = attr_str
                for meta_attr in self.corpus.domain.metas:
                    meta_attr_str = str(row[meta_attr])
                    if (meta_attr != text_feature and meta_attr_str != "?"):
                        new_segment_annotations[str(meta_attr)] = meta_attr_str
                new_segments.append(
                    Segment(new_input[0].str_index, new_input[0].start,
                            new_input[0].end, new_segment_annotations))
                self.createdInputs.append(new_input)
                progressBar.advance()
            new_segmentation = Segmentation(new_segments, self.captionTitle)
            msg_seg = u'%i segment@p' % len(new_segmentation)
            msg_seg = pluralize(msg_seg, len(new_segmentation))
            self.send('Textable segmentation', new_segmentation, self)
        else:
            self.send('Textable segmentation', None, self)

        # Convert segmentation to corpus...
        if self.segmentation:
            metas = list()
            attributes = list()
            meta_keys = list()
            attribute_keys = list()
            for key in self.segmentation.get_annotation_keys():
                possible_values = set()
                for segment in self.segmentation:
                    try:
                        possible_values.add(str(segment.annotations[key]))
                    except KeyError:
                        pass
                if (self.limitNumCategories
                        and len(possible_values) > self.maxNumCategories):
                    metas.append(StringVariable(key))
                    meta_keys.append(key)
                else:
                    attributes.append(
                        DiscreteVariable(key, values=list(possible_values)))
                    attribute_keys.append(key)
            metas.append(StringVariable("textable_text"))
            domain = Domain(attributes, [], metas)
            rows = list()
            for segment in self.segmentation:
                row = [
                    str(segment.annotations.get(annotation_key, None))
                    for annotation_key in attribute_keys
                ]
                row.extend([
                    str(segment.annotations.get(annotation_key, None))
                    for annotation_key in meta_keys
                ])
                row.append(segment.get_content())
                rows.append(row)
                progressBar.advance
            table = Table(domain, rows)
            if textMiningIsInstalled:
                corpus = Corpus(domain,
                                X=table.X,
                                metas=table.metas,
                                text_features=[metas[-1]])
            msg_corpus = u'%i document@p' % len(self.segmentation)
            msg_corpus = pluralize(msg_corpus, len(self.segmentation))
            self.send('Text Mining corpus', corpus)
        else:
            self.send('Text Mining corpus', None)

        progressBar.finish()
        self.controlArea.setDisabled(False)

        if msg_seg or msg_corpus:
            message = msg_seg
            if msg_seg and msg_corpus:
                message += " and "
            message += msg_corpus
            message += " sent to output."
            self.infoBox.setText(message)

        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Compute result of widget processing and send to output.

        Creates one segment for every row of ``self.char_df`` tagged "PER",
        locates it in the input segment that contains it, annotates it with
        the row's alias under the key "id", and sends the resulting
        segmentation on the "Character segmentation" channel.

        Sends None to the outputs (and returns early) when no model is
        loaded or when there is no input segmentation.
        """

        # Check that there's a model...
        if not self.model:
            self.noLanguageModelWarning()
            self.sendNoneToOutputs()
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            return

        # Tell the user that processing has started.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        # Disable control area and initialize progress bar...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.char_df))

        # The try/finally guarantees that the widget is usable again (control
        # area re-enabled, progress bar cleared) even if processing fails.
        try:
            # Get start and end pos of concatenated input segments...
            # NOTE(review): the "+ 1" assumes that consecutive segments are
            # separated by exactly one character in the text from which
            # char_df positions were computed -- confirm against the caller.
            startPositions = [0]
            endPositions = list()
            numSegments = len(self.inputSeg)
            for idx in range(1, numSegments):
                prevSegLen = len(self.inputSeg[idx-1].get_content())
                startPositions.append(startPositions[-1] + prevSegLen + 1)
                endPositions.append(startPositions[-1] - 1)
            endPositions.append(startPositions[-1] +
                                len(self.inputSeg[-1].get_content()) + 1)

            # Get or update character aliases...
            find_pairs = sys.modules['charnetto.find_pairs']
            characters = [entry.split(", ") for entry in self.characters]
            find_pairs.map_names(self.char_df, characters)

            # Initializations...
            charSegments = list()
            currentSegmentIdx = 0

            # For each character token in Charnetto's output...
            for index, charToken in self.char_df.iterrows():

                # One tick per row (including skipped ones), since the
                # progress bar was sized with len(self.char_df).
                progressBar.advance()

                # Skip non-PER named entities.
                if charToken["tag"] != "PER":
                    continue

                # Get index of containing segment (the index only moves
                # forward, so rows are expected in ascending position)...
                while charToken["end_pos"] > endPositions[currentSegmentIdx]:
                    currentSegmentIdx += 1

                # Create segment for char with its actual coordinates, i.e.
                # relative to the start of the containing input segment...
                offset = startPositions[currentSegmentIdx]
                strIndex = self.inputSeg[currentSegmentIdx].str_index
                start = charToken["start_pos"] - offset
                end = charToken["end_pos"] - offset
                annotations = {"id": charToken["alias"]}
                charSegments.append(
                    Segment(strIndex, start, end, annotations))

            # Send output...
            outputSegmentation = Segmentation(charSegments,
                                              label=self.captionTitle)
            self.send("Character segmentation", outputSegmentation, self)

            # Set status to OK and report data size...
            message = "%i segment@p sent to output." % len(outputSegmentation)
            message = pluralize(message, len(outputSegmentation))
            self.infoBox.setText(message)

        finally:
            # Clear progress bar and give control back to the user.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Compute result of widget processing and send to output.

        Downloads the lyrics of every song in ``self.myBasket``, stores each
        of them in an Input, annotates the resulting segment(s) with a copy
        of the corresponding song dict and sends the segmentation on the
        "Lyrics importation" channel.

        Returns early, without sending anything, when the basket is empty or
        when a download fails.
        """
        # Skip if title list is empty:
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some songs first", "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Attempt to connect to Genius and retrieve lyrics...
        song_content = list()
        annotations = list()
        try:
            for song in self.myBasket:
                # Each song is a dict holding (at least) the 'path' of its
                # page on the Genius website.
                page_url = "http://genius.com" + song['path']
                lyrics = self.html_to_text(page_url)
                song_content.append(lyrics)
                annotations.append(song.copy())
                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        # (Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not swallowed.)
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Genius website.",
                                 "error")
            # Don't leave the progress bar hanging on failure.
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # Store downloaded lyrics strings in input objects...
        for lyrics in song_content:
            newInput = Input(lyrics, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one song, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments (the updated segment is written back explicitly
        # into the segmentation)...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Lyrics importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Esempio n. 31
0
    def sendData(self):
        """Compute result of widget processing and send to output.

        Downloads the XML document of every selected title, stores each of
        them in an Input, annotates the resulting segment(s) with a copy of
        the annotations of the corresponding title and sends the
        segmentation on the "XML-TEI data" channel.

        Sends None on that channel instead when nothing is selected or when
        a download fails; does nothing at all when the title list is empty.
        """

        # Skip if title list is empty:
        if not self.titleLabels:
            return

        # Check that something has been selected...
        if not self.selectedTitles:
            self.infoBox.setText("Please select one or more titles.",
                                 "warning")
            self.send("XML-TEI data", None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        # Initialize progress bar.
        progressBar = gui.ProgressBar(self,
                                      iterations=len(self.selectedTitles))

        # Attempt to connect to ECP and retrieve the selected documents...
        xml_contents = list()
        annotations = list()
        try:
            for title in self.selectedTitles:
                title_annotations = self.filteredTitleSeg[title].annotations
                doc_url = self.document_base_url + title_annotations["url"]
                # Rewrite ".../name.shtml" into ".../name/name.xml".
                url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url)
                # The with-statement closes the connection once the
                # document has been read.
                with urllib.request.urlopen(url) as response:
                    xml_contents.append(response.read().decode('utf-8'))
                annotations.append(title_annotations.copy())
                progressBar.advance()  # 1 tick on the progress bar...

        # If an error occurs (e.g. http error, or memory error)...
        # (Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not swallowed.)
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from ECP website.",
                                 "error")
            # Reset output channel.
            self.send("XML-TEI data", None, self)
            # Don't leave the progress bar hanging on failure.
            progressBar.finish()
            return

        # Store downloaded XML in input objects...
        for xml_content in xml_contents:
            newInput = Input(xml_content, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one document, the widget's output is the created
        # Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments (the updated segment is written back explicitly
        # into the segmentation)...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Store imported URLs as setting (only the URL of the first selected
        # title is recorded).
        self.importedURLs = [
            self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
        ]

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()

        # Send token...
        self.send("XML-TEI data", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Esempio n. 32
0
 def clearCreatedInputs(self):
     """Discard every Input created so far.

     The data registered under the str_index of each Input's first segment
     is replaced by None, then the list of created Inputs is emptied in
     place.
     """
     for created_input in self.createdInputs:
         first_segment = created_input[0]
         Segmentation.set_data(first_segment.str_index, None)
     # Slice deletion empties the existing list object rather than
     # rebinding the attribute.
     del self.createdInputs[:]
Esempio n. 33
0
class ECP(OWTextableBaseWidget):
    """Textable widget for importing XML-TEI data from the Eighteenth Century
    Poetry website (http://www.eighteenthcenturypoetry.org/)

    The widget has no input channel; it lets the user pick one or more
    titles and sends their content on the "XML-TEI data" output channel.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "18th Century Poetry"
    description = "Import XML-TEI data from ECP website"
    icon = "icons/18th_century_poetry.svg"
    priority = 10

    #----------------------------------------------------------------------
    # Channel definitions (NB: no input in this case)...

    inputs = []
    outputs = [("XML-TEI data", Segmentation)]

    #----------------------------------------------------------------------
    # Settings...

    # Settings are versioned with the module version stripped of its last
    # dot-separated component.
    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    # Selected entries of the title list box.
    selectedTitles = settings.Setting([])
    # Labels displayed in the title list box.
    titleLabels = settings.Setting([])
    # Annotation key used for filtering titles ("author" or "genre").
    filterCriterion = settings.Setting("author")
    # Value of the filter criterion; "(all)" disables filtering.
    filterValue = settings.Setting("(all)")
    # URLs recorded by sendData at the last import.
    importedURLs = settings.Setting([])
    # Whether the advanced (filter) interface is displayed.
    displayAdvancedSettings = settings.Setting(False)

    want_main_area = False

    def __init__(self):
        """Widget creator.

        Initializes the non-persistent attributes, builds the user interface
        (advanced settings checkbox, filter box, title list box, refresh
        button, send button and info box), loads the title list and finally
        sends data if the send button is set to do so.
        """

        super().__init__()

        # Other attributes...
        self.segmentation = None        # last segmentation sent to output
        self.createdInputs = list()     # Input objects created by sendData
        self.titleSeg = None            # segmentation of all known titles
        self.filteredTitleSeg = None    # titles currently listed in the GUI
        self.filterValues = dict()      # criterion -> list of its values
        # Page from which the list of titles is extracted.
        self.base_url =     \
          u"http://www.eighteenthcenturypoetry.org/works/#genres"
        # Prefix prepended to the "url" annotation of a title.
        self.document_base_url =     \
          u"http://www.eighteenthcenturypoetry.org"

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=self.updateGUI,
        )

        # The AdvancedSettings class, also from TextableUtils, facilitates
        # the management of basic vs. advanced interface. An object from this
        # class (here assigned to self.advancedSettings) contains two lists
        # (basicWidgets and advancedWidgets), to which the corresponding
        # widgetBoxes must be added.
        self.advancedSettings = AdvancedSettings(
            widget=self.controlArea,
            master=self,
            callback=self.updateFilterValueList,
        )

        # User interface...

        # Advanced settings checkbox (basic/advanced interface will appear
        # immediately after it)...
        self.advancedSettings.draw()

        # Filter box (advanced settings only)
        filterBox = gui.widgetBox(
            widget=self.controlArea,
            box="Filter",
            orientation="vertical",
        )
        # Combo box selecting the annotation key used for filtering.
        filterCriterionCombo = gui.comboBox(
            widget=filterBox,
            master=self,
            value="filterCriterion",
            items=["author", "genre"],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Criterion:",
            labelWidth=120,
            callback=self.updateFilterValueList,
            tooltip=(
                "Please select a criterion for searching the title list\n"),
        )
        filterCriterionCombo.setMinimumWidth(120)
        gui.separator(widget=filterBox, height=3)
        # Combo box selecting the value of the criterion; its items are
        # filled in by updateFilterValueList.
        self.filterValueCombo = gui.comboBox(
            widget=filterBox,
            master=self,
            value="filterValue",
            sendSelectedValue=True,
            orientation="horizontal",
            label="Value:",
            labelWidth=120,
            callback=self.updateTitleList,
            tooltip=("Please select a value for the chosen criterion."),
        )
        gui.separator(widget=filterBox, height=3)

        # The following lines add filterBox (and a vertical separator) to the
        # advanced interface...
        self.advancedSettings.advancedWidgets.append(filterBox)
        self.advancedSettings.advancedWidgetsAppendSeparator()

        # Title box
        titleBox = gui.widgetBox(
            widget=self.controlArea,
            box="Titles",
            orientation="vertical",
        )
        self.titleListbox = gui.listBox(
            widget=titleBox,
            master=self,
            value="selectedTitles",  # setting (list)
            labels="titleLabels",  # setting (list)
            callback=self.sendButton.settingsChanged,
            tooltip="The list of titles whose content will be imported",
        )
        self.titleListbox.setMinimumHeight(150)
        # NOTE(review): 3 is presumably Qt's ExtendedSelection mode (multiple
        # selection with Ctrl/Shift) -- confirm against the Qt enum.
        self.titleListbox.setSelectionMode(3)
        gui.separator(widget=titleBox, height=3)
        gui.button(
            widget=titleBox,
            master=self,
            label="Refresh",
            callback=self.refreshTitleSeg,
            tooltip="Connect to ECP website and refresh list.",
        )
        gui.separator(widget=titleBox, height=3)

        gui.separator(widget=self.controlArea, height=3)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()

        # This initialization step needs to be done after infoBox has been
        # drawn (because getTitleSeg may need to display an error message).
        self.getTitleSeg()

        # Send data if autoSend.
        self.sendButton.sendIf()

        self.setMinimumWidth(350)
        self.adjustSizeWithTimer()

    def sendData(self):
        """Compute result of widget processing and send to output.

        Downloads the XML document of every selected title, stores each of
        them in an Input, annotates the resulting segment(s) with a copy of
        the annotations of the corresponding title and sends the
        segmentation on the "XML-TEI data" channel.

        Sends None on that channel instead when nothing is selected or when
        a download fails; does nothing at all when the title list is empty.
        """

        # Skip if title list is empty:
        if not self.titleLabels:
            return

        # Check that something has been selected...
        if not self.selectedTitles:
            self.infoBox.setText("Please select one or more titles.",
                                 "warning")
            self.send("XML-TEI data", None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        # Initialize progress bar.
        progressBar = gui.ProgressBar(self,
                                      iterations=len(self.selectedTitles))

        # Attempt to connect to ECP and retrieve the selected documents...
        xml_contents = list()
        annotations = list()
        try:
            for title in self.selectedTitles:
                title_annotations = self.filteredTitleSeg[title].annotations
                doc_url = self.document_base_url + title_annotations["url"]
                # Rewrite ".../name.shtml" into ".../name/name.xml".
                url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url)
                # The with-statement closes the connection once the
                # document has been read.
                with urllib.request.urlopen(url) as response:
                    xml_contents.append(response.read().decode('utf-8'))
                annotations.append(title_annotations.copy())
                progressBar.advance()  # 1 tick on the progress bar...

        # If an error occurs (e.g. http error, or memory error)...
        # (Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not swallowed.)
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from ECP website.",
                                 "error")
            # Reset output channel.
            self.send("XML-TEI data", None, self)
            # Don't leave the progress bar hanging on failure.
            progressBar.finish()
            return

        # Store downloaded XML in input objects...
        for xml_content in xml_contents:
            newInput = Input(xml_content, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one document, the widget's output is the created
        # Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments (the updated segment is written back explicitly
        # into the segmentation)...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Store imported URLs as setting (only the URL of the first selected
        # title is recorded).
        self.importedURLs = [
            self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
        ]

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()

        # Send token...
        self.send("XML-TEI data", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def getTitleSeg(self):
        """Get title segmentation, either saved locally or online.

        Loads the title segmentation from the cache file stored next to this
        module or, if the cache is missing or unreadable, downloads it by
        calling getTitleListFromECP. When a segmentation is available, the
        lists of author and genre values are rebuilt and the titles are
        sorted. The filter value and title lists of the GUI are updated in
        all cases.
        """

        # Try to open saved file in this module's directory...
        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        try:
            # NOTE: unpickling can execute arbitrary code; this is only
            # acceptable as long as the cache file is the one written by
            # getTitleListFromECP.
            with open(os.path.join(path, "cached_title_list_ecp"),
                      "rb") as file:
                self.titleSeg = pickle.load(file)
        # Else (cache missing, truncated or corrupt) try to load list from
        # ECP and build new seg...
        except (IOError, EOFError, pickle.UnpicklingError):
            self.titleSeg = self.getTitleListFromECP()

        # getTitleListFromECP returns None when the website can't be
        # reached, so everything that needs the segmentation is guarded.
        if self.titleSeg is not None:

            # Build author and genre lists...
            for criterion in ("author", "genre"):
                self.filterValues[criterion] = Processor.count_in_context(
                    units={
                        "segmentation": self.titleSeg,
                        "annotation_key": criterion,
                    }).col_ids
                self.filterValues[criterion].sort()

            # Sort the segmentation alphabetically based on titles (nasty
            # hack!)...
            self.titleSeg.buffer.sort(key=lambda s: s.annotations["title"])

        # Update title and filter value lists (only at init and on manual
        # refresh, therefore separate from self.updateGUI).
        self.updateFilterValueList()

    def refreshTitleSeg(self):
        """Refresh title segmentation from website.

        Downloads the title list again, rebuilds the lists of author and
        genre values from it, sorts the titles and updates the filter value
        and title lists of the GUI.
        """
        self.titleSeg = self.getTitleListFromECP()

        # getTitleListFromECP returns None when the website can't be reached.
        if self.titleSeg is not None:

            # Rebuild author and genre lists from the fresh data. Without
            # this they stay stale, or even missing when the initial load
            # failed, in which case updateFilterValueList fails on the
            # lookup of the current criterion.
            for criterion in ("author", "genre"):
                self.filterValues[criterion] = Processor.count_in_context(
                    units={
                        "segmentation": self.titleSeg,
                        "annotation_key": criterion,
                    }).col_ids
                self.filterValues[criterion].sort()

            # Sort the segmentation alphabetically based on titles, exactly
            # as getTitleSeg does after the initial load, so that the sorted
            # labels of the list box keep matching the segments.
            self.titleSeg.buffer.sort(key=lambda s: s.annotations["title"])

        # Update title and filter value lists (only at init and on manual
        # refresh, therefore separate from self.updateGUI).
        self.updateFilterValueList()

    def getTitleListFromECP(self):
        """Fetch titles from the ECP website.

        Downloads the page at self.base_url, extracts one segment per work
        annotated with "genre", "author", "url" and "title", and tries to
        cache the result in a file next to this module.

        Returns the title segmentation, or None when the website could not
        be accessed (in which case the title list is emptied and None is
        sent to the output channel).
        """

        self.infoBox.customMessage(
            "Fetching data from ECP website, please wait")

        # Attempt to connect to ECP...
        try:
            # The with-statement closes the connection once the page has
            # been read.
            with urllib.request.urlopen(self.base_url) as response:
                base_html = response.read().decode('utf-8')
            self.infoBox.customMessage("Done fetching data from ECP website.")

        # If unable to connect (somehow)...
        # (Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not swallowed.)
        except Exception:

            # Set Info box and widget to "warning" state.
            self.infoBox.noDataSent(warning="Couldn't access ECP website.")

            # Empty title list box.
            self.titleLabels = list()

            # Reset output channel.
            self.send("XML-TEI data", None, self)
            return None

        # Otherwise store HTML content in LTTL Input object.
        base_html_seg = Input(base_html)

        # Remove accents from the data...
        recoded_seg, _ = Segmenter.recode(base_html_seg, remove_accents=True)

        # Extract table containing titles...
        genresListSeg = Segmenter.import_xml(
            segmentation=recoded_seg,
            element="ul",
            conditions={"id": re.compile(r"^genres-list")},
        )

        # NB: in the patterns below, the "dot matches newline" behavior is
        # requested with the re.DOTALL flag rather than with an inline
        # "(?s)" at the end of the pattern, which Python 3.11+ rejects
        # (global flags must be at the start of the expression).

        # Extract genre annotation...
        genreSeg = Segmenter.tokenize(
            segmentation=genresListSeg,
            regexes=[(
                re.compile(r'<a id[^>]+>(.+?)</a.+?(?=<a id|$)', re.DOTALL),
                "tokenize",
                {"genre": "&1"},
            )],
            import_annotations=False,
        )

        # Extract works...
        titleSeg = Segmenter.tokenize(
            segmentation=genreSeg,
            regexes=[(
                re.compile(r'<li class="bibl".+?</span>', re.DOTALL),
                "tokenize",
            )],
        )

        # Extract annotations...
        titleSeg = Segmenter.tokenize(
            segmentation=titleSeg,
            regexes=[
                (re.compile(r"^.*>\n(.+?)</span>.*$", re.DOTALL),
                 "tokenize", {
                     "author": "&1"
                 }),
                (re.compile(r'^.*href="(/works/.+?\.shtml)">.*$', re.DOTALL),
                 "tokenize", {
                     "url": "&1"
                 }),
                (re.compile(r'^.*shtml">(.*)</a>.*$', re.DOTALL),
                 "tokenize", {
                     "title": "&1"
                 }),
            ],
            merge_duplicates=True,
        )

        # Try to save list in this module's directory for future reference...
        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        try:
            with open(os.path.join(path, "cached_title_list_ecp"),
                      "wb") as file:
                pickle.dump(titleSeg, file, -1)
        except IOError:
            # Caching is best-effort only: the widget works without it.
            pass

        # Remove warning (if any)...
        self.error(0)
        self.warning(0)

        return titleSeg

    def updateFilterValueList(self):
        """Refill the filter value combo box, then refresh the title list.

        The combo box is only repopulated when titles are loaded and the
        advanced settings are displayed. The current filter value falls back
        to "(all)" when it is not among the items of the combo box.
        """
        combo = self.filterValueCombo

        # In Advanced settings mode, populate filter value list...
        titlesLoaded = self.titleSeg is not None
        if titlesLoaded and self.displayAdvancedSettings:
            combo.clear()
            combo.addItem("(all)")
            for value in self.filterValues[self.filterCriterion]:
                combo.addItem(value)

        # Reset filterValue if needed. The attribute is assigned in both
        # cases (possibly to its own value), as in the usual Textable idiom
        # -- presumably so that the bound GUI control gets refreshed.
        currentValue = self.filterValue
        availableValues = tuple(
            combo.itemText(position) for position in range(combo.count()))
        if currentValue in availableValues:
            self.filterValue = self.filterValue
        else:
            self.filterValue = "(all)"

        self.updateTitleList()

    def updateTitleList(self):
        """Update the list of titles.

        Filters the known titles according to the current criterion and
        value (advanced settings only), merges the entries that share the
        same author and title into one entry listing all their genres,
        sorts them by title and rebuilds the labels of the title list box.
        Does nothing when no title segmentation has been loaded.
        """

        # If titleSeg has not been loaded for some reason, skip.
        if self.titleSeg is None:
            return

        # In Advanced settings mode, get list of selected titles...
        if self.displayAdvancedSettings and self.filterValue != "(all)":
            self.filteredTitleSeg, _ = Segmenter.select(
                segmentation=self.titleSeg,
                # The filter value is a literal annotation value, not a
                # pattern: escape it so that characters such as "(" or "."
                # are matched verbatim.
                regex=re.compile(r"^%s$" % re.escape(self.filterValue)),
                annotation_key=self.filterCriterion,
            )
        else:
            self.filteredTitleSeg = self.titleSeg

        # Group the entries that only differ by their genre: create a
        # dictionary with (author, title) as key...
        unique_titles = dict()
        for title in self.filteredTitleSeg:
            title_id = (
                title.annotations["author"],
                title.annotations["title"],
            )
            unique_titles.setdefault(title_id, []).append(title)

        # Keep the first entry of each group, with a new genre annotation
        # listing all genres of the group...
        # NOTE(review): this rewrites the annotation of the segment object
        # obtained by iteration; whether that also alters self.titleSeg
        # depends on whether iteration yields copies -- confirm.
        new_title_segments = list()
        for equivalent_titles in unique_titles.values():
            merged_title = equivalent_titles[0]
            merged_title.annotations["genre"] = ", ".join(sorted(
                {t.annotations["genre"] for t in equivalent_titles}))
            new_title_segments.append(merged_title)

        # Sort the segments themselves by title, so that the position of a
        # label in the list box always designates the matching segment of
        # self.filteredTitleSeg (sendData relies on this), even when
        # self.titleSeg is not already sorted.
        new_title_segments.sort(key=lambda s: s.annotations["title"])

        self.filteredTitleSeg = Segmentation(None)
        self.filteredTitleSeg.extend(new_title_segments)

        # A specification is left out only when it is the criterion the
        # list is currently filtered on (it would be the same everywhere).
        show_author = (not self.displayAdvancedSettings
                       or self.filterCriterion != "author"
                       or self.filterValue == "(all)")
        show_genre = (not self.displayAdvancedSettings
                      or self.filterCriterion != "genre"
                      or self.filterValue == "(all)")

        # Populate titleLabels list with the titles, followed by their
        # specification (author and genre, depending on criterion)...
        titleLabels = list()
        for segment in self.filteredTitleSeg:
            specs = list()
            if show_author:
                specs.append(segment.annotations["author"])
            if show_genre:
                specs.append(segment.annotations["genre"])
            titleLabels.append(
                segment.annotations["title"] + " (%s)" % "; ".join(specs))
        self.titleLabels = titleLabels

        # Reset selectedTitles if needed (the attribute is assigned in both
        # cases, possibly to its own value, as in the original idiom)...
        if not set(self.importedURLs).issubset(
                set(u.annotations["url"] for u in self.filteredTitleSeg)):
            self.selectedTitles = list()
        else:
            self.selectedTitles = self.selectedTitles

        self.sendButton.settingsChanged()

    def updateGUI(self):
        """Synchronize the interface with the current settings.

        Shows or hides the advanced settings and, when the title list is
        not empty, re-assigns the current selection to itself.
        """
        showAdvanced = bool(self.displayAdvancedSettings)
        self.advancedSettings.setVisible(showAdvanced)

        if self.titleLabels:
            # Self-assignment kept as is -- presumably it makes the list
            # box reflect the stored selection; confirm before removing.
            self.selectedTitles = self.selectedTitles

    def clearCreatedInputs(self):
        """Discard every Input created so far.

        The data registered under the str_index of each Input's first
        segment is replaced by None, then the list of created Inputs is
        emptied in place.
        """
        for created_input in self.createdInputs:
            first_segment = created_input[0]
            Segmentation.set_data(first_segment.str_index, None)
        # Slice deletion empties the existing list object rather than
        # rebinding the attribute.
        del self.createdInputs[:]

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overridden method).

        Delegates to clearCreatedInputs, which discards the data of every
        Input this widget has created.
        """
        self.clearCreatedInputs()

    # The following method need to be copied (without any change) in
    # every Textable widget...

    def setCaption(self, title):
        """Set the widget's caption and flag the settings as changed.

        The caption is always forwarded to the parent class. The send
        button is notified only when a previous caption exists and differs
        from the new title.
        """
        # captionTitle may not exist yet the first time this is called.
        if 'captionTitle' in dir(self):
            # Compare before calling the parent method, which is expected
            # to update captionTitle.
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
 def clearCreatedInputs(self):
     """Discard every Input created so far.

     The data registered under the str_index of each Input's first segment
     is replaced by None, then the list of created Inputs is emptied in
     place.
     """
     for created_input in self.createdInputs:
         first_segment = created_input[0]
         Segmentation.set_data(first_segment.str_index, None)
     # Slice deletion empties the existing list object rather than
     # rebinding the attribute.
     del self.createdInputs[:]
Esempio n. 35
0
    def updateTitleList(self):
        """Rebuild the list of title labels offered in the GUI.

        Starting from ``self.titleSeg``, this method:

        1. keeps only the titles matching the current filter (advanced mode
           with a filter value other than "(all)"), or all titles otherwise;
        2. merges entries sharing the same author and title into a single
           segment whose "genre" annotation lists all their genres;
        3. orders the merged segments by title and stores them in
           ``self.filteredTitleSeg``;
        4. builds ``self.titleLabels`` from those segments, in the same
           order, appending author and/or genre to each title;
        5. resets ``self.selectedTitles`` when the imported URLs are not all
           part of the filtered titles;
        6. notifies ``self.sendButton`` that settings have changed.

        Does nothing (and returns None) when ``self.titleSeg`` is None.
        """
        # If titleSeg has not been loaded for some reason, skip.
        if self.titleSeg is None:
            return

        # A filter only applies in advanced mode with a specific value.
        filter_is_active = (
            self.displayAdvancedSettings and self.filterValue != "(all)"
        )

        if filter_is_active:
            # The filter value must be matched literally, so escape it:
            # otherwise values containing regex metacharacters (parentheses,
            # dots, brackets...) would fail to match or raise an error.
            self.filteredTitleSeg, _ = Segmenter.select(
                segmentation=self.titleSeg,
                regex=re.compile(r"^%s$" % re.escape(self.filterValue)),
                annotation_key=self.filterCriterion,
            )
        else:
            self.filteredTitleSeg = self.titleSeg

        # Group the entries by (author, title), whatever the filter is, so
        # that a title listed under several genres appears only once.
        titles_by_id = dict()
        for title in self.filteredTitleSeg:
            title_id = (
                title.annotations["author"],
                title.annotations["title"],
            )
            titles_by_id.setdefault(title_id, []).append(title)

        # Keep the first entry of each group and give it the sorted,
        # de-duplicated list of genres of the whole group.
        # NOTE(review): this overwrites the "genre" annotation of the kept
        # segment in place; if that object is shared with self.titleSeg, the
        # source segmentation is modified too -- confirm this is intended.
        merged_segments = list()
        for equivalent_titles in titles_by_id.values():
            representative = equivalent_titles[0]
            representative.annotations["genre"] = ", ".join(
                sorted(set(t.annotations["genre"] for t in equivalent_titles))
            )
            merged_segments.append(representative)

        # Order the segments themselves by title (stable sort), so that the
        # segment at a given index always corresponds to the label at the
        # same index. Sorting only the labels would attach the author/genre
        # of one title to the label of another whenever the segmentation is
        # not already in title order.
        merged_segments.sort(key=lambda segment: segment.annotations["title"])

        self.filteredTitleSeg = Segmentation(None)
        self.filteredTitleSeg.extend(merged_segments)

        # A specification is omitted only when it is the active filter
        # criterion, since it would then be identical for every title.
        show_author = not (filter_is_active
                           and self.filterCriterion == "author")
        show_genre = not (filter_is_active
                          and self.filterCriterion == "genre")

        # Populate titleLabels with the titles and their specifications...
        titleLabels = list()
        for segment in merged_segments:
            specs = list()
            if show_author:
                specs.append(segment.annotations["author"])
            if show_genre:
                specs.append(segment.annotations["genre"])
            titleLabels.append(
                segment.annotations["title"] + " (%s)" % "; ".join(specs)
            )
        self.titleLabels = titleLabels

        # Reset selectedTitles if needed...
        if not set(self.importedURLs).issubset(
                set(u.annotations["url"] for u in self.filteredTitleSeg)):
            self.selectedTitles = list()
        else:
            # Self-assignment kept on purpose; presumably it makes the
            # widget framework refresh the selection -- TODO confirm.
            self.selectedTitles = self.selectedTitles

        self.sendButton.settingsChanged()
Esempio n. 36
0
    def sendData(self):
        """Tag the input with TreeTagger and send the result downstream.

        None is sent on the 'Text data' channel when the TreeTagger link
        was not found (``self.NoLink``) or when there is no input data.

        Otherwise each input segment is wrapped in an <xb_tt> element
        carrying its annotations, the concatenated text is passed to
        ``self.tag``, and the <xb_tt> elements are re-imported from the
        tagged text. Lines made of three tab-separated fields are then
        rewritten as <w> elements. The result is sent as is when
        ``self.activer_xml`` is set, or segmented into <w> elements
        otherwise. The TreeTagger link is also saved to
        "treetagger_link.txt".
        """
        # TreeTagger's link was not found.
        if self.NoLink:
            self.infoBox.setText(
                u"Sorry, TreeTagger's link not found.",
                "error"
            )
            self.send('Text data', None)
            return

        # Important: if input data is None, propagate this value to output...
        if not self.inputData:
            self.infoBox.setText(
                u"Widget needs input",
                "warning"
            )
            self.send('Text data', None)
            return

        # Let the user know that something is happening...
        self.infoBox.setText(
            u'TreeTagger is running...',
            "warning"
        )

        # Initialize progress bar.
        # NOTE(review): 5 iterations are declared but advance() is only
        # called three times below -- confirm the intended count.
        self.progressBar = OWGUI.ProgressBar(
            self,
            iterations=5
        )

        # Build a segmentation in which every segment carries its own
        # annotations serialized as an attribute string ("tt_xb").
        # NOTE(review): the annotation is added to the input segments
        # themselves (the same objects are appended, not copies), so the
        # input data is modified as well -- confirm this is acceptable.
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.inputData.label
        for segment in self.inputData:
            attr = " ".join(
                ["%s='%s'" % item for item in segment.annotations.items()]
            )
            segment.annotations["tt_xb"] = attr
            copy_of_input_seg.append(segment)

        # Advance the progress bar by one step.
        self.progressBar.advance()

        # Wrap each segment in an <xb_tt> element so that segment
        # boundaries can be recovered after tagging.
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
            display_all=True,
        )

        # Advance the progress bar by one step.
        self.progressBar.advance()

        tagged_text = self.tag(concatenated_text)
        tagged_input = Input(tagged_text)
        tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

        # Advance the progress bar by one step.
        self.progressBar.advance()

        # Rewrite the tagger output as XML. This step is identical whether
        # or not XML output is requested, so it is done only once.
        recoded_segmentation = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"<unknown>"), '[unknown]'),
                (
                    re.compile(r"(.+)\t(.+)\t(.+)"),
                    '<w lemma="&3" type="&2">&1</w>'
                ),
                (re.compile(r'"""'), '"&quot;"'),
            ],
        )

        if self.activer_xml:
            # XML checkbox enabled: output the recoded XML text as is.
            final_segmentation = recoded_segmentation
        else:
            # XML checkbox disabled: output one segment per <w> element.
            final_segmentation = Segmenter.import_xml(
                recoded_segmentation,
                "w"
            )

        self.infoBox.dataSent('')

        # Save TreeTagger's link...
        if self.system == "nt":
            link_path = "treetagger_link.txt"
        else:
            link_path = os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt"
            )

        # The context manager closes the file even if writing fails.
        with open(link_path, 'w') as link_file:
            link_file.write(self.treetagger_link)

        # Clear progress bar.
        self.progressBar.finish()

        # Send the segmentation.
        self.send('Text data', final_segmentation, self)
        self.compteur += 1
        self.sendButton.resetSettingsChangedFlag()
Esempio n. 37
0
class TestSegmentation(unittest.TestCase):
    """Test suite for LTTL Segment module"""
    def setUp(self):
        """Build the fixtures shared by the tests of this suite."""
        # One Input; its string index is reused by every segment below.
        self.entire_text_seg = Input('ab cde')
        self.str_index = self.entire_text_seg[0].str_index
        idx = self.str_index

        # Two non-overlapping segments; only the first one is annotated.
        annotated_word = Segment(
            str_index=idx,
            start=0,
            end=2,
            annotations={'a': '1', 'bc': '20'},
        )
        plain_word = Segment(str_index=idx, start=3, end=6)
        self.word_seg = Segmentation([annotated_word, plain_word])

        # Two segments that both cover position 4.
        first_overlap = Segment(str_index=idx, start=3, end=5)
        second_overlap = Segment(str_index=idx, start=4, end=6)
        self.overlapping_seg = Segmentation([first_overlap, second_overlap])

        # Rendering of word_seg that the to_string() tests compare against.
        output_template = ('segment number 1\n'
                           '\tcontent:\t"ab"\n'
                           '\tstr_index:\t%i\n'
                           '\tstart:\t0\n'
                           '\tend:\t2\n'
                           '\tannotations:\n'
                           '\t\ta                    1\n'
                           '\t\tbc                   20\n'
                           'segment number 2\n'
                           '\tcontent:\t"cde"\n'
                           '\tstr_index:\t%i\n'
                           '\tstart:\t3\n'
                           '\tend:\t6')
        self.base_output_string = output_template % ((idx,) * 2)

        # Counter incremented by the mock progress callback.
        self.count = 0

    def tearDown(self):
        """Cleaning up after the test (currently a no-op placeholder)."""
        pass

    def test_creator(self):
        """Is the object built by the constructor a Segmentation?"""
        new_segmentation = Segmentation()
        self.assertIsInstance(
            new_segmentation,
            Segmentation,
            msg="creator doesn't return Segmentation object!",
        )

    def test_to_string_default_format(self):
        """Is the default output of to_string() the expected rendering?"""
        rendered = self.word_seg.to_string()
        failure_message = (
            "to_string() doesn't format segmentation correctly by default!"
        )
        self.assertEqual(
            rendered,
            self.base_output_string,
            msg=failure_message,
        )

    def test_to_string_delimiter(self):
        """Does the segment delimiter appear in the output of to_string()?"""
        delimiter = 'DELIMITER'
        rendered = self.word_seg.to_string(segment_delimiter=delimiter)
        self.assertIn(
            delimiter,
            rendered,
            msg="to_string() doesn't format segment delimiter correctly!",
        )

    def test_to_string_header(self):
        """Is the header placed right before the output of to_string()?"""
        header = 'HEADER'
        rendered = self.word_seg.to_string(header=header)
        expected = header + self.base_output_string
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format header correctly!",
        )

    def test_to_string_footer(self):
        """Is the footer placed right after the output of to_string()?"""
        footer = 'FOOTER'
        rendered = self.word_seg.to_string(footer=footer)
        expected = self.base_output_string + footer
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't format footer correctly!",
        )

    def test_to_string_humanize_addresses(self):
        """Are addresses humanized by to_string() when requested?"""
        rendered = self.word_seg.to_string(humanize_addresses=True)

        # Derive the expected text from the default rendering: both start
        # positions and the string index are expected to be one higher.
        replacements = (
            ('t:\t3', 't:\t4'),
            ('t:\t0', 't:\t1'),
            ('x:\t%i' % self.str_index, 'x:\t%i' % (self.str_index + 1)),
        )
        expected = self.base_output_string
        for raw_form, humanized_form in replacements:
            expected = expected.replace(raw_form, humanized_form)

        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't humanize addresses!",
        )

    def test_to_string_interpolate_builtin_variables(self):
        """Are the builtin variables interpolated by to_string()?"""
        builtin_format = (
            '%(__num__)s,%(__content__)s,'
            '%(__str_index__)s,%(__start__)s,%(__end__)s,'
            '%(__str_index_raw__)s,%(__start_raw__)s,%(__end_raw__)s'
        )
        rendered = self.word_seg.to_string(formatting=builtin_format)

        # The string index shows up twice per segment (plain and raw form).
        expected = (
            '1,ab,%i,0,2,%i,0,2\n2,cde,%i,3,6,%i,3,6'
            % ((self.str_index,) * 4)
        )
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't interpolate builtin variables!",
        )

    def test_to_string_interpolate_annotations(self):
        """Are annotation values interpolated by to_string()?"""
        rendered = self.word_seg.to_string(formatting='%(a)s')
        # The second segment has no 'a' annotation, hence '__none__'.
        expected = '1\n__none__'
        self.assertEqual(
            rendered,
            expected,
            msg="to_string() doesn't interpolate annotations!",
        )

    def test_to_string_progress(self):
        """Is the progress callback called once per segment by to_string?"""
        def count_call():
            """Mock callback counting how many times it is invoked"""
            self.count += 1

        self.word_seg.to_string(progress_callback=count_call)
        expected_calls = len(self.word_seg)
        self.assertEqual(
            self.count,
            expected_calls,
            msg="to_string doesn't track progress!",
        )

    def test_get_annotation_keys(self):
        """Are the existing annotation keys returned, whatever the order?"""
        found_keys = sorted(self.word_seg.get_annotation_keys())
        expected_keys = sorted(['a', 'bc'])
        self.assertEqual(
            found_keys,
            expected_keys,
            msg="get_annotation_keys() doesn't return existing annotations!",
        )

    def test_is_non_overlapping(self):
        """Is a segmentation without overlap reported as non-overlapping?"""
        verdict = self.word_seg.is_non_overlapping()
        self.assertTrue(
            verdict,
            msg="is_non_overlapping() doesn't recognize absence of overlap!",
        )

    def test_is_overlapping(self):
        """Is a segmentation with overlap reported as overlapping?"""
        verdict = self.overlapping_seg.is_non_overlapping()
        self.assertFalse(
            verdict,
            msg="is_non_overlapping() doesn't recognize presence of overlap!",
        )