Exemple #1
0
 def test_update_string(self):
     """Does update modify stored string?"""
     seg = Input('test2')
     seg.update('modified')
     self.assertEqual(Segmentation.get_data(-1)[:],
                      'modified',
                      msg="update doesn't modify stored string!")
Exemple #2
0
 def test_clear_string(self):
     """Does clear set stored string to None?"""
     seg = Input('test3')
     seg.clear()
     self.assertEqual(Segmentation.get_data(-1),
                      None,
                      msg="clear doesn't set stored string to None!")
    def send_data(self):
        """Creates the inputs based on the fetched data"""
        self.controlArea.setDisabled(True)
        self.clearCreatedInputs()
        segmentation = None

        # Goes over each queries in the data list
        for query in self.queryList:
            for text in query:
                # Create inputs
                newInput = Input(text)
                self.createdInputs.append(newInput)

        # If there is only one input, create a segmentation...
        if len(self.createdInputs) == 1:
            segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            segmentation = Segmenter.concatenate(
                self.createdInputs,
                import_labels_as=None,
            )

        # Annotate segments...
        annotations = list()
        for elem in self.annotList:
            for dic in elem:
                annotations.append(dic)

        for idx, segment in enumerate(segmentation):
            segment.annotations.update(annotations[idx])
            segmentation[idx] = segment

        # Calculate number of characters...
        num_chars = 0
        for segment in segmentation:
            num_chars += len(Segmentation.get_data(segment.str_index))

        # If there is data...
        if len(segmentation) != 0:
            # Inform the user of the number of segments and the number of characters...
            self.infoBox.setText(
                "{} segments sent to output ({} characters)".format(
                    len(segmentation),
                    num_chars,
                ))
            # Send the segments
            self.send("Segmentation", segmentation)
            self.controlArea.setDisabled(False)
            self.sendButton.resetSettingsChangedFlag()
        else:
            # Else, signal the user that no data is sendable...
            self.infoBox.setText(
                "There are {} segments to send to output. Please fill the query basket and click 'send' again"
                .format(len(segmentation)), "warning")
            self.sendButton.resetSettingsChangedFlag()
            self.controlArea.setDisabled(False)
            self.send("Segmentation", None)
Exemple #4
0
 def test_slice_string(self):
     """Does the slicing work like in strings"""
     Input('Hello world!')
     self.assertEqual(
         Segmentation.get_data(-1)[3:7],
         u"Hello world!"[3:7],
         msg="slicing doesn't return the same as in strings"
     )
Exemple #5
0
 def test_creator_store_string(self):
     """Does creator store string in class variable?"""
     Input(u'test')
     self.assertEqual(
         Segmentation.get_data(-1)[:],
         u'test',
         msg="creator doesn't store string in class variable!"
     )
Exemple #6
0
 def test_clear_string(self):
     """Does clear set stored string to None?"""
     seg = Input('test3')
     seg.clear()
     self.assertEqual(
         Segmentation.get_data(-1),
         None,
         msg="clear doesn't set stored string to None!"
     )
Exemple #7
0
 def test_update_string(self):
     """Does update modify stored string?"""
     seg = Input(u'test2')
     seg.update(u'modified')
     self.assertEqual(
         Segmentation.get_data(-1)[:],
         u'modified',
         msg="update doesn't modify stored string!"
     )
    def sendData(self):
        """Compute result of widget processing and send to output"""

        # Clear created Inputs
        self.clearCreatedInputs()
        
        if self.service == u'Twitter':
            try:
                self.createdInputs = self.get_tweets(
                    self.word_to_search,
                    self.nb_tweet,
                    self.include_RT,
                    self.useTwitterLicenseKey,
                    (
                        self.twitterLicenseKeysConsumerKey,
                        self.twitterLicenseKeysConsumerSecret,
                        (
                            self.twitterLicenseKeysAccessToken,
                            self.twitterLicenseKeysAccessTokenSecret
                        )
                    )
                )
            except (HTTP401Authentication, HTTP400BadRequest):
                self.infoBox.setText(
                    u'Please enter valid Twitter api keys.',
                    u'error',
                )
                self.send(u'Text data', None, self)
                return False
            except SearchEngineLimitError:
                self.infoBox.setText(
                    u'Twitter search limit has been exceeded.',
                    u'error',
                )
                self.send(u'Text data', None, self)
                return False


        elif self.service == u'Wikipedia':
            self.createdInputs = self.get_wiki_article(
                self.word_to_search,
                self.wiki_section,
                self.wiki_type_of_text
            )

        elif self.service == u'Bing':
            self.createdInputs = self.get_bing_entries(
                self.word_to_search,
                self.nb_bing_entry
            )

        

        if len(self.createdInputs) == 0:
            self.infoBox.setText(
                u'Please try to change query or settings.',
                u'warning',
            )
            self.send(u'Text data', None, self)
            return False

        # Initialize progress bar
        progressBar = OWGUI.ProgressBar(
            self, 
            iterations=50
        )

        output_segmentation = Segmenter.concatenate(
            self.createdInputs, 
            self.captionTitle, 
            import_labels_as=None
        )

        message = u'%i segment@p sent to output ' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        numChars = 0
        for segment in output_segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        for _ in xrange(50):
            progressBar.advance()

        # Clear progress bar.
        progressBar.finish()

        self.send('Text data', output_segmentation, self)
    
        self.sendButton.resetSettingsChangedFlag()
Exemple #9
0
    def sendData(self):
        """Compute result of widget processing and send to output"""

        # Skip if title list is empty:
        if self.titleLabels == list():
            return

        # Check that something has been selected...
        if len(self.selectedTitles) == 0:
            self.infoBox.setText("Please select one or more titles.",
                                 "warning")
            self.send("XML-TEI data", None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        # Initialize progress bar.
        progressBar = gui.ProgressBar(self,
                                      iterations=len(self.selectedTitles))

        # Attempt to connect to ECP and retrieve plays...
        xml_contents = list()
        annotations = list()
        try:
            for title in self.selectedTitles:
                doc_url = self.document_base_url +  \
                    self.filteredTitleSeg[title].annotations["url"]
                print(doc_url)
                url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url)
                print(url)
                response = urllib.request.urlopen(url)
                xml_contents.append(response.read().decode('utf-8'))
                source_annotations = \
                self.filteredTitleSeg[title].annotations.copy()
                #source_annotations["url"] = source_annotations["href"]
                #del source_annotations["href"]
                annotations.append(source_annotations)
                progressBar.advance()  # 1 tick on the progress bar...

        # If an error occurs (e.g. http error, or memory error)...
        except:
            #Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from ECP website.",
                                 "error")
            # Reset output channel.
            self.send("XML-TEI data", None, self)
            return

        # Store downloaded XML in input objects...
        for xml_content_idx in range(len(xml_contents)):
            newInput = Input(xml_contents[xml_content_idx], self.captionTitle)
            self.createdInputs.append(newInput)

        # If there"s only one play, the widget"s output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget"s output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Store imported URLs as setting.
        self.importedURLs = [
            self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
        ]

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()

        # Clear progress bar.
        progressBar.finish()

        # Send token...
        self.send("XML-TEI data", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Fetch URL content, create and send segmentation"""

        # Check that there's something on input...
        if ((self.displayAdvancedSettings and not self.URLs)
                or not (self.URL or self.displayAdvancedSettings)):
            self.infoBox.setText(u'Please select source URL.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if self.displayAdvancedSettings and self.autoNumber:
            if self.autoNumberKey:
                autoNumberKey = self.autoNumberKey
            else:
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning')
                self.send('Text data', None, self)
                return
        else:
            autoNumberKey = None

        # Clear created Inputs...
        self.clearCreatedInputs()

        URLContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myURLs = self.URLs
        else:
            myURLs = [[self.URL, self.encoding, u'', u'']]

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(myURLs))

        # Process each URL successively...
        for myURL in myURLs:

            URL = myURL[0]
            if not URL.startswith("http"):
                URL = "http://" + URL
            encoding = myURL[1]
            encoding = re.sub(r"[ ]\(.+", "", encoding)
            annotation_key = myURL[2]
            annotation_value = myURL[3]

            # Try to fetch URL content...
            self.error()
            URLContent = ""
            try:
                URLHandle = urlopen(URL)
                URLContent = URLHandle.read()
                URLHandle.close()
            except http.client.IncompleteRead as e:
                URLContent = e.partial
            except IOError:
                progressBar.finish()
                if len(myURLs) > 1:
                    message = u"Couldn't retrieve %s." % URL
                else:
                    message = u"Couldn't retrieve URL."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return
            try:
                if encoding == "(auto-detect)":
                    encoding = chardet.detect(URLContent)['encoding']
                URLContent = URLContent.decode(encoding)
            except UnicodeError:
                progressBar.finish()
                if len(myURLs) > 1:
                    message = u"Please select another encoding "    \
                              + u"for URL %s." % URL
                else:
                    message = u"Please select another encoding."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # Replace newlines with '\n'...
            # URLContent = URLContent.replace('\r\n', '\n').replace('\r', '\n')

            # TODO: check if this is more efficient than replace above...
            URLContent = '\n'.join(URLContent.splitlines())

            # Remove utf-8 BOM if necessary...
            if encoding == u'utf-8':
                URLContent = URLContent.lstrip(codecs.BOM_UTF8.decode('utf-8'))

            # Normalize text (canonical decomposition then composition)...
            URLContent = normalize('NFC', URLContent)

            URLContents.append(URLContent)

            # Annotations...
            annotation = dict()
            if self.displayAdvancedSettings:
                if annotation_key and annotation_value:
                    annotation[annotation_key] = annotation_value
                if self.importURLs and self.importURLsKey:
                    annotation[self.importURLsKey] = URL
                if self.autoNumber and self.autoNumberKey:
                    annotation[self.autoNumberKey] = counter
                    counter += 1
            annotations.append(annotation)
            progressBar.advance()

        # Create an LTTL.Input for each URL...
        if len(URLContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for index in range(len(URLContents)):
            myInput = Input(URLContents[index], label)
            segment = myInput[0]
            segment.annotations.update(annotations[index])
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one URL, the widget's output is the created Input.
        if len(URLContents) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemple #11
0
    def sendData(self):
        """Compute result of widget processing and send to output"""
        # Skip if title list is empty:
        if self.myBasket == list():
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Connect to imdb and add elements in lists
        list_review = list()
        list_annotation = list()
        annotations = list()
        try:
            for item in self.myBasket:
                movie = self.ia.get_movie_reviews(item['id'])
                movie_annotations = self.ia.get_movie(item['id'])
                list_review.append(movie)
                list_annotation.append(movie_annotations)
                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from imdb", "error")
            self.controlArea.setDisabled(False)
            return

        # Store movie critics strings in input objects...
        for movie in list_review:
            data = movie.get('data', "")
            reviews_data = data.get('reviews')
            for review in reviews_data:
                reviews = review.get('content')
                newInput = Input(reviews)
                self.createdInputs.append(newInput)
        for item in list_annotation:
            print(item)
            # Store the annotation as dicts in a separate list
            annotations_dict = {"title": item, "year": item["year"]}
            annot_dict_copy = annotations_dict.copy()
            for i in range(25):
                annotations.append(annot_dict_copy)
        print(annotations)
        # If there's only one item, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = f"{len(self.segmentation)} segment@p sent to output"
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += " (%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send('Segmentation', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemple #12
0
    def sendData(self):
        """Compute result of widget processing and send to output"""
        # Skip if title list is empty:
        if self.myBasket == list():
            self.infoBox.setText(
                "Your corpus is empty, please add some books first", "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(
            self,
            iterations=len(self.myBasket),
        )

        selectedTexts = list()
        text_content = list()
        annotations = list()
        # get the Gutenberg cache
        cache = GutenbergCache.get_cache()
        try:
            # TODO: Retrieve selected texts from gutenberg
            for text in self.myBasket:

                # Get the id of the text
                query_id = cache.native_query(
                    sql_query=
                    "select gutenbergbookid from books where id == {selected_id}"
                    .format(selected_id=text[2]))
                gutenberg_id = list(query_id)

                # Get the text with Gutenbergpy
                gutenberg_text = gutenbergpy.textget.strip_headers(
                    gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))
                text_content.append(gutenberg_text)

                annotations.append(text[1])
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Gutenberg",
                                 "error")
            self.controlArea.setDisabled(False)
            return

        # TODO: send gutenberg texts as output
        # Store downloaded lyrics strings in input objects...
        for text in text_content:
            newInput = Input(text, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there"s only one play, the widget"s output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget"s output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # TODO: annotate with book metadata
        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update({"title": annotations[idx]})
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Gutenberg importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemple #13
0
    def sendData(self):

        """Load folders, create and send segmentation"""

        # Check that there's something on input...
        if (
            (self.displayAdvancedSettings and not self.folders) or
            not (self.rootFolderPath or self.displayAdvancedSettings)
        ):
            self.infoBox.setText(u'Please select input folder.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if self.displayAdvancedSettings and self.autoNumber:
            if self.autoNumberKey:
                autoNumberKey = self.autoNumberKey
            else:
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning'
                )
                self.send('Text data', None, self)
                return
        else:
            autoNumberKey = None

        # Clear created Inputs...
        self.clearCreatedInputs()

        fileContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFolders = self.folders
        else:
            myFolders = [[self.rootFolderPath]]

        progressBar = gui.ProgressBar(
            self,
            iterations=len(myFolders)
        )

        # Walk through each folder and open each files successively...

        fileContents = self.fileContents

        # Annotations...
        myFolders = self.folders
        for myFolder in myFolders:
            myFiles = myFolder['fileList']

            for myFile in myFiles:
                # print(myFile)
                annotation = dict()

                if self.importFileNameKey:
                    annotation[self.importFileNameKey] = myFile['fileName']

                if self.importFolderNameKey:
                    annotation[self.importFolderNameKey] = myFile['folderName']

                if self.FolderDepth1Key:
                    annotation[self.FolderDepth1Key] = myFile['depth1']

                if self.FolderDepth2Key:
                    annotation[self.FolderDepth2Key] = myFile['depth2']

                if self.FolderDepthLvl:
                    annotation[self.FolderDepthLvl] = myFile['depthLvl']

                annotations.append(annotation)
            # progressBar.advance()

        # Create an LTTL.Input for each files...

        if len(fileContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for index in range(len(fileContents)):
            myInput = Input(fileContents[index], label)
            segment = myInput[0]
            segment.annotations.update(annotations[index])
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(fileContents) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )
        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Compute result of widget processing and send to output"""
        # Skip if title list is empty:
        if self.myBasket == list():
            self.infoBox.setText(
                "Your corpus is empty, please add some songs first", "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Attempt to connect to Genius and retrieve lyrics...
        selectedSongs = list()
        song_content = list()
        annotations = list()
        try:
            for song in self.myBasket:
                # song is a dict {'idx1':{'title':'song1'...},
                # 'idx2':{'title':'song2'...}}
                page_url = "http://genius.com" + song['path']
                lyrics = self.html_to_text(page_url)
                song_content.append(lyrics)
                annotations.append(song.copy())
                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Genius website.",
                                 "error")
            self.controlArea.setDisabled(False)
            return

        # Store downloaded lyrics strings in input objects...
        for song in song_content:
            newInput = Input(song, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there"s only one play, the widget"s output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget"s output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Lyrics importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Load files, create and send segmentation"""

        # Check that there's something on input...
        if ((self.displayAdvancedSettings and not self.files)
                or not (self.file or self.displayAdvancedSettings)):
            self.infoBox.setText(u'Please select input file.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if self.displayAdvancedSettings and self.autoNumber:
            if self.autoNumberKey:
                autoNumberKey = self.autoNumberKey
            else:
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning')
                self.send('Text data', None, self)
                return
        else:
            autoNumberKey = None

        # Clear created Inputs...
        self.clearCreatedInputs()

        fileContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFiles = self.files
        else:
            myFiles = [[self.file, self.encoding, u'', u'']]

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(myFiles))

        # Open and process each file successively...
        for myFile in myFiles:
            filePath = myFile[0]
            encoding = myFile[1]
            encoding = re.sub(r"[ ]\(.+", "", encoding)
            annotation_key = myFile[2]
            annotation_value = myFile[3]

            # Try to open the file...
            self.error()
            try:
                if encoding == "(auto-detect)":
                    detector = UniversalDetector()
                    fh = open(filePath, 'rb')
                    for line in fh:
                        detector.feed(line)
                        if detector.done: break
                    detector.close()
                    fh.close()
                    encoding = detector.result['encoding']
                fh = open(
                    filePath,
                    mode='rU',
                    encoding=encoding,
                )
                try:
                    fileContent = ""
                    i = 0
                    chunks = list()
                    for chunk in iter(lambda: fh.read(CHUNK_LENGTH), ""):
                        chunks.append('\n'.join(chunk.splitlines()))
                        i += CHUNK_LENGTH
                        if i % (CHUNK_NUM * CHUNK_LENGTH) == 0:
                            fileContent += "".join(chunks)
                            chunks = list()
                    if len(chunks):
                        fileContent += "".join(chunks)
                    del chunks
                except UnicodeError:
                    progressBar.finish()
                    if len(myFiles) > 1:
                        message = u"Please select another encoding "    \
                                  + u"for file %s." % filePath
                    else:
                        message = u"Please select another encoding."
                    self.infoBox.setText(message, 'error')
                    self.send('Text data', None, self)
                    self.controlArea.setDisabled(False)
                    return
                finally:
                    fh.close()
            except IOError:
                progressBar.finish()
                if len(myFiles) > 1:
                    message = u"Couldn't open file '%s'." % filePath
                else:
                    message = u"Couldn't open file."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # Remove utf-8 BOM if necessary...
            if encoding == u'utf-8':
                fileContent = fileContent.lstrip(
                    codecs.BOM_UTF8.decode('utf-8'))

            # Normalize text (canonical decomposition then composition)...
            fileContent = normalize('NFC', fileContent)

            fileContents.append(fileContent)

            # Annotations...
            annotation = dict()
            if self.displayAdvancedSettings:
                if annotation_key and annotation_value:
                    annotation[annotation_key] = annotation_value
                if self.importFilenames and self.importFilenamesKey:
                    filename = os.path.basename(filePath)
                    annotation[self.importFilenamesKey] = filename
                if self.autoNumber and self.autoNumberKey:
                    annotation[self.autoNumberKey] = counter
                    counter += 1
            annotations.append(annotation)
            progressBar.advance()

        # Create an LTTL.Input for each file...
        if len(fileContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for index in range(len(fileContents)):
            myInput = Input(fileContents[index], label)
            segment = myInput[0]
            segment.annotations.update(annotations[index])
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(fileContents) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemple #16
0
    def sendData(self):
        """Load folders, create and send segmentation"""

        # Check that there's something on input...
        if (self.displayAdvancedSettings
                and not self.folders) or not (self.rootFolderPath
                                              or self.displayAdvancedSettings):
            self.infoBox.setText(u'Please select input folder.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if self.displayAdvancedSettings and self.autoNumber:
            if self.autoNumberKey:
                autoNumberKey = self.autoNumberKey
            else:
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning')
                self.send('Text data', None, self)
                return
        else:
            autoNumberKey = None

        # Clear created Inputs...
        self.clearCreatedInputs()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFolders = self.folders
        else:
            myFolders = [self.folder]

        # Annotations...
        allFileListContent = list()
        for myFolder in myFolders:

            myFiles = myFolder['fileList']

            for myFile in myFiles:

                annotation = dict()
                annotation['file name'] = myFile['fileName']
                annotation['file depth level'] = myFile['depthLvl']
                annotation['file path'] = myFile['absoluteFilePath']
                try:
                    annotation['file encoding, confidence'] = myFile[
                        'encoding'] + ", " + str(myFile['encodingConfidence'])
                except TypeError:
                    annotation['file encoding, confidence'] = "unknown"

                depths = [k for k in myFile.keys() if k.startswith('depth_')]
                for depth in depths:
                    annotation[depth] = myFile[depth]

                annotations.append(annotation)
                allFileListContent.append(myFile['fileContent'])

        # Create an LTTL.Input for each files...

        if len(allFileListContent) == 1:
            label = self.captionTitle
        else:
            label = None
        for index in range(len(allFileListContent)):
            myInput = Input(allFileListContent[index], label)
            segment = myInput[0]
            segment.annotations.update(annotations[index])
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(allFileListContent) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )
        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Compute result of widget processing and send to output"""

        # Skip if title list is empty:
        if self.titleLabels == list():
            return

        # Check that something has been selected...
        if len(self.selectedTitles) == 0:
            self.infoBox.setText(u'Please select one or more titles.',
                                 'warning')
            self.send(u'Text data', None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        # Initialize progress bar.
        progressBar = OWGUI.ProgressBar(self,
                                        iterations=len(self.selectedTitles))

        # Attempt to connect to Theatre-classique and retrieve plays...
        xml_contents = list()
        annotations = list()
        try:
            for title in self.selectedTitles:
                response = urllib2.urlopen(
                    self.document_base_url +
                    self.filteredTitleSeg[title].annotations[u'url'])
                xml_contents.append(unicode(response.read(), u'utf8'))
                annotations.append(
                    self.filteredTitleSeg[title].annotations.copy())
                progressBar.advance()  # 1 tick on the progress bar...

        # If an error occurs (e.g. http error, or memory error)...
        except:

            # Set Info box and widget to 'error' state.
            self.infoBox.setText(
                u"Couldn't download data from theatre-classique website.",
                'error')

            # Reset output channel.
            self.send(u'Text data', None, self)
            return

        # Store downloaded XML in input objects...
        for xml_content_idx in xrange(len(xml_contents)):
            newInput = Input(xml_contents[xml_content_idx], self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one play, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Store imported URLs as setting.
        self.importedURLs = [
            self.filteredTitleSeg[self.selectedTitles[0]].annotations[u'url']
        ]

        # Set status to OK and report data size...
        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()

        # Clear progress bar.
        progressBar.finish()

        # Send token...
        self.send(u'Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemple #18
0
    def sendData(self):
        """Compute result of widget processing and send to output"""
        # Skip if title list is empty:
        if self.myBasket == list():
            self.infoBox.setText(
                "Your corpus is empty, please add some books first", "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(
            self,
            iterations=len(self.myBasket),
        )

        text_content = list()
        annotations = list()

        try:
            # Retrieve selected texts from gutenberg
            for text in self.myBasket:

                gutenberg_id = text[2]

                # Get the text with Gutenbergpy
                gutenberg_text = gutenbergpy.textget.strip_headers(
                    gutenbergpy.textget.get_text_by_id(gutenberg_id)).decode(
                        "utf-8")

                text_content.append(gutenberg_text)
                # populate the annotation list
                annotations.append([text[0], text[1], text[3]])
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception as exc:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Gutenberg",
                                 "error")
            self.controlArea.setDisabled(False)
            print(exc)
            return

        # Store downloaded text strings in input objects...
        for text in text_content:
            newInput = Input(text, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one text, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget"s output is a concatenation.
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments with book metadata
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update({"title": annotations[idx][0]})
            segment.annotations.update({"author": annotations[idx][1]})
            segment.annotations.update({"language": annotations[idx][2]})
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Gutenberg importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Send data from website springfieldspringfield"""
        # Skip if title list is empty:
        if self.myBasket == list():
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning"
            )
            self.segmentation = None
            self.send("Movie transcripts", self.segmentation, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        annotations = list()
        script_list = list()
        annotations_dict = dict()
        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # This part of code is what fetches the actual script
        try:
            for movie in self.myBasket:
                # Each movie that is in the corpus is split into title and year
                # (rsplit makes sure to only split last occurence) which will 
                # become annotations
                b = copy.copy(movie)
                future_annotation = b.rsplit('(', 1)
                movie_title = future_annotation[0]
                movie_year = future_annotation[-1]
                movie_year = movie_year[:-1]
                annotations_dict["Movie Title"] = movie_title
                annotations_dict["Year of release"] = movie_year
                # It is important to make a copy of dictionary, otherwise each 
                # iteration will replace every element of the annotations list
                annotations.append(annotations_dict.copy())
                # link_end and page_url are the two variables that will have to
                # be changed in case scripts need to be taken from elsewhere
                link_end = self.path_storage[movie]
                page_url = "https://www.springfieldspringfield.co.uk/" +   \
                    "movie_script.php?movie=" + link_end
                page = urllib.request.urlopen(page_url)
                soup = BeautifulSoup(page, 'html.parser')

                # This is what grabs the movie script
                script = soup.find("div", {"class":"movie_script"})

                script_list.append(script.text)

                # 1 tick on the progress bar of the widget
                progressBar.advance()

        except:
            self.infoBox.setText(
                "Couldn't download data from SpringfieldSpringfield website.",
                "error"
            )
            self.controlArea.setDisabled(False)
            return

        # Store downloaded script strings in input objects...
        for script in script_list:
            newInput = Input(script, self.captionTitle)
            self.createdInputs.append(newInput)

       # If there's only one play, the widget"s output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget"s output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Movie transcripts", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Load files, create and send segmentation"""

        # Check that there's something on input...
        if ((self.displayAdvancedSettings and not self.files)
                or not (self.file or self.displayAdvancedSettings)):
            self.infoBox.setText(u'Please select input file.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if self.displayAdvancedSettings and self.autoNumber:
            if self.autoNumberKey:
                autoNumberKey = self.autoNumberKey
            else:
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning')
                self.send('Text data', None, self)
                return
        else:
            autoNumberKey = None

        # Clear created Inputs...
        self.clearCreatedInputs()

        fileContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFiles = self.files
        else:
            myFiles = [[self.file, self.encoding, "", "", "", "eng", False]]

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(myFiles))

        # Open and process each file successively...
        for myFile in myFiles:
            filePath = myFile[0]
            encoding = myFile[1]
            encoding = re.sub(r"[ ]\(.+", "", encoding)
            annotation_key = myFile[2]
            annotation_value = myFile[3]
            pdf_password = myFile[4]  # SuperTextFiles
            ocr_languages = myFile[5]  # SuperTextFiles
            ocr_force = myFile[6]  # SuperTextFiles

            myFiletype = filetype.guess(myFile[0])  # SuperTextFiles

            # Try to open the file...
            self.error()
            # Start SuperTextFiles
            try:
                if myFiletype is None:
                    fileContent = self.extract_raw_text(filePath, encoding)

                elif myFiletype.extension == "pdf":
                    if ocr_force is True:
                        fileContent = self.get_pdf_content(
                            filePath,
                            ocr_languages,
                        )
                    else:
                        if self.is_textual_pdf_file(filePath) is True:
                            fileContent = self.extract_text_from_pdf(filePath)
                        else:
                            fileContent = self.get_pdf_content(
                                filePath,
                                ocr_languages,
                            )

                elif myFiletype.extension in IMG_FILETYPES:
                    fileContent = self.ocrize(filePath, ocr_languages)

                if fileContent == -1:
                    message = u"Couldn't open file."
                    self.infoBox.setText(message, 'error')
                    self.send('Text data', None, self)
                    self.controlArea.setDisabled(False)
                    return

            # End SuperTextFiles

            except IOError as e:
                if "tesseract" in str(e):
                    QMessageBox.warning(None, 'Textable', str(e),
                                        QMessageBox.Ok)
                progressBar.finish()
                if len(myFiles) > 1:
                    message = u"Couldn't open file '%s'." % filePath
                else:
                    message = u"Couldn't open file."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # Remove utf-8 BOM if necessary...
            if encoding == u'utf-8':
                fileContent = fileContent.lstrip(
                    codecs.BOM_UTF8.decode('utf-8'))

            # Normalize text (canonical decomposition then composition)...
            fileContent = normalize('NFC', fileContent)

            fileContents.append(fileContent)

            # Annotations...
            annotation = dict()
            if self.displayAdvancedSettings:
                if annotation_key and annotation_value:
                    annotation[annotation_key] = annotation_value
                if self.importFilenames and self.importFilenamesKey:
                    filename = os.path.basename(filePath)
                    annotation[self.importFilenamesKey] = filename
                if self.autoNumber and self.autoNumberKey:
                    annotation[self.autoNumberKey] = counter
                    counter += 1
            annotations.append(annotation)
            progressBar.advance()

        # Create an LTTL.Input for each file...
        if len(fileContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for index in range(len(fileContents)):
            myInput = Input(fileContents[index], label)
            segment = myInput[0]
            segment.annotations.update(annotations[index])
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(fileContents) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = 0
        for segment in self.segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemple #21
0
    def sendData(self):
        """Compute result of widget processing and send to output"""

        # Clear created Inputs
        self.clearCreatedInputs()

        if self.service == u'Twitter':
            try:
                self.createdInputs = self.get_tweets(
                    self.word_to_search, self.nb_tweet, self.include_RT,
                    self.useTwitterLicenseKey,
                    (self.twitterLicenseKeysConsumerKey,
                     self.twitterLicenseKeysConsumerSecret,
                     (self.twitterLicenseKeysAccessToken,
                      self.twitterLicenseKeysAccessTokenSecret)))
            except (HTTP401Authentication, HTTP400BadRequest):
                self.infoBox.setText(
                    u'Please enter valid Twitter api keys.',
                    u'error',
                )
                self.send(u'Text data', None, self)
                return False
            except SearchEngineLimitError:
                self.infoBox.setText(
                    u'Twitter search limit has been exceeded.',
                    u'error',
                )
                self.send(u'Text data', None, self)
                return False

        elif self.service == u'Wikipedia':
            self.createdInputs = self.get_wiki_article(self.word_to_search,
                                                       self.wiki_section,
                                                       self.wiki_type_of_text)

        elif self.service == u'Bing':
            self.createdInputs = self.get_bing_entries(self.word_to_search,
                                                       self.nb_bing_entry)

        if len(self.createdInputs) == 0:
            self.infoBox.setText(
                u'Please try to change query or settings.',
                u'warning',
            )
            self.send(u'Text data', None, self)
            return False

        # Initialize progress bar
        progressBar = OWGUI.ProgressBar(self, iterations=50)

        output_segmentation = Segmenter.concatenate(self.createdInputs,
                                                    self.captionTitle,
                                                    import_labels_as=None)

        message = u'%i segment@p sent to output ' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        numChars = 0
        for segment in output_segmentation:
            segmentLength = len(Segmentation.get_data(segment.str_index))
            numChars += segmentLength
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        for _ in xrange(50):
            progressBar.advance()

        # Clear progress bar.
        progressBar.finish()

        self.send('Text data', output_segmentation, self)

        self.sendButton.resetSettingsChangedFlag()