Ejemplo n.º 1
0
 def setUp(self):
     """Build the inputs and hand-made segmentations shared by the tests."""
     self.entire_text_seg = Input('ab cde')
     self.other_entire_text_seg = Input('d')
     # Every hand-built segment below points at the string of 'ab cde'.
     text_index = self.entire_text_seg[0].str_index

     annotated_first_word = Segment(
         str_index=text_index,
         start=0,
         end=2,
         annotations={'a': 1},
     )
     self.first_word_seg = Segmentation([annotated_first_word])

     last_word = Segment(str_index=text_index, start=3, end=6)
     self.last_word_seg = Segmentation([last_word])

     # One single-character segment for each of the positions 0..5.
     self.char_seg = Segmentation([
         Segment(str_index=text_index, start=pos, end=pos + 1)
         for pos in range(6)
     ])
Ejemplo n.º 2
0
 def test_update_string(self):
     """Check that update() replaces the most recently stored string."""
     text_input = Input('test2')
     text_input.update('modified')
     stored = Segmentation.get_data(-1)[:]
     self.assertEqual(
         stored,
         'modified',
         msg="update doesn't modify stored string!",
     )
Ejemplo n.º 3
0
 def test_clear_string(self):
     """Check that clear() leaves None in place of the stored string."""
     text_input = Input('test3')
     text_input.clear()
     stored = Segmentation.get_data(-1)
     self.assertEqual(
         stored,
         None,
         msg="clear doesn't set stored string to None!",
     )
Ejemplo n.º 4
0
    def sendData(self):
        """Transcribe the selected audio file and send the text to output.

        Sends None on the 'Text data' channel (with a warning in the info
        box) when no file is selected, when the transcription call raises
        speech_recognition.UnknownValueError, or when it returns None.
        Otherwise one Input is created per transcription chunk (when
        self.selected_seg is set) or a single Input for the whole
        transcription, and their concatenation is sent.
        """
        if not self.file:
            self.infoBox.setText(u"Please select input file.", "warning")
            self.send('Text data', None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        # Get transcription
        try:
            transcription = self.get_large_audio_transcription(
                self.file,
                language=self.language,
                set_silence_len=self.selected_dur,
                set_silence_threshold=self.selected_vol)
        except speech_recognition.UnknownValueError:
            self.infoBox.setText(
                u"You seem to have overuseed the built-in API key, refer to the documentation for further informations.",
                "warning")
            self.send('Text data', None, self)
            return

        # Checks if there is a transcription
        if transcription is None:
            self.infoBox.setText(u"You must use mp3 or wav audio files.",
                                 "warning")
            self.send('Text data', None, self)
            return

        # Use the file name (last path component) as the label. The former
        # pattern "[^(/\\)]+[mp3|wav]$" used a character class where an
        # alternation was intended, and re.findall() handed a *list* to
        # Input(label=...) instead of a string.
        title = re.split(r"[/\\]", self.file)[-1]

        if self.selected_seg:
            self.createdInputs.extend(
                Input(chunk, label=title) for chunk in transcription)
        else:
            self.createdInputs.append(Input(transcription, label=title))

        # Concatenates the segmentations in the output segmentation
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=False,
            import_labels_as="")

        # Report the number of segments sent
        num_segments = len(self.segmentation)
        message = " Succesfully transcripted ! % i segment@p sent to output" % num_segments
        message = pluralize(message, num_segments)

        # Send token...
        self.send("Text data", self.segmentation, self)
        self.infoBox.setText(message)
        self.sendButton.resetSettingsChangedFlag()
Ejemplo n.º 5
0
 def test_update_string(self):
     """Check that update() replaces the most recently stored string."""
     text_input = Input(u'test2')
     text_input.update(u'modified')
     stored = Segmentation.get_data(-1)[:]
     self.assertEqual(
         stored, u'modified', msg="update doesn't modify stored string!"
     )
Ejemplo n.º 6
0
 def test_clear_string(self):
     """Check that clear() leaves None in place of the stored string."""
     text_input = Input('test3')
     text_input.clear()
     stored = Segmentation.get_data(-1)
     self.assertEqual(
         stored, None, msg="clear doesn't set stored string to None!"
     )
Ejemplo n.º 7
0
    def setUp(self):
        """Create the segmentations and the expected rendering for the tests."""
        self.entire_text_seg = Input('ab cde')
        self.str_index = self.entire_text_seg[0].str_index
        idx = self.str_index

        # Two words of 'ab cde'; only the first one carries annotations.
        annotated_word = Segment(
            str_index=idx,
            start=0,
            end=2,
            annotations={'a': '1', 'bc': '20'},
        )
        plain_word = Segment(str_index=idx, start=3, end=6)
        self.word_seg = Segmentation([annotated_word, plain_word])

        # Two segments of length 2 that share the character at position 4.
        self.overlapping_seg = Segmentation([
            Segment(str_index=idx, start=begin, end=begin + 2)
            for begin in (3, 4)
        ])

        # Both %i placeholders receive the string index of the input.
        output_template = (
            'segment number 1\n'
            '\tcontent:\t"ab"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t0\n'
            '\tend:\t2\n'
            '\tannotations:\n'
            '\t\ta                    1\n'
            '\t\tbc                   20\n'
            'segment number 2\n'
            '\tcontent:\t"cde"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t3\n'
            '\tend:\t6'
        )
        self.base_output_string = output_template % (idx, idx)

        self.count = 0
Ejemplo n.º 8
0
 def test_creator(self):
     """Check that calling Input() yields an Input instance."""
     created = Input()
     self.assertIsInstance(
         created, Input, msg="creator doesn't return Input object!"
     )
    def send_data(self):
        """Create Inputs from the queried texts, annotate them and send them.

        The control area is disabled while the method runs. A warning is
        shown and None is sent when the resulting segmentation is empty.
        """
        self.controlArea.setDisabled(True)
        self.clearCreatedInputs()

        # One Input per text of every query in the list.
        for query in self.queryList:
            for text in query:
                self.createdInputs.append(Input(text))

        # A single Input is sent as is; otherwise the Inputs are concatenated.
        if len(self.createdInputs) == 1:
            segmentation = self.createdInputs[0]
        else:
            segmentation = Segmenter.concatenate(
                self.createdInputs,
                import_labels_as=None,
            )

        # Flatten the per-query annotation lists, then annotate by position.
        annotations = [dic for elem in self.annotList for dic in elem]
        for position, segment in enumerate(segmentation):
            segment.annotations.update(annotations[position])
            segmentation[position] = segment

        # Total length of the strings referenced by the segments.
        num_chars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in segmentation
        )

        num_segments = len(segmentation)
        if num_segments == 0:
            # Nothing to send: warn the user and reset the output channel.
            self.infoBox.setText(
                "There are {} segments to send to output. Please fill the query basket and click 'send' again"
                .format(num_segments), "warning")
            self.sendButton.resetSettingsChangedFlag()
            self.controlArea.setDisabled(False)
            self.send("Segmentation", None)
            return

        # Report the amount of data, then send the segments.
        self.infoBox.setText(
            "{} segments sent to output ({} characters)".format(
                num_segments,
                num_chars,
            ))
        self.send("Segmentation", segmentation)
        self.controlArea.setDisabled(False)
        self.sendButton.resetSettingsChangedFlag()
Ejemplo n.º 10
0
 def test_creator_store_string(self):
     """Check that creating an Input stores its string in Segmentation."""
     Input(u'test')
     stored = Segmentation.get_data(-1)[:]
     self.assertEqual(
         stored,
         u'test',
         msg="creator doesn't store string in class variable!",
     )
Ejemplo n.º 11
0
 def test_slice_string(self):
     """Check that slicing the stored data matches plain string slicing."""
     Input('Hello world!')
     expected = u"Hello world!"[3:7]
     actual = Segmentation.get_data(-1)[3:7]
     self.assertEqual(
         actual,
         expected,
         msg="slicing doesn't return the same as in strings",
     )
    def __init__(self):
        """Set up the text editor, info box and send button of the widget."""
        super().__init__()

        # The info box and send button are created first and drawn last.
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea, master=self,
            callback=self.sendData, infoBoxAttribute='infoBox',
        )

        # LTTL.Input object (token that will be sent).
        self.segmentation = Input(text=u'')

        # Text editor, framed by two separators and pre-filled with the
        # stored setting.
        gui.separator(widget=self.controlArea, height=3)
        self.editor = QPlainTextEdit()
        stored_text = self.textFieldContent.decode('utf-8')
        self.editor.setPlainText(stored_text)
        self.controlArea.layout().addWidget(self.editor)
        # Any edit marks the settings as changed on the send button.
        self.editor.textChanged.connect(self.sendButton.settingsChanged)
        gui.separator(widget=self.controlArea, height=3)
        self.setMinimumWidth(250)

        self.sendButton.draw()
        self.infoBox.draw()

        self.sendButton.sendIf()
Ejemplo n.º 13
0
    def get_tweets(self, search, nb, include_RT, useKey, keys):
        """Return a list of annotated Inputs, one per tweet found.

        When include_RT is false, tweets whose text starts with 'RT' are
        skipped, three times nb results are requested and the loop stops
        once nb tweets have been kept. keys is only used when useKey is
        true.
        """
        license_key = keys if useKey else None
        twitter = Twitter(
            language=self.dico_lang[self.language],
            license=license_key,
        )

        requested = nb if include_RT else nb * 3
        tweets = list()
        for tweet in twitter.search(search, start=1, count=requested):
            if include_RT or not tweet.text.startswith('RT'):
                tweet_input = Input(tweet.text)
                first_segment = tweet_input[0]
                first_segment.annotations.update({
                    'source': 'Twitter',
                    'author': tweet.author,
                    'date': tweet.date,
                    'url': tweet.url,
                    'search': search,
                })
                tweet_input[0] = first_segment
                tweets.append(tweet_input)
            # Early exit only applies when retweets are being filtered out.
            if not include_RT and len(tweets) == nb:
                break
        return tweets
Ejemplo n.º 14
0
    def get_wiki_article(self,
                         search,
                         separate_in_section=False,
                         type_of_text=u'Plain text'):
        """Return annotated Inputs for the Wikipedia article matching search.

        Returns an empty list when the search yields no article. With
        separate_in_section, one Input is created per section; otherwise a
        single Input holds the whole article. The `.string` attribute is
        used when type_of_text is u'Plain text', `.html` otherwise.
        """
        segments = list()
        article = Wikipedia(language=self.dico_lang[self.language]).search(
            search, cached=False)
        if not article:
            return segments

        as_plain_text = type_of_text == u'Plain text'

        if not separate_in_section:
            wiki_article = Input(
                article.string if as_plain_text else article.html)
            first_segment = wiki_article[0]
            first_segment.annotations.update({
                'source': 'Wikipedia',
                'search': search,
            })
            wiki_article[0] = first_segment
            segments.append(wiki_article)
            return segments

        for section in article.sections:
            wiki_article = Input(
                section.string if as_plain_text else section.html)
            first_segment = wiki_article[0]
            first_segment.annotations.update({
                'source': 'Wikipedia',
                'section title': section.title,
                'section level': section.level,
                'search': search,
            })
            wiki_article[0] = first_segment
            segments.append(wiki_article)
        return segments
Ejemplo n.º 15
0
def main():
    """Print 'ok' or 'fail' for four segment containment checks.

    An Input holding "un texte" and a verbatim tokenization of it are
    compared in every combination: for each pair, the first segment of one
    segmentation is asked which segments of the other it contains, and the
    check is 'ok' when the first contained segment reads 'un texte'.
    """
    input_seg = Input("un texte")

    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )

    def report(description, container, candidates):
        """Print description followed by the outcome of one check."""
        print(description, end=' ')
        contained_segment_idxs = container.get_contained_segment_indices(
            candidates)
        try:
            content = candidates[contained_segment_idxs[0]].get_content()
        except Exception:
            # Typically no contained segment was found. Unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate.
            print("fail")
        else:
            print("ok" if content == 'un texte' else "fail")

    # verbatim in input = ok
    report("verbatim in input:", input_seg[0], verbatim_seg)

    # verbatim in verbatim = ok
    report("verbatim in verbatim:", verbatim_seg[0], verbatim_seg)

    # input in verbatim = fail
    report("input in verbatim:", verbatim_seg[0], input_seg)

    # input in input = fail
    report("input in input:", input_seg[0], input_seg)
Ejemplo n.º 16
0
 def get_bing_entries(self, search, nb):
     """Return a list of annotated Inputs, one per Bing result for search."""
     engine = Bing(language=self.dico_lang[self.language])
     results = engine.search(search, start=1, count=nb, cached=False)

     entries = list()
     for result in results:
         entry_input = Input(result.text)
         first_segment = entry_input[0]
         first_segment.annotations.update({
             'source': 'Bing',
             'title': result.title,
             'url': result.url,
             'search': search,
         })
         entry_input[0] = first_segment
         entries.append(entry_input)
     return entries
Ejemplo n.º 17
0
    def sendData(self):
        """Analyze every input segment with self.nlp and send the result.

        Each input segment becomes one Input containing a line per token
        with the token text and its `pos_` attribute separated by a tab.
        Sends None with a warning when there is no input segmentation.
        """
        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Linguistically analyzed data", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        # Basic NLP analysis for dev purposes...
        analyzedSegments = list()
        for segment in self.inputSeg:
            doc = self.nlp(segment.get_content())
            token_lines = [
                "%s\t%s\n" % (token.text, token.pos_) for token in doc
            ]
            analyzedSegments.append(Input("".join(token_lines)))
            progressBar.advance()

        outputSeg = LTTL.Segmenter.concatenate(
            analyzedSegments,
            import_labels_as=None,
            label=self.captionTitle,
        )

        # Set status to OK and report data size...
        num_segments = len(outputSeg)
        message = pluralize(
            "%i segment@p sent to output." % num_segments,
            num_segments,
        )
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Send data to output...
        self.send("Linguistically analyzed data", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()
Ejemplo n.º 18
0
def main():
    """Print 'ok' or 'fail' for four segment containment checks.

    An Input holding "un texte" and a verbatim tokenization of it are
    compared in every combination using get_contained_segments(); a check
    is 'ok' when the first contained segment reads 'un texte'.

    NOTE(review): this snippet uses Python 2 print statements; the trailing
    commas keep the label and the result on the same output line.
    """

    input_seg = Input("un texte")

    # Tokenize with '.+' so the only token is the whole text.
    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )

    # verbatim in input = ok
    print "verbatim in input:",
    contained_segments = input_seg[0].get_contained_segments(verbatim_seg)
    try:
        print "ok" if contained_segments[0].get_content(
        ) == 'un texte' else "fail"
    # NOTE(review): bare except also hides errors unrelated to an empty
    # result (same for the three checks below).
    except:
        print "fail"

    # verbatim in verbatim = ok
    print "verbatim in verbatim:",
    contained_segments = verbatim_seg[0].get_contained_segments(verbatim_seg)
    try:
        print "ok" if contained_segments[0].get_content(
        ) == 'un texte' else "fail"
    except:
        print "fail"

    # input in verbatim = fail
    print "input in verbatim:",
    contained_segments = verbatim_seg[0].get_contained_segments(input_seg)
    try:
        print "ok" if contained_segments[0].get_content(
        ) == 'un texte' else "fail"
    except:
        print "fail"

    # input in input = fail
    print "input in input:",
    contained_segments = input_seg[0].get_contained_segments(input_seg)
    try:
        print "ok" if contained_segments[0].get_content(
        ) == 'un texte' else "fail"
    except:
        print "fail"
class OWTextableTextField(OWTextableBaseWidget):
    """Orange widget that sends text typed in an editor as a segmentation."""

    name = "Text Field"
    description = "Import text data from keyboard input"
    icon = "icons/TextField.png"
    priority = 1

    # Input and output channels...
    inputs = [('Text data', Segmentation, "inputTextData", widget.Single)]
    outputs = [('Text data', Segmentation)]

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    # Settings ...
    textFieldContent = settings.Setting(u''.encode('utf-8'))
    encoding = settings.Setting(u'utf-8')

    want_main_area = False

    def __init__(self):
        """Set up the text editor, info box and send button of the widget."""
        super().__init__()

        # The info box and send button are created first and drawn last.
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea, master=self,
            callback=self.sendData, infoBoxAttribute='infoBox',
        )

        # LTTL.Input object (token that will be sent).
        self.segmentation = Input(text=u'')

        # Text editor, framed by two separators and pre-filled with the
        # stored setting.
        gui.separator(widget=self.controlArea, height=3)
        self.editor = QPlainTextEdit()
        stored_text = self.textFieldContent.decode('utf-8')
        self.editor.setPlainText(stored_text)
        self.controlArea.layout().addWidget(self.editor)
        # Any edit marks the settings as changed on the send button.
        self.editor.textChanged.connect(self.sendButton.settingsChanged)
        gui.separator(widget=self.controlArea, height=3)
        self.setMinimumWidth(250)

        self.sendButton.draw()
        self.infoBox.draw()

        self.sendButton.sendIf()

    def inputTextData(self, segmentation):
        """Load the joined content of an incoming segmentation in the editor.

        Falsy input (e.g. None) is ignored.
        """
        if not segmentation:
            return
        joined_content = ''.join(
            segment.get_content() for segment in segmentation)
        self.editor.setPlainText(joined_content)
        self.sendButton.settingsChanged()

    def sendData(self):
        """Store the editor text, normalize it and send it as one segment.

        The raw editor text is saved (UTF-8 encoded) in the
        textFieldContent setting; the text that is sent has its line
        endings converted to '\\n' and is NFC-normalized.
        """
        raw_text = self.editor.toPlainText()
        self.textFieldContent = raw_text.encode('utf-8')
        unix_text = raw_text.replace('\r\n', '\n').replace('\r', '\n')
        normalized_text = normalize('NFC', unix_text)

        # Check that text field is not empty...
        if not self.textFieldContent:
            self.infoBox.setText(
                message=u'Please type or paste some text above.',
                state='warning',
            )
            self.send('Text data', None, self)
            return

        # TODO: remove message 'No label was provided.' from docs

        # Set status to OK...
        num_chars = len(normalized_text)
        message = pluralize(
            u'1 segment (%i character@p) sent to output.' % num_chars,
            num_chars,
        )
        self.infoBox.setText(message)

        # Update segmentation.
        self.segmentation.update(normalized_text, label=self.captionTitle)

        # Send token...
        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def setCaption(self, title):
        """Set the caption; flag changed settings if the title differs."""
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)

    def onDeleteWidget(self):
        """Clear the widget's Input object when the widget is deleted."""
        self.segmentation.clear()
        self.segmentation.__del__()
Ejemplo n.º 20
0
            self.contextAnnotationKey = self.contextAnnotationKey

    def handleNewSignals(self):
        """Overridden: called after multiple signals have been added.

        Opens the settings context for the current segmentations, refreshes
        the GUI, then lets the send button decide whether to send.
        """
        self.openContext(self.uuid, self.segmentations)
        self.updateGUI()
        self.sendButton.sendIf()


# Manual demo: feed two segmentations to a Count widget and show it.
if __name__ == '__main__':
    import sys

    from PyQt4.QtGui import QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input

    appl = QApplication(sys.argv)
    ow = OWTextableCount()
    # Two labelled texts, their concatenation, and a word tokenization of it.
    seg1 = Input(u'hello world', label=u'text1')
    seg2 = Input(u'cruel world', label=u'text2')
    seg3 = Segmenter.concatenate([seg1, seg2], label=u'corpus')
    # NOTE(review): the inline flag '(?u)' sits at the end of the pattern;
    # Python 3.11+ only accepts global inline flags at the start - confirm
    # the target interpreter before reusing this pattern.
    seg4 = Segmenter.tokenize(seg3, [(r'\w+(?u)', u'tokenize', {
        'type': 'mot'
    })],
                              label=u'words')
    ow.inputData(seg3, 1)
    ow.inputData(seg4, 2)
    ow.show()
    appl.exec_()
    ow.saveSettings()
Ejemplo n.º 21
0
    def sendData(self):
        """Download the reviews of the films in the basket and send them.

        One Input is created per review and annotated with the 'title' and
        'year' of the film it belongs to. Shows a warning and returns when
        the basket is empty; shows an error and returns when the download
        fails.
        """
        # Skip if title list is empty:
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some movies first",
                "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(self, iterations=len(self.myBasket))

        # Connect to imdb and add elements in lists
        list_review = list()
        list_annotation = list()
        try:
            for item in self.myBasket:
                list_review.append(self.ia.get_movie_reviews(item['id']))
                list_annotation.append(self.ia.get_movie(item['id']))
                # 1 tick on the progress bar of the widget
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from imdb", "error")
            self.controlArea.setDisabled(False)
            return

        # Store each review in an Input and record, at the same position,
        # the annotations of the film it belongs to. Pairing them per
        # review (rather than assuming 25 reviews per film) keeps
        # annotations aligned whatever the number of reviews.
        annotations = list()
        for movie_reviews, movie in zip(list_review, list_annotation):
            # NOTE(review): 'title' receives the whole object returned by
            # get_movie(), as before - confirm whether movie["title"] was
            # intended.
            movie_annotations = {"title": movie, "year": movie["year"]}
            data = movie_reviews.get('data') or {}
            for review in data.get('reviews') or []:
                self.createdInputs.append(Input(review.get('content')))
                annotations.append(movie_annotations)

        # If there's only one item, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = f"{len(self.segmentation)} segment@p sent to output"
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += " (%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send('Segmentation', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Ejemplo n.º 22
0
            message = "%i segment@p sent to output. (ignored %i segments with \
            no content)" % (len(outputSeg), len(self.contentIsNone))
            message = pluralize(message, len(outputSeg))
            self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        
        # Send data to output...
        self.send("CSV Segmentation", outputSeg, self)
        
        self.sendButton.resetSettingsChangedFlag()             

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        """Set the caption; flag changed settings if the title differs."""
        # Before 'captionTitle' exists there is nothing to compare against,
        # so the call is simply forwarded.
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)

            
# Manual preview: run the ExtractCSV widget on a small sample Input.
if __name__ == "__main__":
    from LTTL.Input import Input
    WidgetPreview(ExtractCSV).run(inputData=Input("a simple example"))
Ejemplo n.º 23
0
    def getTitleListFromECP(self):
        """Fetch the title list from the ECP website.

        Returns a segmentation with one segment per work, annotated with
        'author', 'url' and 'title' (plus the 'genre' annotation extracted
        from the enclosing list), or None when the website cannot be
        reached. On success the result is also pickled, best effort, to a
        "cached_title_list_ecp" file next to this module.
        """
        self.infoBox.customMessage(
            "Fetching data from ECP website, please wait")

        # Attempt to connect to ECP...
        try:
            with urllib.request.urlopen(self.base_url) as response:
                base_html = response.read().decode('utf-8')
            self.infoBox.customMessage("Done fetching data from ECP website.")

        # If unable to connect (somehow)...
        except Exception:

            # Set Info box and widget to "warning" state.
            self.infoBox.noDataSent(warning="Couldn't access ECP website.")

            # Empty title list box.
            self.titleLabels = list()

            # Reset output channel.
            self.send("XML-TEI data", None, self)
            return None

        # Otherwise store HTML content in LTTL Input object.
        base_html_seg = Input(base_html)

        # Remove accents from the data...
        recoded_seg, _ = Segmenter.recode(base_html_seg, remove_accents=True)

        # Extract table containing titles...
        genresListSeg = Segmenter.import_xml(
            segmentation=recoded_seg,
            element="ul",
            conditions={"id": re.compile(r"^genres-list")},
        )

        # The patterns below are compiled with re.DOTALL instead of a
        # trailing inline "(?s)": global inline flags anywhere but at the
        # start of a pattern are an error from Python 3.11 on.

        # Extract genre annotation...
        genreSeg = Segmenter.tokenize(
            segmentation=genresListSeg,
            regexes=[(
                re.compile(r'<a id[^>]+>(.+?)</a.+?(?=<a id|$)', re.DOTALL),
                "tokenize",
                {"genre": "&1"},
            )],
            import_annotations=False,
        )

        # Extract works...
        titleSeg = Segmenter.tokenize(
            segmentation=genreSeg,
            regexes=[(
                re.compile(r'<li class="bibl".+?</span>', re.DOTALL),
                "tokenize",
            )],
        )

        # Extract annotations...
        titleSeg = Segmenter.tokenize(
            segmentation=titleSeg,
            regexes=[
                (re.compile(r"^.*>\n(.+?)</span>.*$", re.DOTALL),
                 "tokenize", {
                     "author": "&1"
                 }),
                (re.compile(r'^.*href="(/works/.+?\.shtml)">.*$', re.DOTALL),
                 "tokenize", {
                     "url": "&1"
                 }),
                (re.compile(r'^.*shtml">(.*)</a>.*$', re.DOTALL),
                 "tokenize", {
                     "title": "&1"
                 }),
            ],
            merge_duplicates=True,
        )

        # Try to save list in this module's directory for future reference...
        path = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        try:
            cache_path = os.path.join(path, "cached_title_list_ecp")
            with open(cache_path, "wb") as cache_file:
                pickle.dump(titleSeg, cache_file, -1)
        except IOError:
            # Caching is best effort: an unwritable directory is not an error.
            pass

        # Remove warning (if any)...
        self.error(0)
        self.warning(0)

        return titleSeg
Ejemplo n.º 24
0
    def sendData(self):
        """Download the selected plays and send them as a segmentation.

        Returns without sending when the title list is empty. Sends None
        with a warning when nothing is selected, and None with an error
        when a download fails. Otherwise one Input is created per
        downloaded XML document and annotated with a copy of the
        annotations of the corresponding title segment.
        """
        # Skip if title list is empty:
        if not self.titleLabels:
            return

        # Check that something has been selected...
        if not self.selectedTitles:
            self.infoBox.setText("Please select one or more titles.",
                                 "warning")
            self.send("XML-TEI data", None, self)
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        # Initialize progress bar.
        progressBar = gui.ProgressBar(self,
                                      iterations=len(self.selectedTitles))

        # Attempt to connect to ECP and retrieve plays...
        xml_contents = list()
        annotations = list()
        try:
            for title in self.selectedTitles:
                title_annotations = self.filteredTitleSeg[title].annotations
                doc_url = self.document_base_url + title_annotations["url"]
                # Rewrite ".../name.shtml" into ".../name/name.xml".
                url = re.sub(r"/([^/]+)\.shtml", r"/\1/\1.xml", doc_url)
                with urllib.request.urlopen(url) as response:
                    xml_contents.append(response.read().decode('utf-8'))
                annotations.append(title_annotations.copy())
                progressBar.advance()  # 1 tick on the progress bar...

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from ECP website.",
                                 "error")
            # Reset output channel.
            self.send("XML-TEI data", None, self)
            return

        # Store downloaded XML in input objects...
        for xml_content in xml_contents:
            self.createdInputs.append(Input(xml_content, self.captionTitle))

        # If there's only one play, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update(annotations[idx])
            self.segmentation[idx] = segment

        # Store imported URLs as setting.
        # NOTE(review): only the URL of the first selected title is stored.
        self.importedURLs = [
            self.filteredTitleSeg[self.selectedTitles[0]].annotations["url"]
        ]

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()

        # Send token...
        self.send("XML-TEI data", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Ejemplo n.º 25
0
from LTTL.Input import Input
import LTTL.Segmenter as Segmenter
import re

# Root segmentation wrapping the raw demo string.
input_seg = Input("un texte")

# Tokenize the input into runs of word characters.
word_pattern = re.compile(r'\w+')
word_seg = Segmenter.tokenize(input_seg, [(word_pattern, 'tokenize')])

# Tokenize the same input into single vowels.
vowel_pattern = re.compile(r'[aeiouy]')
vowel_seg = Segmenter.tokenize(input_seg, [(vowel_pattern, 'tokenize')])

# Print the content of each vowel segment contained in the second word.
second_word = word_seg[1]
for vowel in second_word.get_contained_segments(vowel_seg):
    print(vowel.get_content())
Ejemplo n.º 26
0
    def handleNewSignals(self):
        """Overridden: called after multiple signals have been added.

        Calls ``openContext`` with this widget's ``uuid`` and its current
        ``segmentations``, refreshes the GUI through ``updateGUI``, and
        finally calls ``sendButton.sendIf()``.
        """
        self.openContext(self.uuid, self.segmentations)
        self.updateGUI()
        self.sendButton.sendIf()


# Manual demo: builds a small two-text corpus and two tokenizations of it.
if __name__ == '__main__':
    import sys, re
    from PyQt4.QtGui import  QApplication
    import LTTL.Segmenter as Segmenter
    from LTTL.Input import Input

    # A QApplication is created before the widget is instantiated.
    appl = QApplication(sys.argv)
    ow = OWTextableVariety()
    # Two labelled input texts...
    seg1 = Input(u'aabccc', 'text1')
    seg2 = Input(u'abci', 'text2')
    # ...concatenated into one segmentation labelled 'corpus'
    # (import_labels_as='string' is passed to concatenate).
    seg3 = Segmenter.concatenate(
        [seg1, seg2],
        import_labels_as='string',
        label='corpus'
    )
    # Tokenize the corpus into runs of word characters.
    seg4 = Segmenter.tokenize(
        seg3,
        regexes=[(re.compile(r'\w+'), u'tokenize',)],
    )
    # Tokenize those tokens into the characters 'a' and 'i', labelled 'V'.
    seg5 = Segmenter.tokenize(
        seg4,
        regexes=[(re.compile(r'[ai]'), u'tokenize',)],
        label='V'
    )
Ejemplo n.º 27
0
    def clearCreatedInputs(self):
        """Reset the stored data of every created Input and empty the list.

        For each item of ``self.createdInputs`` the data registered under
        the ``str_index`` of its first segment is set to None; the list is
        then emptied in place.
        """
        for createdInput in self.createdInputs:
            firstSegment = createdInput[0]
            Segmentation.set_data(firstSegment.str_index, None)
        del self.createdInputs[:]

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overridden method).

        Delegates to ``clearCreatedInputs``.
        """
        self.clearCreatedInputs()

    def setCaption(self, title):
        """Set the widget caption and flag settings as changed if it differs.

        The comparison with the current ``captionTitle`` is made before the
        parent implementation is called; when the attribute does not exist
        yet, the caption is simply forwarded to the parent.
        """
        titleChanged = (
            'captionTitle' in dir(self) and title != self.captionTitle
        )
        super().setCaption(title)
        if titleChanged:
            self.sendButton.settingsChanged()


# Manual demo: show the widget and run it on a short English sentence.
if __name__ == "__main__":
    import sys
    from PyQt5.QtWidgets import QApplication

    app = QApplication(sys.argv)
    widget = Treetagger()
    widget.show()

    # Feed the widget directly, bypassing the input channel.
    widget.segmentation = Input("My tailor is rich.")
    widget.language = "English"
    widget.sendData()

    app.exec_()
Ejemplo n.º 28
0
    def sendData(self):
        """Compute result of widget processing and send to output.

        For every entry of ``self.myBasket`` the Gutenberg book id is looked
        up in the cache, the text is fetched and stripped of its headers,
        and an Input is created from it.  The output is the single Input, or
        the concatenation of all Inputs, with each segment annotated with a
        ``title`` key taken from the basket entry.

        On any error during retrieval, the info box is set to "error", the
        output channel is reset, and the progress bar and control area are
        restored before returning.
        """
        # Skip if title list is empty:
        if not self.myBasket:
            self.infoBox.setText(
                "Your corpus is empty, please add some books first", "warning")
            return

        # Clear created Inputs.
        self.clearCreatedInputs()

        self.controlArea.setDisabled(True)

        # Initialize progress bar.
        progressBar = ProgressBar(
            self,
            iterations=len(self.myBasket),
        )

        text_content = list()
        annotations = list()
        # get the Gutenberg cache
        cache = GutenbergCache.get_cache()
        try:
            # TODO: Retrieve selected texts from gutenberg
            for text in self.myBasket:

                # Get the id of the text (text[2] is the cache row id).
                query_id = cache.native_query(
                    sql_query=
                    "select gutenbergbookid from books where id == {selected_id}"
                    .format(selected_id=text[2]))
                gutenberg_id = list(query_id)

                # Get the text with Gutenbergpy
                # NOTE(review): confirm whether strip_headers returns bytes
                # or str, and that Input accepts it as is.
                gutenberg_text = gutenbergpy.textget.strip_headers(
                    gutenbergpy.textget.get_text_by_id(gutenberg_id[0][0]))
                text_content.append(gutenberg_text)

                # text[1] is later stored under the "title" annotation key.
                annotations.append(text[1])
                progressBar.advance()

        # If an error occurs (e.g. http error, or memory error)...
        except Exception:
            # Set Info box and widget to "error" state.
            self.infoBox.setText("Couldn't download data from Gutenberg",
                                 "error")
            # Reset output channel: clearCreatedInputs() has already run, so
            # whatever was sent previously must not stay on the output.
            self.send("Gutenberg importation", None, self)
            # Do not leave the progress bar hanging on failure.
            progressBar.finish()
            self.controlArea.setDisabled(False)
            return

        # TODO: send gutenberg texts as output
        # Store downloaded texts in input objects...
        for text in text_content:
            newInput = Input(text, self.captionTitle)
            self.createdInputs.append(newInput)

        # If there's only one text, the widget's output is the created Input.
        if len(self.createdInputs) == 1:
            self.segmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle,
                import_labels_as=None,
            )

        # TODO: annotate with book metadata
        # Annotate segments (the modified segment is written back by index).
        for idx, segment in enumerate(self.segmentation):
            segment.annotations.update({"title": annotations[idx]})
            self.segmentation[idx] = segment

        # Clear progress bar.
        progressBar.finish()

        self.controlArea.setDisabled(False)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += "(%i character@p)." % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send("Gutenberg importation", self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Ejemplo n.º 29
0
    def setUp(self):
        """Build the segmentations, reference tables and outputs under test.

        From the input "un texte" this creates a word segmentation and a
        letter segmentation whose segments carry a 'type' annotation ('C' or
        'V'), then stores on ``self``:

        * hand-written reference ``IntPivotCrosstab`` objects (``*_ref``)
          for co-occurrence in a window of size 3 and in word contexts, with
          and without annotation, with and without a secondary unit;
        * the corresponding outputs of ``Processor.cooc_in_window`` and
          ``Processor.cooc_in_context`` (``output_cooc_*``).
        """
        # Show full diffs when an assertion fails.
        self.maxDiff = None
        input_seg = Input("un texte")
        word_seg = Segmenter.tokenize(
            input_seg,
            [(re.compile(r'\w+'), 'tokenize')],
            import_annotations=False,
        )
        # Both regexes match vowels; duplicates are merged
        # (merge_duplicates=True).
        # NOTE(review): presumably the later 'V' annotation wins on merged
        # vowel segments -- confirm against Segmenter.tokenize.
        letter_seg = Segmenter.tokenize(
            input_seg,
            [
                (re.compile(r'\w'), 'tokenize', {
                    'type': 'C'
                }),
                (re.compile(r'[aeiouy]'), 'tokenize', {
                    'type': 'V'
                }),
            ],
            import_annotations=False,
            merge_duplicates=True,
        )
        # Split letters on the 'type' annotation matching 'V'.
        # NOTE(review): assumes select returns (selected, rest) -- confirm.
        vowel_seg, consonant_seg = Segmenter.select(
            letter_seg,
            re.compile(r'V'),
            annotation_key='type',
        )

        #  Create the cooccurrence matrix for cooccurrence in window
        #  with window_size=3 and without annotation (woa):
        self.window_woa_row_ids = ['u', 'n', 't', 'e', 'x']
        self.window_woa_col_ids = ['u', 'n', 't', 'e', 'x']
        self.window_woa_values = {
            ('u', 'u'): 1,
            ('u', 'n'): 1,
            ('u', 't'): 1,
            ('u', 'e'): 0,
            ('u', 'x'): 0,
            ('n', 'u'): 1,
            ('n', 'n'): 2,
            ('n', 't'): 2,
            ('n', 'e'): 1,
            ('n', 'x'): 0,
            ('t', 'u'): 1,
            ('t', 'n'): 2,
            ('t', 't'): 5,
            ('t', 'e'): 4,
            ('t', 'x'): 3,
            ('e', 'u'): 0,
            ('e', 'n'): 1,
            ('e', 't'): 4,
            ('e', 'e'): 4,
            ('e', 'x'): 3,
            ('x', 'u'): 0,
            ('x', 'n'): 0,
            ('x', 't'): 3,
            ('x', 'e'): 3,
            ('x', 'x'): 3,
        }
        self.window_woa_header_row_id = '__unit__'
        self.window_woa_header_row_type = 'string'
        self.window_woa_header_col_id = '__unit__'
        self.window_woa_header_col_type = 'string'
        self.window_woa_col_type = {
            col_id: 'continuous'
            for col_id in self.window_woa_col_ids
        }
        self.window_woa_ref = IntPivotCrosstab(
            self.window_woa_row_ids,
            self.window_woa_col_ids,
            self.window_woa_values,
            self.window_woa_header_row_id,
            self.window_woa_header_row_type,
            self.window_woa_header_col_id,
            self.window_woa_header_col_type,
            self.window_woa_col_type,
        )
        #  Create the cooccurrence matrix for cooccurrence in window
        #  with window_size=3 and with annotation (wa):
        self.window_wa_row_ids = ['C', 'V']
        self.window_wa_col_ids = ['C', 'V']
        self.window_wa_values = {
            ('C', 'C'): 5,
            ('C', 'V'): 5,
            ('V', 'C'): 5,
            ('V', 'V'): 5,
        }
        self.window_wa_header_row_id = '__unit__'
        self.window_wa_header_row_type = 'string'
        self.window_wa_header_col_id = '__unit__'
        self.window_wa_header_col_type = 'string'
        self.window_wa_col_type = {
            col_id: 'continuous'
            for col_id in self.window_wa_col_ids
        }
        self.window_wa_ref = IntPivotCrosstab(
            self.window_wa_row_ids,
            self.window_wa_col_ids,
            self.window_wa_values,
            self.window_wa_header_row_id,
            self.window_wa_header_row_type,
            self.window_wa_header_col_id,
            self.window_wa_header_col_type,
            self.window_wa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # without the secondary unit (wos) and without annotation (woa):
        self.context_wos_woa_row_ids = ['u', 'n', 't', 'e', 'x']
        self.context_wos_woa_col_ids = ['u', 'n', 't', 'e', 'x']
        self.context_wos_woa_values = {
            ('u', 'u'): 1,
            ('u', 'n'): 1,
            ('u', 't'): 0,
            ('u', 'e'): 0,
            ('u', 'x'): 0,
            ('n', 'u'): 1,
            ('n', 'n'): 1,
            ('n', 't'): 0,
            ('n', 'e'): 0,
            ('n', 'x'): 0,
            ('t', 'u'): 0,
            ('t', 'n'): 0,
            ('t', 't'): 1,
            ('t', 'e'): 1,
            ('t', 'x'): 1,
            ('e', 'u'): 0,
            ('e', 'n'): 0,
            ('e', 't'): 1,
            ('e', 'e'): 1,
            ('e', 'x'): 1,
            ('x', 'u'): 0,
            ('x', 'n'): 0,
            ('x', 't'): 1,
            ('x', 'e'): 1,
            ('x', 'x'): 1,
        }
        self.context_wos_woa_header_row_id = '__context__'
        self.context_wos_woa_header_row_type = 'string'
        self.context_wos_woa_header_col_id = '__context__'
        self.context_wos_woa_header_col_type = 'string'
        self.context_wos_woa_col_type = {
            col_id: 'continuous'
            for col_id in self.context_wos_woa_col_ids
        }
        self.context_wos_woa_ref = IntPivotCrosstab(
            self.context_wos_woa_row_ids,
            self.context_wos_woa_col_ids,
            self.context_wos_woa_values,
            self.context_wos_woa_header_row_id,
            self.context_wos_woa_header_row_type,
            self.context_wos_woa_header_col_id,
            self.context_wos_woa_header_col_type,
            self.context_wos_woa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # without the secondary unit (wos) and with annotation (wa):
        self.context_wos_wa_row_ids = ['V', 'C']
        self.context_wos_wa_col_ids = ['V', 'C']
        self.context_wos_wa_values = {
            ('V', 'V'): 2,
            ('V', 'C'): 2,
            ('C', 'V'): 2,
            ('C', 'C'): 2,
        }
        self.context_wos_wa_header_row_id = '__context__'
        self.context_wos_wa_header_row_type = 'string'
        self.context_wos_wa_header_col_id = '__context__'
        self.context_wos_wa_header_col_type = 'string'
        self.context_wos_wa_col_type = {
            col_id: 'continuous'
            for col_id in self.context_wos_wa_col_ids
        }
        self.context_wos_wa_ref = IntPivotCrosstab(
            self.context_wos_wa_row_ids,
            self.context_wos_wa_col_ids,
            self.context_wos_wa_values,
            self.context_wos_wa_header_row_id,
            self.context_wos_wa_header_row_type,
            self.context_wos_wa_header_col_id,
            self.context_wos_wa_header_col_type,
            self.context_wos_wa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # with the secondary unit (ws) and without annotation (woa):
        self.context_ws_woa_col_ids = ['u', 'e']
        self.context_ws_woa_row_ids = ['n', 't', 'x']
        self.context_ws_woa_values = {
            ('n', 'u'): 1,
            ('n', 'e'): 0,
            ('t', 'u'): 0,
            ('t', 'e'): 1,
            ('x', 'u'): 0,
            ('x', 'e'): 1,
        }
        self.context_ws_woa_header_row_id = '__context__'
        self.context_ws_woa_header_row_type = 'string'
        self.context_ws_woa_header_col_id = '__context__'
        self.context_ws_woa_header_col_type = 'string'
        self.context_ws_woa_col_type = {
            col_id: 'continuous'
            for col_id in self.context_ws_woa_col_ids
        }
        self.context_ws_woa_ref = IntPivotCrosstab(
            self.context_ws_woa_row_ids,
            self.context_ws_woa_col_ids,
            self.context_ws_woa_values,
            self.context_ws_woa_header_row_id,
            self.context_ws_woa_header_row_type,
            self.context_ws_woa_header_col_id,
            self.context_ws_woa_header_col_type,
            self.context_ws_woa_col_type,
        )
        # Create the cooccurrence matrix for cooccurrence in context
        # with the secondary unit (ws) and with annotation (wa):
        self.context_ws_wa_row_ids = ['C']
        self.context_ws_wa_col_ids = ['V']
        self.context_ws_wa_values = {
            ('C', 'V'): 2,
        }
        self.context_ws_wa_header_row_id = '__context__'
        self.context_ws_wa_header_row_type = 'string'
        self.context_ws_wa_header_col_id = '__context__'
        self.context_ws_wa_header_col_type = 'string'
        self.context_ws_wa_col_type = {
            col_id: 'continuous'
            for col_id in self.context_ws_wa_col_ids
        }
        self.context_ws_wa_ref = IntPivotCrosstab(
            self.context_ws_wa_row_ids,
            self.context_ws_wa_col_ids,
            self.context_ws_wa_values,
            self.context_ws_wa_header_row_id,
            self.context_ws_wa_header_row_type,
            self.context_ws_wa_header_col_id,
            self.context_ws_wa_header_col_type,
            self.context_ws_wa_col_type,
        )
        # Compute the actual outputs to be compared with the references
        # above: window-based first, then context-based.
        self.output_cooc_in_window_woa = Processor.cooc_in_window(
            units={'segmentation': letter_seg},
            window_size=3,
        )
        self.output_cooc_in_window_wa = Processor.cooc_in_window(
            units={
                'segmentation': letter_seg,
                'annotation_key': 'type'
            },
            window_size=3,
        )
        self.output_cooc_in_context_wos_woa = Processor.cooc_in_context(
            units={'segmentation': letter_seg},
            contexts={'segmentation': word_seg},
            units2=None,
        )
        self.output_cooc_in_context_wos_wa = Processor.cooc_in_context(
            units={
                'segmentation': letter_seg,
                'annotation_key': 'type'
            },
            contexts={'segmentation': word_seg},
            units2=None,
        )
        # With a secondary unit: vowels as units, consonants as units2.
        self.output_cooc_in_context_ws_woa = Processor.cooc_in_context(
            units={'segmentation': vowel_seg},
            contexts={'segmentation': word_seg},
            units2={'segmentation': consonant_seg},
        )
        self.output_cooc_in_context_ws_wa = Processor.cooc_in_context(
            units={
                'segmentation': vowel_seg,
                'annotation_key': 'type'
            },
            contexts={'segmentation': word_seg},
            units2={
                'segmentation': consonant_seg,
                'annotation_key': 'type'
            },
        )
Ejemplo n.º 30
0
    def sendData(self):
        """Tag the input segmentation with TreeTagger and send the result.

        The input segments are dumped into one string (each wrapped in an
        ``<ax_tt>`` element carrying its annotations as attributes) so that
        TreeTagger is called only once.  The tagger output is re-segmented on
        ``<ax_tt>``, each tagged line is wrapped in a ``<w>`` element with
        ``lemma`` and ``pos-tag`` attributes, and, unless the output format
        is "add XML tags", the result is segmented into individual ``<w>``
        tokens.

        Whenever nothing can be sent, None is sent on "Tagged data" with the
        same id (``self``) as the one used for the actual output.
        """

        # Clear created Inputs...
        self.clearCreatedInputs()

        if not self.TreetaggerPath:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.send("Tagged data", None, self)
            return
        elif not self.getAvailableLanguages():
            self.infoBox.setText(self.noLanguageParameterWarning, "warning")
            self.send("Tagged data", None, self)
            return
        elif not self.segmentation:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Tagged data", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        self.progressBar = ProgressBar(self, iterations=5)

        # Create a copy of input seg, storing annotations in temp attr...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.segmentation.label
        for segment in self.segmentation:
            # Serialize annotations as XML attributes: combining marks are
            # removed from keys (NFD decomposition, category 'Mn' dropped)
            # and values are quoted with quoteattr.
            attr = " ".join([
                "%s=%s" % (
                    ''.join(c for c in unicodedata.normalize('NFD', item[0])
                            if unicodedata.category(c) != 'Mn'),
                    quoteattr(str(item[1])),
                ) for item in segment.annotations.items()
            ])

            segment.annotations["tt_ax"] = attr
            copy_of_input_seg.append(segment)

        self.progressBar.advance()

        # Dump segmentation in unique string to avoid multiple calls to TT...
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
            display_all=True,
        )

        self.progressBar.advance()

        # Tag the segmentation contents...
        tagopt = '-token -lemma -sgml -quiet'
        if self.replaceUnknown:
            tagopt += " -no-unknown"
        tagger = treetaggerwrapper.TreeTagger(
            TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
            TAGOPT=tagopt,
            TAGDIR=self.TreetaggerPath,
        )
        tagged_lines = tagger.tag_text(
            concatenated_text,
            notagurl=True,
            notagemail=True,
            notagip=True,
            notagdns=True,
        )
        # Keep track of the Input so that clearCreatedInputs can release it.
        tagged_input = Input("\n".join(tagged_lines))
        self.createdInputs.append(tagged_input)

        # Replace <unknown> with [unknown] and " with &quot; then
        # re-segment to match the original segmentation structure.
        tagged_segmentation, _ = Segmenter.recode(
            tagged_input,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(r'"""'), '"&quot;"'),
            ],
        )
        tagged_segmentation = Segmenter.import_xml(tagged_segmentation,
                                                   "ax_tt")

        self.progressBar.advance()

        # Place each output line of Treetagger in an xml tag with annotations..
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 '<w lemma="&3" pos-tag="&2">&1</w>'),
                (re.compile(r'^\n|\n$'), ''),
            ],
        )
        # Segment into individual tokens if XML output option is disabled...
        if self.outputFormat == "add XML tags":
            output_segmentation = xml_segmentation
        else:
            try:
                output_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w")
            except ValueError:
                self.infoBox.setText(
                    "Please check that either the input contains well-formed "
                    "XML, or it doesn't contain instances of '&#60;' and '\x3e'",
                    "error")
                self.send("Tagged data", None, self)
                self.progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        self.progressBar.finish()
        self.controlArea.setDisabled(False)

        output_segmentation.label = self.captionTitle
        message = u'%i segment@p sent to output.' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        self.infoBox.setText(message)
        self.send('Tagged data', output_segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Load files, create and send segmentation.

        Each file is read according to the type guessed by
        ``filetype.guess``: raw text when no type is recognised, PDF text
        extraction or OCR for PDFs, OCR for images.  Any other recognised
        type is reported as a file that could not be opened.  One Input is
        created per file and annotated; the output is that Input, or the
        concatenation of all Inputs when there are several files.

        On failure the info box is set to "error", None is sent on
        'Text data', and the progress bar and control area are restored.
        """

        # Check that there's something on input...
        if ((self.displayAdvancedSettings and not self.files)
                or not (self.file or self.displayAdvancedSettings)):
            self.infoBox.setText(u'Please select input file.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if (self.displayAdvancedSettings and self.autoNumber
                and not self.autoNumberKey):
            self.infoBox.setText(
                u'Please enter an annotation key for auto-numbering.',
                'warning')
            self.send('Text data', None, self)
            return

        # Clear created Inputs...
        self.clearCreatedInputs()

        fileContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFiles = self.files
        else:
            # Same 7-field layout as the entries of self.files.
            myFiles = [[self.file, self.encoding, "", "", "", "eng", False]]

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(myFiles))

        # Open and process each file successively...
        for myFile in myFiles:
            filePath = myFile[0]
            # Keep only the encoding name (drop any " (...)" suffix).
            encoding = re.sub(r"[ ]\(.+", "", myFile[1])
            annotation_key = myFile[2]
            annotation_value = myFile[3]
            # myFile[4] (PDF password, SuperTextFiles) is not used here.
            ocr_languages = myFile[5]  # SuperTextFiles
            ocr_force = myFile[6]  # SuperTextFiles

            # Try to open the file...
            self.error()
            # Start SuperTextFiles
            try:
                # Guessing the type reads the file, so it belongs in the
                # try block with the other file accesses.
                myFiletype = filetype.guess(filePath)

                if myFiletype is None:
                    fileContent = self.extract_raw_text(filePath, encoding)

                elif myFiletype.extension == "pdf":
                    # Use the embedded text only when OCR is not forced and
                    # the PDF is textual; otherwise go through OCR.
                    if (ocr_force is not True
                            and self.is_textual_pdf_file(filePath) is True):
                        fileContent = self.extract_text_from_pdf(filePath)
                    else:
                        fileContent = self.get_pdf_content(
                            filePath,
                            ocr_languages,
                        )

                elif myFiletype.extension in IMG_FILETYPES:
                    fileContent = self.ocrize(filePath, ocr_languages)

                else:
                    # Recognised but unsupported type: never reuse the
                    # content of a previous file, report a failure instead.
                    fileContent = -1
            # End SuperTextFiles

            except IOError as e:
                if "tesseract" in str(e):
                    QMessageBox.warning(None, 'Textable', str(e),
                                        QMessageBox.Ok)
                progressBar.finish()
                if len(myFiles) > 1:
                    message = u"Couldn't open file '%s'." % filePath
                else:
                    message = u"Couldn't open file."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # -1 is the failure marker checked for every reader above.
            if fileContent == -1:
                progressBar.finish()
                message = u"Couldn't open file."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # Remove utf-8 BOM if necessary...
            if encoding == u'utf-8':
                fileContent = fileContent.lstrip(
                    codecs.BOM_UTF8.decode('utf-8'))

            # Normalize text (canonical decomposition then composition)...
            fileContent = normalize('NFC', fileContent)

            fileContents.append(fileContent)

            # Annotations...
            annotation = dict()
            if self.displayAdvancedSettings:
                if annotation_key and annotation_value:
                    annotation[annotation_key] = annotation_value
                if self.importFilenames and self.importFilenamesKey:
                    filename = os.path.basename(filePath)
                    annotation[self.importFilenamesKey] = filename
                if self.autoNumber and self.autoNumberKey:
                    annotation[self.autoNumberKey] = counter
                    counter += 1
            annotations.append(annotation)
            progressBar.advance()

        # Create an LTTL.Input for each file; only a single file gets the
        # caption as label (the concatenation is labelled otherwise).
        if len(fileContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for fileContent, annotation in zip(fileContents, annotations):
            myInput = Input(fileContent, label)
            segment = myInput[0]
            segment.annotations.update(annotation)
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(fileContents) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Ejemplo n.º 32
0
    def setUp(self):
        """Create the Input objects and hand-built segmentations for tests.

        All segmentations down to ``overlapping_seg`` refer to the string
        'ab cde' through the ``str_index`` of ``entire_text_seg``; the later
        ones refer to their own Input, as noted below.
        """
        self.entire_text_seg = Input('ab cde')
        str_index = self.entire_text_seg[0].str_index
        # The two words 'ab' and 'cde'; only the first one is annotated.
        self.word_seg = Segmentation([
            Segment(str_index=str_index,
                    start=0,
                    end=2,
                    annotations={'a': '1'}),
            Segment(str_index=str_index, start=3, end=6)
        ])
        # One segment per character, including the space at position 2.
        self.char_seg = Segmentation([
            Segment(str_index=str_index, start=0, end=1),
            Segment(str_index=str_index, start=1, end=2),
            Segment(str_index=str_index, start=2, end=3),
            Segment(str_index=str_index, start=3, end=4),
            Segment(str_index=str_index, start=4, end=5),
            Segment(str_index=str_index, start=5, end=6),
        ])
        # Letters of the first word.
        self.letter_seg1 = Segmentation([
            Segment(str_index=str_index,
                    start=0,
                    end=1,
                    annotations={'a': '1'}),
            Segment(str_index=str_index, start=1, end=2),
        ])
        # Letters of the second word.
        self.letter_seg2 = Segmentation([
            Segment(str_index=str_index, start=3, end=4),
            Segment(str_index=str_index,
                    start=4,
                    end=5,
                    annotations={'b': '2'}),
            Segment(str_index=str_index, start=5, end=6),
        ])
        # Same segments as letter_seg1 followed by letter_seg2 (no space).
        self.letter_seg = Segmentation([
            Segment(str_index=str_index,
                    start=0,
                    end=1,
                    annotations={'a': '1'}),
            Segment(str_index=str_index, start=1, end=2),
            Segment(str_index=str_index, start=3, end=4),
            Segment(str_index=str_index,
                    start=4,
                    end=5,
                    annotations={'b': '2'}),
            Segment(str_index=str_index, start=5, end=6),
        ])
        self.single_letter_seg = Segmentation([
            Segment(str_index=str_index,
                    start=4,
                    end=5,
                    annotations={'b': '1'}),
        ])
        # Two segments with identical address.
        self.duplicate_seg = Segmentation([
            Segment(str_index=str_index, start=0, end=1),
            Segment(str_index=str_index, start=0, end=1),
        ])
        # Two segments sharing position 4.
        self.overlapping_seg = Segmentation([
            Segment(str_index=str_index, start=3, end=5),
            Segment(str_index=str_index, start=4, end=6),
        ])

        # Second string: every letter carries an 'a' annotation.
        self.other_entire_text_seg = Input('abbccc')
        str_index2 = self.other_entire_text_seg[0].str_index
        self.other_letter_seg = Segmentation([
            Segment(str_index=str_index2,
                    start=0,
                    end=1,
                    annotations={'a': '1'}),
            Segment(str_index=str_index2,
                    start=1,
                    end=2,
                    annotations={'a': '1'}),
            Segment(str_index=str_index2,
                    start=2,
                    end=3,
                    annotations={'a': '1'}),
            Segment(str_index=str_index2,
                    start=3,
                    end=4,
                    annotations={'a': '2'}),
            Segment(str_index=str_index2,
                    start=4,
                    end=5,
                    annotations={'a': '2'}),
            Segment(str_index=str_index2,
                    start=5,
                    end=6,
                    annotations={'a': '3'}),
        ])

        # Third string: the first segment has no annotation.
        self.third_entire_text_seg = Input('bd1')
        str_index3 = self.third_entire_text_seg[0].str_index
        self.third_letter_seg = Segmentation([
            Segment(str_index=str_index3, start=0, end=1),
            Segment(str_index=str_index3,
                    start=1,
                    end=2,
                    annotations={'a': '2'}),
            Segment(str_index=str_index3,
                    start=2,
                    end=3,
                    annotations={'a': 'b'}),
        ])

        # Fourth string, with upper case and a non-ASCII character.
        # NOTE: str_index is rebound here; the segmentations built above
        # keep the index of 'ab cde'.
        self.fourth_entire_text_seg = Input('AB cd\xe9')
        str_index = self.fourth_entire_text_seg[0].str_index
        self.second_word_seg = Segmentation([
            Segment(str_index=str_index, start=0, end=2),
            Segment(str_index=str_index, start=3, end=6),
        ])

        # XML inputs: one with nested elements, two with unbalanced tags.
        self.xml_seg = Input('<a attr="1"><a attr="2/3/">c<a/>d</a></a>')
        self.wrong_xml_seg = Input('<a><a>test</a>')
        self.wrong_xml_seg2 = Input('<a>test</a></a>')

        # XML split over two Inputs.
        # NOTE: str_index3 is rebound here; third_letter_seg keeps the
        # index of 'bd1'.
        self.part_xml_seg = Input('<a>1<a>2<a>3</a>4')
        str_index3 = self.part_xml_seg[0].str_index
        self.part_xml_seg2 = Input('</a>5</a>')
        str_index4 = self.part_xml_seg2[0].str_index
        # Segments are built without start/end.
        # NOTE(review): presumably they then cover the whole string --
        # confirm against Segment.
        self.broken_xml_seg = Segmentation([
            Segment(str_index=str_index3, annotations={'a': '1'}),
            Segment(str_index=str_index4),
        ])

        self.count = 0