class SuperTextFiles(OWTextableBaseWidget):
    """Textable widget to import raw text, PDF and image files, applying an
    Optical Character Recognition (OCR) pass when necessary"""

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Super Text Files"
    description = "Import data from raw text and PDF files"
    icon = "icons/SuperTextFiles.svg"
    priority = 1  # TODO

    #----------------------------------------------------------------------
    # Channel definitions....

    # A single JSON message may be received (handled by inputMessage) and a
    # single segmentation is emitted on the 'Text data' channel.
    inputs = [('Message', JSONMessage, "inputMessage", widget.Single)]
    outputs = [('Text data', Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    # The settings version is __version__ without its last dot-separated
    # component.
    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    # File list used when advanced settings are displayed. Each entry holds,
    # in order: path, encoding, annotation key, annotation value,
    # PDF password, OCR language(s), force-OCR flag.
    files = settings.Setting([])
    # Encoding selected in the GUI ('(auto-detect)' triggers detection).
    encoding = settings.Setting('(auto-detect)')
    # Auto-numbering option and the annotation key it uses.
    autoNumber = settings.Setting(False)
    autoNumberKey = settings.Setting(u'num')
    # File name import option and the annotation key it uses.
    importFilenames = settings.Setting(True)
    importFilenamesKey = settings.Setting(u'filename')
    # Directory initially shown by the file dialogs.
    lastLocation = settings.Setting('.')
    # Whether the advanced (multi-file) interface is displayed.
    displayAdvancedSettings = settings.Setting(False)
    # Single file path used when advanced settings are hidden.
    file = settings.Setting(u'')

    def __init__(self, *args, **kwargs):
        """Initialize the widget's non-persistent state and build its GUI.

        The control area contains, from top to bottom: the advanced settings
        checkbox, the basic "Source" box (single file), the advanced
        "Sources" box (file list and "add file" form), the "Options" box,
        the send button and the info box.

        All positional and keyword arguments are forwarded unchanged to the
        parent class constructor.
        """
        super().__init__(*args, **kwargs)

        # Other attributes...
        self.segmentation = None
        self.createdInputs = list()
        self.fileLabels = list()
        self.selectedFileLabels = list()
        self.newFiles = u''
        self.newAnnotationKey = u''
        self.newAnnotationValue = u''
        self.pdfPassword = u''  # SuperTextFiles
        self.ocrForce = False  # SuperTextFiles
        self.ocrLanguages = u'eng'  # SuperTextFiles
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute='infoBox',
            sendIfPreCallback=self.updateGUI,
        )
        self.advancedSettings = AdvancedSettings(
            widget=self.controlArea,
            master=self,
            callback=self.sendButton.settingsChanged,
        )

        # GUI...

        # Advanced settings checkbox...
        self.advancedSettings.draw()

        # BASIC GUI...

        # Basic file box
        basicFileBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Source',
            orientation='vertical',
            addSpace=False,
        )
        basicFileBoxLine1 = gui.widgetBox(
            widget=basicFileBox,
            box=False,
            orientation='horizontal',
        )
        gui.lineEdit(
            widget=basicFileBoxLine1,
            master=self,
            value='file',
            orientation='horizontal',
            label=u'File path:',
            labelWidth=101,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"The path of the file."),
        )
        gui.separator(widget=basicFileBoxLine1, width=5)
        gui.button(
            widget=basicFileBoxLine1,
            master=self,
            label=u'Browse',
            callback=self.browse,
            tooltip=(u"Open a dialog for selecting file."),
        )
        gui.separator(widget=basicFileBox, width=3)
        # Encoding selector of the basic interface (changes are sent
        # through the send button's settingsChanged mechanism).
        basicEncodingsCombobox = gui.comboBox(
            widget=basicFileBox,
            master=self,
            value='encoding',
            items=getPredefinedEncodings(),
            sendSelectedValue=True,
            orientation='horizontal',
            label=u'Encoding:',
            labelWidth=101,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Select input file(s) encoding."),
        )
        addSeparatorAfterDefaultEncodings(basicEncodingsCombobox)
        addAutoDetectEncoding(basicEncodingsCombobox)
        gui.separator(widget=basicFileBox, width=3)
        self.advancedSettings.basicWidgets.append(basicFileBox)
        self.advancedSettings.basicWidgetsAppendSeparator()

        # ADVANCED GUI...

        defaultLabelWidth = 120  # SuperTextFiles

        # File box
        fileBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Sources',
            orientation='vertical',
            addSpace=False,
        )
        fileBoxLine1 = gui.widgetBox(
            widget=fileBox,
            box=False,
            orientation='horizontal',
            addSpace=True,
        )
        self.fileListbox = gui.listBox(
            widget=fileBoxLine1,
            master=self,
            value='selectedFileLabels',
            labels='fileLabels',
            callback=self.updateFileBoxButtons,
            tooltip=(
                u"The list of files whose content will be imported.\n"
                u"\nIn the output segmentation, the content of each\n"
                u"file appears in the same position as in the list.\n"
                u"\nColumn 1 shows the file's name.\n"
                u"Column 2 shows the file's annotation (if any).\n"
                # Start SuperTextFiles
                # u"Column 3 shows the file's encoding." # removed
                u"Column 3 shows the file's password (if any).\n"
                u"Column 4 shows the file's languages (if any).\n"
                u"Column 5 shows if OCR is forced.\n"
                u"Column 6 shows the file's encoding."
                # End SuperTextFiles
            ),
        )
        # Fixed-width font for the file list box.
        font = QFont()
        font.setFamily('Courier')
        font.setStyleHint(QFont.Courier)
        font.setPixelSize(12)
        self.fileListbox.setFont(font)
        fileBoxCol2 = gui.widgetBox(
            widget=fileBoxLine1,
            orientation='vertical',
        )
        self.moveUpButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Move Up',
            callback=self.moveUp,
            tooltip=(u"Move the selected file upward in the list."),
        )
        self.moveDownButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Move Down',
            callback=self.moveDown,
            tooltip=(u"Move the selected file downward in the list."),
        )
        self.removeButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Remove',
            callback=self.remove,
            tooltip=(u"Remove the selected file from the list."),
        )
        self.clearAllButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Clear All',
            callback=self.clearAll,
            tooltip=(u"Remove all files from the list."),
        )
        self.exportButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Export List',
            callback=self.exportList,
            tooltip=(u"Open a dialog for selecting a file where the file\n"
                     u"list can be exported in JSON format."),
        )
        self.importButton = gui.button(
            widget=fileBoxCol2,
            master=self,
            label=u'Import List',
            callback=self.importList,
            tooltip=(u"Open a dialog for selecting a file list to\n"
                     u"import (in JSON format). Files from this list\n"
                     u"will be added to those already imported."),
        )
        fileBoxLine2 = gui.widgetBox(
            widget=fileBox,
            box=False,
            orientation='vertical',
        )
        # Add file box
        addFileBox = gui.widgetBox(
            widget=fileBoxLine2,
            box=True,
            orientation='vertical',
        )
        addFileBoxLine1 = gui.widgetBox(
            widget=addFileBox,
            orientation='horizontal',
        )
        gui.lineEdit(
            widget=addFileBoxLine1,
            master=self,
            value='newFiles',
            orientation='horizontal',
            label=u'File path(s):',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"The paths of the files that will be added to the\n"
                     u"list when button 'Add' is clicked.\n\n"
                     u"Successive paths must be separated with ' / ' \n"
                     u"(whitespace + slash + whitespace). Their order in\n"
                     u"the list will be the same as in this field."),
        )
        gui.separator(widget=addFileBoxLine1, width=5)
        gui.button(
            widget=addFileBoxLine1,
            master=self,
            label=u'Browse',
            callback=self.browse,
            tooltip=(u"Open a dialog for selecting files.\n\n"
                     u"To select multiple files at once, either draw a\n"
                     u"selection box around them, or use shift and/or\n"
                     u"ctrl + click.\n\n"
                     u"Selected file paths will appear in the field to\n"
                     u"the left of this button afterwards, ready to be\n"
                     u"added to the list when button 'Add' is clicked."),
        )
        gui.separator(widget=addFileBox, width=3)
        # Encoding selector of the advanced interface (only refreshes the
        # GUI; the encoding is stored with each file when 'Add' is clicked).
        advancedEncodingsCombobox = gui.comboBox(
            widget=addFileBox,
            master=self,
            value='encoding',
            items=getPredefinedEncodings(),
            sendSelectedValue=True,
            orientation='horizontal',
            label=u'Encoding:',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"Select input file(s) encoding."),
        )
        addSeparatorAfterDefaultEncodings(advancedEncodingsCombobox)
        addAutoDetectEncoding(advancedEncodingsCombobox)
        # NOTE(review): presumably re-assigning the attribute makes both
        # combo boxes resync with the stored value now that the extra
        # items have been added -- confirm before removing.
        self.encoding = self.encoding
        gui.separator(widget=addFileBox, width=3)
        gui.lineEdit(
            widget=addFileBox,
            master=self,
            value='newAnnotationKey',
            orientation='horizontal',
            label=u'Annotation key:',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"This field lets you specify a custom annotation\n"
                     u"key associated with each file that is about to be\n"
                     u"added to the list."),
        )
        gui.separator(widget=addFileBox, width=3)
        gui.lineEdit(
            widget=addFileBox,
            master=self,
            value='newAnnotationValue',
            orientation='horizontal',
            label=u'Annotation value:',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"This field lets you specify the annotation value\n"
                     u"associated with the above annotation key."),
        )

        ### Start SuperTextFiles addition
        gui.separator(widget=addFileBox, width=3)
        # Field for PDF password
        gui.lineEdit(
            widget=addFileBox,
            master=self,
            value='pdfPassword',
            orientation='horizontal',
            label=u'PDF password:',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"This field lets you specify the password of the\n"
                     u"PDF file(s) about to be added (if any)."),
        )
        gui.separator(widget=addFileBox, width=3)
        # Field for OCR languages
        gui.lineEdit(
            widget=addFileBox,
            master=self,
            value='ocrLanguages',
            orientation='horizontal',
            label=u'OCR Language(s):',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"This field lets you specify languages\n"
                     u"for the OCR process. Ex.: fra+ita"),
        )

        gui.checkBox(
            widget=addFileBox,
            master=self,
            value='ocrForce',
            label=u'Force OCR',
            labelWidth=defaultLabelWidth,
            callback=self.updateGUI,
            tooltip=(u"Force to use an OCR detection on this file"),
        )
        ### End SuperTextFiles addition

        gui.separator(widget=addFileBox, width=3)
        self.addButton = gui.button(
            widget=addFileBox,
            master=self,
            label=u'Add',
            callback=self.add,
            tooltip=(u"Add the file(s) currently displayed in the\n"
                     u"'Files' text field to the list.\n\n"
                     u"Each of these files will be associated with the\n"
                     u"specified encoding and annotation (if any).\n\n"
                     u"Other files may be selected afterwards and\n"
                     u"assigned a different encoding and annotation."),
        )
        self.advancedSettings.advancedWidgets.append(fileBox)
        self.advancedSettings.advancedWidgetsAppendSeparator()

        # Options box...
        optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Options',
            orientation='vertical',
            addSpace=False,
        )
        optionsBoxLine1 = gui.widgetBox(
            widget=optionsBox,
            box=False,
            orientation='horizontal',
        )
        gui.checkBox(
            widget=optionsBoxLine1,
            master=self,
            value='importFilenames',
            label=u'Import file names with key:',
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Import file names as annotations."),
        )
        self.importFilenamesKeyLineEdit = gui.lineEdit(
            widget=optionsBoxLine1,
            master=self,
            value='importFilenamesKey',
            orientation='horizontal',
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Annotation key for importing file names."),
        )
        gui.separator(widget=optionsBox, width=3)
        optionsBoxLine2 = gui.widgetBox(
            widget=optionsBox,
            box=False,
            orientation='horizontal',
        )
        gui.checkBox(
            widget=optionsBoxLine2,
            master=self,
            value='autoNumber',
            label=u'Auto-number with key:',
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Annotate files with increasing numeric indices."),
        )
        self.autoNumberKeyLineEdit = gui.lineEdit(
            widget=optionsBoxLine2,
            master=self,
            value='autoNumberKey',
            orientation='horizontal',
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Annotation key for file auto-numbering."),
        )
        gui.separator(widget=optionsBox, width=3)
        self.advancedSettings.advancedWidgets.append(optionsBox)
        self.advancedSettings.advancedWidgetsAppendSeparator()

        gui.rubber(self.controlArea)

        # Send button...
        self.sendButton.draw()

        # Info box...
        self.infoBox.draw()

        self.adjustSizeWithTimer()
        # Defer the first (conditional) send until the event loop runs.
        QTimer.singleShot(0, self.sendButton.sendIf)

    def inputMessage(self, message):
        """Replace the file list with the one described by a JSON message.

        The message content is expected to decode to a sequence of objects
        providing at least non-empty 'path', 'encoding' and 'ocr_force'
        values; 'annotation_key', 'annotation_value', 'pdf_password' and
        'ocr_languages' are optional. An empty/falsy message is ignored.
        On invalid content an error is displayed and None is sent.
        """
        if not message:
            return

        # Incoming lists are always handled by the advanced interface.
        self.displayAdvancedSettings = True
        self.advancedSettings.setVisible(True)
        self.clearAll()
        self.infoBox.inputChanged()

        try:
            decoded = json.loads(message.content)
            collected = []
            for item in decoded:
                record = (
                    item.get('path', ''),
                    item.get('encoding', ''),
                    item.get('annotation_key', ''),
                    item.get('annotation_value', ''),
                    item.get('pdf_password', ''),  # SuperTextFiles
                    item.get('ocr_languages', ''),  # SuperTextFiles
                    item.get('ocr_force', ''),  # SuperTextFiles
                )
                # Path, encoding and force-OCR flag are mandatory.
                mandatory = (record[0], record[1], record[6])
                if any(value == '' for value in mandatory):
                    self.infoBox.setText(
                        u"Please verify keys and values of incoming "
                        u"JSON message.", 'error')
                    self.send('Text data', None, self)
                    return
                collected.append(record)
            self.files.extend(collected)
            self.sendButton.settingsChanged()
        except ValueError:
            self.infoBox.setText(
                u"Please make sure that incoming message is valid JSON.",
                'error')
            self.send('Text data', None, self)

    def sendData(self):
        """Load files, create and send segmentation

        Each file is read according to its detected type: files whose type
        cannot be guessed (or is neither PDF nor a supported image type) are
        read as raw text, PDF files are either text-extracted or OCRized,
        and image files are OCRized. One LTTL Input is created per file; the
        output is that Input when there is a single file, or the
        concatenation of all Inputs otherwise.

        On any failure an error is shown in the info box, None is sent on
        the 'Text data' channel and the control area is re-enabled.
        """

        # Check that there's something on input...
        if ((self.displayAdvancedSettings and not self.files)
                or not (self.file or self.displayAdvancedSettings)):
            self.infoBox.setText(u'Please select input file.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if (self.displayAdvancedSettings and self.autoNumber
                and not self.autoNumberKey):
            self.infoBox.setText(
                u'Please enter an annotation key for auto-numbering.',
                'warning')
            self.send('Text data', None, self)
            return

        # Clear created Inputs...
        self.clearCreatedInputs()

        fileContents = list()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFiles = self.files
        else:
            # Basic mode: a single file with default SuperTextFiles options.
            myFiles = [[self.file, self.encoding, "", "", "", "eng", False]]

        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(myFiles))

        # Open and process each file successively...
        for myFile in myFiles:
            filePath = myFile[0]
            # Strip anything following " (" from the encoding label.
            encoding = re.sub(r"[ ]\(.+", "", myFile[1])
            annotation_key = myFile[2]
            annotation_value = myFile[3]
            # NOTE(review): myFile[4] (the per-file PDF password) is not
            # forwarded to the PDF helpers, which read self.pdfPassword
            # instead -- confirm whether this is intended.
            ocr_languages = myFile[5]  # SuperTextFiles
            ocr_force = myFile[6]  # SuperTextFiles

            # Try to open the file...
            self.error()
            # Start SuperTextFiles
            try:
                # Type detection reads the file, so it must be guarded too:
                # a missing/unreadable file is reported like any other
                # opening failure.
                myFiletype = filetype.guess(filePath)

                if myFiletype is None:
                    fileContent = self.extract_raw_text(filePath, encoding)

                elif myFiletype.extension == "pdf":
                    if (ocr_force is True
                            or not self.is_textual_pdf_file(filePath)):
                        fileContent = self.get_pdf_content(
                            filePath,
                            ocr_languages,
                        )
                    else:
                        fileContent = self.extract_text_from_pdf(filePath)

                elif myFiletype.extension in IMG_FILETYPES:
                    fileContent = self.ocrize(filePath, ocr_languages)

                else:
                    # Recognized but unsupported type: fall back to raw
                    # text rather than leaving fileContent undefined (or
                    # reusing the previous file's content).
                    fileContent = self.extract_raw_text(filePath, encoding)
            # End SuperTextFiles

            except IOError as e:
                if "tesseract" in str(e):
                    QMessageBox.warning(None, 'Textable', str(e),
                                        QMessageBox.Ok)
                progressBar.finish()
                if len(myFiles) > 1:
                    message = u"Couldn't open file '%s'." % filePath
                else:
                    message = u"Couldn't open file."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # -1 signals an OCR failure; None signals that the helper could
            # not produce any text (it reports the problem itself).
            if fileContent is None or fileContent == -1:
                progressBar.finish()
                if fileContent is not None:
                    self.infoBox.setText(u"Couldn't open file.", 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return

            # Remove utf-8 BOM if necessary...
            if encoding == u'utf-8':
                fileContent = fileContent.lstrip(
                    codecs.BOM_UTF8.decode('utf-8'))

            # Normalize text (canonical decomposition then composition)...
            fileContent = normalize('NFC', fileContent)

            fileContents.append(fileContent)

            # Annotations...
            annotation = dict()
            if self.displayAdvancedSettings:
                if annotation_key and annotation_value:
                    annotation[annotation_key] = annotation_value
                if self.importFilenames and self.importFilenamesKey:
                    filename = os.path.basename(filePath)
                    annotation[self.importFilenamesKey] = filename
                if self.autoNumber and self.autoNumberKey:
                    annotation[self.autoNumberKey] = counter
                    counter += 1
            annotations.append(annotation)
            progressBar.advance()

        # Create an LTTL.Input for each file...
        if len(fileContents) == 1:
            label = self.captionTitle
        else:
            label = None
        for fileContent, annotation in zip(fileContents, annotations):
            myInput = Input(fileContent, label)
            segment = myInput[0]
            segment.annotations.update(annotation)
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(fileContents) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def extract_raw_text(self, filePath, encoding):
        """Return the text content of a file decoded with a given encoding.

        If encoding is "(auto-detect)", the encoding is first guessed by
        feeding the file line by line to a UniversalDetector. The file is
        then read in chunks of CHUNK_LENGTH characters; within each chunk,
        line breaks are normalized to "\\n".

        Returns the file content as a string, or None if the content could
        not be decoded with the selected encoding (in which case an error is
        displayed, None is sent on the 'Text data' channel and the control
        area is re-enabled). Errors raised while opening the file (IOError)
        are propagated to the caller.
        """
        if encoding == "(auto-detect)":
            detector = UniversalDetector()
            with open(filePath, 'rb') as fh:
                for line in fh:
                    detector.feed(line)
                    if detector.done:
                        break
            detector.close()
            encoding = detector.result['encoding']

        try:
            # Plain 'r' already enables universal newlines; the former 'rU'
            # mode is rejected by Python 3.11+.
            with open(filePath, mode='r', encoding=encoding) as fh:
                chunks = [
                    '\n'.join(chunk.splitlines())
                    for chunk in iter(lambda: fh.read(CHUNK_LENGTH), "")
                ]
            return "".join(chunks)
        except (UnicodeError, LookupError):
            # Decoding failed or the encoding name is unknown. The caller's
            # local state is not reachable from here, so the number of
            # processed files is derived from the widget's own settings.
            if self.displayAdvancedSettings and len(self.files) > 1:
                message = u"Please select another encoding "    \
                        + u"for file %s." % filePath
            else:
                message = u"Please select another encoding."
            self.infoBox.setText(message, 'error')
            self.send('Text data', None, self)
            self.controlArea.setDisabled(False)
            return None

    def is_textual_pdf_file(self, filePath):
        """Tell whether the first page of a PDF file contains readable text.

        The file is opened with the password currently stored in
        self.pdfPassword. Returns True if text extraction on the first page
        yields at least one non-whitespace character, and False otherwise
        (including when the document has no page, or when extraction yields
        None or an empty string).
        """
        with pdfplumber.open(filePath, password=self.pdfPassword) as fh:
            if not fh.pages:
                return False
            text = fh.pages[0].extract_text()

        # "".isspace() is False, so emptiness must be tested explicitly.
        return bool(text) and not text.isspace()

    def extract_text_from_pdf(self, filePath):
        """Return the concatenated text of all pages of a PDF file.

        The file is opened with the password currently stored in
        self.pdfPassword. Pages for which text extraction yields nothing
        (None or an empty string) contribute no text.
        """
        with pdfplumber.open(filePath, password=self.pdfPassword) as fh:
            return "".join(page.extract_text() or "" for page in fh.pages)

    def get_pdf_content(self, filePath, languages):
        """Return the text of a PDF file, completed by OCR of its images.

        The text layer of every page is collected first; every image listed
        for the pages is then passed to ocrize() with the given languages
        and the recognized text is appended. Returns -1 as soon as ocrize()
        reports a failure.
        """
        collected = ""
        with fitz.open(filePath) as pdf:
            embedded = []
            for sheet in pdf:
                collected += sheet.getText("text")
                embedded.extend(pdf.getPageImageList(sheet.number))

            for entry in embedded:
                # The first item of an image entry is its xref.
                pixmap = fitz.Pixmap(pdf, entry[0])

                if pixmap.n > 4:  # CMYK colorspace
                    # convert to RGB
                    pixmap = fitz.Pixmap(fitz.csRGB, pixmap)

                recognized = self.ocrize(
                    BytesIO(pixmap.getImageData()),
                    languages,
                )

                if recognized == -1:
                    return -1
                if recognized:
                    collected += recognized

        return collected

    def ocrize(self, image, languages):
        """Run OCR on an image (anything Image.open accepts) and return the
        recognized text, or -1 if Tesseract raised an error.

        Surrounding whitespace is stripped from languages; "eng" is used
        when nothing remains.
        """
        lang = languages.strip() or "eng"
        try:
            return image_to_string(Image.open(image), lang=lang)
        except TesseractError as err:
            # Only "load" errors are reported in a dialog; every Tesseract
            # error results in -1.
            if "load" in str(err):
                QMessageBox.warning(
                    None, 'Textable',
                    "Please make sure all Tesseract parameter files for "
                    "language(s) '%s' have been installed." % lang,
                    QMessageBox.Ok)
            return -1

    def clearCreatedInputs(self):
        """Reset the stored data of every created Input, then empty the
        list of created Inputs in place."""
        for createdInput in self.createdInputs:
            Segmentation.set_data(createdInput[0].str_index, None)
        self.createdInputs.clear()

    def importList(self):
        """Display a FileDialog and import file list

        The selected file must contain a UTF-8 encoded JSON list of objects,
        each providing at least non-empty 'path', 'encoding' and 'ocr_force'
        values ('annotation_key', 'annotation_value', 'pdf_password' and
        'ocr_languages' are optional). Valid entries are appended to the
        current file list; on any problem a warning dialog is shown and the
        file list is left untouched.
        """
        filePath, _ = QFileDialog.getOpenFileName(self, u'Import File List',
                                                  self.lastLocation,
                                                  u'Text files (*)')
        if not filePath:
            return
        # NOTE(review): this stores the path of the JSON list in the
        # single-file setting used by the basic interface -- confirm that
        # this is intended.
        self.file = os.path.normpath(filePath)
        self.lastLocation = os.path.dirname(filePath)
        self.error()

        # Read the file; the handle is closed even if reading fails, and
        # content that is not valid UTF-8 is reported like an I/O problem.
        try:
            with codecs.open(filePath, encoding='utf8') as fileHandle:
                fileContent = fileHandle.read()
        except (IOError, UnicodeError):
            QMessageBox.warning(None, 'Textable', "Couldn't open file.",
                                QMessageBox.Ok)
            return

        try:
            json_data = json.loads(fileContent)
        except ValueError:
            QMessageBox.warning(None, 'Textable', "JSON parsing error.",
                                QMessageBox.Ok)
            return

        # Anything but a list of objects cannot describe a file list.
        wellFormed = isinstance(json_data, list)
        temp_files = list()
        if wellFormed:
            for entry in json_data:
                if not isinstance(entry, dict):
                    wellFormed = False
                    break
                path = entry.get('path', '')
                encoding = entry.get('encoding', '')
                annotationKey = entry.get('annotation_key', '')
                annotationValue = entry.get('annotation_value', '')
                pdfPassword = entry.get('pdf_password', '')  # SuperTextFiles
                ocrLanguages = entry.get('ocr_languages', '')  # SuperTextFiles
                ocrForce = entry.get('ocr_force', '')  # SuperTextFiles

                if path == '' or encoding == '' or ocrForce == '':
                    wellFormed = False
                    break
                temp_files.append((
                    path,
                    encoding,
                    annotationKey,
                    annotationValue,
                    pdfPassword,  # SuperTextFiles
                    ocrLanguages,  # SuperTextFiles
                    ocrForce,  # SuperTextFiles
                ))

        if not wellFormed:
            QMessageBox.warning(
                None, 'Textable',
                "Selected JSON file doesn't have the right keys "
                "and/or values.", QMessageBox.Ok)
            return

        self.files.extend(temp_files)
        if temp_files:
            self.sendButton.settingsChanged()

    def exportList(self):
        """Display a FileDialog and export file list

        The file list is serialized as a JSON list of objects. 'path',
        'encoding' and 'ocr_force' are always written; the annotation
        key/value pair, 'pdf_password' and 'ocr_languages' are written only
        when non-empty. Nothing is written if the dialog is cancelled.
        """
        toDump = list()
        for myfile in self.files:
            entry = {
                'path': myfile[0],
                'encoding': myfile[1],
            }
            # An annotation is exported only if both key and value are set.
            if myfile[2] and myfile[3]:
                entry['annotation_key'] = myfile[2]
                entry['annotation_value'] = myfile[3]
            # Start SuperTextFiles
            if myfile[4]:
                entry['pdf_password'] = myfile[4]

            if myfile[5]:
                entry['ocr_languages'] = myfile[5]

            entry['ocr_force'] = myfile[6]
            # End SuperTextFiles
            toDump.append(entry)

        filePath, _ = QFileDialog.getSaveFileName(
            self,
            u'Export File List',
            self.lastLocation,
        )

        if filePath:
            self.lastLocation = os.path.dirname(filePath)
            # The context manager closes the file even if writing fails.
            with codecs.open(
                filePath,
                encoding='utf8',
                mode='w',
                errors='xmlcharrefreplace',
            ) as outputFile:
                outputFile.write(
                    normalizeCarriageReturns(
                        json.dumps(toDump, sort_keys=True, indent=4)))
            QMessageBox.information(None, 'Textable',
                                    'File list correctly exported',
                                    QMessageBox.Ok)

    def browse(self):
        """Open a file dialog and store the user's selection.

        With advanced settings displayed, several files may be selected;
        their normalized paths are joined with ' / ' into the 'newFiles'
        field. Otherwise a single file is selected, stored as the widget's
        file, and the change is signalled to the send button. Cancelling
        the dialog changes nothing.
        """
        if not self.displayAdvancedSettings:
            chosenPath, _ = QFileDialog.getOpenFileName(
                self, u'Open Text File', self.lastLocation,
                u'Text files (*)')
            if chosenPath:
                self.file = os.path.normpath(chosenPath)
                self.lastLocation = os.path.dirname(chosenPath)
                self.updateGUI()
                self.sendButton.settingsChanged()
            return

        chosenPaths, _ = QFileDialog.getOpenFileNames(
            self, u'Select Text File(s)', self.lastLocation,
            u'Text files (*)')
        if chosenPaths:
            normalizedPaths = [os.path.normpath(p) for p in chosenPaths]
            self.newFiles = u' / '.join(normalizedPaths)
            # Remember the directory of the last selected file.
            self.lastLocation = os.path.dirname(normalizedPaths[-1])
            self.updateGUI()

    def moveUp(self):
        """Swap the selected file with the entry just above it.

        Does nothing when no file is selected or when the selected file is
        already first. Otherwise the selection follows the moved file and the
        send button is notified that settings changed.
        """
        if not self.selectedFileLabels:
            return
        position = self.selectedFileLabels[0]
        if not position > 0:
            return
        above = position - 1
        self.files[above], self.files[position] = (
            self.files[position],
            self.files[above],
        )
        self.selectedFileLabels = [above]
        self.sendButton.settingsChanged()

    def moveDown(self):
        """Swap the selected file with the entry just below it.

        Does nothing when no file is selected or when the selected file is
        already last. Otherwise the selection follows the moved file and the
        send button is notified that settings changed.
        """
        if not self.selectedFileLabels:
            return
        position = self.selectedFileLabels[0]
        if not position < len(self.files) - 1:
            return
        below = position + 1
        self.files[below], self.files[position] = (
            self.files[position],
            self.files[below],
        )
        self.selectedFileLabels = [below]
        self.sendButton.settingsChanged()

    def clearAll(self):
        """Empty the file list and the current selection, in place."""
        # Slice deletion keeps the same list objects alive (only their
        # content is removed).
        for emptied in (self.files, self.selectedFileLabels):
            del emptied[:]
        self.sendButton.settingsChanged()

    def remove(self):
        """Remove the selected file from the file list.

        Does nothing when no file is selected. Otherwise the entry is removed,
        the selection is emptied in place and the send button is notified
        that settings changed.
        """
        if not self.selectedFileLabels:
            return
        self.files.pop(self.selectedFileLabels[0])
        del self.selectedFileLabels[:]
        self.sendButton.settingsChanged()

    def add(self):
        """Append the file(s) named in ``newFiles`` to the file list.

        ``newFiles`` is split on ' / ' (with any number of surrounding
        spaces). Every resulting path becomes one 7-item tuple: path,
        encoding, annotation key, annotation value, PDF password, OCR
        languages and OCR force flag, all taken from the current widget
        state. The send button is then notified that settings changed.
        """
        newPaths = re.split(r' +/ +', self.newFiles)
        # Drop everything from the first " (" on, e.g. a parenthesized
        # description following the encoding name.
        bareEncoding = re.sub(r"[ ]\(.+", "", self.encoding)
        for newPath in newPaths:
            entry = (
                newPath,
                bareEncoding,
                self.newAnnotationKey,
                self.newAnnotationValue,
                self.pdfPassword,  # SuperTextFiles
                self.ocrLanguages,  # SuperTextFiles
                self.ocrForce,  # SuperTextFiles
            )
            self.files.append(entry)
        self.sendButton.settingsChanged()

    def updateGUI(self):
        """Refresh the advanced-settings part of the interface.

        When advanced settings are hidden, only the advanced settings box is
        hidden. Otherwise the file labels are rebuilt (one left-aligned,
        padded line per file: file name, optional annotation, PDF password,
        OCR languages, OCR force flag, encoding), the previous selection is
        restored, and the buttons / line edits are enabled according to the
        current state.
        """
        if not self.displayAdvancedSettings:
            self.advancedSettings.setVisible(False)
            return

        # Remember the selected row so it can be restored after the rebuild.
        cachedLabel = (
            self.selectedFileLabels[0] if self.selectedFileLabels else None
        )

        del self.fileLabels[:]
        if self.files:
            baseNames = [os.path.basename(entry[0]) for entry in self.files]
            encodings = [entry[1] for entry in self.files]
            annotations = [
                '{%s: %s}' % (entry[2], entry[3]) for entry in self.files
            ]
            nameWidth = max(len(name) for name in baseNames) + 2
            maxAnnoLen = max(len(anno) for anno in annotations)
            # Start SuperTextFiles
            passwords = [entry[4] for entry in self.files]
            languages = [entry[5] for entry in self.files]
            forceFlags = [str(entry[6]) for entry in self.files]
            passwordWidth = max(len(pw) for pw in passwords) + 2
            languagesWidth = max(len(lang) for lang in languages) + 2
            # End SuperTextFiles

            rows = zip(
                baseNames, annotations, passwords, languages, forceFlags,
                encodings,
            )
            for baseName, anno, password, langs, forced, encoding in rows:
                columns = [baseName.ljust(nameWidth)]
                # '{: }' (4 characters) is what an empty annotation looks
                # like; the column only exists if some file is annotated.
                if maxAnnoLen > 4:
                    if len(anno) > 4:
                        columns.append(anno.ljust(maxAnnoLen + 2))
                    else:
                        columns.append(u' ' * (maxAnnoLen + 2))

                # Start SuperTextFiles
                columns.append(password.ljust(passwordWidth))
                columns.append(langs.ljust(languagesWidth))
                columns.append(forced.ljust(5 + 2))
                # End SuperTextFiles

                columns.append(encoding)
                self.fileLabels.append(u''.join(columns))

        # Self-assignment, presumably so that the control bound to this
        # attribute notices the in-place changes.
        self.fileLabels = self.fileLabels
        if cachedLabel is not None:
            # The pre-send callback is unset while the selection is restored.
            self.sendButton.sendIfPreCallback = None
            self.selectedFileLabels = [cachedLabel]
            self.sendButton.sendIfPreCallback = self.updateGUI

        # Adding requires pending files and an annotation that is either
        # complete (key and value) or entirely absent.
        annotationIsConsistent = (
            bool(self.newAnnotationKey) == bool(self.newAnnotationValue)
        )
        canAdd = bool(self.newFiles) and annotationIsConsistent
        self.addButton.setDisabled(not canAdd)
        self.autoNumberKeyLineEdit.setDisabled(not self.autoNumber)
        self.importFilenamesKeyLineEdit.setDisabled(not self.importFilenames)
        self.updateFileBoxButtons()
        self.advancedSettings.setVisible(True)

    def updateFileBoxButtons(self):
        """Enable or disable the buttons of the file box.

        Remove needs a selection; Move Up / Move Down additionally need room
        above / below the selected file; Clear All and Export need a
        non-empty file list.
        """
        hasSelection = bool(self.selectedFileLabels)
        if hasSelection:
            position = self.selectedFileLabels[0]
            canMoveUp = position > 0
            canMoveDown = position < len(self.files) - 1
        else:
            canMoveUp = canMoveDown = False

        self.removeButton.setDisabled(not hasSelection)
        self.moveUpButton.setDisabled(not canMoveUp)
        self.moveDownButton.setDisabled(not canMoveDown)

        listIsEmpty = not len(self.files)
        self.clearAllButton.setDisabled(listIsEmpty)
        self.exportButton.setDisabled(listIsEmpty)

    def setCaption(self, title):
        """Set the widget caption and flag a settings change if it differs.

        If the instance already exposes a ``captionTitle`` attribute, the new
        title is compared with it before delegating to the parent class, and
        the send button is notified when the title actually changed.
        Otherwise the call is simply forwarded to the parent class.
        """
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)

    def onDeleteWidget(self):
        """Deletion hook (per its name): delegate to clearCreatedInputs()."""
        self.clearCreatedInputs()
class Linguistica(OWTextableBaseWidget):
    """Textable widget for unsupervised morphology learning, using the
    "Crab Nebula" algorithm from John Goldsmith's Linguistica
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Linguistica"
    # NOTE(review): "Unupervised" looks like a typo in this runtime string.
    description = "Unupervised morphological analysis"
    icon = "icons/linguistica.svg"
    priority = 21

    #----------------------------------------------------------------------
    # Channel definitions...

    # Incoming segmentations are handed to the inputData() method.
    inputs = [("Word segmentation", Segmentation, "inputData")]
    outputs = [("Morphologically analyzed data", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = True

    #----------------------------------------------------------------------
    # Settings...

    # The settings version is __version__ without its last dot-separated
    # component.
    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    # Copied to lxa5crab.crab_nebula.MIN_STEM_LEN by sendData().
    minStemLen = settings.Setting(3)
    # NOTE(review): maxSuffixLen is not referenced by any method of this
    # class.
    maxSuffixLen = settings.Setting(4)

    def __init__(self):
        """Widget creator.

        Initializes the state attributes, then builds the control area (an
        "Options" box with the minimum stem length spin box) and the main
        area (a "Words" tab and a "Signatures" tab, each made of list boxes
        bound to the attributes declared below), and finally draws the send
        button and info box.
        """

        super().__init__()

        # Other attributes...
        self.inputSeg = None
        # Filled by sendData() with the keys "wordCounts", "signatures",
        # "stems", "suffixes" and "parser".
        self.morphology = dict()
        # Each selectedX / Xs pair below is bound to a list box through its
        # value= / labels= arguments.
        self.selectedMainWord = None
        self.mainWords = list()
        self.selectedParse = None
        self.parses = list()
        self.selectedStemForParse = None
        self.stemsForParse = list()
        self.selectedSuffixForParse = None
        self.suffixesForParse = list()
        self.selectedMainSignature = None
        self.mainSignatures = list()
        self.wordsForSig = list()
        self.stemsForSig = list()
        self.suffixesForSig = list()

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        # User interface...

        # A) Control area...

        # Options box...
        optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box="Options",
            orientation="vertical",
        )
        gui.spin(
            widget=optionsBox,
            master=self,
            value='minStemLen',
            label='Minimum length of stems: ',
            callback=self.sendButton.sendIf,
            labelWidth=180,
            tooltip=(
                'Select the minimum number of required characters in stems'),
            minv=LOWER_MIN_STEM_LEN,
            maxv=MAX_MORPH_LEN,
            step=1,
        )
        gui.separator(widget=optionsBox, height=2)

        gui.rubber(self.controlArea)

        # B) Main area...

        # Font requested for the list boxes on which setFont() is called
        # below.
        font = QFont()
        font.setFamily('Courier')
        font.setStyleHint(QFont.Courier)
        font.setPixelSize(12)

        # Tabs...
        self.tabs = QTabWidget()
        self.wordTab = QWidget()
        self.signatureTab = QWidget()
        self.tabs.addTab(self.wordTab, "Words")
        self.tabs.addTab(self.signatureTab, "Signatures")

        # Words tab...
        wordTabBox = QHBoxLayout()
        wordBox = gui.widgetBox(
            widget=self.wordTab,
            orientation="horizontal",
            margin=5,
        )

        # NOTE(review): wordBoxRight is created before wordBoxLeft inside the
        # horizontal wordBox, so the names may not match the on-screen
        # positions - confirm.
        wordBoxRight = gui.widgetBox(widget=wordBox)

        self.mainWordListbox = gui.listBox(
            widget=wordBoxRight,
            master=self,
            value="selectedMainWord",
            labels="mainWords",
            callback=self.mainWordSelected,
            tooltip="Select a word to display its possible parses.",
        )
        self.mainWordListbox.setFont(font)

        gui.separator(widget=wordBox, width=3)

        wordBoxLeft = gui.widgetBox(widget=wordBox)

        gui.label(
            widget=wordBoxLeft,
            master=self,
            label="Parse(s):",
        )

        self.parsesListbox = gui.listBox(
            widget=wordBoxLeft,
            master=self,
            value="selectedParse",
            labels="parses",
            callback=self.parseSelected,
            tooltip="Select a parse to display the corresponding signature.",
        )
        self.parsesListbox.setFont(font)

        # The title of this box is updated by mainWordSelected(),
        # parseSelected() and updateGUI().
        self.sigForParseBox = gui.widgetBox(
            widget=wordBoxLeft,
            box="Signature",
        )

        gui.label(
            widget=self.sigForParseBox,
            master=self,
            label="Stem(s):",
        )

        # Unlike the other list boxes, the next two have no value=/callback=
        # arguments and no setFont(font) call.
        self.stemsForParseListbox = gui.listBox(
            widget=self.sigForParseBox,
            master=self,
            labels="stemsForParse",
            tooltip="Stems associated with the parse selected above.",
        )

        gui.separator(widget=self.sigForParseBox, height=2)

        gui.label(
            widget=self.sigForParseBox,
            master=self,
            label="Suffixes(s):",
        )

        self.suffixesForParseListbox = gui.listBox(
            widget=self.sigForParseBox,
            master=self,
            labels="suffixesForParse",
            tooltip="Suffixes associated with the parse selected above.",
        )

        wordTabBox.addWidget(wordBox)
        self.wordTab.setLayout(wordTabBox)

        # Signature tab...
        signatureTabBox = QHBoxLayout()

        signatureBox = gui.widgetBox(
            widget=self.signatureTab,
            orientation="horizontal",
            margin=5,
        )

        signatureBoxRight = gui.widgetBox(widget=signatureBox)

        self.mainSignatureListbox = gui.listBox(
            widget=signatureBoxRight,
            master=self,
            value="selectedMainSignature",
            labels="mainSignatures",
            callback=self.mainSignatureSelected,
            tooltip="Select a signature to display its contents.",
        )
        self.mainSignatureListbox.setFont(font)

        gui.separator(widget=signatureBox, width=3)

        signatureBoxLeft = gui.widgetBox(widget=signatureBox)

        gui.label(
            widget=signatureBoxLeft,
            master=self,
            label="Words:",
        )

        self.wordsForSigListbox = gui.listBox(
            widget=signatureBoxLeft,
            master=self,
            labels="wordsForSig",
            tooltip="Words associated with the selected signature.",
        )
        self.wordsForSigListbox.setFont(font)

        gui.label(
            widget=signatureBoxLeft,
            master=self,
            label="Stem(s):",
        )

        self.stemsForSigListbox = gui.listBox(
            widget=signatureBoxLeft,
            master=self,
            labels="stemsForSig",
            tooltip="Stems associated with the selected signature.",
        )
        self.stemsForSigListbox.setFont(font)

        gui.label(
            widget=signatureBoxLeft,
            master=self,
            label="Suffixes(s):",
        )

        self.suffixesForSigListbox = gui.listBox(
            widget=signatureBoxLeft,
            master=self,
            labels="suffixesForSig",
            tooltip="Suffixes associated with the selected signature.",
        )
        self.suffixesForSigListbox.setFont(font)

        signatureTabBox.addWidget(signatureBox)
        self.signatureTab.setLayout(signatureTabBox)

        self.mainArea.layout().addWidget(self.tabs)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input", "warning")

        self.setMinimumWidth(602)
        self.setMinimumHeight(317)
        self.adjustSizeWithTimer()

        # Send data if autoSend.
        self.sendButton.sendIf()

    def inputData(self, newInput):
        """Process incoming data.

        Stores ``newInput`` as the current input segmentation, notifies the
        info box that the input changed, and lets the send button decide
        whether to send (``sendIf``).
        """
        self.inputSeg = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def mainSignatureSelected(self):
        """Display the words, stems and suffixes of the selected signature.

        Index 0 of the signature list is the "NULL" pseudo-signature, which
        gathers the words whose first parse has signature 0; index ``i > 0``
        refers to the ``i``-th key of ``morphology["signatures"]`` (a
        sequence of suffixes mapped to its stems). Words are listed by
        decreasing frequency, each prefixed with its right-aligned count.

        When nothing is selected, the three dependent lists (words, stems and
        suffixes) are all emptied. A signature without any word yields an
        empty word list instead of raising ``IndexError``.
        """
        # Return if no selected signature (also covers a still-unset, None
        # selection), leaving no stale content behind...
        if not self.selectedMainSignature:
            self.wordsForSig = list()
            self.stemsForSig = list()
            self.suffixesForSig = list()
            return

        sigIndex = self.selectedMainSignature[0]
        wordCounts = self.morphology["wordCounts"]
        sigs = self.morphology["signatures"]

        # Get generated words (by decreasing frequency)...
        if sigIndex == 0:
            suffixes = None
            words = sorted(
                w for w in wordCounts
                if self.morphology["parser"][w][0].signature == 0
            )
        else:
            suffixes = list(sigs.keys())[sigIndex - 1]
            # Every stem of the signature combined with every suffix.
            words = [
                "".join(pair)
                for pair in itertools.product(sigs[suffixes], suffixes)
            ]
        words.sort(key=wordCounts.get, reverse=True)

        # Display generated words; the count column is as wide as the
        # largest count (i.e. the first word's) plus one...
        if words:
            padding = len(str(wordCounts[words[0]])) + 1
            self.wordsForSig = [
                '{num: {width}} {word}'.format(
                    num=wordCounts[word],
                    width=padding,
                    word=word,
                ) for word in words
            ]
        else:
            self.wordsForSig = list()

        # Display stems and suffixes in signature...
        if sigIndex > 0:
            self.suffixesForSig = [suffix or "NULL" for suffix in suffixes]
            self.stemsForSig = sigs[suffixes]
        else:
            self.suffixesForSig = ["NULL"]
            self.stemsForSig = sorted(words)

    def mainWordSelected(self):
        """Show the parses of the word selected in the main word list.

        The signature box title is reset first. Without a selection the parse
        list is emptied; otherwise the selected index is resolved against the
        words ordered by decreasing frequency, the parses are listed as
        "score stem + suffix" (an empty suffix is shown as "NULL"), the first
        parse is selected and its signature is displayed.
        """
        self.sigForParseBox.setTitle(" Signature ")

        # Nothing to display without a selected word...
        if not len(self.selectedMainWord):
            self.parses = list()
            return

        # Same ordering as the one used to fill the main word list.
        counts = self.morphology["wordCounts"]
        rankedWords = sorted(counts.keys(), key=counts.get, reverse=True)
        chosenWord = rankedWords[self.selectedMainWord[0]]

        self.parses = [
            '{score:.2f} {stem} + {suffix}'.format(
                score=candidate.score,
                stem=candidate.stem,
                suffix=candidate.suffix or "NULL",
            )
            for candidate in self.morphology["parser"][chosenWord]
        ]

        # Select the first parse and display its signature.
        self.selectedParse = [0]
        self.parseSelected()

    def parseSelected(self):
        """Show the signature of the parse selected in the parse list.

        Without a selection the stem and suffix lists are emptied. Otherwise
        the selected parse of the selected word is looked up: for a positive
        signature number the matching entry of ``morphology["signatures"]``
        provides suffixes and stems; for the other numbers the box is titled
        "Signature 0", the only suffix is "NULL" and the stems are the words
        whose first parse has signature 0.
        """
        # Nothing to display without a selected parse...
        if not len(self.selectedParse):
            self.stemsForParse = list()
            self.suffixesForParse = list()
            return

        # Resolve the selected word (same ordering as the main word list),
        # then the selected parse of that word.
        counts = self.morphology["wordCounts"]
        rankedWords = sorted(counts.keys(), key=counts.get, reverse=True)
        parser = self.morphology["parser"]
        wordParses = parser[rankedWords[self.selectedMainWord[0]]]
        sigNum = wordParses[self.selectedParse[0]].signature

        if sigNum > 0:
            self.sigForParseBox.setTitle(" Signature {} ".format(sigNum))
            allSignatures = self.morphology["signatures"]
            # Signature numbers are 1-based positions among the keys.
            sigSuffixes = list(allSignatures.keys())[sigNum - 1]
            self.suffixesForParse = [
                suffix or "NULL" for suffix in sigSuffixes
            ]
            self.stemsForParse = allSignatures[sigSuffixes]
            return

        self.sigForParseBox.setTitle(" Signature 0 ")
        self.suffixesForParse = ["NULL"]
        self.stemsForParse = sorted(
            [w for w in rankedWords if parser[w][0].signature == 0]
        )

    def sendData(self):
        """Compute result of widget processing and send to output.

        The input words are counted, signatures are learned with lxa5crab
        (using ``minStemLen`` as minimum stem length) and every input segment
        is copied with "stem", "suffix" and "signature" annotations taken
        from its first parse. The resulting segmentation is sent on the
        "Morphologically analyzed data" channel.

        ``None`` is sent instead (with a warning in the info box) when there
        is no input, when the input contains no segment, or when signature
        learning raises ``ValueError``. Whatever happens during processing,
        the progress bar is cleared and the control area re-enabled.
        """

        # Clear morphology...
        self.morphology = dict()

        # Check that there's an input with at least one segment (an empty
        # one would otherwise end in an IndexError / ZeroDivisionError)...
        if self.inputSeg is None or len(self.inputSeg) == 0:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Morphologically analyzed data", None, self)
            self.updateGUI()
            return

        # Perform morphological analysis...

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait (word count)...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=100)

        try:
            # Word count...
            wordCounts = collections.Counter(
                segment.get_content() for segment in self.inputSeg)
            self.morphology["wordCounts"] = wordCounts
            self.infoBox.setText(
                u"Processing, please wait (signature extraction)...",
                "warning",
            )
            progressBar.advance(5)  # 5 ticks on the progress bar...

            # Learn signatures...
            try:
                lxa5crab.crab_nebula.MIN_STEM_LEN = self.minStemLen
                signatures, stems, suffixes = lxa5crab.find_signatures(
                    wordCounts)
            except ValueError as e:
                self.infoBox.setText(str(e), "warning")
                self.send("Morphologically analyzed data", None, self)
                self.morphology = dict()
                self.updateGUI()
                return
            self.morphology["signatures"] = signatures
            self.morphology["stems"] = stems
            self.morphology["suffixes"] = suffixes
            self.infoBox.setText(
                u"Processing, please wait (word parsing)...",
                "warning",
            )
            progressBar.advance(80)

            # Parse words...
            parser = lxa5crab.build_parser(
                wordCounts, signatures, stems, suffixes)
            self.morphology["parser"] = parser
            newSegments = list()
            num_analyzed_words = 0
            for segment in self.inputSeg:
                # Only the first parse of each word is used for annotation.
                firstParse = parser[segment.get_content()][0]
                newSegment = segment.deepcopy()
                if firstParse.signature:
                    num_analyzed_words += 1
                newSegment.annotations.update(
                    {
                        "stem": firstParse.stem,
                        "suffix": firstParse.suffix
                                  if len(firstParse.suffix) else "NULL",
                        "signature": firstParse.signature,
                    }
                )
                newSegments.append(newSegment)
            self.send(
                "Morphologically analyzed data",
                Segmentation(newSegments, self.captionTitle),
                self,
            )
            self.updateGUI()
            progressBar.advance(15)

            # Set status to OK and report data size...
            numSegments = len(self.inputSeg)
            message = "%i segment@p sent to output (%.2f%% analyzed)." % (
                numSegments, num_analyzed_words / numSegments * 100)
            message = pluralize(message, numSegments)
            self.infoBox.setText(message)
        finally:
            # Clear progress bar and give the controls back, even if the
            # analysis failed unexpectedly.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()

    def updateGUI(self):
        """Update GUI state.

        All displayed lists are emptied and the signature box title is reset.
        If a morphology is available, the main word list (words by decreasing
        frequency, each prefixed with its right-aligned count) and the main
        signature list (numbered from 0, entry 0 being the "NULL"
        pseudo-signature) are filled again. Empty word counts yield an empty
        word list instead of raising ``IndexError``.
        """

        # Empty lists...
        self.mainWords = list()
        self.parses = list()
        self.stemsForParse = list()
        self.suffixesForParse = list()
        self.sigForParseBox.setTitle(" Signature ")
        self.mainSignatures = list()
        self.wordsForSig = list()
        self.stemsForSig = list()
        self.suffixesForSig = list()

        # Fill main lists if necessary...
        if not self.morphology:
            return

        # Main word list...
        wordCounts = self.morphology["wordCounts"]
        words = sorted(wordCounts, key=wordCounts.get, reverse=True)
        if words:
            # The count column is as wide as the largest count plus one.
            padding = len(str(wordCounts[words[0]])) + 1
            self.mainWords = [
                '{num: {width}} {word}'.format(
                    num=wordCounts[word],
                    width=padding,
                    word=word,
                ) for word in words
            ]

        # Main signature list...
        sigs = [["NULL"]] + list(self.morphology["signatures"].keys())
        padding = len(str(len(sigs))) + 1
        self.mainSignatures = [
            '{num: {width}} {sig}'.format(
                num=idx,
                width=padding,
                sig=", ".join([suff or "NULL" for suff in sig]))
            for idx, sig in enumerate(sigs)
        ]

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        """Set the widget caption and flag a settings change if it differs.

        If the instance already exposes a ``captionTitle`` attribute, the new
        title is compared with it before delegating to the parent class, and
        the send button is notified when the title actually changed.
        Otherwise the call is simply forwarded to the parent class.
        """
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
# Example #3
# 0
class ExtractCSV(OWTextableBaseWidget):
    """Textable widget to extract CSV data using the csv module and its
    Sniffer."""

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Extract CSV"
    description = "Extract tabulated data as a Textable Segmentation"
    icon = "icons/extractcsv.png"
    priority = 21   # TODO

    #----------------------------------------------------------------------
    # Channel definitions...

    # Incoming segmentations are handed to the inputData() method.
    inputs = [("CSV Data", Segmentation, "inputData")]
    outputs = [("CSV Segmentation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...
    
    want_main_area = False

    #----------------------------------------------------------------------
    
    # Settings...

    # The settings version is __version__ without its last dot-separated
    # component.
    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )
    
    autoSend = settings.Setting(False)

    # NOTE(review): both settings below are assigned again in __init__.
    content_column = settings.Setting(0)
    deleteQuotes = settings.Setting(False)

    def __init__(self):
        """Widget creator.

        Initializes the state attributes, then builds the control area: a
        "Preprocess" box (quotation mark removal), a box listing the headers
        with "rename" / "use as content" buttons, and an initially hidden
        "Rename header" box. Finally draws the send button and info box.
        """

        super().__init__()

        # Other attributes...

        self.inputSeg = None
        self.outputSeg = None
        self.dialect = None
        # Bound to the header list box (value="selectedHeader").
        self.selectedHeader = None
        self.csvSeg = list()
        # list of deleted segments
        self.contentIsNone = list()
        # list for gui
        self.headerList = list()
        # NOTE(review): this assignment replaces whatever value the
        # content_column Setting declared on the class had - confirm that
        # resetting it on every instantiation is intended.
        self.content_column = 0
        self.headerEdit = ""

        # those are for the rename function
        self.renamedHeader = None
        self.isRenamed = False
        self.dict_keys = list()

        # preprocess
        # NOTE(review): same remark as for content_column - the deleteQuotes
        # Setting is overwritten here.
        self.deleteQuotes = False

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )
        #self.header_there = False

        #----------------------------------------------------------------------
        # User interface...

        # preprocess box...
        self.preprocessBox = gui.widgetBox(
            widget=self.controlArea,
            box="Preprocess",
            orientation="vertical",
        )
        # check box...
        self.checkQuotes = gui.checkBox(
            widget=self.preprocessBox,
            master=self,
            value='deleteQuotes',
            label='delete quotation marks',
            callback=self.delete_quotes,
        )

        # main box...
        self.mainBox = gui.widgetBox(
            widget=self.controlArea,
            box="Click to select a header to modify",
            orientation="vertical",
        )

        # List of all the headers (named with numbers if None)
        self.headerListbox = gui.listBox(
            widget=self.mainBox,
            master=self,
            value="selectedHeader",
            labels="headerList",
            callback=self.update_gui,
            selectionMode=1, # can only choose one item
            tooltip="list of all your headers",
        )

        # set "rename" button (must be aside the list)
        self.renameHeader = gui.button(
            widget=self.mainBox,
            master=self,
            label="rename",
            callback=self.set_renamebox,
            tooltip="click to rename header"
        )

        # set "use as content" button (must be aside the list)
        self.iscontentHeader = gui.button(
            widget=self.mainBox,
            master=self,
            label="use as content",
            callback=self.content_changed,
            tooltip="click to select as content"
        )

        #----------------------------------------------------------------------
        # rename box...

        self.renameBox = gui.widgetBox(
            widget=self.controlArea,
            box='Rename header',
            orientation='horizontal',
            addSpace=True,
        )
        gui.separator(widget=self.renameBox, height=3)
        # The callback refers to self.renameButton, which is only created
        # right after this line edit; it is looked up when the callback runs.
        self.headerEditLine = gui.lineEdit(
            widget=self.renameBox,
            master=self,
            value='headerEdit',
            orientation='horizontal',
            label='New header:',
            tooltip=(
                "Rename the selected header."
            ),
            callback=lambda: self.renameButton.setDisabled(not self.headerEdit),
        )
        self.renameButton = gui.button(
            widget=self.renameBox,
            master=self,
            label="rename",
            callback=self.rename,
            tooltip="click to rename header"
        )
        self.cancelButton = gui.button(
            widget=self.renameBox,
            master=self,
            label="cancel",
            callback=self.cancel,
            tooltip="click to cancel renaming"
        )
        #----------------------------------------------------------------------
        # interface parameters...

        # NOTE(review): update_gui() reads self.selectedHeader, which was set
        # to None above; this relies on the list box binding having replaced
        # it with a sized object by now - confirm.
        self.update_gui()
        self.renameBox.setVisible(False)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input", "warning")
        
        # Send data if autoSend.
        self.sendButton.sendIf()

    #----------------------------------------------------------------------

    def update_gui(self):
        """Enable the header buttons only while a header is selected.

        The "use as content" and "rename" buttons are disabled when the
        selection is empty or still unset (``selectedHeader`` is initialized
        to ``None`` in ``__init__``), and enabled otherwise.
        """
        # Truthiness covers both None and an empty selection, where
        # len(None) would raise TypeError.
        nothingSelected = not self.selectedHeader
        self.iscontentHeader.setDisabled(nothingSelected)
        self.renameHeader.setDisabled(nothingSelected)

    def content_changed(self):
        """Use the selected header as content column and reprocess input."""
        selectedIndex = self.selectedHeader[0]
        self.content_column = int(selectedIndex)
        self.treat_input()

    def delete_quotes(self):
        """Callback of the quotation marks check box: reprocess the input."""
        self.treat_input()

    def set_renamebox(self):
        """Enter rename mode for the currently selected header.

        The index of the selected header is remembered, the rename box is
        shown with its "rename" button disabled, and the other controls
        (header buttons, header list, quotation marks check box) are
        disabled.
        """
        # Remember which header is being renamed.
        self.renamedHeader = int(self.selectedHeader[0])

        # Show the rename controls.
        self.renameBox.setVisible(True)
        self.renameButton.setDisabled(True)

        # Lock everything else.
        lockedControls = (
            self.iscontentHeader,
            self.renameHeader,
            self.headerListbox,
            self.checkQuotes,
        )
        for control in lockedControls:
            control.setDisabled(True)

    def rename(self):
        """Apply the new name to the header being renamed and leave rename
        mode.

        The header at position ``renamedHeader`` in ``dict_keys`` is replaced
        by ``headerEdit``; nothing is replaced if that position is unset or
        out of range. The input is then processed again, the regular controls
        are restored and the edit field is emptied.
        """
        # Replace by position: looking the header up by name would hit the
        # first occurrence only, so a header sharing its name with an
        # earlier one could never be renamed.
        position = self.renamedHeader
        if position is not None and 0 <= position < len(self.dict_keys):
            self.dict_keys[position] = self.headerEdit
        # implement check value
        self.isRenamed = True
        # and treat again
        self.treat_input()

        # here we get back to normal gui
        self.renameBox.setVisible(False)
        self.headerListbox.setDisabled(False)
        self.checkQuotes.setDisabled(False)
        self.update_gui()
        # clear value
        self.headerEdit = ""

    def cancel(self):
        """Leave rename mode without renaming anything.

        Hides the rename box, re-enables the controls that set_renamebox()
        disabled (header list and quotation marks check box directly, header
        buttons through update_gui()) and empties the edit field.
        """
        # here we get back to normal gui
        self.renameBox.setVisible(False)
        self.headerListbox.setDisabled(False)
        # set_renamebox() also disables this check box; without re-enabling
        # it here it would stay locked after a cancelled rename.
        self.checkQuotes.setDisabled(False)
        self.update_gui()
        # clear value
        self.headerEdit = ""

    def treat_input(self):
        """Parse the input segments as CSV and build the output segments.

        Each input segment is read with the ``csv`` module, using a dialect
        sniffed from its first line (quoting disabled). For every data row,
        the cell in column ``self.content_column`` delimits one output
        segment, and the other non-empty cells become annotations keyed by
        header name. Headers come from the first row when the sniffer detects
        one, otherwise columns are numbered from 1; headers renamed by the
        user (``self.isRenamed``) are kept as they are.

        Results are stored in ``self.csvSeg``; the numbers of the rows
        without content are stored in ``self.contentIsNone``; the headers are
        listed in ``self.headerList``. The send button is then asked to send.
        """
        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            del self.headerList[:]
            self.headerList = self.headerList
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        try:
            # Discard the results of any previous run.
            del self.csvSeg[:]
            del self.contentIsNone[:]

            # Process each input segment...
            for segment in self.inputSeg:

                # Input segment attributes...
                inputContent = segment.get_content()
                if self.deleteQuotes:
                    # NOTE(review): offsets computed below refer to the
                    # content WITHOUT quotes, so they drift from the original
                    # string whenever quotes are actually removed.
                    inputContent = inputContent.replace('"', "")
                inputAnnotations = segment.annotations
                inputStrIdx = segment.str_index
                # Offsets found in the content are relative to the start of
                # the input segment within its string.
                inputStart = segment.start or 0

                # Set up the CSV reader with a dialect sniffed from line 1.
                csv_stream = io.StringIO(inputContent)
                dialect = sniffer.sniff(csv_stream.readline())
                dialect.quoting = csv.QUOTE_NONE
                csv_stream.seek(0)
                my_reader = csv.reader(csv_stream, dialect)

                # Offset of the current row within the content.
                # TODO : separator and line ending length (if not 1)
                position = 0

                # Sniffing for a header is costly: do it only once.
                if sniffer.has_header(inputContent):
                    header_cells = next(my_reader)
                    if not self.isRenamed:
                        self.dict_keys = header_cells
                    # Data starts right after the header row.
                    position += sum(len(cell) + 1 for cell in header_cells)
                else:
                    # No header: number the columns after the first row, then
                    # rewind so that this row is read again as data.
                    first_row = next(my_reader)
                    if not self.isRenamed:
                        self.dict_keys = [
                            str(number)
                            for number in range(1, len(first_row) + 1)
                        ]
                    csv_stream.seek(0)

                keys = self.dict_keys
                content_column = self.content_column

                # List the headers in the GUI, flagging the content column.
                del self.headerList[:]
                for column, key in enumerate(keys):
                    label = str(key)
                    if column == content_column:
                        label += "(*content)"
                    self.headerList.append(label)
                # Re-assign the attribute, as done elsewhere in this method
                # (presumably so that the bound GUI control is refreshed).
                self.headerList = self.headerList

                for idx, row in enumerate(my_reader, start=2):
                    # Each output segment inherits the input annotations.
                    segAnnotations = dict(inputAnnotations)
                    content = ""
                    content_start = position
                    offset = position

                    # Walk through the cells by position, so that duplicate
                    # header names and short or long rows are handled.
                    for column, value in enumerate(row):
                        if column == content_column:
                            content = value
                            content_start = offset
                        elif column < len(keys) and value:
                            # only non-empty values become annotations
                            segAnnotations[keys[column]] = value
                        offset += len(value) + 1
                    if not row:
                        # A blank line still consumes its line ending.
                        offset += 1

                    if content:
                        self.csvSeg.append(
                            Segment(
                                str_index=inputStrIdx,
                                start=inputStart + content_start,
                                end=inputStart + content_start + len(content),
                                annotations=segAnnotations,
                            )
                        )
                    else:
                        # if no content, add idx of the row and do not append
                        # TODO : something with contentIsNone
                        self.contentIsNone.append(idx)

                    # The next row starts where this one ends.
                    position = offset

                progressBar.advance()

            # Set status to OK and report segment analyzed...
            numSegments = len(self.csvSeg)
            message = "%i segment@p analyzed." % numSegments
            message = pluralize(message, numSegments)
            message += " (Ignored %i segment@p with no content)" % \
                len(self.contentIsNone)
            message = pluralize(message, len(self.contentIsNone))
            self.infoBox.setText(message)
        finally:
            # Always restore the interface, even if parsing failed.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()
        self.sendButton.sendIf()

    def inputData(self, newInput):
        """Store the incoming segmentation and treat it from scratch."""
        self.inputSeg = newInput
        self.infoBox.inputChanged()

        # Headers and renaming inherited from the previous input are dropped.
        self.isRenamed = False
        self.dict_keys.clear()

        self.sendButton.sendIf()

        self.treat_input()

    def sendData(self):
        """Compute result of widget processing and send to output.

        Sends ``None`` when there is no input. Otherwise the segments stored
        in ``self.csvSeg`` are wrapped in a segmentation labelled with the
        widget caption and sent, and the info box reports how many segments
        were sent and how many rows were ignored for lack of content.
        """
        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            del self.headerList[:]
            self.headerList = self.headerList
            self.send("CSV Segmentation", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        # The segments are already built (see self.csvSeg): there is no
        # per-segment work left to do here, only the wrapping below.
        outputSeg = Segmentation(self.csvSeg, label=self.captionTitle)

        # Set status to OK and report data size...
        numSent = len(outputSeg)
        message = "%i segment@p sent to output." % numSent
        message = pluralize(message, numSent)
        # Mention rows that had no content and were ignored. The text is
        # built in one piece: a backslash continuation inside the literal
        # would embed the source indentation in the displayed message.
        numIgnored = len(self.contentIsNone)
        if numIgnored:
            message += " (ignored %i %s with no content)" % (
                numIgnored,
                "segment" if numIgnored == 1 else "segments",
            )
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Send data to output...
        self.send("CSV Segmentation", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        """Set the widget caption and flag a settings change when the
        caption is actually modified."""
        # 'captionTitle' may not exist yet when this is called; in that case
        # the caption is simply forwarded to the parent class.
        if 'captionTitle' in dir(self):
            # Compare BEFORE delegating, since the parent call may update
            # self.captionTitle (TODO confirm).
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
Exemple #4
0
class Treetagger(OWTextableBaseWidget):
    """Orange widget for POS-tagging and lemmatization with Treetagger"""

    name = "Treetagger"
    description = "POS-tagging and lemmatization with Treetagger"
    icon = "icons/treetagger.svg"
    priority = 2003

    inputs = [("Segmentation", Segmentation, "inputData")]
    outputs = [("Tagged data", Segmentation)]

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    language = settings.Setting(0)
    replaceUnknown = settings.Setting(False)
    outputFormat = settings.Setting("segment into words")

    want_main_area = False

    configFilePath = os.path.normpath(
        appdirs.user_data_dir("textable", "langtech") + "/treetagger_path")

    def __init__(self, *args, **kwargs):
        """Initialize a Treetagger widget and build its interface.

        All positional and keyword arguments are forwarded to the parent
        class.
        """
        super().__init__(*args, **kwargs)

        # Other attributes...
        self.segmentation = None
        self.createdInputs = list()
        # Warnings displayed in the info box when Treetagger cannot be used.
        self.noLanguageParameterWarning = (
            "Please make sure that at least one language parameter "
            "file is installed in your Treetagger 'lib' directory, "
            "then click 'Reload language parameter files'.")
        self.noTreetaggerPathWarning = (
            "Please click 'Locate Treetagger' below and select the "
            "base directory of a valid Treetagger distribution.")
        # Try to locate Treetagger automatically, else fall back on the path
        # saved in app data (if any).
        self.TreetaggerPath = (treetaggerwrapper.locate_treetagger()
                               or self.lookupSavedTreetaggerPath())

        # Info box and send button are declared here and drawn further below.
        self.infoBox = InfoBox(widget=self.controlArea)

        self.sendButton = SendButton(widget=self.controlArea,
                                     master=self,
                                     callback=self.sendData,
                                     infoBoxAttribute=u"infoBox",
                                     sendIfPreCallback=self.updateGUI)

        gui.separator(self.controlArea, height=3)

        # Options box...
        self.optionsBox = gui.widgetBox(
            self.controlArea,
            u"Options",
        )

        # Input language; items are filled in by getAvailableLanguages().
        self.languageCombobox = gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="language",
            items=list(),
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Input language:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Select the language of the input text."),
        )
        self.languageCombobox.setMinimumWidth(120)

        gui.separator(self.optionsBox, height=3)

        # Output format...
        gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="outputFormat",
            items=[
                "segment into words",
                "add XML tags",
            ],
            sendSelectedValue=True,
            orientation=u"horizontal",
            label="Output format:",
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"Select the format of the output:\n\n"
                u"Segment into words: each word is in a separate segment,\n"
                u"with lemma and POS-tag as annotations.\n\n"
                u"Add XML tags: output segments correspond to input segments\n"
                u"and each word is tagged in XML as a 'w' element with\n"
                u"lemma and POS-tag as attributes."),
        )

        gui.separator(self.optionsBox, height=3)

        # Handling of unknown lemmas...
        gui.checkBox(
            widget=self.optionsBox,
            master=self,
            value="replaceUnknown",
            label="Output token in place of [unknown] lemmas",
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"For out-of-vocabulary words, the word form is used as the\n"
                u"lemma (in place of Treetagger's default 'unknown' code)."),
        )

        gui.rubber(self.controlArea)

        # Now send button and info box can be drawn...
        self.sendButton.draw()
        self.infoBox.draw()

        # Box holding the button used to locate Treetagger (or to reload the
        # language parameter files); its visibility is set in updateGUI().
        self.locateTreetaggerBox = gui.widgetBox(
            self.controlArea,
            addSpace=False,
        )

        gui.separator(self.locateTreetaggerBox, height=3)

        self.treetaggerButton = gui.button(
            widget=self.locateTreetaggerBox,
            master=self,
            label="Locate Treetagger",
            callback=self.validateTreetagger,
            tooltip=(
                u"Click to select the location of the Treetagger base\n"
                u"directory (containing the 'lib' and 'bin' subdirectories)."),
        )

        self.sendButton.sendIf()

        self.adjustSizeWithTimer()

    def inputData(self, inputData):
        """Store the incoming segmentation, notify the info box that the
        input changed, then hand over to the send button (sendIf)."""
        self.segmentation = inputData
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def sendData(self):
        """Tag the input segmentation with Treetagger and send the result.

        ``None`` is sent (with a warning in the info box) when the Treetagger
        path is unknown, when no language parameter file is available, or
        when there is no input. Otherwise the input is tagged in a single
        call to Treetagger and the output is either one segment per word,
        annotated with lemma and POS-tag, or the input segments with each
        word wrapped in a 'w' XML element, depending on ``self.outputFormat``.
        """
        # Clear created Inputs...
        self.clearCreatedInputs()

        if not self.TreetaggerPath:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.getAvailableLanguages():
            self.infoBox.setText(self.noLanguageParameterWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.segmentation:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Tagged data", None)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        self.progressBar = ProgressBar(self, iterations=5)

        # Whatever happens below (e.g. Treetagger failing to run), the
        # progress bar is cleared and the control area re-enabled on exit.
        try:
            # Gather the input segments, storing their annotations (as a
            # string of XML attributes) in a temporary annotation...
            # NOTE(review): 'tt_ax' is written to the annotations of the
            # segments yielded by the input segmentation; confirm that this
            # does not alter the input as seen by other widgets.
            copy_of_input_seg = Segmentation()
            copy_of_input_seg.label = self.segmentation.label
            for segment in self.segmentation:
                attr = " ".join(
                    "%s=%s" % (
                        # Strip combining marks (accents) from the key, which
                        # is used as an XML attribute name.
                        ''.join(
                            c for c in unicodedata.normalize('NFD', key)
                            if unicodedata.category(c) != 'Mn'
                        ),
                        quoteattr(str(value)),
                    )
                    for key, value in segment.annotations.items()
                )
                segment.annotations["tt_ax"] = attr
                copy_of_input_seg.append(segment)

            self.progressBar.advance()

            # Dump segmentation in unique string to avoid multiple calls to
            # TT...
            concatenated_text = copy_of_input_seg.to_string(
                formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
                display_all=True,
            )

            self.progressBar.advance()

            # Tag the segmentation contents...
            tagopt = '-token -lemma -sgml -quiet'
            if self.replaceUnknown:
                tagopt += " -no-unknown"
            tagger = treetaggerwrapper.TreeTagger(
                TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
                TAGOPT=tagopt,
                TAGDIR=self.TreetaggerPath,
            )
            tagged_lines = tagger.tag_text(
                concatenated_text,
                notagurl=True,
                notagemail=True,
                notagip=True,
                notagdns=True,
            )
            tagged_input = Input("\n".join(tagged_lines))
            # Keep track of the new Input so that clearCreatedInputs() can
            # clear it later.
            self.createdInputs.append(tagged_input)

            # Replace <unknown> with [unknown] and " with &quot; then
            # re-segment to match the original segmentation structure.
            tagged_segmentation, _ = Segmenter.recode(
                tagged_input,
                substitutions=[
                    (re.compile(r"<unknown>"), "[unknown]"),
                    (re.compile(r'"""'), '"&quot;"'),
                ],
            )
            tagged_segmentation = Segmenter.import_xml(tagged_segmentation,
                                                       "ax_tt")

            self.progressBar.advance()

            # Place each output line of Treetagger in an xml tag with
            # annotations..
            xml_segmentation, _ = Segmenter.recode(
                tagged_segmentation,
                substitutions=[
                    (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                     '<w lemma="&3" pos-tag="&2">&1</w>'),
                    (re.compile(r'^\n|\n$'), ''),
                ],
            )
            # Segment into individual tokens if XML output option is
            # disabled...
            if self.outputFormat == "add XML tags":
                output_segmentation = xml_segmentation
            else:
                try:
                    output_segmentation = Segmenter.import_xml(
                        xml_segmentation, "w")
                except ValueError:
                    self.infoBox.setText(
                        "Please check that either the input contains well-formed "
                        "XML, or it doesn't contain instances of '&#60;' and '\x3e'",
                        "error")
                    self.send("Tagged data", None)
                    return
        finally:
            self.progressBar.finish()
            self.controlArea.setDisabled(False)

        output_segmentation.label = self.captionTitle
        message = u'%i segment@p sent to output.' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        self.infoBox.setText(message)
        self.send('Tagged data', output_segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def updateGUI(self):
        """Update GUI state according to what is known about Treetagger.

        The options are enabled only when the Treetagger path is known and
        at least one language is available; otherwise a warning is shown and
        the box holding the Treetagger button is made visible.
        """
        if not self.TreetaggerPath:
            # Treetagger not located yet: the user is invited to locate it.
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.optionsBox.setDisabled(True)
            self.locateTreetaggerBox.setVisible(True)
            self.adjustSizeWithTimer()
            return

        self.optionsBox.setDisabled(False)
        self.locateTreetaggerBox.setVisible(False)

        # Refill the language list from scratch.
        self.languageCombobox.clear()
        availableLanguages = self.getAvailableLanguages()
        if availableLanguages:
            # Keep the current language if any, else take the first one.
            self.language = self.language or availableLanguages[0]
        else:
            # No parameter file: the button now serves to reload them.
            self.infoBox.setText(self.noLanguageParameterWarning,
                                 "warning")
            self.optionsBox.setDisabled(True)
            self.locateTreetaggerBox.setVisible(True)
            self.treetaggerButton.setText(
                "Reload language parameter files")

        self.adjustSizeWithTimer()

    def getAvailableLanguages(self):
        """Return the names of the languages usable with Treetagger.

        Every language code known to treetaggerwrapper is probed by creating
        a tagger for it with the current Treetagger path; codes for which
        this fails are skipped. As a side effect, the name of each usable
        language is added to the language combobox if not listed yet.
        """
        languages = list()
        for lang_code in sorted(treetaggerwrapper.g_langsupport):
            if lang_code.startswith("__"):
                continue
            try:
                # Probe: raises if the language cannot be used.
                treetaggerwrapper.TreeTagger(
                    TAGLANG=lang_code,
                    TAGDIR=self.TreetaggerPath,
                )
                language = pycountry.languages.get(alpha_2=lang_code).name
            except Exception:
                # Language not usable (or unknown to pycountry): skip it.
                # Unlike a bare 'except', this lets KeyboardInterrupt and
                # SystemExit through.
                continue
            # This method is also called without the combobox being cleared
            # first (e.g. from sendData), so avoid adding duplicate entries.
            if self.languageCombobox.findText(language) == -1:
                self.languageCombobox.addItem(language)
            languages.append(language)
        return languages

    def lookupSavedTreetaggerPath(self):
        """Look for a saved Treetagger base dir path in app data.

        Returns the content of the config file if that file exists and its
        content passes checkTreetaggerPath(); returns None otherwise. A
        config file holding a path that fails the check is deleted.
        """
        configFilePath = self.__class__.configFilePath
        if not os.path.exists(configFilePath):
            return None
        try:
            # The context manager closes the file even if reading fails.
            with open(configFilePath, "r") as inputFile:
                savedPath = inputFile.read()
            if self.checkTreetaggerPath(savedPath):
                return savedPath
            # The saved path is no longer valid: forget it.
            os.remove(configFilePath)
        except IOError:
            pass
        return None

    def validateTreetagger(self):
        """Respond to user actions needed to validate Treetagger path.

        If the path is already known, only check that language parameter
        files are available (refreshing the GUI if so, warning otherwise).
        Else try to locate Treetagger automatically, then ask the user for
        its base directory; a path found this way is saved to app data and
        stored in ``self.TreetaggerPath``.
        """

        # If the Treetagger path is known, make sure there are language files...
        if self.TreetaggerPath:
            if self.getAvailableLanguages():
                self.sendButton.settingsChanged()
                self.updateGUI()
            else:
                QMessageBox.warning(None, 'Textable',
                                    'Language parameter files not found.',
                                    QMessageBox.Ok)
            return

        # Else if the path is not known...

        # First try to locate it automatically...
        TreetaggerPath = treetaggerwrapper.locate_treetagger()

        # If it fails, let the user locate it manually...
        if not (TreetaggerPath and self.checkTreetaggerPath(TreetaggerPath)):

            selectedDir = str(
                QFileDialog.getExistingDirectory(
                    self, u"Please locate Treetagger base directory"))

            # If user selected a dir... The test must be made BEFORE
            # normalizing the path: os.path.normpath("") returns ".", so a
            # cancelled dialog would otherwise be taken for a selection and
            # trigger the warning below.
            if selectedDir:
                TreetaggerManualPath = os.path.normpath(selectedDir)

                # Check if selected dir contains Treetagger binary...
                if self.checkTreetaggerPath(TreetaggerManualPath):
                    TreetaggerPath = TreetaggerManualPath
                else:
                    QMessageBox.warning(
                        None, 'Textable',
                        'Not a valid Treetagger base directory.',
                        QMessageBox.Ok)

        # If a valid path was found somehow, save config to app data...
        if TreetaggerPath:
            try:
                # Creates the parent directories of the config file as needed.
                configDir = os.path.normpath(
                    self.__class__.configFilePath + "/..")
                os.makedirs(configDir, exist_ok=True)
                with open(self.__class__.configFilePath, "w") as outputFile:
                    outputFile.write(TreetaggerPath)
            except IOError:
                # Saving is best-effort: the path is still used below.
                pass
            self.TreetaggerPath = TreetaggerPath

            self.sendButton.settingsChanged()

    def checkTreetaggerPath(self, path):
        """Tell whether *path* is a valid Treetagger base dir, i.e. whether
        the file 'bin/tree-tagger' ('bin/tree-tagger.exe' when os.name is
        'nt') exists under it."""
        binarySuffix = ".exe" if os.name == "nt" else ""
        binaryPath = os.path.normpath(path + "/bin/tree-tagger" + binarySuffix)
        return os.path.exists(binaryPath)

    def clearCreatedInputs(self):
        """Set to None the data stored for each Input created by this widget
        (addressed through the str_index of its first segment), then empty
        the list that keeps track of these Inputs."""
        for createdInput in self.createdInputs:
            firstSegment = createdInput[0]
            Segmentation.set_data(firstSegment.str_index, None)
        self.createdInputs.clear()

    def onDeleteWidget(self):
        """Free memory when the widget is deleted (overridden method): the
        Inputs created by this widget are cleared."""
        self.clearCreatedInputs()

    def setCaption(self, title):
        """Set the widget caption and flag a settings change when the
        caption is actually modified."""
        # 'captionTitle' may not exist yet when this is called; in that case
        # the caption is simply forwarded to the parent class.
        if 'captionTitle' in dir(self):
            # Compare BEFORE delegating, since the parent call may update
            # self.captionTitle (TODO confirm).
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
Exemple #5
0
class TopicModels(OWTextableBaseWidget):
    """Textable widget for building topic models based on a term-document matrix
    """

    #----------------------------------------------------------------------
    # Widget"s metadata...

    name = "Topic Models"
    description = "Build topic models based on term-document matrices"
    icon = "icons/topic_models.svg"
    priority = 10

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Textable Crosstab", PivotCrosstab, "input_data")]
    outputs = [
        ("Term-topic Textable table", PivotCrosstab, widget.Default),
        ("Document-topic Textable table", PivotCrosstab),
        ("Term-topic Orange table", Orange.data.Table, widget.Default),
        ("Document-topic Orange table", Orange.data.Table),
    ]

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    method = settings.Setting("Latent semantic indexing")
    numTopics = settings.Setting(10)

    want_main_area = False

    def __init__(self):
        """Widget creator: set up the attributes and build the interface."""
        super().__init__()

        # Other attributes...
        self.inputTable = None
        # Entries displayed in the list box at the bottom of the options box.
        self.listEntries = list()

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.send_data,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=self.updateGUI,
        )

        # User interface...

        # Options box: method, number of topics and list box.
        optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box="Options",
            orientation="vertical",
        )
        # Topic modelling method...
        method_combo = gui.comboBox(
            widget=optionsBox,
            master=self,
            value="method",
            items=[
                "Latent Dirichlet allocation",
                "Latent semantic indexing",
                "Correspondence analysis",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Method:",
            labelWidth=120,
            callback=self.sendButton.settingsChanged,
            tooltip=("Please select the desired topic modelling method.\n"),
        )
        method_combo.setMinimumWidth(120)
        gui.separator(widget=optionsBox, height=3)
        # Number of topics (1 to 999)...
        self.numTopicsSpin = gui.spin(
            widget=optionsBox,
            master=self,
            value='numTopics',
            minv=1,
            maxv=999,
            orientation='horizontal',
            label=u'Number of topics:',
            labelWidth=120,
            callback=self.sendButton.settingsChanged,
            keyboardTracking=False,
            tooltip=(
                u"Please select the desired number of topics in output tables."
            ),
        )
        gui.separator(widget=optionsBox, height=3)
        # List box bound to self.listEntries (tooltip text still to be
        # written)...
        gui.listBox(
            widget=optionsBox,
            master=self,
            labels='listEntries',
            tooltip=(u"TODO"),
        )

        gui.separator(widget=self.controlArea, height=3)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input", "warning")

        # Send data if autoSend.
        self.sendButton.sendIf()

        self.setMinimumWidth(350)
        self.adjustSizeWithTimer()

    def input_data(self, newInput):
        """Store the incoming table, notify the info box that the input
        changed, then hand over to the send button (sendIf)."""
        self.inputTable = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def send_data(self):
        """Compute result of widget processing and send to output"""

        # Check that there's a table in input...
        if self.inputTable is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.send("Term-topic Textable table", None)
            self.send("Document-topic Textable table", None)
            self.send("Term-topic Orange table", None)
            self.send("Document-topic Orange table", None)
            self.listEntries = list()
            return

        # Initialize progress bar.
        progressBar = gui.ProgressBar(
            self,
            iterations=1  # TODO
        )

        # Convert input table to gensim dictionary.
        dictionary, corpus = pivot_crosstab_to_gensim(self.inputTable)

        # Apply topic modelling...

        # Case 1: LDA...
        if self.method == "Latent Dirichlet allocation":

            model = models.LdaModel(
                corpus,
                id2word=dictionary,
                num_topics=self.numTopics,
            )

            # Create segment-topic PivotCrosstab table.
            values = dict()
            terms = list()
            for topic in xrange(self.numTopics):
                topic_terms = model.get_topic_terms(
                    topic,
                    len(self.inputTable.col_ids),
                )
                for term, score in topic_terms:
                    values[(dictionary[term], topic)] = score
                terms.append(
                    list(dictionary[t]
                         for t, s in topic_terms[:MAX_NUM_DISPLAYED_TERMS]))
            segmentTopicTable = PivotCrosstab(
                row_ids=self.inputTable.col_ids[:],
                col_ids=list(range(self.numTopics)),
                values=values,
                header_row_id='__topic__',
                header_row_type='continuous',
                header_col_id='__unit__',
                header_col_type='string',
                col_type=dict((col_id, 'continuous')
                              for col_id in range(self.numTopics)),
            )

            # Fill listbox...
            newListEntries = list()
            for topicNum in range(self.numTopics):
                displayedTerms = ", ".join(terms[topicNum])
                if len(self.inputTable.col_ids) > MAX_NUM_DISPLAYED_TERMS:
                    displayedTerms += ", ..."
                listEntry = "%i. %s" % (
                    topicNum + 1,
                    displayedTerms,
                )
                newListEntries.append(listEntry)
            self.listEntries = newListEntries

            # Create context-topic PivotCrosstab table...
            corpus_lda = model[corpus]
            values = dict()
            for row_idx, row in enumerate(self.inputTable.row_ids):
                lda_doc = corpus_lda[row_idx]
                for topic, score in lda_doc:
                    values[(row, topic)] = score
            contextTopicTable = PivotCrosstab(
                row_ids=self.inputTable.row_ids[:],
                col_ids=list(range(self.numTopics)),
                values=values,
                header_row_id='__topic__',
                header_row_type='continuous',
                header_col_id='__context__',
                header_col_type='string',
                col_type=dict((col_id, 'continuous')
                              for col_id in range(self.numTopics)),
                missing=0,
            )

        # Case 2: LSI...
        if self.method == "Latent semantic indexing":

            model = models.LsiModel(
                corpus,
                id2word=dictionary,
                num_topics=self.numTopics,
            )

            # Create segment-topic PivotCrosstab table.
            segmentTopicTable = PivotCrosstab.from_numpy(
                row_ids=self.inputTable.col_ids[:],
                col_ids=list(range(self.numTopics)),
                np_array=model.projection.u,
                header_row_id='__topic__',
                header_row_type='continuous',
                header_col_id='__unit__',
                header_col_type='string',
                col_type=dict((col_id, 'continuous')
                              for col_id in range(self.numTopics)),
            )

            # Fill listbox...
            colIds = np.array(self.inputTable.col_ids)
            newListEntries = list()
            # Subtask: compute total inertia, i.e. sum of eigenvalues of
            # doc-term matrix multiplied by its transposed...
            rect_matrix = self.inputTable.to_numpy()
            matrix_dims = self.inputTable.to_numpy().shape
            if matrix_dims[0] > matrix_dims[1]:
                square_matrix = np.dot(np.transpose(rect_matrix), rect_matrix)
            else:
                square_matrix = np.dot(rect_matrix, np.transpose(rect_matrix))
            total_inertia = sum(np.linalg.eigvals(square_matrix))
            for topicNum in range(self.numTopics):
                # Proportion of inertia is SQUARE of singular value divided by
                # total inertia, because n-th singular value = square root of
                # n-th eigenvalue (cf. compute total inertia above)...
                propInertia = model.projection.s[topicNum]**2 / total_inertia
                scores = model.projection.u[:, topicNum]
                sortedTerms = colIds[scores.argsort()[::-1]]
                if len(colIds) > MAX_NUM_DISPLAYED_TERMS:
                    displayedTerms = ", ".join(
                        sortedTerms[:MAX_NUM_DISPLAYED_TERMS // 2])
                    displayedTerms += ", ..., "
                    displayedTerms += ", ".join(
                        sortedTerms[-MAX_NUM_DISPLAYED_TERMS // 2:])
                else:
                    displayedTerms = ", ".join(sortedTerms)
                listEntry = "%i. (%.2f%%) %s" % (
                    topicNum + 1,
                    propInertia * 100,
                    displayedTerms,
                )
                newListEntries.append(listEntry)
            self.listEntries = newListEntries

            # Create context-topic PivotCrosstab table...
            contextTopicMatrix = corpus2dense(
                model[corpus], len(model.projection.s)).T / model.projection.s
            values = dict()
            for row_idx, row in enumerate(contextTopicMatrix):
                for topic, val in enumerate(row):
                    values[(self.inputTable.row_ids[row_idx], topic)] = val
            contextTopicTable = PivotCrosstab(
                row_ids=self.inputTable.row_ids[:],
                col_ids=list(range(self.numTopics)),
                values=values,
                header_row_id='__topic__',
                header_row_type='continuous',
                header_col_id='__context__',
                header_col_type='string',
                col_type=dict((col_id, 'continuous')
                              for col_id in range(self.numTopics)),
                missing=0,
            )

        # Case 2: Correspondence analysis...
        elif self.method == "Correspondence analysis":

            ca = correspondence(self.inputTable.to_numpy())

            # Create segment-topic PivotCrosstab table.
            segmentTopicTable = PivotCrosstab.from_numpy(
                row_ids=self.inputTable.col_ids[:],
                col_ids=list(range(self.numTopics)),
                np_array=ca.col_factors[:, range(self.numTopics)],
                header_row_id='__topic__',
                header_row_type='continuous',
                header_col_id='__unit__',
                header_col_type='string',
                col_type=dict((col_id, 'continuous')
                              for col_id in range(self.numTopics)),
            )

            # Fill listbox...
            colIds = np.array(self.inputTable.col_ids)
            newListEntries = list()
            total_inertia = sum(ca.inertia_of_axis())
            for topicNum in range(self.numTopics):
                propInertia = ca.inertia_of_axis()[topicNum] / total_inertia
                scores = np.array(ca.col_factors[:, topicNum])
                sortedTerms = colIds[scores.argsort()[::-1]]
                if len(colIds) > MAX_NUM_DISPLAYED_TERMS:
                    displayedTerms = ", ".join(
                        sortedTerms[:MAX_NUM_DISPLAYED_TERMS // 2])
                    displayedTerms += ", ..., "
                    displayedTerms += ", ".join(
                        sortedTerms[-MAX_NUM_DISPLAYED_TERMS // 2:])
                else:
                    displayedTerms = ", ".join(sortedTerms)
                listEntry = "%i. (%.2f%%) %s" % (
                    topicNum + 1,
                    propInertia * 100,
                    displayedTerms,
                )
                newListEntries.append(listEntry)
            self.listEntries = newListEntries

            # Create context-topic PivotCrosstab table.
            contextTopicTable = PivotCrosstab.from_numpy(
                row_ids=self.inputTable.row_ids[:],
                col_ids=list(range(self.numTopics)),
                np_array=ca.row_factors[:, range(self.numTopics)],
                header_row_id='__topic__',
                header_row_type='continuous',
                header_col_id='__unit__',
                header_col_type='string',
                col_type=dict((col_id, 'continuous')
                              for col_id in range(self.numTopics)),
            )

        # Set status to OK and report...
        self.infoBox.setText("Tables correctly sent to output.")
        progressBar.finish()

        # Clear progress bar.
        progressBar.finish()

        # Send tokens...
        self.send("Term-topic Textable table", segmentTopicTable)
        self.send("Document-topic Textable table", contextTopicTable)
        self.send(
            "Term-topic Orange table",
            segmentTopicTable.to_orange_table(),
        )
        self.send(
            "Document-topic Orange table",
            contextTopicTable.to_orange_table(),
        )

        self.sendButton.resetSettingsChangedFlag()

    def updateGUI(self):
        """Adjust the legal range of the topic-number spin box.

        With an input table and either the "Latent semantic indexing" or the
        "Correspondence analysis" method, the upper bound is one less than
        the smaller of the table's row and column counts. In every other
        situation (no input table, or any other method) the range is 1-999.
        """
        boundedMethods = (
            "Latent semantic indexing",
            "Correspondence analysis",
        )
        upperBound = 999
        table = self.inputTable
        if table is not None and self.method in boundedMethods:
            # Number of topics is capped by the smallest table dimension.
            upperBound = min(len(table.row_ids), len(table.col_ids)) - 1
        self.numTopicsSpin.setRange(1, upperBound)
# Example #6
# 0
class SpaCy(OWTextableBaseWidget):
    """Textable widget for NLP using spaCy.

    Every incoming segment is analyzed with the selected spaCy language
    model and re-segmented into tokens. Each token segment inherits a copy
    of the annotations of its parent segment, complemented with the token
    attributes listed in RELEVANT_KEYS (those whose value is not None).
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "spaCy"
    description = "Natural language processing using spaCy"
    icon = "icons/spacy.svg"
    priority = 21  # TODO

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text data", Segmentation, "inputData")]
    outputs = [("Linguistically analyzed data", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    model = settings.Setting("fr_core_news_sm")

    def __init__(self):
        """Widget creator: build the UI and load the selected model."""

        super().__init__()

        # Other attributes...
        self.inputSeg = None
        self.nlp = None

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        # User interface...

        # Tabs...
        self.tabs = QTabWidget()
        self.optionsTab = QWidget()
        self.modelManagerTab = QWidget()
        self.tabs.addTab(self.optionsTab, "Options")
        self.tabs.addTab(self.modelManagerTab, "Model manager")

        # Options tab...
        optionsTabBox = QHBoxLayout()

        optionsBox = gui.widgetBox(widget=self.optionsTab)

        self.modelComboBox = gui.comboBox(
            widget=optionsBox,
            master=self,
            value='model',
            label='Model: ',
            tooltip='Select the spaCy language model you want to use.',
            items=INSTALLED_MODELS,
            sendSelectedValue=True,
            callback=self.modelChanged,
        )

        optionsTabBox.addWidget(optionsBox)
        self.optionsTab.setLayout(optionsTabBox)

        # Model manager tab...
        modelManagerTabBox = QHBoxLayout()

        modelManagerBox = gui.widgetBox(widget=self.modelManagerTab)

        # TODO: Model manager UI

        modelManagerTabBox.addWidget(modelManagerBox)
        self.modelManagerTab.setLayout(modelManagerTabBox)

        self.controlArea.layout().addWidget(self.tabs)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input", "warning")

        # Load spaCy language model...
        self.modelChanged()

        # Send data if autoSend.
        self.sendButton.sendIf()

    def inputData(self, newInput):
        """Process incoming data."""
        self.inputSeg = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def modelChanged(self):
        """Respond to model change in UI: (re)load the selected model."""
        self.nlp = spacy.load(self.model)
        self.sendButton.settingsChanged()

    def sendData(self):
        """Compute result of widget processing and send to output.

        Sends None when there is no input. Otherwise every input segment is
        run through the loaded spaCy pipeline and one output segment is
        emitted per token. Any exception raised during the analysis is
        propagated, but the interface is re-enabled and the progress bar
        cleared first.
        """

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Linguistically analyzed data", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        tokenizedSegments = list()

        try:
            # Process each input segment...
            for segment in self.inputSeg:

                # Input segment attributes...
                inputContent = segment.get_content()
                inputAnnotations = segment.annotations
                inputString = segment.str_index
                inputStart = segment.start or 0

                # NLP analysis...
                doc = self.nlp(inputContent)

                # Process each token in input segment...
                for token in doc:
                    tokenAnnotations = inputAnnotations.copy()
                    for key in RELEVANT_KEYS:
                        value = getattr(token, key)
                        if value is not None:
                            tokenAnnotations[key] = value
                    # token.idx is relative to the segment's content, hence
                    # the shift by the segment's own start position.
                    tokenStart = inputStart + token.idx
                    tokenizedSegments.append(
                        Segment(
                            str_index=inputString,
                            start=tokenStart,
                            end=tokenStart + len(token),
                            annotations=tokenAnnotations,
                        ))

                progressBar.advance()
        finally:
            # Clear progress bar and unlock the interface, even if the
            # analysis failed: otherwise the widget would stay disabled.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSeg)
        message = pluralize(message, len(outputSeg))
        self.infoBox.setText(message)

        # Send data to output...
        self.send("Linguistically analyzed data", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class Treetagger(OWTextableBaseWidget):
    """Textable widget that tags and lemmatizes text with TreeTagger.

    The widget locates a TreeTagger installation (path remembered in a small
    text file), lists the languages whose parameter files are installed,
    runs the TreeTagger tokenizer and tagger as external commands on the
    incoming segmentation and outputs the tagged result.
    """

    #----------------------------------------------------------------------
    # Widget"s metadata...

    name = "Treetagger"
    description = "..."
    icon = "icons/icon_treetagger.png"
    priority = 1

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text Input", Segmentation, "processInputData")]
    outputs = [("Text data", Segmentation)]

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    autoSend = settings.Setting(False)
    unknown = settings.Setting(False)
    activer_xml = settings.Setting(False)

    want_main_area = False

    def __init__(self):
        """Widget creator."""
        super().__init__()

        # Attributes that are not settings...
        self.inputData = None
        self.system = os.name
        self.user = os.environ.get("USER")
        self.langues = list()
        self.created_inputs = list()
        self.language = 0
        self.check_firt_use = False
        self.createdInputs = list()
        self.compteur = 0
        self.NoLink = True

        # Supported languages: parameter file first, then (when available)
        # the abbreviations file, then any extra resource...
        self.langues_possibles = {
            "French": ["french.par", "french-abbreviations"],
            "English": ["english-utf8.par", "english-abbreviations"],
            "German": ["german-utf8.par", "german-abbreviations"],
            "Italian": ["italian-utf8.par", "italian-abbreviations"],
            "Swahili": ["swahili.par", "swahili-abbreviations"],
            "Portuguese": ["portuguese.par", "portuguese-abbreviations"],
            "Russian": ["russian.par", "russian-abbreviations"],
            "Spanish":
            ["spanish-utf8.par", "spanish-abbreviations", "spanish-mwls"],
            "Slovenian": ["slovenian-utf8.par"],
            "Slovak": ["slovak2-utf8.par"],
            "Romanian": ["romanian.par"],
            "Polish": ["polish-utf8.par"],
            "Mongolian": ["mongolian.par"],
            "Latin": ["latin.par"],
            "Galician": ["galician.par"],
            "Finnish": ["finnish-utf8.par"],
            "Estonian": ["estonian.par"],
            "Bulgarian": ["bulgarian-utf8.par"],
            "Spoken French": ["spoken-french.par", "french-abbreviations"]
        }

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(widget=self.controlArea,
                                     master=self,
                                     callback=self.sendData,
                                     infoBoxAttribute=u"infoBox",
                                     sendIfPreCallback=self.updateGUI)

        # User interface...

        # Option box...
        gui.separator(widget=self.controlArea, height=5)

        self.infoBox1 = gui.widgetBox(self.controlArea,
                                      u"Option",
                                      addSpace=True)

        # Language selection...
        self.langueBox = gui.comboBox(widget=self.infoBox1,
                                      master=self,
                                      value="language",
                                      items=self.langues,
                                      orientation=u"horizontal",
                                      label="Select text language :",
                                      callback=self.settings_changed)

        self.langueBox.setMaximumWidth(100)

        gui.separator(widget=self.controlArea, height=3)

        # Checkbox enabling XML output...
        self.choix_xml = gui.checkBox(widget=self.infoBox1,
                                      master=self,
                                      value="activer_xml",
                                      label=" Output with XML code",
                                      callback=self.settings_changed)

        # Checkbox controlling the '-no-unknown' tagger option...
        self.choix_unknown = gui.checkBox(widget=self.infoBox1,
                                          master=self,
                                          value="unknown",
                                          label=" Output without '[unknown]'",
                                          callback=self.settings_changed)

        # Box and button used to browse for the TreeTagger folder...
        self.treetagger_box = gui.widgetBox(
            self.controlArea,
            u"Please, enter a correct path to TreeTagger :",
            addSpace=True)

        gui.button(widget=self.treetagger_box,
                   master=self,
                   label="Browse",
                   callback=self.treetagger_search)

        gui.separator(widget=self.treetagger_box, height=3)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()

        # Send data if autoSend.
        self.sendButton.sendIf()

        # Adjust widget size.
        self.adjustSizeWithTimer()

        # Check the link to TreeTagger.
        self.treetagger_check()

    # Private helpers...

    def _link_file_path(self):
        """Return the path of the file remembering the TreeTagger folder.

        On Windows this is "treetagger_link.txt" in the current directory.
        Elsewhere the historical location "/Users/<USER>/" is used when it
        exists; otherwise (USER unset, or no such directory) the file is
        placed in the user's home directory.
        """
        if self.system == "nt":
            return "treetagger_link.txt"
        if self.user:
            legacy = os.path.normpath(
                "/Users/" + self.user + "/treetagger_link.txt")
            if os.path.isdir(os.path.dirname(legacy)):
                return legacy
        return os.path.join(
            os.path.expanduser("~"), "treetagger_link.txt")

    def _save_link(self):
        """Write the current TreeTagger folder to the link file."""
        with open(self._link_file_path(), "w") as link_file:
            link_file.write(self.treetagger_link)

    def _run_command(self, command):
        """Run *command* (list of arguments) and return its decoded stdout.

        The command goes through the shell on Windows only, as before.
        Undecodable bytes are ignored.
        """
        process = sp.Popen(
            command, stdout=sp.PIPE, shell=(self.system == "nt"))
        return process.communicate()[0]\
                      .decode(encoding="utf-8", errors="ignore")

    # ALL FUNCTIONS

    def treetagger_check(self):
        """Check the TreeTagger folder and update the interface accordingly.

        The folder is read from the link file when it exists, otherwise a
        platform default is used. The folder is valid when it contains both
        tokenizer scripts and the tagger binary.

        Returns:
            list of bool: one existence flag per required file.
        """
        # Files that the TreeTagger folder must contain...
        tokenize = os.path.normpath("/cmd/tokenize.pl")
        tokenize_utf8 = os.path.normpath("/cmd/utf8-tokenize.perl")
        treetagger = os.path.normpath("/bin/tree-tagger")
        if self.system == "nt":
            check_list = [tokenize, tokenize_utf8, treetagger + ".exe"]
        else:
            check_list = [tokenize, tokenize_utf8, treetagger]

        # Retrieve the saved link, or fall back to the platform default...
        link_file_path = self._link_file_path()
        if os.path.exists(link_file_path):
            with open(link_file_path, "r") as link_file:
                self.treetagger_link = link_file.read()
        elif self.system == "nt":
            # Raw string: "\T" is not a valid escape sequence.
            self.treetagger_link = os.path.normpath(r"C:\TreeTagger")
        else:
            self.treetagger_link = os.path.normpath(
                "/Applications/TreeTagger")

        # Check that every required file exists...
        liste = [
            os.path.exists(self.treetagger_link + check)
            for check in check_list
        ]

        # Invalid folder: show the Browse button and lock the options...
        if False in liste:
            self.NoLink = True
            self.treetagger_box.setVisible(True)
            self.infoBox1.setDisabled(True)

            # NOTE: the line continuations below are inside the string
            # literals; their leading whitespace is part of the messages
            # and is kept unchanged.
            if self.check_firt_use is False:
                self.infoBox.setText(
                    u"Please click 'Browse' and select the path \
                    to TreeTagger base folder. ", "warning")
            else:
                self.infoBox.setText(
                    u"Sorry, TreeTagger's link isn't correct.", "error")

        # Valid folder: hide the Browse button and unlock the options...
        else:
            if self.check_firt_use is True:
                self.infoBox.setText(
                    u"TreeTagger's link is correct !\n\n \
                    Now, Widget needs input.", "warning")
            else:
                self.infoBox.setText(u"Widget needs input.", "warning")

            # Display installed languages.
            self.language_possibility()
            for langue_actualise in self.langues:
                self.langueBox.addItem(langue_actualise)

            # Update interface state.
            self.NoLink = False
            self.treetagger_box.setVisible(False)
            self.infoBox1.setDisabled(False)

        self.saveSettings()

        return liste

    def treetagger_search(self):
        """Let the user browse for the TreeTagger folder, then re-check it.

        Nothing happens when the dialog is cancelled: the previous link and
        the saved link file are left untouched.
        """
        selected = str(
            QFileDialog.getExistingDirectory(
                self, u"Enter a path to Treetagger"))

        # The dialog returns an empty string when cancelled; normalizing it
        # would turn it into "." and overwrite the saved link.
        if not selected:
            return

        self.treetagger_link = os.path.normpath(selected)

        # Remember the selected folder for future sessions.
        self._save_link()

        self.check_firt_use = True

        # Check whether the selected folder is valid.
        self.treetagger_check()

    def language_possibility(self):
        """Detect the installed languages and store them in self.langues.

        A language is installed when every file listed for it in
        self.langues_possibles exists in the "lib" folder of TreeTagger.

        Returns:
            list of str: names of the installed languages.
        """
        lib_dir = self.treetagger_link + "/lib/"
        langues_presentes = [
            langue
            for langue, fichiers in self.langues_possibles.items()
            if all(
                os.path.isfile(os.path.normpath(lib_dir + fichier))
                for fichier in fichiers
            )
        ]

        self.langues = langues_presentes

        return langues_presentes

    def processInputData(self, inputData):
        """Store incoming data and send output if required."""
        self.inputData = inputData

        # Only signal an input change once something has been sent.
        if self.compteur != 0:
            self.infoBox.inputChanged()

        # Send data to output.
        self.sendButton.sendIf()

    def sendData(self):
        """Tag the input with TreeTagger and send the result to output.

        Sends None when the TreeTagger folder is not valid or when there is
        no input. Otherwise the input segments are wrapped in <xb_tt>
        elements carrying their annotations, tagged, and each tagged line is
        recoded as a <w lemma='...' type='...'> element; unless XML output
        is requested, these <w> elements are then imported as segments.
        """
        # TreeTagger folder not found...
        if self.NoLink:
            self.infoBox.setText(u"Sorry, TreeTagger's link not found.",
                                 "error")
            self.send("Text data", None)
            return

        # Important: if input data is None, propagate this value to output...
        if not self.inputData:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Text data", None)
            return

        self.infoBox.setText(u"TreeTagger is running...", "warning")

        # Initialize progress bar (3 steps here, 2 in self.tag).
        self.progressBar = gui.ProgressBar(self, iterations=5)

        try:
            # Build a segmentation whose segments carry their annotations
            # serialized in the "tt_xb" annotation...
            # NOTE(review): the input segments themselves are annotated
            # (as before); "tt_xb" is skipped when serializing so that a
            # second send does not embed the previous value into itself.
            copy_of_input_seg = Segmentation()
            copy_of_input_seg.label = self.inputData.label
            for segment in self.inputData:
                attr = " ".join(
                    "%s='%s'" % item
                    for item in segment.annotations.items()
                    if item[0] != "tt_xb"
                )
                segment.annotations["tt_xb"] = attr
                copy_of_input_seg.append(segment)

            self.progressBar.advance()

            concatenated_text = copy_of_input_seg.to_string(
                formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
                display_all=True,
            )

            self.progressBar.advance()

            tagged_text = self.tag(concatenated_text)
            tagged_input = Input(tagged_text)
            tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

            self.progressBar.advance()

            # Recode each "token<TAB>type<TAB>lemma" line as a <w> element.
            xml_segmentation, _ = Segmenter.recode(
                tagged_segmentation,
                substitutions=[
                    (re.compile(r"<unknown>"), "[unknown]"),
                    (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                     "<w lemma='&3' type='&2'>&1</w>"),
                    (re.compile(r'"""'), '"&quot;"'),
                ],
            )

            if self.activer_xml:
                # XML output: keep the <w> markup in the text.
                final_segmentation = xml_segmentation
            else:
                # Otherwise turn every <w> element into a segment.
                final_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w")

            self.infoBox.dataSent("")

            # Remember the TreeTagger folder.
            self._save_link()
        finally:
            # Clear progress bar, even if tagging failed.
            self.progressBar.finish()

        # Send the segmentation.
        self.send("Text data", final_segmentation, self)
        self.compteur += 1
        self.sendButton.resetSettingsChangedFlag()

    def tag(self, inputData):
        """Tokenize and tag *inputData* with TreeTagger.

        Args:
            inputData (str): text to process.

        Returns:
            str: standard output of the tagger (decoded as UTF-8,
            undecodable bytes ignored).
        """
        import tempfile

        langue = self.langues[self.language]
        fichiers = self.langues_possibles[langue]
        lib_dir = self.treetagger_link + "/lib/"

        # Language option of the UTF-8 tokenizer, when there is one...
        option = {
            "French": "-f",
            "English": "-e",
            "Italian": "-i",
        }.get(langue, "")

        # Intermediate files live in a private temporary directory which is
        # removed on exit, instead of fixed names in the home directory.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp = os.path.join(tmp_dir, "tmp_file.txt")
            tmp2 = os.path.join(tmp_dir, "tmp_file2.txt")

            # Write the text to the first file.
            with open(tmp, "w", encoding="utf-8") as f:
                f.write(inputData)

            # Perl command splitting the text into words...
            if option:
                commande1 = [
                    "perl",
                    os.path.normpath(
                        self.treetagger_link + "/cmd/utf8-tokenize.perl"),
                    option,
                ]
            else:
                commande1 = [
                    "perl",
                    os.path.normpath(
                        self.treetagger_link + "/cmd/tokenize.pl"),
                ]
            # Several languages only ship a parameter file: pass "-a" only
            # when an abbreviations file is declared.
            if len(fichiers) > 1:
                commande1 += ["-a", os.path.normpath(lib_dir + fichiers[1])]
            commande1.append(tmp)

            out = self._run_command(commande1)
            if self.system == "nt":
                out = out.replace("\r", "")

            self.progressBar.advance()

            # Write the tokenized text to the second file; newline="" keeps
            # line endings untranslated.
            with open(tmp2, "w", encoding="utf-8", newline="") as f:
                f.write(out)

            if self.system == "nt":
                bin_treetagger = "/bin/tree-tagger.exe"
            else:
                bin_treetagger = "/bin/tree-tagger"

            # Tag the text with type and lemma...
            commande2 = [
                os.path.normpath(self.treetagger_link + bin_treetagger),
                os.path.normpath(lib_dir + fichiers[0]),
                "-token",
                "-lemma",
                "-sgml",
            ]
            if self.unknown:
                commande2.append("-no-unknown")
            commande2 += ["-quiet", tmp2]

            outtext = self._run_command(commande2)

        self.progressBar.advance()

        return outtext

    def updateGUI(self):
        """Update GUI state"""
        pass

    def clearCreatedInputs(self):
        """Delete all Input objects that have been created."""
        for i in self.createdInputs:
            Segmentation.set_data(i[0].str_index, None)
        del self.createdInputs[:]

    def settings_changed(self):
        """Signal a settings change, but only once output has been sent."""
        if self.compteur > 0:
            return self.sendButton.settingsChanged()

    def onDeleteWidget(self):
        """Free memory when widget is deleted (overriden method)"""
        self.clearCreatedInputs()

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class Charnetto(OWTextableBaseWidget):
    """Textable widget for building character networks with Charnetto.

    The widget runs a spaCy model (through the charnetto package) over the
    content of the incoming segmentation, lets the user edit the resulting
    character list (one entry per character, aliases separated by ", "),
    and outputs one segment per token tagged "PER", annotated with the
    token's alias under the key "id".
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Charnetto"
    description = "Build character networks with the Charnetto package"
    icon = "icons/charnetto.svg"
    priority = 20

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text data", Segmentation, "inputData")]
    outputs = [("Character segmentation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    sourceType = settings.Setting("Plain text")
    minFreq = settings.Setting(1)
    model = settings.Setting("")

    #----------------------------------------------------------------------
    # The following lines need to be copied verbatim in every Textable widget...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    def __init__(self):
        """Widget creator."""

        super().__init__()

        #----------------------------------------------------------------------
        # Other (non settings) attributes...

        self.inputSeg = None
        self.selectedCharacters = list()
        self.characters = list()
        self.editsWereMade = False

        # These attributes are filled in later by loadModel() and
        # updateCharacterList(); they are initialized here so that every
        # method can safely test them before any input has been processed.
        self.nlp = None
        self.char_df = None
        self.char_list = list()
        self.cachedCharacters = list()

        # Keep the model restored from the saved settings when it is still
        # installed; otherwise fall back to the first installed model.
        self.mustInstall = not spacy_widget.INSTALLED_MODELS
        if self.mustInstall:
            self.model = ""
        elif self.model not in spacy_widget.INSTALLED_MODELS:
            self.model = spacy_widget.INSTALLED_MODELS[0]

        #----------------------------------------------------------------------
        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...

        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        #----------------------------------------------------------------------
        # User interface...

        # Options box...
        self.optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box="Options",
            orientation="vertical",
        )

        # NOTE: a "Source type:" combo box (bound to the sourceType setting,
        # items "Plain text" / "IMSDB-formatted script", callback
        # changeSourceType) is currently disabled; only plain text is
        # processed.

        self.spacyModelCombo = gui.comboBox(
            widget=self.optionsBox,
            master=self,
            value="model",
            sendSelectedValue=True,
            items=spacy_widget.INSTALLED_MODELS,
            orientation="horizontal",
            label="SpaCy model:",
            labelWidth=120,
            callback=self.loadModel,
            tooltip=("Choose spaCy model for named entity recognition."),
        )

        # NOTE: a "Minimum frequency:" spin box (bound to the minFreq
        # setting, range 1-1000) is currently disabled as well.

        # Character box...
        self.characterBox = gui.widgetBox(
            widget=self.controlArea,
            box="Edit character list",
            orientation="vertical",
        )

        self.characterListbox = gui.listBox(
            widget=self.characterBox,
            master=self,
            value="selectedCharacters",
            labels="characters",
            callback=self.updateButtons,
            tooltip="List of identified characters",
        )
        # TODO set min height

        self.characterButtonBox = gui.widgetBox(
            widget=self.characterBox,
            orientation="horizontal",
        )

        self.newButton = gui.button(
            widget=self.characterButtonBox,
            master=self,
            label="New",
            callback=self.newCharacter,
            tooltip="Add a new entry to the character list.",
        )

        self.editButton = gui.button(
            widget=self.characterButtonBox,
            master=self,
            label="Edit",
            callback=self.editCharacters,
            tooltip="Edit the selected character list entry.",
        )

        self.deleteButton = gui.button(
            widget=self.characterButtonBox,
            master=self,
            label="Delete",
            callback=self.deleteCharacter,
            tooltip="Delete the selected character list entry.",
        )

        self.resetButton = gui.button(
            widget=self.characterButtonBox,
            master=self,
            label="Reset",
            callback=self.resetCharacters,
            tooltip="Revert all changes made to character list.",
        )

        self.updateButtons()

        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button...

        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input.", "warning")

        # Check that there's a model...
        if self.mustInstall:
            self.noLanguageModelWarning()
        else:
            self.loadModel()

    def inputData(self, newInput):
        """Process incoming data.

        Incoming data is ignored as long as no spaCy model is installed.
        A None input clears the character list and sends None to the
        outputs; any other input triggers a new character extraction.
        """
        if self.mustInstall:
            return
        self.inputSeg = newInput
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            self._clearCharacterList()
            return
        self.updateCharacterList()
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def _clearCharacterList(self):
        """Forget the character list, its cached copy and pending edits."""
        self.characters = list()
        self.cachedCharacters = list()
        self.char_df = None
        self.char_list = list()
        self.editsWereMade = False
        self.updateButtons()

    def updateCharacterList(self):
        """Update character list based on Charnetto output.

        Does nothing when no model is selected. An empty input clears the
        list. User edits made to a previous list are discarded, since the
        list is rebuilt from scratch.
        """
        # Sanity checks...
        if not self.model:
            return
        if not self.inputSeg:
            self._clearCharacterList()
            return

        # Init UI...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=4)

        # The try/finally guarantees that the UI is re-enabled even if
        # spaCy or charnetto raises (the exception still propagates).
        try:
            # Get input strings...
            strings = [segment.get_content() for segment in self.inputSeg]
            progressBar.advance()

            # Extract character tokens (plain text only for now; the
            # disabled IMSDB mode would use charnetto.extract_movie_df on
            # the joined strings instead)...
            self.char_df = charnetto.extract_spacy_df(strings, self.nlp)

            # TODO deal with \n in names
            progressBar.advance()

            # Unify spaCy tags to match those of flair...
            self.char_df = charnetto.unify_tags(self.char_df)
            progressBar.advance()

            # Collapse characters whose name is the prefix of another.
            self.char_list = charnetto.concatenate_parents(
                self.char_df, min_occ=1
            )

            # Build char list.
            self.characters = [", ".join(char) for char in self.char_list]
            progressBar.advance()
        finally:
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Cache character list for resetting if needed; the freshly built
        # list carries no user edit, so the Reset button must be disabled.
        self.cachedCharacters = self.characters[:]
        self.editsWereMade = False
        self.updateButtons()

    def loadModel(self):
        """(Re-)load language model if needed."""
        # Display warning, disable UI and initialize progress bar...
        self.infoBox.setText(
            u"Loading language model, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=1)

        # Load model; the UI is reset even if loading fails.
        try:
            self.nlp = spacy.load(spacy_widget.AVAILABLE_MODELS[self.model])
            progressBar.advance()
        finally:
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Update char list if there's an input...
        if self.inputSeg:
            self.updateCharacterList()

        self.sendButton.settingsChanged()

    def noLanguageModelWarning(self):
        """Warn user that a spaCy model must be installed and disable GUI."""
        self.infoBox.setText(
            "Please first use the spaCy widget to download a language "
            "model, then create a new copy of the Charnetto widget.",
            "warning",
        )
        self.controlArea.setDisabled(True)

    def changeSourceType(self):
        """Deal with user-requested source type change.

        Only reachable through the (currently disabled) source type combo
        box.
        """
        self.spacyModelCombo.setDisabled(self.sourceType ==
            "IMSDB-formatted script")

        # Update char list if there's an input...
        if self.inputSeg:
            self.updateCharacterList()

        self.sendButton.settingsChanged()

    def _getSelectedIndex(self):
        """Return the index of the first selected entry, or None if the
        selection is empty or no longer points inside the character list."""
        if not self.selectedCharacters:
            return None
        selected_idx = self.selectedCharacters[0]
        if not 0 <= selected_idx < len(self.characters):
            return None
        return selected_idx

    def newCharacter(self):
        """Add new character to list."""
        new_value, ok = QInputDialog.getText(self, "New character",
            "Enter new line:")
        if ok and self.checkInputValidity(new_value):
            self.editsWereMade = True
            self.characters.append(str(new_value))
            # Re-assigning the attribute is what notifies the list box.
            self.characters = self.characters
            # Enable the Reset button right away.
            self.updateButtons()
            self.sendButton.settingsChanged()

    def editCharacters(self):
        """Deal with user requested edition of character in list."""
        selected_idx = self._getSelectedIndex()
        if selected_idx is None:
            return
        old_value = self.characters[selected_idx]
        new_value, ok = QInputDialog.getText(self, "Edit character",
            "Enter new value for this line:", text=old_value)
        if ok and self.checkInputValidity(new_value):
            if new_value != old_value:
                self.editsWereMade = True
                self.characters[selected_idx] = str(new_value)
                self.characters = self.characters
                self.updateButtons()
                self.sendButton.settingsChanged()

    def deleteCharacter(self):
        """Deal with user requested deletion of character in list."""
        selected_idx = self._getSelectedIndex()
        if selected_idx is None:
            return
        old_value = self.characters[selected_idx]
        answer = QMessageBox.question(self, "Delete character",
            f"Do you really want to delete line '{old_value}'")
        if answer == QMessageBox.Yes:
            self.editsWereMade = True
            del self.characters[selected_idx]
            self.characters = self.characters
            # The deleted entry cannot stay selected.
            self.selectedCharacters = list()
            self.updateButtons()
            self.sendButton.settingsChanged()

    def resetCharacters(self):
        """Revert all edits to character list."""
        self.characters = self.cachedCharacters[:]
        self.editsWereMade = False
        self.updateButtons()
        self.sendButton.settingsChanged()

    def checkInputValidity(self, value):
        """Check validity of user-submitted character list entry.

        An entry is valid when it is a nonempty string made of nonempty
        items separated by ", ". A warning dialog is shown and False is
        returned otherwise.
        """
        if value == "":
            QMessageBox.warning(self, "Invalid input",
                "Please submit a nonempty string value.")
            return False
        if "" in value.split(", "):
            QMessageBox.warning(self, "Invalid input",
                "Please make sure your entry consists in nonempty strings "
                "separated by \", \".")
            return False
        return True

    def updateButtons(self):
        """Enable/disable buttons depending on selection in list."""
        noSelection = len(self.selectedCharacters) == 0
        self.editButton.setDisabled(noSelection)
        self.deleteButton.setDisabled(noSelection)
        self.resetButton.setDisabled(not self.editsWereMade)

    def sendNoneToOutputs(self):
        """Send None token to all output channels."""
        for channel in self.outputs:
            self.send(channel.name, None, self)

    def _getSegmentBoundaries(self):
        """Return (startPositions, endPositions) of the input segments
        within the concatenation of their contents."""
        lengths = [len(segment.get_content()) for segment in self.inputSeg]

        # Each segment starts one position after the end of the previous
        # one (presumably to account for a separator inserted between
        # strings by charnetto -- TODO confirm).
        startPositions = [0]
        for length in lengths[:-1]:
            startPositions.append(startPositions[-1] + length + 1)

        # A segment ends right before the start of the next one; the last
        # segment keeps the extra position used by the original code.
        endPositions = [start - 1 for start in startPositions[1:]]
        endPositions.append(startPositions[-1] + lengths[-1] + 1)
        return startPositions, endPositions

    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Check that there's a model...
        if not self.model:
            self.noLanguageModelWarning()
            self.sendNoneToOutputs()
            return

        # Check that there's a (nonempty, already analyzed) input...
        if not self.inputSeg or self.char_df is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            return

        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        # Disable control area and initialize progress bar...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.char_df))

        # The try/finally guarantees that the UI is re-enabled even if the
        # processing raises (the exception still propagates).
        try:
            # Get start and end pos of concatenated input segments...
            startPositions, endPositions = self._getSegmentBoundaries()

            # Get or update character aliases...
            find_pairs = sys.modules['charnetto.find_pairs']
            characters = [entry.split(", ") for entry in self.characters]
            find_pairs.map_names(self.char_df, characters)

            # Initializations...
            charSegments = list()
            currentSegmentIdx = 0

            # For each character token in Charnetto's output...
            for _, charToken in self.char_df.iterrows():

                # One step per token, whether or not it is kept, so that
                # the progress bar matches its number of iterations.
                progressBar.advance()

                # Skip non-PER named entities.
                if charToken["tag"] != "PER":
                    continue

                # Get index of containing segment...
                while charToken["end_pos"] > endPositions[currentSegmentIdx]:
                    currentSegmentIdx += 1

                # Create segment for char with its actual coordinates...
                strIndex = self.inputSeg[currentSegmentIdx].str_index
                offset = startPositions[currentSegmentIdx]
                start = charToken["start_pos"] - offset
                end = charToken["end_pos"] - offset
                annotations = {"id": charToken["alias"]}
                charSegments.append(
                    Segment(strIndex, start, end, annotations)
                )

            # Send output...
            outputSegmentation = Segmentation(charSegments,
                                               label=self.captionTitle)
            self.send("Character segmentation", outputSegmentation, self)

            # Set status to OK and report data size...
            message = "%i segment@p sent to output." % len(outputSegmentation)
            message = pluralize(message, len(outputSegmentation))
            self.infoBox.setText(message)
        finally:
            # Clear progress bar.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()

    #----------------------------------------------------------------------
    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
class LexicalHunter(OWTextableBaseWidget):
    """Textable widget for identifying lexical fields in segments.

    Segments of the input segmentation whose content matches an entry of
    one of the selected lexical lists are annotated with the name of that
    list, under a user-defined annotation key ("Topic" by default).
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Lexical Hunter"
    description = "Identify words contained in lists (lexical fields)"
    icon = "icons/lexical_hunter.svg"
    priority = 22

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Word segmentation", Segmentation, "inputData")]
    outputs = [("Segmentation with annotations", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    savedDict = settings.Setting({})
    selectedFields = settings.Setting([])
    autoSend = settings.Setting(False)
    labelName = settings.Setting("Topic")

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Other attributes...
        self.inputSeg = None
        self.outputSeg = None
        self.titleLabels = []
        # Put the saved dictionaries, if any, in the module-level
        # defaultDict.
        if self.savedDict:
            defaultDict.clear()
            defaultDict.update(self.savedDict)

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=self.updateGUI,
        )

        # User interface...

        # Options box...
        titleLabelsList = gui.widgetBox(
            widget=self.controlArea,
            box="Click to select the lexical lists",
            orientation="vertical",
        )
        # List of lexical lists that the user can select
        self.titleListbox = gui.listBox(
            widget=titleLabelsList,
            master=self,
            value="selectedFields",  # setting (list)
            labels="titleLabels",  # attribute (list)
            callback=self.sendButton.settingsChanged,
            tooltip=(
                "The list of lexical list that you want "
                "to use for annotation"
            ),
        )
        self.titleListbox.setMinimumHeight(150)
        # 2 is the value of QAbstractItemView.MultiSelection.
        self.titleListbox.setSelectionMode(2)

        # Edit a list ...
        self.OptionList = gui.button(
            widget=titleLabelsList,
            master=self,
            label="Edit lists",
            callback=self.editList,
            width=100,
        )

        self.titleEdit = gui.lineEdit(
            widget=self.controlArea,
            master=self,
            value="labelName",
            label="Annotation key : ",
            orientation="horizontal",
        )
        self.titleEdit.setPlaceholderText("Topic")

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()

        # Show the lists in the box
        self.setTitleList()
        # Send data if autoSend.
        self.sendButton.sendIf()

    def getDefaultLists(self):
        """DEPRECATED. Load the default lexical lists stored in txt files.

        Every "*.txt" file of the "lexicalfields" folder located next to
        this module is read as UTF-8; its lines are stored in the
        module-level defaultDict under the file name without extension.
        A warning dialog is shown, and loading stops, as soon as a file
        cannot be opened.
        """
        # Setting the path of the files...
        location = os.path.join(
            os.path.realpath(
                os.path.join(os.getcwd(), os.path.dirname(__file__))
            ),
            "lexicalfields",
        )

        # Initiations
        self.myContent = {}

        # For each txt file in the directory...
        extension = ".txt"
        for entry in os.listdir(location):
            if not entry.endswith(extension):
                continue

            # The list is named after the file, minus its extension.
            lexicName = entry[:-len(extension)]

            # Try to read the file; the context manager closes it even if
            # reading fails.
            try:
                with codecs.open(
                    os.path.join(location, entry), encoding='utf-8'
                ) as fileHandle:
                    fileContent = fileHandle.read()
            except IOError:
                QMessageBox.warning(None, 'Textable',
                                    "Couldn't open file.", QMessageBox.Ok)
                return
            defaultDict[lexicName] = fileContent.split('\n')

    def setTitleList(self):
        """Display the sorted names of the lexical lists in the list box.

        Be careful, the order really matters for the selectedFields
        variable, which stores positions in this sorted list!
        """
        self.titleLabels = sorted(defaultDict.keys())
        # Save the dictionary used to display the list as a setting.
        self.savedDict.clear()
        self.savedDict.update(defaultDict)

    def editList(self):
        """Open the list edition widget and refresh the displayed lists."""
        self.widgetEdit = WidgetEditList(self)
        self.widgetEdit.show()
        self.setTitleList()

    def inputData(self, newInput):
        """Process incoming data."""
        # Treat the new input as the input segmentation.
        self.inputSeg = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def sendData(self):
        """Compute result of widget processing and send to output."""

        # An input is needed
        if self.inputSeg is None:
            self.infoBox.setText("A segmentation input is needed.", "warning")
            self.send("Segmentation with annotations", None, self)
            return

        # At least one lexical list must exist
        if not self.titleLabels:
            self.infoBox.setText(
                "You need to define at least one lexical list.", "error")
            self.send("Segmentation with annotations", None, self)
            return

        # A list must have been selected
        if not self.selectedFields:
            self.infoBox.setText("Please select one or more lexical lists.",
                                 "warning")
            self.send("Segmentation with annotations", None, self)
            return

        # NOTE: an empty annotation key is not an error; huntTheLexic()
        # falls back to "Topic" in that case.

        self.huntTheLexic()

        # Set status to OK and report data size...
        message = "%i segment@p sent to output " % len(self.outputSeg)
        message = pluralize(message, len(self.outputSeg))

        # Segmentation go to outputs...
        self.send("Segmentation with annotations", self.outputSeg, self)
        self.infoBox.setText(message)

        self.sendButton.resetSettingsChangedFlag()

    def huntTheLexic(self):
        """Main processing function of the widget.

        Selects, for each selected lexical list, the input segments that
        match one of its entries, and stores in self.outputSeg a copy of
        the input segmentation in which these segments are annotated with
        the name of the list they belong to.
        """

        # initiations...
        out = list()

        # first we select the topics according to the ones the user chose
        # (positions that no longer exist in the list are ignored)
        labels = list(self.titleLabels) if self.titleLabels else list()
        selectedListsNames = {
            labels[idx] for idx in self.selectedFields
            if 0 <= idx < len(labels)
        }

        # we can then associate the topics with their respective lists
        selectedLists = {
            key: value
            for key, value in defaultDict.items() if key in selectedListsNames
        }

        # if we have an input, we can select the segments of the input and
        # label them according to the lists they are found in
        if self.inputSeg is not None:
            for filter_list, entries in selectedLists.items():
                # empty entries (e.g. blank lines) are ignored
                work_list = [entry for entry in entries if entry]
                if work_list:
                    out.append(
                        Segmenter.select(
                            self.inputSeg,
                            self.listToRegex(work_list),
                            label=filter_list,
                        )[0])

        # lastly we define the output as a segmentation that is a copy of
        # the input, with the segments that we found labeled accordingly
        if self.labelName == "":
            labelNameVar = "Topic"
        else:
            labelNameVar = self.labelName

        self.outputSeg = Segmenter.concatenate(
            [Segmenter.bypass(self.inputSeg, label="__None__")] + out,
            merge_duplicates=True,
            label=self.captionTitle,
            import_labels_as=labelNameVar,
        )

    def updateGUI(self):
        """Update GUI state"""

        if self.titleLabels:
            # Re-assigning the attribute refreshes the selection display.
            self.selectedFields = self.selectedFields

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...
    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle

            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)

    # Helper used by huntTheLexic() to match the entries of a list.
    def listToRegex(self, list):
        """Return a compiled regex matching any element of a list.

        The regex is case-insensitive and anchored: it only matches
        strings that are equal to one of the elements. Elements are
        escaped, so that characters such as "+", "." or "(" are matched
        literally instead of being interpreted as regex syntax.
        """
        alternatives = "|".join(re.escape(item) for item in list)
        return re.compile("^(" + alternatives + ")$", re.IGNORECASE)
# Example #10
# 0
class OWTextableTextTree(OWTextableBaseWidget):
    """Orange widget for loading text folders"""

    # Widget metadata...
    name = "Text Tree"
    description = "Import data from raw text trees"

    # Only the effective icon is declared: an earlier assignment of
    # "icons/Textfolders.png" was immediately overridden by this value.
    icon = "icons/textTree.svg"

    priority = 2

    # Input and output channels...
    inputs = [('Message', JSONMessage, "inputMessage", widget.Single)]
    outputs = [('Text data', Segmentation)]

    # Settings are versioned on the major.minor part of the package version.
    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    # Settings...
    autoSend = settings.Setting(True)
    folders = settings.Setting([])
    encoding = settings.Setting('iso-8859-1')
    operation = settings.Setting('nothing')
    sampling = settings.Setting(100)
    autoNumber = settings.Setting(False)
    autoNumberKey = settings.Setting(u'num')
    importFilenames = settings.Setting(True)
    importFolderName = settings.Setting(True)

    lastLocation = settings.Setting('.')
    displayAdvancedSettings = settings.Setting(False)
    folder = settings.Setting(u'')

    # Layout parameters...
    want_main_area = False

    def __init__(self, *args, **kwargs):
        """Initialise the widget state and build its control area.

        The method first resets the non-persistent attributes, then creates
        the info box, the send button and the advanced-settings toggle, and
        finally lays out the basic GUI (one folder path), the advanced GUI
        (folder list, filters, sampling) and the options box. A first
        conditional send is scheduled through a zero-delay timer.
        """
        super().__init__(*args, **kwargs)

        # Other attributes...
        self.segmentation = None
        # NOTE(review): this replaces the value of the 'operation' setting
        # declared at class level -- confirm that this is intended.
        self.operation = "no"
        self.applyInclusion = False
        self.applyExclusion = False
        self.applySampling = True
        self.samplingRate = 100
        self.createdInputs = list()
        self.folderLabels = list()
        self.selectedFolderLabels = list()
        self.rootFolderPath = u''
        self.inclusionsUser = u''
        self.exclusionsUser = u''
        self.newAnnotationKey = u''
        self.newAnnotationValue = u''

        # self.folder is a dictionary whose keys are: 'rootPath', 'maxDepth',
        # 'inclusionsUser', 'exclusionsUser', 'samplingRate' and 'fileList'
        self.folder = dict()

        # self.folders is a list of previously defined "self.folder" dictionaries
        self.folders = list()

        # self.inclusionList is the default inclusion list (used in minimal mode, ...
        # ... and in advanced mode when no inclusion has been selected)
        self.inclusionList = [".txt", ".html", ".xml", ".csv", ".rtf"]

        # self.exclusionList is the default (empty) exclusion list
        # NOTE(review): presumably the counterpart of self.inclusionList;
        # no use of it is visible in this part of the file -- confirm.
        self.exclusionList = []

        self.infoBox = InfoBox(widget=self.controlArea)

        # The send button refreshes the GUI (updateGUI) before each
        # conditional send and reports through the info box.
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute='infoBox',
            sendIfPreCallback=self.updateGUI,
        )
        self.advancedSettings = AdvancedSettings(
            widget=self.controlArea,
            master=self,
            callback=self.sendButton.settingsChanged,
        )

        # GUI...

        # Advanced settings checkbox...
        self.advancedSettings.draw()

        # BASIC GUI...

        # Basic folder box
        basicFolderBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Source',
            orientation='vertical',
            addSpace=False,
        )
        basicFolderBoxLine1 = gui.widgetBox(
            widget=basicFolderBox,
            box=False,
            orientation='horizontal',
        )
        # In basic mode, validating the path field directly calls add().
        gui.lineEdit(
            widget=basicFolderBoxLine1,
            master=self,
            value='rootFolderPath',
            orientation='horizontal',
            label=u'Folder path:',
            labelWidth=101,
            callback=self.add,
            tooltip=(u"The path of the folder."),
        )
        gui.separator(widget=basicFolderBoxLine1, width=5)
        gui.button(
            widget=basicFolderBoxLine1,
            master=self,
            label=u'Browse',
            callback=self.browse,
            tooltip=(u"Open a dialog for selecting a top folder."),
        )

        gui.separator(widget=basicFolderBox, width=3)
        self.advancedSettings.basicWidgets.append(basicFolderBox)
        self.advancedSettings.basicWidgetsAppendSeparator()

        # ADVANCED GUI...

        # folder box
        folderBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Sources',
            orientation='vertical',
            addSpace=False,
        )
        folderBoxLine1 = gui.widgetBox(
            widget=folderBox,
            box=False,
            orientation='horizontal',
            addSpace=True,
        )
        # List box bound to 'folderLabels' (displayed rows) and
        # 'selectedFolderLabels' (indices of the selected rows).
        self.folderListbox = gui.listBox(
            widget=folderBoxLine1,
            master=self,
            value='selectedFolderLabels',
            labels='folderLabels',
            callback=self.updatefolderBoxButtons,
            tooltip=(u"The list of folders whose content will be imported.\n"
                     u"\nIn the output segmentation, the content of each\n"
                     u"folder appears in the same position as in the list.\n"
                     u"\nColumn 1 shows the folder's name.\n"
                     u"Column 2 shows the folder's depth.\n"
                     u"Column 3 shows the inclusions filter.\n"
                     u"Column 4 shows the exclusions filter.\n"
                     u"Column 5 shows the folder's level of sampling."),
        )
        # A Courier font is requested so that the columns of the labels
        # line up.
        font = QFont()
        font.setFamily('Courier')
        font.setStyleHint(QFont.Courier)
        font.setPixelSize(12)
        self.folderListbox.setFont(font)
        # Column of buttons acting on the folder list
        folderBoxCol2 = gui.widgetBox(
            widget=folderBoxLine1,
            orientation='vertical',
        )
        self.moveUpButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Move Up',
            callback=self.moveUp,
            tooltip=(u"Move the selected folder upward in the list."),
        )
        self.moveDownButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Move Down',
            callback=self.moveDown,
            tooltip=(u"Move the selected folder downward in the list."),
        )
        self.removeButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Remove',
            callback=self.remove,
            tooltip=(u"Remove the selected folder from the list."),
        )
        self.clearAllButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Clear All',
            callback=self.clearAll,
            tooltip=(u"Remove all folders from the list."),
        )
        # The export and import buttons are created disabled and without
        # a label.
        self.exportButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'',
            callback=self.exportList,
            disabled=True,
            tooltip=(u"Open a dialog for selecting a folder where the folder\n"
                     u"list can be exported in JSON format."),
        )
        self.importButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'',
            callback=self.importList,
            disabled=True,
            tooltip=(u"Open a dialog for selecting a folder list to\n"
                     u"import (in JSON format). folders from this list\n"
                     u"will be added to those already imported."),
        )
        folderBoxLine2 = gui.widgetBox(
            widget=folderBox,
            box=False,
            orientation='vertical',
        )
        # Add folder box
        addFolderBox = gui.widgetBox(
            widget=folderBoxLine2,
            box=True,
            orientation='vertical',
        )
        addFolderBoxLine1 = gui.widgetBox(
            widget=addFolderBox,
            orientation='horizontal',
        )
        # Folder path input (bound to the same 'rootFolderPath' attribute
        # as the basic-mode field, but only refreshing the GUI on change)
        gui.lineEdit(
            widget=addFolderBoxLine1,
            master=self,
            value='rootFolderPath',
            orientation='horizontal',
            label=u'Folder path:',
            labelWidth=101,
            callback=self.updateGUI,
            tooltip=(u"The paths of the folders that will be added to the\n"
                     u"list when button 'Add' is clicked.\n\n"
                     u"Successive paths must be separated with ' / ' \n"
                     u"(whitespace + slash + whitespace). Their order in\n"
                     u"the list will be the same as in this field."),
        )
        gui.separator(widget=addFolderBoxLine1, width=5)
        # Button Browse
        gui.button(
            widget=addFolderBoxLine1,
            master=self,
            label=u'Browse',
            callback=self.browse,
            tooltip=(u"Open a dialog for selecting a top folder.\n\n"
                     u"Selected folder paths will appear in the field to\n"
                     u"the left of this button afterwards, ready to be\n"
                     u"added to the list when button 'Add' is clicked."),
        )
        gui.separator(widget=addFolderBox, width=10)

        # Filter box to input include
        gui.separator(widget=addFolderBox, width=3)
        includeBoxLine1 = gui.widgetBox(
            widget=addFolderBox,
            box=False,
            orientation='horizontal',
        )

        # Include box: the checkbox enables/disables the line edit created
        # just below (the lambda resolves 'includeLineEdit' when called).
        gui.checkBox(
            widget=includeBoxLine1,
            master=self,
            value='applyInclusion',
            label=u'Include',
            labelWidth=100,
            callback=lambda: includeLineEdit.setDisabled(not self.
                                                         applyInclusion),
            tooltip=(u"Choose the inclusion(s)"),
        )
        includeLineEdit = gui.lineEdit(
            widget=includeBoxLine1,
            master=self,
            value='inclusionsUser',
            orientation='horizontal',
            label=u'',
            disabled=True,
            labelWidth=101,
            tooltip=(u"This field lets you specify a custom filter\n"
                     u"to select the folders to be\n"
                     u"added to the list."),
        )

        # Filter box to exclude
        gui.separator(widget=addFolderBox, width=3)
        excludeBoxLine1 = gui.widgetBox(
            widget=addFolderBox,
            box=False,
            orientation='horizontal',
        )
        # Exclude box: same pattern as the include box; despite its name,
        # 'includeLineEdit2' is the line edit holding the exclusion filter.
        gui.checkBox(
            widget=excludeBoxLine1,
            master=self,
            value='applyExclusion',
            label=u'Exclude',
            labelWidth=100,
            disabled=False,
            callback=lambda: includeLineEdit2.setDisabled(not self.
                                                          applyExclusion),
            tooltip=(u"Exclude the inclusion(s)"),
        )
        includeLineEdit2 = gui.lineEdit(
            widget=excludeBoxLine1,
            master=self,
            value='exclusionsUser',
            orientation='horizontal',
            label=u'',
            disabled=True,
            labelWidth=101,
            tooltip=(u"This field lets you specify a custom filter\n"
                     u"to select the folders to be\n"
                     u"added to the list."),
        )

        # Sampling box to input the level of sampling
        gui.separator(widget=addFolderBox, width=3)
        samplingBoxLine1 = gui.widgetBox(
            widget=addFolderBox,
            box=False,
            orientation='horizontal',
        )
        # Check box for sampling (enables/disables the spin box below)
        gui.checkBox(
            widget=samplingBoxLine1,
            master=self,
            value='applySampling',
            label=u'Sampling',
            labelWidth=100,
            disabled=False,
            callback=lambda: samplingSpin.setDisabled(not self.applySampling),
            tooltip=(u"Choose the sampling level"),
        )

        # Sampling rate, constrained to the 10-100 range
        samplingSpin = gui.spin(
            widget=samplingBoxLine1,
            master=self,
            value='samplingRate',
            minv=10,
            maxv=100,
            labelWidth=50,
            orientation='horizontal',
            tooltip=(u"sampling level"),
        )
        gui.separator(widget=addFolderBox, width=3)
        self.addButton = gui.button(
            widget=addFolderBox,
            master=self,
            label=u'Add',
            callback=self.add,
            tooltip=(u"Add the folder(s) currently displayed in the\n"
                     u"'folders' text field to the list.\n\n"
                     u"Each of these folders will be associated with the\n"
                     u"specified encoding and annotation (if any).\n\n"
                     u"Other folders may be selected afterwards and\n"
                     u"assigned a different encoding and annotation."),
        )
        self.advancedSettings.advancedWidgets.append(folderBox)
        self.advancedSettings.advancedWidgetsAppendSeparator()

        # Options box...
        optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Options',
            orientation='vertical',
            addSpace=False,
        )
        # NOTE(review): no control is added to optionsBoxLine1 in this
        # method; it only contributes an empty row to the options box.
        optionsBoxLine1 = gui.widgetBox(
            widget=optionsBox,
            box=False,
            orientation='horizontal',
        )

        gui.separator(widget=optionsBox, width=3)
        optionsBoxLine2 = gui.widgetBox(
            widget=optionsBox,
            box=False,
            orientation='horizontal',
        )
        # Auto-numbering option and its annotation key
        gui.checkBox(
            widget=optionsBoxLine2,
            master=self,
            value='autoNumber',
            label=u'Auto-number with key:',
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Annotate folders with increasing numeric indices."),
        )
        self.autoNumberKeyLineEdit = gui.lineEdit(
            widget=optionsBoxLine2,
            master=self,
            value='autoNumberKey',
            orientation='horizontal',
            callback=self.sendButton.settingsChanged,
            tooltip=(u"Annotation key for folder auto-numbering."),
        )
        gui.separator(widget=optionsBox, width=3)
        self.advancedSettings.advancedWidgets.append(optionsBox)
        self.advancedSettings.advancedWidgetsAppendSeparator()

        gui.rubber(self.controlArea)

        # Send button...
        self.sendButton.draw()

        # Info box...
        self.infoBox.draw()

        self.adjustSizeWithTimer()
        # Schedule a first conditional send through a zero-delay timer.
        QTimer.singleShot(0, self.sendButton.sendIf)

    def inputMessage(self, message):
        """Handle JSON message on input connection.

        The message content must be a JSON list of objects, each with a
        non-empty 'path' and 'encoding'. The widget switches to advanced
        mode, clears its folder list, then scans every listed path and
        stores the result in ``self.folders`` as a folder dictionary
        ('rootPath', 'maxDepth', 'inclusionsUser', 'exclusionsUser',
        'samplingRate', 'fileList'), i.e. the structure that sendData()
        and add() work with.

        The previous implementation referenced an undefined variable and
        raised NameError on every valid entry; it also stored tuples where
        the rest of the widget expects dictionaries.

        On invalid JSON or on an entry lacking 'path'/'encoding', an error
        is displayed, None is sent on 'Text data' and nothing is imported.
        """
        if not message:
            return
        self.displayAdvancedSettings = True
        self.advancedSettings.setVisible(True)
        self.clearAll()
        self.infoBox.inputChanged()

        # Only the parsing itself can raise the ValueError handled here.
        try:
            jsonData = json.loads(message.content)
        except ValueError:
            self.infoBox.setText(
                u"Please make sure that incoming message is valid JSON.",
                'error')
            self.send('Text data', None, self)
            return

        # Validate every entry before touching the file system, so that a
        # malformed message does not trigger a partial import.
        paths = list()
        for entry in jsonData:
            path = entry.get('path', '')
            encoding = entry.get('encoding', '')
            if path == '' or encoding == '':
                self.infoBox.setText(
                    u"Please verify keys and values of incoming "
                    u"JSON message.", 'error')
                self.send('Text data', None, self)
                return
            paths.append(path)

        # getFileList() reads these lists when the user filters are enabled.
        self.inclusionsUserAsList = [
            x.strip() for x in self.inclusionsUser.split(",") if x.strip()
        ]
        self.exclusionsUserAsList = [
            x.strip() for x in self.exclusionsUser.split(",") if x.strip()
        ]

        tempFolders = list()
        for path in paths:
            # getFileList() scans self.rootFolderPath and leaves its result
            # in self.fileList / self.maxDepth.
            self.rootFolderPath = os.path.normpath(path)
            self.getFileList()
            tempFolders.append({
                'rootPath': self.rootFolderPath,
                'maxDepth': self.maxDepth,
                'inclusionsUser': self.inclusionsUser,
                'exclusionsUser': self.exclusionsUser,
                'samplingRate': self.samplingRate,
                'fileList': self.fileList,
            })
        self.folders.extend(tempFolders)
        self.sendButton.settingsChanged()

    def sendData(self):
        """Load folders, create and send segmentation.

        In advanced mode the files of every folder in ``self.folders`` are
        used, otherwise those of ``self.folder``. One Input is created per
        file and annotated with the file's name, depth level, path,
        detected encoding and 'depth_N' folder names. A single file is
        sent as is; several files are concatenated.

        When auto-numbering is enabled (advanced mode), every output
        segment additionally receives an increasing index, starting at 1,
        under the annotation key ``self.autoNumberKey``. The key was
        previously validated but never applied.

        If no input folder is selected, or if auto-numbering is requested
        without a key, a warning is displayed and None is sent.
        """

        # Check that there's something on input...
        if self.displayAdvancedSettings:
            hasInput = bool(self.folders)
        else:
            hasInput = bool(self.rootFolderPath)
        if not hasInput:
            self.infoBox.setText(u'Please select input folder.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if self.displayAdvancedSettings and self.autoNumber:
            if self.autoNumberKey:
                autoNumberKey = self.autoNumberKey
            else:
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning')
                self.send('Text data', None, self)
                return
        else:
            autoNumberKey = None

        # Clear created Inputs...
        self.clearCreatedInputs()
        annotations = list()
        counter = 1

        if self.displayAdvancedSettings:
            myFolders = self.folders
        else:
            myFolders = [self.folder]

        # Annotations...
        allFileListContent = list()
        for myFolder in myFolders:

            # A folder for which no scan result has been stored (e.g. the
            # initial empty dictionary) simply contributes no file.
            myFiles = myFolder.get('fileList', [])

            for myFile in myFiles:

                annotation = dict()
                annotation['file name'] = myFile['fileName']
                annotation['file depth level'] = myFile['depthLvl']
                annotation['file path'] = myFile['absoluteFilePath']
                try:
                    annotation['file encoding, confidence'] = myFile[
                        'encoding'] + ", " + str(myFile['encodingConfidence'])
                except TypeError:
                    # The stored encoding is not a string (e.g. None).
                    annotation['file encoding, confidence'] = "unknown"

                # One annotation per traversed folder level
                for key in myFile:
                    if key.startswith('depth_'):
                        annotation[key] = myFile[key]

                # Auto-numbering of the output segments
                if autoNumberKey is not None:
                    annotation[autoNumberKey] = counter
                    counter += 1

                annotations.append(annotation)
                allFileListContent.append(myFile['fileContent'])

        # Create an LTTL.Input for each file; only a single Input carries
        # the widget's caption as its label.
        if len(allFileListContent) == 1:
            label = self.captionTitle
        else:
            label = None
        for content, annotation in zip(allFileListContent, annotations):
            myInput = Input(content, label)
            # The segment is read, annotated and written back.
            segment = myInput[0]
            segment.annotations.update(annotation)
            myInput[0] = segment
            self.createdInputs.append(myInput)

        # If there's only one file, the widget's output is the created Input.
        if len(allFileListContent) == 1:
            self.segmentation = self.createdInputs[0]
        # Otherwise the widget's output is a concatenation...
        else:
            self.segmentation = Segmenter.concatenate(
                segmentations=self.createdInputs,
                label=self.captionTitle,
                copy_annotations=True,
                import_labels_as=None,
                sort=False,
                auto_number_as=None,
                merge_duplicates=False,
                progress_callback=None,
            )

        # Report the number of segments and characters sent.
        message = u'%i segment@p sent to output ' % len(self.segmentation)
        message = pluralize(message, len(self.segmentation))
        numChars = sum(
            len(Segmentation.get_data(segment.str_index))
            for segment in self.segmentation
        )
        message += u'(%i character@p).' % numChars
        message = pluralize(message, numChars)
        self.infoBox.setText(message)

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def clearCreatedInputs(self):
        """Release the data of every created Input and empty the list.

        For each Input in ``self.createdInputs``, the string stored under
        the ``str_index`` of its first segment is set to None; the list is
        then emptied in place.
        """
        for createdInput in self.createdInputs:
            firstSegment = createdInput[0]
            Segmentation.set_data(firstSegment.str_index, None)
        self.createdInputs[:] = []

    def importList(self):
        """Display a file dialog and import a folder list (JSON format).

        The selected file must contain a JSON list of objects with
        non-empty 'path' and 'encoding' values and optional
        'annotation_key' / 'annotation_value'. Valid entries are appended
        to ``self.folders`` as (path, encoding, annotation key, annotation
        value) tuples, which is the layout exportList() writes.

        NOTE(review): add() and sendData() work with folder dictionaries
        rather than tuples -- confirm the intended entry format before
        enabling the import button.
        """
        selection = QFileDialog.getOpenFileName(self, u'Import folder List',
                                                self.lastLocation,
                                                u'Text folders (*)')
        # Qt5-style bindings return a (path, selected filter) tuple, which
        # is truthy even when the dialog is cancelled; older bindings
        # return the path only.
        if isinstance(selection, tuple):
            folderPath = selection[0]
        else:
            folderPath = selection
        if not folderPath:
            return
        self.rootFolderPath = os.path.normpath(folderPath)
        self.lastLocation = os.path.dirname(folderPath)
        self.error()
        try:
            # The context manager closes the file even if reading fails.
            with codecs.open(folderPath, encoding='utf8') as folderHandle:
                folderContent = folderHandle.read()
        except IOError:
            QMessageBox.warning(None, 'Textable', "Couldn't open folder.",
                                QMessageBox.Ok)
            return
        try:
            jsonData = json.loads(folderContent)
        except ValueError:
            QMessageBox.warning(None, 'Textable', "JSON parsing error.",
                                QMessageBox.Ok)
            return

        tempFolders = list()
        for entry in jsonData:
            path = entry.get('path', '')
            encoding = entry.get('encoding', '')
            annotationKey = entry.get('annotation_key', '')
            annotationValue = entry.get('annotation_value', '')
            if path == '' or encoding == '':
                # Nothing is imported if any entry is incomplete.
                QMessageBox.warning(
                    None, 'Textable',
                    "Selected JSON folder doesn't have the right keys "
                    "and/or values.", QMessageBox.Ok)
                return
            tempFolders.append((
                path,
                encoding,
                annotationKey,
                annotationValue,
            ))
        self.folders.extend(tempFolders)
        if tempFolders:
            self.sendButton.settingsChanged()

    def exportList(self):
        """Display a file dialog and export the folder list (JSON format).

        Every entry of ``self.folders`` is read as a (path, encoding,
        annotation key, annotation value) sequence and dumped as an object
        with 'path' and 'encoding', plus 'annotation_key' and
        'annotation_value' when both are set.

        NOTE(review): add() stores folder dictionaries in ``self.folders``;
        positional access fails on those -- confirm the intended entry
        format before enabling the export button.
        """
        toDump = list()
        for myFolder in self.folders:
            entry = {
                'path': myFolder[0],
                'encoding': myFolder[1],
            }
            if myFolder[2] and myFolder[3]:
                entry['annotation_key'] = myFolder[2]
                entry['annotation_value'] = myFolder[3]
            toDump.append(entry)

        selection = QFileDialog.getSaveFileName(
            self,
            u'Export folder List',
            self.lastLocation,
        )
        # Qt5-style bindings return a (path, selected filter) tuple, which
        # is truthy even when the dialog is cancelled; older bindings
        # return the path only.
        if isinstance(selection, tuple):
            folderPath = selection[0]
        else:
            folderPath = selection
        if not folderPath:
            return

        self.lastLocation = os.path.dirname(folderPath)
        # The context manager closes the file even if writing fails.
        with codecs.open(
                folderPath,
                encoding='utf8',
                mode='w',
                errors='xmlcharrefreplace',
        ) as outputfolder:
            outputfolder.write(
                normalizeCarriageReturns(
                    json.dumps(toDump, sort_keys=True, indent=4)))
        QMessageBox.information(None, 'Textable',
                                'folder list correctly exported',
                                QMessageBox.Ok)

    def getFileList(self):
        """Scan ``self.rootFolderPath`` and build the list of files to import.

        The folder is walked recursively with os.walk. Each file is
        described by a dictionary with the keys 'absoluteFilePath',
        'fileName', 'depthLvl' and one 'depth_N' key per folder level
        ('depth_0' being the selected folder itself).

        Files are kept if their name contains one of the user inclusion
        patterns (when ``self.applyInclusion`` is set) or, otherwise, if it
        ends with one of the default extensions of ``self.inclusionList``.
        Files whose name contains a user exclusion pattern are then dropped
        (when ``self.applyExclusion`` is set).

        Results are stored in ``self.fileList`` and ``self.maxDepth``. When
        at least one file is kept, the list is sampled and the files are
        opened (which adds their content and encoding information).
        """

        def userPatterns(attributeName, rawText):
            # The parsed lists are normally prepared by add(); fall back to
            # parsing the raw comma-separated text when they are missing.
            patterns = getattr(self, attributeName, None)
            if patterns is None:
                patterns = [
                    x.strip() for x in rawText.split(",") if x.strip()
                ]
            return patterns

        # Parent of the selected folder: paths relative to it start with
        # the selected folder's own name.
        initialRootParentPath, _ = os.path.split(self.rootFolderPath)
        fileList = list()

        # fileListExt is the list of files matching a default extension
        fileListExt = list()
        depthList = list()

        # str.endswith accepts a tuple: a file is then listed once, however
        # many default extensions it matches.
        defaultExtensions = tuple(self.inclusionList)

        progressBarZero = gui.ProgressBar(self, iterations=1)

        # os.walk yields, for every directory below the root:
        # currPath (its path), dirNames (its subdirectories) and
        # fileNames (the files it contains).
        for currPath, dirNames, fileNames in os.walk(self.rootFolderPath):

            # Path relative to the root's parent, split into folder names
            currRelPath = currPath[len(initialRootParentPath) + 1:]
            currRelPathList = os.path.normpath(currRelPath).split(os.sep)
            currDepth = len(currRelPathList) - 1

            for fileName in fileNames:

                # 'fileContent', 'encoding' and 'encodingConfidence' are
                # added later, when openFileList() is called.
                file = dict()
                depthList.append(currDepth)

                file['absoluteFilePath'] = os.path.join(currPath, fileName)
                file['fileName'] = fileName
                file['depthLvl'] = currDepth

                # One annotation per depth level, holding the folder name
                for level, folderName in enumerate(currRelPathList):
                    file['depth_' + str(level)] = folderName

                # Apply default file extension filter
                if fileName.endswith(defaultExtensions):
                    fileListExt.append(file)

                fileList.append(file)

        # apply inclusion filter
        if self.applyInclusion:
            inclusionPatterns = userPatterns(
                'inclusionsUserAsList', self.inclusionsUser)
            fileListIncl = [
                file for file in fileList
                if self.match(file['fileName'], inclusionPatterns)
            ]
        else:
            fileListIncl = fileListExt

        # apply exclusion filter
        if self.applyExclusion:
            exclusionPatterns = userPatterns(
                'exclusionsUserAsList', self.exclusionsUser)
            fileListExcl = [
                file for file in fileListIncl
                if not self.match(file['fileName'], exclusionPatterns)
            ]
        else:
            fileListExcl = fileListIncl

        # output file list
        self.fileList = fileListExcl

        if self.fileList:
            # The maximum depth is computed over all scanned files.
            self.maxDepth = max(depthList)
            self.fileList = self.sampleFileList()
            self.openFileList()
        else:
            self.maxDepth = 0

        progressBarZero.finish()

    # Test if file contains one of the patterns in patternList
    def match(self, file, patternList):
        """Return True if *file* contains at least one pattern.

        Each pattern of *patternList* is tested with the ``in`` operator
        (plain substring test for strings); an empty list never matches.
        """
        return any(pattern in file for pattern in patternList)

    def openFileList(self):
        """Read and decode every file of ``self.fileList``.

        For each file, the raw bytes are read and their encoding is
        guessed with chardet. Decoding is attempted with the detected
        encoding first, then with the predefined encodings, and stops at
        the first success. Previously the loop went through every encoding
        and kept the last one that worked, which overrode the detected
        encoding.

        Each file dictionary receives 'encoding' and 'encodingConfidence'
        (as reported by chardet) and 'fileContent' (the decoded text).
        ``self.fileList`` and ``self.folder`` are then updated.

        If a file cannot be opened, an error is displayed, None is sent on
        'Text data' and the method returns without updating
        ``self.fileList`` or ``self.folder``.
        """

        tempFileList = list()

        progressBarOpen = gui.ProgressBar(self, iterations=len(self.fileList))

        for file in self.fileList:
            filePath = file['absoluteFilePath']

            try:
                with open(filePath, 'rb') as openedFile:
                    rawContent = openedFile.read()
            except IOError:
                if len(self.fileList) > 1:
                    message = u"Couldn't open file '%s'." % filePath
                else:
                    message = u"Couldn't open file."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                # Do not leave the progress bar pending on the error path.
                progressBarOpen.finish()
                return

            charsetDict = chardet.detect(rawContent)
            detectedEncoding = charsetDict['encoding']
            detectedConfidence = charsetDict['confidence']

            # Chunking functionality should be added here

            # The detected encoding (if any) is tried first, followed by
            # the predefined ones.
            candidateEncodings = [
                encoding for encoding in getPredefinedEncodings()
                if encoding != detectedEncoding
            ]
            if detectedEncoding:
                candidateEncodings.insert(0, detectedEncoding)

            decodedContent = None
            for encoding in candidateEncodings:
                try:
                    decodedContent = rawContent.decode(encoding)
                except (UnicodeError, LookupError):
                    # Wrong or unknown encoding: try the next candidate.
                    continue
                break
            if decodedContent is None:
                # Last resort, so that the content of a previous file is
                # never reused: undecodable bytes are replaced.
                decodedContent = rawContent.decode('utf-8', errors='replace')
            self.fileContent = decodedContent

            file['encoding'] = detectedEncoding
            file['fileContent'] = self.fileContent
            file['encodingConfidence'] = detectedConfidence
            progressBarOpen.advance()
            tempFileList.append(file)

        self.fileList = tempFileList

        self.folder = {
            'rootPath': self.rootFolderPath,
            'maxDepth': self.maxDepth,
            'inclusionsUser': self.inclusionsUser,
            'exclusionsUser': self.exclusionsUser,
            'samplingRate': self.samplingRate,
            'fileList': self.fileList
        }
        progressBarOpen.finish()

    def browse(self):
        """Let the user pick a folder through a QFileDialog.

        The normalised path is stored in ``self.rootFolderPath`` and
        ``self.lastLocation``. In basic mode the folder is scanned at once,
        ``self.folder`` is rebuilt from the scan results and the send
        button is notified; in advanced mode the path is only stored. The
        GUI is refreshed in both cases. Cancelling the dialog changes
        nothing.
        """
        chosenPath = QFileDialog.getExistingDirectory(
            self,
            u'Select Folder(s)',
            self.lastLocation,
        )
        if not chosenPath:
            return

        chosenPath = os.path.normpath(chosenPath)
        self.rootFolderPath = chosenPath
        self.lastLocation = chosenPath

        if not self.displayAdvancedSettings:
            # Basic mode: scan right away and use the result as the
            # widget's single folder.
            self.getFileList()
            scannedFolder = {
                'rootPath': self.rootFolderPath,
                'maxDepth': self.maxDepth,
                'fileList': self.fileList,
            }
            self.folder = scannedFolder
            self.sendButton.settingsChanged()

        self.updateGUI()

    def moveUp(self):
        """Move the selected folder one position upward in the folder list.

        Nothing happens when no folder is selected or when the selected
        folder is already first. Otherwise the folder is swapped with its
        predecessor in ``self.folders``, the selection follows it and the
        send button is notified.
        """
        if not self.selectedFolderLabels:
            return
        index = self.selectedFolderLabels[0]
        if index <= 0:
            return
        self.folders[index - 1], self.folders[index] = (
            self.folders[index], self.folders[index - 1])
        # The selection is a list of indices bound to the list box;
        # assigning the new index keeps the moved folder selected. The
        # list has no usable 'listBox.item()' accessor.
        self.selectedFolderLabels = [index - 1]
        self.sendButton.settingsChanged()

    def moveDown(self):
        """Move the selected folder one position downward in the folder list.

        Nothing happens when no folder is selected or when the selected
        folder is already last. Otherwise the folder is swapped with its
        successor in ``self.folders``, the selection follows it and the
        send button is notified.
        """
        if not self.selectedFolderLabels:
            return
        index = self.selectedFolderLabels[0]
        if index >= len(self.folders) - 1:
            return
        self.folders[index + 1], self.folders[index] = (
            self.folders[index], self.folders[index + 1])
        # The selection is a list of indices bound to the list box;
        # assigning the new index keeps the moved folder selected. The
        # list has no usable 'listBox.item()' accessor.
        self.selectedFolderLabels = [index + 1]
        self.sendButton.settingsChanged()

    def clearAll(self):
        """Empty the folder list and the selection, then flag the change.

        Both lists are emptied in place (their identity is preserved)
        before the send button is notified.
        """
        for boundList in (self.folders, self.selectedFolderLabels):
            del boundList[:]
        self.sendButton.settingsChanged()

    def remove(self):
        """Remove the selected folder from the folder list.

        Without a selection nothing happens. Otherwise the entry at the
        first selected index is deleted from ``self.folders``, the
        selection is emptied in place and the send button is notified.
        """
        if not self.selectedFolderLabels:
            return
        selectedIndex = self.selectedFolderLabels[0]
        del self.folders[selectedIndex]
        del self.selectedFolderLabels[:]
        self.sendButton.settingsChanged()

    def add(self):
        """Scan the folder of ``self.rootFolderPath`` and add it to the list.

        The comma-separated user inclusion/exclusion texts are first split
        into lists of stripped, non-empty patterns. The folder is then
        scanned with getFileList(), described by a folder dictionary
        ('rootPath', 'maxDepth', 'inclusionsUser', 'exclusionsUser',
        'samplingRate', 'fileList') stored in ``self.folder`` and appended
        to ``self.folders``; the send button is finally notified.
        """

        # Identify sequences separated by a comma (,) and strip whitespace
        self.inclusionsUserAsList = [
            x.strip() for x in self.inclusionsUser.split(",") if x.strip()
        ]
        self.exclusionsUserAsList = [
            x.strip() for x in self.exclusionsUser.split(",") if x.strip()
        ]

        # getFileList() stores the files matching either the default or the
        # optional settings in self.fileList (and their depth in
        # self.maxDepth).
        self.getFileList()

        # Build the entry explicitly: the scan only refreshes self.folder
        # when at least one file matched, so relying on it would append a
        # stale (or empty) folder description when nothing was found.
        self.folder = {
            'rootPath': self.rootFolderPath,
            'maxDepth': self.maxDepth,
            'inclusionsUser': self.inclusionsUser,
            'exclusionsUser': self.exclusionsUser,
            'samplingRate': self.samplingRate,
            'fileList': self.fileList,
        }
        self.folders.append(self.folder)

        self.sendButton.settingsChanged()

    def sampleFileList(self):
        """Return a random subset of ``self.fileList``.

        ``self.samplingRate`` is read as a percentage; the subset size is
        that share of the list length, rounded up. Sampling works on a
        shuffled copy, so ``self.fileList`` itself is left untouched.
        """
        candidates = list(self.fileList)

        # Percentage -> fraction of the list to keep.
        fraction = self.samplingRate / 100.0

        # Shuffling first means files from every folder can be drawn.
        random.shuffle(candidates)

        sampleSize = int(math.ceil(len(candidates) * fraction))
        return candidates[:sampleSize]

    def updateGUI(self):
        """Refresh the advanced-settings part of the interface.

        When advanced settings are hidden, only the advanced panel's
        visibility is updated. Otherwise the folder listbox labels are
        rebuilt from ``self.folders``, the previous selection is restored,
        and the add button, auto-number line edit and folder box buttons are
        enabled or disabled according to the current state.
        """
        if not self.displayAdvancedSettings:
            self.advancedSettings.setVisible(False)
            return

        # Remember the selected row so it can be restored after the labels
        # are replaced.
        cachedLabel = (
            self.selectedFolderLabels[0] if self.selectedFolderLabels else None
        )

        del self.folderLabels[:]
        rebuiltLabels = []

        if self.folders:
            names = [os.path.basename(f['rootPath']) for f in self.folders]
            # Left-justified name column, two characters wider than the
            # longest folder name.
            nameColumn = u'%-' + str(max(len(n) for n in names) + 2) + u's'
            for name, entry in zip(names, self.folders):
                depth = '%s' % entry['maxDepth']
                rate = '%s' % entry['samplingRate']
                label = nameColumn % name
                label += "[d]:{" + depth + "} "
                label += "[i]:{" + entry['inclusionsUser'] + "} "
                label += "[e]:{" + entry['exclusionsUser'] + "} "
                label += "[s]:{" + rate + "%}"
                rebuiltLabels.append(label)

        self.folderLabels = rebuiltLabels

        if cachedLabel is not None:
            # The pre-send callback is unset while the row is re-selected
            # and put back right after.
            self.sendButton.sendIfPreCallback = None
            self.selectedFolderLabels.listBox.item(cachedLabel).setSelected(1)
            self.sendButton.sendIfPreCallback = self.updateGUI

        # Adding requires a root path plus an annotation key/value pair that
        # is either completely filled in or completely empty.
        annotationPaired = (
            bool(self.newAnnotationKey) == bool(self.newAnnotationValue)
        )
        self.addButton.setDisabled(
            not (self.rootFolderPath and annotationPaired)
        )

        self.autoNumberKeyLineEdit.setDisabled(not self.autoNumber)

        self.updatefolderBoxButtons()
        self.advancedSettings.setVisible(True)

    def updatefolderBoxButtons(self):
        """Enable or disable the folder box buttons.

        Remove needs a selection; Move Up / Move Down additionally need room
        above / below the selected row; Clear All needs a non-empty folder
        list.
        """
        if self.selectedFolderLabels:
            row = self.selectedFolderLabels[0]
            self.removeButton.setDisabled(False)
            self.moveUpButton.setDisabled(not row > 0)
            self.moveDownButton.setDisabled(not row < len(self.folders) - 1)
        else:
            self.moveUpButton.setDisabled(True)
            self.moveDownButton.setDisabled(True)
            self.removeButton.setDisabled(True)

        self.clearAllButton.setDisabled(not len(self.folders))
        # NOTE(review): the export button is disabled whether or not folders
        # are listed -- confirm this is intentional.
        self.exportButton.setDisabled(True)

    def setCaption(self, title):
        """Set the widget caption and flag a settings change if it differs.

        The comparison with the current ``captionTitle`` is made before the
        parent implementation is called; when the attribute does not exist
        yet, the call is simply forwarded to the parent.
        """
        hasCaption = 'captionTitle' in dir(self)
        changed = hasCaption and title != self.captionTitle
        super().setCaption(title)
        if changed:
            self.sendButton.settingsChanged()

    def onDeleteWidget(self):
        """Clear the inputs created by this widget.

        Delegates to ``self.clearCreatedInputs()``.
        """
        self.clearCreatedInputs()
class SpaCy(OWTextableBaseWidget):
    """Textable widget for NLP using spaCy.

    The widget runs a spaCy language model on every segment of the incoming
    segmentation and sends the resulting tokens to the default output.
    Named entities, noun chunks and sentences can additionally be sent on
    separate channels. A second tab ("Model manager") lets the user download
    further language models.
    """

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "spaCy"
    description = "Natural language processing using spaCy"
    icon = "icons/spacy.svg"
    priority = 21   # TODO

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text data", Segmentation, "inputData")]
    outputs = [
        ("Tokenized text", Segmentation, widget.Default),
        ("Named entities", Segmentation),
        ("Noun chunks", Segmentation),
        ("Sentences", Segmentation),
    ]

    #----------------------------------------------------------------------
    # Layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    # Must be one of the items offered by the "Max number of input
    # characters" combo box built in __init__.
    maxLen = settings.Setting("1 million")
    annotatePOSTags = settings.Setting(False)
    annotateDependencies = settings.Setting(False)
    annotateEntities = settings.Setting(False)
    segmentEntities = settings.Setting(False)
    segmentChunks = settings.Setting(False)
    segmentSentences = settings.Setting(False)
    autoSend = settings.Setting(False)
    model = settings.Setting("")

    def __init__(self):
        """Widget creator."""

        super().__init__()

        # Keep the model restored from the saved settings as long as it is
        # still installed; otherwise fall back to the first installed model
        # (or to no model at all).
        if self.model not in INSTALLED_MODELS:
            self.model = INSTALLED_MODELS[0] if INSTALLED_MODELS else ""

        # Other attributes...
        self.inputSeg = None
        self.nlp = None
        self.selectedModels = list()
        self.downloadableModelLabels = list()
        self.loadedComponents = list()
        self.mustLoad = True

        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...
        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        # User interface...

        # Tabs...
        self.tabs = QTabWidget()
        self.optionsTab = QWidget()
        self.modelManagerTab = QWidget()
        self.tabs.addTab(self.optionsTab, "Options")
        self.tabs.addTab(self.modelManagerTab, "Model manager")

        # Options tab...
        OptionsTabBox = QHBoxLayout()

        optionsBox = gui.widgetBox(widget=self.optionsTab)

        self.modelComboBox = gui.comboBox(
            widget=optionsBox,
            master=self,
            value='model',
            label='Model: ',
            tooltip='Select the spaCy language model you want to use.',
            items=INSTALLED_MODELS[:],
            sendSelectedValue=True,
            callback=self.modelComboboxChanged,
        )

        gui.separator(widget=optionsBox, height=3)

        annotationsBox = gui.widgetBox(
            widget=optionsBox,
            box="Additional token annotations:",
        )
        self.annotatePOSTagsReloadLabel = self._addComponentCheckBox(
            annotationsBox,
            'annotatePOSTags',
            'part-of-speech tags',
            "Annotate output tokens with part-of-speech tags.",
        )
        self.annotateDependenciesReloadLabel = self._addComponentCheckBox(
            annotationsBox,
            'annotateDependencies',
            'syntactic dependencies',
            "Annotate output tokens with syntactic dependencies.",
        )
        self.annotateEntitiesReloadLabel = self._addComponentCheckBox(
            annotationsBox,
            'annotateEntities',
            'named entities',
            "Annotate output tokens with named entities.",
        )

        segmentationsBox = gui.widgetBox(
            widget=optionsBox,
            box="Additional segmentations:",
        )
        self.segmentEntitiesReloadLabel = self._addComponentCheckBox(
            segmentationsBox,
            'segmentEntities',
            'named entities',
            "Output named entity segmentation on separate channel.",
        )
        self.segmentChunksReloadLabel = self._addComponentCheckBox(
            segmentationsBox,
            'segmentChunks',
            'noun chunks',
            "Output noun chunk segmentation on separate channel.",
        )
        self.segmentSentencesReloadLabel = self._addComponentCheckBox(
            segmentationsBox,
            'segmentSentences',
            'sentences',
            "Output sentence segmentation on separate channel.",
        )

        self.updateReloadNeededLabels()

        gui.comboBox(
            widget=optionsBox,
            master=self,
            value='maxLen',
            items=["1 million"]
                  + ["%i millions" % n for n in range(2, 10)]
                  + ["no limit"],
            sendSelectedValue=True,
            label=u'Max number of input characters:',
            tooltip=(
                "The spaCy parser and NER models require roughly 1GB of\n"
                "temporary memory per 100'000 characters in the input.\n"
                "This means long texts may cause memory allocation errors.\n"
                "If you're not using the parser or NER, or have lots of \n"
                "RAM, it's probably safe to increase the default limit of\n"
                "1 million characters."
            ),
        )

        gui.rubber(optionsBox)

        OptionsTabBox.addWidget(optionsBox)
        self.optionsTab.setLayout(OptionsTabBox)

        # Model manager tab...
        modelManagerTabBox = QHBoxLayout()

        modelManagerBox = gui.widgetBox(widget=self.modelManagerTab)

        gui.label(modelManagerBox, self, label="Available models:")

        self.downloadableModelsListbox = gui.listBox(
            widget=modelManagerBox,
            master=self,
            value="selectedModels",
            labels="downloadableModelLabels",
            callback=self.downloadableModelsListboxChanged,
            tooltip="Select language models then click Download.",
        )
        # NOTE(review): 3 presumably stands for
        # QAbstractItemView.ExtendedSelection -- confirm.
        self.downloadableModelsListbox.setSelectionMode(3)
        self.downloadableModelLabels = DOWNLOADABLE_MODELS[:]

        self.downloadButton = gui.button(
            widget=modelManagerBox,
            master=self,
            label="Download",
            callback=self.downloadModels,
            tooltip="Download the selected language models.",
        )
        self.downloadButton.setDisabled(True)

        modelManagerTabBox.addWidget(modelManagerBox)
        self.modelManagerTab.setLayout(modelManagerTabBox)

        self.controlArea.layout().addWidget(self.tabs)

        gui.rubber(self.controlArea)

        # Now Info box and Send button must be drawn...
        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input.", "warning")

        # Check that there's a model...
        if not self.model:
            self.infoBox.setText(
                "Please download a language model first.",
                "warning",
            )
            self.tabs.setCurrentIndex(1)
            optionsBox.setDisabled(True)

    def _addComponentCheckBox(self, parentBox, value, label, tooltip):
        """Add one "checkbox + (reload needed)" line and return its label.

        The line is a horizontal box inside *parentBox* holding a checkbox
        bound to the setting named *value* and a gray, oblique
        "(reload needed)" label. The label widget is returned so that the
        caller can store it and toggle its visibility later.
        """
        line = gui.widgetBox(
            widget=parentBox,
            orientation="horizontal",
            box=None,
        )
        gui.checkBox(
            widget=line,
            master=self,
            value=value,
            label=label,
            callback=self.updateDisabledComponents,
            tooltip=tooltip,
        )
        reloadLabel = gui.label(
            line,
            master=self,
            label="(reload needed)",
        )
        reloadLabel.setStyleSheet("font-style: oblique; color: gray")
        return reloadLabel

    def inputData(self, newInput):
        """Process incoming data."""
        self.inputSeg = newInput
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def modelComboboxChanged(self):
        """Respond to model change in UI (Options tab)."""
        self.mustLoad = True
        self.sendButton.settingsChanged()

    def downloadableModelsListboxChanged(self):
        """Respond to model change in UI (Model manager tab)."""
        self.downloadButton.setDisabled(len(self.selectedModels) == 0)

    def downloadModels(self):
        """Respond to Download button (Model manager tab).

        Asks for confirmation, downloads every selected model, removes the
        downloaded entries from the list of downloadable models and finally
        tells the user that a restart is needed.
        """
        # Ask for confirmation...
        num_models = len(self.selectedModels)
        message = "You are about to download %i language model@p. " +   \
                  "This may take up to several minutes depending on your " +  \
                  "internet connection. Do you want to proceed?"
        message = message % num_models
        buttonReply = QMessageBox.question(
            self,
            "Textable",
            pluralize(message, num_models),
            QMessageBox.Ok | QMessageBox.Cancel
        )
        if buttonReply != QMessageBox.Ok:
            return

        # Download models...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=num_models)
        try:
            # Labels are deleted by index, so indices must be visited from
            # the highest to the lowest whatever order they were selected in.
            for model_idx in sorted(self.selectedModels, reverse=True):
                model = self.downloadableModelLabels[model_idx]
                download_spacy_model(AVAILABLE_MODELS[model])
                del self.downloadableModelLabels[model_idx]
                progressBar.advance()
        finally:
            # Update GUI (even if a download failed, so that the widget does
            # not stay disabled)...
            # NOTE(review): the self-assignment presumably makes the listbox
            # pick up the in-place deletions above -- confirm.
            self.downloadableModelLabels = self.downloadableModelLabels
            self.selectedModels = list()
            progressBar.finish()
            self.controlArea.setDisabled(False)

        message = "Downloaded %i language model@p, please restart " +   \
                  "Orange for changes to take effect."
        message = message % num_models
        QMessageBox.information(
            None,
            "Textable",
            pluralize(message, num_models),
            QMessageBox.Ok
        )

    def updateDisabledComponents(self):
        """Respond to a checkbox change: refresh labels and flag the change."""
        self.updateReloadNeededLabels()
        self.sendButton.settingsChanged()

    def updateReloadNeededLabels(self):
        """Update the labels that indicate whether model reload is needed.

        A label is shown when its option is checked while a pipeline
        component it relies on is not listed in ``self.loadedComponents``.
        """
        self.annotatePOSTagsReloadLabel.setVisible(
            self.annotatePOSTags and ("tagger" not in self.loadedComponents)
        )
        self.annotateDependenciesReloadLabel.setVisible(
            self.annotateDependencies and ("parser" not in self.loadedComponents)
        )
        self.annotateEntitiesReloadLabel.setVisible(
            self.annotateEntities and ("ner" not in self.loadedComponents)
        )
        self.segmentSentencesReloadLabel.setVisible(
            self.segmentSentences and "parser" not in self.loadedComponents
        )
        self.segmentChunksReloadLabel.setVisible(
            self.segmentChunks and (
                ("tagger" not in self.loadedComponents)
                or ("parser" not in self.loadedComponents)
            )
        )
        self.segmentEntitiesReloadLabel.setVisible(
            self.segmentEntities and "ner" not in self.loadedComponents
        )

    def getComponentStatus(self):
        """Return ``(disabled, enabled)`` component names based on UI state.

        Each of "tagger", "parser" and "ner" ends up in exactly one of the
        two lists, depending on whether a checked option requires it.
        """
        disabledComponents = list()
        enabledComponents = list()
        if self.annotatePOSTags or self.segmentChunks:
            enabledComponents.append("tagger")
        else:
            disabledComponents.append("tagger")
        if self.annotateDependencies or self.segmentChunks or self.segmentSentences:
            enabledComponents.append("parser")
        else:
            disabledComponents.append("parser")
        if self.annotateEntities or self.segmentEntities:
            enabledComponents.append("ner")
        else:
            disabledComponents.append("ner")
        return disabledComponents, enabledComponents

    def loadModel(self):
        """(Re-)load the selected language model.

        Components that no checked option requires are disabled at load
        time. On success ``self.nlp``, ``self.loadedComponents`` and
        ``self.mustLoad`` are updated. The control area is re-enabled even
        when loading raises.
        """
        # Initialize progress bar.
        self.infoBox.setText(
            u"Loading language model, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=1)
        try:
            disabled, enabled = self.getComponentStatus()
            self.nlp = spacy.load(
                AVAILABLE_MODELS[self.model],
                disable=disabled,
            )
            self.loadedComponents = enabled
            self.updateReloadNeededLabels()
            self.mustLoad = False
            progressBar.advance()
        finally:
            progressBar.finish()
            self.controlArea.setDisabled(False)

    def _getMaxNumChar(self):
        """Return the character limit chosen by the user (None: no limit).

        Combo box items read "<n> million(s)". A bare number, as may be
        found in previously saved settings, is taken as a raw character
        count.
        """
        if self.maxLen == "no limit":
            return None
        fields = str(self.maxLen).split()
        count = int(fields[0])
        return count * 1000000 if len(fields) > 1 else count

    def _clearOutputs(self):
        """Send None on every output channel."""
        for channel in [c.name for c in self.outputs]:
            self.send(channel, None, self)

    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Check that there's a model...
        if not self.model:
            self.infoBox.setText(
                "Please download a language model first.",
                "warning",
            )
            self.tabs.setCurrentIndex(1)
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self._clearOutputs()
            return

        # Check max length...
        inputLength = sum(len(s.get_content()) for s in self.inputSeg)
        maxNumChar = self._getMaxNumChar()
        if maxNumChar is not None and inputLength > maxNumChar:
            self.infoBox.setText(
                "Input exceeds max number of characters set by user.",
                "warning",
            )
            self._clearOutputs()
            return

        # Load components if needed...
        disabled, enabled = self.getComponentStatus()
        if (
            self.mustLoad
            or self.nlp is None
            or not set(enabled) <= set(self.loadedComponents)
        ):
            self.loadModel()

        # The model's own limit can only be adjusted once a model is loaded.
        # With "no limit", raise it just enough to accept the input.
        if maxNumChar is None:
            maxNumChar = max(inputLength, self.nlp.max_length)
        self.nlp.max_length = maxNumChar

        # Components that are loaded but not needed are switched off while
        # processing; this does not change from one segment to the next.
        loaded = set(self.loadedComponents)
        unneeded = [c for c in disabled if c in loaded]

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        try:
            tokenSegments = list()
            entitySegments = list()
            chunkSegments = list()
            sentenceSegments = list()

            # Process each input segment...
            for segment in self.inputSeg:

                # NLP analysis...
                with self.nlp.disable_pipes(*unneeded):
                    doc = self.nlp(segment.get_content())

                # Get token segments...
                tokenSegments.extend(spacyItemsToSegments(doc, segment))

                # Get named entity segments...
                if self.segmentEntities:
                    entitySegments.extend(
                        spacyItemsToSegments(doc.ents, segment),
                    )

                # Get noun chunk segments...
                if self.segmentChunks:
                    chunkSegments.extend(
                        spacyItemsToSegments(doc.noun_chunks, segment),
                    )

                # Get sentences segments...
                if self.segmentSentences:
                    sentenceSegments.extend(
                        spacyItemsToSegments(doc.sents, segment),
                    )

                progressBar.advance()

            # Build segmentations and send them to output...
            tokenSeg = Segmentation(
                tokenSegments,
                self.captionTitle + "_tokens",
            )
            self.send("Tokenized text", tokenSeg, self)
            if self.segmentChunks:
                chunkSeg = Segmentation(
                    chunkSegments,
                    self.captionTitle + "_chunks",
                )
                self.send("Noun chunks", chunkSeg, self)
            if self.segmentEntities:
                entitySeg = Segmentation(
                    entitySegments,
                    self.captionTitle + "_entities",
                )
                self.send("Named entities", entitySeg, self)
            if self.segmentSentences:
                sentenceSeg = Segmentation(
                    sentenceSegments,
                    self.captionTitle + "_sentences",
                )
                self.send("Sentences", sentenceSeg, self)

            # Set status to OK and report data size...
            message = "%i token@p" % len(tokenSeg)
            message = pluralize(message, len(tokenSeg))
            if self.segmentChunks:
                message += ", %i chunk@p" % len(chunkSeg)
                message = pluralize(message, len(chunkSeg))
            if self.segmentEntities:
                message += ", %i " % len(entitySeg)
                message += "entity" if len(entitySeg) == 1 else "entities"
            if self.segmentSentences:
                message += ", %i sentence@p" % len(sentenceSeg)
                message = pluralize(message, len(sentenceSeg))
            message += " sent to output."
            # Turn the last comma of the enumeration into " and".
            last_comma_idx = message.rfind(",")
            if last_comma_idx > -1:
                message = message[:last_comma_idx] + " and" +    \
                    message[last_comma_idx+1:]
            self.infoBox.setText(message)
        finally:
            # Clear progress bar (also on failure, so that the widget does
            # not stay disabled).
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()

    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...
    def setCaption(self, title):
        if 'captionTitle' in dir(self):
            changed = title != self.captionTitle
            super().setCaption(title)
            if changed:
                self.sendButton.settingsChanged()
        else:
            super().setCaption(title)
Exemple #12
0
class OWTextableTextTree(OWTextableBaseWidget):
    """Orange widget for loading text folders"""

    name = "Text Tree"
    description = "Import data from raw text trees"

    icon = "icons/Textfolders.png"

    icon = "icons/textTree.svg"

    priority = 2

    # Input and output channels...
    inputs = [
        ('Message', JSONMessage, "inputMessage", widget.Single)
    ]
    outputs = [('Text data', Segmentation)]

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )

    # Settings...
    autoSend = settings.Setting(True)
    folders = settings.Setting([])
    encoding = settings.Setting('iso-8859-1')
    operation = settings.Setting('nothing')
    sampling =settings.Setting(100)
    autoNumber = settings.Setting(False)
    autoNumberKey = settings.Setting(u'num')
    importFilenames = settings.Setting(True)
    importFolderName = settings.Setting(True)
    importFolderNameKey = settings.Setting(u'folderName')
    importFileNameKey = settings.Setting(u'filename')
    FolderDepth1Key = settings.Setting(u'depth 1')
    FolderDepth2Key = settings.Setting(u'depth 2')
    FolderDepth2Key = settings.Setting(u'depth 3')
    FolderDepth2Key = settings.Setting(u'depth 4')
    FolderDepthLvl = settings.Setting(u'depth level')

    lastLocation = settings.Setting('.')
    displayAdvancedSettings = settings.Setting(False)
    folder = settings.Setting(u'')

    want_main_area = False

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Other attributes...
        self.segmentation = None
        self.operation = "no"
        self.applyInclusion = False
        self.applyExclusion = False
        self.applySampling = True
        self.samplingRate = 100
        self.createdInputs = list()
        self.folderLabels = list()
        self.selectedfolderLabels = list()
        self.rootFolderPath = u''
        self.inclusionsUser = u''
        self.exclusionsUser = u''
        self.newAnnotationKey = u''
        self.newAnnotationValue = u''
        self.folders = list() # self.folders is a list of dictionaries with each dictionaries being a a folder
        self.inclusionList = [".txt",".html",".xml",".csv"] #by default empty list

        # self.exclusionList = [".png,",".PNG",".jpg",".JPG",".gif",".GIF",".tiff",".TIFF",".jpeg",".JPEG",".DS_Store"] # by default exclusions : img files, .DS_Store (macOS)
        self.exclusionList = [] # by default null
        self.infoBox = InfoBox(widget=self.controlArea)
        # self.fileList = list() #output file list

        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute='infoBox',
            sendIfPreCallback=self.updateGUI,
        )
        self.advancedSettings = AdvancedSettings(
            widget=self.controlArea,
            master=self,
            callback=self.sendButton.settingsChanged,
        )

        # GUI...

        # Advanced settings checkbox...
        self.advancedSettings.draw()

        # BASIC GUI...

        # Basic folder box
        basicfolderBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Source',
            orientation='vertical',
            addSpace=False,
        )
        basicfolderBoxLine1 = gui.widgetBox(
            widget=basicfolderBox,
            box=False,
            orientation='horizontal',
        )
        gui.lineEdit(
            widget=basicfolderBoxLine1,
            master=self,
            value='rootFolderPath',
            orientation='horizontal',
            label=u'Folder path:',
            labelWidth=101,
            callback=self.add,
            tooltip=(
                u"The path of the folder."
            ),
        )
        gui.separator(widget=basicfolderBoxLine1, width=5)
        gui.button(
            widget=basicfolderBoxLine1,
            master=self,
            label=u'Browse',
            callback=self.browse,
            tooltip=(
                u"Open a dialog for selecting a top folder."
            ),
        )
        #gui.separator(widget=basicfolderBox, width=3)
        #gui.comboBox(
#            widget=basicfolderBox,
#            master=self,
#            value='encoding',
#            items=getPredefinedEncodings(),
#            sendSelectedValue=True,
#            orientation='horizontal',
#            label=u'Encoding:',
#            labelWidth=101,
#            callback=self.sendButton.settingsChanged,
#            tooltip=(
#                u"Select input folder(s) encoding."
#            ),
#        )
        gui.separator(widget=basicfolderBox, width=3)
        self.advancedSettings.basicWidgets.append(basicfolderBox)
        self.advancedSettings.basicWidgetsAppendSeparator()

        # ADVANCED GUI...

        # folder box
        folderBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Sources',
            orientation='vertical',
            addSpace=False,
        )
        folderBoxLine1 = gui.widgetBox(
            widget=folderBox,
            box=False,
            orientation='horizontal',
            addSpace=True,
        )
        self.folderListbox = gui.listBox(
            widget=folderBoxLine1,
            master=self,
            value='selectedfolderLabels',
            labels='folderLabels',
            callback=self.updatefolderBoxButtons,
            tooltip=(
                u"The list of folders whose content will be imported.\n"
                u"\nIn the output segmentation, the content of each\n"
                u"folder appears in the same position as in the list.\n"
                u"\nColumn 1 shows the folder's name.\n"
                u"Column 2 shows the folder's annotation (if any).\n"
                u"Column 3 shows the folder's encoding."
            ),
        )
        font = QFont()
        font.setFamily('Courier')
        font.setStyleHint(QFont.Courier)
        font.setPixelSize(12)
        self.folderListbox.setFont(font)
        folderBoxCol2 = gui.widgetBox(
            widget=folderBoxLine1,
            orientation='vertical',
        )
        self.moveUpButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Move Up',
            callback=self.moveUp,
            tooltip=(
                u"Move the selected folder upward in the list."
            ),
        )
        self.moveDownButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Move Down',
            callback=self.moveDown,
            tooltip=(
                u"Move the selected folder downward in the list."
            ),
        )
        self.removeButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Remove',
            callback=self.remove,
            tooltip=(
                u"Remove the selected folder from the list."
            ),
        )
        self.clearAllButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Clear All',
            callback=self.clearAll,
            tooltip=(
                u"Remove all folders from the list."
            ),
        )
        self.exportButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Export List',
            callback=self.exportList,
            tooltip=(
                u"Open a dialog for selecting a folder where the folder\n"
                u"list can be exported in JSON format."
            ),
        )
        self.importButton = gui.button(
            widget=folderBoxCol2,
            master=self,
            label=u'Import List',
            callback=self.importList,
            tooltip=(
                u"Open a dialog for selecting a folder list to\n"
                u"import (in JSON format). folders from this list\n"
                u"will be added to those already imported."
            ),
        )
        folderBoxLine2 = gui.widgetBox(
            widget=folderBox,
            box=False,
            orientation='vertical',
        )
        # Add folder box
        addfolderBox = gui.widgetBox(
            widget=folderBoxLine2,
            box=True,
            orientation='vertical',
        )
        addfolderBoxLine1 = gui.widgetBox(
            widget=addfolderBox,
            orientation='horizontal',
        )
        # Folder path input
        gui.lineEdit(
            widget=addfolderBoxLine1,
            master=self,
            value='rootFolderPath',
            orientation='horizontal',
            label=u'Folder path:',
            labelWidth=101,
            callback=self.updateGUI,
            tooltip=(
                u"The paths of the folders that will be added to the\n"
                u"list when button 'Add' is clicked.\n\n"
                u"Successive paths must be separated with ' / ' \n"
                u"(whitespace + slash + whitespace). Their order in\n"
                u"the list will be the same as in this field."
            ),
        )
        gui.separator(widget=addfolderBoxLine1, width=5)
        # Button Browse
        gui.button(
            widget=addfolderBoxLine1,
            master=self,
            label=u'Browse',
            callback=self.browse,
            tooltip=(
                u"Open a dialog for selecting a top folder.\n\n"
                u"Selected folder paths will appear in the field to\n"
                u"the left of this button afterwards, ready to be\n"
                u"added to the list when button 'Add' is clicked."
            ),
        )
        gui.separator(widget=addfolderBox, width=10)

        # Filter choice to include only certain files or to exclude files
        # ------------
        # self.applyInclusion = False  à mettre dans le init
        # gui.checkbox()
        # callback = lambda t=self.applyInclusion : includeLineEdit.setDisabled(not t)
        # includeLineEdit = gui.lineEdit()
        # ------------

        # Filter box to input include only
        gui.separator(widget=addfolderBox, width=3)
        includeBoxLine1 = gui.widgetBox(
            widget=addfolderBox,
            box=False,
            orientation='horizontal',
        )

        # Include only box
        gui.checkBox(
            widget=includeBoxLine1,
            master=self,
            value='applyInclusion',
            label=u'Include only',
            labelWidth=100,
            callback = lambda: includeLineEdit.setDisabled(not self.applyInclusion),
            tooltip=(
                u"Choose the inclusion"
            ),
        )
        includeLineEdit = gui.lineEdit(
            widget=includeBoxLine1,
            master=self,
            value='inclusionsUser',
            orientation='horizontal',
            label=u'',
            disabled = True,
            labelWidth=101,
            tooltip=(
                u"This field lets you specify a custom filter\n"
                u"to select the folders to be\n"
                u"added to the list."
            ),
        )

        # Filter box to exclude only
        gui.separator(widget=addfolderBox, width=3)
        excludeBoxLine1 = gui.widgetBox(
            widget=addfolderBox,
            box=False,
            orientation='horizontal',
        )
        # Exclude only box
        gui.checkBox(
            widget=excludeBoxLine1,
            master=self,
            value='applyExclusion',
            label=u'Exclude',
            labelWidth=100,
            disabled = False,
            callback = lambda: includeLineEdit2.setDisabled(not self.applyExclusion),
            tooltip=(
                u"Exclude the inclusion"
            ),
        )
        includeLineEdit2=gui.lineEdit(
            widget=excludeBoxLine1,
            master=self,
            value='exclusionsUser',
            orientation='horizontal',
            label=u'',
            disabled = True,
            labelWidth=101,
            tooltip=(
                u"This field lets you specify a custom filter\n"
                u"to select the folders to be\n"
                u"added to the list."
            ),
        )

        # Sampling box to input the level of sampling
        gui.separator(widget=addfolderBox, width=3)
        samplingBoxLine1 = gui.widgetBox(
            widget=addfolderBox,
            box=False,
            orientation='horizontal',
        )
        # Check box for sampling
        gui.checkBox(
            widget=samplingBoxLine1,
            master=self,
            value='applySampling',
            label=u'Sampling',
            labelWidth=100,
            disabled = False,
            callback = lambda: samplingSpin.setDisabled(not self.applySampling),
            tooltip=(
                u"Choose the sampling level"
            ),
        )
        # Box to input the level of samplig, spin minv = 10 and maxv = 100

        # self.importFilenamesKeyLineEdit = gui.spin(

        samplingSpin = gui.spin(
            widget=samplingBoxLine1,
            master=self,
            value='samplingRate',
            minv = 10,
            maxv = 100,
            labelWidth=50,
            orientation='horizontal',
            tooltip=(
                u"sampling level"
            ),
        )
        gui.separator(widget=addfolderBox, width=3)
        self.addButton = gui.button(
            widget=addfolderBox,
            master=self,
            label=u'Add',
            callback=self.add,
            tooltip=(
                u"Add the folder(s) currently displayed in the\n"
                u"'folders' text field to the list.\n\n"
                u"Each of these folders will be associated with the\n"
                u"specified encoding and annotation (if any).\n\n"
                u"Other folders may be selected afterwards and\n"
                u"assigned a different encoding and annotation."
            ),
        )
        self.advancedSettings.advancedWidgets.append(folderBox)
        self.advancedSettings.advancedWidgetsAppendSeparator()

        # Options box...
        optionsBox = gui.widgetBox(
            widget=self.controlArea,
            box=u'Options',
            orientation='vertical',
            addSpace=False,
        )
        optionsBoxLine1 = gui.widgetBox(
            widget=optionsBox,
            box=False,
            orientation='horizontal',
        )
#        gui.checkBox(
#           widget=optionsBoxLine1,
#            master=self,
#           value='importFilenames',
#            label=u'Import folder names with key:',
#           labelWidth=180,
#            callback=self.sendButton.settingsChanged,
#            tooltip=(
#                u"Import folder names as annotations."
#           ),
#        )
#        self.importFilenamesKeyLineEdit = gui.lineEdit(
#            widget=optionsBoxLine1,
#            master=self,
#            value='importFilenamesKey',
#            orientation='horizontal',
#            callback=self.sendButton.settingsChanged,
#            tooltip=(
#                u"Annotation key for importing folder names."
#            ),
#        )
        gui.separator(widget=optionsBox, width=3)
        optionsBoxLine2 = gui.widgetBox(
            widget=optionsBox,
            box=False,
            orientation='horizontal',
        )
        gui.checkBox(
            widget=optionsBoxLine2,
            master=self,
            value='autoNumber',
            label=u'Auto-number with key:',
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"Annotate folders with increasing numeric indices."
            ),
        )
        self.autoNumberKeyLineEdit = gui.lineEdit(
            widget=optionsBoxLine2,
            master=self,
            value='autoNumberKey',
            orientation='horizontal',
            callback=self.sendButton.settingsChanged,
            tooltip=(
                u"Annotation key for folder auto-numbering."
            ),
        )
        gui.separator(widget=optionsBox, width=3)
        self.advancedSettings.advancedWidgets.append(optionsBox)
        self.advancedSettings.advancedWidgetsAppendSeparator()

        gui.rubber(self.controlArea)

        # Send button...
        self.sendButton.draw()

        # Info box...
        self.infoBox.draw()

        self.adjustSizeWithTimer()
        QTimer.singleShot(0, self.sendButton.sendIf)

    def inputMessage(self, message):
        """Handle JSON message on input connection.

        The message content is expected to be a JSON list of objects, each
        with non-empty 'path' and 'encoding' values (a single object is
        treated as a one-element list). Every listed path is registered
        through add(), so that the resulting entries of self.folders have the
        same dict structure as those created from the GUI. If the message is
        not valid JSON, or if any entry is invalid, an error is displayed,
        None is sent on the 'Text data' channel and no folder is added.

        The 'encoding' value is only validated, it is not used by this
        method.
        """
        if not message:
            return
        self.displayAdvancedSettings = True
        self.advancedSettings.setVisible(True)
        self.clearAll()
        self.infoBox.inputChanged()
        try:
            json_data = json.loads(message.content)
        except ValueError:
            self.infoBox.setText(
                u"Please make sure that incoming message is valid JSON.",
                'error'
            )
            self.send('Text data', None, self)
            return

        if not isinstance(json_data, list):
            json_data = [json_data]

        # Validate every entry before touching self.folders, so that an
        # invalid message never results in a partial import...
        paths = list()
        for entry in json_data:
            if isinstance(entry, dict):
                path = entry.get('path', '')
                encoding = entry.get('encoding', '')
            else:
                path = encoding = ''
            if not isinstance(path, str) or path == '' or encoding == '':
                self.infoBox.setText(
                    u"Please verify keys and values of incoming "
                    u"JSON message.",
                    'error'
                )
                self.send('Text data', None, self)
                return
            paths.append(path)

        # add() works on self.rootFolderPath; the value currently entered by
        # the user is restored once all folders have been registered...
        previousRootFolderPath = self.rootFolderPath
        try:
            for path in paths:
                self.rootFolderPath = os.path.normpath(path)
                self.add()
        finally:
            self.rootFolderPath = previousRootFolderPath
        self.sendButton.settingsChanged()

    def sendData(self):
        """Create one Input per imported file and send the segmentation.

        Nothing but None is sent (with a warning in the info box) when no
        folder has been selected, or when auto-numbering is requested without
        an annotation key. Otherwise an annotation dict is built for every
        file of every entry of self.folders, an Input is created for every
        element of self.fileContents, and the result (a single Input, or the
        concatenation of all Inputs) is sent on the 'Text data' channel.
        """

        # Check that there's something on input...
        if (
            (self.displayAdvancedSettings and not self.folders) or
            not (self.rootFolderPath or self.displayAdvancedSettings)
        ):
            self.infoBox.setText(u'Please select input folder.', 'warning')
            self.send('Text data', None, self)
            return

        # Check that autoNumberKey is not empty (if necessary)...
        if self.displayAdvancedSettings and self.autoNumber:
            if self.autoNumberKey:
                autoNumberKey = self.autoNumberKey
            else:
                self.infoBox.setText(
                    u'Please enter an annotation key for auto-numbering.',
                    'warning'
                )
                self.send('Text data', None, self)
                return
        else:
            autoNumberKey = None

        # Clear created Inputs...
        self.clearCreatedInputs()

        progressBar = gui.ProgressBar(
            self,
            iterations=len(self.folders) if self.displayAdvancedSettings else 1
        )

        try:
            fileContents = self.fileContents

            # Annotations...
            annotations = list()
            counter = 1
            for myFolder in self.folders:
                for myFile in myFolder['fileList']:
                    annotation = dict()

                    if self.importFileNameKey:
                        annotation[self.importFileNameKey] = myFile['fileName']

                    if self.importFolderNameKey:
                        annotation[self.importFolderNameKey] = (
                            myFile['folderName']
                        )

                    if self.FolderDepth1Key:
                        annotation[self.FolderDepth1Key] = myFile['depth1']

                    if self.FolderDepth2Key:
                        annotation[self.FolderDepth2Key] = myFile['depth2']

                    if self.FolderDepthLvl:
                        annotation[self.FolderDepthLvl] = myFile['depthLvl']

                    # Apply the auto-numbering option validated above...
                    if autoNumberKey:
                        annotation[autoNumberKey] = counter
                        counter += 1

                    annotations.append(annotation)

            # Create an LTTL.Input for each file...
            # NOTE(review): contents and annotations are paired by index
            # only; this assumes that self.fileContents and the files listed
            # in self.folders are in the same order -- confirm against
            # getFileList(), openFileList() and sampleFileList().
            if len(fileContents) == 1:
                label = self.captionTitle
            else:
                label = None
            for index, fileContent in enumerate(fileContents):
                myInput = Input(fileContent, label)
                segment = myInput[0]
                segment.annotations.update(annotations[index])
                myInput[0] = segment
                self.createdInputs.append(myInput)

            # If there's only one file, the widget's output is the created
            # Input.
            if len(fileContents) == 1:
                self.segmentation = self.createdInputs[0]

            # Otherwise the widget's output is a concatenation...
            else:
                self.segmentation = Segmenter.concatenate(
                    segmentations=self.createdInputs,
                    label=self.captionTitle,
                    copy_annotations=True,
                    import_labels_as=None,
                    sort=False,
                    auto_number_as=None,
                    merge_duplicates=False,
                    progress_callback=None,
                )
            message = u'%i segment@p sent to output ' % len(self.segmentation)
            message = pluralize(message, len(self.segmentation))
            numChars = 0
            for segment in self.segmentation:
                segmentLength = len(Segmentation.get_data(segment.str_index))
                numChars += segmentLength
            message += u'(%i character@p).' % numChars
            message = pluralize(message, numChars)
            self.infoBox.setText(message)
        finally:
            # Make sure the progress bar is dismissed even if an error
            # occurred while building the segmentation...
            progressBar.finish()

        self.send('Text data', self.segmentation, self)
        self.sendButton.resetSettingsChangedFlag()

    def clearCreatedInputs(self):
        """Reset the stored data of every created Input and empty the list.

        For each element of self.createdInputs, the data registered under the
        string index of its first segment is set to None; the list itself is
        then emptied in place.
        """
        for createdInput in self.createdInputs:
            firstSegment = createdInput[0]
            Segmentation.set_data(firstSegment.str_index, None)
        del self.createdInputs[:]

    def importList(self):
        """Display a file dialog and import a folder list in JSON format.

        The selected file is expected to contain a JSON list of objects, each
        with non-empty 'path' and 'encoding' values (a single object is
        treated as a one-element list). Every listed path is registered
        through add(), so that the resulting entries of self.folders have the
        same dict structure as those created from the GUI. A warning dialog
        is displayed, and nothing is imported, if the file cannot be read, is
        not valid JSON, or contains an invalid entry.

        The 'encoding' value is only validated, it is not used by this
        method.
        """
        folderPath = QFileDialog.getOpenFileName(
            self,
            u'Import folder List',
            self.lastLocation,
            u'Text folders (*)'
        )
        # Depending on the Qt binding, the dialog returns either the selected
        # path or a (path, selected filter) tuple...
        if isinstance(folderPath, tuple):
            folderPath = folderPath[0]
        if not folderPath:
            return
        self.lastLocation = os.path.dirname(folderPath)
        self.error()
        try:
            with codecs.open(folderPath, encoding='utf8') as folderHandle:
                folderContent = folderHandle.read()
        except (IOError, UnicodeDecodeError):
            QMessageBox.warning(
                None,
                'Textable',
                "Couldn't open folder.",
                QMessageBox.Ok
            )
            return
        try:
            json_data = json.loads(folderContent)
        except ValueError:
            QMessageBox.warning(
                None,
                'Textable',
                "JSON parsing error.",
                QMessageBox.Ok
            )
            return

        if not isinstance(json_data, list):
            json_data = [json_data]

        # Validate every entry before touching self.folders, so that an
        # invalid file never results in a partial import...
        paths = list()
        for entry in json_data:
            if isinstance(entry, dict):
                path = entry.get('path', '')
                encoding = entry.get('encoding', '')
            else:
                path = encoding = ''
            if not isinstance(path, str) or path == '' or encoding == '':
                QMessageBox.warning(
                    None,
                    'Textable',
                    "Selected JSON folder doesn't have the right keys "
                    "and/or values.",
                    QMessageBox.Ok
                )
                return
            paths.append(path)

        # add() works on self.rootFolderPath; the value currently entered by
        # the user is restored once all folders have been registered...
        previousRootFolderPath = self.rootFolderPath
        try:
            for path in paths:
                self.rootFolderPath = os.path.normpath(path)
                self.add()
        finally:
            self.rootFolderPath = previousRootFolderPath
        if paths:
            self.sendButton.settingsChanged()

    def exportList(self):
        """Display a file dialog and export the folder list in JSON format.

        One JSON object is written per entry of self.folders, with the
        entry's root path under key 'path'. Key 'encoding' is always set to
        '(auto-detect)', since folder entries carry no encoding.

        NOTE(review): the inclusion/exclusion filters and the sampling rate
        of each folder are not exported.
        """
        toDump = [
            {
                'path': myFolder['rootPath'],
                'encoding': u'(auto-detect)',
            }
            for myFolder in self.folders
        ]
        folderPath = QFileDialog.getSaveFileName(
            self,
            u'Export folder List',
            self.lastLocation,
        )
        # Depending on the Qt binding, the dialog returns either the selected
        # path or a (path, selected filter) tuple...
        if isinstance(folderPath, tuple):
            folderPath = folderPath[0]

        if folderPath:
            self.lastLocation = os.path.dirname(folderPath)
            # The context manager closes the file even if writing fails...
            with codecs.open(
                folderPath,
                encoding='utf8',
                mode='w',
                errors='xmlcharrefreplace',
            ) as outputfolder:
                outputfolder.write(
                    normalizeCarriageReturns(
                        json.dumps(toDump, sort_keys=True, indent=4)
                    )
                )
            QMessageBox.information(
                None,
                'Textable',
                'folder list correctly exported',
                QMessageBox.Ok
            )

    def getFileList(self):
        """Walk the root folder and build the list of files to import.

        Every file found under self.rootFolderPath whose name ends with one
        of the extensions in self.inclusionList is described by a dict with
        keys 'absoluteFilePath', 'fileName', 'folderName', 'depthLvl' and
        'depth1' to 'depth4' (names of the successive subfolders, "0" when
        there is none at that level). The user-defined inclusion and
        exclusion filters are then applied if enabled.

        The result is stored in self.fileList. self.maxDepth is set to the
        largest depth met while walking (0 if no file is retained), and
        openFileList() is called if at least one file is retained.
        Symbolic links get no special treatment.
        """
        # Relative paths are computed from the parent of the root folder, so
        # that their first component is the root folder's own name.
        # os.path.relpath() is used rather than string slicing, which drops
        # characters when the root is a relative path or lies directly under
        # the file system root.
        rootParentPath = (
            os.path.dirname(os.path.normpath(self.rootFolderPath))
            or os.curdir
        )

        # str.endswith() accepts a tuple, so that a file matching several
        # extensions is only listed once.
        extensions = tuple(self.inclusionList)

        fileListExt = list()  # files matching default extensions
        maxDepth = 0

        progressBar = gui.ProgressBar(self, iterations=1)
        try:
            # curr_path is the path to the directory being visited, dirnames
            # the names of its subdirectories and filenames the names of the
            # files it contains.
            for curr_path, dirnames, filenames in os.walk(self.rootFolderPath):
                # Names of the folders browsed from the root folder down to
                # the current one...
                annotations = os.path.relpath(
                    curr_path, rootParentPath
                ).split(os.sep)
                curr_depth = len(annotations)

                for filename in filenames:
                    maxDepth = max(maxDepth, curr_depth)

                    # apply default file extension filter
                    if not filename.endswith(extensions):
                        continue

                    fileInfo = {
                        'absoluteFilePath': os.path.join(curr_path, filename),
                        'fileName': filename,
                        'depthLvl': curr_depth,
                        'folderName': annotations[0],
                    }
                    for i in range(1, curr_depth):
                        fileInfo['depth' + str(i)] = annotations[i]
                    for i in range(curr_depth, 5):
                        fileInfo['depth' + str(i)] = "0"
                    fileListExt.append(fileInfo)

            # apply inclusion filter
            if self.applyInclusion:
                fileListIncl = [
                    fileInfo for fileInfo in fileListExt
                    if self.match(
                        fileInfo['fileName'], self.inclusionsUserAsList
                    )
                ]
            else:
                fileListIncl = fileListExt

            # apply exclusion filter
            if self.applyExclusion:
                fileListExcl = [
                    fileInfo for fileInfo in fileListIncl
                    if not self.match(
                        fileInfo['fileName'], self.exclusionsUserAsList
                    )
                ]
            else:
                fileListExcl = fileListIncl

            # output file list
            self.fileList = fileListExcl

            if self.fileList:
                self.maxDepth = maxDepth
                self.openFileList()
            else:
                self.maxDepth = 0
            progressBar.advance()
        finally:
            # Make sure the progress bar is dismissed even if walking or
            # reading the files failed...
            progressBar.finish()

    # test if file contains one of the patterns in patternList
    def match(self, file, patternList):
        """Return True if at least one element of patternList occurs in file.

        Membership is tested with the `in` operator; an empty patternList
        yields False.
        """
        return any(pattern in file for pattern in patternList)

    def openFileList(self):
        """Read and decode every file of self.fileList.

        The decoded contents are stored in self.fileContents, in the same
        order as the files of self.fileList. For each file, the encoding
        detected by chardet is tried first, then the predefined encodings in
        their original order; the first encoding that decodes the content
        without error is used. If none does, the content is decoded as UTF-8
        with undecodable bytes replaced. Entries of self.fileList that have
        no 'absoluteFilePath' are skipped.

        self.fileContent is left set to the content of the last file read.
        """
        self.fileContents = list()
        predefinedEncodings = list(getPredefinedEncodings())

        for file in self.fileList:
            try:
                file_path = file['absoluteFilePath']
            except (TypeError, KeyError):
                # Not a valid file description: nothing to read...
                continue

            with open(file_path, 'rb') as opened_file:
                rawContent = opened_file.read()

            detected_encoding = chardet.detect(rawContent)['encoding']

            # Detected encoding first (if any), then every other predefined
            # encoding...
            candidates = [
                encoding for encoding in (
                    [detected_encoding] + [
                        encoding for encoding in predefinedEncodings
                        if encoding != detected_encoding
                    ]
                )
                if encoding
            ]

            for encoding in candidates:
                try:
                    decodedContent = rawContent.decode(encoding)
                except (UnicodeError, LookupError):
                    # Wrong or unknown encoding: try the next one...
                    continue
                # Stop at the first encoding that works, so that the
                # detected encoding takes precedence over the others...
                break
            else:
                decodedContent = rawContent.decode('utf-8', errors='replace')

            self.fileContent = decodedContent
            # NOTE(review): the last element of self.fileContents used to be
            # deleted after this loop, which discarded the content of the
            # last file; every file read is now kept.
            self.fileContents.append(decodedContent)

    def browse(self):
        """Let the user pick a folder and make it the current root folder.

        If the dialog returns an empty value, nothing happens. Otherwise the
        normalized path is stored as both the root folder path and the last
        visited location; when advanced settings are not displayed, the
        folder is immediately registered with add(). The GUI is then
        refreshed.
        """
        selectedPath = QFileDialog.getExistingDirectory(
            self,
            u'Select Folder(s)',
            self.lastLocation,
        )
        if selectedPath:
            normalizedPath = os.path.normpath(selectedPath)
            self.rootFolderPath = normalizedPath
            self.lastLocation = normalizedPath

            # In advanced mode the folder is only added when the user clicks
            # the 'Add' button...
            if not self.displayAdvancedSettings:
                self.add()

            self.updateGUI()

    def moveUp(self):
        """Swap the selected folder with the one just above it.

        Nothing happens when there is no selection or when the first folder
        is selected. Otherwise the two entries of self.folders are swapped,
        the list box item at the new position is selected and the send
        button is notified.
        """
        if not self.selectedfolderLabels:
            return
        position = self.selectedfolderLabels[0]
        if position <= 0:
            return
        above = position - 1
        self.folders[above], self.folders[position] = (
            self.folders[position], self.folders[above]
        )
        self.selectedfolderLabels.listBox.item(above).setSelected(1)
        self.sendButton.settingsChanged()

    def moveDown(self):
        """Swap the selected folder with the one just below it.

        Nothing happens when there is no selection or when the last folder
        is selected. Otherwise the two entries of self.folders are swapped,
        the list box item at the new position is selected and the send
        button is notified.
        """
        if not self.selectedfolderLabels:
            return
        position = self.selectedfolderLabels[0]
        if position >= len(self.folders) - 1:
            return
        below = position + 1
        self.folders[below], self.folders[position] = (
            self.folders[position], self.folders[below]
        )
        self.selectedfolderLabels.listBox.item(below).setSelected(1)
        self.sendButton.settingsChanged()

    def clearAll(self):
        """Remove all folders from the folder list.

        Both self.folders and the current selection are emptied in place
        (the list objects themselves are kept), then the send button is
        notified that settings have changed.
        """
        del self.folders[:]
        # NOTE(review): the selection is presumably a list bound to the list
        # box, hence the in-place deletion rather than a rebinding -- confirm.
        del self.selectedfolderLabels[:]
        self.sendButton.settingsChanged()

    def remove(self):
        """Remove the selected folder from the folder list.

        Nothing happens when there is no selection. Otherwise the entry of
        self.folders at the first selected index is removed, the selection is
        emptied in place and the send button is notified that settings have
        changed.
        """
        if self.selectedfolderLabels:
            # Only the first selected index is taken into account...
            index = self.selectedfolderLabels[0]
            self.folders.pop(index)
            del self.selectedfolderLabels[:]
            self.sendButton.settingsChanged()

    def add(self):
        """Register the current root folder in the folder list.

        The comma-separated inclusion and exclusion filters entered by the
        user are split into lists of non-empty, stripped patterns, the file
        list is built with getFileList() and sampled with sampleFileList().
        A dict describing the folder (root path, maximum depth, filters,
        sampling rate and sampled file list) is then appended to
        self.folders and the send button is notified.
        """

        def splitPatterns(rawPatterns):
            # Split on commas, strip whitespace and drop empty items...
            stripped = (item.strip() for item in rawPatterns.split(","))
            return [item for item in stripped if item]

        self.inclusionsUserAsList = splitPatterns(self.inclusionsUser)
        self.exclusionsUserAsList = splitPatterns(self.exclusionsUser)

        self.getFileList()
        # display the list of files
        print("Files: ", [f['fileName'] for f in self.fileList])

        sampledFiles = self.sampleFileList()
        # display the list of sampled files
        print("Files after sampling: ", [f['fileName'] for f in sampledFiles])

        folderDescription = dict()
        folderDescription['rootPath'] = self.rootFolderPath
        folderDescription['maxDepth'] = self.maxDepth
        folderDescription['inclusionsUser'] = self.inclusionsUser
        folderDescription['exclusionsUser'] = self.exclusionsUser
        folderDescription['samplingRate'] = self.samplingRate
        folderDescription['fileList'] = sampledFiles
        self.folders.append(folderDescription)

        self.sendButton.settingsChanged()

    def sampleFileList(self):
        """Return the files of self.fileList retained after sampling.

        When sampling is disabled (self.applySampling is False), a copy of
        the whole list is returned in its original order, in the same way as
        the inclusion and exclusion filters are only applied when their
        checkbox is ticked. Otherwise a random subset is returned, whose size
        is self.samplingRate percent of the list, rounded up. If the instance
        has no applySampling attribute, sampling is applied.

        self.fileList itself is never modified.
        """
        # Work on a copy, so that shuffling below leaves self.fileList
        # untouched...
        myList = list(self.fileList)

        if not getattr(self, 'applySampling', True):
            return myList

        # Proportion of files to keep...
        samplePercentage = self.samplingRate / 100.0

        # Shuffle the list, then keep its first elements...
        random.shuffle(myList)

        # Number of files to keep, rounded up...
        nOfFiles = int(math.ceil(len(myList) * samplePercentage))
        return myList[:nOfFiles]

    def updateGUI(self):
        """Refresh the advanced interface from the widget's state.

        When advanced settings are not displayed, the advanced section is
        simply hidden. Otherwise the labels of the folder list box are
        rebuilt (folder name padded to a common width, followed by depth,
        inclusion filter, exclusion filter and sampling rate), the previous
        selection is restored, the 'Add' button and the auto-number key field
        are enabled or disabled, the folder box buttons are updated and the
        advanced section is shown.
        """
        if not self.displayAdvancedSettings:
            self.advancedSettings.setVisible(False)
            return

        # Remember the selection before the labels are reset...
        cachedLabel = None
        if self.selectedfolderLabels:
            cachedLabel = self.selectedfolderLabels[0]

        del self.folderLabels[:]
        newLabels = []

        if self.folders:
            folderNames = [
                os.path.basename(folder['rootPath'])
                for folder in self.folders
            ]
            # Names are left-aligned in a column two characters wider than
            # the longest one...
            columnWidth = max(len(name) for name in folderNames) + 2

            for folderName, folder in zip(folderNames, self.folders):
                newLabel = folderName.ljust(columnWidth)
                newLabel += "[d]:{" + str(folder['maxDepth']) + "} "
                newLabel += "[i]:{" + folder['inclusionsUser'] + "} "
                newLabel += "[e]:{" + folder['exclusionsUser'] + "} "
                newLabel += "[s]:{" + str(folder['samplingRate']) + "%}"
                newLabels.append(newLabel)

        self.folderLabels = newLabels

        if cachedLabel is not None:
            # The pre-send callback is disabled while the selection is
            # restored, then set back to this method...
            self.sendButton.sendIfPreCallback = None
            self.selectedfolderLabels.listBox.item(
                cachedLabel
            ).setSelected(1)
            self.sendButton.sendIfPreCallback = self.updateGUI

        # 'Add' requires a folder path, and an annotation key and value that
        # are either both filled or both empty...
        canAdd = bool(self.rootFolderPath) and (
            bool(self.newAnnotationKey) == bool(self.newAnnotationValue)
        )
        self.addButton.setDisabled(not canAdd)

        self.autoNumberKeyLineEdit.setDisabled(not self.autoNumber)

        self.updatefolderBoxButtons()
        self.advancedSettings.setVisible(True)

    def updatefolderBoxButtons(self):
        """Enable or disable the buttons next to the folder list.

        'Remove' requires a selection; 'Move Up' and 'Move Down' further
        require that the selected folder is not the first, respectively the
        last one. 'Clear All' and 'Export List' require a non-empty folder
        list.
        """
        selection = self.selectedfolderLabels
        hasSelection = bool(selection)
        canMoveUp = hasSelection and selection[0] > 0
        canMoveDown = hasSelection and selection[0] < len(self.folders) - 1

        self.removeButton.setDisabled(not hasSelection)
        self.moveUpButton.setDisabled(not canMoveUp)
        self.moveDownButton.setDisabled(not canMoveDown)

        noFolder = not len(self.folders)
        self.clearAllButton.setDisabled(noFolder)
        self.exportButton.setDisabled(noFolder)

    def setCaption(self, title):
        """Set the widget's caption and flag settings as changed if needed.

        The parent implementation is always called. The send button is only
        notified when the instance already has a 'captionTitle' attribute
        whose value differs from the new title.
        """
        # The comparison must be made before the parent method is called...
        titleChanged = (
            'captionTitle' in dir(self) and title != self.captionTitle
        )
        super().setCaption(title)
        if titleChanged:
            self.sendButton.settingsChanged()

    def onDeleteWidget(self):
        """Clear the Inputs created by this widget.

        Simply delegates to clearCreatedInputs(); presumably called by the
        framework when the widget is deleted -- confirm.
        """
        self.clearCreatedInputs()
class Charnet(OWTextableBaseWidget):
    """Textable widget for building character networks with Charnet."""

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "Charnet"
    description = "Build character networks with the Charnet package"
    icon = "icons/charnet.svg"
    priority = 21   # TODO

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Text data", Segmentation, "inputData")]
    outputs = [("Character segmentation", Segmentation)]

    #----------------------------------------------------------------------
    # Layout parameters...
    
    want_main_area = False

    #----------------------------------------------------------------------
    # Settings...
    
    # TODO

    #----------------------------------------------------------------------
    # The following lines need to be copied verbatim in every Textable widget...
    
    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0]
    )
    
    def __init__(self):
        """Initialize the widget state and build its control area."""

        super().__init__()

        #----------------------------------------------------------------------
        # Attributes that are not stored as settings...

        self.inputSeg = None
        self.selectedCharacters = []
        self.characters = []
        self.mustLoad = True
        # Use the first installed model, or an empty string when none is
        # installed (checked again at the end of this method).
        self.model = INSTALLED_MODELS[0] if INSTALLED_MODELS else ""

        #----------------------------------------------------------------------
        # The info box and send button (helpers from TextableUtils) are
        # created here but only drawn further below, at their position in
        # the UI...

        self.infoBox = InfoBox(widget=self.controlArea)
        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        #----------------------------------------------------------------------
        # User interface: a single list box bound to self.characters (labels)
        # and self.selectedCharacters (selection)...

        self.characterListbox = gui.listBox(
            widget=self.controlArea,
            master=self,
            value="selectedCharacters",
            labels="characters",
            callback=None,
            tooltip="List of identified characters",
        )

        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw the send button and info box, then show the initial status...

        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input.", "warning")

        # Without any language model the widget cannot work: warn the user.
        if not self.model:
            self.noLanguageModelWarning()

    def inputData(self, newInput):
        """Store the incoming segmentation and refresh the widget.

        With an actual input, the character list is rebuilt and the send
        button is asked to send if appropriate. With None, a warning is
        shown, None is sent on all outputs and the character list is
        emptied.
        """
        self.inputSeg = newInput
        if newInput is not None:
            self.updateCharacterList()
            self.infoBox.inputChanged()
            self.sendButton.sendIf()
        else:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            self.characters = []

    def updateCharacterList(self):
        """Update character list based on Charnet output.

        The contents of all input segments are joined with single spaces
        and passed through charnet; the results are stored in self.char_df
        and self.char_list, and self.characters receives one comma-joined
        label per entry of self.char_list.

        The control area is re-enabled and the progress bar closed even if
        processing raises, so that a failure does not leave the GUI locked.
        """
        if self.mustLoad:
            self.loadModel()
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=4)
        try:
            string = " ".join(
                segment.get_content() for segment in self.inputSeg
            )
            progressBar.advance()
            # TODO progress
            self.char_df = charnet.extract_spacy_df(string, self.nlp)
            progressBar.advance()
            self.char_df = charnet.unify_tags(self.char_df)
            progressBar.advance()
            self.char_list = charnet.concatenate_parents(
                self.char_df, min_occ=1
            )
            self.characters = [", ".join(char) for char in self.char_list]
            progressBar.advance()
        finally:
            progressBar.finish()
            self.controlArea.setDisabled(False)
    
    def loadModel(self):
        """(Re-)load language model if needed.

        Loads the spaCy model into self.nlp and clears self.mustLoad. The
        control area is re-enabled and the progress bar closed even when
        spacy.load() raises (e.g. because the model is not installed), so
        that a failure does not leave the GUI locked.
        """
        self.infoBox.setText(
            u"Loading language model, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=1)
        try:
            # NOTE(review): the model name is hard-coded; the lookup
            # AVAILABLE_MODELS[self.model] was commented out in the
            # original code -- confirm which one is intended.
            self.nlp = spacy.load(
                "en_core_web_sm",
            )
            self.mustLoad = False
            progressBar.advance()
        finally:
            progressBar.finish()
            self.controlArea.setDisabled(False)

    def noLanguageModelWarning(self):
        """Warn user that a spaCy model must be installed and disable GUI."""
        warningText = (
            "Please use the spaCy widget to download a language "
            "model first."
        )
        self.infoBox.setText(warningText, "warning")
        self.controlArea.setDisabled(True)

    def sendNoneToOutputs(self):
        """Send None on each output channel listed in self.outputs."""
        channelNames = [output.name for output in self.outputs]
        for channelName in channelNames:
            self.send(channelName, None, self)

    def sendData(self):
        """Compute result of widget processing and send to output.

        Each row of self.char_df is turned into a Segment whose coordinates
        are relative to the input segment that contains it, and the
        resulting segmentation is sent on "Character segmentation".

        None is sent on all outputs when there is no model or no input.
        The control area is re-enabled and the progress bar closed even if
        processing raises.
        """

        # Check that there's a model...
        if not self.model:
            self.noLanguageModelWarning()
            self.sendNoneToOutputs()
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            return

        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        # Disable control area and initialize progress bar...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.char_df))
        try:
            # Get start and end pos of the input segments within the
            # space-joined string built by updateCharacterList() (hence
            # the +1 for each separator)...
            start_positions = [0]
            end_positions = []
            for idx in range(1, len(self.inputSeg)):
                prev_seg_len = len(self.inputSeg[idx-1].get_content())
                start_positions.append(
                    start_positions[-1] + prev_seg_len + 1
                )
                end_positions.append(start_positions[-1] - 1)
            end_positions.append(
                start_positions[-1]
                + len(self.inputSeg[-1].get_content())
                + 1
            )

            # Initializations...
            char_segments = []
            current_segment_idx = 0

            # For each character token in Charnet's output...
            for _, char_token in self.char_df.iterrows():

                # Get index of containing segment (the index only moves
                # forward from one row to the next)...
                while (
                    char_token["end_pos"]
                    > end_positions[current_segment_idx]
                ):
                    current_segment_idx += 1

                # Create segment for char with its actual coordinates...
                offset = start_positions[current_segment_idx]
                str_index = self.inputSeg[current_segment_idx].str_index
                start = char_token["start_pos"] - offset
                end = char_token["end_pos"] - offset
                char_segments.append(Segment(str_index, start, end))

                progressBar.advance()

            # Send output...
            output_segmentation = Segmentation(
                char_segments,
                label=self.captionTitle,
            )
            self.send("Character segmentation", output_segmentation, self)

            # Set status to OK and report data size...
            message = "%i segment@p sent to output." % len(
                output_segmentation
            )
            message = pluralize(message, len(output_segmentation))
            self.infoBox.setText(message)
        finally:
            # Clear progress bar and unlock the GUI, whatever happened.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()

    #----------------------------------------------------------------------
    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...
    
    def setCaption(self, title):
        """Set the widget caption and report a change to the send button.

        When the instance already has a captionTitle attribute and the new
        title differs from it, self.sendButton.settingsChanged() is called
        after the caption has been updated. The parent implementation is
        called in every case.
        """
        # Evaluate the comparison *before* delegating to the parent class;
        # captionTitle is only read when the attribute is present.
        captionChanged = (
            'captionTitle' in dir(self) and title != self.captionTitle
        )
        super().setCaption(title)
        if captionChanged:
            self.sendButton.settingsChanged()
# Example #14
# 0
class TextSummarizer(OWTextableBaseWidget):
    """Textable widget for summarizing a segment in a selected language."""

    #----------------------------------------------------------------------
    # Widget's metadata...

    name = "TL;DR"
    description = "Summarize texts with spaCy models"
    icon = "icons/TL_DR_icon.svg"
    priority = 21

    #----------------------------------------------------------------------
    # Channel definitions...

    inputs = [("Segmentation", Segmentation, "inputData")]
    outputs = [("Summary", Segmentation, widget.Default),
               ("HTML_Summary", Segmentation)]

    #----------------------------------------------------------------------
    # GUI layout parameters...

    want_main_area = False

    #----------------------------------------------------------------------
    # Settings - defines set values when opening widget

    numSents = settings.Setting(5)
    language = settings.Setting("English")
    typeSeg = settings.Setting("Summarize each segments individually")
    percentage = settings.Setting(20)
    method = settings.Setting("Number of sentences")

    #----------------------------------------------------------------------
    # The following lines need to be copied verbatim in every Textable widget...

    settingsHandler = VersionedSettingsHandler(
        version=__version__.rsplit(".", 1)[0])

    def __init__(self):
        """Initialize the widget state and build its control area.

        Both spin boxes register self.sendButton.settingsChanged as their
        callback (the bound method itself, not the result of calling it),
        like the other controls of this widget, whose callbacks end by
        calling that same method.
        """

        super().__init__()

        #----------------------------------------------------------------------
        # Other (non settings) attributes...

        self.inputSeg = None
        self.outputSeg = None
        self.html_outputSeg = None
        self.nlp = None
        self.cv = None
        if INSTALLED_MODELS:
            self.model = INSTALLED_MODELS[0]
        else:
            self.model = ""

        #----------------------------------------------------------------------
        # Next two instructions are helpers from TextableUtils. Corresponding
        # interface elements are declared here and actually drawn below (at
        # their position in the UI)...

        self.infoBox = InfoBox(widget=self.controlArea)

        self.sendButton = SendButton(
            widget=self.controlArea,
            master=self,
            callback=self.sendData,
            infoBoxAttribute="infoBox",
            sendIfPreCallback=None,
        )

        #----------------------------------------------------------------------
        # User interface...

        self.languageCombo = gui.comboBox(
            widget=self.controlArea,
            master=self,
            value="language",
            items=INSTALLED_MODELS[:],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Input's language:",
            labelWidth=135,
            # Loads the model matching the newly selected language.
            callback=self.languageChanged,
            tooltip=("Please select the text's language.\n"),
        )

        box = gui.widgetBox(self.controlArea, "Language informations:")
        self.infoa = gui.widgetLabel(
            box,
            "More languages are available. \n"
            "To access them, please use the spaCy widget to \n"
            "download the model first."
        )

        # NOTE: the attribute name keeps its historical spelling
        # ("lenght") so that existing references to it stay valid.
        self.lenghtMethodCombo = gui.comboBox(
            widget=self.controlArea,
            master=self,
            value="method",
            items=[
                "Number of sentences",
                "Percentage of input's length",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Define summary's length by:",
            labelWidth=180,
            # Shows numSentsSpin or percentageSpin depending on the method.
            callback=self.summaryGui,
            tooltip=("How do you want to choose the summary's length ?"),
        )

        self.numSentsSpin = gui.spin(
            widget=self.controlArea,
            master=self,
            value='numSents',
            label='Number of sentences : ',
            # Pass the bound method; calling it here would run it once at
            # construction time and register its return value instead.
            callback=self.sendButton.settingsChanged,
            labelWidth=180,
            tooltip=('Select the number of sentences wanted for the summary.'),
            # Initial bound only: maxNumSents() adjusts it to the input.
            maxv=10,
            minv=1,
            step=1,
        )

        self.percentageSpin = gui.spin(
            widget=self.controlArea,
            master=self,
            value='percentage',
            label='Length in %:',
            labelWidth=180,
            callback=self.sendButton.settingsChanged,
            tooltip=
            ('Select the length of the summary in percentage of the input text.'
             ),
            maxv=99,
            minv=1,
            step=1,
        )

        self.segmentBox = gui.comboBox(
            widget=self.controlArea,
            master=self,
            value="typeSeg",
            items=[
                "Summarize all segments as one",
                "Summarize each segments individually",
            ],
            sendSelectedValue=True,
            orientation="horizontal",
            label="Segmentation",
            labelWidth=135,
            callback=self.maxNumSents,
            tooltip=("How should the input segments be summarized ? \n"),
        )

        gui.rubber(self.controlArea)

        #----------------------------------------------------------------------
        # Draw Info box and Send button...

        self.sendButton.draw()
        self.infoBox.draw()
        self.infoBox.setText("Widget needs input.", "warning")

        # Check that there's a model and if not call noLanguageModelWarning()
        if not self.model:
            self.noLanguageModelWarning()

    #----------------------------------------------------------------------------

    def inputData(self, segmentation):
        """Process incoming data.

        With None, a warning is shown and None is sent on both outputs.
        Otherwise the default (English) model is loaded if no model has
        been loaded yet, the GUI is adapted to the input and the send
        button is asked to send if appropriate.
        """
        self.inputSeg = segmentation
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.send('Summary', None, self)
            self.send('HTML_Summary', None, self)
            return

        # Load the default language model only when none is loaded yet:
        # reloading it on every input would discard the model selected
        # through languageChanged() (and reload English needlessly).
        if self.nlp is None or self.cv is None:
            self.cv = self.loadModelEN()

        # Set max number of sentences of the summary
        self.maxNumSents()
        # Show percentageSpin OR numSentsSpin
        self.summaryGui()
        # Show or hide segmentBox
        self.segmentBoxState()
        self.infoBox.inputChanged()
        self.sendButton.sendIf()

    def noLanguageModelWarning(self):
        """Warn user that a spaCy model must be installed and disable GUI."""
        warningText = (
            "Please use the spaCy widget to download a language "
            "model first."
        )
        self.infoBox.setText(warningText, "warning")
        self.controlArea.setDisabled(True)

    def maxNumSents(self):
        """Set the maximum of numSentsSpin according to inputSeg.

        The maximum is the total number of sentences when all segments are
        summarized as one, and the sentence count of the shortest segment
        otherwise. It is never set below 1, the spin box's minimum.

        This method is also the callback of segmentBox, so it may run
        before any input or model is available; in that case only the
        settings change is reported.
        """
        if self.typeSeg == "Summarize all segments as one":
            fusionStrategy = sum
        else:
            fusionStrategy = min
        self.sendButton.settingsChanged()

        if self.inputSeg is None or self.nlp is None:
            return

        sentenceCounts = [
            len(list(self.nlp(seg.get_content()).sents))
            for seg in self.inputSeg
        ]
        # min() raises on an empty sequence: nothing to adjust then.
        if not sentenceCounts:
            return
        self.numSentsSpin.setMaximum(max(1, fusionStrategy(sentenceCounts)))

    def languageChanged(self):
        """Load the appropriate model according to user choice.

        self.language is mapped to the matching loadModelXX() method, whose
        return value is stored in self.cv. An unknown language leaves the
        current model untouched. The control area is re-enabled and the
        progress bar closed even if loading raises.
        """

        self.infoBox.setText(
            u"Loading model, please wait...",
            "warning",
        )

        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=1)

        loaders = {
            "French": self.loadModelFR,
            "English": self.loadModelEN,
            "Portuguese": self.loadModelPT,
            "Dutch": self.loadModelNL,
            "German": self.loadModelDE,
            "Greek": self.loadModelEL,
            "Italian": self.loadModelIT,
            "Lithuanian": self.loadModelLT,
            "Norwegian": self.loadModelNB,
            "Spanish": self.loadModelES,
        }

        try:
            loader = loaders.get(self.language)
            if loader is not None:
                self.cv = loader()
            progressBar.advance()
        finally:
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.settingsChanged()

    def summaryGui(self):
        """Show the spin box (and label) matching the selected method.

        "Number of sentences" shows numSentsSpin and hides percentageSpin;
        "Percentage of input's length" does the opposite; any other value
        leaves visibility untouched. A settings change is reported in all
        cases.
        """
        if self.method == "Number of sentences":
            showPercentage = False
        elif self.method == "Percentage of input's length":
            showPercentage = True
        else:
            showPercentage = None

        if showPercentage is not None:
            self.percentageSpin.setVisible(showPercentage)
            self.percentageSpin.label.setVisible(showPercentage)
            self.numSentsSpin.setVisible(not showPercentage)
            self.numSentsSpin.label.setVisible(not showPercentage)

        self.sendButton.settingsChanged()

    def segmentBoxState(self):
        """Show segmentBox for multi-segment input, hide it for one segment.

        An input without any segment leaves the visibility unchanged.
        """
        segmentCount = len(self.inputSeg)
        if segmentCount < 1:
            return
        severalSegments = segmentCount > 1
        self.segmentBox.setVisible(severalSegments)
        self.segmentBox.label.setVisible(severalSegments)

    ################################################################
    # Called when send button is clicked
    ################################################################

    def sendData(self):
        """Compute result of widget processing and send to output.

        Depending on self.typeSeg, either each input segment is summarized
        on its own or the space-joined content of all segments is
        summarized once. One segment per summary is sent on "Summary" and
        one per highlighted text on "HTML_Summary".

        The control area is re-enabled even if summarizing raises, so that
        a failure does not leave the GUI locked.
        """

        # Check that there's a model
        if not self.model:
            self.noLanguageModelWarning()
            return

        # Check that there's an input
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.send('Summary', None, self)
            self.send('HTML_Summary', None, self)
            return

        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        self.controlArea.setDisabled(True)
        try:
            # Texts to summarize (per segment or for the whole input)
            if self.typeSeg == "Summarize each segments individually":
                contents = [
                    segment.get_content() for segment in self.inputSeg
                ]
            elif self.typeSeg == "Summarize all segments as one":
                contents = [
                    " ".join(
                        segment.get_content() for segment in self.inputSeg
                    )
                ]
            else:
                contents = []

            segments = []
            html_segments = []
            for content in contents:
                resume, html_resume = self.summarize(self.cv, content)
                segments.append(Segment(str_index=resume[0].str_index))
                html_segments.append(
                    Segment(str_index=html_resume[0].str_index)
                )

            # Create segmentations and assign them to the outputs
            self.outputSeg = Segmentation(segments, self.captionTitle)
            self.html_outputSeg = Segmentation(
                html_segments, self.captionTitle
            )

            # Send segmentations to output channels
            self.send("Summary", self.outputSeg, self)
            self.send('HTML_Summary', self.html_outputSeg, self)

            # Report data size
            message = "%i segment@p sent to output " % len(self.outputSeg)
            message = pluralize(message, len(self.outputSeg))
            self.infoBox.setText(message)
        finally:
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()

    ################################################################
    # Main function
    ################################################################

    def summarize(self, cv, content):
        """Build an extractive summary of content.

        Each sentence found by self.nlp is scored as the sum of the
        relative frequencies of its words known to the vectorizer, divided
        by its number of tokens. The best-scoring sentences are kept, in
        document order; on equal scores, earlier sentences win.

        The number of kept sentences is self.numSents, or self.percentage
        percent of the scored sentences when the percentage method is
        selected; at least one sentence is always kept.

        Args:
            cv: vectorizer used to count words; self.cv is used when None.
            content: text to summarize.

        Returns:
            A pair of Input objects: the summary, and an HTML document of
            the whole text in which the summary sentences are highlighted.
        """
        vectorizer = self.cv if cv is None else cv
        progressBar = ProgressBar(self, iterations=3)
        try:
            doc = self.nlp(content)
            sentences = list(doc.sents)

            corpus = [sent.text.lower() for sent in sentences]
            cv_fit = vectorizer.fit_transform(corpus)

            # Count unique words and how many times they appear.
            # get_feature_names() no longer exists in recent scikit-learn
            # releases; older ones lack get_feature_names_out().
            try:
                word_list = vectorizer.get_feature_names_out()
            except AttributeError:
                word_list = vectorizer.get_feature_names()
            count_list = cv_fit.toarray().sum(axis=0)

            # Frequency of each word relative to the most frequent one
            higher_frequency = count_list.max()
            word_frequency = {
                word: count / higher_frequency
                for word, count in zip(word_list, count_list)
            }

            progressBar.advance()

            # Score of each sentence (by index) having at least one known
            # word, normalized by the sentence's number of tokens
            sentence_rank = {}
            for index, sent in enumerate(sentences):
                weights = [
                    word_frequency[token.text.lower()]
                    for token in sent
                    if token.text.lower() in word_frequency
                ]
                if weights:
                    sentence_rank[index] = sum(weights) / len(sent)

            progressBar.advance()

            # How many sentences to keep, depending on the chosen method.
            # The second spelling is the one this method used to test for;
            # it is still accepted in case a caller relies on it.
            percentage_methods = (
                "Percentage of input's length",
                "Percentage of text lenght",
            )
            if self.method in percentage_methods:
                limit = int(
                    round(self.percentage * len(sentence_rank) / 100)
                )
            else:
                limit = self.numSents
            limit = max(1, limit)

            # Keep exactly the `limit` best sentences (sorted() is stable,
            # so ties are resolved in favour of earlier sentences)
            ranked = sorted(
                sentence_rank, key=sentence_rank.get, reverse=True
            )
            kept = set(ranked[:limit])

            progressBar.advance()

            # Join kept sentences, in document order, in a single string
            resume = " ".join(str(sentences[i]) for i in sorted(kept))

            # Create HTML resume: whole text, kept sentences highlighted.
            # NOTE(review): sentence text is inserted as is, without HTML
            # escaping -- confirm whether the input may contain markup.
            html_sentences = [
                '<b style=\'color:blue\'>' + str(sent) + '</b>'
                if index in kept else str(sent)
                for index, sent in enumerate(sentences)
            ]
            html_resume = "<!DOCTYPE html>\n<html>\n<body>\n" + " ".join(
                html_sentences) + "\n</body>\n</html>"
        finally:
            progressBar.finish()

        # Create output segmentations from the summaries
        return Input(resume), Input(html_resume)

    ################################################################
    # loadModelEN(), loadModelFR(), loadModelPT(), etc. load the chosen model
    ################################################################

    def _loadSpacyModel(self, modelName, languageCode):
        """Load a spaCy model and build the matching word counter.

        Args:
            modelName: name of the spaCy model, loaded into self.nlp.
            languageCode: code of the spacy.lang sub-package whose
                STOP_WORDS are excluded by the vectorizer.

        Returns:
            A CountVectorizer configured with that language's stop words.
        """
        from importlib import import_module

        self.nlp = spacy.load(modelName)
        stopWordsModule = import_module(
            "spacy.lang.%s.stop_words" % languageCode
        )
        return CountVectorizer(stop_words=list(stopWordsModule.STOP_WORDS))

    def loadModelEN(self):
        """Load the English model; return its stop-word vectorizer."""
        return self._loadSpacyModel("en_core_web_sm", "en")

    def loadModelFR(self):
        """Load the French model; return its stop-word vectorizer."""
        return self._loadSpacyModel("fr_core_news_sm", "fr")

    def loadModelPT(self):
        """Load the Portuguese model; return its stop-word vectorizer."""
        return self._loadSpacyModel("pt_core_news_sm", "pt")

    def loadModelNL(self):
        """Load the Dutch model; return its stop-word vectorizer."""
        return self._loadSpacyModel("nl_core_news_sm", "nl")

    def loadModelDE(self):
        """Load the German model; return its stop-word vectorizer."""
        return self._loadSpacyModel("de_core_news_sm", "de")

    def loadModelEL(self):
        """Load the Greek model; return its stop-word vectorizer."""
        return self._loadSpacyModel("el_core_news_sm", "el")

    def loadModelIT(self):
        """Load the Italian model; return its stop-word vectorizer."""
        return self._loadSpacyModel("it_core_news_sm", "it")

    def loadModelLT(self):
        """Load the Lithuanian model; return its stop-word vectorizer."""
        return self._loadSpacyModel("lt_core_news_sm", "lt")

    def loadModelNB(self):
        """Load the Norwegian model; return its stop-word vectorizer."""
        return self._loadSpacyModel("nb_core_news_sm", "nb")

    def loadModelES(self):
        """Load the Spanish model; return its stop-word vectorizer."""
        return self._loadSpacyModel("es_core_news_sm", "es")

    #--------------------------------------------------------------
    # The following method needs to be copied verbatim in
    # every Textable widget that sends a segmentation...

    def setCaption(self, title):
        """Set the widget caption and report a change to the send button.

        When the instance already has a captionTitle attribute and the new
        title differs from it, self.sendButton.settingsChanged() is called
        after the caption has been updated. The parent implementation is
        called in every case.
        """
        # Evaluate the comparison *before* delegating to the parent class;
        # captionTitle is only read when the attribute is present.
        captionChanged = (
            'captionTitle' in dir(self) and title != self.captionTitle
        )
        super().setCaption(title)
        if captionChanged:
            self.sendButton.settingsChanged()