Exemple #1
0
 def setUp(self):
     """ Setting up for the test """
     self.entire_text_seg = Input('ab cde')
     self.other_entire_text_seg = Input('d')
     str_index = self.entire_text_seg[0].str_index
     self.first_word_seg = Segmentation(
         [
             Segment(
                     str_index=str_index,
                     start=0,
                     end=2,
                     annotations={'a': 1}
             )
         ]
     )
     self.last_word_seg = Segmentation(
         [Segment(str_index=str_index, start=3, end=6)]
     )
     self.char_seg = Segmentation(
         [
             Segment(str_index=str_index, start=0, end=1),
             Segment(str_index=str_index, start=1, end=2),
             Segment(str_index=str_index, start=2, end=3),
             Segment(str_index=str_index, start=3, end=4),
             Segment(str_index=str_index, start=4, end=5),
             Segment(str_index=str_index, start=5, end=6),
         ]
     )
Exemple #2
0
    def setUp(self):
        """ Setting up for the test """
        self.entire_text_seg = Input('ab cde')
        self.str_index = self.entire_text_seg[0].str_index
        self.word_seg = Segmentation([
            Segment(str_index=self.str_index,
                    start=0,
                    end=2,
                    annotations={
                        'a': '1',
                        'bc': '20'
                    }),
            Segment(str_index=self.str_index, start=3, end=6)
        ])
        self.overlapping_seg = Segmentation([
            Segment(str_index=self.str_index, start=3, end=5),
            Segment(str_index=self.str_index, start=4, end=6),
        ])

        self.base_output_string = ('segment number 1\n'
                                   '\tcontent:\t"ab"\n'
                                   '\tstr_index:\t%i\n'
                                   '\tstart:\t0\n'
                                   '\tend:\t2\n'
                                   '\tannotations:\n'
                                   '\t\ta                    1\n'
                                   '\t\tbc                   20\n'
                                   'segment number 2\n'
                                   '\tcontent:\t"cde"\n'
                                   '\tstr_index:\t%i\n'
                                   '\tstart:\t3\n'
                                   '\tend:\t6') % (self.str_index,
                                                   self.str_index)

        self.count = 0
Exemple #3
0
    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Check that there's a model
        if not self.model:
            self.noLanguageModelWarning()
            return

        # Check that there's an input
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.send('Summary', None, self)
            self.send('HTML_Summary', None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        self.controlArea.setDisabled(True)

        # Type of segmentation (per segment or per segmentation)
        segments = list()
        html_segments = list()
        if self.typeSeg == "Summarize each segments individually":
            # Process each segment separately, then create segmentation
            for segment in self.inputSeg:
                content = segment.get_content()
                resume, html_resume = self.summarize(self.cv, content)
                segments.append(Segment(str_index=resume[0].str_index, ))
                html_segments.append(
                    Segment(str_index=html_resume[0].str_index, ))
        elif self.typeSeg == "Summarize all segments as one":
            merged_seg = " ".join(
                [segment.get_content() for segment in self.inputSeg])
            resume, html_resume = self.summarize(self.cv, merged_seg)
            segments.append(Segment(str_index=resume[0].str_index, ))
            html_segments.append(Segment(str_index=html_resume[0].str_index, ))

        # Create segmentation from segment() and assign it to the output
        self.outputSeg = Segmentation(segments, self.captionTitle)
        self.html_outputSeg = Segmentation(html_segments, self.captionTitle)

        # Send segmentation to output channels
        self.send("Summary", self.outputSeg, self)
        self.send('HTML_Summary', self.html_outputSeg, self)

        # Set message to sent
        message = "%i segment@p sent to output " % len(self.outputSeg)
        message = pluralize(message, len(self.outputSeg))
        self.infoBox.setText(message)

        self.sendButton.resetSettingsChangedFlag()
        self.controlArea.setDisabled(False)
Exemple #4
0
    def sendData(self):
        """Compute result of widget processing and send to output"""
        
        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            del self.headerList[:]
            self.headerList = self.headerList
            self.send("CSV Segmentation", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...", 
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))


        # Treat...
        for segment in self.csvSeg:
            
            pass
                        
            progressBar.advance()

                 
        # Set status to OK and report data size...
        outputSeg = Segmentation(self.csvSeg, label=self.captionTitle)
        if len(self.contentIsNone) == 0 :
            message = "%i segment@p sent to output." % len(outputSeg)
            message = pluralize(message, len(outputSeg))
            self.infoBox.setText(message)
        # message if one or more segments has no content and has been ignored
        elif len(self.contentIsNone) == 1:
            message = "%i segment@p sent to output. (ignored %i segment with \
            no content)" % (len(outputSeg), len(self.contentIsNone))
            message = pluralize(message, len(outputSeg))
            self.infoBox.setText(message)
        else :
            message = "%i segment@p sent to output. (ignored %i segments with \
            no content)" % (len(outputSeg), len(self.contentIsNone))
            message = pluralize(message, len(outputSeg))
            self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)
        
        # Send data to output...
        self.send("CSV Segmentation", outputSeg, self)
        
        self.sendButton.resetSettingsChangedFlag()             
    def sendData(self):
        """Compute result of widget processing and send to output"""

        # Clear morphology...
        self.morphology = dict()

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Morphologically analyzed data", None, self)
            self.updateGUI()
            return

        # Perform morphological analysis...

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait (word count)...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=100)

        # Word count...
        wordCounts = collections.Counter(
            [segment.get_content() for segment in self.inputSeg])
        self.morphology["wordCounts"] = wordCounts
        self.infoBox.setText(
            u"Processing, please wait (signature extraction)...",
            "warning",
        )
        progressBar.advance(5)  # 5 ticks on the progress bar...

        # Learn signatures...
        try:
            lxa5crab.crab_nebula.MIN_STEM_LEN = self.minStemLen
            signatures, stems, suffixes = lxa5crab.find_signatures(wordCounts)
            self.morphology["signatures"] = signatures
            self.morphology["stems"] = stems
            self.morphology["suffixes"] = suffixes
        except ValueError as e:
            self.infoBox.setText(e.__str__(), "warning")
            self.send("Morphologically analyzed data", None, self)
            self.controlArea.setDisabled(False)
            progressBar.finish()  # Clear progress bar.
            self.morphology = dict()
            self.updateGUI()
            return
        self.infoBox.setText(
            u"Processing, please wait (word parsing)...",
            "warning",
        )
        progressBar.advance(80)

        # Parse words...
        parser = lxa5crab.build_parser(wordCounts, signatures, stems, suffixes)
        self.morphology["parser"] = parser
        newSegments = list()
        num_analyzed_words = 0
        for segment in self.inputSeg:
            parses = parser[segment.get_content()]
            newSegment = segment.deepcopy()
            if parses[0].signature:
                num_analyzed_words += 1
            newSegment.annotations.update(
                {
                    "stem": parses[0].stem,
                    "suffix": parses[0].suffix  \
                                if len(parses[0].suffix) else "NULL",
                    "signature": parses[0].signature
                }
            )
            newSegments.append(newSegment)
        self.send(
            "Morphologically analyzed data",
            Segmentation(newSegments, self.captionTitle),
            self,
        )
        self.updateGUI()
        progressBar.advance(15)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output (%.2f%% analyzed)." % (len(
            self.inputSeg), (num_analyzed_words / len(self.inputSeg) * 100))
        message = pluralize(message, len(self.inputSeg))
        self.infoBox.setText(message)

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()
Exemple #6
0
    def updateTitleList(self):
        """Update the list of titles"""

        # If titleSeg has not been loaded for some reason, skip.
        if self.titleSeg is None:
            return

        # In Advanced settings mode, get list of selected titles...
        if self.displayAdvancedSettings and self.filterValue != "(all)":
            self.filteredTitleSeg, _ = Segmenter.select(
                segmentation=self.titleSeg,
                regex=re.compile(r"^%s$" % self.filterValue),
                annotation_key=self.filterCriterion,
            )
        else:
            self.filteredTitleSeg = self.titleSeg

        # If criterion is not "genre" and his filter value not "all",
        # group titles with different genres...

        # Create a dictionary with "author" and "title" as key...

        unique_titles = dict()
        for title in self.filteredTitleSeg:
            title_id = (
                title.annotations["author"],
                title.annotations["title"],
            )
            try:
                unique_titles[title_id].append(title)
            except KeyError:
                unique_titles[title_id] = [title]

        # Create a list with new annotation comporting all genres...
        new_title_segments = list()
        for unique_title in unique_titles.values():
            title_genres = list()
            new_title_segments.append(unique_title[0])
            title_genres.append(unique_title[0].annotations["genre"])
            for equivalent_title in unique_title[1:]:
                title_genres.append(equivalent_title.annotations["genre"])
            new_title_segments[-1].annotations["genre"] = ", ".join(
                sorted(list(set(title_genres))))

        self.filteredTitleSeg = Segmentation(None)
        self.filteredTitleSeg.extend(new_title_segments)

        # Populate titleLabels list with the titles...
        self.titleLabels = sorted(
            [s.annotations["title"] for s in self.filteredTitleSeg])

        # Add specification (author, year and genre, depending on criterion)...
        titleLabels = self.titleLabels[:]
        for idx, titleLabel in enumerate(titleLabels):
            specs = list()
            if (self.displayAdvancedSettings == False
                    or self.filterCriterion != "author"
                    or self.filterValue == "(all)"):
                specs.append(self.filteredTitleSeg[idx].annotations["author"])
            if (self.displayAdvancedSettings == False
                    or self.filterCriterion != "genre"
                    or self.filterValue == "(all)"):
                specs.append(self.filteredTitleSeg[idx].annotations["genre"])
            titleLabels[idx] = titleLabel + " (%s)" % "; ".join(specs)
        self.titleLabels = titleLabels

        # Reset selectedTitles if needed...
        if not set(self.importedURLs).issubset(
                set(u.annotations["url"] for u in self.filteredTitleSeg)):
            self.selectedTitles = list()
        else:
            self.selectedTitles = self.selectedTitles

        self.sendButton.settingsChanged()
Exemple #7
0
    def setUp(self):
        """ Setting up for the test """
        self.entire_text_seg = Input('ab cde')
        str_index = self.entire_text_seg[0].str_index
        self.word_seg = Segmentation([
            Segment(str_index=str_index,
                    start=0,
                    end=2,
                    annotations={'a': '1'}),
            Segment(str_index=str_index, start=3, end=6)
        ])
        self.char_seg = Segmentation([
            Segment(str_index=str_index, start=0, end=1),
            Segment(str_index=str_index, start=1, end=2),
            Segment(str_index=str_index, start=2, end=3),
            Segment(str_index=str_index, start=3, end=4),
            Segment(str_index=str_index, start=4, end=5),
            Segment(str_index=str_index, start=5, end=6),
        ])
        self.letter_seg1 = Segmentation([
            Segment(str_index=str_index,
                    start=0,
                    end=1,
                    annotations={'a': '1'}),
            Segment(str_index=str_index, start=1, end=2),
        ])
        self.letter_seg2 = Segmentation([
            Segment(str_index=str_index, start=3, end=4),
            Segment(str_index=str_index,
                    start=4,
                    end=5,
                    annotations={'b': '2'}),
            Segment(str_index=str_index, start=5, end=6),
        ])
        self.letter_seg = Segmentation([
            Segment(str_index=str_index,
                    start=0,
                    end=1,
                    annotations={'a': '1'}),
            Segment(str_index=str_index, start=1, end=2),
            Segment(str_index=str_index, start=3, end=4),
            Segment(str_index=str_index,
                    start=4,
                    end=5,
                    annotations={'b': '2'}),
            Segment(str_index=str_index, start=5, end=6),
        ])
        self.single_letter_seg = Segmentation([
            Segment(str_index=str_index,
                    start=4,
                    end=5,
                    annotations={'b': '1'}),
        ])
        self.duplicate_seg = Segmentation([
            Segment(str_index=str_index, start=0, end=1),
            Segment(str_index=str_index, start=0, end=1),
        ])
        self.overlapping_seg = Segmentation([
            Segment(str_index=str_index, start=3, end=5),
            Segment(str_index=str_index, start=4, end=6),
        ])

        self.other_entire_text_seg = Input('abbccc')
        str_index2 = self.other_entire_text_seg[0].str_index
        self.other_letter_seg = Segmentation([
            Segment(str_index=str_index2,
                    start=0,
                    end=1,
                    annotations={'a': '1'}),
            Segment(str_index=str_index2,
                    start=1,
                    end=2,
                    annotations={'a': '1'}),
            Segment(str_index=str_index2,
                    start=2,
                    end=3,
                    annotations={'a': '1'}),
            Segment(str_index=str_index2,
                    start=3,
                    end=4,
                    annotations={'a': '2'}),
            Segment(str_index=str_index2,
                    start=4,
                    end=5,
                    annotations={'a': '2'}),
            Segment(str_index=str_index2,
                    start=5,
                    end=6,
                    annotations={'a': '3'}),
        ])

        self.third_entire_text_seg = Input('bd1')
        str_index3 = self.third_entire_text_seg[0].str_index
        self.third_letter_seg = Segmentation([
            Segment(str_index=str_index3, start=0, end=1),
            Segment(str_index=str_index3,
                    start=1,
                    end=2,
                    annotations={'a': '2'}),
            Segment(str_index=str_index3,
                    start=2,
                    end=3,
                    annotations={'a': 'b'}),
        ])

        self.fourth_entire_text_seg = Input('AB cd\xe9')
        str_index = self.fourth_entire_text_seg[0].str_index
        self.second_word_seg = Segmentation([
            Segment(str_index=str_index, start=0, end=2),
            Segment(str_index=str_index, start=3, end=6),
        ])

        self.xml_seg = Input('<a attr="1"><a attr="2/3/">c<a/>d</a></a>')
        self.wrong_xml_seg = Input('<a><a>test</a>')
        self.wrong_xml_seg2 = Input('<a>test</a></a>')

        self.part_xml_seg = Input('<a>1<a>2<a>3</a>4')
        str_index3 = self.part_xml_seg[0].str_index
        self.part_xml_seg2 = Input('</a>5</a>')
        str_index4 = self.part_xml_seg2[0].str_index
        self.broken_xml_seg = Segmentation([
            Segment(str_index=str_index3, annotations={'a': '1'}),
            Segment(str_index=str_index4),
        ])

        self.count = 0
Exemple #8
0
    def sendData(self):

        # Clear created Inputs...
        self.clearCreatedInputs()

        if not self.TreetaggerPath:
            self.infoBox.setText(self.noTreetaggerPathWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.getAvailableLanguages():
            self.infoBox.setText(self.noLanguageParameterWarning, "warning")
            self.send("Tagged data", None)
            return
        elif not self.segmentation:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Tagged data", None)
            return

        # Initialize progress bar.
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        self.progressBar = ProgressBar(self, iterations=5)

        # Create a copy of input seg, storing annotations in temp attr...
        copy_of_input_seg = Segmentation()
        copy_of_input_seg.label = self.segmentation.label
        for seg_idx, segment in enumerate(self.segmentation):
            attr = " ".join([
                "%s=%s" % (
                    ''.join(c for c in unicodedata.normalize('NFD', item[0])
                            if unicodedata.category(c) != 'Mn'),
                    quoteattr(str(item[1])),
                ) for item in segment.annotations.items()
            ])

            segment.annotations["tt_ax"] = attr
            copy_of_input_seg.append(segment)

        self.progressBar.advance()

        # Dump segmentation in unique string to avoid multiple calls to TT...
        concatenated_text = copy_of_input_seg.to_string(
            formatting="<ax_tt %(tt_ax)s>%(__content__)s</ax_tt>",
            display_all=True,
        )

        self.progressBar.advance()

        # Tag the segmentation contents...
        tagopt = '-token -lemma -sgml -quiet'
        if self.replaceUnknown:
            tagopt += " -no-unknown"
        tagger = treetaggerwrapper.TreeTagger(
            TAGLANG=pycountry.languages.get(name=self.language).alpha_2,
            TAGOPT=tagopt,
            TAGDIR=self.TreetaggerPath,
        )
        tagged_lines = tagger.tag_text(
            concatenated_text,
            notagurl=True,
            notagemail=True,
            notagip=True,
            notagdns=True,
        )
        tagged_input = Input("\n".join(tagged_lines))
        self.createdInputs.append(tagged_input)

        # Replace <unknown> with [unknown] and " with &quot; then
        # re-segment to match the original segmentation structure.
        tagged_segmentation, _ = Segmenter.recode(
            tagged_input,
            substitutions=[
                (re.compile(r"<unknown>"), "[unknown]"),
                (re.compile(r'"""'), '"&quot;"'),
            ],
        )
        tagged_segmentation = Segmenter.import_xml(tagged_segmentation,
                                                   "ax_tt")

        self.progressBar.advance()

        # Place each output line of Treetagger in an xml tag with annotations..
        xml_segmentation, _ = Segmenter.recode(
            tagged_segmentation,
            substitutions=[
                (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                 '<w lemma="&3" pos-tag="&2">&1</w>'),
                (re.compile(r'^\n|\n$'), ''),
            ],
        )
        # Segment into individual tokens if XML output option is disabled...
        if self.outputFormat == "add XML tags":
            output_segmentation = xml_segmentation
        else:
            try:
                output_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w")
            except ValueError:
                self.infoBox.setText(
                    "Please check that either the input contains well-formed "
                    "XML, or it doesn't contain instances of '&#60;' and '\x3e'",
                    "error")
                self.send("Tagged data", None)
                self.progressBar.finish()
                self.controlArea.setDisabled(False)
                return

        self.progressBar.finish()
        self.controlArea.setDisabled(False)

        output_segmentation.label = self.captionTitle
        message = u'%i segment@p sent to output.' % len(output_segmentation)
        message = pluralize(message, len(output_segmentation))
        self.infoBox.setText(message)
        self.send('Tagged data', output_segmentation, self)
        self.sendButton.resetSettingsChangedFlag()
Exemple #9
0
 def test_creator(self):
     """Does creator return Segmentation object?"""
     self.assertIsInstance(
         Segmentation(),
         Segmentation,
         msg="creator doesn't return Segmentation object!")
    def sendData(self):

        # Si le lien vers treetagger n"est pas trouve
        if self.NoLink:
            self.infoBox.setText(u"Sorry, TreeTagger's link not found.",
                                 "error")
            self.send("Text data", None)
        # Important: if input data is None, propagate this value to output...
        elif not self.inputData:
            self.infoBox.setText(u"Widget needs input", "warning")
            self.send("Text data", None)
        # affiche que quelque chose se passe...
        else:
            self.infoBox.setText(u"TreeTagger is running...", "warning")

            # Initialisation de variables
            total_tagged_text = list()
            new_segmentations = list()
            i = 0

            # Initialize progress bar.
            self.progressBar = gui.ProgressBar(self, iterations=5)

            # Copie de la segmentation avec ajout d"une annotation...
            copy_of_input_seg = Segmentation()
            copy_of_input_seg.label = self.inputData.label
            for seg_idx, segment in enumerate(self.inputData):
                attr = " ".join(
                    ["%s='%s'" % item for item in segment.annotations.items()])
                segment.annotations["tt_xb"] = attr
                copy_of_input_seg.append(segment)

            # avancer la progressBar d"un cran
            self.progressBar.advance()

            concatenated_text = copy_of_input_seg.to_string(
                formatting="<xb_tt %(tt_xb)s>%(__content__)s</xb_tt>",
                display_all=True,
            )

            # avancer la progressBar d"un cran
            self.progressBar.advance()

            tagged_text = self.tag(concatenated_text)
            tagged_input = Input(tagged_text)
            tagged_segmentation = Segmenter.import_xml(tagged_input, "xb_tt")

            # avancer la progressBar d"un cran
            self.progressBar.advance()

            # Si checkBox xml active
            if self.activer_xml == True:
                xml_segmentation, _ = Segmenter.recode(
                    tagged_segmentation,
                    substitutions=[
                        (re.compile(r"<unknown>"), "[unknown]"),
                        (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                         "<w lemma='&3' type='&2'>&1</w>"),
                        (re.compile(r'"""'), '"&quot;"'),
                    ],
                )
                final_segmentation = xml_segmentation
            # Si checkBox xml desactive
            else:
                xml_segmentation, _ = Segmenter.recode(
                    tagged_segmentation,
                    substitutions=[
                        (re.compile(r"<unknown>"), "[unknown]"),
                        (re.compile(r"(.+)\t(.+)\t(.+?)(?=[\r\n])"),
                         "<w lemma='&3' type='&2'>&1</w>"),
                        (re.compile(r'"""'), '"&quot;"'),
                    ],
                )
                final_segmentation = Segmenter.import_xml(
                    xml_segmentation, "w")

            self.infoBox.dataSent("")

            # Enregistrer le lien de treetagger...
            if self.system == "nt":
                file = open("treetagger_link.txt", "w")
            else:
                file = open(
                    os.path.normpath("/Users/" + self.user +
                                     "/treetagger_link.txt"), "w")

            file.write(self.treetagger_link)
            file.close()

            # Clear progress bar.
            self.progressBar.finish()

            # envoyer la seguementation
            self.send("Text data", final_segmentation, self)
            self.compteur += 1
            self.sendButton.resetSettingsChangedFlag()
Exemple #11
0
    def sendData(self):
        """Compute result of widget processing and send to output"""

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Linguistically analyzed data", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        tokenizedSegments = list()

        # Process each input segment...
        for segment in self.inputSeg:

            # Input segment attributes...
            inputContent = segment.get_content()
            inputAnnotations = segment.annotations
            inputString = segment.str_index
            inputStart = segment.start or 0
            inputEnd = segment.end or len(inputContent)

            # NLP analysis...
            doc = self.nlp(inputContent)

            # Process each token in input segment...
            for token in doc:
                tokenAnnotations = inputAnnotations.copy()
                tokenAnnotations.update({
                    k: getattr(token, k)
                    for k in RELEVANT_KEYS if getattr(token, k) is not None
                })
                tokenStart = inputStart + token.idx
                tokenizedSegments.append(
                    Segment(
                        str_index=inputString,
                        start=tokenStart,
                        end=tokenStart + len(token),
                        annotations=tokenAnnotations,
                    ))

            progressBar.advance()

        outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSeg)
        message = pluralize(message, len(outputSeg))
        self.infoBox.setText(message)

        print(outputSeg.to_string())

        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)

        # Send data to output...
        self.send("Linguistically analyzed data", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()
Exemple #12
0
    def sendData(self):
        """Convert input(s) and send output"""
        if not (self.segmentation or self.corpus):
            self.infoBox.setText(u'Widget needs input.', 'warning')
            self.send('Textable segmentation', None, self)
            self.send('Text Mining corpus', None)
            return

        msg_seg = msg_corpus = ""

        num_iterations = 0
        if self.corpus:
            num_iterations += len(self.corpus)
        if self.segmentation:
            num_iterations += len(self.segmentation)
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=num_iterations)

        # Convert corpus to segmentation...
        if self.corpus:
            self.clearCreatedInputs()
            new_segments = list()
            text_feature = self.corpus.text_features[self.segmentContent]
            for row in self.corpus:
                content = row[text_feature].value
                if content == "":
                    continue
                new_input = Input(row[text_feature].value)
                new_segment_annotations = dict()
                for attr in self.corpus.domain:
                    attr_str = str(row[attr])
                    if attr_str != "?":
                        new_segment_annotations[str(attr)] = attr_str
                for meta_attr in self.corpus.domain.metas:
                    meta_attr_str = str(row[meta_attr])
                    if (meta_attr != text_feature and meta_attr_str != "?"):
                        new_segment_annotations[str(meta_attr)] = meta_attr_str
                new_segments.append(
                    Segment(new_input[0].str_index, new_input[0].start,
                            new_input[0].end, new_segment_annotations))
                self.createdInputs.append(new_input)
                progressBar.advance()
            new_segmentation = Segmentation(new_segments, self.captionTitle)
            msg_seg = u'%i segment@p' % len(new_segmentation)
            msg_seg = pluralize(msg_seg, len(new_segmentation))
            self.send('Textable segmentation', new_segmentation, self)
        else:
            self.send('Textable segmentation', None, self)

        # Convert segmentation to corpus...
        if self.segmentation:
            metas = list()
            attributes = list()
            meta_keys = list()
            attribute_keys = list()
            for key in self.segmentation.get_annotation_keys():
                possible_values = set()
                for segment in self.segmentation:
                    try:
                        possible_values.add(str(segment.annotations[key]))
                    except KeyError:
                        pass
                if (self.limitNumCategories
                        and len(possible_values) > self.maxNumCategories):
                    metas.append(StringVariable(key))
                    meta_keys.append(key)
                else:
                    attributes.append(
                        DiscreteVariable(key, values=list(possible_values)))
                    attribute_keys.append(key)
            metas.append(StringVariable("textable_text"))
            domain = Domain(attributes, [], metas)
            rows = list()
            for segment in self.segmentation:
                row = [
                    str(segment.annotations.get(annotation_key, None))
                    for annotation_key in attribute_keys
                ]
                row.extend([
                    str(segment.annotations.get(annotation_key, None))
                    for annotation_key in meta_keys
                ])
                row.append(segment.get_content())
                rows.append(row)
                progressBar.advance
            table = Table(domain, rows)
            if textMiningIsInstalled:
                corpus = Corpus(domain,
                                X=table.X,
                                metas=table.metas,
                                text_features=[metas[-1]])
            msg_corpus = u'%i document@p' % len(self.segmentation)
            msg_corpus = pluralize(msg_corpus, len(self.segmentation))
            self.send('Text Mining corpus', corpus)
        else:
            self.send('Text Mining corpus', None)

        progressBar.finish()
        self.controlArea.setDisabled(False)

        if msg_seg or msg_corpus:
            message = msg_seg
            if msg_seg and msg_corpus:
                message += " and "
            message += msg_corpus
            message += " sent to output."
            self.infoBox.setText(message)

        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Check that there's a model...
        if not self.model:
            self.noLanguageModelWarning()
            self.sendNoneToOutputs()
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            return
       
        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...", 
            "warning",
        )

        # Disable control area and initialize progress bar...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.char_df))       

        # Get start and end pos of concatenated input segments...
        startPositions = [0]
        endPositions = list()
        numSegments = len(self.inputSeg)
        for idx in range(1, numSegments):
            prevSegLen = len(self.inputSeg[idx-1].get_content())
            startPositions.append(startPositions[-1] + prevSegLen + 1)
            endPositions.append(startPositions[-1] - 1)
        endPositions.append(startPositions[-1] + 
                            len(self.inputSeg[-1].get_content()) + 1)

        # Get or update character aliases...
        find_pairs = sys.modules['charnetto.find_pairs']
        characters = [entry.split(", ") for entry in self.characters]
        find_pairs.map_names(self.char_df, characters)

        # Initializations...
        charSegments = list()
        currentSegmentIdx = 0
                
        # For each character token in Charnetto's output...
        for index, charToken in self.char_df.iterrows():
        
            # Skip non-PER named entities.
            if charToken["tag"] != "PER":
                continue

            # Get index of containing segment...
            while charToken["end_pos"] > endPositions[currentSegmentIdx]:
                currentSegmentIdx += 1
                
            # Create segment for char with its actual coordinates...
            strIndex = self.inputSeg[currentSegmentIdx].str_index
            start = charToken["start_pos"]-startPositions[currentSegmentIdx]
            end = charToken["end_pos"]-startPositions[currentSegmentIdx]
            annotations = {"id": charToken["alias"]}
            charSegments.append(Segment(strIndex, start, end, annotations))
            
            progressBar.advance()

        # Send output...
        outputSegmentation = Segmentation(charSegments, 
                                           label=self.captionTitle)
        self.send("Character segmentation", outputSegmentation, self)
        print(outputSegmentation.to_string())

        # Set status to OK and report data size...
        message = "%i segment@p sent to output." % len(outputSegmentation)
        message = pluralize(message, len(outputSegmentation))
        self.infoBox.setText(message)
        
        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)
                
        self.sendButton.resetSettingsChangedFlag()             
    def sendData(self):
        """Compute result of widget processing and send to output."""

        # Check that there's a model...
        if not self.model:
            self.infoBox.setText(
                "Please download a language model first.",
                "warning",
            )
            self.tabs.setCurrentIndex(1)
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            for channel in [c.name for c in self.outputs]:
                self.send(channel, None, self)
            return

        # Check max length and adjust if needed...
        inputLength = sum(len(s.get_content()) for s in self.inputSeg)
        if self.maxLen != "no limit":
            maxNumChar = int(self.maxLen.split()[0]) * 1000000
            if inputLength > maxNumChar:
                self.infoBox.setText(
                    "Input exceeds max number of characters set by user.", 
                    "warning",
                )
                for channel in [c.name for c in self.outputs]:
                    self.send(channel, None, self)
                return
        else:
            if inputLength > self.nlp.max_length:
                maxNumChar = inputLength          
        
        # Load components if needed...
        disabled, enabled = self.getComponentStatus()
        if self.mustLoad or not(
            self.nlp and set(enabled) <= set(self.loadedComponents)
        ):
            self.loadModel()
        self.nlp.max_length = maxNumChar
        
        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...", 
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))       

        tokenSegments = list()
        entitySegments = list()
        chunkSegments = list()
        sentenceSegments = list()
        
        # Process each input segment...
        for segment in self.inputSeg:
        
            # NLP analysis...
            disabled, _ = self.getComponentStatus()
            disabled = [c for c in disabled if c in set(self.loadedComponents)]
            with self.nlp.disable_pipes(*disabled):
                doc = self.nlp(segment.get_content())

            # Get token segments...
            tokenSegments.extend(spacyItemsToSegments(doc, segment))

            # Get named entity segments...
            if self.segmentEntities:
                entitySegments.extend(spacyItemsToSegments(doc.ents, segment))

            # Get noun chunk segments...
            if self.segmentChunks:
                chunkSegments.extend(
                    spacyItemsToSegments(doc.noun_chunks, segment), 
                )

            # Get sentences segments...
            if self.segmentSentences:
                sentenceSegments.extend(
                    spacyItemsToSegments(doc.sents, segment), 
                )

            progressBar.advance()

        # Build segmentations and send them to output...                   
        tokenSeg = Segmentation(tokenSegments, self.captionTitle + "_tokens")
        self.send("Tokenized text", tokenSeg, self)
        if self.segmentChunks:
            chunkSeg = Segmentation(
                chunkSegments, 
                self.captionTitle + "_chunks",
            )
            self.send("Noun chunks", chunkSeg, self)
        if self.segmentEntities:
            entitySeg = Segmentation(
                entitySegments, 
                self.captionTitle + "_entities",
            )
            self.send("Named entities", entitySeg, self)
        if self.segmentSentences:
            sentenceSeg = Segmentation(
                sentenceSegments, 
                self.captionTitle + "_sentences",
            )
            self.send("Sentences", sentenceSeg, self)

        # Set status to OK and report data size...
        message = "%i token@p" % len(tokenSeg)
        message = pluralize(message, len(tokenSeg))
        if self.segmentChunks:
            message += ", %i chunk@p" % len(chunkSeg)
            message = pluralize(message, len(chunkSeg))
        if self.segmentEntities:
            message += ", %i " % len(entitySeg)
            message += "entity" if len(entitySeg) == 1 else "entities"
        if self.segmentSentences:
            message += ", %i sentence@p" % len(sentenceSeg)
            message = pluralize(message, len(sentenceSeg))
        message += " sent to output."
        last_comma_idx = message.rfind(",")
        if last_comma_idx > -1:
            message = message[:last_comma_idx] + " and" +    \
                message[last_comma_idx+1:]
        self.infoBox.setText(message)
        
        # Clear progress bar.
        progressBar.finish()
        self.controlArea.setDisabled(False)
                
        self.sendButton.resetSettingsChangedFlag()             
Exemple #15
0
    def sendData(self):
        """Compute result of widget processing and send to output"""
        if not self.importedCorpora:
            self.infoBox.setText("Please add a corpus to the selection.",
                                 "warning")
            self.send("Files", None, self)
            self.send("Utterances", None, self)
            return

        # Clear created Inputs and initialize progress bar...
        self.clearCreatedInputs()
        numberOfSteps = 2 if self.outputUtterances else 1
        numberOfSteps += 2 if self.outputWords else 0
        self.infoBox.setText(
            "(1/%i) Retrieving data, please wait..." % numberOfSteps,
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.importedCorpora))

        annotations = list()

        # Iterate over corpora...
        for importedCorpus in self.importedCorpora:

            corpus = importedCorpus.split("/")[-1]

            # Try to retrieve corpus from cache...
            try:
                basepath = os.path.dirname(
                    os.path.abspath(inspect.getfile(inspect.currentframe())))
                corpusFilepath = os.path.normpath(
                    os.path.join(
                        basepath,
                        self.__class__.cachedFoldername,
                        importedCorpus[len(self.__class__.baseUrl):],
                    ))
                myZip = zipfile.ZipFile(corpusFilepath)

            except IOError:

                # Else try to download (and cache) requested zip file...
                try:
                    response = requests.get(importedCorpus)
                    myZip = zipfile.ZipFile(io.BytesIO(response.content))
                    corpusFolderpath = os.path.dirname(corpusFilepath)
                    try:
                        os.makedirs(corpusFolderpath)
                    except OSError:
                        pass
                    try:
                        outputFile = open(corpusFilepath, "wb")
                        outputFile.write(response.content)
                        outputFile.close()
                    except IOError:
                        pass

                # If an error occurs (e.g. connection error)...
                except:

                    # Set Info box and widget to "error" state.
                    self.infoBox.setText(
                        "Couldn't download corpus %s from CHILDES website." %
                        corpus, "error")

                    # Reset output channel.
                    self.send("Files", None, self)
                    self.send("Utterances", None, self)
                    progressBar.finish()
                    self.controlArea.setDisabled(False)
                    return

            # Create Input for each zipped file and store annotations...
            for file in myZip.infolist():
                file_content = myZip.read(file).decode('utf-8')

                # If word segmentation is requested...
                if self.outputWords:
                    # Implement replacements.
                    file_content = re.sub(
                        r"<w.+?(<replacement.+</replacement>).*?</w>",
                        r"\1",
                        file_content,
                    )
                    # Prepend pre-clitics.
                    file_content, n = re.subn(
                        r"(<mor .+?)(<mor-pre>.+</mor-pre>)",
                        r"\2\1",
                        file_content,
                    )
                    # Move <gra> into <mw>.
                    file_content, n = re.subn(
                        r"(</mw>)(<gra.+?/>)",
                        r"\2\1",
                        file_content,
                    )

                newInput = Input(file_content, self.captionTitle + "_files")
                self.createdInputs.append(newInput)
                chatSeg = Segmenter.import_xml(newInput, "CHAT")
                annotations.append(dict())
                annotations[-1]["file_path"] = file.filename
                for key in ["Corpus", "Lang", "PID"]:
                    try:
                        annotations[-1][key.lower()] =  \
                            chatSeg[0].annotations[key]
                    except KeyError:
                        pass
                participantListSeg = Segmenter.import_xml(
                    newInput, "Participants")
                recodedInput, _ = Segmenter.recode(
                    participantListSeg,
                    [(re.compile("/>"), "> </participant>")])
                participantSeg = Segmenter.import_xml(recodedInput,
                                                      "participant")
                targetChildData = list()
                for participant in participantSeg:
                    if participant.annotations["role"] != "Target_Child":
                        continue
                    targetChildData.append(dict())
                    if "age" in participant.annotations:
                        targetChildData[-1]["target_child_age"] =   \
                            participant.annotations["age"]
                        age_parse = re.search(
                            r"(\d+)Y(\d+)M(\d+)D",
                            participant.annotations["age"],
                        )
                        if age_parse:
                            targetChildData[-1]["target_child_years"] =     \
                                age_parse.group(1)
                            months = int(age_parse.group(2))   \
                                + 12 * int(age_parse.group(1))
                            targetChildData[-1]["target_child_months"] =     \
                            '%02d' % months
                            days = int(age_parse.group(3))   \
                                + 30 * months
                            targetChildData[-1]["target_child_days"] =     \
                            '%02d' % days
                    if "id" in participant.annotations:
                        targetChildData[-1]["target_child_id"] =   \
                            participant.annotations["id"]
                    if "sex" in participant.annotations:
                        targetChildData[-1]["target_child_sex"] =   \
                            participant.annotations["sex"]
                if len(targetChildData) == 1:
                    annotations[-1].update(targetChildData[0])

            progressBar.advance()

        # If there's only one file, the widget's output is the created Input...
        if len(self.createdInputs) == 1:
            self.fileSegmentation = self.createdInputs[0]

        # Otherwise the widget's output is a concatenation...
        else:
            self.fileSegmentation = Segmenter.concatenate(
                self.createdInputs,
                self.captionTitle + "_files",
                import_labels_as=None,
            )

        # Annotate segments...
        for idx, segment in enumerate(self.fileSegmentation):
            segment.annotations.update(annotations[idx])
            self.fileSegmentation[idx] = segment

        # Terminate progress bar...
        progressBar.finish()

        message = "%i file@p" % len(self.fileSegmentation)
        message = pluralize(message, len(self.fileSegmentation))
        self.send("Files", self.fileSegmentation, self)

        # Build utterance segmentation if needed...
        if self.outputUtterances:
            self.infoBox.setText(
                "(2/%i) Building utterance segmentation, please wait..."    \
                    % numberOfSteps,
                "warning",
            )
            progressBar = ProgressBar(self,
                                      iterations=len(self.fileSegmentation))
            self.utteranceSegmentation = Segmenter.import_xml(
                self.fileSegmentation,
                "u",
                progress_callback=progressBar.advance,
                label=self.captionTitle + "_utterances",
            )
            progressBar.finish()
            message += " and " if not self.outputWords else ", "
            message += "%i utterance@p" % len(self.utteranceSegmentation)
            message = pluralize(message, len(self.utteranceSegmentation))
            self.send("Utterances", self.utteranceSegmentation, self)
        else:
            self.send("Utterances", None, self)

        # Build word segmentation if needed...
        if self.outputWords:
            self.infoBox.setText(
                "(%i/%i) Building word segmentation, please wait..."    \
                    % (2 + (1 if self.outputUtterances else 0), numberOfSteps),
                "warning",
            )
            try:
                baseSegmentation = self.utteranceSegmentation
            except:
                baseSegmentation = self.fileSegmentation
            progressBar = ProgressBar(self,
                                      iterations=2 * len(baseSegmentation))
            wordSegmentation = Segmenter.import_xml(
                baseSegmentation,
                "w",
                progress_callback=progressBar.advance,
            )
            mwSegmentation = Segmenter.import_xml(
                baseSegmentation,
                "mw",
                progress_callback=progressBar.advance,
            )

            # Analyze words to extract annotations...
            self.infoBox.setText(
                "(%i/%i) Extracting word annotations, please wait..."    \
                    % (3 + (1 if self.outputUtterances else 0), numberOfSteps),
                "warning",
            )
            progressBar.finish()
            progressBar = ProgressBar(self, iterations=len(wordSegmentation))
            wordSegments = list()
            for word in wordSegmentation:
                mws = word.get_contained_segments(mwSegmentation)
                if mws:
                    for mw in mws:
                        wordSegment = word.deepcopy()
                        wordSegment.annotations.update(
                            self.extractWordAnnotations(mw))
                        wordSegments.append(wordSegment)
                else:
                    wordSegments.append(word)
                progressBar.advance()

            self.wordSegmentation = Segmentation(
                wordSegments,
                label=self.captionTitle + "_words",
            )

            message += " and %i word@p" % len(self.wordSegmentation)
            message = pluralize(message, len(self.wordSegmentation))
            self.send("Words", self.wordSegmentation, self)
        else:
            self.send("Words", None, self)

        # Set status to OK and report data size...
        message += " sent to output."
        message = pluralize(message, len(self.fileSegmentation))
        self.infoBox.setText(message)
        progressBar.finish()

        self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()