예제 #1
0
    def setUp(self):
        """Build the fixtures shared by the tests.

        Creates an Input holding 'ab cde', a two-segment word segmentation
        whose first segment carries annotations, a segmentation made of two
        overlapping segments, a reference string describing the two word
        segments, and a counter set to 0.
        """
        self.entire_text_seg = Input('ab cde')
        self.str_index = self.entire_text_seg[0].str_index
        index = self.str_index

        # "ab" is annotated, "cde" is not.
        annotated_word = Segment(
            str_index=index,
            start=0,
            end=2,
            annotations={'a': '1', 'bc': '20'},
        )
        plain_word = Segment(str_index=index, start=3, end=6)
        self.word_seg = Segmentation([annotated_word, plain_word])

        # Positions 3-5 and 4-6 share the character at position 4.
        overlapping = [
            Segment(str_index=index, start=3, end=5),
            Segment(str_index=index, start=4, end=6),
        ]
        self.overlapping_seg = Segmentation(overlapping)

        # The two %i placeholders both receive the string index.
        template = (
            'segment number 1\n'
            '\tcontent:\t"ab"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t0\n'
            '\tend:\t2\n'
            '\tannotations:\n'
            '\t\ta                    1\n'
            '\t\tbc                   20\n'
            'segment number 2\n'
            '\tcontent:\t"cde"\n'
            '\tstr_index:\t%i\n'
            '\tstart:\t3\n'
            '\tend:\t6'
        )
        self.base_output_string = template % (index, index)

        self.count = 0
예제 #2
0
    def sendData(self):
        """Compute result of widget processing and send to output.

        None is sent on both output channels ('Summary' and
        'HTML_Summary') when there is no language model or no input, so
        that downstream widgets do not keep stale data. Otherwise the input
        is summarized either segment by segment or as one merged text,
        depending on self.typeSeg, and the two resulting segmentations are
        sent.
        """
        # Check that there's a model
        if not self.model:
            self.noLanguageModelWarning()
            # Clear both outputs, exactly as the missing-input case does.
            self.send('Summary', None, self)
            self.send('HTML_Summary', None, self)
            return

        # Check that there's an input
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.send('Summary', None, self)
            self.send('HTML_Summary', None, self)
            return

        # Tell the user that processing has started.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        self.controlArea.setDisabled(True)
        try:
            # Collect the texts to summarize: one per segment, or a single
            # text made of all segment contents joined by spaces.
            if self.typeSeg == "Summarize each segments individually":
                contents = [
                    segment.get_content() for segment in self.inputSeg
                ]
            elif self.typeSeg == "Summarize all segments as one":
                contents = [" ".join(
                    [segment.get_content() for segment in self.inputSeg]
                )]
            else:
                # Unknown mode: nothing to summarize, empty output.
                contents = []

            segments = list()
            html_segments = list()
            for content in contents:
                resume, html_resume = self.summarize(self.cv, content)
                segments.append(Segment(str_index=resume[0].str_index))
                html_segments.append(
                    Segment(str_index=html_resume[0].str_index))

            # Create segmentations and assign them to the outputs
            self.outputSeg = Segmentation(segments, self.captionTitle)
            self.html_outputSeg = Segmentation(
                html_segments, self.captionTitle)

            # Send segmentation to output channels
            self.send("Summary", self.outputSeg, self)
            self.send('HTML_Summary', self.html_outputSeg, self)

            # Set message to sent
            message = "%i segment@p sent to output " % len(self.outputSeg)
            message = pluralize(message, len(self.outputSeg))
            self.infoBox.setText(message)

            self.sendButton.resetSettingsChangedFlag()
        finally:
            # Re-enable the controls even if summarizing failed, otherwise
            # the widget would stay locked.
            self.controlArea.setDisabled(False)
예제 #3
0
 def test_get_content_missing_end(self):
     """Check that get_content() yields 'ab cde' when end is None."""
     open_ended = Segment(
         str_index=self.entire_text_seg[0].str_index,
         start=0,
         end=None,
     )
     retrieved = open_ended.get_content()
     self.assertEqual(
         retrieved,
         'ab cde',
         msg="get_content() doesn't work with end=None!",
     )
예제 #4
0
 def test_get_content_missing_start_and_end(self):
     """Check that get_content() yields 'ab cde' without start and end."""
     text_index = self.entire_text_seg[0].str_index
     unbounded = Segment(str_index=text_index, start=None, end=None)
     retrieved = unbounded.get_content()
     self.assertEqual(
         retrieved,
         'ab cde',
         msg="get_content() doesn't work with start=None and end=None!",
     )
예제 #5
0
 def setUp(self):
     """Create the inputs and segmentations used by the tests.

     Two inputs are built ('ab cde' and 'd'); the three segmentations all
     refer to the string of the first one.
     """
     self.entire_text_seg = Input('ab cde')
     self.other_entire_text_seg = Input('d')
     text_index = self.entire_text_seg[0].str_index

     # Positions 0-2, carrying a single annotation.
     annotated_first_word = Segment(
         str_index=text_index,
         start=0,
         end=2,
         annotations={'a': 1},
     )
     self.first_word_seg = Segmentation([annotated_first_word])

     # Positions 3-6, without annotations.
     last_word = Segment(str_index=text_index, start=3, end=6)
     self.last_word_seg = Segmentation([last_word])

     # One single-character segment for each of the positions 0..5.
     self.char_seg = Segmentation([
         Segment(str_index=text_index, start=offset, end=offset + 1)
         for offset in range(6)
     ])
예제 #6
0
def spacyItemsToSegments(items, parentSegment):
    """Convert spaCy items (tokens or spans) to Textable segments.

    Each returned segment refers to the same string as parentSegment and
    is positioned relative to the parent's start (0 when the parent has no
    start). Its annotations are a copy of the parent's annotations, updated
    with every attribute listed in RELEVANT_KEYS that the item has and
    whose value is neither None nor an empty string.

    Args:
        items: iterable of spaCy tokens or spans.
        parentSegment: segment providing str_index, annotations and start.

    Returns:
        A list of Segment objects, one per item, in iteration order.
    """
    parentStrIndex = parentSegment.str_index
    parentAnnotations = parentSegment.annotations
    parentStart = parentSegment.start or 0
    segments = list()
    for item in items:
        annotations = parentAnnotations.copy()
        for key in RELEVANT_KEYS:
            # Single lookup; a missing attribute is treated like None.
            value = getattr(item, key, None)
            # Compare by value: identity against a "" literal is not a
            # reliable emptiness test.
            if value is None or (isinstance(value, str) and value == ""):
                continue
            annotations[key] = value

        # Tokens expose their offset as `idx`; other items (spans) expose
        # `start_char` / `end_char`.
        if str(type(item)).endswith("Token'>"):
            startPos = parentStart + item.idx
            endPos = startPos + len(item)
        else:
            startPos = parentStart + item.start_char
            endPos = parentStart + item.end_char
        segments.append(
            Segment(
                str_index=parentStrIndex,
                start=startPos,
                end=endPos,
                annotations=annotations,
            )
        )
    return segments
예제 #7
0
 def test_creator_no_annotations(self):
     """Check that annotations=None results in an empty dict."""
     bare_segment = Segment(0, annotations=None)
     expected = dict()
     self.assertEqual(
         bare_segment.annotations,
         expected,
         msg="creator doesn't init param annotations to {} by default!",
     )
예제 #8
0
 def test_creator(self):
     """Check that calling Segment() yields a Segment instance."""
     mock_address = 1
     created = Segment(mock_address)
     self.assertIsInstance(
         created,
         Segment,
         msg="creator doesn't return Segment object!",
     )
예제 #9
0
    def treat_input(self):
        """Parse each input segment as CSV and build one segment per row.

        For every input segment the content is sniffed for a CSV dialect
        and a header. Column names come from the header row, or are the
        column numbers '1'..'n' when no header is detected; they are left
        untouched when self.isRenamed is set. The column at index
        self.content_column provides the content of each new segment; the
        other non-empty columns become annotations, added to a copy of the
        input segment's annotations.

        Side effects: refills self.csvSeg (created segments),
        self.contentIsNone (numbers of the rows with empty content) and
        self.headerList (column names shown in the GUI), updates the info
        box, then calls self.sendButton.sendIf().
        """
        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            del self.headerList[:]
            # Self-assignment after the in-place change; presumably this is
            # what refreshes the GUI list -- TODO confirm.
            self.headerList = self.headerList
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        try:
            # clear lists
            del self.csvSeg[:]
            del self.contentIsNone[:]

            # Process each input segment...
            for segment in self.inputSeg:

                # Input segment attributes...
                inputContent = segment.get_content()
                if self.deleteQuotes:
                    # NOTE(review): removing quotes shifts every following
                    # character, so positions computed below no longer
                    # match the original string when quotes were present.
                    inputContent = inputContent.replace('"', "")
                inputAnnotations = segment.annotations
                inputStrIdx = segment.str_index
                inputStart = segment.start or 0

                # Set up the CSV reader with the sniffed dialect.
                csv_stream = io.StringIO(inputContent)
                dialect = sniffer.sniff(csv_stream.readline())
                dialect.quoting = csv.QUOTE_NONE
                csv_stream.seek(0)
                my_reader = csv.reader(csv_stream, dialect)

                # Positions refer to the underlying string, so they start
                # at the input segment's own offset in that string.
                position = inputStart

                # Sniff for a header once; the answer only depends on the
                # content.
                hasHeader = sniffer.has_header(inputContent)
                csv_stream.seek(0)
                first_row = next(my_reader)
                if hasHeader:
                    # The header row is consumed: data starts right after.
                    if not self.isRenamed:
                        self.dict_keys = first_row
                    # TODO : separator length (if not 1)
                    position += sum(len(key) + 1 for key in first_row)
                else:
                    # The first row is only read to count the columns; it
                    # is data, so rewind to read it again below.
                    if not self.isRenamed:
                        self.dict_keys = [
                            str(item)
                            for item in range(1, len(first_row) + 1)
                        ]
                    csv_stream.seek(0)

                # Refill the GUI list of headers, marking the content
                # column.
                del self.headerList[:]
                for column, key in enumerate(self.dict_keys):
                    if column == self.content_column:
                        self.headerList.append(str(key) + "(*content)")
                    else:
                        self.headerList.append(str(key))
                self.headerList = self.headerList

                for idx, row in enumerate(my_reader, start=2):
                    # Start from a copy of the input annotations.
                    segAnnotations = inputAnnotations.copy()
                    # initiate next row starting position
                    next_position = position
                    # Empty unless the content column is found in this row.
                    content = ""

                    # This is the main part where we transform our data
                    # into annotations. Columns are addressed by index so
                    # that duplicate column names keep their own value.
                    for column, key in enumerate(self.dict_keys):
                        value = row[column]
                        if column == self.content_column:
                            # put value as content
                            content = value
                        elif len(value) != 0:
                            # only non-empty values become annotations
                            segAnnotations[key] = value
                        # Columns before the content column push the
                        # content start; every column pushes the next row.
                        if column < self.content_column:
                            position += len(value) + 1
                        next_position += len(value) + 1

                    if len(content) != 0:
                        self.csvSeg.append(
                            Segment(
                                str_index=inputStrIdx,
                                start=position,
                                end=position + len(content),
                                annotations=segAnnotations,
                            )
                        )
                    else:
                        # if no content, add idx of the row and do not
                        # append
                        # TODO : something with contentIsNone
                        self.contentIsNone.append(idx)

                    # initiate new row starting position
                    position = next_position

                progressBar.advance()

            unSeg = len(self.csvSeg)
            # Set status to OK and report segment analyzed...
            message = "%i segment@p analyzed." % unSeg
            message = pluralize(message, unSeg)
            message += " (Ignored %i segment@p with no content)" %      \
                len(self.contentIsNone)
            message = pluralize(message, len(self.contentIsNone))
            self.infoBox.setText(message)
        finally:
            # Clear progress bar and unlock the controls, even when the
            # sniffer or the reader raised.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()
        self.sendButton.sendIf()
예제 #10
0
    def setUp(self):
        """Create the inputs and segmentations used by the tests.

        Inputs are created in a fixed order and every segmentation refers
        to the string index of the input it was derived from.
        """
        def seg(index, start, end, annotations=None):
            """Build a Segment, passing annotations only when given."""
            if annotations is None:
                return Segment(str_index=index, start=start, end=end)
            return Segment(
                str_index=index,
                start=start,
                end=end,
                annotations=annotations,
            )

        # --- Segmentations of 'ab cde' ---
        self.entire_text_seg = Input('ab cde')
        first = self.entire_text_seg[0].str_index
        self.word_seg = Segmentation([
            seg(first, 0, 2, {'a': '1'}),
            seg(first, 3, 6),
        ])
        self.char_seg = Segmentation(
            [seg(first, pos, pos + 1) for pos in range(6)]
        )
        self.letter_seg1 = Segmentation([
            seg(first, 0, 1, {'a': '1'}),
            seg(first, 1, 2),
        ])
        self.letter_seg2 = Segmentation([
            seg(first, 3, 4),
            seg(first, 4, 5, {'b': '2'}),
            seg(first, 5, 6),
        ])
        self.letter_seg = Segmentation([
            seg(first, 0, 1, {'a': '1'}),
            seg(first, 1, 2),
            seg(first, 3, 4),
            seg(first, 4, 5, {'b': '2'}),
            seg(first, 5, 6),
        ])
        self.single_letter_seg = Segmentation([
            seg(first, 4, 5, {'b': '1'}),
        ])
        self.duplicate_seg = Segmentation([
            seg(first, 0, 1),
            seg(first, 0, 1),
        ])
        self.overlapping_seg = Segmentation([
            seg(first, 3, 5),
            seg(first, 4, 6),
        ])

        # --- One annotated segment per character of 'abbccc' ---
        self.other_entire_text_seg = Input('abbccc')
        second = self.other_entire_text_seg[0].str_index
        values_of_a = ('1', '1', '1', '2', '2', '3')
        self.other_letter_seg = Segmentation([
            seg(second, pos, pos + 1, {'a': value})
            for pos, value in enumerate(values_of_a)
        ])

        # --- Segmentation of 'bd1' ---
        self.third_entire_text_seg = Input('bd1')
        third = self.third_entire_text_seg[0].str_index
        self.third_letter_seg = Segmentation([
            seg(third, 0, 1),
            seg(third, 1, 2, {'a': '2'}),
            seg(third, 2, 3, {'a': 'b'}),
        ])

        # --- Two-word segmentation of a text with a non-ASCII character ---
        self.fourth_entire_text_seg = Input('AB cd\xe9')
        fourth = self.fourth_entire_text_seg[0].str_index
        self.second_word_seg = Segmentation([
            seg(fourth, 0, 2),
            seg(fourth, 3, 6),
        ])

        # --- XML inputs ---
        self.xml_seg = Input('<a attr="1"><a attr="2/3/">c<a/>d</a></a>')
        self.wrong_xml_seg = Input('<a><a>test</a>')
        self.wrong_xml_seg2 = Input('<a>test</a></a>')

        # Markup split over two inputs, gathered in one segmentation whose
        # segments have no explicit start or end.
        self.part_xml_seg = Input('<a>1<a>2<a>3</a>4')
        opening_part = self.part_xml_seg[0].str_index
        self.part_xml_seg2 = Input('</a>5</a>')
        closing_part = self.part_xml_seg2[0].str_index
        self.broken_xml_seg = Segmentation([
            Segment(str_index=opening_part, annotations={'a': '1'}),
            Segment(str_index=closing_part),
        ])

        self.count = 0
예제 #11
0
    def sendData(self):
        """Compute result of widget processing and send to output.

        Sends None on 'Linguistically analyzed data' when there is no
        input. Otherwise runs self.nlp on the content of each input
        segment and sends a segmentation with one segment per token. Each
        token segment refers to the input segment's string, is positioned
        relative to the input segment's start, and carries a copy of the
        input annotations updated with the token attributes listed in
        RELEVANT_KEYS whose value is not None.
        """
        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input", "warning")
            self.send("Linguistically analyzed data", None, self)
            return

        # Initialize progress bar.
        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.inputSeg))

        try:
            tokenizedSegments = list()

            # Process each input segment...
            for segment in self.inputSeg:

                # Input segment attributes...
                inputContent = segment.get_content()
                inputAnnotations = segment.annotations
                inputString = segment.str_index
                inputStart = segment.start or 0

                # NLP analysis...
                doc = self.nlp(inputContent)

                # Process each token in input segment...
                for token in doc:
                    tokenAnnotations = inputAnnotations.copy()
                    for key in RELEVANT_KEYS:
                        # A missing attribute is skipped like a None value.
                        value = getattr(token, key, None)
                        if value is not None:
                            tokenAnnotations[key] = value
                    # token.idx is relative to the analyzed content, hence
                    # the shift by the input segment's start.
                    tokenStart = inputStart + token.idx
                    tokenizedSegments.append(
                        Segment(
                            str_index=inputString,
                            start=tokenStart,
                            end=tokenStart + len(token),
                            annotations=tokenAnnotations,
                        ))

                progressBar.advance()

            outputSeg = Segmentation(tokenizedSegments, self.captionTitle)

            # Set status to OK and report data size...
            message = "%i segment@p sent to output." % len(outputSeg)
            message = pluralize(message, len(outputSeg))
            self.infoBox.setText(message)
        finally:
            # Clear progress bar and unlock the controls, even when the
            # analysis raised.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        # Send data to output...
        self.send("Linguistically analyzed data", outputSeg, self)

        self.sendButton.resetSettingsChangedFlag()
예제 #12
0
    def sendData(self):
        """Convert input(s) and send output.

        A corpus input is converted to a segmentation sent on 'Textable
        segmentation': one new Input and one segment per row with a
        non-empty text, annotated with the row's known ("?" excluded)
        attribute and meta values. A segmentation input is converted to a
        table, and to a corpus sent on 'Text Mining corpus' when
        textMiningIsInstalled is true. None is sent on a channel when the
        matching input is missing, and on 'Text Mining corpus' when no
        corpus could be built.
        """
        if not (self.segmentation or self.corpus):
            self.infoBox.setText(u'Widget needs input.', 'warning')
            self.send('Textable segmentation', None, self)
            self.send('Text Mining corpus', None)
            return

        msg_seg = msg_corpus = ""

        # One progress step per corpus row and per input segment.
        num_iterations = 0
        if self.corpus:
            num_iterations += len(self.corpus)
        if self.segmentation:
            num_iterations += len(self.segmentation)
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=num_iterations)

        try:
            # Convert corpus to segmentation...
            if self.corpus:
                self.clearCreatedInputs()
                new_segments = list()
                text_feature = self.corpus.text_features[self.segmentContent]
                for row in self.corpus:
                    content = row[text_feature].value
                    if content == "":
                        # Skipped rows still count as a progress step.
                        progressBar.advance()
                        continue
                    new_input = Input(content)
                    new_segment_annotations = dict()
                    for attr in self.corpus.domain:
                        attr_str = str(row[attr])
                        if attr_str != "?":
                            new_segment_annotations[str(attr)] = attr_str
                    for meta_attr in self.corpus.domain.metas:
                        meta_attr_str = str(row[meta_attr])
                        # The text itself is the content, not an annotation.
                        if (meta_attr != text_feature
                                and meta_attr_str != "?"):
                            new_segment_annotations[str(meta_attr)] = (
                                meta_attr_str
                            )
                    new_segments.append(
                        Segment(new_input[0].str_index, new_input[0].start,
                                new_input[0].end, new_segment_annotations))
                    self.createdInputs.append(new_input)
                    progressBar.advance()
                new_segmentation = Segmentation(
                    new_segments, self.captionTitle)
                msg_seg = u'%i segment@p' % len(new_segmentation)
                msg_seg = pluralize(msg_seg, len(new_segmentation))
                self.send('Textable segmentation', new_segmentation, self)
            else:
                self.send('Textable segmentation', None, self)

            # Convert segmentation to corpus...
            if self.segmentation:
                metas = list()
                attributes = list()
                meta_keys = list()
                attribute_keys = list()
                for key in self.segmentation.get_annotation_keys():
                    possible_values = set()
                    for segment in self.segmentation:
                        try:
                            possible_values.add(
                                str(segment.annotations[key]))
                        except KeyError:
                            # Segment without this annotation key.
                            pass
                    # Keys with too many distinct values become string
                    # metas instead of discrete attributes.
                    if (self.limitNumCategories
                            and len(possible_values) > self.maxNumCategories):
                        metas.append(StringVariable(key))
                        meta_keys.append(key)
                    else:
                        attributes.append(
                            DiscreteVariable(
                                key, values=list(possible_values)))
                        attribute_keys.append(key)
                metas.append(StringVariable("textable_text"))
                domain = Domain(attributes, [], metas)
                rows = list()
                for segment in self.segmentation:
                    row = [
                        str(segment.annotations.get(annotation_key, None))
                        for annotation_key in attribute_keys
                    ]
                    row.extend([
                        str(segment.annotations.get(annotation_key, None))
                        for annotation_key in meta_keys
                    ])
                    row.append(segment.get_content())
                    rows.append(row)
                    progressBar.advance()
                table = Table(domain, rows)
                # Stays None when the corpus type is unavailable, so that
                # sending below never hits an unbound name.
                corpus = None
                if textMiningIsInstalled:
                    corpus = Corpus(domain,
                                    X=table.X,
                                    metas=table.metas,
                                    text_features=[metas[-1]])
                    msg_corpus = u'%i document@p' % len(self.segmentation)
                    msg_corpus = pluralize(
                        msg_corpus, len(self.segmentation))
                self.send('Text Mining corpus', corpus)
            else:
                self.send('Text Mining corpus', None)
        finally:
            # Clear progress bar and unlock the controls, even when the
            # conversion raised.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        if msg_seg or msg_corpus:
            message = msg_seg
            if msg_seg and msg_corpus:
                message += " and "
            message += msg_corpus
            message += " sent to output."
            self.infoBox.setText(message)
        else:
            # Only a segmentation was received and no corpus could be
            # built: replace the "Processing" notice.
            self.infoBox.setText(u"No data sent to output.", "warning")

        self.sendButton.resetSettingsChangedFlag()
    def sendData(self):
        """Compute result of widget processing and send to output.

        Calls self.sendNoneToOutputs() when there is no language model or
        no input. Otherwise maps every row of self.char_df tagged "PER"
        back to the input segment containing it and sends, on 'Character
        segmentation', one segment per such row annotated with the row's
        alias under the key "id".
        """
        # Check that there's a model...
        if not self.model:
            self.noLanguageModelWarning()
            self.sendNoneToOutputs()
            return

        # Check that there's an input...
        if self.inputSeg is None:
            self.infoBox.setText("Widget needs input.", "warning")
            self.sendNoneToOutputs()
            return

        self.infoBox.setText(
            u"Processing, please wait...",
            "warning",
        )

        # Disable control area and initialize progress bar...
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=len(self.char_df))

        try:
            # Get start and end pos of concatenated input segments; the
            # "+ 1" accounts for one separating character between two
            # consecutive segments.
            startPositions = [0]
            endPositions = list()
            numSegments = len(self.inputSeg)
            for idx in range(1, numSegments):
                prevSegLen = len(self.inputSeg[idx-1].get_content())
                startPositions.append(startPositions[-1] + prevSegLen + 1)
                endPositions.append(startPositions[-1] - 1)
            endPositions.append(startPositions[-1] +
                                len(self.inputSeg[-1].get_content()) + 1)

            # Get or update character aliases...
            find_pairs = sys.modules['charnetto.find_pairs']
            characters = [entry.split(", ") for entry in self.characters]
            find_pairs.map_names(self.char_df, characters)

            # Initializations...
            charSegments = list()
            currentSegmentIdx = 0

            # For each character token in Charnetto's output...
            for index, charToken in self.char_df.iterrows():

                # Skip non-PER named entities; they still count as a step
                # since the progress bar is sized on all rows.
                if charToken["tag"] != "PER":
                    progressBar.advance()
                    continue

                # Get index of containing segment (the index only moves
                # forward from one row to the next)...
                while charToken["end_pos"] > endPositions[currentSegmentIdx]:
                    currentSegmentIdx += 1

                # Create segment for char with coordinates relative to the
                # start of its containing segment...
                strIndex = self.inputSeg[currentSegmentIdx].str_index
                segmentStart = startPositions[currentSegmentIdx]
                start = charToken["start_pos"] - segmentStart
                end = charToken["end_pos"] - segmentStart
                annotations = {"id": charToken["alias"]}
                charSegments.append(
                    Segment(strIndex, start, end, annotations))

                progressBar.advance()

            # Send output...
            outputSegmentation = Segmentation(charSegments,
                                              label=self.captionTitle)
            self.send("Character segmentation", outputSegmentation, self)

            # Set status to OK and report data size...
            message = "%i segment@p sent to output." % len(
                outputSegmentation)
            message = pluralize(message, len(outputSegmentation))
            self.infoBox.setText(message)
        finally:
            # Clear progress bar and unlock the controls, even when the
            # processing raised.
            progressBar.finish()
            self.controlArea.setDisabled(False)

        self.sendButton.resetSettingsChangedFlag()