Exemple #1
0
    def unescape(result: MarkedUpText) -> None:
        """
        A "private" method that replaces HTML character references (codes
        such as ``&gt;``) with the corresponding symbols in the resulting
        plain text.

        ``result.text`` is overwritten with the unescaped text.  For every
        reference whose replacement differs in length from the reference
        itself, a ``((start, end), (start, new_end))`` pair is recorded and
        the collected pairs are passed to ``result.apply_transformations``.
        All offsets in these pairs are positions in the text as it was
        *before* unescaping.

        :param result: MarkedUpText containing resulting plain text
        """
        source = result.text
        # Fragments of the output text; joined once at the end instead of
        # growing a string with "+=" on every match.
        pieces = []
        transformations = [
        ]  # type: List[Tuple[Tuple[int, int], Tuple[int, int]]]
        last_stop = 0  # end of the previous match in ``source``

        for match in _charref.finditer(source):
            replacement = _replace_charref(match)
            src_s, src_e = match.span()
            end_e = src_s + len(replacement)
            # Only length-changing replacements shift later offsets, so only
            # those are recorded.
            if end_e != src_e:
                transformations.append(((src_s, src_e), (src_s, end_e)))
            pieces.append(source[last_stop:src_s])
            pieces.append(replacement)
            last_stop = src_e

        # Text after the last reference (the whole text when none matched).
        pieces.append(source[last_stop:])
        result.text = ''.join(pieces)
        if transformations:
            result.apply_transformations(transformations)
    def convert_to_styled(self, data):
        """
        This scans incoming notes for possible html.  It converts a select few
        tags into StyledText and removes the rest of the tags.  Notes of this
        type occur in data from FTM and ancestry.com.  Result is a much
        cleaner note.

        The work is done in two passes:

        1. HTML character references are replaced by the characters they
           stand for.
        2. The text is tokenised with ``self.tok_regex`` and every token is
           either dropped, replaced by plain text, or turned into a style
           range (italic, bold, underline, link, highlight).

        Note that pass 1 overwrites ``data._string`` on the object that was
        passed in, so the caller's object is modified.

        @param data: the note text, possibly containing html.  The code reads
                     and writes ``data._string`` and slices ``data``, so a
                     plain ``str`` will not work; presumably this is a
                     StyledText.
        @type data: StyledText (presumed, see above)
        @return: a new StyledText holding the cleaned text, with the tags
                 found here merged into the existing ones by ``tag_merge``
        @rtype: StyledText
        """
        # ---- Pass 1: character references ("&lt;" and friends) -----------
        # Slices of ``data`` are collected rather than bare strings; the
        # decoded text is written over the start of each reference and the
        # leftover characters of the reference are skipped.
        pieces = []
        cursor = 0  # index in ``data`` just past the previous reference
        for ref in re.finditer(html._charref, data._string):
            decoded = html._replace_charref(ref)
            ref_start = ref.start()
            decoded_end = ref_start + len(decoded)
            # The total length is unchanged, so the offsets of the remaining
            # matches (found on the original string) stay valid.
            data._string = (data._string[:ref_start] + decoded +
                            data._string[decoded_end:])
            if cursor != decoded_end:
                pieces.append(data[cursor:decoded_end])
            cursor = ref.end()
        pieces.append(data[cursor:])
        data = StyledText().join(pieces)

        # ---- Pass 2: html tags and bare links ----------------------------
        italics = []
        bolds = []
        unders = []
        links = []  # (link target, start, end)
        reds = []   # ranges to highlight: tags that could not be handled
        # Token kinds that share identical handling and differ only in the
        # list their range is recorded in.
        inline_styles = {'ITALIC': italics, 'BOLD': bolds, 'UNDER': unders}

        pieces = []
        cursor = 0       # index in ``data`` just past the previous token
        out_pos = 0      # length of the output text produced so far
        header_pos = -1  # output position of an open table header, or -1
        for tok in re.finditer(self.tok_regex,
                               data._string,
                               flags=(re.DOTALL | re.I)):
            kind = tok.lastgroup
            st_txt = tok.group(kind)
            tok_start = tok.start()
            tok_end = tok.end()
            # Amount of untouched text between the previous token and this one.
            gap = tok_start - cursor

            if kind in ('SKIP', 'TABLE'):
                # Token is dropped; only the text before it is kept.
                if gap:
                    pieces.append(data[cursor:tok_start])
                    out_pos += gap
            elif kind == 'PARAEND':
                pieces.append(data[cursor:tok_start] + '\n')
                out_pos += gap + 1
            elif kind in inline_styles:
                # The first three characters of the match are dropped
                # (presumably the opening tag, e.g. "<i>"; the token regex
                # is defined elsewhere) and the rest becomes a styled range.
                pieces.append(data[cursor:tok_start] +
                              data[(tok_start + 3):tok_end])
                range_end = out_pos - cursor + tok_end - 3
                inline_styles[kind].append((out_pos + gap, range_end))
                out_pos = range_end
            elif kind == 'HTTP':  # HTTP found
                st_txt = tok.group('HTTP')
                link_start = out_pos + gap
                pieces.append(data[cursor:tok_start] + st_txt)
                out_pos += gap + len(st_txt)
                # Trailing punctuation stays in the text but is excluded
                # from the link target and from the link range.
                st_txt = st_txt.rstrip(' .:)')
                links.append((st_txt, link_start, link_start + len(st_txt)))
            elif kind == 'HREF':  # HREF found
                st_txt = tok.group('HREFT')
                lk_txt = tok.group('HREFL')
                # fix up relative links emitted by ancestry.com
                if lk_txt.startswith(("/search/dbextra", "/handler/domain")):
                    lk_txt = "http://search.ancestry.com" + lk_txt
                link_start = out_pos + gap
                # if tag (minus any trailing '.') is substring of link
                if st_txt[0:-1] in lk_txt:
                    st_txt = lk_txt  # just use the link
                else:  # use link and tag
                    st_txt = " " + lk_txt + " (" + st_txt + ")"
                pieces.append(data[cursor:tok_start] + st_txt)
                out_pos += gap + len(st_txt)
                links.append((lk_txt, link_start, link_start + len(st_txt)))
            elif kind in ('TBLCELL', 'TBLHDRC'):  # Table cell break
                pieces.append(data[cursor:tok_start] + ':  ')
                out_pos += gap + 3
            elif kind == 'TBLHDRB':  # header start
                if gap:
                    pieces.append(data[cursor:tok_start])
                    out_pos += gap
                header_pos = out_pos
            elif kind == 'TBLHDRE':  # Header end
                if header_pos == -1:
                    # End tag without a start tag: keep the tag text and
                    # highlight it (only when there is preceding text).
                    if gap:
                        pieces.append(data[cursor:tok_end])
                        range_end = out_pos - cursor + tok_end
                        reds.append((out_pos + gap, range_end))
                        out_pos = range_end
                    print('Invalid table header, no start tag found')
                else:
                    if gap:
                        pieces.append(data[cursor:tok_start])
                        out_pos += gap
                    # Everything emitted since the header start is bold.
                    bolds.append((header_pos, out_pos))
                    header_pos = -1
            elif kind == 'UNKNWN':
                # Keep the unrecognised tag in the text and highlight it.
                pieces.append(data[cursor:tok_end])
                range_end = out_pos - cursor + tok_end
                reds.append((out_pos + gap, range_end))
                out_pos = range_end
                print('Unexpected or unimplemented HTML tag', st_txt)
            else:
                print("shouldn't get here")

            cursor = tok_end
        pieces.append(data[cursor:])

        # ---- Assemble the result and its style tags ----------------------
        result = StyledText().join(pieces)
        # One LINK tag per link, since each carries its own target value.
        tags = [
            StyledTextTag(StyledTextTagType.LINK, target, [(start, end)])
            for target, start, end in links
        ]
        if italics:
            tags.append(StyledTextTag(StyledTextTagType.ITALIC, False,
                                      italics))
        if bolds:
            tags.append(StyledTextTag(StyledTextTagType.BOLD, False, bolds))
        if unders:
            tags.append(
                StyledTextTag(StyledTextTagType.UNDERLINE, False, unders))
        if reds:
            tags.append(
                StyledTextTag(StyledTextTagType.HIGHLIGHT, '#FFFF00', reds))
        return StyledText(result._string, tag_merge(result._tags, tags))