Ejemplo n.º 1
0
def textToText(buffer):
    """Collapse whitespace in *buffer* and derive a short title from it.

    Every character whose ordinal is below 32 is treated as a space, and
    each run of characters accepted by ``texttools.isSpace`` is reduced to
    a single space.  Leading whitespace is dropped entirely; a trailing
    run of whitespace still produces one final space.

    Returns a ``(title, text)`` tuple where ``text`` is the contracted
    buffer and ``title`` is the first 60 characters of ``text``.
    """
    contractedText = cStringIO.StringIO()
    # Start as if a space had just been written so that leading whitespace
    # is skipped instead of being emitted.
    previousWasSpace = True
    for c in buffer:
        if ord(c) < 32:
            # Control characters (newline, tab, ...) count as whitespace.
            c = ' '
        if texttools.isSpace(c):
            if not previousWasSpace:
                contractedText.write(' ')
            previousWasSpace = True
        else:
            contractedText.write(c)
            previousWasSpace = False
    text = contractedText.getvalue()
    return (text[:60], text)
Ejemplo n.º 2
0
def textToText(buffer):
    """Collapse whitespace in *buffer* and derive a short title from it.

    Every character whose ordinal is below 32 is treated as a space, and
    each run of characters accepted by ``texttools.isSpace`` is reduced to
    a single space.  Leading whitespace is dropped entirely; a trailing
    run of whitespace still produces one final space.

    Returns a ``(title, text)`` tuple where ``text`` is the contracted
    buffer and ``title`` is the first 60 characters of ``text``.
    """
    contractedText = cStringIO.StringIO()
    # Start as if a space had just been written so that leading whitespace
    # is skipped instead of being emitted.
    previousWasSpace = True
    for c in buffer:
        if ord(c) < 32:
            # Control characters (newline, tab, ...) count as whitespace.
            c = ' '
        if texttools.isSpace(c):
            if not previousWasSpace:
                contractedText.write(' ')
            previousWasSpace = True
        else:
            contractedText.write(c)
            previousWasSpace = False
    text = contractedText.getvalue()
    return (text[:60], text)
Ejemplo n.º 3
0
    def feed(self, data=None, input=None):
        """Run the tag-scanning state machine over a chunk of markup.

        Characters are taken from *data* when it is non-empty; otherwise
        they are read one at a time from *input* (any object with a
        ``read(1)`` method) until it returns an empty string.  When both
        are missing the call consumes nothing.

        While scanning, the method reports what it finds through
        ``handle_data``, ``handle_comment``, ``handle_starttag``,
        ``handle_endtag`` and ``handle_startendtag``.  The current state,
        the pending text and the attribute being built are not reset
        here, so they carry over from one call to the next.

        On return ``self.__offset`` has been advanced by the number of
        characters consumed during this call.
        """
        if data:
            data_length = len(data)
        else:
            data_length = 0
        self.__pos = -1
        while 1:
            self.__pos += 1
            if data:
                if self.__pos >= data_length:
                    break
                c = data[self.__pos]
            else:
                # Stream mode: the length of the input is unknown, so stop
                # on end of stream rather than on a position check (a
                # position check against a zero length would stop before
                # the first read).
                if input is None:
                    break
                c = input.read(1)
                if not c:
                    break

            # waiting for '<'; everything else is plain text
            if self.__state == WAITING_INF:
                if c == '<':
                    self.__state = WAITING_FIRSTCHAR_ATTNAME
                else:
                    self.__text.write(c)
            # first character after '<': "<!", a closing tag, an opening
            # tag, or a literal '<' that is kept as text
            elif self.__state == WAITING_FIRSTCHAR_ATTNAME:
                if c == '!':
                    self.__state = WAITING_NEXTCHAR_REM1
                elif c == '/':
                    # flush the text collected so far before the tag
                    self.handle_data(self.__text.getvalue())
                    self.__text = cStringIO.StringIO()
                    self.__endingTag = 1
                    self.__state = WAITING_NEXTCHAR_ATTNAME
                elif texttools.isAlpha(c):
                    self.__endingTag = 0
                    self.handle_data(self.__text.getvalue())
                    self.__text = cStringIO.StringIO()
                    self.__atts = []
                    # the tag name is accumulated like an attribute name
                    # and ends up as the first entry of self.__atts
                    self.__attName = c
                    self.__state = WAITING_NEXTCHAR_ATTNAME
                else:
                    self.__text.write("<" + c)

            # after "<!"
            elif self.__state == WAITING_NEXTCHAR_REM1:
                if c == '-':
                    self.__state = WAITING_NEXTCHAR_REM2
                elif c == '>':
                    self.__state = WAITING_INF
                else:
                    self.__state = WAITING_NEXTCHAR_LIGHTREM

            # inside "<!...": skip everything up to the closing '>'
            elif self.__state == WAITING_NEXTCHAR_LIGHTREM:
                if c == '>':
                    self.__state = WAITING_INF

            # after "<!-": a second '-' opens a comment body
            elif self.__state == WAITING_NEXTCHAR_REM2:
                if c == '-':
                    self.handle_data(self.__text.getvalue())
                    self.__text = cStringIO.StringIO()
                    self.__state = WAITING_IN_REM
                elif c == '>':
                    self.__state = WAITING_INF

            # comment body: collect text, a '-' may start the terminator
            elif self.__state == WAITING_IN_REM:
                if c == '-':
                    self.__state = WAITING_END_REM1
                else:
                    self.__text.write(c)

            # one '-' seen inside the comment body
            elif self.__state == WAITING_END_REM1:
                if c == '-':
                    self.__state = WAITING_END_REM2
                else:
                    # not a terminator: put the '-' back into the text
                    self.__text.write("-" + c)
                    self.__state = WAITING_IN_REM

            # "--" seen inside the comment body
            elif self.__state == WAITING_END_REM2:
                if c == '>':
                    self.handle_comment(self.__text.getvalue())
                    self.__text = cStringIO.StringIO()
                    self.__state = WAITING_INF
                else:
                    # not a terminator: put the "--" back into the text
                    self.__text.write("--" + c)
                    self.__state = WAITING_IN_REM

            # reading a tag or attribute name
            elif self.__state == WAITING_NEXTCHAR_ATTNAME:
                if texttools.isAlphaNum(c):
                    self.__attName += c
                elif texttools.isSpace(c):
                    self.__state = WAITING_NEXT_AFF
                elif c == '=':
                    self.__state = WAITING_FIRSTCHAR_VALUE
                elif c == '<':
                    self.__state = READ_INF_END_TAG
                elif c == '>':
                    self.__state = READ_SUP_END_TAG

            # whitespace after a name
            elif self.__state == WAITING_NEXT_AFF:
                if texttools.isSpace(c):
                    pass
                elif c == '>':
                    self.__state = READ_SUP_END_TAG
                else:
                    if self.__attName:
                        # store the finished name and start a new one
                        # with the current character
                        self.__atts.append((self.__attName, self.__attValue))
                        self.__attName = c
                        self.__attValue = None
                        self.__attCount += 1
                    self.__state = WAITING_NEXTCHAR_ATTNAME

            # after '=': waiting for the first character of the value
            elif self.__state == WAITING_FIRSTCHAR_VALUE:
                if texttools.isSpace(c):
                    pass
                elif c in ("\"", "'"):
                    # remember which quote character closes the value
                    self.__quote = c
                    self.__attValue = ""
                    self.__state = WAITING_CHAR_IN_QUOTE_VALUE
                elif c == '<':
                    # to verify...
                    self.__state = READ_INF_END_TAG
                else:
                    if not self.__attValue:
                        self.__attValue = c
                    else:
                        self.__attValue += c
                    self.__state = WAITING_CHAR_VALUE

            # reading an unquoted value
            elif self.__state == WAITING_CHAR_VALUE:
                if texttools.isSpace(c):
                    self.__atts.append((self.__attName, self.__attValue))
                    self.__attName = ""
                    self.__attValue = None
                    self.__state = WAITING_NEXTCHAR_ATTNAME
                elif c == '>':
                    self.__state = READ_SUP_END_TAG
                else:
                    if not self.__attValue:
                        self.__attValue = c
                    else:
                        self.__attValue += c

            # reading a quoted value, up to the matching quote
            elif self.__state == WAITING_CHAR_IN_QUOTE_VALUE:
                if c == self.__quote:
                    self.__state = WAITING_AFTER_QUOTE
                else:
                    if not self.__attValue:
                        self.__attValue = c
                    else:
                        self.__attValue += c

            # just after the closing quote of a value
            elif self.__state == WAITING_AFTER_QUOTE:
                if texttools.isSpace(c):
                    self.__state = WAITING_SPACE_AFTER_AFF
                elif c == '<':
                    self.__state = READ_INF_END_TAG
                elif c == '>':
                    self.__state = READ_SUP_END_TAG

            # whitespace after a quoted value
            elif self.__state == WAITING_SPACE_AFTER_AFF:
                if texttools.isSpace(c):
                    pass
                elif c == '<':
                    self.__state = READ_INF_END_TAG
                elif c == '>':
                    self.__state = READ_SUP_END_TAG
                else:
                    # store the finished attribute and start the next
                    # name with the current character
                    self.__atts.append((self.__attName, self.__attValue))
                    self.__attName = c
                    self.__attValue = None
                    self.__state = WAITING_NEXTCHAR_ATTNAME

            # raw content of a 'script' / 'style' element
            elif self.__state == WAITING_IN_SCRIPT:
                if c == '<':
                    self.__state = WAITING_END_SCRIPT1
                else:
                    self.__text.write(c)

            # '<' seen inside raw content
            elif self.__state == WAITING_END_SCRIPT1:
                if c == '/':
                    self.__attName = ""
                    self.__endingTag = 1
                    self.__state = WAITING_END_SCRIPT2
                else:
                    self.__state = WAITING_IN_SCRIPT
                    self.__text.write("<" + c)

            # "</" seen inside raw content: read the closing tag name
            elif self.__state == WAITING_END_SCRIPT2:
                if texttools.isAlpha(c):
                    self.__attName += c
                elif c == '>':
                    if self.__attName.lower() == self.__jumpedTag:
                        # closing tag of the element that opened the raw
                        # section: flush its content and finish the tag
                        self.handle_data(self.__text.getvalue())
                        self.__text = cStringIO.StringIO()
                        self.__state = READ_SUP_END_TAG
                    else:
                        self.__state = WAITING_IN_SCRIPT
                        self.__text.write("</" + self.__attName + c)
                else:
                    self.__text.write("</" + self.__attName + c)
                    self.__state = WAITING_IN_SCRIPT

            # Plain 'if' (not 'elif'): a '>' detected by a branch above is
            # handled within the same iteration.
            if self.__state == READ_SUP_END_TAG:
                if self.__attName:
                    if self.__lastC == '/' and self.__lastLastC == ' ':
                        # the tag ended with " />"
                        self.handle_startendtag(self.__atts[0][0], self.__atts[1:])
                        self.__state = WAITING_INF
                    else:
                        self.__atts.append((self.__attName, self.__attValue))
                        self.__state = WAITING_INF

                        if self.__endingTag:
                            self.handle_endtag(self.__atts[0][0], self.__atts[1:])
                            self.__state = WAITING_INF
                        else:
                            self.handle_starttag(self.__atts[0][0], self.__atts[1:])
                            if self.__atts[0][0].lower() in ('script', 'style'):
                                # content of these elements is collected
                                # raw until the matching closing tag
                                self.__state = WAITING_IN_SCRIPT
                                self.__jumpedTag = self.__atts[0][0].lower()
                            else:
                                self.__state = WAITING_INF

                self.__attName = ""
                self.__attValue = None

                self.__atts = []

            # A '<' showed up inside a tag: keep the pending attribute and
            # continue as if a new tag had just been opened.
            if self.__state == READ_INF_END_TAG:
                if self.__attName:
                    self.__atts.append((self.__attName, self.__attValue))
                    self.__attName = ""
                self.__state = WAITING_FIRSTCHAR_ATTNAME

            # remember the two previous characters (used to detect " />")
            self.__lastLastC = self.__lastC
            self.__lastC = c


        self.__offset += self.__pos