Python stopmarks Examples

Programming Language: Python

Namespace/Package Name: lang

Method/Function: stopmarks

Examples at hotexamples.com: 2

Python stopmarks - 2 examples found. These are the top-rated real-world Python examples of lang.stopmarks, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: indexer.py Project: kevinwatt/Ozearch

    def contentinsert(self):
        """Register the current URL and store its text split into lines.

        Reads ``self.uni`` (the URL), ``self.title`` and ``self.content``.
        The MD5 hex digest of the URL is recorded in ``self.md5urllist`` and
        passed to ``self.urlinsert`` and ``self.purei``.  When
        ``self.purei.checkexist()`` is truthy, the content is filtered, cut
        into lines at stop marks, and written with
        ``self.purei.insertPurecontent()``; a "." is then written to
        ``stderr``.

        Returns:
            None.
        """
        md5url = hashlib.md5(self.uni).hexdigest()
        self.purei.url_md5 = md5url
        self.md5urllist[md5url] = self.uni
        # url db
        self.urlinsert.url = self.uni
        self.urlinsert.md5url = md5url
        self.urlinsert.inserturldb()
        stmk = stopmarks()

        if not self.purei.checkexist():
            return

        self.purei.title = self.title.encode("utf-8")

        # Keep characters with code >= 32 plus TAB/LF/CR.  The pieces are
        # collected in a list and joined once: rebuilding the string with
        # ``context = context + xw`` for every character copies the whole
        # text each time, which is quadratic on large documents.
        kept = []
        for n, xw in enumerate(self.content, 1):
            code = ord(xw)
            if code >= 32 or code in (9, 10, 13):
                kept.append(xw)
            if n > 40000000:  # may over 65535 line of a document.
                break
        # The trailing space means the text never ends in a newline, so the
        # one-character look-ahead in the split test below (only evaluated
        # when the current character is LF) always has a character to read.
        kept.append(chr(32))
        context = "".join(kept)

        contline = [""]  # the last element is the line being built
        x = 0  # index of the current character in context
        msl = 260  # length a line must exceed before it may be split
        while x < len(context):
            ordx = ord(context[x])
            contline[-1] = contline[-1] + context[x]
            sentencecount = len(clearspace(contline[-1]))
            over_msl = sentencecount > msl
            well_over_msl = sentencecount > msl + 20
            # Split after an "a-type" stop mark or ". " once the line exceeds
            # msl; after a "b-type" stop mark, or a newline followed by a
            # character with code < 65, once it exceeds msl + 20.
            if (
                over_msl
                and (stmk.atypestopmarks(ordx) or context[x : x + 2] == ". ")
            ) or (
                well_over_msl
                and (
                    stmk.btypestopmarks(ordx)
                    or (ordx == 10 and ord(context[x + 1 : x + 2]) < 65)
                )
            ):
                nextword = context[x + 1 : x + 2]
                if nextword and punctuationmarks(ord(nextword)):
                    # In some cases Chinese text uses two marks in a row;
                    # keep the second mark on the same line.
                    x += 1
                    contline[-1] = contline[-1] + context[x]
                contline.append("")
                # Double msl each time the new line index is odd, until the
                # value is bigger than 16640.
                if msl <= 16640 and (len(contline) - 1) % 2:
                    msl = msl + msl
            x += 1
            # sentencecount was measured before any split above.  While it is
            # below msl, copy the next msl characters in one slice instead of
            # one character per iteration.
            if sentencecount < msl:
                contline[-1] = contline[-1] + context[x : x + msl]
                x = x + msl

        contcleanline = []
        for raw in contline:
            cont = clearspace(raw)
            if len(cont) > 1:
                # Drop at most one leading and one trailing space.
                start = 1 if cont[0] == chr(32) else 0
                stop = len(cont) - 1 if cont[-1] == chr(32) else len(cont)
                cont = cont[start:stop]
            if len(cont) < 65025 and cont != chr(32):
                contcleanline.append(cont.encode("utf-8"))
        self.purei.purecotentinline = contcleanline
        self.purei.content = clearspace(context).encode("utf-8")
        self.purei.insertPurecontent()
        stderr.write(".")

Example #2

Show file

File: indexer.py Project: kevinwatt/Ozearch

def OriginalHTMLprocess(listsplit):
    """Extract, split and store the plain text of every URL in *listsplit*.

    For each URL the MD5 key from ``md5hex`` is recorded in the result.  When
    ``purei.checkexist()`` is truthy for that key, the stored HTML is read
    from ``OriginalPage``, converted with ``langconvert`` using the charset
    reported by ``html2txt``, decoded as UTF-8 and turned into text (by
    ``html2txt``, falling back to regex tag stripping).  The text is filtered,
    cut into lines at stop marks and written with
    ``purei.insertPurecontent()``.  Timing and size information is passed to
    ``ilog.sentence_split_info`` along the way, and a "." is written to
    ``stderr`` per stored page.

    Args:
        listsplit: iterable of URLs to process.

    Returns:
        dict mapping each URL's MD5 key to a one-element list ``[url]``.
    """
    OriginalHTMLdb = OriginalPage()
    ilog = infologger()
    purei = Purecontent("c")
    try:
        # ``[^>]`` already matches newlines.  The former ``([^>]|\n)*`` form
        # matched the same text but could match each newline two ways, which
        # backtracks exponentially on an unclosed "<" followed by newlines.
        pat = re.compile(r"<[^>]*>")
        space = re.compile(r"&nbsp;|&copy;|\r|\t")
        stmk = stopmarks()
        md5urllist = {}
        for url in listsplit:
            md5url = md5hex(url)
            md5urllist[md5url] = [url]
            word = ""
            st = time.time()
            purei.url_md5 = md5url
            if not purei.checkexist():
                continue

            OriginalHTMLdb.url = url
            parser = html2txt()
            try:
                parser.feed(OriginalHTMLdb.queryoriginalct())
                charset = parser.charset  # charset detector
                parser.close()
            except Exception:
                charset = ""
            # NOTE(review): the page is queried a second time here; reuse the
            # first result if queryoriginalct() is known to be repeatable.
            Originaltext = langconvert(OriginalHTMLdb.queryoriginalct(), charset)
            Originaltext = Originaltext.decode("utf-8")
            ilog.sentence_split_info(time.time() - st)
            try:  # If this page is normal html format
                parser = html2txt()
                parser.feed(Originaltext)
                word = word + parser.text
                if not word:
                    word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))
                contenttitle = clearspace(parser.title)
                parser.close()
                purei.title = contenttitle.encode("utf-8")
            except Exception:
                # Fallback: retry for the title only, then append the
                # regex-stripped text.  If the first attempt failed after
                # reading parser.text, word already holds that text.
                try:
                    parser = html2txt()
                    parser.feed(Originaltext)
                    contenttitle = clearspace(parser.title)
                    parser.close()
                except Exception:
                    contenttitle = ""
                purei.title = contenttitle.encode("utf-8")
                word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))

            ilog.sentence_split_info(time.time() - st)
            # Keep characters with code >= 32 plus TAB/LF/CR.  Joining once
            # avoids copying the whole text for every appended character.
            kept = []
            for n, xw in enumerate(word, 1):
                code = ord(xw)
                if code >= 32 or code in (9, 10, 13):
                    kept.append(xw)
                if n > 40000000:  # may over 65535 line of a document.
                    break
            context = "".join(kept)
            ilog.sentence_split_info(purei.title + str(len(context)) + url + charset)
            # The trailing space means the text never ends in a newline, so
            # the one-character look-ahead in the split test below (only
            # evaluated when the current character is LF) always has a
            # character to read.
            context = context + chr(32)

            contline = [""]  # the last element is the line being built
            x = 0  # index of the current character in context
            msl = 260  # length a line must exceed before it may be split
            while x < len(context):
                ordx = ord(context[x])
                contline[-1] = contline[-1] + context[x]
                sentencecount = len(clearspace(contline[-1]))
                over_msl = sentencecount > msl
                well_over_msl = sentencecount > msl + 20
                # Split after an "a-type" stop mark or ". " once the line
                # exceeds msl; after a "b-type" stop mark, or a newline
                # followed by a character with code < 65, once it exceeds
                # msl + 20.
                if (
                    over_msl
                    and (stmk.atypestopmarks(ordx) or context[x : x + 2] == ". ")
                ) or (
                    well_over_msl
                    and (
                        stmk.btypestopmarks(ordx)
                        or (ordx == 10 and ord(context[x + 1 : x + 2]) < 65)
                    )
                ):
                    nextword = context[x + 1 : x + 2]
                    if nextword and punctuationmarks(ord(nextword)):
                        # In some cases Chinese text uses two marks in a row;
                        # keep the second mark on the same line.
                        x += 1
                        contline[-1] = contline[-1] + context[x]
                    contline.append("")
                    # Double msl each time the new line index is odd, until
                    # the value is bigger than 16640.
                    if msl <= 16640 and (len(contline) - 1) % 2:
                        msl = msl + msl
                x += 1
                # sentencecount was measured before any split above.  While
                # it is below msl, copy the next msl characters in one slice
                # instead of one character per iteration.
                if sentencecount < msl:
                    contline[-1] = contline[-1] + context[x : x + msl]
                    x = x + msl

            contcleanline = []
            ilog.sentence_split_info(time.time() - st)
            for raw in contline:
                cont = clearspace(raw)
                if len(cont) > 1:
                    # Drop at most one leading and one trailing space.
                    start = 1 if cont[0] == chr(32) else 0
                    stop = len(cont) - 1 if cont[-1] == chr(32) else len(cont)
                    cont = cont[start:stop]
                if len(cont) < 65025 and cont != chr(32):
                    contcleanline.append(cont.encode("utf-8"))
            ilog.sentence_split_info(time.time() - st)
            purei.purecotentinline = contcleanline
            purei.content = clearspace(context).encode("utf-8")
            purei.insertPurecontent()
            stderr.write(".")
    finally:
        # Close both handles even when processing a page raised.
        try:
            OriginalHTMLdb.close()
        finally:
            purei.close()
    return md5urllist