Ejemplo n.º 1
0
def urldbinsert(listsplit):
    """Insert every URL of ``listsplit`` into the URL database.

    Each URL is stored together with the hexadecimal MD5 digest of the URL.
    Text (unicode) URLs are encoded as UTF-8 before hashing; byte strings are
    hashed unchanged, so digests of byte-string and ASCII URLs stay the same.

    :param listsplit: iterable of URLs (byte strings or text).
    """
    text_type = type(u"")
    urlinsert = DataInsert()
    urlinsert.urldbinit()  # open the url database group
    try:
        for url in listsplit:
            # hashlib only accepts bytes-like data; text must be encoded.
            raw = url.encode("utf-8") if isinstance(url, text_type) else url
            urlinsert.url = url
            urlinsert.md5url = hashlib.md5(raw).hexdigest()
            urlinsert.inserturldb()
    finally:
        # Close the database even when hashing or an insert fails.
        urlinsert.urldbclose()
Ejemplo n.º 2
0
class Contentprocess(object):
    """Record URLs in the URL database and store cleaned page content.

    For every document the URL is hashed (MD5), written to the URL database
    through ``DataInsert`` and remembered in ``md5urllist``.  When
    ``Purecontent.checkexist()`` is truthy the content is stripped of control
    characters, split into lines and handed to ``Purecontent`` for storage.
    """

    # Characters whose index is above this value are ignored when cleaning.
    _MAXSCAN = 40000000
    # Cleaned lines with this many characters or more are not stored.
    # NOTE(review): presumably a storage limit (255 * 255) - confirm.
    _MAXLINE = 65025

    def __init__(self):
        self.uni = ""  # URL of the document currently being processed
        self.title = ""  # title of the current document
        self.content = ""  # raw content of the current document
        self.md5urllist = {}  # md5 hex digest of URL -> URL
        self.purei = Purecontent("c")
        self.urlinsert = DataInsert()
        self.urlinsert.urldbinit()

    def closeandreturn(self):
        """Close both stores and return the ``{md5: url}`` mapping."""
        try:
            self.purei.close()
        finally:
            # The URL database must be closed even if the first close fails.
            self.urlinsert.urldbclose()
        return self.md5urllist

    def contentadd(self, largeinsert):
        """Process every document of ``largeinsert``.

        :param largeinsert: mapping of URL -> sequence whose first item is
            the title and whose second item is the content.
        """
        for url, cdata in largeinsert.items():
            self.uni = url
            self.title = cdata[0]
            self.content = cdata[1]
            self.contentinsert()

    def contentinsert(self):
        """Store the document held in ``uni``, ``title`` and ``content``."""
        md5url = self._md5hex(self.uni)
        self.purei.url_md5 = md5url
        self.md5urllist[md5url] = self.uni
        # url db
        self.urlinsert.url = self.uni
        self.urlinsert.md5url = md5url
        self.urlinsert.inserturldb()

        # NOTE(review): content is stored only when checkexist() is truthy;
        # confirm the meaning of its return value against Purecontent.
        if not self.purei.checkexist():
            return

        self.purei.title = self.title.encode("utf-8")
        context = self._printable(self.content)
        contline = self._splitlines(context)
        self.purei.purecotentinline = self._cleanlines(contline)
        self.purei.content = clearspace(context).encode("utf-8")
        self.purei.insertPurecontent()
        stderr.write(".")  # progress mark, one per stored document

    @staticmethod
    def _md5hex(value):
        """Return the MD5 hex digest of ``value``; text is hashed as UTF-8."""
        if isinstance(value, type(u"")):
            # hashlib only accepts bytes-like data; text must be encoded.
            value = value.encode("utf-8")
        return hashlib.md5(value).hexdigest()

    @classmethod
    def _printable(cls, text):
        """Drop control characters from ``text`` and append one space.

        Tab, line feed and carriage return are kept.  Only the characters
        with an index up to ``_MAXSCAN`` are looked at.
        """
        kept = []
        for index, char in enumerate(text):
            if index > cls._MAXSCAN:
                break
            code = ord(char)
            if code >= 32 or code in (9, 10, 13):
                kept.append(char)
        # The trailing space guarantees that the text never ends in "\n".
        kept.append(chr(32))
        return "".join(kept)

    @staticmethod
    def _splitlines(context):
        """Split ``context`` into a list of lines at stop marks.

        A line is closed once its ``clearspace`` length exceeds the current
        threshold ``msl`` and a stop mark (or ". ", or a line feed followed
        by a character below code 65) is reached.
        """
        stmk = stopmarks()
        total = len(context)
        contline = [""]
        i = 0  # index of the line being filled
        x = 0  # position in context
        msl = 260  # NOTE(review): presumably "max sentence length" - confirm
        while x < total:
            ordx = ord(context[x])
            contline[i] = contline[i] + context[x]
            sentencecount = len(clearspace(contline[i]))
            if (
                (sentencecount > msl and stmk.atypestopmarks(ordx))
                or (sentencecount > msl and context[x : x + 2] == ". ")
                or (sentencecount > msl + 20 and stmk.btypestopmarks(ordx))
                or (
                    sentencecount > msl + 20
                    and ordx == 10
                    and ord(context[x + 1 : x + 2]) < 65
                )
            ):
                nextword = context[x + 1 : x + 2]
                if nextword and punctuationmarks(ord(nextword)):
                    # In some cases Chinese text uses two marks in a row;
                    # keep the second one on the same line.
                    x += 1
                    contline[i] = contline[i] + context[x]
                contline.append("")
                i = len(contline) - 1
                if msl <= 16640 and i % 2:
                    # Double the threshold whenever an odd-numbered line is
                    # started, as long as it has not exceeded 16640 yet.
                    msl = msl + msl
            x += 1
            if sentencecount < msl:
                # Still below the threshold: take a whole chunk at once
                # instead of going character by character.
                contline[i] = contline[i] + context[x : x + msl]
                x = x + msl
        return contline

    @classmethod
    def _cleanlines(cls, contline):
        """Return the UTF-8 encoded, space-trimmed lines worth storing.

        Every line is passed through ``clearspace``; one leading and one
        trailing space are removed.  Lines that are a single space or have
        ``_MAXLINE`` characters or more are left out.
        """
        space = chr(32)
        cleaned = []
        for line in contline:
            cont = clearspace(line)
            if len(cont) > 1:
                if cont[0] == space and cont[-1] == space:
                    cont = cont[1:-1]
                elif cont[-1] == space:
                    cont = cont[:-1]
                elif cont[0] == space:
                    cont = cont[1:]
            if len(cont) < cls._MAXLINE and cont != space:
                cleaned.append(cont.encode("utf-8"))
        return cleaned