Beispiel #1
0
def main():
    """Command-line entry point: import pages, then run the processing steps.

    Recognised invocations (``sys.argv[1:]``):

    * ``--help`` / ``-h`` / ``--usage`` / ``-?`` (as the only argument):
      call ``help()`` and exit with status 0.
    * ``--zipfile`` / ``-z`` followed by a path: every archive member whose
      base name is not ``linkinfo`` is mapped to a URL with ``httpsplit()``
      and, when ``OriginalPage.checkexist()`` is truthy, its bytes are stored
      via ``insertoriginalct()``.
    * ``--url`` / ``-u`` followed by a URL: the content returned by
      ``openhtml()`` is stored the same way.

    With no arguments, an unrecognised option, or a missing operand, ``help()``
    is called and the process exits with status 2.  Previously those cases
    crashed with ``NameError`` / ``IndexError`` further down.

    After the import step the collected URLs are passed, in order, to
    ``urldbinsert()``, ``OriginalHTMLprocess()`` and ``linesplitinster()``.
    """
    args = sys.argv[1:]
    if not args:
        help()
        sys.exit(2)  # common exit code for syntax error
    if args in (["--help"], ["-h"], ["--usage"], ["-?"]):
        help()
        sys.exit(0)

    option = args[0]
    operands = args[1:]
    listsplit = []  # URLs whose original content was stored during this run

    if option in ("--zipfile", "-z"):
        # Only look at the operands: sys.argv[0] (the script itself) always
        # exists on disk and must never be mistaken for the archive.
        # As before, the last existing path wins.
        filename = None
        for candidate in operands:
            if os.path.exists(candidate):
                filename = candidate
        if filename is None:
            stderr.write("No existing zip file was given.\n")
            sys.exit(2)

        fp = ZipFile(filename, "r")
        try:
            OriginalHTMLdb = OriginalPage()
            # Single-argument parenthesised print behaves identically under
            # the Python 2 print statement and the Python 3 print function.
            print("\nOriginalHTML Insert:")
            for member in fp.namelist():
                if member.split("/")[-1] == "linkinfo":
                    continue
                nametourl = httpsplit(member)
                OriginalHTMLdb.url = nametourl
                if OriginalHTMLdb.checkexist():
                    OriginalHTMLdb.content = fp.read(member)
                    OriginalHTMLdb.insertoriginalct()
                    listsplit.append(nametourl)
                stderr.write(".")  # progress marker, one per processed member
        finally:
            # The archive is only needed while importing; release it even if
            # a member fails to import.
            fp.close()
    elif option in ("--url", "-u"):
        if not operands:
            help()
            sys.exit(2)
        OriginalHTMLdb = OriginalPage()
        OriginalHTMLdb.url = operands[0]
        if OriginalHTMLdb.checkexist():
            OriginalHTMLdb.content = openhtml(OriginalHTMLdb.url)
            OriginalHTMLdb.insertoriginalct()
            listsplit.append(OriginalHTMLdb.url)
    else:
        help()
        sys.exit(2)

    urldbinsert(listsplit)
    OriginalHTMLdb.sync()
    print("\nOriginalHTML Process:")
    md5urllist = OriginalHTMLprocess(listsplit)
    print("\nWordSplitting Process:")
    linesplitinster(md5urllist)
    OriginalHTMLdb.close()
Beispiel #2
0
# Matches a whole markup tag; used as a crude tag stripper.
_TAG_RE = re.compile("<([^>]|\n)*>")
# Matches the &nbsp; / &copy; entities, carriage returns and tabs.  Written as
# a raw string so that "\&" and "\;" are not invalid string escapes; the
# compiled pattern matches exactly the same text as the former non-raw literal.
_ENTITY_WS_RE = re.compile(r"\&nbsp\;|\&copy\;|\r|\t")

# Only this many leading characters of a document are examined (the former
# per-character loop stopped once its counter exceeded 40000000).
_MAX_SCAN_CHARS = 40000001
# Initial minimum sentence length before a stop mark may end a line.
_INITIAL_SENTENCE_LEN = 260
# The minimum sentence length keeps doubling while it is <= this value.
_MAX_DOUBLING_LEN = 16640
# Cleaned lines of this length or longer are dropped.
_MAX_LINE_LEN = 65025


def _regex_strip_markup(text):
    """Replace tags, then &nbsp;/&copy;/CR/TAB, with single spaces."""
    return _ENTITY_WS_RE.sub(chr(32), _TAG_RE.sub(chr(32), text))


def _drop_control_chars(text):
    """Keep characters with code >= 32 plus TAB, LF and CR; drop the rest."""
    return "".join(
        ch
        for ch in text[:_MAX_SCAN_CHARS]
        if ord(ch) >= 32 or ord(ch) in (9, 10, 13)
    )


def _split_sentences(context, stmk):
    """Cut *context* into a list of line strings at stop marks.

    A line may end once its ``clearspace()``-ed length exceeds the current
    minimum ``msl`` (for ``stmk.atypestopmarks`` characters or a ". " pair) or
    ``msl + 20`` (for ``stmk.btypestopmarks`` characters, or a newline that is
    followed by a character with code < 65).
    """
    lines = [""]
    line_no = 0  # index of the line currently being filled
    pos = 0  # index of the character currently being examined
    msl = _INITIAL_SENTENCE_LEN
    total = len(context)
    while pos < total:
        code = ord(context[pos])
        lines[line_no] = lines[line_no] + context[pos]
        sentencecount = len(clearspace(lines[line_no]))
        nextword = context[pos + 1 : pos + 2]  # "" at the end of the text
        over = sentencecount > msl
        well_over = sentencecount > msl + 20
        if (
            over
            and (stmk.atypestopmarks(code) or context[pos : pos + 2] == ". ")
        ) or (
            well_over
            and (
                stmk.btypestopmarks(code)
                # The length check keeps ord() from being called on "" when
                # a newline happens to be the very last character.
                or (code == 10 and len(nextword) == 1 and ord(nextword) < 65)
            )
        ):
            if nextword and punctuationmarks(ord(nextword)):
                # at some case, chinese word will use two marks.
                pos += 1
                lines[line_no] = lines[line_no] + context[pos]
            lines.append("")
            line_no += 1
            if msl <= _MAX_DOUBLING_LEN and line_no % 2:
                # Double the minimum length on every odd line number while it
                # is still within the doubling limit.
                msl = msl + msl
        pos += 1
        if sentencecount < msl:
            # Still below the minimum: copy a whole chunk at once instead of
            # re-measuring the line after every single character.
            lines[line_no] = lines[line_no] + context[pos : pos + msl]
            pos = pos + msl
    return lines


def _clean_lines(lines):
    """Normalise each line with clearspace(), trim one edge space, encode."""
    cleaned = []
    for raw in lines:
        cont = clearspace(raw)
        if len(cont) > 1:
            if cont[0] == chr(32) and cont[-1] == chr(32):
                cont = cont[1:-1]
            elif cont[-1] == chr(32):
                cont = cont[:-1]
            elif cont[0] == chr(32):
                cont = cont[1:]
        if len(cont) < _MAX_LINE_LEN and cont != chr(32):
            cleaned.append(cont.encode("utf-8"))
    return cleaned


def OriginalHTMLprocess(listsplit):
    """Extract text from stored pages and save it as pure content.

    For every URL in *listsplit* for which ``Purecontent.checkexist()`` is
    truthy, the stored original page is read, converted with ``langconvert()``,
    reduced to text and a title with ``html2txt`` (falling back to regex tag
    stripping), split into lines at stop marks and written through
    ``Purecontent.insertPurecontent()``.  Timing and summary values are
    reported through ``infologger.sentence_split_info()`` and one "." is
    written to ``stderr`` per processed page.

    :param listsplit: iterable of URL strings.
    :returns: dict mapping ``md5hex(url)`` to ``[url]`` for every URL in
        *listsplit*, whether or not the page was processed.

    Both database handles are closed even when processing a page raises.
    """
    OriginalHTMLdb = OriginalPage()
    ilog = infologger()
    purei = Purecontent("c")
    stmk = stopmarks()
    md5urllist = {}
    try:
        for url in listsplit:
            md5url = md5hex(url)
            md5urllist[md5url] = [url]
            word = ""
            st = time.time()
            purei.url_md5 = md5url
            if not purei.checkexist():
                continue

            OriginalHTMLdb.url = url
            # Read the stored page once; it is needed for both charset
            # detection and conversion.
            raw_html = OriginalHTMLdb.queryoriginalct()
            parser = html2txt()
            try:
                parser.feed(raw_html)
                charset = parser.charset  # charset detector
                parser.close()
            except Exception:
                # Best effort: an unparsable page just has no detected charset.
                charset = ""
            # presumably langconvert() returns UTF-8 encoded bytes - TODO confirm
            Originaltext = langconvert(raw_html, charset).decode("utf-8")
            ilog.sentence_split_info(time.time() - st)

            try:  # If this page is normal html format
                parser = html2txt()
                parser.feed(Originaltext)
                word = word + parser.text
                if not word:
                    word = word + _regex_strip_markup(Originaltext)
                contenttitle = clearspace(parser.title)
                parser.close()
                purei.title = contenttitle.encode("utf-8")
            except Exception:
                try:
                    parser = html2txt()
                    parser.feed(Originaltext)
                    contenttitle = clearspace(parser.title)
                    parser.close()
                except Exception:
                    contenttitle = ""
                purei.title = contenttitle.encode("utf-8")
                # NOTE(review): if the failure above happened after
                # parser.text was already appended, the regex-stripped text is
                # added on top of it - confirm whether that is intended.
                word = word + _regex_strip_markup(Originaltext)

            ilog.sentence_split_info(time.time() - st)
            context = _drop_control_chars(word)
            ilog.sentence_split_info(
                purei.title + str(len(context)) + url + charset
            )
            # The trailing space guarantees the text never ends on a newline.
            context = context + chr(32)

            contline = _split_sentences(context, stmk)
            ilog.sentence_split_info(time.time() - st)
            contcleanline = _clean_lines(contline)
            ilog.sentence_split_info(time.time() - st)

            purei.purecotentinline = contcleanline
            purei.content = clearspace(context).encode("utf-8")
            purei.insertPurecontent()
            stderr.write(".")
    finally:
        OriginalHTMLdb.close()
        purei.close()
    return md5urllist