Exemple #1
0
            except:
                pass
            post.update({k: v})
        return post


if __name__ == '__main__':
    spider = MySpider()
    spider.proxy_enable = False
    spider.init_dedup()
    spider.init_downloader()

    # ------------ parse_detail_page() ----------

    url = "http://ggzyjy.jl.gov.cn/JiLinZtb//Template/Default/ZBGGMoreInfoJYXX.aspx?CategoryNum=004001"
    resp = spider.download(url)
    res = spider.parse_detail_page(resp, url)
    # for item in res:
    #     for k, v in item.iteritems():
    #         print k, v
    #         print "*"*30

    # ------------ parse_detail_page() ----------
    # "https://www.bidcenter.com.cn/zbpage-1-1.html",  # 招标公告
    # "https://www.bidcenter.com.cn/zbpage-4-1.html",  # 中标公告
    # "https://www.bidcenter.com.cn/zbpage-6-1.html",  # 招标变更
    # for i in xrange(10,100):
    #     url = "https://www.bidcenter.com.cn/zbpage-6-{0}.html".format(i)
    #     resp = spider.download(url)
    #     res = spider.parse_detail_page(resp, url)
    #     # for item in res:
Exemple #2
0
def readfile(file):
    try:
        if file.startswith('https://') or file.startswith(
                'http://') or file.startswith('ftp://'):
            data = BytesIO(download(file))
        else:
            data = open(file, 'rb')

        if file.endswith('.caj') or file.endswith('.pdf'):
            with StringIO() as outfp:
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, outfp)
                process_pdf(rsrcmgr, device, data)
                return outfp.getvalue()
        elif file.endswith('.doc'):
            text = ''
            document = olefile.OleFileIO(data)

            wordDocument = document.openstream('WordDocument').read()

            # Parsing the WordDocument Stream
            # See https://msdn.microsoft.com/en-us/library/office/dd904907(v=office.14).aspx
            # And http://b2xtranslator.sourceforge.net/howtos/How_to_retrieve_text_from_a_binary_doc_file.pdf

            # Loading the FIB
            fib = wordDocument[:1472]

            # Loading and Parsing the piece table
            fcClx = int.from_bytes(fib[0x01A2:0x01A5], byteorder='little')
            lcbClx = int.from_bytes(fib[0x01A6:0x01A9], byteorder='little')

            tableFlag = ((int.from_bytes(
                fib[0x000A:0x000E], byteorder='little') & 0x0200) == 0x0200)
            tableName = ('0Table', '1Table')[tableFlag]

            table = document.openstream(tableName).read()

            clx = table[fcClx:fcClx + lcbClx]

            pos = 0
            pieceTable = ''
            lcbPieceTable = 0
            while True:
                if clx[pos] == 2:
                    # this entry is the piece table
                    lcbPieceTable = int.from_bytes(clx[pos + 1:pos + 5],
                                                   byteorder='little')
                    pieceTable = clx[pos + 5:pos + 5 + lcbPieceTable]
                    break
                elif clx[pos] == 1:
                    # skip this entry
                    pos = pos + 1 + 1 + ord(clx[pos + 1])
                else:
                    break

            i = 1
            pieceCount = (lcbPieceTable - 4) / 12
            while i <= pieceCount:
                cpStart = int.from_bytes(pieceTable[i * 4:i * 4 + 4],
                                         byteorder='little')
                cpEnd = int.from_bytes(pieceTable[(i + 1) * 4:(i + 1) * 4 + 4],
                                       byteorder='little')

                offsetPieceDescriptor = int(((pieceCount + 1) * 4) + (i * 8))
                pieceDescriptor = pieceTable[
                    offsetPieceDescriptor:offsetPieceDescriptor + 8]

                fcValue = int.from_bytes(pieceDescriptor[2:6],
                                         byteorder='little')
                isANSII = (fcValue & 0x40000000) == 0x40000000
                fc = fcValue & 0xBFFFFFFF

                encoding = ('utf-16', 'cp1252')[isANSII]
                cb = cpEnd - cpStart
                cb = (cb * 2, cb)[isANSII]
                text += wordDocument[fc:fc + cb].decode(encoding)

                i += 1

            return text
        elif file.endswith('.docx'):
            text = ''
            document = Document(data)

            text += '\n\n'.join(
                [paragraph.text for paragraph in document.paragraphs])

            for table in document.tables:
                text += _parse_docx_table(table, text)

            return text
        elif file.endswith('.htm') or file.endswith('.html'):
            html = html2text.HTML2Text()
            html.ignore_links = True
            return html.handle(data.read().decode('utf-8'))
        elif file.endswith('.rtf'):
            with BytesIO() as outfp:
                document = Rtf15Reader.read(data)
                return PlaintextWriter.write(document, outfp).getvalue()
        elif file.endswith('.txt'):
            return data.read()
        else:
            raise Exception('Unknown file extension')
    except:
        pass