예제 #1
0
    def parsePages(self):
        """Crawl the paginated charity project list and archive each project.

        Pages are requested one at a time, starting at page 1.  Crawling stops
        when a page cannot be downloaded, when the HTTP status is not 200, or
        when a page contains no six-column ``tr.odd`` rows.  Each six-column
        row is handed to ``_processProjectRow``.

        Exceptions raised while crawling are logged with
        ``self.logger.exception`` and are not re-raised.
        """
        try:
            pageIndex = 1
            while True:
                currentPage = 'http://search.appledaily.com.tw/charity/projlist/Page/%d' % pageIndex
                self.logger.info('downloading page %d...' % pageIndex)
                # Keep the HTTP status under its own name; the per-row project
                # status is a different value and must not overwrite it.
                httpStatus, page = self.downloader.download_page(currentPage,
                                                                 self.htmlDir,
                                                                 self.assetDir,
                                                                 'page%05d.html' % (pageIndex),
                                                                 css=False,
                                                                 javascript=False,
                                                                 image=False)

                if page is None:
                    self.logger.warn("error: downloading page %d" % (pageIndex))
                    break
                if httpStatus != 200:
                    self.logger.warn("http response: %s %s" % (httpStatus, currentPage))
                    break

                self.logger.info('parsing page %d...' % pageIndex)
                row = 0
                for item in page.xpath(u"//tr[@class='odd']"):
                    nodes = item.xpath(u".//td")
                    if len(nodes) != 6:
                        continue
                    # Every six-column row counts, whatever its id looks like.
                    row += 1
                    self._processProjectRow(currentPage, nodes)

                self.logger.info("%d items found" % (row))
                # A page without any usable row marks the end of the listing
                # (this also covers the "no rows at all" case).
                if row == 0:
                    break
                pageIndex += 1
            self.logger.info('done!')
        except Exception as ex:
            self.logger.exception(LogHelper.getExceptionMsg(ex, "parsePages"))

    def _processProjectRow(self, currentPage, nodes):
        """Parse one six-cell listing row and save the project it describes.

        Args:
            currentPage: URL of the listing page; relative links in the row
                are resolved against it.
            nodes: the six ``td`` elements of the row, in document order
                (id, title link, date, status, amount, detail link).

        Rows whose id cell is empty or does not start with ``'A'`` are ignored.
        """
        projectId = nodes[0].text
        # An empty/missing id cell used to raise and abort the whole crawl.
        if not projectId or projectId[0] != 'A':
            return

        title = None
        reportUrl = None
        detailUrl = None

        titleLinks = nodes[1].xpath(u".//a")
        if len(titleLinks) > 0:
            title = titleLinks[0].text
            reportUrl = urlparse.urljoin(currentPage, self.get_attrib(titleLinks[0], "href", None))

        date = nodes[2].text
        # Compare the text as returned by lxml; wrapping it in str() would
        # require a non-default interpreter encoding for non-ASCII text.
        projectStatus = nodes[3].text_content()
        amount = nodes[4].text

        detailLinks = nodes[5].xpath(u".//a")
        if len(detailLinks) > 0:
            detailUrl = urlparse.urljoin(currentPage, self.get_attrib(detailLinks[0], "href", None))
        else:
            self.logger.warn("detail not found")

        # Logged once, whether the link is missing or merely has no text.
        if title is None:
            self.logger.warn("title not found")
        if title is None or reportUrl is None or detailUrl is None:
            self.logger.warn("parse error!!!")

        pendingDir = os.path.join(self.profileDir, u"未結案", projectId[-1:], projectId, "")

        if projectStatus == u"已結案":
            # Closed project: it lives directly under profileDir, and any copy
            # saved earlier under the pending folder is removed.
            profileDir = os.path.join(self.profileDir, projectId[-1:], projectId, "")
            if self.getIsProfileSaved(pendingDir):
                shutil.rmtree(pendingDir, ignore_errors=True)

            if not self.getIsProfileSaved(profileDir):
                overallEntry = self._buildOverallEntry(projectId, title, amount, date,
                                                       reportUrl, detailUrl)
                self.logger.info("saving profile %s" % projectId)

                # FIXME: IOError: [Errno 2] No such file or directory: appledaily/profiles/\u672a\u7d50\u6848/
                # Kept from the original workaround for the error above.
                profileDir = profileDir.replace(u"未結案" + os.sep, '')
                self.saveProfile(projectId, profileDir, reportUrl, detailUrl, overallEntry)
                # The row is built after saveProfile because saveProfile fills
                # in enddate, doners and reporter.
                self.saveOverallEntry(overallEntry.id, self._overallEntryRow(overallEntry))
            self.saveUrls(profileDir, reportUrl, detailUrl)
        elif projectStatus == u"未結案":
            # Pending project: always re-saved under the pending folder.
            overallEntry = self._buildOverallEntry(projectId, title, amount, date,
                                                   reportUrl, detailUrl)
            self.logger.info("saving profile %s" % projectId)
            self.saveProfile(projectId, pendingDir, reportUrl, detailUrl, overallEntry)
            self.saveOverallEntryPending(overallEntry.id, self._overallEntryRow(overallEntry))
            self.saveUrls(pendingDir, reportUrl, detailUrl)
        else:
            self.logger.warn("unknown status")

    def _buildOverallEntry(self, projectId, title, amount, date, reportUrl, detailUrl):
        """Create an OverallEntry pre-filled with the values read from a listing row."""
        entry = OverallEntry()
        entry.id = projectId
        entry.title = StrHelper.trim(title)
        entry.total = amount
        entry.begindate = date
        entry.reportUrl = reportUrl
        entry.detailUrl = detailUrl
        return entry

    def _overallEntryRow(self, entry):
        """Return the entry's fields as the list passed to saveOverallEntry*."""
        return [entry.id,
                entry.begindate,
                entry.enddate,
                entry.total,
                entry.doners,
                entry.title,
                entry.reporter,
                entry.reportUrl,
                entry.detailUrl]
예제 #2
0
    def saveProfile(self, profileName, dir, reportUrl, detailUrl, overallEntry):
        assetdir = os.path.join(dir, "files" + os.sep)
        if (not os.path.isdir(dir)):
            os.makedirs(dir)
        if (not os.path.isdir(assetdir)):
            os.makedirs(assetdir)

        status, page = self.downloader.download_page(reportUrl,
                                                     dir,
                                                     assetdir,
                                                     '%s_origin.htm' % (profileName),
                                                     css=False,
                                                     javascript=False,
                                                     image=False)
        #self.downloader.clear_cache()
        
        if (page != None):
            reporter = None
            reportContent = ""
            #headers
            items = page.xpath(u"//*[@id='maincontent']//article/header/hgroup/*")
            for item in items:
                header = StrHelper.trim(item.text_content())
                if (header != None and header.startswith(profileName)):
                    header = StrHelper.trim(header[len(profileName):])
                reportContent += header + os.linesep
                break
            reportContent += os.linesep
            #content
            reg = re.compile(ur"^基金會編號.*$", re.MULTILINE)
            allsymbols = ur" ,、。.?!~$%@&#*‧;︰…‥﹐﹒˙·﹔﹕‘’“”〝〞‵′〃├─┼┴┬┤┌┐╞═╪╡│▕└┘╭╮╰╯╔╦╗╠═╬╣╓╥╖╒╤╕║╚╩╝╟╫╢╙╨╜╞╪╡╘╧╛﹣﹦≡|∣∥–︱—︳╴¯ ̄﹉﹊﹍﹎﹋﹌﹏︴﹨∕╲╱\/↑↓←→↖↗↙↘〔〕【】﹝﹞〈〉﹙﹚《》(){}﹛﹜『』「」<>≦≧﹤﹥︵︶︷︸︹︺︻︼︽︾︿﹀∩∪﹁﹂﹃﹄"
            regReporters = [  #re.compile(ur"[。:」\s]+(.{3,4})口述.?記者(.{3,4})(?:採訪整理)?$", re.MULTILINE),
                              re.compile(allsymbols + ur"[\s]+(.{2,4})[口筆]述\s?.?\s?記者(.{2,4})(?:採訪整理)?$", re.MULTILINE),
                              #[\u4e00-\u9fa5] 英文字符之外的字符,包括中文漢字和全角標點
                              re.compile(ur"報導.攝影.(.{2,4})記者$", re.MULTILINE),
                              re.compile(ur"報導.攝影.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"攝影.報導.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"攝影.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"報導.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"報導.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"記者(.{2,4})採訪整理$", re.MULTILINE),
                              re.compile(ur"^【(.{2,4})╱.{2,4}報導】", re.MULTILINE), ]

            #preserve <br> tags as \n
            brs = page.xpath(u"//div[@class='articulum']//br")
            if (len(brs) == 0):
            	brs = page.xpath(u"//div[@class='articulum trans']//br")

            for br in brs:
                br.tail = "\n" + br.tail if br.tail else "\n"

            items = page.xpath(u"//div[@class='articulum']/*")
            if (len(items) == 0):
                items = page.xpath(u"//div[@class='articulum trans']/*")

            for item in items:
                tag = item.tag.lower()
                id = self.get_attrib(item, "id", None)
                # if (tag == "figure"): continue
                # if (tag == "iframe"): break
                if (id == "bcontent" or id == "bhead" or id == "introid"):
                    text = StrHelper.trim(item.text_content())
                    if (text == None or text == ""): continue
                    if (id != "bhead"):
                        for regReporter in regReporters:
                            list = regReporter.findall(text)
                            if (len(list) == 1):
                                if (not isinstance(list[0], basestring)):
                                    reporter = "/".join(list[0])
                                else:
                                    reporter = list[0]
                                text = StrHelper.trim(regReporter.sub('', text))
                                break
                        if (reporter):
                            overallEntry.reporter = reporter
                        else:
                            self.logger.warn("error: parsing reporter: %s" % reportUrl)

                    text = StrHelper.trim(reg.sub('', text))
                    reportContent += text + os.linesep + os.linesep
            FileHelper.saveToFile(os.path.join(dir, reportFileName), reportContent)

        status, page = self.downloader.download_page(detailUrl,
                                                     dir,
                                                     assetdir,
                                                     detailSrcFileName,
                                                     css=False,
                                                     javascript=False,
                                                     image=False)
        if (page != None):
            items = page.xpath(u"//div[@id='charitysidebox3'][1]/div[@id='inquiry3']/table//tr")
            maxDate = None
            if (len(items) > 0):
                file = None
                try:
                    file = open(os.path.join(dir, detailFileName), "wb")
                    csvwriter = csv.writer(file)
                    for index, item in enumerate(items):
                        if (index > 1):
                            cols = item.xpath(u".//td")
                            if (len(cols) == 4):
                                no = StrHelper.trim(cols[0].text)
                                name = StrHelper.trim(cols[1].text)
                                amount = StrHelper.trim(cols[2].text)
                                dateStr = StrHelper.trim(cols[3].text)
                                try:
                                    date = datetime.datetime.strptime(dateStr, "%Y/%m/%d")
                                    if (maxDate == None or date > maxDate):
                                        maxDate = date
                                except Exception as ex:
                                    self.logger.warn("error date format:%s in %s" % (dateStr, detailUrl))
                                csvwriter.writerow([no, dateStr, amount, name])
                    overallEntry.enddate = maxDate.strftime("%Y/%m/%d") if maxDate != None else ""
                    overallEntry.doners = len(items) - 2
                except Exception as ex:
                    self.logger.exception(LogHelper.getExceptionMsg(ex, "error paring detail.html"))
                finally:
                    if (file):
                        file.close()