예제 #1
0
 def saveUrls(self, dir, reportUrl, detailUrl):
     filename = os.path.join(dir, "urls.txt")
     if (not os.path.isfile(filename)):
         FileHelper.saveToFile(filename, '%s\r\b%s' % (reportUrl, detailUrl))
예제 #2
0
    def saveProfile(self, profileName, dir, reportUrl, detailUrl, overallEntry):
        assetdir = os.path.join(dir, "files" + os.sep)
        if (not os.path.isdir(dir)):
            os.makedirs(dir)
        if (not os.path.isdir(assetdir)):
            os.makedirs(assetdir)

        status, page = self.downloader.download_page(reportUrl,
                                                     dir,
                                                     assetdir,
                                                     '%s_origin.htm' % (profileName),
                                                     css=False,
                                                     javascript=False,
                                                     image=False)
        #self.downloader.clear_cache()
        
        if (page != None):
            reporter = None
            reportContent = ""
            #headers
            items = page.xpath(u"//*[@id='maincontent']//article/header/hgroup/*")
            for item in items:
                header = StrHelper.trim(item.text_content())
                if (header != None and header.startswith(profileName)):
                    header = StrHelper.trim(header[len(profileName):])
                reportContent += header + os.linesep
                break
            reportContent += os.linesep
            #content
            reg = re.compile(ur"^基金會編號.*$", re.MULTILINE)
            allsymbols = ur" ,、。.?!~$%@&#*‧;︰…‥﹐﹒˙·﹔﹕‘’“”〝〞‵′〃├─┼┴┬┤┌┐╞═╪╡│▕└┘╭╮╰╯╔╦╗╠═╬╣╓╥╖╒╤╕║╚╩╝╟╫╢╙╨╜╞╪╡╘╧╛﹣﹦≡|∣∥–︱—︳╴¯ ̄﹉﹊﹍﹎﹋﹌﹏︴﹨∕╲╱\/↑↓←→↖↗↙↘〔〕【】﹝﹞〈〉﹙﹚《》(){}﹛﹜『』「」<>≦≧﹤﹥︵︶︷︸︹︺︻︼︽︾︿﹀∩∪﹁﹂﹃﹄"
            regReporters = [  #re.compile(ur"[。:」\s]+(.{3,4})口述.?記者(.{3,4})(?:採訪整理)?$", re.MULTILINE),
                              re.compile(allsymbols + ur"[\s]+(.{2,4})[口筆]述\s?.?\s?記者(.{2,4})(?:採訪整理)?$", re.MULTILINE),
                              #[\u4e00-\u9fa5] 英文字符之外的字符,包括中文漢字和全角標點
                              re.compile(ur"報導.攝影.(.{2,4})記者$", re.MULTILINE),
                              re.compile(ur"報導.攝影.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"攝影.報導.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"攝影.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"報導.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"報導.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"記者(.{2,4})採訪整理$", re.MULTILINE),
                              re.compile(ur"^【(.{2,4})╱.{2,4}報導】", re.MULTILINE), ]

            #preserve <br> tags as \n
            brs = page.xpath(u"//div[@class='articulum']//br")
            if (len(brs) == 0):
            	brs = page.xpath(u"//div[@class='articulum trans']//br")

            for br in brs:
                br.tail = "\n" + br.tail if br.tail else "\n"

            items = page.xpath(u"//div[@class='articulum']/*")
            if (len(items) == 0):
                items = page.xpath(u"//div[@class='articulum trans']/*")

            for item in items:
                tag = item.tag.lower()
                id = self.get_attrib(item, "id", None)
                # if (tag == "figure"): continue
                # if (tag == "iframe"): break
                if (id == "bcontent" or id == "bhead" or id == "introid"):
                    text = StrHelper.trim(item.text_content())
                    if (text == None or text == ""): continue
                    if (id != "bhead"):
                        for regReporter in regReporters:
                            list = regReporter.findall(text)
                            if (len(list) == 1):
                                if (not isinstance(list[0], basestring)):
                                    reporter = "/".join(list[0])
                                else:
                                    reporter = list[0]
                                text = StrHelper.trim(regReporter.sub('', text))
                                break
                        if (reporter):
                            overallEntry.reporter = reporter
                        else:
                            self.logger.warn("error: parsing reporter: %s" % reportUrl)

                    text = StrHelper.trim(reg.sub('', text))
                    reportContent += text + os.linesep + os.linesep
            FileHelper.saveToFile(os.path.join(dir, reportFileName), reportContent)

        status, page = self.downloader.download_page(detailUrl,
                                                     dir,
                                                     assetdir,
                                                     detailSrcFileName,
                                                     css=False,
                                                     javascript=False,
                                                     image=False)
        if (page != None):
            items = page.xpath(u"//div[@id='charitysidebox3'][1]/div[@id='inquiry3']/table//tr")
            maxDate = None
            if (len(items) > 0):
                file = None
                try:
                    file = open(os.path.join(dir, detailFileName), "wb")
                    csvwriter = csv.writer(file)
                    for index, item in enumerate(items):
                        if (index > 1):
                            cols = item.xpath(u".//td")
                            if (len(cols) == 4):
                                no = StrHelper.trim(cols[0].text)
                                name = StrHelper.trim(cols[1].text)
                                amount = StrHelper.trim(cols[2].text)
                                dateStr = StrHelper.trim(cols[3].text)
                                try:
                                    date = datetime.datetime.strptime(dateStr, "%Y/%m/%d")
                                    if (maxDate == None or date > maxDate):
                                        maxDate = date
                                except Exception as ex:
                                    self.logger.warn("error date format:%s in %s" % (dateStr, detailUrl))
                                csvwriter.writerow([no, dateStr, amount, name])
                    overallEntry.enddate = maxDate.strftime("%Y/%m/%d") if maxDate != None else ""
                    overallEntry.doners = len(items) - 2
                except Exception as ex:
                    self.logger.exception(LogHelper.getExceptionMsg(ex, "error paring detail.html"))
                finally:
                    if (file):
                        file.close()