예제 #1
0
    def download_page(self, url, outDir, assetDir, filename, css=False, javascript=False, image=False):
        """Fetch ``url``, save it as ``filename`` under ``outDir`` and parse it.

        Args:
            url: address requested through ``self.session.get``.
            outDir: directory the page is saved into.
            assetDir: passed through to ``download_related_files``.
            filename: name of the saved page inside ``outDir``.
            css, javascript, image: when true, ``download_related_files`` is
                called for ``<link href>``, ``<script src>`` and ``<img src>``
                respectively before the page is saved.

        Returns:
            ``(-1, None)`` when the request raised, ``(status_code, None)`` for
            a non-200 response, otherwise ``(200, page)`` where ``page`` is the
            tree returned by ``htmlparser.fromstring``.
        """
        try:
            response = self.session.get(url)
        except requests.exceptions.ConnectionError as connectionError:
            # This used to be swallowed silently, which made failed fetches
            # impossible to diagnose from the log.
            self.logger.warn("connection error: %s (%r)" % (url, connectionError))
            return -1, None
        except Exception as ex:
            self.logger.exception(LogHelper.getExceptionMsg(ex, "exception: download %s" % url))
            return -1, None

        if response.status_code != 200:
            self.logger.warn("http response: %s %s" % (response.status_code, url))
            return response.status_code, None

        html = response.content.decode('utf-8', 'ignore')
        page = htmlparser.fromstring(html, base_url=url)
        target = os.path.join(outDir, filename)

        if not (css or javascript or image):
            # No assets requested: keep the decoded response text untouched.
            self.saveToFile(target, html)
            return response.status_code, page

        if css:
            self.download_related_files(page, u"//link", u"href", outDir, assetDir, baseurl=url)
        if javascript:
            self.download_related_files(page, u"//script", u"src", outDir, assetDir, baseurl=url)
        if image:
            self.download_related_files(page, u"//img", u"src", outDir, assetDir, baseurl=url)

        # Save the tree as it stands after the asset pass, not the raw text.
        self.saveToFile(target, htmlparser.tostring(page))
        return response.status_code, page
예제 #2
0
    def parseProfile(self, name, urlProfile, outputDir, donors):
        result = False
        try:
            self.logger.info('downloading profile %s...' % name)
            #filename = '%s.html' % datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d_%H%M%S')
            status, page = self.downloader.download_page(urlProfile,
                                                         outputDir,
                                                         self.assetDir,
                                                         profileHtmlSrcFilename,
                                                         css=True,
                                                         javascript=True,
                                                         image=False)
            if (page == None):
                pass
            elif (status != 200):
                self.logger.warn("http response: %s %s" % (status, urlProfile))
            else:
                detailedFound = False
                progress = ""
                raised = ""
                togo = ""
                nodeDetailes = self.find_element_by_xpath(page, u'//div[@id="funding_details"]')
                #text = htmlparser.tostring(nodeDetailes, "innerHTML")
                text=nodeDetailes.text_content()
                if (text != None and text != ""):
                    text = text.lower()
                    list = re.findall(ur"([\d\.]+?)%\s*(.*?)\$([\d,]+)\s*(.*?)\s*?$", text)
                    if (len(list) == 1 and len(list[0]) == 4):
                        values = list[0]
                        if (values[1] != None and values[3] == "raised"):
                            progress = values[0]
                            raised = values[2]
                            detailedFound = True
                        elif (values[1] != None and values[3] == "to go"):
                            progress = values[0]
                            togo = values[2]
                            detailedFound = True
                    if(not detailedFound):
                        self.logger.error("invalid reg pattern for %s in profile %s" % (text, urlProfile))

                if (not detailedFound):
                    self.logger.error("profile %s details not found" % name)
                else:
                    timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
                    if (raised != None):
                        raised = re.sub(ur"[,$]", "", raised)
                    if (togo != None):
                        togo = re.sub(ur"[,$]", "", togo)
                    details = [timestamp, progress, raised, togo, donors]
                    self.logger.info("profile %s details: %s" % (name, str(details)))
                    #self.save_profile_details(os.path.join(outputDir, '%s.txt' % (name)), details)
                    self.save_profile_details(os.path.join(outputDir, 'data.csv'), details)
                    result = True
        except Exception as ex:
            self.logger.exception(LogHelper.getExceptionMsg(ex, "Exception: %s: %s " % ("parseProfile", name)))
        finally:
            pass
        return result
예제 #3
0
    def savePatientInfos(self):
        """Close the open patient-info handle, if any, and truncate the cache file.

        NOTE(review): as visible here nothing is written -- the file at
        ``self.patientInfosFilename`` is opened in ``'w'`` mode and closed
        straight away, which leaves it empty (or creates it).  Confirm that
        this is the intended behaviour.

        Errors while opening the file are logged and not re-raised.
        """
        if self.patientInfosFile:
            self.patientInfosFile.close()
            self.patientInfosFile = None

        try:
            # Opening in 'w' mode is what truncates the file; the context
            # manager closes it again immediately.
            with open(self.patientInfosFilename, 'w'):
                pass
        except Exception as ex:
            self.logger.exception(LogHelper.getExceptionMsg(ex, "error: can't write cache file"))
예제 #4
0
 def saveToFile(filename, data):
     """Write ``data`` to ``filename`` in binary mode, replacing any old content.

     Failures are reported through the module ``logger`` and not re-raised.
     """
     handle = None
     try:
         handle = open(filename, mode='wb')
         handle.write(data)
     except Exception as err:
         message = "unable to save file: %s" % filename
         logger.exception(LogHelper.getExceptionMsg(err, message))
     finally:
         # Only close what was actually opened.
         if handle is not None:
             handle.close()
예제 #5
0
 def saveToFile(self, filename, data):
     """Write ``data`` to ``filename`` in binary mode, replacing any old content.

     Failures are reported through ``self.logger`` and not re-raised.
     """
     out = None
     try:
         out = open(filename, mode='wb')
         out.write(data)
     except Exception as err:
         detail = "unable to save file: %s" % (filename)
         self.logger.exception(LogHelper.getExceptionMsg(err, detail))
     finally:
         if out:
             out.close()
예제 #6
0
 def save_profile_details(self, filename, list):
     """Append ``list`` as one CSV row to ``filename``.

     The file is opened in binary append mode and flushed after the row is
     written.  Failures are reported through ``self.logger`` and not
     re-raised.
     """
     handle = None
     try:
         handle = open(filename, "ab")
         csv.writer(handle).writerow(list)
         handle.flush()
     except Exception as err:
         detail = "unable to save file: %s" % (filename)
         self.logger.exception(LogHelper.getExceptionMsg(err, detail))
     finally:
         if handle is not None:
             handle.close()
예제 #7
0
 def saveTextToFile(filename, data, encoding="utf-8"):
     """Write the text ``data`` to ``filename`` using ``encoding``.

     The file is opened through ``codecs.open`` in ``'w'`` mode, so existing
     content is replaced.  Failures are reported through the module
     ``logger`` and not re-raised.
     """
     writer = None
     try:
         writer = codecs.open(filename, mode='w', encoding=encoding)
         writer.write(data)
     except Exception as err:
         message = "unable to save file: %s" % filename
         logger.exception(LogHelper.getExceptionMsg(err, message))
     finally:
         if writer is not None:
             writer.close()
예제 #8
0
 def saveTextToFile(self, filename, data):
     """Write ``data`` to ``filename`` opened in plain ``'w'`` mode.

     No explicit encoding is applied.  Failures are reported through
     ``self.logger`` and not re-raised.
     """
     out = None
     try:
         out = open(filename, mode='w')
         out.write(data)
     except Exception as err:
         detail = "unable to save file: %s" % filename
         self.logger.exception(LogHelper.getExceptionMsg(err, detail))
     finally:
         if out:
             out.close()
예제 #9
0
 def loadPatientInfos(self):
     # Read the cache file named by self.patientInfosFilename, if it exists,
     # and merge every entry it contains into self.dictPatientInfos.
     # Each line is parsed with ast.literal_eval; only dict entries that
     # contain an "id" key are used.  Errors are logged, not re-raised.
     if (os.path.isfile(self.patientInfosFilename)):
         file = None
         try:
             file = open(self.patientInfosFilename, "r")
             while True:
                 line = file.readline()
                 # readline() returns an empty string at end of file.
                 if not line: break
                 # NOTE(review): unreachable -- an empty string is already
                 # caught by the `not line` check on the previous line.
                 if line == "": continue
                 newEntry = ast.literal_eval(line)
                 # NOTE(review): the guard tests for the literal key "id" but
                 # the lookup uses self.cache_key_id; presumably these are the
                 # same key -- confirm.
                 if (isinstance(newEntry, dict) and "id" in newEntry):
                     id = newEntry[self.cache_key_id]
                     # Reuse the entry already held for this id, or start one.
                     if (id in self.dictPatientInfos):
                         entry = self.dictPatientInfos[id]
                     else:
                         entry = {}
                         self.dictPatientInfos[id] = entry
                     # Later lines overwrite earlier values key by key.
                     for key in newEntry:
                         entry[key] = newEntry[key]
             self.logger.info("%d patient infos loaded" % len(self.dictPatientInfos))
         except Exception, ex:
             self.logger.exception(LogHelper.getExceptionMsg(ex, "Can't read cache file"))
         finally:
             # NOTE(review): the body of this clause is missing from the excerpt.
예제 #10
0
    def parsePages(self):
        """Crawl the Apple Daily charity project list and archive each case.

        Listing pages are fetched in order, starting at page 1, through
        ``self.downloader.download_page``.  The crawl stops when a page cannot
        be downloaded, answers with a non-200 status, or contains no
        six-column ``tr.odd`` rows.  Every such row is handed to
        ``_archiveCaseRow``.

        Any exception is logged and swallowed; nothing is returned.
        """
        try:
            pageIndex = 1
            while True:
                currentPage = 'http://search.appledaily.com.tw/charity/projlist/Page/%d' % pageIndex
                self.logger.info('downloading page %d...' % pageIndex)
                status, page = self.downloader.download_page(currentPage,
                                                             self.htmlDir,
                                                             self.assetDir,
                                                             'page%05d.html' % (pageIndex),
                                                             css=False,
                                                             javascript=False,
                                                             image=False)

                if page is None:
                    self.logger.warn("error: downloading page %d" % (pageIndex))
                    break
                if status != 200:
                    self.logger.warn("http response: %s %s" % (status, currentPage))
                    break

                self.logger.info('parsing page %d...' % pageIndex)
                row = 0
                for item in page.xpath(u"//tr[@class='odd']"):
                    nodes = item.xpath(u".//td")
                    if len(nodes) != 6:
                        continue
                    row += 1
                    self._archiveCaseRow(currentPage, nodes)

                self.logger.info("%d items found" % (row))
                if row == 0:
                    # Also covers a page without any rows at all, which is why
                    # the former separate "items not found" checks (they could
                    # never be reached) are gone.
                    break
                pageIndex += 1
            self.logger.info('done!')
        except Exception as ex:
            self.logger.exception(LogHelper.getExceptionMsg(ex, "parsePages"))

    def _archiveCaseRow(self, currentPage, nodes):
        """Archive the case described by the six cells of one listing row.

        Rows whose id cell is empty or does not start with "A" are skipped.
        Closed cases (已結案) are saved once under ``<last char>/<id>/`` and any
        copy in the pending tree is removed; pending cases (未結案) are saved
        again on every run under the ``未結案`` subtree.
        """
        caseId = nodes[0].text
        # An empty id cell used to raise and abort the whole crawl.
        if not caseId or caseId[0] != 'A':
            return

        title = None
        reportUrl = None
        detailUrl = None

        anchors = nodes[1].xpath(u".//a")
        if len(anchors) > 0:
            title = anchors[0].text
            reportUrl = urlparse.urljoin(currentPage, self.get_attrib(anchors[0], "href", None))
        date = nodes[2].text
        # Named caseStatus so it no longer shadows the HTTP status in parsePages.
        caseStatus = str(nodes[3].text_content())
        amount = nodes[4].text
        anchors = nodes[5].xpath(u".//a")
        if len(anchors) > 0:
            detailUrl = urlparse.urljoin(currentPage, self.get_attrib(anchors[0], "href", None))
        else:
            self.logger.warn("detail not found")

        if title is None:
            self.logger.warn("title not found")
        if title is None or reportUrl is None or detailUrl is None:
            self.logger.warn("parse error!!!")

        relativeDir = caseId[-1:] + os.sep + caseId + os.sep
        closedDir = os.path.join(self.profileDir, relativeDir)
        pendingDir = os.path.join(self.profileDir, u"未結案" + os.sep + relativeDir)

        if caseStatus == u"已結案":
            # The case has been closed: drop the copy kept while it was pending.
            if self.getIsProfileSaved(pendingDir):
                shutil.rmtree(pendingDir, ignore_errors=True)

            if not self.getIsProfileSaved(closedDir):
                overallEntry = self._newOverallEntry(caseId, title, amount, date, reportUrl, detailUrl)
                self.logger.info("saving profile %s" % caseId)

                # FIXME (kept from the original): IOError: [Errno 2] No such
                # file or directory: appledaily/profiles/\u672a\u7d50\u6848/
                closedDir = closedDir.replace(u"未結案" + os.sep, '')
                self.saveProfile(caseId, closedDir, reportUrl, detailUrl, overallEntry)
                self.saveOverallEntry(overallEntry.id, self._overallEntryRow(overallEntry))
            self.saveUrls(closedDir, reportUrl, detailUrl)
        elif caseStatus == u"未結案":
            overallEntry = self._newOverallEntry(caseId, title, amount, date, reportUrl, detailUrl)
            self.logger.info("saving profile %s" % caseId)
            self.saveProfile(caseId, pendingDir, reportUrl, detailUrl, overallEntry)
            self.saveOverallEntryPending(overallEntry.id, self._overallEntryRow(overallEntry))
            self.saveUrls(pendingDir, reportUrl, detailUrl)
        else:
            self.logger.warn("unknown status")

    def _newOverallEntry(self, caseId, title, amount, date, reportUrl, detailUrl):
        """Build an OverallEntry from the values read off a listing row."""
        overallEntry = OverallEntry()
        overallEntry.id = caseId
        overallEntry.title = StrHelper.trim(title)
        overallEntry.total = amount
        overallEntry.begindate = date
        overallEntry.reportUrl = reportUrl
        overallEntry.detailUrl = detailUrl
        return overallEntry

    def _overallEntryRow(self, overallEntry):
        """Return the entry's fields in the column order of the overall table."""
        return [overallEntry.id,
                overallEntry.begindate,
                overallEntry.enddate,
                overallEntry.total,
                overallEntry.doners,
                overallEntry.title,
                overallEntry.reporter,
                overallEntry.reportUrl,
                overallEntry.detailUrl]
예제 #11
0
    def saveProfile(self, profileName, dir, reportUrl, detailUrl, overallEntry):
        assetdir = os.path.join(dir, "files" + os.sep)
        if (not os.path.isdir(dir)):
            os.makedirs(dir)
        if (not os.path.isdir(assetdir)):
            os.makedirs(assetdir)

        status, page = self.downloader.download_page(reportUrl,
                                                     dir,
                                                     assetdir,
                                                     '%s_origin.htm' % (profileName),
                                                     css=False,
                                                     javascript=False,
                                                     image=False)
        #self.downloader.clear_cache()
        
        if (page != None):
            reporter = None
            reportContent = ""
            #headers
            items = page.xpath(u"//*[@id='maincontent']//article/header/hgroup/*")
            for item in items:
                header = StrHelper.trim(item.text_content())
                if (header != None and header.startswith(profileName)):
                    header = StrHelper.trim(header[len(profileName):])
                reportContent += header + os.linesep
                break
            reportContent += os.linesep
            #content
            reg = re.compile(ur"^基金會編號.*$", re.MULTILINE)
            allsymbols = ur" ,、。.?!~$%@&#*‧;︰…‥﹐﹒˙·﹔﹕‘’“”〝〞‵′〃├─┼┴┬┤┌┐╞═╪╡│▕└┘╭╮╰╯╔╦╗╠═╬╣╓╥╖╒╤╕║╚╩╝╟╫╢╙╨╜╞╪╡╘╧╛﹣﹦≡|∣∥–︱—︳╴¯ ̄﹉﹊﹍﹎﹋﹌﹏︴﹨∕╲╱\/↑↓←→↖↗↙↘〔〕【】﹝﹞〈〉﹙﹚《》(){}﹛﹜『』「」<>≦≧﹤﹥︵︶︷︸︹︺︻︼︽︾︿﹀∩∪﹁﹂﹃﹄"
            regReporters = [  #re.compile(ur"[。:」\s]+(.{3,4})口述.?記者(.{3,4})(?:採訪整理)?$", re.MULTILINE),
                              re.compile(allsymbols + ur"[\s]+(.{2,4})[口筆]述\s?.?\s?記者(.{2,4})(?:採訪整理)?$", re.MULTILINE),
                              #[\u4e00-\u9fa5] 英文字符之外的字符,包括中文漢字和全角標點
                              re.compile(ur"報導.攝影.(.{2,4})記者$", re.MULTILINE),
                              re.compile(ur"報導.攝影.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"攝影.報導.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"攝影.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"報導.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"報導.(.{2,4})$", re.MULTILINE),
                              re.compile(ur"記者(.{2,4})採訪整理$", re.MULTILINE),
                              re.compile(ur"^【(.{2,4})╱.{2,4}報導】", re.MULTILINE), ]

            #preserve <br> tags as \n
            brs = page.xpath(u"//div[@class='articulum']//br")
            if (len(brs) == 0):
            	brs = page.xpath(u"//div[@class='articulum trans']//br")

            for br in brs:
                br.tail = "\n" + br.tail if br.tail else "\n"

            items = page.xpath(u"//div[@class='articulum']/*")
            if (len(items) == 0):
                items = page.xpath(u"//div[@class='articulum trans']/*")

            for item in items:
                tag = item.tag.lower()
                id = self.get_attrib(item, "id", None)
                # if (tag == "figure"): continue
                # if (tag == "iframe"): break
                if (id == "bcontent" or id == "bhead" or id == "introid"):
                    text = StrHelper.trim(item.text_content())
                    if (text == None or text == ""): continue
                    if (id != "bhead"):
                        for regReporter in regReporters:
                            list = regReporter.findall(text)
                            if (len(list) == 1):
                                if (not isinstance(list[0], basestring)):
                                    reporter = "/".join(list[0])
                                else:
                                    reporter = list[0]
                                text = StrHelper.trim(regReporter.sub('', text))
                                break
                        if (reporter):
                            overallEntry.reporter = reporter
                        else:
                            self.logger.warn("error: parsing reporter: %s" % reportUrl)

                    text = StrHelper.trim(reg.sub('', text))
                    reportContent += text + os.linesep + os.linesep
            FileHelper.saveToFile(os.path.join(dir, reportFileName), reportContent)

        status, page = self.downloader.download_page(detailUrl,
                                                     dir,
                                                     assetdir,
                                                     detailSrcFileName,
                                                     css=False,
                                                     javascript=False,
                                                     image=False)
        if (page != None):
            items = page.xpath(u"//div[@id='charitysidebox3'][1]/div[@id='inquiry3']/table//tr")
            maxDate = None
            if (len(items) > 0):
                file = None
                try:
                    file = open(os.path.join(dir, detailFileName), "wb")
                    csvwriter = csv.writer(file)
                    for index, item in enumerate(items):
                        if (index > 1):
                            cols = item.xpath(u".//td")
                            if (len(cols) == 4):
                                no = StrHelper.trim(cols[0].text)
                                name = StrHelper.trim(cols[1].text)
                                amount = StrHelper.trim(cols[2].text)
                                dateStr = StrHelper.trim(cols[3].text)
                                try:
                                    date = datetime.datetime.strptime(dateStr, "%Y/%m/%d")
                                    if (maxDate == None or date > maxDate):
                                        maxDate = date
                                except Exception as ex:
                                    self.logger.warn("error date format:%s in %s" % (dateStr, detailUrl))
                                csvwriter.writerow([no, dateStr, amount, name])
                    overallEntry.enddate = maxDate.strftime("%Y/%m/%d") if maxDate != None else ""
                    overallEntry.doners = len(items) - 2
                except Exception as ex:
                    self.logger.exception(LogHelper.getExceptionMsg(ex, "error paring detail.html"))
                finally:
                    if (file):
                        file.close()
예제 #12
0
    def parsePages(self):
        try:
            pageIndex = 1
            #urlNextPage = 'https://watsi.org/fund-treatments/page/129'
            urlNextPage = 'https://watsi.org/fund-treatments/'
            while True:
                self.logger.info('downloading page %d...' % pageIndex)
                currentPage = urlNextPage
                status, page = self.downloader.download_page(currentPage,
                                                             self.htmlDir,
                                                             self.assetDir,
                                                             'page%05d.html' % (pageIndex),
                                                             css=False,
                                                             javascript=False,
                                                             image=False)
                if (page == None):
                    self.logger.warn("error: downloading page %d" % (pageIndex))
                    break
                elif (status != 200):
                    self.logger.warn("http response: %s %s" % (status, currentPage))
                    break
                else:
                    self.logger.info('parsing page %d...' % pageIndex)
                    #find next page's url
                    nodes = page.xpath(u"//a[text()='Next ›']")
                    urlNextPage = urlparse.urljoin(currentPage, nodes[0].attrib['href']) if (len(nodes) > 0) else None

                    items = page.xpath(u"//div[@class='profiles']/ul/li")
                    if (items):
                        for item in items:
                            id = item.attrib["id"]
                            node = self.find_element_by_xpath(item, u".//div/a")
                            url = self.get_attrib(node, "href", None)
                            urlProfile = urlparse.urljoin(currentPage, url) if url else None
                            node = self.find_element_by_xpath(item,
                                                              u".//*[@class='info-bar']")  #info-bar 會在 <p> or <div> 中
                            title = node.text if node != None else ""
                            node = self.find_element_by_xpath(item, u".//p[@class='profile-description']")
                            description = node.text if node != None else ""
                            node = self.find_element_by_xpath(item, u".//div[@class='cont']/a/img")
                            imgSrc = self.get_attrib(node, "src", "")

                            #Progress
                            node = self.find_element_by_xpath(item, u".//div[@class='meter orange nostripes']/span")
                            progressStr = self.get_attrib(node, "style", "")
                            list = re.findall(ur"[;^]*?\s*?width:\s*([,\d]*)", progressStr)
                            progress = None
                            if(len(list)==1):
                                progress = list[0]

                            #togo raised donors
                            togo = None
                            raised = None
                            donors = None

                            if(title=="The Universal Fund"):
                                continue
                            else:
                                list = re.findall(ur"\$?([,\d]*)\s*(.*?)\s*\|\s*([,\d]*)\s*(.*)", title)
                                if (len(list) == 1 and len(list[0]) == 4):
                                    values = list[0]
                                    if (values[1] != None and values[1].lower() == "raised"):
                                        raised = values[0]
                                        donors = values[2]
                                    elif (values[1] != None and values[1].lower() == "to go"):
                                        togo = values[0]
                                        donors = values[2]
                                    else:
                                        self.logger.error("invalid reg pattern for %s in page %s" % (title, currentPage))
                                        continue
                                else:
                                    self.logger.error("invalid reg pattern for %s in page %s" % (title, currentPage))
                                    continue

                            if (raised != None):
                                raised = re.sub(ur"[,$]", "", raised)
                            if (togo != None):
                                togo = re.sub(ur"[,$]", "", togo)
                            if (donors != None):
                                donors = re.sub(ur"[,$]", "", donors)

                            #Log.i("%s %s" %(id, progress))
                            #Log.i("%s %s" %(id, urlProfile))
                            # Log.i("%s %s" %(id, title))
                            # Log.i("\t%s" % description)
                            # Log.i("\t%s" % imgSrc)
                            outputDir = os.path.join(os.path.join(self.profileDir, id[-1:]), id)
                            if (progress == '0' and not (os.path.isdir(outputDir))):
                                os.makedirs(outputDir)
                            if not os.path.exists(outputDir):
                                continue
                            if (self.getPrevProgressById(id) != '100'):
                                self.parseProfile(id, urlProfile, outputDir, donors)
                            else:
                                self.ensureProfileDownloaded(id, urlProfile, outputDir)
                            self.saveOverallEntry(id, [id, urlProfile])
                            self.cache_profile_details(id, progress, raised, togo, donors)
                        self.logger.info("%d items found" % (len(items)))
                    if (not items):
                        self.logger.info("items not found")
                        break
                    if (len(items) == 0):
                        self.logger.info("items length == 0")
                        break
                    if (not urlNextPage):
                        self.logger.info("NextPage not found")
                        break
                    pageIndex += 1
            self.savePatientInfos()
            self.logger.info('done!')
        except Exception as ex:
            self.logger.exception(LogHelper.getExceptionMsg(ex, "parsePages"))
        finally:
            pass