def download_page(self, url, outDir, assetDir, filename, css=False, javascript=False, image=False):
    page = None
    try:
        response = self.session.get(url)
    except requests.exceptions.ConnectionError:
        return -1, None
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "exception: download %s" % url))
        return -1, None
    if (response.status_code != 200):
        self.logger.warn("http response: %s %s" % (response.status_code, url))
    else:
        html = response.content.decode('utf-8', 'ignore')
        page = htmlparser.fromstring(html, base_url=url)
        if (css or javascript or image):
            if (css):
                self.download_related_files(page, u"//link", u"href", outDir, assetDir, baseurl=url)
            if (javascript):
                self.download_related_files(page, u"//script", u"src", outDir, assetDir, baseurl=url)
            if (image):
                self.download_related_files(page, u"//img", u"src", outDir, assetDir, baseurl=url)
            # re-serialize the tree so the rewritten asset links are kept
            newHtml = htmlparser.tostring(page)
            self.saveToFile(os.path.join(outDir, filename), newHtml)
        else:
            # encode back to bytes: saveToFile opens the file in binary mode
            self.saveToFile(os.path.join(outDir, filename), html.encode('utf-8'))
    return response.status_code, page
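# A minimal usage sketch (hypothetical, not from this codebase): `downloader`
# stands for an instance of the class defining download_page above, and the
# URL and paths are illustrative.
#
#   status, page = downloader.download_page(
#       'https://example.org/profile/abc', 'html', 'html/files',
#       'profile.html', css=True, javascript=True, image=False)
#   # status == -1 on a network error; page is a parsed lxml tree or None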
def parseProfile(self, name, urlProfile, outputDir, donors):
    result = False
    try:
        self.logger.info('downloading profile %s...' % name)
        status, page = self.downloader.download_page(urlProfile, outputDir, self.assetDir,
                                                     profileHtmlSrcFilename,
                                                     css=True, javascript=True, image=False)
        if (page == None):
            pass  # download_page already logged the failure
        elif (status != 200):
            self.logger.warn("http response: %s %s" % (status, urlProfile))
        else:
            detailedFound = False
            progress = ""
            raised = ""
            togo = ""
            nodeDetailes = self.find_element_by_xpath(page, u'//div[@id="funding_details"]')
            text = nodeDetailes.text_content()
            if (text != None and text != ""):
                text = text.lower()
                # e.g. "57% funded $1,234 raised" -> ('57', 'funded ', '1,234', 'raised')
                matches = re.findall(ur"([\d\.]+?)%\s*(.*?)\$([\d,]+)\s*(.*?)\s*?$", text)
                if (len(matches) == 1 and len(matches[0]) == 4):
                    values = matches[0]
                    if (values[1] != None and values[3] == "raised"):
                        progress = values[0]
                        raised = values[2]
                        detailedFound = True
                    elif (values[1] != None and values[3] == "to go"):
                        progress = values[0]
                        togo = values[2]
                        detailedFound = True
                if (not detailedFound):
                    self.logger.error("invalid reg pattern for %s in profile %s" % (text, urlProfile))
            if (not detailedFound):
                self.logger.error("profile %s details not found" % name)
            else:
                timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
                if (raised != None):
                    raised = re.sub(ur"[,$]", "", raised)
                if (togo != None):
                    togo = re.sub(ur"[,$]", "", togo)
                details = [timestamp, progress, raised, togo, donors]
                self.logger.info("profile %s details: %s" % (name, str(details)))
                self.save_profile_details(os.path.join(outputDir, 'data.csv'), details)
                result = True
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "Exception: %s: %s " % ("parseProfile", name)))
    return result
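# A quick illustration of the funding-details pattern used above; the sample
# string is hypothetical (the live text comes from the #funding_details div).
import re

text = "57% funded $1,234 raised"
matches = re.findall(r"([\d\.]+?)%\s*(.*?)\$([\d,]+)\s*(.*?)\s*?$", text)
# matches == [('57', 'funded ', '1,234', 'raised')]
#              progress  label   amount   trailing word: "raised" / "to go"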
def savePatientInfos(self):
    if (self.patientInfosFile):
        self.patientInfosFile.close()
        self.patientInfosFile = None
    try:
        # opening in 'w' mode and closing immediately truncates the cache file
        open(self.patientInfosFilename, 'w').close()
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "error: can't write cache file"))
def saveToFile(filename, data):
    file = None
    try:
        file = open(filename, mode='wb')
        file.write(data)
    except Exception as ex:
        logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % filename))
    finally:
        if (file != None):
            file.close()
def saveToFile(self, filename, data):
    file = None
    try:
        file = open(filename, mode='wb')
        file.write(data)
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % filename))
    finally:
        if (file):
            file.close()
def save_profile_details(self, filename, row):
    # appends one CSV row; 'ab' keeps the earlier snapshots of the same profile
    file = None
    try:
        file = open(filename, "ab")
        writer = csv.writer(file)
        writer.writerow(row)
        file.flush()
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % filename))
    finally:
        if (file != None):
            file.close()
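# For reference, parseProfile above appends rows of this shape (the values
# here are illustrative):
#
#   row = ['2014-01-01 12:00:00',  # timestamp
#          '57',                   # progress (%)
#          '1234',                 # raised, commas/$ stripped
#          '',                     # togo (empty when "raised" matched)
#          '20']                   # donors
#   self.save_profile_details(os.path.join(outputDir, 'data.csv'), row)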
def saveTextToFile(filename, data, encoding="utf-8"):
    file = None
    try:
        file = codecs.open(filename, mode='w', encoding=encoding)
        file.write(data)
    except Exception as ex:
        logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % filename))
    finally:
        if (file != None):
            file.close()
def saveTextToFile(self, filename, data):
    file = None
    try:
        file = open(filename, mode='w')
        file.write(data)
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % filename))
    finally:
        if (file):
            file.close()
def loadPatientInfos(self):
    if (os.path.isfile(self.patientInfosFilename)):
        file = None
        try:
            file = open(self.patientInfosFilename, "r")
            while True:
                line = file.readline()
                if not line:
                    break
                if line.strip() == "":
                    continue
                # each cache line is a python dict literal, parsed with literal_eval
                newEntry = ast.literal_eval(line)
                if (isinstance(newEntry, dict) and "id" in newEntry):
                    id = newEntry[self.cache_key_id]
                    if (id in self.dictPatientInfos):
                        entry = self.dictPatientInfos[id]
                    else:
                        entry = {}
                        self.dictPatientInfos[id] = entry
                    # merge: later lines override earlier values for the same id
                    for key in newEntry:
                        entry[key] = newEntry[key]
            self.logger.info("%d patient infos loaded" % len(self.dictPatientInfos))
        except Exception as ex:
            self.logger.exception(LogHelper.getExceptionMsg(ex, "Can't read cache file"))
        finally:
            if (file):
                file.close()
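# The cache file read above holds one dict literal per line. A hypothetical
# entry (only the 'id' key is required by the code; the other keys are
# illustrative):
import ast

line = "{'id': 'abc123', 'progress': '57', 'donors': '20'}\n"
entry = ast.literal_eval(line)
# entry['id'] == 'abc123'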
def parsePages(self):
    try:
        pageIndex = 1
        while True:
            urlNextPage = 'http://search.appledaily.com.tw/charity/projlist/Page/%d' % pageIndex
            self.logger.info('downloading page %d...' % pageIndex)
            currentPage = urlNextPage
            status, page = self.downloader.download_page(currentPage, self.htmlDir, self.assetDir,
                                                         'page%05d.html' % pageIndex,
                                                         css=False, javascript=False, image=False)
            if (page == None):
                self.logger.warn("error: downloading page %d" % pageIndex)
                break
            elif (status != 200):
                self.logger.warn("http response: %s %s" % (status, currentPage))
                break
            self.logger.info('parsing page %d...' % pageIndex)
            items = page.xpath(u"//tr[@class='odd']")
            row = 0
            if (items):
                for item in items:
                    nodes = item.xpath(u".//td")
                    if (len(nodes) == 6):
                        reportUrl = None
                        detailUrl = None
                        title = None
                        row += 1
                        id = nodes[0].text
                        if (id[0] == 'A'):
                            node = nodes[1].xpath(u".//a")
                            if len(node) > 0:
                                title = node[0].text
                                reportUrl = urlparse.urljoin(currentPage, self.get_attrib(node[0], "href", None))
                            else:
                                self.logger.warn("title not found")
                            date = nodes[2].text
                            # text_content() may be non-ASCII unicode; str() would raise here
                            caseStatus = nodes[3].text_content()
                            amount = nodes[4].text
                            node = nodes[5].xpath(u".//a")
                            if len(node) > 0:
                                detailUrl = urlparse.urljoin(currentPage, self.get_attrib(node[0], "href", None))
                            else:
                                self.logger.warn("detail not found")
                            if (title == None or reportUrl == None or detailUrl == None):
                                self.logger.warn("parse error!!!")
                            if (caseStatus == u"已結案"):  # closed case
                                dir = os.path.join(self.profileDir, id[-1:] + os.sep + id + os.sep)
                                dirRm = os.path.join(self.profileDir, u"未結案" + os.sep + id[-1:] + os.sep + id + os.sep)
                                # the profile moved out of the pending tree; drop the old copy
                                if (self.getIsProfileSaved(dirRm)):
                                    shutil.rmtree(dirRm, ignore_errors=True)
                                if (not self.getIsProfileSaved(dir)):
                                    overallEntry = OverallEntry()
                                    overallEntry.id = id
                                    overallEntry.title = StrHelper.trim(title)
                                    overallEntry.total = amount
                                    overallEntry.begindate = date
                                    overallEntry.reportUrl = reportUrl
                                    overallEntry.detailUrl = detailUrl
                                    self.logger.info("saving profile %s" % id)
                                    # FIXME: IOError: [Errno 2] No such file or directory: appledaily/profiles/\u672a\u7d50\u6848/
                                    dir = dir.replace(u"未結案" + os.sep, '')
                                    self.saveProfile(id, dir, reportUrl, detailUrl, overallEntry)
                                    self.saveOverallEntry(overallEntry.id,
                                                          [overallEntry.id, overallEntry.begindate, overallEntry.enddate,
                                                           overallEntry.total, overallEntry.doners, overallEntry.title,
                                                           overallEntry.reporter, overallEntry.reportUrl, overallEntry.detailUrl])
                                    self.saveUrls(dir, reportUrl, detailUrl)
                            elif (caseStatus == u"未結案"):  # pending case
                                dir = os.path.join(self.profileDir, u"未結案" + os.sep + id[-1:] + os.sep + id + os.sep)
                                overallEntry = OverallEntry()
                                overallEntry.id = id
                                overallEntry.title = StrHelper.trim(title)
                                overallEntry.total = amount
                                overallEntry.begindate = date
                                overallEntry.reportUrl = reportUrl
                                overallEntry.detailUrl = detailUrl
                                self.logger.info("saving profile %s" % id)
                                self.saveProfile(id, dir, reportUrl, detailUrl, overallEntry)
                                self.saveOverallEntryPending(overallEntry.id,
                                                             [overallEntry.id, overallEntry.begindate, overallEntry.enddate,
                                                              overallEntry.total, overallEntry.doners, overallEntry.title,
                                                              overallEntry.reporter, overallEntry.reportUrl, overallEntry.detailUrl])
                                self.saveUrls(dir, reportUrl, detailUrl)
                            else:
                                self.logger.warn("unknown status")
                self.logger.info("%d items found" % row)
                if (row == 0):
                    break
            else:
                self.logger.info("items not found")
                break
            pageIndex += 1
        self.logger.info('done!')
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "parsePages"))
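# For orientation, a minimal sketch of the listing-row markup this parser
# expects, reconstructed from the xpaths above (the hrefs and id are made up
# and the live markup may differ). Column order: id, title link, begin date,
# status, amount, detail link.
from lxml import html as htmlparser

doc = htmlparser.fromstring(u"""<table><tr class="odd">
  <td>A1234</td>
  <td><a href="/charity/proj/A1234">title</a></td>
  <td>2014/01/01</td>
  <td>已結案</td>
  <td>123,456</td>
  <td><a href="/charity/projdetail/A1234">detail</a></td>
</tr></table>""")
assert len(doc.xpath(u"//tr[@class='odd']")[0].xpath(u".//td")) == 6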
def saveProfile(self, profileName, dir, reportUrl, detailUrl, overallEntry):
    assetdir = os.path.join(dir, "files" + os.sep)
    if (not os.path.isdir(dir)):
        os.makedirs(dir)
    if (not os.path.isdir(assetdir)):
        os.makedirs(assetdir)
    status, page = self.downloader.download_page(reportUrl, dir, assetdir, '%s_origin.htm' % profileName,
                                                 css=False, javascript=False, image=False)
    if (page != None):
        reporter = None
        reportContent = ""
        # headers
        items = page.xpath(u"//*[@id='maincontent']//article/header/hgroup/*")
        for item in items:
            header = StrHelper.trim(item.text_content())
            if (header != None and header.startswith(profileName)):
                header = StrHelper.trim(header[len(profileName):])
                reportContent += header + os.linesep
                break
        reportContent += os.linesep
        # content: strip the "基金會編號…" (foundation account number) footer lines
        reg = re.compile(ur"^基金會編號.*$", re.MULTILINE)
        # full-width punctuation that may precede a byline; used as a character
        # class below (the flattened original concatenated it directly into the
        # pattern, which could never match)
        allsymbols = ur" ,、。.?!~$%@&#*‧;︰…‥﹐﹒˙·﹔﹕‘’“”〝〞‵′〃├─┼┴┬┤┌┐╞═╪╡│▕└┘╭╮╰╯╔╦╗╠═╬╣╓╥╖╒╤╕║╚╩╝╟╫╢╙╨╜╞╪╡╘╧╛﹣﹦≡|∣∥–︱—︳╴¯ ̄﹉﹊﹍﹎﹋﹌﹏︴﹨∕╲╱\/↑↓←→↖↗↙↘〔〕【】﹝﹞〈〉﹙﹚《》(){}﹛﹜『』「」<>≦≧﹤﹥︵︶︷︸︹︺︻︼︽︾︿﹀∩∪﹁﹂﹃﹄"
        # byline patterns, tried in order; the first pattern captures both the
        # narrator and the reporter
        regReporters = [
            re.compile(ur"[" + allsymbols + ur"\s]+(.{2,4})[口筆]述\s?.?\s?記者(.{2,4})(?:採訪整理)?$", re.MULTILINE),
            re.compile(ur"報導.攝影.(.{2,4})記者$", re.MULTILINE),
            re.compile(ur"報導.攝影.(.{2,4})$", re.MULTILINE),
            re.compile(ur"攝影.報導.(.{2,4})$", re.MULTILINE),
            re.compile(ur"攝影.(.{2,4})$", re.MULTILINE),
            re.compile(ur"報導.(.{2,4})$", re.MULTILINE),
            re.compile(ur"記者(.{2,4})採訪整理$", re.MULTILINE),
            re.compile(ur"^【(.{2,4})╱.{2,4}報導】", re.MULTILINE),
        ]
        # preserve <br> tags as \n
        brs = page.xpath(u"//div[@class='articulum']//br")
        if (len(brs) == 0):
            brs = page.xpath(u"//div[@class='articulum trans']//br")
        for br in brs:
            br.tail = ("\n" + br.tail) if br.tail else "\n"
        items = page.xpath(u"//div[@class='articulum']/*")
        if (len(items) == 0):
            items = page.xpath(u"//div[@class='articulum trans']/*")
        for item in items:
            id = self.get_attrib(item, "id", None)
            if (id == "bcontent" or id == "bhead" or id == "introid"):
                text = StrHelper.trim(item.text_content())
                if (text == None or text == ""):
                    continue
                if (id != "bhead"):
                    for regReporter in regReporters:
                        found = regReporter.findall(text)
                        if (len(found) == 1):
                            if (not isinstance(found[0], basestring)):
                                # pattern with two groups: narrator + reporter
                                reporter = "/".join(found[0])
                            else:
                                reporter = found[0]
                            text = StrHelper.trim(regReporter.sub('', text))
                            break
                    if (reporter):
                        overallEntry.reporter = reporter
                    else:
                        self.logger.warn("error: parsing reporter: %s" % reportUrl)
                    text = StrHelper.trim(reg.sub('', text))
                reportContent += text + os.linesep + os.linesep
        # encode: FileHelper.saveToFile writes in binary mode
        FileHelper.saveToFile(os.path.join(dir, reportFileName), reportContent.encode('utf-8'))
    status, page = self.downloader.download_page(detailUrl, dir, assetdir, detailSrcFileName,
                                                 css=False, javascript=False, image=False)
    if (page != None):
        items = page.xpath(u"//div[@id='charitysidebox3'][1]/div[@id='inquiry3']/table//tr")
        maxDate = None
        if (len(items) > 0):
            file = None
            try:
                file = open(os.path.join(dir, detailFileName), "wb")
                csvwriter = csv.writer(file)
                for index, item in enumerate(items):
                    if (index > 1):  # the first two rows are headers
                        cols = item.xpath(u".//td")
                        if (len(cols) == 4):
                            no = StrHelper.trim(cols[0].text)
                            name = StrHelper.trim(cols[1].text)
                            amount = StrHelper.trim(cols[2].text)
                            dateStr = StrHelper.trim(cols[3].text)
                            try:
                                date = datetime.datetime.strptime(dateStr, "%Y/%m/%d")
                                if (maxDate == None or date > maxDate):
                                    maxDate = date
                            except Exception:
                                self.logger.warn("error date format:%s in %s" % (dateStr, detailUrl))
                            # py2 csv cannot take unicode; encode the donor name
                            if isinstance(name, unicode):
                                name = name.encode('utf-8')
                            csvwriter.writerow([no, dateStr, amount, name])
                overallEntry.enddate = maxDate.strftime("%Y/%m/%d") if maxDate != None else ""
                overallEntry.doners = len(items) - 2  # minus the two header rows
            except Exception as ex:
                self.logger.exception(LogHelper.getExceptionMsg(ex, "error parsing detail.html"))
            finally:
                if (file):
                    file.close()
def parsePages(self):
    try:
        pageIndex = 1
        urlNextPage = 'https://watsi.org/fund-treatments/'
        while True:
            self.logger.info('downloading page %d...' % pageIndex)
            currentPage = urlNextPage
            status, page = self.downloader.download_page(currentPage, self.htmlDir, self.assetDir,
                                                         'page%05d.html' % pageIndex,
                                                         css=False, javascript=False, image=False)
            if (page == None):
                self.logger.warn("error: downloading page %d" % pageIndex)
                break
            elif (status != 200):
                self.logger.warn("http response: %s %s" % (status, currentPage))
                break
            self.logger.info('parsing page %d...' % pageIndex)
            # find next page's url
            nodes = page.xpath(u"//a[text()='Next ›']")
            urlNextPage = urlparse.urljoin(currentPage, nodes[0].attrib['href']) if (len(nodes) > 0) else None
            items = page.xpath(u"//div[@class='profiles']/ul/li")
            if (items):
                for item in items:
                    id = item.attrib["id"]
                    node = self.find_element_by_xpath(item, u".//div/a")
                    url = self.get_attrib(node, "href", None)
                    urlProfile = urlparse.urljoin(currentPage, url) if url else None
                    # the info-bar may appear in a <p> or a <div>
                    node = self.find_element_by_xpath(item, u".//*[@class='info-bar']")
                    title = node.text if node != None else ""
                    node = self.find_element_by_xpath(item, u".//p[@class='profile-description']")
                    description = node.text if node != None else ""
                    node = self.find_element_by_xpath(item, u".//div[@class='cont']/a/img")
                    imgSrc = self.get_attrib(node, "src", "")
                    # progress: parsed from the meter's inline "width: NN%" style
                    node = self.find_element_by_xpath(item, u".//div[@class='meter orange nostripes']/span")
                    progressStr = self.get_attrib(node, "style", "")
                    found = re.findall(ur"[;^]*?\s*?width:\s*([,\d]*)", progressStr)
                    progress = found[0] if (len(found) == 1) else None
                    # togo / raised / donors from the info bar, e.g. "$1,234 raised | 56 donors"
                    togo = None
                    raised = None
                    donors = None
                    if (title == "The Universal Fund"):
                        continue
                    found = re.findall(ur"\$?([,\d]*)\s*(.*?)\s*\|\s*([,\d]*)\s*(.*)", title)
                    if (len(found) == 1 and len(found[0]) == 4):
                        values = found[0]
                        if (values[1] != None and values[1].lower() == "raised"):
                            raised = values[0]
                            donors = values[2]
                        elif (values[1] != None and values[1].lower() == "to go"):
                            togo = values[0]
                            donors = values[2]
                        else:
                            self.logger.error("invalid reg pattern for %s in page %s" % (title, currentPage))
                            continue
                    else:
                        self.logger.error("invalid reg pattern for %s in page %s" % (title, currentPage))
                        continue
                    if (raised != None):
                        raised = re.sub(ur"[,$]", "", raised)
                    if (togo != None):
                        togo = re.sub(ur"[,$]", "", togo)
                    if (donors != None):
                        donors = re.sub(ur"[,$]", "", donors)
                    outputDir = os.path.join(os.path.join(self.profileDir, id[-1:]), id)
                    # only start tracking a profile while it is still at 0% progress;
                    # profiles never tracked are skipped
                    if (progress == '0' and not os.path.isdir(outputDir)):
                        os.makedirs(outputDir)
                    if not os.path.exists(outputDir):
                        continue
                    if (self.getPrevProgressById(id) != '100'):
                        self.parseProfile(id, urlProfile, outputDir, donors)
                    else:
                        self.ensureProfileDownloaded(id, urlProfile, outputDir)
                    self.saveOverallEntry(id, [id, urlProfile])
                    self.cache_profile_details(id, progress, raised, togo, donors)
                self.logger.info("%d items found" % len(items))
            else:
                self.logger.info("items not found")
                break
            if (not urlNextPage):
                self.logger.info("NextPage not found")
                break
            pageIndex += 1
        self.savePatientInfos()
        self.logger.info('done!')
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "parsePages"))
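# The info-bar pattern used above, shown on two hypothetical sample strings;
# the second group decides whether the amount is "raised" or "to go".
import re

pattern = r"\$?([,\d]*)\s*(.*?)\s*\|\s*([,\d]*)\s*(.*)"
re.findall(pattern, "$1,234 raised | 56 donors")
# -> [('1,234', 'raised', '56', 'donors')]
re.findall(pattern, "$789 to go | 12 donors")
# -> [('789', 'to go', '12', 'donors')]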