def tujiParse2(self, info, urlSign, page, url, domain):
    encoding, picUrl, title, text, publishTime, mypos, pageNum = tupian.parse(
        url, page, info)
    images = [picUrl]
    if info.has_key('detailFenyePattern'):
        if pageNum > 1:
            for i in range(2, pageNum + 1):
                urlNew = url.replace(info['detailFenyePattern'][0],
                                     info['detailFenyePattern'][1] % i)
                page = getPage(urlNew)
                if not page:
                    retStatus = self.changeStatus(urlSign, 1)
                    print 'down url:%s failed' % urlNew
                    continue
                # parse each follow-up page against its own URL so that
                # relative links resolve correctly
                fenyeEncoding, fenyePicUrl, fenyeTitle, fenyeText, \
                    fenyePublishTime, fenyeMypos, fenyePageNum = tupian.parse(
                        urlNew, page, info)
                text = text + fenyeText
                images.append(fenyePicUrl)
    if not publishTime:
        publishTime = '0000-00-00 00:00:00'
    text = text.replace('\r', '').replace('\n', ' ').replace('\t', '')
    if len(images) == 0:
        retStatus = self.changeStatus(urlSign, 1)
        print 'url:%s has no pic' % url
        return False
    picInfo = []
    for i in range(len(images)):
        picInfo.append({'picUrl': images[i], 'picDesc': '', 'pic_seq': i})
    imgJson = json.dumps(picInfo, ensure_ascii=False)
    # escape every interpolated value; a single quote in any field would
    # otherwise break the statement
    sql = "update tbl_content_2 set title = '%s', publishTime = '%s', status = 2, text = '%s', mypos = '%s', images = '%s' where urlSign = '%s'" \
        % (MySQLdb.escape_string(title.encode('utf-8')),
           publishTime.encode('utf-8'),
           MySQLdb.escape_string(text.encode('utf-8')),
           MySQLdb.escape_string(mypos.encode('utf-8')),
           MySQLdb.escape_string(imgJson.encode('utf-8')),
           urlSign.encode('utf-8'))
    print sql
    doDB(sql, 'chuilei')
    return True
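def updateContentParam(self, conn, title, publishTime, text, mypos, imgJson, urlSign):
    # Hypothetical alternative to the string-built UPDATE above (a sketch,
    # not called anywhere): parameter binding lets the driver handle quoting,
    # so the manual MySQLdb.escape_string calls become unnecessary. Assumes a
    # MySQLdb connection object is available; the doDB helper used above may
    # not expose one.
    cur = conn.cursor()
    cur.execute(
        "update tbl_content_2 set title = %s, publishTime = %s, status = 2, "
        "text = %s, mypos = %s, images = %s where urlSign = %s",
        (title, publishTime, text, mypos, imgJson, urlSign))
    conn.commit()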
def run(self, domain, info):
    sql = "select urlSign, url, domain, isAlbum from tbl_content_2 where domain = '%s' " \
          "and status = 0 and category = '%s';" % (domain, info['category'])
    res = getDB(sql, 'chuilei')
    for item in res:
        urlSign, url, domain, isAlbum = item[0], item[1], item[2], item[3]
        print url
        page = getPage(url)
        #page = page.decode('utf-8','ignore')
        if not page:
            retStatus = self.changeStatus(urlSign, 1)
            print 'down url:%s failed' % url
            continue
        # dispatch on isAlbum: 0 -> tupianParse (single picture),
        # 1/2 -> the two tujiParse (album) variants
        if isAlbum == 0:
            ret = self.tupianParse(info, urlSign, page, url, domain)
        elif isAlbum == 1:
            ret = self.tujiParse(info, urlSign, page, url, domain)
        elif isAlbum == 2:
            ret = self.tujiParse2(info, urlSign, page, url, domain)
        else:
            continue
        if not ret:
            retStatus = self.changeStatus(urlSign, 1)
            print 'parse url:%s failed' % url
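# Hypothetical top-level driver for run() above (a sketch; the Spider class
# name is an assumption, and webInfo is assumed to map domain -> site config
# as in original_url_sucai):
#
#   from original_url_sucai import webInfo
#   spider = Spider()
#   for domain, info in webInfo.items():
#       spider.run(domain, info)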
def getDetailUrl(self, fenyeUrl, info):
    page = getPage(fenyeUrl)
    if not page:
        # getPage returns a falsy value on download failure
        print 'down url:%s failed' % fenyeUrl
        return False
    detailUrlList = self.parse(fenyeUrl, page, info['urlPattern'])
    if not detailUrlList:
        print 'url:%s can not get detail page url' % fenyeUrl
        return False
    self.process(fenyeUrl, detailUrlList, info)
    return True
def getSourceFy(self, sourceUrl, info):
    # build the list of paginated list-page URLs for a source page
    fyList = []
    page = getPage(sourceUrl)
    if not page:
        return fyList
    pageNum = pe.pageNumExtract(page, info['domain'])
    fyList.append(sourceUrl)
    if pageNum > 1:
        for i in range(2, pageNum + 1):
            if info['sourceFenyePattern'][0] == "":
                # empty search pattern: append the page suffix to the URL
                urlNew = sourceUrl + info['sourceFenyePattern'][1] % i
            else:
                urlNew = sourceUrl.replace(
                    info['sourceFenyePattern'][0],
                    info['sourceFenyePattern'][1] % i)
            fyList.append(urlNew)
    return fyList
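def crawlSource(self, sourceUrl, info):
    # Hypothetical composition of the two helpers above (a sketch, not part
    # of the original flow): expand the source page into its paginated URLs,
    # then harvest detail-page links from each one.
    for fenyeUrl in self.getSourceFy(sourceUrl, info):
        if not self.getDetailUrl(fenyeUrl, info):
            print 'fenye url:%s produced no detail urls' % fenyeUrl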
#encoding: utf-8
import sys
import content_extract as ce
sys.path.append("../../lib")
import download

# usage: python this_script.py <article-url>
if __name__ == "__main__":
    html = download.getPage(sys.argv[1])
    enc, time, title, text = ce.parse(sys.argv[1], html)
    print "标题:" + title.encode('utf-8', 'ignore')   # title
    print "时间:" + time.encode('utf-8', 'ignore')    # publish time
    print '=' * 10
    print "内容:" + text.encode('utf-8', 'ignore')    # body text
    time = strtotime(time, '')
    if webInfo.has_key('imgReplace'):
        # apply every configured (search, replace) pair to each image URL
        imgList = []
        patternList = webInfo['imgReplace']
        for picUrl in images:
            for pattern in patternList:
                picUrl = picUrl.replace(pattern[0], pattern[1])
            imgList.append(picUrl)
    else:
        imgList = images
    #print time.encode('utf-8')
    #print text.encode('utf-8')
    return encoding, title, text, time, imgList, mypos


if __name__ == "__main__":
    from original_url_sucai import webInfo
    #html = open('page.html').read()
    #enc, title, text, time, images, mypos = parse('http://www.guandongphoto.com/thread-1035924-1-11.html', html, webInfo['guandongphoto.com'])
    url = sys.argv[1]
    domain = sys.argv[2]
    html = getPage(url)
    enc, title, text, time, images, mypos = parse(url, html, webInfo[domain])
    print "标题:" + title.encode('utf-8', 'ignore')   # title
    print "mypos:" + mypos.encode('utf-8', 'ignore')
    print "时间:" + time.encode('utf-8', 'ignore')    # publish time
    print "内容:\n" + text.encode('utf-8', 'ignore')  # body text
    print '=' * 10
    print '图片:'                                     # images
    print images
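# Illustrative webInfo entry for the imgReplace hook used above (the domain
# and patterns here are made up, not taken from original_url_sucai):
#
#   webInfo['example.com'] = {
#       'imgReplace': [
#           ('/thumb/', '/big/'),     # swap thumbnail path for full-size
#           ('_small.jpg', '.jpg'),   # drop the small-image suffix
#       ],
#   }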
import sys
import re
import datetime
sys.path.append("../lib")
from download import getPage
from clientSource import *

webInfo = {
    'category': '美图',            # category: beauty photos
    'sourceName': '中关村在线',    # source: Zhongguancun Online (ZOL)
    'domain': 'zol.com',
    'sourceUrl': 'http://sj.zol.com.cn/bizhi/',
    'urlPattern': r'http.*/bizhi/.*\.html',
}

client = Client()
for i in range(2, 430):
    url = "http://www.5857.com/list-11-0-0-0-0-0-%d.html" % i
    print url
    page = getPage(url)
    if not page:
        print 'down url:%s failed' % url
        continue
    client.parse(page, webInfo['sourceUrl'], webInfo['category'],
                 webInfo['sourceName'], webInfo['domain'],
                 webInfo['urlPattern'])

#page = open('./page').read()
#client = Client()
#client.parse(page, webInfo['sourceUrl'], webInfo['category'], webInfo['sourceName'], webInfo['domain'], webInfo['urlPattern'])