class ImageSpider:
    """Crawler that records picture-set image URLs from www.yixiuba.com.

    The flow is: index page -> picture-set links -> paginated set pages ->
    image URLs, which are handed to ``DBHelper``.  Fetching is delegated to
    ``HttpHelper.getHtml`` and HTML parsing to ``lxml.html.soupparser``.
    """

    def __init__(self):
        # Category slug -> index-page URL template; ``%s`` is the page number.
        # The 'meituisiwatupian' category
        # ('http://www.yixiuba.com/meituisiwatupian/list_2_%s.html') is
        # currently disabled.
        self.siteURL = {'meinvtupianjingpin': 'http://www.yixiuba.com/meinvtupianjingpin/list_5_%s.html'}
        self.siteUrlSiwa = 'http://www.4493.com/siwameitui/index-%s.htm'
        self.tool = tool.Tool()
        self.HttpHelper = HttpHelper()
        self.DBHelper = DBHelper()

    @staticmethod
    def _parse_html(html):
        """Parse ``html`` into an lxml tree (lxml is imported lazily)."""
        import lxml.html.soupparser as soupparser
        return soupparser.fromstring(html)

    def getContents(self, pageindex):
        """Collect the picture sets linked from one index page.

        Args:
            pageindex: Index page number; page 1 uses the bare category URL
                instead of the numbered template.

        Returns:
            A list of ``{'Name': title, 'Url': href}`` dicts, one per link
            that carries both attributes.  Empty when nothing was fetched.
        """
        contents = []
        for category, template in self.siteURL.items():
            if pageindex == 1:
                url = 'http://www.yixiuba.com/%s/' % str(category)
            else:
                url = template % str(pageindex)
            contenthtml = self.HttpHelper.getHtml(url)
            if contenthtml is None:
                continue
            dom = self._parse_html(contenthtml)
            for link in dom.xpath("//div[@class='page1']/ul/li/a"):
                hrefs = link.xpath("@href")
                titles = link.xpath("@title")
                # Skip links missing either attribute instead of raising
                # IndexError on the empty attribute list.
                if hrefs and titles:
                    contents.append({'Name': titles[0], 'Url': hrefs[0]})
        return contents

    def getAllnum(self, pagehtml):
        """Extract the number shown in the first pagination link.

        The link text has its first character and last three characters
        removed and the remainder is parsed as an integer (presumably a
        "total pages" label -- confirm against the site markup).

        Returns:
            The parsed integer, or 0 when the pagination link is missing,
            empty, or not numeric.
        """
        dom = self._parse_html(pagehtml)
        nodes = dom.xpath("//div[@class='dede_pages']/ul/li/a")
        if not nodes or not nodes[0].text:
            return 0
        text = nodes[0].text
        number = text[1:len(text) - 3]
        try:
            return int(number)
        except ValueError:
            print(number)
            return 0

    def getImage(self, pagehtml):
        """Return the image ``src`` URLs found on a set page.

        Returns:
            A non-empty list of URLs, or ``None`` when no image was found.
        """
        dom = self._parse_html(pagehtml)
        images = []
        for node in dom.xpath("//div[@class='center']/div[@class='page-list']/p/img"):
            sources = node.xpath("@src")
            # An <img> without a src attribute is ignored.
            if sources:
                images.append(sources[0])
        return images or None

    def getPageImages(self, index):
        """Process every picture set listed on index page ``index``.

        For each set the detail page is fetched, the page count is read with
        ``getAllnum`` and one ``ProductImage`` worker thread is run per set
        page.  All workers of a set are started first and then all of them
        are joined, so the method returns only after every worker finished.
        """
        print(u"正在收集第 %s 页的MM信息" % index)
        contents = self.getContents(index)
        print(u"收集第 %s 页的MM信息完成" % index)
        if contents is not None:
            print(u"开始循环下载 %s 页的MM信息" % index)
            for item in contents:
                name = item['Name']
                detail_url = item['Url']
                if "http" not in detail_url:
                    detail_url = 'http://www.yixiuba.com' + detail_url
                detailhtml = self.HttpHelper.getHtml(detail_url)
                if detailhtml is None:
                    continue
                print(u"分析套图数量")
                allnum = self.getAllnum(detailhtml)
                print(u"套图数量 %s" % allnum)
                # Strip the trailing ".html" so page suffixes can be appended.
                baseurl = detail_url[0:len(detail_url) - 5]
                threads = []
                # NOTE(review): range(1, allnum) never reaches page `allnum`;
                # if getAllnum returns the total page count the last page is
                # skipped -- confirm before changing the bound.
                for i in range(1, allnum):
                    if i > 1:
                        url = baseurl + '_' + str(i) + ".html"
                    else:
                        # Page 1 is the detail page itself; the stripped
                        # base URL (without ".html") is not a page address.
                        url = detail_url
                    print(u"加入生产队列 %s" % url)
                    threads.append(
                        threading.Thread(target=self.ProductImage, args=(url, name, i)))
                for worker in threads:
                    worker.daemon = True
                    worker.start()
                # Join every worker, not just the last one created; this is
                # also safe when no worker was created at all.
                for worker in threads:
                    worker.join()
        print(u"循环下载 %s 页的MM信息完成" % index)
        print("")

    def downloadImage(self, url, name, index):
        """Fetch set page ``url`` and save its images under directory ``name``.

        Files are named ``<name>/beautiful<index><last 4 chars of the URL>``
        and existing files are not downloaded again.  Failures are reported
        on stdout and never propagate.
        """
        try:
            pagehtml = self.HttpHelper.getHtml(url)
            if pagehtml is None:
                return
            imagesurls = self.getImage(pagehtml)
            if imagesurls is None:
                return
            for imageurl in imagesurls:
                # NOTE(review): the name depends on `index` only, so a page
                # with several images maps them all to one file and only the
                # first is saved -- confirm whether that is intended.
                filename = name + "/beautiful" + str(index) + imageurl[-4:]
                if not os.path.exists(filename):
                    self.saveImg(imageurl, filename)
        except Exception as exc:
            print("保存图片失败: %r" % (exc,))

    def saveImg(self, imageURL, fileName):
        """Download ``imageURL`` and write the bytes to ``fileName``.

        Best effort: any failure is printed and swallowed.  The destination
        file is only opened after the download succeeded, and both the URL
        handle and the file are always closed.
        """
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        try:
            response = urlopen(imageURL)
            try:
                data = response.read()
            finally:
                response.close()
            with open(fileName, 'wb') as handle:
                handle.write(data)
            print(fileName)
        except Exception as exc:
            print("Unexpected error: %r" % (exc,))

    def mkdir(self, path):
        """Create directory ``path`` (surrounding whitespace is stripped).

        Returns:
            ``True`` if the directory was created, ``False`` if the path
            already existed (a blank line is printed in that case).
        """
        path = path.strip()
        if os.path.exists(path):
            print("")
            return False
        os.makedirs(path)
        return True

    def savePagesInfos(self, start, end):
        """Process index pages ``start``..``end`` inclusive, pausing 1 s after each."""
        for page in range(start, end + 1):
            self.getPageImages(page)
            time.sleep(1)

    def ProductImage(self, url, name, index):
        """Look up every image of set page ``url`` in the database.

        For each image URL, ``DBHelper.GetImageUrlInfo`` is queried and
        ``DBHelper.InsertImageUrlInfo`` is called when the lookup returned a
        value.  Queueing the image for download is currently disabled.
        Failures are printed and never propagate; ``index`` is unused.
        """
        # Track what is being processed so the error message never refers to
        # a variable that has not been assigned yet.
        current = url
        try:
            pagehtml = self.HttpHelper.getHtml(url)
            if pagehtml is None:
                return
            imagesurls = self.getImage(pagehtml)
            if not imagesurls:
                return
            for imageurl in imagesurls:
                current = imageurl
                ImageInfo = self.DBHelper.GetImageUrlInfo(name, imageurl)
                # NOTE(review): inserting when the lookup *found* something
                # looks inverted -- confirm GetImageUrlInfo's return contract.
                if ImageInfo is not None:
                    self.DBHelper.InsertImageUrlInfo(name, imageurl)
        except Exception:
            print(u"图片---%s加入下载队列失败:%s" % (current, str(sys.exc_info())))
            print("")
class GetLianjiaData:
    """Crawler that stores new-development listings from cd.fang.lianjia.com.

    Listing pages are fetched through ``HttpHelper.getHtml``, parsed with
    ``lxml.html.soupparser`` and persisted with
    ``DBHelper.InserHouseDatainfo``.
    """

    def __init__(self):
        # Listing-page URL template; ``%s`` is the page number.
        self.siteUrl = 'http://cd.fang.lianjia.com/loupan/pg%s/'
        self.tool = tool.Tool()
        self.HttpHelper = HttpHelper()
        self.DBHelper = DBHelper()

    @staticmethod
    def _joined_text(nodes):
        """Concatenate the text of ``nodes``, each followed by ``'|'``."""
        # A node without text contributes an empty field rather than raising.
        return ''.join((node.text or '') + '|' for node in nodes)

    @staticmethod
    def _parse_price(raw):
        """Return ``raw`` as an int, or 0 when it is missing or not numeric."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            return 0

    def _parse_panel(self, panel):
        """Build the record dict for one listing panel, or ``None`` if untitled."""
        titles = panel.xpath("./div[@class='col-1']/h2/a")
        if not titles:
            return None
        # Text of the heading link (labelled "developer" in the original code).
        HouseDevelopers = titles[0].text
        # Location text.
        houseWhere = panel.xpath("./div[@class='col-1']/div[@class='where']/span")[0].text
        # Leading text of the area block (labelled "layout" in the original code).
        housnumber = panel.xpath("./div[@class='col-1']/div[@class='area']")[0].text
        # Floor-area text.
        measureArea = panel.xpath("./div[@class='col-1']/div[@class='area']/span")[0].text
        otherTags = self._joined_text(
            panel.xpath("./div[@class='col-1']/div[@class='other']/span"))
        housetypes = self._joined_text(
            panel.xpath("./div[@class='col-1']/div[@class='type']/span"))
        # The price stays 0 when the panel has no price block.
        housePeice = 0
        if panel.xpath("./div[@class='col-2']/div[@class='price']"):
            housePeice = panel.xpath("./div[@class='col-2']/div[@class='price']/div/span")[0].text
        return {'HouseDevelopers': HouseDevelopers,
                'houseWhere': houseWhere,
                'housnumber': housnumber,
                'measureArea': measureArea,
                'otherTags': otherTags,
                'housetypes': housetypes,
                'housePeice': housePeice}

    def getContents(self, pageindex):
        """Parse one listing page into a list of record dicts.

        Args:
            pageindex: Page number; page 1 uses the bare listing URL.

        Returns:
            A list of dicts with keys ``HouseDevelopers``, ``houseWhere``,
            ``housnumber``, ``measureArea``, ``otherTags``, ``housetypes``
            and ``housePeice``.  A panel that fails to parse is reported on
            stdout and skipped; the list is empty when the page could not be
            fetched.
        """
        contents = []
        if pageindex == 1:
            url = 'http://cd.fang.lianjia.com/loupan/'
        else:
            url = self.siteUrl % str(pageindex)
        contenthtml = self.HttpHelper.getHtml(url)
        if contenthtml is None:
            return contents
        import lxml.html.soupparser as soupparser
        dom = soupparser.fromstring(contenthtml)
        for panel in dom.xpath("//*[@id='house-lst']/li/div[@class='info-panel']"):
            try:
                record = self._parse_panel(panel)
            except Exception as exc:
                # Best effort: one malformed panel must not abort the page.
                print("Unexpected error: %r" % (exc,))
                continue
            if record is not None:
                contents.append(record)
        return contents

    def saveHouseInfo(self, pageIndex):
        """Fetch listing page ``pageIndex`` and store every record.

        Each record is passed to ``DBHelper.InserHouseDatainfo`` as
        (developers, where, area, other tags, types, price).  A price that
        is missing or not numeric is stored as 0 instead of aborting.
        """
        contents = self.getContents(pageIndex)
        if contents is None:
            return
        print(u"开始循环第 %s 页的楼盘信息" % pageIndex)
        for item in contents:
            self.DBHelper.InserHouseDatainfo(
                item['HouseDevelopers'],
                item['houseWhere'],
                item['measureArea'],
                item['otherTags'],
                item['housetypes'],
                self._parse_price(item['housePeice']))

    def StartSpider(self, start, end):
        """Store listing pages ``start``..``end`` inclusive, pausing 1 s after each."""
        for page in range(start, end + 1):
            print(u"正在收集第 %s 页的楼盘信息" % page)
            self.saveHouseInfo(page)
            time.sleep(1)