def parse_datas(self, soup): totalfind = soup.select("h2.total.fl > span") if 0 == ToolsBox.strToInt(totalfind[0].get_text()): return '0' page_datas = [] communitys = soup.select("div.info>div.title>a") regions = soup.select('a.district') blocks = soup.select('a.bizcircle') prices = soup.select('.totalPrice>span') forsales = soup.select('.totalSellCount>span') buildyears = soup.select('.positionInfo') for community, region, block, price, forsale, buildyear in zip( communitys, regions, blocks, prices, forsales, buildyears): each_data = dict() each_data['community_name'] = community.get_text() each_data['community_url'] = community.get('href') each_data['region'] = region.get_text() each_data['block'] = block.get_text() each_data['builded_year'] = ToolsBox.strToInt(buildyear.get_text()) each_data['forsale_num'] = ToolsBox.strToInt(forsale.get_text()) each_data['price'] = ToolsBox.strToInt(price.get_text()) # each_data['date'] each_data['from'] = "LJ" if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) # ToolsBox.printDic(page_datas) return page_datas
def parse_datas(self,soup): page_datas = [] titles = soup.select("h1 > a") infos = soup.select("p.house_info") hots = soup.select("p.house_hot") areas = soup.select("div.the_area span") prices = soup.select("div.the_price span") splitby = re.compile(r']|,|\s') for title, info, hot, area, price in zip(titles, infos, hots, areas, prices): each_data = {'advantage': '', 'builded_year': 0, 'spatial_arrangement': '', 'floor_index': 0, 'total_floor': 0} each_data['title'] = title.get_text() each_data['details_url'] = 'http://xm.maitian.cn' + title.get('href') try: each_data['total_price'] = ToolsBox.strToInt(price.get_text()) except Exception as e: with open('logtest.txt', 'a+') as fout: fout.write('*************' + str(datetime.datetime.now()) + '*************\n') fout.write('麦田解析total_price出错,待解析的数据:' + price.get_text()) traceback.print_exc(file=fout) print(traceback.format_exc()) try: each_data['block'] = info.get_text().strip() each_data['community_name'] = splitby.split(each_data['block'])[-1].strip() each_data['block'] = each_data['block'].replace(each_data['community_name'],'') except Exception as e: with open('logtest.txt', 'a+') as fout: fout.write('*************' + str(datetime.datetime.now()) + '*************\n') fout.write('Parse Failt of :%s \n' % info.get_text()) traceback.print_exc(file=fout) print(traceback.format_exc()) # try: # 麦田的格式,这里是户型、优势和楼层 temp = ToolsBox.clearStr(hot.text).split('|') for item in temp: d1 = self.parse_item(item) each_data = self.add_advantage(d1, each_data) #each_data = dict(each_data, **d1) # 这是解析面积 each_data = dict(each_data, **self.parse_item(area.get_text())) each_data['from'] = "MT" each_data = self.pipe(each_data) if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) return page_datas
def parse_datas(self, soup):
    i = 1
    page_datas = []
    details = soup.select('dd.detail ')
    hrefs = soup.select('span.c_blue0041d9.aVisited.f14B > a')
    comms = soup.select('span.xuzhentian > a')
    prices = soup.select('span > em')
    for detail, href, comm, price in zip(details, hrefs, comms, prices):
        each_data = dict(advantage='', builded_year=0, spatial_arrangement='',
                         floor_index=0, total_floor=0)
        each_data['title'] = href.get_text().strip()
        each_data['community_name'] = comm.get_text().strip()
        each_data['details_url'] = "http://esf.xmhouse.com" + href.get('href')
        each_data['total_price'] = ToolsBox.strToInt(price.get_text())
        h_infos = re.search(r'<span style="margin-left: 5px; color: #000000">.*</span>(.*) <div',
                            str(detail), re.S) \
            .group(1).replace('<br/>', '').replace('\r\n', '').replace(' ', '').split(',')
        for item in h_infos:
            try:
                d1 = self.parse_item(item)
                each_data = self.add_advantage(d1, each_data)
                # each_data = dict(each_data, **d1)
            except Exception as e:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                    fout.write(' Data retrieved: ')
                    for i1 in h_infos:
                        fout.write(i1 + ',')
                    fout.write('\n Item that raised the error in XmParser: ' + str(item) + '\n')
                    traceback.print_exc(file=fout)
                print(traceback.format_exc())
        each_data['from'] = "XMHouse"
        # ToolsBox.printDic(each_data)
        # print('******************{0}******************'.format(i))
        # i += 1
        each_data = self.pipe(each_data)
        if each_data:
            page_datas.append(each_data)
        else:
            if ToolsBox.ShowInvalideData(each_data):
                page_datas.append(each_data)
    return page_datas
def parse_datas(self, soup): totalfind = soup.select("span.tit em") if 0 == ToolsBox.strToInt(totalfind[1].get_text()): return '0' page_datas = [] communitys = soup.select("h3 > a") adds = soup.select('.li-info>address') dates = soup.select('p.date') prices = soup.select('p>strong') forsales = soup.select('p.bot-tag>span>a') for community, add, date, price, forsale in zip( communitys, adds, dates, prices, forsales): each_data = dict() each_data['community_name'] = community.get('title') each_data['community_url'] = community.get('href') add1 = ToolsBox.clearStr(add.get_text()) addlist = add1.split(']') if len(addlist) > 1: regionlist = addlist[0].replace('[', '').split('-') if len(regionlist) > 1: each_data['region'], each_data['block'] = regionlist else: each_data['region'] = regionlist each_data['address'] = addlist[1] else: each_data['address'] = add1 each_data['builded_year'] = ToolsBox.strToInt(date.get_text()) each_data['forsale_num'] = ToolsBox.strToInt(forsale.get_text()) each_data['price'] = ToolsBox.strToInt(price.get_text()) # each_data['date'] each_data['from'] = "AJK" if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) # ToolsBox.printDic(page_datas) return page_datas
def parse_datas(self, soup): page_datas = [] details = soup.select(".houseInfo") comms = soup.select(".positionInfo a") prices = soup.select(".totalPrice") titles = soup.select("div.title a.CLICKDATA") for title, detail, price, comm in zip(titles, details, prices, comms): each_data = dict(builded_year=0, spatial_arrangement='', floor_index=0, total_floor=0, details_url=title.get('href'), advantage='') each_data['title'] = title.get_text().strip() houseInfos = re.split(r'\s*[|,\s]\s*', ToolsBox.clearStr(detail.get_text())) # print(houseInfos) # print("1"*20) each_data['community_name'] = comm.get_text().strip() if len(each_data['community_name']) >= 20: input(each_data['community_name'] + ':' + str(len(each_data['community_name']))) # houseInfos = houseInfos[1:] #第一个是小区名称,切片去除 for item in houseInfos: # print(item) d1 = self.parse_item(item) each_data = self.add_advantage(d1, each_data) each_data['total_price'] = ToolsBox.strToInt(price.get_text()) each_data['from'] = "Beike" each_data = self.pipe(each_data) if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) # print(each_data) if not page_datas: item_num = soup.select(".fl span") if item_num: page_datas = item_num[0].get_text().strip() return page_datas
def parse_datas(self, soup):
    page_datas = []
    items = soup.select('div.info')
    titles = soup.select('p.title a ')
    comms = soup.select('p.hlistP a span')
    addresses = soup.select('p.hlistP a.addressChange')
    regions = soup.select('p.hlistP > span')
    mores = soup.select('.moreInfo')
    prices = soup.select('.price')
    for item, title, comm, addr, region, price, more in \
            zip(items, titles, comms, addresses, regions, prices, mores):
        each_data = dict(builded_year=0, spatial_arrangement='', floor_index=0, total_floor=0)
        each_data['title'] = title.get_text()
        each_data['details_url'] = 'http://www.917.com' + title.get('href')
        details = item.select('p')
        for string in details[1].stripped_strings:
            d1 = self.parse_item(string.strip())
            each_data = self.add_advantage(d1, each_data)
        each_data['community_name'] = comm.get_text()
        each_data['community_address'] = addr.get_text()
        each_data['region'] = region.get_text().replace('|', '').replace(' ', '')
        each_data['total_price'] = ToolsBox.strToInt(price.get_text())
        each_data['from'] = "917"
        getP = more.select('p')
        for p in getP:
            if '建筑面积' in p.get_text():  # '建筑面积' = floor area on the listing page
                d1 = self.parse_item(p.get_text().strip())
                each_data = self.add_advantage(d1, each_data)
        each_data = self.pipe(each_data)
        if each_data:
            page_datas.append(each_data)
        else:
            if ToolsBox.ShowInvalideData(each_data):
                page_datas.append(each_data)
    return page_datas
def parse_datas(self,soup): page_datas = [] titles = soup.select("div.title > a") houseinfo = soup.select("div.houseInfo") positionInfo = soup.select("div.positionInfo") totalprices = soup.select("div.totalPrice") # for title, info, position, totalPrice in zip(titles, houseinfo, positionInfo, totalprices): each_data = {'builded_year': 0, 'spatial_arrangement': '', 'floor_index': 0, 'total_floor': 0} each_data['title'] = title.get_text() each_data['details_url'] = title.get('href') each_data['total_price'] = ToolsBox.strToInt(totalPrice.get_text()) info_item = info.get_text().split('|') # each_data['community_name'] = info_item[0].strip() # 第1个总是小区名称 for i in range(0, len(info_item)): d1 = self.parse_item(info_item[i].strip()) each_data = self.add_advantage(d1,each_data) position = position.get_text().replace('\t', '').replace('\n', '').split() each_data['community_name'] = position[0].strip() # 10月21日改变了小区名称位置 # print(position) each_data['block'] = position[-1] if ')' not in position[0]: # 链前的别墅会用'4层2008年建'的形式,加入')',以便分隔 position[0] = position[0].replace('层', '层)') for item in position[0].split(')'): # 2017.4.1链家格式有改 d1 = self.parse_item(item.strip()) # 2017.4.1链家格式有改 each_data = self.add_advantage(d1, each_data) # each_data = dict(each_data, **d1) each_data['from'] = "lianjia" each_data = self.pipe(each_data) if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) if not page_datas: total_num = soup.select('.total span') if total_num: page_datas = total_num[0].get_text().strip() return page_datas
def parse_datas(self, soup): page_datas = [] # title = soup.select("title") # if len(title) > 0: # print("The page's title is : {0}".format(title[0].get_text())) # else: # print("There is no title finded!") titles = soup.select(".shop_list > dl h4 a") houses = soup.select("p.tel_shop") comms = soup.select(".shop_list > dl dd p.add_shop a") comm_addresses = soup.select(".shop_list > dl dd p.add_shop span") prices = soup.select(".price_right .red b") for title, comm, comm_addresse, house, price in zip( titles, comms, comm_addresses, houses, prices): each_data = dict(builded_year=0, spatial_arrangement='', floor_index=0, total_floor=0, advantage='') each_data['title'] = title.get('title') each_data['details_url'] = "https://xm.esf.fang.com" + title.get( 'href') for item in house.children: if isinstance(item, bs4.element.NavigableString): d1 = self.parse_item(ToolsBox.clearStr(item)) each_data = self.add_advantage(d1, each_data) each_data['community_name'] = comm.get('title').strip() each_data['community_address'] = comm_addresse.get_text().strip() each_data['comm_url'] = comm.get('href').strip() each_data['total_price'] = ToolsBox.strToInt(price.get_text()) each_data['from'] = "Soufan" # each_data = self.pipe(each_data) if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) return page_datas
def parse_datas(self,soup): page_datas = [] titles = soup.select("h2.fix a") houses = soup.select('p.moudle') houses1 = soup.select('td.sm222 p.msg') # comms = soup.select('span.comm-address') prices = soup.select('div.percent b') # print(titles) for title,detail,detail1,price in zip(titles,houses,houses1,prices): # each_data = {} each_data = dict(advantage='', builded_year=0, spatial_arrangement='', floor_index=0, total_floor=0) each_data['title'] = title.get_text() each_data['details_url'] = 'https://danxia.com' + title.get('href') each_data['community_name'] = detail.select('a')[0].get_text() temp = detail.select('span') for item in temp: d1 = self.parse_item(item.get_text()) each_data = self.add_advantage(d1, each_data) # each_data = dict(each_data, **d1) temp1 = detail1.select('span') for item in temp1: d1 = self.parse_item(item.get_text()) each_data = self.add_advantage(d1, each_data) # each_data = dict(each_data, **d1) each_data['total_price'] = ToolsBox.strToInt(price.get_text()) each_data['from'] = "Danxia" each_data = self.pipe(each_data) # 2016.6.4增加一个专门的数据处理 if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) return page_datas
def parse_datas(self, soup): page_datas = [] # details = soup.select("div.house-info") # comms = soup.select("div.house-info > a ") # positions = soup.select("div.house-position") # prices = soup.select("span.georgia") # titles = soup.select("h3 > a") # regions = soup.select(".region") # for title,comm,detail,position,price,region in zip(titles,comms,details,positions,prices,regions): # 2019/9/9乐居网改版面的 titles = soup.select("div.title_in") d_urls = soup.select("div.title_in > a") adds = soup.select("div.address") infos = soup.select("div.house_info") prices = soup.select("div.price > span") for title, add, d_url, info, price in zip(titles, adds, d_urls, infos, prices): each_data = dict(builded_year=0, spatial_arrangement='', floor_index=0, total_floor=0, title=title.get('title')) comms = add.select('span') each_data['community_name'] = ToolsBox.clearStr( comms[0].get_text()) for comm in comms: comm = ToolsBox.clearStr(comm.get_text()) if '-' != comm: if '-' in comm: c_item = comm.split('-') each_data['region'] = c_item[0] each_data['block'] = c_item[1] if '年' in comm: out = self.parse_item(comm) each_data = self.add_advantage(out, each_data) h_info = info.select('span') for item in h_info: item = ToolsBox.clearStr(item.get_text()) each_data = self.add_advantage(self.parse_item(item), each_data) each_data['details_url'] = 'https:' + d_url.get('href') each_data['total_price'] = ToolsBox.strToInt(price.get_text()) # , details_url='http://xm.esf.leju.com' + title.get('href') # mr20 = detail.select("span.mr20") # posi = position.select("span") # for j in range(1,len(posi)): # out = self.parse_item(posi[j].get_text()) # each_data = self.add_advantage(out, each_data) # # if len(out) > 0: # # if ('advantage' in each_data.keys()) and ('advantage' in out.keys()): # # each_data['advantage'] = each_data['advantage'] + ',' + out['advantage'] # # else: # # each_data = dict(each_data, **out) # for item in mr20: # d1 = self.parse_item(item.get_text()) # each_data = self.add_advantage(d1, each_data) # # if len(d1) > 0: # # if ('advantage' in each_data.keys()) and ('advantage' in d1.keys()): # # each_data['advantage'] = each_data['advantage'] + ',' + d1['advantage'] # # else: # # each_data = dict(each_data, **d1) # each_data['community_address'] = region.get_text().strip() # each_data['community_name'] = comm.get_text() # each_data['total_price'] =ToolsBox.strToInt(price.get_text()) # each_data['price'] = round(float(each_data['total_price']*10000/each_data['area']),2) each_data['from'] = "lejv" each_data = self.pipe(each_data) if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) return page_datas
def parse_page(self, soup): page_datas = [] details_urls = soup.select(".property>a") titles = soup.select("h3.property-content-title-name") houses = soup.select("div.property-content-detail>section") # houses = soup.select('div.house-details') comms = soup.select('.property-content-info-comm-name') adds = soup.select('.property-content-info-comm-address') prices = soup.select('.property-price-total-num') for details_url, title, details, comm, price, add in zip( details_urls, titles, houses, comms, prices, adds): each_data = dict(advantage='', builded_year=0, spatial_arrangement='', floor_index=0, total_floor=0) each_data['title'] = title.get_text() each_data['details_url'] = details_url.get('href') houses = details.select(".property-content-info") detail = houses[0].select("p") for string in detail: d1 = self.parse_item( ToolsBox.clearStr(string.get_text().strip())) each_data = self.add_advantage(d1, each_data) each_data['community_name'] = comm.get_text().strip() add_list = [] for string in add.strings: add_list.append(ToolsBox.clearStr(string.strip())) try: each_data['region'], each_data['block'], each_data[ 'community_address'] = add_list except Exception as e: with open('logtest.txt', 'a+') as fout: fout.write('*************' + str(datetime.datetime.now()) + '*************\n') fout.write('AJK解析区、板块、地址时出错,待解析的数据:') traceback.print_exc(file=fout) print(traceback.format_exc()) # print(price) each_data['total_price'] = ToolsBox.strToInt(price.get_text()) each_data['from'] = "AJK" # try: # 2016.8.1 这里解析也时有出差,把它保留下来 # each_data['total_price'] = ToolsBox.strToInt(price.get_text()) # except Exception as e: # with open('logtest.txt', 'a+') as fout: # fout.write('*************' + str(datetime.datetime.now()) + '*************\n') # fout.write('AJK解析total_price出错,待解析的数据:' + price.get_text()) # traceback.print_exc(file=fout) # print(traceback.format_exc()) # try: # comminfo = comm.get('title').split() # each_data['community_name'] = comminfo[0] # each_data['region'], each_data['block'], each_data['community_address'] = comminfo[1].split('-', 2) # except Exception as e: # with open('logtest.txt', 'a+') as fout: # fout.write('*************' + str(datetime.datetime.now()) + '*************\n') # fout.write('Parse Failt of :%s \n' % comm.get('title')) # traceback.print_exc(file=fout) # print(traceback.format_exc()) # each_data['community_name'] = each_data['community_name'].strip() # try: # house = details.select('span') # # 2016.8.17 重写了字段解析,抽象出一个parse_item方法 # for h in house: # if len(h.attrs) == 0: # string = h.get_text().encode('utf8') # d1 = {} # d1 = self.parse_item(string) # each_data = self.add_advantage(d1, each_data) #each_data = dict(each_data, **d1) # each_data['from'] = "AJK" # except Exception as e: # with open('logtest.txt', 'a+') as fout: # fout.write('*************' + str(datetime.datetime.now()) + '*************\n') # fout.write(' 待解析的数据:\n') # for i1 in house: # fout.write(str(i1) + '\n') # fout.write('\n 字段数:' + str(len(house)) + '\n') # traceback.print_exc(file=fout) # print(traceback.format_exc()) each_data = self.pipe(each_data) # 2016.6.4增加一个专门的数据处理 if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) return page_datas
def parse_datas(self, soup): page_datas = [] details = soup.select(".size") # comms = soup.select("a span.address-eara") # print(comms) prices = soup.select(".num") titles = soup.select("div.ershoufang-list .title a") regions = soup.select("span.area a") lists = soup.select(".ershoufang-list") for title, detail, list1, price, region in zip(titles, details, lists, prices, regions): # for title in titles: each_data = { 'builded_year': 0, 'spatial_arrangement': '', 'floor_index': 0, 'total_floor': 0, 'advantage': '', 'title': title.get('title'), 'details_url': 'http:' + title.get('href') } for item in (detail.stripped_strings): d1 = self.parse_item(item) each_data = self.add_advantage(d1, each_data) each_data['total_price'] = ToolsBox.strToInt(price.get_text()) address = list1.select("dd.address") # print(address[0]) # print(len(address)) # for item in comm.stripped_strings: # print(item) # print(comm.stripped_strings) # print(50*'0') if len(address) > 0: if len(address[0].select("a.address-eara")) > 0: each_data['region'] = ToolsBox.clearStr( address[0].select("a.address-eara")[0].get_text()) if len(address[0].select("span.address-eara")) > 0: # each_data['community_name'] = address[0].select("span.address-eara")[0].get_text() # print(each_data['community_name']) each_data['community_name'] = ToolsBox.clearStr( address[0].select("span.address-eara")[0].get_text()) # try: # except (IndexError) as e: # print("****页面数据不规范*****") # input(address) # each_data['community_name'] = (comm.get_text()) # print(comm.children) # for name in comm.descendants: # print(name) # # pass # print('-'*50) # each_data['region'] = ToolsBox.clearStr(region.get_text()) each_data['from'] = "ganji" # print(each_data) each_data = self.pipe(each_data) # if each_data: page_datas.append(each_data) else: if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data) return page_datas
def parse_datas(self, soup):
    page_datas = []
    # print(soup)
    titles = soup.select("h2.title > a")
    prices = soup.select('p.sum > b')
    houses = soup.select('.list-info')
    for title, price, house in zip(titles, prices, houses):
        each_data = {'advantage': '', 'builded_year': 0, 'spatial_arrangement': '',
                     'floor_index': 0, 'total_floor': 0,
                     'title': title.get_text(),
                     'details_url': title.get('href'),
                     'total_price': ToolsBox.strToInt(price.get_text())}
        details = house.select('p.baseinfo')
        spans = details[0].select('span')
        for span in spans:
            # dropped a Python 2-era .encode('utf8') here: parse_item expects str, not bytes
            string = ToolsBox.clearStr(span.get_text())
            # d1 = {}
            d1 = self.parse_item(string)
            each_data = self.add_advantage(d1, each_data)
            # each_data = dict(each_data, **d1)
        comms = details[1].select('a')
        each_data['community_name'] = comms[0].get_text()
        if comms[0].get('href') is None:
            each_data['comm_url'] = ''
        else:
            each_data['comm_url'] = 'http://xm.58.com' + comms[0].get('href')
        each_data['from'] = "58"
        try:
            if len(comms) >= 2:
                # input('region')
                each_data['region'] = comms[1].get_text().strip()
        except Exception as e:
            # print('------- this record did not get the community region -------')
            # ToolsBox.printDic(each_data)
            print(e)
        try:
            if len(comms) >= 3:
                # input('address')
                each_data['community_address'] = comms[2].get_text().strip()
        except Exception as e:
            # print('------- this record did not get the community address -------')
            # ToolsBox.printDic(each_data)
            print(e)
        each_data = self.pipe(each_data)
        if each_data:
            # For some reason the community name is occasionally all digits; skip those records
            match_comm = re.findall(r'^\d+$', each_data['community_name'])
            # print(match_comm)
            if len(match_comm) > 0:
                print('///////////// got an all-digit community name!!! /////////////')
                ToolsBox.priList(each_data)
                print(soup)
                # print(each_data['community_name'])
                # var1 = input(each_data['community_name'] + ' got an all-digit community name!!!')
            else:
                page_datas.append(each_data)
        else:
            if ToolsBox.ShowInvalideData(each_data):
                page_datas.append(each_data)
    return page_datas
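# ---------------------------------------------------------------------------
# Every parser above hands individual text fragments to self.parse_item() and
# merges the result into each_data via self.add_advantage().  Those helpers
# are defined elsewhere in the repo and are not shown here.  The two functions
# below are only a minimal sketch of the contract the call sites appear to
# rely on (dict in / dict out, unrecognised fragments concatenated under
# 'advantage'); the regexes and rules are illustrative assumptions, not the
# project's real implementation.
# ---------------------------------------------------------------------------
import re


def sketch_parse_item(item):
    """Hypothetical stand-in for parse_item(): map one fragment to field keys."""
    item = item.strip()
    if not item:
        return {}
    m = re.search(r'(\d{4})年', item)        # e.g. '2008年建'   -> builded_year
    if m:
        return {'builded_year': int(m.group(1))}
    m = re.search(r'([\d.]+)平', item)       # e.g. '89.5平米'   -> area
    if m:
        return {'area': float(m.group(1))}
    if re.search(r'\d+室\d+厅', item):       # e.g. '3室2厅'     -> spatial_arrangement
        return {'spatial_arrangement': item}
    m = re.search(r'共(\d+)层', item)        # e.g. '高层(共32层)' -> total_floor
    if m:
        return {'total_floor': int(m.group(1))}
    return {'advantage': item}               # anything unrecognised becomes an "advantage" tag


def sketch_add_advantage(d1, each_data):
    """Hypothetical stand-in for add_advantage(): merge d1, concatenating 'advantage'."""
    if not d1:
        return each_data
    if 'advantage' in d1 and each_data.get('advantage'):
        each_data['advantage'] = each_data['advantage'] + ',' + d1.pop('advantage')
    return dict(each_data, **d1)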
def craw_a_page(self, new_url, retries=3):
    # Work out and report the delay
    if self.delay > 0:
        sleepSeconds = random.randint(self.delay, self.delay * 2)
        print('craw {0} after {1} seconds ({2} ~ {3}):'.format(
            self.count, sleepSeconds, self.delay, self.delay * 2))
    else:
        sleepSeconds = 0  # keep the name bound in case self.delay is raised mid-crawl
        print('craw {0} :'.format(self.count))
    # Build request headers and proxy info; they differ for every page
    # proxy = self.proxy_builder()
    proxy = None
    self.headers_builder()
    # Download
    html_cont, code = self.downloader.download(new_url, headers=self.headers, proxy=proxy)
    # Handle the downloaded content
    # 1. Handle 4xx/5xx responses
    if 400 <= code < 600:
        # if isinstance(html_cont, int) and (400 <= (html_cont) < 600):
        self.HTTP404 += 1
        print("Abnormal response (in MassController): {0}".format(code))
        if html_cont is not None:
            self.downloader.getTitle(html_cont)
            new_urls, new_datas = self.parser.page_parse(html_cont)
            if new_datas == 'checkcode':  # the parser detected a captcha page
                print(str(datetime.datetime.now()))
                self.delay = input("Captcha encountered. Enter a delay in seconds; already-parsed data will be kept......")
                if self.delay == '':
                    self.delay = 0
                else:
                    self.delay = ToolsBox.strToInt(self.delay)
                self.total = self.total + self.outputer.out_mysql()
                if retries > 0:
                    return self.craw_a_page(new_url, retries - 1)
        time.sleep(30 * self.HTTP404)  # probably blocked; back off for a while
        if self.HTTP404 > self.HTTP404_stop:
            # On Anjuke the community '安全局宿舍' returns a not-found error; skip it automatically here
            match_comm = re.findall(r'kw=(.*)&from_url', new_url)
            if unquote(match_comm[0], 'utf-8') != '0':
                print(str(datetime.datetime.now()))
                self.delay = input("You seem to be blocked. Enter a delay in seconds; already-parsed data will be kept......")
                if self.delay == '':
                    self.delay = 0
                else:
                    self.delay = ToolsBox.strToInt(self.delay)
                self.total = self.total + self.outputer.out_mysql()
            self.HTTP404 = 0
        else:
            return self.craw_a_page(new_url)
    # 2. The page was fetched normally
    elif html_cont is not None:
        # 2019-03-11: simplified the analysis
        new_urls, new_datas = self.parser.page_parse(html_cont)  # returns the parsed content
        if new_datas == 'checkcode':  # the parser detected a captcha page
            print(str(datetime.datetime.now()))
            self.delay = input("Captcha encountered. Enter a delay in seconds; already-parsed data will be kept......")
            if self.delay == '':
                self.delay = 0
            else:
                self.delay = ToolsBox.strToInt(self.delay)
            self.total = self.total + self.outputer.out_mysql()
            if retries > 0:
                return self.craw_a_page(new_url, retries - 1)
        elif new_datas == '0':  # the query genuinely returned no records
            print('This page really has 0 records; it is not a parsing failure')
            print('This page - datas: none, urls: naturally none')
        elif len(new_datas) == 0 and len(new_urls) == 0:  # nothing could be parsed
            self.nodata += 1
            if self.nodata < self.nodata_stop:
                print("No data parsed from this page; {0} retries left".format(self.nodata_stop - self.nodata))
                print(html_cont)
                time.sleep(random.randint(3, 7))
                return self.craw_a_page(new_url)
            else:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('\n*******' + str(datetime.datetime.now()) + '*************')
                    fout.write('\n No data on this page: %s. \n' % new_url)
                if self.nodata < 999:
                    self.delay = input('Pages keep coming back empty. Check the link above; if it looks fine, '
                                       'enter a delay in seconds; already-parsed data will be kept......')
                    if self.delay == '':
                        self.delay = 0
                    else:
                        self.delay = ToolsBox.strToInt(self.delay)
                    self.nodata = 0
                else:  # when self.nodata is 1000+ (e.g. Ganji), pages without data are ignored
                    self.nodata = 1000
        else:  # normal case: the page parsed successfully
            print('This page - datas: {0}, urls: {1}'.format(len(new_datas), len(new_urls)))
            # Push the page links into the URL manager
            self.urls.add_new_urls(new_urls)
            # Push the community names into the community manager
            for data in new_datas:
                self.add_comm(data)
            # Hand the listings to the outputer; invalid records are dropped, the rest kept in outputer.raw_datas
            self.outputer.collect_data(new_datas)
            data_num = self.outputer.get_datas_quantity()
            print("Total %6.0f = %6.0f duplicates + %5.0f in data pool + %6.0f stored in DB "
                  % (data_num['dupli_count'] + data_num['r_data'] + self.total,
                     data_num['dupli_count'], data_num['r_data'], self.total))
            if 3000 < data_num['r_data']:
                print("Saving to the database, please wait......")
                storenum = self.outputer.out_mysql()
                if storenum:
                    self.total = self.total + storenum
            self.count += 1
            self.nodata = 0 if self.nodata < 999 else 1000  # reset the no-data counter once data arrives
            self.HTTP404 = 0  # clear the HTTP404 counter once data arrives
    # 3. html_cont is None: the download itself failed (5xx-style failure)
    else:
        print('Could not download {0} from the server'.format(new_url))
        self.HTTP404 += 1
        time.sleep(15 * self.HTTP404)  # probably blocked; back off for a while
        if self.HTTP404 > self.HTTP404_stop:
            self.delay = input('Repeatedly failed to fetch page content. Check the link above; if it looks fine, '
                               'enter a delay in seconds; already-parsed data will be kept......')
            if self.delay == '':
                self.delay = 0
            else:
                self.delay = ToolsBox.strToInt(self.delay)
            self.total = self.total + self.outputer.out_mysql()
            self.HTTP404 = 0
        else:
            # if retries > 0:
            #     return self.craw_a_page(new_url, retries - 1)
            return self.craw_a_page(new_url)
    # Delay module: kept at the end so that the first fetch is not delayed
    if not 0 >= self.delay:
        time.sleep(sleepSeconds)  # 2017-05-15: the download delay was moved here; this module effectively acts as the controller
def craw_a_page_of_commPrice(self, new_url, retries=3):
    # Work out and report the delay
    if self.delay > 0:
        sleepSeconds = random.randint(self.delay, self.delay * 2)
        print('craw {0} after {1} seconds ({2} ~ {3}):'.format(
            self.count, sleepSeconds, self.delay, self.delay * 2))
    else:
        sleepSeconds = 0  # keep the name bound in case self.delay is raised mid-crawl
        print('craw {0} :'.format(self.count))
    # Build request headers and proxy info; they differ for every page
    proxy = None
    self.headers_builder()
    # Download
    html_cont, code = self.downloader.download(new_url, headers=self.headers, proxy=proxy)
    # Handle the downloaded content
    # 1. The page was fetched normally
    if html_cont is not None:
        new_urls, new_datas = self.parser.page_parse(html_cont)  # returns the parsed content
        if new_datas == 'checkcode':  # the parser detected a captcha page
            print(str(datetime.datetime.now()))
            self.delay = input("Captcha encountered. Enter a delay in seconds; already-parsed data will be kept......")
            if self.delay == '':
                self.delay = 0
            else:
                self.delay = ToolsBox.strToInt(self.delay)
            # TODO: change the output target  self.total = self.total + self.outputer.out_mysql()
            if retries > 0:
                return self.craw_a_page_of_commPrice(new_url, retries - 1)
        elif new_datas == '0':  # the query returned no records
            print('Community not found')
            print('This page - datas: none, urls: naturally none')
        elif len(new_datas) == 0 and len(new_urls) == 0:  # nothing could be parsed
            self.nodata += 1
            if self.nodata < self.nodata_stop:
                print("No data parsed from this page; {0} retries left".format(self.nodata_stop - self.nodata))
                time.sleep(random.randint(3, 7))
                return self.craw_a_page_of_commPrice(new_url)
            else:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('\n*******' + str(datetime.datetime.now()) + '*************')
                    fout.write('\n No data on this page: %s. \n' % new_url)
                if self.nodata < 999:
                    self.delay = input('Pages keep coming back empty. Check the link above; if it looks fine, '
                                       'enter a delay in seconds; already-parsed data will be kept......')
                    if self.delay == '':
                        self.delay = 0
                    else:
                        self.delay = ToolsBox.strToInt(self.delay)
                    self.nodata = 0
                else:
                    self.nodata = 1000
        else:  # normal case: the page parsed successfully
            print('This page - datas: {0}, urls: {1}'.format(len(new_datas), len(new_urls)))
            # Push the page links into the URL manager
            self.urls.add_new_urls(new_urls)
            # ToolsBox.priList(new_urls)
            # Push the community names into the community manager
            # for data in new_datas:
            #     self.add_comm(data)
            # ToolsBox.priList(new_datas)
            # Hand the listings to the outputer; invalid records are dropped, the rest kept in outputer.raw_datas
            # self.outputer.collect_data(new_datas)
            # data_num = self.outputer.get_datas_quantity()
            # print("Total %6.0f = %6.0f duplicates + %5.0f in data pool + %6.0f stored in DB " % (
            #     data_num['dupli_count'] + data_num['r_data'] + self.total, data_num['dupli_count'],
            #     data_num['r_data'],
            #     self.total))
            #
            # if 3000 < data_num['r_data']:
            #     print("Saving to the database, please wait......")
            #     storenum = self.outputer.out_mysql()
            #     if storenum:
            #         self.total = self.total + storenum
            self.count += 1
            self.nodata = 0 if self.nodata < 999 else 1000  # reset the no-data counter once data arrives
            self.HTTP404 = 0  # clear the HTTP404 counter once data arrives
    # 2. html_cont is None
    else:
        print('Could not download {0} from the server'.format(new_url))
        print("Abnormal response (in MassController.craw_a_page_of_commPrice): {0}".format(code))
        self.HTTP404 += 1
        time.sleep(15 * self.HTTP404)  # probably blocked; back off for a while
        if self.HTTP404 > self.HTTP404_stop:
            self.delay = input('Repeatedly failed to fetch page content. Check the link above; if it looks fine, '
                               'enter a delay in seconds; already-parsed data will be kept......')
            if self.delay == '':
                self.delay = 0
            else:
                self.delay = ToolsBox.strToInt(self.delay)
            # TODO: change the output target  self.total = self.total + self.outputer.out_mysql()
            self.HTTP404 = 0
        else:
            return self.craw_a_page_of_commPrice(new_url)
    # Delay module: kept at the end so that the first fetch is not delayed
    if not 0 >= self.delay:
        time.sleep(sleepSeconds)
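# ---------------------------------------------------------------------------
# ToolsBox.strToInt() is used throughout the parsers and the controller to
# turn scraped text such as '350万' or '2003年建' (and the operator's delay
# input) into an integer.  Its real implementation lives in the ToolsBox
# module; the helper below is only a sketch of the behaviour the call sites
# appear to rely on: the first run of digits wins, and anything unparsable
# becomes 0 (which is why callers compare the result against 0).
# ---------------------------------------------------------------------------
import re


def sketch_str_to_int(text):
    """Hypothetical stand-in for ToolsBox.strToInt()."""
    m = re.search(r'\d+', str(text))
    return int(m.group(0)) if m else 0


# Assumed examples: sketch_str_to_int('350万') -> 350,
# sketch_str_to_int('暂无数据') -> 0, sketch_str_to_int('') -> 0.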