def collect_area_xiaoqu_data(city_name, area_name, fmt="csv"):
    """Crawl every xiaoqu (residential community) in one area and save it to a file.

    :param city_name: city abbreviation used to build the crawl URL
    :param area_name: area (板块) abbreviation used to build the crawl URL
    :param fmt: output format; only "csv" rows are actually written
    :return: None
    """
    global total_num, today_path
    csv_file = today_path + "/{0}.csv".format(area_name)
    # Pin the encoding: community names are Chinese, and the platform default
    # (e.g. GBK on Windows) could raise UnicodeEncodeError on write.
    with open(csv_file, "w", encoding="utf-8") as f:
        # Fetch every community listed under this area.
        xqs = get_xiaoqu_info(city_name, area_name)
        # The crawl runs multi-threaded: update the shared counter under the lock.
        if mutex.acquire(1):
            total_num += len(xqs)
            # Release the lock immediately after the counter update.
            mutex.release()
        # NOTE(review): for fmt != "csv" an empty file is still created and the
        # counter still advances — presumably intentional; confirm with callers.
        if fmt == "csv":
            for xiaoqu in xqs:
                f.write(date_string + "," + xiaoqu.text() + "\n")
    print("Finish crawl area: " + area_name + ", save data to : " + csv_file)
    logger.info("Finish crawl area: " + area_name + ", save data to : " + csv_file)
def collect_area_xiaoqu_data(self, city_name, area_name, fmt="csv"):
    """Crawl every xiaoqu (residential community) in one area and save it to a file.

    :param city_name: city abbreviation used to build the crawl URL
    :param area_name: area (板块) abbreviation used to build the crawl URL
    :param fmt: output format; only "csv" rows are actually written
    :return: None
    """
    district_name = area_dict.get(area_name, "")
    csv_file = self.today_path + "/{0}_{1}.csv".format(district_name, area_name)
    # Pin the encoding: community names are Chinese, and the platform default
    # (e.g. GBK on Windows) could raise UnicodeEncodeError on write.
    with open(csv_file, "w", encoding="utf-8") as f:
        # Fetch every community listed under this area.
        xqs = self.get_xiaoqu_info(city_name, area_name)
        # The crawl runs multi-threaded: update the shared counter under the lock.
        if self.mutex.acquire(1):
            self.total_num += len(xqs)
            # Release the lock immediately after the counter update.
            self.mutex.release()
        if fmt == "csv":
            for xiaoqu in xqs:
                f.write(self.date_string + "," + xiaoqu.text() + "\n")
    print("Finish crawl area: " + area_name + ", save data to : " + csv_file)
    logger.info("Finish crawl area: " + area_name + ", save data to : " + csv_file)
def get_xiaoqu_info(self, city, area):
    """Crawl all xiaoqu listings for one area, following pagination.

    :param city: city abbreviation used in the listing URL
    :param area: area abbreviation used in the listing URL
    :return: list of XiaoQu objects (district, area, name, price, on-sale count)
    """
    district = area_dict.get(area, "")
    chinese_district = get_chinese_district(district)
    chinese_area = chinese_area_dict.get(area, "")
    xiaoqu_list = list()
    page = 'http://{0}.{1}.com/xiaoqu/{2}/'.format(city, SPIDER_NAME, area)
    print(page)
    logger.info(page)
    headers = create_headers()
    response = requests.get(page, timeout=10, headers=headers)
    html = response.content
    soup = BeautifulSoup(html, "lxml")
    # Read the total page count from the JSON embedded in the pager div.
    # Raw string fixes the invalid '\d' escape in the original pattern.
    try:
        page_box = soup.find_all('div', class_='page-box')[0]
        matches = re.search(r'.*"totalPage":(\d+),.*', str(page_box))
        total_page = int(matches.group(1))
    except Exception as e:
        # No pager found (or markup changed): assume a single page, best-effort.
        print("\tWarning: only find one page for {0}".format(area))
        print(e)
        total_page = 1
    # Walk every result page from the first to the last.
    for i in range(1, total_page + 1):
        headers = create_headers()
        page = 'http://{0}.{1}.com/xiaoqu/{2}/pg{3}'.format(
            city, SPIDER_NAME, area, i)
        response = requests.get(page, timeout=10, headers=headers)
        html = response.content
        soup = BeautifulSoup(html, "lxml")
        # Each <li class="xiaoquListItem"> panel holds one community.
        house_elems = soup.find_all('li', class_="xiaoquListItem")
        for house_elem in house_elems:
            price = house_elem.find('div', class_="totalPrice")
            name = house_elem.find('div', class_='title')
            on_sale = house_elem.find('div', class_="xiaoquListItemSellCount")
            # Clean up the scraped text.
            price = price.text.strip()
            name = name.text.replace("\n", "")
            on_sale = on_sale.text.replace("\n", "").strip()
            # Keep the record as an object.
            xiaoqu = XiaoQu(chinese_district, chinese_area, name, price, on_sale)
            xiaoqu_list.append(xiaoqu)
    return xiaoqu_list
def get_xiaoqu_info(city, area):
    """Crawl all xiaoqu listings for one area on lianjia, following pagination.

    :param city: city abbreviation used in the listing URL
    :param area: area abbreviation used in the listing URL
    :return: list of XiaoQu objects (district, area, name, price, on-sale count)
    """
    district = area_dict.get(area, "")
    chinese_district = get_chinese_district(district)
    chinese_area = chinese_area_dict.get(area, "")
    xiaoqu_list = list()
    page = 'http://{0}.lianjia.com/xiaoqu/{1}/'.format(city, area)
    print(page)
    logger.info(page)
    headers = create_headers()
    response = requests.get(page, timeout=10, headers=headers)
    html = response.content
    soup = BeautifulSoup(html, "lxml")
    # Read the total page count from the JSON embedded in the pager div.
    # Raw string fixes the invalid '\d' escape in the original pattern.
    try:
        page_box = soup.find_all('div', class_='page-box')[0]
        matches = re.search(r'.*"totalPage":(\d+),.*', str(page_box))
        total_page = int(matches.group(1))
    except Exception as e:
        # No pager found (or markup changed): assume a single page, best-effort.
        print("\tWarning: only find one page for {0}".format(area))
        print(e)
        total_page = 1
    # Walk every result page from the first to the last.
    for i in range(1, total_page + 1):
        headers = create_headers()
        page = 'http://{0}.lianjia.com/xiaoqu/{1}/pg{2}'.format(city, area, i)
        response = requests.get(page, timeout=10, headers=headers)
        html = response.content
        soup = BeautifulSoup(html, "lxml")
        # Each <li class="xiaoquListItem"> panel holds one community.
        house_elems = soup.find_all('li', class_="xiaoquListItem")
        for house_elem in house_elems:
            price = house_elem.find('div', class_="totalPrice")
            name = house_elem.find('div', class_='title')
            on_sale = house_elem.find('div', class_="xiaoquListItemSellCount")
            # Clean up the scraped text.
            price = price.text.strip()
            name = name.text.replace("\n", "")
            on_sale = on_sale.text.replace("\n", "").strip()
            # Keep the record as an object.
            xiaoqu = XiaoQu(chinese_district, chinese_area, name, price, on_sale)
            xiaoqu_list.append(xiaoqu)
    return xiaoqu_list