def get_land_info_by_url(land_url, session):
    """Fetch the page for *land_url* and apply it to each matching LandInfo row.

    Every ``LandInfo`` row whose ``land_url`` equals *land_url* is printed,
    the page is fetched without cookies, and the resulting soup is passed
    together with the row to ``get_detailed_land_info``. The session is
    committed at the end.

    Args:
        land_url: URL path that is appended to ``settings.host_url``.
        session: Object providing ``query()`` and ``commit()``
            (presumably a SQLAlchemy session -- confirm against callers).
    """
    matching_rows = session.query(LandInfo).filter(LandInfo.land_url == land_url)
    for land in matching_rows:
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print(land.land_url)
        page_url = settings.host_url + land_url
        crawler = BasicSpider(page_url, 0)
        # The fetch helper is given the relative path; the absolute URL
        # was already handed to the BasicSpider constructor.
        soup = crawler.get_html_beautiful_soup_without_cookies(land_url)
        get_detailed_land_info(soup, land)
    # NOTE(review): commit is issued once after the loop -- confirm this
    # matches the intended transaction granularity.
    session.commit()
def get_detailed_info_by_land_info_url(self, land_info_url):
    """Fetch the page at *land_info_url*, then run the detailed-info update.

    A ``BasicSpider`` is created for ``settings.host_url + land_info_url``
    with ``self.step`` and the page is requested without cookies. After the
    request, ``land_detailed_info.update_land_detailed_info()`` is called.

    Args:
        land_info_url: URL path that is appended to ``settings.host_url``.
    """
    page_url = settings.host_url + land_info_url
    crawler = BasicSpider(page_url, self.step)
    # The fetched soup is not consumed in this method; the request itself
    # is still issued exactly as before.
    crawler.get_html_beautiful_soup_without_cookies(land_info_url)
    land_detailed_info.update_land_detailed_info()
def get_basic_info_by_land_info_page_url_list(self, land_info_page_url,
                                              district_row_id=-1):
    """Fetch one land-info listing page and pass it to the spider's extractor.

    The page is requested without cookies and the soup is handed, together
    with *district_row_id*, to
    ``BasicSpider.get_land_basic_info_by_land_info_url``. Nothing is returned.

    Args:
        land_info_page_url: URL path that is appended to ``settings.host_url``.
        district_row_id: Value forwarded unchanged to the extractor;
            defaults to ``-1``.
    """
    page_url = settings.host_url + land_info_page_url
    crawler = BasicSpider(page_url, self.step)
    soup = crawler.get_html_beautiful_soup_without_cookies(land_info_page_url)
    # The extractor's return value is not used by this method.
    crawler.get_land_basic_info_by_land_info_url(soup, district_row_id)
def get_trade_info_by_url(land_url):
    """Build a ``TradeInfo`` for *land_url* from its fetched page.

    The page is first requested without cookies. If
    ``check_need_cookies_or_not`` is truthy for that soup, the page is
    requested again with the cookies obtained from ``settings.cookies``.
    The final soup is passed to ``get_trade_info`` along with a new
    ``TradeInfo``.

    Args:
        land_url: URL path that is appended to ``settings.host_url``.

    Returns:
        The ``TradeInfo`` instance that was passed to ``get_trade_info``.
    """
    crawler = BasicSpider(settings.host_url + land_url, 0)
    soup = crawler.get_html_beautiful_soup_without_cookies(land_url)

    cookies_required = check_need_cookies_or_not(soup)
    if cookies_required:
        # Retry the same path, this time sending the configured cookies.
        soup = crawler.get_html_beautiful_soup_with_cookies(
            land_url, cookies=get_cookies(settings.cookies))

    result = TradeInfo(land_url)
    get_trade_info(soup, result)
    return result
def get_land_info_url_list_by_district_url(self, district_url):
    """Process every land-info listing page found for *district_url*.

    The district page is fetched without cookies, the spider extracts the
    listing-page URLs from it and looks up the region info for the district.
    Each listing-page URL is then passed to
    ``get_basic_info_by_land_info_page_url_list`` together with the first
    element of the region info.

    Args:
        district_url: URL path that is appended to ``settings.host_url``.
    """
    page_url = settings.host_url + district_url
    crawler = BasicSpider(page_url, self.step)
    soup = crawler.get_html_beautiful_soup_without_cookies(district_url)
    listing_page_urls = crawler.get_land_url_list_by_distrct_url(
        soup, district_url)
    district_info = crawler.search_region_info_by_url(district_url)

    # Nothing to process when the spider reports no listing pages.
    if listing_page_urls is None:
        return

    for land_info_page_url in listing_page_urls:
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print(len(land_info_page_url))
        # Looked up inside the loop so that an empty list never indexes
        # district_info at all.
        district_row_id = district_info[0]
        print(district_row_id)
        self.get_basic_info_by_land_info_page_url_list(
            land_info_page_url, district_row_id)
def get_district_url_by_city_url(self, city_url):
    """Fetch the page for *city_url* and pass it to the district-URL extractor.

    The page is requested without cookies and the soup is passed, together
    with *city_url*, to ``BasicSpider.get_district_url_list``. Nothing is
    returned.

    Args:
        city_url: URL path that is appended to ``settings.host_url``.
    """
    crawler = BasicSpider(settings.host_url + city_url, self.step)
    # The fetch helper is given the relative path, not the absolute URL.
    soup = crawler.get_html_beautiful_soup_without_cookies(city_url)
    crawler.get_district_url_list(soup, city_url)
def get_city_url_by_province_url(self, province_url):
    """Fetch the page for *province_url* and pass it to the city-URL extractor.

    The page is requested without cookies and the soup is passed, together
    with *province_url*, to ``BasicSpider.get_city_url_list``. Nothing is
    returned.

    Args:
        province_url: URL path that is appended to ``settings.host_url``.
    """
    full_url = settings.host_url + province_url
    crawler = BasicSpider(full_url, self.step)
    province_soup = crawler.get_html_beautiful_soup_without_cookies(province_url)
    crawler.get_city_url_list(province_soup, province_url)
def get_all_province_url(self):
    """Fetch the page at ``self.url`` and pass it to the province-URL extractor.

    A ``BasicSpider`` is created from ``self.url`` and ``self.step``, the
    page is requested without cookies, and the soup is passed to
    ``BasicSpider.get_province_url_list``. Nothing is returned.
    """
    crawler = BasicSpider(self.url, self.step)
    root_soup = crawler.get_html_beautiful_soup_without_cookies(self.url)
    crawler.get_province_url_list(root_soup)
def make_spider(cls):
    """Create a ``BasicSpider`` for ``base_url`` and store it as ``cls.spider``.

    The spider is constructed with an ``Options`` instance whose
    ``headless`` attribute has been set to ``False``.
    """
    browser_options = Options()
    browser_options.headless = False
    spider = BasicSpider(base_url, options=browser_options)
    cls.spider = spider