def get_page_urls(self):
    '''
    Parse the URL of every result page from the initial page's HTML.
    :return:
    '''
    # Parse the list of page-number <li> elements
    pages_ul = self.soup.find('ul', class_='npage fr')
    pages_lis = pages_ul.find_all('li')
    pages_lis = [li for li in pages_lis if re.search(r'\d+', li.text)]
    # The <li> element for the last page
    last_page_li = pages_lis[-1]
    # Extract the page URL and the page count from the last page's <li>.
    # URL format: /full/0/765_0_180000_0_0_0_0_10_1, where the '10' is the page number.
    last_page_url = last_page_li.find('a')['href']
    last_page_url = urljoin.url_path_join(HtmlConstant.ZHILIAN_XIAOYUAN_HOST, last_page_url)
    page_num = int(last_page_li.text)
    logging.debug('get_page_urls method, last_page_url = %s, page_num = %s',
                  last_page_url, page_num)
    # Build a URL format string by swapping the page-number segment for a '{}' placeholder
    splited_url = last_page_url.split('_')
    splited_url[-2] = '{}'
    page_formatter_url = '_'.join(splited_url)
def new(self):
    # type: () -> str
    """Get new project ID

    Returns:
        str: ID of the newly created dataset
    """
    # todo put/post?
    return requests.get(url_path_join(self.get_address(), "new")).text
def update(self, ds_id, data):
    # type: (str, dict) -> None
    """Update project details

    Args:
        ds_id (str): dataset id
        data (dict): new dataset properties
    """
    requests.post(url_path_join(self.get_address(), "update", ds_id), json=data)
def project_details(self, ds_id):
    # type: (str) -> dict
    """Get project details

    Args:
        ds_id (str): dataset id

    Returns:
        dict: Dataset details
    """
    return requests.get(url_path_join(self.get_address(), "detail", ds_id)).json()
def use(self, ds_id, usage={}):
    # type: (str, dict) -> dict
    """Get dataset info with use note

    Args:
        ds_id (str): dataset id
        usage (dict): usage details

    Returns:
        dict: dataset details
    """
    return requests.post(url_path_join(self.get_address(), "use", ds_id), json=usage).json()
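A minimal usage sketch for the dataset-client methods above, assuming they live on a hypothetical DatasetClient class whose get_address() returns the server's base URL; the class name, constructor, and field values are illustrative, not taken from the original code.

# Hypothetical driver for the new/update/project_details/use methods above.
client = DatasetClient()                                   # assumed constructor, not in the source
ds_id = client.new()                                       # create a dataset, returns its id as text
client.update(ds_id, {"name": "demo", "tags": ["test"]})   # push new properties
details = client.project_details(ds_id)                    # fetch stored properties as a dict
info = client.use(ds_id, usage={"purpose": "smoke test"})  # record a usage note, get details back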
def link_crawler(seed_url, link_regex):
    """
    :param seed_url: URL of the entry page
    :param link_regex: regular expression that candidate links must match
    :return: None
    """
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url, user_agent='Baiduspider', proxy=None, retries=2)
        for link in get_links(html):
            # print('seen:', seen)
            if re.match(link_regex, link, re.I):
                # Resolve the (possibly relative) link against the seed URL
                link = urljoin.url_path_join(seed_url, link)
                # print('link1:', link)
                if link not in seen:
                    crawl_queue.append(link)
                    seen.add(link)
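A hypothetical invocation of link_crawler, reusing the seed host from the robots.txt snippet later in this section; the link pattern is a placeholder and assumes download() and get_links() are importable.

# Placeholder invocation; the regex is illustrative, not from the original code.
if __name__ == '__main__':
    link_crawler('http://webscraping.com', r'.*/(index|view)')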
def crawl(_dst_url: str):
    global __count
    history_list.append(_dst_url)
    __count = __count + 1
    filename = util.get_hash(_dst_url)
    cache_html_path = cache_page_path + filename + ".html"
    if file.exist(cache_html_path):
        html = file.read(cache_html_path)
        print("read from cache -----> " + filename)
    else:
        print("fetching from network -----> " + filename)
        html = spider.get_html(_dst_url)
        file.write(cache_html_path, html)
    links = spider.get_links(html)
    img_list = spider.get_img(html)
    # print(img_list)
    __image_list.extend(img_list)
    json_str = json.dumps(img_list, ensure_ascii=False)
    file.write(cache_link_path + filename + ".json", json_str)
    global_list.extend(links)
    global_set = set(global_list)
    new_set = set()
    for x in global_set:
        # Drop the fragment part; keep links under the crawl root and
        # resolve relative links against the base URL.
        v = str(x).split("#")[0]
        if _root in v:
            new_set.add(v)
        else:
            if str(v).find('http') == -1:
                new_set.add(urljoin.url_path_join(_base_url, v))
    return new_set
def htmlParse(seed_url, url, st=None, result={}):
    logger = logging.getLogger(__name__)
    if st is None:
        st = set()
    current_page_children = set()
    logger.error("Making network call for url: " + url)
    content = network.makeGetRequest(url)
    if content is None:
        return None
    logger.error("Request done successfully")
    soup = BeautifulSoup(content, parser.SOUP_DECODER)
    for tag in soup.find_all(href=True):
        if is_internal_link(tag[parser.LINK_ATTR]):
            if urlparse(tag[parser.LINK_ATTR]).netloc == '':
                # Relative link: resolve it against the docs root.
                # Use a separate name so the page's own `url` key is not overwritten below.
                child_url = urljoin.url_path_join("https://docs.python.org/", tag[parser.LINK_ATTR])
                st.add(child_url)
                current_page_children.add(child_url)
            else:
                st.add(tag[parser.LINK_ATTR])
                current_page_children.add(tag[parser.LINK_ATTR])
    result[url] = current_page_children
    return st
def test_url_path_join_nested():
    assert url_path_join("http://example.com/", "/path/path") == \
        "http://example.com/path/path"


def test_url_path_join_multiple():
    assert url_path_join("http://example.com/", "/path/", "a", "b") == \
        "http://example.com/path/a/b"


def test_url_path_join_trailing_slash():
    assert url_path_join("http://example.com/", "/path/", trailing_slash=True) == \
        "http://example.com/path/"


def test_url_path_join_single():
    assert url_path_join("http://example.com/") == \
        "http://example.com"
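The tests above pin down the expected joining behavior; the following is a minimal sketch of a url_path_join that satisfies them, not the library's actual implementation.

def url_path_join(*parts, trailing_slash=False):
    # Strip surrounding slashes from each piece, drop empty pieces,
    # and rejoin with single slashes; optionally keep a trailing slash.
    cleaned = [p.strip("/") for p in parts]
    joined = "/".join(p for p in cleaned if p)
    if trailing_slash:
        joined += "/"
    return joined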
# import urllib3
import requests
import re
import urljoin
import urllib
from urllib import robotparser
from urllib import request
from urllib import error
from urllib import parse

# w = whois.whois("baidu.com")
# print(w)

url = 'http://webscraping.com'
rp = robotparser.RobotFileParser()
rp.set_url(urljoin.url_path_join(url, 'robots.txt'))
rp.read()


def download(url, user_agent='wyy', proxy=None, retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    if not rp.can_fetch(user_agent, url):
        print('Blocked by robots.txt:', url)
        return None
    req = request.Request(url, headers=headers)
    opener = request.build_opener()
    if proxy:
        # urlparse exposes the URL scheme via the `scheme` attribute
        proxy_params = {parse.urlparse(url).scheme: proxy}
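The download helper above is cut off mid-body; assuming it ultimately returns the fetched HTML (as the link_crawler snippet earlier expects), a call might look like this.

# Assumes download() returns the page HTML on success and None when blocked or failing.
html = download('http://webscraping.com', user_agent='wyy', retries=2)
if html is not None:
    print(len(html), 'characters downloaded')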
def reload(self):
    # type: () -> None
    """Force server to reload DB"""
    requests.post(url_path_join(self.get_address(), "reload"))
def scan(self):
    # type: () -> None
    """Force local fs rescan"""
    requests.get(url_path_join(self.get_address(), "scan"))
def parse_detail_page(self, response=None, url=None):
    try:
        response.encoding = self.encoding
        unicode_html_body = response.text
        # dict_content = json.loads(unicode_html_body)
        # data = dict_content["data"]
        data = etree.HTML(unicode_html_body)
        # data = BeautifulSoup(unicode_html_body, "lxml")
        # data = htmlparser.Parser(unicode_html_body)
    except Exception as e:
        print("parse_detail_page(): %s" % e)
        return None
    from_tag_url = response.url
    logger.info("list page url: %s" % from_tag_url)
    # li_content = data.xpath('''//li[@class="l-twos"]''')
    for i in range(1, 21):
        # Pull the detail URL out of the onclick handler and make it absolute
        detail_url = data.xpath(
            '''(//li[@class="l-twos"])[{}]//div[@class="l-tw-title"]/@onclick'''.format(i)
        )[0].replace("window.location.href='", "").replace("'", "")
        detail_url = urljoin.url_path_join("http://job.qust.edu.cn", detail_url)
        if self.getdumps(detail_url):
            continue
        date = data.xpath(
            '''(//li[@class="l-twos"])[{}]//div[@class="l-tw-xiangq"]/p[@class="l-tw-xiangqa"]/text()[2]'''.format(i)
        )[0].replace("时间:", "")
        location = data.xpath(
            '''(//li[@class="l-twos"])[{}]//div[@class="l-tw-xiangq2"]/p[@class="l-tw-xiangqa"]/text()[2]'''.format(i)
        )[0].replace("地点:", "")
        uid = str(uuid.uuid3(uuid.NAMESPACE_DNS, detail_url)) + str(uuid.uuid5(uuid.NAMESPACE_DNS, detail_url))[0]
        ctime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        title = data.xpath(
            '''(//li[@class="l-twos"])[{}]//div[@class="l-tw-title"]/@title'''.format(i)
        )[0]
        siteName = self.siteName
        post = {
            "uuid": uid,              # unique id derived from the detail URL
            "detailUrl": detail_url,  # detail page URL
            "title": title,           # posting title
            "location": location,     # location
            "siteName": self.siteName,
            "ctime": ctime,
            "date": date,
        }
        dic = self.handle_post(post)
        sql = '''insert into list_info (uuid,detailUrl,title,location,siteName,ctime,date)
                 VALUES (%s,%s,%s,%s,%s,%s,%s);'''
        try:
            # Make sure the connection is still alive before executing the SQL
            self.db.ping()
        except Exception as e:
            logger.info(e)
            self.db.rollback()  # roll back on error
            self.db = MySQLdb.connect(host="127.0.0.1", user="******", passwd="root",
                                      db='qust', charset='utf8')
        try:
            self.cursor.execute(sql, (uid, detail_url, title, location, siteName, ctime, date))
            self.db.commit()  # commit the changes to the database
            logger.info("row inserted")
        except Exception as e:
            logger.info(e)
            self.db.rollback()  # roll back on error
    # self.parse_restore(unicode_html_body)
    return