def my_main(year_need):
    """
    Crawl the calendar for a given year.
    :param year_need: int, currently only the range (1900, 2100) is supported
    :return:
    """
    sum_day = 1
    sum_week = 1
    year_dict = {}
    # Open the online calendar page
    spider.get_page("https://wannianrili.51240.com/")
    # Jump to December of the previous year and collect its data
    spider.select_year_option(str(year_need - 1))
    spider.select_month_option("12")
    source = spider.get_source()
    month_dict, sum_day, sum_week = get_month_dict(source, sum_day, sum_week,
                                                   str(year_need - 1), "12")
    year_dict.update(month_dict)
    last_month_day = sum_day
    last_month_week = sum_week
    spider.select_year_option(str(year_need))
    for month in range(1, 13):
        # Zero-pad the month number ("1" -> "01")
        month = str(month).zfill(2)
        print('month:', month)
        spider.select_month_option(month)
        source = spider.get_source()
        month_dict, sum_day, sum_week = get_month_dict(source, sum_day, sum_week,
                                                       str(year_need), month,
                                                       last_month_day, last_month_week)
        year_dict.update(month_dict)
        # break  # for testing
    print(year_dict)
    # Jump to January of the following year and collect its data
    spider.select_year_option(str(year_need + 1))
    spider.select_month_option("01")
    source = spider.get_source()
    month_dict, sum_day, sum_week = get_month_dict(source, sum_day, sum_week,
                                                   str(year_need + 1), "01")
    year_dict.update(month_dict)
    # Fix up holidays and compute the statistics
    year_dict = alter_holiday(year_dict)
    year_dict = count_next_workday(year_dict)
    year_dict = count_week_holiday(year_dict)
    print(year_dict)
    col_name, col_data = save.format_data(year_dict, year_need)
    save.write_calender(str(year_need) + '年日历.csv', col_name, col_data)
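# A minimal usage sketch. `spider`, `save`, `get_month_dict`, `alter_holiday`,
# `count_next_workday` and `count_week_holiday` are assumed to be defined
# elsewhere in the project; they do not appear in this section.
if __name__ == '__main__':
    my_main(2021)  # crawls the 2021 calendar and writes "2021年日历.csv"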
def crawl_url5(url=url5):
    html = get_page(url)
    # Match "ip:port" pairs (dots escaped so they only match literal dots)
    ip = re.compile(
        r"((1?\d?\d\.|2[0-4]\d\.|25[0-5]\.){3}(1?\d?\d|2[0-4]\d|25[0-5]):\d+)")
    for i in re.finditer(ip, html):
        proxie = {}
        proxie["http"] = i.group(1)
        yield proxie
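# Every crawler in this section calls a `get_page` helper that is not defined
# here. A minimal sketch of what it might look like, assuming it is a thin
# wrapper around `requests` that returns the page text or None on failure:
import requests

def get_page(url, params=None, timeout=10):
    """Fetch a page and return its text, or None if the request fails."""
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        resp = requests.get(url, params=params, headers=headers, timeout=timeout)
        if resp.status_code == 200:
            return resp.text
    except requests.RequestException:
        pass
    return None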
def crawl_url6(url=url6):
    html = get_page(url)
    soup = BeautifulSoup(html, 'lxml')
    soup = soup.find("table", class_="table")
    tbody = soup.find("tbody")
    for tr in tbody.find_all("tr"):
        proxie = {}
        td = tr.find_all("td")
        proxie["http"] = ":".join((td[0].text.strip(), td[1].text.strip()))
        print(proxie)
def crawl_ip3366(self):
    for page in range(1, 20):
        start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
        html = get_page(start_url)
        # \s* absorbs the whitespace, including the newlines between tags
        ip_adress = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        re_ip_adress = ip_adress.findall(html)
        for adress, port in re_ip_adress:
            yield ':'.join([adress, port])
def crawl_url7(url=url7, page=5):
    for i in range(1, page + 1):
        params = {"page": i}
        html = get_page(url, params=params)
        time.sleep(5)
        soup = BeautifulSoup(html, 'lxml')
        tbody = soup.find("tbody")
        for tr in tbody.find_all("tr"):
            td = tr.find_all("td")
            proxie = {}
            # Column 4 holds the scheme (HTTP/HTTPS), columns 1 and 2 hold ip and port
            proxie[(td[4].text.strip()).lower()] = ":".join(
                (td[1].text.strip(), td[2].text.strip()))
            yield proxie
def crawl_url5(self, url=url5):
    """Crawl proxies from 66ip."""
    print(f"Crawling proxies from {url}")
    html = get_page(url)
    if html:
        # Match "ip:port" pairs (dots escaped so they only match literal dots)
        ip = re.compile(
            r"((1?\d?\d\.|2[0-4]\d\.|25[0-5]\.){3}(1?\d?\d|2[0-4]\d|25[0-5]):\d+)")
        for i in re.finditer(ip, html):
            proxie = {}
            proxie["http"] = i.group(1)
            yield json.dumps(proxie)
    else:
        # Retry until the page can be fetched
        yield from self.crawl_url5(url)
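# The retry above recurses with no upper bound. One possible alternative is a
# small wrapper that re-runs a generator a fixed number of times; the name
# `retry_generator` and the attempt count are illustrative assumptions, not
# part of the original code.
def retry_generator(gen_func, attempts=3):
    """Re-run a generator-producing callable up to `attempts` times and
    yield from the first run that produces any items."""
    for _ in range(attempts):
        produced = False
        for item in gen_func():
            produced = True
            yield item
        if produced:
            return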
def crawl_url2(url=url2, page=5):
    base_url = url
    for i in range(1, page + 1):
        time.sleep(5)
        # Build the URL for the i-th result page
        url = f"{base_url}_{i}.html"
        print(url)
        html = get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        soup = soup.find("tbody")
        for tr in soup.find_all("tr"):
            td = tr.find_all("td")
            proxie = {}
            proxie['http'] = ":".join(
                (td[0].string.strip(), td[1].string.strip()))
            proxie = json.dumps(proxie)
            yield proxie
def crawl_url6(self, url=url6):
    """Crawl proxies from http://ip.seofangfa.com"""
    print(f"Crawling proxies from {url}")
    try:
        html = get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        soup = soup.find("table", class_="table")
        tbody = soup.find("tbody")
        for tr in tbody.find_all("tr"):
            proxie = {}
            td = tr.find_all("td")
            proxie["http"] = ":".join(
                (td[0].text.strip(), td[1].text.strip()))
            yield json.dumps(proxie)
    except Exception:
        # Retry the whole page on a fetch or parse error
        yield from self.crawl_url6(url)
def crawl_url7(self, url=url7, page=5):
    """Crawl proxies from http://ip.jiangxianli.com/"""
    print(f"Crawling proxies from {url}")
    for i in range(1, page + 1):
        try:
            params = {"page": i}
            html = get_page(url, params=params)
            time.sleep(5)
            soup = BeautifulSoup(html, 'lxml')
            tbody = soup.find("tbody")
            for tr in tbody.find_all("tr"):
                td = tr.find_all("td")
                proxie = {}
                # Column 4 holds the scheme (http/https), columns 1 and 2 hold ip and port
                proxie[(td[4].text.strip()).lower()] = ":".join(
                    (td[1].text.strip(), td[2].text.strip()))
                yield json.dumps(proxie)
        except Exception:
            continue
def crawl_url2(self, url=url2, page=6):
    """Crawl proxies from 89ip."""
    realurl = url
    for i in range(1, page + 1):
        try:
            url = f"{realurl}_{i}.html"
            print(f"Crawling proxies from {url}")
            html = get_page(url)
            time.sleep(5)
            soup = BeautifulSoup(html, 'lxml')
            soup = soup.find("tbody")
            for tr in soup.find_all("tr"):
                td = tr.find_all("td")
                proxie = {}
                proxie["http"] = ":".join(
                    (td[0].string.strip(), td[1].string.strip()))
                proxie = json.dumps(proxie)
                yield proxie
        except Exception:
            continue
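# A hedged sketch of how the class-based crawl methods above might be consumed.
# The `crawler` instance and the list of methods are assumptions for
# illustration; each method yields JSON strings such as '{"http": "1.2.3.4:8080"}'.
import json

def collect_proxies(crawler):
    """Run each crawl method and collect the unique proxy entries it yields."""
    seen = set()
    proxies = []
    for crawl in (crawler.crawl_url2, crawler.crawl_url5,
                  crawler.crawl_url6, crawler.crawl_url7):
        for item in crawl():
            entry = json.loads(item)
            key = tuple(sorted(entry.items()))
            if key not in seen:
                seen.add(key)
                proxies.append(entry)
    return proxies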