def crawl(self):
    browser = Setting.settingDriver()
    browser.get(self.url)
    try:
        # Jump to the last page ('»»'), read the highest page number from
        # the pagination bar, then jump back to the first page ('««').
        browser.find_element_by_xpath("//a[contains(text(),'»»')]").click()
        soup = BeautifulSoup(browser.page_source, 'lxml')
        pageNum = int(soup.select("[class='pagination']")[0].find_all('a')[-1].text)
        browser.find_element_by_xpath("//a[contains(text(),'««')]").click()
        time.sleep(2)
    except IndexError:
        # The pagination bar lacks the expected last link; read the
        # second-to-last one instead.
        soup = BeautifulSoup(browser.page_source, 'lxml')
        pageNum = int(soup.select("[class='pagination']")[0].find_all('a')[-2].text)
    except Exception:
        # No '»»' shortcut at all (e.g. a short listing): take the first page link.
        soup = BeautifulSoup(browser.page_source, 'lxml')
        pageNum = int(soup.select("[class='pagination']")[0].find_all('a')[0].text)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    if pageNum > 1:
        # Scrape every page, clicking 'next' between pages.
        for num in range(pageNum - 1):
            self.content()
            browser.find_element_by_xpath("//a[@rel='next']").click()
        self.content()
    elif pageNum == 1:
        self.content()
    # Hand every collected itinerary over to the Deposit layer.
    while self.items:
        self.count += 1
        gabriel = Deposit(self.code, self.items.pop())
        gabriel.run()
        print('Crawled and deposited {} records from {}'.format(self.count, self.code))
def content(self):
    browser = Setting.settingDriver()
    browser.get(self.url)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    # Tour rows are the 2nd..Nth <ul> elements inside the listing container.
    for i in range(2, len(soup.select("ul[class='cue-list']")) + 2):
        row = "div.out-showcue-list > ul:nth-of-type(" + str(i) + ") > "

        self.data_dic['title'] = soup.select(row + "li:nth-of-type(2) > a")[0].text

        # Departure date is rendered as 'YYYY/MM/DD' with surrounding whitespace.
        tmp_date = soup.select(row + "li:nth-of-type(1) > p")[0].text \
            .replace('\r', '').replace('\n', '').strip()[:10]
        year, month, day = tmp_date.split('/')
        self.data_dic['departure_date'] = datetime.date(int(year), int(month), int(day))

        tmp_link = ("http://www.gabriel.com.tw" +
                    soup.select(row + "li:nth-of-type(2) > a")[0]['href'])
        self.data_dic['link'] = tmp_link

        self.data_dic['status'] = soup.select(
            row + "li:nth-of-type(5)")[0].text.split(':')[1].strip()

        tmp_price = soup.select(row + "li:nth-of-type(3)")[0].text.split(':')[1]
        self.data_dic['price'] = tmp_price
        self.data_dic['date_price'] = tmp_price

        # Visit the tour page and collect the day-by-day itinerary notes
        # plus the keyword headings.
        browser.get(tmp_link)
        detail = browser.find_elements_by_xpath("(//div[@class='note'])")
        keyword = browser.find_elements_by_tag_name('h6')[:-1]
        detail_dic = {}
        for day_count, item in enumerate(detail[:-1], start=1):
            detail_dic["DAY " + str(day_count)] = item.text
        detail_dic['Keywords'] = [item.text for item in keyword if item.text != '']
        self.data_dic['detail'] = detail_dic

        self.items.append(self.data_dic)
        self.resetDataDic()
        browser.back()
    browser.close()
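# The page-count probing in crawl() and the date handling in content() could be
# factored into small helpers. The two functions below are hypothetical
# sketches, not part of the original module: page_count() assumes the same
# "class='pagination'" markup queried above, and parse_departure() assumes the
# 'YYYY/MM/DD' text the site renders.
def page_count(page_source):
    soup = BeautifulSoup(page_source, 'lxml')
    links = soup.select("[class='pagination']")[0].find_all('a')
    numbers = [int(a.text) for a in links if a.text.strip().isdigit()]
    return max(numbers) if numbers else 1


def parse_departure(text):
    cleaned = text.replace('\r', '').replace('\n', '').strip()[:10]
    year, month, day = (int(part) for part in cleaned.split('/'))
    return datetime.date(year, month, day)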
def crawler_sqlmap(entry_url, depth=-1, level=1, threads=2, timeout=30, checkhost=True):
    """Entry point that launches a sqlmap scan.

    :param entry_url: entry URL of the site to scan
    :param depth: crawl depth for the spider; -1 means no depth limit (default -1)
    :param level: sqlmap test level, 1-5 (default 1); higher levels run more
        test payloads, giving more precise results but taking longer
    :param threads: number of sqlmap scan threads (default 2)
    :param timeout: sqlmap scan timeout in seconds (default 30)
    :param checkhost: whether crawled links must belong to the same host
    :return: a 4-tuple (ret, url, simple, content)
        ret: True on success, False on failure
        url: the scan target URL
        simple: condensed report extracted from content, as a dict
        content: the full report returned by sqlmap, as a dict
        On failure, the error message is stored in the simple slot.
    """
    settings = Setting(handle=False)
    settings.depth = depth
    settings.nocheckhost = not checkhost
    settings.level = level
    settings.threads = threads
    settings.timeout = timeout
    sqlmap, crawler = None, None
    try:
        sqlmap, ip, port = start_sqlmap()
        # The crawler must be created after sqlmap starts, so it can pick up
        # sqlmap's port number.
        crawler = Crawler(BASE_DIR, ip, port, entry_url, setting=settings)
        crawler.run()
        cont, simple = crawler.raw_report()
        return True, entry_url, simple, cont
    except Exception:
        logger.error(traceback.format_exc())
        return False, entry_url, traceback.format_exc(), {}
    finally:
        if crawler:
            crawler.close()
        if sqlmap:
            sqlmap.terminate()
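# A minimal usage sketch for crawler_sqlmap(); 'http://testsite.example/' is a
# placeholder target, not an address from this project.
ret, url, simple, content = crawler_sqlmap('http://testsite.example/', level=2, threads=4)
if ret:
    print('scan of {} finished'.format(url))
    print(simple)  # condensed findings
else:
    print('scan failed: {}'.format(simple))  # on failure this slot holds the traceback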
def getDetail(self, link):
    browser = Setting.settingDriver()
    browser.get(link)
    try:
        # Scroll to the bottom so the tab bar renders, then open the
        # itinerary ('行程内容') tab.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        browser.find_element_by_xpath("//a[contains(text(),'行程内容')]").click()
        detail = browser.find_elements_by_xpath(
            "//div[@class='clp-header md-top-n']/p[2]")
        detail_dic = {}
        for day_count, item in enumerate(detail, start=1):
            detail_dic["DAY " + str(day_count)] = item.text.replace('\n', '')
        self.itinerary['detail'] = detail_dic
    except Exception:
        # "此項目無行程內容" = "this item has no itinerary details".
        self.itinerary['detail'] = {"notice": "此項目無行程內容"}
    browser.close()
def crawl(self):
    fu = UserAgent()
    # Randomize the request's User-Agent header (the original key
    # 'UserAgent' was a typo, so the header was never honored).
    headers = {'User-Agent': fu.random}
    resp = requests.get(self.url, headers=headers)
    html = BeautifulSoup(resp.text, 'lxml')
    # Collect, absolutize and deduplicate every tour link on the page.
    links = html.find('div', class_='trip').find_all('a')
    links = [urljoin(resp.url, link['href']) for link in links]
    wait_list = list(set(links))
    while wait_list:
        link = wait_list.pop()
        driver = Setting.settingDriver()
        driver.get(link)
        items = [item.text for item in driver.find_elements(By.XPATH, '//td')]
        # Departures render as rows of 13 <td> cells; rows whose status cell
        # (index 11) reads '結團' (group closed) are skipped.
        flag = True
        while flag:
            if len(items) < 13:
                break
            if items[11] == u'結團':
                if len(items) == 13:
                    break
                items = items[13:]
                continue
            self.count += 1
            # Cell 3 holds the departure date as 'M.D'; the year is fixed to 2018.
            convertDate = datetime.date(2018, int(items[3].split('.')[0]),
                                        int(items[3].split('.')[1]))
            phoenix = Deposit(self.tag, items, link, convertDate)
            phoenix.run()
            print('Crawled and deposited {} records from {}'.format(self.count, self.tag))
            flag = False
        driver.quit()
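# Hypothetical helper illustrating the 13-cell row layout handled above: split
# the flat list of <td> texts into per-departure rows. Not part of the original
# module; the width of 13 matches the assumption made in crawl().
def rows_of(items, width=13):
    for start in range(0, len(items) - width + 1, width):
        yield items[start:start + width]

# The skip-closed-groups walk could then read:
# for row in rows_of(items):
#     if row[11] != u'結團':
#         ...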
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import datetime

from deposit.lion import Deposit
from selenium import webdriver
from crawler.setting import Setting

# Search window: from today to six months from now.
now = Setting.getNowDate().strftime("%Y-%m-%d")
halfYearByNow = Setting.getHalfYearByNow().strftime("%Y-%m-%d")


class Lion(object):
    def __init__(self, tag_code):
        self.url = ("https://travel.liontravel.com/search?Country=TW&WebCode=B2C"
                    "&TravelType=1&Page=1&PageSize=1000&DepartureID="
                    "&GoDateStart=" + now + "&GoDateEnd=" + halfYearByNow +
                    "&IsEnsureGroup=false&ArriveID=" + tag_code)
        self.code = tag_code
        self.count = 0
        self.itinerary = {
            'title': '',
            'price': '',
            'detail': {},
            'departure_date': [],
            'link': [],
            'status': [],
            'date_price': []
        }

    def resetItinerary(self):
        # Restore the same empty structure used in __init__.
        self.itinerary = {
            'title': '',
            'price': '',
            'detail': {},
            'departure_date': [],
            'link': [],
            'status': [],
            'date_price': []
        }
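# The search URL above could also be assembled with urllib.parse.urlencode,
# which handles escaping automatically. A hypothetical sketch, not part of the
# original module; the parameter names are copied from the URL string above.
from urllib.parse import urlencode

def build_lion_url(tag_code):
    params = {
        'Country': 'TW', 'WebCode': 'B2C', 'TravelType': 1,
        'Page': 1, 'PageSize': 1000, 'DepartureID': '',
        'GoDateStart': now, 'GoDateEnd': halfYearByNow,
        'IsEnsureGroup': 'false', 'ArriveID': tag_code,
    }
    return 'https://travel.liontravel.com/search?' + urlencode(params)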
def crawl(self):
    browser = Setting.settingDriver()
    browser.get(self.url)
    html = BeautifulSoup(browser.page_source, 'lxml')
    available = []
    waiting = []
    datas = []
    # One iteration per page in the pager drop-down.
    for i in range(len(html.select("[name='ColaPager$ddlPageNo']")[0].find_all('option'))):
        self.count += 1
        html = BeautifulSoup(browser.page_source, 'lxml')
        for data in html.select("[class='Grid']"):
            for item in data.select("[class='TourName']"):
                self.data_dic['title'].append(item.text)
                tmp_link = "https://www.colatour.com.tw" + str(item['href'])
                self.data_dic['link'].append(tmp_link)
                # Open the tour page and collect the day-by-day itinerary.
                browser.get(tmp_link)
                detail = browser.find_elements_by_xpath(
                    "//td[@style='background-color: #D1E6FE; color: blue']")
                detail_dic = {}
                for day_count, detail_data in enumerate(detail, start=1):
                    detail_dic["DAY " + str(day_count)] = detail_data.text
                browser.back()
                self.data_dic['detail'].append(detail_dic)
        for item in html.select("[class='GridItem']"):
            datas.append(item.text.replace("\n", "").replace("\r", "").replace(" ", ""))
        browser.find_element_by_xpath("//input[@id='ColaPager_cmdNextPage']").click()
        print('Preparing page {} of data from {}'.format(self.count, self.code))
    browser.quit()
    # Each tour row flattens to 11 cells: index 2 is the departure date,
    # 6 the price, 8 available seats, 9 waiting-list seats, 10 the status.
    count_seat = -1
    for i in range(len(datas)):
        if i % 11 == 2:
            tmp_month = datas[i].split('/')[0]
            tmp_day = datas[i].split('/')[1][:2]
            self.data_dic['departure_date'].append(
                datetime.date(2018, int(tmp_month), int(tmp_day)))
        elif i % 11 == 6:
            self.data_dic['price'].append(datas[i])
            self.data_dic['date_price'].append(datas[i])
        elif i % 11 == 8:
            available.append(datas[i])
            count_seat += 1
        elif i % 11 == 9:
            waiting.append(datas[i])
        elif i % 11 == 10:
            if datas[i] != "關團":
                # Unless the status reads "關團" (group closed), report seats
                # for sale (可售) and on the waiting list (候補).
                self.data_dic['status'].append(
                    "可售:" + available[count_seat] + " 候補:" + waiting[count_seat])
            else:
                self.data_dic['status'].append(datas[i])
    cola = Deposit(self.code, self.data_dic)
    cola.run()
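# The pager click in crawl() assumes the next-page button is clickable as soon
# as the page renders. A sketch using Selenium's standard explicit-wait API
# instead; the button id is the one used above, and the 10-second timeout is
# an assumption, not a value from the original code.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(browser, timeout=10):
    WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable((By.ID, 'ColaPager_cmdNextPage'))
    ).click()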
import requests
from bs4 import BeautifulSoup
import datetime

from deposit.t1tour import Deposit
from selenium import webdriver
from crawler.setting import Setting

# Date window formatted as 'MM%2FDD%2FYYYY' for the query string.
now = Setting.getNowDate().strftime("%m-%d-%Y").replace('-', '%2F')
halfYearByNow = Setting.getHalfYearByNow().strftime("%m-%d-%Y").replace('-', '%2F')


class T1tour(object):
    def __init__(self, tag_code):
        self.url = ('http://www.t1tour.com.tw/tour?country=' + tag_code +
                    '&sdate=' + now + '&edate=' + halfYearByNow)
        self.code = tag_code
        self.data_dic = {
            'title': [],
            'price': [],
            'departure_date': [],
            'link': [],
            'status': [],
            'date_price': [],
            'detail': []
        }

    def getPage(self):
        res = requests.get(self.url)
        soup = BeautifulSoup(res.text, 'lxml')
        total_page = 0
        for num in soup.select("[class='dib']"):
            total_page = len(num.find_all('a'))
        # Return the page count (the original assigned it without using it).
        return total_page
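# Usage sketch for T1tour; 'JP' is a placeholder tag code, not necessarily a
# valid country code for t1tour.com.tw.
t1 = T1tour('JP')
print(t1.getPage())  # number of result pages for the search window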