Example #1
    def crawl(self):
        browser = Setting.settingDriver()
        browser.get(self.url)
        try:
            # Jump to the last page ('»»') to read the total page count.
            browser.find_element_by_xpath("//a[contains(text(),'»»')]").click()
            soup = BeautifulSoup(browser.page_source, 'lxml')
            pageNum = int(
                soup.select("[class='pagination']")[0].find_all('a')[-1].text)
            browser.find_element_by_xpath("//a[contains(text(),'««')]").click()
            time.sleep(2)
        except IndexError:
            soup = BeautifulSoup(browser.page_source, 'lxml')
            pageNum = int(
                soup.select("[class='pagination']")[0].find_all('a')[-2].text)
        except Exception:
            soup = BeautifulSoup(browser.page_source, 'lxml')
            pageNum = int(
                soup.select("[class='pagination']")[0].find_all('a')[0].text)

        soup = BeautifulSoup(browser.page_source, 'lxml')
        if pageNum > 1:
            # Scrape the first page, then step through the remaining pages.
            self.content()
            for num in range(pageNum - 1):
                browser.find_element_by_xpath("//a[@rel='next']").click()
                self.content()
        elif pageNum == 1:
            self.content()
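        # Hand each scraped item to the Deposit pipeline one at a time.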
        while self.items:
            self.count += 1
            gabriel = Deposit(self.code, self.items.pop())
            gabriel.run()
            print('Crawling and deposit {} data from {}'.format(
                self.count, self.code))
Example #2
    def content(self):
        browser = Setting.settingDriver()
        browser.get(self.url)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        # Tour rows are <ul> elements inside div.out-showcue-list; indexing
        # starts at nth-of-type(2), one iteration per <ul class='cue-list'>.
        for i in range(2, len(soup.select("ul[class='cue-list']")) + 2):
            tmp_title = soup.select("div.out-showcue-list > ul:nth-of-type(" +
                                    str(i) +
                                    ") > li:nth-of-type(2) > a")[0].text
            self.data_dic['title'] = tmp_title
            tmp_date = soup.select(
                "div.out-showcue-list > ul:nth-of-type(" + str(i) +
                ") > li:nth-of-type(1) > p")[0].text.replace('\r', '').replace(
                    '\n', '').strip()[:10]
            convertDate = datetime.date(int(tmp_date.split('/')[0]),
                                        int(tmp_date.split('/')[1]),
                                        int(tmp_date.split('/')[2]))
            self.data_dic['departure_date'] = convertDate
            tmp_link = (
                "http://www.gabriel.com.tw" +
                soup.select("div.out-showcue-list > ul:nth-of-type(" + str(i) +
                            ") > li:nth-of-type(2) > a")[0]['href'])
            self.data_dic['link'] = tmp_link
            tmp_status = soup.select(
                "div.out-showcue-list > ul:nth-of-type(" + str(i) +
                ") > li:nth-of-type(5)")[0].text.split(':')[1].strip()
            self.data_dic['status'] = tmp_status
            tmp_price = (
                soup.select("div.out-showcue-list > ul:nth-of-type(" + str(i) +
                            ") > li:nth-of-type(3)")[0].text.split(':')[1])
            self.data_dic['price'] = tmp_price
            self.data_dic['date_price'] = tmp_price
            # Open the tour page to collect the day-by-day notes and keywords.
            browser.get(tmp_link)
            detail = browser.find_elements_by_xpath("(//div[@class='note'])")
            keyword = browser.find_elements_by_tag_name('h6')[:-1]
            key = []
            day_count = 0
            detail_dic = {}
            for item in detail[:-1]:
                day_count += 1
                detail_dic["DAY " + str(day_count)] = item.text
            for item in keyword:
                if item.text != '':
                    key.append(item.text)
            if key:
                detail_dic['Keywords'] = key
            self.data_dic['detail'] = detail_dic
            self.items.append(self.data_dic)
            self.resetDataDic()
            browser.back()
        browser.close()
Example #3
def crawler_sqlmap(entry_url,
                   depth=-1,
                   level=1,
                   threads=2,
                   timeout=30,
                   checkhost=True):
    """启动sqlmap扫描的入口函数。

    :param entry_url: 扫描网站的入口地址
    :param depth: 网页爬虫爬取页面深度,-1则表示不设置深度,默认-1
    :param level: sqlmap扫描测试等级:1-5(默认为1),等级越高使用的测试样例越多,结果越精确,时间也越长
    :param threads: sqlmap多线程扫描设置(默认为2)
    :param timeout: sqlmap扫描超时时间(默认30s)
    :param checkhost: 检查爬取链接是否属于同一域
    :return: 返回值为四元组(ret, url, simple, content)
            ret: 执行结果, False为失败, True为成功
            url: 扫描目标地址
            simple: 解析content抽取重要数据生成的报告,字典类型
            content: sqlmap返回的完整报告,字典类型
            若执行结果为False,那么把扫描错误信息存在扫描关键结果(simple)这个位置
    """
    settings = Setting(handle=False)
    settings.depth = depth
    settings.nocheckhost = not checkhost
    settings.level = level
    settings.threads = threads
    settings.timeout = timeout

    sqlmap, crawler = None, None
    try:
        sqlmap, ip, port = start_sqlmap()
        # The crawler must be created after sqlmap has started, otherwise it
        # cannot obtain sqlmap's port number correctly.
        crawler = Crawler(BASE_DIR, ip, port, entry_url, setting=settings)
        crawler.run()
        cont, simple = crawler.raw_report()
        return True, entry_url, simple, cont
    except Exception:
        logger.error(traceback.format_exc())
        return False, entry_url, traceback.format_exc(), {}
    finally:
        if crawler: crawler.close()
        if sqlmap: sqlmap.terminate()
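
A minimal usage sketch for crawler_sqlmap, based only on the docstring above; the target URL is a placeholder and the unpacking simply mirrors the documented 4-tuple return value.

ret, url, simple, content = crawler_sqlmap(
    "http://testsite.example/",  # placeholder target, not from the original code
    depth=2, level=1, threads=2, timeout=60)
if ret:
    print("scan finished for", url, "summary:", simple)
else:
    # On failure, the 'simple' slot carries the error traceback text.
    print("scan failed for", url, ":", simple)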
Example #4
    def getDetail(self, link):
        browser = Setting.settingDriver()
        browser.get(link)
        try:
            # Scroll to the bottom, then open the '行程内容' (itinerary details) tab.
            browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")

            browser.find_element_by_xpath(
                "//a[contains(text(),'行程内容')]").click()
            detail = (browser.find_elements_by_xpath(
                "//div[@class='clp-header md-top-n']/p[2]"))
            day_count = 0
            detail_dic = {}
            for item in detail:
                day_count += 1
                detail_dic[("DAY " + str(day_count))] = item.text.replace(
                    '\n', '')
            self.itinerary['detail'] = detail_dic
        except Exception:
            # The stored notice means "this item has no itinerary details".
            no_detail = {"notice": "此項目無行程內容"}
            self.itinerary['detail'] = no_detail
        browser.close()
Example #5
    def crawl(self):
        fu = UserAgent()
        headers = {'User-Agent': fu.random}
        resp = requests.get(self.url, headers=headers)
        html = BeautifulSoup(resp.text, 'lxml')

        links = html.find('div', class_='trip').find_all('a')
        links = [link['href'] for link in links]
        links = [urljoin(resp.url, link) for link in links]
        links = list(set(links))

        wait_list = []
        wait_list += links
        while wait_list:
            link = wait_list.pop()
            driver = Setting.settingDriver()
            driver.get(link)
            items = driver.find_elements(By.XPATH, '//td')
            items = [item.text for item in items]

            # Each tour occupies 13 consecutive <td> cells; cell 11 holds the status.
            flag = True
            while flag:
                if items is None or len(items) < 13:
                    break
                if items[11] == u'結團':  # '結團' means the group is already closed
                    if len(items) == 13:
                        break
                    # Drop the closed tour and inspect the next 13-cell block.
                    items = items[13:]
                    continue
                self.count += 1
                convertDate = datetime.date(2018, int(items[3].split('.')[0]),
                                            int(items[3].split('.')[1]))
                phoenix = Deposit(self.tag, items, link, convertDate)
                phoenix.run()
                print('Crawling and deposit {} data from {}'.format(
                    self.count, self.tag))
                flag = False

            driver.quit()
Example #6
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import datetime
from deposit.lion import Deposit
from selenium import webdriver
from crawler.setting import Setting

# Search window for the query string: today through six months from now.
now = Setting.getNowDate().strftime("%Y-%m-%d")
halfYearByNow = Setting.getHalfYearByNow().strftime("%Y-%m-%d")


class Lion(object):
    def __init__(self, tag_code):
        self.url = "https://travel.liontravel.com/search?Country=TW&WebCode=B2C&TravelType=1&Page=1&PageSize=1000&DepartureID=&GoDateStart=" + now + "&GoDateEnd=" + halfYearByNow + "&IsEnsureGroup=false&ArriveID=" + tag_code
        self.code = tag_code
        self.count = 0
        self.itinerary = {
            'title': '',
            'price': '',
            'detail': {},
            'departure_date': [],
            'link': [],
            'status': [],
            'date_price': []
        }

    def resetItinerary(self):
        self.itinerary = {
            'title': '',
            'price': '',
            'detail': {},
            'departure_date': [],
            'link': [],
            'status': [],
            'date_price': []
        }
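
A minimal construction sketch, assuming only the __init__ shown above; 'XXX' is a placeholder ArriveID tag code, not a real value.

lion = Lion('XXX')  # hypothetical tag code
print(lion.url)     # assembled search URL covering today through six months out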
Example #7
    def crawl(self):
        browser = Setting.settingDriver()
        browser.get(self.url)
        html = BeautifulSoup(browser.page_source, 'lxml')

        available = []
        waiting = []
        datas = []

        # One iteration per page listed in the 'ColaPager$ddlPageNo' drop-down.
        for i in range(
                len(
                    html.select("[name='ColaPager$ddlPageNo']")[0].find_all(
                        'option'))):
            self.count += 1
            html = BeautifulSoup(browser.page_source, 'lxml')
            for data in html.select("[class='Grid']"):
                for item in data.select("[class='TourName']"):
                    self.data_dic['title'].append(item.text)
                    tmp_link = "https://www.colatour.com.tw" + str(
                        item['href'])
                    self.data_dic['link'].append(tmp_link)
                    browser.get(tmp_link)
                    detail = browser.find_elements_by_xpath(
                        "//td[@style='background-color: #D1E6FE; color: blue']"
                    )
                    day_count = 0
                    detail_dic = {}
                    for detail_data in detail:
                        day_count += 1
                        detail_dic[("DAY " +
                                    str(day_count))] = detail_data.text
                    browser.back()
                    self.data_dic['detail'].append(detail_dic)
            for item in html.select("[class='GridItem']"):
                tmp_data = item.text.replace("\n",
                                             "").replace("\r",
                                                         "").replace("  ", "")
                datas.append(tmp_data)

            browser.find_element_by_xpath(
                "//input[@id='ColaPager_cmdNextPage']").click()
            print('Preparing {} data from {}'.format(self.count, self.code))
        browser.quit()
        count_seat = -1
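        # Each GridItem row flattens into 11 text cells: index 2 holds the
        # departure date, 6 the price, 8 the available seats, 9 the waiting
        # list, and 10 the tour status.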
        for i in range(len(datas)):
            if i % 11 == 2:
                tmp_month = datas[i].split('/')[0]
                tmp_day = datas[i].split('/')[1][:2]
                tmp_date = datetime.date(2018, int(tmp_month), int(tmp_day))
                self.data_dic['departure_date'].append(tmp_date)
            elif i % 11 == 6:
                self.data_dic['price'].append(datas[i])
                self.data_dic['date_price'].append(datas[i])
            elif i % 11 == 8:
                available.append(datas[i])
                count_seat += 1
            elif i % 11 == 9:
                waiting.append(datas[i])
            elif i % 11 == 10:
                if datas[i] != "關團":  # '關團' means the group is closed
                    # '可售' = seats available, '候補' = wait-listed
                    tmp_string = "可售:" + available[
                        count_seat] + " 候補:" + waiting[count_seat]
                    self.data_dic['status'].append(tmp_string)
                else:
                    self.data_dic['status'].append(datas[i])

        cola = Deposit(self.code, self.data_dic)
        cola.run()
Example #8
import requests
from bs4 import BeautifulSoup
from deposit.t1tour import Deposit
import datetime
from selenium import webdriver
from crawler.setting import Setting

# Date window for the query string; '%2F' is the URL-encoded '/'.
now = Setting.getNowDate().strftime("%m-%d-%Y").replace('-', '%2F')
halfYearByNow = Setting.getHalfYearByNow().strftime("%m-%d-%Y").replace(
    '-', '%2F')


class T1tour(object):
    def __init__(self, tag_code):
        self.url = ('http://www.t1tour.com.tw/tour?country=' + tag_code +
                    '&sdate=' + now + '&edate=' + halfYearByNow)
        self.code = tag_code
        self.data_dic = {
            'title': [],
            'price': [],
            'departure_date': [],
            'link': [],
            'status': [],
            'date_price': [],
            'detail': []
        }

    def getPage(self):
        res = requests.get(self.url)
        soup = BeautifulSoup(res.text, 'lxml')
        for num in soup.select("[class='dib']"):
            total_page = len(num.find_all('a'))