Example #1
from bs4 import BeautifulSoup


def qunar_calendar_parse_data(air_port, file_path):

    p_dict = {}
    # read the saved page
    with open(file_path, 'r', encoding='utf-8') as f:
        page = f.read()
    # parse the page
    soup = BeautifulSoup(page, 'lxml')
    try:
        price_list = soup.find_all('span', class_="price")

        for i in range(4, 34):
            # the first four entries are August data (shown as "查看", i.e. "view", in the file);
            # only the 30 September entries are parsed
            tmp = BeautifulSoup(str(price_list[i]), 'lxml')
            date = i - 3
            p_dict[str(date)] = tmp.body.span.span.text

        qunar_calendar_insert_db(air_port, p_dict)
    except Exception:
        spider_log("price_list not found.")
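Both parse functions log through spider_log, and the spiders below call current_time() for timestamps; neither helper is included in these examples. A minimal sketch of what they could look like, assuming spider_log appends timestamped lines to the log file referenced in Example #6 and that current_time('file_name_hour') returns a filename-safe timestamp (the exact formats are assumptions):
import time

LOG_PATH = "E:/ticket_spider/log/ticket_calendar_log.txt"  # path taken from Example #6


def spider_log(message):
    # prepend a timestamp, print to the console and append to the log file
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    line = "[" + stamp + "] " + str(message)
    print(line)
    with open(LOG_PATH, 'a', encoding='utf-8') as f:
        f.write(line + "\n")


def current_time(fmt=''):
    # 'file_name_hour' is used when the timestamp becomes part of a file name,
    # so it avoids characters such as ':' (this format is an assumption)
    if fmt == 'file_name_hour':
        return time.strftime("%Y%m%d_%H", time.localtime())
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())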
Example #2
import re
import sqlite3


def qunar_calendar_insert_db(air_port, Sep_p_dict):

    db_name = "qunar_calendar_Seq_ticket.db"
    db_path = "E:/ticket_spider/database/" + db_name
    conn = sqlite3.connect(db_path)
    cu = conn.cursor()

    try:
        # '=' is not a valid character for the table name, so replace it with '_'
        table_name = re.sub(r"=", '_', air_port)
        CREATE_DB = "CREATE TABLE " + table_name + "(crawl_time CHAR(50), " \
                    "Sep01 INTEGER, Sep02 INTEGER, Sep03 INTEGER, " \
                    "Sep04 INTEGER, Sep05 INTEGER, Sep06 INTEGER, " \
                    "Sep07 INTEGER, Sep08 INTEGER, Sep09 INTEGER, " \
                    "Sep10 INTEGER, Sep11 INTEGER, Sep12 INTEGER, " \
                    "Sep13 INTEGER, Sep14 INTEGER, Sep15 INTEGER, " \
                    "Sep16 INTEGER, Sep17 INTEGER, Sep18 INTEGER, " \
                    "Sep19 INTEGER, Sep20 INTEGER, Sep21 INTEGER, " \
                    "Sep22 INTEGER, Sep23 INTEGER, Sep24 INTEGER, " \
                    "Sep25 INTEGER, Sep26 INTEGER, Sep27 INTEGER, " \
                    "Sep28 INTEGER, Sep29 INTEGER, Sep30 INTEGER)"
        cu.execute(CREATE_DB)
        spider_log(air_port + ": db created, table name: " + table_name)
    except sqlite3.OperationalError:
        # table already exists
        pass

    crawl_time = current_time()

    try:
        # build the INSERT with placeholders: one row per crawl, holding the
        # crawl time and the 30 September prices
        columns = ", ".join("Sep%02d" % day for day in range(1, 31))
        placeholders = ", ".join(["?"] * 31)
        INSERT_DB = ("INSERT INTO " + table_name + " (crawl_time, " + columns +
                     ") VALUES (" + placeholders + ")")
        values = [crawl_time] + [Sep_p_dict[str(day)] for day in range(1, 31)]
        cu.execute(INSERT_DB, values)
        spider_log(air_port + ": data inserted.")
    except Exception:
        spider_log(air_port + ": data insert error.")

    cu.close()
    conn.commit()
    conn.close()
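The table created above holds one row per crawl, with each September day in its own column. A small sketch of reading the stored prices back, assuming the same database path and a table named 成都_洛杉矶 as produced by the Qunar parse step (the route is only an illustration):
import sqlite3

db_path = "E:/ticket_spider/database/qunar_calendar_Seq_ticket.db"
table_name = "成都_洛杉矶"  # assumed example route; use whatever tables exist locally

conn = sqlite3.connect(db_path)
cu = conn.cursor()
# each row is (crawl_time, Sep01, ..., Sep30)
for row in cu.execute("SELECT crawl_time, Sep01, Sep15, Sep30 FROM " + table_name):
    print(row)
conn.close()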
Example #3
import re

from bs4 import BeautifulSoup


def ctrip_calendar_parse_data(air_port, file_path):
    p_dict = {}

    # read the saved page
    with open(file_path, 'r', encoding='utf-8') as f:
        page = f.read()

    # parse the page
    soup = BeautifulSoup(page, 'lxml')
    try:
        price_list = soup.table.tbody.find_all(class_="price")
        price_num = len(price_list)  # this variable could be dropped

        for i in range(0, price_num):
            ticket_price = re.findall(r"</dfn>(.+?)</div>",
                                      str(price_list[i]))
            date = i + 1
            p_dict[str(date)] = ticket_price[0]

        ctrip_calendar_insert_db(air_port, p_dict)
    except Exception:
        spider_log("price_list not found.")
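The Ctrip price cells are extracted with a regular expression instead of further BeautifulSoup navigation. A quick illustration on a made-up calendar cell (the HTML fragment is an assumption about the page structure; only the pattern comes from the code above):
import re

# hypothetical markup for one cell of the low-price calendar
sample_cell = '<td class="price"><div><dfn>¥</dfn>3520</div></td>'

# same pattern as in ctrip_calendar_parse_data: the text between </dfn> and </div>
print(re.findall(r"</dfn>(.+?)</div>", sample_cell))  # ['3520']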
Example #4
import re
import time

from selenium import webdriver


def qunar_calendar_spider(air_port_list):

    spider_log("qunar_spider start, webdriver: Firefox")

    driver = webdriver.Firefox()

    time.sleep(1)

    main_url = "https://flight.qunar.com/site/oneway_list_inter.htm"
    depart_time = "searchDepartureTime=2017-09-01"
    passager_info = "adultNum=1&childNum=0"

    for air_port_item in air_port_list:

        spider_log("crawl air_line: " + air_port_item)

        search_url = main_url + "?" + air_port_item + "?" + depart_time + "?" + passager_info
        spider_log("crawl url: " + search_url)

        driver.get(search_url)
        # click the low-price calendar to get prices for the current month (August)
        driver.find_element_by_xpath(".//*[@id='dateBar']/div[2]/div").click()
        time.sleep(1)

        # originally planned to fetch 90 days of prices; to keep things simple,
        # only September is fetched
        month_page = driver.page_source
        # write the page to a file
        crawl_time = current_time('file_name_hour')
        # name_structure: site + time_stamp + line_info
        file_name = "qunar_" + crawl_time + "_" + air_port_item + ".txt"
        file_path = "E:/ticket_spider/raw_data/" + file_name
        # specify the encoding when opening the file
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(month_page)

        spider_log(air_port_item + ": downloaded, file_path: " + file_path)
        time.sleep(1)

        # turn the route parameters into a readable form (used to name the db table)
        location = re.findall(r'[\u4e00-\u9fa5]+', air_port_item)
        air_port = location[0] + "_" + location[1]

        qunar_calendar_parse_data(air_port, file_path)
        time.sleep(1)

    driver.quit()
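For reference, the air_port value passed to qunar_calendar_parse_data comes from keeping only the Chinese characters of the route string. The same step, run on one of the routes listed in Example #7:
import re

air_port_item = 'searchDepartureAirport=成都&searchArrivalAirport=洛杉矶'

# keep only the runs of Chinese characters: ['成都', '洛杉矶']
location = re.findall(r'[\u4e00-\u9fa5]+', air_port_item)
air_port = location[0] + "_" + location[1]
print(air_port)  # 成都_洛杉矶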
Example #5
import time

from selenium import webdriver


def ctrip_calendar_spider(air_port_list):

    spider_log("ctrip_spider start, webdriver: Firefox")
    driver = webdriver.Firefox()

    time.sleep(1)

    main_url = "http://flights.ctrip.com"
    air_line = "international"
    depart_time = "2017-09-01"
    position = "y_s"

    for air_port_item in air_port_list:

        spider_log("crawl air_line: " + air_port_item)

        search_url = main_url + "/" + air_line + "/" + air_port_item + "?" + depart_time + "&" + position
        spider_log("crawl url: " + search_url)

        driver.get(search_url)
        time.sleep(1)
        # click the low-price calendar to get prices for the current month (September)
        driver.find_element_by_xpath(
            ".//*[@id='calendar_tab']/div[4]/a").click()
        time.sleep(1)
        month_page = driver.page_source

        # write the page to a file before parsing; the file step could also be skipped
        crawl_time = current_time('file_name_hour')
        # name_structure: site + time_stamp + line_info
        file_name = "ctrip_" + crawl_time + "_" + air_port_item + ".txt"
        file_path = "E:/ticket_spider/raw_data/" + file_name
        # specify the encoding when opening the file
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(month_page)

        spider_log(air_port_item + ": downloaded, file_path: " + file_path)
        time.sleep(1)

        ctrip_calendar_parse_data(air_port_item, file_path)
        time.sleep(1)

    driver.quit()
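Both spiders use the Selenium 3 element-lookup API; find_element_by_xpath is no longer available in current Selenium 4 releases, so running these examples against a recent Selenium would need the By-based form, roughly:
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
driver.get("https://flight.qunar.com/site/oneway_list_inter.htm")  # any of the search URLs built above
# Selenium 4 equivalent of driver.find_element_by_xpath(...).click()
driver.find_element(By.XPATH, ".//*[@id='dateBar']/div[2]/div").click()
driver.quit()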
Example #6
from module.send_email import send_log_email
from module.spider_log import spider_log
from spider.start_spider import start_spider

'''
function: for running locally; set to run once per hour, crawling 10 routes per run,
          5 runs per day, for one week. Crawling starts at 1 pm and runs until 5 pm.
'''
if __name__ == '__main__':

    spider_log("ticket spider local mode start.")

    try:
        start_spider()
        spider_log("---")
        email_content = "spider run succeeded, ctrip price calendar log."
    except Exception:
        email_content = "spider run error."

    log_path = "E:/ticket_spider/log/ticket_calendar_log.txt"
    #send_log_email(email_content, log_path)
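The docstring above describes five hourly runs between 1 pm and 5 pm, but the script itself calls start_spider only once, so the repetition presumably comes from outside the script. A minimal in-script alternative, assuming a plain sleep loop is acceptable:
import time

from module.spider_log import spider_log
from spider.start_spider import start_spider

RUNS_PER_DAY = 5            # 1 pm through 5 pm, as described in the docstring
INTERVAL_SECONDS = 60 * 60  # one hour between runs

for run in range(1, RUNS_PER_DAY + 1):
    spider_log("scheduled run %d of %d" % (run, RUNS_PER_DAY))
    try:
        start_spider()
    except Exception:
        spider_log("spider run error.")
    if run < RUNS_PER_DAY:
        time.sleep(INTERVAL_SECONDS)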
Example #7
import time


def start_spider():
    ''' --- module split --- '''
    # add the routes you want to crawl to this list
    ctrip_air_port_list = [
        'chengdu-losangeles-ctu-lax',  # Los Angeles
        'chengdu-newyork-ctu-nyc',  # New York
        'chengdu-tokyo-ctu-tyo',  # Tokyo
        'chengdu-seoul-ctu-sel',  # Seoul
        'chengdu-london-ctu-lon',  # London
        'chengdu-sydney-ctu-syd',  # Sydney
        'chengdu-paris-ctu-par',  # Paris
        'chengdu-moscow-ctu-mow'  # Moscow
    ]

    spider_log("ctrip_spider, crawl air_line:")
    spider_log(str(ctrip_air_port_list))
    ctrip_calendar_spider(ctrip_air_port_list)
    spider_log("ctrip_spider stop.")
    time.sleep(1)
    ''' --- module split --- '''
    # Qunar does not render well under PhantomJS and Firefox runs are unstable,
    # so Qunar data collection is left out for now
    # add the routes you want to crawl to this list

    qunar_air_port_list = [
        'searchDepartureAirport=成都&searchArrivalAirport=洛杉矶',
        'searchDepartureAirport=成都&searchArrivalAirport=纽约',
        'searchDepartureAirport=成都&searchArrivalAirport=东京',
        'searchDepartureAirport=成都&searchArrivalAirport=首尔',
        'searchDepartureAirport=成都&searchArrivalAirport=伦敦',
        'searchDepartureAirport=成都&searchArrivalAirport=悉尼',
        'searchDepartureAirport=成都&searchArrivalAirport=巴黎',
        'searchDepartureAirport=成都&searchArrivalAirport=莫斯科'
    ]

    spider_log("qunar_spider, crawl air_line:")
    spider_log(str(qunar_air_port_list))
    qunar_calendar_spider(qunar_air_port_list)
    spider_log("qunar_spider stop.")
    time.sleep(1)
    ''' --- module split --- '''
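For reference, an entry from ctrip_air_port_list combines with the constants in Example #5 into a search URL as follows (same string arithmetic as in ctrip_calendar_spider):
main_url = "http://flights.ctrip.com"
air_line = "international"
depart_time = "2017-09-01"
position = "y_s"
air_port_item = 'chengdu-losangeles-ctu-lax'

search_url = main_url + "/" + air_line + "/" + air_port_item + "?" + depart_time + "&" + position
print(search_url)
# http://flights.ctrip.com/international/chengdu-losangeles-ctu-lax?2017-09-01&y_s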
Example #8
import time

from module.send_email import send_email
from module.spider_log import spider_log
from spider.start_spider import start_spider

'''
function: for running on a server; set to run once every 2 hours, crawling 10 routes
          per run, 10 runs per day, for one week.
'''
if __name__ == '__main__':

    spider_log("ticket spider server mode start.")

    count = 1
    # first run
    spider_log("running count: " + str(count))
    start_spider()
    spider_log("---")

    while True:
        # run again after the interval (intended to be every 2 h; the 5 s sleep looks like a test value)
        time.sleep(5)
        start_spider()
        spider_log("---")
        count += 1
        spider_log("running count: " + str(count))

        if count == 3:
            break

    email_content = "ctrip price calendar data."
Example #9
import os
import smtplib
import time
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText


def send_log_email(email_content=' ',
                   log_path='',
                   send_to_addr='*****@*****.**'):
    '''
    name: email sending module
    func: sends an email to the given address; a default body, default recipient and
          default attachment are preset. Receiving log info is not handled yet (local debugging).
    send_log_email(email_content='', log_path='', send_to_addr='')
    '''

    # email address and password
    from_addr = "*****@*****.**"
    password = "******"
    # recipient address
    to_addr = send_to_addr  # default recipient; a copy is also sent to this address

    # build a mail container that supports attachments
    msg = MIMEMultipart()

    # build a timestamp and add it to the subject line
    t = time.localtime()
    time_stamp = str(t.tm_mon) + "." + str(t.tm_mday) + " " + str(
        t.tm_hour) + ":" + str(t.tm_min) + ":" + str(t.tm_sec)
    subject = '[' + time_stamp + '] ' + 'Ticket Crawler Run Report'

    # fill in the header fields
    msg["Subject"] = subject  # mail subject
    msg["From"] = from_addr
    msg["To"] = to_addr

    # fill in the mail body
    # default body text
    header_text = '''
    This email is generated and sent automatically by the ticket crawler; the crawl results follow:
    '''
    # email_content is expected to give: spider name, run time, number of items crawled
    mime_text = MIMEText(header_text + email_content, 'plain',
                         'utf-8')  # plain-text part of the message
    msg.attach(mime_text)

    # attach the log file
    with open(log_path, 'rb') as f:
        # set the attachment's MIME type and file name
        file_name = os.path.basename(log_path)
        mime = MIMEBase('log', 'txt', filename=file_name)
        # add the required headers
        mime.add_header('Content-Disposition',
                        'attachment',
                        filename=file_name)
        mime.add_header('Content-ID', '<0>')
        mime.add_header('X-Attachment-Id', '0')
        # read the attachment contents
        mime.set_payload(f.read())
        # encode with Base64
        encoders.encode_base64(mime)
        # add to the MIMEMultipart container
        msg.attach(mime)

    # send the email
    try:
        server = smtplib.SMTP_SSL("smtp.qq.com", 465)
        # server.set_debuglevel(1)  # print the full SMTP conversation
        server.login(from_addr, password)
        server.sendmail(from_addr, to_addr, msg.as_string())
        server.quit()
        spider_log("mail sent to: " + to_addr)
    except Exception:
        spider_log("mail send failed.")
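A usage sketch, mirroring the commented-out call in Example #6 (the recipient placeholder stays masked as in the original):
from module.send_email import send_log_email

email_content = "spider run succeeded, ctrip price calendar log."
log_path = "E:/ticket_spider/log/ticket_calendar_log.txt"

# sends the run report with the log file attached; the default (masked) recipient is used
send_log_email(email_content, log_path)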