# Crawl in shuffled order: build the full pool of (city1, city2, date)
# request instructions up front, shuffle it, then issue requests one by one
# so the access pattern looks less regular (regular patterns get banned).
import copy
import random
import time

from mainspider_post import getdata, message

city = ['北京', '上海', '广州']

order = {}
orderlist = []  # the full pool of request instructions
rlist = []      # airline names seen so far (deduplicated)

for i in city:
    for j in city:
        if i != j:  # was `i is not j`: identity comparison on strings is unreliable
            # NOTE(review): original upper bound was 202007015 (9 digits, a
            # typo that would loop ~180M times); 20200715 matches the digits
            # as typed -- confirm the intended end date.
            for date in range(20200706, 20200715):
                order['city1'] = i
                order['city2'] = j
                order['date'] = str(date)
                # snapshot the dict, since `order` is mutated on every pass
                orderlist.append(copy.copy(order))

random.shuffle(orderlist)  # randomize the request order

for od in orderlist:
    print(od.values())
    data = list(message(getdata(od['city1'], od['city2'], str(od['date']))))
    for flight in data:
        # NOTE(review): key is 'Airline' here but 'AirLine' in the sibling
        # discovery script -- confirm which spelling message() actually emits.
        if flight['Airline'] not in rlist:
            rlist.append(flight['Airline'])  # record newly seen airline
    # long, random sleep: too-regular request timing gets the IP banned
    time.sleep(random.random() * 30)

print(rlist)
# Expected output (from a past run):
# ['南方航空', '东方航空', '上海航空', '海南航空', '中国国航', '吉祥航空',
#  '金鹏航空', '春秋航空', '厦门航空', '中国联合航空', '天津航空']
# Helper script: determine which airlines appear on any route between the
# three cities over the crawl's date window, to fix the crawler's scope.
import random
import time

from mainspider_post import getdata, message

city = ['北京', '上海', '广州']
airline = []  # deduplicated airline names
data = []     # every flight record collected across all routes/dates

for i in city:
    for j in city:
        if i != j:  # was `i is not j`: identity comparison on strings is unreliable
            for date in range(20200706, 20200720):
                print(i, j)
                data = data + list(message(getdata(i, j, str(date))))
                # long, random sleep: too-regular request timing gets banned
                time.sleep(random.random() * 30)

for rec in data:
    # NOTE(review): key is 'AirLine' here but 'Airline' in the sibling
    # shuffled-order script -- confirm which spelling message() emits.
    if rec['AirLine'] not in airline:
        airline.append(rec['AirLine'])

print(airline)
import mainspider_post
from crawler_donghang import donghangcrawler
from crawler_jixiang import jixiangcrawler
from crawler_lianhang import lianhangcrawler
from multiprocessing import Process

if __name__ == "__main__":
    # Entry point: ask the user for a route and date, find out which
    # airlines fly it, then launch one crawler process per airline found.
    dcity = input("请选择出发城市(北京、上海、广州): ")
    acity = input("请选择到达城市(北京、上海、广州): ")
    date = input("请输入日期('20200706'): ")

    # Deduplicated airline names for the requested route/date, used to decide
    # which Ctrip ticket crawlers to run.
    # NOTE(review): message() is called with 4 args here but with 1 arg in
    # the sibling scripts -- confirm its actual signature.
    airline = mainspider_post.message(
        mainspider_post.getdata(dcity, acity, date), dcity, acity, date)
    print(airline)

    # Define one crawler process per supported airline.
    csvlist = []  # output CSV names for the crawlers we actually start
    donghangPro = Process(target=donghangcrawler, args=(dcity, acity, date))
    jixiangPro = Process(target=jixiangcrawler, args=(dcity, acity, date))
    lianhangPro = Process(target=lianhangcrawler, args=(dcity, acity, date))

    started = []  # processes we launched, joined after the launch loop
    for i in airline:
        if i == '东方航空':
            print('定位到东方航空')
            donghangPro.start()
            started.append(donghangPro)
            # was csvlist.append(["donghang.csv"]): a nested list, unlike the
            # plain strings appended for the other airlines
            csvlist.append("donghang.csv")
        elif i == '吉祥航空':
            print('定位到吉祥航空')
            jixiangPro.start()
            started.append(jixiangPro)
            csvlist.append("jixiang.csv")
            print("吉祥航空爬虫开始运行")
        elif i == '中国联合航空':
            # NOTE(review): source was truncated here; this branch is
            # completed from the parallel pattern of the other branches.
            lianhangPro.start()
            started.append(lianhangPro)
            csvlist.append("lianhang.csv")
            print("中国联合航空爬虫开始运行")

    # was donghangPro.join() immediately after start(), which blocked the
    # loop and serialized the crawlers; join everything after launching all
    for p in started:
        p.join()
import mainspider_post
from crawler_donghang import donghangcrawler
from crawler_jixiang import jixiangcrawler
from crawler_lianhang import lianhangcrawler
from multiprocessing import Process

# Ask the user for a route and date, look up which airlines fly it, then run
# one crawler process per airline found and collect their output CSV names.
# The __main__ guard is required with multiprocessing (child processes
# re-import this module on spawn platforms); the sibling launcher has it too.
if __name__ == "__main__":
    dcity = input("请选择出发城市(北京、上海、广州): ")
    acity = input("请选择到达城市(北京、上海、广州): ")
    date = input("请输入日期('20200706'): ")

    # Deduplicated airline names for the requested route/date.
    airline = mainspider_post.message(
        mainspider_post.getdata(dcity, acity, date), dcity, acity, date)
    print(airline)

    donghangPro = Process(target=donghangcrawler, args=(dcity, acity, date))
    jixiangPro = Process(target=jixiangcrawler, args=(dcity, acity, date))
    lianhangPro = Process(target=lianhangcrawler, args=(dcity, acity, date))

    csvlist = []  # output CSV names for the crawlers we actually start
    for i in airline:
        # was `i is '东方航空'` etc.: identity comparison against string
        # literals is effectively never true for strings parsed out of a
        # response, so no crawler would ever start -- compare with ==
        if i == '东方航空':
            donghangPro.start()
            # was csvlist.append(["donghang.csv"]): a nested list, unlike the
            # plain strings appended for the other airlines
            csvlist.append("donghang.csv")
        elif i == '吉祥航空':
            jixiangPro.start()
            csvlist.append("jixiang.csv")
            print("吉祥航空爬虫开始运行")
        elif i == '中国联合航空':
            lianhangPro.start()
            # was "lianghang.csv": typo, inconsistent with crawler_lianhang
            csvlist.append("lianhang.csv")
            print("中国联合航空爬虫开始运行")

    # Main process blocks until every started crawler finishes.
    # NOTE(review): source was truncated mid-loop here; the loop body is
    # completed from the launch loop's structure above.
    for i in airline:
        if i == '东方航空':
            donghangPro.join()
        elif i == '吉祥航空':
            jixiangPro.join()
        elif i == '中国联合航空':
            lianhangPro.join()