Ejemplo n.º 1
0
 def parse_line(self, response):
     """Parse the schedule result table and yield one LineItem per row
     that still has tickets (status "有票").

     Expects response.meta to carry: city_name, station_name,
     s_station_name, end (dict with city_name/city_id) and date.
     """
     city_name = response.meta['city_name']
     station_name = response.meta['station_name']
     s_station_name = response.meta['s_station_name']
     end = response.meta['end']
     sdate = response.meta['date']
     self.mark_done(station_name, end['city_name'], sdate)
     soup = bs(response.body, 'lxml')
     scl_list = soup.find('table',
                          attrs={'id': 'ContentPlaceHolder1_GridViewbc'})
     if not scl_list:
         # No schedule table on the page -- nothing to emit.
         return
     # The original re-checked "if scl_list:" right after returning on
     # the falsy case; the second check was dead and has been dropped.
     scl_list = scl_list.find_all('tr', attrs={'style': True})
     for x in scl_list[1:]:  # skip the header row
         y = x.find_all('td')
         ticket_status = y[3].get_text().strip()
         s_d_city_name = end['city_name']
         # Strip latin letters appended to the destination city name.
         d_city_name = re.sub("[A-Za-z]", "", s_d_city_name)
         if ticket_status == u"有票":
             drv_date = sdate
             bus_num = y[1].get_text().strip()
             drv_time = y[2].get_text().strip()
             distance = y[4].get_text().strip()
             # get_text() already returns unicode; the former
             # .decode('utf-8') implicitly encoded with ASCII first and
             # raised UnicodeEncodeError on any Chinese text.
             vehicle_type = y[5].get_text().strip()
             full_price = y[6].get_text().strip()
             s_sta_name = y[7].get_text().strip()
             attrs = dict(
                 s_province='山东',
                 s_city_id="",
                 s_city_name=city_name,
                 s_city_code=get_pinyin_first_litter(unicode(city_name)),
                 s_sta_name=station_name,
                 s_sta_id='',
                 d_city_name=d_city_name,
                 d_city_id=end['city_id'],
                 d_city_code=get_pinyin_first_litter(unicode(d_city_name)),
                 d_sta_id='',
                 d_sta_name=s_sta_name,
                 drv_date=drv_date,
                 drv_time=drv_time,
                 drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time),
                                           "%Y-%m-%d %H:%M"),
                 distance=distance,
                 vehicle_type=vehicle_type,
                 seat_type="",
                 bus_num=bus_num,
                 full_price=float(full_price),
                 half_price=float(full_price) / 2,
                 fee=0,
                 crawl_datetime=dte.now(),
                 extra_info={
                     's_station_name': s_station_name,
                     's_d_city_name': s_d_city_name
                 },
                 left_tickets=45,
                 crawl_source="qdky",
                 shift_id="",
             )
             yield LineItem(**attrs)
Ejemplo n.º 2
0
 def parse_line(self, response):
     """Parse the Hohhot (nmghy) schedule table and yield one LineItem
     per row.

     Drops the unused start_station/end_station locals of the original
     (end_station was a copy-paste of td[2], the start column, and
     neither was ever read).
     """
     start = response.meta["start"]
     end = response.meta["end"]
     end_code = response.meta["end_code"]
     sdate = response.meta["date"]
     self.mark_done(start, end, sdate)
     content = response.body
     if not isinstance(content, unicode):
         content = content.decode('utf-8')
     sel = etree.HTML(content)
     scheduleList = sel.xpath(
         '//div[@id="visitorDataTable"]/table/tbody/tr')
     if not scheduleList:
         return
     for i in scheduleList[1:]:  # first tr is the header row
         bus_num = i.xpath('td[1]/text()')[0]
         drv_time = i.xpath('td[5]/span[@class="lv_time"]/text()')[0]
         price = i.xpath('td[8]/span[@class="tk_price"]/text()')[0]
         left_tickets = i.xpath('td[9]/span/text()')[0]
         # The onclick holds a booking call; keep its second argument
         # (minus surrounding quote/paren characters) as the payload.
         postdata = i.xpath('td[10]/a/@onclick')[0].split(',')[1][1:-3]
         attrs = dict(
             s_province='内蒙古',
             s_city_name=u"呼和浩特",
             s_city_id='',
             s_city_code=get_pinyin_first_litter(u"呼和浩特"),
             s_sta_name=start,
             s_sta_id='',
             d_city_name=end,
             d_city_code=get_pinyin_first_litter(end),
             d_city_id='',
             d_sta_name=end,
             d_sta_id=end_code,
             drv_date=sdate,
             drv_time=drv_time,
             drv_datetime=dte.strptime("%s %s" % (sdate, drv_time),
                                       "%Y-%m-%d %H:%M"),
             distance="0",
             vehicle_type="",
             seat_type="",
             bus_num=bus_num,
             full_price=float(price),
             half_price=float(price) / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={"postdata": postdata},
             left_tickets=int(left_tickets),
             crawl_source="nmghy",
             shift_id="",
         )
         yield LineItem(**attrs)
Ejemplo n.º 3
0
 def parse_line(self, response):
     """Parse the schedule JSON embedded in the page (hidden input
     "scheduleInfoJson") and yield a LineItem for every sellable,
     reasonably priced schedule."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     body = response.body
     if not isinstance(body, unicode):
         body = body.decode('utf-8')
     self.mark_done(start, end[0], sdate)
     page = etree.HTML(body)
     raw = page.xpath('//input[@id="scheduleInfoJson"]/@value')
     if not raw:
         return
     for sch in json.loads(raw[0]):
         if not isinstance(sch, dict):
             continue
         if int(sch['seatLast']) == 0:  # sold out
             continue
         if float(sch["price"]) < 5:  # implausibly cheap, skip
             continue
         yield LineItem(
             s_province='辽宁',
             s_city_name=start,
             s_city_id='',
             s_city_code=get_pinyin_first_litter(start),
             s_sta_name=sch['fromStation'],
             s_sta_id='',
             d_city_name=end[0],
             d_city_code=get_pinyin_first_litter(end[0]),
             d_city_id='',
             d_sta_name=sch['toStation'],
             d_sta_id='',
             drv_date=sdate,
             drv_time=sch["driveTime"],
             drv_datetime=dte.strptime(
                 "%s %s" % (sdate, sch["driveTime"]), "%Y-%m-%d %H:%M"),
             distance="0",
             vehicle_type="",
             seat_type="",
             bus_num=sch['trainNumber'],
             full_price=float(sch["price"]),
             half_price=float(sch["price"]) / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={'lineNo': sch['lineNo']},
             left_tickets=int(sch["seatLast"]),
             crawl_source="lnky",
             shift_id='',
         )
Ejemplo n.º 4
0
 def get_dest_list_from_web(self, province, city, **kwargs):
     lst = []
     for l in open("guangzhou.dest"):
         name = unicode(l.strip())
         print name,
         lst.append({"name": name, "code": get_pinyin_first_litter(name), "dest_id": ""})
     return lst
Ejemplo n.º 5
0
 def parse_line(self, response):
     """Parse the JSON schedule listing and yield a LineItem for each
     departure from the 八王坟 station that still has seats."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     payload = json.loads(response.body)
     self.mark_done(start["city_name"], end["stopName"], sdate)
     for sch in payload['detail']:
         if int(sch['seatAmount']) == 0:
             continue
         if sch['carrStaName'] != u"八王坟":
             continue
         yield LineItem(
             s_province='北京',
             s_city_name="北京",
             s_city_id='',
             s_city_code=get_pinyin_first_litter(u"北京"),
             s_sta_name=sch["carrStaName"],
             s_sta_id=sch["carryStaId"],
             d_city_name=end['stopName'],
             d_city_code=get_pinyin_first_litter(end['stopName']),
             d_city_id=end['stopId'],
             d_sta_name=sch["endstaName"],
             d_sta_id='',
             drv_date=sdate,
             drv_time=sch['drvTime'],
             drv_datetime=dte.strptime("%s %s" % (sdate, sch['drvTime']),
                                       "%Y-%m-%d %H:%M"),
             distance="0",
             vehicle_type="",
             seat_type="",
             bus_num=sch['scheduleId'],
             full_price=float(sch['fullPrice']),
             half_price=float(sch['fullPrice']) / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={},
             left_tickets=int(sch['seatAmount']),
             crawl_source="e8s",
             shift_id='',
         )
Ejemplo n.º 6
0
 def parse_line(self, response):
     """Parse the bus365 schedule JSON and yield sellable schedules
     priced at 11 yuan or more as LineItem objects."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     data = json.loads(response.body)
     self.mark_done(start["findname"], end['city_name'], sdate)
     for sch in data['schedules']:
         if int(sch['iscansell']) != 1 or float(sch['fullprice']) < 11:
             continue
         depart = sch['departtime'][0:-3]  # drop the trailing ":ss"
         yield LineItem(
             s_province=start['province'],
             s_city_name=start['findname'],
             s_city_id=start['id'],
             s_city_code=get_pinyin_first_litter(start['findname']),
             s_sta_name=sch["busshortname"],
             s_sta_id=sch["stationorgid"],
             d_city_name=sch["stationname"],
             d_city_code=get_pinyin_first_litter(sch["stationname"]),
             d_city_id=sch['stationid'],
             d_sta_name=sch["stationname"],
             d_sta_id='',
             drv_date=sdate,
             drv_time=depart,
             drv_datetime=dte.strptime("%s %s" % (sdate, depart),
                                       "%Y-%m-%d %H:%M"),
             distance=sch["rundistance"],
             vehicle_type="",
             seat_type=sch['seattype'],
             bus_num=sch['schedulecode'],
             full_price=float(sch['fullprice']),
             half_price=float(sch['fullprice']) / 2,
             fee=3,
             crawl_datetime=dte.now(),
             extra_info={'start_info': start},
             left_tickets=int(sch['residualnumber']),
             crawl_source="bus365",
             shift_id=sch['id'],
         )
Ejemplo n.º 7
0
 def parse_start_city(self, response):
     """Extract the start-area list embedded in the page script and
     schedule a line query for every (area, destination, date) combo.

     The _stationList blob in the page is JavaScript, not JSON: its
     keys (Pros, Areas, Stations) are unquoted, so they are quoted via
     string replacement before json.loads can parse the blob.
     """
     res = json.loads(
         re.findall(r"var _stationList=(\S+)</script>",
                    response.body)[0].replace("Pros", '"Pros"').replace(
                        "Areas", '"Areas"').replace("Stations",
                                                    '"Stations"'))
     line_url = "http://www.96096kp.com/UserData/MQCenterSale.aspx"
     # Site display name -> canonical city name.
     trans = {u"重庆主城": "重庆"}
     for d in res["Areas"][0]["AreaData"]:
         start = {
             "province": "重庆",
             "s_city_id": d["ID"],
             "s_city_name": d["CityDist"],
             "s_city_code": get_pinyin_first_litter(d["CityDist"]),
         }
         if not self.is_need_crawl(city=start["s_city_name"]):
             continue
         for s in self.get_dest_list(province="重庆",
                                     city=trans.get(start["s_city_name"],
                                                    start["s_city_name"])):
             name, code = s["name"], s["code"]
             end = {"d_city_name": name, "d_city_code": code}
             today = datetime.date.today()
             # Crawl from start_day() up to 7 days ahead.
             for i in range(self.start_day(), 8):
                 sdate = str(today + datetime.timedelta(days=i))
                 if self.has_done(start["s_city_name"], end["d_city_name"],
                                  sdate):
                     # self.logger.info("ignore %s ==> %s %s" % (start["s_city_name"], end["d_city_name"], sdate))
                     continue
                 # Form fields expected by MQCenterSale.aspx.
                 params = {
                     "StartStation": start["s_city_name"],
                     "WaitStationCode": "",
                     "OpStation": -1,
                     "OpAddress": -1,
                     "SchDate": sdate,
                     "DstNode": name,
                     "SeatType": "",
                     "SchTime": "",
                     "OperMode": "",
                     "SchCode": "",
                     "txtImgCode": "",
                     "cmd": "MQCenterGetClass",
                     "isCheck": "false",
                 }
                 yield scrapy.Request(line_url,
                                      method="POST",
                                      body=urllib.urlencode(params),
                                      callback=self.parse_line,
                                      meta={
                                          "start": start,
                                          "end": end,
                                          "sdate": sdate
                                      })
Ejemplo n.º 8
0
 def parse_line(self, response):
     """Parse the flight-list JSON and yield every schedule as a
     LineItem."""
     meta = response.meta
     start, end, sdate = meta["start"], meta["end"], meta["sdate"]
     self.logger.info("finish %s ==> %s" % (start, end))
     self.mark_done(start, end, sdate)
     data = json.loads(response.body)
     for flight in data['flightList']:
         yield LineItem(
             s_province='上海',
             s_city_name=start,
             s_city_id='',
             s_city_code=get_pinyin_first_litter(unicode(start)),
             s_sta_name=flight['stationName'],
             s_sta_id=flight['stationId'],
             d_city_name=flight['arriveRegionName'],
             d_city_code=get_pinyin_first_litter(flight['arriveRegionName']),
             d_city_id=flight['arriveRegionId'],
             d_sta_name=flight['arriveRegionName'],
             d_sta_id='',
             drv_date=sdate,
             drv_time=flight['flightTime'],
             drv_datetime=dte.strptime("%s %s" % (sdate, flight['flightTime']),
                                       "%Y-%m-%d %H:%M"),
             distance='0',
             vehicle_type="",
             seat_type='',
             bus_num=flight['flightNo'],
             full_price=float(flight['price']),
             half_price=float(flight['halfPrice']),
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={"raw_info": flight},
             left_tickets=int(flight['lastCount']),
             crawl_source="shkyzz",
             shift_id='',
         )
Ejemplo n.º 9
0
 def parse_start_city(self, response):
     """Parse the city list and schedule a busList query for every
     (start city, destination station, date) combination.

     Aborts when the API returnNo is not "0000".
     """
     res = json.loads(response.body)
     if res["returnNo"] != "0000":
         self.logger.error("parse_start_city: Unexpected return, %s", res)
         return
     line_url = "http://s4mdata.wanmeibus.com:80/app/v4/ticket/busList.htm"
     for info in res["content"]["cityList"]:
         name = info["cityName"]
         if name not in CITY_TO_PROVINCE:
             continue
         province = CITY_TO_PROVINCE[name]
         if not self.is_need_crawl(city=name, province=province):
             continue
         start = {
             "province": province,
             "city_name": info["cityName"],
             "city_code": info["allSpell"],
             "city_id": info["cityId"],
         }
         # The destination loop previously rebound "info", shadowing
         # the city dict of the outer loop; a distinct name avoids
         # that trap.
         for dest in self.get_dest_list(province, name):
             end = {
                 "city_name": dest["stationName"],
                 "city_code": get_pinyin_first_litter(dest["stationName"]),
                 "city_id": dest["stationId"],
             }
             self.logger.info("start %s ==> %s" % (start["city_name"], end["city_name"]))
             today = datetime.date.today()
             # Crawl from start_day() up to 7 days ahead.
             for i in range(self.start_day(), 8):
                 sdate = str(today + datetime.timedelta(days=i))
                 if self.has_done(start["city_name"], end["city_name"], sdate):
                     continue
                 content = {
                     "pageSize": 1025,
                     "beginCityName": start["city_name"],
                     "currentPage": 1,
                     "endCityName": end["city_name"],
                     "leaveDate": sdate,
                     "beginCityId": start["city_id"],
                     "endCityId": end["city_id"],
                 }
                 fd = self.post_data_templ(content)
                 yield scrapy.Request(line_url,
                                      method="POST",
                                      body=json.dumps(fd),
                                      callback=self.parse_line,
                                      meta={"start": start, "end": end, "date": sdate})
Ejemplo n.º 10
0
 def parse_line(self, response):
     """Parse the Shenzhen schedule JSON and yield a LineItem for every
     schedule whose status ("SchStat") is '1'."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["date"]
     self.logger.info("finish %s ==> %s" % (start, end["city_name"]))
     self.mark_done(start, end['city_name'], sdate)
     payload = json.loads(trans_js_str(response.body))
     for sch in payload["data"]:
         if sch['SchStat'] != '1':
             continue
         yield LineItem(
             s_province=u'广东',
             s_city_name=u"深圳",
             s_city_id='',
             s_city_code=get_pinyin_first_litter(u"深圳"),
             s_sta_name=sch["SchWaitStName"],
             s_sta_id=sch["SchStationCode"],
             d_city_name=end['city_name'],
             d_city_code=end['city_code'],
             d_city_id=sch['SchDstNode'],
             d_sta_name=sch["SchNodeName"],
             d_sta_id=sch["SchNodeCode"],
             drv_date=sch["SchDate"],
             drv_time=sch["orderbytime"],
             drv_datetime=dte.strptime(
                 "%s %s" % (sch["SchDate"], sch["orderbytime"]),
                 "%Y-%m-%d %H:%M"),
             distance="0",
             vehicle_type="",
             seat_type="",
             bus_num=sch["SchLocalCode"],
             full_price=float(sch["SchStdPrice"]),
             half_price=float(sch["SchStdPrice"]) / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={"raw_info": sch},
             left_tickets=int(sch["SchTicketCount"]),
             crawl_source="szky",
             shift_id="",
         )
Ejemplo n.º 11
0
 def parse_start_city(self, response):
     """Parse the start-city list (JSON with single quotes) and
     schedule a QuerySchedule request for every city pair and
     departure date."""
     data = json.loads(response.body.replace("\'", "\""))
     today = dte.today()
     line_url = "http://183.6.161.195:9000/api/TicketOrder/QuerySchedule"
     for city in data:
         if city in ["广州"] or not self.is_need_crawl(city=city):
             continue
         start = {"city_name": city, "city_code": get_pinyin_first_litter(city)}
         for dest in self.get_dest_list("广东", city):
             name, code = dest["name"], dest["code"]
             end = {"city_name": name, "city_code": code}
             self.logger.info("start %s ==> %s" % (city, name))
             # Crawl from start_day() up to 7 days ahead.
             for offset in range(self.start_day(), 8):
                 sdate = (today +
                          datetime.timedelta(days=offset)).strftime("%Y%m%d")
                 if self.has_done(city, name, sdate):
                     continue
                 params = {
                     "fromcity": city,
                     "schdate": sdate,
                     "schtimeend": "",
                     "schtimestart": "",
                     "tocity": name
                 }
                 yield scrapy.Request(
                     line_url,
                     method="POST",
                     body=json.dumps(params),
                     callback=self.parse_line,
                     headers={
                         "Content-Type":
                         "application/json; charset=UTF-8"
                     },
                     meta={
                         "start": start,
                         "end": end,
                         "sdate": sdate
                     })
Ejemplo n.º 12
0
    def get_dest_list_from_web(self, province, city):
        """Fetch the reachable-destination list for *city* from the
        bababus API.

        Returns a list of {"name", "code", "dest_id"} dicts.  The
        original issued the identical request 26 times (once per
        letter a-z, but the loop variable was never used), which
        multiplied every destination 26-fold; a single request
        returns the complete list.
        """
        import requests

        dest_url = 'http://s4mdata.bababus.com:80/app/v5/ticket/cityAllList.htm'
        content = {
            "searchType": "0",
            "dataVersion": "",
            "beginCityName": city
        }
        fd = self.post_data_templ(content)
        ua = "Mozilla/5.0 (Linux; U; Android 2.2; fr-lu; HTC Legend Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko)  Version/4.0 Mobile Safari/533.1"
        headers = {"User-Agent": ua}
        r = requests.post(dest_url, data=json.dumps(fd), headers=headers)
        res = r.json()
        dest_list = []
        for d in res["content"]["cityList"]:
            dest_list.append({
                "name": d["cityName"],
                "code": get_pinyin_first_litter(d["cityName"]),
                "dest_id": d["cityId"],
            })
        return dest_list
Ejemplo n.º 13
0
    def parse_line(self, response):
        """Parse the hn96520 result table and yield one LineItem per
        schedule row.

        Rows that fail to parse (missing cells, malformed numbers) are
        logged and skipped instead of being silently swallowed.
        """
        s_city_name = response.meta['city'].decode('utf-8')
        start = response.meta['start'].decode('utf-8')
        end = response.meta['end'].decode('utf-8')
        sdate = response.meta['sdate'].decode('utf-8')
        self.mark_done(start, end, sdate)
        soup = bs(response.body, 'lxml')
        info = soup.find('table', attrs={'class': 'resulttb'}).find_all(
            'tbody', attrs={'class': 'rebody'})
        for x in info:
            try:
                bus_num = x.find(
                    'td', attrs={'align': 'center'}).get_text().strip()
                # Hoisted: the original re-ran x.find_all('td') for
                # every single cell.
                tds = x.find_all('td')
                s_sta_name = tds[1].get_text().split()[0]
                d_city_name = tds[1].get_text().split()[1]
                drv_date = tds[2].get_text().strip()
                drv_time = tds[3].get_text().strip()
                # tds[4] holds the destination station name (unused).
                distance = tds[5].get_text().strip()
                vehicle_type = tds[6].get_text().strip()
                full_price = tds[7].get_text().strip()
                left_tickets = int(tds[8].get_text().strip())
                # The booking link's query string carries the ids kept
                # as extra_info (e.g. 'g' = start station id).
                y = tds[9].a.get('href').split('?')[-1]
                extra = {}
                for z in y.split('&'):
                    extra[z.split('=')[0]] = z.split('=')[1]

                attrs = dict(
                    s_province='河南',
                    s_city_id="",
                    s_city_name=s_city_name,
                    s_sta_name=s_sta_name,
                    s_city_code=get_pinyin_first_litter(s_city_name),
                    s_sta_id=extra['g'],
                    d_city_name=d_city_name,
                    d_city_id="",
                    d_city_code=get_pinyin_first_litter(d_city_name),
                    d_sta_id="",
                    d_sta_name=d_city_name,
                    drv_date=drv_date,
                    drv_time=drv_time,
                    drv_datetime=dte.strptime("%s %s" % (
                        drv_date, drv_time), "%Y-%m-%d %H:%M"),
                    distance=unicode(distance),
                    vehicle_type=vehicle_type,
                    seat_type="",
                    bus_num=bus_num,
                    full_price=float(full_price),
                    half_price=float(full_price) / 2,
                    fee=0.0,
                    crawl_datetime=dte.now(),
                    extra_info=extra,
                    left_tickets=left_tickets,
                    crawl_source="hn96520",
                    shift_id="",
                )
                yield LineItem(**attrs)

            except Exception:
                # Best-effort per-row parsing: skip malformed rows,
                # but leave a traceback so real bugs stay visible (the
                # former bare "except: pass" hid them entirely).
                self.logger.exception("parse_line: skipping malformed row")
Ejemplo n.º 14
0
    def parse_line(self, response):
        """Parse one page of the mp0769 (Dongguan) schedule search.

        Price and remaining-seat counts are not shown on the listing
        page, so for sellable rows the code follows the row's order
        link (up to 5 attempts) to scrape them, falling back to
        query_line_info_by_gdsw() when that fails; each row is then
        yielded as a LineItem.  Pagination is followed via the next
        anchor, and mark_done() is only called on the last page.
        """
        res = response.body.decode('gbk')
        start_name = response.meta["start_name"]
        sw_name = response.meta["sw_name"]
        start_code = response.meta["start_code"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        sel = etree.HTML(res)
        next_url = ''
        # Find the "next page" link, if the pager has one.
        for i, j in enumerate(sel.xpath("//a/text()")):
            if j == '下一页':
                next_url = sel.xpath("//a/@href")[i]
#         countObj = re.findall("查询到(\d+)班", str(res))
#         if countObj:
#             count = countObj
#             page = int(math.ceil(count/10))
        form = sel.xpath('//form[@method="Post"]/@action')
        # full_price/left_tickets are resolved once ("flag" flips to
        # True) and then reused for the remaining rows of the page.
        full_price = 0
        left_tickets = 0
        flag = False
        if form:
            sch = sel.xpath('//table[@width="600"]/tr')
            for i in sch[1:]:
                status = i.xpath('td[8]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                if status != '售票':
                    # Only rows currently on sale are kept.
                    continue
                bus_num = i.xpath('td[1]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                drv_date = i.xpath('td[2]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                drv_date = dte.strftime(dte.strptime(drv_date, '%Y-%m-%d'),'%Y-%m-%d')
                drv_time = i.xpath('td[3]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                start_station = i.xpath('td[4]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                #end_station = i.xpath('td[5]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                distance = i.xpath('td[7]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                href = i.xpath('td[9]/div/a/@onclick')[0]
                if 'javascript:alert' in href:
                    continue
                if not flag:
                    # NOTE(review): this inner loop rebinds "i",
                    # shadowing the row element of the outer loop.
                    # All row fields were extracted above, so it
                    # happens to be safe — but rename with care.
                    for i in range(5):
                        param = {}
                        for s in href.split(";")[0][15:-1].split("?")[1].split("&"):
                            k, v = s.split("=")
                            param[k] = v.encode('gb2312')
                        query_url = "%s%s" % ('http://www.mp0769.com/orderlist.asp?', urllib.urlencode(param))
                        req = self.urllib2.Request(query_url, headers=self.headers)
                        result = self.urllib2.urlopen(req)
                        content = result.read()
                        res = content.decode('gbk')
                        if '非法操作' in res:
                            # Server rejected the re-encoded query;
                            # retry with the raw link from the page.
                            query_url = "http://www.mp0769.com/" + href.split(";")[0][15:-1]
                            req = self.urllib2.Request(query_url, headers=self.headers)
                            result = self.urllib2.urlopen(req)
                            content = result.read()
                            res = content.decode('gbk')
                        # The order page answers with a JS redirect;
                        # follow it manually.
                        check_url = re.findall("window.location.href=(.*);", res)[0][1:-1]
                        check_url = "http://www.mp0769.com/" + check_url
                        param = {}
                        for s in check_url.split("?")[1].split("&"):
                            k, v = s.split("=")
                            param[k] = v.encode('gb2312')
                        order_url = "http://www.mp0769.com/orderlist.asp?"
                        order_url = "%s%s" % (order_url, urllib.urlencode(param))
                        req = self.urllib2.Request(order_url, headers=self.headers)
                        result = self.urllib2.urlopen(req)
                        content = result.read()
                        sel = etree.HTML(content)
                        params = {}
                        # Collect the hidden form inputs (ct_price,
                        # ct_accnum, ct_stname, ...).
                        for s in sel.xpath("//form[@id='Form1']//input"):
                            k, v = s.xpath("@name"), s.xpath("@value")
                            if k:
                                # NOTE(review): the conditional binds
                                # only to v; an input with a name but
                                # no value raises IndexError — verify.
                                k, v = k[0], v[0] if k else ""
                                params[k] = v.encode('gb2312')
                        if not params or int(params.get('ct_price', 0)) == 0:
                            # NOTE(review): when params is empty this
                            # raises KeyError on 'ct_stname' — verify.
                            end_station = params['ct_stname'].decode('gbk')
                        else:
                            print "ct_price ", params['ct_price']
                            full_price = params['ct_price']
                            left_tickets = params['ct_accnum']
                            end_station = params['ct_stname'].decode('gbk')
                            flag = True
                            break
                drv_datetime = dte.strptime("%s %s" % (drv_date, drv_time), "%Y-%m-%d %H:%M")
                if not flag:
                    # Fallback: look the line up in the gdsw data.
                    result = self.query_line_info_by_gdsw(sw_name,end_station,bus_num,drv_datetime)
                    if result:
                        full_price = result['full_price']
                        left_tickets = result['left_tickets']
                        flag = True
                    else:
                        print 111111,sw_name,end_station,bus_num,drv_datetime
                        print 3333333,end
                attrs = dict(
                    s_province = u'广东',
                    s_city_name = u"东莞",
                    s_city_id = '',
                    s_city_code= get_pinyin_first_litter(u"东莞"),
                    s_sta_name = start_station,
                    s_sta_id = start_code,
                    d_city_name = end,
                    d_city_code= get_pinyin_first_litter(end),
                    d_city_id = '',
                    d_sta_name = end_station,
                    d_sta_id = '',
                    drv_date = drv_date,
                    drv_time = drv_time,
                    drv_datetime = drv_datetime,
                    distance = distance,
                    vehicle_type = "",
                    seat_type = "",
                    bus_num = bus_num,
                    full_price = float(full_price),
                    half_price = float(full_price)/2,
                    fee = 0,
                    crawl_datetime = dte.now(),
                    extra_info = {"query_url":href},
                    left_tickets = left_tickets,
                    crawl_source = "dgky",
                    shift_id="",
                )
                yield LineItem(**attrs)
        if next_url:
            # Follow the pager; the next page re-enters parse_line
            # with the same meta.
            url = "http://www.mp0769.com/bccx.asp?"
            param = {}
            try:
                for s in next_url.split("?")[1].split("&"):
                    k, v = s.split("=")
                    param[k] = v.encode('gb2312')
                url = url + urllib.urlencode(param)
            except:
                print next_url
            yield scrapy.Request(url,
                                 method="GET",
                                 callback=self.parse_line,
                                 meta={'start_name': start_name, "sw_name": sw_name,
                                       'start_code': start_code, 'end': end, 'sdate':sdate})
        else:
            # Last page reached: only now mark the pair as done.
            self.mark_done(start_name, end, sdate)
Ejemplo n.º 15
0
class WxszSpider(SpiderBase):
    """Spider for the Suzhou-area coach API (coach.wisesz.mobi).

    Flow: fetch the departure-station list, then for every
    (station, destination city, travel date) combination request the
    shift list and yield one ``LineItem`` per shift.
    """
    name = "wxsz"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            # 'BusCrawl.middleware.ZjgsmHeaderMiddleware': 410,
            'BusCrawl.middleware.ProxyMiddleware': 410,
        },
        "DOWNLOAD_DELAY": 0.5,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://www.zjgsmwy.com"

    def start_requests(self):
        """Entry point: request the departure-station list."""
        start_url = "http://coach.wisesz.mobi/coach_v38/main/getstations"
        yield scrapy.FormRequest(start_url, callback=self.parse_start_city)

    def parse_start_city(self, response):
        """Parse the station list and fan out one ticket query per
        (station, destination, travel date) not crawled yet."""
        res = json.loads(response.body)
        if res["errorCode"] != 0:
            self.logger.error("parse_start_city: Unexpected return, %s" %
                              res["rtnMsg"])
            return
        # The API labels areas like u"苏州市区"; normalize to bare city names.
        name_trans = {
            u"张家港地区": "张家港",
            u"苏州市区": "苏州",
            u"常熟地区": "常熟",
            u"昆山地区": "昆山",
            u"太仓地区": "太仓",
            u"吴江地区": "吴江",
        }
        line_url = "http://coach.wisesz.mobi/coach_v38/main/get_tickets"
        for d in res["data"]["dataList"]:
            start = {
                "city_id": d["FIELDS1"],
                "city_name": name_trans[d["FIELDS2"]],
            }
            if not self.is_need_crawl(city=start["city_name"]):
                continue
            for sta in d["stations"]:
                start.update({
                    "sta_name": sta["FIELDS3"],
                    "sta_id": sta["FIELDS2"],
                })
                for s in self.get_dest_list("江苏", start["city_name"]):
                    name, code = s["name"], s["code"]
                    end = {"city_name": name, "city_code": code}
                    self.logger.info("start %s ==> %s" %
                                     (start["sta_name"], end["city_name"]))
                    today = datetime.date.today()
                    for i in range(self.start_day(), 8):
                        sdate = (today +
                                 datetime.timedelta(days=i)).strftime("%Y%m%d")
                        # Skip (station, destination, date) tuples already done.
                        if self.has_done(start["sta_name"], end["city_name"],
                                         sdate):
                            continue
                        params = {
                            "departdate": sdate,
                            "destination": end["city_name"],
                            "fromcode": start["sta_id"],
                            "from": start["sta_name"],
                        }
                        yield scrapy.Request(
                            "%s?%s" % (line_url, urllib.urlencode(params)),
                            method="POST",
                            callback=self.parse_line,
                            headers={
                                "Content-Type":
                                "application/json;charset=UTF-8"
                            },
                            meta={
                                "start": start,
                                "end": end,
                                "sdate": sdate
                            })

    def parse_line(self, response):
        """Parse the shift list for one query and yield LineItem objects."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        # Record the pair as done before parsing so a bad payload is not
        # retried forever.
        self.mark_done(start["sta_name"], end["city_name"], sdate)
        try:
            res = json.loads(response.body)
        except Exception:
            # Keep the raw body in the log; bare ``raise`` preserves the
            # original traceback (``raise e`` would discard it on Python 2).
            self.logger.error(response.body)
            raise
        if res["errorCode"] != 0:
            self.logger.error("parse_line: Unexpected return, %s", res)
            return
        shift_list = res["data"]["dataList"] or []

        for d in shift_list:
            # FIELDS1 = depart date (YYYYMMDD), FIELDS3 = depart time (HHMM),
            # per the strptime format below.
            drv_datetime = dte.strptime("%s %s" % (d["FIELDS1"], d["FIELDS3"]),
                                        "%Y%m%d %H%M")
            attrs = dict(
                s_province="江苏",
                s_city_id=start["city_id"],
                s_city_name=start["city_name"],
                s_sta_name=d["FIELDS4"],
                s_city_code=get_pinyin_first_litter(unicode(
                    start["city_name"])),
                s_sta_id=d["fromcode"],
                d_city_name=end["city_name"],
                d_city_id="",
                d_city_code=end["city_code"],
                d_sta_id=d["FIELDS11"],
                d_sta_name=d["FIELDS5"],
                drv_date=drv_datetime.strftime("%Y-%m-%d"),
                drv_time=drv_datetime.strftime("%H:%M"),
                drv_datetime=drv_datetime,
                distance=unicode(d["FIELDS16"]),
                vehicle_type=d["FIELDS9"],
                seat_type="",
                bus_num=d["FIELDS2"],
                full_price=float(d["FIELDS14"]),
                half_price=float(d["FIELDS15"]),
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "startstation": d["FIELDS17"],
                    "terminalstation": d["FIELDS6"]
                },
                left_tickets=int(d["FIELDS10"]),
                crawl_source="wxsz",
                shift_id="",
            )
            yield LineItem(**attrs)
Ejemplo n.º 16
0
class CTripSpider(SpiderBase):
    """Spider for bus lines exposed through Ctrip's mobile REST API."""
    name = "ctrip"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },

        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.CtripHeaderMiddleware': 410,
        },
        "DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://m.ctrip.com/restapi/busphp/app/index.php"

    def start_requests(self):
        # The city list lives on a PC web page, so a desktop UA is used here.
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/47.0.2526.106 Safari/537.36"}
        web_page = "http://qiche.tieyou.com/index.php?param=/ajax/cityList"
        return [scrapy.Request(web_page, headers=headers, callback=self.parse_start_city)]

    def parse_start_city(self, response):
        """Parse the departure-city list and fan out per-city requests
        for the destination list."""
        # [1:-1] strips one wrapper character on each side of the JSON
        # (presumably JSONP-style padding — confirm against the endpoint).
        res = json.loads(response.body[1:-1])
        params = dict(
            param="/api/home",
            method="product.getToCityList",
            ref="ctrip.h5",
            partner="ctrip.app",
            clientType="Android--hybrid",
            vendor="",
            fromCity="",
            contentType="json",
        )
        for pro in res['hotFromCity']['province']:
            province = pro["province_name"]
            if not province or not self.is_need_crawl(province=province):
                continue
            self.logger.info("start province: %s" % province)

            for ci in pro["citys"]:
                d = {
                    "province": province,
                    "name": ci,
                }
                if not self.is_need_crawl(city=ci):
                    continue
                self.logger.info("start province: %s city: %s", province, ci)
                params.update(fromCity=ci)
                url = "%s?%s" % (self.base_url, urllib.urlencode(params))
                yield scrapy.Request(url, callback=self.parse_target_city, meta={"start": d})

    def parse_target_city(self, response):
        """Parse the destination-city list and schedule 9 days of
        line queries per (start, destination) pair."""
        res = json.loads(response.body)
        if int(res["code"]) != 1:
            self.logger.error("parse_target_city: Unexpected return, %s" % res["message"])
            return

        start = response.meta["start"]
        for tar in res["return"]:
            d = {
                "name": tar["name"],
            }

            today = datetime.date.today()
            for i in range(1, 10):
                sdate = str(today+datetime.timedelta(days=i))
                if self.has_done(start["name"], d["name"], sdate):
                    #self.logger.info("ignore %s ==> %s %s" % (start["name"], d["name"], sdate))
                    continue
                params = dict(
                    param="/api/home",
                    method="product.getBusList",
                    v="1.0",
                    ref="ctrip.h5",
                    partner="ctrip.app",
                    clientType="Android--hybrid",
                    fromCity=start["name"],
                    toCity=d["name"],
                    fromDate=sdate,
                    contentType="json",
                )
                url = "%s?%s" % (self.base_url, urllib.urlencode(params))
                yield scrapy.Request(url, callback=self.parse_line, meta={"start": start, "end": d, "drv_date": sdate})

    def parse_line(self, response):
        """Parse the shift list for one (from, to, date) query."""
        try:
            res = json.loads(response.body)
        except Exception:
            # Keep the raw body in the log; bare ``raise`` preserves the
            # original traceback (``raise e`` would discard it on Python 2).
            self.logger.error(response.body)
            raise
        start = response.meta["start"]
        end = response.meta["end"]
        drv_date = response.meta["drv_date"]
        self.mark_done(start["name"], end["name"], drv_date)
        if int(res["code"]) != 1:
            #self.logger.error("parse_line: Unexpected return, %s" % str(res))
            return
        for d in res["return"]:
            if not d["bookable"]:
                continue
            if d["busType"] == "流水班":
                continue
            from_station = unicode(d["fromStationName"])
            to_station = unicode(d["toStationName"])
            ticket_info = d["showTicketInfo"]
            if ticket_info == "有票":
                # "tickets available" with no count — use a nominal 45.
                left_tickets = 45
            elif ticket_info.endswith("张"):
                # "<N>张" — N tickets left.
                left_tickets = int(ticket_info[:-1])
            elif ticket_info == "预约购票":
                continue
            else:
                # Fail loudly on an unrecognized status instead of the old
                # debug ``print`` + ``1/0`` ZeroDivisionError trick.
                raise ValueError("unknown showTicketInfo %r (bookable=%r)" %
                                 (ticket_info, d["bookable"]))

            attrs = dict(
                s_province = start["province"],
                s_city_name = d["fromCityName"],
                s_city_id="",
                s_city_code=get_pinyin_first_litter(d["fromCityName"]),
                s_sta_name = from_station,
                s_sta_id="",
                d_city_name = d["toCityName"],
                d_city_id="",
                d_city_code=get_pinyin_first_litter(d["toCityName"]),
                d_sta_name = to_station,
                d_sta_id="",
                drv_date = drv_date,
                drv_time = d["fromTime"],
                drv_datetime = dte.strptime("%s %s" % (drv_date, d["fromTime"]), "%Y-%m-%d %H:%M"),
                distance = "0",
                vehicle_type = d["busType"],
                seat_type = "",
                bus_num = d["busNumber"],
                full_price = float(d["fullPrice"]),
                half_price = float(d["fullPrice"])/2,
                fee = 0,
                crawl_datetime = dte.now(),
                extra_info = {},
                left_tickets = left_tickets,
                crawl_source = "ctrip",
                shift_id="",
            )
            yield LineItem(**attrs)
Ejemplo n.º 17
0
 def start_requests(self):
     """Build one POST query per (Zhuhai station, destination, date).

     Uses a hard-coded station-name -> station-id table, calls
     self.update_cookies() once, and fires a form POST for every
     destination and travel day not already crawled.
     """
     days = 8
     today = datetime.date.today()
     headers = {
         "User-Agent":
         "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0",
         'Content-Type':
         'application/x-www-form-urlencoded',
         'Referer':
         'http://www.zhwsbs.gov.cn:9013/shfw/zaotsTicket/pageLists.xhtml',
     }
     # Form template; the empty fields are filled in per request below.
     data = {
         'SchDate': '',
         'SchTime': '',
         'checkCode': '',
         'StartStation': '"-"',
         'SchDstNodeName': '',
     }
     # Station name -> station id used by the booking site.
     sta_info = {
         u'香洲长途站': 'C1K001-102017',
         u'上冲站': 'C1K027-102018',
         u'南溪站': 'C1K013-102019',
         u'拱北通大站': 'C1K030-102023',
         u'斗门站': 'C2K003-102027',
         # u'井岸站': 'C2K001-102028',
         u'红旗站': 'C1K006-102030',
         u'三灶站': 'C1K004-102031',
         u'平沙站': 'C1K007-102032',
         u'南水站': 'C1K008-102033',
         u'唐家站': 'TJZ001-102020',
         u'金鼎站': 'JDZ001-102021',
         u'拱北票务中心': 'GBPW01-102024',
         u'西埔站': 'XPZ001-102029',
     }
     # NOTE(review): code/cookies presumably are a captcha check-code and
     # session cookies shared by all requests below — confirm in update_cookies.
     code, cookies = self.update_cookies()
     for s_name, s_id in sta_info.items():
         start = {
             "name": s_name,
             "id": s_id,
             "code": get_pinyin_first_litter(s_name)
         }
         for y in xrange(self.start_day(), days):
             for es in self.get_dest_list("广东", "珠海", s_name):
                 name, d_code = es["name"], es["code"]
                 end = {"name": name, "code": d_code}
                 sdate = str(today + datetime.timedelta(days=y))
                 # Skip (station, destination, date) tuples already crawled.
                 if self.has_done(start["name"], end["name"], sdate):
                     continue
                 data['SchDstNodeName'] = end["name"]
                 data['SchDate'] = sdate
                 data['checkCode'] = code
                 data['StartStation'] = s_id
                 yield scrapy.Request(
                     url=self.url,
                     callback=self.parse_line,
                     method='POST',
                     body=urllib.urlencode(data),
                     headers=headers,
                     cookies=dict(cookies),
                     meta={
                         'start': start,
                         'end': end,
                         'sdate': sdate,
                     },
                 )
Ejemplo n.º 18
0
class SzkyWapSpider(SpiderBase):
    """Spider for Shenzhen coach stations via the vchepiao.cn WAP API."""
    name = "szky_wap"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },

        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.SzkyHeaderMiddleware': 410,
        },
        # "DOWNLOAD_DELAY": 0.1,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def start_requests(self):
        """POST one destination query per known Shenzhen station code."""
        start_url = "http://www.vchepiao.cn/mb/base/bus/queryNewSKY"
        # Station code -> station name (names kept for readability only;
        # the request uses just the code).
        station_dict = {
            "B1K003": '福田汽车客运站',
            "B1K002": "深圳湾客运服务点",
            "B1K004": "南山汽车客运站",
            "B1K005": "盐田汽车客运站",
            "B1K006": "东湖汽车客运站",
            "B2K037": "深圳北汽车客运站",
            "B1K010": "皇岗汽车客运站",
            "B2K040": "机场汽车客运站",
        }
        for k, v in station_dict.items():
            data = {
                "stationCode": k,
            }
            yield scrapy.FormRequest(start_url,
                                     method="POST",
                                     formdata=data,
                                     callback=self.parse_target_city,
                                     meta={"start_code": k})

    def parse_target_city(self, response):
        """Parse the destination list for one station and query the next
        6 days of shifts for each destination."""
        start_code = response.meta["start_code"]
        res = json.loads(response.body)
        if not res["success"]:
            self.logger.error("parse_target_city: Unexpected return, %s", res)
            return
        line_url = "http://www.vchepiao.cn/mb/base/bus/queryBusSKY"
        end_list = res['data']
        # NOTE: has_done/mark_done dedup is intentionally disabled for
        # this spider.
        for end in end_list:
            today = datetime.date.today()
            for j in range(1, 7):
                sdate = str(today + datetime.timedelta(days=j))
                sdate_tra = sdate.replace('-', '')  # the API wants YYYYMMDD
                data = {
                    "fromCity": "深圳",
                    "stationCode": start_code,
                    "dstNode": end['NDName'],
                    "schDate": sdate_tra
                }
                yield scrapy.FormRequest(line_url,
                                         method="POST",
                                         formdata=data,
                                         callback=self.parse_line,
                                         meta={"start_code": start_code, "end": end, "date": sdate})

    def parse_line(self, response):
        """Parse the shift list and yield LineItem objects."""
        start_code = response.meta["start_code"]
        end = response.meta["end"]
        sdate = response.meta["date"]
        # The old no-op try/except-reraise wrapper and the leftover debug
        # ``print res`` were removed; a JSON error still propagates.
        res = json.loads(response.body)
        if not res["success"]:
            return
        for d in res["data"]["list"]:
            # Only emit rows with SchStat == '1' (presumably "on sale" —
            # confirm against the API docs).
            if d['SchStat'] == '1':
                attrs = dict(
                    s_province = u'广东',
                    s_city_name = u"深圳",
                    s_city_id = '',
                    s_city_code= get_pinyin_first_litter(u"深圳"),
                    s_sta_name = d["SchWaitStName"],
                    s_sta_id = d["SchStationCode"],
                    d_city_name = d["SchDstCity"],
                    d_city_code=get_pinyin_first_litter(d["SchDstCity"]),
                    d_city_id = d['SchStationCode'],
                    d_sta_name = d["SchNodeName"],
                    d_sta_id = d["SchDstNode"],
                    drv_date = d["SchDate"],
                    drv_time = d["orderbytime"],
                    drv_datetime = dte.strptime("%s %s" % (d["SchDate"], d["orderbytime"]), "%Y-%m-%d %H:%M"),
                    distance = "0",
                    vehicle_type = "",
                    seat_type = "",
                    bus_num = d["SchLocalCode"],
                    full_price = float(d["SchStdPrice"]),
                    half_price = float(d["SchStdPrice"])/2,
                    fee = 0,
                    crawl_datetime = dte.now(),
                    extra_info = {"raw_info": d},
                    left_tickets = int(d["SchSeatCount"]),
                    crawl_source = "szky",
                    shift_id="",
                )
                yield LineItem(**attrs)
Ejemplo n.º 19
0
 try:
     res = json.loads(response.body)
 except Exception, e:
     raise e
 if res["akfAjaxResult"] != "0":
     #self.logger.error("parse_line: Unexpected return, %s, %s->%s, %s", sdate, start["city_name"], end["city_name"], res["header"])
     return
 for d in res["values"]["resultList"]:
     if d['stopFlag'] == '0':
         if float(d["fullPrice"]) < 10:
             continue
         attrs = dict(
             s_province = '河北',
             s_city_name = city_name,
             s_city_id = start['code'],
             s_city_code= get_pinyin_first_litter(unicode(city_name)),
             s_sta_name = d["startDepotName"],
             s_sta_id = d["startDepotCode"],
             d_city_name = end["depotName"],
             d_city_code=get_pinyin_first_litter(end["depotName"]),
             d_city_id = end['depotCode'],
             d_sta_name = d["arrivalDepotName"],
             d_sta_id = d["arrivalDepotCode"],
             drv_date = d["departDate"],
             drv_time = d["leaveTime"],
             drv_datetime = dte.strptime("%s %s" % (d["departDate"], d["leaveTime"]), "%Y-%m-%d %H:%M"),
             distance = "0",
             vehicle_type = "",
             seat_type = "",
             bus_num = d["busCode"],
             full_price = float(d["fullPrice"]),
Ejemplo n.º 20
0
    def parse_line(self, response):
        """Parse the schedule-list HTML page and yield LineItem objects."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["date"]
        content = response.body
        # Record the pair as done before parsing so a bad page is not
        # retried forever.
        self.mark_done(start["name"], end["StopName"], sdate)
        if not isinstance(content, unicode):
            content = content.decode('utf-8')
        sel = etree.HTML(content)
        scheduleList = sel.xpath('//div[@id="scheduleList"]/table/tbody/tr')
        # Station name -> station id. Loop-invariant, so build it once
        # instead of rebuilding the dict on every row as before.
        station_code_mapping = {
            u"六里桥": "1000",
            u"首都机场站": "1112",
            u"赵公口": "1103",
            u"木樨园": "1104",
            u"丽泽桥": "1106",
            u"新发地": "1107",
            u"莲花池": "1108",
            u"四惠": "1109",
            u"永定门": "1110",
            u"北郊": "1111",
        }
        # Rows come in pairs; only the first of each pair carries the data.
        for i in range(0, len(scheduleList), 2):
            s = scheduleList[i]
            time = s.xpath('td[@class="departureTimeCell"]/span/text()')[0]
            station = s.xpath('td[@class="routeNameCell"]/span/text()')
            scheduleIdSpan = s.xpath(
                'td[@class="scheduleAndBusLicenseCes"]/span[@class="scheduleSpan"]/span[@class="scheduleIdSpan"]/text()'
            )[0]
            # Strip layout whitespace from the schedule id.
            scheduleIdSpan = scheduleIdSpan.replace('\r\n', '').replace(
                '\t', '').replace(' ', '')
            price = s.xpath(
                'td[@class="ticketPriceCell"]/span[@class="ticketPriceSpan"]/span[@class="ticketPriceValueSpan"]/text()'
            )[0]
            ScheduleString = s.xpath(
                'td[@class="operationCell"]/@data-schedule')[0]
            # A styled memoCell marker is treated as "sold out"; otherwise
            # use a nominal 45 seats.
            left_tickets = 45
            left_less = s.xpath('td[@class="memoCell"]/span/@class')
            if left_less:
                left_tickets = 0

            attrs = dict(
                s_province='北京',
                s_city_name="北京",
                s_city_id='',
                s_city_code=get_pinyin_first_litter(u"北京"),
                s_sta_name=station[0],
                s_sta_id=station_code_mapping[station[0]],
                d_city_name=end['StopName'],
                d_city_code=get_pinyin_first_litter(end['StopName']),
                d_city_id=end['StopId'],
                d_sta_name=end['StopName'],
                d_sta_id='',
                drv_date=sdate,
                drv_time=time,
                drv_datetime=dte.strptime("%s %s" % (sdate, time),
                                          "%Y-%m-%d %H:%M"),
                distance="0",
                vehicle_type="",
                seat_type="",
                bus_num=scheduleIdSpan,
                full_price=float(price),
                half_price=float(price) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "ScheduleString": ScheduleString,
                    "ArrivingStopJson": json.dumps(end)
                },
                left_tickets=left_tickets,
                crawl_source="bjky",
                shift_id='',
            )
            yield LineItem(**attrs)
Ejemplo n.º 21
0
class CBDSpider(SpiderBase):
    """Spider for Jiangsu bus lines via the chebada.com mobile API."""
    name = "cbd"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.CbdHeaderMiddleware': 410,
        },
        #"DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def get_dest_list(self, province, city):
        """Return the destination cities reachable from *city* as a list of
        ``{"name", "code"}`` dicts.

        The endpoint is picky about administrative suffixes, so several
        name variants are tried in turn and the first that yields a body
        wins. Always returns a list — previously the method fell off the
        end returning None when every variant failed, which made callers'
        ``for s in self.get_dest_list(...)`` raise TypeError.
        """
        url = "http://www.chebada.com/Home/GetBusDestinations"
        # Don't shadow the ``city`` parameter with the loop variable.
        for candidate in [
                city, city + "市", city + "县",
                city.rstrip(u"市").rstrip("县")
        ]:
            r = requests.post(url,
                              headers={
                                  "User-Agent":
                                  "Chrome",
                                  "Content-Type":
                                  "application/x-www-form-urlencoded"
                              },
                              data=urllib.urlencode({"departure": candidate}))
            lst = []
            temp = {}
            res = r.json()["response"]
            if "body" not in res:
                continue
            for d in res["body"]["destinationList"]:
                for c in d["cities"]:
                    # De-duplicate destinations by city name.
                    if c["name"] in temp:
                        continue
                    temp[c["name"]] = 1
                    lst.append({"name": c["name"], "code": c["shortEnName"]})
            return lst
        return []

    def start_requests(self):
        """Fan out shift-list queries for every Jiangsu departure city."""
        line_url = "http://m.chebada.com/Schedule/GetBusSchedules"
        start_list = [
            "苏州",
            "南京",
            "无锡",
            "常州",
            "南通",
            "张家港",
            "昆山",
            "吴江",
            "常熟",
            "太仓",
            "镇江",
            "宜兴",
            "江阴",
            "兴化",
            "盐城",
            "扬州",
            "连云港",
            "徐州",
            "宿迁",
            "淮安",
            "句容",
            "靖江",
            "大丰",
            "扬中",
            "溧阳",
            "射阳",
            "滨海",
            "盱眙",
            "涟水",
            "宝应",
            "丹阳",
            "海安",
            "海门",
            "金坛",
            "江都",
            "启东",
            "如皋",
            "如东",
            "泗阳",
            "沭阳",
            "泰兴",
            "仪征",
        ]
        for name in start_list:
            name = unicode(name)
            if not self.is_need_crawl(city=name):
                continue
            self.logger.info("start crawl city %s", name)
            start = {"name": name, "province": "江苏"}
            for s in self.get_dest_list(start["province"], start["name"]):
                name, code = s["name"], s["code"]
                end = {"name": name, "short_pinyin": code}

                today = datetime.date.today()
                for i in range(self.start_day(), 4):
                    sdate = str(today + datetime.timedelta(days=i))
                    # Skip (start, end, date) tuples already crawled.
                    if self.has_done(start["name"], end["name"], sdate):
                        self.logger.info("ignore %s ==> %s %s" %
                                         (start["name"], end["name"], sdate))
                        continue
                    params = dict(
                        departure=start["name"],
                        destination=end["name"],
                        departureDate=sdate,
                        page="1",
                        pageSize="1025",
                        hasCategory="true",
                        category="0",
                        dptTimeSpan="0",
                        bookingType="0",
                    )
                    yield scrapy.FormRequest(line_url,
                                             formdata=params,
                                             callback=self.parse_line,
                                             meta={
                                                 "start": start,
                                                 "end": end,
                                                 "sdate": sdate
                                             })

    def parse_line(self, response):
        """Parse the shift list for one query and yield LineItem objects."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["name"], end["name"], sdate)
        self.logger.info("finish %s ==> %s" % (start["name"], end["name"]))
        try:
            res = json.loads(response.body)
        except Exception:
            # Keep the raw body in the log; bare ``raise`` preserves the
            # original traceback (``raise e`` would discard it on Python 2).
            self.logger.error(response.body)
            raise
        res = res["response"]
        if int(res["header"]["rspCode"]) != 0:
            #self.logger.error("parse_target_city: Unexpected return, %s" % res["header"])
            return

        for d in res["body"]["scheduleList"]:
            # if int(d["canBooking"]) != 1:
            #     continue
            left_tickets = int(d["ticketLeft"])
            from_city = unicode(d["departure"])
            to_city = unicode(d["destination"])
            from_station = unicode(d["dptStation"])
            to_station = unicode(d["arrStation"])

            attrs = dict(
                s_province=start["province"],
                s_city_id="",
                s_city_name=from_city,
                s_sta_name=from_station,
                s_city_code=get_pinyin_first_litter(from_city),
                s_sta_id="",
                d_city_name=to_city,
                d_city_id="",
                d_city_code=end["short_pinyin"],
                d_sta_id="",
                d_sta_name=to_station,
                drv_date=d["dptDate"],
                drv_time=d["dptTime"],
                drv_datetime=dte.strptime(
                    "%s %s" % (d["dptDate"], d["dptTime"]), "%Y-%m-%d %H:%M"),
                distance=unicode(d["distance"]),
                vehicle_type=d["coachType"],
                seat_type="",
                bus_num=d["coachNo"],
                full_price=float(d["ticketPrice"]),
                half_price=float(d["ticketPrice"]) / 2,
                fee=float(d["ticketFee"]),
                crawl_datetime=dte.now(),
                extra_info={"raw_info": d},
                left_tickets=left_tickets,
                crawl_source="cbd",
                shift_id="",
            )
            yield LineItem(**attrs)
Ejemplo n.º 22
0
 def parse_line(self, response):
     "Parse the XML shift list embedded in the JSON reply and yield LineItems."
     # Province names used below to strip a leading province from long
     # destination names, e.g. u"广东广州" -> u"广州".
     province_list = ('吉林', '辽宁', '河北', '黑龙江', '广东', "云南", '山西', '山东',
                      '广西壮族自治', '江西', '河南', '浙江', '安徽', '湖北', '湖南', "贵州",
                      '陕西', '江苏', '内蒙古自治', "四川", '海南', '山东', '甘肃', '青海',
                      '宁夏回族自治', "新疆维吾尔自治", '西藏自治', '贵州', '广西')
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     res = json.loads(response.body)
     self.logger.info("finish %s ==> %s" %
                      (start["station_name"], end["zdmc"]))
     # Record the pair as done before parsing.
     self.mark_done(start['station_name'], end["zdmc"], sdate)
     # The shift list is an XML document embedded in the "msg" field;
     # [0] raises IndexError if the wrapper tag is missing.
     xml_text = re.findall(
         r"<getScheduledBusResult>(.*)</getScheduledBusResult>",
         res.get('msg', ''), re.S)[0]
     root = ET.fromstring(xml_text)
     node_find = root.find('Body')
     if node_find.attrib['size'] == '0':
         return
     res = node_find.findall('ScheduledBus')
     for d in res:
         s_sta_name = start['station_name']
         s_sta_id = start['czbh']
         d_city_name = end['zdmc']
         # Long names may carry a province prefix; strip the first match.
         if len(d_city_name) >= 4:
             if d_city_name.startswith(province_list):
                 for j in province_list:
                     if d_city_name.startswith(j):
                         d_city_name = d_city_name.replace(j, '')
                         break
         # XML field tags: MDZMC = arrival station, FCSJ = depart time,
         # LC = distance, CXMC = seat type, CCBH = bus number, PJ = price,
         # YPZS = tickets left, MDZBH = destination id (assumed from usage
         # below — confirm against the upstream schema).
         d_sta_name = d.find('MDZMC').text
         drv_time = d.find('FCSJ').text
         distance = d.find('LC').text
         seat_type = d.find('CXMC').text
         bus_num = d.find('CCBH').text
         full_price = d.find('PJ').text
         left_tickets = d.find('YPZS').text
         d_city_id = d.find('MDZBH').text
         attrs = dict(
             s_province='海南',
             s_city_name=start['city_name'],
             s_city_id='',
             s_city_code=get_pinyin_first_litter(unicode(
                 start['city_name'])),
             s_sta_name=s_sta_name,
             s_sta_id=s_sta_id,
             d_city_name=d_city_name,
             d_city_code=get_pinyin_first_litter(d_city_name),
             d_city_id=d_city_id,
             d_sta_name=d_sta_name,
             d_sta_id='',
             drv_date=sdate,
             drv_time=drv_time,
             drv_datetime=dte.strptime("%s %s" % (sdate, drv_time),
                                       "%Y-%m-%d %H:%M"),
             distance=distance,
             vehicle_type="",
             seat_type=seat_type,
             bus_num=bus_num,
             full_price=float(full_price),
             half_price=float(full_price) / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={},
             left_tickets=int(left_tickets),
             crawl_source="hainky",
             shift_id='',
         )
         yield LineItem(**attrs)
Ejemplo n.º 23
0
     res = json.loads(response.body)
 except Exception, e:
     raise e
 if end['name'] != params['arriveStation']:
     self.mark_done(params["startCity"], params["arriveStation"], sdate)
 busTripInfoSet = res['data']['busTripInfoSet']
 cityOpenSale = res['data']['cityOpenSale']
 if len(busTripInfoSet) > 0 and cityOpenSale:
     for d in busTripInfoSet:
         if d['tickets'] == 0 or d['tempClose'] == 1:
             continue
         attrs = dict(
             s_province = self.province,
             s_city_name = params['startCity'],
             s_city_id = '',
             s_city_code= get_pinyin_first_litter(params['startCity']),
             s_sta_name = params['startStation'],
             s_sta_id = '',
             d_city_name = params['arriveCity'],
             d_city_code= get_pinyin_first_litter(params['arriveCity']),
             d_city_id = '',
             d_sta_name = params['arriveStation'],
             d_sta_id = '',
             drv_date = sdate,
             drv_time = d["time"][0:-3],
             drv_datetime = dte.strptime("%s %s" % (sdate, d["time"][0:-3]), "%Y-%m-%d %H:%M"),
             distance = "0",
             vehicle_type = "",
             seat_type = "",
             bus_num = d["id"],
             full_price = float(d["price"]),
Ejemplo n.º 24
0
 self.mark_done(start["name"], end["depotName"] + end['depotCode'],
                sdate)
 try:
     res = json.loads(response.body)
 except Exception, e:
     raise e
 if res["akfAjaxResult"] != "0":
     #self.logger.error("parse_line: Unexpected return, %s, %s->%s, %s", sdate, start["city_name"], end["city_name"], res["header"])
     return
 for d in res["values"]["resultList"]:
     if d['stopFlag'] == '0':
         attrs = dict(
             s_province='贵州',
             s_city_name=start["name"],
             s_city_id=start['code'],
             s_city_code=get_pinyin_first_litter(start["name"]),
             s_sta_name=d["startDepotName"],
             s_sta_id=d["startDepotCode"],
             d_city_name=end["depotName"],
             d_city_code=get_pinyin_first_litter(end["depotName"]),
             d_city_id=end['depotCode'],
             d_sta_name=d["arrivalDepotName"],
             d_sta_id=d["arrivalDepotCode"],
             drv_date=d["departDate"],
             drv_time=d["leaveTime"],
             drv_datetime=dte.strptime(
                 "%s %s" % (d["departDate"], d["leaveTime"]),
                 "%Y-%m-%d %H:%M"),
             distance="0",
             vehicle_type="",
             seat_type="",
Ejemplo n.º 25
0
class ZjgsmSpider(SpiderBase):
    """Spider for zjgsmwy.com (Zhangjiagang-area bus network, Jiangsu).

    Crawl flow: area list -> station list per area -> destination cities
    from the project-level dest list -> one shift-list request per
    (station, destination city, date), parsed into LineItem records.
    """
    name = "zjgsm"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ZjgsmHeaderMiddleware': 410,
            'BusCrawl.middleware.ProxyMiddleware': 410,
        },
        "DOWNLOAD_DELAY": 0.5,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://www.zjgsmwy.com"

    def start_requests(self):
        # This endpoint backs a PC web page; it returns the area list.
        start_url = self.base_url + "/busticket/busticket/service/Busticket.getAreaList.json"
        yield scrapy.FormRequest(start_url, callback=self.parse_start_city)

    def parse_start_city(self, response):
        """Parse the area list and request the station list for each area."""
        res = json.loads(response.body)
        if res["rtnCode"] != "000000":
            self.logger.error("parse_start_city: Unexpected return, %s" %
                              res["rtnMsg"])
            return
        # Map the site's district labels to the canonical city names used by
        # the rest of the project.  NOTE(review): an area name missing from
        # this table raises KeyError -- confirm the site only returns these.
        name_trans = {
            u"张家港地区": "张家港",
            u"苏州市区": "苏州",
            u"常熟地区": "常熟",
            u"昆山地区": "昆山",
            u"太仓地区": "太仓",
            u"吴江地区": "吴江",
        }
        station_url = self.base_url + "/busticket/busticket/service/Busticket.getStationList.json"
        for d in res["responseData"]:
            city = {
                "city_id": d["areacode"],
                "city_name": name_trans[d["areaname"]],
            }
            if not self.is_need_crawl(city=city["city_name"]):
                continue
            yield scrapy.FormRequest(station_url,
                                     formdata={"AREACODE": city["city_id"]},
                                     callback=self.parse_start_station,
                                     meta={"start": city})

    def parse_start_station(self, response):
        """For every station in the area, request shift lists for each
        destination city and each not-yet-crawled date."""
        res = json.loads(response.body)
        start = response.meta["start"]
        if res["rtnCode"] != "000000":
            self.logger.error("parse_start_station: Unexpected return, %s" %
                              res["rtnMsg"])
            return
        line_url = self.base_url + "/busticket/busticket/service/Busticket.getBusTicketList.json"
        for d in res["responseData"]:
            # Bug fix: the original mutated the single shared ``start`` dict
            # (start["sta_name"] = ...) that was already referenced from the
            # meta of previously yielded requests, so every parse_line
            # callback saw the *last* station's name.  Build a fresh copy
            # per station instead.
            start_sta = dict(start, sta_name=d["stationname"])
            for s in self.get_dest_list("江苏", start_sta["city_name"]):
                name, code = s["name"], s["code"]
                end = {"city_name": name, "city_code": code}
                self.logger.info("start %s ==> %s" %
                                 (start_sta["sta_name"], end["city_name"]))
                today = datetime.date.today()
                for i in range(self.start_day(), 4):
                    sdate = str(today + datetime.timedelta(days=i))
                    if self.has_done(start_sta["sta_name"],
                                     end["city_name"], sdate):
                        continue
                    # NOTE: "ONSTAION" is the server's own (misspelled)
                    # parameter name -- do not "fix" it.
                    params = {
                        "AREACODE": start_sta["city_id"],
                        "ONSTAION": start_sta["sta_name"],
                        "OFFSTATION": end["city_name"],
                        "STARTDATE": sdate,
                    }
                    yield scrapy.Request(line_url,
                                         method="POST",
                                         body=urllib.urlencode(params),
                                         callback=self.parse_line,
                                         meta={
                                             "start": start_sta,
                                             "end": end,
                                             "sdate": sdate
                                         })

    def parse_line(self, response):
        """Parse one shift-list response into LineItem records."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        # Mark this (station, destination, date) combination as crawled.
        self.mark_done(start["sta_name"], end["city_name"], sdate)
        try:
            res = json.loads(response.body)
        except Exception:
            # Log the unparseable payload, then re-raise with the original
            # traceback (the Py2-only ``except E, e: raise e`` form lost it).
            self.logger.error(response.body)
            raise
        if res["rtnCode"] != "000000":
            self.logger.error("parse_line: Unexpected return, %s" %
                              res["rtnMsg"])
            return
        shift_list = res["responseData"]["shiftList"] or []

        for d in shift_list:
            # The API returns date/time as "YYYYMMDD" / "HHMM".
            drv_datetime = dte.strptime(
                "%s %s" % (d["startdate"], d["starttime"]), "%Y%m%d %H%M")
            attrs = dict(
                s_province="江苏",
                s_city_id=start["city_id"],
                s_city_name=start["city_name"],
                s_sta_name=d["onstation"],
                s_city_code=get_pinyin_first_litter(unicode(
                    start["city_name"])),
                s_sta_id="",
                d_city_name=end["city_name"],
                d_city_id="",
                d_city_code=end["city_code"],
                d_sta_id=d["offstationcode"],
                d_sta_name=d["offstation"],
                drv_date=drv_datetime.strftime("%Y-%m-%d"),
                drv_time=drv_datetime.strftime("%H:%M"),
                drv_datetime=drv_datetime,
                distance=unicode(d["distance"]),
                vehicle_type=d["bustype"],
                seat_type="",
                bus_num=d["shift"],
                full_price=float(d["price"]),
                half_price=float(d["halfprice"]),
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "startstation": d["startstation"],
                    "terminalstation": d["terminalstation"]
                },
                left_tickets=int(d["availablenum"]),
                crawl_source="zjgsm",
                shift_id="",
            )
            yield LineItem(**attrs)
Ejemplo n.º 26
0
class FjkySpider(SpiderBase):
    """Spider for 968980.cn (Fujian province bus network).

    Crawl flow: all start cities -> project dest list per city -> one
    ticket-list request per (start, destination, date), parsed into
    LineItem records.
    """
    name = "fjky"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.FjkyHeaderMiddleware': 410,
        },
    }

    def query_start_predate(self, code):
        """Return how many days ahead tickets can be booked from the depot
        identified by ``code`` (0 when the API reports an error)."""
        url = 'http://www.968980.cn/com/yxd/pris/openapi/queryPreDate.action'
        data = {
            "startDepotCode": code,
        }
        res = requests.post(url, data=data)
        res = res.json()
        if res['akfAjaxResult'] != '0':
            return 0
        return res['values']['preDate']

    def get_init_dest_list(self, start_info):
        """Query (and cache in redis) the destination-city list reachable
        from ``start_info['code']``, normalising the city names.

        Entries are kept only when the name looks like a real
        city/county (ends in 市/县/州/区/旗); a leading province name,
        if any, is stripped off.
        """
        # Province names that may prefix a destination label and should be
        # stripped (duplicate entries from the original list removed).
        province_list = ('吉林', '辽宁', '河北', '黑龙江', '广东', "云南", '山西', '山东',
                         '广西壮族自治', '江西', '河南', '浙江', '安徽', '湖北', '湖南', "贵州",
                         '陕西', '江苏', '内蒙古自治', "四川", '海南', '甘肃', '青海',
                         '宁夏回族自治', "新疆维吾尔自治", '西藏自治')
        rds = get_redis()
        rds_key = "crawl:dest:fjky16"
        dest_str = rds.get(rds_key)
        if not dest_str:
            target_url = "http://www.968980.cn//com/yxd/pris/wsgp/queryCity.action"
            data = {
                "flag": "false",
                "isArrive": "true",
                "isStart": "false",
                "iststation": "1",
                "startCode": start_info['code'],
                "zjm": '',
            }
            r = requests.post(target_url,
                              data=urllib.urlencode(data),
                              headers={
                                  "User-Agent":
                                  "Chrome",
                                  "Content-Type":
                                  "application/x-www-form-urlencoded"
                              })
            res = r.json()
            lst = []
            if res['values']['ca']:
                for i in res['values']['ca'][0]:
                    tmp = {}
                    tmp['code'] = i[0]
                    # i[4] in ('1', '2') appears to mark entries whose label
                    # i[1] is usable directly; otherwise the name is taken
                    # from the "(province ... county)" breadcrumb in i[3].
                    # TODO confirm the meaning of these flag values.
                    if i[4] in ['1', '2']:
                        tmp['name'] = i[1].strip(' ')
                    else:
                        lev_list = i[3].split(' ')
                        if len(lev_list) < 3:
                            tmp['name'] = i[1].strip(' ')
                        else:
                            tmp['name'] = lev_list[-1].strip(')').strip(' ')
                            province = lev_list[0].strip('(').strip(' ')
                            if province == '福建省':
                                tmp['name'] = i[1].strip(' ')
                    target_name = tmp['name']
                    if target_name.endswith('站'):
                        # Skip individual stations; only cities are wanted.
                        continue
                    if '直辖' not in target_name:
                        if not target_name or len(target_name) > 4:
                            # Long names usually carry a province prefix;
                            # strip the first matching one.
                            if target_name.startswith(province_list):
                                for j in province_list:
                                    if target_name.startswith(j):
                                        target_name = target_name.replace(
                                            j, '')
                                        break
                    tmp['name'] = target_name
                    if not tmp['name'].endswith(('市', '县', '州', '区', '旗')):
                        continue
                    lst.append(tmp)
            dest_str = json.dumps(lst)
            rds.set(rds_key, dest_str)
        lst = json.loads(dest_str)
        return lst

    def start_requests(self):
        start_url = "http://www.968980.cn/com/yxd/pris/openapi/cityQueryAll.action"
        yield scrapy.FormRequest(start_url,
                                 method="POST",
                                 formdata={},
                                 callback=self.parse_start_city)

    def parse_start_city(self, response):
        """Flatten the grouped start-city list and schedule one ticket-list
        request per (start, destination, date) for the next 7 days."""
        res = json.loads(response.body)
        if res["akfAjaxResult"] != "0":
            self.logger.error("parse_start_city: Unexpected return, %s", res)
            return
        start_list = []
        for i in res['values']['list']:
            for j in i['list']:
                start_list.append(j)
        line_url = 'http://www.968980.cn/com/yxd/pris/openapi/queryAllTicket.action'
        for start in start_list:
            if not self.is_need_crawl(city=start['name']):
                continue
            end_list = self.get_dest_list('福建', start['name'])
            for end in end_list:
                end['code'] = end['dest_id']
                today = datetime.date.today()
                for j in range(0, 7):
                    sdate = str(today + datetime.timedelta(days=j))
                    if self.has_done(start['name'], end["name"], sdate):
                        self.logger.info("ignore %s ==> %s %s" %
                                         (start['name'], end["name"], sdate))
                        continue
                    data = {
                        "arrivalDepotCode": end['code'],
                        "beginTime": sdate,
                        "startName": unicode(start['name']),
                        "endName": unicode(end["name"]),
                        "startDepotCode": start['code']
                    }
                    yield scrapy.FormRequest(line_url,
                                             method="POST",
                                             formdata=data,
                                             callback=self.parse_line,
                                             meta={
                                                 "start": start,
                                                 "end": end,
                                                 "date": sdate
                                             })

    def parse_line(self, response):
        """Parse one ticket-list response into LineItem records."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["date"]
        # Mark this (start, destination, date) combination as crawled.
        self.mark_done(start['name'], end["name"], sdate)
        # The original wrapped this in ``except Exception, e: raise e``,
        # which only destroyed the traceback; let errors propagate as-is.
        res = json.loads(response.body)
        if res["akfAjaxResult"] != "0":
            # Non-zero result appears to mean "no shifts for this query".
            return
        for d in res["values"]["resultList"]:
            # stopFlag == '0' appears to mark shifts still on sale.
            if d['stopFlag'] == '0':
                attrs = dict(
                    s_province='福建',
                    s_city_name=start['name'],
                    s_city_id=start['code'],
                    s_city_code=get_pinyin_first_litter(start['name']),
                    s_sta_name=d["startDepotName"],
                    s_sta_id=d["startDepotCode"],
                    d_city_name=end["name"],
                    d_city_code=get_pinyin_first_litter(end["name"]),
                    d_city_id=end['code'],
                    d_sta_name=d["arrivalDepotName"],
                    d_sta_id=d["arrivalDepotCode"],
                    drv_date=d["departDate"],
                    drv_time=d["leaveTime"],
                    drv_datetime=dte.strptime(
                        "%s %s" % (d["departDate"], d["leaveTime"]),
                        "%Y-%m-%d %H:%M"),
                    distance="0",
                    vehicle_type="",
                    seat_type="",
                    bus_num=d["busCode"],
                    full_price=float(d["fullPrice"]),
                    half_price=float(d["fullPrice"]) / 2,
                    fee=0,
                    crawl_datetime=dte.now(),
                    extra_info={
                        "busCodeType": d["busCodeType"],
                        "regsName": d["regsName"],
                        "busCompanyCode": d["busCompanyCode"],
                        "s_code": start['code'],
                        'e_code': end['code']
                    },
                    left_tickets=int(d["remainSeats"]),
                    crawl_source="fjky",
                    shift_id="",
                )
                yield LineItem(**attrs)
Ejemplo n.º 27
0
    def parse_line(self, response):
        """Parse one bus100 shift-list page (HTML embedded in a JSON field)
        into LineItem records, following pagination to the last page."""
        trainListInfo = json.loads(response.body)
        if not trainListInfo:
            return
        start = response.meta["start"]
        end = response.meta["end"]
        crawl_province = response.meta["crawl_province"]
        crawl_city = response.meta["crawl_city"]
        payload = response.meta["payload"]
        sdate = payload['sendDate']
        # Per-route fields shared by every shift on this page; per-shift
        # fields are filled on a copy inside the loop below.
        item = LineItem()
        item['crawl_source'] = 'bus100'
        item['s_province'] = crawl_province['province_name']
        item['s_city_id'] = start['countyId']
        item['s_city_name'] = start['countyName']
        item['s_sta_id'] = start['countyId']
        start_short_name = start['pinyinPrefix']
        if not start_short_name or start_short_name == 'null':
            # Bug fix: the original read item['start_city_name'], a field
            # that was never set (the value lives under 's_city_name').
            start_short_name = get_pinyin_first_litter(item['s_city_name'])
        item['s_city_code'] = start_short_name
        item['d_city_name'] = end['portName']
        d_city_code = end['pinyinPrefix']
        if not d_city_code:
            # Fall back to deriving the code from pinyin first letters.
            d_city_code = "".join(
                map(
                    lambda x: x[0],
                    pypinyin.pinyin(unicode(end['portName']),
                                    style=pypinyin.FIRST_LETTER)))
        item['drv_date'] = sdate
        item['d_city_code'] = d_city_code

        nextPage = int(trainListInfo['nextPage'])
        pageNo = int(trainListInfo['pageNo'])
        sel = etree.HTML(trainListInfo['msg'])
        trains = sel.xpath('//div[@class="trainList"]')
        for n in trains:
            # The shift id is embedded in the data-list attribute as
            # "... id=<digits>, ...".
            d_str = n.xpath("@data-list")[0]
            d_str = d_str[d_str.index("id=") + 3:]
            shiftid = d_str[:d_str.index(",")]
            station = n.xpath('ul/li[@class="start"]/p/text()')
            time = n.xpath('ul/li[@class="time"]/p/strong/text()')
            banci = n.xpath(
                'ul/li[@class="time"]/p[@class="carNum"]/text()')
            if banci:
                banci = banci[0].replace('\r\n', '').replace(' ', '')
            else:
                ord_banci = n.xpath(
                    'ul/li[@class="time"]/p[@class="banci"]/text()')
                if ord_banci:
                    banci = ord_banci[0]
                else:
                    banci = ''
            price = n.xpath('ul/li[@class="price"]/strong/text()')
            infor = n.xpath(
                'ul/li[@class="carriers"]/p[@class="info"]/text()')
            distance = ''
            if infor:
                distance = infor[0].replace('\r\n', '').replace(' ', '')
            buyInfo = n.xpath('ul/li[@class="buy"]')
            flag = 0
            for buy in buyInfo:
                # Presence of the buy button marks the shift as purchasable.
                flag = buy.xpath('a[@class="btn"]/text()')
                if flag:
                    flag = 1
                else:
                    flag = 0
            # Fill per-shift fields on a copy so items already yielded are
            # not mutated by later iterations (the original reused one
            # shared Item instance for every yield).
            row = item.copy()
            row['drv_time'] = time[0]
            row['drv_datetime'] = datetime.datetime.strptime(
                sdate + ' ' + time[0], "%Y-%m-%d %H:%M")
            row['s_sta_name'] = station[0]
            row['d_sta_name'] = station[1]
            row['bus_num'] = banci.decode("utf-8").strip().rstrip(u"次")
            row["full_price"] = float(str(price[0]).split('¥')[-1])
            row["half_price"] = float(str(price[0]).split('¥')[-1]) / 2
            row['distance'] = distance
            row['shift_id'] = str(shiftid)
            row['crawl_datetime'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            row['vehicle_type'] = ''
            row['seat_type'] = ''
            row['fee'] = 0
            row['left_tickets'] = 50 if flag else 0
            row['extra_info'] = {"flag": flag}
            yield row

        if nextPage > pageNo:
            url = 'http://84100.com/getBusShift/ajax' + '?pageNo=%s' % nextPage
            yield scrapy.FormRequest(url,
                                     formdata=payload,
                                     callback=self.parse_line,
                                     meta={
                                         "payload": payload,
                                         'crawl_province': crawl_province,
                                         'crawl_city': crawl_city,
                                         'start': start,
                                         "end": end
                                     })
        elif nextPage and nextPage == pageNo:
            # Last page reached: mark the whole (start, dest, date) done.
            self.mark_done(start["countyName"], end['portName'], sdate)
Ejemplo n.º 28
0
class CBDSpider(SpiderBase):
    """Spider for the lvtu100 bus-ticket API.

    Every request is signed: get_request_data() merges the common fields
    and appends an md5 signature over the sorted non-empty parameters.
    """
    name = "lvtu100"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.Lvtu100HeaderMiddleware': 410,
        },
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://m.ctrip.com/restapi/busphp/app/index.php"

    def get_request_data(self, custom):
        """Return ``custom`` merged with the common API fields plus the
        md5 request signature the server expects."""
        data = {
            "appid": "lvtu100.andorid",
            "timestamp": str(int(time.time())),
            "format": "json",
            "version": "1.0",
        }
        data.update(custom)
        # Sign over the sorted keys whose values are non-empty.  sorted()
        # replaces the Py2-only filter(...) + list.sort() combination
        # (filter() is lazy on Py3 and has no .sort()).
        key_lst = sorted(k for k in data if data[k])
        data["sign"] = md5("".join("%s%s" % (k, data[k]) for k in key_lst) +
                           "0348ba1cbbfa0fa9ca627394e999fea5")
        return data

    def get_dest_list(self, province, city):
        """Overrides the parent implementation: fetch the reachable stop
        cities for (province, city) directly from the lvtu100 API."""
        url = "http://api.lvtu100.com/products/getstopcity"
        params = self.get_request_data({
            "startProvince": province,
            "startcityname": city
        })
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Linux; U; Android 2.3; en-us) AppleWebKit/999+ (KHTML, like Gecko) Safari/999.9",
            "Content-Type": "application/x-www-form-urlencoded",
        }
        r = requests.post(url, data=urllib.urlencode(params), headers=headers)
        ret = r.json()
        return map(
            lambda d: {
                "city_name": d["cityname"],
                "province": d["province"],
                "city_code": d["shortspell"]
            }, ret["data"]["resultList"])

    def start_requests(self):
        url = "http://api.lvtu100.com/products/get_allstartcity"
        params = self.get_request_data({})
        yield scrapy.FormRequest(url,
                                 formdata=params,
                                 callback=self.parse_starting)

    def parse_starting(self, response):
        """Schedule one goods-list request per (start city, stop city,
        date) for the upcoming days."""
        url = "http://api.lvtu100.com/products/getgoods"
        ret = json.loads(response.body)
        today = datetime.date.today()
        for city_info in ret["data"]:
            for d in city_info["lstcitys"]:
                province = d["province"]
                # Accept the province whether configured with or without
                # the trailing "省".
                if not self.is_need_crawl(
                        province=province) and not self.is_need_crawl(
                            province=province.rstrip(u"省")):
                    continue
                start = {
                    "city_id": d["startcityid"],
                    "city_code": d["shortspell"],
                    "city_name": d["cityname"],
                    "province": d["province"]
                }
                # "宝应" is explicitly excluded from crawling.
                if not self.is_need_crawl(
                        city=start["city_name"]) or start["city_name"] in [
                            "宝应"
                        ]:
                    continue

                for end in self.get_dest_list(province, start["city_name"]):
                    for i in range(self.start_day(), 8):
                        sdate = str(today + datetime.timedelta(days=i))
                        if self.has_done(start["city_name"],
                                         end["city_name"], sdate):
                            continue
                        params = {
                            "startprovince": start["province"],
                            "startcity": start["city_name"],
                            "departdate": sdate,
                            "fromstation": "",
                            "pagestring": '{"page":1,"pagesize":1024}',
                            "range": "",
                            "stopprovince": end["province"],
                            "stopcity": end["city_name"],
                        }
                        yield scrapy.FormRequest(
                            url,
                            formdata=self.get_request_data(params),
                            callback=self.parse_line,
                            meta={
                                "start": start,
                                "end": end,
                                "sdate": sdate
                            })

    def parse_line(self, response):
        """Parse one goods-list response into LineItem records."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["city_name"], end["city_name"], sdate)
        self.logger.info("start %s ==> %s" %
                         (start["city_name"], end["city_name"]))
        try:
            res = json.loads(response.body)
        except Exception:
            # Log the unparseable payload via the spider logger (was a
            # Py2-only ``print``), then re-raise with the full traceback.
            self.logger.error(response.body)
            raise
        if int(res["code"]) != 0:
            self.logger.error("parse_line: Unexpected return, %s" % res)
            return

        # Station details arrive in parallel lists keyed by productid.
        s_sta_info = {d["productid"]: d for d in res["data"]["stations"]}
        d_sta_info = {d["productid"]: d for d in res["data"]["stopstations"]}
        for d in res["data"]["flight"]["resultList"]:
            if int(d["islocked"]) == 1:
                # Locked shifts are not on sale.
                continue
            s_sta = s_sta_info[d["productid"]]
            d_sta = d_sta_info[d["productid"]]
            attrs = dict(
                # Bug fix: the suffix must be unicode -- province comes from
                # JSON as unicode, and rstrip("省") (a byte string under
                # Python 2) raised UnicodeDecodeError.  The sibling check in
                # parse_starting already used u"省".
                s_province=start["province"].rstrip(u"省"),
                s_city_id=start["city_id"],
                s_city_name=start["city_name"],
                s_sta_name=s_sta["stationname"],
                s_city_code=start["city_code"],
                s_sta_id=s_sta["stationid"],
                d_city_name=d_sta["stopcity"],
                d_city_id="",
                d_city_code=get_pinyin_first_litter(d_sta["stopcity"]),
                d_sta_id="",
                d_sta_name=d_sta["stationname"],
                drv_date=d["departdate"],
                drv_time=d["departtime"],
                drv_datetime=dte.strptime(
                    "%s %s" % (d["departdate"], d["departtime"]),
                    "%Y-%m-%d %H:%M"),
                distance=unicode(d.get("distance", "") or ""),
                vehicle_type=d.get("bustype", "") or "",
                seat_type="",
                bus_num=d["itemno"],
                full_price=float(d["price"]),
                half_price=float(d["price"]) / 2,
                fee=3,
                crawl_datetime=dte.now(),
                extra_info={
                    "goodsid": d["goodsid"],
                    "itemid": d["itemid"],
                    "startProvince": start["province"],
                    "stopprovince": end["province"],
                    "productid": d["productid"]
                },
                left_tickets=10,
                crawl_source="lvtu100",
                shift_id="",
            )
            yield LineItem(**attrs)
Ejemplo n.º 29
0
class TongChengSpider(SpiderBase):
    """Crawl intercity bus schedules from m.ly.com (TongCheng / LY.com).

    Enumerates a fixed list of departure cities in Jiangsu province, POSTs
    one schedule query per (start city, destination, departure date), and
    yields a ``LineItem`` for every bookable shift in the response.
    """

    name = "tongcheng"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            # Disable Scrapy's stock user-agent middleware; the project
            # middleware below supplies randomized browser user agents.
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.TongChengHeaderMiddleware': 410,
        },
        # "DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    # NOTE(review): base_url (a ctrip endpoint) is never referenced inside
    # this class — presumably consumed by SpiderBase; confirm before removing.
    base_url = "http://m.ctrip.com/restapi/busphp/app/index.php"

    def start_requests(self):
        """Generate one POST request per (start city, destination, date).

        Cities rejected by is_need_crawl() and (start, end, date) triples
        already recorded via has_done() are skipped.
        """
        # This endpoint belongs to the PC web page version of the site.
        line_url = "http://m.ly.com/bus/BusJson/BusSchedule"
        for name in [
                "苏州", "南京", "无锡", "常州", "南通", "张家港", "昆山", "吴江", "常熟", "太仓",
                "镇江", "宜兴", "江阴", "兴化", "盐城", "扬州", "连云港", "徐州", "宿迁"
        ]:
            if not self.is_need_crawl(city=name):
                continue
            self.logger.info("start crawl city %s", name)
            start = {"name": name, "province": "江苏"}
            for s in self.get_dest_list(start["province"], start["name"]):
                # Each destination entry is "name|short_pinyin".  NOTE: this
                # rebinds the outer loop variable `name`, which is harmless
                # only because `start` captured the city name above.
                name, code = s.split("|")
                end = {"name": name, "short_pinyin": code}
                self.logger.info("start %s ==> %s" %
                                 (start["name"], end["name"]))

                today = datetime.date.today()
                # Crawl departure dates from start_day() up to 7 days ahead.
                for i in range(self.start_day(), 8):
                    sdate = str(today + datetime.timedelta(days=i))
                    if self.has_done(start["name"], end["name"], sdate):
                        self.logger.info("ignore %s ==> %s %s" %
                                         (start["name"], end["name"], sdate))
                        continue
                    # PageSize=1025 requests all shifts in a single page.
                    params = dict(Departure=start["name"],
                                  Destination=end["name"],
                                  DepartureDate=sdate,
                                  DepartureStation="",
                                  DptTimeSpan=0,
                                  HasCategory="true",
                                  Category="0",
                                  SubCategory="",
                                  ExParms="",
                                  Page="1",
                                  PageSize="1025",
                                  BookingType="0")
                    yield scrapy.Request(line_url,
                                         method="POST",
                                         body=urllib.urlencode(params),
                                         callback=self.parse_line,
                                         meta={
                                             "start": start,
                                             "end": end,
                                             "sdate": sdate
                                         })

    def parse_line(self, response):
        """Parse the schedule JSON and yield a LineItem per bookable shift."""
        try:
            res = json.loads(response.body)
        except Exception, e:
            self.logger.error("%s %s", response.body, e)
            return
        res = res["response"]
        # rspCode 0 means success; any other code is silently dropped.
        if int(res["header"]["rspCode"]) != 0:
            # self.logger.error("parse_target_city: Unexpected return, %s" % res["header"])
            return
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        # Record this (start, end, date) triple so start_requests skips it
        # on later runs.
        self.mark_done(start["name"], end["name"], sdate)

        for d in res["body"]["schedule"]:
            if not d["canBooking"]:
                continue
            left_tickets = int(d["ticketLeft"])
            from_city = unicode(d["departure"])
            to_city = unicode(d["destination"])
            from_station = unicode(d["dptStation"])
            to_station = unicode(d["arrStation"])

            attrs = dict(
                s_province=start["province"],
                s_city_id="",
                s_city_name=from_city,
                s_sta_name=from_station,
                s_city_code=get_pinyin_first_litter(from_city),
                s_sta_id=d.get("dptStationCode", ""),
                d_city_name=to_city,
                d_city_id="",
                d_city_code=end["short_pinyin"],
                d_sta_id="",
                d_sta_name=to_station,
                drv_date=d["dptDate"],
                drv_time=d["dptTime"],
                drv_datetime=dte.strptime(
                    "%s %s" % (d["dptDate"], d["dptTime"]), "%Y-%m-%d %H:%M"),
                distance=unicode(d["distance"]),
                vehicle_type=d["coachType"],
                seat_type="",
                bus_num=d["coachNo"],
                full_price=float(d["ticketPrice"]),
                # Half-price fare is derived, not supplied by the API.
                half_price=float(d["ticketPrice"]) / 2,
                fee=float(d["ticketFee"]),
                crawl_datetime=dte.now(),
                extra_info={},
                left_tickets=left_tickets,
                crawl_source="tongcheng",
                shift_id="",
            )
            yield LineItem(**attrs)
# Ejemplo n.º 30
# 0
    def parse_line(self, response):
        """Parse one page of xintuyun.cn bus-shift listings.

        The response is JSON whose ``msg`` field carries an HTML fragment
        containing one ``div.trainList`` per shift.  Yields a ``LineItem``
        for each shift that has a buy button, follows pagination, and marks
        the (start, end, date) triple done once the last page is reached.
        """
        trainListInfo = json.loads(response.body)
        if not trainListInfo:
            return
        start = response.meta["start"]
        end = response.meta["end"]
        crawl_province = response.meta["crawl_province"]
        crawl_city = response.meta["crawl_city"]
        payload = response.meta["payload"]
        sdate = payload['sendDate']
        nextPage = int(trainListInfo['nextPage'])
        pageNo = int(trainListInfo['pageNo'])
        content = trainListInfo['msg']
        if not isinstance(content, unicode):
            content = content.decode('utf-8')
        sel = etree.HTML(content)
        trains = sel.xpath('//div[@class="trainList"]')
        for n in trains:
            flag = 0
            # A shift is bookable only when the "buy" button anchor exists.
            buyInfo = n.xpath('ul/li[@class="buy"]/a[@class="btn"]/text()')
            if not buyInfo:
                continue
            # Shift id and seats left are embedded in the data-list
            # attribute, e.g. "...id=123,...leftSeatNum=45,...".
            d_str = n.xpath("@data-list")[0]
            shift_str = d_str[d_str.index("id=") + 3:]
            left_str = d_str[d_str.index("leftSeatNum=") + 12:]
            shiftid = shift_str[:shift_str.index(",")]
            leftSeatNum = left_str[:left_str.index(",")]
            station = n.xpath('ul/li[@class="start"]/p/text()')
            time = n.xpath('ul/li[@class="time"]/p/strong/text()')
            # BUG FIX: the original left bus_num as an empty *list* when the
            # carNum node was missing, so bus_num.decode("utf-8") below
            # raised AttributeError.  Fall back to '' instead.
            bus_num_nodes = n.xpath(
                'ul/li[@class="time"]/p[@class="carNum"]/text()')
            if bus_num_nodes:
                bus_num = bus_num_nodes[0].replace('\r\n',
                                                   '').replace(' ', '')
            else:
                bus_num = ''
            price = n.xpath('ul/li[@class="price"]/strong/text()')
            flag = 1
            attrs = dict(
                s_province=crawl_province['province_name'],
                s_city_name=start['countyName'],
                s_city_id=start['countyId'],
                s_city_code=get_pinyin_first_litter(
                    start['countyName']),
                s_sta_name=station[0],
                s_sta_id=start['countyId'],
                d_city_name=end['portName'],
                d_city_code=get_pinyin_first_litter(end['portName']),
                d_city_id='',
                d_sta_name=station[1],
                d_sta_id='',
                drv_date=sdate,
                drv_time=time[0],
                drv_datetime=dte.strptime("%s %s" % (sdate, time[0]),
                                          "%Y-%m-%d %H:%M"),
                distance=0,
                vehicle_type="",
                seat_type='',
                # Strip the trailing "次" (shift-number suffix).
                bus_num=bus_num.decode("utf-8").strip().rstrip(u"次"),
                full_price=float(str(price[0]).split('¥')[-1]),
                # Half-price fare is derived, not supplied by the page.
                half_price=float(str(price[0]).split('¥')[-1]) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={"flag": flag},
                left_tickets=int(leftSeatNum),
                crawl_source="xintuyun",
                shift_id=shiftid,
            )
            yield LineItem(**attrs)

        if nextPage > pageNo:
            # More pages remain: request the next one with the same payload.
            url = 'http://www.xintuyun.cn/getBusShift/ajax' + '?pageNo=%s' % nextPage
            yield scrapy.FormRequest(url,
                                     formdata=payload,
                                     callback=self.parse_line,
                                     meta={
                                         "payload": payload,
                                         'crawl_province': crawl_province,
                                         'crawl_city': crawl_city,
                                         'start': start,
                                         "end": end
                                     })
        elif nextPage and nextPage == pageNo:
            # Last page reached: remember this route/date as crawled.
            self.mark_done(start["countyName"], end['portName'], sdate)