# -*- coding: utf-8 -*-
# Imports assumed by the snippets in this section (Python 2 / Scrapy).
# Project helpers such as LineItem, SpiderBase, get_pinyin_first_litter,
# HEADERS, C2P, CITY_TO_PROVINCE and trans_js_str are defined elsewhere
# in the repo.
import re
import json
import urllib
import datetime
from datetime import datetime as dte
import xml.etree.ElementTree as ET

import scrapy
import pypinyin
import requests
from lxml import etree
from bs4 import BeautifulSoup

bs = BeautifulSoup


def parse_line(self, response):
    city_name = response.meta['city_name']
    station_name = response.meta['station_name']
    s_station_name = response.meta['s_station_name']
    end = response.meta['end']
    sdate = response.meta['date']
    self.mark_done(station_name, end['city_name'], sdate)
    soup = bs(response.body, 'lxml')
    scl_list = soup.find('table', attrs={'id': 'ContentPlaceHolder1_GridViewbc'})
    if not scl_list:
        return
    scl_list = scl_list.find_all('tr', attrs={'style': True})
    for x in scl_list[1:]:
        y = x.find_all('td')
        ticket_status = y[3].get_text().strip()
        s_d_city_name = end['city_name']
        d_city_name = re.sub("[A-Za-z]", "", s_d_city_name)
        if ticket_status == u"有票":  # "tickets available"
            drv_date = sdate
            bus_num = y[1].get_text().strip()
            drv_time = y[2].get_text().strip()
            distance = y[4].get_text().strip()
            vehicle_type = y[5].get_text().strip()  # get_text() is already unicode
            full_price = y[6].get_text().strip()
            s_sta_name = y[7].get_text().strip()
            attrs = dict(
                s_province='山东',
                s_city_id="",
                s_city_name=city_name,
                s_city_code=get_pinyin_first_litter(unicode(city_name)),
                s_sta_name=station_name,
                s_sta_id='',
                d_city_name=d_city_name,
                d_city_id=end['city_id'],
                d_city_code=get_pinyin_first_litter(unicode(d_city_name)),
                d_sta_id='',
                d_sta_name=s_sta_name,
                drv_date=drv_date,
                drv_time=drv_time,
                drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time), "%Y-%m-%d %H:%M"),
                distance=distance,
                vehicle_type=vehicle_type,
                seat_type="",
                bus_num=bus_num,
                full_price=float(full_price),
                half_price=float(full_price) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    's_station_name': s_station_name,
                    's_d_city_name': s_d_city_name,
                },
                left_tickets=45,
                crawl_source="qdky",
                shift_id="",
            )
            yield LineItem(**attrs)
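
# Nearly every snippet below derives city codes with get_pinyin_first_litter.
# The helper itself is not part of this excerpt; a minimal sketch, assuming it
# matches the inline pypinyin fallback the bus100 spider uses further down:
def _get_pinyin_first_litter_sketch(name):
    # One initial per syllable, e.g. u"呼和浩特" -> "hhht".
    return "".join(p[0] for p in pypinyin.pinyin(unicode(name), style=pypinyin.FIRST_LETTER))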
def parse_line(self, response): "解析班车" start = response.meta["start"] end= response.meta["end"] sdate = response.meta["sdate"] self.mark_done(start["sta_name"], end["city_name"], sdate) soup = BeautifulSoup(response.body.replace("<!--", "").replace("-->", ""), "lxml") for e in soup.findAll("tr"): lst = e.findAll("td") if not lst: continue # td>MK0041</td> # <td>泰州南站<br/>南京</td> # <td>2016-04-08</td> # <td><span class="lv_time">05:45</span></td> # <td>大型高一</td> # <td>51</td> # <td><span class="tk_price">56</span></td> # <td><span class="lv_time">17</span></td> # <td>途径南京东站,终点站 : 南京南站</td> # <td><a href="#" onclick="if(buy_confirm('MK0041')) window.location.href='/index.php/busOrder/index/czozNTg6InsiQkNIIjoiTUswMDQxIiwiU0NaTUMiOiJcdTZjZjBcdTVkZGVcdTUzNTdcdTdhZDkiLCJTRlpETSI6IjllZDIzNDU0YjNlYWVlNDQzODEyMWJlOWM2NGNiNmUyIiwiRERaTUMiOiJcdTUzNTdcdTRlYWMiLCJERFpETSI6IjBlMmYwY2U4YmQ5MDk1NThkYWViMjFjNTUyMGI3M2NhIiwiWkRaRE0iOiIwMDAwMDAwMDAiLCJTQ1pETSI6IjllZDIzNDU0YjNlYWVlNDQzODEyMWJlOWM2NGNiNmUyIiwiWkRaTUMiOiJcdTUzNTdcdTRlYWMiLCJGQ1JRIjoiMjAxNi0wNC0wOCIsIkZDU0oiOiIwNTo0NSIsIkNYIjoiXHU1OTI3XHU1NzhiXHU5YWQ4XHU0ZTAwIiwiWVBTIjoiMTciLCJIRFpXIjoiNTEiLCJRUEoiOiI1NiIsIlRQSiI6IjI4In0iOw~~';" class="buy_btn" title="购票">购票</a></td> bus_num = lst[0].text.strip() drv_date = lst[2].text.strip() drv_time = lst[3].text.strip() bus_type = lst[4].text.strip() price = float(lst[6].text.strip()) left_tickets = int(lst[7].text.strip()) lock_form_url = re.findall(r"href='(\S+)'", lst[9].select_one("a").get("onclick"))[0] attrs = dict( s_province = "江苏", s_city_id = "", s_city_name = "泰州", s_sta_name = start["sta_name"], s_city_code= "tz", s_sta_id= "", d_city_name = end["city_name"], d_city_id=end["city_id"], d_city_code=end["city_code"], d_sta_id="", d_sta_name=end["city_name"], drv_date=drv_date, drv_time=drv_time, drv_datetime = dte.strptime("%s %s" % (drv_date, drv_time), "%Y-%m-%d %H:%M"), distance = "0", vehicle_type = bus_type, seat_type = "", bus_num = bus_num, full_price = price, half_price = price/2, fee = 0, crawl_datetime = dte.now(), extra_info = {"lock_form_url": lock_form_url}, left_tickets = left_tickets, crawl_source = "tzky", shift_id="", ) yield LineItem(**attrs)
def parse_line(self, response):
    if response.body.strip() in ["[]", "0"]:  # empty result
        return
    res_lst = json.loads(response.body)
    start = response.meta['start']
    end = response.meta['end']
    sdate = response.meta['sdate']
    self.mark_done(start, end, sdate)
    self.logger.info("finish %s==>%s %s", start["name"], end["name"], sdate)
    for x in res_lst:
        drv_date = x['bpnDate']
        drv_time = x['bpnSendTime']
        s_sta_name = x['shifazhan']
        s_sta_id = x['siID']
        d_sta_name = x['prtName']
        left_tickets = x['bpnLeftNum']
        vehicle_type = x['btpName']
        extra = {
            'sid': x['siID'],
            'dpid': x['prtID'],
            'l': x['bliID'],
            't': x['bpnDate'],
        }
        bus_num = x['bliID']
        full_price = x['prcPrice']
        attrs = dict(
            s_province=start["province"],
            s_city_id=start["id"],
            s_city_name=start["name"],
            s_sta_name=s_sta_name,  # not entirely sure about this field
            s_city_code=start["code"],
            s_sta_id=s_sta_id,
            d_city_name=end["name"],
            d_city_id=end["dest_id"],
            d_city_code=end["code"],
            d_sta_id="",
            d_sta_name=d_sta_name,
            drv_date=drv_date,
            drv_time=drv_time,
            drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time), "%Y-%m-%d %H:%M"),
            distance='',
            vehicle_type=vehicle_type,
            seat_type="",
            bus_num=bus_num,
            full_price=float(full_price),
            half_price=float(full_price) / 2,
            fee=0.0,
            crawl_datetime=dte.now(),
            extra_info=extra,
            left_tickets=int(left_tickets),
            crawl_source="sd365",
            shift_id="",
        )
        if int(left_tickets):
            yield LineItem(**attrs)
def parse_line(self, response): "解析班车" start = response.meta["start"] end = response.meta["end"] sdate = response.meta["sdate"] res = json.loads(response.body) if res["rtn_code"] != "00": self.logger.error("parse_line: Unexpected return, %s", res) return shift_list = res["data"] or [] self.mark_done(start["sta_name"], end["city_name"], sdate) for d in shift_list: drv_datetime = dte.strptime( "%s %s" % (d["drive_date"], d["plan_time"]), "%Y%m%d %H%M") s_sta_name = d["rst_name"] if u"��" in d["dst_name"]: continue if u"��" in s_sta_name: # 有乱码 s_sta_name = start["sta_name"] attrs = dict( s_province="江苏", s_city_id="", s_city_name=start["city_name"], s_sta_name=d["rst_name"], s_city_code=start["city_code"], s_sta_id=d["rstcode"], d_city_name=end["city_name"], d_city_id="", d_city_code=end["city_code"], d_sta_id=d["dstcode"], d_sta_name=d["dst_name"], drv_date=drv_datetime.strftime("%Y-%m-%d"), drv_time=drv_datetime.strftime("%H:%M"), drv_datetime=drv_datetime, distance=unicode(d["mileage"]), vehicle_type=d["m_name"], seat_type="", bus_num=d["bus_code"], full_price=float(d["full_price"]), half_price=float(d["half_price"]), fee=0, crawl_datetime=dte.now(), extra_info={ "startstation": d["sst_name"], "terminalstation": d["tst_name"], "startstationcode": d["sstcode"] }, left_tickets=int(d["available_tickets"]), crawl_source="jsdlky", shift_id="", ) yield LineItem(**attrs)
def parse_line(self, response): "解析班车" start = response.meta["start"] end = response.meta["end"] end_code = response.meta["end_code"] sdate = response.meta["date"] self.mark_done(start, end, sdate) content = response.body if not isinstance(content, unicode): content = content.decode('utf-8') sel = etree.HTML(content) scheduleList = sel.xpath( '//div[@id="visitorDataTable"]/table/tbody/tr') if scheduleList: for i in scheduleList[1:]: bus_num = i.xpath('td[1]/text()')[0] start_station = i.xpath('td[2]/text()')[0] end_station = i.xpath('td[2]/text()')[0] drv_time = i.xpath('td[5]/span[@class="lv_time"]/text()')[0] price = i.xpath('td[8]/span[@class="tk_price"]/text()')[0] left_tickets = i.xpath('td[9]/span/text()')[0] postdata = i.xpath('td[10]/a/@onclick')[0].split(',')[1][1:-3] attrs = dict( s_province='内蒙古', s_city_name=u"呼和浩特", s_city_id='', s_city_code=get_pinyin_first_litter(u"呼和浩特"), s_sta_name=start, s_sta_id='', d_city_name=end, d_city_code=get_pinyin_first_litter(end), d_city_id='', d_sta_name=end, d_sta_id=end_code, drv_date=sdate, drv_time=drv_time, drv_datetime=dte.strptime("%s %s" % (sdate, drv_time), "%Y-%m-%d %H:%M"), distance="0", vehicle_type="", seat_type="", bus_num=bus_num, full_price=float(price), half_price=float(price) / 2, fee=0, crawl_datetime=dte.now(), extra_info={"postdata": postdata}, left_tickets=int(left_tickets), crawl_source="nmghy", shift_id="", ) yield LineItem(**attrs)
def parse_line(self, response):
    start = response.meta['start']
    end = response.meta['end']
    sdate = response.meta['sdate'].decode('utf-8')
    self.mark_done(start["sta_name"], end["name"], sdate)
    self.logger.info("finish %s=>%s, %s", start["sta_name"], end["name"], sdate)
    soup = bs(response.body, 'lxml')
    for tr_o in soup.select("table #selbuy"):
        td_lst = tr_o.find_all('td')
        if len(td_lst) < 2:
            continue
        index_tr = lambda idx: td_lst[idx].text.strip()  # .text is already unicode
        drv_date, drv_time = sdate, index_tr(1)
        if u"流水" in drv_time:  # skip rolling (non-scheduled) departures
            continue
        attrs = dict(
            s_province='山东',
            s_city_id="",
            s_city_name=start["city_name"],
            s_sta_name=start["sta_name"],
            s_city_code=start["city_code"],
            s_sta_id=start["sta_id"],
            d_city_name=end["name"],
            d_city_id=end["dest_id"],
            d_city_code=end["code"],
            d_sta_id=end["dest_id"],
            d_sta_name=end["name"],
            drv_date=drv_date,
            drv_time=drv_time,
            drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time), "%Y-%m-%d %H:%M"),
            distance='',
            vehicle_type=index_tr(4),
            seat_type="",
            bus_num=index_tr(0),
            full_price=float(index_tr(6)),
            half_price=float(index_tr(6)) / 2,
            fee=0.0,
            crawl_datetime=dte.now(),
            extra_info={"startNo": index_tr(11)},
            left_tickets=int(index_tr(10)),
            crawl_source="glcx",
            shift_id="",
        )
        if attrs["left_tickets"]:
            yield LineItem(**attrs)
def parse_line(self, response): start = response.meta["start"] end = response.meta["end"] sdate = response.meta["sdate"] content = response.body if not isinstance(content, unicode): content = content.decode('utf-8') self.mark_done(start, end[0], sdate) sel = etree.HTML(content) scheduleInfo = sel.xpath('//input[@id="scheduleInfoJson"]/@value') if scheduleInfo: scheduleInfo = json.loads(scheduleInfo[0]) for d in scheduleInfo: if not isinstance(d, dict): continue if int(d['seatLast']) == 0: continue if float(d["price"]) < 5: continue attrs = dict( s_province='辽宁', s_city_name=start, s_city_id='', s_city_code=get_pinyin_first_litter(start), s_sta_name=d['fromStation'], s_sta_id='', d_city_name=end[0], d_city_code=get_pinyin_first_litter(end[0]), d_city_id='', d_sta_name=d['toStation'], d_sta_id='', drv_date=sdate, drv_time=d["driveTime"], drv_datetime=dte.strptime( "%s %s" % (sdate, d["driveTime"]), "%Y-%m-%d %H:%M"), distance="0", vehicle_type="", seat_type="", bus_num=d['trainNumber'], full_price=float(d["price"]), half_price=float(d["price"]) / 2, fee=0, crawl_datetime=dte.now(), extra_info={'lineNo': d['lineNo']}, left_tickets=int(d["seatLast"]), crawl_source="lnky", shift_id='', ) yield LineItem(**attrs)
def parse_line(self, response): "解析班车" start = response.meta["start"] end = response.meta["end"] sdate = response.meta["date"] res = json.loads(response.body) if res["code"] != 1100: # self.logger.error("parse_line: Unexpected return, %s", res["message"]) return self.mark_done(start["city_name"], end["city_name"], sdate) for d in res["data"]: if d["stationCode"] == "7FC222B8-A1EA-42E3-B242-D1CFA3AF28C1": # 过滤郑州非汽车站内票 continue drv_datetime = dte.strptime("%s %s" % (d["dptDate"], d["dptTime"]), "%Y-%m-%d %H:%M:%S") attrs = dict( s_province=start["province"], s_city_name=start["city_name"], s_city_id=start["city_id"], s_city_code=start["city_code"], s_sta_name=d["dptStation"], s_sta_id=d["stationCode"], d_city_name=end["city_name"], d_city_code=end["city_code"], d_city_id=end["city_id"], d_sta_name=d["arrStation"], d_sta_id="", drv_date=drv_datetime.strftime("%Y-%m-%d"), drv_time=drv_datetime.strftime("%H:%M"), drv_datetime=drv_datetime, distance="0", vehicle_type=d["coachType"], seat_type="", bus_num=d["coachNo"], full_price=float(d["ticketPrice"]), half_price=float(d["ticketPrice"]) / 2, fee=float(d["fee"]), crawl_datetime=dte.now(), extra_info={ "exData1": d["exData1"], "exData2": d["exData2"] }, left_tickets=int(d["ticketLeft"] or 0), crawl_source="fangbian", shift_id="", ) yield LineItem(**attrs)
def parse_line(self, response): start = response.meta['start'] end = response.meta['end'] sdate = response.meta['sdate'] self.mark_done(start["sta_name"], end["name"], sdate) soup = bs(response.body, 'lxml') for tr_o in soup.select("#ctl00_ContentPlaceHolder1_GVBccx tr")[1:]: if tr_o.get("class") and "GridViewHeaderStyle" in tr_o.get( "class"): continue td_lst = tr_o.select("td") index_tr = lambda idx: td_lst[idx].text.strip() drv_date, drv_time = index_tr(0), index_tr(5) if u"流水" in drv_time: continue attrs = dict( s_province='江苏', s_city_id=start["city_id"], s_city_name=start["city_name"], s_sta_name=index_tr(1), s_city_code=start["city_code"], s_sta_id=start["sta_id"], d_city_name=end["name"], d_city_id="", d_city_code=end["code"], d_sta_id="", d_sta_name=index_tr(3), drv_date=drv_date, drv_time=drv_time, drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time), "%Y-%m-%d %H:%M"), distance=unicode(index_tr(11)), vehicle_type=unicode(index_tr(10)), seat_type="", bus_num=index_tr(2), full_price=float(index_tr(6)), half_price=float(index_tr(7)), fee=0, crawl_datetime=dte.now(), extra_info={"lock_url": td_lst[12].find("a").get("href")}, left_tickets=int(index_tr(8)), crawl_source="xyjt", shift_id="", ) yield LineItem(**attrs)
def parse_line(self, response): "解析班车" start = response.meta["start"] end = response.meta["end"] sdate = response.meta["sdate"] res = json.loads(response.body) self.mark_done(start["city_name"], end["stopName"], sdate) res = res['detail'] for d in res: if int(d['seatAmount']) == 0: continue if d['carrStaName'] != u"八王坟": continue attrs = dict( s_province='北京', s_city_name="北京", s_city_id='', s_city_code=get_pinyin_first_litter(u"北京"), s_sta_name=d["carrStaName"], s_sta_id=d["carryStaId"], d_city_name=end['stopName'], d_city_code=get_pinyin_first_litter(end['stopName']), d_city_id=end['stopId'], d_sta_name=d["endstaName"], d_sta_id='', drv_date=sdate, drv_time=d['drvTime'], drv_datetime=dte.strptime("%s %s" % (sdate, d['drvTime']), "%Y-%m-%d %H:%M"), distance="0", vehicle_type="", seat_type="", bus_num=d['scheduleId'], full_price=float(d['fullPrice']), half_price=float(d['fullPrice']) / 2, fee=0, crawl_datetime=dte.now(), extra_info={}, left_tickets=int(d['seatAmount']), crawl_source="e8s", shift_id='', ) yield LineItem(**attrs)
def parse_line(self, response): "解析班车" res = json.loads(response.body) start = response.meta["start"] end = response.meta["end"] sdate = response.meta["sdate"] self.mark_done(start["city_name"], end["city_name"], sdate) for d in res['body']["ticketLines"]: drv_datetime = dte.strptime(d["drvDateTime"], "%Y-%m-%d %H:%M") drv_date, drv_time = d["drvDateTime"].split(" ") if int(d["amount"]) == 0 or d["schTypeId"] != "0": #过滤不是固定班的班次 continue attrs = dict( s_province=start["province"], s_city_name=start["city_name"], s_city_id=start["city_id"], s_city_code=start["city_code"], s_sta_name=d["carryStaName"], s_sta_id=d["carryStaId"], d_city_name=d["stopName"], d_city_id="", d_city_code=end["city_code"], d_sta_name=d["stopName"], d_sta_id="", drv_date=drv_date, drv_time=drv_time, drv_datetime=drv_datetime, distance=d["mile"], vehicle_type=d["busTypeName"], seat_type="", bus_num=d["schId"], full_price=float(d["fullPrice"]), half_price=float(d["halfPrice"]), fee = float(d["servicePrice"]), crawl_datetime = dte.now(), extra_info = {"sign_id": d["signId"],'end':end}, left_tickets = int(d["amount"]), crawl_source = "scqcp", shift_id="", ) yield LineItem(**attrs)
def parse_line(self, response): "解析班车" start = response.meta["start"] end = response.meta["end"] sdate = response.meta["sdate"] res = json.loads(response.body) self.mark_done(start["findname"], end['city_name'], sdate) for d in res['schedules']: if int(d['iscansell']) != 1: continue if float(d['fullprice']) < 11: continue attrs = dict( s_province = start['province'], s_city_name = start['findname'], s_city_id = start['id'], s_city_code= get_pinyin_first_litter(start['findname']), s_sta_name= d["busshortname"], s_sta_id = d["stationorgid"], d_city_name = d["stationname"], d_city_code= get_pinyin_first_litter(d["stationname"]), d_city_id = d['stationid'], d_sta_name = d["stationname"], d_sta_id = '', drv_date = sdate, drv_time = d['departtime'][0:-3], drv_datetime = dte.strptime("%s %s" % (sdate, d['departtime'][0:-3]), "%Y-%m-%d %H:%M"), distance = d["rundistance"], vehicle_type = "", seat_type = d['seattype'], bus_num = d['schedulecode'], full_price = float(d['fullprice']), half_price = float(d['fullprice'])/2, fee = 3, crawl_datetime = dte.now(), extra_info = {'start_info':start}, left_tickets = int(d['residualnumber']), crawl_source = "bus365", shift_id=d['id'], ) yield LineItem(**attrs)
def parse_line(self, response): "解析班车" start = response.meta["start"] end = response.meta["end"] sdate = response.meta["date"] self.logger.info("finish %s ==> %s" % (start, end["city_name"])) self.mark_done(start, end['city_name'], sdate) res = json.loads(trans_js_str(response.body)) for d in res["data"]: if d['SchStat'] == '1': attrs = dict( s_province=u'广东', s_city_name=u"深圳", s_city_id='', s_city_code=get_pinyin_first_litter(u"深圳"), s_sta_name=d["SchWaitStName"], s_sta_id=d["SchStationCode"], d_city_name=end['city_name'], d_city_code=end['city_code'], d_city_id=d['SchDstNode'], d_sta_name=d["SchNodeName"], d_sta_id=d["SchNodeCode"], drv_date=d["SchDate"], drv_time=d["orderbytime"], drv_datetime=dte.strptime( "%s %s" % (d["SchDate"], d["orderbytime"]), "%Y-%m-%d %H:%M"), distance="0", vehicle_type="", seat_type="", bus_num=d["SchLocalCode"], full_price=float(d["SchStdPrice"]), half_price=float(d["SchStdPrice"]) / 2, fee=0, crawl_datetime=dte.now(), extra_info={"raw_info": d}, left_tickets=int(d["SchTicketCount"]), crawl_source="szky", shift_id="", ) yield LineItem(**attrs)
def parse_line(self, response): "解析班车" start = response.meta["start"] end = response.meta["end"] sdate = response.meta["sdate"] self.logger.info("finish %s ==> %s" % (start, end)) self.mark_done(start, end, sdate) res = json.loads(response.body) sch_list = res['flightList'] for d in sch_list: attrs = dict( s_province='上海', s_city_name=start, s_city_id='', s_city_code=get_pinyin_first_litter(unicode(start)), s_sta_name=d['stationName'], s_sta_id=d['stationId'], d_city_name=d['arriveRegionName'], d_city_code=get_pinyin_first_litter(d['arriveRegionName']), d_city_id=d['arriveRegionId'], d_sta_name=d['arriveRegionName'], d_sta_id='', drv_date=sdate, drv_time=d['flightTime'], drv_datetime=dte.strptime("%s %s" % (sdate, d['flightTime']), "%Y-%m-%d %H:%M"), distance='0', vehicle_type="", seat_type='', bus_num=d['flightNo'], full_price=float(d['price']), half_price=float(d['halfPrice']), fee=0, crawl_datetime=dte.now(), extra_info={"raw_info": d}, left_tickets=int(d['lastCount']), crawl_source="shkyzz", shift_id='', ) yield LineItem(**attrs)
def parse_line(self, response): trainListInfo = json.loads(response.body) if trainListInfo: start = response.meta["start"] end = response.meta["end"] crawl_province = response.meta["crawl_province"] crawl_city = response.meta["crawl_city"] payload = response.meta["payload"] sdate = payload['sendDate'] item = LineItem() item['crawl_source'] = 'bus100' item['s_province'] = crawl_province['province_name'] item['s_city_id'] = start['countyId'] item['s_city_name'] = start['countyName'] item['s_sta_id'] = start['countyId'] start_short_name = start['pinyinPrefix'] if not start_short_name or start_short_name == 'null': start_short_name = get_pinyin_first_litter( item['start_city_name']) item['s_city_code'] = start_short_name item['d_city_name'] = end['portName'] d_city_code = end['pinyinPrefix'] if not d_city_code: d_city_code = "".join( map( lambda x: x[0], pypinyin.pinyin(unicode(end['portName']), style=pypinyin.FIRST_LETTER))) item['drv_date'] = sdate item['d_city_code'] = d_city_code nextPage = int(trainListInfo['nextPage']) pageNo = int(trainListInfo['pageNo']) # print m['msg'] sel = etree.HTML(trainListInfo['msg']) trains = sel.xpath('//div[@class="trainList"]') for n in trains: d_str = n.xpath("@data-list")[0] d_str = d_str[d_str.index("id=") + 3:] shiftid = d_str[:d_str.index(",")] station = n.xpath('ul/li[@class="start"]/p/text()') time = n.xpath('ul/li[@class="time"]/p/strong/text()') # print 'time->',time[0] banci = '' banci = n.xpath( 'ul/li[@class="time"]/p[@class="carNum"]/text()') if banci: banci = banci[0].replace('\r\n', '').replace(' ', '') else: ord_banci = n.xpath( 'ul/li[@class="time"]/p[@class="banci"]/text()') if ord_banci: banci = ord_banci[0] price = n.xpath('ul/li[@class="price"]/strong/text()') # print 'price->',price[0] infor = n.xpath( 'ul/li[@class="carriers"]/p[@class="info"]/text()') distance = '' if infor: distance = infor[0].replace('\r\n', '').replace(' ', '') buyInfo = n.xpath('ul/li[@class="buy"]') flag = 0 for buy in buyInfo: flag = buy.xpath('a[@class="btn"]/text()') #判断可以买票 if flag: flag = 1 else: flag = 0 item['drv_time'] = time[0] item['drv_datetime'] = datetime.datetime.strptime( sdate + ' ' + time[0], "%Y-%m-%d %H:%M") item['s_sta_name'] = station[0] item['d_sta_name'] = station[1] item['bus_num'] = banci.decode("utf-8").strip().rstrip(u"次") item["full_price"] = float(str(price[0]).split('¥')[-1]) item["half_price"] = float(str(price[0]).split('¥')[-1]) / 2 item['distance'] = distance item['shift_id'] = str(shiftid) item['crawl_datetime'] = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') item['vehicle_type'] = '' item['seat_type'] = '' item['fee'] = 0 item['left_tickets'] = 50 if flag else 0 item['extra_info'] = {"flag": flag} yield item if nextPage > pageNo: url = 'http://84100.com/getBusShift/ajax' + '?pageNo=%s' % nextPage yield scrapy.FormRequest(url, formdata=payload, callback=self.parse_line, meta={ "payload": payload, 'crawl_province': crawl_province, 'crawl_city': crawl_city, 'start': start, "end": end }) elif nextPage and nextPage == pageNo: self.mark_done(start["countyName"], end['portName'], sdate)
def parse_line(self, response):
    s_city_name = response.meta['city'].decode('utf-8')
    start = response.meta['start'].decode('utf-8')
    end = response.meta['end'].decode('utf-8')
    sdate = response.meta['sdate'].decode('utf-8')
    self.mark_done(start, end, sdate)
    soup = bs(response.body, 'lxml')
    info = soup.find('table', attrs={'class': 'resulttb'}).find_all(
        'tbody', attrs={'class': 'rebody'})
    for x in info:
        try:
            bus_num = x.find('td', attrs={'align': 'center'}).get_text().strip()
            s_sta_name = x.find_all('td')[1].get_text().split()[0]
            d_city_name = x.find_all('td')[1].get_text().split()[1]
            drv_date = x.find_all('td')[2].get_text().strip()
            drv_time = x.find_all('td')[3].get_text().strip()
            # d_sta_name = x.find_all('td')[4].get_text().strip()
            distance = x.find_all('td')[5].get_text().strip()
            vehicle_type = x.find_all('td')[6].get_text().strip()
            full_price = x.find_all('td')[7].get_text().strip()
            left_tickets = int(x.find_all('td')[8].get_text().strip())
            # the booking link's query string carries the ids needed later
            y = x.find_all('td')[9].a.get('href').split('?')[-1]
            extra = {}
            for z in y.split('&'):
                extra[z.split('=')[0]] = z.split('=')[1]
            attrs = dict(
                s_province='河南',
                s_city_id="",
                s_city_name=s_city_name,
                s_sta_name=s_sta_name,
                s_city_code=get_pinyin_first_litter(s_city_name),
                s_sta_id=extra['g'],
                d_city_name=d_city_name,
                d_city_id="",
                d_city_code=get_pinyin_first_litter(d_city_name),
                d_sta_id="",
                d_sta_name=d_city_name,
                drv_date=drv_date,
                drv_time=drv_time,
                drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time), "%Y-%m-%d %H:%M"),
                distance=unicode(distance),
                vehicle_type=vehicle_type,
                seat_type="",
                bus_num=bus_num,
                full_price=float(full_price),
                half_price=float(full_price) / 2,
                fee=0.0,
                crawl_datetime=dte.now(),
                extra_info=extra,
                left_tickets=left_tickets,
                crawl_source="hn96520",
                shift_id="",
            )
            yield LineItem(**attrs)
        except:
            # malformed row; skip it
            pass
class SzkyWapSpider(SpiderBase):
    name = "szky_wap"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.SzkyHeaderMiddleware': 410,
        },
        # "DOWNLOAD_DELAY": 0.1,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def start_requests(self):
        start_url = "http://www.vchepiao.cn/mb/base/bus/queryNewSKY"
        station_dict = {
            "B1K003": "福田汽车客运站",
            "B1K002": "深圳湾客运服务点",
            "B1K004": "南山汽车客运站",
            "B1K005": "盐田汽车客运站",
            "B1K006": "东湖汽车客运站",
            "B2K037": "深圳北汽车客运站",
            "B1K010": "皇岗汽车客运站",
            "B2K040": "机场汽车客运站",
        }
        for k, v in station_dict.items():
            data = {
                "stationCode": k,
            }
            yield scrapy.FormRequest(start_url, method="POST", formdata=data,
                                     callback=self.parse_target_city,
                                     meta={"start_code": k})

    def parse_target_city(self, response):
        start_code = response.meta["start_code"]
        res = json.loads(response.body)
        if not res["success"]:
            self.logger.error("parse_target_city: Unexpected return, %s", res)
            return
        line_url = "http://www.vchepiao.cn/mb/base/bus/queryBusSKY"
        end_list = res['data']
        for end in end_list:
            today = datetime.date.today()
            for j in range(1, 7):
                sdate = str(today + datetime.timedelta(days=j))
                sdate_tra = sdate.replace('-', '')
                # if self.has_done(start[1], end["depotName"], sdate):
                #     self.logger.info("ignore %s ==> %s %s" % (start[1], end["depotName"], sdate))
                #     continue
                data = {
                    "fromCity": "深圳",
                    "stationCode": start_code,
                    "dstNode": end['NDName'],
                    "schDate": sdate_tra,
                }
                yield scrapy.FormRequest(line_url, method="POST", formdata=data,
                                         callback=self.parse_line,
                                         meta={"start_code": start_code,
                                               "end": end, "date": sdate})

    def parse_line(self, response):
        "Parse bus shifts"
        start_code = response.meta["start_code"]
        end = response.meta["end"]
        sdate = response.meta["date"]
        # self.mark_done(start[1], end["depotName"], sdate)
        try:
            res = json.loads(response.body)
        except Exception, e:
            raise e
        # if res["values"]["resultList"]:
        #     print res["values"]["resultList"]
        #     print start["name"], end["depotName"]
        if not res["success"]:
            # self.logger.error("parse_line: Unexpected return, %s, %s->%s, %s", sdate, start["city_name"], end["city_name"], res["header"])
            return
        if res["data"]["list"]:
            print res
        for d in res["data"]["list"]:
            if d['SchStat'] == '1':
                attrs = dict(
                    s_province=u'广东',
                    s_city_name=u"深圳",
                    s_city_id='',
                    s_city_code=get_pinyin_first_litter(u"深圳"),
                    s_sta_name=d["SchWaitStName"],
                    s_sta_id=d["SchStationCode"],
                    d_city_name=d["SchDstCity"],
                    d_city_code=get_pinyin_first_litter(d["SchDstCity"]),
                    d_city_id=d['SchStationCode'],
                    d_sta_name=d["SchNodeName"],
                    d_sta_id=d["SchDstNode"],
                    drv_date=d["SchDate"],
                    drv_time=d["orderbytime"],
                    drv_datetime=dte.strptime(
                        "%s %s" % (d["SchDate"], d["orderbytime"]), "%Y-%m-%d %H:%M"),
                    distance="0",
                    vehicle_type="",
                    seat_type="",
                    bus_num=d["SchLocalCode"],
                    full_price=float(d["SchStdPrice"]),
                    half_price=float(d["SchStdPrice"]) / 2,
                    fee=0,
                    crawl_datetime=dte.now(),
                    extra_info={"raw_info": d},
                    left_tickets=int(d["SchSeatCount"]),
                    crawl_source="szky",
                    shift_id="",
                )
                yield LineItem(**attrs)
class AnxingBusSpider(SpiderBase):
    name = "anxingbus"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
        },
        "DOWNLOAD_DELAY": 0.6,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://android.anxingbus.com"
    # BASE_URL = "http://www.anxingbus.com"

    def start_requests(self):
        url = self.base_url + "/sell/GetCity"
        yield scrapy.Request(url, callback=self.parse_starting, headers=HEADERS)

    def get_dest_list_from_web(self, province, city, unitid=""):
        # def get_dest_list(self, province, city, unitid=""):
        data = {"unitid": unitid, "cityName": city}
        url = self.base_url + "/sell/GetEndStations?" + urllib.urlencode(data)
        r = requests.get(url, headers=HEADERS)
        ret = r.json()
        result = []
        for d in ret["data"][0].values():
            for city_id, city_info_str in d.items():
                lst = city_info_str.split("|")
                if city_id != lst[0]:
                    raise Exception()
                result.append({
                    "dest_id": city_id,
                    "name": lst[1],
                    "code": lst[3],
                })
        return result

    def parse_starting(self, response):
        ret = json.loads(response.body)
        url = self.base_url + "/sell/GetBus"
        today = datetime.date.today()
        for d in ret["data"][0].values():
            for city_id, city_info_str in d.items():
                lst = city_info_str.split("|")
                if city_id != lst[0]:
                    raise Exception()
                city_name = unicode(lst[1])
                if city_name not in C2P:
                    continue
                start = {
                    "city_id": city_id,
                    "city_name": city_name,
                    "city_code": lst[3],
                    "unitid": lst[9],
                    "province": C2P[city_name],
                }
                if not self.is_need_crawl(city=start["city_name"], province=C2P[city_name]):
                    continue
                for end in self.get_dest_list(start["province"], start["city_name"],
                                              unitid=start["unitid"]):
                    for i in range(self.start_day(), 8):
                        sdate = str(today + datetime.timedelta(days=i))
                        if self.has_done(start["city_name"], end["name"], sdate):
                            continue
                        params = {
                            "unitID": start["unitid"],
                            "busType": 0,
                            "cityID": start["city_id"],
                            "sellPlateStationID": "",
                            "sellStationID": "",
                            "endCityID": end["dest_id"],
                            "endStationID": "",
                            "busStartTime": sdate,
                            "busEndTime": "%s 23:59:59" % sdate,
                            "curPage": 1,
                            "pageSize": 1024,
                        }
                        yield scrapy.Request("%s?%s" % (url, urllib.urlencode(params)),
                                             callback=self.parse_line,
                                             meta={"start": start, "end": end,
                                                   "sdate": sdate, "params": params},
                                             headers=HEADERS)

    def parse_line(self, response):
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        try:
            res = json.loads(response.body)
        except Exception, e:
            print response.body
            raise e
        self.mark_done(start["city_name"], end["name"], sdate)
        self.logger.info("finish %s ==> %s %s" % (start["city_name"], end["name"], sdate))
        for d in res.get("data", []):
            drv_datetime = dte.strptime(d["BusTime"], "%Y-%m-%d %H:%M")
            attrs = dict(
                s_province=start["province"].rstrip("省"),
                s_city_id=start["city_id"],
                s_city_name=start["city_name"],
                s_sta_name=d["SellStationName"],
                s_city_code=start["city_code"],
                s_sta_id=d["SellStationID"],
                d_city_name=end["name"],
                d_city_id=end["dest_id"],
                d_city_code=end["code"],
                d_sta_id=d["StationID"],
                d_sta_name=d["StationName"],
                drv_date=drv_datetime.strftime("%Y-%m-%d"),
                drv_time=drv_datetime.strftime("%H:%M"),
                drv_datetime=drv_datetime,
                distance="",
                vehicle_type="%s(%s)" % (d["BusType"], d["Kind"]),
                seat_type="",
                bus_num=d["BusID"],
                full_price=float(d["FullPrice"]),
                half_price=float(d["HalfPrice"]) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "UnitID": d["UnitID"],
                    "BusGuid": d["BusGuid"],
                    "Type": d["Type"],
                    "IsDirect": d["IsDirect"],
                },
                left_tickets=int(d["SeatNum"]),
                crawl_source="anxing",
                shift_id="",
            )
            yield LineItem(**attrs)
class WxszSpider(SpiderBase):
    name = "wxsz"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            # 'BusCrawl.middleware.ZjgsmHeaderMiddleware': 410,
            'BusCrawl.middleware.ProxyMiddleware': 410,
        },
        "DOWNLOAD_DELAY": 0.5,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://www.zjgsmwy.com"

    def start_requests(self):
        start_url = "http://coach.wisesz.mobi/coach_v38/main/getstations"
        yield scrapy.FormRequest(start_url, callback=self.parse_start_city)

    def parse_start_city(self, response):
        res = json.loads(response.body)
        if res["errorCode"] != 0:
            self.logger.error("parse_start_city: Unexpected return, %s" % res["rtnMsg"])
            return
        # map the site's district labels to plain city names
        name_trans = {
            u"张家港地区": "张家港",
            u"苏州市区": "苏州",
            u"常熟地区": "常熟",
            u"昆山地区": "昆山",
            u"太仓地区": "太仓",
            u"吴江地区": "吴江",
        }
        line_url = "http://coach.wisesz.mobi/coach_v38/main/get_tickets"
        for d in res["data"]["dataList"]:
            start = {
                "city_id": d["FIELDS1"],
                "city_name": name_trans[d["FIELDS2"]],
            }
            if not self.is_need_crawl(city=start["city_name"]):
                continue
            for sta in d["stations"]:
                start.update({
                    "sta_name": sta["FIELDS3"],
                    "sta_id": sta["FIELDS2"],
                })
                for s in self.get_dest_list("江苏", start["city_name"]):
                    name, code = s["name"], s["code"]
                    end = {"city_name": name, "city_code": code}
                    self.logger.info("start %s ==> %s" % (start["sta_name"], end["city_name"]))
                    today = datetime.date.today()
                    for i in range(self.start_day(), 8):
                        sdate = (today + datetime.timedelta(days=i)).strftime("%Y%m%d")
                        if self.has_done(start["sta_name"], end["city_name"], sdate):
                            continue
                        params = {
                            "departdate": sdate,
                            "destination": end["city_name"],
                            "fromcode": start["sta_id"],
                            "from": start["sta_name"],
                        }
                        yield scrapy.Request(
                            "%s?%s" % (line_url, urllib.urlencode(params)),
                            method="POST",
                            callback=self.parse_line,
                            headers={"Content-Type": "application/json;charset=UTF-8"},
                            meta={"start": start, "end": end, "sdate": sdate})

    def parse_line(self, response):
        "Parse bus shifts"
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["sta_name"], end["city_name"], sdate)
        try:
            res = json.loads(response.body)
        except Exception, e:
            self.logger.error(response.body)
            raise e
        if res["errorCode"] != 0:
            self.logger.error("parse_line: Unexpected return, %s", res)
            return
        shift_list = res["data"]["dataList"] or []
        for d in shift_list:
            drv_datetime = dte.strptime("%s %s" % (d["FIELDS1"], d["FIELDS3"]),
                                        "%Y%m%d %H%M")
            attrs = dict(
                s_province="江苏",
                s_city_id=start["city_id"],
                s_city_name=start["city_name"],
                s_sta_name=d["FIELDS4"],
                s_city_code=get_pinyin_first_litter(unicode(start["city_name"])),
                s_sta_id=d["fromcode"],
                d_city_name=end["city_name"],
                d_city_id="",
                d_city_code=end["city_code"],
                d_sta_id=d["FIELDS11"],
                d_sta_name=d["FIELDS5"],
                drv_date=drv_datetime.strftime("%Y-%m-%d"),
                drv_time=drv_datetime.strftime("%H:%M"),
                drv_datetime=drv_datetime,
                distance=unicode(d["FIELDS16"]),
                vehicle_type=d["FIELDS9"],
                seat_type="",
                bus_num=d["FIELDS2"],
                full_price=float(d["FIELDS14"]),
                half_price=float(d["FIELDS15"]),
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "startstation": d["FIELDS17"],
                    "terminalstation": d["FIELDS6"],
                },
                left_tickets=int(d["FIELDS10"]),
                crawl_source="wxsz",
                shift_id="",
            )
            yield LineItem(**attrs)
def parse_line(self, response): "解析班车" res = response.body.decode('gbk') start_name = response.meta["start_name"] sw_name = response.meta["sw_name"] start_code = response.meta["start_code"] end = response.meta["end"] sdate = response.meta["sdate"] sel = etree.HTML(res) next_url = '' for i, j in enumerate(sel.xpath("//a/text()")): if j == '下一页': next_url = sel.xpath("//a/@href")[i] # countObj = re.findall("查询到(\d+)班", str(res)) # if countObj: # count = countObj # page = int(math.ceil(count/10)) form = sel.xpath('//form[@method="Post"]/@action') full_price = 0 left_tickets = 0 flag = False if form: sch = sel.xpath('//table[@width="600"]/tr') for i in sch[1:]: status = i.xpath('td[8]/div/text()')[0].replace('\r\n', '').replace('\t', '').replace(' ', '') if status != '售票': continue bus_num = i.xpath('td[1]/div/text()')[0].replace('\r\n', '').replace('\t', '').replace(' ', '') drv_date = i.xpath('td[2]/div/text()')[0].replace('\r\n', '').replace('\t', '').replace(' ', '') drv_date = dte.strftime(dte.strptime(drv_date, '%Y-%m-%d'),'%Y-%m-%d') drv_time = i.xpath('td[3]/div/text()')[0].replace('\r\n', '').replace('\t', '').replace(' ', '') start_station = i.xpath('td[4]/div/text()')[0].replace('\r\n', '').replace('\t', '').replace(' ', '') #end_station = i.xpath('td[5]/div/text()')[0].replace('\r\n', '').replace('\t', '').replace(' ', '') distance = i.xpath('td[7]/div/text()')[0].replace('\r\n', '').replace('\t', '').replace(' ', '') href = i.xpath('td[9]/div/a/@onclick')[0] if 'javascript:alert' in href: continue if not flag: for i in range(5): param = {} for s in href.split(";")[0][15:-1].split("?")[1].split("&"): k, v = s.split("=") param[k] = v.encode('gb2312') query_url = "%s%s" % ('http://www.mp0769.com/orderlist.asp?', urllib.urlencode(param)) req = self.urllib2.Request(query_url, headers=self.headers) result = self.urllib2.urlopen(req) content = result.read() res = content.decode('gbk') if '非法操作' in res: query_url = "http://www.mp0769.com/" + href.split(";")[0][15:-1] req = self.urllib2.Request(query_url, headers=self.headers) result = self.urllib2.urlopen(req) content = result.read() res = content.decode('gbk') check_url = re.findall("window.location.href=(.*);", res)[0][1:-1] check_url = "http://www.mp0769.com/" + check_url param = {} for s in check_url.split("?")[1].split("&"): k, v = s.split("=") param[k] = v.encode('gb2312') order_url = "http://www.mp0769.com/orderlist.asp?" 
order_url = "%s%s" % (order_url, urllib.urlencode(param)) req = self.urllib2.Request(order_url, headers=self.headers) result = self.urllib2.urlopen(req) content = result.read() sel = etree.HTML(content) params = {} for s in sel.xpath("//form[@id='Form1']//input"): k, v = s.xpath("@name"), s.xpath("@value") if k: k, v = k[0], v[0] if k else "" params[k] = v.encode('gb2312') if not params or int(params.get('ct_price', 0)) == 0: end_station = params['ct_stname'].decode('gbk') else: print "ct_price ", params['ct_price'] full_price = params['ct_price'] left_tickets = params['ct_accnum'] end_station = params['ct_stname'].decode('gbk') flag = True break drv_datetime = dte.strptime("%s %s" % (drv_date, drv_time), "%Y-%m-%d %H:%M") if not flag: result = self.query_line_info_by_gdsw(sw_name,end_station,bus_num,drv_datetime) if result: full_price = result['full_price'] left_tickets = result['left_tickets'] flag = True else: print 111111,sw_name,end_station,bus_num,drv_datetime print 3333333,end attrs = dict( s_province = u'广东', s_city_name = u"东莞", s_city_id = '', s_city_code= get_pinyin_first_litter(u"东莞"), s_sta_name = start_station, s_sta_id = start_code, d_city_name = end, d_city_code= get_pinyin_first_litter(end), d_city_id = '', d_sta_name = end_station, d_sta_id = '', drv_date = drv_date, drv_time = drv_time, drv_datetime = drv_datetime, distance = distance, vehicle_type = "", seat_type = "", bus_num = bus_num, full_price = float(full_price), half_price = float(full_price)/2, fee = 0, crawl_datetime = dte.now(), extra_info = {"query_url":href}, left_tickets = left_tickets, crawl_source = "dgky", shift_id="", ) yield LineItem(**attrs) if next_url: url = "http://www.mp0769.com/bccx.asp?" param = {} try: for s in next_url.split("?")[1].split("&"): k, v = s.split("=") param[k] = v.encode('gb2312') url = url + urllib.urlencode(param) except: print next_url yield scrapy.Request(url, method="GET", callback=self.parse_line, meta={'start_name': start_name, "sw_name": sw_name, 'start_code': start_code, 'end': end, 'sdate':sdate}) else: self.mark_done(start_name, end, sdate)
class CqkySpider(SpiderBase):
    name = "cqky"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.CqkyProxyMiddleware': 410,
            'BusCrawl.middleware.CqkyHeaderMiddleware': 410,
        },
        # "DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def start_requests(self):
        start_url = "http://www.96096kp.com/StationSelect3.aspx"
        yield scrapy.Request(start_url, callback=self.parse_start_city)

    def get_dest_list_from_web(self, province, city, station=""):
        # subclass hook required by SpiderBase
        url = "http://www.96096kp.com/UserData/MQCenterSale.aspx"
        d_list = []
        # query destination nodes letter by letter, a..z
        for c in [chr(i) for i in range(97, 123)]:
            params = {
                "cmd": "QueryNode",
                "StartStation": "重庆主城" if city == "重庆" else city,
                "q": c,
            }
            headers = {
                "Host": "www.96096kp.com",
                "Origin": "http://www.96096kp.com",
                "Referer": "http://www.96096kp.com/TicketMain.aspx",
                "X-Requested-With": "XMLHttpRequest",
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Chrome",
            }
            r = requests.post(url, headers=headers, data=urllib.urlencode(params))
            for d in r.json():
                d_list.append({
                    "name": d["NDName"],
                    "code": d["NDCode"],
                    "dest_id": "",
                })
        return d_list

    def parse_start_city(self, response):
        # the page embeds a JS object literal; quote its bare keys so json can load it
        res = json.loads(
            re.findall(r"var _stationList=(\S+)</script>",
                       response.body)[0].replace("Pros", '"Pros"').replace(
                           "Areas", '"Areas"').replace("Stations", '"Stations"'))
        line_url = "http://www.96096kp.com/UserData/MQCenterSale.aspx"
        trans = {u"重庆主城": "重庆"}
        for d in res["Areas"][0]["AreaData"]:
            start = {
                "province": "重庆",
                "s_city_id": d["ID"],
                "s_city_name": d["CityDist"],
                "s_city_code": get_pinyin_first_litter(d["CityDist"]),
            }
            if not self.is_need_crawl(city=start["s_city_name"]):
                continue
            for s in self.get_dest_list(province="重庆",
                                        city=trans.get(start["s_city_name"],
                                                       start["s_city_name"])):
                name, code = s["name"], s["code"]
                end = {"d_city_name": name, "d_city_code": code}
                today = datetime.date.today()
                for i in range(self.start_day(), 8):
                    sdate = str(today + datetime.timedelta(days=i))
                    if self.has_done(start["s_city_name"], end["d_city_name"], sdate):
                        # self.logger.info("ignore %s ==> %s %s" % (start["s_city_name"], end["d_city_name"], sdate))
                        continue
                    params = {
                        "StartStation": start["s_city_name"],
                        "WaitStationCode": "",
                        "OpStation": -1,
                        "OpAddress": -1,
                        "SchDate": sdate,
                        "DstNode": name,
                        "SeatType": "",
                        "SchTime": "",
                        "OperMode": "",
                        "SchCode": "",
                        "txtImgCode": "",
                        "cmd": "MQCenterGetClass",
                        "isCheck": "false",
                    }
                    yield scrapy.Request(line_url, method="POST",
                                         body=urllib.urlencode(params),
                                         callback=self.parse_line,
                                         meta={"start": start, "end": end, "sdate": sdate})

    def parse_line(self, response):
        "Parse bus shifts"
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["s_city_name"], end["d_city_name"], sdate)
        # the endpoint returns JS-style object literals; quote the bare keys
        content = response.body
        for k in set(re.findall("([A-Za-z]+):", content)):
            content = re.sub(r"\b%s\b" % k, '"%s"' % k, content)
        self.logger.info("finish %s ==> %s" % (start["s_city_name"], end["d_city_name"]))
        try:
            res = json.loads(content)
        except Exception, e:
            self.logger.error("parse_line: %s" % content)
            raise e
        if res["success"] != "true":
            self.logger.error("parse_line: Unexpected return, %s" % res)
            return
        for d in res["data"]:
            attrs = dict(
                s_province=start["province"],
                s_city_id=start["s_city_id"],
                s_city_name=start["s_city_name"],
                s_sta_name=d["SchStationName"],
                s_city_code=start["s_city_code"],
                s_sta_id=d["SchStationCode"],
                d_city_name=end["d_city_name"],
                d_city_id="",
                d_city_code=end["d_city_code"],
                d_sta_id="",
                d_sta_name=d["SchDstNodeName"],
                drv_date=d["SchDate"],
                drv_time=d["SchTime"],
                drv_datetime=dte.strptime(
                    "%s %s" % (d["SchDate"], d["SchTime"]), "%Y-%m-%d %H:%M"),
                distance=unicode(d["SchDist"]),
                vehicle_type=d["SchBusType"],
                seat_type="",
                bus_num=d["SchLocalCode"],
                full_price=float(d["SchPrice"]),
                half_price=float(d["SchDiscPrice"]),
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={"raw_info": d},
                left_tickets=int(d["SchTicketCount"]),
                crawl_source="cqky",
                shift_id="",
            )
            yield LineItem(**attrs)
class CTripSpider(SpiderBase):
    name = "ctrip"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.CtripHeaderMiddleware': 410,
        },
        "DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://m.ctrip.com/restapi/busphp/app/index.php"

    def start_requests(self):
        # this is a PC web page
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/47.0.2526.106 Safari/537.36",
        }
        web_page = "http://qiche.tieyou.com/index.php?param=/ajax/cityList"
        return [scrapy.Request(web_page, headers=headers, callback=self.parse_start_city)]

    def parse_start_city(self, response):
        # the body is wrapped in one extra pair of brackets; strip them
        res = json.loads(response.body[1:-1])
        params = dict(
            param="/api/home",
            method="product.getToCityList",
            ref="ctrip.h5",
            partner="ctrip.app",
            clientType="Android--hybrid",
            vendor="",
            fromCity="",
            contentType="json",
        )
        for pro in res['hotFromCity']['province']:
            province = pro["province_name"]
            if not province or not self.is_need_crawl(province=province):
                continue
            self.logger.info("start province: %s" % province)
            for ci in pro["citys"]:
                d = {
                    "province": province,
                    "name": ci,
                }
                if not self.is_need_crawl(city=ci):
                    continue
                self.logger.info("start province: %s city: %s", province, ci)
                params.update(fromCity=ci)
                url = "%s?%s" % (self.base_url, urllib.urlencode(params))
                yield scrapy.Request(url, callback=self.parse_target_city,
                                     meta={"start": d})

    def parse_target_city(self, response):
        res = json.loads(response.body)
        if int(res["code"]) != 1:
            self.logger.error("parse_target_city: Unexpected return, %s" % res["message"])
            return
        start = response.meta["start"]
        for tar in res["return"]:
            d = {
                "name": tar["name"],
            }
            today = datetime.date.today()
            for i in range(1, 10):
                sdate = str(today + datetime.timedelta(days=i))
                if self.has_done(start["name"], d["name"], sdate):
                    # self.logger.info("ignore %s ==> %s %s" % (start["name"], d["name"], sdate))
                    continue
                params = dict(
                    param="/api/home",
                    method="product.getBusList",
                    v="1.0",
                    ref="ctrip.h5",
                    partner="ctrip.app",
                    clientType="Android--hybrid",
                    fromCity=start["name"],
                    toCity=d["name"],
                    fromDate=sdate,
                    contentType="json",
                )
                url = "%s?%s" % (self.base_url, urllib.urlencode(params))
                yield scrapy.Request(url, callback=self.parse_line,
                                     meta={"start": start, "end": d, "drv_date": sdate})

    def parse_line(self, response):
        "Parse bus shifts"
        try:
            res = json.loads(response.body)
        except Exception, e:
            print response.body
            raise e
        start = response.meta["start"]
        end = response.meta["end"]
        drv_date = response.meta["drv_date"]
        self.mark_done(start["name"], end["name"], drv_date)
        if int(res["code"]) != 1:
            # self.logger.error("parse_line: Unexpected return, %s" % str(res))
            return
        for d in res["return"]:
            if not d["bookable"]:
                continue
            if d["busType"] == u"流水班":  # skip rolling (non-scheduled) services
                continue
            from_station = unicode(d["fromStationName"])
            to_station = unicode(d["toStationName"])
            ticket_info = d["showTicketInfo"]
            if ticket_info == u"有票":  # "tickets available", exact count unknown
                left_tickets = 45
            elif ticket_info.endswith(u"张"):  # "N tickets left"
                left_tickets = int(ticket_info[:-1])
            elif ticket_info == u"预约购票":  # "book by reservation"
                continue
            else:
                print ticket_info, d["bookable"]
                1 / 0  # unknown ticket status: crash loudly so it gets noticed
            attrs = dict(
                s_province=start["province"],
                s_city_name=d["fromCityName"],
                s_city_id="",
                s_city_code=get_pinyin_first_litter(d["fromCityName"]),
                s_sta_name=from_station,
                s_sta_id="",
                d_city_name=d["toCityName"],
                d_city_id="",
                d_city_code=get_pinyin_first_litter(d["toCityName"]),
                d_sta_name=to_station,
                d_sta_id="",
                drv_date=drv_date,
                drv_time=d["fromTime"],
                drv_datetime=dte.strptime(
                    "%s %s" % (drv_date, d["fromTime"]), "%Y-%m-%d %H:%M"),
                distance="0",
                vehicle_type=d["busType"],
                seat_type="",
                bus_num=d["busNumber"],
                full_price=float(d["fullPrice"]),
                half_price=float(d["fullPrice"]) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={},
                left_tickets=left_tickets,
                crawl_source="ctrip",
                shift_id="",
            )
            yield LineItem(**attrs)
def parse_line(self, response): trainListInfo = json.loads(response.body) if trainListInfo: start = response.meta["start"] end = response.meta["end"] crawl_province = response.meta["crawl_province"] crawl_city = response.meta["crawl_city"] payload = response.meta["payload"] sdate = payload['sendDate'] nextPage = int(trainListInfo['nextPage']) pageNo = int(trainListInfo['pageNo']) # print m['msg'] content = trainListInfo['msg'] if not isinstance(content, unicode): content = content.decode('utf-8') sel = etree.HTML(content) trains = sel.xpath('//div[@class="trainList"]') for n in trains: flag = 0 buyInfo = n.xpath('ul/li[@class="buy"]/a[@class="btn"]/text()') if buyInfo: d_str = n.xpath("@data-list")[0] shift_str = d_str[d_str.index("id=") + 3:] left_str = d_str[d_str.index("leftSeatNum=") + 12:] shiftid = shift_str[:shift_str.index(",")] leftSeatNum = left_str[:left_str.index(",")] station = n.xpath('ul/li[@class="start"]/p/text()') time = n.xpath('ul/li[@class="time"]/p/strong/text()') bus_num = '' bus_num = n.xpath( 'ul/li[@class="time"]/p[@class="carNum"]/text()') if bus_num: bus_num = bus_num[0].replace('\r\n', '').replace(' ', '') price = n.xpath('ul/li[@class="price"]/strong/text()') flag = 1 attrs = dict( s_province=crawl_province['province_name'], s_city_name=start['countyName'], s_city_id=start['countyId'], s_city_code=get_pinyin_first_litter( start['countyName']), s_sta_name=station[0], s_sta_id=start['countyId'], d_city_name=end['portName'], d_city_code=get_pinyin_first_litter(end['portName']), d_city_id='', d_sta_name=station[1], d_sta_id='', drv_date=sdate, drv_time=time[0], drv_datetime=dte.strptime("%s %s" % (sdate, time[0]), "%Y-%m-%d %H:%M"), distance=0, vehicle_type="", seat_type='', bus_num=bus_num.decode("utf-8").strip().rstrip(u"次"), full_price=float(str(price[0]).split('¥')[-1]), half_price=float(str(price[0]).split('¥')[-1]) / 2, fee=0, crawl_datetime=dte.now(), extra_info={"flag": flag}, left_tickets=int(leftSeatNum), crawl_source="xintuyun", shift_id=shiftid, ) yield LineItem(**attrs) if nextPage > pageNo: url = 'http://www.xintuyun.cn/getBusShift/ajax' + '?pageNo=%s' % nextPage yield scrapy.FormRequest(url, formdata=payload, callback=self.parse_line, meta={ "payload": payload, 'crawl_province': crawl_province, 'crawl_city': crawl_city, 'start': start, "end": end }) elif nextPage and nextPage == pageNo: self.mark_done(start["countyName"], end['portName'], sdate)
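
# Note on the pagination pattern shared by the bus100 and xintuyun parsers
# above: both re-post the same payload with an incremented pageNo while
# nextPage > pageNo, and only call mark_done once nextPage == pageNo, so a
# route whose crawl is interrupted mid-pagination gets retried in full.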
class BabaSpider(SpiderBase):
    name = "baba"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.BabaHeaderMiddleware': 410,
        },
    }

    def post_data_templ(self, content):
        # every API call wraps its payload in this app envelope
        tmpl = {
            "content": content,
            "common": {
                "pushToken": "864895020513527",
                "channelVer": "BabaBus",
                "usId": "",
                "appId": "com.hundsun.InternetSaleTicket",
                "appVer": "1.4.0",
                "loginStatus": "0",
                "imei": "864895020513527",
                "mobileVer": "6.0",
                "terminalType": "1",
                "platformCode": "01",
                "phone": "",
            },
            "key": ""
        }
        return tmpl

    def start_requests(self):
        start_url = "http://s4mdata.bababus.com:80/app/v5/ticket/cityAllListFrom.htm"
        content = {"dataVersion": "", "searchType": "0"}
        fd = self.post_data_templ(content)
        yield scrapy.Request(start_url, method="POST", body=json.dumps(fd),
                             callback=self.parse_start_city)

    def get_dest_list_from_web(self, province, city):
        dest_url = 'http://s4mdata.bababus.com:80/app/v5/ticket/cityAllList.htm'
        dest_list = []
        for c in [chr(i) for i in range(97, 123)]:
            content = {
                "searchType": "0",
                "dataVersion": "",
                "beginCityName": city,
            }
            fd = self.post_data_templ(content)
            ua = "Mozilla/5.0 (Linux; U; Android 2.2; fr-lu; HTC Legend Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
            headers = {"User-Agent": ua}
            r = requests.post(dest_url, data=json.dumps(fd), headers=headers)
            res = r.json()
            for d in res["content"]["cityList"]:
                end = {
                    "name": d["cityName"],
                    "code": get_pinyin_first_litter(d["cityName"]),
                    "dest_id": d["cityId"],
                }
                dest_list.append(end)
        return dest_list

    def parse_start_city(self, response):
        res = json.loads(response.body)
        if res["returnNo"] != "0000":
            self.logger.error("parse_start_city: Unexpected return, %s", res)
            return
        line_url = "http://s4mdata.bababus.com:80/app/v5/ticket/busList.htm"
        for info in res["content"]["cityList"]:
            name = info["cityName"]
            if name not in CITY_TO_PROVINCE:
                continue
            province = CITY_TO_PROVINCE[name]
            if not self.is_need_crawl(city=name, province=province):
                continue
            start = {
                "province": province,
                "city_name": info["cityName"],
                "city_code": info["allSpell"],
                "city_id": info["cityId"],
            }
            for d in self.get_dest_list(province, name):
                end = {
                    "city_name": d["name"],
                    "city_code": d["code"],
                    "city_id": d["dest_id"],
                }
                today = datetime.date.today()
                for i in range(self.start_day(), 8):
                    sdate = str(today + datetime.timedelta(days=i))
                    if self.has_done(start["city_name"], end["city_name"], sdate):
                        continue
                    content = {
                        "pageSize": 1025,
                        "beginCityName": start["city_name"],
                        "currentPage": 1,
                        "endCityName": end["city_name"],
                        "leaveDate": sdate,
                        "beginCityId": start["city_id"],
                        "endCityId": end["city_id"],
                    }
                    fd = self.post_data_templ(content)
                    yield scrapy.Request(line_url, method="POST", body=json.dumps(fd),
                                         callback=self.parse_line,
                                         meta={"start": start, "end": end, "date": sdate})

    def parse_line(self, response):
        "Parse bus shifts"
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["date"]
        try:
            res = json.loads(response.body)
        except Exception, e:
            raise e
        if res["returnNo"] != "0000":
            return
        self.logger.info("finish %s ==> %s" % (start["city_name"], end["city_name"]))
        self.mark_done(start["city_name"], end["city_name"], sdate)
        for d in res["content"]["busList"]:
            try:
                drv_datetime = dte.strptime(
                    "%s %s" % (d["leaveDate"], d["leaveTime"]), "%Y-%m-%d %H:%M")
            except:
                # unparsable departure time; skip the record
                continue
            attrs = dict(
                s_province=start["province"],
                s_city_name=start["city_name"],
                s_city_id=start["city_id"],
                s_city_code=start["city_code"],
                s_sta_name=d["beginStation"],
                s_sta_id=d["beginStationId"],
                d_city_name=end["city_name"],
                d_city_code=end["city_code"],
                d_city_id=end["city_id"],
                d_sta_name=d["endStation"],
                d_sta_id=d.get("endStationId", ""),
                drv_date=d["leaveDate"],
                drv_time=d["leaveTime"],
                drv_datetime=drv_datetime,
                distance="0",
                vehicle_type=d["busType"],
                seat_type="",
                bus_num=d["busId"],
                full_price=float(d["fullPrice"]),
                half_price=float(d["fullPrice"]) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "depotName": d.get("depotName", ""),
                    "sbId": d["sbId"],
                    "stId": d["stId"],
                    "depotId": d["depotId"],
                },
                left_tickets=int(d["remainCount"]),
                crawl_source="baba",
                shift_id="",
            )
            yield LineItem(**attrs)
def parse_line(self, response): "解析班车" start = response.meta["start"] end = response.meta["end"] sdate = response.meta["date"] content = response.body self.mark_done(start["name"], end["StopName"], sdate) if not isinstance(content, unicode): content = content.decode('utf-8') sel = etree.HTML(content) scheduleList = sel.xpath('//div[@id="scheduleList"]/table/tbody/tr') for i in range(0, len(scheduleList), 2): s = scheduleList[i] time = s.xpath('td[@class="departureTimeCell"]/span/text()')[0] station = s.xpath('td[@class="routeNameCell"]/span/text()') scheduleIdSpan = s.xpath( 'td[@class="scheduleAndBusLicenseCes"]/span[@class="scheduleSpan"]/span[@class="scheduleIdSpan"]/text()' )[0] scheduleIdSpan = scheduleIdSpan.replace('\r\n', '').replace( '\t', '').replace(' ', '') price = s.xpath( 'td[@class="ticketPriceCell"]/span[@class="ticketPriceSpan"]/span[@class="ticketPriceValueSpan"]/text()' )[0] ScheduleString = s.xpath( 'td[@class="operationCell"]/@data-schedule')[0] left_tickets = 45 left_less = s.xpath('td[@class="memoCell"]/span/@class') if left_less: left_tickets = 0 station_code_mapping = { u"六里桥": "1000", u"首都机场站": "1112", u"赵公口": "1103", u"木樨园": "1104", u"丽泽桥": "1106", u"新发地": "1107", u"莲花池": "1108", u"四惠": "1109", u"永定门": "1110", u"北郊": "1111", } attrs = dict( s_province='北京', s_city_name="北京", s_city_id='', s_city_code=get_pinyin_first_litter(u"北京"), s_sta_name=station[0], s_sta_id=station_code_mapping[station[0]], d_city_name=end['StopName'], d_city_code=get_pinyin_first_litter(end['StopName']), d_city_id=end['StopId'], d_sta_name=end['StopName'], d_sta_id='', drv_date=sdate, drv_time=time, drv_datetime=dte.strptime("%s %s" % (sdate, time), "%Y-%m-%d %H:%M"), distance="0", vehicle_type="", seat_type="", bus_num=scheduleIdSpan, full_price=float(price), half_price=float(price) / 2, fee=0, crawl_datetime=dte.now(), extra_info={ "ScheduleString": ScheduleString, "ArrivingStopJson": json.dumps(end) }, left_tickets=left_tickets, crawl_source="bjky", shift_id='', ) yield LineItem(**attrs)
def parse_line(self, response): "解析班车" province_list = ('吉林', '辽宁', '河北', '黑龙江', '广东', "云南", '山西', '山东', '广西壮族自治', '江西', '河南', '浙江', '安徽', '湖北', '湖南', "贵州", '陕西', '江苏', '内蒙古自治', "四川", '海南', '山东', '甘肃', '青海', '宁夏回族自治', "新疆维吾尔自治", '西藏自治', '贵州', '广西') start = response.meta["start"] end = response.meta["end"] sdate = response.meta["sdate"] res = json.loads(response.body) self.logger.info("finish %s ==> %s" % (start["station_name"], end["zdmc"])) self.mark_done(start['station_name'], end["zdmc"], sdate) xml_text = re.findall( r"<getScheduledBusResult>(.*)</getScheduledBusResult>", res.get('msg', ''), re.S)[0] root = ET.fromstring(xml_text) node_find = root.find('Body') if node_find.attrib['size'] == '0': return res = node_find.findall('ScheduledBus') for d in res: s_sta_name = start['station_name'] s_sta_id = start['czbh'] d_city_name = end['zdmc'] if len(d_city_name) >= 4: if d_city_name.startswith(province_list): for j in province_list: if d_city_name.startswith(j): d_city_name = d_city_name.replace(j, '') break d_sta_name = d.find('MDZMC').text drv_time = d.find('FCSJ').text distance = d.find('LC').text seat_type = d.find('CXMC').text bus_num = d.find('CCBH').text full_price = d.find('PJ').text left_tickets = d.find('YPZS').text d_city_id = d.find('MDZBH').text attrs = dict( s_province='海南', s_city_name=start['city_name'], s_city_id='', s_city_code=get_pinyin_first_litter(unicode( start['city_name'])), s_sta_name=s_sta_name, s_sta_id=s_sta_id, d_city_name=d_city_name, d_city_code=get_pinyin_first_litter(d_city_name), d_city_id=d_city_id, d_sta_name=d_sta_name, d_sta_id='', drv_date=sdate, drv_time=drv_time, drv_datetime=dte.strptime("%s %s" % (sdate, drv_time), "%Y-%m-%d %H:%M"), distance=distance, vehicle_type="", seat_type=seat_type, bus_num=bus_num, full_price=float(full_price), half_price=float(full_price) / 2, fee=0, crawl_datetime=dte.now(), extra_info={}, left_tickets=int(left_tickets), crawl_source="hainky", shift_id='', ) yield LineItem(**attrs)
class TongChengSpider(SpiderBase):
    name = "tongcheng"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.TongChengHeaderMiddleware': 410,
        },
        # "DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://m.ctrip.com/restapi/busphp/app/index.php"

    def start_requests(self):
        # This endpoint serves the PC web page.
        line_url = "http://m.ly.com/bus/BusJson/BusSchedule"
        for name in [
                "苏州", "南京", "无锡", "常州", "南通", "张家港", "昆山", "吴江",
                "常熟", "太仓", "镇江", "宜兴", "江阴", "兴化", "盐城", "扬州",
                "连云港", "徐州", "宿迁"
        ]:
            if not self.is_need_crawl(city=name):
                continue
            self.logger.info("start crawl city %s", name)
            start = {"name": name, "province": "江苏"}
            for s in self.get_dest_list(start["province"], start["name"]):
                end_name, end_code = s.split("|")
                end = {"name": end_name, "short_pinyin": end_code}
                self.logger.info("start %s ==> %s" % (start["name"], end["name"]))
                today = datetime.date.today()
                for i in range(self.start_day(), 8):
                    sdate = str(today + datetime.timedelta(days=i))
                    if self.has_done(start["name"], end["name"], sdate):
                        self.logger.info("ignore %s ==> %s %s" % (start["name"], end["name"], sdate))
                        continue
                    params = dict(Departure=start["name"],
                                  Destination=end["name"],
                                  DepartureDate=sdate,
                                  DepartureStation="",
                                  DptTimeSpan=0,
                                  HasCategory="true",
                                  Category="0",
                                  SubCategory="",
                                  ExParms="",
                                  Page="1",
                                  PageSize="1025",
                                  BookingType="0")
                    yield scrapy.Request(line_url,
                                         method="POST",
                                         body=urllib.urlencode(params),
                                         callback=self.parse_line,
                                         meta={
                                             "start": start,
                                             "end": end,
                                             "sdate": sdate
                                         })

    def parse_line(self, response):
        """Parse bus schedules."""
        try:
            res = json.loads(response.body)
        except Exception, e:
            self.logger.error("%s %s", response.body, e)
            return
        res = res["response"]
        if int(res["header"]["rspCode"]) != 0:
            # self.logger.error("parse_target_city: Unexpected return, %s" % res["header"])
            return
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["name"], end["name"], sdate)
        for d in res["body"]["schedule"]:
            if not d["canBooking"]:
                continue
            left_tickets = int(d["ticketLeft"])
            from_city = unicode(d["departure"])
            to_city = unicode(d["destination"])
            from_station = unicode(d["dptStation"])
            to_station = unicode(d["arrStation"])
            attrs = dict(
                s_province=start["province"],
                s_city_id="",
                s_city_name=from_city,
                s_sta_name=from_station,
                s_city_code=get_pinyin_first_litter(from_city),
                s_sta_id=d.get("dptStationCode", ""),
                d_city_name=to_city,
                d_city_id="",
                d_city_code=end["short_pinyin"],
                d_sta_id="",
                d_sta_name=to_station,
                drv_date=d["dptDate"],
                drv_time=d["dptTime"],
                drv_datetime=dte.strptime("%s %s" % (d["dptDate"], d["dptTime"]), "%Y-%m-%d %H:%M"),
                distance=unicode(d["distance"]),
                vehicle_type=d["coachType"],
                seat_type="",
                bus_num=d["coachNo"],
                full_price=float(d["ticketPrice"]),
                half_price=float(d["ticketPrice"]) / 2,
                fee=float(d["ticketFee"]),
                crawl_datetime=dte.now(),
                extra_info={},
                left_tickets=left_tickets,
                crawl_source="tongcheng",
                shift_id="",
            )
            yield LineItem(**attrs)
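# For debugging the tongcheng endpoint outside Scrapy, the same POST can be
# reproduced with requests.  A sketch only: cookies or headers normally added
# by TongChengHeaderMiddleware are omitted here and may be required in practice.
import requests

def query_tongcheng(departure, destination, sdate):
    params = dict(Departure=departure, Destination=destination,
                  DepartureDate=sdate, DepartureStation="", DptTimeSpan=0,
                  HasCategory="true", Category="0", SubCategory="", ExParms="",
                  Page="1", PageSize="1025", BookingType="0")
    r = requests.post("http://m.ly.com/bus/BusJson/BusSchedule", data=params)
    return r.json()["response"]["body"]["schedule"]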
                d_city_code=get_pinyin_first_litter(end["depotName"]),
                d_city_id=end['depotCode'],
                d_sta_name=d["arrivalDepotName"],
                d_sta_id=d["arrivalDepotCode"],
                drv_date=d["departDate"],
                drv_time=d["leaveTime"],
                drv_datetime=dte.strptime("%s %s" % (d["departDate"], d["leaveTime"]), "%Y-%m-%d %H:%M"),
                distance="0",
                vehicle_type="",
                seat_type="",
                bus_num=d["busCode"],
                full_price=float(d["fullPrice"]),
                half_price=float(d["fullPrice"]) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "busCodeType": d["busCodeType"],
                    "regsName": d["regsName"],
                    "busCompanyCode": d["busCompanyCode"],
                    "s_code": start["code"],
                    'e_code': end['depotCode'],
                    'arriveIsArea': arriveIsArea
                },
                left_tickets=int(d["remainSeats"]),
                crawl_source="gzqcp",
                shift_id="",
            )
            yield LineItem(**attrs)
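# Every parse_line() in this file yields LineItem with the same field set.
# A sketch of how the item is presumably declared in BusCrawl (assumption;
# the actual items module is not shown here):
import scrapy

class LineItem(scrapy.Item):
    s_province = scrapy.Field()
    s_city_id = scrapy.Field()
    s_city_name = scrapy.Field()
    s_city_code = scrapy.Field()
    s_sta_name = scrapy.Field()
    s_sta_id = scrapy.Field()
    d_city_name = scrapy.Field()
    d_city_id = scrapy.Field()
    d_city_code = scrapy.Field()
    d_sta_name = scrapy.Field()
    d_sta_id = scrapy.Field()
    drv_date = scrapy.Field()
    drv_time = scrapy.Field()
    drv_datetime = scrapy.Field()
    distance = scrapy.Field()
    vehicle_type = scrapy.Field()
    seat_type = scrapy.Field()
    bus_num = scrapy.Field()
    full_price = scrapy.Field()
    half_price = scrapy.Field()
    fee = scrapy.Field()
    crawl_datetime = scrapy.Field()
    extra_info = scrapy.Field()
    left_tickets = scrapy.Field()
    crawl_source = scrapy.Field()
    shift_id = scrapy.Field()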
class CBDSpider(SpiderBase):
    name = "lvtu100"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.Lvtu100HeaderMiddleware': 410,
        },
        # "DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://m.ctrip.com/restapi/busphp/app/index.php"

    def get_request_data(self, custom):
        # Common request envelope: every lvtu100 API call is signed with an
        # md5 over the sorted, non-empty parameters plus a fixed secret.
        data = {
            "appid": "lvtu100.andorid",  # "andorid" [sic] is the literal appid the API expects
            "timestamp": str(int(time.time())),
            "format": "json",
            "version": "1.0",
        }
        data.update(custom)
        key_lst = filter(lambda x: data[x], data.keys())
        key_lst.sort()
        data["sign"] = md5("".join("%s%s" % (k, data[k]) for k in key_lst) +
                           "0348ba1cbbfa0fa9ca627394e999fea5")
        return data

    def get_dest_list(self, province, city):
        """Overrides the parent-class implementation."""
        url = "http://api.lvtu100.com/products/getstopcity"
        params = self.get_request_data({
            "startProvince": province,
            "startcityname": city
        })
        headers = {
            "User-Agent": "Mozilla/5.0 (Linux; U; Android 2.3; en-us) AppleWebKit/999+ (KHTML, like Gecko) Safari/999.9",
            "Content-Type": "application/x-www-form-urlencoded",
        }
        r = requests.post(url, data=urllib.urlencode(params), headers=headers)
        ret = r.json()
        return map(
            lambda d: {
                "city_name": d["cityname"],
                "province": d["province"],
                "city_code": d["shortspell"]
            }, ret["data"]["resultList"])

    def start_requests(self):
        url = "http://api.lvtu100.com/products/get_allstartcity"
        params = self.get_request_data({})
        yield scrapy.FormRequest(url, formdata=params, callback=self.parse_starting)

    def parse_starting(self, response):
        url = "http://api.lvtu100.com/products/getgoods"
        ret = json.loads(response.body)
        today = datetime.date.today()
        for city_info in ret["data"]:
            for d in city_info["lstcitys"]:
                province = d["province"]
                if not self.is_need_crawl(province=province) and \
                        not self.is_need_crawl(province=province.rstrip(u"省")):
                    continue
                start = {
                    "city_id": d["startcityid"],
                    "city_code": d["shortspell"],
                    "city_name": d["cityname"],
                    "province": d["province"]
                }
                if not self.is_need_crawl(city=start["city_name"]) or \
                        start["city_name"] in ["宝应"]:
                    continue
                for end in self.get_dest_list(province, start["city_name"]):
                    for i in range(self.start_day(), 8):
                        sdate = str(today + datetime.timedelta(days=i))
                        if self.has_done(start["city_name"], end["city_name"], sdate):
                            continue
                        params = {
                            "startprovince": start["province"],
                            "startcity": start["city_name"],
                            "departdate": sdate,
                            "fromstation": "",
                            "pagestring": '{"page":1,"pagesize":1024}',
                            "range": "",
                            "stopprovince": end["province"],
                            "stopcity": end["city_name"],
                        }
                        yield scrapy.FormRequest(url,
                                                 formdata=self.get_request_data(params),
                                                 callback=self.parse_line,
                                                 meta={
                                                     "start": start,
                                                     "end": end,
                                                     "sdate": sdate
                                                 })

    def parse_line(self, response):
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["city_name"], end["city_name"], sdate)
        self.logger.info("start %s ==> %s" % (start["city_name"], end["city_name"]))
        try:
            res = json.loads(response.body)
        except Exception, e:
            self.logger.error("%s", response.body)
            raise e
        if int(res["code"]) != 0:
            self.logger.error("parse_line: Unexpected return, %s" % res)
            return
        # Departure and arrival stations arrive in separate lists, both keyed
        # by productid.
        s_sta_info = {d["productid"]: d for d in res["data"]["stations"]}
        d_sta_info = {d["productid"]: d for d in res["data"]["stopstations"]}
        for d in res["data"]["flight"]["resultList"]:
            if int(d["islocked"]) == 1:
                continue
            s_sta = s_sta_info[d["productid"]]
            d_sta = d_sta_info[d["productid"]]
            attrs = dict(
                s_province=start["province"].rstrip(u"省"),
                s_city_id=start["city_id"],
                s_city_name=start["city_name"],
                s_sta_name=s_sta["stationname"],
                s_city_code=start["city_code"],
                s_sta_id=s_sta["stationid"],
                d_city_name=d_sta["stopcity"],
                d_city_id="",
                d_city_code=get_pinyin_first_litter(d_sta["stopcity"]),
                d_sta_id="",
                d_sta_name=d_sta["stationname"],
                drv_date=d["departdate"],
                drv_time=d["departtime"],
                drv_datetime=dte.strptime("%s %s" % (d["departdate"], d["departtime"]), "%Y-%m-%d %H:%M"),
                distance=unicode(d.get("distance", "") or ""),
                vehicle_type=d.get("bustype", "") or "",
                seat_type="",
                bus_num=d["itemno"],
                full_price=float(d["price"]),
                half_price=float(d["price"]) / 2,
                fee=3,
                crawl_datetime=dte.now(),
                extra_info={
                    "goodsid": d["goodsid"],
                    "itemid": d["itemid"],
                    "startProvince": start["province"],
                    "stopprovince": end["province"],
                    "productid": d["productid"]
                },
                # The API does not report seat counts; assume 10 remain.
                left_tickets=10,
                crawl_source="lvtu100",
                shift_id="",
            )
            yield LineItem(**attrs)
class FjkySpider(SpiderBase):
    name = "fjky"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.FjkyHeaderMiddleware': 410,
        },
        # "DOWNLOAD_DELAY": 0.1,
        # "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def query_start_predate(self, code):
        url = 'http://www.968980.cn/com/yxd/pris/openapi/queryPreDate.action'
        data = {"startDepotCode": code}
        res = requests.post(url, data=data).json()
        if res['akfAjaxResult'] != '0':
            return 0
        return res['values']['preDate']

    def get_init_dest_list(self, start_info):
        province_list = ('吉林', '辽宁', '河北', '黑龙江', '广东', "云南", '山西',
                         '山东', '广西壮族自治', '江西', '河南', '浙江', '安徽',
                         '湖北', '湖南', "贵州", '陕西', '江苏', '内蒙古自治',
                         "四川", '海南', '甘肃', '青海', '宁夏回族自治',
                         "新疆维吾尔自治", '西藏自治')
        # The destination list is cached in redis so it is only scraped once.
        rds = get_redis()
        rds_key = "crawl:dest:fjky16"
        dest_str = rds.get(rds_key)
        if not dest_str:
            target_url = "http://www.968980.cn//com/yxd/pris/wsgp/queryCity.action"
            data = {
                "flag": "false",
                "isArrive": "true",
                "isStart": "false",
                "iststation": "1",
                "startCode": start_info['code'],
                "zjm": '',
            }
            r = requests.post(target_url,
                              data=urllib.urlencode(data),
                              headers={
                                  "User-Agent": "Chrome",
                                  "Content-Type": "application/x-www-form-urlencoded"
                              })
            res = r.json()
            lst = []
            if res['values']['ca']:
                for i in res['values']['ca'][0]:
                    tmp = {}
                    tmp['code'] = i[0]
                    if i[4] in ['1', '2']:
                        tmp['name'] = i[1].strip(' ')
                    else:
                        lev_list = i[3].split(' ')
                        if len(lev_list) < 3:
                            tmp['name'] = i[1].strip(' ')
                        else:
                            tmp['name'] = lev_list[-1].strip(')').strip(' ')
                            province = lev_list[0].strip('(').strip(' ')
                            if province == '福建省':
                                tmp['name'] = i[1].strip(' ')
                    target_name = tmp['name']
                    if target_name.endswith('站'):
                        continue
                    if '直辖' not in target_name:
                        if not target_name or len(target_name) > 4:
                            if target_name.startswith(province_list):
                                for j in province_list:
                                    if target_name.startswith(j):
                                        target_name = target_name.replace(j, '')
                                        break
                                tmp['name'] = target_name
                        if not tmp['name'].endswith(('市', '县', '州', '区', '旗')):
                            continue
                    lst.append(tmp)
            dest_str = json.dumps(lst)
            rds.set(rds_key, dest_str)
        lst = json.loads(dest_str)
        return lst

    def start_requests(self):
        start_url = "http://www.968980.cn/com/yxd/pris/openapi/cityQueryAll.action"
        yield scrapy.FormRequest(start_url,
                                 method="POST",
                                 formdata={},
                                 callback=self.parse_start_city)

    def parse_start_city(self, response):
        res = json.loads(response.body)
        if res["akfAjaxResult"] != "0":
            self.logger.error("parse_start_city: Unexpected return, %s", res)
            return
        start_list = []
        for i in res['values']['list']:
            for j in i['list']:
                start_list.append(j)
        # end_list = self.get_init_dest_list(start_list[0])
        line_url = 'http://www.968980.cn/com/yxd/pris/openapi/queryAllTicket.action'
        for start in start_list:
            if not self.is_need_crawl(city=start['name']):
                continue
            end_list = self.get_dest_list('福建', start['name'])
            for end in end_list:
                end['code'] = end['dest_id']
                today = datetime.date.today()
                for j in range(0, 7):
                    sdate = str(today + datetime.timedelta(days=j))
                    if self.has_done(start['name'], end["name"], sdate):
                        self.logger.info("ignore %s ==> %s %s" % (start['name'], end["name"], sdate))
                        continue
                    data = {
                        "arrivalDepotCode": end['code'],
                        "beginTime": sdate,
                        "startName": unicode(start['name']),
                        "endName": unicode(end["name"]),
                        "startDepotCode": start['code']
                    }
                    yield scrapy.FormRequest(line_url,
                                             method="POST",
                                             formdata=data,
                                             callback=self.parse_line,
                                             meta={
                                                 "start": start,
                                                 "end": end,
                                                 "date": sdate
                                             })

    def parse_line(self, response):
        """Parse bus schedules."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["date"]
        self.mark_done(start['name'], end["name"], sdate)
        res = json.loads(response.body)
        if res["akfAjaxResult"] != "0":
            # self.logger.error("parse_line: Unexpected return, %s, %s->%s, %s", sdate, start["city_name"], end["city_name"], res["header"])
            return
        for d in res["values"]["resultList"]:
            if d['stopFlag'] == '0':
                # if float(d["fullPrice"]) < 5 or int(d["remainSeats"]) < 2:
                #     continue
                attrs = dict(
                    s_province='福建',
                    s_city_name=start['name'],
                    s_city_id=start['code'],
                    s_city_code=get_pinyin_first_litter(start['name']),
                    s_sta_name=d["startDepotName"],
                    s_sta_id=d["startDepotCode"],
                    d_city_name=end["name"],
                    d_city_code=get_pinyin_first_litter(end["name"]),
                    d_city_id=end['code'],
                    d_sta_name=d["arrivalDepotName"],
                    d_sta_id=d["arrivalDepotCode"],
                    drv_date=d["departDate"],
                    drv_time=d["leaveTime"],
                    drv_datetime=dte.strptime("%s %s" % (d["departDate"], d["leaveTime"]), "%Y-%m-%d %H:%M"),
                    distance="0",
                    vehicle_type="",
                    seat_type="",
                    bus_num=d["busCode"],
                    full_price=float(d["fullPrice"]),
                    half_price=float(d["fullPrice"]) / 2,
                    fee=0,
                    crawl_datetime=dte.now(),
                    extra_info={
                        "busCodeType": d["busCodeType"],
                        "regsName": d["regsName"],
                        "busCompanyCode": d["busCompanyCode"],
                        "s_code": start['code'],
                        'e_code': end['code']
                    },
                    left_tickets=int(d["remainSeats"]),
                    crawl_source="fjky",
                    shift_id="",
                )
                yield LineItem(**attrs)
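# mark_done()/has_done() come from SpiderBase, which is not shown in this
# file.  Given the get_redis() usage above, a plausible minimal implementation
# is a per-spider redis set of finished (start, end, date) triples -- an
# assumption, not the actual SpiderBase code:
class DoneBookkeepingSketch(object):
    name = "example"

    def _done_key(self):
        return "crawl:done:%s" % self.name

    def mark_done(self, s_name, e_name, sdate):
        get_redis().sadd(self._done_key(), u"%s|%s|%s" % (s_name, e_name, sdate))

    def has_done(self, s_name, e_name, sdate):
        return get_redis().sismember(self._done_key(), u"%s|%s|%s" % (s_name, e_name, sdate))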