Ejemplo n.º 1
0
 def parse_line(self, response):
     """Parse a qdky GridView schedule page and yield one LineItem per
     departure whose status cell reads "有票" (tickets available).

     Meta keys consumed: city_name, station_name, s_station_name,
     end (destination-city dict) and date (departure date string).
     """
     city_name = response.meta['city_name']
     station_name = response.meta['station_name']
     s_station_name = response.meta['s_station_name']
     end = response.meta['end']
     sdate = response.meta['date']
     # Record the query as done up front, even if the table is missing.
     self.mark_done(station_name, end['city_name'], sdate)
     soup = bs(response.body, 'lxml')
     scl_list = soup.find('table',
                          attrs={'id': 'ContentPlaceHolder1_GridViewbc'})
     if not scl_list:
         return
     # FIX: dropped the redundant ``if scl_list:`` that immediately
     # followed the early return above -- it could never be false.
     rows = scl_list.find_all('tr', attrs={'style': True})
     # Loop-invariant (hoisted): strip latin letters from the
     # destination city name once instead of once per row.
     s_d_city_name = end['city_name']
     d_city_name = re.sub("[A-Za-z]", "", s_d_city_name)
     for row in rows[1:]:
         y = row.find_all('td')
         ticket_status = y[3].get_text().strip()
         if ticket_status != u"有票":  # skip sold-out departures
             continue
         drv_date = sdate
         bus_num = y[1].get_text().strip()
         drv_time = y[2].get_text().strip()
         distance = y[4].get_text().strip()
         vehicle_type = y[5].get_text().strip().decode('utf-8')
         full_price = y[6].get_text().strip()
         s_sta_name = y[7].get_text().strip()
         attrs = dict(
             s_province='山东',
             s_city_id="",
             s_city_name=city_name,
             s_city_code=get_pinyin_first_litter(unicode(city_name)),
             s_sta_name=station_name,
             s_sta_id='',
             d_city_name=d_city_name,
             d_city_id=end['city_id'],
             d_city_code=get_pinyin_first_litter(unicode(d_city_name)),
             d_sta_id='',
             d_sta_name=s_sta_name,
             drv_date=drv_date,
             drv_time=drv_time,
             drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time),
                                       "%Y-%m-%d %H:%M"),
             distance=distance,
             vehicle_type=vehicle_type,
             seat_type="",
             bus_num=bus_num,
             full_price=float(full_price),
             half_price=float(full_price) / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={
                 's_station_name': s_station_name,
                 's_d_city_name': s_d_city_name
             },
             # Site does not expose an exact count; fixed optimistic value.
             left_tickets=45,
             crawl_source="qdky",
             shift_id="",
         )
         yield LineItem(**attrs)
Ejemplo n.º 2
0
    def parse_line(self, response):
        """Parse a Taizhou (tzky) timetable page and yield a LineItem for
        every schedule row found in the HTML table."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["sta_name"], end["city_name"], sdate)

        # The schedule rows are wrapped in HTML comment markers; strip
        # them so the parser can see the <tr> elements.
        cleaned = response.body.replace("<!--", "").replace("-->", "")
        soup = BeautifulSoup(cleaned, "lxml")
        for row in soup.findAll("tr"):
            cells = row.findAll("td")
            if not cells:
                continue
            shift_code = cells[0].text.strip()
            depart_date = cells[2].text.strip()
            depart_time = cells[3].text.strip()
            coach_type = cells[4].text.strip()
            ticket_price = float(cells[6].text.strip())
            seats_left = int(cells[7].text.strip())
            # The lock/booking URL is embedded in the buy button's onclick.
            onclick = cells[9].select_one("a").get("onclick")
            lock_form_url = re.findall(r"href='(\S+)'", onclick)[0]

            yield LineItem(
                s_province="江苏",
                s_city_id="",
                s_city_name="泰州",
                s_sta_name=start["sta_name"],
                s_city_code="tz",
                s_sta_id="",
                d_city_name=end["city_name"],
                d_city_id=end["city_id"],
                d_city_code=end["city_code"],
                d_sta_id="",
                d_sta_name=end["city_name"],
                drv_date=depart_date,
                drv_time=depart_time,
                drv_datetime=dte.strptime(
                    "%s %s" % (depart_date, depart_time), "%Y-%m-%d %H:%M"),
                distance="0",
                vehicle_type=coach_type,
                seat_type="",
                bus_num=shift_code,
                full_price=ticket_price,
                half_price=ticket_price / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={"lock_form_url": lock_form_url},
                left_tickets=seats_left,
                crawl_source="tzky",
                shift_id="",
            )
Ejemplo n.º 3
0
 def parse_line(self, response):
     """Parse the sd365 JSON schedule list and yield LineItem objects.

     Empty responses are dropped; a departure is only yielded when it
     still has tickets left.
     """
     # FIX: the old guard list contained the integer 0, which can never
     # equal ``response.body`` (a string) -- only the string forms matter.
     if response.body.strip() in ("[]", "0"):
         return
     res_lst = json.loads(response.body)
     start = response.meta['start']
     end = response.meta['end']
     sdate = response.meta['sdate']
     self.mark_done(start, end, sdate)
     self.logger.info("finish %s==>%s %s", start["name"], end["name"],
                      sdate)
     for x in res_lst:
         drv_date = x['bpnDate']
         drv_time = x['bpnSendTime']
         s_sta_name = x['shifazhan']
         s_sta_id = x['siID']
         d_sta_name = x['prtName']
         left_tickets = x['bpnLeftNum']
         vehicle_type = x['btpName']
         # Keys needed later to lock/buy this exact shift.
         extra = {
             'sid': x['siID'],
             'dpid': x['prtID'],
             'l': x['bliID'],
             't': x['bpnDate'],
         }
         bus_num = x['bliID']
         full_price = x['prcPrice']
         attrs = dict(
             s_province=start["province"],
             s_city_id=start["id"],
             s_city_name=start["name"],
             s_sta_name=s_sta_name,  # NOTE(review): source field uncertain
             s_city_code=start["code"],
             s_sta_id=s_sta_id,
             d_city_name=end["name"],
             d_city_id=end["dest_id"],
             d_city_code=end["code"],
             d_sta_id="",
             d_sta_name=d_sta_name,
             drv_date=drv_date,
             drv_time=drv_time,
             drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time),
                                       "%Y-%m-%d %H:%M"),
             distance='',
             vehicle_type=vehicle_type,
             seat_type="",
             bus_num=bus_num,
             full_price=float(full_price),
             half_price=float(full_price) / 2,
             fee=0.0,
             crawl_datetime=dte.now(),
             extra_info=extra,
             left_tickets=int(left_tickets),
             crawl_source="sd365",
             shift_id="",
         )
         if int(left_tickets):  # only emit shifts with seats remaining
             yield LineItem(**attrs)
Ejemplo n.º 4
0
    def parse_line(self, response):
        """Parse the jsdlky schedule JSON and yield a LineItem per shift.

        Responses whose ``rtn_code`` is not "00" are logged and dropped.
        Rows with a mojibake destination name are skipped; a mojibake
        origin-station name falls back to the name from ``start``.
        """
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        res = json.loads(response.body)
        if res["rtn_code"] != "00":
            self.logger.error("parse_line: Unexpected return, %s", res)
            return
        # ``data`` may be null in the JSON; treat that as an empty page.
        shift_list = res["data"] or []
        self.mark_done(start["sta_name"], end["city_name"], sdate)

        for d in shift_list:
            # This feed encodes the date as YYYYMMDD and the time as HHMM.
            drv_datetime = dte.strptime(
                "%s %s" % (d["drive_date"], d["plan_time"]), "%Y%m%d %H%M")
            s_sta_name = d["rst_name"]
            if u"��" in d["dst_name"]:  # garbled destination: drop row
                continue
            if u"��" in s_sta_name:  # garbled station name: use our own
                s_sta_name = start["sta_name"]
            attrs = dict(
                s_province="江苏",
                s_city_id="",
                s_city_name=start["city_name"],
                # FIX: use the cleaned ``s_sta_name`` computed above; the
                # old code passed d["rst_name"] directly, which made the
                # mojibake fallback dead code.
                s_sta_name=s_sta_name,
                s_city_code=start["city_code"],
                s_sta_id=d["rstcode"],
                d_city_name=end["city_name"],
                d_city_id="",
                d_city_code=end["city_code"],
                d_sta_id=d["dstcode"],
                d_sta_name=d["dst_name"],
                drv_date=drv_datetime.strftime("%Y-%m-%d"),
                drv_time=drv_datetime.strftime("%H:%M"),
                drv_datetime=drv_datetime,
                distance=unicode(d["mileage"]),
                vehicle_type=d["m_name"],
                seat_type="",
                bus_num=d["bus_code"],
                full_price=float(d["full_price"]),
                half_price=float(d["half_price"]),
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "startstation": d["sst_name"],
                    "terminalstation": d["tst_name"],
                    "startstationcode": d["sstcode"]
                },
                left_tickets=int(d["available_tickets"]),
                crawl_source="jsdlky",
                shift_id="",
            )
            yield LineItem(**attrs)
Ejemplo n.º 5
0
 def parse_line(self, response):
     """Parse the Hohhot (nmghy) schedule table and yield LineItems.

     Meta keys consumed: start (station name), end (destination name),
     end_code (destination station code) and date.
     """
     start = response.meta["start"]
     end = response.meta["end"]
     end_code = response.meta["end_code"]
     sdate = response.meta["date"]
     self.mark_done(start, end, sdate)
     content = response.body
     if not isinstance(content, unicode):
         content = content.decode('utf-8')
     sel = etree.HTML(content)
     scheduleList = sel.xpath(
         '//div[@id="visitorDataTable"]/table/tbody/tr')
     if not scheduleList:
         return
     for i in scheduleList[1:]:  # first row is the table header
         bus_num = i.xpath('td[1]/text()')[0]
         # FIX: dropped the unused ``start_station``/``end_station``
         # locals -- both read td[2] (a copy-paste slip) and neither
         # was ever referenced.
         drv_time = i.xpath('td[5]/span[@class="lv_time"]/text()')[0]
         price = i.xpath('td[8]/span[@class="tk_price"]/text()')[0]
         left_tickets = i.xpath('td[9]/span/text()')[0]
         # The onclick handler embeds the POST payload used to lock a seat.
         postdata = i.xpath('td[10]/a/@onclick')[0].split(',')[1][1:-3]
         attrs = dict(
             s_province='内蒙古',
             s_city_name=u"呼和浩特",
             s_city_id='',
             s_city_code=get_pinyin_first_litter(u"呼和浩特"),
             s_sta_name=start,
             s_sta_id='',
             d_city_name=end,
             d_city_code=get_pinyin_first_litter(end),
             d_city_id='',
             d_sta_name=end,
             d_sta_id=end_code,
             drv_date=sdate,
             drv_time=drv_time,
             drv_datetime=dte.strptime("%s %s" % (sdate, drv_time),
                                       "%Y-%m-%d %H:%M"),
             distance="0",
             vehicle_type="",
             seat_type="",
             bus_num=bus_num,
             full_price=float(price),
             half_price=float(price) / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={"postdata": postdata},
             left_tickets=int(left_tickets),
             crawl_source="nmghy",
             shift_id="",
         )
         yield LineItem(**attrs)
Ejemplo n.º 6
0
    def parse_line(self, response):
        start = response.meta['start']
        end = response.meta['end']
        sdate = response.meta['sdate'].decode('utf-8')
        self.mark_done(start["sta_name"], end["name"], sdate)
        self.logger.info("finish %s=>%s, %s", start["sta_name"], end["name"],
                         sdate)

        soup = bs(response.body, 'lxml')
        for tr_o in soup.select("table #selbuy"):
            td_lst = tr_o.find_all('td')
            if len(td_lst) < 2:
                continue
            index_tr = lambda idx: td_lst[idx].text.strip().decode("utf-8")

            drv_date, drv_time = sdate, index_tr(1)
            if u"流水" in drv_time:
                continue
            attrs = dict(
                s_province='山东',
                s_city_id="",
                s_city_name=start["city_name"],
                s_sta_name=start["sta_name"],
                s_city_code=start["city_code"],
                s_sta_id=start["sta_id"],
                d_city_name=end["name"],
                d_city_id=end["dest_id"],
                d_city_code=end["code"],
                d_sta_id=end["dest_id"],
                d_sta_name=end["name"],
                drv_date=drv_date,
                drv_time=drv_time,
                drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time),
                                          "%Y-%m-%d %H:%M"),
                distance='',
                vehicle_type=index_tr(4),
                seat_type="",
                bus_num=index_tr(0),
                full_price=float(index_tr(6)),
                half_price=float(index_tr(6)) / 2,
                fee=0.0,
                crawl_datetime=dte.now(),
                extra_info={"startNo": index_tr(11)},
                left_tickets=int(index_tr(10)),
                crawl_source="glcx",
                shift_id="",
            )
            if attrs["left_tickets"]:
                yield LineItem(**attrs)
Ejemplo n.º 7
0
 def parse_line(self, response):
     """Parse the lnky schedule JSON embedded in the page and yield a
     LineItem for every sellable departure."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     content = response.body
     if not isinstance(content, unicode):
         content = content.decode('utf-8')
     self.mark_done(start, end[0], sdate)
     sel = etree.HTML(content)
     scheduleInfo = sel.xpath('//input[@id="scheduleInfoJson"]/@value')
     if not scheduleInfo:
         return
     for entry in json.loads(scheduleInfo[0]):
         if not isinstance(entry, dict):
             continue
         if int(entry['seatLast']) == 0:
             continue
         if float(entry["price"]) < 5:
             continue
         price = float(entry["price"])
         yield LineItem(
             s_province='辽宁',
             s_city_name=start,
             s_city_id='',
             s_city_code=get_pinyin_first_litter(start),
             s_sta_name=entry['fromStation'],
             s_sta_id='',
             d_city_name=end[0],
             d_city_code=get_pinyin_first_litter(end[0]),
             d_city_id='',
             d_sta_name=entry['toStation'],
             d_sta_id='',
             drv_date=sdate,
             drv_time=entry["driveTime"],
             drv_datetime=dte.strptime(
                 "%s %s" % (sdate, entry["driveTime"]), "%Y-%m-%d %H:%M"),
             distance="0",
             vehicle_type="",
             seat_type="",
             bus_num=entry['trainNumber'],
             full_price=price,
             half_price=price / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={'lineNo': entry['lineNo']},
             left_tickets=int(entry["seatLast"]),
             crawl_source="lnky",
             shift_id='',
         )
Ejemplo n.º 8
0
 def parse_line(self, response):
     """Parse the fangbian schedule JSON and yield one LineItem per
     departure, skipping Zhengzhou non-bus-station tickets."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["date"]
     res = json.loads(response.body)
     if res["code"] != 1100:
         # FIX: this failure used to be silently swallowed (the log line
         # was commented out); log it so bad responses stay visible.
         self.logger.error("parse_line: Unexpected return, %s",
                           res.get("message"))
         return
     self.mark_done(start["city_name"], end["city_name"], sdate)
     for d in res["data"]:
         # Filter out Zhengzhou tickets not sold inside a real bus station.
         if d["stationCode"] == "7FC222B8-A1EA-42E3-B242-D1CFA3AF28C1":
             continue
         drv_datetime = dte.strptime("%s %s" % (d["dptDate"], d["dptTime"]),
                                     "%Y-%m-%d %H:%M:%S")
         attrs = dict(
             s_province=start["province"],
             s_city_name=start["city_name"],
             s_city_id=start["city_id"],
             s_city_code=start["city_code"],
             s_sta_name=d["dptStation"],
             s_sta_id=d["stationCode"],
             d_city_name=end["city_name"],
             d_city_code=end["city_code"],
             d_city_id=end["city_id"],
             d_sta_name=d["arrStation"],
             d_sta_id="",
             drv_date=drv_datetime.strftime("%Y-%m-%d"),
             drv_time=drv_datetime.strftime("%H:%M"),
             drv_datetime=drv_datetime,
             distance="0",
             vehicle_type=d["coachType"],
             seat_type="",
             bus_num=d["coachNo"],
             full_price=float(d["ticketPrice"]),
             half_price=float(d["ticketPrice"]) / 2,
             fee=float(d["fee"]),
             crawl_datetime=dte.now(),
             extra_info={
                 "exData1": d["exData1"],
                 "exData2": d["exData2"]
             },
             # ticketLeft may be null; treat that as zero seats.
             left_tickets=int(d["ticketLeft"] or 0),
             crawl_source="fangbian",
             shift_id="",
         )
         yield LineItem(**attrs)
Ejemplo n.º 9
0
 def parse_line(self, response):
     """Parse the xyjt GridView timetable and yield one LineItem per
     schedule row (header rows are skipped)."""
     start = response.meta['start']
     end = response.meta['end']
     sdate = response.meta['sdate']
     self.mark_done(start["sta_name"], end["name"], sdate)
     soup = bs(response.body, 'lxml')
     for tr_o in soup.select("#ctl00_ContentPlaceHolder1_GVBccx tr")[1:]:
         if tr_o.get("class") and "GridViewHeaderStyle" in tr_o.get(
                 "class"):
             continue
         td_lst = tr_o.select("td")

         # FIX: PEP 8 (E731) -- named helper instead of assigning a
         # lambda; behaviour is identical.
         def index_tr(idx):
             return td_lst[idx].text.strip()

         drv_date, drv_time = index_tr(0), index_tr(5)
         if u"流水" in drv_time:  # rolling service: no fixed departure time
             continue
         attrs = dict(
             s_province='江苏',
             s_city_id=start["city_id"],
             s_city_name=start["city_name"],
             s_sta_name=index_tr(1),
             s_city_code=start["city_code"],
             s_sta_id=start["sta_id"],
             d_city_name=end["name"],
             d_city_id="",
             d_city_code=end["code"],
             d_sta_id="",
             d_sta_name=index_tr(3),
             drv_date=drv_date,
             drv_time=drv_time,
             drv_datetime=dte.strptime("%s %s" % (drv_date, drv_time),
                                       "%Y-%m-%d %H:%M"),
             distance=unicode(index_tr(11)),
             vehicle_type=unicode(index_tr(10)),
             seat_type="",
             bus_num=index_tr(2),
             full_price=float(index_tr(6)),
             half_price=float(index_tr(7)),
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={"lock_url": td_lst[12].find("a").get("href")},
             left_tickets=int(index_tr(8)),
             crawl_source="xyjt",
             shift_id="",
         )
         yield LineItem(**attrs)
Ejemplo n.º 10
0
 def parse_line(self, response):
     """Parse the e8s schedule JSON; yield LineItems for departures from
     the 八王坟 station that still have seats available."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     res = json.loads(response.body)
     self.mark_done(start["city_name"], end["stopName"], sdate)
     for shift in res['detail']:
         if int(shift['seatAmount']) == 0:
             continue
         if shift['carrStaName'] != u"八王坟":
             continue
         price = float(shift['fullPrice'])
         yield LineItem(
             s_province='北京',
             s_city_name="北京",
             s_city_id='',
             s_city_code=get_pinyin_first_litter(u"北京"),
             s_sta_name=shift["carrStaName"],
             s_sta_id=shift["carryStaId"],
             d_city_name=end['stopName'],
             d_city_code=get_pinyin_first_litter(end['stopName']),
             d_city_id=end['stopId'],
             d_sta_name=shift["endstaName"],
             d_sta_id='',
             drv_date=sdate,
             drv_time=shift['drvTime'],
             drv_datetime=dte.strptime("%s %s" % (sdate, shift['drvTime']),
                                       "%Y-%m-%d %H:%M"),
             distance="0",
             vehicle_type="",
             seat_type="",
             bus_num=shift['scheduleId'],
             full_price=price,
             half_price=price / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={},
             left_tickets=int(shift['seatAmount']),
             crawl_source="e8s",
             shift_id='',
         )
Ejemplo n.º 11
0
 def parse_line(self, response):
     """Parse the scqcp ticket-line JSON and yield LineItems for fixed
     schedules that still have seats."""
     res = json.loads(response.body)
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     self.mark_done(start["city_name"], end["city_name"], sdate)
     for line in res['body']["ticketLines"]:
         drv_datetime = dte.strptime(line["drvDateTime"], "%Y-%m-%d %H:%M")
         drv_date, drv_time = line["drvDateTime"].split(" ")
         # Keep only fixed schedules (schTypeId == "0") with seats left.
         if int(line["amount"]) == 0 or line["schTypeId"] != "0":
             continue
         yield LineItem(
             s_province=start["province"],
             s_city_name=start["city_name"],
             s_city_id=start["city_id"],
             s_city_code=start["city_code"],
             s_sta_name=line["carryStaName"],
             s_sta_id=line["carryStaId"],
             d_city_name=line["stopName"],
             d_city_id="",
             d_city_code=end["city_code"],
             d_sta_name=line["stopName"],
             d_sta_id="",
             drv_date=drv_date,
             drv_time=drv_time,
             drv_datetime=drv_datetime,
             distance=line["mile"],
             vehicle_type=line["busTypeName"],
             seat_type="",
             bus_num=line["schId"],
             full_price=float(line["fullPrice"]),
             half_price=float(line["halfPrice"]),
             fee=float(line["servicePrice"]),
             crawl_datetime=dte.now(),
             extra_info={"sign_id": line["signId"], 'end': end},
             left_tickets=int(line["amount"]),
             crawl_source="scqcp",
             shift_id="",
         )
Ejemplo n.º 12
0
 def parse_line(self, response):
     """Parse the bus365 schedule JSON and yield sellable departures
     whose full price is at least 11."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     res = json.loads(response.body)
     self.mark_done(start["findname"], end['city_name'], sdate)
     for sch in res['schedules']:
         if int(sch['iscansell']) != 1:
             continue
         if float(sch['fullprice']) < 11:
             continue
         depart_time = sch['departtime'][0:-3]  # drop trailing ":SS"
         price = float(sch['fullprice'])
         yield LineItem(
             s_province=start['province'],
             s_city_name=start['findname'],
             s_city_id=start['id'],
             s_city_code=get_pinyin_first_litter(start['findname']),
             s_sta_name=sch["busshortname"],
             s_sta_id=sch["stationorgid"],
             d_city_name=sch["stationname"],
             d_city_code=get_pinyin_first_litter(sch["stationname"]),
             d_city_id=sch['stationid'],
             d_sta_name=sch["stationname"],
             d_sta_id='',
             drv_date=sdate,
             drv_time=depart_time,
             drv_datetime=dte.strptime("%s %s" % (sdate, depart_time),
                                       "%Y-%m-%d %H:%M"),
             distance=sch["rundistance"],
             vehicle_type="",
             seat_type=sch['seattype'],
             bus_num=sch['schedulecode'],
             full_price=price,
             half_price=price / 2,
             fee=3,
             crawl_datetime=dte.now(),
             extra_info={'start_info': start},
             left_tickets=int(sch['residualnumber']),
             crawl_source="bus365",
             shift_id=sch['id'],
         )
Ejemplo n.º 13
0
 def parse_line(self, response):
     """Parse Shenzhen (szky) schedule JSON; yield a LineItem for every
     shift whose status flag marks it on sale."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["date"]
     self.logger.info("finish %s ==> %s" % (start, end["city_name"]))
     self.mark_done(start, end['city_name'], sdate)
     res = json.loads(trans_js_str(response.body))
     for sch in res["data"]:
         if sch['SchStat'] != '1':  # not on sale
             continue
         price = float(sch["SchStdPrice"])
         yield LineItem(
             s_province=u'广东',
             s_city_name=u"深圳",
             s_city_id='',
             s_city_code=get_pinyin_first_litter(u"深圳"),
             s_sta_name=sch["SchWaitStName"],
             s_sta_id=sch["SchStationCode"],
             d_city_name=end['city_name'],
             d_city_code=end['city_code'],
             d_city_id=sch['SchDstNode'],
             d_sta_name=sch["SchNodeName"],
             d_sta_id=sch["SchNodeCode"],
             drv_date=sch["SchDate"],
             drv_time=sch["orderbytime"],
             drv_datetime=dte.strptime(
                 "%s %s" % (sch["SchDate"], sch["orderbytime"]),
                 "%Y-%m-%d %H:%M"),
             distance="0",
             vehicle_type="",
             seat_type="",
             bus_num=sch["SchLocalCode"],
             full_price=price,
             half_price=price / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={"raw_info": sch},
             left_tickets=int(sch["SchTicketCount"]),
             crawl_source="szky",
             shift_id="",
         )
Ejemplo n.º 14
0
 def parse_line(self, response):
     """Parse the Shanghai (shkyzz) flight list JSON and yield one
     LineItem per listed departure."""
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     self.logger.info("finish %s ==> %s" % (start, end))
     self.mark_done(start, end, sdate)
     res = json.loads(response.body)
     for flight in res['flightList']:
         yield LineItem(
             s_province='上海',
             s_city_name=start,
             s_city_id='',
             s_city_code=get_pinyin_first_litter(unicode(start)),
             s_sta_name=flight['stationName'],
             s_sta_id=flight['stationId'],
             d_city_name=flight['arriveRegionName'],
             d_city_code=get_pinyin_first_litter(flight['arriveRegionName']),
             d_city_id=flight['arriveRegionId'],
             d_sta_name=flight['arriveRegionName'],
             d_sta_id='',
             drv_date=sdate,
             drv_time=flight['flightTime'],
             drv_datetime=dte.strptime(
                 "%s %s" % (sdate, flight['flightTime']),
                 "%Y-%m-%d %H:%M"),
             distance='0',
             vehicle_type="",
             seat_type='',
             bus_num=flight['flightNo'],
             full_price=float(flight['price']),
             half_price=float(flight['halfPrice']),
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={"raw_info": flight},
             left_tickets=int(flight['lastCount']),
             crawl_source="shkyzz",
             shift_id='',
         )
Ejemplo n.º 15
0
    def parse_line(self, response):
        """Parse one page of bus100 (84100.com) search results.

        Yields a LineItem per listed shift, follows pagination with a
        new FormRequest, and only marks the route done once the last
        page has been reached.
        """
        trainListInfo = json.loads(response.body)
        if not trainListInfo:
            return
        start = response.meta["start"]
        end = response.meta["end"]
        crawl_province = response.meta["crawl_province"]
        crawl_city = response.meta["crawl_city"]
        payload = response.meta["payload"]
        sdate = payload['sendDate']

        start_short_name = start['pinyinPrefix']
        if not start_short_name or start_short_name == 'null':
            # FIX: the old code read item['start_city_name'], a key that
            # was never set (guaranteed KeyError on this path); derive
            # the code from the county name instead.
            start_short_name = get_pinyin_first_litter(start['countyName'])
        d_city_code = end['pinyinPrefix']
        if not d_city_code:
            d_city_code = "".join(
                map(
                    lambda x: x[0],
                    pypinyin.pinyin(unicode(end['portName']),
                                    style=pypinyin.FIRST_LETTER)))

        # Fields shared by every shift on this result page.
        base = {
            'crawl_source': 'bus100',
            's_province': crawl_province['province_name'],
            's_city_id': start['countyId'],
            's_city_name': start['countyName'],
            's_sta_id': start['countyId'],
            's_city_code': start_short_name,
            'd_city_name': end['portName'],
            'd_city_code': d_city_code,
            'drv_date': sdate,
        }

        nextPage = int(trainListInfo['nextPage'])
        pageNo = int(trainListInfo['pageNo'])
        sel = etree.HTML(trainListInfo['msg'])
        trains = sel.xpath('//div[@class="trainList"]')
        for n in trains:
            # The numeric shift id is embedded in the data-list attribute.
            d_str = n.xpath("@data-list")[0]
            d_str = d_str[d_str.index("id=") + 3:]
            shiftid = d_str[:d_str.index(",")]
            station = n.xpath('ul/li[@class="start"]/p/text()')
            time = n.xpath('ul/li[@class="time"]/p/strong/text()')
            # Shift number lives in either the carNum or the banci node.
            banci = n.xpath(
                'ul/li[@class="time"]/p[@class="carNum"]/text()')
            if banci:
                banci = banci[0].replace('\r\n', '').replace(' ', '')
            else:
                ord_banci = n.xpath(
                    'ul/li[@class="time"]/p[@class="banci"]/text()')
                banci = ord_banci[0] if ord_banci else ''
            price = n.xpath('ul/li[@class="price"]/strong/text()')
            infor = n.xpath(
                'ul/li[@class="carriers"]/p[@class="info"]/text()')
            distance = ''
            if infor:
                distance = infor[0].replace('\r\n', '').replace(' ', '')
            # A visible "buy" button text means the shift is purchasable.
            flag = 0
            for buy in n.xpath('ul/li[@class="buy"]'):
                flag = 1 if buy.xpath('a[@class="btn"]/text()') else 0

            # FIX: build a fresh LineItem per shift; the old code mutated
            # and re-yielded a single shared instance, so later rows could
            # clobber items already handed to the pipeline.
            item = LineItem()
            for key, value in base.items():
                item[key] = value
            item['drv_time'] = time[0]
            item['drv_datetime'] = datetime.datetime.strptime(
                sdate + ' ' + time[0], "%Y-%m-%d %H:%M")
            item['s_sta_name'] = station[0]
            item['d_sta_name'] = station[1]
            item['bus_num'] = banci.decode("utf-8").strip().rstrip(u"次")
            item["full_price"] = float(str(price[0]).split('¥')[-1])
            item["half_price"] = float(str(price[0]).split('¥')[-1]) / 2
            item['distance'] = distance
            item['shift_id'] = str(shiftid)
            item['crawl_datetime'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            item['vehicle_type'] = ''
            item['seat_type'] = ''
            item['fee'] = 0
            item['left_tickets'] = 50 if flag else 0
            item['extra_info'] = {"flag": flag}
            yield item

        if nextPage > pageNo:
            url = 'http://84100.com/getBusShift/ajax' + '?pageNo=%s' % nextPage
            yield scrapy.FormRequest(url,
                                     formdata=payload,
                                     callback=self.parse_line,
                                     meta={
                                         "payload": payload,
                                         'crawl_province': crawl_province,
                                         'crawl_city': crawl_city,
                                         'start': start,
                                         "end": end
                                     })
        elif nextPage and nextPage == pageNo:
            # Last page reached: mark the whole route/date as crawled.
            self.mark_done(start["countyName"], end['portName'], sdate)
Ejemplo n.º 16
0
    def parse_line(self, response):
        """Parse the hn96520 result table and yield a LineItem per row.

        Rows that fail to parse (missing cells, non-numeric fields) are
        skipped individually so a single bad row cannot drop the page.
        """
        s_city_name = response.meta['city'].decode('utf-8')
        start = response.meta['start'].decode('utf-8')
        end = response.meta['end'].decode('utf-8')
        sdate = response.meta['sdate'].decode('utf-8')
        self.mark_done(start, end, sdate)
        soup = bs(response.body, 'lxml')
        info = soup.find('table', attrs={'class': 'resulttb'}).find_all(
            'tbody', attrs={'class': 'rebody'})
        for x in info:
            try:
                bus_num = x.find(
                    'td', attrs={'align': 'center'}).get_text().strip()
                # Second cell holds "station destination" separated by
                # whitespace.
                s_sta_name = x.find_all(
                    'td')[1].get_text().split()[0]
                d_city_name = x.find_all('td')[1].get_text().split()[1]
                drv_date = x.find_all('td')[2].get_text().strip()
                drv_time = x.find_all('td')[3].get_text().strip()
                distance = x.find_all('td')[5].get_text().strip()
                vehicle_type = x.find_all('td')[6].get_text().strip()
                full_price = x.find_all('td')[7].get_text().strip()
                left_tickets = int(x.find_all('td')[8].get_text().strip())
                # The booking link's query string carries station codes.
                y = x.find_all('td')[9].a.get('href').split('?')[-1]
                extra = {}
                for z in y.split('&'):
                    extra[z.split('=')[0]] = z.split('=')[1]

                attrs = dict(
                    s_province='河南',
                    s_city_id="",
                    s_city_name=s_city_name,
                    s_sta_name=s_sta_name,
                    s_city_code=get_pinyin_first_litter(s_city_name),
                    s_sta_id=extra['g'],
                    d_city_name=d_city_name,
                    d_city_id="",
                    d_city_code=get_pinyin_first_litter(d_city_name),
                    d_sta_id="",
                    d_sta_name=d_city_name,
                    drv_date=drv_date,
                    drv_time=drv_time,
                    drv_datetime=dte.strptime("%s %s" % (
                        drv_date, drv_time), "%Y-%m-%d %H:%M"),
                    distance=unicode(distance),
                    vehicle_type=vehicle_type,
                    seat_type="",
                    bus_num=bus_num,
                    full_price=float(full_price),
                    half_price=float(full_price) / 2,
                    fee=0.0,
                    crawl_datetime=dte.now(),
                    extra_info=extra,
                    left_tickets=left_tickets,
                    crawl_source="hn96520",
                    shift_id="",
                )
                yield LineItem(**attrs)

            except (AttributeError, IndexError, KeyError, ValueError) as e:
                # FIX: narrowed from a bare ``except: pass`` so genuine
                # bugs (e.g. NameError) are no longer silently hidden;
                # malformed rows are still skipped, now with a trace.
                self.logger.debug("parse_line: skipping bad row: %s", e)
Ejemplo n.º 17
0
class SzkyWapSpider(SpiderBase):
    """深圳客运 (szky) WAP spider: crawls coach lines leaving Shenzhen stations.

    Flow: start_requests -> parse_target_city (destinations per station)
    -> parse_line (shift list per station / destination / date).
    """
    name = "szky_wap"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },

        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            # NOTE(review): these two share priority 410, so their relative
            # order is unspecified -- confirm that is intended.
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.SzkyHeaderMiddleware': 410,
        },
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def start_requests(self):
        """Query the reachable-destination list of each known station."""
        start_url = "http://www.vchepiao.cn/mb/base/bus/queryNewSKY"
        # station code -> human-readable name (names kept for reference only)
        station_dict = {
                        "B1K003": '福田汽车客运站',
                        "B1K002": "深圳湾客运服务点",
                        "B1K004": "南山汽车客运站",
                        "B1K005": "盐田汽车客运站",
                        "B1K006": "东湖汽车客运站",
                        "B2K037": "深圳北汽车客运站",
                        "B1K010": "皇岗汽车客运站",
                        "B2K040": "机场汽车客运站",
                        }
        for code in station_dict:  # values (names) are not needed for the query
            yield scrapy.FormRequest(start_url,
                                     method="POST",
                                     formdata={"stationCode": code},
                                     callback=self.parse_target_city,
                                     meta={"start_code": code})

    def parse_target_city(self, response):
        """Schedule one shift query per destination for the next 6 days."""
        start_code = response.meta["start_code"]
        res = json.loads(response.body)
        if not res["success"]:
            self.logger.error("parse_target_city: Unexpected return, %s", res)
            return
        line_url = "http://www.vchepiao.cn/mb/base/bus/queryBusSKY"
        today = datetime.date.today()
        for end in res['data']:
            for offset in range(1, 7):
                sdate = str(today + datetime.timedelta(days=offset))
                data = {
                    "fromCity": "深圳",
                    "stationCode": start_code,
                    "dstNode": end['NDName'],
                    "schDate": sdate.replace('-', '')  # endpoint wants YYYYMMDD
                }
                yield scrapy.FormRequest(line_url,
                                         method="POST",
                                         formdata=data,
                                         callback=self.parse_line,
                                         meta={"start_code": start_code,
                                               "end": end,
                                               "date": sdate})

    def parse_line(self, response):
        """解析班车: emit a LineItem for every shift that is on sale."""
        # A malformed body should propagate with its original traceback; the
        # old ``try: ... except Exception, e: raise e`` added nothing and
        # discarded the traceback.  The leftover debug ``print res`` is gone.
        res = json.loads(response.body)
        if not res["success"]:
            return
        for d in res["data"]["list"]:
            # SchStat == '1' appears to mean "on sale"; skip everything else.
            if d['SchStat'] != '1':
                continue
            attrs = dict(
                s_province=u'广东',
                s_city_name=u"深圳",
                s_city_id='',
                s_city_code=get_pinyin_first_litter(u"深圳"),
                s_sta_name=d["SchWaitStName"],
                s_sta_id=d["SchStationCode"],
                d_city_name=d["SchDstCity"],
                d_city_code=get_pinyin_first_litter(d["SchDstCity"]),
                # NOTE(review): this reuses the *start* station code as the
                # destination city id -- looks wrong; confirm against the feed.
                d_city_id=d['SchStationCode'],
                d_sta_name=d["SchNodeName"],
                d_sta_id=d["SchDstNode"],
                drv_date=d["SchDate"],
                drv_time=d["orderbytime"],
                drv_datetime=dte.strptime(
                    "%s %s" % (d["SchDate"], d["orderbytime"]),
                    "%Y-%m-%d %H:%M"),
                distance="0",
                vehicle_type="",
                seat_type="",
                bus_num=d["SchLocalCode"],
                full_price=float(d["SchStdPrice"]),
                half_price=float(d["SchStdPrice"]) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={"raw_info": d},
                left_tickets=int(d["SchSeatCount"]),
                crawl_source="szky",
                shift_id="",
            )
            yield LineItem(**attrs)
# ---- Example 18 ----
class AnxingBusSpider(SpiderBase):
    """安行巴士 (anxingbus) spider: crawls coach lines via the Android API.

    Flow: start_requests -> parse_starting (departure-city catalogue)
    -> parse_line (shift list per start city / destination / date).
    """
    name = "anxingbus"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
        },
        "DOWNLOAD_DELAY": 0.6,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://android.anxingbus.com"

    def start_requests(self):
        """Fetch the departure-city catalogue."""
        url = self.base_url + "/sell/GetCity"
        yield scrapy.Request(url,
                             callback=self.parse_starting,
                             headers=HEADERS)

    def get_dest_list_from_web(self, province, city, unitid=""):
        """Return destinations reachable from *city*.

        Each entry is a dict with ``dest_id``, ``name`` and ``code``.  The
        feed encodes a city as a "|"-separated string whose field 0 repeats
        the dict key; a mismatch means the feed format changed.
        """
        data = {"unitid": unitid, "cityName": city}
        url = self.base_url + "/sell/GetEndStations?" + urllib.urlencode(data)
        r = requests.get(url, headers=HEADERS)
        ret = r.json()
        result = []
        for group in ret["data"][0].values():
            for city_id, city_info_str in group.items():
                lst = city_info_str.split("|")
                if city_id != lst[0]:
                    raise Exception("unexpected dest entry: %s -> %s"
                                    % (city_id, city_info_str))
                result.append({
                    "dest_id": city_id,
                    "name": lst[1],
                    "code": lst[3]
                })
        return result

    def parse_starting(self, response):
        """Schedule a GetBus query for every (city, destination, date) triple."""
        ret = json.loads(response.body)
        url = self.base_url + "/sell/GetBus"

        today = datetime.date.today()
        for group in ret["data"][0].values():
            for city_id, city_info_str in group.items():
                lst = city_info_str.split("|")
                if city_id != lst[0]:
                    raise Exception("unexpected city entry: %s -> %s"
                                    % (city_id, city_info_str))
                city_name = unicode(lst[1])
                if city_name not in C2P:
                    continue  # no known province mapping for this city
                start = {
                    "city_id": city_id,
                    "city_name": city_name,
                    "city_code": lst[3],
                    "unitid": lst[9],
                    "province": C2P[city_name]
                }
                if not self.is_need_crawl(city=start["city_name"],
                                          province=C2P[city_name]):
                    continue
                for end in self.get_dest_list(start["province"],
                                              start["city_name"],
                                              unitid=start["unitid"]):
                    for i in range(self.start_day(), 8):
                        sdate = str(today + datetime.timedelta(days=i))
                        if self.has_done(start["city_name"], end["name"],
                                         sdate):
                            continue
                        params = {
                            "unitID": start["unitid"],
                            "busType": 0,
                            "cityID": start["city_id"],
                            "sellPlateStationID": "",
                            "sellStationID": "",
                            "endCityID": end["dest_id"],
                            "endStationID": "",
                            "busStartTime": sdate,
                            "busEndTime": "%s 23:59:59" % sdate,
                            "curPage": 1,
                            "pageSize": 1024,
                        }
                        yield scrapy.Request("%s?%s" %
                                             (url, urllib.urlencode(params)),
                                             callback=self.parse_line,
                                             meta={
                                                 "start": start,
                                                 "end": end,
                                                 "sdate": sdate,
                                                 "params": params
                                             },
                                             headers=HEADERS)

    def parse_line(self, response):
        """Emit one LineItem per shift returned by GetBus."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        try:
            res = json.loads(response.body)
        except ValueError:
            # Log the offending payload, then let the error propagate with its
            # original traceback (the old ``print body; raise e`` lost it).
            self.logger.error("parse_line: bad json body %r", response.body)
            raise
        self.mark_done(start["city_name"], end["name"], sdate)
        self.logger.info("finish %s ==> %s %s" %
                         (start["city_name"], end["name"], sdate))

        for d in res.get("data", []):
            drv_datetime = dte.strptime(d["BusTime"], "%Y-%m-%d %H:%M")
            attrs = dict(
                s_province=start["province"].rstrip("省"),
                s_city_id=start["city_id"],
                s_city_name=start["city_name"],
                s_sta_name=d["SellStationName"],
                s_city_code=start["city_code"],
                s_sta_id=d["SellStationID"],
                d_city_name=end["name"],
                d_city_id=end["dest_id"],
                d_city_code=end["code"],
                d_sta_id=d["StationID"],
                d_sta_name=d["StationName"],
                drv_date=drv_datetime.strftime("%Y-%m-%d"),
                drv_time=drv_datetime.strftime("%H:%M"),
                drv_datetime=drv_datetime,
                distance="",
                vehicle_type="%s(%s)" % (d["BusType"], d["Kind"]),
                seat_type="",
                bus_num=d["BusID"],
                full_price=float(d["FullPrice"]),
                # Bug fix: HalfPrice is already the half fare; the old code
                # divided it by 2 again, yielding a quarter fare.  Sibling
                # spiders (wxsz, cqky) use the discounted fare directly.
                half_price=float(d["HalfPrice"]),
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "UnitID": d["UnitID"],
                    "BusGuid": d["BusGuid"],
                    "Type": d["Type"],
                    "IsDirect": d["IsDirect"]
                },
                left_tickets=int(d["SeatNum"]),
                crawl_source="anxing",
                shift_id="",
            )
            yield LineItem(**attrs)
# ---- Example 19 ----
class WxszSpider(SpiderBase):
    """无线苏州 (wxsz) spider: crawls coach lines around the Suzhou area.

    Flow: start_requests -> parse_start_city (area -> station -> destination
    -> date fan-out) -> parse_line (shift list).
    """
    name = "wxsz"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
        },
        "DOWNLOAD_DELAY": 0.5,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://www.zjgsmwy.com"

    def start_requests(self):
        """Fetch the station catalogue."""
        start_url = "http://coach.wisesz.mobi/coach_v38/main/getstations"
        yield scrapy.FormRequest(start_url, callback=self.parse_start_city)

    def parse_start_city(self, response):
        """Schedule a ticket query per (station, destination, date)."""
        res = json.loads(response.body)
        if res["errorCode"] != 0:
            self.logger.error("parse_start_city: Unexpected return, %s" %
                              res["rtnMsg"])
            return
        # Site area labels -> canonical city names.
        name_trans = {
            u"张家港地区": "张家港",
            u"苏州市区": "苏州",
            u"常熟地区": "常熟",
            u"昆山地区": "昆山",
            u"太仓地区": "太仓",
            u"吴江地区": "吴江",
        }
        line_url = "http://coach.wisesz.mobi/coach_v38/main/get_tickets"
        for d in res["data"]["dataList"]:
            city_id = d["FIELDS1"]
            city_name = name_trans[d["FIELDS2"]]
            if not self.is_need_crawl(city=city_name):
                continue
            for sta in d["stations"]:
                # Bug fix: build a fresh dict per station.  The old code
                # mutated ONE shared dict with ``start.update(...)`` while the
                # same object was referenced from the meta of requests already
                # scheduled, so callbacks (processed asynchronously by scrapy)
                # could observe a later station's sta_name/sta_id.
                start = {
                    "city_id": city_id,
                    "city_name": city_name,
                    "sta_name": sta["FIELDS3"],
                    "sta_id": sta["FIELDS2"],
                }
                for s in self.get_dest_list("江苏", start["city_name"]):
                    end = {"city_name": s["name"], "city_code": s["code"]}
                    self.logger.info("start %s ==> %s" %
                                     (start["sta_name"], end["city_name"]))
                    today = datetime.date.today()
                    for i in range(self.start_day(), 8):
                        sdate = (today +
                                 datetime.timedelta(days=i)).strftime("%Y%m%d")
                        if self.has_done(start["sta_name"], end["city_name"],
                                         sdate):
                            continue
                        params = {
                            "departdate": sdate,
                            "destination": end["city_name"],
                            "fromcode": start["sta_id"],
                            "from": start["sta_name"],
                        }
                        yield scrapy.Request(
                            "%s?%s" % (line_url, urllib.urlencode(params)),
                            method="POST",
                            callback=self.parse_line,
                            headers={
                                "Content-Type":
                                "application/json;charset=UTF-8"
                            },
                            meta={
                                "start": start,
                                "end": end,
                                "sdate": sdate
                            })

    def parse_line(self, response):
        """解析班车: emit one LineItem per returned shift."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["sta_name"], end["city_name"], sdate)
        try:
            res = json.loads(response.body)
        except ValueError:
            # Log the offending body, then re-raise with the original
            # traceback (the old ``raise e`` discarded it).
            self.logger.error(response.body)
            raise
        if res["errorCode"] != 0:
            self.logger.error("parse_line: Unexpected return, %s", res)
            return
        for d in res["data"]["dataList"] or []:
            # FIELDS1 = YYYYMMDD departure date, FIELDS3 = HHMM departure time.
            drv_datetime = dte.strptime("%s %s" % (d["FIELDS1"], d["FIELDS3"]),
                                        "%Y%m%d %H%M")
            attrs = dict(
                s_province="江苏",
                s_city_id=start["city_id"],
                s_city_name=start["city_name"],
                s_sta_name=d["FIELDS4"],
                s_city_code=get_pinyin_first_litter(unicode(
                    start["city_name"])),
                s_sta_id=d["fromcode"],
                d_city_name=end["city_name"],
                d_city_id="",
                d_city_code=end["city_code"],
                d_sta_id=d["FIELDS11"],
                d_sta_name=d["FIELDS5"],
                drv_date=drv_datetime.strftime("%Y-%m-%d"),
                drv_time=drv_datetime.strftime("%H:%M"),
                drv_datetime=drv_datetime,
                distance=unicode(d["FIELDS16"]),
                vehicle_type=d["FIELDS9"],
                seat_type="",
                bus_num=d["FIELDS2"],
                full_price=float(d["FIELDS14"]),
                half_price=float(d["FIELDS15"]),
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "startstation": d["FIELDS17"],
                    "terminalstation": d["FIELDS6"]
                },
                left_tickets=int(d["FIELDS10"]),
                crawl_source="wxsz",
                shift_id="",
            )
            yield LineItem(**attrs)
# ---- Example 20 ----
    def parse_line(self, response):
        """Parse one result page of the mp0769.com shift table (解析班车).

        For every on-sale row, tries to resolve price/seat count by following
        the order form (up to 5 attempts, done once per page and reused for
        later rows), falls back to ``query_line_info_by_gdsw``, yields a
        LineItem, and finally either follows the "next page" link or marks
        this (start, destination, date) pair as done.
        """
        # Site pages are GBK-encoded.
        res = response.body.decode('gbk')
        start_name = response.meta["start_name"]
        sw_name = response.meta["sw_name"]
        start_code = response.meta["start_code"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        sel = etree.HTML(res)
        next_url = ''
        # Locate the pagination anchor labelled "下一页" ("next page").
        # NOTE(review): assumes //a/text() and //a/@href stay index-aligned;
        # an anchor without text would shift the indices -- confirm on real pages.
        for i, j in enumerate(sel.xpath("//a/text()")):
            if j == '下一页':
                next_url = sel.xpath("//a/@href")[i]
#         countObj = re.findall("查询到(\d+)班", str(res))
#         if countObj:
#             count = countObj
#             page = int(math.ceil(count/10))
        form = sel.xpath('//form[@method="Post"]/@action')
        full_price = 0    # resolved lazily from the order form, once per page
        left_tickets = 0
        flag = False      # True once price/seat info has been resolved
        if form:
            sch = sel.xpath('//table[@width="600"]/tr')
            for i in sch[1:]:  # skip the header row
                # td[8] holds the sale status; keep only "售票" (on sale) rows.
                status = i.xpath('td[8]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                if status != '售票':
                    continue
                bus_num = i.xpath('td[1]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                drv_date = i.xpath('td[2]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                # Round-trip through strptime to validate/normalise the date.
                drv_date = dte.strftime(dte.strptime(drv_date, '%Y-%m-%d'),'%Y-%m-%d')
                drv_time = i.xpath('td[3]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                start_station = i.xpath('td[4]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                #end_station = i.xpath('td[5]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                distance = i.xpath('td[7]/div/text()')[0].replace('\r\n', '').replace('\t',  '').replace(' ',  '')
                href = i.xpath('td[9]/div/a/@onclick')[0]
                if 'javascript:alert' in href:
                    # Row is not actually orderable.
                    continue
                if not flag:
                    # Resolve price / remaining seats by following the order
                    # form, retrying up to 5 times.  NOTE(review): this loop
                    # variable shadows the row variable ``i`` above (harmless,
                    # since the row is not referenced afterwards, but confusing).
                    for i in range(5):
                        # href looks like "javascript:xxx('<url>?<qs>');..." --
                        # slice out the query string, re-encode values as gb2312.
                        param = {}
                        for s in href.split(";")[0][15:-1].split("?")[1].split("&"):
                            k, v = s.split("=")
                            param[k] = v.encode('gb2312')
                        query_url = "%s%s" % ('http://www.mp0769.com/orderlist.asp?', urllib.urlencode(param))
                        req = self.urllib2.Request(query_url, headers=self.headers)
                        result = self.urllib2.urlopen(req)
                        content = result.read()
                        res = content.decode('gbk')
                        if '非法操作' in res:
                            # "非法操作" (illegal operation): retry with the raw
                            # onclick URL instead of the re-encoded one.
                            query_url = "http://www.mp0769.com/" + href.split(";")[0][15:-1]
                            req = self.urllib2.Request(query_url, headers=self.headers)
                            result = self.urllib2.urlopen(req)
                            content = result.read()
                            res = content.decode('gbk')
                        # The page answers with a JS redirect; follow it manually.
                        check_url = re.findall("window.location.href=(.*);", res)[0][1:-1]
                        check_url = "http://www.mp0769.com/" + check_url
                        param = {}
                        for s in check_url.split("?")[1].split("&"):
                            k, v = s.split("=")
                            param[k] = v.encode('gb2312')
                        order_url = "http://www.mp0769.com/orderlist.asp?"
                        order_url = "%s%s" % (order_url, urllib.urlencode(param))
                        req = self.urllib2.Request(order_url, headers=self.headers)
                        result = self.urllib2.urlopen(req)
                        content = result.read()
                        sel = etree.HTML(content)
                        # Scrape the hidden order-form inputs (ct_price etc.).
                        params = {}
                        for s in sel.xpath("//form[@id='Form1']//input"):
                            k, v = s.xpath("@name"), s.xpath("@value")
                            if k:
                                # NOTE(review): ``v[0] if k else ""`` guards on
                                # k, not v -- an input with a name but no value
                                # attribute would raise IndexError.  Confirm
                                # whether that can occur on real pages.
                                k, v = k[0], v[0] if k else ""
                                params[k] = v.encode('gb2312')
                        if not params or int(params.get('ct_price', 0)) == 0:
                            end_station = params['ct_stname'].decode('gbk')
                        else:
                            print "ct_price ", params['ct_price']
                            full_price = params['ct_price']
                            left_tickets = params['ct_accnum']
                            end_station = params['ct_stname'].decode('gbk')
                            flag = True
                            break
                drv_datetime = dte.strptime("%s %s" % (drv_date, drv_time), "%Y-%m-%d %H:%M")
                if not flag:
                    # Fallback lookup through the station-side helper.
                    result = self.query_line_info_by_gdsw(sw_name,end_station,bus_num,drv_datetime)
                    if result:
                        full_price = result['full_price']
                        left_tickets = result['left_tickets']
                        flag = True
                    else:
                        print 111111,sw_name,end_station,bus_num,drv_datetime
                        print 3333333,end
                attrs = dict(
                    s_province = u'广东',
                    s_city_name = u"东莞",
                    s_city_id = '',
                    s_city_code= get_pinyin_first_litter(u"东莞"),
                    s_sta_name = start_station,
                    s_sta_id = start_code,
                    d_city_name = end,
                    d_city_code= get_pinyin_first_litter(end),
                    d_city_id = '',
                    d_sta_name = end_station,
                    d_sta_id = '',
                    drv_date = drv_date,
                    drv_time = drv_time,
                    drv_datetime = drv_datetime,
                    distance = distance,
                    vehicle_type = "",
                    seat_type = "",
                    bus_num = bus_num,
                    full_price = float(full_price),
                    half_price = float(full_price)/2,
                    fee = 0,
                    crawl_datetime = dte.now(),
                    extra_info = {"query_url":href},
                    left_tickets = left_tickets,
                    crawl_source = "dgky",
                    shift_id="",
                )
                yield LineItem(**attrs)
        if next_url:
            # Follow pagination, re-encoding the query string as gb2312.
            url = "http://www.mp0769.com/bccx.asp?"
            param = {}
            try:
                for s in next_url.split("?")[1].split("&"):
                    k, v = s.split("=")
                    param[k] = v.encode('gb2312')
                url = url + urllib.urlencode(param)
            except:
                print next_url
            yield scrapy.Request(url,
                                 method="GET",
                                 callback=self.parse_line,
                                 meta={'start_name': start_name, "sw_name": sw_name,
                                       'start_code': start_code, 'end': end, 'sdate':sdate})
        else:
            # Last page reached: record this (station, destination, date) as done.
            self.mark_done(start_name, end, sdate)
# ---- Example 21 ----
class CqkySpider(SpiderBase):
    """Spider for 96096kp.com (重庆客运, Chongqing coach tickets).

    Flow: start_requests -> parse_start_city (city list embedded in the page)
    -> parse_line (shift list per city / destination / date).
    """
    name = "cqky"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.CqkyProxyMiddleware': 410,
            'BusCrawl.middleware.CqkyHeaderMiddleware': 410,
        },
        #"DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def start_requests(self):
        """Fetch the station-selection page that embeds the city list."""
        start_url = "http://www.96096kp.com/StationSelect3.aspx"
        yield scrapy.Request(
            start_url,
            callback=self.parse_start_city,
        )

    def get_dest_list_from_web(self, province, city, station=""):
        # Subclass implementation of the destination-list hook (需要子类实现).
        """Enumerate reachable destinations by probing QueryNode with a-z.

        The endpoint matches destination names by letter prefix, so querying
        every letter enumerates them all; entries may repeat across letters.
        """
        url = "http://www.96096kp.com/UserData/MQCenterSale.aspx"
        d_list = []
        for c in [chr(i) for i in range(97, 123)]:
            params = {
                "cmd": "QueryNode",
                # The site labels the Chongqing main area "重庆主城".
                "StartStation": "重庆主城" if city == "重庆" else city,
                "q": c,
            }
            headers = {
                "Host": "www.96096kp.com",
                "Origin": "http://www.96096kp.com",
                "Referer": "http://www.96096kp.com/TicketMain.aspx",
                "X-Requested-With": "XMLHttpRequest",
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Chrome",
            }
            r = requests.post(url,
                              headers=headers,
                              data=urllib.urlencode(params))
            for d in r.json():
                d_list.append({
                    "name": d["NDName"],
                    "code": d["NDCode"],
                    "dest_id": "",
                })
        return d_list

    def parse_start_city(self, response):
        """Extract the embedded city list and schedule one query per
        (city, destination, date) triple."""
        # The page embeds a JS object literal ``var _stationList=...``; quote
        # its bare keys (Pros/Areas/Stations) so json.loads can parse it.
        # NOTE(review): a plain .replace() also rewrites these words if they
        # ever appear inside string values -- fragile, confirm the payload.
        res = json.loads(
            re.findall(r"var _stationList=(\S+)</script>",
                       response.body)[0].replace("Pros", '"Pros"').replace(
                           "Areas", '"Areas"').replace("Stations",
                                                       '"Stations"'))
        line_url = "http://www.96096kp.com/UserData/MQCenterSale.aspx"
        # Site label -> canonical city name.
        trans = {u"重庆主城": "重庆"}
        for d in res["Areas"][0]["AreaData"]:
            start = {
                "province": "重庆",
                "s_city_id": d["ID"],
                "s_city_name": d["CityDist"],
                "s_city_code": get_pinyin_first_litter(d["CityDist"]),
            }
            if not self.is_need_crawl(city=start["s_city_name"]):
                continue
            for s in self.get_dest_list(province="重庆",
                                        city=trans.get(start["s_city_name"],
                                                       start["s_city_name"])):
                name, code = s["name"], s["code"]
                end = {"d_city_name": name, "d_city_code": code}
                today = datetime.date.today()
                for i in range(self.start_day(), 8):
                    sdate = str(today + datetime.timedelta(days=i))
                    if self.has_done(start["s_city_name"], end["d_city_name"],
                                     sdate):
                        continue
                    params = {
                        "StartStation": start["s_city_name"],
                        "WaitStationCode": "",
                        "OpStation": -1,
                        "OpAddress": -1,
                        "SchDate": sdate,
                        "DstNode": name,
                        "SeatType": "",
                        "SchTime": "",
                        "OperMode": "",
                        "SchCode": "",
                        "txtImgCode": "",
                        "cmd": "MQCenterGetClass",
                        "isCheck": "false",
                    }
                    yield scrapy.Request(line_url,
                                         method="POST",
                                         body=urllib.urlencode(params),
                                         callback=self.parse_line,
                                         meta={
                                             "start": start,
                                             "end": end,
                                             "sdate": sdate
                                         })

    def parse_line(self, response):
        """解析班车: repair the JS-style payload into JSON and emit LineItems."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["s_city_name"], end["d_city_name"], sdate)
        content = response.body
        # The endpoint returns a JS object with unquoted identifiers; wrap
        # every word that precedes a ":" in double quotes so json.loads can
        # parse it.  NOTE(review): the \b substitution rewrites ALL
        # occurrences of each word, including inside string values -- fragile,
        # confirm the payload never contains such collisions.
        for k in set(re.findall("([A-Za-z]+):", content)):
            content = re.sub(r"\b%s\b" % k, '"%s"' % k, content)

        self.logger.info("finish %s ==> %s" %
                         (start["s_city_name"], end["d_city_name"]))
        try:
            res = json.loads(content)
        except Exception, e:
            self.logger.error("parse_line: %s" % content)
            raise e
        # NOTE(review): expects the success flag to be the string "true";
        # if the feed ever emits a bare JSON true this rejects valid
        # responses -- verify against live data.
        if res["success"] != "true":
            self.logger.error("parse_line: Unexpected return, %s" % res)
            return

        for d in res["data"]:
            attrs = dict(
                s_province=start["province"],
                s_city_id=start["s_city_id"],
                s_city_name=start["s_city_name"],
                s_sta_name=d["SchStationName"],
                s_city_code=start["s_city_code"],
                s_sta_id=d["SchStationCode"],
                d_city_name=end["d_city_name"],
                d_city_id="",
                d_city_code=end["d_city_code"],
                d_sta_id="",
                d_sta_name=d["SchDstNodeName"],
                drv_date=d["SchDate"],
                drv_time=d["SchTime"],
                drv_datetime=dte.strptime(
                    "%s %s" % (d["SchDate"], d["SchTime"]), "%Y-%m-%d %H:%M"),
                distance=unicode(d["SchDist"]),
                vehicle_type=d["SchBusType"],
                seat_type="",
                bus_num=d["SchLocalCode"],
                full_price=float(d["SchPrice"]),
                # SchDiscPrice is the discounted (half) fare, used directly.
                half_price=float(d["SchDiscPrice"]),
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={"raw_info": d},
                left_tickets=int(d["SchTicketCount"]),
                crawl_source="cqky",
                shift_id="",
            )
            yield LineItem(**attrs)
# ---- Example 22 ----
class CTripSpider(SpiderBase):
    """Crawl long-distance bus schedules from Ctrip's mobile busphp API.

    Flow: fetch the departure-city list from a tieyou.com web endpoint,
    query the reachable target cities per departure city, then fetch the
    bus list for every (from, to, date) triple over the next 9 days.
    """

    name = "ctrip"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },

        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.CtripHeaderMiddleware': 410,
        },
        "DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://m.ctrip.com/restapi/busphp/app/index.php"

    def start_requests(self):
        # The departure-city list is served by a desktop (PC) web page, so a
        # desktop User-Agent is sent for this one request.
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/47.0.2526.106 Safari/537.36"}
        web_page = "http://qiche.tieyou.com/index.php?param=/ajax/cityList"
        return [scrapy.Request(web_page, headers=headers, callback=self.parse_start_city)]

    def parse_start_city(self, response):
        """Parse departure cities and yield one target-city query per city."""
        # The body carries one wrapper character on each end (JSONP-style);
        # strip them before decoding.
        res = json.loads(response.body[1:-1])
        params = dict(
            param="/api/home",
            method="product.getToCityList",
            ref="ctrip.h5",
            partner="ctrip.app",
            clientType="Android--hybrid",
            vendor="",
            fromCity="",
            contentType="json",
        )
        for pro in res['hotFromCity']['province']:
            province = pro["province_name"]
            if not province or not self.is_need_crawl(province=province):
                continue
            self.logger.info("start province: %s" % province)

            for ci in pro["citys"]:
                d = {
                    "province": province,
                    "name": ci,
                }
                if not self.is_need_crawl(city=ci):
                    continue
                self.logger.info("start province: %s city: %s", province, ci)
                # params is reused across iterations; urlencode() snapshots it
                # immediately, so the mutation is safe here.
                params.update(fromCity=ci)
                url = "%s?%s" % (self.base_url, urllib.urlencode(params))
                yield scrapy.Request(url, callback=self.parse_target_city, meta={"start": d})

    def parse_target_city(self, response):
        """Parse reachable target cities; yield bus-list queries for 9 days."""
        res = json.loads(response.body)
        if int(res["code"]) != 1:
            self.logger.error("parse_target_city: Unexpected return, %s" % res["message"])
            return

        start = response.meta["start"]
        for tar in res["return"]:
            d = {
                "name": tar["name"],
            }

            today = datetime.date.today()
            for i in range(1, 10):
                sdate = str(today+datetime.timedelta(days=i))
                if self.has_done(start["name"], d["name"], sdate):
                    continue
                params = dict(
                    param="/api/home",
                    method="product.getBusList",
                    v="1.0",
                    ref="ctrip.h5",
                    partner="ctrip.app",
                    clientType="Android--hybrid",
                    fromCity=start["name"],
                    toCity=d["name"],
                    fromDate=sdate,
                    contentType="json",
                )
                url = "%s?%s" % (self.base_url, urllib.urlencode(params))
                yield scrapy.Request(url, callback=self.parse_line, meta={"start": start, "end": d, "drv_date": sdate})

    def parse_line(self, response):
        """Parse one bus-list reply and yield a LineItem per bookable shift."""
        try:
            res = json.loads(response.body)
        except Exception:
            # Log the unparsable body instead of print-ing it, then propagate
            # with the original traceback (the old `raise e` discarded it).
            self.logger.error("parse_line: invalid json body: %s", response.body)
            raise
        start = response.meta["start"]
        end = response.meta["end"]
        drv_date = response.meta["drv_date"]
        self.mark_done(start["name"], end["name"], drv_date)
        if int(res["code"]) != 1:
            # A non-1 code (e.g. no shifts for the date) is routine; skip it.
            return
        for d in res["return"]:
            if not d["bookable"]:
                continue
            if d["busType"] == "流水班":
                continue
            from_station = unicode(d["fromStationName"])
            to_station = unicode(d["toStationName"])
            ticket_info = d["showTicketInfo"]
            if ticket_info == "有票":
                # "tickets available" comes without a count; 45 is the
                # "plenty left" sentinel used by this crawler.
                left_tickets = 45
            elif ticket_info.endswith("张"):
                # e.g. "12张" -> 12 remaining seats.
                left_tickets = int(ticket_info[:-1])
            elif ticket_info == "预约购票":
                continue
            else:
                # Unknown status text: fail loudly so a format change is
                # noticed (previously a bare `1/0` crash after a print).
                raise ValueError("parse_line: unexpected showTicketInfo %r (bookable=%r)"
                                 % (ticket_info, d["bookable"]))

            attrs = dict(
                s_province=start["province"],
                s_city_name=d["fromCityName"],
                s_city_id="",
                s_city_code=get_pinyin_first_litter(d["fromCityName"]),
                s_sta_name=from_station,
                s_sta_id="",
                d_city_name=d["toCityName"],
                d_city_id="",
                d_city_code=get_pinyin_first_litter(d["toCityName"]),
                d_sta_name=to_station,
                d_sta_id="",
                drv_date=drv_date,
                drv_time=d["fromTime"],
                drv_datetime=dte.strptime("%s %s" % (drv_date, d["fromTime"]), "%Y-%m-%d %H:%M"),
                distance="0",
                vehicle_type=d["busType"],
                seat_type="",
                bus_num=d["busNumber"],
                full_price=float(d["fullPrice"]),
                half_price=float(d["fullPrice"])/2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={},
                left_tickets=left_tickets,
                crawl_source="ctrip",
                shift_id="",
            )
            yield LineItem(**attrs)
Ejemplo n.º 23
0
    def parse_line(self, response):
        """Parse one page of bus shifts from xintuyun.cn.

        The reply is JSON whose ``msg`` field holds an HTML fragment; every
        ``div.trainList`` node describes one departure.  Yields a LineItem
        per bookable shift, plus a FormRequest for the next page while the
        server reports more pages; marks the route done on the last page.
        """
        trainListInfo = json.loads(response.body)
        if trainListInfo:
            start = response.meta["start"]
            end = response.meta["end"]
            crawl_province = response.meta["crawl_province"]
            crawl_city = response.meta["crawl_city"]
            payload = response.meta["payload"]
            sdate = payload['sendDate']
            # Server-side pagination cursors.
            nextPage = int(trainListInfo['nextPage'])
            pageNo = int(trainListInfo['pageNo'])
            #                             print m['msg']
            content = trainListInfo['msg']
            if not isinstance(content, unicode):
                content = content.decode('utf-8')
            sel = etree.HTML(content)
            trains = sel.xpath('//div[@class="trainList"]')
            for n in trains:
                flag = 0
                # Only rows that render a "buy" button are on sale.
                buyInfo = n.xpath('ul/li[@class="buy"]/a[@class="btn"]/text()')
                if buyInfo:
                    # @data-list embeds "...id=<shift>,...leftSeatNum=<n>,...";
                    # both values are sliced out of the raw attribute string.
                    d_str = n.xpath("@data-list")[0]
                    shift_str = d_str[d_str.index("id=") + 3:]
                    left_str = d_str[d_str.index("leftSeatNum=") + 12:]
                    shiftid = shift_str[:shift_str.index(",")]
                    leftSeatNum = left_str[:left_str.index(",")]
                    station = n.xpath('ul/li[@class="start"]/p/text()')
                    time = n.xpath('ul/li[@class="time"]/p/strong/text()')
                    bus_num = ''
                    bus_num = n.xpath(
                        'ul/li[@class="time"]/p[@class="carNum"]/text()')
                    # NOTE(review): if the carNum node is absent, bus_num is an
                    # empty list here and the .decode() below would raise —
                    # confirm whether that case occurs in practice.
                    if bus_num:
                        bus_num = bus_num[0].replace('\r\n',
                                                     '').replace(' ', '')
                    price = n.xpath('ul/li[@class="price"]/strong/text()')
                    flag = 1
                    attrs = dict(
                        s_province=crawl_province['province_name'],
                        s_city_name=start['countyName'],
                        s_city_id=start['countyId'],
                        s_city_code=get_pinyin_first_litter(
                            start['countyName']),
                        s_sta_name=station[0],
                        s_sta_id=start['countyId'],
                        d_city_name=end['portName'],
                        d_city_code=get_pinyin_first_litter(end['portName']),
                        d_city_id='',
                        d_sta_name=station[1],
                        d_sta_id='',
                        drv_date=sdate,
                        drv_time=time[0],
                        drv_datetime=dte.strptime("%s %s" % (sdate, time[0]),
                                                  "%Y-%m-%d %H:%M"),
                        distance=0,
                        vehicle_type="",
                        seat_type='',
                        bus_num=bus_num.decode("utf-8").strip().rstrip(u"次"),
                        full_price=float(str(price[0]).split('¥')[-1]),
                        half_price=float(str(price[0]).split('¥')[-1]) / 2,
                        fee=0,
                        crawl_datetime=dte.now(),
                        extra_info={"flag": flag},
                        left_tickets=int(leftSeatNum),
                        crawl_source="xintuyun",
                        shift_id=shiftid,
                    )
                    yield LineItem(**attrs)

            # Follow pagination with the same form payload; once the last page
            # is reached, record this (start, end, date) as finished.
            if nextPage > pageNo:
                url = 'http://www.xintuyun.cn/getBusShift/ajax' + '?pageNo=%s' % nextPage
                yield scrapy.FormRequest(url,
                                         formdata=payload,
                                         callback=self.parse_line,
                                         meta={
                                             "payload": payload,
                                             'crawl_province': crawl_province,
                                             'crawl_city': crawl_city,
                                             'start': start,
                                             "end": end
                                         })
            elif nextPage and nextPage == pageNo:
                self.mark_done(start["countyName"], end['portName'], sdate)
Ejemplo n.º 24
0
class BabaSpider(SpiderBase):
    """Crawl bus schedules from the BabaBus (s4mdata.bababus.com) app API."""

    name = "baba"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.BabaHeaderMiddleware': 410,
        },
    }

    def post_data_templ(self, content):
        """Wrap *content* in the common POST envelope the API expects.

        The device fields are fixed values mimicking one Android handset.
        """
        tmpl = {
            "content": content,
            "common": {
                "pushToken": "864895020513527",
                "channelVer": "BabaBus",
                "usId": "",
                "appId": "com.hundsun.InternetSaleTicket",
                "appVer": "1.4.0",
                "loginStatus": "0",
                "imei": "864895020513527",
                "mobileVer": "6.0",
                "terminalType": "1",
                "platformCode": "01",
                "phone": "",
            },
            "key": ""
        }
        return tmpl

    def start_requests(self):
        """Request the full departure-city list."""
        start_url = "http://s4mdata.bababus.com:80/app/v5/ticket/cityAllListFrom.htm"
        content = {"dataVersion": "", "searchType": "0"}
        fd = self.post_data_templ(content)
        yield scrapy.Request(start_url,
                             method="POST",
                             body=json.dumps(fd),
                             callback=self.parse_start_city)

    def get_dest_list_from_web(self, province, city):
        """Fetch destination cities reachable from *city* via the app API.

        NOTE(review): the loop variable ``c`` (letters a-z) is never used in
        the request payload, so the identical query is issued 26 times and
        its results appended 26 times over.  Kept as-is to preserve behavior;
        confirm the original intent before deduplicating.
        """
        dest_url = 'http://s4mdata.bababus.com:80/app/v5/ticket/cityAllList.htm'
        # Hoisted out of the loop: one import and one headers dict, not 26.
        import requests
        ua = "Mozilla/5.0 (Linux; U; Android 2.2; fr-lu; HTC Legend Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko)  Version/4.0 Mobile Safari/533.1"
        headers = {"User-Agent": ua}
        dest_list = []
        for c in [chr(i) for i in range(97, 123)]:
            content = {
                "searchType": "0",
                "dataVersion": "",
                "beginCityName": city
            }
            fd = self.post_data_templ(content)
            r = requests.post(dest_url, data=json.dumps(fd), headers=headers)
            res = r.json()
            for d in res["content"]["cityList"]:
                end = {
                    "name": d["cityName"],
                    "code": get_pinyin_first_litter(d["cityName"]),
                    "dest_id": d["cityId"],
                }
                dest_list.append(end)
        return dest_list

    def parse_start_city(self, response):
        """For each crawlable start city, query every destination and date."""
        res = json.loads(response.body)
        if res["returnNo"] != "0000":
            self.logger.error("parse_start_city: Unexpected return, %s", res)
            return
        line_url = "http://s4mdata.bababus.com:80/app/v5/ticket/busList.htm"
        for info in res["content"]["cityList"]:
            name = info["cityName"]
            # Only cities with a known province mapping are crawled.
            if name not in CITY_TO_PROVINCE:
                continue
            province = CITY_TO_PROVINCE[name]
            if not self.is_need_crawl(city=name, province=province):
                continue
            start = {
                "province": province,
                "city_name": info["cityName"],
                "city_code": info["allSpell"],
                "city_id": info["cityId"],
            }
            for d in self.get_dest_list(province, name):
                end = {
                    "city_name": d["name"],
                    "city_code": d["code"],
                    "city_id": d["dest_id"],
                }
                today = datetime.date.today()
                for i in range(self.start_day(), 8):
                    sdate = str(today + datetime.timedelta(days=i))
                    if self.has_done(start["city_name"], end["city_name"],
                                     sdate):
                        continue
                    content = {
                        "pageSize": 1025,
                        "beginCityName": start["city_name"],
                        "currentPage": 1,
                        "endCityName": end["city_name"],
                        "leaveDate": sdate,
                        "beginCityId": start["city_id"],
                        "endCityId": end["city_id"],
                    }
                    fd = self.post_data_templ(content)
                    yield scrapy.Request(line_url,
                                         method="POST",
                                         body=json.dumps(fd),
                                         callback=self.parse_line,
                                         meta={
                                             "start": start,
                                             "end": end,
                                             "date": sdate
                                         })

    def parse_line(self, response):
        """Parse a bus-list reply and yield one LineItem per shift."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["date"]
        # The old try/except here only re-raised (discarding the traceback in
        # Python 2); let JSON errors propagate directly instead.
        res = json.loads(response.body)
        if res["returnNo"] != "0000":
            return
        self.logger.info("finish %s ==> %s" %
                         (start["city_name"], end["city_name"]))
        self.mark_done(start["city_name"], end["city_name"], sdate)
        for d in res["content"]["busList"]:
            try:
                drv_datetime = dte.strptime(
                    "%s %s" % (d["leaveDate"], d["leaveTime"]),
                    "%Y-%m-%d %H:%M")
            except (KeyError, TypeError, ValueError):
                # Skip shifts whose departure date/time is absent or malformed
                # (previously a bare except with the same effect).
                continue
            attrs = dict(
                s_province=start["province"],
                s_city_name=start["city_name"],
                s_city_id=start["city_id"],
                s_city_code=start["city_code"],
                s_sta_name=d["beginStation"],
                s_sta_id=d["beginStationId"],
                d_city_name=end["city_name"],
                d_city_code=end["city_code"],
                d_city_id=end["city_id"],
                d_sta_name=d["endStation"],
                d_sta_id=d.get("endStationId", ""),
                drv_date=d["leaveDate"],
                drv_time=d["leaveTime"],
                drv_datetime=drv_datetime,
                distance="0",
                vehicle_type=d["busType"],
                seat_type="",
                bus_num=d["busId"],
                full_price=float(d["fullPrice"]),
                half_price=float(d["fullPrice"]) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "depotName": d.get("depotName", ""),
                    "sbId": d["sbId"],
                    "stId": d["stId"],
                    "depotId": d["depotId"]
                },
                left_tickets=int(d["remainCount"]),
                crawl_source="baba",
                shift_id="",
            )
            yield LineItem(**attrs)
Ejemplo n.º 25
0
    def parse_line(self, response):
        """Parse the Beijing airport-bus (bjky) schedule page into LineItems."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["date"]
        content = response.body
        self.mark_done(start["name"], end["StopName"], sdate)
        if not isinstance(content, unicode):
            content = content.decode('utf-8')
        sel = etree.HTML(content)
        scheduleList = sel.xpath('//div[@id="scheduleList"]/table/tbody/tr')
        # Departure-station name -> station code.  Loop-invariant, so it is
        # built once here instead of being rebuilt for every schedule row.
        station_code_mapping = {
            u"六里桥": "1000",
            u"首都机场站": "1112",
            u"赵公口": "1103",
            u"木樨园": "1104",
            u"丽泽桥": "1106",
            u"新发地": "1107",
            u"莲花池": "1108",
            u"四惠": "1109",
            u"永定门": "1110",
            u"北郊": "1111",
        }
        # Rows come in pairs (schedule row + detail row); step by 2 to visit
        # only the schedule rows.
        for i in range(0, len(scheduleList), 2):
            s = scheduleList[i]
            time = s.xpath('td[@class="departureTimeCell"]/span/text()')[0]
            station = s.xpath('td[@class="routeNameCell"]/span/text()')
            scheduleIdSpan = s.xpath(
                'td[@class="scheduleAndBusLicenseCes"]/span[@class="scheduleSpan"]/span[@class="scheduleIdSpan"]/text()'
            )[0]
            scheduleIdSpan = scheduleIdSpan.replace('\r\n', '').replace(
                '\t', '').replace(' ', '')
            price = s.xpath(
                'td[@class="ticketPriceCell"]/span[@class="ticketPriceSpan"]/span[@class="ticketPriceValueSpan"]/text()'
            )[0]
            ScheduleString = s.xpath(
                'td[@class="operationCell"]/@data-schedule')[0]
            # A marker class in the memo cell means "sold out"; otherwise no
            # exact count is published and 45 is the "plenty left" sentinel.
            left_tickets = 45
            left_less = s.xpath('td[@class="memoCell"]/span/@class')
            if left_less:
                left_tickets = 0

            attrs = dict(
                s_province='北京',
                s_city_name="北京",
                s_city_id='',
                s_city_code=get_pinyin_first_litter(u"北京"),
                s_sta_name=station[0],
                s_sta_id=station_code_mapping[station[0]],
                d_city_name=end['StopName'],
                d_city_code=get_pinyin_first_litter(end['StopName']),
                d_city_id=end['StopId'],
                d_sta_name=end['StopName'],
                d_sta_id='',
                drv_date=sdate,
                drv_time=time,
                drv_datetime=dte.strptime("%s %s" % (sdate, time),
                                          "%Y-%m-%d %H:%M"),
                distance="0",
                vehicle_type="",
                seat_type="",
                bus_num=scheduleIdSpan,
                full_price=float(price),
                half_price=float(price) / 2,
                fee=0,
                crawl_datetime=dte.now(),
                extra_info={
                    "ScheduleString": ScheduleString,
                    "ArrivingStopJson": json.dumps(end)
                },
                left_tickets=left_tickets,
                crawl_source="bjky",
                shift_id='',
            )
            yield LineItem(**attrs)
Ejemplo n.º 26
0
 def parse_line(self, response):
     """Parse the scheduled-bus XML embedded in the hainky JSON reply.

     The useful payload sits inside ``res['msg']`` as an XML island;
     each <ScheduledBus> element becomes one LineItem.
     """
     # Province-name prefixes stripped from long destination names below.
     # NOTE(review): startswith() against these literals assumes they are
     # unicode (end["zdmc"] comes from json.loads) — presumably
     # unicode_literals is in effect at the top of the file; confirm.
     province_list = ('吉林', '辽宁', '河北', '黑龙江', '广东', "云南", '山西', '山东',
                      '广西壮族自治', '江西', '河南', '浙江', '安徽', '湖北', '湖南', "贵州",
                      '陕西', '江苏', '内蒙古自治', "四川", '海南', '山东', '甘肃', '青海',
                      '宁夏回族自治', "新疆维吾尔自治", '西藏自治', '贵州', '广西')
     start = response.meta["start"]
     end = response.meta["end"]
     sdate = response.meta["sdate"]
     res = json.loads(response.body)
     self.logger.info("finish %s ==> %s" %
                      (start["station_name"], end["zdmc"]))
     self.mark_done(start['station_name'], end["zdmc"], sdate)
     # Extract the XML document wrapped in <getScheduledBusResult> tags.
     xml_text = re.findall(
         r"<getScheduledBusResult>(.*)</getScheduledBusResult>",
         res.get('msg', ''), re.S)[0]
     root = ET.fromstring(xml_text)
     node_find = root.find('Body')
     # size == '0' means no scheduled buses for this station/date.
     if node_find.attrib['size'] == '0':
         return
     res = node_find.findall('ScheduledBus')
     for d in res:
         s_sta_name = start['station_name']
         s_sta_id = start['czbh']
         d_city_name = end['zdmc']
         # Strip a leading province name from 4+ character destinations.
         if len(d_city_name) >= 4:
             if d_city_name.startswith(province_list):
                 for j in province_list:
                     if d_city_name.startswith(j):
                         d_city_name = d_city_name.replace(j, '')
                         break
         # Field tags are abbreviations; meanings follow from how each value
         # is used in the attrs dict below.
         d_sta_name = d.find('MDZMC').text
         drv_time = d.find('FCSJ').text
         distance = d.find('LC').text
         seat_type = d.find('CXMC').text
         bus_num = d.find('CCBH').text
         full_price = d.find('PJ').text
         left_tickets = d.find('YPZS').text
         d_city_id = d.find('MDZBH').text
         attrs = dict(
             s_province='海南',
             s_city_name=start['city_name'],
             s_city_id='',
             s_city_code=get_pinyin_first_litter(unicode(
                 start['city_name'])),
             s_sta_name=s_sta_name,
             s_sta_id=s_sta_id,
             d_city_name=d_city_name,
             d_city_code=get_pinyin_first_litter(d_city_name),
             d_city_id=d_city_id,
             d_sta_name=d_sta_name,
             d_sta_id='',
             drv_date=sdate,
             drv_time=drv_time,
             drv_datetime=dte.strptime("%s %s" % (sdate, drv_time),
                                       "%Y-%m-%d %H:%M"),
             distance=distance,
             vehicle_type="",
             seat_type=seat_type,
             bus_num=bus_num,
             full_price=float(full_price),
             half_price=float(full_price) / 2,
             fee=0,
             crawl_datetime=dte.now(),
             extra_info={},
             left_tickets=int(left_tickets),
             crawl_source="hainky",
             shift_id='',
         )
         yield LineItem(**attrs)
Ejemplo n.º 27
0
class TongChengSpider(SpiderBase):
    """Crawl Jiangsu bus schedules from the TongCheng (m.ly.com) endpoint."""

    name = "tongcheng"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.BrowserRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.TongChengHeaderMiddleware': 410,
        },
        # "DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    base_url = "http://m.ctrip.com/restapi/busphp/app/index.php"

    def start_requests(self):
        """Issue one schedule POST per (city, destination, date) triple."""
        line_url = "http://m.ly.com/bus/BusJson/BusSchedule"
        city_names = [
            "苏州", "南京", "无锡", "常州", "南通", "张家港", "昆山", "吴江", "常熟", "太仓",
            "镇江", "宜兴", "江阴", "兴化", "盐城", "扬州", "连云港", "徐州", "宿迁"
        ]
        for city in city_names:
            if not self.is_need_crawl(city=city):
                continue
            self.logger.info("start crawl city %s", city)
            start = {"name": city, "province": "江苏"}
            # Destinations are "name|shortpinyin" strings from the base class.
            for entry in self.get_dest_list(start["province"], start["name"]):
                dest_name, short_code = entry.split("|")
                end = {"name": dest_name, "short_pinyin": short_code}
                self.logger.info("start %s ==> %s" %
                                 (start["name"], end["name"]))

                today = datetime.date.today()
                for offset in range(self.start_day(), 8):
                    sdate = str(today + datetime.timedelta(days=offset))
                    if self.has_done(start["name"], end["name"], sdate):
                        self.logger.info("ignore %s ==> %s %s" %
                                         (start["name"], end["name"], sdate))
                        continue
                    form = {
                        "Departure": start["name"],
                        "Destination": end["name"],
                        "DepartureDate": sdate,
                        "DepartureStation": "",
                        "DptTimeSpan": 0,
                        "HasCategory": "true",
                        "Category": "0",
                        "SubCategory": "",
                        "ExParms": "",
                        "Page": "1",
                        "PageSize": "1025",
                        "BookingType": "0",
                    }
                    yield scrapy.Request(line_url,
                                         method="POST",
                                         body=urllib.urlencode(form),
                                         callback=self.parse_line,
                                         meta={
                                             "start": start,
                                             "end": end,
                                             "sdate": sdate
                                         })

    def parse_line(self, response):
        """Turn one schedule reply into LineItem objects."""
        try:
            payload = json.loads(response.body)
        except Exception as e:
            self.logger.error("%s %s", response.body, e)
            return
        body = payload["response"]
        # A non-zero rspCode means no usable schedule data; leave the route
        # unmarked so it is retried later.
        if int(body["header"]["rspCode"]) != 0:
            return
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        self.mark_done(start["name"], end["name"], sdate)

        for sch in body["body"]["schedule"]:
            if not sch["canBooking"]:
                continue
            seats_left = int(sch["ticketLeft"])
            dep_city = unicode(sch["departure"])
            arr_city = unicode(sch["destination"])
            dep_station = unicode(sch["dptStation"])
            arr_station = unicode(sch["arrStation"])

            item_fields = {
                "s_province": start["province"],
                "s_city_id": "",
                "s_city_name": dep_city,
                "s_sta_name": dep_station,
                "s_city_code": get_pinyin_first_litter(dep_city),
                "s_sta_id": sch.get("dptStationCode", ""),
                "d_city_name": arr_city,
                "d_city_id": "",
                "d_city_code": end["short_pinyin"],
                "d_sta_id": "",
                "d_sta_name": arr_station,
                "drv_date": sch["dptDate"],
                "drv_time": sch["dptTime"],
                "drv_datetime": dte.strptime(
                    "%s %s" % (sch["dptDate"], sch["dptTime"]), "%Y-%m-%d %H:%M"),
                "distance": unicode(sch["distance"]),
                "vehicle_type": sch["coachType"],
                "seat_type": "",
                "bus_num": sch["coachNo"],
                "full_price": float(sch["ticketPrice"]),
                "half_price": float(sch["ticketPrice"]) / 2,
                "fee": float(sch["ticketFee"]),
                "crawl_datetime": dte.now(),
                "extra_info": {},
                "left_tickets": seats_left,
                "crawl_source": "tongcheng",
                "shift_id": "",
            }
            yield LineItem(**item_fields)
Ejemplo n.º 28
0
                    d_city_code=get_pinyin_first_litter(end["depotName"]),
                    d_city_id=end['depotCode'],
                    d_sta_name=d["arrivalDepotName"],
                    d_sta_id=d["arrivalDepotCode"],
                    drv_date=d["departDate"],
                    drv_time=d["leaveTime"],
                    drv_datetime=dte.strptime(
                        "%s %s" % (d["departDate"], d["leaveTime"]),
                        "%Y-%m-%d %H:%M"),
                    distance="0",
                    vehicle_type="",
                    seat_type="",
                    bus_num=d["busCode"],
                    full_price=float(d["fullPrice"]),
                    half_price=float(d["fullPrice"]) / 2,
                    fee=0,
                    crawl_datetime=dte.now(),
                    extra_info={
                        "busCodeType": d["busCodeType"],
                        "regsName": d["regsName"],
                        "busCompanyCode": d["busCompanyCode"],
                        "s_code": start["code"],
                        'e_code': end['depotCode'],
                        'arriveIsArea': arriveIsArea
                    },
                    left_tickets=int(d["remainSeats"]),
                    crawl_source="gzqcp",
                    shift_id="",
                )
                yield LineItem(**attrs)
Ejemplo n.º 29
0
class CBDSpider(SpiderBase):
    """Spider crawling bus lines from the lvtu100 mobile API (Python 2).

    Flow: start_requests -> parse_starting (enumerate start cities) ->
    get_dest_list (blocking lookup of destinations per start city) ->
    parse_line (shift list for one start/end/date triple).

    NOTE(review): the class is named CBDSpider but ``name`` is "lvtu100",
    and ``base_url`` points at a ctrip endpoint that none of the requests
    below use -- confirm both are intentional leftovers.
    """
    name = "lvtu100"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            # Disable scrapy's stock UA middleware in favour of the
            # project's mobile random-UA middleware registered below.
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.Lvtu100HeaderMiddleware': 410,
        },
        #"DOWNLOAD_DELAY": 0.2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }
    # NOTE(review): apparently unused by this spider -- see class docstring.
    base_url = "http://m.ctrip.com/restapi/busphp/app/index.php"

    def get_request_data(self, custom):
        """Build the signed request payload the lvtu100 API expects.

        Merges ``custom`` over the common fields, then computes the
        "sign" field: keys with truthy values are sorted, concatenated
        as key+value pairs, a fixed secret suffix is appended, and the
        result is fed to the project's md5 helper.  Relies on Python 2
        ``filter`` returning a plain (sortable) list.
        """
        data = {
            # NOTE(review): "andorid" typo is part of the wire protocol
            # (it participates in the signature) -- do not "fix" it.
            "appid": "lvtu100.andorid",
            "timestamp": str(int(time.time())),
            "format": "json",
            "version": "1.0",
        }
        data.update(custom)
        # Only keys with truthy values participate in the signature.
        key_lst = filter(lambda x: data[x], data.keys())
        key_lst.sort()
        data["sign"] = md5("".join("%s%s" % (k, data[k]) for k in key_lst) +
                           "0348ba1cbbfa0fa9ca627394e999fea5")
        return data

    def get_dest_list(self, province, city):
        """Return destination cities reachable from (province, city).

        Overrides the parent implementation.  Performs a blocking
        ``requests`` POST (outside scrapy's downloader) and maps each
        result row onto a {city_name, province, city_code} dict.
        """
        url = "http://api.lvtu100.com/products/getstopcity"
        params = self.get_request_data({
            "startProvince": province,
            "startcityname": city
        })
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Linux; U; Android 2.3; en-us) AppleWebKit/999+ (KHTML, like Gecko) Safari/999.9",
            "Content-Type": "application/x-www-form-urlencoded",
        }
        r = requests.post(url, data=urllib.urlencode(params), headers=headers)
        ret = r.json()
        # Py2 map returns a list, so callers can iterate it repeatedly.
        return map(
            lambda d: {
                "city_name": d["cityname"],
                "province": d["province"],
                "city_code": d["shortspell"]
            }, ret["data"]["resultList"])

    def start_requests(self):
        # Entry point: fetch the full start-city catalogue first.
        url = "http://api.lvtu100.com/products/get_allstartcity"
        params = self.get_request_data({})
        yield scrapy.FormRequest(url,
                                 formdata=params,
                                 callback=self.parse_starting)

    def parse_starting(self, response):
        """Fan out one getgoods request per (start, end, date) triple."""
        url = "http://api.lvtu100.com/products/getgoods"
        ret = json.loads(response.body)
        today = datetime.date.today()
        for city_info in ret["data"]:
            for d in city_info["lstcitys"]:
                province = d["province"]
                # Match the province both with and without its trailing
                # "province" suffix character (rstrip of u"\u7701").
                if not self.is_need_crawl(
                        province=province) and not self.is_need_crawl(
                            province=province.rstrip(u"省")):
                    continue
                start = {
                    "city_id": d["startcityid"],
                    "city_code": d["shortspell"],
                    "city_name": d["cityname"],
                    "province": d["province"]
                }
                # Skip cities the config rejects plus a hard-coded blacklist.
                if not self.is_need_crawl(
                        city=start["city_name"]) or start["city_name"] in [
                            "宝应"
                        ]:
                    continue

                for end in self.get_dest_list(province, start["city_name"]):
                    # Crawl a window from start_day() up to 7 days ahead.
                    for i in range(self.start_day(), 8):
                        sdate = str(today + datetime.timedelta(days=i))
                        if self.has_done(start["city_name"], end["city_name"],
                                         sdate):
                            continue
                        params = {
                            "startprovince": start["province"],
                            "startcity": start["city_name"],
                            "departdate": sdate,
                            "fromstation": "",
                            "pagestring": '{"page":1,"pagesize":1024}',
                            "range": "",
                            "stopprovince": end["province"],
                            "stopcity": end["city_name"],
                        }
                        yield scrapy.FormRequest(
                            url,
                            formdata=self.get_request_data(params),
                            callback=self.parse_line,
                            meta={
                                "start": start,
                                "end": end,
                                "sdate": sdate
                            })

    def parse_line(self, response):
        """Parse one shift-list response and yield a LineItem per shift."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["sdate"]
        # Mark the triple done up front so a bad response is not retried
        # forever within this crawl window.
        self.mark_done(start["city_name"], end["city_name"], sdate)
        self.logger.info("start %s ==> %s" %
                         (start["city_name"], end["city_name"]))
        try:
            res = json.loads(response.body)
        except Exception, e:
            print response.body
            raise e
        if int(res["code"]) != 0:
            self.logger.error("parse_line: Unexpected return, %s" % res)
            return

        # Departure/arrival station details live in two sibling tables,
        # both keyed by productid.
        s_sta_info = {d["productid"]: d for d in res["data"]["stations"]}
        d_sta_info = {d["productid"]: d for d in res["data"]["stopstations"]}
        for d in res["data"]["flight"]["resultList"]:
            # Skip shifts flagged as locked (not bookable).
            if int(d["islocked"]) == 1:
                continue
            s_sta = s_sta_info[d["productid"]]
            d_sta = d_sta_info[d["productid"]]
            attrs = dict(
                s_province=start["province"].rstrip("省"),
                s_city_id=start["city_id"],
                s_city_name=start["city_name"],
                s_sta_name=s_sta["stationname"],
                s_city_code=start["city_code"],
                s_sta_id=s_sta["stationid"],
                d_city_name=d_sta["stopcity"],
                d_city_id="",
                d_city_code=get_pinyin_first_litter(d_sta["stopcity"]),
                d_sta_id="",
                d_sta_name=d_sta["stationname"],
                drv_date=d["departdate"],
                drv_time=d["departtime"],
                drv_datetime=dte.strptime(
                    "%s %s" % (d["departdate"], d["departtime"]),
                    "%Y-%m-%d %H:%M"),
                # Some rows have no distance/bustype; normalise to "".
                distance=unicode(d.get("distance", "") or ""),
                vehicle_type=d.get("bustype", "") or "",
                seat_type="",
                bus_num=d["itemno"],
                full_price=float(d["price"]),
                half_price=float(d["price"]) / 2,
                # NOTE(review): fee and left_tickets are hard-coded
                # placeholders (API presumably omits them) -- confirm.
                fee=3,
                crawl_datetime=dte.now(),
                extra_info={
                    "goodsid": d["goodsid"],
                    "itemid": d["itemid"],
                    "startProvince": start["province"],
                    "stopprovince": end["province"],
                    "productid": d["productid"]
                },
                left_tickets=10,
                crawl_source="lvtu100",
                shift_id="",
            )
            yield LineItem(**attrs)
Ejemplo n.º 30
0
class FjkySpider(SpiderBase):
    """Spider crawling Fujian bus lines from www.968980.cn (Python 2).

    Flow: start_requests -> parse_start_city (enumerate start cities)
    -> get_dest_list (inherited) -> parse_line (shift list for one
    start/end/date triple).  ``get_init_dest_list`` is an alternative,
    redis-cached destination source that is currently not wired in
    (see the commented call in parse_start_city).
    """
    name = "fjky"
    custom_settings = {
        "ITEM_PIPELINES": {
            'BusCrawl.pipeline.MongoPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            # Disable scrapy's stock UA middleware in favour of the
            # project's mobile random-UA middleware registered below.
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
            None,
            'BusCrawl.middleware.MobileRandomUserAgentMiddleware': 400,
            'BusCrawl.middleware.ProxyMiddleware': 410,
            'BusCrawl.middleware.FjkyHeaderMiddleware': 410,
        },
        #        "DOWNLOAD_DELAY": 0.1,
        #        "RANDOMIZE_DOWNLOAD_DELAY": True,
    }

    def query_start_predate(self, code):
        """Query the pre-sale value for start station ``code``.

        Returns ``values.preDate`` from the API (presumably the number
        of days tickets are sold in advance -- TODO confirm), or 0 when
        the call reports an error.  Blocking ``requests`` call.
        """
        url = 'http://www.968980.cn/com/yxd/pris/openapi/queryPreDate.action'
        data = {
            "startDepotCode": code,
        }
        res = requests.post(url, data=data)
        res = res.json()
        predate = 0
        if res['akfAjaxResult'] != '0':
            predate = 0
        else:
            predate = res['values']['preDate']
        return predate

    def get_init_dest_list(self, start_info):
        """Fetch, normalise and redis-cache the destination list.

        Each result row ``i`` is a positional list: i[0] is the city
        code, i[1] the display name, i[3] a space-separated hierarchy
        string and i[4] appears to be an administrative-level flag --
        TODO confirm field meanings against the site.  Names are
        normalised (province prefixes stripped, station-style names
        dropped) and only entries ending in a city/county/district
        suffix survive.  The cleaned list is cached in redis as JSON.
        """
        # Province names used to strip a leading province prefix from
        # overly long destination names below.
        province_list = ('吉林', '辽宁', '河北', '黑龙江', '广东', "云南", '山西', '山东',
                         '广西壮族自治', '江西', '河南', '浙江', '安徽', '湖北', '湖南', "贵州",
                         '陕西', '江苏', '内蒙古自治', "四川", '海南', '山东', '甘肃', '青海',
                         '宁夏回族自治', "新疆维吾尔自治", '西藏自治', '贵州')
        rds = get_redis()
        rds_key = "crawl:dest:fjky16"
        dest_str = rds.get(rds_key)
        if not dest_str:
            # Cache miss: query the site synchronously and rebuild.
            target_url = "http://www.968980.cn//com/yxd/pris/wsgp/queryCity.action"
            data = {
                "flag": "false",
                "isArrive": "true",
                "isStart": "false",
                "iststation": "1",
                "startCode": start_info['code'],
                "zjm": '',
            }
            r = requests.post(target_url,
                              data=urllib.urlencode(data),
                              headers={
                                  "User-Agent":
                                  "Chrome",
                                  "Content-Type":
                                  "application/x-www-form-urlencoded"
                              })
            res = r.json()
            lst = []
            if res['values']['ca']:
                for i in res['values']['ca'][0]:
                    tmp = {}
                    tmp['code'] = i[0]
                    if i[4] in ['1', '2']:
                        tmp['name'] = i[1].strip(' ')
                    else:
                        # Fall back to the last hierarchy component,
                        # e.g. "(province city name)" -> "name", unless
                        # the entry is inside Fujian itself.
                        lev_list = i[3].split(' ')
                        if len(lev_list) < 3:
                            tmp['name'] = i[1].strip(' ')
                        else:
                            tmp['name'] = lev_list[-1].strip(')').strip(' ')
                            province = lev_list[0].strip('(').strip(' ')
                            if province == '福建省':
                                tmp['name'] = i[1].strip(' ')
                    target_name = tmp['name']
                    # Drop station-style entries (names ending in "station").
                    if target_name.endswith('站'):
                        continue
                    if '直辖' not in target_name:
                        # Long/empty names: strip a leading province prefix.
                        if not target_name or len(target_name) > 4:
                            if target_name.startswith(province_list):
                                # NOTE(review): target_name1 is assigned
                                # but never used -- candidate for removal.
                                target_name1 = target_name
                                for j in province_list:
                                    if target_name.startswith(j):
                                        target_name = target_name.replace(
                                            j, '')
                                        break
                    tmp['name'] = target_name
                    # Keep only city/county/state/district/banner names.
                    if not tmp['name'].endswith(('市', '县', '州', '区', '旗')):
                        continue
                    lst.append(tmp)
            dest_str = json.dumps(lst)
            rds.set(rds_key, dest_str)
        lst = json.loads(dest_str)
        return lst

    def start_requests(self):
        # Entry point: fetch the catalogue of all start cities.
        start_url = "http://www.968980.cn/com/yxd/pris/openapi/cityQueryAll.action"
        yield scrapy.FormRequest(start_url,
                                 method="POST",
                                 formdata={},
                                 callback=self.parse_start_city)

    def parse_start_city(self, response):
        """Flatten the start-city tree and fan out one queryAllTicket
        request per (start, end, date) triple over a 7-day window."""
        res = json.loads(response.body)
        if res["akfAjaxResult"] != "0":
            self.logger.error("parse_start_city: Unexpected return, %s", res)
            return
        start_list = []
        # The payload is a two-level list-of-lists; flatten it.
        for i in res['values']['list']:
            for j in i['list']:
                start_list.append(j)
#         end_list = self.get_init_dest_list(start_list[0])
        line_url = 'http://www.968980.cn/com/yxd/pris/openapi/queryAllTicket.action'
        for start in start_list:
            if not self.is_need_crawl(city=start['name']):
                continue
            end_list = self.get_dest_list('福建', start['name'])
            for end in end_list:
                # Normalise the inherited dest dict to this API's key.
                end['code'] = end['dest_id']
                today = datetime.date.today()
                for j in range(0, 7):
                    sdate = str(today + datetime.timedelta(days=j))
                    if self.has_done(start['name'], end["name"], sdate):
                        self.logger.info("ignore %s ==> %s %s" %
                                         (start['name'], end["name"], sdate))
                        continue
                    data = {
                        "arrivalDepotCode": end['code'],
                        "beginTime": sdate,
                        "startName": unicode(start['name']),
                        "endName": unicode(end["name"]),
                        "startDepotCode": start['code']
                    }
                    yield scrapy.FormRequest(line_url,
                                             method="POST",
                                             formdata=data,
                                             callback=self.parse_line,
                                             meta={
                                                 "start": start,
                                                 "end": end,
                                                 "date": sdate
                                             })

    def parse_line(self, response):
        """Parse one shift-list response and yield a LineItem per shift."""
        start = response.meta["start"]
        end = response.meta["end"]
        sdate = response.meta["date"]
        # Mark the triple done up front so a bad response is not retried
        # forever within this crawl window.
        self.mark_done(start['name'], end["name"], sdate)
        try:
            res = json.loads(response.body)
        except Exception, e:
            raise e
        if res["akfAjaxResult"] != "0":
            #self.logger.error("parse_line: Unexpected return, %s, %s->%s, %s", sdate, start["city_name"], end["city_name"], res["header"])
            return
        for d in res["values"]["resultList"]:
            # stopFlag '0' presumably means the shift is running (not
            # suspended) -- TODO confirm against the site.
            if d['stopFlag'] == '0':
                #                 if float(d["fullPrice"]) < 5 or int(d["remainSeats"]) < 2:
                #                     continue
                attrs = dict(
                    s_province='福建',
                    s_city_name=start['name'],
                    s_city_id=start['code'],
                    s_city_code=get_pinyin_first_litter(start['name']),
                    s_sta_name=d["startDepotName"],
                    s_sta_id=d["startDepotCode"],
                    d_city_name=end["name"],
                    d_city_code=get_pinyin_first_litter(end["name"]),
                    d_city_id=end['code'],
                    d_sta_name=d["arrivalDepotName"],
                    d_sta_id=d["arrivalDepotCode"],
                    drv_date=d["departDate"],
                    drv_time=d["leaveTime"],
                    drv_datetime=dte.strptime(
                        "%s %s" % (d["departDate"], d["leaveTime"]),
                        "%Y-%m-%d %H:%M"),
                    # Distance/vehicle type are not provided by this API.
                    distance="0",
                    vehicle_type="",
                    seat_type="",
                    bus_num=d["busCode"],
                    full_price=float(d["fullPrice"]),
                    half_price=float(d["fullPrice"]) / 2,
                    fee=0,
                    crawl_datetime=dte.now(),
                    extra_info={
                        "busCodeType": d["busCodeType"],
                        "regsName": d["regsName"],
                        "busCompanyCode": d["busCompanyCode"],
                        "s_code": start['code'],
                        'e_code': end['code']
                    },
                    left_tickets=int(d["remainSeats"]),
                    crawl_source="fjky",
                    shift_id="",
                )
                yield LineItem(**attrs)