コード例 #1
0
ファイル: axn_india.py プロジェクト: Bencakes/cms-data-tools
def parse_items(response):
    """Extract EPG items from a schedule listing page.

    Every ``li.listing`` under ``ul.schedule-listings`` becomes one
    ``EpgItem`` with the keys ``name``, ``starttime``, ``endtime`` (always
    the empty string) and ``desc``.

    Args:
        response: Object exposing ``xpath()`` whose results support
            ``xpath()`` and ``extract()`` (presumably a Scrapy response --
            confirm against the caller).

    Returns:
        list: The items that could be parsed, in page order.  Entries
        without a usable title or start time are reported via ``print`` and
        skipped; a missing meta line or synopsis is tolerated.
    """
    # Compiled once for the whole page: captures the text inside the <h2>,
    # ignoring an optional wrapping <a href=...> link.
    title_re = re.compile(r'\<h2.*?\>\s*(?:\<a href.*?\>)*(.*?)(?:\</a\>)*\s*\</h2\>', re.S)

    items = []
    for li in response.xpath("//ul[@class='schedule-listings']/li[@class='listing']"):
        title_html = li.xpath("./div[@class='content']/h2[@class='title']").extract()
        if not title_html:
            # No title node: skip the entry instead of falling back to a
            # title left over from a previous entry (or an unbound name).
            print("Xpath parse name error!")
            continue

        match = title_re.search(title_html[0])
        if match is None:
            print("re parse name error!")
            continue
        title = match.group(1)

        meta = li.xpath("./div[@class='content']/div[@class='meta']/text()").extract()
        if meta:
            name = title + " " + meta[0]
        else:
            # The meta line is optional; the bare title is still a valid name.
            print("Xpath parse name error!")
            name = title

        synopsis = li.xpath("./div[@class='content']/div[@class='synopsis']/text()").extract()
        if synopsis:
            desc = synopsis[0]
        else:
            print("description is none!")
            desc = ""

        try:
            time_str = li.xpath("./div[@class='date-time']/time/text()").extract()[0]
            date_str = li.xpath("./div[@class='date-time']/span/text()").extract()[0]

            # Convert times such as "6.00am" to 24-hour format.
            time_str = time12to24(time_str)

            # The date text is parsed as "<weekday>, <day> <month>" with no
            # year, so the current local year is prepended.
            # NOTE(review): listings that cross a year boundary would get the
            # wrong year -- confirm whether that matters for this feed.
            stamp = "%s,%s,%s" % (time.strftime('%Y'), date_str.strip(), time_str)
            parsed = time.strptime(stamp, "%Y,%A, %d %b,%H:%M")
            ftime = time.strftime("%Y.%m.%d %H:%M:%S", parsed)
        except IndexError:
            print("Xpath parse time error!")
            continue
        except ValueError:
            print("time values error!")
            continue

        item = EpgItem()
        item['name'] = name
        item['starttime'] = ftime
        item['endtime'] = ''
        item['desc'] = desc.strip()
        items.append(item)

    return items
コード例 #2
0
ファイル: mbc_3.py プロジェクト: Bencakes/cms-data-tools
    def parse(self, response):
        """Yield one ``EpgItem`` per programme found on the schedule page.

        The page is walked day by day: each ``date-program-wrapper`` block
        supplies a date heading and a list of programme ``div`` elements.
        Every yielded item carries ``name`` and ``starttime``; ``endtime``
        and ``desc`` are always empty strings.

        Args:
            response: Object exposing ``xpath()`` (presumably a Scrapy
                response -- confirm against the caller).

        Yields:
            EpgItem: One item per programme, in page order.

        Raises:
            IndexError: If a date heading, title or time node is missing
                (``extract()[0]`` is not guarded here).
        """
        stamp_format = "%Y.%m.%d %H:%M:%S"
        day_blocks = response.xpath("//div[@class='box-container-wrapper']/div[contains(@class,'date-program-wrapper')]")
        for day_block in day_blocks:
            # Only the last 10 characters of the heading are used; they are
            # handed to trans_format together with "%d-%m-%Y" / "%Y.%m.%d".
            heading = day_block.xpath("./div[@class='box-inner-container-header']/h2/text()").extract()[0]
            day = trans_format(heading[-10:], "%d-%m-%Y", "%Y.%m.%d")

            for entry in day_block.xpath("./div[@class='box-inner-container-wrapper']/div"):
                name = entry.xpath("./div[@class='title']/h2/text()").extract()[0]
                raw_time = entry.xpath("./div[@class='timing']/time/text()").extract()[0]

                # Keep the part before the first "/", cut it to 7 characters
                # and swap ":" for "." before the 12h -> 24h conversion.
                first_slot = raw_time.split("/")[0]
                clock = time12to24(first_slot[:7].strip().replace(":", "."))

                # trans_format's result is parsed with stamp_format below.
                local_start = trans_format("%s %s" % (day, clock), "%Y.%m.%d %H:%M")

                # NOTE(review): the fixed +8h looks like a timezone shift --
                # confirm the intended source/target zones.
                shifted = datetime.datetime.strptime(local_start, stamp_format) + datetime.timedelta(hours=8)
                starttime = shifted.strftime(stamp_format)

                item = EpgItem()
                item['name'] = name
                item['starttime'] = starttime
                item['endtime'] = ''
                item['desc'] = ''
                yield item
コード例 #3
0
    def parse_epg(self, response):
        """Yield one ``EpgItem`` per ``schedule_grid`` block on the page.

        The date comes from the last 8 characters of ``response.url``; the
        time, title and optional subtitle come from each block's
        ``schedule_details`` element.  ``endtime`` and ``desc`` are always
        empty strings.

        Args:
            response: Object exposing ``url`` and ``xpath()`` (presumably a
                Scrapy response -- confirm against the caller).

        Yields:
            EpgItem: One item per schedule block, in page order.

        Raises:
            IndexError: If the time or title node is missing; only the
                subtitle lookup is guarded.
        """
        day = response.url[-8:]
        grids = response.xpath("//div[@class='schedule_grid ']")
        # Matches any run of two or more whitespace characters.
        squeeze = re.compile(r"\s\s+")

        for grid in grids:
            details = grid.xpath("./div[@class='schedule_details']")
            raw_time = details.xpath("./p[@class='info']/text()").extract()[0]
            title = details.xpath("./p[@class='title']/a/text()").extract()[0]

            # The subtitle <span> is optional.
            try:
                subtitle = details.xpath("./p[@class='title']/a/span/text()").extract()[0]
            except (IndexError, ValueError):
                subtitle = ""

            # Drop whitespace runs and remaining single spaces before the
            # 12h -> 24h conversion.
            compact_time = squeeze.sub("", raw_time).replace(" ", "")
            clock = time12to24(compact_time)

            # self.formats is extended with " %H:%M" and passed as the
            # format argument (presumably it describes the URL date --
            # confirm against the class definition).
            starttime = trans_format("%s %s" % (day, clock), self.formats + " %H:%M")

            item = EpgItem()
            item["name"] = squeeze.sub("", title.strip() + subtitle.strip())
            item["starttime"] = starttime
            item["endtime"] = ""
            item["desc"] = ""
            yield item