Ejemplo n.º 1
0
    def parse_one(self, response):
        """Build one item from a detail page of the Guangdong commission site.

        The title and the publication date are pulled out of the raw page
        bytes with regular expressions and decoded as GBK; the article body
        is the first ``div`` found under ``div#zoom``.

        Args:
            response: the downloaded page; ``response.body`` (raw bytes) and
                ``response.xpath`` are used.

        Yields:
            A single ``ShanghaioneItem`` with ``title``, ``md5``, ``times``,
            ``content``, ``yuan`` and ``province`` filled in.  ``times`` is
            ``""`` when no ``t ='...'`` assignment is found or its value is
            not a ``YYYY-MM-DD`` date; ``content`` is ``""`` when the body
            node is missing.

        Raises:
            IndexError: if the page contains no ``<h1>...</h1>`` title.
        """
        item = ShanghaioneItem()

        # response.body is raw bytes, so the patterns are bytes as well:
        # identical on Python 2, and no TypeError on Python 3.
        title = re.findall(br'<h1>(.*?)</h1>', response.body)[0].decode("gbk")
        item['title'] = title
        # MD5 hex digest of the UTF-8 encoded title.
        item['md5'] = hashlib.md5(title.encode("utf-8")).hexdigest()

        try:
            times = re.findall(br"t ='(.*)'", response.body)[0].decode("gbk")
            # Validation only; the parsed struct_time is discarded.
            time.strptime(times, "%Y-%m-%d")
        except (IndexError, ValueError):
            # IndexError: no date assignment on the page.
            # ValueError: not a YYYY-MM-DD date (also covers a failed decode,
            # since UnicodeDecodeError is a ValueError subclass).
            times = ""
        item['times'] = times

        # Run the XPath once and fall back to "" when nothing matches.
        content_nodes = response.xpath("//div[@id='zoom']/div").extract()
        item['content'] = content_nodes[0] if content_nodes else ""

        item['yuan'] = "广东省经济和信息化委员会"
        item["province"] = "广东"
        yield item
Ejemplo n.º 2
0
    def parse_one(self, response):
        """Build one item from a detail page of the Jiangsu commission site.

        The title comes from ``div.nstit/h1``, the date from the
        ``div.nstimes0`` text and the article body from ``div.TRS_Editor``.

        Args:
            response: the downloaded page; only ``response.xpath`` is used.

        Yields:
            A single ``ShanghaioneItem`` with ``title``, ``md5``, ``times``,
            ``content``, ``yuan`` and ``province`` filled in.  ``times`` is
            ``""`` when the date text is missing, has fewer than three
            whitespace-separated tokens, or is not a ``YYYY-MM-DD`` date.

        Raises:
            IndexError: if the title node or the ``TRS_Editor`` node is
                missing from the page.
        """
        item = ShanghaioneItem()

        # Evaluate the title XPath once and reuse it for the hash.
        title = response.xpath("//div[@class='nstit']/h1/text()").extract()[0]
        item['title'] = title
        # MD5 hex digest of the UTF-8 encoded title.
        item['md5'] = hashlib.md5(title.encode("utf-8")).hexdigest()

        date_texts = response.xpath("//div[@class='nstimes0']/text()").extract()
        try:
            # Third whitespace-separated token, minus its first three
            # characters.  split() with no argument already ignores
            # leading/trailing whitespace.
            times = date_texts[0].split()[2][3:]
            # Validation only; the parsed struct_time is discarded.
            time.strptime(times, "%Y-%m-%d")
        except (IndexError, ValueError):
            # IndexError: date text absent or too short.
            # ValueError: not a YYYY-MM-DD date.
            times = ""
        item['times'] = times

        item['content'] = response.xpath("//div[@class='TRS_Editor']").extract()[0]
        item['yuan'] = "江苏省经济和信息化委员会"
        item["province"] = "江苏"
        yield item
Ejemplo n.º 3
0
    def parse_one(self, response):
        """Build one item from a detail page of the Zhejiang commission site.

        The title comes from ``td.title``, the date from the last text node
        of the small-print ``td`` (with the "印发时间:" label removed) and the
        article body from ``div#zoom``.

        Args:
            response: the downloaded page; only ``response.xpath`` is used.

        Yields:
            A single ``ShanghaioneItem`` with ``title``, ``md5``, ``times``,
            ``content``, ``yuan`` and ``province`` filled in.  ``times`` is
            ``""`` when the date cell is missing or does not hold a
            ``YYYY-MM-DD`` date.

        Raises:
            IndexError: if the title cell is missing from the page.
        """
        item = ShanghaioneItem()

        # Evaluate the title XPath once and reuse it for the hash.
        title = response.xpath("//td[@class='title']/text()").extract()[0]
        item['title'] = title
        # MD5 hex digest of the UTF-8 encoded title.
        item['md5'] = hashlib.md5(title.encode("utf-8")).hexdigest()

        date_cells = response.xpath(
            "//td[@style='line-height:20px;font-size:12px;']/text()").extract()
        try:
            # Unicode literals keep replace() from implicitly decoding a
            # non-ASCII byte string on Python 2; no effect on Python 3.
            times = date_cells[-1].replace(u"印发时间:", u"")
            # Validation only; the parsed struct_time is discarded.
            time.strptime(times, "%Y-%m-%d")
        except (IndexError, ValueError):
            # IndexError: no date cell on the page.
            # ValueError: not a YYYY-MM-DD date.
            times = ""
        item['times'] = times

        # NOTE(review): this stores the list of all matching nodes rather
        # than a single HTML string -- confirm downstream code expects a list.
        item['content'] = response.xpath("//div[@id='zoom']").extract()
        item['yuan'] = "浙江省经济和信息化委员会"
        item["province"] = "浙江"
        yield item
Ejemplo n.º 4
0
    def parse_one(self, response):
        """Build one item from a detail page of the Shanghai commission site.

        The title comes from ``h1#ivs_title``, the date from the
        ``h3.view_tit_1`` text and the article body from ``div#ivs_content``.

        Args:
            response: the downloaded page; only ``response.xpath`` is used.

        Yields:
            A single ``ShanghaioneItem`` with ``title``, ``md5``, ``times``,
            ``content``, ``yuan`` and ``province`` filled in.  ``times`` is
            ``""`` when the heading is missing, contains no digit-delimited
            span, or that span is not a ``YYYY-MM-DD`` date.

        Raises:
            IndexError: if the title node or the ``ivs_content`` node is
                missing from the page.
        """
        item = ShanghaioneItem()

        # Evaluate the title XPath once and reuse it for the hash.
        title = response.xpath("//h1[@id='ivs_title']/text()").extract()[0]
        item['title'] = title
        # MD5 hex digest of the UTF-8 encoded title.
        item['md5'] = hashlib.md5(title.encode("utf-8")).hexdigest()

        headings = response.xpath("//h3[@class='view_tit_1']/text()").extract()
        try:
            # Greedy match: everything from the first digit to the last
            # digit of the heading text.
            times = re.findall(r"\d.*\d", headings[0])[0]
            # Validation only; the parsed struct_time is discarded.
            time.strptime(times, "%Y-%m-%d")
        except (IndexError, ValueError):
            # IndexError: heading absent or without a digit span.
            # ValueError: not a YYYY-MM-DD date.
            times = ""
        item['times'] = times

        item['content'] = response.xpath("//div[@id='ivs_content']").extract()[0]
        item['yuan'] = "上海市经济和信息化委员会"
        item["province"] = "上海"
        yield item