Beispiel #1
0
 def extract_and_fill(self, item, data):
     """Fill ``item`` with the fields parsed out of the raw page ``data``.

     ``data`` is passed through ``decodeHtml`` (converts to UTF-8, per the
     original comment) and the result through ``parseHtml``. Three entries
     of the parsed result are then copied onto ``item`` under the item's
     own key names. ``item`` is modified in place; nothing is returned.
     """
     parsed = parseHtml(decodeHtml(data))

     # (key written on item, key read from the parsed result), in copy order.
     field_map = (
         ('publish_datetime', 'datetime'),
         ('title', 'title'),
         ('content', 'text'),
     )
     for target_key, source_key in field_map:
         item[target_key] = parsed[source_key]
Beispiel #2
0
    def parse_detail(self, response):
        """Build a ``CrawlItem`` from a detail-page response.

        The item is identified by the MD5 hex digest of the page URL and
        stamped with the current (naive, local) time. The response body is
        then decoded and parsed, and the publish time, title and text are
        copied onto the item.

        Args:
            response: object exposing ``url`` and ``body`` (presumably a
                Scrapy response -- confirm against the caller).

        Returns:
            The populated ``CrawlItem``.
        """
        url = response.url

        # hashlib only accepts bytes. Encode text URLs explicitly so this
        # works on Python 3 and for non-ASCII unicode URLs on Python 2;
        # byte strings are passed through unchanged, so digests for
        # existing ASCII URLs stay the same.
        url_bytes = url if isinstance(url, bytes) else url.encode("utf-8")

        item = CrawlItem(
            url=url,
            site=self.site_name,
            crawl_datetime=datetime.datetime.now(),
            uuid=hashlib.md5(url_bytes).hexdigest(),
        )

        # Raw payload of the response.
        data = response.body

        # Convert to UTF-8 (per the original comment on decodeHtml).
        html = decodeHtml(data)

        # Parse the page and copy the extracted fields onto the item.
        info = parseHtml(html)
        item["publish_datetime"] = info["datetime"]
        item["title"] = info["title"]
        item["content"] = info["text"]

        return item
Beispiel #3
0
 def getHtml(url):
     """Fetch ``url`` and return its body after ``decodeHtml`` processing.

     Args:
         url: address handed to ``urllib2.urlopen``.

     Returns:
         Whatever ``decodeHtml`` returns for the downloaded payload
         (presumably the page converted to UTF-8 -- confirm against
         ``decodeHtml``).

     Errors raised by ``urllib2.urlopen`` or ``read`` propagate to the
     caller unchanged.
     """
     response = urllib2.urlopen(url)
     try:
         data = response.read()
     finally:
         # Always release the connection, even when read() raises.
         # Python 2 response objects are not context managers, so close
         # explicitly instead of using ``with``.
         response.close()
     return decodeHtml(data)