コード例 #1
0
ファイル: wangyi.py プロジェクト: CN-P5/163spider
    def get_qsyk_and_insert(self, docid):
        """Download one article and insert it into the ``wangyi`` table.

        ``docid`` is a mapping that provides the keys ``'docid'`` and
        ``'cover_img'``.  Nothing is inserted when ``self.db_has_exist``
        reports the article as already stored, when the download returns
        nothing, or when the payload has no (or an empty) entry for the
        article.

        Every text value taken from the download is passed through
        ``MySQLdb.escape_string`` before being interpolated into the INSERT
        statement, so a quote in e.g. the title can no longer break (or
        inject into) the query.
        """
        def sql_text(value):
            # escape_string works on byte strings; encode text explicitly
            # instead of relying on the interpreter's default encoding.
            if not isinstance(value, bytes):
                value = value.encode('utf-8')
            return MySQLdb.escape_string(value)

        cover_img = MySQLdb.escape_string(docid['cover_img'])
        docid = docid['docid']

        if self.db_has_exist(docid):
            return

        url = "http://c.3g.163.com/nc/article/%s/full.html" % str(docid)
        data = utils.download_page(url, True)
        if not data:
            return

        # A payload without an entry for this article is skipped, exactly
        # like an empty entry, instead of raising KeyError.
        try:
            data = data[docid]
        except KeyError:
            return
        if not data:
            return

        ptime = data['ptime']
        today = ptime.split(' ')[0]  # date part of "<date> <time>"
        imgs = data['img']
        body = data['body'].encode('utf-8')

        # NOTE(review): the original chained each parenthesis replace twice;
        # the duplicates were no-ops and may have been meant for full-width
        # parentheses -- confirm against real titles.
        title = data['title'].replace(' ', '').replace('(', '-').replace(')', '')

        # Swap every image placeholder in the body for an actual <img> tag.
        for img in imgs:
            body = body.replace(img['ref'], "<img src=\"" + img['src'] + "\"/><hr>")

        # Percent signs are doubled before the statement is handed over;
        # presumably utils.insert_mysql %-formats the SQL -- TODO confirm.
        body = body.replace('%', '%%')
        body = MySQLdb.escape_string(body)
        sql = "insert into wangyi(item_type, title, url, docid, cover_img, ptime, today, body) values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (
            self._item_type, sql_text(title), sql_text(url),
            sql_text(str(docid)), cover_img, sql_text(ptime),
            sql_text(today), body)
        utils.insert_mysql(sql)
コード例 #2
0
ファイル: wangyi.py プロジェクト: xiaogang00/web-crawler
    def get_qsyk_and_insert(self, docid):
        """Download one article and insert it into the ``wangyi`` table.

        ``docid`` is a mapping that provides the keys ``'docid'`` and
        ``'cover_img'``.  Nothing is inserted when ``self.db_has_exist``
        reports the article as already stored, when the download returns
        nothing, or when the payload has no (or an empty) entry for the
        article.

        Every text value taken from the download is passed through
        ``MySQLdb.escape_string`` before being interpolated into the INSERT
        statement, so a quote in e.g. the title can no longer break (or
        inject into) the query.
        """
        def sql_text(value):
            # escape_string works on byte strings; encode text explicitly
            # instead of relying on the interpreter's default encoding.
            if not isinstance(value, bytes):
                value = value.encode('utf-8')
            return MySQLdb.escape_string(value)

        cover_img = MySQLdb.escape_string(docid['cover_img'])
        docid = docid['docid']

        if self.db_has_exist(docid):
            return

        url = "http://c.3g.163.com/nc/article/%s/full.html" % str(docid)
        data = utils.download_page(url, True)
        if not data:
            return

        # A payload without an entry for this article is skipped, exactly
        # like an empty entry, instead of raising KeyError.
        try:
            data = data[docid]
        except KeyError:
            return
        if not data:
            return

        ptime = data['ptime']
        today = ptime.split(' ')[0]  # date part of "<date> <time>"
        imgs = data['img']
        body = data['body'].encode('utf-8')

        # NOTE(review): the original chained each parenthesis replace twice;
        # the duplicates were no-ops and may have been meant for full-width
        # parentheses -- confirm against real titles.
        title = data['title'].replace(' ', '').replace(
            '(', '-').replace(')', '')

        # Swap every image placeholder in the body for an actual <img> tag.
        for img in imgs:
            body = body.replace(
                img['ref'], "<img src=\"" + img['src'] + "\"/><hr>")

        # Percent signs are doubled before the statement is handed over;
        # presumably utils.insert_mysql %-formats the SQL -- TODO confirm.
        body = body.replace('%', '%%')
        body = MySQLdb.escape_string(body)
        sql = "insert into wangyi(item_type, title, url, docid, cover_img, ptime, today, body) values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (
            self._item_type, sql_text(title), sql_text(url),
            sql_text(str(docid)), cover_img, sql_text(ptime),
            sql_text(today), body)
        utils.insert_mysql(sql)
コード例 #3
0
ファイル: wangyi.py プロジェクト: CN-P5/163spider
 def get_docid_from_json(self):
     """Fetch the list page for the configured range and extract doc ids.

     The URL is ``self._list_url`` followed by ``<start>-<end>.html``.  The
     downloaded text is parsed as JSON and stored in ``self._data``; when
     the parsed payload contains the ``self._list_docid`` key,
     ``self._data`` is narrowed to that entry and ``self.extract_docid()``
     is called.  Nothing further happens when the download is empty or the
     key is absent.

     (Original note, translated: for the given start/end range, extract
     the key URL elements of the daily posts in that range.)
     """
     url = self._list_url + str(self._start) + "-" + str(self._end) + ".html"
     self._data = utils.download_page(url)
     if not self._data:
         return

     self._data = json.loads(self._data)
     # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
     if self._list_docid in self._data:
         self._data = self._data[self._list_docid]
         self.extract_docid()
コード例 #4
0
ファイル: wangyi.py プロジェクト: xiaogang00/web-crawler
 def get_docid_from_json(self):
     """Fetch the list page for the configured range and extract doc ids.

     The URL is ``self._list_url`` followed by ``<start>-<end>.html``.  The
     downloaded text is parsed as JSON and stored in ``self._data``; when
     the parsed payload contains the ``self._list_docid`` key,
     ``self._data`` is narrowed to that entry and ``self.extract_docid()``
     is called.  Nothing further happens when the download is empty or the
     key is absent.

     (Original note, translated: for the given start/end range, extract
     the key URL elements of the daily posts in that range.)
     """
     url = self._list_url + str(self._start) + "-" + str(
         self._end) + ".html"
     self._data = utils.download_page(url)
     if not self._data:
         return

     self._data = json.loads(self._data)
     # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
     if self._list_docid in self._data:
         self._data = self._data[self._list_docid]
         self.extract_docid()