def save_article_dict_list(self, nick_name, article_dict_list):
    # Evict the oldest stored article once the per-account limit is exceeded.
    if self.col.find(dict(nick_name=self.name)).count() > self.limit:
        oldest_doc = list(
            self.col.find(dict(nick_name=self.name)).sort([
                ('ori_create_time', 1)
            ]).limit(1))[0]
        oldest_doc_id = oldest_doc.get('_id')
        self.col.remove({'_id': oldest_doc_id})
        self.logger.info(
            "%s: deleted: %s : %s\n" % (
                self.name, oldest_doc.get('title'),
                datestr_from_stamp(oldest_doc.get('ori_create_time'),
                                   '%Y-%m-%d')))
    for o in article_dict_list:
        if o['title']:
            o_date = datestr_from_stamp(o.get('ori_create_time'),
                                        '%Y-%m-%d')
            self.logger.info('%s - saving article title: %s %s\n',
                             self.name, o['title'], o_date)
            o['tag_id'] = self.tag_id
            # Upsert under a fresh id so every article becomes a new document.
            self.col.update({'_id': gid()}, {'$set': o}, True)
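# The collection calls above (`find().count()`, `remove()`, `update()` with a
# positional upsert flag) are the legacy pymongo API, removed in pymongo 4.
# A minimal sketch of the same evict-then-upsert flow on the modern API,
# assuming the same `col`, `limit`, `tag_id`, and `gid()` helpers:
def _evict_and_upsert_sketch(self, article):
    # Delete the oldest stored article once the account exceeds its limit.
    if self.col.count_documents({'nick_name': self.name}) > self.limit:
        oldest = self.col.find_one({'nick_name': self.name},
                                   sort=[('ori_create_time', 1)])
        if oldest is not None:
            self.col.delete_one({'_id': oldest['_id']})
    article['tag_id'] = self.tag_id
    # Upsert under a fresh id, mirroring the `update(..., True)` call above.
    self.col.update_one({'_id': gid()}, {'$set': article}, upsert=True)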
@classmethod
def to_dict(cls, doc):
    post = bson_to_json(doc)
    # Route the cover image through QQ's image proxy so it renders
    # outside WeChat (works around hotlink protection on the CDN URL).
    pre_url = ('http://read.html5.qq.com/image?src=forum&q=5&r=0'
               '&imgflag=7&imageUrl=')
    post['image'] = pre_url + post['cdn_url']
    post['date'] = datestr_from_stamp(post['ori_create_time'], '%Y-%m-%d')
    return ObjectDict(post)
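# Usage sketch (hypothetical call site, assuming this classmethod sits on
# the same spider class and `doc` is a raw document from `self.col`):
def _render_latest_post_sketch(self):
    doc = self.col.find_one({'nick_name': self.name})
    post = self.to_dict(doc)
    # `image` and `date` are the fields to_dict adds on top of the raw doc.
    return '%s (%s): %s' % (post['title'], post['date'], post['image'])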
def fetch_page(self, page_url):
    """Fetch a single article page. Appending the parameter f=json to the
    article URL makes WeChat return the data directly as JSON; parse it
    and keep only the fields we need.
    """
    # Evict the oldest stored article once the per-account limit is exceeded.
    if self.col.find(dict(nick_name=self.name)).count() > self.limit:
        oldest_doc = list(
            self.col.find(dict(nick_name=self.name)).sort([
                ('ori_create_time', 1)
            ]).limit(1))[0]
        oldest_doc_id = oldest_doc.get('_id')
        self.col.remove({'_id': oldest_doc_id})
        self.logger.info(
            "%s: deleted: %s : %s\n" % (
                self.name, oldest_doc.get('title'),
                datestr_from_stamp(oldest_doc.get('ori_create_time'),
                                   '%Y-%m-%d')))
    # First resolve the Sogou URL that redirects to the WeChat article.
    pre_r = get(page_url, headers=self.headers)
    wechat_url = pre_r.url.split('#')[0] + '&f=json'
    if 'mp.weixin' not in wechat_url:
        return
    r = get(wechat_url, headers=self.headers)
    self.logger.info(wechat_url)
    if self.col.find_one(dict(nick_name=self.name, url=wechat_url)):
        raise DocumentExistsException("article exist")
    if r.status_code != 200:
        return
    o = json.loads(r.text)
    if o.get('title') is None:
        # Articles taken down after a complaint lose this field; skip them.
        return
    fields = {
        'cdn_url', 'nick_name', 'title', 'content', 'desc', 'link',
        'ori_create_time'
    }
    media_fields = {'round_head_img', 'nick_name', 'signature'}
    media_dict = {k: o.get(k) for k in media_fields}
    article_dict = {k: o.get(k) for k in fields}
    if self.col.find_one(dict(nick_name=self.name, title=o['title'])):
        raise DocumentExistsException("article exist")
    too_old_days = 10
    if days_from_now(o['ori_create_time']) > too_old_days:
        # Skip articles older than 10 days.
        self.logger.info('%s skipping article older than %d days, title: %s\n',
                         self.name, too_old_days, o['title'])
        raise DocumentExpireException("expire")
    if o['title'] and o['content']:
        o_date = datestr_from_stamp(o.get('ori_create_time'), '%Y-%m-%d')
        self.logger.info('%s - saving article title: %s %s\n',
                         self.name, o['title'], o_date)
        article_dict['nick_name'] = self.name
        article_dict['url'] = wechat_url
        article_dict['tag_id'] = self.tag_id
        # Drop the raw HTML body; only the article metadata is stored.
        del article_dict['content']
        self.col.update({'_id': gid()}, {'$set': article_dict}, True)
        # Example link:
        # http://mp.weixin.qq.com/s?__biz=MjM5NjAxMDc4MA==&mid=404900944&idx=1&sn=fe2d53ce562ee51e7163a60d4c95484a#rd
        biz = extract('__biz=', '==', article_dict['link'])
        self.media_col.update({'_id': biz}, {'$set': media_dict}, True)
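# A minimal sketch of the f=json trick the docstring describes, using plain
# `requests` (the bare `get` above is assumed to wrap it); the steps mirror
# the URL resolution in fetch_page, without the dedup and storage logic:
def _fetch_article_json_sketch(self, sogou_url):
    import requests
    # Sogou answers with a redirect chain that ends at mp.weixin.qq.com;
    # appending f=json makes WeChat serve the article payload as JSON.
    pre_r = requests.get(sogou_url, headers=self.headers)
    wechat_url = pre_r.url.split('#')[0] + '&f=json'
    if 'mp.weixin' not in wechat_url:
        return None
    return requests.get(wechat_url, headers=self.headers).json()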