def __call__(self, data, url):
    """Parse one fetched page and store it as a feed record (generator).

    data -- page source that every value is extracted from.
    url  -- address of the page; its last segment (``url_last``) becomes the
            record id, and the url is removed from ``EXIST_PARSE`` once the
            feed has been stored.

    Yields ``(parse_like, like_url, cid, rid)`` when a like counter is found
    (presumably a follow-up request for the crawler -- confirm with caller).
    Nothing is stored, and ``EXIST_PARSE`` is left untouched, when
    ``self.user_id(data)`` returns ``None``.
    """
    # NOTE(review): this block arrived flattened onto one physical line; the
    # nesting was rebuilt from statement order.  In particular, placing the
    # ``yield`` under ``if like_num`` is an assumption -- confirm against
    # version control.
    rid = url_last(url)
    cid = self.cid
    title = self.title(data)

    rec_num = txt_wrap_by('<span class="rec-num">', '人</span>', data) or 0
    like_num = txt_wrap_by(
        '<span class="fav-num" data-tid="', '</a>喜欢</span>', data
    ) or 0
    if like_num:
        # The first match still contains markup; narrow it down to the text
        # between the anchor and the counter suffix.
        like_num = txt_wrap_by('<a href="#">', '人', like_num)
        yield parse_like, URL_LIKE % (cid, rid), cid, rid

    owner_id = self.user_id(data)
    if owner_id is None:
        return

    # Raw identifiers that could not be turned into integers; they are kept
    # in a DoubanFeedOwner row further down.
    unresolved_owner = unresolved_topic = 0

    try:
        owner_id = int(owner_id)
    except ValueError:
        known_id = DoubanUser.by_url(owner_id)
        if known_id:
            owner_id = known_id
        else:
            unresolved_owner = owner_id
            owner_id = 0

    topic_id = self.topic_id(data)
    try:
        topic_id = int(topic_id)
    except ValueError:
        unresolved_topic = topic_id
        topic_id = 0

    # Named ``created`` so the local does not shadow a module-level ``time``.
    created = self.time(data)
    if created:
        created = int_by_string(created)

    feed_id = douban_feed_new(
        cid, rid, rec_num, like_num, title, self.htm(data),
        created, owner_id, topic_id
    )

    if unresolved_owner or unresolved_topic:
        DoubanFeedOwner(
            id=feed_id, topic=unresolved_topic, owner=unresolved_owner
        ).save()

    #for user_id in user_id_by_txt(data):
    #    yield douban_recommendation_begin_tuple(user_id)

    if url in EXIST_PARSE:
        EXIST_PARSE.remove(url)
def __call__(self, data, url):
    """Parse one fetched page and store it as a feed record (generator).

    data -- page source that every value is extracted from.
    url  -- address of the page; its last segment (``url_last``) becomes the
            record id, and the url is removed from ``EXIST_PARSE`` once the
            feed has been stored.

    Yields ``(parse_like, like_url, cid, rid)`` when a like counter is found
    (presumably a follow-up request for the crawler -- confirm with caller).
    Nothing is stored, and ``EXIST_PARSE`` is left untouched, when
    ``self.user_id(data)`` returns ``None``.
    """
    # NOTE(review): this block arrived flattened onto one physical line; the
    # nesting was rebuilt from statement order.  In particular, placing the
    # ``yield`` under ``if like_num`` is an assumption -- confirm against
    # version control.
    rid = url_last(url)
    cid = self.cid
    title = self.title(data)

    rec_num = txt_wrap_by('<span class="rec-num">', '人</span>', data) or 0
    like_num = txt_wrap_by(
        '<span class="fav-num" data-tid="', '</a>喜欢</span>', data
    ) or 0
    if like_num:
        # The first match still contains markup; narrow it down to the text
        # between the anchor and the counter suffix.
        like_num = txt_wrap_by('<a href="#">', '人', like_num)
        yield parse_like, URL_LIKE % (cid, rid), cid, rid

    owner_id = self.user_id(data)
    if owner_id is None:
        return

    # Raw identifiers that could not be turned into integers; they are kept
    # in a DoubanFeedOwner row further down.
    unresolved_owner = unresolved_topic = 0

    try:
        owner_id = int(owner_id)
    except ValueError:
        known_id = DoubanUser.by_url(owner_id)
        if known_id:
            owner_id = known_id
        else:
            unresolved_owner = owner_id
            owner_id = 0

    topic_id = self.topic_id(data)
    try:
        topic_id = int(topic_id)
    except ValueError:
        unresolved_topic = topic_id
        topic_id = 0

    # Named ``created`` so the local does not shadow a module-level ``time``.
    created = self.time(data)
    if created:
        created = int_by_string(created)

    feed_id = douban_feed_new(
        cid, rid, rec_num, like_num, title, self.htm(data),
        created, owner_id, topic_id
    )

    if unresolved_owner or unresolved_topic:
        DoubanFeedOwner(
            id=feed_id, topic=unresolved_topic, owner=unresolved_owner
        ).save()

    #for user_id in user_id_by_txt(data):
    #    yield douban_recommendation_begin_tuple(user_id)

    if url in EXIST_PARSE:
        EXIST_PARSE.remove(url)
def douban_recommendation(data, url, start_index=None):
    """Store the entries of one recommendation feed page (generator).

    data        -- JSON text of the feed page.
    url         -- not read; the name is rebound to values derived from the
                   payload.
    start_index -- index the page was requested with, or ``None``.  When it
                   equals 1 the author is registered via ``DoubanUser.new``.

    Yields every item produced by the ``DOUBAN_REC_PARSE`` handler of an
    entry, and -- when ``start_index`` is not ``None`` and the page had
    entries -- a final ``(douban_recommendation, next_url, start_index + 10)``
    tuple.  A page without entries instead saves a ``DoubanFetched`` row for
    the user and yields nothing.
    """
    # NOTE(review): this block arrived flattened onto one physical line; the
    # nesting was rebuilt from statement order -- confirm against version
    # control.
    data = loads(data)
    entry_list = data[u'entry']

    # Last path segment of the first two author links.
    user_id, url = [
        str(link['@href'].strip('/').rsplit('/', 1)[-1])
        for link in data[u'author'][u'link'][:2]
    ]

    if start_index == 1:
        # The feed title minus its last four characters is used as the name
        # (presumably a fixed suffix -- confirm).
        name = data[u'title'][u'$t'][:-4]
        DoubanUser.new(user_id, url, name)

    if not entry_list:
        fetched = DoubanFetched.get_or_create(id=user_id)
        fetched.save()
        return

    # Function-level import (presumably to dodge a circular import --
    # confirm); hoisted out of the per-entry loop since it is loop-invariant.
    from douban_parse import DOUBAN_REC_PARSE

    for entry in entry_list:
        title = (
            entry[u'content'][u'$t']
            .replace('\r', ' ').replace('\n', ' ').strip()
        )
        # for uid in user_id_by_txt(title):
        #     yield douban_recommendation_begin_tuple(uid)

        cid = str(entry[u'db:attribute'][0][u'$t'])
        # NOTE(review): only the translation is placed under this test; it is
        # possible the original nested the storing code below it as well.
        if cid in DOUBAN_REC_CID:
            cid = DOUBAN_REC_CID[cid]

        rec_id = entry[u'id'][u'$t'].rsplit('/', 1)[1]
        # Drop everything from the first '+' before converting.
        published = int_by_string(entry[u'published'][u'$t'].split('+', 1)[0])
        douban_rec_new(rec_id, user_id, cid, title, published)

        if cid in DOUBAN_REC_PARSE:
            parsed = DOUBAN_REC_PARSE[cid](title, user_id)
            if parsed is not None:
                for item in parsed:
                    yield item

    if start_index is not None:
        start = start_index + 10
        url = '%s&max-result=10&start-index=%s' % (URL_REC % user_id, start)
        yield douban_recommendation, url, start
def douban_recommendation(data, url, start_index=None):
    """Store the entries of one recommendation feed page (generator).

    data        -- JSON text of the feed page.
    url         -- not read; the name is rebound to values derived from the
                   payload.
    start_index -- index the page was requested with, or ``None``.  When it
                   equals 1 the author is registered via ``DoubanUser.new``.

    Yields every item produced by the ``DOUBAN_REC_PARSE`` handler of an
    entry, and -- when ``start_index`` is not ``None`` and the page had
    entries -- a final ``(douban_recommendation, next_url, start_index + 10)``
    tuple.  A page without entries instead saves a ``DoubanFetched`` row for
    the user and yields nothing.
    """
    # NOTE(review): this block arrived flattened onto one physical line; the
    # nesting was rebuilt from statement order -- confirm against version
    # control.
    data = loads(data)
    entry_list = data[u'entry']

    # Last path segment of the first two author links.
    user_id, url = [
        str(link['@href'].strip('/').rsplit('/', 1)[-1])
        for link in data[u'author'][u'link'][:2]
    ]

    if start_index == 1:
        # The feed title minus its last four characters is used as the name
        # (presumably a fixed suffix -- confirm).
        name = data[u'title'][u'$t'][:-4]
        DoubanUser.new(user_id, url, name)

    if not entry_list:
        fetched = DoubanFetched.get_or_create(id=user_id)
        fetched.save()
        return

    # Function-level import (presumably to dodge a circular import --
    # confirm); hoisted out of the per-entry loop since it is loop-invariant.
    from douban_parse import DOUBAN_REC_PARSE

    for entry in entry_list:
        title = (
            entry[u'content'][u'$t']
            .replace('\r', ' ').replace('\n', ' ').strip()
        )
        # for uid in user_id_by_txt(title):
        #     yield douban_recommendation_begin_tuple(uid)

        cid = str(entry[u'db:attribute'][0][u'$t'])
        # NOTE(review): only the translation is placed under this test; it is
        # possible the original nested the storing code below it as well.
        if cid in DOUBAN_REC_CID:
            cid = DOUBAN_REC_CID[cid]

        rec_id = entry[u'id'][u'$t'].rsplit('/', 1)[1]
        # Drop everything from the first '+' before converting.
        published = int_by_string(entry[u'published'][u'$t'].split('+', 1)[0])
        douban_rec_new(rec_id, user_id, cid, title, published)

        if cid in DOUBAN_REC_PARSE:
            parsed = DOUBAN_REC_PARSE[cid](title, user_id)
            if parsed is not None:
                for item in parsed:
                    yield item

    if start_index is not None:
        start = start_index + 10
        url = '%s&max-result=10&start-index=%s' % (URL_REC % user_id, start)
        yield douban_recommendation, url, start
def wm_save(id, like, name, author, link, create_time, txt):
    """Return the ``SpiderWm`` record for ``id``, creating it when missing.

    id          -- external identifier, stored as ``wmid``.
    like        -- like count; falsy values are stored as 0.
    name, author, link, txt -- stored unchanged.
    create_time -- textual timestamp.  If it contains the marker '前'
                   (Chinese "ago", i.e. a relative time) the current time is
                   stored instead; otherwise it is converted with
                   ``int_by_string``.

    An existing record is returned as-is and is not updated.
    """
    existing = SpiderWm.get(wmid=id)
    if existing:
        return existing

    if '前' in create_time:
        # Relative timestamp: fall back to "now".  The clock is read only in
        # this branch.
        # NOTE(review): time() is stored unconverted while the other branch
        # goes through int_by_string -- confirm both types are acceptable.
        create_time = time()
    else:
        create_time = int_by_string(create_time)

    wm = SpiderWm(
        wmid=id,
        like=int(like or 0),
        name=name,
        author=author,
        link=link,
        time=create_time,
        txt=txt
    )
    wm.save()
    return wm
def wm_save(id, like, name, author, link, create_time, txt):
    """Return the ``SpiderWm`` record for ``id``, creating it when missing.

    id          -- external identifier, stored as ``wmid``.
    like        -- like count; falsy values are stored as 0.
    name, author, link, txt -- stored unchanged.
    create_time -- textual timestamp.  If it contains the marker '前'
                   (Chinese "ago", i.e. a relative time) the current time is
                   stored instead; otherwise it is converted with
                   ``int_by_string``.

    An existing record is returned as-is and is not updated.
    """
    existing = SpiderWm.get(wmid=id)
    if existing:
        return existing

    if '前' in create_time:
        # Relative timestamp: fall back to "now".  The clock is read only in
        # this branch.
        # NOTE(review): time() is stored unconverted while the other branch
        # goes through int_by_string -- confirm both types are acceptable.
        create_time = time()
    else:
        create_time = int_by_string(create_time)

    wm = SpiderWm(
        wmid=id,
        like=int(like or 0),
        name=name,
        author=author,
        link=link,
        time=create_time,
        txt=txt
    )
    wm.save()
    return wm