def handle(self, *args, **options):
    """Scan all items, detect video pages, and populate video metadata.

    For every Item whose URL is recognised by is_video(), the previously
    crawled HTML page (cached under MEDIA_ROOT) is parsed and the embedded
    video thumbnail/URL extracted; matching items are flagged as videos
    (type=1) and saved.  Progress counters are written to self.stdout.
    """
    self.stdout.write("Start updating videos\n")
    items = Item.objects.all()
    video_count = 0
    page_not_exist_count = 0
    for item in items:
        video_site = is_video(item.url)
        if not video_site:
            continue
        # The crawled page was saved to disk earlier; skip items whose
        # cached file is missing rather than aborting the whole run.
        # (Was a bare `except:` that hid every other error too.)
        try:
            webpage = open(settings.MEDIA_ROOT + item.file, "r")
        except (IOError, OSError):
            page_not_exist_count += 1
            continue
        try:
            # BeautifulSoup 3 API: fromEncoding forces the parse encoding.
            soup = BeautifulSoup(webpage.read(), fromEncoding="utf-8")
        finally:
            webpage.close()  # the original leaked this file handle
        flag, thumb, video_url = extract_media(item.url, soup, video_site)
        if flag:
            video_count += 1
            item.type = 1  # mark item as a video
            item.thumb = thumb
            item.video_url = video_url
            item.save()
    # Fixed malformed messages (missing spaces, "wegpage" typo).
    self.stdout.write("Finished updating " + str(video_count) + " videos")
    self.stdout.write(str(page_not_exist_count) + " webpage files not exist")
def crawl_video(self):
    """Crawl recent Sina Weibo friend statuses for video links and import them.

    Reads OAuth credentials and the last-seen status id from KeyValue,
    walks up to three pages of the (video-filtered) friends timeline,
    resolves short t.cn links via a HEAD request, and adds every
    recognised video URL as a new Item, tagging it with the channels
    configured in Video_Source for the posting account.
    """
    try:
        sina_key = KeyValue.objects.get(key='CRAWL_SINA_KEY').value
        sina_secret = KeyValue.objects.get(key='CRAWL_SINA_SECRET').value
        since_id = KeyValue.objects.get(key='CRAWL_SINA_SINCE_ID').value
    except Exception:  # narrowed from a bare except; missing keys abort the crawl
        print('Error in getting necessary key-value\n')
        return

    # Map each Sina account id -> list of channel ids it feeds.
    # BUG FIX: the original did `vs_dict[sa] = vs_dict[sa].append(...)`;
    # list.append() returns None, so accounts with more than one channel
    # had their list clobbered to None.
    vs_dict = {}
    for vs in Video_Source.objects.all():
        vs_dict.setdefault(vs.sina_account, []).append(vs.channel_id)

    auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
    auth.setToken(sina_key, sina_secret)
    api = API(auth)

    count_per_page = 200
    page = 1
    # feature: 0=all 1=original 2=picture 3=video 4=music
    if since_id:
        timeline = api.friends_timeline(count=count_per_page, since_id=since_id,
                                        page=page, feature=3)
    else:
        timeline = api.friends_timeline(count=count_per_page, page=page, feature=3)

    is_first_obj = True
    while timeline:  # an empty timeline list ends the crawl
        for obj in timeline:
            try:
                status_id = self.getAtt(obj, 'id')
                # Persist the newest status id so the next run resumes here.
                if is_first_obj:
                    since_id_obj = KeyValue.objects.get(key='CRAWL_SINA_SINCE_ID')
                    since_id_obj.value = status_id
                    since_id_obj.save()
                    is_first_obj = False
                text = self.getAtt(obj, 'text')
                friend = self.getAtt(obj, 'user')
                friend_id = self.getAtt(friend, 'id')  # 64-bit int
                snippet = ''
                retweet = self.getAtt(obj, 'retweeted_status')
                if retweet:
                    # Use the retweeted status text; the wrapper text
                    # becomes the snippet (until comments overwrite it below).
                    snippet = text
                    text = self.getAtt(retweet, 'text')
                date = self.getAtt(obj, "created_at")
                # Join the status comments into the snippet.
                # NOTE(review): this unconditionally overwrites the retweet
                # snippet set above — confirm that is intended.
                comments_list = []
                for comment in api.comments(id=status_id):
                    comments_list.append(self.getAtt(comment, 'text'))
                snippet = ' '.join(comments_list)
                urls = re.findall('http://t.cn/[a-zA-Z0-9]{5,8}', text)
                # Strip everything from the first URL onward and replace the
                # Chinese title brackets before using text as the item title.
                url_index = text.find('http://')
                text = text[:url_index]
                text = text.replace(u'【', ' ')
                text = text.replace(u'】', ' ')
                for url in urls:
                    # Typically there is only one short URL per status.
                    parsed = urlparse.urlparse(url)
                    h = httplib.HTTPConnection(parsed.netloc)
                    h.request('HEAD', parsed.path)
                    response = h.getresponse()
                    # `//` keeps integer semantics (same as py2 `/` on ints).
                    if response.status // 100 == 3 and response.getheader('Location'):
                        # Follow a single redirect hop to the real video page.
                        url = response.getheader('Location')
                    if not is_video(url):
                        continue
                    msg_or_id = add_item(url, pre_name=text)
                    # add_item returns the new pk on success, a message otherwise.
                    if isinstance(msg_or_id, (int, long)):
                        item = Item.objects.get(pk=msg_or_id)
                        if date:
                            item.create_date = date
                        item.snippet = snippet
                        if friend_id in vs_dict:
                            item.channels = ','.join(str(x) for x in vs_dict[friend_id])
                        item.save()
            except Exception:
                # Tolerate a failure in one weibo; continue with the next.
                continue
        page += 1
        print('page:' + str(page))
        if page == 4:  # hard cap: crawl at most three pages per run
            break
        # NOTE(review): unlike the initial fetch, feature=3 is not passed
        # here, so subsequent pages are unfiltered — confirm intent.
        if since_id:
            timeline = api.friends_timeline(count=count_per_page,
                                            since_id=since_id, page=page)
        else:
            timeline = api.friends_timeline(count=count_per_page, page=page)