コード例 #1
0
ファイル: temp_update_video.py プロジェクト: alexdiao/3805
 def handle(self, *args, **options):
     """Scan all Items, flag those hosted on video sites, and save media info.

     For every Item whose URL matches a known video site, the locally
     cached page file under MEDIA_ROOT is parsed; when media extraction
     succeeds the item is marked as a video (type=1) and its thumbnail
     and video URL are stored. A summary is written to stdout.
     """
     self.stdout.write("Start updating videos\n")
     items = Item.objects.all()
     # items = Item.objects.filter(pk=24341)
     video_count = 0
     page_not_exist_count = 0
     for item in items:
         video_site = is_video(item.url)
         if not video_site:
             continue
         # A missing cache file just means this page was never fetched:
         # count it and move on. Only catch IOError -- the old bare
         # `except:` hid every other failure as well.
         try:
             webpage = open(settings.MEDIA_ROOT + item.file, "r")
         except IOError:
             page_not_exist_count += 1
             continue
         try:
             soup = BeautifulSoup(webpage.read(), fromEncoding="utf-8")
         finally:
             # The file handle was previously leaked on every iteration.
             webpage.close()
         flag, thumb, video_url = extract_media(item.url, soup, video_site)
         if flag:
             video_count += 1
             item.type = 1  # 1 marks the item as a video
             item.thumb = thumb
             item.video_url = video_url
             item.save()
     # Fixed missing spaces and the "wegpage" typo in the summary output.
     self.stdout.write("Finished updating " + str(video_count) + " videos\n")
     self.stdout.write(str(page_not_exist_count) + " webpage files not exist\n")
コード例 #2
0
ファイル: crawl_video.py プロジェクト: alexdiao/3805
 def crawl_video(self):
     """Crawl the Sina Weibo friends timeline for video posts and import them.

     Reads OAuth credentials and the last-seen status id from the KeyValue
     store, walks up to 3 pages of the timeline (filtered to video posts),
     resolves t.cn short links, and creates Items for recognized video
     URLs, tagging them with channels from Video_Source.
     """
     # Narrowed from a bare `except:` so only missing/invalid KeyValue
     # rows are treated as "not configured".
     try:
         sina_key = KeyValue.objects.get(key='CRAWL_SINA_KEY').value
         sina_secret = KeyValue.objects.get(key='CRAWL_SINA_SECRET').value
         since_id = KeyValue.objects.get(key='CRAWL_SINA_SINCE_ID').value
     except Exception:
         print('Error in getting necessary key-value\n')
         return

     # Build sina_account -> [channel_id, ...] from the Video_Source table.
     vss = Video_Source.objects.all()
     vs_dict = {}
     for vs in vss:
         sa = vs.sina_account
         if sa in vs_dict:
             # BUG FIX: was `vs_dict[sa] = vs_dict[sa].append(...)`.
             # list.append returns None, so a second channel for the same
             # account clobbered the entry with None and broke the
             # ','.join below (the error was then swallowed silently).
             vs_dict[sa].append(vs.channel_id)
         else:
             vs_dict[sa] = [vs.channel_id]

     auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
     auth.setToken(sina_key, sina_secret)
     api = API(auth)
     count_per_page = 200
     page = 1
     # Feature: 0:All 1,Original 2:Picture 3:Video 4:Music
     if since_id:
         timeline = api.friends_timeline(count=count_per_page,  since_id=since_id, page=page, feature=3)
     else:
         timeline = api.friends_timeline(count=count_per_page, page=page, feature=3)
     is_first_obj = True
     while timeline: #while the list user_timeline is not []
         for obj in timeline:
             try:
                 status_id = self.getAtt(obj, 'id')
                 # Persist the newest status id so the next crawl can
                 # resume from it (timeline is newest-first).
                 if is_first_obj:
                     since_id_obj = KeyValue.objects.get(key='CRAWL_SINA_SINCE_ID')
                     since_id_obj.value = status_id
                     since_id_obj.save()
                     is_first_obj = False
                 text = self.getAtt(obj, 'text')
                 friend = self.getAtt(obj, 'user')
                 friend_id = self.getAtt(friend, 'id') #64 int
                 snippet = ''
                 retweet = self.getAtt(obj, 'retweeted_status')
                 if retweet: #Discard the original text, keep the retweeted one
                     snippet = text
                     text = self.getAtt(retweet, 'text')
                 date = self.getAtt(obj, "created_at")
                 comments = api.comments(id=status_id)
                 comments_list = []
                 for comment in comments:
                     comments_list.append(self.getAtt(comment, 'text'))
                 # NOTE(review): this unconditionally overwrites the retweet
                 # text stored in `snippet` above -- confirm whether the
                 # retweet text should be appended instead of discarded.
                 snippet = ' '.join(comments_list)
                 urls = re.findall('http://t.cn/[a-zA-Z0-9]{5,8}', text)

                 # Strip the short-link and decorative brackets before using
                 # the text as the item title.
                 url_index = text.find('http://')
                 text = text[:url_index]
                 text = text.replace(u'【', ' ')
                 text = text.replace(u'】', ' ')

                 for url in urls: #Typically there should only be one
                     # Resolve one level of t.cn redirect via a HEAD request.
                     parsed = urlparse.urlparse(url)
                     h = httplib.HTTPConnection(parsed.netloc)
                     h.request('HEAD', parsed.path)
                     response = h.getresponse()
                     if response.status / 100 == 3 and response.getheader('Location'): #continue checking redirect
                         url = response.getheader('Location')
                     if not is_video(url):
                         continue
                     # add_item returns either an error message or the new pk.
                     msg_or_id = add_item(url, pre_name=text)
                     if isinstance(msg_or_id, int) or isinstance(msg_or_id, long):
                         item = Item.objects.get(pk=msg_or_id)
                         if date:
                             item.create_date = date
                         item.snippet = snippet
                         if friend_id in vs_dict:
                             item.channels = ','.join([str(x) for x in vs_dict[friend_id]])
                         item.save()
             except Exception, e: #Tolerate error in importing one weibo
                 continue
         page += 1
         print 'page:' + str(page)
         if page == 4:  # hard stop after 3 pages per run
             break
         if since_id:
             timeline = api.friends_timeline(count=count_per_page,  since_id=since_id, page=page)
         else:
             timeline = api.friends_timeline(count=count_per_page, page=page)