def testYoutube(self):
    # Test None url
    url = None
    data = scraper.yt_scrape(url)
    self.assertEqual(0, data['view_count'])
    self.assertEqual(0, data['subscriber_count'])
    # Test empty url
    url = ''
    data = scraper.yt_scrape(url)
    self.assertEqual(0, data['view_count'])
    self.assertEqual(0, data['subscriber_count'])
    # Test '0' url
    url = '0'
    data = scraper.yt_scrape(url)
    self.assertEqual(0, data['view_count'])
    self.assertEqual(0, data['subscriber_count'])
    # Test youtube user id url
    url = 'http://www.youtube.com/user/Gereports'
    data = scraper.yt_scrape(url)
    self.assertTrue(data['view_count'] > 0)
    self.assertTrue(data['subscriber_count'] > 0)
    # Test another url format
    url = 'https://www.youtube.com/citi'
    data = scraper.yt_scrape(url)
    self.assertTrue(data['view_count'] > 0)
    self.assertTrue(data['subscriber_count'] > 0)
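A minimal sketch (not the project's implementation) of the contract these assertions assume: yt_scrape returns a dict that always carries integer 'view_count' and 'subscriber_count' keys, defaulting both to 0 when the url is missing or not a usable YouTube link. The stub below is a hypothetical stand-in for illustration only:

def yt_scrape(url):
    # Always return both keys so callers never need to guard lookups
    data = {'view_count': 0, 'subscriber_count': 0}
    if not url or 'youtube.com' not in url:
        # None, '', '0', and other non-YouTube strings fall through to zeros
        return data
    # A real implementation would fetch the channel page here and parse
    # the view and subscriber counts out of the html before returning
    return data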
def get_social_media(self, company_list, db_filename):
    """
    Call scraper to get company social media data

    @type company_list: list
    @param company_list: the CompanyURL object list
    @type db_filename: string
    @param db_filename: the database file holding previous records
    @rtype: list
    @return: the CompanySocialMedia object list
    """
    # Define a progress bar on console
    limit = len(company_list)
    prog = ProgressBar(0, limit, 70, mode='fixed')
    oldprog = str(prog)
    i = 0

    # Collect twitter ids so the twitter api can be called in batch mode
    company_dict = {}
    for company in company_list:
        company_sm_data = CompanySocialMedia(company.company_name)
        if scraper.check_url(company.tw_url, 'twitter.com'):
            twitter_id = scraper.get_twitter_id(company.tw_url)
            if twitter_id:
                company_dict[twitter_id] = company_sm_data

    # Look up users in batches of 100, the most UsersLookup accepts per call
    count = 0
    twitter_ids = ''
    twitter_user_list = []
    for k, v in company_dict.iteritems():
        twitter_ids = twitter_ids + ',' + k
        count += 1
        if count == 100:
            # Remove the first ','
            twitter_ids = twitter_ids[1:]
            # Call api
            twitter_user_list.extend(simple_twitter_api.UsersLookup(twitter_ids))
            twitter_ids = ''
            count = 0
    # Flush the final partial batch, if any
    if twitter_ids:
        # Remove the first ','
        twitter_ids = twitter_ids[1:]
        twitter_user_list.extend(simple_twitter_api.UsersLookup(twitter_ids))
    twitter_user_dict = simple_twitter_api.build_dict(twitter_user_list)

    result = []
    current_datetime = datetime.now()
    for company in company_list:
        company_sm_data = CompanySocialMedia(company.company_name)
        fb_data = scraper.scrap_facebook_raw_data(company.fb_url)
        # If facebook data cannot be scraped from html, fall back to the graph api
        if fb_data['likes'] == 0 and fb_data['talking_about_count'] == 0 and fb_data['checkins'] == 0:
            fb_data = scraper.fb_scrape(company.fb_url)
        # Get max checkins from previous records
        fb_data['checkins'] = getMaxCheckins(company.company_name, db_filename)
        if fb_data['likes'] == 0:
            fb_data['likes'] = getMaxLikes(company.company_name, db_filename)
        if fb_data['talking_about_count'] == 0:
            fb_data['talking_about_count'] = getMaxTalkingAboutCount(company.company_name, db_filename)
        fb_data = handleFBData(fb_data)

        #tw_data = scraper.tw_scrape(company.tw_url)
        data = {'twitter_id': '', 'followers_count': 0, 'tweets': 0}
        if scraper.check_url(company.tw_url, 'twitter.com'):
            tw_id = scraper.get_twitter_id(company.tw_url)
            data['twitter_id'] = tw_id
            tw_data = twitter_user_dict.get(tw_id.lower(), data)
        else:
            tw_data = data

        yt_data = scraper.yt_scrape(company.yt_url)

        company_sm_data.fb_likes = fb_data['likes']
        company_sm_data.fb_talking_about_count = fb_data['talking_about_count']
        company_sm_data.fb_checkins = fb_data['checkins']
        company_sm_data.tw_followers_count = tw_data['followers_count']
        company_sm_data.tw_tweets = tw_data['tweets']
        company_sm_data.yt_subscriber_count = yt_data['subscriber_count']
        company_sm_data.yt_view_count = yt_data['view_count']
        #log.debug('%d, %d, %d' % (company_sm_data.fb_likes, company_sm_data.fb_talking_about_count, company_sm_data.fb_checkins))

        # Compute per-network health metrics, then the combined metrics
        fb_metrics = calculator.cal_fb_hm(company_sm_data.fb_likes,
                                          company_sm_data.fb_talking_about_count,
                                          company_sm_data.fb_checkins)
        tw_metrics = calculator.cal_tw_hm(tw_data['twitter_id'],
                                          company_sm_data.tw_followers_count,
                                          company_sm_data.tw_tweets)
        yt_metrics = calculator.cal_yt_hm(company_sm_data.yt_subscriber_count,
                                          company_sm_data.yt_view_count)
        micro_metrics = calculator.cal_macro_metrics(fb_metrics['fb_health'],
                                                     tw_metrics['tw_health'],
                                                     yt_metrics['yt_health'])
        company_sm_data.fb_metrics = fb_metrics
        company_sm_data.tw_metrics = tw_metrics
        company_sm_data.yt_metrics = yt_metrics
        company_sm_data.micro_metrics = micro_metrics
        # Keep same time_taken for this batch operation
        company_sm_data.time_taken = current_datetime
        result.append(company_sm_data)

        # Print a progress bar on console
        i += 1
        prog.update_amount(i)
        if oldprog != str(prog):
            print str(prog), '\r',
            sys.stdout.flush()
            oldprog = str(prog)
    return result
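A hedged usage sketch of get_social_media. The code above only shows that CompanyURL exposes company_name, fb_url, tw_url, and yt_url attributes; the keyword-style constructor, the collector instance, and the database filename below are illustrative assumptions, not taken from the source:

companies = [
    CompanyURL(company_name='GE',
               fb_url='https://www.facebook.com/GE',
               tw_url='https://twitter.com/generalelectric',
               yt_url='http://www.youtube.com/user/Gereports'),
]
sm_list = collector.get_social_media(companies, 'social_media.db')
for sm in sm_list:
    # Each CompanySocialMedia object carries the scraped counts and metrics
    print sm.company_name, sm.fb_likes, sm.tw_followers_count, sm.yt_view_count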