Example #1
    def testYoutube(self):
        """Check scraper.yt_scrape on invalid and valid channel urls.

        Invalid inputs must yield zeroed counters; real channel urls
        (two different url formats) must report positive counters.
        """
        # Invalid inputs: None, empty string, and a junk value.
        for bad_url in (None, '', '0'):
            data = scraper.yt_scrape(bad_url)
            self.assertEqual(0, data['view_count'])
            self.assertEqual(0, data['subscriber_count'])

        # Valid channel urls: /user/<id> form and the short /<name> form.
        for good_url in ('http://www.youtube.com/user/Gereports',
                         'https://www.youtube.com/citi'):
            data = scraper.yt_scrape(good_url)
            self.assertTrue(data['view_count'] > 0)
            self.assertTrue(data['subscriber_count'] > 0)
Example #2
    def testYoutube(self):
        """Check scraper.yt_scrape on invalid and valid channel urls.

        Invalid inputs (None, empty, junk) must return zeroed counters;
        real channel urls must report positive view/subscriber counts.
        NOTE(review): the two positive cases appear to hit the live
        youtube site — confirm network access is acceptable in CI.
        """
        # Test None url
        url = None
        data = scraper.yt_scrape(url)
        self.assertEqual(0, data['view_count'])
        self.assertEqual(0, data['subscriber_count'])

        # Test empty url
        url = ''
        data = scraper.yt_scrape(url)
        self.assertEqual(0, data['view_count'])
        self.assertEqual(0, data['subscriber_count'])

        # Test 0 url
        url = '0'
        data = scraper.yt_scrape(url)
        self.assertEqual(0, data['view_count'])
        self.assertEqual(0, data['subscriber_count'])

        # Test youtube id
        url = 'http://www.youtube.com/user/Gereports'
        data = scraper.yt_scrape(url)
        self.assertTrue(data['view_count'] > 0)
        self.assertTrue(data['subscriber_count'] > 0)

        # Test another format url
        url = 'https://www.youtube.com/citi'
        data = scraper.yt_scrape(url)
        self.assertTrue(data['view_count'] > 0)
        self.assertTrue(data['subscriber_count'] > 0)
Example #3
    def get_social_media(self, company_list, db_filename):
        """
            Call scraper to get company social media data

            @type company_list: list
            @param company_list: the CompanyURL object list

            @type db_filename: string
            @param db_filename: db file used for historical fallback values

            @rtype: list
            @return: the CompanySocialMedia object list
        """

        # Define a progress bar on console
        limit = len(company_list)
        prog = ProgressBar(0, limit, 70, mode='fixed')
        oldprog = str(prog)
        i = 0

        # Map twitter id -> CompanySocialMedia so the twitter api can be
        # called in batch mode instead of once per company.
        company_dict = {}
        for company in company_list:
            company_sm_data = CompanySocialMedia(company.company_name)
            if scraper.check_url(company.tw_url, 'twitter.com'):
                twitter_id = scraper.get_twitter_id(company.tw_url)
                if twitter_id:
                    company_dict[twitter_id] = company_sm_data

        # Look the ids up in batches of at most 100 per api call.
        twitter_user_list = []
        batch = []
        for twitter_id in company_dict:
            batch.append(twitter_id)
            if len(batch) == 100:
                twitter_user_list.extend(
                    simple_twitter_api.UsersLookup(','.join(batch)))
                batch = []
        # Fix: the original sent the leftover ids with a leading ',' (the
        # strip only ran on full batches) and issued an api call even when
        # no ids remained at all.
        if batch:
            twitter_user_list.extend(
                simple_twitter_api.UsersLookup(','.join(batch)))

        twitter_user_dict = simple_twitter_api.build_dict(twitter_user_list)

        result = []
        current_datetime = datetime.now()
        for company in company_list:
            company_sm_data = CompanySocialMedia(company.company_name)
            fb_data = scraper.scrap_facebook_raw_data(company.fb_url)
            # If fb data can not be scraped from html, fall back to the
            # graph api plus historical maxima stored in the db.
            if fb_data['likes'] == 0 and fb_data['talking_about_count'] == 0 and fb_data['checkins'] == 0:
                fb_data = scraper.fb_scrape(company.fb_url)
                # Get max checkins from previous records
                fb_data['checkins'] = getMaxCheckins(company.company_name, db_filename)
                if fb_data['likes'] == 0:
                    fb_data['likes'] = getMaxLikes(company.company_name, db_filename)
                if fb_data['talking_about_count'] == 0:
                    fb_data['talking_about_count'] = getMaxTalkingAboutCount(company.company_name, db_filename)

            fb_data = handleFBData(fb_data)

            # Default twitter data used when there is no usable twitter url
            # or the batch lookup did not return this id.
            data = {'twitter_id': '', 'followers_count': 0, 'tweets': 0}
            if scraper.check_url(company.tw_url, 'twitter.com'):
                tw_id = scraper.get_twitter_id(company.tw_url)
                # Fix: guard against a falsy id before calling .lower() on
                # it (the batching loop above applies the same guard).
                if tw_id:
                    data['twitter_id'] = tw_id
                    tw_data = twitter_user_dict.get(tw_id.lower(), data)
                else:
                    tw_data = data
            else:
                tw_data = data

            yt_data = scraper.yt_scrape(company.yt_url)
            company_sm_data.fb_likes = fb_data['likes']
            company_sm_data.fb_talking_about_count = fb_data['talking_about_count']
            company_sm_data.fb_checkins = fb_data['checkins']
            company_sm_data.tw_followers_count = tw_data['followers_count']
            company_sm_data.tw_tweets = tw_data['tweets']
            company_sm_data.yt_subscriber_count = yt_data['subscriber_count']
            company_sm_data.yt_view_count = yt_data['view_count']

            # Per-network health metrics plus the combined macro metric.
            fb_metrics = calculator.cal_fb_hm(company_sm_data.fb_likes, company_sm_data.fb_talking_about_count, company_sm_data.fb_checkins)
            tw_metrics = calculator.cal_tw_hm(tw_data['twitter_id'], company_sm_data.tw_followers_count, company_sm_data.tw_tweets)
            yt_metrics = calculator.cal_yt_hm(company_sm_data.yt_subscriber_count, company_sm_data.yt_view_count)
            micro_metrics = calculator.cal_macro_metrics(fb_metrics['fb_health'], tw_metrics['tw_health'], yt_metrics['yt_health'])
            company_sm_data.fb_metrics = fb_metrics
            company_sm_data.tw_metrics = tw_metrics
            company_sm_data.yt_metrics = yt_metrics
            company_sm_data.micro_metrics = micro_metrics

            # Keep same time_taken for this batch operation
            company_sm_data.time_taken = current_datetime

            result.append(company_sm_data)

            # Print a progress bar on console (overwrite the current line).
            i += 1
            prog.update_amount(i)
            if oldprog != str(prog):
                sys.stdout.write(str(prog) + ' \r')
                sys.stdout.flush()
                oldprog = str(prog)

        return result