Example #1
    def handle_urls(self, gm_worker, gm_job):
        # Gearman task handler: gm_job.data carries the list of URLs to scan.
        self._urldata = dict()
        self._urls = gm_job.data
        self._num_processed = 0
        self._gm_job = gm_job
        logger.info(self._urls)
        # Fetch every URL; parse_response is invoked per response and fills self._urldata.
        AsyncBatch(self._urls, self.parse_response)
        return self._urldata
Example #2
    def parse_response(self, url, response, response_utf8):
        # Callback for each fetched URL: extract Twitter profile links from the page.
        if response.error:
            logger.info('error: %s' % response.error)

        if response_utf8 is not None:
            logger.info('received %s' % url)
            self._urldata[url] = list()

            # Parse with pyquery's default (lxml) parser, falling back to the
            # more forgiving soup parser on malformed markup.
            try:
                d = pq(response_utf8)
            except ValueError:
                logger.info('Parse of %s failed, trying soup parser...' % url)
                d = pq(response_utf8, parser='soup')

            # Collect unique Twitter usernames from absolute links on the page.
            for link in d('a'):
                href = link.get('href')
                if href is None:
                    continue
                split_url = urlsplit(href)
                if split_url.hostname is None:
                    continue
                url_match = self._url_re.match(href)
                if url_match:
                    logger.info('Twitter User Found: %s' % url_match.group(4))
                    if url_match.group(4) not in self._urldata[url]:
                        self._urldata[url].append(url_match.group(4))

        else:
            # No response body was received for this URL.
            logger.debug('Parse of %s failed' % url)
            self._urldata[url] = None

        self._num_processed += 1
Example #3
    def __init__(self, filename):

        self._gm_client = JSONGearmanClient([options.jobserver])

        try:
            # Read one URL per line from the input file.
            urls = list()
            with open(filename, 'r') as f:
                for line in f:
                    urls.append(line.strip())

            # Split the URL list into chunks, one Gearman job per chunk.
            url_chunks = [urls[i:i + options.chunk_size]
                          for i in range(0, len(urls), options.chunk_size)]

            jobs = list()
            for url_chunk in url_chunks:
                jobs.append({'task': 'tweet_scout', 'data': url_chunk})

            submitted_job_requests = self._gm_client.submit_multiple_jobs(
                jobs, background=False, wait_until_complete=False)
            job_count = len(submitted_job_requests)
            complete_count = 0

            # Poll until every submitted job reports completion, updating a progress bar.
            p = ProgressBar(maxval=job_count).start()
            while complete_count != job_count:
                try:
                    self._gm_client.wait_until_jobs_completed(
                        submitted_job_requests, poll_timeout=1)
                    self._gm_client.get_job_statuses(
                        submitted_job_requests, poll_timeout=100)
                except Exception:
                    pass
                complete_count = 0
                for job in submitted_job_requests:
                    if job.complete is True:
                        complete_count += 1
                p.update(complete_count)

            # Aggregate per-URL results: count how often each Twitter user was found.
            count = 0
            twitter_user_counts = defaultdict(int)
            for job in submitted_job_requests:
                for url, twitter_users in job.result.iteritems():
                    logger.debug('%s: %s' % (url, twitter_users))
                    if twitter_users is not None:
                        count += 1
                        for twitter_user in twitter_users:
                            twitter_user_counts[twitter_user] += 1

            print '\nFound %d Twitter users in %d successfully parsed pages:' % (
                len(twitter_user_counts), count)
            for user in sorted(twitter_user_counts,
                               key=twitter_user_counts.get,
                               reverse=True):
                print '%s,%d' % (user, twitter_user_counts[user])

        except KeyboardInterrupt:
            logger.info('Exiting')
        except Exception as e:
            logger.exception('Exiting - %s' % e)
Example #4
    def __init__(self):

        self._urldata = dict()
        self._urls = list()
        self._gm_job = None

        # Register handle_urls as the handler for the 'tweet_scout' Gearman task.
        self._gm_worker = JSONGearmanWorker([options.jobserver])
        self._gm_worker.register_task('tweet_scout', self.handle_urls)
        # Matches Twitter profile URLs and captures the username in group 4,
        # excluding non-profile paths such as /share, /home and /intent.
        self._url_re = re.compile(
            r'^http(s)?://(www\.)?twitter\.com/(?!share)(?!home)(?!intent)(#!/)?([a-zA-Z0-9_]{1,15}[^/])(/\w+)*$'
        )

        try:
            logger.info('TweetScout initialized and ready for work')
            self._gm_worker.work()
        except KeyboardInterrupt:
            logger.info('Exiting')
        except Exception as e:
            logger.error('Exiting - %s' % e)