Example #1
    def parse_response(self, url, response, response_utf8):
        if response.error:
            logger.info('error: %s' % response.error)

        if response_utf8 is not None:
            logger.info('received %s' % url)
            self._urldata[url] = list()

            try:
                d = pq(response_utf8)
            except ValueError:
                logger.info('Parse of %s failed, trying soup parser...' % url)
                d = pq(response_utf8, parser='soup')

            for link in d('a'):
                href = link.get('href')
                if href is None:
                    continue
                # Relative links have no hostname and cannot be Twitter profile URLs.
                split_url = urlsplit(href)
                if split_url.hostname is None:
                    continue
                url_match = self._url_re.match(href)
                if url_match:
                    logger.info('Twitter User Found: %s' % url_match.group(4))
                    if url_match.group(4) not in self._urldata[url]:
                        self._urldata[url].append(url_match.group(4))

        else:
            logger.debug('Parse of %s failed' % url)
            self._urldata[url] = None

        self._num_processed += 1
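
The regular expression self._url_re is not shown in this example. As a rough sketch (an assumption, not the pattern the original class actually uses), something of this shape would make the url_match.group(4) accesses above return the username segment of a Twitter profile URL:

import re

# Hypothetical stand-in for self._url_re; group 4 captures the path segment
# that parse_response() treats as the Twitter username.
_url_re = re.compile(r'^(https?://)((www|mobile)\.)?twitter\.com/(\w+)/?$')

m = _url_re.match('https://twitter.com/jack')
if m:
    print(m.group(4))  # -> jack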
Example #2
    def parse_response(self, url, response, response_utf8):
        if response.error:
            logger.info('error: %s' % response.error)

        if response_utf8 is not None:
            logger.info('received %s' % url)
            self._urldata[url] = list()

            try:
                d = pq(response_utf8)
            except ValueError:
                logger.info('Parse of %s failed, trying soup parser...' % url)
                d = pq(response_utf8, parser='soup')

            for link in d('a'):
                href = link.get('href')
                if href is None:
                    continue
                split_url = urlsplit(href)
                if split_url.hostname is None:
                    continue
                url_match = self._url_re.match(href)
                if url_match:
                    logger.info('Twitter User Found: %s' % url_match.group(4))
                    if url_match.group(4) not in self._urldata[url]:
                        self._urldata[url].append(url_match.group(4))

        else:
            logger.debug('Parse of %s failed' % url)
            self._urldata[url] = None

        self._num_processed += 1
Example #3
    def __init__(self, filename):

        self._gm_client = JSONGearmanClient([options.jobserver])

        try:
            # Read one URL per line from the input file.
            with open(filename, 'r') as url_file:
                urls = [line.strip() for line in url_file]

            # Split the URL list into chunks of options.chunk_size, one Gearman job per chunk.
            url_chunks = [urls[i:i + options.chunk_size] for i in range(0, len(urls), options.chunk_size)]

            jobs = list()
            for url_chunk in url_chunks:
                jobs.append({'task': 'tweet_scout', 'data': url_chunk})

            submitted_job_requests = self._gm_client.submit_multiple_jobs(jobs, background=False, wait_until_complete=False)
            job_count = len(submitted_job_requests)
            complete_count = 0
            p = ProgressBar(maxval=job_count).start()
            while complete_count != job_count:
                try:
                    self._gm_client.wait_until_jobs_completed(submitted_job_requests, poll_timeout=1)
                    self._gm_client.get_job_statuses(submitted_job_requests, poll_timeout=100)
                except Exception:
                    # Polling may time out before every job finishes; just retry.
                    pass
                complete_count = 0
                for job in submitted_job_requests:
                    if job.complete is True:
                        complete_count += 1
                p.update(complete_count)

            count = 0
            twitter_user_counts = defaultdict(int)
            for job in submitted_job_requests:
                for url, twitter_users in job.result.iteritems():
                    logger.debug('%s: %s' % (url, twitter_users))
                    if twitter_users is not None:
                        count += 1
                        for twitter_user in twitter_users:
                            twitter_user_counts[twitter_user] += 1

            print '\nFound %d Twitter users in %d successfully parsed pages:' % (len(twitter_user_counts), count)
            for user in sorted(twitter_user_counts, key=twitter_user_counts.get, reverse=True):
                print '%s,%d' % (user, twitter_user_counts[user])

        except KeyboardInterrupt:
            logger.info('Exiting')
            pass
        except Exception, e:
            logger.exception('Exiting - %s' % e)
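
JSONGearmanClient is not defined in these examples. A plausible sketch, assuming it is simply python-gearman's stock client with a JSON data encoder attached (so the {'task': ..., 'data': ...} payloads above go over the wire as JSON and job.result comes back as plain Python objects), looks like this:

import json
import gearman

class JSONDataEncoder(gearman.DataEncoder):
    # Serialize job payloads and results as JSON.
    @classmethod
    def encode(cls, encodable_object):
        return json.dumps(encodable_object)

    @classmethod
    def decode(cls, decodable_string):
        return json.loads(decodable_string)

class JSONGearmanClient(gearman.GearmanClient):
    # submit_multiple_jobs() then accepts plain dicts/lists as job data.
    data_encoder = JSONDataEncoder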
Example #4
    def __init__(self, filename):

        self._gm_client = JSONGearmanClient([options.jobserver])

        try:
            # Read one URL per line from the input file.
            with open(filename, 'r') as url_file:
                urls = [line.strip() for line in url_file]

            url_chunks = [
                urls[i:i + options.chunk_size]
                for i in range(0, len(urls), options.chunk_size)
            ]

            jobs = list()
            for url_chunk in url_chunks:
                jobs.append({
                    'task': 'tweet_scout',
                    'data': url_chunk
                })

            submitted_job_requests = self._gm_client.submit_multiple_jobs(
                jobs, background=False, wait_until_complete=False)
            job_count = len(submitted_job_requests)
            complete_count = 0
            p = ProgressBar(maxval=job_count).start()
            while complete_count != job_count:
                try:
                    self._gm_client.wait_until_jobs_completed(
                        submitted_job_requests, poll_timeout=1)
                    self._gm_client.get_job_statuses(submitted_job_requests,
                                                     poll_timeout=100)
                except Exception:
                    # Polling may time out before every job finishes; just retry.
                    pass
                complete_count = 0
                for job in submitted_job_requests:
                    if job.complete is True:
                        complete_count += 1
                p.update(complete_count)

            count = 0
            twitter_user_counts = defaultdict(int)
            for job in submitted_job_requests:
                for url, twitter_users in job.result.iteritems():
                    logger.debug('%s: %s' % (url, twitter_users))
                    if twitter_users is not None:
                        count += 1
                        for twitter_user in twitter_users:
                            twitter_user_counts[twitter_user] += 1

            print '\nFound %d Twitter users in %d successfully parsed pages:' % (
                len(twitter_user_counts), count)
            for user in sorted(twitter_user_counts,
                               key=twitter_user_counts.get,
                               reverse=True):
                print '%s,%d' % (user, twitter_user_counts[user])

        except KeyboardInterrupt:
            logger.info('Exiting')
            pass
        except Exception, e:
            logger.exception('Exiting - %s' % e)
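
The worker that serves the 'tweet_scout' task is not shown. A minimal sketch, assuming the same JSON encoder as above and that each job returns a dict mapping every URL in the chunk to a list of Twitter usernames (or None on a failed parse), which is the shape __init__() iterates over in job.result:

import json
import gearman

class JSONDataEncoder(gearman.DataEncoder):
    @classmethod
    def encode(cls, encodable_object):
        return json.dumps(encodable_object)

    @classmethod
    def decode(cls, decodable_string):
        return json.loads(decodable_string)

class JSONGearmanWorker(gearman.GearmanWorker):
    data_encoder = JSONDataEncoder

def scan_page_for_twitter_users(url):
    # Placeholder for the real fetch-and-parse logic (see parse_response above).
    return []

def tweet_scout(worker, job):
    # job.data is one chunk of URLs; return url -> usernames (or None).
    return dict((url, scan_page_for_twitter_users(url)) for url in job.data)

worker = JSONGearmanWorker(['localhost:4730'])  # hypothetical job server address
worker.register_task('tweet_scout', tweet_scout)
worker.work()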