def handle_urls(self, gm_worker, gm_job):
    self._urldata = dict()
    self._urls = gm_job.data
    self._num_processed = 0
    self._gm_job = gm_job
    logger.info(self._urls)
    # AsyncBatch fetches every URL in the job and calls parse_response for
    # each response; it must block until the whole batch finishes, or the
    # return below would hand back an incomplete dict.
    AsyncBatch(self._urls, self.parse_response)
    return self._urldata
def parse_response(self, url, response, response_utf8):
    if response.error:
        logger.info('error: %s' % response.error)
    if response_utf8 is not None:
        logger.info('received %s' % url)
        self._urldata[url] = list()
        # pyquery's default lxml parser chokes on some markup; fall back to
        # the more forgiving BeautifulSoup parser when it does.
        try:
            d = pq(response_utf8)
        except ValueError:
            logger.info('Parse of %s failed, trying soup parser...' % url)
            d = pq(response_utf8, parser='soup')
        for link in d('a'):
            href = link.get('href')
            if href is None:
                continue
            split_url = urlsplit(href)
            if split_url.hostname is None:
                # Skip relative links; only absolute URLs can point at
                # twitter.com.
                continue
            url_match = self._url_re.match(href)
            if url_match:
                logger.info('Twitter User Found: %s' % url_match.group(4))
                if url_match.group(4) not in self._urldata[url]:
                    self._urldata[url].append(url_match.group(4))
    else:
        # No body came back; record the failure so the client can skip it.
        logger.debug('Parse of %s failed' % url)
        self._urldata[url] = None
    self._num_processed += 1
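# Illustrative shape of the dict handle_urls returns for one job (made-up
# values, not from a real run): each requested URL maps to the list of
# screen names found on that page, or to None when the fetch failed.
#
#   {'http://example.com/': ['gearman', 'example_dev'],
#    'http://example.com/broken': None}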
def __init__(self, filename):
    self._gm_client = JSONGearmanClient([options.jobserver])
    try:
        urls = list()
        for line in open(filename, 'r'):
            urls.append(line.strip())
        # Split the URL list into fixed-size chunks; each chunk becomes one
        # Gearman job so the work spreads across available workers.
        url_chunks = [urls[i:i + options.chunk_size]
                      for i in range(0, len(urls), options.chunk_size)]
        jobs = list()
        for url_chunk in url_chunks:
            jobs.append({'task': 'tweet_scout', 'data': url_chunk})
        submitted_job_requests = self._gm_client.submit_multiple_jobs(
            jobs, background=False, wait_until_complete=False)
        job_count = len(submitted_job_requests)
        complete_count = 0
        p = ProgressBar(maxval=job_count).start()
        # Poll until every job reports complete, updating the progress bar.
        while complete_count != job_count:
            try:
                self._gm_client.wait_until_jobs_completed(
                    submitted_job_requests, poll_timeout=1)
                self._gm_client.get_job_statuses(
                    submitted_job_requests, poll_timeout=100)
            except Exception:
                # A poll timeout just means the jobs aren't done yet.
                pass
            complete_count = 0
            for job in submitted_job_requests:
                if job.complete is True:
                    complete_count += 1
            p.update(complete_count)
        # Tally screen-name frequencies across all successfully parsed pages.
        count = 0
        twitter_user_counts = defaultdict(int)
        for job in submitted_job_requests:
            for url, twitter_users in job.result.iteritems():
                logger.debug('%s: %s' % (url, twitter_users))
                if twitter_users is not None:
                    count += 1
                    for twitter_user in twitter_users:
                        twitter_user_counts[twitter_user] += 1
        print '\nFound %d Twitter users in %d successfully parsed pages:' % (
            len(twitter_user_counts), count)
        for user in sorted(twitter_user_counts,
                           key=twitter_user_counts.get, reverse=True):
            print '%s,%d' % (user, twitter_user_counts[user])
    except KeyboardInterrupt:
        logger.info('Exiting')
    except Exception, e:
        logger.exception('Exiting - %s' % e)
def __init__(self):
    self._urldata = dict()
    self._urls = list()
    self._gm_job = None
    self._gm_worker = JSONGearmanWorker([options.jobserver])
    self._gm_worker.register_task('tweet_scout', self.handle_urls)
    # Matches twitter.com profile URLs (optionally behind the old #! style)
    # while rejecting share/home/intent links; group 4 captures the screen
    # name.
    self._url_re = re.compile(
        r'^http(s)?://(www\.)?twitter\.com/'
        r'(?!share)(?!home)(?!intent)(#!/)?'
        r'([a-zA-Z0-9_]{1,15}[^/])(/\w+)*$')
    try:
        logger.info('TweetScout initialized and ready for work')
        self._gm_worker.work()
    except KeyboardInterrupt:
        logger.info('Exiting')
    except Exception, e:
        logger.error('Exiting - %s' % e)
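# A standalone sanity check of the profile-URL pattern above; the sample
# hrefs are invented and the helper name is ours, not part of the worker.
def _demo_url_re():
    import re
    demo_re = re.compile(
        r'^http(s)?://(www\.)?twitter\.com/'
        r'(?!share)(?!home)(?!intent)(#!/)?'
        r'([a-zA-Z0-9_]{1,15}[^/])(/\w+)*$')
    for href in ['https://twitter.com/gearman',
                 'https://twitter.com/share',
                 'http://example.com/about']:
        m = demo_re.match(href)
        # Prints 'gearman' for the first href, then 'no match' twice.
        print '%s -> %s' % (href, m.group(4) if m else 'no match')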
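# A minimal sketch of how the two halves might be wired together. The class
# names (TweetScoutWorker, TweetScoutClient) and the options fields beyond
# jobserver/chunk_size are assumptions, not confirmed by this file.
if __name__ == '__main__':
    if options.worker:  # hypothetical flag selecting worker vs. client mode
        TweetScoutWorker()  # registers 'tweet_scout' and blocks in work()
    else:
        TweetScoutClient(options.filename)  # hypothetical filename option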