def parse_response(self, url, response, response_utf8):
    """Scan one fetched page for Twitter profile links.

    Records the list of matched Twitter usernames (capture group 4 of
    self._url_re) under self._urldata[url], or None when the fetch
    produced no body, and bumps self._num_processed either way.
    """
    if response.error:
        logger.info('error: %s' % response.error)
    if response_utf8 is None:
        # Nothing to parse for this URL; mark it as failed.
        logger.debug('Parse of %s failed' % url)
        self._urldata[url] = None
    else:
        logger.info('received %s' % url)
        found = list()
        self._urldata[url] = found
        try:
            doc = pq(response_utf8)
        except ValueError:
            # lxml choked on the markup; retry with the lenient soup parser.
            logger.info('Parse of %s failed, trying soup parser...' % url)
            doc = pq(response_utf8, parser='soup')
        for anchor in doc('a'):
            href = anchor.get('href')
            if href is None:
                continue
            # Only absolute links (with a hostname) are candidates.
            if urlsplit(href).hostname is None:
                continue
            match = self._url_re.match(href)
            if not match:
                continue
            logger.info('Twitter User Found: %s' % match.group(4))
            if match.group(4) not in found:
                found.append(match.group(4))
    self._num_processed += 1
def __init__(self, filename): self._gm_client = JSONGearmanClient([options.jobserver]) try: urls = list() for line in open(filename, 'r'): urls.append(line.strip()) url_chunks = [urls[i : i + options.chunk_size] for i in range(0, len(urls), options.chunk_size)] jobs = list() for url_chunk in url_chunks: jobs.append({'task': 'tweet_scout', 'data': url_chunk}); submitted_job_requests = self._gm_client.submit_multiple_jobs(jobs, background=False, wait_until_complete=False) job_count = len(submitted_job_requests) complete_count = 0 p = ProgressBar(maxval=job_count).start() while complete_count != job_count: try: self._gm_client.wait_until_jobs_completed(submitted_job_requests, poll_timeout=1) self._gm_client.get_job_statuses(submitted_job_requests, poll_timeout=100) except: pass complete_count = 0 for job in submitted_job_requests: if job.complete is True: complete_count += 1 p.update(complete_count) count = 0 twitter_user_counts = defaultdict(int) for job in submitted_job_requests: for url, twitter_users in job.result.iteritems(): logger.debug('%s: %s' % (url, twitter_users)) if twitter_users is not None: count += 1 for twitter_user in twitter_users: twitter_user_counts[twitter_user] += 1 print '\nFound %d Twitter users in %d successfully parsed pages:' % (len(twitter_user_counts), count) for user in sorted(twitter_user_counts, key=twitter_user_counts.get, reverse=True): print '%s,%d' % (user, twitter_user_counts[user]) except KeyboardInterrupt: logger.info('Exiting') pass except Exception, e: logger.exception('Exiting - %s' % e)
def __init__(self, filename): self._gm_client = JSONGearmanClient([options.jobserver]) try: urls = list() for line in open(filename, 'r'): urls.append(line.strip()) url_chunks = [ urls[i:i + options.chunk_size] for i in range(0, len(urls), options.chunk_size) ] jobs = list() for url_chunk in url_chunks: jobs.append({ 'task': 'tweet_scout', 'data': url_chunk }) submitted_job_requests = self._gm_client.submit_multiple_jobs( jobs, background=False, wait_until_complete=False) job_count = len(submitted_job_requests) complete_count = 0 p = ProgressBar(maxval=job_count).start() while complete_count != job_count: try: self._gm_client.wait_until_jobs_completed( submitted_job_requests, poll_timeout=1) self._gm_client.get_job_statuses(submitted_job_requests, poll_timeout=100) except: pass complete_count = 0 for job in submitted_job_requests: if job.complete is True: complete_count += 1 p.update(complete_count) count = 0 twitter_user_counts = defaultdict(int) for job in submitted_job_requests: for url, twitter_users in job.result.iteritems(): logger.debug('%s: %s' % (url, twitter_users)) if twitter_users is not None: count += 1 for twitter_user in twitter_users: twitter_user_counts[twitter_user] += 1 print '\nFound %d Twitter users in %d successfully parsed pages:' % ( len(twitter_user_counts), count) for user in sorted(twitter_user_counts, key=twitter_user_counts.get, reverse=True): print '%s,%d' % (user, twitter_user_counts[user]) except KeyboardInterrupt: logger.info('Exiting') pass except Exception, e: logger.exception('Exiting - %s' % e)