def __init__(self, shard_id=0, shard_count=1):
    self.shard_id = shard_id
    self.shard_count = shard_count
    self.author_to_crawl = []
    self.br = pbrowser.get_browser()
    self.logfile = open("log", "a")
    self.crawl_history = self.build_crawl_history()
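# A hedged sketch (not in the original): a shard_id/shard_count pair like the
# one above usually partitions work by a stable hash, so that shard k of n
# claims the authors where hash(author) % n == k. The helper name
# `belongs_to_shard` and the use of zlib.crc32 are assumptions for
# illustration only.
import zlib

def belongs_to_shard(author, shard_id, shard_count):
    # crc32 is stable across runs and machines, unlike Python's built-in hash()
    return zlib.crc32(author.encode('utf-8')) % shard_count == shard_id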
def wget(url):
    br = pbrowser.get_browser()
    failure_count = 0
    max_try = int(_config['wget.retry'])
    content = ''
    while failure_count < max_try:
        try:
            response = br.open(url, timeout=8)
            content = response.read()
            break
        except Exception as err:
            failure_count += 1
            _logger.error('wget failed once, for URL: %(url)s: %(detail)s'
                          % {'url': url, 'detail': str(err)})
    return content  # empty string if every attempt failed
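# Usage sketch for wget; assumes _config['wget.retry'] is set (e.g. to '3')
# and that logging is configured so _logger output is visible. The URL is
# illustrative.
page = wget('http://example.com/feed.xml')
if not page:
    _logger.warn('every retry failed; giving up on the feed')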
def grab_image(self, html, entry):
    wee_url = entry.link.encode('utf-8')
    soup = BeautifulSoup(html, fromEncoding="utf-8")
    img = soup.find('img', src=True)
    if img is None:
        _logger.debug("%s has no image inside" % wee_url)
        return
    url = img['src']
    _logger.debug('downloading image from %s' % url)
    try:
        br = pbrowser.get_browser()
        image = br.download_image(url, base_url=wee_url).read()
    except Exception as err:
        _logger.error("downloading image failed(%s), baseurl(%s): %s"
                      % (url, wee_url, traceback.format_exc()))
        return
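# Hypothetical driver for grab_image: the method expects a feedparser-style
# entry carrying a .link attribute. `crawler` (an instance of the class that
# owns grab_image) and the feed URL are assumptions, not from the original.
import feedparser

feed = feedparser.parse('http://example.com/atom.xml')
for entry in feed.entries:
    page = wget(entry.link)
    if page:
        crawler.grab_image(page, entry)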
def spam_one_blog(self, anchor, href, target_url):
    # Unwrap interstitial-style redirect wrappers; the original sliced from
    # the start of the string, which only worked when the marker was a prefix.
    marker = '/interstitial?url='
    pos = target_url.find(marker)
    if pos != -1:
        stripped = target_url[pos + len(marker):]
        _logger.debug('stripped %s to %s' % (target_url, stripped))
        target_url = stripped
    error = ''
    retry = 0
    # Open blog post page
    browser = pbrowser.get_browser()
    while retry < 5:
        try:
            res = browser.open(target_url, timeout=10)
            html = res.read()
            break
        except Exception as err:
            error += 'open blog url failed (%d / 5):%s\n' % (retry + 1, err)
            retry += 1
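# The open-with-retries pattern above also appears in wget; a shared helper
# could look like this sketch. The helper name is hypothetical and the
# backoff sleep is an addition (the original code retries immediately).
import time

def open_with_retries(browser, url, max_try=5, timeout=10):
    # Returns the page body, or None when every attempt failed.
    for attempt in range(max_try):
        try:
            return browser.open(url, timeout=timeout).read()
        except Exception as err:
            _logger.error('open failed (%d/%d) for %s: %s'
                          % (attempt + 1, max_try, url, err))
            time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...
    return None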
def get_image(url):
    _logger.debug("processing (%s)" % url)
    br = pbrowser.get_browser()
    html = br.open(url)
    soup = BeautifulSoup(html)
    href = soup.find("img", {"id": "laimagen"})['src']
    filename = os.path.basename(href)
    if os.path.exists(BASE + filename):
        _logger.debug("ignore (%s), existed" % href)
        return
    _logger.debug("will open (%s)" % href)
    img = None
    try:
        img = br.download_image(href, timeout=120).read()
    except Exception as err:
        _logger.error("failed to download from (%s)" % href)
        return
    with open(BASE + filename, 'wb') as output:  # binary mode for image bytes
        output.write(img)
    _logger.debug("(%s) saved" % href)
def __init__(self):
    self.br = pbrowser.get_browser()
# Module-level script: walk the gallery page and fetch each image concurrently.
url = "http://www.iimmgg.com/gallery/g9cdec57288d68186a5a45d8cce577f98/"
br = pbrowser.get_browser()
_logger.debug("opening %s" % url)
html = br.open(url)
soup = BeautifulSoup(html)
list_div = soup.find("div", {"id": "galeria"})
all_links = list_div.findAll("a", {"href": True})
pool = eventlet.GreenPool()
for anchor in all_links:
    _logger.debug("opening link:(%s)" % anchor['href'])
    pool.spawn(get_image, anchor['href'])
pool.waitall()
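# Note on the GreenPool above: eventlet only overlaps these downloads if the
# blocking socket calls inside get_image cooperate, which normally requires
# monkey-patching the standard library before any sockets are created.
# Whether this module does that elsewhere is unknown; a typical preamble:
import eventlet
eventlet.monkey_patch()

pool = eventlet.GreenPool(size=10)  # also worth capping concurrent downloads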
def post_spam(input_file, limit=1000000000, verbose=False):
    if verbose:
        from datetime import datetime
        path = 'dump.' + str(datetime.now()).replace(' ', '_')
        os.mkdir(path)
    _editor_config = load_config('poster-account.conf')
    # init
    from util.pbrowser import get_browser
    browser = get_browser()
    from poseidon.composer.composer import parse_composed
    contents = parse_composed(input_file)
    handlers = _load_handlers()
    for count in xrange(limit):  # xrange: avoid materializing a huge list
        _logger.info('posting round %d' % (count + 1))
        for site_name in _editor_config:
            site_conf = _editor_config[site_name]
            config = {}
            config.update(_eng_conf[site_name])
            handler = handlers[config['handler']]
            _logger.debug('spamming [%s]...' % site_name)
            for login in site_conf['logins']:
                # skip if post limit exceeded
                if 'post-limit' in login and count >= int(login['post-limit']):
                    continue
                content = random.choice(contents)
                if not content:  # skip missing or empty content
                    continue
                title = _get_titles(content)
                # note: this rebinding drops the _eng_conf entries merged above
                config = {
                    'title': title,
                    'content': content,
                }
                config['username'] = login['username']
                config['password'] = login['password']
                config['base-url'] = login['base-url']
                success, html = handler.post_blog(browser, config)
                if success:
                    _logger.info('succeeded %s with %s:%s, base-url:%s'
                                 % (site_name, config['username'],
                                    config['password'], config['base-url']))
                else:
                    _logger.error('failed %s with %s:%s, base-url:%s'
                                  % (site_name, config['username'],
                                     config['password'], config['base-url']))
                if verbose:
                    dump_name = path + '/' + str(datetime.now()).replace(' ', '_') + '.html'
                    with open(dump_name, 'w') as dump:
                        dump.write(html)
    _logger.info('spamming finished')
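# Hypothetical entry point for post_spam; the input file name and the small
# round limit are illustrative, not from the original module.
if __name__ == '__main__':
    post_spam('composed-posts.txt', limit=3, verbose=True)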