Example #1
def __init__(self, shard_id=0, shard_count=1):
    self.shard_id = shard_id
    self.shard_count = shard_count
    self.author_to_crawl = []
    # one stateful browser instance reused by this crawler shard
    self.br = pbrowser.get_browser()
    self.logfile = open("log", "a")
    self.crawl_history = self.build_crawl_history()
Example #2
def wget(url):
    br = pbrowser.get_browser()
    failure_count = 0
    max_try = int(_config['wget.retry'])
    content = ''
    while failure_count < max_try:
        try:
            response = br.open(url, timeout=8)
            content = response.read()
            break
        except Exception as err:
            failure_count += 1
            _logger.error('wget failed once, for URL: %(url)s: %(detail)s' % {
                'url': url,
                'detail': str(err)
            })
    # empty string if every attempt failed
    return content
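No call site for this helper appears in the examples; a hypothetical one (the URL and the 'wget.retry' value are made up here) would rely on the empty string returned when every attempt fails:

html = wget('http://example.com/feed.xml')   # assumes _config['wget.retry'] is set, e.g. '3'
if not html:
    _logger.error('giving up on http://example.com/feed.xml')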
Example #3
def grab_image(self, html, entry):
    wee_url = entry.link.encode('utf-8')
    soup = BeautifulSoup(html, fromEncoding="utf-8")
    img = soup.find('img', src=True)
    if img is None:
        _logger.debug("%s has no image inside" % wee_url)
        return
    url = img['src']

    _logger.debug('downloading image from %s' % url)
    try:
        br = pbrowser.get_browser()
        image = br.download_image(url, base_url=wee_url).read()
    except Exception as err:
        _logger.error("downloading image failed(%s), baseurl(%s): %s" %
                      (url, wee_url, traceback.format_exc()))
        return
Example #4
def spam_one_blog(self, anchor, href, target_url):
    # strip Google's interstitial wrapper from the target URL if present
    marker = '/interstitial?url='
    idx = target_url.find(marker)
    if idx != -1:
        stripped = target_url[idx + len(marker):]
        _logger.debug('stripped %s to %s' % (target_url, stripped))
        target_url = stripped
    error = ''
    retry = 0
    # Open blog post page
    browser = pbrowser.get_browser()
    while retry < 5:
        try:
            res = browser.open(target_url, timeout=10)
            html = res.read()
            break
        except Exception as err:
            error += 'open blog url failed (%d / 5): %s\n' % (retry + 1, err)
            retry += 1
Example #5
def get_image(url):
    _logger.debug("processing (%s)" % url)
    br = pbrowser.get_browser()
    html = br.open(url)
    soup = BeautifulSoup(html)
    href = soup.find("img", {"id": "laimagen"})['src']
    filename = os.path.basename(href)

    if os.path.exists(BASE + filename):
        _logger.debug("ignore (%s), already exists" % href)
        return

    _logger.debug("will open (%s)" % href)

    img = None
    try:
        img = br.download_image(href, timeout=120).read()
    except Exception as err:
        _logger.error("failed to download from (%s)" % href)
        return
Example #6
def __init__(self):
    self.br = pbrowser.get_browser()
Example #7
    img = None
    try:
        img = br.download_image(href, timeout=120).read()
    except Exception as err:
        _logger.error("failed to download from (%s)" % href)
        return

    # write the raw image bytes; binary mode keeps this portable
    with open(BASE + filename, 'wb') as output:
        output.write(img)
    _logger.debug("(%s) saved" % href)


url = "http://www.iimmgg.com/gallery/g9cdec57288d68186a5a45d8cce577f98/"

br = pbrowser.get_browser()

_logger.debug("openning %s" % url)
html = br.open(url)
soup = BeautifulSoup(html)

list_div = soup.find("div", {"id": "galeria"})
all_links = list_div.findAll("a", {"href": True})
pool = eventlet.GreenPool()

for anchor in all_links:
    _logger.debug("openning link:(%s)" % anchor['href'])
    pool.spawn(get_image, anchor['href'])

pool.waitall()
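A note on this example: eventlet's GreenPool only interleaves the image downloads if the blocking socket calls inside the browser are cooperative. The script's imports are not shown, so this is an assumption, but the usual way to achieve that is to monkey-patch the standard library before anything opens a socket:

import eventlet
eventlet.monkey_patch()   # patches socket, time, etc. so greenlets yield during I/O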
Example #8
def post_spam(input_file, limit=1000000000, verbose=False):

    if verbose:
        from datetime import datetime
        path = 'dump.' + str(datetime.now()).replace(' ', '_')
        os.mkdir(path)

    _editor_config = load_config('poster-account.conf')
    #init

    from util.pbrowser import get_browser
    browser = get_browser()

    from poseidon.composer.composer import parse_composed

    contents = parse_composed(input_file)

    handlers = _load_handlers()

    for count in range(limit):

        _logger.info('posting round %d' % (count + 1))

        for site_name in _editor_config:
            site_conf = _editor_config[site_name]
            config = {}
            config.update(_eng_conf[site_name])
            handler = handlers[config['handler']]
            _logger.debug('spamming [%s]...' % site_name)
            for login in site_conf['logins']:
                # skip if post limit exceeded
                if 'post-limit' in login and count >= int(login['post-limit']):
                    continue

                content = random.choice(contents)
                if not content:
                    continue
                title = _get_titles(content)
                config = {
                    'title': title,
                    'content': content,
                }
                config['username'] = login['username']
                config['password'] = login['password']
                config['base-url'] = login['base-url']

                success, html = handler.post_blog(browser, config)
                if success:
                    _logger.info('succeeded %s with %s:%s, base-url:%s' %
                                 (site_name, config['username'],
                                  config['password'], config['base-url']))
                else:
                    _logger.error('failed %s with %s:%s, base-url:%s' %
                                  (site_name, config['username'],
                                   config['password'], config['base-url']))
                if verbose:
                    with open(
                            path + '/' +
                            str(datetime.now()).replace(' ', '_') + '.html',
                            'w') as dump:
                        dump.write(html)

    _logger.info('spamming finished')
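Every example above obtains its browser from util.pbrowser.get_browser(), which is never shown. A minimal sketch of what such a helper might look like, inferred only from the call sites (open() accepting a timeout, download_image() accepting base_url and timeout, one stateful instance per call), is below; the class name, the headers, and the download_image() behavior are assumptions, not the real module:

import urlparse

import mechanize


class _Browser(mechanize.Browser):
    """mechanize.Browser plus a helper for fetching images referenced by a page."""

    def download_image(self, url, base_url=None, timeout=30):
        # resolve relative image URLs against the page they were found on
        if base_url:
            url = urlparse.urljoin(base_url, url)
        return self.open(url, timeout=timeout)


def get_browser():
    br = _Browser()
    br.set_handle_robots(False)   # the crawlers above fetch arbitrary pages
    br.addheaders = [('User-Agent', 'Mozilla/5.0')]
    return br

The property the examples actually depend on is that each call returns a fresh, stateful browser (its own cookies and headers) that can serve several requests, which is what this sketch preserves.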