Code example #1
0
def get_main_image_with_hint_old(url, hint, hint_encoding='utf-8'):
    """Find the main image of *url* near the tag best matching *hint*.

    Opens *url* in a browser, locates the tag whose text best matches
    *hint*, then walks up at most ``max_layer_count`` ancestor layers
    collecting <img> tags until a large-enough, non-banner image is
    found.

    Returns a ``(image_data, image_url)`` tuple; ``(None, '')`` when the
    hint is empty or no matching tag exists.

    NOTE(review): legacy Python 2 code (``except Exception, err``,
    ``StringIO``/``Image`` imports). This excerpt appears truncated —
    there is no explicit return after the ancestor-walk loop.
    """
    # Walk at most this many ancestor layers above the hint tag.
    max_layer_count = 3

    if hint == '':
        _logger.debug('hint is None, will return nothing')
        return None, ''
    # Normalize the hint to a unicode object (Python 2 str -> unicode).
    if type(hint) == str:
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')
    br = get_browser()
    _logger.debug('hint=(%s), opening %s' %
                  (hint.encode('utf-8'), url.encode('utf-8')))
    br.open(url)
    html = br.get_html_source()
    html = util.convert_to_utf8(html, hint_encoding)
    html = fix_malformated_tags(html)

    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)

    if hint_tag == None:
        _logger.debug('no hint is found')
        return None, ''

    # Start from the parent so sibling images of the hint tag are visible.
    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' %
                  (str(tag)[:200], str(tag.attrs)))
    image_data = None
    image_url = ''
    found_image = False

    layer_count = 0
    while tag != None and not found_image and layer_count <= max_layer_count:
        _logger.debug('trying tag(%s), %s' % (tag.name, tag.attrs))
        # Only consider images whose src ends with a known raster suffix.
        imgs = tag.findAll('img', src=re.compile('(.jpg|.png|.jpeg|.gif)$'))
        for img in imgs:
            try:
                #print 'browser url:' + br.geturl()
                image_data = br.download_image(img['src']).read()
                import Image
                from StringIO import StringIO
                pic = Image.open(StringIO(image_data))
                # Pixel area, not byte size, judges the image's importance.
                pic_size = pic.size[0] * pic.size[1]
                _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
            except Exception, err:
                _logger.error('failed to download image(%s): %s' %
                              (img['src'], err))
                continue

            # >= 100k pixels and not a thin banner -> accept as main image.
            if pic_size >= 100000 and _not_thin_banner(image_data):
                _logger.debug(
                    'selected main image, level: %d, url: (%s), size: (%d)' %
                    (layer_count, img['src'], pic_size))
                image_url = img['src']
                found_image = True
                break
        # Table rows/cells do not count as a distinct visual layer.
        if not (hasattr(tag, 'name') and
                (tag.name == 'td' or tag.name == 'tr')):
            layer_count += 1
        tag = tag.parent
Code example #2
0
def ask_google(keywords,
               needed,
               proxy=None,
               callback=None,
               terminate=None,
               sleep_min=1,
               sleep_max=3):
    """Query Google web search for *keywords* and collect result links.

    *needed* caps how many results to gather (at most 1000). *proxy* is
    an optional host:port string applied to both http and https.
    *callback* is invoked per result URL; *terminate* is a predicate
    that stops the search when it returns True.

    NOTE(review): Python 2 code; this excerpt is truncated — retry
    handling after the loop and pagination continue beyond this view
    (``sleep_min``/``sleep_max`` are unused within it).
    """
    keywords = urllib.quote_plus(keywords)
    random.seed()
    # Google will not serve more than ~1000 results per query.
    if needed > 1000:
        needed = 1000
    br = get_browser()
    if proxy != None:
        br.set_proxies({'http': proxy, 'https': proxy})
    results = set()
    url = 'http://www.google.com/search?q=%s' % keywords
    current_page = 1
    # Kick off searching
    fail_num = 0
    _logger.info('searching [%s] for %d results from %s' %
                 (keywords, needed, url))
    # Retry the initial fetch up to 5 times before giving up.
    while fail_num < 5:
        try:
            response = br.open(url, timeout=5.0)
            break
        except Exception, err:
            _logger.error('initial fetching failed(%s): %s' % (url, err))
            fail_num += 1
Code example #3
0
def extract_main_body(url, selenium, encoding):
    """Fetch *url* and return the text block that looks like the main body.

    The page is fetched, stripped of non-content tags via clean_html,
    and split into candidate blocks by extract_content. The acceptance
    threshold relaxes from 70 down to 50 Chinese characters in steps of
    5; the first candidate exceeding the current threshold, with at most
    3 spaces in its first 140 characters, is returned. Returns '' when
    cleaning fails or no candidate qualifies.
    """
    browser = get_browser()
    _logger.debug('opening %s' % url)
    browser.open(url)
    page = browser.get_html_source()
    _logger.debug('removing non-content tags')
    page = clean_html(page, encoding)  # utf-8 string after cleaning
    if page == '':
        _logger.error('clean_html failed, aborting')
        return ''

    candidates = extract_content(page, selenium)
    # Thresholds: 70, 65, 60, 55, 50 — progressively less strict.
    for threshold in range(70, 45, -5):
        for candidate in candidates:
            count = util.chinese_charactor_count(candidate.strip())
            if count > threshold and candidate[:140].count(' ') <= 3:
                _logger.debug('found main body(%s), char count:%d' %
                              (candidate.encode('utf-8'), count))
                return candidate
    return ''
Code example #4
0
def get_main_image(url):
    """Scan *url* for the largest (by byte size) .jpg/.png image.

    NOTE(review): truncated excerpt — no return statement is visible and
    ``max_img``/``max_url`` are unused within this view, so the final
    selection presumably happens beyond it. Python 2 code
    (``except Exception, err``).
    """
    br = get_browser()
    html = br.open(url).read()
    soup = BSoup(html)
    max_img = None   # raw bytes of the largest image seen so far
    max_size = 0     # its byte length
    max_url = None   # its src URL
    all_img = soup.findAll('img', src=re.compile("(.jpg|.png)$"))
    _logger.debug('fetching %d condidate images' % len(all_img))
    for img in all_img:
        try:
            image_data = br.download_image(img['src']).read()
            image_size = len(image_data)
            if max_size < image_size:
                max_img = image_data
                max_url = img['src']
                max_size = image_size
        except Exception, err:
            _logger.error('error when downloading(%s):%s' % (img['src'], err))
        else:
            # Log sizes only for images that downloaded successfully.
            _logger.debug("%s:%d" % (img['src'], image_size))
Code example #5
0
def clean_html(html, encoding):
    """
    Given html of type <str>, this function accomplishes the following:
    1. Remove non-content tags such as HTML comments, declarations, CData etc.
    2. Adjust the encoding so that it is consistent with the charset meta tag.
       If there is no such tag, use UTF8 and add <meta ... content="charset='UTF8'" />.
       As for now, we always return a UTF8 encoded string and set meta charset to UTF8.
    3. Various clean up: remove <meta charset="">, change '·' to ' '

    NOTE(review): truncated excerpt — the steps after the soup is built
    (and the success-path return) are not visible in this view.
    """

    # remove junks dealing with IE6
    # NOTE(review): the pattern uses en-dash characters ('–') where IE
    # conditional comments use '--'; this looks like encoding corruption
    # and probably never matches real '<!--[if ...]> ... <![endif]-->'
    # blocks — confirm against the intended input before relying on it.
    ptn = re.compile(r'<!–+\[.+?\]>.+?<!\[endif\]–+>', re.S)
    html = ptn.sub('', html)
    # remove junks like <meta charset="gbk" />
    ptn = re.compile(r'<meta charset=.*>', re.I)
    html = ptn.sub('', html)

    try:
        soup = BSoup(util.convert_to_utf8(html, encoding),
                     fromEncoding='utf-8')
    except Exception, err:
        _logger.error('Failed to create BeautifulSoup:%s' % err)
        return ""
Code example #6
0
File: worker.py  Project: cecton/pueuey
 def handle_failure(self, job, e):
     """Log a failed job and its exception as a logfmt-style counter line."""
     message = "count#qc.job-error=1 job=%s error=%s" % (repr(job), repr(e))
     _logger.error(message)
Code example #7
0
def get_main_image_with_hint(url, hint, selenium, hint_encoding='utf-8'):
    """Find the main image of *url* using a text *hint* plus Selenium layout.

    Renders the page in *selenium*, locates the tag whose text best
    matches *hint*, then scans <img> tags whose on-screen position is
    below the hint and not too far to its right, downloading each in
    turn.

    NOTE(review): truncated excerpt — the acceptance criteria after the
    download and the final return are not visible here. Python 2 code
    (``except Exception, err``, ``StringIO``/``Image`` imports).
    """
    _logger.debug('hint=(%s), opening %s' % (hint, url))

    if hint == '':
        _logger.debug('hint is None, will return nothing')
        return None, ''
    # Normalize the hint to a unicode object (Python 2 str -> unicode).
    if type(hint) == str:
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')

    # prepare selenium
    _logger.debug('opening %s in Selenium' % url)
    selenium.open(url)

    html = selenium.get_html_source()
    html = fix_malformated_tags(html)

    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)

    if hint_tag == None:
        _logger.debug('no hint is found')
        return None, ''

    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' %
                  (str(tag)[:200], str(tag.attrs)))

    # get left position of matching
    xpath = u'//%s[text()="%s"]' % (tag.name, tag.text)
    matching_tag_left = selenium.get_element_position_left(xpath)
    matching_tag_top = selenium.get_element_position_top(xpath)
    matching_tag_width = selenium.get_element_width(xpath)

    _logger.debug('matching tag position:(left: %d, top: %d)' %
                  (matching_tag_left, matching_tag_top))

    image_data = None
    image_url = ''
    found_image = False

    br = get_browser()

    for img in soup.findAll('img', src=True):
        xpath = u'//img[@src="%s"]' % img['src']
        try:
            left = selenium.get_element_position_left(xpath)
            top = selenium.get_element_position_top(xpath)
        except Exception, err:
            _logger.error('failed to get positon for element, xpath=(%s): %s' %
                          (xpath, err))
            continue
        # Skip images above the hint, or beyond the hint's horizontal midpoint.
        if top < matching_tag_top or left > matching_tag_left + matching_tag_width / 2:
            _logger.debug(
                'ignoring img for bad pos, (top:%d, left:%d, url:%s)' %
                (top, left, img['src']))
            continue

        try:
            image_data = br.download_image(img['src'], base_url=url).read()
            import Image
            from StringIO import StringIO
            pic = Image.open(StringIO(image_data))
            # Pixel area, not byte size.
            pic_size = pic.size[0] * pic.size[1]
            _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
        except Exception, err:
            _logger.error('failed to download image(%s): %s' %
                          (img['src'], err))
            continue
Code example #8
0
    # NOTE(review): fragment of ask_google's body — the enclosing def is
    # outside this excerpt; see code example #2 for its opening lines.
    results = set()
    url = 'http://www.google.com/search?q=%s' % keywords
    current_page = 1
    # Kick off searching
    fail_num = 0
    _logger.info('searching [%s] for %d results from %s' %
                 (keywords, needed, url))
    # Retry the initial fetch up to 5 times before giving up.
    while fail_num < 5:
        try:
            response = br.open(url, timeout=5.0)
            break
        except Exception, err:
            _logger.error('initial fetching failed(%s): %s' % (url, err))
            fail_num += 1
    if fail_num == 5:
        _logger.error('permanently failed')
        return []
    # Google result links live inside <li class="g"> elements.
    soup = BSoup(response.read())
    results.update(
        set([li.find('a')['href'] for li in soup.findAll('li', 'g')]))

    if callback != None:
        for item in results:
            callback(item)

    if terminate != None:
        for index, item in enumerate(results):
            if terminate(item):
                # rank is the 1-based position of the terminating result.
                return {'page': current_page, 'url': url, 'rank': index + 1}

    current_page += 1