def get_main_image_with_hint_old(url, hint, hint_encoding='utf-8'):
    # Legacy (non-Selenium) "main image" extractor: anchor on the tag that
    # best matches the text `hint`, then walk up its ancestors looking for a
    # large enough, non-banner <img>.
    # NOTE(review): this chunk ends inside the while loop -- the final
    # `return image_data, image_url` (presumably) lies outside the visible span.
    max_layer_count = 3  # how many ancestor levels to climb (td/tr are free, see below)
    if hint == '':
        _logger.debug('hint is None, will return nothing')
        return None, ''
    if type(hint) == str:
        # Normalize a byte-string hint to a unicode object (Python 2 str == bytes).
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')
    br = get_browser()
    _logger.debug('hint=(%s), opening %s' % (hint.encode('utf-8'), url.encode('utf-8')))
    br.open(url)
    html = br.get_html_source()
    html = util.convert_to_utf8(html, hint_encoding)
    html = fix_malformated_tags(html)
    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)
    if hint_tag == None:
        _logger.debug('no hint is found')
        return None, ''
    # Start from the parent so sibling images of the hint text are in scope.
    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' % (str(tag)[:200], str(tag.attrs)))
    image_data = None
    image_url = ''
    found_image = False
    layer_count = 0
    while tag != None and not found_image and layer_count <= max_layer_count:
        _logger.debug('trying tag(%s), %s' % (tag.name, tag.attrs))
        # Only consider images whose src ends in a common raster extension.
        imgs = tag.findAll('img', src=re.compile('(.jpg|.png|.jpeg|.gif)$'))
        for img in imgs:
            try:
                #print 'browser url:' + br.geturl()
                image_data = br.download_image(img['src']).read()
                import Image
                from StringIO import StringIO
                pic = Image.open(StringIO(image_data))
                # Pixel area is the size heuristic, not byte length.
                pic_size = pic.size[0] * pic.size[1]
                _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
            except Exception, err:
                _logger.error('failed to download image(%s): %s' % (img['src'], err))
                continue
            # >= ~100k pixels and not a skinny banner => accept as main image.
            if pic_size >= 100000 and _not_thin_banner(image_data):
                _logger.debug(
                    'selected main image, level: %d, url: (%s), size: (%d)' % (layer_count, img['src'], pic_size))
                image_url = img['src']
                found_image = True
                break
        # Climbing out of table cells/rows does not consume a layer.
        if not (hasattr(tag, 'name') and (tag.name == 'td' or tag.name == 'tr')):
            layer_count += 1
        tag = tag.parent
def ask_google(keywords, needed, proxy=None, callback=None, terminate=None, sleep_min=1, sleep_max=3):
    # Scrape Google web search results for `keywords`.
    # NOTE(review): this chunk ends inside the retry loop; the result-paging
    # logic (see the headerless fragment later in this file) is the likely
    # continuation. sleep_min/sleep_max are unused in the visible span --
    # presumably inter-page throttling happens in the missing part; confirm.
    keywords = urllib.quote_plus(keywords)
    random.seed()
    # Google never serves more than ~1000 results, so cap the request.
    if needed > 1000:
        needed = 1000
    br = get_browser()
    if proxy != None:
        br.set_proxies({'http': proxy, 'https': proxy})
    results = set()
    url = 'http://www.google.com/search?q=%s' % keywords
    current_page = 1
    # Kick off searching: retry the first fetch up to 5 times.
    fail_num = 0
    _logger.info('searching [%s] for %d results from %s' % (keywords, needed, url))
    while fail_num < 5:
        try:
            response = br.open(url, timeout=5.0)
            break
        except Exception, err:
            _logger.error('initial fetching failed(%s): %s' % (url, err))
            fail_num += 1
def extract_main_body(url, selenium, encoding):
    """Fetch `url` and return its main text body as a unicode string.

    The page is cleaned via clean_html(), candidate paragraphs are pulled
    out with extract_content(), and the first candidate with enough Chinese
    characters (threshold relaxed from 70 down to 50 in steps of 5) and few
    spaces in its head is returned. Returns '' on failure.
    """
    browser = get_browser()
    _logger.debug('opening %s' % url)
    browser.open(url)
    raw_page = browser.get_html_source()
    _logger.debug('removing non-content tags')
    # clean_html() yields a utf-8 encoded string (or '' on failure).
    cleaned = clean_html(raw_page, encoding)
    if cleaned == '':
        _logger.error('clean_html failed, aborting')
        return ''
    # (A meta-description shortcut used to live here; it was disabled.)
    candidates = extract_content(cleaned, selenium)
    # Try strict threshold first, then progressively relax: 70, 65, 60, 55, 50.
    for threshold in range(70, 45, -5):
        for candidate in candidates:
            cn_chars = util.chinese_charactor_count(candidate.strip())
            # Few spaces in the first 140 chars filters out link lists / menus.
            if cn_chars > threshold and candidate[:140].count(' ') <= 3:
                _logger.debug('found main body(%s), char count:%d' % (candidate.encode('utf-8'), cn_chars))
                return candidate
    return ''
def get_main_image(url):
    # Pick the largest (by downloaded byte size) .jpg/.png on the page.
    # NOTE(review): this chunk ends inside the for loop -- the return of
    # max_img/max_url is outside the visible span.
    br = get_browser()
    html = br.open(url).read()
    soup = BSoup(html)
    max_img = None   # bytes of the biggest image seen so far
    max_size = 0     # its byte length
    max_url = None   # its src attribute
    all_img = soup.findAll('img', src=re.compile("(.jpg|.png)$"))
    # NOTE(review): 'condidate' is a typo for 'candidate' in the log text;
    # left as-is because log strings are runtime output.
    _logger.debug('fetching %d condidate images' % len(all_img))
    for img in all_img:
        try:
            image_data = br.download_image(img['src']).read()
            image_size = len(image_data)
            if max_size < image_size:
                max_img = image_data
                max_url = img['src']
                max_size = image_size
        except Exception, err:
            _logger.error('error when downloading(%s):%s' % (img['src'], err))
        else:
            # Runs only when the download succeeded.
            _logger.debug("%s:%d" % (img['src'], image_size))
def clean_html(html, encoding): """ Given html of type <str>. This function alcomplish following stuff: 1. Remove non-content tags such as HTML comment, declaration, CData etc 2. Adjust the encoding so that it's consistent with charset meta tag. If there's no such tag, use UTF8 and add <meta ... content="charset='UTF8'" />. As for now, we always return UTF8 encoded string and set meta charset to UTF8 3. Various clean up: remove <meta charset="">, change '·' to ' ' """ # remove junks dealing with IE6 ptn = re.compile(r'<!–+\[.+?\]>.+?<!\[endif\]–+>', re.S) html = ptn.sub('', html) # remove junks like <meta charset="gbk" /> ptn = re.compile(r'<meta charset=.*>', re.I) html = ptn.sub('', html) try: soup = BSoup(util.convert_to_utf8(html, encoding), fromEncoding='utf-8') except Exception, err: _logger.error('Failed to create BeautifulSoup:%s' % err) return ""
def handle_failure(self, job, e):
    """Record a failed job in the metrics-style ("count#...") error log."""
    details = "count#qc.job-error=1 job=%s error=%s" % (repr(job), repr(e))
    _logger.error(details)
def get_main_image_with_hint(url, hint, selenium, hint_encoding='utf-8'):
    # Selenium-backed "main image" extractor: find the tag best matching the
    # text `hint`, then accept only images positioned at-or-below it and not
    # too far to its right (i.e. in the article column, not the sidebar).
    # NOTE(review): this chunk ends inside the for loop -- the selection /
    # return logic is outside the visible span. hint_encoding is unused here;
    # presumably consumed by the missing part or vestigial -- confirm.
    _logger.debug('hint=(%s), opening %s' % (hint, url))
    if hint == '':
        _logger.debug('hint is None, will return nothing')
        return None, ''
    if type(hint) == str:
        # Normalize a byte-string hint to unicode (Python 2 str == bytes).
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')
    # prepare selenium
    _logger.debug('opening %s in Selenium' % url)
    selenium.open(url)
    html = selenium.get_html_source()
    html = fix_malformated_tags(html)
    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)
    if hint_tag == None:
        _logger.debug('no hint is found')
        return None, ''
    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' % (str(tag)[:200], str(tag.attrs)))
    # Ask Selenium for the rendered position/size of the matching tag.
    xpath = u'//%s[text()="%s"]' % (tag.name, tag.text)
    matching_tag_left = selenium.get_element_position_left(xpath)
    matching_tag_top = selenium.get_element_position_top(xpath)
    matching_tag_width = selenium.get_element_width(xpath)
    _logger.debug('matching tag position:(left: %d, top: %d)' % (matching_tag_left, matching_tag_top))
    image_data = None
    image_url = ''
    found_image = False
    br = get_browser()
    for img in soup.findAll('img', src=True):
        xpath = u'//img[@src="%s"]' % img['src']
        try:
            left = selenium.get_element_position_left(xpath)
            top = selenium.get_element_position_top(xpath)
        except Exception, err:
            _logger.error('failed to get positon for element, xpath=(%s): %s' % (xpath, err))
            continue
        # Reject images above the hint or right of the hint's midpoint.
        if top < matching_tag_top or left > matching_tag_left + matching_tag_width / 2:
            _logger.debug(
                'ignoring img for bad pos, (top:%d, left:%d, url:%s)' % (top, left, img['src']))
            continue
        try:
            # Download relative to the page URL so relative srcs resolve.
            image_data = br.download_image(img['src'], base_url=url).read()
            import Image
            from StringIO import StringIO
            pic = Image.open(StringIO(image_data))
            # Pixel area, not byte length, is the size measure.
            pic_size = pic.size[0] * pic.size[1]
            _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
        except Exception, err:
            _logger.error('failed to download image(%s): %s' % (img['src'], err))
            continue
# NOTE(review): headerless fragment -- the enclosing `def` is outside this
# chunk. It overlaps and continues the tail of ask_google() above (same
# retry loop, then result parsing and paging).
results = set()
url = 'http://www.google.com/search?q=%s' % keywords
current_page = 1
# Kick off searching: retry the first fetch up to 5 times.
fail_num = 0
_logger.info('searching [%s] for %d results from %s' % (keywords, needed, url))
while fail_num < 5:
    try:
        response = br.open(url, timeout=5.0)
        break
    except Exception, err:
        _logger.error('initial fetching failed(%s): %s' % (url, err))
        fail_num += 1
if fail_num == 5:
    _logger.error('permanently failed')
    return []
soup = BSoup(response.read())
# Google result entries are <li class="g">; collect the first link of each.
results.update(
    set([li.find('a')['href'] for li in soup.findAll('li', 'g')]))
if callback != None:
    # Push every collected URL to the caller-supplied callback.
    for item in results:
        callback(item)
if terminate != None:
    # Early-exit mode: report page/rank of the first URL `terminate` accepts.
    for index, item in enumerate(results):
        if terminate(item):
            return {'page': current_page, 'url': url, 'rank': index + 1}
current_page += 1