def set_meta_charset(soup, encoding):
    """Rewrite the charset declared in the document's <meta> tags to *encoding*.

    Searches every <meta> tag for a ``content`` attribute matching the
    soup's CHARSET_RE; the first match has its charset value rewritten in
    place. If no tag declares a charset, a new Content-Type <meta> tag
    declaring *encoding* is inserted at the top of the document.
    """
    meta_processed = False
    for meta in soup.findAll('meta'):
        if meta.has_key('content'):
            content = meta['content']
            match = soup.CHARSET_RE.search(content)
            if match:
                def rewrite(match):
                    return match.group(1) + encoding
                newAttr = soup.CHARSET_RE.sub(rewrite, content)
                meta['content'] = newAttr
                _logger.debug('new attr:%s' % newAttr)
                meta_processed = True
                break
    if not meta_processed:
        # BUG FIX: the inserted tag previously hard-coded charset=utf-8
        # regardless of the requested encoding; declare *encoding* as the
        # docstring promises.
        meta = BeautifulSoup.Tag(soup, 'meta', attrs={
            'http-equiv': 'Content-Type',
            'content': 'text/html; charset=%s' % encoding
        })
        soup.insert(0, meta)
def abs_url(self, url): try: referer = self.geturl() return abs_url(referer, url) except Exception, err: _logger.debug('failed to get absolute url: %s' % err) return url
def _find_tag_by_best_match(soup, target):
    """Find the text node in *soup* whose content best fuzzy-matches *target*.

    Scans every text node, scoring each with util.match_ratio against its
    HTML-unescaped, stripped text. A candidate replaces the current best
    when any of these (order matters) holds:
      1. ratio > 0.9 and the tag is structurally preferred (_prefer_tag)
         over the current best;
      2. ratio beats the current best by more than 0.3;
      3. ratio is no more than 0.1 below the best AND the tag is preferred.
    Returns the matched text node, or None when the final best ratio is
    below 0.7.
    """
    match_tag = None
    best_ratio = 0
    for tag in soup.findAll(text=True):
        ratio = util.match_ratio(target, html_unescape(unicode(tag).strip()))
        if ratio > 0.9 and _prefer_tag(tag, match_tag):
            _logger.debug('>0.9 match and prefer tag:%s,(%f)' % (str(tag), ratio))
            best_ratio = ratio
            match_tag = tag
        elif ratio > best_ratio + 0.3:
            # A big jump in score wins even without structural preference.
            _logger.debug('>best+0.3 tag:%s,(%f)' % (str(tag), ratio))
            best_ratio = ratio
            match_tag = tag
        elif ratio >= best_ratio - 0.1 and _prefer_tag(tag, match_tag):
            # Near-tie broken by structural preference; note best_ratio may
            # decrease slightly here by design.
            _logger.debug('>best-0.1 and prefer tag:%s,(%f)' % (str(tag), ratio))
            best_ratio = ratio
            match_tag = tag
    if match_tag == None or best_ratio < 0.7:
        _logger.debug('No match')
        return None
    _logger.debug('best match ratio:%f' % best_ratio)
    return match_tag
def fix_malformated_tags(html):
    """Repair two common HTML malformations before parsing.

    - Close unclosed ``<img ...>`` tags by appending ``/>``.
    - Quote unquoted ``alt`` attribute values.

    Returns the repaired HTML string.
    """
    # fix unclosed <img>
    ptn = re.compile(r'(<img.*?src=[^>]*?)(?<!/)>', re.I)
    html, count = ptn.subn(r'\1 />', html)
    _logger.debug('%d non closing <img> fixed' % count)
    # fix unquoted alt attributes
    ptn = re.compile(r'alt=(?!("|\'))(.+?) ', re.I)
    # BUG FIX: the replacement was 'alt="\2" ' in a non-raw string, where
    # '\2' is the octal escape for chr(2), not a group backreference --
    # the alt value was replaced with a control character. A raw string
    # substitutes group 2 (the attribute value) as intended.
    html, count = ptn.subn(r'alt="\2" ', html)
    return html
def find_rank(domain, keyword):
    """Return the Google search rank of *domain* for *keyword*, scanning
    up to 1000 results."""
    def check_item(url):
        # Google sometimes wraps results in an interstitial redirect;
        # strip that prefix before inspecting the host.
        prefix = '/interstitial?url='
        if url.find(prefix) != -1:
            url = url[len(prefix):]
        host = urlparse(url).netloc
        idx = host.find(domain)
        # Accept the exact host or any subdomain of *domain*.
        return idx != -1 and (idx == 0 or host[idx - 1] == '.')
    _logger.debug('searching rank, keyword: (%s), domain: (%s)' % (keyword, domain))
    return ask_google(keyword, 1000, terminate=check_item)
def download_image(self, url, base_url=None, timeout=30):
    """Build an image download request for *url* with a referer header.

    The referer is *base_url* when given, otherwise the browser's
    current location; if neither can be determined the request is built
    without a referer.

    NOTE(review): *timeout* is accepted but unused in the visible code,
    and the built request is never visibly returned or opened -- this
    function looks truncated; confirm against the full source.
    """
    try:
        if base_url != None:
            referer = base_url
        else:
            referer = self.geturl()
        #print 'verified browser url:' + referer
        # Image src may be relative to the referring page.
        url = abs_url(referer, url)
        request = mechanize.Request(url=url, headers={'referer': referer})
    except Exception, err:
        # Best effort: fall back to a bare request without a referer.
        _logger.debug('download image without set referer: %s', err)
        request = mechanize.Request(url=url)
def get_all_href(url, encoding='utf-8'):
    """Open *url* and return all <a> tags with their href attributes
    rewritten to absolute URLs.

    *encoding* is the page's assumed source encoding; the HTML is
    normalised to UTF-8 before parsing.
    """
    browser = get_browser()
    _logger.debug('opening url(%s) for links' % url)
    browser.open(url)
    _logger.debug('loaded (%s)' % url)
    source = browser.get_html_source()
    soup = BSoup(util.convert_to_utf8(source, encoding), fromEncoding='utf-8')
    links = []
    for anchor in soup.findAll('a', href=True):
        anchor['href'] = browser.abs_url(anchor['href'])
        links.append(anchor)
    return links
def _not_thin_banner(data):
    """Return True when the image in *data* (raw bytes) is not a thin
    banner, i.e. its long side is less than 2.5x its short side."""
    from StringIO import StringIO
    import Image
    pic = Image.open(StringIO(data))
    big = max(pic.size)
    small = min(pic.size)
    ratio = float(big) / float(small)
    _logger.debug('image demension: (%d / %d = %f)' % (big, small, ratio))
    return ratio < 2.5
def abs_url(base_url, url):
    """Resolve a possibly-relative *url* against *base_url*.

    URLs already starting with 'http' are returned as-is. When the base
    URL's last path segment looks like a directory (no dot) but lacks a
    trailing '/', one is appended so urljoin resolves relative paths
    against the directory rather than its parent.
    """
    url = url.strip()
    if url.startswith('http'):
        return url
    _logger.debug('relative url:(%s), baseurl:(%s)' % (url, base_url))
    last_segment = urlparse(base_url).path.split("/")[-1]
    # For URL is direcotry but not end with '/', append '/'
    if last_segment and "." not in last_segment and not last_segment.endswith('/'):
        base_url += '/'
        _logger.debug('baseurl changed to %s' % base_url)
    return urljoin(base_url, url)
def get_html_source(self): try: html = self.response().read() status = self.response().info() if status.has_key('content-encoding'): if status['content-encoding'].lower() == 'gzip': _logger.debug('need to extract gzip') import gzip from StringIO import StringIO html = gzip.GzipFile(fileobj=StringIO(html)).read() else: raise Exception('unknown content-encoding header: %s' % status['content-encoding']) except Exception, err: raise err
def train_iteration(self, filepath):
    """Run one structured-perceptron training pass over *filepath*.

    For each sentence, decodes a predicted tag sequence with the current
    weight vector via Viterbi, then applies the perceptron update:
    subtract the predicted sequence's features, add the gold sequence's
    features. Updates self.v in place.
    """
    viterbi = Viterbi("EMPTY")
    # Share the live weight vector so decoding reflects updates so far.
    viterbi.v = self.v
    with open(filepath) as train_file:
        corpus = gen_sentence_train(train_file)
        count = 0
        for doc in corpus:
            count += 1
            if count % 1000 == 0:
                _logger.debug("%d sentence processed" % count)
            # doc is a sequence of (word, tag) pairs.
            sent = [s[0] for s in doc]
            tags = [s[1] for s in doc]
            tags_pred = viterbi.decode_one(list(sent))
            assert len(sent) == len(tags) == len(tags_pred)
            feat_gold = feat_vect(sent, tags)
            feat_pred = feat_vect(sent, tags_pred)
            # Perceptron update: penalise predicted features, reward gold.
            # (No-op overall when the prediction matches the gold tags.)
            for feat in feat_pred:
                self.v[feat] -= feat_pred[feat]
            for feat in feat_gold:
                self.v[feat] += feat_gold[feat]
def get_main_image(url):
    """Scan *url* for .jpg/.png images and track the largest by byte size.

    NOTE(review): as shown, the function records max_img/max_url/max_size
    but has no return statement, so it implicitly returns None -- confirm
    whether a trailing return was lost from this view.
    """
    br = get_browser()
    html = br.open(url).read()
    soup = BSoup(html)
    max_img = None
    max_size = 0
    max_url = None
    # NOTE(review): the dots are unescaped, so e.g. "fooXjpg" also
    # matches; probably intended r'(\.jpg|\.png)$'.
    all_img = soup.findAll('img', src=re.compile("(.jpg|.png)$"))
    _logger.debug('fetching %d condidate images' % len(all_img))
    for img in all_img:
        try:
            image_data = br.download_image(img['src']).read()
            image_size = len(image_data)
            if max_size < image_size:
                max_img = image_data
                max_url = img['src']
                max_size = image_size
        except Exception, err:
            # Best effort: a failed download just skips this candidate.
            _logger.error('error when downloading(%s):%s' % (img['src'], err))
        else:
            _logger.debug("%s:%d" % (img['src'], image_size))
def extract_main_body(url, selenium, encoding):
    """Fetch *url*, strip non-content markup, and return the main body text.

    Candidate paragraphs come from extract_content. The first candidate
    whose Chinese-character count exceeds a threshold (relaxed from 70
    down to 50 in steps of 5) and whose first 140 characters contain at
    most 3 spaces is returned. Returns '' on failure.
    """
    browser = get_browser()
    _logger.debug('opening %s' % url)
    browser.open(url)
    page = browser.get_html_source()
    _logger.debug('removing non-content tags')
    page = clean_html(page, encoding)
    # page is now a utf-8 string
    if page == '':
        _logger.error('clean_html failed, aborting')
        return ''
    candidates = extract_content(page, selenium)
    # Thresholds 70, 65, 60, 55, 50 -- progressively more permissive.
    for threshold in range(70, 45, -5):
        for candidate in candidates:
            char_count = util.chinese_charactor_count(candidate.strip())
            if char_count > threshold and candidate[:140].count(' ') <= 3:
                _logger.debug('found main body(%s), char count:%d'
                              % (candidate.encode('utf-8'), char_count))
                return candidate
    return ''
def extract_content(html, selenium):
    """Render *html* in Selenium and return its visible text as a list of
    non-empty, stripped paragraphs.

    The HTML is dumped to a temp file so Selenium can open it through a
    file:// URL.
    """
    # safe to temp file and open with selenium
    path = util.dump2file_with_date(content=html, ext='html')
    url = "file://%s/%s" % (os.path.abspath('.'), path)
    _logger.debug('opening %s' % url)
    selenium.open(url)
    text = selenium.get_body_text()
    text = html_unescape(text)
    text = text.replace('\r', '\n')
    # BUG FIX: the pattern was r'\s{3, }' -- the space inside the braces
    # makes "{3, }" literal text rather than a quantifier, so runs of
    # whitespace were never collapsed. r'\s{3,}' collapses 3+ whitespace
    # characters into a single newline as intended.
    text = re.sub(r'\s{3,}', '\n', text)
    text = text.split('\n')
    # os.system('rm %s' % path)
    escaped = []
    for line in text:
        line = line.strip()
        for para in line.split('\n'):
            para = para.strip()
            if len(para) > 0:
                escaped.append(para)
    return escaped
def convert_to_utf8(text, encoding='utf-8'):
    """Decode *text* and re-encode it as UTF-8.

    Tries the caller-supplied *encoding* first, then falls back through
    the common Chinese encodings in the historical order: gbk, gb2312,
    utf-8, gb18030 (broadest superset last).

    Raises Exception when every candidate encoding fails.
    """
    # Replaces five copy-pasted try/bare-except blocks with one loop and
    # narrowed exceptions (LookupError covers unknown encoding names).
    for candidate in (encoding, 'gbk', 'gb2312', 'utf-8', 'gb18030'):
        try:
            return text.decode(candidate).encode('utf8')
        except (UnicodeError, LookupError):
            _logger.debug('decode using %s failed', candidate)
    raise Exception(
        'can\'t decode input string, all reasonable Chinese encoding method failed'
    )
continue try: image_data = br.download_image(img['src'], base_url=url).read() import Image from StringIO import StringIO pic = Image.open(StringIO(image_data)) pic_size = pic.size[0] * pic.size[1] _logger.debug('got image(%d, %s)' % (pic_size, img['src'])) except Exception, err: _logger.error('failed to download image(%s): %s' % (img['src'], err)) continue if pic_size >= 60000 and _not_thin_banner(image_data): _logger.debug('selected main image, url: (%s), size: (%d)' % (img['src'], pic_size)) image_url = img['src'] found_image = True break if found_image: return image_data, abs_url(url, image_url) else: return None, '' def get_main_image_with_hint_old(url, hint, hint_encoding='utf-8'): max_layer_count = 3 if hint == '': _logger.debug('hint is None, will return nothing')
def get_main_image_with_hint(url, hint, selenium, hint_encoding='utf-8'):
    """Locate the main image of *url* near the text best matching *hint*.

    Renders the page in Selenium, fuzzy-matches *hint* against the page's
    text nodes, then walks the page's <img> tags, discarding any that are
    positioned above the matching text or too far to its right, and
    downloads the remainder for inspection.

    NOTE(review): this function appears truncated in this view -- the
    loop body ends with `continue` and no path returns a value; the
    size check and final return are not visible here.
    NOTE(review): *hint_encoding* is accepted but unused in the visible
    code.
    """
    _logger.debug('hint=(%s), opening %s' % (hint, url))
    if hint == '':
        _logger.debug('hint is None, will return nothing')
        return None, ''
    if type(hint) == str:
        # Normalise byte-string hints to unicode via utf-8.
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')
    # prepare selenium
    _logger.debug('opening %s in Selenium' % url)
    selenium.open(url)
    html = selenium.get_html_source()
    html = fix_malformated_tags(html)
    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)
    if hint_tag == None:
        _logger.debug('no hint is found')
        return None, ''
    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' % (str(tag)[:200], str(tag.attrs)))
    # get left position of matching
    xpath = u'//%s[text()="%s"]' % (tag.name, tag.text)
    matching_tag_left = selenium.get_element_position_left(xpath)
    matching_tag_top = selenium.get_element_position_top(xpath)
    matching_tag_width = selenium.get_element_width(xpath)
    _logger.debug('matching tag position:(left: %d, top: %d)' %
                  (matching_tag_left, matching_tag_top))
    image_data = None
    image_url = ''
    found_image = False
    br = get_browser()
    for img in soup.findAll('img', src=True):
        xpath = u'//img[@src="%s"]' % img['src']
        try:
            left = selenium.get_element_position_left(xpath)
            top = selenium.get_element_position_top(xpath)
        except Exception, err:
            _logger.error('failed to get positon for element, xpath=(%s): %s'
                          % (xpath, err))
            continue
        # Skip images above the hint text or more than half the hint's
        # width to its right.
        if top < matching_tag_top or left > matching_tag_left + matching_tag_width / 2:
            _logger.debug(
                'ignoring img for bad pos, (top:%d, left:%d, url:%s)'
                % (top, left, img['src']))
            continue
        try:
            image_data = br.download_image(img['src'], base_url=url).read()
            import Image
            from StringIO import StringIO
            pic = Image.open(StringIO(image_data))
            # Pixel area, used as the size criterion.
            pic_size = pic.size[0] * pic.size[1]
            _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
        except Exception, err:
            _logger.error('failed to download image(%s): %s' % (img['src'], err))
            continue
def get_main_image_with_hint_old(url, hint, hint_encoding='utf-8'):
    """Older, Selenium-free variant: find the main image near the text
    matching *hint* by walking up the DOM from the matched tag.

    Starting at the matched text node's parent, each ancestor (table
    rows/cells do not count toward the max_layer_count budget) is
    searched for a sufficiently large, non-banner image.

    NOTE(review): this function appears truncated in this view -- the
    final return after the while loop is not visible.
    """
    max_layer_count = 3
    if hint == '':
        _logger.debug('hint is None, will return nothing')
        return None, ''
    if type(hint) == str:
        # Normalise byte-string hints to unicode via utf-8.
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')
    br = get_browser()
    _logger.debug('hint=(%s), opening %s' % (hint.encode('utf-8'), url.encode('utf-8')))
    br.open(url)
    html = br.get_html_source()
    html = util.convert_to_utf8(html, hint_encoding)
    html = fix_malformated_tags(html)
    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)
    if hint_tag == None:
        _logger.debug('no hint is found')
        return None, ''
    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' % (str(tag)[:200], str(tag.attrs)))
    image_data = None
    image_url = ''
    found_image = False
    layer_count = 0
    while tag != None and not found_image and layer_count <= max_layer_count:
        _logger.debug('trying tag(%s), %s' % (tag.name, tag.attrs))
        imgs = tag.findAll('img', src=re.compile('(.jpg|.png|.jpeg|.gif)$'))
        for img in imgs:
            try:
                #print 'browser url:' + br.geturl()
                image_data = br.download_image(img['src']).read()
                import Image
                from StringIO import StringIO
                pic = Image.open(StringIO(image_data))
                # Pixel area, used as the size criterion.
                pic_size = pic.size[0] * pic.size[1]
                _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
            except Exception, err:
                _logger.error('failed to download image(%s): %s' % (img['src'], err))
                continue
            if pic_size >= 100000 and _not_thin_banner(image_data):
                _logger.debug(
                    'selected main image, level: %d, url: (%s), size: (%d)'
                    % (layer_count, img['src'], pic_size))
                image_url = img['src']
                found_image = True
                break
        # Table rows/cells do not consume a layer of the ancestor budget.
        if not (hasattr(tag, 'name') and (tag.name == 'td' or tag.name == 'tr')):
            layer_count += 1
        tag = tag.parent