Example #1
def set_meta_charset(soup, encoding):
    """
    Find the 'charset=' declaration in the meta content and replace its value with 'encoding'.
    """
    meta_processed = False
    for meta in soup.findAll('meta'):
        if meta.has_key('content'):
            content = meta['content']
            match = soup.CHARSET_RE.search(content)
            if match:

                def rewrite(match):
                    return match.group(1) + encoding

                newAttr = soup.CHARSET_RE.sub(rewrite, content)
                meta['content'] = newAttr
                _logger.debug('new attr:%s' % newAttr)
                meta_processed = True
                break

    if not meta_processed:
        meta = BeautifulSoup.Tag(soup,
                                 'meta',
                                 attrs={
                                     'http-equiv': 'Content-Type',
                                     'content': 'text/html; charset=%s' % encoding  # use the requested encoding, not hardcoded utf-8
                                 })
        soup.insert(0, meta)
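A minimal usage sketch, assuming the BeautifulSoup 3 API this code targets (CHARSET_RE is the charset-matching attribute the soup object carries, and _logger is the module-level logger used throughout):

from BeautifulSoup import BeautifulSoup

html = ('<html><head><meta http-equiv="Content-Type" '
        'content="text/html; charset=gb2312" /></head><body></body></html>')
soup = BeautifulSoup(html)
set_meta_charset(soup, 'utf-8')
print soup.find('meta')['content']  # -> text/html; charset=utf-8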
Example #2
 def abs_url(self, url):
     try:
         referer = self.geturl()
         return abs_url(referer, url)
     except Exception, err:
         _logger.debug('failed to get absolute url: %s' % err)
         return url
Example #3
def _find_tag_by_best_match(soup, target):
    match_tag = None
    best_ratio = 0
    for tag in soup.findAll(text=True):
        ratio = util.match_ratio(target, html_unescape(unicode(tag).strip()))

        if ratio > 0.9 and _prefer_tag(tag, match_tag):
            _logger.debug('>0.9 match and prefer tag:%s,(%f)' %
                          (str(tag), ratio))
            best_ratio = ratio
            match_tag = tag
        elif ratio > best_ratio + 0.3:
            _logger.debug('>best+0.3 tag:%s,(%f)' % (str(tag), ratio))
            best_ratio = ratio
            match_tag = tag
        elif ratio >= best_ratio - 0.1 and _prefer_tag(tag, match_tag):
            _logger.debug('>best-0.1 and prefer tag:%s,(%f)' %
                          (str(tag), ratio))
            best_ratio = ratio
            match_tag = tag

    if match_tag is None or best_ratio < 0.7:
        _logger.debug('No match')
        return None

    _logger.debug('best match ratio:%f' % best_ratio)
    return match_tag
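util.match_ratio is not shown in this code base; a plausible stand-in that yields the 0..1 scores the thresholds above assume is difflib's similarity ratio:

import difflib

def match_ratio(target, candidate):
    # Hypothetical sketch of util.match_ratio: 0.0 for no overlap,
    # 1.0 for identical strings.
    return difflib.SequenceMatcher(None, target, candidate).ratio()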
Example #4
def fix_malformated_tags(html):
    # fix unclosed <img>
    ptn = re.compile(r'(<img.*?src=[^>]*?)(?<!/)>', re.I)
    html, count = ptn.subn(r'\1 />', html)
    _logger.debug('%d unclosed <img> fixed' % count)

    # fix unquoted alt attributes
    ptn = re.compile(r'alt=(?!("|\'))(.+?) ', re.I)
    html, count = ptn.subn(r'alt="\2" ', html)  # raw string: \2 must be a backreference, not chr(2)
    _logger.debug('%d unquoted alt attributes fixed' % count)

    return html
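What the two substitutions do on a small sample (the alt fix depends on the raw-string backreference corrected above):

html = '<IMG src="a.jpg"><img alt=photo src="b.png" />'
print fix_malformated_tags(html)
# -> <IMG src="a.jpg" /><img alt="photo" src="b.png" />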
Example #5
def find_rank(domain, keyword):
    def check_item(url):
        if url.find('/interstitial?url=') != -1:
            url = url[len('/interstitial?url='):]
        netloc = urlparse(url).netloc  # parse once instead of twice
        pos = netloc.find(domain)
        # accept the exact domain or a match on a subdomain boundary ('.')
        return pos != -1 and (pos == 0 or netloc[pos - 1] == '.')

    _logger.debug('searching rank, keyword: (%s), domain: (%s)' %
                  (keyword, domain))

    return ask_google(keyword, 1000, terminate=check_item)
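The netloc test in check_item accepts the exact domain or a subdomain of it, but rejects superstrings such as notexample.com:

from urlparse import urlparse

for u in ('http://example.com/', 'http://www.example.com/',
          'http://notexample.com/'):
    netloc = urlparse(u).netloc
    pos = netloc.find('example.com')
    print u, pos != -1 and (pos == 0 or netloc[pos - 1] == '.')
# -> True, True, False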
Example #6
 def download_image(self, url, base_url=None, timeout=30):
     try:
         if base_url is not None:
             referer = base_url
         else:
             referer = self.geturl()
         #print 'verified browser url:' + referer
         url = abs_url(referer, url)
         request = mechanize.Request(url=url, headers={'referer': referer})
     except Exception, err:
         _logger.debug('download image without set referer: %s', err)
         request = mechanize.Request(url=url)
Example #7
def get_all_href(url, encoding='utf-8'):
    br = get_browser()
    _logger.debug('opening url(%s) for links' % url)
    br.open(url)
    _logger.debug('loaded (%s)' % url)
    html = br.get_html_source()
    soup = BSoup(util.convert_to_utf8(html, encoding), fromEncoding='utf-8')

    all_href = []
    for a in soup.findAll('a', href=True):
        a['href'] = br.abs_url(a['href'])
        all_href.append(a)
    return all_href
Example #8
def _not_thin_banner(data):
    from StringIO import StringIO
    import Image
    pic = Image.open(StringIO(data))
    if pic.size[0] > pic.size[1]:
        big = pic.size[0]
        small = pic.size[1]
    else:
        big = pic.size[1]
        small = pic.size[0]
    ratio = float(big) / float(small)
    _logger.debug('image dimension: (%d / %d = %f)' % (big, small, ratio))
    return ratio < 2.5
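A quick check with a synthetic image (assumes PIL is importable as Image, matching the imports above):

import Image
from StringIO import StringIO

buf = StringIO()
Image.new('RGB', (600, 100)).save(buf, 'PNG')  # a 6:1 banner shape
print _not_thin_banner(buf.getvalue())  # -> False (ratio 6.0 >= 2.5)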
Example #9
def abs_url(base_url, url):
    url = url.strip()
    if url[:4] == 'http':
        return url

    _logger.debug('relative url:(%s), baseurl:(%s)' % (url, base_url))
    basep = urlparse(base_url)
    filename = basep.path.split("/")[-1]

    # If the path looks like a directory (no extension) but doesn't end
    # with '/', append '/' so urljoin keeps the last segment
    if filename != '' and '.' not in filename and filename[-1] != '/':
        base_url += '/'
        _logger.debug('baseurl changed to %s' % base_url)
    return urljoin(base_url, url)
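The trailing-slash fixup matters because urljoin treats the last path segment as a file name and drops it:

from urlparse import urljoin

print urljoin('http://example.com/blog', 'img/a.jpg')
# -> http://example.com/img/a.jpg ('blog' is dropped)
print abs_url('http://example.com/blog', 'img/a.jpg')
# -> http://example.com/blog/img/a.jpg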
Example #10
 def get_html_source(self):
     try:
         html = self.response().read()
         status = self.response().info()
         if status.has_key('content-encoding'):
             if status['content-encoding'].lower() == 'gzip':
                 _logger.debug('need to extract gzip')
                 import gzip
                 from StringIO import StringIO
                 html = gzip.GzipFile(fileobj=StringIO(html)).read()
             else:
                 raise Exception('unknown content-encoding header: %s' %
                                 status['content-encoding'])
     except Exception, err:
         raise  # bare raise preserves the original traceback
Example #11
 def train_iteration(self, filepath):
     viterbi = Viterbi("EMPTY")
     viterbi.v = self.v
     with open(filepath) as train_file:
         corpus = gen_sentence_train(train_file)
         count = 0
         for doc in corpus:
             count += 1
             if count % 1000 == 0:
                 _logger.debug("%d sentence processed" % count)
             sent = [s[0] for s in doc]
             tags = [s[1] for s in doc]
             tags_pred = viterbi.decode_one(list(sent))
             assert len(sent) == len(tags) == len(tags_pred)
             feat_gold = feat_vect(sent, tags)
             feat_pred = feat_vect(sent, tags_pred)
             for feat in feat_pred:
                 self.v[feat] -= feat_pred[feat]
             for feat in feat_gold:
                 self.v[feat] += feat_gold[feat]
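This is the classic structured-perceptron update: decode with the current weights, then add the gold feature counts and subtract the predicted ones. feat_vect is not shown here; a hypothetical sketch with emission and transition counts could look like:

from collections import defaultdict

def feat_vect(sent, tags):
    # Hypothetical stand-in: count emission (tag, word) and transition
    # (prev_tag, tag) features, returning the feature -> count mapping
    # the update loop above expects.
    feats = defaultdict(int)
    prev = '<s>'
    for word, tag in zip(sent, tags):
        feats[('emit', tag, word)] += 1
        feats[('trans', prev, tag)] += 1
        prev = tag
    return feats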
Example #12
def get_main_image(url):
    br = get_browser()
    html = br.open(url).read()
    soup = BSoup(html)
    max_img = None
    max_size = 0
    max_url = None
    all_img = soup.findAll('img', src=re.compile(r'(\.jpg|\.png)$'))  # escape the dots
    _logger.debug('fetching %d candidate images' % len(all_img))
    for img in all_img:
        try:
            image_data = br.download_image(img['src']).read()
            image_size = len(image_data)
            if max_size < image_size:
                max_img = image_data
                max_url = img['src']
                max_size = image_size
        except Exception, err:
            _logger.error('error when downloading(%s):%s' % (img['src'], err))
        else:
            _logger.debug("%s:%d" % (img['src'], image_size))
Example #13
def extract_main_body(url, selenium, encoding):
    br = get_browser()
    _logger.debug('opening %s' % url)
    br.open(url)
    html = br.get_html_source()
    _logger.debug('removing non-content tags')
    html = clean_html(html, encoding)  # html is now a utf-8 string
    if html == '':
        _logger.error('clean_html failed, aborting')
        return ''

    # use meta description, if any
    # soup = BSoup(html, fromEncoding='utf-8')
    # desc = soup.find('meta', attrs={'name': 'description'})
    # if desc != None and hasattr(desc, 'content') and util.chinese_charactor_count(desc['content'].strip()) > 35:
    #     _logger.debug('use meta description as main body')
    #     return html_unescape(desc['content'].strip())

    contents = extract_content(html, selenium)
    limit = 70
    while limit >= 50:
        for content in contents:
            char_count = util.chinese_charactor_count(content.strip())
            if char_count > limit and content[:140].count(' ') <= 3:
                _logger.debug('found main body(%s), char count:%d' %
                              (content.encode('utf-8'), char_count))
                return content
        limit -= 5
    return ''
Example #14
def extract_content(html, selenium):
    # save to a temp file and open it with selenium
    path = util.dump2file_with_date(content=html, ext='html')
    url = "file://%s/%s" % (os.path.abspath('.'), path)
    _logger.debug('opening %s' % url)
    selenium.open(url)
    text = selenium.get_body_text()
    text = html_unescape(text)
    text = text.replace('\r', '\n')
    text = re.sub(r'\s{3,}', '\n', text)  # '{3, }' with a space is not a quantifier
    text = text.split('\n')

    os.remove(path)

    escaped = []
    for i in text:
        i = i.strip()
        for para in i.split('\n'):
            para = para.strip()
            if len(para) > 0:
                escaped.append(para)
    return escaped
Example #15
def convert_to_utf8(text, encoding='utf-8'):
    # Try the hinted encoding first, then fall back through common
    # Chinese encodings until one succeeds.
    for enc in (encoding, 'gbk', 'gb2312', 'utf-8', 'gb18030'):
        try:
            return text.decode(enc).encode('utf8')
        except Exception:
            _logger.debug('decode using %s failed', enc)
    raise Exception("can't decode input string, "
                    "all reasonable Chinese encodings failed")
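The fallback chain in action on a GBK-encoded byte string (decoding with the default utf-8 hint fails, so gbk is tried next and succeeds):

gbk_bytes = u'\u4e2d\u6587'.encode('gbk')  # the characters 中文 in GBK
utf8_bytes = convert_to_utf8(gbk_bytes)
print utf8_bytes.decode('utf-8')  # -> 中文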
Example #16
            continue

        try:
            image_data = br.download_image(img['src'], base_url=url).read()
            import Image
            from StringIO import StringIO
            pic = Image.open(StringIO(image_data))
            pic_size = pic.size[0] * pic.size[1]
            _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
        except Exception, err:
            _logger.error('failed to download image(%s): %s' %
                          (img['src'], err))
            continue

        if pic_size >= 60000 and _not_thin_banner(image_data):
            _logger.debug('selected main image, url: (%s), size: (%d)' %
                          (img['src'], pic_size))
            image_url = img['src']
            found_image = True
            break

    if found_image:
        return image_data, abs_url(url, image_url)
    else:
        return None, ''


def get_main_image_with_hint_old(url, hint, hint_encoding='utf-8'):
    max_layer_count = 3

    if hint == '':
        _logger.debug('hint is empty, will return nothing')
Example #17
def get_main_image_with_hint(url, hint, selenium, hint_encoding='utf-8'):
    _logger.debug('hint=(%s), opening %s' % (hint, url))

    if hint == '':
        _logger.debug('hint is empty, will return nothing')
        return None, ''
    if isinstance(hint, str):
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')

    # prepare selenium
    _logger.debug('opening %s in Selenium' % url)
    selenium.open(url)

    html = selenium.get_html_source()
    html = fix_malformated_tags(html)

    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)

    if hint_tag is None:
        _logger.debug('no hint is found')
        return None, ''

    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' %
                  (str(tag)[:200], str(tag.attrs)))

    # get left position of matching
    xpath = u'//%s[text()="%s"]' % (tag.name, tag.text)
    matching_tag_left = selenium.get_element_position_left(xpath)
    matching_tag_top = selenium.get_element_position_top(xpath)
    matching_tag_width = selenium.get_element_width(xpath)

    _logger.debug('matching tag position:(left: %d, top: %d)' %
                  (matching_tag_left, matching_tag_top))

    image_data = None
    image_url = ''
    found_image = False

    br = get_browser()

    for img in soup.findAll('img', src=True):
        xpath = u'//img[@src="%s"]' % img['src']
        try:
            left = selenium.get_element_position_left(xpath)
            top = selenium.get_element_position_top(xpath)
        except Exception, err:
            _logger.error('failed to get position for element, xpath=(%s): %s' %
                          (xpath, err))
            continue
        if top < matching_tag_top or left > matching_tag_left + matching_tag_width / 2:
            _logger.debug(
                'ignoring img for bad pos, (top:%d, left:%d, url:%s)' %
                (top, left, img['src']))
            continue

        try:
            image_data = br.download_image(img['src'], base_url=url).read()
            import Image
            from StringIO import StringIO
            pic = Image.open(StringIO(image_data))
            pic_size = pic.size[0] * pic.size[1]
            _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
        except Exception, err:
            _logger.error('failed to download image(%s): %s' %
                          (img['src'], err))
            continue
Example #18
def get_main_image_with_hint_old(url, hint, hint_encoding='utf-8'):
    max_layer_count = 3

    if hint == '':
        _logger.debug('hint is empty, will return nothing')
        return None, ''
    if isinstance(hint, str):
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')
    br = get_browser()
    _logger.debug('hint=(%s), opening %s' %
                  (hint.encode('utf-8'), url.encode('utf-8')))
    br.open(url)
    html = br.get_html_source()
    html = util.convert_to_utf8(html, hint_encoding)
    html = fix_malformated_tags(html)

    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)

    if hint_tag is None:
        _logger.debug('no hint is found')
        return None, ''

    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' %
                  (str(tag)[:200], str(tag.attrs)))
    image_data = None
    image_url = ''
    found_image = False

    layer_count = 0
    while tag is not None and not found_image and layer_count <= max_layer_count:
        _logger.debug('trying tag(%s), %s' % (tag.name, tag.attrs))
        imgs = tag.findAll('img', src=re.compile(r'(\.jpg|\.png|\.jpeg|\.gif)$'))  # escape the dots
        for img in imgs:
            try:
                #print 'browser url:' + br.geturl()
                image_data = br.download_image(img['src']).read()
                import Image
                from StringIO import StringIO
                pic = Image.open(StringIO(image_data))
                pic_size = pic.size[0] * pic.size[1]
                _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
            except Exception, err:
                _logger.error('failed to download image(%s): %s' %
                              (img['src'], err))
                continue

            if pic_size >= 100000 and _not_thin_banner(image_data):
                _logger.debug(
                    'selected main image, level: %d, url: (%s), size: (%d)' %
                    (layer_count, img['src'], pic_size))
                image_url = img['src']
                found_image = True
                break
        if not (hasattr(tag, 'name') and
                (tag.name == 'td' or tag.name == 'tr')):
            layer_count += 1
        tag = tag.parent