Example #1
def get_main_image_with_hint_old(url, hint, hint_encoding='utf-8'):
    max_layer_count = 3

    if hint == '':
        _logger.debug('hint is empty, will return nothing')
        return None, ''
    if isinstance(hint, str):
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')
    br = get_browser()
    _logger.debug('hint=(%s), opening %s' %
                  (hint.encode('utf-8'), url.encode('utf-8')))
    br.open(url)
    html = br.get_html_source()
    html = util.convert_to_utf8(html, hint_encoding)
    html = fix_malformated_tags(html)

    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)

    if hint_tag is None:
        _logger.debug('no matching tag found for hint')
        return None, ''

    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' %
                  (str(tag)[:200], str(tag.attrs)))
    image_data = None
    image_url = ''
    found_image = False

    layer_count = 0
    while tag is not None and not found_image and layer_count <= max_layer_count:
        _logger.debug('trying tag(%s), %s' % (tag.name, tag.attrs))
        imgs = tag.findAll('img', src=re.compile(r'\.(jpg|png|jpeg|gif)$'))
        for img in imgs:
            try:
                image_data = br.download_image(img['src']).read()
                # probe the image dimensions with PIL (Python 2)
                import Image
                from StringIO import StringIO
                pic = Image.open(StringIO(image_data))
                pic_size = pic.size[0] * pic.size[1]
                _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
            except Exception, err:
                _logger.error('failed to download image(%s): %s' %
                              (img['src'], err))
                continue

            if pic_size >= 100000 and _not_thin_banner(image_data):
                _logger.debug(
                    'selected main image, level: %d, url: (%s), size: (%d)' %
                    (layer_count, img['src'], pic_size))
                image_url = img['src']
                found_image = True
                break
        if not (hasattr(tag, 'name') and tag.name in ('td', 'tr')):
            layer_count += 1
        tag = tag.parent

    # mirror the early returns above: (image bytes or None, url or '')
    return image_data, image_url
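
The same in-loop PIL probe reappears in Example #11 below; a minimal helper one could factor out (the name `_image_pixel_count` is hypothetical; `Image` and `StringIO` are the same Python 2 modules imported in the loop above):

def _image_pixel_count(image_data):
    # Same calls as the probe above: parse the raw bytes with PIL and
    # return width * height so callers can threshold on pixel area.
    import Image
    from StringIO import StringIO
    pic = Image.open(StringIO(image_data))
    return pic.size[0] * pic.size[1]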
Example #2
def new_mark():
    form = MarkForm()
    if form.validate_on_submit():
        m = Mark()
        form.populate_obj(m)
        m.owner_id = g.user.id
        m.created = datetime.utcnow()
        if form.tags.data:
            m.tags = ' '.join(
                [t.strip() for t in form.tags.data.strip().split(',')]).lower()
        m.clicks = 0
        if not form.title.data:
            soup = BSoup(urlopen(form.url.data))
            m.title = soup.title.string
        db.session.add(m)
        db.session.commit()
        flash('New mark %s added' % (m.title), category='info')
        return redirect(url_for('marks'))
    if request.args.get('url'):
        form.url.data = request.args.get('url')
    if request.args.get('title'):
        form.title.data = request.args.get('title')
    if request.args.get('type') == 'feed':
        form.type.data = 'feed'
    return render_template('mark/new.html', title='New mark', form=form)
Example #3
    def scrape(self, force=False):
        html = self.download(force)

        # Parse HTML
        html_soup = BSoup(html)

        for link in html_soup.body.findAll(href=re.compile(r'\.mp3$')):
            enclosure = link['href']
            if not self.items.filter(enclosure=enclosure).exists():
                item = Item(title=link.text.strip(), enclosure=link['href'])
                item.get_info()
                self.items.add(item)
Example #4
def get_all_href(url, encoding='utf-8'):
    br = get_browser()
    _logger.debug('opening url(%s) for links' % url)
    br.open(url)
    _logger.debug('loaded (%s)' % url)
    html = br.get_html_source()
    soup = BSoup(util.convert_to_utf8(html, encoding), fromEncoding='utf-8')

    all_href = []
    for a in soup.findAll('a', href=True):
        a['href'] = br.abs_url(a['href'])
        all_href.append(a)
    return all_href
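
The `br.abs_url` helper is not shown in these examples; assuming it resolves a possibly-relative href against the current page, the Python 2 standard library does the same job with `urlparse.urljoin`:

from urlparse import urljoin

# Resolve a relative href against the page it came from.
print urljoin('http://example.com/a/b.html', '../img/logo.png')
# -> http://example.com/img/logo.png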
Example #5
def RunConversion():
    global DBlist, DBdict
    path = "manpages/"
    dirList = os.listdir(path)

    for fname in dirList:
        if fname.endswith(".html"):
            DBdict = dict()
            content = False
            print "\nReading", fname
            newstring = '.'.join(fname.split('.')[0:-1]) + '.txt'
            f = open(path + fname, 'r')
            content = f.read()
            f.close()
            if content:
                try:
                    content = (re.sub(".*[M|n]an.*converted.*", "", content))
                    content = (re.sub(".*man2html.*", "", content))
                    soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                    c = ''.join(soup.body(text=True))
                    f = open(path + newstring, 'w')
                    towrite = c.encode('utf-8')
                    cleandata = re.search(r"(\w+\(.*)", towrite, re.S).group(1)

                    base = fname.split('.')[0]
                    DBdict['name'] = (base[:-1] + "(" + base[-1:] + ")").strip()
                    DBdict['cleandata'] = cleandata.strip()
                    if re.search("NAME\n(.*)\n", cleandata, re.S):
                        DBdict['header'] = re.search("NAME\n(.+?)\n",
                                                     cleandata,
                                                     re.S).group(1).strip()
                    else:
                        DBdict['header'] = fname.split('.')[0][:-1]
                    DBlist.append(DBdict)

                    f.write(cleandata)
                    f.close()
                    print newstring, " done !"
                except TypeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write("\tError " + fname + " - " + str(e) + "\n")
                except UnicodeEncodeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write("\t\tError " + fname + " - " + str(e) + "\n")
                except AttributeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write("\t\t\tError " + fname + " - " + str(e) + "\n")
Example #6
def bookmark_edit(request):
    user = User.by_id(authenticated_userid(request))
    id = int(request.params.get('id', -1))
    bookmark = user.bookmark(id)
    if not bookmark:
        return HTTPNotFound()
    form = BookmarkUpdateForm(request.POST, bookmark)
    if request.method == 'POST' and form.validate():
        form.populate_obj(bookmark)
        bookmark.tags = ' '.join(
            [t.strip() for t in form.tags.data.strip().split(',')]).lower()
        if not form.title.data:
            soup = BSoup(urlopen(form.url.data))
            bookmark.title = soup.title.string
        return HTTPFound(location=request.route_url('index'))
    return {'form': form, 'action': 'edit', 'title': 'Edit ' + bookmark.title}
Example #7
def bookmark_create(request):
    bookmark = Bookmark()
    form = BookmarkCreateForm(request.POST)
    if request.method == 'POST' and form.validate():
        form.populate_obj(bookmark)
        user_id = authenticated_userid(request)
        bookmark.tags = ' '.join(
            [t.strip() for t in form.tags.data.strip().split(',')]).lower()
        bookmark.owner_id = user_id
        if not form.title.data:
            soup = BSoup(urlopen(form.url.data))
            bookmark.title = soup.title.string
        DBSession.add(bookmark)
        request.session.flash('Bookmark %s created' % (bookmark.title))
        return HTTPFound(location=request.route_url('index'))
    return {'form': form, 'action': 'new', 'title': 'New'}
Example #8
def new_bookmark():
    form = BookmarkForm()
    if form.validate_on_submit():
        b = Bookmark()
        form.populate_obj(b)
        b.owner_id = g.user.id
        b.created = datetime.utcnow()
        b.tags = ' '.join(
            [t.strip() for t in form.tags.data.strip().split(',')]).lower()
        b.clicks = 0
        if not form.title.data:
            soup = BSoup(urlopen(form.url.data))
            b.title = soup.title.string
        db.session.add(b)
        db.session.commit()
        flash('New bookmark %s added' % (b.title), category='info')
        return redirect(url_for('index'))
    return render_template('new.html', title='New', form=form)
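
Examples #2, #6, #7, and #8 all normalize the comma-separated tags field with the same expression; isolated as a standalone helper (the name `normalize_tags` is hypothetical), it is:

def normalize_tags(raw):
    # 'Python, Web Scraping ,Flask' -> 'python web scraping flask'
    return ' '.join([t.strip() for t in raw.strip().split(',')]).lower()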
Example #9
def get_main_image(url):
    br = get_browser()
    html = br.open(url).read()
    soup = BSoup(html)
    max_img = None
    max_size = 0
    max_url = None
    all_img = soup.findAll('img', src=re.compile(r"\.(jpg|png)$"))
    _logger.debug('fetching %d candidate images' % len(all_img))
    for img in all_img:
        try:
            image_data = br.download_image(img['src']).read()
            image_size = len(image_data)
            if max_size < image_size:
                max_img = image_data
                max_url = img['src']
                max_size = image_size
        except Exception, err:
            _logger.error('error when downloading(%s):%s' % (img['src'], err))
        else:
            _logger.debug("%s:%d" % (img['src'], image_size))
Example #10
def clean_html(html, encoding):
    """
    Given html of type <str>. This function alcomplish following stuff:
    1. Remove non-content tags such as HTML comment, declaration, CData etc
    2. Adjust the encoding so that it's consistent with charset meta tag.
       If there's no such tag, use UTF8 and add <meta ... content="charset='UTF8'" />.
       As for now, we always return UTF8 encoded string and set meta charset to UTF8
    3. Various clean up: remove <meta charset="">, change '·' to ' '  
    """

    # remove IE conditional comments, e.g. <!--[if lte IE 6]> ... <![endif]-->
    ptn = re.compile(r'<!-+\[.+?\]>.+?<!\[endif\]-+>', re.S)
    html = ptn.sub('', html)
    # remove tags like <meta charset="gbk" />
    ptn = re.compile(r'<meta charset=.*>', re.I)
    html = ptn.sub('', html)

    try:
        soup = BSoup(util.convert_to_utf8(html, encoding),
                     fromEncoding='utf-8')
    except Exception, err:
        _logger.error('Failed to create BeautifulSoup:%s' % err)
        return ""
Example #11
def get_main_image_with_hint(url, hint, selenium, hint_encoding='utf-8'):
    _logger.debug('hint=(%s), opening %s' % (hint, url))

    if hint == '':
        _logger.debug('hint is empty, will return nothing')
        return None, ''
    if isinstance(hint, str):
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')

    # prepare selenium
    _logger.debug('opening %s in Selenium' % url)
    selenium.open(url)

    html = selenium.get_html_source()
    html = fix_malformated_tags(html)

    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)

    if hint_tag is None:
        _logger.debug('no matching tag found for hint')
        return None, ''

    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' %
                  (str(tag)[:200], str(tag.attrs)))

    # get left position of matching
    xpath = u'//%s[text()="%s"]' % (tag.name, tag.text)
    matching_tag_left = selenium.get_element_position_left(xpath)
    matching_tag_top = selenium.get_element_position_top(xpath)
    matching_tag_width = selenium.get_element_width(xpath)

    _logger.debug('matching tag position:(left: %d, top: %d)' %
                  (matching_tag_left, matching_tag_top))

    image_data = None
    image_url = ''
    found_image = False

    br = get_browser()

    for img in soup.findAll('img', src=True):
        xpath = u'//img[@src="%s"]' % img['src']
        try:
            left = selenium.get_element_position_left(xpath)
            top = selenium.get_element_position_top(xpath)
        except Exception, err:
            _logger.error('failed to get position for element, xpath=(%s): %s' %
                          (xpath, err))
            continue
        if top < matching_tag_top or left > matching_tag_left + matching_tag_width / 2:
            _logger.debug(
                'ignoring img for bad pos, (top:%d, left:%d, url:%s)' %
                (top, left, img['src']))
            continue

        try:
            image_data = br.download_image(img['src'], base_url=url).read()
            # probe the image dimensions with PIL (Python 2)
            import Image
            from StringIO import StringIO
            pic = Image.open(StringIO(image_data))
            pic_size = pic.size[0] * pic.size[1]
            _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
        except Exception, err:
            _logger.error('failed to download image(%s): %s' %
                          (img['src'], err))
            continue
Example #12
    current_page = 1
    # Kick off searching
    fail_num = 0
    _logger.info('searching [%s] for %d results from %s' %
                 (keywords, needed, url))
    while fail_num < 5:
        try:
            response = br.open(url, timeout=5.0)
            break
        except Exception, err:
            _logger.error('initial fetching failed(%s): %s' % (url, err))
            fail_num += 1
    if fail_num == 5:
        _logger.error('permanently failed')
        return []
    soup = BSoup(response.read())
    results.update(
        set([li.find('a')['href'] for li in soup.findAll('li', 'g')]))

    if callback is not None:
        for item in results:
            callback(item)

    if terminate is not None:
        for index, item in enumerate(results):
            if terminate(item):
                return {'page': current_page, 'url': url, 'rank': index + 1}

    current_page += 1

    html = ''
Example #13
for i in range(1, 9):
    if i != 6:
        path = "/Users/fyelles/Desktop/man-html-20111120/htmlman%s/" % (
            str(i))  # insert the path to the directory of interest
        dirList = os.listdir(path)
        for fname in dirList:
            if fname.endswith(".html"):
                content = False
                print "\nReading", fname
                newstring = '.'.join(fname.split('.')[0:-1]) + '.txt'
                f = open(path + fname, 'r')
                content = f.read()
                f.close()
                soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                c = ''.join(soup.body(text=True))
                f = open(path + newstring, 'w')
                f.write((re.sub('\n{3,}', '\n\n', c)).encode('utf-8'))
                f.close()
                print newstring, " done !"


def main():
    pass


if __name__ == '__main__':
    main()