def get_main_image_with_hint_old(url, hint, hint_encoding='utf-8'):
    max_layer_count = 3
    if hint == '':
        _logger.debug('hint is None, will return nothing')
        return None, ''
    if type(hint) == str:
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')
    br = get_browser()
    _logger.debug('hint=(%s), opening %s'
                  % (hint.encode('utf-8'), url.encode('utf-8')))
    br.open(url)
    html = br.get_html_source()
    html = util.convert_to_utf8(html, hint_encoding)
    html = fix_malformated_tags(html)
    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)
    if hint_tag == None:
        _logger.debug('no hint is found')
        return None, ''
    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' % (str(tag)[:200], str(tag.attrs)))
    image_data = None
    image_url = ''
    found_image = False
    layer_count = 0
    # Walk up from the hint tag, looking for a large enough image within
    # max_layer_count ancestor layers.
    while tag != None and not found_image and layer_count <= max_layer_count:
        _logger.debug('trying tag(%s), %s' % (tag.name, tag.attrs))
        imgs = tag.findAll('img', src=re.compile(r'(\.jpg|\.png|\.jpeg|\.gif)$'))
        for img in imgs:
            try:
                #print 'browser url:' + br.geturl()
                image_data = br.download_image(img['src']).read()
                import Image
                from StringIO import StringIO
                pic = Image.open(StringIO(image_data))
                pic_size = pic.size[0] * pic.size[1]
                _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
            except Exception, err:
                _logger.error('failed to download image(%s): %s' % (img['src'], err))
                continue
            if pic_size >= 100000 and _not_thin_banner(image_data):
                _logger.debug(
                    'selected main image, level: %d, url: (%s), size: (%d)'
                    % (layer_count, img['src'], pic_size))
                image_url = img['src']
                found_image = True
                break
        # <td>/<tr> wrappers do not count as a real layer when walking up
        if not (hasattr(tag, 'name') and (tag.name == 'td' or tag.name == 'tr')):
            layer_count += 1
        tag = tag.parent
    if not found_image:
        return None, ''
    return image_data, image_url
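
# A minimal, self-contained sketch of the pixel-area threshold used in
# get_main_image_with_hint_old above: a candidate image is accepted only when
# width * height >= 100000 pixels. It assumes PIL is installed (the function
# above imports PIL's Image the same way); the generated 400x300 PNG is a
# made-up stand-in for downloaded image bytes.
def _demo_pixel_area_threshold():
    import Image
    from StringIO import StringIO
    buf = StringIO()
    Image.new('RGB', (400, 300)).save(buf, 'PNG')    # fake "downloaded" image
    pic = Image.open(StringIO(buf.getvalue()))
    pic_size = pic.size[0] * pic.size[1]
    print pic_size, pic_size >= 100000               # 120000 True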
def new_mark():
    form = MarkForm()
    if form.validate_on_submit():
        m = Mark()
        form.populate_obj(m)
        m.owner_id = g.user.id
        m.created = datetime.utcnow()
        if form.tags.data:
            m.tags = ' '.join(
                [t.strip() for t in form.tags.data.strip().split(',')]).lower()
        m.clicks = 0
        if not form.title.data:
            soup = BSoup(urlopen(form.url.data))
            m.title = soup.title.string
        db.session.add(m)
        db.session.commit()
        flash('New mark %s added' % (m.title), category='info')
        return redirect(url_for('marks'))
    if request.args.get('url'):
        form.url.data = request.args.get('url')
    if request.args.get('title'):
        form.title.data = request.args.get('title')
    if request.args.get('type') == 'feed':
        form.type.data = 'feed'
    return render_template('mark/new.html', title='New mark', form=form)
def scrape(self, force=False):
    html = self.download(force)
    # Parse HTML
    html_soup = BSoup(html)
    for link in html_soup.body.findAll(href=re.compile(r'\.mp3$')):
        enclosure = link['href']
        if not self.items.filter(enclosure=enclosure).exists():
            item = Item(title=link.text.strip(), enclosure=link['href'])
            item.get_info()
            self.items.add(item)
def get_all_href(url, encoding='utf-8'):
    br = get_browser()
    _logger.debug('opening url(%s) for links' % url)
    br.open(url)
    _logger.debug('loaded (%s)' % url)
    html = br.get_html_source()
    soup = BSoup(util.convert_to_utf8(html, encoding), fromEncoding='utf-8')
    all_href = []
    for a in soup.findAll('a', href=True):
        a['href'] = br.abs_url(a['href'])
        all_href.append(a)
    return all_href
def RunConversion():
    global DBlist, DBdict
    path = "manpages/"
    dirList = os.listdir(path)
    for fname in dirList:
        if fname.endswith(".html"):
            DBdict = dict()
            content = False
            print "\nReading", fname
            newstring = '.'.join(fname.split('.')[0:-1]) + '.txt'
            f = open(path + fname, 'r')
            content = f.read()
            f.close()
            if content:
                try:
                    # Strip man2html conversion footers before parsing
                    content = re.sub(".*[M|n]an.*converted.*", "", content)
                    content = re.sub(".*man2html.*", "", content)
                    soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                    c = ''.join(soup.body(text=True))
                    f = open(path + newstring, 'w')
                    towrite = c.encode('utf-8')
                    cleandata = re.search("(\w+\(.*)", towrite, re.S).group(1)
                    # e.g. "ls1.html" -> "ls(1)"
                    DBdict['name'] = (fname.split('.')[0][:-1] + "("
                                      + fname.split('.')[0][-1:] + ")").strip()
                    DBdict['cleandata'] = cleandata.strip()
                    if re.search("NAME\n(.*)\n", cleandata, re.S):
                        DBdict['header'] = re.search(
                            "NAME\n(.+?)\n", cleandata, re.S).group(1).strip()
                    else:
                        DBdict['header'] = fname.split('.')[0][:-1]
                    DBlist.append(DBdict)
                    f.write(cleandata)
                    f.close()
                    print newstring, " done !"
                except TypeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write(
                        str("\tError " + fname + " - " + str(e) + "\n"))
                except UnicodeEncodeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write(
                        str("\t\tError " + fname + " - " + str(e) + "\n"))
                except AttributeError, e:
                    print "*" * 100, "Error", fname
                    ErrorFile.write(
                        str("\t\t\tError " + fname + " - " + str(e) + "\n"))
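
# A minimal sketch of the BeautifulSoup 3 idiom used in RunConversion above,
# assuming BSoup is BeautifulSoup 3's BeautifulSoup class:
# convertEntities=BSoup.HTML_ENTITIES decodes entities such as &amp;, and
# calling soup.body(text=True) collects every text node under <body>.
# The sample HTML below is made up.
def _demo_body_text_extraction():
    from BeautifulSoup import BeautifulSoup as BSoup
    sample = ('<html><body><h1>NAME</h1>'
              '<p>ls &amp; cat - list or concatenate files</p></body></html>')
    soup = BSoup(sample, convertEntities=BSoup.HTML_ENTITIES)
    print ''.join(soup.body(text=True))   # NAMEls & cat - list or concatenate files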
def bookmark_edit(request):
    user = User.by_id(authenticated_userid(request))
    id = int(request.params.get('id', -1))
    bookmark = user.bookmark(id)
    if not bookmark:
        return HTTPNotFound()
    form = BookmarkUpdateForm(request.POST, bookmark)
    if request.method == 'POST' and form.validate():
        form.populate_obj(bookmark)
        bookmark.tags = ' '.join(
            [t.strip() for t in form.tags.data.strip().split(',')]).lower()
        if not form.title.data:
            soup = BSoup(urlopen(form.url.data))
            bookmark.title = soup.title.string
        return HTTPFound(location=request.route_url('index'))
    return {'form': form, 'action': 'edit', 'title': 'Edit ' + bookmark.title}
def bookmark_create(request):
    bookmark = Bookmark()
    form = BookmarkCreateForm(request.POST)
    if request.method == 'POST' and form.validate():
        form.populate_obj(bookmark)
        user_id = authenticated_userid(request)
        bookmark.tags = ' '.join(
            [t.strip() for t in form.tags.data.strip().split(',')]).lower()
        bookmark.owner_id = user_id
        if not form.title.data:
            soup = BSoup(urlopen(form.url.data))
            bookmark.title = soup.title.string
        DBSession.add(bookmark)
        request.session.flash('Bookmark %s created' % (bookmark.title))
        return HTTPFound(location=request.route_url('index'))
    return {'form': form, 'action': 'new', 'title': 'New'}
def new_bookmark():
    form = BookmarkForm()
    if form.validate_on_submit():
        b = Bookmark()
        form.populate_obj(b)
        b.owner_id = g.user.id
        b.created = datetime.utcnow()
        b.tags = ' '.join(
            [t.strip() for t in form.tags.data.strip().split(',')]).lower()
        b.clicks = 0
        if not form.title.data:
            soup = BSoup(urlopen(form.url.data))
            b.title = soup.title.string
        db.session.add(b)
        db.session.commit()
        flash('New bookmark %s added' % (b.title), category='info')
        return redirect(url_for('index'))
    return render_template('new.html', title='New', form=form)
def get_main_image(url):
    br = get_browser()
    html = br.open(url).read()
    soup = BSoup(html)
    max_img = None
    max_size = 0
    max_url = None
    all_img = soup.findAll('img', src=re.compile(r"(\.jpg|\.png)$"))
    _logger.debug('fetching %d candidate images' % len(all_img))
    # Keep the largest image (by downloaded byte size) seen so far
    for img in all_img:
        try:
            image_data = br.download_image(img['src']).read()
            image_size = len(image_data)
            if max_size < image_size:
                max_img = image_data
                max_url = img['src']
                max_size = image_size
        except Exception, err:
            _logger.error('error when downloading(%s):%s' % (img['src'], err))
        else:
            _logger.debug("%s:%d" % (img['src'], image_size))
    return max_img, max_url
def clean_html(html, encoding):
    """
    Given html of type <str>, this function accomplishes the following:
    1. Remove non-content tags such as HTML comments, declarations, CData, etc.
    2. Adjust the encoding so that it is consistent with the charset meta tag.
       If there is no such tag, use UTF-8 and add <meta ... content="charset='UTF8'" />.
       For now, we always return a UTF-8 encoded string and set the meta charset to UTF-8.
    3. Various clean-ups: remove <meta charset="">, change '·' to ' '.
    """
    # remove junk dealing with IE6 conditional comments
    ptn = re.compile(r'<!-+\[.+?\]>.+?<!\[endif\]-+>', re.S)
    html = ptn.sub('', html)
    # remove junk like <meta charset="gbk" />
    ptn = re.compile(r'<meta charset=.*>', re.I)
    html = ptn.sub('', html)
    try:
        soup = BSoup(util.convert_to_utf8(html, encoding), fromEncoding='utf-8')
    except Exception, err:
        _logger.error('Failed to create BeautifulSoup:%s' % err)
        return ""
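
# A small, self-contained sketch of the first cleanup step in clean_html: the
# pattern strips IE conditional-comment blocks of the form
# <!--[if IE 6]> ... <![endif]-->. The sample markup below is made up.
def _demo_strip_ie_conditional_comments():
    import re
    sample = ('<head><!--[if IE 6]><script src="ie6fix.js"></script><![endif]-->'
              '<meta charset="gbk" /></head>')
    ptn = re.compile(r'<!-+\[.+?\]>.+?<!\[endif\]-+>', re.S)
    print ptn.sub('', sample)    # <head><meta charset="gbk" /></head>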
def get_main_image_with_hint(url, hint, selenium, hint_encoding='utf-8'):
    _logger.debug('hint=(%s), opening %s' % (hint, url))
    if hint == '':
        _logger.debug('hint is None, will return nothing')
        return None, ''
    if type(hint) == str:
        hint = util.convert_to_utf8(hint)
        hint = hint.decode('utf-8')
    # prepare selenium
    _logger.debug('opening %s in Selenium' % url)
    selenium.open(url)
    html = selenium.get_html_source()
    html = fix_malformated_tags(html)
    soup = BSoup(html, fromEncoding='utf-8')
    hint_tag = _find_tag_by_best_match(soup, hint)
    if hint_tag == None:
        _logger.debug('no hint is found')
        return None, ''
    tag = hint_tag.parent
    _logger.debug('found matching tag: %s(%s)' % (str(tag)[:200], str(tag.attrs)))
    # get position and width of the matching tag via Selenium
    xpath = u'//%s[text()="%s"]' % (tag.name, tag.text)
    matching_tag_left = selenium.get_element_position_left(xpath)
    matching_tag_top = selenium.get_element_position_top(xpath)
    matching_tag_width = selenium.get_element_width(xpath)
    _logger.debug('matching tag position:(left: %d, top: %d)'
                  % (matching_tag_left, matching_tag_top))
    image_data = None
    image_url = ''
    found_image = False
    br = get_browser()
    for img in soup.findAll('img', src=True):
        xpath = u'//img[@src="%s"]' % img['src']
        try:
            left = selenium.get_element_position_left(xpath)
            top = selenium.get_element_position_top(xpath)
        except Exception, err:
            _logger.error('failed to get position for element, xpath=(%s): %s'
                          % (xpath, err))
            continue
        if top < matching_tag_top or left > matching_tag_left + matching_tag_width / 2:
            _logger.debug(
                'ignoring img for bad pos, (top:%d, left:%d, url:%s)'
                % (top, left, img['src']))
            continue
        try:
            image_data = br.download_image(img['src'], base_url=url).read()
            import Image
            from StringIO import StringIO
            pic = Image.open(StringIO(image_data))
            pic_size = pic.size[0] * pic.size[1]
            _logger.debug('got image(%d, %s)' % (pic_size, img['src']))
        except Exception, err:
            _logger.error('failed to download image(%s): %s' % (img['src'], err))
            continue
current_page = 1
# Kick off searching
fail_num = 0
_logger.info('searching [%s] for %d results from %s' % (keywords, needed, url))
while fail_num < 5:
    try:
        response = br.open(url, timeout=5.0)
        break
    except Exception, err:
        _logger.error('initial fetching failed(%s): %s' % (url, err))
        fail_num += 1
if fail_num == 5:
    _logger.error('permanently failed')
    return []
soup = BSoup(response.read())
results.update(
    set([li.find('a')['href'] for li in soup.findAll('li', 'g')]))
if callback != None:
    for item in results:
        callback(item)
if terminate != None:
    for index, item in enumerate(results):
        if terminate(item):
            return {'page': current_page, 'url': url, 'rank': index + 1}
current_page += 1
html = ''
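
# A minimal sketch of the result-extraction line above, assuming BSoup is
# BeautifulSoup 3's BeautifulSoup class: passing a plain string as the second
# argument to findAll matches on CSS class, so findAll('li', 'g') selects
# <li class="g"> entries. The sample HTML below is made up.
def _demo_extract_result_links():
    from BeautifulSoup import BeautifulSoup as BSoup
    sample = ('<ul><li class="g"><a href="http://example.com/1">one</a></li>'
              '<li class="g"><a href="http://example.com/2">two</a></li></ul>')
    soup = BSoup(sample)
    print set(li.find('a')['href'] for li in soup.findAll('li', 'g'))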
# #f.write(a.replace('\n{3,}', '\n').encode('utf-8'))
# # f.close()
for i in range(1, 9):
    if i != 6:
        # insert the path to the directory of interest
        path = "/Users/fyelles/Desktop/man-html-20111120/htmlman%s/" % (str(i))
        dirList = os.listdir(path)
        for fname in dirList:
            if fname.endswith(".html"):
                content = False
                print "\nReading", fname
                newstring = '.'.join(fname.split('.')[0:-1]) + '.txt'
                f = open(path + fname, 'r')
                content = f.read()
                f.close()
                soup = BSoup(content, convertEntities=BSoup.HTML_ENTITIES)
                c = ''.join(soup.body(text=True))
                f = open(path + newstring, 'w')
                f.write((re.sub('\n{3,}', '\n\n', c)).encode('utf-8'))
                f.close()
                print newstring, " done !"


def main():
    pass


if __name__ == '__main__':
    main()