def fetch_title(bookmark):
    # Re-fetch the bookmark so we work against a fresh database row.
    title_bookmark = Bookmark.query.get(bookmark.id)
    r = requests.get(title_bookmark.main_url)
    soup = BeautifulSoup4(r.content)
    title = soup.title.string
    title_bookmark.title = title.encode('utf-8')
    db.session.commit()
def fetch_description(bookmark):
    desc_bookmark = Bookmark.query.get(bookmark.id)
    r = requests.get(desc_bookmark.main_url)
    soup = BeautifulSoup4(r.content)
    # Pull the <meta name="description"> tag, if the page has one.
    desc = soup.find(attrs={"name": "description"})
    if desc is not None:
        desc = desc['content']
        desc_bookmark.description = desc[:256].encode('utf-8')
    db.session.commit()
def setUp(self):
    query_user = User.query.filter_by(email='*****@*****.**').first()
    if query_user:
        query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
        for bmark in query_bookmarks:
            db.session.delete(bmark)
            db.session.commit()
        db.session.delete(query_user)
        db.session.commit()
    create_user = User()
    create_user.first_name = 'Instapaper'
    create_user.last_name = 'Test'
    create_user.email = '*****@*****.**'
    create_user.password = '******'
    create_user.active = True
    create_user.confirmed_at = datetime.datetime.utcnow()
    db.session.add(create_user)
    db.session.commit()
    self.user = create_user
    with open('Instapaper.html') as json_file:
        create_file = open(
            os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'],
                         'test_instapaper.html'), 'w+')
        self.data = html.document_fromstring(json_file.read())
        self.data = html.tostring(self.data)
        self.html_data = BeautifulSoup4(self.data)
        self.bookmarks = {}
        for tag in self.html_data.find_all('h1'):
            parent_elem = tag.find_next_sibling('ol')
            links = parent_elem.find_all('a')
            for link in links:
                title = link.text
                url = link['href']
                tags = [tag.text]
                tags.append('Imported')
                # Thanks Instapaper for not adding timestamps
                self.bookmarks[url] = {
                    'href': url,
                    'title': title,
                    'tags': tags
                }
        create_file.write(self.data)
        self.file_path = create_file.name
        create_file.close()
    init_parser = InstapaperParser(self.file_path, self.user.id)
    init_parser.process()
    init_parser.add_to_database()
    self.query = Bookmark.query.filter_by(user=self.user.id).all()
    self.html_parser = HTMLParser()
def fulltext_extract(bookmark):
    # Headless browser so JavaScript-rendered pages still yield body text.
    browser = webdriver.PhantomJS(service_args=[
        "--ignore-ssl-errors=true", "--ssl-protocol=tlsv1", "--load-images=no"
    ])
    fulltext_bookmark = Bookmark.query.get(bookmark.id)
    browser.get(fulltext_bookmark.main_url)
    body = browser.find_element_by_tag_name('body')
    bodytext = body.text
    soup = BeautifulSoup4(bodytext)
    full_text = soup.text
    # Collapse whitespace runs and newlines into single spaces.
    full_text = " ".join(full_text.split())
    full_text = full_text.replace('\n', '')
    full_text = full_text.encode('utf-8')
    fulltext_bookmark.full_text = full_text
    db.session.commit()
    browser.quit()
def setUp(self):
    query_user = User.query.filter_by(email='*****@*****.**').first()
    if query_user:
        query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
        for bmark in query_bookmarks:
            db.session.delete(bmark)
            db.session.commit()
        db.session.delete(query_user)
        db.session.commit()
    create_user = User()
    create_user.first_name = 'Pocket'
    create_user.last_name = 'Test'
    create_user.email = '*****@*****.**'
    create_user.password = '******'
    create_user.active = True
    create_user.confirmed_at = datetime.datetime.utcnow()
    db.session.add(create_user)
    db.session.commit()
    self.user = create_user
    with open('Pocket.html') as json_file:
        create_file = open(
            os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'],
                         'test_pocket.html'), 'w+')
        self.data = json_file.read()
        self.html_data = BeautifulSoup4(self.data)
        self.bookmarks = {}
        for link in self.html_data.find_all('a'):
            tags = link['tags'].split(',')
            tags.append('Imported')
            dt = datetime.datetime.utcfromtimestamp(
                float(link['time_added']))
            self.bookmarks[link['href']] = {
                'href': link['href'],
                'title': link.text,
                'tags': tags,
                'dt': dt
            }
        create_file.write(self.data)
        self.file_path = create_file.name
        create_file.close()
    init_parser = PocketParser(self.file_path, self.user.id)
    init_parser.process()
    init_parser.add_to_database()
    self.query = Bookmark.query.filter_by(user=self.user.id).all()
    self.html_parser = HTMLParser()
def __init__(self, file_name, user_id):
    with open(file_name, 'r') as self.opened_file:
        self.html = self.opened_file.read()
    self.soup = BeautifulSoup4(self.html)
    self.user = user_id
    self.urls = dict()
    # Existing, non-deleted bookmarks for this user, keyed by URL,
    # so the import can skip duplicates.
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(
        Bookmark.user == self.user,
        Bookmark.deleted == False).all()
    for bmark in self.check_duplicates_query:
        self.check_duplicates[bmark.main_url] = bmark
    self.tags_dict = dict()
    self.tags_set = set()
    self.html_parser = HTMLParser.HTMLParser()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
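# The valid_url pattern above only checks that a string *looks like* an
# absolute URL (scheme, hostname or IP, optional port, optional path); it
# does not check reachability. A minimal standalone sketch of the same
# pattern, with hypothetical sample inputs for illustration:
import re

valid_url = re.compile(
    r'^(?:[a-z0-9\.\-]*)://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

print(bool(valid_url.match('https://example.com/article?id=1')))  # True
print(bool(valid_url.match('http://192.168.0.1:8080/')))          # True
print(bool(valid_url.match('example.com/article')))               # False: no scheme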
def readable_extract(bookmark):
    bookmark_readify = Bookmark.query.get(bookmark.id)
    url = bookmark_readify.main_url
    parsed_url = urlparse.urlparse(url)
    # Skip hosts that are known not to work with readability extraction.
    for netloc in ignored_netlocs:
        if netloc in parsed_url.netloc:
            return
    r = requests.get(bookmark_readify.main_url)
    soup = BeautifulSoup4(r.content, "lxml")
    make_links_absolute(soup, bookmark_readify.main_url)
    # List from http://xahlee.info/js/html5_non-closing_tag.html
    html_self_closing_tags = [
        'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img',
        'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
    ]
    # Strip empty elements (other than legitimately self-closing tags) so
    # they don't skew readability's content scoring.
    empty_tags = soup.findAll(
        lambda tag: tag.name not in html_self_closing_tags
        and not tag.contents
        and (tag.string is None or not tag.string.strip()))
    [empty_tag.extract() for empty_tag in empty_tags]
    cleanhtml = soup.encode_contents()
    readable_article = Document(cleanhtml).summary()
    bookmark_readify.readability_html = readable_article
    db.session.commit()
def __init__(self, file_name, user_id):
    with open(file_name, 'r') as self.opened_file:
        # Instapaper doesn't close its <li> tags, which caused infinite
        # recursion when feeding the export to BeautifulSoup directly.
        # Round-tripping through lxml first closes the <li> tags.
        self.html = html.document_fromstring(self.opened_file.read())
        self.html = html.tostring(self.html)
    self.soup = BeautifulSoup4(self.html)
    self.user = user_id
    self.urls = dict()
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(
        Bookmark.user == self.user,
        Bookmark.deleted == False).all()
    for bmark in self.check_duplicates_query:
        self.check_duplicates[bmark.main_url] = bmark
    self.tags_dict = dict()
    self.tags_set = set()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
def extract_forms(self, url):
    # Fetch the page through the shared session and return all <form> tags.
    response = self.session.get(url)
    parsed_html = BeautifulSoup4(response.content, features="lxml")
    return parsed_html.findAll("form")
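# A minimal usage sketch, assuming extract_forms lives on a small class that
# owns a requests.Session. The FormScanner class name and the target URL are
# hypothetical illustrations, not part of the code above.
import requests
from bs4 import BeautifulSoup as BeautifulSoup4


class FormScanner(object):
    def __init__(self):
        self.session = requests.Session()

    def extract_forms(self, url):
        response = self.session.get(url)
        parsed_html = BeautifulSoup4(response.content, features="lxml")
        return parsed_html.findAll("form")


scanner = FormScanner()
for form in scanner.extract_forms("https://example.com/login"):
    # Print each form's action and method to see where it submits.
    print(form.get("action"), form.get("method"))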
# -*- coding: utf-8 -*-
from urllib2 import urlopen
from bs4 import BeautifulSoup

url = "https://www.rottentomatoes.com/"
html = urlopen(url)
source = html.read()  # Read the source as bytes.
html.close()  # Close the connection once urlopen is done.

# Pass the document to the BeautifulSoup constructor to build a parse tree;
# by convention the resulting object is called "soup".
soup = BeautifulSoup(source, "html5lib")

table = soup.find(id="Top-Box-Office")
movies = table.find_all(class_="middle_col")
for movie in movies:
    title = movie.get_text()
    print(title)
    link = movie.a.get('href')
    url = 'https://www.rottentomatoes.com' + link
    print(url)