Example 1
def fetch_title(bookmark):
    title_bookmark = Bookmark.query.get(bookmark.id)
    r = requests.get(title_bookmark.main_url)
    soup = BeautifulSoup4(r.content)
    title = soup.title.string
    title_bookmark.title = title.encode('utf-8')
    db.session.commit()
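For context, BeautifulSoup4 here is presumably an import alias for bs4's BeautifulSoup class (e.g. from bs4 import BeautifulSoup as BeautifulSoup4). A minimal standalone sketch of the same title-extraction idea, with the function name and timeout as illustrative choices:

import requests
from bs4 import BeautifulSoup

def get_page_title(url):
    # Fetch the page and return the <title> text, or None if absent
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup.title.string if soup.title else None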
Example 2
def fetch_description(bookmark):
    desc_bookmark = Bookmark.query.get(bookmark.id)
    r = requests.get(desc_bookmark.main_url)
    soup = BeautifulSoup4(r.content)
    desc = soup.find(attrs={"name": "description"})
    if desc is not None:
        desc = desc['content']
        desc_bookmark.description = desc[:256].encode('utf-8')
        db.session.commit()
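A hedged standalone equivalent of the description fetch: locate the <meta name="description"> tag and truncate its content, mirroring the 256-character cap above (function name and timeout are illustrative):

import requests
from bs4 import BeautifulSoup

def get_meta_description(url, max_len=256):
    # Return the page's meta description, truncated, or None if missing
    soup = BeautifulSoup(requests.get(url, timeout=10).content, "html.parser")
    tag = soup.find("meta", attrs={"name": "description"})
    return tag["content"][:max_len] if tag and tag.has_attr("content") else None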
Example 3
 def setUp(self):
     query_user = User.query.filter_by(
         email='*****@*****.**').first()
     if query_user:
         query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
         for bmark in query_bookmarks:
             db.session.delete(bmark)
         db.session.commit()
         db.session.delete(query_user)
         db.session.commit()
     create_user = User()
     create_user.first_name = 'Instapaper'
     create_user.last_name = 'Test'
     create_user.email = '*****@*****.**'
     create_user.password = '******'
     create_user.active = True
     create_user.confirmed_at = datetime.datetime.utcnow()
     db.session.add(create_user)
     db.session.commit()
     self.user = create_user
     with open('Instapaper.html') as json_file:
         create_file = open(
             os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'],
                          'test_instapaper.html'), 'w+')
         self.data = html.document_fromstring(json_file.read())
         self.data = html.tostring(self.data)
         self.html_data = BeautifulSoup4(self.data)
         self.bookmarks = {}
         for tag in self.html_data.find_all('h1'):
             parent_elem = tag.find_next_sibling('ol')
             links = parent_elem.find_all('a')
             for link in links:
                 title = link.text
                 url = link['href']
                 tags = [tag.text]
                 tags.append('Imported')
                 #  Thanks Instapaper for not adding timestamps
                 self.bookmarks[url] = {
                     'href': url,
                     'title': title,
                     'tags': tags
                 }
         create_file.write(self.data)
         self.file_path = create_file.name
         create_file.close()
     init_parser = InstapaperParser(self.file_path, self.user.id)
     init_parser.process()
     init_parser.add_to_database()
     self.query = Bookmark.query.filter_by(user=self.user.id).all()
     self.html_parser = HTMLParser()
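The loop above assumes the Instapaper export layout: an <h1> folder heading followed by a sibling <ol> of links. A small sketch of that traversal on inline sample markup:

from bs4 import BeautifulSoup

sample = "<h1>Unread</h1><ol><li><a href='https://example.com'>Example</a></li></ol>"
soup = BeautifulSoup(sample, "html.parser")
for heading in soup.find_all("h1"):
    for link in heading.find_next_sibling("ol").find_all("a"):
        # Folder name, URL, and link title, as collected in the fixture above
        print(heading.text, link["href"], link.text)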
Example 4
def fulltext_extract(bookmark):
    browser = webdriver.PhantomJS(service_args=[
        "--ignore-ssl-errors=true", "--ssl-protocol=tlsv1", "--load-images=no"
    ])
    fulltext_bookmark = Bookmark.query.get(bookmark.id)
    browser.get(fulltext_bookmark.main_url)
    body = browser.find_element_by_tag_name('body')
    bodytext = body.text
    soup = BeautifulSoup4(bodytext)
    full_text = soup.text
    full_text = " ".join(full_text.split())
    full_text = full_text.replace('\n', '')
    full_text = full_text.encode('utf-8')
    fulltext_bookmark.full_text = full_text
    db.session.commit()
    browser.quit()
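PhantomJS support was removed from recent Selenium releases; a hedged sketch of the same body-text extraction using headless Chrome with the Selenium 4 API (URL is illustrative):

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
browser = webdriver.Chrome(options=options)
browser.get("https://example.com")
# Collapse all whitespace in the rendered body text, as the task above does
body_text = browser.find_element(By.TAG_NAME, "body").text
print(" ".join(body_text.split()))
browser.quit()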
Example 5
 def setUp(self):
     query_user = User.query.filter_by(email='*****@*****.**').first()
     if query_user:
         query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
         for bmark in query_bookmarks:
             db.session.delete(bmark)
         db.session.commit()
         db.session.delete(query_user)
         db.session.commit()
     create_user = User()
     create_user.first_name = 'Pocket'
     create_user.last_name = 'Test'
     create_user.email = '*****@*****.**'
     create_user.password = '******'
     create_user.active = True
     create_user.confirmed_at = datetime.datetime.utcnow()
     db.session.add(create_user)
     db.session.commit()
     self.user = create_user
     with open('Pocket.html') as json_file:
         create_file = open(
             os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'],
                          'test_pocket.html'), 'w+')
         self.data = json_file.read()
         self.html_data = BeautifulSoup4(self.data)
         self.bookmarks = {}
         for link in self.html_data.find_all('a'):
             tags = link['tags'].split(',')
             tags.append('Imported')
             dt = datetime.datetime.utcfromtimestamp(
                 float(link['time_added']))
             self.bookmarks[link['href']] = {
                 'href': link['href'],
                 'title': link.text,
                 'tags': tags,
                 'dt': dt
             }
     create_file.write(self.data)
     self.file_path = create_file.name
     create_file.close()
     init_parser = PocketParser(self.file_path, self.user.id)
     init_parser.process()
     init_parser.add_to_database()
     self.query = Bookmark.query.filter_by(user=self.user.id).all()
     self.html_parser = HTMLParser()
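The attributes read above (href, time_added as a Unix epoch, comma-separated tags) match Pocket's HTML export format. A minimal sketch of parsing a single entry:

import datetime
from bs4 import BeautifulSoup

sample = '<a href="https://example.com" time_added="1500000000" tags="python,web">Example</a>'
link = BeautifulSoup(sample, "html.parser").a
print(datetime.datetime.utcfromtimestamp(float(link["time_added"])))  # 2017-07-14 02:40:00
print(link["tags"].split(","))  # ['python', 'web']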
Example 6
 def __init__(self, file_name, user_id):
     with open(file_name, 'r') as self.opened_file:
         self.html = self.opened_file.read()
     self.soup = BeautifulSoup4(self.html)
     self.user = user_id
     self.urls = dict()
     self.check_duplicates = dict()
     self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                         Bookmark.deleted == False).all()
     for bmark in self.check_duplicates_query:
         self.check_duplicates[bmark.main_url] = bmark
     self.tags_dict = dict()
     self.tags_set = set()
     self.html_parser = HTMLParser.HTMLParser()
     self.valid_url = re.compile(
         r'^(?:[a-z0-9\.\-]*)://'
         r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
         r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
         r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
         r'(?::\d+)?'
         r'(?:/?|[/?]\S+)$', re.IGNORECASE)
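A quick exercise of the URL-validation regex defined above (candidate strings are illustrative):

import re

valid_url = re.compile(
    r'^(?:[a-z0-9\.\-]*)://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

for candidate in ("https://example.com/page", "ftp://192.168.0.1/file", "not-a-url"):
    print(candidate, bool(valid_url.match(candidate)))  # True, True, False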
Example 7
def readable_extract(bookmark):
    bookmark_readify = Bookmark.query.get(bookmark.id)
    url = bookmark_readify.main_url
    parsed_url = urlparse.urlparse(url)
    for netloc in ignored_netlocs:
        if netloc in parsed_url.netloc:
            return
    r = requests.get(bookmark_readify.main_url)
    soup = BeautifulSoup4(r.content, "lxml")
    make_links_absolute(soup, bookmark_readify.main_url)
    html_self_closing_tags = [
        'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
    ]
    """ Above list from http://xahlee.info/js/html5_non-closing_tag.html"""
    empty_tags = soup.findAll(lambda tag: tag.name not in
                              html_self_closing_tags and not tag.contents and
                              (tag.string is None or not tag.string.strip()))
    [empty_tag.extract() for empty_tag in empty_tags]
    cleanhtml = soup.encode_contents()
    readable_article = Document(cleanhtml).summary()
    bookmark_readify.readability_html = readable_article
    db.session.commit()
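Document here is presumably readability-lxml's extractor; its summary() returns the cleaned article HTML. A minimal usage sketch (URL is illustrative):

import requests
from readability import Document

html = requests.get("https://example.com/article", timeout=10).text
doc = Document(html)
print(doc.short_title())  # best guess at the article title
print(doc.summary())      # cleaned, reader-friendly article HTML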
Example 8
 def __init__(self, file_name, user_id):
     with open(file_name, 'r') as self.opened_file:
         #  So Instapaper doesn't close <li> tags
         #  This was causing infinite recursion when using BS directly
         #  Hence why the stuff below is being done, so that the <li> tags get closed
         self.html = html.document_fromstring(self.opened_file.read())
         self.html = html.tostring(self.html)
     self.soup = BeautifulSoup4(self.html)
     self.user = user_id
     self.urls = dict()
     self.check_duplicates = dict()
     self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                         Bookmark.deleted == False).all()
     for bmark in self.check_duplicates_query:
         self.check_duplicates[bmark.main_url] = bmark
     self.tags_dict = dict()
     self.tags_set = set()
     self.valid_url = re.compile(
         r'^(?:[a-z0-9\.\-]*)://'
         r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
         r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
         r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
         r'(?::\d+)?'
         r'(?:/?|[/?]\S+)$', re.IGNORECASE)
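The comments above explain the lxml round-trip: Instapaper's export leaves <li> tags unclosed, which broke BeautifulSoup, so the document is normalized through lxml first. A small demonstration of that normalization:

from lxml import html

broken = "<ol><li>First<li>Second</ol>"
fixed = html.tostring(html.document_fromstring(broken))
print(fixed)  # lxml emits the missing </li> closers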
Example 9
 def extract_forms(self, url):
     response = self.session.get(url)
     parsed_html = BeautifulSoup4(response.content, features="lxml")
     return parsed_html.find_all("form")
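A hypothetical call site for extract_forms, listing each form's action and method (URL is illustrative):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
response = session.get("https://example.com/login")
for form in BeautifulSoup(response.content, features="lxml").find_all("form"):
    print(form.get("action"), form.get("method"))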
Example 10
# -*- coding: utf-8 -*-

from urllib2 import urlopen
from bs4 import BeautifulSoup

url = "https://www.rottentomatoes.com/"
html = urlopen(url)
source = html.read()  # read the page source as bytes
html.close()  # close the handle once urlopen is done

soup = BeautifulSoup(
    source, "html5lib"
)  # pass the source to the BeautifulSoup constructor; the resulting document object is conventionally named soup
table = soup.find(id="Top-Box-Office")
movies = table.find_all(class_="middle_col")

for movie in movies:
    title = movie.get_text()
    print(title)
    link = movie.a.get('href')
    url = 'https://www.rottentomatoes.com' + link
    print(url)
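The script above is Python 2 (urllib2). A hedged Python 3 port of the same scrape; the page's markup may have changed since, so the lookups are guarded:

from urllib.request import urlopen
from bs4 import BeautifulSoup

with urlopen("https://www.rottentomatoes.com/") as response:
    source = response.read()  # bytes

soup = BeautifulSoup(source, "html5lib")
table = soup.find(id="Top-Box-Office")
for movie in table.find_all(class_="middle_col") if table else []:
    print(movie.get_text())
    if movie.a and movie.a.get("href"):
        print("https://www.rottentomatoes.com" + movie.a.get("href"))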