コード例 #1
0
 def find_by_class(self, html_data, claas_name):
     soup = BeautifulSoup(html_data, 'html.parser')
     found_needed_data = soup.find("div", class_=claas_name)
     if found_needed_data is not None:
         return str(found_needed_data)
     else:
         return ""
コード例 #2
0
 def __init__(self, namespaceHTMLElements, soup=None):
     if soup:
         self.soup = soup
     else:
         from lib.bs4 import BeautifulSoup
         self.soup = BeautifulSoup("", "html.parser")
     super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
コード例 #3
0
ファイル: views.py プロジェクト: qitang1811/zerothave
    def parseAndStoreDoc(self):
        for e in self.doc.entries:
            title = jinja2.Markup(e.title).unescape()
            link = e.links[0]["href"]
            soup = BeautifulSoup(e.description)
            try:
                description = jinja2.Markup(soup.findAll("p")[-1]).unescape()
            except IndexError as e:
                description = jinja2.Markup(soup).unescape()

            # try:
            #     imgLink = soup.findAll("img")[0]["src"]
            # except IndexError as e:
            imgLink = GQ.gqLogo

            srcName = self.srcName
            labelName = self.srcName.split("_")[1]
            article = Article(title=title,
                              description=description,
                              imgLink=imgLink,
                              link=link,
                              srcName=srcName,
                              labelName=labelName)
            article.put()
        logging.info("Storing data from %s" % srcName)
コード例 #4
0
ファイル: views.py プロジェクト: qitang1811/zerothave
    def parseAndStoreDoc(self):
        for e in self.doc.entries:
            title = jinja2.Markup(e.title).unescape()
            soup = BeautifulSoup(e.description)
            description = jinja2.Markup(soup.getText()).unescape()
            #truncate the length of
            if len(description) > 500:
                description = description[:500]

            link = e.links[0]["href"]
            srcName = self.srcName
            labelName = self.srcName.split("_")[1]
            # if no image in entry, use the source logo instead
            try:
                imgLink = e.media_content[0]["url"]
            except AttributeError as e:
                imgLink = NewYorkTimes.logo
            article = Article(title=title,
                              description=description,
                              imgLink=imgLink,
                              link=link,
                              srcName=srcName,
                              labelName=labelName)
            article.put()
        logging.info("Storing data to database!(from %s)" % srcName)
コード例 #5
0
ファイル: _html5lib.py プロジェクト: yahyalmh/ChangeWallpaper
 def fragmentClass(self):
     from lib.bs4 import BeautifulSoup
     # TODO: Why is the parser 'html.parser' here? To avoid an
     # infinite loop?
     self.soup = BeautifulSoup("", "html.parser")
     self.soup.name = "[document_fragment]"
     return Element(self.soup, self.soup, None)
コード例 #6
0
    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
コード例 #7
0
 def try_frame(self, html_data):
     soup = BeautifulSoup(html_data, 'html.parser')
     frames = soup.find_all('frame')
     try:
         new_url = frames[-1].get('src')
         print "new url " + new_url
         return new_url
     except:
         return ""
コード例 #8
0
ファイル: xml_bs.py プロジェクト: jacobo3d/TACTIC-Handler
 def get_pipeline(self, in_pipeline=None):
     # print(in_pipeline)
     # print(self.info['code'])
     pipeline = BeautifulSoup(in_pipeline['pipeline'], 'html.parser')
     all_pipeline = []
     for pipe in pipeline.find_all('process'):
         all_pipeline.append(pipe['name'])
     # items[in_pipeline['search_type']] = all_pipeline
     self.pipeline['pipeline'] = all_pipeline
     print(self.pipeline['pipeline'])
コード例 #9
0
    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
  <script type="text/javascript">
  </script>
"""
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
コード例 #10
0
ファイル: views.py プロジェクト: qitang1811/zerothave
 def parseAndStoreDoc(self):
     for e in self.doc.entries:
         title = jinja2.Markup(e.title).unescape()
         link = e.link
         imgLink = e.imgurl
         soup = BeautifulSoup(e.description)
         description = jinja2.Markup(soup.getText()).unescape()
         srcName = self.srcName
         labelName = self.srcName.split("_")[1]
         article = Article(title=title, description=description, imgLink=imgLink, link=link, srcName=srcName, labelName=labelName)
         article.put()
     logging.info("Storing data from %s" %srcName)
コード例 #11
0
 def find_by_id(self, html_data, given_id):
     """
     This function get html text and return the content of specific class id that is given as parameter
     :param html_data: html string
     :param given_id: string
     :return: string of the content by id or empty string if id not found in the html
     """
     soup = BeautifulSoup(html_data, 'html.parser')
     found_needed_data = soup.find(id=given_id)
     print(found_needed_data)
     if found_needed_data is not None:
         return str(found_needed_data)
     else:
         return ""
コード例 #12
0
    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet
コード例 #13
0
ファイル: _html5lib.py プロジェクト: DobyTang/LazyLibrarian
 def __init__(self, namespaceHTMLElements, soup=None):
     if soup:
         self.soup = soup
     else:
         from lib.bs4 import BeautifulSoup
         self.soup = BeautifulSoup("", "html.parser")
     super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
コード例 #14
0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
    data = rdoc(num_elements)
    print(("Generated a large invalid HTML document (%d bytes)." % len(data)))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        if success:
            print(("BS4+%s parsed the markup in %.2fs." % (parser, b - a)))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print(("Raw lxml parsed the markup in %.2fs." % (b - a)))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print(("Raw html5lib parsed the markup in %.2fs." % (b - a)))
コード例 #15
0
ファイル: views.py プロジェクト: qitang1811/zerothave
 def parseAndStoreDoc(self):
     for e in self.doc.entries:
         title = jinja2.Markup(e.title).unescape()
         link = e.links[0]["href"]
         soup = BeautifulSoup(e.description)
         description = jinja2.Markup(soup.getText()).unescape()
         #retrieve the img link if not exit use logo instead
         try:
             imgLink = e.media_thumbnail[0]["url"]
         except AttributeError as e:
             imgLink = BBC.bbcLogo
         srcName = self.srcName
         labelName = self.srcName.split("_")[1]
         article = Article(title=title, description=description, imgLink=imgLink, link=link, srcName=srcName, labelName=labelName)
         article.put()
     logging.info("Storing data from %s" %srcName)
コード例 #16
0
    def parse_page(self, page_content):

        try:
            parsed_html = BeautifulSoup(page_content, "html.parser")
            # html = list(parsed_html.children)[2]
            # body = list(html.children)[3]

            images_tag = parsed_html.find_all('img')
            for tag in images_tag:
                url = tag.get('src')
                if url is not None:
                    self.image_url = self.page_url.split("astropix")[0] + url
                    break
        except Exception as e:
            # print(e)
            pass
コード例 #17
0
 def test_custom_builder_class(self):
     # Verify that you can pass in a custom Builder class and
     # it'll be instantiated with the appropriate keyword arguments.
     class Mock(object):
         def __init__(self, **kwargs):
             self.called_with = kwargs
             self.is_xml = True
             self.store_line_numbers = False
             self.cdata_list_attributes = []
             self.preserve_whitespace_tags = []
             self.string_containers = {}
         def initialize_soup(self, soup):
             pass
         def feed(self, markup):
             self.fed = markup
         def reset(self):
             pass
         def ignore(self, ignore):
             pass
         set_up_substitutions = can_be_empty_element = ignore
         def prepare_markup(self, *args, **kwargs):
             yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"
             
     kwargs = dict(
         var="value",
         # This is a deprecated BS3-era keyword argument, which
         # will be stripped out.
         convertEntities=True,
     )
     with warnings.catch_warnings(record=True):
         soup = BeautifulSoup('', builder=Mock, **kwargs)
     assert isinstance(soup.builder, Mock)
     self.assertEqual(dict(var="value"), soup.builder.called_with)
     self.assertEqual("prepared markup", soup.builder.fed)
     
     # You can also instantiate the TreeBuilder yourself. In this
     # case, that specific object is used and any keyword arguments
     # to the BeautifulSoup constructor are ignored.
     builder = Mock(**kwargs)
     with warnings.catch_warnings(record=True) as w:
         soup = BeautifulSoup(
             '', builder=builder, ignored_value=True,
         )
     msg = str(w[0].message)
     assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
     self.assertEqual(builder, soup.builder)
     self.assertEqual(kwargs, builder.called_with)
コード例 #18
0
def context_query(process):
    """
    Query for Context elements
    Creating one list of lists, to reduce count of queries to the server
    :param process - list of tab names (vfx/asset)
    """

    search_type = 'sthpw/pipeline'

    filters = [('search_type', process), ('project_code', env.Env().get_project())]
    assets = server_query(search_type, filters)

    from lib.bs4 import BeautifulSoup

    if assets:
        # TODO may be worth it to simplify this
        # contexts = collections.OrderedDict()
        #
        # for proc in process:
        #     contexts[proc] = []
        #
        # items = contexts.copy()
        # for context in contexts:
        #     for asset in assets:
        #         if context == asset['search_type']:
        #             contexts[context] = Et.fromstring(asset['pipeline'].encode('utf-8'))
        #
        # for key, val in contexts.iteritems():
        #     if len(val):
        #         for element in val.iter('process'):
        #             items[key].append(element.attrib['name'])

        items = collections.OrderedDict()

        for proc in process:
            items[proc] = []

        for asset in assets:
            if asset['search_type'] in process:
                pipeline = BeautifulSoup(asset['pipeline'], 'html.parser')
                all_pipeline = []
                for pipe in pipeline.find_all('process'):
                    all_pipeline.append(pipe['name'])
                items[asset['search_type']] = all_pipeline

        return items
コード例 #19
0
    def test_beautifulsoup_constructor_does_lookup(self):

        with warnings.catch_warnings(record=True) as w:
            # This will create a warning about not explicitly
            # specifying a parser, but we'll ignore it.

            # You can pass in a string.
            BeautifulSoup("", features="html")
            # Or a list of strings.
            BeautifulSoup("", features=["html", "fast"])

        # You'll get an exception if BS can't find an appropriate
        # builder.
        self.assertRaises(ValueError,
                          BeautifulSoup,
                          "",
                          features="no-such-feature")
コード例 #20
0
ファイル: views.py プロジェクト: qitang1811/zerothave
 def parseAndStoreDoc(self):
     for e in self.doc.entries:
         title = jinja2.Markup(e.title).unescape()
         link = e.link
         imgLink = e.imgurl
         soup = BeautifulSoup(e.description)
         description = jinja2.Markup(soup.getText()).unescape()
         srcName = self.srcName
         labelName = self.srcName.split("_")[1]
         article = Article(title=title,
                           description=description,
                           imgLink=imgLink,
                           link=link,
                           srcName=srcName,
                           labelName=labelName)
         article.put()
     logging.info("Storing data from %s" % srcName)
コード例 #21
0
ファイル: views.py プロジェクト: qitang1811/zerothave
    def parseAndStoreDoc(self):
        for e in self.doc.entries:
            title = jinja2.Markup(e.title).unescape()
            link = e.links[0]["href"]
            soup = BeautifulSoup(e.description)
            try:
                description = jinja2.Markup(soup.findAll("p")[-1]).unescape()
            except IndexError as e:
                description = jinja2.Markup(soup).unescape()

            # try:
            #     imgLink = soup.findAll("img")[0]["src"]
            # except IndexError as e:
            imgLink = GQ.gqLogo

            srcName = self.srcName
            labelName = self.srcName.split("_")[1]
            article = Article(title=title, description=description, imgLink=imgLink, link=link, srcName=srcName, labelName=labelName)
            article.put()
        logging.info("Storing data from %s" %srcName)
コード例 #22
0
ファイル: views.py プロジェクト: qitang1811/zerothave
    def parseAndStoreDoc(self):
        for e in self.doc.entries:
            title = jinja2.Markup(e.title).unescape()
            soup = BeautifulSoup(e.description)
            description = jinja2.Markup(soup.getText()).unescape()
            #truncate the length of
            if len(description) > 500:
                description = description[:500]

            link = e.links[0]["href"]
            srcName = self.srcName
            labelName = self.srcName.split("_")[1]
            # if no image in entry, use the source logo instead
            try:
                imgLink = e.media_content[0]["url"]
            except AttributeError as e:
                imgLink = NewYorkTimes.logo
            article = Article(title=title, description=description, imgLink=imgLink, link=link, srcName=srcName, labelName=labelName)
            article.put()
        logging.info("Storing data to database!(from %s)" %srcName)
コード例 #23
0
ファイル: views.py プロジェクト: qitang1811/zerothave
 def parseAndStoreDoc(self):
     for e in self.doc.entries:
         title = jinja2.Markup(e.title).unescape()
         link = e.links[0]["href"]
         soup = BeautifulSoup(e.description)
         description = jinja2.Markup(soup.getText()).unescape()
         #retrieve the img link if not exit use logo instead
         try:
             imgLink = e.media_thumbnail[0]["url"]
         except AttributeError as e:
             imgLink = BBC.bbcLogo
         srcName = self.srcName
         labelName = self.srcName.split("_")[1]
         article = Article(title=title,
                           description=description,
                           imgLink=imgLink,
                           link=link,
                           srcName=srcName,
                           labelName=labelName)
         article.put()
     logging.info("Storing data from %s" % srcName)
コード例 #24
0
ファイル: _html5lib.py プロジェクト: yahyalmh/ChangeWallpaper
    def __init__(self,
                 namespaceHTMLElements,
                 soup=None,
                 store_line_numbers=True,
                 **kwargs):
        if soup:
            self.soup = soup
        else:
            from lib.bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            self.soup = BeautifulSoup("",
                                      "html.parser",
                                      store_line_numbers=store_line_numbers,
                                      **kwargs)
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers
コード例 #25
0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
    data = rdoc(num_elements)
    print "Generated a large invalid HTML document (%d bytes)." % len(data)

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            print "BS4+%s parsed the markup in %.2fs." % (parser, b - a)
コード例 #26
0
ファイル: Bing.py プロジェクト: yahyalmh/ChangeWallpaper
    def parse_page(self, page_content):
        try:
            parsed_html = BeautifulSoup(page_content, "html.parser")
            html = list(parsed_html.children)[1]
            body = list(html.children)[1]

            # for link in body.find_all('div'):

            wall_tag = body.find('div', class_='img_cont')
            style = wall_tag.get('style')
            if style is not None and "background-image" in style:
                image_address = style.split("/")[1].split("jpg")[0]
                self.image_url = self.page_url + image_address + "jpg"

            # image_name_id = "iotd_title"
            self.extract_image_name = body.find('a', class_='title').getText()

        except Exception as e:
            pass
コード例 #27
0
ファイル: views.py プロジェクト: qitang1811/zerothave
 def parseAndStoreDoc(self):
     for e in self.doc.entries:
         title = jinja2.Markup(e.title).unescape()
         desc = BeautifulSoup(e.description).findAll("p")[0].getText()
         description = jinja2.Markup(desc).unescape()
         #truncate the length
         if len(description) > 500:
             description = description[:500]
         link = e.links[0]["href"]
         srcName = self.srcName
         labelName = self.srcName.split("_")[1]
         #img from feed
         imgLink = WashingtonPost.logoSrc
         article = Article(title=title,
                           description=description,
                           imgLink=imgLink,
                           link=link,
                           srcName=srcName,
                           labelName=labelName)
         article.put()
     logging.info("storing data from %s" % srcName)
コード例 #28
0
def TPB(book=None, test=False):
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/s/?")

    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0

    sterm = makeUnicode(book['searchterm'])

    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True

    while next_page:

        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }

        searchURL = providerurl + "?%s" % urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)

        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' % (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            soup = BeautifulSoup(result, 'html5lib')
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.find_all('tr')
            else:
                rows = []

            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.find_all('td')
                if len(td) > 2:
                    try:
                        new_soup = BeautifulSoup(str(td[1]), 'html5lib')
                        link = new_soup.find("a")
                        magnet = link.get("href")
                        title = link.text
                        size = td[1].text.split(', Size ')[1].split('iB')[0]
                        size = size.replace('&nbsp;', '')
                        size = size_in_bytes(size)
                        try:
                            seeders = int(td[2].text.replace(',', ''))
                        except ValueError:
                            seeders = 0

                        if minimumseeders < seeders:
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' % (magurl, result))
                            else:
                                magnet = None
                                new_soup = BeautifulSoup(result, 'html5lib')
                                for link in new_soup.find_all('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider,
                                    'tor_title': title,
                                    'tor_url': magnet,
                                    'tor_size': str(size),
                                    'tor_type': 'magnet',
                                    'priority': lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' % (title, size))
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                        logger.debug('%s: %s' % (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn('Maximum results page search reached, still more results available')
            next_page = False

    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
コード例 #29
0
def WWT(book=None, test=False):
    errmsg = ''
    provider = "WorldWideTorrents"
    host = lazylibrarian.CONFIG['WWT_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/torrents-search.php")

    sterm = makeUnicode(book['searchterm'])

    cat = 0  # 0=all, 36=ebooks, 52=mags, 56=audiobooks
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 56
        elif book['library'] == 'eBook':
            cat = 36
        elif book['library'] == 'magazine':
            cat = 52

    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True

    while next_page:
        params = {
            "search": book['searchterm'],
            "page": page,
            "cat": cat
        }
        searchURL = providerurl + "/?%s" % urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # might return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            elif '503' in result:
                logger.warn("Cloudflare bot detection? %s: %s" % (provider, result))
                logger.warn("Try unblocking %s from a browser" % providerurl)
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' % (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            soup = BeautifulSoup(result, 'html5lib')
            rows = []
            try:
                tables = soup.find_all('table')  # un-named table
                table = tables[2]
                if table:
                    rows = table.find_all('tr')
            except IndexError:  # no results table in result page
                rows = []

            if len(rows) > 1:
                rows = rows[1:]  # first row is headers

            for row in rows:
                td = row.find_all('td')
                if len(td) > 3:
                    try:
                        title = unaccented(td[0].text)
                        # can return magnet or torrent or both.
                        magnet = ''
                        url = ''
                        mode = 'torrent'
                        try:
                            magnet = 'magnet' + str(td[0]).split('href="magnet')[1].split('"')[0]
                            mode = 'magnet'
                        except IndexError:
                            pass
                        try:
                            url = url_fix(host + '/download.php') + \
                                          str(td[0]).split('href="download.php')[1].split('.torrent"')[0] + '.torrent'
                            mode = 'torrent'
                        except IndexError:
                            pass

                        if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']):
                            url = magnet
                            mode = 'magnet'

                        try:
                            size = str(td[1].text).replace('&nbsp;', '').upper()
                            size = size_in_bytes(size)
                        except ValueError:
                            size = 0
                        try:
                            seeders = int(td[2].text.replace(',', ''))
                        except ValueError:
                            seeders = 0

                        if not url or not title:
                            logger.debug('Missing url or title')
                        elif minimumseeders < seeders:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': mode,
                                'priority': lazylibrarian.CONFIG['WWT_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                            next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                        logger.debug('%s: %s' % (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn('Maximum results page search reached, still more results available')
            next_page = False

    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
コード例 #30
0
def KAT(book=None, test=False):
    errmsg = ''
    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/usearch/" + quote(book['searchterm']))

    params = {
        "category": "books",
        "field": "seeders",
        "sorder": "desc"
    }
    searchURL = providerurl + "/?%s" % urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, result))
            errmsg = result
        result = False

    if test:
        return success

    results = []

    if result:
        logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result, 'html5lib')
        rows = []
        try:
            table = soup.find_all('table')[1]  # un-named table
            if table:
                rows = table.find_all('tr')
        except IndexError:  # no results table in result page
            rows = []

        if len(rows) > 1:
            rows = rows[1:]  # first row is headers

        for row in rows:
            td = row.find_all('td')
            if len(td) > 3:
                try:
                    title = unaccented(td[0].text)
                    # kat can return magnet or torrent or both.
                    magnet = ''
                    url = ''
                    mode = 'torrent'
                    try:
                        magnet = 'magnet' + str(td[0]).split('href="magnet')[1].split('"')[0]
                        mode = 'magnet'
                    except IndexError:
                        pass
                    try:
                        url = 'http' + str(td[0]).split('href="http')[1].split('.torrent?')[0] + '.torrent'
                        mode = 'torrent'
                    except IndexError:
                        pass

                    if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']):
                        url = magnet
                        mode = 'magnet'

                    try:
                        size = str(td[1].text).replace('&nbsp;', '').upper()
                        size = size_in_bytes(size)
                    except ValueError:
                        size = 0
                    try:
                        seeders = int(td[3].text.replace(',', ''))
                    except ValueError:
                        seeders = 0

                    if not url or not title:
                        logger.debug('Missing url or title')
                    elif minimumseeders < seeders:
                        results.append({
                            'bookid': book['bookid'],
                            'tor_prov': provider,
                            'tor_title': title,
                            'tor_url': url,
                            'tor_size': str(size),
                            'tor_type': mode,
                            'priority': lazylibrarian.CONFIG['KAT_DLPRIORITY']
                        })
                        logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
コード例 #31
0
 def fragmentClass(self):
     from lib.bs4 import BeautifulSoup
     self.soup = BeautifulSoup("", "html.parser")
     self.soup.name = "[document_fragment]"
     return Element(self.soup, self.soup, None)
コード例 #32
0
 def test_no_warning_if_explicit_parser_specified(self):
     with warnings.catch_warnings(record=True) as w:
         soup = BeautifulSoup("<a><b></b></a>", "html.parser")
     self.assertEqual([], w)
コード例 #33
0
 def soup(self, markup, **kwargs):
     """Build a Beautiful Soup object from markup."""
     builder = kwargs.pop('builder', self.default_builder)
     return BeautifulSoup(markup, builder=builder, **kwargs)
コード例 #34
0
ファイル: _html5lib.py プロジェクト: yahyalmh/ChangeWallpaper
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    def __init__(self,
                 namespaceHTMLElements,
                 soup=None,
                 store_line_numbers=True,
                 **kwargs):
        if soup:
            self.soup = soup
        else:
            from lib.bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            self.soup = BeautifulSoup("",
                                      "html.parser",
                                      store_line_numbers=store_line_numbers,
                                      **kwargs)
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        kwargs = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            kwargs['sourceline'] = sourceline
            kwargs['sourcepos'] = sourcepos - 1
        tag = self.soup.new_tag(name, namespace, **kwargs)

        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        from lib.bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        from lib.bs4 import BeautifulSoup
        rv = []
        doctype_re = re.compile(
            r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            if isinstance(element, BeautifulSoup):
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent, ))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in list(element.attrs.items()):
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace],
                                              name.name)
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' *
                                                  (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)

        serializeElement(element, 0)

        return "\n".join(rv)
コード例 #35
0
def GEN(book=None, prov=None, test=False):
    errmsg = ''
    provider = "libgen.io"
    if prov is None:
        prov = 'GEN'
    host = lazylibrarian.CONFIG[prov + '_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    search = lazylibrarian.CONFIG[prov + '_SEARCH']
    if not search or not search.endswith('.php'):
        search = 'search.php'
    if 'index.php' not in search and 'search.php' not in search:
        search = 'search.php'
    if search[0] == '/':
        search = search[1:]

    sterm = makeUnicode(book['searchterm'])

    page = 1
    results = []
    next_page = True

    while next_page:
        if 'index.php' in search:
            params = {
                "s": book['searchterm'],
                "f_lang": "All",
                "f_columns": 0,
                "f_ext": "All"
            }
        else:
            params = {
                "view": "simple",
                "open": 0,
                "phrase": 0,
                "column": "def",
                "res": 100,
                "req": book['searchterm']
            }

        if page > 1:
            params['page'] = page

        providerurl = url_fix(host + "/%s" % search)
        searchURL = providerurl + "?%s" % urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" %
                             (provider, sterm))
                success = True
            elif '111' in result:
                # looks like libgen has ip based access limits
                logger.error(
                    'Access forbidden. Please wait a while before trying %s again.'
                    % provider)
                errmsg = result
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching page data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            try:
                soup = BeautifulSoup(result, 'html5lib')
                try:
                    table = soup.find_all('table')[2]  # un-named table
                    if table:
                        rows = table.find_all('tr')
                except IndexError:  # no results table in result page
                    rows = []

                if 'search.php' in search and len(rows) > 1:
                    rows = rows[1:]

                for row in rows:
                    author = ''
                    title = ''
                    size = ''
                    extn = ''
                    link = ''
                    td = row.find_all('td')
                    if 'index.php' in search and len(td) > 3:
                        try:
                            author = formatAuthorName(td[0].text)
                            title = td[2].text
                            newsoup = BeautifulSoup(str(td[4]), 'html5lib')
                            data = newsoup.find('a')
                            link = data.get('href')
                            extn = data.text.split('(')[0]
                            size = data.text.split('(')[1].split(')')[0]
                            size = size.upper()
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen index.php results: %s' %
                                str(e))

                    elif 'search.php' in search and len(td) > 8:
                        try:
                            author = formatAuthorName(td[1].text)
                            title = td[2].text
                            size = td[7].text.upper()
                            extn = td[8].text
                            newsoup = BeautifulSoup(str(td[2]), 'html5lib')
                            link = newsoup.get('href')
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen search.php results; %s' %
                                str(e))

                    if not size:
                        size = 0
                    else:
                        try:
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0

                    if link and title:
                        if author:
                            title = author.strip() + ' ' + title.strip()
                        if extn:
                            title = title + '.' + extn

                        if not link.startswith('http'):
                            if "/ads.php?" in link:
                                url = url_fix(host + link)
                            else:
                                url = url_fix(host + "/ads.php?" + link)
                        else:
                            url = redirect_url(host, link)

                        bookresult, success = fetchURL(url)
                        if not success:
                            # may return 404 if no results, not really an error
                            if '404' in bookresult:
                                logger.debug(
                                    "No results found from %s for %s" %
                                    (provider, sterm))
                            else:
                                logger.debug(url)
                                logger.debug(
                                    'Error fetching link data from %s: %s' %
                                    (provider, bookresult))
                                errmsg = bookresult
                            bookresult = False

                        if bookresult:
                            url = None
                            try:
                                new_soup = BeautifulSoup(
                                    bookresult, 'html5lib')
                                for link in new_soup.find_all('a'):
                                    output = link.get('href')
                                    if output:
                                        if output.startswith(
                                                'http'
                                        ) and '/get.php' in output:
                                            url = output
                                            break
                                        elif '/get.php' in output:
                                            url = '/get.php' + output.split(
                                                '/get.php')[1]
                                            break
                                        elif '/download/book' in output:
                                            url = '/download/book' + output.split(
                                                '/download/book')[1]
                                            break

                                if url and not url.startswith('http'):
                                    url = url_fix(host + url)
                                else:
                                    url = redirect_url(host, url)
                            except Exception as e:
                                logger.error(
                                    '%s parsing bookresult for %s: %s' %
                                    (type(e).__name__, link, str(e)))
                                url = None

                        if url:
                            results.append({
                                'bookid':
                                book['bookid'],
                                'tor_prov':
                                provider + '/' + search,
                                'tor_title':
                                title,
                                'tor_url':
                                url,
                                'tor_size':
                                str(size),
                                'tor_type':
                                'direct',
                                'priority':
                                lazylibrarian.CONFIG[prov + '_DLPRIORITY']
                            })
                            logger.debug('Found %s, Size %s' % (title, size))
                        next_page = True

            except Exception as e:
                logger.error("An error occurred in the %s parser: %s" %
                             (provider, str(e)))
                logger.debug('%s: %s' % (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
コード例 #36
0
ファイル: diagnose.py プロジェクト: MaximKiselev/weeman
        data = data.read()
    elif os.path.exists(data):
        print '"%s" looks like a filename. Reading data from the file.' % data
        with open(data) as fp:
            data = fp.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
        return
    print

    for parser in basic_parsers:
        print "Trying to parse your markup with %s" % parser
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            print "Here's what %s did with the markup:" % parser
            print soup.prettify()

        print "-" * 80

def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.
コード例 #37
0
ファイル: _html5lib.py プロジェクト: DobyTang/LazyLibrarian
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):

    def __init__(self, namespaceHTMLElements, soup=None):
        if soup:
            self.soup = soup
        else:
            from lib.bs4 import BeautifulSoup
            self.soup = BeautifulSoup("", "html.parser")
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        from lib.bs4 import BeautifulSoup
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        from lib.bs4 import BeautifulSoup
        rv = []
        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            if isinstance(element, BeautifulSoup):
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in element.attrs.items():
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace], name.name)
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)
コード例 #38
0
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    :param data: A string containing markup that needs to be explained.
    :return: None; diagnostics are printed to standard output.
    """
    print(("Diagnostic running on Beautiful Soup %s" % __version__))
    print(("Python version %s" % sys.version))

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    for name in basic_parsers:
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print((
                "I noticed that %s is not installed. Installing it may help." %
                name))

    if 'lxml' in basic_parsers:
        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
            print(("Found lxml version %s" %
                   ".".join(map(str, etree.LXML_VERSION))))
        except ImportError as e:
            print("lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print(("Found html5lib version %s" % html5lib.__version__))
        except ImportError as e:
            print("html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' %
               data))
        print(
            "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
        )
        return
    else:
        try:
            if os.path.exists(data):
                print((
                    '"%s" looks like a filename. Reading data from the file.' %
                    data))
                with open(data) as fp:
                    data = fp.read()
        except ValueError:
            # This can happen on some platforms when the 'filename' is
            # too long. Assume it's data and not a filename.
            pass
        print("")

    for parser in basic_parsers:
        print(("Trying to parse your markup with %s" % parser))
        success = False
        try:
            soup = BeautifulSoup(data, features=parser)
            success = True
        except Exception as e:
            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        if success:
            print(("Here's what %s did with the markup:" % parser))
            print((soup.prettify()))

        print(("-" * 80))
コード例 #39
0
def TPB(book=None, test=False):
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/s/?")

    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0

    sterm = makeUnicode(book['searchterm'])

    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True

    while next_page:

        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }

        searchURL = providerurl + "?%s" % urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)

        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" %
                             (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            soup = BeautifulSoup(result, 'html5lib')
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.find_all('tr')
            else:
                rows = []

            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.find_all('td')
                if len(td) > 2:
                    try:
                        new_soup = BeautifulSoup(str(td[1]), 'html5lib')
                        link = new_soup.find("a")
                        magnet = link.get("href")
                        title = link.text
                        size = td[1].text.split(', Size ')[1].split('iB')[0]
                        size = size.replace('&nbsp;', '')
                        mult = 1
                        try:
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0

                        if minimumseeders < int(seeders):
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' %
                                             (magurl, result))
                            else:
                                magnet = None
                                new_soup = BeautifulSoup(result, 'html5lib')
                                for link in new_soup.find_all('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid':
                                    book['bookid'],
                                    'tor_prov':
                                    provider,
                                    'tor_title':
                                    title,
                                    'tor_url':
                                    magnet,
                                    'tor_size':
                                    str(size),
                                    'tor_type':
                                    'magnet',
                                    'priority':
                                    lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' %
                                             (title, size))
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" %
                                     (provider, str(e)))
                        logger.debug('%s: %s' %
                                     (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
コード例 #40
0
def GEN(book=None, prov=None, test=False):
    errmsg = ''
    provider = "libgen.io"
    if not prov:
        prov = 'GEN'
    host = lazylibrarian.CONFIG[prov + '_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    search = lazylibrarian.CONFIG[prov + '_SEARCH']
    if not search or not search.endswith('.php'):
        search = 'search.php'
    if 'index.php' not in search and 'search.php' not in search:
        search = 'search.php'
    if search[0] == '/':
        search = search[1:]

    sterm = makeUnicode(book['searchterm'])

    page = 1
    results = []
    next_page = True

    while next_page:
        if 'index.php' in search:
            params = {
                "s": book['searchterm'],
                "f_lang": "All",
                "f_columns": 0,
                "f_ext": "All"
            }
        else:
            params = {
                "view": "simple",
                "open": 0,
                "phrase": 0,
                "column": "def",
                "res": 100,
                "req": book['searchterm']
            }

        if page > 1:
            params['page'] = page

        providerurl = url_fix(host + "/%s" % search)
        searchURL = providerurl + "?%s" % urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            elif '111' in result:
                # looks like libgen has ip based access limits
                logger.error('Access forbidden. Please wait a while before trying %s again.' % provider)
                errmsg = result
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching page data from %s: %s' % (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            try:
                soup = BeautifulSoup(result, 'html5lib')
                rows = []

                try:
                    table = soup.find_all('table', rules='rows')[-1]  # the last table with rules=rows
                    if table:
                        rows = table.find_all('tr')

                except IndexError:  # no results table in result page
                    rows = []

                if len(rows) > 1:  # skip table headers
                    rows = rows[1:]

                for row in rows:
                    author = ''
                    title = ''
                    size = ''
                    extn = ''
                    link = ''
                    td = row.find_all('td')

                    if 'index.php' in search and len(td) > 3:
                        # Foreign fiction
                        try:
                            author = formatAuthorName(td[0].text)
                            title = td[2].text
                            newsoup = BeautifulSoup(str(td[4]), 'html5lib')
                            data = newsoup.find('a')
                            if data:
                                link = data.get('href')
                                extn = td[4].text.split('(')[0].strip()
                                size = td[4].text.split('(')[1].split(')')[0]
                                size = size.upper()
                        except IndexError as e:
                            logger.debug('Error parsing libgen index.php results: %s' % str(e))

                    elif 'search.php' in search and len(td) > 8:
                        # Non-fiction
                        try:
                            author = formatAuthorName(td[1].text)
                            title = td[2].text
                            size = td[7].text.upper()
                            extn = td[8].text
                            link = ''
                            newsoup = BeautifulSoup(str(td[2]), 'html5lib')
                            for res in newsoup.find_all('a'):
                                output = res.get('href')
                                if 'md5' in output:
                                    link = output
                                    break
                        except IndexError as e:
                            logger.debug('Error parsing libgen search.php results; %s' % str(e))

                    size = size_in_bytes(size)

                    if link and title:
                        if author:
                            title = author.strip() + ' ' + title.strip()
                        if extn:
                            title = title + '.' + extn

                        if link.startswith('http'):
                            url = redirect_url(host, link)
                        else:
                            if "/index.php?" in link:
                                link = 'md5' + link.split('md5')[1]

                            if "/ads.php?" in link:
                                url = url_fix(host + "/" + link)
                            else:
                                url = url_fix(host + "/ads.php?" + link)

                        bookresult, success = fetchURL(url)
                        if not success:
                            logger.debug('Error fetching link data from %s: %s' % (provider, bookresult))
                            logger.debug(url)
                            url = None
                        else:
                            url = None
                            try:
                                new_soup = BeautifulSoup(bookresult, 'html5lib')
                                for link in new_soup.find_all('a'):
                                    output = link.get('href')
                                    if output:
                                        if output.startswith('http') and '/get.php' in output:
                                            url = output
                                            break
                                        elif '/get.php' in output:
                                            url = '/get.php' + output.split('/get.php')[1]
                                            break
                                        elif '/download/book' in output:
                                            url = '/download/book' + output.split('/download/book')[1]
                                            break

                                if url and not url.startswith('http'):
                                    url = url_fix(host + url)
                                else:
                                    url = redirect_url(host, url)
                            except Exception as e:
                                logger.error('%s parsing bookresult for %s: %s' % (type(e).__name__, link, str(e)))
                                url = None

                        if url:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider + '/' + search,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'direct',
                                'priority': lazylibrarian.CONFIG[prov + '_DLPRIORITY']
                            })
                            logger.debug('Found %s, Size %s' % (title, size))
                        next_page = True

            except Exception as e:
                logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                logger.debug('%s: %s' % (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn('Maximum results page search reached, still more results available')
            next_page = False

    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
コード例 #41
0
def TDL(book=None, test=False):
    errmsg = ''
    provider = "torrentdownloads"
    host = lazylibrarian.CONFIG['TDL_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host)

    params = {
        "type": "search",
        "cid": "2",
        "search": book['searchterm']
    }
    searchURL = providerurl + "/rss.xml?%s" % urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False

    if test:
        return success

    results = []

    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'].replace(',', ''))
                    link = item['link']
                    size = int(item['size'])
                    url = None

                    try:
                        pubdate = item['published']
                    except KeyError:
                        pubdate = None

                    if link and minimumseeders < seeders:
                        # no point requesting the magnet link if not enough seeders
                        # TDL gives us a relative link
                        result, success = fetchURL(providerurl+link)
                        if success:
                            new_soup = BeautifulSoup(result, 'html5lib')
                            for link in new_soup.find_all('a'):
                                output = link.get('href')
                                if output and output.startswith('magnet'):
                                    url = output
                                    break

                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            res = {
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'magnet',
                                'priority': lazylibrarian.CONFIG['TDL_DLPRIORITY']
                            }
                            if pubdate:
                                res['tor_date'] = pubdate
                            logger.debug('Found %s. Size: %s' % (title, size))
                            results.append(res)
                    else:
                        logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))

                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))

    return results, errmsg
コード例 #42
0
def KAT(book=None, test=False):
    errmsg = ''
    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/usearch/" + quote(book['searchterm']))

    params = {"category": "books", "field": "seeders", "sorder": "desc"}
    searchURL = providerurl + "/?%s" % urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
            errmsg = result
        result = False

    if test:
        return success

    results = []

    if result:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result, 'html5lib')
        rows = []
        try:
            table = soup.find_all('table')[1]  # un-named table
            if table:
                rows = table.find_all('tr')
        except IndexError:  # no results table in result page
            rows = []

        if len(rows) > 1:
            rows = rows[1:]  # first row is headers

        for row in rows:
            td = row.find_all('td')
            if len(td) > 3:
                try:
                    title = unaccented(td[0].text)
                    # kat can return magnet or torrent or both.
                    magnet = ''
                    url = ''
                    mode = 'torrent'
                    try:
                        magnet = 'magnet' + str(
                            td[0]).split('href="magnet')[1].split('"')[0]
                        mode = 'magnet'
                    except IndexError:
                        pass
                    try:
                        url = 'http' + str(td[0]).split('href="http')[1].split(
                            '.torrent?')[0] + '.torrent'
                        mode = 'torrent'
                    except IndexError:
                        pass

                    if not url or (magnet and url
                                   and lazylibrarian.CONFIG['PREFER_MAGNET']):
                        url = magnet
                        mode = 'magnet'

                    try:
                        size = str(td[1].text).replace('&nbsp;', '').upper()
                        mult = 1
                        if 'K' in size:
                            size = size.split('K')[0]
                            mult = 1024
                        elif 'M' in size:
                            size = size.split('M')[0]
                            mult = 1024 * 1024
                        elif 'G' in size:
                            size = size.split('G')[0]
                            mult = 1024 * 1024 * 1024
                        size = int(float(size) * mult)
                    except (ValueError, IndexError):
                        size = 0
                    try:
                        seeders = int(td[3].text)
                    except ValueError:
                        seeders = 0

                    if not url or not title:
                        logger.debug('Missing url or title')
                    elif minimumseeders < int(seeders):
                        results.append({
                            'bookid':
                            book['bookid'],
                            'tor_prov':
                            provider,
                            'tor_title':
                            title,
                            'tor_url':
                            url,
                            'tor_size':
                            str(size),
                            'tor_type':
                            mode,
                            'priority':
                            lazylibrarian.CONFIG['KAT_DLPRIORITY']
                        })
                        logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))
                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
コード例 #43
0
ファイル: binsearch.py プロジェクト: adlerre/Sick-Beard
    def _doSearch(self, search_string, show=None, max_age=0):
        
        params = {"q":search_string,
                  "m": "n",
                  "max": 400,
                  "minsize": 100,
                  "adv_sort": "date",
                  "adv_col": "on",
                  "adv_nfo": "on",
                  "adv_age": sickbeard.USENET_RETENTION}

        # if max_age is set, use it, don't allow it to be missing
        if max_age or not params['adv_age']:
            params['adv_age'] = max_age

        searchURL = self.urls["search"] % urllib.urlencode(params)

        logger.log(u"Search url: " + searchURL)
        
        data = self.getURL(searchURL)
        
        if not data:
            logger.log(u"No data returned from " + searchURL, logger.ERROR)
            return []
        
        res_items = []
        
        try:
            html = BeautifulSoup(data)
            main_table = html.find('table', attrs={'id':'r2'})

            if not main_table:
                return []

            items = main_table.find_all('tr')

            for row in items:
                title = row.find('span', attrs={'class':'s'})

                if not title: continue

                nzb_id = row.find('input', attrs={'type':'checkbox'})['name']
                info = row.find('span', attrs={'class':'d'})

                def extra_check(item):
                    parts = re.search('available:.(?P<parts>\d+)./.(?P<total>\d+)', info.text)
                    total = tryInt(parts.group('total'))
                    parts = tryInt(parts.group('parts'))

                    if (total / parts) < 0.95 or ((total / parts) >= 0.95 and not ('par2' in info.text.lower() or 'pa3' in info.text.lower())):
                        logger.log('Wrong: \'%s\', not complete: %s out of %s' % (item['name'], parts, total), logger.WARNING)
                        return False

                    if 'requires password' in info.text.lower():
                        logger.log('Wrong: \'%s\', passworded' % (item['name']), logger.WARNING)
                        return False

                    return True

                res_items.append({
                    'id': nzb_id,
                    'title': title.text,
                    'url': self.urls['download'] % nzb_id,
                    'extra_check': extra_check
                })

        except:
            logger.log('Failed to parse HTML response from BinSearch: %s' % traceback.format_exc(), logger.ERROR)
        
        results = []
        
        for curItem in res_items:
            (title, url) = self._get_title_and_url(curItem)
            if title and url and curItem['extra_check']:
                logger.log(u"Adding item from BinSearch to results: " + title, logger.DEBUG)
                results.append(curItem)
            else:
                logger.log(u"The HTML returned from the " + self.name + " incomplete, this result is unusable", logger.DEBUG)

        return results
コード例 #44
0
 def test_warning_if_parser_specified_too_vague(self):
     with warnings.catch_warnings(record=True) as w:
         soup = BeautifulSoup("<a><b></b></a>", "html")
     self._assert_no_parser_specified(w)
コード例 #45
0
ファイル: publichd.py プロジェクト: zimbo69/Sick-Beard
    def _doSearch(self, search_params, show=None):

        results = []
        items = {'Season': [], 'Episode': [], 'RSS': []}

        for mode in search_params.keys():
            for search_string in search_params[mode]:

                if mode == 'RSS':
                    searchURL = self.url + 'index.php?page=torrents&active=1&category=%s' %(';'.join(self.categories[mode]))
                    logger.log(u"PublicHD cache update URL: "+ searchURL, logger.DEBUG)
                else:
                    searchURL = self.searchurl %(urllib.quote(unidecode(search_string)), ';'.join(self.categories[mode]))
                    logger.log(u"Search string: " + searchURL, logger.DEBUG)

                html = self.getURL(searchURL)

                if not html:
                    continue

                #remove unneccecary <option> lines which are slowing down BeautifulSoup
                optreg = re.compile( r'<option.*</option>' )
                html = os.linesep.join([s for s in html.splitlines() if not optreg.search(s)])

                try:
                    soup = BeautifulSoup(html, features=["html5lib", "permissive"])

                    torrent_table = soup.find('table', attrs = {'id' : 'torrbg'})
                    torrent_rows = torrent_table.find_all('tr') if torrent_table else []

                    #Continue only if one Release is found
                    if len(torrent_rows)<2:
                        logger.log(u"The Data returned from " + self.name + " do not contains any torrent", logger.DEBUG)
                        continue

                    for tr in torrent_rows[1:]:

                        try:
                            link = self.url + tr.find(href=re.compile('page=torrent-details'))['href']
                            title = tr.find(lambda x: x.has_attr('title')).text.replace('_','.')
                            url = tr.find(href=re.compile('magnet+'))['href']
                            seeders = int(tr.find_all('td', {'class': 'header'})[4].text)
                            leechers = int(tr.find_all('td', {'class': 'header'})[5].text)
                        except (AttributeError, TypeError):
                            continue

                        if mode != 'RSS' and seeders == 0:
                            continue

                        if not title or not url:
                            continue

                        item = title, url, link, seeders, leechers

                        items[mode].append(item)

                except Exception, e:
                    logger.log(u"Failed to parsing " + self.name + " Traceback: "  + traceback.format_exc(), logger.ERROR)

            #For each search mode sort all the items by seeders
            items[mode].sort(key=lambda tup: tup[3], reverse=True)

            results += items[mode]
コード例 #46
0
ファイル: _html5lib.py プロジェクト: DobyTang/LazyLibrarian
 def fragmentClass(self):
     from lib.bs4 import BeautifulSoup
     self.soup = BeautifulSoup("", "html.parser")
     self.soup.name = "[document_fragment]"
     return Element(self.soup, self.soup, None)
コード例 #47
0
def WWT(book=None, test=False):
    errmsg = ''
    provider = "WorldWideTorrents"
    host = lazylibrarian.CONFIG['WWT_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/torrents-search.php")

    sterm = makeUnicode(book['searchterm'])

    cat = 0  # 0=all, 36=ebooks, 52=mags, 56=audiobooks
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 56
        elif book['library'] == 'eBook':
            cat = 36
        elif book['library'] == 'magazine':
            cat = 52

    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True

    while next_page:
        params = {"search": book['searchterm'], "page": page, "cat": cat}
        searchURL = providerurl + "/?%s" % urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # might return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" %
                             (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            soup = BeautifulSoup(result, 'html5lib')

            try:
                tables = soup.find_all('table')  # un-named table
                table = tables[2]
                if table:
                    rows = table.find_all('tr')
            except IndexError:  # no results table in result page
                rows = []

            if len(rows) > 1:
                rows = rows[1:]  # first row is headers

            for row in rows:
                td = row.find_all('td')
                if len(td) > 3:
                    try:
                        title = unaccented(td[0].text)
                        # can return magnet or torrent or both.
                        magnet = ''
                        url = ''
                        mode = 'torrent'
                        try:
                            magnet = 'magnet' + str(
                                td[0]).split('href="magnet')[1].split('"')[0]
                            mode = 'magnet'
                        except IndexError:
                            pass
                        try:
                            url = url_fix(host + '/download.php') + \
                                          str(td[0]).split('href="download.php')[1].split('.torrent"')[0] + '.torrent'
                            mode = 'torrent'
                        except IndexError:
                            pass

                        if not url or (magnet and url and
                                       lazylibrarian.CONFIG['PREFER_MAGNET']):
                            url = magnet
                            mode = 'magnet'

                        try:
                            size = str(td[1].text).replace('&nbsp;',
                                                           '').upper()
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0

                        if not url or not title:
                            logger.debug('Missing url or title')
                        elif minimumseeders < int(seeders):
                            results.append({
                                'bookid':
                                book['bookid'],
                                'tor_prov':
                                provider,
                                'tor_title':
                                title,
                                'tor_url':
                                url,
                                'tor_size':
                                str(size),
                                'tor_type':
                                mode,
                                'priority':
                                lazylibrarian.CONFIG['WWT_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                            next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" %
                                     (provider, str(e)))
                        logger.debug('%s: %s' %
                                     (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
コード例 #48
0
        data = data.read()
    elif os.path.exists(data):
        print '"%s" looks like a filename. Reading data from the file.' % data
        with open(data) as fp:
            data = fp.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
        return
    print

    for parser in basic_parsers:
        print "Trying to parse your markup with %s" % parser
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            print "Here's what %s did with the markup:" % parser
            print soup.prettify()

        print "-" * 80


def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
コード例 #49
0
def TDL(book=None, test=False):
    errmsg = ''
    provider = "torrentdownloads"
    host = lazylibrarian.CONFIG['TDL_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host)

    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False

    if test:
        return success

    results = []

    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None

                    if link and minimumseeders < int(seeders):
                        # no point requesting the magnet link if not enough seeders
                        # TDL gives us a relative link
                        result, success = fetchURL(providerurl + link)
                        if success:
                            new_soup = BeautifulSoup(result, 'html5lib')
                            for link in new_soup.find_all('a'):
                                output = link.get('href')
                                if output and output.startswith('magnet'):
                                    url = output
                                    break

                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid':
                                book['bookid'],
                                'tor_prov':
                                provider,
                                'tor_title':
                                title,
                                'tor_url':
                                url,
                                'tor_size':
                                str(size),
                                'tor_type':
                                'magnet',
                                'priority':
                                lazylibrarian.CONFIG['TDL_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))

                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))

    return results, errmsg
コード例 #50
0
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    def __init__(self, namespaceHTMLElements, soup=None):
        if soup:
            self.soup = soup
        else:
            from lib.bs4 import BeautifulSoup
            self.soup = BeautifulSoup("", "html.parser")
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        from lib.bs4 import BeautifulSoup
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        from lib.bs4 import BeautifulSoup
        rv = []
        doctype_re = re.compile(
            r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            if isinstance(element, BeautifulSoup):
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent, ))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in element.attrs.items():
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace],
                                              name.name)
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' *
                                                  (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)

        serializeElement(element, 0)

        return "\n".join(rv)