# Example 1
def open_search(url, query, max_results=10, timeout=60):
    '''
    Search an OPDS catalog via its OpenSearch description document.

    :param url: URL of the OpenSearch description document.
    :param query: search terms to substitute into the URL template.
    :param max_results: maximum number of results to yield.
    :param timeout: network timeout in seconds for the feed request.

    Yields SearchResult objects parsed from the Atom feed entries.
    Returns early (yielding nothing) if no usable URL template exists.
    '''
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mime = link.get('type')  # renamed: ``type`` shadows the builtin

                if rel and href and mime:
                    if 'http://opds-spec.org/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/image/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                        s.detail_item = href
                    elif 'http://opds-spec.org/acquisition/sample' in rel:
                        # Samples are deliberately ignored.
                        pass
                    elif 'http://opds-spec.org/acquisition' in rel:
                        # mime is already known truthy here; the original
                        # re-checked it redundantly.
                        ext = guess_extension(mime)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            # Take the first <price> element, if any; prefix its currency code.
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
# Example 2
def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'):
    '''
    Manybooks uses a very strange opds feed. The opds
    main feed is structured like a stanza feed. The
    search result entries give very little information
    and requires you to go to a detail link. The detail
    link has the wrong type specified (text/html instead
    of application/atom+xml).

    :param query: search terms.
    :param max_results: maximum number of results to yield.
    :param timeout: network timeout in seconds (detail pages use timeout/4).
    :param open_search_url: OpenSearch description document URL.

    Yields SearchResult objects; all Manybooks titles are free and unlocked.
    '''

    description = Description(open_search_url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw_data = f.read()
        raw_data = raw_data.decode('utf-8', 'replace')
        doc = etree.fromstring(raw_data)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            detail_links = data.xpath('./*[local-name() = "link" and @type = "text/html"]')
            if not detail_links:
                continue
            detail_link = detail_links[0]
            detail_href = detail_link.get('href')
            if not detail_href:
                continue

            s.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html'
            # These can have HTML inside of them. We are going to get them again later
            # just in case.
            s.title = ''.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//text()')).strip()

            # Follow the detail link to get the rest of the info.
            with closing(br.open(detail_href, timeout=timeout/4)) as df:
                ddoc = etree.fromstring(df.read())
                ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
                if ddata:
                    ddata = ddata[0]

                    # This is the real title and author info we want, but only
                    # use it when actually present: previously an empty detail
                    # value silently clobbered the good feed value.
                    detail_title = ''.join(ddata.xpath('./*[local-name() = "title"]//text()')).strip()
                    detail_author = ', '.join(ddata.xpath('./*[local-name() = "author"]//text()')).strip()
                    if detail_title:
                        s.title = detail_title
                    if detail_author:
                        s.author = detail_author
                    # Drop a stray leading/trailing comma left by empty names.
                    if s.author.startswith(','):
                        s.author = s.author[1:]
                    if s.author.endswith(','):
                        s.author = s.author[:-1]

                    s.cover_url = ''.join(ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href')).strip()

                    for link in ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                        mime = link.get('type')  # renamed: ``type`` shadows the builtin
                        href = link.get('href')
                        if mime:
                            ext = mimetypes.guess_extension(mime)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href

            s.price = '$0.00'
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'

            yield s
    def search(self, query, max_results=10, timeout=60):
        '''
        Manybooks uses a very strange opds feed. The opds
        main feed is structured like a stanza feed. The
        search result entries give very little information
        and requires you to go to a detail link. The detail
        link has the wrong type specified (text/html instead
        of application/atom+xml).

        :param query: search terms.
        :param max_results: maximum number of results to yield.
        :param timeout: network timeout in seconds (detail pages use timeout/4).

        Yields SearchResult objects; all titles are free and DRM-unlocked.
        '''
        # Defensive: some store subclasses may not define open_search_url.
        if not hasattr(self, 'open_search_url'):
            return

        description = Description(self.open_search_url)
        url_template = description.get_best_template()
        if not url_template:
            return
        oquery = Query(url_template)

        # set up initial values
        oquery.searchTerms = query
        oquery.count = max_results
        url = oquery.url()

        counter = max_results
        br = browser()
        with closing(br.open(url, timeout=timeout)) as f:
            raw_data = f.read()
            raw_data = raw_data.decode('utf-8', 'replace')
            doc = etree.fromstring(raw_data)
            for data in doc.xpath('//*[local-name() = "entry"]'):
                if counter <= 0:
                    break

                counter -= 1

                s = SearchResult()

                detail_links = data.xpath(
                    './*[local-name() = "link" and @type = "text/html"]')
                if not detail_links:
                    continue
                detail_link = detail_links[0]
                detail_href = detail_link.get('href')
                if not detail_href:
                    continue

                s.detail_item = 'http://manybooks.net/titles/' + detail_href.split(
                    'tid=')[-1] + '.html'
                # These can have HTML inside of them. We are going to get them again later
                # just in case.
                s.title = ''.join(
                    data.xpath('./*[local-name() = "title"]//text()')).strip()
                s.author = ', '.join(
                    data.xpath(
                        './*[local-name() = "author"]//text()')).strip()

                # Follow the detail link to get the rest of the info.
                with closing(br.open(detail_href, timeout=timeout / 4)) as df:
                    ddoc = etree.fromstring(df.read())
                    ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
                    if ddata:
                        ddata = ddata[0]

                        # This is the real title and author info we want, but
                        # only use it when actually present: previously an
                        # empty detail value clobbered the good feed value.
                        detail_title = ''.join(
                            ddata.xpath('./*[local-name() = "title"]//text()')
                        ).strip()
                        detail_author = ', '.join(
                            ddata.xpath('./*[local-name() = "author"]//text()')
                        ).strip()
                        if detail_title:
                            s.title = detail_title
                        if detail_author:
                            s.author = detail_author
                        # Drop a stray leading/trailing comma from empty names.
                        if s.author.startswith(','):
                            s.author = s.author[1:]
                        if s.author.endswith(','):
                            s.author = s.author[:-1]

                        s.cover_url = ''.join(
                            ddata.xpath(
                                './*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href'
                            )).strip()

                        for link in ddata.xpath(
                                './*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'
                        ):
                            mime = link.get('type')  # avoid shadowing builtin
                            href = link.get('href')
                            if mime:
                                ext = mimetypes.guess_extension(mime)
                                if ext:
                                    ext = ext[1:].upper().strip()
                                    s.downloads[ext] = href

                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'

                yield s
def search_flibusta(url, query, web_url, max_results=10, timeout=60):
    '''
    Search the Flibusta OPDS catalog.

    :param url: URL of the OpenSearch description document.
    :param query: search terms.
    :param web_url: site root prefixed onto the relative hrefs in the feed.
    :param max_results: maximum number of results to yield.
    :param timeout: network timeout in seconds.

    Yields SearchResult objects; all entries are free and DRM-unlocked.
    '''
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    # MIME types that guess_extension() cannot resolve; checked by substring
    # in declaration order, exactly as the original elif chain did.
    known_formats = (
        ('application/fb2+zip', 'FB2'),
        ('application/txt+zip', 'TXT'),
        ('application/html+zip', 'HTML'),
        ('application/x-mobipocket-ebook', 'MOBI'),
    )

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            s.detail_item = ''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mime = link.get('type')  # renamed: ``type`` shadows the builtin

                if rel and href and mime:
                    if 'http://opds-spec.org/thumbnail' in rel:
                        s.cover_url = web_url + href
                    elif 'http://opds-spec.org/image/thumbnail' in rel:
                        s.cover_url = web_url + href
                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                        s.detail_item = web_url + href
                    elif 'http://opds-spec.org/acquisition/sample' in rel:
                        # Samples are deliberately ignored.
                        pass
                    elif 'http://opds-spec.org/acquisition/open-access' in rel:
                        for fragment, fmt in known_formats:
                            if fragment in mime:
                                s.downloads[fmt] = web_url + href
                                break
                        else:
                            # Fall back to the MIME tables; only try the
                            # "+zip"-stripped variant when the first lookup
                            # fails (the original computed both eagerly).
                            ext = guess_extension(mime)
                            if not ext:
                                ext = guess_extension(mime.replace("+zip", ""))
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = web_url + href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()

            s.price = '$0.00'
            s.drm = SearchResult.DRM_UNLOCKED

            yield s
# Example 5
	def search(self, query, max_results=10, timeout=60):
		'''
		Search this store's OPDS catalog, following ``next`` pagination
		links until ``max_results`` entries have been yielded.

		:param query: search terms.
		:param max_results: maximum number of results to yield.
		:param timeout: network timeout in seconds per page request.

		Yields SearchResult objects; DRM status is derived from the
		presence of an ``encryption_method`` child on acquisition links.
		'''
		description = Description(self.open_search_url)
		url_template = description.get_best_template()
		if not url_template:
			return
		oquery = Query(url_template)

		# set up initial values
		oquery.searchTerms = query
		oquery.count = max_results
		url = oquery.url()

		counter = max_results
		br = self.create_browser()
		while url is not None and counter > 0:
			with closing(br.open(url, timeout=timeout)) as f:
				# Keep the raw bytes in their own name; the original reused
				# ``s`` for both the page body and each SearchResult.
				raw = f.read()
				doc = etree.fromstring(raw)
				url = None
				for link in doc.xpath('//*[local-name() = "link"]'):
					rel = link.get('rel')
					href = link.get('href')
					mime = link.get('type')  # renamed: shadows the builtin

					if rel and href and mime:
						if rel == 'next' and mime == 'application/atom+xml':
							# Relative pagination links need the site root.
							if href[0] == "/":
								href = self.base_url + href
							url = href

				for data in doc.xpath('//*[local-name() = "entry"]'):
					if counter <= 0:
						break

					counter -= 1

					s = SearchResult()

					s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

					drm = False

					for link in data.xpath('./*[local-name() = "link"]'):
						rel = link.get('rel')
						href = link.get('href')
						mime = link.get('type')

						if rel and href and mime:
							if 'http://opds-spec.org/thumbnail' in rel:
								s.cover_url = href
							elif 'http://opds-spec.org/image/thumbnail' in rel:
								s.cover_url = href
							elif 'http://opds-spec.org/acquisition/buy' in rel:
								s.detail_item = href
							elif 'http://opds-spec.org/acquisition' in rel:
								# mime is already truthy here; the original
								# re-checked it redundantly.
								ext = guess_extension(mime)
								if mime == 'application/fb2+xml':
									# guess_extension does not know FB2.
									ext = '.fb2'
								if ext:
									ext = ext[1:].upper().strip()
									if href[0] == "/":
										href = self.base_url + href
									s.downloads[ext] = href
								# Any encryption_method child marks the entry
								# as DRM-locked (was a loop with unused var).
								if link.xpath('./*[local-name() = "encryption_method"]'):
									drm = True
					s.formats = ', '.join(s.downloads.keys()).strip()

					s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
					s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

					s.drm = SearchResult.DRM_LOCKED if drm else SearchResult.DRM_UNLOCKED

					price_e = data.xpath('.//*[local-name() = "price"][1]')
					if price_e:
						price_e = price_e[0]
						currency_code = price_e.get('currencycode', '')
						price = ''.join(price_e.xpath('.//text()')).strip()
						s.price = currency_code + ' ' + price
						s.price = s.price.strip()
					# Stash the cover URL so downloading can be deferred.
					if s.cover_url:
						s.cover_bak = s.cover_url
						s.cover_url = None
					yield s