Example #1
0
    def GET(self, query):
        """Render HTML search results from the Archive's solr backend.

        Pulls ``q`` (search terms) and ``start`` (zero-based page number)
        out of the raw query string, builds the solr request URL, ingests
        the response into a catalog, and returns it rendered as HTML.
        """
        qs = web.ctx.query
        if qs.startswith('?'):
            qs = qs[1:]

        params = cgi.parse_qs(qs)

        if 'start' not in params:
            start = 0
        else:
            start = params['start'][0] # XXX hack for .html ending -- remove once fixed
            if start.endswith('.html'):
                start = start[:-5]
            start = int(start)

        # Default to an empty query rather than raising KeyError when 'q'
        # is absent (consistent with the other search handler in this file).
        q = params['q'][0] if 'q' in params else ''
        qq = urllib.quote(q)
        solrUrl       = 'http://se.us.archive.org:8983/solr/select?q='+qq+'+AND+'+pubInfo['query_base']+'&fl=identifier,title,creator,oai_updatedate,date,contributor,publisher,subject,language,format&rows='+str(numRows)+'&start='+str(start*numRows)+'&wt=json'
        titleFragment = 'search results for ' + q
        urn           = pubInfo['urnroot'] + ':search:%s:%d' % (qq, start)

        ingestor = catalog.ingest.SolrToCatalog(pubInfo, solrUrl, urn,
                                                start=start, numRows=numRows,
                                                urlBase='/search?q=%s&start=' % (qq), # XXX adding .html to end...
                                                titleFragment = titleFragment)

        c = ingestor.getCatalog()

        web.header('Content-Type', 'text/html')
        r = output.ArchiveCatalogToHtml(c, device = getDevice())
        return r.toString()
Example #2
0
    def GET(self, letter, start):
        """Serve the per-letter title listing as Atom XML or HTML.

        A '.html' suffix on the start segment selects HTML output;
        otherwise an Atom feed is returned.
        """
        mode = 'xml'
        if start:
            if start.endswith('.html'):
                mode = 'html'
                start = start[:-5]
            start = int(start)
        else:
            start = 0

        #TODO: add Image PDFs to this query
        solrUrl = (pubInfo['solr_base']
                   + '&q=firstTitle%3A' + letter.upper()
                   + '&sort=titleSorter+asc'
                   + '&rows=' + str(numRows)
                   + '&start=' + str(start * numRows))
        titleFragment = 'books starting with "%s"' % (letter.upper())
        urn = pubInfo['urnroot'] + ':%s:%d' % (letter, start)

        ingestor = catalog.ingest.SolrToCatalog(
            pubInfo, solrUrl, urn,
            start=start,
            numRows=numRows,
            urlBase='%s/alpha/%s/' % (pubInfo['url_base'], letter),
            titleFragment=titleFragment)
        c = ingestor.getCatalog()

        if mode == 'html':
            web.header('Content-Type', 'text/html')
            return output.ArchiveCatalogToHtml(c, device=getDevice()).toString()
        web.header('Content-Type', pubInfo['mimetype'])
        return output.CatalogToAtom(c, fabricateContentElement=True).toString()
Example #3
0
    def GET(self, start, extension):
        """Serve the crawlable full-catalog feed as Atom XML or HTML.

        HTML is selected either by an '.html' extension argument or by a
        '.html' suffix on the start segment.
        """
        extension = 'html' if extension == '.html' else 'xml'

        if start:
            if start.endswith('.html'):
                extension = 'html'
                start = start[:-5]
            start = int(start)
        else:
            start = 0

        # Larger page size than the browsable views: this feed exists
        # for crawlers.
        crawlNumRows = 1000
        solrUrl = (pubInfo['solr_base'] + '&q=' + pubInfo['query_base']
                   + '&rows=' + str(crawlNumRows)
                   + '&start=' + str(start * crawlNumRows))
        titleFragment = '- crawlable feed'
        urn = pubInfo['urnroot'] + ':crawl:%d' % (start)
        ingestor = catalog.ingest.IASolrToCatalog(
            pubInfo, solrUrl, urn,
            start=start,
            numRows=crawlNumRows,
            urlBase='/catalog/crawlable/',
            titleFragment=titleFragment)
        c = ingestor.getCatalog()

        if extension == 'html':
            web.header('Content-Type', 'text/html')
            return output.ArchiveCatalogToHtml(c, device=getDevice()).toString()
        web.header('Content-Type', pubInfo['mimetype'])
        return output.CatalogToAtom(c, fabricateContentElement=True).toString()
Example #4
0
    def GET(self, mode='xml'):
        """Build and serve the aggregator's root navigation catalog."""
        datestr = catalog.getCurrentDate()

        c = catalog.Catalog(title=pubInfo['name'] + ' Aggregator',
                            urn=pubInfo['urnroot'],
                            url=pubInfo['opdsroot'],
                            datestr=datestr,
                            author=pubInfo['name'],
                            authorUri=pubInfo['uri'])

        # (link url, title, urn suffix, description) per navigation entry.
        # NOTE: a disabled 'By Device' entry used to live here as well.
        sections = [
            ('alpha.' + mode, 'Alphabetical By Title', ':titles:all',
             'Alphabetical list of all titles.'),
            ('providers.' + mode, 'By Provider', ':providers:all',
             'Listing of all publishers and sellers.'),
        ]
        for linkUrl, title, suffix, description in sections:
            link = catalog.Link(url=linkUrl,
                                type=bookserver.catalog.Link.opds)
            entry = catalog.Entry({'title': title,
                                   'urn': pubInfo['urnroot'] + suffix,
                                   'updated': datestr,
                                   'content': description},
                                  links=[link])
            c.addEntry(entry)

        c.addOpenSearch(catalog.OpenSearch(
            'http://bookserver.archive.org/aggregator/opensearch.xml'))

        if mode == 'html':
            renderer = output.ArchiveCatalogToHtml(c, device=getDevice())
            web.header('Content-Type', 'text/html')
        else:
            renderer = output.CatalogToAtom(c)
            web.header('Content-Type', pubInfo['mimetype'])
        return renderer.toString()
Example #5
0
    def GET(self, extension):
        """Serve the Internet Archive A-Z title index.

        Returns an Atom feed or HTML depending on *extension*; raises
        ValueError for any other extension.
        """
        #IA is continuously scanning books. Since this OPDS file is constructed
        #from search engine results, let's change the updated date every midnight
        #TODO: create a version of /alpha.xml with the correct updated dates,
        #and cache it for an hour to ease load on solr
        datestr = getDateString()

        c = catalog.Catalog(
                            title     = 'Internet Archive - All Titles',
                            urn       = pubInfo['urnroot'] + ':titles:all',
                            url       = pubInfo['opdsroot'] + '/alpha.xml',
                            datestr   = datestr,
                            author    = 'Internet Archive',
                            authorUri = 'http://www.archive.org',
                            crawlableUrl = pubInfo['opdsroot'] + '/crawlable',
                           )

        # The link type depends only on the extension, so resolve (and
        # validate) it once instead of re-checking inside the loop for
        # every letter.
        if 'html' == extension:
            linkType = 'text/html'
        elif 'xml' == extension:
            linkType = 'application/atom+xml'
        else:
            raise ValueError('Unsupported extension %s' % extension)

        for letter in string.ascii_uppercase:
            lower = letter.lower()

            l = catalog.Link(url = self.alphaURL(extension, lower, 0), type = linkType)
            e = catalog.Entry({'title'   : 'Titles: ' + letter,
                               'urn'     : pubInfo['urnroot'] + ':titles:'+lower,
                               'updated' : datestr,
                               'content' : 'Titles starting with ' + letter
                             }, links=(l,))
            c.addEntry(e)

        osDescriptionDoc = 'http://bookserver.archive.org/catalog/opensearch.xml'
        o = catalog.OpenSearch(osDescriptionDoc)
        c.addOpenSearch(o)

        if ('xml' == extension):
            web.header('Content-Type', pubInfo['mimetype'])
            r = output.CatalogToAtom(c)
            return r.toString()
        else:
            web.header('Content-Type', 'text/html')
            r = output.ArchiveCatalogToHtml(c, device = getDevice())
            return r.toString()
Example #6
0
    def GET(self, extension):
        """Serve last month's most-downloaded books as Atom XML or HTML.

        Any extension other than 'xml' or 'html' redirects to the root.
        """
        solrUrl = (pubInfo['solr_base'] + '&q=' + pubInfo['query_base']
                   + '&sort=month+desc&rows=' + str(numRows))

        titleFragment = 'Most Downloaded Books in the last Month'
        urn = pubInfo['urnroot'] + ':downloads'
        ingestor = catalog.ingest.IASolrToCatalog(pubInfo, solrUrl, urn,
                                                  titleFragment=titleFragment)
        c = ingestor.getCatalog()

        if extension == 'xml':
            web.header('Content-Type', pubInfo['mimetype'])
            return output.CatalogToAtom(c, fabricateContentElement=True).toString()
        if extension == 'html':
            web.header('Content-Type', 'text/html')
            return output.ArchiveCatalogToHtml(c, device=getDevice()).toString()
        web.seeother('/')
Example #7
0
    def GET(self, extension):
        """Serve the aggregator's A-Z title index as Atom XML or HTML."""
        # Search results change as books keep getting scanned, so the
        # updated date simply rolls over at midnight.
        # TODO: create a version of /alpha.xml with the correct updated
        # dates, and cache it for an hour to ease load on solr.
        datestr = catalog.getCurrentDate()

        c = catalog.Catalog(title=pubInfo['name'] + ' Aggregator - All Titles',
                            urn=pubInfo['urnroot'] + ':titles:all',
                            url=pubInfo['opdsroot'] + '/alpha.xml',
                            datestr=datestr,
                            author=pubInfo['name'],
                            authorUri=pubInfo['uri'])

        # One navigation entry per initial letter.
        for letter in string.ascii_uppercase:
            lower = letter.lower()
            link = catalog.Link(url=self.alphaURL(extension, lower, 0),
                                type=bookserver.catalog.Link.opds)
            entry = catalog.Entry({'title': 'Titles: ' + letter,
                                   'urn': pubInfo['urnroot'] + ':titles:' + lower,
                                   'updated': datestr,
                                   'content': 'Titles starting with ' + letter},
                                  links=[link])
            c.addEntry(entry)

        c.addOpenSearch(catalog.OpenSearch(
            'http://bookserver.archive.org/aggregator/opensearch.xml'))

        web.header('Content-Type', types[extension])

        if extension == 'xml':
            renderer = output.CatalogToAtom(c)
        else:
            renderer = output.ArchiveCatalogToHtml(c, device=getDevice())
        return renderer.toString()
Example #8
0
    def GET(self, mode):
        """Serve the list of all providers as Atom XML or HTML."""
        #TODO: get correct updated dates
        datestr = catalog.getCurrentDate()

        c = catalog.Catalog(
            title=pubInfo['name'] + ' Aggregator - All Providers',
            urn=pubInfo['urnroot'] + ':providers:all',
            url=pubInfo['opdsroot'] + '/providers.' + mode,
            datestr=datestr,
            author=pubInfo['name'],
            authorUri=pubInfo['uri'],
        )

        # The link extension depends only on the output mode, not on the
        # provider, so compute it once instead of inside the loop.
        if 'html' == mode:
            ext = '.html'  # $$$ should do URL mapping in output side?
        else:
            ext = ''

        for provider in providers:
            l = catalog.Link(url='provider/' + provider + '/0' + ext,
                             type=bookserver.catalog.Link.opds)
            e = catalog.Entry(
                {
                    'title': providers[provider],
                    'urn': pubInfo['urnroot'] + ':providers:' + provider,
                    'updated': datestr,
                    'content': 'All Titles for provider ' + provider
                },
                links=[l])
            c.addEntry(e)

        osDescriptionDoc = 'http://bookserver.archive.org/aggregator/opensearch.xml'
        o = catalog.OpenSearch(osDescriptionDoc)
        c.addOpenSearch(o)

        web.header('Content-Type', types[mode])
        if ('xml' == mode):
            r = output.CatalogToAtom(c)
        else:
            r = output.ArchiveCatalogToHtml(c, device=getDevice())

        return r.toString()
Example #9
0
    def GET(self, domain, start):
        """Serve one provider's title listing as Atom XML or HTML.

        A '.html' suffix on the start segment selects HTML output.
        """
        mode = 'xml'
        if start:
            if start.endswith('.html'):
                mode = 'html'
                start = start[:-5]
            start = int(start)
        else:
            start = 0

        #TODO: add Image PDFs to this query
        solrUrl = (pubInfo['solr_base']
                   + '&q=provider%3A' + domain
                   + '&sort=titleSorter+asc'
                   + '&rows=' + str(numRows)
                   + '&start=' + str(start * numRows))
        titleFragment = 'books for provider ' + providers[domain]
        urn = pubInfo['urnroot'] + ':provider:%s:%d' % (domain, start)

        ingestor = catalog.ingest.SolrToCatalog(
            pubInfo, solrUrl, urn,
            start=start,
            numRows=numRows,
            urlBase='%s/provider/%s/' % (pubInfo['url_base'], domain),
            titleFragment=titleFragment)
        c = ingestor.getCatalog()

        web.header('Content-Type', types[mode])

        if mode == 'xml':
            renderer = output.CatalogToAtom(c, fabricateContentElement=True)
        else:
            renderer = output.ArchiveCatalogToHtml(c, device=getDevice(),
                                                   provider=domain)
        return renderer.toString()
Example #10
0
    def GET(self, query):
        """Render HTML search results, honoring provider and device filters.

        Reads ``q``, ``start``, ``provider`` and ``device`` from the raw
        query string, augments the solr query accordingly, and returns
        the ingested catalog rendered as HTML.
        """
        qs = web.ctx.query
        if qs.startswith('?'):
            qs = qs[1:]

        params = cgi.parse_qs(qs)

        if 'start' not in params:
            start = 0
        else:
            start = params['start'][
                0]  # XXX hack for .html ending -- remove once fixed
            if start.endswith('.html'):
                start = start[:-5]
            start = int(start)

        q = params['q'][0] if 'q' in params else ''

        # Provider-specific search: keep only the trailing word of the
        # provider value and AND it into the query unless one is already
        # present.  Raw string so \w is a regex escape, not a string one.
        provider = None
        if 'provider' in params:
            providerMatch = re.search(r'(\w+)$', params['provider'][0])
            if providerMatch:
                provider = providerMatch.group(0)
                if 'provider:' not in q:
                    if len(q) > 0:
                        q += ' AND '
                    q += 'provider:%s' % provider

        # Device-specific search
        # $$$ extend to other devices
        if 'device' in params:
            deviceStr = params['device'][0]
            if 'Kindle' in deviceStr:
                formatStr = 'format:mobi'
                if formatStr not in q:  # XXX brittle substring test
                    if len(q) > 0:
                        q += ' AND '
                    q += formatStr

        qq = urllib.quote(q)
        solrUrl = pubInfo[
            'solr_base'] + '&q=' + qq + '&sort=titleSorter+asc&rows=' + str(
                numRows) + '&start=' + str(start * numRows)

        titleFragment = 'search results for ' + q
        urn = pubInfo['urnroot'] + ':search:%s:%d' % (qq, start)

        ingestor = catalog.ingest.SolrToCatalog(
            pubInfo,
            solrUrl,
            urn,
            start=start,
            numRows=numRows,
            # XXX assuming calling from archive.org/bookserver/catalog
            # XXX HTML output is adding .html to end...
            urlBase='/bookserver/catalog/search?q=%s&start=' % (qq),
            titleFragment=titleFragment)

        c = ingestor.getCatalog()

        web.header('Content-Type', 'text/html')
        r = output.ArchiveCatalogToHtml(c,
                                        device=getDevice(),
                                        query=q,
                                        provider=provider)
        return r.toString()
Example #11
0
    def GET(self, url):
        """Serve the Internet Archive catalog root as Atom XML or HTML.

        A trailing '.html' on the requested path selects HTML output.
        """
        mode = 'html' if url and url.endswith('.html') else 'xml'

        datestr = getDateString()

        c = catalog.Catalog(title='Internet Archive Catalog',
                            urn=pubInfo['urnroot'],
                            url=pubInfo['opdsroot'] + '/',
                            datestr=datestr,
                            author='Internet Archive',
                            authorUri='http://www.archive.org',
                            crawlableUrl=pubInfo['opdsroot'] + '/crawlable')

        # Section links point at the variant matching the output mode.
        if mode == 'html':
            links = {'alpha': 'alpha.html',
                     'downloads': 'downloads.html',
                     'new': 'new.html'}
            linkType = 'text/html'
        else:
            links = {'alpha': 'alpha.xml',
                     'downloads': 'downloads.xml',
                     'new': 'new'}
            linkType = 'application/atom+xml'

        # (link key, title, urn suffix, description) per section.
        sections = [
            ('alpha', 'Alphabetical By Title', ':titles:all',
             'Alphabetical list of all titles.'),
            ('downloads', 'Most Downloaded Books', ':downloads',
             'The most downloaded books from the Internet Archive in the last month.'),
            ('new', 'Recent Scans', ':new',
             'Books most recently scanned by the Internet Archive.'),
        ]
        for key, title, suffix, description in sections:
            link = catalog.Link(url=links[key], type=linkType)
            entry = catalog.Entry({'title': title,
                                   'urn': pubInfo['urnroot'] + suffix,
                                   'updated': datestr,
                                   'content': description},
                                  links=(link,))
            c.addEntry(entry)

        c.addOpenSearch(catalog.OpenSearch(
            'http://bookserver.archive.org/catalog/opensearch.xml'))

        if mode == 'html':
            renderer = output.ArchiveCatalogToHtml(c, device=getDevice())
            web.header('Content-Type', 'text/html')
        else:
            renderer = output.CatalogToAtom(c)
            web.header('Content-Type', pubInfo['mimetype'])
        return renderer.toString()