Example #1
def rss2translate(url=None, format=None):
    """Convert RSS 2.0 feed to Atom or RSS 1.0
    
    Sample request:
    * curl "http://localhost:8880/akara.rss2translate?url=http://feeds.delicious.com/v2/rss/recent"

    This is a demo and is not meant as an industrial-strength converter.
    """
    # Support content negotiation (HTTP Accept header) in addition to the 'format' query parameter
    if not format:
        accepted_imts = request.environ.get('HTTP_ACCEPT', '').split(',')
        imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
        if imt == RDF_IMT:
            format = 'rss1'
        else:
            format = 'atom'
    
    if not url:
        raise AssertionError("The 'url' query parameter is mandatory.")

    import feedparser # From http://www.feedparser.org/
    feed = feedparser.parse(url)
    
    # Note: bad URLs might mean the feed doesn't have headers
    logger.debug('Feed info: ' + repr((url, feed.version, feed.encoding, feed.headers.get('Content-type'))))

    updated = getattr(feed.feed, 'updated_parsed', None)
    if updated:
        # updated_parsed is a time.struct_time; only its first six fields
        # (year through second) are valid datetime constructor arguments
        updated = datetime(*updated[:6]).isoformat()
    
    f = atomtools.feed(title=feed.feed.title, updated=updated, id=feed.feed.link)
    for e in feed.entries:
        updated = getattr(e, 'updated_parsed', None)
        if updated:
            # As above: struct_time field 7 is the weekday, not microseconds
            updated = datetime(*updated[:6]).isoformat()
        links = [
            #FIXME: self?
            (e.link, u'alternate'),
        ]
        f.append(
            e.link,
            e.title,
            updated = updated,
            summary=e.description,
            #e.author_detail.name
            #authors=authors,
            links=links,
        )

    if format == 'atom':
        result = f.xml_encode()
        response.add_header("Content-Type", ATOM_IMT)
    else:
        result = f.rss1format()
        response.add_header("Content-Type", RDF_IMT)
    return result
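A note on the updated_parsed conversion above: feedparser hands back a time.struct_time, whose seventh field is the weekday rather than microseconds, so only the first six fields belong in the datetime constructor. A minimal standalone sketch, using only the standard library:

# Sketch: converting a feedparser-style time.struct_time to ISO 8601,
# as rss2translate does for feed and entry timestamps.
import time
from datetime import datetime

parsed = time.strptime('2010-06-01 12:30:00', '%Y-%m-%d %H:%M:%S')
print datetime(*parsed[:6]).isoformat()   # 2010-06-01T12:30:00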
Example #2
def atomize_oai_record(endpoint=None, id=None):
    '''
    endpoint - the OAI request URL, e.g. http://dspace.mit.edu/oai/request
    id, e.g. the article ID, e.g. oai:dspace.mit.edu:1721.1/5451
    
    Sample request:
    curl "http://localhost:8880/akara.oai.atom?endpoint=http://dspace.mit.edu/oai/request&id=oai:dspace.mit.edu:1721.1/5451"
    '''
    if endpoint is None:
        raise ValueError('endpoint required')
    if id is None:
        raise ValueError('id required')
    qstr = urllib.urlencode({
        'verb': 'GetRecord',
        'metadataPrefix': 'oai_dc',
        'identifier': id
    })
    url = endpoint + '?' + qstr
    doc = bindery.parse(url, model=OAI_MODEL)
    resources = metadata_dict(generate_metadata(doc))
    #print resources
    f = feed(ATOM_ENVELOPE)
    #f = feed(ATOM_ENVELOPE, title=resources['title'], id=resources['id'])
    #f.source.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'self', u'type': u'application/atom+xml', u'href': self_link.decode('utf-8')}))
    #f.source.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'search', u'type': u'application/opensearchdescription+xml', u'href': u'http://kds-kci.zepheira.com/sciencedirect.discovery'}))
    #f.source.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'alternate', u'type': u'text/xml', u'href': alt_link.decode('utf-8')}))
    #f.source.feed.xml_append(E((OPENSEARCH_NAMESPACE, u'Query'), {u'role': u'request', u'searchTerms': search_terms.decode('utf-8')}))
    #maxarticles = DEFAULT_MAX_RESULTS
    maxarticles = 3
    for record in islice(doc.OAI_PMH, 0, maxarticles):
        resource = resources[id]
        #print resource
        authors = [ (a, None, None) for a in resource[u'creator'] ]
        links = [
            (unicode(resource['handle']), u'alternate'),
        ]
        #categories = [ (unicode(k), SD_NS+u'authorKeyword') for k in authkw(article) ]
        #elements = [
        #    E((SD_NS, u'sd:journal-cover'), unicode(article.journalCover).strip() if hasattr(article, 'journalCover') else DEFAULT_ICON),
        #    E((SD_NS, u'sd:journal-name'), unicode(article.journalName)),
        #]
        f.append(
            id,
            unicode(resource['title'][0]),
            updated=unicode(resource['date'][0]),
            summary=unicode(resource['description'][0]),
            authors=authors,
            links=links,
            #categories=categories,
            #elements=elements,
        )

    return f.source.xml_encode('xml-indent')
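The GetRecord URL assembled above can be previewed on its own; a minimal sketch using only the standard library (query parameter order may vary):

# Sketch: the OAI-PMH GetRecord URL as built by atomize_oai_record.
import urllib

qstr = urllib.urlencode({
    'verb': 'GetRecord',
    'metadataPrefix': 'oai_dc',
    'identifier': 'oai:dspace.mit.edu:1721.1/5451',
})
print 'http://dspace.mit.edu/oai/request' + '?' + qstr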
Example #4
def run(input=None, outfullhtml=None, outfulljson=None, outchoicehtml=None):
    # Load the publications Atom feed
    pubfeed = atomtools.feed('http://uche.ogbuji.net/publications', input)

    # Coroutine-style writer for the full listing of articles
    output = structwriter(stream=outfullhtml, indent=u"yes")
    fullfeed = output.cofeed(ROOT(E_CURSOR(u'div', {u'class': u'articles'})))

    # A second writer for the selected-articles listing
    output = structwriter(stream=outchoicehtml, indent=u"yes")
    choicefeed = output.cofeed(ROOT(E_CURSOR(u'div', {u'class': u'articles'})))

    # The handler dispatches each entry to both cofeeds
    h = event_handler(fullfeed, choicefeed)

    for e in list(pubfeed.feed.entry):
        h.execute(e)

    fullfeed.close()
    choicefeed.close()
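The cofeed pattern above can be exercised in isolation; a minimal sketch, assuming Amara 2.x's amara.writers.struct API, in which a cofeed accepts child elements via send() until it is closed:

# Sketch: a structwriter cofeed writing two articles into an open div.
import sys
from amara.writers.struct import structwriter, E, E_CURSOR, ROOT

output = structwriter(stream=sys.stdout, indent=u"yes")
fl = output.cofeed(ROOT(E_CURSOR(u'div', {u'class': u'articles'})))
fl.send(E(u'article', E(u'h2', u'First title')))
fl.send(E(u'article', E(u'h2', u'Second title')))
fl.close()   # closes the div element and flushes the writer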
Example #5
def atom_results(doc, metadata, self_link, alt_link, search_terms):
    f = feed(ATOM_ENVELOPE, title=search_terms.decode('utf-8'), id=self_link.decode('utf-8'))
    #f.feed.update = self_link.decode('utf-8')
    f.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'self', u'type': u'application/atom+xml', u'href': self_link.decode('utf-8')}))
    f.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'search', u'type': u'application/opensearchdescription+xml', u'href': OSCI_BASE + u'/content/pubmed.discovery'}))
    f.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'alternate', u'type': u'text/xml', u'href': alt_link.decode('utf-8')}))
    f.feed.xml_append(E((OPENSEARCH_NAMESPACE, u'Query'), {u'role': u'request', u'searchTerms': search_terms.decode('utf-8')}))
    #amara.xml_print(doc, indent=True)
    for aid in islice(doc.PubmedArticleSet.xml_select(u"PubmedArticle/MedlineCitation/PMID"), 0, DEFAULT_MAX_RESULTS):
        #print >> sys.stderr, metadata
        #if u'ArticleTitle' not in resource:
        #    continue
        resource = metadata[unicode(aid)]
        try:
            authors = [
                (u'%s, %s, %s' % (U(metadata[a][u'LastName']),
                                  U(metadata[a].get(u'FirstName', u'')),
                                  U(metadata[a][u'Initials'])),
                 None, None)
                for a in resource.get(u'Author', [])
            ]
        except:
            authors = []
        links = [
            (PUBMED_ID_BASE + unicode(aid), u'self'),
            (NCBI_HTML_ARTICLE_LINK_BASE + unicode(aid), u'alternate'),
        ]
        #categories = [ (unicode(k), SD_NS+u'authorKeyword') for k in authkw(article) ]
        elements = [
            E((ATOM_NAMESPACE, u'content'), {u'src': NCBI_HTML_ARTICLE_LINK_BASE + unicode(aid)}),
        #    E((SD_NS, u'sd:journal-cover'), unicode(article.journalCover).strip() if hasattr(article, 'journalCover') else DEFAULT_ICON),
        #    E((SD_NS, u'sd:journal-name'), unicode(article.journalName)),
        ]
        #logger.debug(repr((aid, resource.keys(), resource[u'DateCreated'][0])))
        #if u'ArticleId:doi' in resource and U(resource[u'ArticleId:doi']):
        id_uri = u'doi:' + U(resource[u'ArticleId:doi']) if resource.get(u'ArticleId:doi') else PUBMED_ID_BASE + unicode(aid)
        f.append(
            id_uri,
            U(resource[u'ArticleTitle']),
            updated=datetime.datetime(*(int(bit) for bit in U(resource[u'DateCreated']).split('/'))).isoformat(),
            summary=U(resource.get(u'AbstractText', [])),
            authors=authors,
            links=links,
            #categories=categories,
            elements=elements,
        )
        #print >> sys.stderr, article.xml_select(u'//*[contains(name(), "journal")]')
        #entry['journal_cover'] = 

    #FIXME: indent
    return f.xml_encode()
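The updated timestamp above is derived from PubMed's DateCreated, a u'YYYY/MM/DD' string; the conversion is easy to check standalone:

# Sketch: PubMed DateCreated string to ISO 8601, as in atom_results.
import datetime

created = u'2008/02/11'
print datetime.datetime(*(int(bit) for bit in created.split('/'))).isoformat()
# 2008-02-11T00:00:00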
Example #6
def dspace_adapter(search=None, id=None):
    '''
    Sample queries:
    curl "http://*****:*****@class="result_table"]//*[@class="article_title"]'):
        for li in islice(doc.xml_select(u'//*[@id="'+RESULTS_DIV+'"]//*[@class="artifact-description"]/..'), 0, maxarticles):
            row = li.xml_parent.xml_parent
            title = li.xml_select(u'.//*[@class="artifact-title"]')[0]
            rel_id = title.a.href.partition(u'/handle/')[2]
            dspace_id = DSPACE_ID_BASE + rel_id
            alt_link = DSPACE_ARTICLE_BASE + u'1721.1/7488'
            #Do not quote.  DSpace doesn't like that
            #alt_link = DSPACE_ARTICLE_BASE + urllib.quote(u'1721.1/7488', '')
            title = unicode(title)
            summary = unicode(row.xml_select(u'string(.//*[@class="summary"])'))
            updated = unicode(row.xml_select(u'string(.//*[@class="date"])')).strip().partition(u'Published: ')[2]
            #updated = time.strptime(updated, "%m/%d/%Y %H:%M:%S") #2/11/2008 2:20:00 AM
            authors = [ (name.strip(), None, None) for name in unicode(row.xml_select(u'string(.//*[@class="author"]//b)')).split(';') ]

            #Retrieve the DSpace page
            qstr = urllib.urlencode({'verb' : 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': dspace_id})
            url = DSPACE_OAI_ENDPOINT + '?' + qstr
            print >> sys.stderr, url
            #keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]

            doc = bindery.parse(url, model=OAI_MODEL)
            #print >> sys.stderr, list(generate_metadata(doc))
            resources, first_id = metadata_dict(generate_metadata(doc))
            record = doc.OAI_PMH

            resource = resources[first_id]

            authors = [ (a, None, None) for a in resource[u'creator'] ]
            links = [
                (DSPACE_ARTICLE_BASE + rel_id, u'alternate'),
                (u'dspace?id=' + dspace_id, u'self'),
            ]
            elements = [
                E((ATOM_NAMESPACE, u'content'), {u'src': alt_link}),
            ]
            f.append(
                dspace_id,
                U(resource['title']),
                updated=U(resource['date']),
                summary=U(resource['description']),
                authors=authors,
                links=links,
                #categories=categories,
                elements=elements,
            )

        #FIXME: indent
        return f.xml_encode()
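The handle extraction above hinges on unicode.partition; a minimal sketch with a hypothetical article URL:

# Sketch: pulling the relative handle id out of a DSpace article URL.
href = u'http://dspace.mit.edu/handle/1721.1/7488'
rel_id = href.partition(u'/handle/')[2]
print rel_id   # 1721.1/7488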
Example #7
            links.remove(link)
    if not links:
        return ()
    return E(
        u"div",
        {u"class": u"seealso-wrapper"},
        u"See also:",
        E(
            u"ul",
            {u"class": u"seealso"},
            (E(u"li", {u"class": U(link.rel)}, U(link.href)) for link in links if None not in (link.rel, link.href)),
        ),
    )


pubfeed = atomtools.feed("http://uche.ogbuji.net/publications", sys.argv[1])

w = structwriter(indent=u"yes").feed(
    ROOT(
        E(
            u"div",
            {u"class": u"articles"},
            (
                E(
                    u"article",
                    E(u"h2", (E(u"a", {u"href": main_link(e)}, U(e.title)) if main_link(e) else E(u"a", U(e.title)))),
                    (E(u"h3", U(subtitle)) for subtitle in (e.subtitle or []) if U(subtitle).strip()),
                    (
                        E(
                            u"div",
                            {u"class": u"author"},
Example #8
def jove_adapter(search=None, id=None):
    '''
    Sample queries:
    curl "http://*****:*****@class="result_table"]//*[@class="article_title"]'):
        for item in islice(doc.xml_select(u'//*[@class="result_table"]//*[@class="article_title"]'), 0, maxarticles):
            row = item.xml_parent.xml_parent
            title = unicode(item)
            alt_link = item.a.href
            summary = unicode(row.xml_select(u'string(.//*[@class="summary"])'))
            updated = unicode(row.xml_select(u'string(.//*[@class="publication_date"])')).strip().partition(u'Published: ')[2]
            #updated = time.strptime(updated, "%m/%d/%Y %H:%M:%S") #2/11/2008 2:20:00 AM
            authors = [ (name.strip(), None, None) for name in unicode(row.xml_select(u'string(.//*[@class="authors"]//b)')).split(',') ]
            keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]
            icon = first_item(row.xml_select(u'.//*[@class="thumbnail"]')).img.src
            icon = ''.join(icon.split())
            jove_id = item.a.href[len(JOVE_ARTICLE):]

            links = [
                (JOVE_ADAPTER_BASE + '?id=' + jove_id, u'self'),
                (icon, u'icon'),
                #(NCBI_HTML_ARTICLE_LINK_BASE + unicode(aid), u'alternate'),
            ]
            #print >> sys.stderr, links
            #categories = [ (unicode(k), SD_NS+u'authorKeyword') for k in authkw(article) ]
            elements = [
                E((ATOM_NAMESPACE, u'content'), {u'src': item.a.href}),
            #    E((SD_NS, u'sd:journal-cover'), unicode(article.journalCover).strip() if hasattr(article, 'journalCover') else DEFAULT_ICON),
            #    E((SD_NS, u'sd:journal-name'), unicode(article.journalName)),
            ]
            elements.extend([
#                E((ATOM_NAMESPACE, u'link'), {u'rel': u'self', u'href': JOVE_ADAPTER_BASE + '/?id=' + jove_id}),
                E((ATOM_NAMESPACE, u'link'), {u'rel': u'icon', u'href': icon}),
            ])
            f.append(
                item.a.href,
                title,
                updated=datetime.datetime.now().isoformat(),
                summary=summary,
                authors=authors,
                links=links,
                categories=keywords,
                elements=elements,
            )
            #print >> sys.stderr, article.xml_select(u'//*[contains(name(), "journal")]')
            #entry['journal_cover'] = 

        for e in f.feed.entry:
            #FIXME: jove_id is whatever the last loop iteration left behind; each entry should be cached under its own id
            ENTRY_CACHE[jove_id] = e.xml_encode()
        #FIXME: indent
        return f.xml_encode()
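The icon cleanup above, ''.join(icon.split()), strips all embedded whitespace, including newlines and indentation picked up from the scraped HTML; a minimal sketch with a hypothetical URL:

# Sketch: removing whitespace that HTML extraction leaves inside a URL.
icon = u'http://www.jove.com/\n        images/thumb.jpg'
print ''.join(icon.split())   # http://www.jove.com/images/thumb.jpg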