def rss2translate(url=None, format=None):
    """Convert RSS 2.0 feed to Atom or RSS 1.0

    Sample request:
    * curl "http://localhost:8880/akara.rss2translate?url=http://feeds.delicious.com/v2/rss/recent"

    This is a demo and is not meant as an industrial-strength converter.
    """
    # Support content negotiation in addition to the query parameter
    if not format:
        accepted_imts = request.environ.get('HTTP_ACCEPT', '').split(',')
        imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
        if imt == RDF_IMT:
            format = 'rss1'
        else:
            format = 'atom'

    if not url:
        raise AssertionError("The 'url' query parameter is mandatory.")

    import feedparser # From http://www.feedparser.org/
    feed = feedparser.parse(url)
    # Note: bad URLs might mean the feed doesn't have headers
    logger.debug('Feed info: ' + repr((url, feed.version, feed.encoding, feed.headers.get('Content-type'))))

    updated = getattr(feed.feed, 'updated_parsed', None)
    if updated:
        # updated_parsed is a 9-field struct_time; the first six fields map
        # onto datetime's year..second arguments
        updated = datetime(*updated[:6]).isoformat()

    f = atomtools.feed(title=feed.feed.title, updated=updated, id=feed.feed.link)
    for e in feed.entries:
        updated = getattr(e, 'updated_parsed', None)
        if updated:
            updated = datetime(*updated[:6]).isoformat()
        links = [
            #FIXME: self?
            (e.link, u'alternate'),
        ]
        f.append(
            e.link,
            e.title,
            updated=updated,
            summary=e.description,
            #e.author_detail.name
            #authors=authors,
            links=links,
        )

    if format == 'atom':
        result = f.xml_encode()
        response.add_header("Content-Type", ATOM_IMT)
    else:
        result = f.rss1format()
        response.add_header("Content-Type", RDF_IMT)
    return result
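
# Illustrative sketch (not part of the original module): how the format
# negotiation above decides between RSS 1.0 and Atom. RDF_IMT is assumed to be
# a module-level media-type constant; the default value below is a placeholder
# for illustration only.
def _demo_negotiate_format(accept_header, rdf_imt='application/rdf+xml'):
    from itertools import dropwhile
    # Take the first acceptable media type that is not a wildcard
    accepted_imts = [imt.strip() for imt in accept_header.split(',')]
    imt = next(dropwhile(lambda x: '*' in x, accepted_imts), None)
    return 'rss1' if imt == rdf_imt else 'atom'

# _demo_negotiate_format('*/*, application/rdf+xml')  -> 'rss1'
# _demo_negotiate_format('application/atom+xml')      -> 'atom'
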
def atomize_oai_record(endpoint=None, id=None):
    '''
    endpoint - the OAI request URL, e.g. http://dspace.mit.edu/oai/request
    id - the article ID, e.g. oai:dspace.mit.edu:1721.1/5451

    Sample request:
    curl "http://localhost:8880/akara.oai.atom?endpoint=http://dspace.mit.edu/oai/request&id=oai:dspace.mit.edu:1721.1/5451"
    '''
    if endpoint is None:
        raise ValueError('endpoint required')
    if id is None:
        raise ValueError('id required')
    qstr = urllib.urlencode({
        'verb': 'GetRecord',
        'metadataPrefix': 'oai_dc',
        'identifier': id,
    })
    url = endpoint + '?' + qstr
    doc = bindery.parse(url, model=OAI_MODEL)
    resources = metadata_dict(generate_metadata(doc))
    #print resources
    f = feed(ATOM_ENVELOPE)
    #f = feed(ATOM_ENVELOPE, title=resources['title'], id=resources['id'])
    #f.source.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'self', u'type': u'application/atom+xml', u'href': self_link.decode('utf-8')}))
    #f.source.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'search', u'type': u'application/opensearchdescription+xml', u'href': u'http://kds-kci.zepheira.com/sciencedirect.discovery'}))
    #f.source.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'alternate', u'type': u'text/xml', u'href': alt_link.decode('utf-8')}))
    #f.source.feed.xml_append(E((OPENSEARCH_NAMESPACE, u'Query'), {u'role': u'request', u'searchTerms': search_terms.decode('utf-8')}))
    #maxarticles = DEFAULT_MAX_RESULTS
    maxarticles = 3
    for record in islice(doc.OAI_PMH, 0, maxarticles):
        resource = resources[id]
        #print resource
        authors = [ (a, None, None) for a in resource[u'creator'] ]
        links = [
            (unicode(resource['handle']), u'alternate'),
        ]
        #categories = [ (unicode(k), SD_NS+u'authorKeyword') for k in authkw(article) ]
        #elements = [
        #    E((SD_NS, u'sd:journal-cover'), unicode(article.journalCover).strip() if hasattr(article, 'journalCover') else DEFAULT_ICON),
        #    E((SD_NS, u'sd:journal-name'), unicode(article.journalName)),
        #]
        f.append(
            id,
            unicode(resource['title'][0]),
            updated=unicode(resource['date'][0]),
            summary=unicode(resource['description'][0]),
            authors=authors,
            links=links,
            #categories=categories,
            #elements=elements,
        )
    return f.source.xml_encode('xml-indent')
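
# Illustrative sketch (not part of the original module): the OAI-PMH GetRecord
# URL that atomize_oai_record() builds above, using only the Python 2 stdlib.
# The endpoint and identifier are the sample values from the docstring; the
# query parameters are passed as a sequence of pairs so their order is stable.
def _demo_oai_getrecord_url(endpoint='http://dspace.mit.edu/oai/request',
                            identifier='oai:dspace.mit.edu:1721.1/5451'):
    import urllib
    qstr = urllib.urlencode([
        ('verb', 'GetRecord'),
        ('metadataPrefix', 'oai_dc'),
        ('identifier', identifier),
    ])
    return endpoint + '?' + qstr

# _demo_oai_getrecord_url() ->
# 'http://dspace.mit.edu/oai/request?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai%3Adspace.mit.edu%3A1721.1%2F5451'
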
def run(input=None, outfullhtml=None, outfulljson=None, outchoicehtml=None):
    pubfeed = atomtools.feed('http://uche.ogbuji.net/publications', input)

    output = structwriter(stream=outfullhtml, indent=u"yes")
    fullfeed = output.cofeed(ROOT(E_CURSOR(u'div', {u'class': u'articles'})))
    output = structwriter(stream=outchoicehtml, indent=u"yes")
    choicefeed = output.cofeed(ROOT(E_CURSOR(u'div', {u'class': u'articles'})))

    h = event_handler(fullfeed, choicefeed)
    for e in list(pubfeed.feed.entry):
        h.execute(e)
    fullfeed.close()
    choicefeed.close()
def atom_results(doc, metadata, self_link, alt_link, search_terms):
    f = feed(ATOM_ENVELOPE, title=search_terms.decode('utf-8'), id=self_link.decode('utf-8'))
    #f.feed.update = self_link.decode('utf-8')
    f.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'self', u'type': u'application/atom+xml', u'href': self_link.decode('utf-8')}))
    f.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'search', u'type': u'application/opensearchdescription+xml', u'href': OSCI_BASE + u'/content/pubmed.discovery'}))
    f.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'alternate', u'type': u'text/xml', u'href': alt_link.decode('utf-8')}))
    f.feed.xml_append(E((OPENSEARCH_NAMESPACE, u'Query'), {u'role': u'request', u'searchTerms': search_terms.decode('utf-8')}))
    #amara.xml_print(doc, indent=True)
    for aid in islice(doc.PubmedArticleSet.xml_select(u"PubmedArticle/MedlineCitation/PMID"), 0, DEFAULT_MAX_RESULTS):
        #print >> sys.stderr, metadata
        #if u'ArticleTitle' not in resource:
        #    continue
        resource = metadata[unicode(aid)]
        try:
            authors = [
                (u'%s, %s, %s' % (U(metadata[a][u'LastName']), U(metadata[a].get(u'FirstName', u'')), U(metadata[a][u'Initials'])), None, None)
                for a in resource.get(u'Author', [])
            ]
        except:
            authors = []
        links = [
            (PUBMED_ID_BASE + unicode(aid), u'self'),
            (NCBI_HTML_ARTICLE_LINK_BASE + unicode(aid), u'alternate'),
        ]
        #categories = [ (unicode(k), SD_NS+u'authorKeyword') for k in authkw(article) ]
        elements = [
            E((ATOM_NAMESPACE, u'content'), {u'src': NCBI_HTML_ARTICLE_LINK_BASE + unicode(aid)}),
            #E((SD_NS, u'sd:journal-cover'), unicode(article.journalCover).strip() if hasattr(article, 'journalCover') else DEFAULT_ICON),
            #E((SD_NS, u'sd:journal-name'), unicode(article.journalName)),
        ]
        #logger.debug(repr((aid, resource.keys(), resource[u'DateCreated'][0])))
        #if u'ArticleId:doi' in resource and U(resource[u'ArticleId:doi']):
        id_uri = u'doi:' + U(resource[u'ArticleId:doi']) if resource.get(u'ArticleId:doi') else PUBMED_ID_BASE + unicode(aid)
        f.append(
            id_uri,
            U(resource[u'ArticleTitle']),
            updated=datetime.datetime(*(int(bit) for bit in U(resource[u'DateCreated']).split('/'))).isoformat(),
            summary=U(resource.get(u'AbstractText', [])),
            authors=authors,
            links=links,
            #categories=categories,
            elements=elements,
        )
        #print >> sys.stderr, article.xml_select(u'//*[contains(name(), "journal")]')
        #entry['journal_cover'] =
    #FIXME: indent
    return f.xml_encode()
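
# Illustrative sketch (not part of the original module): the DateCreated
# conversion used in atom_results() above assumes a 'YYYY/MM/DD' string and
# turns it into an ISO 8601 timestamp. The sample value is hypothetical.
def _demo_datecreated_to_iso(date_created=u'2008/02/11'):
    import datetime
    return datetime.datetime(*(int(bit) for bit in date_created.split('/'))).isoformat()

# _demo_datecreated_to_iso() -> '2008-02-11T00:00:00'
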
def dspace_adapter(search=None, id=None):
    '''
    Sample queries:
    curl "http://*****:*****@class="result_table"]//*[@class="article_title"]'):
    '''
    for li in islice(doc.xml_select(u'//*[@id="'+RESULTS_DIV+'"]//*[@class="artifact-description"]/..'), 0, maxarticles):
        row = li.xml_parent.xml_parent
        title = li.xml_select(u'.//*[@class="artifact-title"]')[0]
        rel_id = title.a.href.partition(u'/handle/')[2]
        dspace_id = DSPACE_ID_BASE + rel_id
        alt_link = DSPACE_ARTICLE_BASE + u'1721.1/7488'
        #Do not quote. DSpace doesn't like that
        #alt_link = DSPACE_ARTICLE_BASE + urllib.quote(u'1721.1/7488', '')
        title = unicode(title)
        summary = unicode(row.xml_select(u'string(.//*[@class="summary"])'))
        updated = unicode(row.xml_select(u'string(.//*[@class="date"])')).strip().partition(u'Published: ')[2]
        #updated = time.strptime(updated, "%m/%d/%Y %H:%M:%S") #2/11/2008 2:20:00 AM
        authors = [ (name.strip(), None, None) for name in unicode(row.xml_select(u'string(.//*[@class="author"]//b)')).split(';') ]

        #Retrieve the DSpace page
        qstr = urllib.urlencode({'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': dspace_id})
        url = DSPACE_OAI_ENDPOINT + '?' + qstr
        print >> sys.stderr, url
        #keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]
        doc = bindery.parse(url, model=OAI_MODEL)
        #print >> sys.stderr, list(generate_metadata(doc))
        resources, first_id = metadata_dict(generate_metadata(doc))
        record = doc.OAI_PMH
        resource = resources[first_id]

        authors = [ (a, None, None) for a in resource[u'creator'] ]
        links = [
            (DSPACE_ARTICLE_BASE + rel_id, u'alternate'),
            (u'dspace?id=' + dspace_id, u'self'),
        ]
        elements = [
            E((ATOM_NAMESPACE, u'content'), {u'src': alt_link}),
        ]
        f.append(
            dspace_id,
            U(resource['title']),
            updated=U(resource['date']),
            summary=U(resource['description']),
            authors=authors,
            links=links,
            #categories=categories,
            elements=elements,
        )
    #FIXME: indent
    return f.xml_encode()
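
# Illustrative sketch (not part of the original module): the str.partition()
# idiom used in dspace_adapter() above. Everything after the first '/handle/'
# becomes the relative DSpace identifier, and everything after 'Published: '
# becomes the raw date string. The sample values are hypothetical.
def _demo_partition_extracts():
    href = u'http://dspace.mit.edu/handle/1721.1/7488'
    rel_id = href.partition(u'/handle/')[2]                    # u'1721.1/7488'
    date_text = u'  Published: 2/11/2008 2:20:00 AM  '
    updated = date_text.strip().partition(u'Published: ')[2]   # u'2/11/2008 2:20:00 AM'
    return rel_id, updated
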
    links.remove(link)
    if not links:
        return ()
    return E(
        u"div", {u"class": u"seealso-wrapper"},
        u"See also:",
        E(
            u"ul", {u"class": u"seealso"},
            (E(u"li", {u"class": U(link.rel)}, U(link.href))
             for link in links if None not in (link.rel, link.href)),
        ),
    )


pubfeed = atomtools.feed("http://uche.ogbuji.net/publications", sys.argv[1])

w = structwriter(indent=u"yes").feed(
    ROOT(
        E(
            u"div", {u"class": u"articles"},
            (
                E(
                    u"article",
                    E(u"h2", (E(u"a", {u"href": main_link(e)}, U(e.title))
                              if main_link(e) else E(u"a", U(e.title)))),
                    (E(u"h3", U(subtitle)) for subtitle in (e.subtitle or []) if U(subtitle).strip()),
                    (
                        E(
                            u"div", {u"class": u"author"},
def jove_adapter(search=None, id=None):
    '''
    Sample queries:
    curl "http://*****:*****@class="result_table"]//*[@class="article_title"]'):
    '''
    for item in islice(doc.xml_select(u'//*[@class="result_table"]//*[@class="article_title"]'), 0, maxarticles):
        row = item.xml_parent.xml_parent
        title = unicode(item)
        alt_link = item.a.href
        summary = unicode(row.xml_select(u'string(.//*[@class="summary"])'))
        updated = unicode(row.xml_select(u'string(.//*[@class="publication_date"])')).strip().partition(u'Published: ')[2]
        #updated = time.strptime(updated, "%m/%d/%Y %H:%M:%S") #2/11/2008 2:20:00 AM
        authors = [ (name.strip(), None, None) for name in unicode(row.xml_select(u'string(.//*[@class="authors"]//b)')).split(',') ]
        keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]
        icon = first_item(row.xml_select(u'.//*[@class="thumbnail"]')).img.src
        icon = ''.join(icon.split())
        jove_id = item.a.href[len(JOVE_ARTICLE):]
        links = [
            (JOVE_ADAPTER_BASE + '?id=' + jove_id, u'self'),
            (icon, u'icon'),
            #(NCBI_HTML_ARTICLE_LINK_BASE + unicode(aid), u'alternate'),
        ]
        #print >> sys.stderr, links
        #categories = [ (unicode(k), SD_NS+u'authorKeyword') for k in authkw(article) ]
        elements = [
            E((ATOM_NAMESPACE, u'content'), {u'src': item.a.href}),
            #E((SD_NS, u'sd:journal-cover'), unicode(article.journalCover).strip() if hasattr(article, 'journalCover') else DEFAULT_ICON),
            #E((SD_NS, u'sd:journal-name'), unicode(article.journalName)),
        ]
        elements.extend([
            #E((ATOM_NAMESPACE, u'link'), {u'rel': u'self', u'href': JOVE_ADAPTER_BASE + '/?id=' + jove_id}),
            E((ATOM_NAMESPACE, u'link'), {u'rel': u'icon', u'href': icon}),
        ])
        f.append(
            item.a.href,
            title,
            updated=datetime.datetime.now().isoformat(),
            summary=summary,
            authors=authors,
            links=links,
            categories=keywords,
            elements=elements,
        )
        #print >> sys.stderr, article.xml_select(u'//*[contains(name(), "journal")]')
        #entry['journal_cover'] =
    for e in f.feed.entry:
        ENTRY_CACHE[jove_id] = e.xml_encode()
    #FIXME: indent
    return f.xml_encode()