key_map = {
    "publisher":  "citation_publisher",
    "abstract":   "description",
    "issue":      "citation_issue",
    "issn":       "citation.issn",
    "title":      "citation_title",
    "volume":     "citation_volume",
    "start_page": "citation_firstpage",
    "end_page":   "citation_lastpage",
}

doi = metaheaders.get_item("citation_doi")
if not doi:
    bail('Unable to find a DOI')
    sys.exit(0)
doi = doi.replace("doi:", "")

print "begin_tsv"
print "linkout\tDOI\t\t%s\t\t" % doi
print "type\tJOUR"
print "doi\t" + doi

for f, k in key_map.items():
    v = metaheaders.get_item(k)
    if not v:
        continue
    print "%s\t%s" % (f, v.strip())
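# For reference: MetaHeaders is CiteULike's helper for reading <meta> tags
# out of a fetched page. Its real implementation isn't shown in these
# snippets, so this is only a minimal sketch of the behaviour get_item()
# appears to rely on; the class name and internals here are assumptions,
# not the actual CiteULike code. It uses lxml, as the other snippets do.

import urllib2
import lxml.html

class MetaHeadersSketch:
    def __init__(self, url):
        page = urllib2.urlopen(url).read()
        self.root = lxml.html.document_fromstring(page)

    def get_item(self, name):
        # Return the content of the first matching
        # <meta name="..." content="..."> tag, or None.
        hits = self.root.xpath("//meta[@name=$name]/@content", name=name)
        return hits[0] if hits else None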
# key-value pairs in the url.
s = url.split("?")
url_head = s[0]
url_tail = "&".join(s[1:])

# Some IEEE URLs look like ./a/b?&param=value - we need to sort this out
if url_tail.startswith('&'):
    url_tail = url_tail[1:]
url = url_head + "?" + url_tail

try:
    ar_number = int(urlparams(url)["arnumber"])
except KeyError:
    bail("Couldn't find an 'arnumber' field in the URL")

metaheaders = metaheaders.MetaHeaders(
    "http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=%d" % ar_number)
root = metaheaders.root

abstract = ''
abstractDiv = root.xpath("//a[@name='Abstract']/../*/text()")
if abstractDiv:
    abstract = abstractDiv[0]
    abstract = re.sub(r"^Abstract\s*", "", abstract).strip()

#print etree.tostring(root, pretty_print=True)
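# urlparams() comes from CiteULike's private cultools module, which isn't
# included here. A minimal sketch of the behaviour these snippets rely on
# (each query key mapped to a single value), assuming stdlib parsing; the
# real helper may differ, and the arnumber value below is invented:

from urlparse import urlparse, parse_qs

def urlparams_sketch(url):
    # e.g. ".../freeabs_all.jsp?arnumber=4717905" -> {"arnumber": "4717905"}
    query = urlparse(url).query
    return dict((k, v[0]) for k, v in parse_qs(query).items())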
handler = urllib2.HTTPHandler(debuglevel=0)
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)

location = urllib2.urlopen(url)
# We may have followed redirects, esp. from springerlink.com
path = urlparse(location.geturl()).path
page = unicode(location.read().strip(), "utf8")
root = lxml.html.document_fromstring(page)

m = re.search(r"/([^/]+)/(10\.\d\d\d\d)(?:/|%2f)(.*)", path, re.I)
if not m:
    bail("Unrecognised URL %s - cannot extract a DOI" % url)
(atype, doi_pref, doi_suff) = m.groups()
doi = "%s/%s" % (doi_pref, doi_suff)

print "begin_tsv"
print "linkout\tSLINK2\t\t%s\t\t%s" % (atype, doi)
print "linkout\tDOI\t\t%s\t\t" % doi

for div in root.cssselect("div.abstract-content"):
    print "abstract\t%s" % div.xpath("string()").strip()
    # Pages sometimes carry abstracts in several languages, e.g.
    # http://link.springer.com/article/10.1007%2Fbf01975011
    # Let's assume the 1st one is English.
    break
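# A quick sanity check of the DOI-extraction regex above, run against the
# path of the sample SpringerLink URL from the comment (the regex accepts
# either "/" or a percent-encoded "%2F" between DOI prefix and suffix):

import re

sample_path = "/article/10.1007%2Fbf01975011"
m = re.search(r"/([^/]+)/(10\.\d\d\d\d)(?:/|%2f)(.*)", sample_path, re.I)
assert m.groups() == ("article", "10.1007", "bf01975011")
# -> atype "article", DOI "10.1007/bf01975011"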
socket.setdefaulttimeout(15)
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Read URL from stdin
url = sys.stdin.readline().strip()
u = urlparse(url)
# urlparse('http://www.cwi.nl:80/%7Eguido/Python.html')
# ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',
#             params='', query='', fragment='')

q = parse_qs(u.query)
if "articleid" in q:
    article_id = q["articleid"][0]
else:
    bail("Could not determine the articleId")

# http://proceedings.spiedigitallibrary.org/downloadCitation.aspx?format=ris&articleid=763979
ris_file_url = "http://%s/downloadCitation.aspx?format=ris&articleid=%s" % (
    u.netloc, article_id)

cookie_jar = cookielib.CookieJar()
handlers = []
handlers.append(urllib2.HTTPHandler(debuglevel=0))
handlers.append(urllib2.HTTPCookieProcessor(cookie_jar))
opener = urllib2.build_opener(*handlers)
opener.addheaders = [("User-Agent", "CiteULike/1.0 +http://www.citeulike.org/")]
urllib2.install_opener(opener)

try:
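# Why q["articleid"][0]: parse_qs maps every key to a *list* of values,
# because a key can repeat in a query string. For the query part of the
# sample URL in the comment above:

from urlparse import parse_qs

q_demo = parse_qs("format=ris&articleid=763979")
assert q_demo == {"format": ["ris"], "articleid": ["763979"]}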
import re, sys, urlparse, urllib2
from cultools import urlparams, bail
import socket

socket.setdefaulttimeout(15)

#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()

#
# Fetch the page - don't need it, but it validates the URL the user posted
#
try:
    page = urllib2.urlopen(url).read().strip()
except:
    bail("Couldn't fetch page (" + url + ")")

isbn = ""
m = re.search(r'isbn=(\w+)', page)
if m:
    isbn = m.group(1)
else:
    bail("Couldn't find an ISBN in that page")

print "status\tredirect\thttp://www.worldcat.org/isbn/%s" % isbn
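# The ISBN scrape above just greps the raw HTML for an "isbn=..." substring.
# A minimal illustration with a made-up page fragment (real retailer markup
# will differ, and the ISBN shown is invented):

import re

fragment = '<a href="/search?isbn=0596158068">Paperback</a>'
m = re.search(r'isbn=(\w+)', fragment)
assert m and m.group(1) == "0596158068"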
import re, sys, urlparse, urllib2
from cultools import urlparams, bail
import socket

socket.setdefaulttimeout(15)

#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()

oclc_match = re.search(r'/(oclc|isbn)/([0-9\-]+)', url, re.IGNORECASE)
if not oclc_match:
    bail("Couldn't find either an 'oclc' or 'isbn' in the URL (" + url + ")")
type = oclc_match.group(1)
id = oclc_match.group(2)

#
# Fetch the page - don't need it, but it validates the URL the user posted
#
try:
    page = urllib2.urlopen(url).read().strip()
except:
    bail("Couldn't fetch page (" + url + ")")

if type == "isbn":
    isbn = id
    m = re.search(r'/oclc/(\d+)', page)
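# The URL shapes that regex accepts, checked against two made-up but
# plausible WorldCat URLs (both record numbers are invented):

import re

for u, want in [
    ("http://www.worldcat.org/oclc/42320675", ("oclc", "42320675")),
    ("http://www.worldcat.org/isbn/9780596158064", ("isbn", "9780596158064")),
]:
    m = re.search(r'/(oclc|isbn)/([0-9\-]+)', u, re.IGNORECASE)
    assert m.groups() == want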
import sys, urlparse, urllib2
from cultools import urlparams, bail
import socket

socket.setdefaulttimeout(15)

#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()
url_host = urlparse.urlparse(url)[1]

if url_host in ['www.envplan.com', 'www.perceptionweb.com']:
    linkout = 'PION'
else:
    bail("Unrecognised site: " + url_host + " - does the plugin need updating?")

try:
    id = urlparams(url)["id"]
except:
    bail("Couldn't find an 'id' field in the URL (" + url + ")")

#
# Fetch the page
#
try:
    page = urllib2.urlopen(url).read().strip()
except:
    bail("Couldn't fetch page (" + url + ")")

#
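# urlparse.urlparse() returns a 6-tuple (scheme, netloc, path, params,
# query, fragment), so index [1] is the host. Checked with a made-up
# Pion-style URL (the id value is invented):

import urlparse

sample = "http://www.envplan.com/abstract.cgi?id=a12345"
assert urlparse.urlparse(sample)[1] == "www.envplan.com"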
if authors:
    for a in authors:
        print "author\t%s" % a

metaheaders.print_item("title", "citation_title")
metaheaders.print_date("citation_publication_date")
metaheaders.print_item("volume", "citation_volume")
metaheaders.print_item("start_page", "citation_firstpage")
metaheaders.print_item("end_page", "citation_lastpage")
metaheaders.print_item("issue", "citation_issue")
metaheaders.print_item("serial", "citation.issn")

publisher = metaheaders.get_item("citation_publisher")
if publisher:
    print "publisher\t%s" % publisher.strip()

metaheaders.print_item("abstract", "description")
metaheaders.print_item("journal", "citation_journal_title")
metaheaders.print_item("title_secondary", "citation_conference")

doi = metaheaders.get_item("citation_doi")
if doi:
    doi = doi.replace("doi:", "")
    print "doi\t%s" % doi
    print "linkout\tDOI\t\t%s\t\t" % doi
    print "linkout\tCSDL\t\t%s\t\t" % doi
else:
    bail("Couldn't find a DOI")

print "end_tsv"
print "status\tok"
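# For reference, the stdout these scripts emit is a simple tab-separated
# protocol that CiteULike parses. With invented values filled in, a
# successful run of the plugin above would print something like (<TAB>
# stands for a literal tab character):
#
#   begin_tsv
#   author<TAB>A. N. Author
#   title<TAB>An Example Title
#   doi<TAB>10.1109/example.2009.99
#   linkout<TAB>DOI<TAB><TAB>10.1109/example.2009.99<TAB><TAB>
#   linkout<TAB>CSDL<TAB><TAB>10.1109/example.2009.99<TAB><TAB>
#   end_tsv
#   status<TAB>ok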