def url_to_id(url, page):
    """Extract a (jstore_id, doi) pair for the article at *url*.

    Tries, in order: <meta scheme="..."> headers in the fetched *page*,
    a doi= query parameter in the URL, the legacy sici= URL form (which
    needs the page body), and finally DOI/ID patterns in the URL path.
    Either element of the returned tuple may be None when it cannot be
    determined; (None, None) means nothing was recognised.
    """
    # First try the meta headers embedded in the page itself.
    metaheaders = MetaHeaders(page=page, name='scheme')
    jstoreId = metaheaders.get_item("jstore-stable")
    doi = metaheaders.get_item("doi")
    if doi and jstoreId:
        print("doi=%s, id=%s" % (doi, jstoreId))
        return (jstoreId, doi)
    # If there's a doi=DOI in the URL then we'll have that.
    try:
        doi = urlparams(url)["doi"]
        # NOTE: "10\." is now escaped; the original "10." matched any
        # character after "10".
        m = re.search(r'(10\.\d\d\d\d/(\d+))', doi)
        if m:
            return (int(m.group(2)), m.group(1))
        m = re.search(r'(10\.\d\d\d\d/.+)', doi)
        if m:
            return (None, m.group(1))
    except KeyError:
        pass
    # If it's the old style SICI then, annoyingly, we'll need to fetch it.
    if 'sici=' in url:
        m = re.search(
            r'<a id="info" href="/stable/(\d+)">Article Information</a>',
            page)
        if m:
            return (int(m.group(1)), None)
        else:
            return (None, None)
    # Otherwise a path like /10.NNNN/123123 is a DOI carrying the
    # numeric jstore ID.
    m = re.search(r'/(10\.\d\d\d\d/(\d+))', url)
    if m:
        # BUG FIX: the original returned m.group(1) (the full DOI) in
        # the ID slot; the numeric sub-group is the jstore ID, matching
        # the doi= branch above.
        return (int(m.group(2)), m.group(1))
    # Sometimes there's a general DOI, no jstore ID.
    m = re.search(r'/(10\.\d\d\d\d/.+)', url)
    if m:
        return (None, m.group(1))
    # Plain old jstore ID, at least 4 digits.
    m = re.search(r'/(\d{4,})', url)
    if m:
        return (int(m.group(1)), None)
    return (None, None)
def url_to_id(url, page):
    """Extract a (jstore_id, doi) pair for the article at *url*.

    Resolution order: <meta scheme="..."> headers of *page*, a doi=
    query parameter, the legacy sici= form (which requires the page
    body), then DOI/ID patterns in the URL path.  Either tuple element
    may be None when it cannot be determined.
    """
    # First try the meta headers.
    metaheaders = MetaHeaders(page=page, name='scheme')
    jstoreId = metaheaders.get_item("jstore-stable")
    doi = metaheaders.get_item("doi")
    if doi and jstoreId:
        print("doi=%s, id=%s" % (doi, jstoreId))
        return (jstoreId, doi)
    # If there's a doi=DOI in the URL then we'll have that.
    try:
        doi = urlparams(url)["doi"]
        # "10\." is escaped here; the original "10." matched any char.
        m = re.search(r'(10\.\d\d\d\d/(\d+))', doi)
        if m:
            return (int(m.group(2)), m.group(1))
        m = re.search(r'(10\.\d\d\d\d/.+)', doi)
        if m:
            return (None, m.group(1))
    except KeyError:
        pass
    # If it's the old style SICI then, annoyingly, we'll need to fetch it.
    if 'sici=' in url:
        m = re.search(
            r'<a id="info" href="/stable/(\d+)">Article Information</a>',
            page)
        if m:
            return (int(m.group(1)), None)
        else:
            return (None, None)
    # Otherwise a /10.NNNN/123123 path is a DOI carrying the jstore ID.
    m = re.search(r'/(10\.\d\d\d\d/(\d+))', url)
    if m:
        # BUG FIX: original returned m.group(1) in both slots; the
        # numeric sub-group is the jstore ID (cf. the doi= branch).
        return (int(m.group(2)), m.group(1))
    # Sometimes there's a general DOI, no jstore ID.
    m = re.search(r'/(10\.\d\d\d\d/.+)', url)
    if m:
        return (None, m.group(1))
    # Plain old jstore ID, at least 4 digits.
    m = re.search(r'/(\d{4,})', url)
    if m:
        return (int(m.group(1)), None)
    return (None, None)
import codecs
import metaheaders
#from subprocess import Popen, PIPE
from lxml import etree

# Global network timeout so a stalled fetch can't hang the plugin.
socket.setdefaulttimeout(15)
warnings.simplefilter("ignore", DeprecationWarning)

# Read URL from stdin; stdout is rewrapped so unicode metadata prints
# as UTF-8.
url = sys.stdin.readline().strip()
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# IEEE login redirect URLs carry the real target in a url= parameter.
if url.startswith("http://ieeexplore.ieee.org/Xplore/login.jsp?url="):
    url = unquote(urlparams(url)["url"])

# Some IEEE urls are malformed and have ? characters instead of & to
# separate key-value pairs in the url.
s = url.split("?")
url_head = s[0]
url_tail = "&".join(s[1:])
# Some IEEE URLs look like ./a/b?&param=value - we need to sort this out.
# NOTE(review): if the URL has no "?", url_tail is "" and indexing [0]
# raises IndexError - confirm callers always pass a query string.
if url_tail[0] == '&':
    url_tail = url_tail[1:]
url = url_head + "?" + url_tail
# NOTE(review): this chunk is truncated here - the body of the
# following try-statement continues beyond this excerpt.
try:
#from subprocess import Popen, PIPE
from lxml import etree

# Global network timeout so a stalled fetch can't hang the plugin.
socket.setdefaulttimeout(15)
warnings.simplefilter("ignore", DeprecationWarning)

# Read URL from stdin; stdout is rewrapped so unicode metadata prints
# as UTF-8.
url = sys.stdin.readline().strip()
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# IEEE login redirect URLs carry the real target in a url= parameter.
if url.startswith("http://ieeexplore.ieee.org/Xplore/login.jsp?url="):
    url = unquote(urlparams(url)["url"])

# Some IEEE urls are malformed and have ? characters instead of & to
# separate key-value pairs in the url.  Rewrite so only the first "?"
# survives.  BUG FIX: the original split/index crashed with IndexError
# on URLs with no "?" at all, and appended a spurious "?" to them.
if "?" in url:
    s = url.split("?")
    url_head = s[0]
    url_tail = "&".join(s[1:])
    # Some IEEE URLs look like ./a/b?&param=value - drop the stray "&".
    if url_tail.startswith('&'):
        url_tail = url_tail[1:]
    url = url_head + "?" + url_tail
# Global network timeout so a stalled fetch can't hang the plugin.
socket.setdefaulttimeout(15)

#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()
url_host = urlparse.urlparse(url)[1]
if url_host in ['www.envplan.com', 'www.perceptionweb.com']:
    linkout = 'PION'
else:
    bail("Unrecognised site: " + url_host + " - does the plugin need updating")

# The article is identified by an id= query parameter.
# BUG FIX: the bare "except:" clauses below also swallowed SystemExit
# and KeyboardInterrupt; narrowed to Exception.
try:
    id = urlparams(url)["id"]  # NOTE(review): shadows builtin id()
except Exception:
    bail("Couldn't find an 'id' field in the URL (" + url + ")")

#
# Fetch the page
#
try:
    page = urllib2.urlopen(url).read().strip()
except Exception:
    bail("Couldn't fetch page (" + url + ")")

#
# Fetch the RIS file (machine-readable citation) for the same id.
#
ris_file_url = 'http://' + url_host + "/ris.cgi?id=" + id
#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()
url_host = urlparse.urlparse(url)[1]
if url_host in ['www.envplan.com', 'www.perceptionweb.com']:
    linkout = 'PION'
else:
    bail("Unrecognised site: " + url_host + " - does the plugin need updating")

# The article is identified by an id= query parameter.
# BUG FIX: the bare "except:" clauses below also swallowed SystemExit
# and KeyboardInterrupt; narrowed to Exception.
try:
    id = urlparams(url)["id"]  # NOTE(review): shadows builtin id()
except Exception:
    bail("Couldn't find an 'id' field in the URL (" + url + ")")

#
# Fetch the page
#
try:
    page = urllib2.urlopen(url).read().strip()
except Exception:
    bail("Couldn't fetch page (" + url + ")")

#
# Fetch the RIS file
#