Ejemplo n.º 1
0
def url_to_id(url, page):
	"""Extract (jstore_id, doi) identifiers for a JSTOR article.

	Tries, in order: the <meta> headers of the fetched page, a doi=
	parameter in the URL, the old SICI-style URL (whose ID is only in
	the page body), and finally ID/DOI patterns in the URL path.

	url  -- the article URL
	page -- the fetched HTML of that URL
	Returns a (jstore_id, doi) tuple; either element may be None.
	"""
	# First try the <meta> headers embedded in the page.
	metaheaders = MetaHeaders(page=page, name='scheme')

	jstoreId = metaheaders.get_item("jstore-stable")
	doi = metaheaders.get_item("doi")

	if doi and jstoreId:
		print("doi=%s, id=%s" % (doi, jstoreId))
		return (jstoreId, doi)

	# If there's a doi=DOI parameter in the URL then use that.
	try:
		doi = urlparams(url)["doi"]
		# DOI prefixes look like "10.NNNN"; the dot is escaped so it
		# can't accidentally match an arbitrary character.
		m = re.search(r'(10\.\d{4}/(\d+))', doi)
		if m:
			return (int(m.group(2)), m.group(1))
		m = re.search(r'(10\.\d{4}/.+)', doi)
		if m:
			return (None, m.group(1))
	except KeyError:
		pass  # no doi= parameter; fall through to the other heuristics

	# If it's the old style SICI then, annoyingly, the numeric ID only
	# appears inside the page body.
	if 'sici=' in url:
		m = re.search(r'<a id="info" href="/stable/(\d+)">Article Information</a>', page)
		if m:
			return (int(m.group(1)), None)
		else:
			return (None, None)

	# A DOI with a numeric suffix embedded in the URL path gives us both.
	m = re.search(r'/(10\.\d{4}/(\d+))', url)
	if m:
		# BUG FIX: this used to return the full DOI for both tuple
		# slots; the first slot is the numeric jstore ID (cf. the
		# doi= branch above).
		return (int(m.group(2)), m.group(1))

	# Sometimes there's a general DOI but no jstore ID.
	m = re.search(r'/(10\.\d{4}/.+)', url)
	if m:
		return (None, m.group(1))

	# Plain old jstore ID, at least 4 digits.
	m = re.search(r'/(\d{4,})', url)
	if m:
		return (int(m.group(1)), None)

	return (None, None)
Ejemplo n.º 2
0
def url_to_id(url, page):
    """Extract (jstore_id, doi) identifiers for a JSTOR article.

    Tries, in order: the <meta> headers of the fetched page, a doi=
    parameter in the URL, the old SICI-style URL (whose ID is only in
    the page body), and finally ID/DOI patterns in the URL path.

    url  -- the article URL
    page -- the fetched HTML of that URL
    Returns a (jstore_id, doi) tuple; either element may be None.
    """
    # First try the <meta> headers embedded in the page.
    metaheaders = MetaHeaders(page=page, name='scheme')

    jstoreId = metaheaders.get_item("jstore-stable")
    doi = metaheaders.get_item("doi")

    if doi and jstoreId:
        print("doi=%s, id=%s" % (doi, jstoreId))
        return (jstoreId, doi)

    # If there's a doi=DOI parameter in the URL then use that.
    try:
        doi = urlparams(url)["doi"]
        # DOI prefixes look like "10.NNNN"; the dot is escaped so it
        # can't accidentally match an arbitrary character.
        m = re.search(r'(10\.\d{4}/(\d+))', doi)
        if m:
            return (int(m.group(2)), m.group(1))
        m = re.search(r'(10\.\d{4}/.+)', doi)
        if m:
            return (None, m.group(1))
    except KeyError:
        pass  # no doi= parameter; fall through to the other heuristics

    # If it's the old style SICI then, annoyingly, the numeric ID only
    # appears inside the page body.
    if 'sici=' in url:
        m = re.search(
            r'<a id="info" href="/stable/(\d+)">Article Information</a>', page)
        if m:
            return (int(m.group(1)), None)
        else:
            return (None, None)

    # A DOI with a numeric suffix embedded in the URL path gives us both.
    m = re.search(r'/(10\.\d{4}/(\d+))', url)
    if m:
        # BUG FIX: this used to return the full DOI for both tuple
        # slots; the first slot is the numeric jstore ID (cf. the
        # doi= branch above).
        return (int(m.group(2)), m.group(1))

    # Sometimes there's a general DOI but no jstore ID.
    m = re.search(r'/(10\.\d{4}/.+)', url)
    if m:
        return (None, m.group(1))

    # Plain old jstore ID, at least 4 digits.
    m = re.search(r'/(\d{4,})', url)
    if m:
        return (int(m.group(1)), None)

    return (None, None)
Ejemplo n.º 3
0
import codecs
import metaheaders
from lxml import etree

# Don't hang forever on a slow host.
socket.setdefaulttimeout(15)  # NOTE(review): assumes `socket` is imported earlier in the file

warnings.simplefilter("ignore", DeprecationWarning)

# Read URL from stdin
url = sys.stdin.readline().strip()

# Ensure everything we print is UTF-8 encoded.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# IEEE's login redirector wraps the real article URL in a url= parameter.
if url.startswith("http://ieeexplore.ieee.org/Xplore/login.jsp?url="):
    url = unquote(urlparams(url)["url"])

# Some IEEE urls are malformed and have ? characters instead of & to separate
# key-value pairs in the url.
s = url.split("?")

url_head = s[0]
url_tail = "&".join(s[1:])

# Some IEEE URLs look like ./a/b?&param=value - we need to sort this out.
# BUG FIX: guard against an empty query string (a URL with no "?" at
# all), which previously raised IndexError on url_tail[0].
if url_tail.startswith('&'):
    url_tail = url_tail[1:]

# Reassemble with a single "?" between path and query; leave the URL
# untouched when there was no query string.
if url_tail:
    url = url_head + "?" + url_tail

try:
from lxml import etree

# Don't hang forever on a slow host.
socket.setdefaulttimeout(15)  # NOTE(review): assumes `socket` is imported earlier in the file

warnings.simplefilter("ignore", DeprecationWarning)


# Read URL from stdin
url = sys.stdin.readline().strip()

# Ensure everything we print is UTF-8 encoded.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)


# IEEE's login redirector wraps the real article URL in a url= parameter.
if url.startswith("http://ieeexplore.ieee.org/Xplore/login.jsp?url="):
	url = unquote(urlparams(url)["url"])


# Some IEEE urls are malformed and have ? characters instead of & to separate
# key-value pairs in the url.
s = url.split("?")

url_head = s[0]
url_tail = "&".join(s[1:])

# Some IEEE URLs look like ./a/b?&param=value - we need to sort this out.
# BUG FIX: guard against an empty query string (a URL with no "?" at
# all), which previously raised IndexError on url_tail[0].
if url_tail.startswith('&'):
	url_tail = url_tail[1:]

# Reassemble with a single "?" between path and query; leave the URL
# untouched when there was no query string.
if url_tail:
	url = url_head + "?" + url_tail
Ejemplo n.º 5
0
# Don't hang forever on a slow host.
socket.setdefaulttimeout(15)

#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()

# netloc component of the URL (urlparse 6-tuple, index 1).
url_host = urlparse.urlparse(url)[1]

if url_host in ['www.envplan.com', 'www.perceptionweb.com']:
    linkout = 'PION'
else:
    bail("Unrecognised site: " + url_host + " - does the plugin need updating")

#
# Pull the article id out of the query string.
#
try:
    # Renamed from `id` to avoid shadowing the builtin.
    article_id = urlparams(url)["id"]
except Exception:
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # aren't swallowed and reported as a missing-id error.
    bail("Couldn't find an 'id' field in the URL (" + url + ")")

#
# Fetch the page
#
try:
    page = urllib2.urlopen(url).read().strip()
except Exception:
    # Narrowed from a bare `except:` for the same reason as above.
    bail("Couldn't fetch page (" + url + ")")

#
# Fetch the RIS file
#
ris_file_url = 'http://' + url_host + "/ris.cgi?id=" + article_id

#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()

# netloc component of the URL (urlparse 6-tuple, index 1).
url_host = urlparse.urlparse(url)[1]

if url_host in ['www.envplan.com', 'www.perceptionweb.com']:
	linkout = 'PION'
else:
	bail("Unrecognised site: " + url_host + " - does the plugin need updating")

# Pull the article id out of the query string.
# NOTE(review): `id` shadows the builtin; kept because code past this
# chunk may reference it.
try:
	id = urlparams(url)["id"]
except Exception:
	# Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
	# aren't swallowed and reported as a missing-id error.
	bail("Couldn't find an 'id' field in the URL (" + url + ")")

#
# Fetch the page
#
try:
	page = urllib2.urlopen(url).read().strip()
except Exception:
	# Narrowed from a bare `except:` for the same reason as above.
	bail("Couldn't fetch page (" + url + ")")


#
# Fetch the RIS file
#