Example #1
import re, sys
import metaheaders
from cultools import urlparams, bail

# Read URL from stdin (as in the other plugins)
url = sys.stdin.readline().strip()

# Split the URL into its base and its query-string key-value pairs.
s = url.split("?")

url_head = s[0]
url_tail = "&".join(s[1:])

# Some IEEE URLs look like ./a/b?&param=value - strip any stray leading '&'.
if url_tail.startswith('&'):
    url_tail = url_tail[1:]

url = url_head + "?" + url_tail

try:
    ar_number = int(urlparams(url)["arnumber"])
except KeyError:
    bail("Couldn't find an 'arnumber' field in the URL")

metaheaders = metaheaders.MetaHeaders(
    "http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=%d" % ar_number)

root = metaheaders.root
key_map = {
    "publisher": "citation_publisher",
    "abstract": "description",
    "issue": "citation_issue",
    "issn": "citation.issn",
    "title": "citation_title",
    "volume": "citation_volume",
    "start_page": "citation_firstpage",
    "end_page": "citation_lastpage"
}
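# key_map maps CiteULike TSV field names (keys) to the HTML <meta> tag
# names they are read from (values); the loop further down emits one
# "field<TAB>value" line for every tag found on the page.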

doi = metaheaders.get_item("citation_doi")

if not doi:
    bail('Unable to find a DOI')
    sys.exit(0)

doi = doi.replace("doi:", "")

print "begin_tsv"
print "linkout\tDOI\t\t%s\t\t" % (doi)
print "type\tJOUR"
print "doi\t" + doi
for f, k in key_map.items():
    v = metaheaders.get_item(k)
    if not v:
        continue
    v = v.strip()
    print "%s\t%s" % (f, v)

abstract = ''

abstract_div = root.xpath("//a[@name='Abstract']/../*/text()")

if abstract_div:
    abstract = abstract_div[0]
    abstract = re.sub(r"^Abstract\s*", "", abstract).strip()

#print etree.tostring(root, pretty_print=True)
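
The cultools helpers (urlparams, bail) are imported throughout these plugins but never shown. A minimal sketch of what urlparams presumably does, assuming it maps each query-string key to its first value (an assumption for illustration, not the real CiteULike helper):

# Hypothetical stand-in for cultools.urlparams, for illustration only.
import urlparse

def urlparams(url):
    # Parse the URL's query string into a dict of key -> first value.
    query = urlparse.urlparse(url).query
    return dict((k, v[0]) for k, v in urlparse.parse_qs(query).items())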
Example #3
import re, sys, urllib2
import lxml.html
from urlparse import urlparse
from cultools import bail

# Read URL from stdin (as in the other plugins)
url = sys.stdin.readline().strip()
handler = urllib2.HTTPHandler(debuglevel=0)
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
location = urllib2.urlopen(url)

# we may have followed redirects, esp. from springerlink.com
path = urlparse(location.geturl()).path

page = unicode(location.read().strip(), "utf8")

root = lxml.html.document_fromstring(page)

m = re.search(r"/([^/]+)/(10\.\d{4})(?:/|%2f)(.*)", path, re.I)
if not m:
    bail("Unrecognised URL %s - cannot extract a DOI" % url)

(atype, doi_pref, doi_suff) = (m.group(1), m.group(2), m.group(3))
doi = "%s/%s" % (doi_pref, doi_suff)

print "begin_tsv"
print "linkout\tSLINK2\t\t%s\t\t%s" % (atype, doi)
print "linkout\tDOI\t\t%s\t\t" % doi

for div in root.cssselect("div.abstract-content"):
    print "abstract\t%s" % div.xpath("string()").strip()
    # Sometimes have abstracts in different languages, e.g.,
    # http://link.springer.com/article/10.1007%2Fbf01975011
    # Let's assume the 1st one is English.
    break
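
For the link.springer.com URL cited in the comment above, the path is /article/10.1007%2Fbf01975011, so the pattern yields atype='article' and doi='10.1007/bf01975011'. A quick illustrative check of the regex:

import re
m = re.search(r"/([^/]+)/(10\.\d{4})(?:/|%2f)(.*)",
              "/article/10.1007%2Fbf01975011", re.I)
assert m.group(1) == "article"
assert "%s/%s" % (m.group(2), m.group(3)) == "10.1007/bf01975011"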
Example #4
import sys, socket, codecs, urllib2, cookielib
from urlparse import urlparse, parse_qs
from cultools import bail
socket.setdefaulttimeout(15)
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Read URL from stdin
url = sys.stdin.readline().strip()

u = urlparse(url)

# urlparse('http://www.cwi.nl:80/%7Eguido/Python.html')
# ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',  params='', query='', fragment='')
q = parse_qs(u.query)
if "articleid" in q:
    article_id = q["articleid"][0]
else:
    bail("Could not determine the articleId")

# http://proceedings.spiedigitallibrary.org/downloadCitation.aspx?format=ris&articleid=763979
ris_file_url = "http://%s/downloadCitation.aspx?format=ris&articleid=%s" % (
    u.netloc, article_id)
cookie_jar = cookielib.CookieJar()
handlers = []
handlers.append(urllib2.HTTPHandler(debuglevel=0))
handlers.append(urllib2.HTTPCookieProcessor(cookie_jar))

opener = urllib2.build_opener(*handlers)
opener.addheaders = [("User-Agent", "CiteULike/1.0 +http://www.citeulike.org/")]
urllib2.install_opener(opener)
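# With this opener installed, every later urllib2.urlopen() call sends the
# CiteULike User-Agent header and carries any session cookies the site sets.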

try:
    # Fetch the RIS citation via the opener installed above
    ris_data = urllib2.urlopen(ris_file_url).read()
except:
    bail("Couldn't fetch the RIS file (" + ris_file_url + ")")

Example #5
import re, sys, urlparse, urllib2
from cultools import urlparams, bail
import socket

socket.setdefaulttimeout(15)


#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()

#
# Fetch the page - don't need it, but it validates the URL the user posted
#
try:
    page = urllib2.urlopen(url).read().strip()
except:
    bail("Couldn't fetch page (" + url + ")")

isbn = ""

m = re.search(r'isbn=(\w+)', page)
if m:
    isbn = m.group(1)
else:
    bail("Couldn't find an ISBN in that page")

print "status\tredirect\thttp://www.worldcat.org/isbn/%s" % isbn

Example #6
import re, sys, urlparse, urllib2
from cultools import urlparams, bail

import socket

socket.setdefaulttimeout(15)

#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()

oclc_match = re.search(r'/(oclc|isbn)/([0-9\-]+)', url, re.IGNORECASE)

if not oclc_match:
    bail("Couldn't find either an 'oclc' or 'isbn' in the URL (" + url + ")")

link_type = oclc_match.group(1)
link_id = oclc_match.group(2)

#
# Fetch the page - don't need it, but it validates the URL the user posted
#
try:
    page = urllib2.urlopen(url).read().strip()
except:
    bail("Couldn't fetch page (" + url + ")")

if link_type == "isbn":
    isbn = link_id
    m = re.search(r'/oclc/(\d+)', page)
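
An illustrative check of the (oclc|isbn) pattern from the top of this example (the OCLC number here is made up):

import re
m = re.search(r'/(oclc|isbn)/([0-9\-]+)',
              "http://www.worldcat.org/oclc/44959429", re.IGNORECASE)
assert m.group(1) == "oclc" and m.group(2) == "44959429"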
Example #7
import sys, urlparse, urllib2
from cultools import urlparams, bail
import socket

socket.setdefaulttimeout(15)

#
# Read URL from stdin and check it's OK
#
url = sys.stdin.readline().strip()

url_host = urlparse.urlparse(url)[1]

if url_host in ['www.envplan.com', 'www.perceptionweb.com']:
    linkout = 'PION'
else:
    bail("Unrecognised site: " + url_host + " - does the plugin need updating")

try:
    id = urlparams(url)["id"]
except:
    bail("Couldn't find an 'id' field in the URL (" + url + ")")

#
# Fetch the page
#
try:
    page = urllib2.urlopen(url).read().strip()
except:
    bail("Couldn't fetch page (" + url + ")")

Example #8
# Fragment: assumes `metaheaders` (a MetaHeaders instance) and `authors`
# (a list of author strings) were set up earlier in the script, and that
# "begin_tsv" has already been printed.
if authors:
    for a in authors:
        print "author\t%s" % a

metaheaders.print_item("title", "citation_title")
metaheaders.print_date("citation_publication_date")
metaheaders.print_item("volume", "citation_volume")
metaheaders.print_item("start_page", "citation_firstpage")
metaheaders.print_item("end_page", "citation_lastpage")
metaheaders.print_item("issue", "citation_issue")
metaheaders.print_item("serial", "citation.issn")
publisher = metaheaders.get_item("citation_publisher")
if publisher:
    print "publisher\t%s" % publisher.strip()

metaheaders.print_item("abstract", "description")
metaheaders.print_item("journal", "citation_journal_title")
metaheaders.print_item("title_secondary", "citation_conference")

doi = metaheaders.get_item("citation_doi")
if doi:
    doi = doi.replace("doi:", "")
    print "doi\t%s" % doi
    print "linkout\tDOI\t\t%s\t\t" % (doi)
    print "linkout\tCSDL\t\t%s\t\t" % (doi)
else:
    bail("Couldn't find an DOI")

print "end_tsv"
print "status\tok"