def _get_scopus_url(self, biblio_dict):
    """Build a Scopus search URL that returns a citedby-count for one article.

    Query strategies, in order of preference:
      1. author + title + journal name. When the journal has a known
         alternate name (alt_journal_names) or starts with "The Journal",
         query both names with an OR so either indexing matches.
      2. author + title + ISSN.

    Returns the URL string, or None (after a debug log) when the biblio
    lacks enough fields to search on.

    NOTE(review): assumes SCOPUS_KEY and SCOPUS_INSTTOKEN are set in the
    environment -- raises KeyError otherwise.
    """
    # Every query variant shares the same field selection and credentials.
    key_suffix = ("&field=citedby-count&apiKey=" + os.environ["SCOPUS_KEY"]
                  + "&insttoken=" + os.environ["SCOPUS_INSTTOKEN"])
    url_template_one_journal = ("https://api.elsevier.com/content/search/index:SCOPUS?query=AUTHLASTNAME({first_author})%20AND%20TITLE({title})%20AND%20SRCTITLE({journal})"
                                + key_suffix)
    url_template_two_journals = ("https://api.elsevier.com/content/search/index:SCOPUS?query=AUTHLASTNAME({first_author})%20AND%20TITLE({title})%20AND%20(SRCTITLE({journal1})%20OR%20SRCTITLE({journal2}))"
                                 + key_suffix)
    url_template_issn = ("https://api.elsevier.com/content/search/index:SCOPUS?query=AUTHLASTNAME({first_author})%20AND%20TITLE({title})%20AND%20ISSN({issn})"
                         + key_suffix)

    # Journals Scopus indexes under a different name than the one we store.
    alt_journal_names = {
        "BMJ": "British Medical Journal",
        "Ecol Letters": "Ecology Letters",
    }

    first_author = biblio_dict.get("first_author", None)
    if not first_author:
        # was biblio_dict["authors"]: .get avoids a KeyError when neither
        # first_author nor authors is present; an empty first_author then
        # falls through to the debug log below instead of crashing
        first_author = biblio_dict.get("authors", "").split(" ")[0]

    # title lookups go better without question marks
    # see https://api.elsevier.com/content/search/index:SCOPUS?query=AUTHLASTNAME(Piwowar)%20AND%20TITLE(Who%20shares%20Who%20doesn%27t%20Factors%20associated%20with%20openly%20archiving%20raw%20research%20data)%20AND%20SRCTITLE(PLOS%20ONE)&field=citedby-count&apiKey=
    title = to_unicode_or_bust(biblio_dict["title"]).encode('utf8')
    title = title.replace("(", "{(}").replace(")", "{)}")
    title = title.replace("?", "")

    journal = None
    if "journal" in biblio_dict:
        journal = to_unicode_or_bust(biblio_dict["journal"]).encode('utf8')
        journal = journal.replace("(", "{(}").replace(")", "{)}")
        journal = journal.replace(" & ", " and ")

    issn = biblio_dict.get("issn", None)

    url = None
    if title and first_author and journal:
        # Decide whether we need the two-journal OR query; this collapses
        # the previously duplicated two-journal formatting branches.
        journal2 = None
        if journal in alt_journal_names:  # was: `in alt_journal_names.keys()`
            journal2 = alt_journal_names[journal]
        elif journal.lower().startswith("the journal"):
            journal2 = re.sub("^the journal", "Journal", journal, flags=re.IGNORECASE)

        if journal2:
            url = url_template_two_journals.format(
                first_author=urllib.quote(first_author),
                title=urllib.quote(title),
                journal1=urllib.quote(journal),
                journal2=urllib.quote(journal2))
        else:
            url = url_template_one_journal.format(
                first_author=urllib.quote(first_author),
                title=urllib.quote(title),
                journal=urllib.quote(journal))
    elif title and first_author and issn:
        # example: http://www.mendeley.com/research/codeco-grammar-notation-controlled-natural-language-predictive-editors/
        url = url_template_issn.format(
            first_author=urllib.quote(first_author),
            title=urllib.quote(title),
            issn=urllib.quote(issn))
    else:
        logger.debug("missing title or journal/issn, so can't look up in scopus using biblio")
    return url
def _to_unicode(self, text):
    """Return *text* as unicode, expanding bibtex escapes where present.

    Only text containing "{" (i.e. text that looks like it carries bibtex
    markup) is post-processed: backslashes are stripped and every mapping
    in self.bibtex_to_unicode is substituted in turn.
    """
    converted = unicode_helpers.to_unicode_or_bust(text)
    if "{" not in converted:
        return converted
    converted = converted.replace("\\", "")
    for bibtex_seq, uni_char in self.bibtex_to_unicode.iteritems():
        converted = converted.replace(bibtex_seq, uni_char)
    return converted
def remove_unneeded_characters(input_string, encoding='utf-8', char_classes_to_remove=("C", "M", "P", "S", "Z")):
    """Strip characters whose Unicode major category is unwanted.

    Keeps only characters whose unicodedata.category() first letter is NOT
    in char_classes_to_remove; the defaults drop control (C), mark (M),
    punctuation (P), symbol (S) and separator (Z) characters, leaving
    letters and digits.

    A byte-string input is decoded, filtered, and re-encoded with
    *encoding*; a unicode input comes back as unicode.

    Note: the default is now a tuple -- the original used a mutable list
    default, which is shared across calls (classic Python pitfall).
    Callers that pass their own list still work, since only membership
    tests are performed.
    """
    input_was_unicode = True
    if isinstance(input_string, basestring):
        if not isinstance(input_string, unicode):
            input_was_unicode = False
    unicode_input = to_unicode_or_bust(input_string)
    response = u''.join(c for c in unicode_input
                        if unicodedata.category(c)[0] not in char_classes_to_remove)
    if not input_was_unicode:
        # round-trip back to the caller's byte encoding
        response = response.encode(encoding)
    return response
def _extract_biblio(self, page, id=None):
    """Scrape biblio fields (title, h1) out of an HTML page.

    Parses with lxml when possible; when the document can't be parsed,
    logs a warning and falls back to a plain <title> regex. Returns an
    empty dict for empty input or when nothing could be extracted.
    """
    biblio_dict = {}
    if not page:
        return biblio_dict
    unicode_page = to_unicode_or_bust(page)
    try:
        parsed_html = lxml.html.document_fromstring(unicode_page)
        try:
            response = parsed_html.find(".//title").text
            if response and response.strip():
                biblio_dict["title"] = response.strip()
        except AttributeError:
            # no <title> element: find() returned None
            pass
        try:
            response = parsed_html.find(".//h1").text
            if response and response.strip():
                biblio_dict["h1"] = response.strip()
        except AttributeError:
            pass
    # throws ParserError when document is empty
    except (ValueError, lxml.etree.ParserError):
        logger.warning(u"%20s couldn't parse %s so giving up on webpage biblio" % (self.provider_name, id))
        try:
            response = re.search("<title>(.+?)</title>", unicode_page).group(1)
            # BUGFIX: str.replace returns a new string; the original
            # discarded the results, so newlines were never removed
            response = response.replace("\n", "")
            response = response.replace("\r", "")
            if response:
                biblio_dict["title"] = response.strip()
        except AttributeError:
            # no regex match (search() returned None)
            pass
    return biblio_dict
def _to_unicode(self, text):
    """Coerce *text* to unicode via the shared helper and return it."""
    return unicode_helpers.to_unicode_or_bust(text)