Ejemplo n.º 1
0
    def _get_scopus_url(self, biblio_dict):
        """Build a Scopus search URL that returns a citedby-count.

        Needs a title, a first author (or an "authors" string whose first
        word is used), and either a journal name or an ISSN.  Returns the
        formatted URL string, or None when there is not enough metadata.

        Reads SCOPUS_KEY and SCOPUS_INSTTOKEN from the environment when
        called; raises KeyError if either is unset.
        """
        url_template_one_journal = "https://api.elsevier.com/content/search/index:SCOPUS?query=AUTHLASTNAME({first_author})%20AND%20TITLE({title})%20AND%20SRCTITLE({journal})&field=citedby-count&apiKey="+os.environ["SCOPUS_KEY"]+"&insttoken="+os.environ["SCOPUS_INSTTOKEN"]
        url_template_two_journals = "https://api.elsevier.com/content/search/index:SCOPUS?query=AUTHLASTNAME({first_author})%20AND%20TITLE({title})%20AND%20(SRCTITLE({journal1})%20OR%20SRCTITLE({journal2}))&field=citedby-count&apiKey="+os.environ["SCOPUS_KEY"]+"&insttoken="+os.environ["SCOPUS_INSTTOKEN"]
        url_template_issn = "https://api.elsevier.com/content/search/index:SCOPUS?query=AUTHLASTNAME({first_author})%20AND%20TITLE({title})%20AND%20ISSN({issn})&field=citedby-count&apiKey="+os.environ["SCOPUS_KEY"]+"&insttoken="+os.environ["SCOPUS_INSTTOKEN"]

        # Journals that Scopus sometimes indexes under an alternate title.
        alt_journal_names = {
            "BMJ": "British Medical Journal",
            "Ecol Letters": "Ecology Letters"
            }

        first_author = biblio_dict.get("first_author", None)
        if not first_author:
            first_author = biblio_dict["authors"].split(" ")[0]

        # title lookups go better without question marks
        # see https://api.elsevier.com/content/search/index:SCOPUS?query=AUTHLASTNAME(Piwowar)%20AND%20TITLE(Who%20shares%20Who%20doesn%27t%20Factors%20associated%20with%20openly%20archiving%20raw%20research%20data)%20AND%20SRCTITLE(PLOS%20ONE)&field=citedby-count&apiKey=
        title = to_unicode_or_bust(biblio_dict["title"]).encode('utf8')
        # parens are Scopus query metacharacters; brace-escape them
        title = title.replace("(", "{(}").replace(")", "{)}")
        title = title.replace("?", "")

        journal = None
        if "journal" in biblio_dict:
            journal = to_unicode_or_bust(biblio_dict["journal"]).encode('utf8')
            journal = journal.replace("(", "{(}").replace(")", "{)}")
            journal = journal.replace(" & ", " and ")

        issn = biblio_dict.get("issn", None)

        url = None
        if title and first_author and journal:
            # Search under a second journal title when one is available:
            # either a known hard-coded alternate name, or the
            # "The Journal ..." vs "Journal ..." spelling variant.
            alt_journal = alt_journal_names.get(journal)
            if alt_journal is None and journal.lower().startswith("the journal"):
                alt_journal = re.sub("^the journal", "Journal", journal, flags=re.IGNORECASE)
            if alt_journal:
                url = url_template_two_journals.format(
                        first_author=urllib.quote(first_author),
                        title=urllib.quote(title),
                        journal1=urllib.quote(journal),
                        journal2=urllib.quote(alt_journal))
            else:
                url = url_template_one_journal.format(
                        first_author=urllib.quote(first_author),
                        title=urllib.quote(title),
                        journal=urllib.quote(journal))
        elif title and first_author and issn:
            # example: http://www.mendeley.com/research/codeco-grammar-notation-controlled-natural-language-predictive-editors/
            url = url_template_issn.format(
                    first_author=urllib.quote(first_author),
                    title=urllib.quote(title),
                    issn=urllib.quote(issn))
        else:
            logger.debug("missing title or journal/issn, so can't look up in scopus using biblio")

        return url
Ejemplo n.º 2
0
 def _to_unicode(self, text):
     """Coerce *text* to unicode, expanding bibtex escape sequences.

     When the text contains "{" (a bibtex marker), backslashes are
     stripped and every bibtex sequence in self.bibtex_to_unicode is
     replaced by its unicode equivalent.
     """
     converted = unicode_helpers.to_unicode_or_bust(text)
     if "{" not in converted:
         return converted
     converted = converted.replace("\\", "")
     for bibtex_seq, unicode_char in self.bibtex_to_unicode.iteritems():
         converted = converted.replace(bibtex_seq, unicode_char)
     return converted
Ejemplo n.º 3
0
def remove_unneeded_characters(input_string, encoding='utf-8', char_classes_to_remove=("C", "M", "P", "S", "Z")):
    """Remove characters whose Unicode major category is unwanted.

    Drops every character whose unicodedata category starts with one of
    char_classes_to_remove (defaults: Control, Mark, Punctuation, Symbol,
    Separator — keeping letters and numbers).

    A byte-string input is returned re-encoded with *encoding*; a unicode
    (or non-string) input is returned as unicode.

    Note: the default was changed from a list to a tuple to avoid a
    mutable default argument; membership tests behave identically.
    """
    # Track whether the caller gave us an already-decoded string, so the
    # return type matches the input type.
    input_was_unicode = not (isinstance(input_string, basestring)
                             and not isinstance(input_string, unicode))

    unicode_input = to_unicode_or_bust(input_string)

    response = u''.join(c for c in unicode_input
                        if unicodedata.category(c)[0] not in char_classes_to_remove)

    if not input_was_unicode:
        response = response.encode(encoding)

    return response
Ejemplo n.º 4
0
    def _extract_biblio(self, page, id=None):
        biblio_dict = {}

        if not page:
            return biblio_dict
        
        unicode_page = to_unicode_or_bust(page)
        try:
            parsed_html = lxml.html.document_fromstring(unicode_page)

            try:
                response = parsed_html.find(".//title").text
                if response and response.strip():
                    biblio_dict["title"] = response.strip()
            except AttributeError:
                pass

            try:
                response = parsed_html.find(".//h1").text
                if response and response.strip():
                    biblio_dict["h1"] = response.strip()
            except AttributeError:
                pass            

        # throws ParserError when document is empty        
        except (ValueError, lxml.etree.ParserError):
            logger.warning(u"%20s couldn't parse %s so giving up on webpage biblio" 
                            % (self.provider_name, id)) 
            try:
                response = re.search("<title>(.+?)</title>", unicode_page).group(1)
                response.replace("\n", "")
                response.replace("\r", "")
                if response:
                    biblio_dict["title"] = response.strip()
            except AttributeError:
                pass
        return biblio_dict    
Ejemplo n.º 5
0
 def _to_unicode(self, text):
     """Return *text* coerced to unicode via the shared helper."""
     return unicode_helpers.to_unicode_or_bust(text)