def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO.

    :param romeo_xml_description: the ``<publisher>`` XML element; its ``id``
        attribute and its ``./name`` and ``./alias`` children are read here
    :raises MetadataSourceException: if the element has no ``id`` attribute,
        or no ``./name`` child with a text content
    """
    xml = romeo_xml_description

    # NOTE(review): `request` is not defined inside this function. It is
    # presumably a module-level name -- confirm, otherwise building the two
    # error messages below raises NameError instead of the intended exception.
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.\n' +
                                      'URL was: ' + request)

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the publisher\'s name.\n' +
                                      'URL was: ' + request)

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        # The alias is optional. The exception types must be given as a
        # tuple: `except KeyError, IndexError:` only catches KeyError (and
        # rebinds the name IndexError), so a missing <alias> would crash.
        pass
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns an Journal object.

    :param search_terms: a dictionary whose keys all belong to
        ``['issn', 'jtitle']``. It is copied first, so the caller's
        dictionary is not modified.
    :param matching_mode: when no journal is found and this is ``'exact'``,
        the lookup is retried once with ``'contains'``
    :raises ValueError: if a key of ``search_terms`` is not an allowed field
    :raises MetadataSourceException: if RoMEO returns a journal without title
    """
    allowed_fields = ['issn', 'jtitle']

    # Work on a private copy: the terms are rewritten below and the caller's
    # dictionary must not be altered as a side effect.
    search_terms = search_terms.copy()

    # Make the title HTML-safe before searching for it in the database or in the API
    # NOTE(review): 'title' is not one of the allowed fields, so any input
    # reaching this branch is rejected just below -- was 'jtitle' intended?
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    original_search_terms = search_terms.copy()

    # Check the arguments
    if not all(key in allowed_fields for key in search_terms):
        raise ValueError('The search terms have to belong to ' + str(allowed_fields) +
                         ' but the dictionary I got is ' + str(search_terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])

    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal

    # Perform the query
    root = perform_romeo_query(search_terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        # NOTE(review): matching_mode is not forwarded to the query in the
        # code visible here, so the retry appears to repeat the same request.
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print ("Warning, "+str(len(journals))+" journals match the RoMEO request, "+
                "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        # NOTE(review): `request` is not defined in this function; presumably
        # a module-level name -- confirm, otherwise this raises NameError.
        raise MetadataSourceException('RoMEO returned a journal without title.\n' +
                                      'URL was: ' + request)
    if len(names) > 1:
        print("Warning, "+str(len(names))+" names provided for one journal, "+
                "defaulting to the first one")
    name = kill_html(names[0].text)

    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        # The ISSN is optional. A tuple is required here:
        # `except KeyError, IndexError:` would only catch KeyError and let
        # the IndexError raised for a missing <issn> propagate.
        pass
def get_form_initial_data(self):
    """
    Returns the initial form values: those of the parent protocol, plus
    the paper's abstract passed through kill_html when the paper has one.
    """
    initial = super(OSFProtocol, self).get_form_initial_data()
    if not self.paper.abstract:
        return initial
    initial['abstract'] = kill_html(self.paper.abstract)
    return initial
def get_form_initial_data(self):
    """
    Returns the initial form values: those of the parent protocol, the
    user's first and last names, the abstract (when available), a
    predicted topic (unless it is 'OTHER') and the depositing author.
    """
    initial = super(HALProtocol, self).get_form_initial_data()

    initial['first_name'] = self.user.first_name
    initial['last_name'] = self.user.last_name

    # Abstract: when the paper has none, start a metadata consolidation
    # without waiting for its result
    if not self.paper.abstract:
        self.paper.consolidate_metadata(wait=False)
    else:
        initial['abstract'] = kill_html(self.paper.abstract)

    # Topic: predicted from the abstract when there is one, otherwise
    # from the title
    if 'abstract' in initial:
        topic_source = initial['abstract']
    else:
        topic_source = self.paper.title
    initial['topic'] = self.predict_topic(topic_source)
    if initial['topic'] == 'OTHER':
        del initial['topic']

    # Depositing author: only looked up when both names of the user are set
    first_name, last_name = self.user.first_name, self.user.last_name
    closest_author = None
    if first_name and last_name:
        closest_author = most_similar_author(
            (first_name, last_name),
            self.paper.author_name_pairs())
    initial['depositing_author'] = closest_author

    return initial
def get_form_initial_data(self):
    """
    Returns the initial form values: those of the parent protocol, the
    default license choice and, when the paper has one, its abstract.
    Without abstract, a metadata consolidation is started (not awaited).
    """
    initial = super(ZenodoProtocol, self).get_form_initial_data()
    initial['license'] = ZENODO_DEFAULT_LICENSE_CHOICE

    if not self.paper.abstract:
        self.paper.consolidate_metadata(wait=False)
        return initial

    initial['abstract'] = kill_html(self.paper.abstract)
    return initial
def get_form(self):
    """
    Returns a HALForm initialised with the paper id and, when the paper
    has one, its abstract. Without abstract, a metadata consolidation
    is started (not awaited).
    """
    initial = {'paper_id': self.paper.id}
    if not self.paper.abstract:
        self.paper.consolidate_metadata(wait=False)
    else:
        initial['abstract'] = kill_html(self.paper.abstract)
    return HALForm(initial=initial)
def get_form(self):
    """
    Returns a ZenodoForm initialised with the "other-open" license, the
    paper id and, when the paper has one, its abstract. Without abstract,
    a metadata consolidation is started (not awaited).
    """
    initial = {
        "license": "other-open",
        "paper_id": self.paper.id,
    }
    if not self.paper.abstract:
        self.paper.consolidate_metadata(wait=False)
    else:
        initial["abstract"] = kill_html(self.paper.abstract)
    return ZenodoForm(initial=initial)
def get_form(self):
    """
    Builds the ZenodoForm for this paper. Its initial data holds the
    'other-open' license, the paper id and the abstract when there is
    one; otherwise a metadata consolidation is launched without waiting.
    """
    form_defaults = {'license': 'other-open', 'paper_id': self.paper.id}

    has_abstract = bool(self.paper.abstract)
    if has_abstract:
        form_defaults['abstract'] = kill_html(self.paper.abstract)
    else:
        self.paper.consolidate_metadata(wait=False)

    return ZenodoForm(initial=form_defaults)
def waitForConsolidatedField(request):
    """
    Waits for the metadata consolidation of a paper and returns the
    requested field.

    The paper is designated by the ``id`` GET parameter and the field by the
    ``field`` GET parameter; only ``'abstract'`` is supported.

    :returns: an HttpResponseForbidden when the paper id is missing, not an
        integer or unknown; otherwise a dictionary with a ``success`` flag
        and either the ``value`` of the field or an error ``message``.
        ``success`` is True only when the abstract is longer than
        64 characters.
    """
    try:
        paper = Paper.objects.get(pk=int(request.GET["id"]))
    except (KeyError, ValueError, Paper.DoesNotExist):
        return HttpResponseForbidden('Invalid paper id', content_type='text/plain')

    field = request.GET.get('field')
    paper.consolidate_metadata(wait=True)

    if field != 'abstract':
        return {'success': False, 'message': 'Invalid field'}

    # The paper may still have no abstract after consolidation: treat a
    # missing abstract as an empty one instead of failing on len(None).
    abstract = paper.abstract or ''
    value = kill_html(abstract)
    success = len(abstract) > 64
    return {'success': success, 'value': value}
def waitForConsolidatedField(request):
    """
    Waits for the metadata consolidation of a paper and returns the
    requested field.

    The paper is designated by the ``id`` GET parameter and the field by the
    ``field`` GET parameter; only ``'abstract'`` is supported.

    :returns: ``(payload, 404)`` when the paper id is missing, not an integer
        or unknown; ``(payload, 401)`` when the field is not supported;
        otherwise a dictionary with a ``success`` flag and the ``value`` of
        the field. ``success`` is True only when the abstract is longer than
        64 characters.
    """
    success = False
    try:
        paper = Paper.objects.get(pk=int(request.GET["id"]))
    except (KeyError, ValueError, Paper.DoesNotExist):
        return {'success': success, 'message': 'Invalid paper id'}, 404

    field = request.GET.get('field')
    try:
        paper.consolidate_metadata(wait=True)
    except TimeoutError:
        # Zotero instance is down / slow / failing, consolidation failed. Not
        # a big deal.
        pass

    if field != 'abstract':
        return {'success': success, 'message': 'Invalid field'}, 401

    # After a failed consolidation the paper may still have no abstract:
    # treat a missing abstract as an empty one instead of failing on
    # len(None).
    abstract = paper.abstract or ''
    value = kill_html(abstract)
    success = len(abstract) > 64
    return {'success': success, 'value': value}
def test_kill_html(self):
    """
    Checks that kill_html only keeps the text content of a sample title
    containing nested tags.
    """
    markup = 'My title<sub>is</sub><a href="http://dissem.in"><sup>nice</sup> </a>'
    expected = 'My titleisnice'
    self.assertEqual(kill_html(markup), expected)
def get_or_create_publisher(self, romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO.

    If the data from RoMEO is more fresh than what we have
    in cache, we update our model.

    :param romeo_xml_description: the ``<publisher>`` XML element. Its ``id``
        and ``parentid`` attributes are read, as well as its name, alias,
        update date, home URL, archiving policies, restrictions, conditions
        and copyright links.
    :returns: the matching Publisher. An existing one is returned unchanged
        when its ``last_updated`` is set and not older than the date found
        in the XML; otherwise it is created or overwritten and saved.
    :raises MetadataSourceException: if the id, the name, or one of the
        preprint, postprint and pdf archiving policies is missing
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    # The parent id is optional
    romeo_parent_id = None
    try:
        romeo_parent_id = xml.attrib['parentid']
    except KeyError:
        pass

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    # The alias is optional
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    last_update = self._get_romeo_date(xml, './dateupdated')

    # Check if we already have it.
    # Sadly the romeo_id is not unique (as publishers imported from doaj
    # all get the same id, so we have to use the name too).
    matches = None
    # NOTE(review): re.match only anchors at the start, so an id that merely
    # begins with digits also takes this branch -- confirm this is intended.
    if re.match(r'\d+', romeo_id):  # numeric ids are unambiguous
        matches = Publisher.objects.filter(romeo_id=romeo_id)
    elif alias:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
    if matches:
        first_match = matches[0]
        # Our copy is at least as recent as RoMEO's: nothing to refresh
        if first_match.last_updated is not None and first_match.last_updated >= last_update:
            return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the preprint policy.')

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the postprint policy.')

    pdfversion = None
    try:
        pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the pdf archiving policy.')

    # Compute OA status of the publisher
    # (placeholder value, replaced below once the conditions are stored)
    status = 'UNK'

    # Reuse the outdated record when there is one, so that it is updated
    # in place rather than duplicated
    if not matches:
        publisher = Publisher()
    else:
        publisher = matches[0]

    publisher.name = name
    publisher.alias = alias
    publisher.url = url
    publisher.preprint = preprint
    publisher.postprint = postprint
    publisher.pdfversion = pdfversion
    publisher.romeo_id = romeo_id
    publisher.romeo_parent_id = romeo_parent_id
    publisher.oa_status = status
    publisher.last_updated = last_update
    publisher.save()

    # When refreshing an existing publisher, drop its previous links,
    # restrictions and conditions before adding the new ones
    if matches:
        publisher.publishercopyrightlink_set.all().delete()
        publisher.publisherrestrictiondetail_set.all().delete()
        publisher.publishercondition_set.all().delete()

    # Add the conditions, restrictions, and copyright
    for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
        self.add_restriction(restriction, 'preprint', publisher)

    for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
        self.add_restriction(restriction, 'postprint', publisher)

    for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
        self.add_restriction(restriction, 'pdfversion', publisher)

    for condition in xml.findall('./conditions/condition'):
        # Conditions without text are skipped
        if condition.text:
            c = PublisherCondition(publisher=publisher,
                                   text=condition.text.strip())
            c.save()

    # Update the publisher status
    publisher.oa_status = publisher.classify_oa_status()
    publisher.save(update_fields=['oa_status'])
    # TODO: if the OA status has changed, then we should update the journals and papers accordingly with the
    # adequate task

    for link in xml.findall('./copyrightlinks/copyrightlink'):
        text = None
        url = None
        texts = link.findall('./copyrightlinktext')
        if texts:
            text = nstrip(texts[0].text)
        urls = link.findall('./copyrightlinkurl')
        if urls:
            url = nstrip(urls[0].text)
        # Only links with both a text and a URL are stored; the URL is
        # cut to its first 1024 characters
        if url and text:
            cplink = PublisherCopyrightLink(
                text=text, url=url[:1024], publisher=publisher)
            cplink.save()

    return publisher
def create_paper_plain_fingerprint(title, authors, year):
    """
    Builds a plain-text summary of a bibliographic reference.
    The result is meant to be hashed afterwards into the actual
    fingerprint (so that the length remains constant).

    :param title: the title of the paper
    :param authors: the list of author names, represented
        as (first_name, last_name) pairs
    :param year: the year of publication of the paper

    >>> create_paper_plain_fingerprint(' It cleans whitespace And Case\\n',[('John','Doe')], 2015)
    u'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    u'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    u'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    u'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    u'ambiguity-2014/doe'
    """
    # Normalise the title into a lowercase, dash-separated slug
    slug = remove_diacritics(kill_html(title)).lower()
    slug = stripped_chars.sub('', slug).strip()
    slug = re.sub('[ -]+', '-', slug)

    # A long enough title is returned as is, without year nor authors
    if len(slug) > 50:
        return slug

    fingerprint = slug
    # Titles made of a single word ("Preface", "Introduction", ...)
    # additionally get the year
    if '-' not in slug:
        fingerprint += '-' + str(year)

    author_keys = []
    for author in authors:
        if not author:
            continue
        author = (remove_diacritics(author[0]), remove_diacritics(author[1]))

        # Keep the words of the last name that are capitalised or that
        # follow a dash, which leaves out "van", "der", "de"…
        words, separators = split_name_words(author[1])
        kept = [
            word for idx, word in enumerate(words)
            if word[0].isupper() or (idx > 0 and separators[idx - 1] == '-')
        ]

        # If no word was retained, fall back on all the words
        if not kept:
            kept = words

        author_keys.append('-'.join(map(ulower, kept)))

    # Authors are appended in sorted order, so their input order is irrelevant
    return fingerprint + ''.join('/' + key for key in sorted(author_keys))
def createMetadata(self, form):
    """
    Assembles a metadata dictionary from the paper and the validated form.

    :param form: a validated form; ``cleaned_data['abstract']`` and
        ``cleaned_data['license']`` are read
    :returns: ``{"metadata": metadata}`` where ``metadata`` holds the upload
        type, publication date, title, creators, description, license and,
        when the paper has a publication, the details of the first one
    """
    metadata = {}
    oairecords = self.paper.sorted_oai_records
    publications = self.paper.publication_set.all()

    # Document type
    dt = swordDocumentType(self.paper)
    metadata['upload_type'] = dt[0]
    if dt[0] == 'publication':
        metadata['publication_type'] = dt[1]

    # Publication date
    metadata['publication_date'] = self.paper.pubdate.isoformat()

    # Title
    metadata['title'] = self.paper.title

    # Creators
    def formatAuthor(author):
        res = {'name': author.name.last + ', ' + author.name.first}
        if author.researcher and author.researcher.orcid:
            res['orcid'] = author.researcher.orcid
        # TODO: affiliation
        return res

    # A real list (not a lazy map object on Python 3), so that the value
    # can be indexed and serialized
    metadata['creators'] = [formatAuthor(author)
                            for author in self.paper.authors]

    # Abstract
    # If we are currently fetching the abstract, wait for the task to complete
    if self.paper.task:
        self.paper.consolidate_metadata(wait=True)
    abstract = form.cleaned_data['abstract'] or kill_html(
        self.paper.abstract)

    metadata['description'] = abstract

    # Access right: TODO

    # License
    metadata['license'] = form.cleaned_data['license']

    # Embargo date: TODO

    # DOI
    # Only the first publication is used (see the break below)
    for publi in publications:
        metadata['doi'] = publi.doi
        if publi.pubdate:
            metadata['publication_date'] = publi.pubdate.isoformat()
        if publi.journal:
            metadata['journal_title'] = publi.journal.title
        else:
            metadata['journal_title'] = publi.title
        if publi.volume:
            metadata['journal_volume'] = publi.volume
        if publi.issue:
            metadata['journal_issue'] = publi.issue
        if publi.pages:
            metadata['journal_pages'] = publi.pages
        if publi.container:
            metadata['conference_title'] = publi.container
        break

    # Keywords TODO (this involves having separated keywords in OAI records.)

    # Notes TODO
    # metadata['notes'] = 'Uploaded by dissem.in on behalf of ' …

    # Related identifiers
    # Built as a list: the result of map() has no append() on Python 3.
    # NOTE(review): `idents` is never stored in `metadata` -- confirm whether
    # it was meant to be attached to it.
    idents = [
        {'relation': 'isAlternateIdentifier', 'identifier': r.splash_url}
        for r in oairecords
    ]
    for publi in publications:
        if publi.journal and publi.journal.issn:
            idents.append({
                'relation': 'isPartOf',
                'identifier': publi.journal.issn
            })

    data = {"metadata": metadata}
    return data
def createMetadata(self, form):
    """
    Assembles a metadata dictionary from the paper and the validated form.

    :param form: a validated form; ``cleaned_data['abstract']`` and
        ``cleaned_data['license']`` are read
    :returns: ``{"metadata": metadata}`` where ``metadata`` holds the upload
        type, publication date, title, creators, description, license and,
        when the paper has a publication, the details of the first one
    """
    metadata = {}
    oairecords = self.paper.sorted_oai_records
    publications = self.paper.publications

    # Document type
    dt = swordDocumentType(self.paper)
    metadata['upload_type'] = dt[0]
    if dt[0] == 'publication':
        metadata['publication_type'] = dt[1]

    # Publication date
    metadata['publication_date'] = self.paper.pubdate.isoformat()

    # Title
    metadata['title'] = self.paper.title

    # Creators
    def formatAuthor(author):
        res = {'name': author.name.last + ', ' + author.name.first}
        if author.researcher and author.researcher.orcid:
            res['orcid'] = author.researcher.orcid
        # TODO: affiliation
        return res

    # A real list (not a lazy map object on Python 3), so that the value
    # can be indexed and serialized
    metadata['creators'] = [formatAuthor(author)
                            for author in self.paper.authors]

    # Abstract: the value typed in the form wins over the paper's own
    abstract = form.cleaned_data['abstract'] or kill_html(
        self.paper.abstract)

    metadata['description'] = abstract

    # Access right: TODO

    # License
    metadata['license'] = form.cleaned_data['license']

    # Embargo date: TODO

    # DOI
    # Only the first publication is used (see the break below)
    for publi in publications:
        metadata['doi'] = publi.doi
        if publi.pubdate:
            metadata['publication_date'] = publi.pubdate.isoformat()
        if publi.journal:
            metadata['journal_title'] = publi.journal.title
        else:
            metadata['journal_title'] = publi.journal_title
        if publi.volume:
            metadata['journal_volume'] = publi.volume
        if publi.issue:
            metadata['journal_issue'] = publi.issue
        if publi.pages:
            metadata['journal_pages'] = publi.pages
        if publi.container:
            metadata['conference_title'] = publi.container
        break

    # Related identifiers
    # Built as a list: the result of map() has no append() on Python 3.
    # NOTE(review): `idents` is never stored in `metadata` -- confirm whether
    # it was meant to be attached to it.
    idents = [
        {'relation': 'isAlternateIdentifier', 'identifier': r.splash_url}
        for r in oairecords
    ]
    for publi in publications:
        if publi.journal and publi.journal.issn:
            idents.append({
                'relation': 'isPartOf',
                'identifier': publi.journal.issn
            })

    data = {"metadata": metadata}
    return data
def createMetadata(self, form):
    """
    Builds the metadata dictionary for this paper, using the abstract
    and license of the validated form, and returns it under the
    "metadata" key of a new dictionary.
    """
    meta = {}
    oairecords = self.paper.sorted_oai_records
    publications = self.paper.publications

    # Document type (the publication type is only set for publications)
    doc_type = swordDocumentType(self.paper)
    meta['upload_type'] = doc_type[0]
    if doc_type[0] == 'publication':
        meta['publication_type'] = doc_type[1]

    # Publication date and title
    meta['publication_date'] = self.paper.pubdate.isoformat()
    meta['title'] = self.paper.title

    # Creators
    def formatAuthor(author):
        entry = {'name': author.name.last+', '+author.name.first}
        if author.researcher and author.researcher.orcid:
            entry['orcid'] = author.researcher.orcid
        # TODO: affiliation
        return entry

    meta['creators'] = [formatAuthor(author) for author in self.paper.authors]

    # Abstract: the one from the form if any, otherwise the paper's
    meta['description'] = (
        form.cleaned_data['abstract'] or kill_html(self.paper.abstract))

    # Access right: TODO

    # License
    meta['license'] = form.cleaned_data['license']

    # Embargo date: TODO

    # DOI and bibliographic details, from the first publication only
    optional_fields = (
        ('volume', 'journal_volume'),
        ('issue', 'journal_issue'),
        ('pages', 'journal_pages'),
        ('container', 'conference_title'),
    )
    for publi in publications:
        meta['doi'] = publi.doi
        if publi.pubdate:
            meta['publication_date'] = publi.pubdate.isoformat()
        if publi.journal:
            meta['journal_title'] = publi.journal.title
        else:
            meta['journal_title'] = publi.journal_title
        for attribute, key in optional_fields:
            value = getattr(publi, attribute)
            if value:
                meta[key] = value
        break

    # Related identifiers
    idents = []
    for record in oairecords:
        idents.append({
            'relation': 'isAlternateIdentifier',
            'identifier': record.splash_url
        })
    for publi in publications:
        if publi.journal and publi.journal.issn:
            idents.append(
                {'relation': 'isPartOf', 'identifier': publi.journal.issn})

    return {"metadata": meta}
def create_paper_plain_fingerprint(title, authors, year):
    """
    Computes a plain fingerprint summarising a bibliographic reference.
    Hashing this plain fingerprint then gives the actual fingerprint
    (so that the length remains constant).

    :param title: the title of the paper
    :param authors: the list of author names, represented
        as (first_name, last_name) pairs
    :param year: the year of publication of the paper

    >>> create_paper_plain_fingerprint(' It cleans whitespace And Case\\n',[('John','Doe')], 2015)
    'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    'ambiguity-2014/doe'
    """
    # Title: no HTML, no diacritics, lowercase, words joined by dashes
    cleaned = kill_html(title)
    cleaned = remove_diacritics(cleaned).lower()
    cleaned = stripped_chars.sub('', cleaned).strip()
    cleaned = re.sub('[ -]+', '-', cleaned)

    # Titles longer than 50 characters are used alone
    if len(cleaned) > 50:
        return cleaned

    parts = [cleaned]
    # A title without any dash is a single word ("Preface",
    # "Introduction"): the year is added to tell such papers apart
    if '-' not in cleaned:
        parts.append('-'+str(year))

    surname_keys = []
    for author in authors:
        if not author:
            continue
        author = (remove_diacritics(author[0]), remove_diacritics(author[1]))

        # Last name, without the small words such as "van", "der", "de"…
        name_words, name_separators = split_name_words(author[1])
        significant = []
        for position, word in enumerate(name_words):
            after_dash = position > 0 and name_separators[position-1] == '-'
            if word[0].isupper() or after_dash:
                significant.append(word)

        # Nothing was selected: use every word of the last name
        if not significant:
            significant = name_words

        surname_keys.append('-'.join([ulower(word) for word in significant]))

    # Sorting makes the result independent from the order of the authors
    surname_keys.sort()
    parts.extend('/'+key for key in surname_keys)
    return ''.join(parts)
def createMetadata(self, form):
    """
    Assembles a metadata dictionary from the paper and the validated form.

    :param form: a validated form; ``cleaned_data['abstract']`` and
        ``cleaned_data['license']`` are read
    :returns: ``{"metadata": metadata}`` where ``metadata`` holds the upload
        type, publication date, title, creators, description, license and,
        when the paper has a publication, the details of the first one
    """
    metadata = {}
    oairecords = self.paper.sorted_oai_records
    publications = self.paper.publications

    # Document type
    dt = swordDocumentType(self.paper)
    metadata['upload_type'] = dt[0]
    if dt[0] == 'publication':
        metadata['publication_type'] = dt[1]

    # Publication date
    metadata['publication_date'] = self.paper.pubdate.isoformat()

    # Title
    metadata['title'] = self.paper.title

    # Creators
    def formatAuthor(author):
        res = {'name':author.name.last+', '+author.name.first}
        if author.researcher and author.researcher.orcid:
            res['orcid'] = author.researcher.orcid
        # TODO: affiliation
        return res

    # A real list (not a lazy map object on Python 3), so that the value
    # can be indexed and serialized
    metadata['creators'] = [formatAuthor(author)
                            for author in self.paper.authors]

    # Abstract
    # If we are currently fetching the abstract, wait for the task to complete
    if self.paper.task:
        self.paper.consolidate_metadata(wait=True)
    abstract = form.cleaned_data['abstract'] or kill_html(self.paper.abstract)

    metadata['description'] = abstract

    # Access right: TODO

    # License
    metadata['license'] = form.cleaned_data['license']

    # Embargo date: TODO

    # DOI
    # Only the first publication is used (see the break below)
    for publi in publications:
        metadata['doi'] = publi.doi
        if publi.pubdate:
            metadata['publication_date'] = publi.pubdate.isoformat()
        if publi.journal:
            metadata['journal_title'] = publi.journal.title
        else:
            metadata['journal_title'] = publi.title
        if publi.volume:
            metadata['journal_volume'] = publi.volume
        if publi.issue:
            metadata['journal_issue'] = publi.issue
        if publi.pages:
            metadata['journal_pages'] = publi.pages
        if publi.container:
            metadata['conference_title'] = publi.container
        break

    # Keywords TODO (this involves having separated keywords in OAI records.)

    # Notes TODO
    # metadata['notes'] = 'Uploaded by dissem.in on behalf of ' …

    # Related identifiers
    # Built as a list: the result of map() has no append() on Python 3.
    # NOTE(review): `idents` is never stored in `metadata` -- confirm whether
    # it was meant to be attached to it.
    idents = [
        {'relation':'isAlternateIdentifier','identifier':r.splash_url}
        for r in oairecords
    ]
    for publi in publications:
        if publi.journal and publi.journal.issn:
            idents.append({'relation':'isPartOf','identifier':publi.journal.issn})

    data = {"metadata": metadata}
    return data
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns an Journal object.

    The journal is first looked up in the model; RoMEO is only queried when
    it is not found there. A journal returned by RoMEO is looked up in the
    model once more (by ISSN and title) before a new Journal is saved.

    :param search_terms: a dictionary whose keys all belong to
        ``['issn', 'jtitle']``. It is copied, not modified.
    :param matching_mode: when different from ``'exact'``, it is sent to
        RoMEO as the ``qtype`` term
    :returns: the Journal, or None when a search term is longer than
        256 characters, when RoMEO returns no journal, or when it returns
        no publisher for the journal
    :raises ValueError: if a key of ``search_terms`` is not an allowed field
    :raises MetadataSourceException: if RoMEO returns a journal without title
    """
    allowed_fields = ['issn', 'jtitle']
    terms = search_terms.copy()

    # Make the title HTML-safe before searching for it in the database or in
    # the API
    # NOTE(review): 'title' is not one of the allowed fields, so any input
    # reaching this branch is rejected just below -- was 'jtitle' intended?
    if 'title' in terms:
        terms['title'] = kill_html(terms['title'])

    # Check the arguments
    if not all(key in allowed_fields for key in terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) +
                         ' but the dictionary I got is ' + str(terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in terms:
        terms[key] = remove_diacritics(terms[key])
        # Overlong terms are not searched at all
        if len(terms[key]) > 256:
            return None

    # First check we don't have it already
    journal = find_journal_in_model(terms)
    if journal:
        return journal

    # Perform the query
    if matching_mode != 'exact':
        terms['qtype'] = matching_mode
    root = perform_romeo_query(terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))

    if not journals:
        return None
    elif len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' +
            'Terms were: ' + unicode(terms))
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    # The ISSN is optional
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass

    # Now we may have additional info, so it's worth trying again in the model
    model_journal = find_journal_in_model({'issn': issn, 'jtitle': name})
    if model_journal:
        return model_journal

    # Otherwise we need to find the publisher
    publishers = root.findall('./publishers/publisher')
    if not publishers:
        return None
    # TODO here we shouldn't default to the first one but look it up using the
    # <romeopub>
    publisher_desc = publishers[0]

    publisher = get_or_create_publisher(publisher_desc)

    result = Journal(title=name, issn=issn, publisher=publisher)
    result.save()
    return result