def test_to_paper(self, container_title, title, citeproc):
     """
     Convert citeproc metadata with ``to_paper`` and verify the fields of
     the resulting paper and of the OaiRecord attached to it.
     """
     paper = self.test_class.to_paper(citeproc)
     # The paper must have been created in the database
     assert paper.pk >= 1

     # Paper fields. Note that zip() stops at the shorter author list.
     for stored, expected in zip(paper.authors_list, citeproc['author']):
         stored_name = stored['name']
         assert stored_name['first'] == expected['given']
         assert stored_name['last'] == expected['family']
         first_affiliation = expected['affiliation'][0]
         assert stored['affiliation'] == first_affiliation['name']
         assert stored['orcid'] == expected['ORCID']
     expected_pubdate = date(*citeproc['issued']['date-parts'][0])
     assert paper.pubdate == expected_pubdate
     assert paper.title == title

     # The OaiRecord must have been created in the database, too
     oairecord = OaiRecord.objects.get(about=paper)

     # OaiRecord fields
     doi = citeproc['DOI']
     assert oairecord.doi == doi
     assert oairecord.identifier == doi_to_crossref_identifier(doi)
     assert oairecord.issue == citeproc['issue']
     assert oairecord.journal_title == container_title
     assert oairecord.pages == citeproc['page']
     assert oairecord.pubdate == expected_pubdate
     assert oairecord.publisher_name == citeproc['publisher']
     assert oairecord.source == OaiSource.objects.get(identifier='crossref')
     assert oairecord.splash_url == doi_to_url(doi)
     assert oairecord.volume == citeproc['volume']
Exemple #2
0
    def create_oairecord(self, record):
        """
        Attach the location described by one line of the dump to the paper
        carrying the same DOI.

        :param record: one line of the dump as a dict; the keys 'doi',
            'url' and 'host_type' are read
        :returns: always None, the work happens through side effects

        Nothing is added when the DOI cannot be normalized, when its prefix
        is listed in ``free_doi_prefixes``, when no paper can be found or
        created for it, or when the paper already has this PDF URL.
        """
        doi = to_doi(record['doi'])
        if not doi:
            return
        if doi.split('/')[0] in free_doi_prefixes:
            return

        paper = Paper.get_by_doi(doi)
        if not paper:
            try:
                paper = Paper.create_by_doi(doi)
            except (MetadataSourceException, ValueError):
                return
            if not paper:
                print('no such paper for doi {doi}'.format(doi=doi))
                return

        pdf_url = record['url']

        # Shortcut: the paper already points to this URL
        if paper.pdf_url == pdf_url:
            return

        # Default: the location is attributed to the oaDOI source...
        splash_url = pdf_url
        identifier = 'oadoi:' + pdf_url
        source = self.oadoi_source

        # ...unless it is hosted by the publisher, in which case the record
        # gets the Crossref identifier, source and DOI landing page.
        if record['host_type'] == 'publisher':
            splash_url = doi_to_url(doi)
            identifier = doi_to_crossref_identifier(doi)
            source = self.crossref_source

        oairecord = BareOaiRecord(
            paper=paper,
            doi=doi,
            pubtype=paper.doctype,
            source=source,
            identifier=identifier,
            splash_url=splash_url,
            pdf_url=pdf_url,
        )
        try:
            paper.add_oairecord(oairecord)
            paper.update_availability()
            # TODO re-enable this
            #paper.update_index()
        except (DataError, ValueError):
            print('Record does not fit in the DB')
 def test_get_oairecord_data(self, db, monkeypatch, container_title, issn,
                             citeproc, journal):
     """
     Check every key of the dict built by ``_get_oairecord_data``.

     The assertions are relatively lax, since the functions it calls are
     tested on their own. ``Journal.find`` is patched to return the
     ``journal`` fixture.
     """
     monkeypatch.setattr(Journal, 'find', lambda issn, title: journal)
     data = self.test_class._get_oairecord_data(citeproc)
     doi = citeproc['DOI']
     expectations = [
         ('doi', doi),
         ('description', citeproc['abstract']),
         ('identifier', doi_to_crossref_identifier(doi)),
         ('issn', issn),
         ('issue', citeproc['issue']),
         ('journal', journal),
         ('journal_title', container_title),
         ('pages', citeproc['page']),
         ('pdf_url', ''),  # Is not OA
         ('pubdate', date(*citeproc['issued']['date-parts'][0])),
         ('publisher_name', citeproc['publisher']),
         ('pubtype', citeproc['type']),
         ('source', OaiSource.objects.get(identifier='crossref')),
         ('splash_url', doi_to_url(doi)),
         ('volume', citeproc['volume']),
     ]
     for field, expected in expectations:
         assert data[field] == expected
Exemple #4
0
    def _get_oairecord_data(cls, data):
        """
        Collect the record fields from citeproc metadata.

        :param data: citeproc metadata
        :returns: Returns a dict, ready to be passed to a BarePaper instance
        :raises: CiteprocError
        """
        doi = cls._get_doi(data)
        landing_page = doi_to_url(doi)
        # NOTE(review): Crossref metadata exposes this list under 'license'
        # (singular) -- confirm that 'licenses' is the intended key.
        pdf_url = cls._get_pdf_url(doi, data.get('licenses', []), landing_page)

        container = cls._get_container(data)
        issn = cls._get_issn(data)
        journal = Journal.find(issn=issn, title=container)

        # Cut the publisher name to at most 512 characters
        publisher_name = data.get('publisher', '')[:512]
        publisher = cls._get_publisher(publisher_name, journal)

        return dict(
            doi=doi,
            description=cls._get_abstract(data),
            identifier=doi_to_crossref_identifier(doi),
            issn=issn,
            issue=data.get('issue', ''),
            journal=journal,
            journal_title=container,
            pages=data.get('page', ''),
            pdf_url=pdf_url,
            pubdate=cls._get_pubdate(data),
            publisher=publisher,
            publisher_name=publisher_name,
            pubtype=cls._get_pubtype(data),
            source=OaiSource.objects.get(identifier='crossref'),
            splash_url=landing_page,
            volume=data.get('volume', ''),
        )
Exemple #5
0
def _create_publication(paper, metadata):
    """
    Build a Crossref record from citeproc metadata and add it to a paper.

    :param paper: the paper receiving the record; ``add_oairecord`` and
        ``update_availability`` are called on it
    :param metadata: citeproc-style dict; the keys read are
        'container-title', 'DOI', 'ISSN', 'volume', 'page', 'issue',
        'issued', 'publisher', 'type' and 'license'
    :returns: the pair ``(paper, record)``, or None when the metadata is
        empty, lacks a container title, or has no usable DOI
    """
    if not metadata:
        return None
    if not metadata.get('container-title'):
        return None
    doi = to_doi(metadata.get('DOI', None))
    if not doi:
        # The DOI prefix is extracted with doi.split() further down, which
        # would fail on a missing DOI; bail out like the other early exits.
        return None

    title = metadata['container-title']
    if isinstance(title, list):
        title = title[0]
    title = title[:512]

    issn = metadata.get('ISSN', None)
    if issn and isinstance(issn, list):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)
    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        pubdate = date_from_dateparts(dateparts)
    # for instance it outputs dates like 2014-2-3
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]

    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)
    splash_url = doi_to_url(doi)

    # PDF availability: the DOI landing page counts as the full text when
    # the DOI prefix is known to be free or one of the licenses is OA.
    pdf_url = None
    # Entries may be None, hence the `or {}`; a license without 'URL'
    # contributes None to the set.
    license_urls = {(lic or {}).get('URL')
                    for lic in metadata.get('license', [])}
    doi_prefix = doi.split('/')[0]
    if (doi_prefix in free_doi_prefixes
            or any(map(is_oa_license, license_urls))):
        pdf_url = splash_url

    # Lookup journal
    search_terms = {'jtitle': title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)

    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = fetch_publisher(publisher_name)

    barepub = BareOaiRecord(
            paper=paper,
            journal_title=title,
            issue=issue,
            volume=volume,
            pubdate=pubdate,
            pages=pages,
            doi=doi,
            pubtype=pubtype,
            publisher_name=publisher_name,
            journal=journal,
            publisher=publisher,
            pdf_url=pdf_url,
            splash_url=splash_url,
            source=OaiSource.objects.get(identifier='crossref'),
            identifier=doi_to_crossref_identifier(doi))
    rec = paper.add_oairecord(barepub)
    paper.update_availability()
    return paper, rec
Exemple #6
0
def _create_publication(paper, metadata):
    """
    Build a Crossref record from citeproc metadata and add it to a paper.

    :param paper: the paper receiving the record; ``add_oairecord`` and
        ``update_availability`` are called on it
    :param metadata: citeproc-style dict; the keys read are
        'container-title', 'DOI', 'ISSN', 'volume', 'page', 'issue',
        'issued', 'publisher', 'type' and 'license'
    :returns: the pair ``(paper, record)``, or None when the metadata is
        empty, lacks a container title, or has no usable DOI
    """
    if not metadata:
        return None
    if not metadata.get('container-title'):
        return None
    doi = to_doi(metadata.get('DOI', None))
    if not doi:
        # The DOI prefix is extracted with doi.split() further down, which
        # would fail on a missing DOI; bail out like the other early exits.
        return None

    title = metadata['container-title']
    if isinstance(title, list):
        title = title[0]
    title = title[:512]

    issn = metadata.get('ISSN', None)
    if issn and isinstance(issn, list):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)
    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        pubdate = date_from_dateparts(dateparts)
    # for instance it outputs dates like 2014-2-3
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]

    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)
    splash_url = doi_to_url(doi)

    # PDF availability: the DOI landing page counts as the full text when
    # the DOI prefix is known to be free or one of the licenses is OA.
    pdf_url = None
    # Entries may be None, hence the `or {}`; a license without 'URL'
    # contributes None to the set.
    license_urls = {(lic or {}).get('URL')
                    for lic in metadata.get('license', [])}
    doi_prefix = doi.split('/')[0]
    if (doi_prefix in free_doi_prefixes
            or any(map(is_oa_license, license_urls))):
        pdf_url = splash_url

    # Lookup journal
    journal = Journal.find(issn=issn, title=title)

    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = Publisher.find(publisher_name)

    barepub = BareOaiRecord(
            paper=paper,
            journal_title=title,
            issue=issue,
            volume=volume,
            pubdate=pubdate,
            pages=pages,
            doi=doi,
            pubtype=pubtype,
            publisher_name=publisher_name,
            journal=journal,
            publisher=publisher,
            pdf_url=pdf_url,
            splash_url=splash_url,
            source=OaiSource.objects.get(identifier='crossref'),
            identifier=doi_to_crossref_identifier(doi))
    rec = paper.add_oairecord(barepub)
    paper.update_availability()
    return paper, rec
Exemple #7
0
def _create_publication(paper, metadata):
    """
    Build a Crossref record from citeproc metadata and add it to a paper.

    :param paper: the paper receiving the record; ``add_oairecord`` and
        ``update_availability`` are called on it
    :param metadata: citeproc-style dict; the keys read are
        'container-title', 'DOI', 'ISSN', 'volume', 'page', 'issue',
        'issued', 'publisher', 'type' and 'license'
    :returns: the pair ``(paper, record)``, or None when the metadata is
        empty or lacks a container title
    """
    if not metadata:
        return None
    if not metadata.get('container-title'):
        return None
    doi = to_doi(metadata.get('DOI', None))

    title = metadata['container-title']
    if isinstance(title, list):
        title = title[0]
    title = title[:512]

    issn = metadata.get('ISSN', None)
    if issn and isinstance(issn, list):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)
    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        pubdate = date_from_dateparts(dateparts)
    # for instance it outputs dates like 2014-2-3
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]

    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)

    # The DOI landing page serves as splash URL and, for OA licenses, as
    # PDF URL as well, so it is computed only once.
    splash_url = doi_to_url(doi)

    # PDF availability
    pdf_url = None
    # Entries may be None, hence the `or {}`; a license without 'URL'
    # contributes None to the set.
    license_urls = {(lic or {}).get('URL')
                    for lic in metadata.get('license', [])}
    if any(map(is_oa_license, license_urls)):
        pdf_url = splash_url

    # Lookup journal
    search_terms = {'jtitle': title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)

    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = fetch_publisher(publisher_name)

    barepub = BareOaiRecord(
            paper=paper,
            journal_title=title,
            issue=issue,
            volume=volume,
            pubdate=pubdate,
            pages=pages,
            doi=doi,
            pubtype=pubtype,
            publisher_name=publisher_name,
            journal=journal,
            publisher=publisher,
            pdf_url=pdf_url,
            splash_url=splash_url,
            source=crossref_oai_source,
            identifier=doi_to_crossref_identifier(doi))
    rec = paper.add_oairecord(barepub)
    paper.update_availability()
    return paper, rec
Exemple #8
0
    def create_oairecord(self, record, update_index=True, create_missing_dois=True):
        """
        Attach the OA locations listed in one line of the dump to the paper
        carrying the same DOI.

        :param record: one line of the dump as a dict; the keys 'doi' and
            'oa_locations' are read, and 'url' and 'host_type' on each
            location
        :param update_index: when true, ``paper.update_index()`` is called
            after a save caused by a changed ``pdf_url``
        :param create_missing_dois: when false, a DOI unknown to
            ``Paper.get_by_doi`` is skipped instead of being handed to
            ``Paper.create_by_doi``
        :returns: always None, the work happens through side effects
        """
        doi = to_doi(record['doi'])
        if not doi:
            return
        if doi.split('/')[0] in free_doi_prefixes:
            return
        oa_locations = record.get('oa_locations')
        if not oa_locations:
            return

        paper = Paper.get_by_doi(doi)
        if not paper:
            if not create_missing_dois:
                return
            try:
                paper = Paper.create_by_doi(doi)
            except (MetadataSourceException, ValueError):
                return
            if not paper:
                logger.info('no such paper for doi {doi}'.format(doi=doi))
                return
        logger.info(doi)
        paper.cache_oairecords()

        for location in oa_locations:
            pdf_url = location['url']

            # Shortcut: the paper already points to this URL.
            # NOTE(review): this leaves the whole method, so any remaining
            # locations are skipped as well -- confirm this is intended.
            if paper.pdf_url == pdf_url:
                return

            # Default: the location is attributed to the oaDOI source...
            splash_url = pdf_url
            identifier = 'oadoi:' + pdf_url
            source = self.oadoi_source

            # ...unless it is hosted by the publisher, in which case the
            # record gets the Crossref identifier, source and landing page.
            if location['host_type'] == 'publisher':
                splash_url = doi_to_url(doi)
                identifier = doi_to_crossref_identifier(doi)
                source = self.crossref_source

            oairecord = BareOaiRecord(
                paper=paper,
                doi=doi,
                pubtype=paper.doctype,
                source=source,
                identifier=identifier,
                splash_url=splash_url,
                pdf_url=pdf_url,
            )
            try:
                previous_pdf_url = paper.pdf_url
                # Checks by DOI are disabled: the paper was itself looked
                # up by this DOI.
                paper.add_oairecord(oairecord, check_by_doi=False)
                # Runs the implementation that follows Paper in the MRO
                # rather than Paper's own update_availability.
                super(Paper, paper).update_availability()
                if previous_pdf_url == paper.pdf_url:
                    continue
                # Only save (and optionally reindex) when the PDF URL
                # actually changed.
                paper.save()
                if update_index:
                    paper.update_index()
            except (DataError, ValueError):
                logger.warning('Record does not fit in the DB')