コード例 #1
0
ファイル: test_bare.py プロジェクト: Phyks/dissemin
    def test_cleanup_desc(self):
        # cleanup_description() is expected to drop the leading
        # "International audience ; " and " Abstract: " boilerplate and to
        # leave a missing (None) description untouched.
        record = BareOaiRecord()

        # (raw description, expected description after cleanup); the same
        # record instance is reused for every case.
        cases = [
            ("International audience ; While price and data…",
             "While price and data…"),
            (" Abstract: While price and data…",
             "While price and data…"),
            (None, None),
        ]
        for raw, expected in cases:
            record.description = raw
            record.cleanup_description()
            self.assertEqual(record.description, expected)
コード例 #2
0
    def create_paper(self, work):
        """
        Build a paper from a work object, attach an OAI record describing
        where it came from, then convert it to a Paper, save and index it.

        :param work: the work to import; it must not be marked as skipped
        :returns: the saved paper, or None if a ValueError was raised while
                  converting, associating researchers, saving or indexing
        """
        assert (not work.skipped)

        authors, orcids = work.authors_and_orcids
        bare_paper = BarePaper.create(
            work.title,
            authors,
            work.pubdate,
            visible=True,
            affiliations=None,
            orcids=orcids,
        )
        bare_paper.add_oairecord(
            BareOaiRecord(source=self.oai_source,
                          identifier=work.api_uri,
                          splash_url=work.splash_url,
                          pubtype=work.pubtype))

        try:
            saved = self.associate_researchers(Paper.from_bare(bare_paper))
            saved.save()
            saved.update_index()
        except ValueError:
            # Any ValueError along the way means there is no paper to return.
            return None
        return saved
コード例 #3
0
    def submit_deposit_wrapper(self, *args, **kwargs):
        """
        Wrapper of the submit_deposit method (that should not need to be
        reimplemented). It catches DepositErrors raised in the deposit process
        and adds the logs to its return value.

        All positional and keyword arguments are forwarded to submit_deposit.
        Any other exception is logged together with its traceback and reported
        as a generic connection failure.

        :returns: the result of submit_deposit on success, otherwise a
                  DepositResult whose status is 'failed'
        """
        try:
            # Small hack to get notifications: prefer "first last" when the
            # user has both parts, otherwise fall back to its `name` attribute.
            display_name = getattr(self.user, 'name', None)
            given = getattr(self.user, 'first_name', None)
            family = getattr(self.user, 'last_name', None)
            if given and family:
                display_name = '%s %s' % (given, family)
            payload = {
                'name': str(display_name),
                'repo': self.repository.name,
                'paperurl': self.paper.url,
            }

            outcome = self.submit_deposit(*args, **kwargs)
            outcome.logs = self._logs

            # A splash URL means there is something to point to, so create
            # the corresponding OAI record.
            if outcome.splash_url:
                outcome.oairecord = self.paper.add_oairecord(
                    BareOaiRecord(
                        source=self.repository.oaisource,
                        identifier=('deposition:%d:%s' %
                                    (self.repository.id,
                                     str(outcome.identifier))),
                        splash_url=outcome.splash_url,
                        pdf_url=outcome.pdf_url))

            settings.DEPOSIT_NOTIFICATION_CALLBACK(payload)

            # In case that the paper is on user todo list, remove it
            # If it's not on the list, nothing happens here, since m2m field
            self.paper.todolist.remove(self.user)

            return outcome
        except DepositError as e:
            reason = e.args[0]
            self.log('Message: ' + reason)
            # The failure reason is appended to the paper URL in the
            # notification.
            payload['paperurl'] += ' ' + reason
            settings.DEPOSIT_NOTIFICATION_CALLBACK(payload)
            return DepositResult(logs=self._logs,
                                 status='failed',
                                 message=reason)
        except Exception as e:
            self.log("Caught exception:")
            self.log(str(type(e)) + ': ' + str(e))
            self.log(traceback.format_exc())
            failure_message = _(
                'Failed to connect to the repository. Please try again later.')
            return DepositResult(logs=self._logs,
                                 status='failed',
                                 message=failure_message)
コード例 #4
0
    def create_oairecord(self, record):
        """
        Given one line of the dump (represented as a dict),
        add it to the corresponding paper (if it exists).

        The line is ignored when it has no valid DOI, when the DOI prefix is
        in free_doi_prefixes, when no paper can be found or created for the
        DOI, or when the paper already uses this URL as its PDF URL.
        """
        doi = to_doi(record['doi'])
        if not doi:
            return
        if doi.split('/')[0] in free_doi_prefixes:
            return

        paper = Paper.get_by_doi(doi)
        if not paper:
            try:
                paper = Paper.create_by_doi(doi)
            except (MetadataSourceException, ValueError):
                return
            if not paper:
                print('no such paper for doi {doi}'.format(doi=doi))
                return

        pdf_url = record['url']

        # just to speed things up a bit...
        if paper.pdf_url == pdf_url:
            return

        # Default: the record is attributed to the oadoi source...
        splash_url = pdf_url
        identifier = 'oadoi:' + pdf_url
        source = self.oadoi_source

        # ...unless it is hosted by the publisher, in which case it is
        # attributed to the crossref source and identified by its DOI.
        if record['host_type'] == 'publisher':
            splash_url = doi_to_url(doi)
            identifier = doi_to_crossref_identifier(doi)
            source = self.crossref_source

        oairecord = BareOaiRecord(paper=paper,
                                  doi=doi,
                                  pubtype=paper.doctype,
                                  source=source,
                                  identifier=identifier,
                                  splash_url=splash_url,
                                  pdf_url=pdf_url)
        try:
            paper.add_oairecord(oairecord)
            paper.update_availability()
            # TODO re-enable this
            #paper.update_index()
        except (DataError, ValueError):
            print('Record does not fit in the DB')
コード例 #5
0
    def add_oai_record(self, header, metadata, paper):
        """
        Add a record (from OAI-PMH) to the given paper.

        :param header: OAI header; only its identifier() is read here
        :param metadata: mapping of metadata fields to lists of values
        :param paper: the paper that receives the new record
        """
        identifier = header.identifier()

        # description in oai_dc means abstract: keep the longest one
        # (the first one wins on ties).
        longest = ""
        for candidate in metadata['description']:
            if len(candidate) > len(longest):
                longest = candidate
        description = sanitize_html(longest)

        # Run extractor to find the URLs
        splash_url, pdf_url = self.extract_urls(header, metadata,
                                                self.oaisource.identifier)

        keywords = ' | '.join(metadata['subject'])
        contributors = ' '.join(metadata['contributor'])[:4096]

        # The first raw type with a known translation wins; normalized types
        # are tried after the plain ones.
        raw_types = metadata.get('type', []) + [
            'typenorm:' + tn for tn in metadata.get('typenorm', [])
        ]
        translated = (OAI_PUBTYPE_TRANSLATIONS.get(raw) for raw in raw_types)
        pubtype = next((t for t in translated if t is not None), None)
        if pubtype is None:
            pubtype = self.oaisource.default_pubtype

        # Find the DOI, if any: stop at the first candidate that yields one.
        doi = None
        candidates = (metadata['identifier'] + metadata['relation'] +
                      metadata['source'])
        for candidate in candidates:
            doi = to_doi(candidate)
            if doi:
                break

        paper.add_oairecord(
            BareOaiRecord(source=self.oaisource,
                          identifier=identifier,
                          description=description,
                          keywords=keywords,
                          contributors=contributors,
                          pubtype=pubtype,
                          pdf_url=pdf_url,
                          splash_url=splash_url,
                          doi=doi))
コード例 #6
0
    def create_paper(self, data_paper):
        """
        Turn a parsed paper description into a bare paper carrying one OAI
        record whose source is given by orcid_oai_source().

        :param data_paper: the paper data; it must not be marked as skipped
        :returns: the bare paper, with the record attached
        """
        assert (not data_paper.skipped)

        bare_paper = BarePaper.create(
            data_paper.title,
            data_paper.authors,
            data_paper.pubdate,
            visible=True,
            affiliations=None,
            orcids=data_paper.orcids,
        )
        record_fields = dict(
            source=orcid_oai_source(),
            identifier=data_paper.identifier,
            splash_url=data_paper.splash_url,
            pubtype=data_paper.doctype,
        )
        bare_paper.add_oairecord(BareOaiRecord(**record_fields))
        return bare_paper
コード例 #7
0
ファイル: oai.py プロジェクト: jilljenn/dissemin
def add_oai_record(record, source, paper):
    """
    Add a record (from OAI-PMH) to the given paper.

    :param record: indexable pair; record[0] is the header (its identifier()
                   is read) and record[1] exposes the metadata through `_map`
    :param source: the source the record comes from; its identifier selects
                   the URL extractor and its default_pubtype is the fallback
                   publication type
    :param paper: the paper that receives the new record
    """
    header = record[0]
    identifier = header.identifier()
    fields = record[1]._map

    # A description is useful: keep the longest one available.
    curdesc = ""
    for desc in fields['description']:
        if len(desc) > len(curdesc):
            curdesc = desc
    curdesc = sanitize_html(curdesc)

    # Run extractor to find the URLs
    pdf_url = None
    splash_url = None
    if source.identifier:
        try:
            extractor = REGISTERED_EXTRACTORS[source.identifier]
            urls = extractor.extract(record)
            pdf_url = urls.get('pdf')
            splash_url = urls.get('splash')
        except KeyError:
            # Single-argument form: prints the same text whether `print` is
            # a statement (Python 2) or a function.
            print("Warning, invalid extractor for source " + source.name)

    keywords = ' '.join(fields['subject'])
    contributors = ' '.join(fields['contributor'])[:4096]

    # The 'type' field may be missing altogether: treat that like an empty
    # list instead of failing on len(None).
    pubtype_list = fields.get('type') or []
    raw_pubtype = pubtype_list[0] if pubtype_list else None
    # Unknown (or missing) types fall back to the default of the source.
    pubtype = PUBTYPE_TRANSLATIONS.get(raw_pubtype, source.default_pubtype)

    oairecord = BareOaiRecord(source=source,
                              identifier=identifier,
                              description=curdesc,
                              keywords=keywords,
                              contributors=contributors,
                              pubtype=pubtype,
                              pdf_url=pdf_url,
                              splash_url=splash_url)
    paper.add_oairecord(oairecord)
コード例 #8
0
ファイル: orcid.py プロジェクト: tarsbase/dissemin
    def create_paper(self, work):
        """
        Turn a work into a bare paper carrying one OAI record whose source is
        given by orcid_oai_source().

        :param work: the work to convert; it must not be marked as skipped
        :returns: the bare paper, with the record attached
        """
        assert (not work.skipped)

        authors, orcids = work.authors_and_orcids
        bare_paper = BarePaper.create(
            work.title,
            authors,
            work.pubdate,
            visible=True,
            affiliations=None,
            orcids=orcids,
        )
        record_fields = dict(
            source=orcid_oai_source(),
            identifier=work.api_uri,
            splash_url=work.splash_url,
            pubtype=work.pubtype,
        )
        bare_paper.add_oairecord(BareOaiRecord(**record_fields))
        return bare_paper
コード例 #9
0
    def fetch_metadata_from_dois(self, crps, ref_name, orcid_id, dois):
        """
        Fetch the metadata of the given DOIs and try to save each of them as
        a paper with an additional ORCID-flavoured OAI record.

        This is a generator yielding one pair per fetched metadata entry:
        (True, paper) when the paper was saved and the record added, or
        (False, metadata) when saving returned nothing or a KeyError,
        ValueError or TypeError was raised while processing the entry.
        """
        for entry in fetch_dois(dois):
            try:
                name_pairs = map(convert_to_name_pair, entry['author'])
                affiliations = affiliate_author_with_orcid(
                    ref_name, orcid_id, name_pairs)
                paper = crps.save_doi_metadata(entry, affiliations)
                if paper:
                    paper.add_oairecord(BareOaiRecord(
                        source=orcid_oai_source,
                        identifier='orcid:%s:%s' % (orcid_id, entry['DOI']),
                        splash_url='http://%s/%s' % (
                            settings.ORCID_BASE_DOMAIN, orcid_id),
                        pubtype=paper.doctype))
                    yield True, paper
                else:
                    yield False, entry
            except (KeyError, ValueError, TypeError):
                yield False, entry
コード例 #10
0
    def create_paper(self, data_paper):
        """
        Turn a parsed paper description into a bare paper carrying one OAI
        record attributed to orcid_oai_source.

        :param data_paper: the paper data; it must not be marked as skipped
        :returns: the bare paper, with the record attached
        """
        assert (not data_paper.skipped)

        bare_paper = BarePaper.create(data_paper.title,
                                      data_paper.authors,
                                      data_paper.pubdate,
                                      'VISIBLE',
                                      data_paper.affiliations)
        record_fields = dict(
            source=orcid_oai_source,
            identifier=data_paper.identifier,
            splash_url=data_paper.splash_url,
            pubtype=data_paper.doctype,
        )
        bare_paper.add_oairecord(BareOaiRecord(**record_fields))
        return bare_paper
コード例 #11
0
ファイル: citeproc.py プロジェクト: robertdigital/dissemin
    def to_paper(cls, data):
        """
        Call this function to convert citeproc metadata into a paper object.
        Our strategy is as follows:
        We first collect all the data necessary; if we miss something, we
        raise CiteprocError. Once everything is collected, we pass it to the
        corresponding bare models.
        :param data: citeproc metadata. Note that CrossRef does put its citeproc into a message block
        :returns: Paper object
        :raises: CiteprocError
        """
        if not isinstance(data, dict):
            raise CiteprocError('Invalid metadaformat, expecting dict')
        paper_kwargs = cls._get_paper_data(data)
        record_kwargs = cls._get_oairecord_data(data)

        bare = BarePaper.create(**paper_kwargs)
        bare.add_oairecord(BareOaiRecord(paper=bare, **record_kwargs))
        bare.update_availability()

        stored = Paper.from_bare(bare)
        stored.update_index()
        return stored
コード例 #12
0
ファイル: crossref.py プロジェクト: NikolaJankovic/dissemin
def _create_publication(paper, metadata):
    """
    Create an OAI record for `paper` out of CrossRef-style metadata and
    refresh the availability of the paper.

    :param paper: the paper the new record is attached to
    :param metadata: dict of metadata; the keys read here are
        'container-title', 'DOI', 'ISSN', 'volume', 'page', 'issue',
        'issued', 'publisher', 'type' and 'license'
    :returns: the pair (paper, record), or None when the metadata is empty,
        has no container title or has no usable DOI
    """
    if not metadata:
        return
    if not metadata.get('container-title'):
        return
    doi = to_doi(metadata.get('DOI', None))
    if not doi:
        # The DOI is needed below for the prefix check, the splash URL and
        # the record identifier, so nothing can be created without it.
        return

    title = metadata['container-title']
    if isinstance(title, list):
        title = title[0]
    title = title[:512]

    issn = metadata.get('ISSN', None)
    if issn and isinstance(issn, list):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)

    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        # Only the first entry of 'date-parts' is used.
        dateparts = date_dict.get('date-parts')[0]
        pubdate = date_from_dateparts(dateparts)
    # for instance it outputs dates like 2014-2-3

    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]

    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)
    splash_url = doi_to_url(doi)

    # PDF availability: the splash page doubles as PDF URL when the DOI
    # prefix is in free_doi_prefixes or one of the licenses is an OA license.
    pdf_url = None
    license_urls = {(entry or {}).get('URL')
                    for entry in metadata.get('license', [])}
    doi_prefix = doi.split('/')[0]
    if doi_prefix in free_doi_prefixes or any(
            is_oa_license(url) for url in license_urls):
        pdf_url = splash_url

    # Lookup journal
    search_terms = {'jtitle': title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)

    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        # No journal found: look the publisher up by its name instead.
        publisher = fetch_publisher(publisher_name)

    barepub = BareOaiRecord(
            paper=paper,
            journal_title=title,
            issue=issue,
            volume=volume,
            pubdate=pubdate,
            pages=pages,
            doi=doi,
            pubtype=pubtype,
            publisher_name=publisher_name,
            journal=journal,
            publisher=publisher,
            pdf_url=pdf_url,
            splash_url=splash_url,
            source=OaiSource.objects.get(identifier='crossref'),
            identifier=doi_to_crossref_identifier(doi))
    rec = paper.add_oairecord(barepub)
    paper.update_availability()
    return paper, rec
コード例 #13
0
ファイル: testbare.py プロジェクト: tarsbase/dissemin
    def test_cleanup_desc(self):
        # cleanup_description() is expected to drop the leading
        # "International audience ; " and " Abstract: " boilerplate and to
        # leave a missing (None) description untouched.
        record = BareOaiRecord()

        # (raw description, expected description after cleanup); the same
        # record instance is reused for every case.
        cases = [
            ("International audience ; While price and data…",
             "While price and data…"),
            (" Abstract: While price and data…",
             "While price and data…"),
            (None, None),
        ]
        for raw, expected in cases:
            record.description = raw
            record.cleanup_description()
            self.assertEqual(record.description, expected)
コード例 #14
0
    def fetch_orcid_records(self, id, profile=None, use_doi=True):
        """
        Queries ORCiD to retrieve the publications associated with a given ORCiD.
        It also fetches such papers from the CrossRef search interface.

        :param id: The ORCID identifier; it is validated first and a
                MetadataSourceException is raised if it is invalid.
        :param profile: The ORCID profile if it has already been fetched before (format: parsed JSON).
        :param use_doi: Fetch the publications by DOI when we find one (recommended, but slow)
        :returns: a generator, where all the papers found are yielded. (some of them could be in
                free form, hence not imported)
        """
        crps = CrossRefPaperSource(self.ccf)

        # Cleanup iD:
        id = validate_orcid(id)
        if id is None:
            raise MetadataSourceException('Invalid ORCiD identifier')

        # Get ORCiD profile
        try:
            if profile is None:
                profile = OrcidProfile(id=id)
            else:
                profile = OrcidProfile(json=profile)
        except MetadataSourceException as e:
            # Without a profile there is nothing to yield.
            print(e)
            return

        # Reference name
        ref_name = profile.name
        # curl -H "Accept: application/orcid+json" 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
        dois = []  # list of DOIs to fetch

        # Fetch publications
        pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                     profile, [])
        for pub in pubs:

            def j(path, default=None):
                return jpath(path, pub, default)

            # DOI
            doi = None
            for extid in j(
                    'work-external-identifiers/work-external-identifier', []):
                if extid.get('work-external-identifier-type') == 'DOI':
                    doi = to_doi(
                        jpath('work-external-identifier-id/value', extid))
                    if doi:
                        # If a DOI is available, create the paper using metadata from CrossRef.
                        # We don't do it yet, we only store the DOI, so that we can fetch them
                        # by batch later.
                        dois.append(doi)

            if doi and use_doi:
                continue

            # Extract information from ORCiD

            # Title
            title = j('work-title/title/value')
            if title is None:
                # Actually skip the publication, as the warning announces.
                print("Warning: Skipping ORCID publication: no title")
                continue

            # Type
            doctype = orcid_to_doctype(j('work-type', 'other'))

            # Contributors (ignored for now as they are very often not present)
            contributors = [
                {
                    'orcid': jpath('contributor-orcid', contrib_js),
                    'name': jpath('credit-name/value', contrib_js),
                }
                for contrib_js in j('work-contributors/contributor', [])
            ]

            author_names = [
                contrib['name'] for contrib in contributors
                if contrib['name'] is not None
            ]
            authors = [parse_comma_name(name) for name in author_names]
            # ORCiD internal id
            identifier = j('put-code')
            affiliations = [contrib['orcid'] for contrib in contributors]

            # Pubdate: missing parts default to 1970-01-01. The date is
            # refined step by step so that, when the month or the day is
            # invalid, the most precise valid date built so far is kept.
            year = parse_int(j('publication-date/year/value'), 1970)
            month = parse_int(j('publication-date/month/value'), 1)
            day = parse_int(j('publication-date/day/value'), 1)
            pubdate = None
            try:
                pubdate = date(year=year, month=1, day=1)
                pubdate = date(year=year, month=month, day=1)
                pubdate = date(year=year, month=month, day=day)
            except ValueError:
                if pubdate is None:
                    # Not even the year is usable.
                    print("Invalid publication date in ORCID publication, skipping")
                    continue

            # Citation type: metadata format
            citation_format = j('work-citation/work-citation-type')
            print(citation_format)
            bibtex = j('work-citation/citation')

            if bibtex is not None:
                try:
                    entry = parse_bibtex(bibtex)

                    if entry.get('author', []) == []:
                        print("Warning: Skipping ORCID publication: no authors.")
                        print(j('work-citation/citation'))
                    if not authors:
                        # The entry may have no 'author' key at all; an empty
                        # list lets the "no authors" check below skip the
                        # publication instead of failing with a KeyError.
                        authors = entry.get('author', [])
                except ValueError:
                    # Citation that cannot be parsed: keep the authors
                    # taken from the contributors, if any.
                    pass

            affiliations = affiliate_author_with_orcid(
                ref_name, id, authors, initial_affiliations=affiliations)

            authors = [name_lookup_cache.lookup(author) for author in authors]

            if not authors:
                print("No authors found, skipping")
                continue

            # Create paper:
            paper = BarePaper.create(title, authors, pubdate, 'VISIBLE',
                                     affiliations)

            record = BareOaiRecord(source=orcid_oai_source,
                                   identifier=identifier,
                                   splash_url='http://orcid.org/' + id,
                                   pubtype=doctype)

            paper.add_oairecord(record)
            yield paper

        if use_doi:
            for metadata in crps.search_for_dois_incrementally(
                    '', {'orcid': id}):
                try:
                    paper = crps.save_doi_metadata(metadata)
                    if paper:
                        yield paper
                except ValueError as e:
                    print("Saving CrossRef record from ORCID failed: %s" % unicode(
                        e))

            # Now we add the DOIs found in the ORCID profile.
            doi_metadata = fetch_dois(dois)
            for metadata in doi_metadata:
                try:
                    authors = [
                        convert_to_name_pair(raw_author)
                        for raw_author in metadata['author']
                    ]
                    affiliations = affiliate_author_with_orcid(
                        ref_name, id, authors)
                    paper = crps.save_doi_metadata(metadata, affiliations)
                    if not paper:
                        continue
                    record = BareOaiRecord(source=orcid_oai_source,
                                           identifier='orcid:' + id + ':' +
                                           metadata['DOI'],
                                           splash_url='http://orcid.org/' + id,
                                           pubtype=paper.doctype)
                    paper.add_oairecord(record)
                    yield paper
                except (KeyError, ValueError, TypeError):
                    # Best effort: metadata that cannot be imported is
                    # silently dropped.
                    pass
コード例 #15
0
    def create_oairecord(self, record, update_index=True, create_missing_dois=True):
        """
        Given one line of the dump (represented as a dict),
        add it to the corresponding paper (if it exists).

        :param record: the line of the dump; its 'doi' and 'oa_locations'
                       entries are read
        :param update_index: whether to update the index of the paper once
                             its PDF URL has changed
        :param create_missing_dois: whether to create a paper when none is
                                    found for the DOI
        """
        doi = to_doi(record['doi'])
        if not doi:
            return
        if doi.split('/')[0] in free_doi_prefixes:
            return
        if not record.get('oa_locations'):
            return

        paper = Paper.get_by_doi(doi)
        if not paper:
            if not create_missing_dois:
                return
            try:
                paper = Paper.create_by_doi(doi)
            except (MetadataSourceException, ValueError):
                return
            if not paper:
                logger.info('no such paper for doi {doi}'.format(doi=doi))
                return
        logger.info(doi)
        paper.cache_oairecords()

        for location in record.get('oa_locations') or []:
            pdf_url = location['url']

            # just to speed things up a bit... (note that this stops the
            # processing of all remaining locations)
            if paper.pdf_url == pdf_url:
                return

            # Default: the record is attributed to the oadoi source...
            splash_url = pdf_url
            identifier = 'oadoi:' + pdf_url
            source = self.oadoi_source

            # ...unless it is hosted by the publisher, in which case it is
            # attributed to the crossref source and identified by its DOI.
            if location['host_type'] == 'publisher':
                splash_url = doi_to_url(doi)
                identifier = doi_to_crossref_identifier(doi)
                source = self.crossref_source

            oairecord = BareOaiRecord(
                paper=paper,
                doi=doi,
                pubtype=paper.doctype,
                source=source,
                identifier=identifier,
                splash_url=splash_url,
                pdf_url=pdf_url)
            try:
                # We disable checks by DOI since we know the paper has been looked up by DOI already.
                previous_pdf_url = paper.pdf_url
                paper.add_oairecord(oairecord, check_by_doi=False)
                super(Paper, paper).update_availability()
                # Only hit the database (and the index) when the PDF URL
                # actually changed.
                if previous_pdf_url != paper.pdf_url:
                    paper.save()
                    if update_index:
                        paper.update_index()
            except (DataError, ValueError):
                logger.warning('Record does not fit in the DB')