Esempio n. 1
0
class ZoteroBackend(object):
    @staticmethod
    def create_api_key():
        """ Interactively create a new API key via Zotero's OAuth API.

        Requires the user to enter a verification key displayed in the browser.

        :returns:   API key and the user's library ID
        :raises click.ClickException: if the access token request fails
        """
        auth = OAuth1Service(name='zotero',
                             consumer_key=CLIENT_KEY,
                             consumer_secret=CLIENT_SECRET,
                             request_token_url=REQUEST_TOKEN_URL,
                             access_token_url=ACCESS_TOKEN_URL,
                             authorize_url=AUTH_URL,
                             base_url=BASE_URL)
        # No callback URL is used: the user types the verification code in
        # at the prompt below.
        token, secret = auth.get_request_token(
            params={'oauth_callback': 'oob'})
        auth_url = auth.get_authorize_url(token)
        # Name and permissions requested for the new key
        auth_url += '&' + urlencode({
            'name': 'zotero-cli',
            'library_access': 1,
            'notes_access': 1,
            'write_access': 1,
            'all_groups': 'read'
        })
        click.echo("Opening {} in browser, please confirm.".format(auth_url))
        click.launch(auth_url)
        verification = click.prompt("Enter verification code")
        token_resp = auth.get_raw_access_token(
            token,
            secret,
            method='POST',
            data={'oauth_verifier': verification})
        if not token_resp:
            logging.debug(token_resp.content)
            # click exposes no top-level ``fail`` function; ClickException is
            # the documented way to abort with an error message.
            raise click.ClickException("Error during API key generation.")
        access = urlparse.parse_qs(token_resp.text)
        return access['oauth_token'][0], access['userID'][0]

    def __init__(self,
                 api_key=None,
                 library_id=None,
                 library_type='user',
                 autosync=False):
        """ Set up the Zotero client, the configuration and the local index.

        Wraps :py:class:`pyzotero.zotero.Zotero` and keeps a local search
        index in ``index.sqlite`` inside the application directory.

        :param api_key:     API key for the Zotero API; falls back to
                            ``zotcli.api_key`` from the configuration
        :param library_id:  Zotero library ID the API key is valid for; falls
                            back to ``zotcli.library_id`` from the
                            configuration
        :param library_type: Type of the library, can be 'user' or 'group'
        :param autosync:    If true, synchronize the index when at least
                            ``zotcli.sync_interval`` seconds (default 300)
                            have passed since it was last modified
        :raises ValueError: if no API key or library ID is available
        """
        self._logger = logging.getLogger()
        index_file = os.path.join(click.get_app_dir(APP_NAME), 'index.sqlite')
        cfg = load_config()
        self.config = cfg
        self.note_format = cfg['zotcli.note_format']
        self.storage_dir = cfg.get('zotcli.storage_dir')
        self.betterbibtex = cfg.get('zotcli.betterbibtex')
        # app_dir is only defined on the instance when it is configured
        configured_app_dir = cfg.get('zotcli.app_dir')
        if configured_app_dir:
            self.app_dir = cfg.get('zotcli.app_dir')

        # Explicit arguments win over the configuration file
        if not api_key:
            api_key = cfg.get('zotcli.api_key')
        if not library_id:
            library_id = cfg.get('zotcli.library_id')

        if not (api_key and library_id):
            raise ValueError(
                "Please set your API key and library ID by running "
                "`zotcli configure` or pass them as command-line options.")
        self._zot = Zotero(library_id=library_id,
                           api_key=api_key,
                           library_type=library_type)
        self._index = SearchIndex(index_file)
        interval = cfg.get('zotcli.sync_interval', 300)
        elapsed = int(time.time()) - self._index.last_modified
        if autosync and elapsed >= int(interval):
            message = "{} seconds since last sync, synchronizing."
            click.echo(message.format(elapsed))
            updated = self.synchronize()
            click.echo("Updated {} items".format(updated))

    def getBetterBibtexKeys(self):
        with open(
                os.path.join(
                    self.app_dir,
                    'better-bibtex/_better-bibtex.json')) as data_file:
            data = json.load(data_file)
        keys = {}
        for i in data['collections'][0]['data']:
            keys[i['itemKey']] = i['citekey']
        return keys

    def synchronize(self):
        """ Update the local index to the latest library version. """
        new_items = tuple(self.items(since=self._index.library_version))
        version = int(self._zot.request.headers.get('last-modified-version'))
        self._index.index(new_items, version)
        return len(new_items)

    def search(self, query, limit=None):
        """ Search the local index for items.

        :param query:   A sqlite FTS4 query
        :param limit:   Maximum number of items to return
        :returns:       Generator that yields matching items.
        """
        return self._index.search(query, limit=limit)

    def items(self, query=None, limit=None, recursive=False, since=0):
        """ Get a list of all items in the library matching the arguments.

        :param query:   Filter items by this query string (targets author and
                        title fields)
        :type query:    str/unicode
        :param limit:   Value for the API's ``limit`` argument; 100 is used
                        when ``None``, a falsy value such as 0 omits the
                        argument
        :type limit:    int
        :param recursive: Include non-toplevel items (attachments, notes, etc)
                          in output
        :type recursive: bool
        :param since:   Passed to the API as its ``since`` argument
        :returns:       Generator that yields items
        """
        # Load the Better BibTeX mapping once, before any item is requested
        bbtxkeys = self.getBetterBibtexKeys() if self.betterbibtex else None

        if limit is None:
            limit = 100
        query_args = {'since': since}
        if query:
            query_args['q'] = query
        if limit:
            query_args['limit'] = limit
        query_fn = self._zot.items if recursive else self._zot.top
        # NOTE: Normally we'd use the makeiter method of Zotero, but it seems
        #       to be broken at the moment, thus we call .follow ourselves
        items = query_fn(**query_args)
        last_url = self._zot.links.get('last')
        if last_url:
            # Keep following the result links until the last page was fetched
            while self._zot.links['self'] != last_url:
                items.extend(self._zot.follow())
        for it in items:
            if bbtxkeys is not None:
                # Items without a Better BibTeX entry get no citekey. A plain
                # lookup replaces the former bare ``except``, which hid every
                # kind of error.
                citekey = bbtxkeys.get(it['data']['key'])
            else:
                # Otherwise use the first citekey found in the 'extra' field
                matches = CITEKEY_PAT.finditer(it['data'].get('extra', ''))
                citekey = next((m.group(1) for m in matches), None)
            yield Item(key=it['data']['key'],
                       creator=it['meta'].get('creatorSummary'),
                       title=it['data'].get('title', "Untitled"),
                       abstract=it['data'].get('abstractNote'),
                       date=it['data'].get('date'),
                       citekey=citekey)

    def notes(self, item_id):
        """ Get a list of all notes for a given item.

        :param item_id:     ID/key of the item to get notes for
        :returns:           Notes for item
        """
        notes = self._zot.children(item_id, itemType="note")
        for note in notes:
            note['data']['note'] = self._make_note(note)
            yield note

    def attachments(self, item_id):
        """ Get a list of all attachments for a given item.

        If a zotero profile directory is specified in the configuration,
        a resolved local file path will be included, if the file exists.

        :param item_id:     ID/key of the item to get attachments for
        :returns:           Attachments for item
        """
        attachments = self._zot.children(item_id, itemType="attachment")
        if self.storage_dir:
            for att in attachments:
                if not att['data']['linkMode'].startswith("imported"):
                    continue
                fpath = os.path.join(self.storage_dir, att['key'],
                                     att['data']['filename'])
                if not os.path.exists(fpath):
                    continue
                att['data']['path'] = fpath
        return attachments

    def get_attachment_path(self, attachment):
        """ Resolve the local path of an attachment, fetching it if needed.

        What happens depends on ``zotcli.sync_method``:

        * ``zotfile``: the attachment title joined onto ``zotcli.storage_dir``
        * ``local``: the ``data.path`` value of the attachment
        * ``zotero``: dumped through the Zotero client into ``TEMP_DIR``
        * ``webdav``: ``<webdav_path>/zotero/<key>.zip`` is downloaded with
          the configured credentials and extracted into ``TEMP_DIR``

        A file that already exists in ``TEMP_DIR`` is not fetched again. For
        any other method the ``TEMP_DIR`` path is returned without a download.

        :param attachment:  Attachment as returned by the API
        :returns:           Path of the attachment
        :raises ValueError: if the method is not ``zotfile`` and the link
                            mode does not start with "imported"
        """
        method = self.config['zotcli.sync_method']
        if method == 'zotfile':
            base_dir = self.config['zotcli.storage_dir']
            return Path(os.path.join(base_dir, attachment['data']['title']))

        meta = attachment['data']
        if not meta['linkMode'].startswith("imported"):
            raise ValueError(
                "Attachment is not stored on server, cannot download!")
        if method == 'local':
            return Path(meta['path'])

        target = TEMP_DIR / meta['filename']
        if target.exists():
            return target
        if method == 'zotero':
            self._zot.dump(attachment['key'], path=unicode(TEMP_DIR))
        elif method == 'webdav':
            credentials = (self.config['zotcli.webdav_user'],
                           self.config['zotcli.webdav_pass'])
            base_url = self.config['zotcli.webdav_path']
            archive_url = "{}/zotero/{}.zip".format(base_url,
                                                    attachment['key'])
            response = requests.get(archive_url, auth=credentials)
            archive = zipfile.ZipFile(StringIO(response.content))
            archive.extractall(str(TEMP_DIR))
        return target

    def _make_note(self, note_data):
        """ Converts a note from HTML to the configured markup.

        If the note carries markup embedded by zotcli that is not older than
        the note's version, that markup is returned. Otherwise the HTML is
        converted with pandoc, using the embedded format if there is one and
        the configured note format if not.

        :param note_data:   Note as returned by the API; its ``data.note``
                            HTML and its ``version`` are read
        :returns:           Dictionary with markup, format and version
        """
        html = note_data['data']['note']
        server_version = note_data['version']
        if "title=\"b'" in html:
            # Fix for badly formatted notes from an earlier version (see #26)
            html = re.sub(r'title="b\'(.*?)\'"', r'title="\1"', html)
            html = html.replace("\\n", "")

        stored = None
        embedded = DATA_PAT.findall(html)
        if embedded:
            # Previously edited with zotcli: take the first blob and strip
            # all blobs from the HTML
            stored = decode_blob(embedded[0])
            if 'version' not in stored:
                stored['version'] = server_version
            html = DATA_PAT.sub("", html)

        # The embedded markup is still current
        if stored and not stored['version'] < server_version:
            return stored

        # Not previously edited with zotcli or updated from the Zotero UI
        if stored:
            self._logger.info("Note changed on server, reloading markup.")
        markup = stored['format'] if stored else self.note_format
        return {
            'format': markup,
            'text': pypandoc.convert(html, markup, format='html'),
            'version': server_version
        }

    def _make_note_html(self, note_data):
        """ Render a note as HTML and append its encoded source data.

        The encoded ``note_data`` is placed into ``DATA_TMPL`` and appended to
        the HTML that pandoc produces from the note's text.

        :param note_data:   dict with text, format and version of the note
        :returns:           Note as HTML
        """
        blob = encode_blob(note_data).decode('utf8')
        trailer = DATA_TMPL.format(data=blob)
        rendered = pypandoc.convert(note_data['text'],
                                    'html',
                                    format=note_data['format'])
        return rendered + trailer

    def create_note(self, item_id, note_text):
        """ Create a new note for a given item.

        :param item_id:     ID/key of the item to create the note for
        :param note_text:   Text of the note
        """
        note = self._zot.item_template('note')
        note_data = {
            'format': self.note_format,
            'text': note_text,
            'version': self._zot.last_modified_version(limit=1) + 2
        }
        note['note'] = self._make_note_html(note_data)
        try:
            self._zot.create_items([note], item_id)
        except Exception as e:
            self._logger.error(e)
            with open("note_backup.txt", "w", encoding='utf-8') as fp:
                fp.write(note_data['text'])
            self._logger.warn(
                "Could not upload note to Zotero. You can find the note "
                "markup in 'note_backup.txt' in the current directory")

    def save_note(self, note):
        """ Update an existing note.

        :param note:        The updated note
        """
        raw_data = note['data']['note']
        raw_data['version'] += 1
        note['data']['note'] = self._make_note_html(raw_data)
        try:
            self._zot.update_item(note)
        except Exception as e:
            self._logger.error(e)
            with open("note_backup.txt", "w", encoding='utf-8') as fp:
                fp.write(raw_data['text'])
            self._logger.warn(
                "Could not upload note to Zotero. You can find the note "
                "markup in 'note_backup.txt' in the current directory")
Esempio n. 2
0
class ZoteroImporter(object):
    def __init__(
        self,
        library_id,
        library_type,
        api_key,
        papers2,
        keyword_types=("user", "label"),
        label_map=None,
        add_to_collections=(),
        upload_attachments="all",
        batch_size=50,
        checkpoint=None,
        dryrun=None,
    ):
        """Create the Zotero client and prepare the target collections.

        :param library_id: passed to the Zotero client
        :param library_type: passed to the Zotero client
        :param api_key: passed to the Zotero client
        :param papers2: Papers2 library object the publications come from
        :param keyword_types: stored as ``self.keyword_types``
        :param label_map: stored as ``self.label_map``; a new empty dict is
            used when omitted
        :param add_to_collections: collection names handed to
            ``_load_collections``; ``None`` selects every Papers2 collection,
            an empty sequence (the default) selects none
        :param upload_attachments: "all", "unread" or "none"
        :param batch_size: size handed to the ``Batch`` helper
        :param checkpoint: optional checkpoint object
        :param dryrun: when not ``None``, wrapped in a ``JSONWriter`` that
            receives the items instead of Zotero
        """
        self.client = Zotero(library_id, library_type, api_key)
        self.papers2 = papers2
        self.keyword_types = keyword_types
        # A ``{}`` default would be one dict shared by every instance
        self.label_map = {} if label_map is None else label_map
        self.upload_attachments = upload_attachments
        self.checkpoint = checkpoint
        self.dryrun = JSONWriter(dryrun) if dryrun is not None else None
        self._batch = Batch(batch_size)
        self._load_collections(add_to_collections)

    # Load Zotero collections and create any
    # Papers2 collections that don't exist.
    # TODO: need to handle collection hierarchies
    def _load_collections(self, add_to_collections):
        self.collections = {}
        if add_to_collections is None:
            add_to_collections = list(c.name for c in self.papers2.get_collections())

        if len(add_to_collections) > 0:
            if self.dryrun is not None:
                for c in add_to_collections:
                    self.collections[c] = "<{0}>".format(c)

            else:
                # fetch existing zotero collections
                existing_collections = {}
                for zc in self.client.collections():
                    data = zc["data"]
                    existing_collections[data["name"]] = data["key"]

                # add any papers2 collections that do not already exist
                payload = []
                for pc in add_to_collections:
                    if pc not in existing_collections:
                        payload.append(dict(name=pc))
                if len(payload) > 0:
                    self.client.create_collection(payload)

                # re-fetch zotero collections in order to get keys
                for zc in self.client.collections():
                    data = zc["data"]
                    if data["name"] in add_to_collections:
                        self.collections[data["name"]] = data["key"]

    def add_pub(self, pub):
        """Queue a Papers2 publication for upload.

        The publication is converted to a Zotero item, added to the current
        batch together with its notes and attachment paths, and recorded in
        the checkpoint. The batch is committed once it is full.

        :param pub: Papers2 publication row
        :returns: ``False`` if the checkpoint already contains the
            publication, ``True`` once it was added to the batch
        """
        # ignore publications we've already imported
        if self.checkpoint is not None and self.checkpoint.contains(pub.ROWID):
            log.debug("Skipping already imported publication {0}".format(pub.ROWID))
            return False

        # convert the Papers2 publication type to a Zotero item type
        item_type = ITEM_TYPES[self.papers2.get_pub_type(pub)]

        # get the template to fill in for an item of this type
        item = self.client.item_template(item_type)

        # fill in template fields; iterate over a snapshot because the
        # template is updated inside the loop (items() also works on
        # Python 3, unlike iteritems())
        for key, value in list(item.items()):
            if key in EXTRACTORS:
                value = EXTRACTORS[key].extract(pub, self, value)
                if value is not None:
                    item[key] = value

        # add notes, if any
        notes = []
        if pub.notes is not None and len(pub.notes) > 0:
            notes.append(pub.notes)

        # each review becomes a note of its own
        reviews = self.papers2.get_reviews(pub)
        for r in reviews:
            notes.append("{0} Rating: {1}".format(r.content, r.rating))

        # get paths to attachments
        attachments = []
        if self.upload_attachments == "all" or (self.upload_attachments == "unread" and pub.times_read == 0):
            attachments = list(self.papers2.get_attachments(pub))

        # add to batch and checkpoint
        self._batch.add(item, notes, attachments)
        if self.checkpoint is not None:
            self.checkpoint.add(pub.ROWID)

        # commit the batch if it's full
        self._commit_batch()

        return True

    def close(self):
        if self._batch is not None:
            self._commit_batch(force=True)
            self._batch = None
        if self.dryrun is not None:
            self.dryrun.close()

    def _commit_batch(self, force=False):
        if self._batch.is_full or (force and not self._batch.is_empty):
            try:
                if self.dryrun is not None:
                    for item, attachments in self._batch.iter():
                        self.dryrun.write(item, attachments)

                else:
                    # upload metadata
                    status = self.client.create_items(self._batch.items)

                    if len(status["failed"]) > 0:
                        for status_idx, status_msg in status["failed"].iteritems():
                            item_idx = int(status_idx)
                            # remove failures from the checkpoint
                            if self.checkpoint is not None:
                                self.checkpoint.remove(item_idx)
                            item = self._batch.items[item_idx]
                            log.error(
                                "Upload failed for item {0}; code {1}; {2}".format(
                                    item["title"], status_msg["code"], status_msg["message"]
                                )
                            )

                    successes = {}
                    successes.update(status["success"])
                    successes.update(status["unchanged"])

                    for k, objKey in successes.iteritems():
                        item_idx = int(k)

                        # add notes
                        notes = self._batch.notes[item_idx]
                        if len(notes) > 0:
                            note_batch = []
                            for note_text in notes:
                                note = self.client.item_template("note")
                                note["parentItem"] = objKey
                                note["note"] = note_text
                                note_batch.append(note)

                            note_status = self.client.create_items(note_batch)

                            if len(note_status["failed"]) > 0:
                                for status_idx, status_msg in note_status["failed"].iteritems():
                                    note_idx = int(status_idx)
                                    # just warn about these failures
                                    note = note_batch[note_idx]
                                    log.error(
                                        "Failed to create note {0} for item item {1}; code {2}; {3}".format(
                                            note["note"],
                                            self.batch.items[idx]["title"],
                                            status_msg["code"],
                                            status_msg["message"],
                                        )
                                    )

                        # upload attachments and add items to collections
                        if self.upload_attachments != "none":

                            # TODO: modify pyzotero to pass MIME type for contentType key
                            attachments = list(path for path, mime in self._batch.attachments[item_idx])
                            if len(attachments) > 0:
                                try:
                                    self.client.attachment_simple(attachments, objKey)

                                # This is to work around a bug in pyzotero where an exception is
                                # thrown if an attachment already exists
                                except KeyError:
                                    log.info("One or more attachment already exists: {0}".format(",".join(attachments)))

                    # update checkpoint
                    if self.checkpoint is not None:
                        self.checkpoint.commit()

                    log.info(
                        "Batch committed: {0} items created and {1} items unchanged out of {2} attempted".format(
                            len(status["success"]), len(status["unchanged"]), self._batch.size
                        )
                    )

            except:
                log.error("Error importing {0} items to Zotero".format(self._batch.size))
                if self.checkpoint is not None:
                    self.checkpoint.rollback()
                raise

            finally:
                self._batch.clear()
Esempio n. 3
0
class ZoteroBackend(object):
    @staticmethod
    def create_api_key():
        """ Interactively create a new API key via Zotero's OAuth API.

        Requires the user to enter a verification key displayed in the browser.

        :returns:   API key and the user's library ID
        :raises click.ClickException: if the access token request fails
        """
        auth = OAuth1Service(
            name='zotero',
            consumer_key=CLIENT_KEY,
            consumer_secret=CLIENT_SECRET,
            request_token_url=REQUEST_TOKEN_URL,
            access_token_url=ACCESS_TOKEN_URL,
            authorize_url=AUTH_URL,
            base_url=BASE_URL)
        # No callback URL is used: the user types the verification code in
        # at the prompt below.
        token, secret = auth.get_request_token(
            params={'oauth_callback': 'oob'})
        auth_url = auth.get_authorize_url(token)
        # Name and permissions requested for the new key
        auth_url += '&' + urlencode({
            'name': 'zotero-cli',
            'library_access': 1,
            'notes_access': 1,
            'write_access': 1,
            'all_groups': 'read'})
        click.echo("Opening {} in browser, please confirm.".format(auth_url))
        click.launch(auth_url)
        verification = click.prompt("Enter verification code")
        token_resp = auth.get_raw_access_token(
            token, secret, method='POST',
            data={'oauth_verifier': verification})
        if not token_resp:
            logging.debug(token_resp.content)
            # click exposes no top-level ``fail`` function; ClickException is
            # the documented way to abort with an error message.
            raise click.ClickException("Error during API key generation.")
        access = urlparse.parse_qs(token_resp.text)
        return access['oauth_token'][0], access['userID'][0]

    def __init__(self, api_key=None, library_id=None, library_type='user',
                 autosync=False):
        """ Set up the Zotero client, the configuration and the local index.

        Wraps :py:class:`pyzotero.zotero.Zotero` and keeps a local search
        index in ``index.sqlite`` inside the application directory.

        :param api_key:     API key for the Zotero API; falls back to
                            ``zotcli.api_key`` from the configuration
        :param library_id:  Zotero library ID the API key is valid for; falls
                            back to ``zotcli.library_id`` from the
                            configuration
        :param library_type: Type of the library, can be 'user' or 'group'
        :param autosync:    If true, synchronize the index when at least
                            ``zotcli.sync_interval`` seconds (default 300)
                            have passed since it was last modified
        :raises ValueError: if no API key or library ID is available
        """
        self._logger = logging.getLogger()
        index_file = os.path.join(click.get_app_dir(APP_NAME), 'index.sqlite')
        cfg = load_config()
        self.config = cfg
        self.note_format = cfg['zotcli.note_format']
        self.storage_dir = cfg.get('zotcli.storage_dir')

        # Explicit arguments win over the configuration file
        if not api_key:
            api_key = cfg.get('zotcli.api_key')
        if not library_id:
            library_id = cfg.get('zotcli.library_id')

        if not (api_key and library_id):
            raise ValueError(
                "Please set your API key and library ID by running "
                "`zotcli configure` or pass them as command-line options.")
        self._zot = Zotero(library_id=library_id, api_key=api_key,
                           library_type=library_type)
        self._index = SearchIndex(index_file)
        interval = cfg.get('zotcli.sync_interval', 300)
        elapsed = int(time.time()) - self._index.last_modified
        if autosync and elapsed >= int(interval):
            message = "{} seconds since last sync, synchronizing."
            click.echo(message.format(elapsed))
            updated = self.synchronize()
            click.echo("Updated {} items".format(updated))

    def synchronize(self):
        """ Update the local index to the latest library version. """
        new_items = tuple(self.items(since=self._index.library_version))
        version = int(self._zot.request.headers.get('last-modified-version'))
        self._index.index(new_items, version)
        return len(new_items)

    def search(self, query, limit=None):
        """ Search the local index for items.

        :param query:   A sqlite FTS4 query
        :param limit:   Maximum number of items to return
        :returns:       Generator that yields matching items.
        """
        return self._index.search(query, limit=limit)

    def items(self, query=None, limit=None, recursive=False, since=0):
        """ Yield the library's items that match the arguments.

        :param query:   Filter items by this query string (targets author and
                        title fields)
        :type query:    str/unicode
        :param limit:   Value for the API's ``limit`` argument; 100 is used
                        when ``None``, a falsy value such as 0 omits the
                        argument
        :type limit:    int
        :param recursive: Include non-toplevel items (attachments, notes, etc)
                          in output
        :type recursive: bool
        :param since:   Passed to the API as its ``since`` argument
        :returns:       Generator that yields items
        """
        params = {'since': since}
        if query:
            params['q'] = query
        page_size = 100 if limit is None else limit
        if page_size:
            params['limit'] = page_size
        fetch = self._zot.items if recursive else self._zot.top
        # NOTE: Normally we'd use the makeiter method of Zotero, but it seems
        #       to be broken at the moment, thus we call .follow ourselves
        results = fetch(**params)
        final_url = self._zot.links.get('last')
        if final_url:
            # Keep following the result links until the last page was fetched
            while self._zot.links['self'] != final_url:
                results.extend(self._zot.follow())
        for entry in results:
            fields = entry['data']
            # The first citekey found in the 'extra' field, if there is one
            found = CITEKEY_PAT.finditer(fields.get('extra', ''))
            citekey = next((m.group(1) for m in found), None)
            yield Item(key=fields['key'],
                       creator=entry['meta'].get('creatorSummary'),
                       title=fields.get('title', "Untitled"),
                       abstract=fields.get('abstractNote'),
                       date=fields.get('date'),
                       citekey=citekey)

    def notes(self, item_id):
        """ Get a list of all notes for a given item.

        :param item_id:     ID/key of the item to get notes for
        :returns:           Notes for item
        """
        notes = self._zot.children(item_id, itemType="note")
        for note in notes:
            note['data']['note'] = self._make_note(note)
            yield note

    def attachments(self, item_id):
        """ Get a list of all attachments for a given item.

        If a zotero profile directory is specified in the configuration,
        a resolved local file path will be included, if the file exists.

        :param item_id:     ID/key of the item to get attachments for
        :returns:           Attachments for item
        """
        attachments = self._zot.children(item_id, itemType="attachment")
        if self.storage_dir:
            for att in attachments:
                if not att['data']['linkMode'].startswith("imported"):
                    continue
                fpath = os.path.join(self.storage_dir, att['key'],
                                     att['data']['filename'])
                if not os.path.exists(fpath):
                    continue
                att['data']['path'] = fpath
        return attachments

    def get_attachment_path(self, attachment):
        """ Resolve the local path of an attachment, fetching it if needed.

        What happens depends on ``zotcli.sync_method``:

        * ``local``: the ``data.path`` value of the attachment
        * ``zotero``: dumped through the Zotero client into ``TEMP_DIR``
        * ``webdav``: ``<webdav_path>/zotero/<key>.zip`` is downloaded with
          the configured credentials and extracted into ``TEMP_DIR``

        A file that already exists in ``TEMP_DIR`` is not fetched again. For
        any other method the ``TEMP_DIR`` path is returned without a download.

        :param attachment:  Attachment as returned by the API
        :returns:           Path of the attachment
        :raises ValueError: if the link mode does not start with "imported"
        """
        meta = attachment['data']
        if not meta['linkMode'].startswith("imported"):
            raise ValueError(
                "Attachment is not stored on server, cannot download!")
        method = self.config['zotcli.sync_method']
        if method == 'local':
            return Path(meta['path'])

        target = TEMP_DIR/meta['filename']
        if target.exists():
            return target
        if method == 'zotero':
            self._zot.dump(attachment['key'], path=unicode(TEMP_DIR))
        elif method == 'webdav':
            credentials = (self.config['zotcli.webdav_user'],
                           self.config['zotcli.webdav_pass'])
            base_url = self.config['zotcli.webdav_path']
            archive_url = "{}/zotero/{}.zip".format(
                base_url, attachment['key'])
            response = requests.get(archive_url, auth=credentials)
            archive = zipfile.ZipFile(StringIO(response.content))
            archive.extractall(str(TEMP_DIR))
        return target

    def _make_note(self, note_data):
        """ Converts a note from HTML to the configured markup.

        If the note carries markup embedded by zotcli that is not older than
        the note's version, that markup is returned. Otherwise the HTML is
        converted with pandoc, using the embedded format if there is one and
        the configured note format if not.

        :param note_data:   Note as returned by the API; its ``data.note``
                            HTML and its ``version`` are read
        :returns:           Dictionary with markup, format and version
        """
        html = note_data['data']['note']
        server_version = note_data['version']
        if "title=\"b'" in html:
            # Fix for badly formatted notes from an earlier version (see #26)
            html = re.sub(r'title="b\'(.*?)\'"', r'title="\1"', html)
            html = html.replace("\\n", "")

        stored = None
        embedded = DATA_PAT.findall(html)
        if embedded:
            # Previously edited with zotcli: take the first blob and strip
            # all blobs from the HTML
            stored = decode_blob(embedded[0])
            if 'version' not in stored:
                stored['version'] = server_version
            html = DATA_PAT.sub("", html)

        # The embedded markup is still current
        if stored and not stored['version'] < server_version:
            return stored

        # Not previously edited with zotcli or updated from the Zotero UI
        if stored:
            self._logger.info("Note changed on server, reloading markup.")
        markup = stored['format'] if stored else self.note_format
        return {
            'format': markup,
            'text': pypandoc.convert(html, markup, format='html'),
            'version': server_version}

    def _make_note_html(self, note_data):
        """ Render a note as HTML and append its encoded source data.

        The encoded ``note_data`` is placed into ``DATA_TMPL`` and appended to
        the HTML that pandoc produces from the note's text.

        :param note_data:   dict with text, format and version of the note
        :returns:           Note as HTML
        """
        blob = encode_blob(note_data).decode('utf8')
        trailer = DATA_TMPL.format(data=blob)
        rendered = pypandoc.convert(note_data['text'], 'html',
                                    format=note_data['format'])
        return rendered + trailer

    def create_note(self, item_id, note_text):
        """ Create a new note for a given item.

        :param item_id:     ID/key of the item to create the note for
        :param note_text:   Text of the note
        """
        note = self._zot.item_template('note')
        note_data = {'format': self.note_format,
                     'text': note_text,
                     'version': self._zot.last_modified_version(limit=1)+2}
        note['note'] = self._make_note_html(note_data)
        try:
            self._zot.create_items([note], item_id)
        except Exception as e:
            self._logger.error(e)
            with open("note_backup.txt", "w", encoding='utf-8') as fp:
                fp.write(note_data['text'])
            self._logger.warn(
                "Could not upload note to Zotero. You can find the note "
                "markup in 'note_backup.txt' in the current directory")

    def save_note(self, note):
        """ Update an existing note.

        :param note:        The updated note
        """
        raw_data = note['data']['note']
        raw_data['version'] += 1
        note['data']['note'] = self._make_note_html(raw_data)
        try:
            self._zot.update_item(note)
        except Exception as e:
            self._logger.error(e)
            with open("note_backup.txt", "w", encoding='utf-8') as fp:
                fp.write(raw_data['text'])
            self._logger.warn(
                "Could not upload note to Zotero. You can find the note "
                "markup in 'note_backup.txt' in the current directory")
Esempio n. 4
0
class ZoteroWrap:
    """Wrap a ``Zotero`` client and keep a pickled local copy of its data.

    The wrapper holds three pieces of state: the list of references, the
    sorted list of reference types, and the per-type reference templates.
    All three are stored together in one pickle file.
    """

    # Keys of the dictionary stored in the pickle cache.
    CACHE_REFERENCE_LIST = "references"
    CACHE_REFERENCE_TYPES = "reference_types"
    CACHE_REFERENCE_TEMPLATES = "reference_templates"

    def __init__(self, library_id, library_type, api_key, directory):
        """Create the Zotero client and compute the cache file path.

        No data is loaded here; call :meth:`initialize` for that.
        """
        # NOTE(review): the API key is embedded in plain text in the cache
        # file name. Kept unchanged so existing caches are still found, but
        # consider hashing it.
        cache_filename = "{}-{}-{}.pkl".format(library_id, library_type,
                                               api_key)
        self.cache_path = os.path.join(directory, cache_filename)
        # reference_types and reference_templates must have the same ordering.
        self.reference_types = []
        self.reference_templates = {}
        self._zotero_lib = Zotero(library_id, library_type, api_key)
        self._references = []

    # Data I/O methods section.

    def initialize(self):
        """Load the cached Zotero data, or retrieve them if there is none.

        A cache file that exists but cannot be read back (empty, truncated,
        corrupt, or missing one of the expected keys) is treated like a
        missing cache: the data are fetched again and the cache rewritten.
        """
        try:
            self.load_cache()
        except (FileNotFoundError, EOFError, KeyError,
                pickle.UnpicklingError):
            self.load_distant()

    def load_cache(self):
        """Load the cached Zotero data.

        Raises whatever ``open`` or ``pickle.load`` raise, and ``KeyError``
        if the cached dictionary lacks one of the expected keys.
        """
        with open(self.cache_path, "rb") as f:
            print("Loading cached Zotero data...")
            # NOTE(review): pickle executes arbitrary code on load; the cache
            # directory must only be writable by trusted users.
            cache = pickle.load(f)
            self._references = cache[self.CACHE_REFERENCE_LIST]
            self.reference_types = cache[self.CACHE_REFERENCE_TYPES]
            self.reference_templates = cache[self.CACHE_REFERENCE_TEMPLATES]
            print("Cached Zotero data loaded.")

    def load_distant(self):
        """Load the distant Zotero data and write them to the cache."""
        print("Loading distant Zotero data...")
        self._references = self.get_references()
        self.reference_types = self.get_reference_types()
        self.reference_templates = self.get_reference_templates(
            self.reference_types)
        print("Distant Zotero data loaded.")
        self.cache()

    def cache(self):
        """Cache the Zotero data (references, types and templates)."""
        with open(self.cache_path, "wb") as f:
            cache = {
                self.CACHE_REFERENCE_LIST: self._references,
                self.CACHE_REFERENCE_TYPES: self.reference_types,
                self.CACHE_REFERENCE_TEMPLATES: self.reference_templates
            }
            pickle.dump(cache, f)

    def create_local_reference(self, ref):
        """Append the reference at the end of the reference list and cache it."""
        self._references.append(ref)
        self.cache()

    def create_distant_reference(self, ref_data):
        """Validate and create the reference in Zotero and return the created item.

        Raises ``InvalidZoteroItemError`` if validation fails and
        ``CreateZoteroItemError`` if the creation status holds no successful
        entry for the submitted item.
        """
        self.validate_reference_data(ref_data)
        creation_status = self._zotero_lib.create_items([ref_data])
        try:
            # Only one item was submitted, so its status index is "0".
            created_item = creation_status["successful"]["0"]
            return created_item
        except KeyError as e:
            print(creation_status)
            raise CreateZoteroItemError from e

    def update_local_reference(self, index, ref):
        """Replace the reference in the reference list and cache it."""
        self._references[index] = ref
        self.cache()

    def update_distant_reference(self, ref):
        """Validate and update the reference in Zotero.

        Existing fields not present will be left unmodified.
        """
        self.validate_reference_data(ref["data"])
        self._zotero_lib.update_item(ref)

    def validate_reference_data(self, ref_data):
        """Validate the reference data.

        Zotero.check_items() caches data after the first API call.

        Raises ``InvalidZoteroItemError`` when the client reports invalid
        fields.
        """
        try:
            self._zotero_lib.check_items([ref_data])
        except InvalidItemFields as e:
            raise InvalidZoteroItemError from e

    def get_references(self):
        """Return all references in the Zotero database. Takes time..."""
        return self._zotero_lib.everything(self._zotero_lib.top())

    def get_reference_types(self):
        """Return the reference types, sorted alphabetically.

        Zotero.item_types() caches data after the first API call.
        """
        item_types = self._zotero_lib.item_types()
        return sorted(x["itemType"] for x in item_types)

    def get_reference_templates(self, ref_types):
        """Return the reference templates for the types as an ordered dictionary."""
        return OrderedDict([(x, self.get_reference_template(x))
                            for x in ref_types])

    def get_reference_template(self, ref_type):
        """Return the reference template for the type as an ordered dictionary.

        The template fields are sorted by field name.

        Zotero.item_template() caches data after the first API call.
        """
        template = self._zotero_lib.item_template(ref_type)
        return OrderedDict(sorted(template.items(), key=lambda x: x[0]))

    def get_reference(self, ref_key):
        """Return the reference for the key."""
        return self._zotero_lib.item(ref_key)

    # Public @properties surrogates section.

    def reference_count(self):
        """Return the number of references."""
        return len(self._references)

    def reference_data(self, index):
        """Return the 'data' field of the reference."""
        return self._references[index]["data"]

    def reference_extra_field(self, field, index):
        """Return the value of the field in 'extra', otherwise ''.

        'extra' is read as newline-separated ``<field>: <value>`` lines; the
        first line starting with ``<field>:`` wins. A reference without an
        'extra' entry (or with an empty one) yields ''.
        """
        # Not every item carries 'extra'; a missing entry must not break
        # lookups that scan the whole reference list.
        extra = self.reference_data(index).get("extra") or ""
        field_id = field + ":"
        for line in extra.split("\n"):
            if line.startswith(field_id):
                return line[len(field_id):].strip()
        return ""

    def reference_type(self, index):
        """Return the reference type."""
        return self.reference_data(index)["itemType"]

    def reference_key(self, index):
        """Return the reference key."""
        return self._references[index]["key"]

    def reference_id(self, index):
        """Return the reference ID (locally defined).

        The ID is the DOI if there is one, else ``PMID_<pmid>``, else
        ``UNPUBLISHED_<id>``, else ''.
        """
        # TODO Include ISBN and ISSN?
        doi = self.reference_doi(index)
        if doi:
            return doi
        pmid = self.reference_pmid(index)
        if pmid:
            return "PMID_" + pmid
        unpublished_id = self.reference_unpublished_id(index)
        if unpublished_id:
            return "UNPUBLISHED_" + unpublished_id
        return ""

    def reference_doi(self, index):
        """Return the reference DOI.

        The 'DOI' data field is used when the reference has one (even if it
        is empty); otherwise the 'DOI' line of 'extra' is used.
        """
        ref_data = self.reference_data(index)
        if "DOI" in ref_data:
            return ref_data["DOI"]
        return self.reference_extra_field("DOI", index)

    def reference_pmid(self, index):
        """Return the reference PMID."""
        return self.reference_extra_field("PMID", index)

    def reference_unpublished_id(self, index):
        """Return the reference UNPUBLISHED ID."""
        return self.reference_extra_field("UNPUBLISHED", index)

    def reference_title(self, index):
        """Return the reference title."""
        return self.reference_data(index)["title"]

    def reference_creator_surnames(self, index):
        """Return as a list the surnames of the reference creators (locally defined).

        Only creators of type 'author' are returned when there is at least
        one; otherwise all creators are. If any selected creator has no
        'lastName', an empty list is returned.
        """
        # TODO Not true, ex: ISBN 978-1-4398-3778-8. Return all creator types?
        # Academic books published as a collection of chapters contributed by
        # different authors have editors but not authors at the level of the
        # book (as opposed to the level of a chapter).
        creators = self.reference_data(index)["creators"]
        creator_types = [x["creatorType"] for x in creators]
        # 'name' (not split) might be used instead of 'firstName' and 'lastName'.
        try:
            if "author" in creator_types:
                return [
                    x["lastName"] for x in creators
                    if x["creatorType"] == "author"
                ]
            else:
                return [x["lastName"] for x in creators]
        except KeyError:
            return []

    def reference_creator_surnames_str(self, index):
        """Return as a string the surnames of the reference creators (locally defined)."""
        # NB: str.join() returns an empty string for an empty list.
        return ", ".join(self.reference_creator_surnames(index))

    def reference_date(self, index):
        """Return the reference publication date."""
        return self.reference_data(index)["date"]

    def reference_year(self, index):
        """Return the reference publication year.

        Returns an int when a year can be determined, otherwise ''. When the
        date parser rejects the string, the first run of four digits in the
        date is used as a fallback.
        """
        # TODO Use meta:parsedDate field instead?
        ref_date = self.reference_date(index)
        try:
            # NB: datetime.year returns an int.
            return parse(ref_date).year
        except (ValueError, OverflowError):
            # NOTE(review): ``parse`` is presumably dateutil's parser, which
            # is documented to raise OverflowError for out-of-range values.
            matched = re.search(r"\d{4}", ref_date)
            if matched:
                return int(matched.group())
            else:
                return ""

    def reference_journal(self, index):
        """Return the reference journal name.

        For anything but a journal article, the reference type in
        parentheses is returned instead.
        """
        # TODO Change the column name 'Journal' to an other?
        ref_type = self.reference_type(index)
        if ref_type == "journalArticle":
            return self.reference_data(index)["publicationTitle"]
        else:
            return "({})".format(ref_type)

    # Public methods section.

    def reference_index(self, ref_id):
        """Return the index of the first reference with this ID.

        Raises ``ReferenceNotFoundError`` if no reference has this ID.
        """
        try:
            indexes = range(self.reference_count())
            return next(i for i in indexes if self.reference_id(i) == ref_id)
        except StopIteration as e:
            # format() rather than "+" so a non-str ID still produces the
            # intended error instead of a TypeError.
            raise ReferenceNotFoundError("ID: {}".format(ref_id)) from e

    def reference_creators_citation(self, ref_id):
        """Return for citation the creator surnames (locally defined) and the publication year.

        Returns '' when the reference has no creator surnames.
        """
        # FIXME Delayed refactoring. Use an index instead of an ID.
        index = self.reference_index(ref_id)
        creators = self.reference_creator_surnames(index)
        creator_count = len(creators)
        if creator_count == 0:
            return ""
        year = self.reference_year(index)
        if creator_count == 1:
            return "{} ({})".format(creators[0], year)
        elif creator_count == 2:
            return "{} and {} ({})".format(creators[0], creators[1], year)
        else:
            return "{} et al. ({})".format(creators[0], year)
Esempio n. 5
0
class ZoteroImporter(object):
    """Import Papers2 publications into Zotero in batches.

    Publications are converted to Zotero items with :meth:`add_pub` and
    collected in a batch; a full batch is uploaded automatically, and
    :meth:`close` uploads whatever is left. When ``dryrun`` is given, items
    are written with a ``JSONWriter`` instead of being uploaded.
    """

    def __init__(self,
                 library_id,
                 library_type,
                 api_key,
                 papers2,
                 keyword_types=('user', 'label'),
                 label_map=None,
                 add_to_collections=(),
                 upload_attachments="all",
                 batch_size=50,
                 checkpoint=None,
                 dryrun=None):
        """Set up the Zotero client, the batch and the target collections.

        :param label_map:           stored as ``self.label_map``; defaults
                                    to a fresh empty dict
        :param add_to_collections:  names of the collections to load or
                                    create; ``None`` means every Papers2
                                    collection, empty (the default) means
                                    none
        :param upload_attachments:  "all", "unread" or "none"
        :param checkpoint:          optional object tracking the
                                    publications already imported
        :param dryrun:              when not ``None``, passed to
                                    ``JSONWriter``; nothing is uploaded
        """
        self.client = Zotero(library_id, library_type, api_key)
        self.papers2 = papers2
        self.keyword_types = keyword_types
        # A fresh dict per instance: a mutable default would be shared by
        # every importer created without an explicit label_map.
        self.label_map = {} if label_map is None else label_map
        self.upload_attachments = upload_attachments
        self.checkpoint = checkpoint
        self.dryrun = JSONWriter(dryrun) if dryrun is not None else None
        self._batch = Batch(batch_size)
        self._load_collections(add_to_collections)

    # Load Zotero collections and create any
    # Papers2 collections that don't exist.
    # TODO: need to handle collection hierarchies
    def _load_collections(self, add_to_collections):
        """Fill ``self.collections`` with a name -> key map.

        In dry-run mode the key is the placeholder ``<name>``. Otherwise
        the requested collections missing from Zotero are created first and
        the keys are read back from Zotero.
        """
        self.collections = {}
        if add_to_collections is None:
            add_to_collections = [c.name
                                  for c in self.papers2.get_collections()]

        if len(add_to_collections) == 0:
            return

        if self.dryrun is not None:
            for c in add_to_collections:
                self.collections[c] = "<{0}>".format(c)
            return

        # fetch existing zotero collections
        # NOTE(review): presumably this returns only one page of results;
        # verify for libraries with many collections.
        existing_collections = {}
        for zc in self.client.collections():
            data = zc['data']
            existing_collections[data['name']] = data['key']

        # add any papers2 collections that do not already exist
        payload = [dict(name=pc) for pc in add_to_collections
                   if pc not in existing_collections]
        if payload:
            self.client.create_collection(payload)

        # re-fetch zotero collections in order to get keys
        for zc in self.client.collections():
            data = zc['data']
            if data['name'] in add_to_collections:
                self.collections[data['name']] = data['key']

    def add_pub(self, pub):
        """Convert a publication to a Zotero item and add it to the batch.

        :returns:   False if the checkpoint says the publication has already
                    been imported, True otherwise
        """
        # ignore publications we've already imported
        if self.checkpoint is not None and self.checkpoint.contains(pub.ROWID):
            log.debug("Skipping already imported publication {0}".format(
                pub.ROWID))
            return False

        # convert the Papers2 publication type to a Zotero item type
        item_type = ITEM_TYPES[self.papers2.get_pub_type(pub)]

        # get the template to fill in for an item of this type
        item = self.client.item_template(item_type)

        # fill in template fields; iterate over a snapshot because values
        # are replaced in the dict during the loop (items() also works on
        # Python 3, unlike iteritems())
        for key, value in list(item.items()):
            if key in EXTRACTORS:
                value = EXTRACTORS[key].extract(pub, self, value)
                if value is not None:
                    item[key] = value

        # add notes, if any
        notes = []
        if pub.notes is not None and len(pub.notes) > 0:
            notes.append(pub.notes)

        reviews = self.papers2.get_reviews(pub)
        for r in reviews:
            notes.append("{0} Rating: {1}".format(r.content, r.rating))

        # get paths to attachments
        attachments = []
        if self.upload_attachments == "all" or (
                self.upload_attachments == "unread" and pub.times_read == 0):
            attachments = list(self.papers2.get_attachments(pub))

        # add to batch and checkpoint
        self._batch.add(item, notes, attachments)
        if self.checkpoint is not None:
            self.checkpoint.add(pub.ROWID)

        # commit the batch if it's full
        self._commit_batch()

        return True

    def close(self):
        """Commit any pending items and close the dry-run writer.

        The writer is closed even when the final commit raises.
        """
        try:
            if self._batch is not None:
                self._commit_batch(force=True)
                self._batch = None
        finally:
            if self.dryrun is not None:
                self.dryrun.close()

    def _commit_batch(self, force=False):
        """Write or upload the batch if it is full (or non-empty and forced).

        On any error the checkpoint is rolled back and the error re-raised;
        the batch is cleared in every case.
        """
        if not (self._batch.is_full or
                (force and not self._batch.is_empty)):
            return

        try:
            if self.dryrun is not None:
                for item, attachments in self._batch.iter():
                    self.dryrun.write(item, attachments)
            else:
                self._upload_batch()

        except BaseException:
            # Everything is caught (interrupts included) so the checkpoint
            # is always rolled back; the error is re-raised below.
            log.error("Error importing {0} items to Zotero".format(
                self._batch.size))
            if self.checkpoint is not None:
                self.checkpoint.rollback()
            raise

        finally:
            self._batch.clear()

    def _upload_batch(self):
        """Upload the batch items, then their notes and attachments."""
        # upload metadata
        status = self.client.create_items(self._batch.items)

        for status_idx, status_msg in status['failed'].items():
            item_idx = int(status_idx)
            # remove failures from the checkpoint
            # NOTE(review): the checkpoint is fed pub.ROWID in add_pub() but
            # a batch index is passed here -- confirm what remove() expects.
            if self.checkpoint is not None:
                self.checkpoint.remove(item_idx)
            item = self._batch.items[item_idx]
            log.error(
                "Upload failed for item {0}; code {1}; {2}".
                format(item['title'], status_msg['code'],
                       status_msg['message']))

        # both created and unchanged items get their notes and attachments
        successes = {}
        successes.update(status['success'])
        successes.update(status['unchanged'])

        for k, objKey in successes.items():
            item_idx = int(k)
            self._upload_notes(item_idx, objKey)
            # upload attachments and add items to collections
            if self.upload_attachments != "none":
                self._upload_attachments(item_idx, objKey)

        # update checkpoint
        if self.checkpoint is not None:
            self.checkpoint.commit()

        log.info(
            "Batch committed: {0} items created and {1} items unchanged out of {2} attempted"
            .format(len(status['success']),
                    len(status['unchanged']), self._batch.size))

    def _upload_notes(self, item_idx, objKey):
        """Create the notes of one batch item as children of ``objKey``."""
        notes = self._batch.notes[item_idx]
        if len(notes) == 0:
            return

        note_batch = []
        for note_text in notes:
            note = self.client.item_template('note')
            note['parentItem'] = objKey
            note['note'] = note_text
            note_batch.append(note)

        note_status = self.client.create_items(note_batch)

        for status_idx, status_msg in note_status['failed'].items():
            note_idx = int(status_idx)
            # just warn about these failures
            note = note_batch[note_idx]
            log.error(
                "Failed to create note {0} for item item {1}; code {2}; {3}"
                .format(note['note'],
                        self._batch.items[item_idx]['title'],
                        status_msg['code'],
                        status_msg['message']))

    def _upload_attachments(self, item_idx, objKey):
        """Upload the attachment files of one batch item under ``objKey``."""
        # TODO: modify pyzotero to pass MIME type for contentType key
        attachments = [path for path, _mime in
                       self._batch.attachments[item_idx]]
        if len(attachments) == 0:
            return
        try:
            self.client.attachment_simple(attachments, objKey)

        # This is to work around a bug in pyzotero where an exception is
        # thrown if an attachment already exists
        except KeyError:
            log.info(
                "One or more attachment already exists: {0}"
                .format(",".join(attachments)))
Esempio n. 6
0
class ZoteroWrap:
    """Wrap a ``Zotero`` client and keep a pickled local copy of its data.

    The wrapper holds three pieces of state: the list of references, the
    sorted list of reference types, and the per-type reference templates.
    All three are stored together in one pickle file.
    """

    # Keys of the dictionary stored in the pickle cache.
    CACHE_REFERENCE_LIST = "references"
    CACHE_REFERENCE_TYPES = "reference_types"
    CACHE_REFERENCE_TEMPLATES = "reference_templates"

    def __init__(self, library_id, library_type, api_key, directory):
        """Create the Zotero client and compute the cache file path.

        No data is loaded here; call :meth:`initialize` for that.
        """
        # NOTE(review): the API key is embedded in plain text in the cache
        # file name. Kept unchanged so existing caches are still found, but
        # consider hashing it.
        cache_filename = "{}-{}-{}.pkl".format(library_id, library_type, api_key)
        self.cache_path = os.path.join(directory, cache_filename)
        # reference_types and reference_templates must have the same ordering.
        self.reference_types = []
        self.reference_templates = {}
        self._zotero_lib = Zotero(library_id, library_type, api_key)
        self._references = []

    # Data I/O methods section.

    def initialize(self):
        """Load the cached Zotero data, or retrieve them if there is none.

        A cache file that exists but cannot be read back (empty, truncated,
        corrupt, or missing one of the expected keys) is treated like a
        missing cache: the data are fetched again and the cache rewritten.
        """
        try:
            self.load_cache()
        except (FileNotFoundError, EOFError, KeyError, pickle.UnpicklingError):
            self.load_distant()

    def load_cache(self):
        """Load the cached Zotero data.

        Raises whatever ``open`` or ``pickle.load`` raise, and ``KeyError``
        if the cached dictionary lacks one of the expected keys.
        """
        with open(self.cache_path, "rb") as f:
            print("Loading cached Zotero data...")
            # NOTE(review): pickle executes arbitrary code on load; the cache
            # directory must only be writable by trusted users.
            cache = pickle.load(f)
            self._references = cache[self.CACHE_REFERENCE_LIST]
            self.reference_types = cache[self.CACHE_REFERENCE_TYPES]
            self.reference_templates = cache[self.CACHE_REFERENCE_TEMPLATES]
            print("Cached Zotero data loaded.")

    def load_distant(self):
        """Load the distant Zotero data and write them to the cache."""
        print("Loading distant Zotero data...")
        self._references = self.get_references()
        self.reference_types = self.get_reference_types()
        self.reference_templates = self.get_reference_templates(self.reference_types)
        print("Distant Zotero data loaded.")
        self.cache()

    def cache(self):
        """Cache the Zotero data (references, types and templates)."""
        with open(self.cache_path, "wb") as f:
            cache = {self.CACHE_REFERENCE_LIST: self._references,
                     self.CACHE_REFERENCE_TYPES: self.reference_types,
                     self.CACHE_REFERENCE_TEMPLATES: self.reference_templates}
            pickle.dump(cache, f)

    def create_local_reference(self, ref):
        """Append the reference at the end of the reference list and cache it."""
        self._references.append(ref)
        self.cache()

    def create_distant_reference(self, ref_data):
        """Validate and create the reference in Zotero and return the created item.

        Raises ``InvalidZoteroItemError`` if validation fails and
        ``CreateZoteroItemError`` if the creation status holds no successful
        entry for the submitted item.
        """
        self.validate_reference_data(ref_data)
        creation_status = self._zotero_lib.create_items([ref_data])
        try:
            # Only one item was submitted, so its status index is "0".
            created_item = creation_status["successful"]["0"]
            return created_item
        except KeyError as e:
            print(creation_status)
            raise CreateZoteroItemError from e

    def update_local_reference(self, index, ref):
        """Replace the reference in the reference list and cache it."""
        self._references[index] = ref
        self.cache()

    def update_distant_reference(self, ref):
        """Validate and update the reference in Zotero.

        Existing fields not present will be left unmodified.
        """
        self.validate_reference_data(ref["data"])
        self._zotero_lib.update_item(ref)

    def validate_reference_data(self, ref_data):
        """Validate the reference data.

        Zotero.check_items() caches data after the first API call.

        Raises ``InvalidZoteroItemError`` when the client reports invalid
        fields.
        """
        try:
            self._zotero_lib.check_items([ref_data])
        except InvalidItemFields as e:
            raise InvalidZoteroItemError from e

    def get_references(self):
        """Return all references in the Zotero database. Takes time..."""
        return self._zotero_lib.everything(self._zotero_lib.top())

    def get_reference_types(self):
        """Return the reference types, sorted alphabetically.

        Zotero.item_types() caches data after the first API call.
        """
        item_types = self._zotero_lib.item_types()
        return sorted(x["itemType"] for x in item_types)

    def get_reference_templates(self, ref_types):
        """Return the reference templates for the types as an ordered dictionary."""
        return OrderedDict([(x, self.get_reference_template(x)) for x in ref_types])

    def get_reference_template(self, ref_type):
        """Return the reference template for the type as an ordered dictionary.

        The template fields are sorted by field name.

        Zotero.item_template() caches data after the first API call.
        """
        template = self._zotero_lib.item_template(ref_type)
        return OrderedDict(sorted(template.items(), key=lambda x: x[0]))

    def get_reference(self, ref_key):
        """Return the reference for the key."""
        return self._zotero_lib.item(ref_key)

    # Public @properties surrogates section.

    def reference_count(self):
        """Return the number of references."""
        return len(self._references)

    def reference_data(self, index):
        """Return the 'data' field of the reference."""
        return self._references[index]["data"]

    def reference_extra_field(self, field, index):
        """Return the value of the field in 'extra', otherwise ''.

        'extra' is read as newline-separated ``<field>: <value>`` lines; the
        first line starting with ``<field>:`` wins. A reference without an
        'extra' entry (or with an empty one) yields ''.
        """
        # Not every item carries 'extra'; a missing entry must not break
        # lookups that scan the whole reference list.
        extra = self.reference_data(index).get("extra") or ""
        field_id = field + ":"
        for line in extra.split("\n"):
            if line.startswith(field_id):
                return line[len(field_id):].strip()
        return ""

    def reference_type(self, index):
        """Return the reference type."""
        return self.reference_data(index)["itemType"]

    def reference_key(self, index):
        """Return the reference key."""
        return self._references[index]["key"]

    def reference_id(self, index):
        """Return the reference ID (locally defined).

        The ID is the DOI if there is one, else ``PMID_<pmid>``, else
        ``UNPUBLISHED_<id>``, else ''.
        """
        # TODO Include ISBN and ISSN?
        doi = self.reference_doi(index)
        if doi:
            return doi
        pmid = self.reference_pmid(index)
        if pmid:
            return "PMID_" + pmid
        unpublished_id = self.reference_unpublished_id(index)
        if unpublished_id:
            return "UNPUBLISHED_" + unpublished_id
        return ""

    def reference_doi(self, index):
        """Return the reference DOI.

        The 'DOI' data field is used when the reference has one (even if it
        is empty); otherwise the 'DOI' line of 'extra' is used.
        """
        ref_data = self.reference_data(index)
        if "DOI" in ref_data:
            return ref_data["DOI"]
        return self.reference_extra_field("DOI", index)

    def reference_pmid(self, index):
        """Return the reference PMID."""
        return self.reference_extra_field("PMID", index)

    def reference_unpublished_id(self, index):
        """Return the reference UNPUBLISHED ID."""
        return self.reference_extra_field("UNPUBLISHED", index)

    def reference_title(self, index):
        """Return the reference title."""
        return self.reference_data(index)["title"]

    def reference_creator_surnames(self, index):
        """Return as a list the surnames of the reference creators (locally defined).

        Only creators of type 'author' are returned when there is at least
        one; otherwise all creators are. If any selected creator has no
        'lastName', an empty list is returned.
        """
        # TODO Not true, ex: ISBN 978-1-4398-3778-8. Return all creator types?
        # Academic books published as a collection of chapters contributed by
        # different authors have editors but not authors at the level of the
        # book (as opposed to the level of a chapter).
        creators = self.reference_data(index)["creators"]
        creator_types = [x["creatorType"] for x in creators]
        # 'name' (not split) might be used instead of 'firstName' and 'lastName'.
        try:
            if "author" in creator_types:
                return [x["lastName"] for x in creators if x["creatorType"] == "author"]
            else:
                return [x["lastName"] for x in creators]
        except KeyError:
            return []

    def reference_creator_surnames_str(self, index):
        """Return as a string the surnames of the reference creators (locally defined)."""
        # NB: str.join() returns an empty string for an empty list.
        return ", ".join(self.reference_creator_surnames(index))

    def reference_date(self, index):
        """Return the reference publication date."""
        return self.reference_data(index)["date"]

    def reference_year(self, index):
        """Return the reference publication year.

        Returns an int when a year can be determined, otherwise ''. When the
        date parser rejects the string, the first run of four digits in the
        date is used as a fallback.
        """
        # TODO Use meta:parsedDate field instead?
        ref_date = self.reference_date(index)
        try:
            # NB: datetime.year returns an int.
            return parse(ref_date).year
        except (ValueError, OverflowError):
            # NOTE(review): ``parse`` is presumably dateutil's parser, which
            # is documented to raise OverflowError for out-of-range values.
            matched = re.search(r"\d{4}", ref_date)
            if matched:
                return int(matched.group())
            else:
                return ""

    def reference_journal(self, index):
        """Return the reference journal name.

        For anything but a journal article, the reference type in
        parentheses is returned instead.
        """
        # TODO Change the column name 'Journal' to an other?
        ref_type = self.reference_type(index)
        if ref_type == "journalArticle":
            return self.reference_data(index)["publicationTitle"]
        else:
            return "({})".format(ref_type)

    # Public methods section.

    def reference_index(self, ref_id):
        """Return the index of the first reference with this ID.

        Raises ``ReferenceNotFoundError`` if no reference has this ID.
        """
        try:
            indexes = range(self.reference_count())
            return next(i for i in indexes if self.reference_id(i) == ref_id)
        except StopIteration as e:
            # format() rather than "+" so a non-str ID still produces the
            # intended error instead of a TypeError.
            raise ReferenceNotFoundError("ID: {}".format(ref_id)) from e

    def reference_creators_citation(self, ref_id):
        """Return for citation the creator surnames (locally defined) and the publication year.

        Returns '' when the reference has no creator surnames.
        """
        # FIXME Delayed refactoring. Use an index instead of an ID.
        index = self.reference_index(ref_id)
        creators = self.reference_creator_surnames(index)
        creator_count = len(creators)
        if creator_count == 0:
            return ""
        year = self.reference_year(index)
        if creator_count == 1:
            return "{} ({})".format(creators[0], year)
        elif creator_count == 2:
            return "{} and {} ({})".format(creators[0], creators[1], year)
        else:
            return "{} et al. ({})".format(creators[0], year)