class ZoteroBackend(object):
    """ Service class used by zotcli to talk to the Zotero API.

    Wraps a ``Zotero`` client together with a local ``SearchIndex`` and
    converts notes between Zotero's HTML and the configured markup format.
    """

    @staticmethod
    def create_api_key():
        """ Interactively create a new API key via Zotero's OAuth API.

        Requires the user to enter a verification key displayed in the browser.

        :returns:   API key and the user's library ID
        :raises click.ClickException: If the access token request fails
        """
        auth = OAuth1Service(
            name='zotero',
            consumer_key=CLIENT_KEY,
            consumer_secret=CLIENT_SECRET,
            request_token_url=REQUEST_TOKEN_URL,
            access_token_url=ACCESS_TOKEN_URL,
            authorize_url=AUTH_URL,
            base_url=BASE_URL)
        # 'oob' (out-of-band): the verification code is shown to the user
        # instead of being delivered through a callback URL.
        token, secret = auth.get_request_token(
            params={'oauth_callback': 'oob'})
        auth_url = auth.get_authorize_url(token)
        auth_url += '&' + urlencode({
            'name': 'zotero-cli',
            'library_access': 1,
            'notes_access': 1,
            'write_access': 1,
            'all_groups': 'read'
        })
        click.echo("Opening {} in browser, please confirm.".format(auth_url))
        click.launch(auth_url)
        verification = click.prompt("Enter verification code")
        token_resp = auth.get_raw_access_token(
            token, secret, method='POST',
            data={'oauth_verifier': verification})
        if not token_resp:
            logging.debug(token_resp.content)
            # click has no module-level ``fail``; raising ClickException is
            # the supported way to abort with an error message.
            raise click.ClickException("Error during API key generation.")
        access = urlparse.parse_qs(token_resp.text)
        return access['oauth_token'][0], access['userID'][0]

    def __init__(self, api_key=None, library_id=None, library_type='user',
                 autosync=False):
        """ Service class for communicating with the Zotero API.

        This is mainly a thin wrapper around :py:class:`pyzotero.zotero.Zotero`
        that handles things like transparent HTML<->[edit-format] conversion.

        :param api_key:     API key for the Zotero API, will be loaded from
                            the configuration if not specified
        :param library_id:  Zotero library ID the API key is valid for, will
                            be loaded from the configuration if not specified
        :param library_type: Type of the library, can be 'user' or 'group'
        :param autosync:    Synchronize the local index right away if the
                            configured sync interval has elapsed
        :raises ValueError: If neither the arguments nor the configuration
                            provide an API key and a library ID
        """
        self._logger = logging.getLogger()
        idx_path = os.path.join(click.get_app_dir(APP_NAME), 'index.sqlite')
        self.config = load_config()
        self.note_format = self.config['zotcli.note_format']
        self.storage_dir = self.config.get('zotcli.storage_dir')
        self.betterbibtex = self.config.get('zotcli.betterbibtex')
        # Always define the attribute so later lookups never hit an
        # AttributeError; it stays falsy when the option is not configured.
        self.app_dir = self.config.get('zotcli.app_dir')

        api_key = api_key or self.config.get('zotcli.api_key')
        library_id = library_id or self.config.get('zotcli.library_id')
        if not api_key or not library_id:
            raise ValueError(
                "Please set your API key and library ID by running "
                "`zotcli configure` or pass them as command-line options.")
        self._zot = Zotero(library_id=library_id, api_key=api_key,
                           library_type=library_type)
        self._index = SearchIndex(idx_path)
        sync_interval = self.config.get('zotcli.sync_interval', 300)
        since_last_sync = int(time.time()) - self._index.last_modified
        if autosync and since_last_sync >= int(sync_interval):
            click.echo("{} seconds since last sync, synchronizing.".format(
                since_last_sync))
            num_updated = self.synchronize()
            click.echo("Updated {} items".format(num_updated))

    def getBetterBibtexKeys(self):
        """ Load the item key to citation key mapping written by Better BibTeX.

        Reads ``better-bibtex/_better-bibtex.json`` below ``self.app_dir`` and
        maps each entry's ``itemKey`` to its ``citekey``, taken from the first
        element of the ``collections`` list.

        :returns:   Dictionary mapping item keys to citation keys
        :raises ValueError: If no application directory is configured
        """
        app_dir = getattr(self, 'app_dir', None)
        if not app_dir:
            raise ValueError(
                "Please set `zotcli.app_dir` in the configuration to use "
                "Better BibTeX citation keys.")
        bbt_path = os.path.join(app_dir, 'better-bibtex/_better-bibtex.json')
        with open(bbt_path) as data_file:
            data = json.load(data_file)
        return {entry['itemKey']: entry['citekey']
                for entry in data['collections'][0]['data']}

    def synchronize(self):
        """ Update the local index to the latest library version.

        :returns:   Number of items that were fetched and indexed
        """
        new_items = tuple(self.items(since=self._index.library_version))
        # The version is read from the headers of the client's last request.
        version = int(self._zot.request.headers.get('last-modified-version'))
        self._index.index(new_items, version)
        return len(new_items)

    def search(self, query, limit=None):
        """ Search the local index for items.

        :param query:   A sqlite FTS4 query
        :param limit:   Maximum number of items to return
        :returns:       Generator that yields matching items.
        """
        return self._index.search(query, limit=limit)

    def items(self, query=None, limit=None, recursive=False, since=0):
        """ Get a list of all items in the library matching the arguments.

        :param query:   Filter items by this query string (targets author and
                        title fields)
        :type query:    str/unicode
        :param limit:   Limit maximum number of returned items
        :type limit:    int
        :param recursive: Include non-toplevel items (attachments, notes, etc)
                          in output
        :type recursive: bool
        :param since:   Passed to the Zotero API as the ``since`` argument
        :returns:       Generator that yields items
        """
        bbtxkeys = None
        if self.betterbibtex:
            bbtxkeys = self.getBetterBibtexKeys()
        if limit is None:
            limit = 100
        query_args = {'since': since}
        if query:
            query_args['q'] = query
        if limit:
            query_args['limit'] = limit
        query_fn = self._zot.items if recursive else self._zot.top
        # NOTE: Normally we'd use the makeiter method of Zotero, but it seems
        #       to be broken at the moment, thus we call .follow ourselves
        items = query_fn(**query_args)
        last_url = self._zot.links.get('last')
        if last_url:
            while self._zot.links['self'] != last_url:
                items.extend(self._zot.follow())
        for it in items:
            if self.betterbibtex:
                # Items unknown to Better BibTeX simply get no citekey.
                citekey = bbtxkeys.get(it['data']['key'])
            else:
                matches = CITEKEY_PAT.finditer(it['data'].get('extra', ''))
                citekey = next((m.group(1) for m in matches), None)
            yield Item(key=it['data']['key'],
                       creator=it['meta'].get('creatorSummary'),
                       title=it['data'].get('title', "Untitled"),
                       abstract=it['data'].get('abstractNote'),
                       date=it['data'].get('date'),
                       citekey=citekey)

    def notes(self, item_id):
        """ Get a list of all notes for a given item.

        The HTML of every note is replaced by the dictionary produced by
        :py:meth:`_make_note`.

        :param item_id:     ID/key of the item to get notes for
        :returns:           Notes for item
        """
        notes = self._zot.children(item_id, itemType="note")
        for note in notes:
            note['data']['note'] = self._make_note(note)
            yield note

    def attachments(self, item_id):
        """ Get a list of all attachments for a given item.

        If a zotero profile directory is specified in the configuration,
        a resolved local file path will be included, if the file exists.

        :param item_id:     ID/key of the item to get attachments for
        :returns:           Attachments for item
        """
        attachments = self._zot.children(item_id, itemType="attachment")
        if self.storage_dir:
            for att in attachments:
                if not att['data']['linkMode'].startswith("imported"):
                    continue
                fpath = os.path.join(self.storage_dir, att['key'],
                                     att['data']['filename'])
                if not os.path.exists(fpath):
                    continue
                att['data']['path'] = fpath
        return attachments

    def get_attachment_path(self, attachment):
        """ Resolve an attachment to a local file path.

        The strategy depends on the ``zotcli.sync_method`` setting:
        'zotfile' joins the storage directory with the attachment title,
        'local' uses the path stored on the attachment, 'zotero' and 'webdav'
        fetch the file into ``TEMP_DIR`` unless it already exists there.

        :param attachment:  Attachment as returned by :py:meth:`attachments`
        :returns:           Path to the file, or None if the sync method is
                            not one of the handled values
        :raises ValueError: If the sync method is not 'zotfile' and the
                            attachment's link mode is not an imported one
        """
        storage_method = self.config['zotcli.sync_method']
        if storage_method == 'zotfile':
            storage = self.config['zotcli.storage_dir']
            return Path(os.path.join(storage, attachment['data']['title']))
        if not attachment['data']['linkMode'].startswith("imported"):
            raise ValueError(
                "Attachment is not stored on server, cannot download!")
        if storage_method == 'local':
            return Path(attachment['data']['path'])
        out_path = TEMP_DIR / attachment['data']['filename']
        if out_path.exists():
            return out_path
        if storage_method == 'zotero':
            self._zot.dump(attachment['key'], path=unicode(TEMP_DIR))
            return out_path
        elif storage_method == 'webdav':
            user = self.config['zotcli.webdav_user']
            password = self.config['zotcli.webdav_pass']
            location = self.config['zotcli.webdav_path']
            zip_url = "{}/zotero/{}.zip".format(location, attachment['key'])
            resp = requests.get(zip_url, auth=(user, password))
            zf = zipfile.ZipFile(StringIO(resp.content))
            zf.extractall(str(TEMP_DIR))
            return out_path

    def _make_note(self, note_data):
        """ Converts a note from HTML to the configured markup.

        If the note was previously edited with zotcli, the original markup
        will be restored. If it was edited with the Zotero UI, it will be
        converted from the HTML via pandoc.

        :param note_data:   Note as returned by the API; its HTML is read from
                            ``note_data['data']['note']`` and its version from
                            ``note_data['version']``
        :returns:           Dictionary with markup, format and version
        """
        data = None
        note_html = note_data['data']['note']
        note_version = note_data['version']
        if "title=\"b'" in note_html:
            # Fix for badly formatted notes from an earlier version (see #26)
            note_html = re.sub(r'title="b\'(.*?)\'"', r'title="\1"', note_html)
            note_html = note_html.replace("\\n", "")
        blobs = DATA_PAT.findall(note_html)
        # Previously edited with zotcli
        if blobs:
            data = decode_blob(blobs[0])
            if 'version' not in data:
                data['version'] = note_version
            note_html = DATA_PAT.sub("", note_html)
        # Not previously edited with zotcli or updated from the Zotero UI
        if not data or data['version'] < note_version:
            if data and data['version'] < note_version:
                self._logger.info("Note changed on server, reloading markup.")
            note_format = data['format'] if data else self.note_format
            data = {
                'format': note_format,
                'text': pypandoc.convert(note_html, note_format,
                                         format='html'),
                'version': note_version
            }
        return data

    def _make_note_html(self, note_data):
        """ Converts the note's text to HTML and adds a dummy element that
            holds the original markup.

        :param note_data:   dict with text, format and version of the note
        :returns:           Note as HTML
        """
        extra_data = DATA_TMPL.format(
            data=encode_blob(note_data).decode('utf8'))
        html = pypandoc.convert(note_data['text'], 'html',
                                format=note_data['format'])
        return html + extra_data

    def create_note(self, item_id, note_text):
        """ Create a new note for a given item.

        If the upload fails, the error is logged and the note text is written
        to 'note_backup.txt' in the current directory instead of raising.

        :param item_id:     ID/key of the item to create the note for
        :param note_text:   Text of the note
        """
        note = self._zot.item_template('note')
        note_data = {
            'format': self.note_format,
            'text': note_text,
            'version': self._zot.last_modified_version(limit=1) + 2
        }
        note['note'] = self._make_note_html(note_data)
        try:
            self._zot.create_items([note], item_id)
        except Exception as e:
            self._logger.error(e)
            with open("note_backup.txt", "w", encoding='utf-8') as fp:
                fp.write(note_data['text'])
            # Logger.warn is a deprecated alias of Logger.warning.
            self._logger.warning(
                "Could not upload note to Zotero. You can find the note "
                "markup in 'note_backup.txt' in the current directory")

    def save_note(self, note):
        """ Update an existing note.

        The note's version is incremented in place before it is converted
        back to HTML. If the upload fails, the error is logged and the note
        text is written to 'note_backup.txt' in the current directory.

        :param note:        The updated note
        """
        raw_data = note['data']['note']
        raw_data['version'] += 1
        note['data']['note'] = self._make_note_html(raw_data)
        try:
            self._zot.update_item(note)
        except Exception as e:
            self._logger.error(e)
            with open("note_backup.txt", "w", encoding='utf-8') as fp:
                fp.write(raw_data['text'])
            # Logger.warn is a deprecated alias of Logger.warning.
            self._logger.warning(
                "Could not upload note to Zotero. You can find the note "
                "markup in 'note_backup.txt' in the current directory")
class ZoteroImporter(object):
    """Import Papers2 publications into a Zotero library in batches.

    Publications are queued with :meth:`add_pub` and uploaded whenever the
    batch is full; :meth:`close` flushes whatever is left.
    """

    def __init__(
        self,
        library_id,
        library_type,
        api_key,
        papers2,
        keyword_types=("user", "label"),
        label_map=None,
        add_to_collections=(),
        upload_attachments="all",
        batch_size=50,
        checkpoint=None,
        dryrun=None,
    ):
        """Create the Zotero client and resolve the target collections.

        :param library_id: passed to the ``Zotero`` client
        :param library_type: passed to the ``Zotero`` client
        :param api_key: passed to the ``Zotero`` client
        :param papers2: source library; must provide ``get_collections``,
            ``get_pub_type``, ``get_reviews`` and ``get_attachments``
        :param keyword_types: stored on the instance; not used by this class
            itself (presumably read by the field extractors)
        :param label_map: stored on the instance; a new dict is created when
            omitted so instances never share one
        :param add_to_collections: collection names to resolve; ``None``
            means every collection reported by ``papers2``
        :param upload_attachments: "all", "unread" (only publications with
            ``times_read == 0``) or "none"
        :param batch_size: passed to ``Batch``
        :param checkpoint: optional object tracking imported publications
            (``contains``/``add``/``remove``/``commit``/``rollback``)
        :param dryrun: when not ``None``, passed to ``JSONWriter``; items are
            then written there instead of being uploaded
        """
        self.client = Zotero(library_id, library_type, api_key)
        self.papers2 = papers2
        self.keyword_types = keyword_types
        self.label_map = {} if label_map is None else label_map
        self.upload_attachments = upload_attachments
        self.checkpoint = checkpoint
        self.dryrun = JSONWriter(dryrun) if dryrun is not None else None
        self._batch = Batch(batch_size)
        self._load_collections(add_to_collections)

    # Load Zotero collections and create any
    # Papers2 collections that don't exist.
    # TODO: need to handle collection hierarchies
    def _load_collections(self, add_to_collections):
        """Fill ``self.collections`` with a name -> Zotero key mapping.

        In dry-run mode no request is made and each key is the placeholder
        ``"<name>"``.
        """
        self.collections = {}
        if add_to_collections is None:
            add_to_collections = [c.name for c in self.papers2.get_collections()]

        if len(add_to_collections) > 0:
            if self.dryrun is not None:
                for c in add_to_collections:
                    self.collections[c] = "<{0}>".format(c)
            else:
                # fetch existing zotero collections
                existing_collections = {}
                for zc in self.client.collections():
                    data = zc["data"]
                    existing_collections[data["name"]] = data["key"]

                # add any papers2 collections that do not already exist
                payload = [
                    dict(name=pc)
                    for pc in add_to_collections
                    if pc not in existing_collections
                ]
                if len(payload) > 0:
                    self.client.create_collection(payload)

                # re-fetch zotero collections in order to get keys
                for zc in self.client.collections():
                    data = zc["data"]
                    if data["name"] in add_to_collections:
                        self.collections[data["name"]] = data["key"]

    def add_pub(self, pub):
        """Queue one publication for upload.

        :param pub: Papers2 publication record
        :returns: ``False`` if the checkpoint already contains the
            publication, ``True`` once it has been added to the batch
        """
        # ignore publications we've already imported
        if self.checkpoint is not None and self.checkpoint.contains(pub.ROWID):
            log.debug("Skipping already imported publication {0}".format(pub.ROWID))
            return False

        # convert the Papers2 publication type to a Zotero item type
        item_type = ITEM_TYPES[self.papers2.get_pub_type(pub)]

        # get the template to fill in for an item of this type
        item = self.client.item_template(item_type)

        # fill in template fields; iterate over a snapshot because the
        # template is updated inside the loop (dict.iteritems() does not
        # exist on Python 3, items() works on both major versions)
        for key, value in list(item.items()):
            if key in EXTRACTORS:
                value = EXTRACTORS[key].extract(pub, self, value)
                if value is not None:
                    item[key] = value

        # add notes, if any
        notes = []
        if pub.notes is not None and len(pub.notes) > 0:
            notes.append(pub.notes)

        reviews = self.papers2.get_reviews(pub)
        for r in reviews:
            notes.append("{0} Rating: {1}".format(r.content, r.rating))

        # get paths to attachments
        attachments = []
        if self.upload_attachments == "all" or (
            self.upload_attachments == "unread" and pub.times_read == 0
        ):
            attachments = list(self.papers2.get_attachments(pub))

        # add to batch and checkpoint
        self._batch.add(item, notes, attachments)
        if self.checkpoint is not None:
            self.checkpoint.add(pub.ROWID)

        # commit the batch if it's full
        self._commit_batch()

        return True

    def close(self):
        """Flush the pending batch and close the dry-run writer, if any."""
        if self._batch is not None:
            self._commit_batch(force=True)
            self._batch = None
        if self.dryrun is not None:
            self.dryrun.close()

    def _commit_batch(self, force=False):
        """Upload (or, in dry-run mode, write out) the current batch.

        Nothing happens unless the batch is full, or ``force`` is set and the
        batch is not empty. The batch is always cleared afterwards; on any
        error the checkpoint is rolled back and the error is re-raised.

        :param force: commit even if the batch is not full
        """
        if not (self._batch.is_full or (force and not self._batch.is_empty)):
            return

        try:
            if self.dryrun is not None:
                for item, attachments in self._batch.iter():
                    self.dryrun.write(item, attachments)
            else:
                # upload metadata
                status = self.client.create_items(self._batch.items)

                if len(status["failed"]) > 0:
                    for status_idx, status_msg in status["failed"].items():
                        item_idx = int(status_idx)
                        # remove failures from the checkpoint
                        if self.checkpoint is not None:
                            self.checkpoint.remove(item_idx)
                        item = self._batch.items[item_idx]
                        log.error(
                            "Upload failed for item {0}; code {1}; {2}".format(
                                item["title"], status_msg["code"], status_msg["message"]
                            )
                        )

                successes = {}
                successes.update(status["success"])
                successes.update(status["unchanged"])

                for k, objKey in successes.items():
                    item_idx = int(k)
                    self._upload_notes(item_idx, objKey)
                    self._upload_attachments(item_idx, objKey)

                # update checkpoint
                if self.checkpoint is not None:
                    self.checkpoint.commit()

                log.info(
                    "Batch committed: {0} items created and {1} items unchanged out of {2} attempted".format(
                        len(status["success"]), len(status["unchanged"]), self._batch.size
                    )
                )

        except BaseException:
            # Same reach as a bare ``except``: roll back on anything, then
            # let the error propagate.
            log.error("Error importing {0} items to Zotero".format(self._batch.size))
            if self.checkpoint is not None:
                self.checkpoint.rollback()
            raise

        finally:
            self._batch.clear()

    def _upload_notes(self, item_idx, parent_key):
        """Create the notes queued for batch item ``item_idx`` under ``parent_key``."""
        notes = self._batch.notes[item_idx]
        if len(notes) == 0:
            return

        note_batch = []
        for note_text in notes:
            note = self.client.item_template("note")
            note["parentItem"] = parent_key
            note["note"] = note_text
            note_batch.append(note)

        note_status = self.client.create_items(note_batch)

        if len(note_status["failed"]) > 0:
            for status_idx, status_msg in note_status["failed"].items():
                note_idx = int(status_idx)
                # just warn about these failures
                note = note_batch[note_idx]
                log.error(
                    "Failed to create note {0} for item item {1}; code {2}; {3}".format(
                        note["note"],
                        # parent item title; this used to reference the
                        # non-existent ``self.batch`` and ``idx``
                        self._batch.items[item_idx]["title"],
                        status_msg["code"],
                        status_msg["message"],
                    )
                )

    def _upload_attachments(self, item_idx, parent_key):
        """Upload the attachments queued for batch item ``item_idx``."""
        # upload attachments and add items to collections
        if self.upload_attachments == "none":
            return

        # TODO: modify pyzotero to pass MIME type for contentType key
        attachments = [path for path, mime in self._batch.attachments[item_idx]]
        if len(attachments) > 0:
            try:
                self.client.attachment_simple(attachments, parent_key)

            # This is to work around a bug in pyzotero where an exception is
            # thrown if an attachment already exists
            except KeyError:
                log.info("One or more attachment already exists: {0}".format(",".join(attachments)))
class ZoteroBackend(object):
    """ Service class used by zotcli to talk to the Zotero API.

    Wraps a ``Zotero`` client together with a local ``SearchIndex`` and
    converts notes between Zotero's HTML and the configured markup format.
    """

    @staticmethod
    def create_api_key():
        """ Interactively create a new API key via Zotero's OAuth API.

        Requires the user to enter a verification key displayed in the browser.

        :returns:   API key and the user's library ID
        :raises click.ClickException: If the access token request fails
        """
        auth = OAuth1Service(
            name='zotero',
            consumer_key=CLIENT_KEY,
            consumer_secret=CLIENT_SECRET,
            request_token_url=REQUEST_TOKEN_URL,
            access_token_url=ACCESS_TOKEN_URL,
            authorize_url=AUTH_URL,
            base_url=BASE_URL)
        # 'oob' (out-of-band): the verification code is shown to the user
        # instead of being delivered through a callback URL.
        token, secret = auth.get_request_token(
            params={'oauth_callback': 'oob'})
        auth_url = auth.get_authorize_url(token)
        auth_url += '&' + urlencode({
            'name': 'zotero-cli',
            'library_access': 1,
            'notes_access': 1,
            'write_access': 1,
            'all_groups': 'read'})
        click.echo("Opening {} in browser, please confirm.".format(auth_url))
        click.launch(auth_url)
        verification = click.prompt("Enter verification code")
        token_resp = auth.get_raw_access_token(
            token, secret, method='POST',
            data={'oauth_verifier': verification})
        if not token_resp:
            logging.debug(token_resp.content)
            # click has no module-level ``fail``; raising ClickException is
            # the supported way to abort with an error message.
            raise click.ClickException("Error during API key generation.")
        access = urlparse.parse_qs(token_resp.text)
        return access['oauth_token'][0], access['userID'][0]

    def __init__(self, api_key=None, library_id=None, library_type='user',
                 autosync=False):
        """ Service class for communicating with the Zotero API.

        This is mainly a thin wrapper around :py:class:`pyzotero.zotero.Zotero`
        that handles things like transparent HTML<->[edit-format] conversion.

        :param api_key:     API key for the Zotero API, will be loaded from
                            the configuration if not specified
        :param library_id:  Zotero library ID the API key is valid for, will
                            be loaded from the configuration if not specified
        :param library_type: Type of the library, can be 'user' or 'group'
        :param autosync:    Synchronize the local index right away if the
                            configured sync interval has elapsed
        :raises ValueError: If neither the arguments nor the configuration
                            provide an API key and a library ID
        """
        self._logger = logging.getLogger()
        idx_path = os.path.join(click.get_app_dir(APP_NAME), 'index.sqlite')
        self.config = load_config()
        self.note_format = self.config['zotcli.note_format']
        self.storage_dir = self.config.get('zotcli.storage_dir')

        api_key = api_key or self.config.get('zotcli.api_key')
        library_id = library_id or self.config.get('zotcli.library_id')
        if not api_key or not library_id:
            raise ValueError(
                "Please set your API key and library ID by running "
                "`zotcli configure` or pass them as command-line options.")
        self._zot = Zotero(library_id=library_id, api_key=api_key,
                           library_type=library_type)
        self._index = SearchIndex(idx_path)
        sync_interval = self.config.get('zotcli.sync_interval', 300)
        since_last_sync = int(time.time()) - self._index.last_modified
        if autosync and since_last_sync >= int(sync_interval):
            click.echo("{} seconds since last sync, synchronizing."
                       .format(since_last_sync))
            num_updated = self.synchronize()
            click.echo("Updated {} items".format(num_updated))

    def synchronize(self):
        """ Update the local index to the latest library version.

        :returns:   Number of items that were fetched and indexed
        """
        new_items = tuple(self.items(since=self._index.library_version))
        # The version is read from the headers of the client's last request.
        version = int(self._zot.request.headers.get('last-modified-version'))
        self._index.index(new_items, version)
        return len(new_items)

    def search(self, query, limit=None):
        """ Search the local index for items.

        :param query:   A sqlite FTS4 query
        :param limit:   Maximum number of items to return
        :returns:       Generator that yields matching items.
        """
        return self._index.search(query, limit=limit)

    def items(self, query=None, limit=None, recursive=False, since=0):
        """ Get a list of all items in the library matching the arguments.

        :param query:   Filter items by this query string (targets author and
                        title fields)
        :type query:    str/unicode
        :param limit:   Limit maximum number of returned items
        :type limit:    int
        :param recursive: Include non-toplevel items (attachments, notes, etc)
                          in output
        :type recursive: bool
        :param since:   Passed to the Zotero API as the ``since`` argument
        :returns:       Generator that yields items
        """
        if limit is None:
            limit = 100
        query_args = {'since': since}
        if query:
            query_args['q'] = query
        if limit:
            query_args['limit'] = limit
        query_fn = self._zot.items if recursive else self._zot.top
        # NOTE: Normally we'd use the makeiter method of Zotero, but it seems
        #       to be broken at the moment, thus we call .follow ourselves
        items = query_fn(**query_args)
        last_url = self._zot.links.get('last')
        if last_url:
            while self._zot.links['self'] != last_url:
                items.extend(self._zot.follow())
        for it in items:
            # The citation key is the first CITEKEY_PAT match in 'extra'.
            matches = CITEKEY_PAT.finditer(it['data'].get('extra', ''))
            citekey = next((m.group(1) for m in matches), None)
            yield Item(key=it['data']['key'],
                       creator=it['meta'].get('creatorSummary'),
                       title=it['data'].get('title', "Untitled"),
                       abstract=it['data'].get('abstractNote'),
                       date=it['data'].get('date'),
                       citekey=citekey)

    def notes(self, item_id):
        """ Get a list of all notes for a given item.

        The HTML of every note is replaced by the dictionary produced by
        :py:meth:`_make_note`.

        :param item_id:     ID/key of the item to get notes for
        :returns:           Notes for item
        """
        notes = self._zot.children(item_id, itemType="note")
        for note in notes:
            note['data']['note'] = self._make_note(note)
            yield note

    def attachments(self, item_id):
        """ Get a list of all attachments for a given item.

        If a zotero profile directory is specified in the configuration,
        a resolved local file path will be included, if the file exists.

        :param item_id:     ID/key of the item to get attachments for
        :returns:           Attachments for item
        """
        attachments = self._zot.children(item_id, itemType="attachment")
        if self.storage_dir:
            for att in attachments:
                if not att['data']['linkMode'].startswith("imported"):
                    continue
                fpath = os.path.join(self.storage_dir, att['key'],
                                     att['data']['filename'])
                if not os.path.exists(fpath):
                    continue
                att['data']['path'] = fpath
        return attachments

    def get_attachment_path(self, attachment):
        """ Resolve an attachment to a local file path.

        The strategy depends on the ``zotcli.sync_method`` setting: 'local'
        uses the path stored on the attachment, 'zotero' and 'webdav' fetch
        the file into ``TEMP_DIR`` unless it already exists there.

        :param attachment:  Attachment as returned by :py:meth:`attachments`
        :returns:           Path to the file, or None if the sync method is
                            not one of the handled values
        :raises ValueError: If the attachment's link mode is not an imported
                            one
        """
        if not attachment['data']['linkMode'].startswith("imported"):
            raise ValueError(
                "Attachment is not stored on server, cannot download!")
        storage_method = self.config['zotcli.sync_method']
        if storage_method == 'local':
            return Path(attachment['data']['path'])
        out_path = TEMP_DIR/attachment['data']['filename']
        if out_path.exists():
            return out_path
        if storage_method == 'zotero':
            self._zot.dump(attachment['key'], path=unicode(TEMP_DIR))
            return out_path
        elif storage_method == 'webdav':
            user = self.config['zotcli.webdav_user']
            password = self.config['zotcli.webdav_pass']
            location = self.config['zotcli.webdav_path']
            zip_url = "{}/zotero/{}.zip".format(
                location, attachment['key'])
            resp = requests.get(zip_url, auth=(user, password))
            zf = zipfile.ZipFile(StringIO(resp.content))
            zf.extractall(str(TEMP_DIR))
            return out_path

    def _make_note(self, note_data):
        """ Converts a note from HTML to the configured markup.

        If the note was previously edited with zotcli, the original markup
        will be restored. If it was edited with the Zotero UI, it will be
        converted from the HTML via pandoc.

        :param note_data:   Note as returned by the API; its HTML is read from
                            ``note_data['data']['note']`` and its version from
                            ``note_data['version']``
        :returns:           Dictionary with markup, format and version
        """
        data = None
        note_html = note_data['data']['note']
        note_version = note_data['version']
        if "title=\"b'" in note_html:
            # Fix for badly formatted notes from an earlier version (see #26)
            note_html = re.sub(r'title="b\'(.*?)\'"', r'title="\1"', note_html)
            note_html = note_html.replace("\\n", "")
        blobs = DATA_PAT.findall(note_html)
        # Previously edited with zotcli
        if blobs:
            data = decode_blob(blobs[0])
            if 'version' not in data:
                data['version'] = note_version
            note_html = DATA_PAT.sub("", note_html)
        # Not previously edited with zotcli or updated from the Zotero UI
        if not data or data['version'] < note_version:
            if data and data['version'] < note_version:
                self._logger.info("Note changed on server, reloading markup.")
            note_format = data['format'] if data else self.note_format
            data = {
                'format': note_format,
                'text': pypandoc.convert(
                    note_html, note_format, format='html'),
                'version': note_version}
        return data

    def _make_note_html(self, note_data):
        """ Converts the note's text to HTML and adds a dummy element that
            holds the original markup.

        :param note_data:   dict with text, format and version of the note
        :returns:           Note as HTML
        """
        extra_data = DATA_TMPL.format(
            data=encode_blob(note_data).decode('utf8'))
        html = pypandoc.convert(note_data['text'], 'html',
                                format=note_data['format'])
        return html + extra_data

    def create_note(self, item_id, note_text):
        """ Create a new note for a given item.

        If the upload fails, the error is logged and the note text is written
        to 'note_backup.txt' in the current directory instead of raising.

        :param item_id:     ID/key of the item to create the note for
        :param note_text:   Text of the note
        """
        note = self._zot.item_template('note')
        note_data = {
            'format': self.note_format,
            'text': note_text,
            'version': self._zot.last_modified_version(limit=1)+2}
        note['note'] = self._make_note_html(note_data)
        try:
            self._zot.create_items([note], item_id)
        except Exception as e:
            self._logger.error(e)
            with open("note_backup.txt", "w", encoding='utf-8') as fp:
                fp.write(note_data['text'])
            # Logger.warn is a deprecated alias of Logger.warning.
            self._logger.warning(
                "Could not upload note to Zotero. You can find the note "
                "markup in 'note_backup.txt' in the current directory")

    def save_note(self, note):
        """ Update an existing note.

        The note's version is incremented in place before it is converted
        back to HTML. If the upload fails, the error is logged and the note
        text is written to 'note_backup.txt' in the current directory.

        :param note:        The updated note
        """
        raw_data = note['data']['note']
        raw_data['version'] += 1
        note['data']['note'] = self._make_note_html(raw_data)
        try:
            self._zot.update_item(note)
        except Exception as e:
            self._logger.error(e)
            with open("note_backup.txt", "w", encoding='utf-8') as fp:
                fp.write(raw_data['text'])
            # Logger.warn is a deprecated alias of Logger.warning.
            self._logger.warning(
                "Could not upload note to Zotero. You can find the note "
                "markup in 'note_backup.txt' in the current directory")
class ZoteroWrap:
    """Wrapper around a Zotero library with a local pickle cache.

    References are kept in memory as a list and addressed by index; the
    list, the reference types and the reference templates are cached on disk.
    """

    CACHE_REFERENCE_LIST = "references"
    CACHE_REFERENCE_TYPES = "reference_types"
    CACHE_REFERENCE_TEMPLATES = "reference_templates"

    def __init__(self, library_id, library_type, api_key, directory):
        """Create the Zotero client and compute the cache file path.

        No data is loaded here; call ``initialize()`` for that.
        """
        # NOTE(review): the API key ends up in the cache file name, so it is
        # visible to anyone who can list ``directory``.
        cache_filename = "{}-{}-{}.pkl".format(library_id, library_type, api_key)
        self.cache_path = os.path.join(directory, cache_filename)
        # reference_types and reference_templates must have the same ordering.
        self.reference_types = []
        self.reference_templates = {}
        self._zotero_lib = Zotero(library_id, library_type, api_key)
        self._references = []

    # Data I/O methods section.

    def initialize(self):
        """Load the cached Zotero data, or retrieve them if there is none."""
        try:
            self.load_cache()
        except FileNotFoundError:
            self.load_distant()

    def load_cache(self):
        """Load the cached Zotero data."""
        with open(self.cache_path, "rb") as f:
            print("Loading cached Zotero data...")
            # NOTE(review): pickle.load can execute arbitrary code; only safe
            # as long as the cache file cannot be tampered with.
            cache = pickle.load(f)
            self._references = cache[self.CACHE_REFERENCE_LIST]
            self.reference_types = cache[self.CACHE_REFERENCE_TYPES]
            self.reference_templates = cache[self.CACHE_REFERENCE_TEMPLATES]
        print("Cached Zotero data loaded.")

    def load_distant(self):
        """Load the distant Zotero data."""
        print("Loading distant Zotero data...")
        self._references = self.get_references()
        self.reference_types = self.get_reference_types()
        self.reference_templates = self.get_reference_templates(
            self.reference_types)
        print("Distant Zotero data loaded.")
        self.cache()

    def cache(self):
        """Cache the Zotero data."""
        with open(self.cache_path, "wb") as f:
            cache = {
                self.CACHE_REFERENCE_LIST: self._references,
                self.CACHE_REFERENCE_TYPES: self.reference_types,
                self.CACHE_REFERENCE_TEMPLATES: self.reference_templates
            }
            pickle.dump(cache, f)

    def create_local_reference(self, ref):
        """Append the reference at the end of the reference list and cache it."""
        self._references.append(ref)
        self.cache()

    def create_distant_reference(self, ref_data):
        """Validate and create the reference in Zotero and return the created item.

        Raises CreateZoteroItemError if the creation status has no
        ``successful`` entry ``"0"``.
        """
        self.validate_reference_data(ref_data)
        creation_status = self._zotero_lib.create_items([ref_data])
        try:
            created_item = creation_status["successful"]["0"]
            return created_item
        except KeyError as e:
            print(creation_status)
            raise CreateZoteroItemError from e

    def update_local_reference(self, index, ref):
        """Replace the reference in the reference list and cache it."""
        self._references[index] = ref
        self.cache()

    def update_distant_reference(self, ref):
        """Validate and update the reference in Zotero.

        Existing fields not present will be left unmodified.
        """
        self.validate_reference_data(ref["data"])
        self._zotero_lib.update_item(ref)

    def validate_reference_data(self, ref_data):
        """Validate the reference data.

        Zotero.check_items() caches data after the first API call.

        Raises InvalidZoteroItemError if the client reports invalid fields.
        """
        try:
            self._zotero_lib.check_items([ref_data])
        except InvalidItemFields as e:
            raise InvalidZoteroItemError from e

    def get_references(self):
        """Return all references in the Zotero database. Takes time..."""
        return self._zotero_lib.everything(self._zotero_lib.top())

    def get_reference_types(self):
        """Return the reference types.

        Zotero.item_types() caches data after the first API call.
        """
        item_types = self._zotero_lib.item_types()
        return sorted([x["itemType"] for x in item_types])

    def get_reference_templates(self, ref_types):
        """Return the reference templates for the types as an ordered dictionary."""
        return OrderedDict([(x, self.get_reference_template(x)) for x in ref_types])

    def get_reference_template(self, ref_type):
        """Return the reference template for the type as an ordered dictionary.

        Zotero.item_template() caches data after the first API call.
        """
        template = self._zotero_lib.item_template(ref_type)
        return OrderedDict(sorted(template.items(), key=lambda x: x[0]))

    def get_reference(self, ref_key):
        """Return the reference for the key."""
        return self._zotero_lib.item(ref_key)

    # Public @properties surrogates section.

    def reference_count(self):
        """Return the number of references."""
        return len(self._references)

    def reference_data(self, index):
        """Return the 'data' field of the reference."""
        return self._references[index]["data"]

    def reference_extra_field(self, field, index):
        """Return the value of the field in 'extra', otherwise ''.

        'extra' is read as one "<field>: <value>" entry per line. A reference
        without an 'extra' field (or with an empty one) yields ''.
        """
        ref_data = self.reference_data(index)
        # Not every item carries 'extra'; treat a missing value as empty
        # instead of raising KeyError.
        extra_fields = (ref_data.get("extra") or "").split("\n")
        field_id = field + ":"
        matched = next((x for x in extra_fields if x.startswith(field_id)), None)
        if matched:
            return matched.replace(field_id, "", 1).strip()
        else:
            return ""

    def reference_type(self, index):
        """Return the reference type."""
        return self.reference_data(index)["itemType"]

    def reference_key(self, index):
        """Return the reference key."""
        return self._references[index]["key"]

    def reference_id(self, index):
        """Return the reference ID (locally defined).

        The first non-empty value wins: the DOI as is, the PMID prefixed with
        "PMID_", the UNPUBLISHED value prefixed with "UNPUBLISHED_"; '' if
        none is available.
        """
        # TODO Include ISBN and ISSN?
        doi = self.reference_doi(index)
        if doi:
            return doi
        pmid = self.reference_pmid(index)
        if pmid:
            return "PMID_" + pmid
        unpublished_id = self.reference_unpublished_id(index)
        if unpublished_id:
            return "UNPUBLISHED_" + unpublished_id
        return ""

    def reference_doi(self, index):
        """Return the reference DOI.

        The 'DOI' data field is used when the key is present (even if empty);
        otherwise the 'DOI' entry of the 'extra' field.
        """
        return self.reference_data(index).get(
            "DOI", self.reference_extra_field("DOI", index))

    def reference_pmid(self, index):
        """Return the reference PMID."""
        return self.reference_extra_field("PMID", index)

    def reference_unpublished_id(self, index):
        """Return the reference UNPUBLISHED ID."""
        return self.reference_extra_field("UNPUBLISHED", index)

    def reference_title(self, index):
        """Return the reference title."""
        return self.reference_data(index)["title"]

    def reference_creator_surnames(self, index):
        """Return as a list the surnames of the reference creators (locally defined).

        Only authors are returned when there is at least one; otherwise all
        creators. Returns [] if a selected creator has no 'lastName'.
        """
        # TODO Not true, ex: ISBN 978-1-4398-3778-8. Return all creator types?
        # Academic books published as a collection of chapters contributed by
        # different authors have editors but not authors at the level of the
        # book (as opposed to the level of a chapter).
        creators = self.reference_data(index)["creators"]
        creator_types = [x["creatorType"] for x in creators]
        # 'name' (not split) might be used instead of 'firstName' and 'lastName'.
        try:
            if "author" in creator_types:
                return [
                    x["lastName"] for x in creators
                    if x["creatorType"] == "author"
                ]
            else:
                return [x["lastName"] for x in creators]
        except KeyError:
            return []

    def reference_creator_surnames_str(self, index):
        """Return as a string the surnames of the reference creators (locally defined)."""
        # NB: str.join() returns an empty string for an empty list.
        return ", ".join(self.reference_creator_surnames(index))

    def reference_date(self, index):
        """Return the reference publication date."""
        return self.reference_data(index)["date"]

    def reference_year(self, index):
        """Return the reference publication year.

        Returns an int when the date can be parsed or contains four
        consecutive digits, otherwise ''.
        """
        # TODO Use meta:parsedDate field instead?
        ref_date = self.reference_date(index)
        try:
            # NB: datetime.year returns an int.
            return parse(ref_date).year
        except ValueError:
            matched = re.search(r"\d{4}", ref_date)
            if matched:
                return int(matched.group())
            else:
                return ""

    def reference_journal(self, index):
        """Return the reference journal name.

        For anything but a journal article, the reference type in
        parentheses is returned instead.
        """
        # TODO Change the column name 'Journal' to an other?
        ref_type = self.reference_type(index)
        if ref_type == "journalArticle":
            return self.reference_data(index)["publicationTitle"]
        else:
            return "({})".format(ref_type)

    # Public methods section.

    def reference_index(self, ref_id):
        """Return the index of the first reference with this ID.

        Raises ReferenceNotFoundError if no reference has this ID.
        """
        try:
            indexes = range(self.reference_count())
            return next(i for i in indexes if self.reference_id(i) == ref_id)
        except StopIteration as e:
            raise ReferenceNotFoundError("ID: " + ref_id) from e

    def reference_creators_citation(self, ref_id):
        """Return for citation the creator surnames (locally defined) and the publication year.

        Returns '' when the reference has no creator surnames.
        """
        # FIXME Delayed refactoring. Use an index instead of an ID.
        index = self.reference_index(ref_id)
        creators = self.reference_creator_surnames(index)
        creator_count = len(creators)
        if creator_count == 0:
            return ""
        year = self.reference_year(index)
        if creator_count == 1:
            return "{} ({})".format(creators[0], year)
        elif creator_count == 2:
            return "{} and {} ({})".format(creators[0], creators[1], year)
        else:
            return "{} et al. ({})".format(creators[0], year)
class ZoteroImporter(object):
    """Import Papers2 publications into a Zotero library in batches.

    ``add_pub`` converts one publication into a Zotero item and buffers it;
    the buffer is uploaded (or written to a dry-run file) whenever it is full
    and once more, unconditionally, when ``close`` is called.
    """

    def __init__(self, library_id, library_type, api_key, papers2,
                 keyword_types=('user', 'label'), label_map=None,
                 add_to_collections=(), upload_attachments="all",
                 batch_size=50, checkpoint=None, dryrun=None):
        """Create the Zotero client and resolve the target collections.

        :param library_id: passed to the Zotero client
        :param library_type: passed to the Zotero client
        :param api_key: passed to the Zotero client
        :param papers2: source database; must provide ``get_collections``,
            ``get_pub_type``, ``get_reviews`` and ``get_attachments``
        :param keyword_types: stored on the instance; not read in this class
            (presumably consumed by the extractors -- verify)
        :param label_map: stored on the instance; not read in this class
            (presumably consumed by the extractors -- verify). Defaults to a
            fresh empty dict per instance
        :param add_to_collections: names of the collections to register in
            ``self.collections``; ``None`` means every Papers2 collection,
            empty (the default) means none
        :param upload_attachments: ``"all"``, ``"unread"`` or ``"none"``
        :param batch_size: passed to ``Batch``
        :param checkpoint: optional object with ``contains``, ``add``,
            ``remove``, ``commit`` and ``rollback``
        :param dryrun: when not ``None``, passed to ``JSONWriter``; items are
            then written there instead of being uploaded
        """
        self.client = Zotero(library_id, library_type, api_key)
        self.papers2 = papers2
        self.keyword_types = keyword_types
        # A literal {} default would be shared by every instance.
        self.label_map = {} if label_map is None else label_map
        self.upload_attachments = upload_attachments
        self.checkpoint = checkpoint
        self.dryrun = JSONWriter(dryrun) if dryrun is not None else None
        self._batch = Batch(batch_size)
        self._load_collections(add_to_collections)

    # Load Zotero collections and create any
    # Papers2 collections that don't exist.
    # TODO: need to handle collection hierarchies
    def _load_collections(self, add_to_collections):
        """Fill ``self.collections`` with a name -> key mapping.

        In a dry run the keys are placeholders of the form ``<name>``.
        Otherwise missing collections are created in Zotero first and the
        keys are read back from the server.
        """
        self.collections = {}
        if add_to_collections is None:
            add_to_collections = [
                c.name for c in self.papers2.get_collections()]

        if len(add_to_collections) == 0:
            return

        if self.dryrun is not None:
            for name in add_to_collections:
                self.collections[name] = "<{0}>".format(name)
            return

        # fetch existing zotero collections
        existing_names = {
            zc['data']['name'] for zc in self.client.collections()}

        # add any papers2 collections that do not already exist
        payload = [dict(name=name) for name in add_to_collections
                   if name not in existing_names]
        if len(payload) > 0:
            self.client.create_collection(payload)

        # re-fetch zotero collections in order to get keys
        for zc in self.client.collections():
            data = zc['data']
            if data['name'] in add_to_collections:
                self.collections[data['name']] = data['key']

    def add_pub(self, pub):
        """Convert ``pub`` to a Zotero item and add it to the current batch.

        :returns: ``False`` when the checkpoint says the publication was
            already imported (nothing is added), ``True`` otherwise
        """
        # ignore publications we've already imported
        if (self.checkpoint is not None
                and self.checkpoint.contains(pub.ROWID)):
            log.debug("Skipping already imported publication {0}".format(
                pub.ROWID))
            return False

        # convert the Papers2 publication type to a Zotero item type
        item_type = ITEM_TYPES[self.papers2.get_pub_type(pub)]

        # get the template to fill in for an item of this type
        item = self.client.item_template(item_type)

        # fill in template fields; iterate over a snapshot because the
        # template is assigned to inside the loop (.items() works on both
        # Python 2 and Python 3, unlike .iteritems())
        for key, value in list(item.items()):
            if key in EXTRACTORS:
                value = EXTRACTORS[key].extract(pub, self, value)
                if value is not None:
                    item[key] = value

        # add notes, if any
        notes = []
        if pub.notes is not None and len(pub.notes) > 0:
            notes.append(pub.notes)

        for review in self.papers2.get_reviews(pub):
            notes.append("{0} Rating: {1}".format(
                review.content, review.rating))

        # get paths to attachments
        attachments = []
        if self.upload_attachments == "all" or (
                self.upload_attachments == "unread"
                and pub.times_read == 0):
            attachments = list(self.papers2.get_attachments(pub))

        # add to batch and checkpoint
        self._batch.add(item, notes, attachments)
        if self.checkpoint is not None:
            self.checkpoint.add(pub.ROWID)

        # commit the batch if it's full
        self._commit_batch()

        return True

    def close(self):
        """Flush any pending items and close the dry-run writer, if any."""
        if self._batch is not None:
            self._commit_batch(force=True)
            self._batch = None
        if self.dryrun is not None:
            self.dryrun.close()

    def _commit_batch(self, force=False):
        """Write out the batch when it is full, or non-empty and ``force``.

        The batch is always cleared afterwards. On any error the checkpoint
        (if present) is rolled back and the error is re-raised.
        """
        if not (self._batch.is_full
                or (force and not self._batch.is_empty)):
            return

        try:
            if self.dryrun is not None:
                for item, attachments in self._batch.iter():
                    self.dryrun.write(item, attachments)
            else:
                self._upload_batch()
        except:  # noqa: E722
            # Deliberately bare: the checkpoint must also be rolled back on
            # KeyboardInterrupt/SystemExit; the exception is re-raised.
            log.error("Error importing {0} items to Zotero".format(
                self._batch.size))
            if self.checkpoint is not None:
                self.checkpoint.rollback()
            raise
        finally:
            self._batch.clear()

    def _upload_batch(self):
        """Upload the batch's items, then their notes and attachments."""
        # upload metadata
        status = self.client.create_items(self._batch.items)

        for status_idx, status_msg in status['failed'].items():
            item_idx = int(status_idx)
            # remove failures from the checkpoint
            # NOTE(review): add() is given pub.ROWID but remove() is given
            # the index within the batch -- confirm the checkpoint expects
            # an index here.
            if self.checkpoint is not None:
                self.checkpoint.remove(item_idx)
            item = self._batch.items[item_idx]
            log.error("Upload failed for item {0}; code {1}; {2}".format(
                item['title'], status_msg['code'], status_msg['message']))

        # Both maps go from batch index (as a string) to the item's key.
        successes = {}
        successes.update(status['success'])
        successes.update(status['unchanged'])

        for status_idx, obj_key in successes.items():
            item_idx = int(status_idx)
            self._create_notes(item_idx, obj_key)
            # upload attachments and add items to collections
            if self.upload_attachments != "none":
                self._upload_attachments(item_idx, obj_key)

        # update checkpoint
        if self.checkpoint is not None:
            self.checkpoint.commit()

        log.info(
            "Batch committed: {0} items created and {1} items unchanged out of {2} attempted"
            .format(len(status['success']), len(status['unchanged']),
                    self._batch.size))

    def _create_notes(self, item_idx, parent_key):
        """Create the notes of batch item ``item_idx`` as children of ``parent_key``."""
        notes = self._batch.notes[item_idx]
        if len(notes) == 0:
            return

        note_batch = []
        for note_text in notes:
            note = self.client.item_template('note')
            note['parentItem'] = parent_key
            note['note'] = note_text
            note_batch.append(note)

        note_status = self.client.create_items(note_batch)

        for status_idx, status_msg in note_status['failed'].items():
            # just warn about these failures
            note = note_batch[int(status_idx)]
            log.error(
                "Failed to create note {0} for item item {1}; code {2}; {3}"
                .format(note['note'],
                        self._batch.items[item_idx]['title'],
                        status_msg['code'], status_msg['message']))

    def _upload_attachments(self, item_idx, parent_key):
        """Upload the attachment files of batch item ``item_idx``."""
        # TODO: modify pyzotero to pass MIME type for contentType key
        attachments = [
            path for path, _mime in self._batch.attachments[item_idx]]
        if len(attachments) == 0:
            return
        try:
            self.client.attachment_simple(attachments, parent_key)
        # This is to work around a bug in pyzotero where an exception is
        # thrown if an attachment already exists
        except KeyError:
            log.info("One or more attachment already exists: {0}".format(
                ",".join(attachments)))
class ZoteroWrap:
    """Zotero client wrapper that keeps a pickled local copy of the library.

    The references, the reference types and the reference templates are held
    in memory and written to a cache file; the ``reference_*`` accessors read
    from the in-memory reference list by index.
    """

    # Keys of the dictionary stored in the cache file.
    CACHE_REFERENCE_LIST = "references"
    CACHE_REFERENCE_TYPES = "reference_types"
    CACHE_REFERENCE_TEMPLATES = "reference_templates"

    def __init__(self, library_id, library_type, api_key, directory):
        """Create the Zotero client and compute the cache file path.

        No data is loaded here; call ``initialize()`` for that.
        """
        # NOTE(review): the API key becomes part of the cache file name and
        # is therefore visible to anyone who can list ``directory``.
        cache_filename = "{}-{}-{}.pkl".format(library_id, library_type, api_key)
        self.cache_path = os.path.join(directory, cache_filename)
        # reference_types and reference_templates must have the same ordering.
        self.reference_types = []
        self.reference_templates = {}
        self._zotero_lib = Zotero(library_id, library_type, api_key)
        self._references = []

    # Data I/O methods section.

    def initialize(self):
        """Load the cached Zotero data, or retrieve them if there is none."""
        try:
            self.load_cache()
        except FileNotFoundError:
            self.load_distant()

    def load_cache(self):
        """Load the cached Zotero data.

        Raises FileNotFoundError when the cache file does not exist.
        """
        with open(self.cache_path, "rb") as f:
            print("Loading cached Zotero data...")
            # NOTE(review): pickle.load() can execute arbitrary code; this is
            # only safe as long as the cache file is written by this class.
            cache = pickle.load(f)
            self._references = cache[self.CACHE_REFERENCE_LIST]
            self.reference_types = cache[self.CACHE_REFERENCE_TYPES]
            self.reference_templates = cache[self.CACHE_REFERENCE_TEMPLATES]
            print("Cached Zotero data loaded.")

    def load_distant(self):
        """Load the distant Zotero data, then cache them."""
        print("Loading distant Zotero data...")
        self._references = self.get_references()
        self.reference_types = self.get_reference_types()
        self.reference_templates = self.get_reference_templates(self.reference_types)
        print("Distant Zotero data loaded.")
        self.cache()

    def cache(self):
        """Cache the Zotero data."""
        cache = {
            self.CACHE_REFERENCE_LIST: self._references,
            self.CACHE_REFERENCE_TYPES: self.reference_types,
            self.CACHE_REFERENCE_TEMPLATES: self.reference_templates,
        }
        with open(self.cache_path, "wb") as f:
            pickle.dump(cache, f)

    def create_local_reference(self, ref):
        """Append the reference at the end of the reference list and cache it."""
        self._references.append(ref)
        self.cache()

    def create_distant_reference(self, ref_data):
        """Validate and create the reference in Zotero and return the created item.

        Raises CreateZoteroItemError when the creation status reports no
        successful item; the status is printed first.
        """
        self.validate_reference_data(ref_data)
        creation_status = self._zotero_lib.create_items([ref_data])
        try:
            # A single item was sent, so its status is stored under "0".
            return creation_status["successful"]["0"]
        except KeyError as e:
            print(creation_status)
            raise CreateZoteroItemError from e

    def update_local_reference(self, index, ref):
        """Replace the reference in the reference list and cache it."""
        self._references[index] = ref
        self.cache()

    def update_distant_reference(self, ref):
        """Validate and update the reference in Zotero.

        Existing fields not present will be left unmodified.
        """
        self.validate_reference_data(ref["data"])
        self._zotero_lib.update_item(ref)

    def validate_reference_data(self, ref_data):
        """Validate the reference data.

        Zotero.check_items() caches data after the first API call.
        Raises InvalidZoteroItemError when the data has invalid fields.
        """
        try:
            self._zotero_lib.check_items([ref_data])
        except InvalidItemFields as e:
            raise InvalidZoteroItemError from e

    def get_references(self):
        """Return all references in the Zotero database. Takes time..."""
        return self._zotero_lib.everything(self._zotero_lib.top())

    def get_reference_types(self):
        """Return the reference types, sorted.

        Zotero.item_types() caches data after the first API call.
        """
        item_types = self._zotero_lib.item_types()
        return sorted(x["itemType"] for x in item_types)

    def get_reference_templates(self, ref_types):
        """Return the reference templates for the types as an ordered dictionary."""
        return OrderedDict(
            (ref_type, self.get_reference_template(ref_type))
            for ref_type in ref_types
        )

    def get_reference_template(self, ref_type):
        """Return the reference template for the type as an ordered dictionary.

        The fields are ordered by name.
        Zotero.item_template() caches data after the first API call.
        """
        template = self._zotero_lib.item_template(ref_type)
        return OrderedDict(sorted(template.items(), key=lambda x: x[0]))

    def get_reference(self, ref_key):
        """Return the reference for the key."""
        return self._zotero_lib.item(ref_key)

    # Public @properties surrogates section.

    def reference_count(self):
        """Return the number of references."""
        return len(self._references)

    def reference_data(self, index):
        """Return the 'data' field of the reference."""
        return self._references[index]["data"]

    def reference_extra_field(self, field, index):
        """Return the value of the field in 'extra', otherwise ''.

        'extra' is read as one "<field>: <value>" entry per line; the first
        line starting with "<field>:" wins. A reference without an 'extra'
        field yields '' instead of raising, so that scanning the whole
        library (see reference_index()) cannot fail on such a reference.
        """
        extra = self.reference_data(index).get("extra") or ""
        field_id = field + ":"
        for line in extra.split("\n"):
            if line.startswith(field_id):
                return line[len(field_id):].strip()
        return ""

    def reference_type(self, index):
        """Return the reference type."""
        return self.reference_data(index)["itemType"]

    def reference_key(self, index):
        """Return the reference key."""
        return self._references[index]["key"]

    def reference_id(self, index):
        """Return the reference ID (locally defined).

        The first non-empty value of: the DOI, "PMID_" + PMID,
        "UNPUBLISHED_" + unpublished ID; otherwise ''.
        """
        # TODO Include ISBN and ISSN?
        doi = self.reference_doi(index)
        if doi:
            return doi
        pmid = self.reference_pmid(index)
        if pmid:
            return "PMID_" + pmid
        unpublished_id = self.reference_unpublished_id(index)
        if unpublished_id:
            return "UNPUBLISHED_" + unpublished_id
        return ""

    def reference_doi(self, index):
        """Return the reference DOI.

        The 'DOI' data field is used when the reference has one (even if it
        is empty); only otherwise is the 'DOI' entry of 'extra' looked up.
        """
        ref_data = self.reference_data(index)
        if "DOI" in ref_data:
            return ref_data["DOI"]
        return self.reference_extra_field("DOI", index)

    def reference_pmid(self, index):
        """Return the reference PMID."""
        return self.reference_extra_field("PMID", index)

    def reference_unpublished_id(self, index):
        """Return the reference UNPUBLISHED ID."""
        return self.reference_extra_field("UNPUBLISHED", index)

    def reference_title(self, index):
        """Return the reference title."""
        return self.reference_data(index)["title"]

    def reference_creator_surnames(self, index):
        """Return as a list the surnames of the reference creators (locally defined).

        Only the authors are returned when there is at least one author,
        otherwise all creators. An empty list is returned as soon as one of
        the selected creators has no 'lastName'.
        """
        # TODO Not true, ex: ISBN 978-1-4398-3778-8. Return all creator types?
        # Academic books published as a collection of chapters contributed by
        # different authors have editors but not authors at the level of the
        # book (as opposed to the level of a chapter).
        creators = self.reference_data(index)["creators"]
        creator_types = [x["creatorType"] for x in creators]
        if "author" in creator_types:
            selected = [x for x in creators if x["creatorType"] == "author"]
        else:
            selected = creators
        # 'name' (not split) might be used instead of 'firstName' and 'lastName'.
        try:
            return [x["lastName"] for x in selected]
        except KeyError:
            return []

    def reference_creator_surnames_str(self, index):
        """Return as a string the surnames of the reference creators (locally defined)."""
        # NB: str.join() returns an empty string for an empty list.
        return ", ".join(self.reference_creator_surnames(index))

    def reference_date(self, index):
        """Return the reference publication date."""
        return self.reference_data(index)["date"]

    def reference_year(self, index):
        """Return the reference publication year.

        An int when the date can be parsed or contains four consecutive
        digits, otherwise ''.
        """
        # TODO Use meta:parsedDate field instead?
        ref_date = self.reference_date(index)
        try:
            # NB: datetime.year returns an int.
            return parse(ref_date).year
        except (ValueError, OverflowError):
            # parse() raises OverflowError for out-of-range numbers; treat
            # it like any other unparsable date and look for a bare year.
            matched = re.search(r"\d{4}", ref_date)
            if matched:
                return int(matched.group())
            return ""

    def reference_journal(self, index):
        """Return the reference journal name.

        For anything but a journal article, the reference type between
        parentheses is returned instead.
        """
        # TODO Change the column name 'Journal' to an other?
        ref_type = self.reference_type(index)
        if ref_type == "journalArticle":
            return self.reference_data(index)["publicationTitle"]
        return "({})".format(ref_type)

    # Public methods section.

    def reference_index(self, ref_id):
        """Return the index of the first reference with this ID.

        Raises ReferenceNotFoundError when there is none.
        """
        try:
            indexes = range(self.reference_count())
            return next(i for i in indexes if self.reference_id(i) == ref_id)
        except StopIteration as e:
            raise ReferenceNotFoundError("ID: " + ref_id) from e

    def reference_creators_citation(self, ref_id):
        """Return for citation the creator surnames (locally defined) and the publication year.

        '' is returned when the reference has no creator surnames.
        """
        # FIXME Delayed refactoring. Use an index instead of an ID.
        index = self.reference_index(ref_id)
        creators = self.reference_creator_surnames(index)
        creator_count = len(creators)
        if creator_count == 0:
            return ""
        year = self.reference_year(index)
        if creator_count == 1:
            return "{} ({})".format(creators[0], year)
        if creator_count == 2:
            return "{} and {} ({})".format(creators[0], creators[1], year)
        return "{} et al. ({})".format(creators[0], year)