# Imports reconstructed from usage; this is an excerpt of a larger module.
import os, re, shutil, urllib, unicodedata, zipfile
from cStringIO import StringIO
import simplejson
from anki.utils import checksum, isWin, isMac
from anki.db import DB
from anki.consts import *
from anki.latex import mungeQA


class MediaManager(object):

    # other code depends on this order, so don't reorder
    regexps = ("(?i)(\[sound:([^]]+)\])",
               "(?i)(<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>)")

    def __init__(self, col):
        self.col = col
        # media directory
        self._dir = re.sub("(?i)\.(anki2)$", ".media", self.col.path)
        if not os.path.exists(self._dir):
            os.makedirs(self._dir)
        self._oldcwd = os.getcwd()
        os.chdir(self._dir)
        # change database
        self.connect()

    def connect(self):
        if self.col.server:
            return
        path = self.dir()+".db"
        create = not os.path.exists(path)
        self.db = DB(path)
        if create:
            self._initDB()

    def close(self):
        if self.col.server:
            return
        self.db.close()
        self.db = None
        # change cwd back to old location
        os.chdir(self._oldcwd)

    def dir(self):
        return self._dir

    # Adding media
    ##########################################################################

    def addFile(self, opath):
        """Copy PATH to MEDIADIR, and return new filename.
        If the same name exists, compare checksums."""
        mdir = self.dir()
        # remove any dangerous characters
        base = re.sub(r"[][<>:/\\&]", "", os.path.basename(opath))
        dst = os.path.join(mdir, base)
        # if it doesn't exist, copy it directly
        if not os.path.exists(dst):
            shutil.copy2(opath, dst)
            return base
        # if it's identical, reuse
        if self.filesIdentical(opath, dst):
            return base
        # otherwise, find a unique name
        (root, ext) = os.path.splitext(base)
        def repl(match):
            n = int(match.group(1))
            return " (%d)" % (n+1)
        while True:
            path = os.path.join(mdir, root + ext)
            if not os.path.exists(path):
                break
            reg = " \((\d+)\)$"
            if not re.search(reg, root):
                root = root + " (1)"
            else:
                root = re.sub(reg, repl, root)
        # copy and return
        shutil.copy2(opath, path)
        return os.path.basename(path)

    def filesIdentical(self, path1, path2):
        "True if files are the same."
        return (checksum(open(path1, "rb").read()) ==
                checksum(open(path2, "rb").read()))
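
    # A minimal usage sketch of the dedup/rename logic above (paths are
    # illustrative): re-adding identical content reuses the name, while a
    # name clash with different content gets an " (n)" suffix.
    #
    #   mm = col.media
    #   mm.addFile("/tmp/dog.jpg")     # -> "dog.jpg"
    #   mm.addFile("/tmp/dog.jpg")     # same bytes -> "dog.jpg" again
    #   mm.addFile("/home/b/dog.jpg")  # different bytes -> "dog (1).jpg"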

    # String manipulation
    ##########################################################################

    def filesInStr(self, mid, string, includeRemote=False):
        l = []
        # convert latex first
        model = self.col.models.get(mid)
        string = mungeQA(string, None, None, model, None, self.col)
        # extract filenames
        for reg in self.regexps:
            for (full, fname) in re.findall(reg, string):
                isLocal = not re.match("(https?|ftp)://", fname.lower())
                if isLocal or includeRemote:
                    l.append(fname)
        return l

    def strip(self, txt):
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

    def escapeImages(self, string):
        # Feeding webkit unicode can result in it not finding images, so on
        # linux/osx we percent escape the image paths as utf8. On Windows the
        # problem is more complicated - if we percent-escape as utf8 it fixes
        # some images but breaks others. When filenames are normalized by
        # dropbox they become unreadable if we escape them.
        if isWin:
            return string
        def repl(match):
            tag = match.group(1)
            fname = match.group(2)
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(
                fname, urllib.quote(fname.encode("utf-8")))
        return re.sub(self.regexps[1], repl, string)

    # Rebuilding DB
    ##########################################################################

    def check(self, local=None):
        "Return (missingFiles, unusedFiles)."
        mdir = self.dir()
        # generate card q/a and look through all references
        normrefs = {}
        def norm(s):
            if isinstance(s, unicode):
                return unicodedata.normalize('NFD', s)
            return s
        for f in self.allMedia():
            normrefs[norm(f)] = True
        # loop through directory and find unused & missing media
        unused = []
        if local is None:
            files = os.listdir(mdir)
        else:
            files = local
        for file in files:
            if not local:
                path = os.path.join(mdir, file)
                if not os.path.isfile(path):
                    # ignore directories
                    continue
            nfile = norm(file)
            if nfile not in normrefs:
                unused.append(file)
            else:
                del normrefs[nfile]
        nohave = normrefs.keys()
        return (nohave, unused)

    def allMedia(self):
        "Return a set of all referenced filenames."
        files = set()
        for mid, flds in self.col.db.execute("select mid, flds from notes"):
            for f in self.filesInStr(mid, flds):
                files.add(f)
        return files
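
    # Why check() normalizes names (illustrative): the same visible filename
    # can be encoded two ways, and HFS+ on macOS stores the decomposed form.
    #
    #   nfc = u"caf\xe9.jpg"                     # e-acute as one codepoint
    #   nfd = unicodedata.normalize("NFD", nfc)  # "e" + combining accent
    #   assert nfc != nfd and len(nfd) == len(nfc) + 1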

    # Copying on import
    ##########################################################################

    # FIXME: check if the files are actually identical, and rewrite references
    # if necessary
    def copyTo(self, rdir):
        "Copy media to RDIR. Return number of files copied."
        ldir = self.dir()
        if not os.path.exists(ldir):
            return 0
        cnt = 0
        for f in os.listdir(ldir):
            src = os.path.join(ldir, f)
            dst = os.path.join(rdir, f)
            if not os.path.exists(dst):
                shutil.copy2(src, dst)
                cnt += 1
        return cnt

    # Media syncing - changes and removal
    ##########################################################################

    def hasChanged(self):
        return self.db.scalar("select 1 from log limit 1")

    def removed(self):
        return self.db.list("select * from log where type = ?", MEDIA_REM)

    def syncRemove(self, fnames):
        # remove provided deletions
        for f in fnames:
            if os.path.exists(f):
                os.unlink(f)
            self.db.execute("delete from log where fname = ?", f)
            self.db.execute("delete from media where fname = ?", f)
        # and all locally-logged deletions, as server has acked them
        self.db.execute("delete from log where type = ?", MEDIA_REM)
        self.db.commit()

    # Media syncing - unbundling zip files from server
    ##########################################################################

    def syncAdd(self, zipData):
        "Extract zip data; true if finished."
        f = StringIO(zipData)
        z = zipfile.ZipFile(f, "r")
        finished = False
        meta = None
        media = []
        sizecnt = 0
        # get meta info first
        assert z.getinfo("_meta").file_size < 100000
        meta = simplejson.loads(z.read("_meta"))
        nextUsn = int(z.read("_usn"))
        # then loop through all files
        for i in z.infolist():
            # check for zip bombs
            sizecnt += i.file_size
            assert sizecnt < 100*1024*1024
            if i.filename == "_meta" or i.filename == "_usn":
                # ignore previously-retrieved meta
                continue
            elif i.filename == "_finished":
                # last zip in set
                finished = True
            else:
                data = z.read(i)
                csum = checksum(data)
                name = meta[i.filename]
                # can we store the file on this system?
                if self.illegal(name):
                    continue
                # save file
                open(name, "wb").write(data)
                # update db
                media.append((name, csum, self._mtime(name)))
                # remove entries from local log
                self.db.execute("delete from log where fname = ?", name)
        # update media db and note new starting usn
        if media:
            self.db.executemany(
                "insert or replace into media values (?,?,?)", media)
            self.setUsn(nextUsn) # commits
        # if we have finished adding, we need to record the new folder mtime
        # so that we don't trigger a needless scan
        if finished:
            self.syncMod()
        return finished

    def illegal(self, f):
        if isWin:
            for c in f:
                if c in "<>:\"/\\|?*^":
                    return True
        elif isMac:
            for c in f:
                if c in ":\\/":
                    return True

    # Media syncing - bundling zip files to send to server
    ##########################################################################

    # Because there's no standard filename encoding for zips, and because not
    # all zip clients support retrieving mtime, we store the files as ascii
    # and place a json file in the zip with the necessary information.

    def zipAdded(self):
        "Add files to a zip until over SYNC_ZIP_SIZE. Return zip data."
        f = StringIO()
        z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED)
        sz = 0
        cnt = 0
        files = {}
        cur = self.db.execute(
            "select fname from log where type = ?", MEDIA_ADD)
        fnames = []
        while 1:
            fname = cur.fetchone()
            if not fname:
                # add a flag so the server knows it can clean up
                z.writestr("_finished", "")
                break
            fname = fname[0]
            fnames.append([fname])
            z.write(fname, str(cnt))
            files[str(cnt)] = fname
            sz += os.path.getsize(fname)
            if sz > SYNC_ZIP_SIZE:
                break
            cnt += 1
        z.writestr("_meta", simplejson.dumps(files))
        z.close()
        return f.getvalue(), fnames

    def forgetAdded(self, fnames):
        if not fnames:
            return
        self.db.executemany("delete from log where fname = ?", fnames)
        self.db.commit()

    # Tracking changes (private)
    ##########################################################################

    def _initDB(self):
        self.db.executescript("""
create table media (fname text primary key, csum text, mod int);
create table meta (dirMod int, usn int); insert into meta values (0, 0);
create table log (fname text primary key, type int);
""")

    def _mtime(self, path):
        return int(os.stat(path).st_mtime)

    def _checksum(self, path):
        return checksum(open(path, "rb").read())

    def usn(self):
        return self.db.scalar("select usn from meta")

    def setUsn(self, usn):
        self.db.execute("update meta set usn = ?", usn)
        self.db.commit()

    def syncMod(self):
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        self.db.commit()

    def _changed(self):
        "Return dir mtime if it has changed since the last findChanges()"
        # doesn't track edits, but user can add or remove a file to update
        mod = self.db.scalar("select dirMod from meta")
        mtime = self._mtime(self.dir())
        if mod and mod == mtime:
            return False
        return mtime
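
    # The tables created in _initDB() above summarize this section's state:
    # `media` caches (fname, csum, mtime) for each known file, `log` queues
    # MEDIA_ADD/MEDIA_REM entries for the next sync, and `meta` holds the
    # folder's last-seen mtime (dirMod) plus the update sequence number (usn).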

    def findChanges(self):
        "Scan the media folder if it's changed, and note any changes."
        if self._changed():
            self._logChanges()

    def _logChanges(self):
        (added, removed) = self._changes()
        log = []
        media = []
        mediaRem = []
        for f in added:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt))
            log.append((f, MEDIA_ADD))
        for f in removed:
            mediaRem.append((f,))
            log.append((f, MEDIA_REM))
        # update media db
        self.db.executemany("insert or replace into media values (?,?,?)",
                            media)
        if mediaRem:
            self.db.executemany("delete from media where fname = ?",
                                mediaRem)
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        # and logs
        self.db.executemany("insert or replace into log values (?,?)", log)
        self.db.commit()

    def _changes(self):
        self.cache = {}
        for (name, csum, mod) in self.db.execute(
            "select * from media"):
            self.cache[name] = [csum, mod, False]
        added = []
        removed = []
        # loop through on-disk files
        for f in os.listdir(self.dir()):
            # ignore folders
            if os.path.isdir(f):
                continue
            # newly added?
            if f not in self.cache:
                added.append(f)
            else:
                # modified since last time?
                if self._mtime(f) != self.cache[f][1]:
                    # and has different checksum?
                    if self._checksum(f) != self.cache[f][0]:
                        added.append(f)
                # mark as used
                self.cache[f][2] = True
        # look for any entries in the cache that no longer exist on disk
        for (k, v) in self.cache.items():
            if not v[2]:
                removed.append(k)
        return added, removed

    def sanityCheck(self):
        assert not self.db.scalar("select count() from log")
        cnt = self.db.scalar("select count() from media")
        return cnt
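
# A sketch of the sync zip layout produced by zipAdded() above (names are
# illustrative): files are stored under ascii indices, and the "_meta" JSON
# member maps each index back to its real filename.
#
#   "0"         -> bytes of dog.jpg
#   "1"         -> bytes of hello.mp3
#   "_meta"     -> '{"0": "dog.jpg", "1": "hello.mp3"}'
#   "_finished" -> ""   (present only on the final zip in the set)
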
# Imports reconstructed from usage; this is an excerpt of a larger module.
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import copy
import datetime
import json
import os
import pprint
import random
import re
import stat
import time
import traceback

import anki
import anki.find
from anki import hooks
from anki.cards import Card
from anki.consts import *
from anki.db import DB
from anki.decks import DeckManager
from anki.errors import AnkiError
from anki.lang import _, ngettext
from anki.media import MediaManager
from anki.models import ModelManager, NoteType, Template
from anki.notes import Note
from anki.rsbackend import RustBackend
from anki.sched import Scheduler as V1Scheduler
from anki.schedv2 import Scheduler as V2Scheduler
from anki.tags import TagManager
from anki.utils import (devMode, fieldChecksum, ids2str, intTime, joinFields,
                        maxID, splitFields, stripHTMLMedia)


class _Collection:
    db: Optional[DB]
    sched: Union[V1Scheduler, V2Scheduler]
    crt: int
    mod: int
    scm: int
    dty: bool  # no longer used
    _usn: int
    ls: int
    conf: Dict[str, Any]
    _undo: List[Any]
    backend: RustBackend

    def __init__(
        self,
        db: DB,
        backend: RustBackend,
        server: Optional["anki.storage.ServerData"] = None,
        log: bool = False,
    ) -> None:
        self.backend = backend
        self._debugLog = log
        self.db = db
        self.path = db._path
        self._openLog()
        self.log(self.path, anki.version)
        self.server = server
        self._lastSave = time.time()
        self.clearUndo()
        self.media = MediaManager(self, server is not None)
        self.models = ModelManager(self)
        self.decks = DeckManager(self)
        self.tags = TagManager(self)
        self.load()
        if not self.crt:
            d = datetime.datetime.today()
            d -= datetime.timedelta(hours=4)
            d = datetime.datetime(d.year, d.month, d.day)
            d += datetime.timedelta(hours=4)
            self.crt = int(time.mktime(d.timetuple()))
        self._loadScheduler()
        if not self.conf.get("newBury", False):
            self.conf["newBury"] = True
            self.setMod()

    def name(self) -> Any:
        n = os.path.splitext(os.path.basename(self.path))[0]
        return n

    # Scheduler
    ##########################################################################

    supportedSchedulerVersions = (1, 2)

    def schedVer(self) -> Any:
        ver = self.conf.get("schedVer", 1)
        if ver in self.supportedSchedulerVersions:
            return ver
        else:
            raise Exception("Unsupported scheduler version")

    def _loadScheduler(self) -> None:
        ver = self.schedVer()
        if ver == 1:
            self.sched = V1Scheduler(self)
        elif ver == 2:
            self.sched = V2Scheduler(self)

            if not self.server:
                self.conf["localOffset"] = self.sched._current_timezone_offset()
            elif self.server.minutes_west is not None:
                self.conf["localOffset"] = self.server.minutes_west

    def changeSchedulerVer(self, ver: int) -> None:
        if ver == self.schedVer():
            return
        if ver not in self.supportedSchedulerVersions:
            raise Exception("Unsupported scheduler version")
        self.modSchema(check=True)
        self.clearUndo()
        v2Sched = V2Scheduler(self)
        if ver == 1:
            v2Sched.moveToV1()
        else:
            v2Sched.moveToV2()
        self.conf["schedVer"] = ver
        self.setMod()
        self._loadScheduler()

    def localOffset(self) -> Optional[int]:
        "Minutes west of UTC. Only applies to V2 scheduler."
        if isinstance(self.sched, V1Scheduler):
            return None
        else:
            return self.sched._current_timezone_offset()
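
    # Sketch of the crt rounding in __init__ above (illustrative date): the
    # creation timestamp is pinned to 4am local time, so the collection's
    # "day" rolls over at 4am rather than midnight.
    #
    #   import datetime
    #   d = datetime.datetime(2020, 5, 17, 1, 30)  # 1:30am, still "the 16th"
    #   d -= datetime.timedelta(hours=4)
    #   d = datetime.datetime(d.year, d.month, d.day)
    #   d += datetime.timedelta(hours=4)
    #   assert d == datetime.datetime(2020, 5, 16, 4, 0)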

    # DB-related
    ##########################################################################

    def load(self) -> None:
        (
            self.crt,
            self.mod,
            self.scm,
            self.dty,  # no longer used
            self._usn,
            self.ls,
            conf,
            models,
            decks,
            dconf,
            tags,
        ) = self.db.first(
            """
select crt, mod, scm, dty, usn, ls,
conf, models, decks, dconf, tags from col"""
        )
        self.conf = json.loads(conf)  # type: ignore
        self.models.load(models)
        self.decks.load(decks, dconf)
        self.tags.load(tags)

    def setMod(self) -> None:
        """Mark DB modified.

        DB operations and the deck/tag/model managers do this automatically,
        so this is only necessary if you modify properties of this object or
        the conf dict."""
        self.db.mod = True

    def flush(self, mod: Optional[int] = None) -> None:
        "Flush state to DB, updating mod time."
        self.mod = intTime(1000) if mod is None else mod
        self.db.execute(
            """update col set
crt=?, mod=?, scm=?, dty=?, usn=?, ls=?, conf=?""",
            self.crt,
            self.mod,
            self.scm,
            self.dty,
            self._usn,
            self.ls,
            json.dumps(self.conf),
        )

    def save(self, name: Optional[str] = None, mod: Optional[int] = None) -> None:
        "Flush, commit DB, and take out another write lock."
        # let the managers conditionally flush
        self.models.flush()
        self.decks.flush()
        self.tags.flush()
        # and flush deck + bump mod if db has been changed
        if self.db.mod:
            self.flush(mod=mod)
            self.db.commit()
            self.lock()
            self.db.mod = False
        self._markOp(name)
        self._lastSave = time.time()

    def autosave(self) -> Optional[bool]:
        "Save if 5 minutes has passed since last save. True if saved."
        if time.time() - self._lastSave > 300:
            self.save()
            return True
        return None

    def lock(self) -> None:
        # make sure we don't accidentally bump mod time
        mod = self.db.mod
        self.db.execute("update col set mod=mod")
        self.db.mod = mod

    def close(self, save: bool = True) -> None:
        "Disconnect from DB."
        if self.db:
            if save:
                self.save()
            else:
                self.db.rollback()
            if not self.server:
                self.db.setAutocommit(True)
                self.db.execute("pragma journal_mode = delete")
                self.db.setAutocommit(False)
            self.db.close()
            self.db = None
            self.media.close()
            self._closeLog()

    def reopen(self) -> None:
        "Reconnect to DB (after changing threads, etc)."
        if not self.db:
            self.db = DB(self.path)
            self.media.connect()
            self._openLog()

    def rollback(self) -> None:
        self.db.rollback()
        self.load()
        self.lock()

    def modSchema(self, check: bool) -> None:
        "Mark schema modified. Call this first so user can abort if necessary."
        if not self.schemaChanged():
            if check and not hooks.schema_will_change(proceed=True):
                raise AnkiError("abortSchemaMod")
        self.scm = intTime(1000)
        self.setMod()

    def schemaChanged(self) -> Any:
        "True if schema changed since last sync."
        return self.scm > self.ls

    def usn(self) -> Any:
        return self._usn if self.server else -1

    def beforeUpload(self) -> None:
        "Called before a full upload."
        tbls = "notes", "cards", "revlog"
        for t in tbls:
            self.db.execute("update %s set usn=0 where usn=-1" % t)
        # we can save space by removing the log of deletions
        self.db.execute("delete from graves")
        self._usn += 1
        self.models.beforeUpload()
        self.tags.beforeUpload()
        self.decks.beforeUpload()
        self.modSchema(check=False)
        self.ls = self.scm
        # ensure db is compacted before upload
        self.db.setAutocommit(True)
        self.db.execute("vacuum")
        self.db.execute("analyze")
        self.close()

    # Object creation helpers
    ##########################################################################

    def getCard(self, id: int) -> Card:
        return Card(self, id)

    def getNote(self, id: int) -> Note:
        return Note(self, id=id)

    # Utils
    ##########################################################################

    def nextID(self, type: str, inc: bool = True) -> Any:
        type = "next" + type.capitalize()
        id = self.conf.get(type, 1)
        if inc:
            self.conf[type] = id + 1
        return id

    def reset(self) -> None:
        "Rebuild the queue and reload data after DB modified."
        self.sched.reset()

    # Deletion logging
    ##########################################################################

    def _logRem(self, ids: List[int], type: int) -> None:
        self.db.executemany(
            "insert into graves values (%d, ?, %d)" % (self.usn(), type),
            ([x] for x in ids),
        )
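
    # Sketch (illustrative values): deleting note 123 at usn 42 makes
    # _logRem() store the row (42, 123, REM_NOTE) in graves, which is how
    # deletions are replayed to the sync server before being cleared again in
    # beforeUpload().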

    # Notes
    ##########################################################################

    def noteCount(self) -> Any:
        return self.db.scalar("select count() from notes")

    def newNote(self, forDeck: bool = True) -> Note:
        "Return a new note with the current model."
        return Note(self, self.models.current(forDeck))

    def addNote(self, note: Note) -> int:
        """Add a note to the collection. Return number of new cards."""
        # check we have card models available, then save
        cms = self.findTemplates(note)
        if not cms:
            return 0
        note.flush()
        # deck conf governs which of these are used
        due = self.nextID("pos")
        # add cards
        ncards = 0
        for template in cms:
            self._newCard(note, template, due)
            ncards += 1
        return ncards

    def remNotes(self, ids: Iterable[int]) -> None:
        """Deletes notes with the given IDs."""
        self.remCards(self.db.list("select id from cards where nid in " + ids2str(ids)))

    def _remNotes(self, ids: List[int]) -> None:
        """Bulk delete notes by ID. Don't call this directly."""
        if not ids:
            return
        strids = ids2str(ids)
        # we need to log these independently of cards, as one side may have
        # more card templates
        hooks.notes_will_be_deleted(self, ids)
        self._logRem(ids, REM_NOTE)
        self.db.execute("delete from notes where id in %s" % strids)

    # Card creation
    ##########################################################################

    def findTemplates(self, note: Note) -> List:
        "Return (active), non-empty templates."
        model = note.model()
        avail = self.models.availOrds(model, joinFields(note.fields))
        return self._tmplsFromOrds(model, avail)

    def _tmplsFromOrds(self, model: NoteType, avail: List[int]) -> List:
        ok = []
        if model["type"] == MODEL_STD:
            for t in model["tmpls"]:
                if t["ord"] in avail:
                    ok.append(t)
        else:
            # cloze - generate temporary templates from first
            for ord in avail:
                t = copy.copy(model["tmpls"][0])
                t["ord"] = ord
                ok.append(t)
        return ok

    def genCards(self, nids: List[int]) -> List[int]:
        "Generate cards for non-empty templates, return ids to remove."
        # build map of (nid,ord) so we don't create dupes
        snids = ids2str(nids)
        have: Dict[int, Dict[int, int]] = {}
        dids: Dict[int, Optional[int]] = {}
        dues: Dict[int, int] = {}
        for id, nid, ord, did, due, odue, odid, type in self.db.execute(
            "select id, nid, ord, did, due, odue, odid, type from cards where nid in "
            + snids
        ):
            # existing cards
            if nid not in have:
                have[nid] = {}
            have[nid][ord] = id
            # if in a filtered deck, add new cards to original deck
            if odid != 0:
                did = odid
            # and their dids
            if nid in dids:
                if dids[nid] and dids[nid] != did:
                    # cards are in two or more different decks; revert to
                    # model default
                    dids[nid] = None
            else:
                # first card or multiple cards in same deck
                dids[nid] = did
            # save due
            if odid != 0:
                due = odue
            if nid not in dues and type == 0:
                # Add due to new card only if it's the due of a new sibling
                dues[nid] = due
        # build cards for each note
        data = []
        ts = maxID(self.db)
        now = intTime()
        rem = []
        usn = self.usn()
        for nid, mid, flds in self.db.execute(
            "select id, mid, flds from notes where id in " + snids
        ):
            model = self.models.get(mid)
            assert model
            avail = self.models.availOrds(model, flds)
            did = dids.get(nid) or model["did"]
            due = dues.get(nid)
            # add any missing cards
            for t in self._tmplsFromOrds(model, avail):
                doHave = nid in have and t["ord"] in have[nid]
                if not doHave:
                    # check deck is not a cram deck
                    did = t["did"] or did
                    if self.decks.isDyn(did):
                        did = 1
                    # if the deck doesn't exist, use default instead
                    did = self.decks.get(did)["id"]
                    # use sibling due# if there is one, else use a new id
                    if due is None:
                        due = self.nextID("pos")
                    data.append((ts, nid, did, t["ord"], now, usn, due))
                    ts += 1
            # note any cards that need removing
            if nid in have:
                for ord, id in list(have[nid].items()):
                    if ord not in avail:
                        rem.append(id)
        # bulk update
        self.db.executemany(
            """
insert into cards values (?,?,?,?,?,?,0,0,?,0,0,0,0,0,0,0,0,"")""",
            data,
        )
        return rem
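
    # Illustrative shape of the dedup map built in genCards() above: a note
    # with existing cards for templates 0 and 2 yields
    # have = {nid: {0: cid0, 2: cid2}}, so only the missing ordinal 1 gets a
    # new card, while ordinals no longer in avail are queued for removal.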

    # type is no longer used
    def previewCards(
        self, note: Note, type: int = 0, did: Optional[int] = None
    ) -> List:
        existing_cards = {}
        for card in note.cards():
            existing_cards[card.ord] = card
        all_cards = []
        for idx, template in enumerate(note.model()["tmpls"]):
            if idx in existing_cards:
                all_cards.append(existing_cards[idx])
            else:
                # card not currently in database, generate an ephemeral one
                all_cards.append(self._newCard(note, template, 1, flush=False, did=did))
        return all_cards

    def _newCard(
        self,
        note: Note,
        template: Template,
        due: int,
        flush: bool = True,
        did: Optional[int] = None,
    ) -> Card:
        "Create a new card."
        card = Card(self)
        card.nid = note.id
        card.ord = template["ord"]  # type: ignore
        card.did = self.db.scalar(
            "select did from cards where nid = ? and ord = ?", card.nid, card.ord
        )
        # Use template did (deck override) if valid, otherwise did in
        # argument, otherwise model did
        if not card.did:
            if template["did"] and str(template["did"]) in self.decks.decks:
                card.did = int(template["did"])
            elif did:
                card.did = did
            else:
                card.did = note.model()["did"]
        # if invalid did, use default instead
        deck = self.decks.get(card.did)
        assert deck
        if deck["dyn"]:
            # must not be a filtered deck
            card.did = 1
        else:
            card.did = deck["id"]
        card.due = self._dueForDid(card.did, due)
        if flush:
            card.flush()
        return card

    def _dueForDid(self, did: int, due: int) -> int:
        conf = self.decks.confForDid(did)
        # in order due?
        if conf["new"]["order"] == NEW_CARDS_DUE:
            return due
        else:
            # random mode; seed with note ts so all cards of this note get
            # the same random number
            r = random.Random()
            r.seed(due)
            return r.randrange(1, max(due, 1000))
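
    # A minimal sketch of the random mode in _dueForDid() above: seeding with
    # the shared due position makes every sibling draw the same value.
    #
    #   import random
    #   r = random.Random()
    #   r.seed(42)
    #   first = r.randrange(1, 1000)
    #   r.seed(42)
    #   assert r.randrange(1, 1000) == first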

    # Cards
    ##########################################################################

    def isEmpty(self) -> bool:
        return not self.db.scalar("select 1 from cards limit 1")

    def cardCount(self) -> Any:
        return self.db.scalar("select count() from cards")

    def remCards(self, ids: List[int], notes: bool = True) -> None:
        "Bulk delete cards by ID."
        if not ids:
            return
        sids = ids2str(ids)
        nids = self.db.list("select nid from cards where id in " + sids)
        # remove cards
        self._logRem(ids, REM_CARD)
        self.db.execute("delete from cards where id in " + sids)
        # then notes
        if not notes:
            return
        nids = self.db.list(
            """
select id from notes where id in %s and id not in (select nid from cards)"""
            % ids2str(nids)
        )
        self._remNotes(nids)

    def emptyCids(self) -> List[int]:
        """Returns IDs of empty cards."""
        rem: List[int] = []
        for m in self.models.all():
            rem += self.genCards(self.models.nids(m))
        return rem

    def emptyCardReport(self, cids) -> str:
        rep = ""
        for ords, cnt, flds in self.db.all(
            """
select group_concat(ord+1), count(), flds
from cards c, notes n
where c.nid = n.id and c.id in %s group by nid"""
            % ids2str(cids)
        ):
            rep += _("Empty card numbers: %(c)s\nFields: %(f)s\n\n") % dict(
                c=ords, f=flds.replace("\x1f", " / ")
            )
        return rep

    # Field checksums and sorting fields
    ##########################################################################

    def _fieldData(self, snids: str) -> Any:
        return self.db.execute("select id, mid, flds from notes where id in " + snids)

    def updateFieldCache(self, nids: List[int]) -> None:
        "Update field checksums and sort cache, after find&replace, etc."
        snids = ids2str(nids)
        r = []
        for (nid, mid, flds) in self._fieldData(snids):
            fields = splitFields(flds)
            model = self.models.get(mid)
            if not model:
                # note points to invalid model
                continue
            r.append(
                (
                    stripHTMLMedia(fields[self.models.sortIdx(model)]),
                    fieldChecksum(fields[0]),
                    nid,
                )
            )
        # apply, relying on calling code to bump usn+mod
        self.db.executemany("update notes set sfld=?, csum=? where id=?", r)

    # Finding cards
    ##########################################################################

    def findCards(self, query: str, order: Union[bool, str] = False) -> Any:
        return anki.find.Finder(self).findCards(query, order)

    def findNotes(self, query: str) -> Any:
        return anki.find.Finder(self).findNotes(query)

    def findReplace(
        self,
        nids: List[int],
        src: str,
        dst: str,
        regex: Optional[bool] = None,
        field: Optional[str] = None,
        fold: bool = True,
    ) -> int:
        return anki.find.findReplace(self, nids, src, dst, regex, field, fold)

    def findDupes(self, fieldName: str, search: str = "") -> List[Tuple[Any, list]]:
        return anki.find.findDupes(self, fieldName, search)

    # Stats
    ##########################################################################

    def cardStats(self, card: Card) -> str:
        from anki.stats import CardStats

        return CardStats(self, card).report()

    def stats(self) -> "anki.stats.CollectionStats":
        from anki.stats import CollectionStats

        return CollectionStats(self)

    # Timeboxing
    ##########################################################################

    def startTimebox(self) -> None:
        self._startTime = time.time()
        self._startReps = self.sched.reps

    def timeboxReached(self) -> Union[bool, Tuple[Any, int]]:
        "Return (elapsedTime, reps) if timebox reached, or False."
        if not self.conf["timeLim"]:
            # timeboxing disabled
            return False
        elapsed = time.time() - self._startTime
        if elapsed > self.conf["timeLim"]:
            return (self.conf["timeLim"], self.sched.reps - self._startReps)
        return False

    # Undo
    ##########################################################################

    def clearUndo(self) -> None:
        # [type, undoName, data]
        # type 1 = review; type 2 = checkpoint
        self._undo = None

    def undoName(self) -> Any:
        "Undo menu item name, or None if undo unavailable."
        if not self._undo:
            return None
        return self._undo[1]

    def undo(self) -> Any:
        if self._undo[0] == 1:
            return self._undoReview()
        else:
            self._undoOp()

    def markReview(self, card: Card) -> None:
        old: List[Any] = []
        if self._undo:
            if self._undo[0] == 1:
                old = self._undo[2]
            self.clearUndo()
        wasLeech = card.note().hasTag("leech") or False
        self._undo = [1, _("Review"), old + [copy.copy(card)], wasLeech]

    def _undoReview(self) -> Any:
        data = self._undo[2]
        wasLeech = self._undo[3]
        c = data.pop()  # pytype: disable=attribute-error
        if not data:
            self.clearUndo()
        # remove leech tag if it didn't have it before
        if not wasLeech and c.note().hasTag("leech"):
            c.note().delTag("leech")
            c.note().flush()
        # write old data
        c.flush()
        # and delete revlog entry
        last = self.db.scalar(
            "select id from revlog where cid = ? "
            "order by id desc limit 1", c.id
        )
        self.db.execute("delete from revlog where id = ?", last)
        # restore any siblings
        self.db.execute(
            "update cards set queue=type,mod=?,usn=? where queue=-2 and nid=?",
            intTime(),
            self.usn(),
            c.nid,
        )
        # and finally, update daily counts
        n = 1 if c.queue == 3 else c.queue
        type = ("new", "lrn", "rev")[n]
        self.sched._updateStats(c, type, -1)
        self.sched.reps -= 1
        return c.id

    def _markOp(self, name: Optional[str]) -> None:
        "Call via .save()"
        if name:
            self._undo = [2, name]
        else:
            # saving disables old checkpoint, but not review undo
            if self._undo and self._undo[0] == 2:
                self.clearUndo()

    def _undoOp(self) -> None:
        self.rollback()
        self.clearUndo()
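
    # Shape of self._undo, summarizing the methods above: a review step is
    # stored as [1, _("Review"), [old card copies...], wasLeech]; a checkpoint
    # recorded via _markOp() is [2, name]; clearUndo() resets it to None.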

    # DB maintenance
    ##########################################################################

    def basicCheck(self) -> bool:
        "Basic integrity check for syncing. True if ok."
        # cards without notes
        if self.db.scalar(
            """
select 1 from cards where nid not in (select id from notes) limit 1"""
        ):
            return False
        # notes without cards or models
        if self.db.scalar(
            """
select 1 from notes where id not in (select distinct nid from cards)
or mid not in %s limit 1"""
            % ids2str(self.models.ids())
        ):
            return False
        # invalid ords
        for m in self.models.all():
            # ignore clozes
            if m["type"] != MODEL_STD:
                continue
            if self.db.scalar(
                """
select 1 from cards where ord not in %s and nid in (
select id from notes where mid = ?) limit 1"""
                % ids2str([t["ord"] for t in m["tmpls"]]),
                m["id"],
            ):
                return False
        return True

    def fixIntegrity(self) -> Tuple[str, bool]:
        """Fix possible problems and rebuild caches.

        Returns tuple of (error: str, ok: bool). 'ok' will be true if no
        problems were found.
        """
        problems = []
        curs = self.db.cursor()
        self.save()
        oldSize = os.stat(self.path)[stat.ST_SIZE]
        if self.db.scalar("pragma integrity_check") != "ok":
            return (_("Collection is corrupt. Please see the manual."), False)
        # note types with a missing model
        ids = self.db.list(
            """
select id from notes where mid not in """
            + ids2str(self.models.ids())
        )
        if ids:
            problems.append(
                ngettext(
                    "Deleted %d note with missing note type.",
                    "Deleted %d notes with missing note type.",
                    len(ids),
                )
                % len(ids)
            )
            self.remNotes(ids)
        # for each model
        for m in self.models.all():
            for t in m["tmpls"]:
                if t["did"] == "None":
                    t["did"] = None
                    problems.append(_("Fixed AnkiDroid deck override bug."))
                    self.models.save(m, updateReqs=False)
            if m["type"] == MODEL_STD:
                # model with missing req specification
                if "req" not in m:
                    self.models._updateRequired(m)
                    problems.append(_("Fixed note type: %s") % m["name"])
                # cards with invalid ordinal
                ids = self.db.list(
                    """
select id from cards where ord not in %s and nid in (
select id from notes where mid = ?)"""
                    % ids2str([t["ord"] for t in m["tmpls"]]),
                    m["id"],
                )
                if ids:
                    problems.append(
                        ngettext(
                            "Deleted %d card with missing template.",
                            "Deleted %d cards with missing template.",
                            len(ids),
                        )
                        % len(ids)
                    )
                    self.remCards(ids)
            # notes with invalid field count
            ids = []
            for id, flds in self.db.execute(
                "select id, flds from notes where mid = ?", m["id"]
            ):
                if (flds.count("\x1f") + 1) != len(m["flds"]):
                    ids.append(id)
            if ids:
                problems.append(
                    ngettext(
                        "Deleted %d note with wrong field count.",
                        "Deleted %d notes with wrong field count.",
                        len(ids),
                    )
                    % len(ids)
                )
                self.remNotes(ids)
        # delete any notes with missing cards
        ids = self.db.list(
            """
select id from notes where id not in (select distinct nid from cards)"""
        )
        if ids:
            cnt = len(ids)
            problems.append(
                ngettext(
                    "Deleted %d note with no cards.",
                    "Deleted %d notes with no cards.",
                    cnt,
                )
                % cnt
            )
            self._remNotes(ids)
        # cards with missing notes
        ids = self.db.list(
            """
select id from cards where nid not in (select id from notes)"""
        )
        if ids:
            cnt = len(ids)
            problems.append(
                ngettext(
                    "Deleted %d card with missing note.",
                    "Deleted %d cards with missing note.",
                    cnt,
                )
                % cnt
            )
            self.remCards(ids)
        # cards with odue set when it shouldn't be
        ids = self.db.list(
            """
select id from cards where odue > 0 and (type=1 or queue=2) and not odid"""
        )
        if ids:
            cnt = len(ids)
            problems.append(
                ngettext(
                    "Fixed %d card with invalid properties.",
                    "Fixed %d cards with invalid properties.",
                    cnt,
                )
                % cnt
            )
            self.db.execute("update cards set odue=0 where id in " + ids2str(ids))
        # cards with odid set when not in a dyn deck
        dids = [id for id in self.decks.allIds() if not self.decks.isDyn(id)]
        ids = self.db.list(
            """
select id from cards where odid > 0 and did in %s"""
            % ids2str(dids)
        )
        if ids:
            cnt = len(ids)
            problems.append(
                ngettext(
                    "Fixed %d card with invalid properties.",
                    "Fixed %d cards with invalid properties.",
                    cnt,
                )
                % cnt
            )
            self.db.execute(
                "update cards set odid=0, odue=0 where id in " + ids2str(ids)
            )
        # tags
        self.tags.registerNotes()
        # field cache
        for m in self.models.all():
            self.updateFieldCache(self.models.nids(m))
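        # Illustrative arithmetic for the due-position wrap below: a new card
        # at due 2,345,678 becomes 1000000 + (2345678 % 1000000) = 1,345,678,
        # keeping it comfortably below the 32-bit limit.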
        # new cards can't have a due position > 32 bits, so wrap items over
        # 2 million back to 1 million
        curs.execute(
            """
update cards set due=1000000+due%1000000,mod=?,usn=? where due>=1000000
and type=0""",
            [intTime(), self.usn()],
        )
        if curs.rowcount:
            problems.append(
                "Found %d new cards with a due number >= 1,000,000 - consider repositioning them in the Browse screen."
                % curs.rowcount
            )
        # new card position
        self.conf["nextPos"] = (
            self.db.scalar("select max(due)+1 from cards where type = 0") or 0
        )
        # reviews should have a reasonable due #
        ids = self.db.list("select id from cards where queue = 2 and due > 100000")
        if ids:
            problems.append("Reviews had incorrect due date.")
            self.db.execute(
                "update cards set due = ?, ivl = 1, mod = ?, usn = ? where id in %s"
                % ids2str(ids),
                self.sched.today,
                intTime(),
                self.usn(),
            )
        # v2 sched had a bug that could create decimal intervals
        curs.execute(
            "update cards set ivl=round(ivl),due=round(due) where ivl!=round(ivl) or due!=round(due)"
        )
        if curs.rowcount:
            problems.append("Fixed %d cards with v2 scheduler bug." % curs.rowcount)
        curs.execute(
            "update revlog set ivl=round(ivl),lastIvl=round(lastIvl) where ivl!=round(ivl) or lastIvl!=round(lastIvl)"
        )
        if curs.rowcount:
            problems.append(
                "Fixed %d review history entries with v2 scheduler bug."
                % curs.rowcount
            )
        # models
        if self.models.ensureNotEmpty():
            problems.append("Added missing note type.")
        # and finally, optimize
        self.optimize()
        newSize = os.stat(self.path)[stat.ST_SIZE]
        txt = _("Database rebuilt and optimized.")
        ok = not problems
        problems.append(txt)
        # if any problems were found, force a full sync
        if not ok:
            self.modSchema(check=False)
        self.save()
        return ("\n".join(problems), ok)

    def optimize(self) -> None:
        self.db.setAutocommit(True)
        self.db.execute("vacuum")
        self.db.execute("analyze")
        self.db.setAutocommit(False)
        self.lock()

    # Logging
    ##########################################################################

    def log(self, *args, **kwargs) -> None:
        if not self._debugLog:
            return

        def customRepr(x):
            if isinstance(x, str):
                return x
            return pprint.pformat(x)

        path, num, fn, y = traceback.extract_stack(limit=2 + kwargs.get("stack", 0))[0]
        buf = "[%s] %s:%s(): %s" % (
            intTime(),
            os.path.basename(path),
            fn,
            ", ".join([customRepr(x) for x in args]),
        )
        self._logHnd.write(buf + "\n")
        if devMode:
            print(buf)

    def _openLog(self) -> None:
        if not self._debugLog:
            return
        lpath = re.sub(r"\.anki2$", ".log", self.path)
        if os.path.exists(lpath) and os.path.getsize(lpath) > 10 * 1024 * 1024:
            lpath2 = lpath + ".old"
            if os.path.exists(lpath2):
                os.unlink(lpath2)
            os.rename(lpath, lpath2)
        self._logHnd = open(lpath, "a", encoding="utf8")

    def _closeLog(self) -> None:
        if not self._debugLog:
            return
        self._logHnd.close()
        self._logHnd = None

    # Card Flags
    ##########################################################################

    def setUserFlag(self, flag: int, cids: List[int]) -> None:
        assert 0 <= flag <= 7
        self.db.execute(
            "update cards set flags = (flags & ~?) | ?, usn=?, mod=? where id in %s"
            % ids2str(cids),
            0b111,
            flag,
            self.usn(),
            intTime(),
        )
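
# A minimal sketch (illustrative values) of the bitmask in setUserFlag()
# above: the low three bits of cards.flags hold the user flag, so the update
# clears them before OR-ing in the new value.
#
#   flags = 0b101100                          # other bits set, old flag = 4
#   assert (flags & ~0b111) | 2 == 0b101010   # flag now 2, other bits kept
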
# Imports reconstructed from usage; this is an excerpt of a larger module.
import os, re, sys, shutil, urllib, unicodedata, zipfile, json
import send2trash
from cStringIO import StringIO
from anki.utils import checksum, isWin, isMac
from anki.db import DB
from anki.consts import *
from anki.latex import mungeQA


class MediaManager(object):

    soundRegexps = ["(?i)(\[sound:(?P<fname>[^]]+)\])"]
    imgRegexps = [
        # src element quoted case
        "(?i)(<img[^>]+src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)",
        # unquoted case
        "(?i)(<img[^>]+src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)",
    ]
    regexps = soundRegexps + imgRegexps
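
    # Sketch (illustrative) of the named groups above: both the quoted and
    # unquoted <img> patterns expose the filename as group "fname".
    #
    #   m = re.search(imgRegexps[0], "<img src='a.jpg'>")
    #   assert m.group("fname") == "a.jpg"
    #   m = re.search(imgRegexps[1], "<img src=a.jpg>")
    #   assert m.group("fname") == "a.jpg"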

    def __init__(self, col, server):
        self.col = col
        if server:
            self._dir = None
            return
        # media directory
        self._dir = re.sub("(?i)\.(anki2)$", ".media", self.col.path)
        # convert dir to unicode if it's not already
        if isinstance(self._dir, str):
            self._dir = unicode(self._dir, sys.getfilesystemencoding())
        if not os.path.exists(self._dir):
            os.makedirs(self._dir)
        try:
            self._oldcwd = os.getcwd()
        except OSError:
            # cwd doesn't exist
            self._oldcwd = None
        os.chdir(self._dir)
        # change database
        self.connect()

    def connect(self):
        if self.col.server:
            return
        path = self.dir()+".db"
        create = not os.path.exists(path)
        os.chdir(self._dir)
        self.db = DB(path)
        if create:
            self._initDB()

    def close(self):
        if self.col.server:
            return
        self.db.close()
        self.db = None
        # change cwd back to old location
        if self._oldcwd:
            try:
                os.chdir(self._oldcwd)
            except:
                # may have been deleted
                pass

    def dir(self):
        return self._dir

    def _isFAT32(self):
        if not isWin:
            return
        import win32api, win32file
        name = win32file.GetVolumeNameForVolumeMountPoint(self._dir[:3])
        if win32api.GetVolumeInformation(name)[4].lower().startswith("fat"):
            return True

    # Adding media
    ##########################################################################

    def addFile(self, opath):
        """Copy PATH to MEDIADIR, and return new filename.
        If the same name exists, compare checksums."""
        mdir = self.dir()
        # remove any dangerous characters
        base = re.sub(r"[][<>:/\\&?\"\|]", "", os.path.basename(opath))
        (root, ext) = os.path.splitext(base)
        def repl(match):
            n = int(match.group(1))
            return " (%d)" % (n+1)
        # find the first available name
        while True:
            path = os.path.join(mdir, root + ext)
            # if it doesn't exist, copy it directly
            if not os.path.exists(path):
                shutil.copyfile(opath, path)
                return os.path.basename(path)
            # if it's identical, reuse
            if self.filesIdentical(opath, path):
                return os.path.basename(path)
            # otherwise, increment the index in the filename
            reg = " \((\d+)\)$"
            if not re.search(reg, root):
                root = root + " (1)"
            else:
                root = re.sub(reg, repl, root)

    def filesIdentical(self, path1, path2):
        "True if files are the same."
        return (checksum(open(path1, "rb").read()) ==
                checksum(open(path2, "rb").read()))

    # String manipulation
    ##########################################################################

    def filesInStr(self, mid, string, includeRemote=False):
        l = []
        model = self.col.models.get(mid)
        strings = []
        if model['type'] == MODEL_CLOZE and "{{c" in string:
            # if the field has clozes in it, we'll need to expand the
            # possibilities so we can render latex
            strings = self._expandClozes(string)
        else:
            strings = [string]
        for string in strings:
            # handle latex
            string = mungeQA(string, None, None, model, None, self.col)
            # extract filenames
            for reg in self.regexps:
                for match in re.finditer(reg, string):
                    fname = match.group("fname")
                    isLocal = not re.match("(https?|ftp)://", fname.lower())
                    if isLocal or includeRemote:
                        l.append(fname)
        return l

    def _expandClozes(self, string):
        ords = set(re.findall("{{c(\d+)::.+?}}", string))
        strings = []
        from anki.template.template import clozeReg
        def qrepl(m):
            if m.group(3):
                return "[%s]" % m.group(3)
            else:
                return "[...]"
        def arepl(m):
            return m.group(1)
        for ord in ords:
            s = re.sub(clozeReg%ord, qrepl, string)
            s = re.sub(clozeReg%".+?", "\\1", s)
            strings.append(s)
        strings.append(re.sub(clozeReg%".+?", arepl, string))
        return strings
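
    # Sketch (illustrative) of _expandClozes() above: for the field
    # "{{c1::a}} {{c2::b}}" it returns, in some order, the rendered question
    # for each ordinal plus an answer string: ["[...] b", "a [...]", "a b"],
    # so media or latex referenced by any single card ordinal is still found.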

    def transformNames(self, txt, func):
        for reg in self.regexps:
            txt = re.sub(reg, func, txt)
        return txt

    def strip(self, txt):
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

    def escapeImages(self, string):
        def repl(match):
            tag = match.group(0)
            fname = match.group("fname")
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(
                fname, urllib.quote(fname.encode("utf-8")))
        for reg in self.imgRegexps:
            string = re.sub(reg, repl, string)
        return string

    # Rebuilding DB
    ##########################################################################

    def check(self, local=None):
        "Return (missingFiles, unusedFiles)."
        mdir = self.dir()
        # generate card q/a and look through all references
        normrefs = {}
        def norm(s):
            if isinstance(s, unicode) and isMac:
                return unicodedata.normalize('NFD', s)
            return s
        for f in self.allMedia():
            normrefs[norm(f)] = True
        # loop through directory and find unused & missing media
        unused = []
        if local is None:
            files = os.listdir(mdir)
        else:
            files = local
        for file in files:
            if not local:
                path = os.path.join(mdir, file)
                if not os.path.isfile(path):
                    # ignore directories
                    continue
            if file.startswith("_"):
                # leading _ says to ignore file
                continue
            nfile = norm(file)
            if nfile not in normrefs:
                unused.append(file)
            else:
                del normrefs[nfile]
        nohave = [x for x in normrefs.keys() if not x.startswith("_")]
        return (nohave, unused)

    def allMedia(self):
        "Return a set of all referenced filenames."
        files = set()
        for mid, flds in self.col.db.execute("select mid, flds from notes"):
            for f in self.filesInStr(mid, flds):
                files.add(f)
        return files

    # Copying on import
    ##########################################################################

    def have(self, fname):
        return os.path.exists(os.path.join(self.dir(), fname))

    # Media syncing - changes and removal
    ##########################################################################

    def hasChanged(self):
        return self.db.scalar("select 1 from log limit 1")

    def removed(self):
        return self.db.list("select * from log where type = ?", MEDIA_REM)

    def syncRemove(self, fnames):
        # remove provided deletions
        for f in fnames:
            if os.path.exists(f):
                send2trash.send2trash(f)
            self.db.execute("delete from log where fname = ?", f)
            self.db.execute("delete from media where fname = ?", f)
        # and all locally-logged deletions, as server has acked them
        self.db.execute("delete from log where type = ?", MEDIA_REM)
        self.db.commit()

    # Media syncing - unbundling zip files from server
    ##########################################################################

    def syncAdd(self, zipData):
        "Extract zip data; true if finished."
        f = StringIO(zipData)
        z = zipfile.ZipFile(f, "r")
        finished = False
        meta = None
        media = []
        sizecnt = 0
        # get meta info first
        assert z.getinfo("_meta").file_size < 100000
        meta = json.loads(z.read("_meta"))
        nextUsn = int(z.read("_usn"))
        # then loop through all files
        for i in z.infolist():
            # check for zip bombs
            sizecnt += i.file_size
            assert sizecnt < 100*1024*1024
            if i.filename == "_meta" or i.filename == "_usn":
                # ignore previously-retrieved meta
                continue
            elif i.filename == "_finished":
                # last zip in set
                finished = True
            else:
                data = z.read(i)
                csum = checksum(data)
                name = meta[i.filename]
                # can we store the file on this system?
                if self.illegal(name):
                    continue
                # save file
                open(name, "wb").write(data)
                # update db
                media.append((name, csum, self._mtime(name)))
                # remove entries from local log
                self.db.execute("delete from log where fname = ?", name)
        # update media db and note new starting usn
        if media:
            self.db.executemany(
                "insert or replace into media values (?,?,?)", media)
            self.setUsn(nextUsn) # commits
        # if we have finished adding, we need to record the new folder mtime
        # so that we don't trigger a needless scan
        if finished:
            self.syncMod()
        return finished

    def illegal(self, f):
        if isWin:
            for c in f:
                if c in "<>:\"/\\|?*^":
                    return True
        elif isMac:
            for c in f:
                if c in ":\\/":
                    return True

    # Media syncing - bundling zip files to send to server
    ##########################################################################

    # Because there's no standard filename encoding for zips, and because not
    # all zip clients support retrieving mtime, we store the files as ascii
    # and place a json file in the zip with the necessary information.

    def zipAdded(self):
        "Add files to a zip until over SYNC_ZIP_SIZE/COUNT. Return zip data."
        f = StringIO()
        z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED)
        sz = 0
        cnt = 0
        files = {}
        cur = self.db.execute(
            "select fname from log where type = ?", MEDIA_ADD)
        fnames = []
        while 1:
            fname = cur.fetchone()
            if not fname:
                # add a flag so the server knows it can clean up
                z.writestr("_finished", "")
                break
            fname = fname[0]
            fnames.append([fname])
            z.write(fname, str(cnt))
            files[str(cnt)] = fname
            sz += os.path.getsize(fname)
            if sz > SYNC_ZIP_SIZE or cnt > SYNC_ZIP_COUNT:
                break
            cnt += 1
        z.writestr("_meta", json.dumps(files))
        z.close()
        return f.getvalue(), fnames

    def forgetAdded(self, fnames):
        if not fnames:
            return
        self.db.executemany("delete from log where fname = ?", fnames)
        self.db.commit()

    # Tracking changes (private)
    ##########################################################################

    def _initDB(self):
        self.db.executescript("""
create table media (fname text primary key, csum text, mod int);
create table meta (dirMod int, usn int); insert into meta values (0, 0);
create table log (fname text primary key, type int);
""")

    def _mtime(self, path):
        return int(os.stat(path).st_mtime)

    def _checksum(self, path):
        return checksum(open(path, "rb").read())

    def usn(self):
        return self.db.scalar("select usn from meta")

    def setUsn(self, usn):
        self.db.execute("update meta set usn = ?", usn)
        self.db.commit()

    def syncMod(self):
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        self.db.commit()

    def _changed(self):
        "Return dir mtime if it has changed since the last findChanges()"
        # doesn't track edits, but user can add or remove a file to update
        mod = self.db.scalar("select dirMod from meta")
        mtime = self._mtime(self.dir())
        if not self._isFAT32() and mod and mod == mtime:
            return False
        return mtime

    def findChanges(self):
        "Scan the media folder if it's changed, and note any changes."
        if self._changed():
            self._logChanges()

    def _logChanges(self):
        (added, removed) = self._changes()
        log = []
        media = []
        mediaRem = []
        for f in added:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt))
            log.append((f, MEDIA_ADD))
        for f in removed:
            mediaRem.append((f,))
            log.append((f, MEDIA_REM))
        # update media db
        self.db.executemany("insert or replace into media values (?,?,?)",
                            media)
        if mediaRem:
            self.db.executemany("delete from media where fname = ?",
                                mediaRem)
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        # and logs
        self.db.executemany("insert or replace into log values (?,?)", log)
        self.db.commit()

    def _changes(self):
        self.cache = {}
        for (name, csum, mod) in self.db.execute(
            "select * from media"):
            self.cache[name] = [csum, mod, False]
        added = []
        removed = []
        # loop through on-disk files
        for f in os.listdir(self.dir()):
            # ignore folders and thumbs.db
            if os.path.isdir(f):
                continue
            if f.lower() == "thumbs.db":
                continue
            # and files with invalid chars
            bad = False
            for c in "\0", "/", "\\", ":":
                if c in f:
                    bad = True
                    break
            if bad:
                continue
            # empty files are invalid; clean them up and continue
            if not os.path.getsize(f):
                os.unlink(f)
                continue
            # newly added?
            if f not in self.cache:
                added.append(f)
            else:
                # modified since last time?
                if self._mtime(f) != self.cache[f][1]:
                    # and has different checksum?
                    if self._checksum(f) != self.cache[f][0]:
                        added.append(f)
                # mark as used
                self.cache[f][2] = True
        # look for any entries in the cache that no longer exist on disk
        for (k, v) in self.cache.items():
            if not v[2]:
                removed.append(k)
        return added, removed

    def sanityCheck(self):
        assert not self.db.scalar("select count() from log")
        cnt = self.db.scalar("select count() from media")
        return cnt

    def forceResync(self):
        self.db.execute("delete from media")
        self.db.execute("delete from log")
        self.db.execute("update meta set usn = 0, dirMod = 0")
        self.db.commit()

    def removeExisting(self, files):
        "Remove files from list of files to sync, and return missing files."
        need = []
        remove = []
        for f in files:
            if self.db.scalar("select 1 from log where fname=?", f):
                remove.append((f,))
            else:
                need.append(f)
        self.db.executemany("delete from log where fname=?", remove)
        self.db.commit()
        # if we need all the server files, it's faster to pass None than
        # the full list
        if need and len(files) == len(need):
            return None
        return need
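
# Sketch (illustrative) of removeExisting() above: given server files
# ["a.jpg", "b.jpg"] with only "a.jpg" already logged locally, it deletes
# a.jpg's log entry and returns ["b.jpg"]; if nothing in the list is logged,
# it returns None so the caller can simply request everything.
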
# Imports reconstructed from usage; this is an excerpt of a larger module.
import os, re, sys, urllib, unicodedata, zipfile, json
import send2trash
from cStringIO import StringIO
from anki.utils import checksum, isWin, isMac
from anki.db import DB
from anki.consts import *
from anki.latex import mungeQA


class MediaManager(object):

    soundRegexps = ["(?i)(\[sound:(?P<fname>[^]]+)\])"]
    imgRegexps = [
        # src element quoted case
        "(?i)(<img[^>]+src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)",
        # unquoted case
        "(?i)(<img[^>]+src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)",
    ]
    regexps = soundRegexps + imgRegexps

    def __init__(self, col, server):
        self.col = col
        if server:
            self._dir = None
            return
        # media directory
        self._dir = re.sub("(?i)\.(anki2)$", ".media", self.col.path)
        # convert dir to unicode if it's not already
        if isinstance(self._dir, str):
            self._dir = unicode(self._dir, sys.getfilesystemencoding())
        if not os.path.exists(self._dir):
            os.makedirs(self._dir)
        try:
            self._oldcwd = os.getcwd()
        except OSError:
            # cwd doesn't exist
            self._oldcwd = None
        os.chdir(self._dir)
        # change database
        self.connect()

    def connect(self):
        if self.col.server:
            return
        path = self.dir() + ".db"
        create = not os.path.exists(path)
        os.chdir(self._dir)
        self.db = DB(path)
        if create:
            self._initDB()

    def close(self):
        if self.col.server:
            return
        self.db.close()
        self.db = None
        # change cwd back to old location
        if self._oldcwd:
            try:
                os.chdir(self._oldcwd)
            except:
                # may have been deleted
                pass

    def dir(self):
        return self._dir

    def _isFAT32(self):
        if not isWin:
            return
        import win32api, win32file
        try:
            name = win32file.GetVolumeNameForVolumeMountPoint(self._dir[:3])
        except:
            # mapped & unmapped network drive; pray that it's not vfat
            return
        if win32api.GetVolumeInformation(name)[4].lower().startswith("fat"):
            return True

    # Adding media
    ##########################################################################

    def addFile(self, opath):
        return self.writeData(opath, open(opath, "rb").read())

    def writeData(self, opath, data):
        # if fname is a full path, use only the basename
        fname = os.path.basename(opath)
        # remove any dangerous characters
        base = self.stripIllegal(fname)
        (root, ext) = os.path.splitext(base)
        def repl(match):
            n = int(match.group(1))
            return " (%d)" % (n + 1)
        # find the first available name
        csum = checksum(data)
        while True:
            fname = root + ext
            path = os.path.join(self.dir(), fname)
            # if it doesn't exist, copy it directly
            if not os.path.exists(path):
                open(path, "wb").write(data)
                return fname
            # if it's identical, reuse
            if checksum(open(path, "rb").read()) == csum:
                return fname
            # otherwise, increment the index in the filename
            reg = " \((\d+)\)$"
            if not re.search(reg, root):
                root = root + " (1)"
            else:
                root = re.sub(reg, repl, root)

    # String manipulation
    ##########################################################################

    def filesInStr(self, mid, string, includeRemote=False):
        l = []
        model = self.col.models.get(mid)
        strings = []
        if model['type'] == MODEL_CLOZE and "{{c" in string:
            # if the field has clozes in it, we'll need to expand the
            # possibilities so we can render latex
            strings = self._expandClozes(string)
        else:
            strings = [string]
        for string in strings:
            # handle latex
            string = mungeQA(string, None, None, model, None, self.col)
            # extract filenames
            for reg in self.regexps:
                for match in re.finditer(reg, string):
                    fname = match.group("fname")
                    isLocal = not re.match("(https?|ftp)://", fname.lower())
                    if isLocal or includeRemote:
                        l.append(fname)
        return l

    def _expandClozes(self, string):
        ords = set(re.findall("{{c(\d+)::.+?}}", string))
        strings = []
        from anki.template.template import clozeReg
        def qrepl(m):
            if m.group(3):
                return "[%s]" % m.group(3)
            else:
                return "[...]"
        def arepl(m):
            return m.group(1)
        for ord in ords:
            s = re.sub(clozeReg % ord, qrepl, string)
            s = re.sub(clozeReg % ".+?", "\\1", s)
            strings.append(s)
        strings.append(re.sub(clozeReg % ".+?", arepl, string))
        return strings

    def transformNames(self, txt, func):
        for reg in self.regexps:
            txt = re.sub(reg, func, txt)
        return txt

    def strip(self, txt):
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

    def escapeImages(self, string):
        def repl(match):
            tag = match.group(0)
            fname = match.group("fname")
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(fname, urllib.quote(fname.encode("utf-8")))
        for reg in self.imgRegexps:
            string = re.sub(reg, repl, string)
        return string

    # Rebuilding DB
    ##########################################################################

    def check(self, local=None):
        "Return (missingFiles, unusedFiles)."
        mdir = self.dir()
        # generate card q/a and look through all references
        normrefs = {}
        def norm(s):
            if isinstance(s, unicode) and isMac:
                return unicodedata.normalize('NFD', s)
            return s
        for f in self.allMedia():
            normrefs[norm(f)] = True
        # loop through directory and find unused & missing media
        unused = []
        if local is None:
            files = os.listdir(mdir)
        else:
            files = local
        for file in files:
            if not local:
                path = os.path.join(mdir, file)
                if not os.path.isfile(path):
                    # ignore directories
                    continue
            if file.startswith("_"):
                # leading _ says to ignore file
                continue
            nfile = norm(file)
            if nfile not in normrefs:
                unused.append(file)
            else:
                del normrefs[nfile]
        nohave = [x for x in normrefs.keys() if not x.startswith("_")]
        return (nohave, unused)

    def allMedia(self):
        "Return a set of all referenced filenames."
        files = set()
        for mid, flds in self.col.db.execute("select mid, flds from notes"):
            for f in self.filesInStr(mid, flds):
                files.add(f)
        return files

    # Copying on import
    ##########################################################################

    def have(self, fname):
        return os.path.exists(os.path.join(self.dir(), fname))

    # Media syncing - changes and removal
    ##########################################################################

    def hasChanged(self):
        return self.db.scalar("select 1 from log limit 1")

    def removed(self):
        return self.db.list("select * from log where type = ?", MEDIA_REM)

    def syncRemove(self, fnames):
        # remove provided deletions
        for f in fnames:
            if os.path.exists(f):
                send2trash.send2trash(f)
            self.db.execute("delete from log where fname = ?", f)
            self.db.execute("delete from media where fname = ?", f)
        # and all locally-logged deletions, as server has acked them
        self.db.execute("delete from log where type = ?", MEDIA_REM)
        self.db.commit()

    # Media syncing - unbundling zip files from server
    ##########################################################################

    def syncAdd(self, zipData):
        "Extract zip data; true if finished."
        f = StringIO(zipData)
        z = zipfile.ZipFile(f, "r")
        finished = False
        meta = None
        media = []
        # get meta info first
        meta = json.loads(z.read("_meta"))
        nextUsn = int(z.read("_usn"))
        # then loop through all files
        for i in z.infolist():
            if i.filename == "_meta" or i.filename == "_usn":
                # ignore previously-retrieved meta
                continue
            elif i.filename == "_finished":
                # last zip in set
                finished = True
            else:
                data = z.read(i)
                csum = checksum(data)
                name = meta[i.filename]
                # save file
                open(name, "wb").write(data)
                # update db
                media.append((name, csum, self._mtime(name)))
                # remove entries from local log
                self.db.execute("delete from log where fname = ?", name)
        # update media db and note new starting usn
        if media:
            self.db.executemany(
                "insert or replace into media values (?,?,?)", media)
            self.setUsn(nextUsn) # commits
        # if we have finished adding, we need to record the new folder mtime
        # so that we don't trigger a needless scan
        if finished:
            self.syncMod()
        return finished

    # Illegal characters
    ##########################################################################

    _illegalCharReg = re.compile(r'[][><:"/?*^\\|\0]')

    def stripIllegal(self, str):
        return re.sub(self._illegalCharReg, "", str)

    def hasIllegal(self, str):
        return not not re.search(self._illegalCharReg, str)

    # Media syncing - bundling zip files to send to server
    ##########################################################################

    # Because there's no standard filename encoding for zips, and because not
    # all zip clients support retrieving mtime, we store the files as ascii
    # and place a json file in the zip with the necessary information.

    def zipAdded(self):
        "Add files to a zip until over SYNC_ZIP_SIZE/COUNT. Return zip data."
        f = StringIO()
        z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED)
        sz = 0
        cnt = 0
        files = {}
        cur = self.db.execute(
            "select fname from log where type = ?", MEDIA_ADD)
        fnames = []
        while 1:
            fname = cur.fetchone()
            if not fname:
                # add a flag so the server knows it can clean up
                z.writestr("_finished", "")
                break
            fname = fname[0]
            fnames.append([fname])
            z.write(fname, str(cnt))
            files[str(cnt)] = fname
            sz += os.path.getsize(fname)
            if sz > SYNC_ZIP_SIZE or cnt > SYNC_ZIP_COUNT:
                break
            cnt += 1
        z.writestr("_meta", json.dumps(files))
        z.close()
        return f.getvalue(), fnames

    def forgetAdded(self, fnames):
        if not fnames:
            return
        self.db.executemany("delete from log where fname = ?", fnames)
        self.db.commit()

    # Tracking changes (private)
    ##########################################################################

    def _initDB(self):
        self.db.executescript("""
create table media (fname text primary key, csum text, mod int);
create table meta (dirMod int, usn int); insert into meta values (0, 0);
create table log (fname text primary key, type int);
""")

    def _mtime(self, path):
        return int(os.stat(path).st_mtime)

    def _checksum(self, path):
        return checksum(open(path, "rb").read())

    def usn(self):
        return self.db.scalar("select usn from meta")

    def setUsn(self, usn):
        self.db.execute("update meta set usn = ?", usn)
        self.db.commit()

    def syncMod(self):
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        self.db.commit()

    def _changed(self):
        "Return dir mtime if it has changed since the last findChanges()"
        # doesn't track edits, but user can add or remove a file to update
        mod = self.db.scalar("select dirMod from meta")
        mtime = self._mtime(self.dir())
        if not self._isFAT32() and mod and mod == mtime:
            return False
        return mtime

    def findChanges(self):
        "Scan the media folder if it's changed, and note any changes."
        if self._changed():
            self._logChanges()

    def _logChanges(self):
        (added, removed) = self._changes()
        log = []
        media = []
        mediaRem = []
        for f in added:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt))
            log.append((f, MEDIA_ADD))
        for f in removed:
            mediaRem.append((f,))
            log.append((f, MEDIA_REM))
        # update media db
        self.db.executemany("insert or replace into media values (?,?,?)",
                            media)
        if mediaRem:
            self.db.executemany("delete from media where fname = ?",
                                mediaRem)
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        # and logs
        self.db.executemany("insert or replace into log values (?,?)", log)
        self.db.commit()

    def _changes(self):
        self.cache = {}
        for (name, csum, mod) in self.db.execute(
            "select * from media"):
            self.cache[name] = [csum, mod, False]
        added = []
        removed = []
        # loop through on-disk files
        for f in os.listdir(self.dir()):
            # ignore folders and thumbs.db
            if os.path.isdir(f):
                continue
            if f.lower() == "thumbs.db":
                continue
            # and files with invalid chars
            if self.hasIllegal(f):
                continue
            # empty files are invalid; clean them up and continue
            if not os.path.getsize(f):
                os.unlink(f)
                continue
            # newly added?
            if f not in self.cache:
                added.append(f)
            else:
                # modified since last time?
                if self._mtime(f) != self.cache[f][1]:
                    # and has different checksum?
                    if self._checksum(f) != self.cache[f][0]:
                        added.append(f)
                # mark as used
                self.cache[f][2] = True
        # look for any entries in the cache that no longer exist on disk
        for (k, v) in self.cache.items():
            if not v[2]:
                removed.append(k)
        return added, removed

    def sanityCheck(self):
        assert not self.db.scalar("select count() from log")
        cnt = self.db.scalar("select count() from media")
        return cnt

    def forceResync(self):
        self.db.execute("delete from media")
        self.db.execute("delete from log")
        self.db.execute("update meta set usn = 0, dirMod = 0")
        self.db.commit()

    def removeExisting(self, files):
        "Remove files from list of files to sync, and return missing files."
        need = []
        remove = []
        for f in files:
            if self.db.scalar("select 1 from log where fname=?", f):
                remove.append((f,))
            else:
                need.append(f)
        self.db.executemany("delete from log where fname=?", remove)
        self.db.commit()
        # if we need all the server files, it's faster to pass None than
        # the full list
        if need and len(files) == len(need):
            return None
        return need
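
# A minimal usage sketch (illustrative names and data) of the helpers above:
#
#   mm = col.media
#   mm.stripIllegal('a:b/c?.jpg')        # -> "abc.jpg"
#   mm.hasIllegal("ok.jpg")              # -> False
#   mm.writeData("pasted.jpg", imgData)  # writes bytes, returns stored name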
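# An illustrative sketch of the mtime-then-checksum test _changes() applies
# to each cached file: the cheap mtime comparison filters out untouched
# files, and the checksum is only computed when the mtime differs, so a
# touched-but-unmodified file is not reported. The arguments stand in for
# one row of the media table.
def _exampleDetectChange(path, cachedCsum, cachedMod):
    if int(os.stat(path).st_mtime) == cachedMod:
        # mtime unchanged; skip the expensive read
        return False
    # mtime changed; only report files whose contents really changed
    return checksum(open(path, "rb").read()) != cachedCsum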
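# A rough sketch of how a sync client might drive this class. `col` is an
# open collection; `server` and its remove()/addFiles() methods are
# hypothetical stand-ins for the real sync transport, which lives outside
# this file.
def _exampleClientFlow(col, server):
    mm = col.media
    # note local additions and removals in the log table
    mm.findChanges()
    # push our deletions, and apply any the (hypothetical) server returns
    mm.syncRemove(server.remove(mm.removed()))
    # upload additions in bounded zips until the log runs dry
    while True:
        zipData, fnames = mm.zipAdded()
        server.addFiles(zipData)
        mm.forgetAdded(fnames)
        if not fnames:
            break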