# imports reconstructed for context (this excerpt omits the original file
# header); the exact names are an assumption based on the surrounding anki code
import os
import re
import shutil
import unicodedata
import urllib
import zipfile
from cStringIO import StringIO

import simplejson

from anki.consts import *
from anki.db import DB
from anki.latex import mungeQA
from anki.utils import checksum, isMac, isWin


class MediaManager(object):

    # other code depends on this order, so don't reorder
    regexps = (r"(?i)(\[sound:([^]]+)\])",
               r"(?i)(<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>)")

    def __init__(self, col):
        self.col = col
        # media directory
        self._dir = re.sub(r"(?i)\.(anki2)$", ".media", self.col.path)
        if not os.path.exists(self._dir):
            os.makedirs(self._dir)
        self._oldcwd = os.getcwd()
        os.chdir(self._dir)
        # change database
        self.connect()

    def connect(self):
        if self.col.server:
            return
        path = self.dir()+".db"
        create = not os.path.exists(path)
        self.db = DB(path)
        if create:
            self._initDB()

    def close(self):
        if self.col.server:
            return
        self.db.close()
        self.db = None
        # change cwd back to old location
        os.chdir(self._oldcwd)

    def dir(self):
        return self._dir

    # Adding media
    ##########################################################################

    def addFile(self, opath):
        """Copy PATH to MEDIADIR, and return new filename.
        If the same name exists, compare checksums."""
        mdir = self.dir()
        # remove any dangerous characters
        base = re.sub(r"[][<>:/\\&]", "", os.path.basename(opath))
        dst = os.path.join(mdir, base)
        # if it doesn't exist, copy it directly
        if not os.path.exists(dst):
            shutil.copy2(opath, dst)
            return base
        # if it's identical, reuse
        if self.filesIdentical(opath, dst):
            return base
        # otherwise, find a unique name
        (root, ext) = os.path.splitext(base)
        def repl(match):
            n = int(match.group(1))
            return " (%d)" % (n+1)
        while True:
            path = os.path.join(mdir, root + ext)
            if not os.path.exists(path):
                break
            reg = r" \((\d+)\)$"
            if not re.search(reg, root):
                root = root + " (1)"
            else:
                root = re.sub(reg, repl, root)
        # copy and return
        shutil.copy2(opath, path)
        return os.path.basename(path)

    def filesIdentical(self, path1, path2):
        "True if files are the same."
        return (checksum(open(path1, "rb").read()) ==
                checksum(open(path2, "rb").read()))

    # String manipulation
    ##########################################################################

    def filesInStr(self, mid, string, includeRemote=False):
        l = []
        # convert latex first
        model = self.col.models.get(mid)
        string = mungeQA(string, None, None, model, None, self.col)
        # extract filenames
        for reg in self.regexps:
            for (full, fname) in re.findall(reg, string):
                isLocal = not re.match("(https?|ftp)://", fname.lower())
                if isLocal or includeRemote:
                    l.append(fname)
        return l

    def strip(self, txt):
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

    def escapeImages(self, string):
        # Feeding webkit unicode can result in it not finding images, so on
        # linux/osx we percent escape the image paths as utf8. On Windows the
        # problem is more complicated - if we percent-escape as utf8 it fixes
        # some images but breaks others. When filenames are normalized by
        # dropbox they become unreadable if we escape them.
        if isWin:
            return string
        def repl(match):
            tag = match.group(1)
            fname = match.group(2)
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(
                fname, urllib.quote(fname.encode("utf-8")))
        return re.sub(self.regexps[1], repl, string)

    # Rebuilding DB
    ##########################################################################

    def check(self, local=None):
        "Return (missingFiles, unusedFiles)."
        mdir = self.dir()
        # generate card q/a and look through all references
        normrefs = {}
        def norm(s):
            if isinstance(s, unicode):
                return unicodedata.normalize('NFD', s)
            return s
        for f in self.allMedia():
            normrefs[norm(f)] = True
        # loop through directory and find unused & missing media
        unused = []
        if local is None:
            files = os.listdir(mdir)
        else:
            files = local
        for file in files:
            if not local:
                path = os.path.join(mdir, file)
                if not os.path.isfile(path):
                    # ignore directories
                    continue
            nfile = norm(file)
            if nfile not in normrefs:
                unused.append(file)
            else:
                del normrefs[nfile]
        nohave = normrefs.keys()
        return (nohave, unused)

    def allMedia(self):
        "Return a set of all referenced filenames."
        files = set()
        for mid, flds in self.col.db.execute("select mid, flds from notes"):
            for f in self.filesInStr(mid, flds):
                files.add(f)
        return files

    # Copying on import
    ##########################################################################

    # FIXME: check if the files are actually identical, and rewrite
    # references if necessary
    def copyTo(self, rdir):
        "Copy media to RDIR. Return number of files copied."
        ldir = self.dir()
        if not os.path.exists(ldir):
            return 0
        cnt = 0
        for f in os.listdir(ldir):
            src = os.path.join(ldir, f)
            dst = os.path.join(rdir, f)
            if not os.path.exists(dst):
                shutil.copy2(src, dst)
                cnt += 1
        return cnt

    # Media syncing - changes and removal
    ##########################################################################

    def hasChanged(self):
        return self.db.scalar("select 1 from log limit 1")

    def removed(self):
        return self.db.list("select * from log where type = ?", MEDIA_REM)

    def syncRemove(self, fnames):
        # remove provided deletions
        for f in fnames:
            if os.path.exists(f):
                os.unlink(f)
            self.db.execute("delete from log where fname = ?", f)
            self.db.execute("delete from media where fname = ?", f)
        # and all locally-logged deletions, as server has acked them
        self.db.execute("delete from log where type = ?", MEDIA_REM)
        self.db.commit()

    # Media syncing - unbundling zip files from server
    ##########################################################################

    def syncAdd(self, zipData):
        "Extract zip data; true if finished."
        f = StringIO(zipData)
        z = zipfile.ZipFile(f, "r")
        finished = False
        meta = None
        media = []
        sizecnt = 0
        # get meta info first
        assert z.getinfo("_meta").file_size < 100000
        meta = simplejson.loads(z.read("_meta"))
        nextUsn = int(z.read("_usn"))
        # then loop through all files
        for i in z.infolist():
            # check for zip bombs
            sizecnt += i.file_size
            assert sizecnt < 100*1024*1024
            if i.filename == "_meta" or i.filename == "_usn":
                # ignore previously-retrieved meta
                continue
            elif i.filename == "_finished":
                # last zip in set
                finished = True
            else:
                data = z.read(i)
                csum = checksum(data)
                name = meta[i.filename]
                # can we store the file on this system?
                if self.illegal(i.filename):
                    continue
                # save file
                open(name, "wb").write(data)
                # update db
                media.append((name, csum, self._mtime(name)))
                # remove entries from local log
                self.db.execute("delete from log where fname = ?", name)
        # update media db and note new starting usn
        if media:
            self.db.executemany(
                "insert or replace into media values (?,?,?)", media)
            self.setUsn(nextUsn) # commits
        # if we have finished adding, we need to record the new folder mtime
        # so that we don't trigger a needless scan
        if finished:
            self.syncMod()
        return finished

    def illegal(self, f):
        if isWin:
            for c in f:
                if c in "<>:\"/\\|?*^":
                    return True
        elif isMac:
            for c in f:
                if c in ":\\/":
                    return True

    # Media syncing - bundling zip files to send to server
    ##########################################################################

    # Because there's no standard filename encoding for zips, and because not
    # all zip clients support retrieving mtime, we store the files as ascii
    # and place a json file in the zip with the necessary information.

    def zipAdded(self):
        "Add files to a zip until over SYNC_ZIP_SIZE. Return zip data."
        f = StringIO()
        z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED)
        sz = 0
        cnt = 0
        files = {}
        cur = self.db.execute(
            "select fname from log where type = ?", MEDIA_ADD)
        fnames = []
        while 1:
            fname = cur.fetchone()
            if not fname:
                # add a flag so the server knows it can clean up
                z.writestr("_finished", "")
                break
            fname = fname[0]
            fnames.append([fname])
            z.write(fname, str(cnt))
            files[str(cnt)] = fname
            sz += os.path.getsize(fname)
            if sz > SYNC_ZIP_SIZE:
                break
            cnt += 1
        z.writestr("_meta", simplejson.dumps(files))
        z.close()
        return f.getvalue(), fnames

    def forgetAdded(self, fnames):
        if not fnames:
            return
        self.db.executemany("delete from log where fname = ?", fnames)
        self.db.commit()

    # Tracking changes (private)
    ##########################################################################

    def _initDB(self):
        self.db.executescript("""
create table media (fname text primary key, csum text, mod int);
create table meta (dirMod int, usn int); insert into meta values (0, 0);
create table log (fname text primary key, type int);
""")

    def _mtime(self, path):
        return int(os.stat(path).st_mtime)

    def _checksum(self, path):
        return checksum(open(path, "rb").read())

    def usn(self):
        return self.db.scalar("select usn from meta")

    def setUsn(self, usn):
        self.db.execute("update meta set usn = ?", usn)
        self.db.commit()

    def syncMod(self):
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        self.db.commit()

    def _changed(self):
        "Return dir mtime if it has changed since the last findChanges()"
        # doesn't track edits, but user can add or remove a file to update
        mod = self.db.scalar("select dirMod from meta")
        mtime = self._mtime(self.dir())
        if mod and mod == mtime:
            return False
        return mtime

    def findChanges(self):
        "Scan the media folder if it's changed, and note any changes."
        if self._changed():
            self._logChanges()

    def _logChanges(self):
        (added, removed) = self._changes()
        log = []
        media = []
        mediaRem = []
        for f in added:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt))
            log.append((f, MEDIA_ADD))
        for f in removed:
            mediaRem.append((f,))
            log.append((f, MEDIA_REM))
        # update media db
        self.db.executemany("insert or replace into media values (?,?,?)",
                            media)
        if mediaRem:
            self.db.executemany("delete from media where fname = ?",
                                mediaRem)
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        # and logs
        self.db.executemany("insert or replace into log values (?,?)", log)
        self.db.commit()

    def _changes(self):
        self.cache = {}
        for (name, csum, mod) in self.db.execute(
                "select * from media"):
            self.cache[name] = [csum, mod, False]
        added = []
        removed = []
        # loop through on-disk files
        for f in os.listdir(self.dir()):
            # ignore folders
            if os.path.isdir(f):
                continue
            # newly added?
            if f not in self.cache:
                added.append(f)
            else:
                # modified since last time?
                if self._mtime(f) != self.cache[f][1]:
                    # and has different checksum?
                    if self._checksum(f) != self.cache[f][0]:
                        added.append(f)
                # mark as used
                self.cache[f][2] = True
        # look for any entries in the cache that no longer exist on disk
        for (k, v) in self.cache.items():
            if not v[2]:
                removed.append(k)
        return added, removed

    def sanityCheck(self):
        assert not self.db.scalar("select count() from log")
        cnt = self.db.scalar("select count() from media")
        return cnt
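
# Illustrative sketch, not part of the original file: the collision loop in
# addFile() above probes "name.ext", "name (1).ext", "name (2).ext", ... until
# a slot is free (or an identical file lets it reuse a name). The helper below
# reproduces just the renaming step against an in-memory set of names; the
# name unique_name is invented for this example.
def unique_name(existing, base):
    (root, ext) = os.path.splitext(base)
    reg = r" \((\d+)\)$"
    def repl(match):
        # bump " (n)" to " (n+1)"
        return " (%d)" % (int(match.group(1)) + 1)
    while root + ext in existing:
        if not re.search(reg, root):
            root += " (1)"
        else:
            root = re.sub(reg, repl, root)
    return root + ext

assert unique_name(set(["a.jpg", "a (1).jpg"]), "a.jpg") == "a (2).jpg"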
# imports reconstructed for context (this excerpt omits the original file
# header); the exact names are an assumption based on the surrounding anki code
import io
import json
import os
import re
import sys
import traceback
import unicodedata
import urllib.parse
import zipfile

from anki.consts import *
from anki.db import DB
from anki.utils import checksum, isMac, isWin


class MediaManager(object):

    soundRegexps = [r"(?i)(\[sound:(?P<fname>[^]]+)\])"]
    imgRegexps = [
        # src element quoted case
        r"(?i)(<img[^>]* src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)",
        # unquoted case
        r"(?i)(<img[^>]* src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)",
    ]
    regexps = soundRegexps + imgRegexps

    def __init__(self, col, server):
        self.col = col
        if server:
            self._dir = None
            return
        # media directory
        self._dir = re.sub(r"(?i)\.(anki2)$", ".media", self.col.path)
        if not os.path.exists(self._dir):
            os.makedirs(self._dir)
        try:
            self._oldcwd = os.getcwd()
        except OSError:
            # cwd doesn't exist
            self._oldcwd = None
        try:
            os.chdir(self._dir)
        except OSError:
            raise Exception("invalidTempFolder")
        # change database
        self.connect()

    def connect(self):
        if self.col.server:
            return
        path = self.dir() + ".db2"
        create = not os.path.exists(path)
        os.chdir(self._dir)
        self.db = DB(path)
        if create:
            self._initDB()
        self.maybeUpgrade()

    def _initDB(self):
        self.db.executescript("""
create table media (
 fname text not null primary key,
 csum text,           -- null indicates deleted file
 mtime int not null,  -- zero if deleted
 dirty int not null
);
create index idx_media_dirty on media (dirty);
create table meta (dirMod int, lastUsn int);
insert into meta values (0, 0);
""")

    def maybeUpgrade(self):
        oldpath = self.dir() + ".db"
        if os.path.exists(oldpath):
            self.db.execute('attach "../collection.media.db" as old')
            try:
                self.db.execute("""
insert into media
 select m.fname, csum, mod,
 ifnull((select 1 from log l2 where l2.fname=m.fname), 0) as dirty
 from old.media m
 left outer join old.log l using (fname)
 union
 select fname, null, 0, 1 from old.log where type=1;""")
                self.db.execute("delete from meta")
                self.db.execute("""
insert into meta select dirMod, usn from old.meta
""")
                self.db.commit()
            except Exception as e:
                # if we couldn't import the old db for some reason, just start
                # anew
                self.col.log("failed to import old media db:" +
                             traceback.format_exc())
            self.db.execute("detach old")
            npath = "../collection.media.db.old"
            if os.path.exists(npath):
                os.unlink(npath)
            os.rename("../collection.media.db", npath)

    def close(self):
        if self.col.server:
            return
        self.db.close()
        self.db = None
        # change cwd back to old location
        if self._oldcwd:
            try:
                os.chdir(self._oldcwd)
            except:
                # may have been deleted
                pass

    def dir(self):
        return self._dir

    def _isFAT32(self):
        if not isWin:
            return
        import win32api, win32file
        try:
            name = win32file.GetVolumeNameForVolumeMountPoint(self._dir[:3])
        except:
            # mapped & unmapped network drive; pray that it's not vfat
            return
        if win32api.GetVolumeInformation(name)[4].lower().startswith("fat"):
            return True

    # Adding media
    ##########################################################################
    # opath must be in unicode

    def addFile(self, opath):
        return self.writeData(opath, open(opath, "rb").read())

    def writeData(self, opath, data):
        # if fname is a full path, use only the basename
        fname = os.path.basename(opath)
        # make sure we write it in NFC form (on mac will autoconvert to NFD),
        # and return an NFC-encoded reference
        fname = unicodedata.normalize("NFC", fname)
        # remove any dangerous characters
        base = self.stripIllegal(fname)
        (root, ext) = os.path.splitext(base)
        def repl(match):
            n = int(match.group(1))
            return " (%d)" % (n + 1)
        # find the first available name
        csum = checksum(data)
        while True:
            fname = root + ext
            path = os.path.join(self.dir(), fname)
            # if it doesn't exist, copy it directly
            if not os.path.exists(path):
                open(path, "wb").write(data)
                return fname
            # if it's identical, reuse
            if checksum(open(path, "rb").read()) == csum:
                return fname
            # otherwise, increment the index in the filename
            reg = r" \((\d+)\)$"
            if not re.search(reg, root):
                root = root + " (1)"
            else:
                root = re.sub(reg, repl, root)

    # String manipulation
    ##########################################################################

    def filesInStr(self, mid, string, includeRemote=False):
        from anki.latex import mungeQA
        l = []
        model = self.col.models.get(mid)
        strings = []
        if model['type'] == MODEL_CLOZE and "{{c" in string:
            # if the field has clozes in it, we'll need to expand the
            # possibilities so we can render latex
            strings = self._expandClozes(string)
        else:
            strings = [string]
        for string in strings:
            # handle latex
            string = mungeQA(string, None, None, model, None, self.col)
            # extract filenames
            for reg in self.regexps:
                for match in re.finditer(reg, string):
                    fname = match.group("fname")
                    isLocal = not re.match("(https?|ftp)://", fname.lower())
                    if isLocal or includeRemote:
                        l.append(fname)
        return l

    def _expandClozes(self, string):
        ords = set(re.findall(r"{{c(\d+)::.+?}}", string))
        strings = []
        from anki.template.template import clozeReg
        def qrepl(m):
            if m.group(3):
                return "[%s]" % m.group(3)
            else:
                return "[...]"
        def arepl(m):
            return m.group(1)
        for ord in ords:
            s = re.sub(clozeReg % ord, qrepl, string)
            s = re.sub(clozeReg % ".+?", "\\1", s)
            strings.append(s)
        strings.append(re.sub(clozeReg % ".+?", arepl, string))
        return strings

    def transformNames(self, txt, func):
        for reg in self.regexps:
            txt = re.sub(reg, func, txt)
        return txt

    def strip(self, txt):
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

    def escapeImages(self, string, unescape=False):
        if unescape:
            fn = urllib.parse.unquote
        else:
            fn = urllib.parse.quote
        def repl(match):
            tag = match.group(0)
            fname = match.group("fname")
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(fname, fn(fname))
        for reg in self.imgRegexps:
            string = re.sub(reg, repl, string)
        return string

    # Rebuilding DB
    ##########################################################################

    def check(self, local=None):
        "Return (missingFiles, unusedFiles, invalidFiles)."
        mdir = self.dir()
        # gather all media references in NFC form
        allRefs = set()
        for nid, mid, flds in self.col.db.execute(
                "select id, mid, flds from notes"):
            noteRefs = self.filesInStr(mid, flds)
            # check the refs are in NFC
            for f in noteRefs:
                # if they're not, we'll need to fix them first
                if f != unicodedata.normalize("NFC", f):
                    self._normalizeNoteRefs(nid)
                    noteRefs = self.filesInStr(mid, flds)
                    break
            allRefs.update(noteRefs)
        # loop through media folder
        unused = []
        invalid = []
        if local is None:
            files = os.listdir(mdir)
        else:
            files = local
        renamedFiles = False
        for file in files:
            if not local:
                if not os.path.isfile(file):
                    # ignore directories
                    continue
            if file.startswith("_"):
                # leading _ says to ignore file
                continue
            if not isinstance(file, str):
                invalid.append(
                    str(file, sys.getfilesystemencoding(), "replace"))
                continue
            nfcFile = unicodedata.normalize("NFC", file)
            # we enforce NFC fs encoding on non-macs; on macs we'll have gotten
            # NFD so we use the above variable for comparing references
            if not isMac and not local:
                if file != nfcFile:
                    # delete if we already have the NFC form, otherwise rename
                    if os.path.exists(nfcFile):
                        os.unlink(file)
                        renamedFiles = True
                    else:
                        os.rename(file, nfcFile)
                        renamedFiles = True
                    file = nfcFile
            # compare
            if nfcFile not in allRefs:
                unused.append(file)
            else:
                allRefs.discard(nfcFile)
        # if we renamed any files to nfc format, we must rerun the check
        # to make sure the renamed files are not marked as unused
        if renamedFiles:
            return self.check(local=local)
        nohave = [x for x in allRefs if not x.startswith("_")]
        return (nohave, unused, invalid)

    def _normalizeNoteRefs(self, nid):
        note = self.col.getNote(nid)
        for c, fld in enumerate(note.fields):
            nfc = unicodedata.normalize("NFC", fld)
            if nfc != fld:
                note.fields[c] = nfc
        note.flush()

    # Copying on import
    ##########################################################################

    def have(self, fname):
        return os.path.exists(os.path.join(self.dir(), fname))

    # Illegal characters
    ##########################################################################

    _illegalCharReg = re.compile(r'[][><:"/?*^\\|\0\r\n]')

    def stripIllegal(self, s):
        return re.sub(self._illegalCharReg, "", s)

    def hasIllegal(self, s):
        # a file that couldn't be decoded to unicode is considered invalid
        if not isinstance(s, str):
            return True
        return not not re.search(self._illegalCharReg, s)

    # Tracking changes
    ##########################################################################

    def findChanges(self):
        "Scan the media folder if it's changed, and note any changes."
        if self._changed():
            self._logChanges()

    def haveDirty(self):
        return self.db.scalar("select 1 from media where dirty=1 limit 1")

    def _mtime(self, path):
        return int(os.stat(path).st_mtime)

    def _checksum(self, path):
        return checksum(open(path, "rb").read())

    def _changed(self):
        "Return dir mtime if it has changed since the last findChanges()"
        # doesn't track edits, but user can add or remove a file to update
        mod = self.db.scalar("select dirMod from meta")
        mtime = self._mtime(self.dir())
        if not self._isFAT32() and mod and mod == mtime:
            return False
        return mtime

    def _logChanges(self):
        (added, removed) = self._changes()
        media = []
        for f in added:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt, 1))
        for f in removed:
            media.append((f, None, 0, 1))
        # update media db
        self.db.executemany("insert or replace into media values (?,?,?,?)",
                            media)
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        self.db.commit()

    def _changes(self):
        self.cache = {}
        for (name, csum, mod) in self.db.execute(
                "select fname, csum, mtime from media where csum is not null"):
            self.cache[name] = [csum, mod, False]
        added = []
        removed = []
        # loop through on-disk files
        for f in os.listdir(self.dir()):
            # ignore folders and thumbs.db
            if os.path.isdir(f):
                continue
            if f.lower() == "thumbs.db":
                continue
            # and files with invalid chars
            if self.hasIllegal(f):
                continue
            # empty files are invalid; clean them up and continue
            sz = os.path.getsize(f)
            if not sz:
                os.unlink(f)
                continue
            if sz > 100 * 1024 * 1024:
                self.col.log("ignoring file over 100MB", f)
                continue
            # check encoding
            if not isMac:
                normf = unicodedata.normalize("NFC", f)
                if f != normf:
                    # wrong filename encoding which will cause sync errors
                    if os.path.exists(normf):
                        os.unlink(f)
                    else:
                        os.rename(f, normf)
            # newly added?
            if f not in self.cache:
                added.append(f)
            else:
                # modified since last time?
                if self._mtime(f) != self.cache[f][1]:
                    # and has different checksum?
                    if self._checksum(f) != self.cache[f][0]:
                        added.append(f)
                # mark as used
                self.cache[f][2] = True
        # look for any entries in the cache that no longer exist on disk
        for (k, v) in list(self.cache.items()):
            if not v[2]:
                removed.append(k)
        return added, removed

    # Syncing-related
    ##########################################################################

    def lastUsn(self):
        return self.db.scalar("select lastUsn from meta")

    def setLastUsn(self, usn):
        self.db.execute("update meta set lastUsn = ?", usn)
        self.db.commit()

    def syncInfo(self, fname):
        ret = self.db.first(
            "select csum, dirty from media where fname=?", fname)
        return ret or (None, 0)

    def markClean(self, fnames):
        for fname in fnames:
            self.db.execute(
                "update media set dirty=0 where fname=?", fname)

    def syncDelete(self, fname):
        if os.path.exists(fname):
            os.unlink(fname)
        self.db.execute("delete from media where fname=?", fname)

    def mediaCount(self):
        return self.db.scalar(
            "select count() from media where csum is not null")

    def dirtyCount(self):
        return self.db.scalar(
            "select count() from media where dirty=1")

    def forceResync(self):
        self.db.execute("delete from media")
        self.db.execute("update meta set lastUsn=0,dirMod=0")
        self.db.execute("vacuum")
        self.db.execute("analyze")
        self.db.commit()

    # Media syncing: zips
    ##########################################################################

    def mediaChangesZip(self):
        f = io.BytesIO()
        z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED)
        fnames = []
        # meta is list of (fname, zipname), where zipname of None
        # is a deleted file
        meta = []
        sz = 0
        for c, (fname, csum) in enumerate(self.db.execute(
                "select fname, csum from media where dirty=1"
                " limit %d" % SYNC_ZIP_COUNT)):
            fnames.append(fname)
            normname = unicodedata.normalize("NFC", fname)
            if csum:
                self.col.log("+media zip", fname)
                z.write(fname, str(c))
                meta.append((normname, str(c)))
                sz += os.path.getsize(fname)
            else:
                self.col.log("-media zip", fname)
                meta.append((normname, ""))
            if sz >= SYNC_ZIP_SIZE:
                break
        z.writestr("_meta", json.dumps(meta))
        z.close()
        return f.getvalue(), fnames

    def addFilesFromZip(self, zipData):
        "Extract zip data; return the number of files extracted."
        f = io.BytesIO(zipData)
        z = zipfile.ZipFile(f, "r")
        media = []
        # get meta info first
        meta = json.loads(z.read("_meta"))
        # then loop through all files
        cnt = 0
        for i in z.infolist():
            if i.filename == "_meta":
                # ignore previously-retrieved meta
                continue
            else:
                data = z.read(i)
                csum = checksum(data)
                name = meta[i.filename]
                if not isinstance(name, str):
                    name = str(name, "utf8")
                # normalize name for platform
                if isMac:
                    name = unicodedata.normalize("NFD", name)
                else:
                    name = unicodedata.normalize("NFC", name)
                # save file
                open(name, "wb").write(data)
                # update db
                media.append((name, csum, self._mtime(name), 0))
                cnt += 1
        if media:
            self.db.executemany(
                "insert or replace into media values (?,?,?,?)", media)
        return cnt
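
# Illustrative sketch, not from the original source: how the named-group
# regexps above pull filenames out of field text. The sample string and the
# helper name _demo_extract_fnames are invented for this example.
def _demo_extract_fnames():
    sample = 'x [sound:foo.mp3] <img src="bar.jpg"> and <img src=baz.png>'
    found = []
    for reg in MediaManager.regexps:
        for m in re.finditer(reg, sample):
            found.append(m.group("fname"))
    return found  # -> ['foo.mp3', 'bar.jpg', 'baz.png']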
# imports reconstructed for context (this excerpt omits the original file
# header); the exact names and module paths are an assumption based on the
# surrounding anki code
import copy
import datetime
import json
import os
import pprint
import random
import re
import stat
import time
import traceback
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import anki
import anki.find
from anki import hooks
from anki.cards import Card
from anki.consts import *
from anki.db import DB
from anki.decks import DeckManager
from anki.errors import AnkiError
from anki.lang import _, ngettext
from anki.media import MediaManager
from anki.models import ModelManager, NoteType, Template
from anki.notes import Note
from anki.rsbackend import RustBackend
from anki.sched import Scheduler as V1Scheduler
from anki.schedv2 import Scheduler as V2Scheduler
from anki.tags import TagManager
from anki.utils import (devMode, fieldChecksum, ids2str, intTime, joinFields,
                        maxID, splitFields, stripHTMLMedia)


class _Collection:
    db: Optional[DB]
    sched: Union[V1Scheduler, V2Scheduler]
    crt: int
    mod: int
    scm: int
    dty: bool  # no longer used
    _usn: int
    ls: int
    conf: Dict[str, Any]
    _undo: List[Any]
    backend: RustBackend

    def __init__(
        self,
        db: DB,
        backend: RustBackend,
        server: Optional["anki.storage.ServerData"] = None,
        log: bool = False,
    ) -> None:
        self.backend = backend
        self._debugLog = log
        self.db = db
        self.path = db._path
        self._openLog()
        self.log(self.path, anki.version)
        self.server = server
        self._lastSave = time.time()
        self.clearUndo()
        self.media = MediaManager(self, server is not None)
        self.models = ModelManager(self)
        self.decks = DeckManager(self)
        self.tags = TagManager(self)
        self.load()
        if not self.crt:
            d = datetime.datetime.today()
            d -= datetime.timedelta(hours=4)
            d = datetime.datetime(d.year, d.month, d.day)
            d += datetime.timedelta(hours=4)
            self.crt = int(time.mktime(d.timetuple()))
        self._loadScheduler()
        if not self.conf.get("newBury", False):
            self.conf["newBury"] = True
            self.setMod()

    def name(self) -> Any:
        n = os.path.splitext(os.path.basename(self.path))[0]
        return n

    # Scheduler
    ##########################################################################

    supportedSchedulerVersions = (1, 2)

    def schedVer(self) -> Any:
        ver = self.conf.get("schedVer", 1)
        if ver in self.supportedSchedulerVersions:
            return ver
        else:
            raise Exception("Unsupported scheduler version")

    def _loadScheduler(self) -> None:
        ver = self.schedVer()
        if ver == 1:
            self.sched = V1Scheduler(self)
        elif ver == 2:
            self.sched = V2Scheduler(self)
            if not self.server:
                self.conf["localOffset"] = self.sched._current_timezone_offset()
            elif self.server.minutes_west is not None:
                self.conf["localOffset"] = self.server.minutes_west

    def changeSchedulerVer(self, ver: int) -> None:
        if ver == self.schedVer():
            return
        if ver not in self.supportedSchedulerVersions:
            raise Exception("Unsupported scheduler version")
        self.modSchema(check=True)
        self.clearUndo()
        v2Sched = V2Scheduler(self)
        if ver == 1:
            v2Sched.moveToV1()
        else:
            v2Sched.moveToV2()
        self.conf["schedVer"] = ver
        self.setMod()
        self._loadScheduler()

    def localOffset(self) -> Optional[int]:
        "Minutes west of UTC. Only applies to V2 scheduler."
        if isinstance(self.sched, V1Scheduler):
            return None
        else:
            return self.sched._current_timezone_offset()

    # DB-related
    ##########################################################################

    def load(self) -> None:
        (
            self.crt,
            self.mod,
            self.scm,
            self.dty,  # no longer used
            self._usn,
            self.ls,
            conf,
            models,
            decks,
            dconf,
            tags,
        ) = self.db.first(
            """
select crt, mod, scm, dty, usn, ls,
conf, models, decks, dconf, tags from col"""
        )
        self.conf = json.loads(conf)  # type: ignore
        self.models.load(models)
        self.decks.load(decks, dconf)
        self.tags.load(tags)

    def setMod(self) -> None:
        """Mark DB modified.

        DB operations and the deck/tag/model managers do this automatically,
        so this is only necessary if you modify properties of this object or
        the conf dict."""
        self.db.mod = True

    def flush(self, mod: Optional[int] = None) -> None:
        "Flush state to DB, updating mod time."
        self.mod = intTime(1000) if mod is None else mod
        self.db.execute(
            """update col set
crt=?, mod=?, scm=?, dty=?, usn=?, ls=?, conf=?""",
            self.crt,
            self.mod,
            self.scm,
            self.dty,
            self._usn,
            self.ls,
            json.dumps(self.conf),
        )

    def save(self, name: Optional[str] = None, mod: Optional[int] = None) -> None:
        "Flush, commit DB, and take out another write lock."
        # let the managers conditionally flush
        self.models.flush()
        self.decks.flush()
        self.tags.flush()
        # and flush deck + bump mod if db has been changed
        if self.db.mod:
            self.flush(mod=mod)
            self.db.commit()
            self.lock()
            self.db.mod = False
        self._markOp(name)
        self._lastSave = time.time()

    def autosave(self) -> Optional[bool]:
        "Save if 5 minutes has passed since last save. True if saved."
        if time.time() - self._lastSave > 300:
            self.save()
            return True
        return None

    def lock(self) -> None:
        # make sure we don't accidentally bump mod time
        mod = self.db.mod
        self.db.execute("update col set mod=mod")
        self.db.mod = mod

    def close(self, save: bool = True) -> None:
        "Disconnect from DB."
        if self.db:
            if save:
                self.save()
            else:
                self.db.rollback()
            if not self.server:
                self.db.setAutocommit(True)
                self.db.execute("pragma journal_mode = delete")
                self.db.setAutocommit(False)
            self.db.close()
            self.db = None
            self.media.close()
            self._closeLog()

    def reopen(self) -> None:
        "Reconnect to DB (after changing threads, etc)."
        if not self.db:
            self.db = DB(self.path)
            self.media.connect()
            self._openLog()

    def rollback(self) -> None:
        self.db.rollback()
        self.load()
        self.lock()

    def modSchema(self, check: bool) -> None:
        "Mark schema modified. Call this first so user can abort if necessary."
        if not self.schemaChanged():
            if check and not hooks.schema_will_change(proceed=True):
                raise AnkiError("abortSchemaMod")
        self.scm = intTime(1000)
        self.setMod()

    def schemaChanged(self) -> Any:
        "True if schema changed since last sync."
        return self.scm > self.ls

    def usn(self) -> Any:
        return self._usn if self.server else -1

    def beforeUpload(self) -> None:
        "Called before a full upload."
        tbls = "notes", "cards", "revlog"
        for t in tbls:
            self.db.execute("update %s set usn=0 where usn=-1" % t)
        # we can save space by removing the log of deletions
        self.db.execute("delete from graves")
        self._usn += 1
        self.models.beforeUpload()
        self.tags.beforeUpload()
        self.decks.beforeUpload()
        self.modSchema(check=False)
        self.ls = self.scm
        # ensure db is compacted before upload
        self.db.setAutocommit(True)
        self.db.execute("vacuum")
        self.db.execute("analyze")
        self.close()

    # Object creation helpers
    ##########################################################################

    def getCard(self, id: int) -> Card:
        return Card(self, id)

    def getNote(self, id: int) -> Note:
        return Note(self, id=id)

    # Utils
    ##########################################################################

    def nextID(self, type: str, inc: bool = True) -> Any:
        type = "next" + type.capitalize()
        id = self.conf.get(type, 1)
        if inc:
            self.conf[type] = id + 1
        return id

    def reset(self) -> None:
        "Rebuild the queue and reload data after DB modified."
        self.sched.reset()

    # Deletion logging
    ##########################################################################

    def _logRem(self, ids: List[int], type: int) -> None:
        self.db.executemany(
            "insert into graves values (%d, ?, %d)" % (self.usn(), type),
            ([x] for x in ids),
        )

    # Notes
    ##########################################################################

    def noteCount(self) -> Any:
        return self.db.scalar("select count() from notes")

    def newNote(self, forDeck: bool = True) -> Note:
        "Return a new note with the current model."
        return Note(self, self.models.current(forDeck))

    def addNote(self, note: Note) -> int:
        """Add a note to the collection. Return number of new cards."""
        # check we have card models available, then save
        cms = self.findTemplates(note)
        if not cms:
            return 0
        note.flush()
        # deck conf governs which of these are used
        due = self.nextID("pos")
        # add cards
        ncards = 0
        for template in cms:
            self._newCard(note, template, due)
            ncards += 1
        return ncards

    def remNotes(self, ids: Iterable[int]) -> None:
        """Deletes notes with the given IDs."""
        self.remCards(
            self.db.list("select id from cards where nid in " + ids2str(ids))
        )

    def _remNotes(self, ids: List[int]) -> None:
        """Bulk delete notes by ID. Don't call this directly."""
        if not ids:
            return
        strids = ids2str(ids)
        # we need to log these independently of cards, as one side may have
        # more card templates
        hooks.notes_will_be_deleted(self, ids)
        self._logRem(ids, REM_NOTE)
        self.db.execute("delete from notes where id in %s" % strids)

    # Card creation
    ##########################################################################

    def findTemplates(self, note: Note) -> List:
        "Return (active), non-empty templates."
        model = note.model()
        avail = self.models.availOrds(model, joinFields(note.fields))
        return self._tmplsFromOrds(model, avail)

    def _tmplsFromOrds(self, model: NoteType, avail: List[int]) -> List:
        ok = []
        if model["type"] == MODEL_STD:
            for t in model["tmpls"]:
                if t["ord"] in avail:
                    ok.append(t)
        else:
            # cloze - generate temporary templates from first
            for ord in avail:
                t = copy.copy(model["tmpls"][0])
                t["ord"] = ord
                ok.append(t)
        return ok

    def genCards(self, nids: List[int]) -> List[int]:
        "Generate cards for non-empty templates, return ids to remove."
        # build map of (nid,ord) so we don't create dupes
        snids = ids2str(nids)
        have: Dict[int, Dict[int, int]] = {}
        dids: Dict[int, Optional[int]] = {}
        dues: Dict[int, int] = {}
        for id, nid, ord, did, due, odue, odid, type in self.db.execute(
            "select id, nid, ord, did, due, odue, odid, type from cards "
            "where nid in " + snids
        ):
            # existing cards
            if nid not in have:
                have[nid] = {}
            have[nid][ord] = id
            # if in a filtered deck, add new cards to original deck
            if odid != 0:
                did = odid
            # and their dids
            if nid in dids:
                if dids[nid] and dids[nid] != did:
                    # cards are in two or more different decks; revert to
                    # model default
                    dids[nid] = None
            else:
                # first card or multiple cards in same deck
                dids[nid] = did
            # save due
            if odid != 0:
                due = odue
            if nid not in dues and type == 0:
                # Add due to new card only if it's the due of a new sibling
                dues[nid] = due
        # build cards for each note
        data = []
        ts = maxID(self.db)
        now = intTime()
        rem = []
        usn = self.usn()
        for nid, mid, flds in self.db.execute(
            "select id, mid, flds from notes where id in " + snids
        ):
            model = self.models.get(mid)
            assert model
            avail = self.models.availOrds(model, flds)
            did = dids.get(nid) or model["did"]
            due = dues.get(nid)
            # add any missing cards
            for t in self._tmplsFromOrds(model, avail):
                doHave = nid in have and t["ord"] in have[nid]
                if not doHave:
                    # check deck is not a cram deck
                    did = t["did"] or did
                    if self.decks.isDyn(did):
                        did = 1
                    # if the deck doesn't exist, use default instead
                    did = self.decks.get(did)["id"]
                    # use sibling due# if there is one, else use a new id
                    if due is None:
                        due = self.nextID("pos")
                    data.append((ts, nid, did, t["ord"], now, usn, due))
                    ts += 1
            # note any cards that need removing
            if nid in have:
                for ord, id in list(have[nid].items()):
                    if ord not in avail:
                        rem.append(id)
        # bulk update
        self.db.executemany(
            """
insert into cards values (?,?,?,?,?,?,0,0,?,0,0,0,0,0,0,0,0,"")""",
            data,
        )
        return rem

    # type is no longer used
    def previewCards(
        self, note: Note, type: int = 0, did: Optional[int] = None
    ) -> List:
        existing_cards = {}
        for card in note.cards():
            existing_cards[card.ord] = card
        all_cards = []
        for idx, template in enumerate(note.model()["tmpls"]):
            if idx in existing_cards:
                all_cards.append(existing_cards[idx])
            else:
                # card not currently in database, generate an ephemeral one
                all_cards.append(
                    self._newCard(note, template, 1, flush=False, did=did)
                )
        return all_cards

    def _newCard(
        self,
        note: Note,
        template: Template,
        due: int,
        flush: bool = True,
        did: Optional[int] = None,
    ) -> Card:
        "Create a new card."
        card = Card(self)
        card.nid = note.id
        card.ord = template["ord"]  # type: ignore
        card.did = self.db.scalar(
            "select did from cards where nid = ? and ord = ?", card.nid, card.ord
        )
        # Use template did (deck override) if valid, otherwise did in
        # argument, otherwise model did
        if not card.did:
            if template["did"] and str(template["did"]) in self.decks.decks:
                card.did = int(template["did"])
            elif did:
                card.did = did
            else:
                card.did = note.model()["did"]
        # if invalid did, use default instead
        deck = self.decks.get(card.did)
        assert deck
        if deck["dyn"]:
            # must not be a filtered deck
            card.did = 1
        else:
            card.did = deck["id"]
        card.due = self._dueForDid(card.did, due)
        if flush:
            card.flush()
        return card

    def _dueForDid(self, did: int, due: int) -> int:
        conf = self.decks.confForDid(did)
        # in order due?
        if conf["new"]["order"] == NEW_CARDS_DUE:
            return due
        else:
            # random mode; seed with note ts so all cards of this note get
            # the same random number
            r = random.Random()
            r.seed(due)
            return r.randrange(1, max(due, 1000))

    # Cards
    ##########################################################################

    def isEmpty(self) -> bool:
        return not self.db.scalar("select 1 from cards limit 1")

    def cardCount(self) -> Any:
        return self.db.scalar("select count() from cards")

    def remCards(self, ids: List[int], notes: bool = True) -> None:
        "Bulk delete cards by ID."
        if not ids:
            return
        sids = ids2str(ids)
        nids = self.db.list("select nid from cards where id in " + sids)
        # remove cards
        self._logRem(ids, REM_CARD)
        self.db.execute("delete from cards where id in " + sids)
        # then notes
        if not notes:
            return
        nids = self.db.list(
            """
select id from notes where id in %s and id not in (select nid from cards)"""
            % ids2str(nids)
        )
        self._remNotes(nids)

    def emptyCids(self) -> List[int]:
        """Returns IDs of empty cards."""
        rem: List[int] = []
        for m in self.models.all():
            rem += self.genCards(self.models.nids(m))
        return rem

    def emptyCardReport(self, cids) -> str:
        rep = ""
        for ords, cnt, flds in self.db.all(
            """
select group_concat(ord+1), count(), flds from cards c, notes n
where c.nid = n.id and c.id in %s group by nid"""
            % ids2str(cids)
        ):
            rep += _("Empty card numbers: %(c)s\nFields: %(f)s\n\n") % dict(
                c=ords, f=flds.replace("\x1f", " / ")
            )
        return rep

    # Field checksums and sorting fields
    ##########################################################################

    def _fieldData(self, snids: str) -> Any:
        return self.db.execute(
            "select id, mid, flds from notes where id in " + snids
        )

    def updateFieldCache(self, nids: List[int]) -> None:
        "Update field checksums and sort cache, after find&replace, etc."
        snids = ids2str(nids)
        r = []
        for (nid, mid, flds) in self._fieldData(snids):
            fields = splitFields(flds)
            model = self.models.get(mid)
            if not model:
                # note points to invalid model
                continue
            r.append(
                (
                    stripHTMLMedia(fields[self.models.sortIdx(model)]),
                    fieldChecksum(fields[0]),
                    nid,
                )
            )
        # apply, relying on calling code to bump usn+mod
        self.db.executemany("update notes set sfld=?, csum=? where id=?", r)

    # Finding cards
    ##########################################################################

    def findCards(self, query: str, order: Union[bool, str] = False) -> Any:
        return anki.find.Finder(self).findCards(query, order)

    def findNotes(self, query: str) -> Any:
        return anki.find.Finder(self).findNotes(query)

    def findReplace(
        self,
        nids: List[int],
        src: str,
        dst: str,
        regex: Optional[bool] = None,
        field: Optional[str] = None,
        fold: bool = True,
    ) -> int:
        return anki.find.findReplace(self, nids, src, dst, regex, field, fold)

    def findDupes(self, fieldName: str, search: str = "") -> List[Tuple[Any, list]]:
        return anki.find.findDupes(self, fieldName, search)

    # Stats
    ##########################################################################

    def cardStats(self, card: Card) -> str:
        from anki.stats import CardStats

        return CardStats(self, card).report()

    def stats(self) -> "anki.stats.CollectionStats":
        from anki.stats import CollectionStats

        return CollectionStats(self)

    # Timeboxing
    ##########################################################################

    def startTimebox(self) -> None:
        self._startTime = time.time()
        self._startReps = self.sched.reps

    def timeboxReached(self) -> Union[bool, Tuple[Any, int]]:
        "Return (elapsedTime, reps) if timebox reached, or False."
        if not self.conf["timeLim"]:
            # timeboxing disabled
            return False
        elapsed = time.time() - self._startTime
        if elapsed > self.conf["timeLim"]:
            return (self.conf["timeLim"], self.sched.reps - self._startReps)
        return False

    # Undo
    ##########################################################################

    def clearUndo(self) -> None:
        # [type, undoName, data]
        # type 1 = review; type 2 = checkpoint
        self._undo = None

    def undoName(self) -> Any:
        "Undo menu item name, or None if undo unavailable."
        if not self._undo:
            return None
        return self._undo[1]

    def undo(self) -> Any:
        if self._undo[0] == 1:
            return self._undoReview()
        else:
            self._undoOp()

    def markReview(self, card: Card) -> None:
        old: List[Any] = []
        if self._undo:
            if self._undo[0] == 1:
                old = self._undo[2]
            self.clearUndo()
        wasLeech = card.note().hasTag("leech") or False
        self._undo = [1, _("Review"), old + [copy.copy(card)], wasLeech]

    def _undoReview(self) -> Any:
        data = self._undo[2]
        wasLeech = self._undo[3]
        c = data.pop()  # pytype: disable=attribute-error
        if not data:
            self.clearUndo()
        # remove leech tag if it didn't have it before
        if not wasLeech and c.note().hasTag("leech"):
            c.note().delTag("leech")
            c.note().flush()
        # write old data
        c.flush()
        # and delete revlog entry
        last = self.db.scalar(
            "select id from revlog where cid = ? "
            "order by id desc limit 1",
            c.id,
        )
        self.db.execute("delete from revlog where id = ?", last)
        # restore any siblings
        self.db.execute(
            "update cards set queue=type,mod=?,usn=? where queue=-2 and nid=?",
            intTime(),
            self.usn(),
            c.nid,
        )
        # and finally, update daily counts
        n = 1 if c.queue == 3 else c.queue
        type = ("new", "lrn", "rev")[n]
        self.sched._updateStats(c, type, -1)
        self.sched.reps -= 1
        return c.id

    def _markOp(self, name: Optional[str]) -> None:
        "Call via .save()"
        if name:
            self._undo = [2, name]
        else:
            # saving disables old checkpoint, but not review undo
            if self._undo and self._undo[0] == 2:
                self.clearUndo()

    def _undoOp(self) -> None:
        self.rollback()
        self.clearUndo()

    # DB maintenance
    ##########################################################################

    def basicCheck(self) -> bool:
        "Basic integrity check for syncing. True if ok."
        # cards without notes
        if self.db.scalar(
            """
select 1 from cards where nid not in (select id from notes) limit 1"""
        ):
            return False
        # notes without cards or models
        if self.db.scalar(
            """
select 1 from notes where id not in (select distinct nid from cards)
or mid not in %s limit 1"""
            % ids2str(self.models.ids())
        ):
            return False
        # invalid ords
        for m in self.models.all():
            # ignore clozes
            if m["type"] != MODEL_STD:
                continue
            if self.db.scalar(
                """
select 1 from cards where ord not in %s and nid in (
select id from notes where mid = ?) limit 1"""
                % ids2str([t["ord"] for t in m["tmpls"]]),
                m["id"],
            ):
                return False
        return True

    def fixIntegrity(self) -> Tuple[str, bool]:
        """Fix possible problems and rebuild caches.

        Returns tuple of (error: str, ok: bool). 'ok' will be true if no
        problems were found.
        """
        problems = []
        curs = self.db.cursor()
        self.save()
        oldSize = os.stat(self.path)[stat.ST_SIZE]
        if self.db.scalar("pragma integrity_check") != "ok":
            return (_("Collection is corrupt. Please see the manual."), False)
        # note types with a missing model
        ids = self.db.list(
            """
select id from notes where mid not in """
            + ids2str(self.models.ids())
        )
        if ids:
            problems.append(
                ngettext(
                    "Deleted %d note with missing note type.",
                    "Deleted %d notes with missing note type.",
                    len(ids),
                )
                % len(ids)
            )
            self.remNotes(ids)
        # for each model
        for m in self.models.all():
            for t in m["tmpls"]:
                if t["did"] == "None":
                    t["did"] = None
                    problems.append(_("Fixed AnkiDroid deck override bug."))
                    self.models.save(m, updateReqs=False)
            if m["type"] == MODEL_STD:
                # model with missing req specification
                if "req" not in m:
                    self.models._updateRequired(m)
                    problems.append(_("Fixed note type: %s") % m["name"])
                # cards with invalid ordinal
                ids = self.db.list(
                    """
select id from cards where ord not in %s and nid in (
select id from notes where mid = ?)"""
                    % ids2str([t["ord"] for t in m["tmpls"]]),
                    m["id"],
                )
                if ids:
                    problems.append(
                        ngettext(
                            "Deleted %d card with missing template.",
                            "Deleted %d cards with missing template.",
                            len(ids),
                        )
                        % len(ids)
                    )
                    self.remCards(ids)
            # notes with invalid field count
            ids = []
            for id, flds in self.db.execute(
                "select id, flds from notes where mid = ?", m["id"]
            ):
                if (flds.count("\x1f") + 1) != len(m["flds"]):
                    ids.append(id)
            if ids:
                problems.append(
                    ngettext(
                        "Deleted %d note with wrong field count.",
                        "Deleted %d notes with wrong field count.",
                        len(ids),
                    )
                    % len(ids)
                )
                self.remNotes(ids)
        # delete any notes with missing cards
        ids = self.db.list(
            """
select id from notes where id not in (select distinct nid from cards)"""
        )
        if ids:
            cnt = len(ids)
            problems.append(
                ngettext(
                    "Deleted %d note with no cards.",
                    "Deleted %d notes with no cards.",
                    cnt,
                )
                % cnt
            )
            self._remNotes(ids)
        # cards with missing notes
        ids = self.db.list(
            """
select id from cards where nid not in (select id from notes)"""
        )
        if ids:
            cnt = len(ids)
            problems.append(
                ngettext(
                    "Deleted %d card with missing note.",
                    "Deleted %d cards with missing note.",
                    cnt,
                )
                % cnt
            )
            self.remCards(ids)
        # cards with odue set when it shouldn't be
        ids = self.db.list(
            """
select id from cards where odue > 0 and (type=1 or queue=2) and not odid"""
        )
        if ids:
            cnt = len(ids)
            problems.append(
                ngettext(
                    "Fixed %d card with invalid properties.",
                    "Fixed %d cards with invalid properties.",
                    cnt,
                )
                % cnt
            )
            self.db.execute("update cards set odue=0 where id in " + ids2str(ids))
        # cards with odid set when not in a dyn deck
        dids = [id for id in self.decks.allIds() if not self.decks.isDyn(id)]
        ids = self.db.list(
            """
select id from cards where odid > 0 and did in %s"""
            % ids2str(dids)
        )
        if ids:
            cnt = len(ids)
            problems.append(
                ngettext(
                    "Fixed %d card with invalid properties.",
                    "Fixed %d cards with invalid properties.",
                    cnt,
                )
                % cnt
            )
            self.db.execute(
                "update cards set odid=0, odue=0 where id in " + ids2str(ids)
            )
        # tags
        self.tags.registerNotes()
        # field cache
        for m in self.models.all():
            self.updateFieldCache(self.models.nids(m))
        # new cards can't have a due position > 32 bits, so wrap items over
        # 2 million back to 1 million
        curs.execute(
            """
update cards set due=1000000+due%1000000,mod=?,usn=?
where due>=1000000 and type=0""",
            [intTime(), self.usn()],
        )
        if curs.rowcount:
            problems.append(
                "Found %d new cards with a due number >= 1,000,000 - "
                "consider repositioning them in the Browse screen."
                % curs.rowcount
            )
        # new card position
        self.conf["nextPos"] = (
            self.db.scalar("select max(due)+1 from cards where type = 0") or 0
        )
        # reviews should have a reasonable due #
        ids = self.db.list("select id from cards where queue = 2 and due > 100000")
        if ids:
            problems.append("Reviews had incorrect due date.")
            self.db.execute(
                "update cards set due = ?, ivl = 1, mod = ?, usn = ? "
                "where id in %s" % ids2str(ids),
                self.sched.today,
                intTime(),
                self.usn(),
            )
        # v2 sched had a bug that could create decimal intervals
        curs.execute(
            "update cards set ivl=round(ivl),due=round(due) "
            "where ivl!=round(ivl) or due!=round(due)"
        )
        if curs.rowcount:
            problems.append(
                "Fixed %d cards with v2 scheduler bug." % curs.rowcount
            )
        curs.execute(
            "update revlog set ivl=round(ivl),lastIvl=round(lastIvl) "
            "where ivl!=round(ivl) or lastIvl!=round(lastIvl)"
        )
        if curs.rowcount:
            problems.append(
                "Fixed %d review history entries with v2 scheduler bug."
                % curs.rowcount
            )
        # models
        if self.models.ensureNotEmpty():
            problems.append("Added missing note type.")
        # and finally, optimize
        self.optimize()
        newSize = os.stat(self.path)[stat.ST_SIZE]
        txt = _("Database rebuilt and optimized.")
        ok = not problems
        problems.append(txt)
        # if any problems were found, force a full sync
        if not ok:
            self.modSchema(check=False)
        self.save()
        return ("\n".join(problems), ok)

    def optimize(self) -> None:
        self.db.setAutocommit(True)
        self.db.execute("vacuum")
        self.db.execute("analyze")
        self.db.setAutocommit(False)
        self.lock()

    # Logging
    ##########################################################################

    def log(self, *args, **kwargs) -> None:
        if not self._debugLog:
            return

        def customRepr(x):
            if isinstance(x, str):
                return x
            return pprint.pformat(x)

        path, num, fn, y = traceback.extract_stack(
            limit=2 + kwargs.get("stack", 0)
        )[0]
        buf = "[%s] %s:%s(): %s" % (
            intTime(),
            os.path.basename(path),
            fn,
            ", ".join([customRepr(x) for x in args]),
        )
        self._logHnd.write(buf + "\n")
        if devMode:
            print(buf)

    def _openLog(self) -> None:
        if not self._debugLog:
            return
        lpath = re.sub(r"\.anki2$", ".log", self.path)
        if os.path.exists(lpath) and os.path.getsize(lpath) > 10 * 1024 * 1024:
            lpath2 = lpath + ".old"
            if os.path.exists(lpath2):
                os.unlink(lpath2)
            os.rename(lpath, lpath2)
        self._logHnd = open(lpath, "a", encoding="utf8")

    def _closeLog(self) -> None:
        if not self._debugLog:
            return
        self._logHnd.close()
        self._logHnd = None

    # Card Flags
    ##########################################################################

    def setUserFlag(self, flag: int, cids: List[int]) -> None:
        assert 0 <= flag <= 7
        self.db.execute(
            "update cards set flags = (flags & ~?) | ?, usn=?, mod=? "
            "where id in %s" % ids2str(cids),
            0b111,
            flag,
            self.usn(),
            intTime(),
        )
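
# Illustrative sketch, not from the original source: the crt fallback in
# __init__ above anchors the collection creation time to 4am local time, so
# the scheduling day rolls over at 4am rather than midnight. The helper name
# is invented; the arithmetic mirrors the code above.
def _creation_stamp_rolled_back_to_4am() -> int:
    d = datetime.datetime.today()
    d -= datetime.timedelta(hours=4)               # 0:00-3:59 counts as the previous day
    d = datetime.datetime(d.year, d.month, d.day)  # truncate to that day's midnight
    d += datetime.timedelta(hours=4)               # then anchor the day start at 4am
    return int(time.mktime(d.timetuple()))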
class MediaManager(object): soundRegexps = ["(?i)(\[sound:(?P<fname>[^]]+)\])"] imgRegexps = [ # src element quoted case "(?i)(<img[^>]+src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)", # unquoted case "(?i)(<img[^>]+src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)", ] regexps = soundRegexps + imgRegexps def __init__(self, col, server): self.col = col if server: self._dir = None return # media directory self._dir = re.sub("(?i)\.(anki2)$", ".media", self.col.path) # convert dir to unicode if it's not already if isinstance(self._dir, str): self._dir = unicode(self._dir, sys.getfilesystemencoding()) if not os.path.exists(self._dir): os.makedirs(self._dir) try: self._oldcwd = os.getcwd() except OSError: # cwd doesn't exist self._oldcwd = None os.chdir(self._dir) # change database self.connect() def connect(self): if self.col.server: return path = self.dir() + ".db" create = not os.path.exists(path) os.chdir(self._dir) self.db = DB(path) if create: self._initDB() def close(self): if self.col.server: return self.db.close() self.db = None # change cwd back to old location if self._oldcwd: try: os.chdir(self._oldcwd) except: # may have been deleted pass def dir(self): return self._dir def _isFAT32(self): if not isWin: return import win32api, win32file try: name = win32file.GetVolumeNameForVolumeMountPoint(self._dir[:3]) except: # mapped & unmapped network drive; pray that it's not vfat return if win32api.GetVolumeInformation(name)[4].lower().startswith("fat"): return True # Adding media ########################################################################## def addFile(self, opath): return self.writeData(opath, open(opath, "rb").read()) def writeData(self, opath, data): # if fname is a full path, use only the basename fname = os.path.basename(opath) # remove any dangerous characters base = self.stripIllegal(fname) (root, ext) = os.path.splitext(base) def repl(match): n = int(match.group(1)) return " (%d)" % (n + 1) # find the first available name csum = checksum(data) while True: fname = root + ext path = os.path.join(self.dir(), fname) # if it doesn't exist, copy it directly if not os.path.exists(path): open(path, "wb").write(data) return fname # if it's identical, reuse if checksum(open(path, "rb").read()) == csum: return fname # otherwise, increment the index in the filename reg = " \((\d+)\)$" if not re.search(reg, root): root = root + " (1)" else: root = re.sub(reg, repl, root) # String manipulation ########################################################################## def filesInStr(self, mid, string, includeRemote=False): l = [] model = self.col.models.get(mid) strings = [] if model['type'] == MODEL_CLOZE and "{{c" in string: # if the field has clozes in it, we'll need to expand the # possibilities so we can render latex strings = self._expandClozes(string) else: strings = [string] for string in strings: # handle latex string = mungeQA(string, None, None, model, None, self.col) # extract filenames for reg in self.regexps: for match in re.finditer(reg, string): fname = match.group("fname") isLocal = not re.match("(https?|ftp)://", fname.lower()) if isLocal or includeRemote: l.append(fname) return l def _expandClozes(self, string): ords = set(re.findall("{{c(\d+)::.+?}}", string)) strings = [] from anki.template.template import clozeReg def qrepl(m): if m.group(3): return "[%s]" % m.group(3) else: return "[...]" def arepl(m): return m.group(1) for ord in ords: s = re.sub(clozeReg % ord, qrepl, string) s = re.sub(clozeReg % ".+?", "\\1", s) strings.append(s) strings.append(re.sub(clozeReg % 
".+?", arepl, string)) return strings def transformNames(self, txt, func): for reg in self.regexps: txt = re.sub(reg, func, txt) return txt def strip(self, txt): for reg in self.regexps: txt = re.sub(reg, "", txt) return txt def escapeImages(self, string): def repl(match): tag = match.group(0) fname = match.group("fname") if re.match("(https?|ftp)://", fname): return tag return tag.replace(fname, urllib.quote(fname.encode("utf-8"))) for reg in self.imgRegexps: string = re.sub(reg, repl, string) return string # Rebuilding DB ########################################################################## def check(self, local=None): "Return (missingFiles, unusedFiles)." mdir = self.dir() # generate card q/a and look through all references normrefs = {} def norm(s): if isinstance(s, unicode) and isMac: return unicodedata.normalize('NFD', s) return s for f in self.allMedia(): normrefs[norm(f)] = True # loop through directory and find unused & missing media unused = [] if local is None: files = os.listdir(mdir) else: files = local for file in files: if not local: path = os.path.join(mdir, file) if not os.path.isfile(path): # ignore directories continue if file.startswith("_"): # leading _ says to ignore file continue nfile = norm(file) if nfile not in normrefs: unused.append(file) else: del normrefs[nfile] nohave = [x for x in normrefs.keys() if not x.startswith("_")] return (nohave, unused) def allMedia(self): "Return a set of all referenced filenames." files = set() for mid, flds in self.col.db.execute("select mid, flds from notes"): for f in self.filesInStr(mid, flds): files.add(f) return files # Copying on import ########################################################################## def have(self, fname): return os.path.exists(os.path.join(self.dir(), fname)) # Media syncing - changes and removal ########################################################################## def hasChanged(self): return self.db.scalar("select 1 from log limit 1") def removed(self): return self.db.list("select * from log where type = ?", MEDIA_REM) def syncRemove(self, fnames): # remove provided deletions for f in fnames: if os.path.exists(f): send2trash.send2trash(f) self.db.execute("delete from log where fname = ?", f) self.db.execute("delete from media where fname = ?", f) # and all locally-logged deletions, as server has acked them self.db.execute("delete from log where type = ?", MEDIA_REM) self.db.commit() # Media syncing - unbundling zip files from server ########################################################################## def syncAdd(self, zipData): "Extract zip data; true if finished." 
f = StringIO(zipData) z = zipfile.ZipFile(f, "r") finished = False meta = None media = [] # get meta info first meta = json.loads(z.read("_meta")) nextUsn = int(z.read("_usn")) # then loop through all files for i in z.infolist(): if i.filename == "_meta" or i.filename == "_usn": # ignore previously-retrieved meta continue elif i.filename == "_finished": # last zip in set finished = True else: data = z.read(i) csum = checksum(data) name = meta[i.filename] # save file open(name, "wb").write(data) # update db media.append((name, csum, self._mtime(name))) # remove entries from local log self.db.execute("delete from log where fname = ?", name) # update media db and note new starting usn if media: self.db.executemany("insert or replace into media values (?,?,?)", media) self.setUsn(nextUsn) # commits # if we have finished adding, we need to record the new folder mtime # so that we don't trigger a needless scan if finished: self.syncMod() return finished # Illegal characters ########################################################################## _illegalCharReg = re.compile(r'[][><:"/?*^\\|\0]') def stripIllegal(self, str): return re.sub(self._illegalCharReg, "", str) def hasIllegal(self, str): return not not re.search(self._illegalCharReg, str) # Media syncing - bundling zip files to send to server ########################################################################## # Because there's no standard filename encoding for zips, and because not # all zip clients support retrieving mtime, we store the files as ascii # and place a json file in the zip with the necessary information. def zipAdded(self): "Add files to a zip until over SYNC_ZIP_SIZE/COUNT. Return zip data." f = StringIO() z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED) sz = 0 cnt = 0 files = {} cur = self.db.execute("select fname from log where type = ?", MEDIA_ADD) fnames = [] while 1: fname = cur.fetchone() if not fname: # add a flag so the server knows it can clean up z.writestr("_finished", "") break fname = fname[0] fnames.append([fname]) z.write(fname, str(cnt)) files[str(cnt)] = fname sz += os.path.getsize(fname) if sz > SYNC_ZIP_SIZE or cnt > SYNC_ZIP_COUNT: break cnt += 1 z.writestr("_meta", json.dumps(files)) z.close() return f.getvalue(), fnames def forgetAdded(self, fnames): if not fnames: return self.db.executemany("delete from log where fname = ?", fnames) self.db.commit() # Tracking changes (private) ########################################################################## def _initDB(self): self.db.executescript(""" create table media (fname text primary key, csum text, mod int); create table meta (dirMod int, usn int); insert into meta values (0, 0); create table log (fname text primary key, type int); """) def _mtime(self, path): return int(os.stat(path).st_mtime) def _checksum(self, path): return checksum(open(path, "rb").read()) def usn(self): return self.db.scalar("select usn from meta") def setUsn(self, usn): self.db.execute("update meta set usn = ?", usn) self.db.commit() def syncMod(self): self.db.execute("update meta set dirMod = ?", self._mtime(self.dir())) self.db.commit() def _changed(self): "Return dir mtime if it has changed since the last findChanges()" # doesn't track edits, but user can add or remove a file to update mod = self.db.scalar("select dirMod from meta") mtime = self._mtime(self.dir()) if not self._isFAT32() and mod and mod == mtime: return False return mtime def findChanges(self): "Scan the media folder if it's changed, and note any changes." 
if self._changed(): self._logChanges() def _logChanges(self): (added, removed) = self._changes() log = [] media = [] mediaRem = [] for f in added: mt = self._mtime(f) media.append((f, self._checksum(f), mt)) log.append((f, MEDIA_ADD)) for f in removed: mediaRem.append((f, )) log.append((f, MEDIA_REM)) # update media db self.db.executemany("insert or replace into media values (?,?,?)", media) if mediaRem: self.db.executemany("delete from media where fname = ?", mediaRem) self.db.execute("update meta set dirMod = ?", self._mtime(self.dir())) # and logs self.db.executemany("insert or replace into log values (?,?)", log) self.db.commit() def _changes(self): self.cache = {} for (name, csum, mod) in self.db.execute("select * from media"): self.cache[name] = [csum, mod, False] added = [] removed = [] # loop through on-disk files for f in os.listdir(self.dir()): # ignore folders and thumbs.db if os.path.isdir(f): continue if f.lower() == "thumbs.db": continue # and files with invalid chars if self.hasIllegal(f): continue # empty files are invalid; clean them up and continue if not os.path.getsize(f): os.unlink(f) continue # newly added? if f not in self.cache: added.append(f) else: # modified since last time? if self._mtime(f) != self.cache[f][1]: # and has different checksum? if self._checksum(f) != self.cache[f][0]: added.append(f) # mark as used self.cache[f][2] = True # look for any entries in the cache that no longer exist on disk for (k, v) in self.cache.items(): if not v[2]: removed.append(k) return added, removed def sanityCheck(self): assert not self.db.scalar("select count() from log") cnt = self.db.scalar("select count() from media") return cnt def forceResync(self): self.db.execute("delete from media") self.db.execute("delete from log") self.db.execute("update meta set usn = 0, dirMod = 0") self.db.commit() def removeExisting(self, files): "Remove files from list of files to sync, and return missing files." need = [] remove = [] for f in files: if self.db.scalar("select 1 from log where fname=?", f): remove.append((f, )) else: need.append(f) self.db.executemany("delete from log where fname=?", remove) self.db.commit() # if we need all the server files, it's faster to pass None than # the full list if need and len(files) == len(need): return None return need
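# Standalone sketch -- not part of the MediaManager API. The old-style sync
# zip built by zipAdded() above stores each file under an ascii index ("0",
# "1", ...), records the real names in a JSON "_meta" member, and marks the
# last zip of a batch with an empty "_finished" member. This round-trips that
# layout with only the standard library; the helper names and sample data are
# hypothetical.
import io, json, zipfile

def build_sync_zip(payloads):
    """payloads: dict mapping real filename -> bytes."""
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as z:
        meta = {}
        for idx, (fname, data) in enumerate(payloads.items()):
            z.writestr(str(idx), data)  # ascii member name sidesteps zip filename-encoding issues
            meta[str(idx)] = fname
        z.writestr("_meta", json.dumps(meta))
        z.writestr("_finished", "")  # tells the server no more zips follow
    return buf.getvalue()

def read_sync_zip(zipData):
    with zipfile.ZipFile(io.BytesIO(zipData)) as z:
        meta = json.loads(z.read("_meta"))
        return {meta[i.filename]: z.read(i) for i in z.infolist()
                if i.filename not in ("_meta", "_finished")}

# the real name survives the ascii renaming on the round trip
assert read_sync_zip(build_sync_zip({"cat.jpg": b"\xff\xd8"})) == {"cat.jpg": b"\xff\xd8"}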
class MediaManager: """ _dir -- the media directory, or None if server was passed to the constructor. The working directory is changed to _dir while the manager is open, and restored when it is closed. _oldcwd -- the working directory when the MediaManager was created (None if it no longer existed); the process changes back to it on close. """ """Captures the argument foo of [sound:foo]""" soundRegexps = [r"(?i)(\[sound:(?P<fname>[^]]+)\])"] """Captures the argument foo of <img src=foo bar>, ignoring quotes around foo.""" imgRegexps = [ # src element quoted case r"(?i)(<img[^>]* src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)", # unquoted case r"(?i)(<img[^>]* src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)", ] regexps = soundRegexps + imgRegexps def __init__(self, col, server): """server -- whether the collection is opened server-side; always false in the desktop client""" self.col = col if server: self._dir = None return # media directory self._dir = re.sub(r"(?i)\.(anki2)$", ".media", self.col.path) if not os.path.exists(self._dir): os.makedirs(self._dir) try: self._oldcwd = os.getcwd() except OSError: # cwd doesn't exist self._oldcwd = None try: os.chdir(self._dir) except OSError: raise Exception("invalidTempFolder") # change database self.connect() def connect(self): """Ensure the existence of a database in the current format, connected as self.db.""" if self.col.server: return path = self.dir()+".db2" create = not os.path.exists(path) os.chdir(self._dir) self.db = DB(path) if create: self._initDB() self.maybeUpgrade() def _initDB(self): self.db.executescript(""" create table media ( fname text not null primary key, csum text, -- null indicates deleted file mtime int not null, -- zero if deleted dirty int not null ); create index idx_media_dirty on media (dirty); create table meta (dirMod int, lastUsn int); insert into meta values (0, 0); """) def maybeUpgrade(self): """Upgrade a database in the old format to the current format.""" oldpath = self.dir()+".db" if os.path.exists(oldpath): self.db.execute('attach "../collection.media.db" as old') try: self.db.execute(""" insert into media select m.fname, csum, mod, ifnull((select 1 from log l2 where l2.fname=m.fname), 0) as dirty from old.media m left outer join old.log l using (fname) union select fname, null, 0, 1 from old.log where type=1;""") self.db.execute("delete from meta") self.db.execute(""" insert into meta select dirMod, usn from old.meta """) self.db.commit() except Exception: # if we couldn't import the old db for some reason, just start # anew self.col.log("failed to import old media db:"+traceback.format_exc()) self.db.execute("detach old") npath = "../collection.media.db.old" if os.path.exists(npath): os.unlink(npath) os.rename("../collection.media.db", npath) def close(self): """Close the database connection, do nothing if server is truthy,
and change back to the old working directory""" if self.col.server: return self.db.close() self.db = None # change cwd back to old location if self._oldcwd: try: os.chdir(self._oldcwd) except: # may have been deleted pass def _deleteDB(self): """Delete the connected DB and connect to a fresh one""" path = self.db._path self.close() os.unlink(path) self.connect() def dir(self): """The media directory""" return self._dir def _isFAT32(self): if not isWin: return # pylint: disable=import-error import win32api, win32file try: name = win32file.GetVolumeNameForVolumeMountPoint(self._dir[:3]) except: # mapped & unmapped network drive; pray that it's not vfat return if win32api.GetVolumeInformation(name)[4].lower().startswith("fat"): return True # Adding media ########################################################################## # opath must be in unicode def addFile(self, opath): """Copy the file at path opath into the media directory and return the name used; the name may be changed to ensure uniqueness.""" with open(opath, "rb") as f: return self.writeData(opath, f.read()) def writeData(self, opath, data, typeHint=None): """Write data to a file named after opath in the media directory and return the name used. Only the basename of opath is kept. If the name has no extension and typeHint identifies the data as jpg or png, the matching extension is added. A numeric suffix is appended while the name is already taken by different content. """ # if fname is a full path, use only the basename fname = os.path.basename(opath) # if it's missing an extension and a type hint was provided, use that if not os.path.splitext(fname)[1] and typeHint: # mimetypes is returning '.jpe' even after calling .init(), so we'll do # it manually instead typeMap = { "image/jpeg": ".jpg", "image/png": ".png", } if typeHint in typeMap: fname += typeMap[typeHint] # make sure we write it in NFC form (pre-APFS Macs will autoconvert to NFD), # and return an NFC-encoded reference fname = unicodedata.normalize("NFC", fname) # ensure it's a valid filename base = self.cleanFilename(fname) (root, ext) = os.path.splitext(base) def repl(match): n = int(match.group(1)) return " (%d)" % (n+1) # find the first available name csum = checksum(data) while True: fname = root + ext path = os.path.join(self.dir(), fname) # if it doesn't exist, copy it directly if not os.path.exists(path): with open(path, "wb") as f: f.write(data) return fname # if it's identical, reuse with open(path, "rb") as f: if checksum(f.read()) == csum: return fname # otherwise, increment the index in the filename reg = r" \((\d+)\)$" if not re.search(reg, root): root = root + " (1)" else: root = re.sub(reg, repl, root) # String manipulation ########################################################################## def filesInStr(self, mid, string, includeRemote=False): """The list of media filenames referenced in the string. Filenames starting with _ are treated like any other media. Each cloze is expanded in every possible way, so several rendered strings may need to be scanned.
For the LaTeX parts of the string, media files are generated as explained in latex._imgLink's docstring. Keyword arguments: mid -- the id of the model of the note whose string is considered string -- a string corresponding to a field of a note includeRemote -- whether the list should include references served over http, https or ftp """ l = [] model = self.col.models.get(mid) strings = [] if model['type'] == MODEL_CLOZE and "{{c" in string: # if the field has clozes in it, we'll need to expand the # possibilities so we can render latex strings = self._expandClozes(string) else: strings = [string] for string in strings: # handle latex string = mungeQA(string, None, None, model, None, self.col) # extract filenames for reg in self.regexps: for match in re.finditer(reg, string): fname = match.group("fname") isLocal = not re.match("(https?|ftp)://", fname.lower()) if isLocal or includeRemote: l.append(fname) return l def _expandClozes(self, string): """The list of all strings obtained by expanding the clozes. For each cloze number n, there is a string with cloze n replaced by [...] or by [hint], and every other cloze replaced by its text. There is also a string where each cloze is replaced by its content; i.e. the answer.""" ords = set(re.findall(r"{{c(\d+)::.+?}}", string)) # the set of cloze ordinals occurring in the string strings = [] from anki.template.template import clozeReg def qrepl(m): """The text by which the cloze m must be replaced in the question.""" if m.group(4): return "[%s]" % m.group(4) else: return "[...]" def arepl(m): """The text by which the cloze m must be replaced in the answer.""" return m.group(2) for ord in ords: s = re.sub(clozeReg%ord, qrepl, string) # replace cloze number ord by its deletion s = re.sub(clozeReg%".+?", "\\2", s) # replace every other cloze by its content strings.append(s) strings.append(re.sub(clozeReg%".+?", arepl, string)) return strings def transformNames(self, txt, func): """Apply func to every substring of txt matched by the regexps.""" for reg in self.regexps: txt = re.sub(reg, func, txt) return txt def strip(self, txt): """Delete from txt all text matched by the regexps""" for reg in self.regexps: txt = re.sub(reg, "", txt) return txt def escapeImages(self, string, unescape=False): """Percent-escape the filenames of local images in string (%xx form), or unescape them, depending on the unescape value.""" if unescape: fn = urllib.parse.unquote else: fn = urllib.parse.quote def repl(match): tag = match.group(0) fname = match.group("fname") if re.match("(https?|ftp)://", fname): return tag return tag.replace(fname, fn(fname)) for reg in self.imgRegexps: string = re.sub(reg, repl, string) return string # Rebuilding DB ########################################################################## def check(self, local=None): "Return (missingFiles, unusedFiles, warnings)."
mdir = self.dir() # gather all media references in NFC form allRefs = set() for nid, mid, flds in self.col.db.execute("select id, mid, flds from notes"): noteRefs = self.filesInStr(mid, flds) # check the refs are in NFC for f in noteRefs: # if they're not, we'll need to fix them first if f != unicodedata.normalize("NFC", f): self._normalizeNoteRefs(nid) noteRefs = self.filesInStr(mid, flds) break allRefs.update(noteRefs) # loop through media folder unused = [] if local is None: files = os.listdir(mdir) else: files = local renamedFiles = False dirFound = False warnings = [] for file in files: if not local: if not os.path.isfile(file): # ignore directories dirFound = True continue if file.startswith("_"): # leading _ says to ignore file continue if self.hasIllegal(file): name = file.encode(sys.getfilesystemencoding(), errors="replace") name = str(name, sys.getfilesystemencoding()) warnings.append( _("Invalid file name, please rename: %s") % name) continue nfcFile = unicodedata.normalize("NFC", file) # we enforce NFC fs encoding on non-macs if not isMac and not local: if file != nfcFile: # delete if we already have the NFC form, otherwise rename if os.path.exists(nfcFile): os.unlink(file) renamedFiles = True else: os.rename(file, nfcFile) renamedFiles = True file = nfcFile # compare if nfcFile not in allRefs: unused.append(file) else: allRefs.discard(nfcFile) # if we renamed any files to nfc format, we must rerun the check # to make sure the renamed files are not marked as unused if renamedFiles: return self.check(local=local) nohave = [x for x in allRefs if not x.startswith("_")] # make sure the media DB is valid try: self.findChanges() except DBError: self._deleteDB() if dirFound: warnings.append( _("Anki does not support files in subfolders of the collection.media folder.")) return (nohave, unused, warnings) def _normalizeNoteRefs(self, nid): note = self.col.getNote(nid) for c, fld in enumerate(note.fields): nfc = unicodedata.normalize("NFC", fld) if nfc != fld: note.fields[c] = nfc note.flush() # Copying on import ########################################################################## def have(self, fname): """Whether a file named fname exists in the media directory""" return os.path.exists(os.path.join(self.dir(), fname)) # Illegal characters and paths ########################################################################## _illegalCharReg = re.compile(r'[][><:"/?*^\\|\0\r\n]') def stripIllegal(self, str): """str, without its illegal characters""" return re.sub(self._illegalCharReg, "", str) def hasIllegal(self, str): """Whether str contains an illegal character,
either according to _illegalCharReg or because it can't be encoded in the file system encoding""" if re.search(self._illegalCharReg, str): return True try: str.encode(sys.getfilesystemencoding()) except UnicodeEncodeError: return True return False def cleanFilename(self, fname): fname = self.stripIllegal(fname) fname = self._cleanWin32Filename(fname) fname = self._cleanLongFilename(fname) if not fname: fname = "renamed" return fname def _cleanWin32Filename(self, fname): if not isWin: return fname # deal with things like con/prn/etc p = pathlib.WindowsPath(fname) if p.is_reserved(): fname = "renamed" + fname assert not pathlib.WindowsPath(fname).is_reserved() return fname def _cleanLongFilename(self, fname): # a fairly safe limit that should work on typical windows # paths and on eCryptfs partitions, even with a duplicate # suffix appended namemax = 136 if isWin: pathmax = 240 else: pathmax = 1024 # cap namemax based on absolute path dirlen = len(os.path.dirname(os.path.abspath(fname))) remaining = pathmax - dirlen namemax = min(remaining, namemax) assert namemax > 0 if len(fname) > namemax: head, ext = os.path.splitext(fname) headmax = namemax - len(ext) head = head[0:headmax] fname = head + ext assert(len(fname) <= namemax) return fname # Tracking changes ########################################################################## def findChanges(self): "Scan the media folder if it's changed, and note any changes in the db." if self._changed(): self._logChanges() def haveDirty(self): """Whether the database has at least one dirty element""" return self.db.scalar("select 1 from media where dirty=1 limit 1") def _mtime(self, path): """Time of most recent content modification of the file at path, in seconds.""" return int(os.stat(path).st_mtime) def _checksum(self, path): """Checksum of the file at path""" with open(path, "rb") as f: return checksum(f.read()) def _changed(self): "Return dir mtime if it has changed since the last findChanges()" # doesn't track edits, but user can add or remove a file to update mod = self.db.scalar("select dirMod from meta") mtime = self._mtime(self.dir()) if not self._isFAT32() and mod and mod == mtime: return False return mtime def _logChanges(self): (added, removed) = self._changes() media = [] for f, mtime in added: media.append((f, self._checksum(f), mtime, 1)) for f in removed: media.append((f, None, 0, 1)) # update media db self.db.executemany("insert or replace into media values (?,?,?,?)", media) self.db.execute("update meta set dirMod = ?", self._mtime(self.dir())) self.db.commit() def _changes(self): self.cache = {} for (name, csum, mod) in self.db.execute( "select fname, csum, mtime from media where csum is not null"): # previous entries may not have been in NFC form normname = unicodedata.normalize("NFC", name) self.cache[normname] = [csum, mod, False] added = [] removed = [] # loop through on-disk files with os.scandir(self.dir()) as it: for f in it: # ignore folders and thumbs.db if f.is_dir(): continue if f.name.lower() == "thumbs.db": continue # and files with invalid chars if self.hasIllegal(f.name): continue # empty files are invalid; clean them up and continue sz = f.stat().st_size if not sz: os.unlink(f.name) continue if sz > 100*1024*1024: self.col.log("ignoring file over 100MB", f.name) continue # check encoding normname = unicodedata.normalize("NFC", f.name) if not isMac: if f.name != normname: # wrong filename encoding which will cause sync errors if os.path.exists(normname): os.unlink(f.name) else: os.rename(f.name, normname) else: # on Macs we can access the file using any normalization
pass # newly added? mtime = int(f.stat().st_mtime) if normname not in self.cache: added.append((normname, mtime)) else: # modified since last time? if mtime != self.cache[normname][1]: # and has different checksum? if self._checksum(normname) != self.cache[normname][0]: added.append((normname, mtime)) # mark as used self.cache[normname][2] = True # look for any entries in the cache that no longer exist on disk for (k, v) in list(self.cache.items()): if not v[2]: removed.append(k) return added, removed # Syncing-related ########################################################################## def lastUsn(self): return self.db.scalar("select lastUsn from meta") def setLastUsn(self, usn): self.db.execute("update meta set lastUsn = ?", usn) self.db.commit() def syncInfo(self, fname): """(checksum, dirty flag) for the media row named fname""" ret = self.db.first( "select csum, dirty from media where fname=?", fname) return ret or (None, 0) def markClean(self, fnames): for fname in fnames: self.db.execute( "update media set dirty=0 where fname=?", fname) def syncDelete(self, fname): """Delete the file fname from the media directory if it exists, and drop its row from the media db.""" if os.path.exists(fname): os.unlink(fname) self.db.execute("delete from media where fname=?", fname) def mediaCount(self): """Number of media files according to the database""" return self.db.scalar( "select count() from media where csum is not null") def dirtyCount(self): """Number of dirty media rows according to the database (counting those marked as deleted)""" return self.db.scalar( "select count() from media where dirty=1") def forceResync(self): self.db.execute("delete from media") self.db.execute("update meta set lastUsn=0,dirMod=0") self.db.commit() self.db.setAutocommit(True) self.db.execute("vacuum") self.db.execute("analyze") self.db.setAutocommit(False) # Media syncing: zips ########################################################################## def mediaChangesZip(self): """ The pair with: * a string encoding a zip file that contains: ** the media to upload ** _meta: a JSON list pairing each name used in the zip with the real name of the file * the list of media files considered """ f = io.BytesIO() z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED) fnames = [] # meta is list of (fname, zipname), where zipname of None # is a deleted file meta = [] sz = 0 # sum of the sizes of the media files # loop over dirty media; at most SYNC_ZIP_COUNT (25) entries for c, (fname, csum) in enumerate(self.db.execute( "select fname, csum from media where dirty=1" " limit %d"%SYNC_ZIP_COUNT)): fnames.append(fname) normname = unicodedata.normalize("NFC", fname) if csum: self.col.log("+media zip", fname) z.write(fname, str(c)) meta.append((normname, str(c))) sz += os.path.getsize(fname) else: self.col.log("-media zip", fname) meta.append((normname, "")) if sz >= SYNC_ZIP_SIZE: break z.writestr("_meta", json.dumps(meta)) z.close() return f.getvalue(), fnames def addFilesFromZip(self, zipData): """ Copy each file in zipData (except _meta) to the media folder under the name given by _meta, and add it to the media database.
zipData -- a byte string containing a zip file with: * _meta, a JSON dict mapping each zip member name (except _meta itself) to the name to use in the media folder * the media files themselves """ f = io.BytesIO(zipData) z = zipfile.ZipFile(f, "r") media = [] # get meta info first meta = json.loads(z.read("_meta").decode("utf8")) # then loop through all files cnt = 0 for i in z.infolist(): if i.filename == "_meta": # ignore previously-retrieved meta continue else: data = z.read(i) csum = checksum(data) name = meta[i.filename] # normalize name name = unicodedata.normalize("NFC", name) # save file with open(name, "wb") as f: f.write(data) # update db media.append((name, csum, self._mtime(name), 0)) cnt += 1 if media: self.db.executemany( "insert or replace into media values (?,?,?,?)", media) return cnt
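# Standalone sketch -- not part of the MediaManager API. writeData() above
# finds a free filename by appending " (1)", " (2)", ... before the
# extension, reusing an existing file when the checksums match. This is the
# same naming loop in isolation; exists and same_checksum are hypothetical
# callables supplied by the caller.
import os, re

def unique_media_name(fname, exists, same_checksum):
    root, ext = os.path.splitext(fname)
    reg = r" \((\d+)\)$"
    while True:
        candidate = root + ext
        if not exists(candidate):
            return candidate  # free slot
        if same_checksum(candidate):
            return candidate  # identical content already stored: reuse it
        if not re.search(reg, root):
            root += " (1)"  # first collision
        else:
            # bump the trailing counter: " (n)" -> " (n+1)"
            root = re.sub(reg, lambda m: " (%d)" % (int(m.group(1)) + 1), root)

# "a.jpg" and "a (1).jpg" taken with different content -> "a (2).jpg"
taken = {"a.jpg", "a (1).jpg"}
assert unique_media_name("a.jpg", taken.__contains__, lambda _: False) == "a (2).jpg"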
class MediaManager(object): soundRegexps = ["(?i)(\[sound:(?P<fname>[^]]+)\])"] imgRegexps = [ # src element quoted case "(?i)(<img[^>]+src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)", # unquoted case "(?i)(<img[^>]+src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)", ] regexps = soundRegexps + imgRegexps def __init__(self, col, server): self.col = col if server: self._dir = None return # media directory self._dir = re.sub("(?i)\.(anki2)$", ".media", self.col.path) # convert dir to unicode if it's not already if isinstance(self._dir, str): self._dir = unicode(self._dir, sys.getfilesystemencoding()) if not os.path.exists(self._dir): os.makedirs(self._dir) try: self._oldcwd = os.getcwd() except OSError: # cwd doesn't exist self._oldcwd = None os.chdir(self._dir) # change database self.connect() def connect(self): if self.col.server: return path = self.dir()+".db" create = not os.path.exists(path) os.chdir(self._dir) self.db = DB(path) if create: self._initDB() def close(self): if self.col.server: return self.db.close() self.db = None # change cwd back to old location if self._oldcwd: try: os.chdir(self._oldcwd) except: # may have been deleted pass def dir(self): return self._dir def _isFAT32(self): if not isWin: return import win32api, win32file name = win32file.GetVolumeNameForVolumeMountPoint(self._dir[:3]) if win32api.GetVolumeInformation(name)[4].lower().startswith("fat"): return True # Adding media ########################################################################## def addFile(self, opath): """Copy PATH to MEDIADIR, and return new filename. If the same name exists, compare checksums.""" mdir = self.dir() # remove any dangerous characters base = re.sub(r"[][<>:/\\&?\"\|]", "", os.path.basename(opath)) (root, ext) = os.path.splitext(base) def repl(match): n = int(match.group(1)) return " (%d)" % (n+1) # find the first available name while True: path = os.path.join(mdir, root + ext) # if it doesn't exist, copy it directly if not os.path.exists(path): shutil.copyfile(opath, path) return os.path.basename(os.path.basename(path)) # if it's identical, reuse if self.filesIdentical(opath, path): return os.path.basename(path) # otherwise, increment the index in the filename reg = " \((\d+)\)$" if not re.search(reg, root): root = root + " (1)" else: root = re.sub(reg, repl, root) def filesIdentical(self, path1, path2): "True if files are the same." 
return (checksum(open(path1, "rb").read()) == checksum(open(path2, "rb").read())) # String manipulation ########################################################################## def filesInStr(self, mid, string, includeRemote=False): l = [] model = self.col.models.get(mid) strings = [] if model['type'] == MODEL_CLOZE and "{{c" in string: # if the field has clozes in it, we'll need to expand the # possibilities so we can render latex strings = self._expandClozes(string) else: strings = [string] for string in strings: # handle latex string = mungeQA(string, None, None, model, None, self.col) # extract filenames for reg in self.regexps: for match in re.finditer(reg, string): fname = match.group("fname") isLocal = not re.match("(https?|ftp)://", fname.lower()) if isLocal or includeRemote: l.append(fname) return l def _expandClozes(self, string): ords = set(re.findall("{{c(\d+)::.+?}}", string)) strings = [] from anki.template.template import clozeReg def qrepl(m): if m.group(3): return "[%s]" % m.group(3) else: return "[...]" def arepl(m): return m.group(1) for ord in ords: s = re.sub(clozeReg%ord, qrepl, string) s = re.sub(clozeReg%".+?", "\\1", s) strings.append(s) strings.append(re.sub(clozeReg%".+?", arepl, string)) return strings def transformNames(self, txt, func): for reg in self.regexps: txt = re.sub(reg, func, txt) return txt def strip(self, txt): for reg in self.regexps: txt = re.sub(reg, "", txt) return txt def escapeImages(self, string): def repl(match): tag = match.group(0) fname = match.group("fname") if re.match("(https?|ftp)://", fname): return tag return tag.replace( fname, urllib.quote(fname.encode("utf-8"))) for reg in self.imgRegexps: string = re.sub(reg, repl, string) return string # Rebuilding DB ########################################################################## def check(self, local=None): "Return (missingFiles, unusedFiles)." mdir = self.dir() # generate card q/a and look through all references normrefs = {} def norm(s): if isinstance(s, unicode) and isMac: return unicodedata.normalize('NFD', s) return s for f in self.allMedia(): normrefs[norm(f)] = True # loop through directory and find unused & missing media unused = [] if local is None: files = os.listdir(mdir) else: files = local for file in files: if not local: path = os.path.join(mdir, file) if not os.path.isfile(path): # ignore directories continue if file.startswith("_"): # leading _ says to ignore file continue nfile = norm(file) if nfile not in normrefs: unused.append(file) else: del normrefs[nfile] nohave = [x for x in normrefs.keys() if not x.startswith("_")] return (nohave, unused) def allMedia(self): "Return a set of all referenced filenames." 
files = set() for mid, flds in self.col.db.execute("select mid, flds from notes"): for f in self.filesInStr(mid, flds): files.add(f) return files # Copying on import ########################################################################## def have(self, fname): return os.path.exists(os.path.join(self.dir(), fname)) # Media syncing - changes and removal ########################################################################## def hasChanged(self): return self.db.scalar("select 1 from log limit 1") def removed(self): return self.db.list("select * from log where type = ?", MEDIA_REM) def syncRemove(self, fnames): # remove provided deletions for f in fnames: if os.path.exists(f): send2trash.send2trash(f) self.db.execute("delete from log where fname = ?", f) self.db.execute("delete from media where fname = ?", f) # and all locally-logged deletions, as server has acked them self.db.execute("delete from log where type = ?", MEDIA_REM) self.db.commit() # Media syncing - unbundling zip files from server ########################################################################## def syncAdd(self, zipData): "Extract zip data; true if finished." f = StringIO(zipData) z = zipfile.ZipFile(f, "r") finished = False meta = None media = [] sizecnt = 0 # get meta info first assert z.getinfo("_meta").file_size < 100000 meta = json.loads(z.read("_meta")) nextUsn = int(z.read("_usn")) # then loop through all files for i in z.infolist(): # check for zip bombs sizecnt += i.file_size assert sizecnt < 100*1024*1024 if i.filename == "_meta" or i.filename == "_usn": # ignore previously-retrieved meta continue elif i.filename == "_finished": # last zip in set finished = True else: data = z.read(i) csum = checksum(data) name = meta[i.filename] # can we store the file on this system? if self.illegal(name): continue # save file open(name, "wb").write(data) # update db media.append((name, csum, self._mtime(name))) # remove entries from local log self.db.execute("delete from log where fname = ?", name) # update media db and note new starting usn if media: self.db.executemany( "insert or replace into media values (?,?,?)", media) self.setUsn(nextUsn) # commits # if we have finished adding, we need to record the new folder mtime # so that we don't trigger a needless scan if finished: self.syncMod() return finished def illegal(self, f): if isWin: for c in f: if c in "<>:\"/\\|?*^": return True elif isMac: for c in f: if c in ":\\/": return True # Media syncing - bundling zip files to send to server ########################################################################## # Because there's no standard filename encoding for zips, and because not # all zip clients support retrieving mtime, we store the files as ascii # and place a json file in the zip with the necessary information. def zipAdded(self): "Add files to a zip until over SYNC_ZIP_SIZE/COUNT. Return zip data." 
f = StringIO() z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED) sz = 0 cnt = 0 files = {} cur = self.db.execute( "select fname from log where type = ?", MEDIA_ADD) fnames = [] while 1: fname = cur.fetchone() if not fname: # add a flag so the server knows it can clean up z.writestr("_finished", "") break fname = fname[0] fnames.append([fname]) z.write(fname, str(cnt)) files[str(cnt)] = fname sz += os.path.getsize(fname) if sz > SYNC_ZIP_SIZE or cnt > SYNC_ZIP_COUNT: break cnt += 1 z.writestr("_meta", json.dumps(files)) z.close() return f.getvalue(), fnames def forgetAdded(self, fnames): if not fnames: return self.db.executemany("delete from log where fname = ?", fnames) self.db.commit() # Tracking changes (private) ########################################################################## def _initDB(self): self.db.executescript(""" create table media (fname text primary key, csum text, mod int); create table meta (dirMod int, usn int); insert into meta values (0, 0); create table log (fname text primary key, type int); """) def _mtime(self, path): return int(os.stat(path).st_mtime) def _checksum(self, path): return checksum(open(path, "rb").read()) def usn(self): return self.db.scalar("select usn from meta") def setUsn(self, usn): self.db.execute("update meta set usn = ?", usn) self.db.commit() def syncMod(self): self.db.execute("update meta set dirMod = ?", self._mtime(self.dir())) self.db.commit() def _changed(self): "Return dir mtime if it has changed since the last findChanges()" # doesn't track edits, but user can add or remove a file to update mod = self.db.scalar("select dirMod from meta") mtime = self._mtime(self.dir()) if not self._isFAT32() and mod and mod == mtime: return False return mtime def findChanges(self): "Scan the media folder if it's changed, and note any changes." if self._changed(): self._logChanges() def _logChanges(self): (added, removed) = self._changes() log = [] media = [] mediaRem = [] for f in added: mt = self._mtime(f) media.append((f, self._checksum(f), mt)) log.append((f, MEDIA_ADD)) for f in removed: mediaRem.append((f,)) log.append((f, MEDIA_REM)) # update media db self.db.executemany("insert or replace into media values (?,?,?)", media) if mediaRem: self.db.executemany("delete from media where fname = ?", mediaRem) self.db.execute("update meta set dirMod = ?", self._mtime(self.dir())) # and logs self.db.executemany("insert or replace into log values (?,?)", log) self.db.commit() def _changes(self): self.cache = {} for (name, csum, mod) in self.db.execute( "select * from media"): self.cache[name] = [csum, mod, False] added = [] removed = [] # loop through on-disk files for f in os.listdir(self.dir()): # ignore folders and thumbs.db if os.path.isdir(f): continue if f.lower() == "thumbs.db": continue # and files with invalid chars bad = False for c in "\0", "/", "\\", ":": if c in f: bad = True break if bad: continue # empty files are invalid; clean them up and continue if not os.path.getsize(f): os.unlink(f) continue # newly added? if f not in self.cache: added.append(f) else: # modified since last time? if self._mtime(f) != self.cache[f][1]: # and has different checksum? 
if self._checksum(f) != self.cache[f][0]: added.append(f) # mark as used self.cache[f][2] = True # look for any entries in the cache that no longer exist on disk for (k, v) in self.cache.items(): if not v[2]: removed.append(k) return added, removed def sanityCheck(self): assert not self.db.scalar("select count() from log") cnt = self.db.scalar("select count() from media") return cnt def forceResync(self): self.db.execute("delete from media") self.db.execute("delete from log") self.db.execute("update meta set usn = 0, dirMod = 0") self.db.commit() def removeExisting(self, files): "Remove files from list of files to sync, and return missing files." need = [] remove = [] for f in files: # scalar, not execute: a cursor object would always be truthy if self.db.scalar("select 1 from log where fname=?", f): remove.append((f,)) else: need.append(f) self.db.executemany("delete from log where fname=?", remove) self.db.commit() # if we need all the server files, it's faster to pass None than # the full list if need and len(files) == len(need): return None return need
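# Standalone sketch -- not part of the MediaManager API. _changed() above
# skips a folder scan when the directory mtime matches the dirMod stored in
# the meta table: adding or removing a file updates a directory's mtime,
# while editing a file in place does not (hence the comment "doesn't track
# edits"). A throwaway schema demonstrating that check:
import os, sqlite3, tempfile

db = sqlite3.connect(":memory:")
db.execute("create table meta (dirMod int)")
db.execute("insert into meta values (0)")

def folder_changed(db, path):
    """Return the folder mtime if it differs from the recorded one, else False."""
    mod = db.execute("select dirMod from meta").fetchone()[0]
    mtime = int(os.stat(path).st_mtime)
    if mod and mod == mtime:
        return False
    return mtime

mdir = tempfile.mkdtemp()
mt = folder_changed(db, mdir)  # never scanned before: reports the mtime
assert mt
db.execute("update meta set dirMod = ?", (mt,))
assert folder_changed(db, mdir) is False  # unchanged folder: scan skipped
os.rmdir(mdir)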
class MediaManager: soundRegexps = ["(?i)(\[sound:(?P<fname>[^]]+)\])"] imgRegexps = [ # src element quoted case "(?i)(<img[^>]* src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)", # unquoted case "(?i)(<img[^>]* src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)", ] regexps = soundRegexps + imgRegexps def __init__(self, col, server): self.col = col if server: self._dir = None return # media directory self._dir = re.sub("(?i)\.(anki2)$", ".media", self.col.path) if not os.path.exists(self._dir): os.makedirs(self._dir) # change database self.connect() def connect(self): if self.col.server: return path = self.dir()+".db2" create = not os.path.exists(path) self.db = DB(path) if create: self._initDB() self.maybeUpgrade() def _initDB(self): self.db.executescript(""" create table media ( fname text not null primary key, csum text, -- null indicates deleted file mtime int not null, -- zero if deleted dirty int not null ); create index idx_media_dirty on media (dirty); create table meta (dirMod int, lastUsn int); insert into meta values (0, 0); """) def maybeUpgrade(self): oldpath = self.dir()+".db" if os.path.exists(oldpath): self.db.execute('attach "../collection.media.db" as old') try: self.db.execute(""" insert into media select m.fname, csum, mod, ifnull((select 1 from log l2 where l2.fname=m.fname), 0) as dirty from old.media m left outer join old.log l using (fname) union select fname, null, 0, 1 from old.log where type=1;""") self.db.execute("delete from meta") self.db.execute(""" insert into meta select dirMod, usn from old.meta """) self.db.commit() except Exception as e: # if we couldn't import the old db for some reason, just start # anew self.col.log("failed to import old media db:"+traceback.format_exc()) self.db.execute("detach old") npath = os.path.join(self.dir(), "collection.media.db.old") if os.path.exists(npath): os.unlink(npath) os.rename(os.path.join(self.dir(), "collection.media.db"), npath) def close(self): if self.col.server: return self.db.close() self.db = None def dir(self): return self._dir def _isFAT32(self): if not isWin: return import win32api, win32file try: name = win32file.GetVolumeNameForVolumeMountPoint(self._dir[:3]) except: # mapped & unmapped network drive; pray that it's not vfat return if win32api.GetVolumeInformation(name)[4].lower().startswith("fat"): return True # Adding media ########################################################################## # opath must be in unicode def addFile(self, opath): return self.writeData(opath, open(opath, "rb").read()) def writeData(self, opath, data): # if fname is a full path, use only the basename fname = os.path.basename(opath) # make sure we write it in NFC form (on mac will autoconvert to NFD), # and return an NFC-encoded reference fname = unicodedata.normalize("NFC", fname) # remove any dangerous characters base = self.stripIllegal(fname) (root, ext) = os.path.splitext(base) def repl(match): n = int(match.group(1)) return " (%d)" % (n+1) # find the first available name csum = checksum(data) while True: fname = root + ext path = os.path.join(self.dir(), fname) # if it doesn't exist, copy it directly if not os.path.exists(path): open(path, "wb").write(data) return fname # if it's identical, reuse if checksum(open(path, "rb").read()) == csum: return fname # otherwise, increment the index in the filename reg = " \((\d+)\)$" if not re.search(reg, root): root = root + " (1)" else: root = re.sub(reg, repl, root) # String manipulation ########################################################################## def 
filesInStr(self, mid, string, includeRemote=False): l = [] model = self.col.models.get(mid) strings = [] if model['type'] == MODEL_CLOZE and "{{c" in string: # if the field has clozes in it, we'll need to expand the # possibilities so we can render latex strings = self._expandClozes(string) else: strings = [string] for string in strings: # handle latex string = mungeQA(string, None, None, model, None, self.col) # extract filenames for reg in self.regexps: for match in re.finditer(reg, string): fname = match.group("fname") isLocal = not re.match("(https?|ftp)://", fname.lower()) if isLocal or includeRemote: l.append(fname) return l def _expandClozes(self, string): ords = set(re.findall("{{c(\d+)::.+?}}", string)) strings = [] from anki.template.template import clozeReg def qrepl(m): if m.group(3): return "[%s]" % m.group(3) else: return "[...]" def arepl(m): return m.group(1) for ord in ords: s = re.sub(clozeReg%ord, qrepl, string) s = re.sub(clozeReg%".+?", "\\1", s) strings.append(s) strings.append(re.sub(clozeReg%".+?", arepl, string)) return strings def transformNames(self, txt, func): for reg in self.regexps: txt = re.sub(reg, func, txt) return txt def strip(self, txt): for reg in self.regexps: txt = re.sub(reg, "", txt) return txt def escapeImages(self, string, unescape=False): if unescape: fn = urllib.parse.unquote else: fn = urllib.parse.quote def repl(match): tag = match.group(0) fname = match.group("fname") if re.match("(https?|ftp)://", fname): return tag return tag.replace(fname, fn(fname)) for reg in self.imgRegexps: string = re.sub(reg, repl, string) return string # Rebuilding DB ########################################################################## def check(self, local=None): "Return (missingFiles, unusedFiles, invalidFiles)." mdir = self.dir() # gather all media references in NFC form allRefs = set() for nid, mid, flds in self.col.db.execute("select id, mid, flds from notes"): noteRefs = self.filesInStr(mid, flds) # check the refs are in NFC for f in noteRefs: # if they're not, we'll need to fix them first if f != unicodedata.normalize("NFC", f): self._normalizeNoteRefs(nid) noteRefs = self.filesInStr(mid, flds) break allRefs.update(noteRefs) # loop through media folder unused = [] invalid = [] if local is None: files = os.listdir(mdir) else: files = local renamedFiles = False for file in files: path = os.path.join(self.dir(), file) if not local: if not os.path.isfile(path): # ignore directories continue if file.startswith("_"): # leading _ says to ignore file continue nfcFile = unicodedata.normalize("NFC", file) nfcPath = os.path.join(self.dir(), nfcFile) # we enforce NFC fs encoding on non-macs; on macs we'll have gotten # NFD so we use the above variable for comparing references if not isMac and not local: if file != nfcFile: # delete if we already have the NFC form, otherwise rename if os.path.exists(nfcPath): os.unlink(path) renamedFiles = True else: os.rename(path, nfcPath) renamedFiles = True file = nfcFile # compare if nfcFile not in allRefs: unused.append(file) else: allRefs.discard(nfcFile) # if we renamed any files to nfc format, we must rerun the check # to make sure the renamed files are not marked as unused if renamedFiles: return self.check(local=local) nohave = [x for x in allRefs if not x.startswith("_")] return (nohave, unused, invalid) def _normalizeNoteRefs(self, nid): note = self.col.getNote(nid) for c, fld in enumerate(note.fields): nfc = unicodedata.normalize("NFC", fld) if nfc != fld: note.fields[c] = nfc note.flush() # Copying on import
########################################################################## def have(self, fname): return os.path.exists(os.path.join(self.dir(), fname)) # Illegal characters ########################################################################## _illegalCharReg = re.compile(r'[][><:"/?*^\\|\0\r\n]') def stripIllegal(self, str): return re.sub(self._illegalCharReg, "", str) def hasIllegal(self, str): return not not re.search(self._illegalCharReg, str) # Tracking changes ########################################################################## def findChanges(self): "Scan the media folder if it's changed, and note any changes." if self._changed(): self._logChanges() def haveDirty(self): return self.db.scalar("select 1 from media where dirty=1 limit 1") def _mtime(self, path): return int(os.stat(path).st_mtime) def _checksum(self, path): return checksum(open(path, "rb").read()) def _changed(self): "Return dir mtime if it has changed since the last findChanges()" # doesn't track edits, but user can add or remove a file to update mod = self.db.scalar("select dirMod from meta") mtime = self._mtime(self.dir()) if not self._isFAT32() and mod and mod == mtime: return False return mtime def _logChanges(self): (added, removed) = self._changes() media = [] for f in added: path = os.path.join(self.dir(), f) mt = self._mtime(path) media.append((f, self._checksum(path), mt, 1)) for f in removed: media.append((f, None, 0, 1)) # update media db self.db.executemany("insert or replace into media values (?,?,?,?)", media) self.db.execute("update meta set dirMod = ?", self._mtime(self.dir())) self.db.commit() def _changes(self): self.cache = {} for (name, csum, mod) in self.db.execute( "select fname, csum, mtime from media where csum is not null"): self.cache[name] = [csum, mod, False] added = [] removed = [] # loop through on-disk files for f in os.listdir(self.dir()): path = os.path.join(self.dir(), f) # ignore folders and thumbs.db if os.path.isdir(path): continue if f.lower() == "thumbs.db": continue # and files with invalid chars if self.hasIllegal(f): continue # empty files are invalid; clean them up and continue sz = os.path.getsize(path) if not sz: os.unlink(path) continue if sz > 100*1024*1024: self.col.log("ignoring file over 100MB", f) continue # check encoding if not isMac: normf = unicodedata.normalize("NFC", f) normpath = os.path.join(self.dir(), normf) if f != normf: # wrong filename encoding which will cause sync errors if os.path.exists(normpath): os.unlink(path) else: os.rename(path, normpath) # newly added? if f not in self.cache: added.append(f) else: # modified since last time? if self._mtime(path) != self.cache[f][1]: # and has different checksum? 
if self._checksum(path) != self.cache[f][0]: added.append(f) # mark as used self.cache[f][2] = True # look for any entries in the cache that no longer exist on disk for (k, v) in list(self.cache.items()): if not v[2]: removed.append(k) return added, removed # Syncing-related ########################################################################## def lastUsn(self): return self.db.scalar("select lastUsn from meta") def setLastUsn(self, usn): self.db.execute("update meta set lastUsn = ?", usn) self.db.commit() def syncInfo(self, fname): ret = self.db.first( "select csum, dirty from media where fname=?", fname) return ret or (None, 0) def markClean(self, fnames): for fname in fnames: self.db.execute( "update media set dirty=0 where fname=?", fname) def syncDelete(self, fname): path = os.path.join(self.dir(), fname) if os.path.exists(path): os.unlink(path) self.db.execute("delete from media where fname=?", fname) def mediaCount(self): return self.db.scalar( "select count() from media where csum is not null") def dirtyCount(self): return self.db.scalar( "select count() from media where dirty=1") def forceResync(self): self.db.execute("delete from media") self.db.execute("update meta set lastUsn=0,dirMod=0") self.db.commit() self.db.setAutocommit(True) self.db.execute("vacuum") self.db.execute("analyze") self.db.setAutocommit(False) # Media syncing: zips ########################################################################## def mediaChangesZip(self): f = io.BytesIO() z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED) fnames = [] # meta is list of (fname, zipname), where zipname of None # is a deleted file meta = [] sz = 0 for c, (fname, csum) in enumerate(self.db.execute( "select fname, csum from media where dirty=1" " limit %d"%SYNC_ZIP_COUNT)): path = os.path.join(self.dir(), fname) fnames.append(fname) normname = unicodedata.normalize("NFC", fname) if csum: self.col.log("+media zip", fname) z.write(path, str(c)) meta.append((normname, str(c))) sz += os.path.getsize(path) else: self.col.log("-media zip", fname) meta.append((normname, "")) if sz >= SYNC_ZIP_SIZE: break z.writestr("_meta", json.dumps(meta)) z.close() return f.getvalue(), fnames def addFilesFromZip(self, zipData): "Extract zip data; true if finished." f = io.BytesIO(zipData) z = zipfile.ZipFile(f, "r") media = [] # get meta info first meta = json.loads(z.read("_meta").decode("utf8")) # then loop through all files cnt = 0 for i in z.infolist(): if i.filename == "_meta": # ignore previously-retrieved meta continue else: data = z.read(i) csum = checksum(data) name = meta[i.filename] # normalize name for platform if isMac: name = unicodedata.normalize("NFD", name) else: name = unicodedata.normalize("NFC", name) # save file path = os.path.join(self.dir(), name) open(path, "wb").write(data) # update db media.append((name, csum, self._mtime(path), 0)) cnt += 1 if media: self.db.executemany( "insert or replace into media values (?,?,?,?)", media) return cnt
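# Standalone sketch -- not part of the MediaManager API. The NFC/NFD
# handling above exists because HFS+ on older Macs stores filenames in a
# decomposed (NFD-like) form, so an "é" typed into a note (NFC, one
# precomposed code point) and the "é" read back from the filesystem (NFD,
# "e" plus a combining accent) compare unequal unless both sides are
# normalized first:
import unicodedata

ref = "caf\u00e9.jpg"       # NFC: precomposed é, as stored in a note field
on_disk = "cafe\u0301.jpg"  # NFD: e + combining acute, as listed by the fs

assert ref != on_disk  # naive string comparison reports a missing file
assert (unicodedata.normalize("NFC", ref)
        == unicodedata.normalize("NFC", on_disk))  # normalized: they match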
class MediaManager: soundRegexps = [r"(?i)(\[sound:(?P<fname>[^]]+)\])"] imgRegexps = [ # src element quoted case r"(?i)(<img[^>]* src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)", # unquoted case r"(?i)(<img[^>]* src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)", ] regexps = soundRegexps + imgRegexps db: Optional[DB] def __init__(self, col, server: bool) -> None: self.col = col if server: self._dir = None return # media directory self._dir = re.sub(r"(?i)\.(anki2)$", ".media", self.col.path) if not os.path.exists(self._dir): os.makedirs(self._dir) try: self._oldcwd = os.getcwd() except OSError: # cwd doesn't exist self._oldcwd = None try: os.chdir(self._dir) except OSError: raise Exception("invalidTempFolder") # change database self.connect() def connect(self) -> None: if self.col.server: return path = self.dir() + ".db2" create = not os.path.exists(path) os.chdir(self._dir) self.db = DB(path) if create: self._initDB() self.maybeUpgrade() def _initDB(self) -> None: self.db.executescript(""" create table media ( fname text not null primary key, csum text, -- null indicates deleted file mtime int not null, -- zero if deleted dirty int not null ); create index idx_media_dirty on media (dirty); create table meta (dirMod int, lastUsn int); insert into meta values (0, 0); """) def maybeUpgrade(self) -> None: oldpath = self.dir() + ".db" if os.path.exists(oldpath): self.db.execute('attach "../collection.media.db" as old') try: self.db.execute(""" insert into media select m.fname, csum, mod, ifnull((select 1 from log l2 where l2.fname=m.fname), 0) as dirty from old.media m left outer join old.log l using (fname) union select fname, null, 0, 1 from old.log where type=1;""") self.db.execute("delete from meta") self.db.execute(""" insert into meta select dirMod, usn from old.meta """) self.db.commit() except Exception as e: # if we couldn't import the old db for some reason, just start # anew self.col.log("failed to import old media db:" + traceback.format_exc()) self.db.execute("detach old") npath = "../collection.media.db.old" if os.path.exists(npath): os.unlink(npath) os.rename("../collection.media.db", npath) def close(self) -> None: if self.col.server: return self.db.close() self.db = None # change cwd back to old location if self._oldcwd: try: os.chdir(self._oldcwd) except: # may have been deleted pass def _deleteDB(self) -> None: path = self.db._path self.close() os.unlink(path) self.connect() def dir(self) -> Any: return self._dir def _isFAT32(self) -> bool: if not isWin: return False # pylint: disable=import-error import win32api, win32file # pytype: disable=import-error try: name = win32file.GetVolumeNameForVolumeMountPoint(self._dir[:3]) except: # mapped & unmapped network drive; pray that it's not vfat return False if win32api.GetVolumeInformation(name)[4].lower().startswith("fat"): return True return False # Adding media ########################################################################## # opath must be in unicode def addFile(self, opath: str) -> Any: with open(opath, "rb") as f: return self.writeData(opath, f.read()) def writeData(self, opath: str, data: bytes, typeHint: Optional[str] = None) -> Any: # if fname is a full path, use only the basename fname = os.path.basename(opath) # if it's missing an extension and a type hint was provided, use that if not os.path.splitext(fname)[1] and typeHint: # mimetypes is returning '.jpe' even after calling .init(), so we'll do # it manually instead typeMap = { "image/jpeg": ".jpg", "image/png": ".png", } if typeHint in typeMap: fname += 

    # String manipulation
    ##########################################################################

    def filesInStr(self, mid: Union[int, str], string: str,
                   includeRemote: bool = False) -> List[str]:
        l = []
        model = self.col.models.get(mid)
        strings: List[str] = []
        if model["type"] == MODEL_CLOZE and "{{c" in string:
            # if the field has clozes in it, we'll need to expand the
            # possibilities so we can render latex
            strings = self._expandClozes(string)
        else:
            strings = [string]
        for string in strings:
            # handle latex
            string = mungeQA(string, None, None, model, None, self.col)
            # extract filenames
            for reg in self.regexps:
                for match in re.finditer(reg, string):
                    fname = match.group("fname")
                    isLocal = not re.match("(https?|ftp)://", fname.lower())
                    if isLocal or includeRemote:
                        l.append(fname)
        return l

    def _expandClozes(self, string: str) -> List[str]:
        ords = set(re.findall(r"{{c(\d+)::.+?}}", string))
        strings = []
        from anki.template.template import (
            clozeReg,
            CLOZE_REGEX_MATCH_GROUP_HINT,
            CLOZE_REGEX_MATCH_GROUP_CONTENT,
        )

        def qrepl(m):
            if m.group(CLOZE_REGEX_MATCH_GROUP_HINT):
                return "[%s]" % m.group(CLOZE_REGEX_MATCH_GROUP_HINT)
            else:
                return "[...]"

        def arepl(m):
            return m.group(CLOZE_REGEX_MATCH_GROUP_CONTENT)

        for ord in ords:
            s = re.sub(clozeReg % ord, qrepl, string)
            s = re.sub(clozeReg % ".+?", arepl, s)
            strings.append(s)
        strings.append(re.sub(clozeReg % ".+?", arepl, string))
        return strings

    def transformNames(self, txt: str, func: Callable) -> Any:
        for reg in self.regexps:
            txt = re.sub(reg, func, txt)
        return txt

    def strip(self, txt: str) -> str:
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

    def escapeImages(self, string: str, unescape: bool = False) -> str:
        fn: Callable
        if unescape:
            fn = urllib.parse.unquote
        else:
            fn = urllib.parse.quote

        def repl(match):
            tag = match.group(0)
            fname = match.group("fname")
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(fname, fn(fname))

        for reg in self.imgRegexps:
            string = re.sub(reg, repl, string)
        return string
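
    # Extraction sketch (hypothetical field text; sound refs match before
    # image refs because soundRegexps precede imgRegexps):
    #
    #   mm.filesInStr(mid, 'Hi <img src="cat.jpg"> [sound:meow.mp3]')
    #   # -> ["meow.mp3", "cat.jpg"]
    #   mm.filesInStr(mid, '<img src="https://x.test/a.png">')
    #   # -> []  (remote refs skipped unless includeRemote=True)
    #   mm.strip('Hi <img src="cat.jpg">')
    #   # -> "Hi "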

    # Rebuilding DB
    ##########################################################################

    def check(self, local: Optional[List[str]] = None
              ) -> Tuple[List[str], List[str], List[str]]:
        "Return (missingFiles, unusedFiles, warnings)."
        mdir = self.dir()
        # gather all media references in NFC form
        allRefs = set()
        for nid, mid, flds in self.col.db.execute(
                "select id, mid, flds from notes"):
            noteRefs = self.filesInStr(mid, flds)
            # check the refs are in NFC
            for f in noteRefs:
                # if they're not, we'll need to fix them first
                if f != unicodedata.normalize("NFC", f):
                    self._normalizeNoteRefs(nid)
                    # re-extract from the fixed note, not the stale flds
                    flds = self.col.db.scalar(
                        "select flds from notes where id=?", nid)
                    noteRefs = self.filesInStr(mid, flds)
                    break
            allRefs.update(noteRefs)
        # loop through media folder
        unused = []
        if local is None:
            files = os.listdir(mdir)
        else:
            files = local
        renamedFiles = False
        dirFound = False
        warnings = []
        for file in files:
            if not local:
                if not os.path.isfile(file):
                    # ignore directories
                    dirFound = True
                    continue
            if file.startswith("_"):
                # leading _ says to ignore file
                continue

            if self.hasIllegal(file):
                name = file.encode(sys.getfilesystemencoding(),
                                   errors="replace")
                name = str(name, sys.getfilesystemencoding())
                warnings.append(
                    _("Invalid file name, please rename: %s") % name)
                continue

            nfcFile = unicodedata.normalize("NFC", file)
            # we enforce NFC fs encoding on non-macs
            if not isMac and not local:
                if file != nfcFile:
                    # delete if we already have the NFC form, otherwise rename
                    if os.path.exists(nfcFile):
                        os.unlink(file)
                        renamedFiles = True
                    else:
                        os.rename(file, nfcFile)
                        renamedFiles = True
                    file = nfcFile
            # compare
            if nfcFile not in allRefs:
                unused.append(file)
            else:
                allRefs.discard(nfcFile)
        # if we renamed any files to nfc format, we must rerun the check
        # to make sure the renamed files are not marked as unused
        if renamedFiles:
            return self.check(local=local)
        nohave = [x for x in allRefs if not x.startswith("_")]
        # make sure the media DB is valid
        try:
            self.findChanges()
        except DBError:
            self._deleteDB()
        if dirFound:
            warnings.append(
                _("Anki does not support files in subfolders of the collection.media folder."))
        return (nohave, unused, warnings)
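
    # Normalization note (illustrative): "é" can be stored composed (NFC,
    # one code point) or decomposed (NFD, "e" plus a combining accent);
    # the two compare unequal even though they render identically, which
    # is why check() coerces both the note references and the on-disk
    # names to NFC before comparing:
    #
    #   unicodedata.normalize("NFC", "e\u0301") == "\u00e9"   # True
    #   "e\u0301" == "\u00e9"                                 # False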

    def _normalizeNoteRefs(self, nid) -> None:
        note = self.col.getNote(nid)
        for c, fld in enumerate(note.fields):
            nfc = unicodedata.normalize("NFC", fld)
            if nfc != fld:
                note.fields[c] = nfc
        note.flush()

    # Copying on import
    ##########################################################################

    def have(self, fname: str) -> bool:
        return os.path.exists(os.path.join(self.dir(), fname))

    # Illegal characters and paths
    ##########################################################################

    _illegalCharReg = re.compile(r'[][><:"/?*^\\|\0\r\n]')

    def stripIllegal(self, str: str) -> str:
        return re.sub(self._illegalCharReg, "", str)

    def hasIllegal(self, s: str) -> bool:
        if re.search(self._illegalCharReg, s):
            return True
        try:
            s.encode(sys.getfilesystemencoding())
        except UnicodeEncodeError:
            return True
        return False

    def cleanFilename(self, fname: str) -> str:
        fname = self.stripIllegal(fname)
        fname = self._cleanWin32Filename(fname)
        fname = self._cleanLongFilename(fname)
        if not fname:
            fname = "renamed"
        return fname

    def _cleanWin32Filename(self, fname: str) -> str:
        if not isWin:
            return fname
        # deal with things like con/prn/etc
        p = pathlib.WindowsPath(fname)
        if p.is_reserved():
            fname = "renamed" + fname
            assert not pathlib.WindowsPath(fname).is_reserved()
        return fname

    def _cleanLongFilename(self, fname: str) -> Any:
        # a fairly safe limit that should work on typical windows
        # paths and on eCryptfs partitions, even with a duplicate
        # suffix appended
        namemax = 136
        if isWin:
            pathmax = 240
        else:
            pathmax = 1024
        # cap namemax based on absolute path
        dirlen = len(os.path.dirname(os.path.abspath(fname)))
        remaining = pathmax - dirlen
        namemax = min(remaining, namemax)
        assert namemax > 0
        if len(fname) > namemax:
            head, ext = os.path.splitext(fname)
            headmax = namemax - len(ext)
            head = head[0:headmax]
            fname = head + ext
            assert len(fname) <= namemax
        return fname
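
    # Illustrative examples (hypothetical inputs):
    #
    #   mm.stripIllegal('a:b/c?.jpg')   # -> "abc.jpg"
    #   mm.cleanFilename("con.jpg")     # -> "renamedcon.jpg" on Windows,
    #                                   #    since CON is a reserved name
    #   mm.cleanFilename("x" * 300 + ".jpg")
    #   # -> head truncated so the whole name fits within namemax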

    # Tracking changes
    ##########################################################################

    def findChanges(self) -> None:
        "Scan the media folder if it's changed, and note any changes."
        if self._changed():
            self._logChanges()

    def haveDirty(self) -> Any:
        return self.db.scalar("select 1 from media where dirty=1 limit 1")

    def _mtime(self, path: str) -> int:
        return int(os.stat(path).st_mtime)

    def _checksum(self, path: str) -> str:
        with open(path, "rb") as f:
            return checksum(f.read())

    def _changed(self) -> Union[bool, int]:
        "Return dir mtime if it has changed since the last findChanges()."
        # doesn't track edits, but user can add or remove a file to update
        mod = self.db.scalar("select dirMod from meta")
        mtime = self._mtime(self.dir())
        if not self._isFAT32() and mod and mod == mtime:
            return False
        return mtime
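
    # Why the dirMod shortcut works (illustrative): on most filesystems a
    # directory's mtime is bumped when an entry is created, renamed or
    # deleted, but not when an existing file's contents change. Caching
    # the folder mtime in meta.dirMod lets _changed() skip a full rescan
    # when nothing was added or removed; FAT32 is excluded because its
    # directory timestamps are not updated reliably enough to trust.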

    def _logChanges(self) -> None:
        (added, removed) = self._changes()
        media = []
        for f, mtime in added:
            media.append((f, self._checksum(f), mtime, 1))
        for f in removed:
            media.append((f, None, 0, 1))
        # update media db
        self.db.executemany("insert or replace into media values (?,?,?,?)",
                            media)
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        self.db.commit()

    def _changes(self) -> Tuple[List[Tuple[str, int]], List[str]]:
        self.cache: Dict[str, Any] = {}
        for (name, csum, mod) in self.db.execute(
                "select fname, csum, mtime from media where csum is not null"):
            # previous entries may not have been in NFC form
            normname = unicodedata.normalize("NFC", name)
            self.cache[normname] = [csum, mod, False]
        added = []
        removed = []
        # loop through on-disk files
        with os.scandir(self.dir()) as it:
            for f in it:
                # ignore folders and thumbs.db
                if f.is_dir():
                    continue
                if f.name.lower() == "thumbs.db":
                    continue
                # and files with invalid chars
                if self.hasIllegal(f.name):
                    continue
                # empty files are invalid; clean them up and continue
                sz = f.stat().st_size
                if not sz:
                    os.unlink(f.name)
                    continue
                if sz > 100 * 1024 * 1024:
                    self.col.log("ignoring file over 100MB", f.name)
                    continue
                # check encoding
                normname = unicodedata.normalize("NFC", f.name)
                if not isMac:
                    if f.name != normname:
                        # wrong filename encoding which will cause sync errors
                        if os.path.exists(normname):
                            os.unlink(f.name)
                        else:
                            os.rename(f.name, normname)
                else:
                    # on Macs we can access the file using any normalization
                    pass
                # newly added?
                mtime = int(f.stat().st_mtime)
                if normname not in self.cache:
                    added.append((normname, mtime))
                else:
                    # modified since last time?
                    if mtime != self.cache[normname][1]:
                        # and has different checksum?
                        if self._checksum(normname) != self.cache[normname][0]:
                            added.append((normname, mtime))
                    # mark as used
                    self.cache[normname][2] = True
        # look for any entries in the cache that no longer exist on disk
        for (k, v) in list(self.cache.items()):
            if not v[2]:
                removed.append(k)
        return added, removed

    # Syncing-related
    ##########################################################################

    def lastUsn(self) -> Any:
        return self.db.scalar("select lastUsn from meta")

    def setLastUsn(self, usn) -> None:
        self.db.execute("update meta set lastUsn = ?", usn)
        self.db.commit()

    def syncInfo(self, fname) -> Any:
        ret = self.db.first("select csum, dirty from media where fname=?",
                            fname)
        return ret or (None, 0)

    def markClean(self, fnames) -> None:
        for fname in fnames:
            self.db.execute("update media set dirty=0 where fname=?", fname)

    def syncDelete(self, fname) -> None:
        if os.path.exists(fname):
            os.unlink(fname)
        self.db.execute("delete from media where fname=?", fname)

    def mediaCount(self) -> Any:
        return self.db.scalar(
            "select count() from media where csum is not null")

    def dirtyCount(self) -> Any:
        return self.db.scalar("select count() from media where dirty=1")

    def forceResync(self) -> None:
        self.db.execute("delete from media")
        self.db.execute("update meta set lastUsn=0,dirMod=0")
        self.db.commit()
        self.db.setAutocommit(True)
        self.db.execute("vacuum")
        self.db.execute("analyze")
        self.db.setAutocommit(False)

    # Media syncing: zips
    ##########################################################################

    def mediaChangesZip(self) -> Tuple[bytes, list]:
        f = io.BytesIO()
        z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED)

        fnames = []
        # meta is list of (fname, zipname), where zipname of None
        # is a deleted file
        meta = []
        sz = 0

        for c, (fname, csum) in enumerate(
                self.db.execute("select fname, csum from media where dirty=1"
                                " limit %d" % SYNC_ZIP_COUNT)):
            fnames.append(fname)
            normname = unicodedata.normalize("NFC", fname)

            if csum:
                self.col.log("+media zip", fname)
                z.write(fname, str(c))
                meta.append((normname, str(c)))
                sz += os.path.getsize(fname)
            else:
                self.col.log("-media zip", fname)
                meta.append((normname, ""))

            if sz >= SYNC_ZIP_SIZE:
                break

        z.writestr("_meta", json.dumps(meta))
        z.close()
        return f.getvalue(), fnames

    def addFilesFromZip(self, zipData) -> int:
        "Extract zip data; return the number of files added."
        f = io.BytesIO(zipData)
        z = zipfile.ZipFile(f, "r")
        media = []
        # get meta info first
        meta = json.loads(z.read("_meta").decode("utf8"))
        # then loop through all files
        cnt = 0
        for i in z.infolist():
            if i.filename == "_meta":
                # ignore previously-retrieved meta
                continue
            else:
                data = z.read(i)
                csum = checksum(data)
                name = meta[i.filename]
                # normalize name
                name = unicodedata.normalize("NFC", name)
                # save file
                with open(name, "wb") as outfile:  # type: ignore
                    outfile.write(data)
                # update db
                media.append((name, csum, self._mtime(name), 0))
                cnt += 1
        if media:
            self.db.executemany(
                "insert or replace into media values (?,?,?,?)", media)
        return cnt
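
# Sketch of one client upload pass (illustrative; "server" stands in for a
# real sync endpoint and is not part of this module):
#
#   mm.findChanges()                    # refresh dirty flags from disk
#   while mm.dirtyCount():
#       zipData, fnames = mm.mediaChangesZip()
#       server.uploadChanges(zipData)   # hypothetical server call
#       mm.markClean(fnames)            # only after the server acks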

class ChangeLog:
    """Tracks changes made to notes"""

    def __init__(self):
        base_path = os.path.dirname(os.path.abspath(__file__))
        db_path = os.path.join(base_path, "..", "user_files", "changelog.db")
        need_create = not os.path.exists(db_path)
        self.db = DB(db_path)
        self.db.setAutocommit(True)
        if need_create:
            self._create_tables()
            self._create_indices()
        self.db.setAutocommit(False)
        max_id = self.db.scalar("select max(id) from changelog")
        if max_id is not None:
            self.next_id = max_id + 1
        else:
            self.next_id = 0

    def close(self):
        self.db.close()

    def commit_changes(self):
        self.db.commit()
        self.db.mod = False

    def record_change(self, op, init_ts, change):
        self.db.execute(
            """
            insert into changelog (id, op, init_ts, ts, nid, fld, old, new)
            values (?,?,?,?,?,?,?,?)
            """,
            self.next_id, op, init_ts, change.ts, change.nid, change.fld,
            change.old, change.new)
        self.next_id += 1

    def record_and_commit_changes(self, op, init_ts, changes):
        data = []
        for change in changes:
            data.append((self.next_id, op, init_ts, change.ts, change.nid,
                         change.fld, change.old, change.new))
            self.next_id += 1
        self.db.executemany(
            """
            insert into changelog (id, op, init_ts, ts, nid, fld, old, new)
            values (?,?,?,?,?,?,?,?)
            """, data)
        self.commit_changes()

    def _create_tables(self):
        self.db.executescript("""
            create table if not exists changelog (
                id integer primary key,
                -- identifies the operation performed
                op text not null,
                -- timestamp (ms) when bulk changes were initiated
                init_ts integer not null,
                -- timestamp (ms) when field was changed
                ts integer not null,
                -- note id
                nid integer not null,
                -- field name
                fld text not null,
                -- old value of field
                old text not null,
                -- new value of field
                new text not null
            );
        """)

    def _create_indices(self):
        self.db.executescript("""
            create index if not exists ix_changelog_ts on changelog (ts);
        """)
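
# Usage sketch (illustrative): ChangeLog only reads .ts/.nid/.fld/.old/.new
# off each change object, so any attribute container works; the namedtuple
# below is a hypothetical convenience, not part of the original module.
#
#   from collections import namedtuple
#   Change = namedtuple("Change", "ts nid fld old new")
#
#   log = ChangeLog()
#   now = int(time.time() * 1000)   # ms timestamp
#   log.record_and_commit_changes("find&replace", now, [
#       Change(ts=now, nid=1596113902117, fld="Front",
#              old="colour", new="color"),
#   ])
#   log.close()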