def check_file_hashes(self):
    self.log.info("Check file hashes!")
    bad_items = []
    try:
        with self.db.session_context() as sess:
            self.log.info("Loading file listing from db!")
            files = sess.query(self.db.ReleaseFile) \
                .order_by(desc(self.db.ReleaseFile.id)) \
                .all()
            self.log.info("Found %s files to scan", len(files))
            sess.commit()

            for have in tqdm.tqdm(files):
                have_fqp = os.path.join(have.dirpath, have.filename)
                if not os.path.exists(have_fqp):
                    self.log.error("File missing: %s", have_fqp)
                    bad_items.append(("missing", have.id, have_fqp))
                    continue

                item_hash = hashfile.hash_file(have_fqp)
                if item_hash != have.fhash:
                    self.log.error("File hash doesn't match file: %s (%s, %s)", have_fqp, item_hash, have.fhash)
                    bad_items.append(("mismatch", have.id, have_fqp))

    finally:
        # Always dump whatever was found so far, even if the scan dies partway.
        with open("mismatches.json", "w") as fp:
            json.dump(bad_items, fp, indent=4)
        for item in bad_items:
            self.log.info("Failed: %s", item)
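
# `hashfile.hash_file` is implemented elsewhere in this codebase. Going by
# the `validate_md5` name and the "md5sum" references in the comments below,
# a minimal stand-in might look like the following sketch. The chunk size,
# the md5 digest, and the hex-string return type are assumptions, not the
# real helper's documented behavior.
import hashlib

def _hash_file_sketch(path, chunk_size=1024 * 1024):
    # Stream the file in fixed-size chunks so multi-GB archives never
    # have to fit in memory at once.
    digest = hashlib.md5()
    with open(path, "rb") as fp:
        for chunk in iter(lambda: fp.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
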
def validate_md5(self, archive_path):
    fhash = hashfile.hash_file(archive_path)
    itemRoot, itemFile = os.path.split(archive_path)
    with self.db.session_context() as sess:
        row = sess.query(self.db.ReleaseFile) \
            .filter(self.db.ReleaseFile.dirpath == itemRoot) \
            .filter(self.db.ReleaseFile.filename == itemFile) \
            .scalar()
        assert row.fhash == fhash, "Hash mismatch after fetch: '%s', '%s' (%s)" % (
            fhash,
            row.fhash,
            archive_path,
        )
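
# A note on the `.scalar()` calls used throughout: on a SQLAlchemy Query,
# `.scalar()` returns the single matching row (or None when nothing matches)
# and raises MultipleResultsFound if more than one row matches, so a broken
# (dirpath, filename) uniqueness invariant fails loudly instead of silently
# picking an arbitrary row. A self-contained sketch of the lookup pattern
# (the table here is a hypothetical, pared-down ReleaseFile; assumes
# SQLAlchemy 1.4+):
from sqlalchemy import Column, Integer, Text, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

_Base = declarative_base()

class _ReleaseFileDemo(_Base):
    __tablename__ = "release_file_demo"
    id       = Column(Integer, primary_key=True)
    dirpath  = Column(Text)
    filename = Column(Text)
    fhash    = Column(Text)

def _scalar_lookup_demo():
    engine = create_engine("sqlite://")
    _Base.metadata.create_all(engine)
    sess = sessionmaker(bind=engine)()
    sess.add(_ReleaseFileDemo(dirpath="/srv/arch", filename="a.zip", fhash="d41d8..."))
    sess.commit()

    # Exactly one match -> the row; no match -> None.
    row = sess.query(_ReleaseFileDemo) \
        .filter(_ReleaseFileDemo.dirpath == "/srv/arch") \
        .filter(_ReleaseFileDemo.filename == "a.zip") \
        .scalar()
    assert row is not None and row.fhash == "d41d8..."
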
def _create_or_update_file_entry_path(self, oldPath, newPath, setDeleted=False, setDuplicate=False, setPhash=False, reuse_sess=None):
    oldItemRoot, oldItemFile = os.path.split(oldPath)
    newItemRoot, newItemFile = os.path.split(newPath)

    assert oldPath != newPath

    with self.db.session_context(reuse_sess=reuse_sess) as sess:
        old_row = sess.query(self.db.ReleaseFile) \
            .filter(self.db.ReleaseFile.dirpath == oldItemRoot) \
            .filter(self.db.ReleaseFile.filename == oldItemFile) \
            .scalar()
        new_row = sess.query(self.db.ReleaseFile) \
            .filter(self.db.ReleaseFile.dirpath == newItemRoot) \
            .filter(self.db.ReleaseFile.filename == newItemFile) \
            .scalar()

        if not new_row:
            fhash = hashfile.hash_file(newPath)

            # Use an existing file row (if present), via the md5sum
            new_row = sess.query(self.db.ReleaseFile) \
                .filter(self.db.ReleaseFile.fhash == fhash) \
                .scalar()

            if new_row:
                self.log.info("Have existing row for fhash!")

                # But only if the existing file actually exists.
                if new_row and not os.path.exists(os.path.join(new_row.dirpath, new_row.filename)):
                    self.log.warning("Existing row for hash exists, but path is not valid (%s, %s)!",
                        newItemRoot, newItemFile)
                    assert new_row.fhash == fhash

                    # Since an appropriate row exists but the paths aren't valid, just point that
                    # row at the file on-disk.
                    new_row.dirpath  = newItemRoot
                    new_row.filename = newItemFile

            else:
                new_row = self.db.ReleaseFile(
                    dirpath  = newItemRoot,
                    filename = newItemFile,
                    fhash    = fhash)
                sess.add(new_row)
                sess.flush()

        if not old_row:
            self.log.warning("Trying to update file path where the file doesn't exist!")
            self.log.warning("Dir path: '%s', fname: '%s'", oldItemRoot, oldItemFile)
            return

        releases = old_row.manga_releases + old_row.hentai_releases

        # Copy over the tags.
        for m_tag in old_row.manga_tags:
            new_row.manga_tags.add(m_tag)
        for h_tag in old_row.hentai_tags:
            new_row.hentai_tags.add(h_tag)

        # And then delete the old row (but only if it's changed,
        # since we might have matched against it by md5sum).
        if old_row.id != new_row.id:
            sess.delete(old_row)
        else:
            self.log.warning("Old row matches new row by md5sum. Not deleting old row.")

        # This flush seems to be required. Somehow.
        sess.flush()

        # Re-point any items that point to the old file to the new file
        for release in releases:
            self.log.info("Re-pointing release %s to new file (%s->%s), (%s->%s)",
                release.id, oldPath, newPath, release.fileid, new_row.id)
            self.log.info("New row: %s, new_row id: %s", new_row, new_row.id)
            release.fileid = new_row.id
            assert release.fileid

            # And set any flag(s) on the entries that pointed to the old files.
            if setDeleted:
                release.deleted = setDeleted
            if setDuplicate:
                release.was_duplicate = setDuplicate
            if setPhash:
                release.phash_duplicate = setPhash
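
# `session_context` (with its `reuse_sess` passthrough) comes from the db
# layer and isn't shown in this file. A plausible shape for it, assuming a
# sessionmaker factory on the db object, is sketched below: when a caller
# already holds a session it is threaded straight through, so nested calls
# like _create_or_update_file_entry_path() share one transaction instead of
# opening a second session mid-flight. This is a method-shaped sketch, not
# the actual implementation.
import contextlib

@contextlib.contextmanager
def _session_context_sketch(self, reuse_sess=None):
    if reuse_sess is not None:
        # The caller owns this session; leave commit/rollback/close to them.
        yield reuse_sess
        return
    sess = self.Session()  # assumed: a sessionmaker bound to the engine
    try:
        yield sess
        sess.commit()
    except Exception:
        sess.rollback()
        raise
    finally:
        sess.close()
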
def get_create_file_row(self, sess, row, fqfilename):
    '''
    Given a path to a file, return a row for that file's contents.
    If no row exists, it is created.
    If a row for another file with exactly matching contents but a
    different name is found, that row is used preferentially.

    Return is a 2-tuple of (file_row, file_path). The returned
    file_path is guaranteed to point to a valid file.

    Note that the file pointed to by the input parameter fqfilename
    may actually be deleted, if it is found to be a binary duplicate
    of another existing file.
    '''

    # Round-trip via the filesystem because why not
    fhash = hashfile.hash_file(fqfilename)
    have = self._get_existing_file_by_hash(sess, fhash)
    dirpath, filename = os.path.split(fqfilename)

    if have:
        have_fqp = os.path.join(have.dirpath, have.filename)
        if have_fqp == fqfilename:
            self.log.error("Multiple instances of a releasefile created on same on-disk file!")
            self.log.error("File: %s. Row id: %s", have_fqp, row.id)
            raise RuntimeError("Multiple instances of a releasefile created on same on-disk file!")

        if os.path.exists(have_fqp):
            # Same md5 is not proof of same contents. Compare the actual
            # bytes before treating the files as duplicates.
            with open(have_fqp, "rb") as fp1:
                fc1 = fp1.read()
            with open(fqfilename, "rb") as fp2:
                fc2 = fp2.read()
            fc1_h = hashfile.hash_bytes(fc1)
            fc2_h = hashfile.hash_bytes(fc2)

            if fc1 != fc2:
                self.log.error("Multiple instances of a releasefile with the same md5, but different contents?")
                self.log.error("File 1: '%s' (%s, %s), Row id: %s", fqfilename, fhash, fc2_h, row.id)
                self.log.error("File 2: '%s' (%s, %s).", have_fqp, have.fhash, fc1_h)
                raise RuntimeError("Multiple instances of a releasefile with the same md5, but different contents?")

            if fqfilename == have_fqp:
                self.log.warning("Row for file-path already exists?")
                self.log.warning("Files: '%s', '%s'.", have_fqp, fqfilename)
            elif os.path.exists(have_fqp) and os.path.exists(fqfilename):
                self.log.warning("Duplicate file found by md5sum search. Re-using existing file.")
                self.log.warning("Files: '%s', '%s'.", have_fqp, fqfilename)
                os.unlink(fqfilename)
            else:
                self.log.warning("Duplicate file found by md5sum search, but a file is missing?")
                self.log.warning("Files: '%s', '%s'.", have_fqp, fqfilename)

            row.fileid = have.id
            return have, have_fqp
        else:
            self.log.warning("Duplicate file found by md5sum search, but existing file has been deleted.")
            self.log.warning("Files: '%s', '%s'.", have_fqp, fqfilename)
            have.dirpath  = dirpath
            have.filename = filename
            return have, fqfilename
    else:
        new_row = self.db.ReleaseFile(
            dirpath  = dirpath,
            filename = filename,
            fhash    = fhash)
        sess.add(new_row)
        sess.flush()
        return new_row, fqfilename
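
# Hedged usage sketch for get_create_file_row(): callers must adopt *both*
# returned values, because the input file may have been unlinked as a binary
# duplicate and the surviving copy can live at a different path. `release`
# and `new_archive_path` are illustrative names, not the real call sites.
def _attach_archive_sketch(self, release, new_archive_path):
    with self.db.session_context() as sess:
        file_row, file_path = self.get_create_file_row(sess, release, new_archive_path)

        # Point the release at whichever row survived deduplication, and use
        # file_path (not new_archive_path) from here on: the original file
        # may no longer exist.
        release.fileid = file_row.id
        sess.commit()
        return file_path
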