Example #1
    # Assumes module-level imports: os, json, tqdm, and sqlalchemy's desc().
    def check_file_hashes(self):
        self.log.info("Check file hashes!")
        bad_items = []
        try:
            with self.db.session_context() as sess:
                self.log.info("Loading file listing from db!")
                files = sess.query(self.db.ReleaseFile).order_by(desc(self.db.ReleaseFile.id)).all()
                self.log.info("Found %s files to scan", len(files))
                sess.commit()
                for have in tqdm.tqdm(files):
                    have_fqp = os.path.join(have.dirpath, have.filename)
                    if not os.path.exists(have_fqp):
                        self.log.error("File missing: %s", have_fqp)
                        bad_items.append(("missing", have.id, have_fqp))
                        continue

                    item_hash = hashfile.hash_file(have_fqp)

                    if item_hash != have.fhash:
                        self.log.error("File hash doesn't match the stored hash: %s (%s, %s)", have_fqp, item_hash, have.fhash)
                        bad_items.append(("mismatch", have.id, have_fqp))

        finally:
            # Write out the failure list even if the scan aborts partway through.
            with open("mismatches.json", "w") as fp:
                json.dump(bad_items, fp, indent=4)
            for item in bad_items:
                self.log.info("Failed: %s", item)
Example #2
    def validate_md5(self, archive_path):

        fhash = hashfile.hash_file(archive_path)
        itemRoot, itemFile = os.path.split(archive_path)

        with self.db.session_context() as sess:
            row = sess.query(self.db.ReleaseFile)                \
             .filter(self.db.ReleaseFile.dirpath == itemRoot)  \
             .filter(self.db.ReleaseFile.filename == itemFile) \
             .scalar()

            assert row, "No ReleaseFile row found for path: '%s'" % (archive_path, )
            assert row.fhash == fhash, "Hashes mismatch after fetch: '%s', '%s' (%s)" % (
                fhash,
                row.fhash,
                archive_path,
            )
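The queries in these examples imply a ReleaseFile model with at least id, dirpath, filename, and fhash columns. A minimal SQLAlchemy declarative sketch consistent with the call sites (the table name and column types are assumptions, and Example #3 additionally implies release and tag relationship collections that are omitted here):

from sqlalchemy import Column, Integer, Text
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class ReleaseFile(Base):
    __tablename__ = 'release_files'   # Assumed; not visible in the examples.

    id       = Column(Integer, primary_key=True)
    dirpath  = Column(Text, nullable=False)   # Directory portion of the file path.
    filename = Column(Text, nullable=False)   # Basename portion of the file path.
    fhash    = Column(Text, nullable=False)   # md5 hex digest of the file contents.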
Example #3
    def _create_or_update_file_entry_path(self,
                                          oldPath,
                                          newPath,
                                          setDeleted=False,
                                          setDuplicate=False,
                                          setPhash=False,
                                          reuse_sess=None):
        '''
        Re-point the ReleaseFile row for oldPath at newPath, reusing an
        existing row with a matching md5sum where possible, and carry any
        releases, tags, and status flags over to the surviving row.
        '''
        oldItemRoot, oldItemFile = os.path.split(oldPath)
        newItemRoot, newItemFile = os.path.split(newPath)

        assert oldPath != newPath

        with self.db.session_context(reuse_sess=reuse_sess) as sess:
            old_row = sess.query(self.db.ReleaseFile)                \
             .filter(self.db.ReleaseFile.dirpath == oldItemRoot)  \
             .filter(self.db.ReleaseFile.filename == oldItemFile) \
             .scalar()

            new_row = sess.query(self.db.ReleaseFile)                \
             .filter(self.db.ReleaseFile.dirpath == newItemRoot)  \
             .filter(self.db.ReleaseFile.filename == newItemFile) \
             .scalar()

            if not new_row:
                fhash = hashfile.hash_file(newPath)

                # Use an existing file row (if present), via the md5sum
                new_row = sess.query(self.db.ReleaseFile)          \
                 .filter(self.db.ReleaseFile.fhash == fhash) \
                 .scalar()

                if new_row:
                    self.log.info("Have existing row for fhash!")

                # But only if the existing file actually exists.
                if new_row and not os.path.exists(
                        os.path.join(new_row.dirpath, new_row.filename)):
                    self.log.warning(
                        "Existing row for hash exists, but its stored path is not valid (%s, %s)!",
                        new_row.dirpath, new_row.filename)

                    assert new_row.fhash == fhash
                    # Since an appropriate row exists but its path isn't valid,
                    # just point that row at the file on-disk.
                    new_row.dirpath = newItemRoot
                    new_row.filename = newItemFile

                else:
                    new_row = self.db.ReleaseFile(dirpath=newItemRoot,
                                                  filename=newItemFile,
                                                  fhash=fhash)

                    sess.add(new_row)
                    sess.flush()

            if not old_row:
                self.log.warning(
                    "Trying to update a file path, but no row exists for the old path!")
                self.log.warning("Dir path: '%s', fname: '%s'", oldItemRoot,
                                 oldItemFile)
                return

            releases = old_row.manga_releases + old_row.hentai_releases

            # Copy over the tags.
            for m_tag in old_row.manga_tags:
                new_row.manga_tags.add(m_tag)

            for h_tag in old_row.hentai_tags:
                new_row.hentai_tags.add(h_tag)

            # And then delete the old row (but only if it's changed,
            # since we might have matched against it by md5sum).
            if old_row.id != new_row.id:
                sess.delete(old_row)
            else:
                self.log.warning(
                    "Old row matches new row by md5sum. Not deleting old row.")
            # This flush seems to be required. Somehow.
            sess.flush()

            # Re-point any items that point to the old file to the new file
            for release in releases:
                self.log.info(
                    "Re-pointing release %s to new file (%s->%s), (%s->%s)",
                    release.id, oldPath, newPath, release.fileid, new_row.id)

                self.log.info("New row: %s, new_row id: %s", new_row,
                              new_row.id)
                release.fileid = new_row.id
                assert release.fileid

                # And set any flag(s) on the entries that pointed to the old files.
                if setDeleted:
                    release.deleted = setDeleted
                if setDuplicate:
                    release.was_duplicate = setDuplicate
                if setPhash:
                    release.phash_duplicate = setPhash
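Every example acquires its session through self.db.session_context, whose implementation isn't shown. A plausible sketch, assuming it is a commit-on-success / rollback-on-error context manager whose reuse_sess parameter lets nested calls share an outer session (all inferred from the call sites above, not from the real implementation):

import contextlib

@contextlib.contextmanager
def session_context(self, reuse_sess=None):
    # If the caller passed in an already-open session, reuse it and leave
    # commit/rollback/close to the outermost owner.
    if reuse_sess is not None:
        yield reuse_sess
        return

    sess = self.Session()   # Assumed sessionmaker-style factory.
    try:
        yield sess
        sess.commit()
    except Exception:
        sess.rollback()
        raise
    finally:
        sess.close()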
Example #4
    def get_create_file_row(self, sess, row, fqfilename):
        '''
        Given a path to a file, return a row for that file's contents.
        If no row exists, it is created. If a row for another file with
        exactly matching contents but a different name is found, it is
        used preferentially.

        Return is a 2-tuple of (file_row, file_path).
        file_path is guaranteed to point to a valid file.

        Note that the file pointed to by the input parameter fqfilename
        may actually be deleted, if it is found to be a binary duplicate
        of another existing file.
        '''

        # Round-trip via the filesystem because why not
        fhash = hashfile.hash_file(fqfilename)

        have = self._get_existing_file_by_hash(sess, fhash)

        dirpath, filename = os.path.split(fqfilename)
        if have:
            have_fqp = os.path.join(have.dirpath, have.filename)
            if have_fqp == fqfilename:
                self.log.error(
                    "Multiple instances of a releasefile created on same on-disk file!"
                )
                self.log.error("File: %s. Row id: %s", have_fqp, row.id)
                raise RuntimeError(
                    "Multiple instances of a releasefile created on same on-disk file!"
                )
            if os.path.exists(have_fqp):

                with open(have_fqp, "rb") as fp1:
                    fc1 = fp1.read()
                with open(fqfilename, "rb") as fp2:
                    fc2 = fp2.read()

                fc1_h = hashfile.hash_bytes(fc1)
                fc2_h = hashfile.hash_bytes(fc2)
                if fc1 != fc2:
                    self.log.error(
                        "Multiple instances of a releasefile with the same md5, but different contents?"
                    )
                    self.log.error("File 1: '%s' (%s, %s), Row id: %s",
                                   fqfilename, fhash, fc2_h, row.id)
                    self.log.error("File 2: '%s' (%s, %s).", have_fqp,
                                   have.fhash, fc1_h)
                    raise RuntimeError(
                        "Multiple instances of a releasefile with the same md5, but different contents?"
                    )

                if fqfilename == have_fqp:
                    # Defensive; the equal-path case already raised above.
                    self.log.warning("Row for file-path already exists?")
                    self.log.warning("Files: '%s', '%s'.", have_fqp,
                                     fqfilename)
                elif os.path.exists(have_fqp) and os.path.exists(fqfilename):
                    self.log.warning(
                        "Duplicate file found by md5sum search. Re-using existing file."
                    )
                    self.log.warning("Files: '%s', '%s'.", have_fqp,
                                     fqfilename)
                    os.unlink(fqfilename)
                else:
                    self.log.warning(
                        "Duplicate file found by md5sum search, but a file is missing?"
                    )
                    self.log.warning("Files: '%s', '%s'.", have_fqp,
                                     fqfilename)

                row.fileid = have.id
                return have, have_fqp
            else:
                self.log.warning(
                    "Duplicate file found by md5sum search, but existing file has been deleted."
                )
                self.log.warning("Files: '%s', '%s'.", have_fqp, fqfilename)

                have.dirpath = dirpath
                have.filename = filename

                return have, fqfilename

        else:

            new_row = self.db.ReleaseFile(dirpath=dirpath,
                                          filename=filename,
                                          fhash=fhash)

            sess.add(new_row)
            sess.flush()

            return new_row, fqfilename
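A hypothetical call site for get_create_file_row, assuming a release row with a fileid column as implied by the row.fileid assignment above and by Example #3 (the MangaRelease name, the wrapper method, and the path handling are invented for illustration):

    # Hypothetical usage from inside the owning class; every name except
    # get_create_file_row is assumed.
    def attach_file_to_release(self, release_id, fqfilename):
        with self.db.session_context() as sess:
            release = sess.query(self.db.MangaRelease).get(release_id)

            file_row, file_path = self.get_create_file_row(
                sess, release, fqfilename)

            # file_path may differ from fqfilename if the file turned out to
            # be a binary duplicate of one already on disk.
            release.fileid = file_row.id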