    def _check_outdated(self, dataset):
        storedir = self.config.marv.storedir
        setdir = os.path.join(storedir, str(dataset.setid))
        # Resolve the per-node symlinks inside the dataset's store directory
        latest = [os.path.realpath(x)
                  for x in [os.path.join(setdir, x) for x in os.listdir(setdir)]
                  if os.path.islink(x)]
        # Track the oldest mtime of any stored output, starting with detail.json
        oldest_mtime = utils.mtime(os.path.join(setdir, 'detail.json'))
        for nodedir in latest:
            for dirpath, dirnames, filenames in os.walk(nodedir):
                for name in filenames:
                    path = os.path.join(dirpath, name)
                    oldest_mtime = min(oldest_mtime, utils.mtime(path))
        # Outdated if any source file is newer (ms) than the oldest stored output
        dataset_mtime = max(x.mtime for x in dataset.files)
        dataset.outdated = int(oldest_mtime * 1000) < dataset_mtime
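
For orientation: these examples repeatedly call utils.mtime(path) and convert its result from seconds to milliseconds via int(mtime * 1000). A minimal sketch of such a helper, assuming it simply wraps os.stat and therefore raises OSError for missing paths (an assumption, not the project's actual implementation):

import os

def mtime(path):
    # Sketch only: modification time in float seconds; raises OSError if missing.
    return os.stat(path).st_mtime
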
Example #2
    async def make_dataset(self,
                           connection,
                           files,
                           name,
                           time_added=None,
                           discarded=False,
                           setid=None,
                           status=0,
                           timestamp=None,
                           _restore=None):
        # pylint: disable=too-many-arguments
        time_added = int(utils.now() *
                         1000) if time_added is None else time_added

        collection = await CollectionModel.filter(
            name=self.name).using_db(connection).first()
        dataset = await Dataset.create(collection=collection,
                                       name=name,
                                       discarded=discarded,
                                       status=status,
                                       time_added=time_added,
                                       timestamp=0,
                                       setid=setid or SetID.random(),
                                       acn_id=collection.acn_id,
                                       using_db=connection)

        if _restore:
            # Restoring from a dump: file entries already are dicts of File fields
            files = [
                File(dataset=dataset, idx=i, **x) for i, x in enumerate(files)
            ]
        else:
            # Fresh scan: stat each path and store its mtime in milliseconds
            files = [
                File(dataset=dataset,
                     idx=i,
                     mtime=int(utils.mtime(path) * 1000),
                     path=path,
                     size=stat.st_size) for i, (path, stat) in enumerate(
                         (path, utils.stat(path)) for path in files)
            ]

        dataset.timestamp = timestamp or max(x.mtime for x in files)
        await dataset.save(using_db=connection)
        await File.bulk_create(files, using_db=connection)

        await dataset.fetch_related('files', using_db=connection)

        storedir = self.config.marv.storedir
        store = Store(storedir, self.nodes)
        store.add_dataset(dataset, exists_okay=_restore)
        self.render_detail(dataset)
        return dataset
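
A minimal, hypothetical usage sketch for the coroutine above, reusing the scoped_session / site.db pattern from the async scan example further below; the collection object, file path and dataset name are placeholders:

async def add_one_dataset(collection):
    # Sketch only: 'collection' stands for an initialised instance of the class
    # these methods belong to; path and name are placeholders.
    async with scoped_session(collection.site.db) as connection:
        dataset = await collection.make_dataset(connection, ['/data/run1/log.bag'], 'run1')
        return dataset.setid
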
    def make_dataset(self, files, name, time_added=None, discarded=None, setid=None, status=None,
                     timestamp=None, _restore=None):
        setid = setid or SetID.random()
        if _restore:
            files = [File(idx=i, **x) for i, x in enumerate(files)]
        else:
            files = [File(idx=i, mtime=int(utils.mtime(path) * 1000), path=path, size=stat.st_size)
                     for i, (path, stat)
                     in enumerate((path, os.stat(path)) for path in files)]
        time_added = int(utils.now() * 1000) if time_added is None else time_added
        dataset = Dataset(collection=self.name,
                          files=files,
                          name=name,
                          discarded=discarded,
                          status=status,
                          time_added=time_added,
                          timestamp=timestamp or max(x.mtime for x in files),
                          setid=setid)

        storedir = self.config.marv.storedir
        store = Store(storedir, self.nodes)
        store.add_dataset(dataset, exists_okay=_restore)
        self.render_detail(dataset)
        return dataset
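
In the _restore branch of both make_dataset variants, files is expected to be a list of keyword dicts matching the File fields built in the non-restore branch; a hedged illustration with placeholder values:

restored = collection.make_dataset(
    # Sketch only: path/mtime/size are placeholders; mtime is in milliseconds.
    files=[{'path': '/data/run1/log.bag', 'mtime': 1500000000000, 'size': 1024}],
    name='run1',
    _restore=True,
)
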
    def scan(self, scanpath, dry_run=False):
        Listing = self.model.Listing
        log = getLogger('.'.join([__name__, self.name]))
        scanroot = next(x for x in self.scanroots if scanpath.startswith(x))
        if not os.path.isdir(scanpath):
            log.warning('%s does not exist or is not a directory', scanpath)

        log.verbose("scanning %s'%s'", 'dry_run ' if dry_run else '', scanpath)

        # missing/changed flag for known files
        startswith = File.path.like('{}%'.format(esc(scanpath)), escape='$')
        known_files = File.query.filter(startswith)\
                                .join(Dataset)\
                                .filter(Dataset.discarded.isnot(True))
        known_filenames = defaultdict(set)
        changes = defaultdict(list)  # all mtime/missing changes in one transaction
        for file in known_files:
            path = file.path
            known_filenames[os.path.dirname(path)].add(os.path.basename(path))
            try:
                mtime = utils.mtime(path)
                missing = False
            except OSError:
                mtime = None
                missing = True
            if missing ^ bool(file.missing):
                log.info("%s '%s'", 'lost' if missing else 'recovered', path)
                changes[file.dataset_id].append((file, missing))
            if mtime and int(mtime * 1000) > file.mtime:
                log.info("mtime newer '%s'", path)
                changes[file.dataset_id].append((file, mtime))

        # Apply missing/mtime changes
        if not dry_run and changes:
            ids = changes.keys()
            for dataset in Dataset.query.filter(Dataset.id.in_(ids)):
                check_outdated = False
                for file, change in changes.pop(dataset.id):
                    if isinstance(change, bool):
                        file.missing = change
                        dataset.missing = change
                    else:
                        file.mtime = int(change * 1000)
                        check_outdated = True
                if check_outdated:
                    self._check_outdated(dataset)
                dataset.time_updated = int(utils.now())
            assert not changes
            db.session.commit()

        # Scan for new files
        batch = []
        for directory, subdirs, filenames in os.walk(scanpath):
            # Ignore directories containing a .marvignore file
            if os.path.exists(os.path.join(directory, '.marvignore')):
                subdirs[:] = []
                continue

            # Ignore hidden directories and traverse subdirs alphabetically
            subdirs[:] = sorted([x for x in subdirs if x[0] != '.'])

            # Ignore hidden and known files
            known = known_filenames[directory]
            filenames = {x for x in filenames if x[0] != '.'}
            filenames = sorted(filenames - known)

            for name, files in self.scanner(directory, subdirs, filenames):
                files = [x if os.path.isabs(x) else os.path.join(directory, x)
                         for x in files]
                assert all(x.startswith(directory) for x in files), files
                if dry_run:
                    log.info("would add '%s': '%s'", directory, name)
                else:
                    dataset = self.make_dataset(files, name)
                    batch.append(dataset)
                    if len(batch) > 50:
                        self._add_batch(log, batch)

        if not dry_run and batch:
            self._add_batch(log, batch)

        log.verbose("finished %s'%s'", 'dry_run ' if dry_run else '', scanpath)
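
Both scan variants delegate dataset grouping to self.scanner(directory, subdirs, filenames), which is expected to yield (name, files) pairs whose files are either absolute or relative to directory. A minimal, hypothetical scanner sketch (the .bag grouping is only an illustration):

import os

def scanner(directory, subdirs, filenames):
    # Sketch only: one dataset per bag file, named after the file's basename.
    for filename in filenames:
        if filename.endswith('.bag'):
            yield os.path.splitext(filename)[0], [filename]
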
Example #5
    async def scan(self, scanpath, dry_run=False):  # noqa: C901
        # pylint: disable=too-many-locals,too-many-branches,too-many-statements

        log = getLogger('.'.join([__name__, self.name]))
        if not os.path.isdir(scanpath):
            log.warning('%s does not exist or is not a directory', scanpath)

        log.verbose("scanning %s'%s'", 'dry_run ' if dry_run else '', scanpath)

        # missing/changed flag for known files
        async with scoped_session(self.site.db) as connection:
            known_files = await File.filter(path__startswith=scanpath)\
                                    .filter(dataset__discarded__not=True)\
                                    .using_db(connection)
            known_filenames = defaultdict(set)
            # all mtime/missing changes in one transaction
            changes = defaultdict(list)
            for file in known_files:
                path = file.path
                known_filenames[os.path.dirname(path)].add(
                    os.path.basename(path))
                try:
                    mtime = utils.mtime(path)
                    missing = False
                except OSError:
                    mtime = None
                    missing = True
                if missing ^ bool(file.missing):
                    log.info("%s '%s'", 'lost' if missing else 'recovered',
                             path)
                    changes[file.dataset_id].append((file, missing))
                if mtime and int(mtime * 1000) > file.mtime:
                    log.info("mtime newer '%s'", path)
                    changes[file.dataset_id].append((file, mtime))

            # Apply missing/mtime changes
            if not dry_run and changes:
                ids = changes.keys()
                for dataset in await Dataset.filter(id__in=ids).using_db(connection):
                    check_outdated = False
                    for file, change in changes.pop(dataset.id):
                        if isinstance(change, bool):
                            file.missing = change
                            dataset.missing = change
                        else:
                            file.mtime = int(change * 1000)
                            check_outdated = True
                        await file.save(connection)
                    if check_outdated:
                        await dataset.fetch_related('files',
                                                    using_db=connection)
                        self._check_outdated(dataset)
                    dataset.time_updated = int(utils.now())
                    await dataset.save(connection)
                assert not changes

            # Scan for new files
            batch = []
            for directory, subdirs, filenames in utils.walk(scanpath):
                # TODO: for now we don't pass Path into scanner
                directory = str(directory)
                # Ignore directories containing a .marvignore file
                if os.path.exists(os.path.join(directory, '.marvignore')):
                    subdirs.clear()
                    continue

                # Ignore hidden directories and traverse subdirs alphabetically
                subdirs[:] = sorted(x for x in subdirs if x[0] != '.')

                # Ignore hidden and known files
                known = known_filenames[directory]
                filenames = sorted(x for x in filenames
                                   if x[0] != '.' and x not in known)

                if not filenames and not subdirs:
                    continue

                for name, files in self.scanner(directory, subdirs, filenames):
                    files = [
                        x if os.path.isabs(x) else os.path.join(directory, x)
                        for x in files
                    ]
                    assert all(x.startswith(directory) for x in files), files
                    if dry_run:
                        log.info("would add '%s': '%s'", directory, name)
                    else:
                        dataset = await self.make_dataset(
                            connection, files, name)
                        batch.append(dataset)
                        if len(batch) >= 50:
                            await self._upsert_listing(connection, log, batch)
                            batch.clear()

            if not dry_run and batch:
                await self._upsert_listing(connection, log, batch)

        log.verbose("finished %s'%s'", 'dry_run ' if dry_run else '', scanpath)
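
Finally, a hypothetical sketch of driving the async scan above for a dry run (assumes an initialised collection object; the scan path is a placeholder):

import asyncio

async def dry_run_scan(collection):
    # Sketch only: log what would be added without touching the database.
    await collection.scan('/data/scanroot', dry_run=True)

# asyncio.run(dry_run_scan(collection))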