コード例 #1
0
ファイル: fs.py プロジェクト: h3ndrik/spider
 def __init__(self, db, control, hashalgorithm='md5', metacrawl=True):
     """Set up a filesystem crawler for one controlled directory.

     Args:
         db: Database wrapper; stored on the instance and passed to ``Meta``.
         control: Object providing ``directory`` (the crawl root) and
             ``name`` (stored as the category).
         hashalgorithm: ``'md5'`` or ``'md5sum'`` selects ``md5sum``;
             ``'none'`` or ``'None'`` selects ``nonesum``.
         metacrawl: Stored as ``self.metacrawl``.

     Raises:
         ValueError: If ``hashalgorithm`` is none of the names above.
             ``ValueError`` derives from ``Exception``, so callers that
             caught the previously raised ``Exception`` keep working.
     """
     # Validate first so an unsupported algorithm fails before any
     # collaborator (e.g. Meta) is constructed.
     if hashalgorithm in ('md5', 'md5sum'):
         hashsum = md5sum
     elif hashalgorithm in ('none', 'None'):
         logger.info('Not computing hashsums')
         hashsum = nonesum
     else:
         raise ValueError('Hash algorithm not implemented')

     self.db = db
     self.control = control
     self.directory = control.directory
     self.category = control.name
     self.meta = Meta(self.db, control.name)
     self.metacrawl = metacrawl
     self.timeout = 2592000  # 30 days expressed in seconds
     self.hashsum = hashsum
コード例 #2
0
ファイル: fs.py プロジェクト: h3ndrik/spider
class FS(object):
    """All Filesystem interaction"""

    def __init__(self, db, control, hashalgorithm='md5', metacrawl=True):
        self.db = db
        self.control = control
        self.directory = control.directory
        self.category = control.name
        self.meta = Meta(self.db, control.name)
        self.metacrawl = metacrawl
        self.timeout = 2592000
        if hashalgorithm == 'md5' or hashalgorithm == 'md5sum':
            self.hashsum = md5sum
        elif hashalgorithm == 'none' or hashalgorithm == 'None':
            logger.info('Not computing hashsums')
            self.hashsum = nonesum
        else:
            self.hashsum = nonesum
            raise(Exception('Hash algorithm not implemented'))

    def walk(self):
        """Bring the Files table in line with what is on disk right now.

        Three phases: rescan the tree into TmpFiles via ``read_fs()``,
        call ``insertfile()`` for every scanned path that has no match in
        Files, then call ``removefile()`` for every Files row of this
        category that the scan did not see. Each phase ends with a commit.
        """
        logger.info('Reading Filesystem')
        self.read_fs()

        logger.info('Computing new files')
        # TODO: repair this join (flagged as broken by the original author).
        # NOTE(review): presumably SQLAlchemy columns, where ``== None``
        # builds an IS NULL test -- do not rewrite it as ``is None``.
        unmatched = self.db.session.query(TmpFiles).outerjoin(
            (Files, TmpFiles.filename == Files.filename))
        unmatched = unmatched.filter(Files.filename == None)  # noqa: E711
        unmatched = unmatched.filter(Files.removed == None)  # noqa: E711
        new_entries = unmatched.all()

        logger.info('Analyzing Files')
        for entry in new_entries:
            try:
                self.insertfile(entry.filename)
            except (IOError, OSError):
                # FileNotFoundError (Python 3) is an OSError subclass, so
                # this pair covers it on both Python 2 and Python 3.
                # TODO: insert with flag set
                logger.warning('Could not insert file \'%s\'. Probably bad encoding?' % entry.filename)
                self.control.errors += 1
                self.db.session.commit()
        self.db.session.commit()

        logger.info('Computing deleted files')
        # TODO: repair this join (flagged as broken by the original author).
        vanished = self.db.session.query(Files).filter(
            Files.category == self.category)
        vanished = vanished.outerjoin(
            (TmpFiles, Files.filename == TmpFiles.filename))
        vanished = vanished.filter(TmpFiles.filename == None)  # noqa: E711
        vanished = vanished.filter(Files.removed == None)  # noqa: E711
        stale_entries = vanished.all()

        logger.info('Removing')
        for entry in stale_entries:
            logger.info('Removing: ' + entry.filename)
            self.removefile(entry.filename)
        self.db.session.commit()

        # TODO: delete old database entries (and think of the meta table).

    def updatemeta(self):
        """Rebuild the automatically generated metadata of this category.

        Deletes every Metadata row flagged ``auto`` whose file belongs to
        this category, then calls ``self.meta.insertmeta`` for each Files
        row of the category. One commit at the end covers both phases.
        """
        logger.info('Dumping current metadata table')
        auto_rows = self.db.session.query(Metadata).filter(
            Metadata.file.has(Files.category == self.category))
        # ``== True`` is a column comparison, not a Python truth test.
        auto_rows = auto_rows.filter(Metadata.auto == True)  # noqa: E712
        for stale in auto_rows.all():
            self.db.session.delete(stale)
        # Deliberately no commit here; the deletes are committed together
        # with the freshly inserted metadata below.

        logger.info('Updating Metadata')
        # NOTE(review): rows already marked as removed are included too --
        # confirm that this is intended.
        category_files = self.db.session.query(Files).filter(
            Files.category == self.category).all()
        for entry in category_files:
            logger.debug('Updating Metadata of: ' + entry.filename)
            self.meta.insertmeta(entry.filename, entry.id)
        self.db.session.commit()

    def read_fs(self):
        """Scan ``self.directory`` and record every entry in TmpFiles.

        The temporary table is emptied first. Each directory and file
        found by ``os.walk`` is then added with its path and modification
        time. Entries that cannot be stat'ed are logged and skipped. The
        session is committed once, after the whole tree has been walked.
        """
        self.db.cleanTmpFiles()
        if sys.getfilesystemencoding() != 'utf-8':
            logger.warning('Filesystem-encoding is not UTF-8!')
        if isinstance(self.directory, bytes):
            # Walk with a text path so os.walk yields text names.
            self.directory = self.directory.decode('utf-8', 'surrogateescape')

        for dirpath, dirnames, filenames in os.walk(self.directory):
            logger.debug(''.join(["Reading dir: ", dirpath]))
            # Sub-directories first, then files; both are recorded the same
            # way and differ only in the warning emitted on failure.
            batches = (
                (dirnames, 'Could not read dir: '),
                (filenames, 'Could not read file: '),
            )
            for names, failure_prefix in batches:
                for name in names:
                    try:
                        filename = os.path.join(dirpath, name)
                        mtime = os.stat(filename).st_mtime
                        # The encode/decode round trip substitutes any
                        # character that cannot be encoded as UTF-8.
                        storable = filename.encode('utf-8', 'replace').decode('utf-8', 'strict')
                        self.db.add(TmpFiles(filename=storable, mtime=mtime))
                    except (IOError, OSError):
                        # FileNotFoundError (Python 3) is an OSError
                        # subclass, so this pair covers it as well.
                        logger.warning(''.join([failure_prefix, filename]))

            # TODO: prune directories that should not be visited (e.g. by
            # removing '.git' from dirnames).
        self.db.session.commit()

    def insertfile(self, filename):
        """Add a Files row describing 'filename' and optionally its metadata.

        Regular files get a guessed MIME type and a hash from
        ``self.hashsum``. Directories get the MIME type ``"directory"`` and
        no hash. Anything else is stored without size, mtime, MIME type or
        hash, and with ``removed`` already set. When ``self.metacrawl`` is
        true, ``self.meta.insertmeta`` is called with the new row's id.
        """
        is_file = os.path.isfile(filename)
        if is_file or os.path.isdir(filename):
            if is_file:
                logger.debug("Updating(New): " + filename)
                mime, _encoding = mimetypes.guess_type(filename)
                digest = self.hashsum(filename)
            else:
                logger.debug("Updating(New,Dir): " + filename)
                mime, digest = "directory", None
            filestat = os.stat(filename)
            mtime, size = filestat.st_mtime, filestat.st_size
            removed = None
            firstseen = time()
        else:
            # Neither file nor directory -- probably due to encoding.
            logger.debug("Updating(Could not read): " + filename)
            mime = digest = mtime = size = None
            removed = time()
            firstseen = time()

        record = Files(filename=filename, category=self.category, mtime=mtime,
                       firstseen=firstseen, size=size, mime=mime, hash=digest,
                       removed=removed)
        self.db.add(record)
        # Flush before record.id is read (presumably so the database has
        # assigned it by then).
        self.db.session.flush()
        if self.metacrawl:
            self.meta.insertmeta(filename, record.id)

    def removefile(self, filename):
        """Mark 'filename' as removed.

        Fetches the Files row whose ``filename`` equals the argument and
        stores the result of ``time()`` in its ``removed`` attribute. The
        visible code does not commit; ``walk()`` commits after its removal
        loop.

        NOTE(review): ``.one()`` presumably raises unless exactly one row
        matches, and the lookup is not restricted to ``self.category`` --
        confirm both are intended.
        """
        item = self.db.session.query(Files).filter(Files.filename == filename).one()
        item.removed = time()