Example #1
0
 def test1(self):
     """Crawl "C://" with a 10-file cap; fail the test on IntegrityError.

     On failure the session is closed first, then the Crawl table is
     dropped and recreated so later tests start from a clean schema.
     """
     try:
         file_crawler = FileCrawler("C://", self.session, max_files=10)
     except IntegrityError as e:
         # BUGFIX: `except IntegrityError, e:` is Python-2-only syntax and
         # `e.message` is deprecated; `as e` + str(e) work on 2.6+ and 3.x.
         self.session.close()
         Crawl.dropAndCreate(str(e))
         self.fail(str(e))
Example #2
0
    def setUp(self):
        """Create the Crawl table on first use and verify it is present."""
        TestCase.setUp(self)
        from lib.Crawl import Crawl

        table_missing = not Crawl.exists()
        if table_missing:
            Crawl.createTable()
        # The table must exist after setup regardless of the branch taken.
        self.assertTrue(Crawl.exists())
Example #3
0
    def setUp(self):
        """Require an existing Crawl table, then create the Record table.

        Unlike the Crawl table (assumed created by earlier setup), the
        Record table is created unconditionally here and verified after.
        """
        TestCase.setUp(self)
        from lib.Crawl import Crawl

        # Precondition: the Crawl table must already exist.
        self.assertTrue(Crawl.exists())
        Record.createTable()
        self.assertTrue(Record.exists())
Example #4
0
 def testGvizDataTable(self):
     """Crawl.getGvizDataTable() yields a gviz DataTable whose response
     text is wrapped in the google.visualization.Query.setResponse callback."""
     import gviz_api
     import re

     session = SqlAlchemySessionFactory().createSqlAlchemySession()
     table = Crawl.getGvizDataTable()
     self.assertIsInstance(table, gviz_api.DataTable)
     response_pattern = r"^google\.visualization\.Query\.setResponse\({.*}\);$"
     # re.match(pattern, string) is equivalent to compiling then matching.
     self.assertIsNotNone(re.match(response_pattern, table.ToResponse()))
     session.close()
Example #5
0
 def __init__(self, path, sqlalchemy_session, max_files=None):
     """Prepare a crawl of *path*; persists a new Crawl row immediately.

     path -- root directory to walk
     sqlalchemy_session -- session used for all persistence in this crawl
     max_files -- optional cap on the number of files processed
     """
     Thread.__init__(self)
     self.path = path
     self.maxFiles = max_files
     self.sqlAlchemySession = sqlalchemy_session
     self.hostName = _getHostName()
     self.skipCount = 0
     # Record the crawl start in the database right away so the row (and
     # its id) exist before the thread's run() begins.
     self.crawl = Crawl()
     self.crawl.begin()
     self.sqlAlchemySession.add(self.crawl)
     self.sqlAlchemySession.commit()
Example #6
0
 def test2(self):
     session = Session()
     info(Crawl.getGvizDataTable(session))
     session.close()
Example #7
0
class FileCrawler(Thread):
    """Thread that walks a directory tree and persists one FileRecord per
    file found, tied to a Crawl row that tracks progress statistics.
    """
    # NOTE(review): this empty __slots__ is ineffective -- Thread instances
    # provide a __dict__, which is where all attributes set in __init__ live.
    __slots__ = ()
    
    def __init__(self, path, sqlalchemy_session, max_files=None):
        """Persist a new Crawl row immediately; the walk starts in run().

        path -- root directory to crawl
        sqlalchemy_session -- session used for all DB work in this crawl
        max_files -- optional cap on the number of processed files
        """
        Thread.__init__(self)
        self.skipCount = 0
        self.maxFiles = max_files
        self.path = path
        self.sqlAlchemySession = sqlalchemy_session
        self.hostName = _getHostName()
        self.crawl = Crawl()
        self.crawl.begin()
        self.sqlAlchemySession.add(self.crawl)
        self.sqlAlchemySession.commit()
    
    def run(self):
        """Walk self.path, recording metadata for each file, then commit
        and close the session."""
        self.crawl.begin()
        for root, dirs, files in os.walk(self.path):
            # BUGFIX: os.walk only honors pruning when ``dirs`` is mutated
            # in place; the old ``dirs = dirs[1:]`` rebound the local name
            # and excluded nothing.  Only the first entry is checked here,
            # preserving the original (if unusual) exclusion rule.
            if dirs and dirs[0] in EXCLUDE_DIRECTORIES:
                del dirs[0]
            for f in files:
                file_record = FileRecord()
                path = root + os.path.sep + f
                absolute_path = os.path.abspath(path)
                url = "file://" + self.hostName + "/" + absolute_path
                file_record.setUrl(url)
                file_record.setCrawlId(self.crawl.crawlId)
                # Skip files already recorded recently for this agent.
                if file_record.exists(self.crawl.agentId, self.sqlAlchemySession, BEST_BEFORE_PERIOD_IN_SECOND):
                    self.skipCount += 1
                    continue
                stat = os.stat(path)
                # Hash in a worker thread while we fill in the metadata.
                git_blob_hash = _GitBlobHash(path, stat)
                git_blob_hash.start()
                file_record.setSize(stat.st_size)
                created_datetime = datetime.fromtimestamp(stat.st_ctime)
                created_datetime = created_datetime.replace(tzinfo=dateutil.tz.tzlocal())
                file_record.setCreated(created_datetime)
                last_modified_datetime = datetime.fromtimestamp(stat.st_mtime)
                last_modified_datetime = last_modified_datetime.replace(tzinfo=dateutil.tz.tzlocal())
                file_record.setLastModified(last_modified_datetime) # naive or aware?
                file_record.setLastSeen(utcnow())
                git_blob_hash.join()
                hash_string = git_blob_hash.getGitBlobHash()
                if hash_string is not None:
                    file_record.setUri("git:///blob/" + hash_string)
                self.sqlAlchemySession.add(file_record)
                self.crawl.increment(git_blob_hash.getReadSize())
            # Stop descending once the optional file cap is reached.
            if self.maxFiles and self.crawl.getNumberOfProcessedItems() >= self.maxFiles:
                break
        self.crawl.end()
        self.sqlAlchemySession.commit()
        self.sqlAlchemySession.close()
    
    def getNumberOfProcessedFiles(self):
        """Number of files recorded so far in this crawl."""
        return self.crawl.getNumberOfProcessedItems()
    
    def getNumberOfProcessedBytes(self):
        """Total bytes read so far in this crawl."""
        return self.crawl.getNumberOfProcessedBytes()
    
    def getFilesPerSecond(self):
        """Current throughput in files per second."""
        return self.crawl.getFilesPerSecond()
    
    def getBytesPerSecond(self):
        """Current throughput in bytes per second."""
        return self.crawl.getBytesPerSecond()
    
    def __str__(self):
        """Human-readable progress summary with locale-grouped numbers."""
        locale.setlocale(locale.LC_ALL, "")
        return "%dsec %s (%s) bytes, %s/%s (%d) files" % (self.crawl.getElapsedSeconds(),
                                                          locale.format("%d", self.getNumberOfProcessedBytes(), grouping=True),
                                                          locale.format("%d", self.getBytesPerSecond(), grouping=True),
                                                          locale.format("%d", self.getNumberOfProcessedFiles(), grouping=True),
                                                          locale.format("%d", self.skipCount, grouping=True),
                                                          self.getFilesPerSecond())
Example #8
0
 def setUp(self):
     """Build the Record and Crawl tables, then open a fresh session."""
     # Same order as before: Record first, then Crawl, each verified.
     for table in (Record, Crawl):
         table.createTable()
         self.assertTrue(table.exists())
     self.session = SqlAlchemySessionFactory().createSqlAlchemySession()
 def setUp(self):
     """Create the Record and Crawl tables (no existence checks)."""
     for table in (Record, Crawl):
         table.createTable()