Ejemplo n.º 1
0
    def db_prepare(self):
        """Look up (or create) the resource row for this crawl and flag it busy.

        Generator-based coroutine: each ``yield`` hands a database call back
        to the driver to be waited on (presumably Twisted's
        ``inlineCallbacks``; the decorator is not visible here -- TODO confirm).

        The resource is looked up by ``self.data['name']``.  When no row
        exists, a ``ResourceMeta`` row and a ``Resources`` row are inserted
        from ``self.data`` and the new resource is re-read.

        On success ``self.data`` gains the keys ``resource_id``,
        ``resource_meta_id``, ``resource`` and ``resource_meta``, and the
        resource's crawl start time, base path and address are refreshed.

        Raises:
            Exception: if the resource's meta row is already flagged busy.
        """
        # @TODO: race condition - use transactions (pool.runInteraction())

        query = select([Resources]).where(Resources.c.name == self.data['name'])
        resource = yield tx_pool.runQuery(query)

        if not resource:
            options = self.data['options']

            # The meta row is created first because the resource row stores
            # its id in ``meta_id``.
            query = (ResourceMeta.insert().values(
                recursive_sizes=options['recursive_foldersizes'],
                web_user_agent=options['user-agent'],
                auth_user=options['auth_user'],
                auth_pass=options['auth_pass']
            ).returning(ResourceMeta.c.id))

            meta_id = yield tx_pool.runQuery(query)
            meta_id = meta_id[0].id

            query = (Resources.insert().values(
                name=self.data['name'],
                address=self.data['address'],
                port=options['port'],
                protocol=FileProtocols().id_by_name(self.data['method']),
                display_url=options['display_url'],
                date_crawl_start=datetime.now(),
                basepath=self.data['basepath'],
                meta_id=meta_id
            ).returning(Resources.c.id))

            resource_id = yield tx_pool.runQuery(query)
            self.data['resource_id'] = resource_id[0].id

            # Re-read the full row so both branches continue with the same
            # kind of result object.
            query = select([Resources]).where(Resources.c.id == self.data['resource_id'])
            resource = yield tx_pool.runQuery(query)

        resource = resource[0]

        query = select([ResourceMeta]).where(ResourceMeta.c.id == resource.meta_id)
        resource_meta = yield tx_pool.runQuery(query)
        resource_meta = resource_meta[0]

        if resource_meta.busy:
            raise Exception('This resource is already being crawled')

        self.data['resource_id'] = resource.id
        self.data['resource_meta_id'] = resource.meta_id
        self.data['resource'] = resource
        self.data['resource_meta'] = resource_meta

        # The toggle is itself a generator-based coroutine; yield it so the
        # busy flag is written (and any failure surfaces here) before the
        # crawl proceeds.  Previously the call was fired and never awaited.
        yield self.db_busy_crawling_toggle()

        # One UPDATE instead of three round trips against the same row.
        query = (
            Resources.update()
            .where(Resources.c.id == resource.id)
            .values(
                date_crawl_start=datetime.now(),
                basepath=self.data['basepath'],
                address=self.data['address']))
        yield tx_pool.runOperation(query)
Ejemplo n.º 2
0
    def db_calculate_filedistribution(resource_id):
        """Compute the per-category share of a resource's files and store it.

        Generator-based coroutine: each ``yield`` hands a database call back
        to the driver (presumably Twisted's ``inlineCallbacks``).  There is no
        ``self`` parameter, so this is presumably wrapped as a staticmethod;
        neither decorator is visible here -- TODO confirm.

        For every entry in ``FileCategories().data`` the number of ``Files``
        rows with that ``file_format`` is counted and expressed as a
        percentage of all files of the resource, formatted with one decimal
        (e.g. ``"12.5"``).  The mapping is written as JSON to
        ``ResourceMeta.file_distribution``.

        Args:
            resource_id: value matched against ``Files.resource_id``.  It is
                also used as the ``ResourceMeta.id`` of the row to update.
        """
        log.msg("[%s] Calculating file distributions" % resource_id)

        file_distribution = {}

        query = (
            select([func.count()])
            .select_from(Files)
            .where(Files.c.resource_id == resource_id))
        rows = yield tx_pool.runQuery(query)
        total_file_count = int(rows[0].count_1)

        for category, file_format in FileCategories().data.iteritems():
            # With no files at all every share is zero; skip the query and
            # avoid the division by zero the percentage would otherwise hit.
            if not total_file_count:
                file_distribution[category] = 0
                continue

            query = (
                select([func.count()])
                .select_from(Files)
                .where(Files.c.file_format == file_format)
                .where(Files.c.resource_id == resource_id))
            rows = yield tx_pool.runQuery(query)

            if rows:
                count = int(rows[0].count_1)
                pct = 100 * float(count) / float(total_file_count)
                file_distribution[category] = "%.1f" % pct
            else:
                file_distribution[category] = 0

        # NOTE(review): the meta row is selected by ``resource_id``, not by
        # the resource's ``meta_id``.  This only hits the right row if the two
        # ids coincide -- confirm against the schema / callers.
        query = (
            ResourceMeta.update()
            .where(ResourceMeta.c.id == resource_id)
            .values(file_distribution=json.dumps(file_distribution)))
        yield tx_pool.runOperation(query)

        log.msg("[%s] Calculating file distributions DONE" % resource_id)
Ejemplo n.º 3
0
    def db_finalize(self):
        """Publish the results of a finished crawl and clear the busy flag.

        Generator-based coroutine: each ``yield`` hands a database call back
        to the driver (presumably Twisted's ``inlineCallbacks``; the decorator
        is not visible here -- TODO confirm).

        Steps, in order:
          1. stamp ``date_crawl_end`` on the resource;
          2. delete the ``Files`` rows currently stored for the resource;
          3. re-assign the rows stored under the negated resource id to the
             real id (presumably rows staged during this crawl -- confirm
             against the insert code);
          4. store the number of inserted files on the meta row;
          5. optionally recompute recursive folder sizes;
          6. recompute the file distribution;
          7. toggle the busy flag.
        """
        resource_id = self.data['resource_id']

        query = (
            Resources.update()
            .where(Resources.c.id == resource_id)
            .values(date_crawl_end=datetime.now()))
        yield tx_pool.runOperation(query)

        query = Files.delete().where(Files.c.resource_id == resource_id)
        yield tx_pool.runOperation(query)

        query = (
            Files.update()
            .where(Files.c.resource_id == '-%s' % str(resource_id))
            .values(resource_id=resource_id))
        yield tx_pool.runOperation(query)

        query = (
            ResourceMeta.update()
            .where(ResourceMeta.c.id == self.data['resource_meta_id'])
            .values(file_count=self.resource.db_files_inserts))
        yield tx_pool.runOperation(query)

        # NOTE(review): this tests for the presence of the key, not for a
        # truthy value -- confirm that a disabled option is really absent.
        if 'recursive_foldersizes' in self.data['options']:
            yield self.db_calculate_foldersizes()

        # Both calls below are generator-based coroutines and used to be
        # fired without being awaited, so this method reported completion
        # before they ran and their errors were lost.  They are now yielded;
        # ``finally`` keeps the busy flag toggle unconditional after the
        # distribution step, as it effectively was before.
        try:
            yield self.db_calculate_filedistribution(resource_id=resource_id)
        finally:
            yield self.db_busy_crawling_toggle()
Ejemplo n.º 4
0
    def db_calculate_foldersizes(self):
        """Set each directory row's ``file_size`` to the sum of its subtree.

        Generator-based coroutine: each ``yield`` hands a database call back
        to the driver (presumably Twisted's ``inlineCallbacks``; the decorator
        is not visible here -- TODO confirm).

        Directories are visited breadth-first starting at ``'/'``.  For every
        directory entry found in the directory being visited, the sizes of all
        ``Files`` rows whose ``file_path`` lies at or below that entry's path
        are summed and written to the entry's own row.
        """
        from collections import deque

        resource_id = self.data['resource_id']
        log.msg("[%s] Recursively calculating folder sizes" % resource_id)

        # deque gives O(1) pops from the front of the work queue.
        pending = deque(['/'])
        while pending:
            file_path = pending.popleft()

            query = (
                select([Files.c.file_name, Files.c.file_path, Files.c.file_size, Files.c.file_isdir])
                .where(Files.c.resource_id == resource_id)
                .where(Files.c.file_path == file_path))
            entries = yield tx_pool.runQuery(query)

            for file_name in [entry.file_name for entry in entries if entry.file_isdir]:
                subdir_path = '%s%s/' % (file_path, file_name)

                # Match on the path *including* its trailing slash so that
                # "/foo" does not also pick up "/foobar/".  LIKE wildcards in
                # the name are escaped so "_" and "%" match literally.
                pattern = (
                    subdir_path
                    .replace('\\', '\\\\')
                    .replace('%', '\\%')
                    .replace('_', '\\_')) + '%'

                query = (
                    select([func.sum(Files.c.file_size)])
                    .where(Files.c.resource_id == resource_id)
                    .where(Files.c.file_path.like(pattern, escape='\\')))
                result = yield tx_pool.runQuery(query)

                # SUM yields NULL/None when nothing matched; store 0 then.
                summed = result[0].sum_1
                size = long(summed) if summed else 0

                query = (
                    Files.update()
                    .where(Files.c.resource_id == resource_id)
                    .where(Files.c.file_path == file_path)
                    .where(Files.c.file_name == file_name)
                    .where(Files.c.file_isdir == True)  # noqa: E712 (SQL expression)
                    .values(file_size=size))
                yield tx_pool.runOperation(query)

                pending.append(subdir_path)

        log.msg("[%s] Recursively calculating folder sizes DONE" % resource_id)
Ejemplo n.º 5
0
    def db_busy_crawling_toggle(self):
        """Flip the ``busy`` flag on this crawl's ``ResourceMeta`` row.

        Generator-based coroutine: each ``yield`` hands a database call back
        to the driver (presumably Twisted's ``inlineCallbacks``; the decorator
        is not visible here -- TODO confirm).

        The row is identified by ``self.data['resource_meta_id']``.  Its
        current ``busy`` value is read first; a truthy value is replaced by
        ``0`` and a falsy one by ``1``.
        """
        lookup = select([ResourceMeta]).where(
            ResourceMeta.c.id == self.data['resource_meta_id'])
        rows = yield tx_pool.runQuery(lookup)
        current = rows[0]

        if current.busy:
            flipped = 0
        else:
            flipped = 1

        update = ResourceMeta.update().where(
            ResourceMeta.c.id == self.data['resource_meta_id'])
        yield tx_pool.runOperation(update.values(busy=flipped))