def db_calculate_filedistribution(resource_id):
    """Compute the per-category file share of a resource and persist it.

    Counts all ``Files`` rows of ``resource_id``, then counts the rows per
    entry of ``FileCategories().data`` (category key -> ``file_format``
    value) and stores the resulting mapping, JSON-encoded, in
    ``ResourceMeta.file_distribution`` of the row whose id equals
    ``resource_id``.

    Each percentage is stored as a string with one decimal (``"%.1f"``);
    categories for which no figure can be computed are stored as ``0``.

    This is a generator that yields the results of ``tx_pool`` calls; it is
    presumably driven by Twisted's ``inlineCallbacks`` -- confirm at the
    definition/call site.

    :param resource_id: id used to filter ``Files.resource_id`` and to select
        the ``ResourceMeta`` row to update.
    """
    log.msg("[%s] Calculating file distributions" % resource_id)
    file_distribution = {}

    query = (select([func.count()])
             .select_from(Files)
             .where(Files.c.resource_id == resource_id))
    total_rows = yield tx_pool.runQuery(query)
    total_file_count = int(total_rows[0].count_1)

    for category, file_format in FileCategories().data.iteritems():
        if not total_file_count:
            # A resource without files would otherwise divide by zero below;
            # there is nothing to count either, so skip the query entirely.
            file_distribution[category] = 0
            continue

        query = (select([func.count()])
                 .select_from(Files)
                 .where(Files.c.file_format == file_format)
                 .where(Files.c.resource_id == resource_id))
        rows = yield tx_pool.runQuery(query)
        if rows:
            count = int(rows[0].count_1)
            pct = 100 * float(count) / float(total_file_count)
            file_distribution[category] = "%.1f" % pct
        else:
            file_distribution[category] = 0

    query = (ResourceMeta.update()
             .where(ResourceMeta.c.id == resource_id)
             .values(file_distribution=json.dumps(file_distribution)))
    yield tx_pool.runOperation(query)
    log.msg("[%s] Calculating file distributions DONE" % resource_id)
def db_prepare(self):
    """Look up (or create) the resource being crawled and mark it busy.

    Steps, all driven by values in ``self.data``:

    1. Select the ``Resources`` row whose name is ``self.data['name']``.
    2. If none exists, insert a ``ResourceMeta`` row and a ``Resources`` row
       from ``self.data`` / ``self.data['options']`` and re-select the new
       resource by id.
    3. Load the matching ``ResourceMeta`` row and refuse to continue when its
       ``busy`` flag is set.
    4. Store ``resource_id``, ``resource_meta_id``, ``resource`` and
       ``resource_meta`` in ``self.data``.
    5. Toggle the busy flag and refresh ``date_crawl_start``, ``basepath``
       and ``address`` on the resource row.

    This is a generator that yields the results of ``tx_pool`` calls; it is
    presumably driven by Twisted's ``inlineCallbacks`` -- confirm at the
    definition/call site.

    :raises Exception: if the resource's meta row is already marked busy.
    """
    # @TODO: race condition - use transactions (pool.runInteraction())
    query = (select([Resources])
             .where(Resources.c.name == self.data['name']))
    resource = yield tx_pool.runQuery(query)

    if not resource:
        options = self.data['options']

        query = (ResourceMeta.insert().values(
            recursive_sizes=options['recursive_foldersizes'],
            web_user_agent=options['user-agent'],
            auth_user=options['auth_user'],
            auth_pass=options['auth_pass']
        ).returning(ResourceMeta.c.id))
        meta_id = yield tx_pool.runQuery(query)
        meta_id = meta_id[0].id

        query = (Resources.insert().values(
            name=self.data['name'],
            address=self.data['address'],
            port=options['port'],
            protocol=FileProtocols().id_by_name(self.data['method']),
            display_url=options['display_url'],
            date_crawl_start=datetime.now(),
            basepath=self.data['basepath'],
            meta_id=meta_id
        ).returning(Resources.c.id))
        resource_id = yield tx_pool.runQuery(query)
        self.data['resource_id'] = resource_id[0].id

        # Re-read the freshly inserted row so both branches end up with a
        # result list holding the full resource record.
        query = (select([Resources])
                 .where(Resources.c.id == self.data['resource_id']))
        resource = yield tx_pool.runQuery(query)

    resource = resource[0]

    query = (select([ResourceMeta])
             .where(ResourceMeta.c.id == resource.meta_id))
    resource_meta = yield tx_pool.runQuery(query)
    resource_meta = resource_meta[0]

    if resource_meta.busy:
        raise Exception('This resource is already being crawled')

    self.data['resource_id'] = resource.id
    self.data['resource_meta_id'] = resource.meta_id
    self.data['resource'] = resource
    self.data['resource_meta'] = resource_meta

    # Wait for the toggle to finish: without the yield the busy flag might
    # not be set yet when this method completes, and any failure inside the
    # toggle would go unnoticed.
    yield self.db_busy_crawling_toggle()

    # One UPDATE instead of three single-column statements on the same row.
    query = (Resources.update()
             .where(Resources.c.id == resource.id)
             .values(date_crawl_start=datetime.now(),
                     basepath=self.data['basepath'],
                     address=self.data['address']))
    yield tx_pool.runOperation(query)
def db_calculate_foldersizes(self):
    """Store, for every directory row of the resource, the size of its subtree.

    Walks the directory tree breadth-first starting at ``'/'``.  For each
    directory entry found under the current path, sums ``file_size`` of all
    ``Files`` rows of the resource whose ``file_path`` lies at or below that
    directory and writes the sum into the directory's own row (``0`` when the
    sum is empty).

    This is a generator that yields the results of ``tx_pool`` calls; it is
    presumably driven by Twisted's ``inlineCallbacks`` -- confirm at the
    definition/call site.

    NOTE(review): the sum has no ``file_isdir`` filter, so sizes already
    stored on nested directory rows are part of the total -- confirm that
    this is intended.
    """
    from collections import deque

    resource_id = self.data['resource_id']
    log.msg("[%s] Recursively calculating folder sizes" % resource_id)

    # FIFO queue of directory paths (always ending in '/') still to visit.
    pending = deque(['/'])
    while pending:
        file_path = pending.popleft()

        query = (
            select([Files.c.file_name, Files.c.file_path,
                    Files.c.file_size, Files.c.file_isdir])
            .where(Files.c.resource_id == resource_id)
            .where(Files.c.file_path == file_path))
        entries = yield tx_pool.runQuery(query)

        for file_name in [entry.file_name for entry in entries
                          if entry.file_isdir]:
            subtree = '%s%s/' % (file_path, file_name)

            # The trailing '/' keeps '/foo' from also matching '/foobar/...'.
            # LIKE wildcards occurring in real names ('_' is common) are
            # escaped so they only match themselves.
            pattern = (subtree
                       .replace('\\', '\\\\')
                       .replace('%', '\\%')
                       .replace('_', '\\_')) + '%'

            query = (
                select([func.sum(Files.c.file_size)])
                .where(Files.c.resource_id == resource_id)
                .where(Files.c.file_path.like(pattern, escape='\\')))
            result = yield tx_pool.runQuery(query)
            total = result[0].sum_1
            # SUM over zero rows yields NULL/None.
            size = long(total) if total else 0

            query = (
                Files.update()
                .where(Files.c.resource_id == resource_id)
                .where(Files.c.file_path == file_path)
                .where(Files.c.file_name == file_name)
                # SQL expression, not a Python comparison: keep '== True'.
                .where(Files.c.file_isdir == True)  # noqa: E712
                .values(file_size=size))
            yield tx_pool.runOperation(query)

            pending.append(subtree)

    log.msg("[%s] Recursively calculating folder sizes DONE" % resource_id)
def db_busy_crawling_toggle(self):
    """Flip the ``busy`` flag of this crawl's ``ResourceMeta`` row.

    Reads the row identified by ``self.data['resource_meta_id']`` and writes
    back ``0`` when ``busy`` is currently truthy, ``1`` otherwise.

    This is a generator that yields the results of ``tx_pool`` calls; it is
    presumably driven by Twisted's ``inlineCallbacks`` -- confirm at the
    definition/call site.
    """
    lookup = select([ResourceMeta]).where(
        ResourceMeta.c.id == self.data['resource_meta_id'])
    rows = yield tx_pool.runQuery(lookup)
    current = rows[0]

    # Read-then-write: the new value is derived from the row just fetched.
    if current.busy:
        flipped = 0
    else:
        flipped = 1

    statement = ResourceMeta.update()
    statement = statement.where(
        ResourceMeta.c.id == self.data['resource_meta_id'])
    statement = statement.values(busy=flipped)
    yield tx_pool.runOperation(statement)