def db_calculate_filedistribution(resource_id):
    """Compute the per-category file percentage breakdown for a resource
    and persist it as JSON in ``ResourceMeta.file_distribution``.

    Runs one COUNT query for the resource's total file count, then one
    COUNT per known file category, and stores each category's share as a
    ``"%.1f"`` percentage string.

    :param resource_id: id of the resource whose files are summarized
    """
    log.msg("[%s] Calculating file distributions" % resource_id)
    file_distribution = {}

    query = (select([func.count()])
             .select_from(Files)
             .where(Files.c.resource_id == resource_id))
    total_file_count = yield tx_pool.runQuery(query)
    total_file_count = int(total_file_count[0].count_1)

    for k, v in FileCategories().data.iteritems():
        query = (select([func.count()])
                 .select_from(Files)
                 .where(Files.c.file_format == v)
                 .where(Files.c.resource_id == resource_id))
        count = yield tx_pool.runQuery(query)
        if count:
            count = int(count[0].count_1)
            # BUGFIX: a resource with zero files previously raised
            # ZeroDivisionError here; report 0.0% for every category instead.
            if total_file_count:
                pct = 100 * float(count) / float(total_file_count)
            else:
                pct = 0.0
            file_distribution[k] = "%.1f" % pct
        else:
            file_distribution[k] = 0

    query = (ResourceMeta.update()
             .where(ResourceMeta.c.id == resource_id)
             .values(file_distribution=json.dumps(file_distribution)))
    yield tx_pool.runOperation(query)
    log.msg("[%s] Calculating file distributions DONE" % resource_id)
def db_prepare(self):
    """Look up (or create) the Resources/ResourceMeta rows for this crawl
    and cache them in ``self.data``.

    If no resource with ``self.data['name']`` exists, inserts a new
    ResourceMeta row followed by a Resources row pointing at it. Refuses
    to proceed (raises ``Exception``) when the meta row is already
    flagged busy, otherwise flips the busy flag and refreshes the
    resource's crawl-start time, basepath and address.

    :raises Exception: when the resource is already being crawled
    """
    # @TODO: race condition - use transactions (pool.runInteraction())
    query = select([Resources]).where(Resources.c.name == self.data['name'])
    resource = yield tx_pool.runQuery(query)

    if not resource:
        # Meta row first: Resources carries a foreign key to it.
        query = (ResourceMeta.insert().values(
            recursive_sizes=self.data['options']['recursive_foldersizes'],
            web_user_agent=self.data['options']['user-agent'],
            auth_user=self.data['options']['auth_user'],
            auth_pass=self.data['options']['auth_pass']
        ).returning(ResourceMeta.c.id))
        meta_id = yield tx_pool.runQuery(query)
        meta_id = meta_id[0].id

        query = (Resources.insert().values(
            name=self.data['name'],
            address=self.data['address'],
            port=self.data['options']['port'],
            protocol=FileProtocols().id_by_name(self.data['method']),
            display_url=self.data['options']['display_url'],
            date_crawl_start=datetime.now(),
            basepath=self.data['basepath'],
            meta_id=meta_id
        ).returning(Resources.c.id))
        resource_id = yield tx_pool.runQuery(query)
        self.data['resource_id'] = resource_id[0].id

        # Re-read the freshly inserted row so both branches continue
        # with a full Resources record.
        query = select([Resources]).where(Resources.c.id == self.data['resource_id'])
        resource = yield tx_pool.runQuery(query)

    resource = resource[0]
    resource_meta = yield tx_pool.runQuery(
        select([ResourceMeta]).where(ResourceMeta.c.id == resource.meta_id))
    resource_meta = resource_meta[0]

    if resource_meta.busy:
        raise Exception('This resource is already being crawled')

    self.data['resource_id'] = resource.id
    self.data['resource_meta_id'] = resource.meta_id
    self.data['resource'] = resource
    self.data['resource_meta'] = resource_meta

    # BUGFIX: the toggle returns a Deferred; previously it was fired
    # without yield, so its errors were dropped and the busy flag could
    # land after the updates below. (Yielding a non-Deferred is a no-op
    # under inlineCallbacks, so this is safe either way.)
    yield self.db_busy_crawling_toggle()

    # One UPDATE instead of the original three round-trips to the same row.
    query = (Resources.update()
             .where(Resources.c.id == resource.id)
             .values(date_crawl_start=datetime.now(),
                     basepath=self.data['basepath'],
                     address=self.data['address']))
    yield tx_pool.runOperation(query)
def db_finalize(self):
    """Finish a crawl: stamp the end time, swap the freshly crawled file
    rows in for the stale ones, store the final file count and derived
    statistics, and release the busy flag.
    """
    query = (Resources.update()
             .where(Resources.c.id == self.data['resource_id'])
             .values(date_crawl_end=datetime.now()))
    yield tx_pool.runOperation(query)

    # The crawl wrote its new rows under the negated resource id
    # ('-<id>'): delete the previous crawl's rows, then flip the new
    # ones live by rewriting their resource_id.
    query = Files.delete().where(Files.c.resource_id == self.data['resource_id'])
    yield tx_pool.runOperation(query)

    query = (Files.update()
             .where(Files.c.resource_id == '-%s' % str(self.data['resource_id']))
             .values(resource_id=self.data['resource_id']))
    yield tx_pool.runOperation(query)

    query = (ResourceMeta.update()
             .where(ResourceMeta.c.id == self.data['resource_meta_id'])
             .values(file_count=self.resource.db_files_inserts))
    yield tx_pool.runOperation(query)

    if 'recursive_foldersizes' in self.data['options']:
        yield self.db_calculate_foldersizes()

    # BUGFIX: both calls return Deferreds; previously they were fired
    # without yield, so finalize could return before they completed and
    # any failure was silently lost. Yielding a non-Deferred is a no-op
    # under inlineCallbacks, so this is safe even if either helper is
    # not decorated.
    yield self.db_calculate_filedistribution(resource_id=self.data['resource_id'])
    yield self.db_busy_crawling_toggle()
def db_busy_crawling_toggle(self):
    """Flip the ``busy`` flag on this crawl's ResourceMeta row.

    Reads the current row, inverts ``busy`` (1 <-> 0) and writes it
    back. NOTE(review): read-then-write, so concurrent togglers can
    race — same caveat as the @TODO in db_prepare().
    """
    meta_id = self.data['resource_meta_id']

    rows = yield tx_pool.runQuery(
        select([ResourceMeta]).where(ResourceMeta.c.id == meta_id))
    current = rows[0]

    flipped = 0 if current.busy else 1
    update = (ResourceMeta.update()
              .where(ResourceMeta.c.id == meta_id)
              .values(busy=flipped))
    yield tx_pool.runOperation(update)