def get_stats(self): import pprint printer = pprint.PrettyPrinter(indent=4) db = DatabasePlainFiles('stats/') stats = db.loadDbase('stats17028') #tag cloud """ tag_cloud = [] for tag in stats['tags']: if stats['tags'][tag] > 15: #5 is okay for i in range(int(stats['tags'][tag] / 15)): tag_cloud.append(tag) import json db.saveDbaseRaw('tag_cloud', json.dumps(tag_cloud)) """ #tags overall tag_usage = 0 tag_count = 0 for tag in stats['tags']: tag_usage = tag_usage + stats['tags'][tag] tag_count = tag_count + 1 print tag_usage print tag_count #format statistics """
def read_data_folder(self, resume_from=14061):
    '''Aggregate per-field value statistics over the pickled files in data/.

    Every entry of ``data/`` (except ``package_list``) is unpickled and its
    fields are passed one by one to ``self.add_to_stats(value, field, stats)``.
    After each processed file the running ``stats`` are saved through
    ``DatabasePlainFiles('stats/')`` under the name ``'stats' + str(num)``,
    where ``num`` is the file's index in the directory listing.

    Parameters:
        resume_from: index of the checkpoint to continue from. The database
            ``'stats' + str(resume_from)`` is loaded and files with a smaller
            index are skipped. The default (14061) reproduces the previously
            hard-coded behaviour. Pass ``0`` or ``None`` to start from empty
            statistics and process every file.

    NOTE(review): the indentation of the original was lost. The checkpoint
    save is placed inside the loop (the numbered checkpoint names suggest
    this) and the final print after the loop -- confirm both.
    NOTE(review): the file with index ``resume_from`` is processed again
    after the checkpoint is loaded; if that checkpoint already includes it,
    it is counted twice -- confirm.
    '''
    # Single-quoted docstring delimiters are deliberate: the preceding source
    # line ends with an opening triple double-quote, so adding more of those
    # here could flip string boundaries in the file.
    import pickle

    # get data folder list
    data_folder = 'data/'
    file_list = os.listdir(data_folder)

    db = DatabasePlainFiles('stats/')
    if resume_from:
        stats = db.loadDbase('stats' + str(resume_from))
    else:
        # Fresh run: one empty bucket per tracked field (assumed to be the
        # shape add_to_stats expects -- it was the original initial value).
        stats = dict((field, {}) for field in (
            'maintainer', 'isopen', 'author', 'version', 'license_id',
            'type', 'mimetype', 'format', 'resource_type', 'tags', 'groups',
            'license', 'license_title', 'geographic_coverage',
            'geographical_granularity', 'temporal_coverage-from',
            'temporal_coverage-to', 'temporal_granularity',
            'national_statistic', 'precision', 'series', 'date_released',
            'categories',
        ))
    first_index = resume_from or 0

    dataset_fields = ('maintainer', 'isopen', 'author', 'version', 'type')
    resource_fields = ('mimetype', 'format', 'resource_type')
    license_fields = ('license', 'license_title')
    extras_fields = (
        'geographic_coverage', 'geographical_granularity',
        'temporal_coverage-from', 'temporal_coverage-to',
        'temporal_granularity', 'series', 'precision',
        'national_statistic', 'date_released', 'categories',
    )

    for num, name in enumerate(file_list):
        print(num)
        if num < first_index or name == "package_list":
            continue

        # SECURITY: pickle.load can execute arbitrary code; only run this on
        # files in data/ that come from a trusted source.
        # The with-block closes the file even when unpickling fails.
        with open(os.path.join(data_folder, name)) as handle:
            dataset = pickle.load(handle)

        for field in dataset_fields:
            self.add_to_stats(dataset[field], field, stats)
        for resource in dataset['resources']:
            for field in resource_fields:
                self.add_to_stats(resource[field], field, stats)
        for tag in dataset['tags']:
            self.add_to_stats(tag, 'tags', stats)
        for group in dataset['groups']:
            self.add_to_stats(group, 'groups', stats)
        for field in license_fields:
            self.add_to_stats(dataset[field], field, stats)

        # Extras are optional and handled best-effort. Each key is tried on
        # its own so that one missing entry no longer drops all the keys
        # after it. Only Exception is caught, so Ctrl+C and SystemExit still
        # stop the run.
        for field in extras_fields:
            try:
                self.add_to_stats(dataset['extras'][field], field, stats)
            except Exception:
                continue

        # output stats to file (checkpoint named after the current index)
        db.saveDbase('stats' + str(num), stats)

    print('script executed!')