# Stdlib imports these functions rely on; `flatten`, `Nest`, `write_yaml`, and
# `log` are project-local helpers assumed to be defined elsewhere in this module.
import csv
import os
from collections import defaultdict, OrderedDict
from functools import reduce
from operator import itemgetter


def get_rows(json_all):
    # index each record by end date, then by project, keeping its per-country data
    json_tree = defaultdict(dict)
    for json_f in json_all:
        json_tree[json_f['end']][json_f['project']] = json_f['countries']
    # log.debug('f: %s' % (json.dumps(json_all, indent=2)))
    # expand tree structure of dictionaries into list of dicts with named fields
    rows = list(flatten(json_tree, [], ['date', 'project', 'country', 'cohort', 'count']))
    by_date = Nest().key(itemgetter('date')).map(rows)
    # everything is by date, so everyone wants things sorted
    by_date = OrderedDict(sorted(by_date.items()))
    return by_date
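# `flatten` and `Nest` are not defined in this file. The sketches below are
# hypothetical reconstructions inferred from the call sites above (named with
# a `_` prefix and `Sketch` suffix to avoid colliding with the real helpers);
# they illustrate the expected behavior, not the project's implementations.
def _flatten_sketch(tree, path, names):
    # Depth-first walk of a nested dict: once the accumulated path is one key
    # short of `names`, `tree` is the leaf value, so emit one flat row.
    if len(path) == len(names) - 1:
        yield dict(zip(names, path + [tree]))
    else:
        for key, subtree in tree.items():
            for row in _flatten_sketch(subtree, path + [key], names):
                yield row


class _NestSketch(object):
    # d3.nest-style grouping with a single key level: key(fn) registers the
    # grouping function, map(rows) returns {key: [rows with that key]}.
    def __init__(self):
        self._key = None

    def key(self, fn):
        self._key = fn
        return self

    def map(self, rows):
        grouped = defaultdict(list)
        for row in rows:
            grouped[self._key(row)].append(row)
        return dict(grouped)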
def write_overall_datasource(projects, json_all, args):
    log.info('writing overall datasource')
    _id = 'overall'
    name = 'Overall Editors by Language'

    # build rows
    keys = ['end', 'project', 'world']  # note: currently unused
    json_tree = defaultdict(lambda: defaultdict(int))
    for json_f in json_all:
        json_tree[json_f['end']][json_f['project']] = json_f['world']
    # expand cohorts
    rows = list(flatten(json_tree, [], ['date', 'project', 'cohort', 'count']))
    # group by date
    by_date = Nest().key(itemgetter('date')).map(rows)
    by_date = OrderedDict(sorted(by_date.items()))

    csv_name = args.basename + '_' + _id + '.csv'
    csv_path = os.path.join(args.datafile_dir, csv_name)

    # build one CSV row per date, keyed by 'project (cohort)' column names
    csv_rows = []
    for date, row_batch in by_date.items():
        # TODO: need to be extracting the top level field 'world' (note the lowercase)
        csv_row = {'date': date}
        for row in row_batch:
            csv_row['%s (%s)' % (row['project'], row['cohort'])] = row['count']
        csv_rows.append(csv_row)

    # normalize fields: union of all column names across rows, with 'date' first
    all_fields = sorted(reduce(set.__ior__, map(set, map(dict.keys, csv_rows)), set()))
    all_fields.remove('date')
    all_fields.insert(0, 'date')

    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, all_fields, restval='', extrasaction='ignore')
        writer.writeheader()
        for csv_row in csv_rows:
            writer.writerow(csv_row)

    # def write_yaml(_id, name, fields, csv_name, rows, args):
    return write_yaml(_id, name, all_fields, csv_name, by_date, args)
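# Hypothetical usage sketch, exercising the reconstruction helpers above
# rather than the real module pipeline. The record shape is inferred from the
# key accesses in write_overall_datasource ('end', 'project', 'world'); the
# projects, cohorts, and counts are illustrative, not real data.
def _demo_overall_rows():
    sample_json_all = [
        {'end': '2013-01-01', 'project': 'en', 'world': {'all': 1200, 'active': 300}},
        {'end': '2013-01-01', 'project': 'de', 'world': {'all': 500, 'active': 120}},
    ]
    tree = defaultdict(lambda: defaultdict(int))
    for rec in sample_json_all:
        tree[rec['end']][rec['project']] = rec['world']
    rows = list(_flatten_sketch(tree, [], ['date', 'project', 'cohort', 'count']))
    by_date = _NestSketch().key(itemgetter('date')).map(rows)
    # the CSV step would then produce columns like 'en (all)' and 'de (active)'
    return OrderedDict(sorted(by_date.items()))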