if not row['labels'][0].startswith('Average') and not row['labels'][0].startswith('Median'): match = TABLE_NAME_PATTERN.match(row['labels'][0]) if not match: if not row['labels'][0]: continue fix_row = rows.next() dfr = dictify_row(fix_row) row['labels'][0] += ' %s' % dfr['labels'][1] match = TABLE_NAME_PATTERN.match(row['labels'][0]) if not match: logging.warn( "Expected a table name at row %i [%s]" % ( row_count, row['labels'][0] ) ) continue name_dict = match.groupdict() table['name'] = name_dict['name'] table['size'] = int(name_dict['size']) else: # there's a line number key = utils.generate_stat_key(row['table_id'],row['line']) parent = parent_key = None if row['indent'] > 0: chk_line = row['line'] while parent is None and chk_line > 1: chk_line -= 1 parent_key = utils.generate_stat_key(row['table_id'],chk_line) chk_parent = table['labels'][parent_key] if chk_parent['indent'] == row['indent'] - 1: parent = chk_parent parent['has_children'] = True parent_key = parent['key'] last_processed = { 'key': key, 'text': row['labels'][row['indent']],
# Split the accumulated column headers into one CSV header file per data file.
#
# Reads (defined outside this fragment): current_file, table_name, field_num,
# headers, FILES_TO_FIRST_TABLE_MAP, FIXED_HEADERS, TABLE_REGEX, utils.
# Writes: 'sf_data_2010_headers_<current_file>.csv', and rebinds
# current_file / headers whenever a file boundary is crossed.
#
# NOTE(review): the original indentation of this fragment was lost. Everything
# down to `headers.append(key)` reads like the body of a per-field loop whose
# header is not visible here -- re-indent under that loop when splicing.

# Final table contains all remaining
# NOTE(review): 47 is presumably the index of the last output file, which has
# no "next file" entry to compare against -- confirm against
# FILES_TO_FIRST_TABLE_MAP.
if current_file != 47:
    # Have we switched files?  The next file starts at the table named in
    # element [0] of its map entry.
    if table_name.strip('.') == FILES_TO_FIRST_TABLE_MAP[current_file + 1][0]:
        # Check against, and report, the SAME expected count.  The message
        # used to print the next file's count (current_file + 1) although the
        # comparison uses the current file's count, so a failure could report
        # a misleading "expected" number.
        expected_count = FILES_TO_FIRST_TABLE_MAP[current_file][1]
        if len(headers) != expected_count:
            raise AssertionError(
                'Only found %i/%i headers for file %i'
                % (len(headers), expected_count, current_file))

        # Flush the headers collected for the file we are leaving.
        with open('sf_data_2010_headers_%i.csv' % current_file, 'w') as f:
            f.write(','.join(FIXED_HEADERS))
            f.write(',')
            f.write(','.join(headers))
            f.write('\n')

        # Start collecting for the next file.
        current_file += 1
        headers = []
        # Single-argument parenthesised form prints the same text under
        # Python 2's print statement and Python 3's print function.
        print('Switched to file %i at table %s' % (current_file, table_name))

# NOTE(review): `parts` is not used anywhere in the visible fragment; kept in
# case code outside this view reads it.
parts = TABLE_REGEX.match(table_name)
key = utils.generate_stat_key(table_name, field_num)
headers.append(key)

# Write final file
# NOTE(review): presumably meant to run once, after the enclosing loop has
# finished, so the last file's headers are flushed -- confirm placement when
# re-indenting.
with open('sf_data_2010_headers_%i.csv' % current_file, 'w') as f:
    f.write(','.join(FIXED_HEADERS))
    f.write(',')
    f.write(','.join(headers))
    f.write('\n')
if not row['labels'][0]: continue fix_row = rows.next() dfr = dictify_row(fix_row) row['labels'][0] += ' %s' % dfr['labels'][1] match = TABLE_NAME_PATTERN.match( row['labels'][0]) if not match: logging.warn( "Expected a table name at row %i [%s]" % (row_count, row['labels'][0])) continue name_dict = match.groupdict() table['name'] = name_dict['name'] table['size'] = int(name_dict['size']) else: # there's a line number key = utils.generate_stat_key(row['table_id'], row['line']) parent = parent_key = None if row['indent'] > 0: chk_line = row['line'] while parent is None and chk_line > 1: chk_line -= 1 parent_key = utils.generate_stat_key( row['table_id'], chk_line) chk_parent = table['labels'][parent_key] if chk_parent['indent'] == row['indent'] - 1: parent = chk_parent parent['has_children'] = True parent_key = parent['key'] last_processed = { 'key':