Ejemplo n.º 1
0
                        # Rows whose first label starts with 'Average' or 'Median' are
                        # ignored by this branch; any other row is expected to carry the
                        # table name and size in its first label.
                        if not row['labels'][0].startswith('Average') and not row['labels'][0].startswith('Median'): 
                            match = TABLE_NAME_PATTERN.match(row['labels'][0])
                            if not match:
                                # An empty first label cannot be completed into a name; move on.
                                if not row['labels'][0]: continue
                                # Consume the following row from the iterator and append its
                                # second label to this row's first label, then retry the match.
                                # NOTE(review): presumably handles a name wrapped over two
                                # rows -- confirm against the input data.
                                # NOTE(review): rows.next() is the Python 2 iterator method and
                                # would raise StopIteration if no rows remain -- confirm that
                                # cannot happen here.
                                fix_row = rows.next()
                                dfr = dictify_row(fix_row)
                                row['labels'][0] += ' %s' % dfr['labels'][1]
                                match = TABLE_NAME_PATTERN.match(row['labels'][0])
                                if not match:
                                    # Still not a table name: log it and give up on this row.
                                    logging.warn( "Expected a table name at row %i [%s]" % ( row_count, row['labels'][0]  ) )
                                    continue
                            # The pattern exposes named groups 'name' and 'size'; size is
                            # stored as an int.
                            name_dict = match.groupdict()
                            table['name'] = name_dict['name']
                            table['size'] = int(name_dict['size'])
                else: # there's a line number
                    key = utils.generate_stat_key(row['table_id'],row['line'])
                    parent = parent_key = None
                    # Indented rows look for a parent: walk backwards through the earlier
                    # line numbers until an entry exactly one indent level shallower is
                    # found.
                    # NOTE(review): if the walk reaches line 1 without a match, parent
                    # stays None but parent_key keeps the last candidate key -- confirm
                    # that whatever reads parent_key afterwards expects this.
                    if row['indent'] > 0:
                        chk_line = row['line']
                        while parent is None and chk_line > 1:
                            chk_line -= 1
                            parent_key = utils.generate_stat_key(row['table_id'],chk_line)
                            # NOTE(review): assumes an entry exists for every earlier line
                            # number; a gap would fail this lookup -- confirm.
                            chk_parent = table['labels'][parent_key]
                            if chk_parent['indent'] == row['indent'] - 1:
                                parent = chk_parent
                                # Flag the parent entry so it is known to have children.
                                parent['has_children'] = True
                                parent_key = parent['key']

                    last_processed = {
                        'key': key,
                        'text': row['labels'][row['indent']],
        # Final table contains all remaining
        # NOTE(review): presumably 47 is the number of the last file, which has no
        # successor entry to switch to -- confirm against FILES_TO_FIRST_TABLE_MAP.
        if current_file != 47:
            # Have we switched files? A switch is detected when this table is the
            # first table recorded for the next file.
            next_first_table = FILES_TO_FIRST_TABLE_MAP[current_file + 1][0]
            if table_name.strip('.') == next_first_table:
                # Validate the file being closed against its own expected header
                # count, and report that same count in the error (the message
                # previously printed the next file's count).
                expected_count = FILES_TO_FIRST_TABLE_MAP[current_file][1]
                if len(headers) != expected_count:
                    raise AssertionError(
                        'Only found %i/%i headers for file %i'
                        % (len(headers), expected_count, current_file))

                # Write the single header line for the file that just ended:
                # the fixed headers followed by the collected stat keys.
                with open('sf_data_2010_headers_%i.csv' % current_file, 'w') as f:
                    f.write(','.join(FIXED_HEADERS))
                    f.write(',')
                    f.write(','.join(headers))
                    f.write('\n')

                # Start collecting headers for the next file.
                current_file += 1
                headers = []

                # Parenthesized single argument prints identically under the
                # Python 2 print statement.
                print('Switched to file %i at table %s' % (current_file, table_name))
        
        # NOTE(review): `parts` is not read anywhere in the visible code -- confirm
        # whether this match is still needed.
        parts = TABLE_REGEX.match(table_name)

        # Record this field's stat key as the next header for the current file.
        key = utils.generate_stat_key(table_name,field_num)
        headers.append(key)

    # Write final file: the headers still held in memory go out as one
    # comma-separated line (fixed headers first, then the collected keys).
    with open('sf_data_2010_headers_%i.csv' % current_file, 'w') as out:
        out.write('%s,%s\n' % (','.join(FIXED_HEADERS), ','.join(headers)))

Ejemplo n.º 3
0
                                # An empty first label cannot be completed into a name; move on.
                                if not row['labels'][0]: continue
                                # Consume the following row from the iterator and append its
                                # second label to this row's first label, then retry the match.
                                # NOTE(review): presumably handles a name wrapped over two
                                # rows -- confirm against the input data.
                                # NOTE(review): rows.next() is the Python 2 iterator method and
                                # would raise StopIteration if no rows remain -- confirm that
                                # cannot happen here.
                                fix_row = rows.next()
                                dfr = dictify_row(fix_row)
                                row['labels'][0] += ' %s' % dfr['labels'][1]
                                match = TABLE_NAME_PATTERN.match(
                                    row['labels'][0])
                                if not match:
                                    # Still not a table name: log it and give up on this row.
                                    logging.warn(
                                        "Expected a table name at row %i [%s]"
                                        % (row_count, row['labels'][0]))
                                    continue
                            # The pattern exposes named groups 'name' and 'size'; size is
                            # stored as an int.
                            name_dict = match.groupdict()
                            table['name'] = name_dict['name']
                            table['size'] = int(name_dict['size'])
                else:  # there's a line number
                    key = utils.generate_stat_key(row['table_id'], row['line'])
                    parent = parent_key = None
                    # Indented rows look for a parent: walk backwards through the earlier
                    # line numbers until an entry exactly one indent level shallower is
                    # found.
                    # NOTE(review): if the walk reaches line 1 without a match, parent
                    # stays None but parent_key keeps the last candidate key -- confirm
                    # that whatever reads parent_key afterwards expects this.
                    if row['indent'] > 0:
                        chk_line = row['line']
                        while parent is None and chk_line > 1:
                            chk_line -= 1
                            parent_key = utils.generate_stat_key(
                                row['table_id'], chk_line)
                            # NOTE(review): assumes an entry exists for every earlier line
                            # number; a gap would fail this lookup -- confirm.
                            chk_parent = table['labels'][parent_key]
                            if chk_parent['indent'] == row['indent'] - 1:
                                parent = chk_parent
                                # Flag the parent entry so it is known to have children.
                                parent['has_children'] = True
                                parent_key = parent['key']

                    last_processed = {
                        'key':