Example #1
def export_cells_not_in(inTable, noTable, outTable, inSheet, noSheet, inFID,
                        noFID):
    """
    Export to a new file the rows of inTable whose ID is not in noTable
    """

    import xlrd
    import xlwt
    from gasp.fm import tbl_to_obj
    from gasp.pyt.xls.fld import col_name, get_columns_position
    from gasp.pyt.xls.summ import list_unique_values_column

    # TODO: check if tables are xls

    # Get Data
    inData = tbl_to_obj(inTable, sheet=inSheet, output='array')
    COLUMNS = col_name(inTable, sheet_name=inSheet)

    # From noDATA, get IDS that will not be in the outTable
    noXls = xlrd.open_workbook(noTable)
    _noSheet = noXls.sheet_by_name(noSheet)
    colsPosition = get_columns_position(_noSheet, noFID)
    noFIDS = list_unique_values_column(_noSheet, colsPosition[noFID])

    # Create Output
    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(inSheet)

    # Write columns titles
    for c in range(len(COLUMNS)):
        new_sheet.write(0, c, COLUMNS[c])

    # Write data not in noData
    l = 1
    for row in inData:
        if row[inFID] not in noFIDS:
            c = 0
            for col in COLUMNS:
                new_sheet.write(l, c, row[col])
                c += 1

            l += 1

    out_xls.save(outTable)

    return outTable
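A minimal usage sketch; the file names, sheet names, and ID column below are hypothetical placeholders and assume the gasp package (with xlrd/xlwt) is available.

# Hypothetical call: copy to enrolled_only.xls the rows of enrolled.xls whose
# 'student_id' does not appear in graduated.xls.
export_cells_not_in(
    'enrolled.xls', 'graduated.xls', 'enrolled_only.xls',
    'Sheet1', 'Sheet1', 'student_id', 'student_id'
)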
Example #2
def groups_to_lines(xls_data, xls_sheet,
                    group_column, name_colum,
                    cls_column, interest_column,
                    column_labels, xls_output):
    """
    Table:
      | group | label | class | interest
    0 |  AT   | ATLA1 |   0   |    50
    1 |  AT   | ATLA2 |   1   |    70
    2 |  AT   | ATLA3 |   2   |    80
    3 |  AT   | ATLA4 |   2   |    90
    4 |  AT   | ATLA5 |   2   |    10
    5 |  PT   | PTLA1 |   0   |    20
    6 |  PT   | PTLA2 |   2   |    30
    7 |  PT   | PTLA3 |   2   |    40
    8 |  PT   | PTLA4 |   2   |    52
    9 |  PT   | PTLA5 |   1   |    61
    10|  PT   | PTLA6 |   2   |    12

    Will be converted to a python object such as:
    table = {
        AT: {
            group_name        : ATLA1,
            group_value       : interest of line 0,
            master_unit_value : interest of line 1,
            units_value       : [interest of line 2, 3, 4]
        },
        PT: {
            group_name        : PTLA1,
            group_value       : interest of line 5,
            master_unit_value : interest of line 9,
            units_value       : [interest of line 6, 7, 8, 10],
        }
    }

    Using this python object, the output table will be written as:

      | GROUP_NAME | GROUP_VALUE | MASTER_UNIT_VALUE | UNITS_1 | ... | UNITS_N
    0 |    ATLA1   |     50      |        70         |    80   | ... |   10
    1 |    PTLA1   |     20      |        61         |    30   | ... |   12
    """
    
    import xlrd
    import xlwt
    from gasp.pyt.xls.fld import get_columns_position

    interest_columns = [group_column, cls_column, name_colum, interest_column]

    # Open input
    __xls_data = xlrd.open_workbook(xls_data)
    sheet = __xls_data.sheet_by_name(xls_sheet)

    # Get columns position
    """
    cols_position = {
        column_name: position_index, ...
    }
    """
    cols_position = get_columns_position(sheet, interest_columns)

    if interest_column not in cols_position.keys():
        return "Interest column not encountered in data file"

    # Get data of spreadsheet for the interest column
    """
    data = {
        group: {
            group_name        : name,
            group_value       : value,
            master_unit_value : value,
            units_value       : [value_unit1, ..., value_unitn]
        },
    }
    """
    data = {}
    groups_name = {}
    for row in range(1, sheet.nrows):
        group_id = str(sheet.cell(row, cols_position[group_column]).value)

        row_class = int(sheet.cell(row, cols_position[cls_column]).value)

        row_name = sheet.cell(row, cols_position[name_colum]).value

        row_value = sheet.cell(row, cols_position[interest_column]).value

        if group_id not in data.keys():
            groups_name[row_name] = group_id
            if row_class == 0:
                data[group_id] = {
                    'group_name': row_name,
                    'group_value': row_value,
                    'units_value': []
                }

            elif row_class == 1:
                data[group_id] = {
                    'master_unit_value': row_value,
                    'units_value': []
                }

            elif row_class == 2:
                data[group_id] = {
                    'units_value': []
                }

            else:
                return 'Class value not recognized'

        else:
            if row_class == 0:
                groups_name[row_name] = group_id
                data[group_id].update({
                    'group_name': row_name,
                    'group_value': row_value
                })

            elif row_class == 1:
                data[group_id].update({
                    'master_unit_value': row_value,
                })

            elif row_class == 2:
                data[group_id]['units_value'].append(row_value)

            else:
                return 'Class value not recognized'

    # Write Output
    out_xls = xlwt.Workbook()
    out_sheet = out_xls.add_sheet(interest_column)

    # Write first line
    col_to_write = 0
    for k in data:
        c = len(data[k]['units_value'])
        if c > col_to_write:
            col_to_write = c
    for i in range(col_to_write):
        out_sheet.write(0, i+3, i+1)

    # Write second line
    for i in range(len(column_labels)):
        out_sheet.write(1, i, column_labels[i])

    aux_titles = ['Min', 'Max', 'EU28_Average']
    for i in range(len(aux_titles)):
        out_sheet.write(1, col_to_write + 3 + i, aux_titles[i])

    # Organize groups alphabetically based on names
    grp_names = sorted(groups_name.keys())
    groups = []
    for name in grp_names:
        groups.append(groups_name[name])
    # Write data
    l = []
    for k in range(len(groups)):
        out_sheet.write(k+2, 0, data[groups[k]]['group_name'])

        out_sheet.write(k+2, 1, data[groups[k]]['group_value'])

        if 'master_unit_value' not in data[groups[k]]:
            out_sheet.write(k+2, 2, data[groups[k]]['group_value'])
            master_unit_key = 'group_value'
        else:
            out_sheet.write(k+2, 2, data[groups[k]]['master_unit_value'])
            master_unit_key = 'master_unit_value'

        for i in range(len(data[groups[k]]['units_value'])):
            out_sheet.write(k+2, i+3, data[groups[k]]['units_value'][i])

        t = data[groups[k]]['units_value'] + [data[groups[k]][master_unit_key]]

        l += t

    # Write global min and max
    n_l = []
    for i in l:
        if type(i) == type(10) or type(i) == type(10.0):
            n_l.append(i)
    out_sheet.write(2, col_to_write + 3, min(n_l))
    out_sheet.write(2, col_to_write + 4, max(n_l))
    out_xls.save(xls_output)
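A hedged usage sketch mirroring the docstring's example table; the paths and labels are placeholders.

# Hypothetical call: collapse each 'group' into a single output line, spreading
# the 'interest' values over GROUP_VALUE, MASTER_UNIT_VALUE and UNITS_* columns.
groups_to_lines(
    'indicators.xls', 'data', 'group', 'label', 'class', 'interest',
    ['GROUP_NAME', 'GROUP_VALUE', 'MASTER_UNIT_VALUE'], 'groups_as_lines.xls'
)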
Example #3
def split_col_by_year(dataFile, sheetName, codeCol, yearCol, out_file):
    """
    For a table:
    
       | code | year | U | LTU
    0  |  PT1 | 2000 | x |  x
    1  |  PT2 | 2000 | x |  x
    2  |  PT3 | 2000 | x |  x
    3  |  PT1 | 2001 | x |  x
    4  |  PT2 | 2001 | x |  x
    5  |  PT3 | 2001 | x |  x
    6  |  PT1 | 2002 | x |  x
    7  |  PT2 | 2002 | x |  x
    8  |  PT3 | 2002 | x |  x
    9  |  PT1 | 2003 | x |  x
    10 |  PT2 | 2003 | x |  x
    11 |  PT3 | 2003 | x |  x
    12 |  PT1 | 2004 | x |  x
    13 |  PT2 | 2004 | x |  x
    14 |  PT3 | 2004 | x |  x
    
    The output will be:
    
       | code | U_2000 | U_2001 | ... | LTU_2000 | ... | LTU_2004
    0  | PT1  |    x   |    x   | ... |    x     | ... |    x
    1  | PT2  |    x   |    x   | ... |    x     | ... |    x
    2  | PT3  |    x   |    x   | ... |    x     | ... |    x
    
    The output file will be written as a tab-delimited text file.
    """
    
    import codecs
    import xlrd
    from gasp.pyt.xls.fld  import get_columns_position, get_columns_position_outside_options
    from gasp.pyt.xls.summ import list_unique_values_column
    
    # Open input
    xls = xlrd.open_workbook(dataFile)
    sheet = xls.sheet_by_name(sheetName)
    
    # Get code and year columns position
    codePos_yearPos = get_columns_position(sheet, [codeCol, yearCol])
    codePos, yearPos = (codePos_yearPos[codeCol], codePos_yearPos[yearCol])
    
    # Get variable positions
    varPos = get_columns_position_outside_options(sheet, [codeCol, yearCol])
    
    # Get data
    data  = {}
    VARS  = []
    YEARS = []
    for row in range(1, sheet.nrows):
        entity = sheet.cell(row, codePos).value
        year   = int(sheet.cell(row, yearPos).value)
        
        if year not in YEARS:
            YEARS.append(year)
        
        if entity not in data:
            data[entity] = {}
        
        if year not in data[entity]:
            data[entity][year] = {}
        
        for var in varPos:
            if var not in data[entity][year]:
                data[entity][year][var] = sheet.cell(row, varPos[var]).value
            
            else:
                print('Variable is already there for that entity and year')
            
            if var not in VARS:
                VARS.append(var)
    
    VARS.sort()
    YEARS.sort()
    
    
    with codecs.open(out_file, mode='w', encoding='utf-8') as f:
        # Write titles
        f.write(codeCol + '\t')
        
        for var in VARS:
            for year in YEARS:
                f.write('{v}_{y}\t'.format(v=str(var), y=str(year)))
        
        f.write('\n')
        
        # Write real data
        for entity in data:
            f.write(entity + '\t')
            
            for var in VARS:
                for year in YEARS:
                    if year in data[entity]:
                        f.write('{}\t'.format(str(data[entity][year][var])))
                    else:
                        f.write('\t')
            
            f.write('\n')
        
    
    """
Example #4
def transpose_groups_in_sheets(excel, datasheet, group, _id, output):
    """
    For an Input Table like:
      | group | id | variable_1 | variable_2
    0 |   A   |  1 |   V1_A1    |   V2_A1
    1 |   A   |  2 |   V1_A2    |   V2_A2
    2 |   B   |  1 |   V1_B1    |   V2_B1
    3 |   B   |  2 |   V1_B2    |   V2_B2
    
    This method will generate one table for each group:
    * GROUP A
      | variables  | var_code | id1_value | id2_value
    0 | variable_1 |          |   V1_A1   |   V1_A2
    1 | variable_2 |          |   V2_A1   |   V2_A2
    
    * GROUP B
      | variables  | var_code | id1_value | id2_value
    0 | variable_1 |          |   V1_B1   |   V1_B2
    1 | variable_2 |          |   V2_B1   |   V2_B2
    """
    
    import xlrd
    import xlwt
    from gasp.pyt.xls.fld import get_columns_position
    from gasp.pyt.xls.fld import get_columns_position_outside_options
    
    # Get data from excel
    xls = xlrd.open_workbook(excel)
    sdata = xls.sheet_by_name(datasheet)
    
    # Get data by group and variable
    """
    data = {
        groupA : {
            varA : [val_1, ..., val_n],
            varB : [val_1, ..., val_n]
        },
        ...
        groupN: {
            varA : [val_1, ..., val_n],
            varB : [val_1, ..., val_n]
        }
    }
    """
    position_grp_id = get_columns_position(sdata, [group, _id])
    position_var = get_columns_position_outside_options(sdata, [group, _id])
    # To maintain the writing order of the variables (the first column
    # should be the first line)
    var_cols = sorted(position_var.values())
    for i in range(len(var_cols)):
        for col in position_var:
            if var_cols[i] == position_var[col]:
                var_cols[i] = col
    # The column names will be written in the correct order
    
    groupsChild = {}
    for row in range(1, sdata.nrows):
        grp = sdata.cell(row, position_grp_id[group]).value
        
        if grp not in groupsChild:
            groupsChild[grp] = {}
        
        entity = sdata.cell(row, position_grp_id[_id]).value
        
        for col in var_cols:
            value = sdata.cell(row, position_var[col]).value
            
            if not col in groupsChild[grp]:
                groupsChild[grp][col] = [value]
            
            else:
                groupsChild[grp][col].append(value)
    
    # Write output - one table for each group
    out_xls = xlwt.Workbook()
    
    for grp in groupsChild:
        grp_sheet = out_xls.add_sheet(grp)
        
        # Write first line
        titles = ['MIN', 'Max',
                  'component/area of concern/dimension']
        
        for i in range(len(groupsChild[grp][next(iter(groupsChild[grp]))])):
            titles.append(i+1)
        
        for i in range(len(titles)):
            grp_sheet.write(0, i, titles[i])
        
        # Write Values
        for i in range(len(var_cols)): # each iteration is a new line
            # Write Variable name
            grp_sheet.write(i+1, 2, var_cols[i])
            
            # Write entities values
            for e in range(len(groupsChild[grp][var_cols[i]])):
                grp_sheet.write(i+1, 2 + e + 1,
                                groupsChild[grp][var_cols[i]][e])
            
            # Write MIN and MAX
            grp_sheet.write(i+1, 0, min(groupsChild[grp][var_cols[i]]))
            grp_sheet.write(i+1, 1, max(groupsChild[grp][var_cols[i]]))
    
    out_xls.save(output)
    
    return output
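A hedged usage sketch; the workbook path, sheet, and column names are placeholders.

# Hypothetical call: write one output sheet per value of the 'group' column,
# transposing the remaining variable columns into lines.
transpose_groups_in_sheets('survey.xls', 'data', 'group', 'id',
                           'survey_by_group.xls')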
Example #5
def year_to_col_by_attr(data_file, data_spreadsheet, field_id,
                        fields_to_mantain, field_year, output_file):
    """
    Transpose a table by attribute and year

    For a given table such as:

       | group |  fid  | label | year | attribute_x | attribute_y
    0  |  AT   | AT_01 | ATLA1 | 2000 |    val_x    |   val_y
    1  |  AT   | AT_01 | ATLA1 | 2001 |    val_x    |   val_y
    2  |  AT   | AT_02 | ATLA2 | 2000 |    val_x    |   val_y
    3  |  AT   | AT_02 | ATLA2 | 2001 |    val_x    |   val_y
    4  |  AT   | AT_03 | ATLA3 | 2000 |    val_x    |   val_y
    5  |  AT   | AT_03 | ATLA3 | 2001 |    val_x    |   val_y
    6  |  AT   | AT_04 | ATLA4 | 2000 |    val_x    |   val_y
    7  |  AT   | AT_04 | ATLA4 | 2001 |    val_x    |   val_y
    8  |  PT   | PT_01 | PTLA1 | 2000 |    val_x    |   val_y
    9  |  PT   | PT_01 | PTLA1 | 2001 |    val_x    |   val_y
    10 |  PT   | PT_02 | PTLA2 | 2000 |    val_x    |   val_y
    11 |  PT   | PT_02 | PTLA2 | 2001 |    val_x    |   val_y
    12 |  PT   | PT_03 | PTLA3 | 2000 |    val_x    |   val_y
    13 |  PT   | PT_03 | PTLA3 | 2001 |    val_x    |   val_y

    This method will generate one table (spreadsheet) for each attribute:

    Table attribute_x
      | group |  fid  | label |  2000 | 2001
    0 |  AT   | AT_01 | ATLA1 | val_x | val_x
    1 |  AT   | AT_02 | ATLA2 | val_x | val_x
    2 |  AT   | AT_03 | ATLA3 | val_x | val_x
    3 |  AT   | AT_04 | ATLA4 | val_x | val_x
    4 |  PT   | PT_01 | PTLA1 | val_x | val_x
    5 |  PT   | PT_02 | PTLA2 | val_x | val_x
    6 |  PT   | PT_03 | PTLA3 | val_x | val_x

    Table attribute_y
      | group |  fid  | label |  2000 | 2001
    0 |  AT   | AT_01 | ATLA1 | val_y | val_y
    1 |  AT   | AT_02 | ATLA2 | val_y | val_y
    2 |  AT   | AT_03 | ATLA3 | val_y | val_y
    3 |  AT   | AT_04 | ATLA4 | val_y | val_y
    4 |  PT   | PT_01 | PTLA1 | val_y | val_y
    5 |  PT   | PT_02 | PTLA2 | val_y | val_y
    6 |  PT   | PT_03 | PTLA3 | val_y | val_y
    """

    import xlrd
    import xlwt
    from gasp.pyt.xls.fld  import get_columns_position
    from gasp.pyt.xls.fld  import get_columns_position_outside_options
    from gasp.pyt.xls.summ import list_unique_values_column
    from gasp.pyt.xls.fm   import get_unit_attributes

    # Open input
    xls_data = xlrd.open_workbook(data_file)
    sheet = xls_data.sheet_by_name(data_spreadsheet)

    # Get interest columns position
    interest_position = get_columns_position(sheet, fields_to_mantain)

    # Get year position
    year_position = get_columns_position(sheet, [field_year])

    # Get id position
    id_position = get_columns_position(sheet, [field_id])

    # Get attributes columns position - indicators
    attributes_position = get_columns_position_outside_options(
        sheet, fields_to_mantain + [field_year, field_id]
    )

    # List years
    years = list_unique_values_column(sheet, year_position[field_year])

    # List entities
    units = list_unique_values_column(sheet, id_position[field_id])
    units_atributtes = get_unit_attributes(
        sheet, id_position[field_id], interest_position
    )

    # add years to data dict
    for unit in units_atributtes:
        for year in years:
            units_atributtes[unit][year] = {}
    # Get values by year for each unit
    for line in range(1, sheet.nrows):
        __year = sheet.cell(line, year_position[field_year]).value
        __id = sheet.cell(line, id_position[field_id]).value
        if __id not in units_atributtes.keys():
            continue

        for indicator in attributes_position:
            if __year not in years:
                continue
            units_atributtes[__id][__year][indicator] = sheet.cell(
                line, attributes_position[indicator]).value

    # Write output
    out_xls = xlwt.Workbook()
    # for each attribute/indicator column
    for indicator in attributes_position:
        # Create new sheet
        out_sheet = out_xls.add_sheet(indicator)
        # Write columns
        out_sheet.write(0, 0, field_id)
        c = 1
        for field in interest_position:
            out_sheet.write(0, c, field)
            c += 1
        for year in years:
            out_sheet.write(0, c, year)
            c += 1
        # Write real data
        lnh = 1
        for unit in units_atributtes:
            out_sheet.write(lnh, 0, unit)
            col = 1
            for field in interest_position:
                out_sheet.write(
                    lnh,
                    col,
                    units_atributtes[unit][field]
                )
                col += 1

            for year in years:
                out_sheet.write(
                    lnh,
                    col,
                    units_atributtes[unit][year][indicator]
                )
                col += 1
            lnh += 1
    out_xls.save(output_file)
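A hedged usage sketch with placeholder names chosen to match the docstring's example table.

# Hypothetical call: build one sheet per attribute column, with one column per
# year, keeping 'group' and 'label' next to each fid.
year_to_col_by_attr('regions.xls', 'data', 'fid', ['group', 'label'], 'year',
                    'regions_by_year.xls')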
Example #6
def frequencies_by_place(xls,
                         entities_sheet,
                         entities_id,
                         entities_place,
                         data_sheet,
                         output,
                         null=None,
                         out_cols_basename=None,
                         entities_filter=None,
                         cols_exclusion=None):
    """
    Entities table:
    ID_Aluno | codigo_postal
    xxxxxxxx | xxxx-xxx
    xxxxxxxx | xxxx-xxx

    data_sheet:
    ID_Aluno | disciplina_1 | disciplina_2 | disciplina_n
    xxxxxxxx |    nota_1    |    xxxxxxx   |   xxxxxxx
    xxxxxxxx |    nota_2    |    xxxxxxx   |   xxxxxxx
    xxxxxxxx |    nota_3    |    xxxxxxx   |   xxxxxxx


    Output tables (one for each disciplina and other for the total)
                    |  nota_1  |  nota_2  |  nota_3  | Mean | Variance
    total           | NrAlunos | NrAlunos | NrAlunos |  x   |    y
    codigo_postal_1 | NrAlunos | NrAlunos | NrAlunos |  x   |    y
    codigo_postal_n | NrAlunos | NrAlunos | NrAlunos |  x   |    y

    In entities_sheet and data_sheet there must be only one record for each
    entity (duplicate entities are not allowed).

    Filtering by entity properties is possible.
    """

    import numpy
    import xlrd
    import xlwt
    from decimal import Decimal
    from gasp.pyt.xls.fld import get_columns_position
    from gasp.pyt.xls.summ import list_unique_values_column
    from gasp.pyt.xls.summ import count_values_column
    from gasp.pyt.xls.summ import count_values_column_if_entity_exists
    from gasp.pyt.xls.summ import count_values_column_by_entity_property
    """
    Aux method to estimate some generic statistics based on a "Dict Histogram"
    (total, mean and standard deviation)
    """
    def map_dict(histogram):
        if 'mean' in histogram.keys():
            del histogram['mean']

        if 'stdesviation' in histogram.keys():
            del histogram['stdesviation']

        if 'total' in histogram.keys():
            del histogram['total']

        # Get the total of entities
        histogram['total'] = sum(histogram.values())
        # Get mean
        numerator = 0
        denominator = 0
        std_sample = []
        for v in histogram:
            if type(v) != str:
                numerator += Decimal(v) * Decimal(histogram[v])
                denominator += histogram[v]
                std_sample += [v for x in range(histogram[v])]

        if numerator and denominator:
            histogram['mean'] = numerator / Decimal(denominator)
            histogram['stdesviation'] = Decimal(numpy.std(std_sample))

        return histogram

    # Open xls file
    __xls_file = xlrd.open_workbook(xls, on_demand=True)

    # Map entities with location
    """
    d = {
        entitie_id : location
    }
    """
    if entities_filter:
        filters = entities_filter.split(';')
        filters_map = {}
        for flt in filters:
            col_filter, val_filter = flt.split('=')
            filters_map[col_filter] = val_filter

        interest_col = [entities_id, entities_place] + list(filters_map.keys())

    else:
        filters_map = 0
        interest_col = [entities_id, entities_place]

    __entities_sheet = __xls_file.sheet_by_name(entities_sheet)
    id_place_position = get_columns_position(__entities_sheet, interest_col)

    if entities_id not in id_place_position.keys():
        raise ValueError('Couldn\'t find the given column id')
    elif entities_place not in id_place_position.keys():
        raise ValueError('Couldn\'t find the position of the place id')

    entities_map = {}
    for row in range(1, __entities_sheet.nrows):
        __id = __entities_sheet.cell(row, id_place_position[entities_id]).value

        location = __entities_sheet.cell(
            row, id_place_position[entities_place]).value

        if filters_map:
            c = 0
            for col_filter in filters_map:
                __filter = __entities_sheet.cell(
                    row, id_place_position[col_filter]).value

                if __filter != filters_map[col_filter]:
                    c += 1
                    break

            if c:
                continue

        entities_map[__id] = location

    # Count Entities by value in each column
    """
    d = {
        col_1 : {
            total : {
                total   : total_count,
                value_1 : value_1_count,
                value_2 : value_2_count,
                value_n : value_n_count
            },
            location_1: {
                total   : total_count,
                value_1 : value_1_count,
                value_2 : value_2_count,
                value_n : value_n_count
            },
            location_2: {
                total   : total_count,
                value_1 : value_1_count,
                value_2 : value_2_count,
                value_n : value_n_count
            }
        },
        col_2 : {
            total : {
                total   : total_count,
                value_1 : value_1_count,
                value_2 : value_2_count,
                value_n : value_n_count
            },
            location_1: {
                total   : total_count,
                value_1 : value_1_count,
                value_2 : value_2_count,
                value_n : value_n_count
            },
            location_2: {
                total   : total_count,
                value_1 : value_1_count,
                value_2 : value_2_count,
                value_n : value_n_count
            }
        },
        col_n : {
            ...
        }
    }
    """
    data_sheet = __xls_file.sheet_by_name(data_sheet)
    data = {}

    cols_exclusion = cols_exclusion if type(cols_exclusion) == list else \
        [cols_exclusion] if type(cols_exclusion) == str else 0

    for col in range(1, data_sheet.ncols):
        column_name = data_sheet.cell(0, col).value
        if cols_exclusion:
            if column_name in cols_exclusion:
                continue
        # List unique values
        values = list_unique_values_column(data_sheet, col)

        # Del NoData identifier if defined
        if null or null == 0:
            if null in values:
                values.remove(null)

        # Count occurrences of a value in that column
        val_count = count_values_column_if_entity_exists(
            data_sheet, col, entities_map.keys(), values)

        for v in values:
            if v not in val_count:
                val_count[v] = 0

        data[column_name] = {'total': map_dict(val_count)}

        # Do the same for each location
        locations = list(set(entities_map.values()))

        for place in locations:
            val_count = count_values_column_by_entity_property(
                data_sheet, col, place, entities_map, values)

            for v in values:
                if v not in val_count:
                    val_count[v] = 0

            data[column_name].update({place: map_dict(val_count)})
    """
    Update data dict with one combination of all keys/values data
    d.update({
        general : {
            total : {
                total   : total_count,
                value_1 : value_1_count,
                value_2 : value_2_count,
                value_n : value_n_count
            },
            location_1: {
                total   : total_count,
                value_1 : value_1_count,
                value_2 : value_2_count,
                value_n : value_n_count
            },
            location_2: {
                total   : total_count,
                value_1 : value_1_count,
                value_2 : value_2_count,
                value_n : value_n_count
            }
        }
    })
    """
    general = {}
    for col_sheet in data:
        for k in data[col_sheet]:
            if k not in general:
                general[k] = {}
                for _k in data[col_sheet][k]:
                    general[k][_k] = data[col_sheet][k][_k]
            else:
                for _k in data[col_sheet][k]:
                    if _k in general[k]:
                        general[k][_k] += data[col_sheet][k][_k]
                    else:
                        general[k][_k] = data[col_sheet][k][_k]
    data['general'] = general

    for __dict in data['general']:
        with_stats = map_dict(data['general'][__dict])
        data['general'][__dict] = with_stats

    # Write data in a new xls_file
    out_xls = xlwt.Workbook()
    for col_sheet in data:
        new_sheet = out_xls.add_sheet(col_sheet)

        if out_cols_basename:
            cols_basename = [
                '{b}_{_k}'.format(b=out_cols_basename, _k=str(k))
                for k in data[col_sheet]['total'].keys()
            ]

        else:
            cols_basename = list(data[col_sheet]['total'].keys())

        cols_name = list(data[col_sheet]['total'].keys())

        # keys mix numbers and strings, so sort them by their string form
        cols_basename.sort(key=str)
        cols_name.sort(key=str)

        # Write columns names
        for c in range(len(cols_name)):
            new_sheet.write(0, c + 1, cols_basename[c])

        # Write lines
        lnh = 1
        lines_name = sorted(data[col_sheet].keys(), key=str)
        for line in lines_name:
            # Write line name
            new_sheet.write(lnh, 0, line)

            # Write counting data
            for cln in range(len(cols_name)):
                if cols_name[cln] in data[col_sheet][line].keys():
                    new_sheet.write(lnh, cln + 1,
                                    data[col_sheet][line][cols_name[cln]])

            lnh += 1

    out_xls.save(output)

    __xls_file.release_resources()
    del __xls_file
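A hedged usage sketch; only 'ID_Aluno' and 'codigo_postal' come from the docstring, the remaining names are placeholders.

# Hypothetical call: for every column of the 'notas' sheet, count how many
# students hold each value, broken down by postal code (one output sheet per
# column, plus a 'general' sheet and a 'total' line).
frequencies_by_place(
    'students.xls', 'alunos', 'ID_Aluno', 'codigo_postal', 'notas',
    'frequencies.xls', out_cols_basename='nota'
)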
Example #7
def frequencies_table(xls,
                      data_sheet,
                      output,
                      entities_sheet=None,
                      entities_id=None,
                      entities_filter=None,
                      values_filter=None):
    """
    Count value occurrences in each attribute column

    input table:
    entity | attr_1 | attr_2 | ... | attr_n
       0   |   a    |   b    | ... |   a
       1   |   b    |   c    | ... |   b
       2   |   b    |   c    | ... |   c
       3   |   a    |   f    | ... |   d
       4   |   c    |   a    | ... |   f

    output table:
           | a | b | c | d | f |
    attr_1 | 2 | 2 | 1 | 0 | 0 |
    attr_2 | 1 | 1 | 2 | 0 | 1 |
    attr_n | 1 | 1 | 1 | 1 | 1 |

    It is possible to filter entities and attribute values
    """

    import os
    import xlrd
    import xlwt
    from gasp.pyt.xls.fld import get_columns_position
    from gasp.pyt.xls.summ import count_values_column
    from gasp.pyt.xls.summ import count_values_column_if_entity_exists

    # Open file
    __xls = xlrd.open_workbook(xls, on_demand=True)

    if entities_sheet and entities_id and entities_filter:
        filters = entities_filter.split(';')
        filters_map = {}
        for flt in filters:
            col_filter, val_filter = flt.split('=')
            filters_map[col_filter] = val_filter

        __entities_sheet = __xls.sheet_by_name(entities_sheet)

        int_col = [entities_id] + list(filters_map.keys())
        cols_position = get_columns_position(__entities_sheet, int_col)

        entities_map = []
        for row in range(1, __entities_sheet.nrows):
            __id = __entities_sheet.cell(row, cols_position[entities_id]).value

            c = 0
            for col_filter in filters_map:
                __filter = __entities_sheet.cell(
                    row, cols_position[col_filter]).value

                if __filter != filters_map[col_filter]:
                    c += 1
                    break

            if c:
                continue

            entities_map.append(__id)

    else:
        entities_map = None

    attr_sheet = __xls.sheet_by_name(data_sheet)

    data = {}
    registed_values = []
    values_filter = values_filter if type(values_filter) == list else \
        [values_filter] if type(values_filter) == str else None

    for col in range(1, attr_sheet.ncols):
        col_name = attr_sheet.cell(0, col).value

        if not entities_map:
            values_count = count_values_column(attr_sheet, col, values_filter)
        else:
            values_count = count_values_column_if_entity_exists(
                attr_sheet, col, entities_map, values_filter)

        registed_values.extend(values_count.keys())

        data[col_name] = values_count

    registed_values = list(set(registed_values))

    # Write output
    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(
        os.path.splitext(os.path.basename(output))[0])

    # columns
    for i in range(len(registed_values)):
        new_sheet.write(0, i + 1, registed_values[i])

    # Write values
    r = 1
    for row in data:
        new_sheet.write(r, 0, row)
        for col in range(len(registed_values)):
            if registed_values[col] in data[row]:
                new_sheet.write(r, col + 1, data[row][registed_values[col]])
        r += 1

    out_xls.save(output)

    __xls.release_resources()
    del __xls
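A hedged usage sketch; the workbook, sheets, and filter are placeholders.

# Hypothetical call: count, per attribute column of the 'respostas' sheet, how
# often each value occurs, keeping only entities whose 'status' equals 'active'.
frequencies_table(
    'answers.xls', 'respostas', 'frequencies.xls',
    entities_sheet='entidades', entities_id='entity',
    entities_filter='status=active'
)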
Example #8
def frequencies_by_entity_attr(xls,
                               entities_sheet,
                               entities_id,
                               attr_sheet,
                               interest_values,
                               output,
                               entities_filter=None,
                               attr_exclusion=None):
    """
    Count entities with a number of attributes with a set of specific values

    Example:
    specific_values = A, S
    Entities table:
    entity_id | some_data
    xxxxxxxx  | xxxx-xxx
    xxxxxxxx  | xxxx-xxx

    attributes_sheet:
    From tables as:
    entity | attr_1 | attr_2 | ... | attr_n
       0   |    S   |   S    | ... |   S
       1   |    A   |   B    | ... |   S
       2   |    B   |   B    | ... |   A
       3   |    S   |   A    | ... |   S
       4   |    A   |   S    | ... |   B

    We get the following result:
    values  | entities_count | attribute_count
      S;A   |        2       |        3
      S;A   |        2       |        2
      S;A   |        1       |        1

    Filtering by entity properties is possible; several filter fields may be
    used. When several fields are given, AND logic is applied.

    Example of application:
    number of students with a given number of negative (failing) grades
    """

    import os
    import xlrd
    import xlwt
    from gasp.pyt.xls.fld import get_columns_position

    # Open file
    __xls = xlrd.open_workbook(xls, on_demand=True)

    # Map entities
    if entities_filter:
        filters = entities_filter.split(';')
        filters_map = {}
        for flt in filters:
            col_filter, val_filter = flt.split('=')
            filters_map[col_filter] = val_filter

        interest_col = [entities_id] + list(filters_map.keys())

    else:
        filters_map = 0
        interest_col = entities_id

    __entities_sheet = __xls.sheet_by_name(entities_sheet)
    cols_position = get_columns_position(__entities_sheet, interest_col)

    if entities_id not in cols_position:
        raise ValueError('Couldn\'t find the given column id')

    entities_map = {}
    for row in range(1, __entities_sheet.nrows):
        __id = __entities_sheet.cell(row, cols_position[entities_id]).value

        if filters_map:
            c = 0
            for col_filter in filters_map:
                __filter = __entities_sheet.cell(
                    row, cols_position[col_filter]).value

                if __filter != filters_map[col_filter]:
                    c += 1
                    break

            if c:
                continue

        entities_map[__id] = 0

    # Count occurrences of the specific/interest values by entity
    data_sheet = __xls.sheet_by_name(attr_sheet)


    interest_values = interest_values if type(interest_values) == list else \
        [interest_values] if type(interest_values) == str else 0

    if not interest_values:
        raise ValueError('interest_values should be a list or string')

    for col in range(1, data_sheet.ncols):
        column_name = data_sheet.cell(0, col).value
        if attr_exclusion and type(attr_exclusion) == list:
            if column_name in attr_exclusion:
                continue

        for row in range(1, data_sheet.nrows):
            __id = data_sheet.cell(row, 0).value
            value = data_sheet.cell(row, col).value

            if value in interest_values:
                if __id in entities_map:
                    entities_map[__id] += 1

    # Count the number of entities with the same number of occurrences of the
    # interest/specific values
    occurence_count = list(entities_map.values())
    occurence_unique = list(set(occurence_count))

    entities_countv = {}
    for countv in occurence_unique:
        entities_count = occurence_count.count(countv)
        entities_countv[entities_count] = countv

    # Write output
    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(
        os.path.splitext(os.path.basename(output))[0])

    # columns
    c = ['values', 'entities_number', 'values_occurences']
    for i in range(len(c)):
        new_sheet.write(0, i, c[i])

    # values
    l = 1
    for i in entities_countv:
        new_sheet.write(l, 0, ';'.join([str(x) for x in interest_values]))
        new_sheet.write(l, 1, i)
        new_sheet.write(l, 2, entities_countv[i])
        l += 1

    out_xls.save(output)

    __xls.release_resources()
    del __xls
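A hedged usage sketch; the names are placeholders, and interest_values follows the docstring's 'S'/'A' example.

# Hypothetical call: count how many entities (students) have 0, 1, 2, ...
# attribute cells holding one of the interest values.
frequencies_by_entity_attr(
    'grades.xls', 'alunos', 'ID_Aluno', 'notas', ['S', 'A'],
    'value_counts.xls'
)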
Example #9
def identify_cells_with_char(xls, sheetname, col_id, col_interest,
                             interest_char, output):
    """
    Identify the cells with a specific character in one column of interest

    Record these cells (ID and interest column) in a new file.
    """

    import xlrd
    import xlwt
    from gasp.pyt.xls.fld import get_columns_position

    __xls = xlrd.open_workbook(xls)
    sheet = __xls.sheet_by_name(sheetname)

    cols_position = get_columns_position(sheet, [col_id, col_interest])

    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(sheetname)

    new_sheet.write(0, 0, col_id)
    new_sheet.write(0, 1, col_interest)

    Nrow = 1
    for row in range(1, sheet.nrows):
        val = sheet.cell(row, cols_position[col_interest]).value

        if interest_char in val:
            new_sheet.write(Nrow, 0, sheet.cell(row, cols_position[col_id]).value)
            new_sheet.write(Nrow, 1, val)
            Nrow += 1

    # Record the identified cells in the output file
    out_xls.save(output)

    return output
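A hedged usage sketch; path, sheet, and column names are placeholders.

# Hypothetical call: record, in a new workbook, the id and address of every row
# whose 'address' cell contains a comma.
identify_cells_with_char('contacts.xls', 'data', 'id', 'address', ',',
                         'cells_with_comma.xls')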
Example #10
def xlstimedelta_to_pddf(inXls,
                         timecol,
                         sheet_name=None,
                         sheet_index=None,
                         columnsToMantain=None):
    """
    Convert a table with a column with timedelta values to a valid 
    Pandas DataFrame
    """

    import datetime
    import pandas
    import xlrd
    from xlrd import xldate_as_tuple
    from gasp.pyt import obj_to_lst
    from gasp.pyt.xls.sheet import get_sheet_obj
    from gasp.pyt.xls.fld import col_name, get_columns_position

    __xls = xlrd.open_workbook(inXls)
    sheet = get_sheet_obj(__xls, name=sheet_name, index=sheet_index)

    # Get Cols name
    COLS_NAME = col_name(sheet) if not columnsToMantain else \
        obj_to_lst(columnsToMantain)

    if type(timecol) != int:
        if timecol not in COLS_NAME:
            COLS_NAME.append(timecol)

    # Get Cols position
    COLS_POS = get_columns_position(sheet, COLS_NAME)
    POS_COLS = COLS_POS.values()

    if type(timecol) == int:
        COL_TIME_POSITION = timecol
    else:
        COL_TIME_POSITION = COLS_POS[timecol]

    data = []
    for row in range(1, sheet.nrows):
        l_col = []
        for col in range(sheet.ncols):
            if col not in POS_COLS:
                continue

            if col == COL_TIME_POSITION:
                tt = xldate_as_tuple(
                    sheet.cell(row, col).value, __xls.datemode)

                l_col.append(
                    datetime.timedelta(weeks=tt[1],
                                       days=tt[2],
                                       hours=tt[3],
                                       minutes=tt[4],
                                       seconds=tt[5]))

            else:
                l_col.append(sheet.cell(row, col).value)

        data.append(l_col)

    df = pandas.DataFrame(data, columns=COLS_NAME)

    return df
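A hedged usage sketch; the workbook path, sheet, and time column are placeholders.

# Hypothetical call: load the 'log' sheet into a pandas DataFrame, converting
# the 'duration' column to datetime.timedelta objects.
df = xlstimedelta_to_pddf('tasks.xls', 'duration', sheet_name='log')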