Beispiel #1
0
def _getGroupFunction(name):
    """return ``(aggregator, converter)`` for the group function *name*.

    *aggregator* is the callable handed to ``CSV.GroupTable``; *converter*
    is applied to every non-missing cell before grouping (``float`` for
    the numeric functions, ``str`` for the concatenating ones).

    Raises ValueError if *name* is not a known group function.
    """
    if name == "min":
        return min, float
    elif name == "max":
        return max, float
    elif name == "sum":
        # ``reduce`` is not a builtin on Python 3; importing it here keeps
        # the function working on both Python 2.6+ and Python 3.
        from functools import reduce
        import operator
        return (lambda z: reduce(operator.add, z)), float
    elif name == "mean":
        return scipy.mean, float
    elif name == "cat":
        # concatenate all non-empty values, keeping duplicates
        return (lambda x: ";".join([y for y in x if y != ""])), str
    elif name == "uniq":
        # concatenate distinct non-empty values; the order is whatever
        # iteration over the set yields
        return (lambda x: ";".join([y for y in set(x) if y != ""])), str
    elif name == "stats":
        return (lambda x: str(Stats.DistributionalParameters(x))), float

    raise ValueError("unknown group function '%s'" % name)


def readAndGroupTable(infile, options):
    """read table from infile and group.

    The table is read with ``CSV.ReadTable``, reduced to the group column
    plus the columns selected by ``options.columns``, grouped on the group
    column with ``CSV.GroupTable`` and written tab-separated to
    ``options.stdout`` (header line first).

    Attributes read from *options*:

    has_headers
        passed to ``CSV.ReadTable`` as ``with_header``.
    columns
        column specification; it is resolved through ``getColumns`` and
        the result is written back to ``options.columns`` (side effect).
        The resolved entries are used as indices into the header and rows.
    group_column
        index of the column to group on; must not be one of ``columns``.
    group_function
        one of ``min``, ``max``, ``sum``, ``mean``, ``cat``, ``uniq``,
        ``stats``.
    missing_value
        cells equal to this value are passed through unconverted.
    stdout
        output stream.

    Rows containing a value that can not be converted (``ValueError``
    from the converter) are dropped.

    Raises ValueError for an unknown ``options.group_function``; this is
    checked before any input is read.
    """
    # Resolve the aggregator first so that a bad option fails early instead
    # of surfacing as an unbound local after the whole table was processed.
    f, converter = _getGroupFunction(options.group_function)

    fields, table = CSV.ReadTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)
    options.columns = getColumns(fields, options.columns)
    assert options.group_column not in options.columns, \
        "group column must not be one of the value columns"

    group_field = fields[options.group_column]
    if options.group_function == "stats":
        # one output header per statistic and column: <column>_<statistic>
        stat_headers = list(Stats.DistributionalParameters().getHeaders())
        new_fields = [group_field]
        for c in options.columns:
            new_fields += ["%s_%s" % (fields[c], h) for h in stat_headers]
    else:
        new_fields = [group_field] + [fields[c] for c in options.columns]

    # Keep only the group column and the selected columns, converting the
    # latter. A row with an unconvertable value is dropped as a whole.
    new_table = []
    for row in table:
        new_row = [row[options.group_column]]
        for c in options.columns:
            value = row[c]
            if value != options.missing_value:
                try:
                    value = converter(value)
                except ValueError:
                    break
            new_row.append(value)
        else:
            # only reached if no conversion failed
            new_table.append(new_row)

    # the group column was moved to position 0 above
    new_rows = CSV.GroupTable(new_table,
                              group_column=0,
                              group_function=f)

    options.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        options.stdout.write("\t".join(map(str, row)) + "\n")