Beispiel #1
0
import re
import sys
import argparse
import traceback
import math
from decimal import Decimal
from toollib.files import findNumber,ParameterParser
from toollib.group import Group,run_grouping
        
class ComputeListGroup(Group):
    """Gather one column's values for a group and apply the user expression.

    Relies on the module-level ``args`` namespace (``column``, ``expression``
    and ``outfile``) that the script entry point sets up.
    """

    def __init__(self, tup):
        super(ComputeListGroup, self).__init__(tup)
        # Column values in the order the rows were added.
        self.lines = []

    def add(self, chunks):
        """Remember the configured column's value from this row."""
        value = chunks[args.column]
        self.lines.append(value)

    def done(self):
        """Apply ``args.expression`` to the collected values and write one row."""
        result = args.expression(self.lines)
        args.outfile.write(self.tup + [result])

if __name__ == "__main__":
    # Script entry point: parse the command line, compile the user's
    # expression into a callable, then run it once per group.
    pp = ParameterParser('User defined computation on a column', columns = 1, append = False, labels = [None])
    pp.parser.add_argument('-e', '--expression', help='equation to call. use l[i] to indicate row i of the list')
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    if not any(args.labels):
        args.labels = [args.column_name + '_list_compute']
    args = pp.getArgs(args)
    # NOTE(review): eval() executes arbitrary code taken from the command
    # line; only run this tool with trusted expressions. The concatenation
    # also presumably fails with TypeError when -e is omitted (no default is
    # declared for it) -- confirm.
    args.expression = eval('lambda l: '+ args.expression)

    run_grouping(args.infile, ComputeListGroup, args.group, args.ordered)
Beispiel #2
0
    return ToUnixTime(datetime.strptime(dt, args.format))
    
def cToDateTime(dt):
    """Convert the numeric timestamp ``dt`` and format it with ``args.format``."""
    moment = ToDateTime(float(dt))
    return moment.strftime(args.format)
    
def cTimeOfDay(dt):
    """Format ``TimeOfDay`` of the converted timestamp ``dt`` with ``args.format``."""
    moment = ToDateTime(float(dt))
    time_of_day = TimeOfDay(moment)
    return time_of_day.strftime(args.format)

if __name__ == "__main__":
    # Script entry point: parse options and run TimestampGroup over the input
    # as a single ungrouped, unordered pass.
    pp = ParameterParser('Convert timestamps', columns = '*', group = False, append = False, ordered = False)
    # NOTE(review): the options declared here are in_format / out_format, while
    # cToDateTime and cTimeOfDay read args.format -- confirm which is current.
    pp.parser.add_argument('-i', '--in-format')
    pp.parser.add_argument('-o', '--out-format')
    args = pp.parseArgs()
    # Append mode is switched on unconditionally before the arguments are finalized.
    args.append = True
    args = pp.getArgs(args)
    run_grouping(args.infile, TimestampGroup, [], False)
    # Earlier line-by-line implementation, left disabled:
    # args.function = getattr(sys.modules[__name__], 'c'+args.function)
    # if not args.format:
    #     if args.function == cTimeOfDay:
    #         args.format = '%H:%M:%S.%f'
    #     else: 
    #         args.format = '%Y-%m-%d_%H:%M:%S.%f'
    # 
    # jdelim = args.delimiter if args.delimiter != None else ' '
    # for line in args.infile:
    #     val = line.rstrip().split(args.delimiter)[args.column]
    #     res = args.function(val)
    #     if args.append:
    #         args.outfile.write('%s%s' % (line.rstrip(), jdelim))
    #     args.outfile.write(str(res)+'\n')
    
Beispiel #3
0
class PadGroup(Group):
    """Echo rows and, per group, add padding rows for elements never seen.

    Uses the module-level ``args`` namespace: ``columns``, ``elements``,
    ``pad`` and ``outfile``.
    """

    def __init__(self, tup):
        super(PadGroup, self).__init__(tup)
        # Tuples of the selected column values seen so far in this group.
        self.present = set()

    def add(self, chunks):
        """Record the row's selected columns, then write the row unchanged."""
        key = tuple(chunks[col] for col in args.columns)
        self.present.add(key)
        args.outfile.write(chunks)

    def done(self):
        """Write a padded row for every expected element missing from the group."""
        for candidate in args.elements:
            if candidate in self.present:
                continue
            args.outfile.write(self.tup + list(candidate) + args.pad)

if __name__ == "__main__":
    # Script entry point: load the expected elements, then pad each group of
    # the input with rows for the elements it lacks.
    pp = ParameterParser('Generate additional rows to pad input', columns = '*', append = False, labels = False, ordered = False)
    pp.parser.add_argument('-e', '--elements', help='File containing list elements, one per line.')
    pp.parser.add_argument('-p', '--pad', nargs='+', default=['0'])
    args = pp.parseArgs()
    # Append mode is switched on unconditionally before the arguments are finalized.
    args.append = True
    args = pp.getArgs(args)

    # Replace the elements file name with the set of row tuples read from it,
    # so PadGroup can do membership tests against args.elements.
    elements = set()
    with FileReader(args.elements, args) as f:
        for chunks in f:
            elements.add(tuple(chunks))
    args.elements = elements

    run_grouping(args.infile, PadGroup, args.group, ordered = False)
Beispiel #4
0
        self.items = defaultdict(int)
        self.count = 0

    def add(self, chunks):
        """Write the stack distance of this row's key within the group.

        The distance is the number of distinct keys whose most recent
        position is later than this key's previous occurrence, or -1 when
        the key has not been seen before. The output row is the input row
        (append mode) or the group tuple, followed by the distance.
        """
        self.count += 1
        val = tuple(chunks[c] for c in args.columns)
        if val in self.items:
            val_item = self.items[val]
            # Find all items with indices larger than the last occurrence of
            # this item. values() is used instead of itervalues() because the
            # latter exists only on Python 2.
            distance = sum(1 for item in self.items.values() if item > val_item)
        else:
            distance = -1
        self.items[val] = self.count

        if args.append:
            args.outfile.write(chunks + [distance])
        else:
            args.outfile.write(self.tup + [distance])

    def done(self):
        """Nothing to flush: each row's distance is written as it is added."""
        return None

if __name__ == "__main__":
    # Script entry point: compute the stack distance of the selected
    # column(s) per group.
    pp = ParameterParser('Compute the stack distance', columns = '*', labels = [None])
    args = pp.parseArgs()
    # No label supplied: build one from the joined input column names.
    if not any(args.labels):
        args.labels = ['_'.join(args.columns_names) + '_stack_distance']
    args = pp.getArgs(args)

    run_grouping(args.infile, StackGroup, args.group, args.ordered)

Beispiel #5
0
    def done(self):
        """Finish the group by delegating to ``self.empty()``."""
        self.empty()

if __name__ == "__main__":
    # Script entry point: configure DBSCAN clustering and run either the
    # online or the offline group implementation.
    pp = ParameterParser('Cluster input using the DBSCAN algorithm', columns = 0, labels = [None], append = False)
    pp.parser.add_argument('--online', action='store_true', default=False, help='changes meaning of range parameter to a monotonically increasing position value')
    pp.parser.add_argument('-f', '--first', default='0', help='first key column')
    pp.parser.add_argument('-s', '--second', default='1', help='second key column (offline only)')
    pp.parser.add_argument('-r', '--range', default='2', help='column with distance')
    pp.parser.add_argument('-e', '--epsilon', type=float, default=0.5)
    pp.parser.add_argument('-m', '--min_samples', type=int, default=5)
    args = pp.parseArgs()
    # Online mode always appends to the input rows.
    if args.online:
        args.append = True
    # Default output labels differ by mode.
    if not any(args.labels):
        if args.online:
            args.labels = ['label']
        else:
            # NOTE(review): args.infile.header is read here before
            # pp.getArgs() runs -- confirm the header is available already.
            args.labels = [args.infile.header.name(args.first), 'label']
    args = pp.getArgs(args)
    # Turn the column references into indexes via the input header.
    # NOTE(review): --second is documented as offline-only but is resolved in
    # both modes.
    args.first = args.infile.header.index(args.first)
    args.second = args.infile.header.index(args.second)
    args.range = args.infile.header.index(args.range)

    if args.online:
        # Initial value of the shared args.label attribute for the online run.
        args.label = 0
        run_grouping(args.infile, OnlineDBSCANGroup, args.group, args.ordered)
    else:
        run_grouping(args.infile, OfflineDBSCANGroup, args.group, args.ordered)

Beispiel #6
0
import os
import sys
import argparse
from toollib.files import findNumber,ParameterParser
from toollib.group import Group,run_grouping

class AccumulateGroup(Group):
    """Running total of one column within a group, written after every row."""

    def __init__(self, tup):
        super(AccumulateGroup, self).__init__(tup)
        self.total = 0

    def add(self, chunks):
        """Add the row's value to the total and write the updated total.

        The output row is the input row (append mode) or the group tuple,
        followed by the running total.
        """
        self.total += findNumber(chunks[args.column])
        prefix = chunks if args.append else self.tup
        args.outfile.write(prefix + [self.total])

    def done(self):
        """No end-of-group output: totals are written row by row."""

if __name__ == "__main__":
    # Script entry point: accumulate one column per group.
    pp = ParameterParser('Accumulate the values of a column(s)', columns = 1, labels = [None])
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    if not any(args.labels):
        args.labels = [args.column_name + '_accumulate']
    args = pp.getArgs(args)

    run_grouping(args.infile, AccumulateGroup, args.group, args.ordered)

Beispiel #7
0
    # if args.colourmap is not None:
    #     args.ax.set_color_cycle(plt.get_cmap(args.colourmap[0])(np.linspace(0,1,int(args.colourmap[1]))))

    # Process sources in order
    for i,infile in enumerate(args.infiles):
        s = Source(infile)
        s.mapping = {v : infile.header.index(args.mapping[v][i]) for v in args.mapping if args.mapping[v][i] is not None}
        # s.geom = args.geom[i]
        # s.label = args.sourcelabels[i]
        # s.colour = args.colour[i]
        # s.shape = args.shape[i]
        # s.fill = args.fill[i]
        # s.alpha = args.alpha[i]
        # s.size = args.size[i]
        args.current = s
        run_grouping(infile, PlotGroup, args.group, False)
        infile.close()

    if args.xscale:
        args.ax.set_xscale(args.xscale)
    if args.xmajorticks:
        args.ax.set_xticks([fmt(x, args.xtype, args.xformat) for x in args.xmajorticks])
        if args.xmajorticklabels:
            args.ax.set_xticklabels(args.xmajorticklabels)
    if args.xminorticks:
        args.ax.set_xticks([fmt(x, args.xtype, args.xformat) for x in args.xminorticks], minor = True)
    if args.xtickformat:
        args.ax.xaxis.set_major_formatter(tick_fmt(args.xtype, args.xtickformat))

    if args.yscale:
        args.ax.set_yscale(args.yscale, nonposy='clip')
Beispiel #8
0
        
class KMinGroup(Group):
    """Keep the ``args.k`` smallest values of a column within a group."""

    def __init__(self, tup):
        super(KMinGroup, self).__init__(tup)
        # Heap of negated values, bounded to args.k entries.
        self.mines = []

    def add(self, chunks):
        """Push the row's negated value and trim the heap to ``args.k`` entries."""
        negated = -findNumber(chunks[args.column])
        heappush(self.mines, negated)
        # Popping the smallest negated value drops the largest original value.
        if len(self.mines) > args.k:
            heappop(self.mines)

    def done(self):
        """Write the retained values, one row each, smallest first."""
        ordered = sorted(self.mines)
        ordered.reverse()
        for negated in ordered:
            args.outfile.write(self.tup + [-negated])
        
if __name__ == "__main__":
    # Script entry point: compute the minimum (or the k smallest values) of a
    # column per group.
    pp = ParameterParser('Compute minimum of column', columns = 1, labels = [None])
    pp.parser.add_argument('-k', '--k', type = int, default = 1, help = 'find the k minimum values')
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    if not any(args.labels):
        args.labels = [args.column_name + '_min']
    # Append mode clears the labels, so no extra label is passed on.
    if args.append:
        args.labels = []
    args = pp.getArgs(args)

    # The heap-based group is only used when more than one value is requested.
    if args.k > 1:
        cls = KMinGroup
    else:
        cls = MinGroup
    run_grouping(args.infile, cls, args.group, args.ordered)
Beispiel #9
0
            for val in (random.choice(self.row) for n in range(args.number)):
                if args.append:
                    args.outfile.write(random.choice(self.rows[val]))
                else:
                    args.outfile.write(self.tup + [val])
        else:
            for val in self.sample(self.row, args.number):
                if args.append:
                    i = random.choice(range(len(self.rows[val])))
                    args.outfile.write(self.rows[val][i])
                    del self.rows[val][i]
                else:
                    args.outfile.write(self.tup + [val])

    def sample(self, rows, number):
        """Pick ``number`` entries of ``rows`` at random, without replacement.

        When ``rows`` holds no more than ``number`` entries, ``rows`` itself
        is returned unchanged instead of a sample.
        """
        if number < len(rows):
            return random.sample(rows, number)
        return rows

if __name__ == "__main__":
    # Script entry point: sample rows per group, with or without replacement.
    pp = ParameterParser('Sample rows from file', columns = 1)
    pp.parser.add_argument('-r', '--replacement', action='store_true', default=False, help='with replacement')
    pp.parser.add_argument('-s', '--seed', type=int, default=12345)
    pp.parser.add_argument('-n', '--number', type=int, default=10, help='number of samples')
    args = pp.parseArgs()
    args = pp.getArgs(args)

    # Seed the generator before any sampling (the seed defaults to 12345).
    random.seed(args.seed)
    run_grouping(args.infile, SampleGroup, args.group, args.ordered)
Beispiel #10
0
class EntropyGroup(Group):
    """Collect a column's values per group and write their entropy."""

    def __init__(self, tup):
        super(EntropyGroup, self).__init__(tup)
        self.vals = []

    def add(self, chunks):
        """Store the row's value from the configured column as a float."""
        self.vals.append(float(findNumber(chunks[args.column])))

    def done(self):
        """Normalize the values, optionally zero-pad them, and write the entropy.

        Padding with zeros up to ``args.pad`` entries happens only when
        ``args.pad`` is set and exceeds the number of collected values.
        """
        import numpy as np
        from scipy.stats import entropy
        probs = np.array(self.vals) / np.sum(self.vals)
        if args.pad is not None and args.pad > len(probs):
            probs = np.append(probs, [0.0] * (args.pad - len(probs)))
        e = entropy(probs, base = args.base)
        args.outfile.write(self.tup + [e])

if __name__ == "__main__":
    # Script entry point: compute the entropy of one column per group.
    pp = ParameterParser('Entropy of a column', columns = 1, append = False, labels = [None])
    pp.parser.add_argument('-p', '--pad', type=int, default=None, help='pad to number of potential values')
    pp.parser.add_argument('--base', type=float, default=None, help='entropy base (default is e)')
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    if not any(args.labels):
        args.labels = [args.column_name + '_entropy']
    args = pp.getArgs(args)


    run_grouping(args.infile, EntropyGroup, args.group, args.ordered)
Beispiel #11
0
import argparse
from toollib.files import ParameterParser
from toollib.group import Group,run_grouping

class SetGroup(Group):
    """Write only the first row (or the group tuple) of each group."""

    def __init__(self, tup):
        super(SetGroup, self).__init__(tup)

    def add(self, chunks):
        """Write the first row seen, then swap in the no-op for later rows."""
        first = chunks if args.append else self.tup
        args.outfile.write(first)
        # Rebinding on the instance makes every later add() call hit noop().
        self.add = self.noop

    def noop(self, chunks):
        """Ignore rows after the first."""

    def done(self):
        """Nothing left to write at the end of a group."""

if __name__ == "__main__":
    # Script entry point: emit each distinct combination of the selected
    # columns once.
    pp = ParameterParser('Compute the set of strings from a column in files. Maintains first appearance order.', columns = '*', ordered = False)
    args = pp.parseArgs()
    args = pp.getArgs(args)
    # Outside append mode the output header carries only the selected columns.
    if not args.append and args.infile.hasHeader:
        args.outfile.header.addCols(args.columns_names)

    # The selected columns themselves are passed as the grouping key.
    run_grouping(args.infile, SetGroup, args.columns, False)

Beispiel #12
0
    def __init__(self, tup):
        """Choose the initial row handler from the requested output order."""
        super(OccurGroup, self).__init__(tup)
        # The first row is written only when 'first' was requested.
        wants_first = 'first' in args.order
        self.add = self.addFirst if wants_first else self.addNothing
        self.last = None

    def addFirst(self, chunks):
        """Write the group's first row and switch to remembering later rows.

        With ``args.duplicate`` set, the first row is also kept as the last
        row, so a single-row group can be written a second time by ``done``.
        """
        args.outfile.write(chunks)
        self.add = self.addNothing
        if args.duplicate:
            self.last = chunks
        
    def addNothing(self, chunks):
        """Keep ``chunks`` as the most recent row without writing anything."""
        self.last = chunks

    def done(self):
        """Write the remembered last row when 'last' output was requested."""
        if self.last is None:
            return
        if 'last' in args.order:
            args.outfile.write(self.last)

if __name__ == "__main__":
    # Script entry point: output the first and/or last row of every group.
    pp = ParameterParser('Output the first/last occurance of a group', columns = False, append = False, ordered = True)
    pp.parser.add_argument('-o', '--order', nargs='+', default=['first'], choices=['first', 'last'])
    pp.parser.add_argument('-d', '--duplicate', action='store_true', default=False, help='if order is first and last and there is only 1 group member, print same line twice')
    args = pp.parseArgs()
    # Append mode is switched on unconditionally before the arguments are finalized.
    args.append = True
    args = pp.getArgs(args)

    run_grouping(args.infile, OccurGroup, args.group, args.ordered)
Beispiel #13
0
    position = (sum(r.itervalues()) + 1) * p
    ir = int(position)
    fr = Decimal(position - ir)
    count = 0
    prev = None
    for key in sorted(r.iterkeys()):
        if count >= ir:
            break
        count += r[key]
        prev = key

    if prev is None:
        return key
    if fr == 0: # Whole value
        return prev
    elif count == ir: # Falls on the border between keys
        return prev * fr + key * (1 - fr)
    else: # Both median - 1 and median + 1 are same key
        return prev

if __name__ == "__main__":
    # Script entry point: compute the median of the selected columns per group.
    pp = ParameterParser('Compute median of a column', columns = '*', append = False, labels = [None])
    pp.parser.add_argument('-b', '--bin', default=None)
    args = pp.parseArgs()
    # No labels supplied: one '<column>_median' label per input column.
    if not any(args.labels):
        args.labels = [cn + '_median' for cn in args.columns_names]
    args = pp.getArgs(args)
    # Resolve the --bin column reference through the input header.
    # NOTE(review): --bin defaults to None -- confirm header.index accepts it.
    args.bin = args.infile.header.index(args.bin)

    run_grouping(args.infile, MedianGroup, args.group, args.ordered)
Beispiel #14
0
        self.maxes = [[] for c in args.columns]

    def add(self, chunks):
        """Push each selected column's value onto its heap, keeping ``args.k`` entries."""
        for heap, col in zip(self.maxes, args.columns):
            heappush(heap, findNumber(chunks[col]))
            # Dropping the smallest entry leaves the k largest values.
            if len(heap) > args.k:
                heappop(heap)

    def done(self):
        """Write the group's largest values per column, one row per rank.

        Each row is the group tuple, the rank-th largest value of every
        column, and the 1-based rank. At most ``args.k`` rows are written,
        fewer when the group held fewer rows.
        """
        for i, m in enumerate(self.maxes):
            # A reversed list is needed here: reversed() alone returns an
            # iterator, which cannot be indexed in the loop below.
            self.maxes[i] = sorted(m)[::-1]
        # Never index past the values actually collected for the group.
        ranks = min([args.k] + [len(m) for m in self.maxes])
        for k in range(ranks):
            args.outfile.write(self.tup + [m[k] for m in self.maxes] + [ k+1 ])

if __name__ == "__main__":
    # Script entry point: compute the maximum (or the k largest values) of the
    # selected columns per group.
    pp = ParameterParser('Compute maximum of columns', columns = '*', labels = [None])
    pp.parser.add_argument('-k', '--k', type = int, default = 1, help = 'find the k maximum values')
    args = pp.parseArgs()
    # No labels supplied: one '<column>_max' label per input column.
    if not any(args.labels):
        args.labels = [cn + '_max' for cn in args.columns_names]
    # Append mode clears the labels.
    if args.append:
        args.labels = []
    # The k-largest output carries an extra rank column.
    if args.k > 1:
        args.labels.append('k')
    args = pp.getArgs(args)

    if args.k > 1:
        run_grouping(args.infile, KMaxGroup, args.group, args.ordered)
    else:
        run_grouping(args.infile, MaxGroup, args.group, args.ordered)
Beispiel #15
0
        diff = val - self.last if self.last != None else val - args.beginning
        if self.last != None or args.leading:
            if args.append:
                args.outfile.write(chunks + [str(diff)])
            else:
                args.outfile.write(self.tup + [str(diff)])
        args.ending = self.last = val
        self.chunks = chunks

    def done(self):
        """Write a trailing interval row when ``--trailing`` is active.

        The interval is ``args.ending - self.last``; the row is based on the
        last input row (append mode) or on the group tuple.
        NOTE(review): an ``args.ending`` of 0 is falsy and suppresses the
        row -- confirm that is intended.
        """
        if not (args.ending and args.trailing):
            return
        if args.append:
            base = self.chunks
        else:
            base = self.tup
        args.outfile.write(base + [str(args.ending - self.last)])

if __name__ == "__main__":
    # Script entry point: compute differences between consecutive values of a
    # column per group.
    pp = ParameterParser('Compute the difference between subsequent elements in a column', columns = 1, labels = [None])
    pp.parser.add_argument('--leading', action='store_true', default=False)
    pp.parser.add_argument('--trailing', action='store_true', default=False)
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    if not any(args.labels):
        args.labels = [args.column_name + '_interval']
    args = pp.getArgs(args)

    # Shared state read by IntervalGroup; both start unset.
    args.beginning = None
    args.ending = None

    run_grouping(args.infile, IntervalGroup, args.group, args.ordered)

Beispiel #16
0
            current = self.future.popleft()
            nearest = [abs(x - current) for x in self.past] + [abs(x - current) for x in self.future]
            nearest = sorted(nearest)[:args.k]
            
            args.outfile.write(self.tup + [current] + nearest)

            self.past.append(current)
            while len(self.past) > args.k:
                self.past.popleft()

    def done(self):
        """Drain the look-ahead buffer, writing nearest distances for each value.

        For every remaining value, the ``args.k`` smallest absolute
        differences to the values in ``self.past`` and ``self.future`` are
        written after the group tuple and the value itself.
        """
        while self.future:
            current = self.future.popleft()
            neighbours = list(self.past) + list(self.future)
            distances = sorted(abs(other - current) for other in neighbours)

            args.outfile.write(self.tup + [current] + distances[:args.k])

            self.past.append(current)
        self.past.clear()
        
if __name__ == "__main__":
    # Script entry point: compute the k nearest value distances per group.
    pp = ParameterParser('Compute the k-nearest values', columns = 1, labels = [None], append = False)
    pp.parser.add_argument('-k', '--k', type=int, default=1)
    args = pp.parseArgs()
    # No labels supplied: the column itself plus one label per neighbour rank.
    if not any(args.labels):
        args.labels = [args.column_name] + ['{0}_k{1}_nearest'.format(args.column_name, k+1) for k in range(args.k)]
    args = pp.getArgs(args)

    run_grouping(args.infile, KNearGroup, args.group, args.ordered)
Beispiel #17
0
from decimal import Decimal
from toollib.files import findNumber,ParameterParser
from toollib.group import Group,run_grouping
from math import sqrt

class SkewGroup(Group):
    """Collect a column's values per group and write their sample skewness.

    Uses the module-level ``args`` namespace: ``column``, ``pad`` and
    ``outfile``.
    """

    def __init__(self, tup):
        super(SkewGroup, self).__init__(tup)
        # Values of the configured column, as floats, in arrival order.
        self.vals = []

    def add(self, chunks):
        """Store the row's value so that ``done`` can use it."""
        self.vals.append(float(findNumber(chunks[args.column])))

    def done(self):
        """Write the group tuple followed by the skewness of the values.

        When ``args.pad`` is set and exceeds the number of collected values,
        zeros are appended up to that length before the skewness is taken.
        """
        from scipy.stats import skew
        vals = self.vals
        if args.pad is not None and args.pad > len(vals):
            vals = vals + [0.0] * (args.pad - len(vals))
        args.outfile.write(self.tup + [skew(vals)])

if __name__ == "__main__":
    # Script entry point: compute the skewness of one column per group.
    pp = ParameterParser('Skew of the distribution', columns = 1, append = False, labels = [None])
    pp.parser.add_argument('-p', '--pad', type=int, default=None, help='pad to number of potential values')
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    if not any(args.labels):
        args.labels = [args.column_name + '_skew']
    args = pp.getArgs(args)
    # No --expectation option is declared for this tool, so the former lookup
    # of args.expectation was dropped: the attribute is never set here.

    run_grouping(args.infile, SkewGroup, args.group, args.ordered)
Beispiel #18
0
    def done(self):
        """Run a chi-square test of the group's values and write the result.

        The expected values are normalized, optionally inverted, and scaled
        to the sum of the observed values. Both arrays are zero-padded to
        ``args.pad`` entries when that exceeds the observed count. The
        expected values are passed to the test only when ``args.expectation``
        is set.
        """
        import numpy as np
        observed = np.array(self.vals)
        expected = np.array(self.expect)
        expected = expected / np.sum(expected)
        if args.invert:
            expected = (np.sum(expected) / expected) / np.sum(np.sum(expected) / expected)
        else:
            expected = expected / np.sum(expected)
        expected = expected * np.sum(observed)
        needs_padding = args.pad is not None and args.pad > len(observed)
        if needs_padding:
            observed = np.append(observed, [0.0] * (args.pad - len(observed)))
            expected = np.append(expected, [0.0] * (args.pad - len(expected)))
        from scipy.stats import chisquare
        if args.expectation is None:
            outcome = chisquare(observed)
        else:
            outcome = chisquare(observed, f_exp = expected)
        args.outfile.write(self.tup + list(outcome))

if __name__ == "__main__":
    # Script entry point: run a distribution test on one column per group.
    # The description previously read 'Entropy of a column', left over from
    # the entropy tool.
    pp = ParameterParser('Distribution test of a column', columns = 1, append = False, labels = [None])
    pp.parser.add_argument('-d', '--dist', choices = ['chisquare'], default='chisquare', help='distribution test to run')
    pp.parser.add_argument('-e', '--expectation', default=None, help='column containing expected distribution ratio')
    pp.parser.add_argument('-i', '--invert', action='store_true', default=False, help='invert the expected values (smaller values proportionally more likely)')
    pp.parser.add_argument('-p', '--pad', type=int, default=None, help='pad to number of potential values')
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    # NOTE(review): DistGroup.done writes every element of the chisquare
    # result, but only one label is set here -- confirm the header width.
    if not any(args.labels):
        args.labels = [args.column_name + '_disttest']
    args = pp.getArgs(args)
    # Resolve the expectation column reference through the input header.
    # NOTE(review): --expectation defaults to None -- confirm header.index
    # passes None through, since DistGroup.done tests for None.
    args.expectation = args.infile.header.index(args.expectation)

    run_grouping(args.infile, DistGroup, args.group, args.ordered)
Beispiel #19
0
        super(ConvolveGroup, self).__init__(tup)
        self.vals = []
        self.add = self._addall if args.append else self._add

    def _add(self, chunks):
        """Store only the numeric value of the configured column."""
        value = findNumber(chunks[args.column])
        self.vals.append(value)
    def _addall(self, chunks):
        """Store the whole row so that ``done`` can write it back out."""
        self.vals.append(chunks)

    def done(self):
        """Convolve the collected values with ``args.function`` and write rows.

        In append mode ``self.vals`` holds whole rows: the configured column
        is extracted, convolved, and each result is written after its row.
        Otherwise ``self.vals`` holds the numbers themselves and each result
        is written after the group tuple.
        """
        if args.append:
            series = [findNumber(val[args.column]) for val in self.vals]
            convolved = np_convolve(args.function, series, mode=args.mode)
            # The unfinished ``if args.mode ==`` statement that stood here was
            # a syntax error and has been removed.
            # NOTE(review): the result can be longer or shorter than the
            # input depending on args.mode, so rows and results are paired
            # positionally and the longer side is truncated. Confirm the
            # alignment wanted for each mode.
            for row, v in zip(self.vals, convolved):
                args.outfile.write(row + [v])
        else:
            for v in np_convolve(args.function, self.vals, mode=args.mode):
                args.outfile.write(self.tup + [v])

if __name__ == "__main__":
    # Script entry point: convolve one column per group with a kernel given
    # on the command line.
    pp = ParameterParser('Convolve on a column', columns = 1, labels = [None], append = False)
    pp.parser.add_argument('-m', '--mode', default='full', choices=['full', 'same', 'valid'])
    # NOTE(review): the help text below does not describe --function (the
    # kernel values) -- it looks copied from an append option.
    pp.parser.add_argument('-f', '--function', default=[Decimal('0.333'), Decimal('0.334'), Decimal('0.333')], type=Decimal, nargs='+', help='append result to columns')
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    if not any(args.labels):
        args.labels = [args.column_name + '_convolve']
    args = pp.getArgs(args)
    # Append mode is switched off after the arguments are finalized, so
    # ConvolveGroup takes its non-append path.
    args.append = False

    run_grouping(args.infile, ConvolveGroup, args.group, args.ordered)

Beispiel #20
0
    parser.add_argument('-r',
                        '--resample_file',
                        type=argparse.FileType('r'),
                        default=None,
                        help='File to read resample points from')
    parser.add_argument('-e', '--resample_index', type=int, default=0)
    parser.add_argument('-x', '--xdata', type=int, default=0)
    parser.add_argument('-y', '--ydata', type=int, default=1)
    parser.add_argument('-g', '--group', nargs='+', type=int, default=[])
    parser.add_argument('-d', '--delimiter', default=None)
    parser.add_argument('-o',
                        '--ordered',
                        action='store_true',
                        default=False,
                        help='input is sorted by group')
    args = parser.parse_args()

    if args.begin and args.resample_file:
        raise Exception('Cannot specify both file and begin parameters')
    elif args.resample_file:
        args.resample_values = [
            Decimal(line.rstrip().split()[args.resample_index])
            for line in args.resample_file
        ]
        args.resample_file.close()
    args.interpolatef = getattr(sys.modules[__name__],
                                'interp_' + args.interpolate)

    run_grouping(args.infile, ResampleGroup, args.group, args.delimiter,
                 args.ordered)
Beispiel #21
0
if __name__ == "__main__":
    # Script entry point: print the page header, one label row, the rows
    # produced by MapGroup for every input file, and the formatted footer.
    pp = ParameterParser('Plot maps of input files', infiles = '*', append = False, columns = '*', labels = [None], group = False, ordered = False)
    pp.parser.add_argument('-m', '--map', default='world', help='map to plot upon')
    pp.parser.add_argument('--size', default=[5, 5], nargs=2, type=int, help='size range of the markers')
    pp.parser.add_argument('--mode', default='auto', choices=['auto', 'markers', 'regions', 'text'])
    pp.parser.add_argument('--trigger', default='focus', choices=['none', 'focus', 'selection'], help='trigger for displaying tooltips')
    pp.parser.add_argument('--color-codes', default=[0, 1], nargs=2, type=int, help='range of values in color input')
    pp.parser.add_argument('--color-range', default=['#FF0000', '#00FF00'], nargs=2, help='range of colors to display')
    pp.parser.add_argument('--canvas', nargs=2, type=int, default=[500,300], help='canvas width and height in pixels')
    args = pp.parseArgs()
    # No labels supplied: reuse the input column names.
    if not any(args.labels):
        args.labels = args.columns_names
    args = pp.getArgs(args)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(header)
    print("['{0}'],".format("', '".join(args.labels)))
    for infile in args.infiles:
        run_grouping(infile, MapGroup, [], False)
        infile.close()
    # The colour range keeps the order given on the command line: min()/max()
    # on the colour strings compared them alphabetically and could swap the
    # two ends of the gradient.
    print(footer.format(minSize=min(args.size),
                        maxSize=max(args.size),
                        map=args.map,
                        mode=args.mode,
                        trigger=args.trigger,
                        minValue=min(args.color_codes),
                        maxValue=max(args.color_codes),
                        minColor=args.color_range[0],
                        maxColor=args.color_range[1],
                        width=args.canvas[0],
                        height=args.canvas[1]))
Beispiel #22
0
    parser.add_argument('infile', nargs='?', default=sys.stdin)
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('-c', '--column', default=0)
    parser.add_argument('-g', '--group', nargs='+', default=[])
    parser.add_argument('-i', '--dist', nargs='+', default=['norm'], choices=DIST)
    parser.add_argument('-d', '--delimiter', default=None)
    parser.add_argument('-o', '--ordered', action='store_true', default=False, help='input is sorted by group')
    args = parser.parse_args()
    args.distf = []
    if 'all' in args.dist:
        args.dist.remove('all')
        args.dist.extend(DIST)
    for i in args.dist:
        args.distf.append(getattr(scipy.stats, i))
    args.infile = FileReader(args.infile)

    # Get the header from the input file if there is one
    args.inheader = args.infile.Header()
    # Setup output header
    args.outheader = Header()
    args.outheader.addCols(args.inheader.names(args.group))
    args.outheader.addCol('_'.join(args.inheader.names(args.group)) + '_count')
    # Write output header
    args.outfile.write(args.outheader.value())
    # Get columns for use in computation
    args.group = args.inheader.indexes(args.group)

    args.jdelim = args.delimiter if args.delimiter != None else ' '
    run_grouping(args.infile, FitGroup, args.group, args.delimiter, args.ordered)

Beispiel #23
0
    def _add(self, chunks):
        """Count the row's value and add it to the group total."""
        value = findNumber(chunks[args.column])
        self.rows[value] += 1
        self.total += value
    
    def addrow(self, chunks):
        """Count the row's value, add it to the total, and keep the full row."""
        value = findNumber(chunks[args.column])
        self.rows[value] += 1
        self.total += value
        # The whole row is kept so donerow() can write it back out.
        self.fullrows[value].append(chunks)
        
    def donerow(self):
        """Write every stored row followed by its value's share of the total."""
        # Iterating the mapping directly yields its keys on Python 2 and 3;
        # iterkeys() exists only on Python 2.
        for r in self.rows:
            for row in self.fullrows[r]:
                args.outfile.write(row + [r / self.total])

    def _done(self):
        """Write one row per counted occurrence: group tuple plus value / total."""
        # items() works on Python 2 and 3; iteritems() exists only on Python 2.
        for r, c in self.rows.items():
            # A value seen c times produces c identical output rows.
            for _ in range(c):
                args.outfile.write(self.tup + [r / self.total])

if __name__ == "__main__":
    # Script entry point: compute each value's fraction of the column sum per group.
    pp = ParameterParser('Compute fraction of column sum', columns = 1, labels = [None])
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    if not any(args.labels):
        args.labels = [args.column_name + '_fraction']
    args = pp.getArgs(args)
    
    run_grouping(args.infile, FractionGroup, args.group, args.ordered)

Beispiel #24
0
    def __init__(self, tup):
        """Initialize the base group; this class keeps no state of its own."""
        super(ShareGroup, self).__init__(tup)

    def add(self, chunks):
        """Write how many trailing separator-delimited parts two columns share.

        Both column values are stripped of separator characters at the ends,
        split on the separator, and compared from the last part backwards
        until the first mismatch.
        """
        sep = args.separator
        first, second = [chunks[col].strip(sep).split(sep)[::-1] for col in args.columns]
        share = 0
        for left, right in zip(first, second):
            if left != right:
                break
            share += 1
        base = chunks if args.append else self.tup
        args.outfile.write(base + [share])

    def done(self):
        """Nothing to flush: each row's share is written as it is added."""
        return None

if __name__ == "__main__":
    # Script entry point: compute the shared postfix length of two columns.
    pp = ParameterParser('Compute postfix share of column', columns = '*', labels = [None])
    pp.parser.add_argument('-s', '--separator', default='.')
    args = pp.parseArgs()
    # No label supplied: build one from the joined input column names.
    if not any(args.labels):
        args.labels = ['_'.join(args.columns_names) + '_postfix_share']
    args = pp.getArgs(args)
    # ShareGroup.add unpacks exactly two columns, so anything else is rejected.
    if len(args.columns) != 2:
        raise Exception('Must specify exactly 2 columns!')

    run_grouping(args.infile, ShareGroup, args.group, args.ordered)
Beispiel #25
0
def noop(val):
    """Return ``val`` unchanged."""
    return val

def quantize(val):
    """Return ``val.quantize(args.quantize)``."""
    exponent = args.quantize
    return val.quantize(exponent)

def binify(val):
    """Round ``val`` down to a multiple of ``args.bin``: floor(val / bin) * bin."""
    width = args.bin
    bins = (val / width).to_integral_exact(rounding=ROUND_FLOOR)
    return bins * width

if __name__ == "__main__":
    # Script entry point: choose the binning and quantizing functions from
    # the options, then run RoundGroup over the input as one ungrouped pass.
    # NOTE(review): the description says 'Compute pdf' while the default label
    # suffix is '_round' -- confirm which is intended.
    pp = ParameterParser('Compute pdf', columns = 1, labels = [None], group = False, ordered = False)
    pp.parser.add_argument('-q', '--quantize', type=Decimal, default=None, help='fixed exponent (e.g., 10, 1, 0.1)')
    pp.parser.add_argument('-s', '--significantDigits', type=int, default=None, help='number of significant digits')
    pp.parser.add_argument('-b', '--bin', type=Decimal, default=None, help='fit into bins, applies the formula: f(x) = floor(x / b) * b')
    args = pp.parseArgs()
    # No label supplied: derive one from the input column's name.
    if not any(args.labels):
        args.labels = [args.column_name + '_round']
    args = pp.getArgs(args)
    # Significant digits are applied through the decimal context's precision.
    if args.significantDigits is not None:
        getcontext().prec = args.significantDigits
    # Each step falls back to the identity function when its option is unset.
    if args.bin is not None:
        args.binF = binify
    else:
        args.binF = noop
    if args.quantize is not None:
        args.quantF = quantize
    else:
        args.quantF = noop

    run_grouping(args.infile, RoundGroup, [], False)
Beispiel #26
0
        pass

def readMapping(mapfile):
    """Build a nested lookup from the rows of a mapping file.

    Each row contributes ``result[row[0]][row[2]] = row[1]``; a later row
    with the same first and third fields overwrites the earlier one.
    """
    mappings = defaultdict(dict)
    for row in mapfile:
        outer_key, stored, inner_key = row[0], row[1], row[2]
        mappings[outer_key][inner_key] = stored
    return mappings

if __name__ == "__main__":
    # Script entry point: anonymize the selected columns, or reverse the
    # anonymization with --reverse.
    pp = ParameterParser('Replace column(s) with hashes for anonymization', columns = '*', append = False, ordered = False, group = False)
    pp.parser.add_argument('-m', '--mapping', default=None)
    pp.parser.add_argument('-r', '--reverse', action='store_true', default=False)
    args = pp.parseArgs()
    # Append mode is switched on unconditionally before the arguments are finalized.
    args.append = True
    args = pp.getArgs(args)

    if args.mapping and not args.reverse:
        # Forward direction: the mapping file becomes an output writer.
        args.mapping = FileWriter(args.mapping, None, args)
        if args.infile.hasHeader:
            args.mapping.header.addCols(['column', 'value', 'anonymized'])
    elif args.mapping:
        # Reverse direction: load the existing mapping file into args.map.
        with FileReader(args.mapping, args) as mapfile:
            args.map = readMapping(mapfile)

    if args.reverse:
        group = DeanonGroup
    else:
        group = AnonGroup
    run_grouping(args.infile, group, [], False)

Beispiel #27
0
        while count >= irs[ind]:
            if prev is None:
                yield key
            elif frs[ind] == 0 or count != irs[ind]: # Whole value
                yield prev
            else: # Falls on the border between keys
                yield prev * frs[ind] + key * (1 - frs[ind])

            ind += 1
            if ind >= len(pts):
                return
        count += vals[key]
        prev = key
    # Report remaining percentiles
    while ind < len(pts):
        yield key
        ind += 1

if __name__ == "__main__":
    # Script entry point: compute the requested percentiles of the selected
    # columns per group.
    pp = ParameterParser('Compute percentiles from a column', columns = '*', append = False, labels = [None])
    pp.parser.add_argument('-b', '--bin', default=None)
    pp.parser.add_argument('-p', '--percentiles', nargs='+', type=Decimal, default=DEFAULT_PCT)
    args = pp.parseArgs()
    # The percentiles are put in ascending order before anything uses them.
    args.percentiles = sorted(args.percentiles)
    # No labels supplied: one label per (column, percentile) pair.
    if not any(args.labels):
        args.labels = ['{0}_ptile{1}'.format(cn, p) for cn in args.columns_names for p in args.percentiles]
    args = pp.getArgs(args)
    # Resolve the --bin column reference through the input header.
    # NOTE(review): --bin defaults to None -- confirm header.index accepts it.
    args.bin = args.infile.header.index(args.bin)

    run_grouping(args.infile, PercentileGroup, args.group, args.ordered)
Beispiel #28
0
#!/usr/bin/env python

import os
import sys
import argparse
from toollib.files import ParameterParser
from toollib.group import Group,run_grouping

class UniqueGroup(Group):
    """Group that counts distinct value combinations of the selected columns."""

    def __init__(self, tup):
        super(UniqueGroup, self).__init__(tup)
        # Distinct tuples of selected-column values seen so far.
        self.sets = set()

    def add(self, chunks):
        """Record the values of the columns in ``args.columns`` for one row."""
        key = tuple([chunks[col] for col in args.columns])
        self.sets.add(key)

    def done(self):
        """Write ``self.tup`` followed by the number of distinct tuples seen."""
        distinct = len(self.sets)
        args.outfile.write(self.tup + [distinct])

if __name__ == "__main__":
    pp = ParameterParser(
        'Compute uniques counts of column(s)',
        columns='*',
        append=False,
        labels=[None],
    )
    args = pp.parseArgs()

    # Fall back to a single label derived from the joined column names.
    if not any(args.labels):
        joined_names = '_'.join(args.columns_names)
        args.labels = [joined_names + '_uniques']

    args = pp.getArgs(args)

    run_grouping(args.infile, UniqueGroup, args.group, args.ordered)
Beispiel #29
0
        self.delimiter = args.infile.delimiter if args.infile.delimiter else ' '

        if not args.append and self.filename not in args.files:
            args.file_dict[self.filename] = openFile(self.filename, 'w')
            if args.infile.hasHeader:
                args.file_dict[self.filename].write(self.delimiter.join(map(str, args.infile.header.columns))+'\n')
        args.files.add(self.filename)

    def add(self, chunks):
        """Append one row to this group's output file.

        The handle is looked up in ``args.file_dict``; if this group's filename
        is not present there, the file is first opened in append mode.
        """
        handles = args.file_dict
        if self.filename not in handles:
            handles[self.filename] = openFile(self.filename, 'a')
        out = handles[self.filename]
        out.write(self.delimiter.join(chunks) + '\n')

    def done(self):
        """Do nothing; this group has no end-of-group work."""
        return None

if __name__ == "__main__":
    # Command line: the shared ParameterParser options plus an output filename
    # prefix and an optional "fuzz" lambda for group assignment.
    pp = ParameterParser('Split a file on column(s)', columns = 0)
    pp.parser.add_argument('-p', '--prefix', default='split-')
    pp.parser.add_argument('-f', '--fuzz', default=None, help='lambda specifying fuzz for group assignments')
    args = pp.parseArgs()
    args = pp.getArgs(args)
    args.file_dict = FileHandleDict()
    if args.fuzz:
        # SECURITY: the --fuzz string is passed to eval() and therefore runs
        # arbitrary Python taken from the command line. Only use with trusted
        # input.
        args.fuzz = eval(args.fuzz)

    # Filenames that have already been registered by a group.
    args.files = set()
    try:
        run_grouping(args.infile, SplitGroup, args.group, args.ordered)
    finally:
        # Close the output handles even when grouping fails part-way, so
        # they are not left open on an error.
        args.file_dict.close_all()

Beispiel #30
0
        x += f

if __name__ == "__main__":
    # set up command line args
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description='Resample the data points with a different frequency')
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('-f', '--frequency', type=Decimal, default=Decimal('1'))
    parser.add_argument('-i', '--interpolate', choices=['linear', 'step'], default='linear')
    parser.add_argument('-b', '--begin', type=Decimal, default=None, help='value to begin resampling at')
    parser.add_argument('-t', '--terminate', type=Decimal, default=None, help='value to terminate resampling at')
    parser.add_argument('-r', '--resample_file', type=argparse.FileType('r'), default=None, help='File to read resample points from')
    parser.add_argument('-e', '--resample_index', type=int, default=0)
    parser.add_argument('-x', '--xdata', type=int, default=0)
    parser.add_argument('-y', '--ydata', type=int, default=1)
    parser.add_argument('-g', '--group', nargs='+', type=int, default=[])
    parser.add_argument('-d', '--delimiter', default=None)
    parser.add_argument('-o', '--ordered', action='store_true', default=False, help='input is sorted by group')
    args = parser.parse_args()

    # Compare against None instead of relying on truthiness: Decimal('0') is
    # falsy, so "--begin 0" combined with a resample file would otherwise
    # bypass this conflict check.
    if args.begin is not None and args.resample_file:
        raise Exception('Cannot specify both file and begin parameters')
    elif args.resample_file:
        # One resample point per line, taken from the whitespace-separated
        # field selected by --resample_index. The file is closed even if a
        # line fails to parse.
        try:
            args.resample_values = [Decimal(line.rstrip().split()[args.resample_index]) for line in args.resample_file]
        finally:
            args.resample_file.close()
    # Look up the module-level function named interp_<choice>.
    args.interpolatef = getattr(sys.modules[__name__], 'interp_'+args.interpolate)

    run_grouping(args.infile, ResampleGroup, args.group, args.delimiter, args.ordered)

Beispiel #31
0
import os
import sys
import argparse
from toollib.files import findNumber,ParameterParser
from toollib.group import Group,run_grouping
from numpy import corrcoef

class CorrelationGroup(Group):
    """Group that collects numeric rows and writes pairwise column correlations."""

    def __init__(self, tup):
        super(CorrelationGroup, self).__init__(tup)
        # One list of floats per input row, ordered like args.columns.
        self.vals = []

    def add(self, chunks):
        """Convert the selected columns of one row to floats and store them."""
        row = []
        for col in args.columns:
            row.append(float(findNumber(chunks[col])))
        self.vals.append(row)

    def done(self):
        """Write one output row per column pair with its correlation value.

        Nothing is written unless at least two rows with at least two columns
        each were collected.
        """
        if len(self.vals) <= 1 or len(self.vals[0]) <= 1:
            return
        # rowvar=0: columns are the variables, rows are the observations.
        matrix = corrcoef(self.vals, rowvar=0)
        # Walk only the entries below the diagonal (j < i) so that every pair
        # of columns is reported once.
        for i, coeffs in enumerate(matrix):
            for j in range(i):
                pair = [args.columns_names[i], args.columns_names[j], coeffs[j]]
                args.outfile.write(self.tup + pair)

if __name__ == "__main__":
    pp = ParameterParser(
        'Compute correlation of 2 or more columns',
        columns='*',
        append=False,
    )
    args = pp.parseArgs()
    # Output labels are fixed: two column names and their correlation.
    args.labels = ['col1', 'col2', 'correlation']
    args = pp.getArgs(args)

    run_grouping(args.infile, CorrelationGroup, args.group, args.ordered)

Beispiel #32
0
            x = np.linspace(x[0], x[-1], args.granularity)
        else:
            x = np.linspace(args.range[0], args.range[-1], args.granularity)
        y = args.function(x, *popt)
        for xi, yi in zip(x, y):
            args.outfile.write(self.tup + [xi, yi])


if __name__ == "__main__":
    # Command line: the shared ParameterParser options plus the x/y columns,
    # the function to fit and the sampling range/granularity of the output.
    pp = ParameterParser('Compute polynomial to fit data', columns=0, labels=[None], append=False)
    pp.parser.add_argument('-x', default=0)
    pp.parser.add_argument('-y', default=1)
    pp.parser.add_argument('-f', '--function', required=True, help='lambda expression of function to fit')
    pp.parser.add_argument('-r', '--range', nargs=2, default=None, type=int)
    pp.parser.add_argument('-a', '--granularity', default=1000, type=int)
    args = pp.parseArgs()
    args = pp.getArgs(args)

    # Translate the x and y options through the input header's index lookup.
    args.x = args.infile.header.index(args.x)
    args.y = args.infile.header.index(args.y)

    # numpy is bound to the module-level name np before the expression is
    # evaluated (presumably so the lambda can refer to np; confirm).
    import numpy as np
    # SECURITY: --function is passed to eval() and therefore runs arbitrary
    # Python taken from the command line. Only use with trusted input.
    args.function = eval(args.function)

    run_grouping(args.infile, FitGroup, args.group, args.ordered)
Beispiel #33
0
                        outfile.write([
                            jdelim.join(u.tup + v.tup + map(str, res)) + '\n'
                        ])
                        outfile.write(['Verdict:' + str(verdict) + '\n'])


if __name__ == "__main__":
    # Command line: multiple input files and columns, plus optional random
    # subsampling controls.
    pp = ParameterParser('Compute KS 2-sample', infiles='*', columns='*', append=False, labels=[None])
    pp.parser.add_argument('-r', '--random', default=None, type=int, help='perform on r random subsamples')
    pp.parser.add_argument('-s', '--subsample', default=100, type=int, help='subsample size')
    args = pp.parseArgs()
    args = pp.getArgs(args)

    # Start with an empty args.groups, run grouping over every input file,
    # then hand args.groups to KS_test.
    args.groups = []
    for infile in args.infiles:
        # NOTE(review): the fourth argument is args.delimiter here; confirm
        # this matches the run_grouping signature in use.
        run_grouping(infile, KSGroup, args.group, args.delimiter)
    KS_test(args.groups, args.outfile)